voxforge.org
VoxForge Dev
Show
Ignore:
Timestamp:
05/25/08 20:14:38 (6 months ago)
Author:
kmaclean
Message:

AudioSegmentation scripts -snapshot

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • Trunk/Scripts/Audio_scripts/AudioSegmentation/AudioBook/Text.pm

    r2589 r2590  
    3838        $self{"inputfilename"} = $textFile; 
    3939        $self{'log'} = $super->{'log'};          
     40        $self{"log_single_quotes"} = $super->{"log_single_quotes"}; 
    4041        _clean(\%self); 
    4142        bless(\%self,$class); 
     
    5152        my $inputfilename = $self->{"inputfilename"}; 
    5253        my $log = $self->{'log'};        
    53  
     54         
    5455        open(IN, "<$inputfilename") or confess ("error: cannot open input $inputfilename for input");  
    5556        open(LOG,">>$log") or confess ("cannot open $log file");         
     
    6566                $line =~ s/\r/ /g; # remove all carriage returns from the text file 
    6667                $line =~ tr/a-z/A-Z/; # change to uppercase 
    67                 $line =~ s/,//g; # remove commas  
    68                 $line =~ s/\.\"/ /g; # period followed by double quote 
    69                 $line =~ s/\.\'/ /g; # period followed by single quote 
    70                 # !!!!!! see below for processing of periods  
     68 
     69                $line =~ s/\.\"//g; # period followed by double quote 
     70                $line =~ s/\,\"//g; # comma followed by double quote 
     71                $line =~ s/\?\"//g; # question mark followed by double quote 
     72                $line =~ s/\!\"//g; # exclamation mark followed by double quote  
     73                $line =~ s/\.\'//g; # period followed by single quote 
     74                $line =~ s/\,\'//g; # comma followed by single quote 
     75                $line =~ s/\?\'//g; # question mark followed by single quote 
     76                $line =~ s/\!\'//g; # exclamation mark followed by single quote 
     77                 
     78                # see below for further processing of periods & single quotes 
    7179                #line =~ s/\./ /g; # remove periods # too broad - periods in emails or URLs are pronounced, but need to avoid acronyms. 
    72                 # !!!!!! 
    73                 #  $line =~ s/\'//g; # remove single quotes; but need words like "don't" - need to research this more ... 
    74                 $line =~ s/\"//g; # remove all double quotes 
     80                #$line =~ s/\'//g; # remove single quotes; but need words like "don't" - need to research this more ... 
     81                $line =~ s/\"//g; # remove all double quotes                             
     82                $line =~ s/,//g; # remove commas 
     83                                 
    7584                $line =~ s/://g; # remove colon 
    7685                #  $line =~ s/-//g; # compound word dash; but VoxForge dictionnary contains words with dashes ... 
     
    7887                $line =~ s/ - / /g; # dash punctuation   
    7988                $line =~ s/ -/ /g; # dash punctuation            
    80                 $line =~ s/-/ /g; # dash - compound word        
     89                $line =~ s/-/ /g; # dash - compound word; replace with space, so they can be looked up in pronunciation dictionary 
    8190                $line =~ s/;//g; # semi-colon 
    8291                $line =~ s/!//g; # exclamation mark 
     
    8897                $line =~ s/_//g; # remove underscore     
    8998                $line =~ s/\[//g; # remove left bracket 
    90                 $line =~ s/\]//g; # remove right bracket                         
     99                $line =~ s/\]//g; # remove right bracket         
     100                $line =~ s/\*//g; # remove star                          
    91101                # Other cleanup !!!!!! need to change the prompts files directly rather than doing this!!! or add to dictionnary!!! 
    92102                $line =~ s/&/AND/g;  
     
    96106                        if ($word =~ /\S/) {  #Anything other than white space  [^ \r\t\n\f] 
    97107                                $word =~ s/\s+//g; 
    98                                 $word =~ s/^\'+//; # remove single quote(s) from beginning of word 
    99                                 $word =~ s/\'+$//; # remove single quote(s) from end of word 
     108                                # !!!!!! 
     109                                #$$word =~ s/^\'+//; # remove on or more single quote(s) from beginning of word  
     110                                #$word =~ s/\'+$//; # remove single quote(s) from end of word ... but some contractions use a single quote at the end of the word 
     111                                if ($word =~ /\'/) {# $words containing single quotes 
     112                                        _processSingleQuote ($self, \$word,\@quotelog);  
     113                                }  
     114                                # !!!!!!! 
    100115                                if ($word =~ /[\w\-.]+@[\w\-.]+\.[A-Za-z]{2,4}/) { #email address 
    101116                                        _processEmails($self, \@words, $word); 
     
    110125                                } elsif ($word =~ /\d/) { # word contains numbers and letters 
    111126                                        _processWordsContainingNumbers ($self, \@words,$word); 
    112                                 } elsif ($word =~ /^\.$/) {# entire word only contains a period  
    113                                         $word =~ s/\.//g; 
     127                                } elsif ($word =~ /^\.+$/) {# entire word only contains one or more periods  
     128                                        $word =~ s/\.//g; # not actually doing anything with this statement, since there is no push 
    114129                                } elsif ($word =~ /\./) {# $word contains at least one period  
    115130                                        _processPeriods ($self, \@words, $word,\@periodlog,\@quotelog);  
    116                                 } elsif ($word =~ /\'/) {# $words containing single quotes 
    117                                         push (@quotelog, "$word\n");             
    118                                         push (@words, $word);                                    
     131                                } elsif ($word =~ /^\'+$/) {# entire word only contains one or more single quotes  
     132                                        $word =~ s/\'//g; # not actually doing anything with this statement, since there is no push 
    119133                                } else { 
    120134                                        push (@words, $word); 
     
    128142                print LOG $line;         
    129143        } 
    130         print LOG "\nWords with single quotes (no change made to word) - please review:\n"; 
    131         print LOG   "------------------------------------------------------------------\n";    
     144        print LOG "\nWords with single quotes (no change made to word, unless otherwise stated) - please review:\n"; 
     145        print LOG   "-------------------------------------------------------------------------------------------\n";   
    132146        foreach my $line (@quotelog) { 
    133147                print LOG $line;         
     
    136150        close(IN); 
    137151}  
     152 
     153sub _processSingleQuote { #private 
     154        my ($self, $word, $quotelog)= @_; 
     155        my $debug = $self->{"debug"}; 
     156        my $log_single_quotes = $self->{"log_single_quotes"}; 
     157         
     158        if ( ($$word =~ /^\'/) and ($$word =~ /\'$/) ) { # word within two single quotes (from beginning and end of word) - assume it is a quoted word  
     159                my $tempword = $$word; 
     160                $$word =~ s/^\'+//; 
     161                $$word =~ s/\'+$//;  
     162                push (@$quotelog, "changed:$tempword to:$$word\n");  
     163        } elsif ($$word  =~ /^\'+/) { # one or more single quote(s) from beginning of word 
     164                my $tempword = $$word;           
     165                $$word =~ s/^\'+//;  
     166                push (@$quotelog, "changed:$tempword to:$$word\n");  
     167        } else {                 
     168                # don;t remove single quote(s) from end of word - further manual processing might be required 
     169                #$word =~ s/\'+$//; # single quote(s) from end of word ... but some contractions use a single quote at the end of the word 
     170                push (@$quotelog, "$$word\n") if $log_single_quotes;             
     171        } 
     172         
     173} 
    138174 
    139175sub _processPeriods { #private 
     
    163199                        $word =~ s/\./ /g; # replace period with a space 
    164200                        push (@$periodlog, "from:$wordsbefore to:$word\n");      
     201                        push (@$wordarray, $word) 
    165202        } else { 
    166203                confess "error: period got lost???\n";