- Timestamp:
- 05/25/08 20:14:38 (6 months ago)
- Files:
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
Trunk/Scripts/Audio_scripts/AudioSegmentation/AudioBook/Text.pm
r2589 r2590 38 38 $self{"inputfilename"} = $textFile; 39 39 $self{'log'} = $super->{'log'}; 40 $self{"log_single_quotes"} = $super->{"log_single_quotes"}; 40 41 _clean(\%self); 41 42 bless(\%self,$class); … … 51 52 my $inputfilename = $self->{"inputfilename"}; 52 53 my $log = $self->{'log'}; 53 54 54 55 open(IN, "<$inputfilename") or confess ("error: cannot open input $inputfilename for input"); 55 56 open(LOG,">>$log") or confess ("cannot open $log file"); … … 65 66 $line =~ s/\r/ /g; # remove all carriage returns from the text file 66 67 $line =~ tr/a-z/A-Z/; # change to uppercase 67 $line =~ s/,//g; # remove commas 68 $line =~ s/\.\"/ /g; # period followed by double quote 69 $line =~ s/\.\'/ /g; # period followed by single quote 70 # !!!!!! see below for processing of periods 68 69 $line =~ s/\.\"//g; # period followed by double quote 70 $line =~ s/\,\"//g; # comma followed by double quote 71 $line =~ s/\?\"//g; # question mark followed by double quote 72 $line =~ s/\!\"//g; # exclamation mark followed by double quote 73 $line =~ s/\.\'//g; # period followed by single quote 74 $line =~ s/\,\'//g; # comma followed by single quote 75 $line =~ s/\?\'//g; # question mark followed by single quote 76 $line =~ s/\!\'//g; # exclamation mark followed by single quote 77 78 # see below for further processing of periods & single quotes 71 79 #line =~ s/\./ /g; # remove periods # too broad - periods in emails or URLs are pronounced, but need to avoid acronyms. 72 # !!!!!! 73 # $line =~ s/\'//g; # remove single quotes; but need words like "don't" - need to research this more ... 74 $line =~ s/\"//g; # remove all double quotes 80 #$line =~ s/\'//g; # remove single quotes; but need words like "don't" - need to research this more ... 81 $line =~ s/\"//g; # remove all double quotes 82 $line =~ s/,//g; # remove commas 83 75 84 $line =~ s/://g; # remove colon 76 85 # $line =~ s/-//g; # compound word dash; but VoxForge dictionnary contains words with dashes ... … … 78 87 $line =~ s/ - / /g; # dash punctuation 79 88 $line =~ s/ -/ /g; # dash punctuation 80 $line =~ s/-/ /g; # dash - compound word 89 $line =~ s/-/ /g; # dash - compound word; replace with space, so they can be looked up in pronunciation dictionary 81 90 $line =~ s/;//g; # semi-colon 82 91 $line =~ s/!//g; # exclamation mark … … 88 97 $line =~ s/_//g; # remove underscore 89 98 $line =~ s/\[//g; # remove left bracket 90 $line =~ s/\]//g; # remove right bracket 99 $line =~ s/\]//g; # remove right bracket 100 $line =~ s/\*//g; # remove star 91 101 # Other cleanup !!!!!! need to change the prompts files directly rather than doing this!!! or add to dictionnary!!! 92 102 $line =~ s/&/AND/g; … … 96 106 if ($word =~ /\S/) { #Anything other than white space [^ \r\t\n\f] 97 107 $word =~ s/\s+//g; 98 $word =~ s/^\'+//; # remove single quote(s) from beginning of word 99 $word =~ s/\'+$//; # remove single quote(s) from end of word 108 # !!!!!! 109 #$$word =~ s/^\'+//; # remove on or more single quote(s) from beginning of word 110 #$word =~ s/\'+$//; # remove single quote(s) from end of word ... but some contractions use a single quote at the end of the word 111 if ($word =~ /\'/) {# $words containing single quotes 112 _processSingleQuote ($self, \$word,\@quotelog); 113 } 114 # !!!!!!! 100 115 if ($word =~ /[\w\-.]+@[\w\-.]+\.[A-Za-z]{2,4}/) { #email address 101 116 _processEmails($self, \@words, $word); … … 110 125 } elsif ($word =~ /\d/) { # word contains numbers and letters 111 126 _processWordsContainingNumbers ($self, \@words,$word); 112 } elsif ($word =~ /^\. $/) {# entire word only contains a period113 $word =~ s/\.//g; 127 } elsif ($word =~ /^\.+$/) {# entire word only contains one or more periods 128 $word =~ s/\.//g; # not actually doing anything with this statement, since there is no push 114 129 } elsif ($word =~ /\./) {# $word contains at least one period 115 130 _processPeriods ($self, \@words, $word,\@periodlog,\@quotelog); 116 } elsif ($word =~ /\'/) {# $words containing single quotes 117 push (@quotelog, "$word\n"); 118 push (@words, $word); 131 } elsif ($word =~ /^\'+$/) {# entire word only contains one or more single quotes 132 $word =~ s/\'//g; # not actually doing anything with this statement, since there is no push 119 133 } else { 120 134 push (@words, $word); … … 128 142 print LOG $line; 129 143 } 130 print LOG "\nWords with single quotes (no change made to word ) - please review:\n";131 print LOG "------------------------------------------------------------------ \n";144 print LOG "\nWords with single quotes (no change made to word, unless otherwise stated) - please review:\n"; 145 print LOG "-------------------------------------------------------------------------------------------\n"; 132 146 foreach my $line (@quotelog) { 133 147 print LOG $line; … … 136 150 close(IN); 137 151 } 152 153 sub _processSingleQuote { #private 154 my ($self, $word, $quotelog)= @_; 155 my $debug = $self->{"debug"}; 156 my $log_single_quotes = $self->{"log_single_quotes"}; 157 158 if ( ($$word =~ /^\'/) and ($$word =~ /\'$/) ) { # word within two single quotes (from beginning and end of word) - assume it is a quoted word 159 my $tempword = $$word; 160 $$word =~ s/^\'+//; 161 $$word =~ s/\'+$//; 162 push (@$quotelog, "changed:$tempword to:$$word\n"); 163 } elsif ($$word =~ /^\'+/) { # one or more single quote(s) from beginning of word 164 my $tempword = $$word; 165 $$word =~ s/^\'+//; 166 push (@$quotelog, "changed:$tempword to:$$word\n"); 167 } else { 168 # don;t remove single quote(s) from end of word - further manual processing might be required 169 #$word =~ s/\'+$//; # single quote(s) from end of word ... but some contractions use a single quote at the end of the word 170 push (@$quotelog, "$$word\n") if $log_single_quotes; 171 } 172 173 } 138 174 139 175 sub _processPeriods { #private … … 163 199 $word =~ s/\./ /g; # replace period with a space 164 200 push (@$periodlog, "from:$wordsbefore to:$word\n"); 201 push (@$wordarray, $word) 165 202 } else { 166 203 confess "error: period got lost???\n";