- Timestamp:
- 05/20/08 12:57:22 (7 months ago)
- Files:
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
Trunk/Scripts/Audio_scripts/AudioSegmentation/AudioBook/Text.pm
Columns: r2587 line | r2588 line | marker | source text
("+" = added in r2588, "-" = removed from r2587, no marker = unmodified, "…" = lines omitted by the diff view)

r2587 r2588
  54   54    open(IN, "<$inputfilename") or confess ("error: cannot open input $inputfilename for input");
  55   55    open(LOG,">>$log") or confess ("cannot open $log file");
       56  + print LOG "VoxForge speech segmentation script log\n\n";
       57  +
  56   58    print LOG "Changes made to Text file\n";
  57   59    print LOG "-------------------------\n";
  58   60    my @eText = <IN>; # slurp in entire file into an array
  59   61    my @words;
       62  + my (@periodlog, @quotelog);
  60   63    foreach my $line (@eText) {
  61   64    $line =~ s/\n/ /g; # remove all line feeds from the text file
   …    …
  84   87    $line =~ s/\)//g; # remove parenthesis
  85   88    $line =~ s/_//g; # remove underscore
       89  + $line =~ s/\[//g; # remove left bracket
       90  + $line =~ s/\]//g; # remove right bracket
  86   91    # Other cleanup !!!!!! need to change the prompts files directly rather than doing this!!! or add to dictionnary!!!
  87   92    $line =~ s/&/AND/g;
   …    …
  99  104    _processUrls($self, \@words, $word);
 100  105    } elsif ($word =~ /^\d+$/) { # word contains one or more numbers
 101       - print LOG "changed digit: $word \n";
 102  106    _processNumbers ($self, \@words, $word);
 103       - }elsif ($word =~ /\d/) { # word contains numbers and letters
 104       - print LOG "changed digit: $word \n";
      107  + } elsif ($word =~ /^\d+\.$/) { # word contains one or more numbers and a period at the end
      108  + $word =~ s/\.//g; # remove the period
      109  + _processNumbers ($self, \@words, $word);
      110  + } elsif ($word =~ /\d/) { # word contains numbers and letters
 105  111    _processWordsContainingNumbers ($self, \@words,$word);
 106  112    } elsif ($word =~ /^\.$/) {# $word only contains a period
 107  113    $word =~ s/\.//g;
 108  114    } elsif ($word =~ /\./) {# $word contains a period
 109       - print LOG "changed word with period in it: $word to";
 110       - $word =~ s/\.//g;
 111       - print LOG " $word\n";
      115  + my $wordsbefore;
      116  + if ($word =~ /\.$/){ # period is at end of word
      117  + $word =~ s/\.//g;
      118  + } else {
      119  + $wordsbefore = $word;
      120  + $word =~ s/\.//g;
      121  + push (@periodlog, "from:$wordsbefore to:$word\n");
      122  + }
      123  + push (@words, $word);
      124  + } elsif ($word =~ /\'/) {# $words containing single quotes
      125  + push (@quotelog, "$word\n");
 112  126    push (@words, $word);
 113  127    } else {
   …    …
 117  131    }
 118  132    }
      133  + print LOG "\nWords with period (\".\") removed from body of word - please review:\n";
      134  + print LOG "------------------------------------------------------------------\n";
      135  + foreach my $line (@periodlog) {
      136  + print LOG $line;
      137  + }
      138  + print LOG "\nWords with single quotes (no change made to word) - please review:\n";
      139  + print LOG "------------------------------------------------------------------\n";
      140  + foreach my $line (@quotelog) {
      141  + print LOG $line;
      142  + }
 119  143    $self->{"contents"} = \@words;
 120  144    close(IN);
   …    …
 168  192    my $word1 = $1;
 169  193    my $word2 = $2;
 170       - print LOG "changed URL $word to:$word1 ;DOT;$word2;\n";
      194  + print LOG "changed URL $word to:$word1 DOT $word2;\n";
 171  195    if ($word1 =~ /\d+/) {
 172  196    print "processingUrls: splitword2:$word1 DOT $word2;\n" if $debug;
   …    …
 185  209    my ($self,$wordarray, $subword)= @_;
 186  210    my $debug = $self->{"debug"};
 187       - # separates numbers contained in a word, and converts ordinal poriton of a wordsinto its own word.
      211  + # separates numbers contained in a word, and converts numerical section of a word into its own word.
 188  212    if ($subword =~ /\d+\D+/) { # assume single, consecutive set of numbers (i.e no split numbers in word)
 189  213    my $number = $subword;