- Timestamp:
- 05/30/08 14:49:08 (6 months ago)
- Files:
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
Trunk/Scripts/Audio_scripts/AudioSegmentation/AudioBook/Dictionary.pm
r2598 r2599 12 12 use diagnostics; 13 13 use Carp; 14 use File::Copy; 14 15 use Lingua::EN::Numbers qw(num2en num2en_ordinal); 15 16 use Lingua::EN::Numbers::Years; … … 176 177 177 178 Add alternate pronunications generated by Sequitor G2P to original dict file, then perform forced alingment to see which 178 pronunciation that HVite picks, based on the phonemes it recognizes fromthe audio.179 pronunciation that HVite picks, based on the phonemes it recognizes in the audio. 179 180 180 181 Also create a folder containing the missing words and a prompt file containing with only those prompts that contain the missing words, 181 for manual confirmation of pronunciations. 182 for manual confirmation of pronunciations. 183 todo: set this up as an audacity project containing the wav files and text prompts. 182 184 183 185 =cut … … 188 190 my $missing_words = $self->{'missing_words'}; 189 191 190 $self->_createAltDict($originalDict,$altDict);191 192 my $missingWordList = $self->_getMissingWordList($missing_words); 192 193 my %prompts; 193 194 open(PROMPTS,"$prompts") or confess ("cannot open $prompts file"); 194 open(MISSINGWORDPROMPTS," AudioBook/interim_files/missingWordsFolder/missingWordPrompts") or confess ("cannot open AudioBook/interim_files/wav/missingWordPrompts file");195 open(MISSINGWORDSVAL," AudioBook/interim_files/missingWords_val") or confess ("cannot open AudioBook/interim_files/wav/missingWordPromptsfile");195 open(MISSINGWORDPROMPTS,">AudioBook/interim_files/missingWordsFolder/missingWordPrompts") or confess ("cannot open AudioBook/interim_files/wav/missingWordPrompts file"); 196 open(MISSINGWORDSVAL,">AudioBook/interim_files/missingWords_validated") or confess ("cannot open AudioBook/interim_files/missingWords_validated file"); 196 197 while (my $line = <PROMPTS>) { 197 198 chomp $line; … … 199 200 my $promptID = shift @line; 200 201 foreach my $word (@line) { 201 if ($missingWordList->{$word}) { 202 if ($missingWordList->{$word}) { # there is a missing word in this prompt line 202 203 my $wavfilename = $promptID . "\.wav"; 203 204 copy("AudioBook/interim_files/wav/$wavfilename","AudioBook/interim_files/missingWordsFolder/$wavfilename"); 204 205 print MISSINGWORDPROMPTS "$word:$promptID,@line\n"; 205 my @phoneList = _forceAlignPromptLine($word,$promptID,@line);206 my @phoneList = $self->_forceAlignPromptLine($altDict, $word,$promptID,\@line); # force align entire prompt line 206 207 print MISSINGWORDSVAL "$word [$word] @phoneList\n"; 207 208 } … … 217 218 Sequitor G2P), in doing so, it picks the most likely pronunciation, thereby validating a Sequitor G2P with real audio. 218 219 220 Assumes only one missing word per prompt line 221 219 222 =cut 220 223 221 224 sub _forceAlignPromptLine { # private 222 my ($self,$ word,$promptID,$promptLine)= @_;223 my $altDict = $self->{"altDict"};224 my ($aligned_out, $log) = AudioBook::Audio ::forceAlign($self, $promptID, $promptLine, $altDict);225 my ($self,$altDict,$word,$promptID,$promptLine)= @_; 226 227 my ($aligned_out, $log) = AudioBook::Audio->forceAlign($self, $promptID, $promptLine, $altDict); 225 228 open(ALIGNED_OUT,"$aligned_out") or confess ("cannot open $aligned_out file"); 226 229 my (@phoneList,$gatherPhones); 230 print "_forceAlignPromptLine:word=$word\n"; 227 231 while (my $line = <ALIGNED_OUT>) { 228 232 my @line = split(/\s/, $line); 233 print "_forceAlignPromptLine:line:@line\n"; 229 234 my($startTime,$stopTime, $phone, $probability, $recword) = @line; 235 print "_forceAlignPromptLine:$startTime,$stopTime, $phone, $probability, $recword\n"; 230 236 if (defined($recword)) { 231 237 if ($recword eq $word) { 232 238 $gatherPhones=1; 233 } els e{234 $gatherPhones=0;239 } elsif ($gatherPhones) { 240 last; 235 241 } 236 } 242 } 243 237 244 if ($gatherPhones) { 238 push (@phoneList,$phone); 245 if ($phone ne "sp"){ 246 push (@phoneList,$phone); 247 } 239 248 } 240 249 } … … 262 271 =head2 createAltDict 263 272 264 Need to merge the output from Sequitor G2P in getAlternatePronunciations() to the dict for the submission 265 266 (todo: may need to create individual dict files for each out-of-vocabulary word to speed things up) 267 268 =cut 269 270 sub _createAltDict { # private273 Merge the output from Sequitor G2P to HDMan's dict file (for the submission... this is not dict file for a particular prompt line) 274 275 todo: may need to create individual dict files for each out-of-vocabulary word to speed things up 276 277 =cut 278 279 sub createAltDict { # 271 280 my ($self,$originalDict,$altDict)= @_; 272 281 my $missing_words_alt = $self->{"missing_words_alt"}; 273 282 274 $self->{"altDict"} = $altDict;275 276 283 open(MISSINGWORDSALT,"$missing_words_alt") or confess ("cannot open $missing_words_alt file"); 277 284 my @missingWordAlt;