=head2 validateAlternatePronunciations

Validate the alternate pronunciations generated by Sequitur G2P against the
recorded audio.  Every prompt line in C<$prompts> that contains a word from the
missing-word list is force aligned against C<$altDict>, to see which
pronunciation HVite picks based on the phonemes it recognizes in the audio.

For manual confirmation of the pronunciations, the wav file of each such prompt
is also copied into F<AudioBook/interim_files/missingWordsFolder> (created on
demand), and only those prompts that contain a missing word are written to a
separate prompt file.

Arguments:

=over 4

=item C<$originalDict>

Accepted for interface compatibility; not used by this method.

=item C<$altDict>

Dictionary handed to C<_forceAlignPromptLine> for every matching prompt line.

=item C<$prompts>

Path of the prompt file; each line is a prompt ID followed by the prompt words.

=back

Files (all below F<AudioBook/interim_files>):

=over 4

=item F<MissingWords_prompts> (written)

Sorted C<word:promptID prompt words> lines, one per missing word occurrence.

=item F<MissingWords_validated> (written)

Sorted C<word [word] phones> lines holding the force-aligned phones.

=item F<MissingWords_out> (read)

Must already exist; each line is read as C<word returnword phones>.

=item F<MissingWords_combined> (written)

For every line of F<MissingWords_out>: the prompts containing the word, the
phones from that line (labelled C<g2p>) and every force-aligned phone list
(labelled C<hvite:>).

=back

Dies (via C<confess>) when one of these files cannot be opened, or when a
written file cannot be closed.  A failed wav copy only produces a warning.

TODO: set this up as an Audacity project containing the wav files and text
prompts.

=cut

sub validateAlternatePronunciations {    # public
    my ($self, $originalDict, $altDict, $prompts) = @_;
    my $audioBook     = $self->{'audiobookObject'};
    my $debug         = $audioBook->getDebug;    # currently unused in this method
    my $missing_words = $self->{'missing_words'};

    # Method calls on lexical file handles (format_name below) need IO::Handle.
    require IO::Handle;

    my $interimDir     = "AudioBook/interim_files";
    my $missingWordDir = "$interimDir/missingWordsFolder";

    my $missingWordList = $self->_getMissingWordList($missing_words);
    my @missingWordsPrompts;
    my @missingWordsValidated;
    print "\nValidate Alternate Pronunciations:\n";
    print "----------------------------------\n";

    open(my $prompts_fh, '<', $prompts)
        or confess("cannot open $prompts file: $!");
    while (my $line = <$prompts_fh>) {
        chomp $line;

        # split ' ' skips leading whitespace and collapses runs of whitespace,
        # so no empty "words" are produced.
        my ($promptID, @promptWords) = split(' ', $line);
        next unless defined $promptID;    # blank line

        foreach my $word (@promptWords) {
            next unless $missingWordList->{$word};

            # There is a missing word in this prompt line.
            my $wavfilename = $promptID . ".wav";
            mkdir($missingWordDir) unless -d $missingWordDir;
            copy("$interimDir/wav/$wavfilename", "$missingWordDir/$wavfilename")
                or warn "cannot copy $interimDir/wav/$wavfilename to $missingWordDir: $!\n";
            push(@missingWordsPrompts, "$word:$promptID @promptWords\n");
            print ".";    # progress indicator

            # Force align the entire prompt line.
            my @phoneList = $self->_forceAlignPromptLine($altDict, $word, $promptID, \@promptWords);
            push(@missingWordsValidated, "$word [$word] @phoneList\n");
        }
    }
    close $prompts_fh;

    # Write the sorted prompt lines and remember them per word; several prompts
    # for the same word are joined with newlines.
    my $promptsFile = "$interimDir/MissingWords_prompts";
    open(my $prompts_out_fh, '>', $promptsFile)
        or confess("cannot open $promptsFile file: $!");
    my %missingWordsPrompts;
    foreach my $line (sort @missingWordsPrompts) {
        print {$prompts_out_fh} $line;
        chomp $line;
        my ($promptWord) = split(/:/, $line);
        print "word:$promptWord\n";
        $missingWordsPrompts{$promptWord} =
            defined $missingWordsPrompts{$promptWord}
            ? "$missingWordsPrompts{$promptWord}\n$line"
            : $line;
    }
    close($prompts_out_fh)
        or confess("cannot close $promptsFile file: $!");

    # Write the sorted force-aligned pronunciations and collect, per word, a
    # list of phone lists (array of array references, see perlref).
    my $validatedFile = "$interimDir/MissingWords_validated";
    open(my $validated_fh, '>', $validatedFile)
        or confess("cannot open $validatedFile file: $!");
    my %missingWordsValidated;
    foreach my $line (sort @missingWordsValidated) {
        print {$validated_fh} $line;
        chomp $line;
        my ($validatedWord, undef, @alignedPhones) = split(' ', $line);
        push @{ $missingWordsValidated{$validatedWord} }, \@alignedPhones;
    }
    close($validated_fh)
        or confess("cannot close $validatedFile file: $!");

    # Combine: for every line of MissingWords_out, print the prompts that
    # contain the word, then the phones from that line, then every aligned
    # pronunciation.
    my $g2pFile      = "$interimDir/MissingWords_out";
    my $combinedFile = "$interimDir/MissingWords_combined";
    open(my $g2p_fh, '<', $g2pFile)
        or confess("cannot open $g2pFile file: $!");
    open(my $combined_fh, '>', $combinedFile)
        or confess("cannot open $combinedFile file: $!");

    # These two are read by the G2P and HVITE formats declared below, so they
    # must be declared at sub scope, ahead of the format declarations.
    my ($word, $phones);

    while (my $line = <$g2p_fh>) {
        chomp $line;
        my @g2pPhones;
        ($word, undef, @g2pPhones) = split(' ', $line);
        next unless defined $word;    # blank line

        my $wordPrompts = $missingWordsPrompts{$word};
        $wordPrompts = '' unless defined $wordPrompts;
        print {$combined_fh} "$wordPrompts\n";

        $phones = join(" ", @g2pPhones);
        $combined_fh->format_name("G2P");
        write($combined_fh);

        foreach my $alignedPhones (@{ $missingWordsValidated{$word} || [] }) {
            $phones = join(" ", @$alignedPhones);
            $combined_fh->format_name("HVITE");
            write($combined_fh);
        }
    }

    # Picture lines are literal output templates and the terminating dot must
    # start the line, hence no indentation here.
format G2P =
@<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
"g2p",$word, $phones
.
format HVITE =
@<<<<<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
" hvite:",$word, $phones
.
    close $g2p_fh;
    close($combined_fh)
        or confess("cannot close $combinedFile file: $!");
    print "----------------------------------\n";
}

=head2 _forceAlignPromptLine

HVite looks at the audio and tries to find a matching phone sequence in
C<$altDict> (which contains many alternate pronunciations generated by Sequitur
G2P); in doing so it picks the most likely pronunciation, thereby validating a
Sequitur G2P pronunciation with real audio.

The alignment itself is delegated to C<< AudioBook::Audio->forceAlign >>, whose
first return value is taken as the path of the aligned output file.  Each line
of that file is read as whitespace-separated fields: start time, stop time,
phone, probability and, optionally, the recognized word.

Returns the list of phones found from the line whose recognized word equals
C<$word> up to (not including) the line that carries a different recognized
word.  C<sp> phones are left out, and lines without a phone field are ignored.
The list is empty when the word does not occur in the aligned output.

Dies (via C<confess>) when the aligned output file cannot be opened.

Assumes only one missing word per prompt line.

=cut

sub _forceAlignPromptLine {    # private
    my ($self, $altDict, $word, $promptID, $promptLine) = @_;
    my $audioBook = $self->{'audiobookObject'};
    my $debug     = $audioBook->getDebug;    # currently unused in this method

    # forceAlign also returns a log as second value; it is not needed here.
    my ($aligned_out) = AudioBook::Audio->forceAlign($self, $promptID, $promptLine, $altDict);
    open(my $aligned_fh, '<', $aligned_out)
        or confess("cannot open $aligned_out file: $!");

    my @phoneList;
    my $gatherPhones = 0;
    while (my $line = <$aligned_fh>) {
        my (undef, undef, $phone, undef, $recword) = split(' ', $line);

        if (defined $recword) {
            if ($recword eq $word) {
                $gatherPhones = 1;    # first phone of the word we look for
            }
            elsif ($gatherPhones) {
                last;                 # a different word starts: we are done
            }
        }
        next unless $gatherPhones;

        # Lines with fewer than three fields (such as a lone "." terminator)
        # carry no phone; skip them instead of collecting undef.
        next unless defined $phone;
        push(@phoneList, $phone) if $phone ne "sp";
    }
    close $aligned_fh;
    return @phoneList;
}

=head2 _getMissingWordList

Read the missing-word file C<$missing_words> (one word per line) into a lookup
table for processing.

Returns a reference to a hash whose keys are the words, each mapped to 1.
Surrounding whitespace (including a trailing carriage return) is stripped from
every line, and blank lines are ignored.

Dies (via C<confess>) when the file cannot be opened.

=cut

sub _getMissingWordList {    # private
    my ($self, $missing_words) = @_;
    my $audioBook = $self->{'audiobookObject'};
    my $debug     = $audioBook->getDebug;    # currently unused in this method

    my %missingWordList;
    open(my $words_fh, '<', $missing_words)
        or confess("cannot open $missing_words file: $!");
    while (my $line = <$words_fh>) {

        # Trim both ends (this also removes the line ending).  Keys are
        # compared against whitespace-split prompt tokens, so a padded or
        # empty key could never be a legitimate match.
        $line =~ s/^\s+|\s+$//g;
        next if $line eq '';
        $missingWordList{$line} = 1;
    }
    close $words_fh;
    return \%missingWordList;
}
| | 183 | |
|---|