voxforge.org
VoxForge Dev
Show
Ignore:
Timestamp:
06/09/08 21:09:55 (6 months ago)
Author:
kmaclean
Message:

AudioSegmentation scripts - snapshot

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • Trunk/Scripts/Audio_scripts/AudioSegmentation/AudioBook/Dictionary.pm

    r2606 r2608  
    158158        my $model = $audioBook->getG2p_model(); 
    159159        my $missing_words = $self->{'missing_words'}; 
    160         my $log = $self->getLog(); 
     160        my $log = $audioBook->getLog(); 
    161161         
    162162        $self->{"missing_words_alt"} = $missing_words_alt; 
     
    181181} 
    182182 
    183 =head2 validateAlternatePronunciations  
    184  
    185 Add alternate pronunciations generated by Sequitur G2P to a copy of the original dict file, then perform forced alignment to see which 
    186 pronunciation HVite picks, based on the phonemes it recognizes in the audio. 
    187  
    188 Also create a folder containing the missing words and a prompt file containing only those prompts that contain the missing words, 
    189 for manual confirmation of pronunciations.   
    190 todo: set this up as an audacity project containing the wav files and text prompts. 
    191  
    192 =cut 
    193  
    194 sub validateAlternatePronunciations { # public  
    195         my ($self,$originalDict,$altDict,$prompts)= @_; 
    196         my $audioBook = $self->{'audiobookObject'}; 
    197         my $debug = $audioBook->getDebug; 
    198         my $missing_words = $self->{'missing_words'};    
    199          
    200         my $missingWordList = $self->_getMissingWordList($missing_words); 
    201         my %prompts; 
    202         my @missingWordsPrompts;         
    203         my @missingWordsValidated; 
    204         print "\nValidate Alternate Pronunciations:\n";  
    205         print   "----------------------------------\n";  
    206         open(PROMPTS,"$prompts") or confess ("cannot open $prompts file"); 
    207         while (my $line = <PROMPTS>) {  
    208                 chomp $line; 
    209                 my @line = split(/\s/, $line); 
    210                 my $promptID = shift @line; 
    211                 foreach my $word (@line) { 
    212                         if ($missingWordList->{$word}) { # there is a missing word in this prompt line 
    213                                 my $wavfilename = $promptID . "\.wav"; 
    214                                 copy("AudioBook/interim_files/wav/$wavfilename","AudioBook/interim_files/missingWordsFolder/$wavfilename");                              
    215                                 push (@missingWordsPrompts,"$word:$promptID @line\n"); 
    216                                 print "\."; 
    217                                 my @phoneList = $self->_forceAlignPromptLine($altDict, $word,$promptID,\@line); # force align entire prompt line 
    218                                 push (@missingWordsValidated,"$word [$word] @phoneList\n"); 
    219                         } 
    220                 } 
    221         } 
    222         close PROMPTS; 
    223          
    224         open(MISSINGWORDPROMPTS,">AudioBook/interim_files/MissingWords_prompts") or confess ("cannot open AudioBook/interim_files/missingWords_prompts file");           
    225         my %missingWordsPrompts; 
    226         foreach my $line (sort(@missingWordsPrompts)) { 
    227                 print MISSINGWORDPROMPTS $line; 
    228                 chomp $line; 
    229                 my @temp = split (/:/,$line); 
    230                 my ($word) = shift(@temp);  
    231                 print "word:$word\n"; 
    232                 if (defined($missingWordsPrompts{$word})) { 
    233                         my $temp = "$missingWordsPrompts{$word}\n"; 
    234                         $missingWordsPrompts{$word} = $temp . "$line";                   
    235                 } else { 
    236                         $missingWordsPrompts{$word} = $line; 
    237                 } 
    238                  
    239         } 
    240         close MISSINGWORDPROMPTS;        
    241         # !!!!!!                 
    242         open(MISSINGWORDSVAL,">AudioBook/interim_files/MissingWords_validated") or confess ("cannot open AudioBook/interim_files/missingWords_validated file");          
    243         my %missingWordsValidated; 
    244         foreach my $line (sort(@missingWordsValidated)) { 
    245                 print MISSINGWORDSVAL $line;             
    246                 chomp $line; 
    247                 my @temp = split (/\s/,$line); 
    248                 my ($word,$returnword, @phones) = @temp;  
    249                 if (defined($missingWordsValidated{$word})) { 
    250                         my $array = $missingWordsValidated{$word}; 
    251                         push @$array,\@phones;  # see perlref 
    252                 } else { 
    253                         $missingWordsValidated{$word} = []; 
    254                         my $array = $missingWordsValidated{$word};               
    255                         push @$array,\@phones;  # see perlref 
    256                 } 
    257         } 
    258         close MISSINGWORDSVAL; 
    259          
    260         open(MISSINGWORDSOUT,"AudioBook/interim_files/MissingWords_out") or confess ("cannot open AudioBook/interim_files/MissingWords_out file");               
    261         open(MISSINGWORDSCOMB,">AudioBook/interim_files/MissingWords_combined") or confess ("cannot open AudioBook/interim_files/missingWords_validated file"); 
    262         my ($word,$returnword, $phones, @phones);        
    263  
    264         while (my $line = <MISSINGWORDSOUT>) { 
    265                 chomp $line; 
    266                 my @temp = split (/\s+/,$line); 
    267                 ($word,$returnword, @phones) = @temp;  
    268                 print MISSINGWORDSCOMB "$missingWordsPrompts{$word}\n"; 
    269                 $phones = join(" ",@phones); 
    270         format_name MISSINGWORDSCOMB "G2P"; 
    271                 write MISSINGWORDSCOMB; 
    272                 my $array = $missingWordsValidated{$word};               
    273                 foreach my $lines (@$array) { 
    274                         $phones = join(" ",@$lines); 
    275                 format_name MISSINGWORDSCOMB "HVITE"; 
    276                         write MISSINGWORDSCOMB; 
    277                 } 
    278         } 
    279         format PROMPTS = 
    280 @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 
    281 "g2p",$word, $phones 
    282 
    283         format G2P = 
    284 @<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 
    285 "g2p",$word, $phones 
    286 
    287         format HVITE = 
    288 @<<<<<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 
    289 "  hvite:",$word, $phones 
    290 
    291         close MISSINGWORDSOUT; 
    292         close MISSINGWORDSCOMB; 
    293         # !!!!!!         
    294         print   "----------------------------------\n";  
    295 
    296  
    297 =head2 _forceAlignPromptLine  
    298  
    299 HVite looks at the audio and tries to find a matching phone sequence in $altDict (which contains many alternate pronunciations generated by  
    300 Sequitur G2P); in doing so, it picks the most likely pronunciation, thereby validating a Sequitur G2P pronunciation against real audio. 
    301  
    302 Assumes only one missing word per prompt line 
    303  
    304 =cut 
    305  
    306 sub _forceAlignPromptLine { # private  
    307         my ($self,$altDict,$word,$promptID,$promptLine)= @_; 
    308         my $audioBook = $self->{'audiobookObject'}; 
    309         my $debug = $audioBook->getDebug; 
    310                  
    311         my ($aligned_out, $log) = AudioBook::Audio->forceAlign($self, $promptID, $promptLine, $altDict); 
    312         open(ALIGNED_OUT,"$aligned_out") or confess ("cannot open $aligned_out file"); 
    313         my (@phoneList,$gatherPhones); 
    314         while (my $line = <ALIGNED_OUT>) { 
    315                 my @line = split(/\s/, $line); 
    316                 my($startTime,$stopTime, $phone, $probability, $recword) = @line; 
    317                 if (defined($recword)) { 
    318                         if ($recword eq $word) { 
    319                                 $gatherPhones=1; 
    320                         } elsif ($gatherPhones) { 
    321                                 last; 
    322                         } 
    323                 }  
    324                 if ($gatherPhones) { 
    325                         if ($phone ne "sp"){ 
    326                                 push (@phoneList,$phone); 
    327                         } 
    328                 } 
    329         }  
    330         return @phoneList; 
    331 
    332  
    333 =head2 _getMissingWordList  
    334  
    335 Read the missing word list (one word per line) into a hash keyed by word, and return a reference to that hash. 
    336  
    337 =cut 
    338  
    339 sub _getMissingWordList { # private  
    340         my ($self,$missing_words)= @_; 
    341         my $audioBook = $self->{'audiobookObject'}; 
    342         my $debug = $audioBook->getDebug;        
    343          
    344         my %missingWordList; 
    345         open(MISSINGWORDS,"$missing_words") or confess ("cannot open $missing_words file");      
    346         while (my $line = <MISSINGWORDS>) {  
    347                 chomp $line; 
    348                 $missingWordList{$line} = 1; 
    349         } 
    350         close MISSINGWORDS; 
    351         return \%missingWordList; 
    352 
     183 
    353184 
    354185=head2 createAltDict  
     
    364195        my $audioBook = $self->{'audiobookObject'}; 
    365196        my $debug = $audioBook->getDebug; 
     197        print "!!!!!!createAltDict\n"; 
    366198        my $missing_words_alt = $self->{"missing_words_alt"};    
    367199 
     
    434266        close(DICTIONARY); 
    435267        return 1; 
     268} 
     269 
     270=head2 Gettors  
     271 
     272=item * getAverage_sentence_length() 
     273 
     274=cut 
     275 
     276sub getMissing_words { 
     277        my $self = shift; 
     278        return $self->{"missing_words"}; 
    436279} 
    437280