voxforge.org
VoxForge Dev
Show
Ignore:
Timestamp:
05/29/08 23:01:55 (6 months ago)
Author:
kmaclean
Message:

AudioSegmentation scripts - Sequitur G2P pronunciation alternatives validation to Audio using HVite Forced Alignment - snapshot

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • Trunk/Scripts/Audio_scripts/AudioSegmentation/AudioBook/Audio.pm

    r2595 r2597  
    2626my $command; 
    2727 
    28 =head1 METHODS (not user accessible) 
     28=head1 CLASS METHODS  
    2929 
    3030=head2 new  
     
    5050}     
    5151 
     52=head2 forceAlign  
     53 
     54Force Aligns an individual prompt line with its corresponding speech audio file. 
     55  
     56=cut 
     57 
     58sub forceAlign {  # public 
     59        my ($class,$super,$fileID, $promptLine, $dict)= @_; 
     60        my $wavfilename = $fileID . "\.wav"; 
     61        my $htk_files = $super->{'htk_files'}; 
     62         
     63        my $segmentMlf = "AudioBook/interim_files/$fileID.mlf"; 
     64        createMLF($class,$fileID, $promptLine,$segmentMlf); 
     65         
     66        my $aligned_out = "AudioBook/interim_files/$fileID.aligned.out"; 
     67        my $log = "AudioBook/interim_files/$fileID.forceAlign.log"; 
     68                 
     69        $command = ("sox AudioBook/interim_files/wav/$wavfilename -c 1 -r 16000 -w AudioBook/interim_files/downsampled.wav 2>&1 > AudioBook/interim_files/logs/Segment_Soxlog"); system($command) == 0 or confess "$command failed: $?"; 
     70        $command = ("HVite -A -D -T 1 -a -b SENT-END -m -C $htk_files/wav_config -H $htk_files/models/macros -H $htk_files/models/hmmdefs -m -t 250.0 150.0 1000.0 -I $segmentMlf  
     71         -i $aligned_out $dict $htk_files/models/tiedlist $wavfilename > $log"); system($command) == 0 or confess "error: $command failed: $?"; 
     72           
     73        return ($aligned_out, $log); 
     74} 
     75 
     76=head2 createMLF  
     77 
     78Creates an MLF (HTK required "Multi-Label File") file for a single prompt line. 
     79  
     80=cut 
     81 
     82sub createMLF { # public 
     83        my ($class,$fileID, $promptLine,$segmentMlf)= @_; 
     84        open(MLF, ">$segmentMlf") or confess ("cannot open $segmentMlf for output");     
     85        print MLF "#!MLF!#\n"; #  
     86        print MLF "\"AudioBook/interim_files/$fileID.lab\"\n"; 
     87        foreach my $word (@$promptLine) { 
     88                print MLF "$word\n"; 
     89        } 
     90        print MLF "\.\n"; 
     91} 
     92 
     93=head1 INSTANCE METHODS 
     94 
    5295=head2 segment  
    5396 
     
    81124           
    82125        _forceAlign($self);  
    83         my $aligned_words= _processHViteOutput($self); 
     126        my $aligned_words= $self->_processHViteOutput(); 
    84127        $self->{"aligned_words"} = $aligned_words; 
    85128                 
    86         print "### segment::$filename ###########################\n";    
     129        print "\nSegmenting:$filename (each dot represents a newly created segmented audio file)\n";     
     130        print "-------------------------------------------------------------------------------\n";               
    87131        # copy to the "interim_files" directory for processing; also converts to 16 bits per sample (-w=16-bits) so it can be processed by HVite 
    88132        $command = ("sox $filename -w AudioBook/interim_files/$filename_nopath"); print "$command\n"; system($command) == 0 or confess "fullrun $command failed: $?";  
     
    130174                } 
    131175        } 
    132          
     176        print "\n"; 
    133177        $command = ("rm AudioBook/interim_files/temp.wav"); print "cmd:$command\n" if $debug; system($command); 
    134178 
     
    136180        print LOG "\nAudio Segmenting summary:\n"; 
    137181        print LOG   "-------------------------\n";       
    138         print LOG "Settings:average_sentence_length: $average_sentence_length\n"; 
    139         print LOG "         max_sentence_length: $max_sentence_length\n";       
     182        print LOG "Settings:average sentence length: $average_sentence_length\n"; 
     183        print LOG "         target max sentence length: $max_sentence_length\n";        
    140184        print LOG "         pause length: $min_pause_for_sentence_break (" . $min_pause_for_sentence_break/10000000 . " seconds)\n\n";    
    141185        print LOG "Sentence Length: min:$min_sentence_length_linenumber: $min_sentence_length_found\n"; 
     
    174218of the speech audio file and figuring out the time stamps for each of the words.  This helps determine where the pauses are. 
    175219 
     220The Sox audio editor is used to downsample the audio to 16kHz-16bit, because HVite only works with audio sampled at 16kHz. 
     221 
    176222The time stamps are put in the "AudioBook/interim_files/aligned.out" file  
    177223 
     
    192238        # forced alignment - creates aligned.out 
    193239        print "\nRunning HVite ...\n"; 
    194         print "if this seems to take too long, check interim_files/logs/HVite_log for a possible explanation\n";  
     240        print "check interim_files/logs/HVite_log for a possible errors\n";  
    195241        print "(like \"no tokens surviving\"... which means that text does not match audio)\n\n"; 
    196242        ####################################################################     
     
    354400                 max:aud0003: 16 
    355401 
    356  Prompt lines with more than max_sentence_length of 20 words: 
     402 Prompt lines with more than target max_sentence_length of 20 words: 
    357403        none 
    358404  
     
    371417                print "increment:$$increment\n"; 
    372418                print "min_sentence_length_found:$min_sentence_length_found:\n"; 
    373                 print "max_sentence_length:$max_sentence_length:\n"; 
     419                print "target max_sentence_length:$max_sentence_length:\n"; 
    374420        } 
    375421        my $sentence_length = (($$sentence_end + $$increment)-$$sentence_start); 
     
    451497                # sox command to create a proper wav file with a RIFF header;  
    452498                $command = ("sox  -t .raw -r $samplerate -sw AudioBook/interim_files/temp.wav AudioBook/interim_files/wav/$filename_prefix$padded_fileid.wav"); print "cmd:$command\n" if $debug; system($command);      
    453                 print "AudioBook/interim_files/wav/$filename_prefix$padded_fileid.wav\n" if not $debug
     499                print "\."
    454500        } else { 
    455501                print "AudioBook/interim_files/wav/$filename_prefix$padded_fileid.wav\t$startTime:$endTime:\n" if $debug; 
     
    471517        my $debug = $self->{"debug"};    
    472518        my $log = $self->{"log"}; 
    473  
    474         print "Verify Prompts (each dot represents a processed prompt file):\n";                 
     519         
     520        open(LOG,">>$log") or confess ("cannot open $log file"); 
     521        print LOG "Checking for \"No tokens survived to final node of network at beam\" warnings:\n";    
     522        print LOG   "----------------------------------------------------------------------------\n"; 
     523                 
     524        print "\nVerify Prompts (each dot represents a processed prompt file):\n";       
     525        print "-------------------------------------------------------------\n";                 
    475526        open(PROMPTS, "<AudioBook/interim_files/prompts") or confess ("cannot open AudioBook/output_files/prompts for output"); 
    476527        my @promptScores;        
     
    482533                $self->_createMLFFile(\@promptList); 
    483534                #print "Force Align:$wavfilename:@promptList\n";         
    484                 $self->_forceAlignSegment($wavfilename); 
     535                $self->_forceAlignSegment($wavfilename, "AudioBook/interim_files/segment.mlf"); 
    485536                my $avgLogLikelihoodPerFrame = $self->_processHviteLog($wavfilename); 
    486537                push (@promptScores,[$avgLogLikelihoodPerFrame,$fileID,"@promptList"]); 
     
    493544        my @sortedlist = sort { $a->[0] cmp $b->[0] } (@promptScores); # sort of 1st element of @promptScores (which is an array of arrays) 
    494545         
    495         open(LOG,">>$log") or confess ("cannot open $log file");         
     546 
    496547        print LOG "\nTop 15 prompts with the lowest average log likelihood per frame\n"; 
    497548        print LOG   "(confirm anything with an avg log likelihood of less than 60):\n";  
     
    506557} 
    507558 
    508 =head3 _createMLFFile  
     559=head2 _createMLFFile  
    509560 
    510561Internal method that creates an MLF (HTK required "Multi-Label File") file for an individual prompt file. 
     
    524575} 
    525576 
    526 =head3 _forceAlignSegment  
    527  
    528 Force Aligns an individual prompt lien with its corresponding speech audio file. 
     577=head2 _forceAlignSegment  
     578 
     579Force Aligns an individual prompt line with its corresponding speech audio file. 
    529580  
    530581=cut 
    531582 
    # _forceAlignSegment($self, $wavfilename [, $segmentMLF])
    #
    # Downsamples AudioBook/interim_files/wav/$wavfilename with sox and
    # force-aligns the result with HVite.
    #
    #   $self        - object hash; only its 'htk_files' entry is read here
    #   $wavfilename - file name (no directory) of the segment to align
    #   $segmentMLF  - optional path of the Master Label File given to HVite
    #                  with -I; defaults to
    #                  "AudioBook/interim_files/segment.mlf"
    #
    # HVite writes the alignment to AudioBook/interim_files/aligned.out and
    # its standard output to
    # AudioBook/interim_files/logs/Segment_$wavfilename.log.
    # Confesses if sox or HVite exits with a non-zero status.
    sub _forceAlignSegment {  # private
        my ($self, $wavfilename, $segmentMLF) = @_;
        my $htk_files = $self->{'htk_files'};

        # Keeps one-argument calls working: fall back to the path that was
        # hard-coded before the MLF became a parameter.
        $segmentMLF = "AudioBook/interim_files/segment.mlf"
            unless defined $segmentMLF;

        # "> file 2>&1" (in this order) sends sox's stderr to the log as well.
        my $sox_cmd = "sox AudioBook/interim_files/wav/$wavfilename -c 1 -r 16000 -w"
                    . " AudioBook/interim_files/downsampled.wav"
                    . " > AudioBook/interim_files/logs/Segment_Soxlog 2>&1";
        system($sox_cmd) == 0 or confess "$sox_cmd failed: $?";

        my $hvite_cmd = join(' ',
            "HVite -A -D -T 1 -a -b SENT-END -m",
            "-C $htk_files/wav_config",
            "-H $htk_files/models/macros",
            "-H $htk_files/models/hmmdefs",
            "-m -t 250.0 150.0 1000.0",
            "-I $segmentMLF",
            "-i AudioBook/interim_files/aligned.out",
            "AudioBook/interim_files/dict",
            "$htk_files/models/tiedlist",
            "AudioBook/interim_files/downsampled.wav",
            "> AudioBook/interim_files/logs/Segment_$wavfilename.log",
        );
        system($hvite_cmd) == 0 or confess "error: $hvite_cmd failed: $?";
    }
    540591 
     
    552603        my $beam_width = $self->{"beam_width"}; 
    553604        my $avgLogLikelihoodPerFrame; 
     605        my $log = $self->{"log"};        
    554606         
    555607        open (Segment_Log,"AudioBook/interim_files/logs/Segment_$wavfilename.log") || confess "error: can't open AudioBook/interim_files/logs/Segment_$wavfilename.log: $?"; 
     
    564616                        my $beam = pop (@line); 
    565617                        $beam =~ s/ //g; 
    566                         print "**** check that audio corresponds to prompt in $wavfilename *** beam= $beam\n"; 
     618                        print "\n**** check that audio corresponds to prompt in $wavfilename *** beam= $beam\n"; 
     619                        print LOG "check that audio corresponds to prompt in $wavfilename; beam= $beam\n";                       
    567620                        if ($beam > $beam_width) { 
    568621                                confess "audio not corresponding to prompt file, check HVite_Log; error code: $?" ;