voxforge.org
VoxForge Dev
Show
Ignore:
Timestamp:
05/26/08 14:11:03 (6 months ago)
Author:
kmaclean
Message:

AudioSegmentation scripts -add POD docs

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • Trunk/Scripts/Audio_scripts/AudioSegmentation/AudioBook/Audio.pm

    r2591 r2593  
    11#! /usr/bin/perl 
    2 #################################################################### 
    3 ### 
    4 ### script name : Audio.pm 
    5 ### version: 0.2 
    6 ### created by: Ken MacLean 
    7 ### mail: contact@voxforge.org 
    8 ### Date: 2007.03.22 
    9 ###    
    10 ### Copyright (C) 2007 Ken MacLean 
    11 ### 
    12 ### This program is free software; you can redistribute it and/or 
    13 ### modify it under the terms of the GNU General Public License 
    14 ### as published by the Free Software Foundation; either version 2 
    15 ### of the License, or (at your option) any later version. 
    16 ### 
    17 ### This program is distributed in the hope that it will be useful, 
    18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of 
    19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
    20 ### GNU General Public License for more details. 
    21 ###  
    22 ### Changes:                                                             
    23 ### 2007/06/12 - 0.1.2 - modularize code 
    24 ### 2008/05/02 - 0.2 - Convert to class; major refacture; renamed from htksegment to Audio.pm 
    25 ### 
    26 ### Current Issues: 
    27 ### Warning:  Use of uninitialized value in numeric gt (>) "if ($sentence_length>$max_sentence_length) {" 
    28 #################################################################### 
     2$VERSION = 0.2; 
     3 
     4=head1 NAME 
     5 
     6AudioBook::Audio - Audio processing   
     7 
     8=cut  
     9 
    2910package AudioBook::Audio; 
    3011use strict; 
     
    4829my $down_increment = -1; 
    4930my $command; 
    50 #################################################################### 
    51 ### Constructor 
    52 #################################################################### 
     31 
     32=head1 METHODS (not user accessible) 
     33 
     34=cut 
     35 
     36=head2 new  
     37 
     38Constructor - creates an audio object 
     39 
     40=cut 
     41 
    5342sub new { 
    5443        my ($class,$super) = @_;  
     
    6655        return \%self; 
    6756}     
    68 #################################################################### 
    69 ### Methods 
    70 #################################################################### 
     57 
     58=head2 segment  
     59 
     60Public Method that does the actual segmentation of the speech audio file using the text file supplied by the user.  
     61 
     62First pass Forced Alignment on user submitted speech audio file and corresponding transcription text.  This generates  
     63the segmented audio files with corresponding prompt entries in the PROMPTS file.  
     64 
     65=cut 
     66 
     67sub segment { # public 
     68        my ($self,$filename,$textContents) = @_;         
     69        my $debug = $self->{"debug"};            
     70        my $log = $self->{"log"}; 
     71         
     72        my $filename_nopath = basename($filename); 
     73        my $filename_nosuffix = fileparse($filename, "wav"); 
     74        $filename_nosuffix  =~ s/\.//;   
     75        $self->{"filename"} = $filename;         
     76        $self->{"filename_nosuffix"} = $filename_nosuffix; 
     77        $self->{"filename_nopath"} = $filename_nopath; 
     78        $self->{"filename_prefix"} = lc(substr($filename_nopath,0,3)); 
     79        $self->{"textContents"} = $textContents; 
     80         
     81        my $average_sentence_length = $self->{"average_sentence_length"}; 
     82        my $max_sentence_length = $self->{"max_sentence_length"}; 
     83        my $min_pause_for_sentence_break = $self->{"min_pause_for_sentence_break"}; 
     84 
     85        my $samplerate = _getSampleRate($self); 
     86        $self->{"samplerate"} = $samplerate; 
     87           
     88        _forceAlign($self);  
     89        my $aligned_words= _processHViteOutput($self); 
     90        $self->{"aligned_words"} = $aligned_words; 
     91                 
     92        print "### segment::$filename ###########################\n";    
     93        # copy to "interim_files"" directory for processing; also converts to 16 bits per sample (-w=16-bits) so it can be processed by HVite 
     94        $command = ("sox $filename -w AudioBook/interim_files/$filename_nopath"); print "$command\n"; system($command) == 0 or confess "fullrun $command failed: $?";  
     95         
     96        my $loop; 
     97        my $lastSentence = 0; 
     98        my $sentence_start = 0; 
     99        my $sentence_end = $average_sentence_length; 
     100        my $up = 1; 
     101        my $down = 0; 
     102 
     103        my $fileid = 1; 
     104        my @aligned_line = split(/ /,$$aligned_words[$sentence_end]); 
     105        my ($word,$startTime,$endTime,$pause) = @aligned_line;   
     106        open(PROMPTS, ">AudioBook/interim_files/prompts") or confess ("cannot open AudioBook/output_files/prompts for output");  
     107        while (!$lastSentence) { 
     108                $loop++; 
     109                if ($up) { 
     110                        if (($sentence_end + $up_increment) < $#$aligned_words) { 
     111                                #print "segment-up:" . ($sentence_end + $up_increment). "sentence_end:$sentence_end;up_increment:$up_increment\n"; # !!!!!! 
     112                                _sentence_test ($self, "up", $up_increment, $aligned_words, \$fileid, \$sentence_start, \$sentence_end); 
     113                                $up = 0; 
     114                                $down = 1; 
     115                                $up_increment++; 
     116                        } elsif (($sentence_end + $up_increment) >= $#$aligned_words)   { # catches last prompt line 
     117                                $sentence_end = $#$aligned_words; 
     118                                #print "sentence_end:$sentence_end\n"; # !!!!!! 
     119                        _last_sentence ($self, $aligned_words, \$fileid, \$sentence_start, \$sentence_end); 
     120                                $lastSentence = 1; 
     121                        }  
     122                } elsif ($down) { 
     123                        if ((($sentence_end + $down_increment) >= 0) and (($sentence_end + $down_increment)>$sentence_start) and (($sentence_end + $down_increment) < $#$aligned_words)){ 
     124                                #print "segment-down:" . ($sentence_end + $down_increment) . "sentence_end:$sentence_end;down_increment:$down_increment\n"; # !!!!!! 
     125                                _sentence_test ($self, "down", $down_increment,$aligned_words, \$fileid, \$sentence_start, \$sentence_end); 
     126                                $down = 0; 
     127                                $up = 1; 
     128                                $down_increment--; 
     129                        } elsif (($sentence_end + $down_increment) <= $sentence_start) { 
     130                                $up = 1; 
     131                        } elsif (($sentence_end + $down_increment) >= $#$aligned_words) { # sometimes the sentence_end is larger than the total number of words. 
     132                                $up = 1; 
     133                        } elsif (($sentence_end + $down_increment) < 0)  { 
     134                                confess "Error\n"; 
     135                        } 
     136                } 
     137        } 
     138         
     139        $command = ("rm AudioBook/interim_files/temp.wav"); print "cmd:$command\n" if $debug; system($command); 
     140 
     141        open(LOG,">>$log") or confess ("cannot open $log file");         
     142        print LOG "\nAudio Segmenting summary:\n"; 
     143        print LOG   "-------------------------\n";       
     144        print LOG "Settings:average_sentence_length: $average_sentence_length\n"; 
     145        print LOG "         max_sentence_length: $max_sentence_length\n";         
     146        print LOG "         pause length: $min_pause_for_sentence_break (" . $min_pause_for_sentence_break/10000000 . " seconds)\n\n";    
     147        print LOG "Sentence Length: min:$min_sentence_length_linenumber: $min_sentence_length_found\n"; 
     148        print LOG "                 max:$max_sentence_length_linenumber: $max_sentence_length_found\n\n"; 
     149        print LOG "Prompt lines with more than max_sentence_length of $max_sentence_length words:\n"; 
     150        if (@max_sentences) { 
     151                foreach my $line (@max_sentences) { 
     152                        print LOG "\t$line\n"; 
     153                }        
     154        } else { 
     155                print LOG "\tnone\n"; 
     156        } 
     157        close LOG; 
     158
     159 
     160=head3 _getSampleRate  
     161 
     162gets the sample rate of the audio file using Audio::Wav module 
     163 
     164=cut 
     165 
    71166sub _getSampleRate { # private 
    72167        my ($self) = @_;         
     
    79174        return ($$details{sample_rate}); #sampling rate 
    80175} 
     176 
     177=head3 _forceAlign  
     178 
     179Performs "Forced Alignment" of the user submitted speech audio file.  This is the process of taking the text transcription  
     180of the speech audio file and figuring out the time stamps for each of the words.  This helps determine where the pauses are. 
     181 
     182The time stamps are put in the "AudioBook/interim_files/aligned.out" file  
     183 
     184=cut 
    81185 
    82186sub     _forceAlign { # private 
     
    130234} 
    131235 
     236=head3 _processHViteOutput  
     237 
     238This method reads the HVite output in the "aligned.out" file (generated by the _forceAlign method) and parses it into a  
     239more user-friendly format, and puts it into the "AudioBook/interim_files/htksegment_log" file.  
     240 
     241=cut 
     242 
    132243sub _processHViteOutput { # private 
    133244        my ($self) = @_;         
     
    170281} 
    171282 
    172 sub segment { # public 
    173         my ($self,$filename,$textContents) = @_;         
    174         my $debug = $self->{"debug"};            
    175         my $log = $self->{"log"}; 
    176          
    177         my $filename_nopath = basename($filename); 
    178         my $filename_nosuffix = fileparse($filename, "wav"); 
    179         $filename_nosuffix  =~ s/\.//;   
    180         $self->{"filename"} = $filename;         
    181         $self->{"filename_nosuffix"} = $filename_nosuffix; 
    182         $self->{"filename_nopath"} = $filename_nopath; 
    183         $self->{"filename_prefix"} = lc(substr($filename_nopath,0,3)); 
    184         $self->{"textContents"} = $textContents; 
    185          
    186         my $average_sentence_length = $self->{"average_sentence_length"}; 
    187         my $max_sentence_length = $self->{"max_sentence_length"}; 
    188         my $min_pause_for_sentence_break = $self->{"min_pause_for_sentence_break"}; 
    189  
    190         my $samplerate = _getSampleRate($self); 
    191         $self->{"samplerate"} = $samplerate; 
    192            
    193         _forceAlign($self);  
    194         my $aligned_words= _processHViteOutput($self); 
    195         $self->{"aligned_words"} = $aligned_words; 
    196                  
    197         print "### segment::$filename ###########################\n";    
    198         # copy to "interim_files"" directory for processing; also converts to 16 bits per sample (-w=16-bits) so it can be processed by HVite 
    199         $command = ("sox $filename -w AudioBook/interim_files/$filename_nopath"); print "$command\n"; system($command) == 0 or confess "fullrun $command failed: $?";  
    200          
    201         my $loop; 
    202         my $lastSentence = 0; 
    203         my $sentence_start = 0; 
    204         my $sentence_end = $average_sentence_length; 
    205         my $up = 1; 
    206         my $down = 0; 
    207  
    208         my $fileid = 1; 
    209         my @aligned_line = split(/ /,$$aligned_words[$sentence_end]); 
    210         my ($word,$startTime,$endTime,$pause) = @aligned_line;   
    211         open(PROMPTS, ">AudioBook/interim_files/prompts") or confess ("cannot open AudioBook/output_files/prompts for output");  
    212         while (!$lastSentence) { 
    213                 $loop++; 
    214                 if ($up) { 
    215                         if (($sentence_end + $up_increment) < $#$aligned_words) { 
    216                                 #print "segment-up:" . ($sentence_end + $up_increment). "sentence_end:$sentence_end;up_increment:$up_increment\n"; # !!!!!! 
    217                                 _sentence_test ($self, "up", $up_increment, $aligned_words, \$fileid, \$sentence_start, \$sentence_end); 
    218                                 $up = 0; 
    219                                 $down = 1; 
    220                                 $up_increment++; 
    221                         } elsif (($sentence_end + $up_increment) >= $#$aligned_words)   { # catches last prompt line 
    222                                 $sentence_end = $#$aligned_words; 
    223                                 #print "sentence_end:$sentence_end\n"; # !!!!!! 
    224                         _last_sentence ($self, $aligned_words, \$fileid, \$sentence_start, \$sentence_end); 
    225                                 $lastSentence = 1; 
    226                         }  
    227                 } elsif ($down) { 
    228                         if ((($sentence_end + $down_increment) >= 0) and (($sentence_end + $down_increment)>$sentence_start) and (($sentence_end + $down_increment) < $#$aligned_words)){ 
    229                                 #print "segment-down:" . ($sentence_end + $down_increment) . "sentence_end:$sentence_end;down_increment:$down_increment\n"; # !!!!!! 
    230                                 _sentence_test ($self, "down", $down_increment,$aligned_words, \$fileid, \$sentence_start, \$sentence_end); 
    231                                 $down = 0; 
    232                                 $up = 1; 
    233                                 $down_increment--; 
    234                         } elsif (($sentence_end + $down_increment) <= $sentence_start) { 
    235                                 $up = 1; 
    236                         } elsif (($sentence_end + $down_increment) >= $#$aligned_words) { # sometimes the sentence_end is larger than the total number of words. 
    237                                 $up = 1; 
    238                         } elsif (($sentence_end + $down_increment) < 0)  { 
    239                                 confess "Error\n"; 
    240                         } 
    241                 } 
    242         } 
    243          
    244         $command = ("rm AudioBook/interim_files/temp.wav"); print "cmd:$command\n" if $debug; system($command); 
    245  
    246         open(LOG,">>$log") or confess ("cannot open $log file");         
    247         print LOG "\nAudio Segmenting summary:\n"; 
    248         print LOG   "-------------------------\n";       
    249         print LOG "Settings:average_sentence_length: $average_sentence_length\n"; 
    250         print LOG "         max_sentence_length: $max_sentence_length\n";         
    251         print LOG "         pause length: $min_pause_for_sentence_break (" . $min_pause_for_sentence_break/10000000 . " seconds)\n\n";    
    252         print LOG "Sentence Length: min:$min_sentence_length_linenumber: $min_sentence_length_found\n"; 
    253         print LOG "                 max:$max_sentence_length_linenumber: $max_sentence_length_found\n\n"; 
    254         print LOG "Prompt lines with more than max_sentence_length of $max_sentence_length words:\n"; 
    255         if (@max_sentences) { 
    256                 foreach my $line (@max_sentences) { 
    257                         print LOG "\t$line\n"; 
    258                 }        
    259         } else { 
    260                 print LOG "\tnone\n"; 
    261         } 
    262         close LOG; 
    263 
     283=head3 _sentence_test  
     284 
     285Determines if pause is long enough after the 15th word, and if not, iterates up and down until a word with a suitable pause  
     286duration is found. 
     287  
     288=cut 
    264289 
    265290sub _sentence_test { # private 
     
    328353} 
    329354 
     355=head3 _calculateStats  
     356 
     357Calculates prompt statistics for the Audio Segmenting summary in the AudioBook_Log as follows: 
     358 
     359 Sentence Length: min:aud0002: 11 
     360                 max:aud0003: 16 
     361 
     362 Prompt lines with more than max_sentence_length of 20 words: 
     363        none 
     364  
     365=cut 
     366 
    330367sub _calculateStats { # Calculate min and max sentence 
    331368        my ($self,$sentence_start,$sentence_end,$increment,$filename_prefix,$padded_fileid ) = @_; 
     
    357394} 
    358395 
     396=head3 _last_sentence  
     397 
     398Special processing was required for the last prompt to segment properly. 
     399  
     400=cut 
     401 
    359402sub _last_sentence { # private 
    360403        my ($self, $aligned_words, $fileid, $sentence_start, $sentence_end) = @_; 
     
    389432        _processAudio ($self, $startTime, $endTime,$padded_fileid);              
    390433} 
     434 
     435=head3 _processAudio  
     436 
     437Create a segmented audio file from the original audio file using startTime and endTime generated from segmentation routines. 
     438  
     439=cut 
    391440 
    392441sub _processAudio { # private 
     
    413462        } 
    414463} 
     464 
     465=head2 verifySegments  
     466 
     467Create a segmented audio file from the original audio file using startTime and endTime generated from segmentation routines. 
     468 
     469Second pass Forced Alignment on the segmented audio files generated from the segment method.  It tries to perform forced alignment  
     470on each individual segmented audio file using the corresponding prompt line, to confirm that they match.  If they do not, it flags  
     471the prompt for further editing.  
     472  
     473=cut 
    415474 
    416475sub verifySegments { #public 
     
    450509} 
    451510 
     511=head3 _createMLFFile  
     512 
     513Internally creates an MLF (the HTK-required "Master Label File") for an individual prompt file. 
     514  
     515=cut 
     516 
    452517sub _createMLFFile { # private 
    453518        my ($self,$promptList)= @_; 
     
    462527} 
    463528 
     529=head3 _forceAlignSegment  
     530 
     531Force Aligns an individual prompt line with its corresponding speech audio file. 
     532  
     533=cut 
     534 
    464535sub _forceAlignSegment {  # private 
    465536        my ($self,$wavfilename)= @_; 
     
    470541        $command = ("HVite -A -D -T 1 -a -b SENT-END -m -C $htk_files/wav_config -H $htk_files/models/macros -H $htk_files/models/hmmdefs -m -t 250.0 150.0 1000.0 -I AudioBook/interim_files/segment.mlf  -i AudioBook/interim_files/aligned.out AudioBook/interim_files/dict $htk_files/models/tiedlist AudioBook/interim_files/downsampled.wav > AudioBook/interim_files/logs/Segment_$wavfilename.log"); system($command) == 0 or confess "error: $command failed: $?"; 
    471542} 
     543 
     544=head3 _processHviteLog  
     545 
     546Checks to make sure that the forced alignment process worked properly, if not, flags as an error. 
     547 
     548Checks for "No tokens survived to final node of network at beam" warnings in the HVite log file. 
     549  
     550=cut 
    472551 
    473552sub _processHviteLog {  # private 
     
    504583} 
    505584 
    506 #################################################################### 
    507 ### Gettors - Public 
    508 #################################################################### 
     585=head2 Getters - Public (used by methods in other sub-classes) 
     586 
     587=item * getFilename() 
     588 
     589=cut 
     590 
    509591sub getFilename { 
    510592        my $self = shift; 
     
    512594} 
    513595 
     596=item * getFilename_nopath() 
     597 
     598=cut 
     599 
    514600sub getFilename_nopath { 
    515601        my $self = shift; 
     
    517603} 
    518604 
     605=item * filename_prefix() 
     606 
     607=cut 
     608 
    519609sub filename_prefix { # public getter: returns the object's "filename_prefix" entry (segment() stores the lowercased first three characters of the file's basename there)
    520610        my $self = shift; 
    521611        return $self->{"filename_prefix"} ; 
    522612} 
     613 
     614=head1 Change Log     
     615 
     616        2008/05/02 - 0.2 - Convert to class; major refactoring; renamed from htksegment to Audio.pm 
     617        2007/06/12 - 0.1.2 - modularize code 
     618        2007.03.22 - 0.1 - created 
     619         
     620=head1 Current Unresolved Issues: 
     621 
     622Warning:  Use of uninitialized value in numeric gt (>) "if ($sentence_length>$max_sentence_length) {" 
     623 
     624=head1 AUTHOR 
     625     
     626Ken MacLean 
     627contact@voxforge.org 
     628       
     629=head1 COPYRIGHT AND LICENSE        
     630       
     631Copyright (C) 2007 Ken MacLean 
     632    
     633This program is free software; you can redistribute it and/or 
     634modify it under the terms of the GNU General Public License 
     635as published by the Free Software Foundation; either version 2 
     636of the License, or (at your option) any later version. 
     637    
     638This program is distributed in the hope that it will be useful, 
     639but WITHOUT ANY WARRANTY; without even the implied warranty of 
     640MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
     641GNU General Public License for more details. 
     642     
     643=cut 
     644     
    5236451;