| 103 | | =head2 segment |
|---|
| | 103 | =head1 processChapterAudio |
|---|
| | 104 | |
|---|
| | 105 | gets audio file sample rate, performs forced alignment (using HVite), and converts HVite |
|---|
| | 106 | output to include pauses. |
|---|
| | 107 | |
|---|
| | 108 | Creates file containing "$current_word $current_startTime $current_endTime $pause" |
|---|
| | 109 | |
|---|
| | 110 | =cut |
|---|
| | 111 | |
|---|
sub processChapterAudio {    # public
    # Drives the per-chapter audio pipeline: record the sample rate, run the
    # forced alignment, then store the parsed word timings on the object.
    # Always returns 1; failures surface from the helper methods.
    my $self = shift;

    my $book = $self->{'audioBookObject'};

    # Fetched in the same order as before; neither value is consulted below.
    my $debug  = $book->getDebug;
    my $logger = $book->getLog();

    my $rate = $self->_getSampleRate();
    $self->{"samplerate"} = $rate;

    $self->_forceAlign();

    my $words = $self->_processHViteOutput();
    $self->{"aligned_words"} = $words;

    return 1;
}
|---|
| | 123 | |
|---|
| | 124 | =head3 _getSampleRate |
|---|
| | 125 | |
|---|
| | 126 | gets the sample rate of the audio file using Audio::Wav module |
|---|
| | 127 | |
|---|
| | 128 | =cut |
|---|
| | 129 | |
|---|
sub _getSampleRate {    # private
    # Returns the sample rate of the audio book's wav file, as reported by
    # Audio::Wav's details() hash (key 'sample_rate').
    # Confesses if the reader cannot be created or the file cannot be read.
    my ($self) = @_;
    my $audioBook   = $self->{'audioBookObject'};
    my $wavfilename = $audioBook->getAudiofile();

    # Direct method call instead of the ambiguous indirect "new Audio::Wav".
    # Messages report $! (OS error); $? is a child exit status and says
    # nothing about a failed file read.
    my $wav = Audio::Wav->new()
        or confess "AUDIO.pm error: $!";
    my $read = $wav->read("$wavfilename")
        or confess "AUDIO.pm error: $!";
    my $details = $read->details()
        or confess "AudioBook.pl error: $!";

    return ($details->{sample_rate});    # sampling rate
}
|---|
| | 142 | |
|---|
| | 143 | =head3 _forceAlign |
|---|
| | 144 | |
|---|
| | 145 | Performs "Forced Alignment" of the user submitted speech audio file. This is the process of taking the text transcription |
|---|
| | 146 | of the speech audio file and figuring out the time stamps for each of the words. This helps determine where the pauses are. |
|---|
| | 147 | |
|---|
| | 148 | Sox audio editor is used to downsample the audio to 16kHz-16bit - Hvite only works with 16kHz sampling rate audio |
|---|
| | 149 | |
|---|
| | 150 | The time stamps are put in the "AudioBook/interim_files/aligned.out" file |
|---|
| | 151 | |
|---|
| | 152 | =cut |
|---|
| | 153 | |
|---|
sub _forceAlign {    # private
    # Downsamples the chapter audio with sox, writes the MLF transcription,
    # runs HVite forced alignment (output: aligned.out, log: logs/HVite_log)
    # and then scans the HVite log for "No tokens survived" messages.
    # Confesses if sox or HVite exit non-zero, if the log cannot be read, or
    # if a reported beam exceeds the configured beam width.
    my ($self) = @_;
    my $audioBook  = $self->{'audioBookObject'};
    my $debug      = $audioBook->getDebug;
    my $filename   = $audioBook->getAudiofile();
    my $htk_files  = $audioBook->getHtk_files();
    my $text       = $self->{"ChapterTextObject"};
    my $beam_width = $audioBook->getBeam_width();

    my $interim   = 'AudioBook/interim_files';
    my $hvite_log = "$interim/logs/HVite_log";

    # Hvite only works with 16kHz sampling rate audio.
    # List-form system() bypasses the shell, so a file name containing
    # spaces or shell metacharacters reaches sox as a single argument.
    my @sox = ('sox', $filename, qw(-c 1 -r 16000 -w), "$interim/downsampled.wav");
    print "@sox\n" if $debug;
    system(@sox) == 0 or confess "fullrun @sox failed: $?";

    # create mlf file
    $text->createMLFFile("downsampled", "$interim/words.mlf");

    # forced alignment - creates aligned.out
    print "\nRunning HVite ...\n";
    print "check interim_files/logs/HVite_log for a possible errors\n";
    print "(like \"no tokens surviving\"... which means that text *might* not exactly match audio)\n\n";

    # To process a single file without a script file, drop "-l '*'" and
    # "-S $htk_files/train.scp" and append the wav file name instead.
    # An earlier revision used the pruning thresholds "-t 250.0 150.0 1000.0".
    # This command needs the shell ("&&" and ">" redirection), so it stays
    # a single string.
    my $command = join ' ',
        'pwd', '&&', 'HVite', qw(-A -D -T 1 -l), "'*'", qw(-a -b SENT-END -m),
        '-C', "$htk_files/wav_config",
        '-H', "$htk_files/models/macros",
        '-H', "$htk_files/models/hmmdefs",
        qw(-m -t 500.0 150.0 8000.0),
        '-I', "$interim/words.mlf",
        '-i', "$interim/aligned.out",
        '-S', "$htk_files/train.scp",
        "$interim/dict", "$htk_files/models/tiedlist",
        '>', $hvite_log;
    system($command) == 0 or confess "error: $command failed: $?";

    # Lexical handle with three-arg open; the message names the real path
    # and reports $! rather than the unrelated child status $?.
    open(my $log_fh, '<', $hvite_log)
        or confess "error: can't open $hvite_log: $!";
    while (my $line = <$log_fh>) {
        chomp $line;

        # "Aligning File:" lines were parsed into a loop-local variable that
        # was never read; that dead code is dropped.
        next unless $line =~ /No tokens survived to final node of network at beam/;

        # The beam value is the last space-separated token of the message.
        my $beam = (split / /, $line)[-1];
        $beam =~ s/\s+//g;
        print "\n\n******************************************************************\n";
        print "**** check that audio corresponds to prompt in file ***\n";
        print "******************************************************************\n\n";
        if ($beam > $beam_width) {
            confess "audio not corresponding to prompt file, check HVite_Log; error code: $?";
        }
    }
    close($log_fh);
    return;
}
|---|
| | 206 | |
|---|
| | 207 | =head3 _processHViteOutput |
|---|
| | 208 | |
|---|
| | 209 | This method reads the HVite output in the "aligned.out" file (generated by the _forceAlign method) and parses it into a |
|---|
| | 210 | more user friendly format, and puts it into the "AudioBook/interim_files/htksegment_log" file. |
|---|
| | 211 | |
|---|
| | 212 | =cut |
|---|
| | 213 | |
|---|
sub _processHViteOutput {    # private
    # Parses AudioBook/interim_files/aligned.out (lines of
    # "start end phone score [word]") into one entry per word:
    #   "$word $startTime $endTime $pause"
    # where $pause is the duration of the last "sp" phone seen since the word
    # started and $endTime is the next word's start minus that pause.
    # The entries are also written to AudioBook/interim_files/htksegment_log.
    # Returns a reference to the array of entries; confesses on I/O failure.
    my ($self) = @_;

    my $aligned_path = 'AudioBook/interim_files/aligned.out';
    my $segment_log  = 'AudioBook/interim_files/htksegment_log';

    open(my $aligned_fh, '<', $aligned_path)
        or confess("can\'t open AudioBook/interim_files/aligned.out file for reading");

    my @aligned_words;
    my ($current_word, $current_startTime);

    # Starts at 0 and is reset for every word, so a word that is not
    # followed by an "sp" phone no longer inherits the previous word's pause.
    my $pause = 0;

    while (my $line = <$aligned_fh>) {
        chomp $line;
        my ($startTime, $endTime, $phone, undef, $word) = split(/ /, $line);
        $phone = "" if !defined($phone);
        $word  = "" if !defined($word);

        if ($phone eq "sp") {
            $pause = $endTime - $startTime;
        }

        # Only the first phone of a word carries the word label.
        next if $word eq "";

        if (!defined($current_word)) {
            # Skip the leading SENT-END marker; the first real word only
            # opens a segment.
            next if $word eq "SENT-END";
            $current_word      = $word;
            $current_startTime = $startTime;
            $pause             = 0;
        }
        else {
            # A new label (including the trailing SENT-END) closes the
            # segment of the word collected so far.
            my $current_endTime = $startTime - $pause;
            push(@aligned_words, "$current_word $current_startTime $current_endTime $pause");
            $current_word      = $word;
            $current_startTime = $startTime;
            $pause             = 0;
        }
    }
    close($aligned_fh);

    open(my $log_fh, '>', $segment_log)
        or confess("cannot open htksegment_log for output");
    print {$log_fh} "$_\n" for @aligned_words;

    # Buffered write errors only surface at close time.
    close($log_fh)
        or confess("cannot close htksegment_log after writing: $!");

    return (\@aligned_words);
}
|---|
| | 254 | |
|---|
| | 255 | =head1 createSegmentedAudio |
|---|
| 206 | | =head3 _getSampleRate |
|---|
| 207 | | |
|---|
| 208 | | gets the sample rate of the audio file using Audio::Wav module |
|---|
| 209 | | |
|---|
| 210 | | =cut |
|---|
| 211 | | |
|---|
sub _getSampleRate {    # private
    # Returns the sample rate of the wav file named in $self->{"filename"},
    # as reported by Audio::Wav's details() hash (key 'sample_rate').
    # Confesses if the reader cannot be created or the file cannot be read.
    my ($self) = @_;
    my $wavfilename = $self->{"filename"};

    # Direct method call instead of the ambiguous indirect "new Audio::Wav".
    # Messages report $! (OS error); $? is a child exit status and says
    # nothing about a failed file read.
    my $wav = Audio::Wav->new()
        or confess "AUDIO.pm error: $!";
    my $read = $wav->read("$wavfilename")
        or confess "AUDIO.pm error: $!";
    my $details = $read->details()
        or confess "AudioBook.pl error: $!";

    return ($details->{sample_rate});    # sampling rate
}
|---|
| 222 | | |
|---|
| 223 | | =head3 _forceAlign |
|---|
| 224 | | |
|---|
| 225 | | Performs "Forced Alignment" of the user submitted speech audio file. This is the process of taking the text transcription |
|---|
| 226 | | of the speech audio file and figuring out the time stamps for each of the words. This helps determine where the pauses are. |
|---|
| 227 | | |
|---|
| 228 | | Sox audio editor is used to downsample the audio to 16kHz-16bit - Hvite only works with 16kHz sampling rate audio |
|---|
| 229 | | |
|---|
| 230 | | The time stamps are put in the "AudioBook/interim_files/aligned.out" file |
|---|
| 231 | | |
|---|
| 232 | | =cut |
|---|
| 233 | | |
|---|
sub _forceAlign {    # private
    # Downsamples $self->{"filename"} with sox, writes the MLF transcription,
    # runs HVite forced alignment (output: aligned.out, log: logs/HVite_log)
    # and then scans the HVite log for "No tokens survived" messages.
    # Confesses if sox or HVite exit non-zero, if the log cannot be read, or
    # if a reported beam exceeds $self->{"beam_width"}.
    my ($self) = @_;
    my $filename     = $self->{"filename"};
    my $htk_files    = $self->{'htk_files'};
    my $textContents = $self->{"textContents"};
    my $beam_width   = $self->{"beam_width"};

    my $interim   = 'AudioBook/interim_files';
    my $hvite_log = "$interim/logs/HVite_log";

    # Hvite only works with 16kHz sampling rate audio.
    # List-form system() bypasses the shell, so a file name containing
    # spaces or shell metacharacters reaches sox as a single argument.
    my @sox = ('sox', $filename, qw(-c 1 -r 16000 -w), "$interim/downsampled.wav");
    print "@sox\n";
    system(@sox) == 0 or confess "fullrun @sox failed: $?";

    # create mlf file
    $textContents->createMLFFile("downsampled", "$interim/words.mlf");

    # forced alignment - creates aligned.out
    print "\nRunning HVite ...\n";
    print "check interim_files/logs/HVite_log for a possible errors\n";
    print "(like \"no tokens surviving\"... which means that text does not match audio)\n\n";

    # To process a single file without a script file, drop "-l '*'" and
    # "-S $htk_files/train.scp" and append the wav file name instead.
    # An earlier revision used the pruning thresholds "-t 250.0 150.0 1000.0".
    # This command needs the shell ("&&" and ">" redirection), so it stays
    # a single string.
    my $command = join ' ',
        'pwd', '&&', 'HVite', qw(-A -D -T 1 -l), "'*'", qw(-a -b SENT-END -m),
        '-C', "$htk_files/wav_config",
        '-H', "$htk_files/models/macros",
        '-H', "$htk_files/models/hmmdefs",
        qw(-m -t 500.0 150.0 8000.0),
        '-I', "$interim/words.mlf",
        '-i', "$interim/aligned.out",
        '-S', "$htk_files/train.scp",
        "$interim/dict", "$htk_files/models/tiedlist",
        '>', $hvite_log;
    system($command) == 0 or confess "error: $command failed: $?";

    # Lexical handle with three-arg open; the message names the real path
    # and reports $! rather than the unrelated child status $?.
    open(my $log_fh, '<', $hvite_log)
        or confess "error: can't open $hvite_log: $!";
    while (my $line = <$log_fh>) {
        chomp $line;

        # "Aligning File:" lines were parsed into a loop-local variable that
        # was never read; that dead code is dropped.
        next unless $line =~ /No tokens survived to final node of network at beam/;

        # The beam value is the last space-separated token of the message.
        my $beam = (split / /, $line)[-1];
        $beam =~ s/\s+//g;
        print "\n\n******************************************************************\n";
        print "**** check that audio corresponds to prompt in file ***\n";
        print "******************************************************************\n\n";
        if ($beam > $beam_width) {
            confess "audio not corresponding to prompt file, check HVite_Log; error code: $?";
        }
    }
    close($log_fh);
    return;
}
|---|
| 283 | | |
|---|
| 284 | | =head3 _processHViteOutput |
|---|
| 285 | | |
|---|
| 286 | | This method reads the HVite output in the "aligned.out" file (generated by the _forceAlign method) and parses it into a |
|---|
| 287 | | more user friendly format, and puts it into the "AudioBook/interim_files/htksegment_log" file. |
|---|
| 288 | | |
|---|
| 289 | | =cut |
|---|
| 290 | | |
|---|
sub _processHViteOutput {    # private
    # Parses AudioBook/interim_files/aligned.out (lines of
    # "start end phone score [word]") into one entry per word:
    #   "$word $startTime $endTime $pause"
    # where $pause is the duration of the last "sp" phone seen since the word
    # started and $endTime is the next word's start minus that pause.
    # The entries are also written to AudioBook/interim_files/htksegment_log.
    # Returns a reference to the array of entries; confesses on I/O failure.
    my ($self) = @_;

    my $aligned_path = 'AudioBook/interim_files/aligned.out';
    my $segment_log  = 'AudioBook/interim_files/htksegment_log';

    open(my $aligned_fh, '<', $aligned_path)
        or confess("can\'t open AudioBook/interim_files/aligned.out file for reading");

    my @aligned_words;
    my ($current_word, $current_startTime);

    # Starts at 0 and is reset for every word, so a word that is not
    # followed by an "sp" phone no longer inherits the previous word's pause.
    my $pause = 0;

    while (my $line = <$aligned_fh>) {
        chomp $line;
        my ($startTime, $endTime, $phone, undef, $word) = split(/ /, $line);
        $phone = "" if !defined($phone);
        $word  = "" if !defined($word);

        if ($phone eq "sp") {
            $pause = $endTime - $startTime;
        }

        # Only the first phone of a word carries the word label.
        next if $word eq "";

        if (!defined($current_word)) {
            # Skip the leading SENT-END marker; the first real word only
            # opens a segment.
            next if $word eq "SENT-END";
            $current_word      = $word;
            $current_startTime = $startTime;
            $pause             = 0;
        }
        else {
            # A new label (including the trailing SENT-END) closes the
            # segment of the word collected so far.
            my $current_endTime = $startTime - $pause;
            push(@aligned_words, "$current_word $current_startTime $current_endTime $pause");
            $current_word      = $word;
            $current_startTime = $startTime;
            $pause             = 0;
        }
    }
    close($aligned_fh);

    # Lexical handle instead of the generic bareword LOG, which could clash
    # with any other code using the same global handle name.
    open(my $log_fh, '>', $segment_log)
        or confess("cannot open htksegment_log for output");
    print {$log_fh} "$_\n" for @aligned_words;

    # Buffered write errors only surface at close time.
    close($log_fh)
        or confess("cannot close htksegment_log after writing: $!");

    return (\@aligned_words);
}
|---|
| 330 | | |
|---|