| 134 | | $dictionary->createAltDict($originalDict,$altDict); # merge & sort missing_words_alt and originalDict into altDict |
|---|
| 135 | | $dictionary->validateAlternatePronunciations($originalDict,$altDict,$prompts); |
|---|
| | 134 | $dictionary->createAltDict($originalDict,$altDict); # merge & sort missing_words_alt and originalDict into altDict |
|---|
| | 135 | # !!!!!! |
|---|
| | 136 | #$dictionary->validateAlternatePronunciations($originalDict,$altDict,$prompts); |
|---|
| | 137 | $self->validateAlternatePronunciations($originalDict,$altDict,$prompts); |
|---|
| | 138 | # !!!!!! |
|---|
| | 139 | } |
|---|
| | 140 | |
|---|
| | 141 | =head2 validateAlternatePronunciations |
|---|
| | 142 | |
|---|
| | 143 | Add alternate pronunciations generated by Sequitur G2P to a copy of the original dict file, then perform forced alignment to see which |
|---|
| | 144 | pronunciation HVite picks, based on the phonemes it recognizes in the audio. |
|---|
| | 145 | |
|---|
| | 146 | Also create a folder containing the wav files of the prompts that use missing words, and a prompt file containing only those prompts that contain the missing words, |
|---|
| | 147 | for manual confirmation of pronunciations. |
|---|
| | 148 | todo: set this up as an audacity project containing the wav files and text prompts. |
|---|
| | 149 | |
|---|
| | 150 | =cut |
|---|
| | 151 | |
|---|
sub validateAlternatePronunciations { # public
    # Validate the alternate pronunciations of missing words against the audio.
    #
    # Steps:
    #   1. Scan the prompts file; for every prompt word found in the missing
    #      word list, copy the prompt's wav file into missingWordsFolder and
    #      force-align the whole prompt line against $altDict.
    #   2. Write the affected prompts (sorted) to MissingWords_prompts.
    #   3. Write the phones returned by forced alignment (sorted) to
    #      MissingWords_validated.
    #   4. Read MissingWords_out and, for each word in it, write the matching
    #      prompts, the "g2p" phones from that file and every "hvite" phone
    #      list collected in step 1 to MissingWords_combined.
    #
    # Parameters:
    #   $originalDict - accepted for interface compatibility; not used here
    #   $altDict      - dictionary handed to _forceAlignPromptLine
    #   $prompts      - path of the prompts file ("promptID word word ...")
    #
    # Dies (confess) when any of the files cannot be opened or when a written
    # file cannot be closed.  A failed wav copy only produces a warning.
    my ($self, $originalDict, $altDict, $prompts) = @_;
    my $audioBook = $self->{'audioBookObject'};
    my $debug = $audioBook->getDebug;    # not used below; kept to mirror the other subs
    # !!!!!!
    my $dictionary    = $self->{'dictionaryObject'};
    my $missing_words = $dictionary->getMissing_words();
    # !!!!!!!

    my $missingWordList = $self->_getMissingWordList($missing_words);
    my $interimDir      = 'AudioBook/interim_files';
    my @missingWordsPrompts;
    my @missingWordsValidated;

    print "\nValidate Alternate Pronunciations:\n";
    print "----------------------------------\n";

    open(my $promptsFh, '<', $prompts) or confess ("cannot open $prompts file: $!");
    while (my $line = <$promptsFh>) {
        chomp $line;
        # split ' ' skips leading and repeated whitespace, so no empty "words"
        # are produced (an empty word could match a blank missing-word entry).
        my @line = split(' ', $line);
        my $promptID = shift @line;
        next unless defined $promptID;    # blank line
        foreach my $promptWord (@line) {
            next unless $missingWordList->{$promptWord};
            # there is a missing word in this prompt line
            my $wavfilename = $promptID . ".wav";
            copy("$interimDir/wav/$wavfilename", "$interimDir/missingWordsFolder/$wavfilename")
                or warn "cannot copy $wavfilename to $interimDir/missingWordsFolder: $!\n";
            push (@missingWordsPrompts, "$promptWord:$promptID @line\n");
            print ".";
            # force align entire prompt line
            my @phoneList = $self->_forceAlignPromptLine($altDict, $promptWord, $promptID, \@line);
            push (@missingWordsValidated, "$promptWord [$promptWord] @phoneList\n");
        }
    }
    close $promptsFh;

    # Prompts grouped per missing word (several prompts are newline-joined).
    my $promptsOut = "$interimDir/MissingWords_prompts";
    open(my $promptsOutFh, '>', $promptsOut) or confess ("cannot open $promptsOut file: $!");
    my %missingWordsPrompts;
    foreach my $entry (sort @missingWordsPrompts) {
        print {$promptsOutFh} $entry;
        chomp(my $text = $entry);
        my ($entryWord) = split(/:/, $text);
        print "word:$entryWord\n";
        $missingWordsPrompts{$entryWord} = defined $missingWordsPrompts{$entryWord}
            ? "$missingWordsPrompts{$entryWord}\n$text"
            : $text;
    }
    close $promptsOutFh or confess ("cannot close $promptsOut file: $!");
    # !!!!!!

    # Phone lists chosen by forced alignment, one array ref per occurrence.
    my $validatedOut = "$interimDir/MissingWords_validated";
    open(my $validatedFh, '>', $validatedOut) or confess ("cannot open $validatedOut file: $!");
    my %missingWordsValidated;
    foreach my $entry (sort @missingWordsValidated) {
        print {$validatedFh} $entry;
        chomp(my $text = $entry);
        my ($validatedWord, $returnedWord, @validatedPhones) = split(' ', $text);
        push @{ $missingWordsValidated{$validatedWord} }, \@validatedPhones;    # see perlref
    }
    close $validatedFh or confess ("cannot close $validatedOut file: $!");

    my $g2pOut      = "$interimDir/MissingWords_out";
    my $combinedOut = "$interimDir/MissingWords_combined";
    open(my $g2pFh, '<', $g2pOut) or confess ("cannot open $g2pOut file: $!");
    open(my $combinedFh, '>', $combinedOut) or confess ("cannot open $combinedOut file: $!");

    # $word and $phones are read by the G2P / HVITE formats below, so they
    # must be declared here (before the formats) and assigned, not re-declared,
    # inside the loop.
    my ($word, $phones);

    while (my $line = <$g2pFh>) {
        chomp $line;
        my @fields = split(' ', $line);
        next unless @fields;    # blank line: nothing to report
        my @g2pPhones;
        ($word, undef, @g2pPhones) = @fields;
        # A word may be listed here without any matching prompt.
        my $promptText = defined $missingWordsPrompts{$word} ? $missingWordsPrompts{$word} : '';
        print {$combinedFh} "$promptText\n";
        $phones = join(" ", @g2pPhones);
        $combinedFh->format_name("G2P");
        write($combinedFh);
        my $alignedLists = $missingWordsValidated{$word};
        foreach my $alignedPhones (@$alignedLists) {
            $phones = join(" ", @$alignedPhones);
            $combinedFh->format_name("HVITE");
            write($combinedFh);
        }
    }

    # Report layouts.  The PROMPTS format is not used anywhere in this sub; it
    # is kept because formats are package-wide and other code may refer to it.
format PROMPTS =
@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
"g2p",$word, $phones
.
format G2P =
@<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
"g2p",$word, $phones
.
format HVITE =
@<<<<<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
" hvite:",$word, $phones
.
    close $g2pFh;
    close $combinedFh or confess ("cannot close $combinedOut file: $!");
    # !!!!!!
    print "----------------------------------\n";
}
|---|
| | 257 | |
|---|
| | 258 | =head2 _forceAlignPromptLine |
|---|
| | 259 | |
|---|
| | 260 | HVite looks at the audio, and tries to find a matching phone sequence in $altdict (which contains many alternate pronunciations generated by |
|---|
| | 261 | Sequitur G2P). In doing so, it picks the most likely pronunciation, thereby validating a Sequitur G2P pronunciation with real audio. |
|---|
| | 262 | |
|---|
| | 263 | Assumes only one missing word per prompt line |
|---|
| | 264 | |
|---|
| | 265 | =cut |
|---|
| | 266 | |
|---|
sub _forceAlignPromptLine { # private
    # Force-align one prompt line and return the phones aligned to $word.
    #
    # Parameters:
    #   $altDict    - dictionary passed through to AudioBook::Audio->forceAlign
    #   $word       - the word whose phones are wanted
    #   $promptID   - ID of the prompt being aligned
    #   $promptLine - array ref with the words of the prompt
    #
    # Returns the list of phones (excluding "sp") found between the line that
    # carries $word in its fifth column and the next line that carries a
    # different word; an empty list when $word never appears.
    #
    # Dies (confess) when the alignment output file cannot be opened.
    my ($self, $altDict, $word, $promptID, $promptLine) = @_;
    my $audioBook = $self->{'audioBookObject'};
    my $debug = $audioBook->getDebug;    # not used below; kept to mirror the other subs

    # forceAlign also returns a log value as its second element; it is not needed here.
    my ($aligned_out) = AudioBook::Audio->forceAlign($self, $promptID, $promptLine, $altDict);

    open(my $alignedFh, '<', $aligned_out) or confess ("cannot open $aligned_out file: $!");
    my @phoneList;
    my $gatherPhones = 0;
    while (my $line = <$alignedFh>) {
        # split ' ' ignores leading/repeated whitespace so the columns cannot shift.
        my ($startTime, $stopTime, $phone, $probability, $recword) = split(' ', $line);
        if (defined($recword)) {
            if ($recword eq $word) {
                $gatherPhones = 1;    # first phone of the wanted word
            }
            elsif ($gatherPhones) {
                last;                 # next word starts: done
            }
        }
        next unless $gatherPhones && defined($phone);
        push (@phoneList, $phone) if $phone ne "sp";
    }
    close $alignedFh;    # the original never closed this handle
    return @phoneList;
}
|---|
| | 293 | |
|---|
| | 294 | =head2 _getMissingWordList |
|---|
| | 295 | |
|---|
| | 296 | Read the missing word list file into a hash (one key per line) and return a reference to it. |
|---|
| | 297 | |
|---|
| | 298 | =cut |
|---|
| | 299 | |
|---|
| | 300 | sub _getMissingWordList { # private |
|---|
| | 301 | my ($self,$missing_words)= @_; |
|---|
| | 302 | my $audioBook = $self->{'audioBookObject'}; |
|---|
| | 303 | my $debug = $audioBook->getDebug; |
|---|
| | 304 | |
|---|
| | 305 | my %missingWordList; |
|---|
| | 306 | open(MISSINGWORDS,"$missing_words") or confess ("cannot open $missing_words file"); |
|---|
| | 307 | while (my $line = <MISSINGWORDS>) { |
|---|
| | 308 | chomp $line; |
|---|
| | 309 | $missingWordList{$line} = 1; |
|---|
| | 310 | } |
|---|
| | 311 | close MISSINGWORDS; |
|---|
| | 312 | return \%missingWordList; |
|---|