=head2 getAlternatePronunciations

Use the Sequitur G2P script (Python) to generate many *alternate* pronunciations for out-of-vocabulary words.
These will later be used by HTK's HVite to try to determine the actual pronunciation used by the speech in the audio file.

Creates the $missing_words_alt file, which contains the top x pronunciation alternatives for each missing word.

=cut
|---|
| | 146 | |
|---|
sub getAlternatePronunciations {    # public
    # Run g2p.py over the missing-words file and save its output.
    #
    # Parameters:
    #   $missing_words_alt      - path of the file to create; also remembered
    #                             in $self->{missing_words_alt}.
    #   $numberOfPronunciations - passed to g2p.py as --variants-number.
    # Reads $self->{g2p_model} and $self->{missing_words}.
    # Returns 1 on success; confesses if g2p.py cannot be run, exits with a
    # non-zero status, or the output file cannot be written.
    my ($self, $missing_words_alt, $numberOfPronunciations) = @_;
    my $model         = $self->{'g2p_model'};
    my $missing_words = $self->{'missing_words'};

    $self->{"missing_words_alt"} = $missing_words_alt;

    # List-form command: no shell is involved, so paths containing spaces or
    # shell metacharacters are passed to g2p.py verbatim.
    my @command = (
        'g2p.py',
        '--model', $model,
        '--apply', $missing_words,
        "--variants-number=$numberOfPronunciations",
    );

    my @missingwords;
    {
        # Same effect as the former "export PYTHONPATH=..." shell prefix,
        # but restricted to this block.
        local $ENV{PYTHONPATH} = '/usr/local/lib64/python2.4/site-packages';

        open(my $g2p_out, '-|', @command)
            or confess "cannot run g2p.py (@command): $!";
        @missingwords = <$g2p_out>;

        # close() on a pipe waits for the child and stores its status in $?.
        close($g2p_out)
            or confess "g2p.py (@command) failed: status $?"
                . ($! ? ", $!" : "");
    }

    open(my $alt_fh, '>', $missing_words_alt)
        or confess "cannot open $missing_words_alt file: $!";
    foreach my $line (@missingwords) {
        chomp $line;
        print {$alt_fh} "$line\n";
    }
    # Buffered write errors only surface when the handle is closed.
    close($alt_fh)
        or confess "cannot close $missing_words_alt file: $!";

    return 1;
}
|---|
| | 174 | |
|---|
| | 175 | =head2 validateAlternatePronunciations |
|---|
| | 176 | |
|---|
| | 177 | Todo: This might need to go in a new Prompts class |
|---|
| | 178 | |
|---|
| | 179 | =cut |
|---|
| | 180 | |
|---|
sub validateAlternatePronunciations {    # public
    # For every prompt that contains an out-of-vocabulary word, copy its wav
    # file into the missing-words folder, force-align the prompt against the
    # alternate dictionary, and record the phones chosen for that word.
    #
    # Parameters:
    #   $originalDict - path of the existing pronunciation dictionary.
    #   $altDict      - path of the merged dictionary that _createAltDict writes.
    #   $prompts      - path of the prompts file; each line is a prompt ID
    #                   followed by the words of the prompt.
    # Reads $self->{missing_words}.
    # Returns 1 on success; confesses on any file or copy failure.
    my ($self, $originalDict, $altDict, $prompts) = @_;
    my $missing_words = $self->{'missing_words'};

    my $wav_dir       = "AudioBook/interim_files/wav";
    my $work_dir      = "AudioBook/interim_files/missingWordsFolder";
    my $prompts_out   = "$work_dir/missingWordPrompts";
    my $validated_out = "AudioBook/interim_files/missingWords_val";

    $self->_createAltDict($originalDict, $altDict);
    my $missingWordList = $self->_getMissingWordList($missing_words);

    open(my $prompts_fh, '<', $prompts)
        or confess "cannot open $prompts file: $!";
    # Both of these are written to below, so they must be opened for output.
    open(my $prompts_out_fh, '>', $prompts_out)
        or confess "cannot open $prompts_out file: $!";
    open(my $validated_fh, '>', $validated_out)
        or confess "cannot open $validated_out file: $!";

    while (my $line = <$prompts_fh>) {
        chomp $line;

        # split ' ' skips leading whitespace and collapses runs of it, so no
        # empty "words" are produced.
        my @words    = split(' ', $line);
        my $promptID = shift @words;
        next unless defined $promptID;    # blank line

        foreach my $word (@words) {
            next unless $missingWordList->{$word};

            my $wavfilename = $promptID . ".wav";
            copy("$wav_dir/$wavfilename", "$work_dir/$wavfilename")
                or confess "cannot copy $wav_dir/$wavfilename to $work_dir/$wavfilename: $!";

            print {$prompts_out_fh} "$word:$promptID,@words\n";

            # Method call, so $self is passed; the prompt text goes in as a
            # single space-joined string because _forceAlignPromptLine takes
            # one $promptLine scalar.
            # NOTE(review): confirm AudioBook::Audio::forceAlign expects the
            # prompt words joined by single spaces.
            my @phoneList
                = $self->_forceAlignPromptLine($word, $promptID, "@words");
            print {$validated_fh} "$word [$word] @phoneList\n";
        }
    }

    close($prompts_fh)
        or confess "cannot close $prompts file: $!";
    close($prompts_out_fh)
        or confess "cannot close $prompts_out file: $!";
    close($validated_fh)
        or confess "cannot close $validated_out file: $!";

    return 1;
}
|---|
| | 209 | |
|---|
=head2 _forceAlignPromptLine

HVite looks at the audio and tries to find a matching phone sequence in $altDict (which contains many alternate pronunciations generated by
Sequitur G2P). In doing so, it picks the most likely pronunciation, thereby validating a Sequitur G2P pronunciation against real audio.

=cut
|---|
| | 216 | |
|---|
sub _forceAlignPromptLine {    # private
    # Force-align one prompt and return the phones aligned to $word.
    #
    # Parameters:
    #   $word       - the word whose phones are wanted.
    #   $promptID   - ID of the prompt, passed through to forceAlign.
    #   $promptLine - text of the prompt, passed through to forceAlign.
    # Reads $self->{altDict}.
    # Returns the list of phones (possibly empty). If $word occurs more than
    # once in the alignment, the phones of every occurrence are returned in
    # order. Confesses if the alignment output cannot be opened.
    my ($self, $word, $promptID, $promptLine) = @_;
    my $altDict = $self->{"altDict"};

    # forceAlign also returns a log value as its second element; it is not
    # needed here. The parentheses keep the call in list context.
    my ($aligned_out)
        = AudioBook::Audio::forceAlign($self, $promptID, $promptLine, $altDict);

    open(my $aligned_fh, '<', $aligned_out)
        or confess "cannot open $aligned_out file: $!";

    my @phoneList;
    my $gatherPhones = 0;
    while (my $line = <$aligned_fh>) {
        # Expected columns: start, stop, phone, probability, word.
        # NOTE(review): the word column is assumed to appear only on the
        # first phone of each word -- confirm against the aligner's output.
        my ($startTime, $stopTime, $phone, $probability, $recword)
            = split(' ', $line);

        # A line carrying a word starts or stops collection; lines without
        # one belong to the word most recently seen.
        if (defined $recword) {
            $gatherPhones = ($recword eq $word) ? 1 : 0;
        }

        # Header and terminator lines have no phone column; skip them.
        if ($gatherPhones && defined $phone) {
            push(@phoneList, $phone);
        }
    }
    close($aligned_fh);

    return @phoneList;
}
|---|
| | 239 | |
|---|
| | 240 | =head2 _getMissingWordList |
|---|
| | 241 | |
|---|
| | 242 | =cut |
|---|
| | 243 | |
|---|
sub _getMissingWordList {    # private
    # Load the missing-words file into a lookup table.
    #
    # Parameters:
    #   $missing_words - path of a file with one word per line.
    # Returns a hash reference mapping each word to 1.
    # Confesses if the file cannot be opened.
    my ($self, $missing_words) = @_;

    open(my $words_fh, '<', $missing_words)
        or confess "cannot open $missing_words file: $!";

    my %missingWordList;
    while (my $line = <$words_fh>) {
        chomp $line;
        # A blank line would register the empty string as a "missing word".
        next if $line =~ /\A\s*\z/;
        $missingWordList{$line} = 1;
    }
    close($words_fh);

    return \%missingWordList;
}
|---|
| | 255 | |
|---|
=head2 _createAltDict

Merges the output from Sequitur G2P (produced by getAlternatePronunciations()) into the dict for the submission.

(todo: may need to create individual dict files for each out-of-vocabulary word to speed things up)

=cut
|---|
| | 263 | |
|---|
sub _createAltDict {    # private
    # Merge the g2p alternates with the original dictionary and write the
    # sorted result to $altDict.
    #
    # Parameters:
    #   $originalDict - path of the existing dictionary (read only).
    #   $altDict      - path of the merged dictionary to create; also
    #                   remembered in $self->{altDict}.
    # Reads $self->{missing_words_alt}, whose lines are tab-separated:
    # word, sequence number, probability, phonemes.
    # Returns 1 on success; confesses if any file cannot be opened or the
    # merged dictionary cannot be written.
    my ($self, $originalDict, $altDict) = @_;
    my $missing_words_alt = $self->{"missing_words_alt"};

    $self->{"altDict"} = $altDict;

    # Reformat each alternate so that it can be merged with the dict.
    open(my $alt_fh, '<', $missing_words_alt)
        or confess "cannot open $missing_words_alt file: $!";
    my @missingWordAlt;
    while (my $line = <$alt_fh>) {
        chomp $line;
        my ($word, $seqnum, $prob, $phonemes) = split(/\t/, $line);
        # Skip blank or truncated lines rather than emit a broken entry.
        next unless defined $phonemes && defined $word && length $word;
        push(@missingWordAlt, "$word [$word] $phonemes\n");
    }
    close($alt_fh);

    open(my $dict_fh, '<', $originalDict)
        or confess "cannot open $originalDict file: $!";
    my @dict = <$dict_fh>;    # slurp the entire file into an array
    close($dict_fh);

    # A final line without a newline would be glued to the following entry
    # once the lines are re-ordered by the sort.
    foreach my $entry (@dict) {
        $entry .= "\n" unless $entry =~ /\n\z/;
    }

    my @altdict = sort(@missingWordAlt, @dict);    # merge and sort

    open(my $out_fh, '>', $altDict)
        or confess "cannot open $altDict file: $!";
    foreach my $entry (@altdict) {
        print {$out_fh} $entry;
    }
    # Buffered write errors only surface when the handle is closed.
    close($out_fh)
        or confess "cannot close $altDict file: $!";

    return 1;
}
|---|
| | 291 | |
|---|