Changeset 2217
- Timestamp: 06/07/07 14:20:17 (2 years ago)
- Files:
- Trunk/Scripts/Audio_scripts/AudioBooks/archive/eText2prompts.pl (moved) (moved from Trunk/Scripts/Audio_scripts/AudioBooks/eText2prompts.pl)
- Trunk/Scripts/Audio_scripts/AudioBooks/archive/eText2wlist.pl-bak (moved) (moved from Trunk/Scripts/Audio_scripts/AudioBooks/eText2wlist.pl-bak)
- Trunk/Scripts/Audio_scripts/AudioBooks/archive/etext2mlf.pl (moved) (moved from Trunk/Scripts/Audio_scripts/AudioBooks/etext2mlf.pl)
- Trunk/Scripts/Audio_scripts/AudioBooks/archive/etext2wlist.pl (moved) (moved from Trunk/Scripts/Audio_scripts/AudioBooks/etext2wlist.pl) (3 diffs)
- Trunk/Scripts/Audio_scripts/AudioBooks/etext2wlistmlf.pl (added)
- Trunk/Scripts/Audio_scripts/AudioBooks/htksegment.pl (modified) (7 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
Trunk/Scripts/Audio_scripts/AudioBooks/archive/etext2wlist.pl
r2037 r2217 7 7 ### mail: contact@voxforge.org 8 8 ### Date: 2007.3.20 9 ### Command: perl ./etext2wlist.pl [eText filename]9 ### Command: perl ./etext2wlist.pl eTextFilename [wav filename (no suffix)] 10 10 ### 11 11 ### Copyright (C) 2007 Ken MacLean … … 24 24 use strict; 25 25 26 if ($#ARGV != 0) { 27 print "usage: [eText filename]\n"; 28 exit; 26 my $inputfilename = $ARGV[0]; 27 my $wavfilename = undef; 28 if ($ARGV[1]) { 29 $wavfilename = $ARGV[1]; 29 30 } 30 my $inputfilename = $ARGV[0];31 31 32 32 open(IN, "<$inputfilename") or die ("cannot open input filename for input"); 33 open(PROMPTS, ">prompts") or die ("cannot open prompts for output");34 33 my @eText = <IN>; # slurp in entire file into an array 35 my $eText = "@eText"; # convert the array to a scalar variable 36 $eText =~ s/\n//g; # remove all line feeds from the text file 37 $eText =~ s/\r//g; # remove all carriage returns from the text file 34 close(IN); 38 35 39 my $x=0; 36 #################################################################### 37 ### Cleans up eText and generates prompts file 40 38 my @words; 41 foreach my $line (@eText) { 39 foreach my $line (@eText) { # !!!!!! does this even get a chance to loop since all LFs and CRs have been removed???? 40 $line =~ s/\n/ /g; # remove all line feeds from the text file 41 $line =~ s/\r/ /g; # remove all carriage returns from the text file 42 42 $line =~ tr/a-z/A-Z/; # change to uppercase 43 43 $line =~ s/,//g; # remove commas 44 $line =~ s/\./ /g; # remove periods44 $line =~ s/\./ /g; # remove periods 45 45 # $line =~ s/\'//g; # remove single quotes; but need words like "don't" - need to research this more ... 46 46 $line =~ s/\"//g; # remove all double quotes 47 47 $line =~ s/://g; # remove colon 48 48 # $line =~ s/-//g; # compound word dash; but VoxForge dictionnary contains words with dashes ... 
49 $line =~ s/--/ /g; #double dash49 $line =~ s/--/ /g; #double dash 50 50 $line =~ s/ - / /g; # dash punctuation 51 51 $line =~ s/ -/ /g; # dash punctuation 52 $line =~ s/-/ /g; # dash - compound word 52 53 $line =~ s/;//g; # semi-colon 53 54 $line =~ s/!//g; # exclamation mark 54 55 $line =~ s/\?//g; # question mark 55 56 $line =~ s/ / /g; # cleanup double spaces 57 $line =~ s/=//g; # remove equal sign 58 $line =~ s/\(//g; # remove parenthesis 59 $line =~ s/\)//g; # remove parenthesis 60 $line =~ s/_//g; # remove underscore 56 61 # Other cleanup !!!!!! need to change the prompts files directly rather than doing this!!! or add to dictionnary!!! 57 62 $line =~ s/&/AND/g; 58 print PROMPTS "$line\n";59 63 60 64 my @wordlist = split(/ /,$line); 61 65 foreach my $word (@wordlist) { 62 66 if ($word =~ /\S/) { #Anything other than white space [^ \r\t\n\f] 63 $word =~ s/\s//g; 67 $word =~ s/\s//g; 68 $word =~ s/\A\'//; # remove single quote from beginning of word 69 $word =~ s/\'\Z//; # remove single quote from end of word 64 70 push (@words, $word); 65 71 } 66 72 } 67 73 } 68 74 #################################################################### 75 ### create MLF file 76 if ($wavfilename) { 77 open(MLF, ">words.mlf") or die ("cannot open words.mlf for output"); 78 print MLF "#!MLF!#\n"; # 79 print MLF "\"$wavfilename.lab\"\n"; 80 foreach my $word (@words) { 81 print MLF "$word\n"; 82 } 83 print MLF "\.\n"; 84 close (MLF); 85 } 86 #################################################################### 87 ### create WLIST file 88 push (@words, "SENT-END"); 89 push (@words, "SENT-START"); 69 90 my @words2 = sort(@words); 70 91 my %seen; … … 74 95 print WLIST "$word\n"; 75 96 } 76 close(IN);77 close(PROMPTS);78 97 close(WLIST); 98 99 Trunk/Scripts/Audio_scripts/AudioBooks/htksegment.pl
r2212 r2217 25 25 #################################################################### 26 26 my $average_sentence_length = 10; 27 my $max_sentence_length = 20; 28 my $min_pause_for_sentence_break = 250000; # HTK time format - 100 millisecond increments 27 my $max_sentence_length = 15; 28 #my $min_pause_for_sentence_break = 2000000; # HTK time format - 100 millisecond increments 29 my $min_pause_for_sentence_break = 200000; 29 30 my $debug = 0; 31 my $process_audio = 1; 30 32 #################################################################### 31 33 if ($#ARGV != 1) { … … 66 68 } 67 69 } 68 foreach my $line (@aligned_words) { 69 # my @aligned_line = split(/ /,$line); 70 # my ($word,$startTime, $endTime, $pause) = @aligned_line; 71 print "$line\n"; 72 } 70 #if ($debug) { 71 open(LOG, ">htksegment_log") or die ("cannot open htksegment_log for output"); 72 foreach my $line (@aligned_words) { 73 print LOG "$line\n"; 74 } 75 close (LOG); 76 #} 73 77 print "####################################################################\n"; 74 78 … … 105 109 # Process Audio 106 110 my @firstword = split(/ /,$aligned_words[$sentence_start]); 107 my @lastword = split(/ /,$aligned_words[$sentence_end + $up_increment]); 108 my $startTime = $firstword[1] - ($min_pause_for_sentence_break*.90); 111 my @lastword = split(/ /,$aligned_words[$sentence_end + $up_increment]); 112 # !!!!!! 113 # insert pause that follows previous word 114 my @previousword = split(/ /,$aligned_words[$sentence_start-1]); 115 $startTime = $previousword[2]; # sentence end of previous word 116 print "Sentence_start:$sentence_start:firstword: @firstword\n\tlastword:@lastword\n\tpreviousword:@previousword\n" if $debug; 117 # !!!!!! 109 118 my $endTime = $lastword[2] + $lastword[3]; 110 # !!!!!! 
process_audio ($startTime, $endTime,$padded_fileid);119 process_audio ($startTime, $endTime,$padded_fileid) if $process_audio; 111 120 $true = 0; 112 121 } … … 126 135 $command = ("rm wav/temp.wav"); print "cmd:$command\n" if $debug; system($command); 127 136 print "\nCompleted!\nSentence Length: min:$min_sentence_length_linenumber->$min_sentence_length_found; max:$max_sentence_length_linenumber->$max_sentence_length_found\n"; 128 print " sentences over max_sentence_length:$max_sentence_length:\n";137 print "\nSentences over max_sentence_length of $max_sentence_length words:\n"; 129 138 foreach my $line (@max_sentences) { 130 print " $line\n";139 print "\t$line\n"; 131 140 } 132 141 sub sentence_test { … … 148 157 my @firstword = split(/ /,$aligned_words[$sentence_start]); 149 158 my @lastword = split(/ /,$aligned_words[$sentence_end + $increment]); 150 my $startTime = $firstword[1] - ($min_pause_for_sentence_break*.90); 159 # !!!!!! 160 if ($sentence_start == 0) { 161 $startTime = 0; 162 print "Sentence_start:$sentence_start:firstword: @firstword\n\tlastword:@lastword\n" if $debug; 163 } else { 164 # insert pause that follows previous word 165 my @previousword = split(/ /,$aligned_words[$sentence_start-1]); 166 $startTime = $previousword[2]; # sentence end of previous word 167 print "Sentence_start:$sentence_start:firstword: @firstword\n\tlastword:@lastword\n\tpreviousword:@previousword\n" if $debug; 168 } 169 # !!!!!! 151 170 my $endTime = $lastword[2] + $lastword[3]; 152 # !!!!!! 
process_audio ($startTime, $endTime,$padded_fileid);171 process_audio ($startTime, $endTime,$padded_fileid) if $process_audio; 153 172 # Calculate min and max sentence 154 173 if ((($sentence_end + $increment)-$sentence_start) > $max_sentence_length) { 155 my $wordcount = (( $sentence_end + $increment)-$sentence_start);174 my $wordcount = ((($sentence_end + $increment)-$sentence_start)+1); 156 175 push (@max_sentences, "$filename_nosuffix$padded_fileid:$wordcount"); 157 176 } elsif ((($sentence_end + $increment)-$sentence_start) < $min_sentence_length_found) { … … 159 178 $min_sentence_length_linenumber = "$filename_nosuffix$padded_fileid"; 160 179 } elsif ((($sentence_end + $increment)-$sentence_start) > $max_sentence_length_found) { 161 $max_sentence_length_found = ( $sentence_end + $increment)-$sentence_start;180 $max_sentence_length_found = (($sentence_end + $increment)-$sentence_start)+1; 162 181 $max_sentence_length_linenumber = "$filename_nosuffix$padded_fileid"; 163 182 } … … 173 192 sub process_audio { 174 193 my ($startTime, $endTime,$padded_fileid) = @_; 194 print "\t$startTime:$endTime:$padded_fileid\n" if $debug; 175 195 # HCopy can only process 16 bit files! 176 196 # HCopy does not create proper WAV/RIFF Headers!