voxforge.org
VoxForge Dev

Changeset 2217

Show
Ignore:
Timestamp:
06/07/07 14:20:17 (2 years ago)
Author:
kmaclean
Message:

Updates to AudioBook scripts

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • Trunk/Scripts/Audio_scripts/AudioBooks/archive/etext2wlist.pl

    r2037 r2217  
    77### mail: contact@voxforge.org 
    88### Date: 2007.3.20 
    9 ### Command: perl ./etext2wlist.pl [eText filename]  
     9### Command: perl ./etext2wlist.pl eTextFilename [wav filename (no suffix)]  
    1010###    
    1111### Copyright (C) 2007 Ken MacLean 
     
    2424use strict; 
    2525 
    26 if ($#ARGV != 0) {  
    27  print "usage: [eText filename]\n"; 
    28  exit; 
     26my $inputfilename = $ARGV[0]; 
     27my $wavfilename = undef; 
     28if ($ARGV[1]) { 
     29        $wavfilename = $ARGV[1]; 
    2930} 
    30 my $inputfilename = $ARGV[0]; 
    3131 
    3232open(IN, "<$inputfilename") or die ("cannot open input filename for input");  
    33 open(PROMPTS, ">prompts") or die ("cannot open prompts for output"); 
    3433my @eText = <IN>; # slurp in entire file into an array 
    35 my $eText = "@eText";  # convert the array to a scalar variable 
    36 $eText =~ s/\n//g; # remove all line feeds from the text file 
    37 $eText =~ s/\r//g; # remove all carriage returns from the text file 
     34close(IN); 
    3835 
    39 my $x=0; 
     36#################################################################### 
     37### Cleans up eText and generates prompts file 
    4038my @words; 
    41 foreach my $line (@eText) { 
     39foreach my $line (@eText) { # !!!!!! does this even get a chance to loop since all LFs and CRs have been removed???? 
     40        $line =~ s/\n/ /g; # remove all line feeds from the text file 
     41        $line =~ s/\r/ /g; # remove all carriage returns from the text file 
    4242        $line =~ tr/a-z/A-Z/; # change to uppercase 
    4343        $line =~ s/,//g; # remove commas  
    44         $line =~ s/\.//g; # remove periods   
     44        $line =~ s/\./ /g; # remove periods   
    4545        #  $line =~ s/\'//g; # remove single quotes; but need words like "don't" - need to research this more ... 
    4646        $line =~ s/\"//g; # remove all double quotes 
    4747        $line =~ s/://g; # remove colon 
    4848        #  $line =~ s/-//g; # compound word dash; but VoxForge dictionary contains words with dashes ... 
    49         $line =~ s/--//g; #double dash 
     49        $line =~ s/--/ /g; #double dash 
    5050        $line =~ s/ - / /g; # dash punctuation   
    5151        $line =~ s/ -/ /g; # dash punctuation            
     52        $line =~ s/-/ /g; # dash - compound word         
    5253        $line =~ s/;//g; # semi-colon 
    5354        $line =~ s/!//g; # exclamation mark 
    5455        $line =~ s/\?//g; # question mark                
    5556        $line =~ s/  / /g; # cleanup double spaces       
     57        $line =~ s/=//g; # remove equal sign 
     58        $line =~ s/\(//g; # remove parenthesis   
     59        $line =~ s/\)//g; # remove parenthesis   
     60        $line =~ s/_//g; # remove underscore     
    5661        # Other cleanup !!!!!! need to change the prompts files directly rather than doing this!!! or add to dictionary!!! 
    5762        $line =~ s/&/AND/g;  
    58         print PROMPTS "$line\n"; 
    5963         
    6064        my @wordlist = split(/ /,$line); 
    6165        foreach my $word (@wordlist) { 
    6266                if ($word =~ /\S/) {  #Anything other than white space  [^ \r\t\n\f] 
    63                 $word =~ s/\s//g; 
     67                        $word =~ s/\s//g; 
     68                        $word =~ s/\A\'//; # remove single quote from beginning of word 
     69                        $word =~ s/\'\Z//; # remove single quote from end of word 
    6470                        push (@words, $word); 
    6571                } 
    6672        } 
    6773} 
    68  
     74#################################################################### 
     75### create MLF file 
     76if ($wavfilename) { 
     77        open(MLF, ">words.mlf") or die ("cannot open words.mlf for output"); 
     78        print MLF "#!MLF!#\n"; #  
     79        print MLF "\"$wavfilename.lab\"\n"; 
     80        foreach my $word (@words) { 
     81                print MLF "$word\n"; 
     82        } 
     83        print MLF "\.\n"; 
     84        close (MLF); 
     85
     86#################################################################### 
     87### create WLIST file 
     88push (@words, "SENT-END"); 
     89push (@words, "SENT-START"); 
    6990my @words2 = sort(@words);  
    7091my %seen; 
     
    7495                print WLIST "$word\n"; 
    7596} 
    76 close(IN); 
    77 close(PROMPTS); 
    7897close(WLIST); 
     98 
     99 
  • Trunk/Scripts/Audio_scripts/AudioBooks/htksegment.pl

    r2212 r2217  
    2525#################################################################### 
    2626my $average_sentence_length = 10; 
    27 my $max_sentence_length = 20; 
    28 my $min_pause_for_sentence_break = 250000; # HTK time format - 100 nanosecond units (250000 = 25 ms) 
     27my $max_sentence_length = 15; 
     28#my $min_pause_for_sentence_break = 2000000; # HTK time format - 100 nanosecond units (2000000 = 200 ms) 
     29my $min_pause_for_sentence_break = 200000; 
    2930my $debug = 0; 
     31my $process_audio = 1; 
    3032#################################################################### 
    3133if ($#ARGV != 1) {  
     
    6668        }        
    6769} 
    68 foreach my $line (@aligned_words) { 
    69 #       my @aligned_line = split(/ /,$line); 
    70 #       my ($word,$startTime, $endTime, $pause) = @aligned_line;         
    71         print "$line\n"; 
    72 
     70#if ($debug) { 
     71        open(LOG, ">htksegment_log") or die ("cannot open htksegment_log for output");   
     72        foreach my $line (@aligned_words) { 
     73                print LOG "$line\n"; 
     74        } 
     75        close (LOG); 
     76#} 
    7377print "####################################################################\n"; 
    7478 
     
    105109                        # Process Audio 
    106110                        my @firstword = split(/ /,$aligned_words[$sentence_start]);              
    107                         my @lastword = split(/ /,$aligned_words[$sentence_end + $up_increment]);         
    108                         my $startTime = $firstword[1] - ($min_pause_for_sentence_break*.90); 
     111                        my @lastword = split(/ /,$aligned_words[$sentence_end + $up_increment]); 
     112                        # !!!!!!         
     113                        # insert pause that follows previous word 
     114                        my @previousword = split(/ /,$aligned_words[$sentence_start-1]);         
     115                        $startTime = $previousword[2]; # sentence end of previous word 
     116                        print "Sentence_start:$sentence_start:firstword: @firstword\n\tlastword:@lastword\n\tpreviousword:@previousword\n" if $debug; 
     117                        # !!!!!! 
    109118                        my $endTime = $lastword[2] + $lastword[3]; 
    110 # !!!!!!                       process_audio ($startTime, $endTime,$padded_fileid);                            
     119                       process_audio ($startTime, $endTime,$padded_fileid) if $process_audio;                          
    111120                        $true = 0; 
    112121                }  
     
    126135$command = ("rm wav/temp.wav"); print "cmd:$command\n" if $debug; system($command);  
    127136print "\nCompleted!\nSentence Length: min:$min_sentence_length_linenumber->$min_sentence_length_found; max:$max_sentence_length_linenumber->$max_sentence_length_found\n"; 
    128 print "sentences over max_sentence_length:$max_sentence_length:\n"; 
     137print "\nSentences over max_sentence_length of $max_sentence_length words:\n"; 
    129138foreach my $line (@max_sentences) { 
    130         print "$line\n"; 
     139        print "\t$line\n"; 
    131140} 
    132141sub sentence_test { 
     
    148157                my @firstword = split(/ /,$aligned_words[$sentence_start]);              
    149158                my @lastword = split(/ /,$aligned_words[$sentence_end + $increment]);    
    150                 my $startTime = $firstword[1]  - ($min_pause_for_sentence_break*.90); 
     159                # !!!!!! 
     160                if ($sentence_start == 0) { 
     161                        $startTime = 0; 
     162                        print "Sentence_start:$sentence_start:firstword: @firstword\n\tlastword:@lastword\n" if $debug; 
     163                } else { 
     164                        # insert pause that follows previous word 
     165                        my @previousword = split(/ /,$aligned_words[$sentence_start-1]);         
     166                        $startTime = $previousword[2]; # sentence end of previous word 
     167                        print "Sentence_start:$sentence_start:firstword: @firstword\n\tlastword:@lastword\n\tpreviousword:@previousword\n" if $debug; 
     168                } 
     169                # !!!!!! 
    151170                my $endTime = $lastword[2] + $lastword[3]; 
    152 # !!!!!!               process_audio ($startTime, $endTime,$padded_fileid)
     171               process_audio ($startTime, $endTime,$padded_fileid) if $process_audio
    153172                # Calculate min and max sentence 
    154173                if ((($sentence_end + $increment)-$sentence_start) > $max_sentence_length) { 
    155                         my $wordcount = (($sentence_end + $increment)-$sentence_start); 
     174                        my $wordcount = ((($sentence_end + $increment)-$sentence_start)+1); 
    156175                        push (@max_sentences, "$filename_nosuffix$padded_fileid:$wordcount"); 
    157176                } elsif ((($sentence_end + $increment)-$sentence_start) < $min_sentence_length_found) { 
     
    159178                        $min_sentence_length_linenumber = "$filename_nosuffix$padded_fileid"; 
    160179            } elsif ((($sentence_end + $increment)-$sentence_start) > $max_sentence_length_found) { 
    161                 $max_sentence_length_found = ($sentence_end + $increment)-$sentence_start
     180                $max_sentence_length_found = (($sentence_end + $increment)-$sentence_start)+1
    162181                $max_sentence_length_linenumber = "$filename_nosuffix$padded_fileid"; 
    163182            } 
     
    173192sub process_audio { 
    174193        my ($startTime, $endTime,$padded_fileid) = @_; 
     194        print "\t$startTime:$endTime:$padded_fileid\n" if $debug; 
    175195        # HCopy can only process 16 bit files!  
    176196        # HCopy does not create proper WAV/RIFF Headers!