voxforge.org
VoxForge Dev
Show
Ignore:
Timestamp:
06/09/08 17:06:50 (6 months ago)
Author:
kmaclean
Message:

refacture to create Chapter, Segments & MissingWords? classes - snapshop

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • Trunk/Scripts/Audio_scripts/AudioSegmentation/AudioBook/Text.pm

    r2604 r2606  
    11#! /usr/bin/perl 
    2 $VERSION = 0.2
     2$VERSION = 0.2.1
    33 
    44=head1 NAME 
    55 
    6 AudioBook::Text - Text transcription processing    
     6Text - Text transcription processing    
    77 
    88=cut  
     
    2727 
    2828sub new { 
    29         my ($class,$super,$textFile) = @_;  
    30         my %self; 
    31         $self{"inputfilename"} = $textFile; 
    32         $self{'log'} = $super->{'log'};          
    33         $self{"log_single_quotes"} = $super->{"log_single_quotes"}; 
    34         _clean(\%self); 
    35         bless(\%self,$class); 
    36         return \%self; 
     29        my ($class,$chapter) = @_;  
     30        my $self = {}; 
     31        bless($self,$class); 
     32        $self->{'audiobookObject'} = $chapter->getAudioBookObject();     
     33        _clean($self); 
     34        return $self; 
    3735}     
    3836 
    3937=head2 _clean  
    4038 
    41 Called by the "new" constructor - removes many (not all) non-alphanumeric characters. 
    42  
    43                 $line =~ s/\n/ /g; # remove all line feeds from the text file 
    44                 $line =~ s/\r/ /g; # remove all carriage returns from the text file 
    45                 $line =~ tr/a-z/A-Z/; # change to uppercase 
    46                 $line =~ s/\.\"//g; # period followed by double quote 
    47                 $line =~ s/\,\"//g; # comma followed by double quote 
    48                 $line =~ s/\;\"//g; # semi-colon followed by double quote                
    49                 $line =~ s/\?\"//g; # question mark followed by double quote 
    50                 $line =~ s/\!\"//g; # exclamation mark followed by double quote  
    51                 $line =~ s/\.\'//g; # period followed by single quote 
    52                 $line =~ s/\,\'//g; # comma followed by single quote 
    53                 $line =~ s/\;\'//g; # semi-colon followed by single quote                                
    54                 $line =~ s/\?\'//g; # question mark followed by single quote 
    55                 $line =~ s/\!\'//g; # exclamation mark followed by single quote 
    56                 $line =~ s/\"//g; # remove all double quotes                             
    57                 $line =~ s/,//g; # remove commas 
    58                 $line =~ s/://g; # remove colon 
    59                 $line =~ s/--/ /g; #double dash 
    60                 $line =~ s/ - / /g; # dash punctuation   
    61                 $line =~ s/ -/ /g; # dash punctuation            
    62                 $line =~ s/-/ /g; # dash - compound word; replace with space, so they can be looked up in pronunciation dictionary 
    63                 $line =~ s/;//g; # semi-colon 
    64                 $line =~ s/!//g; # exclamation mark 
    65                 $line =~ s/\?//g; # question mark                
    66                 $line =~ s/  / /g; # cleanup double spaces       
    67                 $line =~ s/=//g; # remove equal sign 
    68                 $line =~ s/\(//g; # remove parenthesis   
    69                 $line =~ s/\)//g; # remove parenthesis   
    70                 $line =~ s/_//g; # remove underscore     
    71                 $line =~ s/\[//g; # remove left bracket 
    72                 $line =~ s/\]//g; # remove right bracket         
    73                 $line =~ s/\*//g; # remove star                          
    74                 $line =~ s/&/AND/g;  
     39Called by the "new" constructor - removes many (not all) non-alphanumeric characters & create a list of words, one word per line, as 
     40they appear in the orinal text file. 
    7541 
    7642=cut 
     
    7844sub _clean { # private 
    7945        my ($self) = @_; 
    80         my $debug = $self->{"debug"};   
    81         my $inputfilename = $self->{"inputfilename"}; 
    82         my $log = $self->{'log'};       
    83          
     46        my $audioBook = $self->{'audiobookObject'}; 
     47        my $debug = $audioBook->getDebug;       
     48        my $inputfilename = $audioBook->getTextFile(); 
     49        my $log = $audioBook->getLog(); 
    8450        open(IN, "<$inputfilename") or confess ("error: cannot open input $inputfilename for input");  
    8551        open(LOG,">>$log") or confess ("cannot open $log file");         
     
    190156sub _processSingleQuote { #private 
    191157        my ($self, $word, $quotelog)= @_; 
    192         my $debug = $self->{"debug"}; 
    193         my $log_single_quotes = $self->{"log_single_quotes"}; 
     158        my $audioBook = $self->{'audiobookObject'}; 
     159        my $debug = $audioBook->getDebug; 
     160        my $log_single_quotes = $audioBook->getLog_single_quotes(); 
    194161         
    195162        if ( ($$word =~ /^\'/) and ($$word =~ /\'$/) ) { # word within two single quotes (from beginning and end of word) - assume it is a quoted word  
     
    222189sub _processPeriods { #private 
    223190        my ($self,$wordarray, $word,$periodlog,$quotelog)= @_; 
    224         my $debug = $self->{"debug"}; 
     191        my $audioBook = $self->{'audiobookObject'}; 
     192        my $debug = $audioBook->getDebug; 
    225193        my $wordsbefore; 
    226194        my $match = 0; 
     
    263231sub _processEmails { #private 
    264232        my ($self,$wordarray, $word)= @_; 
    265         my $debug = $self->{"debug"}; 
     233        my $audioBook = $self->{'audiobookObject'}; 
     234        my $debug = $audioBook->getDebug; 
     235         
    266236        if ($word =~ /\d/) { 
    267237                confess "error - fix: $word contains numbers\n" 
     
    295265sub _processUrls { #private 
    296266        my ($self,$wordarray, $word)= @_; 
    297         my $debug = $self->{"debug"}; 
    298         print "processingUrls: $word\n" if $debug;       
     267        my $audioBook = $self->{'audiobookObject'}; 
     268        my $debug = $audioBook->getDebug; 
     269         
     270        print "processing Urls: $word\n" if $debug;      
    299271         
    300272        if ($word =~ /(www)?\.(\w+)\.([A-Za-z]{2,4})/)  { # URL: www.abc.com 
     
    341313sub _processWordsContainingNumbers { #private 
    342314        my ($self,$wordarray, $subword)= @_; 
    343         my $debug = $self->{"debug"}; 
     315        my $audioBook = $self->{'audiobookObject'}; 
     316        my $debug = $audioBook->getDebug; 
     317         
    344318        # separates numbers contained in a word, and converts numerical section of a word into its own word. 
    345319        if ($subword =~ /\d+\D+/) {  # assume single, consecutive set of numbers (i.e no split numbers in word) 
     
    386360sub _processNumbers { #private 
    387361        my ($self,$words, $number)= @_; 
    388         my $debug = $self->{"debug"};    
     362        my $audioBook = $self->{'audiobookObject'}; 
     363        my $debug = $audioBook->getDebug; 
    389364 
    390365        if ($number =~ /^\d+,\d+/) { # comma separated number #!!!!!! but commas are removed above ... 
     
    432407                push (@$words, "DOLLARS"); 
    433408                print LOG "converted dollars:$number: to $wordnum dollars\n"; 
    434 # skip this - minutes do not get processed properly (since they use a signle quote) - do manually in eText file.                 
    435 #       } elsif ($number =~ /^\d{1,3}°$/) { # convert degrees to words 
    436 #               $number =~ s/°//; # remove degree sign 
    437 #               my $wordnum = num2en($number); 
    438 #               $wordnum =~ s/-/ /g; # dash - compound number                    
    439 #               my @wordnumlist = split(/ /,$wordnum); 
    440 #               foreach my $word (@wordnumlist) { 
    441 #                       push (@$words, uc $word); 
    442 #               } 
    443 #               push (@$words, "DEGREES"); 
    444 #               print "converted degrees:$number°: to $wordnum degrees\n"; 
    445409        } elsif (($number =~ /^\d+\w+/) and  ($number !~ /°/)){ # convert ordinal number to words 
    446410                my $numbertemp = $number; 
     
    478442sub createWLISTFile { # public 
    479443        my ($self, $WLISTFile)= @_; 
    480         my $debug = $self->{"debug"};    
     444        my $audioBook = $self->{'audiobookObject'}; 
     445        my $debug = $audioBook->getDebug;        
    481446        my $words  = $self->{"contents"}; 
    482447         
     
    507472sub createMLFFile { # public 
    508473        my ($self,$wavfilename,$MLFFile)= @_; 
    509         my $debug = $self->{"debug"};   
    510         $self->{"wavfilename"} = $wavfilename;  
    511         print "wavfilename:$wavfilename\n"; 
     474        my $audioBook = $self->{'audiobookObject'}; 
     475        my $debug = $audioBook->getDebug; 
     476        print "createMLFFile:wavfilename:$wavfilename\n"; 
    512477         
    513478        my $words = $self->{"contents"}; 
     
    525490        print MLF "\.\n"; 
    526491        close (MLF); 
     492 
     493        $self->{"wavfilename"} = $wavfilename;   
     494        return 1; 
    527495} 
    528496 
    529497=head1 Change Log     
    530498 
     4992008/06/09 - 0.2.1 - class created - part of refacture to create Chapter, Segments & MissingWords classes 
    5315002008/05/02 - 0.2 - Convert to class; major refacture; renamed from etext2wlist.pl to Text.pm 
    532501