- Timestamp:
- 06/09/08 17:06:50 (6 months ago)
- Files:
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
Trunk/Scripts/Audio_scripts/AudioSegmentation/AudioBook/Text.pm
r2604 r2606 1 1 #! /usr/bin/perl 2 $VERSION = 0.2 ;2 $VERSION = 0.2.1; 3 3 4 4 =head1 NAME 5 5 6 AudioBook::Text - Text transcription processing6 Text - Text transcription processing 7 7 8 8 =cut … … 27 27 28 28 sub new { 29 my ($class,$super,$textFile) = @_; 30 my %self; 31 $self{"inputfilename"} = $textFile; 32 $self{'log'} = $super->{'log'}; 33 $self{"log_single_quotes"} = $super->{"log_single_quotes"}; 34 _clean(\%self); 35 bless(\%self,$class); 36 return \%self; 29 my ($class,$chapter) = @_; 30 my $self = {}; 31 bless($self,$class); 32 $self->{'audiobookObject'} = $chapter->getAudioBookObject(); 33 _clean($self); 34 return $self; 37 35 } 38 36 39 37 =head2 _clean 40 38 41 Called by the "new" constructor - removes many (not all) non-alphanumeric characters. 42 43 $line =~ s/\n/ /g; # remove all line feeds from the text file 44 $line =~ s/\r/ /g; # remove all carriage returns from the text file 45 $line =~ tr/a-z/A-Z/; # change to uppercase 46 $line =~ s/\.\"//g; # period followed by double quote 47 $line =~ s/\,\"//g; # comma followed by double quote 48 $line =~ s/\;\"//g; # semi-colon followed by double quote 49 $line =~ s/\?\"//g; # question mark followed by double quote 50 $line =~ s/\!\"//g; # exclamation mark followed by double quote 51 $line =~ s/\.\'//g; # period followed by single quote 52 $line =~ s/\,\'//g; # comma followed by single quote 53 $line =~ s/\;\'//g; # semi-colon followed by single quote 54 $line =~ s/\?\'//g; # question mark followed by single quote 55 $line =~ s/\!\'//g; # exclamation mark followed by single quote 56 $line =~ s/\"//g; # remove all double quotes 57 $line =~ s/,//g; # remove commas 58 $line =~ s/://g; # remove colon 59 $line =~ s/--/ /g; #double dash 60 $line =~ s/ - / /g; # dash punctuation 61 $line =~ s/ -/ /g; # dash punctuation 62 $line =~ s/-/ /g; # dash - compound word; replace with space, so they can be looked up in pronunciation dictionary 63 $line =~ s/;//g; # semi-colon 64 $line =~ s/!//g; # exclamation mark 65 $line =~ s/\?//g; # question mark 66 $line =~ s/ / /g; # cleanup double spaces 67 $line =~ s/=//g; # remove equal sign 68 $line =~ s/\(//g; # remove parenthesis 69 $line =~ s/\)//g; # remove parenthesis 70 $line =~ s/_//g; # remove underscore 71 $line =~ s/\[//g; # remove left bracket 72 $line =~ s/\]//g; # remove right bracket 73 $line =~ s/\*//g; # remove star 74 $line =~ s/&/AND/g; 39 Called by the "new" constructor - removes many (not all) non-alphanumeric characters & create a list of words, one word per line, as 40 they appear in the orinal text file. 75 41 76 42 =cut … … 78 44 sub _clean { # private 79 45 my ($self) = @_; 80 my $ debug = $self->{"debug"};81 my $ inputfilename = $self->{"inputfilename"};82 my $ log = $self->{'log'};83 46 my $audioBook = $self->{'audiobookObject'}; 47 my $debug = $audioBook->getDebug; 48 my $inputfilename = $audioBook->getTextFile(); 49 my $log = $audioBook->getLog(); 84 50 open(IN, "<$inputfilename") or confess ("error: cannot open input $inputfilename for input"); 85 51 open(LOG,">>$log") or confess ("cannot open $log file"); … … 190 156 sub _processSingleQuote { #private 191 157 my ($self, $word, $quotelog)= @_; 192 my $debug = $self->{"debug"}; 193 my $log_single_quotes = $self->{"log_single_quotes"}; 158 my $audioBook = $self->{'audiobookObject'}; 159 my $debug = $audioBook->getDebug; 160 my $log_single_quotes = $audioBook->getLog_single_quotes(); 194 161 195 162 if ( ($$word =~ /^\'/) and ($$word =~ /\'$/) ) { # word within two single quotes (from beginning and end of word) - assume it is a quoted word … … 222 189 sub _processPeriods { #private 223 190 my ($self,$wordarray, $word,$periodlog,$quotelog)= @_; 224 my $debug = $self->{"debug"}; 191 my $audioBook = $self->{'audiobookObject'}; 192 my $debug = $audioBook->getDebug; 225 193 my $wordsbefore; 226 194 my $match = 0; … … 263 231 sub _processEmails { #private 264 232 my ($self,$wordarray, $word)= @_; 265 my $debug = $self->{"debug"}; 233 my $audioBook = $self->{'audiobookObject'}; 234 my $debug = $audioBook->getDebug; 235 266 236 if ($word =~ /\d/) { 267 237 confess "error - fix: $word contains numbers\n" … … 295 265 sub _processUrls { #private 296 266 my ($self,$wordarray, $word)= @_; 297 my $debug = $self->{"debug"}; 298 print "processingUrls: $word\n" if $debug; 267 my $audioBook = $self->{'audiobookObject'}; 268 my $debug = $audioBook->getDebug; 269 270 print "processing Urls: $word\n" if $debug; 299 271 300 272 if ($word =~ /(www)?\.(\w+)\.([A-Za-z]{2,4})/) { # URL: www.abc.com … … 341 313 sub _processWordsContainingNumbers { #private 342 314 my ($self,$wordarray, $subword)= @_; 343 my $debug = $self->{"debug"}; 315 my $audioBook = $self->{'audiobookObject'}; 316 my $debug = $audioBook->getDebug; 317 344 318 # separates numbers contained in a word, and converts numerical section of a word into its own word. 345 319 if ($subword =~ /\d+\D+/) { # assume single, consecutive set of numbers (i.e no split numbers in word) … … 386 360 sub _processNumbers { #private 387 361 my ($self,$words, $number)= @_; 388 my $debug = $self->{"debug"}; 362 my $audioBook = $self->{'audiobookObject'}; 363 my $debug = $audioBook->getDebug; 389 364 390 365 if ($number =~ /^\d+,\d+/) { # comma separated number #!!!!!! but commas are removed above ... … … 432 407 push (@$words, "DOLLARS"); 433 408 print LOG "converted dollars:$number: to $wordnum dollars\n"; 434 # skip this - minutes do not get processed properly (since they use a signle quote) - do manually in eText file.435 # } elsif ($number =~ /^\d{1,3}°$/) { # convert degrees to words436 # $number =~ s/°//; # remove degree sign437 # my $wordnum = num2en($number);438 # $wordnum =~ s/-/ /g; # dash - compound number439 # my @wordnumlist = split(/ /,$wordnum);440 # foreach my $word (@wordnumlist) {441 # push (@$words, uc $word);442 # }443 # push (@$words, "DEGREES");444 # print "converted degrees:$number°: to $wordnum degrees\n";445 409 } elsif (($number =~ /^\d+\w+/) and ($number !~ /°/)){ # convert ordinal number to words 446 410 my $numbertemp = $number; … … 478 442 sub createWLISTFile { # public 479 443 my ($self, $WLISTFile)= @_; 480 my $debug = $self->{"debug"}; 444 my $audioBook = $self->{'audiobookObject'}; 445 my $debug = $audioBook->getDebug; 481 446 my $words = $self->{"contents"}; 482 447 … … 507 472 sub createMLFFile { # public 508 473 my ($self,$wavfilename,$MLFFile)= @_; 509 my $ debug = $self->{"debug"};510 $self->{"wavfilename"} = $wavfilename;511 print " wavfilename:$wavfilename\n";474 my $audioBook = $self->{'audiobookObject'}; 475 my $debug = $audioBook->getDebug; 476 print "createMLFFile:wavfilename:$wavfilename\n"; 512 477 513 478 my $words = $self->{"contents"}; … … 525 490 print MLF "\.\n"; 526 491 close (MLF); 492 493 $self->{"wavfilename"} = $wavfilename; 494 return 1; 527 495 } 528 496 529 497 =head1 Change Log 530 498 499 2008/06/09 - 0.2.1 - class created - part of refacture to create Chapter, Segments & MissingWords classes 531 500 2008/05/02 - 0.2 - Convert to class; major refacture; renamed from etext2wlist.pl to Text.pm 532 501