voxforge.org
VoxForge Dev
Show
Ignore:
Timestamp:
05/26/08 14:11:03 (6 months ago)
Author:
kmaclean
Message:

AudioSegmentation scripts -add POD docs

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • Trunk/Scripts/Audio_scripts/AudioSegmentation/AudioBook/Text.pm

    r2590 r2593  
    11#! /usr/bin/perl 
    2 #################################################################### 
    3 ### 
    4 ### script name : Text.pm 
    5 ### version: 0.2 
    6 ### created by: Ken MacLean 
    7 ### mail: contact@voxforge.org 
    8 ### Date: 2007.3.20 
    9 ###    
    10 ### Copyright (C) 2007 Ken MacLean 
    11 ### 
    12 ### This program is free software; you can redistribute it and/or 
    13 ### modify it under the terms of the GNU General Public License 
    14 ### as published by the Free Software Foundation; either version 2 
    15 ### of the License, or (at your option) any later version. 
    16 ### 
    17 ### This program is distributed in the hope that it will be useful, 
    18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of 
    19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
    20 ### GNU General Public License for more details. 
    21 ###         
    22 ### Change History:     
    23 ### 2008/05/02 - 0.2 - Convert to class; major refactoring; renamed from etext2wlist.pl to Text.pm 
    24 #################################################################### 
     2$VERSION = 0.2; 
     3 
     4=head1 NAME 
     5 
     6AudioBook::Text - Text transcription processing    
     7 
     8=cut  
     9 
    2510package AudioBook::Text; 
    2611use strict; 
     
    3015use Lingua::EN::Numbers qw(num2en num2en_ordinal); 
    3116use Lingua::EN::Numbers::Years; 
    32 #################################################################### 
    33 ### Constructor 
    34 #################################################################### 
     17 
     18=head1 METHODS (not user accessible) 
     19 
     20=cut 
     21 
     22=head2 new  
     23 
     24Constructor - creates a text object 
     25 
     26=cut 
     27 
    3528sub new { 
    3629        my ($class,$super,$textFile) = @_;  
     
    4336        return \%self; 
    4437}     
    45 #################################################################### 
    46 ### Class Methods 
    47 #################################################################### 
    48 ### Cleans up eText  
     38 
     39=head2 _clean  
     40 
     41Called by the "new" constructor - removes many (not all) non-alphanumeric characters. 
     42 
     43                $line =~ s/\n/ /g; # remove all line feeds from the text file 
     44                $line =~ s/\r/ /g; # remove all carriage returns from the text file 
     45                $line =~ tr/a-z/A-Z/; # change to uppercase 
     46                $line =~ s/\.\"//g; # period followed by double quote 
     47                $line =~ s/\,\"//g; # comma followed by double quote 
     48                $line =~ s/\?\"//g; # question mark followed by double quote 
     49                $line =~ s/\!\"//g; # exclamation mark followed by double quote  
     50                $line =~ s/\.\'//g; # period followed by single quote 
     51                $line =~ s/\,\'//g; # comma followed by single quote 
     52                $line =~ s/\?\'//g; # question mark followed by single quote 
     53                $line =~ s/\!\'//g; # exclamation mark followed by single quote 
     54                $line =~ s/\"//g; # remove all double quotes                             
     55                $line =~ s/,//g; # remove commas 
     56                $line =~ s/://g; # remove colon 
     57                $line =~ s/--/ /g; #double dash 
     58                $line =~ s/ - / /g; # dash punctuation   
     59                $line =~ s/ -/ /g; # dash punctuation            
     60                $line =~ s/-/ /g; # dash - compound word; replace with space, so they can be looked up in pronunciation dictionary 
     61                $line =~ s/;//g; # semi-colon 
     62                $line =~ s/!//g; # exclamation mark 
     63                $line =~ s/\?//g; # question mark                
     64                $line =~ s/  / /g; # cleanup double spaces       
     65                $line =~ s/=//g; # remove equal sign 
     66                $line =~ s/\(//g; # remove parenthesis   
     67                $line =~ s/\)//g; # remove parenthesis   
     68                $line =~ s/_//g; # remove underscore     
     69                $line =~ s/\[//g; # remove left bracket 
     70                $line =~ s/\]//g; # remove right bracket         
     71                $line =~ s/\*//g; # remove star                          
     72                $line =~ s/&/AND/g;  
     73 
     74=cut 
     75 
    4976sub _clean { # private 
    5077        my ($self) = @_; 
     
    151178}  
    152179 
     180=head2 _processSingleQuote  
     181 
     182Special logic to deal with single quotes 
     183 
     184=cut 
     185 
    153186sub _processSingleQuote { #private 
    154187        my ($self, $word, $quotelog)= @_; 
     
    173206} 
    174207 
     208=head2 _processPeriods  
     209 
     210Special logic to deal with periods 
     211 
     212=cut 
     213 
    175214sub _processPeriods { #private 
    176215        my ($self,$wordarray, $word,$periodlog,$quotelog)= @_; 
     
    204243        } 
    205244} 
     245 
     246=head2 _processEmails  
     247 
     248Special logic to deal with emails 
     249 
     250 convert a period to the word "DOT" 
     251 convert "@" sign to the word "AT" 
     252 
     253=cut 
    206254 
    207255sub _processEmails { #private 
     
    225273} 
    226274 
     275=head2 _processUrls  
     276 
     277Special logic to deal with URLs 
     278 
     279 convert a period to the word "DOT" 
     280 
     281=head3 Notes 
     282 
     283Does not transcribe "http://"; need to listen to the audio to confirm how the user says this. 
     284 
     285=cut 
     286 
    227287sub _processUrls { #private 
    228288        my ($self,$wordarray, $word)= @_; 
     
    260320        } 
    261321} 
     322 
     323=head2 _processWordsContainingNumbers  
     324 
     325Special logic to deal with words that contain numbers 
     326 
     327For example, converts the word "day12" to the words "DAY ONE TWO".   
     328 
     329Need to listen to the audio to confirm how the user says this. 
     330 
     331=cut 
    262332 
    263333sub _processWordsContainingNumbers { #private 
     
    295365        }        
    296366} 
     367 
     368=head2 _processNumbers  
     369 
     370Special logic to convert numbers to words 
     371 
     372uses:  
     373 Lingua::EN::Numbers qw(num2en num2en_ordinal); 
     374 Lingua::EN::Numbers::Years; 
     375 
     376=cut 
    297377         
    298378sub _processNumbers { #private 
     
    382462} 
    383463 
     464=head2 createWLISTFile  
     465 
     466creates the word list file (called Wlist) for use by HTK tools 
     467 
     468=cut 
     469 
    384470sub createWLISTFile { # public 
    385471        my ($self, $WLISTFile)= @_; 
     
    400486} 
    401487 
     488=head2 createMLFFile  
     489 
     490creates the Multi-Label File (called MLF file) for use by HTK tools 
     491 
     492=cut 
     493 
    402494sub createMLFFile { # public 
    403495        my ($self,$wavfilename,$MLFFile)= @_; 
     
    416508        close (MLF); 
    417509} 
     510 
     511=head1 Change Log     
     512 
     513        2008/05/02 - 0.2 - Convert to class; major refactoring; renamed from etext2wlist.pl to Text.pm 
     514 
     515=head1 AUTHOR 
     516     
     517Ken MacLean 
     518contact@voxforge.org 
     519       
     520=head1 COPYRIGHT AND LICENSE        
     521       
     522Copyright (C) 2007 Ken MacLean 
     523    
     524This program is free software; you can redistribute it and/or 
     525modify it under the terms of the GNU General Public License 
     526as published by the Free Software Foundation; either version 2 
     527of the License, or (at your option) any later version. 
     528    
     529This program is distributed in the hope that it will be useful, 
     530but WITHOUT ANY WARRANTY; without even the implied warranty of 
     531MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
     532GNU General Public License for more details. 
     533     
     534=cut 
     535 
    4185361;