git.gag.com Git - debian/freetts/blob - com/sun/speech/freetts/en/TokenizerImpl.java

   1 /**
   2  * Portions Copyright 2001 Sun Microsystems, Inc.
   3  * Portions Copyright 1999-2001 Language Technologies Institute,
   4  * Carnegie Mellon University.
   5  * All Rights Reserved.  Use is subject to license terms.
   6  *
   7  * See the file "license.terms" for information on usage and
   8  * redistribution of this file, and for a DISCLAIMER OF ALL
   9  * WARRANTIES.
  10  */
  11 package com.sun.speech.freetts.en;
  12
  13 import com.sun.speech.freetts.Token;
  14 import com.sun.speech.freetts.Tokenizer;
  15 import java.io.Reader;
  16 import java.io.IOException;
  17
  18
  19 /**
  20  * Implements the tokenizer interface. Breaks an input sequence of
  21  * characters into a set of tokens.
  22  */
  23 public class TokenizerImpl implements Tokenizer {
  24
  25     /** A constant indicating that the end of the stream has been read. */
  26     public static final int EOF = -1;
  27
  28     /** A string containing the default whitespace characters. */
  29     public static final String DEFAULT_WHITESPACE_SYMBOLS = " \t\n\r";
  30
  31     /** A string containing the default single characters. */
  32     public static final String DEFAULT_SINGLE_CHAR_SYMBOLS = "(){}[]";
  33
  34     /** A string containing the default pre-punctuation characters. */
  35     public static final String DEFAULT_PREPUNCTUATION_SYMBOLS = "\"'`({[";
  36
  37     /** A string containing the default post-punctuation characters. */
  38     public static final String DEFAULT_POSTPUNCTUATION_SYMBOLS
  39         = "\"'`.,:;!?(){}[]";
  40
  41
  42     // the line number
  43     private int lineNumber = 0;
  44
  45     // the input text (from the Utterance) to tokenize
  46     private String inputText = null;
  47
  48     // the file to read input text from, if using file mode
  49     private Reader reader = null;
  50
  51     // the token position - doesn't seem really necessary at this point
  52     // private int tokenPosition = 0;
  53
  54     // the current character, whether its from the file or the input text
  55     private int currentChar = 0;
  56
  57     // the current char position for the input text (not the file)
  58     // this is called "file_pos" in flite
  59     private int currentPosition = 0;
  60
  61
  62     // the delimiting symbols of this tokenizer
  63     private String whitespaceSymbols = DEFAULT_WHITESPACE_SYMBOLS;
  64     private String singleCharSymbols = DEFAULT_SINGLE_CHAR_SYMBOLS;
  65     private String prepunctuationSymbols = DEFAULT_PREPUNCTUATION_SYMBOLS;
  66     private String postpunctuationSymbols = DEFAULT_POSTPUNCTUATION_SYMBOLS;
  67
  68     // The error description
  69     private String errorDescription = null;
  70
  71     // a place to store the current token
  72     private Token token;
  73     private Token lastToken = null;
  74
  75     // for timing
  76     private long duration = 0;
  77
  78
  79     /**
  80      * Constructs a Tokenizer.
  81      */
  82     public TokenizerImpl() {
  83     }
  84
  85
  86     /**
  87      * Creates a tokenizer that will return tokens from
  88      * the given string.
  89      *
  90      * @param string the string to tokenize
  91      */
  92     public TokenizerImpl(String string) {
  93         setInputText(string);
  94     }
  95
  96     /**
  97      * Creates a tokenizer that will return tokens from
  98      * the given file.
  99      *
 100      * @param file where to read the input from
 101      */
 102     public TokenizerImpl(Reader file) {
 103         setInputReader(file);
 104     }
 105
 106
 107     /**
 108      * Sets the whitespace symbols of this Tokenizer to the given symbols.
 109      *
 110      * @param symbols the whitespace symbols
 111      */
 112     public void setWhitespaceSymbols(String symbols) {
 113         whitespaceSymbols = symbols;
 114     }
 115
 116
 117     /**
 118      * Sets the single character symbols of this Tokenizer to the given
 119      * symbols.
 120      *
 121      * @param symbols the single character symbols
 122      */
 123     public void setSingleCharSymbols(String symbols) {
 124         singleCharSymbols = symbols;
 125     }
 126
 127
 128     /**
 129      * Sets the prepunctuation symbols of this Tokenizer to the given
 130      * symbols.
 131      *
 132      * @param symbols the prepunctuation symbols
 133      */
 134     public void setPrepunctuationSymbols(String symbols) {
 135         prepunctuationSymbols = symbols;
 136     }
 137
 138
 139     /**
 140      * Sets the postpunctuation symbols of this Tokenizer to the given
 141      * symbols.
 142      *
 143      * @param symbols the postpunctuation symbols
 144      */
 145     public void setPostpunctuationSymbols(String symbols) {
 146         postpunctuationSymbols = symbols;
 147     }
 148
 149
 150     /**
 151      * Sets the text to tokenize.
 152      *
 153      * @param  inputString  the string to tokenize
 154      */
 155     public void setInputText(String inputString) {
 156         inputText = inputString;
 157         currentPosition = 0;
 158
 159         if (inputText != null) {
 160             getNextChar();
 161         }
 162     }
 163
 164     /**
 165      * Sets the input reader
 166      *
 167      * @param  reader the input source
 168      */
 169     public void setInputReader(Reader reader) {
 170         this.reader = reader;
 171         getNextChar();
 172     }
 173
 174
 175     /**
 176      * Returns the next token.
 177      *
 178      * @return  the next token if it exists,
 179      *          <code>null</code> if no more tokens
 180      */
 181     public Token getNextToken() {
 182         lastToken = token;
 183         token = new Token();
 184
 185         // Skip whitespace
 186         token.setWhitespace(getTokenOfCharClass(whitespaceSymbols));
 187
 188         // quoted strings currently ignored
 189
 190         // get prepunctuation
 191         token.setPrepunctuation(getTokenOfCharClass(prepunctuationSymbols));
 192
 193         // get the symbol itself
 194         if (singleCharSymbols.indexOf(currentChar) != -1) {
 195             token.setWord(String.valueOf((char) currentChar));
 196             getNextChar();
 197         } else {
 198             token.setWord(getTokenNotOfCharClass(whitespaceSymbols));
 199         }
 200
 201         token.setPosition(currentPosition);
 202         token.setLineNumber(lineNumber);
 203
 204         // This'll have token *plus* postpunctuation
 205         // Get postpunctuation
 206         removeTokenPostpunctuation();
 207
 208         return token;
 209     }
 210
 211
 212     /**
 213      * Returns <code>true</code> if there are more tokens,
 214      *          <code>false</code> otherwise.
 215      *
 216      * @return <code>true</code> if there are more tokens
 217      *         <code>false</code> otherwise
 218      */
 219     public boolean hasMoreTokens() {
 220         int nextChar = currentChar;
 221         return (nextChar != EOF);
 222     }
 223
 224
 225     /**
 226      * Advances the currentPosition pointer by 1 (if not exceeding
 227      * length of inputText, and returns the character pointed by
 228      * currentPosition.
 229      *
 230      * @return the next character EOF if no more characters exist
 231      */
 232     private int getNextChar() {
 233         if (reader != null) {
 234             try {
 235                 int readVal  = reader.read();
 236                 if (readVal == -1) {
 237                     currentChar = EOF;
 238                 } else {
 239                     currentChar = (char) readVal;
 240                 }
 241             } catch (IOException ioe) {
 242                 currentChar = EOF;
 243                 errorDescription = ioe.getMessage();
 244             }
 245         } else if (inputText != null) {
 246             if (currentPosition < inputText.length()) {
 247                 currentChar = (int) inputText.charAt(currentPosition);
 248             } else {
 249                 currentChar = EOF;
 250             }
 251         }
 252         if (currentChar != EOF) {
 253             currentPosition++;
 254         }
 255         if (currentChar == '\n') {
 256             lineNumber++;
 257         }
 258         return currentChar;
 259     }
 260
 261
 262     /**
 263      * Starting from the current position of the input text,
 264      * returns the subsequent characters of type charClass,
 265      * and not of type singleCharSymbols.
 266      *
 267      * @param  charClass  the type of characters to look for
 268      * @param  buffer  the place to append characters of type charClass
 269      *
 270      * @return  a string of characters starting from the current position
 271      *          of the input text, until it encounters a character not
 272      *          in the string charClass
 273      *
 274      */
 275     private String getTokenOfCharClass(String charClass) {
 276         return getTokenByCharClass(charClass, true);
 277     }
 278
 279     /**
 280      * Starting from the current position of the input text/file,
 281      * returns the subsequent characters, not of type singleCharSymbols,
 282      * and ended at characters of type endingCharClass.  E.g., if the current
 283      * string is "xxxxyyy", endingCharClass is "yz", and singleCharClass
 284      * "abc". Then this method will return to "xxxx".
 285      *
 286      * @param  endingCharClass  the type of characters to look for
 287      *
 288      * @return  a string of characters from the current position until
 289      *          it encounters characters in endingCharClass
 290      *
 291      */
 292     private String getTokenNotOfCharClass(String endingCharClass) {
 293         return getTokenByCharClass(endingCharClass, false);
 294     }
 295
 296     /**
 297      * Provides a `compressed' method from getTokenOfCharClass() and
 298      * getTokenNotOfCharClass().
 299      * If parameter containThisCharClass is <code>true</code>,
 300      * then a string from the
 301      * current position to the last character in charClass is returned.
 302      * If containThisCharClass is <code>false</code>, then a string
 303      * before the first
 304      * occurrence of a character in containThisCharClass is returned.
 305      *
 306      * @param  charClass  the string of characters you want included or
 307      *                    excluded in your return
 308      * @param  containThisCharClass  determines if you want characters
 309      *                in charClass in the returned string or not
 310      *
 311      * @return  a string of characters from the current position until
 312      *          it encounters characters in endingCharClass
 313      */
 314     private String getTokenByCharClass(String charClass,
 315                                        boolean containThisCharClass) {
 316         StringBuffer buffer = new StringBuffer();
 317
 318         // if we want the returned string to contain chars in charClass, then
 319         // containThisCharClass is TRUE and
 320         // (charClass.indexOf(currentChar) != 1) == containThisCharClass)
 321         // returns true; if we want it to stop at characters of charClass,
 322         // then containThisCharClass is FALSE, and the condition returns
 323         // false.
 324         while ((charClass.indexOf(currentChar) != -1)
 325                == containThisCharClass  &&
 326                singleCharSymbols.indexOf(currentChar) == -1 &&
 327                currentChar != EOF) {
 328             buffer.append((char) currentChar);
 329             getNextChar();
 330         }
 331         return buffer.toString();
 332     }
 333
 334     /**
 335      * Removes the postpunctuation characters from the current token.
 336      * Copies those postpunctuation characters to the class
 337      * variable 'postpunctuation'.
 338      */
 339     private void removeTokenPostpunctuation() {
 340         if (token != null) {
 341             String tokenWord = token.getWord();
 342
 343             int tokenLength = tokenWord.length();
 344             int position = tokenLength - 1;
 345
 346             while (position > 0 &&
 347                    postpunctuationSymbols.indexOf
 348                    ((int)tokenWord.charAt(position)) != -1) {
 349                 position--;
 350             }
 351
 352             if (tokenLength - 1 != position) {
 353                 // Copy postpunctuation from token
 354                 token.setPostpunctuation( tokenWord.substring(position+1));
 355
 356                 // truncate token at postpunctuation
 357                 token.setWord(tokenWord.substring(0, position+1));
 358             } else {
 359                 token.setPostpunctuation("");
 360             }
 361         }
 362     }
 363
 364     /**
 365      * Returns <code>true</code> if there were errors while reading tokens
 366      *
 367      * @return <code>true</code> if there were errors;
 368      *          <code>false</code> otherwise
 369      */
 370     public boolean hasErrors() {
 371         return errorDescription != null;
 372     }
 373
 374     /**
 375      * if hasErrors returns <code>true</code>, this will return a
 376      * description of the error encountered, otherwise
 377      * it will return <code>null</code>
 378      *
 379      * @return a description of the last error that occurred.
 380      */
 381     public String getErrorDescription() {
 382         return errorDescription;
 383     }
 384
 385     /**
 386      * Determines if the current token should start a new sentence.
 387      *
 388      * @return <code>true</code> if a new sentence should be started
 389      */
 390     public boolean isBreak() {
 391
 392         String tokenWhiteSpace = token.getWhitespace();
 393         String lastTokenPostpunctuation = null;
 394         if (lastToken != null) {
 395             lastTokenPostpunctuation = lastToken.getPostpunctuation();
 396         }
 397
 398         if (lastToken == null || token == null) {
 399             return false;
 400         } else if (tokenWhiteSpace.indexOf('\n') !=
 401                    tokenWhiteSpace.lastIndexOf('\n')) {
 402             return true;
 403         } else if (lastTokenPostpunctuation.indexOf(':') != -1 ||
 404                    lastTokenPostpunctuation.indexOf('?') != -1 ||
 405                    lastTokenPostpunctuation.indexOf('!') != -1) {
 406             return true;
 407         } else if (lastTokenPostpunctuation.indexOf('.') != -1 &&
 408                    tokenWhiteSpace.length() > 1 &&
 409                    Character.isUpperCase(token.getWord().charAt(0))) {
 410             return true;
 411         } else {
 412             String lastWord = lastToken.getWord();
 413             int lastWordLength = lastWord.length();
 414
 415             if (lastTokenPostpunctuation.indexOf('.') != -1 &&
 416                 /* next word starts with a capital */
 417                 Character.isUpperCase(token.getWord().charAt(0)) &&
 418                 /* last word isn't an abbreviation */
 419                 !(Character.isUpperCase
 420                   (lastWord.charAt(lastWordLength - 1)) ||
 421                   (lastWordLength < 4 &&
 422                    Character.isUpperCase(lastWord.charAt(0))))) {
 423                 return true;
 424             }
 425         }
 426         return false;
 427     }
 428 }
 429