2 * Portions Copyright 2001 Sun Microsystems, Inc.
3 * Portions Copyright 1999-2001 Language Technologies Institute,
4 * Carnegie Mellon University.
5 * All Rights Reserved. Use is subject to license terms.
7 * See the file "license.terms" for information on usage and
8 * redistribution of this file, and for a DISCLAIMER OF ALL
11 package com.sun.speech.freetts.en;
13 import com.sun.speech.freetts.Token;
14 import com.sun.speech.freetts.Tokenizer;
15 import java.io.Reader;
16 import java.io.IOException;
20 * Implements the tokenizer interface. Breaks an input sequence of
21 * characters into a set of tokens.
23 public class TokenizerImpl implements Tokenizer {
25 /** A constant indicating that the end of the stream has been read. */
26 public static final int EOF = -1;
28 /** A string containing the default whitespace characters. */
29 public static final String DEFAULT_WHITESPACE_SYMBOLS = " \t\n\r";
31 /** A string containing the default single characters. */
32 public static final String DEFAULT_SINGLE_CHAR_SYMBOLS = "(){}[]";
34 /** A string containing the default pre-punctuation characters. */
35 public static final String DEFAULT_PREPUNCTUATION_SYMBOLS = "\"'`({[";
37 /** A string containing the default post-punctuation characters. */
38 public static final String DEFAULT_POSTPUNCTUATION_SYMBOLS
43 private int lineNumber = 0;
45 // the input text (from the Utterance) to tokenize
46 private String inputText = null;
48 // the file to read input text from, if using file mode
49 private Reader reader = null;
51 // the token position - doesn't seem really necessary at this point
52 // private int tokenPosition = 0;
54 // the current character, whether it's from the file or the input text
55 private int currentChar = 0;
57 // the current char position for the input text (not the file)
58 // this is called "file_pos" in flite
59 private int currentPosition = 0;
62 // the delimiting symbols of this tokenizer
63 private String whitespaceSymbols = DEFAULT_WHITESPACE_SYMBOLS;
64 private String singleCharSymbols = DEFAULT_SINGLE_CHAR_SYMBOLS;
65 private String prepunctuationSymbols = DEFAULT_PREPUNCTUATION_SYMBOLS;
66 private String postpunctuationSymbols = DEFAULT_POSTPUNCTUATION_SYMBOLS;
68 // The error description
69 private String errorDescription = null;
71 // a place to store the current token
73 private Token lastToken = null;
76 private long duration = 0;
80 * Constructs a Tokenizer.
82 public TokenizerImpl() {
/**
 * Creates a tokenizer that will return tokens from the given string.
 *
 * @param string the string to tokenize
 */
public TokenizerImpl(String string) {
    // Delegate so the string is installed exactly as a later
    // setInputText() call would install it.
    setInputText(string);
}
/**
 * Creates a tokenizer that will return tokens read from the given
 * reader.
 *
 * @param file where to read the input from
 */
public TokenizerImpl(Reader file) {
    this.setInputReader(file);
}
108 * Sets the whitespace symbols of this Tokenizer to the given symbols.
110 * @param symbols the whitespace symbols
112 public void setWhitespaceSymbols(String symbols) {
113 whitespaceSymbols = symbols;
118 * Sets the single character symbols of this Tokenizer to the given
121 * @param symbols the single character symbols
123 public void setSingleCharSymbols(String symbols) {
124 singleCharSymbols = symbols;
129 * Sets the prepunctuation symbols of this Tokenizer to the given
132 * @param symbols the prepunctuation symbols
134 public void setPrepunctuationSymbols(String symbols) {
135 prepunctuationSymbols = symbols;
140 * Sets the postpunctuation symbols of this Tokenizer to the given
143 * @param symbols the postpunctuation symbols
145 public void setPostpunctuationSymbols(String symbols) {
146 postpunctuationSymbols = symbols;
151 * Sets the text to tokenize.
153 * @param inputString the string to tokenize
155 public void setInputText(String inputString) {
156 inputText = inputString;
159 if (inputText != null) {
165 * Sets the input reader
167 * @param reader the input source
169 public void setInputReader(Reader reader) {
170 this.reader = reader;
176 * Returns the next token.
178 * @return the next token if it exists,
179 * <code>null</code> if no more tokens
181 public Token getNextToken() {
186 token.setWhitespace(getTokenOfCharClass(whitespaceSymbols));
188 // quoted strings currently ignored
190 // get prepunctuation
191 token.setPrepunctuation(getTokenOfCharClass(prepunctuationSymbols));
193 // get the symbol itself
194 if (singleCharSymbols.indexOf(currentChar) != -1) {
195 token.setWord(String.valueOf((char) currentChar));
198 token.setWord(getTokenNotOfCharClass(whitespaceSymbols));
201 token.setPosition(currentPosition);
202 token.setLineNumber(lineNumber);
204 // This'll have token *plus* postpunctuation
205 // Get postpunctuation
206 removeTokenPostpunctuation();
213 * Returns <code>true</code> if there are more tokens,
214 * <code>false</code> otherwise.
216 * @return <code>true</code> if there are more tokens
217 * <code>false</code> otherwise
219 public boolean hasMoreTokens() {
220 int nextChar = currentChar;
221 return (nextChar != EOF);
226 * Advances the currentPosition pointer by 1 (if not exceeding
227 * length of inputText, and returns the character pointed by
230 * @return the next character EOF if no more characters exist
232 private int getNextChar() {
233 if (reader != null) {
235 int readVal = reader.read();
239 currentChar = (char) readVal;
241 } catch (IOException ioe) {
243 errorDescription = ioe.getMessage();
245 } else if (inputText != null) {
246 if (currentPosition < inputText.length()) {
247 currentChar = (int) inputText.charAt(currentPosition);
252 if (currentChar != EOF) {
255 if (currentChar == '\n') {
263 * Starting from the current position of the input text,
264 * returns the subsequent characters of type charClass,
265 * and not of type singleCharSymbols.
267 * @param charClass the type of characters to look for
268 * @param buffer the place to append characters of type charClass
270 * @return a string of characters starting from the current position
271 * of the input text, until it encounters a character not
272 * in the string charClass
275 private String getTokenOfCharClass(String charClass) {
276 return getTokenByCharClass(charClass, true);
280 * Starting from the current position of the input text/file,
281 * returns the subsequent characters, not of type singleCharSymbols,
282 * and ended at characters of type endingCharClass. E.g., if the current
283 * string is "xxxxyyy", endingCharClass is "yz", and singleCharClass
284 * "abc". Then this method will return to "xxxx".
286 * @param endingCharClass the type of characters to look for
288 * @return a string of characters from the current position until
289 * it encounters characters in endingCharClass
292 private String getTokenNotOfCharClass(String endingCharClass) {
293 return getTokenByCharClass(endingCharClass, false);
297 * Provides a `compressed' method from getTokenOfCharClass() and
298 * getTokenNotOfCharClass().
299 * If parameter containThisCharClass is <code>true</code>,
300 * then a string from the
301 * current position to the last character in charClass is returned.
302 * If containThisCharClass is <code>false</code>, then a string
304 * occurrence of a character in containThisCharClass is returned.
306 * @param charClass the string of characters you want included or
307 * excluded in your return
308 * @param containThisCharClass determines if you want characters
309 * in charClass in the returned string or not
311 * @return a string of characters from the current position until
312 * it encounters characters in endingCharClass
314 private String getTokenByCharClass(String charClass,
315 boolean containThisCharClass) {
316 StringBuffer buffer = new StringBuffer();
318 // if we want the returned string to contain chars in charClass, then
319 // containThisCharClass is TRUE and
320 // (charClass.indexOf(currentChar) != 1) == containThisCharClass)
321 // returns true; if we want it to stop at characters of charClass,
322 // then containThisCharClass is FALSE, and the condition returns
324 while ((charClass.indexOf(currentChar) != -1)
325 == containThisCharClass &&
326 singleCharSymbols.indexOf(currentChar) == -1 &&
327 currentChar != EOF) {
328 buffer.append((char) currentChar);
331 return buffer.toString();
335 * Removes the postpunctuation characters from the current token.
336 * Copies those postpunctuation characters to the class
337 * variable 'postpunctuation'.
339 private void removeTokenPostpunctuation() {
341 String tokenWord = token.getWord();
343 int tokenLength = tokenWord.length();
344 int position = tokenLength - 1;
346 while (position > 0 &&
347 postpunctuationSymbols.indexOf
348 ((int)tokenWord.charAt(position)) != -1) {
352 if (tokenLength - 1 != position) {
353 // Copy postpunctuation from token
354 token.setPostpunctuation( tokenWord.substring(position+1));
356 // truncate token at postpunctuation
357 token.setWord(tokenWord.substring(0, position+1));
359 token.setPostpunctuation("");
365 * Returns <code>true</code> if there were errors while reading tokens
367 * @return <code>true</code> if there were errors;
368 * <code>false</code> otherwise
370 public boolean hasErrors() {
371 return errorDescription != null;
375 * if hasErrors returns <code>true</code>, this will return a
376 * description of the error encountered, otherwise
377 * it will return <code>null</code>
379 * @return a description of the last error that occurred.
381 public String getErrorDescription() {
382 return errorDescription;
386 * Determines if the current token should start a new sentence.
388 * @return <code>true</code> if a new sentence should be started
390 public boolean isBreak() {
392 String tokenWhiteSpace = token.getWhitespace();
393 String lastTokenPostpunctuation = null;
394 if (lastToken != null) {
395 lastTokenPostpunctuation = lastToken.getPostpunctuation();
398 if (lastToken == null || token == null) {
400 } else if (tokenWhiteSpace.indexOf('\n') !=
401 tokenWhiteSpace.lastIndexOf('\n')) {
403 } else if (lastTokenPostpunctuation.indexOf(':') != -1 ||
404 lastTokenPostpunctuation.indexOf('?') != -1 ||
405 lastTokenPostpunctuation.indexOf('!') != -1) {
407 } else if (lastTokenPostpunctuation.indexOf('.') != -1 &&
408 tokenWhiteSpace.length() > 1 &&
409 Character.isUpperCase(token.getWord().charAt(0))) {
412 String lastWord = lastToken.getWord();
413 int lastWordLength = lastWord.length();
415 if (lastTokenPostpunctuation.indexOf('.') != -1 &&
416 /* next word starts with a capital */
417 Character.isUpperCase(token.getWord().charAt(0)) &&
418 /* last word isn't an abbreviation */
419 !(Character.isUpperCase
420 (lastWord.charAt(lastWordLength - 1)) ||
421 (lastWordLength < 4 &&
422 Character.isUpperCase(lastWord.charAt(0))))) {