2 * Portions Copyright 2001-2003 Sun Microsystems, Inc.
3 * Portions Copyright 1999-2001 Language Technologies Institute,
4 * Carnegie Mellon University.
5 * All Rights Reserved. Use is subject to license terms.
7 * See the file "license.terms" for information on usage and
8 * redistribution of this file, and for a DISCLAIMER OF ALL
11 package com.sun.speech.freetts.en.us;
13 import java.util.Hashtable;
14 import java.util.regex.Matcher;
15 import java.util.regex.Pattern;
17 import com.sun.speech.freetts.FeatureSet;
18 import com.sun.speech.freetts.Item;
19 import com.sun.speech.freetts.ProcessException;
20 import com.sun.speech.freetts.Relation;
21 import com.sun.speech.freetts.Utterance;
22 import com.sun.speech.freetts.UtteranceProcessor;
23 import com.sun.speech.freetts.cart.CART;
24 import com.sun.speech.freetts.util.Utilities;
/**
 * Converts the Tokens (in US English words) in an
 * Utterance into a list of words. It puts the produced list back
 * into the Utterance. Usually, the tokens that get expanded are numbers
 * like "23" (to "twenty" "three").
 * <p>
 * It translates the following code from flite:
 * <br>
 * <code>
 * lang/usenglish/us_text.c
 * </code>
 */
38 public class TokenToWords implements UtteranceProcessor {
40 /** Regular expression for something that has a vowel */
41 private static final String RX_HAS_VOWEL = ".*[aeiouAEIOU].*";
// Patterns for regular expression matching.  Each one is compiled exactly
// once, when the class is initialised, from the USEnglish expression of the
// corresponding name.
private static final Pattern alphabetPattern;
private static final Pattern commaIntPattern;
private static final Pattern digits2DashPattern;
private static final Pattern digitsPattern;
private static final Pattern digitsSlashDigitsPattern;
private static final Pattern dottedAbbrevPattern;
private static final Pattern doublePattern;
private static final Pattern drStPattern;
private static final Pattern fourDigitsPattern;
private static final Pattern hasVowelPattern;
private static final Pattern illionPattern;
private static final Pattern numberTimePattern;
private static final Pattern numessPattern;
private static final Pattern ordinalPattern;
private static final Pattern romanNumbersPattern;
private static final Pattern sevenPhoneNumberPattern;
private static final Pattern threeDigitsPattern;
private static final Pattern usMoneyPattern;

static {
    alphabetPattern = Pattern.compile(USEnglish.RX_ALPHABET);
    commaIntPattern = Pattern.compile(USEnglish.RX_COMMAINT);
    digits2DashPattern = Pattern.compile(USEnglish.RX_DIGITS2DASH);
    digitsPattern = Pattern.compile(USEnglish.RX_DIGITS);
    digitsSlashDigitsPattern =
        Pattern.compile(USEnglish.RX_DIGITSSLASHDIGITS);
    dottedAbbrevPattern = Pattern.compile(USEnglish.RX_DOTTED_ABBREV);
    doublePattern = Pattern.compile(USEnglish.RX_DOUBLE);
    drStPattern = Pattern.compile(USEnglish.RX_DRST);
    fourDigitsPattern = Pattern.compile(USEnglish.RX_FOUR_DIGIT);
    hasVowelPattern = Pattern.compile(USEnglish.RX_HAS_VOWEL);
    illionPattern = Pattern.compile(USEnglish.RX_ILLION);
    numberTimePattern = Pattern.compile(USEnglish.RX_NUMBER_TIME);
    numessPattern = Pattern.compile(USEnglish.RX_NUMESS);
    ordinalPattern = Pattern.compile(USEnglish.RX_ORDINAL_NUMBER);
    romanNumbersPattern = Pattern.compile(USEnglish.RX_ROMAN_NUMBER);
    sevenPhoneNumberPattern =
        Pattern.compile(USEnglish.RX_SEVEN_DIGIT_PHONE_NUMBER);
    threeDigitsPattern = Pattern.compile(USEnglish.RX_THREE_DIGIT);
    usMoneyPattern = Pattern.compile(USEnglish.RX_US_MONEY);
}
85 private static final String[] kingNames = {
86 "louis", "henry", "charles", "philip", "george",
87 "edward", "pius", "william", "richard", "ptolemy",
88 "john", "paul", "peter", "nicholas", "frederick",
89 "james", "alfonso", "ivan", "napoleon", "leo",
90 "gregory", "catherine", "alexandria", "pierre", "elizabeth",
93 private static final String[] kingTitles = {
94 "king", "queen", "pope", "duke", "tsar",
95 "emperor", "shah", "caesar", "duchess", "tsarina",
96 "empress", "baron", "baroness", "sultan", "count",
100 private static final String[] sectionTypes = {
101 "section", "chapter", "part", "phrase", "verse",
102 "scene", "act", "book", "volume", "chap",
103 "war", "apollo", "trek", "fortran" };
/**
 * Here we use a hashtable for constant time matching, instead of using
 * if (A.equals(B) || A.equals(C) || ...) to match Strings.
 * Each word from kingNames, kingTitles and sectionTypes is mapped to the
 * tag (KING_NAMES, KING_TITLES or SECTION_TYPES) of the list it came from.
 */
private static final Hashtable kingSectionLikeHash = new Hashtable();

private static final String KING_NAMES = "kingNames";
private static final String KING_TITLES = "kingTitles";
private static final String SECTION_TYPES = "sectionTypes";

// Hashtable initialization
static {
    for (int i = 0; i < kingNames.length; i++) {
        kingSectionLikeHash.put(kingNames[i], KING_NAMES);
    }
    for (int i = 0; i < kingTitles.length; i++) {
        kingSectionLikeHash.put(kingTitles[i], KING_TITLES);
    }
    for (int i = 0; i < sectionTypes.length; i++) {
        kingSectionLikeHash.put(sectionTypes[i], SECTION_TYPES);
    }
}
128 private static final String[] postrophes = {
129 "'s", "'ll", "'ve", "'d" };
131 // Finite state machines to check if a Token is pronounceable
132 private PronounceableFSM prefixFSM = null;
133 private PronounceableFSM suffixFSM = null;
// List of US states abbreviations and their full names.
// Row layout: [0] the abbreviation as written, [1] "ambiguous" when the
// abbreviation may also be an ordinary word (context is then checked by
// isStateName), [2..] the word(s) to speak.
// The spoken words are handed to the word relation verbatim, so they must
// be spelled correctly ("minnesota", "montana", "carolina", "tennessee").
private static final String[][] usStates =
{
    { "AL", "ambiguous", "alabama" },
    { "Al", "ambiguous", "alabama" },
    { "Ala", "", "alabama" },
    { "AK", "", "alaska" },
    { "Ak", "", "alaska" },
    { "AZ", "", "arizona" },
    { "Az", "", "arizona" },
    { "CA", "", "california" },
    { "Ca", "", "california" },
    { "Cal", "ambiguous", "california" },
    { "Calif", "", "california" },
    { "CO", "ambiguous", "colorado" },
    { "Co", "ambiguous", "colorado" },
    { "Colo", "", "colorado" },
    { "DC", "", "d" , "c" },
    { "DE", "", "delaware" },
    { "De", "ambiguous", "delaware" },
    { "Del", "ambiguous", "delaware" },
    { "FL", "", "florida" },
    { "Fl", "ambiguous", "florida" },
    { "Fla", "", "florida" },
    { "GA", "", "georgia" },
    { "Ga", "", "georgia" },
    { "HI", "ambiguous", "hawaii" },
    { "Hi", "ambiguous", "hawaii" },
    { "IA", "", "iowa" },
    { "Ia", "ambiguous", "iowa" },
    { "IN", "ambiguous", "indiana" },
    { "In", "ambiguous", "indiana" },
    { "Ind", "ambiguous", "indiana" },
    { "ID", "ambiguous", "idaho" },
    { "IL", "ambiguous", "illinois" },
    { "Il", "ambiguous", "illinois" },
    { "ILL", "ambiguous", "illinois" },
    { "KS", "", "kansas" },
    { "Ks", "", "kansas" },
    { "Kans", "", "kansas" },
    { "KY", "ambiguous", "kentucky" },
    { "Ky", "ambiguous", "kentucky" },
    { "LA", "ambiguous", "louisiana" },
    { "La", "ambiguous", "louisiana" },
    { "Lou", "ambiguous", "louisiana" },
    { "Lous", "ambiguous", "louisiana" },
    { "MA", "ambiguous", "massachusetts" },
    { "Mass", "ambiguous", "massachusetts" },
    { "Ma", "ambiguous", "massachusetts" },
    { "MD", "ambiguous", "maryland" },
    { "Md", "ambiguous", "maryland" },
    { "ME", "ambiguous", "maine" },
    { "Me", "ambiguous", "maine" },
    { "MI", "", "michigan" },
    { "Mi", "ambiguous", "michigan" },
    { "Mich", "ambiguous", "michigan" },
    { "MN", "ambiguous", "minnesota" },
    { "Minn", "ambiguous", "minnesota" },
    { "MS", "ambiguous", "mississippi" },
    { "Miss", "ambiguous", "mississippi" },
    { "MT", "ambiguous", "montana" },
    { "Mt", "ambiguous", "montana" },
    { "MO", "ambiguous", "missouri" },
    { "Mo", "ambiguous", "missouri" },
    { "NC", "ambiguous", "north" , "carolina" },
    { "ND", "ambiguous", "north" , "dakota" },
    { "NE", "ambiguous", "nebraska" },
    { "Ne", "ambiguous", "nebraska" },
    { "Neb", "ambiguous", "nebraska" },
    { "NH", "ambiguous", "new" , "hampshire" },
    { "NV", "", "nevada" },
    { "Nev", "", "nevada" },
    { "NY", "", "new" , "york" },
    { "OH", "ambiguous", "ohio" },
    { "OK", "ambiguous", "oklahoma" },
    { "Okla", "", "oklahoma" },
    { "OR", "ambiguous", "oregon" },
    { "Or", "ambiguous", "oregon" },
    { "Ore", "ambiguous", "oregon" },
    { "PA", "ambiguous", "pennsylvania" },
    { "Pa", "ambiguous", "pennsylvania" },
    { "Penn", "ambiguous", "pennsylvania" },
    { "RI", "ambiguous", "rhode" , "island" },
    { "SC", "ambiguous", "south" , "carolina" },
    { "SD", "ambiguous", "south" , "dakota" },
    { "TN", "ambiguous", "tennessee" },
    { "Tn", "ambiguous", "tennessee" },
    { "Tenn", "ambiguous", "tennessee" },
    { "TX", "ambiguous", "texas" },
    { "Tx", "ambiguous", "texas" },
    { "Tex", "ambiguous", "texas" },
    { "UT", "ambiguous", "utah" },
    { "VA", "ambiguous", "virginia" },
    { "WA", "ambiguous", "washington" },
    { "Wa", "ambiguous", "washington" },
    { "Wash", "ambiguous", "washington" },
    { "WI", "ambiguous", "wisconsin" },
    { "Wi", "ambiguous", "wisconsin" },
    { "WV", "ambiguous", "west" , "virginia" },
    { "WY", "ambiguous", "wyoming" },
    { "Wy", "ambiguous", "wyoming" },
    { "Wyo", "", "wyoming" },
    { "PR", "ambiguous", "puerto" , "rico" }
};

// Again hashtable for constant time searching: abbreviation -> table row
private static final Hashtable usStatesHash = new Hashtable();

// initialize the Hashtable for usStates
static {
    for (int i = 0; i < usStates.length; i++) {
        usStatesHash.put(usStates[i][0], usStates[i]);
    }
}
253 // the word relation that we are building
254 private WordRelation wordRelation;
256 // the current token Item
257 private Item tokenItem;
259 // a CART for classifying numbers
/**
 * Constructs a TokenToWords processor.
 *
 * @param usNumbersCART the cart to use to classify numbers
 * @param prefixFSM the FSM a word's prefix must be accepted by for the
 *                  word to be considered pronounceable
 * @param suffixFSM the FSM a word's suffix must be accepted by for the
 *                  word to be considered pronounceable
 */
public TokenToWords(CART usNumbersCART,
                    PronounceableFSM prefixFSM,
                    PronounceableFSM suffixFSM) {
    this.cart = usNumbersCART;
    this.prefixFSM = prefixFSM;
    this.suffixFSM = suffixFSM;
}
279 * Returns the currently processing token Item.
281 * @return the current token Item; null if no item
283 public Item getTokenItem() {
289 * process the utterance
291 * @param utterance the utterance contain the tokens
293 * @throws ProcessException if an IOException is thrown during the
294 * processing of the utterance
296 public void processUtterance(Utterance utterance) throws ProcessException {
297 Relation tokenRelation;
298 if ((tokenRelation = utterance.getRelation(Relation.TOKEN)) == null) {
299 throw new IllegalStateException
300 ("TokenToWords: Token relation does not exist");
303 wordRelation = WordRelation.createWordRelation(utterance, this);
305 for (tokenItem = tokenRelation.getHead();
307 tokenItem = tokenItem.getNext()) {
309 FeatureSet featureSet = tokenItem.getFeatures();
310 String tokenVal = featureSet.getString("name");
312 // convert the token into a list of words
313 tokenToWords(tokenVal);
319 * Returns true if the given token matches part of a phone number
321 * @param tokenItem the token
322 * @param tokenVal the string value of the token
324 * @return true or false
326 private boolean matchesPartPhoneNumber(String tokenVal) {
328 String n_name = (String) tokenItem.findFeature("n.name");
329 String n_n_name = (String) tokenItem.findFeature("n.n.name");
330 String p_name = (String) tokenItem.findFeature("p.name");
331 String p_p_name = (String) tokenItem.findFeature("p.p.name");
333 boolean matches3DigitsP_name = matches(threeDigitsPattern, p_name);
335 return ((matches(threeDigitsPattern, tokenVal) &&
336 ((!matches(digitsPattern, p_name)
337 && matches(threeDigitsPattern, n_name)
338 && matches(fourDigitsPattern, n_n_name)) ||
339 (matches(sevenPhoneNumberPattern, n_name)) ||
340 (!matches(digitsPattern, p_p_name)
341 && matches3DigitsP_name
342 && matches(fourDigitsPattern, n_name)))) ||
343 (matches(fourDigitsPattern, tokenVal) &&
344 (!matches(digitsPattern, n_name)
345 && matches3DigitsP_name
346 && matches(threeDigitsPattern, p_p_name))));
351 * Returns true if the given string is in the given string array.
353 * @param value the string to check
354 * @param stringArray the array to check
356 * @return true if the string is in the array, false otherwise
358 private static boolean inStringArray(String value, String[] stringArray) {
359 for (int i = 0; i < stringArray.length; i++) {
360 if (stringArray[i].equals(value)) {
370 * Converts the given Token into (word) Items in the WordRelation.
372 * @param tokenVal the String value of the token, which may or may not be
373 * same as the one in called "name" in flite
376 private void tokenToWords(String tokenVal) {
378 FeatureSet tokenFeatures = tokenItem.getFeatures();
379 String itemName = tokenFeatures.getString("name");
380 int tokenLength = tokenVal.length();
382 if (tokenFeatures.isPresent("phones")) {
383 wordRelation.addWord(tokenVal);
385 } else if ((tokenVal.equals("a") || tokenVal.equals("A")) &&
386 ((tokenItem.getNext() == null) ||
387 !(tokenVal.equals(itemName)) ||
388 !(((String) tokenItem.findFeature("punc")).equals("")))) {
389 /* if A is a sub part of a token, then its ey not ah */
390 wordRelation.addWord("_a");
392 } else if (matches(alphabetPattern, tokenVal)) {
394 if (matches(romanNumbersPattern, tokenVal)) {
397 romanToWords(tokenVal);
399 } else if (matches(illionPattern, tokenVal) &&
400 matches(usMoneyPattern,
401 (String) tokenItem.findFeature("p.name"))) {
403 wordRelation.addWord(tokenVal);
404 wordRelation.addWord("dollars");
406 } else if (matches(drStPattern, tokenVal)) {
408 /* St Andrew's St, Dr King Dr */
409 drStToWords(tokenVal);
411 } else if (tokenVal.equals("Mr")) {
413 tokenItem.getFeatures().setString("punc", "");
414 wordRelation.addWord("mister");
416 } else if (tokenVal.equals("Mrs")) {
418 tokenItem.getFeatures().setString("punc", "");
419 wordRelation.addWord("missus");
421 } else if (tokenLength == 1
422 && isUppercaseLetter(tokenVal.charAt(0))
423 && ((String)tokenItem.findFeature("n.whitespace")).equals(" ")
425 (((String) tokenItem.findFeature("n.name")).charAt(0))) {
427 tokenFeatures.setString("punc", "");
428 String aaa = tokenVal.toLowerCase();
429 if (aaa.equals("a")) {
430 wordRelation.addWord("_a");
432 wordRelation.addWord(aaa);
434 } else if (isStateName(tokenVal)) {
436 The name of a US state
437 isStateName() has already added the full name of the
438 state, so we're all set.
440 } else if (tokenLength > 1 && !isPronounceable(tokenVal)) {
441 /* Need common exception list */
442 /* unpronouncable list of alphas */
443 NumberExpander.expandLetters
444 (tokenVal, wordRelation);
448 wordRelation.addWord(tokenVal.toLowerCase());
451 } else if (matches(dottedAbbrevPattern, tokenVal)) {
455 String aaa = Utilities.deleteChar(tokenVal, '.');
456 NumberExpander.expandLetters(aaa, wordRelation);
458 } else if (matches(commaIntPattern, tokenVal)) {
461 String aaa = Utilities.deleteChar(tokenVal, ',');
462 NumberExpander.expandReal(aaa, wordRelation);
464 } else if (matches(sevenPhoneNumberPattern, tokenVal)) {
466 /* 234-3434 telephone numbers */
467 int dashIndex = tokenVal.indexOf('-');
468 String aaa = tokenVal.substring(0, dashIndex);
469 String bbb = tokenVal.substring(dashIndex+1);
471 NumberExpander.expandDigits(aaa, wordRelation);
472 wordRelation.addBreak();
473 NumberExpander.expandDigits(bbb, wordRelation);
475 } else if (matchesPartPhoneNumber(tokenVal)) {
477 /* part of a telephone number */
478 String punctuation = (String) tokenItem.findFeature("punc");
479 if (punctuation.equals("")) {
480 tokenItem.getFeatures().setString("punc", ",");
482 NumberExpander.expandDigits(tokenVal, wordRelation);
483 wordRelation.addBreak();
485 } else if (matches(numberTimePattern, tokenVal)) {
488 int colonIndex = tokenVal.indexOf(':');
489 String aaa = tokenVal.substring(0, colonIndex);
490 String bbb = tokenVal.substring(colonIndex+1);
492 NumberExpander.expandNumber(aaa, wordRelation);
493 if (!(bbb.equals("00"))) {
494 NumberExpander.expandID(bbb, wordRelation);
497 } else if (matches(digits2DashPattern, tokenVal)) {
500 digitsDashToWords(tokenVal);
502 } else if (matches(digitsPattern, tokenVal)) {
504 digitsToWords(tokenVal);
506 } else if (tokenLength == 1
507 && isUppercaseLetter(tokenVal.charAt(0))
508 && ((String)tokenItem.findFeature("n.whitespace")).equals
511 (((String) tokenItem.findFeature("n.name")).charAt(0))) {
513 tokenFeatures.setString("punc", "");
514 String aaa = tokenVal.toLowerCase();
515 if (aaa.equals("a")) {
516 wordRelation.addWord("_a");
518 wordRelation.addWord(aaa);
520 } else if (matches(doublePattern, tokenVal)) {
522 NumberExpander.expandReal(tokenVal, wordRelation);
524 } else if (matches(ordinalPattern, tokenVal)) {
526 /* explicit ordinals */
527 String aaa = tokenVal.substring(0, tokenLength - 2);
528 NumberExpander.expandOrdinal(aaa, wordRelation);
530 } else if (matches(usMoneyPattern, tokenVal)) {
533 usMoneyToWords(tokenVal);
535 } else if (tokenLength > 0
536 && tokenVal.charAt(tokenLength - 1) == '%') {
539 tokenToWords(tokenVal.substring(0, tokenLength - 1));
540 wordRelation.addWord("per");
541 wordRelation.addWord("cent");
543 } else if (matches(numessPattern, tokenVal)) {
545 /* 60s and 7s and 9s */
546 tokenToWords(tokenVal.substring(0, tokenLength - 1));
547 wordRelation.addWord("'s");
549 } else if (tokenVal.indexOf('\'') != -1) {
551 postropheToWords(tokenVal);
553 } else if (matches(digitsSlashDigitsPattern, tokenVal) &&
554 tokenVal.equals(itemName)) {
556 digitsSlashDigitsToWords(tokenVal);
558 } else if (tokenVal.indexOf('-') != -1) {
560 dashToWords(tokenVal);
562 } else if (tokenLength > 1 &&
563 !matches(alphabetPattern, tokenVal)) {
565 notJustAlphasToWords(tokenVal);
569 wordRelation.addWord(tokenVal.toLowerCase());
575 * Convert the given digit token with dashes (e.g. 999-999-999)
576 * into (word) Items in the WordRelation.
578 * @param tokenVal the digit string
580 private void digitsDashToWords(String tokenVal) {
581 int tokenLength = tokenVal.length();
583 for (int p = 0; p <= tokenLength; p++) {
584 if (p == tokenLength || tokenVal.charAt(p) == '-') {
585 String aaa = tokenVal.substring(a, p);
586 NumberExpander.expandDigits(aaa, wordRelation);
587 wordRelation.addBreak();
595 * Convert the given digit token into (word) Items in the WordRelation.
597 * @param tokenVal the digit string
599 private void digitsToWords(String tokenVal) {
600 FeatureSet featureSet = tokenItem.getFeatures();
602 if (featureSet.isPresent("nsw")) {
603 nsw = featureSet.getString("nsw");
606 if (nsw.equals("nide")) {
607 NumberExpander.expandID(tokenVal, wordRelation);
609 String rName = featureSet.getString("name");
610 String digitsType = null;
612 if (tokenVal.equals(rName)) {
613 digitsType = (String) cart.interpret(tokenItem);
615 featureSet.setString("name", tokenVal);
616 digitsType = (String) cart.interpret(tokenItem);
617 featureSet.setString("name", rName);
620 if (digitsType.equals("ordinal")) {
621 NumberExpander.expandOrdinal(tokenVal, wordRelation);
622 } else if (digitsType.equals("digits")) {
623 NumberExpander.expandDigits(tokenVal, wordRelation);
624 } else if (digitsType.equals("year")) {
625 NumberExpander.expandID(tokenVal, wordRelation);
627 NumberExpander.expandNumber(tokenVal, wordRelation);
634 * Converts the given Roman numeral string into (word) Items in the
637 * @param romanString the roman numeral string
639 private void romanToWords(String romanString) {
640 String punctuation = (String) tokenItem.findFeature("p.punc");
642 if (punctuation.equals("")) {
643 /* no preceeding punctuation */
644 String n = String.valueOf(NumberExpander.expandRoman(romanString));
646 if (kingLike(tokenItem)) {
647 wordRelation.addWord("the");
648 NumberExpander.expandOrdinal(n, wordRelation);
649 } else if (sectionLike(tokenItem)) {
650 NumberExpander.expandNumber(n, wordRelation);
652 NumberExpander.expandLetters(romanString, wordRelation);
655 NumberExpander.expandLetters(romanString, wordRelation);
661 * Returns true if the given key is in the kingSectionLikeHash
662 * Hashtable, and the value is the same as the given value.
664 * @param key key to look for in the hashtable
665 * @param value the value to match
667 * @return true if it matches, or false if it does not or if
668 * the key is not mapped to any value in the hashtable.
670 private static boolean inKingSectionLikeHash(String key, String value) {
671 String hashValue = (String) kingSectionLikeHash.get(key);
672 if (hashValue != null) {
673 return (hashValue.equals(value));
682 * Returns true if the given token item contains a token that is
683 * in a king-like context, e.g., "King" or "Louis".
685 * @param tokenItem the token item to check
687 * @return true or false
689 public static boolean kingLike(Item tokenItem) {
691 ((String) tokenItem.findFeature("p.name")).toLowerCase();
692 if (inKingSectionLikeHash(kingName, KING_NAMES)) {
696 ((String) tokenItem.findFeature("p.p.name")).toLowerCase();
697 return inKingSectionLikeHash(kingTitle, KING_TITLES);
703 * Returns true if the given token item contains a token that is
704 * in a section-like context, e.g., "chapter" or "act".
706 * @param tokenItem the token item to check
708 * @return true or false
710 public static boolean sectionLike(Item tokenItem) {
712 ((String) tokenItem.findFeature("p.name")).toLowerCase();
713 return inKingSectionLikeHash(sectionType, SECTION_TYPES);
718 * Converts the given string containing "St" and "Dr" to (word) Items
719 * in the WordRelation.
721 * @param drStString the string with "St" and "Dr"
723 private void drStToWords(String drStString) {
724 String street = null;
726 char c0 = drStString.charAt(0);
728 if (c0 == 's' || c0 == 'S') {
736 FeatureSet featureSet = tokenItem.getFeatures();
737 String punctuation = featureSet.getString("punc");
739 String featPunctuation = (String) tokenItem.findFeature("punc");
741 if (tokenItem.getNext() == null ||
742 punctuation.indexOf(',') != -1) {
743 wordRelation.addWord(street);
744 } else if (featPunctuation.equals(",")) {
745 wordRelation.addWord(saint);
747 String pName = (String) tokenItem.findFeature("p.name");
748 String nName = (String) tokenItem.findFeature("n.name");
750 char p0 = pName.charAt(0);
751 char n0 = nName.charAt(0);
753 if (isUppercaseLetter(p0) && isLowercaseLetter(n0)) {
754 wordRelation.addWord(street);
755 } else if (NumberExpander.isDigit(p0) && isLowercaseLetter(n0)) {
756 wordRelation.addWord(street);
757 } else if (isLowercaseLetter(p0) && isUppercaseLetter(n0)) {
758 wordRelation.addWord(saint);
760 String whitespace = (String) tokenItem.findFeature("n.whitespace");
761 if (whitespace.equals(" ")) {
762 wordRelation.addWord(saint);
764 wordRelation.addWord(street);
769 if (punctuation != null && punctuation.equals(".")) {
770 featureSet.setString("punc", "");
776 * Converts US money string into (word) Items in the WordRelation.
778 * @param tokenVal the US money string
780 private void usMoneyToWords(String tokenVal) {
782 int dotIndex = tokenVal.indexOf('.');
784 if (matches(illionPattern,
785 (String) tokenItem.findFeature("n.name"))) {
786 NumberExpander.expandReal(tokenVal.substring(1), wordRelation);
787 } else if (dotIndex == -1) {
789 String aaa = tokenVal.substring(1);
792 if (aaa.equals("1")) {
793 wordRelation.addWord("dollar");
795 wordRelation.addWord("dollars");
797 } else if (dotIndex == (tokenVal.length() - 1) ||
798 (tokenVal.length() - dotIndex) > 3) {
799 /* simply read as mumble point mumble */
800 NumberExpander.expandReal(tokenVal.substring(1), wordRelation);
801 wordRelation.addWord("dollars");
803 String aaa = tokenVal.substring(1, dotIndex);
804 aaa = Utilities.deleteChar(aaa, ',');
805 String bbb = tokenVal.substring(dotIndex+1);
807 NumberExpander.expandNumber(aaa, wordRelation);
809 if (aaa.equals("1")) {
810 wordRelation.addWord("dollar");
812 wordRelation.addWord("dollars");
815 if (bbb.equals("00")) {
816 // add nothing to the word list
818 NumberExpander.expandNumber(bbb, wordRelation);
819 if (bbb.equals("01")) {
820 wordRelation.addWord("cent");
822 wordRelation.addWord("cents");
830 * Convert the given apostrophed word into (word) Items in the Word
833 * @param tokenVal the apostrophed word string
835 private void postropheToWords(String tokenVal) {
836 int index = tokenVal.indexOf('\'');
837 String bbb = tokenVal.substring(index).toLowerCase();
839 if (inStringArray(bbb, postrophes)) {
840 String aaa = tokenVal.substring(0, index);
842 wordRelation.addWord(bbb);
844 } else if (bbb.equals("'tve")) {
845 String aaa = tokenVal.substring(0, index-2);
847 wordRelation.addWord("'ve");
850 /* internal single quote deleted */
851 StringBuffer buffer = new StringBuffer(tokenVal);
852 buffer.deleteCharAt(index);
853 tokenToWords(buffer.toString());
859 * Convert the given digits/digits string into word (Items) in the
862 * @param tokenVal the digits/digits string
864 private void digitsSlashDigitsToWords(String tokenVal) {
866 /* might be fraction, or not */
867 int index = tokenVal.indexOf('/');
868 String aaa = tokenVal.substring(0, index);
869 String bbb = tokenVal.substring(index+1);
872 // if the previous token is a number, add an "and"
873 if (matches(digitsPattern, (String) tokenItem.findFeature("p.name"))
874 && tokenItem.getPrevious() != null) {
875 wordRelation.addWord("and");
878 if (aaa.equals("1") && bbb.equals("2")) {
879 wordRelation.addWord("a");
880 wordRelation.addWord("half");
881 } else if ((a = Integer.parseInt(aaa)) < (b = Integer.parseInt(bbb))) {
882 NumberExpander.expandNumber(aaa, wordRelation);
883 NumberExpander.expandOrdinal(bbb, wordRelation);
885 wordRelation.addWord("'s");
888 NumberExpander.expandNumber(aaa, wordRelation);
889 wordRelation.addWord("slash");
890 NumberExpander.expandNumber(bbb, wordRelation);
896 * Convert the given dashed string (e.g. "aaa-bbb") into (word) Items
897 * in the WordRelation.
899 * @param tokenVal the dashed string
901 private void dashToWords(String tokenVal) {
903 int index = tokenVal.indexOf('-');
904 String aaa = tokenVal.substring(0, index);
905 String bbb = tokenVal.substring(index+1, tokenVal.length());
907 if (matches(digitsPattern, aaa) && matches(digitsPattern, bbb)) {
908 FeatureSet featureSet = tokenItem.getFeatures();
909 featureSet.setString("name", aaa);
911 wordRelation.addWord("to");
912 featureSet.setString("name", bbb);
914 featureSet.setString("name", "");
923 * Convert the given string (which does not only consist of alphabet)
924 * into (word) Items in the WordRelation.
926 * @param tokenVal the string
928 private void notJustAlphasToWords(String tokenVal) {
930 /* its not just alphas */
932 int tokenLength = tokenVal.length();
934 for (; index < tokenLength; index++) {
935 if (isTextSplitable(tokenVal, index)) {
940 String aaa = tokenVal.substring(0, index+1);
941 String bbb = tokenVal.substring(index+1, tokenLength);
943 FeatureSet featureSet = tokenItem.getFeatures();
944 featureSet.setString("nsw", "nide");
951 * Returns true if the given word is pronounceable.
952 * This method is originally called us_aswd() in Flite 1.1.
954 * @param word the word to test
956 * @return true if the word is pronounceable, false otherwise
958 public boolean isPronounceable(String word) {
959 String lowerCaseWord = word.toLowerCase();
960 return (prefixFSM.accept(lowerCaseWord) &&
961 suffixFSM.accept(lowerCaseWord));
966 * Returns true if the given token is the name of a US state.
967 * If it is, it will add the name of the state to (word) Items in the
970 * @param tokenVal the token string
972 private boolean isStateName(String tokenVal) {
973 String[] state = (String[]) usStatesHash.get(tokenVal);
975 boolean expandState = false;
977 // check to see if the state initials are ambiguous
978 // in the English language
979 if (state[1].equals("ambiguous")) {
980 String previous = (String) tokenItem.findFeature("p.name");
981 String next = (String) tokenItem.findFeature("n.name");
983 // System.out.println("previous = " + previous);
984 // System.out.println("next = " + next);
986 int nextLength = next.length();
987 FeatureSet featureSet = tokenItem.getFeatures();
989 // check if the previous word starts with a capital letter,
990 // is at least 3 letters long, is an alphabet sequence,
992 boolean previousIsCity =
993 (isUppercaseLetter(previous.charAt(0))
994 && previous.length() > 2
995 && matches(alphabetPattern, previous)
996 && tokenItem.findFeature("p.punc").equals(","));
998 // check if next token starts with a lower case, or
999 // this is the end of sentence, or if next token
1000 // is a period (".") or a zip code (5 or 10 digits).
1001 boolean nextIsGood =
1002 (isLowercaseLetter(next.charAt(0))
1003 || tokenItem.getNext() == null
1004 || featureSet.getString("punc").equals(".")
1005 || ((nextLength == 5 || nextLength == 10) &&
1006 matches(digitsPattern, next)));
1008 if (previousIsCity && nextIsGood) {
1011 expandState = false;
1017 for (int j = 2; j < state.length; j++) {
1018 if (state[j] != null) {
1019 wordRelation.addWord(state[j]);
1030 * Determines if the given input matches the given Pattern.
1032 * @param pattern the pattern to match
1033 * @param input the string to test
1035 * @return <code>true</code> if the input string matches the given Pattern;
1036 * <code>false</code> otherwise
1038 private static boolean matches(Pattern pattern, String input) {
1039 Matcher m = pattern.matcher(input);
1045 * Determines if the character at the given position of the given
1046 * input text is splittable. A character is splittable if:
1048 * 1) the character and the following character are not letters
1049 * in the English alphabet (A-Z and a-z)
1051 * 2) the character and the following character are not digits (0-9)
1053 * @param text the text containing the character of interest
1054 * @param index the index of the character of interest
1056 * @return true if the position of the given text is splittable
1059 private static boolean isTextSplitable(String text, int index) {
1061 char c0 = text.charAt(index);
1062 char c1 = text.charAt(index+1);
1064 if (isLetter(c0) && isLetter(c1)) {
1066 } else if (NumberExpander.isDigit(c0) && NumberExpander.isDigit(c1)) {
1075 * Returns true if the given character is a letter (a-z or A-Z).
1077 * @param ch the character to test
1079 * @return true or false
1081 private static boolean isLetter(char ch) {
1082 return (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'));
1087 * Returns true if the given character is an uppercase letter (A-Z).
1089 * @param ch the character to test
1091 * @return true or false
1093 private static boolean isUppercaseLetter(char ch) {
1094 return ('A' <= ch && ch <= 'Z');
1099 * Returns true if the given character is a lowercase letter (a-z).
1101 * @param ch the character to test
1103 * @return true or false
1105 private static boolean isLowercaseLetter(char ch) {
1106 return ('a' <= ch && ch <= 'z');
1111 * Converts this object to its String representation
1113 * @return the string representation of this object
1115 public String toString() {
1116 return "TokenToWords";