git.gag.com Git - debian/freetts/blob - com/sun/speech/freetts/en/us/TokenToWords.java

   1 /**
   2  * Portions Copyright 2001-2003 Sun Microsystems, Inc.
   3  * Portions Copyright 1999-2001 Language Technologies Institute,
   4  * Carnegie Mellon University.
   5  * All Rights Reserved.  Use is subject to license terms.
   6  *
   7  * See the file "license.terms" for information on usage and
   8  * redistribution of this file, and for a DISCLAIMER OF ALL
   9  * WARRANTIES.
  10  */
  11 package com.sun.speech.freetts.en.us;
  12
  13 import java.util.Hashtable;
  14 import java.util.regex.Matcher;
  15 import java.util.regex.Pattern;
  16
  17 import com.sun.speech.freetts.FeatureSet;
  18 import com.sun.speech.freetts.Item;
  19 import com.sun.speech.freetts.ProcessException;
  20 import com.sun.speech.freetts.Relation;
  21 import com.sun.speech.freetts.Utterance;
  22 import com.sun.speech.freetts.UtteranceProcessor;
  23 import com.sun.speech.freetts.cart.CART;
  24 import com.sun.speech.freetts.util.Utilities;
  25
  26
  27 /**
  28  * Converts the Tokens (in US English words) in an
  29  * Utterance into a list of words. It puts the produced list back
  30  * into the Utterance. Usually, the tokens that get expanded are numbers
  31  * like "23" (to "twenty" "three").
  32  * <p>It translates the following code from flite:
  33  * <br>
  34  * <code>
  35  * lang/usenglish/us_text.c
  36  * </code>
  37  */
  38 public class TokenToWords implements UtteranceProcessor {
  39
  40     /** Regular expression for something that has a vowel */
  41     private static final String RX_HAS_VOWEL = ".*[aeiouAEIOU].*";
  42
  43     // Patterns for regular expression matching
  44     private static final Pattern alphabetPattern;
  45     private static final Pattern commaIntPattern;
  46     private static final Pattern digits2DashPattern;
  47     private static final Pattern digitsPattern;
  48     private static final Pattern digitsSlashDigitsPattern;
  49     private static final Pattern dottedAbbrevPattern;
  50     private static final Pattern doublePattern;
  51     private static final Pattern drStPattern;
  52     private static final Pattern fourDigitsPattern;
  53     private static final Pattern hasVowelPattern;
  54     private static final Pattern illionPattern;
  55     private static final Pattern numberTimePattern;
  56     private static final Pattern numessPattern;
  57     private static final Pattern ordinalPattern;
  58     private static final Pattern romanNumbersPattern;
  59     private static final Pattern sevenPhoneNumberPattern;
  60     private static final Pattern threeDigitsPattern;
  61     private static final Pattern usMoneyPattern;
  62
  63     static {
  64         alphabetPattern = Pattern.compile(USEnglish.RX_ALPHABET);
  65         commaIntPattern = Pattern.compile(USEnglish.RX_COMMAINT);
  66         digits2DashPattern = Pattern.compile(USEnglish.RX_DIGITS2DASH);
  67         digitsPattern = Pattern.compile(USEnglish.RX_DIGITS);
  68         digitsSlashDigitsPattern = Pattern.compile(USEnglish.RX_DIGITSSLASHDIGITS);
  69         dottedAbbrevPattern = Pattern.compile(USEnglish.RX_DOTTED_ABBREV);
  70         doublePattern = Pattern.compile(USEnglish.RX_DOUBLE);
  71         drStPattern = Pattern.compile(USEnglish.RX_DRST);
  72         fourDigitsPattern = Pattern.compile(USEnglish.RX_FOUR_DIGIT);
  73         hasVowelPattern = Pattern.compile(USEnglish.RX_HAS_VOWEL);
  74         illionPattern = Pattern.compile(USEnglish.RX_ILLION);
  75         numberTimePattern = Pattern.compile(USEnglish.RX_NUMBER_TIME);
  76         numessPattern = Pattern.compile(USEnglish.RX_NUMESS);
  77         ordinalPattern = Pattern.compile(USEnglish.RX_ORDINAL_NUMBER);
  78         romanNumbersPattern = Pattern.compile(USEnglish.RX_ROMAN_NUMBER);
  79         sevenPhoneNumberPattern = Pattern.compile(USEnglish.RX_SEVEN_DIGIT_PHONE_NUMBER);
  80         threeDigitsPattern = Pattern.compile(USEnglish.RX_THREE_DIGIT);
  81         usMoneyPattern = Pattern.compile(USEnglish.RX_US_MONEY);
  82     }
  83
  84     // King-like words
  85     private static final String[] kingNames = {
  86         "louis", "henry", "charles", "philip", "george",
  87         "edward", "pius", "william", "richard", "ptolemy",
  88         "john", "paul", "peter", "nicholas", "frederick",
  89         "james", "alfonso", "ivan", "napoleon", "leo",
  90         "gregory", "catherine", "alexandria", "pierre", "elizabeth",
  91         "mary" };
  92
  93     private static final String[] kingTitles = {
  94         "king", "queen", "pope", "duke", "tsar",
  95         "emperor", "shah", "caesar", "duchess", "tsarina",
  96         "empress", "baron", "baroness", "sultan", "count",
  97         "countess" };
  98
  99     // Section-like words
 100     private static final String[] sectionTypes = {
 101         "section", "chapter", "part", "phrase", "verse",
 102         "scene", "act", "book", "volume", "chap",
 103         "war", "apollo", "trek", "fortran" };
 104
 105     /**
 106      * Here we use a hashtable for constant time matching, instead of using
 107      * if (A.equals(B) || A.equals(C) || ...) to match Strings
 108      */
 109     private static Hashtable kingSectionLikeHash = new Hashtable();
 110
 111     private static final String KING_NAMES = "kingNames";
 112     private static final String KING_TITLES = "kingTitles";
 113     private static final String SECTION_TYPES = "sectionTypes";
 114
 115     // Hashtable initialization
 116     static {
 117         for (int i = 0; i < kingNames.length; i++) {
 118             kingSectionLikeHash.put(kingNames[i], KING_NAMES);
 119         }
 120         for (int i = 0; i < kingTitles.length; i++) {
 121             kingSectionLikeHash.put(kingTitles[i], KING_TITLES);
 122         }
 123         for (int i = 0; i < sectionTypes.length; i++) {
 124             kingSectionLikeHash.put(sectionTypes[i], SECTION_TYPES);
 125         }
 126     }
 127
 128     private static final String[] postrophes = {
 129         "'s", "'ll", "'ve", "'d" };
 130
 131     // Finite state machines to check if a Token is pronounceable
 132     private PronounceableFSM prefixFSM = null;
 133     private PronounceableFSM suffixFSM = null;
 134
 135     // List of US states abbreviations and their full names
 136     private static final String[][] usStates =
 137     {
 138         { "AL", "ambiguous", "alabama"  },
 139         { "Al", "ambiguous", "alabama"  },
 140         { "Ala", "", "alabama"  },
 141         { "AK", "", "alaska"  },
 142         { "Ak", "", "alaska"  },
 143         { "AZ", "", "arizona"  },
 144         { "Az", "", "arizona"  },
 145         { "CA", "", "california"  },
 146         { "Ca", "", "california"  },
 147         { "Cal", "ambiguous", "california"  },
 148         { "Calif", "", "california"  },
 149         { "CO", "ambiguous", "colorado"  },
 150         { "Co", "ambiguous", "colorado"  },
 151         { "Colo", "", "colorado"  },
 152         { "DC", "", "d" , "c" },
 153         { "DE", "", "delaware"  },
 154         { "De", "ambiguous", "delaware"  },
 155         { "Del", "ambiguous", "delaware"  },
 156         { "FL", "", "florida"  },
 157         { "Fl", "ambiguous", "florida"  },
 158         { "Fla", "", "florida"  },
 159         { "GA", "", "georgia"  },
 160         { "Ga", "", "georgia"  },
 161         { "HI", "ambiguous", "hawaii"  },
 162         { "Hi", "ambiguous", "hawaii"  },
 163         { "IA", "", "iowa"  },
 164         { "Ia", "ambiguous", "iowa"  },
 165         { "IN", "ambiguous", "indiana"  },
 166         { "In", "ambiguous", "indiana"  },
 167         { "Ind", "ambiguous", "indiana"  },
 168         { "ID", "ambiguous", "idaho"  },
 169         { "IL", "ambiguous", "illinois"  },
 170         { "Il", "ambiguous", "illinois"  },
 171         { "ILL", "ambiguous", "illinois"  },
 172         { "KS", "", "kansas"  },
 173         { "Ks", "", "kansas"  },
 174         { "Kans", "", "kansas"  },
 175         { "KY", "ambiguous", "kentucky"  },
 176         { "Ky", "ambiguous", "kentucky"  },
 177         { "LA", "ambiguous", "louisiana"  },
 178         { "La", "ambiguous", "louisiana"  },
 179         { "Lou", "ambiguous", "louisiana"  },
 180         { "Lous", "ambiguous", "louisiana"  },
 181         { "MA", "ambiguous", "massachusetts"  },
 182         { "Mass", "ambiguous", "massachusetts"  },
 183         { "Ma", "ambiguous", "massachusetts"  },
 184         { "MD", "ambiguous", "maryland"  },
 185         { "Md", "ambiguous", "maryland"  },
 186         { "ME", "ambiguous", "maine"  },
 187         { "Me", "ambiguous", "maine"  },
 188         { "MI", "", "michigan"  },
 189         { "Mi", "ambiguous", "michigan"  },
 190         { "Mich", "ambiguous", "michigan"  },
 191         { "MN", "ambiguous", "minnestota"  },
 192         { "Minn", "ambiguous", "minnestota"  },
 193         { "MS", "ambiguous", "mississippi"  },
 194         { "Miss", "ambiguous", "mississippi"  },
 195         { "MT", "ambiguous", "montanna"  },
 196         { "Mt", "ambiguous", "montanna"  },
 197         { "MO", "ambiguous", "missouri"  },
 198         { "Mo", "ambiguous", "missouri"  },
 199         { "NC", "ambiguous", "north" , "carolina" },
 200         { "ND", "ambiguous", "north" , "dakota" },
 201         { "NE", "ambiguous", "nebraska"  },
 202         { "Ne", "ambiguous", "nebraska"  },
 203         { "Neb", "ambiguous", "nebraska"  },
 204         { "NH", "ambiguous", "new" , "hampshire" },
 205         { "NV", "", "nevada"  },
 206         { "Nev", "", "nevada"  },
 207         { "NY", "", "new" , "york" },
 208         { "OH", "ambiguous", "ohio"  },
 209         { "OK", "ambiguous", "oklahoma"  },
 210         { "Okla", "", "oklahoma"  },
 211         { "OR", "ambiguous", "oregon"  },
 212         { "Or", "ambiguous", "oregon"  },
 213         { "Ore", "ambiguous", "oregon"  },
 214         { "PA", "ambiguous", "pennsylvania"  },
 215         { "Pa", "ambiguous", "pennsylvania"  },
 216         { "Penn", "ambiguous", "pennsylvania"  },
 217         { "RI", "ambiguous", "rhode" , "island" },
 218         { "SC", "ambiguous", "south" , "carlolina" },
 219         { "SD", "ambiguous", "south" , "dakota" },
 220         { "TN", "ambiguous", "tennesee"  },
 221         { "Tn", "ambiguous", "tennesee"  },
 222         { "Tenn", "ambiguous", "tennesee"  },
 223         { "TX", "ambiguous", "texas"  },
 224         { "Tx", "ambiguous", "texas"  },
 225         { "Tex", "ambiguous", "texas"  },
 226         { "UT", "ambiguous", "utah"  },
 227         { "VA", "ambiguous", "virginia"  },
 228         { "WA", "ambiguous", "washington"  },
 229         { "Wa", "ambiguous", "washington"  },
 230         { "Wash", "ambiguous", "washington"  },
 231         { "WI", "ambiguous", "wisconsin"  },
 232         { "Wi", "ambiguous", "wisconsin"  },
 233         { "WV", "ambiguous", "west" , "virginia" },
 234         { "WY", "ambiguous", "wyoming"  },
 235         { "Wy", "ambiguous", "wyoming"  },
 236         { "Wyo", "", "wyoming"  },
 237         { "PR", "ambiguous", "puerto" , "rico" }
 238     };
 239
 240     // Again hashtable for constant time searching
 241     private static Hashtable usStatesHash = new Hashtable();
 242
 243     // initialize the Hashtable for usStates
 244     static {
 245         for (int i = 0; i < usStates.length; i++) {
 246             usStatesHash.put(usStates[i][0], usStates[i]);
 247         }
 248     };
 249
 250
 251     // class variables
 252
 253     // the word relation that we are building
 254     private WordRelation wordRelation;
 255
 256     // the current token Item
 257     private Item tokenItem;
 258
 259     // a CART for classifying numbers
 260     private CART cart;
 261
 262
 263     /**
 264      * Constructs a default USTokenWordProcessor. It uses the USEnglish
 265      * regular expression set (USEngRegExp) by default.
 266      *
 267      * @param usNumbersCART the cart to use to classify numbers
 268      */
 269     public TokenToWords(CART usNumbersCART,
 270                         PronounceableFSM prefixFSM,
 271                         PronounceableFSM suffixFSM) {
 272         this.cart = usNumbersCART;
 273         this.prefixFSM = prefixFSM;
 274         this.suffixFSM = suffixFSM;
 275     }
 276
 277
 278     /**
 279      * Returns the currently processing token Item.
 280      *
 281      * @return the current token Item; null if no item
 282      */
 283     public Item getTokenItem() {
 284         return tokenItem;
 285     }
 286
 287
 288     /**
 289      *  process the utterance
 290      *
 291      * @param  utterance  the utterance contain the tokens
 292      *
 293      * @throws ProcessException if an IOException is thrown during the
 294      *         processing of the utterance
 295      */
 296     public void processUtterance(Utterance utterance) throws ProcessException {
 297         Relation tokenRelation;
 298         if ((tokenRelation = utterance.getRelation(Relation.TOKEN)) == null) {
 299             throw new IllegalStateException
 300                 ("TokenToWords: Token relation does not exist");
 301         }
 302
 303         wordRelation = WordRelation.createWordRelation(utterance, this);
 304
 305         for (tokenItem = tokenRelation.getHead();
 306              tokenItem != null;
 307              tokenItem = tokenItem.getNext()) {
 308
 309             FeatureSet featureSet = tokenItem.getFeatures();
 310             String tokenVal = featureSet.getString("name");
 311
 312             // convert the token into a list of words
 313             tokenToWords(tokenVal);
 314         }
 315     }
 316
 317
 318     /**
 319      * Returns true if the given token matches part of a phone number
 320      *
 321      * @param tokenItem the token
 322      * @param tokenVal the string value of the token
 323      *
 324      * @return true or false
 325      */
 326     private boolean matchesPartPhoneNumber(String tokenVal) {
 327
 328         String n_name = (String) tokenItem.findFeature("n.name");
 329         String n_n_name = (String) tokenItem.findFeature("n.n.name");
 330         String p_name = (String) tokenItem.findFeature("p.name");
 331         String p_p_name = (String) tokenItem.findFeature("p.p.name");
 332
 333         boolean matches3DigitsP_name = matches(threeDigitsPattern, p_name);
 334
 335         return ((matches(threeDigitsPattern, tokenVal) &&
 336                  ((!matches(digitsPattern, p_name)
 337                    && matches(threeDigitsPattern, n_name)
 338                    && matches(fourDigitsPattern, n_n_name)) ||
 339                   (matches(sevenPhoneNumberPattern, n_name)) ||
 340                   (!matches(digitsPattern, p_p_name)
 341                    && matches3DigitsP_name
 342                    && matches(fourDigitsPattern, n_name)))) ||
 343                 (matches(fourDigitsPattern, tokenVal) &&
 344                  (!matches(digitsPattern, n_name)
 345                   && matches3DigitsP_name
 346                   && matches(threeDigitsPattern, p_p_name))));
 347     }
 348
 349
 350     /**
 351      * Returns true if the given string is in the given string array.
 352      *
 353      * @param value the string to check
 354      * @param stringArray the array to check
 355      *
 356      * @return true if the string is in the array, false otherwise
 357      */
 358     private static boolean inStringArray(String value, String[] stringArray) {
 359         for (int i = 0; i < stringArray.length; i++) {
 360             if (stringArray[i].equals(value)) {
 361                 return true;
 362             }
 363         }
 364         return false;
 365     }
 366
 367
 368
 369     /**
 370      * Converts the given Token into (word) Items in the WordRelation.
 371      *
 372      * @param  tokenVal the String value of the token, which may or may not be
 373      *                  same as the one in called "name" in flite
 374      *
 375      */
 376     private void tokenToWords(String tokenVal) {
 377
 378         FeatureSet tokenFeatures = tokenItem.getFeatures();
 379         String itemName = tokenFeatures.getString("name");
 380         int tokenLength = tokenVal.length();
 381
 382         if (tokenFeatures.isPresent("phones")) {
 383             wordRelation.addWord(tokenVal);
 384
 385         } else if ((tokenVal.equals("a") || tokenVal.equals("A")) &&
 386                 ((tokenItem.getNext() == null) ||
 387                  !(tokenVal.equals(itemName)) ||
 388                  !(((String) tokenItem.findFeature("punc")).equals("")))) {
 389             /* if A is a sub part of a token, then its ey not ah */
 390             wordRelation.addWord("_a");
 391
 392         } else if (matches(alphabetPattern, tokenVal)) {
 393
 394             if (matches(romanNumbersPattern, tokenVal)) {
 395
 396                 /* XVIII */
 397                 romanToWords(tokenVal);
 398
 399             } else if (matches(illionPattern, tokenVal) &&
 400                        matches(usMoneyPattern,
 401                                (String) tokenItem.findFeature("p.name"))) {
 402                 /* $ X -illion */
 403                 wordRelation.addWord(tokenVal);
 404                 wordRelation.addWord("dollars");
 405
 406             } else if (matches(drStPattern, tokenVal)) {
 407
 408                 /* St Andrew's St, Dr King Dr */
 409                 drStToWords(tokenVal);
 410
 411             } else if (tokenVal.equals("Mr")) {
 412
 413                 tokenItem.getFeatures().setString("punc", "");
 414                 wordRelation.addWord("mister");
 415
 416             } else if (tokenVal.equals("Mrs")) {
 417
 418                 tokenItem.getFeatures().setString("punc", "");
 419                 wordRelation.addWord("missus");
 420
 421             } else if (tokenLength == 1
 422                        && isUppercaseLetter(tokenVal.charAt(0))
 423                        && ((String)tokenItem.findFeature("n.whitespace")).equals(" ")
 424                        && isUppercaseLetter
 425                        (((String) tokenItem.findFeature("n.name")).charAt(0))) {
 426
 427                 tokenFeatures.setString("punc", "");
 428                 String aaa = tokenVal.toLowerCase();
 429                 if (aaa.equals("a")) {
 430                     wordRelation.addWord("_a");
 431                 } else {
 432                     wordRelation.addWord(aaa);
 433                 }
 434             } else if (isStateName(tokenVal)) {
 435                 /*
 436                   The name of a US state
 437                   isStateName() has already added the full name of the
 438                   state, so we're all set.
 439                 */
 440             } else if (tokenLength > 1 && !isPronounceable(tokenVal)) {
 441                 /* Need common exception list */
 442                 /* unpronouncable list of alphas */
 443                 NumberExpander.expandLetters
 444                     (tokenVal, wordRelation);
 445
 446             } else {
 447                 /* just a word */
 448                 wordRelation.addWord(tokenVal.toLowerCase());
 449             }
 450
 451         } else if (matches(dottedAbbrevPattern, tokenVal)) {
 452
 453             /* U.S.A. */
 454             // remove all dots
 455             String aaa = Utilities.deleteChar(tokenVal, '.');
 456             NumberExpander.expandLetters(aaa, wordRelation);
 457
 458         } else if (matches(commaIntPattern, tokenVal)) {
 459
 460             /* 99,999,999 */
 461             String aaa = Utilities.deleteChar(tokenVal, ',');
 462             NumberExpander.expandReal(aaa, wordRelation);
 463
 464         } else if (matches(sevenPhoneNumberPattern, tokenVal)) {
 465
 466             /* 234-3434  telephone numbers */
 467             int dashIndex = tokenVal.indexOf('-');
 468             String aaa = tokenVal.substring(0, dashIndex);
 469             String bbb = tokenVal.substring(dashIndex+1);
 470
 471             NumberExpander.expandDigits(aaa, wordRelation);
 472             wordRelation.addBreak();
 473             NumberExpander.expandDigits(bbb, wordRelation);
 474
 475         } else if (matchesPartPhoneNumber(tokenVal)) {
 476
 477             /* part of a telephone number */
 478             String punctuation = (String) tokenItem.findFeature("punc");
 479             if (punctuation.equals("")) {
 480                 tokenItem.getFeatures().setString("punc", ",");
 481             }
 482             NumberExpander.expandDigits(tokenVal, wordRelation);
 483             wordRelation.addBreak();
 484
 485         } else if (matches(numberTimePattern, tokenVal)) {
 486
 487             /* 12:35 */
 488             int colonIndex = tokenVal.indexOf(':');
 489             String aaa = tokenVal.substring(0, colonIndex);
 490             String bbb = tokenVal.substring(colonIndex+1);
 491
 492             NumberExpander.expandNumber(aaa, wordRelation);
 493             if (!(bbb.equals("00"))) {
 494                 NumberExpander.expandID(bbb, wordRelation);
 495             }
 496
 497         } else if (matches(digits2DashPattern, tokenVal)) {
 498
 499             /* 999-999-999 */
 500             digitsDashToWords(tokenVal);
 501
 502         } else if (matches(digitsPattern, tokenVal)) {
 503
 504             digitsToWords(tokenVal);
 505
 506         } else if (tokenLength == 1
 507                    && isUppercaseLetter(tokenVal.charAt(0))
 508                    && ((String)tokenItem.findFeature("n.whitespace")).equals
 509                    (" ")
 510                    && isUppercaseLetter
 511                    (((String) tokenItem.findFeature("n.name")).charAt(0))) {
 512
 513             tokenFeatures.setString("punc", "");
 514             String aaa = tokenVal.toLowerCase();
 515             if (aaa.equals("a")) {
 516                 wordRelation.addWord("_a");
 517             } else {
 518                 wordRelation.addWord(aaa);
 519             }
 520         } else if (matches(doublePattern, tokenVal)) {
 521
 522             NumberExpander.expandReal(tokenVal, wordRelation);
 523
 524         } else if (matches(ordinalPattern, tokenVal)) {
 525
 526             /* explicit ordinals */
 527             String aaa = tokenVal.substring(0, tokenLength - 2);
 528             NumberExpander.expandOrdinal(aaa, wordRelation);
 529
 530         } else if (matches(usMoneyPattern, tokenVal)) {
 531
 532             /* US money */
 533             usMoneyToWords(tokenVal);
 534
 535         } else if (tokenLength > 0
 536                    && tokenVal.charAt(tokenLength - 1) == '%') {
 537
 538             /* Y% */
 539             tokenToWords(tokenVal.substring(0, tokenLength - 1));
 540             wordRelation.addWord("per");
 541             wordRelation.addWord("cent");
 542
 543         } else if (matches(numessPattern, tokenVal)) {
 544
 545             /* 60s and 7s and 9s */
 546             tokenToWords(tokenVal.substring(0, tokenLength - 1));
 547             wordRelation.addWord("'s");
 548
 549         } else if (tokenVal.indexOf('\'') != -1) {
 550
 551             postropheToWords(tokenVal);
 552
 553         } else if (matches(digitsSlashDigitsPattern, tokenVal) &&
 554                    tokenVal.equals(itemName)) {
 555
 556             digitsSlashDigitsToWords(tokenVal);
 557
 558         } else if (tokenVal.indexOf('-') != -1) {
 559
 560             dashToWords(tokenVal);
 561
 562         } else if (tokenLength > 1 &&
 563                    !matches(alphabetPattern, tokenVal)) {
 564
 565             notJustAlphasToWords(tokenVal);
 566
 567         } else {
 568             /* just a word */
 569             wordRelation.addWord(tokenVal.toLowerCase());
 570         }
 571     }
 572
 573
 574     /**
 575      * Convert the given digit token with dashes (e.g. 999-999-999)
 576      * into (word) Items in the WordRelation.
 577      *
 578      * @param tokenVal  the digit string
 579      */
 580     private void digitsDashToWords(String tokenVal) {
 581         int tokenLength = tokenVal.length();
 582         int a = 0;
 583         for (int p = 0; p <= tokenLength; p++) {
 584             if (p == tokenLength || tokenVal.charAt(p) == '-') {
 585                 String aaa = tokenVal.substring(a, p);
 586                 NumberExpander.expandDigits(aaa, wordRelation);
 587                 wordRelation.addBreak();
 588                 a = p+1;
 589             }
 590         }
 591     }
 592
 593
 594     /**
 595      * Convert the given digit token into (word) Items in the WordRelation.
 596      *
 597      * @param tokenVal  the digit string
 598      */
 599     private void digitsToWords(String tokenVal) {
 600         FeatureSet featureSet = tokenItem.getFeatures();
 601         String nsw = "";
 602         if (featureSet.isPresent("nsw")) {
 603             nsw = featureSet.getString("nsw");
 604         }
 605
 606         if (nsw.equals("nide")) {
 607             NumberExpander.expandID(tokenVal, wordRelation);
 608         } else {
 609             String rName = featureSet.getString("name");
 610             String digitsType = null;
 611
 612             if (tokenVal.equals(rName)) {
 613                 digitsType = (String) cart.interpret(tokenItem);
 614             } else {
 615                 featureSet.setString("name", tokenVal);
 616                 digitsType = (String) cart.interpret(tokenItem);
 617                 featureSet.setString("name", rName);
 618             }
 619
 620             if (digitsType.equals("ordinal")) {
 621                 NumberExpander.expandOrdinal(tokenVal, wordRelation);
 622             } else if (digitsType.equals("digits")) {
 623                 NumberExpander.expandDigits(tokenVal, wordRelation);
 624             } else if (digitsType.equals("year")) {
 625                 NumberExpander.expandID(tokenVal, wordRelation);
 626             } else {
 627                 NumberExpander.expandNumber(tokenVal, wordRelation);
 628             }
 629         }
 630     }
 631
 632
 633     /**
 634      * Converts the given Roman numeral string into (word) Items in the
 635      * WordRelation.
 636      *
 637      * @param romanString the roman numeral string
 638      */
 639     private void romanToWords(String romanString) {
 640         String punctuation = (String) tokenItem.findFeature("p.punc");
 641
 642         if (punctuation.equals("")) {
 643             /* no preceeding punctuation */
 644             String n = String.valueOf(NumberExpander.expandRoman(romanString));
 645
 646             if (kingLike(tokenItem)) {
 647                 wordRelation.addWord("the");
 648                 NumberExpander.expandOrdinal(n, wordRelation);
 649             } else if (sectionLike(tokenItem)) {
 650                 NumberExpander.expandNumber(n, wordRelation);
 651             } else {
 652                 NumberExpander.expandLetters(romanString, wordRelation);
 653             }
 654         } else {
 655             NumberExpander.expandLetters(romanString, wordRelation);
 656         }
 657     }
 658
 659
 660     /**
 661      * Returns true if the given key is in the kingSectionLikeHash
 662      * Hashtable, and the value is the same as the given value.
 663      *
 664      * @param key key to look for in the hashtable
 665      * @param value the value to match
 666      *
 667      * @return true if it matches, or false if it does not or if
 668      * the key is not mapped to any value in the hashtable.
 669      */
 670     private static boolean inKingSectionLikeHash(String key, String value) {
 671         String hashValue = (String) kingSectionLikeHash.get(key);
 672         if (hashValue != null) {
 673             return (hashValue.equals(value));
 674         } else {
 675             return false;
 676         }
 677     }
 678
 679
 680
 681     /**
 682      * Returns true if the given token item contains a token that is
 683      * in a king-like context, e.g., "King" or "Louis".
 684      *
 685      * @param tokenItem the token item to check
 686      *
 687      * @return true or false
 688      */
 689     public static boolean kingLike(Item tokenItem) {
 690         String kingName =
 691             ((String) tokenItem.findFeature("p.name")).toLowerCase();
 692         if (inKingSectionLikeHash(kingName, KING_NAMES)) {
 693             return true;
 694         } else {
 695             String kingTitle =
 696                 ((String) tokenItem.findFeature("p.p.name")).toLowerCase();
 697             return inKingSectionLikeHash(kingTitle, KING_TITLES);
 698         }
 699     }
 700
 701
 702     /**
 703      * Returns true if the given token item contains a token that is
 704      * in a section-like context, e.g., "chapter" or "act".
 705      *
 706      * @param tokenItem the token item to check
 707      *
 708      * @return true or false
 709      */
 710     public static boolean sectionLike(Item tokenItem) {
 711         String sectionType =
 712             ((String) tokenItem.findFeature("p.name")).toLowerCase();
 713         return inKingSectionLikeHash(sectionType, SECTION_TYPES);
 714     }
 715
 716
 717     /**
 718      * Converts the given string containing "St" and "Dr" to (word) Items
 719      * in the WordRelation.
 720      *
 721      * @param drStString the string with "St" and "Dr"
 722      */
 723     private void drStToWords(String drStString) {
 724         String street = null;
 725         String saint = null;
 726         char c0 = drStString.charAt(0);
 727
 728         if (c0 == 's' || c0 == 'S') {
 729             street = "street";
 730             saint = "saint";
 731         } else {
 732             street = "drive";
 733             saint = "doctor";
 734         }
 735
 736         FeatureSet featureSet = tokenItem.getFeatures();
 737         String punctuation = featureSet.getString("punc");
 738
 739         String featPunctuation = (String) tokenItem.findFeature("punc");
 740
 741         if (tokenItem.getNext() == null ||
 742             punctuation.indexOf(',') != -1) {
 743             wordRelation.addWord(street);
 744         } else if (featPunctuation.equals(",")) {
 745             wordRelation.addWord(saint);
 746         } else {
 747             String pName = (String) tokenItem.findFeature("p.name");
 748             String nName = (String) tokenItem.findFeature("n.name");
 749
 750             char p0 = pName.charAt(0);
 751             char n0 = nName.charAt(0);
 752
 753             if (isUppercaseLetter(p0) && isLowercaseLetter(n0)) {
 754                 wordRelation.addWord(street);
 755             } else if (NumberExpander.isDigit(p0) && isLowercaseLetter(n0)) {
 756                 wordRelation.addWord(street);
 757             } else if (isLowercaseLetter(p0) && isUppercaseLetter(n0)) {
 758                 wordRelation.addWord(saint);
 759             } else {
 760                 String whitespace = (String) tokenItem.findFeature("n.whitespace");
 761                 if (whitespace.equals(" ")) {
 762                     wordRelation.addWord(saint);
 763                 } else {
 764                     wordRelation.addWord(street);
 765                 }
 766             }
 767         }
 768
 769         if (punctuation != null && punctuation.equals(".")) {
 770             featureSet.setString("punc", "");
 771         }
 772     }
 773
 774
 775     /**
 776      * Converts US money string into (word) Items in the WordRelation.
 777      *
 778      * @param tokenVal the US money string
 779      */
 780     private void usMoneyToWords(String tokenVal) {
 781
 782         int dotIndex = tokenVal.indexOf('.');
 783
 784         if (matches(illionPattern,
 785                     (String) tokenItem.findFeature("n.name"))) {
 786             NumberExpander.expandReal(tokenVal.substring(1), wordRelation);
 787         } else if (dotIndex == -1) {
 788
 789             String aaa = tokenVal.substring(1);
 790             tokenToWords(aaa);
 791
 792             if (aaa.equals("1")) {
 793                 wordRelation.addWord("dollar");
 794             } else {
 795                 wordRelation.addWord("dollars");
 796             }
 797         } else if (dotIndex == (tokenVal.length() - 1) ||
 798                    (tokenVal.length() - dotIndex) > 3) {
 799             /* simply read as mumble point mumble */
 800             NumberExpander.expandReal(tokenVal.substring(1), wordRelation);
 801             wordRelation.addWord("dollars");
 802         } else {
 803             String aaa = tokenVal.substring(1, dotIndex);
 804             aaa = Utilities.deleteChar(aaa, ',');
 805             String bbb = tokenVal.substring(dotIndex+1);
 806
 807             NumberExpander.expandNumber(aaa, wordRelation);
 808
 809             if (aaa.equals("1")) {
 810                 wordRelation.addWord("dollar");
 811             } else {
 812                 wordRelation.addWord("dollars");
 813             }
 814
 815             if (bbb.equals("00")) {
 816                 // add nothing to the word list
 817             } else {
 818                 NumberExpander.expandNumber(bbb, wordRelation);
 819                 if (bbb.equals("01")) {
 820                     wordRelation.addWord("cent");
 821                 } else {
 822                     wordRelation.addWord("cents");
 823                 }
 824             }
 825         }
 826     }
 827
 828
 829     /**
 830      * Convert the given apostrophed word into (word) Items in the Word
 831      * Relation.
 832      *
 833      * @param tokenVal the apostrophed word string
 834      */
 835     private void postropheToWords(String tokenVal) {
 836         int index = tokenVal.indexOf('\'');
 837         String bbb = tokenVal.substring(index).toLowerCase();
 838
 839         if (inStringArray(bbb, postrophes)) {
 840             String aaa = tokenVal.substring(0, index);
 841             tokenToWords(aaa);
 842             wordRelation.addWord(bbb);
 843
 844         } else if (bbb.equals("'tve")) {
 845             String aaa = tokenVal.substring(0, index-2);
 846             tokenToWords(aaa);
 847             wordRelation.addWord("'ve");
 848
 849         } else {
 850             /* internal single quote deleted */
 851             StringBuffer buffer = new StringBuffer(tokenVal);
 852             buffer.deleteCharAt(index);
 853             tokenToWords(buffer.toString());
 854         }
 855     }
 856
 857
 858     /**
 859      * Convert the given digits/digits string into word (Items) in the
 860      * WordRelation.
 861      *
 862      * @param tokenVal the digits/digits string
 863      */
 864     private void digitsSlashDigitsToWords(String tokenVal) {
 865
 866         /* might be fraction, or not */
 867         int index = tokenVal.indexOf('/');
 868         String aaa = tokenVal.substring(0, index);
 869         String bbb = tokenVal.substring(index+1);
 870         int a, b;
 871
 872         // if the previous token is a number, add an "and"
 873         if (matches(digitsPattern, (String) tokenItem.findFeature("p.name"))
 874             && tokenItem.getPrevious() != null) {
 875             wordRelation.addWord("and");
 876         }
 877
 878         if (aaa.equals("1") && bbb.equals("2")) {
 879             wordRelation.addWord("a");
 880             wordRelation.addWord("half");
 881         } else if ((a = Integer.parseInt(aaa)) < (b = Integer.parseInt(bbb))) {
 882             NumberExpander.expandNumber(aaa, wordRelation);
 883             NumberExpander.expandOrdinal(bbb, wordRelation);
 884             if (a > 1) {
 885                 wordRelation.addWord("'s");
 886             }
 887         } else {
 888             NumberExpander.expandNumber(aaa, wordRelation);
 889             wordRelation.addWord("slash");
 890             NumberExpander.expandNumber(bbb, wordRelation);
 891         }
 892     }
 893
 894
 895     /**
 896      * Convert the given dashed string (e.g. "aaa-bbb") into (word) Items
 897      * in the WordRelation.
 898      *
 899      * @param tokenVal the dashed string
 900      */
 901     private void dashToWords(String tokenVal) {
 902
 903         int index = tokenVal.indexOf('-');
 904         String aaa = tokenVal.substring(0, index);
 905         String bbb = tokenVal.substring(index+1, tokenVal.length());
 906
 907         if (matches(digitsPattern, aaa) && matches(digitsPattern, bbb)) {
 908             FeatureSet featureSet = tokenItem.getFeatures();
 909             featureSet.setString("name", aaa);
 910             tokenToWords(aaa);
 911             wordRelation.addWord("to");
 912             featureSet.setString("name", bbb);
 913             tokenToWords(bbb);
 914             featureSet.setString("name", "");
 915         } else {
 916             tokenToWords(aaa);
 917             tokenToWords(bbb);
 918         }
 919     }
 920
 921
 922     /**
 923      * Convert the given string (which does not only consist of alphabet)
 924      * into (word) Items in the WordRelation.
 925      *
 926      * @param tokenVal the string
 927      */
 928     private void notJustAlphasToWords(String tokenVal) {
 929
 930         /* its not just alphas */
 931         int index = 0;
 932         int tokenLength = tokenVal.length();
 933
 934         for (; index < tokenLength; index++) {
 935             if (isTextSplitable(tokenVal, index)) {
 936                 break;
 937             }
 938         }
 939
 940         String aaa = tokenVal.substring(0, index+1);
 941         String bbb = tokenVal.substring(index+1, tokenLength);
 942
 943         FeatureSet featureSet = tokenItem.getFeatures();
 944         featureSet.setString("nsw", "nide");
 945         tokenToWords(aaa);
 946         tokenToWords(bbb);
 947     }
 948
 949
 950     /**
 951      * Returns true if the given word is pronounceable.
 952      * This method is originally called us_aswd() in Flite 1.1.
 953      *
 954      * @param word the word to test
 955      *
 956      * @return true if the word is pronounceable, false otherwise
 957      */
 958     public boolean isPronounceable(String word) {
 959         String lowerCaseWord = word.toLowerCase();
 960         return (prefixFSM.accept(lowerCaseWord) &&
 961                 suffixFSM.accept(lowerCaseWord));
 962     }
 963
 964
 965     /**
 966      * Returns true if the given token is the name of a US state.
 967      * If it is, it will add the name of the state to (word) Items in the
 968      * WordRelation.
 969      *
 970      * @param tokenVal the token string
 971      */
 972     private boolean isStateName(String tokenVal) {
 973         String[] state = (String[]) usStatesHash.get(tokenVal);
 974         if (state != null) {
 975             boolean expandState = false;
 976
 977             // check to see if the state initials are ambiguous
 978             // in the English language
 979             if (state[1].equals("ambiguous")) {
 980                 String previous = (String) tokenItem.findFeature("p.name");
 981                 String next = (String) tokenItem.findFeature("n.name");
 982
 983                 // System.out.println("previous = " + previous);
 984                 // System.out.println("next = " + next);
 985
 986                 int nextLength = next.length();
 987                 FeatureSet featureSet = tokenItem.getFeatures();
 988
 989                 // check if the previous word starts with a capital letter,
 990                 // is at least 3 letters long, is an alphabet sequence,
 991                 // and has a comma.
 992                 boolean previousIsCity =
 993                     (isUppercaseLetter(previous.charAt(0))
 994                      && previous.length() > 2
 995                      && matches(alphabetPattern, previous)
 996                      && tokenItem.findFeature("p.punc").equals(","));
 997
 998                 // check if next token starts with a lower case, or
 999                 // this is the end of sentence, or if next token
1000                 // is a period (".") or a zip code (5 or 10 digits).
1001                 boolean nextIsGood =
1002                     (isLowercaseLetter(next.charAt(0))
1003                      || tokenItem.getNext() == null
1004                      || featureSet.getString("punc").equals(".")
1005                      || ((nextLength == 5 || nextLength == 10) &&
1006                          matches(digitsPattern, next)));
1007
1008                 if (previousIsCity && nextIsGood) {
1009                     expandState = true;
1010                 } else {
1011                     expandState = false;
1012                 }
1013             } else {
1014                 expandState = true;
1015             }
1016             if (expandState) {
1017                 for (int j = 2; j < state.length; j++) {
1018                     if (state[j] != null) {
1019                         wordRelation.addWord(state[j]);
1020                     }
1021                 }
1022                 return true;
1023             }
1024         }
1025         return false;
1026     }
1027
1028
1029     /**
1030      * Determines if the given input matches the given Pattern.
1031      *
1032      * @param pattern the pattern to match
1033      * @param input the string to test
1034      *
1035      * @return <code>true</code> if the input string matches the given Pattern;
1036      *         <code>false</code> otherwise
1037      */
1038     private static boolean matches(Pattern pattern, String input) {
1039         Matcher m = pattern.matcher(input);
1040         return m.matches();
1041     }
1042
1043
1044     /**
1045      * Determines if the character at the given position of the given
1046      * input text is splittable. A character is splittable if:
1047      * <p>
1048      * 1) the character and the following character are not letters
1049      *    in the English alphabet (A-Z and a-z)
1050      * <p>
1051      * 2) the character and the following character are not digits (0-9)
1052      * <p>
1053      * @param text the text containing the character of interest
1054      * @param index the index of the character of interest
1055      *
1056      * @return true if the position of the given text is splittable
1057      *         false otherwise
1058      */
1059     private static boolean isTextSplitable(String text, int index) {
1060
1061         char c0 = text.charAt(index);
1062         char c1 = text.charAt(index+1);
1063
1064         if (isLetter(c0) && isLetter(c1)) {
1065             return false;
1066         } else if (NumberExpander.isDigit(c0) && NumberExpander.isDigit(c1)) {
1067             return false;
1068         } else {
1069             return true;
1070         }
1071     }
1072
1073
1074     /**
1075      * Returns true if the given character is a letter (a-z or A-Z).
1076      *
1077      * @param ch the character to test
1078      *
1079      * @return true or false
1080      */
1081     private static boolean isLetter(char ch) {
1082         return (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'));
1083     }
1084
1085
1086     /**
1087      * Returns true if the given character is an uppercase letter (A-Z).
1088      *
1089      * @param ch the character to test
1090      *
1091      * @return true or false
1092      */
1093     private static boolean isUppercaseLetter(char ch) {
1094         return ('A' <= ch && ch <= 'Z');
1095     }
1096
1097
1098     /**
1099      * Returns true if the given character is a lowercase letter (a-z).
1100      *
1101      * @param ch the character to test
1102      *
1103      * @return true or false
1104      */
1105     private static boolean isLowercaseLetter(char ch) {
1106         return ('a' <= ch && ch <= 'z');
1107     }
1108
1109
1110     /**
1111      * Converts this object to its String representation
1112      *
1113      * @return the string representation of this object
1114      */
1115     public String toString() {
1116         return "TokenToWords";
1117     }
1118 }