2 * Portions Copyright 2001 Sun Microsystems, Inc.
3 * Portions Copyright 1999-2001 Language Technologies Institute,
4 * Carnegie Mellon University.
5 * All Rights Reserved. Use is subject to license terms.
7 * See the file "license.terms" for information on usage and
8 * redistribution of this file, and for a DISCLAIMER OF ALL
11 package com.sun.speech.freetts.en.us;
16 * Provides the definitions for US English whitespace, punctuation,
17 * prepunctuation, and postpunctuation symbols. It also contains a set of
18 * Regular Expressions for the US English language.
19 * With regular expressions, it specifies what are whitespace,
20 * letters in the alphabet, uppercase and lowercase letters, alphanumeric
21 * characters, identifiers, integers, doubles, digits, and 'comma and int'.
23 * It translates the following code from flite:
24 * src/regex/cst_regex.c
25 * lang/usenglish/us_text.c
27 public class USEnglish {
29 /** default whitespace pattern: one or more spaces, newlines, tabs or carriage returns */
30 public static final String RX_DEFAULT_US_EN_WHITESPACE = "[ \n\t\r]+";
31 /** default letter pattern: one or more ASCII letters of either case */
32 public static final String RX_DEFAULT_US_EN_ALPHABET = "[A-Za-z]+";
33 /** default uppercase pattern: one or more ASCII uppercase letters */
34 public static final String RX_DEFAULT_US_EN_UPPERCASE = "[A-Z]+";
35 /** default lowercase pattern: one or more ASCII lowercase letters */
36 public static final String RX_DEFAULT_US_EN_LOWERCASE = "[a-z]+";
37 /** default alpha-numeric pattern: one or more ASCII digits or letters */
38 public static final String RX_DEFAULT_US_EN_ALPHANUMERIC = "[0-9A-Za-z]+";
39 /** default identifier pattern: a letter or underscore, then one or more letters, digits or underscores */
40 public static final String RX_DEFAULT_US_EN_IDENTIFIER = "[A-Za-z_][0-9A-Za-z_]+";
41 /** default integer pattern: an optional minus sign, then one or more digits */
42 public static final String RX_DEFAULT_US_EN_INT = "-?[0-9]+";
43 /** default double pattern: optional minus, digits with an optional fraction (or a bare fraction), optional signed exponent */
44 public static final String RX_DEFAULT_US_EN_DOUBLE =
45 "-?(([0-9]+\\.[0-9]*)|([0-9]+)|(\\.[0-9]+))([eE][---+]?[0-9]+)?";
46 /** default integer with commas pattern: 1-3 digits, one or more comma-prefixed 3-digit groups, optional decimal fraction */
47 public static final String RX_DEFAULT_US_EN_COMMAINT =
48 "[0-9][0-9]?[0-9]?,([0-9][0-9][0-9],)*[0-9][0-9][0-9](\\.[0-9]+)?";
49 /** default digits pattern: one or more digits */
50 public static final String RX_DEFAULT_US_EN_DIGITS = "[0-9][0-9]*";
51 /** default dotted abbreviation pattern: zero or more letter-plus-dot pairs, then a final letter (e.g. U.S.A) */
52 public static final String RX_DEFAULT_US_EN_DOTTED_ABBREV = "([A-Za-z]\\.)*[A-Za-z]";
53 /** default ordinal number pattern: a digit, then any digits or commas, then th/st/nd/rd (all lowercase or all uppercase) */
54 public static final String RX_DEFAULT_US_EN_ORDINAL_NUMBER =
55 "[0-9][0-9,]*(th|TH|st|ST|nd|ND|rd|RD)";
56 /** default has-vowel pattern: at least one of a, e, i, o, u (either case) with any '.'-matched text around it */
57 public static final String RX_DEFAULT_HAS_VOWEL = ".*[aeiouAEIOU].*";
58 /** default US money pattern: a dollar sign, one or more digits or commas, optional decimal fraction */
59 public static final String RX_DEFAULT_US_MONEY = "\\$[0-9,]+(\\.[0-9]+)?";
60 /** default -illion pattern: any '.'-matched text followed by "illion" (e.g. million) */
61 public static final String RX_DEFAULT_ILLION = ".*illion";
62 /** default digits2dash pattern: three or more digit groups joined by dashes (e.g. 999-999-999) */
63 public static final String RX_DEFAULT_DIGITS2DASH = "[0-9]+(-[0-9]+)(-[0-9]+)+";
64 /** default digits/digits pattern: two digit groups separated by a slash (e.g. 999/999) */
65 public static final String RX_DEFAULT_DIGITSSLASHDIGITS = "[0-9]+/[0-9]+";
66 /** default number time pattern: hours 00-02 or 10-19, a colon, then minutes 00-59 */
// NOTE(review): hours 03-09 and 20-23 are not matched by this pattern - confirm that is intended.
67 public static final String RX_DEFAULT_NUMBER_TIME = "((0[0-2])|(1[0-9])):([0-5][0-9])";
68 /** default Roman numerals pattern: I-III, IV, V-VIII, IX, or X followed by any run of V, I and X */
69 public static final String RX_DEFAULT_ROMAN_NUMBER =
70 "(II?I?|IV|VI?I?I?|IX|X[VIX]*)";
71 /** default drst pattern: the letters "dr" or "st" in any mix of case (as in "Dr", "St") */
72 public static final String RX_DEFAULT_DRST = "([dD][Rr]|[Ss][Tt])";
/** default numess pattern: one or more digits followed by a lowercase "s" (e.g. 1950s) */
74 public static final String RX_DEFAULT_NUMESS = "[0-9]+s";
75 /** default 7-digit phone number pattern: three digits, a dash, then four digits */
76 public static final String RX_DEFAULT_SEVEN_DIGIT_PHONE_NUMBER =
77 "[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]";
78 /** default 4-digit number pattern: exactly four digits */
79 public static final String RX_DEFAULT_FOUR_DIGIT =
80 "[0-9][0-9][0-9][0-9]";
81 /** default 3-digit number */
82 public static final String RX_DEFAULT_THREE_DIGIT =
86 /** whitespace pattern in use; starts as RX_DEFAULT_US_EN_WHITESPACE and is not final, so it can be reassigned */
87 public static String RX_WHITESPACE = RX_DEFAULT_US_EN_WHITESPACE;
88 /** letter pattern in use; starts as RX_DEFAULT_US_EN_ALPHABET (not final) */
89 public static String RX_ALPHABET = RX_DEFAULT_US_EN_ALPHABET;
90 /** uppercase pattern in use; starts as RX_DEFAULT_US_EN_UPPERCASE (not final) */
91 public static String RX_UPPERCASE = RX_DEFAULT_US_EN_UPPERCASE;
92 /** lowercase pattern in use; starts as RX_DEFAULT_US_EN_LOWERCASE (not final) */
93 public static String RX_LOWERCASE = RX_DEFAULT_US_EN_LOWERCASE;
94 /** alphanumeric pattern in use; starts as RX_DEFAULT_US_EN_ALPHANUMERIC (not final) */
95 public static String RX_ALPHANUMERIC = RX_DEFAULT_US_EN_ALPHANUMERIC;
96 /** identifier pattern in use; starts as RX_DEFAULT_US_EN_IDENTIFIER (not final) */
97 public static String RX_IDENTIFIER = RX_DEFAULT_US_EN_IDENTIFIER;
98 /** integer pattern in use; starts as RX_DEFAULT_US_EN_INT (not final) */
99 public static String RX_INT = RX_DEFAULT_US_EN_INT;
100 /** double pattern in use; starts as RX_DEFAULT_US_EN_DOUBLE (not final) */
101 public static String RX_DOUBLE = RX_DEFAULT_US_EN_DOUBLE;
102 /** comma separated integer pattern in use; starts as RX_DEFAULT_US_EN_COMMAINT (not final) */
103 public static String RX_COMMAINT = RX_DEFAULT_US_EN_COMMAINT;
104 /** digits pattern in use; starts as RX_DEFAULT_US_EN_DIGITS (not final) */
105 public static String RX_DIGITS = RX_DEFAULT_US_EN_DIGITS;
106 /** dotted abbreviation pattern in use; starts as RX_DEFAULT_US_EN_DOTTED_ABBREV (not final) */
107 public static String RX_DOTTED_ABBREV = RX_DEFAULT_US_EN_DOTTED_ABBREV;
108 /** ordinal number pattern in use; starts as RX_DEFAULT_US_EN_ORDINAL_NUMBER (not final) */
109 public static String RX_ORDINAL_NUMBER = RX_DEFAULT_US_EN_ORDINAL_NUMBER;
110 /** has-vowel pattern; constant alias of RX_DEFAULT_HAS_VOWEL */
111 public static final String RX_HAS_VOWEL = RX_DEFAULT_HAS_VOWEL;
112 /** US money pattern; constant alias of RX_DEFAULT_US_MONEY */
113 public static final String RX_US_MONEY = RX_DEFAULT_US_MONEY;
114 /** -illion pattern; constant alias of RX_DEFAULT_ILLION */
115 public static final String RX_ILLION = RX_DEFAULT_ILLION;
116 /** digits2dash (e.g. 999-999-999) pattern; constant alias of RX_DEFAULT_DIGITS2DASH */
117 public static final String RX_DIGITS2DASH = RX_DEFAULT_DIGITS2DASH;
118 /** digits/digits (e.g. 999/999) pattern; constant alias of RX_DEFAULT_DIGITSSLASHDIGITS */
119 public static final String RX_DIGITSSLASHDIGITS = RX_DEFAULT_DIGITSSLASHDIGITS;
120 /** number time pattern; constant alias of RX_DEFAULT_NUMBER_TIME */
121 public static final String RX_NUMBER_TIME = RX_DEFAULT_NUMBER_TIME;
122 /** Roman numerals pattern; constant alias of RX_DEFAULT_ROMAN_NUMBER */
123 public static final String RX_ROMAN_NUMBER = RX_DEFAULT_ROMAN_NUMBER;
124 /** drst "Dr. St" pattern; constant alias of RX_DEFAULT_DRST */
125 public static final String RX_DRST = RX_DEFAULT_DRST;
126 /** numess pattern (digits followed by "s"); constant alias of RX_DEFAULT_NUMESS */
127 public static final String RX_NUMESS = RX_DEFAULT_NUMESS;
128 /** 7-digit phone number pattern; constant alias of RX_DEFAULT_SEVEN_DIGIT_PHONE_NUMBER */
129 public static final String RX_SEVEN_DIGIT_PHONE_NUMBER = RX_DEFAULT_SEVEN_DIGIT_PHONE_NUMBER;
130 /** 4-digit number pattern; constant alias of RX_DEFAULT_FOUR_DIGIT */
131 public static final String RX_FOUR_DIGIT = RX_DEFAULT_FOUR_DIGIT;
132 /** 3-digit number pattern; constant alias of RX_DEFAULT_THREE_DIGIT */
133 public static final String RX_THREE_DIGIT = RX_DEFAULT_THREE_DIGIT;
136 // the following symbols are from lang/usenglish/us_text.c
138 /** punctuation symbols: a plain list of characters (quotes, backquote, . , : ; ! ? and brackets), not a regular expression */
139 public static final String PUNCTUATION_SYMBOLS = "\"'`.,:;!?(){}[]";
140 /** pre-punctuation symbols: a plain list of characters (quotes, backquote and opening brackets), not a regular expression */
141 public static final String PREPUNCTUATION_SYMBOLS = "\"'`({[";
142 /** single char symbols: a plain list of characters, empty here */
143 public static final String SINGLE_CHAR_SYMBOLS = "";
144 /** whitespace symbols: a plain list of characters (space, tab, newline, carriage return), not a regular expression */
145 public static final String WHITESPACE_SYMBOLS = " \t\n\r";
/** Private no-argument constructor with an empty body, so code outside this class cannot call it. */
151 private USEnglish() {}