2 * Portions Copyright 2001 Sun Microsystems, Inc.
3 * Portions Copyright 1999-2001 Language Technologies Institute,
4 * Carnegie Mellon University.
5 * All Rights Reserved. Use is subject to license terms.
7 * See the file "license.terms" for information on usage and
8 * redistribution of this file, and for a DISCLAIMER OF ALL
11 package com.sun.speech.freetts.lexicon;
import com.sun.speech.freetts.util.BulkTimer;
import com.sun.speech.freetts.util.Utilities;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import java.net.MalformedURLException;
import java.net.URL;

import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
/**
 * Provides an implementation of a Lexicon.
 *
 * <p>This implementation will either read from a straight ASCII file
 * or a binary file.  When reading from an ASCII file, you can specify
 * when the input line is tokenized:  load, lookup, or never.  If you
 * specify 'load', the entire file will be parsed when it is loaded.
 * If you specify 'lookup', the file will be loaded, but the parsing
 * for each line will be delayed until it is referenced and the parsed
 * form will be saved away.  If you specify 'never', the lines will
 * be parsed each time they are referenced.  The default is 'never'.  To
 * specify the load type, set the system property as follows:
 *
 * <pre>
 *   -Dcom.sun.speech.freetts.lexicon.LexTokenize=load
 * </pre>
 *
 * <p>If a binary file is used, you can also specify whether the new
 * IO package is used.  The new IO package is new for JDK1.4, and can
 * greatly improve the speed of loading files.  To enable new IO, use
 * the following system property (it is enabled by default):
 *
 * <pre>
 *   -Dcom.sun.speech.freetts.useNewIO=true
 * </pre>
 *
 * <p>The implementation also allows users to define their own addenda
 * that will be used in addition to the system addenda.  If the user
 * defines their own addenda, its values will be added to the system
 * addenda, overriding any existing elements in the system addenda.
 * To define a user addenda, the user needs to set the following
 * system property:
 *
 * <pre>
 *   -Dcom.sun.speech.freetts.lexicon.userAddenda=&lt;URLToUserAddenda&gt;
 * </pre>
 *
 * Where &lt;URLToUserAddenda&gt; is a URL pointing to an ASCII file
 * containing addenda entries.
 *
 * <p>[[[TODO: support multiple homographs with the same part of speech.]]]
 */
86 abstract public class LexiconImpl implements Lexicon {
/**
 * If true, each phone string is converted to a phone array while the
 * text lexicon is being loaded.  The side effects of this are quicker
 * lookups, but more memory usage and a longer load.
 * Set when the <code>com.sun.speech.freetts.lexicon.LexTokenize</code>
 * system property is <code>load</code>.
 */
protected boolean tokenizeOnLoad = false;

/**
 * If true, the phone string stored in the map is replaced with the
 * phone array the first time the entry is looked up.
 * Set when the <code>com.sun.speech.freetts.lexicon.LexTokenize</code>
 * system property is <code>lookup</code>.
 */
protected boolean tokenizeOnLookup = false;

/**
 * Magic number for binary Lexicon files.
 */
private final static int MAGIC = 0xBABB1E;

/**
 * Current binary file version.
 */
private final static int VERSION = 1;

/**
 * URL for the compiled form.
 */
private URL compiledURL;

/**
 * URL for the addenda.
 */
private URL addendaURL;

/**
 * URL for the letter to sound rules.
 */
private URL letterToSoundURL;

/**
 * The addenda.  Created by <code>load()</code>; consulted before the
 * compiled lexicon on lookup.
 */
private Map addenda;

/**
 * The compiled lexicon.
 */
private Map compiled;

/**
 * The LetterToSound rules.
 */
private LetterToSound letterToSound = null;

/**
 * Parts of speech seen so far (one-character suffixes of the keys).
 */
private ArrayList partsOfSpeech = new ArrayList();

/**
 * A static directory of compiledURL URL objects and associated
 * already-loaded compiled Map objects.  This is used to share
 * the immutable compiled lexicons between lexicon instances.
 * As the addenda can be changed using <code>addAddendum()</code>
 * and <code>removeAddendum</code>, each lexicon instance has its
 * own addenda.
 */
private static Map loadedCompiledLexicons;

/**
 * Loaded State of the lexicon
 */
private boolean loaded = false;

/**
 * Type of lexicon to load
 */
private boolean binary = false;

/**
 * No phones for this word.
 */
final static private String[] NO_PHONES = new String[0];

/**
 * Temporary place holder, shared by the <code>getString</code>
 * readers (which is why they are not re-entrant).
 */
private char charBuffer[] = new char[128];

/**
 * Use the new IO package?
 */
private boolean useNewIO =
    Utilities.getProperty("com.sun.speech.freetts.useNewIO",
                          "true").equals("true");
/**
 * Creates a new LexiconImpl that will read from the given URLs.
 * Nothing is read here; the URLs are only recorded.
 *
 * @param compiledURL a URL pointing to the compiled lexicon
 * @param addendaURL a URL pointing to lexicon addenda
 * @param letterToSoundURL a URL pointing to the letter-to-sound rules
 *        to use if a word cannot be found in the compiled form or
 *        the addenda
 * @param binary if <code>true</code>, the input streams are binary;
 *        otherwise, they are text.
 */
public LexiconImpl(URL compiledURL, URL addendaURL,
                   URL letterToSoundURL,
                   boolean binary) {
    // Run the no-arg constructor first so the tokenize flags are set.
    this();
    setLexiconParameters(compiledURL, addendaURL, letterToSoundURL, binary);
}
/**
 * Class constructor for an empty Lexicon.  Reads the
 * <code>com.sun.speech.freetts.lexicon.LexTokenize</code> system
 * property to decide when phone strings are converted to arrays.
 */
public LexiconImpl() {
    // Find out when to convert the phone string into an array.
    // Any value other than "load" or "lookup" leaves both flags false.
    String tokenize =
        Utilities.getProperty("com.sun.speech.freetts.lexicon.LexTokenize",
                              "never");
    tokenizeOnLoad = tokenize.equals("load");
    tokenizeOnLookup = tokenize.equals("lookup");
}
217 * Sets the lexicon parameters
218 * @param compiledURL a URL pointing to the compiled lexicon
219 * @param addendaURL a URL pointing to lexicon addenda
220 * @param letterToSoundURL a URL pointing to the LetterToSound to use
221 * @param binary if <code>true</code>, the input streams are binary;
222 * otherwise, they are text.
224 protected void setLexiconParameters(URL compiledURL,
226 URL letterToSoundURL,
228 this.compiledURL = compiledURL;
229 this.addendaURL = addendaURL;
230 this.letterToSoundURL = letterToSoundURL;
231 this.binary = binary;
235 * Determines if this lexicon is loaded.
237 * @return <code>true</code> if the lexicon is loaded
239 public boolean isLoaded() {
244 * Loads the data for this lexicon. If the
246 * @throws IOException if errors occur during loading
248 public void load() throws IOException {
249 BulkTimer.LOAD.start("Lexicon");
251 if (compiledURL == null) {
252 throw new IOException("Can't load lexicon");
255 if (addendaURL == null) {
256 throw new IOException("Can't load lexicon addenda " );
259 if (loadedCompiledLexicons == null) {
260 loadedCompiledLexicons = new HashMap();
262 if (!loadedCompiledLexicons.containsKey(compiledURL)) {
263 InputStream compiledIS = Utilities.getInputStream(compiledURL);
264 if (compiledIS == null) {
265 throw new IOException("Can't load lexicon from " + compiledURL);
267 Map newCompiled = createLexicon(compiledIS, binary, 65000);
268 loadedCompiledLexicons.put(compiledURL, newCompiled);
271 compiled = Collections.unmodifiableMap((Map)loadedCompiledLexicons.get(compiledURL));
273 InputStream addendaIS = Utilities.getInputStream(addendaURL);
274 if (addendaIS == null) {
275 throw new IOException("Can't load lexicon addenda from "
279 // [[[TODO: what is the best way to derive the estimated sizes?]]]
281 addenda = createLexicon(addendaIS, binary, 50);
284 /* Load the user-defined addenda and override any existing
285 * entries in the system addenda.
287 String userAddenda = Utilities.getProperty(
288 "com.sun.speech.freetts.lexicon.userAddenda", null);
289 if (userAddenda != null) {
291 URL userAddendaURL = new URL(userAddenda);
292 InputStream userAddendaIS = Utilities.getInputStream(
294 if (userAddendaIS == null) {
295 throw new IOException("Can't load user addenda from "
298 Map tmpAddenda = createLexicon(userAddendaIS, false, 50);
299 userAddendaIS.close();
300 for (Iterator keys = tmpAddenda.keySet().iterator();
302 Object key = keys.next();
303 addenda.put(key, tmpAddenda.get(key));
305 } catch (MalformedURLException e) {
306 throw new IOException("User addenda URL is malformed: " +
312 BulkTimer.LOAD.stop("Lexicon");
313 letterToSound = new LetterToSoundImpl(letterToSoundURL, binary);
317 * Reads the given input stream as lexicon data and returns the
318 * results in a <code>Map</code>.
320 * @param is the input stream
321 * @param binary if <code>true</code>, the data is binary
322 * @param estimatedSize the estimated size of the lexicon
324 * @throws IOException if errors are encountered while reading the data
326 protected Map createLexicon(InputStream is,
331 if (useNewIO && is instanceof FileInputStream) {
332 FileInputStream fis = (FileInputStream) is;
333 return loadMappedBinaryLexicon(fis, estimatedSize);
335 DataInputStream dis = new DataInputStream(
336 new BufferedInputStream(is));
337 return loadBinaryLexicon(dis, estimatedSize);
340 return loadTextLexicon(is, estimatedSize);
345 * Reads the given input stream as text lexicon data and returns the
346 * results in a <code>Map</code>.
348 * @param is the input stream
349 * @param estimatedSize the estimated number of entries of the lexicon
351 * @throws IOException if errors are encountered while reading the data
353 protected Map loadTextLexicon(InputStream is, int estimatedSize)
355 Map lexicon = new LinkedHashMap(estimatedSize * 4 / 3);
356 BufferedReader reader = new BufferedReader(new InputStreamReader(is));
359 line = reader.readLine();
360 while (line != null) {
361 if (!line.startsWith("***")) {
362 parseAndAdd(lexicon, line);
364 line = reader.readLine();
370 * Creates a word from the given input line and add it to the lexicon.
372 * @param lexicon the lexicon
373 * @param line the input text
375 protected void parseAndAdd(Map lexicon, String line) {
376 StringTokenizer tokenizer = new StringTokenizer(line,"\t");
377 String phones = null;
379 String wordAndPos = tokenizer.nextToken();
380 String pos = wordAndPos.substring(wordAndPos.length() - 1);
381 if (!partsOfSpeech.contains(pos)) {
382 partsOfSpeech.add(pos);
384 if (tokenizer.hasMoreTokens()) {
385 phones = tokenizer.nextToken();
387 if ((phones != null) && (tokenizeOnLoad)) {
388 lexicon.put(wordAndPos, getPhones(phones));
389 } else if (phones == null) {
390 lexicon.put(wordAndPos, NO_PHONES);
392 lexicon.put(wordAndPos, phones);
397 * Gets the phone list for a given word. If a phone list cannot
398 * be found, returns <code>null</code>. The format is lexicon
399 * dependent. If the part of speech does not matter, pass in
402 * @param word the word to find
403 * @param partOfSpeech the part of speech
405 * @return the list of phones for word or <code>null</code>
407 public String[] getPhones(String word, String partOfSpeech) {
408 return getPhones(word, partOfSpeech, true);
412 * Gets the phone list for a given word. If a phone list cannot
413 * be found, <code>null</code> is returned. The
414 * <code>partOfSpeech</code> is implementation dependent, but
415 * <code>null</code> always matches.
417 * @param word the word to find
418 * @param partOfSpeech the part of speech or <code>null</code>
419 * @param useLTS whether to use the letter-to-sound rules when
420 * the word is not in the lexicon.
422 * @return the list of phones for word or null
424 public String[] getPhones
425 (String word, String partOfSpeech, boolean useLTS){
426 String[] phones = null;
427 phones = getPhones(addenda, word, partOfSpeech);
428 if (phones == null) {
429 phones = getPhones(compiled, word, partOfSpeech);
432 if (phones == null && letterToSound != null) {
433 phones = letterToSound.getPhones(word, partOfSpeech);
437 String[] copy = new String[phones.length];
438 System.arraycopy(phones, 0, copy, 0, phones.length);
445 * Gets a phone list for a word from a given lexicon. If a phone
446 * list cannot be found, returns <code>null</code>. The format is
447 * lexicon dependent. If the part of speech does not matter, pass
448 * in <code>null</code>.
450 * @param lexicon the lexicon
451 * @param word the word to find
452 * @param partOfSpeech the part of speech
454 * @return the list of phones for word or <code>null</code>
456 protected String[] getPhones(Map lexicon,
458 String partOfSpeech) {
460 partOfSpeech = fixPartOfSpeech(partOfSpeech);
461 phones = getPhones(lexicon, word+partOfSpeech);
463 (i < partsOfSpeech.size()) && (phones == null);
465 if (!partOfSpeech.equals((String) partsOfSpeech.get(i))) {
466 phones = getPhones(lexicon,
467 word + (String) partsOfSpeech.get(i));
474 * Gets a phone list for a word from a given lexicon. If a phone
475 * list cannot be found, returns <code>null</code>.
477 * @param lexicon the lexicon
478 * @param wordAndPartOfSpeech word and part of speech concatenated
481 * @return the list of phones for word or <code>null</code>
483 protected String[] getPhones(Map lexicon,
484 String wordAndPartOfSpeech) {
485 Object value = lexicon.get(wordAndPartOfSpeech);
486 if (value instanceof String[]) {
487 return (String[]) value;
488 } else if (value instanceof String) {
490 phoneArray = getPhones((String) value);
491 if (tokenizeOnLookup) {
492 lexicon.put(wordAndPartOfSpeech, phoneArray);
501 * Turns the phone <code>String</code> into a <code>String[]</code>,
502 * using " " as the delimiter.
504 * @param phones the phones
506 * @return the phones split into an array
508 protected String[] getPhones(String phones) {
509 ArrayList phoneList = new ArrayList();
510 StringTokenizer tokenizer = new StringTokenizer(phones, " ");
511 while (tokenizer.hasMoreTokens()) {
512 phoneList.add(tokenizer.nextToken());
514 return (String[]) phoneList.toArray(new String[0]);
518 * Adds a word to the addenda.
520 * @param word the word to find
521 * @param partOfSpeech the part of speech
522 * @param phones the phones for the word
525 public void addAddendum(String word,
528 String pos = fixPartOfSpeech(partOfSpeech);
529 if (!partsOfSpeech.contains(pos)) {
530 partsOfSpeech.add(pos);
532 addenda.put(word + pos, phones);
536 * Removes a word from the addenda.
538 * @param word the word to remove
539 * @param partOfSpeech the part of speech
541 public void removeAddendum(String word, String partOfSpeech) {
542 addenda.remove(word + fixPartOfSpeech(partOfSpeech));
546 * Outputs a string to a data output stream.
548 * @param dos the data output stream
549 * @param s the string to output
551 * @throws IOException if errors occur during writing
553 private void outString(DataOutputStream dos, String s)
555 dos.writeByte((byte) s.length());
556 for (int i = 0; i < s.length(); i++) {
557 dos.writeChar(s.charAt(i));
562 * Inputs a string from a DataInputStream. This method is not re-entrant.
564 * @param dis the data input stream
568 * @throws IOException if errors occur during reading
570 private String getString(DataInputStream dis) throws IOException {
571 int size = dis.readByte();
572 for (int i = 0; i < size; i++) {
573 charBuffer[i] = dis.readChar();
575 return new String(charBuffer, 0, size);
579 * Inputs a string from a DataInputStream. This method is not re-entrant.
581 * @param bb the input byte buffer
585 * @throws IOException if errors occur during reading
587 private String getString(ByteBuffer bb) throws IOException {
589 for (int i = 0; i < size; i++) {
590 charBuffer[i] = bb.getChar();
592 return new String(charBuffer, 0, size);
597 * Dumps a binary form of the database. This method is not thread-safe.
599 * <p>Binary format is:
608 * (String) nameWithPOS
615 * <p>Strings are formatted as: <code>(byte) len char0 char1 charN</code>
617 * <p>Limits: Strings: 128 chars
618 * <p>Limits: Strings: 128 phonemes per word
620 * @param lexicon the lexicon to dump
621 * @param path the path to dump the file to
623 private void dumpBinaryLexicon(Map lexicon, String path) {
625 FileOutputStream fos = new FileOutputStream(path);
626 DataOutputStream dos = new DataOutputStream(new
627 BufferedOutputStream(fos));
628 List phonemeList = findPhonemes(lexicon);
631 dos.writeInt(VERSION);
632 dos.writeInt(phonemeList.size());
634 for (int i = 0; i < phonemeList.size(); i++) {
635 outString(dos, (String) phonemeList.get(i));
638 dos.writeInt(lexicon.keySet().size());
639 for (Iterator i = lexicon.keySet().iterator(); i.hasNext(); ) {
640 String key = (String) i.next();
642 String[] phonemes = getPhones(lexicon, key);
643 dos.writeByte((byte) phonemes.length);
644 for (int index = 0; index < phonemes.length; index++) {
645 int phonemeIndex = phonemeList.indexOf(phonemes[index]);
646 if (phonemeIndex == -1) {
647 throw new Error("Can't find phoneme index");
649 dos.writeByte((byte) phonemeIndex);
653 } catch (FileNotFoundException fe) {
654 throw new Error("Can't dump binary database " +
656 } catch (IOException ioe) {
657 throw new Error("Can't write binary database " +
663 * Loads the binary lexicon from the given InputStream.
664 * This method is not thread safe.
666 * @param is the InputStream to load the database from
667 * @param estimatedSize estimate of how large the database is
669 * @return a <code>Map</code> containing the lexicon
671 * @throws IOException if an IO error occurs
673 private Map loadMappedBinaryLexicon(FileInputStream is, int estimatedSize)
675 FileChannel fc = is.getChannel();
677 MappedByteBuffer bb = fc.map(FileChannel.MapMode.READ_ONLY,
682 List phonemeList = new ArrayList();
684 // we get better performance for some reason if we
685 // just ignore estimated size
687 // Map lexicon = new HashMap();
688 Map lexicon = new LinkedHashMap(estimatedSize * 4 / 3);
690 if (bb.getInt() != MAGIC) {
691 throw new Error("bad magic number in lexicon");
694 if (bb.getInt() != VERSION) {
695 throw new Error("bad version number in lexicon");
699 for (int i = 0; i < size; i++) {
700 String phoneme = getString(bb);
701 phonemeList.add(phoneme);
703 numEntries = bb.getInt();
705 for (int i = 0; i < numEntries; i++) {
706 String wordAndPos = getString(bb);
707 String pos = Character.toString(
708 wordAndPos.charAt(wordAndPos.length() - 1));
709 if (!partsOfSpeech.contains(pos)) {
710 partsOfSpeech.add(pos);
713 int numPhonemes = bb.get();
714 String[] phonemes = new String[numPhonemes];
716 for (int j = 0; j < numPhonemes; j++) {
717 phonemes[j] = (String) phonemeList.get(bb.get());
719 lexicon.put(wordAndPos, phonemes);
726 * Loads the binary lexicon from the given InputStream.
727 * This method is not thread safe.
729 * @param is the InputStream to load the database from
730 * @param estimatedSize estimate of how large the database is
732 * @return a <code>Map</code> containing the lexicon
734 * @throws IOException if an IO error occurs
736 private Map loadBinaryLexicon(InputStream is, int estimatedSize)
738 DataInputStream dis = new DataInputStream(new
739 BufferedInputStream(is));
742 List phonemeList = new ArrayList();
744 // we get better performance for some reason if we
745 // just ignore estimated size
747 Map lexicon = new LinkedHashMap();
749 if (dis.readInt() != MAGIC) {
750 throw new Error("bad magic number in lexicon");
753 if (dis.readInt() != VERSION) {
754 throw new Error("bad version number in lexicon");
757 size = dis.readInt();
758 for (int i = 0; i < size; i++) {
759 String phoneme = getString(dis);
760 phonemeList.add(phoneme);
762 numEntries = dis.readInt();
764 for (int i = 0; i < numEntries; i++) {
765 String wordAndPos = getString(dis);
766 String pos = Character.toString(
767 wordAndPos.charAt(wordAndPos.length() - 1));
768 if (!partsOfSpeech.contains(pos)) {
769 partsOfSpeech.add(pos);
772 int numPhonemes = dis.readByte();
773 String[] phonemes = new String[numPhonemes];
775 for (int j = 0; j < numPhonemes; j++) {
776 phonemes[j] = (String) phonemeList.get(dis.readByte());
778 lexicon.put(wordAndPos, phonemes);
785 * Dumps this lexicon (just the compiled form). Lexicon will be
786 * dumped to two binary files PATH_compiled.bin and
789 * @param path the root path to dump it to
791 public void dumpBinary(String path) {
792 String compiledPath = path + "_compiled.bin";
793 String addendaPath = path + "_addenda.bin";
795 dumpBinaryLexicon(compiled, compiledPath);
796 dumpBinaryLexicon(addenda, addendaPath);
800 * Returns a list of the unique phonemes in the lexicon.
802 * @param lexicon the lexicon of interest
804 * @return list the unique set of phonemes
806 private List findPhonemes(Map lexicon) {
807 List phonemeList = new ArrayList();
808 for (Iterator i = lexicon.keySet().iterator(); i.hasNext(); ) {
809 String key = (String) i.next();
810 String[] phonemes = getPhones(lexicon, key);
811 for (int index = 0; index < phonemes.length; index++) {
812 if (!phonemeList.contains(phonemes[index])) {
813 phonemeList.add(phonemes[index]);
822 * Tests to see if this lexicon is identical to the other for
823 * debugging purposes.
825 * @param other the other lexicon to compare to
827 * @return true if lexicons are identical
829 public boolean compare(LexiconImpl other) {
830 return compare(addenda, other.addenda) &&
831 compare(compiled, other.compiled);
835 * Determines if the two lexicons are identical for debugging purposes.
837 * @param lex this lex
838 * @param other the other lexicon to chd
840 * @return true if they are identical
842 private boolean compare(Map lex, Map other) {
843 for (Iterator i = lex.keySet().iterator(); i.hasNext(); ) {
844 String key = (String) i.next();
845 String[] thisPhonemes = getPhones(lex, key);
846 String[] otherPhonemes = getPhones(other, key);
847 if (thisPhonemes == null) {
848 System.out.println(key + " not found in this.");
850 } else if (otherPhonemes == null) {
851 System.out.println(key + " not found in other.");
853 } else if (thisPhonemes.length == otherPhonemes.length) {
854 for (int j = 0; j < thisPhonemes.length; j++) {
855 if (!thisPhonemes[j].equals(otherPhonemes[j])) {
867 * Fixes the part of speech if it is <code>null</code>. The
868 * default representation of a <code>null</code> part of speech
871 static protected String fixPartOfSpeech(String partOfSpeech) {
872 return (partOfSpeech == null) ? "0" : partOfSpeech;