git.gag.com Git - debian/freetts/blob - com/sun/speech/freetts/lexicon/LexiconImpl.java

   1 /**
   2  * Portions Copyright 2001 Sun Microsystems, Inc.
   3  * Portions Copyright 1999-2001 Language Technologies Institute,
   4  * Carnegie Mellon University.
   5  * All Rights Reserved.  Use is subject to license terms.
   6  *
   7  * See the file "license.terms" for information on usage and
   8  * redistribution of this file, and for a DISCLAIMER OF ALL
   9  * WARRANTIES.
  10  */
  11 package com.sun.speech.freetts.lexicon;
  12
  13 import com.sun.speech.freetts.util.Utilities;
  14 import com.sun.speech.freetts.util.BulkTimer;
  15
  16 import java.io.BufferedInputStream;
  17 import java.io.BufferedOutputStream;
  18 import java.io.BufferedReader;
  19 import java.io.DataInputStream;
  20 import java.io.DataOutputStream;
  21 import java.io.FileInputStream;
  22 import java.io.FileNotFoundException;
  23 import java.io.FileOutputStream;
  24 import java.io.InputStream;
  25 import java.io.InputStreamReader;
  26 import java.io.IOException;
  27 import java.nio.channels.FileChannel;
  28 import java.nio.ByteBuffer;
  29 import java.nio.MappedByteBuffer;
  30
  31 import java.net.MalformedURLException;
  32 import java.net.URL;
  33
  34 import java.util.ArrayList;
  35 import java.util.Collections;
  36 import java.util.HashMap;
  37 import java.util.LinkedHashMap;
  38 import java.util.Iterator;
  39 import java.util.List;
  40 import java.util.Map;
  41 import java.util.Set;
  42 import java.util.StringTokenizer;
  43
  44 /**
  45  * Provides an implementation of a Lexicon.
  46  *
  47  * <p>This implementation will either read from a straight ASCII file
  48  * or a binary file.  When reading from an ASCII file, you can specify
  49  * when the input line is tokenized:  load, lookup, or never.  If you
  50  * specify 'load', the entire file will be parsed when it is loaded.
  51  * If you specify 'lookup', the file will be loaded, but the parsing
  52  * for each line will be delayed until it is referenced and the parsed
  53  * form will be saved away.  If you specify 'never', the lines will be
  54  * parsed each time they are referenced.  The default is 'never'.  To
  55  * specify the load type, set the system property as follows:
  56  *
  57  * <pre>
  58  *   -Dcom.sun.speech.freetts.lexicon.LexTokenize=load
  59  * </pre>
  60  *
  61  * <p>If a binary file is used, you can also specify whether the new
  62  * IO package is used.  The new IO package is new for JDK1.4, and can
  63  * greatly improve the speed of loading files.  To enable new IO, use
  64  * the following system property (it is enabled by default):
  65  *
  66  * <pre>
  67  *   -Dcom.sun.speech.freetts.useNewIO=true
  68  * </pre>
  69  *
  70  * <p>The implementation also allows users to define their own addenda
  71  * that will be used in addition to the system addenda.  If the user
  72  * defines their own addenda, its values will be added to the system
  73  * addenda, overriding any existing elements in the system addenda.
  74  * To define a user addenda, the user needs to set the following
  75  * property:
  76  *
  77  * <pre>
  78  *   -Dcom.sun.speech.freetts.lexicon.userAddenda=&lt;URLToUserAddenda>
  79  * </pre>
  80  *
  81  * Where &lt;URLToUserAddenda> is a URL pointing to an ASCII file
  82  * containing addenda entries.
  83  *
  84  * <p>[[[TODO: support multiple homographs with the same part of speech.]]]
  85  */
  86 abstract public class LexiconImpl implements Lexicon {
  87     /**
  88      * If true, the phone string is replaced with the phone array in
  89      * the hashmap when the phone array is loaded.  The side effects
  90      * of this are quicker lookups, but more memory usage and a longer
  91      * startup time.
  92      */
  93     protected boolean tokenizeOnLoad = false;
  94
  95     /**
  96      * If true, the phone string is replaced with the phone array in
  97      * the hashmap when the phone array is first looked up.  The side effects
  98      * Set by cmufilelex.tokenize=lookup.
  99      */
 100     protected boolean tokenizeOnLookup = false;
 101
 102     /**
 103      * Magic number for binary Lexicon files.
 104      */
 105     private final static int MAGIC = 0xBABB1E;
 106
 107     /**
 108      * Current binary file version.
 109      */
 110     private final static int VERSION = 1;
 111
 112     /**
 113      * URL for the compiled form.
 114      */
 115     private URL compiledURL;
 116
 117     /**
 118      * URL for the addenda.
 119      */
 120     private URL addendaURL;
 121
 122     /**
 123      * URL for the letter to sound rules.
 124      */
 125     private URL letterToSoundURL;
 126
 127     /**
 128      * The addenda.
 129      */
 130     private Map addenda;
 131
 132     /**
 133      * The compiled lexicon.
 134      */
 135     private Map compiled;
 136
 137     /**
 138      * The LetterToSound rules.
 139      */
 140     private LetterToSound letterToSound = null;
 141
 142     /**
 143      * Parts of Speech.
 144      */
 145     private ArrayList partsOfSpeech = new ArrayList();
 146
 147     /**
 148      * A static directory of compiledURL URL objects and associated
 149      * already-loaded compiled Map objects. This is used to share
 150      * the immutable compiled lexicons between lexicon instances.
 151      * As the addenda can be changed using <code>addAddendum()</code>
 152      * and <code>removeAddendum</code>, each lexicon instance has its
 153      * own addenda.
 154      */
 155     private static Map loadedCompiledLexicons;
 156
 157
 158
 159     /**
 160      * Loaded State of the lexicon
 161      */
 162     private boolean loaded = false;
 163
 164     /**
 165      * Type of lexicon to load
 166      */
 167     private boolean binary = false;
 168
 169     /**
 170      * No phones for this word.
 171      */
 172     final static private String[] NO_PHONES = new String[0];
 173
 174     /**
 175      * Temporary place holder.
 176      */
 177     private char charBuffer[] = new char[128];
 178
 179     /**
 180      * Use the new IO package?
 181      */
 182     private boolean useNewIO =
 183         Utilities.getProperty("com.sun.speech.freetts.useNewIO",
 184                 "true").equals("true");
 185
 186     /**
 187      * Create a new LexiconImpl by reading from the given URLS.
 188      *
 189      * @param compiledURL a URL pointing to the compiled lexicon
 190      * @param addendaURL a URL pointing to lexicon addenda
 191      * @param letterToSoundURL a LetterToSound to use if a word cannot
 192      *   be found in the compiled form or the addenda
 193      * @param binary if <code>true</code>, the input streams are binary;
 194      *   otherwise, they are text.
 195      */
 196     public LexiconImpl(URL compiledURL, URL addendaURL,
 197                        URL letterToSoundURL,
 198                        boolean binary) {
 199         this();
 200         setLexiconParameters(compiledURL, addendaURL, letterToSoundURL, binary);
 201     }
 202
 203     /**
 204      * Class constructor for an empty Lexicon.
 205      */
 206     public LexiconImpl() {
 207         // Find out when to convert the phone string into an array.
 208         //
 209         String tokenize =
 210             Utilities.getProperty("com.sun.speech.freetts.lexicon.LexTokenize",
 211                                "never");
 212         tokenizeOnLoad = tokenize.equals("load");
 213         tokenizeOnLookup = tokenize.equals("lookup");
 214     }
 215
 216     /**
 217      * Sets the lexicon parameters
 218      * @param compiledURL a URL pointing to the compiled lexicon
 219      * @param addendaURL a URL pointing to lexicon addenda
 220      * @param letterToSoundURL a URL pointing to the LetterToSound to use
 221      * @param binary if <code>true</code>, the input streams are binary;
 222      *   otherwise, they are text.
 223      */
 224     protected void setLexiconParameters(URL compiledURL,
 225                                         URL addendaURL,
 226                                         URL letterToSoundURL,
 227                                         boolean binary) {
 228         this.compiledURL = compiledURL;
 229         this.addendaURL = addendaURL;
 230         this.letterToSoundURL = letterToSoundURL;
 231         this.binary = binary;
 232     }
 233
 234     /**
 235      * Determines if this lexicon is loaded.
 236      *
 237      * @return <code>true</code> if the lexicon is loaded
 238      */
 239     public boolean isLoaded() {
 240         return loaded;
 241     }
 242
 243     /**
 244      * Loads the data for this lexicon.  If the
 245      *
 246      * @throws IOException if errors occur during loading
 247      */
 248     public void load() throws IOException {
 249         BulkTimer.LOAD.start("Lexicon");
 250
 251         if (compiledURL == null) {
 252             throw new IOException("Can't load lexicon");
 253         }
 254
 255         if (addendaURL == null) {
 256             throw new IOException("Can't load lexicon addenda " );
 257         }
 258
 259         if (loadedCompiledLexicons == null) {
 260             loadedCompiledLexicons = new HashMap();
 261         }
 262         if (!loadedCompiledLexicons.containsKey(compiledURL)) {
 263                 InputStream compiledIS = Utilities.getInputStream(compiledURL);
 264                 if (compiledIS == null) {
 265                     throw new IOException("Can't load lexicon from " + compiledURL);
 266                 }
 267                 Map newCompiled = createLexicon(compiledIS, binary, 65000);
 268         loadedCompiledLexicons.put(compiledURL, newCompiled);
 269         compiledIS.close();
 270         }
 271         compiled = Collections.unmodifiableMap((Map)loadedCompiledLexicons.get(compiledURL));
 272
 273         InputStream addendaIS = Utilities.getInputStream(addendaURL);
 274         if (addendaIS == null) {
 275             throw new IOException("Can't load lexicon addenda from "
 276                     + addendaURL);
 277         }
 278
 279         // [[[TODO: what is the best way to derive the estimated sizes?]]]
 280         //
 281         addenda = createLexicon(addendaIS, binary, 50);
 282         addendaIS.close();
 283
 284         /* Load the user-defined addenda and override any existing
 285          * entries in the system addenda.
 286          */
 287         String userAddenda = Utilities.getProperty(
 288             "com.sun.speech.freetts.lexicon.userAddenda", null);
 289         if (userAddenda != null) {
 290             try {
 291                 URL userAddendaURL = new URL(userAddenda);
 292                 InputStream userAddendaIS = Utilities.getInputStream(
 293                     userAddendaURL);
 294                 if (userAddendaIS == null) {
 295                     throw new IOException("Can't load user addenda from "
 296                                           + userAddenda);
 297                 }
 298                 Map tmpAddenda = createLexicon(userAddendaIS, false, 50);
 299                 userAddendaIS.close();
 300                 for (Iterator keys = tmpAddenda.keySet().iterator();
 301                      keys.hasNext();) {
 302                     Object key = keys.next();
 303                     addenda.put(key, tmpAddenda.get(key));
 304                 }
 305             } catch (MalformedURLException e) {
 306                 throw new IOException("User addenda URL is malformed: " +
 307                                       userAddenda);
 308             }
 309         }
 310
 311         loaded = true;
 312         BulkTimer.LOAD.stop("Lexicon");
 313         letterToSound = new LetterToSoundImpl(letterToSoundURL, binary);
 314     }
 315
 316     /**
 317      * Reads the given input stream as lexicon data and returns the
 318      * results in a <code>Map</code>.
 319      *
 320      * @param is the input stream
 321      * @param binary if <code>true</code>, the data is binary
 322      * @param estimatedSize the estimated size of the lexicon
 323      *
 324      * @throws IOException if errors are encountered while reading the data
 325      */
 326     protected Map createLexicon(InputStream is,
 327                                 boolean binary,
 328                                 int estimatedSize)
 329         throws IOException {
 330         if (binary) {
 331             if (useNewIO && is instanceof FileInputStream) {
 332                 FileInputStream fis = (FileInputStream) is;
 333                 return loadMappedBinaryLexicon(fis, estimatedSize);
 334             } else {
 335                 DataInputStream dis = new DataInputStream(
 336                         new BufferedInputStream(is));
 337                 return loadBinaryLexicon(dis, estimatedSize);
 338             }
 339         }  else {
 340             return loadTextLexicon(is, estimatedSize);
 341         }
 342     }
 343
 344     /**
 345      * Reads the given input stream as text lexicon data and returns the
 346      * results in a <code>Map</code>.
 347      *
 348      * @param is the input stream
 349      * @param estimatedSize the estimated number of entries of the lexicon
 350      *
 351      * @throws IOException if errors are encountered while reading the data
 352      */
 353     protected Map loadTextLexicon(InputStream is, int estimatedSize)
 354         throws IOException {
 355         Map lexicon = new LinkedHashMap(estimatedSize * 4 / 3);
 356         BufferedReader reader = new BufferedReader(new InputStreamReader(is));
 357         String line;
 358
 359         line = reader.readLine();
 360         while (line != null) {
 361             if (!line.startsWith("***")) {
 362                 parseAndAdd(lexicon, line);
 363             }
 364             line = reader.readLine();
 365         }
 366         return lexicon;
 367     }
 368
 369     /**
 370      * Creates a word from the given input line and add it to the lexicon.
 371      *
 372      * @param lexicon the lexicon
 373      * @param line the input text
 374      */
 375     protected void parseAndAdd(Map lexicon, String line) {
 376         StringTokenizer tokenizer = new StringTokenizer(line,"\t");
 377         String phones = null;
 378
 379         String wordAndPos = tokenizer.nextToken();
 380         String pos = wordAndPos.substring(wordAndPos.length() - 1);
 381         if (!partsOfSpeech.contains(pos)) {
 382             partsOfSpeech.add(pos);
 383         }
 384         if (tokenizer.hasMoreTokens()) {
 385             phones = tokenizer.nextToken();
 386         }
 387         if ((phones != null) && (tokenizeOnLoad)) {
 388             lexicon.put(wordAndPos, getPhones(phones));
 389         } else if (phones == null) {
 390             lexicon.put(wordAndPos, NO_PHONES);
 391         } else {
 392             lexicon.put(wordAndPos, phones);
 393         }
 394     }
 395
 396     /**
 397      * Gets the phone list for a given word.  If a phone list cannot
 398      * be found, returns <code>null</code>.  The format is lexicon
 399      * dependent.  If the part of speech does not matter, pass in
 400      * <code>null</code>.
 401      *
 402      * @param word the word to find
 403      * @param partOfSpeech the part of speech
 404      *
 405      * @return the list of phones for word or <code>null</code>
 406      */
 407     public String[] getPhones(String word, String partOfSpeech) {
 408         return getPhones(word, partOfSpeech, true);
 409     }
 410
 411     /**
 412      * Gets the phone list for a given word.  If a phone list cannot
 413      * be found, <code>null</code> is returned.  The
 414      * <code>partOfSpeech</code> is implementation dependent, but
 415      * <code>null</code> always matches.
 416      *
 417      * @param word the word to find
 418      * @param partOfSpeech the part of speech or <code>null</code>
 419      * @param useLTS whether to use the letter-to-sound rules when
 420      *        the word is not in the lexicon.
 421      *
 422      * @return the list of phones for word or null
 423      */
 424     public String[] getPhones
 425                         (String word, String partOfSpeech, boolean useLTS){
 426         String[] phones = null;
 427         phones = getPhones(addenda, word, partOfSpeech);
 428         if (phones == null) {
 429             phones = getPhones(compiled, word, partOfSpeech);
 430         }
 431         if(useLTS){
 432             if (phones == null && letterToSound != null) {
 433                 phones = letterToSound.getPhones(word, partOfSpeech);
 434             }
 435         }
 436         if(phones != null){
 437         String[] copy = new String[phones.length];
 438         System.arraycopy(phones, 0, copy, 0, phones.length);
 439             return copy;
 440         }
 441         else return null;
 442
 443     }
 444     /**
 445      * Gets a phone list for a word from a given lexicon.  If a phone
 446      * list cannot be found, returns <code>null</code>.  The format is
 447      * lexicon dependent.  If the part of speech does not matter, pass
 448      * in <code>null</code>.
 449      *
 450      * @param lexicon the lexicon
 451      * @param word the word to find
 452      * @param partOfSpeech the part of speech
 453      *
 454      * @return the list of phones for word or <code>null</code>
 455      */
 456     protected String[] getPhones(Map lexicon,
 457                                  String word,
 458                                  String partOfSpeech) {
 459         String[] phones;
 460         partOfSpeech = fixPartOfSpeech(partOfSpeech);
 461         phones = getPhones(lexicon, word+partOfSpeech);
 462         for (int i = 0;
 463              (i < partsOfSpeech.size()) && (phones == null);
 464              i++) {
 465             if (!partOfSpeech.equals((String) partsOfSpeech.get(i))) {
 466                 phones = getPhones(lexicon,
 467                                    word + (String) partsOfSpeech.get(i));
 468             }
 469         }
 470         return phones;
 471     }
 472
 473     /**
 474      * Gets a phone list for a word from a given lexicon.  If a phone
 475      * list cannot be found, returns <code>null</code>.
 476      *
 477      * @param lexicon the lexicon
 478      * @param wordAndPartOfSpeech word and part of speech concatenated
 479      *   together
 480      *
 481      * @return the list of phones for word or <code>null</code>
 482      */
 483     protected String[] getPhones(Map lexicon,
 484                                  String wordAndPartOfSpeech) {
 485         Object value = lexicon.get(wordAndPartOfSpeech);
 486         if (value instanceof String[]) {
 487             return (String[]) value;
 488         } else if (value instanceof String) {
 489             String[] phoneArray;
 490             phoneArray = getPhones((String) value);
 491             if (tokenizeOnLookup) {
 492                 lexicon.put(wordAndPartOfSpeech, phoneArray);
 493             }
 494             return phoneArray;
 495         } else {
 496             return null;
 497         }
 498     }
 499
 500     /**
 501      * Turns the phone <code>String</code> into a <code>String[]</code>,
 502      * using " " as the delimiter.
 503      *
 504      * @param phones the phones
 505      *
 506      * @return the phones split into an array
 507      */
 508     protected String[] getPhones(String phones) {
 509         ArrayList phoneList = new ArrayList();
 510         StringTokenizer tokenizer = new StringTokenizer(phones, " ");
 511         while (tokenizer.hasMoreTokens()) {
 512             phoneList.add(tokenizer.nextToken());
 513         }
 514         return (String[]) phoneList.toArray(new String[0]);
 515     }
 516
 517     /**
 518      * Adds a word to the addenda.
 519      *
 520      * @param word the word to find
 521      * @param partOfSpeech the part of speech
 522      * @param phones the phones for the word
 523      *
 524      */
 525     public void addAddendum(String word,
 526                             String partOfSpeech,
 527                             String[] phones) {
 528         String pos = fixPartOfSpeech(partOfSpeech);
 529         if (!partsOfSpeech.contains(pos)) {
 530             partsOfSpeech.add(pos);
 531         }
 532         addenda.put(word + pos, phones);
 533     }
 534
 535     /**
 536      * Removes a word from the addenda.
 537      *
 538      * @param word the word to remove
 539      * @param partOfSpeech the part of speech
 540      */
 541     public void removeAddendum(String word, String partOfSpeech) {
 542         addenda.remove(word + fixPartOfSpeech(partOfSpeech));
 543     }
 544
 545     /**
 546      * Outputs a string to a data output stream.
 547      *
 548      * @param dos the data output stream
 549      * @param s the string to output
 550      *
 551      * @throws IOException if errors occur during writing
 552      */
 553     private void outString(DataOutputStream dos, String s)
 554                         throws IOException {
 555         dos.writeByte((byte) s.length());
 556         for (int i = 0; i < s.length(); i++) {
 557             dos.writeChar(s.charAt(i));
 558         }
 559     }
 560
 561     /**
 562      * Inputs a string from a DataInputStream.  This method is not re-entrant.
 563      *
 564      * @param dis the data input stream
 565      *
 566      * @return the string
 567      *
 568      * @throws IOException if errors occur during reading
 569      */
 570     private String getString(DataInputStream dis) throws IOException {
 571         int size = dis.readByte();
 572         for (int i = 0; i < size; i++) {
 573             charBuffer[i] = dis.readChar();
 574         }
 575         return new String(charBuffer, 0, size);
 576     }
 577
 578     /**
 579      * Inputs a string from a DataInputStream.  This method is not re-entrant.
 580      *
 581      * @param bb the input byte buffer
 582      *
 583      * @return the string
 584      *
 585      * @throws IOException if errors occur during reading
 586      */
 587     private String getString(ByteBuffer bb) throws IOException {
 588         int size = bb.get();
 589         for (int i = 0; i < size; i++) {
 590             charBuffer[i] = bb.getChar();
 591         }
 592         return new String(charBuffer, 0, size);
 593     }
 594
 595
 596     /**
 597      * Dumps a binary form of the database.  This method is not thread-safe.
 598      *
 599      * <p>Binary format is:
 600      * <pre>
 601      * MAGIC
 602      * VERSION
 603      * (int) numPhonemes
 604      * (String) phoneme0
 605      * (String) phoneme1
 606      * (String) phonemeN
 607      * (int) numEntries
 608      * (String) nameWithPOS
 609      * (byte) numPhonemes
 610      * phoneme index 1
 611      * phoneme index 2
 612      * phoneme index n
 613      * </pre>
 614      *
 615      * <p>Strings are formatted as: <code>(byte) len char0 char1 charN</code>
 616      *
 617      * <p>Limits: Strings: 128 chars
 618      * <p>Limits: Strings: 128 phonemes per word
 619      *
 620      * @param lexicon the lexicon to dump
 621      * @param path the path to dump the file to
 622      */
 623     private void dumpBinaryLexicon(Map lexicon, String path) {
 624         try {
 625             FileOutputStream fos = new FileOutputStream(path);
 626             DataOutputStream dos = new DataOutputStream(new
 627                     BufferedOutputStream(fos));
 628             List phonemeList = findPhonemes(lexicon);
 629
 630             dos.writeInt(MAGIC);
 631             dos.writeInt(VERSION);
 632             dos.writeInt(phonemeList.size());
 633
 634             for (int i = 0; i < phonemeList.size(); i++) {
 635                 outString(dos, (String) phonemeList.get(i));
 636             }
 637
 638             dos.writeInt(lexicon.keySet().size());
 639             for (Iterator i = lexicon.keySet().iterator(); i.hasNext(); ) {
 640                 String key = (String) i.next();
 641                 outString(dos, key);
 642                 String[] phonemes = getPhones(lexicon, key);
 643                 dos.writeByte((byte) phonemes.length);
 644                 for (int index = 0; index < phonemes.length; index++) {
 645                     int phonemeIndex = phonemeList.indexOf(phonemes[index]);
 646                     if (phonemeIndex == -1) {
 647                         throw new Error("Can't find phoneme index");
 648                     }
 649                     dos.writeByte((byte) phonemeIndex);
 650                 }
 651             }
 652             dos.close();
 653         } catch (FileNotFoundException fe) {
 654             throw new Error("Can't dump binary database " +
 655                     fe.getMessage());
 656         } catch (IOException ioe) {
 657             throw new Error("Can't write binary database " +
 658                     ioe.getMessage());
 659         }
 660     }
 661
 662     /**
 663      * Loads the binary lexicon from the given InputStream.
 664      * This method is not thread safe.
 665      *
 666      * @param is the InputStream to load the database from
 667      * @param estimatedSize estimate of how large the database is
 668      *
 669      * @return a <code>Map</code> containing the lexicon
 670      *
 671      * @throws IOException if an IO error occurs
 672      */
 673     private Map loadMappedBinaryLexicon(FileInputStream is, int estimatedSize)
 674         throws IOException {
 675         FileChannel fc = is.getChannel();
 676
 677         MappedByteBuffer bb = fc.map(FileChannel.MapMode.READ_ONLY,
 678                 0, (int) fc.size());
 679         bb.load();
 680         int size = 0;
 681         int numEntries = 0;
 682         List phonemeList = new ArrayList();
 683
 684         // we get better performance for some reason if we
 685         // just ignore estimated size
 686         //
 687         // Map lexicon = new HashMap();
 688         Map lexicon = new LinkedHashMap(estimatedSize * 4 / 3);
 689
 690         if (bb.getInt() != MAGIC) {
 691             throw new Error("bad magic number in lexicon");
 692         }
 693
 694         if (bb.getInt() != VERSION) {
 695             throw new Error("bad version number in lexicon");
 696         }
 697
 698         size = bb.getInt();
 699         for (int i = 0; i < size; i++) {
 700             String phoneme = getString(bb);
 701             phonemeList.add(phoneme);
 702         }
 703         numEntries = bb.getInt();
 704
 705         for (int i = 0; i < numEntries; i++) {
 706             String wordAndPos = getString(bb);
 707             String pos = Character.toString(
 708                     wordAndPos.charAt(wordAndPos.length() - 1));
 709             if (!partsOfSpeech.contains(pos)) {
 710                 partsOfSpeech.add(pos);
 711             }
 712
 713             int numPhonemes = bb.get();
 714             String[] phonemes = new String[numPhonemes];
 715
 716             for (int j = 0; j < numPhonemes; j++) {
 717                 phonemes[j] = (String) phonemeList.get(bb.get());
 718             }
 719             lexicon.put(wordAndPos, phonemes);
 720         }
 721         fc.close();
 722         return lexicon;
 723     }
 724
 725     /**
 726      * Loads the binary lexicon from the given InputStream.
 727      * This method is not thread safe.
 728      *
 729      * @param is the InputStream to load the database from
 730      * @param estimatedSize estimate of how large the database is
 731      *
 732      * @return a <code>Map</code> containing the lexicon
 733      *
 734      * @throws IOException if an IO error occurs
 735      */
 736     private Map loadBinaryLexicon(InputStream is, int estimatedSize)
 737         throws IOException {
 738         DataInputStream dis = new DataInputStream(new
 739                 BufferedInputStream(is));
 740         int size = 0;
 741         int numEntries = 0;
 742         List phonemeList = new ArrayList();
 743
 744         // we get better performance for some reason if we
 745         // just ignore estimated size
 746         //
 747         Map lexicon = new LinkedHashMap();
 748
 749         if (dis.readInt() != MAGIC) {
 750             throw new Error("bad magic number in lexicon");
 751         }
 752
 753         if (dis.readInt() != VERSION) {
 754             throw new Error("bad version number in lexicon");
 755         }
 756
 757         size = dis.readInt();
 758         for (int i = 0; i < size; i++) {
 759             String phoneme = getString(dis);
 760             phonemeList.add(phoneme);
 761         }
 762         numEntries = dis.readInt();
 763
 764         for (int i = 0; i < numEntries; i++) {
 765             String wordAndPos = getString(dis);
 766             String pos = Character.toString(
 767                     wordAndPos.charAt(wordAndPos.length() - 1));
 768             if (!partsOfSpeech.contains(pos)) {
 769                 partsOfSpeech.add(pos);
 770             }
 771
 772             int numPhonemes = dis.readByte();
 773             String[] phonemes = new String[numPhonemes];
 774
 775             for (int j = 0; j < numPhonemes; j++) {
 776                 phonemes[j] = (String) phonemeList.get(dis.readByte());
 777             }
 778             lexicon.put(wordAndPos, phonemes);
 779         }
 780         dis.close();
 781         return lexicon;
 782     }
 783
 784     /**
 785      * Dumps this lexicon (just the compiled form). Lexicon will be
 786      * dumped to two binary files PATH_compiled.bin and
 787      * PATH_addenda.bin
 788      *
 789      * @param path the root path to dump it to
 790      */
 791     public void dumpBinary(String path) {
 792         String compiledPath = path + "_compiled.bin";
 793         String addendaPath = path + "_addenda.bin";
 794
 795         dumpBinaryLexicon(compiled, compiledPath);
 796         dumpBinaryLexicon(addenda, addendaPath);
 797     }
 798
 799     /**
 800      * Returns a list of the unique phonemes in the lexicon.
 801      *
 802      * @param lexicon the lexicon of interest
 803      *
 804      * @return list the unique set of phonemes
 805      */
 806     private List findPhonemes(Map lexicon) {
 807         List phonemeList = new ArrayList();
 808         for (Iterator i = lexicon.keySet().iterator(); i.hasNext(); ) {
 809             String key = (String) i.next();
 810             String[] phonemes = getPhones(lexicon, key);
 811             for (int index = 0; index < phonemes.length; index++) {
 812                 if (!phonemeList.contains(phonemes[index])) {
 813                     phonemeList.add(phonemes[index]);
 814                 }
 815             }
 816         }
 817         return phonemeList;
 818     }
 819
 820
 821     /**
 822      * Tests to see if this lexicon is identical to the other for
 823      * debugging purposes.
 824      *
 825      * @param other the other lexicon to compare to
 826      *
 827      * @return true if lexicons are identical
 828      */
 829     public boolean compare(LexiconImpl other) {
 830         return compare(addenda, other.addenda) &&
 831               compare(compiled, other.compiled);
 832     }
 833
 834     /**
 835      * Determines if the two lexicons are identical for debugging purposes.
 836      *
 837      * @param lex this lex
 838      * @param other the other lexicon to chd
 839      *
 840      * @return true if they are identical
 841      */
 842     private boolean compare(Map lex, Map other) {
 843         for (Iterator i = lex.keySet().iterator(); i.hasNext(); ) {
 844             String key = (String) i.next();
 845             String[] thisPhonemes = getPhones(lex, key);
 846             String[] otherPhonemes = getPhones(other, key);
 847             if (thisPhonemes == null) {
 848                 System.out.println(key + " not found in this.");
 849                 return false;
 850             } else if (otherPhonemes == null) {
 851                 System.out.println(key + " not found in other.");
 852                 return false;
 853             } else if (thisPhonemes.length == otherPhonemes.length) {
 854                 for (int j = 0; j < thisPhonemes.length; j++) {
 855                     if (!thisPhonemes[j].equals(otherPhonemes[j])) {
 856                         return false;
 857                     }
 858                 }
 859             } else {
 860                 return false;
 861             }
 862         }
 863         return true;
 864     }
 865
 866     /**
 867      * Fixes the part of speech if it is <code>null</code>.  The
 868      * default representation of a <code>null</code> part of speech
 869      * is the number "0".
 870      */
 871     static protected String fixPartOfSpeech(String partOfSpeech) {
 872         return (partOfSpeech == null) ? "0" : partOfSpeech;
 873     }
 874 }