/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.cn.smart.hhmm;

import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
import org.apache.lucene.analysis.cn.smart.Utility;

SmartChineseAnalyzer Word Dictionary
@lucene.experimental
/**
 * SmartChineseAnalyzer Word Dictionary.
 * <p>
 * The constructor is private and the only instance is kept in the static
 * {@code singleInstance} field, which is handed out by {@link #getInstance()}.
 *
 * @lucene.experimental
 */
class WordDictionary extends AbstractDictionary { private WordDictionary() { } private static WordDictionary singleInstance;
Large prime number for hash function
/**
 * Large prime number for hash function. It is the length of the
 * {@code wordIndexTable} and {@code charIndexTable} arrays and the modulus
 * applied to both hash values when probing them.
 */
public static final int PRIME_INDEX_LENGTH = 12071;
wordIndexTable guarantees to hash all Chinese characters in Unicode into PRIME_INDEX_LENGTH array. There will be conflict, but in reality this program only handles the 6768 characters found in GB2312 plus some ASCII characters. Therefore in order to guarantee better precision, it is necessary to retain the original symbol in the charIndexTable.
/**
 * wordIndexTable guarantees to hash all Chinese characters in Unicode into
 * PRIME_INDEX_LENGTH array. There will be conflict, but in reality this
 * program only handles the 6768 characters found in GB2312 plus some
 * ASCII characters. Therefore in order to guarantee better precision, it is
 * necessary to retain the original symbol in the charIndexTable.
 * <p>
 * For a given slot, charIndexTable holds the character that owns the slot
 * (0 means the slot is free) and wordIndexTable holds the index of that
 * character's bucket in the word/frequency tables (initialized to -1 when the
 * tables are built from coredict.dct).
 */
private short[] wordIndexTable; private char[] charIndexTable;
To avoid taking too much space, the data structure needed to store the lexicon requires two multidimensional arrays to store word and frequency. Each word is placed in a char[]. Each char represents a Chinese char or other symbol. Each frequency is put into an int. These two arrays correspond to each other one-to-one. Therefore, one can use wordItem_charArrayTable[i][j] to look up word from lexicon, and wordItem_frequencyTable[i][j] to look up the corresponding frequency.
/**
 * To avoid taking too much space, the data structure needed to store the
 * lexicon requires two multidimensional arrays to store word and frequency.
 * Each word is placed in a char[]. Each char represents a Chinese char or
 * other symbol. Each frequency is put into an int. These two arrays
 * correspond to each other one-to-one. Therefore, one can use
 * wordItem_charArrayTable[i][j] to look up word from lexicon, and
 * wordItem_frequencyTable[i][j] to look up the corresponding frequency.
 * <p>
 * The first index selects the bucket of the word's first character; a bucket
 * with no words is left null. The lookups compare a bucket entry from offset 0
 * against the query word from offset 1.
 */
private char[][][] wordItem_charArrayTable; private int[][] wordItem_frequencyTable; // static Logger log = Logger.getLogger(WordDictionary.class);
Get the singleton dictionary instance.
Returns:singleton
/**
 * Get the singleton dictionary instance, creating and loading it on first use.
 * <p>
 * The dictionary is first loaded through {@link #load()}; if that fails with an
 * {@link IOException} it is loaded from {@link AnalyzerProfile#ANALYSIS_DATA_DIR}
 * through {@link #load(String)}. The instance is published only after loading
 * succeeded, so a failed attempt is retried on the next call instead of
 * handing out a dictionary whose tables were never populated.
 *
 * @return singleton
 * @throws RuntimeException if a serialized table's class cannot be resolved, or
 *         if the fallback load from the data directory fails
 */
public static synchronized WordDictionary getInstance() {
  if (singleInstance == null) {
    // Load into a local first: assigning the field before a successful load
    // would leave a half-initialized singleton behind if loading throws.
    WordDictionary dictionary = new WordDictionary();
    try {
      dictionary.load();
    } catch (IOException e) {
      // Fall back to the external data directory.
      String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
      dictionary.load(wordDictRoot);
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
    singleInstance = dictionary;
  }
  return singleInstance;
}
Attempt to load dictionary from provided directory, first trying coredict.mem, falling back on coredict.dct
Params:
  • dctFileRoot – path to dictionary directory
/** * Attempt to load dictionary from provided directory, first trying coredict.mem, failing back on coredict.dct * * @param dctFileRoot path to dictionary directory */
public void load(String dctFileRoot) { String dctFilePath = dctFileRoot + "/coredict.dct"; Path serialObj = Paths.get(dctFileRoot + "/coredict.mem"); if (Files.exists(serialObj) && loadFromObj(serialObj)) { } else { try { wordIndexTable = new short[PRIME_INDEX_LENGTH]; charIndexTable = new char[PRIME_INDEX_LENGTH]; for (int i = 0; i < PRIME_INDEX_LENGTH; i++) { charIndexTable[i] = 0; wordIndexTable[i] = -1; } wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][]; wordItem_frequencyTable = new int[GB2312_CHAR_NUM][]; // int total = loadMainDataFromFile(dctFilePath); expandDelimiterData(); mergeSameWords(); sortEachItems(); // log.info("load dictionary: " + dctFilePath + " total:" + total); } catch (IOException e) { throw new RuntimeException(e.getMessage()); } saveToObj(serialObj); } }
Load coredict.mem internally from the jar file.
Throws:
  • IOException – If there is a low-level I/O error.
/** * Load coredict.mem internally from the jar file. * * @throws IOException If there is a low-level I/O error. */
public void load() throws IOException, ClassNotFoundException { InputStream input = this.getClass().getResourceAsStream("coredict.mem"); loadFromObjectInputStream(input); } private boolean loadFromObj(Path serialObj) { try { loadFromObjectInputStream(Files.newInputStream(serialObj)); return true; } catch (Exception e) { throw new RuntimeException(e); } } private void loadFromObjectInputStream(InputStream serialObjectInputStream) throws IOException, ClassNotFoundException { try (ObjectInputStream input = new ObjectInputStream(serialObjectInputStream)) { wordIndexTable = (short[]) input.readObject(); charIndexTable = (char[]) input.readObject(); wordItem_charArrayTable = (char[][][]) input.readObject(); wordItem_frequencyTable = (int[][]) input.readObject(); // log.info("load core dict from serialization."); } } private void saveToObj(Path serialObj) { try (ObjectOutputStream output = new ObjectOutputStream(Files.newOutputStream(serialObj))) { output.writeObject(wordIndexTable); output.writeObject(charIndexTable); output.writeObject(wordItem_charArrayTable); output.writeObject(wordItem_frequencyTable); // log.info("serialize core dict."); } catch (Exception e) { // log.warn(e.getMessage()); } }
Load the datafile into this WordDictionary
Params:
  • dctFilePath – path to word dictionary (coredict.dct)
Throws:
  • IOException – If there is a low-level I/O error.
Returns:number of words read
/**
 * Load the datafile into this WordDictionary
 *
 * @param dctFilePath path to word dictionary (coredict.dct)
 * @return number of words read
 * @throws IOException If there is a low-level I/O error, including a file that
 *         ends before all announced records have been read.
 */
private int loadMainDataFromFile(String dctFilePath) throws IOException {
  int total = 0;
  // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
  // The 3756th is used (as a header) to store information.
  byte[] intBuffer = new byte[4];

  // try-with-resources: the file is closed even if parsing fails half-way.
  try (DataInputStream dctFile = new DataInputStream(Files.newInputStream(Paths.get(dctFilePath)))) {
    // GB2312 characters 0 - 6768
    for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
      int cnt = readLittleEndianInt(dctFile, intBuffer);
      if (cnt <= 0) {
        wordItem_charArrayTable[i] = null;
        wordItem_frequencyTable[i] = null;
        continue;
      }
      wordItem_charArrayTable[i] = new char[cnt][];
      wordItem_frequencyTable[i] = new int[cnt];
      total += cnt;

      for (int j = 0; j < cnt; j++) {
        // Each record starts with three ints: frequency, length, handle.
        int frequency = readLittleEndianInt(dctFile, intBuffer);
        int length = readLittleEndianInt(dctFile, intBuffer);
        readLittleEndianInt(dctFile, intBuffer); // handle: not used, but must be consumed

        wordItem_frequencyTable[i][j] = frequency;
        if (length > 0) {
          byte[] lchBuffer = new byte[length];
          // readFully: a plain read() may return fewer bytes than requested.
          dctFile.readFully(lchBuffer);
          String tmpword = new String(lchBuffer, "GB2312");
          wordItem_charArrayTable[i][j] = tmpword.toCharArray();
        } else {
          wordItem_charArrayTable[i][j] = null;
        }
      }

      String str = getCCByGB2312Id(i);
      setTableIndex(str.charAt(0), i);
    }
  }
  return total;
}

/**
 * Read exactly four bytes and decode them as a little-endian int.
 * The dictionary was developed for C, and byte order must be converted to work with Java.
 *
 * @param in stream to read from
 * @param scratch reusable buffer of at least four bytes
 * @return the decoded value
 * @throws IOException If fewer than four bytes are available or reading fails.
 */
private static int readLittleEndianInt(DataInputStream in, byte[] scratch) throws IOException {
  in.readFully(scratch, 0, 4);
  return ByteBuffer.wrap(scratch).order(ByteOrder.LITTLE_ENDIAN).getInt();
}
The original lexicon puts all information with punctuation into a chart (from 1 to 3755). Here it then gets expanded, separately being placed into the chart that has the corresponding symbol.
/** * The original lexicon puts all information with punctuation into a * chart (from 1 to 3755). Here it then gets expanded, separately being * placed into the chart that has the corresponding symbol. */
private void expandDelimiterData() { int i; int cnt; // Punctuation then treating index 3755 as 1, // distribute the original punctuation corresponding dictionary into int delimiterIndex = 3755 + GB2312_FIRST_CHAR; i = 0; while (i < wordItem_charArrayTable[delimiterIndex].length) { char c = wordItem_charArrayTable[delimiterIndex][i][0]; int j = getGB2312Id(c);// the id value of the punctuation if (wordItem_charArrayTable[j] == null) { int k = i; // Starting from i, count the number of the following worditem symbol from j while (k < wordItem_charArrayTable[delimiterIndex].length && wordItem_charArrayTable[delimiterIndex][k][0] == c) { k++; } // c is the punctuation character, j is the id value of c // k-1 represents the index of the last punctuation character cnt = k - i; if (cnt != 0) { wordItem_charArrayTable[j] = new char[cnt][]; wordItem_frequencyTable[j] = new int[cnt]; } // Assign value for each wordItem. for (k = 0; k < cnt; k++, i++) { // wordItemTable[j][k] = new WordItem(); wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i]; wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1]; System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1, wordItem_charArrayTable[j][k], 0, wordItem_charArrayTable[j][k].length); } setTableIndex(c, j); } } // Delete the original corresponding symbol array. 
wordItem_charArrayTable[delimiterIndex] = null; wordItem_frequencyTable[delimiterIndex] = null; } /* * since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS) */ private void mergeSameWords() { int i; for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { if (wordItem_charArrayTable[i] == null) continue; int len = 1; for (int j = 1; j < wordItem_charArrayTable[i].length; j++) { if (Utility.compareArray(wordItem_charArrayTable[i][j], 0, wordItem_charArrayTable[i][j - 1], 0) != 0) len++; } if (len < wordItem_charArrayTable[i].length) { char[][] tempArray = new char[len][]; int[] tempFreq = new int[len]; int k = 0; tempArray[0] = wordItem_charArrayTable[i][0]; tempFreq[0] = wordItem_frequencyTable[i][0]; for (int j = 1; j < wordItem_charArrayTable[i].length; j++) { if (Utility.compareArray(wordItem_charArrayTable[i][j], 0, tempArray[k], 0) != 0) { k++; // temp[k] = wordItemTable[i][j]; tempArray[k] = wordItem_charArrayTable[i][j]; tempFreq[k] = wordItem_frequencyTable[i][j]; } else { // temp[k].frequency += wordItemTable[i][j].frequency; tempFreq[k] += wordItem_frequencyTable[i][j]; } } // wordItemTable[i] = temp; wordItem_charArrayTable[i] = tempArray; wordItem_frequencyTable[i] = tempFreq; } } } private void sortEachItems() { char[] tmpArray; int tmpFreq; for (int i = 0; i < wordItem_charArrayTable.length; i++) { if (wordItem_charArrayTable[i] != null && wordItem_charArrayTable[i].length > 1) { for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) { for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) { if (Utility.compareArray(wordItem_charArrayTable[i][j], 0, wordItem_charArrayTable[i][j2], 0) > 0) { tmpArray = wordItem_charArrayTable[i][j]; tmpFreq = wordItem_frequencyTable[i][j]; wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2]; wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2]; wordItem_charArrayTable[i][j2] = tmpArray; wordItem_frequencyTable[i][j2] = 
tmpFreq; } } } } } } /* * Calculate character c's position in hash table, * then initialize the value of that position in the address table. */ private boolean setTableIndex(char c, int j) { int index = getAvaliableTableIndex(c); if (index != -1) { charIndexTable[index] = c; wordIndexTable[index] = (short) j; return true; } else return false; } private short getAvaliableTableIndex(char c) { int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH); int hash2 = hash2(c) % PRIME_INDEX_LENGTH; if (hash1 < 0) hash1 = PRIME_INDEX_LENGTH + hash1; if (hash2 < 0) hash2 = PRIME_INDEX_LENGTH + hash2; int index = hash1; int i = 1; while (charIndexTable[index] != 0 && charIndexTable[index] != c && i < PRIME_INDEX_LENGTH) { index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH; i++; } // System.out.println(i - 1); if (i < PRIME_INDEX_LENGTH && (charIndexTable[index] == 0 || charIndexTable[index] == c)) { return (short) index; } else return -1; } private short getWordItemTableIndex(char c) { int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH); int hash2 = hash2(c) % PRIME_INDEX_LENGTH; if (hash1 < 0) hash1 = PRIME_INDEX_LENGTH + hash1; if (hash2 < 0) hash2 = PRIME_INDEX_LENGTH + hash2; int index = hash1; int i = 1; while (charIndexTable[index] != 0 && charIndexTable[index] != c && i < PRIME_INDEX_LENGTH) { index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH; i++; } if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) { return (short) index; } else return -1; }
Look up the text string corresponding with the word char array, and return the position of the word list.
Params:
  • knownHashIndex – previously computed position of the first word symbol charArray[0] in the hash table. If not calculated yet, can be replaced with function int findInTable(char[] charArray).
  • charArray – look up the char array corresponding with the word.
Returns:word location in word array. If not found, then return -1.
/** * Look up the text string corresponding with the word char array, * and return the position of the word list. * * @param knownHashIndex already figure out position of the first word * symbol charArray[0] in hash table. If not calculated yet, can be * replaced with function int findInTable(char[] charArray). * @param charArray look up the char array corresponding with the word. * @return word location in word array. If not found, then return -1. */
private int findInTable(short knownHashIndex, char[] charArray) { if (charArray == null || charArray.length == 0) return -1; char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]]; int start = 0, end = items.length - 1; int mid = (start + end) / 2, cmpResult; // Binary search for the index of idArray while (start <= end) { cmpResult = Utility.compareArray(items[mid], 0, charArray, 1); if (cmpResult == 0) return mid;// find it else if (cmpResult < 0) start = mid + 1; else if (cmpResult > 0) end = mid - 1; mid = (start + end) / 2; } return -1; }
Find the first word in the dictionary that starts with the supplied prefix
Params:
  • charArray – input prefix
See Also:
  • getPrefixMatch(char[], int)
Returns:index of word, or -1 if not found
/**
 * Find the first word in the dictionary that starts with the supplied prefix.
 * Equivalent to {@link #getPrefixMatch(char[], int)} with a start position of 0.
 *
 * @see #getPrefixMatch(char[], int)
 * @param charArray input prefix
 * @return index of word, or -1 if not found
 */
public int getPrefixMatch(char[] charArray) {
  final int searchFromBeginning = 0;
  return getPrefixMatch(charArray, searchFromBeginning);
}
Find the nth word in the dictionary that starts with the supplied prefix
Params:
  • charArray – input prefix
  • knownStart – relative position in the dictionary to start
See Also:
  • getPrefixMatch(char[])
Returns:index of word, or -1 if not found
/** * Find the nth word in the dictionary that starts with the supplied prefix * * @see #getPrefixMatch(char[]) * @param charArray input prefix * @param knownStart relative position in the dictionary to start * @return index of word, or -1 if not found */
public int getPrefixMatch(char[] charArray, int knownStart) { short index = getWordItemTableIndex(charArray[0]); if (index == -1) return -1; char[][] items = wordItem_charArrayTable[wordIndexTable[index]]; int start = knownStart, end = items.length - 1; int mid = (start + end) / 2, cmpResult; // Binary search for the index of idArray while (start <= end) { cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0); if (cmpResult == 0) { // Get the first item which match the current word while (mid >= 0 && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0) mid--; mid++; return mid;// Find the first word that uses charArray as prefix. } else if (cmpResult < 0) end = mid - 1; else start = mid + 1; mid = (start + end) / 2; } return -1; }
Get the frequency of a word from the dictionary
Params:
  • charArray – input word
Returns:word frequency, or zero if the word is not found
/**
 * Get the frequency of a word from the dictionary
 *
 * @param charArray input word
 * @return word frequency, or zero if the word is not found (a null or empty
 *         input is treated as not found)
 */
public int getFrequency(char[] charArray) {
  // Guard before touching charArray[0]; an empty word can never be found.
  if (charArray == null || charArray.length == 0) {
    return 0;
  }
  short hashIndex = getWordItemTableIndex(charArray[0]);
  if (hashIndex == -1) {
    return 0;
  }
  int itemIndex = findInTable(hashIndex, charArray);
  if (itemIndex == -1) {
    return 0;
  }
  return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
}
Return true if the dictionary entry at itemIndex for table charArray[0] is charArray
Params:
  • charArray – input word
  • itemIndex – item index for table charArray[0]
Returns:true if the entry exists
/**
 * Return true if the dictionary entry at itemIndex for table charArray[0] is charArray
 *
 * @param charArray input word
 * @param itemIndex item index for table charArray[0]
 * @return true if the entry exists; false if it differs, if charArray is null
 *         or empty, if charArray[0] has no table, or if itemIndex is outside
 *         that table
 */
public boolean isEqual(char[] charArray, int itemIndex) {
  if (charArray == null || charArray.length == 0) {
    return false;
  }
  short hashIndex = getWordItemTableIndex(charArray[0]);
  if (hashIndex == -1) {
    // Unknown first character: there is no table to compare against.
    return false;
  }
  char[][] items = wordItem_charArrayTable[wordIndexTable[hashIndex]];
  if (items == null || itemIndex < 0 || itemIndex >= items.length) {
    return false;
  }
  return Utility.compareArray(charArray, 1, items[itemIndex], 0) == 0;
}
}