org.apache.lucene/lucene-analyzers-kuromoji/8.2.0 : org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java

JapaneseIterationMarkCharFilter
http://lucene.apache.org/lucene-parent/lucene-analyzers-kuromoji: Lucene Kuromoji Japanese Morphological Analyzer (The Apache Software Foundation)
Apache 2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja;


import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.util.RollingCharBuffer;

import java.io.IOException;
import java.io.Reader;

Normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.

Sequences of iteration marks are supported.  In case an illegal sequence of iteration
marks is encountered, the implementation emits the illegal source character as-is
without considering its script.  For example, with input "?ゝ", we get
"??" even though the question mark isn't hiragana.


Note that a full stop punctuation character "。" (U+3002) can not be iterated
(see below). Iteration marks themselves can be emitted in case they are illegal,
i.e. if they go back past the beginning of the character stream.


The implementation buffers input until a full stop punctuation character (U+3002)
or EOF is reached in order to not keep a copy of the character stream in memory.
Vertical iteration marks, which are even rarer than horizontal iteration marks in
contemporary Japanese, are unsupported.

/**
 * Normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
 * <p>
 * Sequences of iteration marks are supported.  In case an illegal sequence of iteration
 * marks is encountered, the implementation emits the illegal source character as-is
 * without considering its script.  For example, with input "?ゝ", we get
 * "??" even though the question mark isn't hiragana.
 * </p>
 * <p>
 * Note that a full stop punctuation character "。" (U+3002) can not be iterated
 * (see below). Iteration marks themselves can be emitted in case they are illegal,
 * i.e. if they go back past the beginning of the character stream.
 * </p>
 * <p>
 * The implementation only buffers input up to the next full stop punctuation character
 * (U+3002) or EOF, so it never has to keep a copy of the whole character stream in memory.
 * Vertical iteration marks, which are even rarer than horizontal iteration marks in
 * contemporary Japanese, are unsupported.
 * </p>
 */
public class JapaneseIterationMarkCharFilter extends CharFilter {

  Normalize kanji iteration marks by default /** Normalize kanji iteration marks by default */
  public static final boolean NORMALIZE_KANJI_DEFAULT = true; 

  Normalize kana iteration marks by default /** Normalize kana iteration marks by default */
  public static final boolean NORMALIZE_KANA_DEFAULT = true;

  private static final char KANJI_ITERATION_MARK = '\u3005';           // 々

  private static final char HIRAGANA_ITERATION_MARK = '\u309d';        // ゝ

  private static final char HIRAGANA_VOICED_ITERATION_MARK = '\u309e'; // ゞ

  private static final char KATAKANA_ITERATION_MARK = '\u30fd';        // ヽ

  private static final char KATAKANA_VOICED_ITERATION_MARK = '\u30fe'; // ヾ

  private static final char FULL_STOP_PUNCTUATION = '\u3002';           // 。

  // Hiragana to dakuten map (lookup using code point - 0x30ab（か）*/
  private static char[] h2d = new char[50];

  // Katakana to dakuten map (lookup using code point - 0x30ab（カ
  private static char[] k2d = new char[50];

  private final RollingCharBuffer buffer = new RollingCharBuffer();

  private int bufferPosition = 0;

  private int iterationMarksSpanSize = 0;

  private int iterationMarkSpanEndPosition = 0;

  private boolean normalizeKanji;

  private boolean normalizeKana;

  static {
    // Voiced (dakuten) form for every hiragana from か (U+304B) through ぼ (U+307C), in code
    // point order.  Characters without a voiced form, and the semi-voiced ones, map to themselves.
    final char[] voicedHiragana = {
        '\u304c', '\u304c', // か, が => が
        '\u304e', '\u304e', // き, ぎ => ぎ
        '\u3050', '\u3050', // く, ぐ => ぐ
        '\u3052', '\u3052', // け, げ => げ
        '\u3054', '\u3054', // こ, ご => ご
        '\u3056', '\u3056', // さ, ざ => ざ
        '\u3058', '\u3058', // し, じ => じ
        '\u305a', '\u305a', // す, ず => ず
        '\u305c', '\u305c', // せ, ぜ => ぜ
        '\u305e', '\u305e', // そ, ぞ => ぞ
        '\u3060', '\u3060', // た, だ => だ
        '\u3062', '\u3062', // ち, ぢ => ぢ
        '\u3063',           // っ => っ
        '\u3065', '\u3065', // つ, づ => づ
        '\u3067', '\u3067', // て, で => で
        '\u3069', '\u3069', // と, ど => ど
        '\u306a',           // な => な
        '\u306b',           // に => に
        '\u306c',           // ぬ => ぬ
        '\u306d',           // ね => ね
        '\u306e',           // の => の
        '\u3070', '\u3070', // は, ば => ば
        '\u3071',           // ぱ => ぱ
        '\u3073', '\u3073', // ひ, び => び
        '\u3074',           // ぴ => ぴ
        '\u3076', '\u3076', // ふ, ぶ => ぶ
        '\u3077',           // ぷ => ぷ
        '\u3079', '\u3079', // へ, べ => べ
        '\u307a',           // ぺ => ぺ
        '\u307c', '\u307c'  // ほ, ぼ => ぼ
    };
    System.arraycopy(voicedHiragana, 0, h2d, 0, voicedHiragana.length);

    // The katakana map is the hiragana map shifted by the distance between カ and か
    final int hiraganaToKatakana = '\u30ab' - '\u304b';
    assert h2d.length == k2d.length;
    int index = 0;
    while (index < k2d.length) {
      k2d[index] = (char) (h2d[index] + hiraganaToKatakana);
      index++;
    }
  }

  /**
   * Constructor. Normalizes both kanji and kana iteration marks by default
   * (see {@link #NORMALIZE_KANJI_DEFAULT} and {@link #NORMALIZE_KANA_DEFAULT}).
   *
   * @param input char stream
   */
  public JapaneseIterationMarkCharFilter(Reader input) {
    this(input, NORMALIZE_KANJI_DEFAULT, NORMALIZE_KANA_DEFAULT);
  }


  /**
   * Constructor
   *
   * @param input          char stream
   * @param normalizeKanji indicates whether kanji iteration marks should be normalized
   * @param normalizeKana  indicates whether kana iteration marks should be normalized
   */
  public JapaneseIterationMarkCharFilter(Reader input, boolean normalizeKanji, boolean normalizeKana) {
    super(input);
    this.normalizeKanji = normalizeKanji;
    this.normalizeKana = normalizeKana;
    // All reads go through the rolling buffer so that earlier characters can be looked up again
    buffer.reset(input);
  }

  /**
   * Reads up to {@code length} normalized characters into {@code buffer}, starting at
   * index {@code offset}, by repeatedly calling {@link #read()}.
   *
   * @param buffer destination array
   * @param offset index in {@code buffer} of the first character to write
   * @param length maximum number of characters to read
   * @return number of characters read, 0 if {@code length} is zero, or -1 if the end of
   *         input was reached before any character could be read
   * @throws IOException If there is a low-level I/O error.
   */
  @Override
  public int read(char[] buffer, int offset, int length) throws IOException {
    // A zero-length request must not be reported as end of input
    if (length == 0) {
      return 0;
    }

    int count = 0;
    while (count < length) {
      int c = read();
      if (c == -1) {
        break;
      }
      buffer[offset + count] = (char) c;
      count++;
    }

    return count == 0 ? -1 : count;
  }

  /**
   * Reads a single character, replacing an iteration mark by its normalized form.
   *
   * @return the next (possibly normalized) character, or -1 at end of input
   * @throws IOException If there is a low-level I/O error.
   */
  @Override
  public int read() throws IOException {
    final int next = buffer.get(bufferPosition);
    if (next == -1) {
      // End of input: release everything buffered before the current position
      buffer.freeBefore(bufferPosition);
      return next;
    }

    char result = (char) next;

    // A character is at most one of: surrogate, full stop, iteration mark
    if (Character.isHighSurrogate(result) || Character.isLowSurrogate(result)) {
      // Surrogates are passed through; moving the span end past them keeps later
      // iteration marks from referring back to them
      iterationMarkSpanEndPosition = bufferPosition + 1;
    } else if (result == FULL_STOP_PUNCTUATION) {
      // Free the rolling buffer on full stop, and do not let spans reach back across it
      buffer.freeBefore(bufferPosition);
      iterationMarkSpanEndPosition = bufferPosition + 1;
    } else if (isIterationMark(result)) {
      result = normalizeIterationMark(result);
    }

    bufferPosition++;
    return result;
  }

  Normalizes the iteration mark character c
Params: c – iteration mark character to normalize
Throws: IOException – If there is a low-level I/O error.
Returns: normalized iteration mark/**
   * Normalizes the iteration mark character c
   *
   * @param c iteration mark character to normalize
   * @return normalized iteration mark
   * @throws IOException If there is a low-level I/O error.
   */
  private char normalizeIterationMark(char c) throws IOException {

    // Case 1: Inside an iteration mark span
    if (bufferPosition < iterationMarkSpanEndPosition) {
      return normalize(sourceCharacter(bufferPosition, iterationMarksSpanSize), c);
    }

    // Case 2: New iteration mark spans starts where the previous one ended, which is illegal
    if (bufferPosition == iterationMarkSpanEndPosition) {
      // Emit the illegal iteration mark and increase end position to indicate that we can't
      // start a new span on the next position either
      iterationMarkSpanEndPosition++;
      return c;
    }

    // Case 3: New iteration mark span
    iterationMarksSpanSize = nextIterationMarkSpanSize();
    iterationMarkSpanEndPosition = bufferPosition + iterationMarksSpanSize;
    return normalize(sourceCharacter(bufferPosition, iterationMarksSpanSize), c);
  }

  Finds the number of subsequent next iteration marks
Throws: IOException – If there is a low-level I/O error.
Returns: number of iteration marks starting at the current buffer position/**
   * Finds the number of subsequent next iteration marks
   *
   * @return number of iteration marks starting at the current buffer position
   * @throws IOException If there is a low-level I/O error.
   */
  private int nextIterationMarkSpanSize() throws IOException {
    int spanSize = 0;
    for (int i = bufferPosition; buffer.get(i) != -1 && isIterationMark((char) (buffer.get(i))); i++) {
      spanSize++;
    }
    // Restrict span size so that we don't go past the previous end position
    if (bufferPosition - spanSize < iterationMarkSpanEndPosition) {
      spanSize = bufferPosition - iterationMarkSpanEndPosition;
    }
    return spanSize;
  }

  Returns the source character for a given position and iteration mark span size
Params: position – buffer position (should not exceed bufferPosition)
spanSize – iteration mark span size
Throws: IOException – If there is a low-level I/O error.
Returns: source character/**
   * Returns the source character for a given position and iteration mark span size
   *
   * @param position buffer position (should not exceed bufferPosition)
   * @param spanSize iteration mark span size
   * @return source character
   * @throws IOException If there is a low-level I/O error.
   */
  private char sourceCharacter(int position, int spanSize) throws IOException {
    return (char) buffer.get(position - spanSize);
  }

  Normalize a character
Params: c – character to normalize
m – repetition mark referring to c
Returns: normalized character - return c on illegal iteration marks/**
   * Normalize a character
   *
   * @param c character to normalize
   * @param m repetition mark referring to c
   * @return normalized character - return c on illegal iteration marks
   */
  private char normalize(char c, char m) {
    if (isHiraganaIterationMark(m)) {
      return normalizedHiragana(c, m);
    }

    if (isKatakanaIterationMark(m)) {
      return normalizedKatakana(c, m);
    }

    return c; // If m is not kana and we are to normalize it, we assume it is kanji and simply return it
  }

  Normalize hiragana character
Params: c – hiragana character
m – repetition mark referring to c
Returns: normalized character - return c on illegal iteration marks/**
   * Normalize hiragana character
   *
   * @param c hiragana character
   * @param m repetition mark referring to c
   * @return normalized character - return c on illegal iteration marks
   */
  private char normalizedHiragana(char c, char m) {
    switch (m) {
      case HIRAGANA_ITERATION_MARK:
        return isHiraganaDakuten(c) ? (char) (c - 1) : c;
      case HIRAGANA_VOICED_ITERATION_MARK:
        return lookupHiraganaDakuten(c);
      default:
        return c;
    }
  }

  Normalize katakana character
Params: c – katakana character
m – repetition mark referring to c
Returns: normalized character - return c on illegal iteration marks/**
   * Normalize katakana character
   *
   * @param c katakana character
   * @param m repetition mark referring to c
   * @return normalized character - return c on illegal iteration marks
   */
  private char normalizedKatakana(char c, char m) {
    switch (m) {
      case KATAKANA_ITERATION_MARK:
        return isKatakanaDakuten(c) ? (char) (c - 1) : c;
      case KATAKANA_VOICED_ITERATION_MARK:
        return lookupKatakanaDakuten(c);
      default:
        return c;
    }
  }

  Iteration mark character predicate
Params: c – character to test
Returns: true if c is an iteration mark character.  Otherwise false./**
   * Iteration mark character predicate
   *
   * @param c character to test
   * @return true if c is an iteration mark character.  Otherwise false.
   */
  private boolean isIterationMark(char c) {
    return isKanjiIterationMark(c) || isHiraganaIterationMark(c) || isKatakanaIterationMark(c);
  }

  Hiragana iteration mark character predicate
Params: c – character to test
Returns: true if c is a hiragana iteration mark character.  Otherwise false./**
   * Hiragana iteration mark character predicate
   *
   * @param c character to test
   * @return true if c is a hiragana iteration mark character.  Otherwise false.
   */
  private boolean isHiraganaIterationMark(char c) {
    if (normalizeKana) {
      return c == HIRAGANA_ITERATION_MARK || c == HIRAGANA_VOICED_ITERATION_MARK;
    } else {
      return false;
    }
  }

  Katakana iteration mark character predicate
Params: c – character to test
Returns: true if c is a katakana iteration mark character.  Otherwise false./**
   * Katakana iteration mark character predicate
   *
   * @param c character to test
   * @return true if c is a katakana iteration mark character.  Otherwise false.
   */
  private boolean isKatakanaIterationMark(char c) {
    if (normalizeKana) {
      return c == KATAKANA_ITERATION_MARK || c == KATAKANA_VOICED_ITERATION_MARK;
    } else {
      return false;
    }
  }

  Kanji iteration mark character predicate
Params: c – character to test
Returns: true if c is a kanji iteration mark character.  Otherwise false./**
   * Kanji iteration mark character predicate
   *
   * @param c character to test
   * @return true if c is a kanji iteration mark character.  Otherwise false.
   */
  private boolean isKanjiIterationMark(char c) {
    if (normalizeKanji) {
      return c == KANJI_ITERATION_MARK;
    } else {
      return false;
    }
  }

  Look up hiragana dakuten
Params: c – character to look up
Returns: hiragana dakuten variant of c or c itself if no dakuten variant exists/**
   * Look up hiragana dakuten
   *
   * @param c character to look up
   * @return hiragana dakuten variant of c or c itself if no dakuten variant exists
   */
  private char lookupHiraganaDakuten(char c) {
    return lookup(c, h2d, '\u304b'); // Code point is for か
  }

  Look up katakana dakuten. Only full-width katakana are supported.
Params: c – character to look up
Returns: katakana dakuten variant of c or c itself if no dakuten variant exists/**
   * Look up katakana dakuten. Only full-width katakana are supported.
   *
   * @param c character to look up
   * @return katakana dakuten variant of c or c itself if no dakuten variant exists
   */
  private char lookupKatakanaDakuten(char c) {
    return lookup(c, k2d, '\u30ab'); // Code point is for カ
  }

  Hiragana dakuten predicate
Params: c – character to check
Returns: true if c is a hiragana dakuten and otherwise false/**
   * Hiragana dakuten predicate
   *
   * @param c character to check
   * @return true if c is a hiragana dakuten and otherwise false
   */
  private boolean isHiraganaDakuten(char c) {
    return inside(c, h2d, '\u304b') && c == lookupHiraganaDakuten(c);
  }

  Katakana dakuten predicate
Params: c – character to check
Returns: true if c is a hiragana dakuten and otherwise false/**
   * Katakana dakuten predicate
   *
   * @param c character to check
   * @return true if c is a hiragana dakuten and otherwise false
   */
  private boolean isKatakanaDakuten(char c) {
    return inside(c, k2d, '\u30ab') && c == lookupKatakanaDakuten(c);
  }

  Looks up a character in dakuten map and returns the dakuten variant if it exists.
Otherwise return the character being looked up itself
Params: c –      character to look up
map –    dakuten map
offset – code point offset from c
Returns: mapped character or c if no mapping exists/**
   * Looks up a character in dakuten map and returns the dakuten variant if it exists.
   * Otherwise return the character being looked up itself
   *
   * @param c      character to look up
   * @param map    dakuten map
   * @param offset code point offset from c
   * @return mapped character or c if no mapping exists
   */
  private char lookup(char c, char[] map, char offset) {
    if (!inside(c, map, offset)) {
      return c;
    } else {
      return map[c - offset];
    }
  }

  Predicate indicating if the lookup character is within dakuten map range
Params: c –      character to look up
map –    dakuten map
offset – code point offset from c
Returns: true if c is mapped by map and otherwise false/**
   * Predicate indicating if the lookup character is within dakuten map range
   *
   * @param c      character to look up
   * @param map    dakuten map
   * @param offset code point offset from c
   * @return true if c is mapped by map and otherwise false
   */
  private boolean inside(char c, char[] map, char offset) {
    return c >= offset && c < offset + map.length;
  }


  /**
   * Returns {@code currentOff} unchanged: {@link #read()} emits exactly one character for
   * every input character, so offsets need no adjustment.
   *
   * @param currentOff offset to correct
   * @return the same offset
   */
  @Override
  protected int correct(int currentOff) {
    return currentOff; // this filter doesn't change the length of strings
  }
}
Params:	c – iteration mark character to normalize
Throws:	IOException – If there is a low-level I/O error.
Returns:	normalized iteration mark
Params:	position – buffer position (should not exceed bufferPosition) spanSize – iteration mark span size
Throws:	IOException – If there is a low-level I/O error.
Returns:	source character
Params:	c – character to normalize m – repetition mark referring to c
Returns:	normalized character - return c on illegal iteration marks
Params:	c – hiragana character m – repetition mark referring to c
Returns:	normalized character - return c on illegal iteration marks
Params:	c – katakana character m – repetition mark referring to c
Returns:	normalized character - return c on illegal iteration marks
Params:	c – character to test
Returns:	true if c is an iteration mark character. Otherwise false.
Params:	c – character to look up
Returns:	hiragana dakuten variant of c or c itself if no dakuten variant exists
Params:	c – character to check
Returns:	true if c is a hiragana dakuten and otherwise false
Params:	c – character to look up map – dakuten map offset – code point offset from c
Returns:	mapped character or c if no mapping exists
/

org.apache.lucene/ lucene-analyzers-kuromoji/ 8.2.0/ org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java