// org.apache.lucene/lucene-analyzers-icu/8.2.0 : org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
//
// ICUNormalizer2CharFilter
// http://lucene.apache.org/lucene-parent/lucene-analyzers-icu: Provides integration with ICU (International Components for Unicode) for stronger Unicode and internationalization support. (The Apache Software Foundation)
// Apache 2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.icu;


import java.io.IOException;
import java.io.Reader;
import java.util.Objects;

import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.charfilter.BaseCharFilter;

import com.ibm.icu.text.Normalizer2;

/**
 * Normalize token text with ICU's {@link Normalizer2}.
 */
public final class ICUNormalizer2CharFilter extends BaseCharFilter {

  private final Normalizer2 normalizer;
  private final StringBuilder inputBuffer = new StringBuilder();
  private final StringBuilder resultBuffer = new StringBuilder();

  private boolean inputFinished;
  private boolean afterQuickCheckYes;
  private int checkedInputBoundary;
  private int charCount;


  Create a new Normalizer2CharFilter that combines NFKC normalization, Case
Folding, and removes Default Ignorables (NFKC_Casefold)
/**
   * Create a new Normalizer2CharFilter that combines NFKC normalization, Case
   * Folding, and removes Default Ignorables (NFKC_Casefold)
   */
  public ICUNormalizer2CharFilter(Reader in) {
    this(in, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
  }

  Create a new Normalizer2CharFilter with the specified Normalizer2
Params: in – text
normalizer – normalizer to use/**
   * Create a new Normalizer2CharFilter with the specified Normalizer2
   * @param in text
   * @param normalizer normalizer to use
   */
  public ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer) {
    this(in, normalizer, 128);
  }
  
  // for testing ONLY
  ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) {
    super(in);
    this.normalizer = Objects.requireNonNull(normalizer);
    this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize);
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    if (off < 0) throw new IllegalArgumentException("off < 0");
    if (off >= cbuf.length) throw new IllegalArgumentException("off >= cbuf.length");
    if (len <= 0) throw new IllegalArgumentException("len <= 0");

    while (!inputFinished || inputBuffer.length() > 0 || resultBuffer.length() > 0) {
      int retLen;

      if (resultBuffer.length() > 0) {
        retLen = outputFromResultBuffer(cbuf, off, len);
        if (retLen > 0) {
          return retLen;
        }
      }

      int resLen = readAndNormalizeFromInput();
      if (resLen > 0) {
        retLen = outputFromResultBuffer(cbuf, off, len);
        if (retLen > 0) {
          return retLen;
        }
      }

      readInputToBuffer();
    }

    return -1;
  }

  private final CharacterUtils.CharacterBuffer tmpBuffer;

  private void readInputToBuffer() throws IOException {
    while (true) {
      // CharacterUtils.fill is supplementary char aware
      final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input);

      assert tmpBuffer.getOffset() == 0;
      inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());

      if (hasRemainingChars == false) {
        inputFinished = true;
        break;
      }

      final int lastCodePoint = Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength(), 0);
      if (normalizer.isInert(lastCodePoint)) {
        // we require an inert char so that we can normalize content before and
        // after this character independently
        break;
      }
    }

    // if checkedInputBoundary was at the end of a buffer, we need to check that char again
    checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
  }

  private int readAndNormalizeFromInput() {
    if (inputBuffer.length() <= 0) {
      afterQuickCheckYes = false;
      return 0;
    }
    if (!afterQuickCheckYes) {
      int resLen = readFromInputWhileSpanQuickCheckYes();
      afterQuickCheckYes = true;
      if (resLen > 0) return resLen;
    }
    int resLen = readFromIoNormalizeUptoBoundary();
    if(resLen > 0){
      afterQuickCheckYes = false;
    }
    return resLen;
  }

  private int readFromInputWhileSpanQuickCheckYes() {
    int end = normalizer.spanQuickCheckYes(inputBuffer);
    if (end > 0) {
      resultBuffer.append(inputBuffer.subSequence(0, end));
      inputBuffer.delete(0, end);
      checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
      charCount += end;
    }
    return end;
  }

  private int readFromIoNormalizeUptoBoundary() {
    // if there's no buffer to normalize, return 0
    if (inputBuffer.length() <= 0) {
      return 0;
    }

    boolean foundBoundary = false;
    final int bufLen = inputBuffer.length();

    while (checkedInputBoundary <= bufLen - 1) {
      int charLen = Character.charCount(inputBuffer.codePointAt(checkedInputBoundary));
      checkedInputBoundary += charLen;
      if (checkedInputBoundary < bufLen && normalizer.hasBoundaryBefore(inputBuffer
        .codePointAt(checkedInputBoundary))) {
        foundBoundary = true;
        break;
      }
    }
    if (!foundBoundary && checkedInputBoundary >= bufLen && inputFinished) {
      foundBoundary = true;
      checkedInputBoundary = bufLen;
    }

    if (!foundBoundary) {
      return 0;
    }

    return normalizeInputUpto(checkedInputBoundary);
  }

  private int normalizeInputUpto(final int length) {
    final int destOrigLen = resultBuffer.length();
    normalizer.normalizeSecondAndAppend(resultBuffer,
      inputBuffer.subSequence(0, length));
    inputBuffer.delete(0, length);
    checkedInputBoundary = Math.max(checkedInputBoundary - length, 0);
    final int resultLength = resultBuffer.length() - destOrigLen;
    recordOffsetDiff(length, resultLength);
    return resultLength;
  }

  private void recordOffsetDiff(int inputLength, int outputLength) {
    if (inputLength == outputLength) {
      charCount += outputLength;
      return;
    }
    final int diff = inputLength - outputLength;
    final int cumuDiff = getLastCumulativeDiff();
    if (diff < 0) {
      for (int i = 1;  i <= -diff; ++i) {
        addOffCorrectMap(charCount + i, cumuDiff - i);
      }
    } else {
      addOffCorrectMap(charCount + outputLength, cumuDiff + diff);
    }
    charCount += outputLength;
  }

  private int outputFromResultBuffer(char[] cbuf, int begin, int len) {
    len = Math.min(resultBuffer.length(), len);
    resultBuffer.getChars(0, len, cbuf, begin);
    if (len > 0) {
      resultBuffer.delete(0, len);
    }
    return len;
  }
}
// org.apache.lucene/lucene-analyzers-icu/8.2.0/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java