/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.icu;


import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.ibm.icu.text.Replaceable;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

A TokenFilter that transforms text with ICU.

ICU provides text-transformation functionality via its Transliteration API. Although script conversion is its most common use, a Transliterator can actually perform a more general class of tasks. In fact, Transliterator defines a very general API which specifies only that a segment of the input text is replaced by new text. The particulars of this conversion are determined entirely by subclasses of Transliterator.

Some useful transformations for search are built-in:

  • Conversion from Traditional to Simplified Chinese characters
  • Conversion from Hiragana to Katakana
  • Conversion from Fullwidth to Halfwidth forms.
  • Script conversions, for example Serbian Cyrillic to Latin

Example usage:

stream = new ICUTransformFilter(stream, Transliterator.getInstance("Traditional-Simplified"));

For more details, see the ICU User Guide.
/** * A {@link TokenFilter} that transforms text with ICU. * <p> * ICU provides text-transformation functionality via its Transliteration API. * Although script conversion is its most common use, a Transliterator can * actually perform a more general class of tasks. In fact, Transliterator * defines a very general API which specifies only that a segment of the input * text is replaced by new text. The particulars of this conversion are * determined entirely by subclasses of Transliterator. * </p> * <p> * Some useful transformations for search are built-in: * <ul> * <li>Conversion from Traditional to Simplified Chinese characters * <li>Conversion from Hiragana to Katakana * <li>Conversion from Fullwidth to Halfwidth forms. * <li>Script conversions, for example Serbian Cyrillic to Latin * </ul> * <p> * Example usage: <blockquote>stream = new ICUTransformFilter(stream, * Transliterator.getInstance("Traditional-Simplified"));</blockquote> * <br> * For more details, see the <a * href="http://userguide.icu-project.org/transforms/general">ICU User * Guide</a>. */
public final class ICUTransformFilter extends TokenFilter { // Transliterator to transform the text private final Transliterator transform; // Reusable position object private final Transliterator.Position position = new Transliterator.Position(); // term attribute, will be updated with transformed text. private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); // Wraps a termAttribute around the replaceable interface. private final ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute();
Create a new ICUTransformFilter that transforms text on the given stream.
Params:
  • input – TokenStream to filter.
  • transform – Transliterator to transform the text.
/** * Create a new ICUTransformFilter that transforms text on the given stream. * * @param input {@link TokenStream} to filter. * @param transform Transliterator to transform the text. */
public ICUTransformFilter(TokenStream input, Transliterator transform) { super(input); this.transform = transform; /* * This is cheating, but speeds things up a lot. * If we wanted to use pkg-private APIs we could probably do better. */ if (transform.getFilter() == null && transform instanceof com.ibm.icu.text.RuleBasedTransliterator) { final UnicodeSet sourceSet = transform.getSourceSet(); if (sourceSet != null && !sourceSet.isEmpty()) transform.setFilter(sourceSet); } } @Override public boolean incrementToken() throws IOException { /* * Wrap around replaceable. clear the positions, and transliterate. */ if (input.incrementToken()) { replaceableAttribute.setText(termAtt); final int length = termAtt.length(); position.start = 0; position.limit = length; position.contextStart = 0; position.contextLimit = length; transform.filteredTransliterate(replaceableAttribute, position, false); return true; } else { return false; } }
Wrap a CharTermAttribute with the Replaceable API.
/** * Wrap a {@link CharTermAttribute} with the Replaceable API. */
static final class ReplaceableTermAttribute implements Replaceable { private char buffer[]; private int length; private CharTermAttribute token; void setText(final CharTermAttribute token) { this.token = token; this.buffer = token.buffer(); this.length = token.length(); } @Override public int char32At(int pos) { return UTF16.charAt(buffer, 0, length, pos); } @Override public char charAt(int pos) { return buffer[pos]; } @Override public void copy(int start, int limit, int dest) { char text[] = new char[limit - start]; getChars(start, limit, text, 0); replace(dest, dest, text, 0, limit - start); } @Override public void getChars(int srcStart, int srcLimit, char[] dst, int dstStart) { System.arraycopy(buffer, srcStart, dst, dstStart, srcLimit - srcStart); } @Override public boolean hasMetaData() { return false; } @Override public int length() { return length; } @Override public void replace(int start, int limit, String text) { final int charsLen = text.length(); final int newLength = shiftForReplace(start, limit, charsLen); // insert the replacement text text.getChars(0, charsLen, buffer, start); token.setLength(length = newLength); } @Override public void replace(int start, int limit, char[] text, int charsStart, int charsLen) { // shift text if necessary for the replacement final int newLength = shiftForReplace(start, limit, charsLen); // insert the replacement text System.arraycopy(text, charsStart, buffer, start, charsLen); token.setLength(length = newLength); }
shift text (if necessary) for a replacement operation
/** shift text (if necessary) for a replacement operation */
private int shiftForReplace(int start, int limit, int charsLen) { final int replacementLength = limit - start; final int newLength = length - replacementLength + charsLen; // resize if necessary if (newLength > length) buffer = token.resizeBuffer(newLength); // if the substring being replaced is longer or shorter than the // replacement, need to shift things around if (replacementLength != charsLen && limit < length) System.arraycopy(buffer, limit, buffer, start + charsLen, length - limit); return newLength; } } }