/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.icu.segmentation;

import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

Wraps RuleBasedBreakIterator, making object reuse convenient and emitting a rule status for emoji sequences.
@lucene.experimental
/** * Wraps RuleBasedBreakIterator, making object reuse convenient and * emitting a rule status for emoji sequences. * @lucene.experimental */
final class BreakIteratorWrapper { private final CharArrayIterator textIterator = new CharArrayIterator(); private final RuleBasedBreakIterator rbbi; private char text[]; private int start; private int status; BreakIteratorWrapper(RuleBasedBreakIterator rbbi) { this.rbbi = rbbi; } int current() { return rbbi.current(); } int getRuleStatus() { return status; } int next() { int current = rbbi.current(); int next = rbbi.next(); status = calcStatus(current, next); return next; }
Returns current rule status for the text between breaks. (determines token type)
/** Returns current rule status for the text between breaks. (determines token type) */
private int calcStatus(int current, int next) { // to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing. // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i= if (next != BreakIterator.DONE && isEmoji(current, next)) { return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS; } else { return rbbi.getRuleStatus(); } } // See unicode doc L2/16-315 for rationale. // basically for us the ambiguous cases (keycap/etc) as far as types go. static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze(); // faster than doing hasBinaryProperty() checks, at the cost of 1KB ram static final UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:][:Extended_Pictographic:]]").freeze();
Returns true if the current text represents emoji character or sequence
/** Returns true if the current text represents emoji character or sequence */
private boolean isEmoji(int current, int next) { int begin = start + current; int end = start + next; int codepoint = UTF16.charAt(text, 0, end, begin); if (EMOJI.contains(codepoint)) { if (EMOJI_RK.contains(codepoint)) { // if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence, // an emoji presentation selector or keycap follows. int trailer = begin + Character.charCount(codepoint); return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3); } else { return true; } } return false; } void setText(char text[], int start, int length) { this.text = text; this.start = start; textIterator.setText(text, start, length); rbbi.setText(textIterator); status = RuleBasedBreakIterator.WORD_NONE; } }