/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.util;


import java.io.IOException;
import java.io.Reader;
import java.text.BreakIterator;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeFactory;

Breaks text into sentences with a BreakIterator and allows subclasses to decompose these sentences into words.

This can be used by subclasses that need sentence context for tokenization purposes, such as CJK segmenters.

Additionally, it can be used by subclasses that want to mark sentence boundaries (with a custom attribute, extra token, position increment, etc.) for downstream processing.

@lucene.experimental
/**
 * Splits incoming text into sentences using a {@link BreakIterator} and lets
 * subclasses break each sentence down into words.
 * <p>
 * Useful for subclasses whose tokenization needs sentence context, such as
 * CJK segmenters.
 * <p>
 * It can also serve subclasses that want to flag sentence boundaries (with a
 * custom attribute, an extra token, a position increment, etc.) for
 * downstream processing.
 *
 * @lucene.experimental
 */
public abstract class SegmentingTokenizerBase extends Tokenizer {
  /** Capacity, in chars, of {@link #buffer}. */
  protected static final int BUFFERMAX = 1024;

  /** Working buffer holding the chunk of input that is currently being segmented. */
  protected final char[] buffer = new char[BUFFERMAX];
true length of text in the buffer
/**
 * True length of the text in {@link #buffer}: the chars carried over from the
 * previous fill plus the chars newly read by {@code refill()}. Zero after
 * {@link #reset()}.
 */
private int length = 0;
length in buffer that can be evaluated safely, up to a safe end point
/**
 * Length of the buffer prefix that can be evaluated safely, up to a safe end
 * point. {@code refill()} sets it to the position found by
 * {@code findSafeEnd()}, or to {@link #length} when the reader returned fewer
 * chars than requested or no safe end point exists in the buffer.
 */
private int usableLength = 0;
accumulated offset of previous buffers for this reader, for offsetAtt
/**
 * Offset accumulated from the previous buffers of this reader, used for
 * offsetAtt. Each refill advances it by the usable length of the buffer that
 * was just consumed.
 */
protected int offset = 0;

/** Sentence break iterator supplied to the constructor. */
private final BreakIterator iterator;

/** Character iterator over {@link #buffer}; it is handed to {@link #iterator} as its text. */
private final CharArrayIterator wrapper = CharArrayIterator.newSentenceInstance();

/** Offset attribute of this stream; {@link #end()} stores the final offset in it. */
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
Construct a new SegmentingTokenizerBase, using the provided BreakIterator for sentence segmentation.

Note that you should never share BreakIterators across different TokenStreams; instead, a newly created or cloned one should always be provided to this constructor.

/**
 * Creates a new SegmentingTokenizerBase that segments sentences with the
 * given BreakIterator and uses the default token attribute factory.
 * <p>
 * BreakIterators must never be shared across different TokenStreams; always
 * pass a newly created or cloned instance to this constructor.
 *
 * @param iterator the BreakIterator used for sentence segmentation
 */
public SegmentingTokenizerBase(BreakIterator iterator) {
  this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, iterator);
}
Construct a new SegmentingTokenizerBase, also supplying the AttributeFactory.
/**
 * Creates a new SegmentingTokenizerBase, also supplying the AttributeFactory.
 *
 * @param factory  the attribute factory passed to the superclass constructor
 * @param iterator the BreakIterator used for sentence segmentation
 */
public SegmentingTokenizerBase(AttributeFactory factory, BreakIterator iterator) {
  super(factory);
  this.iterator = iterator;
}

/**
 * Advances to the next token: first tries the next word of the current
 * sentence, then the next sentence of the current buffer, and refills the
 * buffer whenever both are exhausted.
 *
 * @return true if a token was produced, false once the input is exhausted
 */
@Override
public final boolean incrementToken() throws IOException {
  // Fast path: the buffer holds text and the current sentence still has a word.
  // incrementWord() is only consulted when the buffer is non-empty.
  if (length != 0 && incrementWord()) {
    return true;
  }

  while (!incrementSentence()) {
    refill();
    if (length <= 0) {
      // no more chars to read
      return false;
    }
  }
  return true;
}

/**
 * Resets the tokenizer: points the wrapper and the break iterator at an empty
 * range of the buffer and zeroes all buffer bookkeeping.
 */
@Override
public void reset() throws IOException {
  super.reset();
  wrapper.setText(buffer, 0, 0);
  iterator.setText(wrapper);
  offset = 0;
  usableLength = 0;
  length = 0;
}

/**
 * Sets both ends of the offset attribute to the corrected final offset, which
 * is the accumulated offset plus whatever text is still held in the buffer.
 */
@Override
public final void end() throws IOException {
  super.end();
  final int consumed = length < 0 ? offset : offset + length;
  final int finalOffset = correctOffset(consumed);
  offsetAtt.setOffset(finalOffset, finalOffset);
}
Returns the last unambiguous break position in the text.
/**
 * Scans the buffered text backwards for the last unambiguous break position.
 *
 * @return the index just past the last char accepted by
 *         {@link #isSafeEnd(char)}, or -1 if the buffered text contains none
 */
private int findSafeEnd() {
  int end = length;
  while (end > 0) {
    if (isSafeEnd(buffer[end - 1])) {
      return end;
    }
    end--;
  }
  return -1;
}
For sentence tokenization, these are the unambiguous break positions.
/**
 * Tells whether a char is an unambiguous break position for sentence
 * tokenization. Subclasses may override this to accept other chars.
 *
 * @param ch the char to test
 * @return true for U+000A, U+000D, U+0085, U+2028 and U+2029; false otherwise
 */
protected boolean isSafeEnd(char ch) {
  return ch == 0x000A
      || ch == 0x000D
      || ch == 0x0085
      || ch == 0x2028
      || ch == 0x2029;
}
Refill the buffer, accumulating the offset and setting usableLength to the last unambiguous break position
/**
 * Refills the buffer: accumulates the offset, moves the unprocessed tail to
 * the front, reads more input behind it, and sets usableLength to the last
 * unambiguous break position.
 */
private void refill() throws IOException {
  // The usable prefix of the previous buffer has been consumed.
  offset += usableLength;

  // Slide the not-yet-usable tail to the start of the buffer.
  final int carried = length - usableLength;
  System.arraycopy(buffer, usableLength, buffer, 0, carried);

  // Top the buffer up behind the carried-over chars.
  final int wanted = buffer.length - carried;
  final int got = read(input, buffer, carried, wanted);
  if (got < 0) {
    length = carried;
  } else {
    length = got + carried;
  }

  if (got < wanted) {
    // reader has been emptied, process the rest
    usableLength = length;
  } else {
    // still more data to be read, find a safe-stopping place
    final int safeEnd = findSafeEnd();
    if (safeEnd < 0) {
      // A full buffer of text without any safe break: use all of it,
      // possibly truncating tokens at the buffer edge.
      usableLength = length;
    } else {
      usableLength = safeEnd;
    }
  }

  wrapper.setText(buffer, 0, Math.max(0, usableLength));
  iterator.setText(wrapper);
}

// TODO: refactor to a shared readFully somewhere
// (NGramTokenizer does this too):
commons-io's readFully, but without bugs if offset != 0
/** commons-io's readFully, but without bugs if offset != 0 */
private static int read(Reader input, char[] buffer, int offset, int length) throws IOException { assert length >= 0 : "length must not be negative: " + length; int remaining = length; while (remaining > 0) { int location = length - remaining; int count = input.read(buffer, offset + location, remaining); if (-1 == count) { // EOF break; } remaining -= count; } return length - remaining; }
Returns true if there is a token from the buffer, or false if it is exhausted.
/** * return true if there is a token from the buffer, or null if it is * exhausted. */
private boolean incrementSentence() throws IOException { if (length == 0) // we must refill the buffer return false; while (true) { int start = iterator.current(); if (start == BreakIterator.DONE) return false; // BreakIterator exhausted // find the next set of boundaries int end = iterator.next(); if (end == BreakIterator.DONE) return false; // BreakIterator exhausted setNextSentence(start, end); if (incrementWord()) { return true; } } }
Provides the next input sentence for analysis
/**
 * Provides the next input sentence for analysis. It is called with each pair
 * of consecutive boundaries reported by the sentence BreakIterator, right
 * before {@link #incrementWord()} is asked for that sentence's first word.
 *
 * @param sentenceStart the boundary at which the sentence starts
 *        (presumably an index into {@link #buffer} -- confirm against CharArrayIterator)
 * @param sentenceEnd the boundary at which the sentence ends
 */
protected abstract void setNextSentence(int sentenceStart, int sentenceEnd);
Returns true if another word is available
/**
 * Returns true if another word is available. When this returns false the
 * tokenizer moves on to the next sentence, refilling the buffer if needed.
 *
 * @return true if a word was produced, false if there are no more words to
 *         return for now
 */
protected abstract boolean incrementWord(); }