/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis;


import java.io.IOException;
import java.io.Reader;

Utility class to write tokenizers or token filters.
@lucene.internal
/** * Utility class to write tokenizers or token filters. * @lucene.internal */
public final class CharacterUtils { private CharacterUtils() {} // no instantiation
Creates a new CharacterBuffer and allocates a char[] of the given bufferSize.
Params:
  • bufferSize – the internal char buffer size, must be >= 2
Returns:a new CharacterBuffer instance.
/** * Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code> * of the given bufferSize. * * @param bufferSize * the internal char buffer size, must be <code>&gt;= 2</code> * @return a new {@link CharacterBuffer} instance. */
public static CharacterBuffer newCharacterBuffer(final int bufferSize) { if (bufferSize < 2) { throw new IllegalArgumentException("buffersize must be >= 2"); } return new CharacterBuffer(new char[bufferSize], 0, 0); }
Converts each unicode codepoint to lowerCase via Character.toLowerCase(int) starting at the given offset.
Params:
  • buffer – the char buffer to lowercase
  • offset – the offset to start at
  • limit – the max char in the buffer to lower case
/** * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting * at the given offset. * @param buffer the char buffer to lowercase * @param offset the offset to start at * @param limit the max char in the buffer to lower case */
public static void toLowerCase(final char[] buffer, final int offset, final int limit) { assert buffer.length >= limit; assert offset <=0 && offset <= buffer.length; for (int i = offset; i < limit;) { i += Character.toChars( Character.toLowerCase( Character.codePointAt(buffer, i, limit)), buffer, i); } }
Converts each unicode codepoint to UpperCase via Character.toUpperCase(int) starting at the given offset.
Params:
  • buffer – the char buffer to UPPERCASE
  • offset – the offset to start at
  • limit – the max char in the buffer to lower case
/** * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)} starting * at the given offset. * @param buffer the char buffer to UPPERCASE * @param offset the offset to start at * @param limit the max char in the buffer to lower case */
public static void toUpperCase(final char[] buffer, final int offset, final int limit) { assert buffer.length >= limit; assert offset <=0 && offset <= buffer.length; for (int i = offset; i < limit;) { i += Character.toChars( Character.toUpperCase( Character.codePointAt(buffer, i, limit)), buffer, i); } }
Converts a sequence of Java characters to a sequence of unicode code points. @return the number of code points written to the destination buffer
/** Converts a sequence of Java characters to a sequence of unicode code points. * @return the number of code points written to the destination buffer */
public static int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) { if (srcLen < 0) { throw new IllegalArgumentException("srcLen must be >= 0"); } int codePointCount = 0; for (int i = 0; i < srcLen; ) { final int cp = Character.codePointAt(src, srcOff + i, srcOff + srcLen); final int charCount = Character.charCount(cp); dest[destOff + codePointCount++] = cp; i += charCount; } return codePointCount; }
Converts a sequence of unicode code points to a sequence of Java characters. @return the number of chars written to the destination buffer
/** Converts a sequence of unicode code points to a sequence of Java characters. * @return the number of chars written to the destination buffer */
public static int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) { if (srcLen < 0) { throw new IllegalArgumentException("srcLen must be >= 0"); } int written = 0; for (int i = 0; i < srcLen; ++i) { written += Character.toChars(src[srcOff + i], dest, destOff + written); } return written; }
Fills the CharacterBuffer with characters read from the given reader Reader. This method tries to read numChars characters into the CharacterBuffer, each call to fill will start filling the buffer from offset 0 up to numChars. In case code points can span across 2 java characters, this method may only fill numChars - 1 characters in order not to split in the middle of a surrogate pair, even if there are remaining characters in the Reader.

This method guarantees that the given CharacterBuffer will never contain a high surrogate character as the last element in the buffer unless it is the last available character in the reader. In other words, high and low surrogate pairs will always be preserved across buffer boarders.

A return value of false means that this method call exhausted the reader, but there may be some bytes which have been read, which can be verified by checking whether buffer.getLength() > 0.

Params:
  • buffer – the buffer to fill.
  • reader – the reader to read characters from.
  • numChars – the number of chars to read
Throws:
Returns:false if and only if reader.read returned -1 while trying to fill the buffer
/** * Fills the {@link CharacterBuffer} with characters read from the given * reader {@link Reader}. This method tries to read <code>numChars</code> * characters into the {@link CharacterBuffer}, each call to fill will start * filling the buffer from offset <code>0</code> up to <code>numChars</code>. * In case code points can span across 2 java characters, this method may * only fill <code>numChars - 1</code> characters in order not to split in * the middle of a surrogate pair, even if there are remaining characters in * the {@link Reader}. * <p> * This method guarantees * that the given {@link CharacterBuffer} will never contain a high surrogate * character as the last element in the buffer unless it is the last available * character in the reader. In other words, high and low surrogate pairs will * always be preserved across buffer boarders. * </p> * <p> * A return value of <code>false</code> means that this method call exhausted * the reader, but there may be some bytes which have been read, which can be * verified by checking whether <code>buffer.getLength() &gt; 0</code>. * </p> * * @param buffer * the buffer to fill. * @param reader * the reader to read characters from. * @param numChars * the number of chars to read * @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer * @throws IOException * if the reader throws an {@link IOException}. */
public static boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException { assert buffer.buffer.length >= 2; if (numChars < 2 || numChars > buffer.buffer.length) { throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size"); } final char[] charBuffer = buffer.buffer; buffer.offset = 0; final int offset; // Install the previously saved ending high surrogate: if (buffer.lastTrailingHighSurrogate != 0) { charBuffer[0] = buffer.lastTrailingHighSurrogate; buffer.lastTrailingHighSurrogate = 0; offset = 1; } else { offset = 0; } final int read = readFully(reader, charBuffer, offset, numChars - offset); buffer.length = offset + read; final boolean result = buffer.length == numChars; if (buffer.length < numChars) { // We failed to fill the buffer. Even if the last char is a high // surrogate, there is nothing we can do return result; } if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) { buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length]; } return result; }
Convenience method which calls fill(buffer, reader, buffer.buffer.length).
/** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
public static boolean fill(CharacterBuffer buffer, Reader reader) throws IOException { return fill(buffer, reader, buffer.buffer.length); } static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException { int read = 0; while (read < len) { final int r = reader.read(dest, offset + read, len - read); if (r == -1) { break; } read += r; } return read; }
A simple IO buffer to use with CharacterUtils.fill(CharacterBuffer, Reader).
/** * A simple IO buffer to use with * {@link CharacterUtils#fill(CharacterBuffer, Reader)}. */
public static final class CharacterBuffer { private final char[] buffer; private int offset; private int length; // NOTE: not private so outer class can access without // $access methods: char lastTrailingHighSurrogate; CharacterBuffer(char[] buffer, int offset, int length) { this.buffer = buffer; this.offset = offset; this.length = length; }
Returns the internal buffer
Returns:the buffer
/** * Returns the internal buffer * * @return the buffer */
public char[] getBuffer() { return buffer; }
Returns the data offset in the internal buffer.
Returns:the offset
/** * Returns the data offset in the internal buffer. * * @return the offset */
public int getOffset() { return offset; }
Return the length of the data in the internal buffer starting at getOffset()
Returns:the length
/** * Return the length of the data in the internal buffer starting at * {@link #getOffset()} * * @return the length */
public int getLength() { return length; }
Resets the CharacterBuffer. All internals are reset to its default values.
/** * Resets the CharacterBuffer. All internals are reset to its default * values. */
public void reset() { offset = 0; length = 0; lastTrailingHighSurrogate = 0; } } }