/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util;


import java.io.DataInputStream;
import java.math.BigInteger;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Properties;

Methods for manipulating strings.
@lucene.internal
/** * Methods for manipulating strings. * * @lucene.internal */
public abstract class StringHelper {
Compares two BytesRef, element by element, and returns the number of elements common to both arrays (from the start of each). This method assumes currentTerm comes after priorTerm.
Params:
  • priorTerm – The first BytesRef to compare
  • currentTerm – The second BytesRef to compare
Returns:The number of common elements (from the start of each).
/** * Compares two {@link BytesRef}, element by element, and returns the * number of elements common to both arrays (from the start of each). * This method assumes currentTerm comes after priorTerm. * * @param priorTerm The first {@link BytesRef} to compare * @param currentTerm The second {@link BytesRef} to compare * @return The number of common elements (from the start of each). */
public static int bytesDifference(BytesRef priorTerm, BytesRef currentTerm) { int mismatch = FutureArrays.mismatch(priorTerm.bytes, priorTerm.offset, priorTerm.offset + priorTerm.length, currentTerm.bytes, currentTerm.offset, currentTerm.offset + currentTerm.length); if (mismatch < 0) { throw new IllegalArgumentException("terms out of order: priorTerm=" + priorTerm + ",currentTerm=" + currentTerm); } return mismatch; }
Returns the length of currentTerm needed for use as a sort key. so that BytesRef.compareTo(BytesRef) still returns the same result. This method assumes currentTerm comes after priorTerm.
/** * Returns the length of {@code currentTerm} needed for use as a sort key. * so that {@link BytesRef#compareTo(BytesRef)} still returns the same result. * This method assumes currentTerm comes after priorTerm. */
public static int sortKeyLength(final BytesRef priorTerm, final BytesRef currentTerm) { return bytesDifference(priorTerm, currentTerm) + 1; } private StringHelper() { }
Returns true iff the ref starts with the given prefix. Otherwise false.
Params:
  • ref – the byte[] to test
  • prefix – the expected prefix
Returns:Returns true iff the ref starts with the given prefix. Otherwise false.
/** * Returns <code>true</code> iff the ref starts with the given prefix. * Otherwise <code>false</code>. * * @param ref * the {@code byte[]} to test * @param prefix * the expected prefix * @return Returns <code>true</code> iff the ref starts with the given prefix. * Otherwise <code>false</code>. */
public static boolean startsWith(byte[] ref, BytesRef prefix) { // not long enough to start with the prefix if (ref.length < prefix.length) { return false; } return FutureArrays.equals(ref, 0, prefix.length, prefix.bytes, prefix.offset, prefix.offset + prefix.length); }
Returns true iff the ref starts with the given prefix. Otherwise false.
Params:
  • ref – the BytesRef to test
  • prefix – the expected prefix
Returns:Returns true iff the ref starts with the given prefix. Otherwise false.
/** * Returns <code>true</code> iff the ref starts with the given prefix. * Otherwise <code>false</code>. * * @param ref * the {@link BytesRef} to test * @param prefix * the expected prefix * @return Returns <code>true</code> iff the ref starts with the given prefix. * Otherwise <code>false</code>. */
public static boolean startsWith(BytesRef ref, BytesRef prefix) { // not long enough to start with the prefix if (ref.length < prefix.length) { return false; } return FutureArrays.equals(ref.bytes, ref.offset, ref.offset + prefix.length, prefix.bytes, prefix.offset, prefix.offset + prefix.length); }
Returns true iff the ref ends with the given suffix. Otherwise false.
Params:
  • ref – the BytesRef to test
  • suffix – the expected suffix
Returns:Returns true iff the ref ends with the given suffix. Otherwise false.
/** * Returns <code>true</code> iff the ref ends with the given suffix. Otherwise * <code>false</code>. * * @param ref * the {@link BytesRef} to test * @param suffix * the expected suffix * @return Returns <code>true</code> iff the ref ends with the given suffix. * Otherwise <code>false</code>. */
public static boolean endsWith(BytesRef ref, BytesRef suffix) { int startAt = ref.length - suffix.length; // not long enough to start with the suffix if (startAt < 0) { return false; } return FutureArrays.equals(ref.bytes, ref.offset + startAt, ref.offset + startAt + suffix.length, suffix.bytes, suffix.offset, suffix.offset + suffix.length); }
Pass this as the seed to murmurhash3_x86_32.
/** Pass this as the seed to {@link #murmurhash3_x86_32}. */
// Poached from Guava: set a different salt/seed // for each JVM instance, to frustrate hash key collision // denial of service attacks, and to catch any places that // somehow rely on hash function/order across JVM // instances: public static final int GOOD_FAST_HASH_SEED; static { String prop = System.getProperty("tests.seed"); if (prop != null) { // So if there is a test failure that relied on hash // order, we remain reproducible based on the test seed: GOOD_FAST_HASH_SEED = prop.hashCode(); } else { GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis(); } }
Returns the MurmurHash3_x86_32 hash. Original source/tests at https://github.com/yonik/java_util/
/** Returns the MurmurHash3_x86_32 hash. * Original source/tests at https://github.com/yonik/java_util/ */
@SuppressWarnings("fallthrough") public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) { final int c1 = 0xcc9e2d51; final int c2 = 0x1b873593; int h1 = seed; int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block for (int i=offset; i<roundedEnd; i+=4) { // little endian load order int k1 = (data[i] & 0xff) | ((data[i+1] & 0xff) << 8) | ((data[i+2] & 0xff) << 16) | (data[i+3] << 24); k1 *= c1; k1 = Integer.rotateLeft(k1, 15); k1 *= c2; h1 ^= k1; h1 = Integer.rotateLeft(h1, 13); h1 = h1*5+0xe6546b64; } // tail int k1 = 0; switch(len & 0x03) { case 3: k1 = (data[roundedEnd + 2] & 0xff) << 16; // fallthrough case 2: k1 |= (data[roundedEnd + 1] & 0xff) << 8; // fallthrough case 1: k1 |= (data[roundedEnd] & 0xff); k1 *= c1; k1 = Integer.rotateLeft(k1, 15); k1 *= c2; h1 ^= k1; } // finalization h1 ^= len; // fmix(h1); h1 ^= h1 >>> 16; h1 *= 0x85ebca6b; h1 ^= h1 >>> 13; h1 *= 0xc2b2ae35; h1 ^= h1 >>> 16; return h1; } public static int murmurhash3_x86_32(BytesRef bytes, int seed) { return murmurhash3_x86_32(bytes.bytes, bytes.offset, bytes.length, seed); } // Holds 128 bit unsigned value: private static BigInteger nextId; private static final BigInteger mask128; private static final Object idLock = new Object(); static { // 128 bit unsigned mask byte[] maskBytes128 = new byte[16]; Arrays.fill(maskBytes128, (byte) 0xff); mask128 = new BigInteger(1, maskBytes128); String prop = System.getProperty("tests.seed"); // State for xorshift128: long x0; long x1; if (prop != null) { // So if there is a test failure that somehow relied on this id, // we remain reproducible based on the test seed: if (prop.length() > 8) { prop = prop.substring(prop.length()-8); } x0 = Long.parseLong(prop, 16); x1 = x0; } else { // seed from /dev/urandom, if its available try (DataInputStream is = new DataInputStream(Files.newInputStream(Paths.get("/dev/urandom")))) { x0 = is.readLong(); x1 = is.readLong(); } catch (Exception unavailable) { // may not be available on this platform // fall back to lower quality randomness from 3 different sources: x0 = System.nanoTime(); x1 = StringHelper.class.hashCode() << 32; StringBuilder sb = new StringBuilder(); // Properties can vary across JVM instances: try { Properties p = System.getProperties(); for (String s: p.stringPropertyNames()) { sb.append(s); sb.append(p.getProperty(s)); } x1 |= sb.toString().hashCode(); } catch (SecurityException notallowed) { // getting Properties requires wildcard read-write: may not be allowed x1 |= StringBuffer.class.hashCode(); } } } // Use a few iterations of xorshift128 to scatter the seed // in case multiple Lucene instances starting up "near" the same // nanoTime, since we use ++ (mod 2^128) for full period cycle: for(int i=0;i<10;i++) { long s1 = x0; long s0 = x1; x0 = s0; s1 ^= s1 << 23; // a x1 = s1 ^ s0 ^ (s1 >>> 17) ^ (s0 >>> 26); // b, c } // 64-bit unsigned mask byte[] maskBytes64 = new byte[8]; Arrays.fill(maskBytes64, (byte) 0xff); BigInteger mask64 = new BigInteger(1, maskBytes64); // First make unsigned versions of x0, x1: BigInteger unsignedX0 = BigInteger.valueOf(x0).and(mask64); BigInteger unsignedX1 = BigInteger.valueOf(x1).and(mask64); // Concatentate bits of x0 and x1, as unsigned 128 bit integer: nextId = unsignedX0.shiftLeft(64).or(unsignedX1); }
length in bytes of an ID
/** length in bytes of an ID */
public static final int ID_LENGTH = 16;
Generates a non-cryptographic globally unique id.
/** Generates a non-cryptographic globally unique id. */
public static byte[] randomId() { // NOTE: we don't use Java's UUID.randomUUID() implementation here because: // // * It's overkill for our usage: it tries to be cryptographically // secure, whereas for this use we don't care if someone can // guess the IDs. // // * It uses SecureRandom, which on Linux can easily take a long time // (I saw ~ 10 seconds just running a Lucene test) when entropy // harvesting is falling behind. // // * It loses a few (6) bits to version and variant and it's not clear // what impact that has on the period, whereas the simple ++ (mod 2^128) // we use here is guaranteed to have the full period. byte bits[]; synchronized(idLock) { bits = nextId.toByteArray(); nextId = nextId.add(BigInteger.ONE).and(mask128); } // toByteArray() always returns a sign bit, so it may require an extra byte (always zero) if (bits.length > ID_LENGTH) { assert bits.length == ID_LENGTH + 1; assert bits[0] == 0; return ArrayUtil.copyOfSubArray(bits, 1, bits.length); } else { byte[] result = new byte[ID_LENGTH]; System.arraycopy(bits, 0, result, result.length - bits.length, bits.length); return result; } }
Helper method to render an ID as a string, for debugging

Returns the string (null) if the id is null. Otherwise, returns a string representation for debugging. Never throws an exception. The returned string may indicate if the id is definitely invalid.

/** * Helper method to render an ID as a string, for debugging * <p> * Returns the string {@code (null)} if the id is null. * Otherwise, returns a string representation for debugging. * Never throws an exception. The returned string may * indicate if the id is definitely invalid. */
public static String idToString(byte id[]) { if (id == null) { return "(null)"; } else { StringBuilder sb = new StringBuilder(); sb.append(new BigInteger(1, id).toString(Character.MAX_RADIX)); if (id.length != ID_LENGTH) { sb.append(" (INVALID FORMAT)"); } return sb.toString(); } }
Just converts each int in the incoming IntsRef to each byte in the returned BytesRef, throwing IllegalArgumentException if any int value is out of bounds for a byte.
/** Just converts each int in the incoming {@link IntsRef} to each byte * in the returned {@link BytesRef}, throwing {@code IllegalArgumentException} * if any int value is out of bounds for a byte. */
public static BytesRef intsRefToBytesRef(IntsRef ints) { byte[] bytes = new byte[ints.length]; for(int i=0;i<ints.length;i++) { int x = ints.ints[ints.offset+i]; if (x < 0 || x > 255) { throw new IllegalArgumentException("int at pos=" + i + " with value=" + x + " is out-of-bounds for byte"); } bytes[i] = (byte) x; } return new BytesRef(bytes); } }