package org.apache.lucene.codecs.simpletext;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.function.IntFunction;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.MutablePointValues;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.TrackingDirectoryWrapper;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.FutureArrays;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.bkd.BKDRadixSelector;
import org.apache.lucene.util.bkd.BKDWriter;
import org.apache.lucene.util.bkd.HeapPointWriter;
import org.apache.lucene.util.bkd.MutablePointsReaderUtils;
import org.apache.lucene.util.bkd.OfflinePointWriter;
import org.apache.lucene.util.bkd.PointReader;
import org.apache.lucene.util.bkd.PointValue;
import org.apache.lucene.util.bkd.PointWriter;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_COUNT;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_DOC_ID;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_FP;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_VALUE;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BYTES_PER_DIM;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.DOC_COUNT;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.INDEX_COUNT;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.MAX_LEAF_POINTS;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.MAX_VALUE;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.MIN_VALUE;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.NUM_DATA_DIMS;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.NUM_INDEX_DIMS;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.POINT_COUNT;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.SPLIT_COUNT;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.SPLIT_DIM;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.SPLIT_VALUE;
final class SimpleTextBKDWriter implements Closeable {
public static final String CODEC_NAME = "BKD";
public static final int VERSION_START = 0;
public static final int VERSION_COMPRESSED_DOC_IDS = 1;
public static final int VERSION_COMPRESSED_VALUES = 2;
public static final int VERSION_IMPLICIT_SPLIT_DIM_1D = 3;
public static final int VERSION_CURRENT = VERSION_IMPLICIT_SPLIT_DIM_1D;
private final int bytesPerDoc;
public static final int DEFAULT_MAX_POINTS_IN_LEAF_NODE = 1024;
public static final float DEFAULT_MAX_MB_SORT_IN_HEAP = 16.0f;
public static final int MAX_DIMS = 8;
protected final int numDataDims;
protected final int numIndexDims;
protected final int bytesPerDim;
protected final int packedBytesLength;
protected final int packedIndexBytesLength;
final BytesRefBuilder scratch = new BytesRefBuilder();
final TrackingDirectoryWrapper tempDir;
final String tempFileNamePrefix;
final double maxMBSortInHeap;
final byte[] scratchDiff;
final byte[] scratch1;
final byte[] scratch2;
final BytesRef scratchBytesRef1 = new BytesRef();
final BytesRef scratchBytesRef2 = new BytesRef();
final int[] commonPrefixLengths;
protected final FixedBitSet docsSeen;
private PointWriter pointWriter;
private boolean finished;
private IndexOutput tempInput;
protected final int maxPointsInLeafNode;
private final int maxPointsSortInHeap;
protected final byte[] minPackedValue;
protected final byte[] maxPackedValue;
protected long pointCount;
private final long totalPointCount;
private final int maxDoc;
public SimpleTextBKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDataDims, int numIndexDims, int bytesPerDim,
int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount) throws IOException {
verifyParams(numDataDims, numIndexDims, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount);
this.tempDir = new TrackingDirectoryWrapper(tempDir);
this.tempFileNamePrefix = tempFileNamePrefix;
this.maxPointsInLeafNode = maxPointsInLeafNode;
this.numDataDims = numDataDims;
this.numIndexDims = numIndexDims;
this.bytesPerDim = bytesPerDim;
this.totalPointCount = totalPointCount;
this.maxDoc = maxDoc;
docsSeen = new FixedBitSet(maxDoc);
packedBytesLength = numDataDims * bytesPerDim;
packedIndexBytesLength = numIndexDims * bytesPerDim;
scratchDiff = new byte[bytesPerDim];
scratch1 = new byte[packedBytesLength];
scratch2 = new byte[packedBytesLength];
commonPrefixLengths = new int[numDataDims];
minPackedValue = new byte[packedIndexBytesLength];
maxPackedValue = new byte[packedIndexBytesLength];
bytesPerDoc = packedBytesLength + Integer.BYTES;
maxPointsSortInHeap = (int) ((maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDataDims));
if (maxPointsSortInHeap < maxPointsInLeafNode) {
throw new IllegalArgumentException("maxMBSortInHeap=" + maxMBSortInHeap + " only allows for maxPointsSortInHeap=" + maxPointsSortInHeap + ", but this is less than maxPointsInLeafNode=" + maxPointsInLeafNode + "; either increase maxMBSortInHeap or decrease maxPointsInLeafNode");
}
this.maxMBSortInHeap = maxMBSortInHeap;
}
public static void verifyParams(int numDataDims, int numIndexDims, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount) {
if (numDataDims < 1 || numDataDims > MAX_DIMS) {
throw new IllegalArgumentException("numDataDims must be 1 .. " + MAX_DIMS + " (got: " + numDataDims + ")");
}
if (numIndexDims < 1 || numIndexDims > numDataDims) {
throw new IllegalArgumentException("numIndexDims must be 1 .. " + numDataDims + " (got: " + numIndexDims + ")");
}
if (maxPointsInLeafNode <= 0) {
throw new IllegalArgumentException("maxPointsInLeafNode must be > 0; got " + maxPointsInLeafNode);
}
if (maxPointsInLeafNode > ArrayUtil.MAX_ARRAY_LENGTH) {
throw new IllegalArgumentException("maxPointsInLeafNode must be <= ArrayUtil.MAX_ARRAY_LENGTH (= " + ArrayUtil.MAX_ARRAY_LENGTH + "); got " + maxPointsInLeafNode);
}
if (maxMBSortInHeap < 0.0) {
throw new IllegalArgumentException("maxMBSortInHeap must be >= 0.0 (got: " + maxMBSortInHeap + ")");
}
if (totalPointCount < 0) {
throw new IllegalArgumentException("totalPointCount must be >=0 (got: " + totalPointCount + ")");
}
}
public void add(byte[] packedValue, int docID) throws IOException {
if (packedValue.length != packedBytesLength) {
throw new IllegalArgumentException("packedValue should be length=" + packedBytesLength + " (got: " + packedValue.length + ")");
}
if (pointCount >= totalPointCount) {
throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + (pointCount + 1) + " values");
}
if (pointCount == 0) {
assert pointWriter == null : "Point writer is already initialized";
if (totalPointCount > maxPointsSortInHeap) {
pointWriter = new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, "spill", 0);
tempInput = ((OfflinePointWriter)pointWriter).out;
} else {
pointWriter = new HeapPointWriter(Math.toIntExact(totalPointCount), packedBytesLength);
}
System.arraycopy(packedValue, 0, minPackedValue, 0, packedIndexBytesLength);
System.arraycopy(packedValue, 0, maxPackedValue, 0, packedIndexBytesLength);
} else {
for(int dim=0;dim<numIndexDims;dim++) {
int offset = dim*bytesPerDim;
if (FutureArrays.compareUnsigned(packedValue, offset, offset + bytesPerDim, minPackedValue, offset, offset + bytesPerDim) < 0) {
System.arraycopy(packedValue, offset, minPackedValue, offset, bytesPerDim);
}
if (FutureArrays.compareUnsigned(packedValue, offset, offset + bytesPerDim, maxPackedValue, offset, offset + bytesPerDim) > 0) {
System.arraycopy(packedValue, offset, maxPackedValue, offset, bytesPerDim);
}
}
}
pointWriter.append(packedValue, docID);
pointCount++;
docsSeen.set(docID);
}
public long getPointCount() {
return pointCount;
}
public long writeField(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException {
if (numIndexDims == 1) {
return writeField1Dim(out, fieldName, reader);
} else {
return writeFieldNDims(out, fieldName, reader);
}
}
private long writeFieldNDims(IndexOutput out, String fieldName, MutablePointValues values) throws IOException {
if (pointCount != 0) {
throw new IllegalStateException("cannot mix add and writeField");
}
if (finished == true) {
throw new IllegalStateException("already finished");
}
finished = true;
long countPerLeaf = pointCount = values.size();
long innerNodeCount = 1;
while (countPerLeaf > maxPointsInLeafNode) {
countPerLeaf = (countPerLeaf+1)/2;
innerNodeCount *= 2;
}
int numLeaves = Math.toIntExact(innerNodeCount);
checkMaxLeafNodeCount(numLeaves);
final byte[] splitPackedValues = new byte[numLeaves * (bytesPerDim + 1)];
final long[] leafBlockFPs = new long[numLeaves];
Arrays.fill(minPackedValue, (byte) 0xff);
Arrays.fill(maxPackedValue, (byte) 0);
for (int i = 0; i < Math.toIntExact(pointCount); ++i) {
values.getValue(i, scratchBytesRef1);
for(int dim=0;dim<numIndexDims;dim++) {
int offset = dim*bytesPerDim;
if (FutureArrays.compareUnsigned(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, scratchBytesRef1.offset + offset + bytesPerDim, minPackedValue, offset, offset + bytesPerDim) < 0) {
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, minPackedValue, offset, bytesPerDim);
}
if (FutureArrays.compareUnsigned(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, scratchBytesRef1.offset + offset + bytesPerDim, maxPackedValue, offset, offset + bytesPerDim) > 0) {
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, maxPackedValue, offset, bytesPerDim);
}
}
docsSeen.set(values.getDocID(i));
}
build(1, numLeaves, values, 0, Math.toIntExact(pointCount), out,
minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs,
new int[maxPointsInLeafNode]);
long indexFP = out.getFilePointer();
writeIndex(out, leafBlockFPs, splitPackedValues);
return indexFP;
}
private long writeField1Dim(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException {
MutablePointsReaderUtils.sort(maxDoc, packedIndexBytesLength, reader, 0, Math.toIntExact(reader.size()));
final OneDimensionBKDWriter oneDimWriter = new OneDimensionBKDWriter(out);
reader.intersect(new IntersectVisitor() {
@Override
public void visit(int docID, byte[] packedValue) throws IOException {
oneDimWriter.add(packedValue, docID);
}
@Override
public void visit(int docID) throws IOException {
throw new IllegalStateException();
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
return Relation.CELL_CROSSES_QUERY;
}
});
return oneDimWriter.finish();
}
private class OneDimensionBKDWriter {
final IndexOutput out;
final List<Long> leafBlockFPs = new ArrayList<>();
final List<byte[]> leafBlockStartValues = new ArrayList<>();
final byte[] leafValues = new byte[maxPointsInLeafNode * packedBytesLength];
final int[] leafDocs = new int[maxPointsInLeafNode];
long valueCount;
int leafCount;
OneDimensionBKDWriter(IndexOutput out) {
if (numIndexDims != 1) {
throw new UnsupportedOperationException("numIndexDims must be 1 but got " + numIndexDims);
}
if (pointCount != 0) {
throw new IllegalStateException("cannot mix add and merge");
}
if (finished == true) {
throw new IllegalStateException("already finished");
}
finished = true;
this.out = out;
lastPackedValue = new byte[packedBytesLength];
}
final byte[] lastPackedValue;
int lastDocID;
void add(byte[] packedValue, int docID) throws IOException {
assert valueInOrder(valueCount + leafCount,
0, lastPackedValue, packedValue, 0, docID, lastDocID);
System.arraycopy(packedValue, 0, leafValues, leafCount * packedBytesLength, packedBytesLength);
leafDocs[leafCount] = docID;
docsSeen.set(docID);
leafCount++;
if (valueCount > totalPointCount) {
throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values");
}
if (leafCount == maxPointsInLeafNode) {
writeLeafBlock();
leafCount = 0;
}
assert (lastDocID = docID) >= 0;
}
public long finish() throws IOException {
if (leafCount > 0) {
writeLeafBlock();
leafCount = 0;
}
if (valueCount == 0) {
return -1;
}
pointCount = valueCount;
long indexFP = out.getFilePointer();
int numInnerNodes = leafBlockStartValues.size();
byte[] index = new byte[(1+numInnerNodes) * (1+bytesPerDim)];
rotateToTree(1, 0, numInnerNodes, index, leafBlockStartValues);
long[] arr = new long[leafBlockFPs.size()];
for(int i=0;i<leafBlockFPs.size();i++) {
arr[i] = leafBlockFPs.get(i);
}
writeIndex(out, arr, index);
return indexFP;
}
private void writeLeafBlock() throws IOException {
assert leafCount != 0;
if (valueCount == 0) {
System.arraycopy(leafValues, 0, minPackedValue, 0, packedIndexBytesLength);
}
System.arraycopy(leafValues, (leafCount - 1) * packedBytesLength, maxPackedValue, 0, packedIndexBytesLength);
valueCount += leafCount;
if (leafBlockFPs.size() > 0) {
leafBlockStartValues.add(ArrayUtil.copyOfSubArray(leafValues, 0, packedBytesLength));
}
leafBlockFPs.add(out.getFilePointer());
checkMaxLeafNodeCount(leafBlockFPs.size());
Arrays.fill(commonPrefixLengths, bytesPerDim);
for(int dim=0;dim<numDataDims;dim++) {
int offset1 = dim * bytesPerDim;
int offset2 = (leafCount - 1) * packedBytesLength + offset1;
for(int j=0;j<commonPrefixLengths[dim];j++) {
if (leafValues[offset1+j] != leafValues[offset2+j]) {
commonPrefixLengths[dim] = j;
break;
}
}
}
writeLeafBlockDocs(out, leafDocs, 0, leafCount);
final IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
final BytesRef scratch = new BytesRef();
{
scratch.length = packedBytesLength;
scratch.bytes = leafValues;
}
@Override
public BytesRef apply(int i) {
scratch.offset = packedBytesLength * i;
return scratch;
}
};
assert valuesInOrderAndBounds(leafCount, 0, ArrayUtil.copyOfSubArray(leafValues, 0, packedBytesLength),
ArrayUtil.copyOfSubArray(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength),
packedValues, leafDocs, 0);
writeLeafBlockPackedValues(out, commonPrefixLengths, leafCount, 0, packedValues);
}
}
private void rotateToTree(int nodeID, int offset, int count, byte[] index, List<byte[]> leafBlockStartValues) {
if (count == 1) {
System.arraycopy(leafBlockStartValues.get(offset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim);
} else if (count > 1) {
int countAtLevel = 1;
int totalCount = 0;
while (true) {
int countLeft = count - totalCount;
if (countLeft <= countAtLevel) {
int lastLeftCount = Math.min(countAtLevel/2, countLeft);
assert lastLeftCount >= 0;
int leftHalf = (totalCount-1)/2 + lastLeftCount;
int rootOffset = offset + leftHalf;
System.arraycopy(leafBlockStartValues.get(rootOffset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim);
rotateToTree(2*nodeID, offset, leftHalf, index, leafBlockStartValues);
rotateToTree(2*nodeID+1, rootOffset+1, count-leftHalf-1, index, leafBlockStartValues);
return;
}
totalCount += countAtLevel;
countAtLevel *= 2;
}
} else {
assert count == 0;
}
}
private void checkMaxLeafNodeCount(int numLeaves) {
if ((1+bytesPerDim) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
throw new IllegalStateException("too many nodes; increase maxPointsInLeafNode (currently " + maxPointsInLeafNode + ") and reindex");
}
}
public long finish(IndexOutput out) throws IOException {
if (pointCount == 0) {
throw new IllegalStateException("must index at least one point");
}
if (finished == true) {
throw new IllegalStateException("already finished");
}
finished = true;
pointWriter.close();
BKDRadixSelector.PathSlice points = new BKDRadixSelector.PathSlice(pointWriter, 0, pointCount);
tempInput = null;
pointWriter = null;
long countPerLeaf = pointCount;
long innerNodeCount = 1;
while (countPerLeaf > maxPointsInLeafNode) {
countPerLeaf = (countPerLeaf+1)/2;
innerNodeCount *= 2;
}
int numLeaves = (int) innerNodeCount;
checkMaxLeafNodeCount(numLeaves);
byte[] splitPackedValues = new byte[Math.toIntExact(numLeaves*(1+bytesPerDim))];
long[] leafBlockFPs = new long[numLeaves];
assert pointCount / numLeaves <= maxPointsInLeafNode: "pointCount=" + pointCount + " numLeaves=" + numLeaves + " maxPointsInLeafNode=" + maxPointsInLeafNode;
BKDRadixSelector radixSelector = new BKDRadixSelector(numDataDims, numIndexDims, bytesPerDim, maxPointsSortInHeap, tempDir, tempFileNamePrefix);
boolean success = false;
try {
build(1, numLeaves, points, out,
radixSelector, minPackedValue, maxPackedValue,
splitPackedValues, leafBlockFPs, new int[maxPointsInLeafNode]);
assert tempDir.getCreatedFiles().isEmpty();
success = true;
} finally {
if (success == false) {
IOUtils.deleteFilesIgnoringExceptions(tempDir, tempDir.getCreatedFiles());
}
}
long indexFP = out.getFilePointer();
writeIndex(out, leafBlockFPs, splitPackedValues);
return indexFP;
}
private void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
write(out, NUM_DATA_DIMS);
writeInt(out, numDataDims);
newline(out);
write(out, NUM_INDEX_DIMS);
writeInt(out, numIndexDims);
newline(out);
write(out, BYTES_PER_DIM);
writeInt(out, bytesPerDim);
newline(out);
write(out, MAX_LEAF_POINTS);
writeInt(out, maxPointsInLeafNode);
newline(out);
write(out, INDEX_COUNT);
writeInt(out, leafBlockFPs.length);
newline(out);
write(out, MIN_VALUE);
BytesRef br = new BytesRef(minPackedValue, 0, minPackedValue.length);
write(out, br.toString());
newline(out);
write(out, MAX_VALUE);
br = new BytesRef(maxPackedValue, 0, maxPackedValue.length);
write(out, br.toString());
newline(out);
write(out, POINT_COUNT);
writeLong(out, pointCount);
newline(out);
write(out, DOC_COUNT);
writeInt(out, docsSeen.cardinality());
newline(out);
for(int i=0;i<leafBlockFPs.length;i++) {
write(out, BLOCK_FP);
writeLong(out, leafBlockFPs[i]);
newline(out);
}
assert (splitPackedValues.length % (1 + bytesPerDim)) == 0;
int count = splitPackedValues.length / (1 + bytesPerDim);
assert count == leafBlockFPs.length;
write(out, SPLIT_COUNT);
writeInt(out, count);
newline(out);
for(int i=0;i<count;i++) {
write(out, SPLIT_DIM);
writeInt(out, splitPackedValues[i * (1 + bytesPerDim)] & 0xff);
newline(out);
write(out, SPLIT_VALUE);
br = new BytesRef(splitPackedValues, 1+(i * (1+bytesPerDim)), bytesPerDim);
write(out, br.toString());
newline(out);
}
}
protected void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException {
write(out, BLOCK_COUNT);
writeInt(out, count);
newline(out);
for(int i=0;i<count;i++) {
write(out, BLOCK_DOC_ID);
writeInt(out, docIDs[start+i]);
newline(out);
}
}
protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
for (int i = 0; i < count; ++i) {
BytesRef packedValue = packedValues.apply(i);
write(out, BLOCK_VALUE);
write(out, packedValue.toString());
newline(out);
}
}
private void writeLeafBlockPackedValuesRange(IndexOutput out, int[] commonPrefixLengths, int start, int end, IntFunction<BytesRef> packedValues) throws IOException {
for (int i = start; i < end; ++i) {
BytesRef ref = packedValues.apply(i);
assert ref.length == packedBytesLength;
for(int dim=0;dim<numDataDims;dim++) {
int prefix = commonPrefixLengths[dim];
out.writeBytes(ref.bytes, ref.offset + dim*bytesPerDim + prefix, bytesPerDim-prefix);
}
}
}
private static int runLen(IntFunction<BytesRef> packedValues, int start, int end, int byteOffset) {
BytesRef first = packedValues.apply(start);
byte b = first.bytes[first.offset + byteOffset];
for (int i = start + 1; i < end; ++i) {
BytesRef ref = packedValues.apply(i);
byte b2 = ref.bytes[ref.offset + byteOffset];
assert Byte.toUnsignedInt(b2) >= Byte.toUnsignedInt(b);
if (b != b2) {
return i - start;
}
}
return end - start;
}
@Override
public void close() throws IOException {
if (tempInput != null) {
try {
tempInput.close();
} finally {
tempDir.deleteFile(tempInput.getName());
tempInput = null;
}
}
}
private Error verifyChecksum(Throwable priorException, PointWriter writer) throws IOException {
assert priorException != null;
if (writer instanceof OfflinePointWriter) {
String tempFileName = ((OfflinePointWriter) writer).name;
try (ChecksumIndexInput in = tempDir.openChecksumInput(tempFileName, IOContext.READONCE)) {
CodecUtil.checkFooter(in, priorException);
}
}
throw IOUtils.rethrowAlways(priorException);
}
private boolean valueInBounds(BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue) {
for(int dim=0;dim<numIndexDims;dim++) {
int offset = bytesPerDim*dim;
if (FutureArrays.compareUnsigned(packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDim, minPackedValue, offset, offset + bytesPerDim) < 0) {
return false;
}
if (FutureArrays.compareUnsigned(packedValue.bytes, packedValue.offset + offset, packedValue.offset + offset + bytesPerDim, maxPackedValue, offset, offset + bytesPerDim) > 0) {
return false;
}
}
return true;
}
protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
int splitDim = -1;
for(int dim=0;dim<numIndexDims;dim++) {
NumericUtils.subtract(bytesPerDim, dim, maxPackedValue, minPackedValue, scratchDiff);
if (splitDim == -1 || FutureArrays.compareUnsigned(scratchDiff, 0, bytesPerDim, scratch1, 0, bytesPerDim) > 0) {
System.arraycopy(scratchDiff, 0, scratch1, 0, bytesPerDim);
splitDim = dim;
}
}
return splitDim;
}
private HeapPointWriter switchToHeap(PointWriter source) throws IOException {
int count = Math.toIntExact(source.count());
try (PointReader reader = source.getReader(0, count);
HeapPointWriter writer = new HeapPointWriter(count, packedBytesLength)) {
for(int i=0;i<count;i++) {
boolean hasNext = reader.next();
assert hasNext;
writer.append(reader.pointValue());
}
return writer;
} catch (Throwable t) {
throw verifyChecksum(t, source);
}
}
private void build(int nodeID, int leafNodeOffset,
MutablePointValues reader, int from, int to,
IndexOutput out,
byte[] minPackedValue, byte[] maxPackedValue,
byte[] splitPackedValues,
long[] leafBlockFPs,
int[] spareDocIds) throws IOException {
if (nodeID >= leafNodeOffset) {
final int count = to - from;
assert count <= maxPointsInLeafNode;
Arrays.fill(commonPrefixLengths, bytesPerDim);
reader.getValue(from, scratchBytesRef1);
for (int i = from + 1; i < to; ++i) {
reader.getValue(i, scratchBytesRef2);
for (int dim=0;dim<numDataDims;dim++) {
final int offset = dim * bytesPerDim;
for(int j=0;j<commonPrefixLengths[dim];j++) {
if (scratchBytesRef1.bytes[scratchBytesRef1.offset+offset+j] != scratchBytesRef2.bytes[scratchBytesRef2.offset+offset+j]) {
commonPrefixLengths[dim] = j;
break;
}
}
}
}
FixedBitSet[] usedBytes = new FixedBitSet[numDataDims];
for (int dim = 0; dim < numDataDims; ++dim) {
if (commonPrefixLengths[dim] < bytesPerDim) {
usedBytes[dim] = new FixedBitSet(256);
}
}
for (int i = from + 1; i < to; ++i) {
for (int dim=0;dim<numDataDims;dim++) {
if (usedBytes[dim] != null) {
byte b = reader.getByteAt(i, dim * bytesPerDim + commonPrefixLengths[dim]);
usedBytes[dim].set(Byte.toUnsignedInt(b));
}
}
}
int sortedDim = 0;
int sortedDimCardinality = Integer.MAX_VALUE;
for (int dim = 0; dim < numDataDims; ++dim) {
if (usedBytes[dim] != null) {
final int cardinality = usedBytes[dim].cardinality();
if (cardinality < sortedDimCardinality) {
sortedDim = dim;
sortedDimCardinality = cardinality;
}
}
}
MutablePointsReaderUtils.sortByDim(numDataDims, numIndexDims, sortedDim, bytesPerDim, commonPrefixLengths,
reader, from, to, scratchBytesRef1, scratchBytesRef2);
leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
int[] docIDs = spareDocIds;
for (int i = from; i < to; ++i) {
docIDs[i - from] = reader.getDocID(i);
}
writeLeafBlockDocs(out, docIDs, 0, count);
reader.getValue(from, scratchBytesRef1);
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, packedBytesLength);
IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
@Override
public BytesRef apply(int i) {
reader.getValue(from + i, scratchBytesRef1);
return scratchBytesRef1;
}
};
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
docIDs, 0);
writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues);
} else {
final int splitDim = split(minPackedValue, maxPackedValue);
final int mid = (from + to + 1) >>> 1;
int commonPrefixLen = bytesPerDim;
for (int i = 0; i < bytesPerDim; ++i) {
if (minPackedValue[splitDim * bytesPerDim + i] != maxPackedValue[splitDim * bytesPerDim + i]) {
commonPrefixLen = i;
break;
}
}
MutablePointsReaderUtils.partition(numDataDims, numIndexDims, maxDoc, splitDim, bytesPerDim, commonPrefixLen,
reader, from, to, mid, scratchBytesRef1, scratchBytesRef2);
final int address = nodeID * (1+bytesPerDim);
splitPackedValues[address] = (byte) splitDim;
reader.getValue(mid, scratchBytesRef1);
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, splitPackedValues, address + 1, bytesPerDim);
byte[] minSplitPackedValue = ArrayUtil.copyOfSubArray(minPackedValue, 0, packedIndexBytesLength);
byte[] maxSplitPackedValue = ArrayUtil.copyOfSubArray(maxPackedValue, 0, packedIndexBytesLength);
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim,
minSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim,
maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
build(nodeID * 2, leafNodeOffset, reader, from, mid, out,
minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs, spareDocIds);
build(nodeID * 2 + 1, leafNodeOffset, reader, mid, to, out,
minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, spareDocIds);
}
}
private void build(int nodeID, int leafNodeOffset,
BKDRadixSelector.PathSlice points,
IndexOutput out,
BKDRadixSelector radixSelector,
byte[] minPackedValue, byte[] maxPackedValue,
byte[] splitPackedValues,
long[] leafBlockFPs,
int[] spareDocIds) throws IOException {
if (nodeID >= leafNodeOffset) {
HeapPointWriter heapSource;
if (points.writer instanceof HeapPointWriter == false) {
heapSource = switchToHeap(points.writer);
} else {
heapSource = (HeapPointWriter) points.writer;
}
int from = Math.toIntExact(points.start);
int to = Math.toIntExact(points.start + points.count);
computeCommonPrefixLength(heapSource, scratch1);
int sortedDim = 0;
int sortedDimCardinality = Integer.MAX_VALUE;
FixedBitSet[] usedBytes = new FixedBitSet[numDataDims];
for (int dim = 0; dim < numDataDims; ++dim) {
if (commonPrefixLengths[dim] < bytesPerDim) {
usedBytes[dim] = new FixedBitSet(256);
}
}
for (int dim = 0; dim < numDataDims; dim++) {
int prefix = commonPrefixLengths[dim];
if (prefix < bytesPerDim) {
int offset = dim * bytesPerDim;
for (int i = 0; i < heapSource.count(); ++i) {
PointValue value = heapSource.getPackedValueSlice(i);
BytesRef packedValue = value.packedValue();
int bucket = packedValue.bytes[packedValue.offset + offset + prefix] & 0xff;
usedBytes[dim].set(bucket);
}
int cardinality = usedBytes[dim].cardinality();
if (cardinality < sortedDimCardinality) {
sortedDim = dim;
sortedDimCardinality = cardinality;
}
}
}
radixSelector.heapRadixSort(heapSource, from, to, sortedDim, commonPrefixLengths[sortedDim]);
leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
int count = to - from;
assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset;
int[] docIDs = spareDocIds;
for (int i = 0; i < count; i++) {
docIDs[i] = heapSource.getPackedValueSlice(from + i).docID();
}
writeLeafBlockDocs(out, spareDocIds, 0, count);
IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
final BytesRef scratch = new BytesRef();
{
scratch.length = packedBytesLength;
}
@Override
public BytesRef apply(int i) {
PointValue value = heapSource.getPackedValueSlice(from + i);
return value.packedValue();
}
};
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
docIDs, 0);
writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues);
} else {
int splitDim;
if (numIndexDims > 1) {
splitDim = split(minPackedValue, maxPackedValue);
} else {
splitDim = 0;
}
assert nodeID < splitPackedValues.length : "nodeID=" + nodeID + " splitValues.length=" + splitPackedValues.length;
long rightCount = points.count / 2;
long leftCount = points.count - rightCount;
int commonPrefixLen = FutureArrays.mismatch(minPackedValue, splitDim * bytesPerDim,
splitDim * bytesPerDim + bytesPerDim, maxPackedValue, splitDim * bytesPerDim,
splitDim * bytesPerDim + bytesPerDim);
if (commonPrefixLen == -1) {
commonPrefixLen = bytesPerDim;
}
BKDRadixSelector.PathSlice[] pathSlices = new BKDRadixSelector.PathSlice[2];
byte[] splitValue = radixSelector.select(points, pathSlices, points.start, points.start + points.count, points.start + leftCount, splitDim, commonPrefixLen);
int address = nodeID * (1 + bytesPerDim);
splitPackedValues[address] = (byte) splitDim;
System.arraycopy(splitValue, 0, splitPackedValues, address + 1, bytesPerDim);
byte[] minSplitPackedValue = new byte[packedIndexBytesLength];
System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, packedIndexBytesLength);
byte[] maxSplitPackedValue = new byte[packedIndexBytesLength];
System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, packedIndexBytesLength);
System.arraycopy(splitValue, 0, minSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
System.arraycopy(splitValue, 0, maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
build(2*nodeID, leafNodeOffset, pathSlices[0], out, radixSelector,
minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs, spareDocIds);
build(2*nodeID+1, leafNodeOffset, pathSlices[1], out, radixSelector,
minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, spareDocIds);
}
}
private void computeCommonPrefixLength(HeapPointWriter heapPointWriter, byte[] commonPrefix) {
Arrays.fill(commonPrefixLengths, bytesPerDim);
PointValue value = heapPointWriter.getPackedValueSlice(0);
BytesRef packedValue = value.packedValue();
for (int dim = 0; dim < numDataDims; dim++) {
System.arraycopy(packedValue.bytes, packedValue.offset + dim * bytesPerDim, commonPrefix, dim * bytesPerDim, bytesPerDim);
}
for (int i = 1; i < heapPointWriter.count(); i++) {
value = heapPointWriter.getPackedValueSlice(i);
packedValue = value.packedValue();
for (int dim = 0; dim < numDataDims; dim++) {
if (commonPrefixLengths[dim] != 0) {
int j = FutureArrays.mismatch(commonPrefix, dim * bytesPerDim, dim * bytesPerDim + commonPrefixLengths[dim], packedValue.bytes, packedValue.offset + dim * bytesPerDim, packedValue.offset + dim * bytesPerDim + commonPrefixLengths[dim]);
if (j != -1) {
commonPrefixLengths[dim] = j;
}
}
}
}
}
private boolean valuesInOrderAndBounds(int count, int sortedDim, byte[] minPackedValue, byte[] maxPackedValue,
IntFunction<BytesRef> values, int[] docs, int docsOffset) throws IOException {
byte[] lastPackedValue = new byte[packedBytesLength];
int lastDoc = -1;
for (int i=0;i<count;i++) {
BytesRef packedValue = values.apply(i);
assert packedValue.length == packedBytesLength;
assert valueInOrder(i, sortedDim, lastPackedValue, packedValue.bytes, packedValue.offset,
docs[docsOffset + i], lastDoc);
lastDoc = docs[docsOffset + i];
assert valueInBounds(packedValue, minPackedValue, maxPackedValue);
}
return true;
}
private boolean valueInOrder(long ord, int sortedDim, byte[] lastPackedValue, byte[] packedValue, int packedValueOffset,
int doc, int lastDoc) {
int dimOffset = sortedDim * bytesPerDim;
if (ord > 0) {
int cmp = FutureArrays.compareUnsigned(lastPackedValue, dimOffset, dimOffset + bytesPerDim, packedValue, packedValueOffset + dimOffset, packedValueOffset + dimOffset + bytesPerDim);
if (cmp > 0) {
throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord + " sortedDim=" + sortedDim);
}
if (cmp == 0 && numDataDims > numIndexDims) {
int dataOffset = numIndexDims * bytesPerDim;
cmp = FutureArrays.compareUnsigned(lastPackedValue, dataOffset, packedBytesLength, packedValue, packedValueOffset + dataOffset, packedValueOffset + packedBytesLength);
if (cmp > 0) {
throw new AssertionError("data values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord);
}
}
if (cmp == 0 && doc < lastDoc) {
throw new AssertionError("docs out of order: last doc=" + lastDoc + " current doc=" + doc + " ord=" + ord + " sortedDim=" + sortedDim);
}
}
System.arraycopy(packedValue, packedValueOffset, lastPackedValue, 0, packedBytesLength);
return true;
}
private void write(IndexOutput out, String s) throws IOException {
SimpleTextUtil.write(out, s, scratch);
}
private void writeInt(IndexOutput out, int x) throws IOException {
SimpleTextUtil.write(out, Integer.toString(x), scratch);
}
private void writeLong(IndexOutput out, long x) throws IOException {
SimpleTextUtil.write(out, Long.toString(x), scratch);
}
private void write(IndexOutput out, BytesRef b) throws IOException {
SimpleTextUtil.write(out, b);
}
private void newline(IndexOutput out) throws IOException {
SimpleTextUtil.writeNewline(out);
}
}