/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.util.AbstractCollection;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.FutureArrays;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * Abstract query class to find all documents whose single or multi-dimensional point values,
 * previously indexed with e.g. {@link IntPoint}, is contained in the specified set.
 *
 * <p>
 * This is for subclasses and works on the underlying binary encoding: to
 * create range queries for lucene's standard {@code Point} types, refer to factory
 * methods on those classes, e.g. {@link IntPoint#newSetQuery IntPoint.newSetQuery()} for
 * fields indexed with {@link IntPoint}.
 *
 * @see PointValues
 * @lucene.experimental
 */
public abstract class PointInSetQuery extends Query implements Accountable {
  protected static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(PointInSetQuery.class);

  // A little bit overkill for us, since all of our "terms" are always in the same field:
  final PrefixCodedTerms sortedPackedPoints;
  final int sortedPackedPointsHashCode;
  final String field;
  final int numDims;
  final int bytesPerDim;
  final long ramBytesUsed; // cache

  /** Iterator of encoded point values. */
  // TODO: if we want to stream, maybe we should use jdk stream class?
  public static abstract class Stream implements BytesRefIterator {
    @Override
    public abstract BytesRef next();
  };

  /** The {@code packedPoints} iterator must be in sorted order. */
  protected PointInSetQuery(String field, int numDims, int bytesPerDim, Stream packedPoints) {
    this.field = field;
    if (bytesPerDim < 1 || bytesPerDim > PointValues.MAX_NUM_BYTES) {
      throw new IllegalArgumentException("bytesPerDim must be > 0 and <= " + PointValues.MAX_NUM_BYTES + "; got " + bytesPerDim);
    }
    this.bytesPerDim = bytesPerDim;
    if (numDims < 1 || numDims > PointValues.MAX_DIMENSIONS) {
      throw new IllegalArgumentException("numDims must be > 0 and <= " + PointValues.MAX_DIMENSIONS + "; got " + numDims);
    }
    this.numDims = numDims;

    // In the 1D case this works well (the more points, the more common prefixes they share, typically), but in
    // the > 1 D case, where we are only looking at the first dimension's prefix bytes, it can at worst not hurt:
    PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
    BytesRefBuilder previous = null;
    BytesRef current;
    while ((current = packedPoints.next()) != null) {
      if (current.length != numDims * bytesPerDim) {
        throw new IllegalArgumentException("packed point length should be " + (numDims * bytesPerDim) + " but got " + current.length + "; field=\"" + field + "\" numDims=" + numDims + " bytesPerDim=" + bytesPerDim);
      }
      if (previous == null) {
        previous = new BytesRefBuilder();
      } else {
        int cmp = previous.get().compareTo(current);
        if (cmp == 0) {
          continue; // deduplicate
        } else if (cmp > 0) {
          throw new IllegalArgumentException("values are out of order: saw " + previous + " before " + current);
        }
      }
      builder.add(field, current);
      previous.copyBytes(current);
    }
    sortedPackedPoints = builder.finish();
    sortedPackedPointsHashCode = sortedPackedPoints.hashCode();
    ramBytesUsed = BASE_RAM_BYTES +
        RamUsageEstimator.sizeOfObject(field) +
        RamUsageEstimator.sizeOfObject(sortedPackedPoints);
  }

  @Override
  public void visit(QueryVisitor visitor) {
    if (visitor.acceptField(field)) {
      visitor.visitLeaf(this);
    }
  }

  @Override
  public final Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {

    // We don't use RandomAccessWeight here: it's no good to approximate with "match all docs".
    // This is an inverted structure and should be used in the first pass:

    return new ConstantScoreWeight(this, boost) {

      @Override
      public Scorer scorer(LeafReaderContext context) throws IOException {
        LeafReader reader = context.reader();

        PointValues values = reader.getPointValues(field);
        if (values == null) {
          // No docs in this segment/field indexed any points
          return null;
        }

        if (values.getNumIndexDimensions() != numDims) {
          throw new IllegalArgumentException("field=\"" + field + "\" was indexed with numIndexDims=" + values.getNumIndexDimensions() + " but this query has numIndexDims=" + numDims);
        }
        if (values.getBytesPerDimension() != bytesPerDim) {
          throw new IllegalArgumentException("field=\"" + field + "\" was indexed with bytesPerDim=" + values.getBytesPerDimension() + " but this query has bytesPerDim=" + bytesPerDim);
        }

        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);

        if (numDims == 1) {

          // We optimize this common case, effectively doing a merge sort of the indexed values vs the queried set:
          values.intersect(new MergePointVisitor(sortedPackedPoints, result));

        } else {
          // NOTE: this is naive implementation, where for each point we re-walk the KD tree to intersect.  We could instead do a similar
          // optimization as the 1D case, but I think it'd mean building a query-time KD tree so we could efficiently intersect against the
          // index, which is probably tricky!
          SinglePointVisitor visitor = new SinglePointVisitor(result);
          TermIterator iterator = sortedPackedPoints.iterator();
          for (BytesRef point = iterator.next(); point != null; point = iterator.next()) {
            visitor.setPoint(point);
            values.intersect(visitor);
          }
        }

        return new ConstantScoreScorer(this, score(), scoreMode, result.build().iterator());
      }

      @Override
      public boolean isCacheable(LeafReaderContext ctx) {
        return true;
      }
    };
  }

  /** Essentially does a merge sort, only collecting hits when the indexed point and query point are the same.  This is an optimization,
   *  used in the 1D case. */
  private class MergePointVisitor implements IntersectVisitor {

    private final DocIdSetBuilder result;
    private TermIterator iterator;
    private BytesRef nextQueryPoint;
    // scratch wraps the incoming packedValue byte[] so it can be compared against the query points:
    private final BytesRef scratch = new BytesRef();
    private final PrefixCodedTerms sortedPackedPoints;
    private DocIdSetBuilder.BulkAdder adder;

    public MergePointVisitor(PrefixCodedTerms sortedPackedPoints, DocIdSetBuilder result) throws IOException {
      this.result = result;
      this.sortedPackedPoints = sortedPackedPoints;
      scratch.length = bytesPerDim;
      this.iterator = this.sortedPackedPoints.iterator();
      nextQueryPoint = iterator.next();
    }

    @Override
    public void grow(int count) {
      adder = result.grow(count);
    }

    @Override
    public void visit(int docID) {
      adder.add(docID);
    }

    @Override
    public void visit(int docID, byte[] packedValue) {
      if (matches(packedValue)) {
        visit(docID);
      }
    }

    @Override
    public void visit(DocIdSetIterator iterator, byte[] packedValue) throws IOException {
      if (matches(packedValue)) {
        int docID;
        while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          visit(docID);
        }
      }
    }

    // Advances the query-point iterator until it is >= the indexed value; returns true only on exact match.
    private boolean matches(byte[] packedValue) {
      scratch.bytes = packedValue;
      while (nextQueryPoint != null) {
        int cmp = nextQueryPoint.compareTo(scratch);
        if (cmp == 0) {
          return true;
        } else if (cmp < 0) {
          // Query point is before index point, so we move to next query point
          nextQueryPoint = iterator.next();
        } else {
          // Query point is after index point, so we don't collect and we return:
          break;
        }
      }
      return false;
    }

    @Override
    public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
      while (nextQueryPoint != null) {
        scratch.bytes = minPackedValue;
        int cmpMin = nextQueryPoint.compareTo(scratch);
        if (cmpMin < 0) {
          // query point is before the start of this cell
          nextQueryPoint = iterator.next();
          continue;
        }
        scratch.bytes = maxPackedValue;
        int cmpMax = nextQueryPoint.compareTo(scratch);
        if (cmpMax > 0) {
          // query point is after the end of this cell
          return Relation.CELL_OUTSIDE_QUERY;
        }

        if (cmpMin == 0 && cmpMax == 0) {
          // NOTE: we only hit this if we are on a cell whose min and max values are exactly equal to our point,
          // which can easily happen if many (> 1024) docs share this one value
          return Relation.CELL_INSIDE_QUERY;
        } else {
          return Relation.CELL_CROSSES_QUERY;
        }
      }

      // We exhausted all points in the query:
      return Relation.CELL_OUTSIDE_QUERY;
    }
  }

  /** IntersectVisitor that queries against a highly degenerate shape: a single point.  This is used in the > 1D case. */
  private class SinglePointVisitor implements IntersectVisitor {

    private final DocIdSetBuilder result;
    private final byte[] pointBytes;
    private DocIdSetBuilder.BulkAdder adder;

    public SinglePointVisitor(DocIdSetBuilder result) {
      this.result = result;
      this.pointBytes = new byte[bytesPerDim * numDims];
    }

    public void setPoint(BytesRef point) {
      // we verified this up front in query's ctor:
      assert point.length == pointBytes.length;
      System.arraycopy(point.bytes, point.offset, pointBytes, 0, pointBytes.length);
    }

    @Override
    public void grow(int count) {
      adder = result.grow(count);
    }

    @Override
    public void visit(int docID) {
      adder.add(docID);
    }

    @Override
    public void visit(int docID, byte[] packedValue) {
      assert packedValue.length == pointBytes.length;
      if (Arrays.equals(packedValue, pointBytes)) {
        // The point for this doc matches the point we are querying on
        visit(docID);
      }
    }

    @Override
    public void visit(DocIdSetIterator iterator, byte[] packedValue) throws IOException {
      assert packedValue.length == pointBytes.length;
      if (Arrays.equals(packedValue, pointBytes)) {
        // The point for this set of docs matches the point we are querying on
        int docID;
        while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          visit(docID);
        }
      }
    }

    @Override
    public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
      boolean crosses = false;
      for(int dim=0;dim<numDims;dim++) {
        int offset = dim*bytesPerDim;

        int cmpMin = FutureArrays.compareUnsigned(minPackedValue, offset, offset + bytesPerDim, pointBytes, offset, offset + bytesPerDim);
        if (cmpMin > 0) {
          return Relation.CELL_OUTSIDE_QUERY;
        }

        int cmpMax = FutureArrays.compareUnsigned(maxPackedValue, offset, offset + bytesPerDim, pointBytes, offset, offset + bytesPerDim);
        if (cmpMax < 0) {
          return Relation.CELL_OUTSIDE_QUERY;
        }

        if (cmpMin != 0 || cmpMax != 0) {
          crosses = true;
        }
      }

      if (crosses) {
        return Relation.CELL_CROSSES_QUERY;
      } else {
        // NOTE: we only hit this if we are on a cell whose min and max values are exactly equal to our point,
        // which can easily happen if many docs share this one value
        return Relation.CELL_INSIDE_QUERY;
      }
    }
  }

  /** Returns a read-only view of the (sorted, deduplicated) packed points in this query, one {@code byte[]} per point. */
  public Collection<byte[]> getPackedPoints() {
    return new AbstractCollection<byte[]>() {

      @Override
      public Iterator<byte[]> iterator() {
        int size = (int) sortedPackedPoints.size();
        PrefixCodedTerms.TermIterator iterator = sortedPackedPoints.iterator();
        return new Iterator<byte[]>() {

          int upto = 0;

          @Override
          public boolean hasNext() {
            return upto < size;
          }

          @Override
          public byte[] next() {
            if (upto == size) {
              throw new NoSuchElementException();
            }

            upto++;
            BytesRef next = iterator.next();
            return BytesRef.deepCopyOf(next).bytes;
          }
        };
      }

      @Override
      public int size() {
        return (int) sortedPackedPoints.size();
      }
    };
  }

  /** Returns the field name this query targets. */
  public String getField() {
    return field;
  }

  /** Returns the number of dimensions per point. */
  public int getNumDims() {
    return numDims;
  }

  /** Returns the number of bytes per dimension. */
  public int getBytesPerDim() {
    return bytesPerDim;
  }

  @Override
  public final int hashCode() {
    int hash = classHash();
    hash = 31 * hash + field.hashCode();
    hash = 31 * hash + sortedPackedPointsHashCode;
    hash = 31 * hash + numDims;
    hash = 31 * hash + bytesPerDim;
    return hash;
  }

  @Override
  public final boolean equals(Object other) {
    return sameClassAs(other) &&
           equalsTo(getClass().cast(other));
  }

  private boolean equalsTo(PointInSetQuery other) {
    // hash code is compared first as a cheap pre-check before the full PrefixCodedTerms comparison:
    return other.field.equals(field) &&
           other.numDims == numDims &&
           other.bytesPerDim == bytesPerDim &&
           other.sortedPackedPointsHashCode == sortedPackedPointsHashCode &&
           other.sortedPackedPoints.equals(sortedPackedPoints);
  }

  @Override
  public final String toString(String field) {
    final StringBuilder sb = new StringBuilder();
    if (this.field.equals(field) == false) {
      sb.append(this.field);
      sb.append(':');
    }

    sb.append("{");

    TermIterator iterator = sortedPackedPoints.iterator();
    byte[] pointBytes = new byte[numDims * bytesPerDim];
    boolean first = true;
    for (BytesRef point = iterator.next(); point != null; point = iterator.next()) {
      if (first == false) {
        sb.append(" ");
      }
      first = false;
      System.arraycopy(point.bytes, point.offset, pointBytes, 0, pointBytes.length);
      sb.append(toString(pointBytes));
    }
    sb.append("}");
    return sb.toString();
  }

  /**
   * Returns a string of a single value in a human-readable format for debugging.
   * This is used by {@link #toString()}.
   *
   * The default implementation encodes the individual byte values.
   *
   * @param value single value, never null
   * @return human readable value for debugging
   */
  protected abstract String toString(byte[] value);

  @Override
  public long ramBytesUsed() {
    return ramBytesUsed;
  }
}