/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.search.uhighlight;

import java.io.Closeable;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.function.Supplier;

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;

An enumeration/iterator of a term and its offsets for use by FieldHighlighter. It is advanced and is placed in a priority queue by FieldHighlighter.highlightOffsetsEnums(OffsetsEnum) based on the start offset.
@lucene.internal
/** * An enumeration/iterator of a term and its offsets for use by {@link FieldHighlighter}. * It is advanced and is placed in a priority queue by * {@link FieldHighlighter#highlightOffsetsEnums(OffsetsEnum)} based on the start offset. * * @lucene.internal */
public abstract class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable { // note: the ordering clearly changes as the postings enum advances // note: would be neat to use some Comparator utilities with method // references but our methods throw IOException @Override public int compareTo(OffsetsEnum other) { try { int cmp = Integer.compare(startOffset(), other.startOffset()); if (cmp != 0) { return cmp; // vast majority of the time we return here. } cmp = Integer.compare(endOffset(), other.endOffset()); if (cmp != 0) { return cmp; } final BytesRef thisTerm = this.getTerm(); final BytesRef otherTerm = other.getTerm(); if (thisTerm == null || otherTerm == null) { if (thisTerm == null && otherTerm == null) { return 0; } else if (thisTerm == null) { return 1; // put "this" (wildcard mtq enum) last } else { return -1; } } return thisTerm.compareTo(otherTerm); } catch (IOException e) { throw new RuntimeException(e); } }
Advances to the next position and returns true, or if can't then returns false. Note that the initial state of this class is not positioned.
/** * Advances to the next position and returns true, or if can't then returns false. * Note that the initial state of this class is not positioned. */
public abstract boolean nextPosition() throws IOException;
An estimate of the number of occurrences of this term/OffsetsEnum.
/** An estimate of the number of occurrences of this term/OffsetsEnum. */
public abstract int freq() throws IOException;
The term at this position. This BytesRef is safe to continue to refer to, even after we move to the next position.
See Also:
  • getMatchTerms.getMatchTerms()
/** * The term at this position. * This BytesRef is safe to continue to refer to, even after we move to the next position. * * @see Passage#getMatchTerms() */
public abstract BytesRef getTerm() throws IOException; public abstract int startOffset() throws IOException; public abstract int endOffset() throws IOException; @Override public void close() throws IOException { } @Override public String toString() { final String name = getClass().getSimpleName(); String offset = ""; try { offset = ",[" + startOffset() + "-" + endOffset() + "]"; } catch (Exception e) { //ignore; for debugging only } try { return name + "(term:" + getTerm().utf8ToString() + offset + ")"; } catch (Exception e) { return name; } }
Based on a PostingsEnum -- the typical/standard OE impl.
/** * Based on a {@link PostingsEnum} -- the typical/standard OE impl. */
public static class OfPostings extends OffsetsEnum { private final BytesRef term; private final PostingsEnum postingsEnum; // with offsets private final int freq; private int posCounter = -1; public OfPostings(BytesRef term, int freq, PostingsEnum postingsEnum) throws IOException { this.term = Objects.requireNonNull(term); this.postingsEnum = Objects.requireNonNull(postingsEnum); this.freq = freq; this.posCounter = this.postingsEnum.freq(); } public OfPostings(BytesRef term, PostingsEnum postingsEnum) throws IOException { this(term, postingsEnum.freq(), postingsEnum); } public PostingsEnum getPostingsEnum() { return postingsEnum; } @Override public boolean nextPosition() throws IOException { if (posCounter > 0) { posCounter--; postingsEnum.nextPosition(); // note: we don't need to save the position return true; } else { return false; } } @Override public BytesRef getTerm() throws IOException { return term; } @Override public int startOffset() throws IOException { return postingsEnum.startOffset(); } @Override public int endOffset() throws IOException { return postingsEnum.endOffset(); } @Override public int freq() throws IOException { return freq; } }
Based on a MatchesIterator with submatches.
/** Based on a {@link MatchesIterator} with submatches. */
public static class OfMatchesIteratorWithSubs extends OffsetsEnum { //Either CachedOE impls (which are the submatches) or OfMatchesIterator impls private final PriorityQueue<OffsetsEnum> pendingQueue = new PriorityQueue<>(); private final HashMap<Query,BytesRef> queryToTermMap = new HashMap<>(); public OfMatchesIteratorWithSubs(MatchesIterator matchesIterator) { pendingQueue.add(new OfMatchesIterator(matchesIterator, () -> queryToTerm(matchesIterator.getQuery()))); } @Override public boolean nextPosition() throws IOException { OffsetsEnum formerHeadOE = pendingQueue.poll(); // removes the head if (formerHeadOE instanceof CachedOE) { // we're done with the former head. CachedOE's are one use only. // Look at the new head... OffsetsEnum newHeadOE = pendingQueue.peek(); if (newHeadOE instanceof OfMatchesIterator) { // We found the matchesIterator. Requires processing. nextWhenMatchesIterator((OfMatchesIterator) newHeadOE); // May or may not remove or re-queue itself } // else new head is a CacheOE or no more. Nothing to do with it. } else { // formerHeadOE is OfMatchesIterator; advance it OfMatchesIterator miOE = (OfMatchesIterator) formerHeadOE; if (miOE.nextPosition()) { nextWhenMatchesIterator(miOE); // requires processing. May or may not re-enqueue itself } } return pendingQueue.isEmpty() == false; } private void nextWhenMatchesIterator(OfMatchesIterator miOE) throws IOException { boolean isHead = miOE == pendingQueue.peek(); MatchesIterator subMatches = miOE.matchesIterator.getSubMatches(); if (subMatches != null) { // remove this miOE from the queue, add it's submatches, next() it, then re-enqueue it if (isHead) { pendingQueue.poll(); // remove } enqueueCachedMatches(subMatches); if (miOE.nextPosition()) { pendingQueue.add(miOE); assert pendingQueue.peek() != miOE; // miOE should follow cached entries } } else { // else has no subMatches. It will stay enqueued. if (!isHead) { pendingQueue.add(miOE); } // else it's *already* in pendingQueue } } private boolean enqueueCachedMatches(MatchesIterator thisMI) throws IOException { if (thisMI == null) { return false; } else { while (thisMI.next()) { if (false == enqueueCachedMatches(thisMI.getSubMatches())) { // recursion // if no sub-matches then add ourselves pendingQueue.add(new CachedOE(queryToTerm(thisMI.getQuery()), thisMI.startOffset(), thisMI.endOffset())); } } return true; } } /** Maps a Query from {@link MatchesIterator#getQuery()} to {@link OffsetsEnum#getTerm()}. * See {@link Passage#getMatchTerms()}. */ private BytesRef queryToTerm(Query query) { // compute an approximate BytesRef term of a Query. We cache this since we're likely to see the same query again. // Our approach is to visit each matching term in order, concatenating them with an adjoining space. // If we don't have any (perhaps due to an MTQ like a wildcard) then we fall back on the toString() of the query. return queryToTermMap.computeIfAbsent(query, (Query q) -> { BytesRefBuilder bytesRefBuilder = new BytesRefBuilder(); q.visit(new QueryVisitor() { @Override public void consumeTerms(Query query, Term... terms) { for (Term term : terms) { if (bytesRefBuilder.length() > 0) { bytesRefBuilder.append((byte) ' '); } bytesRefBuilder.append(term.bytes()); } } }); if (bytesRefBuilder.length() > 0) { return bytesRefBuilder.get(); } // fallback: (likely a MultiTermQuery) return new BytesRef(q.toString()); }); } @Override public int freq() throws IOException { return pendingQueue.peek().freq(); } @Override public BytesRef getTerm() throws IOException { return pendingQueue.peek().getTerm(); } @Override public int startOffset() throws IOException { return pendingQueue.peek().startOffset(); } @Override public int endOffset() throws IOException { return pendingQueue.peek().endOffset(); } private static class CachedOE extends OffsetsEnum { final BytesRef term; final int startOffset; final int endOffset; private CachedOE(BytesRef term, int startOffset, int endOffset) { this.term = term; this.startOffset = startOffset; this.endOffset = endOffset; } @Override public boolean nextPosition() throws IOException { return false; } @Override public int freq() throws IOException { return 1; // documented short-coming of MatchesIterator based UnifiedHighlighter } @Override public BytesRef getTerm() throws IOException { return term; } @Override public int startOffset() throws IOException { return startOffset; } @Override public int endOffset() throws IOException { return endOffset; } } }
Based on a MatchesIterator; does not look at submatches.
/** Based on a {@link MatchesIterator}; does not look at submatches. */
public static class OfMatchesIterator extends OffsetsEnum { private final MatchesIterator matchesIterator; private final Supplier<BytesRef> termSupplier; public OfMatchesIterator(MatchesIterator matchesIterator, Supplier<BytesRef> termSupplier) { this.matchesIterator = matchesIterator; this.termSupplier = termSupplier; } @Override public boolean nextPosition() throws IOException { return matchesIterator.next(); } @Override public int freq() throws IOException { return 1; // documented short-coming of MatchesIterator based UnifiedHighlighter } @Override public BytesRef getTerm() throws IOException { return termSupplier.get(); } @Override public int startOffset() throws IOException { return matchesIterator.startOffset(); } @Override public int endOffset() throws IOException { return matchesIterator.endOffset(); } }
Empty enumeration
/** * Empty enumeration */
public static final OffsetsEnum EMPTY = new OffsetsEnum() { @Override public boolean nextPosition() throws IOException { return false; } @Override public BytesRef getTerm() throws IOException { throw new UnsupportedOperationException(); } @Override public int startOffset() throws IOException { throw new UnsupportedOperationException(); } @Override public int endOffset() throws IOException { throw new UnsupportedOperationException(); } @Override public int freq() throws IOException { return 0; } };
A view over several OffsetsEnum instances, merging them in-place
/** * A view over several OffsetsEnum instances, merging them in-place */
//If OffsetsEnum and MatchesIterator ever truly merge then this could go away in lieu of DisjunctionMatchesIterator public static class MultiOffsetsEnum extends OffsetsEnum { private final PriorityQueue<OffsetsEnum> queue; private boolean started = false; public MultiOffsetsEnum(List<OffsetsEnum> inner) throws IOException { this.queue = new PriorityQueue<>(); for (OffsetsEnum oe : inner) { if (oe.nextPosition()) this.queue.add(oe); } } @Override public boolean nextPosition() throws IOException { if (started == false) { started = true; return this.queue.size() > 0; } if (this.queue.size() > 0) { OffsetsEnum top = this.queue.poll(); if (top.nextPosition()) { this.queue.add(top); return true; } else { top.close(); } return this.queue.size() > 0; } return false; } @Override public BytesRef getTerm() throws IOException { return this.queue.peek().getTerm(); } @Override public int startOffset() throws IOException { return this.queue.peek().startOffset(); } @Override public int endOffset() throws IOException { return this.queue.peek().endOffset(); } @Override public int freq() throws IOException { return this.queue.peek().freq(); } @Override public void close() throws IOException { // most child enums will have been closed in .nextPosition() // here all remaining non-exhausted enums are closed IOUtils.close(queue); } } }