/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;

// TODO
//    - compare perf to PhraseQuery exact and sloppy
//    - optimize: find terms that are in fact MUST (because all paths
//      through the A include that term)
//    - if we ever store posLength in the index, it would be easy[ish]
//      to take it into account here

A proximity query that lets you express an automaton, whose transitions are terms, to match documents. This is a generalization of other proximity queries like PhraseQuery, MultiPhraseQuery and SpanNearQuery. It is likely slow, since it visits any document having any of the terms (i.e. it acts like a disjunction, not a conjunction like PhraseQuery), and then it must merge-sort all positions within each document to test whether/how many times the automaton matches.

After creating the query, use createState, setAccept, addTransition and addAnyTransition to build up the automaton. Once you are done, call finish and then execute the query.

This code is very new and likely has exciting bugs! @lucene.experimental

/** A proximity query that lets you express an automaton, whose * transitions are terms, to match documents. This is a generalization * of other proximity queries like {@link PhraseQuery}, {@link * MultiPhraseQuery} and {@link SpanNearQuery}. It is likely * slow, since it visits any document having any of the terms (i.e. it * acts like a disjunction, not a conjunction like {@link * PhraseQuery}), and then it must merge-sort all positions within each * document to test whether/how many times the automaton matches. * * <p>After creating the query, use {@link #createState}, {@link * #setAccept}, {@link #addTransition} and {@link #addAnyTransition} to * build up the automaton. Once you are done, call {@link #finish} and * then execute the query. * * <p>This code is very new and likely has exciting bugs! * * @lucene.experimental */
public class TermAutomatonQuery extends Query implements Accountable { private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(TermAutomatonQuery.class); private final String field; private final Automaton.Builder builder; Automaton det; private final Map<BytesRef,Integer> termToID = new HashMap<>(); private final Map<Integer,BytesRef> idToTerm = new HashMap<>(); private int anyTermID = -1; public TermAutomatonQuery(String field) { this.field = field; this.builder = new Automaton.Builder(); }
Returns a new state; state 0 is always the initial state.
/** Returns a new state; state 0 is always the initial state. */
public int createState() { return builder.createState(); }
Marks the specified state as accept or not.
/** Marks the specified state as accept or not. */
public void setAccept(int state, boolean accept) { builder.setAccept(state, accept); }
Adds a transition to the automaton.
/** Adds a transition to the automaton. */
public void addTransition(int source, int dest, String term) { addTransition(source, dest, new BytesRef(term)); }
Adds a transition to the automaton.
/** Adds a transition to the automaton. */
public void addTransition(int source, int dest, BytesRef term) { if (term == null) { throw new NullPointerException("term should not be null"); } builder.addTransition(source, dest, getTermID(term)); }
Adds a transition matching any term.
/** Adds a transition matching any term. */
public void addAnyTransition(int source, int dest) { builder.addTransition(source, dest, getTermID(null)); }
Call this once you are done adding states/transitions.
/** Call this once you are done adding states/transitions. */
public void finish() { finish(DEFAULT_MAX_DETERMINIZED_STATES); }
Call this once you are done adding states/transitions.
Params:
  • maxDeterminizedStates – Maximum number of states created when determinizing the automaton. Higher numbers allow this operation to consume more memory but allow more complex automatons.
/** * Call this once you are done adding states/transitions. * @param maxDeterminizedStates Maximum number of states created when * determinizing the automaton. Higher numbers allow this operation to * consume more memory but allow more complex automatons. */
public void finish(int maxDeterminizedStates) { Automaton automaton = builder.finish(); // System.out.println("before det:\n" + automaton.toDot()); Transition t = new Transition(); // TODO: should we add "eps back to initial node" for all states, // and det that? then we don't need to revisit initial node at // every position? but automaton could blow up? And, this makes it // harder to skip useless positions at search time? if (anyTermID != -1) { // Make sure there are no leading or trailing ANY: int count = automaton.initTransition(0, t); for(int i=0;i<count;i++) { automaton.getNextTransition(t); if (anyTermID >= t.min && anyTermID <= t.max) { throw new IllegalStateException("automaton cannot lead with an ANY transition"); } } int numStates = automaton.getNumStates(); for(int i=0;i<numStates;i++) { count = automaton.initTransition(i, t); for(int j=0;j<count;j++) { automaton.getNextTransition(t); if (automaton.isAccept(t.dest) && anyTermID >= t.min && anyTermID <= t.max) { throw new IllegalStateException("automaton cannot end with an ANY transition"); } } } int termCount = termToID.size(); // We have to carefully translate these transitions so automaton // realizes they also match all other terms: Automaton newAutomaton = new Automaton(); for(int i=0;i<numStates;i++) { newAutomaton.createState(); newAutomaton.setAccept(i, automaton.isAccept(i)); } for(int i=0;i<numStates;i++) { count = automaton.initTransition(i, t); for(int j=0;j<count;j++) { automaton.getNextTransition(t); int min, max; if (t.min <= anyTermID && anyTermID <= t.max) { // Match any term min = 0; max = termCount-1; } else { min = t.min; max = t.max; } newAutomaton.addTransition(t.source, t.dest, min, max); } } newAutomaton.finishState(); automaton = newAutomaton; } det = Operations.removeDeadStates(Operations.determinize(automaton, maxDeterminizedStates)); if (det.isAccept(0)) { throw new IllegalStateException("cannot accept the empty string"); } } @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { IndexReaderContext context = searcher.getTopReaderContext(); Map<Integer,TermStates> termStates = new HashMap<>(); for (Map.Entry<BytesRef,Integer> ent : termToID.entrySet()) { if (ent.getKey() != null) { termStates.put(ent.getValue(), TermStates.build(context, new Term(field, ent.getKey()), scoreMode.needsScores())); } } return new TermAutomatonWeight(det, searcher, termStates, boost); } @Override public String toString(String field) { // TODO: what really am I supposed to do with the incoming field... StringBuilder sb = new StringBuilder(); sb.append("TermAutomatonQuery(field="); sb.append(this.field); if (det != null) { sb.append(" numStates="); sb.append(det.getNumStates()); } sb.append(')'); return sb.toString(); } private int getTermID(BytesRef term) { Integer id = termToID.get(term); if (id == null) { id = termToID.size(); if (term != null) { term = BytesRef.deepCopyOf(term); } termToID.put(term, id); idToTerm.put(id, term); if (term == null) { anyTermID = id; } } return id; }
Returns true iff o is equal to this.
/** Returns true iff <code>o</code> is equal to this. */
@Override public boolean equals(Object other) { return sameClassAs(other) && equalsTo(getClass().cast(other)); } private static boolean checkFinished(TermAutomatonQuery q) { if (q.det == null) { throw new IllegalStateException("Call finish first on: " + q); } return true; } private boolean equalsTo(TermAutomatonQuery other) { return checkFinished(this) && checkFinished(other) && other == this; } @Override public int hashCode() { checkFinished(this); // LUCENE-7295: this used to be very awkward toDot() call; it is safer to assume // that no two instances are equivalent instead (until somebody finds a better way to check // on automaton equivalence quickly). return System.identityHashCode(this); } @Override public long ramBytesUsed() { return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(builder) + RamUsageEstimator.sizeOfObject(det) + RamUsageEstimator.sizeOfObject(field) + RamUsageEstimator.sizeOfObject(idToTerm) + RamUsageEstimator.sizeOfObject(termToID); }
Returns the dot (graphviz) representation of this automaton. This is extremely useful for visualizing the automaton.
/** Returns the dot (graphviz) representation of this automaton. * This is extremely useful for visualizing the automaton. */
public String toDot() { // TODO: refactor & share with Automaton.toDot! StringBuilder b = new StringBuilder(); b.append("digraph Automaton {\n"); b.append(" rankdir = LR\n"); final int numStates = det.getNumStates(); if (numStates > 0) { b.append(" initial [shape=plaintext,label=\"0\"]\n"); b.append(" initial -> 0\n"); } Transition t = new Transition(); for(int state=0;state<numStates;state++) { b.append(" "); b.append(state); if (det.isAccept(state)) { b.append(" [shape=doublecircle,label=\"").append(state).append("\"]\n"); } else { b.append(" [shape=circle,label=\"").append(state).append("\"]\n"); } int numTransitions = det.initTransition(state, t); for(int i=0;i<numTransitions;i++) { det.getNextTransition(t); assert t.max >= t.min; for(int j=t.min;j<=t.max;j++) { b.append(" "); b.append(state); b.append(" -> "); b.append(t.dest); b.append(" [label=\""); if (j == anyTermID) { b.append('*'); } else { b.append(idToTerm.get(j).utf8ToString()); } b.append("\"]\n"); } } } b.append('}'); return b.toString(); } // TODO: should we impl rewrite to return BooleanQuery of PhraseQuery, // when 1) automaton is finite, 2) doesn't use ANY transition, 3) is // "small enough"? static class EnumAndScorer { public final int termID; public final PostingsEnum posEnum; // How many positions left in the current document: public int posLeft; // Current position public int pos; public EnumAndScorer(int termID, PostingsEnum posEnum) { this.termID = termID; this.posEnum = posEnum; } } final class TermAutomatonWeight extends Weight { final Automaton automaton; private final Map<Integer,TermStates> termStates; private final Similarity.SimScorer stats; private final Similarity similarity; public TermAutomatonWeight(Automaton automaton, IndexSearcher searcher, Map<Integer,TermStates> termStates, float boost) throws IOException { super(TermAutomatonQuery.this); this.automaton = automaton; this.termStates = termStates; this.similarity = searcher.getSimilarity(); List<TermStatistics> allTermStats = new ArrayList<>(); for(Map.Entry<Integer,BytesRef> ent : idToTerm.entrySet()) { Integer termID = ent.getKey(); if (ent.getValue() != null) { TermStatistics stats = searcher.termStatistics(new Term(field, ent.getValue()), termStates.get(termID)); if (stats != null) { allTermStats.add(stats); } } } if (allTermStats.isEmpty()) { stats = null; // no terms matched at all, will not use sim } else { stats = similarity.scorer(boost, searcher.collectionStatistics(field), allTermStats.toArray(new TermStatistics[allTermStats.size()])); } } @Override public void extractTerms(Set<Term> terms) { for(BytesRef text : termToID.keySet()) { if (text != null) { terms.add(new Term(field, text)); } } } @Override public String toString() { return "weight(" + TermAutomatonQuery.this + ")"; } @Override public Scorer scorer(LeafReaderContext context) throws IOException { // Initialize the enums; null for a given slot means that term didn't appear in this reader EnumAndScorer[] enums = new EnumAndScorer[idToTerm.size()]; boolean any = false; for(Map.Entry<Integer,TermStates> ent : termStates.entrySet()) { TermStates termStates = ent.getValue(); assert termStates.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context); BytesRef term = idToTerm.get(ent.getKey()); TermState state = termStates.get(context); if (state != null) { TermsEnum termsEnum = context.reader().terms(field).iterator(); termsEnum.seekExact(term, state); enums[ent.getKey()] = new EnumAndScorer(ent.getKey(), termsEnum.postings(null, PostingsEnum.POSITIONS)); any = true; } } if (any) { return new TermAutomatonScorer(this, enums, anyTermID, idToTerm, new LeafSimScorer(stats, context.reader(), field, true)); } else { return null; } } @Override public boolean isCacheable(LeafReaderContext ctx) { return true; } @Override public Explanation explain(LeafReaderContext context, int doc) throws IOException { // TODO return null; } } public Query rewrite(IndexReader reader) throws IOException { if (Operations.isEmpty(det)) { return new MatchNoDocsQuery(); } IntsRef single = Operations.getSingleton(det); if (single != null && single.length == 1) { return new TermQuery(new Term(field, idToTerm.get(single.ints[single.offset]))); } // TODO: can PhraseQuery really handle multiple terms at the same position? If so, why do we even have MultiPhraseQuery? // Try for either PhraseQuery or MultiPhraseQuery, which only works when the automaton is a sausage: MultiPhraseQuery.Builder mpq = new MultiPhraseQuery.Builder(); PhraseQuery.Builder pq = new PhraseQuery.Builder(); Transition t = new Transition(); int state = 0; int pos = 0; query: while (true) { int count = det.initTransition(state, t); if (count == 0) { if (det.isAccept(state) == false) { mpq = null; pq = null; } break; } else if (det.isAccept(state)) { mpq = null; pq = null; break; } int dest = -1; List<Term> terms = new ArrayList<>(); boolean matchesAny = false; for(int i=0;i<count;i++) { det.getNextTransition(t); if (i == 0) { dest = t.dest; } else if (dest != t.dest) { mpq = null; pq = null; break query; } matchesAny |= anyTermID >= t.min && anyTermID <= t.max; if (matchesAny == false) { for(int termID=t.min;termID<=t.max;termID++) { terms.add(new Term(field, idToTerm.get(termID))); } } } if (matchesAny == false) { mpq.add(terms.toArray(new Term[terms.size()]), pos); if (pq != null) { if (terms.size() == 1) { pq.add(terms.get(0), pos); } else { pq = null; } } } state = dest; pos++; } if (pq != null) { return pq.build(); } else if (mpq != null) { return mpq.build(); } // TODO: we could maybe also rewrite to union of PhraseQuery (pull all finite strings) if it's "worth it"? return this; } @Override public void visit(QueryVisitor visitor) { if (visitor.acceptField(field) == false) { return; } QueryVisitor v = visitor.getSubVisitor(BooleanClause.Occur.SHOULD, this); for (BytesRef term : termToID.keySet()) { v.consumeTerms(this, new Term(field, term)); } } }