/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.miscellaneous;

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Util;

/**
 * Concatenates/Joins every incoming token with a separator into one output token for every path through the
 * token stream (which is a graph).  In simple cases this yields one token, but in the presence of any tokens with
 * a zero positionIncrement (e.g. synonyms) it will be more.  This filter uses the token bytes, position increment,
 * and position length of the incoming stream.  Other attributes are not used or manipulated.
 *
 * @lucene.experimental
 */
public final class ConcatenateGraphFilter extends TokenStream {
  /*
   * Token stream which converts a provided token stream to an automaton.
   * The accepted strings enumeration from the automaton is available through the
   * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute.
   */
  /**
   * Represents the separation between tokens, if
   * <code>preserveSep</code> is <code>true</code>.
   */
  public final static int SEP_LABEL = TokenStreamToAutomaton.POS_SEP;
  public final static int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
  public final static boolean DEFAULT_PRESERVE_SEP = true;
  public final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;

  private final BytesRefBuilderTermAttribute bytesAtt = addAttribute(BytesRefBuilderTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  private final TokenStream inputTokenStream;
  private final boolean preserveSep;
  private final boolean preservePositionIncrements;
  private final int maxGraphExpansions;

  private LimitedFiniteStringsIterator finiteStrings;
  private CharTermAttribute charTermAttribute;
  private boolean wasReset = false;
  private int endOffset;
  /**
   * Creates a token stream to convert <code>input</code> to a token stream
   * of accepted strings by its token stream graph.
   * <p>
   * This constructor uses the default settings of the constants in this class.
   */
  public ConcatenateGraphFilter(TokenStream inputTokenStream) {
    this(inputTokenStream, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
  }
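
  /*
   * A minimal usage sketch (illustrative only, not part of this class): the WhitespaceAnalyzer
   * and the field name "body" are assumptions; only the ConcatenateGraphFilter calls below come
   * from this file.
   *
   *   Analyzer analyzer = new WhitespaceAnalyzer();
   *   try (TokenStream source = analyzer.tokenStream("body", "mykeyword another keyword");
   *        ConcatenateGraphFilter stream = new ConcatenateGraphFilter(source)) {
   *     TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
   *     stream.reset();
   *     while (stream.incrementToken()) {
   *       // with the defaults this prints one term, the tokens joined by SEP_LABEL bytes
   *       System.out.println(termAtt.getBytesRef().utf8ToString());
   *     }
   *     stream.end();
   *   }
   */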
  /**
   * Creates a token stream to convert <code>input</code> to a token stream
   * of accepted strings by its token stream graph.
   *
   * @param inputTokenStream The input/incoming TokenStream
   * @param preserveSep Whether {@link #SEP_LABEL} should separate the input tokens in the concatenated token
   * @param preservePositionIncrements Whether to add an empty token for missing positions.
   *                                   The effect is a consecutive {@link #SEP_LABEL}.
   *                                   When false, it's as if there were no missing positions
   *                                   (we pretend the surrounding tokens were adjacent).
   * @param maxGraphExpansions If the tokenStream graph has more than this many possible paths through, then we'll throw
   *                           {@link TooComplexToDeterminizeException} to preserve the stability and memory of the
   *                           machine.
   * @throws TooComplexToDeterminizeException if the tokenStream graph has more than {@code maxGraphExpansions}
   *         expansions
   */
  public ConcatenateGraphFilter(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
    // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
    // the input stream entirely in the first call to incrementToken
    this.inputTokenStream = inputTokenStream;
    this.preserveSep = preserveSep;
    this.preservePositionIncrements = preservePositionIncrements;
    this.maxGraphExpansions = maxGraphExpansions;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    // we only capture this if we really need it to save the UTF-8 to UTF-16 conversion
    charTermAttribute = getAttribute(CharTermAttribute.class); // may return null
    wasReset = true;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (finiteStrings == null) {
      if (wasReset == false) {
        throw new IllegalStateException("reset() missing before incrementToken");
      }
      // lazy init/consume
      Automaton automaton = toAutomaton(); // calls reset(), incrementToken() repeatedly, and end() on inputTokenStream
      finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
      // note: would be nice to know the startOffset but toAutomaton doesn't capture it.  We'll assume 0
      endOffset = inputTokenStream.getAttribute(OffsetAttribute.class).endOffset();
    }

    IntsRef string = finiteStrings.next();
    if (string == null) {
      return false;
    }

    clearAttributes();

    if (finiteStrings.size() > 1) { // if number of iterated strings so far is more than one...
      posIncrAtt.setPositionIncrement(0); // stacked
    }

    offsetAtt.setOffset(0, endOffset);

    Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8
    if (charTermAttribute != null) {
      charTermAttribute.setLength(0);
      charTermAttribute.append(bytesAtt.toUTF16());
    }

    return true;
  }

  @Override
  public void end() throws IOException {
    super.end();
    if (finiteStrings == null) { // thus inputTokenStream hasn't yet received end()
      inputTokenStream.end(); // the input TS may really want to see "end()" called even if incrementToken hasn't.
    } // else we already eagerly consumed inputTokenStream including end()
    if (endOffset != -1) {
      offsetAtt.setOffset(0, endOffset);
    }
  }

  @Override
  public void close() throws IOException {
    super.close();
    // delegate lifecycle.  Note toAutomaton does not close the stream
    inputTokenStream.close();
    finiteStrings = null;
    wasReset = false; // reset
    endOffset = -1; // reset
  }
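
  /*
   * An illustrative sketch of the stacked output above (the synonym mapping is an assumed
   * example): if the incoming graph maps "wifi" -> "wireless network", then for the text
   * "turn wifi on" incrementToken() emits one concatenated token per path:
   *
   *   turn<SEP>wifi<SEP>on                    (positionIncrement = 1)
   *   turn<SEP>wireless<SEP>network<SEP>on    (positionIncrement = 0, stacked)
   *
   * where <SEP> is the SEP_LABEL byte, present because preserveSep defaults to true.
   */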
  /**
   * Converts the tokenStream to an automaton, treating the transition labels as utf-8.  Does *not* close it.
   */
  public Automaton toAutomaton() throws IOException {
    return toAutomaton(false);
  }
  /**
   * Converts the tokenStream to an automaton.  Does *not* close it.
   */
  public Automaton toAutomaton(boolean unicodeAware) throws IOException {
    // TODO refactor this
    // maybe we could hook up a modified automaton from TermAutomatonQuery here?

    // Create corresponding automaton: labels are bytes
    // from each analyzed token, with byte 0 used as
    // separator between tokens:
    final TokenStreamToAutomaton tsta;
    if (preserveSep) {
      tsta = new EscapingTokenStreamToAutomaton(SEP_LABEL);
    } else {
      // When we're not preserving sep, we don't steal 0xff
      // byte, so we don't need to do any escaping:
      tsta = new TokenStreamToAutomaton();
    }
    tsta.setPreservePositionIncrements(preservePositionIncrements);
    tsta.setUnicodeArcs(unicodeAware);

    Automaton automaton = tsta.toAutomaton(inputTokenStream);

    // TODO: we can optimize this somewhat by determinizing
    // while we convert

    automaton = replaceSep(automaton, preserveSep, SEP_LABEL);

    // This automaton should not blow up during determinize:
    return Operations.determinize(automaton, maxGraphExpansions);
  }
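
  /*
   * A minimal sketch of calling toAutomaton directly (`source` is an assumed input stream):
   * the returned automaton accepts one byte string per analyzed path through the input graph,
   * e.g. for suggester-style indexing of analyzed forms.
   *
   *   ConcatenateGraphFilter cgf = new ConcatenateGraphFilter(source);
   *   Automaton a = cgf.toAutomaton(); // UTF-8 transition labels
   *   // toAutomaton does *not* close `source`; cgf.close() delegates and does
   *   cgf.close();
   */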
  /**
   * Just escapes the {@link ConcatenateGraphFilter#SEP_LABEL} byte by writing an extra one before it.
   */
  private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {

    final BytesRefBuilder spare = new BytesRefBuilder();
    final byte sepLabel;

    public EscapingTokenStreamToAutomaton(int sepLabel) {
      assert sepLabel <= Byte.MAX_VALUE;
      this.sepLabel = (byte) sepLabel;
    }

    @Override
    protected BytesRef changeToken(BytesRef in) {
      int upto = 0;
      for (int i = 0; i < in.length; i++) {
        byte b = in.bytes[in.offset + i];
        if (b == sepLabel) {
          spare.grow(upto + 2);
          spare.setByteAt(upto++, sepLabel);
          spare.setByteAt(upto++, b);
        } else {
          spare.grow(upto + 1);
          spare.setByteAt(upto++, b);
        }
      }
      spare.setLength(upto);
      return spare.get();
    }
  }

  // Replaces SEP with epsilon or remaps them if
  // we were asked to preserve them:
  private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) {

    Automaton result = new Automaton();

    // Copy all states over
    int numStates = a.getNumStates();
    for (int s = 0; s < numStates; s++) {
      result.createState();
      result.setAccept(s, a.isAccept(s));
    }

    // Go in reverse topo sort so we know we only have to
    // make one pass:
    Transition t = new Transition();
    int[] topoSortStates = Operations.topoSortStates(a);
    for (int i = 0; i < topoSortStates.length; i++) {
      int state = topoSortStates[topoSortStates.length - 1 - i];
      int count = a.initTransition(state, t);
      for (int j = 0; j < count; j++) {
        a.getNextTransition(t);
        if (t.min == TokenStreamToAutomaton.POS_SEP) {
          assert t.max == TokenStreamToAutomaton.POS_SEP;
          if (preserveSep) {
            // Remap to SEP_LABEL:
            result.addTransition(state, t.dest, sepLabel);
          } else {
            result.addEpsilon(state, t.dest);
          }
        } else if (t.min == TokenStreamToAutomaton.HOLE) {
          assert t.max == TokenStreamToAutomaton.HOLE;

          // Just remove the hole: there will then be two
          // SEP tokens next to each other, which will only
          // match another hole at search time.  Note that
          // it will also match an empty-string token ... if
          // that's somehow a problem we can always map HOLE
          // to a dedicated byte (and escape it in the
          // input).
          result.addEpsilon(state, t.dest);
        } else {
          result.addTransition(state, t.dest, t.min, t.max);
        }
      }
    }

    result.finishState();

    return result;
  }
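
  /*
   * Worked illustration of the escaping above (the byte values are assumptions about the input):
   * with sepLabel = SEP_LABEL, changeToken rewrites the token bytes
   *
   *   [ 'a', SEP_LABEL, 'b' ]  ->  [ 'a', SEP_LABEL, SEP_LABEL, 'b' ]
   *
   * so a literal separator inside a token cannot be confused with the POS_SEP arc that
   * replaceSep later remaps to SEP_LABEL (preserveSep == true) or removes (preserveSep == false).
   */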
  /**
   * Attribute providing access to the term builder and UTF-16 conversion
   * @lucene.internal
   */
  public interface BytesRefBuilderTermAttribute extends TermToBytesRefAttribute {
    /**
     * Returns the builder from which the term is derived.
     */
    BytesRefBuilder builder();
    /**
     * Returns the term represented as UTF-16
     */
    CharSequence toUTF16();
  }
  /**
   * Implementation of {@link BytesRefBuilderTermAttribute}
   * @lucene.internal
   */
  public static final class BytesRefBuilderTermAttributeImpl extends AttributeImpl implements BytesRefBuilderTermAttribute, TermToBytesRefAttribute {

    private final BytesRefBuilder bytes = new BytesRefBuilder();
    private transient CharsRefBuilder charsRef;
    /**
     * Sole constructor; no-op.
     */
    public BytesRefBuilderTermAttributeImpl() {
    }

    @Override
    public BytesRefBuilder builder() {
      return bytes;
    }

    @Override
    public BytesRef getBytesRef() {
      return bytes.get();
    }

    @Override
    public void clear() {
      bytes.clear();
    }

    @Override
    public void copyTo(AttributeImpl target) {
      BytesRefBuilderTermAttributeImpl other = (BytesRefBuilderTermAttributeImpl) target;
      other.bytes.copyBytes(bytes);
    }

    @Override
    public AttributeImpl clone() {
      BytesRefBuilderTermAttributeImpl other = new BytesRefBuilderTermAttributeImpl();
      copyTo(other);
      return other;
    }

    @Override
    public void reflectWith(AttributeReflector reflector) {
      reflector.reflect(TermToBytesRefAttribute.class, "bytes", getBytesRef());
    }

    @Override
    public CharSequence toUTF16() {
      if (charsRef == null) {
        charsRef = new CharsRefBuilder();
      }
      charsRef.copyUTF8Bytes(getBytesRef());
      return charsRef.get();
    }
  }
}