/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.highlight;

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.UnicodeUtil;

/**
 * TokenStream created from a term vector field. The term vector requires positions and/or offsets (either). If you
 * want payloads, add PayloadAttributeImpl (as you would normally) but don't assume the attribute is already added just
 * because you know the term vector has payloads: the first call to incrementToken() checks whether you asked for
 * them, and if you didn't, payloads won't be exposed. This TokenStream supports an efficient {@link #reset()}, so
 * there's no need to wrap it with a caching impl.
 * <p>
 * The implementation creates an array of tokens indexed by token position. As long as there aren't massive jumps
 * in positions, this is fine. It also assumes there aren't large numbers of tokens at the same position, since it adds
 * them to a per-position linked-list in O(N^2) complexity. When the term vector has no positions, it divides
 * the startOffset by 8 to use as a substitute. In that case, tokens with the same startOffset will occupy
 * the same final position; otherwise tokens become adjacent.
 *
 * @lucene.internal
 */
public final class TokenStreamFromTermVector extends TokenStream {

  private final Terms vector;

  private final CharTermAttribute termAttribute;

  private final PositionIncrementAttribute positionIncrementAttribute;

  private final int maxStartOffset;

  private OffsetAttribute offsetAttribute;//maybe null

  private PayloadAttribute payloadAttribute;//maybe null

  private CharsRefBuilder termCharsBuilder;//term data here

  private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
  private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null

  private TokenLL firstToken = null; // the head of a linked-list

  private TokenLL incrementToken = null;

  private boolean initialized = false;//lazy
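
  /*
   * Usage sketch (illustrative only): a consumer would typically obtain the term vector via
   * IndexReader.getTermVector(docId, field) and then drive this stream like any other TokenStream.
   * The docId and "body" field name below are hypothetical.
   *
   *   Terms tv = indexReader.getTermVector(docId, "body");
   *   TokenStream ts = new TokenStreamFromTermVector(tv, -1);  // -1 = no start-offset limit
   *   ts.addAttribute(PayloadAttribute.class);                 // only if you want payloads surfaced
   *   ts.reset();
   *   while (ts.incrementToken()) {
   *     // read CharTermAttribute / OffsetAttribute / PayloadAttribute here
   *   }
   *   ts.end();
   *   ts.close();
   */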

  /**
   * Constructor. The uninversion doesn't happen here; it's delayed till the first call to
   * {@link #incrementToken}.
   *
   * @param vector Terms that contains the data for creating the TokenStream. Must have positions and/or offsets.
   * @param maxStartOffset if a token's start offset exceeds this then the token is not added. -1 disables the limit.
   */
  public TokenStreamFromTermVector(Terms vector, int maxStartOffset) throws IOException {
    this.maxStartOffset = maxStartOffset < 0 ? Integer.MAX_VALUE : maxStartOffset;
    assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
    if (!vector.hasPositions() && !vector.hasOffsets()) {
      throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
    }
    assert vector.hasFreqs();
    this.vector = vector;
    termAttribute = addAttribute(CharTermAttribute.class);
    positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
  }

  public Terms getTermVectorTerms() { return vector; }

  @Override
  public void reset() throws IOException {
    incrementToken = null;
    super.reset();
  }

  //We delay initialization because we can see which attributes the consumer wants, particularly payloads
  private void init() throws IOException {
    assert !initialized;
    short dpEnumFlags = PostingsEnum.POSITIONS;
    if (vector.hasOffsets()) {
      dpEnumFlags |= PostingsEnum.OFFSETS;
      offsetAttribute = addAttribute(OffsetAttribute.class);
    }
    if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
      dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);//must ask for offsets too
      payloadAttribute = getAttribute(PayloadAttribute.class);
      payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
      spareBytesRefBuilder = new BytesRefBuilder();
    }

    // We put term data here
    termCharsBuilder = new CharsRefBuilder();
    termCharsBuilder.grow((int) (vector.size() * 7));//7 is over-estimate of average term len

    // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position

    TokenLL[] positionedTokens = initTokensArray();

    int lastPosition = -1;

    final TermsEnum termsEnum = vector.iterator();
    BytesRef termBytesRef;
    PostingsEnum dpEnum = null;
    CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
    //int sumFreq = 0;
    while ((termBytesRef = termsEnum.next()) != null) {

      //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
      // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
      tempCharsRefBuilder.grow(termBytesRef.length);
      final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
      final int termCharsOff = termCharsBuilder.length();
      termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);

      dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
      assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
      dpEnum.nextDoc();
      final int freq = dpEnum.freq();
      //sumFreq += freq;
      for (int j = 0; j < freq; j++) {
        int pos = dpEnum.nextPosition();
        TokenLL token = new TokenLL();
        token.termCharsOff = termCharsOff;
        token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
        if (offsetAttribute != null) {
          token.startOffset = dpEnum.startOffset();
          if (token.startOffset > maxStartOffset) {
            continue;//filter this token out; exceeds threshold
          }
          token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
          if (pos == -1) {
            pos = token.startOffset >> 3;//divide by 8
          }
        }

        if (payloadAttribute != null) {
          final BytesRef payload = dpEnum.getPayload();
          token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
        }

        //Add token to an array indexed by position
        if (positionedTokens.length <= pos) {
          //grow, but not 2x since we think our original length estimate is close
          TokenLL[] newPositionedTokens = new TokenLL[(int)((pos + 1) * 1.5f)];
          System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
          positionedTokens = newPositionedTokens;
        }
        positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]);

        lastPosition = Math.max(lastPosition, pos);
      }
    }

//    System.out.println(String.format(
//        "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
//        sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
//        (originalPositionEstimate/(lastPosition + 1.0f))));

    // Step 2: Link all Tokens into a linked-list and set position increments as we go

    int prevTokenPos = -1;
    TokenLL prevToken = null;
    for (int pos = 0; pos <= lastPosition; pos++) {
      TokenLL token = positionedTokens[pos];
      if (token == null) {
        continue;
      }
      //link
      if (prevToken != null) {
        assert prevToken.next == null;
        prevToken.next = token; //concatenate linked-list
      } else {
        assert firstToken == null;
        firstToken = token;
      }
      //set increments
      if (vector.hasPositions()) {
        token.positionIncrement = pos - prevTokenPos;
        while (token.next != null) {
          token = token.next;
          token.positionIncrement = 0;
        }
      } else {
        token.positionIncrement = 1;
        while (token.next != null) {
          prevToken = token;
          token = token.next;
          if (prevToken.startOffset == token.startOffset) {
            token.positionIncrement = 0;
          } else {
            token.positionIncrement = 1;
          }
        }
      }
      prevTokenPos = pos;
      prevToken = token;
    }

    initialized = true;
  }

  private TokenLL[] initTokensArray() throws IOException {
    // Estimate the number of position slots we need from term stats. We use some estimation factors taken from
    //  Wikipedia that reduce the likelihood of needing to expand the array.
    int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
    assert sumTotalTermFreq != -1;
    final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this

    // This estimate is based on maxStartOffset. Err on the side of this being larger than needed.
    final int offsetLimitPositionEstimate = (int) (maxStartOffset / 5.0);

    // Take the smaller of the two estimates, but no smaller than 64
    return new TokenLL[Math.max(64, Math.min(originalPositionEstimate, offsetLimitPositionEstimate))];
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (incrementToken == null) {
      if (!initialized) {
        init();
        assert initialized;
      }
      incrementToken = firstToken;
      if (incrementToken == null) {
        return false;
      }
    } else if (incrementToken.next != null) {
      incrementToken = incrementToken.next;
    } else {
      return false;
    }
    clearAttributes();
    termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
    positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement);
    if (offsetAttribute != null) {
      offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
    }
    if (payloadAttribute != null) {
      if (incrementToken.payloadIndex == -1) {
        payloadAttribute.setPayload(null);
      } else {
        payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex));
      }
    }
    return true;
  }

  private static class TokenLL {
    // This class should weigh 32 bytes, including object header

    int termCharsOff; // see termCharsBuilder
    short termCharsLen;

    int positionIncrement;
    int startOffset;
    short endOffsetInc; // add to startOffset to get endOffset

    int payloadIndex;

    TokenLL next;

    /** Given the head of a linked-list (possibly null) this inserts the token at the correct
     * spot to maintain the desired order, and returns the head (which could be this token if it's the smallest).
     * O(N^2) complexity but N should be a handful at most. */
    TokenLL insertIntoSortedLinkedList(final TokenLL head) {
      assert next == null;
      if (head == null) {
        return this;
      } else if (this.compareOffsets(head) <= 0) {
        this.next = head;
        return this;
      }
      TokenLL prev = head;
      while (prev.next != null && this.compareOffsets(prev.next) > 0) {
        prev = prev.next;
      }
      this.next = prev.next;
      prev.next = this;
      return head;
    }
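
    /*
     * Illustration (hypothetical offsets): inserting a token with startOffset=5 into the list
     * [startOffset=3] -> [startOffset=9] yields [3] -> [5] -> [9]. Ties on startOffset fall back to
     * endOffsetInc via compareOffsets below.
     */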

    /** by startOffset then endOffset */
    int compareOffsets(TokenLL tokenB) {
      int cmp = Integer.compare(this.startOffset, tokenB.startOffset);
      if (cmp == 0) {
        cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
      }
      return cmp;
    }
  }
}