/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.analyzing;

// TODO
//   - test w/ syns
//   - add pruning of low-freq ngrams?

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.fst.Util.TopResults;

//import java.io.PrintWriter;

/** Builds an ngram model from the text sent to {@link
 *  #build} and predicts based on the last grams-1 tokens in
 *  the request sent to {@link #lookup}. This tries to
 *  handle the "long tail" of suggestions for when the
 *  incoming query is a never before seen query string.
 *
 *  <p>Likely this suggester would only be used as a
 *  fallback, when the primary suggester fails to find
 *  any suggestions.
 *
 *  <p>Note that the weight for each suggestion is unused,
 *  and the suggestions are the analyzed forms (so your
 *  analysis process should normally be very "light").
 *
 *  <p>This uses the stupid backoff language model to smooth
 *  scores across ngram models; see
 *  "Large language models in machine translation",
 *  http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.76.1126
 *  for details.
 *
 *  <p>From {@link #lookup}, the key of each result is the
 *  ngram token; the value is Long.MAX_VALUE * score (fixed
 *  point, cast to long). Divide by Long.MAX_VALUE to get
 *  the score back, which ranges from 0.0 to 1.0.
 *
 *  onlyMorePopular is unused.
 *
 *  @lucene.experimental
 */
// redundant 'implements Accountable' to workaround javadocs bugs
public class FreeTextSuggester extends Lookup implements Accountable {
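
  // A minimal usage sketch (illustrative only, not part of the original
  // source; StandardAnalyzer, the IndexReader "reader" and the field name
  // "body" are assumptions):
  //
  //   Analyzer analyzer = new StandardAnalyzer();
  //   FreeTextSuggester suggester = new FreeTextSuggester(analyzer);
  //   suggester.build(new LuceneDictionary(reader, "body"));
  //   for (Lookup.LookupResult result : suggester.lookup("wizard o", 5)) {
  //     // result.value is Long.MAX_VALUE * score; recover the 0.0..1.0 score:
  //     double score = result.value / (double) Long.MAX_VALUE;
  //   }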
/** Codec name used in the header for the saved model. */
public final static String CODEC_NAME = "freetextsuggest";
  /** Initial version of the saved model file format. */
public final static int VERSION_START = 0;
  /** Current version of the saved model file format. */
public final static int VERSION_CURRENT = VERSION_START;
/** By default we use a bigram model. */
  public static final int DEFAULT_GRAMS = 2;

  // In general this could vary with gram, but the
  // original paper seems to use this constant:
  /** The constant used for backoff smoothing; during
   *  lookup, this means that if a given trigram did not
   *  occur, and we backoff to the bigram, the overall score
   *  will be 0.4 times what the bigram model would have
   *  assigned. */
public final static double ALPHA = 0.4;
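
  // Illustrative arithmetic (not from the original source): with grams=3,
  // if the trigram context "foo bar" was never indexed but the bigram
  // "bar baz" accounts for 10 of the 100 occurrences of "bar", then "baz"
  // is suggested with score ALPHA * (10.0 / 100) = 0.04 rather than being
  // dropped entirely.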
/** Holds 1gram, 2gram, 3gram models as a single FST. */
private FST<Long> fst;
  /** Analyzer that will be used for analyzing suggestions at
   *  index time. */
  private final Analyzer indexAnalyzer;

  private long totTokens;
  /** Analyzer that will be used for analyzing suggestions at
   *  query time. */
  private final Analyzer queryAnalyzer;

  // 2 = bigram, 3 = trigram
  private final int grams;

  private final byte separator;
/** Number of entries the lookup was built with */
private long count = 0;
  /** The default character used to join multiple tokens
   *  into a single ngram token. The input tokens produced
   *  by the analyzer must not contain this character. */
public static final byte DEFAULT_SEPARATOR = 0x1e;
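
  // 0x1e is the ASCII "record separator" control character, which tokens
  // produced by a typical analyzer are very unlikely to contain.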
  /** Instantiate, using the provided analyzer for both
   *  indexing and lookup, using bigram model by default. */
  public FreeTextSuggester(Analyzer analyzer) {
    this(analyzer, analyzer, DEFAULT_GRAMS);
  }
  /** Instantiate, using the provided indexing and lookup
   *  analyzers, using bigram model by default. */
  public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
    this(indexAnalyzer, queryAnalyzer, DEFAULT_GRAMS);
  }
  /** Instantiate, using the provided indexing and lookup
   *  analyzers, with the specified model (2 = bigram, 3 =
   *  trigram, etc.). */
  public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int grams) {
    this(indexAnalyzer, queryAnalyzer, grams, DEFAULT_SEPARATOR);
  }
  /** Instantiate, using the provided indexing and lookup
   *  analyzers, and specified model (2 = bigram, 3 =
   *  trigram, etc.). The separator is passed to {@link
   *  ShingleFilter#setTokenSeparator} to join multiple
   *  tokens into a single ngram token; it must be an ascii
   *  (7-bit-clean) byte. No input tokens should have this
   *  byte, otherwise {@code IllegalArgumentException} is
   *  thrown. */
  public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int grams, byte separator) {
    this.grams = grams;
    this.indexAnalyzer = addShingles(indexAnalyzer);
    this.queryAnalyzer = addShingles(queryAnalyzer);
    if (grams < 1) {
      throw new IllegalArgumentException("grams must be >= 1");
    }
    if ((separator & 0x80) != 0) {
      throw new IllegalArgumentException("separator must be simple ascii character");
    }
    this.separator = separator;
  }
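
  // Illustrative example (not from the original source): with grams=3 and
  // the separator shown here as <SEP>, analyzing "wizard of oz" through the
  // shingle-wrapped analyzer yields the tokens "wizard", "wizard<SEP>of",
  // "wizard<SEP>of<SEP>oz", "of", "of<SEP>oz" and "oz", i.e. every 1gram,
  // 2gram and 3gram, since addShingles() below appends a
  // ShingleFilter(stream, 2, grams) that also emits the original unigrams.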
/** Returns byte size of the underlying FST. */
  @Override
  public long ramBytesUsed() {
    if (fst == null) {
      return 0;
    }
    return fst.ramBytesUsed();
  }

  @Override
  public Collection<Accountable> getChildResources() {
    if (fst == null) {
      return Collections.emptyList();
    } else {
      return Collections.singletonList(Accountables.namedAccountable("fst", fst));
    }
  }

  private Analyzer addShingles(final Analyzer other) {
    if (grams == 1) {
      return other;
    } else {
      // TODO: use ShingleAnalyzerWrapper?
      // Tack on ShingleFilter to the end, to generate token ngrams:
      return new AnalyzerWrapper(other.getReuseStrategy()) {
        @Override
        protected Analyzer getWrappedAnalyzer(String fieldName) {
          return other;
        }

        @Override
        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
          ShingleFilter shingles = new ShingleFilter(components.getTokenStream(), 2, grams);
          shingles.setTokenSeparator(Character.toString((char) separator));
          return new TokenStreamComponents(components.getSource(), shingles);
        }
      };
    }
  }

  @Override
  public void build(InputIterator iterator) throws IOException {
    build(iterator, IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB);
  }
  /** Build the suggest index, using up to the specified
   *  amount of temporary RAM while building. Note that
   *  the weights for the suggestions are ignored. */
  public void build(InputIterator iterator, double ramBufferSizeMB) throws IOException {
    if (iterator.hasPayloads()) {
      throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.hasContexts()) {
      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }

    String prefix = getClass().getSimpleName();
    Path tempIndexPath = Files.createTempDirectory(prefix + ".index.");

    Directory dir = FSDirectory.open(tempIndexPath);

    IndexWriterConfig iwc = new IndexWriterConfig(indexAnalyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(ramBufferSizeMB);
    IndexWriter writer = new IndexWriter(dir, iwc);

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    // TODO: if only we had IndexOptions.TERMS_ONLY...
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    ft.setOmitNorms(true);
    ft.freeze();

    Document doc = new Document();
    Field field = new Field("body", "", ft);
    doc.add(field);

    totTokens = 0;
    IndexReader reader = null;

    boolean success = false;
    count = 0;
    try {
      while (true) {
        BytesRef surfaceForm = iterator.next();
        if (surfaceForm == null) {
          break;
        }
        field.setStringValue(surfaceForm.utf8ToString());
        writer.addDocument(doc);
        count++;
      }

      reader = DirectoryReader.open(writer);

      Terms terms = MultiTerms.getTerms(reader, "body");
      if (terms == null) {
        throw new IllegalArgumentException("need at least one suggestion");
      }

      // Move all ngrams into an FST:
      TermsEnum termsEnum = terms.iterator();

      Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
      Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);

      IntsRefBuilder scratchInts = new IntsRefBuilder();
      while (true) {
        BytesRef term = termsEnum.next();
        if (term == null) {
          break;
        }
        int ngramCount = countGrams(term);
        if (ngramCount > grams) {
          throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
        }
        if (ngramCount == 1) {
          totTokens += termsEnum.totalTermFreq();
        }

        builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
      }

      fst = builder.finish();
      if (fst == null) {
        throw new IllegalArgumentException("need at least one suggestion");
      }
      //System.out.println("FST: " + fst.getNodeCount() + " nodes");

      /*
      PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
      Util.toDot(fst, pw, true, true);
      pw.close();
      */

      // Writer was only temporary, to count up bigrams,
      // which we transferred to the FST, so now we
      // rollback:
      writer.rollback();
      success = true;
    } finally {
      try {
        if (success) {
          IOUtils.close(reader, dir);
        } else {
          IOUtils.closeWhileHandlingException(reader, writer, dir);
        }
      } finally {
        IOUtils.rm(tempIndexPath);
      }
    }
  }

  @Override
  public boolean store(DataOutput output) throws IOException {
    CodecUtil.writeHeader(output, CODEC_NAME, VERSION_CURRENT);
    output.writeVLong(count);
    output.writeByte(separator);
    output.writeVInt(grams);
    output.writeVLong(totTokens);
    fst.save(output);
    return true;
  }

  @Override
  public boolean load(DataInput input) throws IOException {
    CodecUtil.checkHeader(input, CODEC_NAME, VERSION_START, VERSION_START);
    count = input.readVLong();
    byte separatorOrig = input.readByte();
    if (separatorOrig != separator) {
      throw new IllegalStateException("separator=" + separator + " is incorrect: original model was built with separator=" + separatorOrig);
    }
    int gramsOrig = input.readVInt();
    if (gramsOrig != grams) {
      throw new IllegalStateException("grams=" + grams + " is incorrect: original model was built with grams=" + gramsOrig);
    }
    totTokens = input.readVLong();

    fst = new FST<>(input, PositiveIntOutputs.getSingleton());

    return true;
  }

  @Override
  public List<LookupResult> lookup(final CharSequence key, /* ignored */ boolean onlyMorePopular, int num) {
    return lookup(key, null, onlyMorePopular, num);
  }
/** Lookup, without any context. */
  public List<LookupResult> lookup(final CharSequence key, int num) {
    return lookup(key, null, true, num);
  }

  @Override
  public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, /* ignored */ boolean onlyMorePopular, int num) {
    try {
      return lookup(key, contexts, num);
    } catch (IOException ioe) {
      // bogus:
      throw new RuntimeException(ioe);
    }
  }

  @Override
  public long getCount() {
    return count;
  }

  private int countGrams(BytesRef token) {
    int count = 1;
    for (int i = 0; i < token.length; i++) {
      if (token.bytes[token.offset + i] == separator) {
        count++;
      }
    }
    return count;
  }
/** Retrieve suggestions. */
  public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, int num) throws IOException {
    if (contexts != null) {
      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }

    if (fst == null) {
      throw new IllegalStateException("Lookup not supported at this time");
    }

    try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
      TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();

      BytesRefBuilder[] lastTokens = new BytesRefBuilder[grams];
      //System.out.println("lookup: key='" + key + "'");

      // Run full analysis, but save only the
      // last 1gram, last 2gram, etc.:
      int maxEndOffset = -1;
      boolean sawRealToken = false;
      while (ts.incrementToken()) {
        BytesRef tokenBytes = termBytesAtt.getBytesRef();
        sawRealToken |= tokenBytes.length > 0;
        // TODO: this is somewhat iffy; today, ShingleFilter
        // sets posLen to the gram count; maybe we should make
        // a separate dedicated att for this?
        int gramCount = posLenAtt.getPositionLength();

        assert gramCount <= grams;

        // Safety: make sure the recalculated count "agrees":
        if (countGrams(tokenBytes) != gramCount) {
          throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
        }
        maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
        BytesRefBuilder b = new BytesRefBuilder();
        b.append(tokenBytes);
        lastTokens[gramCount - 1] = b;
      }
      ts.end();

      if (!sawRealToken) {
        throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
      }

      // Carefully fill last tokens with _ tokens;
      // ShingleFilter apparently won't emit "only hole"
      // tokens:
      int endPosInc = posIncAtt.getPositionIncrement();

      // Note this will also be true if input is the empty
      // string (in which case we saw no tokens and
      // maxEndOffset is still -1), which in fact works out OK
      // because we fill the unigram with an empty BytesRef
      // below:
      boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
      //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());

      if (lastTokenEnded) {
        //System.out.println("  lastTokenEnded");
        // If user hit space after the last token, then
        // "upgrade" all tokens.  This way "foo " will suggest
        // all bigrams starting w/ foo, and not any unigrams
        // starting with "foo":
        for (int i = grams - 1; i > 0; i--) {
          BytesRefBuilder token = lastTokens[i - 1];
          if (token == null) {
            continue;
          }
          token.append(separator);
          lastTokens[i] = token;
        }
        lastTokens[0] = new BytesRefBuilder();
      }

      Arc<Long> arc = new Arc<>();

      BytesReader bytesReader = fst.getBytesReader();

      // Try highest order models first, and if they return
      // results, return that; else, fallback:
      double backoff = 1.0;

      List<LookupResult> results = new ArrayList<>(num);

      // We only add a given suffix once, from the highest
      // order model that saw it; for subsequent lower order
      // models we skip it:
      final Set<BytesRef> seen = new HashSet<>();

      for (int gram = grams - 1; gram >= 0; gram--) {
        BytesRefBuilder token = lastTokens[gram];
        // Don't make unigram predictions from empty string:
        if (token == null || (token.length() == 0 && key.length() > 0)) {
          // Input didn't have enough tokens:
          //System.out.println("  gram=" + gram + ": skip: not enough input");
          continue;
        }

        if (endPosInc > 0 && gram <= endPosInc) {
          // Skip hole-only predictions; in theory we
          // shouldn't have to do this, but we'd need to fix
          // ShingleFilter to produce only-hole tokens:
          //System.out.println("  break: only holes now");
          break;
        }

        //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

        // TODO: we could add fuzziness here
        // match the prefix portion exactly
        //Pair<Long,BytesRef> prefixOutput = null;
        Long prefixOutput = null;
        try {
          prefixOutput = lookupPrefix(fst, bytesReader, token.get(), arc);
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
        //System.out.println("  prefixOutput=" + prefixOutput);

        if (prefixOutput == null) {
          // This model never saw this prefix, e.g. the
          // trigram model never saw context "purple mushroom"
          backoff *= ALPHA;
          continue;
        }

        // TODO: we could do this division at build time, and
        // bake it into the FST?

        // Denominator for computing scores from current
        // model's predictions:
        long contextCount = totTokens;

        BytesRef lastTokenFragment = null;

        for (int i = token.length() - 1; i >= 0; i--) {
          if (token.byteAt(i) == separator) {
            BytesRef context = new BytesRef(token.bytes(), 0, i);
            Long output = Util.get(fst, Util.toIntsRef(context, new IntsRefBuilder()));
            assert output != null;
            contextCount = decodeWeight(output);
            lastTokenFragment = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
            break;
          }
        }

        final BytesRefBuilder finalLastToken = new BytesRefBuilder();
        if (lastTokenFragment == null) {
          finalLastToken.copyBytes(token.get());
        } else {
          finalLastToken.copyBytes(lastTokenFragment);
        }

        CharsRefBuilder spare = new CharsRefBuilder();

        // complete top-N
        TopResults<Long> completions = null;
        try {

          // Because we store multiple models in one FST
          // (1gram, 2gram, 3gram), we must restrict the
          // search so that it only considers the current
          // model.  For highest order model, this is not
          // necessary since all completions in the FST
          // must be from this model, but for lower order
          // models we have to filter out the higher order
          // ones:

          // Must do num+seen.size() for queue depth because we may
          // reject up to seen.size() paths in acceptResult():
          Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num + seen.size(), weightComparator) {

            BytesRefBuilder scratchBytes = new BytesRefBuilder();

            @Override
            protected void addIfCompetitive(Util.FSTPath<Long> path) {
              if (path.arc.label != separator) {
                //System.out.println("    keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                super.addIfCompetitive(path);
              } else {
                //System.out.println("    prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
              }
            }

            @Override
            protected boolean acceptResult(IntsRef input, Long output) {
              Util.toBytesRef(input, scratchBytes);
              finalLastToken.grow(finalLastToken.length() + scratchBytes.length());
              int lenSav = finalLastToken.length();
              finalLastToken.append(scratchBytes);
              //System.out.println("    accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
              boolean ret = seen.contains(finalLastToken.get()) == false;

              finalLastToken.setLength(lenSav);
              return ret;
            }
          };

          // since this search is initialized with a single start node
          // it is okay to start with an empty input path here
          searcher.addStartPaths(arc, prefixOutput, true, new IntsRefBuilder());

          completions = searcher.search();
          assert completions.isComplete;
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }

        int prefixLength = token.length();

        BytesRefBuilder suffix = new BytesRefBuilder();
        //System.out.println("    " + completions.length + " completions");

        nextCompletion:
        for (Result<Long> completion : completions) {
          token.setLength(prefixLength);
          // append suffix
          Util.toBytesRef(completion.input, suffix);
          token.append(suffix);

          //System.out.println("    completion " + token.utf8ToString());

          // Skip this path if a higher-order model already
          // saw/predicted its last token:
          BytesRef lastToken = token.get();
          for (int i = token.length() - 1; i >= 0; i--) {
            if (token.byteAt(i) == separator) {
              assert token.length() - i - 1 > 0;
              lastToken = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
              break;
            }
          }
          if (seen.contains(lastToken)) {
            //System.out.println("      skip dup " + lastToken.utf8ToString());
            continue nextCompletion;
          }
          seen.add(BytesRef.deepCopyOf(lastToken));
          spare.copyUTF8Bytes(token.get());
          LookupResult result = new LookupResult(spare.toString(),
                                                 (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
          results.add(result);
          assert results.size() == seen.size();
          //System.out.println("  add result=" + result);
        }
        backoff *= ALPHA;
      }

      Collections.sort(results, new Comparator<LookupResult>() {
        @Override
        public int compare(LookupResult a, LookupResult b) {
          if (a.value > b.value) {
            return -1;
          } else if (a.value < b.value) {
            return 1;
          } else {
            // Tie break by UTF16 sort order:
            return ((String) a.key).compareTo((String) b.key);
          }
        }
      });

      if (results.size() > num) {
        results.subList(num, results.size()).clear();
      }

      return results;
    }
  }
/** weight -&gt; cost */
  private long encodeWeight(long ngramCount) {
    return Long.MAX_VALUE - ngramCount;
  }
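
  // Note: ngram counts are stored as (Long.MAX_VALUE - count) so that, with
  // weightComparator below, Util.TopNSearcher surfaces the most frequent
  // ngrams first (it returns the paths with the smallest outputs).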
/** cost -&gt; weight */
  //private long decodeWeight(Pair<Long,BytesRef> output) {
  private long decodeWeight(Long output) {
    assert output != null;
    return (int) (Long.MAX_VALUE - output);
  }

  // NOTE: copied from WFSTCompletionLookup & tweaked
  private Long lookupPrefix(FST<Long> fst, FST.BytesReader bytesReader, BytesRef scratch, Arc<Long> arc) throws /*Bogus*/IOException {

    Long output = fst.outputs.getNoOutput();

    fst.getFirstArc(arc);

    byte[] bytes = scratch.bytes;
    int pos = scratch.offset;
    int end = pos + scratch.length;
    while (pos < end) {
      if (fst.findTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null) {
        return null;
      } else {
        output = fst.outputs.add(output, arc.output);
      }
    }

    return output;
  }

  static final Comparator<Long> weightComparator = new Comparator<Long>() {
    @Override
    public int compare(Long left, Long right) {
      return left.compareTo(right);
    }
  };
  /** Returns the weight associated with an input string,
   *  or null if it does not exist. */
  public Object get(CharSequence key) {
    throw new UnsupportedOperationException();
  }
}