/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.document;

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.fst.FST;

A PostingsFormat which supports document suggestion based on indexed SuggestFields. Document suggestion is based on a weighted FST which maps analyzed terms of a SuggestField to its surface form and document id.

Files:

  • .lkp: Completion Dictionary
  • .cmp: Completion Index

Completion Dictionary

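The .lkp file contains an FST for each suggest field

  • CompletionDict (.lkp) --> Header, FST^NumSuggestFields, Footer
  • Header --> CodecHeader
  • FST --> FST<Long, BytesRef>
  • Footer --> CodecFooter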

Notes:

  • Header is a CodecHeader storing the version information for the Completion implementation.
  • FST maps all analyzed forms to surface forms of a SuggestField

Completion Index

The .cmp file contains an index into the completion dictionary, so that it can be accessed randomly.

  • CompletionIndex (.cmp) --> Header, NumSuggestFields, Entry^NumSuggestFields, Footer
  • Header --> CodecHeader
  • NumSuggestFields --> Uint32
  • Entry --> FieldNumber, CompletionDictionaryOffset, MinWeight, MaxWeight, Type
  • FieldNumber --> Uint32
  • CompletionDictionaryOffset --> Uint64
  • MinWeight --> Uint64
  • MaxWeight --> Uint64
  • Type --> Byte
  • Footer --> CodecFooter

Notes:

  • Header is a CodecHeader storing the version information for the Completion implementation.
  • NumSuggestFields is the number of suggest fields indexed
  • FieldNumber is the field's number from FieldInfos (.fnm)
  • CompletionDictionaryOffset is the file offset of a field's FST in CompletionDictionary (.lkp)
  • MinWeight and MaxWeight are the global minimum and maximum weight for the field
  • Type indicates if the suggester has context or not
@lucene.experimental
/**
 * <p>
 * A {@link PostingsFormat} which supports document suggestion based on
 * indexed {@link SuggestField}s.
 * Document suggestion is based on a weighted FST which maps analyzed
 * terms of a {@link SuggestField} to its surface form and document id.
 * </p>
 * <p>
 * Files:
 * <ul>
 *   <li><tt>.lkp</tt>: <a href="#Completiondictionary">Completion Dictionary</a></li>
 *   <li><tt>.cmp</tt>: <a href="#Completionindex">Completion Index</a></li>
 * </ul>
 * <p>
 * <a name="Completiondictionary"></a>
 * <h3>Completion Dictionary</h3>
 * <p>The .lkp file contains an FST for each suggest field
 * </p>
 * <ul>
 *   <li>CompletionDict (.lkp) --&gt; Header, FST<sup>NumSuggestFields</sup>, Footer</li>
 *   <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
 *   <!-- TODO: should the FST output be mentioned at all? -->
 *   <li>FST --&gt; {@link FST FST&lt;Long, BytesRef&gt;}</li>
 *   <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
 * </ul>
 * <p>Notes:</p>
 * <ul>
 *   <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
 *     for the Completion implementation.</li>
 *   <li>FST maps all analyzed forms to surface forms of a SuggestField</li>
 * </ul>
 * <a name="Completionindex"></a>
 * <h3>Completion Index</h3>
 * <p>The .cmp file contains an index into the completion dictionary, so that it can be
 * accessed randomly.</p>
 * <ul>
 *   <li>CompletionIndex (.cmp) --&gt; Header, NumSuggestFields, Entry<sup>NumSuggestFields</sup>, Footer</li>
 *   <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
 *   <li>NumSuggestFields --&gt; {@link DataOutput#writeVInt Uint32}</li>
 *   <li>Entry --&gt; FieldNumber, CompletionDictionaryOffset, MinWeight, MaxWeight, Type</li>
 *   <li>FieldNumber --&gt; {@link DataOutput#writeVInt Uint32}</li>
 *   <li>CompletionDictionaryOffset --&gt; {@link DataOutput#writeVLong Uint64}</li>
 *   <li>MinWeight --&gt; {@link DataOutput#writeVLong Uint64}</li>
 *   <li>MaxWeight --&gt; {@link DataOutput#writeVLong Uint64}</li>
 *   <li>Type --&gt; {@link DataOutput#writeByte Byte}</li>
 *   <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
 * </ul>
 * <p>Notes:</p>
 * <ul>
 *   <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
 *     for the Completion implementation.</li>
 *   <li>NumSuggestFields is the number of suggest fields indexed</li>
 *   <li>FieldNumber is the field's number from {@link FieldInfos} (.fnm)</li>
 *   <li>CompletionDictionaryOffset is the file offset of a field's FST in CompletionDictionary (.lkp)</li>
 *   <li>MinWeight and MaxWeight are the global minimum and maximum weight for the field</li>
 *   <li>Type indicates if the suggester has context or not</li>
 * </ul>
 *
 * @lucene.experimental
 */
public abstract class CompletionPostingsFormat extends PostingsFormat { static final String CODEC_NAME = "completion"; static final int COMPLETION_CODEC_VERSION = 1; static final int COMPLETION_VERSION_CURRENT = COMPLETION_CODEC_VERSION; static final String INDEX_EXTENSION = "cmp"; static final String DICT_EXTENSION = "lkp";
An enum that allows to control if suggester FSTs are loaded into memory or read off-heap
/** * An enum that allows to control if suggester FSTs are loaded into memory or read off-heap */
public enum FSTLoadMode {
Always read FSTs from disk. NOTE: If this option is used the FST will be read off-heap even if buffered directory implementations are used.
/** * Always read FSTs from disk. * NOTE: If this option is used the FST will be read off-heap even if buffered directory implementations * are used. */
OFF_HEAP,
Never read FSTs from disk ie. all suggest fields FSTs are loaded into memory
/** * Never read FSTs from disk ie. all suggest fields FSTs are loaded into memory */
ON_HEAP,
Automatically make the decision if FSTs are read from disk depending if the segment read from an MMAPDirectory
/** * Automatically make the decision if FSTs are read from disk depending if the segment read from an MMAPDirectory */
AUTO } private final FSTLoadMode fstLoadMode;
Used only by core Lucene at read-time via Service Provider instantiation
/** * Used only by core Lucene at read-time via Service Provider instantiation */
public CompletionPostingsFormat() { this(FSTLoadMode.ON_HEAP); }
Creates a CompletionPostingsFormat that will use the provided fstLoadMode to determine if the completion FST should be loaded on or off heap.
/** * Creates a {@link CompletionPostingsFormat} that will * use the provided <code>fstLoadMode</code> to determine * if the completion FST should be loaded on or off heap. */
public CompletionPostingsFormat(FSTLoadMode fstLoadMode) { super(CODEC_NAME); this.fstLoadMode = fstLoadMode; }
Concrete implementation should specify the delegating postings format
/** * Concrete implementation should specify the delegating postings format */
protected abstract PostingsFormat delegatePostingsFormat(); @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { PostingsFormat delegatePostingsFormat = delegatePostingsFormat(); if (delegatePostingsFormat == null) { throw new UnsupportedOperationException("Error - " + getClass().getName() + " has been constructed without a choice of PostingsFormat"); } return new CompletionFieldsConsumer(delegatePostingsFormat, state); } @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { return new CompletionFieldsProducer(state, fstLoadMode); } }