/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.index.DocIDMerger;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

/**
 * Codec API for writing term vectors:
 * <ol>
 *   <li>For every document, {@link #startDocument(int)} is called,
 *       informing the Codec how many fields will be written.
 *   <li>{@link #startField(FieldInfo, int, boolean, boolean, boolean)} is called for
 *       each field in the document, informing the codec how many terms
 *       will be written for that field, and whether or not positions,
 *       offsets, or payloads are enabled.
 *   <li>Within each field, {@link #startTerm(BytesRef, int)} is called
 *       for each term.
 *   <li>If offsets and/or positions are enabled, then
 *       {@link #addPosition(int, int, int, BytesRef)} will be called for each term
 *       occurrence.
 *   <li>After all documents have been written, {@link #finish(FieldInfos, int)}
 *       is called for verification/sanity-checks.
 *   <li>Finally the writer is closed ({@link #close()})
 * </ol>
 *
 * @lucene.experimental
 */
public abstract class TermVectorsWriter implements Closeable {

/** Sole constructor. (For invocation by subclass
 *  constructors, typically implicit.) */
protected TermVectorsWriter() { }
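
// A minimal sketch of the call sequence an indexing chain is expected to
// drive against this writer. Illustrative only: "writer", "fieldInfo",
// "fieldInfos" and the literal values are assumptions, not part of this API.
//
//   writer.startDocument(1);                             // one vector field in this doc
//   writer.startField(fieldInfo, 2, true, true, false);  // 2 terms; positions + offsets, no payloads
//   writer.startTerm(new BytesRef("lucene"), 1);         // freq = 1
//   writer.addPosition(0, 0, 6, null);                   // position, startOffset, endOffset, payload
//   writer.finishTerm();
//   writer.startTerm(new BytesRef("search"), 1);
//   writer.addPosition(1, 7, 13, null);
//   writer.finishTerm();
//   writer.finishField();
//   writer.finishDocument();
//   writer.finish(fieldInfos, 1);                        // numDocs must match the startDocument calls
//   writer.close();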

/** Called before writing the term vectors of the document.
 *  {@link #startField(FieldInfo, int, boolean, boolean, boolean)} will
 *  be called <code>numVectorFields</code> times. Note that if term
 *  vectors are enabled, this is called even if the document
 *  has no vector fields; in this case <code>numVectorFields</code>
 *  will be zero. */
public abstract void startDocument(int numVectorFields) throws IOException;
/** Called after a doc and all its fields have been added. */
public void finishDocument() throws IOException {}

/** Called before writing the terms of the field.
 *  {@link #startTerm(BytesRef, int)} will be called <code>numTerms</code> times. */
public abstract void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets, boolean payloads) throws IOException;
/** Called after a field and all its terms have been added. */
public void finishField() throws IOException {}

/** Adds a term and its term frequency <code>freq</code>.
 *  If this field has positions and/or offsets enabled, then
 *  {@link #addPosition(int, int, int, BytesRef)} will be called
 *  <code>freq</code> times. */
public abstract void startTerm(BytesRef term, int freq) throws IOException;
/** Called after a term and all its positions have been added. */
public void finishTerm() throws IOException {}
/** Adds a term position and offsets */
public abstract void addPosition(int position, int startOffset, int endOffset, BytesRef payload) throws IOException;

/** Called before {@link #close()}, passing in the number
 *  of documents that were written. Note that this is
 *  intentionally redundant (equivalent to the number of
 *  calls to {@link #startDocument(int)}), but a Codec should
 *  check that this is the case to detect the JRE bug described
 *  in LUCENE-1282. */
public abstract void finish(FieldInfos fis, int numDocs) throws IOException;
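
// A sketch of the sanity-check a concrete Codec would perform here.
// Hypothetical: "numDocsWritten" is a counter the subclass would increment
// in its own startDocument implementation; the message text is illustrative.
//
//   @Override
//   public void finish(FieldInfos fis, int numDocs) throws IOException {
//     if (numDocsWritten != numDocs) {
//       throw new RuntimeException("expected " + numDocs + " docs, but wrote " + numDocsWritten);
//     }
//   }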

/**
 * Called by IndexWriter when writing new segments.
 * <p>
 * This is an expert API that allows the codec to consume
 * positions and offsets directly from the indexer.
 * <p>
 * The default implementation calls {@link #addPosition(int, int, int, BytesRef)},
 * but subclasses can override this if they want to efficiently write
 * all the positions, then all the offsets, for example.
 * <p>
 * NOTE: This API is extremely expert and subject to change or removal!!!
 *
 * @lucene.internal
 */
// TODO: we should probably nuke this and make a more efficient 4.x format
// PreFlex-RW could then be slow and buffer (it's only used in tests...)
public void addProx(int numProx, DataInput positions, DataInput offsets) throws IOException {
  int position = 0;
  int lastOffset = 0;
  BytesRefBuilder payload = null;

  for (int i = 0; i < numProx; i++) {
    final int startOffset;
    final int endOffset;
    final BytesRef thisPayload;

    if (positions == null) {
      position = -1;
      thisPayload = null;
    } else {
      // Positions are delta-encoded vInts; the low bit flags a payload.
      int code = positions.readVInt();
      position += code >>> 1;

      if ((code & 1) != 0) {
        // This position has a payload
        final int payloadLength = positions.readVInt();

        if (payload == null) {
          payload = new BytesRefBuilder();
        }
        payload.grow(payloadLength);

        positions.readBytes(payload.bytes(), 0, payloadLength);
        payload.setLength(payloadLength);
        thisPayload = payload.get();
      } else {
        thisPayload = null;
      }
    }

    if (offsets == null) {
      startOffset = endOffset = -1;
    } else {
      // Offsets are two vInt deltas: start relative to the previous
      // endOffset, then the length of this occurrence.
      startOffset = lastOffset + offsets.readVInt();
      endOffset = startOffset + offsets.readVInt();
      lastOffset = endOffset;
    }
    addPosition(position, startOffset, endOffset, thisPayload);
  }
}

private static class TermVectorsMergeSub extends DocIDMerger.Sub {
  private final TermVectorsReader reader;
  private final int maxDoc;
  int docID = -1;

  public TermVectorsMergeSub(MergeState.DocMap docMap, TermVectorsReader reader, int maxDoc) {
    super(docMap);
    this.maxDoc = maxDoc;
    this.reader = reader;
  }

  @Override
  public int nextDoc() {
    docID++;
    if (docID == maxDoc) {
      return NO_MORE_DOCS;
    } else {
      return docID;
    }
  }
}
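
// For reference, the stream layout addProx decodes, as inferred from the
// logic above (an assumption, not a normative format description):
//
//   positions: per occurrence, vInt code = (positionDelta << 1) | hasPayload;
//              if hasPayload == 1, a vInt payloadLength and the payload bytes follow
//   offsets:   per occurrence, vInt delta from the previous endOffset to this
//              startOffset, then vInt length (endOffset - startOffset)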

/** Merges in the term vectors from the readers in
 *  <code>mergeState</code>. The default implementation skips
 *  over deleted documents, and uses {@link #startDocument(int)},
 *  {@link #startField(FieldInfo, int, boolean, boolean, boolean)},
 *  {@link #startTerm(BytesRef, int)}, {@link #addPosition(int, int, int, BytesRef)},
 *  and {@link #finish(FieldInfos, int)},
 *  returning the number of documents that were written.
 *  Implementations can override this method for more sophisticated
 *  merging (bulk-byte copying, etc). */
public int merge(MergeState mergeState) throws IOException {
  List<TermVectorsMergeSub> subs = new ArrayList<>();
  for (int i = 0; i < mergeState.termVectorsReaders.length; i++) {
    TermVectorsReader reader = mergeState.termVectorsReaders[i];
    if (reader != null) {
      reader.checkIntegrity();
    }
    subs.add(new TermVectorsMergeSub(mergeState.docMaps[i], reader, mergeState.maxDocs[i]));
  }

  final DocIDMerger<TermVectorsMergeSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);

  int docCount = 0;
  while (true) {
    TermVectorsMergeSub sub = docIDMerger.next();
    if (sub == null) {
      break;
    }

    // NOTE: it's very important to first assign to vectors then pass it to
    // termVectorsWriter.addAllDocVectors; see LUCENE-1282
    Fields vectors;
    if (sub.reader == null) {
      vectors = null;
    } else {
      vectors = sub.reader.get(sub.docID);
    }
    addAllDocVectors(vectors, mergeState);
    docCount++;
  }
  finish(mergeState.mergeFieldInfos, docCount);
  return docCount;
}
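
// Codecs whose readers share a compatible on-disk format can override merge
// to bulk-copy raw bytes instead of re-writing every vector. A hedged
// outline; "readersAreBulkCompatible" and "copyRawChunks" are hypothetical:
//
//   @Override
//   public int merge(MergeState mergeState) throws IOException {
//     if (readersAreBulkCompatible(mergeState)) {  // hypothetical format check
//       return copyRawChunks(mergeState);          // raw byte copy, no per-term decode
//     }
//     return super.merge(mergeState);              // fall back to the default above
//   }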

/** Safe (but slowish) default method to write every
 *  vector field in the document. */
protected final void addAllDocVectors(Fields vectors, MergeState mergeState) throws IOException {
  if (vectors == null) {
    startDocument(0);
    finishDocument();
    return;
  }

  int numFields = vectors.size();
  if (numFields == -1) {
    // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
    numFields = 0;
    for (final Iterator<String> it = vectors.iterator(); it.hasNext(); ) {
      it.next();
      numFields++;
    }
  }
  startDocument(numFields);

  String lastFieldName = null;

  TermsEnum termsEnum = null;
  PostingsEnum docsAndPositionsEnum = null;

  int fieldCount = 0;
  for (String fieldName : vectors) {
    fieldCount++;
    final FieldInfo fieldInfo = mergeState.mergeFieldInfos.fieldInfo(fieldName);

    assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0 : "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
    lastFieldName = fieldName;

    final Terms terms = vectors.terms(fieldName);
    if (terms == null) {
      // FieldsEnum shouldn't lie...
      continue;
    }

    final boolean hasPositions = terms.hasPositions();
    final boolean hasOffsets = terms.hasOffsets();
    final boolean hasPayloads = terms.hasPayloads();
    assert !hasPayloads || hasPositions;

    int numTerms = (int) terms.size();
    if (numTerms == -1) {
      // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function
      numTerms = 0;
      termsEnum = terms.iterator();
      while (termsEnum.next() != null) {
        numTerms++;
      }
    }

    startField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
    termsEnum = terms.iterator();

    int termCount = 0;
    while (termsEnum.next() != null) {
      termCount++;

      final int freq = (int) termsEnum.totalTermFreq();

      startTerm(termsEnum.term(), freq);

      if (hasPositions || hasOffsets) {
        docsAndPositionsEnum = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
        assert docsAndPositionsEnum != null;

        final int docID = docsAndPositionsEnum.nextDoc();
        assert docID != DocIdSetIterator.NO_MORE_DOCS;
        assert docsAndPositionsEnum.freq() == freq;

        for (int posUpto = 0; posUpto < freq; posUpto++) {
          final int pos = docsAndPositionsEnum.nextPosition();
          final int startOffset = docsAndPositionsEnum.startOffset();
          final int endOffset = docsAndPositionsEnum.endOffset();

          final BytesRef payload = docsAndPositionsEnum.getPayload();

          assert !hasPositions || pos >= 0;
          addPosition(pos, startOffset, endOffset, payload);
        }
      }
      finishTerm();
    }
    assert termCount == numTerms;
    finishField();
  }
  assert fieldCount == numFields;
  finishDocument();
}

@Override
public abstract void close() throws IOException;
}