/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene50;


import java.io.IOException;
import java.util.Objects;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsIndexWriter;
import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Lucene 5.0 stored fields format.
 *
 * <p><b>Principle</b>
 * <p>This {@link StoredFieldsFormat} compresses blocks of documents in
 * order to improve the compression ratio compared to document-level
 * compression. It uses the <a href="http://code.google.com/p/lz4/">LZ4</a>
 * compression algorithm by default in 16KB blocks, which is fast at
 * compressing and very fast at decompressing data. Although the default
 * compression method that is used ({@link Mode#BEST_SPEED BEST_SPEED})
 * focuses more on speed than on compression ratio, it should provide
 * interesting compression ratios for redundant inputs (such as log files,
 * HTML or plain text). For higher compression, you can choose
 * {@link Mode#BEST_COMPRESSION BEST_COMPRESSION}, which uses the
 * <a href="http://en.wikipedia.org/wiki/DEFLATE">DEFLATE</a> algorithm
 * with 60KB blocks for a better ratio at the expense of slower performance.
 * These two options can be configured like this:
 * <pre class="prettyprint">
 *   // the default: for high performance
 *   indexWriterConfig.setCodec(new Lucene54Codec(Mode.BEST_SPEED));
 *   // instead for higher compression (but slower):
 *   // indexWriterConfig.setCodec(new Lucene54Codec(Mode.BEST_COMPRESSION));
 * </pre>
 * <p><b>File formats</b>
 * <p>Stored fields are represented by two files:
 * <ol>
 * <li><a name="field_data"></a>
 * <p>A fields data file (extension <tt>.fdt</tt>). This file stores a compact
 * representation of documents in compressed blocks of 16KB or more. When
 * writing a segment, documents are appended to an in-memory <tt>byte[]</tt>
 * buffer. When its size reaches 16KB or more, some metadata about the documents
 * is flushed to disk, immediately followed by a compressed representation of
 * the buffer using the
 * <a href="http://code.google.com/p/lz4/">LZ4</a>
 * <a href="http://fastcompression.blogspot.fr/2011/05/lz4-explained.html">compression format</a>.
 * (A sketch of this buffering appears after the listing below.)</p>
 * <p>Here is a more detailed description of the field data file format:</p>
 * <ul>
 * <li>FieldData (.fdt) --&gt; &lt;Header&gt;, PackedIntsVersion, &lt;Chunk&gt;<sup>ChunkCount</sup>, ChunkCount, DirtyChunkCount, Footer</li>
 * <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
 * <li>PackedIntsVersion --&gt; {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}</li>
 * <li>ChunkCount is not known in advance and is the number of chunks necessary to store all documents of the segment</li>
 * <li>Chunk --&gt; DocBase, ChunkDocs, DocFieldCounts, DocLengths, &lt;CompressedDocs&gt;</li>
 * <li>DocBase --&gt; the ID of the first document of the chunk as a {@link DataOutput#writeVInt VInt}</li>
 * <li>ChunkDocs --&gt; the number of documents in the chunk as a {@link DataOutput#writeVInt VInt}</li>
 * <li>DocFieldCounts --&gt; the number of stored fields of every document in the chunk, encoded as follows (see the sketch after this list):<ul>
 * <li>if chunkDocs=1, the unique value is encoded as a {@link DataOutput#writeVInt VInt}</li>
 * <li>else read a {@link DataOutput#writeVInt VInt} (let's call it <tt>bitsRequired</tt>)<ul>
 * <li>if <tt>bitsRequired</tt> is <tt>0</tt> then all values are equal, and the common value is the following {@link DataOutput#writeVInt VInt}</li>
 * <li>else <tt>bitsRequired</tt> is the number of bits required to store any value, and values are stored in a {@link PackedInts packed} array where every value is stored on exactly <tt>bitsRequired</tt> bits</li>
 * </ul></li>
 * </ul></li>
 * <li>DocLengths --&gt; the lengths of all documents in the chunk, encoded with the same method as DocFieldCounts</li>
 * <li>CompressedDocs --&gt; a compressed representation of &lt;Docs&gt; using the LZ4 compression format</li>
 * <li>Docs --&gt; &lt;Doc&gt;<sup>ChunkDocs</sup></li>
 * <li>Doc --&gt; &lt;FieldNumAndType, Value&gt;<sup>DocFieldCount</sup></li>
 * <li>FieldNumAndType --&gt; a {@link DataOutput#writeVLong VLong}, whose 3 low-order bits are Type and whose remaining bits are FieldNum (see the sketch after this list)</li>
 * <li>Type --&gt;<ul>
 * <li>0: Value is String</li>
 * <li>1: Value is BinaryValue</li>
 * <li>2: Value is Int</li>
 * <li>3: Value is Float</li>
 * <li>4: Value is Long</li>
 * <li>5: Value is Double</li>
 * <li>6, 7: unused</li>
 * </ul></li>
 * <li>FieldNum --&gt; an ID of the field</li>
 * <li>Value --&gt; {@link DataOutput#writeString(String) String} | BinaryValue | Int | Float | Long | Double depending on Type</li>
 * <li>BinaryValue --&gt; ValueLength &lt;Byte&gt;<sup>ValueLength</sup></li>
 * <li>ChunkCount --&gt; the number of chunks in this file</li>
 * <li>DirtyChunkCount --&gt; the number of prematurely flushed chunks in this file</li>
 * <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
 * </ul>
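 *
 * <p>For illustration, here is a minimal sketch of the two bit-level
 * encodings described in the list above: the FieldNumAndType packing and
 * the DocFieldCounts/DocLengths header. All names in this snippet are
 * hypothetical, not the actual implementation:
 * <pre class="prettyprint">
 * // Hypothetical helpers, for illustration only.
 * // FieldNumAndType: Type occupies the 3 low-order bits of the VLong,
 * // FieldNum the remaining high-order bits.
 * static final int TYPE_BITS = 3;
 * static final int TYPE_MASK = (1 &lt;&lt; TYPE_BITS) - 1; // 0b111
 *
 * static long packFieldNumAndType(int fieldNum, int type) {
 *   return ((long) fieldNum &lt;&lt; TYPE_BITS) | type;
 * }
 *
 * static int unpackType(long fieldNumAndType) {
 *   return (int) (fieldNumAndType &amp; TYPE_MASK);
 * }
 *
 * static int unpackFieldNum(long fieldNumAndType) {
 *   return (int) (fieldNumAndType &gt;&gt;&gt; TYPE_BITS);
 * }
 *
 * // DocFieldCounts / DocLengths: a single VInt when chunkDocs == 1,
 * // otherwise 0 followed by the common value when all values are equal,
 * // otherwise bitsRequired followed by a packed array.
 * static void writeHeaderValues(DataOutput out, int[] values) throws IOException {
 *   if (values.length == 1) {
 *     out.writeVInt(values[0]);
 *   } else {
 *     boolean allEqual = true;
 *     int max = 0;
 *     for (int v : values) {
 *       allEqual &amp;= (v == values[0]);
 *       max |= v;
 *     }
 *     if (allEqual) {
 *       out.writeVInt(0);          // bitsRequired == 0 means "all values equal"
 *       out.writeVInt(values[0]);  // followed by the common value
 *     } else {
 *       int bitsRequired = PackedInts.bitsRequired(max);
 *       out.writeVInt(bitsRequired);
 *       // ... then every value on exactly bitsRequired bits ...
 *     }
 *   }
 * }
 * </pre>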
 *
 * <p>Notes
 * <ul>
 * <li>If documents are larger than 16KB then chunks will likely contain only
 * one document. However, documents can never spread across several chunks (all
 * fields of a single document are in the same chunk).</li>
 * <li>When at least one document in a chunk is large enough so that the chunk
 * is larger than 32KB, the chunk will actually be compressed in several LZ4
 * blocks of 16KB. This allows {@link StoredFieldVisitor}s which are only
 * interested in the first fields of a document to not have to decompress 10MB
 * of data if the document is 10MB, but only 16KB.</li>
 * <li>Given that the original lengths are written in the metadata of the chunk,
 * the decompressor can leverage this information to stop decoding as soon as
 * enough data has been decompressed.</li>
 * <li>In case documents are incompressible, CompressedDocs will be less than
 * 0.5% larger than Docs.</li>
 * </ul>
 * </li>
 * <li><a name="field_index"></a>
 * <p>A fields index file (extension <tt>.fdx</tt>).</p>
 * <ul>
 * <li>FieldsIndex (.fdx) --&gt; &lt;Header&gt;, &lt;ChunkIndex&gt;, Footer</li>
 * <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
 * <li>ChunkIndex: See {@link CompressingStoredFieldsIndexWriter}</li>
 * <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
 * </ul>
 * </li>
 * </ol>
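 *
 * <p>As a rough sketch of the chunking behaviour described in the fields
 * data file entry above (hypothetical names; the actual writer lives in
 * the <tt>org.apache.lucene.codecs.compressing</tt> package):
 * <pre class="prettyprint">
 * // Hypothetical sketch: whole documents are buffered, so a document
 * // never spans two chunks, and the buffer is flushed as one chunk
 * // once it reaches the configured chunk size.
 * void addDocument(byte[] serializedDoc) throws IOException {
 *   buffer.writeBytes(serializedDoc, 0, serializedDoc.length);
 *   numBufferedDocs++;
 *   if (buffer.size() &gt;= chunkSize) { // 16KB for BEST_SPEED, 60KB for BEST_COMPRESSION
 *     flushChunk(); // DocBase, ChunkDocs, DocFieldCounts, DocLengths, then compressed docs
 *   }
 * }
 * </pre>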
 *
 * <p><b>Known limitations</b>
 * <p>This {@link StoredFieldsFormat} does not support individual documents
 * larger than (<tt>2<sup>31</sup> - 2<sup>14</sup></tt>) bytes.
 * @lucene.experimental
 */
public final class Lucene50StoredFieldsFormat extends StoredFieldsFormat {
  /** Configuration option for stored fields. */
  public static enum Mode {
    /** Trade compression ratio for retrieval speed. */
    BEST_SPEED,
    /** Trade retrieval speed for compression ratio. */
    BEST_COMPRESSION
  }

  /** Attribute key for compression mode. */
  public static final String MODE_KEY = Lucene50StoredFieldsFormat.class.getSimpleName() + ".mode";

  final Mode mode;
  /** Stored fields format with default options */
  public Lucene50StoredFieldsFormat() {
    this(Mode.BEST_SPEED);
  }

  /** Stored fields format with specified mode */
  public Lucene50StoredFieldsFormat(Mode mode) {
    this.mode = Objects.requireNonNull(mode);
  }

  @Override
  public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
    String value = si.getAttribute(MODE_KEY);
    if (value == null) {
      throw new IllegalStateException("missing value for " + MODE_KEY + " for segment: " + si.name);
    }
    Mode mode = Mode.valueOf(value);
    return impl(mode).fieldsReader(directory, si, fn, context);
  }

  @Override
  public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException {
    String previous = si.putAttribute(MODE_KEY, mode.name());
    if (previous != null && previous.equals(mode.name()) == false) {
      throw new IllegalStateException("found existing value for " + MODE_KEY + " for segment: " + si.name +
          " old=" + previous + ", new=" + mode.name());
    }
    return impl(mode).fieldsWriter(directory, si, context);
  }

  StoredFieldsFormat impl(Mode mode) {
    switch (mode) {
      case BEST_SPEED:
        // LZ4 (fast) compression, 16KB (1 << 14) chunks, at most 128 docs per chunk
        return new CompressingStoredFieldsFormat("Lucene50StoredFieldsFast", CompressionMode.FAST, 1 << 14, 128, 1024);
      case BEST_COMPRESSION:
        // DEFLATE (high) compression, 60KB (61440) chunks, at most 512 docs per chunk
        return new CompressingStoredFieldsFormat("Lucene50StoredFieldsHigh", CompressionMode.HIGH_COMPRESSION, 61440, 512, 1024);
      default:
        throw new AssertionError();
    }
  }
}