/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.simpletext;


import java.io.IOException;

import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;

plain text doc values format.

FOR RECREATIONAL USE ONLY

The .dat file contains the data. For numbers this is a "fixed-width" file, for example a single-byte range:

 field myField
   type NUMERIC
   minvalue 0
   pattern 000
 005
 T
 234
 T
 123
 T
 ...
 
So a document's value (delta-encoded from minvalue) can be retrieved by seeking to startOffset + (1+pattern.length()+2)*docid. The extra 1 is the newline. The extra 2 is another newline and 'T' or 'F': 'T' (true) if the value is real, 'F' (false) if it is missing.

For bytes this is also a "fixed-width" file, for example:
 field myField
   type BINARY
   maxlength 6
   pattern 0
 length 6
 foobar[space][space]
 T
 length 3
 baz[space][space][space][space][space]
 T
 ...
 
So a doc's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength+2)*docid. The extra 9 is 2 newlines, plus "length " itself. The extra 2 is another newline and 'T' or 'F': 'T' (true) if the value is real, 'F' (false) if it is missing.

For sorted bytes this is a fixed-width file, for example:
 field myField
   type SORTED
   numvalues 10
   maxLength 8
   pattern 0
   ordpattern 00
 length 6
 foobar[space][space]
 length 3
 baz[space][space][space][space][space]
 ...
 03
 06
 01
 10
 ...
 
So the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues. A document's ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid. An ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord.

For sorted set this is a fixed-width file very similar to the SORTED case, for example:
 field myField
   type SORTED_SET
   numvalues 10
   maxLength 8
   pattern 0
   ordpattern XXXXX
 length 6
 foobar[space][space]
 length 3
 baz[space][space][space][space][space]
 ...
 0,3,5   
 1,2
 
 10
 ...
 
So the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues. A document's ord list can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid. This is a comma-separated list, padded with spaces to be fixed width, so trim() and split() it, and beware the empty string! An ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord.

For sorted numerics, it's encoded (not very creatively) as a comma-separated list of strings, the same as binary. Beware the empty string!

The reader can just scan this file when it opens, skipping over the data blocks and saving the offset/etc. for each field.

@lucene.experimental
/**
 * Plain text doc values format.
 * <p>
 * <b>FOR RECREATIONAL USE ONLY</b>
 * <p>
 * The .dat file contains the data.
 * For numbers this is a "fixed-width" file, for example a single byte range:
 * <pre>
 *  field myField
 *    type NUMERIC
 *    minvalue 0
 *    pattern 000
 *  005
 *  T
 *  234
 *  T
 *  123
 *  T
 *  ...
 * </pre>
 * So a document's value (delta encoded from minvalue) can be retrieved by
 * seeking to startOffset + (1+pattern.length()+2)*docid. The extra 1 is the newline.
 * The extra 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
 * <p>
 * For bytes this is also a "fixed-width" file, for example:
 * <pre>
 *  field myField
 *    type BINARY
 *    maxlength 6
 *    pattern 0
 *  length 6
 *  foobar[space][space]
 *  T
 *  length 3
 *  baz[space][space][space][space][space]
 *  T
 *  ...
 * </pre>
 * So a doc's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength+2)*doc.
 * The extra 9 is 2 newlines, plus "length " itself.
 * The extra 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
 * <p>
 * For sorted bytes this is a fixed-width file, for example:
 * <pre>
 *  field myField
 *    type SORTED
 *    numvalues 10
 *    maxLength 8
 *    pattern 0
 *    ordpattern 00
 *  length 6
 *  foobar[space][space]
 *  length 3
 *  baz[space][space][space][space][space]
 *  ...
 *  03
 *  06
 *  01
 *  10
 *  ...
 * </pre>
 * So the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
 * A document's ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid.
 * An ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord.
 * <p>
 * For sorted set this is a fixed-width file very similar to the SORTED case, for example:
 * <pre>
 *  field myField
 *    type SORTED_SET
 *    numvalues 10
 *    maxLength 8
 *    pattern 0
 *    ordpattern XXXXX
 *  length 6
 *  foobar[space][space]
 *  length 3
 *  baz[space][space][space][space][space]
 *  ...
 *  0,3,5
 *  1,2
 *
 *  10
 *  ...
 * </pre>
 * So the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
 * A document's ord list can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid.
 * This is a comma-separated list, and it's padded with spaces to be fixed width, so trim() and
 * split() it, and beware the empty string!
 * An ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord.
 * <p>
 * For sorted numerics, it's encoded (not very creatively) as a comma-separated list of strings,
 * the same as binary. Beware the empty string!
 * <p>
 * The reader can just scan this file when it opens, skipping over the data blocks
 * and saving the offset/etc for each field.
 *
 * @lucene.experimental
 */
class SimpleTextDocValuesFormat extends DocValuesFormat {

  /** File extension handed to both the writer and the reader (the .dat data file). */
  private static final String DATA_EXTENSION = "dat";

  /** Creates the format, passing the name "SimpleText" to the {@link DocValuesFormat} constructor. */
  public SimpleTextDocValuesFormat() {
    super("SimpleText");
  }

  /**
   * Creates the consumer used to write doc values for a segment.
   *
   * @param state write state forwarded unchanged to the writer
   * @return a new {@code SimpleTextDocValuesWriter} using the "dat" extension
   * @throws IOException if constructing the writer fails
   */
  @Override
  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    final DocValuesConsumer writer = new SimpleTextDocValuesWriter(state, DATA_EXTENSION);
    return writer;
  }

  /**
   * Creates the producer used to read doc values for a segment.
   *
   * @param state read state forwarded unchanged to the reader
   * @return a new {@code SimpleTextDocValuesReader} using the "dat" extension
   * @throws IOException if constructing the reader fails
   */
  @Override
  public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
    final DocValuesProducer reader = new SimpleTextDocValuesReader(state, DATA_EXTENSION);
    return reader;
  }
}