/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.facet.taxonomy.directory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.document.Document;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.LRUHashMap;
import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.CorruptIndexException; // javadocs
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * A {@link TaxonomyReader} which retrieves stored taxonomy information from a
 * {@link Directory}.
 * <P>
 * Reading from the on-disk index on every method call is too slow, so this
 * implementation employs caching: Some methods cache recent requests and their
 * results, while other methods prefetch all the data into memory and then
 * provide answers directly from in-memory tables. See the documentation of
 * individual methods for comments on their performance.
 * 
 * @lucene.experimental
 */
public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountable {

  private static final Logger log = Logger.getLogger(DirectoryTaxonomyReader.class.getName());

  private static final int DEFAULT_CACHE_VALUE = 4000;

  // NOTE: very coarse estimate!
  private static final int BYTES_PER_CACHE_ENTRY =
      4 * RamUsageEstimator.NUM_BYTES_OBJECT_REF
      + 4 * RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
      + 8 * Character.BYTES;

  private final DirectoryTaxonomyWriter taxoWriter;
  private final long taxoEpoch; // used in doOpenIfChanged
  private final DirectoryReader indexReader;

  // TODO: test DoubleBarrelLRUCache and consider using it instead
  private LRUHashMap<FacetLabel, Integer> ordinalCache;
  private LRUHashMap<Integer, FacetLabel> categoryCache;

  private volatile TaxonomyIndexArrays taxoArrays;
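  // Illustrative arithmetic for the coarse BYTES_PER_CACHE_ENTRY estimate above
  // (an added note, not in the original source): each cache entry is counted as
  // roughly 4 object references, 4 object headers, and 8 characters. Assuming a
  // 64-bit JVM with compressed oops (NUM_BYTES_OBJECT_REF = 4,
  // NUM_BYTES_OBJECT_HEADER = 12), that works out to:
  //
  //   4 * 4 + 4 * 12 + 8 * 2 = 16 + 48 + 16 = 80 bytes per entry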
  /**
   * Called only from {@link #doOpenIfChanged()}. If the taxonomy has been
   * recreated, you should pass {@code null} as the caches and parent/children
   * arrays.
   */
  DirectoryTaxonomyReader(DirectoryReader indexReader, DirectoryTaxonomyWriter taxoWriter,
      LRUHashMap<FacetLabel,Integer> ordinalCache, LRUHashMap<Integer,FacetLabel> categoryCache,
      TaxonomyIndexArrays taxoArrays) throws IOException {
    this.indexReader = indexReader;
    this.taxoWriter = taxoWriter;
    this.taxoEpoch = taxoWriter == null ? -1 : taxoWriter.getTaxonomyEpoch();

    // use the same instance of the cache, note the protective code in getOrdinal and getPath
    this.ordinalCache = ordinalCache == null ? new LRUHashMap<FacetLabel,Integer>(DEFAULT_CACHE_VALUE) : ordinalCache;
    this.categoryCache = categoryCache == null ? new LRUHashMap<Integer,FacetLabel>(DEFAULT_CACHE_VALUE) : categoryCache;

    this.taxoArrays = taxoArrays != null ? new TaxonomyIndexArrays(indexReader, taxoArrays) : null;
  }
  /**
   * Open for reading a taxonomy stored in a given {@link Directory}.
   * 
   * @param directory
   *          The {@link Directory} in which the taxonomy resides.
   * @throws CorruptIndexException
   *           if the Taxonomy is corrupt.
   * @throws IOException
   *           if another error occurred.
   */
  public DirectoryTaxonomyReader(Directory directory) throws IOException {
    indexReader = openIndexReader(directory);
    taxoWriter = null;
    taxoEpoch = -1;

    // These are the default cache sizes; they can be configured after
    // construction with the cache's setMaxSize() method
    ordinalCache = new LRUHashMap<>(DEFAULT_CACHE_VALUE);
    categoryCache = new LRUHashMap<>(DEFAULT_CACHE_VALUE);
  }
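  // Illustrative usage (a sketch, not part of the original source; "taxoDir"
  // and its path are assumptions): open a reader over an existing taxonomy
  // index, use it, and close it when done.
  //
  //   Directory taxoDir = FSDirectory.open(Paths.get("/path/to/taxo"));
  //   TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
  //   try {
  //     int size = taxoReader.getSize(); // number of categories in the taxonomy
  //   } finally {
  //     taxoReader.close();
  //   }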
  /**
   * Opens a {@link DirectoryTaxonomyReader} over the given
   * {@link DirectoryTaxonomyWriter} (for NRT).
   * 
   * @param taxoWriter
   *          The {@link DirectoryTaxonomyWriter} from which to obtain newly
   *          added categories, in real-time.
   */
  public DirectoryTaxonomyReader(DirectoryTaxonomyWriter taxoWriter) throws IOException {
    this.taxoWriter = taxoWriter;
    taxoEpoch = taxoWriter.getTaxonomyEpoch();
    indexReader = openIndexReader(taxoWriter.getInternalIndexWriter());

    // These are the default cache sizes; they can be configured after
    // construction with the cache's setMaxSize() method
    ordinalCache = new LRUHashMap<>(DEFAULT_CACHE_VALUE);
    categoryCache = new LRUHashMap<>(DEFAULT_CACHE_VALUE);
  }

  private synchronized void initTaxoArrays() throws IOException {
    if (taxoArrays == null) {
      // according to Java Concurrency in Practice, this might perform better on
      // some JVMs, because the array initialization doesn't happen on the
      // volatile member.
      TaxonomyIndexArrays tmpArrays = new TaxonomyIndexArrays(indexReader);
      taxoArrays = tmpArrays;
    }
  }

  @Override
  protected void doClose() throws IOException {
    indexReader.close();
    taxoArrays = null;
    // do not clear() the caches, as they may be used by other DTR instances.
    ordinalCache = null;
    categoryCache = null;
  }
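  // Illustrative NRT usage (a sketch, not part of the original source;
  // "taxoDir" is an assumption): a reader constructed over a
  // DirectoryTaxonomyWriter sees newly added categories without a commit.
  //
  //   DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
  //   taxoWriter.addCategory(new FacetLabel("Author", "Lisa"));
  //   DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
  //   // the category added above is already visible to this reader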
  /**
   * Implements the opening of a new {@link DirectoryTaxonomyReader} instance if
   * the taxonomy has changed.
   * 
   * <p>
   * <b>NOTE:</b> the returned {@link DirectoryTaxonomyReader} shares the
   * ordinal and category caches with this reader. This is not expected to cause
   * any issues, unless the two instances continue to live. The reader
   * guarantees that the two instances cannot affect each other in terms of
   * correctness of the caches; however, if the size of the cache is changed
   * through {@link #setCacheSize(int)}, it will affect both reader instances.
   */
  @Override
  protected DirectoryTaxonomyReader doOpenIfChanged() throws IOException {
    ensureOpen();

    // This works for both NRT and non-NRT readers (i.e. an NRT reader remains NRT).
    final DirectoryReader r2 = DirectoryReader.openIfChanged(indexReader);
    if (r2 == null) {
      return null; // no changes, nothing to do
    }

    // check if the taxonomy was recreated
    boolean success = false;
    try {
      boolean recreated = false;
      if (taxoWriter == null) {
        // not NRT, check epoch from commit data
        String t1 = indexReader.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH);
        String t2 = r2.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH);
        if (t1 == null) {
          if (t2 != null) {
            recreated = true;
          }
        } else if (!t1.equals(t2)) {
          // t1 != null and t2 must not be null b/c DirTaxoWriter always puts the commit data.
          // it's ok to use String.equals because we require the two epoch values to be the same.
          recreated = true;
        }
      } else {
        // NRT, compare current taxoWriter.epoch() vs the one that was given at construction
        if (taxoEpoch != taxoWriter.getTaxonomyEpoch()) {
          recreated = true;
        }
      }

      final DirectoryTaxonomyReader newtr;
      if (recreated) {
        // if recreated, do not reuse anything from this instance. the information
        // will be lazily computed by the new instance when needed.
        newtr = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null);
      } else {
        newtr = new DirectoryTaxonomyReader(r2, taxoWriter, ordinalCache, categoryCache, taxoArrays);
      }

      success = true;
      return newtr;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(r2);
      }
    }
  }
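  // Illustrative refresh loop (a sketch, not part of the original source):
  // callers typically go through the static TaxonomyReader.openIfChanged(...)
  // rather than invoking doOpenIfChanged() directly; it returns null when
  // nothing changed, otherwise a new reader that shares this reader's caches.
  //
  //   DirectoryTaxonomyReader newReader = TaxonomyReader.openIfChanged(taxoReader);
  //   if (newReader != null) {
  //     taxoReader.close(); // release the old reader once it is no longer used
  //     taxoReader = newReader;
  //   }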
  /** Open the {@link DirectoryReader} from this {@link Directory}. */
  protected DirectoryReader openIndexReader(Directory directory) throws IOException {
    return DirectoryReader.open(directory);
  }
  /** Open the {@link DirectoryReader} from this {@link IndexWriter}. */
  protected DirectoryReader openIndexReader(IndexWriter writer) throws IOException {
    return DirectoryReader.open(writer);
  }
  /**
   * Expert: returns the underlying {@link DirectoryReader} instance that is
   * used by this {@link TaxonomyReader}.
   */
  DirectoryReader getInternalIndexReader() {
    ensureOpen();
    return indexReader;
  }

  @Override
  public ParallelTaxonomyArrays getParallelTaxonomyArrays() throws IOException {
    ensureOpen();
    if (taxoArrays == null) {
      initTaxoArrays();
    }
    return taxoArrays;
  }

  @Override
  public Map<String, String> getCommitUserData() throws IOException {
    ensureOpen();
    return indexReader.getIndexCommit().getUserData();
  }

  @Override
  public int getOrdinal(FacetLabel cp) throws IOException {
    ensureOpen();
    if (cp.length == 0) {
      return ROOT_ORDINAL;
    }

    // First try to find the answer in the LRU cache:
    synchronized (ordinalCache) {
      Integer res = ordinalCache.get(cp);
      if (res != null) {
        if (res.intValue() < indexReader.maxDoc()) {
          // Since the cache is shared with DTR instances allocated from
          // doOpenIfChanged, we need to ensure that the ordinal is one that
          // this DTR instance recognizes.
          return res.intValue();
        } else {
          // if we get here, it means that the category was found in the cache,
          // but is not recognized by this TR instance. Therefore there's no
          // need to continue searching for the path on disk, because we won't
          // find it there either.
          return TaxonomyReader.INVALID_ORDINAL;
        }
      }
    }

    // If we're still here, we have a cache miss. We need to fetch the
    // value from disk, and then also put it in the cache:
    int ret = TaxonomyReader.INVALID_ORDINAL;
    PostingsEnum docs = MultiTerms.getTermPostingsEnum(indexReader, Consts.FULL,
        new BytesRef(FacetsConfig.pathToString(cp.components, cp.length)), 0);
    if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      ret = docs.docID();

      // we only store the fact that a category exists, not its nonexistence.
      // This is required because the caches are shared with new DTR instances
      // that are allocated from doOpenIfChanged. Therefore, if we only store
      // information about found categories, we cannot accidentally tell a new
      // generation of DTR that a category does not exist.
      synchronized (ordinalCache) {
        ordinalCache.put(cp, Integer.valueOf(ret));
      }
    }

    return ret;
  }

  @Override
  public FacetLabel getPath(int ordinal) throws IOException {
    ensureOpen();

    // Since the cache is shared with DTR instances allocated from
    // doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
    // instance recognizes. Therefore we do this check up front, before we hit
    // the cache.
    if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
      return null;
    }

    // TODO: can we use an int-based hash impl, such as IntToObjectMap,
    // wrapped as LRU?
    Integer catIDInteger = Integer.valueOf(ordinal);
    synchronized (categoryCache) {
      FacetLabel res = categoryCache.get(catIDInteger);
      if (res != null) {
        return res;
      }
    }

    Document doc = indexReader.document(ordinal);
    FacetLabel ret = new FacetLabel(FacetsConfig.stringToPath(doc.get(Consts.FULL)));
    synchronized (categoryCache) {
      categoryCache.put(catIDInteger, ret);
    }

    return ret;
  }

  @Override
  public int getSize() {
    ensureOpen();
    return indexReader.numDocs();
  }

  @Override
  public synchronized long ramBytesUsed() {
    ensureOpen();
    long ramBytesUsed = 0;
    for (LeafReaderContext ctx : indexReader.leaves()) {
      ramBytesUsed += ((SegmentReader) ctx.reader()).ramBytesUsed();
    }
    if (taxoArrays != null) {
      ramBytesUsed += taxoArrays.ramBytesUsed();
    }
    synchronized (categoryCache) {
      ramBytesUsed += BYTES_PER_CACHE_ENTRY * categoryCache.size();
    }
    synchronized (ordinalCache) {
      ramBytesUsed += BYTES_PER_CACHE_ENTRY * ordinalCache.size();
    }
    return ramBytesUsed;
  }

  @Override
  public synchronized Collection<Accountable> getChildResources() {
    final List<Accountable> resources = new ArrayList<>();
    long ramBytesUsed = 0;
    for (LeafReaderContext ctx : indexReader.leaves()) {
      ramBytesUsed += ((SegmentReader) ctx.reader()).ramBytesUsed();
    }
    resources.add(Accountables.namedAccountable("indexReader", ramBytesUsed));
    if (taxoArrays != null) {
      resources.add(Accountables.namedAccountable("taxoArrays", taxoArrays));
    }
    synchronized (categoryCache) {
      resources.add(Accountables.namedAccountable("categoryCache", BYTES_PER_CACHE_ENTRY * categoryCache.size()));
    }
    synchronized (ordinalCache) {
      resources.add(Accountables.namedAccountable("ordinalCache", BYTES_PER_CACHE_ENTRY * ordinalCache.size()));
    }
    return Collections.unmodifiableList(resources);
  }
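  // Illustrative round trip (a sketch, not part of the original source):
  // getOrdinal and getPath are inverses for categories that exist in the
  // taxonomy, and both consult their LRU caches before touching the index.
  //
  //   int ord = taxoReader.getOrdinal(new FacetLabel("Author", "Lisa"));
  //   if (ord != TaxonomyReader.INVALID_ORDINAL) {
  //     FacetLabel label = taxoReader.getPath(ord); // equals the original label
  //   }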
  /**
   * setCacheSize controls the maximum allowed size of each of the caches
   * used by {@link #getPath(int)} and {@link #getOrdinal(FacetLabel)}.
   * <P>
   * Currently, if the given size is smaller than the current size of
   * a cache, it will not shrink, but rather will remain limited to its
   * current size.
   * @param size the new maximum cache size, in number of entries.
   */
  public void setCacheSize(int size) {
    ensureOpen();
    synchronized (categoryCache) {
      categoryCache.setMaxSize(size);
    }
    synchronized (ordinalCache) {
      ordinalCache.setMaxSize(size);
    }
  }
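  // Illustrative tuning (a sketch, not part of the original source; the value
  // 10_000 is an arbitrary assumption): both caches default to
  // DEFAULT_CACHE_VALUE (4000) entries and can be enlarged after construction.
  //
  //   DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
  //   taxoReader.setCacheSize(10_000); // raises the cap on both LRU caches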
  /**
   * Returns ordinal -&gt; label mapping, up to the provided max ordinal or
   * number of ordinals, whichever is smaller.
   */
  public String toString(int max) {
    ensureOpen();
    StringBuilder sb = new StringBuilder();
    int upperl = Math.min(max, indexReader.maxDoc());
    for (int i = 0; i < upperl; i++) {
      try {
        FacetLabel category = this.getPath(i);
        if (category == null) {
          sb.append(i).append(": NULL!! \n");
          continue;
        }
        if (category.length == 0) {
          sb.append(i).append(": EMPTY STRING!! \n");
          continue;
        }
        sb.append(i).append(": ").append(category.toString()).append("\n");
      } catch (IOException e) {
        if (log.isLoggable(Level.FINEST)) {
          log.log(Level.FINEST, e.getMessage(), e);
        }
      }
    }
    return sb.toString();
  }

}