/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.taxonomy.directory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.LRUHashMap;
import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.CorruptIndexException; // javadocs
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
/**
 * A {@link TaxonomyReader} which retrieves stored taxonomy information from a
 * {@link Directory}.
 * <P>
 * Reading from the on-disk index on every method call is too slow, so this
 * implementation employs caching: Some methods cache recent requests and their
 * results, while other methods prefetch all the data into memory and then
 * provide answers directly from in-memory tables. See the documentation of
 * individual methods for comments on their performance.
 *
 * @lucene.experimental
 */
public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountable {
private static final Logger log = Logger.getLogger(DirectoryTaxonomyReader.class.getName());
private static final int DEFAULT_CACHE_VALUE = 4000;
// NOTE: very coarse estimate!
private static final int BYTES_PER_CACHE_ENTRY = 4 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + 4 * RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 8 * Character.BYTES;
private final DirectoryTaxonomyWriter taxoWriter;
private final long taxoEpoch; // used in doOpenIfChanged
private final DirectoryReader indexReader;
// TODO: test DoubleBarrelLRUCache and consider using it instead
private LRUHashMap<FacetLabel, Integer> ordinalCache;
private LRUHashMap<Integer, FacetLabel> categoryCache;
private volatile TaxonomyIndexArrays taxoArrays;
Called only from doOpenIfChanged()
. If the taxonomy has been recreated, you should pass null
as the caches and parent/children arrays. /**
* Called only from {@link #doOpenIfChanged()}. If the taxonomy has been
* recreated, you should pass {@code null} as the caches and parent/children
* arrays.
*/
DirectoryTaxonomyReader(DirectoryReader indexReader, DirectoryTaxonomyWriter taxoWriter,
LRUHashMap<FacetLabel,Integer> ordinalCache, LRUHashMap<Integer,FacetLabel> categoryCache,
TaxonomyIndexArrays taxoArrays) throws IOException {
this.indexReader = indexReader;
this.taxoWriter = taxoWriter;
this.taxoEpoch = taxoWriter == null ? -1 : taxoWriter.getTaxonomyEpoch();
// use the same instance of the cache, note the protective code in getOrdinal and getPath
this.ordinalCache = ordinalCache == null ? new LRUHashMap<FacetLabel,Integer>(DEFAULT_CACHE_VALUE) : ordinalCache;
this.categoryCache = categoryCache == null ? new LRUHashMap<Integer,FacetLabel>(DEFAULT_CACHE_VALUE) : categoryCache;
this.taxoArrays = taxoArrays != null ? new TaxonomyIndexArrays(indexReader, taxoArrays) : null;
}
Open for reading a taxonomy stored in a given Directory
. Params: - directory – The
Directory
in which the taxonomy resides.
Throws: - CorruptIndexException –
if the Taxonomy is corrupt.
- IOException –
if another error occurred.
/**
* Open for reading a taxonomy stored in a given {@link Directory}.
*
* @param directory
* The {@link Directory} in which the taxonomy resides.
* @throws CorruptIndexException
* if the Taxonomy is corrupt.
* @throws IOException
* if another error occurred.
*/
public DirectoryTaxonomyReader(Directory directory) throws IOException {
indexReader = openIndexReader(directory);
taxoWriter = null;
taxoEpoch = -1;
// These are the default cache sizes; they can be configured after
// construction with the cache's setMaxSize() method
ordinalCache = new LRUHashMap<>(DEFAULT_CACHE_VALUE);
categoryCache = new LRUHashMap<>(DEFAULT_CACHE_VALUE);
}
Opens a DirectoryTaxonomyReader
over the given DirectoryTaxonomyWriter
(for NRT). Params: - taxoWriter – The
DirectoryTaxonomyWriter
from which to obtain newly added categories, in real-time.
/**
* Opens a {@link DirectoryTaxonomyReader} over the given
* {@link DirectoryTaxonomyWriter} (for NRT).
*
* @param taxoWriter
* The {@link DirectoryTaxonomyWriter} from which to obtain newly
* added categories, in real-time.
*/
public DirectoryTaxonomyReader(DirectoryTaxonomyWriter taxoWriter) throws IOException {
this.taxoWriter = taxoWriter;
taxoEpoch = taxoWriter.getTaxonomyEpoch();
indexReader = openIndexReader(taxoWriter.getInternalIndexWriter());
// These are the default cache sizes; they can be configured after
// construction with the cache's setMaxSize() method
ordinalCache = new LRUHashMap<>(DEFAULT_CACHE_VALUE);
categoryCache = new LRUHashMap<>(DEFAULT_CACHE_VALUE);
}
private synchronized void initTaxoArrays() throws IOException {
if (taxoArrays == null) {
// according to Java Concurrency in Practice, this might perform better on
// some JVMs, because the array initialization doesn't happen on the
// volatile member.
TaxonomyIndexArrays tmpArrays = new TaxonomyIndexArrays(indexReader);
taxoArrays = tmpArrays;
}
}
@Override
protected void doClose() throws IOException {
indexReader.close();
taxoArrays = null;
// do not clear() the caches, as they may be used by other DTR instances.
ordinalCache = null;
categoryCache = null;
}
Implements the opening of a new DirectoryTaxonomyReader
instance if the taxonomy has changed.
NOTE: the returned DirectoryTaxonomyReader
shares the ordinal and category caches with this reader. This is not expected to cause any issues, unless the two instances continue to live. The reader guarantees that the two instances cannot affect each other in terms of correctness of the caches, however if the size of the cache is changed through setCacheSize(int)
, it will affect both reader instances.
/**
* Implements the opening of a new {@link DirectoryTaxonomyReader} instance if
* the taxonomy has changed.
*
* <p>
* <b>NOTE:</b> the returned {@link DirectoryTaxonomyReader} shares the
* ordinal and category caches with this reader. This is not expected to cause
* any issues, unless the two instances continue to live. The reader
* guarantees that the two instances cannot affect each other in terms of
* correctness of the caches, however if the size of the cache is changed
* through {@link #setCacheSize(int)}, it will affect both reader instances.
*/
@Override
protected DirectoryTaxonomyReader doOpenIfChanged() throws IOException {
ensureOpen();
// This works for both NRT and non-NRT readers (i.e. an NRT reader remains NRT).
final DirectoryReader r2 = DirectoryReader.openIfChanged(indexReader);
if (r2 == null) {
return null; // no changes, nothing to do
}
// check if the taxonomy was recreated
boolean success = false;
try {
boolean recreated = false;
if (taxoWriter == null) {
// not NRT, check epoch from commit data
String t1 = indexReader.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH);
String t2 = r2.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH);
if (t1 == null) {
if (t2 != null) {
recreated = true;
}
} else if (!t1.equals(t2)) {
// t1 != null and t2 must not be null b/c DirTaxoWriter always puts the commit data.
// it's ok to use String.equals because we require the two epoch values to be the same.
recreated = true;
}
} else {
// NRT, compare current taxoWriter.epoch() vs the one that was given at construction
if (taxoEpoch != taxoWriter.getTaxonomyEpoch()) {
recreated = true;
}
}
final DirectoryTaxonomyReader newtr;
if (recreated) {
// if recreated, do not reuse anything from this instace. the information
// will be lazily computed by the new instance when needed.
newtr = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null);
} else {
newtr = new DirectoryTaxonomyReader(r2, taxoWriter, ordinalCache, categoryCache, taxoArrays);
}
success = true;
return newtr;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(r2);
}
}
}
Open the DirectoryReader
from this Directory
. /** Open the {@link DirectoryReader} from this {@link
* Directory}. */
protected DirectoryReader openIndexReader(Directory directory) throws IOException {
return DirectoryReader.open(directory);
}
Open the DirectoryReader
from this IndexWriter
. /** Open the {@link DirectoryReader} from this {@link
* IndexWriter}. */
protected DirectoryReader openIndexReader(IndexWriter writer) throws IOException {
return DirectoryReader.open(writer);
}
Expert: returns the underlying DirectoryReader
instance that is used by this TaxonomyReader
. /**
* Expert: returns the underlying {@link DirectoryReader} instance that is
* used by this {@link TaxonomyReader}.
*/
DirectoryReader getInternalIndexReader() {
ensureOpen();
return indexReader;
}
@Override
public ParallelTaxonomyArrays getParallelTaxonomyArrays() throws IOException {
ensureOpen();
if (taxoArrays == null) {
initTaxoArrays();
}
return taxoArrays;
}
@Override
public Map<String, String> getCommitUserData() throws IOException {
ensureOpen();
return indexReader.getIndexCommit().getUserData();
}
@Override
public int getOrdinal(FacetLabel cp) throws IOException {
ensureOpen();
if (cp.length == 0) {
return ROOT_ORDINAL;
}
// First try to find the answer in the LRU cache:
synchronized (ordinalCache) {
Integer res = ordinalCache.get(cp);
if (res != null) {
if (res.intValue() < indexReader.maxDoc()) {
// Since the cache is shared with DTR instances allocated from
// doOpenIfChanged, we need to ensure that the ordinal is one that
// this DTR instance recognizes.
return res.intValue();
} else {
// if we get here, it means that the category was found in the cache,
// but is not recognized by this TR instance. Therefore there's no
// need to continue search for the path on disk, because we won't find
// it there too.
return TaxonomyReader.INVALID_ORDINAL;
}
}
}
// If we're still here, we have a cache miss. We need to fetch the
// value from disk, and then also put it in the cache:
int ret = TaxonomyReader.INVALID_ORDINAL;
PostingsEnum docs = MultiTerms.getTermPostingsEnum(indexReader, Consts.FULL, new BytesRef(FacetsConfig.pathToString(cp.components, cp.length)), 0);
if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
ret = docs.docID();
// we only store the fact that a category exists, not its inexistence.
// This is required because the caches are shared with new DTR instances
// that are allocated from doOpenIfChanged. Therefore, if we only store
// information about found categories, we cannot accidently tell a new
// generation of DTR that a category does not exist.
synchronized (ordinalCache) {
ordinalCache.put(cp, Integer.valueOf(ret));
}
}
return ret;
}
@Override
public FacetLabel getPath(int ordinal) throws IOException {
ensureOpen();
// Since the cache is shared with DTR instances allocated from
// doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
// instance recognizes. Therefore we do this check up front, before we hit
// the cache.
if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
return null;
}
// TODO: can we use an int-based hash impl, such as IntToObjectMap,
// wrapped as LRU?
Integer catIDInteger = Integer.valueOf(ordinal);
synchronized (categoryCache) {
FacetLabel res = categoryCache.get(catIDInteger);
if (res != null) {
return res;
}
}
Document doc = indexReader.document(ordinal);
FacetLabel ret = new FacetLabel(FacetsConfig.stringToPath(doc.get(Consts.FULL)));
synchronized (categoryCache) {
categoryCache.put(catIDInteger, ret);
}
return ret;
}
@Override
public int getSize() {
ensureOpen();
return indexReader.numDocs();
}
@Override
public synchronized long ramBytesUsed() {
ensureOpen();
long ramBytesUsed = 0;
for (LeafReaderContext ctx : indexReader.leaves()) {
ramBytesUsed += ((SegmentReader) ctx.reader()).ramBytesUsed();
}
if (taxoArrays != null) {
ramBytesUsed += taxoArrays.ramBytesUsed();
}
synchronized (categoryCache) {
ramBytesUsed += BYTES_PER_CACHE_ENTRY * categoryCache.size();
}
synchronized (ordinalCache) {
ramBytesUsed += BYTES_PER_CACHE_ENTRY * ordinalCache.size();
}
return ramBytesUsed;
}
@Override
public synchronized Collection<Accountable> getChildResources() {
final List<Accountable> resources = new ArrayList<>();
long ramBytesUsed = 0;
for (LeafReaderContext ctx : indexReader.leaves()) {
ramBytesUsed += ((SegmentReader) ctx.reader()).ramBytesUsed();
}
resources.add(Accountables.namedAccountable("indexReader", ramBytesUsed));
if (taxoArrays != null) {
resources.add(Accountables.namedAccountable("taxoArrays", taxoArrays));
}
synchronized (categoryCache) {
resources.add(Accountables.namedAccountable("categoryCache", BYTES_PER_CACHE_ENTRY * categoryCache.size()));
}
synchronized (ordinalCache) {
resources.add(Accountables.namedAccountable("ordinalCache", BYTES_PER_CACHE_ENTRY * ordinalCache.size()));
}
return Collections.unmodifiableList(resources);
}
setCacheSize controls the maximum allowed size of each of the caches used by getPath(int)
and getOrdinal(FacetLabel)
.
Currently, if the given size is smaller than the current size of
a cache, it will not shrink, and rather we be limited to its current
size.
Params: - size – the new maximum cache size, in number of entries.
/**
* setCacheSize controls the maximum allowed size of each of the caches
* used by {@link #getPath(int)} and {@link #getOrdinal(FacetLabel)}.
* <P>
* Currently, if the given size is smaller than the current size of
* a cache, it will not shrink, and rather we be limited to its current
* size.
* @param size the new maximum cache size, in number of entries.
*/
public void setCacheSize(int size) {
ensureOpen();
synchronized (categoryCache) {
categoryCache.setMaxSize(size);
}
synchronized (ordinalCache) {
ordinalCache.setMaxSize(size);
}
}
Returns ordinal -> label mapping, up to the provided
max ordinal or number of ordinals, whichever is
smaller. /** Returns ordinal -> label mapping, up to the provided
* max ordinal or number of ordinals, whichever is
* smaller. */
public String toString(int max) {
ensureOpen();
StringBuilder sb = new StringBuilder();
int upperl = Math.min(max, indexReader.maxDoc());
for (int i = 0; i < upperl; i++) {
try {
FacetLabel category = this.getPath(i);
if (category == null) {
sb.append(i).append(": NULL!! \n");
continue;
}
if (category.length == 0) {
sb.append(i).append(": EMPTY STRING!! \n");
continue;
}
sb.append(i).append(": ").append(category.toString()).append("\n");
} catch (IOException e) {
if (log.isLoggable(Level.FINEST)) {
log.log(Level.FINEST, e.getMessage(), e);
}
}
}
return sb.toString();
}
}