/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.util.graph;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;

Consumes a TokenStream and creates an Automaton where the transition labels are terms from the TermToBytesRefAttribute. This class also provides helpers to explore the different paths of the Automaton.
/** * Consumes a TokenStream and creates an {@link Automaton} where the transition labels are terms from * the {@link TermToBytesRefAttribute}. * This class also provides helpers to explore the different paths of the {@link Automaton}. */
public final class GraphTokenStreamFiniteStrings { private final Map<Integer, BytesRef> idToTerm = new HashMap<>(); private final Map<Integer, Integer> idToInc = new HashMap<>(); private final Automaton det; private final Transition transition = new Transition(); private class FiniteStringsTokenStream extends TokenStream { private final BytesTermAttribute termAtt = addAttribute(BytesTermAttribute.class); private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); private final IntsRef ids; private final int end; private int offset; FiniteStringsTokenStream(final IntsRef ids) { assert ids != null; this.ids = ids; this.offset = ids.offset; this.end = ids.offset + ids.length; } @Override public boolean incrementToken() throws IOException { if (offset < end) { clearAttributes(); int id = ids.ints[offset]; termAtt.setBytesRef(idToTerm.get(id)); int incr = 1; if (idToInc.containsKey(id)) { incr = idToInc.get(id); } posIncAtt.setPositionIncrement(incr); offset++; return true; } return false; } } public GraphTokenStreamFiniteStrings(TokenStream in) throws IOException { Automaton aut = build(in); this.det = Operations.removeDeadStates(Operations.determinize(aut, DEFAULT_MAX_DETERMINIZED_STATES)); }
Returns whether the provided state is the start of multiple side paths of different length (eg: new york, ny)
/** * Returns whether the provided state is the start of multiple side paths of different length (eg: new york, ny) */
public boolean hasSidePath(int state) { int numT = det.initTransition(state, transition); if (numT <= 1) { return false; } det.getNextTransition(transition); int dest = transition.dest; for (int i = 1; i < numT; i++) { det.getNextTransition(transition); if (dest != transition.dest) { return true; } } return false; }
Returns the list of terms that start at the provided state
/** * Returns the list of terms that start at the provided state */
public Term[] getTerms(String field, int state) { int numT = det.initTransition(state, transition); List<Term> terms = new ArrayList<> (); for (int i = 0; i < numT; i++) { det.getNextTransition(transition); for (int id = transition.min; id <= transition.max; id++) { Term term = new Term(field, idToTerm.get(id)); terms.add(term); } } return terms.toArray(new Term[terms.size()]); }
Get all finite strings from the automaton.
/** * Get all finite strings from the automaton. */
public Iterator<TokenStream> getFiniteStrings() throws IOException { return getFiniteStrings(0, -1); }
Get all finite strings that start at startState and end at endState.
/** * Get all finite strings that start at {@code startState} and end at {@code endState}. */
public Iterator<TokenStream> getFiniteStrings(int startState, int endState) throws IOException { final FiniteStringsIterator it = new FiniteStringsIterator(det, startState, endState); return new Iterator<TokenStream> () { IntsRef current; boolean finished = false; @Override public boolean hasNext() { if (finished == false && current == null) { current = it.next(); if (current == null) { finished = true; } } return current != null; } @Override public TokenStream next() { if (current == null) { hasNext(); } TokenStream next = new FiniteStringsTokenStream(current); current = null; return next; } }; }
Returns the articulation points (or cut vertices) of the graph: https://en.wikipedia.org/wiki/Biconnected_component
/** * Returns the articulation points (or cut vertices) of the graph: * https://en.wikipedia.org/wiki/Biconnected_component */
public int[] articulationPoints() { if (det.getNumStates() == 0) { return new int[0]; } // Automaton.Builder undirect = new Automaton.Builder(); undirect.copy(det); for (int i = 0; i < det.getNumStates(); i++) { int numT = det.initTransition(i, transition); for (int j = 0; j < numT; j++) { det.getNextTransition(transition); undirect.addTransition(transition.dest, i, transition.min); } } int numStates = det.getNumStates(); BitSet visited = new BitSet(numStates); int[] depth = new int[det.getNumStates()]; int[] low = new int[det.getNumStates()]; int[] parent = new int[det.getNumStates()]; Arrays.fill(parent, -1); List<Integer> points = new ArrayList<>(); articulationPointsRecurse(undirect.finish(), 0, 0, depth, low, parent, visited, points); Collections.reverse(points); return points.stream().mapToInt(p -> p).toArray(); }
Build an automaton from the provided TokenStream.
/** * Build an automaton from the provided {@link TokenStream}. */
private Automaton build(final TokenStream in) throws IOException { Automaton.Builder builder = new Automaton.Builder(); final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class); final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class); in.reset(); int pos = -1; int prevIncr = 1; int state = -1; int gap = 0; while (in.incrementToken()) { int currentIncr = posIncAtt.getPositionIncrement(); if (pos == -1 && currentIncr < 1) { throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1"); } if (currentIncr == 0) { if (gap > 0) { pos -= gap; } } else { pos++; gap = currentIncr - 1; } int endPos = pos + posLengthAtt.getPositionLength() + gap; while (state < endPos) { state = builder.createState(); } BytesRef term = termBytesAtt.getBytesRef(); int id = getTermID(currentIncr, prevIncr, term); //System.out.println("Adding transition: " + term.utf8ToString() + "@" + pos + "->" + endPos); builder.addTransition(pos, endPos, id); pos += gap; // only save last increment on non-zero increment in case we have multiple stacked tokens if (currentIncr > 0) { prevIncr = currentIncr; } } in.end(); if (state != -1) { builder.setAccept(state, true); } return builder.finish(); }
Gets an integer id for a given term and saves the position increment if needed.
/** * Gets an integer id for a given term and saves the position increment if needed. */
private int getTermID(int incr, int prevIncr, BytesRef term) { assert term != null; boolean isStackedGap = incr == 0 && prevIncr > 1; int id = idToTerm.size(); idToTerm.put(id, BytesRef.deepCopyOf(term)); // stacked token should have the same increment as original token at this position if (isStackedGap) { idToInc.put(id, prevIncr); } else if (incr > 1) { idToInc.put(id, incr); } return id; } private static void articulationPointsRecurse(Automaton a, int state, int d, int[] depth, int[] low, int[] parent, BitSet visited, List<Integer> points) { visited.set(state); depth[state] = d; low[state] = d; int childCount = 0; boolean isArticulation = false; Transition t = new Transition(); int numT = a.initTransition(state, t); for (int i = 0; i < numT; i++) { a.getNextTransition(t); if (visited.get(t.dest) == false) { parent[t.dest] = state; articulationPointsRecurse(a, t.dest, d + 1, depth, low, parent, visited, points); childCount++; if (low[t.dest] >= depth[state]) { isArticulation = true; } low[state] = Math.min(low[state], low[t.dest]); } else if (t.dest != parent[state]) { low[state] = Math.min(low[state], depth[t.dest]); } } if ((parent[state] != -1 && isArticulation) || (parent[state] == -1 && childCount > 1)) { points.add(state); } } }