/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.spell;

N-Gram version of edit distance based on paper by Grzegorz Kondrak, "N-gram similarity and distance". Proceedings of the Twelfth International Conference on String Processing and Information Retrieval (SPIRE 2005), pp. 115-126, Buenos Aires, Argentina, November 2005. http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf This implementation uses the position-based optimization to compute partial matches of n-gram sub-strings and adds a null-character prefix of size n-1 so that the first character is contained in the same number of n-grams as a middle character. Null-character prefix matches are discounted so that strings with no matching characters will return a distance of 0.
/** * N-Gram version of edit distance based on paper by Grzegorz Kondrak, * "N-gram similarity and distance". Proceedings of the Twelfth International * Conference on String Processing and Information Retrieval (SPIRE 2005), pp. 115-126, * Buenos Aires, Argentina, November 2005. * http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf * * This implementation uses the position-based optimization to compute partial * matches of n-gram sub-strings and adds a null-character prefix of size n-1 * so that the first character is contained in the same number of n-grams as * a middle character. Null-character prefix matches are discounted so that * strings with no matching characters will return a distance of 0. * */
public class NGramDistance implements StringDistance { private int n;
Creates an N-Gram distance measure using n-grams of the specified size.
Params:
  • size – The size of the n-gram to be used to compute the string distance.
/** * Creates an N-Gram distance measure using n-grams of the specified size. * @param size The size of the n-gram to be used to compute the string distance. */
public NGramDistance(int size) { this.n = size; }
Creates an N-Gram distance measure using n-grams of size 2.
/** * Creates an N-Gram distance measure using n-grams of size 2. */
public NGramDistance() { this(2); } @Override public float getDistance(String source, String target) { final int sl = source.length(); final int tl = target.length(); if (sl == 0 || tl == 0) { if (sl == tl) { return 1; } else { return 0; } } int cost = 0; if (sl < n || tl < n) { for (int i=0,ni=Math.min(sl,tl);i<ni;i++) { if (source.charAt(i) == target.charAt(i)) { cost++; } } return (float) cost/Math.max(sl, tl); } char[] sa = new char[sl+n-1]; float p[]; //'previous' cost array, horizontally float d[]; // cost array, horizontally float _d[]; //placeholder to assist in swapping p and d //construct sa with prefix for (int i=0;i<sa.length;i++) { if (i < n-1) { sa[i]=0; //add prefix } else { sa[i] = source.charAt(i-n+1); } } p = new float[sl+1]; d = new float[sl+1]; // indexes into strings s and t int i; // iterates through source int j; // iterates through target char[] t_j = new char[n]; // jth n-gram of t for (i = 0; i<=sl; i++) { p[i] = i; } for (j = 1; j<=tl; j++) { //construct t_j n-gram if (j < n) { for (int ti=0;ti<n-j;ti++) { t_j[ti]=0; //add prefix } for (int ti=n-j;ti<n;ti++) { t_j[ti]=target.charAt(ti-(n-j)); } } else { t_j = target.substring(j-n, j).toCharArray(); } d[0] = j; for (i=1; i<=sl; i++) { cost = 0; int tn=n; //compare sa to t_j for (int ni=0;ni<n;ni++) { if (sa[i-1+ni] != t_j[ni]) { cost++; } else if (sa[i-1+ni] == 0) { //discount matches on prefix tn--; } } float ec = (float) cost/tn; // minimum of cell to the left+1, to the top+1, diagonally left and up +cost d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+ec); } // copy current distance counts to 'previous row' distance counts _d = p; p = d; d = _d; } // our last action in the above loop was to switch d and p, so p now // actually has the most recent cost counts return 1.0f - (p[sl] / Math.max(tl, sl)); } @Override public int hashCode() { return 1427 * n * getClass().hashCode(); } @Override public boolean equals(Object obj) { if (this == obj) return true; if (null == obj || getClass() != obj.getClass()) return false; NGramDistance o = (NGramDistance)obj; return o.n == this.n; } @Override public String toString() { return "ngram(" + n + ")"; } }