/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;


Creates a formatted snippet from the top passages.

The default implementation marks the query terms as bold, and places ellipses between unconnected passages.

/** * Creates a formatted snippet from the top passages. * <p> * The default implementation marks the query terms as bold, and places * ellipses between unconnected passages. */
public class DefaultPassageFormatter extends PassageFormatter {
text that will appear before highlighted terms
/** text that will appear before highlighted terms */
protected final String preTag;
text that will appear after highlighted terms
/** text that will appear after highlighted terms */
protected final String postTag;
text that will appear between two unconnected passages
/** text that will appear between two unconnected passages */
protected final String ellipsis;
true if we should escape for html
/** true if we should escape for html */
protected final boolean escape;
Creates a new DefaultPassageFormatter with the default tags.
/** * Creates a new DefaultPassageFormatter with the default tags. */
public DefaultPassageFormatter() { this("<b>", "</b>", "... ", false); }
Creates a new DefaultPassageFormatter with custom tags.
Params:
  • preTag – text which should appear before a highlighted term.
  • postTag – text which should appear after a highlighted term.
  • ellipsis – text which should be used to connect two unconnected passages.
  • escape – true if text should be html-escaped
/** * Creates a new DefaultPassageFormatter with custom tags. * * @param preTag text which should appear before a highlighted term. * @param postTag text which should appear after a highlighted term. * @param ellipsis text which should be used to connect two unconnected passages. * @param escape true if text should be html-escaped */
public DefaultPassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) { if (preTag == null || postTag == null || ellipsis == null) { throw new NullPointerException(); } this.preTag = preTag; this.postTag = postTag; this.ellipsis = ellipsis; this.escape = escape; } @Override public String format(Passage passages[], String content) { StringBuilder sb = new StringBuilder(); int pos = 0; for (Passage passage : passages) { // don't add ellipsis if its the first one, or if its connected. if (passage.getStartOffset() > pos && pos > 0) { sb.append(ellipsis); } pos = passage.getStartOffset(); for (int i = 0; i < passage.getNumMatches(); i++) { int start = passage.getMatchStarts()[i]; assert start >= pos && start < passage.getEndOffset(); //append content before this start append(sb, content, pos, start); int end = passage.getMatchEnds()[i]; assert end > start; // its possible to have overlapping terms. // Look ahead to expand 'end' past all overlapping: while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i+1] < end) { end = passage.getMatchEnds()[++i]; } end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage sb.append(preTag); append(sb, content, start, end); sb.append(postTag); pos = end; } // its possible a "term" from the analyzer could span a sentence boundary. append(sb, content, pos, Math.max(pos, passage.getEndOffset())); pos = passage.getEndOffset(); } return sb.toString(); }
Appends original text to the response.
Params:
  • dest – resulting text, possibly transformed or encoded
  • content – original text content
  • start – index of the first character in content
  • end – index of the character following the last character in content
/** * Appends original text to the response. * * @param dest resulting text, possibly transformed or encoded * @param content original text content * @param start index of the first character in content * @param end index of the character following the last character in content */
protected void append(StringBuilder dest, String content, int start, int end) { if (escape) { // note: these are the rules from owasp.org for (int i = start; i < end; i++) { char ch = content.charAt(i); switch (ch) { case '&': dest.append("&amp;"); break; case '<': dest.append("&lt;"); break; case '>': dest.append("&gt;"); break; case '"': dest.append("&quot;"); break; case '\'': dest.append("&#x27;"); break; case '/': dest.append("&#x2F;"); break; default: if (ch >= 0x30 && ch <= 0x39 || ch >= 0x41 && ch <= 0x5A || ch >= 0x61 && ch <= 0x7A) { dest.append(ch); } else if (ch < 0xff) { dest.append("&#"); dest.append((int) ch); dest.append(";"); } else { dest.append(ch); } } } } else { dest.append(content, start, end); } } }