// org.apache.lucene/lucene-highlighter/8.2.0 : org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
//
// DefaultPassageFormatter
// http://lucene.apache.org/lucene-parent/lucene-highlighter: This is the highlighter for Apache Lucene Java (The Apache Software Foundation)
// License: Apache 2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;


// Creates a formatted snippet from the top passages.
//
// The default implementation marks the query terms as bold, and places
// ellipses between unconnected passages.
/**
 * Creates a formatted snippet from the top passages.
 * <p>
 * The default implementation marks the query terms as bold, and places
 * ellipses between unconnected passages.
 */
public class DefaultPassageFormatter extends PassageFormatter {
  text that will appear before highlighted terms /** text that will appear before highlighted terms */
  protected final String preTag;
  text that will appear after highlighted terms /** text that will appear after highlighted terms */
  protected final String postTag;
  text that will appear between two unconnected passages /** text that will appear between two unconnected passages */
  protected final String ellipsis;
  true if we should escape for html /** true if we should escape for html */
  protected final boolean escape;

  Creates a new DefaultPassageFormatter with the default tags.
/**
   * Creates a new DefaultPassageFormatter with the default tags.
   */
  public DefaultPassageFormatter() {
    this("<b>", "</b>", "... ", false);
  }

  Creates a new DefaultPassageFormatter with custom tags.
Params: preTag –   text which should appear before a highlighted term.
postTag –  text which should appear after a highlighted term.
ellipsis – text which should be used to connect two unconnected passages.
escape –   true if text should be html-escaped/**
   * Creates a new DefaultPassageFormatter with custom tags.
   *
   * @param preTag   text which should appear before a highlighted term.
   * @param postTag  text which should appear after a highlighted term.
   * @param ellipsis text which should be used to connect two unconnected passages.
   * @param escape   true if text should be html-escaped
   */
  public DefaultPassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) {
    if (preTag == null || postTag == null || ellipsis == null) {
      throw new NullPointerException();
    }
    this.preTag = preTag;
    this.postTag = postTag;
    this.ellipsis = ellipsis;
    this.escape = escape;
  }

  @Override
  public String format(Passage passages[], String content) {
    StringBuilder sb = new StringBuilder();
    int pos = 0;
    for (Passage passage : passages) {
      // don't add ellipsis if its the first one, or if its connected.
      if (passage.getStartOffset() > pos && pos > 0) {
        sb.append(ellipsis);
      }
      pos = passage.getStartOffset();
      for (int i = 0; i < passage.getNumMatches(); i++) {
        int start = passage.getMatchStarts()[i];
        assert start >= pos && start < passage.getEndOffset();
        //append content before this start
        append(sb, content, pos, start);

        int end = passage.getMatchEnds()[i];
        assert end > start;
        // its possible to have overlapping terms.
        //   Look ahead to expand 'end' past all overlapping:
        while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i+1] < end) {
          end = passage.getMatchEnds()[++i];
        }
        end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage

        sb.append(preTag);
        append(sb, content, start, end);
        sb.append(postTag);

        pos = end;
      }
      // its possible a "term" from the analyzer could span a sentence boundary.
      append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
      pos = passage.getEndOffset();
    }
    return sb.toString();
  }

  Appends original text to the response.
Params: dest –    resulting text, possibly transformed or encoded
content – original text content
start –   index of the first character in content
end –     index of the character following the last character in content/**
   * Appends original text to the response.
   *
   * @param dest    resulting text, possibly transformed or encoded
   * @param content original text content
   * @param start   index of the first character in content
   * @param end     index of the character following the last character in content
   */
  protected void append(StringBuilder dest, String content, int start, int end) {
    if (escape) {
      // note: these are the rules from owasp.org
      for (int i = start; i < end; i++) {
        char ch = content.charAt(i);
        switch (ch) {
          case '&':
            dest.append("&amp;");
            break;
          case '<':
            dest.append("&lt;");
            break;
          case '>':
            dest.append("&gt;");
            break;
          case '"':
            dest.append("&quot;");
            break;
          case '\'':
            dest.append("&#x27;");
            break;
          case '/':
            dest.append("&#x2F;");
            break;
          default:
            if (ch >= 0x30 && ch <= 0x39 || ch >= 0x41 && ch <= 0x5A || ch >= 0x61 && ch <= 0x7A) {
              dest.append(ch);
            } else if (ch < 0xff) {
              dest.append("&#");
              dest.append((int) ch);
              dest.append(";");
            } else {
              dest.append(ch);
            }
        }
      }
    } else {
      dest.append(content, start, end);
    }
  }
}
// org.apache.lucene/lucene-highlighter/8.2.0/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java