org.apache.lucene/lucene-core/8.2.0 : org/apache/lucene/analysis/WordlistLoader.java

WordlistLoader
http://lucene.apache.org/lucene-parent/lucene-core: Apache Lucene Java Core (The Apache Software Foundation)
Apache 2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.util.IOUtils;

Loader for text files that represent a list of stopwords.
See Also: IOUtils to obtain Reader instances
@lucene.internal /**
 * Loader for text files that represent a list of stopwords.
 * 
 * @see IOUtils to obtain {@link Reader} instances
 * @lucene.internal
 */
public class WordlistLoader {
  
  // Passed as the first constructor argument of every CharArraySet that this class
  // creates itself (the overloads that do not take a caller-supplied result set).
  private static final int INITIAL_CAPACITY = 16;
  
  no instance /** no instance */
  private WordlistLoader() {
    // Every method of this class is static, so instantiation is blocked.
  }
  
  Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
leading and trailing whitespace). Every line of the Reader should contain only
one word. The words need to be in lowercase if you make use of an
Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
Params: reader – Reader containing the wordlist
result – the CharArraySet to fill with the reader's words
Returns: the given CharArraySet with the reader's words/**
   * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
   * leading and trailing whitespace). Every line of the Reader should contain only
   * one word. The words need to be in lowercase if you make use of an
   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   *
   * @param reader Reader containing the wordlist
   * @param result the {@link CharArraySet} to fill with the reader's words
   * @return the given {@link CharArraySet} with the reader's words
   */
  public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
    // try-with-resources closes the reader on every path. Unlike closing in a
    // finally block, a failure while reading stays the primary exception even if
    // close() fails as well (the close failure is attached as suppressed).
    try (BufferedReader br = getBufferedReader(reader)) {
      String line;
      while ((line = br.readLine()) != null) {
        final String word = line.trim();
        // A blank or whitespace-only line carries no word; adding it would put the
        // empty string into the set, so it is skipped (as getLines does).
        if (word.isEmpty() == false) {
          result.add(word);
        }
      }
    }
    return result;
  }
  
  Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
leading and trailing whitespace). Every line of the Reader should contain only
one word. The words need to be in lowercase if you make use of an
Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
Params: reader – Reader containing the wordlist
Returns: A CharArraySet with the reader's words/**
   * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
   * leading and trailing whitespace). Every line of the Reader should contain only
   * one word. The words need to be in lowercase if you make use of an
   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   *
   * @param reader Reader containing the wordlist
   * @return A {@link CharArraySet} with the reader's words
   */
  public static CharArraySet getWordSet(Reader reader) throws IOException {
    // Create the target set here and let the two-argument overload fill it.
    final CharArraySet words = new CharArraySet(INITIAL_CAPACITY, false);
    return getWordSet(reader, words);
  }

  Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
leading and trailing whitespace). Every line of the Reader should contain only
one word. The words need to be in lowercase if you make use of an
Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
Params: reader – Reader containing the wordlist
comment – The string representing a comment.
Returns: A CharArraySet with the reader's words/**
   * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
   * leading and trailing whitespace). Every line of the Reader should contain only
   * one word. The words need to be in lowercase if you make use of an
   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   *
   * @param reader Reader containing the wordlist
   * @param comment The string representing a comment.
   * @return A CharArraySet with the reader's words
   */
  public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
    // Create the target set here and let the three-argument overload fill it.
    final CharArraySet words = new CharArraySet(INITIAL_CAPACITY, false);
    return getWordSet(reader, comment, words);
  }

  Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
leading and trailing whitespace). Every line of the Reader should contain only
one word. The words need to be in lowercase if you make use of an
Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
Params: reader – Reader containing the wordlist
comment – The string representing a comment.
result – the CharArraySet to fill with the reader's words
Returns: the given CharArraySet with the reader's words/**
   * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
   * leading and trailing whitespace). Every line of the Reader should contain only
   * one word. The words need to be in lowercase if you make use of an
   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   *
   * @param reader Reader containing the wordlist
   * @param comment The string representing a comment.
   * @param result the {@link CharArraySet} to fill with the reader's words
   * @return the given {@link CharArraySet} with the reader's words
   */
  public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException {
    // try-with-resources closes the reader on every path. Unlike closing in a
    // finally block, a failure while reading stays the primary exception even if
    // close() fails as well (the close failure is attached as suppressed).
    try (BufferedReader br = getBufferedReader(reader)) {
      String line;
      while ((line = br.readLine()) != null) {
        // The comment marker is matched against the raw line, i.e. before any
        // leading whitespace is stripped.
        if (line.startsWith(comment)) {
          continue;
        }
        final String word = line.trim();
        // A blank or whitespace-only line carries no word; adding it would put the
        // empty string into the set, so it is skipped (as getLines does).
        if (word.isEmpty() == false) {
          result.add(word);
        }
      }
    }
    return result;
  }

  
  Reads stopwords from a stopword list in Snowball format.

The snowball format is the following:

Lines may contain multiple words separated by whitespace.
The comment character is the vertical line (|).
Lines may contain trailing comments.

Params: reader – Reader containing a Snowball stopword list
result – the CharArraySet to fill with the reader's words
Returns: the given CharArraySet with the reader's words/**
   * Reads stopwords from a stopword list in Snowball format.
   * <p>
   * The snowball format is the following:
   * <ul>
   * <li>Lines may contain multiple words separated by whitespace.
   * <li>The comment character is the vertical line (&#124;).
   * <li>Lines may contain trailing comments.
   * </ul>
   * 
   * @param reader Reader containing a Snowball stopword list
   * @param result the {@link CharArraySet} to fill with the reader's words
   * @return the given {@link CharArraySet} with the reader's words
   */
  public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
      throws IOException {
    // try-with-resources closes the reader on every path. Unlike closing in a
    // finally block, a failure while reading stays the primary exception even if
    // close() fails as well (the close failure is attached as suppressed).
    try (BufferedReader br = getBufferedReader(reader)) {
      String line;
      while ((line = br.readLine()) != null) {
        // Everything from the first '|' to the end of the line is a comment.
        final int comment = line.indexOf('|');
        if (comment >= 0) {
          line = line.substring(0, comment);
        }
        // split() yields an empty first token when the line starts with whitespace
        // (and a single empty token for an empty line), so empty tokens are dropped.
        for (String word : line.split("\\s+")) {
          if (word.isEmpty() == false) {
            result.add(word);
          }
        }
      }
    }
    return result;
  }
  
  Reads stopwords from a stopword list in Snowball format.

The snowball format is the following:

Lines may contain multiple words separated by whitespace.
The comment character is the vertical line (|).
Lines may contain trailing comments.

Params: reader – Reader containing a Snowball stopword list
Returns: A CharArraySet with the reader's words/**
   * Reads stopwords from a stopword list in Snowball format.
   * <p>
   * The snowball format is the following:
   * <ul>
   * <li>Lines may contain multiple words separated by whitespace.
   * <li>The comment character is the vertical line (&#124;).
   * <li>Lines may contain trailing comments.
   * </ul>
   * 
   * @param reader Reader containing a Snowball stopword list
   * @return A {@link CharArraySet} with the reader's words
   */
  public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
    // Create the target set here and let the two-argument overload fill it.
    final CharArraySet words = new CharArraySet(INITIAL_CAPACITY, false);
    return getSnowballWordSet(reader, words);
  }


  Reads a stem dictionary. Each line contains:
word\tstem
(i.e. two tab separated words)
Throws: IOException – If there is a low-level I/O error.
Returns: stem dictionary that overrules the stemming algorithm/**
   * Reads a stem dictionary. Each line contains:
   * <pre>word<b>\t</b>stem</pre>
   * (i.e. two tab separated words)
   *
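   * @param reader Reader containing the stem dictionary, one entry per line
   * @param result the {@link CharArrayMap} to fill with the word/stem pairs
   * @return stem dictionary that overrules the stemming algorithm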
   * @throws IOException If there is a low-level I/O error.
   */
  public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result) throws IOException {
    // try-with-resources closes the reader on every path. Unlike closing in a
    // finally block, a failure while reading stays the primary exception even if
    // close() fails as well (the close failure is attached as suppressed).
    try (BufferedReader br = getBufferedReader(reader)) {
      String line;
      int lineNumber = 0;
      while ((line = br.readLine()) != null) {
        lineNumber++;
        // Blank lines (for example a trailing one) hold no entry.
        if (line.trim().isEmpty()) {
          continue;
        }
        // Limit of 2: only the first tab separates word from stem, so the stem
        // keeps any further tabs.
        final String[] wordstem = line.split("\t", 2);
        if (wordstem.length < 2) {
          // Without this check the access to wordstem[1] below fails with a bare
          // ArrayIndexOutOfBoundsException that does not identify the bad line.
          throw new IllegalArgumentException(
              "Malformed stem dictionary entry at line " + lineNumber
                  + " (expected word<TAB>stem): " + line);
        }
        result.put(wordstem[0], wordstem[1]);
      }
    }
    return result;
  }
  
  Accesses a resource by name and returns the (non comment) lines containing
data using the given character encoding.

A comment line is any line that starts with the character "#"

Throws: IOException – If there is a low-level I/O error.
Returns: a list of non-blank non-comment lines with whitespace trimmed/**
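   * Reads the given stream and returns the (non comment) lines containing
   * data, decoded using the given character encoding. A byte order mark at
   * the start of the data is skipped.
   *
   * <p>
   * A comment line is any line that starts with the character "#"
   * </p>
   *
   * @param stream the stream to read the lines from
   * @param charset the character encoding used to decode the stream
   * @return a list of non-blank non-comment lines with whitespace trimmed
   * @throws IOException If there is a low-level I/O error.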
   */
  public static List<String> getLines(InputStream stream, Charset charset) throws IOException{
    final List<String> collected = new ArrayList<>();
    BufferedReader in = null;
    boolean completed = false;
    try {
      in = getBufferedReader(IOUtils.getDecodingReader(stream, charset));

      String current;
      while ((current = in.readLine()) != null) {
        // Drop a leading byte order mark; only looked for while nothing has been
        // collected yet.
        if (collected.isEmpty() && current.startsWith("\uFEFF")) {
          current = current.substring(1);
        }
        // Comment lines are recognised on the untrimmed text.
        if (current.startsWith("#")) {
          continue;
        }
        final String trimmed = current.trim();
        // Lines that are blank after trimming are not returned.
        if (trimmed.length() != 0) {
          collected.add(trimmed);
        }
      }
      completed = true;
      return collected;
    } finally {
      // After a clean read a close() failure is reported to the caller; after a
      // failed read the reader is closed via closeWhileHandlingException instead.
      if (completed == false) {
        IOUtils.closeWhileHandlingException(in);
      } else {
        IOUtils.close(in);
      }
    }
  }
  
  private static BufferedReader getBufferedReader(Reader reader) {
    // A reader that is already buffered is returned as-is; anything else is wrapped.
    if (reader instanceof BufferedReader) {
      return (BufferedReader) reader;
    }
    return new BufferedReader(reader);
  }
  
}
Params:	reader – Reader containing the wordlist; result – the `CharArraySet` to fill with the reader's words
Returns:	the given `CharArraySet` with the reader's words
Params:	reader – Reader containing the wordlist
Returns:	A `CharArraySet` with the reader's words
Params:	reader – Reader containing a Snowball stopword list; result – the `CharArraySet` to fill with the reader's words
Returns:	the given `CharArraySet` with the reader's words
Throws:	IOException – If there is a low-level I/O error.
Returns:	stem dictionary that overrules the stemming algorithm
/

org.apache.lucene/ lucene-core/ 8.2.0/ org/apache/lucene/analysis/WordlistLoader.java