/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ko;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;

Factory for KoreanTokenizer.
<fieldType name="text_ko" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.KoreanTokenizerFactory"
               decompoundMode="discard"
               userDictionary="user.txt"
               userDictionaryEncoding="UTF-8"
               outputUnknownUnigrams="false"
               discardPunctuation="true"
    />
  </analyzer>
</fieldType>

Supports the following attributes:

  • userDictionary: User dictionary path.
  • userDictionaryEncoding: User dictionary encoding.
  • decompoundMode: Decompound mode. One of 'none', 'discard', or 'mixed'. Default is 'discard'. See DecompoundMode.
  • outputUnknownUnigrams: If true, outputs unigrams for unknown words.
  • discardPunctuation: true if punctuation tokens should be dropped from the output.
@lucene.experimental
Since: 7.4.0
/** * Factory for {@link KoreanTokenizer}. * <pre class="prettyprint"> * &lt;fieldType name="text_ko" class="solr.TextField"&gt; * &lt;analyzer&gt; * &lt;tokenizer class="solr.KoreanTokenizerFactory" * decompoundMode="discard" * userDictionary="user.txt" * userDictionaryEncoding="UTF-8" * outputUnknownUnigrams="false" * discardPunctuation="true" * /&gt; * &lt;/analyzer&gt; * &lt;/fieldType&gt; * </pre> * * <p> * Supports the following attributes: * <ul> * <li>userDictionary: User dictionary path.</li> * <li>userDictionaryEncoding: User dictionary encoding.</li> * <li>decompoundMode: Decompound mode. Either 'none', 'discard', 'mixed'. Default is discard. See {@link DecompoundMode}</li> * <li>outputUnknownUnigrams: If true outputs unigrams for unknown words.</li> * <li>discardPunctuation: true if punctuation tokens should be dropped from the output.</li> * </ul> * @lucene.experimental * * @since 7.4.0 */
public class KoreanTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware { private static final String USER_DICT_PATH = "userDictionary"; private static final String USER_DICT_ENCODING = "userDictionaryEncoding"; private static final String DECOMPOUND_MODE = "decompoundMode"; private static final String OUTPUT_UNKNOWN_UNIGRAMS = "outputUnknownUnigrams"; private static final String DISCARD_PUNCTUATION = "discardPunctuation"; private final String userDictionaryPath; private final String userDictionaryEncoding; private UserDictionary userDictionary; private final KoreanTokenizer.DecompoundMode mode; private final boolean outputUnknownUnigrams; private final boolean discardPunctuation;
Creates a new KoreanTokenizerFactory
/** Creates a new KoreanTokenizerFactory */
public KoreanTokenizerFactory(Map<String, String> args) { super(args); userDictionaryPath = args.remove(USER_DICT_PATH); userDictionaryEncoding = args.remove(USER_DICT_ENCODING); mode = KoreanTokenizer.DecompoundMode.valueOf(get(args, DECOMPOUND_MODE, KoreanTokenizer.DEFAULT_DECOMPOUND.toString()).toUpperCase(Locale.ROOT)); outputUnknownUnigrams = getBoolean(args, OUTPUT_UNKNOWN_UNIGRAMS, false); discardPunctuation = getBoolean(args, DISCARD_PUNCTUATION, true); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } } @Override public void inform(ResourceLoader loader) throws IOException { if (userDictionaryPath != null) { try (InputStream stream = loader.openResource(userDictionaryPath)) { String encoding = userDictionaryEncoding; if (encoding == null) { encoding = IOUtils.UTF_8; } CharsetDecoder decoder = Charset.forName(encoding).newDecoder() .onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); Reader reader = new InputStreamReader(stream, decoder); userDictionary = UserDictionary.open(reader); } } else { userDictionary = null; } } @Override public KoreanTokenizer create(AttributeFactory factory) { return new KoreanTokenizer(factory, userDictionary, mode, outputUnknownUnigrams, discardPunctuation); } }