/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* $Id$ */

package org.apache.fop.complexscripts.util;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.fop.util.CharUtilities;

/**
 * <p>Script related utilities.</p>
 *
 * <p>This work was originally authored by Glenn Adams (gadams@apache.org).</p>
 */
public final class CharScript { // CSOFF: LineLength // // The following script codes are based on ISO 15924. Codes less than 1000 are // official assignments from 15924; those equal to or greater than 1000 are FOP // implementation specific. //
hebrew script constant
/** hebrew script constant */
public static final int SCRIPT_HEBREW = 125; // 'hebr'
mongolian script constant
/** mongolian script constant */
public static final int SCRIPT_MONGOLIAN = 145; // 'mong'
arabic script constant
/** arabic script constant */
public static final int SCRIPT_ARABIC = 160; // 'arab'
greek script constant
/** greek script constant */
public static final int SCRIPT_GREEK = 200; // 'grek'
latin script constant
/** latin script constant */
public static final int SCRIPT_LATIN = 215; // 'latn'
cyrillic script constant
/** cyrillic script constant */
public static final int SCRIPT_CYRILLIC = 220; // 'cyrl'
georgian script constant
/** georgian script constant */
public static final int SCRIPT_GEORGIAN = 240; // 'geor'
bopomofo script constant
/** bopomofo script constant */
public static final int SCRIPT_BOPOMOFO = 285; // 'bopo'
hangul script constant
/** hangul script constant */
public static final int SCRIPT_HANGUL = 286; // 'hang'
gurmukhi script constant
/** gurmukhi script constant */
public static final int SCRIPT_GURMUKHI = 310; // 'guru'
gurmukhi 2 script constant
/** gurmukhi 2 script constant */
public static final int SCRIPT_GURMUKHI_2 = 1310; // 'gur2' -- MSFT (pseudo) script tag for variant shaping semantics
devanagari script constant
/** devanagari script constant */
public static final int SCRIPT_DEVANAGARI = 315; // 'deva'
devanagari 2 script constant
/** devanagari 2 script constant */
public static final int SCRIPT_DEVANAGARI_2 = 1315; // 'dev2' -- MSFT (pseudo) script tag for variant shaping semantics
gujarati script constant
/** gujarati script constant */
public static final int SCRIPT_GUJARATI = 320; // 'gujr'
gujarati 2 script constant
/** gujarati 2 script constant */
public static final int SCRIPT_GUJARATI_2 = 1320; // 'gjr2' -- MSFT (pseudo) script tag for variant shaping semantics
bengali script constant
/** bengali script constant */
public static final int SCRIPT_BENGALI = 326; // 'beng'
bengali 2 script constant
/** bengali 2 script constant */
public static final int SCRIPT_BENGALI_2 = 1326; // 'bng2' -- MSFT (pseudo) script tag for variant shaping semantics
oriya script constant
/** oriya script constant */
public static final int SCRIPT_ORIYA = 327; // 'orya'
oriya 2 script constant
/** oriya 2 script constant */
public static final int SCRIPT_ORIYA_2 = 1327; // 'ory2' -- MSFT (pseudo) script tag for variant shaping semantics
tibetan script constant
/** tibetan script constant */
public static final int SCRIPT_TIBETAN = 330; // 'tibt'
telugu script constant
/** telugu script constant */
public static final int SCRIPT_TELUGU = 340; // 'telu'
telugu 2 script constant
/** telugu 2 script constant */
public static final int SCRIPT_TELUGU_2 = 1340; // 'tel2' -- MSFT (pseudo) script tag for variant shaping semantics
kannada script constant
/** kannada script constant */
public static final int SCRIPT_KANNADA = 345; // 'knda'
kannada 2 script constant
/** kannada 2 script constant */
public static final int SCRIPT_KANNADA_2 = 1345; // 'knd2' -- MSFT (pseudo) script tag for variant shaping semantics
tamil script constant
/** tamil script constant */
public static final int SCRIPT_TAMIL = 346; // 'taml'
tamil 2 script constant
/** tamil 2 script constant */
public static final int SCRIPT_TAMIL_2 = 1346; // 'tml2' -- MSFT (pseudo) script tag for variant shaping semantics
malayalam script constant
/** malayalam script constant */
public static final int SCRIPT_MALAYALAM = 347; // 'mlym'
malayalam 2 script constant
/** malayalam 2 script constant */
public static final int SCRIPT_MALAYALAM_2 = 1347; // 'mlm2' -- MSFT (pseudo) script tag for variant shaping semantics
sinhalese script constant
/** sinhalese script constant */
public static final int SCRIPT_SINHALESE = 348; // 'sinh'
burmese script constant
/** burmese script constant */
public static final int SCRIPT_BURMESE = 350; // 'mymr'
thai script constant
/** thai script constant */
public static final int SCRIPT_THAI = 352; // 'thai'
khmer script constant
/** khmer script constant */
public static final int SCRIPT_KHMER = 355; // 'khmr'
lao script constant
/** lao script constant */
public static final int SCRIPT_LAO = 356; // 'laoo'
hiragana script constant
/** hiragana script constant */
public static final int SCRIPT_HIRAGANA = 410; // 'hira'
ethiopic script constant
/** ethiopic script constant */
public static final int SCRIPT_ETHIOPIC = 430; // 'ethi'
han script constant
/** han script constant */
public static final int SCRIPT_HAN = 500; // 'hani'
katakana script constant
/** katakana script constant */
public static final int SCRIPT_KATAKANA = 410; // 'kana'
math script constant
/** math script constant */
public static final int SCRIPT_MATH = 995; // 'zmth'
symbol script constant
/** symbol script constant */
public static final int SCRIPT_SYMBOL = 996; // 'zsym'
undetermined script constant
/** undetermined script constant */
public static final int SCRIPT_UNDETERMINED = 998; // 'zyyy'
uncoded script constant
/** uncoded script constant */
public static final int SCRIPT_UNCODED = 999; // 'zzzz'
/**
 * A static (class) parameter indicating whether V2 indic shaping
 * rules apply or not, with default being <code>true</code>.
 */
private static final boolean USE_V2_INDIC = true;

/** Private constructor: every member of this final class is static, so no instance is needed. */
private CharScript() {
}
Determine if character c is punctuation.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character is punctuation
/** * Determine if character c is punctuation. * @param c a character represented as a unicode scalar value * @return true if character is punctuation */
public static boolean isPunctuation(int c) { if ((c >= 0x0021) && (c <= 0x002F)) { // basic latin punctuation return true; } else if ((c >= 0x003A) && (c <= 0x0040)) { // basic latin punctuation return true; } else if ((c >= 0x005F) && (c <= 0x0060)) { // basic latin punctuation return true; } else if ((c >= 0x007E) && (c <= 0x007E)) { // basic latin punctuation return true; } else if ((c >= 0x00A1) && (c <= 0x00BF)) { // latin supplement punctuation return true; } else if ((c >= 0x00D7) && (c <= 0x00D7)) { // latin supplement punctuation return true; } else if ((c >= 0x00F7) && (c <= 0x00F7)) { // latin supplement punctuation return true; } else if ((c >= 0x2000) && (c <= 0x206F)) { // general punctuation return true; } else { // [TBD] - not complete return false; } }
Determine if character c is a digit.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character is a digit
/** * Determine if character c is a digit. * @param c a character represented as a unicode scalar value * @return true if character is a digit */
public static boolean isDigit(int c) { if ((c >= 0x0030) && (c <= 0x0039)) { // basic latin digits return true; } else { // [TBD] - not complete return false; } }
Determine if character c belong to the hebrew script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to hebrew script
/** * Determine if character c belong to the hebrew script. * @param c a character represented as a unicode scalar value * @return true if character belongs to hebrew script */
public static boolean isHebrew(int c) { if ((c >= 0x0590) && (c <= 0x05FF)) { // hebrew block return true; } else if ((c >= 0xFB00) && (c <= 0xFB4F)) { // hebrew presentation forms block return true; } else { return false; } }
Determine if character c belong to the mongolian script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to mongolian script
/** * Determine if character c belong to the mongolian script. * @param c a character represented as a unicode scalar value * @return true if character belongs to mongolian script */
public static boolean isMongolian(int c) { if ((c >= 0x1800) && (c <= 0x18AF)) { // mongolian block return true; } else { return false; } }
Determine if character c belong to the arabic script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to arabic script
/** * Determine if character c belong to the arabic script. * @param c a character represented as a unicode scalar value * @return true if character belongs to arabic script */
public static boolean isArabic(int c) { if ((c >= 0x0600) && (c <= 0x06FF)) { // arabic block return true; } else if ((c >= 0x0750) && (c <= 0x077F)) { // arabic supplement block return true; } else if ((c >= 0xFB50) && (c <= 0xFDFF)) { // arabic presentation forms a block return true; } else if ((c >= 0xFE70) && (c <= 0xFEFF)) { // arabic presentation forms b block return true; } else { return false; } }
Determine if character c belong to the greek script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to greek script
/** * Determine if character c belong to the greek script. * @param c a character represented as a unicode scalar value * @return true if character belongs to greek script */
public static boolean isGreek(int c) { if ((c >= 0x0370) && (c <= 0x03FF)) { // greek (and coptic) block return true; } else if ((c >= 0x1F00) && (c <= 0x1FFF)) { // greek extended block return true; } else { return false; } }
Determine if character c belong to the latin script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to latin script
/** * Determine if character c belong to the latin script. * @param c a character represented as a unicode scalar value * @return true if character belongs to latin script */
public static boolean isLatin(int c) { if ((c >= 0x0041) && (c <= 0x005A)) { // basic latin upper case return true; } else if ((c >= 0x0061) && (c <= 0x007A)) { // basic latin lower case return true; } else if ((c >= 0x00C0) && (c <= 0x00D6)) { // latin supplement upper case return true; } else if ((c >= 0x00D8) && (c <= 0x00DF)) { // latin supplement upper case return true; } else if ((c >= 0x00E0) && (c <= 0x00F6)) { // latin supplement lower case return true; } else if ((c >= 0x00F8) && (c <= 0x00FF)) { // latin supplement lower case return true; } else if ((c >= 0x0100) && (c <= 0x017F)) { // latin extended a return true; } else if ((c >= 0x0180) && (c <= 0x024F)) { // latin extended b return true; } else if ((c >= 0x1E00) && (c <= 0x1EFF)) { // latin extended additional return true; } else if ((c >= 0x2C60) && (c <= 0x2C7F)) { // latin extended c return true; } else if ((c >= 0xA720) && (c <= 0xA7FF)) { // latin extended d return true; } else if ((c >= 0xFB00) && (c <= 0xFB0F)) { // latin ligatures return true; } else { return false; } }
Determine if character c belong to the cyrillic script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to cyrillic script
/** * Determine if character c belong to the cyrillic script. * @param c a character represented as a unicode scalar value * @return true if character belongs to cyrillic script */
public static boolean isCyrillic(int c) { if ((c >= 0x0400) && (c <= 0x04FF)) { // cyrillic block return true; } else if ((c >= 0x0500) && (c <= 0x052F)) { // cyrillic supplement block return true; } else if ((c >= 0x2DE0) && (c <= 0x2DFF)) { // cyrillic extended-a block return true; } else if ((c >= 0xA640) && (c <= 0xA69F)) { // cyrillic extended-b block return true; } else { return false; } }
Determine if character c belong to the georgian script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to georgian script
/** * Determine if character c belong to the georgian script. * @param c a character represented as a unicode scalar value * @return true if character belongs to georgian script */
public static boolean isGeorgian(int c) { if ((c >= 0x10A0) && (c <= 0x10FF)) { // georgian block return true; } else if ((c >= 0x2D00) && (c <= 0x2D2F)) { // georgian supplement block return true; } else { return false; } }
Determine if character c belong to the hangul script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to hangul script
/** * Determine if character c belong to the hangul script. * @param c a character represented as a unicode scalar value * @return true if character belongs to hangul script */
public static boolean isHangul(int c) { if ((c >= 0x1100) && (c <= 0x11FF)) { // hangul jamo return true; } else if ((c >= 0x3130) && (c <= 0x318F)) { // hangul compatibility jamo return true; } else if ((c >= 0xA960) && (c <= 0xA97F)) { // hangul jamo extended a return true; } else if ((c >= 0xAC00) && (c <= 0xD7A3)) { // hangul syllables return true; } else if ((c >= 0xD7B0) && (c <= 0xD7FF)) { // hangul jamo extended a return true; } else { return false; } }
Determine if character c belong to the gurmukhi script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to gurmukhi script
/** * Determine if character c belong to the gurmukhi script. * @param c a character represented as a unicode scalar value * @return true if character belongs to gurmukhi script */
public static boolean isGurmukhi(int c) { if ((c >= 0x0A00) && (c <= 0x0A7F)) { // gurmukhi block return true; } else { return false; } }
Determine if character c belong to the devanagari script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to devanagari script
/** * Determine if character c belong to the devanagari script. * @param c a character represented as a unicode scalar value * @return true if character belongs to devanagari script */
public static boolean isDevanagari(int c) { if ((c >= 0x0900) && (c <= 0x097F)) { // devangari block return true; } else if ((c >= 0xA8E0) && (c <= 0xA8FF)) { // devangari extended block return true; } else { return false; } }
Determine if character c belong to the gujarati script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to gujarati script
/** * Determine if character c belong to the gujarati script. * @param c a character represented as a unicode scalar value * @return true if character belongs to gujarati script */
public static boolean isGujarati(int c) { if ((c >= 0x0A80) && (c <= 0x0AFF)) { // gujarati block return true; } else { return false; } }
Determine if character c belong to the bengali script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to bengali script
/** * Determine if character c belong to the bengali script. * @param c a character represented as a unicode scalar value * @return true if character belongs to bengali script */
public static boolean isBengali(int c) { if ((c >= 0x0980) && (c <= 0x09FF)) { // bengali block return true; } else { return false; } }
Determine if character c belong to the oriya script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to oriya script
/** * Determine if character c belong to the oriya script. * @param c a character represented as a unicode scalar value * @return true if character belongs to oriya script */
public static boolean isOriya(int c) { if ((c >= 0x0B00) && (c <= 0x0B7F)) { // oriya block return true; } else { return false; } }
Determine if character c belong to the tibetan script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to tibetan script
/** * Determine if character c belong to the tibetan script. * @param c a character represented as a unicode scalar value * @return true if character belongs to tibetan script */
public static boolean isTibetan(int c) { if ((c >= 0x0F00) && (c <= 0x0FFF)) { // tibetan block return true; } else { return false; } }
Determine if character c belong to the telugu script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to telugu script
/** * Determine if character c belong to the telugu script. * @param c a character represented as a unicode scalar value * @return true if character belongs to telugu script */
public static boolean isTelugu(int c) { if ((c >= 0x0C00) && (c <= 0x0C7F)) { // telugu block return true; } else { return false; } }
Determine if character c belong to the kannada script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to kannada script
/** * Determine if character c belong to the kannada script. * @param c a character represented as a unicode scalar value * @return true if character belongs to kannada script */
public static boolean isKannada(int c) { if ((c >= 0x0C00) && (c <= 0x0C7F)) { // kannada block return true; } else { return false; } }
Determine if character c belong to the tamil script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to tamil script
/** * Determine if character c belong to the tamil script. * @param c a character represented as a unicode scalar value * @return true if character belongs to tamil script */
public static boolean isTamil(int c) { if ((c >= 0x0B80) && (c <= 0x0BFF)) { // tamil block return true; } else { return false; } }
Determine if character c belong to the malayalam script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to malayalam script
/** * Determine if character c belong to the malayalam script. * @param c a character represented as a unicode scalar value * @return true if character belongs to malayalam script */
public static boolean isMalayalam(int c) { if ((c >= 0x0D00) && (c <= 0x0D7F)) { // malayalam block return true; } else { return false; } }
Determine if character c belong to the sinhalese script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to sinhalese script
/** * Determine if character c belong to the sinhalese script. * @param c a character represented as a unicode scalar value * @return true if character belongs to sinhalese script */
public static boolean isSinhalese(int c) { if ((c >= 0x0D80) && (c <= 0x0DFF)) { // sinhala block return true; } else { return false; } }
Determine if character c belong to the burmese script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to burmese script
/** * Determine if character c belong to the burmese script. * @param c a character represented as a unicode scalar value * @return true if character belongs to burmese script */
public static boolean isBurmese(int c) { if ((c >= 0x1000) && (c <= 0x109F)) { // burmese (myanmar) block return true; } else if ((c >= 0xAA60) && (c <= 0xAA7F)) { // burmese (myanmar) extended block return true; } else { return false; } }
Determine if character c belong to the thai script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to thai script
/** * Determine if character c belong to the thai script. * @param c a character represented as a unicode scalar value * @return true if character belongs to thai script */
public static boolean isThai(int c) { if ((c >= 0x0E00) && (c <= 0x0E7F)) { // thai block return true; } else { return false; } }
Determine if character c belong to the khmer script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to khmer script
/** * Determine if character c belong to the khmer script. * @param c a character represented as a unicode scalar value * @return true if character belongs to khmer script */
public static boolean isKhmer(int c) { if ((c >= 0x1780) && (c <= 0x17FF)) { // khmer block return true; } else if ((c >= 0x19E0) && (c <= 0x19FF)) { // khmer symbols block return true; } else { return false; } }
Determine if character c belong to the lao script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to lao script
/** * Determine if character c belong to the lao script. * @param c a character represented as a unicode scalar value * @return true if character belongs to lao script */
public static boolean isLao(int c) { if ((c >= 0x0E80) && (c <= 0x0EFF)) { // lao block return true; } else { return false; } }
Determine if character c belong to the ethiopic (amharic) script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to ethiopic (amharic) script
/** * Determine if character c belong to the ethiopic (amharic) script. * @param c a character represented as a unicode scalar value * @return true if character belongs to ethiopic (amharic) script */
public static boolean isEthiopic(int c) { if ((c >= 0x1200) && (c <= 0x137F)) { // ethiopic block return true; } else if ((c >= 0x1380) && (c <= 0x139F)) { // ethoipic supplement block return true; } else if ((c >= 0x2D80) && (c <= 0x2DDF)) { // ethoipic extended block return true; } else if ((c >= 0xAB00) && (c <= 0xAB2F)) { // ethoipic extended-a block return true; } else { return false; } }
Determine if character c belong to the han (unified cjk) script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to han (unified cjk) script
/** * Determine if character c belong to the han (unified cjk) script. * @param c a character represented as a unicode scalar value * @return true if character belongs to han (unified cjk) script */
public static boolean isHan(int c) { if ((c >= 0x3400) && (c <= 0x4DBF)) { return true; // cjk unified ideographs extension a } else if ((c >= 0x4E00) && (c <= 0x9FFF)) { return true; // cjk unified ideographs } else if ((c >= 0xF900) && (c <= 0xFAFF)) { return true; // cjk compatibility ideographs } else if ((c >= 0x20000) && (c <= 0x2A6DF)) { return true; // cjk unified ideographs extension b } else if ((c >= 0x2A700) && (c <= 0x2B73F)) { return true; // cjk unified ideographs extension c } else if ((c >= 0x2F800) && (c <= 0x2FA1F)) { return true; // cjk compatibility ideographs supplement } else { return false; } }
Determine if character c belong to the bopomofo script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to bopomofo script
/** * Determine if character c belong to the bopomofo script. * @param c a character represented as a unicode scalar value * @return true if character belongs to bopomofo script */
public static boolean isBopomofo(int c) { if ((c >= 0x3100) && (c <= 0x312F)) { return true; } else { return false; } }
Determine if character c belong to the hiragana script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to hiragana script
/** * Determine if character c belong to the hiragana script. * @param c a character represented as a unicode scalar value * @return true if character belongs to hiragana script */
public static boolean isHiragana(int c) { if ((c >= 0x3040) && (c <= 0x309F)) { return true; } else { return false; } }
Determine if character c belong to the katakana script.
Params:
  • c – a character represented as a unicode scalar value
Returns:true if character belongs to katakana script
/** * Determine if character c belong to the katakana script. * @param c a character represented as a unicode scalar value * @return true if character belongs to katakana script */
public static boolean isKatakana(int c) { if ((c >= 0x30A0) && (c <= 0x30FF)) { return true; } else if ((c >= 0x31F0) && (c <= 0x31FF)) { return true; } else { return false; } }
Obtain ISO15924 numeric script code of character. If script is not or cannot be determined, then the script code 998 ('zyyy') is returned.
Params:
  • c – the character to obtain script
Returns:an ISO15924 script code
/** * Obtain ISO15924 numeric script code of character. If script is not or cannot be determined, * then the script code 998 ('zyyy') is returned. * @param c the character to obtain script * @return an ISO15924 script code */
public static int scriptOf(int c) { // [TBD] - needs optimization!!! if (CharUtilities.isAnySpace(c)) { return SCRIPT_UNDETERMINED; } else if (isPunctuation(c)) { return SCRIPT_UNDETERMINED; } else if (isDigit(c)) { return SCRIPT_UNDETERMINED; } else if (isLatin(c)) { return SCRIPT_LATIN; } else if (isCyrillic(c)) { return SCRIPT_CYRILLIC; } else if (isGreek(c)) { return SCRIPT_GREEK; } else if (isHan(c)) { return SCRIPT_HAN; } else if (isBopomofo(c)) { return SCRIPT_BOPOMOFO; } else if (isKatakana(c)) { return SCRIPT_KATAKANA; } else if (isHiragana(c)) { return SCRIPT_HIRAGANA; } else if (isHangul(c)) { return SCRIPT_HANGUL; } else if (isArabic(c)) { return SCRIPT_ARABIC; } else if (isHebrew(c)) { return SCRIPT_HEBREW; } else if (isMongolian(c)) { return SCRIPT_MONGOLIAN; } else if (isGeorgian(c)) { return SCRIPT_GEORGIAN; } else if (isGurmukhi(c)) { return useV2IndicRules(SCRIPT_GURMUKHI); } else if (isDevanagari(c)) { return useV2IndicRules(SCRIPT_DEVANAGARI); } else if (isGujarati(c)) { return useV2IndicRules(SCRIPT_GUJARATI); } else if (isBengali(c)) { return useV2IndicRules(SCRIPT_BENGALI); } else if (isOriya(c)) { return useV2IndicRules(SCRIPT_ORIYA); } else if (isTibetan(c)) { return SCRIPT_TIBETAN; } else if (isTelugu(c)) { return useV2IndicRules(SCRIPT_TELUGU); } else if (isKannada(c)) { return useV2IndicRules(SCRIPT_KANNADA); } else if (isTamil(c)) { return useV2IndicRules(SCRIPT_TAMIL); } else if (isMalayalam(c)) { return useV2IndicRules(SCRIPT_MALAYALAM); } else if (isSinhalese(c)) { return SCRIPT_SINHALESE; } else if (isBurmese(c)) { return SCRIPT_BURMESE; } else if (isThai(c)) { return SCRIPT_THAI; } else if (isKhmer(c)) { return SCRIPT_KHMER; } else if (isLao(c)) { return SCRIPT_LAO; } else if (isEthiopic(c)) { return SCRIPT_ETHIOPIC; } else { return SCRIPT_UNDETERMINED; } }
Obtain the V2 indic script code corresponding to V1 indic script code SC if and only iff V2 indic rules apply; otherwise return SC.
Params:
  • sc – a V1 indic script code
Returns:either SC or the V2 flavor of SC if V2 indic rules apply
/** * Obtain the V2 indic script code corresponding to V1 indic script code SC if * and only iff V2 indic rules apply; otherwise return SC. * @param sc a V1 indic script code * @return either SC or the V2 flavor of SC if V2 indic rules apply */
public static int useV2IndicRules(int sc) { if (USE_V2_INDIC) { return (sc < 1000) ? (sc + 1000) : sc; } else { return sc; } }
Obtain the script codes of each character in a character sequence. If script is not or cannot be determined for some character, then the script code 998 ('zyyy') is returned.
Params:
  • cs – the character sequence
Returns:a (possibly empty) array of script codes
/** * Obtain the script codes of each character in a character sequence. If script * is not or cannot be determined for some character, then the script code 998 * ('zyyy') is returned. * @param cs the character sequence * @return a (possibly empty) array of script codes */
public static int[] scriptsOf(CharSequence cs) { Set s = new HashSet(); for (int i = 0, n = cs.length(); i < n; i++) { s.add(scriptOf(cs.charAt(i))); } int[] sa = new int [ s.size() ]; int ns = 0; for (Object value : s) { sa[ns++] = (Integer) value; } Arrays.sort(sa); return sa; }
Determine the dominant script of a character sequence.
Params:
  • cs – the character sequence
Returns:the dominant script or SCRIPT_UNDETERMINED
/** * Determine the dominant script of a character sequence. * @param cs the character sequence * @return the dominant script or SCRIPT_UNDETERMINED */
public static int dominantScript(CharSequence cs) { Map m = new HashMap(); for (int i = 0, n = cs.length(); i < n; i++) { int c = cs.charAt(i); int s = scriptOf(c); Integer k = s; Integer v = (Integer) m.get(k); if (v != null) { m.put(k, v + 1); } else { m.put(k, 0); } } int sMax = -1; int cMax = -1; for (Object o : m.entrySet()) { Map.Entry e = (Map.Entry) o; Integer k = (Integer) e.getKey(); int s = k; switch (s) { case SCRIPT_UNDETERMINED: case SCRIPT_UNCODED: break; default: Integer v = (Integer) e.getValue(); assert v != null; int c = v; if (c > cMax) { cMax = c; sMax = s; } break; } } if (sMax < 0) { sMax = SCRIPT_UNDETERMINED; } return sMax; }
Determine if script tag denotes an 'Indic' script, where a script is an 'Indic' script if it is intended to be processed by the generic 'Indic' Script Processor.
Params:
  • script – a script tag
Returns:true if script tag is a designated 'Indic' script
/** * Determine if script tag denotes an 'Indic' script, where a * script is an 'Indic' script if it is intended to be processed by * the generic 'Indic' Script Processor. * @param script a script tag * @return true if script tag is a designated 'Indic' script */
public static boolean isIndicScript(String script) { return isIndicScript(scriptCodeFromTag(script)); }
Determine if script tag denotes an 'Indic' script, where a script is an 'Indic' script if it is intended to be processed by the generic 'Indic' Script Processor.
Params:
  • script – a script code
Returns:true if script code is a designated 'Indic' script
/** * Determine if script tag denotes an 'Indic' script, where a * script is an 'Indic' script if it is intended to be processed by * the generic 'Indic' Script Processor. * @param script a script code * @return true if script code is a designated 'Indic' script */
public static boolean isIndicScript(int script) { switch (script) { case SCRIPT_BENGALI: case SCRIPT_BENGALI_2: case SCRIPT_BURMESE: case SCRIPT_DEVANAGARI: case SCRIPT_DEVANAGARI_2: case SCRIPT_GUJARATI: case SCRIPT_GUJARATI_2: case SCRIPT_GURMUKHI: case SCRIPT_GURMUKHI_2: case SCRIPT_KANNADA: case SCRIPT_KANNADA_2: case SCRIPT_MALAYALAM: case SCRIPT_MALAYALAM_2: case SCRIPT_ORIYA: case SCRIPT_ORIYA_2: case SCRIPT_TAMIL: case SCRIPT_TAMIL_2: case SCRIPT_TELUGU: case SCRIPT_TELUGU_2: return true; default: return false; } }
Determine the script tag associated with an internal script code.
Params:
  • code – the script code
Returns:a script tag
/** * Determine the script tag associated with an internal script code. * @param code the script code * @return a script tag */
public static String scriptTagFromCode(int code) { Map<Integer, String> m = getScriptTagsMap(); if (m != null) { String tag; if ((tag = m.get(code)) != null) { return tag; } else { return ""; } } else { return ""; } }
/**
 * Determine the internal script code associated with a script tag.
 * SCRIPT_UNDETERMINED is returned when the tag is not registered.
 *
 * @param tag the script tag
 * @return a script code
 */
public static int scriptCodeFromTag(String tag) {
    Map<String, Integer> codes = getScriptCodeMap();
    Integer code = (codes != null) ? codes.get(tag) : null;
    return (code != null) ? code.intValue() : SCRIPT_UNDETERMINED;
}

/** Maps script code to script tag; created on first use by makeScriptMaps(). */
private static Map<Integer, String> scriptTagsMap;

/** Maps script tag to script code; created on first use by makeScriptMaps(). */
private static Map<String, Integer> scriptCodeMap;

/**
 * Register one code/tag pair in both lookup directions. A later registration
 * with the same code or the same tag replaces the earlier entry.
 */
private static void putScriptTag(Map<Integer, String> tm, Map<String, Integer> cm, int code, String tag) {
    assert tag != null;
    assert tag.length() != 0;
    assert code >= 0;
    assert code < 2000;
    tm.put(code, tag);
    cm.put(tag, code);
}

/** Build both lookup maps and publish them in the static fields. */
private static void makeScriptMaps() {
    Map<Integer, String> tm = new HashMap<Integer, String>();
    Map<String, Integer> cm = new HashMap<String, Integer>();
    putScriptTag(tm, cm, SCRIPT_HEBREW, "hebr");
    putScriptTag(tm, cm, SCRIPT_MONGOLIAN, "mong");
    putScriptTag(tm, cm, SCRIPT_ARABIC, "arab");
    putScriptTag(tm, cm, SCRIPT_GREEK, "grek");
    putScriptTag(tm, cm, SCRIPT_LATIN, "latn");
    putScriptTag(tm, cm, SCRIPT_CYRILLIC, "cyrl");
    putScriptTag(tm, cm, SCRIPT_GEORGIAN, "geor");
    putScriptTag(tm, cm, SCRIPT_BOPOMOFO, "bopo");
    putScriptTag(tm, cm, SCRIPT_HANGUL, "hang");
    putScriptTag(tm, cm, SCRIPT_GURMUKHI, "guru");
    putScriptTag(tm, cm, SCRIPT_GURMUKHI_2, "gur2");
    putScriptTag(tm, cm, SCRIPT_DEVANAGARI, "deva");
    putScriptTag(tm, cm, SCRIPT_DEVANAGARI_2, "dev2");
    putScriptTag(tm, cm, SCRIPT_GUJARATI, "gujr");
    putScriptTag(tm, cm, SCRIPT_GUJARATI_2, "gjr2");
    putScriptTag(tm, cm, SCRIPT_BENGALI, "beng");
    putScriptTag(tm, cm, SCRIPT_BENGALI_2, "bng2");
    putScriptTag(tm, cm, SCRIPT_ORIYA, "orya");
    putScriptTag(tm, cm, SCRIPT_ORIYA_2, "ory2");
    putScriptTag(tm, cm, SCRIPT_TIBETAN, "tibt");
    putScriptTag(tm, cm, SCRIPT_TELUGU, "telu");
    putScriptTag(tm, cm, SCRIPT_TELUGU_2, "tel2");
    putScriptTag(tm, cm, SCRIPT_KANNADA, "knda");
    putScriptTag(tm, cm, SCRIPT_KANNADA_2, "knd2");
    putScriptTag(tm, cm, SCRIPT_TAMIL, "taml");
    putScriptTag(tm, cm, SCRIPT_TAMIL_2, "tml2");
    putScriptTag(tm, cm, SCRIPT_MALAYALAM, "mlym");
    putScriptTag(tm, cm, SCRIPT_MALAYALAM_2, "mlm2");
    putScriptTag(tm, cm, SCRIPT_SINHALESE, "sinh");
    putScriptTag(tm, cm, SCRIPT_BURMESE, "mymr");
    putScriptTag(tm, cm, SCRIPT_THAI, "thai");
    putScriptTag(tm, cm, SCRIPT_KHMER, "khmr");
    putScriptTag(tm, cm, SCRIPT_LAO, "laoo");
    putScriptTag(tm, cm, SCRIPT_HIRAGANA, "hira");
    putScriptTag(tm, cm, SCRIPT_ETHIOPIC, "ethi");
    putScriptTag(tm, cm, SCRIPT_HAN, "hani");
    putScriptTag(tm, cm, SCRIPT_KATAKANA, "kana");
    putScriptTag(tm, cm, SCRIPT_MATH, "zmth");
    putScriptTag(tm, cm, SCRIPT_SYMBOL, "zsym");
    putScriptTag(tm, cm, SCRIPT_UNDETERMINED, "zyyy");
    putScriptTag(tm, cm, SCRIPT_UNCODED, "zzzz");
    scriptTagsMap = tm;
    scriptCodeMap = cm;
}

/** Return the code-to-tag map, building both maps first if it is still null. */
private static Map<Integer, String> getScriptTagsMap() {
    if (scriptTagsMap == null) {
        makeScriptMaps();
    }
    return scriptTagsMap;
}

/** Return the tag-to-code map, building both maps first if it is still null. */
private static Map<String, Integer> getScriptCodeMap() {
    if (scriptCodeMap == null) {
        makeScriptMaps();
    }
    return scriptCodeMap;
}
}