286 lines
11 KiB
Java
286 lines
11 KiB
Java
/* GENERATED SOURCE. DO NOT MODIFY. */
|
||
// © 2017 and later: Unicode, Inc. and others.
|
||
// License & terms of use: http://www.unicode.org/copyright.html
|
||
package android.icu.impl;
|
||
|
||
import java.util.EnumMap;
|
||
import java.util.Map;
|
||
|
||
import android.icu.impl.UResource.Value;
|
||
import android.icu.text.UnicodeSet;
|
||
import android.icu.util.ULocale;
|
||
import android.icu.util.UResourceBundle;
|
||
|
||
/**
|
||
* This class statically initializes UnicodeSets, originally built for number parsing. Microbenchmarks
|
||
* show this to bring a very sizeable performance boost.
|
||
*
|
||
* IMPORTANT ASSUMPTION FOR NUMBER PARSING: All of the sets contain code points (no strings) and they are
|
||
* all case-folded. If this assumption were ever broken, logic in classes such as SymbolMatcher would
|
||
* need to be updated in order to return well-formed sets upon calls to getLeadCodePoints().
|
||
*
|
||
* @author sffc
|
||
* @hide Only a subset of ICU is exposed in Android
|
||
*/
|
||
public class StaticUnicodeSets {
|
||
/**
|
||
* @hide Only a subset of ICU is exposed in Android
|
||
*/
|
||
public static enum Key {
|
||
EMPTY,
|
||
// Ignorables
|
||
DEFAULT_IGNORABLES,
|
||
STRICT_IGNORABLES,
|
||
|
||
// Separators
|
||
// Notes:
|
||
// - COMMA is a superset of STRICT_COMMA
|
||
// - PERIOD is a superset of SCRICT_PERIOD
|
||
// - ALL_SEPARATORS is the union of COMMA, PERIOD, and OTHER_GROUPING_SEPARATORS
|
||
// - STRICT_ALL_SEPARATORS is the union of STRICT_COMMA, STRICT_PERIOD, and OTHER_GRP_SEPARATORS
|
||
COMMA,
|
||
PERIOD,
|
||
STRICT_COMMA,
|
||
STRICT_PERIOD,
|
||
APOSTROPHE_SIGN,
|
||
OTHER_GROUPING_SEPARATORS,
|
||
ALL_SEPARATORS,
|
||
STRICT_ALL_SEPARATORS,
|
||
|
||
// Symbols
|
||
// TODO: NaN?
|
||
MINUS_SIGN,
|
||
PLUS_SIGN,
|
||
PERCENT_SIGN,
|
||
PERMILLE_SIGN,
|
||
INFINITY_SIGN,
|
||
|
||
// Currency Symbols
|
||
DOLLAR_SIGN,
|
||
POUND_SIGN,
|
||
RUPEE_SIGN,
|
||
YEN_SIGN,
|
||
WON_SIGN,
|
||
|
||
// Other
|
||
DIGITS,
|
||
|
||
// Combined Separators with Digits (for lead code points)
|
||
DIGITS_OR_ALL_SEPARATORS,
|
||
DIGITS_OR_STRICT_ALL_SEPARATORS,
|
||
};
|
||
|
||
private static final Map<Key, UnicodeSet> unicodeSets = new EnumMap<>(Key.class);
|
||
|
||
/**
|
||
* Gets the static-allocated UnicodeSet according to the provided key.
|
||
*
|
||
* @param key
|
||
* The desired UnicodeSet according to the enum in this file.
|
||
* @return The requested UnicodeSet. Guaranteed to be frozen and non-null, but may be empty if an
|
||
* error occurred during data loading.
|
||
*/
|
||
public static UnicodeSet get(Key key) {
|
||
UnicodeSet candidate = unicodeSets.get(key);
|
||
if (candidate == null) {
|
||
return UnicodeSet.EMPTY;
|
||
}
|
||
return candidate;
|
||
}
|
||
|
||
/**
|
||
* Checks if the UnicodeSet given by key1 contains the given string.
|
||
*
|
||
* @param str
|
||
* The string to check.
|
||
* @param key1
|
||
* The set to check.
|
||
* @return key1 if the set contains str, or COUNT if not.
|
||
*/
|
||
public static Key chooseFrom(String str, Key key1) {
|
||
return get(key1).contains(str) ? key1 : null;
|
||
}
|
||
|
||
/**
|
||
* Checks if the UnicodeSet given by either key1 or key2 contains the string.
|
||
*
|
||
* Exported as U_COMMON_API for numparse_decimal.cpp
|
||
*
|
||
* @param str
|
||
* The string to check.
|
||
* @param key1
|
||
* The first set to check.
|
||
* @param key2
|
||
* The second set to check.
|
||
* @return key1 if that set contains str; key2 if that set contains str; or COUNT if neither set
|
||
* contains str.
|
||
*/
|
||
public static Key chooseFrom(String str, Key key1, Key key2) {
|
||
return get(key1).contains(str) ? key1 : chooseFrom(str, key2);
|
||
}
|
||
|
||
/**
|
||
* Looks through all Currency-related sets for the given string, returning the first match or null if
|
||
* no match was round.
|
||
*/
|
||
public static Key chooseCurrency(String str) {
|
||
if (get(Key.DOLLAR_SIGN).contains(str)) {
|
||
return Key.DOLLAR_SIGN;
|
||
} else if (get(Key.POUND_SIGN).contains(str)) {
|
||
return Key.POUND_SIGN;
|
||
} else if (get(Key.RUPEE_SIGN).contains(str)) {
|
||
return Key.RUPEE_SIGN;
|
||
} else if (get(Key.YEN_SIGN).contains(str)) {
|
||
return Key.YEN_SIGN;
|
||
} else if (get(Key.WON_SIGN).contains(str)) {
|
||
return Key.WON_SIGN;
|
||
} else {
|
||
return null;
|
||
}
|
||
}
|
||
|
||
private static UnicodeSet computeUnion(Key k1, Key k2) {
|
||
return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).freeze();
|
||
}
|
||
|
||
private static UnicodeSet computeUnion(Key k1, Key k2, Key k3) {
|
||
return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).addAll(get(k3)).freeze();
|
||
}
|
||
|
||
private static void saveSet(Key key, String unicodeSetPattern) {
|
||
assert unicodeSets.get(key) == null;
|
||
unicodeSets.put(key, new UnicodeSet(unicodeSetPattern).freeze());
|
||
}
|
||
|
||
/*
|
||
parse{
|
||
date{
|
||
lenient{
|
||
"[\\--/]",
|
||
"[\\:∶]",
|
||
}
|
||
}
|
||
general{
|
||
lenient{
|
||
"[.․。︒﹒.。]",
|
||
"[\$﹩$$]",
|
||
"[£₤]",
|
||
"[₨₹{Rp}{Rs}]",
|
||
}
|
||
}
|
||
number{
|
||
lenient{
|
||
"[\\-‒⁻₋−➖﹣-]",
|
||
"[,،٫、︐︑﹐﹑,、]",
|
||
"[+⁺₊➕﬩﹢+]",
|
||
}
|
||
stricter{
|
||
"[,٫︐﹐,]",
|
||
"[.․﹒.。]",
|
||
}
|
||
}
|
||
}
|
||
*/
|
||
static class ParseDataSink extends UResource.Sink {
|
||
@Override
|
||
public void put(android.icu.impl.UResource.Key key, Value value, boolean noFallback) {
|
||
UResource.Table contextsTable = value.getTable();
|
||
for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
|
||
if (key.contentEquals("date")) {
|
||
// ignore
|
||
} else {
|
||
assert key.contentEquals("general") || key.contentEquals("number");
|
||
UResource.Table strictnessTable = value.getTable();
|
||
for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
|
||
boolean isLenient = key.contentEquals("lenient");
|
||
UResource.Array array = value.getArray();
|
||
for (int k = 0; k < array.getSize(); k++) {
|
||
array.getValue(k, value);
|
||
String str = value.toString();
|
||
// There is both lenient and strict data for comma/period,
|
||
// but not for any of the other symbols.
|
||
if (str.indexOf('.') != -1) {
|
||
saveSet(isLenient ? Key.PERIOD : Key.STRICT_PERIOD, str);
|
||
} else if (str.indexOf(',') != -1) {
|
||
saveSet(isLenient ? Key.COMMA : Key.STRICT_COMMA, str);
|
||
} else if (str.indexOf('+') != -1) {
|
||
saveSet(Key.PLUS_SIGN, str);
|
||
} else if (str.indexOf('-') != -1) {
|
||
saveSet(Key.MINUS_SIGN, str);
|
||
} else if (str.indexOf('$') != -1) {
|
||
saveSet(Key.DOLLAR_SIGN, str);
|
||
} else if (str.indexOf('£') != -1) {
|
||
saveSet(Key.POUND_SIGN, str);
|
||
} else if (str.indexOf('₹') != -1) {
|
||
saveSet(Key.RUPEE_SIGN, str);
|
||
} else if (str.indexOf('¥') != -1) {
|
||
saveSet(Key.YEN_SIGN, str);
|
||
} else if (str.indexOf('₩') != -1) {
|
||
saveSet(Key.WON_SIGN, str);
|
||
} else if (str.indexOf('%') != -1) {
|
||
saveSet(Key.PERCENT_SIGN, str);
|
||
} else if (str.indexOf('‰') != -1) {
|
||
saveSet(Key.PERMILLE_SIGN, str);
|
||
} else if (str.indexOf('’') != -1) {
|
||
saveSet(Key.APOSTROPHE_SIGN, str);
|
||
} else {
|
||
// TODO(ICU-20428): Make ICU automatically accept new classes?
|
||
throw new AssertionError("Unknown class of parse lenients: " + str);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
static {
|
||
unicodeSets.put(Key.EMPTY, new UnicodeSet("[]").freeze());
|
||
// These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
|
||
// Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
|
||
unicodeSets.put(Key.DEFAULT_IGNORABLES,
|
||
new UnicodeSet("[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]").freeze());
|
||
unicodeSets.put(Key.STRICT_IGNORABLES, new UnicodeSet("[[:Bidi_Control:]]").freeze());
|
||
|
||
// CLDR provides data for comma, period, minus sign, and plus sign.
|
||
ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle
|
||
.getBundleInstance(ICUData.ICU_BASE_NAME, ULocale.ROOT);
|
||
rb.getAllItemsWithFallback("parse", new ParseDataSink());
|
||
|
||
// NOTE: It is OK for these assertions to fail if there was a no-data build.
|
||
assert unicodeSets.containsKey(Key.COMMA);
|
||
assert unicodeSets.containsKey(Key.STRICT_COMMA);
|
||
assert unicodeSets.containsKey(Key.PERIOD);
|
||
assert unicodeSets.containsKey(Key.STRICT_PERIOD);
|
||
assert unicodeSets.containsKey(Key.APOSTROPHE_SIGN);
|
||
|
||
UnicodeSet otherGrouping = new UnicodeSet(
|
||
"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]");
|
||
otherGrouping.addAll(unicodeSets.get(Key.APOSTROPHE_SIGN));
|
||
unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS, otherGrouping.freeze());
|
||
unicodeSets.put(Key.ALL_SEPARATORS,
|
||
computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
|
||
unicodeSets.put(Key.STRICT_ALL_SEPARATORS,
|
||
computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
|
||
|
||
assert unicodeSets.containsKey(Key.MINUS_SIGN);
|
||
assert unicodeSets.containsKey(Key.PLUS_SIGN);
|
||
assert unicodeSets.containsKey(Key.PERCENT_SIGN);
|
||
assert unicodeSets.containsKey(Key.PERMILLE_SIGN);
|
||
|
||
unicodeSets.put(Key.INFINITY_SIGN, new UnicodeSet("[∞]").freeze());
|
||
|
||
assert unicodeSets.containsKey(Key.DOLLAR_SIGN);
|
||
assert unicodeSets.containsKey(Key.POUND_SIGN);
|
||
assert unicodeSets.containsKey(Key.RUPEE_SIGN);
|
||
assert unicodeSets.containsKey(Key.YEN_SIGN);
|
||
assert unicodeSets.containsKey(Key.WON_SIGN);
|
||
|
||
unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());
|
||
|
||
unicodeSets.put(Key.DIGITS_OR_ALL_SEPARATORS, computeUnion(Key.DIGITS, Key.ALL_SEPARATORS));
|
||
unicodeSets.put(Key.DIGITS_OR_STRICT_ALL_SEPARATORS,
|
||
computeUnion(Key.DIGITS, Key.STRICT_ALL_SEPARATORS));
|
||
}
|
||
}
|