/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package android.icu.impl;

import java.util.EnumMap;
import java.util.Map;

import android.icu.impl.UResource.Value;
import android.icu.text.UnicodeSet;
import android.icu.util.ULocale;
import android.icu.util.UResourceBundle;

/**
 * This class statically initializes UnicodeSets, originally built for number parsing. Microbenchmarks
 * show this to bring a very sizeable performance boost.
 *
 * IMPORTANT ASSUMPTION FOR NUMBER PARSING: All of the sets contain code points (no strings) and they are
 * all case-folded. If this assumption were ever broken, logic in classes such as SymbolMatcher would
 * need to be updated in order to return well-formed sets upon calls to getLeadCodePoints().
 *
 * @author sffc
 * @hide Only a subset of ICU is exposed in Android
 */
public class StaticUnicodeSets {
    /**
     * Identifiers for the statically allocated sets served by {@link #get(Key)}.
     *
     * @hide Only a subset of ICU is exposed in Android
     */
    public enum Key {
        EMPTY,

        // Ignorables
        DEFAULT_IGNORABLES,
        STRICT_IGNORABLES,

        // Separators
        // Notes:
        // - COMMA is a superset of STRICT_COMMA
        // - PERIOD is a superset of STRICT_PERIOD
        // - ALL_SEPARATORS is the union of COMMA, PERIOD, and OTHER_GROUPING_SEPARATORS
        // - STRICT_ALL_SEPARATORS is the union of STRICT_COMMA, STRICT_PERIOD, and OTHER_GRP_SEPARATORS
        COMMA,
        PERIOD,
        STRICT_COMMA,
        STRICT_PERIOD,
        APOSTROPHE_SIGN,
        OTHER_GROUPING_SEPARATORS,
        ALL_SEPARATORS,
        STRICT_ALL_SEPARATORS,

        // Symbols
        // TODO: NaN?
        MINUS_SIGN,
        PLUS_SIGN,
        PERCENT_SIGN,
        PERMILLE_SIGN,
        INFINITY_SIGN,

        // Currency Symbols
        DOLLAR_SIGN,
        POUND_SIGN,
        RUPEE_SIGN,
        YEN_SIGN,
        WON_SIGN,

        // Other
        DIGITS,

        // Combined Separators with Digits (for lead code points)
        DIGITS_OR_ALL_SEPARATORS,
        DIGITS_OR_STRICT_ALL_SEPARATORS,
    }

    /** Backing store for every set; filled once by the static initializer below. */
    private static final Map<Key, UnicodeSet> unicodeSets = new EnumMap<>(Key.class);

    /**
     * Gets the static-allocated UnicodeSet according to the provided key.
     *
     * @param key
     *            The desired UnicodeSet according to the enum in this file.
     * @return The requested UnicodeSet. Guaranteed to be frozen and non-null, but may be empty if an
     *         error occurred during data loading.
     */
    public static UnicodeSet get(Key key) {
        UnicodeSet candidate = unicodeSets.get(key);
        // A key that was never populated is reported as the empty set rather than null.
        return candidate == null ? UnicodeSet.EMPTY : candidate;
    }

    /**
     * Checks if the UnicodeSet given by key1 contains the given string.
     *
     * @param str
     *            The string to check.
     * @param key1
     *            The set to check.
     * @return key1 if the set contains str, or null if not.
     */
    public static Key chooseFrom(String str, Key key1) {
        return get(key1).contains(str) ? key1 : null;
    }

    /**
     * Checks if the UnicodeSet given by either key1 or key2 contains the string. key1 is tested
     * first, so it wins when both sets contain the string.
     *
     * @param str
     *            The string to check.
     * @param key1
     *            The first set to check.
     * @param key2
     *            The second set to check.
     * @return key1 if that set contains str; otherwise key2 if that set contains str; or null if
     *         neither set contains str.
     */
    public static Key chooseFrom(String str, Key key1, Key key2) {
        return get(key1).contains(str) ? key1 : chooseFrom(str, key2);
    }

    /**
     * Looks through all Currency-related sets for the given string, returning the first match or null if
     * no match was found. The sets are tried in the order dollar, pound, rupee, yen, won.
     *
     * @param str
     *            The string to check.
     * @return The key of the first currency set containing str, or null if none does.
     */
    public static Key chooseCurrency(String str) {
        if (get(Key.DOLLAR_SIGN).contains(str)) {
            return Key.DOLLAR_SIGN;
        } else if (get(Key.POUND_SIGN).contains(str)) {
            return Key.POUND_SIGN;
        } else if (get(Key.RUPEE_SIGN).contains(str)) {
            return Key.RUPEE_SIGN;
        } else if (get(Key.YEN_SIGN).contains(str)) {
            return Key.YEN_SIGN;
        } else if (get(Key.WON_SIGN).contains(str)) {
            return Key.WON_SIGN;
        } else {
            return null;
        }
    }

    /** Builds a new frozen set holding the union of the sets registered under k1 and k2. */
    private static UnicodeSet computeUnion(Key k1, Key k2) {
        return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).freeze();
    }

    /** Builds a new frozen set holding the union of the sets registered under k1, k2 and k3. */
    private static UnicodeSet computeUnion(Key k1, Key k2, Key k3) {
        return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).addAll(get(k3)).freeze();
    }

    /**
     * Parses the pattern into a frozen UnicodeSet and registers it under the key. Each key is
     * expected to be saved at most once (checked by assertion only).
     */
    private static void saveSet(Key key, String unicodeSetPattern) {
        assert unicodeSets.get(key) == null;
        unicodeSets.put(key, new UnicodeSet(unicodeSetPattern).freeze());
    }

    /**
     * Decides which key a pattern from the "parse" resource belongs to by looking for a
     * representative character in the pattern text. The checks run in a fixed order and the first
     * hit wins, so the order must not be changed casually.
     *
     * @param pattern
     *            The UnicodeSet pattern string read from the resource.
     * @param isLenient
     *            Whether the pattern came from a "lenient" table. Only comma and period have
     *            separate lenient and strict keys; all other symbols ignore this flag.
     * @return The key under which the pattern should be stored.
     * @throws AssertionError
     *             if the pattern contains none of the known representative characters.
     */
    private static Key classifyPattern(String pattern, boolean isLenient) {
        // There is both lenient and strict data for comma/period,
        // but not for any of the other symbols.
        if (pattern.indexOf('.') != -1) {
            return isLenient ? Key.PERIOD : Key.STRICT_PERIOD;
        } else if (pattern.indexOf(',') != -1) {
            return isLenient ? Key.COMMA : Key.STRICT_COMMA;
        } else if (pattern.indexOf('+') != -1) {
            return Key.PLUS_SIGN;
        } else if (pattern.indexOf('-') != -1) {
            return Key.MINUS_SIGN;
        } else if (pattern.indexOf('$') != -1) {
            return Key.DOLLAR_SIGN;
        } else if (pattern.indexOf('£') != -1) {
            return Key.POUND_SIGN;
        } else if (pattern.indexOf('₹') != -1) {
            return Key.RUPEE_SIGN;
        } else if (pattern.indexOf('¥') != -1) {
            return Key.YEN_SIGN;
        } else if (pattern.indexOf('₩') != -1) {
            return Key.WON_SIGN;
        } else if (pattern.indexOf('%') != -1) {
            return Key.PERCENT_SIGN;
        } else if (pattern.indexOf('‰') != -1) {
            return Key.PERMILLE_SIGN;
        } else if (pattern.indexOf('’') != -1) {
            return Key.APOSTROPHE_SIGN;
        } else {
            // TODO(ICU-20428): Make ICU automatically accept new classes?
            throw new AssertionError("Unknown class of parse lenients: " + pattern);
        }
    }

    /*
     parse{
         date{
             lenient{
                 "[\\--/]",
                 "[\\:∶]",
             }
         }
         general{
             lenient{
                 "[.․。︒﹒.。]",
                 "[\$﹩$$]",
                 "[£₤]",
                 "[₨₹{Rp}{Rs}]",
             }
         }
         number{
             lenient{
                 "[\\-‒⁻₋−➖﹣-]",
                 "[,،٫、︐︑﹐﹑,、]",
                 "[+⁺₊➕﬩﹢+]",
             }
             stricter{
                 "[,٫︐﹐,]",
                 "[.․﹒.。]",
             }
         }
     }
     */
    /**
     * Resource sink that walks the "parse" table (layout sketched in the comment above) and saves
     * one UnicodeSet per pattern found under the "general" and "number" contexts. The "date"
     * context is skipped.
     */
    static class ParseDataSink extends UResource.Sink {
        @Override
        public void put(android.icu.impl.UResource.Key key, Value value, boolean noFallback) {
            UResource.Table contextsTable = value.getTable();
            for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
                if (key.contentEquals("date")) {
                    // ignore
                    continue;
                }
                assert key.contentEquals("general") || key.contentEquals("number");
                UResource.Table strictnessTable = value.getTable();
                for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
                    // Any table name other than "lenient" is treated as strict.
                    boolean isLenient = key.contentEquals("lenient");
                    UResource.Array array = value.getArray();
                    for (int k = 0; k < array.getSize(); k++) {
                        array.getValue(k, value);
                        String str = value.toString();
                        saveSet(classifyPattern(str, isLenient), str);
                    }
                }
            }
        }
    }

    static {
        unicodeSets.put(Key.EMPTY, new UnicodeSet("[]").freeze());

        // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
        // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
        unicodeSets.put(Key.DEFAULT_IGNORABLES,
                new UnicodeSet("[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]").freeze());
        unicodeSets.put(Key.STRICT_IGNORABLES, new UnicodeSet("[[:Bidi_Control:]]").freeze());

        // CLDR provides data for comma, period, minus sign, and plus sign.
        ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle
                .getBundleInstance(ICUData.ICU_BASE_NAME, ULocale.ROOT);
        rb.getAllItemsWithFallback("parse", new ParseDataSink());

        // NOTE: It is OK for these assertions to fail if there was a no-data build.
        assert unicodeSets.containsKey(Key.COMMA);
        assert unicodeSets.containsKey(Key.STRICT_COMMA);
        assert unicodeSets.containsKey(Key.PERIOD);
        assert unicodeSets.containsKey(Key.STRICT_PERIOD);
        assert unicodeSets.containsKey(Key.APOSTROPHE_SIGN);

        UnicodeSet otherGrouping = new UnicodeSet(
                "[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]");
        // Go through get() so that a missing apostrophe set contributes nothing instead of
        // passing null to addAll(); computeUnion() reads its inputs the same way.
        otherGrouping.addAll(get(Key.APOSTROPHE_SIGN));
        unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS, otherGrouping.freeze());
        unicodeSets.put(Key.ALL_SEPARATORS,
                computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
        unicodeSets.put(Key.STRICT_ALL_SEPARATORS,
                computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));

        assert unicodeSets.containsKey(Key.MINUS_SIGN);
        assert unicodeSets.containsKey(Key.PLUS_SIGN);
        assert unicodeSets.containsKey(Key.PERCENT_SIGN);
        assert unicodeSets.containsKey(Key.PERMILLE_SIGN);

        unicodeSets.put(Key.INFINITY_SIGN, new UnicodeSet("[∞]").freeze());

        assert unicodeSets.containsKey(Key.DOLLAR_SIGN);
        assert unicodeSets.containsKey(Key.POUND_SIGN);
        assert unicodeSets.containsKey(Key.RUPEE_SIGN);
        assert unicodeSets.containsKey(Key.YEN_SIGN);
        assert unicodeSets.containsKey(Key.WON_SIGN);

        unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());

        // The combined sets depend on DIGITS and the separator unions stored above.
        unicodeSets.put(Key.DIGITS_OR_ALL_SEPARATORS,
                computeUnion(Key.DIGITS, Key.ALL_SEPARATORS));
        unicodeSets.put(Key.DIGITS_OR_STRICT_ALL_SEPARATORS,
                computeUnion(Key.DIGITS, Key.STRICT_ALL_SEPARATORS));
    }
}