/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
//    Copyright (C) 2002-2014, International Business Machines Corporation and others.
//    All Rights Reserved.
//
//

package android.icu.text;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import android.icu.impl.Assert;
import android.icu.impl.ICUBinary;
import android.icu.impl.ICUDebug;
import android.icu.impl.RBBIDataWrapper;

/**
 * Main class for building (compiling) break rules into the tables
 * required by the runtime RBBI engine.
 */
class RBBIRuleBuilder {

    String fDebugEnv;              // controls debug trace output
    String fRules;                 // The rule string that we are compiling
    StringBuilder fStrippedRules;  // The rule string, with comments stripped.
    RBBIRuleScanner fScanner;      // The scanner.

    //
    //  There are four separate parse trees generated, one for each of the
    //    forward rules, reverse rules, safe forward rules and safe reverse rules.
    //  This array references the root of each of the trees.
    //  Only fForwardTree data is actually used to generate a state table.
    //  The other three are retained for back compatibility with old rule files,
    //  which may have safe and reverse rules. These are still parsed.
    //
    RBBINode[] fTreeRoots = new RBBINode[4];
    static final int fForwardTree = 0;  // Indexes into the above fTreeRoots array
    static final int fReverseTree = 1;  //   for each of the trees.
    static final int fSafeFwdTree = 2;  //   (in C, these are pointer variables and
    static final int fSafeRevTree = 3;  //    there is no array.)
    int fDefaultTree = fForwardTree;    // For rules not qualified with a !,
                                        //   the tree to which they belong.

    boolean fChainRules;                // True for chained Unicode TR style rules.
                                        // False for traditional regexp rules.

    boolean fLookAheadHardBreak;        // True: Look ahead matches cause an
                                        // immediate break, no continuing for the
                                        // longest match.

    RBBISetBuilder fSetBuilder;         // Set and Character Category builder.
    List<RBBINode> fUSetNodes;          // Vector of all used nodes.
    RBBITableBuilder fForwardTable;     // State transition tables

    //
    // Status {tag} values.   These structures are common to all of the rule sets (Forward, Reverse, etc.).
    //
    Map<Set<Integer>, Integer> fStatusSets = new HashMap<>(); // Status value sets encountered so far.
                                                              //  Map Key is the set of values.
                                                              //  Map Value is the runtime array index.

    List<Integer> fRuleStatusVals;      // List of Integer objects.  Has same layout as the
                                        //   runtime array of status (tag) values -
                                        //     number of values in group 1
                                        //        first status value in group 1
                                        //        2nd status value in group 1
                                        //        ...
                                        //     number of values in group 2
                                        //        first status value in group 2
                                        //        etc.
                                        //
    // Error codes from ICU4C.
    //    using these simplified the porting, and consolidated the
    //    creation of Java exceptions
    //
    static final int U_ILLEGAL_CHAR_FOUND = 12;               /**< Character conversion: Illegal input sequence/combination of input units. */
    static final int U_BRK_ERROR_START = 0x10200;             /**< Start of codes indicating Break Iterator failures */
    static final int U_BRK_INTERNAL_ERROR = 0x10201;          /**< An internal error (bug) was detected. */
    static final int U_BRK_HEX_DIGITS_EXPECTED = 0x10202;     /**< Hex digits expected as part of a escaped char in a rule. */
    static final int U_BRK_SEMICOLON_EXPECTED = 0x10203;      /**< Missing ';' at the end of a RBBI rule. */
    static final int U_BRK_RULE_SYNTAX = 0x10204;             /**< Syntax error in RBBI rule. */
    static final int U_BRK_UNCLOSED_SET = 0x10205;            /**< UnicodeSet writing an RBBI rule missing a closing ']'. */
    static final int U_BRK_ASSIGN_ERROR = 0x10206;            /**< Syntax error in RBBI rule assignment statement. */
    static final int U_BRK_VARIABLE_REDFINITION = 0x10207;    /**< RBBI rule $Variable redefined. */
    static final int U_BRK_MISMATCHED_PAREN = 0x10208;        /**< Mis-matched parentheses in an RBBI rule. */
    static final int U_BRK_NEW_LINE_IN_QUOTED_STRING = 0x10209; /**< Missing closing quote in an RBBI rule. */
    static final int U_BRK_UNDEFINED_VARIABLE = 0x1020a;      /**< Use of an undefined $Variable in an RBBI rule. */
    static final int U_BRK_INIT_ERROR = 0x1020b;              /**< Initialization failure.  Probable missing ICU Data. */
    static final int U_BRK_RULE_EMPTY_SET = 0x1020c;          /**< Rule contains an empty Unicode Set. */
    static final int U_BRK_UNRECOGNIZED_OPTION = 0x1020d;     /**< !!option in RBBI rules not recognized. */
    static final int U_BRK_MALFORMED_RULE_TAG = 0x1020e;      /**< The {nnn} tag on a rule is mal formed */
    static final int U_BRK_MALFORMED_SET = 0x1020f;

    static final int U_BRK_ERROR_LIMIT = 0x10210;             /**< This must always be the last value to indicate the limit for Break Iterator failures */

    //----------------------------------------------------------------------------------------
    //
    //  Constructor.
    //
    //----------------------------------------------------------------------------------------
    /**
     * Creates a builder for the given rule source. Nothing is parsed here;
     * parsing and table construction happen in {@link #build(OutputStream)}.
     *
     * @param rules the break rule source text to compile
     */
    RBBIRuleBuilder(String rules) {
        // Debug tracing is driven by the "rbbi" ICUDebug setting; null means no tracing.
        fDebugEnv = ICUDebug.enabled("rbbi") ? ICUDebug.value("rbbi") : null;
        fRules = rules;
        fStrippedRules = new StringBuilder(rules);
        fUSetNodes = new ArrayList<>();
        fRuleStatusVals = new ArrayList<>();
        fScanner = new RBBIRuleScanner(this);
        fSetBuilder = new RBBISetBuilder(this);
    }

    //----------------------------------------------------------------------------------------
    //
    //   flattenData() -  Collect up the compiled RBBI rule data and put it into
    //                    the format for saving in ICU data files,
    //
    //                    See the ICU4C file common/rbbidata.h for a detailed description.
    //
    //----------------------------------------------------------------------------------------
    /**
     * Rounds {@code i} up to the next multiple of 8.
     *
     * @param i a size or offset in bytes
     * @return the smallest multiple of 8 that is {@code >= i} (for values that do not overflow)
     */
    static final int align8(int i) {
        return (i + 7) & 0xfffffff8;
    }

    /**
     * Writes zero bytes to {@code dos} until {@code pos} reaches a multiple of 8.
     *
     * @param dos the stream receiving the padding bytes
     * @param pos the current output position in bytes
     * @return the position after padding
     * @throws IOException if writing to the stream fails
     */
    private static int padTo8(DataOutputStream dos, int pos) throws IOException {
        while (pos % 8 != 0) {
            dos.write(0);
            pos += 1;
        }
        return pos;
    }

    /**
     * Serializes the compiled rule data to {@code os}: an ICU data header, the
     * RBBIDataHeader, the forward and safe-reverse state tables, the trie, the
     * status {tag} table and finally the stripped rule source as null-terminated
     * UTF-8. Each section after the header is padded to an 8 byte boundary.
     *
     * @param os the stream that receives the serialized data
     * @throws IOException if writing to the stream fails
     */
    void flattenData(OutputStream os) throws IOException {
        DataOutputStream dos = new DataOutputStream(os);

        // Remove whitespace from the rules to make it smaller.
        // The rule parser has already removed comments.
        String strippedRules = RBBIRuleScanner.stripRules(fStrippedRules.toString());

        // Calculate the size of each section in the data in bytes.
        //   Sizes here are padded up to a multiple of 8 for better memory alignment.
        //   Sections sizes actually stored in the header are for the actual data
        //     without the padding.
        //
        int headerSize       = RBBIDataWrapper.DH_SIZE * 4;     // align8(sizeof(RBBIDataHeader));
        int forwardTableSize = align8(fForwardTable.getTableSize());
        int reverseTableSize = align8(fForwardTable.getSafeTableSize());
        int trieSize         = align8(fSetBuilder.getTrieSize());
        int statusTableSize  = align8(fRuleStatusVals.size() * 4);

        byte[] strippedRulesUTF8 = strippedRules.getBytes(StandardCharsets.UTF_8);
        int rulesSize = align8(strippedRulesUTF8.length + 1);   // +1 for the null terminator

        int totalSize = headerSize
                + forwardTableSize
                + reverseTableSize
                + statusTableSize + trieSize + rulesSize;
        int outputPos = 0;               // Track stream position, starting from RBBIDataHeader.

        //
        // Write out an ICU Data Header
        //
        ICUBinary.writeHeader(RBBIDataWrapper.DATA_FORMAT, RBBIDataWrapper.FORMAT_VERSION, 0, dos);

        //
        // Write out the RBBIDataHeader
        //
        int[] header = new int[RBBIDataWrapper.DH_SIZE];                 // sizeof struct RBBIDataHeader
        header[RBBIDataWrapper.DH_MAGIC]         = 0xb1a0;
        header[RBBIDataWrapper.DH_FORMATVERSION] = RBBIDataWrapper.FORMAT_VERSION;
        header[RBBIDataWrapper.DH_LENGTH]        = totalSize;            // fLength, the total size of all rule sections.
        header[RBBIDataWrapper.DH_CATCOUNT]      = fSetBuilder.getNumCharCategories();

        header[RBBIDataWrapper.DH_FTABLE]        = headerSize;           // fFTable
        header[RBBIDataWrapper.DH_FTABLELEN]     = forwardTableSize;     // fTableLen

        header[RBBIDataWrapper.DH_RTABLE]        = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable
        header[RBBIDataWrapper.DH_RTABLELEN]     = reverseTableSize;     // fRTableLen

        header[RBBIDataWrapper.DH_TRIE]          = header[RBBIDataWrapper.DH_RTABLE]
                                                     + header[RBBIDataWrapper.DH_RTABLELEN]; // fTrie
        header[RBBIDataWrapper.DH_TRIELEN]       = fSetBuilder.getTrieSize(); // fTrieLen (unpadded)
        header[RBBIDataWrapper.DH_STATUSTABLE]   = header[RBBIDataWrapper.DH_TRIE] + trieSize;
        header[RBBIDataWrapper.DH_STATUSTABLELEN] = statusTableSize;     // fStatusTableLen
        header[RBBIDataWrapper.DH_RULESOURCE]    = header[RBBIDataWrapper.DH_STATUSTABLE]
                                                     + statusTableSize;
        header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRulesUTF8.length; // byte length, without terminator
        for (int headerWord : header) {
            dos.writeInt(headerWord);
            outputPos += 4;
        }

        // Write out the actual state tables.
        RBBIDataWrapper.RBBIStateTable table = fForwardTable.exportTable();
        Assert.assrt(outputPos == header[RBBIDataWrapper.DH_FTABLE]);
        outputPos += table.put(dos);

        table = fForwardTable.exportSafeTable();
        Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RTABLE]);
        outputPos += table.put(dos);

        // write out the Trie table
        Assert.assrt(outputPos == header[RBBIDataWrapper.DH_TRIE]);
        fSetBuilder.serializeTrie(os);
        outputPos += header[RBBIDataWrapper.DH_TRIELEN];
        outputPos = padTo8(dos, outputPos);

        // Write out the status {tag} table.
        Assert.assrt(outputPos == header[RBBIDataWrapper.DH_STATUSTABLE]);
        for (Integer val : fRuleStatusVals) {
            dos.writeInt(val.intValue());
            outputPos += 4;
        }
        outputPos = padTo8(dos, outputPos);

        // Write out the stripped rules (rules with extra spaces removed
        //   These go last in the data area, even though they are not last in the header.
        Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RULESOURCE]);
        dos.write(strippedRulesUTF8, 0, strippedRulesUTF8.length);
        dos.write(0); // Null termination
        outputPos += strippedRulesUTF8.length + 1;
        padTo8(dos, outputPos);
    }

    //----------------------------------------------------------------------------------------
    //
    //  compileRules          compile source rules, placing the compiled form into a output stream
    //                        The compiled form is identical to that from ICU4C (Big Endian).
    //
    //----------------------------------------------------------------------------------------
    /**
     * Compiles {@code rules} and writes the binary form to {@code os}.
     *
     * @param rules the break rule source text
     * @param os    the stream that receives the compiled data
     * @throws IOException if writing to the stream fails
     */
    static void compileRules(String rules, OutputStream os) throws IOException {
        //
        // Read the input rules, generate a parse tree, symbol table,
        // and list of all Unicode Sets referenced by the rules.
        //
        RBBIRuleBuilder builder = new RBBIRuleBuilder(rules);
        builder.build(os);
    }

    /**
     * Compile rules to the binary form, write that to an output stream.
     *
     * @param os the stream that receives the compiled data
     * @throws IOException if writing to the stream fails
     */
    void build(OutputStream os) throws IOException {
        fScanner.parse();

        //
        // UnicodeSet processing.
        //    Munge the Unicode Sets to create an initial set of character categories.
        //
        fSetBuilder.buildRanges();

        //
        //   Generate the DFA state transition table.
        //
        fForwardTable = new RBBITableBuilder(this, fForwardTree);
        fForwardTable.buildForwardTable();

        // State table and character category optimization.
        // Merge equivalent rows and columns.
        // Note that this process alters the initial set of character categories,
        // causing the representation of UnicodeSets in the parse tree to become invalid.
        optimizeTables();
        fForwardTable.buildSafeReverseTable();

        if (fDebugEnv != null && fDebugEnv.indexOf("states") >= 0) {
            fForwardTable.printStates();
            fForwardTable.printRuleStatusTable();
            fForwardTable.printReverseTable();
        }

        //    Generate the mapping tables (TRIE) from input code points to
        //    the character categories.
        //
        fSetBuilder.buildTrie();

        //
        //   Package up the compiled data, writing it to an output stream
        //      in the serialization format.  This is the same as the ICU4C runtime format.
        //
        flattenData(os);
    }

    /**
     * A mutable pair of ints. {@link #optimizeTables()} uses it to carry a pair
     * of character class indexes between the table builder and the set builder.
     */
    static class IntPair {
        int first = 0;
        int second = 0;

        IntPair() {}

        IntPair(int f, int s) {
            first = f;
            second = s;
        }
    }

    /**
     * Repeatedly merges duplicate character classes (table columns) and removes
     * duplicate states until a full pass changes nothing.
     */
    void optimizeTables() {
        boolean didSomething;
        do {
            didSomething = false;
            // Begin looking for duplicates with char class 3.
            // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
            // and should not have other categories merged into them.
            // NOTE(review): findDuplCharClassFrom presumably updates duplPair in place with
            // the duplicate pair it found - confirm against RBBITableBuilder.
            IntPair duplPair = new IntPair(3, 0);
            while (fForwardTable.findDuplCharClassFrom(duplPair)) {
                fSetBuilder.mergeCategories(duplPair);
                fForwardTable.removeColumn(duplPair.second);
                didSomething = true;
            }

            while (fForwardTable.removeDuplicateStates() > 0) {
                didSomething = true;
            }
        } while (didSomething);
    }
}