/* GENERATED SOURCE. DO NOT MODIFY. */ // © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2003-2011, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package android.icu.text; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.List; import android.icu.impl.Assert; import android.icu.text.RBBIRuleBuilder.IntPair; import android.icu.util.CodePointTrie; import android.icu.util.MutableCodePointTrie; // // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules // (part of the rule building process.) // // Starting with the rules parse tree from the scanner, // // - Enumerate the set of UnicodeSets that are referenced // by the RBBI rules. // - compute a set of non-overlapping character ranges // with all characters within a range belonging to the same // set of input unicode sets. // - Derive a set of non-overlapping UnicodeSet (like things) // that will correspond to columns in the state table for // the RBBI execution engine. All characters within one // of these sets belong to the same set of the original // UnicodeSets from the user's rules. // - construct the trie table that maps input characters // to the index of the matching non-overlapping set of set from // the previous step. // class RBBISetBuilder { static class RangeDescriptor { int fStartChar = 0; // Start of range, unicode 32 bit value. int fEndChar = 0; // End of range, unicode 32 bit value. int fNum = 0; // runtime-mapped input value for this range. boolean fIncludesDict = false; // True if the range includes $dictionary. boolean fFirstInGroup = false; // True if first range in a group with the same fNum. List fIncludesSets; // vector of the the original // Unicode sets that include this range. // (Contains ptrs to uset nodes) RangeDescriptor fNext; // Next RangeDescriptor in the linked list. RangeDescriptor() { fIncludesSets = new ArrayList<>(); } RangeDescriptor(RangeDescriptor other) { fStartChar = other.fStartChar; fEndChar = other.fEndChar; fNum = other.fNum; fIncludesDict = other.fIncludesDict; fFirstInGroup = other.fFirstInGroup; fIncludesSets = new ArrayList<>(other.fIncludesSets); } //------------------------------------------------------------------------------------- // // RangeDesriptor::split() // //------------------------------------------------------------------------------------- void split(int where) { Assert.assrt(where>fStartChar && where<=fEndChar); RangeDescriptor nr = new RangeDescriptor(this); // RangeDescriptor copy constructor copies all fields. // Only need to update those that are different after the split. nr.fStartChar = where; this.fEndChar = where-1; nr.fNext = this.fNext; this.fNext = nr; // TODO: fIncludesSets is not updated. Check it out. // Probably because they haven't been populated yet, // but still sloppy. } /** * Test whether this range includes characters from the original Unicode Set named "dictionary". * * This function looks through the Unicode Sets that * the range includes, checking for one named "dictionary" */ // TODO: a faster way would be to find the set node for // "dictionary" just once, rather than looking it // up by name every time. // boolean isDictionaryRange() { for (int i=0; i=0) {printSets();} // Initialize the process by creating a single range encompassing all characters // that is in no sets. // fRangeList = new RangeDescriptor(); fRangeList.fStartChar = 0; fRangeList.fEndChar = 0x10ffff; // // Find the set of non-overlapping ranges of characters // for (RBBINode usetNode : fRB.fUSetNodes) { UnicodeSet inputSet = usetNode.fInputSet; int inputSetRangeCount = inputSet.getRangeCount(); int inputSetRangeIndex = 0; rlRange = fRangeList; for (;;) { if (inputSetRangeIndex >= inputSetRangeCount) { break; } int inputSetRangeBegin = inputSet.getRangeStart(inputSetRangeIndex); int inputSetRangeEnd = inputSet.getRangeEnd(inputSetRangeIndex); // skip over ranges from the range list that are completely // below the current range from the input unicode set. while (rlRange.fEndChar < inputSetRangeBegin) { rlRange = rlRange.fNext; } // If the start of the range from the range list is before with // the start of the range from the unicode set, split the range list range // in two, with one part being before (wholly outside of) the unicode set // and the other containing the rest. // Then continue the loop; the post-split current range will then be skipped // over if (rlRange.fStartChar < inputSetRangeBegin) { rlRange.split(inputSetRangeBegin); continue; } // Same thing at the end of the ranges... // If the end of the range from the range list doesn't coincide with // the end of the range from the unicode set, split the range list // range in two. The first part of the split range will be // wholly inside the Unicode set. if (rlRange.fEndChar > inputSetRangeEnd) { rlRange.split(inputSetRangeEnd+1); } // The current rlRange is now entirely within the UnicodeSet range. // Add this unicode set to the list of sets for this rlRange if (rlRange.fIncludesSets.indexOf(usetNode) == -1) { rlRange.fIncludesSets.add(usetNode); } // Advance over ranges that we are finished with. if (inputSetRangeEnd == rlRange.fEndChar) { inputSetRangeIndex++; } rlRange = rlRange.fNext; } } if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("range")>=0) { printRanges();} // // Group the above ranges, with each group consisting of one or more // ranges that are in exactly the same set of original UnicodeSets. // The groups are numbered, and these group numbers are the set of // input symbols recognized by the run-time state machine. // // Numbering: # 0 (state table column 0) is unused. // # 1 is reserved - table column 1 is for end-of-input // # 2 is reserved - table column 2 is for beginning-of-input // # 3 is the first range list. // RangeDescriptor rlSearchRange; int dictGroupCount = 0; for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange.fNext) { if (rlRange.fIncludesSets.equals(rlSearchRange.fIncludesSets)) { rlRange.fNum = rlSearchRange.fNum; rlRange.fIncludesDict = rlSearchRange.fIncludesDict; break; } } if (rlRange.fNum == 0) { rlRange.fFirstInGroup = true; if (rlRange.isDictionaryRange()) { rlRange.fNum = ++dictGroupCount; rlRange.fIncludesDict = true; } else { fGroupCount++; rlRange.fNum = fGroupCount + 2; addValToSets(rlRange.fIncludesSets, fGroupCount + 2); } } } // Move the character category numbers for any dictionary ranges up, so that they // immediately follow the non-dictionary ranges. fDictCategoriesStart = fGroupCount + 3; for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { if (rlRange.fIncludesDict) { rlRange.fNum += fDictCategoriesStart - 1; if (rlRange.fFirstInGroup) { addValToSets(rlRange.fIncludesSets, rlRange.fNum); } } } fGroupCount += dictGroupCount; // Handle input sets that contain the special string {eof}. // Column 1 of the state table is reserved for EOF on input. // Column 2 is reserved for before-the-start-input. // (This column can be optimized away later if there are no rule // references to {bof}.) // Add this column value (1 or 2) to the equivalent expression // subtree for each UnicodeSet that contains the string {eof} // Because {bof} and {eof} are not a characters in the normal sense, // they doesn't affect the computation of ranges or TRIE. String eofString = "eof"; String bofString = "bof"; for (RBBINode usetNode : fRB.fUSetNodes) { UnicodeSet inputSet = usetNode.fInputSet; if (inputSet.contains(eofString)) { addValToSet(usetNode, 1); } if (inputSet.contains(bofString)) { addValToSet(usetNode, 2); fSawBOF = true; } } if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();} if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();} } private static final int MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE = 255; /** * Build the Trie table for mapping UChar32 values to the corresponding * range group number. */ void buildTrie() { fTrie = new MutableCodePointTrie(0, // Initial value for all code points. 0); // Error value for out-of-range input. for (RangeDescriptor rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { fTrie.setRange(rlRange.fStartChar, // Range start rlRange.fEndChar, // Range end (inclusive) rlRange.fNum // value for range ); } } /** * Merge two character categories that have been identified as having equivalent behavior. * The ranges belonging to the second category (table column) will be added to the first. * @param categories the pair of categories to be merged. */ void mergeCategories(IntPair categories) { assert(categories.first >= 1); assert(categories.second > categories.first); assert((categories.first < fDictCategoriesStart && categories.second < fDictCategoriesStart) || (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart)); for (RangeDescriptor rd = fRangeList; rd != null; rd = rd.fNext) { int rangeNum = rd.fNum; if (rangeNum == categories.second) { rd.fNum = categories.first; } else if (rangeNum > categories.second) { rd.fNum--; } } --fGroupCount; if (categories.second <= fDictCategoriesStart) { --fDictCategoriesStart; } } //----------------------------------------------------------------------------------- // // freezeTrieIfNotYet() Ensure the trie is frozen. Shared code by getTrieSize // and serializeTrie. // //----------------------------------------------------------------------------------- void freezeTrieIfNotYet() { if (fFrozenTrie == null) { boolean use8Bits = getNumCharCategories() <= MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE; fFrozenTrie = fTrie.buildImmutable(CodePointTrie.Type.FAST, use8Bits ? CodePointTrie.ValueWidth.BITS_8 : CodePointTrie.ValueWidth.BITS_16); fTrie = null; } } //----------------------------------------------------------------------------------- // // getTrieSize() Return the size that will be required to serialize the Trie. // //----------------------------------------------------------------------------------- int getTrieSize() { freezeTrieIfNotYet(); return fFrozenTrie.toBinary(new ByteArrayOutputStream()); } //----------------------------------------------------------------------------------- // // serializeTrie() Write the serialized trie to an output stream // //----------------------------------------------------------------------------------- void serializeTrie(OutputStream os) throws IOException { freezeTrieIfNotYet(); fFrozenTrie.toBinary(os); } //------------------------------------------------------------------------ // // addValToSets Add a runtime-mapped input value to each uset from a // list of uset nodes. (val corresponds to a state table column.) // For each of the original Unicode sets - which correspond // directly to uset nodes - a logically equivalent expression // is constructed in terms of the remapped runtime input // symbol set. This function adds one runtime input symbol to // a list of sets. // // The "logically equivalent expression" is the tree for an // or-ing together of all of the symbols that go into the set. // //------------------------------------------------------------------------ void addValToSets(List sets, int val) { for (RBBINode usetNode : sets) { addValToSet(usetNode, val); } } void addValToSet(RBBINode usetNode, int val) { RBBINode leafNode = new RBBINode(RBBINode.leafChar); leafNode.fVal = val; if (usetNode.fLeftChild == null) { usetNode.fLeftChild = leafNode; leafNode.fParent = usetNode; } else { // There are already input symbols present for this set. // Set up an OR node, with the previous stuff as the left child // and the new value as the right child. RBBINode orNode = new RBBINode(RBBINode.opOr); orNode.fLeftChild = usetNode.fLeftChild; orNode.fRightChild = leafNode; orNode.fLeftChild.fParent = orNode; orNode.fRightChild.fParent = orNode; usetNode.fLeftChild = orNode; orNode.fParent = usetNode; } } //------------------------------------------------------------------------ // // getNumCharCategories // //------------------------------------------------------------------------ int getNumCharCategories() { return fGroupCount + 3; } //------------------------------------------------------------------------ // // getDictCategoriesStart // //------------------------------------------------------------------------ int getDictCategoriesStart() { return fDictCategoriesStart; } //------------------------------------------------------------------------ // // sawBOF // //------------------------------------------------------------------------ boolean sawBOF() { return fSawBOF; } //------------------------------------------------------------------------ // // getFirstChar Given a runtime RBBI character category, find // the first UChar32 that is in the set of chars // in the category. //------------------------------------------------------------------------ int getFirstChar(int category) { RangeDescriptor rlRange; int retVal = -1; for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { if (rlRange.fNum == category) { retVal = rlRange.fStartChar; break; } } return retVal; } //------------------------------------------------------------------------ // // printRanges A debugging function. // dump out all of the range definitions. // //------------------------------------------------------------------------ ///CLOVER:OFF void printRanges() { RangeDescriptor rlRange; int i; System.out.print("\n\n Nonoverlapping Ranges ...\n"); for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { System.out.printf("%04x-%04x ", rlRange.fStartChar, rlRange.fEndChar); for (i=0; i= fDictCategoriesStart) { System.out.print(" ");} for (i=0; i