1450 lines
60 KiB
Java
1450 lines
60 KiB
Java
![]() |
/* GENERATED SOURCE. DO NOT MODIFY. */
|
||
|
// © 2016 and later: Unicode, Inc. and others.
|
||
|
// License & terms of use: http://www.unicode.org/copyright.html
|
||
|
/*
|
||
|
**********************************************************************
|
||
|
* Copyright (c) 2002-2016, International Business Machines
|
||
|
* Corporation and others. All Rights Reserved.
|
||
|
**********************************************************************
|
||
|
*/
|
||
|
|
||
|
package android.icu.text;
|
||
|
|
||
|
import java.util.ArrayList;
|
||
|
import java.util.Arrays;
|
||
|
import java.util.Collection;
|
||
|
import java.util.HashSet;
|
||
|
import java.util.List;
|
||
|
import java.util.Set;
|
||
|
import java.util.SortedSet;
|
||
|
import java.util.TreeSet;
|
||
|
|
||
|
import android.icu.impl.Assert;
|
||
|
import android.icu.impl.RBBIDataWrapper;
|
||
|
import android.icu.text.RBBIRuleBuilder.IntPair;
|
||
|
|
||
|
/**
|
||
|
* This class is part of the RBBI rule compiler.
|
||
|
* It builds the state transition table used by the RBBI runtime
|
||
|
* from the expression syntax tree generated by the rule scanner.
|
||
|
*
|
||
|
* This class is part of the RBBI implementation only.
|
||
|
* There is no user-visible public API here.
|
||
|
*/
|
||
|
class RBBITableBuilder {
|
||
|
|
||
|
//
|
||
|
// RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
|
||
|
// one for each state.
|
||
|
static class RBBIStateDescriptor {
|
||
|
boolean fMarked;
|
||
|
int fAccepting;
|
||
|
int fLookAhead;
|
||
|
SortedSet<Integer> fTagVals;
|
||
|
int fTagsIdx;
|
||
|
Set<RBBINode> fPositions; // Set of parse tree positions associated
|
||
|
// with this state. Unordered (it's a set).
|
||
|
// UVector contents are RBBINode *
|
||
|
|
||
|
int[] fDtran; // Transitions out of this state.
|
||
|
// indexed by input character
|
||
|
// contents is int index of dest state
|
||
|
// in RBBITableBuilder.fDStates
|
||
|
|
||
|
RBBIStateDescriptor(int maxInputSymbol) {
|
||
|
fTagVals = new TreeSet<>();
|
||
|
fPositions = new HashSet<>();
|
||
|
fDtran = new int[maxInputSymbol+1]; // fDtran needs to be pre-sized.
|
||
|
// It is indexed by input symbols, and will
|
||
|
// hold the next state number for each
|
||
|
// symbol.
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
private RBBIRuleBuilder fRB;
|
||
|
|
||
|
/** The array index into RBBIRuleBuilder.fTreeRoots for the parse tree to operate on. */
|
||
|
private int fRootIx;
|
||
|
|
||
|
/** D states (Aho's terminology). Index is state number. */
|
||
|
private List<RBBIStateDescriptor> fDStates;
|
||
|
|
||
|
/** Synthesized safe table, a List of row arrays. */
|
||
|
private List<short[]> fSafeTable;
|
||
|
|
||
|
private static final int MAX_STATE_FOR_8BITS_TABLE = 255;
|
||
|
|
||
|
/** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
|
||
|
int[] fLookAheadRuleMap;
|
||
|
|
||
|
/** Counter used when assigning lookahead rule numbers.
|
||
|
* Contains the last look-ahead number already in use.
|
||
|
* The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved
|
||
|
* for non-lookahead accepting states. See the declarations of RBBIStateTableRowT. */
|
||
|
int fLASlotsInUse = RBBIDataWrapper.ACCEPTING_UNCONDITIONAL;
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// Constructor for RBBITableBuilder.
|
||
|
//
|
||
|
// rootNode is an index into the array of root nodes that is held by
|
||
|
// the overall RBBIRuleBuilder.
|
||
|
//-----------------------------------------------------------------------------
|
||
|
RBBITableBuilder(RBBIRuleBuilder rb, int rootNodeIx) {
|
||
|
fRootIx = rootNodeIx;
|
||
|
fRB = rb;
|
||
|
fDStates = new ArrayList<>();
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// RBBITableBuilder::buildForwardTable - This is the main function for building
|
||
|
// the DFA state transition table from the RBBI rules parse tree.
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void buildForwardTable() {
|
||
|
// If there were no rules, just return. This situation can easily arise
|
||
|
// for the reverse rules.
|
||
|
if (fRB.fTreeRoots[fRootIx]==null) {
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Walk through the tree, replacing any references to $variables with a copy of the
|
||
|
// parse tree for the substitution expression.
|
||
|
//
|
||
|
fRB.fTreeRoots[fRootIx] = fRB.fTreeRoots[fRootIx].flattenVariables(0);
|
||
|
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("ftree")>=0) {
|
||
|
System.out.println("Parse tree after flattening variable references.");
|
||
|
fRB.fTreeRoots[fRootIx].printTree(true);
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// If the rules contained any references to {bof}
|
||
|
// add a {bof} <cat> <former root of tree> to the
|
||
|
// tree. Means that all matches must start out with the
|
||
|
// {bof} fake character.
|
||
|
//
|
||
|
if (fRB.fSetBuilder.sawBOF()) {
|
||
|
RBBINode bofTop = new RBBINode(RBBINode.opCat);
|
||
|
RBBINode bofLeaf = new RBBINode(RBBINode.leafChar);
|
||
|
bofTop.fLeftChild = bofLeaf;
|
||
|
bofTop.fRightChild = fRB.fTreeRoots[fRootIx];
|
||
|
bofLeaf.fParent = bofTop;
|
||
|
bofLeaf.fVal = 2; // Reserved value for {bof}.
|
||
|
fRB.fTreeRoots[fRootIx] = bofTop;
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Add a unique right-end marker to the expression.
|
||
|
// Appears as a cat-node, left child being the original tree,
|
||
|
// right child being the end marker.
|
||
|
//
|
||
|
RBBINode cn = new RBBINode(RBBINode.opCat);
|
||
|
cn.fLeftChild = fRB.fTreeRoots[fRootIx];
|
||
|
fRB.fTreeRoots[fRootIx].fParent = cn;
|
||
|
RBBINode endMarkerNode = cn.fRightChild = new RBBINode(RBBINode.endMark);
|
||
|
cn.fRightChild.fParent = cn;
|
||
|
fRB.fTreeRoots[fRootIx] = cn;
|
||
|
|
||
|
//
|
||
|
// Replace all references to UnicodeSets with the tree for the equivalent
|
||
|
// expression.
|
||
|
//
|
||
|
fRB.fTreeRoots[fRootIx].flattenSets();
|
||
|
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("stree")>=0) {
|
||
|
System.out.println("Parse tree after flattening Unicode Set references.");
|
||
|
fRB.fTreeRoots[fRootIx].printTree(true);
|
||
|
}
|
||
|
|
||
|
|
||
|
//
|
||
|
// calculate the functions nullable, firstpos, lastpos and followpos on
|
||
|
// nodes in the parse tree.
|
||
|
// See the algorithm description in Aho.
|
||
|
// Understanding how this works by looking at the code alone will be
|
||
|
// nearly impossible.
|
||
|
//
|
||
|
calcNullable(fRB.fTreeRoots[fRootIx]);
|
||
|
calcFirstPos(fRB.fTreeRoots[fRootIx]);
|
||
|
calcLastPos(fRB.fTreeRoots[fRootIx]);
|
||
|
calcFollowPos(fRB.fTreeRoots[fRootIx]);
|
||
|
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("pos")>=0) {
|
||
|
System.out.print("\n");
|
||
|
printPosSets(fRB.fTreeRoots[fRootIx]);
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// For "chained" rules, modify the followPos sets
|
||
|
//
|
||
|
if (fRB.fChainRules) {
|
||
|
calcChainedFollowPos(fRB.fTreeRoots[fRootIx], endMarkerNode);
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// BOF (start of input) test fixup.
|
||
|
//
|
||
|
if (fRB.fSetBuilder.sawBOF()) {
|
||
|
bofFixup();
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Build the DFA state transition tables.
|
||
|
//
|
||
|
buildStateTable();
|
||
|
mapLookAheadRules();
|
||
|
flagAcceptingStates();
|
||
|
flagLookAheadStates();
|
||
|
flagTaggedStates();
|
||
|
|
||
|
//
|
||
|
// Update the global table of rule status {tag} values
|
||
|
// The rule builder has a global vector of status values that are common
|
||
|
// for all tables. Merge the ones from this table into the global set.
|
||
|
//
|
||
|
mergeRuleStatusVals();
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// calcNullable. Impossible to explain succinctly. See Aho, section 3.9
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void calcNullable(RBBINode n) {
|
||
|
if (n == null) {
|
||
|
return;
|
||
|
}
|
||
|
if (n.fType == RBBINode.setRef ||
|
||
|
n.fType == RBBINode.endMark ) {
|
||
|
// These are non-empty leaf node types.
|
||
|
n.fNullable = false;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
if (n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) {
|
||
|
// Lookahead marker node. It's a leaf, so no recursion on children.
|
||
|
// It's nullable because it does not match any literal text from the input stream.
|
||
|
n.fNullable = true;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
|
||
|
// The node is not a leaf.
|
||
|
// Calculate nullable on its children.
|
||
|
calcNullable(n.fLeftChild);
|
||
|
calcNullable(n.fRightChild);
|
||
|
|
||
|
// Apply functions from table 3.40 in Aho
|
||
|
if (n.fType == RBBINode.opOr) {
|
||
|
n.fNullable = n.fLeftChild.fNullable || n.fRightChild.fNullable;
|
||
|
}
|
||
|
else if (n.fType == RBBINode.opCat) {
|
||
|
n.fNullable = n.fLeftChild.fNullable && n.fRightChild.fNullable;
|
||
|
}
|
||
|
else if (n.fType == RBBINode.opStar || n.fType == RBBINode.opQuestion) {
|
||
|
n.fNullable = true;
|
||
|
}
|
||
|
else {
|
||
|
n.fNullable = false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void calcFirstPos(RBBINode n) {
|
||
|
if (n == null) {
|
||
|
return;
|
||
|
}
|
||
|
if (n.fType == RBBINode.leafChar ||
|
||
|
n.fType == RBBINode.endMark ||
|
||
|
n.fType == RBBINode.lookAhead ||
|
||
|
n.fType == RBBINode.tag) {
|
||
|
// These are non-empty leaf node types.
|
||
|
n.fFirstPosSet.add(n);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
// The node is not a leaf.
|
||
|
// Calculate firstPos on its children.
|
||
|
calcFirstPos(n.fLeftChild);
|
||
|
calcFirstPos(n.fRightChild);
|
||
|
|
||
|
// Apply functions from table 3.40 in Aho
|
||
|
if (n.fType == RBBINode.opOr) {
|
||
|
n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
|
||
|
n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
|
||
|
}
|
||
|
else if (n.fType == RBBINode.opCat) {
|
||
|
n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
|
||
|
if (n.fLeftChild.fNullable) {
|
||
|
n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
|
||
|
}
|
||
|
}
|
||
|
else if (n.fType == RBBINode.opStar ||
|
||
|
n.fType == RBBINode.opQuestion ||
|
||
|
n.fType == RBBINode.opPlus) {
|
||
|
n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void calcLastPos(RBBINode n) {
|
||
|
if (n == null) {
|
||
|
return;
|
||
|
}
|
||
|
if (n.fType == RBBINode.leafChar ||
|
||
|
n.fType == RBBINode.endMark ||
|
||
|
n.fType == RBBINode.lookAhead ||
|
||
|
n.fType == RBBINode.tag) {
|
||
|
// These are non-empty leaf node types.
|
||
|
n.fLastPosSet.add(n);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
// The node is not a leaf.
|
||
|
// Calculate lastPos on its children.
|
||
|
calcLastPos(n.fLeftChild);
|
||
|
calcLastPos(n.fRightChild);
|
||
|
|
||
|
// Apply functions from table 3.40 in Aho
|
||
|
if (n.fType == RBBINode.opOr) {
|
||
|
n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
|
||
|
n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
|
||
|
}
|
||
|
else if (n.fType == RBBINode.opCat) {
|
||
|
n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
|
||
|
if (n.fRightChild.fNullable) {
|
||
|
n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
|
||
|
}
|
||
|
}
|
||
|
else if (n.fType == RBBINode.opStar ||
|
||
|
n.fType == RBBINode.opQuestion ||
|
||
|
n.fType == RBBINode.opPlus) {
|
||
|
n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void calcFollowPos(RBBINode n) {
|
||
|
if (n == null ||
|
||
|
n.fType == RBBINode.leafChar ||
|
||
|
n.fType == RBBINode.endMark) {
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
calcFollowPos(n.fLeftChild);
|
||
|
calcFollowPos(n.fRightChild);
|
||
|
|
||
|
// Aho rule #1
|
||
|
if (n.fType == RBBINode.opCat) {
|
||
|
for (RBBINode i /* is 'i' in Aho's description */ : n.fLeftChild.fLastPosSet) {
|
||
|
i.fFollowPos.addAll(n.fRightChild.fFirstPosSet);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Aho rule #2
|
||
|
if (n.fType == RBBINode.opStar ||
|
||
|
n.fType == RBBINode.opPlus) {
|
||
|
for (RBBINode i /* again, n and i are the names from Aho's description */ : n.fLastPosSet) {
|
||
|
i.fFollowPos.addAll(n.fFirstPosSet);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
|
||
|
// as roots of a rule to a destination vector.
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void addRuleRootNodes(List<RBBINode> dest, RBBINode node) {
|
||
|
if (node == null) {
|
||
|
return;
|
||
|
}
|
||
|
if (node.fRuleRoot) {
|
||
|
dest.add(node);
|
||
|
// Note: rules cannot nest. If we found a rule start node,
|
||
|
// no child node can also be a start node.
|
||
|
return;
|
||
|
}
|
||
|
addRuleRootNodes(dest, node.fLeftChild);
|
||
|
addRuleRootNodes(dest, node.fRightChild);
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// calcChainedFollowPos. Modify the previously calculated followPos sets
|
||
|
// to implement rule chaining. NOT described by Aho
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void calcChainedFollowPos(RBBINode tree, RBBINode endMarkNode) {
|
||
|
|
||
|
List<RBBINode> leafNodes = new ArrayList<>();
|
||
|
|
||
|
// get a list all leaf nodes
|
||
|
tree.findNodes(leafNodes, RBBINode.leafChar);
|
||
|
|
||
|
// Collect all leaf nodes that can start matches for rules
|
||
|
// with inbound chaining enabled, which is the union of the
|
||
|
// firstPosition sets from each of the rule root nodes.
|
||
|
|
||
|
List<RBBINode> ruleRootNodes = new ArrayList<>();
|
||
|
addRuleRootNodes(ruleRootNodes, tree);
|
||
|
|
||
|
Set<RBBINode> matchStartNodes = new HashSet<>();
|
||
|
for (RBBINode node: ruleRootNodes) {
|
||
|
if (node.fChainIn) {
|
||
|
matchStartNodes.addAll(node.fFirstPosSet);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Iterate over all leaf nodes,
|
||
|
//
|
||
|
for (RBBINode endNode : leafNodes) {
|
||
|
|
||
|
// Identify leaf nodes that correspond to overall rule match positions.
|
||
|
// These include the endMarkNode in their followPos sets.
|
||
|
//
|
||
|
// Note: do not consider other end marker nodes, those that are added to
|
||
|
// look-ahead rules. These can't chain; a match immediately stops
|
||
|
// further matching. This leaves exactly one end marker node, the one
|
||
|
// at the end of the complete tree.
|
||
|
|
||
|
if (!endNode.fFollowPos.contains(endMarkNode)) {
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// We've got a node that can end a match.
|
||
|
|
||
|
// Now iterate over the nodes that can start a match, looking for ones
|
||
|
// with the same char class as our ending node.
|
||
|
for (RBBINode startNode : matchStartNodes) {
|
||
|
if (startNode.fType != RBBINode.leafChar) {
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
if (endNode.fVal == startNode.fVal) {
|
||
|
// The end val (character class) of one possible match is the
|
||
|
// same as the start of another.
|
||
|
|
||
|
// Add all nodes from the followPos of the start node to the
|
||
|
// followPos set of the end node, which will have the effect of
|
||
|
// letting matches transition from a match state at endNode
|
||
|
// to the second char of a match starting with startNode.
|
||
|
endNode.fFollowPos.addAll(startNode.fFollowPos);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// bofFixup. Fixup for state tables that include {bof} beginning of input testing.
|
||
|
// Do an swizzle similar to chaining, modifying the followPos set of
|
||
|
// the bofNode to include the followPos nodes from other {bot} nodes
|
||
|
// scattered through the tree.
|
||
|
//
|
||
|
// This function has much in common with calcChainedFollowPos().
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void bofFixup() {
|
||
|
//
|
||
|
// The parse tree looks like this ...
|
||
|
// fTree root --. <cat>
|
||
|
// / \
|
||
|
// <cat> <#end node>
|
||
|
// / \
|
||
|
// <bofNode> rest
|
||
|
// of tree
|
||
|
//
|
||
|
// We will be adding things to the followPos set of the <bofNode>
|
||
|
//
|
||
|
RBBINode bofNode = fRB.fTreeRoots[fRootIx].fLeftChild.fLeftChild;
|
||
|
Assert.assrt(bofNode.fType == RBBINode.leafChar);
|
||
|
Assert.assrt(bofNode.fVal == 2);
|
||
|
|
||
|
// Get all nodes that can be the start a match of the user-written rules
|
||
|
// (excluding the fake bofNode)
|
||
|
// We want the nodes that can start a match in the
|
||
|
// part labeled "rest of tree"
|
||
|
//
|
||
|
Set<RBBINode> matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet;
|
||
|
for (RBBINode startNode : matchStartNodes) {
|
||
|
if (startNode.fType != RBBINode.leafChar) {
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
if (startNode.fVal == bofNode.fVal) {
|
||
|
// We found a leaf node corresponding to a {bof} that was
|
||
|
// explicitly written into a rule.
|
||
|
// Add everything from the followPos set of this node to the
|
||
|
// followPos set of the fake bofNode at the start of the tree.
|
||
|
//
|
||
|
bofNode.fFollowPos.addAll(startNode.fFollowPos);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// buildStateTable() Determine the set of runtime DFA states and the
|
||
|
// transition tables for these states, by the algorithm
|
||
|
// of fig. 3.44 in Aho.
|
||
|
//
|
||
|
// Most of the comments are quotes of Aho's psuedo-code.
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void buildStateTable() {
|
||
|
//
|
||
|
// Add a dummy state 0 - the stop state. Not from Aho.
|
||
|
int lastInputSymbol = fRB.fSetBuilder.getNumCharCategories() - 1;
|
||
|
RBBIStateDescriptor failState = new RBBIStateDescriptor(lastInputSymbol);
|
||
|
fDStates.add(failState);
|
||
|
|
||
|
// initially, the only unmarked state in Dstates is firstpos(root),
|
||
|
// where toot is the root of the syntax tree for (r)#;
|
||
|
RBBIStateDescriptor initialState = new RBBIStateDescriptor(lastInputSymbol);
|
||
|
initialState.fPositions.addAll(fRB.fTreeRoots[fRootIx].fFirstPosSet);
|
||
|
fDStates.add(initialState);
|
||
|
|
||
|
// while there is an unmarked state T in Dstates do begin
|
||
|
for (;;) {
|
||
|
RBBIStateDescriptor T = null;
|
||
|
int tx;
|
||
|
for (tx=1; tx<fDStates.size(); tx++) {
|
||
|
RBBIStateDescriptor temp = fDStates.get(tx);
|
||
|
if (temp.fMarked == false) {
|
||
|
T = temp;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
if (T == null) {
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
// mark T;
|
||
|
T.fMarked = true;
|
||
|
|
||
|
// for each input symbol a do begin
|
||
|
int a;
|
||
|
for (a = 1; a<=lastInputSymbol; a++) {
|
||
|
// let U be the set of positions that are in followpos(p)
|
||
|
// for some position p in T
|
||
|
// such that the symbol at position p is a;
|
||
|
Set<RBBINode> U = null;
|
||
|
for (RBBINode p : T.fPositions) {
|
||
|
if ((p.fType == RBBINode.leafChar) && (p.fVal == a)) {
|
||
|
if (U == null) {
|
||
|
U = new HashSet<>();
|
||
|
}
|
||
|
U.addAll(p.fFollowPos);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// if U is not empty and not in DStates then
|
||
|
int ux = 0;
|
||
|
boolean UinDstates = false;
|
||
|
if (U != null) {
|
||
|
Assert.assrt(U.size() > 0);
|
||
|
int ix;
|
||
|
for (ix=0; ix<fDStates.size(); ix++) {
|
||
|
RBBIStateDescriptor temp2;
|
||
|
temp2 = fDStates.get(ix);
|
||
|
if (U.equals(temp2.fPositions)) {
|
||
|
U = temp2.fPositions;
|
||
|
ux = ix;
|
||
|
UinDstates = true;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Add U as an unmarked state to Dstates
|
||
|
if (!UinDstates)
|
||
|
{
|
||
|
RBBIStateDescriptor newState = new RBBIStateDescriptor(lastInputSymbol);
|
||
|
newState.fPositions = U;
|
||
|
fDStates.add(newState);
|
||
|
ux = fDStates.size()-1;
|
||
|
}
|
||
|
|
||
|
// Dtran[T, a] := U;
|
||
|
T.fDtran[a] = ux;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* mapLookAheadRules
|
||
|
*
|
||
|
*/
|
||
|
void mapLookAheadRules() {
|
||
|
fLookAheadRuleMap = new int[fRB.fScanner.numRules() + 1];
|
||
|
|
||
|
for (RBBIStateDescriptor sd: fDStates) {
|
||
|
int laSlotForState = 0;
|
||
|
|
||
|
// Establish the look-ahead slot for this state, if the state covers
|
||
|
// any look-ahead nodes - corresponding to the '/' in look-ahead rules.
|
||
|
|
||
|
// If any of the look-ahead nodes already have a slot assigned, use it,
|
||
|
// otherwise assign a new one.
|
||
|
|
||
|
boolean sawLookAheadNode = false;
|
||
|
for (RBBINode node: sd.fPositions) {
|
||
|
if (node.fType != RBBINode.lookAhead) {
|
||
|
continue;
|
||
|
}
|
||
|
sawLookAheadNode = true;
|
||
|
int ruleNum = node.fVal; // Set when rule was originally parsed.
|
||
|
assert(ruleNum < fLookAheadRuleMap.length);
|
||
|
assert(ruleNum > 0);
|
||
|
int laSlot = fLookAheadRuleMap[ruleNum];
|
||
|
if (laSlot != 0) {
|
||
|
if (laSlotForState == 0) {
|
||
|
laSlotForState = laSlot;
|
||
|
} else {
|
||
|
// TODO: figure out if this can fail, change to setting an error code if so.
|
||
|
assert(laSlot == laSlotForState);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if (!sawLookAheadNode) {
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
if (laSlotForState == 0) {
|
||
|
laSlotForState = ++fLASlotsInUse;
|
||
|
}
|
||
|
|
||
|
// For each look ahead node covered by this state,
|
||
|
// set the mapping from the node's rule number to the look ahead slot.
|
||
|
// There can be multiple nodes/rule numbers going to the same la slot.
|
||
|
|
||
|
for (RBBINode node: sd.fPositions) {
|
||
|
if (node.fType != RBBINode.lookAhead) {
|
||
|
continue;
|
||
|
}
|
||
|
int ruleNum = node.fVal; // Set when rule was originally parsed.
|
||
|
int existingVal = fLookAheadRuleMap[ruleNum];
|
||
|
assert(existingVal == 0 || existingVal == laSlotForState);
|
||
|
fLookAheadRuleMap[ruleNum] = laSlotForState;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// flagAcceptingStates Identify accepting states.
|
||
|
// First get a list of all of the end marker nodes.
|
||
|
// Then, for each state s,
|
||
|
// if s contains one of the end marker nodes in its list of tree positions then
|
||
|
// s is an accepting state.
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void flagAcceptingStates() {
|
||
|
List<RBBINode> endMarkerNodes = new ArrayList<>();
|
||
|
RBBINode endMarker;
|
||
|
int i;
|
||
|
int n;
|
||
|
|
||
|
fRB.fTreeRoots[fRootIx].findNodes(endMarkerNodes, RBBINode.endMark);
|
||
|
|
||
|
for (i=0; i<endMarkerNodes.size(); i++) {
|
||
|
endMarker = endMarkerNodes.get(i);
|
||
|
for (n=0; n<fDStates.size(); n++) {
|
||
|
RBBIStateDescriptor sd = fDStates.get(n);
|
||
|
if (sd.fPositions.contains(endMarker)) {
|
||
|
// Any non-zero value for fAccepting means this is an accepting node.
|
||
|
// The value is what will be returned to the user as the break status.
|
||
|
// If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1).
|
||
|
|
||
|
if (sd.fAccepting==0) {
|
||
|
// State hasn't been marked as accepting yet. Do it now.
|
||
|
sd.fAccepting = fLookAheadRuleMap[endMarker.fVal];
|
||
|
if (sd.fAccepting == 0) {
|
||
|
sd.fAccepting = RBBIDataWrapper.ACCEPTING_UNCONDITIONAL;
|
||
|
}
|
||
|
}
|
||
|
if (sd.fAccepting==RBBIDataWrapper.ACCEPTING_UNCONDITIONAL && endMarker.fVal != 0) {
|
||
|
// Both lookahead and non-lookahead accepting for this state.
|
||
|
// Favor the look-ahead, because a look-ahead match needs to
|
||
|
// immediately stop the run-time engine. First match, not longest.
|
||
|
sd.fAccepting = fLookAheadRuleMap[endMarker.fVal];
|
||
|
}
|
||
|
// implicit else:
|
||
|
// if sd.fAccepting already had a value other than 0 or 1, leave it be.
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// flagLookAheadStates Very similar to flagAcceptingStates, above.
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void flagLookAheadStates() {
|
||
|
List<RBBINode> lookAheadNodes = new ArrayList<>();
|
||
|
RBBINode lookAheadNode;
|
||
|
int i;
|
||
|
int n;
|
||
|
|
||
|
fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead);
|
||
|
for (i=0; i<lookAheadNodes.size(); i++) {
|
||
|
lookAheadNode = lookAheadNodes.get(i);
|
||
|
for (n=0; n<fDStates.size(); n++) {
|
||
|
RBBIStateDescriptor sd = fDStates.get(n);
|
||
|
if (sd.fPositions.contains(lookAheadNode)) {
|
||
|
int lookaheadSlot = fLookAheadRuleMap[lookAheadNode.fVal];
|
||
|
assert(sd.fLookAhead == 0 || sd.fLookAhead == lookaheadSlot);
|
||
|
sd.fLookAhead = lookaheadSlot;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// flagTaggedStates
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void flagTaggedStates() {
|
||
|
List<RBBINode> tagNodes = new ArrayList<>();
|
||
|
RBBINode tagNode;
|
||
|
int i;
|
||
|
int n;
|
||
|
|
||
|
fRB.fTreeRoots[fRootIx].findNodes(tagNodes, RBBINode.tag);
|
||
|
for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
|
||
|
tagNode = tagNodes.get(i);
|
||
|
|
||
|
for (n=0; n<fDStates.size(); n++) { // For each state s (row in the state table)
|
||
|
RBBIStateDescriptor sd = fDStates.get(n);
|
||
|
if (sd.fPositions.contains(tagNode)) { // if s include the tag node t
|
||
|
sd.fTagVals.add(tagNode.fVal);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// mergeRuleStatusVals
|
||
|
//
|
||
|
// Allocate positions in the global array of rule status {tag} values
|
||
|
//
|
||
|
// The RBBI runtime uses an array of {sets of status values} that can
|
||
|
// be returned for boundaries. Each accepting state that has non-zero
|
||
|
// status includes an index into this array. The format of the array
|
||
|
// is
|
||
|
// Num of status values in group 1
|
||
|
// status val
|
||
|
// status val
|
||
|
// ...
|
||
|
// Num of status vals in group 2
|
||
|
// status val
|
||
|
// status val
|
||
|
// ...
|
||
|
// etc.
|
||
|
//
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
|
||
|
void mergeRuleStatusVals() {
|
||
|
//
|
||
|
// The basic outline of what happens here is this...
|
||
|
//
|
||
|
// for each state in this state table
|
||
|
// if the status tag list for this state is in the global statuses list
|
||
|
// record where and
|
||
|
// continue with the next state
|
||
|
// else
|
||
|
// add the tag list for this state to the global list.
|
||
|
//
|
||
|
int n;
|
||
|
|
||
|
// Pre-load a single tag of {0} into the table.
|
||
|
// We will need this as a default, for rule sets with no explicit tagging,
|
||
|
// or with explicit tagging of {0}.
|
||
|
if (fRB.fRuleStatusVals.size() == 0) {
|
||
|
fRB.fRuleStatusVals.add(1); // Num of statuses in group
|
||
|
fRB.fRuleStatusVals.add(0); // and our single status of zero
|
||
|
|
||
|
SortedSet<Integer> s0 = new TreeSet<>(); // mapping for rules with no explicit tagging
|
||
|
fRB.fStatusSets.put(s0, 0); // (key is an empty set).
|
||
|
|
||
|
SortedSet<Integer> s1 = new TreeSet<>(); // mapping for rules with explicit tagging of {0}
|
||
|
s1.add(0);
|
||
|
fRB.fStatusSets.put(s1, 0);
|
||
|
}
|
||
|
|
||
|
// For each state, check whether the state's status tag values are
|
||
|
// already entered into the status values array, and add them if not.
|
||
|
for (n=0; n<fDStates.size(); n++) {
|
||
|
RBBIStateDescriptor sd = fDStates.get(n);
|
||
|
Set<Integer> statusVals = sd.fTagVals;
|
||
|
Integer arrayIndexI = fRB.fStatusSets.get(statusVals);
|
||
|
if (arrayIndexI == null) {
|
||
|
// This is the first encounter of this set of status values.
|
||
|
// Add them to the statusSets map, This map associates
|
||
|
// the set of status values with an index in the runtime status
|
||
|
// values array.
|
||
|
arrayIndexI = fRB.fRuleStatusVals.size();
|
||
|
fRB.fStatusSets.put(statusVals, arrayIndexI);
|
||
|
|
||
|
// Add the new set of status values to the vector of values that
|
||
|
// will eventually become the array used by the runtime engine.
|
||
|
fRB.fRuleStatusVals.add(statusVals.size());
|
||
|
fRB.fRuleStatusVals.addAll(statusVals);
|
||
|
}
|
||
|
|
||
|
// Save the runtime array index back into the state descriptor.
|
||
|
sd.fTagsIdx = arrayIndexI.intValue();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
|
||
|
// for each node in the tree.
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
|
||
|
void printPosSets(RBBINode n) {
|
||
|
if (n==null) {
|
||
|
return;
|
||
|
}
|
||
|
RBBINode.printNode(n);
|
||
|
System.out.print(" Nullable: " + n.fNullable);
|
||
|
|
||
|
System.out.print(" firstpos: ");
|
||
|
printSet(n.fFirstPosSet);
|
||
|
|
||
|
System.out.print(" lastpos: ");
|
||
|
printSet(n.fLastPosSet);
|
||
|
|
||
|
System.out.print(" followpos: ");
|
||
|
printSet(n.fFollowPos);
|
||
|
|
||
|
printPosSets(n.fLeftChild);
|
||
|
printPosSets(n.fRightChild);
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Find duplicate (redundant) character classes. Begin looking with categories.first.
|
||
|
* Duplicates, if found are returned in the categories parameter.
|
||
|
* This is an iterator-like function, used to identify character classes
|
||
|
* (state table columns) that can be eliminated.
|
||
|
* @param categories in/out parameter, specifies where to start looking for duplicates,
|
||
|
* and returns the first pair of duplicates found, if any.
|
||
|
* @return true if duplicate char classes were found, false otherwise.
|
||
|
* @hide draft / provisional / internal are hidden on Android
|
||
|
*/
|
||
|
boolean findDuplCharClassFrom(RBBIRuleBuilder.IntPair categories) {
|
||
|
int numStates = fDStates.size();
|
||
|
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||
|
|
||
|
int table_base = 0;
|
||
|
int table_dupl = 0;
|
||
|
for (; categories.first < numCols-1; ++categories.first) {
|
||
|
// Note: dictionary & non-dictionary columns cannot be merged.
|
||
|
// The limitSecond value prevents considering mixed pairs.
|
||
|
// Dictionary categories are >= DictCategoriesStart.
|
||
|
// Non dict categories are < DictCategoriesStart.
|
||
|
int limitSecond = categories.first < fRB.fSetBuilder.getDictCategoriesStart() ?
|
||
|
fRB.fSetBuilder.getDictCategoriesStart() : numCols;
|
||
|
for (categories.second=categories.first+1; categories.second < limitSecond; ++categories.second) {
|
||
|
for (int state=0; state<numStates; state++) {
|
||
|
RBBIStateDescriptor sd = fDStates.get(state);
|
||
|
table_base = sd.fDtran[categories.first];
|
||
|
table_dupl = sd.fDtran[categories.second];
|
||
|
if (table_base != table_dupl) {
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
if (table_base == table_dupl) {
|
||
|
return true;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Remove a column from the state table. Used when two character categories
|
||
|
* have been found equivalent, and merged together, to eliminate the unneeded table column.
|
||
|
*/
|
||
|
void removeColumn(int column) {
|
||
|
int numStates = fDStates.size();
|
||
|
for (int state=0; state<numStates; state++) {
|
||
|
RBBIStateDescriptor sd = fDStates.get(state);
|
||
|
assert(column < sd.fDtran.length);
|
||
|
int[] newArray = Arrays.copyOf(sd.fDtran, sd.fDtran.length - 1);
|
||
|
System.arraycopy(sd.fDtran, column+1, newArray, column, newArray.length - column);
|
||
|
sd.fDtran = newArray;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Find duplicate (redundant) states, beginning at the specified pair,
|
||
|
* within this state table. This is an iterator-like function, used to
|
||
|
* identify states (state table rows) that can be eliminated.
|
||
|
* @param states in/out parameter, specifies where to start looking for duplicates,
|
||
|
* and returns the first pair of duplicates found, if any.
|
||
|
* @return true if duplicate states were found, false otherwise.
|
||
|
* @hide draft / provisional / internal are hidden on Android
|
||
|
*/
|
||
|
boolean findDuplicateState(RBBIRuleBuilder.IntPair states) {
|
||
|
int numStates = fDStates.size();
|
||
|
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||
|
|
||
|
for (; states.first<numStates-1; ++states.first) {
|
||
|
RBBIStateDescriptor firstSD = fDStates.get(states.first);
|
||
|
for (states.second=states.first+1; states.second<numStates; ++states.second) {
|
||
|
RBBIStateDescriptor duplSD = fDStates.get(states.second);
|
||
|
if (firstSD.fAccepting != duplSD.fAccepting ||
|
||
|
firstSD.fLookAhead != duplSD.fLookAhead ||
|
||
|
firstSD.fTagsIdx != duplSD.fTagsIdx) {
|
||
|
continue;
|
||
|
}
|
||
|
boolean rowsMatch = true;
|
||
|
for (int col=0; col < numCols; ++col) {
|
||
|
int firstVal = firstSD.fDtran[col];
|
||
|
int duplVal = duplSD.fDtran[col];
|
||
|
if (!((firstVal == duplVal) ||
|
||
|
((firstVal == states.first || firstVal == states.second) &&
|
||
|
(duplVal == states.first || duplVal == states.second)))) {
|
||
|
rowsMatch = false;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
if (rowsMatch) {
|
||
|
return true;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Find the next duplicate state in the safe reverse table. An iterator function.
|
||
|
* @param states in/out parameter, specifies where to start looking for duplicates,
|
||
|
* and returns the first pair of duplicates found, if any.
|
||
|
* @return true if duplicate states were found, false otherwise.
|
||
|
* @hide draft / provisional / internal are hidden on Android
|
||
|
*/
|
||
|
boolean findDuplicateSafeState(RBBIRuleBuilder.IntPair states) {
|
||
|
int numStates = fSafeTable.size();
|
||
|
|
||
|
for (; states.first<numStates-1; ++states.first) {
|
||
|
short[] firstRow = fSafeTable.get(states.first);
|
||
|
for (states.second=states.first+1; states.second<numStates; ++states.second) {
|
||
|
short[] duplRow = fSafeTable.get(states.second);
|
||
|
boolean rowsMatch = true;
|
||
|
int numCols = firstRow.length;
|
||
|
for (int col=0; col < numCols; ++col) {
|
||
|
int firstVal = firstRow[col];
|
||
|
int duplVal = duplRow[col];
|
||
|
if (!((firstVal == duplVal) ||
|
||
|
((firstVal == states.first || firstVal == states.second) &&
|
||
|
(duplVal == states.first || duplVal == states.second)))) {
|
||
|
rowsMatch = false;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
if (rowsMatch) {
|
||
|
return true;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Remove a duplicate state (row) from the state table. All references to the deleted (second) state
|
||
|
* are redirected to first state.
|
||
|
* @param duplStates The duplicate pair of states.
|
||
|
* @hide draft / provisional / internal are hidden on Android
|
||
|
*/
|
||
|
void removeState(IntPair duplStates) {
|
||
|
final int keepState = duplStates.first;
|
||
|
final int duplState = duplStates.second;
|
||
|
assert(keepState < duplState);
|
||
|
assert(duplState < fDStates.size());
|
||
|
|
||
|
fDStates.remove(duplState);
|
||
|
|
||
|
int numStates = fDStates.size();
|
||
|
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||
|
for (int state=0; state<numStates; ++state) {
|
||
|
RBBIStateDescriptor sd = fDStates.get(state);
|
||
|
for (int col=0; col<numCols; col++) {
|
||
|
int existingVal = sd.fDtran[col];
|
||
|
int newVal = existingVal;
|
||
|
if (existingVal == duplState) {
|
||
|
newVal = keepState;
|
||
|
} else if (existingVal > duplState) {
|
||
|
newVal = existingVal - 1;
|
||
|
}
|
||
|
sd.fDtran[col] = newVal;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Remove a duplicate state from the safe table.
|
||
|
* @param duplStates The duplicate pair of states. The first is kept, the second is removed.
|
||
|
* All references to the second in the state table are retargeted
|
||
|
* to the first.
|
||
|
* @hide draft / provisional / internal are hidden on Android
|
||
|
*/
|
||
|
void removeSafeState(IntPair duplStates) {
|
||
|
final int keepState = duplStates.first;
|
||
|
final int duplState = duplStates.second;
|
||
|
assert(keepState < duplState);
|
||
|
assert(duplState < fSafeTable.size());
|
||
|
|
||
|
fSafeTable.remove(duplState);
|
||
|
int numStates = fSafeTable.size();
|
||
|
for (int state=0; state<numStates; ++state) {
|
||
|
short[] row = fSafeTable.get(state);
|
||
|
for (int col=0; col<row.length; col++) {
|
||
|
int existingVal = row[col];
|
||
|
int newVal = existingVal;
|
||
|
if (existingVal == duplState) {
|
||
|
newVal = keepState;
|
||
|
} else if (existingVal > duplState) {
|
||
|
newVal = existingVal - 1;
|
||
|
}
|
||
|
row[col] = (short)newVal;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Check for, and remove duplicate states (table rows).
|
||
|
* @return the number of states removed.
|
||
|
* @hide draft / provisional / internal are hidden on Android
|
||
|
*/
|
||
|
int removeDuplicateStates() {
|
||
|
IntPair dupls = new IntPair(3, 0);
|
||
|
int numStatesRemoved = 0;
|
||
|
|
||
|
while (findDuplicateState(dupls)) {
|
||
|
// System.out.printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
|
||
|
removeState(dupls);
|
||
|
++numStatesRemoved;
|
||
|
}
|
||
|
return numStatesRemoved;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Calculate the size in bytes of the serialized form of this state transition table,
|
||
|
* which is identical to the ICU4C runtime form.
|
||
|
* Refer to common/rbbidata.h from ICU4C for the declarations of the structures
|
||
|
* being matched by this calculation.
|
||
|
*/
|
||
|
int getTableSize() {
|
||
|
if (fRB.fTreeRoots[fRootIx] == null) {
|
||
|
return 0;
|
||
|
}
|
||
|
int size = RBBIDataWrapper.RBBIStateTable.fHeaderSize; // The header, with no rows to the table.
|
||
|
int numRows = fDStates.size();
|
||
|
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||
|
boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
|
||
|
int rowSize = (use8Bits ? 1 : 2 ) * (RBBIDataWrapper.NEXTSTATES + numCols);
|
||
|
size += numRows * rowSize;
|
||
|
size = (size + 7) & ~7; // round up to a multiple of 8 bytes
|
||
|
return size;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Create a RBBIDataWrapper.RBBIStateTable for a newly compiled table.
|
||
|
* RBBIDataWrapper.RBBIStateTable is similar to struct RBBIStateTable in ICU4C,
|
||
|
* in common/rbbidata.h
|
||
|
*/
|
||
|
RBBIDataWrapper.RBBIStateTable exportTable() {
|
||
|
int state;
|
||
|
int col;
|
||
|
|
||
|
RBBIDataWrapper.RBBIStateTable table = new RBBIDataWrapper.RBBIStateTable();
|
||
|
if (fRB.fTreeRoots[fRootIx] == null) {
|
||
|
return table;
|
||
|
}
|
||
|
|
||
|
Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
|
||
|
fDStates.size() < 0x7fff);
|
||
|
table.fNumStates = fDStates.size();
|
||
|
table.fDictCategoriesStart = fRB.fSetBuilder.getDictCategoriesStart();
|
||
|
table.fLookAheadResultsSize =
|
||
|
fLASlotsInUse == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
|
||
|
boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE;
|
||
|
|
||
|
// Size of table size in shorts.
|
||
|
int rowLen = RBBIDataWrapper.NEXTSTATES + fRB.fSetBuilder.getNumCharCategories(); // Row Length in shorts.
|
||
|
int tableSize;
|
||
|
if (use8Bits) {
|
||
|
tableSize = (getTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize); // fTable length in bytes.
|
||
|
table.fTable = new char[tableSize];
|
||
|
table.fRowLen = rowLen; // Row length in bytes.
|
||
|
} else {
|
||
|
tableSize = (getTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize) / 2; // fTable length in shorts.
|
||
|
table.fTable = new char[tableSize];
|
||
|
table.fRowLen = rowLen * 2; // Row length in bytes.
|
||
|
}
|
||
|
|
||
|
if (fRB.fLookAheadHardBreak) {
|
||
|
table.fFlags |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK;
|
||
|
}
|
||
|
if (fRB.fSetBuilder.sawBOF()) {
|
||
|
table.fFlags |= RBBIDataWrapper.RBBI_BOF_REQUIRED;
|
||
|
}
|
||
|
if (use8Bits) {
|
||
|
table.fFlags |= RBBIDataWrapper.RBBI_8BITS_ROWS;
|
||
|
}
|
||
|
|
||
|
int numCharCategories = fRB.fSetBuilder.getNumCharCategories();
|
||
|
for (state=0; state<table.fNumStates; state++) {
|
||
|
RBBIStateDescriptor sd = fDStates.get(state);
|
||
|
int row = state*rowLen;
|
||
|
if (use8Bits) {
|
||
|
Assert.assrt (0 <= sd.fAccepting && sd.fAccepting <= 255);
|
||
|
Assert.assrt (0 <= sd.fLookAhead && sd.fLookAhead <= 255);
|
||
|
} else {
|
||
|
Assert.assrt (0 <= sd.fAccepting && sd.fAccepting <= 0xffff);
|
||
|
Assert.assrt (0 <= sd.fLookAhead && sd.fLookAhead <= 0xffff);
|
||
|
}
|
||
|
table.fTable[row + RBBIDataWrapper.ACCEPTING] = (char)sd.fAccepting;
|
||
|
table.fTable[row + RBBIDataWrapper.LOOKAHEAD] = (char)sd.fLookAhead;
|
||
|
table.fTable[row + RBBIDataWrapper.TAGSIDX] = (char)sd.fTagsIdx;
|
||
|
for (col=0; col<numCharCategories; col++) {
|
||
|
if (use8Bits) {
|
||
|
Assert.assrt (0 <= sd.fDtran[col] && sd.fDtran[col] <= MAX_STATE_FOR_8BITS_TABLE);
|
||
|
}
|
||
|
table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = (char)sd.fDtran[col];
|
||
|
}
|
||
|
}
|
||
|
return table;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Synthesize a safe state table from the main state table.
|
||
|
*/
|
||
|
void buildSafeReverseTable() {
|
||
|
// Safe Reverse table construction is described in more detail in the corresponding
|
||
|
// function in ICU4C, in source/common/rbbitblb.cpp. Not duplicated here because
|
||
|
// it is too likely to get out of sync.
|
||
|
|
||
|
// Each safe pair is stored as two chars in the safePair stringBuilder.
|
||
|
StringBuilder safePairs = new StringBuilder();
|
||
|
|
||
|
int numCharClasses = fRB.fSetBuilder.getNumCharCategories();
|
||
|
int numStates = fDStates.size();
|
||
|
|
||
|
for (int c1=0; c1<numCharClasses; ++c1) {
|
||
|
for (int c2=0; c2 < numCharClasses; ++c2) {
|
||
|
int wantedEndState = -1;
|
||
|
int endState = 0;
|
||
|
for (int startState = 1; startState < numStates; ++startState) {
|
||
|
RBBIStateDescriptor startStateD = fDStates.get(startState);
|
||
|
int s2 = startStateD.fDtran[c1];
|
||
|
RBBIStateDescriptor s2StateD = fDStates.get(s2);
|
||
|
endState = s2StateD.fDtran[c2];
|
||
|
if (wantedEndState < 0) {
|
||
|
wantedEndState = endState;
|
||
|
} else {
|
||
|
if (wantedEndState != endState) {
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if (wantedEndState == endState) {
|
||
|
safePairs.append((char)c1);
|
||
|
safePairs.append((char)c2);
|
||
|
// System.out.printf("(%d, %d) ", c1, c2);
|
||
|
}
|
||
|
}
|
||
|
// System.out.printf("\n");
|
||
|
}
|
||
|
|
||
|
// Populate the initial safe table.
|
||
|
// The table as a whole is a List<short[]>
|
||
|
// Row 0 is the stop state.
|
||
|
// Row 1 is the start sate.
|
||
|
// Row 2 and beyond are other states, initially one per char class, but
|
||
|
// after initial construction, many of the states will be combined, compacting the table.)
|
||
|
// The String holds the nextState data only. The four leading fields of a row, fAccepting,
|
||
|
// fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
|
||
|
|
||
|
assert(fSafeTable == null);
|
||
|
fSafeTable = new ArrayList<>();
|
||
|
for (int row=0; row<numCharClasses + 2; ++row) {
|
||
|
fSafeTable.add(new short[numCharClasses]);
|
||
|
}
|
||
|
|
||
|
// From the start state, each input char class transitions to the state for that input.
|
||
|
short[] startState = fSafeTable.get(1);
|
||
|
for (int charClass=0; charClass < numCharClasses; ++charClass) {
|
||
|
// Note: +2 to skip the start & stop state rows.
|
||
|
startState[charClass] = (short)(charClass+2);
|
||
|
}
|
||
|
|
||
|
// Initially make every other state table row look like the start state row
|
||
|
// (except for the stop state, which remains all 0)
|
||
|
for (int row=2; row<numCharClasses+2; ++row) {
|
||
|
System.arraycopy(startState, 0, fSafeTable.get(row), 0, startState.length);
|
||
|
}
|
||
|
|
||
|
// Run through the safe pairs, set the next state to zero when pair has been seen.
|
||
|
// Zero being the stop state, meaning we found a safe point.
|
||
|
for (int pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
|
||
|
int c1 = safePairs.charAt(pairIdx);
|
||
|
int c2 = safePairs.charAt(pairIdx + 1);
|
||
|
|
||
|
short[] rowState = fSafeTable.get(c2 + 2);
|
||
|
rowState[c1] = 0;
|
||
|
}
|
||
|
|
||
|
// Remove duplicate or redundant rows from the table.
|
||
|
RBBIRuleBuilder.IntPair states = new RBBIRuleBuilder.IntPair(1, 0);
|
||
|
while (findDuplicateSafeState(states)) {
|
||
|
// System.out.printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
|
||
|
removeSafeState(states);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Calculate the size of the runtime form of this safe state table.
|
||
|
*/
|
||
|
int getSafeTableSize() {
|
||
|
if (fSafeTable == null) {
|
||
|
return 0;
|
||
|
}
|
||
|
int size = RBBIDataWrapper.RBBIStateTable.fHeaderSize; // The header, with no rows to the table.
|
||
|
int numRows = fSafeTable.size();
|
||
|
int numCols = fSafeTable.get(0).length;
|
||
|
boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
|
||
|
|
||
|
int rowSize = (use8Bits ? 1 : 2 ) * (RBBIDataWrapper.NEXTSTATES + numCols);
|
||
|
size += numRows * rowSize;
|
||
|
// TODO: there are redundant round-up. Figure out best place, get rid of the rest.
|
||
|
size = (size + 7) & ~7; // round up to a multiple of 8 bytes
|
||
|
return size;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Create a RBBIDataWrapper.RBBIStateTable for the safe reverse table.
|
||
|
* RBBIDataWrapper.RBBIStateTable is similar to struct RBBIStateTable in ICU4C,
|
||
|
* in common/rbbidata.h
|
||
|
*/
|
||
|
RBBIDataWrapper.RBBIStateTable exportSafeTable() {
|
||
|
RBBIDataWrapper.RBBIStateTable table = new RBBIDataWrapper.RBBIStateTable();
|
||
|
table.fNumStates = fSafeTable.size();
|
||
|
boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE;
|
||
|
int numCharCategories = fSafeTable.get(0).length;
|
||
|
|
||
|
// Size of table size in shorts.
|
||
|
int rowLen = RBBIDataWrapper.NEXTSTATES + numCharCategories;
|
||
|
// TODO: tableSize is basically numStates * numCharCategories,
|
||
|
// except for alignment padding. Clean up here, and in main exportTable().
|
||
|
int tableSize = (getSafeTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize); // fTable length in bytes.
|
||
|
if (use8Bits) {
|
||
|
table.fFlags |= RBBIDataWrapper.RBBI_8BITS_ROWS;
|
||
|
table.fTable = new char[tableSize];
|
||
|
table.fRowLen = rowLen; // Row length in bytes.
|
||
|
} else {
|
||
|
tableSize /= 2; // fTable length in shorts.
|
||
|
table.fTable = new char[tableSize];
|
||
|
table.fRowLen = rowLen * 2; // Row length in bytes.
|
||
|
}
|
||
|
|
||
|
for (int state=0; state<table.fNumStates; state++) {
|
||
|
short[] rowArray = fSafeTable.get(state);
|
||
|
int row = state * rowLen;
|
||
|
|
||
|
for (int col=0; col<numCharCategories; col++) {
|
||
|
if (use8Bits) {
|
||
|
Assert.assrt (rowArray[col] <= MAX_STATE_FOR_8BITS_TABLE);
|
||
|
}
|
||
|
table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = (char)rowArray[col];
|
||
|
}
|
||
|
}
|
||
|
return table;
|
||
|
}
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// printSet Debug function. Print the contents of a set of Nodes
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
|
||
|
void printSet(Collection<RBBINode> s) {
|
||
|
for (RBBINode n : s) {
|
||
|
RBBINode.printInt(n.fSerialNum, 8);
|
||
|
}
|
||
|
System.out.println();
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// printStates Debug Function. Dump the fully constructed state transition table.
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
|
||
|
void printStates() {
|
||
|
int c; // input "character"
|
||
|
int n; // state number
|
||
|
|
||
|
System.out.print("state | i n p u t s y m b o l s \n");
|
||
|
System.out.print(" | Acc LA Tag");
|
||
|
for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
|
||
|
RBBINode.printInt(c, 4);
|
||
|
}
|
||
|
System.out.print("\n");
|
||
|
System.out.print(" |---------------");
|
||
|
for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
|
||
|
System.out.print("----");
|
||
|
}
|
||
|
System.out.print("\n");
|
||
|
|
||
|
for (n=0; n<fDStates.size(); n++) {
|
||
|
RBBIStateDescriptor sd = fDStates.get(n);
|
||
|
RBBINode.printInt(n, 5);
|
||
|
System.out.print(" | ");
|
||
|
|
||
|
RBBINode.printInt(sd.fAccepting, 3);
|
||
|
RBBINode.printInt(sd.fLookAhead, 4);
|
||
|
RBBINode.printInt(sd.fTagsIdx, 6);
|
||
|
System.out.print(" ");
|
||
|
for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
|
||
|
RBBINode.printInt(sd.fDtran[c], 4);
|
||
|
}
|
||
|
System.out.print("\n");
|
||
|
}
|
||
|
System.out.print("\n\n");
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Debug Function. Dump the fully constructed safe reverse table.
|
||
|
*/
|
||
|
void printReverseTable() {
|
||
|
int c; // input "character"
|
||
|
|
||
|
System.out.printf(" Safe Reverse Table \n");
|
||
|
if (fSafeTable == null) {
|
||
|
System.out.printf(" --- nullptr ---\n");
|
||
|
return;
|
||
|
}
|
||
|
int numCharCategories = fSafeTable.get(0).length;
|
||
|
System.out.printf("state | i n p u t s y m b o l s \n");
|
||
|
System.out.printf(" | Acc LA Tag");
|
||
|
for (c=0; c< numCharCategories; c++) {
|
||
|
System.out.printf(" %2d", c);
|
||
|
}
|
||
|
System.out.printf("\n");
|
||
|
System.out.printf(" |---------------");
|
||
|
for (c=0; c<numCharCategories; c++) {
|
||
|
System.out.printf("---");
|
||
|
}
|
||
|
System.out.printf("\n");
|
||
|
|
||
|
for (int n=0; n<fSafeTable.size(); n++) {
|
||
|
short rowArray[] = fSafeTable.get(n);
|
||
|
System.out.printf(" %3d | " , n);
|
||
|
System.out.printf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
|
||
|
for (c=0; c<numCharCategories; c++) {
|
||
|
System.out.printf(" %2d", rowArray[c]);
|
||
|
}
|
||
|
System.out.printf("\n");
|
||
|
}
|
||
|
System.out.printf("\n\n");
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
//
|
||
|
// printRuleStatusTable Debug Function. Dump the common rule status table
|
||
|
//
|
||
|
//-----------------------------------------------------------------------------
|
||
|
|
||
|
void printRuleStatusTable() {
|
||
|
int thisRecord = 0;
|
||
|
int nextRecord = 0;
|
||
|
int i;
|
||
|
List<Integer> tbl = fRB.fRuleStatusVals;
|
||
|
|
||
|
System.out.print("index | tags \n");
|
||
|
System.out.print("-------------------\n");
|
||
|
|
||
|
while (nextRecord < tbl.size()) {
|
||
|
thisRecord = nextRecord;
|
||
|
nextRecord = thisRecord + tbl.get(thisRecord).intValue() + 1;
|
||
|
RBBINode.printInt(thisRecord, 7);
|
||
|
for (i=thisRecord+1; i<nextRecord; i++) {
|
||
|
int val = tbl.get(i).intValue();
|
||
|
RBBINode.printInt(val, 7);
|
||
|
}
|
||
|
System.out.print("\n");
|
||
|
}
|
||
|
System.out.print("\n\n");
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
}
|