258 lines
9.9 KiB
Java
258 lines
9.9 KiB
Java
/* GENERATED SOURCE. DO NOT MODIFY. */
|
|
// © 2016 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
/*
|
|
*******************************************************************************
|
|
* Copyright (C) 1996-2010, International Business Machines Corporation and *
|
|
* others. All Rights Reserved. *
|
|
*******************************************************************************
|
|
*/
|
|
package android.icu.text;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
import android.icu.impl.UtilityExtensions;
|
|
|
|
/**
|
|
* A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes
|
|
* the transliteration in one direction from one set of characters or short
|
|
* strings to another. A <code>RuleBasedTransliterator</code> consists of up to
|
|
* two such sets, one for the forward direction, and one for the reverse.
|
|
*
|
|
* <p>A <code>TransliterationRuleSet</code> has one important operation, that of
|
|
* finding a matching rule at a given point in the text. This is accomplished
|
|
* by the <code>findMatch()</code> method.
|
|
*
|
|
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
|
*
|
|
* @author Alan Liu
|
|
*/
|
|
class TransliterationRuleSet {
|
|
/**
|
|
* Vector of rules, in the order added.
|
|
*/
|
|
private List<TransliterationRule> ruleVector;
|
|
|
|
/**
|
|
* Length of the longest preceding context
|
|
*/
|
|
private int maxContextLength;
|
|
|
|
/**
|
|
* Sorted and indexed table of rules. This is created by freeze() from
|
|
* the rules in ruleVector. rules.length >= ruleVector.size(), and the
|
|
* references in rules[] are aliases of the references in ruleVector.
|
|
* A single rule in ruleVector is listed one or more times in rules[].
|
|
*/
|
|
private TransliterationRule[] rules;
|
|
|
|
/**
|
|
* Index table. For text having a first character c, compute x = c&0xFF.
|
|
* Now use rules[index[x]..index[x+1]-1]. This index table is created by
|
|
* freeze().
|
|
*/
|
|
private int[] index;
|
|
|
|
/**
|
|
* Construct a new empty rule set.
|
|
*/
|
|
public TransliterationRuleSet() {
|
|
ruleVector = new ArrayList<TransliterationRule>();
|
|
maxContextLength = 0;
|
|
}
|
|
|
|
/**
|
|
* Return the maximum context length.
|
|
* @return the length of the longest preceding context.
|
|
*/
|
|
public int getMaximumContextLength() {
|
|
return maxContextLength;
|
|
}
|
|
|
|
/**
|
|
* Add a rule to this set. Rules are added in order, and order is
|
|
* significant.
|
|
* @param rule the rule to add
|
|
*/
|
|
public void addRule(TransliterationRule rule) {
|
|
ruleVector.add(rule);
|
|
int len;
|
|
if ((len = rule.getAnteContextLength()) > maxContextLength) {
|
|
maxContextLength = len;
|
|
}
|
|
|
|
rules = null;
|
|
}
|
|
|
|
/**
|
|
* Close this rule set to further additions, check it for masked rules,
|
|
* and index it to optimize performance.
|
|
* @exception IllegalArgumentException if some rules are masked
|
|
*/
|
|
public void freeze() {
|
|
/* Construct the rule array and index table. We reorder the
|
|
* rules by sorting them into 256 bins. Each bin contains all
|
|
* rules matching the index value for that bin. A rule
|
|
* matches an index value if string whose first key character
|
|
* has a low byte equal to the index value can match the rule.
|
|
*
|
|
* Each bin contains zero or more rules, in the same order
|
|
* they were found originally. However, the total rules in
|
|
* the bins may exceed the number in the original vector,
|
|
* since rules that have a variable as their first key
|
|
* character will generally fall into more than one bin.
|
|
*
|
|
* That is, each bin contains all rules that either have that
|
|
* first index value as their first key character, or have
|
|
* a set containing the index value as their first character.
|
|
*/
|
|
int n = ruleVector.size();
|
|
index = new int[257]; // [sic]
|
|
List<TransliterationRule> v = new ArrayList<TransliterationRule>(2*n); // heuristic; adjust as needed
|
|
|
|
/* Precompute the index values. This saves a LOT of time.
|
|
*/
|
|
int[] indexValue = new int[n];
|
|
for (int j=0; j<n; ++j) {
|
|
TransliterationRule r = ruleVector.get(j);
|
|
indexValue[j] = r.getIndexValue();
|
|
}
|
|
for (int x=0; x<256; ++x) {
|
|
index[x] = v.size();
|
|
for (int j=0; j<n; ++j) {
|
|
if (indexValue[j] >= 0) {
|
|
if (indexValue[j] == x) {
|
|
v.add(ruleVector.get(j));
|
|
}
|
|
} else {
|
|
// If the indexValue is < 0, then the first key character is
|
|
// a set, and we must use the more time-consuming
|
|
// matchesIndexValue check. In practice this happens
|
|
// rarely, so we seldom tread this code path.
|
|
TransliterationRule r = ruleVector.get(j);
|
|
if (r.matchesIndexValue(x)) {
|
|
v.add(r);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
index[256] = v.size();
|
|
|
|
/* Freeze things into an array.
|
|
*/
|
|
rules = new TransliterationRule[v.size()];
|
|
v.toArray(rules);
|
|
|
|
StringBuilder errors = null;
|
|
|
|
/* Check for masking. This is MUCH faster than our old check,
|
|
* which was each rule against each following rule, since we
|
|
* only have to check for masking within each bin now. It's
|
|
* 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
|
|
* count, and n2 is the per-bin rule count. But n2<<n1, so
|
|
* it's a big win.
|
|
*/
|
|
for (int x=0; x<256; ++x) {
|
|
for (int j=index[x]; j<index[x+1]-1; ++j) {
|
|
TransliterationRule r1 = rules[j];
|
|
for (int k=j+1; k<index[x+1]; ++k) {
|
|
TransliterationRule r2 = rules[k];
|
|
if (r1.masks(r2)) {
|
|
if (errors == null) {
|
|
errors = new StringBuilder();
|
|
} else {
|
|
errors.append("\n");
|
|
}
|
|
errors.append("Rule " + r1 + " masks " + r2);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (errors != null) {
|
|
throw new IllegalArgumentException(errors.toString());
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Transliterate the given text with the given UTransPosition
|
|
* indices. Return true if the transliteration should continue
|
|
* or false if it should halt (because of a U_PARTIAL_MATCH match).
|
|
* Note that false is only ever returned if isIncremental is true.
|
|
* @param text the text to be transliterated
|
|
* @param pos the position indices, which will be updated
|
|
* @param incremental if true, assume new text may be inserted
|
|
* at index.limit, and return false if there is a partial match.
|
|
* @return true unless a U_PARTIAL_MATCH has been obtained,
|
|
* indicating that transliteration should stop until more text
|
|
* arrives.
|
|
*/
|
|
public boolean transliterate(Replaceable text,
|
|
Transliterator.Position pos,
|
|
boolean incremental) {
|
|
int indexByte = text.char32At(pos.start) & 0xFF;
|
|
for (int i=index[indexByte]; i<index[indexByte+1]; ++i) {
|
|
int m = rules[i].matchAndReplace(text, pos, incremental);
|
|
switch (m) {
|
|
case UnicodeMatcher.U_MATCH:
|
|
if (Transliterator.DEBUG) {
|
|
System.out.println((incremental ? "Rule.i: match ":"Rule: match ") +
|
|
rules[i].toRule(true) + " => " +
|
|
UtilityExtensions.formatInput(text, pos));
|
|
}
|
|
return true;
|
|
case UnicodeMatcher.U_PARTIAL_MATCH:
|
|
if (Transliterator.DEBUG) {
|
|
System.out.println((incremental ? "Rule.i: partial match ":"Rule: partial match ") +
|
|
rules[i].toRule(true) + " => " +
|
|
UtilityExtensions.formatInput(text, pos));
|
|
}
|
|
return false;
|
|
default:
|
|
if (Transliterator.DEBUG) {
|
|
System.out.println("Rule: no match " + rules[i]);
|
|
}
|
|
}
|
|
}
|
|
// No match or partial match from any rule
|
|
pos.start += UTF16.getCharCount(text.char32At(pos.start));
|
|
if (Transliterator.DEBUG) {
|
|
System.out.println((incremental ? "Rule.i: no match => ":"Rule: no match => ") +
|
|
UtilityExtensions.formatInput(text, pos));
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Create rule strings that represents this rule set.
|
|
*/
|
|
String toRules(boolean escapeUnprintable) {
|
|
int i;
|
|
int count = ruleVector.size();
|
|
StringBuilder ruleSource = new StringBuilder();
|
|
for (i=0; i<count; ++i) {
|
|
if (i != 0) {
|
|
ruleSource.append('\n');
|
|
}
|
|
TransliterationRule r = ruleVector.get(i);
|
|
ruleSource.append(r.toRule(escapeUnprintable));
|
|
}
|
|
return ruleSource.toString();
|
|
}
|
|
|
|
// TODO Handle the case where we have :: [a] ; a > |b ; b > c ;
|
|
// TODO Merge into r.addSourceTargetSet, to avoid duplicate testing
|
|
void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
|
|
UnicodeSet currentFilter = new UnicodeSet(filter);
|
|
UnicodeSet revisiting = new UnicodeSet();
|
|
int count = ruleVector.size();
|
|
for (int i=0; i<count; ++i) {
|
|
TransliterationRule r = ruleVector.get(i);
|
|
r.addSourceTargetSet(currentFilter, sourceSet, targetSet, revisiting.clear());
|
|
currentFilter.addAll(revisiting);
|
|
}
|
|
}
|
|
|
|
}
|