1859 lines
74 KiB
Java
1859 lines
74 KiB
Java
![]() |
/* GENERATED SOURCE. DO NOT MODIFY. */
|
||
|
// © 2016 and later: Unicode, Inc. and others.
|
||
|
// License & terms of use: http://www.unicode.org/copyright.html
|
||
|
/*
|
||
|
*******************************************************************************
|
||
|
* Copyright (C) 2005-2016 International Business Machines Corporation and
|
||
|
* others. All Rights Reserved.
|
||
|
*******************************************************************************
|
||
|
*/
|
||
|
|
||
|
package android.icu.text;
|
||
|
|
||
|
import static android.icu.impl.CharacterIteration.DONE32;
|
||
|
import static android.icu.impl.CharacterIteration.next32;
|
||
|
import static android.icu.impl.CharacterIteration.nextTrail32;
|
||
|
|
||
|
import java.io.ByteArrayOutputStream;
|
||
|
import java.io.IOException;
|
||
|
import java.io.InputStream;
|
||
|
import java.io.OutputStream;
|
||
|
import java.nio.ByteBuffer;
|
||
|
import java.text.CharacterIterator;
|
||
|
import java.util.MissingResourceException;
|
||
|
import java.util.concurrent.ConcurrentLinkedQueue;
|
||
|
|
||
|
import android.icu.impl.CharacterIteration;
|
||
|
import android.icu.impl.ICUBinary;
|
||
|
import android.icu.impl.ICUDebug;
|
||
|
import android.icu.impl.RBBIDataWrapper;
|
||
|
import android.icu.impl.breakiter.BurmeseBreakEngine;
|
||
|
import android.icu.impl.breakiter.CjkBreakEngine;
|
||
|
import android.icu.impl.breakiter.DictionaryBreakEngine;
|
||
|
import android.icu.impl.breakiter.KhmerBreakEngine;
|
||
|
import android.icu.impl.breakiter.LSTMBreakEngine;
|
||
|
import android.icu.impl.breakiter.LanguageBreakEngine;
|
||
|
import android.icu.impl.breakiter.LaoBreakEngine;
|
||
|
import android.icu.impl.breakiter.ThaiBreakEngine;
|
||
|
import android.icu.impl.breakiter.UnhandledBreakEngine;
|
||
|
import android.icu.lang.UCharacter;
|
||
|
import android.icu.lang.UProperty;
|
||
|
import android.icu.lang.UScript;
|
||
|
import android.icu.util.CodePointTrie;
|
||
|
|
||
|
/**
|
||
|
* Rule Based Break Iterator
|
||
|
* This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
|
||
|
*
|
||
|
* @hide Only a subset of ICU is exposed in Android
|
||
|
*/
|
||
|
public class RuleBasedBreakIterator extends BreakIterator {
|
||
|
//=======================================================================
|
||
|
// Constructors & Factories
|
||
|
//=======================================================================
|
||
|
|
||
|
/**
|
||
|
* private constructor
|
||
|
*/
|
||
|
private RuleBasedBreakIterator() {
|
||
|
fDictionaryCharCount = 0;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Create a break iterator from a precompiled set of break rules.
|
||
|
*
|
||
|
* Creating a break iterator from the binary rules is much faster than
|
||
|
* creating one from source rules.
|
||
|
*
|
||
|
* The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
|
||
|
* Binary break iterator rules are not guaranteed to be compatible between
|
||
|
* different versions of ICU.
|
||
|
*
|
||
|
* @param is an input stream supplying the compiled binary rules.
|
||
|
* @throws IOException if there is an error while reading the rules from the InputStream.
|
||
|
* @see #compileRules(String, OutputStream)
|
||
|
*/
|
||
|
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
|
||
|
RuleBasedBreakIterator This = new RuleBasedBreakIterator();
|
||
|
This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
|
||
|
This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
|
||
|
return This;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* This factory method doesn't have an access modifier; it is only accessible in the same
|
||
|
* package.
|
||
|
*
|
||
|
* Create a break iterator from a precompiled set of break rules.
|
||
|
*
|
||
|
* Creating a break iterator from the binary rules is much faster than
|
||
|
* creating one from source rules.
|
||
|
*
|
||
|
* The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
|
||
|
* Binary break iterator rules are not guaranteed to be compatible between
|
||
|
* different versions of ICU.
|
||
|
*
|
||
|
* @param bytes a buffer supplying the compiled binary rules.
|
||
|
* @param phraseBreaking a flag indicating if phrase breaking is required.
|
||
|
* @throws IOException if there is an error while reading the rules from the buffer.
|
||
|
* @see #compileRules(String, OutputStream)
|
||
|
* @hide draft / provisional / internal are hidden on Android
|
||
|
*/
|
||
|
/* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules(
|
||
|
ByteBuffer bytes, boolean phraseBreaking) throws IOException {
|
||
|
RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes);
|
||
|
instance.fPhraseBreaking = phraseBreaking;
|
||
|
return instance;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Create a break iterator from a precompiled set of break rules.
|
||
|
*
|
||
|
* Creating a break iterator from the binary rules is much faster than
|
||
|
* creating one from source rules.
|
||
|
*
|
||
|
* The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
|
||
|
* Binary break iterator rules are not guaranteed to be compatible between
|
||
|
* different versions of ICU.
|
||
|
*
|
||
|
* @param bytes a buffer supplying the compiled binary rules.
|
||
|
* @throws IOException if there is an error while reading the rules from the buffer.
|
||
|
* @see #compileRules(String, OutputStream)
|
||
|
* @deprecated This API is ICU internal only.
|
||
|
* @hide draft / provisional / internal are hidden on Android
|
||
|
*/
|
||
|
@Deprecated
|
||
|
public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
|
||
|
RuleBasedBreakIterator This = new RuleBasedBreakIterator();
|
||
|
This.fRData = RBBIDataWrapper.get(bytes);
|
||
|
This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
|
||
|
return This;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
|
||
|
* @param rules The break rules to be used.
|
||
|
*/
|
||
|
public RuleBasedBreakIterator(String rules) {
|
||
|
this();
|
||
|
try {
|
||
|
ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
|
||
|
compileRules(rules, ruleOS);
|
||
|
fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
|
||
|
fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
|
||
|
} catch (IOException e) {
|
||
|
///CLOVER:OFF
|
||
|
// An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
|
||
|
// causing bogus compiled rules to be produced, but with no compile error raised.
|
||
|
RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
|
||
|
+ e.getMessage());
|
||
|
throw rte;
|
||
|
///CLOVER:ON
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//=======================================================================
|
||
|
// Boilerplate
|
||
|
//=======================================================================
|
||
|
|
||
|
/**
|
||
|
* Clones this iterator.
|
||
|
* @return A newly-constructed RuleBasedBreakIterator with the same
|
||
|
* behavior as this one.
|
||
|
*/
|
||
|
@Override
|
||
|
public Object clone() {
|
||
|
RuleBasedBreakIterator result;
|
||
|
result = (RuleBasedBreakIterator)super.clone();
|
||
|
if (fText != null) {
|
||
|
result.fText = (CharacterIterator)(fText.clone());
|
||
|
}
|
||
|
result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
|
||
|
result.fBreakCache = result.new BreakCache(fBreakCache);
|
||
|
result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Returns true if both BreakIterators are of the same class, have the same
|
||
|
* rules, and iterate over the same text.
|
||
|
*/
|
||
|
@Override
|
||
|
public boolean equals(Object that) {
|
||
|
if (that == null) {
|
||
|
return false;
|
||
|
}
|
||
|
if (this == that) {
|
||
|
return true;
|
||
|
}
|
||
|
try {
|
||
|
RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
|
||
|
if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
|
||
|
return false;
|
||
|
}
|
||
|
if (fRData != null && other.fRData != null &&
|
||
|
(!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
|
||
|
return false;
|
||
|
}
|
||
|
if (fText == null && other.fText == null) {
|
||
|
return true;
|
||
|
}
|
||
|
if (fText == null || other.fText == null || !fText.equals(other.fText)) {
|
||
|
return false;
|
||
|
}
|
||
|
return fPosition == other.fPosition;
|
||
|
}
|
||
|
catch(ClassCastException e) {
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the description (rules) used to create this iterator.
|
||
|
* (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
|
||
|
*/
|
||
|
@Override
|
||
|
public String toString() {
|
||
|
String retStr = "";
|
||
|
if (fRData != null) {
|
||
|
retStr = fRData.fRuleSource;
|
||
|
}
|
||
|
return retStr;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Compute a hashcode for this BreakIterator
|
||
|
* @return A hash code
|
||
|
*/
|
||
|
@Override
|
||
|
public int hashCode()
|
||
|
{
|
||
|
return fRData.fRuleSource.hashCode();
|
||
|
}
|
||
|
|
||
|
|
||
|
private static final int START_STATE = 1; // The state number of the starting state
|
||
|
private static final int STOP_STATE = 0; // The state-transition value indicating "stop"
|
||
|
|
||
|
// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
|
||
|
// of user text. A variable with this enum type keeps track of where we
|
||
|
// are. The state machine only fetches user text input while in RUN mode.
|
||
|
private static final int RBBI_START = 0;
|
||
|
private static final int RBBI_RUN = 1;
|
||
|
private static final int RBBI_END = 2;
|
||
|
|
||
|
/**
|
||
|
* The character iterator through which this BreakIterator accesses the text.
|
||
|
*/
|
||
|
private CharacterIterator fText = new java.text.StringCharacterIterator("");
|
||
|
|
||
|
/**
|
||
|
* The rule data for this BreakIterator instance.
|
||
|
* Not intended for public use. Declared public for testing purposes only.
|
||
|
* @deprecated This API is ICU internal only.
|
||
|
* @hide draft / provisional / internal are hidden on Android
|
||
|
*/
|
||
|
@Deprecated
|
||
|
public RBBIDataWrapper fRData;
|
||
|
|
||
|
/**
|
||
|
* The iteration state - current position, rule status for the current position,
|
||
|
* and whether the iterator ran off the end, yielding UBRK_DONE.
|
||
|
* Current position is pinned to be 0 < position <= text.length.
|
||
|
* Current position is always set to a boundary.
|
||
|
*
|
||
|
* The current position of the iterator. Pinned, 0 < fPosition <= text.length.
|
||
|
* Never has the value UBRK_DONE (-1).
|
||
|
*/
|
||
|
private int fPosition;
|
||
|
|
||
|
/**
|
||
|
* Index of the Rule {tag} values for the most recent match.
|
||
|
*/
|
||
|
private int fRuleStatusIndex;
|
||
|
|
||
|
/**
|
||
|
* True when iteration has run off the end, and iterator functions should return UBRK_DONE.
|
||
|
*/
|
||
|
private boolean fDone;
|
||
|
|
||
|
/**
|
||
|
* Array of look-ahead tentative results.
|
||
|
*/
|
||
|
private int[] fLookAheadMatches;
|
||
|
|
||
|
/**
|
||
|
* Cache of previously determined boundary positions.
|
||
|
*/
|
||
|
private BreakCache fBreakCache = new BreakCache();
|
||
|
|
||
|
/**
|
||
|
* Flag used to indicate if phrase breaking is required.
|
||
|
*/
|
||
|
private boolean fPhraseBreaking = false;
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Counter for the number of characters encountered with the "dictionary"
|
||
|
* flag set. Normal RBBI iterators don't use it, although the code
|
||
|
* for updating it is live. Dictionary Based break iterators (a subclass
|
||
|
* of us) access this field directly.
|
||
|
* @hide draft / provisional / internal are hidden on Android
|
||
|
*/
|
||
|
private int fDictionaryCharCount;
|
||
|
|
||
|
private DictionaryCache fDictionaryCache = new DictionaryCache();
|
||
|
|
||
|
/**
|
||
|
* ICU debug argument name for RBBI
|
||
|
*/
|
||
|
private static final String RBBI_DEBUG_ARG = "rbbi";
|
||
|
|
||
|
/**
|
||
|
* Debugging flag. Trace operation of state machine when true.
|
||
|
*/
|
||
|
private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
|
||
|
&& ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
|
||
|
|
||
|
/**
|
||
|
* The "default" break engine - just skips over ranges of dictionary words,
|
||
|
* producing no breaks. Should only be used if characters need to be handled
|
||
|
* by a dictionary but we have no dictionary implementation for them.
|
||
|
*
|
||
|
* Only one instance; shared by all break iterators.
|
||
|
*/
|
||
|
private static final UnhandledBreakEngine gUnhandledBreakEngine;
|
||
|
|
||
|
/**
|
||
|
* List of all known break engines, common for all break iterators.
|
||
|
* Lazily updated as break engines are needed, because instantiation of
|
||
|
* break engines is expensive.
|
||
|
*
|
||
|
* Important notes:
|
||
|
* <ul>Because we don't want to add the same LanguageBreakEngine multiple times, all writes
|
||
|
* are synchronized.
|
||
|
* <ul>Read access avoids explicit synchronization, but will end up being synchronized if
|
||
|
* needed.
|
||
|
*/
|
||
|
private static final ConcurrentLinkedQueue<LanguageBreakEngine> gAllBreakEngines;
|
||
|
|
||
|
static {
|
||
|
gUnhandledBreakEngine = new UnhandledBreakEngine();
|
||
|
gAllBreakEngines = new ConcurrentLinkedQueue<>();
|
||
|
gAllBreakEngines.add(gUnhandledBreakEngine);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Dump the contents of the state table and character classes for this break iterator.
|
||
|
* For debugging only.
|
||
|
* @deprecated This API is ICU internal only.
|
||
|
* @hide draft / provisional / internal are hidden on Android
|
||
|
*/
|
||
|
@Deprecated
|
||
|
public void dump(java.io.PrintStream out) {
|
||
|
if (out == null) {
|
||
|
out = System.out;
|
||
|
}
|
||
|
this.fRData.dump(out);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Compile a set of source break rules into the binary state tables used
|
||
|
* by the break iterator engine. Creating a break iterator from precompiled
|
||
|
* rules is much faster than creating one from source rules.
|
||
|
*
|
||
|
* Binary break rules are not guaranteed to be compatible between different
|
||
|
* versions of ICU.
|
||
|
*
|
||
|
*
|
||
|
* @param rules The source form of the break rules
|
||
|
* @param ruleBinary An output stream to receive the compiled rules.
|
||
|
* @throws IOException If there is an error writing the output.
|
||
|
* @see #getInstanceFromCompiledRules(InputStream)
|
||
|
*/
|
||
|
public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
|
||
|
RBBIRuleBuilder.compileRules(rules, ruleBinary);
|
||
|
}
|
||
|
|
||
|
//=======================================================================
|
||
|
// BreakIterator overrides
|
||
|
//=======================================================================
|
||
|
|
||
|
/**
|
||
|
* Sets the current iteration position to the beginning of the text.
|
||
|
* (i.e., the CharacterIterator's starting offset).
|
||
|
* @return The offset of the beginning of the text.
|
||
|
*/
|
||
|
@Override
|
||
|
public int first() {
|
||
|
if (fText == null) {
|
||
|
return BreakIterator.DONE;
|
||
|
}
|
||
|
fText.first();
|
||
|
int start = fText.getIndex();
|
||
|
if (!fBreakCache.seek(start)) {
|
||
|
fBreakCache.populateNear(start);
|
||
|
}
|
||
|
fBreakCache.current();
|
||
|
assert(fPosition == start);
|
||
|
return fPosition;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Sets the current iteration position to the end of the text.
|
||
|
* (i.e., the CharacterIterator's ending offset).
|
||
|
* @return The text's past-the-end offset.
|
||
|
*/
|
||
|
@Override
|
||
|
public int last() {
|
||
|
if (fText == null) {
|
||
|
return BreakIterator.DONE;
|
||
|
}
|
||
|
int endPos = fText.getEndIndex();
|
||
|
boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position.
|
||
|
assert(endShouldBeBoundary);
|
||
|
if (fPosition != endPos) {
|
||
|
assert(fPosition == endPos);
|
||
|
}
|
||
|
return endPos;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Advances the iterator either forward or backward the specified number of steps.
|
||
|
* Negative values move backward, and positive values move forward. This is
|
||
|
* equivalent to repeatedly calling next() or previous().
|
||
|
* @param n The number of steps to move. The sign indicates the direction
|
||
|
* (negative is backwards, and positive is forwards).
|
||
|
* @return The character offset of the boundary position n boundaries away from
|
||
|
* the current one.
|
||
|
*/
|
||
|
@Override
|
||
|
public int next(int n) {
|
||
|
int result = 0;
|
||
|
if (n > 0) {
|
||
|
for (; n > 0 && result != DONE; --n) {
|
||
|
result = next();
|
||
|
}
|
||
|
} else if (n < 0) {
|
||
|
for (; n < 0 && result != DONE; ++n) {
|
||
|
result = previous();
|
||
|
}
|
||
|
} else {
|
||
|
result = current();
|
||
|
}
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Advances the iterator to the next boundary position.
|
||
|
* @return The position of the first boundary after this one.
|
||
|
*/
|
||
|
@Override
|
||
|
public int next() {
|
||
|
fBreakCache.next();
|
||
|
return fDone ? DONE : fPosition;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Moves the iterator backwards, to the boundary preceding the current one.
|
||
|
* @return The position of the boundary position immediately preceding the starting position.
|
||
|
*/
|
||
|
@Override
|
||
|
public int previous() {
|
||
|
fBreakCache.previous();
|
||
|
return fDone ? DONE : fPosition;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Sets the iterator to refer to the first boundary position following
|
||
|
* the specified position.
|
||
|
* @param startPos The position from which to begin searching for a break position.
|
||
|
* @return The position of the first break after the current position.
|
||
|
*/
|
||
|
@Override
|
||
|
public int following(int startPos) {
|
||
|
// if the supplied position is before the beginning, return the
|
||
|
// text's starting offset
|
||
|
if (startPos < fText.getBeginIndex()) {
|
||
|
return first();
|
||
|
}
|
||
|
|
||
|
// Move requested offset to a code point start. It might be between a lead and trail surrogate.
|
||
|
// Or it may be beyond the end of the text.
|
||
|
startPos = CISetIndex32(fText, startPos);
|
||
|
fBreakCache.following(startPos);
|
||
|
return fDone ? DONE : fPosition;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Sets the iterator to refer to the last boundary position before the
|
||
|
* specified position.
|
||
|
* @param offset The position to begin searching for a break from.
|
||
|
* @return The position of the last boundary before the starting position.
|
||
|
*/
|
||
|
@Override
|
||
|
public int preceding(int offset) {
|
||
|
if (fText == null || offset > fText.getEndIndex()) {
|
||
|
return last();
|
||
|
} else if (offset < fText.getBeginIndex()) {
|
||
|
return first();
|
||
|
}
|
||
|
|
||
|
// Move requested offset to a code point start. It might be between a lead and trail surrogate.
|
||
|
// int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior.
|
||
|
int adjustedOffset = offset;
|
||
|
fBreakCache.preceding(adjustedOffset);
|
||
|
return fDone ? DONE : fPosition;
|
||
|
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Throw IllegalArgumentException unless begin <= offset < end.
|
||
|
*/
|
||
|
protected static final void checkOffset(int offset, CharacterIterator text) {
|
||
|
if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
|
||
|
throw new IllegalArgumentException("offset out of bounds");
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Returns true if the specified position is a boundary position. As a side
|
||
|
* effect, leaves the iterator pointing to the first boundary position at
|
||
|
* or after "offset".
|
||
|
* @param offset the offset to check.
|
||
|
* @return True if "offset" is a boundary position.
|
||
|
*/
|
||
|
@Override
|
||
|
public boolean isBoundary(int offset) {
|
||
|
// TODO: behavior difference with ICU4C, which considers out-of-range offsets
|
||
|
// to not be boundaries, and to not be errors.
|
||
|
checkOffset(offset, fText);
|
||
|
|
||
|
// Adjust offset to be on a code point boundary and not beyond the end of the text.
|
||
|
// Note that isBoundary() is always false for offsets that are not on code point boundaries.
|
||
|
// But we still need the side effect of leaving iteration at the following boundary.
|
||
|
int adjustedOffset = CISetIndex32(fText, offset);
|
||
|
|
||
|
boolean result = false;
|
||
|
if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) {
|
||
|
result = (fBreakCache.current() == offset);
|
||
|
}
|
||
|
|
||
|
if (!result) {
|
||
|
// Not on a boundary. isBoundary() must leave iterator on the following boundary.
|
||
|
// fBreakCache.seek(), above, left us on the preceding boundary, so advance one.
|
||
|
next();
|
||
|
}
|
||
|
return result;
|
||
|
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the current iteration position. Note that DONE is never
|
||
|
* returned from this function; if iteration has run to the end of a
|
||
|
* string, current() will return the length of the string while
|
||
|
* next() will return BreakIterator.DONE).
|
||
|
* @return The current iteration position.
|
||
|
*/
|
||
|
@Override
|
||
|
public int current() {
|
||
|
return (fText != null) ? fPosition : BreakIterator.DONE;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Return the status tag from the break rule that determined the boundary at
|
||
|
* the current iteration position. The values appear in the rule source
|
||
|
* within brackets, {123}, for example. For rules that do not specify a
|
||
|
* status, a default value of 0 is returned. If more than one rule applies,
|
||
|
* the numerically largest of the possible status values is returned.
|
||
|
* <p>
|
||
|
* Of the standard types of ICU break iterators, only the word and line break
|
||
|
* iterator provides status values. The values are defined in
|
||
|
* class RuleBasedBreakIterator, and allow distinguishing between words
|
||
|
* that contain alphabetic letters, "words" that appear to be numbers,
|
||
|
* punctuation and spaces, words containing ideographic characters, and
|
||
|
* more. Call <code>getRuleStatus</code> after obtaining a boundary
|
||
|
* position from <code>next()</code>, <code>previous()</code>, or
|
||
|
* any other break iterator functions that returns a boundary position.
|
||
|
* <p>
|
||
|
* Note that <code>getRuleStatus()</code> returns the value corresponding to
|
||
|
* <code>current()</code> index even after <code>next()</code> has returned DONE.
|
||
|
* <p>
|
||
|
|
||
|
* @return the status from the break rule that determined the boundary
|
||
|
* at the current iteration position.
|
||
|
*/
|
||
|
|
||
|
@Override
|
||
|
public int getRuleStatus() {
|
||
|
// Status records have this form:
|
||
|
// Count N <-- fLastRuleStatusIndex points here.
|
||
|
// Status val 0
|
||
|
// Status val 1
|
||
|
// ...
|
||
|
// Status val N-1 <-- the value we need to return
|
||
|
// The status values are sorted in ascending order.
|
||
|
// This function returns the last (largest) of the array of status values.
|
||
|
int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex];
|
||
|
int tagVal = fRData.fStatusTable[idx];
|
||
|
return tagVal;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Get the status (tag) values from the break rule(s) that determined the boundary
|
||
|
* at the current iteration position. The values appear in the rule source
|
||
|
* within brackets, {123}, for example. The default status value for rules
|
||
|
* that do not explicitly provide one is zero.
|
||
|
* <p>
|
||
|
* The status values used by the standard ICU break rules are defined
|
||
|
* as public constants in class RuleBasedBreakIterator.
|
||
|
* <p>
|
||
|
* If the size of the output array is insufficient to hold the data,
|
||
|
* the output will be truncated to the available length. No exception
|
||
|
* will be thrown.
|
||
|
*
|
||
|
* @param fillInArray an array to be filled in with the status values.
|
||
|
* @return The number of rule status values from the rules that determined
|
||
|
* the boundary at the current iteration position.
|
||
|
* In the event that the array is too small, the return value
|
||
|
* is the total number of status values that were available,
|
||
|
* not the reduced number that were actually returned.
|
||
|
*/
|
||
|
@Override
|
||
|
public int getRuleStatusVec(int[] fillInArray) {
|
||
|
int numStatusVals = fRData.fStatusTable[fRuleStatusIndex];
|
||
|
if (fillInArray != null) {
|
||
|
int numToCopy = Math.min(numStatusVals, fillInArray.length);
|
||
|
for (int i=0; i<numToCopy; i++) {
|
||
|
fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1];
|
||
|
}
|
||
|
}
|
||
|
return numStatusVals;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns a CharacterIterator over the text being analyzed.
|
||
|
* <p>
|
||
|
* <b><i>Caution:</i></b>The state of the returned CharacterIterator
|
||
|
* must not be modified in any way while the BreakIterator is still in use.
|
||
|
* Doing so will lead to undefined behavior of the BreakIterator.
|
||
|
* Clone the returned CharacterIterator first and work with that.
|
||
|
* <p>
|
||
|
* The returned CharacterIterator is a reference
|
||
|
* to the <b>actual iterator being used</b> by the BreakIterator.
|
||
|
* No guarantees are made about the current position
|
||
|
* of this iterator when it is returned; it may differ from the
|
||
|
* BreakIterators current position. If you need to move that
|
||
|
* position to examine the text, clone this function's return value first.
|
||
|
* @return An iterator over the text being analyzed.
|
||
|
*/
|
||
|
@Override
|
||
|
public CharacterIterator getText() {
|
||
|
return fText;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Set the iterator to analyze a new piece of text. This function resets
|
||
|
* the current iteration position to the beginning of the text.
|
||
|
* (The old iterator is dropped.)
|
||
|
* <p>
|
||
|
* <b><i>Caution:</i></b> The supplied CharacterIterator is used
|
||
|
* directly by the BreakIterator, and must not be altered in any
|
||
|
* way by code outside of the BreakIterator.
|
||
|
* Doing so will lead to undefined behavior of the BreakIterator.
|
||
|
*
|
||
|
* @param newText An iterator over the text to analyze.
|
||
|
*/
|
||
|
@Override
|
||
|
public void setText(CharacterIterator newText) {
|
||
|
if (newText != null) {
|
||
|
fBreakCache.reset(newText.getBeginIndex(), 0);
|
||
|
} else {
|
||
|
fBreakCache.reset();
|
||
|
}
|
||
|
fDictionaryCache.reset();
|
||
|
fText = newText;
|
||
|
this.first();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Control debug, trace and dump options.
|
||
|
* @deprecated This API is ICU internal only.
|
||
|
* @hide draft / provisional / internal are hidden on Android
|
||
|
*/
|
||
|
@Deprecated
|
||
|
public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
|
||
|
ICUDebug.value(RBBI_DEBUG_ARG) : null;
|
||
|
|
||
|
|
||
|
private LanguageBreakEngine getLanguageBreakEngine(int c) {
|
||
|
|
||
|
// We have a dictionary character.
|
||
|
// Does an already instantiated break engine handle it?
|
||
|
// First read without synchronization, which could lead to a new language
|
||
|
// break engine being added and we didn't go over it.
|
||
|
for (LanguageBreakEngine candidate : gAllBreakEngines) {
|
||
|
if (candidate.handles(c)) {
|
||
|
return candidate;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
synchronized (gAllBreakEngines) {
|
||
|
// Another break iterator may have instantiated the desired engine.
|
||
|
for (LanguageBreakEngine candidate : gAllBreakEngines) {
|
||
|
if (candidate.handles(c)) {
|
||
|
return candidate;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// The global list doesn't have an existing engine, build one.
|
||
|
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||
|
if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
|
||
|
// Katakana, Hiragana and Han are handled by the same dictionary engine.
|
||
|
// Fold them together for mapping from script -> engine.
|
||
|
script = UScript.HAN;
|
||
|
}
|
||
|
|
||
|
LanguageBreakEngine eng;
|
||
|
try {
|
||
|
switch (script) {
|
||
|
case UScript.THAI:
|
||
|
try {
|
||
|
eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script));
|
||
|
} catch (MissingResourceException e) {
|
||
|
eng = new ThaiBreakEngine();
|
||
|
}
|
||
|
break;
|
||
|
case UScript.LAO:
|
||
|
eng = new LaoBreakEngine();
|
||
|
break;
|
||
|
case UScript.MYANMAR:
|
||
|
try {
|
||
|
eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script));
|
||
|
} catch (MissingResourceException e) {
|
||
|
eng = new BurmeseBreakEngine();
|
||
|
}
|
||
|
break;
|
||
|
case UScript.KHMER:
|
||
|
eng = new KhmerBreakEngine();
|
||
|
break;
|
||
|
case UScript.HAN:
|
||
|
eng = new CjkBreakEngine(false);
|
||
|
break;
|
||
|
case UScript.HANGUL:
|
||
|
eng = new CjkBreakEngine(true);
|
||
|
break;
|
||
|
default:
|
||
|
gUnhandledBreakEngine.handleChar(c);
|
||
|
eng = gUnhandledBreakEngine;
|
||
|
break;
|
||
|
}
|
||
|
} catch (IOException e) {
|
||
|
eng = null;
|
||
|
}
|
||
|
|
||
|
if (eng != null && eng != gUnhandledBreakEngine) {
|
||
|
gAllBreakEngines.add(eng);
|
||
|
}
|
||
|
return eng;
|
||
|
} // end synchronized(gAllBreakEngines)
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* The State Machine Engine for moving forward is here.
|
||
|
* This function is the heart of the RBBI run time engine.
|
||
|
*
|
||
|
* Input
|
||
|
* fPosition, the position in the text to begin from.
|
||
|
* Output
|
||
|
* fPosition: the boundary following the starting position.
|
||
|
* fDictionaryCharCount the number of dictionary characters encountered.
|
||
|
* If > 0, the segment will be further subdivided
|
||
|
* fRuleStatusIndex Info from the state table indicating which rules caused the boundary.
|
||
|
*
|
||
|
* @return the new iterator position
|
||
|
*
|
||
|
* A note on supplementary characters and the position of underlying
|
||
|
* Java CharacterIterator: Normally, a character iterator is positioned at
|
||
|
* the char most recently returned by next(). Within this function, when
|
||
|
* a supplementary char is being processed, the char iterator is left
|
||
|
* sitting on the trail surrogate, in the middle of the code point.
|
||
|
* This is different from everywhere else, where an iterator always
|
||
|
* points at the lead surrogate of a supplementary.
|
||
|
*/
|
||
|
private int handleNext() {
|
||
|
if (TRACE) {
|
||
|
System.out.println("Handle Next pos char state category");
|
||
|
}
|
||
|
|
||
|
// handleNext always sets the break tag value.
|
||
|
// Set the default for it.
|
||
|
fRuleStatusIndex = 0;
|
||
|
fDictionaryCharCount = 0;
|
||
|
|
||
|
// caches for quicker access
|
||
|
CharacterIterator text = fText;
|
||
|
CodePointTrie trie = fRData.fTrie;
|
||
|
|
||
|
char[] stateTable = fRData.fFTable.fTable;
|
||
|
int initialPosition = fPosition;
|
||
|
text.setIndex(initialPosition);
|
||
|
int result = initialPosition;
|
||
|
|
||
|
// Set up the starting char
|
||
|
int c = text.current();
|
||
|
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
|
||
|
c = nextTrail32(text, c);
|
||
|
if (c == DONE32) {
|
||
|
fDone = true;
|
||
|
return BreakIterator.DONE;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Set the initial state for the state machine
|
||
|
int state = START_STATE;
|
||
|
int row = fRData.getRowIndex(state);
|
||
|
short category = 3;
|
||
|
int flagsState = fRData.fFTable.fFlags;
|
||
|
int dictStart = fRData.fFTable.fDictCategoriesStart;
|
||
|
int mode = RBBI_RUN;
|
||
|
if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
|
||
|
category = 2;
|
||
|
mode = RBBI_START;
|
||
|
if (TRACE) {
|
||
|
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
|
||
|
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
|
||
|
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// loop until we reach the end of the text or transition to state 0
|
||
|
while (state != STOP_STATE) {
|
||
|
if (c == DONE32) {
|
||
|
// Reached end of input string.
|
||
|
if (mode == RBBI_END) {
|
||
|
// We have already run the loop one last time with the
|
||
|
// character set to the pseudo {eof} value. Now it is time
|
||
|
// to unconditionally bail out.
|
||
|
break;
|
||
|
}
|
||
|
// Run the loop one last time with the fake end-of-input character category
|
||
|
mode = RBBI_END;
|
||
|
category = 1;
|
||
|
}
|
||
|
else if (mode == RBBI_RUN) {
|
||
|
// Get the char category. An incoming category of 1 or 2 mens that
|
||
|
// we are preset for doing the beginning or end of input, and
|
||
|
// that we shouldn't get a category from an actual text input character.
|
||
|
//
|
||
|
|
||
|
// look up the current character's character category, which tells us
|
||
|
// which column in the state table to look at.
|
||
|
//
|
||
|
category = (short) trie.get(c);
|
||
|
|
||
|
// Check for categories that require word dictionary handling.
|
||
|
if (category >= dictStart) {
|
||
|
fDictionaryCharCount++;
|
||
|
}
|
||
|
|
||
|
if (TRACE) {
|
||
|
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
|
||
|
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
|
||
|
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
|
||
|
}
|
||
|
|
||
|
// Advance to the next character.
|
||
|
// If this is a beginning-of-input loop iteration, don't advance.
|
||
|
// The next iteration will be processing the first real input character.
|
||
|
c = text.next();
|
||
|
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
|
||
|
c = nextTrail32(text, c);
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
mode = RBBI_RUN;
|
||
|
}
|
||
|
|
||
|
// look up a state transition in the state table
|
||
|
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
|
||
|
row = fRData.getRowIndex(state);
|
||
|
int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING];
|
||
|
if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
|
||
|
// Match found, common case
|
||
|
result = text.getIndex();
|
||
|
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
|
||
|
// The iterator has been left in the middle of a surrogate pair.
|
||
|
// We want the start of it.
|
||
|
result--;
|
||
|
}
|
||
|
|
||
|
// Remember the break status (tag) values.
|
||
|
fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
|
||
|
} else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
|
||
|
// Lookahead match is completed
|
||
|
int lookaheadResult = fLookAheadMatches[accepting];
|
||
|
if (lookaheadResult >= 0) {
|
||
|
fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
|
||
|
fPosition = lookaheadResult;
|
||
|
return lookaheadResult;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
// If we are at the position of the '/' in a look-ahead (hard break) rule;
|
||
|
// record the current position, to be returned later, if the full rule matches.
|
||
|
// TODO: Move this check before the previous check of fAccepting.
|
||
|
// This would enable hard-break rules with no following context.
|
||
|
// But there are line break test failures when trying this. Investigate.
|
||
|
// Issue ICU-20837
|
||
|
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
|
||
|
if (rule != 0) {
|
||
|
int pos = text.getIndex();
|
||
|
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
|
||
|
// The iterator has been left in the middle of a surrogate pair.
|
||
|
// We want the beginning of it.
|
||
|
pos--;
|
||
|
}
|
||
|
fLookAheadMatches[rule] = pos;
|
||
|
}
|
||
|
|
||
|
|
||
|
} // End of state machine main loop
|
||
|
|
||
|
// The state machine is done. Check whether it found a match...
|
||
|
|
||
|
// If the iterator failed to advance in the match engine force it ahead by one.
|
||
|
// This indicates a defect in the break rules, which should always match
|
||
|
// at least one character.
|
||
|
|
||
|
if (result == initialPosition) {
|
||
|
if (TRACE) {
|
||
|
System.out.println("Iterator did not move. Advancing by 1.");
|
||
|
}
|
||
|
text.setIndex(initialPosition);
|
||
|
next32(text);
|
||
|
result = text.getIndex();
|
||
|
fRuleStatusIndex = 0;
|
||
|
}
|
||
|
|
||
|
// Leave the iterator at our result position.
|
||
|
// (we may have advanced beyond the last accepting position chasing after
|
||
|
// longer matches that never completed.)
|
||
|
fPosition = result;
|
||
|
|
||
|
if (TRACE) {
|
||
|
System.out.println("result = " + result);
|
||
|
}
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
|
||
|
* This locates a "Safe Position" from which the forward break rules
|
||
|
* will operate correctly. A Safe Position is not necessarily a boundary itself.
|
||
|
*
|
||
|
* The logic of this function is very similar to handleNext(), above, but simpler
|
||
|
* because the safe table does not require as many options.
|
||
|
*
|
||
|
* @param fromPosition the position in the input text to begin the iteration.
|
||
|
* @hide draft / provisional / internal are hidden on Android
|
||
|
*/
|
||
|
private int handleSafePrevious(int fromPosition) {
|
||
|
char state;
|
||
|
short category = 0;
|
||
|
int result = 0;
|
||
|
|
||
|
// caches for quicker access
|
||
|
CharacterIterator text = fText;
|
||
|
CodePointTrie trie = fRData.fTrie;
|
||
|
char[] stateTable = fRData.fRTable.fTable;
|
||
|
|
||
|
CISetIndex32(text, fromPosition);
|
||
|
if (TRACE) {
|
||
|
System.out.print("Handle Previous pos char state category");
|
||
|
}
|
||
|
|
||
|
// if we're already at the start of the text, return DONE.
|
||
|
if (text.getIndex() == text.getBeginIndex()) {
|
||
|
return BreakIterator.DONE;
|
||
|
}
|
||
|
|
||
|
// Set the initial state for the state machine
|
||
|
int c = CharacterIteration.previous32(text);
|
||
|
state = START_STATE;
|
||
|
int row = fRData.getRowIndex(state);
|
||
|
|
||
|
// loop until we reach the start of the text or transition to state 0
|
||
|
//
|
||
|
for (; c != DONE32; c = CharacterIteration.previous32(text)) {
|
||
|
|
||
|
// look up the current character's character category, which tells us
|
||
|
// which column in the state table to look at.
|
||
|
//
|
||
|
// And off the dictionary flag bit. For reverse iteration it is not used.
|
||
|
category = (short) trie.get(c);
|
||
|
if (TRACE) {
|
||
|
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
|
||
|
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
|
||
|
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
|
||
|
}
|
||
|
|
||
|
// State Transition - move machine to its next state
|
||
|
//
|
||
|
assert(category < fRData.fHeader.fCatCount);
|
||
|
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
|
||
|
row = fRData.getRowIndex(state);
|
||
|
|
||
|
if (state == STOP_STATE) {
|
||
|
// This is the normal exit from the lookup state machine.
|
||
|
// Transition to state zero means we have found a safe point.
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// The state machine is done.
|
||
|
result = text.getIndex();
|
||
|
if (TRACE) {
|
||
|
System.out.println("result = " + result);
|
||
|
}
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Set the index of a CharacterIterator.
|
||
|
* Pin the index to the valid range range of BeginIndex <= index <= EndIndex.
|
||
|
* If the index points to a trail surrogate of a supplementary character, adjust it
|
||
|
* to the start (lead surrogate) index.
|
||
|
*
|
||
|
* @param ci A CharacterIterator to set
|
||
|
* @param index the index to set
|
||
|
* @return the resulting index, possibly pinned or adjusted.
|
||
|
*/
|
||
|
private static int CISetIndex32(CharacterIterator ci, int index) {
|
||
|
if (index <= ci.getBeginIndex()) {
|
||
|
ci.first();
|
||
|
} else if (index >= ci.getEndIndex()) {
|
||
|
ci.setIndex(ci.getEndIndex());
|
||
|
} else if (Character.isLowSurrogate(ci.setIndex(index))) {
|
||
|
if (!Character.isHighSurrogate(ci.previous())) {
|
||
|
ci.next();
|
||
|
}
|
||
|
}
|
||
|
return ci.getIndex();
|
||
|
}
|
||
|
|
||
|
/** DictionaryCache stores the boundaries obtained from a run of dictionary characters.
|
||
|
* Dictionary boundaries are moved first to this cache, then from here
|
||
|
* to the main BreakCache, where they may inter-leave with non-dictionary
|
||
|
* boundaries. The public BreakIterator API always fetches directly
|
||
|
* from the main BreakCache, not from here.
|
||
|
*
|
||
|
* In common situations, the number of boundaries in a single dictionary run
|
||
|
* should be quite small, it will be terminated by punctuation, spaces,
|
||
|
* or any other non-dictionary characters. The main BreakCache may end
|
||
|
* up with boundaries from multiple dictionary based runs.
|
||
|
*
|
||
|
* The boundaries are stored in a simple ArrayList (vector), with the
|
||
|
* assumption that they will be accessed sequentially.
|
||
|
*/
|
||
|
class DictionaryCache {
|
||
|
|
||
|
void reset() {
|
||
|
fPositionInCache = -1;
|
||
|
fStart = 0;
|
||
|
fLimit = 0;
|
||
|
fFirstRuleStatusIndex = 0;
|
||
|
fOtherRuleStatusIndex = 0;
|
||
|
fBreaks.removeAllElements();
|
||
|
};
|
||
|
|
||
|
boolean following(int fromPos) {
|
||
|
if (fromPos >= fLimit || fromPos < fStart) {
|
||
|
fPositionInCache = -1;
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// Sequential iteration, move from previous boundary to the following
|
||
|
|
||
|
int r = 0;
|
||
|
if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
|
||
|
++fPositionInCache;
|
||
|
if (fPositionInCache >= fBreaks.size()) {
|
||
|
fPositionInCache = -1;
|
||
|
return false;
|
||
|
}
|
||
|
r = fBreaks.elementAt(fPositionInCache);
|
||
|
assert(r > fromPos);
|
||
|
fBoundary = r;
|
||
|
fStatusIndex = fOtherRuleStatusIndex;
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
// Random indexing. Linear search for the boundary following the given position.
|
||
|
|
||
|
for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
|
||
|
r= fBreaks.elementAt(fPositionInCache);
|
||
|
if (r > fromPos) {
|
||
|
fBoundary = r;
|
||
|
fStatusIndex = fOtherRuleStatusIndex;
|
||
|
return true;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Internal error. fStart <= fromPos < fLimit, but no cached boundary.
|
||
|
assert(false);
|
||
|
fPositionInCache = -1;
|
||
|
return false;
|
||
|
};
|
||
|
|
||
|
boolean preceding(int fromPos) {
|
||
|
if (fromPos <= fStart || fromPos > fLimit) {
|
||
|
fPositionInCache = -1;
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
if (fromPos == fLimit) {
|
||
|
fPositionInCache = fBreaks.size() - 1;
|
||
|
if (fPositionInCache >= 0) {
|
||
|
assert(fBreaks.elementAt(fPositionInCache) == fromPos);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int r;
|
||
|
if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
|
||
|
--fPositionInCache;
|
||
|
r = fBreaks.elementAt(fPositionInCache);
|
||
|
assert(r < fromPos);
|
||
|
fBoundary = r;
|
||
|
fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
if (fPositionInCache == 0) {
|
||
|
fPositionInCache = -1;
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
|
||
|
r = fBreaks.elementAt(fPositionInCache);
|
||
|
if (r < fromPos) {
|
||
|
fBoundary = r;
|
||
|
fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
|
||
|
return true;
|
||
|
}
|
||
|
}
|
||
|
assert(false);
|
||
|
fPositionInCache = -1;
|
||
|
return false;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Populate the cache with the dictionary based boundaries within a region of text.
|
||
|
* @param startPos The start position of a range of text
|
||
|
* @param endPos The end position of a range of text
|
||
|
* @param firstRuleStatus The rule status index that applies to the break at startPos
|
||
|
* @param otherRuleStatus The rule status index that applies to boundaries other than startPos
|
||
|
* @hide draft / provisional / internal are hidden on Android
|
||
|
*/
|
||
|
void populateDictionary(int startPos, int endPos,
|
||
|
int firstRuleStatus, int otherRuleStatus) {
|
||
|
if ((endPos - startPos) <= 1) {
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
reset();
|
||
|
fFirstRuleStatusIndex = firstRuleStatus;
|
||
|
fOtherRuleStatusIndex = otherRuleStatus;
|
||
|
|
||
|
int rangeStart = startPos;
|
||
|
int rangeEnd = endPos;
|
||
|
|
||
|
int category;
|
||
|
int current;
|
||
|
int foundBreakCount = 0;
|
||
|
|
||
|
// Loop through the text, looking for ranges of dictionary characters.
|
||
|
// For each span, find the appropriate break engine, and ask it to find
|
||
|
// any breaks within the span.
|
||
|
|
||
|
fText.setIndex(rangeStart);
|
||
|
int c = CharacterIteration.current32(fText);
|
||
|
category = (short)fRData.fTrie.get(c);
|
||
|
int dictStart = fRData.fFTable.fDictCategoriesStart;
|
||
|
|
||
|
while(true) {
|
||
|
while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
|
||
|
c = CharacterIteration.next32(fText); // pre-increment
|
||
|
category = (short)fRData.fTrie.get(c);
|
||
|
}
|
||
|
if (current >= rangeEnd) {
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
// We now have a dictionary character. Get the appropriate language object
|
||
|
// to deal with it.
|
||
|
LanguageBreakEngine lbe = getLanguageBreakEngine(c);
|
||
|
|
||
|
// Ask the language object if there are any breaks. It will add them to the cache and
|
||
|
// leave the text pointer on the other side of its range, ready to search for the next one.
|
||
|
if (lbe != null) {
|
||
|
foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking);
|
||
|
}
|
||
|
|
||
|
// Reload the loop variables for the next go-round
|
||
|
c = CharacterIteration.current32(fText);
|
||
|
category = (short)fRData.fTrie.get(c);
|
||
|
}
|
||
|
|
||
|
// If we found breaks, ensure that the first and last entries are
|
||
|
// the original starting and ending position. And initialize the
|
||
|
// cache iteration position to the first entry.
|
||
|
|
||
|
// System.out.printf("foundBreakCount = %d%n", foundBreakCount);
|
||
|
if (foundBreakCount > 0) {
|
||
|
assert(foundBreakCount == fBreaks.size());
|
||
|
if (startPos < fBreaks.elementAt(0)) {
|
||
|
// The dictionary did not place a boundary at the start of the segment of text.
|
||
|
// Add one now. This should not commonly happen, but it would be easy for interactions
|
||
|
// of the rules for dictionary segments and the break engine implementations to
|
||
|
// inadvertently cause it. Cover it here, just in case.
|
||
|
fBreaks.offer(startPos);
|
||
|
}
|
||
|
if (endPos > fBreaks.peek()) {
|
||
|
fBreaks.push(endPos);
|
||
|
}
|
||
|
fPositionInCache = 0;
|
||
|
// Note: Dictionary matching may extend beyond the original limit.
|
||
|
fStart = fBreaks.elementAt(0);
|
||
|
fLimit = fBreaks.peek();
|
||
|
} else {
|
||
|
// there were no language-based breaks, even though the segment contained
|
||
|
// dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
|
||
|
// for this range will fail, and the calling code will fall back to the rule based boundaries.
|
||
|
}
|
||
|
|
||
|
};
|
||
|
|
||
|
|
||
|
DictionaryCache() {
|
||
|
fPositionInCache = -1;
|
||
|
fBreaks = new DictionaryBreakEngine.DequeI();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* copy constructor. Used by RuleBasedBreakIterator.clone().
|
||
|
*
|
||
|
* @param src the source object to be copied.
|
||
|
*/
|
||
|
DictionaryCache(DictionaryCache src) {
|
||
|
try {
|
||
|
fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone();
|
||
|
}
|
||
|
catch (CloneNotSupportedException e) {
|
||
|
throw new RuntimeException(e);
|
||
|
}
|
||
|
fPositionInCache = src.fPositionInCache;
|
||
|
fStart = src.fStart;
|
||
|
fLimit = src.fLimit;
|
||
|
fFirstRuleStatusIndex = src.fFirstRuleStatusIndex;
|
||
|
fOtherRuleStatusIndex = src.fOtherRuleStatusIndex;
|
||
|
fBoundary = src.fBoundary;
|
||
|
fStatusIndex = src.fStatusIndex;
|
||
|
}
|
||
|
|
||
|
// A data structure containing the boundaries themselves. Essentially a vector of raw ints.
|
||
|
DictionaryBreakEngine.DequeI fBreaks;
|
||
|
int fPositionInCache; // Index in fBreaks of last boundary returned by following()
|
||
|
// // or preceding(). Optimizes sequential access.
|
||
|
int fStart; // Text position of first boundary in cache.
|
||
|
int fLimit; // Last boundary in cache. Which is the limit of the
|
||
|
// // text segment being handled by the dictionary.
|
||
|
int fFirstRuleStatusIndex; // Rule status info for first boundary.
|
||
|
int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries.
|
||
|
int fBoundary; // Current boundary. Set by preceding(), following().
|
||
|
int fStatusIndex; // Current rule status index. Set by preceding, following().
|
||
|
};
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/*
|
||
|
* class BreakCache
|
||
|
*
|
||
|
* Cache of break boundary positions and rule status values.
|
||
|
* Break iterator API functions, next(), previous(), etc., will use cached results
|
||
|
* when possible, and otherwise cache new results as they are obtained.
|
||
|
*
|
||
|
* Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
|
||
|
*
|
||
|
* The cache is implemented as a single circular buffer.
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
* size of the circular cache buffer.
|
||
|
*/
|
||
|
|
||
|
class BreakCache {
|
||
|
|
||
|
BreakCache() {
|
||
|
reset();
|
||
|
};
|
||
|
|
||
|
void reset(int pos, int ruleStatus) {
|
||
|
fStartBufIdx = 0;
|
||
|
fEndBufIdx = 0;
|
||
|
fTextIdx = pos;
|
||
|
fBufIdx = 0;
|
||
|
fBoundaries[0] = pos;
|
||
|
fStatuses[0] = (short)ruleStatus;
|
||
|
}
|
||
|
|
||
|
void reset() {reset(0, 0); };
|
||
|
|
||
|
void next() {
|
||
|
if (fBufIdx == fEndBufIdx) {
|
||
|
fDone = !populateFollowing();
|
||
|
fPosition = fTextIdx;
|
||
|
fRuleStatusIndex = fStatuses[fBufIdx];
|
||
|
} else {
|
||
|
fBufIdx = modChunkSize(fBufIdx + 1);
|
||
|
fTextIdx = fPosition = fBoundaries[fBufIdx];
|
||
|
fRuleStatusIndex = fStatuses[fBufIdx];
|
||
|
}
|
||
|
};
|
||
|
|
||
|
void previous() {
|
||
|
int initialBufIdx = fBufIdx;
|
||
|
if (fBufIdx == fStartBufIdx) {
|
||
|
// At start of cache. Prepend to it.
|
||
|
populatePreceding();
|
||
|
} else {
|
||
|
// Cache already holds the next boundary
|
||
|
fBufIdx = modChunkSize(fBufIdx - 1);
|
||
|
fTextIdx = fBoundaries[fBufIdx];
|
||
|
}
|
||
|
fDone = (fBufIdx == initialBufIdx);
|
||
|
fPosition = fTextIdx;
|
||
|
fRuleStatusIndex = fStatuses[fBufIdx];
|
||
|
return;
|
||
|
};
|
||
|
|
||
|
// Move the iteration state to the position following the startPosition.
|
||
|
// Input position must be pinned to the input length.
|
||
|
void following(int startPos) {
|
||
|
if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
|
||
|
// startPos is in the cache. Do a next() from that position.
|
||
|
// TODO: an awkward set of interactions with bi->fDone
|
||
|
// seek() does not clear it; it can't because of interactions with populateNear().
|
||
|
// next() does not clear it in the fast-path case, where everything matters. Maybe it should.
|
||
|
// So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
|
||
|
fDone = false;
|
||
|
next();
|
||
|
}
|
||
|
|
||
|
};
|
||
|
|
||
|
void preceding(int startPos) {
|
||
|
if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
|
||
|
if (startPos == fTextIdx) {
|
||
|
previous();
|
||
|
} else {
|
||
|
// seek() leaves the BreakCache positioned at the preceding boundary
|
||
|
// if the requested position is between two boundaries.
|
||
|
// current() pushes the BreakCache position out to the BreakIterator itself.
|
||
|
assert(startPos > fTextIdx);
|
||
|
current();
|
||
|
}
|
||
|
}
|
||
|
return;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Update the state of the public BreakIterator (fBI) to reflect the
|
||
|
* current state of the break iterator cache (this).
|
||
|
*/
|
||
|
int current() {
|
||
|
fPosition = fTextIdx;
|
||
|
fRuleStatusIndex = fStatuses[fBufIdx];
|
||
|
fDone = false;
|
||
|
return fTextIdx;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Add boundaries to the cache near the specified position.
|
||
|
* The given position need not be a boundary itself.
|
||
|
* The input position must be within the range of the text, and
|
||
|
* on a code point boundary.
|
||
|
* If the requested position is a break boundary, leave the iteration
|
||
|
* position on it.
|
||
|
* If the requested position is not a boundary, leave the iteration
|
||
|
* position on the preceding boundary and include both the the
|
||
|
* preceding and following boundaries in the cache.
|
||
|
* Additional boundaries, either preceding or following, may be added
|
||
|
* to the cache as a side effect.
|
||
|
*
|
||
|
* Return false if the operation failed.
|
||
|
*/
|
||
|
boolean populateNear(int position) {
|
||
|
assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
|
||
|
|
||
|
// Add boundaries to the cache near the specified position.
|
||
|
// The given position need not be a boundary itself.
|
||
|
// The input position must be within the range of the text, and
|
||
|
// on a code point boundary.
|
||
|
// If the requested position is a break boundary, leave the iteration
|
||
|
// position on it.
|
||
|
// If the requested position is not a boundary, leave the iteration
|
||
|
// position on the preceding boundary and include both the
|
||
|
// preceding and following boundaries in the cache.
|
||
|
// Additional boundaries, either preceding or following, may be added
|
||
|
// to the cache as a side effect.
|
||
|
|
||
|
// If the requested position is not near already cached positions, clear the existing cache,
|
||
|
// find a near-by boundary and begin new cache contents there.
|
||
|
|
||
|
// Threshold for a text position to be considered near to existing cache contents.
|
||
|
// TODO: See issue ICU-22024 "perf tuning of Cache needed."
|
||
|
// This value is subject to change. See the ticket for more details.
|
||
|
final int CACHE_NEAR = 15;
|
||
|
|
||
|
int startOfText = fText.getBeginIndex();
|
||
|
int aBoundary = -1;
|
||
|
int ruleStatusIndex = 0;
|
||
|
boolean retainCache = false;
|
||
|
if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
|
||
|
// Requested position is near the existing cache. Retain it.
|
||
|
retainCache = true;
|
||
|
} else if (position <= startOfText + CACHE_NEAR) {
|
||
|
// Requested position is near the start of the text. Fill cache from start, skipping
|
||
|
// the need to find a safe point.
|
||
|
retainCache = false;
|
||
|
aBoundary = startOfText;
|
||
|
} else {
|
||
|
// Requested position is not near the existing cache.
|
||
|
// Find a safe point to refill the cache from.
|
||
|
int backupPos = handleSafePrevious(position);
|
||
|
|
||
|
if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
|
||
|
// The requested position is beyond the end of the existing cache, but the
|
||
|
// reverse rules produced a position near or before the cached region.
|
||
|
// Retain the existing cache, and fill from the end of it.
|
||
|
retainCache = true;
|
||
|
} else if (backupPos < startOfText + CACHE_NEAR) {
|
||
|
// The safe reverse rules moved us to near the start of text.
|
||
|
// Take that (index 0) as the backup boundary, avoiding the complication
|
||
|
// (in the following block) of moving forward from the safe point to a known boundary.
|
||
|
//
|
||
|
// Retain the cache if it begins not too far from the requested position.
|
||
|
aBoundary = startOfText;
|
||
|
retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
|
||
|
} else {
|
||
|
// The safe reverse rules produced a position that is neither near the existing
|
||
|
// cache, nor near the start of text.
|
||
|
// Advance to the boundary following.
|
||
|
// There is a complication: the safe reverse rules identify pairs of code points
|
||
|
// that are safe. If advancing from the safe point moves forwards by less than
|
||
|
// two code points, we need to advance one more time to ensure that the boundary
|
||
|
// is good, including a correct rules status value.
|
||
|
//
|
||
|
retainCache = false;
|
||
|
fPosition = backupPos;
|
||
|
aBoundary = handleNext();
|
||
|
if (aBoundary == backupPos + 1 ||
|
||
|
(aBoundary == backupPos + 2 &&
|
||
|
Character.isHighSurrogate(fText.setIndex(backupPos)) &&
|
||
|
Character.isLowSurrogate(fText.next()))) {
|
||
|
// The initial handleNext() only advanced by a single code point. Go again.
|
||
|
// Safe rules identify safe pairs.
|
||
|
aBoundary = handleNext();
|
||
|
}
|
||
|
if (aBoundary == BreakIterator.DONE) {
|
||
|
aBoundary = fText.getEndIndex();
|
||
|
}
|
||
|
ruleStatusIndex = fRuleStatusIndex;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (!retainCache) {
|
||
|
assert(aBoundary != -1);
|
||
|
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
|
||
|
}
|
||
|
|
||
|
|
||
|
// Fill in boundaries between existing cache content and the new requested position.
|
||
|
|
||
|
if (fBoundaries[fEndBufIdx] < position) {
|
||
|
// The last position in the cache precedes the requested position.
|
||
|
// Add following position(s) to the cache.
|
||
|
while (fBoundaries[fEndBufIdx] < position) {
|
||
|
if (!populateFollowing()) {
|
||
|
assert false;
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer.
|
||
|
fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries.
|
||
|
while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos.
|
||
|
previous();
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
if (fBoundaries[fStartBufIdx] > position) {
|
||
|
// The first position in the cache is beyond the requested position.
|
||
|
// back up more until we get a boundary <= the requested position.
|
||
|
while (fBoundaries[fStartBufIdx] > position) {
|
||
|
populatePreceding();
|
||
|
}
|
||
|
fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer.
|
||
|
fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries.
|
||
|
while (fTextIdx < position) { // Move forwards to a position at or following the requested pos.
|
||
|
next();
|
||
|
}
|
||
|
if (fTextIdx > position) {
|
||
|
// If position is not itself a boundary, the next() loop above will overshoot.
|
||
|
// Back up one, leaving cache position at the boundary preceding the requested position.
|
||
|
previous();
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
assert fTextIdx == position;
|
||
|
return true;
|
||
|
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Add boundary(s) to the cache following the current last boundary.
|
||
|
* Return false if at the end of the text, and no more boundaries can be added.
|
||
|
* Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
|
||
|
*/
|
||
|
boolean populateFollowing() {
|
||
|
int fromPosition = fBoundaries[fEndBufIdx];
|
||
|
int fromRuleStatusIdx = fStatuses[fEndBufIdx];
|
||
|
int pos = 0;
|
||
|
int ruleStatusIdx = 0;
|
||
|
|
||
|
if (fDictionaryCache.following(fromPosition)) {
|
||
|
addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
fPosition = fromPosition;
|
||
|
pos = handleNext();
|
||
|
if (pos == BreakIterator.DONE) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
ruleStatusIdx = fRuleStatusIndex;
|
||
|
if (fDictionaryCharCount > 0) {
|
||
|
// The text segment obtained from the rules includes dictionary characters.
|
||
|
// Subdivide it, with subdivided results going into the dictionary cache.
|
||
|
fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
|
||
|
if (fDictionaryCache.following(fromPosition)) {
|
||
|
addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
|
||
|
return true;
|
||
|
// TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point.
|
||
|
// But be careful with interactions with populateNear().
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Rule based segment did not include dictionary characters.
|
||
|
// Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
|
||
|
// meaning that we didn't take the return, above.
|
||
|
// Add its end point to the cache.
|
||
|
addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
|
||
|
|
||
|
// Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
|
||
|
// (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
|
||
|
//
|
||
|
for (int count=0; count<6; ++count) {
|
||
|
pos = handleNext();
|
||
|
if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) {
|
||
|
break;
|
||
|
}
|
||
|
addFollowing(pos, fRuleStatusIndex, RetainCachePosition);
|
||
|
}
|
||
|
return true;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Add one or more boundaries to the cache preceding the first currently cached boundary.
|
||
|
* Leave the iteration position on the first added boundary.
|
||
|
* Return false if no boundaries could be added (if at the start of the text.)
|
||
|
*/
|
||
|
boolean populatePreceding() {
|
||
|
int textBegin = fText.getBeginIndex();
|
||
|
int fromPosition = fBoundaries[fStartBufIdx];
|
||
|
if (fromPosition == textBegin) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
int position = textBegin;
|
||
|
int positionStatusIdx = 0;
|
||
|
|
||
|
if (fDictionaryCache.preceding(fromPosition)) {
|
||
|
addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
int backupPosition = fromPosition;
|
||
|
|
||
|
// Find a boundary somewhere preceding the first already-cached boundary
|
||
|
do {
|
||
|
backupPosition = backupPosition - 30;
|
||
|
if (backupPosition <= textBegin) {
|
||
|
backupPosition = textBegin;
|
||
|
} else {
|
||
|
backupPosition = handleSafePrevious(backupPosition);
|
||
|
}
|
||
|
if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
|
||
|
position = textBegin;
|
||
|
positionStatusIdx = 0;
|
||
|
} else {
|
||
|
// Advance to the boundary following the backup position.
|
||
|
// There is a complication: the safe reverse rules identify pairs of code points
|
||
|
// that are safe. If advancing from the safe point moves forwards by less than
|
||
|
// two code points, we need to advance one more time to ensure that the boundary
|
||
|
// is good, including a correct rules status value.
|
||
|
//
|
||
|
fPosition = backupPosition; // TODO: pass starting position in a clearer way.
|
||
|
position = handleNext();
|
||
|
if (position == backupPosition + 1 ||
|
||
|
(position == backupPosition + 2 &&
|
||
|
Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
|
||
|
Character.isLowSurrogate(fText.next()))) {
|
||
|
// The initial handleNext() only advanced by a single code point. Go again.
|
||
|
// Safe rules identify safe pairs.
|
||
|
position = handleNext();
|
||
|
}
|
||
|
positionStatusIdx = fRuleStatusIndex;
|
||
|
}
|
||
|
} while (position >= fromPosition);
|
||
|
|
||
|
// Find boundaries between the one we just located and the first already-cached boundary
|
||
|
// Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
|
||
|
|
||
|
fSideBuffer.removeAllElements();
|
||
|
fSideBuffer.push(position);
|
||
|
fSideBuffer.push(positionStatusIdx);
|
||
|
|
||
|
do {
|
||
|
int prevPosition = fPosition = position;
|
||
|
int prevStatusIdx = positionStatusIdx;
|
||
|
position = handleNext();
|
||
|
positionStatusIdx = fRuleStatusIndex;
|
||
|
if (position == BreakIterator.DONE) {
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
boolean segmentHandledByDictionary = false;
|
||
|
if (fDictionaryCharCount != 0) {
|
||
|
// Segment from the rules includes dictionary characters.
|
||
|
// Subdivide it, with subdivided results going into the dictionary cache.
|
||
|
int dictSegEndPosition = position;
|
||
|
fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
|
||
|
while (fDictionaryCache.following(prevPosition)) {
|
||
|
position = fDictionaryCache.fBoundary;
|
||
|
positionStatusIdx = fDictionaryCache.fStatusIndex;
|
||
|
segmentHandledByDictionary = true;
|
||
|
assert(position > prevPosition);
|
||
|
if (position >= fromPosition) {
|
||
|
break;
|
||
|
}
|
||
|
assert(position <= dictSegEndPosition);
|
||
|
fSideBuffer.push(position);
|
||
|
fSideBuffer.push(positionStatusIdx);
|
||
|
prevPosition = position;
|
||
|
}
|
||
|
assert(position==dictSegEndPosition || position>=fromPosition);
|
||
|
}
|
||
|
|
||
|
if (!segmentHandledByDictionary && position < fromPosition) {
|
||
|
fSideBuffer.push(position);
|
||
|
fSideBuffer.push(positionStatusIdx);
|
||
|
}
|
||
|
} while (position < fromPosition);
|
||
|
|
||
|
// Move boundaries from the side buffer to the main circular buffer.
|
||
|
boolean success = false;
|
||
|
if (!fSideBuffer.isEmpty()) {
|
||
|
positionStatusIdx = fSideBuffer.pop();
|
||
|
position = fSideBuffer.pop();
|
||
|
addPreceding(position, positionStatusIdx, UpdateCachePosition);
|
||
|
success = true;
|
||
|
}
|
||
|
|
||
|
while (!fSideBuffer.isEmpty()) {
|
||
|
positionStatusIdx = fSideBuffer.pop();
|
||
|
position = fSideBuffer.pop();
|
||
|
if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
|
||
|
// No space in circular buffer to hold a new preceding result while
|
||
|
// also retaining the current cache (iteration) position.
|
||
|
// Bailing out is safe; the cache will refill again if needed.
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
return success;
|
||
|
};
|
||
|
|
||
|
|
||
|
static final boolean RetainCachePosition = false;
|
||
|
static final boolean UpdateCachePosition = true;
|
||
|
|
||
|
/**
|
||
|
* Add the boundary following the current position.
|
||
|
* The current position can be left as it was, or changed to the newly added boundary,
|
||
|
* as specified by the update parameter.
|
||
|
*/
|
||
|
void addFollowing(int position, int ruleStatusIdx, boolean update) {
|
||
|
assert(position > fBoundaries[fEndBufIdx]);
|
||
|
assert(ruleStatusIdx <= Short.MAX_VALUE);
|
||
|
int nextIdx = modChunkSize(fEndBufIdx + 1);
|
||
|
if (nextIdx == fStartBufIdx) {
|
||
|
fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1.
|
||
|
}
|
||
|
fBoundaries[nextIdx] = position;
|
||
|
fStatuses[nextIdx] = (short)ruleStatusIdx;
|
||
|
fEndBufIdx = nextIdx;
|
||
|
if (update == UpdateCachePosition) {
|
||
|
// Set current position to the newly added boundary.
|
||
|
fBufIdx = nextIdx;
|
||
|
fTextIdx = position;
|
||
|
} else {
|
||
|
// Retaining the original cache position.
|
||
|
// Check if the added boundary wraps around the buffer, and would over-write the original position.
|
||
|
// It's the responsibility of callers of this function to not add too many.
|
||
|
assert(nextIdx != fBufIdx);
|
||
|
}
|
||
|
|
||
|
};
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Add the boundary preceding the current position.
|
||
|
* The current position can be left as it was, or changed to the newly added boundary,
|
||
|
* as specified by the update parameter.
|
||
|
*/
|
||
|
boolean addPreceding(int position, int ruleStatusIdx, boolean update) {
|
||
|
assert(position < fBoundaries[fStartBufIdx]);
|
||
|
assert(ruleStatusIdx <= Short.MAX_VALUE);
|
||
|
int nextIdx = modChunkSize(fStartBufIdx - 1);
|
||
|
if (nextIdx == fEndBufIdx) {
|
||
|
if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
|
||
|
// Failure. The insertion of the new boundary would claim the buffer position that is the
|
||
|
// current iteration position. And we also want to retain the current iteration position.
|
||
|
// (The buffer is already completely full of entries that precede the iteration position.)
|
||
|
return false;
|
||
|
}
|
||
|
fEndBufIdx = modChunkSize(fEndBufIdx - 1);
|
||
|
}
|
||
|
fBoundaries[nextIdx] = position;
|
||
|
fStatuses[nextIdx] = (short)ruleStatusIdx;
|
||
|
fStartBufIdx = nextIdx;
|
||
|
if (update == UpdateCachePosition) {
|
||
|
fBufIdx = nextIdx;
|
||
|
fTextIdx = position;
|
||
|
}
|
||
|
return true;
|
||
|
};
|
||
|
|
||
|
/**
|
||
|
* Set the cache position to the specified position, or, if the position
|
||
|
* falls between to cached boundaries, to the preceding boundary.
|
||
|
* Fails if the requested position is outside of the range of boundaries currently held by the cache.
|
||
|
* The startPosition must be on a code point boundary.
|
||
|
*
|
||
|
* Return true if successful, false if the specified position is after
|
||
|
* the last cached boundary or before the first.
|
||
|
*/
|
||
|
boolean seek(int pos) {
|
||
|
if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
|
||
|
return false;
|
||
|
}
|
||
|
if (pos == fBoundaries[fStartBufIdx]) {
|
||
|
// Common case: seek(0), from BreakIterator::first()
|
||
|
fBufIdx = fStartBufIdx;
|
||
|
fTextIdx = fBoundaries[fBufIdx];
|
||
|
return true;
|
||
|
}
|
||
|
if (pos == fBoundaries[fEndBufIdx]) {
|
||
|
fBufIdx = fEndBufIdx;
|
||
|
fTextIdx = fBoundaries[fBufIdx];
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
int min = fStartBufIdx;
|
||
|
int max = fEndBufIdx;
|
||
|
while (min != max) {
|
||
|
int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
|
||
|
probe = modChunkSize(probe);
|
||
|
if (fBoundaries[probe] > pos) {
|
||
|
max = probe;
|
||
|
} else {
|
||
|
min = modChunkSize(probe + 1);
|
||
|
}
|
||
|
}
|
||
|
assert(fBoundaries[max] > pos);
|
||
|
fBufIdx = modChunkSize(max - 1);
|
||
|
fTextIdx = fBoundaries[fBufIdx];
|
||
|
assert(fTextIdx <= pos);
|
||
|
return true;
|
||
|
|
||
|
};
|
||
|
|
||
|
|
||
|
/**
|
||
|
* copy constructor, used from RuleBasedBreakIterator.clone().
|
||
|
*
|
||
|
* @param src
|
||
|
*/
|
||
|
BreakCache(BreakCache src) {
|
||
|
fStartBufIdx = src.fStartBufIdx;
|
||
|
fEndBufIdx = src.fEndBufIdx;
|
||
|
fTextIdx = src.fTextIdx;
|
||
|
fBufIdx = src.fBufIdx;
|
||
|
fBoundaries = src.fBoundaries.clone();
|
||
|
fStatuses = src.fStatuses.clone();
|
||
|
fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents.
|
||
|
}
|
||
|
|
||
|
void dumpCache() {
|
||
|
System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx);
|
||
|
for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) {
|
||
|
System.out.printf("%d %d%n", i, fBoundaries[i]);
|
||
|
if (i == fEndBufIdx) {
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
};
|
||
|
|
||
|
private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); };
|
||
|
|
||
|
static final int CACHE_SIZE = 128;
|
||
|
// static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
|
||
|
|
||
|
int fStartBufIdx;
|
||
|
int fEndBufIdx; // inclusive
|
||
|
|
||
|
int fTextIdx;
|
||
|
int fBufIdx;
|
||
|
|
||
|
int[] fBoundaries = new int[CACHE_SIZE];
|
||
|
short[] fStatuses = new short[CACHE_SIZE];
|
||
|
|
||
|
DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI();
|
||
|
};
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
}
|
||
|
|