2068 lines
80 KiB
Java
2068 lines
80 KiB
Java
/* GENERATED SOURCE. DO NOT MODIFY. */
|
|
// © 2016 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
/*
|
|
*******************************************************************************
|
|
* Copyright (C) 1996-2016, International Business Machines Corporation and
|
|
* others. All Rights Reserved.
|
|
*******************************************************************************
|
|
*/
|
|
package android.icu.text;
|
|
|
|
import java.text.CharacterIterator;
|
|
import java.text.StringCharacterIterator;
|
|
import java.util.Locale;
|
|
|
|
import android.icu.util.ICUException;
|
|
import android.icu.util.ULocale;
|
|
|
|
// Java porting note:
|
|
//
|
|
// The ICU4C implementation contains dead code in many places.
|
|
// While porting the ICU4C linear search implementation, this dead code
|
|
// was not fully ported. The code blocks tagged by "// *** Boyer-Moore ***"
|
|
// are those dead code blocks, still available in ICU4C.
|
|
|
|
// The ICU4C implementation does not seem to handle UCharacterIterator pointing
|
|
// to a fragment of text properly. ICU4J uses CharacterIterator to navigate through
|
|
// the input text. We need to carefully review the code ported from ICU4C
|
|
// assuming the start index is 0.
|
|
|
|
// ICU4C implementation initializes pattern.CE and pattern.PCE. It looks like
|
|
// CE is no longer used, except in a few places checking CELength. It looks like this
|
|
// is a leftover from already-disabled Boyer-Moore search code. This Java implementation
|
|
// preserves the code, but we should clean this up later.
|
|
|
|
/**
|
|
*
|
|
* <tt>StringSearch</tt> is a {@link SearchIterator} that provides
|
|
* language-sensitive text searching based on the comparison rules defined
|
|
* in a {@link RuleBasedCollator} object.
|
|
* StringSearch ensures that language eccentricity can be
|
|
* handled, e.g. for the German collator, characters ß and SS will be matched
|
|
* if case is chosen to be ignored.
|
|
* See the <a href="https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/collation/ICU_collation_design.htm">
|
|
* "ICU Collation Design Document"</a> for more information.
|
|
* <p>
|
|
* There are 2 match options for selection:<br>
|
|
* Let S' be the sub-string of a text string S between the offsets start and
|
|
* end [start, end].
|
|
* <br>
|
|
* A pattern string P matches a text string S at the offsets [start, end]
|
|
* if
|
|
* <pre>
|
|
* option 1. Some canonical equivalent of P matches some canonical equivalent
|
|
* of S'
|
|
* option 2. P matches S' and if P starts or ends with a combining mark,
|
|
* there exists no non-ignorable combining mark before or after S?
|
|
* in S respectively.
|
|
* </pre>
|
|
* Option 2. is the default.
|
|
* <p>
|
|
* This search has APIs similar to that of other text iteration mechanisms
|
|
* such as the break iterators in {@link BreakIterator}. Using these
|
|
* APIs, it is easy to scan through text looking for all occurrences of
|
|
* a given pattern. This search iterator allows changing of direction by
|
|
* calling a {@link #reset} followed by a {@link #next} or {@link #previous}.
|
|
* Though a direction change can occur without calling {@link #reset} first,
|
|
* this operation comes with some speed penalty.
|
|
* Match results in the forward direction will match the result matches in
|
|
* the backwards direction in the reverse order
|
|
* <p>
|
|
* {@link SearchIterator} provides APIs to specify the starting position
|
|
* within the text string to be searched, e.g. {@link SearchIterator#setIndex setIndex},
|
|
* {@link SearchIterator#preceding preceding} and {@link SearchIterator#following following}.
|
|
* Since the starting position will be set as it is specified, please take note that
|
|
* there are some danger points at which the search may render incorrect
|
|
* results:
|
|
* <ul>
|
|
* <li> In the midst of a substring that requires normalization.
|
|
* <li> If the following match is to be found, the position should not be the
|
|
* second character which requires swapping with the preceding
|
|
* character. Vice versa, if the preceding match is to be found, the
|
|
* position to search from should not be the first character which
|
|
* requires swapping with the next character. E.g certain Thai and
|
|
* Lao characters require swapping.
|
|
* <li> If a following pattern match is to be found, any position within a
|
|
* contracting sequence except the first will fail. Vice versa if a
|
|
* preceding pattern match is to be found, an invalid starting point
|
|
* would be any character within a contracting sequence except the last.
|
|
* </ul>
|
|
* <p>
|
|
* A {@link BreakIterator} can be used if only matches at logical breaks are desired.
|
|
* Using a {@link BreakIterator} will only give you results that exactly matches the
|
|
* boundaries given by the {@link BreakIterator}. For instance the pattern "e" will
|
|
* not be found in the string "\u00e9" if a character break iterator is used.
|
|
* <p>
|
|
* Options are provided to handle overlapping matches.
|
|
* E.g. In English, overlapping matches produces the result 0 and 2
|
|
* for the pattern "abab" in the text "ababab", where mutually
|
|
* exclusive matches only produces the result of 0.
|
|
* <p>
|
|
* Options are also provided to implement "asymmetric search" as described in
|
|
* <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search">
|
|
* UTS #10 Unicode Collation Algorithm</a>, specifically the ElementComparisonType
|
|
* values.
|
|
* <p>
|
|
* Though collator attributes will be taken into consideration while
|
|
* performing matches, there are no APIs here for setting and getting the
|
|
* attributes. These attributes can be set by getting the collator
|
|
* from {@link #getCollator} and using the APIs in {@link RuleBasedCollator}.
|
|
* Lastly to update <tt>StringSearch</tt> to the new collator attributes,
|
|
* {@link #reset} has to be called.
|
|
* <p>
|
|
* Restriction: <br>
|
|
* Currently there are no composite characters that consists of a
|
|
* character with combining class > 0 before a character with combining
|
|
* class == 0. However, if such a character exists in the future,
|
|
* <tt>StringSearch</tt> does not guarantee the results for option 1.
|
|
* <p>
|
|
* Consult the {@link SearchIterator} documentation for information on
|
|
* and examples of how to use instances of this class to implement text
|
|
* searching.
|
|
* <p>
|
|
* Note, <tt>StringSearch</tt> is not to be subclassed.
|
|
* </p>
|
|
* @see SearchIterator
|
|
* @see RuleBasedCollator
|
|
* @author Laura Werner, synwee
|
|
*/
|
|
// internal notes: all methods do not guarantee the correct status of the
|
|
// characteriterator. the caller has to maintain the original index position
|
|
// if necessary. methods could change the index position as it deems fit
|
|
public final class StringSearch extends SearchIterator {
|
|
|
|
private Pattern pattern_;
|
|
private RuleBasedCollator collator_;
|
|
|
|
// positions within the collation element iterator is used to determine
|
|
// if we are at the start of the text.
|
|
private CollationElementIterator textIter_;
|
|
private CollationPCE textProcessedIter_;
|
|
|
|
// utility collation element, used throughout program for temporary
|
|
// iteration.
|
|
private CollationElementIterator utilIter_;
|
|
|
|
private Normalizer2 nfd_;
|
|
|
|
private int strength_;
|
|
int ceMask_;
|
|
int variableTop_;
|
|
|
|
private boolean toShift_;
|
|
|
|
// *** Boyer-Moore ***
|
|
// private char[] canonicalPrefixAccents_;
|
|
// private char[] canonicalSuffixAccents_;
|
|
|
|
/**
|
|
* Initializes the iterator to use the language-specific rules defined in
|
|
* the argument collator to search for argument pattern in the argument
|
|
* target text. The argument <code>breakiter</code> is used to define logical matches.
|
|
* See super class documentation for more details on the use of the target
|
|
* text and {@link BreakIterator}.
|
|
* @param pattern text to look for.
|
|
* @param target target text to search for pattern.
|
|
* @param collator {@link RuleBasedCollator} that defines the language rules
|
|
* @param breakiter A {@link BreakIterator} that is used to determine the
|
|
* boundaries of a logical match. This argument can be null.
|
|
* @throws IllegalArgumentException thrown when argument target is null,
|
|
* or of length 0
|
|
* @see BreakIterator
|
|
* @see RuleBasedCollator
|
|
*/
|
|
public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator,
|
|
BreakIterator breakiter) {
|
|
|
|
// This implementation is ported from ICU4C usearch_open()
|
|
|
|
super(target, breakiter);
|
|
|
|
// string search does not really work when numeric collation is turned on
|
|
if (collator.getNumericCollation()) {
|
|
throw new UnsupportedOperationException("Numeric collation is not supported by StringSearch");
|
|
}
|
|
|
|
collator_ = collator;
|
|
strength_ = collator.getStrength();
|
|
ceMask_ = getMask(strength_);
|
|
toShift_ = collator.isAlternateHandlingShifted();
|
|
variableTop_ = collator.getVariableTop();
|
|
|
|
nfd_ = Normalizer2.getNFDInstance();
|
|
|
|
pattern_ = new Pattern(pattern);
|
|
|
|
search_.setMatchedLength(0);
|
|
search_.matchedIndex_ = DONE;
|
|
|
|
utilIter_ = null;
|
|
textIter_ = new CollationElementIterator(target, collator);
|
|
|
|
textProcessedIter_ = null;
|
|
|
|
// This is done by super class constructor
|
|
/*
|
|
search_.isOverlap_ = false;
|
|
search_.isCanonicalMatch_ = false;
|
|
search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
|
|
search_.isForwardSearching_ = true;
|
|
search_.reset_ = true;
|
|
*/
|
|
ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE);
|
|
search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale);
|
|
search_.internalBreakIter_.setText((CharacterIterator)target.clone()); // We need to create a clone
|
|
|
|
initialize();
|
|
}
|
|
|
|
/**
|
|
* Initializes the iterator to use the language-specific rules defined in
|
|
* the argument collator to search for argument pattern in the argument
|
|
* target text. No {@link BreakIterator}s are set to test for logical matches.
|
|
* @param pattern text to look for.
|
|
* @param target target text to search for pattern.
|
|
* @param collator {@link RuleBasedCollator} that defines the language rules
|
|
* @throws IllegalArgumentException thrown when argument target is null,
|
|
* or of length 0
|
|
* @see RuleBasedCollator
|
|
*/
|
|
public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator) {
|
|
this(pattern, target, collator, null);
|
|
}
|
|
|
|
/**
|
|
* Initializes the iterator to use the language-specific rules and
|
|
* break iterator rules defined in the argument locale to search for
|
|
* argument pattern in the argument target text.
|
|
* @param pattern text to look for.
|
|
* @param target target text to search for pattern.
|
|
* @param locale locale to use for language and break iterator rules
|
|
* @throws IllegalArgumentException thrown when argument target is null,
|
|
* or of length 0. ClassCastException thrown if the collator for
|
|
* the specified locale is not a RuleBasedCollator.
|
|
*/
|
|
public StringSearch(String pattern, CharacterIterator target, Locale locale) {
|
|
this(pattern, target, ULocale.forLocale(locale));
|
|
}
|
|
|
|
/**
|
|
* Initializes the iterator to use the language-specific rules and
|
|
* break iterator rules defined in the argument locale to search for
|
|
* argument pattern in the argument target text.
|
|
* See super class documentation for more details on the use of the target
|
|
* text and {@link BreakIterator}.
|
|
* @param pattern text to look for.
|
|
* @param target target text to search for pattern.
|
|
* @param locale locale to use for language and break iterator rules
|
|
* @throws IllegalArgumentException thrown when argument target is null,
|
|
* or of length 0. ClassCastException thrown if the collator for
|
|
* the specified locale is not a RuleBasedCollator.
|
|
* @see BreakIterator
|
|
* @see RuleBasedCollator
|
|
* @see SearchIterator
|
|
*/
|
|
public StringSearch(String pattern, CharacterIterator target, ULocale locale) {
|
|
this(pattern, target, (RuleBasedCollator) Collator.getInstance(locale), null);
|
|
}
|
|
|
|
/**
|
|
* Initializes the iterator to use the language-specific rules and
|
|
* break iterator rules defined in the default locale to search for
|
|
* argument pattern in the argument target text.
|
|
* @param pattern text to look for.
|
|
* @param target target text to search for pattern.
|
|
* @throws IllegalArgumentException thrown when argument target is null,
|
|
* or of length 0. ClassCastException thrown if the collator for
|
|
* the default locale is not a RuleBasedCollator.
|
|
*/
|
|
public StringSearch(String pattern, String target) {
|
|
this(pattern, new StringCharacterIterator(target),
|
|
(RuleBasedCollator) Collator.getInstance(), null);
|
|
}
|
|
|
|
/**
|
|
* Gets the {@link RuleBasedCollator} used for the language rules.
|
|
* <p>
|
|
* Since <tt>StringSearch</tt> depends on the returned {@link RuleBasedCollator}, any
|
|
* changes to the {@link RuleBasedCollator} result should follow with a call to
|
|
* either {@link #reset()} or {@link #setCollator(RuleBasedCollator)} to ensure the correct
|
|
* search behavior.
|
|
* </p>
|
|
* @return {@link RuleBasedCollator} used by this <tt>StringSearch</tt>
|
|
* @see RuleBasedCollator
|
|
* @see #setCollator
|
|
*/
|
|
public RuleBasedCollator getCollator() {
|
|
return collator_;
|
|
}
|
|
|
|
/**
|
|
* Sets the {@link RuleBasedCollator} to be used for language-specific searching.
|
|
* <p>
|
|
* The iterator's position will not be changed by this method.
|
|
* @param collator to use for this <tt>StringSearch</tt>
|
|
* @throws IllegalArgumentException thrown when collator is null
|
|
* @see #getCollator
|
|
*/
|
|
public void setCollator(RuleBasedCollator collator) {
|
|
if (collator == null) {
|
|
throw new IllegalArgumentException("Collator can not be null");
|
|
}
|
|
collator_ = collator;
|
|
ceMask_ = getMask(collator_.getStrength());
|
|
|
|
ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE);
|
|
search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale);
|
|
search_.internalBreakIter_.setText((CharacterIterator)search_.text().clone()); // We need to create a clone
|
|
|
|
toShift_ = collator.isAlternateHandlingShifted();
|
|
variableTop_ = collator.getVariableTop();
|
|
textIter_ = new CollationElementIterator(pattern_.text_, collator);
|
|
utilIter_ = new CollationElementIterator(pattern_.text_, collator);
|
|
|
|
// initialize() _after_ setting the iterators for the new collator.
|
|
initialize();
|
|
}
|
|
|
|
/**
|
|
* Returns the pattern for which <tt>StringSearch</tt> is searching for.
|
|
* @return the pattern searched for
|
|
*/
|
|
public String getPattern() {
|
|
return pattern_.text_;
|
|
}
|
|
|
|
/**
|
|
* Set the pattern to search for.
|
|
* The iterator's position will not be changed by this method.
|
|
* @param pattern for searching
|
|
* @see #getPattern
|
|
* @exception IllegalArgumentException thrown if pattern is null or of
|
|
* length 0
|
|
*/
|
|
public void setPattern(String pattern) {
|
|
if (pattern == null || pattern.length() <= 0) {
|
|
throw new IllegalArgumentException(
|
|
"Pattern to search for can not be null or of length 0");
|
|
}
|
|
pattern_.text_ = pattern;
|
|
initialize();
|
|
}
|
|
|
|
/**
|
|
* Determines whether canonical matches (option 1, as described in the
|
|
* class documentation) is set.
|
|
* See setCanonical(boolean) for more information.
|
|
* @see #setCanonical
|
|
* @return true if canonical matches is set, false otherwise
|
|
*/
|
|
//TODO: hoist this to SearchIterator
|
|
public boolean isCanonical() {
|
|
return search_.isCanonicalMatch_;
|
|
}
|
|
|
|
/**
|
|
* Set the canonical match mode. See class documentation for details.
|
|
* The default setting for this property is false.
|
|
* @param allowCanonical flag indicator if canonical matches are allowed
|
|
* @see #isCanonical
|
|
*/
|
|
//TODO: hoist this to SearchIterator
|
|
public void setCanonical(boolean allowCanonical) {
|
|
search_.isCanonicalMatch_ = allowCanonical;
|
|
}
|
|
|
|
/**
|
|
* {@inheritDoc}
|
|
*/
|
|
@Override
|
|
public void setTarget(CharacterIterator text) {
|
|
super.setTarget(text);
|
|
textIter_.setText(text);
|
|
}
|
|
|
|
/**
|
|
* {@inheritDoc}
|
|
*/
|
|
@Override
|
|
public int getIndex() {
|
|
int result = textIter_.getOffset();
|
|
if (isOutOfBounds(search_.beginIndex(), search_.endIndex(), result)) {
|
|
return DONE;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* {@inheritDoc}
|
|
*/
|
|
@Override
|
|
public void setIndex(int position) {
|
|
// Java porting note: This method is equivalent to setOffset() in ICU4C.
|
|
// ICU4C SearchIterator::setOffset() is a pure virtual method, while
|
|
// ICU4J SearchIterator.setIndex() is not abstract method.
|
|
|
|
super.setIndex(position);
|
|
textIter_.setOffset(position);
|
|
}
|
|
|
|
/**
|
|
* {@inheritDoc}
|
|
*/
|
|
@Override
|
|
public void reset() {
|
|
// reset is setting the attributes that are already in
|
|
// string search, hence all attributes in the collator should
|
|
// be retrieved without any problems
|
|
|
|
boolean sameCollAttribute = true;
|
|
int ceMask;
|
|
boolean shift;
|
|
int varTop;
|
|
|
|
// **** hack to deal w/ how processed CEs encode quaternary ****
|
|
int newStrength = collator_.getStrength();
|
|
if ((strength_ < Collator.QUATERNARY && newStrength >= Collator.QUATERNARY)
|
|
|| (strength_ >= Collator.QUATERNARY && newStrength < Collator.QUATERNARY)) {
|
|
sameCollAttribute = false;
|
|
}
|
|
|
|
strength_ = collator_.getStrength();
|
|
ceMask = getMask(strength_);
|
|
if (ceMask_ != ceMask) {
|
|
ceMask_ = ceMask;
|
|
sameCollAttribute = false;
|
|
}
|
|
|
|
shift = collator_.isAlternateHandlingShifted();
|
|
if (toShift_ != shift) {
|
|
toShift_ = shift;
|
|
sameCollAttribute = false;
|
|
}
|
|
|
|
varTop = collator_.getVariableTop();
|
|
if (variableTop_ != varTop) {
|
|
variableTop_ = varTop;
|
|
sameCollAttribute = false;
|
|
}
|
|
|
|
if (!sameCollAttribute) {
|
|
initialize();
|
|
}
|
|
|
|
textIter_.setText(search_.text());
|
|
|
|
search_.setMatchedLength(0);
|
|
search_.matchedIndex_ = DONE;
|
|
search_.isOverlap_ = false;
|
|
search_.isCanonicalMatch_ = false;
|
|
search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
|
|
search_.isForwardSearching_ = true;
|
|
search_.reset_ = true;
|
|
}
|
|
|
|
/**
|
|
* {@inheritDoc}
|
|
*/
|
|
@Override
|
|
protected int handleNext(int position) {
|
|
if (pattern_.CELength_ == 0) {
|
|
search_.matchedIndex_ = search_.matchedIndex_ == DONE ?
|
|
getIndex() : search_.matchedIndex_ + 1;
|
|
search_.setMatchedLength(0);
|
|
textIter_.setOffset(search_.matchedIndex_);
|
|
if (search_.matchedIndex_ == search_.endIndex()) {
|
|
search_.matchedIndex_ = DONE;
|
|
}
|
|
} else {
|
|
if (search_.matchedLength() <= 0) {
|
|
// the flipping direction issue has already been handled
|
|
// in next()
|
|
// for boundary check purposes. this will ensure that the
|
|
// next match will not preceed the current offset
|
|
// note search_.matchedIndex_ will always be set to something
|
|
// in the code
|
|
search_.matchedIndex_ = position - 1;
|
|
}
|
|
|
|
textIter_.setOffset(position);
|
|
|
|
// ICU4C comment:
|
|
// if strsrch_->breakIter is always the same as m_breakiterator_
|
|
// then we don't need to check the match boundaries here because
|
|
// usearch_handleNextXXX will already have done it.
|
|
if (search_.isCanonicalMatch_) {
|
|
// *could* actually use exact here 'cause no extra accents allowed...
|
|
handleNextCanonical();
|
|
} else {
|
|
handleNextExact();
|
|
}
|
|
|
|
if (search_.matchedIndex_ == DONE) {
|
|
textIter_.setOffset(search_.endIndex());
|
|
} else {
|
|
textIter_.setOffset(search_.matchedIndex_);
|
|
}
|
|
|
|
return search_.matchedIndex_;
|
|
}
|
|
|
|
return DONE;
|
|
}
|
|
|
|
/**
|
|
* {@inheritDoc}
|
|
*/
|
|
@Override
|
|
protected int handlePrevious(int position) {
|
|
if (pattern_.CELength_ == 0) {
|
|
search_.matchedIndex_ =
|
|
search_.matchedIndex_ == DONE ? getIndex() : search_.matchedIndex_;
|
|
if (search_.matchedIndex_ == search_.beginIndex()) {
|
|
setMatchNotFound();
|
|
} else {
|
|
search_.matchedIndex_--;
|
|
textIter_.setOffset(search_.matchedIndex_);
|
|
search_.setMatchedLength(0);
|
|
}
|
|
} else {
|
|
textIter_.setOffset(position);
|
|
|
|
if (search_.isCanonicalMatch_) {
|
|
// *could* use exact match here since extra accents *not* allowed!
|
|
handlePreviousCanonical();
|
|
} else {
|
|
handlePreviousExact();
|
|
}
|
|
}
|
|
|
|
return search_.matchedIndex_;
|
|
}
|
|
|
|
// ------------------ Internal implementation code ---------------------------
|
|
|
|
private static final int INITIAL_ARRAY_SIZE_ = 256;
|
|
|
|
// *** Boyer-Moore ***
|
|
// private static final Normalizer2Impl nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
|
|
// private static final int LAST_BYTE_MASK_ = 0xff;
|
|
// private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
|
|
|
|
private static final int PRIMARYORDERMASK = 0xffff0000;
|
|
private static final int SECONDARYORDERMASK = 0x0000ff00;
|
|
private static final int TERTIARYORDERMASK = 0x000000ff;
|
|
|
|
/**
|
|
* Getting the mask for collation strength
|
|
* @param strength collation strength
|
|
* @return collation element mask
|
|
*/
|
|
private static int getMask(int strength) {
|
|
switch (strength) {
|
|
case Collator.PRIMARY:
|
|
return PRIMARYORDERMASK;
|
|
case Collator.SECONDARY:
|
|
return SECONDARYORDERMASK | PRIMARYORDERMASK;
|
|
default:
|
|
return TERTIARYORDERMASK | SECONDARYORDERMASK | PRIMARYORDERMASK;
|
|
}
|
|
}
|
|
|
|
|
|
// *** Boyer-Moore ***
|
|
/*
|
|
private final char getFCD(String str, int offset) {
|
|
char ch = str.charAt(offset);
|
|
if (ch < 0x180) {
|
|
return (char) nfcImpl_.getFCD16FromBelow180(ch);
|
|
} else if (nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
|
|
if (!Character.isHighSurrogate(ch)) {
|
|
return (char) nfcImpl_.getFCD16FromNormData(ch);
|
|
} else {
|
|
char c2;
|
|
if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) {
|
|
return (char) nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2));
|
|
}
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
private final char getFCD(int c) {
|
|
return (char)nfcImpl_.getFCD16(c);
|
|
}
|
|
*/
|
|
|
|
/**
|
|
* Getting the modified collation elements taking into account the collation
|
|
* attributes.
|
|
*
|
|
* @param sourcece
|
|
* @return the modified collation element
|
|
*/
|
|
private int getCE(int sourcece) {
|
|
// note for tertiary we can't use the collator->tertiaryMask, that
|
|
// is a preprocessed mask that takes into account case options. since
|
|
// we are only concerned with exact matches, we don't need that.
|
|
sourcece &= ceMask_;
|
|
|
|
if (toShift_) {
|
|
// alternate handling here, since only the 16 most significant digits
|
|
// is only used, we can safely do a compare without masking
|
|
// if the ce is a variable, we mask and get only the primary values
|
|
// no shifting to quartenary is required since all primary values
|
|
// less than variabletop will need to be masked off anyway.
|
|
if (variableTop_ > sourcece) {
|
|
if (strength_ >= Collator.QUATERNARY) {
|
|
sourcece &= PRIMARYORDERMASK;
|
|
} else {
|
|
sourcece = CollationElementIterator.IGNORABLE;
|
|
}
|
|
}
|
|
} else if (strength_ >= Collator.QUATERNARY && sourcece == CollationElementIterator.IGNORABLE) {
|
|
sourcece = 0xFFFF;
|
|
}
|
|
|
|
return sourcece;
|
|
}
|
|
|
|
/**
|
|
* Direct port of ICU4C static int32_t * addTouint32_tArray(...) in usearch.cpp
|
|
* (except not taking destination buffer size and status param).
|
|
* This is used for appending a PCE to Pattern.PCE_ buffer. We probably should
|
|
* implement this in Pattern class.
|
|
*
|
|
* @param destination target array
|
|
* @param offset destination offset to add value
|
|
* @param value to be added
|
|
* @param increments incremental size expected
|
|
* @return new destination array, destination if there was no new allocation
|
|
*/
|
|
private static int[] addToIntArray(int[] destination, int offset, int value, int increments) {
|
|
int newlength = destination.length;
|
|
if (offset + 1 == newlength) {
|
|
newlength += increments;
|
|
int temp[] = new int[newlength];
|
|
System.arraycopy(destination, 0, temp, 0, offset);
|
|
destination = temp;
|
|
}
|
|
destination[offset] = value;
|
|
return destination;
|
|
}
|
|
|
|
/**
|
|
* Direct port of ICU4C static int64_t * addTouint64_tArray(...) in usearch.cpp.
|
|
* This is used for appending a PCE to Pattern.PCE_ buffer. We probably should
|
|
* implement this in Pattern class.
|
|
*
|
|
* @param destination target array
|
|
* @param offset destination offset to add value
|
|
* @param destinationlength target array size
|
|
* @param value to be added
|
|
* @param increments incremental size expected
|
|
* @return new destination array, destination if there was no new allocation
|
|
*/
|
|
private static long[] addToLongArray(long[] destination, int offset, int destinationlength,
|
|
long value, int increments) {
|
|
int newlength = destinationlength;
|
|
if (offset + 1 == newlength) {
|
|
newlength += increments;
|
|
long temp[] = new long[newlength];
|
|
System.arraycopy(destination, 0, temp, 0, offset);
|
|
destination = temp;
|
|
}
|
|
destination[offset] = value;
|
|
return destination;
|
|
}
|
|
|
|
/**
|
|
* Initializing the ce table for a pattern.
|
|
* Stores non-ignorable collation keys.
|
|
* Table size will be estimated by the size of the pattern text. Table
|
|
* expansion will be perform as we go along. Adding 1 to ensure that the table
|
|
* size definitely increases.
|
|
* @return total number of expansions
|
|
*/
|
|
// TODO: We probably do not need Pattern CE table.
|
|
private int initializePatternCETable() {
|
|
int[] cetable = new int[INITIAL_ARRAY_SIZE_];
|
|
int patternlength = pattern_.text_.length();
|
|
CollationElementIterator coleiter = utilIter_;
|
|
|
|
if (coleiter == null) {
|
|
coleiter = new CollationElementIterator(pattern_.text_, collator_);
|
|
utilIter_ = coleiter;
|
|
} else {
|
|
coleiter.setText(pattern_.text_);
|
|
}
|
|
|
|
int offset = 0;
|
|
int result = 0;
|
|
int ce;
|
|
|
|
while ((ce = coleiter.next()) != CollationElementIterator.NULLORDER) {
|
|
int newce = getCE(ce);
|
|
if (newce != CollationElementIterator.IGNORABLE /* 0 */) {
|
|
int[] temp = addToIntArray(cetable, offset, newce,
|
|
patternlength - coleiter.getOffset() + 1);
|
|
offset++;
|
|
cetable = temp;
|
|
}
|
|
result += (coleiter.getMaxExpansion(ce) - 1);
|
|
}
|
|
|
|
cetable[offset] = 0;
|
|
pattern_.CE_ = cetable;
|
|
pattern_.CELength_ = offset;
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* Initializing the pce table for a pattern.
|
|
* Stores non-ignorable collation keys.
|
|
* Table size will be estimated by the size of the pattern text. Table
|
|
* expansion will be perform as we go along. Adding 1 to ensure that the table
|
|
* size definitely increases.
|
|
* @return total number of expansions
|
|
*/
|
|
private int initializePatternPCETable() {
|
|
long[] pcetable = new long[INITIAL_ARRAY_SIZE_];
|
|
int pcetablesize = pcetable.length;
|
|
int patternlength = pattern_.text_.length();
|
|
CollationElementIterator coleiter = utilIter_;
|
|
|
|
if (coleiter == null) {
|
|
coleiter = new CollationElementIterator(pattern_.text_, collator_);
|
|
utilIter_ = coleiter;
|
|
} else {
|
|
coleiter.setText(pattern_.text_);
|
|
}
|
|
|
|
int offset = 0;
|
|
int result = 0;
|
|
long pce;
|
|
|
|
CollationPCE iter = new CollationPCE(coleiter);
|
|
|
|
// ** Should processed CEs be signed or unsigned?
|
|
// ** (the rest of the code in this file seems to play fast-and-loose with
|
|
// ** whether a CE is signed or unsigned. For example, look at routine above this one.)
|
|
while ((pce = iter.nextProcessed(null)) != CollationPCE.PROCESSED_NULLORDER) {
|
|
long[] temp = addToLongArray(pcetable, offset, pcetablesize, pce, patternlength - coleiter.getOffset() + 1);
|
|
offset++;
|
|
pcetable = temp;
|
|
}
|
|
|
|
pcetable[offset] = 0;
|
|
pattern_.PCE_ = pcetable;
|
|
pattern_.PCELength_ = offset;
|
|
|
|
return result;
|
|
}
|
|
|
|
// TODO: This method only triggers initializePatternCETable(), which is probably no
|
|
// longer needed.
|
|
private int initializePattern() {
|
|
// Since the strength is primary, accents are ignored in the pattern.
|
|
|
|
// *** Boyer-Moore ***
|
|
/*
|
|
if (strength_ == Collator.PRIMARY) {
|
|
pattern_.hasPrefixAccents_ = false;
|
|
pattern_.hasSuffixAccents_ = false;
|
|
} else {
|
|
pattern_.hasPrefixAccents_ = (getFCD(pattern_.text_, 0) >>> SECOND_LAST_BYTE_SHIFT_) != 0;
|
|
pattern_.hasSuffixAccents_ = (getFCD(pattern_.text_.codePointBefore(pattern_.text_.length())) & LAST_BYTE_MASK_) != 0;
|
|
}
|
|
*/
|
|
|
|
pattern_.PCE_ = null;
|
|
|
|
// since intializePattern is an internal method status is a success.
|
|
return initializePatternCETable();
|
|
}
|
|
|
|
// *** Boyer-Moore ***
|
|
/*
|
|
private final void setShiftTable(char shift[],
|
|
char backshift[],
|
|
int cetable[], int cesize,
|
|
int expansionsize,
|
|
int defaultforward,
|
|
int defaultbackward) {
|
|
// No implementation
|
|
}
|
|
*/
|
|
|
|
// TODO: This method only triggers initializePattern(), which is probably no
|
|
// longer needed.
|
|
private void initialize() {
|
|
/* int expandlength = */ initializePattern();
|
|
|
|
// *** Boyer-Moore ***
|
|
/*
|
|
if (pattern_.CELength_ > 0) {
|
|
int cesize = pattern_.CELength_;
|
|
int minlength = cesize > expandlength ? cesize - expandlength : 1;
|
|
pattern_.defaultShiftSize_ = minlength;
|
|
setShiftTable(pattern_.shift_, pattern_.backShift_, pattern_.CE_, cesize,
|
|
expandlength, minlength, minlength);
|
|
return;
|
|
}
|
|
return pattern_.defaultShiftSize_;
|
|
*/
|
|
}
|
|
|
|
/**
|
|
* @deprecated This API is ICU internal only.
|
|
* @hide original deprecated declaration
|
|
* @hide draft / provisional / internal are hidden on Android
|
|
*/
|
|
@Override
|
|
@Deprecated
|
|
protected void setMatchNotFound() {
|
|
super.setMatchNotFound();
|
|
// SearchIterator#setMatchNotFound() does following:
|
|
// search_.matchedIndex_ = DONE;
|
|
// search_.setMatchedLength(0);
|
|
if (search_.isForwardSearching_) {
|
|
textIter_.setOffset(search_.text().getEndIndex());
|
|
} else {
|
|
textIter_.setOffset(0);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Checks if the offset runs out of the text string range
|
|
* @param textstart offset of the first character in the range
|
|
* @param textlimit limit offset of the text string range
|
|
* @param offset to test
|
|
* @return true if offset is out of bounds, false otherwise
|
|
*/
|
|
private static final boolean isOutOfBounds(int textstart, int textlimit, int offset) {
|
|
return offset < textstart || offset > textlimit;
|
|
}
|
|
|
|
/**
|
|
* Checks for identical match
|
|
* @param start offset of possible match
|
|
* @param end offset of possible match
|
|
* @return true if identical match is found
|
|
*/
|
|
private boolean checkIdentical(int start, int end) {
|
|
if (strength_ != Collator.IDENTICAL) {
|
|
return true;
|
|
}
|
|
// Note: We could use Normalizer::compare() or similar, but for short strings
|
|
// which may not be in FCD it might be faster to just NFD them.
|
|
String textstr = getString(targetText, start, end - start);
|
|
if (Normalizer.quickCheck(textstr, Normalizer.NFD, 0) == Normalizer.NO) {
|
|
textstr = Normalizer.decompose(textstr, false);
|
|
}
|
|
String patternstr = pattern_.text_;
|
|
if (Normalizer.quickCheck(patternstr, Normalizer.NFD, 0) == Normalizer.NO) {
|
|
patternstr = Normalizer.decompose(patternstr, false);
|
|
}
|
|
return textstr.equals(patternstr);
|
|
}
|
|
|
|
private boolean initTextProcessedIter() {
|
|
if (textProcessedIter_ == null) {
|
|
textProcessedIter_ = new CollationPCE(textIter_);
|
|
} else {
|
|
textProcessedIter_.init(textIter_);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* Find the next break boundary after startIndex. If the UStringSearch object
|
|
* has an external break iterator, use that. Otherwise use the internal character
|
|
* break iterator.
|
|
*/
|
|
private int nextBoundaryAfter(int startIndex) {
|
|
BreakIterator breakiterator = search_.breakIter();
|
|
|
|
if (breakiterator == null) {
|
|
breakiterator = search_.internalBreakIter_;
|
|
}
|
|
|
|
if (breakiterator != null) {
|
|
return breakiterator.following(startIndex);
|
|
}
|
|
|
|
return startIndex;
|
|
}
|
|
|
|
/*
|
|
* Returns true if index is on a break boundary. If the UStringSearch
|
|
* has an external break iterator, test using that, otherwise test
|
|
* using the internal character break iterator.
|
|
*/
|
|
private boolean isBreakBoundary(int index) {
|
|
BreakIterator breakiterator = search_.breakIter();
|
|
|
|
if (breakiterator == null) {
|
|
breakiterator = search_.internalBreakIter_;
|
|
}
|
|
|
|
return (breakiterator != null && breakiterator.isBoundary(index));
|
|
}
|
|
|
|
|
|
// Java porting note: Followings are corresponding to UCompareCEsResult enum
|
|
private static final int CE_MATCH = -1;
|
|
private static final int CE_NO_MATCH = 0;
|
|
private static final int CE_SKIP_TARG = 1;
|
|
private static final int CE_SKIP_PATN = 2;
|
|
|
|
private static int CE_LEVEL2_BASE = 0x00000005;
|
|
private static int CE_LEVEL3_BASE = 0x00050000;
|
|
|
|
private static int compareCE64s(long targCE, long patCE, ElementComparisonType compareType) {
|
|
if (targCE == patCE) {
|
|
return CE_MATCH;
|
|
}
|
|
if (compareType == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
|
|
return CE_NO_MATCH;
|
|
}
|
|
|
|
long targCEshifted = targCE >>> 32;
|
|
long patCEshifted = patCE >>> 32;
|
|
long mask;
|
|
|
|
mask = 0xFFFF0000L;
|
|
int targLev1 = (int)(targCEshifted & mask);
|
|
int patLev1 = (int)(patCEshifted & mask);
|
|
if (targLev1 != patLev1) {
|
|
if (targLev1 == 0) {
|
|
return CE_SKIP_TARG;
|
|
}
|
|
if (patLev1 == 0
|
|
&& compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) {
|
|
return CE_SKIP_PATN;
|
|
}
|
|
return CE_NO_MATCH;
|
|
}
|
|
|
|
mask = 0x0000FFFFL;
|
|
int targLev2 = (int)(targCEshifted & mask);
|
|
int patLev2 = (int)(patCEshifted & mask);
|
|
if (targLev2 != patLev2) {
|
|
if (targLev2 == 0) {
|
|
return CE_SKIP_TARG;
|
|
}
|
|
if (patLev2 == 0
|
|
&& compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) {
|
|
return CE_SKIP_PATN;
|
|
}
|
|
return (patLev2 == CE_LEVEL2_BASE ||
|
|
(compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD &&
|
|
targLev2 == CE_LEVEL2_BASE)) ? CE_MATCH : CE_NO_MATCH;
|
|
}
|
|
|
|
mask = 0xFFFF0000L;
|
|
int targLev3 = (int)(targCE & mask);
|
|
int patLev3 = (int)(patCE & mask);
|
|
if (targLev3 != patLev3) {
|
|
return (patLev3 == CE_LEVEL3_BASE ||
|
|
(compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD &&
|
|
targLev3 == CE_LEVEL3_BASE) )? CE_MATCH: CE_NO_MATCH;
|
|
}
|
|
|
|
return CE_MATCH;
|
|
}
|
|
|
|
/**
|
|
* An object used for receiving matched index in search() and
|
|
* searchBackwards().
|
|
*/
|
|
private static class Match {
|
|
int start_ = -1;
|
|
int limit_ = -1;
|
|
}
|
|
|
|
private boolean search(int startIdx, Match m) {
|
|
// Input parameter sanity check.
|
|
if (pattern_.CELength_ == 0
|
|
|| startIdx < search_.beginIndex()
|
|
|| startIdx > search_.endIndex()) {
|
|
throw new IllegalArgumentException("search(" + startIdx + ", m) - expected position to be between " +
|
|
search_.beginIndex() + " and " + search_.endIndex());
|
|
}
|
|
|
|
if (pattern_.PCE_ == null) {
|
|
initializePatternPCETable();
|
|
}
|
|
|
|
textIter_.setOffset(startIdx);
|
|
CEBuffer ceb = new CEBuffer(this);
|
|
|
|
int targetIx = 0;
|
|
CEI targetCEI = null;
|
|
int patIx;
|
|
boolean found;
|
|
|
|
int mStart = -1;
|
|
int mLimit = -1;
|
|
int minLimit;
|
|
int maxLimit;
|
|
|
|
// Outer loop moves over match starting positions in the
|
|
// target CE space.
|
|
// Here we see the target as a sequence of collation elements, resulting from the following:
|
|
// 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
|
|
// (for example, digraphs such as IJ may be broken into two characters).
|
|
// 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
|
|
// 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
|
|
// fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
|
|
// CE weight 0 (as for a combining diacritic with secondary weight when the collator strength is primary),
|
|
// then the CE is deleted, so the following code sees only CEs that are relevant.
|
|
// For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
|
|
// If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
|
|
// characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
|
|
for (targetIx = 0; ; targetIx++) {
|
|
found = true;
|
|
// Inner loop checks for a match beginning at each
|
|
// position from the outer loop.
|
|
int targetIxOffset = 0;
|
|
long patCE = 0;
|
|
// For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
|
|
// (compared to the last CE fetched for the previous targetIx value) as we need to go
|
|
// for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
|
|
CEI firstCEI = ceb.get(targetIx);
|
|
if (firstCEI == null) {
|
|
throw new ICUException("CEBuffer.get(" + targetIx + ") returned null.");
|
|
}
|
|
|
|
for (patIx = 0; patIx < pattern_.PCELength_; patIx++) {
|
|
patCE = pattern_.PCE_[patIx];
|
|
targetCEI = ceb.get(targetIx + patIx + targetIxOffset);
|
|
// Compare CE from target string with CE from the pattern.
|
|
// Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
|
|
// which will fail the compare, below.
|
|
int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_);
|
|
if (ceMatch == CE_NO_MATCH) {
|
|
found = false;
|
|
break;
|
|
} else if (ceMatch > CE_NO_MATCH) {
|
|
if (ceMatch == CE_SKIP_TARG) {
|
|
// redo with same patCE, next targCE
|
|
patIx--;
|
|
targetIxOffset++;
|
|
} else { // ceMatch == CE_SKIP_PATN
|
|
// redo with same targCE, next patCE
|
|
targetIxOffset--;
|
|
}
|
|
}
|
|
}
|
|
targetIxOffset += pattern_.PCELength_; // this is now the offset in target CE space to end of the match so far
|
|
|
|
if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) {
|
|
// No match at this targetIx. Try again at the next.
|
|
continue;
|
|
}
|
|
|
|
if (!found) {
|
|
// No match at all, we have run off the end of the target text.
|
|
break;
|
|
}
|
|
|
|
// We have found a match in CE space.
|
|
// Now determine the bounds in string index space.
|
|
// There still is a chance of match failure if the CE range not correspond to
|
|
// an acceptable character range.
|
|
//
|
|
CEI lastCEI = ceb.get(targetIx + targetIxOffset -1);
|
|
|
|
mStart = firstCEI.lowIndex_;
|
|
minLimit = lastCEI.lowIndex_;
|
|
|
|
// Look at the CE following the match. If it is UCOL_NULLORDER the match
|
|
// extended to the end of input, and the match is good.
|
|
|
|
// Look at the high and low indices of the CE following the match. If
|
|
// they are the same it means one of two things:
|
|
// 1. The match extended to the last CE from the target text, which is OK, or
|
|
// 2. The last CE that was part of the match is in an expansion that extends
|
|
// to the first CE after the match. In this case, we reject the match.
|
|
CEI nextCEI = null;
|
|
if (search_.elementComparisonType_ == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
|
|
nextCEI = ceb.get(targetIx + targetIxOffset);
|
|
maxLimit = nextCEI.lowIndex_;
|
|
if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) {
|
|
found = false;
|
|
}
|
|
} else {
|
|
for (;; ++targetIxOffset) {
|
|
nextCEI = ceb.get(targetIx + targetIxOffset);
|
|
maxLimit = nextCEI.lowIndex_;
|
|
// If we are at the end of the target too, match succeeds
|
|
if (nextCEI.ce_ == CollationPCE.PROCESSED_NULLORDER) {
|
|
break;
|
|
}
|
|
// As long as the next CE has primary weight of 0,
|
|
// it is part of the last target element matched by the pattern;
|
|
// make sure it can be part of a match with the last patCE
|
|
if ((((nextCEI.ce_) >>> 32) & 0xFFFF0000L) == 0) {
|
|
int ceMatch = compareCE64s(nextCEI.ce_, patCE, search_.elementComparisonType_);
|
|
if (ceMatch == CE_NO_MATCH || ceMatch == CE_SKIP_PATN ) {
|
|
found = false;
|
|
break;
|
|
}
|
|
// If lowIndex == highIndex, this target CE is part of an expansion of the last matched
|
|
// target element, but it has non-zero primary weight => match fails
|
|
} else if ( nextCEI.lowIndex_ == nextCEI.highIndex_ ) {
|
|
found = false;
|
|
break;
|
|
// Else the target CE is not part of an expansion of the last matched element, match succeeds
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check for the start of the match being within a combining sequence.
|
|
// This can happen if the pattern itself begins with a combining char, and
|
|
// the match found combining marks in the target text that were attached
|
|
// to something else.
|
|
// This type of match should be rejected for not completely consuming a
|
|
// combining sequence.
|
|
if (!isBreakBoundary(mStart)) {
|
|
found = false;
|
|
}
|
|
|
|
// Check for the start of the match being within an Collation Element Expansion,
|
|
// meaning that the first char of the match is only partially matched.
|
|
// With expansions, the first CE will report the index of the source
|
|
// character, and all subsequent (expansions) CEs will report the source index of the
|
|
// _following_ character.
|
|
int secondIx = firstCEI.highIndex_;
|
|
if (mStart == secondIx) {
|
|
found = false;
|
|
}
|
|
|
|
// Allow matches to end in the middle of a grapheme cluster if the following
|
|
// conditions are met; this is needed to make prefix search work properly in
|
|
// Indic, see #11750
|
|
// * the default breakIter is being used
|
|
// * the next collation element after this combining sequence
|
|
// - has non-zero primary weight
|
|
// - corresponds to a separate character following the one at end of the current match
|
|
// (the second of these conditions, and perhaps both, may be redundant given the
|
|
// subsequent check for normalization boundary; however they are likely much faster
|
|
// tests in any case)
|
|
// * the match limit is a normalization boundary
|
|
boolean allowMidclusterMatch =
|
|
breakIterator == null &&
|
|
(((nextCEI.ce_) >>> 32) & 0xFFFF0000L) != 0 &&
|
|
maxLimit >= lastCEI.highIndex_ && nextCEI.highIndex_ > maxLimit &&
|
|
(nfd_.hasBoundaryBefore(codePointAt(targetText, maxLimit)) ||
|
|
nfd_.hasBoundaryAfter(codePointBefore(targetText, maxLimit)));
|
|
|
|
// If those conditions are met, then:
|
|
// * do NOT advance the candidate match limit (mLimit) to a break boundary; however
|
|
// the match limit may be backed off to a previous break boundary. This handles
|
|
// cases in which mLimit includes target characters that are ignorable with current
|
|
// settings (such as space) and which extend beyond the pattern match.
|
|
// * do NOT require that end of the combining sequence not extend beyond the match in CE space
|
|
// * do NOT require that match limit be on a breakIter boundary
|
|
|
|
// Advance the match end position to the first acceptable match boundary.
|
|
// This advances the index over any combining characters.
|
|
mLimit = maxLimit;
|
|
if (minLimit < maxLimit) {
|
|
// When the last CE's low index is same with its high index, the CE is likely
|
|
// a part of expansion. In this case, the index is located just after the
|
|
// character corresponding to the CEs compared above. If the index is right
|
|
// at the break boundary, move the position to the next boundary will result
|
|
// incorrect match length when there are ignorable characters exist between
|
|
// the position and the next character produces CE(s). See ticket#8482.
|
|
if (minLimit == lastCEI.highIndex_ && isBreakBoundary(minLimit)) {
|
|
mLimit = minLimit;
|
|
} else {
|
|
int nba = nextBoundaryAfter(minLimit);
|
|
// Note that we can have nba < maxLimit && nba >= minLImit, in which
|
|
// case we want to set mLimit to nba regardless of allowMidclusterMatch
|
|
// (i.e. we back off mLimit to the previous breakIterator boundary).
|
|
if (nba >= lastCEI.highIndex_ && (!allowMidclusterMatch || nba < maxLimit)) {
|
|
mLimit = nba;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!allowMidclusterMatch) {
|
|
// If advancing to the end of a combining sequence in character indexing space
|
|
// advanced us beyond the end of the match in CE space, reject this match.
|
|
if (mLimit > maxLimit) {
|
|
found = false;
|
|
}
|
|
|
|
if (!isBreakBoundary(mLimit)) {
|
|
found = false;
|
|
}
|
|
}
|
|
|
|
if (!checkIdentical(mStart, mLimit)) {
|
|
found = false;
|
|
}
|
|
|
|
if (found) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// All Done. Store back the match bounds to the caller.
|
|
//
|
|
if (found == false) {
|
|
mLimit = -1;
|
|
mStart = -1;
|
|
}
|
|
|
|
if (m != null) {
|
|
m.start_ = mStart;
|
|
m.limit_ = mLimit;
|
|
}
|
|
|
|
return found;
|
|
}
|
|
|
|
private static int codePointAt(CharacterIterator iter, int index) {
|
|
int currentIterIndex = iter.getIndex();
|
|
char codeUnit = iter.setIndex(index);
|
|
int cp = codeUnit;
|
|
if (Character.isHighSurrogate(codeUnit)) {
|
|
char nextUnit = iter.next();
|
|
if (Character.isLowSurrogate(nextUnit)) {
|
|
cp = Character.toCodePoint(codeUnit, nextUnit);
|
|
}
|
|
}
|
|
iter.setIndex(currentIterIndex); // restore iter position
|
|
return cp;
|
|
}
|
|
|
|
private static int codePointBefore(CharacterIterator iter, int index) {
|
|
int currentIterIndex = iter.getIndex();
|
|
iter.setIndex(index);
|
|
char codeUnit = iter.previous();
|
|
int cp = codeUnit;
|
|
if (Character.isLowSurrogate(codeUnit)) {
|
|
char prevUnit = iter.previous();
|
|
if (Character.isHighSurrogate(prevUnit)) {
|
|
cp = Character.toCodePoint(prevUnit, codeUnit);
|
|
}
|
|
}
|
|
iter.setIndex(currentIterIndex); // restore iter position
|
|
return cp;
|
|
}
|
|
|
|
private boolean searchBackwards(int startIdx, Match m) {
|
|
//ICU4C_TODO comment: reject search patterns beginning with a combining char.
|
|
|
|
// Input parameter sanity check.
|
|
if (pattern_.CELength_ == 0
|
|
|| startIdx < search_.beginIndex()
|
|
|| startIdx > search_.endIndex()) {
|
|
throw new IllegalArgumentException("searchBackwards(" + startIdx + ", m) - expected position to be between " +
|
|
search_.beginIndex() + " and " + search_.endIndex());
|
|
}
|
|
|
|
if (pattern_.PCE_ == null) {
|
|
initializePatternPCETable();
|
|
}
|
|
|
|
CEBuffer ceb = new CEBuffer(this);
|
|
int targetIx = 0;
|
|
|
|
/*
|
|
* Pre-load the buffer with the CE's for the grapheme
|
|
* after our starting position so that we're sure that
|
|
* we can look at the CE following the match when we
|
|
* check the match boundaries.
|
|
*
|
|
* This will also pre-fetch the first CE that we'll
|
|
* consider for the match.
|
|
*/
|
|
if (startIdx < search_.endIndex()) {
|
|
BreakIterator bi = search_.internalBreakIter_;
|
|
int next = bi.following(startIdx);
|
|
|
|
textIter_.setOffset(next);
|
|
|
|
for (targetIx = 0; ; targetIx++) {
|
|
if (ceb.getPrevious(targetIx).lowIndex_ < startIdx) {
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
textIter_.setOffset(startIdx);
|
|
}
|
|
|
|
CEI targetCEI = null;
|
|
int patIx;
|
|
boolean found;
|
|
|
|
int limitIx = targetIx;
|
|
int mStart = -1;
|
|
int mLimit = -1;
|
|
int minLimit;
|
|
int maxLimit;
|
|
|
|
// Outer loop moves over match starting positions in the
|
|
// target CE space.
|
|
// Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
|
|
// But patIx is 0 at the beginning of the pattern and increases toward the end.
|
|
// So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
|
|
// and the beginning of the base text.
|
|
for (targetIx = limitIx; ; targetIx++) {
|
|
found = true;
|
|
// For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
|
|
// (compared to the last CE fetched for the previous targetIx value) as we need to go
|
|
// for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
|
|
CEI lastCEI = ceb.getPrevious(targetIx);
|
|
if (lastCEI == null) {
|
|
throw new ICUException("CEBuffer.getPrevious(" + targetIx + ") returned null.");
|
|
}
|
|
// Inner loop checks for a match beginning at each
|
|
// position from the outer loop.
|
|
int targetIxOffset = 0;
|
|
for (patIx = pattern_.PCELength_ - 1; patIx >= 0; patIx--) {
|
|
long patCE = pattern_.PCE_[patIx];
|
|
|
|
targetCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 - patIx + targetIxOffset);
|
|
// Compare CE from target string with CE from the pattern.
|
|
// Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
|
|
// which will fail the compare, below.
|
|
int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_);
|
|
if (ceMatch == CE_NO_MATCH) {
|
|
found = false;
|
|
break;
|
|
} else if (ceMatch > CE_NO_MATCH) {
|
|
if (ceMatch == CE_SKIP_TARG) {
|
|
// redo with same patCE, next targCE
|
|
patIx++;
|
|
targetIxOffset++;
|
|
} else { // ceMatch == CE_SKIP_PATN
|
|
// redo with same targCE, next patCE
|
|
targetIxOffset--;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) {
|
|
// No match at this targetIx. Try again at the next.
|
|
continue;
|
|
}
|
|
|
|
if (!found) {
|
|
// No match at all, we have run off the end of the target text.
|
|
break;
|
|
}
|
|
|
|
// We have found a match in CE space.
|
|
// Now determine the bounds in string index space.
|
|
// There still is a chance of match failure if the CE range not correspond to
|
|
// an acceptable character range.
|
|
//
|
|
CEI firstCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 + targetIxOffset);
|
|
mStart = firstCEI.lowIndex_;
|
|
|
|
// Check for the start of the match being within a combining sequence.
|
|
// This can happen if the pattern itself begins with a combining char, and
|
|
// the match found combining marks in the target text that were attached
|
|
// to something else.
|
|
// This type of match should be rejected for not completely consuming a
|
|
// combining sequence.
|
|
if (!isBreakBoundary(mStart)) {
|
|
found = false;
|
|
}
|
|
|
|
// Look at the high index of the first CE in the match. If it's the same as the
|
|
// low index, the first CE in the match is in the middle of an expansion.
|
|
if (mStart == firstCEI.highIndex_) {
|
|
found = false;
|
|
}
|
|
|
|
minLimit = lastCEI.lowIndex_;
|
|
|
|
if (targetIx > 0) {
|
|
// Look at the CE following the match. If it is UCOL_NULLORDER the match
|
|
// extended to the end of input, and the match is good.
|
|
|
|
// Look at the high and low indices of the CE following the match. If
|
|
// they are the same it means one of two things:
|
|
// 1. The match extended to the last CE from the target text, which is OK, or
|
|
// 2. The last CE that was part of the match is in an expansion that extends
|
|
// to the first CE after the match. In this case, we reject the match.
|
|
CEI nextCEI = ceb.getPrevious(targetIx - 1);
|
|
|
|
if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) {
|
|
found = false;
|
|
}
|
|
|
|
mLimit = maxLimit = nextCEI.lowIndex_;
|
|
|
|
// Allow matches to end in the middle of a grapheme cluster if the following
|
|
// conditions are met; this is needed to make prefix search work properly in
|
|
// Indic, see #11750
|
|
// * the default breakIter is being used
|
|
// * the next collation element after this combining sequence
|
|
// - has non-zero primary weight
|
|
// - corresponds to a separate character following the one at end of the current match
|
|
// (the second of these conditions, and perhaps both, may be redundant given the
|
|
// subsequent check for normalization boundary; however they are likely much faster
|
|
// tests in any case)
|
|
// * the match limit is a normalization boundary
|
|
boolean allowMidclusterMatch =
|
|
breakIterator == null &&
|
|
(((nextCEI.ce_) >>> 32) & 0xFFFF0000L) != 0 &&
|
|
maxLimit >= lastCEI.highIndex_ && nextCEI.highIndex_ > maxLimit &&
|
|
(nfd_.hasBoundaryBefore(codePointAt(targetText, maxLimit)) ||
|
|
nfd_.hasBoundaryAfter(codePointBefore(targetText, maxLimit)));
|
|
|
|
// If those conditions are met, then:
|
|
// * do NOT advance the candidate match limit (mLimit) to a break boundary; however
|
|
// the match limit may be backed off to a previous break boundary. This handles
|
|
// cases in which mLimit includes target characters that are ignorable with current
|
|
// settings (such as space) and which extend beyond the pattern match.
|
|
// * do NOT require that end of the combining sequence not extend beyond the match in CE space
|
|
// * do NOT require that match limit be on a breakIter boundary
|
|
|
|
// Advance the match end position to the first acceptable match boundary.
|
|
// This advances the index over any combining characters.
|
|
if (minLimit < maxLimit) {
|
|
int nba = nextBoundaryAfter(minLimit);
|
|
// Note that we can have nba < maxLimit && nba >= minLImit, in which
|
|
// case we want to set mLimit to nba regardless of allowMidclusterMatch
|
|
// (i.e. we back off mLimit to the previous breakIterator boundary).
|
|
if (nba >= lastCEI.highIndex_ && (!allowMidclusterMatch || nba < maxLimit)) {
|
|
mLimit = nba;
|
|
}
|
|
}
|
|
|
|
if (!allowMidclusterMatch) {
|
|
// If advancing to the end of a combining sequence in character indexing space
|
|
// advanced us beyond the end of the match in CE space, reject this match.
|
|
if (mLimit > maxLimit) {
|
|
found = false;
|
|
}
|
|
|
|
// Make sure the end of the match is on a break boundary
|
|
if (!isBreakBoundary(mLimit)) {
|
|
found = false;
|
|
}
|
|
}
|
|
|
|
} else {
|
|
// No non-ignorable CEs after this point.
|
|
// The maximum position is detected by boundary after
|
|
// the last non-ignorable CE. Combining sequence
|
|
// across the start index will be truncated.
|
|
int nba = nextBoundaryAfter(minLimit);
|
|
mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
|
|
}
|
|
|
|
if (!checkIdentical(mStart, mLimit)) {
|
|
found = false;
|
|
}
|
|
|
|
if (found) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// All Done. Store back the match bounds to the caller.
|
|
//
|
|
if (found == false) {
|
|
mLimit = -1;
|
|
mStart = -1;
|
|
}
|
|
|
|
if (m != null) {
|
|
m.start_ = mStart;
|
|
m.limit_ = mLimit;
|
|
}
|
|
|
|
return found;
|
|
}
|
|
|
|
// Java porting note:
|
|
//
|
|
// ICU4C usearch_handleNextExact() is identical to usearch_handleNextCanonical()
|
|
// for the linear search implementation. The differences are addressed in search().
|
|
//
|
|
private boolean handleNextExact() {
|
|
return handleNextCommonImpl();
|
|
}
|
|
|
|
private boolean handleNextCanonical() {
|
|
return handleNextCommonImpl();
|
|
}
|
|
|
|
private boolean handleNextCommonImpl() {
|
|
int textOffset = textIter_.getOffset();
|
|
Match match = new Match();
|
|
|
|
if (search(textOffset, match)) {
|
|
search_.matchedIndex_ = match.start_;
|
|
search_.setMatchedLength(match.limit_ - match.start_);
|
|
return true;
|
|
} else {
|
|
setMatchNotFound();
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Java porting note:
|
|
//
|
|
// ICU4C usearch_handlePreviousExact() is identical to usearch_handlePreviousCanonical()
|
|
// for the linear search implementation. The differences are addressed in searchBackwards().
|
|
//
|
|
private boolean handlePreviousExact() {
|
|
return handlePreviousCommonImpl();
|
|
}
|
|
|
|
private boolean handlePreviousCanonical() {
|
|
return handlePreviousCommonImpl();
|
|
}
|
|
|
|
private boolean handlePreviousCommonImpl() {
|
|
int textOffset;
|
|
|
|
if (search_.isOverlap_) {
|
|
if (search_.matchedIndex_ != DONE) {
|
|
textOffset = search_.matchedIndex_ + search_.matchedLength() - 1;
|
|
} else {
|
|
// move the start position at the end of possible match
|
|
initializePatternPCETable();
|
|
if (!initTextProcessedIter()) {
|
|
setMatchNotFound();
|
|
return false;
|
|
}
|
|
for (int nPCEs = 0; nPCEs < pattern_.PCELength_ - 1; nPCEs++) {
|
|
long pce = textProcessedIter_.nextProcessed(null);
|
|
if (pce == CollationPCE.PROCESSED_NULLORDER) {
|
|
// at the end of the text
|
|
break;
|
|
}
|
|
}
|
|
textOffset = textIter_.getOffset();
|
|
}
|
|
} else {
|
|
textOffset = textIter_.getOffset();
|
|
}
|
|
|
|
Match match = new Match();
|
|
if (searchBackwards(textOffset, match)) {
|
|
search_.matchedIndex_ = match.start_;
|
|
search_.setMatchedLength(match.limit_ - match.start_);
|
|
return true;
|
|
} else {
|
|
setMatchNotFound();
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Gets a substring out of a CharacterIterator
|
|
*
|
|
* Java porting note: Not available in ICU4C
|
|
*
|
|
* @param text CharacterIterator
|
|
* @param start start offset
|
|
* @param length of substring
|
|
* @return substring from text starting at start and length length
|
|
*/
|
|
private static final String getString(CharacterIterator text, int start, int length) {
|
|
StringBuilder result = new StringBuilder(length);
|
|
int offset = text.getIndex();
|
|
text.setIndex(start);
|
|
for (int i = 0; i < length; i++) {
|
|
result.append(text.current());
|
|
text.next();
|
|
}
|
|
text.setIndex(offset);
|
|
return result.toString();
|
|
}
|
|
|
|
/**
|
|
* Java port of ICU4C struct UPattern (usrchimp.h)
|
|
*/
|
|
private static final class Pattern {
|
|
/** Pattern string */
|
|
String text_;
|
|
|
|
long[] PCE_;
|
|
int PCELength_ = 0;
|
|
|
|
// TODO: We probably do not need CE_ / CELength_
|
|
@SuppressWarnings("unused")
|
|
int[] CE_;
|
|
int CELength_ = 0;
|
|
|
|
// *** Boyer-Moore ***
|
|
// boolean hasPrefixAccents_ = false;
|
|
// boolean hasSuffixAccents_ = false;
|
|
// int defaultShiftSize_;
|
|
// char[] shift_;
|
|
// char[] backShift_;
|
|
|
|
protected Pattern(String pattern) {
|
|
text_ = pattern;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Java port of ICU4C UCollationPCE (usrchimp.h)
|
|
*/
|
|
private static class CollationPCE {
|
|
public static final long PROCESSED_NULLORDER = -1;
|
|
|
|
private static final int DEFAULT_BUFFER_SIZE = 16;
|
|
private static final int BUFFER_GROW = 8;
|
|
|
|
// Note: PRIMARYORDERMASK is also duplicated in StringSearch class
|
|
private static final int PRIMARYORDERMASK = 0xffff0000;
|
|
private static final int CONTINUATION_MARKER = 0xc0;
|
|
|
|
private PCEBuffer pceBuffer_ = new PCEBuffer();
|
|
private CollationElementIterator cei_;
|
|
private int strength_;
|
|
private boolean toShift_;
|
|
private boolean isShifted_;
|
|
private int variableTop_;
|
|
|
|
public CollationPCE(CollationElementIterator iter) {
|
|
init(iter);
|
|
}
|
|
|
|
public void init(CollationElementIterator iter) {
|
|
cei_ = iter;
|
|
init(iter.getRuleBasedCollator());
|
|
}
|
|
|
|
private void init(RuleBasedCollator coll) {
|
|
strength_ = coll.getStrength();
|
|
toShift_ = coll.isAlternateHandlingShifted();
|
|
isShifted_ = false;
|
|
variableTop_ = coll.getVariableTop();
|
|
}
|
|
|
|
@SuppressWarnings("fallthrough")
|
|
private long processCE(int ce) {
|
|
long primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
|
|
|
|
// This is clean, but somewhat slow...
|
|
// We could apply the mask to ce and then
|
|
// just get all three orders...
|
|
switch (strength_) {
|
|
default:
|
|
tertiary = CollationElementIterator.tertiaryOrder(ce);
|
|
/* note fall-through */
|
|
|
|
case Collator.SECONDARY:
|
|
secondary = CollationElementIterator.secondaryOrder(ce);
|
|
/* note fall-through */
|
|
|
|
case Collator.PRIMARY:
|
|
primary = CollationElementIterator.primaryOrder(ce);
|
|
}
|
|
|
|
// **** This should probably handle continuations too. ****
|
|
// **** That means that we need 24 bits for the primary ****
|
|
// **** instead of the 16 that we're currently using. ****
|
|
// **** So we can lay out the 64 bits as: 24.12.12.16. ****
|
|
// **** Another complication with continuations is that ****
|
|
// **** the *second* CE is marked as a continuation, so ****
|
|
// **** we always have to peek ahead to know how long ****
|
|
// **** the primary is... ****
|
|
if ((toShift_ && variableTop_ > ce && primary != 0) || (isShifted_ && primary == 0)) {
|
|
|
|
if (primary == 0) {
|
|
return CollationElementIterator.IGNORABLE;
|
|
}
|
|
|
|
if (strength_ >= Collator.QUATERNARY) {
|
|
quaternary = primary;
|
|
}
|
|
|
|
primary = secondary = tertiary = 0;
|
|
isShifted_ = true;
|
|
} else {
|
|
if (strength_ >= Collator.QUATERNARY) {
|
|
quaternary = 0xFFFF;
|
|
}
|
|
|
|
isShifted_ = false;
|
|
}
|
|
|
|
return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
|
|
}
|
|
|
|
/**
|
|
* Get the processed ordering priority of the next collation element in the text.
|
|
* A single character may contain more than one collation element.
|
|
*
|
|
* Note: This is equivalent to
|
|
* UCollationPCE::nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
|
|
*
|
|
* @param range receiving the iterator index before/after fetching the CE.
|
|
* @return The next collation elements ordering, otherwise returns PROCESSED_NULLORDER
|
|
* if an error has occurred or if the end of string has been reached
|
|
*/
|
|
public long nextProcessed(Range range) {
|
|
long result = CollationElementIterator.IGNORABLE;
|
|
int low = 0, high = 0;
|
|
|
|
pceBuffer_.reset();
|
|
|
|
do {
|
|
low = cei_.getOffset();
|
|
int ce = cei_.next();
|
|
high = cei_.getOffset();
|
|
|
|
if (ce == CollationElementIterator.NULLORDER) {
|
|
result = PROCESSED_NULLORDER;
|
|
break;
|
|
}
|
|
|
|
result = processCE(ce);
|
|
} while (result == CollationElementIterator.IGNORABLE);
|
|
|
|
if (range != null) {
|
|
range.ixLow_ = low;
|
|
range.ixHigh_ = high;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* Get the processed ordering priority of the previous collation element in the text.
|
|
* A single character may contain more than one collation element.
|
|
*
|
|
* Note: This is equivalent to
|
|
* UCollationPCE::previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
|
|
*
|
|
* @param range receiving the iterator index before/after fetching the CE.
|
|
* @return The previous collation elements ordering, otherwise returns
|
|
* PROCESSED_NULLORDER if an error has occurred or if the start of
|
|
* string has been reached.
|
|
*/
|
|
public long previousProcessed(Range range) {
|
|
long result = CollationElementIterator.IGNORABLE;
|
|
int low = 0, high = 0;
|
|
|
|
// pceBuffer_.reset();
|
|
|
|
while (pceBuffer_.empty()) {
|
|
// buffer raw CEs up to non-ignorable primary
|
|
RCEBuffer rceb = new RCEBuffer();
|
|
int ce;
|
|
|
|
boolean finish = false;
|
|
|
|
// **** do we need to reset rceb, or will it always be empty at this point ****
|
|
do {
|
|
high = cei_.getOffset();
|
|
ce = cei_.previous();
|
|
low = cei_.getOffset();
|
|
|
|
if (ce == CollationElementIterator.NULLORDER) {
|
|
if (!rceb.empty()) {
|
|
break;
|
|
}
|
|
|
|
finish = true;
|
|
break;
|
|
}
|
|
|
|
rceb.put(ce, low, high);
|
|
} while ((ce & PRIMARYORDERMASK) == 0 || isContinuation(ce));
|
|
|
|
if (finish) {
|
|
break;
|
|
}
|
|
|
|
// process the raw CEs
|
|
while (!rceb.empty()) {
|
|
RCEI rcei = rceb.get();
|
|
|
|
result = processCE(rcei.ce_);
|
|
|
|
if (result != CollationElementIterator.IGNORABLE) {
|
|
pceBuffer_.put(result, rcei.low_, rcei.high_);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (pceBuffer_.empty()) {
|
|
// **** Is -1 the right value for ixLow, ixHigh? ****
|
|
if (range != null) {
|
|
range.ixLow_ = -1;
|
|
range.ixHigh_ = -1;
|
|
}
|
|
return CollationElementIterator.NULLORDER;
|
|
}
|
|
|
|
PCEI pcei = pceBuffer_.get();
|
|
|
|
if (range != null) {
|
|
range.ixLow_ = pcei.low_;
|
|
range.ixHigh_ = pcei.high_;
|
|
}
|
|
|
|
return pcei.ce_;
|
|
}
|
|
|
|
private static boolean isContinuation(int ce) {
|
|
return ((ce & CONTINUATION_MARKER) == CONTINUATION_MARKER);
|
|
}
|
|
|
|
/**
|
|
* @hide Only a subset of ICU is exposed in Android
|
|
*/
|
|
public static final class Range {
|
|
int ixLow_;
|
|
int ixHigh_;
|
|
}
|
|
|
|
/** Processed collation element buffer stuff ported from ICU4C ucoleitr.cpp */
|
|
private static final class PCEI {
|
|
long ce_;
|
|
int low_;
|
|
int high_;
|
|
}
|
|
|
|
private static final class PCEBuffer {
|
|
private PCEI[] buffer_ = new PCEI[DEFAULT_BUFFER_SIZE];
|
|
private int bufferIndex_ = 0;
|
|
|
|
void reset() {
|
|
bufferIndex_ = 0;
|
|
}
|
|
|
|
boolean empty() {
|
|
return bufferIndex_ <= 0;
|
|
}
|
|
|
|
void put(long ce, int ixLow, int ixHigh)
|
|
{
|
|
if (bufferIndex_ >= buffer_.length) {
|
|
PCEI[] newBuffer = new PCEI[buffer_.length + BUFFER_GROW];
|
|
System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length);
|
|
buffer_ = newBuffer;
|
|
}
|
|
buffer_[bufferIndex_] = new PCEI();
|
|
buffer_[bufferIndex_].ce_ = ce;
|
|
buffer_[bufferIndex_].low_ = ixLow;
|
|
buffer_[bufferIndex_].high_ = ixHigh;
|
|
|
|
bufferIndex_ += 1;
|
|
}
|
|
|
|
PCEI get() {
|
|
if (bufferIndex_ > 0) {
|
|
return buffer_[--bufferIndex_];
|
|
}
|
|
return null;
|
|
}
|
|
}
|
|
|
|
/** Raw collation element buffer stuff ported from ICU4C ucoleitr.cpp */
|
|
private static final class RCEI {
|
|
int ce_;
|
|
int low_;
|
|
int high_;
|
|
}
|
|
|
|
private static final class RCEBuffer {
|
|
private RCEI[] buffer_ = new RCEI[DEFAULT_BUFFER_SIZE];
|
|
private int bufferIndex_ = 0;
|
|
|
|
boolean empty() {
|
|
return bufferIndex_ <= 0;
|
|
}
|
|
|
|
void put(int ce, int ixLow, int ixHigh) {
|
|
if (bufferIndex_ >= buffer_.length) {
|
|
RCEI[] newBuffer = new RCEI[buffer_.length + BUFFER_GROW];
|
|
System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length);
|
|
buffer_ = newBuffer;
|
|
}
|
|
buffer_[bufferIndex_] = new RCEI();
|
|
buffer_[bufferIndex_].ce_ = ce;
|
|
buffer_[bufferIndex_].low_ = ixLow;
|
|
buffer_[bufferIndex_].high_ = ixHigh;
|
|
|
|
bufferIndex_ += 1;
|
|
}
|
|
|
|
RCEI get() {
|
|
if (bufferIndex_ > 0) {
|
|
return buffer_[--bufferIndex_];
|
|
}
|
|
return null;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Java port of ICU4C CEI (usearch.cpp)
|
|
*
|
|
* CEI Collation Element + source text index.
|
|
* These structs are kept in the circular buffer.
|
|
*/
|
|
private static class CEI {
|
|
long ce_;
|
|
int lowIndex_;
|
|
int highIndex_;
|
|
}
|
|
|
|
/**
|
|
* CEBuffer A circular buffer of CEs from the text being searched
|
|
*/
|
|
private static class CEBuffer {
|
|
// Java porting note: ICU4C uses the size for stack buffer
|
|
// static final int DEFAULT_CEBUFFER_SIZE = 96;
|
|
|
|
static final int CEBUFFER_EXTRA = 32;
|
|
static final int MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L = 8;
|
|
static final int MAX_TARGET_IGNORABLES_PER_PAT_OTHER = 3;
|
|
|
|
CEI[] buf_;
|
|
int bufSize_;
|
|
int firstIx_;
|
|
int limitIx_;
|
|
|
|
// Java porting note: No references in ICU4C implementation
|
|
// CollationElementIterator ceIter_;
|
|
|
|
StringSearch strSearch_;
|
|
|
|
CEBuffer(StringSearch ss) {
|
|
strSearch_ = ss;
|
|
bufSize_ = ss.pattern_.PCELength_ + CEBUFFER_EXTRA;
|
|
if (ss.search_.elementComparisonType_ != ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
|
|
String patText = ss.pattern_.text_;
|
|
if (patText != null) {
|
|
for (int i = 0; i < patText.length(); i++) {
|
|
char c = patText.charAt(i);
|
|
if (MIGHT_BE_JAMO_L(c)) {
|
|
bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
|
|
} else {
|
|
// No check for surrogates, we might allocate slightly more buffer than necessary.
|
|
bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Not used - see above
|
|
// ceIter_ = ss.textIter_;
|
|
|
|
firstIx_ = 0;
|
|
limitIx_ = 0;
|
|
|
|
if (!ss.initTextProcessedIter()) {
|
|
return;
|
|
}
|
|
|
|
buf_ = new CEI[bufSize_];
|
|
}
|
|
|
|
// Get the CE with the specified index.
|
|
// Index must be in the range
|
|
// n-history_size < index < n+1
|
|
// where n is the largest index to have been fetched by some previous call to this function.
|
|
// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
|
|
//
|
|
CEI get(int index) {
|
|
int i = index % bufSize_;
|
|
|
|
if (index >= firstIx_ && index < limitIx_) {
|
|
// The request was for an entry already in our buffer.
|
|
// Just return it.
|
|
return buf_[i];
|
|
}
|
|
|
|
// Caller is requesting a new, never accessed before, CE.
|
|
// Verify that it is the next one in sequence, which is all
|
|
// that is allowed.
|
|
if (index != limitIx_) {
|
|
assert(false);
|
|
return null;
|
|
}
|
|
|
|
// Manage the circular CE buffer indexing
|
|
limitIx_++;
|
|
|
|
if (limitIx_ - firstIx_ >= bufSize_) {
|
|
// The buffer is full, knock out the lowest-indexed entry.
|
|
firstIx_++;
|
|
}
|
|
|
|
CollationPCE.Range range = new CollationPCE.Range();
|
|
if (buf_[i] == null) {
|
|
buf_[i] = new CEI();
|
|
}
|
|
buf_[i].ce_ = strSearch_.textProcessedIter_.nextProcessed(range);
|
|
buf_[i].lowIndex_ = range.ixLow_;
|
|
buf_[i].highIndex_ = range.ixHigh_;
|
|
|
|
return buf_[i];
|
|
}
|
|
|
|
// Get the CE with the specified index.
|
|
// Index must be in the range
|
|
// n-history_size < index < n+1
|
|
// where n is the largest index to have been fetched by some previous call to this function.
|
|
// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
|
|
//
|
|
CEI getPrevious(int index) {
|
|
int i = index % bufSize_;
|
|
|
|
if (index >= firstIx_ && index < limitIx_) {
|
|
// The request was for an entry already in our buffer.
|
|
// Just return it.
|
|
return buf_[i];
|
|
}
|
|
|
|
// Caller is requesting a new, never accessed before, CE.
|
|
// Verify that it is the next one in sequence, which is all
|
|
// that is allowed.
|
|
if (index != limitIx_) {
|
|
assert(false);
|
|
return null;
|
|
}
|
|
|
|
// Manage the circular CE buffer indexing
|
|
limitIx_++;
|
|
|
|
if (limitIx_ - firstIx_ >= bufSize_) {
|
|
// The buffer is full, knock out the lowest-indexed entry.
|
|
firstIx_++;
|
|
}
|
|
|
|
CollationPCE.Range range = new CollationPCE.Range();
|
|
if (buf_[i] == null) {
|
|
buf_[i] = new CEI();
|
|
}
|
|
buf_[i].ce_ = strSearch_.textProcessedIter_.previousProcessed(range);
|
|
buf_[i].lowIndex_ = range.ixLow_;
|
|
buf_[i].highIndex_ = range.ixHigh_;
|
|
|
|
return buf_[i];
|
|
}
|
|
|
|
static boolean MIGHT_BE_JAMO_L(char c) {
|
|
return (c >= 0x1100 && c <= 0x115E)
|
|
|| (c >= 0x3131 && c <= 0x314E)
|
|
|| (c >= 0x3165 && c <= 0x3186);
|
|
}
|
|
}
|
|
}
|