/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
*******************************************************************************
* Copyright (C) 1996-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package android.icu.text;
import java.text.CharacterIterator;
import java.util.HashMap;
import java.util.Map;
import android.icu.impl.CharacterIteratorWrapper;
import android.icu.impl.coll.Collation;
import android.icu.impl.coll.CollationData;
import android.icu.impl.coll.CollationIterator;
import android.icu.impl.coll.ContractionsAndExpansions;
import android.icu.impl.coll.FCDIterCollationIterator;
import android.icu.impl.coll.FCDUTF16CollationIterator;
import android.icu.impl.coll.IterCollationIterator;
import android.icu.impl.coll.UTF16CollationIterator;
import android.icu.impl.coll.UVector32;
/**
 * <code>CollationElementIterator</code> is an iterator created by
 * a RuleBasedCollator to walk through a string. The return result of
 * each iteration is a 32-bit collation element (CE) that defines the
 * ordering priority of the next character or sequence of characters
 * in the source string.
 *
 * <p>For illustration, consider the following in Slovak and in traditional Spanish collation:
 * <blockquote>
 * <pre>
 * "ca" -&gt; the first collation element is CE('c') and the second
 *         collation element is CE('a').
 * "cha" -&gt; the first collation element is CE('ch') and the second
 *          collation element is CE('a').
 * </pre>
 * </blockquote>
 * And in German phonebook collation,
 * <blockquote>
 * <pre>
 * Since the character 'æ' is a composed character of 'a' and 'e', the
 * iterator returns two collation elements for the single character 'æ'
 *
 * "æb" -&gt; the first collation element is collation_element('a'), the
 *         second collation element is collation_element('e'), and the
 *         third collation element is collation_element('b').
 * </pre>
 * </blockquote>
 *
 * <p>For collation ordering comparison, the collation element results
 * can not be compared simply by using basic arithmetic operators,
 * e.g. &lt;, == or &gt;, further processing has to be done. Details
 * can be found in the ICU User Guide. An example of using the
 * CollationElementIterator for collation ordering comparison is the class
 * {@link android.icu.text.StringSearch}.
 *
 * <p>To construct a CollationElementIterator object, users
 * call the method getCollationElementIterator() on a
 * RuleBasedCollator that defines the desired sorting order.
 *
 * <p>Example:
 * <blockquote>
 * <pre>
 *  String testString = "This is a test";
 *  RuleBasedCollator rbc = new RuleBasedCollator("&amp;a&lt;b");
 *  CollationElementIterator iterator = rbc.getCollationElementIterator(testString);
 *  int primaryOrder = iterator.IGNORABLE;
 *  while (primaryOrder != iterator.NULLORDER) {
 *      int order = iterator.next();
 *      if (order != iterator.IGNORABLE &amp;&amp;
 *          order != iterator.NULLORDER) {
 *          // order is valid, not ignorable and we have not passed the end
 *          // of the iteration, we do something
 *          primaryOrder = CollationElementIterator.primaryOrder(order);
 *          System.out.println("Next primary order 0x" +
 *                             Integer.toHexString(primaryOrder));
 *      }
 *  }
 * </pre>
 * </blockquote>
 * <p>
 * The method next() returns the collation order of the next character based on
 * the comparison level of the collator. The method previous() returns the
 * collation order of the previous character based on the comparison level of
 * the collator. The Collation Element Iterator moves only in one direction
 * between calls to reset(), setOffset(), or setText(). That is, next() and
 * previous() can not be inter-used. Whenever previous() is to be called after
 * next() or vice versa, reset(), setOffset() or setText() has to be called first
 * to reset the status, shifting current position to either the end or the start of
 * the string (reset() or setText()), or the specified position (setOffset()).
 * Hence at the next call of next() or previous(), the first or last collation order,
 * or collation order at the specified position will be returned. If a change of
 * direction is done without one of these calls, the result is undefined.
 * <p>
* This class is not subclassable. * @see Collator * @see RuleBasedCollator * @see StringSearch * @author Syn Wee Quek */ public final class CollationElementIterator { private CollationIterator iter_; // owned private RuleBasedCollator rbc_; // aliased private int otherHalf_; /** * <0: backwards; 0: just after reset() (previous() begins from end); * 1: just after setOffset(); >1: forward */ private byte dir_; /** * Stores offsets from expansions and from unsafe-backwards iteration, * so that getOffset() returns intermediate offsets for the CEs * that are consistent with forward iteration. */ private UVector32 offsets_; private String string_; // TODO: needed in Java? if so, then add a UCharacterIterator field too? /** * This constant is returned by the iterator in the methods * next() and previous() when the end or the beginning of the * source string has been reached, and there are no more valid * collation elements to return. * *
See class documentation for an example of use. * @see #next * @see #previous */ public final static int NULLORDER = 0xffffffff; /** * This constant is returned by the iterator in the methods * next() and previous() when a collation element result is to be * ignored. * *
See class documentation for an example of use. * @see #next * @see #previous */ public static final int IGNORABLE = 0; /** * Return the primary order of the specified collation element, * i.e. the first 16 bits. This value is unsigned. * @param ce the collation element * @return the element's 16 bits primary order. */ public final static int primaryOrder(int ce) { return (ce >>> 16) & 0xffff; } /** * Return the secondary order of the specified collation element, * i.e. the 16th to 23th bits, inclusive. This value is unsigned. * @param ce the collation element * @return the element's 8 bits secondary order */ public final static int secondaryOrder(int ce) { return (ce >>> 8) & 0xff; } /** * Return the tertiary order of the specified collation element, i.e. the last * 8 bits. This value is unsigned. * @param ce the collation element * @return the element's 8 bits tertiary order */ public final static int tertiaryOrder(int ce) { return ce & 0xff; } private static final int getFirstHalf(long p, int lower32) { return ((int)p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff); } private static final int getSecondHalf(long p, int lower32) { return ((int)p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f); } private static final boolean ceNeedsTwoParts(long ce) { return (ce & 0xffff00ff003fL) != 0; } private CollationElementIterator(RuleBasedCollator collator) { iter_ = null; rbc_ = collator; otherHalf_ = 0; dir_ = 0; offsets_ = null; } /** * CollationElementIterator constructor. This takes a source * string and a RuleBasedCollator. The iterator will walk through * the source string based on the rules defined by the * collator. If the source string is empty, NULLORDER will be * returned on the first call to next(). * * @param source the source string. 
* @param collator the RuleBasedCollator */ CollationElementIterator(String source, RuleBasedCollator collator) { this(collator); setText(source); } // Note: The constructors should take settings & tailoring, not a collator, // to avoid circular dependencies. // However, for equals() we would need to be able to compare tailoring data for equality // without making CollationData or CollationTailoring depend on TailoredSet. // (See the implementation of RuleBasedCollator.equals().) // That might require creating an intermediate class that would be used // by both CollationElementIterator and RuleBasedCollator // but only contain the part of RBC.equals() related to data and rules. /** * CollationElementIterator constructor. This takes a source * character iterator and a RuleBasedCollator. The iterator will * walk through the source string based on the rules defined by * the collator. If the source string is empty, NULLORDER will be * returned on the first call to next(). * * @param source the source string iterator. * @param collator the RuleBasedCollator */ CollationElementIterator(CharacterIterator source, RuleBasedCollator collator) { this(collator); setText(source); } /** * CollationElementIterator constructor. This takes a source * character iterator and a RuleBasedCollator. The iterator will * walk through the source string based on the rules defined by * the collator. If the source string is empty, NULLORDER will be * returned on the first call to next(). * * @param source the source string iterator. * @param collator the RuleBasedCollator */ CollationElementIterator(UCharacterIterator source, RuleBasedCollator collator) { this(collator); setText(source); } /** * Returns the character offset in the source string * corresponding to the next collation element. I.e., getOffset() * returns the position in the source string corresponding to the * collation element that will be returned by the next call to * next() or previous(). This value could be any of: *
* (If <code>setOffset(offset)</code> sets the index in the middle of
* a contraction, <code>getOffset()</code> returns the index of
* the first character in the contraction, which may not be equal
* to the original offset that was set. Hence calling getOffset()
* immediately after setOffset(offset) does not guarantee that the
* original offset set will be returned.)
* This iterator iterates over a sequence of collation elements * that were built from the string. Because there isn't * necessarily a one-to-one mapping from characters to collation * elements, this doesn't mean the same thing as "return the * collation element [or ordering priority] of the next character * in the string". * *
This function returns the collation element that the * iterator is currently pointing to, and then updates the * internal pointer to point to the next element. * * @return the next collation element or NULLORDER if the end of the * iteration has been reached. */ public int next() { if (dir_ > 1) { // Continue forward iteration. Test this first. if (otherHalf_ != 0) { int oh = otherHalf_; otherHalf_ = 0; return oh; } } else if (dir_ == 1) { // next() after setOffset() dir_ = 2; } else if (dir_ == 0) { // The iter_ is already reset to the start of the text. dir_ = 2; } else /* dir_ < 0 */{ // illegal change of direction throw new IllegalStateException("Illegal change of direction"); // Java porting note: ICU4C sets U_INVALID_STATE_ERROR to the return status. } // No need to keep all CEs in the buffer when we iterate. iter_.clearCEsIfNoneRemaining(); long ce = iter_.nextCE(); if (ce == Collation.NO_CE) { return NULLORDER; } // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. long p = ce >>> 32; int lower32 = (int) ce; int firstHalf = getFirstHalf(p, lower32); int secondHalf = getSecondHalf(p, lower32); if (secondHalf != 0) { otherHalf_ = secondHalf | 0xc0; // continuation CE } return firstHalf; } /** * Get the previous collation element in the source string. * *
This iterator iterates over a sequence of collation elements * that were built from the string. Because there isn't * necessarily a one-to-one mapping from characters to collation * elements, this doesn't mean the same thing as "return the * collation element [or ordering priority] of the previous * character in the string". * *
This function updates the iterator's internal pointer to * point to the collation element preceding the one it's currently * pointing to and then returns that element, while next() returns * the current element and then updates the pointer. * * @return the previous collation element, or NULLORDER when the start of * the iteration has been reached. */ public int previous() { if (dir_ < 0) { // Continue backwards iteration. Test this first. if (otherHalf_ != 0) { int oh = otherHalf_; otherHalf_ = 0; return oh; } } else if (dir_ == 0) { iter_.resetToOffset(string_.length()); dir_ = -1; } else if (dir_ == 1) { // previous() after setOffset() dir_ = -1; } else /* dir_ > 1 */{ // illegal change of direction throw new IllegalStateException("Illegal change of direction"); // Java porting note: ICU4C sets U_INVALID_STATE_ERROR to the return status. } if (offsets_ == null) { offsets_ = new UVector32(); } // If we already have expansion CEs, then we also have offsets. // Otherwise remember the trailing offset in case we need to // write offsets for an artificial expansion. int limitOffset = iter_.getCEsLength() == 0 ? iter_.getOffset() : 0; long ce = iter_.previousCE(offsets_); if (ce == Collation.NO_CE) { return NULLORDER; } // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. long p = ce >>> 32; int lower32 = (int) ce; int firstHalf = getFirstHalf(p, lower32); int secondHalf = getSecondHalf(p, lower32); if (secondHalf != 0) { if (offsets_.isEmpty()) { // When we convert a single 64-bit CE into two 32-bit CEs, // we need to make this artificial expansion behave like a normal expansion. // See CollationIterator.previousCE(). offsets_.addElement(iter_.getOffset()); offsets_.addElement(limitOffset); } otherHalf_ = firstHalf; return secondHalf | 0xc0; // continuation CE } return firstHalf; } /** * Resets the cursor to the beginning of the string. 
* The next call to next() or previous() will return the first and last
     * collation element in the string, respectively.
     *
     * <p>If the RuleBasedCollator used by this iterator has had its
     * attributes changed, calling reset() will reinitialize the
     * iterator to use the new attributes.
     */
    public void reset() {
        // NOTE(review): this method only repositions iter_ at offset 0 and clears the
        // pending half-CE and the direction state; whether changed collator attributes
        // are picked up is not visible here -- confirm against CollationIterator.
        iter_ .resetToOffset(0);
        otherHalf_ = 0;
        dir_ = 0;
    }

    /**
     * Sets the iterator to point to the collation element
     * corresponding to the character at the specified offset. The
     * value returned by the next call to next() will be the collation
     * element corresponding to the characters at offset.
     *
* <p>If offset is in the middle of a contracting character
     * sequence, the iterator is adjusted to the start of the
     * contracting sequence. This means that getOffset() is not
     * guaranteed to return the same value set by this method.
     *
     * <p>If the decomposition mode is on, and offset is in the middle
     * of a decomposable range of source text, the iterator may not
     * return a correct result for the next forwards or backwards
     * iteration.  The user must ensure that the offset is not in the
     * middle of a decomposable range.
     *
     * @param newOffset the character offset into the original source string to
     *        set. Note that this is not an offset into the corresponding
     *        sequence of collation elements.
     */
    public void setOffset(int newOffset) {
        // Offsets at or outside the text boundaries are passed to the iterator unchanged.
        if (0 < newOffset && newOffset < string_.length()) {
            int offset = newOffset;
            do {
                char c = string_.charAt(offset);
                if (!rbc_.isUnsafe(c) ||
                        (Character.isHighSurrogate(c) && !rbc_.isUnsafe(string_.codePointAt(offset)))) {
                    break;
                }
                // Back up to before this unsafe character.
                --offset;
            } while (offset > 0);
            if (offset < newOffset) {
                // We might have backed up more than necessary.
                // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
                // but for text "chu" setOffset(2) should remain at 2
                // although we initially back up to offset 0.
                // Find the last safe offset no greater than newOffset by iterating forward.
                int lastSafeOffset = offset;
                do {
                    iter_.resetToOffset(lastSafeOffset);
                    do {
                        iter_.nextCE();
                    } while ((offset = iter_.getOffset()) == lastSafeOffset);
                    if (offset <= newOffset) {
                        lastSafeOffset = offset;
                    }
                } while (offset < newOffset);
                newOffset = lastSafeOffset;
            }
        }
        iter_.resetToOffset(newOffset);
        otherHalf_ = 0;
        dir_ = 1;
    }

    /**
     * Set a new source string for iteration, and reset the offset
     * to the beginning of the text.
     *
     * @param source the new source string for iteration.
     */
    public void setText(String source) {
        string_ = source; // TODO: do we need to remember the source string in a field?
        CollationIterator newIter;
        boolean numeric = rbc_.settings.readOnly().isNumeric();
        // Use the plain UTF-16 iterator when the collator settings say FCD checking is off.
        if (rbc_.settings.readOnly().dontCheckFCD()) {
            newIter = new UTF16CollationIterator(rbc_.data, numeric, string_, 0);
        } else {
            newIter = new FCDUTF16CollationIterator(rbc_.data, numeric, string_, 0);
        }
        iter_ = newIter;
        otherHalf_ = 0;
        dir_ = 0;
    }

    /**
     * Set a new source string iterator for iteration, and reset the
     * offset to the beginning of the text.
     *
* <p>The source iterator's integrity will be preserved since a new copy
* will be created for use.
* @param source the new source string iterator for iteration.
*/
public void setText(UCharacterIterator source) {
    // Keep the text itself as well: previous() and setOffset() consult string_.
    string_ = source.getText(); // TODO: do we need to remember the source string in a field?

    // Note: In C++, we just setText(source.getText()).
    // In Java, we actually operate on a character iterator.
    // (The old code apparently did so only for a CharacterIterator;
    // for a UCharacterIterator it also just used source.getText()).
    // TODO: do we need to remember the cloned iterator in a field?

    // Iterate over a private clone rather than over the caller's iterator.
    UCharacterIterator cursor;
    try {
        cursor = (UCharacterIterator) source.clone();
    } catch (CloneNotSupportedException ignored) {
        // Fall back to ICU 52 behavior of iterating over the text contents
        // of the UCharacterIterator.
        setText(source.getText());
        return;
    }
    cursor.setToStart();

    // Pick the iterator flavor from the collator settings: without or with FCD checking.
    final boolean numeric = rbc_.settings.readOnly().isNumeric();
    final boolean skipFcdCheck = rbc_.settings.readOnly().dontCheckFCD();
    iter_ = skipFcdCheck
            ? new IterCollationIterator(rbc_.data, numeric, cursor)
            : new FCDIterCollationIterator(rbc_.data, numeric, cursor, 0);

    // Same state as right after reset(): nothing pending, no direction chosen yet.
    otherHalf_ = 0;
    dir_ = 0;
}
/**
 * Replaces the text being iterated with the contents of a
 * {@link CharacterIterator} and positions the iteration at the
 * beginning of that text.
 *
 * @param source the new source string iterator for iteration.
 */
public void setText(CharacterIterator source) {
    // Note: In C++, we just setText(source.getText()).
    // In Java, we actually operate on a character iterator.
    // TODO: do we need to remember the iterator in a field?
    // TODO: apparently we don't clone a CharacterIterator in Java,
    // we only clone the text for a UCharacterIterator?? see the old code in the constructors

    // No clone() is made in this overload; the given iterator is wrapped as-is.
    UCharacterIterator wrapped = new CharacterIteratorWrapper(source);
    wrapped.setToStart();
    string_ = wrapped.getText(); // TODO: do we need to remember the source string in a field?

    // Pick the iterator flavor from the collator settings: without or with FCD checking.
    final boolean numeric = rbc_.settings.readOnly().isNumeric();
    final boolean skipFcdCheck = rbc_.settings.readOnly().dontCheckFCD();
    iter_ = skipFcdCheck
            ? new IterCollationIterator(rbc_.data, numeric, wrapped)
            : new FCDIterCollationIterator(rbc_.data, numeric, wrapped, 0);

    // Same state as right after reset(): nothing pending, no direction chosen yet.
    otherHalf_ = 0;
    dir_ = 0;
}
private static final class MaxExpSink implements ContractionsAndExpansions.CESink {
MaxExpSink(Map