537 lines
21 KiB
Java
537 lines
21 KiB
Java
/* GENERATED SOURCE. DO NOT MODIFY. */
|
|
// © 2016 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
/*
|
|
******************************************************************************
|
|
* Copyright (C) 1996-2015, International Business Machines Corporation and
|
|
* others. All Rights Reserved.
|
|
******************************************************************************
|
|
*/
|
|
|
|
package android.icu.impl;
|
|
|
|
import java.util.NoSuchElementException;
|
|
|
|
import android.icu.lang.UCharacter;
|
|
import android.icu.text.UTF16;
|
|
import android.icu.util.RangeValueIterator;
|
|
|
|
/**
|
|
* <p>Class enabling iteration of the values in a Trie.</p>
|
|
*
|
|
* <p>2015-sep-03 TODO: Only used in test code, move there.
|
|
*
|
|
* <p>Result of each iteration contains the interval of codepoints that have
|
|
* the same value type and the value type itself.</p>
|
|
* <p>The comparison of each codepoint value is done via extract(), which the
|
|
* default implementation is to return the value as it is.</p>
|
|
* <p>Method extract() can be overwritten to perform manipulations on
|
|
* codepoint values in order to perform specialized comparison.</p>
|
|
* <p>TrieIterator is designed to be a generic iterator for the CharTrie
|
|
* and the IntTrie, hence to accommodate both types of data, the return
|
|
* result will be in terms of int (32 bit) values.</p>
|
|
* <p>See android.icu.text.UCharacterTypeIterator for examples of use.</p>
|
|
* <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
|
|
* Internally, icu4c's utrie_enum performs all iterations in its body. In Java
|
|
* sense, the caller will have to pass a object with a callback function
|
|
* UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
|
|
* uint32_t value) into utrie_enum. utrie_enum will then find ranges of
|
|
* codepoints with the same value as determined by
|
|
* UTrieEnumValue(const void *context, uint32_t value). for each range,
|
|
* utrie_enum calls the callback function to perform a task. In this way,
|
|
* icu4c performs the iteration within utrie_enum.
|
|
* To follow the JDK model, icu4j is slightly different from icu4c.
|
|
* Instead of requesting the caller to implement an object for a callback.
|
|
* The caller will have to implement a subclass of TrieIterator, fleshing out
|
|
* the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
|
|
* the caller will have to code his own iteration and flesh out the task
|
|
* (equivalent to UTrieEnumRange) to be performed in the iteration loop.
|
|
* </p>
|
|
* <p>There are basically 3 usage scenarios for porting:</p>
|
|
* <p>1) UTrieEnumValue is the only implemented callback then just implement a
|
|
* subclass of TrieIterator and override the extract(int) method. The
|
|
* extract(int) method is analogous to UTrieEnumValue callback.
|
|
* </p>
|
|
* <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
|
|
* a subclass of TrieIterator, override the extract method and iterate, e.g
|
|
* </p>
|
|
* <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
|
|
* set);<br>
|
|
* In Java :<br>
|
|
* <pre>
|
|
* class TrieIteratorImpl extends TrieIterator{
|
|
* public TrieIteratorImpl(Trie data){
|
|
* super(data);
|
|
* }
|
|
* public int extract(int value){
|
|
* // port the implementation of _enumPropertyStartsValue here
|
|
* }
|
|
* }
|
|
* ....
|
|
* TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
|
|
* while(fcdIter.next(result)) {
|
|
* // port the implementation of _enumPropertyStartsRange
|
|
* }
|
|
* </pre>
|
|
* </p>
|
|
* <p>3) UTrieEnumRange is the only implemented callback then just implement
|
|
* the while loop, when utrie_enum is called
|
|
* <pre>
|
|
* // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
|
|
* TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie);
|
|
* while(fcdIter.next(result)){
|
|
* set.add(result.start);
|
|
* }
|
|
* </p>
|
|
* @author synwee
|
|
* @see android.icu.impl.Trie
|
|
* @hide Only a subset of ICU is exposed in Android
|
|
*/
|
|
public class TrieIterator implements RangeValueIterator
|
|
|
|
{
|
|
// public constructor ---------------------------------------------
|
|
|
|
/**
|
|
* TrieEnumeration constructor
|
|
* @param trie to be used
|
|
* @exception IllegalArgumentException throw when argument is null.
|
|
*/
|
|
public TrieIterator(Trie trie)
|
|
{
|
|
if (trie == null) {
|
|
throw new IllegalArgumentException(
|
|
"Argument trie cannot be null");
|
|
}
|
|
m_trie_ = trie;
|
|
// synwee: check that extract belongs to the child class
|
|
m_initialValue_ = extract(m_trie_.getInitialValue());
|
|
reset();
|
|
}
|
|
|
|
// public methods -------------------------------------------------
|
|
|
|
/**
|
|
* <p>Returns true if we are not at the end of the iteration, false
|
|
* otherwise.</p>
|
|
* <p>The next set of codepoints with the same value type will be
|
|
* calculated during this call and returned in the argument element.</p>
|
|
* @param element return result
|
|
* @return true if we are not at the end of the iteration, false otherwise.
|
|
* @exception NoSuchElementException - if no more elements exist.
|
|
* @see android.icu.util.RangeValueIterator.Element
|
|
*/
|
|
@Override
|
|
public final boolean next(Element element)
|
|
{
|
|
if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
|
|
return false;
|
|
}
|
|
if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
|
|
calculateNextBMPElement(element)) {
|
|
return true;
|
|
}
|
|
calculateNextSupplementaryElement(element);
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Resets the iterator to the beginning of the iteration
|
|
*/
|
|
@Override
|
|
public final void reset()
|
|
{
|
|
m_currentCodepoint_ = 0;
|
|
m_nextCodepoint_ = 0;
|
|
m_nextIndex_ = 0;
|
|
m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
|
|
if (m_nextBlock_ == m_trie_.m_dataOffset_) {
|
|
m_nextValue_ = m_initialValue_;
|
|
}
|
|
else {
|
|
m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
|
|
}
|
|
m_nextBlockIndex_ = 0;
|
|
m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
|
|
}
|
|
|
|
// protected methods ----------------------------------------------
|
|
|
|
/**
|
|
* Called by next() to extracts a 32 bit value from a trie value
|
|
* used for comparison.
|
|
* This method is to be overwritten if special manipulation is to be done
|
|
* to retrieve a relevant comparison.
|
|
* The default function is to return the value as it is.
|
|
* @param value a value from the trie
|
|
* @return extracted value
|
|
*/
|
|
protected int extract(int value)
|
|
{
|
|
return value;
|
|
}
|
|
|
|
// private methods ------------------------------------------------
|
|
|
|
/**
|
|
* Set the result values
|
|
* @param element return result object
|
|
* @param start codepoint of range
|
|
* @param limit (end + 1) codepoint of range
|
|
* @param value common value of range
|
|
*/
|
|
private final void setResult(Element element, int start, int limit,
|
|
int value)
|
|
{
|
|
element.start = start;
|
|
element.limit = limit;
|
|
element.value = value;
|
|
}
|
|
|
|
/**
|
|
* Finding the next element.
|
|
* This method is called just before returning the result of
|
|
* next().
|
|
* We always store the next element before it is requested.
|
|
* In the case that we have to continue calculations into the
|
|
* supplementary planes, a false will be returned.
|
|
* @param element return result object
|
|
* @return true if the next range is found, false if we have to proceed to
|
|
* the supplementary range.
|
|
*/
|
|
private final boolean calculateNextBMPElement(Element element)
|
|
{
|
|
int currentValue = m_nextValue_;
|
|
m_currentCodepoint_ = m_nextCodepoint_;
|
|
m_nextCodepoint_ ++;
|
|
m_nextBlockIndex_ ++;
|
|
if (!checkBlockDetail(currentValue)) {
|
|
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
|
|
currentValue);
|
|
return true;
|
|
}
|
|
// synwee check that next block index == 0 here
|
|
// enumerate BMP - the main loop enumerates data blocks
|
|
while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
|
|
// because of the way the character is split to form the index
|
|
// the lead surrogate and trail surrogate can not be in the
|
|
// mid of a block
|
|
if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
|
|
// skip lead surrogate code units,
|
|
// go to lead surrogate codepoints
|
|
m_nextIndex_ = BMP_INDEX_LENGTH_;
|
|
}
|
|
else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
|
|
// go back to regular BMP code points
|
|
m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
|
|
} else {
|
|
m_nextIndex_ ++;
|
|
}
|
|
|
|
m_nextBlockIndex_ = 0;
|
|
if (!checkBlock(currentValue)) {
|
|
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
|
|
currentValue);
|
|
return true;
|
|
}
|
|
}
|
|
m_nextCodepoint_ --; // step one back since this value has not been
|
|
m_nextBlockIndex_ --; // retrieved yet.
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Finds the next supplementary element.
|
|
* For each entry in the trie, the value to be delivered is passed through
|
|
* extract().
|
|
* We always store the next element before it is requested.
|
|
* Called after calculateNextBMP() completes its round of BMP characters.
|
|
* There is a slight difference in the usage of m_currentCodepoint_
|
|
* here as compared to calculateNextBMP(). Though both represents the
|
|
* lower bound of the next element, in calculateNextBMP() it gets set
|
|
* at the start of any loop, where-else, in calculateNextSupplementary()
|
|
* since m_currentCodepoint_ already contains the lower bound of the
|
|
* next element (passed down from calculateNextBMP()), we keep it till
|
|
* the end before resetting it to the new value.
|
|
* Note, if there are no more iterations, it will never get to here.
|
|
* Blocked out by next().
|
|
* @param element return result object
|
|
*/
|
|
private final void calculateNextSupplementaryElement(Element element)
|
|
{
|
|
int currentValue = m_nextValue_;
|
|
m_nextCodepoint_ ++;
|
|
m_nextBlockIndex_ ++;
|
|
|
|
if (UTF16.getTrailSurrogate(m_nextCodepoint_)
|
|
!= UTF16.TRAIL_SURROGATE_MIN_VALUE) {
|
|
// this piece is only called when we are in the middle of a lead
|
|
// surrogate block
|
|
if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
|
|
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
|
|
currentValue);
|
|
m_currentCodepoint_ = m_nextCodepoint_;
|
|
return;
|
|
}
|
|
// we have cleared one block
|
|
m_nextIndex_ ++;
|
|
m_nextTrailIndexOffset_ ++;
|
|
if (!checkTrailBlock(currentValue)) {
|
|
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
|
|
currentValue);
|
|
m_currentCodepoint_ = m_nextCodepoint_;
|
|
return;
|
|
}
|
|
}
|
|
int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
|
|
// enumerate supplementary code points
|
|
while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
|
|
// lead surrogate access
|
|
final int leadBlock =
|
|
m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
|
|
Trie.INDEX_STAGE_2_SHIFT_;
|
|
if (leadBlock == m_trie_.m_dataOffset_) {
|
|
// no entries for a whole block of lead surrogates
|
|
if (currentValue != m_initialValue_) {
|
|
m_nextValue_ = m_initialValue_;
|
|
m_nextBlock_ = leadBlock; // == m_trie_.m_dataOffset_
|
|
m_nextBlockIndex_ = 0;
|
|
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
|
|
currentValue);
|
|
m_currentCodepoint_ = m_nextCodepoint_;
|
|
return;
|
|
}
|
|
|
|
nextLead += DATA_BLOCK_LENGTH_;
|
|
// number of total affected supplementary codepoints in one
|
|
// block
|
|
// this is not a simple addition of
|
|
// DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
|
|
// that we might have moved some of the codepoints
|
|
m_nextCodepoint_ = Character.toCodePoint((char)nextLead, (char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
|
|
continue;
|
|
}
|
|
if (m_trie_.m_dataManipulate_ == null) {
|
|
throw new NullPointerException(
|
|
"The field DataManipulate in this Trie is null");
|
|
}
|
|
// enumerate trail surrogates for this lead surrogate
|
|
m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
|
|
m_trie_.getValue(leadBlock +
|
|
(nextLead & Trie.INDEX_STAGE_3_MASK_)));
|
|
if (m_nextIndex_ <= 0) {
|
|
// no data for this lead surrogate
|
|
if (currentValue != m_initialValue_) {
|
|
m_nextValue_ = m_initialValue_;
|
|
m_nextBlock_ = m_trie_.m_dataOffset_;
|
|
m_nextBlockIndex_ = 0;
|
|
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
|
|
currentValue);
|
|
m_currentCodepoint_ = m_nextCodepoint_;
|
|
return;
|
|
}
|
|
m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
|
|
} else {
|
|
m_nextTrailIndexOffset_ = 0;
|
|
if (!checkTrailBlock(currentValue)) {
|
|
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
|
|
currentValue);
|
|
m_currentCodepoint_ = m_nextCodepoint_;
|
|
return;
|
|
}
|
|
}
|
|
nextLead ++;
|
|
}
|
|
|
|
// deliver last range
|
|
setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
|
|
currentValue);
|
|
}
|
|
|
|
/**
|
|
* Internal block value calculations
|
|
* Performs calculations on a data block to find codepoints in m_nextBlock_
|
|
* after the index m_nextBlockIndex_ that has the same value.
|
|
* Note m_*_ variables at this point is the next codepoint whose value
|
|
* has not been calculated.
|
|
* But when returned with false, it will be the last codepoint whose
|
|
* value has been calculated.
|
|
* @param currentValue the value which other codepoints are tested against
|
|
* @return true if the whole block has the same value as currentValue or if
|
|
* the whole block has been calculated, false otherwise.
|
|
*/
|
|
private final boolean checkBlockDetail(int currentValue)
|
|
{
|
|
while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
|
|
m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
|
|
m_nextBlockIndex_));
|
|
if (m_nextValue_ != currentValue) {
|
|
return false;
|
|
}
|
|
++ m_nextBlockIndex_;
|
|
++ m_nextCodepoint_;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Internal block value calculations
|
|
* Performs calculations on a data block to find codepoints in m_nextBlock_
|
|
* that has the same value.
|
|
* Will call checkBlockDetail() if highlevel check fails.
|
|
* Note m_*_ variables at this point is the next codepoint whose value
|
|
* has not been calculated.
|
|
* @param currentBlock the initial block containing all currentValue
|
|
* @param currentValue the value which other codepoints are tested against
|
|
* @return true if the whole block has the same value as currentValue or if
|
|
* the whole block has been calculated, false otherwise.
|
|
*/
|
|
private final boolean checkBlock(int currentValue)
|
|
{
|
|
int currentBlock = m_nextBlock_;
|
|
m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
|
|
Trie.INDEX_STAGE_2_SHIFT_;
|
|
if (m_nextBlock_ == currentBlock &&
|
|
(m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
|
|
// the block is the same as the previous one, filled with
|
|
// currentValue
|
|
m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
|
|
}
|
|
else if (m_nextBlock_ == m_trie_.m_dataOffset_) {
|
|
// this is the all-initial-value block
|
|
if (currentValue != m_initialValue_) {
|
|
m_nextValue_ = m_initialValue_;
|
|
m_nextBlockIndex_ = 0;
|
|
return false;
|
|
}
|
|
m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
|
|
}
|
|
else {
|
|
if (!checkBlockDetail(currentValue)) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Internal block value calculations
|
|
* Performs calculations on multiple data blocks for a set of trail
|
|
* surrogates to find codepoints in m_nextBlock_ that has the same value.
|
|
* Will call checkBlock() for internal block checks.
|
|
* Note m_*_ variables at this point is the next codepoint whose value
|
|
* has not been calculated.
|
|
* @param currentValue the value which other codepoints are tested against
|
|
* @return true if the whole block has the same value as currentValue or if
|
|
* the whole block has been calculated, false otherwise.
|
|
*/
|
|
private final boolean checkTrailBlock(int currentValue)
|
|
{
|
|
// enumerate code points for this lead surrogate
|
|
while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
|
|
{
|
|
// if we ever reach here, we are at the start of a new block
|
|
m_nextBlockIndex_ = 0;
|
|
// copy of most of the body of the BMP loop
|
|
if (!checkBlock(currentValue)) {
|
|
return false;
|
|
}
|
|
m_nextTrailIndexOffset_ ++;
|
|
m_nextIndex_ ++;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Checks if we are beginning at the start of a initial block.
|
|
* If we are then the rest of the codepoints in this initial block
|
|
* has the same values.
|
|
* We increment m_nextCodepoint_ and relevant data members if so.
|
|
* This is used only in for the supplementary codepoints because
|
|
* the offset to the trail indexes could be 0.
|
|
* @return true if we are at the start of a initial block.
|
|
*/
|
|
private final boolean checkNullNextTrailIndex()
|
|
{
|
|
if (m_nextIndex_ <= 0) {
|
|
m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
|
|
int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
|
|
int leadBlock =
|
|
m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
|
|
Trie.INDEX_STAGE_2_SHIFT_;
|
|
if (m_trie_.m_dataManipulate_ == null) {
|
|
throw new NullPointerException(
|
|
"The field DataManipulate in this Trie is null");
|
|
}
|
|
m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
|
|
m_trie_.getValue(leadBlock +
|
|
(nextLead & Trie.INDEX_STAGE_3_MASK_)));
|
|
m_nextIndex_ --;
|
|
m_nextBlockIndex_ = DATA_BLOCK_LENGTH_;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// private data members --------------------------------------------
|
|
|
|
/**
|
|
* Size of the stage 1 BMP indexes
|
|
*/
|
|
private static final int BMP_INDEX_LENGTH_ =
|
|
0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
|
|
/**
|
|
* Lead surrogate minimum value
|
|
*/
|
|
private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
|
|
/**
|
|
* Trail surrogate minimum value
|
|
*/
|
|
private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
|
|
/*
|
|
* Trail surrogate maximum value
|
|
*/
|
|
//private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;
|
|
/**
|
|
* Number of trail surrogate
|
|
*/
|
|
private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
|
|
/**
|
|
* Number of stage 1 indexes for supplementary calculations that maps to
|
|
* each lead surrogate character.
|
|
* See second pass into getRawOffset for the trail surrogate character.
|
|
* 10 for significant number of bits for trail surrogates, 5 for what we
|
|
* discard during shifting.
|
|
*/
|
|
private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
|
|
1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
|
|
/**
|
|
* Number of data values in a stage 2 (data array) block.
|
|
*/
|
|
private static final int DATA_BLOCK_LENGTH_ =
|
|
1 << Trie.INDEX_STAGE_1_SHIFT_;
|
|
// /**
|
|
// * Number of codepoints in a stage 2 block
|
|
// */
|
|
// private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ =
|
|
// DATA_BLOCK_LENGTH_ << 10;
|
|
/**
|
|
* Trie instance
|
|
*/
|
|
private Trie m_trie_;
|
|
/**
|
|
* Initial value for trie values
|
|
*/
|
|
private int m_initialValue_;
|
|
/**
|
|
* Next element results and data.
|
|
*/
|
|
private int m_currentCodepoint_;
|
|
private int m_nextCodepoint_;
|
|
private int m_nextValue_;
|
|
private int m_nextIndex_;
|
|
private int m_nextBlock_;
|
|
private int m_nextBlockIndex_;
|
|
private int m_nextTrailIndexOffset_;
|
|
}
|