2607 lines
102 KiB
Java
2607 lines
102 KiB
Java
/* GENERATED SOURCE. DO NOT MODIFY. */
|
|
// © 2016 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
/*
|
|
*******************************************************************************
|
|
* Copyright (C) 2000-2016, International Business Machines Corporation and
|
|
* others. All Rights Reserved.
|
|
*******************************************************************************
|
|
*/
|
|
package android.icu.text;
|
|
import java.nio.CharBuffer;
|
|
import java.text.CharacterIterator;
|
|
|
|
import android.icu.impl.Norm2AllModes;
|
|
import android.icu.impl.Normalizer2Impl;
|
|
import android.icu.impl.UCaseProps;
|
|
import android.icu.lang.UCharacter;
|
|
import android.icu.util.ICUCloneNotSupportedException;
|
|
|
|
/**
|
|
* Old Unicode normalization API.
|
|
*
|
|
* <p>This API has been replaced by the {@link Normalizer2} class and is only available
|
|
* for backward compatibility. This class simply delegates to the Normalizer2 class.
|
|
* There are two exceptions: The new API does not provide a replacement for
|
|
* <code>QuickCheckResult</code> and <code>compare()</code>.
|
|
*
|
|
* <p><code>normalize</code> transforms Unicode text into an equivalent composed or
|
|
* decomposed form, allowing for easier sorting and searching of text.
|
|
* <code>normalize</code> supports the standard normalization forms described in
|
|
* <a href="https://www.unicode.org/reports/tr15/" target="unicode">
|
|
* Unicode Standard Annex #15 — Unicode Normalization Forms</a>.
|
|
*
|
|
* <p>Characters with accents or other adornments can be encoded in
|
|
* several different ways in Unicode. For example, take the character A-acute.
|
|
* In Unicode, this can be encoded as a single character (the
|
|
* "composed" form):
|
|
*
|
|
* <pre>
|
|
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE
|
|
* </pre>
|
|
*
|
|
* or as two separate characters (the "decomposed" form):
|
|
*
|
|
* <pre>
|
|
* 0041 LATIN CAPITAL LETTER A
|
|
* 0301 COMBINING ACUTE ACCENT
|
|
* </pre>
|
|
*
|
|
* <p>To a user of your program, however, both of these sequences should be
|
|
* treated as the same "user-level" character "A with acute accent". When you
|
|
* are searching or comparing text, you must ensure that these two sequences are
|
|
* treated equivalently. In addition, you must handle characters with more than
|
|
* one accent. Sometimes the order of a character's combining accents is
|
|
* significant, while in other cases accent sequences in different orders are
|
|
* really equivalent.
|
|
*
|
|
* <p>Similarly, the string "ffi" can be encoded as three separate letters:
|
|
*
|
|
* <pre>
|
|
* 0066 LATIN SMALL LETTER F
|
|
* 0066 LATIN SMALL LETTER F
|
|
* 0069 LATIN SMALL LETTER I
|
|
* </pre>
|
|
*
|
|
* or as the single character
|
|
*
|
|
* <pre>
|
|
* FB03 LATIN SMALL LIGATURE FFI
|
|
* </pre>
|
|
*
|
|
* <p>The ffi ligature is not a distinct semantic character, and strictly speaking
|
|
* it shouldn't be in Unicode at all, but it was included for compatibility
|
|
* with existing character sets that already provided it. The Unicode standard
|
|
* identifies such characters by giving them "compatibility" decompositions
|
|
* into the corresponding semantic characters. When sorting and searching, you
|
|
* will often want to use these mappings.
|
|
*
|
|
* <p><code>normalize</code> helps solve these problems by transforming text into
|
|
* the canonical composed and decomposed forms as shown in the first example
|
|
* above. In addition, you can have it perform compatibility decompositions so
|
|
* that you can treat compatibility characters the same as their equivalents.
|
|
* Finally, <code>normalize</code> rearranges accents into the proper canonical
|
|
* order, so that you do not have to worry about accent rearrangement on your
|
|
* own.
|
|
*
|
|
* <p>Form FCD, "Fast C or D", is also designed for collation.
|
|
* It allows working on strings that are not necessarily normalized
|
|
* with an algorithm (like in collation) that works under "canonical closure",
|
|
* i.e., it treats precomposed characters and their decomposed equivalents the
|
|
* same.
|
|
*
|
|
* <p>It is not a normalization form because it does not provide for uniqueness of
|
|
* representation. Multiple strings may be canonically equivalent (their NFDs
|
|
* are identical) and may all conform to FCD without being identical themselves.
|
|
*
|
|
* <p>The form is defined such that the "raw decomposition", the recursive
|
|
* canonical decomposition of each character, results in a string that is
|
|
* canonically ordered. This means that precomposed characters are allowed for
|
|
* as long as their decompositions do not need canonical reordering.
|
|
*
|
|
* <p>Its advantage for a process like collation is that all NFD and most NFC texts
|
|
* - and many unnormalized texts - already conform to FCD and do not need to be
|
|
* normalized (NFD) for such a process. The FCD quick check will return YES for
|
|
* most strings in practice.
|
|
*
|
|
* <p>normalize(FCD) may be implemented with NFD.
|
|
*
|
|
* <p>For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
|
|
* http://www.unicode.org/notes/tn5/#FCD
|
|
*
|
|
* <p>ICU collation performs either NFD or FCD normalization automatically if
|
|
* normalization is turned on for the collator object. Beyond collation and
|
|
* string search, normalized strings may be useful for string equivalence
|
|
* comparisons, transliteration/transcription, unique representations, etc.
|
|
*
|
|
* <p>The W3C generally recommends exchanging texts in NFC.
|
|
* Note also that most legacy character encodings use only precomposed forms and
|
|
* often do not encode any combining marks by themselves. For conversion to such
|
|
* character encodings the Unicode text needs to be normalized to NFC.
|
|
* For more usage examples, see the Unicode Standard Annex.
|
|
*
|
|
* <p>Note: The Normalizer class also provides API for iterative normalization.
|
|
* While the setIndex() and getIndex() refer to indices in the
|
|
* underlying Unicode input text, the next() and previous() methods
|
|
* iterate through characters in the normalized output.
|
|
* This means that there is not necessarily a one-to-one correspondence
|
|
* between characters returned by next() and previous() and the indices
|
|
* passed to and returned from setIndex() and getIndex().
|
|
* It is for this reason that Normalizer does not implement the CharacterIterator interface.
|
|
*/
|
|
public final class Normalizer implements Cloneable {
|
|
// The input text and our position in it
|
|
private UCharacterIterator text;
|
|
private Normalizer2 norm2;
|
|
private Mode mode;
|
|
private int options;
|
|
|
|
// The normalization buffer is the result of normalization
|
|
// of the source in [currentIndex..nextIndex[ .
|
|
private int currentIndex;
|
|
private int nextIndex;
|
|
|
|
// A buffer for holding intermediate results
|
|
private StringBuilder buffer;
|
|
private int bufferPos;
|
|
|
|
// Helper classes to defer loading of normalization data.
|
|
private static final class ModeImpl {
|
|
private ModeImpl(Normalizer2 n2) {
|
|
normalizer2 = n2;
|
|
}
|
|
private final Normalizer2 normalizer2;
|
|
}
|
|
private static final class NFDModeImpl {
|
|
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
|
|
}
|
|
private static final class NFKDModeImpl {
|
|
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
|
|
}
|
|
private static final class NFCModeImpl {
|
|
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
|
|
}
|
|
private static final class NFKCModeImpl {
|
|
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
|
|
}
|
|
private static final class FCDModeImpl {
|
|
private static final ModeImpl INSTANCE = new ModeImpl(Norm2AllModes.getFCDNormalizer2());
|
|
}
|
|
|
|
private static final class Unicode32 {
|
|
private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
|
|
}
|
|
private static final class NFD32ModeImpl {
|
|
private static final ModeImpl INSTANCE =
|
|
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(),
|
|
Unicode32.INSTANCE));
|
|
}
|
|
private static final class NFKD32ModeImpl {
|
|
private static final ModeImpl INSTANCE =
|
|
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(),
|
|
Unicode32.INSTANCE));
|
|
}
|
|
private static final class NFC32ModeImpl {
|
|
private static final ModeImpl INSTANCE =
|
|
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(),
|
|
Unicode32.INSTANCE));
|
|
}
|
|
private static final class NFKC32ModeImpl {
|
|
private static final ModeImpl INSTANCE =
|
|
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(),
|
|
Unicode32.INSTANCE));
|
|
}
|
|
private static final class FCD32ModeImpl {
|
|
private static final ModeImpl INSTANCE =
|
|
new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(),
|
|
Unicode32.INSTANCE));
|
|
}
|
|
|
|
/**
|
|
* Options bit set value to select Unicode 3.2 normalization
|
|
* (except NormalizationCorrections).
|
|
* At most one Unicode version can be selected at a time.
|
|
*
|
|
* @deprecated ICU 56 Use {@link FilteredNormalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final int UNICODE_3_2=0x20;
|
|
|
|
/**
|
|
* Constant indicating that the end of the iteration has been reached.
|
|
* This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
|
|
*
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final int DONE = UCharacterIterator.DONE;
|
|
|
|
/**
|
|
* Constants for normalization modes.
|
|
* <p>
|
|
* The Mode class is not intended for public subclassing.
|
|
* Only the Mode constants provided by the Normalizer class should be used,
|
|
* and any fields or methods should not be called or overridden by users.
|
|
*
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide Only a subset of ICU is exposed in Android
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static abstract class Mode {
|
|
/**
|
|
* Sole constructor
|
|
* @deprecated This API is ICU internal only.
|
|
* @hide original deprecated declaration
|
|
* @hide draft / provisional / internal are hidden on Android
|
|
*/
|
|
@Deprecated
|
|
protected Mode() {
|
|
}
|
|
|
|
/**
|
|
* @deprecated This API is ICU internal only.
|
|
* @hide original deprecated declaration
|
|
* @hide draft / provisional / internal are hidden on Android
|
|
*/
|
|
@Deprecated
|
|
protected abstract Normalizer2 getNormalizer2(int options);
|
|
}
|
|
|
|
private static final class NONEMode extends Mode {
|
|
@Override
|
|
protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
|
|
}
|
|
private static final class NFDMode extends Mode {
|
|
@Override
|
|
protected Normalizer2 getNormalizer2(int options) {
|
|
return (options&UNICODE_3_2) != 0 ?
|
|
NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
|
|
}
|
|
}
|
|
private static final class NFKDMode extends Mode {
|
|
@Override
|
|
protected Normalizer2 getNormalizer2(int options) {
|
|
return (options&UNICODE_3_2) != 0 ?
|
|
NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2;
|
|
}
|
|
}
|
|
private static final class NFCMode extends Mode {
|
|
@Override
|
|
protected Normalizer2 getNormalizer2(int options) {
|
|
return (options&UNICODE_3_2) != 0 ?
|
|
NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
|
|
}
|
|
}
|
|
private static final class NFKCMode extends Mode {
|
|
@Override
|
|
protected Normalizer2 getNormalizer2(int options) {
|
|
return (options&UNICODE_3_2) != 0 ?
|
|
NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2;
|
|
}
|
|
}
|
|
private static final class FCDMode extends Mode {
|
|
@Override
|
|
protected Normalizer2 getNormalizer2(int options) {
|
|
return (options&UNICODE_3_2) != 0 ?
|
|
FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* No decomposition/composition.
|
|
*
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final Mode NONE = new NONEMode();
|
|
|
|
/**
|
|
* Canonical decomposition.
|
|
*
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final Mode NFD = new NFDMode();
|
|
|
|
/**
|
|
* Compatibility decomposition.
|
|
*
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final Mode NFKD = new NFKDMode();
|
|
|
|
/**
|
|
* Canonical decomposition followed by canonical composition.
|
|
*
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final Mode NFC = new NFCMode();
|
|
|
|
/**
|
|
* Default normalization.
|
|
*
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final Mode DEFAULT = NFC;
|
|
|
|
/**
|
|
* Compatibility decomposition followed by canonical composition.
|
|
*
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final Mode NFKC =new NFKCMode();
|
|
|
|
/**
|
|
* "Fast C or D" form.
|
|
*
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final Mode FCD = new FCDMode();
|
|
|
|
/**
|
|
* Null operation for use with the {@link android.icu.text.Normalizer constructors}
|
|
* and the static {@link #normalize normalize} method. This value tells
|
|
* the <tt>Normalizer</tt> to do nothing but return unprocessed characters
|
|
* from the underlying String or CharacterIterator. If you have code which
|
|
* requires raw text at some times and normalized text at others, you can
|
|
* use <tt>NO_OP</tt> for the cases where you want raw text, rather
|
|
* than having a separate code path that bypasses <tt>Normalizer</tt>
|
|
* altogether.
|
|
* <p>
|
|
* @see #setMode
|
|
* @deprecated ICU 2.8. Use Normalizer.NONE
|
|
* @see #NONE
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final Mode NO_OP = NONE;
|
|
|
|
/**
|
|
* Canonical decomposition followed by canonical composition. Used with the
|
|
* {@link android.icu.text.Normalizer constructors} and the static
|
|
* {@link #normalize normalize} method to determine the operation to be
|
|
* performed.
|
|
* <p>
|
|
* If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
|
|
* off, this operation produces output that is in
|
|
* <a href="https://www.unicode.org/reports/tr15/">Unicode Canonical
|
|
* Form</a>
|
|
* <b>C</b>.
|
|
* <p>
|
|
* @see #setMode
|
|
* @deprecated ICU 2.8. Use Normalizer.NFC
|
|
* @see #NFC
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final Mode COMPOSE = NFC;
|
|
|
|
/**
|
|
* Compatibility decomposition followed by canonical composition.
|
|
* Used with the {@link android.icu.text.Normalizer constructors} and the static
|
|
* {@link #normalize normalize} method to determine the operation to be
|
|
* performed.
|
|
* <p>
|
|
* If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
|
|
* off, this operation produces output that is in
|
|
* <a href="https://www.unicode.org/reports/tr15/">Unicode Canonical
|
|
* Form</a>
|
|
* <b>KC</b>.
|
|
* <p>
|
|
* @see #setMode
|
|
* @deprecated ICU 2.8. Use Normalizer.NFKC
|
|
* @see #NFKC
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final Mode COMPOSE_COMPAT = NFKC;
|
|
|
|
/**
|
|
* Canonical decomposition. This value is passed to the
|
|
* {@link android.icu.text.Normalizer constructors} and the static
|
|
* {@link #normalize normalize}
|
|
* method to determine the operation to be performed.
|
|
* <p>
|
|
* If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
|
|
* off, this operation produces output that is in
|
|
* <a href="https://www.unicode.org/reports/tr15/">Unicode Canonical
|
|
* Form</a>
|
|
* <b>D</b>.
|
|
* <p>
|
|
* @see #setMode
|
|
* @deprecated ICU 2.8. Use Normalizer.NFD
|
|
* @see #NFD
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final Mode DECOMP = NFD;
|
|
|
|
/**
|
|
* Compatibility decomposition. This value is passed to the
|
|
* {@link android.icu.text.Normalizer constructors} and the static
|
|
* {@link #normalize normalize}
|
|
* method to determine the operation to be performed.
|
|
* <p>
|
|
* If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
|
|
* off, this operation produces output that is in
|
|
* <a href="https://www.unicode.org/reports/tr15/">Unicode Canonical
|
|
* Form</a>
|
|
* <b>KD</b>.
|
|
* <p>
|
|
* @see #setMode
|
|
* @deprecated ICU 2.8. Use Normalizer.NFKD
|
|
* @see #NFKD
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final Mode DECOMP_COMPAT = NFKD;
|
|
|
|
/**
|
|
* Option to disable Hangul/Jamo composition and decomposition.
|
|
* This option applies to Korean text,
|
|
* which can be represented either in the Jamo alphabet or in Hangul
|
|
* characters, which are really just two or three Jamo combined
|
|
* into one visual glyph. Since Jamo takes up more storage space than
|
|
* Hangul, applications that process only Hangul text may wish to turn
|
|
* this option on when decomposing text.
|
|
* <p>
|
|
* The Unicode standard treats Hangul to Jamo conversion as a
|
|
* canonical decomposition, so this option must be turned <b>off</b> if you
|
|
* wish to transform strings into one of the standard
|
|
* <a href="https://www.unicode.org/reports/tr15/" target="unicode">
|
|
* Unicode Normalization Forms</a>.
|
|
* <p>
|
|
* @see #setOption
|
|
* @deprecated ICU 2.8. This option is no longer supported.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final int IGNORE_HANGUL = 0x0001;
|
|
|
|
/**
|
|
* Result values for quickCheck().
|
|
* For details see Unicode Technical Report 15.
|
|
*/
|
|
public static final class QuickCheckResult{
|
|
//private int resultValue;
|
|
private QuickCheckResult(int value) {
|
|
//resultValue=value;
|
|
}
|
|
}
|
|
/**
|
|
* Indicates that string is not in the normalized format
|
|
*/
|
|
public static final QuickCheckResult NO = new QuickCheckResult(0);
|
|
|
|
/**
|
|
* Indicates that string is in the normalized format
|
|
*/
|
|
public static final QuickCheckResult YES = new QuickCheckResult(1);
|
|
|
|
/**
|
|
* Indicates it cannot be determined if string is in the normalized
|
|
* format without further thorough checks.
|
|
*/
|
|
public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
|
|
|
|
/**
|
|
* Option bit for compare:
|
|
* Case sensitively compare the strings
|
|
*/
|
|
public static final int FOLD_CASE_DEFAULT = UCharacter.FOLD_CASE_DEFAULT;
|
|
|
|
/**
|
|
* Option bit for compare:
|
|
* Both input strings are assumed to fulfill FCD conditions.
|
|
*/
|
|
public static final int INPUT_IS_FCD = 0x20000;
|
|
|
|
/**
|
|
* Option bit for compare:
|
|
* Perform case-insensitive comparison.
|
|
*/
|
|
public static final int COMPARE_IGNORE_CASE = 0x10000;
|
|
|
|
/**
|
|
* Option bit for compare:
|
|
* Compare strings in code point order instead of code unit order.
|
|
*/
|
|
public static final int COMPARE_CODE_POINT_ORDER = 0x8000;
|
|
|
|
/**
|
|
* Option value for case folding:
|
|
* Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
|
|
* and dotless i appropriately for Turkic languages (tr, az).
|
|
* @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
|
|
*/
|
|
public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I;
|
|
|
|
/**
|
|
* Lowest-order bit number of compare() options bits corresponding to
|
|
* normalization options bits.
|
|
*
|
|
* The options parameter for compare() uses most bits for
|
|
* itself and for various comparison and folding flags.
|
|
* The most significant bits, however, are shifted down and passed on
|
|
* to the normalization implementation.
|
|
* (That is, from compare(..., options, ...),
|
|
* options>>COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
|
|
* internal normalization functions.)
|
|
*
|
|
* @see #compare
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static final int COMPARE_NORM_OPTIONS_SHIFT = 20;
|
|
|
|
//-------------------------------------------------------------------------
|
|
// Iterator constructors
|
|
//-------------------------------------------------------------------------
|
|
|
|
/**
 * Constructs a {@code Normalizer} that iterates over the normalized form
 * of a string.
 * <p>
 * The {@code opt} argument selects which optional {@code Normalizer}
 * features are enabled for the new object.
 *
 * @param str  The string to be normalized. The normalization
 *             will start at the beginning of the string.
 * @param mode The normalization mode.
 * @param opt  Any optional features to be enabled.
 *             Currently the only available option is {@link #UNICODE_3_2}.
 *             Pass 0 for the default behavior corresponding to one of the
 *             standard Unicode Normalization Forms.
 * @deprecated ICU 56 Use {@link Normalizer2} instead.
 * @hide original deprecated declaration
 */
@Deprecated
public Normalizer(String str, Mode mode, int opt) {
    text = UCharacterIterator.getInstance(str);
    norm2 = mode.getNormalizer2(opt);
    this.mode = mode;
    options = opt;
    // Starts out empty.
    buffer = new StringBuilder();
}
|
|
|
|
/**
 * Constructs a {@code Normalizer} that iterates over the normalized form
 * of the text supplied by a {@link CharacterIterator}.
 * <p>
 * The iterator is cloned; the clone, not the argument, is wrapped and used.
 *
 * @param iter The input text to be normalized. The normalization
 *             will start at the beginning of the string.
 * @param mode The normalization mode.
 * @param opt  Any optional features to be enabled.
 *             Currently the only available option is {@link #UNICODE_3_2}.
 *             Pass 0 for the default behavior corresponding to one of the
 *             standard Unicode Normalization Forms.
 * @deprecated ICU 56 Use {@link Normalizer2} instead.
 * @hide original deprecated declaration
 */
@Deprecated
public Normalizer(CharacterIterator iter, Mode mode, int opt) {
    CharacterIterator privateCopy = (CharacterIterator) iter.clone();
    text = UCharacterIterator.getInstance(privateCopy);
    norm2 = mode.getNormalizer2(opt);
    this.mode = mode;
    options = opt;
    // Starts out empty.
    buffer = new StringBuilder();
}
|
|
|
|
/**
|
|
* Creates a new <tt>Normalizer</tt> object for iterating over the
|
|
* normalized form of the given text.
|
|
* <p>
|
|
* @param iter The input text to be normalized. The normalization
|
|
* will start at the beginning of the string.
|
|
*
|
|
* @param mode The normalization mode.
|
|
* @param options The normalization options, ORed together (0 for no options).
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public Normalizer(UCharacterIterator iter, Mode mode, int options) {
|
|
try {
|
|
this.text = (UCharacterIterator)iter.clone();
|
|
this.mode = mode;
|
|
this.options = options;
|
|
norm2 = mode.getNormalizer2(options);
|
|
buffer = new StringBuilder();
|
|
} catch (CloneNotSupportedException e) {
|
|
throw new ICUCloneNotSupportedException(e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Clones this <tt>Normalizer</tt> object. All properties of this
|
|
* object are duplicated in the new object, including the cloning of any
|
|
* {@link CharacterIterator} that was passed in to the constructor
|
|
* or to {@link #setText(CharacterIterator) setText}.
|
|
* However, the text storage underlying
|
|
* the <tt>CharacterIterator</tt> is not duplicated unless the
|
|
* iterator's <tt>clone</tt> method does so.
|
|
*
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
@Override
|
|
public Object clone() {
|
|
try {
|
|
Normalizer copy = (Normalizer) super.clone();
|
|
copy.text = (UCharacterIterator) text.clone();
|
|
copy.mode = mode;
|
|
copy.options = options;
|
|
copy.norm2 = norm2;
|
|
copy.buffer = new StringBuilder(buffer);
|
|
copy.bufferPos = bufferPos;
|
|
copy.currentIndex = currentIndex;
|
|
copy.nextIndex = nextIndex;
|
|
return copy;
|
|
}
|
|
catch (CloneNotSupportedException e) {
|
|
throw new ICUCloneNotSupportedException(e);
|
|
}
|
|
}
|
|
|
|
//--------------------------------------------------------------------------
|
|
// Static Utility methods
|
|
//--------------------------------------------------------------------------
|
|
|
|
private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) {
|
|
return (compat ? NFKC : NFC).getNormalizer2(options);
|
|
}
|
|
private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) {
|
|
return (compat ? NFKD : NFD).getNormalizer2(options);
|
|
}
|
|
|
|
/**
|
|
* Compose a string.
|
|
* The string will be composed to according to the specified mode.
|
|
* @param str The string to compose.
|
|
* @param compat If true the string will be composed according to
|
|
* NFKC rules and if false will be composed according to
|
|
* NFC rules.
|
|
* @return String The composed string
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static String compose(String str, boolean compat) {
|
|
return compose(str,compat,0);
|
|
}
|
|
|
|
/**
|
|
* Compose a string.
|
|
* The string will be composed to according to the specified mode.
|
|
* @param str The string to compose.
|
|
* @param compat If true the string will be composed according to
|
|
* NFKC rules and if false will be composed according to
|
|
* NFC rules.
|
|
* @param options The only recognized option is UNICODE_3_2
|
|
* @return String The composed string
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static String compose(String str, boolean compat, int options) {
|
|
return getComposeNormalizer2(compat, options).normalize(str);
|
|
}
|
|
|
|
/**
|
|
* Compose a string.
|
|
* The string will be composed to according to the specified mode.
|
|
* @param source The char array to compose.
|
|
* @param target A char buffer to receive the normalized text.
|
|
* @param compat If true the char array will be composed according to
|
|
* NFKC rules and if false will be composed according to
|
|
* NFC rules.
|
|
* @param options The normalization options, ORed together (0 for no options).
|
|
* @return int The total buffer size needed;if greater than length of
|
|
* result, the output was truncated.
|
|
* @exception IndexOutOfBoundsException if target.length is less than the
|
|
* required length
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static int compose(char[] source,char[] target, boolean compat, int options) {
|
|
return compose(source, 0, source.length, target, 0, target.length, compat, options);
|
|
}
|
|
|
|
/**
|
|
* Compose a string.
|
|
* The string will be composed to according to the specified mode.
|
|
* @param src The char array to compose.
|
|
* @param srcStart Start index of the source
|
|
* @param srcLimit Limit index of the source
|
|
* @param dest The char buffer to fill in
|
|
* @param destStart Start index of the destination buffer
|
|
* @param destLimit End index of the destination buffer
|
|
* @param compat If true the char array will be composed according to
|
|
* NFKC rules and if false will be composed according to
|
|
* NFC rules.
|
|
* @param options The normalization options, ORed together (0 for no options).
|
|
* @return int The total buffer size needed;if greater than length of
|
|
* result, the output was truncated.
|
|
* @exception IndexOutOfBoundsException if target.length is less than the
|
|
* required length
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static int compose(char[] src,int srcStart, int srcLimit,
|
|
char[] dest,int destStart, int destLimit,
|
|
boolean compat, int options) {
|
|
CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
|
|
CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
|
|
getComposeNormalizer2(compat, options).normalize(srcBuffer, app);
|
|
return app.length();
|
|
}
|
|
|
|
/**
|
|
* Decompose a string.
|
|
* The string will be decomposed to according to the specified mode.
|
|
* @param str The string to decompose.
|
|
* @param compat If true the string will be decomposed according to NFKD
|
|
* rules and if false will be decomposed according to NFD
|
|
* rules.
|
|
* @return String The decomposed string
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static String decompose(String str, boolean compat) {
|
|
return decompose(str,compat,0);
|
|
}
|
|
|
|
/**
|
|
* Decompose a string.
|
|
* The string will be decomposed to according to the specified mode.
|
|
* @param str The string to decompose.
|
|
* @param compat If true the string will be decomposed according to NFKD
|
|
* rules and if false will be decomposed according to NFD
|
|
* rules.
|
|
* @param options The normalization options, ORed together (0 for no options).
|
|
* @return String The decomposed string
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static String decompose(String str, boolean compat, int options) {
|
|
return getDecomposeNormalizer2(compat, options).normalize(str);
|
|
}
|
|
|
|
/**
|
|
* Decompose a string.
|
|
* The string will be decomposed to according to the specified mode.
|
|
* @param source The char array to decompose.
|
|
* @param target A char buffer to receive the normalized text.
|
|
* @param compat If true the char array will be decomposed according to NFKD
|
|
* rules and if false will be decomposed according to
|
|
* NFD rules.
|
|
* @return int The total buffer size needed;if greater than length of
|
|
* result,the output was truncated.
|
|
* @param options The normalization options, ORed together (0 for no options).
|
|
* @exception IndexOutOfBoundsException if the target capacity is less than
|
|
* the required length
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static int decompose(char[] source,char[] target, boolean compat, int options) {
|
|
return decompose(source, 0, source.length, target, 0, target.length, compat, options);
|
|
}
|
|
|
|
/**
|
|
* Decompose a string.
|
|
* The string will be decomposed to according to the specified mode.
|
|
* @param src The char array to compose.
|
|
* @param srcStart Start index of the source
|
|
* @param srcLimit Limit index of the source
|
|
* @param dest The char buffer to fill in
|
|
* @param destStart Start index of the destination buffer
|
|
* @param destLimit End index of the destination buffer
|
|
* @param compat If true the char array will be decomposed according to NFKD
|
|
* rules and if false will be decomposed according to
|
|
* NFD rules.
|
|
* @param options The normalization options, ORed together (0 for no options).
|
|
* @return int The total buffer size needed;if greater than length of
|
|
* result,the output was truncated.
|
|
* @exception IndexOutOfBoundsException if the target capacity is less than
|
|
* the required length
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static int decompose(char[] src,int srcStart, int srcLimit,
|
|
char[] dest,int destStart, int destLimit,
|
|
boolean compat, int options) {
|
|
CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
|
|
CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
|
|
getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app);
|
|
return app.length();
|
|
}
|
|
|
|
/**
|
|
* Normalizes a <tt>String</tt> using the given normalization operation.
|
|
* <p>
|
|
* The <tt>options</tt> parameter specifies which optional
|
|
* <tt>Normalizer</tt> features are to be enabled for this operation.
|
|
* Currently the only available option is {@link #UNICODE_3_2}.
|
|
* If you want the default behavior corresponding to one of the standard
|
|
* Unicode Normalization Forms, use 0 for this argument.
|
|
* <p>
|
|
* @param str the input string to be normalized.
|
|
* @param mode the normalization mode
|
|
* @param options the optional features to be enabled.
|
|
* @return String the normalized string
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static String normalize(String str, Mode mode, int options) {
|
|
return mode.getNormalizer2(options).normalize(str);
|
|
}
|
|
|
|
/**
|
|
* Normalize a string.
|
|
* The string will be normalized according to the specified normalization
|
|
* mode and options.
|
|
* @param src The string to normalize.
|
|
* @param mode The normalization mode; one of Normalizer.NONE,
|
|
* Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
|
|
* Normalizer.NFKD, Normalizer.DEFAULT
|
|
* @return the normalized string
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static String normalize(String src,Mode mode) {
|
|
return normalize(src, mode, 0);
|
|
}
|
|
/**
|
|
* Normalize a string.
|
|
* The string will be normalized according to the specified normalization
|
|
* mode and options.
|
|
* @param source The char array to normalize.
|
|
* @param target A char buffer to receive the normalized text.
|
|
* @param mode The normalization mode; one of Normalizer.NONE,
|
|
* Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
|
|
* Normalizer.NFKD, Normalizer.DEFAULT
|
|
* @param options The normalization options, ORed together (0 for no options).
|
|
* @return int The total buffer size needed;if greater than length of
|
|
* result, the output was truncated.
|
|
* @exception IndexOutOfBoundsException if the target capacity is less
|
|
* than the required length
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static int normalize(char[] source,char[] target, Mode mode, int options) {
|
|
return normalize(source,0,source.length,target,0,target.length,mode, options);
|
|
}
|
|
|
|
/**
|
|
* Normalize a string.
|
|
* The string will be normalized according to the specified normalization
|
|
* mode and options.
|
|
* @param src The char array to compose.
|
|
* @param srcStart Start index of the source
|
|
* @param srcLimit Limit index of the source
|
|
* @param dest The char buffer to fill in
|
|
* @param destStart Start index of the destination buffer
|
|
* @param destLimit End index of the destination buffer
|
|
* @param mode The normalization mode; one of Normalizer.NONE,
|
|
* Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
|
|
* Normalizer.NFKD, Normalizer.DEFAULT
|
|
* @param options The normalization options, ORed together (0 for no options).
|
|
* @return int The total buffer size needed;if greater than length of
|
|
* result, the output was truncated.
|
|
* @exception IndexOutOfBoundsException if the target capacity is
|
|
* less than the required length
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static int normalize(char[] src,int srcStart, int srcLimit,
|
|
char[] dest,int destStart, int destLimit,
|
|
Mode mode, int options) {
|
|
CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
|
|
CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
|
|
mode.getNormalizer2(options).normalize(srcBuffer, app);
|
|
return app.length();
|
|
}
|
|
|
|
/**
|
|
* Normalize a codepoint according to the given mode
|
|
* @param char32 The input string to be normalized.
|
|
* @param mode The normalization mode
|
|
* @param options Options for use with exclusion set and tailored Normalization
|
|
* The only option that is currently recognized is UNICODE_3_2
|
|
* @return String The normalized string
|
|
* @see #UNICODE_3_2
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static String normalize(int char32, Mode mode, int options) {
|
|
if(mode == NFD && options == 0) {
|
|
String decomposition = Normalizer2.getNFCInstance().getDecomposition(char32);
|
|
if(decomposition == null) {
|
|
decomposition = UTF16.valueOf(char32);
|
|
}
|
|
return decomposition;
|
|
}
|
|
return normalize(UTF16.valueOf(char32), mode, options);
|
|
}
|
|
|
|
/**
|
|
* Convenience method to normalize a codepoint according to the given mode
|
|
* @param char32 The input string to be normalized.
|
|
* @param mode The normalization mode
|
|
* @return String The normalized string
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static String normalize(int char32, Mode mode) {
|
|
return normalize(char32, mode, 0);
|
|
}
|
|
|
|
/**
|
|
* Convenience method.
|
|
*
|
|
* @param source string for determining if it is in a normalized format
|
|
* @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
|
|
* Normalizer.NFKC,Normalizer.NFKD)
|
|
* @return Return code to specify if the text is normalized or not
|
|
* (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static QuickCheckResult quickCheck(String source, Mode mode) {
|
|
return quickCheck(source, mode, 0);
|
|
}
|
|
|
|
/**
|
|
* Performing quick check on a string, to quickly determine if the string is
|
|
* in a particular normalization format.
|
|
* Three types of result can be returned Normalizer.YES, Normalizer.NO or
|
|
* Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
|
|
* string is in the desired normalized format, Normalizer.NO determines that
|
|
* argument string is not in the desired normalized format. A
|
|
* Normalizer.MAYBE result indicates that a more thorough check is required,
|
|
* the user may have to put the string in its normalized form and compare
|
|
* the results.
|
|
*
|
|
* @param source string for determining if it is in a normalized format
|
|
* @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
|
|
* Normalizer.NFKC,Normalizer.NFKD)
|
|
* @param options Options for use with exclusion set and tailored Normalization
|
|
* The only option that is currently recognized is UNICODE_3_2
|
|
* @return Return code to specify if the text is normalized or not
|
|
* (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static QuickCheckResult quickCheck(String source, Mode mode, int options) {
|
|
return mode.getNormalizer2(options).quickCheck(source);
|
|
}
|
|
|
|
/**
|
|
* Convenience method.
|
|
*
|
|
* @param source Array of characters for determining if it is in a
|
|
* normalized format
|
|
* @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
|
|
* Normalizer.NFKC,Normalizer.NFKD)
|
|
* @param options Options for use with exclusion set and tailored Normalization
|
|
* The only option that is currently recognized is UNICODE_3_2
|
|
* @return Return code to specify if the text is normalized or not
|
|
* (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) {
|
|
return quickCheck(source, 0, source.length, mode, options);
|
|
}
|
|
|
|
/**
|
|
* Performing quick check on a string, to quickly determine if the string is
|
|
* in a particular normalization format.
|
|
* Three types of result can be returned Normalizer.YES, Normalizer.NO or
|
|
* Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
|
|
* string is in the desired normalized format, Normalizer.NO determines that
|
|
* argument string is not in the desired normalized format. A
|
|
* Normalizer.MAYBE result indicates that a more thorough check is required,
|
|
* the user may have to put the string in its normalized form and compare
|
|
* the results.
|
|
*
|
|
* @param source string for determining if it is in a normalized format
|
|
* @param start the start index of the source
|
|
* @param limit the limit index of the source it is equal to the length
|
|
* @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
|
|
* Normalizer.NFKC,Normalizer.NFKD)
|
|
* @param options Options for use with exclusion set and tailored Normalization
|
|
* The only option that is currently recognized is UNICODE_3_2
|
|
* @return Return code to specify if the text is normalized or not
|
|
* (Normalizer.YES, Normalizer.NO or
|
|
* Normalizer.MAYBE)
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static QuickCheckResult quickCheck(char[] source,int start,
|
|
int limit, Mode mode,int options) {
|
|
CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start);
|
|
return mode.getNormalizer2(options).quickCheck(srcBuffer);
|
|
}
|
|
|
|
/**
|
|
* Test if a string is in a given normalization form.
|
|
* This is semantically equivalent to source.equals(normalize(source, mode)).
|
|
*
|
|
* Unlike quickCheck(), this function returns a definitive result,
|
|
* never a "maybe".
|
|
* For NFD, NFKD, and FCD, both functions work exactly the same.
|
|
* For NFC and NFKC where quickCheck may return "maybe", this function will
|
|
* perform further tests to arrive at a true/false result.
|
|
* @param src The input array of characters to be checked to see if
|
|
* it is normalized
|
|
* @param start The strart index in the source
|
|
* @param limit The limit index in the source
|
|
* @param mode the normalization mode
|
|
* @param options Options for use with exclusion set and tailored Normalization
|
|
* The only option that is currently recognized is UNICODE_3_2
|
|
* @return Boolean value indicating whether the source string is in the
|
|
* "mode" normalization form
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static boolean isNormalized(char[] src,int start,
|
|
int limit, Mode mode,
|
|
int options) {
|
|
CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start);
|
|
return mode.getNormalizer2(options).isNormalized(srcBuffer);
|
|
}
|
|
|
|
/**
|
|
* Test if a string is in a given normalization form.
|
|
* This is semantically equivalent to source.equals(normalize(source, mode)).
|
|
*
|
|
* Unlike quickCheck(), this function returns a definitive result,
|
|
* never a "maybe".
|
|
* For NFD, NFKD, and FCD, both functions work exactly the same.
|
|
* For NFC and NFKC where quickCheck may return "maybe", this function will
|
|
* perform further tests to arrive at a true/false result.
|
|
* @param str the input string to be checked to see if it is
|
|
* normalized
|
|
* @param mode the normalization mode
|
|
* @param options Options for use with exclusion set and tailored Normalization
|
|
* The only option that is currently recognized is UNICODE_3_2
|
|
* @see #isNormalized
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static boolean isNormalized(String str, Mode mode, int options) {
|
|
return mode.getNormalizer2(options).isNormalized(str);
|
|
}
|
|
|
|
/**
|
|
* Convenience Method
|
|
* @param char32 the input code point to be checked to see if it is
|
|
* normalized
|
|
* @param mode the normalization mode
|
|
* @param options Options for use with exclusion set and tailored Normalization
|
|
* The only option that is currently recognized is UNICODE_3_2
|
|
*
|
|
* @see #isNormalized
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static boolean isNormalized(int char32, Mode mode,int options) {
|
|
return isNormalized(UTF16.valueOf(char32), mode, options);
|
|
}
|
|
|
|
/**
|
|
* Compare two strings for canonical equivalence.
|
|
* Further options include case-insensitive comparison and
|
|
* code point order (as opposed to code unit order).
|
|
*
|
|
* Canonical equivalence between two strings is defined as their normalized
|
|
* forms (NFD or NFC) being identical.
|
|
* This function compares strings incrementally instead of normalizing
|
|
* (and optionally case-folding) both strings entirely,
|
|
* improving performance significantly.
|
|
*
|
|
* Bulk normalization is only necessary if the strings do not fulfill the
|
|
* FCD conditions. Only in this case, and only if the strings are relatively
|
|
* long, is memory allocated temporarily.
|
|
* For FCD strings and short non-FCD strings there is no memory allocation.
|
|
*
|
|
* Semantically, this is equivalent to
|
|
* strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
|
|
* where code point order and foldCase are all optional.
|
|
*
|
|
* @param s1 First source character array.
|
|
* @param s1Start start index of source
|
|
* @param s1Limit limit of the source
|
|
*
|
|
* @param s2 Second source character array.
|
|
* @param s2Start start index of the source
|
|
* @param s2Limit limit of the source
|
|
*
|
|
* @param options A bit set of options:
|
|
* - FOLD_CASE_DEFAULT or 0 is used for default options:
|
|
* Case-sensitive comparison in code unit order, and the input strings
|
|
* are quick-checked for FCD.
|
|
*
|
|
* - INPUT_IS_FCD
|
|
* Set if the caller knows that both s1 and s2 fulfill the FCD
|
|
* conditions.If not set, the function will quickCheck for FCD
|
|
* and normalize if necessary.
|
|
*
|
|
* - COMPARE_CODE_POINT_ORDER
|
|
* Set to choose code point order instead of code unit order
|
|
*
|
|
* - COMPARE_IGNORE_CASE
|
|
* Set to compare strings case-insensitively using case folding,
|
|
* instead of case-sensitively.
|
|
* If set, then the following case folding options are used.
|
|
*
|
|
*
|
|
* @return <0 or 0 or >0 as usual for string comparisons
|
|
*/
|
|
public static int compare(char[] s1, int s1Start, int s1Limit,
|
|
char[] s2, int s2Start, int s2Limit,
|
|
int options) {
|
|
if( s1==null || s1Start<0 || s1Limit<0 ||
|
|
s2==null || s2Start<0 || s2Limit<0 ||
|
|
s1Limit<s1Start || s2Limit<s2Start
|
|
) {
|
|
throw new IllegalArgumentException();
|
|
}
|
|
return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start),
|
|
CharBuffer.wrap(s2, s2Start, s2Limit-s2Start),
|
|
options);
|
|
}
|
|
|
|
/**
|
|
* Compare two strings for canonical equivalence.
|
|
* Further options include case-insensitive comparison and
|
|
* code point order (as opposed to code unit order).
|
|
*
|
|
* Canonical equivalence between two strings is defined as their normalized
|
|
* forms (NFD or NFC) being identical.
|
|
* This function compares strings incrementally instead of normalizing
|
|
* (and optionally case-folding) both strings entirely,
|
|
* improving performance significantly.
|
|
*
|
|
* Bulk normalization is only necessary if the strings do not fulfill the
|
|
* FCD conditions. Only in this case, and only if the strings are relatively
|
|
* long, is memory allocated temporarily.
|
|
* For FCD strings and short non-FCD strings there is no memory allocation.
|
|
*
|
|
* Semantically, this is equivalent to
|
|
* strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
|
|
* where code point order and foldCase are all optional.
|
|
*
|
|
* @param s1 First source string.
|
|
* @param s2 Second source string.
|
|
*
|
|
* @param options A bit set of options:
|
|
* - FOLD_CASE_DEFAULT or 0 is used for default options:
|
|
* Case-sensitive comparison in code unit order, and the input strings
|
|
* are quick-checked for FCD.
|
|
*
|
|
* - INPUT_IS_FCD
|
|
* Set if the caller knows that both s1 and s2 fulfill the FCD
|
|
* conditions. If not set, the function will quickCheck for FCD
|
|
* and normalize if necessary.
|
|
*
|
|
* - COMPARE_CODE_POINT_ORDER
|
|
* Set to choose code point order instead of code unit order
|
|
*
|
|
* - COMPARE_IGNORE_CASE
|
|
* Set to compare strings case-insensitively using case folding,
|
|
* instead of case-sensitively.
|
|
* If set, then the following case folding options are used.
|
|
*
|
|
* @return <0 or 0 or >0 as usual for string comparisons
|
|
*/
|
|
public static int compare(String s1, String s2, int options) {
|
|
return internalCompare(s1, s2, options);
|
|
}
|
|
|
|
/**
|
|
* Compare two strings for canonical equivalence.
|
|
* Further options include case-insensitive comparison and
|
|
* code point order (as opposed to code unit order).
|
|
* Convenience method.
|
|
*
|
|
* @param s1 First source string.
|
|
* @param s2 Second source string.
|
|
*
|
|
* @param options A bit set of options:
|
|
* - FOLD_CASE_DEFAULT or 0 is used for default options:
|
|
* Case-sensitive comparison in code unit order, and the input strings
|
|
* are quick-checked for FCD.
|
|
*
|
|
* - INPUT_IS_FCD
|
|
* Set if the caller knows that both s1 and s2 fulfill the FCD
|
|
* conditions. If not set, the function will quickCheck for FCD
|
|
* and normalize if necessary.
|
|
*
|
|
* - COMPARE_CODE_POINT_ORDER
|
|
* Set to choose code point order instead of code unit order
|
|
*
|
|
* - COMPARE_IGNORE_CASE
|
|
* Set to compare strings case-insensitively using case folding,
|
|
* instead of case-sensitively.
|
|
* If set, then the following case folding options are used.
|
|
*
|
|
* @return <0 or 0 or >0 as usual for string comparisons
|
|
*/
|
|
public static int compare(char[] s1, char[] s2, int options) {
|
|
return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options);
|
|
}
|
|
|
|
/**
|
|
* Convenience method that can have faster implementation
|
|
* by not allocating buffers.
|
|
* @param char32a the first code point to be checked against the
|
|
* @param char32b the second code point
|
|
* @param options A bit set of options
|
|
*/
|
|
public static int compare(int char32a, int char32b, int options) {
|
|
return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD);
|
|
}
|
|
|
|
/**
|
|
* Convenience method that can have faster implementation
|
|
* by not allocating buffers.
|
|
* @param char32a the first code point to be checked against
|
|
* @param str2 the second string
|
|
* @param options A bit set of options
|
|
*/
|
|
public static int compare(int char32a, String str2, int options) {
|
|
return internalCompare(UTF16.valueOf(char32a), str2, options);
|
|
}
|
|
|
|
/* Concatenation of normalized strings --------------------------------- */
|
|
/**
|
|
* Concatenate normalized strings, making sure that the result is normalized
|
|
* as well.
|
|
*
|
|
* If both the left and the right strings are in
|
|
* the normalization form according to "mode",
|
|
* then the result will be
|
|
*
|
|
* <code>
|
|
* dest=normalize(left+right, mode)
|
|
* </code>
|
|
*
|
|
* With the input strings already being normalized,
|
|
* this function will use next() and previous()
|
|
* to find the adjacent end pieces of the input strings.
|
|
* Only the concatenation of these end pieces will be normalized and
|
|
* then concatenated with the remaining parts of the input strings.
|
|
*
|
|
* It is allowed to have dest==left to avoid copying the entire left string.
|
|
*
|
|
* @param left Left source array, may be same as dest.
|
|
* @param leftStart start in the left array.
|
|
* @param leftLimit limit in the left array (==length)
|
|
* @param right Right source array.
|
|
* @param rightStart start in the right array.
|
|
* @param rightLimit limit in the right array (==length)
|
|
* @param dest The output buffer; can be null if destStart==destLimit==0
|
|
* for pure preflighting.
|
|
* @param destStart start in the destination array
|
|
* @param destLimit limit in the destination array (==length)
|
|
* @param mode The normalization mode.
|
|
* @param options The normalization options, ORed together (0 for no options).
|
|
* @return Length of output (number of chars) when successful or
|
|
* IndexOutOfBoundsException
|
|
* @exception IndexOutOfBoundsException whose message has the string
|
|
* representation of destination capacity required.
|
|
* @see #normalize
|
|
* @see #next
|
|
* @see #previous
|
|
* @exception IndexOutOfBoundsException if target capacity is less than the
|
|
* required length
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static int concatenate(char[] left, int leftStart, int leftLimit,
|
|
char[] right, int rightStart, int rightLimit,
|
|
char[] dest, int destStart, int destLimit,
|
|
Normalizer.Mode mode, int options) {
|
|
if(dest == null) {
|
|
throw new IllegalArgumentException();
|
|
}
|
|
|
|
/* check for overlapping right and destination */
|
|
if (right == dest && rightStart < destLimit && destStart < rightLimit) {
|
|
throw new IllegalArgumentException("overlapping right and dst ranges");
|
|
}
|
|
|
|
/* allow left==dest */
|
|
StringBuilder destBuilder=new StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16);
|
|
destBuilder.append(left, leftStart, leftLimit-leftStart);
|
|
CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart);
|
|
mode.getNormalizer2(options).append(destBuilder, rightBuffer);
|
|
int destLength=destBuilder.length();
|
|
if(destLength<=(destLimit-destStart)) {
|
|
destBuilder.getChars(0, destLength, dest, destStart);
|
|
return destLength;
|
|
} else {
|
|
throw new IndexOutOfBoundsException(Integer.toString(destLength));
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Concatenate normalized strings, making sure that the result is normalized
|
|
* as well.
|
|
*
|
|
* If both the left and the right strings are in
|
|
* the normalization form according to "mode",
|
|
* then the result will be
|
|
*
|
|
* <code>
|
|
* dest=normalize(left+right, mode)
|
|
* </code>
|
|
*
|
|
* For details see concatenate
|
|
*
|
|
* @param left Left source string.
|
|
* @param right Right source string.
|
|
* @param mode The normalization mode.
|
|
* @param options The normalization options, ORed together (0 for no options).
|
|
* @return result
|
|
*
|
|
* @see #concatenate
|
|
* @see #normalize
|
|
* @see #next
|
|
* @see #previous
|
|
* @see #concatenate
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static String concatenate(char[] left, char[] right,Mode mode, int options) {
|
|
StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left);
|
|
return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString();
|
|
}
|
|
|
|
/**
|
|
* Concatenate normalized strings, making sure that the result is normalized
|
|
* as well.
|
|
*
|
|
* If both the left and the right strings are in
|
|
* the normalization form according to "mode",
|
|
* then the result will be
|
|
*
|
|
* <code>
|
|
* dest=normalize(left+right, mode)
|
|
* </code>
|
|
*
|
|
* With the input strings already being normalized,
|
|
* this function will use next() and previous()
|
|
* to find the adjacent end pieces of the input strings.
|
|
* Only the concatenation of these end pieces will be normalized and
|
|
* then concatenated with the remaining parts of the input strings.
|
|
*
|
|
* @param left Left source string.
|
|
* @param right Right source string.
|
|
* @param mode The normalization mode.
|
|
* @param options The normalization options, ORed together (0 for no options).
|
|
* @return result
|
|
*
|
|
* @see #concatenate
|
|
* @see #normalize
|
|
* @see #next
|
|
* @see #previous
|
|
* @see #concatenate
|
|
* @deprecated ICU 56 Use {@link Normalizer2} instead.
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static String concatenate(String left, String right, Mode mode, int options) {
|
|
StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left);
|
|
return mode.getNormalizer2(options).append(dest, right).toString();
|
|
}
|
|
|
|
/**
|
|
* Gets the FC_NFKC closure value.
|
|
* @param c The code point whose closure value is to be retrieved
|
|
* @param dest The char array to receive the closure value
|
|
* @return the length of the closure value; 0 if there is none
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static int getFC_NFKC_Closure(int c,char[] dest) {
|
|
String closure=getFC_NFKC_Closure(c);
|
|
int length=closure.length();
|
|
if(length!=0 && dest!=null && length<=dest.length) {
|
|
closure.getChars(0, length, dest, 0);
|
|
}
|
|
return length;
|
|
}
|
|
/**
|
|
* Gets the FC_NFKC closure value.
|
|
* @param c The code point whose closure value is to be retrieved
|
|
* @return String representation of the closure value; "" if there is none
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public static String getFC_NFKC_Closure(int c) {
|
|
// Compute the FC_NFKC_Closure on the fly:
|
|
// We have the API for complete coverage of Unicode properties, although
|
|
// this value by itself is not useful via API.
|
|
// (What could be useful is a custom normalization table that combines
|
|
// case folding and NFKC.)
|
|
// For the derivation, see Unicode's DerivedNormalizationProps.txt.
|
|
Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2;
|
|
UCaseProps csp=UCaseProps.INSTANCE;
|
|
// first: b = NFKC(Fold(a))
|
|
StringBuilder folded=new StringBuilder();
|
|
int folded1Length=csp.toFullFolding(c, folded, 0);
|
|
if(folded1Length<0) {
|
|
Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl;
|
|
if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) {
|
|
return ""; // c does not change at all under CaseFolding+NFKC
|
|
}
|
|
folded.appendCodePoint(c);
|
|
} else {
|
|
if(folded1Length>UCaseProps.MAX_STRING_LENGTH) {
|
|
folded.appendCodePoint(folded1Length);
|
|
}
|
|
}
|
|
String kc1=nfkc.normalize(folded);
|
|
// second: c = NFKC(Fold(b))
|
|
String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0));
|
|
// if (c != b) add the mapping from a to c
|
|
if(kc1.equals(kc2)) {
|
|
return "";
|
|
} else {
|
|
return kc2;
|
|
}
|
|
}
|
|
|
|
//-------------------------------------------------------------------------
|
|
// Iteration API
|
|
//-------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Return the current character in the normalized text.
|
|
* @return The codepoint as an int
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public int current() {
|
|
if(bufferPos<buffer.length() || nextNormalize()) {
|
|
return buffer.codePointAt(bufferPos);
|
|
} else {
|
|
return DONE;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Return the next character in the normalized text and advance
|
|
* the iteration position by one. If the end
|
|
* of the text has already been reached, {@link #DONE} is returned.
|
|
* @return The codepoint as an int
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public int next() {
|
|
if(bufferPos<buffer.length() || nextNormalize()) {
|
|
int c=buffer.codePointAt(bufferPos);
|
|
bufferPos+=Character.charCount(c);
|
|
return c;
|
|
} else {
|
|
return DONE;
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* Return the previous character in the normalized text and decrement
|
|
* the iteration position by one. If the beginning
|
|
* of the text has already been reached, {@link #DONE} is returned.
|
|
* @return The codepoint as an int
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public int previous() {
|
|
if(bufferPos>0 || previousNormalize()) {
|
|
int c=buffer.codePointBefore(bufferPos);
|
|
bufferPos-=Character.charCount(c);
|
|
return c;
|
|
} else {
|
|
return DONE;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Reset the index to the beginning of the text.
|
|
* This is equivalent to setIndexOnly(startIndex)).
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public void reset() {
|
|
text.setToStart();
|
|
currentIndex=nextIndex=0;
|
|
clearBuffer();
|
|
}
|
|
|
|
/**
|
|
* Set the iteration position in the input text that is being normalized,
|
|
* without any immediate normalization.
|
|
* After setIndexOnly(), getIndex() will return the same index that is
|
|
* specified here.
|
|
*
|
|
* @param index the desired index in the input text.
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public void setIndexOnly(int index) {
|
|
text.setIndex(index); // validates index
|
|
currentIndex=nextIndex=index;
|
|
clearBuffer();
|
|
}
|
|
|
|
/**
|
|
* Set the iteration position in the input text that is being normalized
|
|
* and return the first normalized character at that position.
|
|
* <p>
|
|
* <b>Note:</b> This method sets the position in the <em>input</em> text,
|
|
* while {@link #next} and {@link #previous} iterate through characters
|
|
* in the normalized <em>output</em>. This means that there is not
|
|
* necessarily a one-to-one correspondence between characters returned
|
|
* by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
|
|
* returned from <tt>setIndex</tt> and {@link #getIndex}.
|
|
* <p>
|
|
* @param index the desired index in the input text.
|
|
*
|
|
* @return the first normalized character that is the result of iterating
|
|
* forward starting at the given index.
|
|
*
|
|
* @throws IllegalArgumentException if the given index is less than
|
|
* {@link #getBeginIndex} or greater than {@link #getEndIndex}.
|
|
* @deprecated ICU 3.2
|
|
* @obsolete ICU 3.2
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
///CLOVER:OFF
|
|
public int setIndex(int index) {
|
|
setIndexOnly(index);
|
|
return current();
|
|
}
|
|
///CLOVER:ON
|
|
/**
|
|
* Retrieve the index of the start of the input text. This is the begin
|
|
* index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
|
|
* <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
|
|
* @deprecated ICU 2.2. Use startIndex() instead.
|
|
* @return The codepoint as an int
|
|
* @see #startIndex
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public int getBeginIndex() {
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* Retrieve the index of the end of the input text. This is the end index
|
|
* of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
|
|
* over which this <tt>Normalizer</tt> is iterating
|
|
* @deprecated ICU 2.2. Use endIndex() instead.
|
|
* @return The codepoint as an int
|
|
* @see #endIndex
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public int getEndIndex() {
|
|
return endIndex();
|
|
}
|
|
/**
|
|
* Return the first character in the normalized text. This resets
|
|
* the <tt>Normalizer's</tt> position to the beginning of the text.
|
|
* @return The codepoint as an int
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public int first() {
|
|
reset();
|
|
return next();
|
|
}
|
|
|
|
/**
|
|
* Return the last character in the normalized text. This resets
|
|
* the <tt>Normalizer's</tt> position to be just before the
|
|
* the input text corresponding to that normalized character.
|
|
* @return The codepoint as an int
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public int last() {
|
|
text.setToLimit();
|
|
currentIndex=nextIndex=text.getIndex();
|
|
clearBuffer();
|
|
return previous();
|
|
}
|
|
|
|
/**
|
|
* Retrieve the current iteration position in the input text that is
|
|
* being normalized. This method is useful in applications such as
|
|
* searching, where you need to be able to determine the position in
|
|
* the input text that corresponds to a given normalized output character.
|
|
* <p>
|
|
* <b>Note:</b> This method sets the position in the <em>input</em>, while
|
|
* {@link #next} and {@link #previous} iterate through characters in the
|
|
* <em>output</em>. This means that there is not necessarily a one-to-one
|
|
* correspondence between characters returned by <tt>next</tt> and
|
|
* <tt>previous</tt> and the indices passed to and returned from
|
|
* <tt>setIndex</tt> and {@link #getIndex}.
|
|
* @return The current iteration position
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public int getIndex() {
|
|
if(bufferPos<buffer.length()) {
|
|
return currentIndex;
|
|
} else {
|
|
return nextIndex;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Retrieve the index of the start of the input text. This is the begin
|
|
* index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
|
|
* <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
|
|
* @return The current iteration position
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public int startIndex() {
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* Retrieve the index of the end of the input text. This is the end index
|
|
* of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
|
|
* over which this <tt>Normalizer</tt> is iterating
|
|
* @return The current iteration position
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public int endIndex() {
|
|
return text.getLength();
|
|
}
|
|
|
|
//-------------------------------------------------------------------------
|
|
// Iterator attributes
|
|
//-------------------------------------------------------------------------
|
|
/**
|
|
* Set the normalization mode for this object.
|
|
* <p>
|
|
* <b>Note:</b>If the normalization mode is changed while iterating
|
|
* over a string, calls to {@link #next} and {@link #previous} may
|
|
* return previously buffers characters in the old normalization mode
|
|
* until the iteration is able to re-sync at the next base character.
|
|
* It is safest to call {@link #setText setText()}, {@link #first},
|
|
* {@link #last}, etc. after calling <tt>setMode</tt>.
|
|
* <p>
|
|
* @param newMode the new mode for this <tt>Normalizer</tt>.
|
|
* The supported modes are:
|
|
* <ul>
|
|
* <li>{@link #NFC} - Unicode canonical decompositiion
|
|
* followed by canonical composition.
|
|
* <li>{@link #NFKC} - Unicode compatibility decompositiion
|
|
* followed by canonical composition.
|
|
* <li>{@link #NFD} - Unicode canonical decomposition
|
|
* <li>{@link #NFKD} - Unicode compatibility decomposition.
|
|
* <li>{@link #NONE} - Do nothing but return characters
|
|
* from the underlying input text.
|
|
* </ul>
|
|
*
|
|
* @see #getMode
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public void setMode(Mode newMode) {
|
|
mode = newMode;
|
|
norm2 = mode.getNormalizer2(options);
|
|
}
|
|
/**
|
|
* Return the basic operation performed by this <tt>Normalizer</tt>
|
|
*
|
|
* @see #setMode
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public Mode getMode() {
|
|
return mode;
|
|
}
|
|
/**
|
|
* Set options that affect this <tt>Normalizer</tt>'s operation.
|
|
* Options do not change the basic composition or decomposition operation
|
|
* that is being performed , but they control whether
|
|
* certain optional portions of the operation are done.
|
|
* Currently the only available option is:
|
|
*
|
|
* <ul>
|
|
* <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2.
|
|
* </ul>
|
|
*
|
|
* @param option the option whose value is to be set.
|
|
* @param value the new setting for the option. Use <tt>true</tt> to
|
|
* turn the option on and <tt>false</tt> to turn it off.
|
|
*
|
|
* @see #getOption
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public void setOption(int option,boolean value) {
|
|
if (value) {
|
|
options |= option;
|
|
} else {
|
|
options &= (~option);
|
|
}
|
|
norm2 = mode.getNormalizer2(options);
|
|
}
|
|
|
|
/**
|
|
* Determine whether an option is turned on or off.
|
|
* <p>
|
|
* @see #setOption
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public int getOption(int option) {
|
|
if((options & option)!=0) {
|
|
return 1 ;
|
|
} else {
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Gets the underlying text storage
|
|
* @param fillIn the char buffer to fill the UTF-16 units.
|
|
* The length of the buffer should be equal to the length of the
|
|
* underlying text storage
|
|
* @throws IndexOutOfBoundsException If the index passed for the array is invalid.
|
|
* @see #getLength
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public int getText(char[] fillIn) {
|
|
return text.getText(fillIn);
|
|
}
|
|
|
|
/**
|
|
* Gets the length of underlying text storage
|
|
* @return the length
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public int getLength() {
|
|
return text.getLength();
|
|
}
|
|
|
|
/**
|
|
* Returns the text under iteration as a string
|
|
* @return a copy of the text under iteration.
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public String getText() {
|
|
return text.getText();
|
|
}
|
|
|
|
/**
|
|
* Set the input text over which this <tt>Normalizer</tt> will iterate.
|
|
* The iteration position is set to the beginning of the input text.
|
|
* @param newText The new string to be normalized.
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public void setText(StringBuffer newText) {
|
|
UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
|
|
if (newIter == null) {
|
|
throw new IllegalStateException("Could not create a new UCharacterIterator");
|
|
}
|
|
text = newIter;
|
|
reset();
|
|
}
|
|
|
|
/**
|
|
* Set the input text over which this <tt>Normalizer</tt> will iterate.
|
|
* The iteration position is set to the beginning of the input text.
|
|
* @param newText The new string to be normalized.
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public void setText(char[] newText) {
|
|
UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
|
|
if (newIter == null) {
|
|
throw new IllegalStateException("Could not create a new UCharacterIterator");
|
|
}
|
|
text = newIter;
|
|
reset();
|
|
}
|
|
|
|
/**
|
|
* Set the input text over which this <tt>Normalizer</tt> will iterate.
|
|
* The iteration position is set to the beginning of the input text.
|
|
* @param newText The new string to be normalized.
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public void setText(String newText) {
|
|
UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
|
|
if (newIter == null) {
|
|
throw new IllegalStateException("Could not create a new UCharacterIterator");
|
|
}
|
|
text = newIter;
|
|
reset();
|
|
}
|
|
|
|
/**
|
|
* Set the input text over which this <tt>Normalizer</tt> will iterate.
|
|
* The iteration position is set to the beginning of the input text.
|
|
* @param newText The new string to be normalized.
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public void setText(CharacterIterator newText) {
|
|
UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
|
|
if (newIter == null) {
|
|
throw new IllegalStateException("Could not create a new UCharacterIterator");
|
|
}
|
|
text = newIter;
|
|
reset();
|
|
}
|
|
|
|
/**
|
|
* Set the input text over which this <tt>Normalizer</tt> will iterate.
|
|
* The iteration position is set to the beginning of the string.
|
|
* @param newText The new string to be normalized.
|
|
* @deprecated ICU 56
|
|
* @hide original deprecated declaration
|
|
*/
|
|
@Deprecated
|
|
public void setText(UCharacterIterator newText) {
|
|
try{
|
|
UCharacterIterator newIter = (UCharacterIterator)newText.clone();
|
|
if (newIter == null) {
|
|
throw new IllegalStateException("Could not create a new UCharacterIterator");
|
|
}
|
|
text = newIter;
|
|
reset();
|
|
}catch(CloneNotSupportedException e) {
|
|
throw new ICUCloneNotSupportedException("Could not clone the UCharacterIterator", e);
|
|
}
|
|
}
|
|
|
|
private void clearBuffer() {
|
|
buffer.setLength(0);
|
|
bufferPos=0;
|
|
}
|
|
|
|
private boolean nextNormalize() {
|
|
clearBuffer();
|
|
currentIndex=nextIndex;
|
|
text.setIndex(nextIndex);
|
|
// Skip at least one character so we make progress.
|
|
int c=text.nextCodePoint();
|
|
if(c<0) {
|
|
return false;
|
|
}
|
|
StringBuilder segment=new StringBuilder().appendCodePoint(c);
|
|
while((c=text.nextCodePoint())>=0) {
|
|
if(norm2.hasBoundaryBefore(c)) {
|
|
text.moveCodePointIndex(-1);
|
|
break;
|
|
}
|
|
segment.appendCodePoint(c);
|
|
}
|
|
nextIndex=text.getIndex();
|
|
norm2.normalize(segment, buffer);
|
|
return buffer.length()!=0;
|
|
}
|
|
|
|
private boolean previousNormalize() {
|
|
clearBuffer();
|
|
nextIndex=currentIndex;
|
|
text.setIndex(currentIndex);
|
|
StringBuilder segment=new StringBuilder();
|
|
int c;
|
|
while((c=text.previousCodePoint())>=0) {
|
|
if(c<=0xffff) {
|
|
segment.insert(0, (char)c);
|
|
} else {
|
|
segment.insert(0, Character.toChars(c));
|
|
}
|
|
if(norm2.hasBoundaryBefore(c)) {
|
|
break;
|
|
}
|
|
}
|
|
currentIndex=text.getIndex();
|
|
norm2.normalize(segment, buffer);
|
|
bufferPos=buffer.length();
|
|
return buffer.length()!=0;
|
|
}
|
|
|
|
/* compare canonically equivalent ------------------------------------------- */
|
|
|
|
// TODO: Broaden the public compare(String, String, options) API like this. Ticket #7407
|
|
private static int internalCompare(CharSequence s1, CharSequence s2, int options) {
|
|
int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT;
|
|
options|= COMPARE_EQUIV;
|
|
|
|
/*
|
|
* UAX #21 Case Mappings, as fixed for Unicode version 4
|
|
* (see Jitterbug 2021), defines a canonical caseless match as
|
|
*
|
|
* A string X is a canonical caseless match
|
|
* for a string Y if and only if
|
|
* NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
|
|
*
|
|
* For better performance, we check for FCD (or let the caller tell us that
|
|
* both strings are in FCD) for the inner normalization.
|
|
* BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
|
|
* case-folding preserves the FCD-ness of a string.
|
|
* The outer normalization is then only performed by NormalizerImpl.cmpEquivFold()
|
|
* when there is a difference.
|
|
*
|
|
* Exception: When using the Turkic case-folding option, we do perform
|
|
* full NFD first. This is because in the Turkic case precomposed characters
|
|
* with 0049 capital I or 0069 small i fold differently whether they
|
|
* are first decomposed or not, so an FCD check - a check only for
|
|
* canonical order - is not sufficient.
|
|
*/
|
|
if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
|
|
Normalizer2 n2;
|
|
if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
|
|
n2=NFD.getNormalizer2(normOptions);
|
|
} else {
|
|
n2=FCD.getNormalizer2(normOptions);
|
|
}
|
|
|
|
// check if s1 and/or s2 fulfill the FCD conditions
|
|
int spanQCYes1=n2.spanQuickCheckYes(s1);
|
|
int spanQCYes2=n2.spanQuickCheckYes(s2);
|
|
|
|
/*
|
|
* ICU 2.4 had a further optimization:
|
|
* If both strings were not in FCD, then they were both NFD'ed,
|
|
* and the COMPARE_EQUIV option was turned off.
|
|
* It is not entirely clear that this is valid with the current
|
|
* definition of the canonical caseless match.
|
|
* Therefore, ICU 2.6 removes that optimization.
|
|
*/
|
|
|
|
if(spanQCYes1<s1.length()) {
|
|
StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1);
|
|
s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length()));
|
|
}
|
|
if(spanQCYes2<s2.length()) {
|
|
StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2);
|
|
s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length()));
|
|
}
|
|
}
|
|
|
|
return cmpEquivFold(s1, s2, options);
|
|
}
|
|
|
|
/*
|
|
* Compare two strings for canonical equivalence.
|
|
* Further options include case-insensitive comparison and
|
|
* code point order (as opposed to code unit order).
|
|
*
|
|
* In this function, canonical equivalence is optional as well.
|
|
* If canonical equivalence is tested, then both strings must fulfill
|
|
* the FCD check.
|
|
*
|
|
* Semantically, this is equivalent to
|
|
* strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
|
|
* where code point order, NFD and foldCase are all optional.
|
|
*
|
|
* String comparisons almost always yield results before processing both strings
|
|
* completely.
|
|
* They are generally more efficient working incrementally instead of
|
|
* performing the sub-processing (strlen, normalization, case-folding)
|
|
* on the entire strings first.
|
|
*
|
|
* It is also unnecessary to normalize identical characters.
|
|
*
|
|
* This function works in principle as follows:
|
|
*
|
|
* loop {
|
|
* get one code unit c1 from s1 (-1 if end of source)
|
|
* get one code unit c2 from s2 (-1 if end of source)
|
|
*
|
|
* if(either string finished) {
|
|
* return result;
|
|
* }
|
|
* if(c1==c2) {
|
|
* continue;
|
|
* }
|
|
*
|
|
* // c1!=c2
|
|
* try to decompose/case-fold c1/c2, and continue if one does;
|
|
*
|
|
* // still c1!=c2 and neither decomposes/case-folds, return result
|
|
* return c1-c2;
|
|
* }
|
|
*
|
|
* When a character decomposes, then the pointer for that source changes to
|
|
* the decomposition, pushing the previous pointer onto a stack.
|
|
* When the end of the decomposition is reached, then the code unit reader
|
|
* pops the previous source from the stack.
|
|
* (Same for case-folding.)
|
|
*
|
|
* This is complicated further by operating on variable-width UTF-16.
|
|
* The top part of the loop works on code units, while lookups for decomposition
|
|
* and case-folding need code points.
|
|
* Code points are assembled after the equality/end-of-source part.
|
|
* The source pointer is only advanced beyond all code units when the code point
|
|
* actually decomposes/case-folds.
|
|
*
|
|
* If we were on a trail surrogate unit when assembling a code point,
|
|
* and the code point decomposes/case-folds, then the decomposition/folding
|
|
* result must be compared with the part of the other string that corresponds to
|
|
* this string's lead surrogate.
|
|
* Since we only assemble a code point when hitting a trail unit when the
|
|
* preceding lead units were identical, we back up the other string by one unit
|
|
* in such a case.
|
|
*
|
|
* The optional code point order comparison at the end works with
|
|
* the same fix-up as the other code point order comparison functions.
|
|
* See ustring.c and the comment near the end of this function.
|
|
*
|
|
* Assumption: A decomposition or case-folding result string never contains
|
|
* a single surrogate. This is a safe assumption in the Unicode Standard.
|
|
* Therefore, we do not need to check for surrogate pairs across
|
|
* decomposition/case-folding boundaries.
|
|
*
|
|
* Further assumptions (see the verifications in tstnorm.cpp):
|
|
* The API function checks for FCD first, while the core function
|
|
* first case-folds and then decomposes. This requires that case-folding does not
|
|
* un-FCD any strings.
|
|
*
|
|
* The API function may also NFD the input and turn off decomposition.
|
|
* This requires that case-folding does not un-NFD strings either.
|
|
*
|
|
* TODO If any of the above two assumptions is violated,
|
|
* then this entire code must be re-thought.
|
|
* If this happens, then a simple solution is to case-fold both strings up front
|
|
* and to turn off UNORM_INPUT_IS_FCD.
|
|
* We already do this when not both strings are in FCD because makeFCD
|
|
* would be a partial NFD before the case folding, which does not work.
|
|
* Note that all of this is only a problem when case-folding _and_
|
|
* canonical equivalence come together.
|
|
* (Comments in unorm_compare() are more up to date than this TODO.)
|
|
*/
|
|
|
|
/* stack element for previous-level source/decomposition pointers */
|
|
private static final class CmpEquivLevel {
|
|
CharSequence cs;
|
|
int s;
|
|
};
|
|
private static final CmpEquivLevel[] createCmpEquivLevelStack() {
|
|
return new CmpEquivLevel[] {
|
|
new CmpEquivLevel(), new CmpEquivLevel()
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Internal option for unorm_cmpEquivFold() for decomposing.
|
|
* If not set, just do strcasecmp().
|
|
*/
|
|
private static final int COMPARE_EQUIV=0x80000;
|
|
|
|
/* internal function; package visibility for use by UTF16.StringComparator */
/*
 * Compares cs1 and cs2 code unit by code unit. Only where the two inputs
 * differ is the affected code point case-folded (COMPARE_IGNORE_CASE) and/or
 * decomposed (COMPARE_EQUIV); the replacement text is then read in place of
 * the original code point, with the previous read position saved on a
 * small per-string stack.
 *
 * Returns 0 if both inputs end together without an unresolved difference,
 * -1 if cs1 ends first, 1 if cs2 ends first, and otherwise the difference
 * c1-c2 of the first code units that differ and cannot be folded/decomposed
 * further (adjusted for code point order if COMPARE_CODE_POINT_ORDER is set).
 */
|
|
/*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) {
|
|
Normalizer2Impl nfcImpl;
|
|
UCaseProps csp;
|
|
|
|
/* current-level start/limit - s1/s2 as current */
|
|
int s1, s2, limit1, limit2;
|
|
|
|
/* decomposition and case folding variables */
|
|
int length;
|
|
|
|
/* stacks of previous-level start/current/limit */
|
|
CmpEquivLevel[] stack1=null, stack2=null;
|
|
|
|
/* buffers for algorithmic decompositions */
|
|
String decomp1, decomp2;
|
|
|
|
/* case folding buffers, only use current-level start/limit */
|
|
StringBuilder fold1, fold2;
|
|
|
|
/* track which is the current level per string */
|
|
int level1, level2;
|
|
|
|
/* current code units, and code points for lookups */
|
|
int c1, c2, cp1, cp2;
|
|
|
|
/* no argument error checking because this itself is not an API */
|
|
|
|
/*
|
|
* assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
|
|
* otherwise this function must behave exactly as uprv_strCompare()
|
|
* not checking for that here makes testing this function easier
|
|
*/
|
|
|
|
/* normalization/properties data loaded? */
|
|
if((options&COMPARE_EQUIV)!=0) {
|
|
nfcImpl=Norm2AllModes.getNFCInstance().impl;
|
|
} else {
|
|
nfcImpl=null;
|
|
}
|
|
if((options&COMPARE_IGNORE_CASE)!=0) {
|
|
csp=UCaseProps.INSTANCE;
|
|
fold1=new StringBuilder();
|
|
fold2=new StringBuilder();
|
|
} else {
|
|
csp=null;
|
|
fold1=fold2=null;
|
|
}
|
|
|
|
/* initialize */
|
|
s1=0;
|
|
limit1=cs1.length();
|
|
s2=0;
|
|
limit2=cs2.length();
|
|
|
|
level1=level2=0;
|
|
c1=c2=-1;
|
|
|
|
/* comparison loop */
|
|
for(;;) {
|
|
/*
|
|
* here a code unit value of -1 means "get another code unit"
|
|
* below it will mean "this source is finished"
|
|
*/
|
|
|
|
if(c1<0) {
|
|
/* get next code unit from string 1, post-increment */
|
|
for(;;) {
|
|
if(s1==limit1) {
|
|
if(level1==0) {
|
|
c1=-1;
|
|
break;
|
|
}
|
|
} else {
|
|
c1=cs1.charAt(s1++);
|
|
break;
|
|
}
|
|
|
|
/* reached end of level buffer, pop one level */
/* levels whose cs is null are the empty intermediate placeholders set when a decomposition is pushed; skip them */
|
|
do {
|
|
--level1;
|
|
cs1=stack1[level1].cs;
|
|
} while(cs1==null);
|
|
s1=stack1[level1].s;
|
|
limit1=cs1.length();
|
|
}
|
|
}
|
|
|
|
if(c2<0) {
|
|
/* get next code unit from string 2, post-increment */
|
|
for(;;) {
|
|
if(s2==limit2) {
|
|
if(level2==0) {
|
|
c2=-1;
|
|
break;
|
|
}
|
|
} else {
|
|
c2=cs2.charAt(s2++);
|
|
break;
|
|
}
|
|
|
|
/* reached end of level buffer, pop one level */
/* levels whose cs is null are the empty intermediate placeholders set when a decomposition is pushed; skip them */
|
|
do {
|
|
--level2;
|
|
cs2=stack2[level2].cs;
|
|
} while(cs2==null);
|
|
s2=stack2[level2].s;
|
|
limit2=cs2.length();
|
|
}
|
|
}
|
|
|
|
/*
|
|
* compare c1 and c2
|
|
* either variable c1, c2 is -1 only if the corresponding string is finished
|
|
*/
|
|
if(c1==c2) {
|
|
if(c1<0) {
|
|
return 0; /* c1==c2==-1 indicating end of strings */
|
|
}
|
|
c1=c2=-1; /* make us fetch new code units */
|
|
continue;
|
|
} else if(c1<0) {
|
|
return -1; /* string 1 ends before string 2 */
|
|
} else if(c2<0) {
|
|
return 1; /* string 2 ends before string 1 */
|
|
}
|
|
/* c1!=c2 && c1>=0 && c2>=0 */
|
|
|
|
/* get complete code points for c1, c2 for lookups if either is a surrogate */
|
|
cp1=c1;
|
|
if(UTF16.isSurrogate(c1)) {
|
|
char c;
|
|
|
|
if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
|
|
if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) {
|
|
/* advance ++s1; only below if cp1 decomposes/case-folds */
|
|
cp1=Character.toCodePoint((char)c1, c);
|
|
}
|
|
} else /* isTrail(c1) */ {
/* s1 was already post-incremented past c1, so the unit preceding c1 is at s1-2 */
|
|
if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) {
|
|
cp1=Character.toCodePoint(c, (char)c1);
|
|
}
|
|
}
|
|
}
|
|
|
|
cp2=c2;
|
|
if(UTF16.isSurrogate(c2)) {
|
|
char c;
|
|
|
|
if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
|
|
if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) {
|
|
/* advance ++s2; only below if cp2 decomposes/case-folds */
|
|
cp2=Character.toCodePoint((char)c2, c);
|
|
}
|
|
} else /* isTrail(c2) */ {
/* s2 was already post-incremented past c2, so the unit preceding c2 is at s2-2 */
|
|
if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) {
|
|
cp2=Character.toCodePoint(c, (char)c2);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* go down one level for each string
|
|
* continue with the main loop as soon as there is a real change
|
|
*/
|
|
|
|
/* case folding is attempted only at level 0, i.e. while reading the original cs1 */
if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
|
|
(length=csp.toFullFolding(cp1, fold1, options))>=0
|
|
) {
|
|
/* cp1 case-folds to the code point "length" or to p[length] */
|
|
if(UTF16.isSurrogate(c1)) {
|
|
if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
|
|
/* advance beyond source surrogate pair if it case-folds */
|
|
++s1;
|
|
} else /* isTrail(c1) */ {
|
|
/*
|
|
* we got a supplementary code point when hitting its trail surrogate,
|
|
* therefore the lead surrogate must have been the same as in the other string;
|
|
* compare this decomposition with the lead surrogate in the other string
|
|
* remember that this simulates bulk text replacement:
|
|
* the decomposition would replace the entire code point
|
|
*/
|
|
--s2;
|
|
c2=cs2.charAt(s2-1);
|
|
}
|
|
}
|
|
|
|
/* push current level pointers */
|
|
if(stack1==null) {
|
|
stack1=createCmpEquivLevelStack();
|
|
}
|
|
stack1[0].cs=cs1;
|
|
stack1[0].s=s1;
|
|
++level1;
|
|
|
|
/* copy the folding result to fold1[] */
|
|
/* Java: the buffer was probably not empty, remove the old contents */
|
|
/* NOTE(review): presumably toFullFolding() appended "length" chars to fold1 in the string case, so only that tail is kept - confirm against UCaseProps */
if(length<=UCaseProps.MAX_STRING_LENGTH) {
|
|
fold1.delete(0, fold1.length()-length);
|
|
} else {
|
|
fold1.setLength(0);
|
|
fold1.appendCodePoint(length);
|
|
}
|
|
|
|
/* set next level pointers to case folding */
|
|
cs1=fold1;
|
|
s1=0;
|
|
limit1=fold1.length();
|
|
|
|
/* get ready to read from decomposition, continue with loop */
|
|
c1=-1;
|
|
continue;
|
|
}
|
|
|
|
/* case folding is attempted only at level 0, i.e. while reading the original cs2 */
if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
|
|
(length=csp.toFullFolding(cp2, fold2, options))>=0
|
|
) {
|
|
/* cp2 case-folds to the code point "length" or to p[length] */
|
|
if(UTF16.isSurrogate(c2)) {
|
|
if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
|
|
/* advance beyond source surrogate pair if it case-folds */
|
|
++s2;
|
|
} else /* isTrail(c2) */ {
|
|
/*
|
|
* we got a supplementary code point when hitting its trail surrogate,
|
|
* therefore the lead surrogate must have been the same as in the other string;
|
|
* compare this decomposition with the lead surrogate in the other string
|
|
* remember that this simulates bulk text replacement:
|
|
* the decomposition would replace the entire code point
|
|
*/
|
|
--s1;
|
|
c1=cs1.charAt(s1-1);
|
|
}
|
|
}
|
|
|
|
/* push current level pointers */
|
|
if(stack2==null) {
|
|
stack2=createCmpEquivLevelStack();
|
|
}
|
|
stack2[0].cs=cs2;
|
|
stack2[0].s=s2;
|
|
++level2;
|
|
|
|
/* copy the folding result to fold2[] */
|
|
/* Java: the buffer was probably not empty, remove the old contents */
|
|
/* NOTE(review): presumably toFullFolding() appended "length" chars to fold2 in the string case, so only that tail is kept - confirm against UCaseProps */
if(length<=UCaseProps.MAX_STRING_LENGTH) {
|
|
fold2.delete(0, fold2.length()-length);
|
|
} else {
|
|
fold2.setLength(0);
|
|
fold2.appendCodePoint(length);
|
|
}
|
|
|
|
/* set next level pointers to case folding */
|
|
cs2=fold2;
|
|
s2=0;
|
|
limit2=fold2.length();
|
|
|
|
/* get ready to read from decomposition, continue with loop */
|
|
c2=-1;
|
|
continue;
|
|
}
|
|
|
|
/* decomposition is attempted at level 0 or 1 only; after it the level is always 2 */
if( level1<2 && (options&COMPARE_EQUIV)!=0 &&
|
|
(decomp1=nfcImpl.getDecomposition(cp1))!=null
|
|
) {
|
|
/* cp1 decomposes into p[length] */
|
|
if(UTF16.isSurrogate(c1)) {
|
|
if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
|
|
/* advance beyond source surrogate pair if it decomposes */
|
|
++s1;
|
|
} else /* isTrail(c1) */ {
|
|
/*
|
|
* we got a supplementary code point when hitting its trail surrogate,
|
|
* therefore the lead surrogate must have been the same as in the other string;
|
|
* compare this decomposition with the lead surrogate in the other string
|
|
* remember that this simulates bulk text replacement:
|
|
* the decomposition would replace the entire code point
|
|
*/
|
|
--s2;
|
|
c2=cs2.charAt(s2-1);
|
|
}
|
|
}
|
|
|
|
/* push current level pointers */
|
|
if(stack1==null) {
|
|
stack1=createCmpEquivLevelStack();
|
|
}
|
|
stack1[level1].cs=cs1;
|
|
stack1[level1].s=s1;
|
|
++level1;
|
|
|
|
/* set empty intermediate level if skipped */
|
|
if(level1<2) {
|
|
stack1[level1++].cs=null;
|
|
}
|
|
|
|
/* set next level pointers to decomposition */
|
|
cs1=decomp1;
|
|
s1=0;
|
|
limit1=decomp1.length();
|
|
|
|
/* get ready to read from decomposition, continue with loop */
|
|
c1=-1;
|
|
continue;
|
|
}
|
|
|
|
/* decomposition is attempted at level 0 or 1 only; after it the level is always 2 */
if( level2<2 && (options&COMPARE_EQUIV)!=0 &&
|
|
(decomp2=nfcImpl.getDecomposition(cp2))!=null
|
|
) {
|
|
/* cp2 decomposes into p[length] */
|
|
if(UTF16.isSurrogate(c2)) {
|
|
if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
|
|
/* advance beyond source surrogate pair if it decomposes */
|
|
++s2;
|
|
} else /* isTrail(c2) */ {
|
|
/*
|
|
* we got a supplementary code point when hitting its trail surrogate,
|
|
* therefore the lead surrogate must have been the same as in the other string;
|
|
* compare this decomposition with the lead surrogate in the other string
|
|
* remember that this simulates bulk text replacement:
|
|
* the decomposition would replace the entire code point
|
|
*/
|
|
--s1;
|
|
c1=cs1.charAt(s1-1);
|
|
}
|
|
}
|
|
|
|
/* push current level pointers */
|
|
if(stack2==null) {
|
|
stack2=createCmpEquivLevelStack();
|
|
}
|
|
stack2[level2].cs=cs2;
|
|
stack2[level2].s=s2;
|
|
++level2;
|
|
|
|
/* set empty intermediate level if skipped */
|
|
if(level2<2) {
|
|
stack2[level2++].cs=null;
|
|
}
|
|
|
|
/* set next level pointers to decomposition */
|
|
cs2=decomp2;
|
|
s2=0;
|
|
limit2=decomp2.length();
|
|
|
|
/* get ready to read from decomposition, continue with loop */
|
|
c2=-1;
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* no decomposition/case folding, max level for both sides:
|
|
* return difference result
|
|
*
|
|
* code point order comparison must not just return cp1-cp2
|
|
* because when single surrogates are present then the surrogate pairs
|
|
* that formed cp1 and cp2 may be from different string indexes
|
|
*
|
|
* example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
|
|
* c1=d800 cp1=10001 c2=dc00 cp2=10000
|
|
* cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
|
|
*
|
|
* therefore, use same fix-up as in ustring.c/uprv_strCompare()
|
|
* except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
|
|
* so we have slightly different pointer/start/limit comparisons here
|
|
*/
|
|
|
|
if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) {
|
|
/* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
|
|
if(
|
|
(c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) ||
|
|
(Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2)))
|
|
) {
|
|
/* part of a surrogate pair, leave >=d800 */
|
|
} else {
|
|
/* BMP code point - may be surrogate code point - make <d800 */
|
|
c1-=0x2800;
|
|
}
|
|
|
|
if(
|
|
(c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) ||
|
|
(Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2)))
|
|
) {
|
|
/* part of a surrogate pair, leave >=d800 */
|
|
} else {
|
|
/* BMP code point - may be surrogate code point - make <d800 */
|
|
c2-=0x2800;
|
|
}
|
|
}
|
|
|
|
return c1-c2;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* An Appendable that writes into a char array with a capacity that may be
|
|
* less than array.length.
|
|
* (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.)
|
|
* <p>
|
|
* An overflow is only reported at the end, for the old Normalizer API functions that write
|
|
* to char arrays.
|
|
*/
|
|
private static final class CharsAppendable implements Appendable {
|
|
public CharsAppendable(char[] dest, int destStart, int destLimit) {
|
|
chars=dest;
|
|
start=offset=destStart;
|
|
limit=destLimit;
|
|
}
|
|
public int length() {
|
|
int len=offset-start;
|
|
if(offset<=limit) {
|
|
return len;
|
|
} else {
|
|
throw new IndexOutOfBoundsException(Integer.toString(len));
|
|
}
|
|
}
|
|
@Override
|
|
public Appendable append(char c) {
|
|
if(offset<limit) {
|
|
chars[offset]=c;
|
|
}
|
|
++offset;
|
|
return this;
|
|
}
|
|
@Override
|
|
public Appendable append(CharSequence s) {
|
|
return append(s, 0, s.length());
|
|
}
|
|
@Override
|
|
public Appendable append(CharSequence s, int sStart, int sLimit) {
|
|
int len=sLimit-sStart;
|
|
if(len<=(limit-offset)) {
|
|
while(sStart<sLimit) { // TODO: Is there a better way to copy the characters?
|
|
chars[offset++]=s.charAt(sStart++);
|
|
}
|
|
} else {
|
|
offset+=len;
|
|
}
|
|
return this;
|
|
}
|
|
|
|
private final char[] chars;
|
|
private final int start, limit;
|
|
private int offset;
|
|
}
|
|
}
|