1533 lines
60 KiB
Java
1533 lines
60 KiB
Java
/* GENERATED SOURCE. DO NOT MODIFY. */
|
||
// © 2016 and later: Unicode, Inc. and others.
|
||
// License & terms of use: http://www.unicode.org/copyright.html
|
||
package android.icu.impl;
|
||
|
||
import java.io.IOException;
|
||
import java.text.CharacterIterator;
|
||
import java.util.Locale;
|
||
|
||
import android.icu.lang.UCharacter;
|
||
import android.icu.lang.UCharacterCategory;
|
||
import android.icu.text.BreakIterator;
|
||
import android.icu.text.Edits;
|
||
import android.icu.util.ICUUncheckedIOException;
|
||
import android.icu.util.ULocale;
|
||
|
||
/**
|
||
* @hide Only a subset of ICU is exposed in Android
|
||
*/
|
||
public final class CaseMapImpl {
|
||
/**
|
||
* Implementation of UCaseProps.ContextIterator, iterates over a String.
|
||
* See ustrcase.c/utf16_caseContextIterator().
|
||
* @hide Only a subset of ICU is exposed in Android
|
||
*/
|
||
public static final class StringContextIterator implements UCaseProps.ContextIterator {
|
||
/**
|
||
* Constructor.
|
||
* @param src String to iterate over.
|
||
*/
|
||
public StringContextIterator(CharSequence src) {
|
||
this.s=src;
|
||
limit=src.length();
|
||
cpStart=cpLimit=index=0;
|
||
dir=0;
|
||
}
|
||
|
||
/**
|
||
* Constructor.
|
||
* @param src String to iterate over.
|
||
* @param cpStart Start index of the current code point.
|
||
* @param cpLimit Limit index of the current code point.
|
||
*/
|
||
public StringContextIterator(CharSequence src, int cpStart, int cpLimit) {
|
||
s = src;
|
||
index = 0;
|
||
limit = src.length();
|
||
this.cpStart = cpStart;
|
||
this.cpLimit = cpLimit;
|
||
dir = 0;
|
||
}
|
||
|
||
/**
|
||
* Set the iteration limit for nextCaseMapCP() to an index within the string.
|
||
* If the limit parameter is negative or past the string, then the
|
||
* string length is restored as the iteration limit.
|
||
*
|
||
* <p>This limit does not affect the next() function which always
|
||
* iterates to the very end of the string.
|
||
*
|
||
* @param lim The iteration limit.
|
||
*/
|
||
public void setLimit(int lim) {
|
||
if(0<=lim && lim<=s.length()) {
|
||
limit=lim;
|
||
} else {
|
||
limit=s.length();
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Move to the iteration limit without fetching code points up to there.
|
||
*/
|
||
public void moveToLimit() {
|
||
cpStart=cpLimit=limit;
|
||
}
|
||
|
||
public void moveTo(int i) {
|
||
cpStart=cpLimit=i;
|
||
}
|
||
|
||
/**
|
||
* Iterate forward through the string to fetch the next code point
|
||
* to be case-mapped, and set the context indexes for it.
|
||
*
|
||
* <p>When the iteration limit is reached (and -1 is returned),
|
||
* getCPStart() will be at the iteration limit.
|
||
*
|
||
* <p>Iteration with next() does not affect the position for nextCaseMapCP().
|
||
*
|
||
* @return The next code point to be case-mapped, or <0 when the iteration is done.
|
||
*/
|
||
public int nextCaseMapCP() {
|
||
cpStart=cpLimit;
|
||
if(cpLimit<limit) {
|
||
int c=Character.codePointAt(s, cpLimit);
|
||
cpLimit+=Character.charCount(c);
|
||
return c;
|
||
} else {
|
||
return -1;
|
||
}
|
||
}
|
||
|
||
public void setCPStartAndLimit(int s, int l) {
|
||
cpStart = s;
|
||
cpLimit = l;
|
||
dir = 0;
|
||
}
|
||
/**
|
||
* Returns the start of the code point that was last returned
|
||
* by nextCaseMapCP().
|
||
*/
|
||
public int getCPStart() {
|
||
return cpStart;
|
||
}
|
||
|
||
/**
|
||
* Returns the limit of the code point that was last returned
|
||
* by nextCaseMapCP().
|
||
*/
|
||
public int getCPLimit() {
|
||
return cpLimit;
|
||
}
|
||
|
||
public int getCPLength() {
|
||
return cpLimit-cpStart;
|
||
}
|
||
|
||
// implement UCaseProps.ContextIterator
|
||
// The following code is not used anywhere in this private class
|
||
@Override
|
||
public void reset(int direction) {
|
||
if(direction>0) {
|
||
/* reset for forward iteration */
|
||
dir=1;
|
||
index=cpLimit;
|
||
} else if(direction<0) {
|
||
/* reset for backward iteration */
|
||
dir=-1;
|
||
index=cpStart;
|
||
} else {
|
||
// not a valid direction
|
||
dir=0;
|
||
index=0;
|
||
}
|
||
}
|
||
|
||
@Override
|
||
public int next() {
|
||
int c;
|
||
|
||
if(dir>0 && index<s.length()) {
|
||
c=Character.codePointAt(s, index);
|
||
index+=Character.charCount(c);
|
||
return c;
|
||
} else if(dir<0 && index>0) {
|
||
c=Character.codePointBefore(s, index);
|
||
index-=Character.charCount(c);
|
||
return c;
|
||
}
|
||
return -1;
|
||
}
|
||
|
||
// variables
|
||
protected CharSequence s;
|
||
protected int index, limit, cpStart, cpLimit;
|
||
protected int dir; // 0=initial state >0=forward <0=backward
|
||
}
|
||
|
||
public static final int TITLECASE_WHOLE_STRING = 0x20;
|
||
public static final int TITLECASE_SENTENCES = 0x40;
|
||
|
||
/**
|
||
* Bit mask for the titlecasing iterator options bit field.
|
||
* Currently only 3 out of 8 values are used:
|
||
* 0 (words), TITLECASE_WHOLE_STRING, TITLECASE_SENTENCES.
|
||
* See stringoptions.h.
|
||
* @hide draft / provisional / internal are hidden on Android
|
||
*/
|
||
private static final int TITLECASE_ITERATOR_MASK = 0xe0;
|
||
|
||
public static final int TITLECASE_ADJUST_TO_CASED = 0x400;
|
||
|
||
/**
|
||
* Bit mask for the titlecasing index adjustment options bit set.
|
||
* Currently two bits are defined:
|
||
* TITLECASE_NO_BREAK_ADJUSTMENT, TITLECASE_ADJUST_TO_CASED.
|
||
* See stringoptions.h.
|
||
* @hide draft / provisional / internal are hidden on Android
|
||
*/
|
||
private static final int TITLECASE_ADJUSTMENT_MASK = 0x600;
|
||
|
||
public static int addTitleAdjustmentOption(int options, int newOption) {
|
||
int adjOptions = options & TITLECASE_ADJUSTMENT_MASK;
|
||
if (adjOptions !=0 && adjOptions != newOption) {
|
||
throw new IllegalArgumentException("multiple titlecasing index adjustment options");
|
||
}
|
||
return options | newOption;
|
||
}
|
||
|
||
private static final char ACUTE = '\u0301';
|
||
|
||
private static final int U_GC_M_MASK =
|
||
(1 << UCharacterCategory.NON_SPACING_MARK) |
|
||
(1 << UCharacterCategory.COMBINING_SPACING_MARK) |
|
||
(1 << UCharacterCategory.ENCLOSING_MARK);
|
||
|
||
private static final int LNS =
|
||
(1 << UCharacterCategory.UPPERCASE_LETTER) |
|
||
(1 << UCharacterCategory.LOWERCASE_LETTER) |
|
||
(1 << UCharacterCategory.TITLECASE_LETTER) |
|
||
// Not MODIFIER_LETTER: We count only cased modifier letters.
|
||
(1 << UCharacterCategory.OTHER_LETTER) |
|
||
|
||
(1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) |
|
||
(1 << UCharacterCategory.LETTER_NUMBER) |
|
||
(1 << UCharacterCategory.OTHER_NUMBER) |
|
||
|
||
(1 << UCharacterCategory.MATH_SYMBOL) |
|
||
(1 << UCharacterCategory.CURRENCY_SYMBOL) |
|
||
(1 << UCharacterCategory.MODIFIER_SYMBOL) |
|
||
(1 << UCharacterCategory.OTHER_SYMBOL) |
|
||
|
||
(1 << UCharacterCategory.PRIVATE_USE);
|
||
|
||
private static boolean isLNS(int c) {
|
||
// Letter, number, symbol,
|
||
// or a private use code point because those are typically used as letters or numbers.
|
||
// Consider modifier letters only if they are cased.
|
||
int gc = UCharacterProperty.INSTANCE.getType(c);
|
||
return ((1 << gc) & LNS) != 0 ||
|
||
(gc == UCharacterCategory.MODIFIER_LETTER &&
|
||
UCaseProps.INSTANCE.getType(c) != UCaseProps.NONE);
|
||
}
|
||
|
||
public static int addTitleIteratorOption(int options, int newOption) {
|
||
int iterOptions = options & TITLECASE_ITERATOR_MASK;
|
||
if (iterOptions !=0 && iterOptions != newOption) {
|
||
throw new IllegalArgumentException("multiple titlecasing iterator options");
|
||
}
|
||
return options | newOption;
|
||
}
|
||
|
||
public static BreakIterator getTitleBreakIterator(
|
||
Locale locale, int options, BreakIterator iter) {
|
||
options &= TITLECASE_ITERATOR_MASK;
|
||
if (options != 0 && iter != null) {
|
||
throw new IllegalArgumentException(
|
||
"titlecasing iterator option together with an explicit iterator");
|
||
}
|
||
if (iter == null) {
|
||
switch (options) {
|
||
case 0:
|
||
iter = BreakIterator.getWordInstance(locale);
|
||
break;
|
||
case TITLECASE_WHOLE_STRING:
|
||
iter = new WholeStringBreakIterator();
|
||
break;
|
||
case TITLECASE_SENTENCES:
|
||
iter = BreakIterator.getSentenceInstance(locale);
|
||
break;
|
||
default:
|
||
throw new IllegalArgumentException("unknown titlecasing iterator option");
|
||
}
|
||
}
|
||
return iter;
|
||
}
|
||
|
||
public static BreakIterator getTitleBreakIterator(
|
||
ULocale locale, int options, BreakIterator iter) {
|
||
options &= TITLECASE_ITERATOR_MASK;
|
||
if (options != 0 && iter != null) {
|
||
throw new IllegalArgumentException(
|
||
"titlecasing iterator option together with an explicit iterator");
|
||
}
|
||
if (iter == null) {
|
||
switch (options) {
|
||
case 0:
|
||
iter = BreakIterator.getWordInstance(locale);
|
||
break;
|
||
case TITLECASE_WHOLE_STRING:
|
||
iter = new WholeStringBreakIterator();
|
||
break;
|
||
case TITLECASE_SENTENCES:
|
||
iter = BreakIterator.getSentenceInstance(locale);
|
||
break;
|
||
default:
|
||
throw new IllegalArgumentException("unknown titlecasing iterator option");
|
||
}
|
||
}
|
||
return iter;
|
||
}
|
||
|
||
/**
|
||
* Omit unchanged text when case-mapping with Edits.
|
||
*/
|
||
public static final int OMIT_UNCHANGED_TEXT = 0x4000;
|
||
|
||
private static final class WholeStringBreakIterator extends BreakIterator {
|
||
private int length;
|
||
|
||
private static void notImplemented() {
|
||
throw new UnsupportedOperationException("should not occur");
|
||
}
|
||
|
||
@Override
|
||
public int first() {
|
||
return 0;
|
||
}
|
||
|
||
@Override
|
||
public int last() {
|
||
notImplemented();
|
||
return 0;
|
||
}
|
||
|
||
@Override
|
||
public int next(int n) {
|
||
notImplemented();
|
||
return 0;
|
||
}
|
||
|
||
@Override
|
||
public int next() {
|
||
return length;
|
||
}
|
||
|
||
@Override
|
||
public int previous() {
|
||
notImplemented();
|
||
return 0;
|
||
}
|
||
|
||
@Override
|
||
public int following(int offset) {
|
||
notImplemented();
|
||
return 0;
|
||
}
|
||
|
||
@Override
|
||
public int current() {
|
||
notImplemented();
|
||
return 0;
|
||
}
|
||
|
||
@Override
|
||
public CharacterIterator getText() {
|
||
notImplemented();
|
||
return null;
|
||
}
|
||
|
||
@Override
|
||
public void setText(CharacterIterator newText) {
|
||
length = newText.getEndIndex();
|
||
}
|
||
|
||
@Override
|
||
public void setText(CharSequence newText) {
|
||
length = newText.length();
|
||
}
|
||
|
||
@Override
|
||
public void setText(String newText) {
|
||
length = newText.length();
|
||
}
|
||
}
|
||
|
||
private static int appendCodePoint(Appendable a, int c) throws IOException {
|
||
if (c <= Character.MAX_VALUE) {
|
||
a.append((char)c);
|
||
return 1;
|
||
} else {
|
||
a.append((char)(0xd7c0 + (c >> 10)));
|
||
a.append((char)(Character.MIN_LOW_SURROGATE + (c & 0x3ff)));
|
||
return 2;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}.
|
||
* @throws IOException
|
||
*/
|
||
private static void appendResult(int result, Appendable dest,
|
||
int cpLength, int options, Edits edits) throws IOException {
|
||
// Decode the result.
|
||
if (result < 0) {
|
||
// (not) original code point
|
||
if (edits != null) {
|
||
edits.addUnchanged(cpLength);
|
||
}
|
||
if ((options & OMIT_UNCHANGED_TEXT) != 0) {
|
||
return;
|
||
}
|
||
appendCodePoint(dest, ~result);
|
||
} else if (result <= UCaseProps.MAX_STRING_LENGTH) {
|
||
// The mapping has already been appended to result.
|
||
if (edits != null) {
|
||
edits.addReplace(cpLength, result);
|
||
}
|
||
} else {
|
||
// Append the single-code point mapping.
|
||
int length = appendCodePoint(dest, result);
|
||
if (edits != null) {
|
||
edits.addReplace(cpLength, length);
|
||
}
|
||
}
|
||
}
|
||
|
||
private static final void appendUnchanged(CharSequence src, int start, int length,
|
||
Appendable dest, int options, Edits edits) throws IOException {
|
||
if (length > 0) {
|
||
if (edits != null) {
|
||
edits.addUnchanged(length);
|
||
}
|
||
if ((options & OMIT_UNCHANGED_TEXT) != 0) {
|
||
return;
|
||
}
|
||
dest.append(src, start, start + length);
|
||
}
|
||
}
|
||
|
||
private static String applyEdits(CharSequence src, StringBuilder replacementChars, Edits edits) {
|
||
if (!edits.hasChanges()) {
|
||
return src.toString();
|
||
}
|
||
StringBuilder result = new StringBuilder(src.length() + edits.lengthDelta());
|
||
for (Edits.Iterator ei = edits.getCoarseIterator(); ei.next();) {
|
||
if (ei.hasChange()) {
|
||
int i = ei.replacementIndex();
|
||
result.append(replacementChars, i, i + ei.newLength());
|
||
} else {
|
||
int i = ei.sourceIndex();
|
||
result.append(src, i, i + ei.oldLength());
|
||
}
|
||
}
|
||
return result.toString();
|
||
}
|
||
|
||
/** Case properties trie, used by the fast paths of internalToLower() and internalToUpper(). */
private static final Trie2_16 CASE_TRIE = UCaseProps.getTrie();
|
||
|
||
/**
|
||
* caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
|
||
* caseLocale < 0: Case-folds [srcStart..srcLimit[.
|
||
*/
|
||
private static void internalToLower(int caseLocale, int options,
|
||
CharSequence src, int srcStart, int srcLimit, StringContextIterator iter,
|
||
Appendable dest, Edits edits) throws IOException {
|
||
byte[] latinToLower;
|
||
if (caseLocale == UCaseProps.LOC_ROOT ||
|
||
(caseLocale >= 0 ?
|
||
!(caseLocale == UCaseProps.LOC_TURKISH || caseLocale == UCaseProps.LOC_LITHUANIAN) :
|
||
(options & UCaseProps.FOLD_CASE_OPTIONS_MASK) == UCharacter.FOLD_CASE_DEFAULT)) {
|
||
latinToLower = UCaseProps.LatinCase.TO_LOWER_NORMAL;
|
||
} else {
|
||
latinToLower = UCaseProps.LatinCase.TO_LOWER_TR_LT;
|
||
}
|
||
int prev = srcStart;
|
||
int srcIndex = srcStart;
|
||
outerLoop:
|
||
for (;;) {
|
||
// fast path for simple cases
|
||
char lead;
|
||
for (;;) {
|
||
if (srcIndex >= srcLimit) {
|
||
break outerLoop;
|
||
}
|
||
lead = src.charAt(srcIndex);
|
||
int delta;
|
||
if (lead < UCaseProps.LatinCase.LONG_S) {
|
||
byte d = latinToLower[lead];
|
||
if (d == UCaseProps.LatinCase.EXC) { break; }
|
||
++srcIndex;
|
||
if (d == 0) { continue; }
|
||
delta = d;
|
||
} else if (lead >= 0xd800) {
|
||
break; // surrogate or higher
|
||
} else {
|
||
int props = CASE_TRIE.getFromU16SingleLead(lead);
|
||
if (UCaseProps.propsHasException(props)) { break; }
|
||
++srcIndex;
|
||
if (!UCaseProps.isUpperOrTitleFromProps(props) ||
|
||
(delta = UCaseProps.getDelta(props)) == 0) {
|
||
continue;
|
||
}
|
||
}
|
||
lead += delta;
|
||
appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
|
||
dest.append(lead);
|
||
if (edits != null) {
|
||
edits.addReplace(1, 1);
|
||
}
|
||
prev = srcIndex;
|
||
}
|
||
// slow path
|
||
int cpStart = srcIndex++;
|
||
char trail;
|
||
int c;
|
||
if (Character.isHighSurrogate(lead) && srcIndex < srcLimit &&
|
||
Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
|
||
c = Character.toCodePoint(lead, trail);
|
||
++srcIndex;
|
||
} else {
|
||
c = lead;
|
||
}
|
||
// We need to append unchanged text before calling the UCaseProps.toFullXyz() methods
|
||
// because they will sometimes append their mapping to dest,
|
||
// and that must be after copying the previous text.
|
||
appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
|
||
prev = cpStart;
|
||
if (caseLocale >= 0) {
|
||
if (iter == null) {
|
||
iter = new StringContextIterator(src, cpStart, srcIndex);
|
||
} else {
|
||
iter.setCPStartAndLimit(cpStart, srcIndex);
|
||
}
|
||
c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
|
||
} else {
|
||
c = UCaseProps.INSTANCE.toFullFolding(c, dest, options);
|
||
}
|
||
if (c >= 0) {
|
||
appendResult(c, dest, srcIndex - cpStart, options, edits);
|
||
prev = srcIndex;
|
||
}
|
||
}
|
||
appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
|
||
}
|
||
|
||
private static void internalToUpper(int caseLocale, int options,
|
||
CharSequence src, Appendable dest, Edits edits) throws IOException {
|
||
StringContextIterator iter = null;
|
||
byte[] latinToUpper;
|
||
if (caseLocale == UCaseProps.LOC_TURKISH) {
|
||
latinToUpper = UCaseProps.LatinCase.TO_UPPER_TR;
|
||
} else {
|
||
latinToUpper = UCaseProps.LatinCase.TO_UPPER_NORMAL;
|
||
}
|
||
int prev = 0;
|
||
int srcIndex = 0;
|
||
int srcLength = src.length();
|
||
outerLoop:
|
||
for (;;) {
|
||
// fast path for simple cases
|
||
char lead;
|
||
for (;;) {
|
||
if (srcIndex >= srcLength) {
|
||
break outerLoop;
|
||
}
|
||
lead = src.charAt(srcIndex);
|
||
int delta;
|
||
if (lead < UCaseProps.LatinCase.LONG_S) {
|
||
byte d = latinToUpper[lead];
|
||
if (d == UCaseProps.LatinCase.EXC) { break; }
|
||
++srcIndex;
|
||
if (d == 0) { continue; }
|
||
delta = d;
|
||
} else if (lead >= 0xd800) {
|
||
break; // surrogate or higher
|
||
} else {
|
||
int props = CASE_TRIE.getFromU16SingleLead(lead);
|
||
if (UCaseProps.propsHasException(props)) { break; }
|
||
++srcIndex;
|
||
if (UCaseProps.getTypeFromProps(props) != UCaseProps.LOWER ||
|
||
(delta = UCaseProps.getDelta(props)) == 0) {
|
||
continue;
|
||
}
|
||
}
|
||
lead += delta;
|
||
appendUnchanged(src, prev, srcIndex - 1 - prev, dest, options, edits);
|
||
dest.append(lead);
|
||
if (edits != null) {
|
||
edits.addReplace(1, 1);
|
||
}
|
||
prev = srcIndex;
|
||
}
|
||
// slow path
|
||
int cpStart = srcIndex++;
|
||
char trail;
|
||
int c;
|
||
if (Character.isHighSurrogate(lead) && srcIndex < srcLength &&
|
||
Character.isLowSurrogate(trail = src.charAt(srcIndex))) {
|
||
c = Character.toCodePoint(lead, trail);
|
||
++srcIndex;
|
||
} else {
|
||
c = lead;
|
||
}
|
||
if (iter == null) {
|
||
iter = new StringContextIterator(src, cpStart, srcIndex);
|
||
} else {
|
||
iter.setCPStartAndLimit(cpStart, srcIndex);
|
||
}
|
||
// We need to append unchanged text before calling UCaseProps.toFullUpper()
|
||
// because it will sometimes append its mapping to dest,
|
||
// and that must be after copying the previous text.
|
||
appendUnchanged(src, prev, cpStart - prev, dest, options, edits);
|
||
prev = cpStart;
|
||
c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale);
|
||
if (c >= 0) {
|
||
appendResult(c, dest, srcIndex - cpStart, options, edits);
|
||
prev = srcIndex;
|
||
}
|
||
}
|
||
appendUnchanged(src, prev, srcIndex - prev, dest, options, edits);
|
||
}
|
||
|
||
public static String toLower(int caseLocale, int options, CharSequence src) {
|
||
if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
|
||
if (src.length() == 0) {
|
||
return src.toString();
|
||
}
|
||
// Collect and apply only changes.
|
||
// Good if no or few changes. Bad (slow) if many changes.
|
||
Edits edits = new Edits();
|
||
StringBuilder replacementChars = toLower(
|
||
caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits);
|
||
return applyEdits(src, replacementChars, edits);
|
||
} else {
|
||
return toLower(caseLocale, options, src,
|
||
new StringBuilder(src.length()), null).toString();
|
||
}
|
||
}
|
||
|
||
public static <A extends Appendable> A toLower(int caseLocale, int options,
|
||
CharSequence src, A dest, Edits edits) {
|
||
try {
|
||
if (edits != null) {
|
||
edits.reset();
|
||
}
|
||
internalToLower(caseLocale, options, src, 0, src.length(), null, dest, edits);
|
||
return dest;
|
||
} catch (IOException e) {
|
||
throw new ICUUncheckedIOException(e);
|
||
}
|
||
}
|
||
|
||
public static String toUpper(int caseLocale, int options, CharSequence src) {
|
||
if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
|
||
if (src.length() == 0) {
|
||
return src.toString();
|
||
}
|
||
// Collect and apply only changes.
|
||
// Good if no or few changes. Bad (slow) if many changes.
|
||
Edits edits = new Edits();
|
||
StringBuilder replacementChars = toUpper(
|
||
caseLocale, options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits);
|
||
return applyEdits(src, replacementChars, edits);
|
||
} else {
|
||
return toUpper(caseLocale, options, src,
|
||
new StringBuilder(src.length()), null).toString();
|
||
}
|
||
}
|
||
|
||
public static <A extends Appendable> A toUpper(int caseLocale, int options,
|
||
CharSequence src, A dest, Edits edits) {
|
||
try {
|
||
if (edits != null) {
|
||
edits.reset();
|
||
}
|
||
if (caseLocale == UCaseProps.LOC_GREEK) {
|
||
return GreekUpper.toUpper(options, src, dest, edits);
|
||
}
|
||
internalToUpper(caseLocale, options, src, dest, edits);
|
||
return dest;
|
||
} catch (IOException e) {
|
||
throw new ICUUncheckedIOException(e);
|
||
}
|
||
}
|
||
|
||
public static String toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src) {
|
||
if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
|
||
if (src.length() == 0) {
|
||
return src.toString();
|
||
}
|
||
// Collect and apply only changes.
|
||
// Good if no or few changes. Bad (slow) if many changes.
|
||
Edits edits = new Edits();
|
||
StringBuilder replacementChars = toTitle(
|
||
caseLocale, options | OMIT_UNCHANGED_TEXT, iter, src,
|
||
new StringBuilder(), edits);
|
||
return applyEdits(src, replacementChars, edits);
|
||
} else {
|
||
return toTitle(caseLocale, options, iter, src,
|
||
new StringBuilder(src.length()), null).toString();
|
||
}
|
||
}
|
||
|
||
public static <A extends Appendable> A toTitle(
|
||
int caseLocale, int options, BreakIterator titleIter,
|
||
CharSequence src, A dest, Edits edits) {
|
||
try {
|
||
if (edits != null) {
|
||
edits.reset();
|
||
}
|
||
|
||
/* set up local variables */
|
||
StringContextIterator iter = new StringContextIterator(src);
|
||
int srcLength = src.length();
|
||
int prev=0;
|
||
boolean isFirstIndex=true;
|
||
|
||
/* titlecasing loop */
|
||
while(prev<srcLength) {
|
||
/* find next index where to titlecase */
|
||
int index;
|
||
if(isFirstIndex) {
|
||
isFirstIndex=false;
|
||
index=titleIter.first();
|
||
} else {
|
||
index=titleIter.next();
|
||
}
|
||
if(index==BreakIterator.DONE || index>srcLength) {
|
||
index=srcLength;
|
||
}
|
||
|
||
/*
|
||
* Segment [prev..index[ into 3 parts:
|
||
* a) skipped characters (copy as-is) [prev..titleStart[
|
||
* b) first letter (titlecase) [titleStart..titleLimit[
|
||
* c) subsequent characters (lowercase) [titleLimit..index[
|
||
*/
|
||
if(prev<index) {
|
||
// Find and copy skipped characters [prev..titleStart[
|
||
int titleStart=prev;
|
||
iter.setLimit(index);
|
||
int c=iter.nextCaseMapCP();
|
||
if ((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
|
||
// Adjust the titlecasing index to the next cased character,
|
||
// or to the next letter/number/symbol/private use.
|
||
// Stop with titleStart<titleLimit<=index
|
||
// if there is a character to be titlecased,
|
||
// or else stop with titleStart==titleLimit==index.
|
||
boolean toCased = (options&CaseMapImpl.TITLECASE_ADJUST_TO_CASED) != 0;
|
||
while ((toCased ?
|
||
UCaseProps.NONE==UCaseProps.INSTANCE.getType(c) :
|
||
!CaseMapImpl.isLNS(c)) &&
|
||
(c=iter.nextCaseMapCP())>=0) {}
|
||
// If c<0 then we have only uncased characters in [prev..index[
|
||
// and stopped with titleStart==titleLimit==index.
|
||
titleStart=iter.getCPStart();
|
||
if (prev < titleStart) {
|
||
appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
|
||
}
|
||
}
|
||
|
||
if(titleStart<index) {
|
||
// titlecase c which is from [titleStart..titleLimit[
|
||
c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale);
|
||
appendResult(c, dest, iter.getCPLength(), options, edits);
|
||
|
||
// Special case Dutch IJ titlecasing
|
||
int titleLimit;
|
||
if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) {
|
||
if (c < 0) {
|
||
c = ~c;
|
||
}
|
||
if (c == 'I' || c == 'Í') {
|
||
titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index, dest, options, edits);
|
||
iter.moveTo(titleLimit);
|
||
}
|
||
else {
|
||
titleLimit = iter.getCPLimit();
|
||
}
|
||
} else {
|
||
titleLimit = iter.getCPLimit();
|
||
}
|
||
|
||
// lowercase [titleLimit..index[
|
||
if(titleLimit<index) {
|
||
if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) {
|
||
// Normal operation: Lowercase the rest of the word.
|
||
internalToLower(caseLocale, options,
|
||
src, titleLimit, index, iter, dest, edits);
|
||
} else {
|
||
// Optionally just copy the rest of the word unchanged.
|
||
appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits);
|
||
}
|
||
iter.moveToLimit();
|
||
}
|
||
}
|
||
}
|
||
|
||
prev=index;
|
||
}
|
||
return dest;
|
||
} catch (IOException e) {
|
||
throw new ICUUncheckedIOException(e);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Input: c is a letter I with or without acute accent.
|
||
* start is the index in src after c, and is less than segmentLimit.
|
||
* If a plain i/I is followed by a plain j/J,
|
||
* or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
|
||
* then we output accordingly.
|
||
*
|
||
* @return the src index after the titlecased sequence, or the start index if no Dutch IJ
|
||
* @throws IOException
|
||
*/
|
||
private static <A extends Appendable> int maybeTitleDutchIJ(
|
||
CharSequence src, int c, int start, int segmentLimit,
|
||
A dest, int options, Edits edits) throws IOException {
|
||
assert start < segmentLimit;
|
||
|
||
int index = start;
|
||
boolean withAcute = false;
|
||
|
||
// If the conditions are met, then the following variables tell us what to output.
|
||
int unchanged1 = 0; // code units before the j, or the whole sequence (0..3)
|
||
boolean doTitleJ = false; // true if the j needs to be titlecased
|
||
int unchanged2 = 0; // after the j (0 or 1)
|
||
|
||
// next character after the first letter
|
||
char c2 = src.charAt(index++);
|
||
|
||
// Is the first letter an i/I with accent?
|
||
if (c == 'I') {
|
||
if (c2 == ACUTE) {
|
||
withAcute = true;
|
||
unchanged1 = 1;
|
||
if (index == segmentLimit) { return start; }
|
||
c2 = src.charAt(index++);
|
||
}
|
||
} else { // Í
|
||
withAcute = true;
|
||
}
|
||
// Is the next character a j/J?
|
||
if (c2 == 'j') {
|
||
doTitleJ = true;
|
||
} else if (c2 == 'J') {
|
||
++unchanged1;
|
||
} else {
|
||
return start;
|
||
}
|
||
// A plain i/I must be followed by a plain j/J.
|
||
// An i/I with acute must be followed by a j/J with acute.
|
||
if (withAcute) {
|
||
if (index == segmentLimit || src.charAt(index++) != ACUTE) { return start; }
|
||
if (doTitleJ) {
|
||
unchanged2 = 1;
|
||
} else {
|
||
++unchanged1;
|
||
}
|
||
}
|
||
// There must not be another combining mark.
|
||
if (index < segmentLimit) {
|
||
int cp = Character.codePointAt(src, index);
|
||
int bit = 1 << UCharacter.getType(cp);
|
||
if ((bit & U_GC_M_MASK) != 0) {
|
||
return start;
|
||
}
|
||
}
|
||
// Output the rest of the Dutch IJ.
|
||
appendUnchanged(src, start, unchanged1, dest, options, edits);
|
||
start += unchanged1;
|
||
if (doTitleJ) {
|
||
dest.append('J');
|
||
if (edits != null) {
|
||
edits.addReplace(1, 1);
|
||
}
|
||
++start;
|
||
}
|
||
appendUnchanged(src, start, unchanged2, dest, options, edits);
|
||
assert start + unchanged2 == index;
|
||
return index;
|
||
}
|
||
|
||
public static String fold(int options, CharSequence src) {
|
||
if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
|
||
if (src.length() == 0) {
|
||
return src.toString();
|
||
}
|
||
// Collect and apply only changes.
|
||
// Good if no or few changes. Bad (slow) if many changes.
|
||
Edits edits = new Edits();
|
||
StringBuilder replacementChars = fold(
|
||
options | OMIT_UNCHANGED_TEXT, src, new StringBuilder(), edits);
|
||
return applyEdits(src, replacementChars, edits);
|
||
} else {
|
||
return fold(options, src, new StringBuilder(src.length()), null).toString();
|
||
}
|
||
}
|
||
|
||
public static <A extends Appendable> A fold(int options,
|
||
CharSequence src, A dest, Edits edits) {
|
||
try {
|
||
if (edits != null) {
|
||
edits.reset();
|
||
}
|
||
internalToLower(-1, options, src, 0, src.length(), null, dest, edits);
|
||
return dest;
|
||
} catch (IOException e) {
|
||
throw new ICUUncheckedIOException(e);
|
||
}
|
||
}
|
||
|
||
private static final class GreekUpper {
|
||
// Data bits.
|
||
private static final int UPPER_MASK = 0x3ff;
|
||
private static final int HAS_VOWEL = 0x1000;
|
||
private static final int HAS_YPOGEGRAMMENI = 0x2000;
|
||
private static final int HAS_ACCENT = 0x4000;
|
||
private static final int HAS_DIALYTIKA = 0x8000;
|
||
// Further bits during data building and processing, not stored in the data map.
|
||
private static final int HAS_COMBINING_DIALYTIKA = 0x10000;
|
||
private static final int HAS_OTHER_GREEK_DIACRITIC = 0x20000;
|
||
|
||
private static final int HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
|
||
private static final int HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
|
||
HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
|
||
private static final int HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;
|
||
|
||
// State bits.
|
||
private static final int AFTER_CASED = 1;
|
||
private static final int AFTER_VOWEL_WITH_COMBINING_ACCENT = 2;
|
||
private static final int AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT = 4;
|
||
|
||
// Data generated by prototype code, see
|
||
// https://icu.unicode.org/design/case/greek-upper
|
||
// TODO: Move this data into ucase.icu.
|
||
private static final char[] data0370 = {
|
||
// U+0370..03FF
|
||
0x0370, // Ͱ
|
||
0x0370, // ͱ
|
||
0x0372, // Ͳ
|
||
0x0372, // ͳ
|
||
0,
|
||
0,
|
||
0x0376, // Ͷ
|
||
0x0376, // ͷ
|
||
0,
|
||
0,
|
||
0x037A, // ͺ
|
||
0x03FD, // ͻ
|
||
0x03FE, // ͼ
|
||
0x03FF, // ͽ
|
||
0,
|
||
0x037F, // Ϳ
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ά
|
||
0,
|
||
0x0395 | HAS_VOWEL | HAS_ACCENT, // Έ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ή
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ί
|
||
0,
|
||
0x039F | HAS_VOWEL | HAS_ACCENT, // Ό
|
||
0,
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ύ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ώ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΐ
|
||
0x0391 | HAS_VOWEL, // Α
|
||
0x0392, // Β
|
||
0x0393, // Γ
|
||
0x0394, // Δ
|
||
0x0395 | HAS_VOWEL, // Ε
|
||
0x0396, // Ζ
|
||
0x0397 | HAS_VOWEL, // Η
|
||
0x0398, // Θ
|
||
0x0399 | HAS_VOWEL, // Ι
|
||
0x039A, // Κ
|
||
0x039B, // Λ
|
||
0x039C, // Μ
|
||
0x039D, // Ν
|
||
0x039E, // Ξ
|
||
0x039F | HAS_VOWEL, // Ο
|
||
0x03A0, // Π
|
||
0x03A1, // Ρ
|
||
0,
|
||
0x03A3, // Σ
|
||
0x03A4, // Τ
|
||
0x03A5 | HAS_VOWEL, // Υ
|
||
0x03A6, // Φ
|
||
0x03A7, // Χ
|
||
0x03A8, // Ψ
|
||
0x03A9 | HAS_VOWEL, // Ω
|
||
0x0399 | HAS_VOWEL | HAS_DIALYTIKA, // Ϊ
|
||
0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, // Ϋ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // ά
|
||
0x0395 | HAS_VOWEL | HAS_ACCENT, // έ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // ή
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // ί
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΰ
|
||
0x0391 | HAS_VOWEL, // α
|
||
0x0392, // β
|
||
0x0393, // γ
|
||
0x0394, // δ
|
||
0x0395 | HAS_VOWEL, // ε
|
||
0x0396, // ζ
|
||
0x0397 | HAS_VOWEL, // η
|
||
0x0398, // θ
|
||
0x0399 | HAS_VOWEL, // ι
|
||
0x039A, // κ
|
||
0x039B, // λ
|
||
0x039C, // μ
|
||
0x039D, // ν
|
||
0x039E, // ξ
|
||
0x039F | HAS_VOWEL, // ο
|
||
0x03A0, // π
|
||
0x03A1, // ρ
|
||
0x03A3, // ς
|
||
0x03A3, // σ
|
||
0x03A4, // τ
|
||
0x03A5 | HAS_VOWEL, // υ
|
||
0x03A6, // φ
|
||
0x03A7, // χ
|
||
0x03A8, // ψ
|
||
0x03A9 | HAS_VOWEL, // ω
|
||
0x0399 | HAS_VOWEL | HAS_DIALYTIKA, // ϊ
|
||
0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, // ϋ
|
||
0x039F | HAS_VOWEL | HAS_ACCENT, // ό
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ύ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ώ
|
||
0x03CF, // Ϗ
|
||
0x0392, // ϐ
|
||
0x0398, // ϑ
|
||
0x03D2, // ϒ
|
||
0x03D2 | HAS_ACCENT, // ϓ
|
||
0x03D2 | HAS_DIALYTIKA, // ϔ
|
||
0x03A6, // ϕ
|
||
0x03A0, // ϖ
|
||
0x03CF, // ϗ
|
||
0x03D8, // Ϙ
|
||
0x03D8, // ϙ
|
||
0x03DA, // Ϛ
|
||
0x03DA, // ϛ
|
||
0x03DC, // Ϝ
|
||
0x03DC, // ϝ
|
||
0x03DE, // Ϟ
|
||
0x03DE, // ϟ
|
||
0x03E0, // Ϡ
|
||
0x03E0, // ϡ
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0x039A, // ϰ
|
||
0x03A1, // ϱ
|
||
0x03F9, // ϲ
|
||
0x037F, // ϳ
|
||
0x03F4, // ϴ
|
||
0x0395 | HAS_VOWEL, // ϵ
|
||
0,
|
||
0x03F7, // Ϸ
|
||
0x03F7, // ϸ
|
||
0x03F9, // Ϲ
|
||
0x03FA, // Ϻ
|
||
0x03FA, // ϻ
|
||
0x03FC, // ϼ
|
||
0x03FD, // Ͻ
|
||
0x03FE, // Ͼ
|
||
0x03FF, // Ͽ
|
||
};
|
||
|
||
private static final char[] data1F00 = {
|
||
// U+1F00..1FFF
|
||
0x0391 | HAS_VOWEL, // ἀ
|
||
0x0391 | HAS_VOWEL, // ἁ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἂ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἃ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἄ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἅ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἆ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // ἇ
|
||
0x0391 | HAS_VOWEL, // Ἀ
|
||
0x0391 | HAS_VOWEL, // Ἁ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἂ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἃ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἄ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἅ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἆ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ἇ
|
||
0x0395 | HAS_VOWEL, // ἐ
|
||
0x0395 | HAS_VOWEL, // ἑ
|
||
0x0395 | HAS_VOWEL | HAS_ACCENT, // ἒ
|
||
0x0395 | HAS_VOWEL | HAS_ACCENT, // ἓ
|
||
0x0395 | HAS_VOWEL | HAS_ACCENT, // ἔ
|
||
0x0395 | HAS_VOWEL | HAS_ACCENT, // ἕ
|
||
0,
|
||
0,
|
||
0x0395 | HAS_VOWEL, // Ἐ
|
||
0x0395 | HAS_VOWEL, // Ἑ
|
||
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἒ
|
||
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἓ
|
||
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἔ
|
||
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ἕ
|
||
0,
|
||
0,
|
||
0x0397 | HAS_VOWEL, // ἠ
|
||
0x0397 | HAS_VOWEL, // ἡ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἢ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἣ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἤ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἥ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἦ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // ἧ
|
||
0x0397 | HAS_VOWEL, // Ἠ
|
||
0x0397 | HAS_VOWEL, // Ἡ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἢ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἣ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἤ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἥ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἦ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ἧ
|
||
0x0399 | HAS_VOWEL, // ἰ
|
||
0x0399 | HAS_VOWEL, // ἱ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἲ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἳ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἴ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἵ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἶ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // ἷ
|
||
0x0399 | HAS_VOWEL, // Ἰ
|
||
0x0399 | HAS_VOWEL, // Ἱ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἲ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἳ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἴ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἵ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἶ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ἷ
|
||
0x039F | HAS_VOWEL, // ὀ
|
||
0x039F | HAS_VOWEL, // ὁ
|
||
0x039F | HAS_VOWEL | HAS_ACCENT, // ὂ
|
||
0x039F | HAS_VOWEL | HAS_ACCENT, // ὃ
|
||
0x039F | HAS_VOWEL | HAS_ACCENT, // ὄ
|
||
0x039F | HAS_VOWEL | HAS_ACCENT, // ὅ
|
||
0,
|
||
0,
|
||
0x039F | HAS_VOWEL, // Ὀ
|
||
0x039F | HAS_VOWEL, // Ὁ
|
||
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὂ
|
||
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὃ
|
||
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὄ
|
||
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὅ
|
||
0,
|
||
0,
|
||
0x03A5 | HAS_VOWEL, // ὐ
|
||
0x03A5 | HAS_VOWEL, // ὑ
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὒ
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὓ
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὔ
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὕ
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὖ
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὗ
|
||
0,
|
||
0x03A5 | HAS_VOWEL, // Ὑ
|
||
0,
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὓ
|
||
0,
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὕ
|
||
0,
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὗ
|
||
0x03A9 | HAS_VOWEL, // ὠ
|
||
0x03A9 | HAS_VOWEL, // ὡ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὢ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὣ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὤ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὥ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὦ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὧ
|
||
0x03A9 | HAS_VOWEL, // Ὠ
|
||
0x03A9 | HAS_VOWEL, // Ὡ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὢ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὣ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὤ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὥ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὦ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὧ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // ὰ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // ά
|
||
0x0395 | HAS_VOWEL | HAS_ACCENT, // ὲ
|
||
0x0395 | HAS_VOWEL | HAS_ACCENT, // έ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // ὴ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // ή
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // ὶ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // ί
|
||
0x039F | HAS_VOWEL | HAS_ACCENT, // ὸ
|
||
0x039F | HAS_VOWEL | HAS_ACCENT, // ό
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ὺ
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ύ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ὼ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ώ
|
||
0,
|
||
0,
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾀ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾁ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾂ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾃ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾄ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾅ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾆ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾇ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾈ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾉ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾊ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾋ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾌ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾍ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾎ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾏ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾐ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾑ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾒ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾓ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾔ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾕ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾖ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾗ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾘ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾙ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾚ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾛ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾜ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾝ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾞ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾟ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾠ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾡ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾢ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾣ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾤ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾥ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾦ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾧ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾨ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾩ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾪ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾫ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾬ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾭ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾮ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾯ
|
||
0x0391 | HAS_VOWEL, // ᾰ
|
||
0x0391 | HAS_VOWEL, // ᾱ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾲ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾳ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾴ
|
||
0,
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // ᾶ
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ᾷ
|
||
0x0391 | HAS_VOWEL, // Ᾰ
|
||
0x0391 | HAS_VOWEL, // Ᾱ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ὰ
|
||
0x0391 | HAS_VOWEL | HAS_ACCENT, // Ά
|
||
0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ᾼ
|
||
0,
|
||
0x0399 | HAS_VOWEL, // ι
|
||
0,
|
||
0,
|
||
0,
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῂ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῃ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῄ
|
||
0,
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // ῆ
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῇ
|
||
0x0395 | HAS_VOWEL | HAS_ACCENT, // Ὲ
|
||
0x0395 | HAS_VOWEL | HAS_ACCENT, // Έ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ὴ
|
||
0x0397 | HAS_VOWEL | HAS_ACCENT, // Ή
|
||
0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῌ
|
||
0,
|
||
0,
|
||
0,
|
||
0x0399 | HAS_VOWEL, // ῐ
|
||
0x0399 | HAS_VOWEL, // ῑ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῒ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΐ
|
||
0,
|
||
0,
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // ῖ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῗ
|
||
0x0399 | HAS_VOWEL, // Ῐ
|
||
0x0399 | HAS_VOWEL, // Ῑ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ὶ
|
||
0x0399 | HAS_VOWEL | HAS_ACCENT, // Ί
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0x03A5 | HAS_VOWEL, // ῠ
|
||
0x03A5 | HAS_VOWEL, // ῡ
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῢ
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ΰ
|
||
0x03A1, // ῤ
|
||
0x03A1, // ῥ
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // ῦ
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, // ῧ
|
||
0x03A5 | HAS_VOWEL, // Ῠ
|
||
0x03A5 | HAS_VOWEL, // Ῡ
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ὺ
|
||
0x03A5 | HAS_VOWEL | HAS_ACCENT, // Ύ
|
||
0x03A1, // Ῥ
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0,
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῲ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῳ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῴ
|
||
0,
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // ῶ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, // ῷ
|
||
0x039F | HAS_VOWEL | HAS_ACCENT, // Ὸ
|
||
0x039F | HAS_VOWEL | HAS_ACCENT, // Ό
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ὼ
|
||
0x03A9 | HAS_VOWEL | HAS_ACCENT, // Ώ
|
||
0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, // ῼ
|
||
0,
|
||
0,
|
||
0,
|
||
};
|
||
|
||
// U+2126 Ohm sign
|
||
private static final char data2126 = 0x03A9 | HAS_VOWEL; // Ω
|
||
|
||
private static final int getLetterData(int c) {
|
||
if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
|
||
return 0;
|
||
} else if (c <= 0x3ff) {
|
||
return data0370[c - 0x370];
|
||
} else if (c <= 0x1fff) {
|
||
return data1F00[c - 0x1f00];
|
||
} else if (c == 0x2126) {
|
||
return data2126;
|
||
} else {
|
||
return 0;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Returns a non-zero value for each of the Greek combining diacritics
|
||
* listed in The Unicode Standard, version 8, chapter 7.2 Greek,
|
||
* plus some perispomeni look-alikes.
|
||
*/
|
||
private static final int getDiacriticData(int c) {
|
||
switch (c) {
|
||
case '\u0300': // varia
|
||
case '\u0301': // tonos = oxia
|
||
case '\u0342': // perispomeni
|
||
case '\u0302': // circumflex can look like perispomeni
|
||
case '\u0303': // tilde can look like perispomeni
|
||
case '\u0311': // inverted breve can look like perispomeni
|
||
return HAS_ACCENT;
|
||
case '\u0308': // dialytika = diaeresis
|
||
return HAS_COMBINING_DIALYTIKA;
|
||
case '\u0344': // dialytika tonos
|
||
return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
|
||
case '\u0345': // ypogegrammeni = iota subscript
|
||
return HAS_YPOGEGRAMMENI;
|
||
case '\u0304': // macron
|
||
case '\u0306': // breve
|
||
case '\u0313': // comma above
|
||
case '\u0314': // reversed comma above
|
||
case '\u0343': // koronis
|
||
return HAS_OTHER_GREEK_DIACRITIC;
|
||
default:
|
||
return 0;
|
||
}
|
||
}
|
||
|
||
private static boolean isFollowedByCasedLetter(CharSequence s, int i) {
|
||
while (i < s.length()) {
|
||
int c = Character.codePointAt(s, i);
|
||
int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
|
||
if ((type & UCaseProps.IGNORABLE) != 0) {
|
||
// Case-ignorable, continue with the loop.
|
||
i += Character.charCount(c);
|
||
} else if (type != UCaseProps.NONE) {
|
||
return true; // Followed by cased letter.
|
||
} else {
|
||
return false; // Uncased and not case-ignorable.
|
||
}
|
||
}
|
||
return false; // Not followed by cased letter.
|
||
}
|
||
|
||
/**
|
||
* Greek string uppercasing with a state machine.
|
||
* Probably simpler than a stateless function that has to figure out complex context-before
|
||
* for each character.
|
||
* TODO: Try to re-consolidate one way or another with the non-Greek function.
|
||
*
|
||
* <p>Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8).
|
||
* @throws IOException
|
||
*/
|
||
private static <A extends Appendable> A toUpper(int options,
|
||
CharSequence src, A dest, Edits edits) throws IOException {
|
||
int state = 0;
|
||
for (int i = 0; i < src.length();) {
|
||
int c = Character.codePointAt(src, i);
|
||
int nextIndex = i + Character.charCount(c);
|
||
int nextState = 0;
|
||
int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
|
||
if ((type & UCaseProps.IGNORABLE) != 0) {
|
||
// c is case-ignorable
|
||
nextState |= (state & AFTER_CASED);
|
||
} else if (type != UCaseProps.NONE) {
|
||
// c is cased
|
||
nextState |= AFTER_CASED;
|
||
}
|
||
int data = getLetterData(c);
|
||
if (data > 0) {
|
||
int upper = data & UPPER_MASK;
|
||
// Add a dialytika to this iota or ypsilon vowel
|
||
// if we removed a tonos from the previous vowel,
|
||
// and that previous vowel did not also have (or gain) a dialytika.
|
||
// Adding one only to the final vowel in a longer sequence
|
||
// (which does not occur in normal writing) would require lookahead.
|
||
// Set the same flag as for preserving an existing dialytika.
|
||
if ((data & HAS_VOWEL) != 0
|
||
&& (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) != 0
|
||
&& (upper == 'Ι' || upper == 'Υ')) {
|
||
data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) != 0 ? HAS_DIALYTIKA
|
||
: HAS_COMBINING_DIALYTIKA;
|
||
}
|
||
int numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
|
||
if ((data & HAS_YPOGEGRAMMENI) != 0) {
|
||
numYpogegrammeni = 1;
|
||
}
|
||
final boolean hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
|
||
// Skip combining diacritics after this Greek letter.
|
||
while (nextIndex < src.length()) {
|
||
int diacriticData = getDiacriticData(src.charAt(nextIndex));
|
||
if (diacriticData != 0) {
|
||
data |= diacriticData;
|
||
if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
|
||
++numYpogegrammeni;
|
||
}
|
||
++nextIndex;
|
||
} else {
|
||
break; // not a Greek diacritic
|
||
}
|
||
}
|
||
if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
|
||
nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
|
||
: AFTER_VOWEL_WITH_COMBINING_ACCENT;
|
||
}
|
||
// Map according to Greek rules.
|
||
boolean addTonos = false;
|
||
if (upper == 'Η' &&
|
||
(data & HAS_ACCENT) != 0 &&
|
||
numYpogegrammeni == 0 &&
|
||
(state & AFTER_CASED) == 0 &&
|
||
!isFollowedByCasedLetter(src, nextIndex)) {
|
||
// Keep disjunctive "or" with (only) a tonos.
|
||
// We use the same "word boundary" conditions as for the Final_Sigma test.
|
||
if (hasPrecomposedAccent) {
|
||
upper = 'Ή'; // Preserve the precomposed form.
|
||
} else {
|
||
addTonos = true;
|
||
}
|
||
} else if ((data & HAS_DIALYTIKA) != 0) {
|
||
// Preserve a vowel with dialytika in precomposed form if it exists.
|
||
if (upper == 'Ι') {
|
||
upper = 'Ϊ';
|
||
data &= ~HAS_EITHER_DIALYTIKA;
|
||
} else if (upper == 'Υ') {
|
||
upper = 'Ϋ';
|
||
data &= ~HAS_EITHER_DIALYTIKA;
|
||
}
|
||
}
|
||
|
||
boolean change;
|
||
if (edits == null && (options & OMIT_UNCHANGED_TEXT) == 0) {
|
||
change = true; // common, simple usage
|
||
} else {
|
||
// Find out first whether we are changing the text.
|
||
change = src.charAt(i) != upper || numYpogegrammeni > 0;
|
||
int i2 = i + 1;
|
||
if ((data & HAS_EITHER_DIALYTIKA) != 0) {
|
||
change |= i2 >= nextIndex || src.charAt(i2) != 0x308;
|
||
++i2;
|
||
}
|
||
if (addTonos) {
|
||
change |= i2 >= nextIndex || src.charAt(i2) != 0x301;
|
||
++i2;
|
||
}
|
||
int oldLength = nextIndex - i;
|
||
int newLength = (i2 - i) + numYpogegrammeni;
|
||
change |= oldLength != newLength;
|
||
if (change) {
|
||
if (edits != null) {
|
||
edits.addReplace(oldLength, newLength);
|
||
}
|
||
} else {
|
||
if (edits != null) {
|
||
edits.addUnchanged(oldLength);
|
||
}
|
||
// Write unchanged text?
|
||
change = (options & OMIT_UNCHANGED_TEXT) == 0;
|
||
}
|
||
}
|
||
|
||
if (change) {
|
||
dest.append((char)upper);
|
||
if ((data & HAS_EITHER_DIALYTIKA) != 0) {
|
||
dest.append('\u0308'); // restore or add a dialytika
|
||
}
|
||
if (addTonos) {
|
||
dest.append('\u0301');
|
||
}
|
||
while (numYpogegrammeni > 0) {
|
||
dest.append('Ι');
|
||
--numYpogegrammeni;
|
||
}
|
||
}
|
||
} else {
|
||
c = UCaseProps.INSTANCE.toFullUpper(c, null, dest, UCaseProps.LOC_GREEK);
|
||
appendResult(c, dest, nextIndex - i, options, edits);
|
||
}
|
||
i = nextIndex;
|
||
state = nextState;
|
||
}
|
||
return dest;
|
||
}
|
||
}
|
||
}
|