/* GENERATED SOURCE. DO NOT MODIFY. */ // © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2009-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* */ package android.icu.impl; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import android.icu.text.UTF16; import android.icu.text.UnicodeSet; import android.icu.util.CodePointMap; import android.icu.util.CodePointTrie; import android.icu.util.ICUUncheckedIOException; import android.icu.util.MutableCodePointTrie; import android.icu.util.VersionInfo; /** * Low-level implementation of the Unicode Normalization Algorithm. * For the data structure and details see the documentation at the end of * C++ normalizer2impl.h and in the design doc at * https://icu.unicode.org/design/normalization/custom * @hide Only a subset of ICU is exposed in Android */ public final class Normalizer2Impl { /** * @hide Only a subset of ICU is exposed in Android */ public static final class Hangul { /* Korean Hangul and Jamo constants */ public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ public static final int JAMO_L_END=0x1112; public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ public static final int JAMO_V_END=0x1175; public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ public static final int JAMO_T_END=0x11c2; public static final int HANGUL_BASE=0xac00; public static final int HANGUL_END=0xd7a3; public static final int JAMO_L_COUNT=19; public static final int JAMO_V_COUNT=21; public static final int JAMO_T_COUNT=28; public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT; public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT; public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT; public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; public static boolean isHangul(int c) { return HANGUL_BASE<=c && c * If dest is a StringBuilder, then the buffer writes directly to it. * Otherwise, the buffer maintains a StringBuilder for intermediate text segments * until no further changes are necessary and whole segments are appended. * append() methods that take combining-class values always write to the StringBuilder. * Other append() methods flush and append to the Appendable. * @hide Only a subset of ICU is exposed in Android */ public static final class ReorderingBuffer implements Appendable { public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) { impl=ni; app=dest; if(app instanceof StringBuilder) { appIsStringBuilder=true; str=(StringBuilder)dest; // In Java, the constructor subsumes public void init(int destCapacity) { str.ensureCapacity(destCapacity); reorderStart=0; if(str.length()==0) { lastCC=0; } else { setIterator(); lastCC=previousCC(); // Set reorderStart after the last code point with cc<=1 if there is one. 
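                    // The backward scan below walks over any trailing combining marks with
                    // cc>1 so that later append(c, cc) calls can still reorder into this open
                    // sequence; reorderStart ends up just after the last code point whose cc
                    // is 0 or 1. For example, if dest already ends with U+0041 U+0316 U+0301
                    // (ccc 0, 220, 230), reorderStart is set to just after U+0041.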
if(lastCC>1) { while(previousCC()>1) {} } reorderStart=codePointLimit; } } else { appIsStringBuilder=false; str=new StringBuilder(); reorderStart=0; lastCC=0; } } public boolean isEmpty() { return str.length()==0; } public int length() { return str.length(); } public int getLastCC() { return lastCC; } public StringBuilder getStringBuilder() { return str; } public boolean equals(CharSequence s, int start, int limit) { return UTF16Plus.equal(str, 0, str.length(), s, start, limit); } public void append(int c, int cc) { if(lastCC<=cc || cc==0) { str.appendCodePoint(c); lastCC=cc; if(cc<=1) { reorderStart=str.length(); } } else { insert(c, cc); } } public void append(CharSequence s, int start, int limit, boolean isNFD, int leadCC, int trailCC) { if(start==limit) { return; } if(lastCC<=leadCC || leadCC==0) { if(trailCC<=1) { reorderStart=str.length()+(limit-start); } else if(leadCC<=1) { reorderStart=str.length()+1; // Ok if not a code point boundary. } str.append(s, start, limit); lastCC=trailCC; } else { int c=Character.codePointAt(s, start); start+=Character.charCount(c); insert(c, leadCC); // insert first code point while(startcc;) {} // insert c at codePointLimit, after the character with prevCC<=cc if(c<=0xffff) { str.insert(codePointLimit, (char)c); if(cc<=1) { reorderStart=codePointLimit+1; } } else { str.insert(codePointLimit, Character.toChars(c)); if(cc<=1) { reorderStart=codePointLimit+2; } } } private final Normalizer2Impl impl; private final Appendable app; private final StringBuilder str; private final boolean appIsStringBuilder; private int reorderStart; private int lastCC; // private backward iterator private void setIterator() { codePointStart=str.length(); } private void skipPrevious() { // Requires 0=codePointStart) { return 0; } int c=str.codePointBefore(codePointStart); codePointStart-=Character.charCount(c); return impl.getCCFromYesOrMaybeCP(c); } private int codePointStart, codePointLimit; } // TODO: Propose as public API on the UTF16 class. // TODO: Propose widening UTF16 methods that take char to take int. // TODO: Propose widening UTF16 methods that take String to take CharSequence. /** * @hide Only a subset of ICU is exposed in Android */ public static final class UTF16Plus { /** * Is this code point a lead surrogate (U+d800..U+dbff)? * @param c code unit or code point * @return true or false */ public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; } /** * Is this code point a trail surrogate (U+dc00..U+dfff)? * @param c code unit or code point * @return true or false */ public static boolean isTrailSurrogate(int c) { return (c & 0xfffffc00) == 0xdc00; } /** * Is this code point a surrogate (U+d800..U+dfff)? * @param c code unit or code point * @return true or false */ public static boolean isSurrogate(int c) { return (c & 0xfffff800) == 0xd800; } /** * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), * is it a lead surrogate? * @param c code unit or code point * @return true or false */ public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } /** * Compares two CharSequence objects for binary equality. * @param s1 first sequence * @param s2 second sequence * @return true if s1 contains the same text as s2 */ public static boolean equal(CharSequence s1, CharSequence s2) { if(s1==s2) { return true; } int length=s1.length(); if(length!=s2.length()) { return false; } for(int i=0; i>DELTA_SHIFT)-MAX_DELTA-1; // Read the normTrie. 
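        // The IX_NORM_TRIE_OFFSET and IX_EXTRA_DATA_OFFSET indexes delimit the serialized
        // code point trie: (nextOffset-offset) is the number of bytes reserved for it,
        // including any padding that follows the trie data itself.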
int offset=inIndexes[IX_NORM_TRIE_OFFSET]; int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; int triePosition = bytes.position(); normTrie = CodePointTrie.Fast16.fromBinary(bytes); int trieLength = bytes.position() - triePosition; if(trieLength>(nextOffset-offset)) { throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie"); } ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes // Read the composition and mapping data. offset=nextOffset; nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; int numChars=(nextOffset-offset)/2; if(numChars!=0) { maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); } // smallFCD: new in formatVersion 2 offset=nextOffset; smallFCD=new byte[0x100]; bytes.get(smallFCD); return this; } catch(IOException e) { throw new ICUUncheckedIOException(e); } } public Normalizer2Impl load(String name) { return load(ICUBinary.getRequiredData(name)); } public void addLcccChars(UnicodeSet set) { int start = 0; CodePointMap.Range range = new CodePointMap.Range(); while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT, null, range)) { int end = range.getEnd(); int norm16 = range.getValue(); if (norm16 > MIN_NORMAL_MAYBE_YES && norm16 != JAMO_VT) { set.add(start, end); } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) { int fcd16 = getFCD16(start); if (fcd16 > 0xff) { set.add(start, end); } } start = end + 1; } } public void addPropertyStarts(UnicodeSet set) { // Add the start code point of each same-value range of the trie. int start = 0; CodePointMap.Range range = new CodePointMap.Range(); while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT, null, range)) { int end = range.getEnd(); int value = range.getValue(); set.add(start); if (start != end && isAlgorithmicNoNo(value) && (value & DELTA_TCCC_MASK) > DELTA_TCCC_1) { // Range of code points with same-norm16-value algorithmic decompositions. // They might have different non-zero FCD16 values. int prevFCD16 = getFCD16(start); while (++start <= end) { int fcd16 = getFCD16(start); if (fcd16 != prevFCD16) { set.add(start); prevFCD16 = fcd16; } } } start = end + 1; } /* add Hangul LV syllables and LV+1 because of skippables */ for(int c=Hangul.HANGUL_BASE; c(); int start = 0; CodePointMap.Range range = new CodePointMap.Range(); while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT, null, range)) { final int end = range.getEnd(); final int norm16 = range.getValue(); if(isInert(norm16) || (minYesNo<=norm16 && norm16 minYesNo) { // c decomposes, get everything from the variable-length extra data int mapping=norm16_2>>OFFSET_SHIFT; int firstUnit=extraData.charAt(mapping); int length=firstUnit&MAPPING_LENGTH_MASK; if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { if(c==c2 && (extraData.charAt(mapping-1)&0xff)!=0) { newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 } } // Skip empty mappings (no characters in the decomposition). if(length!=0) { ++mapping; // skip over the firstUnit // add c to first code point's start set int limit=mapping+length; c2=extraData.codePointAt(mapping); addToStartSet(mutableTrie, c, c2); // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a // one-way mapping. A 2-way mapping is possible here after // intermediate algorithmic mapping. 
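                            // norm16_2>=minNoNo means this is a one-way (noNo) mapping; only in
                            // that case are the decomposition's non-initial code points flagged
                            // below, while a round-trip (two-way) mapping is left alone.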
if(norm16_2>=minNoNo) { while((mapping+=Character.charCount(c2))=MIN_NORMAL_MAYBE_YES) { return getCCFromNormalYesOrMaybe(norm16); } if(norm16> OFFSET_SHIFT) & 0xff; } public static int getCCFromYesOrMaybe(int norm16) { return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; } public int getCCFromYesOrMaybeCP(int c) { if (c < minCompNoMaybeCP) { return 0; } return getCCFromYesOrMaybe(getNorm16(c)); } /** * Returns the FCD data for code point c. * @param c A Unicode code point. * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. */ public int getFCD16(int c) { if(c>8]; if(bits==0) { return false; } return ((bits>>((lead>>5)&7))&1)!=0; } /** Gets the FCD value from the regular normalization data. */ public int getFCD16FromNormData(int c) { int norm16=getNorm16(c); if (norm16 >= limitNoNo) { if(norm16>=MIN_NORMAL_MAYBE_YES) { // combining mark norm16=getCCFromNormalYesOrMaybe(norm16); return norm16|(norm16<<8); } else if(norm16>=minMaybeYes) { return 0; } else { // isDecompNoAlgorithmic(norm16) int deltaTrailCC = norm16 & DELTA_TCCC_MASK; if (deltaTrailCC <= DELTA_TCCC_1) { return deltaTrailCC >> OFFSET_SHIFT; } // Maps to an isCompYesAndZeroCC. c=mapAlgorithmic(c, norm16); norm16 = getRawNorm16(c); } } if(norm16<=minYesNo || isHangulLVT(norm16)) { // no decomposition or Hangul syllable, all zeros return 0; } // c decomposes, get everything from the variable-length extra data int mapping=norm16>>OFFSET_SHIFT; int firstUnit=extraData.charAt(mapping); int fcd16=firstUnit>>8; // tccc if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc } return fcd16; } /** * Gets the decomposition for one code point. * @param c code point * @return c's decomposition, if it has one; returns null if it does not have a decomposition */ public String getDecomposition(int c) { int norm16; if(c>OFFSET_SHIFT; int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; return extraData.substring(mapping, mapping+length); } /** * Gets the raw decomposition for one code point. * @param c code point * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition */ public String getRawDecomposition(int c) { int norm16; if(c>OFFSET_SHIFT; int firstUnit=extraData.charAt(mapping); int mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) { // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. // Bit 7=MAPPING_HAS_CCC_LCCC_WORD int rawMapping=mapping-((firstUnit>>7)&1)-1; char rm0=extraData.charAt(rawMapping); if(rm0<=MAPPING_LENGTH_MASK) { return extraData.substring(rawMapping-rm0, rawMapping); } else { // Copy the normal mapping and replace its first two code units with rm0. StringBuilder buffer=new StringBuilder(mLength-1).append(rm0); mapping+=1+2; // skip over the firstUnit and the first two mapping code units return buffer.append(extraData, mapping, mapping+mLength-2).toString(); } } else { mapping+=1; // skip over the firstUnit return extraData.substring(mapping, mapping+mLength); } } /** * Returns true if code point c starts a canonical-iterator string segment. * {@link #ensureCanonIterData()} must have been called before this method, * or else this method will crash. * @param c A Unicode code point. * @return true if c starts a canonical-iterator string segment. */ public boolean isCanonSegmentStarter(int c) { return canonIterData.get(c)>=0; } /** * Returns true if there are characters whose decomposition starts with c. 
* If so, then the set is cleared and then filled with those characters. * {@link #ensureCanonIterData()} must have been called before this method, * or else this method will crash. * @param c A Unicode code point. * @param set A UnicodeSet to receive the characters whose decompositions * start with c, if there are any. * @return true if there are characters whose decomposition starts with c. */ public boolean getCanonStartSet(int c, UnicodeSet set) { int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER; if(canonValue==0) { return false; } set.clear(); int value=canonValue&CANON_VALUE_MASK; if((canonValue&CANON_HAS_SET)!=0) { set.addAll(canonStartSets.get(value)); } else if(value!=0) { set.add(value); } if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { int norm16 = getRawNorm16(c); if(norm16==JAMO_L) { int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT; set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1); } else { addComposites(getCompositionsList(norm16), set); } } return true; } // Fixed norm16 values. public static final int MIN_YES_YES_WITH_CC=0xfe02; public static final int JAMO_VT=0xfe00; public static final int MIN_NORMAL_MAYBE_YES=0xfc00; public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=false public static final int INERT=1; // offset=0 hasCompBoundaryAfter=true // norm16 bit 0 is comp-boundary-after. public static final int HAS_COMP_BOUNDARY_AFTER=1; public static final int OFFSET_SHIFT=1; // For algorithmic one-way mappings, norm16 bits 2..1 indicate the // tccc (0, 1, >1) for quick FCC boundary-after tests. public static final int DELTA_TCCC_0=0; public static final int DELTA_TCCC_1=2; public static final int DELTA_TCCC_GT_1=4; public static final int DELTA_TCCC_MASK=6; public static final int DELTA_SHIFT=3; public static final int MAX_DELTA=0x40; // Byte offsets from the start of the data, after the generic header. public static final int IX_NORM_TRIE_OFFSET=0; public static final int IX_EXTRA_DATA_OFFSET=1; public static final int IX_SMALL_FCD_OFFSET=2; public static final int IX_RESERVED3_OFFSET=3; public static final int IX_TOTAL_SIZE=7; // Code point thresholds for quick check codes. public static final int IX_MIN_DECOMP_NO_CP=8; public static final int IX_MIN_COMP_NO_MAYBE_CP=9; // Norm16 value thresholds for quick check combinations and types of extra data. /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ public static final int IX_MIN_YES_NO=10; /** Mappings are comp-normalized. */ public static final int IX_MIN_NO_NO=11; public static final int IX_LIMIT_NO_NO=12; public static final int IX_MIN_MAYBE_YES=13; /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; /** Mappings are not comp-normalized but have a comp boundary before. */ public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; /** Mappings do not have a comp boundary before. */ public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; /** Mappings to the empty string. 
*/ public static final int IX_MIN_NO_NO_EMPTY=17; public static final int IX_MIN_LCCC_CP=18; public static final int IX_COUNT=20; public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; public static final int MAPPING_HAS_RAW_MAPPING=0x40; // unused bit 0x20; public static final int MAPPING_LENGTH_MASK=0x1f; public static final int COMP_1_LAST_TUPLE=0x8000; public static final int COMP_1_TRIPLE=1; public static final int COMP_1_TRAIL_LIMIT=0x3400; public static final int COMP_1_TRAIL_MASK=0x7ffe; public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit public static final int COMP_2_TRAIL_SHIFT=6; public static final int COMP_2_TRAIL_MASK=0xffc0; // higher-level functionality ------------------------------------------ *** // NFD without an NFD Normalizer2 instance. public Appendable decompose(CharSequence s, StringBuilder dest) { decompose(s, 0, s.length(), dest, s.length()); return dest; } /** * Decomposes s[src, limit[ and writes the result to dest. * limit can be NULL if src is NUL-terminated. * destLengthEstimate is the initial dest buffer capacity and can be -1. */ public void decompose(CharSequence s, int src, int limit, StringBuilder dest, int destLengthEstimate) { if(destLengthEstimate<0) { destLengthEstimate=limit-src; } dest.setLength(0); ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); decompose(s, src, limit, buffer); } // Dual functionality: // buffer!=NULL: normalize // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes public int decompose(CharSequence s, int src, int limit, ReorderingBuffer buffer) { int minNoCP=minDecompNoCP; int prevSrc; int c=0; int norm16=0; // only for quick check int prevBoundary=src; int prevCC=0; for(;;) { // count code units below the minimum or with irrelevant data for the quick check for(prevSrc=src; src!=limit;) { if( (c=s.charAt(src))=limit) { break; } c=Character.codePointAt(s, src); cc=getCC(getNorm16(c)); }; buffer.append(s, 0, src, false, firstCC, prevCC); buffer.append(s, src, limit); } // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. // doCompose: normalize // !doCompose: isNormalized (buffer must be empty and initialized) public boolean compose(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doCompose, ReorderingBuffer buffer) { int prevBoundary=src; int minNoMaybeCP=minCompNoMaybeCP; for (;;) { // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, // or with (compYes && ccc==0) properties. int prevSrc; int c = 0; int norm16 = 0; for (;;) { if (src == limit) { if (prevBoundary != limit && doCompose) { buffer.append(s, prevBoundary, limit); } return true; } if( (c=s.charAt(src))=minNoNo. // The current character is either a "noNo" (has a mapping) // or a "maybeYes" (combines backward) // or a "yesYes" with ccc!=0. // It is not a Hangul syllable or Jamo L because those have "yes" properties. // Medium-fast path: Handle cases that do not require full decomposition and recomposition. if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes if (!doCompose) { return false; } // Fast path for mapping a character that is immediately surrounded by boundaries. // In this case, we need not decompose around the current character. if (isDecompNoAlgorithmic(norm16)) { // Maps to a single isCompYesAndZeroCC character // which also implies hasCompBoundaryBefore. 
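                        // An algorithmic mapping replaces c with a single code point that is a
                        // fixed delta away (mapAlgorithmic()). If there is a composition boundary
                        // after that mapping, or before the next character, the replacement can
                        // be appended directly without decomposing the surrounding text.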
if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || hasCompBoundaryBefore(s, src, limit)) { if (prevBoundary != prevSrc) { buffer.append(s, prevBoundary, prevSrc); } buffer.append(mapAlgorithmic(c, norm16), 0); prevBoundary = src; continue; } } else if (norm16 < minNoNoCompBoundaryBefore) { // The mapping is comp-normalized which also implies hasCompBoundaryBefore. if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || hasCompBoundaryBefore(s, src, limit)) { if (prevBoundary != prevSrc) { buffer.append(s, prevBoundary, prevSrc); } int mapping = norm16 >> OFFSET_SHIFT; int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; buffer.append(extraData, mapping, mapping + length); prevBoundary = src; continue; } } else if (norm16 >= minNoNoEmpty) { // The current character maps to nothing. // Simply omit it from the output if there is a boundary before _or_ after it. // The character itself implies no boundaries. if (hasCompBoundaryBefore(s, src, limit) || hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { if (prevBoundary != prevSrc) { buffer.append(s, prevBoundary, prevSrc); } prevBoundary = src; continue; } } // Other "noNo" type, or need to examine more text around this character: // Fall through to the slow path. } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { char prev=s.charAt(prevSrc-1); if(c= 0) { int syllable = Hangul.HANGUL_BASE + (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * Hangul.JAMO_T_COUNT + t; --prevSrc; // Replace the Jamo L as well. if (prevBoundary != prevSrc) { buffer.append(s, prevBoundary, prevSrc); } buffer.append((char)syllable); prevBoundary = src; continue; } // If we see L+V+x where x!=T then we drop to the slow path, // decompose and recompose. // This is to deal with NFKC finding normal L and V but a // compatibility variant of a T. // We need to either fully compose that combination here // (which would complicate the code and may not work with strange custom data) // or use the slow path. } } else if (Hangul.isHangulLV(prev)) { // The current character is a Jamo Trailing consonant, // compose with previous Hangul LV that does not contain a Jamo T. if (!doCompose) { return false; } int syllable = prev + c - Hangul.JAMO_T_BASE; --prevSrc; // Replace the Hangul LV as well. if (prevBoundary != prevSrc) { buffer.append(s, prevBoundary, prevSrc); } buffer.append((char)syllable); prevBoundary = src; continue; } // No matching context, or may need to decompose surrounding text first: // Fall through to the slow path. } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC // One or more combining marks that do not combine-back: // Check for canonical order, copy unchanged if ok and // if followed by a character with a boundary-before. int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { // Fails FCD test, need to decompose and contiguously recompose. if (!doCompose) { return false; } } else { // If !onlyContiguous (not FCC), then we ignore the tccc of // the previous character which passed the quick check "yes && ccc==0" test. 
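                    // The loop below scans forward over a run of combining marks that are
                    // already in canonical order (non-decreasing ccc). It stops at the first
                    // out-of-order mark, which forces the slow decompose-and-recompose path,
                    // or at the first character that is not such a mark.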
int n16; for (;;) { if (src == limit) { if (doCompose) { buffer.append(s, prevBoundary, limit); } return true; } int prevCC = cc; c = Character.codePointAt(s, src); n16 = normTrie.get(c); if (n16 >= MIN_YES_YES_WITH_CC) { cc = getCCFromNormalYesOrMaybe(n16); if (prevCC > cc) { if (!doCompose) { return false; } break; } } else { break; } src += Character.charCount(c); } // p is after the last in-order combining mark. // If there is a boundary here, then we continue with no change. if (norm16HasCompBoundaryBefore(n16)) { if (isCompYesAndZeroCC(n16)) { src += Character.charCount(c); } continue; } // Use the slow path. There is no boundary in [prevSrc, src[. } } // Slow path: Find the nearest boundaries around the current character, // decompose and recompose. if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { c = Character.codePointBefore(s, prevSrc); norm16 = normTrie.get(c); if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { prevSrc -= Character.charCount(c); } } if (doCompose && prevBoundary != prevSrc) { buffer.append(s, prevBoundary, prevSrc); } int recomposeStartIndex=buffer.length(); // We know there is not a boundary here. decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, buffer); // Decompose until the next boundary. src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, buffer); recompose(buffer, recomposeStartIndex, onlyContiguous); if(!doCompose) { if(!buffer.equals(s, prevSrc, src)) { return false; } buffer.remove(); } prevBoundary=src; } } /** * Very similar to compose(): Make the same changes in both places if relevant. * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) * !doSpan: quickCheck * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and * bit 0: set if "maybe"; otherwise, if the span length<s.length() * then the quick check result is "no" */ public int composeQuickCheck(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doSpan) { int qcResult=0; int prevBoundary=src; int minNoMaybeCP=minCompNoMaybeCP; for(;;) { // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, // or with (compYes && ccc==0) properties. int prevSrc; int c = 0; int norm16 = 0; for (;;) { if(src==limit) { return (src<<1)|qcResult; // "yes" or "maybe" } if( (c=s.charAt(src))=minNoNo. // The current character is either a "noNo" (has a mapping) // or a "maybeYes" (combines backward) // or a "yesYes" with ccc!=0. // It is not a Hangul syllable or Jamo L because those have "yes" properties. int prevNorm16 = INERT; if (prevBoundary != prevSrc) { prevBoundary = prevSrc; if (!norm16HasCompBoundaryBefore(norm16)) { c = Character.codePointBefore(s, prevSrc); int n16 = getNorm16(c); if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { prevBoundary -= Character.charCount(c); prevNorm16 = n16; } } } if(isMaybeOrNonZeroCC(norm16)) { int cc=getCCFromYesOrMaybe(norm16); if (onlyContiguous /* FCC */ && cc != 0 && getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { // The [prevBoundary..prevSrc[ character // passed the quick check "yes && ccc==0" test // but is out of canonical order with the current combining mark. } else { // If !onlyContiguous (not FCC), then we ignore the tccc of // the previous character which passed the quick check "yes && ccc==0" test. 
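                    // Same scan as in compose(): advance over combining marks that are in
                    // canonical order. A "maybeYes" character only makes the result "maybe"
                    // (or ends a spanQuickCheckYes span); an out-of-order pair makes it "no".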
for (;;) { if (norm16 < MIN_YES_YES_WITH_CC) { if (!doSpan) { qcResult = 1; } else { return prevBoundary << 1; // spanYes does not care to know it's "maybe" } } if (src == limit) { return (src<<1) | qcResult; // "yes" or "maybe" } int prevCC = cc; c = Character.codePointAt(s, src); norm16 = getNorm16(c); if (isMaybeOrNonZeroCC(norm16)) { cc = getCCFromYesOrMaybe(norm16); if (!(prevCC <= cc || cc == 0)) { break; } } else { break; } src += Character.charCount(c); } // src is after the last in-order combining mark. if (isCompYesAndZeroCC(norm16)) { prevBoundary = src; src += Character.charCount(c); continue; } } } return prevBoundary<<1; // "no" } } public void composeAndAppend(CharSequence s, boolean doCompose, boolean onlyContiguous, ReorderingBuffer buffer) { int src=0, limit=s.length(); if(!buffer.isEmpty()) { int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); if(0!=firstStarterInSrc) { int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), buffer.length(), onlyContiguous); StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ firstStarterInSrc+16); middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); buffer.removeSuffix(buffer.length()-lastStarterInDest); middle.append(s, 0, firstStarterInSrc); compose(middle, 0, middle.length(), onlyContiguous, true, buffer); src=firstStarterInSrc; } } if(doCompose) { compose(s, src, limit, onlyContiguous, true, buffer); } else { buffer.append(s, src, limit); } } // Dual functionality: // buffer!=NULL: normalize // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { // Note: In this function we use buffer->appendZeroCC() because we track // the lead and trail combining classes here, rather than leaving it to // the ReorderingBuffer. // The exception is the call to decomposeShort() which uses the buffer // in the normal way. // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. // Similar to the prevBoundary in the compose() implementation. int prevBoundary=src; int prevSrc; int c=0; int prevFCD16=0; int fcd16=0; for(;;) { // count code units with lccc==0 for(prevSrc=src; src!=limit;) { if((c=s.charAt(src))1) { --prevBoundary; } } } else { int p=src-1; if( Character.isLowSurrogate(s.charAt(p)) && prevSrc
<p &&
                    Character.isHighSurrogate(s.charAt(p-1))) {
                    --p;
                    // Need to fetch the previous character's FCD value because
                    // prevFCD16 was just for the trail surrogate code point.
                    prevFCD16=getFCD16FromNormData(
                        Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
                    // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                }
                if(prevFCD16>
1) { prevBoundary=p; } } if(buffer!=null) { // The last lccc==0 character is excluded from the // flush-and-append call in case it needs to be modified. buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); buffer.append(s, prevBoundary, src); } // The start of the current character (c). prevSrc=src; } else if(src==limit) { break; } src+=Character.charCount(c); // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. // Check for proper order, and decompose locally if necessary. if((prevFCD16&0xff)<=(fcd16>>8)) { // proper order: prev tccc <= current lccc if((fcd16&0xff)<=1) { prevBoundary=src; } if(buffer!=null) { buffer.appendZeroCC(c); } prevFCD16=fcd16; continue; } else if(buffer==null) { return prevBoundary; // quick check "no" } else { /* * Back out the part of the source that we copied or appended * already but is now going to be decomposed. * prevSrc is set to after what was copied/appended. */ buffer.removeSuffix(prevSrc-prevBoundary); /* * Find the part of the source that needs to be decomposed, * up to the next safe boundary. */ src=findNextFCDBoundary(s, src, limit); /* * The source text does not fulfill the conditions for FCD. * Decompose and reorder a limited piece of the text. */ decomposeShort(s, prevBoundary, src, false, false, buffer); prevBoundary=src; prevFCD16=0; } } return src; } public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) { int src=0, limit=s.length(); if(!buffer.isEmpty()) { int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit); if(0!=firstBoundaryInSrc) { int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(), buffer.length()); StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+ firstBoundaryInSrc+16); middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length()); buffer.removeSuffix(buffer.length()-lastBoundaryInDest); middle.append(s, 0, firstBoundaryInSrc); makeFCD(middle, 0, middle.length(), buffer); src=firstBoundaryInSrc; } } if(doMakeFCD) { makeFCD(s, src, limit, buffer); } else { buffer.append(s, src, limit); } } public boolean hasDecompBoundaryBefore(int c) { return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || norm16HasDecompBoundaryBefore(getNorm16(c)); } public boolean norm16HasDecompBoundaryBefore(int norm16) { if (norm16 < minNoNoCompNoMaybeCC) { return true; } if (norm16 >= limitNoNo) { return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; } // c decomposes, get everything from the variable-length extra data int mapping=norm16>>OFFSET_SHIFT; int firstUnit=extraData.charAt(mapping); // true if leadCC==0 (hasFCDBoundaryBefore()) return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; } public boolean hasDecompBoundaryAfter(int c) { if (c < minDecompNoCP) { return true; } if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { return true; } return norm16HasDecompBoundaryAfter(getNorm16(c)); } public boolean norm16HasDecompBoundaryAfter(int norm16) { if(norm16 <= minYesNo || isHangulLVT(norm16)) { return true; } if (norm16 >= limitNoNo) { if (isMaybeOrNonZeroCC(norm16)) { return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; } // Maps to an isCompYesAndZeroCC. 
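            // For algorithmic one-way mappings, norm16 bits 2..1 (DELTA_TCCC_MASK) cache the
            // tccc of the mapping as 0, 1, or >1; a decomposition boundary after c requires
            // tccc<=1.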
return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; } // c decomposes, get everything from the variable-length extra data int mapping=norm16>>OFFSET_SHIFT; int firstUnit=extraData.charAt(mapping); // decomp after-boundary: same as hasFCDBoundaryAfter(), // fcd16<=1 || trailCC==0 if(firstUnit>0x1ff) { return false; // trailCC>1 } if(firstUnit<=0xff) { return true; // trailCC==0 } // if(trailCC==1) test leadCC==0, same as checking for before-boundary // true if leadCC==0 (hasFCDBoundaryBefore()) return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; } public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } public boolean hasCompBoundaryBefore(int c) { return c>OFFSET_SHIFT) <= 0x1ff); } public boolean hasFCDBoundaryBefore(int c) { return hasDecompBoundaryBefore(c); } public boolean hasFCDBoundaryAfter(int c) { return hasDecompBoundaryAfter(c); } public boolean isFCDInert(int c) { return getFCD16(c)<=1; } private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } private static boolean isInert(int norm16) { return norm16==INERT; } private static boolean isJamoL(int norm16) { return norm16==JAMO_L; } private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } private boolean isHangulLV(int norm16) { return norm16==minYesNo; } private boolean isHangulLVT(int norm16) { return norm16==hangulLVT(); } private boolean isCompYesAndZeroCC(int norm16) { return norm16=MIN_YES_YES_WITH_CC || norm16=limitNoNo; } // For use with isCompYes(). // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. // static uint8_t getCCFromYes(uint16_t norm16) { // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; // } private int getCCFromNoNo(int norm16) { int mapping=norm16>>OFFSET_SHIFT; if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { return extraData.charAt(mapping-1)&0xff; } else { return 0; } } int getTrailCCFromCompYesAndZeroCC(int norm16) { if(norm16<=minYesNo) { return 0; // yesYes and Hangul LV have ccc=tccc=0 } else { // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo } } // Requires algorithmic-NoNo. private int mapAlgorithmic(int c, int norm16) { return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; } // Requires minYesNo>OFFSET_SHIFT); } /** * @return index into maybeYesCompositions, or -1 */ private int getCompositionsListForDecompYes(int norm16) { if(norm16>OFFSET_SHIFT; } } /** * @return index into maybeYesCompositions */ private int getCompositionsListForComposite(int norm16) { // A composite has both mapping & compositions list. int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; int firstUnit=maybeYesCompositions.charAt(list); return list+ // mapping in maybeYesCompositions 1+ // +1 to skip the first unit with the mapping length (firstUnit&MAPPING_LENGTH_MASK); // + mapping length } private int getCompositionsListForMaybe(int norm16) { // minMaybeYes<=norm16>OFFSET_SHIFT; } /** * @param c code point must have compositions * @return index into maybeYesCompositions */ private int getCompositionsList(int norm16) { return isDecompYes(norm16) ? 
getCompositionsListForDecompYes(norm16) : getCompositionsListForComposite(norm16); } // Decompose a short piece of text which is likely to contain characters that // fail the quick check loop and/or where the quick check loop's overhead // is unlikely to be amortized. // Called by the compose() and makeFCD() implementations. // Public in Java for collation implementation code. private int decomposeShort( CharSequence s, int src, int limit, boolean stopAtCompBoundary, boolean onlyContiguous, ReorderingBuffer buffer) { while(src= limitNoNo) { if (isMaybeOrNonZeroCC(norm16)) { buffer.append(c, getCCFromYesOrMaybe(norm16)); return; } // Maps to an isCompYesAndZeroCC. c=mapAlgorithmic(c, norm16); norm16 = getRawNorm16(c); } if (norm16 < minYesNo) { // c does not decompose buffer.append(c, 0); } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { // Hangul syllable: decompose algorithmically Hangul.decompose(c, buffer); } else { // c decomposes, get everything from the variable-length extra data int mapping=norm16>>OFFSET_SHIFT; int firstUnit=extraData.charAt(mapping); int length=firstUnit&MAPPING_LENGTH_MASK; int leadCC, trailCC; trailCC=firstUnit>>8; if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { leadCC=extraData.charAt(mapping-1)>>8; } else { leadCC=0; } ++mapping; // skip over the firstUnit buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC); } } /** * Finds the recomposition result for * a forward-combining "lead" character, * specified with a pointer to its compositions list, * and a backward-combining "trail" character. * *

     * <p>If the lead and trail characters combine, then this function returns
     * the following "compositeAndFwd" value:
     * <pre>
     * Bits 21..1  composite character
     * Bit      0  set if the composite is a forward-combining starter
     * </pre>
     * otherwise it returns -1.
     *
     * <p>The compositions list has (trail, compositeAndFwd) pair entries,
     * encoded as either pairs or triples of 16-bit units.
     * The last entry has the high bit of its first unit set.
     *
     * <p>The list is sorted by ascending trail characters (there are no duplicates).
     * A linear search is used.
     *
     * <p>
See normalizer2impl.h for a more detailed description * of the compositions list format. */ private static int combine(String compositions, int list, int trail) { int key1, firstUnit; if(trail(firstUnit=compositions.charAt(list))) { list+=2+(firstUnit&COMP_1_TRIPLE); } if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if((firstUnit&COMP_1_TRIPLE)!=0) { return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); } else { return compositions.charAt(list+1); } } } else { // trail character is 3400..10FFFF // result entry has 3 units key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); int key2=(trail<(firstUnit=compositions.charAt(list))) { list+=2+(firstUnit&COMP_1_TRIPLE); } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if(key2>(secondUnit=compositions.charAt(list+1))) { if((firstUnit&COMP_1_LAST_TUPLE)!=0) { break; } else { list+=3; } } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); } else { break; } } else { break; } } } return -1; } /** * @param list some character's compositions list * @param set recursively receives the composites from these compositions */ private void addComposites(int list, UnicodeSet set) { int firstUnit, compositeAndFwd; do { firstUnit=maybeYesCompositions.charAt(list); if((firstUnit&COMP_1_TRIPLE)==0) { compositeAndFwd=maybeYesCompositions.charAt(list+1); list+=2; } else { compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)| maybeYesCompositions.charAt(list+2); list+=3; } int composite=compositeAndFwd>>1; if((compositeAndFwd&1)!=0) { addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set); } set.add(composite); } while((firstUnit&COMP_1_LAST_TUPLE)==0); } /* * Recomposes the buffer text starting at recomposeStartIndex * (which is in NFD - decomposed and canonically ordered), * and truncates the buffer contents. * * Note that recomposition never lengthens the text: * Any character consists of either one or two code units; * a composition may contain at most one more code unit than the original starter, * while the combining mark that is removed has at least one code unit. */ private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, boolean onlyContiguous) { StringBuilder sb=buffer.getStringBuilder(); int p=recomposeStartIndex; if(p==sb.length()) { return; } int starter, pRemove; int compositionsList; int c, compositeAndFwd; int norm16; int cc, prevCC; boolean starterIsSupplementary; // Some of the following variables are not used until we have a forward-combining starter // and are only initialized now to avoid compiler warnings. compositionsList=-1; // used as indicator for whether we have a forward-combining starter starter=-1; starterIsSupplementary=false; prevCC=0; for(;;) { c=sb.codePointAt(p); p+=Character.charCount(c); norm16=getNorm16(c); cc=getCCFromYesOrMaybe(norm16); if( // this character combines backward and isMaybe(norm16) && // we have seen a starter that combines forward and compositionsList>=0 && // the backward-combining character is not blocked (prevCC=0) { // The starter and the combining mark (c) do combine. int composite=compositeAndFwd>>1; // Remove the combining mark. pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark sb.delete(pRemove, p); p=pRemove; // Replace the starter with the composite. 
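                // Four cases follow, depending on whether the starter and the composite are
                // BMP or supplementary: when the UTF-16 length changes, one char unit is
                // inserted or removed at starter+1 and the index p is adjusted to match.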
if(starterIsSupplementary) { if(composite>0xffff) { // both are supplementary sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); } else { sb.setCharAt(starter, (char)c); sb.deleteCharAt(starter+1); // The composite is shorter than the starter, // move the intermediate characters forward one. starterIsSupplementary=false; --p; } } else if(composite>0xffff) { // The composite is longer than the starter, // move the intermediate characters back one. starterIsSupplementary=true; sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); ++p; } else { // both are on the BMP sb.setCharAt(starter, (char)composite); } // Keep prevCC because we removed the combining mark. if(p==sb.length()) { break; } // Is the composite a starter that combines forward? if((compositeAndFwd&1)!=0) { compositionsList= getCompositionsListForComposite(getRawNorm16(composite)); } else { compositionsList=-1; } // We combined; continue with looking for compositions. continue; } } // no combination this time prevCC=cc; if(p==sb.length()) { break; } // If c did not combine, then check if it is a starter. if(cc==0) { // Found a new starter. if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { // It may combine with something, prepare for it. if(c<=0xffff) { starterIsSupplementary=false; starter=p-1; } else { starterIsSupplementary=true; starter=p-2; } } } else if(onlyContiguous) { // FCC: no discontiguous compositions; any intervening character blocks. compositionsList=-1; } } buffer.flush(); } public int composePair(int a, int b) { int norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16 int list; if(isInert(norm16)) { return -1; } else if(norm16>OFFSET_SHIFT; if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list list+= // mapping pointer 1+ // +1 to skip the first unit with the mapping length (maybeYesCompositions.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length } } } else if(norm16>1; } /** * Does c have a composition boundary before it? * True if its decomposition begins with a character that has * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes * (isCompYesAndZeroCC()) so we need not decompose. */ private boolean hasCompBoundaryBefore(int c, int norm16) { return c> OFFSET_SHIFT) <= 0x1ff); } private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { while(p>0) { int c=Character.codePointBefore(s, p); int norm16 = getNorm16(c); if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { break; } p-=Character.charCount(c); if(hasCompBoundaryBefore(c, norm16)) { break; } } return p; } private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { while(p0) { int c=Character.codePointBefore(s, p); int norm16; if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16 = getNorm16(c))) { break; } p-=Character.charCount(c); if (norm16HasDecompBoundaryBefore(norm16)) { break; } } return p; } private int findNextFCDBoundary(CharSequence s, int p, int limit) { while(p canonStartSets; // bits in canonIterData private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000; private static final int CANON_HAS_COMPOSITIONS = 0x40000000; private static final int CANON_HAS_SET = 0x200000; private static final int CANON_VALUE_MASK = 0x1fffff; }
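// A minimal usage sketch (illustrative, not part of the generated source): this implementation
// class is normally reached through the public android.icu.text.Normalizer2 API, for example:
//
//     Normalizer2 nfc = Normalizer2.getNFCInstance();
//     String composed = nfc.normalize("A\u0301");                            // "\u00C1" (precomposed)
//     String decomposed = Normalizer2.getNFDInstance().normalize("\u00C1");  // "A\u0301"
//
// Those public instances are backed by this class (see Norm2AllModes).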