/* GENERATED SOURCE. DO NOT MODIFY. */ // © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2012-2016, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package android.icu.impl.breakiter; import static android.icu.impl.CharacterIteration.DONE32; import static android.icu.impl.CharacterIteration.current32; import static android.icu.impl.CharacterIteration.next32; import static android.icu.impl.CharacterIteration.previous32; import java.io.IOException; import java.text.CharacterIterator; import java.util.HashSet; import android.icu.impl.Assert; import android.icu.impl.ICUConfig; import android.icu.impl.ICUData; import android.icu.text.Normalizer; import android.icu.text.UnicodeSet; import android.icu.text.UnicodeSetIterator; import android.icu.util.UResourceBundle; import android.icu.util.UResourceBundleIterator; /** * @hide Only a subset of ICU is exposed in Android */ public class CjkBreakEngine extends DictionaryBreakEngine { private UnicodeSet fHangulWordSet; private UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; private UnicodeSet fClosePunctuationSet; private DictionaryMatcher fDictionary = null; private HashSet fSkipSet; private MlBreakEngine fMlBreakEngine; private boolean isCj = false; public CjkBreakEngine(boolean korean) throws IOException { fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]"); fHangulWordSet.freeze(); // Digit, open punctuation and Alphabetic characters. fDigitOrOpenPunctuationOrAlphabetSet = new UnicodeSet("[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"); fDigitOrOpenPunctuationOrAlphabetSet.freeze(); fClosePunctuationSet = new UnicodeSet("[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"); fClosePunctuationSet.freeze(); fSkipSet = new HashSet(); fDictionary = DictionaryData.loadDictionaryFor("Hira"); if (korean) { setCharacters(fHangulWordSet); } else { //Chinese and Japanese isCj = true; UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"); setCharacters(cjSet); if (Boolean.parseBoolean( ICUConfig.get("android.icu.impl.breakiter.useMLPhraseBreaking", "false"))) { fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet, fClosePunctuationSet); } else { initializeJapanesePhraseParamater(); } } } private void initializeJapanesePhraseParamater() { loadJapaneseExtensions(); loadHiragana(); } private void loadJapaneseExtensions() { UResourceBundle rb = UResourceBundle.getBundleInstance(ICUData.ICU_BRKITR_BASE_NAME, "ja"); final String tag = "extensions"; UResourceBundle bundle = rb.get(tag); UResourceBundleIterator iterator = bundle.getIterator(); while (iterator.hasNext()) { fSkipSet.add(iterator.nextString()); } } private void loadHiragana() { UnicodeSet hiraganaWordSet = new UnicodeSet("[:Hiragana:]"); hiraganaWordSet.freeze(); UnicodeSetIterator iterator = new UnicodeSetIterator(hiraganaWordSet); while (iterator.next()) { fSkipSet.add(iterator.getString()); } } @Override public boolean equals(Object obj) { if (obj instanceof CjkBreakEngine) { CjkBreakEngine other = (CjkBreakEngine)obj; return this.fSet.equals(other.fSet); } return false; } @Override public int hashCode() { return getClass().hashCode(); } private static final int kMaxKatakanaLength = 8; private static final int kMaxKatakanaGroupLength = 20; private static final int maxSnlp = 255; private static final int kint32max = Integer.MAX_VALUE; private static int getKatakanaCost(int wordlength) { int katakanaCost[] = new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 }; return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength]; } private static boolean isKatakana(int value) { return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) || (value >= 0xFF66 && value <= 0xFF9F); } @Override public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, DequeI foundBreaks, boolean isPhraseBreaking) { if (startPos >= endPos) { return 0; } inText.setIndex(startPos); int inputLength = endPos - startPos; int[] charPositions = new int[inputLength + 1]; StringBuffer s = new StringBuffer(""); inText.setIndex(startPos); while (inText.getIndex() < endPos) { s.append(inText.current()); inText.next(); } String prenormstr = s.toString(); boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES || Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0); CharacterIterator text; int numCodePts = 0; if (isNormalized) { text = new java.text.StringCharacterIterator(prenormstr); int index = 0; charPositions[0] = 0; while (index < prenormstr.length()) { int codepoint = prenormstr.codePointAt(index); index += Character.charCount(codepoint); numCodePts++; charPositions[numCodePts] = index; } } else { String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC); text = new java.text.StringCharacterIterator(normStr); charPositions = new int[normStr.length() + 1]; Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0); int index = 0; charPositions[0] = 0; while (index < normalizer.endIndex()) { normalizer.next(); numCodePts++; index = normalizer.getIndex(); charPositions[numCodePts] = index; } } // Use ML phrase breaking if (Boolean.parseBoolean( ICUConfig.get("android.icu.impl.breakiter.useMLPhraseBreaking", "false"))) { // PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja. if (isPhraseBreaking && isCj) { return fMlBreakEngine.divideUpRange(inText, startPos, endPos, text, numCodePts, charPositions, foundBreaks); } } // From here on out, do the algorithm. Note that our indices // refer to indices within the normalized string. int[] bestSnlp = new int[numCodePts + 1]; bestSnlp[0] = 0; for (int i = 1; i <= numCodePts; i++) { bestSnlp[i] = kint32max; } int[] prev = new int[numCodePts + 1]; for (int i = 0; i <= numCodePts; i++) { prev[i] = -1; } final int maxWordSize = 20; int values[] = new int[numCodePts]; int lengths[] = new int[numCodePts]; // dynamic programming to find the best segmentation // In outer loop, i is the code point index, // ix is the corresponding code unit index. // They differ when the string contains supplementary characters. int ix = 0; text.setIndex(ix); boolean is_prev_katakana = false; for (int i = 0; i < numCodePts; i++, text.setIndex(ix), next32(text)) { ix = text.getIndex(); if (bestSnlp[i] == kint32max) { continue; } int maxSearchLength = (i + maxWordSize < numCodePts) ? maxWordSize : (numCodePts - i); int[] count_ = new int[1]; fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values); int count = count_[0]; // if there are no single character matches found in the dictionary // starting with this character, treat character as a 1-character word // with the highest value possible (i.e. the least likely to occur). // Exclude Korean characters from this treatment, as they should be // left together by default. text.setIndex(ix); // fDictionary.matches() advances the text position; undo that. if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) { values[count] = maxSnlp; lengths[count] = 1; count++; } for (int j = 0; j < count; j++) { int newSnlp = bestSnlp[i] + values[j]; if (newSnlp < bestSnlp[lengths[j] + i]) { bestSnlp[lengths[j] + i] = newSnlp; prev[lengths[j] + i] = i; } } // In Japanese, single-character Katakana words are pretty rare. // So we apply the following heuristic to Katakana: any continuous // run of Katakana characters is considered a candidate word with // a default cost specified in the katakanaCost table according // to its length. boolean is_katakana = isKatakana(current32(text)); if (!is_prev_katakana && is_katakana) { int j = i + 1; next32(text); while (j < numCodePts && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) { next32(text); ++j; } if ((j - i) < kMaxKatakanaGroupLength) { int newSnlp = bestSnlp[i] + getKatakanaCost(j - i); if (newSnlp < bestSnlp[j]) { bestSnlp[j] = newSnlp; prev[j] = i; } } } is_prev_katakana = is_katakana; } int t_boundary[] = new int[numCodePts + 1]; int numBreaks = 0; if (bestSnlp[numCodePts] == kint32max) { t_boundary[numBreaks] = numCodePts; numBreaks++; } else if (isPhraseBreaking) { t_boundary[numBreaks] = numCodePts; numBreaks++; int prevIdx = numCodePts; int codeUnitIdx = 0, prevCodeUnitIdx = 0, length = 0; for (int i = prev[numCodePts]; i > 0; i = prev[i]) { codeUnitIdx = prenormstr.offsetByCodePoints(0, i); prevCodeUnitIdx = prenormstr.offsetByCodePoints(0, prevIdx); length = prevCodeUnitIdx - codeUnitIdx; prevIdx = i; String pattern = getPatternFromText(text, s, codeUnitIdx, length); // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana // characters don't occur. text.setIndex(codeUnitIdx); if (!fSkipSet.contains(pattern) && (!isKatakana(current32(text)) || !isKatakana(previous32(text)))) { t_boundary[numBreaks] = i; numBreaks++; } } } else { for (int i = numCodePts; i > 0; i = prev[i]) { t_boundary[numBreaks] = i; numBreaks++; } Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0); } if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) { t_boundary[numBreaks++] = 0; } int correctedNumBreaks = 0; int previous = -1; for (int i = numBreaks - 1; i >= 0; i--) { int pos = charPositions[t_boundary[i]] + startPos; // In phrase breaking, there has to be a breakpoint between Cj character and close // punctuation. // E.g.［携帯電話］正しい選択 -> ［携帯▁電話］▁正しい▁選択 -> breakpoint between ］ and 正 inText.setIndex(pos); if (pos > previous) { if (pos != startPos || (isPhraseBreaking && pos > 0 && fClosePunctuationSet.contains(previous32(inText)))) { foundBreaks.push(charPositions[t_boundary[i]] + startPos); correctedNumBreaks++; } } previous = pos; } if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) { // In phrase breaking, there has to be a breakpoint between Cj character and // the number/open punctuation. // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「 // E.g. 乗車率９０％程度だろうか -> 乗車▁率▁９０％▁程度だろうか -> breakpoint between 率 and ９ // E.g. しかもロゴがＵｎｉｃｏｄｅ！ -> しかも▁ロゴが▁Ｕｎｉｃｏｄｅ！-> breakpoint between が and Ｕ if (isPhraseBreaking) { inText.setIndex(endPos); int current = current32(inText); if (current != DONE32 && !fDigitOrOpenPunctuationOrAlphabetSet.contains(current)) { foundBreaks.pop(); correctedNumBreaks--; } } else { foundBreaks.pop(); correctedNumBreaks--; } } if (!foundBreaks.isEmpty()) inText.setIndex(foundBreaks.peek()); return correctedNumBreaks; } private String getPatternFromText(CharacterIterator text, StringBuffer sb, int start, int length) { sb.setLength(0); if (length > 0) { text.setIndex(start); sb.append(text.current()); for (int i = 1; i < length; i++) { sb.append(text.next()); } } return sb.toString(); } }