350 lines
14 KiB
Java
350 lines
14 KiB
Java
/* GENERATED SOURCE. DO NOT MODIFY. */
|
||
// © 2016 and later: Unicode, Inc. and others.
|
||
// License & terms of use: http://www.unicode.org/copyright.html
|
||
/*
|
||
*******************************************************************************
|
||
* Copyright (C) 2012-2016, International Business Machines Corporation and *
|
||
* others. All Rights Reserved. *
|
||
*******************************************************************************
|
||
*/
|
||
package android.icu.impl.breakiter;
|
||
|
||
import static android.icu.impl.CharacterIteration.DONE32;
|
||
import static android.icu.impl.CharacterIteration.current32;
|
||
import static android.icu.impl.CharacterIteration.next32;
|
||
import static android.icu.impl.CharacterIteration.previous32;
|
||
|
||
import java.io.IOException;
|
||
import java.text.CharacterIterator;
|
||
import java.util.HashSet;
|
||
|
||
import android.icu.impl.Assert;
|
||
import android.icu.impl.ICUConfig;
|
||
import android.icu.impl.ICUData;
|
||
import android.icu.text.Normalizer;
|
||
import android.icu.text.UnicodeSet;
|
||
import android.icu.text.UnicodeSetIterator;
|
||
import android.icu.util.UResourceBundle;
|
||
import android.icu.util.UResourceBundleIterator;
|
||
|
||
/**
|
||
* @hide Only a subset of ICU is exposed in Android
|
||
*/
|
||
public class CjkBreakEngine extends DictionaryBreakEngine {
|
||
private UnicodeSet fHangulWordSet;
|
||
private UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
|
||
private UnicodeSet fClosePunctuationSet;
|
||
private DictionaryMatcher fDictionary = null;
|
||
private HashSet<String> fSkipSet;
|
||
private MlBreakEngine fMlBreakEngine;
|
||
private boolean isCj = false;
|
||
|
||
public CjkBreakEngine(boolean korean) throws IOException {
|
||
fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]");
|
||
fHangulWordSet.freeze();
|
||
// Digit, open punctuation and Alphabetic characters.
|
||
fDigitOrOpenPunctuationOrAlphabetSet = new UnicodeSet("[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]");
|
||
fDigitOrOpenPunctuationOrAlphabetSet.freeze();
|
||
|
||
fClosePunctuationSet = new UnicodeSet("[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]");
|
||
fClosePunctuationSet.freeze();
|
||
fSkipSet = new HashSet<String>();
|
||
|
||
fDictionary = DictionaryData.loadDictionaryFor("Hira");
|
||
if (korean) {
|
||
setCharacters(fHangulWordSet);
|
||
} else { //Chinese and Japanese
|
||
isCj = true;
|
||
UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]");
|
||
setCharacters(cjSet);
|
||
if (Boolean.parseBoolean(
|
||
ICUConfig.get("android.icu.impl.breakiter.useMLPhraseBreaking", "false"))) {
|
||
fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet,
|
||
fClosePunctuationSet);
|
||
} else {
|
||
initializeJapanesePhraseParamater();
|
||
}
|
||
}
|
||
}
|
||
|
||
private void initializeJapanesePhraseParamater() {
|
||
loadJapaneseExtensions();
|
||
loadHiragana();
|
||
}
|
||
|
||
private void loadJapaneseExtensions() {
|
||
UResourceBundle rb = UResourceBundle.getBundleInstance(ICUData.ICU_BRKITR_BASE_NAME, "ja");
|
||
final String tag = "extensions";
|
||
UResourceBundle bundle = rb.get(tag);
|
||
UResourceBundleIterator iterator = bundle.getIterator();
|
||
while (iterator.hasNext()) {
|
||
fSkipSet.add(iterator.nextString());
|
||
}
|
||
}
|
||
|
||
private void loadHiragana() {
|
||
UnicodeSet hiraganaWordSet = new UnicodeSet("[:Hiragana:]");
|
||
hiraganaWordSet.freeze();
|
||
UnicodeSetIterator iterator = new UnicodeSetIterator(hiraganaWordSet);
|
||
while (iterator.next()) {
|
||
fSkipSet.add(iterator.getString());
|
||
}
|
||
}
|
||
|
||
@Override
|
||
public boolean equals(Object obj) {
|
||
if (obj instanceof CjkBreakEngine) {
|
||
CjkBreakEngine other = (CjkBreakEngine)obj;
|
||
return this.fSet.equals(other.fSet);
|
||
}
|
||
return false;
|
||
}
|
||
|
||
@Override
|
||
public int hashCode() {
|
||
return getClass().hashCode();
|
||
}
|
||
|
||
private static final int kMaxKatakanaLength = 8;
|
||
private static final int kMaxKatakanaGroupLength = 20;
|
||
private static final int maxSnlp = 255;
|
||
private static final int kint32max = Integer.MAX_VALUE;
|
||
private static int getKatakanaCost(int wordlength) {
|
||
int katakanaCost[] = new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 };
|
||
return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength];
|
||
}
|
||
|
||
private static boolean isKatakana(int value) {
|
||
return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
|
||
(value >= 0xFF66 && value <= 0xFF9F);
|
||
}
|
||
|
||
@Override
|
||
public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos,
|
||
DequeI foundBreaks, boolean isPhraseBreaking) {
|
||
if (startPos >= endPos) {
|
||
return 0;
|
||
}
|
||
|
||
inText.setIndex(startPos);
|
||
|
||
int inputLength = endPos - startPos;
|
||
int[] charPositions = new int[inputLength + 1];
|
||
StringBuffer s = new StringBuffer("");
|
||
inText.setIndex(startPos);
|
||
while (inText.getIndex() < endPos) {
|
||
s.append(inText.current());
|
||
inText.next();
|
||
}
|
||
String prenormstr = s.toString();
|
||
boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
|
||
Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
|
||
CharacterIterator text;
|
||
int numCodePts = 0;
|
||
if (isNormalized) {
|
||
text = new java.text.StringCharacterIterator(prenormstr);
|
||
int index = 0;
|
||
charPositions[0] = 0;
|
||
while (index < prenormstr.length()) {
|
||
int codepoint = prenormstr.codePointAt(index);
|
||
index += Character.charCount(codepoint);
|
||
numCodePts++;
|
||
charPositions[numCodePts] = index;
|
||
}
|
||
} else {
|
||
String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
|
||
text = new java.text.StringCharacterIterator(normStr);
|
||
charPositions = new int[normStr.length() + 1];
|
||
Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0);
|
||
int index = 0;
|
||
charPositions[0] = 0;
|
||
while (index < normalizer.endIndex()) {
|
||
normalizer.next();
|
||
numCodePts++;
|
||
index = normalizer.getIndex();
|
||
charPositions[numCodePts] = index;
|
||
}
|
||
}
|
||
// Use ML phrase breaking
|
||
if (Boolean.parseBoolean(
|
||
ICUConfig.get("android.icu.impl.breakiter.useMLPhraseBreaking", "false"))) {
|
||
// PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja.
|
||
if (isPhraseBreaking && isCj) {
|
||
return fMlBreakEngine.divideUpRange(inText, startPos, endPos, text,
|
||
numCodePts, charPositions, foundBreaks);
|
||
}
|
||
}
|
||
|
||
// From here on out, do the algorithm. Note that our indices
|
||
// refer to indices within the normalized string.
|
||
int[] bestSnlp = new int[numCodePts + 1];
|
||
bestSnlp[0] = 0;
|
||
for (int i = 1; i <= numCodePts; i++) {
|
||
bestSnlp[i] = kint32max;
|
||
}
|
||
|
||
int[] prev = new int[numCodePts + 1];
|
||
for (int i = 0; i <= numCodePts; i++) {
|
||
prev[i] = -1;
|
||
}
|
||
|
||
final int maxWordSize = 20;
|
||
int values[] = new int[numCodePts];
|
||
int lengths[] = new int[numCodePts];
|
||
// dynamic programming to find the best segmentation
|
||
|
||
// In outer loop, i is the code point index,
|
||
// ix is the corresponding code unit index.
|
||
// They differ when the string contains supplementary characters.
|
||
int ix = 0;
|
||
text.setIndex(ix);
|
||
boolean is_prev_katakana = false;
|
||
for (int i = 0; i < numCodePts; i++, text.setIndex(ix), next32(text)) {
|
||
ix = text.getIndex();
|
||
if (bestSnlp[i] == kint32max) {
|
||
continue;
|
||
}
|
||
|
||
int maxSearchLength = (i + maxWordSize < numCodePts) ? maxWordSize : (numCodePts - i);
|
||
int[] count_ = new int[1];
|
||
fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
|
||
int count = count_[0];
|
||
|
||
// if there are no single character matches found in the dictionary
|
||
// starting with this character, treat character as a 1-character word
|
||
// with the highest value possible (i.e. the least likely to occur).
|
||
// Exclude Korean characters from this treatment, as they should be
|
||
// left together by default.
|
||
text.setIndex(ix); // fDictionary.matches() advances the text position; undo that.
|
||
if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
|
||
values[count] = maxSnlp;
|
||
lengths[count] = 1;
|
||
count++;
|
||
}
|
||
|
||
for (int j = 0; j < count; j++) {
|
||
int newSnlp = bestSnlp[i] + values[j];
|
||
if (newSnlp < bestSnlp[lengths[j] + i]) {
|
||
bestSnlp[lengths[j] + i] = newSnlp;
|
||
prev[lengths[j] + i] = i;
|
||
}
|
||
}
|
||
|
||
// In Japanese, single-character Katakana words are pretty rare.
|
||
// So we apply the following heuristic to Katakana: any continuous
|
||
// run of Katakana characters is considered a candidate word with
|
||
// a default cost specified in the katakanaCost table according
|
||
// to its length.
|
||
boolean is_katakana = isKatakana(current32(text));
|
||
if (!is_prev_katakana && is_katakana) {
|
||
int j = i + 1;
|
||
next32(text);
|
||
while (j < numCodePts && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
|
||
next32(text);
|
||
++j;
|
||
}
|
||
|
||
if ((j - i) < kMaxKatakanaGroupLength) {
|
||
int newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
|
||
if (newSnlp < bestSnlp[j]) {
|
||
bestSnlp[j] = newSnlp;
|
||
prev[j] = i;
|
||
}
|
||
}
|
||
}
|
||
is_prev_katakana = is_katakana;
|
||
}
|
||
|
||
int t_boundary[] = new int[numCodePts + 1];
|
||
int numBreaks = 0;
|
||
if (bestSnlp[numCodePts] == kint32max) {
|
||
t_boundary[numBreaks] = numCodePts;
|
||
numBreaks++;
|
||
} else if (isPhraseBreaking) {
|
||
t_boundary[numBreaks] = numCodePts;
|
||
numBreaks++;
|
||
int prevIdx = numCodePts;
|
||
int codeUnitIdx = 0, prevCodeUnitIdx = 0, length = 0;
|
||
for (int i = prev[numCodePts]; i > 0; i = prev[i]) {
|
||
codeUnitIdx = prenormstr.offsetByCodePoints(0, i);
|
||
prevCodeUnitIdx = prenormstr.offsetByCodePoints(0, prevIdx);
|
||
length = prevCodeUnitIdx - codeUnitIdx;
|
||
prevIdx = i;
|
||
String pattern = getPatternFromText(text, s, codeUnitIdx, length);
|
||
// Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
|
||
// characters don't occur.
|
||
text.setIndex(codeUnitIdx);
|
||
if (!fSkipSet.contains(pattern)
|
||
&& (!isKatakana(current32(text)) || !isKatakana(previous32(text)))) {
|
||
t_boundary[numBreaks] = i;
|
||
numBreaks++;
|
||
}
|
||
}
|
||
} else {
|
||
for (int i = numCodePts; i > 0; i = prev[i]) {
|
||
t_boundary[numBreaks] = i;
|
||
numBreaks++;
|
||
}
|
||
Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0);
|
||
}
|
||
|
||
if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) {
|
||
t_boundary[numBreaks++] = 0;
|
||
}
|
||
|
||
int correctedNumBreaks = 0;
|
||
int previous = -1;
|
||
for (int i = numBreaks - 1; i >= 0; i--) {
|
||
int pos = charPositions[t_boundary[i]] + startPos;
|
||
// In phrase breaking, there has to be a breakpoint between Cj character and close
|
||
// punctuation.
|
||
// E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正
|
||
inText.setIndex(pos);
|
||
if (pos > previous) {
|
||
if (pos != startPos
|
||
|| (isPhraseBreaking && pos > 0
|
||
&& fClosePunctuationSet.contains(previous32(inText)))) {
|
||
foundBreaks.push(charPositions[t_boundary[i]] + startPos);
|
||
correctedNumBreaks++;
|
||
}
|
||
}
|
||
previous = pos;
|
||
}
|
||
|
||
if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {
|
||
// In phrase breaking, there has to be a breakpoint between Cj character and
|
||
// the number/open punctuation.
|
||
// E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
|
||
// E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9
|
||
// E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U
|
||
if (isPhraseBreaking) {
|
||
inText.setIndex(endPos);
|
||
int current = current32(inText);
|
||
if (current != DONE32 && !fDigitOrOpenPunctuationOrAlphabetSet.contains(current)) {
|
||
foundBreaks.pop();
|
||
correctedNumBreaks--;
|
||
}
|
||
} else {
|
||
foundBreaks.pop();
|
||
correctedNumBreaks--;
|
||
}
|
||
}
|
||
if (!foundBreaks.isEmpty())
|
||
inText.setIndex(foundBreaks.peek());
|
||
return correctedNumBreaks;
|
||
}
|
||
|
||
private String getPatternFromText(CharacterIterator text, StringBuffer sb, int start,
|
||
int length) {
|
||
sb.setLength(0);
|
||
if (length > 0) {
|
||
text.setIndex(start);
|
||
sb.append(text.current());
|
||
for (int i = 1; i < length; i++) {
|
||
sb.append(text.next());
|
||
}
|
||
}
|
||
return sb.toString();
|
||
}
|
||
}
|