175 lines
6.3 KiB
Java
175 lines
6.3 KiB
Java
![]() |
/* GENERATED SOURCE. DO NOT MODIFY. */
|
||
|
// © 2016 and later: Unicode, Inc. and others.
|
||
|
// License & terms of use: http://www.unicode.org/copyright.html
|
||
|
/*
|
||
|
*******************************************************************************
|
||
|
* Copyright (C) 2005 - 2012, International Business Machines Corporation and *
|
||
|
* others. All Rights Reserved. *
|
||
|
*******************************************************************************
|
||
|
*/
|
||
|
package android.icu.text;
|
||
|
|
||
|
/**
|
||
|
* Class CharsetRecog_2022 is part of the ICU charset detection implementation.
|
||
|
* This is a superclass for the individual detectors for
|
||
|
* each of the detectable members of the ISO 2022 family
|
||
|
* of encodings.
|
||
|
*
|
||
|
* The separate classes are nested within this class.
|
||
|
*/
|
||
|
abstract class CharsetRecog_2022 extends CharsetRecognizer {
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Matching function shared among the 2022 detectors JP, CN and KR
|
||
|
* Counts up the number of legal an unrecognized escape sequences in
|
||
|
* the sample of text, and computes a score based on the total number &
|
||
|
* the proportion that fit the encoding.
|
||
|
*
|
||
|
*
|
||
|
* @param text the byte buffer containing text to analyse
|
||
|
* @param textLen the size of the text in the byte.
|
||
|
* @param escapeSequences the byte escape sequences to test for.
|
||
|
* @return match quality, in the range of 0-100.
|
||
|
*/
|
||
|
int match(byte [] text, int textLen, byte [][] escapeSequences) {
|
||
|
int i, j;
|
||
|
int escN;
|
||
|
int hits = 0;
|
||
|
int misses = 0;
|
||
|
int shifts = 0;
|
||
|
int quality;
|
||
|
scanInput:
|
||
|
for (i=0; i<textLen; i++) {
|
||
|
if (text[i] == 0x1b) {
|
||
|
checkEscapes:
|
||
|
for (escN=0; escN<escapeSequences.length; escN++) {
|
||
|
byte [] seq = escapeSequences[escN];
|
||
|
|
||
|
if ((textLen - i) < seq.length) {
|
||
|
continue checkEscapes;
|
||
|
}
|
||
|
|
||
|
for (j=1; j<seq.length; j++) {
|
||
|
if (seq[j] != text[i+j]) {
|
||
|
continue checkEscapes;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
hits++;
|
||
|
i += seq.length-1;
|
||
|
continue scanInput;
|
||
|
}
|
||
|
|
||
|
misses++;
|
||
|
}
|
||
|
|
||
|
if (text[i] == 0x0e || text[i] == 0x0f) {
|
||
|
// Shift in/out
|
||
|
shifts++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (hits == 0) {
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Initial quality is based on relative proportion of recognized vs.
|
||
|
// unrecognized escape sequences.
|
||
|
// All good: quality = 100;
|
||
|
// half or less good: quality = 0;
|
||
|
// linear inbetween.
|
||
|
quality = (100*hits - 100*misses) / (hits + misses);
|
||
|
|
||
|
// Back off quality if there were too few escape sequences seen.
|
||
|
// Include shifts in this computation, so that KR does not get penalized
|
||
|
// for having only a single Escape sequence, but many shifts.
|
||
|
if (hits+shifts < 5) {
|
||
|
quality -= (5-(hits+shifts))*10;
|
||
|
}
|
||
|
|
||
|
if (quality < 0) {
|
||
|
quality = 0;
|
||
|
}
|
||
|
return quality;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
static class CharsetRecog_2022JP extends CharsetRecog_2022 {
|
||
|
private byte [] [] escapeSequences = {
|
||
|
{0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992
|
||
|
{0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990
|
||
|
{0x1b, 0x24, 0x40}, // JIS C 6226-1978
|
||
|
{0x1b, 0x24, 0x41}, // GB 2312-80
|
||
|
{0x1b, 0x24, 0x42}, // JIS X 208-1983
|
||
|
{0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997
|
||
|
{0x1b, 0x28, 0x42}, // ASCII
|
||
|
{0x1b, 0x28, 0x48}, // JIS-Roman
|
||
|
{0x1b, 0x28, 0x49}, // Half-width katakana
|
||
|
{0x1b, 0x28, 0x4a}, // JIS-Roman
|
||
|
{0x1b, 0x2e, 0x41}, // ISO 8859-1
|
||
|
{0x1b, 0x2e, 0x46} // ISO 8859-7
|
||
|
};
|
||
|
|
||
|
@Override
|
||
|
String getName() {
|
||
|
return "ISO-2022-JP";
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
CharsetMatch match(CharsetDetector det) {
|
||
|
int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
|
||
|
return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static class CharsetRecog_2022KR extends CharsetRecog_2022 {
|
||
|
private byte [] [] escapeSequences = {
|
||
|
{0x1b, 0x24, 0x29, 0x43}
|
||
|
};
|
||
|
|
||
|
@Override
|
||
|
String getName() {
|
||
|
return "ISO-2022-KR";
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
CharsetMatch match(CharsetDetector det) {
|
||
|
int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
|
||
|
return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static class CharsetRecog_2022CN extends CharsetRecog_2022 {
|
||
|
private byte [] [] escapeSequences = {
|
||
|
{0x1b, 0x24, 0x29, 0x41}, // GB 2312-80
|
||
|
{0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
|
||
|
{0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
|
||
|
{0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165
|
||
|
{0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
|
||
|
{0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
|
||
|
{0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
|
||
|
{0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
|
||
|
{0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
|
||
|
{0x1b, 0x4e}, // SS2
|
||
|
{0x1b, 0x4f}, // SS3
|
||
|
};
|
||
|
|
||
|
@Override
|
||
|
String getName() {
|
||
|
return "ISO-2022-CN";
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
CharsetMatch match(CharsetDetector det) {
|
||
|
int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
|
||
|
return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|