/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
*******************************************************************************
* Copyright (C) 2005-2016, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package android.icu.text;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
* CharsetDetector
provides a facility for detecting the
* charset or encoding of character data in an unknown format.
* The input data can either be from an input stream or an array of bytes.
* The result of the detection operation is a list of possibly matching
* charsets, or, for simple use, you can just ask for a Java Reader that
* will will work over the input data.
*
* Character set detection is at best an imprecise operation. The detection * process will attempt to identify the charset that best matches the characteristics * of the byte data, but the process is partly statistical in nature, and * the results can not be guaranteed to always be correct. *
* For best accuracy in charset detection, the input data should be primarily * in a single language, and a minimum of a few hundred bytes worth of plain text * in the language are needed. The detection process will attempt to * ignore html or xml style markup that could otherwise obscure the content. *
* @hide Only a subset of ICU is exposed in Android */ public class CharsetDetector { // Question: Should we have getters corresponding to the setters for input text // and declared encoding? // A thought: If we were to create our own type of Java Reader, we could defer // figuring out an actual charset for data that starts out with too much English // only ASCII until the user actually read through to something that didn't look // like 7 bit English. If nothing else ever appeared, we would never need to // actually choose the "real" charset. All assuming that the application just // wants the data, and doesn't care about a char set name. /** * Constructor */ public CharsetDetector() { } /** * Set the declared encoding for charset detection. * The declared encoding of an input text is an encoding obtained * from an http header or xml declaration or similar source that * can be provided as additional information to the charset detector. * A match between a declared encoding and a possible detected encoding * will raise the quality of that detected encoding by a small delta, * and will also appear as a "reason" for the match. *
* A declared encoding that is incompatible with the input data being * analyzed will not be added to the list of possible encodings. * * @param encoding The declared encoding */ public CharsetDetector setDeclaredEncoding(String encoding) { fDeclaredEncoding = encoding; return this; } /** * Set the input text (byte) data whose charset is to be detected. * * @param in the input text of unknown encoding * * @return This CharsetDetector */ public CharsetDetector setText(byte [] in) { fRawInput = in; fRawLength = in.length; return this; } private static final int kBufSize = 8000; /** * Set the input text (byte) data whose charset is to be detected. *
* The input stream that supplies the character data must have markSupported() * == true; the charset detection process will read a small amount of data, * then return the stream to its original position via * the InputStream.reset() operation. The exact amount that will * be read depends on the characteristics of the data itself. * * @param in the input text of unknown encoding * * @return This CharsetDetector */ public CharsetDetector setText(InputStream in) throws IOException { fInputStream = in; fInputStream.mark(kBufSize); fRawInput = new byte[kBufSize]; // Always make a new buffer because the // previous one may have come from the caller, // in which case we can't touch it. fRawLength = 0; int remainingLength = kBufSize; while (remainingLength > 0 ) { // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop. int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength); if (bytesRead <= 0) { break; } fRawLength += bytesRead; remainingLength -= bytesRead; } fInputStream.reset(); return this; } /** * Return the charset that best matches the supplied input data. * * Note though, that because the detection * only looks at the start of the input data, * there is a possibility that the returned charset will fail to handle * the full set of input data. *
* Raise an exception if *
null
if there are no matches.
*/
public CharsetMatch detect() {
    // TODO: A better implementation would be to copy the detect loop from
    //       detectAll(), and cut it short as soon as a match with a high confidence
    //       is found. This is something to be done later, after things are otherwise
    //       working.

    // detectAll() supplies the candidates; its first element is handed back as the answer.
    final CharsetMatch[] candidates = detectAll();

    // A missing or empty candidate array both mean "no match"; report that as null.
    final boolean nothingFound = (candidates == null) || (candidates.length == 0);
    return nothingFound ? null : candidates[0];
}
/*
 * NOTE(review): the description below was found fused onto the front of the
 * getReader() comment. It reads like the documentation of detectAll(), the
 * method detect() delegates to, but no detectAll() body is present at this
 * point in this copy of the file, so the text is preserved here as a plain
 * comment instead of being attributed to getReader(). The list that followed
 * "Raise an exception if" is missing as well. TODO: restore from upstream.
 *
 *   Return an array of all charsets that appear to be plausible
 *   matches with the input data. The array is ordered with the
 *   best quality match first.
 *
 *   Raise an exception if ...
 */

/**
 * Detect the charset of an input stream and return a Reader obtained from
 * the best match.
 * <p>
 * This is a convenience method that is equivalent to
 * {@code this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();}
 * <p>
 * For the input stream that supplies the character data, markSupported()
 * must be true; the charset detection will read a small amount of data,
 * then return the stream to its original position via
 * the InputStream.reset() operation. The exact amount that will
 * be read depends on the characteristics of the data itself.
 * <p>
 * The original comment states that an exception is raised if no charsets
 * appear to match the input data; as written, the code returns
 * {@code null} in that case, and also when an {@code IOException} occurs.
 *
 * @param in The source of the byte data in the unknown charset.
 *
 * @param declaredEncoding A declared encoding for the data, if available,
 *        or null or an empty string if none is available.
 *
 * @return the Reader supplied by the best match, or {@code null} if there
 *         is no match or an {@code IOException} was caught.
 */
public Reader getReader(InputStream in, String declaredEncoding) {
    fDeclaredEncoding = declaredEncoding;

    Reader result = null;
    try {
        setText(in);
        CharsetMatch best = detect();
        if (best != null) {
            result = best.getReader();
        }
    } catch (IOException e) {
        // An I/O failure is reported to the caller as "no reader".
        result = null;
    }
    return result;
}

/**
 * Detect the charset of a byte array and return a String
 * containing the converted input data.
 * <p>
 * This is a convenience method that is equivalent to
 * {@code this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();}
 * <p>
 * The original comment states that an exception is raised if no charsets
 * appear to match the input data; as written, the code returns
 * {@code null} in that case, and also when an {@code IOException} occurs.
 *
 * @param in The source of the byte data in the unknown charset.
 *
 * @param declaredEncoding A declared encoding for the data, if available,
 *        or null or an empty string if none is available.
 *
 * @return the String supplied by the best match, or {@code null} if there
 *         is no match or an {@code IOException} was caught.
 */
public String getString(byte[] in, String declaredEncoding) {
    fDeclaredEncoding = declaredEncoding;

    String decoded = null;
    try {
        setText(in);
        CharsetMatch best = detect();
        if (best != null) {
            // -1 is passed straight through as the length argument of CharsetMatch.getString.
            decoded = best.getString(-1);
        }
    } catch (IOException e) {
        // An I/O failure is reported to the caller as "no string".
        decoded = null;
    }
    return decoded;
}
/**
 * Get the names of all charsets supported by the {@code CharsetDetector} class.
 * <p>
 * <b>Note:</b> Multiple different charset encodings in a same family may use
 * a single shared name in this implementation. For example, this method returns
 * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
 * (Windows Latin 1). However, actual detection result could be "windows-1252"
 * when the input data matches Latin 1 code points with any points only available
 * in "windows-1252".
 * <p>
 * A new array is built on every call, holding one name per entry of the
 * recognizer list, in list order.
 *
 * @return an array of the names of all charsets supported by the
 *         {@code CharsetDetector} class.
 */
public static String[] getAllDetectableCharsets() {
    final String[] names = new String[ALL_CS_RECOGNIZERS.size()];
    int idx = 0;
    while (idx < names.length) {
        names[idx] = ALL_CS_RECOGNIZERS.get(idx).recognizer.getName();
        idx++;
    }
    return names;
}
/**
 * Report whether input filtering is currently switched on.
 *
 * @return {@code true} if input text will be filtered.
 *
 * @see #enableInputFilter
 */
public boolean inputFilterEnabled() {
    // The filter switch is the tag-stripping flag itself.
    return fStripTags;
}
/**
 * Turn filtering of input text on or off. If filtering is enabled,
 * text within angle brackets ("&lt;" and "&gt;") will be removed
 * before detection.
 *
 * @param filter {@code true} to enable input text filtering.
 *
 * @return The previous setting.
 */
public boolean enableInputFilter(boolean filter) {
    // Remember the old value first so it can be handed back after the switch is flipped.
    final boolean before = fStripTags;
    fStripTags = filter;
    return before;
}
/*
 * MungeInput - after getting a set of raw input data to be analyzed, preprocess
 * it by removing what appears to be html markup.
 *
 * Reads fRawInput / fRawLength and, when fStripTags is set, writes the bytes
 * found outside of <...> into fInputBytes and records their count in fInputLen.
 * If the input then looks unmarked-up, or like almost nothing but markup, the
 * method falls back to working on the raw input (see the condition near the end).
 *
 * NOTE(review): the end of this method is truncated in this copy of the file;
 * only the part that is visible is documented here.
 */
private void MungeInput() {
// srci: read position in fRawInput; dsti: write position in fInputBytes.
int srci = 0;
int dsti = 0;
byte b;
// True while the scan is between a '<' and the next '>'.
boolean inMarkup = false;
// openTags counts every '<' seen; badTags counts each '<' seen while already inside markup.
int openTags = 0;
int badTags = 0;
//
// html / xml markup stripping.
// quick and dirty, not 100% accurate, but hopefully good enough, statistically.
// discard everything within < brackets >
// Count how many total '<' and illegal (nested) '<' occur, so we can make some
// guess as to whether the input was actually marked up at all.
if (fStripTags) {
// Stop at the end of the raw data or when the output buffer is full, whichever comes first.
for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
b = fRawInput[srci];
if (b == (byte)'<') {
if (inMarkup) {
badTags++;
}
inMarkup = true;
openTags++;
}
// The flag is set before this test, so a '<' is never copied to the output.
if (! inMarkup) {
fInputBytes[dsti++] = b;
}
// The flag is cleared after the copy test, so a '>' that closes markup is dropped,
// while a '>' met outside markup has already been copied above.
if (b == (byte)'>') {
inMarkup = false;
}
}
fInputLen = dsti;
}
//
// If it looks like this input wasn't marked up, or if it looks like it's
// essentially nothing but markup abandon the markup stripping.
// Detection will have to work on the unstripped input.
//
// Concretely: fewer than 5 '<' seen, or nested '<' outnumber one fifth (integer
// division) of all '<', or stripping left under 100 bytes out of more than 600 raw bytes.
if (openTags<5 || openTags/5 < badTags ||
(fInputLen < 100 && fRawLength>600)) {
// limit = min(fRawLength, kBufSize)
int limit = fRawLength;
if (limit > kBufSize) {
limit = kBufSize;
}
// NOTE(review): the loop that begins on the next line is cut off in this copy of
// the file (its condition and body are missing); restore it from the upstream source.
for (srci=0; srcitrue
* to enable, or {@code false} to disable the charset encoding.
* @return A reference to this {@code CharsetDetector}.
* @throws IllegalArgumentException when the name of charset encoding is
* not supported.
*
* @deprecated This API is ICU internal only.
* @hide draft / provisional / internal are hidden on Android
*/
@Deprecated
public CharsetDetector setDetectableCharset(String encoding, boolean enabled) {
    // Find the recognizer whose name equals the requested encoding, and note
    // whether the requested state is the same as that recognizer's default.
    int target = -1;
    boolean sameAsDefault = false;
    int idx = 0;
    while (target < 0 && idx < ALL_CS_RECOGNIZERS.size()) {
        CSRecognizerInfo info = ALL_CS_RECOGNIZERS.get(idx);
        if (info.recognizer.getName().equals(encoding)) {
            target = idx;
            sameAsDefault = (info.isDefaultEnabled == enabled);
        }
        idx++;
    }

    if (target < 0) {
        // No recognizer carries that name.
        throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\"");
    }

    if (fEnabledRecognizers == null) {
        if (sameAsDefault) {
            // No override table exists and none is needed: the defaults already say this.
            return this;
        }
        // First non-default request: create the override table, seeded with every default.
        fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()];
        for (int k = 0; k < ALL_CS_RECOGNIZERS.size(); k++) {
            fEnabledRecognizers[k] = ALL_CS_RECOGNIZERS.get(k).isDefaultEnabled;
        }
    }

    // An override table exists at this point; record the requested state in it.
    fEnabledRecognizers[target] = enabled;
    return this;
}
}