/*
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.modules.utils;

import java.io.UTFDataFormatException;

/**
 * Static helpers for converting between {@link String}s and <i>modified UTF-8</i> bytes.
 *
 * <p>As implemented here, modified UTF-8 differs from standard UTF-8 in two ways: U+0000 is
 * written as the two-byte sequence {@code 0xC0 0x80}, and every {@code char} (including each
 * half of a surrogate pair) is encoded on its own using at most three bytes.
 *
 * <p>This class only has static methods and cannot be instantiated.
 */
public final class ModifiedUtf8 {
    /** Largest char value written as a single byte (U+0000 is the exception; see below). */
    private static final int MAX_ONE_BYTE_CHAR = 0x7f;

    /** Largest char value written as two bytes. */
    private static final int MAX_TWO_BYTE_CHAR = 0x7ff;

    /** Largest encoded size whose length still fits in an unsigned 16-bit length prefix. */
    private static final int MAX_SHORT_LENGTH = 0xffff;

    /**
     * Decodes a byte array containing <i>modified UTF-8</i> bytes into a string.
     *
     * <p>Note that although this method decodes the (supposedly impossible) zero byte to U+0000,
     * that's what the RI does too.
     *
     * @param in the array holding the encoded bytes
     * @param out scratch buffer the decoded chars are written into, starting at index 0; it
     *     must be large enough for every decoded char ({@code utfSize} chars always suffices,
     *     because each char consumes at least one byte)
     * @param offset index in {@code in} of the first encoded byte
     * @param utfSize number of encoded bytes to consume from {@code in}
     * @return a new string built from the decoded chars
     * @throws UTFDataFormatException if a lead byte is not a valid one-, two- or three-byte
     *     lead, if a continuation byte is not of the form {@code 10xxxxxx}, or if a multi-byte
     *     sequence runs past {@code utfSize}
     */
    public static String decode(byte[] in, char[] out, int offset, int utfSize)
            throws UTFDataFormatException {
        int count = 0; // bytes consumed so far, relative to 'offset'
        int s = 0; // chars produced so far
        while (count < utfSize) {
            // Read the lead byte as an unsigned value so the range checks below are explicit
            // rather than relying on sign extension.
            final int a = in[offset + count++] & 0xff;
            if (a < 0x80) {
                // 0xxxxxxx: one byte. This also accepts a raw zero byte (see the method doc).
                out[s++] = (char) a;
            } else if ((a & 0xe0) == 0xc0) {
                // 110xxxxx 10xxxxxx: two bytes.
                if (count >= utfSize) {
                    throw new UTFDataFormatException("bad second byte at " + count);
                }
                final int b = in[offset + count++];
                if ((b & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + (count - 1));
                }
                out[s++] = (char) (((a & 0x1f) << 6) | (b & 0x3f));
            } else if ((a & 0xf0) == 0xe0) {
                // 1110xxxx 10xxxxxx 10xxxxxx: three bytes.
                if (count + 1 >= utfSize) {
                    throw new UTFDataFormatException("bad third byte at " + (count + 1));
                }
                final int b = in[offset + count++];
                final int c = in[offset + count++];
                if (((b & 0xc0) != 0x80) || ((c & 0xc0) != 0x80)) {
                    throw new UTFDataFormatException(
                            "bad second or third byte at " + (count - 2));
                }
                out[s++] = (char) (((a & 0x0f) << 12) | ((b & 0x3f) << 6) | (c & 0x3f));
            } else {
                // A stray continuation byte (10xxxxxx) or a lead byte of the form 1111xxxx.
                throw new UTFDataFormatException("bad byte at " + (count - 1));
            }
        }
        return new String(out, 0, s);
    }

    /**
     * Returns the number of bytes the modified UTF-8 representation of 's' would take. Note
     * that this is just the space for the bytes representing the characters, not the length
     * which precedes those bytes, because different callers represent the length differently,
     * as two, four, or even eight bytes. If {@code shortLength} is true, we'll throw an
     * exception if the string is too long for its length to be represented by a short.
     *
     * @param s the string to measure
     * @param shortLength whether to enforce the 65535-byte limit
     * @return the encoded size in bytes
     * @throws UTFDataFormatException if {@code shortLength} is true and the encoded size
     *     exceeds 65535 bytes
     */
    public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
        long result = 0;
        final int length = s.length();
        for (int i = 0; i < length; ++i) {
            final char ch = s.charAt(i);
            if (ch != 0 && ch <= MAX_ONE_BYTE_CHAR) { // U+0000 uses two bytes.
                ++result;
            } else if (ch <= MAX_TWO_BYTE_CHAR) {
                result += 2;
            } else {
                result += 3;
            }
            // Checked on every iteration so an over-long string is rejected as soon as the
            // limit is crossed rather than after scanning the whole string.
            if (shortLength && result > MAX_SHORT_LENGTH) {
                throw new UTFDataFormatException("String more than 65535 UTF bytes long");
            }
        }
        return result;
    }

    /**
     * Encodes the <i>modified UTF-8</i> bytes corresponding to string {@code s} into the
     * byte array {@code dst}, starting at the given {@code offset}.
     *
     * <p>No capacity check is made up front: the bytes are written one at a time, so
     * {@code dst} must have room for {@code countBytes(s, false)} bytes from {@code offset}.
     *
     * @param dst the array receiving the encoded bytes
     * @param offset index in {@code dst} at which the first encoded byte is written
     * @param s the string to encode
     */
    public static void encode(byte[] dst, int offset, String s) {
        final int length = s.length();
        for (int i = 0; i < length; i++) {
            final char ch = s.charAt(i);
            if (ch != 0 && ch <= MAX_ONE_BYTE_CHAR) { // U+0000 uses two bytes.
                dst[offset++] = (byte) ch;
            } else if (ch <= MAX_TWO_BYTE_CHAR) {
                dst[offset++] = (byte) (0xc0 | (0x1f & (ch >> 6)));
                dst[offset++] = (byte) (0x80 | (0x3f & ch));
            } else {
                dst[offset++] = (byte) (0xe0 | (0x0f & (ch >> 12)));
                dst[offset++] = (byte) (0x80 | (0x3f & (ch >> 6)));
                dst[offset++] = (byte) (0x80 | (0x3f & ch));
            }
        }
    }

    /** Not instantiable: this class only hosts static helpers. */
    private ModifiedUtf8() {
    }
}