From cb24d478810c665d4eb4f29a52b7d5a2feb6635b Mon Sep 17 00:00:00 2001 From: teble Date: Mon, 16 Oct 2023 13:52:30 +0800 Subject: [PATCH] Support MUtf8 format string parsing(@kotlin.Metadata d1 Protobuf String) --- .../result/AnnotationEncodeArrayData.kt | 16 ++- .../org/luckypray/dexkit/util/MUtf8Util.java | 134 ++++++++++++++++++ .../util/StringUnicodeEncoderDecoder.java | 107 ++++++++++++++ 3 files changed, 252 insertions(+), 5 deletions(-) create mode 100644 dexkit/src/main/java/org/luckypray/dexkit/util/MUtf8Util.java create mode 100644 dexkit/src/main/java/org/luckypray/dexkit/util/StringUnicodeEncoderDecoder.java diff --git a/dexkit/src/main/java/org/luckypray/dexkit/result/AnnotationEncodeArrayData.kt b/dexkit/src/main/java/org/luckypray/dexkit/result/AnnotationEncodeArrayData.kt index 114dfe97..64d0f69a 100644 --- a/dexkit/src/main/java/org/luckypray/dexkit/result/AnnotationEncodeArrayData.kt +++ b/dexkit/src/main/java/org/luckypray/dexkit/result/AnnotationEncodeArrayData.kt @@ -41,6 +41,8 @@ import org.luckypray.dexkit.InnerFieldMeta import org.luckypray.dexkit.InnerMethodMeta import org.luckypray.dexkit.query.enums.AnnotationEncodeValueType import org.luckypray.dexkit.result.base.BaseData +import org.luckypray.dexkit.util.MUtf8Util +import org.luckypray.dexkit.util.StringUnicodeEncoderDecoder class AnnotationEncodeArrayData private constructor( bridge: DexKitBridge, @@ -65,13 +67,17 @@ class AnnotationEncodeArrayData private constructor( AnnotationEncodeValueType.FloatValue -> (encodeValue.value(InnerEncodeValueFloat()) as InnerEncodeValueFloat).value AnnotationEncodeValueType.DoubleValue -> (encodeValue.value(InnerEncodeValueDouble()) as InnerEncodeValueDouble).value AnnotationEncodeValueType.StringValue -> { + val encodeValueString = (encodeValue.value(InnerEncodeValueString()) as InnerEncodeValueString) try { - (encodeValue.value(InnerEncodeValueString()) as InnerEncodeValueString).value!! + encodeValueString.value!! 
} catch (e: IllegalArgumentException) { - if (e.message?.contains("Invalid UTF-8") == false) { - throw e - } - "" + // try to unescape unicode + runCatching { + encodeValueString.valueAsByteBuffer.let { + val mUtf8String = MUtf8Util.decode(it) + StringUnicodeEncoderDecoder.encodeStringToUnicodeSequence(mUtf8String) + } + }.getOrElse { "" } } } AnnotationEncodeValueType.TypeValue -> ClassData.from(bridge, encodeValue.value(InnerClassMeta()) as InnerClassMeta) diff --git a/dexkit/src/main/java/org/luckypray/dexkit/util/MUtf8Util.java b/dexkit/src/main/java/org/luckypray/dexkit/util/MUtf8Util.java new file mode 100644 index 00000000..9b798a91 --- /dev/null +++ b/dexkit/src/main/java/org/luckypray/dexkit/util/MUtf8Util.java @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * DexKit - An high-performance runtime parsing library for dex + * implemented in C++ + * Copyright (C) 2022-2023 LuckyPray + * https://github.com/LuckyPray/DexKit + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, either + * version 3 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see + * <https://www.gnu.org/licenses/>. + * <https://github.com/LuckyPray/DexKit/blob/master/LICENSE>. + */ +package org.luckypray.dexkit.util; + +import java.io.UTFDataFormatException; +import java.nio.ByteBuffer; + +/** + * Modified UTF-8 as described in the dex file format spec. + * + * <p>
/**
 * Modified UTF-8 (MUTF-8) as described in the dex file format spec.
 *
 * <p>Derived from libcore's MUTF-8 encoder at java.nio.charset.ModifiedUtf8.
 *
 * <p>As implemented here the encoding differs from standard UTF-8 in two ways: U+0000 is written
 * as the two bytes {@code C0 80} (so encoded data never contains a raw zero byte), and every
 * UTF-16 code unit is encoded on its own in at most three bytes, i.e. surrogates are never
 * combined into a four-byte sequence.
 */
public final class MUtf8Util {
    private MUtf8Util() {}

    /**
     * Decodes MUTF-8 bytes from {@code in}, starting at its current position, until a delimiter
     * byte 0x00 is read or the buffer is exhausted. The buffer position is advanced past every
     * byte that was read (including the delimiter, when present).
     *
     * @param in buffer positioned at the first encoded byte
     * @return a new string containing the decoded characters
     * @throws UTFDataFormatException if a lead byte is not a valid one-, two- or three-byte
     *     lead, if a continuation byte is malformed, or if the buffer ends in the middle of a
     *     multi-byte sequence
     */
    public static String decode(ByteBuffer in) throws UTFDataFormatException {
        StringBuilder sb = new StringBuilder();
        while (in.hasRemaining()) {
            int a = in.get() & 0xff;
            if (a == 0) {
                // A raw zero byte is the string terminator; U+0000 itself arrives as C0 80.
                break;
            }
            if (a < 0x80) {
                sb.append((char) a);
            } else if ((a & 0xe0) == 0xc0) {
                requireRemaining(in, 1);
                int b = in.get() & 0xff;
                if ((b & 0xC0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte");
                }
                sb.append((char) (((a & 0x1F) << 6) | (b & 0x3F)));
            } else if ((a & 0xf0) == 0xe0) {
                requireRemaining(in, 2);
                int b = in.get() & 0xff;
                int c = in.get() & 0xff;
                if (((b & 0xC0) != 0x80) || ((c & 0xC0) != 0x80)) {
                    throw new UTFDataFormatException("bad second or third byte");
                }
                sb.append((char) (((a & 0x0F) << 12) | ((b & 0x3F) << 6) | (c & 0x3F)));
            } else {
                // Stray continuation bytes and four-byte leads are not valid here.
                throw new UTFDataFormatException("bad byte");
            }
        }
        return sb.toString();
    }

    /**
     * Fails with the method's declared checked exception, instead of letting
     * {@code ByteBuffer.get()} raise an unchecked BufferUnderflowException, when the input stops
     * in the middle of a multi-byte sequence.
     */
    private static void requireRemaining(ByteBuffer in, int needed) throws UTFDataFormatException {
        if (in.remaining() < needed) {
            throw new UTFDataFormatException("truncated multi-byte sequence");
        }
    }

    /**
     * Returns the number of bytes the modified UTF8 representation of 's' would take.
     *
     * @param s string to measure
     * @param shortLength when true, fail as soon as the running total exceeds 65535 bytes
     * @throws UTFDataFormatException if {@code shortLength} is set and the limit is exceeded
     */
    private static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
        long result = 0;
        final int length = s.length();
        for (int i = 0; i < length; ++i) {
            char ch = s.charAt(i);
            if (ch != 0 && ch <= 127) { // U+0000 uses two bytes.
                ++result;
            } else if (ch <= 2047) {
                result += 2;
            } else {
                result += 3;
            }
            if (shortLength && result > 65535) {
                throw new UTFDataFormatException("String more than 65535 UTF bytes long");
            }
        }
        return result;
    }

    /**
     * Encodes the modified UTF-8 bytes corresponding to {@code s} into {@code dst}, starting at
     * {@code offset}. No terminating zero byte is written. {@code dst} must have room for the
     * whole encoding; otherwise the array write fails with an ArrayIndexOutOfBoundsException.
     *
     * @param dst destination array
     * @param offset index in {@code dst} of the first byte to write
     * @param s string to encode
     */
    public static void encode(byte[] dst, int offset, String s) {
        final int length = s.length();
        for (int i = 0; i < length; i++) {
            char ch = s.charAt(i);
            if (ch != 0 && ch <= 127) { // U+0000 uses two bytes.
                dst[offset++] = (byte) ch;
            } else if (ch <= 2047) {
                dst[offset++] = (byte) (0xc0 | (0x1f & (ch >> 6)));
                dst[offset++] = (byte) (0x80 | (0x3f & ch));
            } else {
                dst[offset++] = (byte) (0xe0 | (0x0f & (ch >> 12)));
                dst[offset++] = (byte) (0x80 | (0x3f & (ch >> 6)));
                dst[offset++] = (byte) (0x80 | (0x3f & ch));
            }
        }
    }

    /**
     * Returns an array containing the modified UTF-8 form of {@code s}.
     *
     * @param s string to encode
     * @return a newly allocated array holding exactly the encoded bytes
     * @throws UTFDataFormatException if the encoded form would be longer than 65535 bytes
     */
    public static byte[] encode(String s) throws UTFDataFormatException {
        int utfCount = (int) countBytes(s, true);
        byte[] result = new byte[utfCount];
        encode(result, 0, s);
        return result;
    }
}
// ---- Remainder of the patch text that shared the last source line with this class, kept verbatim ----
// \ No newline at end of file
// diff --git a/dexkit/src/main/java/org/luckypray/dexkit/util/StringUnicodeEncoderDecoder.java b/dexkit/src/main/java/org/luckypray/dexkit/util/StringUnicodeEncoderDecoder.java
// new file mode 100644
// index 00000000..947257a0
// --- /dev/null
// +++ b/dexkit/src/main/java/org/luckypray/dexkit/util/StringUnicodeEncoderDecoder.java
// @@ -0,0 +1,107 @@
// +package org.luckypray.dexkit.util;
// +
// +/**
// + * This class provides Unicode conversion utility methods that allow to convert a string into Unicode sequence and vice-versa.
/**
 * This class provides Unicode conversion utility methods that convert a string into a sequence of
 * Unicode escape codes and vice-versa. Each code is the prefix "backslash + 'u'" followed by the
 * hexadecimal value of one code point. (See methods descriptions for details)
 *
 * @author Michael Gantman
 */
public class StringUnicodeEncoderDecoder {
    private static final String UNICODE_PREFIX = "\\u";
    private static final String UPPER_CASE_UNICODE_PREFIX = "\\U";
    /** Regular expression matching the literal prefix; used to split a sequence into codes. */
    private static final String DELIMITER = "\\\\u";

    /**
     * Converts a {@link String} of characters in any language into a String that contains a
     * sequence of Unicode codes corresponding to the characters of the original String. For
     * example "Hello" becomes the five codes 0048, 0065, 006c, 006c, 006f, each preceded by the
     * prefix, with nothing between them.
     *
     * <p>The input is processed code point by code point. A valid surrogate pair is written as a
     * single code with five or six hexadecimal digits (not as two four-digit codes). An unpaired
     * surrogate is written as its own four-digit code and the characters around it are kept.
     *
     * @param txt {@link String} that contains a sequence of characters to convert
     * @return {@link String} that contains the sequence of codes in lower-case hexadecimal,
     *     padded with leading zeros to at least four digits, with no leading or trailing white
     *     space. Null or empty input yields an empty String
     */
    public static String encodeStringToUnicodeSequence(String txt) {
        if (txt == null || txt.isEmpty()) {
            return "";
        }
        StringBuilder result = new StringBuilder(txt.length() * 6);
        int i = 0;
        while (i < txt.length()) {
            int codePoint = txt.codePointAt(i);
            appendCodePointAsUnicodeString(result, codePoint);
            // Advance by the number of chars this code point really occupies: 2 for a valid
            // pair, 1 otherwise. Skipping 2 on every high surrogate would drop the character
            // that follows an unpaired one.
            i += Character.charCount(codePoint);
        }
        return result.toString();
    }

    /**
     * Converts a {@link String} that contains a sequence of Unicode codes into a String of the
     * corresponding characters. This is the reverse of
     * {@link #encodeStringToUnicodeSequence(String)}: any String produced by that method,
     * including the empty String, is converted back to the original text.
     *
     * @param unicodeSequence {@link String} that contains a sequence of Unicode codes. Each code
     *     must be in hexadecimal format and must be preceded by the "backslash + 'u'" prefix (an
     *     upper-case 'U' is accepted as well). Leading and trailing whitespace of the whole
     *     String and whitespace around individual codes is ignored. The first two characters of
     *     the trimmed input are assumed to be the prefix and are removed without being checked
     * @return {@link String} that contains the characters corresponding to the codes; an empty
     *     String if the input is empty or contains only whitespace
     * @throws IllegalArgumentException if the input is null or in invalid format, for example
     *     if a code is not hexadecimal or is not a valid Unicode code point
     */
    public static String decodeUnicodeSequenceToString(String unicodeSequence) throws IllegalArgumentException {
        StringBuilder result = new StringBuilder();
        try {
            String normalized = replaceUpperCaseUWithLowerCase(unicodeSequence).trim();
            if (normalized.isEmpty()) {
                // Mirrors the encoder, which maps empty text to an empty sequence.
                return "";
            }
            normalized = normalized.substring(UNICODE_PREFIX.length());
            for (String codePointStr : normalized.split(DELIMITER)) {
                result.append(Character.toChars(Integer.parseInt(codePointStr.trim(), 16)));
            }
        } catch (Exception e) {
            // Deliberately broad: null input, malformed numbers and invalid code points are all
            // reported to the caller as the single documented exception type.
            throw new IllegalArgumentException("Error occurred while converting unicode sequence String to String", e);
        }
        return result.toString();
    }

    /**
     * Replaces every upper-case prefix (backslash + 'U') with the lower-case one so that the
     * decoder only has to deal with a single delimiter. Returns null for null input.
     */
    private static String replaceUpperCaseUWithLowerCase(String unicodeSequence) {
        if (unicodeSequence == null) {
            return null;
        }
        // Literal replacement; no regular expression is needed.
        return unicodeSequence.replace(UPPER_CASE_UNICODE_PREFIX, UNICODE_PREFIX);
    }

    /**
     * Appends the code for one code point to {@code out}: the prefix followed by the lower-case
     * hexadecimal value, left-padded with zeros to four digits. Values that need more than four
     * digits are written unpadded. For example the value 32 (decimal) is written as the prefix
     * followed by 0020.
     *
     * @param out builder that receives the code
     * @param codePoint a unicode code value
     */
    private static void appendCodePointAsUnicodeString(StringBuilder out, int codePoint) {
        String codePointHexStr = Integer.toHexString(codePoint);
        out.append(UNICODE_PREFIX);
        for (int pad = codePointHexStr.length(); pad < 4; pad++) {
            out.append('0');
        }
        out.append(codePointHexStr);
    }
}