Support MUtf8 format string parsing(@kotlin.Metadata d1 Protobuf String)

LuckyPray · Oct 16, 2023 · cb24d47 · cb24d47
1 parent c2af9cd
commit cb24d47
Show file tree

Hide file tree

Showing 3 changed files with 252 additions and 5 deletions.
diff --git a/dexkit/src/main/java/org/luckypray/dexkit/result/AnnotationEncodeArrayData.kt b/dexkit/src/main/java/org/luckypray/dexkit/result/AnnotationEncodeArrayData.kt
@@ -41,6 +41,8 @@ import org.luckypray.dexkit.InnerFieldMeta
 import org.luckypray.dexkit.InnerMethodMeta
 import org.luckypray.dexkit.query.enums.AnnotationEncodeValueType
 import org.luckypray.dexkit.result.base.BaseData
+import org.luckypray.dexkit.util.MUtf8Util
+import org.luckypray.dexkit.util.StringUnicodeEncoderDecoder
 
 class AnnotationEncodeArrayData private constructor(
     bridge: DexKitBridge,
@@ -65,13 +67,17 @@ class AnnotationEncodeArrayData private constructor(
                         AnnotationEncodeValueType.FloatValue -> (encodeValue.value(InnerEncodeValueFloat()) as InnerEncodeValueFloat).value
                         AnnotationEncodeValueType.DoubleValue -> (encodeValue.value(InnerEncodeValueDouble()) as InnerEncodeValueDouble).value
                         AnnotationEncodeValueType.StringValue -> {
+                            val encodeValueString = (encodeValue.value(InnerEncodeValueString()) as InnerEncodeValueString)
                             try {
-                                (encodeValue.value(InnerEncodeValueString()) as InnerEncodeValueString).value!!
+                                encodeValueString.value!!
                             } catch (e: IllegalArgumentException) {
-                                if (e.message?.contains("Invalid UTF-8") == false) {
-                                    throw e
-                                }
-                                ""
+                                // try to unescape unicode
+                                runCatching {
+                                    encodeValueString.valueAsByteBuffer.let {
+                                        val mUtf8String = MUtf8Util.decode(it)
+                                        StringUnicodeEncoderDecoder.encodeStringToUnicodeSequence(mUtf8String)
+                                    }
+                                }.getOrElse { "" }
                             }
                         }
                         AnnotationEncodeValueType.TypeValue -> ClassData.from(bridge, encodeValue.value(InnerClassMeta()) as InnerClassMeta)

diff --git a/dexkit/src/main/java/org/luckypray/dexkit/util/MUtf8Util.java b/dexkit/src/main/java/org/luckypray/dexkit/util/MUtf8Util.java
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * DexKit - An high-performance runtime parsing library for dex
+ * implemented in C++
+ * Copyright (C) 2022-2023 LuckyPray
+ * https://github.com/LuckyPray/DexKit
+ *
+ * This program is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see
+ * <https://www.gnu.org/licenses/>.
+ * <https://github.com/LuckyPray/DexKit/blob/master/LICENSE>.
+ */
+package org.luckypray.dexkit.util;
+
+import java.io.UTFDataFormatException;
+import java.nio.ByteBuffer;
+
+/**
+ * Modified UTF-8 as described in the dex file format spec.
+ *
+ * <p>Derived from libcore's MUTF-8 encoder at java.nio.charset.ModifiedUtf8.
+ * <a href="https://source.android.com/docs/core/runtime/dex-format#mutf-8">MUTF-8</a>
+ */
+public final class MUtf8Util {
+    private MUtf8Util() {}
+    /**
+     * Decodes modified UTF-8 from {@code in} until a 0x00 delimiter is consumed or the buffer is
+     * exhausted, and returns a new string containing the decoded characters.
+     * @throws UTFDataFormatException on a bad byte or a sequence cut short by the end of the buffer
+     */
+    public static String decode(ByteBuffer in) throws UTFDataFormatException {
+        StringBuilder sb = new StringBuilder();
+        while (in.hasRemaining()) {
+            char a = (char) (in.get() & 0xff);
+            if (a == 0) {
+                break;
+            }
+            if (a < '\u0080') {
+                sb.append(a);
+            } else if ((a & 0xe0) == 0xc0) {
+                if (in.remaining() < 1) {
+                    throw new UTFDataFormatException("truncated two-byte sequence");
+                }
+                int b = in.get() & 0xff;
+                if ((b & 0xC0) != 0x80) {
+                    throw new UTFDataFormatException("bad second byte");
+                }
+                sb.append((char) (((a & 0x1F) << 6) | (b & 0x3F)));
+            } else if ((a & 0xf0) == 0xe0) {
+                if (in.remaining() < 2) {
+                    throw new UTFDataFormatException("truncated three-byte sequence");
+                }
+                int b = in.get() & 0xff;
+                int c = in.get() & 0xff;
+                if (((b & 0xC0) != 0x80) || ((c & 0xC0) != 0x80)) {
+                    throw new UTFDataFormatException("bad second or third byte");
+                }
+                sb.append((char) (((a & 0x0F) << 12) | ((b & 0x3F) << 6) | (c & 0x3F)));
+            } else {
+                throw new UTFDataFormatException("bad byte");
+            }
+        }
+        return sb.toString();
+    }
+    /**
+     * Returns the number of bytes the modified UTF-8 representation of {@code s} would take.
+     * When {@code shortLength} is set, fails as soon as the running count exceeds 65535.
+     */
+    private static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
+        long result = 0;
+        for (int i = 0; i < s.length(); ++i) {
+            char ch = s.charAt(i);
+            result += (ch != 0 && ch <= 127) ? 1 : (ch <= 2047) ? 2 : 3; // U+0000 uses two bytes.
+            if (shortLength && result > 65535) {
+                throw new UTFDataFormatException("String more than 65535 UTF bytes long");
+            }
+        }
+        return result;
+    }
+    /**
+     * Encodes the modified UTF-8 bytes corresponding to {@code s} into {@code dst}, starting at
+     * {@code offset}. {@code dst} must have room for the encoded bytes from {@code offset} on.
+     */
+    public static void encode(byte[] dst, int offset, String s) {
+        final int length = s.length();
+        for (int i = 0; i < length; i++) {
+            char ch = s.charAt(i);
+            if (ch != 0 && ch <= 127) { // U+0000 uses two bytes.
+                dst[offset++] = (byte) ch;
+            } else if (ch <= 2047) {
+                dst[offset++] = (byte) (0xc0 | (0x1f & (ch >> 6)));
+                dst[offset++] = (byte) (0x80 | (0x3f & ch));
+            } else {
+                dst[offset++] = (byte) (0xe0 | (0x0f & (ch >> 12)));
+                dst[offset++] = (byte) (0x80 | (0x3f & (ch >> 6)));
+                dst[offset++] = (byte) (0x80 | (0x3f & ch));
+            }
+        }
+    }
+    /**
+     * Returns an array containing the <i>modified UTF-8</i> form of {@code s}.
+     * @throws UTFDataFormatException if the encoded form would be more than 65535 bytes long
+     */
+    public static byte[] encode(String s) throws UTFDataFormatException {
+        int utfCount = (int) countBytes(s, true);
+        byte[] result = new byte[utfCount];
+        encode(result, 0, s);
+        return result;
+    }
+}
diff --git a/dexkit/src/main/java/org/luckypray/dexkit/util/StringUnicodeEncoderDecoder.java b/dexkit/src/main/java/org/luckypray/dexkit/util/StringUnicodeEncoderDecoder.java
@@ -0,0 +1,107 @@
+package org.luckypray.dexkit.util;
+
+/**
+ * This class provides Unicode conversion utility methods that allow to convert a string into Unicode sequence and vice-versa. (See methods
+ * descriptions for details)
+ *
+ * @author Michael Gantman
+ */
+public class StringUnicodeEncoderDecoder {
+    private final static String UNICODE_PREFIX = "\\u";
+    private final static String UPPER_CASE_UNICODE_PREFIX = "\\U";
+    private final static String UPPER_CASE_UNICODE_PREFIX_REGEX = "\\\\U";
+    private final static String DELIMITER = "\\\\u";
+
+    /**
+     * This method converts a {@link String} of characters in any language into a String that contains a sequence of Unicode codes corresponding to
+     * characters in the original String. For example String "Hello" will be converted into a String
+     * "\u005c\u00750048\u005c\u00750065\u005c\u0075006c\u005c\u0075006c\u005c\u0075006f". Null or empty String conversion will return an empty String.
+     * A surrogate pair is emitted as a single code with more than four hexadecimal digits; an unpaired surrogate is emitted as its own code and the
+     * character that follows it is encoded normally.
+     *
+     * @param txt {@link String} that contains a sequence of characters to convert
+     * @return {@link String} that contains a sequence of unicode codes corresponding to the characters in the original String. Each code will be in
+     *         hexadecimal format preceded by prefix "\u005c\u0075" with no spaces between them. The String also will have no leading or trailing
+     *         white spaces
+     */
+    public static String encodeStringToUnicodeSequence(String txt) {
+        StringBuilder result = new StringBuilder();
+        if (txt != null) {
+            for (int i = 0; i < txt.length(); ) {
+                int codePoint = Character.codePointAt(txt, i);
+                result.append(convertCodePointToUnicodeString(codePoint));
+                // Advance by the chars actually consumed: two for a surrogate pair, one otherwise (including an unpaired surrogate).
+                i += Character.charCount(codePoint);
+            }
+        }
+        return result.toString();
+    }
+
+    /**
+     * This method converts a {@link String} that contains a sequence of Unicode codes into a String of corresponding characters. For example a String
+     * "\u005c\u00750048\u005c\u00750065\u005c\u0075006c\u005c\u0075006c\u005c\u0075006f" will be converted into String "Hello" by this method. This method performs reverse conversion of the one
+     * performed by method {@link #encodeStringToUnicodeSequence(String)} I.e. Any textual String converted into sequence of Unicode codes by method
+     * {@link #encodeStringToUnicodeSequence(String)} may be retrieved back by invoking this method on that Unicode sequence String. To keep that
+     * round trip intact for the empty String, a null or blank input yields an empty String.
+     *
+     * @param unicodeSequence {@link String} That contains sequence of Unicode codes. Each code must be in hexadecimal format and must be preceded by
+     *                        "'backslash' + 'u'" prefix. (note that prefix '\U' is now valid as opposed to earlier versions). This method allows
+     *                        leading and trailing whitespaces for the whole String as well as spaces between codes. Those white spaces will be ignored.
+     * @return {@link String} That contains sequence of characters that correspond to the respective Unicode codes in the original String
+     * @throws IllegalArgumentException if input String is in invalid format. For example if it does not start with the prefix, any code is not in
+     *                                  hexadecimal format or the code is not a valid Unicode code (not valid code point).
+     */
+    public static String decodeUnicodeSequenceToString(String unicodeSequence) throws IllegalArgumentException {
+        // Mirror encodeStringToUnicodeSequence, which maps null and empty input to an empty String.
+        if (unicodeSequence == null || unicodeSequence.trim().isEmpty()) {
+            return "";
+        }
+        StringBuilder result = new StringBuilder();
+        try {
+            String normalized = replaceUpperCaseUWithLowerCase(unicodeSequence).trim();
+            // Reject a missing prefix instead of silently discarding the first two characters.
+            if (!normalized.startsWith(UNICODE_PREFIX)) {
+                throw new IllegalArgumentException("Unicode sequence must start with '" + UNICODE_PREFIX + "'");
+            }
+            for (String codePointStr : normalized.substring(UNICODE_PREFIX.length()).split(DELIMITER)) {
+                result.append(Character.toChars(Integer.parseInt(codePointStr.trim(), 16)));
+            }
+        } catch (IllegalArgumentException e) {
+            throw new IllegalArgumentException("Error occurred while converting unicode sequence String to String", e);
+        }
+        return result.toString();
+    }
+
+    /**
+     * Rewrites every upper-case prefix ('backslash' + 'U') to the lower-case form so that only one delimiter has to be handled while splitting.
+     *
+     * @param unicodeSequence the sequence to normalize, may be null
+     * @return the normalized sequence, or the argument itself when it is null or has no upper-case prefix
+     */
+    private static String replaceUpperCaseUWithLowerCase(String unicodeSequence) {
+        String result = unicodeSequence;
+        if (unicodeSequence != null && unicodeSequence.contains(UPPER_CASE_UNICODE_PREFIX)) {
+            result = unicodeSequence.replaceAll(UPPER_CASE_UNICODE_PREFIX_REGEX, DELIMITER);
+        }
+        return result;
+    }
+
+    /**
+     * This method converts an integer that holds a unicode code value into a String
+     *
+     * @param codePoint a unicode code value
+     * @return {@link String} that starts with prefix "'backslash' + 'u'" that follows with hexadecimal value of an integer. If the hexadecimal value
+     *         of an integer is less than four digits the value is padded with preceding zeros. For example if the integer has value 32 (decimal) it
+     *         will be converted into String "\u005c\u00750020"
+     */
+    private static String convertCodePointToUnicodeString(int codePoint) {
+        String codePointHexStr = Integer.toHexString(codePoint);
+        StringBuilder result = new StringBuilder(UNICODE_PREFIX);
+        // Left-pad to four digits; longer values (supplementary code points) are written as-is.
+        for (int i = codePointHexStr.length(); i < 4; i++) {
+            result.append('0');
+        }
+        result.append(codePointHexStr);
+        return result.toString();
+    }
+}