-
Notifications
You must be signed in to change notification settings - Fork 60
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support MUtf8 format string parsing(@kotlin.Metadata d1 Protobuf String)
- Loading branch information
Showing
3 changed files
with
252 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
134 changes: 134 additions & 0 deletions
134
dexkit/src/main/java/org/luckypray/dexkit/util/MUtf8Util.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
/* | ||
* Copyright (C) 2011 The Android Open Source Project | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
/* | ||
* DexKit - A high-performance runtime parsing library for dex | ||
* implemented in C++ | ||
* Copyright (C) 2022-2023 LuckyPray | ||
* https://github.com/LuckyPray/DexKit | ||
* | ||
* This program is free software: you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation, either | ||
* version 3 of the License, or (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program. If not, see | ||
* <https://www.gnu.org/licenses/>. | ||
* <https://github.com/LuckyPray/DexKit/blob/master/LICENSE>. | ||
*/ | ||
package org.luckypray.dexkit.util; | ||
|
||
import java.io.UTFDataFormatException; | ||
import java.nio.ByteBuffer; | ||
|
||
/**
 * Modified UTF-8 as described in the dex file format spec.
 *
 * <p>Derived from libcore's MUTF-8 encoder at java.nio.charset.ModifiedUtf8.
 * <a href="https://source.android.com/docs/core/runtime/dex-format#mutf-8">MUTF-8</a>
 */
public final class MUtf8Util {
    private MUtf8Util() {}

    /**
     * Decodes modified UTF-8 bytes from {@code in}, starting at its current position, until
     * either a 0x00 delimiter byte is consumed or the buffer has no bytes remaining.
     *
     * <p>Only one-, two- and three-byte sequences are accepted. Each decoded sequence is
     * appended as a single UTF-16 code unit, so surrogate halves are passed through as-is.
     * The buffer position is advanced past every byte that was read, including the
     * delimiter and, on failure, the bytes of the offending sequence.
     *
     * @param in the buffer to read from
     * @return a new string containing the decoded characters
     * @throws UTFDataFormatException if a lead byte is not a valid one-, two- or three-byte
     *     lead, if a continuation byte is malformed, or if the buffer ends in the middle of
     *     a multi-byte sequence
     */
    public static String decode(ByteBuffer in) throws UTFDataFormatException {
        StringBuilder sb = new StringBuilder();
        while (in.hasRemaining()) {
            int a = in.get() & 0xff;
            if (a == 0) {
                break;
            }
            if (a < 0x80) {
                sb.append((char) a);
            } else if ((a & 0xe0) == 0xc0) {
                int b = nextByte(in);
                if ((b & 0xC0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte");
                }
                sb.append((char) (((a & 0x1F) << 6) | (b & 0x3F)));
            } else if ((a & 0xf0) == 0xe0) {
                int b = nextByte(in);
                int c = nextByte(in);
                if (((b & 0xC0) != 0x80) || ((c & 0xC0) != 0x80)) {
                    throw new UTFDataFormatException("bad second or third byte");
                }
                sb.append((char) (((a & 0x0F) << 12) | ((b & 0x3F) << 6) | (c & 0x3F)));
            } else {
                throw new UTFDataFormatException("bad byte");
            }
        }
        return sb.toString();
    }

    /**
     * Reads the next byte of a multi-byte sequence as an unsigned value.
     *
     * <p>A truncated sequence is reported as a {@link UTFDataFormatException} rather than
     * letting the buffer's unchecked underflow exception escape to the caller.
     */
    private static int nextByte(ByteBuffer in) throws UTFDataFormatException {
        if (!in.hasRemaining()) {
            throw new UTFDataFormatException("unexpected end of input in multi-byte sequence");
        }
        return in.get() & 0xff;
    }

    /**
     * Returns the number of bytes the modified UTF-8 representation of {@code s} would take.
     *
     * @param s the string to measure
     * @param shortLength if {@code true}, fail as soon as the running total exceeds 65535
     * @return the encoded length in bytes
     * @throws UTFDataFormatException if {@code shortLength} is set and the encoded form is
     *     longer than 65535 bytes
     */
    private static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
        long result = 0;
        final int length = s.length();
        for (int i = 0; i < length; ++i) {
            char ch = s.charAt(i);
            if (ch != 0 && ch <= 127) { // U+0000 uses two bytes.
                ++result;
            } else if (ch <= 2047) {
                result += 2;
            } else {
                result += 3;
            }
            if (shortLength && result > 65535) {
                throw new UTFDataFormatException("String more than 65535 UTF bytes long");
            }
        }
        return result;
    }

    /**
     * Encodes the modified UTF-8 bytes corresponding to {@code s} into {@code dst}, starting
     * at {@code offset}. Each UTF-16 code unit is encoded on its own as one, two or three
     * bytes; no terminating 0x00 byte is written.
     *
     * @param dst the destination array; it must have room for the whole encoded form
     * @param offset the index in {@code dst} of the first byte to write
     * @param s the string to encode
     */
    public static void encode(byte[] dst, int offset, String s) {
        final int length = s.length();
        for (int i = 0; i < length; i++) {
            char ch = s.charAt(i);
            if (ch != 0 && ch <= 127) { // U+0000 uses two bytes.
                dst[offset++] = (byte) ch;
            } else if (ch <= 2047) {
                dst[offset++] = (byte) (0xc0 | (0x1f & (ch >> 6)));
                dst[offset++] = (byte) (0x80 | (0x3f & ch));
            } else {
                dst[offset++] = (byte) (0xe0 | (0x0f & (ch >> 12)));
                dst[offset++] = (byte) (0x80 | (0x3f & (ch >> 6)));
                dst[offset++] = (byte) (0x80 | (0x3f & ch));
            }
        }
    }

    /**
     * Returns an array containing the <i>modified UTF-8</i> form of {@code s}.
     *
     * @param s the string to encode
     * @return a newly allocated array holding exactly the encoded bytes
     * @throws UTFDataFormatException if the encoded form is longer than 65535 bytes
     */
    public static byte[] encode(String s) throws UTFDataFormatException {
        // The cast is safe: countBytes(s, true) never returns more than 65535.
        int utfCount = (int) countBytes(s, true);
        byte[] result = new byte[utfCount];
        encode(result, 0, s);
        return result;
    }
}
107 changes: 107 additions & 0 deletions
107
dexkit/src/main/java/org/luckypray/dexkit/util/StringUnicodeEncoderDecoder.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
package org.luckypray.dexkit.util; | ||
|
||
/**
 * Utility methods for converting a string into a sequence of Unicode escape codes and back.
 * See the method descriptions for the exact format.
 *
 * @author Michael Gantman
 */
public class StringUnicodeEncoderDecoder {
    private final static String UNICODE_PREFIX = "\\u";
    private final static String UPPER_CASE_UNICODE_PREFIX = "\\U";
    /** Regular expression matching the lower-case prefix; used to split a sequence into codes. */
    private final static String DELIMITER = "\\\\u";
    private final static String DECODE_ERROR_MESSAGE =
            "Error occurred while converting unicode sequence String to String";

    /**
     * Converts a {@link String} into a String containing the Unicode code of each of its code
     * points. For example {@code "Hello"} becomes
     * "&#92;u0048&#92;u0065&#92;u006c&#92;u006c&#92;u006f". A {@code null} or empty input
     * yields an empty String.
     *
     * <p>Supplementary characters (valid surrogate pairs) are emitted as one code of five or
     * six hexadecimal digits. An unpaired surrogate is emitted as its own four-digit code and
     * the character that follows it is encoded normally.
     *
     * @param txt {@link String} that contains a sequence of characters to convert
     * @return {@link String} containing the lower-case hexadecimal code of every code point,
     *     each preceded by the 'backslash' + 'u' prefix and zero-padded to at least four
     *     digits, with no separators and no leading or trailing white space
     */
    public static String encodeStringToUnicodeSequence(String txt) {
        StringBuilder result = new StringBuilder();
        if (txt != null && !txt.isEmpty()) {
            int i = 0;
            while (i < txt.length()) {
                int codePoint = txt.codePointAt(i);
                appendCodePoint(result, codePoint);
                // Advance by the number of chars this code point actually occupies, so the
                // char after an unpaired high surrogate is not skipped.
                i += Character.charCount(codePoint);
            }
        }
        return result.toString();
    }

    /**
     * Converts a {@link String} containing a sequence of Unicode codes into the String of
     * corresponding characters. For example
     * "&#92;u0048&#92;u0065&#92;u006c&#92;u006c&#92;u006f" is converted into {@code "Hello"}.
     * This is the reverse of {@link #encodeStringToUnicodeSequence(String)} for any non-empty
     * text.
     *
     * @param unicodeSequence {@link String} that contains a sequence of Unicode codes. Each
     *     code must be in hexadecimal format and must be preceded by the 'backslash' + 'u'
     *     prefix (an upper-case 'U' is accepted as well). Leading and trailing white space
     *     around the whole String and around each code is ignored.
     * @return {@link String} containing the characters that correspond to the Unicode codes
     * @throws IllegalArgumentException if the input is {@code null}, empty, does not start
     *     with the prefix, or contains a code that is not hexadecimal or is not a valid
     *     Unicode code point
     */
    public static String decodeUnicodeSequenceToString(String unicodeSequence) throws IllegalArgumentException {
        if (unicodeSequence == null) {
            throw new IllegalArgumentException(DECODE_ERROR_MESSAGE);
        }
        // Literal replacement: accept the upper-case prefix by normalising it to lower case.
        String normalized = unicodeSequence.replace(UPPER_CASE_UNICODE_PREFIX, UNICODE_PREFIX).trim();
        if (!normalized.startsWith(UNICODE_PREFIX)) {
            // Reject the input rather than silently discarding its first two characters.
            throw new IllegalArgumentException(
                    DECODE_ERROR_MESSAGE + ": sequence does not start with the unicode prefix");
        }
        StringBuilder result = new StringBuilder();
        try {
            for (String codePointStr : normalized.substring(UNICODE_PREFIX.length()).split(DELIMITER)) {
                result.append(Character.toChars(Integer.parseInt(codePointStr.trim(), 16)));
            }
        } catch (IllegalArgumentException e) {
            // Covers NumberFormatException (bad hex) and invalid code points from toChars.
            throw new IllegalArgumentException(DECODE_ERROR_MESSAGE, e);
        }
        return result.toString();
    }

    /**
     * Appends the prefix followed by the lower-case hexadecimal value of {@code codePoint} to
     * {@code out}, left-padding the value with zeros to four digits. For example 32 (decimal)
     * is appended as the prefix followed by "0020". Values longer than four digits are
     * appended unpadded.
     *
     * @param out builder that receives the code
     * @param codePoint a Unicode code value
     */
    private static void appendCodePoint(StringBuilder out, int codePoint) {
        String hex = Integer.toHexString(codePoint);
        out.append(UNICODE_PREFIX);
        for (int pad = hex.length(); pad < 4; pad++) {
            out.append('0');
        }
        out.append(hex);
    }
}