Optionally trust the C API to always receive valid UTF-8 (#597)

This adds a new feature to the C API that allows it to skip UTF-8 validation of the strings passed to it. Depending on where the C API is used from, you may already be sure that only valid UTF-8 strings are being passed, so no additional validation is necessary. On top of that, since we know that the strings passed from JavaScript on the web are always going to be valid UTF-8, the validation is always skipped there.
LiveSplit · Nov 8, 2022 · 7a02ef2 · 7a02ef2
1 parent 7e673df
commit 7a02ef2
Showing 4 changed files with 38 additions and 257 deletions.
diff --git a/capi/Cargo.toml b/capi/Cargo.toml
@@ -20,3 +20,4 @@ image-shrinking = ["livesplit-core/image-shrinking"]
 software-rendering = ["livesplit-core/software-rendering"]
 wasm-web = ["livesplit-core/wasm-web"]
 auto-splitting = ["livesplit-core/auto-splitting"]
+assume-str-parameters-are-utf8 = []
diff --git a/capi/bind_gen/src/wasm.rs b/capi/bind_gen/src/wasm.rs
@@ -472,69 +472,10 @@ export async function load(path?: string) {
     wasm = await WebAssembly.instantiate(bytes, imports);
 }
 
-let encodeUtf8: (str: string) => Uint8Array;
-if (!(global as any)["TextEncoder"]) {
-    encodeUtf8 = (str) => {
-        var utf8 = [];
-        for (var i = 0; i < str.length; i++) {
-            var charcode = str.charCodeAt(i);
-            if (charcode < 0x80) {
-                utf8.push(charcode);
-            } else if (charcode < 0x800) {
-                utf8.push(0xc0 | (charcode >> 6),
-                    0x80 | (charcode & 0x3f));
-            } else if (charcode < 0xd800 || charcode >= 0xe000) {
-                utf8.push(0xe0 | (charcode >> 12),
-                    0x80 | ((charcode >> 6) & 0x3f),
-                    0x80 | (charcode & 0x3f));
-            } else {
-                i++;
-                charcode = 0x10000 + (((charcode & 0x3ff) << 10)
-                    | (str.charCodeAt(i) & 0x3ff))
-                utf8.push(0xf0 | (charcode >> 18),
-                    0x80 | ((charcode >> 12) & 0x3f),
-                    0x80 | ((charcode >> 6) & 0x3f),
-                    0x80 | (charcode & 0x3f));
-            }
-        }
-        return new Uint8Array(utf8);
-    };
-} else {
-    const encoder = new TextEncoder("UTF-8");
-    encodeUtf8 = (str) => encoder.encode(str);
-}
-
-let decodeUtf8: (data: Uint8Array) => string;
-if (!(global as any)["TextDecoder"]) {
-    decodeUtf8 = (data) => {
-        var str = '',
-            i;
-
-        for (i = 0; i < data.length; i++) {
-            var value = data[i];
-
-            if (value < 0x80) {
-                str += String.fromCharCode(value);
-            } else if (value > 0xBF && value < 0xE0) {
-                str += String.fromCharCode((value & 0x1F) << 6 | data[i + 1] & 0x3F);
-                i += 1;
-            } else if (value > 0xDF && value < 0xF0) {
-                str += String.fromCharCode((value & 0x0F) << 12 | (data[i + 1] & 0x3F) << 6 | data[i + 2] & 0x3F);
-                i += 2;
-            } else {
-                var charCode = ((value & 0x07) << 18 | (data[i + 1] & 0x3F) << 12 | (data[i + 2] & 0x3F) << 6 | data[i + 3] & 0x3F) - 0x010000;
-
-                str += String.fromCharCode(charCode >> 10 | 0xD800, charCode & 0x03FF | 0xDC00);
-                i += 3;
-            }
-        }
-
-        return str;
-    };
-} else {
-    const decoder = new TextDecoder("UTF-8");
-    decodeUtf8 = (data) => decoder.decode(data);
-}
+const encoder = new TextEncoder("UTF-8");
+const decoder = new TextDecoder("UTF-8");
+const encodeUtf8: (str: string) => Uint8Array = (str) => encoder.encode(str);
+const decodeUtf8: (data: Uint8Array) => string = (data) => decoder.decode(data);
 
 interface Slice {
     ptr: number,
@@ -645,70 +586,10 @@ exports.load = async function (path) {
     wasm = await WebAssembly.instantiate(bytes, imports);
 }
 
-let encodeUtf8;
-if (!global["TextEncoder"]) {
-    encodeUtf8 = (str) => {
-        var utf8 = [];
-        for (var i = 0; i < str.length; i++) {
-            var charcode = str.charCodeAt(i);
-            if (charcode < 0x80) {
-                utf8.push(charcode);
-            } else if (charcode < 0x800) {
-                utf8.push(0xc0 | (charcode >> 6),
-                    0x80 | (charcode & 0x3f));
-            }
-            else if (charcode < 0xd800 || charcode >= 0xe000) {
-                utf8.push(0xe0 | (charcode >> 12),
-                    0x80 | ((charcode >> 6) & 0x3f),
-                    0x80 | (charcode & 0x3f));
-            } else {
-                i++;
-                charcode = 0x10000 + (((charcode & 0x3ff) << 10)
-                    | (str.charCodeAt(i) & 0x3ff))
-                utf8.push(0xf0 | (charcode >> 18),
-                    0x80 | ((charcode >> 12) & 0x3f),
-                    0x80 | ((charcode >> 6) & 0x3f),
-                    0x80 | (charcode & 0x3f));
-            }
-        }
-        return new Uint8Array(utf8);
-    };
-} else {
-    const encoder = new TextEncoder("UTF-8");
-    encodeUtf8 = (str) => encoder.encode(str);
-}
-
-let decodeUtf8;
-if (!global["TextDecoder"]) {
-    decodeUtf8 = (data) => {
-        var str = '',
-            i;
-
-        for (i = 0; i < data.length; i++) {
-            var value = data[i];
-
-            if (value < 0x80) {
-                str += String.fromCharCode(value);
-            } else if (value > 0xBF && value < 0xE0) {
-                str += String.fromCharCode((value & 0x1F) << 6 | data[i + 1] & 0x3F);
-                i += 1;
-            } else if (value > 0xDF && value < 0xF0) {
-                str += String.fromCharCode((value & 0x0F) << 12 | (data[i + 1] & 0x3F) << 6 | data[i + 2] & 0x3F);
-                i += 2;
-            } else {
-                var charCode = ((value & 0x07) << 18 | (data[i + 1] & 0x3F) << 12 | (data[i + 2] & 0x3F) << 6 | data[i + 3] & 0x3F) - 0x010000;
-
-                str += String.fromCharCode(charCode >> 10 | 0xD800, charCode & 0x03FF | 0xDC00);
-                i += 3;
-            }
-        }
-
-        return str;
-    };
-} else {
-    const decoder = new TextDecoder("UTF-8");
-    decodeUtf8 = (data) => decoder.decode(data);
-}
+const encoder = new TextEncoder("UTF-8");
+const decoder = new TextDecoder("UTF-8");
+const encodeUtf8 = (str) => encoder.encode(str);
+const decodeUtf8 = (data) => decoder.decode(data);
 
 function allocInt8Array(src) {
     const len = src.length;

diff --git a/capi/bind_gen/src/wasm_bindgen.rs b/capi/bind_gen/src/wasm_bindgen.rs
@@ -392,69 +392,10 @@ declare namespace TextEncoding {
     }
 }
 
-let encodeUtf8: (str: string) => Uint8Array;
-if (!(global as any)["TextEncoder"]) {
-    encodeUtf8 = (str) => {
-        var utf8 = [];
-        for (var i = 0; i < str.length; i++) {
-            var charcode = str.charCodeAt(i);
-            if (charcode < 0x80) {
-                utf8.push(charcode);
-            } else if (charcode < 0x800) {
-                utf8.push(0xc0 | (charcode >> 6),
-                    0x80 | (charcode & 0x3f));
-            } else if (charcode < 0xd800 || charcode >= 0xe000) {
-                utf8.push(0xe0 | (charcode >> 12),
-                    0x80 | ((charcode >> 6) & 0x3f),
-                    0x80 | (charcode & 0x3f));
-            } else {
-                i++;
-                charcode = 0x10000 + (((charcode & 0x3ff) << 10)
-                    | (str.charCodeAt(i) & 0x3ff))
-                utf8.push(0xf0 | (charcode >> 18),
-                    0x80 | ((charcode >> 12) & 0x3f),
-                    0x80 | ((charcode >> 6) & 0x3f),
-                    0x80 | (charcode & 0x3f));
-            }
-        }
-        return new Uint8Array(utf8);
-    };
-} else {
-    const encoder = new TextEncoder("UTF-8");
-    encodeUtf8 = (str) => encoder.encode(str);
-}
-
-let decodeUtf8: (data: Uint8Array) => string;
-if (!(global as any)["TextDecoder"]) {
-    decodeUtf8 = (data) => {
-        var str = '',
-            i;
-
-        for (i = 0; i < data.length; i++) {
-            var value = data[i];
-
-            if (value < 0x80) {
-                str += String.fromCharCode(value);
-            } else if (value > 0xBF && value < 0xE0) {
-                str += String.fromCharCode((value & 0x1F) << 6 | data[i + 1] & 0x3F);
-                i += 1;
-            } else if (value > 0xDF && value < 0xF0) {
-                str += String.fromCharCode((value & 0x0F) << 12 | (data[i + 1] & 0x3F) << 6 | data[i + 2] & 0x3F);
-                i += 2;
-            } else {
-                var charCode = ((value & 0x07) << 18 | (data[i + 1] & 0x3F) << 12 | (data[i + 2] & 0x3F) << 6 | data[i + 3] & 0x3F) - 0x010000;
-
-                str += String.fromCharCode(charCode >> 10 | 0xD800, charCode & 0x03FF | 0xDC00);
-                i += 3;
-            }
-        }
-
-        return str;
-    };
-} else {
-    const decoder = new TextDecoder("UTF-8");
-    decodeUtf8 = (data) => decoder.decode(data);
-}
+const encoder = new TextEncoder("UTF-8");
+const decoder = new TextDecoder("UTF-8");
+const encodeUtf8: (str: string) => Uint8Array = (str) => encoder.encode(str);
+const decodeUtf8: (data: Uint8Array) => string = (data) => decoder.decode(data);
 
 interface Slice {
     ptr: number,
@@ -506,70 +447,10 @@ function dealloc(slice: Slice) {
             "{}",
             r#"import * as wasm from "./livesplit_core_bg.wasm";
 
-let encodeUtf8;
-if (!global["TextEncoder"]) {
-    encodeUtf8 = (str) => {
-        var utf8 = [];
-        for (var i = 0; i < str.length; i++) {
-            var charcode = str.charCodeAt(i);
-            if (charcode < 0x80) {
-                utf8.push(charcode);
-            } else if (charcode < 0x800) {
-                utf8.push(0xc0 | (charcode >> 6),
-                    0x80 | (charcode & 0x3f));
-            }
-            else if (charcode < 0xd800 || charcode >= 0xe000) {
-                utf8.push(0xe0 | (charcode >> 12),
-                    0x80 | ((charcode >> 6) & 0x3f),
-                    0x80 | (charcode & 0x3f));
-            } else {
-                i++;
-                charcode = 0x10000 + (((charcode & 0x3ff) << 10)
-                    | (str.charCodeAt(i) & 0x3ff))
-                utf8.push(0xf0 | (charcode >> 18),
-                    0x80 | ((charcode >> 12) & 0x3f),
-                    0x80 | ((charcode >> 6) & 0x3f),
-                    0x80 | (charcode & 0x3f));
-            }
-        }
-        return new Uint8Array(utf8);
-    };
-} else {
-    const encoder = new TextEncoder("UTF-8");
-    encodeUtf8 = (str) => encoder.encode(str);
-}
-
-let decodeUtf8;
-if (!global["TextDecoder"]) {
-    decodeUtf8 = (data) => {
-        var str = '',
-            i;
-
-        for (i = 0; i < data.length; i++) {
-            var value = data[i];
-
-            if (value < 0x80) {
-                str += String.fromCharCode(value);
-            } else if (value > 0xBF && value < 0xE0) {
-                str += String.fromCharCode((value & 0x1F) << 6 | data[i + 1] & 0x3F);
-                i += 1;
-            } else if (value > 0xDF && value < 0xF0) {
-                str += String.fromCharCode((value & 0x0F) << 12 | (data[i + 1] & 0x3F) << 6 | data[i + 2] & 0x3F);
-                i += 2;
-            } else {
-                var charCode = ((value & 0x07) << 18 | (data[i + 1] & 0x3F) << 12 | (data[i + 2] & 0x3F) << 6 | data[i + 3] & 0x3F) - 0x010000;
-
-                str += String.fromCharCode(charCode >> 10 | 0xD800, charCode & 0x03FF | 0xDC00);
-                i += 3;
-            }
-        }
-
-        return str;
-    };
-} else {
-    const decoder = new TextDecoder("UTF-8");
-    decodeUtf8 = (data) => decoder.decode(data);
-}
+const encoder = new TextEncoder("UTF-8");
+const decoder = new TextDecoder("UTF-8");
+const encodeUtf8 = (str) => encoder.encode(str);
+const decodeUtf8 = (data) => decoder.decode(data);
 
 function allocUint8Array(src) {
     const len = src.length;

diff --git a/capi/src/lib.rs b/capi/src/lib.rs
@@ -143,7 +143,25 @@ unsafe fn str(s: *const c_char) -> &'static str {
     if s.is_null() {
         ""
     } else {
-        CStr::from_ptr(s as _).to_str().unwrap()
+        let bytes = CStr::from_ptr(s as _).to_bytes();
+
+        // Depending on where the C API is used, you may be able to fully trust
+        // that the caller always passes valid UTF-8. On the web we use the
+        // `TextEncoder` which always produces valid UTF-8.
+        #[cfg(any(
+            feature = "assume-str-parameters-are-utf8",
+            all(target_family = "wasm", feature = "wasm-web"),
+        ))]
+        {
+            std::str::from_utf8_unchecked(bytes)
+        }
+        #[cfg(not(any(
+            feature = "assume-str-parameters-are-utf8",
+            all(target_family = "wasm", feature = "wasm-web"),
+        )))]
+        {
+            simdutf8::basic::from_utf8(bytes).unwrap()
+        }
     }
 }
 
@@ -166,7 +184,7 @@ unsafe fn get_file(_: i64) -> ManuallyDrop<File> {
 }
 
 /// Allocate memory.
-#[cfg(all(target_arch = "wasm32", not(target_os = "wasi")))]
+#[cfg(all(target_family = "wasm", not(target_os = "wasi")))]
 #[no_mangle]
 pub extern "C" fn alloc(size: usize) -> *mut u8 {
     let mut buf = Vec::with_capacity(size);
@@ -176,7 +194,7 @@ pub extern "C" fn alloc(size: usize) -> *mut u8 {
 }
 
 /// Deallocate memory.
-#[cfg(all(target_arch = "wasm32", not(target_os = "wasi")))]
+#[cfg(all(target_family = "wasm", not(target_os = "wasi")))]
 #[no_mangle]
 pub extern "C" fn dealloc(ptr: *mut u8, cap: usize) {
     unsafe {