From d24d409e0448f6ca4b45bc930b240d7860d3861e Mon Sep 17 00:00:00 2001 From: "leonid.stashevsky" Date: Wed, 15 May 2024 09:56:54 +0200 Subject: [PATCH] Add js charset --- .../charsets/{CharsetJS.kt => Charset.js.kt} | 61 +++++--- .../io/ktor/utils/io/charsets/Decoder.js.kt | 32 ++++ .../ktor/utils/io/charsets/TextDecoder.js.kt | 29 ++++ .../io/charsets/TextDecoderFallback.js.kt | 83 +++++++++++ .../ktor/utils/io/charsets/TextEncoder.js.kt | 9 ++ .../io/ktor/utils/io/charsets/DecodeUtils.kt | 13 ++ .../src/io/ktor/utils/io/charsets/ISO88591.kt | 20 +++ .../io/ktor/utils/io/charsets/Win1252Table.kt | 141 ++++++++++++++++++ 8 files changed, 366 insertions(+), 22 deletions(-) rename ktor-io/js/src/io/ktor/utils/io/charsets/{CharsetJS.kt => Charset.js.kt} (64%) create mode 100644 ktor-io/js/src/io/ktor/utils/io/charsets/Decoder.js.kt create mode 100644 ktor-io/js/src/io/ktor/utils/io/charsets/TextDecoder.js.kt create mode 100644 ktor-io/js/src/io/ktor/utils/io/charsets/TextDecoderFallback.js.kt create mode 100644 ktor-io/js/src/io/ktor/utils/io/charsets/TextEncoder.js.kt create mode 100644 ktor-io/jsAndWasmShared/src/io/ktor/utils/io/charsets/DecodeUtils.kt create mode 100644 ktor-io/jsAndWasmShared/src/io/ktor/utils/io/charsets/ISO88591.kt create mode 100644 ktor-io/jsAndWasmShared/src/io/ktor/utils/io/charsets/Win1252Table.kt diff --git a/ktor-io/js/src/io/ktor/utils/io/charsets/CharsetJS.kt b/ktor-io/js/src/io/ktor/utils/io/charsets/Charset.js.kt similarity index 64% rename from ktor-io/js/src/io/ktor/utils/io/charsets/CharsetJS.kt rename to ktor-io/js/src/io/ktor/utils/io/charsets/Charset.js.kt index 256fefe8db9..1d850b0d5f9 100644 --- a/ktor-io/js/src/io/ktor/utils/io/charsets/CharsetJS.kt +++ b/ktor-io/js/src/io/ktor/utils/io/charsets/Charset.js.kt @@ -4,16 +4,14 @@ package io.ktor.utils.io.charsets -import io.ktor.utils.io.charsets.* -import io.ktor.utils.io.charsets.CharsetDecoder -import io.ktor.utils.io.charsets.CharsetEncoder -import io.ktor.utils.io.charsets.Charsets +import io.ktor.utils.io.js.* import kotlinx.io.* +import org.khronos.webgl.* /** * Find a charset by name. */ -public actual fun Charsets.forName(name: String): io.ktor.utils.io.charsets.Charset = Charset.forName(name) +public actual fun Charsets.forName(name: String): Charset = Charset.forName(name) /** * Check if a charset is supported by the current platform. @@ -28,11 +26,9 @@ public actual abstract class Charset(internal val _name: String) { if (this === other) return true if (other == null || this::class.js != other::class.js) return false - other as io.ktor.utils.io.charsets.Charset + other as Charset - if (_name != other._name) return false - - return true + return _name == other._name } override fun hashCode(): Int { @@ -44,7 +40,7 @@ public actual abstract class Charset(internal val _name: String) { } public companion object { - public fun forName(name: String): io.ktor.utils.io.charsets.Charset { + public fun forName(name: String): Charset { if (name == "UTF-8" || name == "utf-8" || name == "UTF8" || name == "utf8") return Charsets.UTF_8 if (name == "ISO-8859-1" || name == "iso-8859-1" || name.replace('_', '-').let { it == "iso-8859-1" || it.lowercase() == "iso-8859-1" } || @@ -66,14 +62,14 @@ public actual abstract class Charset(internal val _name: String) { } } -public actual val io.ktor.utils.io.charsets.Charset.name: String get() = _name +public actual val Charset.name: String get() = _name // ----------------------- -public actual abstract class CharsetEncoder(internal val _charset: io.ktor.utils.io.charsets.Charset) -private data class CharsetEncoderImpl(private val charset: io.ktor.utils.io.charsets.Charset) : CharsetEncoder(charset) +public actual abstract class CharsetEncoder(internal val _charset: Charset) +private data class CharsetEncoderImpl(private val charset: Charset) : CharsetEncoder(charset) -public actual val CharsetEncoder.charset: io.ktor.utils.io.charsets.Charset get() = _charset +public actual val CharsetEncoder.charset: Charset get() = _charset public actual fun CharsetEncoder.encodeToByteArray(input: CharSequence, fromIndex: Int, toIndex: Int): ByteArray = encodeToByteArrayImpl(input, fromIndex, toIndex) @@ -85,16 +81,26 @@ internal actual fun CharsetEncoder.encodeImpl( toIndex: Int, dst: Sink ): Int { - TODO() + require(fromIndex <= toIndex) + if (charset == Charsets.ISO_8859_1) { + return encodeISO88591(input, fromIndex, toIndex, dst) + } + + require(charset === Charsets.UTF_8) { "Only UTF-8 encoding is supported in JS" } + + val encoder = TextEncoder() // Only UTF-8 is supported so we know that at most 6 bytes per character is used + val result = encoder.encode(input.substring(fromIndex, toIndex)) + dst.write(result.unsafeCast()) + return result.length } // ---------------------------------------------------------------------- -public actual abstract class CharsetDecoder(internal val _charset: io.ktor.utils.io.charsets.Charset) +public actual abstract class CharsetDecoder(internal val _charset: Charset) -private data class CharsetDecoderImpl(private val charset: io.ktor.utils.io.charsets.Charset) : CharsetDecoder(charset) +private data class CharsetDecoderImpl(private val charset: Charset) : CharsetDecoder(charset) -public actual val CharsetDecoder.charset: io.ktor.utils.io.charsets.Charset get() = _charset +public actual val CharsetDecoder.charset: Charset get() = _charset internal actual fun CharsetEncoder.encodeToByteArrayImpl( input: CharSequence, @@ -116,16 +122,27 @@ internal actual fun CharsetEncoder.encodeToByteArrayImpl( return dst.readByteArray() } +@OptIn(SnapshotApi::class, InternalIoApi::class) public actual fun CharsetDecoder.decode(input: Source, dst: Appendable, max: Int): Int { - TODO() + val decoder = Decoder(charset.name, true) + + val count = minOf(input.buffer.size, max.toLong()) + val array = input.readByteArray(count.toInt()) as Int8Array + val result = try { + decoder.decode(array) + } catch (cause: Throwable) { + throw MalformedInputException("Failed to decode bytes: ${cause.message ?: "no cause provided"}") + } + dst.append(result) + return result.length } public actual object Charsets { - public actual val UTF_8: io.ktor.utils.io.charsets.Charset = CharsetImpl("UTF-8") - public actual val ISO_8859_1: io.ktor.utils.io.charsets.Charset = CharsetImpl("ISO-8859-1") + public actual val UTF_8: Charset = CharsetImpl("UTF-8") + public actual val ISO_8859_1: Charset = CharsetImpl("ISO-8859-1") } -private data class CharsetImpl(val name: String) : io.ktor.utils.io.charsets.Charset(name) { +private data class CharsetImpl(val name: String) : Charset(name) { override fun newEncoder(): CharsetEncoder = CharsetEncoderImpl(this) override fun newDecoder(): CharsetDecoder = CharsetDecoderImpl(this) } diff --git a/ktor-io/js/src/io/ktor/utils/io/charsets/Decoder.js.kt b/ktor-io/js/src/io/ktor/utils/io/charsets/Decoder.js.kt new file mode 100644 index 00000000000..8202f9a8672 --- /dev/null +++ b/ktor-io/js/src/io/ktor/utils/io/charsets/Decoder.js.kt @@ -0,0 +1,32 @@ +/* + * Copyright 2014-2021 JetBrains s.r.o and contributors. Use of this source code is governed by the Apache 2.0 license. + */ + +package io.ktor.utils.io.charsets + +import org.khronos.webgl.* + +internal fun Decoder(encoding: String, fatal: Boolean = true): Decoder = try { + TextDecoder(encoding, textDecoderOptions(fatal)).toKtor() +} catch (cause: Throwable) { + TextDecoderFallback(encoding, fatal) +} + +internal interface Decoder { + fun decode(): String + fun decode(buffer: ArrayBufferView): String + fun decode(buffer: ArrayBufferView, options: dynamic): String +} + +@Suppress("NOTHING_TO_INLINE") +internal inline fun Decoder.decodeStream(buffer: ArrayBufferView, stream: Boolean): String { + decodeWrap { + return decode(buffer, decodeOptions(stream)) + } +} + +internal fun decodeOptions(stream: Boolean): dynamic = Any().apply { + with(this.asDynamic()) { + this.stream = stream + } +} diff --git a/ktor-io/js/src/io/ktor/utils/io/charsets/TextDecoder.js.kt b/ktor-io/js/src/io/ktor/utils/io/charsets/TextDecoder.js.kt new file mode 100644 index 00000000000..a88e66490b2 --- /dev/null +++ b/ktor-io/js/src/io/ktor/utils/io/charsets/TextDecoder.js.kt @@ -0,0 +1,29 @@ +/* + * Copyright 2014-2021 JetBrains s.r.o and contributors. Use of this source code is governed by the Apache 2.0 license. + */ + +package io.ktor.utils.io.charsets + +import org.khronos.webgl.* + +internal external class TextDecoder(encoding: String, options: dynamic = definedExternally) { + val encoding: String + + fun decode(): String + fun decode(buffer: ArrayBuffer): String + fun decode(buffer: ArrayBuffer, options: dynamic): String + fun decode(buffer: ArrayBufferView): String + fun decode(buffer: ArrayBufferView, options: dynamic): String +} + +internal fun TextDecoder.toKtor(): Decoder = object : Decoder { + override fun decode(): String = this@toKtor.decode() + override fun decode(buffer: ArrayBufferView): String = this@toKtor.decode(buffer) + override fun decode(buffer: ArrayBufferView, options: dynamic): String = this@toKtor.decode(buffer, options) +} + +internal fun textDecoderOptions(fatal: Boolean = false): Any = Any().apply { + with(this.asDynamic()) { + this.fatal = fatal + } +} diff --git a/ktor-io/js/src/io/ktor/utils/io/charsets/TextDecoderFallback.js.kt b/ktor-io/js/src/io/ktor/utils/io/charsets/TextDecoderFallback.js.kt new file mode 100644 index 00000000000..348661369bc --- /dev/null +++ b/ktor-io/js/src/io/ktor/utils/io/charsets/TextDecoderFallback.js.kt @@ -0,0 +1,83 @@ +/* + * Copyright 2014-2021 JetBrains s.r.o and contributors. Use of this source code is governed by the Apache 2.0 license. + */ + +package io.ktor.utils.io.charsets + +import io.ktor.utils.io.core.* +import org.khronos.webgl.* + +private val ENCODING_ALIASES = setOf( + "ansi_x3.4-1968", + "ascii", + "cp1252", + "cp819", + "csisolatin1", + "ibm819", + "iso-8859-1", + "iso-ir-100", + "iso8859-1", + "iso88591", + "iso_8859-1", + "iso_8859-1:1987", + "l1", + "latin1", + "us-ascii", + "windows-1252", + "x-cp1252" +) + +private val REPLACEMENT = byteArrayOf(0xEF.toByte(), 0xBF.toByte(), 0xBD.toByte()) + +/** + * Windows-1252 decoder. + * + * According to https://encoding.spec.whatwg.org/, ISO-8859-1 should be treated as windows-1252 for http. + */ +internal class TextDecoderFallback( + encoding: String, + val fatal: Boolean +) : Decoder { + + init { + val requestedEncoding = encoding.trim().lowercase() + check(ENCODING_ALIASES.contains(requestedEncoding)) { "$encoding is not supported." } + } + + override fun decode(): String = "" + + override fun decode(buffer: ArrayBufferView): String = buildPacket { + val bytes = buffer as Int8Array + for (index in 0 until bytes.length) { + val byte = bytes[index] + val point: Int = byte.toCodePoint() + + if (point < 0) { + check(!fatal) { "Invalid character: $point" } + writeFully(REPLACEMENT) + continue + } + + if (point > 0xFF) { + writeByte((point shr 8).toByte()) + } + + writeByte((point and 0xFF).toByte()) + } + }.readBytes().decodeToString() + + override fun decode(buffer: ArrayBufferView, options: dynamic): String { + return decode(buffer) + } +} + +private fun Byte.toCodePoint(): Int { + val value = toInt() and 0xFF + if (value.isASCII()) { + return value + } + + return WIN1252_TABLE[value - 0x80] +} + +private fun Int.isASCII(): Boolean = this in 0..0x7F diff --git a/ktor-io/js/src/io/ktor/utils/io/charsets/TextEncoder.js.kt b/ktor-io/js/src/io/ktor/utils/io/charsets/TextEncoder.js.kt new file mode 100644 index 00000000000..c02badb9f1a --- /dev/null +++ b/ktor-io/js/src/io/ktor/utils/io/charsets/TextEncoder.js.kt @@ -0,0 +1,9 @@ +package io.ktor.utils.io.js + +import org.khronos.webgl.* + +internal external class TextEncoder() { + val encoding: String + + public fun encode(input: String): Uint8Array +} diff --git a/ktor-io/jsAndWasmShared/src/io/ktor/utils/io/charsets/DecodeUtils.kt b/ktor-io/jsAndWasmShared/src/io/ktor/utils/io/charsets/DecodeUtils.kt new file mode 100644 index 00000000000..d6c541147a1 --- /dev/null +++ b/ktor-io/jsAndWasmShared/src/io/ktor/utils/io/charsets/DecodeUtils.kt @@ -0,0 +1,13 @@ +/* + * Copyright 2014-2024 JetBrains s.r.o and contributors. Use of this source code is governed by the Apache 2.0 license. + */ + +package io.ktor.utils.io.charsets + +internal inline fun decodeWrap(block: () -> R): R { + try { + return block() + } catch (cause: Throwable) { + throw MalformedInputException("Failed to decode bytes: ${cause.message ?: "no cause provided"}") + } +} diff --git a/ktor-io/jsAndWasmShared/src/io/ktor/utils/io/charsets/ISO88591.kt b/ktor-io/jsAndWasmShared/src/io/ktor/utils/io/charsets/ISO88591.kt new file mode 100644 index 00000000000..c755fe65309 --- /dev/null +++ b/ktor-io/jsAndWasmShared/src/io/ktor/utils/io/charsets/ISO88591.kt @@ -0,0 +1,20 @@ +package io.ktor.utils.io.charsets + +import kotlinx.io.* + +internal fun encodeISO88591(input: CharSequence, fromIndex: Int, toIndex: Int, dst: Sink): Int { + if (fromIndex >= toIndex) return 0 + + for (index in fromIndex until toIndex) { + val character = input[index].code + if (character > 0xff) { + failedToMapError(character) + } + dst.writeByte(character.toByte()) + } + return toIndex - fromIndex +} + +private fun failedToMapError(ch: Int): Nothing { + throw MalformedInputException("The character with unicode point $ch couldn't be mapped to ISO-8859-1 character") +} diff --git a/ktor-io/jsAndWasmShared/src/io/ktor/utils/io/charsets/Win1252Table.kt b/ktor-io/jsAndWasmShared/src/io/ktor/utils/io/charsets/Win1252Table.kt new file mode 100644 index 00000000000..064248689e9 --- /dev/null +++ b/ktor-io/jsAndWasmShared/src/io/ktor/utils/io/charsets/Win1252Table.kt @@ -0,0 +1,141 @@ +/* + * Copyright 2014-2021 JetBrains s.r.o and contributors. Use of this source code is governed by the Apache 2.0 license. + */ + +package io.ktor.utils.io.charsets + +/** + * Mapping for non-ascii characters for Windows-1252 encoding. + * + * https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT + */ +internal val WIN1252_TABLE = intArrayOf( + 0x20AC, + -1, + 0x201A, + 0x0192, + 0x201E, + 0x2026, + 0x2020, + 0x2021, + 0x02C6, + 0x2030, + 0x0160, + 0x2039, + 0x0152, + -1, + 0x017D, + -1, + -1, + 0x2018, + 0x2019, + 0x201C, + 0x201D, + 0x2022, + 0x2013, + 0x2014, + 0x02DC, + 0x2122, + 0x0161, + 0x203A, + 0x0153, + -1, + 0x017E, + 0x0178, + 0x00A0, + 0x00A1, + 0x00A2, + 0x00A3, + 0x00A4, + 0x00A5, + 0x00A6, + 0x00A7, + 0x00A8, + 0x00A9, + 0x00AA, + 0x00AB, + 0x00AC, + 0x00AD, + 0x00AE, + 0x00AF, + 0x00B0, + 0x00B1, + 0x00B2, + 0x00B3, + 0x00B4, + 0x00B5, + 0x00B6, + 0x00B7, + 0x00B8, + 0x00B9, + 0x00BA, + 0x00BB, + 0x00BC, + 0x00BD, + 0x00BE, + 0x00BF, + 0x00C0, + 0x00C1, + 0x00C2, + 0x00C3, + 0x00C4, + 0x00C5, + 0x00C6, + 0x00C7, + 0x00C8, + 0x00C9, + 0x00CA, + 0x00CB, + 0x00CC, + 0x00CD, + 0x00CE, + 0x00CF, + 0x00D0, + 0x00D1, + 0x00D2, + 0x00D3, + 0x00D4, + 0x00D5, + 0x00D6, + 0x00D7, + 0x00D8, + 0x00D9, + 0x00DA, + 0x00DB, + 0x00DC, + 0x00DD, + 0x00DE, + 0x00DF, + 0x00E0, + 0x00E1, + 0x00E2, + 0x00E3, + 0x00E4, + 0x00E5, + 0x00E6, + 0x00E7, + 0x00E8, + 0x00E9, + 0x00EA, + 0x00EB, + 0x00EC, + 0x00ED, + 0x00EE, + 0x00EF, + 0x00F0, + 0x00F1, + 0x00F2, + 0x00F3, + 0x00F4, + 0x00F5, + 0x00F6, + 0x00F7, + 0x00F8, + 0x00F9, + 0x00FA, + 0x00FB, + 0x00FC, + 0x00FD, + 0x00FE, + 0x00FF +)