Skip to content

Commit

Permalink
Add js charset
Browse files Browse the repository at this point in the history
  • Loading branch information
e5l committed May 15, 2024
1 parent 4eb677e commit d24d409
Show file tree
Hide file tree
Showing 8 changed files with 366 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,14 @@

package io.ktor.utils.io.charsets

import io.ktor.utils.io.charsets.*
import io.ktor.utils.io.charsets.CharsetDecoder
import io.ktor.utils.io.charsets.CharsetEncoder
import io.ktor.utils.io.charsets.Charsets
import io.ktor.utils.io.js.*
import kotlinx.io.*
import org.khronos.webgl.*

/**
* Find a charset by name.
*/
public actual fun Charsets.forName(name: String): io.ktor.utils.io.charsets.Charset = Charset.forName(name)
public actual fun Charsets.forName(name: String): Charset = Charset.forName(name)

/**
* Check if a charset is supported by the current platform.
Expand All @@ -28,11 +26,9 @@ public actual abstract class Charset(internal val _name: String) {
if (this === other) return true
if (other == null || this::class.js != other::class.js) return false

other as io.ktor.utils.io.charsets.Charset
other as Charset

if (_name != other._name) return false

return true
return _name == other._name
}

override fun hashCode(): Int {
Expand All @@ -44,7 +40,7 @@ public actual abstract class Charset(internal val _name: String) {
}

public companion object {
public fun forName(name: String): io.ktor.utils.io.charsets.Charset {
public fun forName(name: String): Charset {
if (name == "UTF-8" || name == "utf-8" || name == "UTF8" || name == "utf8") return Charsets.UTF_8
if (name == "ISO-8859-1" || name == "iso-8859-1" ||
name.replace('_', '-').let { it == "iso-8859-1" || it.lowercase() == "iso-8859-1" } ||
Expand All @@ -66,14 +62,14 @@ public actual abstract class Charset(internal val _name: String) {
}
}

public actual val io.ktor.utils.io.charsets.Charset.name: String get() = _name
public actual val Charset.name: String get() = _name

// -----------------------

public actual abstract class CharsetEncoder(internal val _charset: io.ktor.utils.io.charsets.Charset)
private data class CharsetEncoderImpl(private val charset: io.ktor.utils.io.charsets.Charset) : CharsetEncoder(charset)
public actual abstract class CharsetEncoder(internal val _charset: Charset)
private data class CharsetEncoderImpl(private val charset: Charset) : CharsetEncoder(charset)

public actual val CharsetEncoder.charset: io.ktor.utils.io.charsets.Charset get() = _charset
public actual val CharsetEncoder.charset: Charset get() = _charset

public actual fun CharsetEncoder.encodeToByteArray(input: CharSequence, fromIndex: Int, toIndex: Int): ByteArray =
encodeToByteArrayImpl(input, fromIndex, toIndex)
Expand All @@ -85,16 +81,26 @@ internal actual fun CharsetEncoder.encodeImpl(
toIndex: Int,
dst: Sink
): Int {
TODO()
require(fromIndex <= toIndex)
if (charset == Charsets.ISO_8859_1) {
return encodeISO88591(input, fromIndex, toIndex, dst)
}

require(charset === Charsets.UTF_8) { "Only UTF-8 encoding is supported in JS" }

val encoder = TextEncoder() // Only UTF-8 is supported so we know that at most 6 bytes per character is used
val result = encoder.encode(input.substring(fromIndex, toIndex))
dst.write(result.unsafeCast<ByteArray>())
return result.length
}

// ----------------------------------------------------------------------

public actual abstract class CharsetDecoder(internal val _charset: io.ktor.utils.io.charsets.Charset)
public actual abstract class CharsetDecoder(internal val _charset: Charset)

private data class CharsetDecoderImpl(private val charset: io.ktor.utils.io.charsets.Charset) : CharsetDecoder(charset)
private data class CharsetDecoderImpl(private val charset: Charset) : CharsetDecoder(charset)

public actual val CharsetDecoder.charset: io.ktor.utils.io.charsets.Charset get() = _charset
public actual val CharsetDecoder.charset: Charset get() = _charset

internal actual fun CharsetEncoder.encodeToByteArrayImpl(
input: CharSequence,
Expand All @@ -116,16 +122,27 @@ internal actual fun CharsetEncoder.encodeToByteArrayImpl(
return dst.readByteArray()
}

/**
 * Decodes bytes taken from [input] using this decoder's charset name and appends the decoded text to [dst].
 *
 * At most [max] bytes are consumed, and never more than what is currently held in the source's buffer.
 *
 * @return the length of the decoded string that was appended to [dst]
 * @throws MalformedInputException when the underlying decoder throws while decoding the bytes
 */
@OptIn(SnapshotApi::class, InternalIoApi::class)
public actual fun CharsetDecoder.decode(input: Source, dst: Appendable, max: Int): Int {
// NOTE(review): this TODO() looks like the pre-change line left over from the diff view; as written it
// throws before any of the decoding below can run. Confirm it is absent from the real file.
TODO()
// The second argument is the factory's `fatal` flag.
val decoder = Decoder(charset.name, true)

// Only bytes already buffered in the source are considered, capped at max.
val count = minOf(input.buffer.size, max.toLong())
val array = input.readByteArray(count.toInt()) as Int8Array
// Any failure raised by the decoder is reported as MalformedInputException carrying the original message.
val result = try {
decoder.decode(array)
} catch (cause: Throwable) {
throw MalformedInputException("Failed to decode bytes: ${cause.message ?: "no cause provided"}")
}
dst.append(result)
return result.length
}

public actual object Charsets {
public actual val UTF_8: io.ktor.utils.io.charsets.Charset = CharsetImpl("UTF-8")
public actual val ISO_8859_1: io.ktor.utils.io.charsets.Charset = CharsetImpl("ISO-8859-1")
public actual val UTF_8: Charset = CharsetImpl("UTF-8")
public actual val ISO_8859_1: Charset = CharsetImpl("ISO-8859-1")
}

private data class CharsetImpl(val name: String) : io.ktor.utils.io.charsets.Charset(name) {
private data class CharsetImpl(val name: String) : Charset(name) {
override fun newEncoder(): CharsetEncoder = CharsetEncoderImpl(this)
override fun newDecoder(): CharsetDecoder = CharsetDecoderImpl(this)
}
Expand Down
32 changes: 32 additions & 0 deletions ktor-io/js/src/io/ktor/utils/io/charsets/Decoder.js.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Copyright 2014-2021 JetBrains s.r.o and contributors. Use of this source code is governed by the Apache 2.0 license.
*/

package io.ktor.utils.io.charsets

import org.khronos.webgl.*

/**
 * Creates a [Decoder] for [encoding].
 *
 * The platform `TextDecoder` is tried first; if constructing or wrapping it throws,
 * a [TextDecoderFallback] is created with the same arguments instead.
 */
internal fun Decoder(encoding: String, fatal: Boolean = true): Decoder {
    return try {
        val platformDecoder = TextDecoder(encoding, textDecoderOptions(fatal))
        platformDecoder.toKtor()
    } catch (cause: Throwable) {
        // The platform decoder is unavailable or rejected the encoding: use the built-in fallback.
        TextDecoderFallback(encoding, fatal)
    }
}

/**
 * Minimal text decoder abstraction used by this package.
 *
 * Implemented by the adapter returned from `TextDecoder.toKtor()` and by `TextDecoderFallback`.
 */
internal interface Decoder {
/** Decodes with no input bytes. `TextDecoderFallback` returns an empty string for this call. */
fun decode(): String
/** Decodes the bytes exposed by [buffer] into a string. */
fun decode(buffer: ArrayBufferView): String
/** Decodes [buffer] with an extra [options] object (this package passes one built by `decodeOptions`). */
fun decode(buffer: ArrayBufferView, options: dynamic): String
}

/**
 * Decodes [buffer] passing an options object whose `stream` field is [stream].
 *
 * Runs inside [decodeWrap], so any failure thrown by the decoder is rethrown as `MalformedInputException`.
 */
@Suppress("NOTHING_TO_INLINE")
internal inline fun Decoder.decodeStream(buffer: ArrayBufferView, stream: Boolean): String =
    decodeWrap { decode(buffer, decodeOptions(stream)) }

/**
 * Builds the options object handed to `decode(buffer, options)`: a fresh object with its `stream` field set
 * to [stream].
 */
internal fun decodeOptions(stream: Boolean): dynamic {
    val options = Any()
    options.asDynamic().stream = stream
    return options
}
29 changes: 29 additions & 0 deletions ktor-io/js/src/io/ktor/utils/io/charsets/TextDecoder.js.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/*
* Copyright 2014-2021 JetBrains s.r.o and contributors. Use of this source code is governed by the Apache 2.0 license.
*/

package io.ktor.utils.io.charsets

import org.khronos.webgl.*

/**
 * Kotlin/JS external declaration of the platform `TextDecoder` class.
 *
 * [options] is an untyped object; within this package it is built by `textDecoderOptions` and carries `fatal`.
 */
internal external class TextDecoder(encoding: String, options: dynamic = definedExternally) {
/** Encoding name reported by the platform decoder. */
val encoding: String

// Overloads accept either a raw ArrayBuffer or a typed view; the options variant is used in this package
// with the object produced by `decodeOptions` (a `stream` field).
fun decode(): String
fun decode(buffer: ArrayBuffer): String
fun decode(buffer: ArrayBuffer, options: dynamic): String
fun decode(buffer: ArrayBufferView): String
fun decode(buffer: ArrayBufferView, options: dynamic): String
}

/**
 * Wraps this platform `TextDecoder` in the package-level [Decoder] interface.
 * Every call is forwarded unchanged to the wrapped decoder.
 */
internal fun TextDecoder.toKtor(): Decoder {
    val native = this
    return object : Decoder {
        override fun decode(): String = native.decode()

        override fun decode(buffer: ArrayBufferView): String = native.decode(buffer)

        override fun decode(buffer: ArrayBufferView, options: dynamic): String =
            native.decode(buffer, options)
    }
}

/**
 * Builds the options object passed to the `TextDecoder` constructor: a fresh object with its `fatal` field
 * set to [fatal].
 */
internal fun textDecoderOptions(fatal: Boolean = false): Any {
    val options = Any()
    options.asDynamic().fatal = fatal
    return options
}
83 changes: 83 additions & 0 deletions ktor-io/js/src/io/ktor/utils/io/charsets/TextDecoderFallback.js.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/*
* Copyright 2014-2021 JetBrains s.r.o and contributors. Use of this source code is governed by the Apache 2.0 license.
*/

package io.ktor.utils.io.charsets

import io.ktor.utils.io.core.*
import org.khronos.webgl.*

// Encoding labels accepted by TextDecoderFallback. The requested name is trimmed and lower-cased
// before the lookup, so every entry here must be lower-case.
private val ENCODING_ALIASES = setOf(
"ansi_x3.4-1968",
"ascii",
"cp1252",
"cp819",
"csisolatin1",
"ibm819",
"iso-8859-1",
"iso-ir-100",
"iso8859-1",
"iso88591",
"iso_8859-1",
"iso_8859-1:1987",
"l1",
"latin1",
"us-ascii",
"windows-1252",
"x-cp1252"
)

/** Unicode replacement character (U+FFFD), emitted for unmappable bytes when the decoder is not fatal. */
private const val REPLACEMENT: Char = '\uFFFD'

/**
 * Windows-1252 decoder.
 *
 * According to https://encoding.spec.whatwg.org/, ISO-8859-1 should be treated as windows-1252 for http.
 *
 * @param encoding requested encoding label; it is trimmed and lower-cased, and must be one of the supported
 * aliases, otherwise construction fails with [IllegalStateException]
 * @property fatal when `true`, a byte without a mapping fails decoding with [IllegalStateException];
 * when `false`, such a byte is decoded as U+FFFD
 */
internal class TextDecoderFallback(
    encoding: String,
    val fatal: Boolean
) : Decoder {

    init {
        val requestedEncoding = encoding.trim().lowercase()
        check(ENCODING_ALIASES.contains(requestedEncoding)) { "$encoding is not supported." }
    }

    /** There is no buffered state to flush, so decoding without input yields an empty string. */
    override fun decode(): String = ""

    /**
     * Decodes every byte of [buffer] as a single windows-1252 character.
     *
     * Mapped code points are appended to the result as characters directly. Writing them out as raw bytes
     * and re-reading those bytes as UTF-8 would corrupt every character outside the ASCII range.
     */
    override fun decode(buffer: ArrayBufferView): String {
        // View the same memory as signed bytes so any ArrayBufferView type is accepted, not only Int8Array.
        val bytes = Int8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength)

        return buildString(bytes.length) {
            for (index in 0 until bytes.length) {
                val point: Int = bytes[index].toCodePoint()

                if (point < 0) {
                    // A negative code point is treated as "no mapping for this byte".
                    check(!fatal) { "Invalid character: $point" }
                    append(REPLACEMENT)
                } else {
                    append(Char(point))
                }
            }
        }
    }

    /** Streaming options are not used by the fallback; this behaves exactly like `decode(buffer)`. */
    override fun decode(buffer: ArrayBufferView, options: dynamic): String = decode(buffer)
}

/**
 * Maps this byte, read as an unsigned value, to a code point: ASCII values map to themselves,
 * everything else is looked up in `WIN1252_TABLE` at offset `value - 0x80`.
 */
private fun Byte.toCodePoint(): Int {
    val unsigned = toInt() and 0xFF
    return if (unsigned.isASCII()) unsigned else WIN1252_TABLE[unsigned - 0x80]
}

/** Returns `true` when this value lies in the 7-bit ASCII range `0..0x7F`. */
private fun Int.isASCII(): Boolean = this >= 0 && this < 0x80
9 changes: 9 additions & 0 deletions ktor-io/js/src/io/ktor/utils/io/charsets/TextEncoder.js.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package io.ktor.utils.io.js

import org.khronos.webgl.*

/**
 * Kotlin/JS external declaration of the platform `TextEncoder` class.
 */
internal external class TextEncoder() {
/** Encoding name reported by the platform encoder (presumably always UTF-8 — confirm against the platform spec). */
val encoding: String

/** Encodes [input] and returns the resulting bytes as a `Uint8Array`. */
public fun encode(input: String): Uint8Array
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/*
* Copyright 2014-2024 JetBrains s.r.o and contributors. Use of this source code is governed by the Apache 2.0 license.
*/

package io.ktor.utils.io.charsets

/**
 * Runs [block] and returns its result; anything it throws is rethrown as [MalformedInputException]
 * whose message includes the original message, or a placeholder when there is none.
 */
internal inline fun <R> decodeWrap(block: () -> R): R = try {
    block()
} catch (cause: Throwable) {
    throw MalformedInputException("Failed to decode bytes: ${cause.message ?: "no cause provided"}")
}
20 changes: 20 additions & 0 deletions ktor-io/jsAndWasmShared/src/io/ktor/utils/io/charsets/ISO88591.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package io.ktor.utils.io.charsets

import kotlinx.io.*

/**
 * Writes the characters of [input] in `[fromIndex, toIndex)` to [dst], one byte per character.
 *
 * Encoding stops with `MalformedInputException` at the first character whose code is above `0xff`;
 * bytes for the characters before it have already been written to [dst] at that point.
 *
 * @return the number of characters encoded, or `0` when the range is empty
 */
internal fun encodeISO88591(input: CharSequence, fromIndex: Int, toIndex: Int, dst: Sink): Int {
    if (toIndex <= fromIndex) return 0

    var position = fromIndex
    while (position < toIndex) {
        val code = input[position].code
        if (code > 0xff) failedToMapError(code)
        dst.writeByte(code.toByte())
        position++
    }

    return toIndex - fromIndex
}

/** Always throws [MalformedInputException] reporting that code point [ch] has no ISO-8859-1 mapping. */
private fun failedToMapError(ch: Int): Nothing =
    throw MalformedInputException("The character with unicode point $ch couldn't be mapped to ISO-8859-1 character")
Loading

0 comments on commit d24d409

Please sign in to comment.