Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added ability to read huge strings in a buffered manner in custom KSerializers #2012

Merged
merged 17 commits into from
Feb 16, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions core/api/kotlinx-serialization-core.api
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,10 @@ public abstract class kotlinx/serialization/encoding/AbstractEncoder : kotlinx/s
public fun shouldEncodeElementDefault (Lkotlinx/serialization/descriptors/SerialDescriptor;I)Z
}

public abstract interface class kotlinx/serialization/encoding/ChunkedDecoder {
public abstract fun decodeStringChunked (Lkotlin/jvm/functions/Function1;)V
}

public abstract interface class kotlinx/serialization/encoding/CompositeDecoder {
public static final field Companion Lkotlinx/serialization/encoding/CompositeDecoder$Companion;
public static final field DECODE_DONE I
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package kotlinx.serialization.encoding

import kotlinx.serialization.ExperimentalSerializationApi
import kotlinx.serialization.descriptors.PrimitiveKind

/**
 * Decoder which exposes the current string value as a sequence of chunks,
 * allowing very large strings to be consumed without materializing them
 * in memory as a single [String].
 */
public interface ChunkedDecoder {
    /**
     * Decodes a string value in chunks (16K characters by default), passing each
     * chunk to [consumeChunk] in order. The concatenation of all chunks is the
     * complete string value; chunk boundaries carry no semantic meaning.
     *
     * Corresponding kind is [PrimitiveKind.STRING].
     */
    @ExperimentalSerializationApi
    public fun decodeStringChunked(consumeChunk: (chunk: String) -> Unit)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
package kotlinx.serialization.json

import kotlinx.serialization.*
import kotlinx.serialization.Serializable
import kotlinx.serialization.descriptors.*
import kotlinx.serialization.encoding.*
import org.junit.Test
import java.io.*
import java.util.*
import kotlin.random.Random
import kotlin.test.*


@Serializable(with = LargeStringSerializer::class)
data class LargeBinaryData(val binaryData: ByteArray) {
    // ByteArray compares by reference in the generated data-class equals, so
    // equals/hashCode are overridden to compare array contents instead.
    // (Data classes are final, so an `is`-check is equivalent to a javaClass check.)
    override fun equals(other: Any?): Boolean =
        this === other ||
            (other is LargeBinaryData && binaryData.contentEquals(other.binaryData))

    override fun hashCode(): Int = binaryData.contentHashCode()
}

// Wrapper whose single field uses the chunk-reading custom serializer
// (via LargeBinaryData's @Serializable(with = ...) annotation).
@Serializable
data class ClassWithBinaryDataField(val binaryField: LargeBinaryData)

/**
 * Serializes [LargeBinaryData] as a single Base64 string, decoding it chunk by
 * chunk via [ChunkedDecoder] so the full encoded string never has to be held
 * in memory at once.
 */
object LargeStringSerializer : KSerializer<LargeBinaryData> {
    private val b64Decoder: Base64.Decoder = Base64.getDecoder()
    override val descriptor: SerialDescriptor = PrimitiveSerialDescriptor("LargeStringContent", PrimitiveKind.STRING)

    override fun deserialize(decoder: Decoder): LargeBinaryData {
        require(decoder is ChunkedDecoder) { "Only chunked decoder supported" }

        // Chunk boundaries are not aligned to Base64 quads (4 chars = 3 bytes),
        // so up to 3 trailing characters of each chunk are carried over and
        // prepended to the next chunk before decoding.
        var remainder = ""
        val decodedBytes = ByteArrayOutputStream().use { bos ->
            decoder.decodeStringChunked { chunk ->
                val actualChunk = remainder + chunk
                val alignedLength = actualChunk.length - actualChunk.length % 4
                remainder = actualChunk.substring(alignedLength)
                bos.write(b64Decoder.decode(actualChunk.substring(0, alignedLength)))
            }
            bos.toByteArray()
        }
        // Valid (padded) Base64 has length % 4 == 0, so nothing may be left over.
        // Previously a non-empty tail was silently dropped, truncating the data.
        check(remainder.isEmpty()) { "Malformed Base64 input: dangling characters '$remainder'" }

        return LargeBinaryData(decodedBytes)
    }

    override fun serialize(encoder: Encoder, value: LargeBinaryData) {
        encoder.encodeString(Base64.getEncoder().encodeToString(value.binaryData))
    }
}


class JsonChunkedDecoderTest : JsonTestBase() {

    @Test
    fun decodeBase64String() {
        // 16 KiB of random bytes -> Base64 string longer than BATCH_SIZE (16k),
        // so streaming modes must deliver more than one chunk.
        val sourceObject = ClassWithBinaryDataField(LargeBinaryData(Random.nextBytes(16 * 1024)))
        val serializedObject = Json.encodeToString(sourceObject)

        JsonTestingMode.values().forEach { mode ->
            if (mode == JsonTestingMode.TREE) {
                // TREE mode does not implement ChunkedDecoder, so the serializer
                // must reject it. Note: assertFails' String parameter is only the
                // assertion-failure description — to verify the thrown message we
                // must inspect the returned exception.
                val e = assertFails {
                    Json.decodeFromString<ClassWithBinaryDataField>(serializedObject, mode)
                }
                assertTrue(
                    e.message?.contains("Only chunked decoder supported") == true,
                    "Unexpected failure in TREE mode: $e"
                )
            } else {
                val deserializedObject = Json.decodeFromString<ClassWithBinaryDataField>(serializedObject, mode)
                assertEquals(sourceObject.binaryField, deserializedObject.binaryField)
            }
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import kotlinx.serialization.json.*
import kotlinx.serialization.modules.*
import kotlin.jvm.*


sandwwraith marked this conversation as resolved.
Show resolved Hide resolved
/**
* [JsonDecoder] which reads given JSON from [AbstractJsonLexer] field by field.
*/
Expand All @@ -24,7 +25,7 @@ internal open class StreamingJsonDecoder(
@JvmField internal val lexer: AbstractJsonLexer,
descriptor: SerialDescriptor,
discriminatorHolder: DiscriminatorHolder?
) : JsonDecoder, AbstractDecoder() {
) : JsonDecoder, ChunkedDecoder, AbstractDecoder() {

// A mutable reference to the discriminator that have to be skipped when in optimistic phase
// of polymorphic serialization, see `decodeSerializableValue`
Expand Down Expand Up @@ -343,6 +344,10 @@ internal open class StreamingJsonDecoder(
}
}

// Delegates chunked string decoding straight to the lexer, which feeds the
// value to [consumeChunk] piece by piece.
override fun decodeStringChunked(consumeChunk: (chunk: String) -> Unit) {
    lexer.consumeStringChunked(consumeChunk)
}

override fun decodeInline(descriptor: SerialDescriptor): Decoder =
if (descriptor.isUnsignedNumber) JsonDecoderForUnsignedTypes(lexer, json)
else super.decodeInline(descriptor)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

package kotlinx.serialization.json.internal

import kotlinx.serialization.json.internal.*
import kotlinx.serialization.json.internal.CharMappings.CHAR_TO_TOKEN
import kotlinx.serialization.json.internal.CharMappings.ESCAPE_2_CHAR
import kotlin.js.*
Expand Down Expand Up @@ -307,6 +306,8 @@ internal abstract class AbstractJsonLexer {
*/
abstract fun consumeKeyString(): String

abstract fun consumeStringChunked(consumeChunk: (stringChunk: String) -> Unit)

fun consumeString(): String {
if (peekedString != null) {
return takePeeked()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,32 @@ internal class ReaderJsonLexer(
preload(spaceLeft)
}

// Streams the current JSON string value to [consumeChunk]: scans the buffer for
// the closing quote, flushing the accumulated range and refilling the buffer
// (prefetchOrEof) each time the end of the buffer is reached.
// NOTE(review): escape sequences (e.g. \" inside the string) are not decoded
// here — an escaped quote would terminate the scan early. Confirm whether
// escaping is handled by a later revision.
override fun consumeStringChunked(consumeChunk: (stringChunk: String) -> Unit) {
    consumeNextToken(STRING)
    var position = this.currentPosition
    var chunkStart = position
    // Read once per iteration to avoid a second range check (visible in profiler).
    var current = source[position]
    while (current != STRING) {
        position++
        if (position >= source.length) {
            // Buffer exhausted mid-string: emit what we have, then refill.
            writeRange(chunkStart, position, consumeChunk)
            position = prefetchOrEof(position)
            if (position == -1)
                fail("EOF", position)
            chunkStart = position
        }
        current = source[position]
    }
    writeRange(chunkStart, position, consumeChunk)
    this.currentPosition = position + 1 // Consume closing '"' (STRING)
}

// Copies source[fromIndex, toIndex) into a fresh String and hands it to the consumer.
private fun writeRange(fromIndex: Int, toIndex: Int, consumeChunk: (stringChunk: String) -> Unit) {
    consumeChunk(buildString { appendRange(source, fromIndex, toIndex) })
}

override fun consumeKeyString(): String {
/*
* For strings we assume that escaped symbols are rather an exception, so firstly
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ internal class StringJsonLexer(override val source: String) : AbstractJsonLexer(
return source.substring(current, closingQuote)
}

override fun consumeStringChunked(consumeChunk: (stringChunk: String) -> Unit) {
    // The whole input is already an in-memory String here; split it into
    // BATCH_SIZE pieces so callers see the same chunked contract as the
    // streaming lexer.
    for (chunk in consumeKeyString().chunked(BATCH_SIZE)) {
        consumeChunk(chunk)
    }
}

override fun consumeLeadingMatchingValue(keyToMatch: String, isLenient: Boolean): String? {
val positionSnapshot = currentPosition
try {
Expand Down