From bb0dc8f6c6d9ff0b7b889869bb16b219c6b16b8c Mon Sep 17 00:00:00 2001 From: Vsevolod Tolstopyatov Date: Tue, 17 Aug 2021 18:12:21 +0300 Subject: [PATCH] Performance-friendly JsonLexer (#1635) * Performance-friendly JsonLexer --- .../src/kotlinx/serialization/json/Json.kt | 2 +- .../json/internal/JsonExceptions.kt | 2 +- .../json/internal/JsonTreeReader.kt | 2 +- .../json/internal/StreamingJsonDecoder.kt | 8 +- .../json/internal/TreeJsonDecoder.kt | 2 +- .../AbstractJsonLexer.kt} | 135 ++++++------------ .../json/internal/lexer/StringJsonLexer.kt | 95 ++++++++++++ .../serialization/json/JsonTestBase.kt | 2 +- .../kotlinx/serialization/json/JvmStreams.kt | 2 +- .../json/internal/JsonLexerJvm.kt | 83 ++++++++++- .../features/JsonStreamFlowTest.kt | 2 +- 11 files changed, 227 insertions(+), 108 deletions(-) rename formats/json/commonMain/src/kotlinx/serialization/json/internal/{JsonLexer.kt => lexer/AbstractJsonLexer.kt} (85%) create mode 100644 formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/StringJsonLexer.kt diff --git a/formats/json/commonMain/src/kotlinx/serialization/json/Json.kt b/formats/json/commonMain/src/kotlinx/serialization/json/Json.kt index 9d7efecd0a..bfced1cd51 100644 --- a/formats/json/commonMain/src/kotlinx/serialization/json/Json.kt +++ b/formats/json/commonMain/src/kotlinx/serialization/json/Json.kt @@ -95,7 +95,7 @@ public sealed class Json( * @throws [SerializationException] if the given JSON string cannot be deserialized to the value of type [T]. 
*/ public final override fun decodeFromString(deserializer: DeserializationStrategy, string: String): T { - val lexer = JsonLexer(string) + val lexer = StringJsonLexer(string) val input = StreamingJsonDecoder(this, WriteMode.OBJ, lexer, deserializer.descriptor) val result = input.decodeSerializableValue(deserializer) lexer.expectEof() diff --git a/formats/json/commonMain/src/kotlinx/serialization/json/internal/JsonExceptions.kt b/formats/json/commonMain/src/kotlinx/serialization/json/internal/JsonExceptions.kt index a78051d2f5..1f57de47b8 100644 --- a/formats/json/commonMain/src/kotlinx/serialization/json/internal/JsonExceptions.kt +++ b/formats/json/commonMain/src/kotlinx/serialization/json/internal/JsonExceptions.kt @@ -45,7 +45,7 @@ internal fun InvalidFloatingPointDecoded(value: Number, key: String, output: Str JsonDecodingException(-1, unexpectedFpErrorMessage(value, key, output)) // Extension on JSON reader and fail immediately -internal fun JsonLexer.throwInvalidFloatingPointDecoded(result: Number): Nothing { +internal fun AbstractJsonLexer.throwInvalidFloatingPointDecoded(result: Number): Nothing { fail("Unexpected special floating-point value $result. By default, " + "non-finite floating point values are prohibited because they do not conform JSON specification. 
" + specialFlowingValuesHint diff --git a/formats/json/commonMain/src/kotlinx/serialization/json/internal/JsonTreeReader.kt b/formats/json/commonMain/src/kotlinx/serialization/json/internal/JsonTreeReader.kt index e60b321f25..b6d56b044d 100644 --- a/formats/json/commonMain/src/kotlinx/serialization/json/internal/JsonTreeReader.kt +++ b/formats/json/commonMain/src/kotlinx/serialization/json/internal/JsonTreeReader.kt @@ -10,7 +10,7 @@ import kotlinx.serialization.json.* @OptIn(ExperimentalSerializationApi::class) internal class JsonTreeReader( configuration: JsonConfiguration, - private val lexer: JsonLexer + private val lexer: AbstractJsonLexer ) { private val isLenient = configuration.isLenient private var stackDepth = 0 diff --git a/formats/json/commonMain/src/kotlinx/serialization/json/internal/StreamingJsonDecoder.kt b/formats/json/commonMain/src/kotlinx/serialization/json/internal/StreamingJsonDecoder.kt index a0ec61ad69..f7f591410f 100644 --- a/formats/json/commonMain/src/kotlinx/serialization/json/internal/StreamingJsonDecoder.kt +++ b/formats/json/commonMain/src/kotlinx/serialization/json/internal/StreamingJsonDecoder.kt @@ -13,13 +13,13 @@ import kotlinx.serialization.modules.* import kotlin.jvm.* /** - * [JsonDecoder] which reads given JSON from [JsonLexer] field by field. + * [JsonDecoder] which reads given JSON from [AbstractJsonLexer] field by field. 
*/ @OptIn(ExperimentalSerializationApi::class, ExperimentalUnsignedTypes::class) internal open class StreamingJsonDecoder( final override val json: Json, private val mode: WriteMode, - @JvmField internal val lexer: JsonLexer, + @JvmField internal val lexer: AbstractJsonLexer, descriptor: SerialDescriptor ) : JsonDecoder, AbstractDecoder() { @@ -256,7 +256,7 @@ internal open class StreamingJsonDecoder( @OptIn(ExperimentalSerializationApi::class) @ExperimentalUnsignedTypes internal class JsonDecoderForUnsignedTypes( - private val lexer: JsonLexer, + private val lexer: AbstractJsonLexer, json: Json ) : AbstractDecoder() { override val serializersModule: SerializersModule = json.serializersModule @@ -268,7 +268,7 @@ internal class JsonDecoderForUnsignedTypes( override fun decodeShort(): Short = lexer.parseString("UShort") { toUShort().toShort() } } -private inline fun JsonLexer.parseString(expectedType: String, block: String.() -> T): T { +private inline fun AbstractJsonLexer.parseString(expectedType: String, block: String.() -> T): T { val input = consumeStringLenient() try { return input.block() diff --git a/formats/json/commonMain/src/kotlinx/serialization/json/internal/TreeJsonDecoder.kt b/formats/json/commonMain/src/kotlinx/serialization/json/internal/TreeJsonDecoder.kt index 0fde4c9964..55e23a13be 100644 --- a/formats/json/commonMain/src/kotlinx/serialization/json/internal/TreeJsonDecoder.kt +++ b/formats/json/commonMain/src/kotlinx/serialization/json/internal/TreeJsonDecoder.kt @@ -163,7 +163,7 @@ private sealed class AbstractJsonTreeDecoder( @OptIn(ExperimentalUnsignedTypes::class) override fun decodeTaggedInline(tag: String, inlineDescriptor: SerialDescriptor): Decoder = - if (inlineDescriptor.isUnsignedNumber) JsonDecoderForUnsignedTypes(JsonLexer(getPrimitiveValue(tag).content), json) + if (inlineDescriptor.isUnsignedNumber) JsonDecoderForUnsignedTypes(StringJsonLexer(getPrimitiveValue(tag).content), json) else super.decodeTaggedInline(tag, inlineDescriptor) 
} diff --git a/formats/json/commonMain/src/kotlinx/serialization/json/internal/JsonLexer.kt b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt similarity index 85% rename from formats/json/commonMain/src/kotlinx/serialization/json/internal/JsonLexer.kt rename to formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt index 36accfa8fa..1766f019e0 100644 --- a/formats/json/commonMain/src/kotlinx/serialization/json/internal/JsonLexer.kt +++ b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt @@ -4,9 +4,11 @@ package kotlinx.serialization.json.internal +import kotlinx.serialization.json.internal.* import kotlinx.serialization.json.internal.CharMappings.CHAR_TO_TOKEN import kotlinx.serialization.json.internal.CharMappings.ESCAPE_2_CHAR -import kotlin.jvm.JvmField +import kotlin.js.* +import kotlin.jvm.* internal const val lenientHint = "Use 'isLenient = true' in 'Json {}` builder to accept non-compliant JSON." internal const val coerceInputValuesHint = "Use 'coerceInputValues = true' in 'Json {}` builder to coerce nulls to default values." @@ -118,60 +120,47 @@ internal fun charToTokenClass(c: Char) = if (c.code < CTC_MAX) CHAR_TO_TOKEN[c.c internal fun escapeToChar(c: Int): Char = if (c < ESC2C_MAX) ESCAPE_2_CHAR[c] else INVALID -// Streaming JSON reader -internal open class JsonLexer(@JvmField protected var source: CharSequence) { +/** + * The base class that reads the JSON from the given char sequence source. + * It has two implementations: one over the raw [String] instance, [StringJsonLexer], + * and one over an arbitrary stream of data, [ReaderJsonLexer] (JVM-only). + * + * [AbstractJsonLexer] contains base implementation for cold or not performance-sensitive + * methods on top of [CharSequence], but [StringJsonLexer] overrides some + * of them for the performance reasons (devirtualization of [CharSequence] and avoid + * of additional spills). 
+ */ +internal abstract class AbstractJsonLexer { + + protected abstract val source: CharSequence @JvmField protected var currentPosition: Int = 0 // position in source open fun ensureHaveChars() {} - fun expectEof() { - val nextToken = consumeNextToken() - if (nextToken != TC_EOF) - fail("Expected EOF, but had ${source[currentPosition - 1]} instead") - } + // Used as bound check in loops + abstract fun definitelyNotEof(position: Int): Int - // should be used inside loops instead of range checks - protected open fun definitelyNotEof(position: Int): Int = if (position < source.length) position else -1 + abstract fun tryConsumeComma(): Boolean + abstract fun canConsumeValue(): Boolean - fun tryConsumeComma(): Boolean { - val current = skipWhitespaces() - if (current >= source.length || current == -1) return false - if (source[current] == ',') { - ++currentPosition - return true - } - return false - } - - fun canConsumeValue(): Boolean { - ensureHaveChars() - var current = currentPosition - while (true) { - current = definitelyNotEof(current) - if (current == -1) break // could be inline function but KT-1436 - val c = source[current] - // Inlined skipWhitespaces without field spill and nested loop. Also faster then char2TokenClass - if (c == ' ' || c == '\n' || c == '\r' || c == '\t') { - ++current - continue - } - currentPosition = current - return isValidValueStart(c) - } - currentPosition = current - return false - } + abstract fun consumeNextToken(): Byte - private fun isValidValueStart(c: Char): Boolean { + protected fun isValidValueStart(c: Char): Boolean { return when (c) { '}', ']', ':', ',' -> false else -> true } } + fun expectEof() { + val nextToken = consumeNextToken() + if (nextToken != TC_EOF) + fail("Expected EOF, but had ${source[currentPosition - 1]} instead") + } + /* * Peeked string for coerced enums. * If the value was picked, 'consumeString' will take it without scanning the source. 
@@ -188,7 +177,7 @@ internal open class JsonLexer(@JvmField protected var source: CharSequence) { return token } - fun consumeNextToken(expected: Char) { + open fun consumeNextToken(expected: Char) { ensureHaveChars() val source = source var cpos = currentPosition @@ -205,7 +194,7 @@ internal open class JsonLexer(@JvmField protected var source: CharSequence) { unexpectedToken(expected) // EOF } - private fun unexpectedToken(expected: Char) { + protected fun unexpectedToken(expected: Char) { --currentPosition // To properly handle null if (expected == STRING && consumeStringLenient() == NULL) { fail("Expected string literal but 'null' literal was found.\n$coerceInputValuesHint", currentPosition - 4) @@ -213,7 +202,7 @@ internal open class JsonLexer(@JvmField protected var source: CharSequence) { fail(charToTokenClass(expected)) } - private fun fail(expectedToken: Byte) { + protected fun fail(expectedToken: Byte) { // We know that the token was consumed prior to this call // Slow path, never called in normal code, can avoid optimizing it val expected = when (expectedToken) { @@ -248,26 +237,6 @@ internal open class JsonLexer(@JvmField protected var source: CharSequence) { return TC_EOF } - fun consumeNextToken(): Byte { - ensureHaveChars() - val source = source - var cpos = currentPosition - while (true) { - cpos = definitelyNotEof(cpos) - if (cpos == -1) break - val ch = source[cpos++] - return when (val tc = charToTokenClass(ch)) { - TC_WHITESPACE -> continue - else -> { - currentPosition = cpos - tc - } - } - } - currentPosition = cpos - return TC_EOF - } - /** * Tries to consume `null` token from input. 
* Returns `true` if the next 4 chars in input are not `null`, @@ -291,7 +260,7 @@ internal open class JsonLexer(@JvmField protected var source: CharSequence) { return false } - private fun skipWhitespaces(): Int { + open fun skipWhitespaces(): Int { var current = currentPosition // Skip whitespaces while (true) { @@ -329,33 +298,7 @@ internal open class JsonLexer(@JvmField protected var source: CharSequence) { * This method is a copy of consumeString, but used for key of json objects, so there * is no need to lookup peeked string. */ - fun consumeKeyString(): String { - /* - * For strings we assume that escaped symbols are rather an exception, so firstly - * we optimistically scan for closing quote via intrinsified and blazing-fast 'indexOf', - * than do our pessimistic check for backslash and fallback to slow-path if necessary. - */ - consumeNextToken(STRING) - var current = currentPosition - val closingQuote = indexOf('"', current) - if (closingQuote == -1) { - current = definitelyNotEof(current) - if (current == -1) fail(TC_STRING) - // it's also possible just to resize buffer, - // instead of falling back to slow path, - // not sure what is better - else return consumeString(currentPosition, current) - } - // Now we _optimistically_ know where the string ends (it might have been an escaped quote) - for (i in current until closingQuote) { - // Encountered escape sequence, should fallback to "slow" path and symmbolic scanning - if (source[i] == STRING_ESC) { - return consumeString(currentPosition, i) - } - } - this.currentPosition = closingQuote + 1 - return substring(current, closingQuote) - } + abstract fun consumeKeyString(): String fun consumeString(): String { if (peekedString != null) { @@ -365,10 +308,10 @@ internal open class JsonLexer(@JvmField protected var source: CharSequence) { return consumeKeyString() } - private fun consumeString(startPosition: Int, current: Int): String { + @JsName("consumeString2") // WA for JS issue + protected fun 
consumeString(source: CharSequence, startPosition: Int, current: Int): String { var currentPosition = current var lastPosition = startPosition - var source = source var char = source[currentPosition] // Avoid two range checks visible in the profiler var usedAppend = false while (char != STRING) { @@ -383,7 +326,6 @@ internal open class JsonLexer(@JvmField protected var source: CharSequence) { currentPosition = definitelyNotEof(currentPosition) if (currentPosition == -1) fail("EOF", currentPosition) - source = this.source lastPosition = currentPosition } char = source[currentPosition] @@ -453,11 +395,13 @@ internal open class JsonLexer(@JvmField protected var source: CharSequence) { if (current >= source.length) { usedAppend = true appendRange(currentPosition, current) - current = definitelyNotEof(current) - if (current == -1) { + val eof = definitelyNotEof(current) + if (eof == -1) { // to handle plain lenient strings, such as top-level currentPosition = current return decodedString(0, 0) + } else { + current = eof } } } @@ -647,6 +591,7 @@ internal open class JsonLexer(@JvmField protected var source: CharSequence) { return result } + @JsName("consumeBoolean2") // WA for JS issue private fun consumeBoolean(start: Int): Boolean { /* * In ASCII representation, upper and lower case letters are different diff --git a/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/StringJsonLexer.kt b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/StringJsonLexer.kt new file mode 100644 index 0000000000..fc6a6c1ac0 --- /dev/null +++ b/formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/StringJsonLexer.kt @@ -0,0 +1,95 @@ +package kotlinx.serialization.json.internal + +internal class StringJsonLexer(override val source: String) : AbstractJsonLexer() { + + override fun definitelyNotEof(position: Int): Int = if (position < source.length) position else -1 + + override fun consumeNextToken(): Byte { + val source = source + while 
(currentPosition != -1 && currentPosition < source.length) { + val ch = source[currentPosition++] + return when (val tc = charToTokenClass(ch)) { + TC_WHITESPACE -> continue + else -> tc + } + } + return TC_EOF + } + + override fun tryConsumeComma(): Boolean { + val current = skipWhitespaces() + if (current == source.length || current == -1) return false + if (source[current] == ',') { + ++currentPosition + return true + } + return false + } + + override fun canConsumeValue(): Boolean { + var current = currentPosition + if (current == -1) return false + while (current < source.length) { + val c = source[current] + // Inlined skipWhitespaces without field spill and nested loop. Also faster than char2TokenClass + if (c == ' ' || c == '\n' || c == '\r' || c == '\t') { + ++current + continue + } + currentPosition = current + return isValidValueStart(c) + } + currentPosition = current + return false + } + + override fun skipWhitespaces(): Int { + var current = currentPosition + if (current == -1) return current + // Skip whitespaces + while (current < source.length) { + val c = source[current] + // Faster than char2TokenClass actually + if (c == ' ' || c == '\n' || c == '\r' || c == '\t') { + ++current + } else { + break + } + } + currentPosition = current + return current + } + + override fun consumeNextToken(expected: Char) { + if (currentPosition == -1) unexpectedToken(expected) + val source = source + while (currentPosition < source.length) { + val c = source[currentPosition++] + if (c == ' ' || c == '\n' || c == '\r' || c == '\t') continue + if (c == expected) return + unexpectedToken(expected) + } + unexpectedToken(expected) // EOF + } + + override fun consumeKeyString(): String { + /* + * For strings we assume that escaped symbols are rather an exception, so firstly + * we optimistically scan for closing quote via intrinsified and blazing-fast 'indexOf', + * then do our pessimistic check for backslash and fallback to slow-path if necessary. 
+ */ + consumeNextToken(STRING) + val current = currentPosition + val closingQuote = source.indexOf('"', current) + if (closingQuote == -1) fail(TC_STRING) + // Now we _optimistically_ know where the string ends (it might have been an escaped quote) + for (i in current until closingQuote) { + // Encountered escape sequence, should fallback to "slow" path and symbolic scanning + if (source[i] == STRING_ESC) { + return consumeString(source, currentPosition, i) + } + } + this.currentPosition = closingQuote + 1 + return source.substring(current, closingQuote) + } +} diff --git a/formats/json/commonTest/src/kotlinx/serialization/json/JsonTestBase.kt b/formats/json/commonTest/src/kotlinx/serialization/json/JsonTestBase.kt index 38f33a82b1..4b39308d46 100644 --- a/formats/json/commonTest/src/kotlinx/serialization/json/JsonTestBase.kt +++ b/formats/json/commonTest/src/kotlinx/serialization/json/JsonTestBase.kt @@ -66,7 +66,7 @@ abstract class JsonTestBase { decodeViaStream(deserializer, source) } JsonTestingMode.TREE -> { - val lexer = JsonLexer(source) + val lexer = StringJsonLexer(source) val input = StreamingJsonDecoder(this, WriteMode.OBJ, lexer, deserializer.descriptor) val tree = input.decodeJsonElement() lexer.expectEof() diff --git a/formats/json/jvmMain/src/kotlinx/serialization/json/JvmStreams.kt b/formats/json/jvmMain/src/kotlinx/serialization/json/JvmStreams.kt index 4efbaa793b..b4f0b31dd7 100644 --- a/formats/json/jvmMain/src/kotlinx/serialization/json/JvmStreams.kt +++ b/formats/json/jvmMain/src/kotlinx/serialization/json/JvmStreams.kt @@ -57,7 +57,7 @@ public fun Json.decodeFromStream( stream: InputStream, charset: Charset = Charsets.UTF_8 ): T { - val lexer = JsonReaderLexer(stream, charset) + val lexer = ReaderJsonLexer(stream, charset) val input = StreamingJsonDecoder(this, WriteMode.OBJ, lexer, deserializer.descriptor) return input.decodeSerializableValue(deserializer) } diff --git 
a/formats/json/jvmMain/src/kotlinx/serialization/json/internal/JsonLexerJvm.kt b/formats/json/jvmMain/src/kotlinx/serialization/json/internal/JsonLexerJvm.kt index cd4935ce7c..f157d95cd8 100644 --- a/formats/json/jvmMain/src/kotlinx/serialization/json/internal/JsonLexerJvm.kt +++ b/formats/json/jvmMain/src/kotlinx/serialization/json/internal/JsonLexerJvm.kt @@ -26,18 +26,49 @@ private class ArrayAsSequence(private val source: CharArray) : CharSequence { } } -internal class JsonReaderLexer( +internal class ReaderJsonLexer( private val reader: Reader, private var _source: CharArray = CharArray(BATCH_SIZE) -) : JsonLexer(ArrayAsSequence(_source)) { +) : AbstractJsonLexer() { private var threshold: Int = DEFAULT_THRESHOLD // chars constructor(i: InputStream, charset: Charset) : this(i.reader(charset).buffered(READER_BUF_SIZE)) + override var source: CharSequence = ArrayAsSequence(_source) + init { preload(0) } + override fun tryConsumeComma(): Boolean { + val current = skipWhitespaces() + if (current >= source.length || current == -1) return false + if (source[current] == ',') { + ++currentPosition + return true + } + return false + } + + override fun canConsumeValue(): Boolean { + ensureHaveChars() + var current = currentPosition + while (true) { + current = definitelyNotEof(current) + if (current == -1) break // could be inline function but KT-1436 + val c = source[current] + // Inlined skipWhitespaces without field spill and nested loop. 
Also faster than char2TokenClass + if (c == ' ' || c == '\n' || c == '\r' || c == '\t') { + ++current + continue + } + currentPosition = current + return isValidValueStart(c) + } + currentPosition = current + return false + } + private fun preload(spaceLeft: Int) { val buffer = _source System.arraycopy(buffer, currentPosition, buffer, 0, spaceLeft) @@ -66,6 +97,26 @@ internal class JsonReaderLexer( return 0 } + override fun consumeNextToken(): Byte { + ensureHaveChars() + val source = source + var cpos = currentPosition + while (true) { + cpos = definitelyNotEof(cpos) + if (cpos == -1) break + val ch = source[cpos++] + return when (val tc = charToTokenClass(ch)) { + TC_WHITESPACE -> continue + else -> { + currentPosition = cpos + tc + } + } + } + currentPosition = cpos + return TC_EOF + } + override fun ensureHaveChars() { val cur = currentPosition val oldSize = _source.size @@ -76,6 +127,34 @@ internal class JsonReaderLexer( preload(spaceLeft) } + override fun consumeKeyString(): String { + /* + * For strings we assume that escaped symbols are rather an exception, so firstly + * we optimistically scan for closing quote via intrinsified and blazing-fast 'indexOf', + * then do our pessimistic check for backslash and fallback to slow-path if necessary. 
+ */ + consumeNextToken(STRING) + var current = currentPosition + val closingQuote = indexOf('"', current) + if (closingQuote == -1) { + current = definitelyNotEof(current) + if (current == -1) fail(TC_STRING) + // it's also possible just to resize buffer, + // instead of falling back to slow path, + // not sure what is better + else return consumeString(source, currentPosition, current) + } + // Now we _optimistically_ know where the string ends (it might have been an escaped quote) + for (i in current until closingQuote) { + // Encountered escape sequence, should fallback to "slow" path and symbolic scanning + if (source[i] == STRING_ESC) { + return consumeString(source, currentPosition, i) + } + } + this.currentPosition = closingQuote + 1 + return substring(current, closingQuote) + } + override fun indexOf(char: Char, startPos: Int): Int { val src = _source for (i in startPos until src.size) { diff --git a/formats/json/jvmTest/src/kotlinx/serialization/features/JsonStreamFlowTest.kt b/formats/json/jvmTest/src/kotlinx/serialization/features/JsonStreamFlowTest.kt index affd91338c..95fe7c4013 100644 --- a/formats/json/jvmTest/src/kotlinx/serialization/features/JsonStreamFlowTest.kt +++ b/formats/json/jvmTest/src/kotlinx/serialization/features/JsonStreamFlowTest.kt @@ -38,7 +38,7 @@ class JsonStreamFlowTest { f.writeToStream(os) } - assertEquals(inputString, os.toString(Charsets.UTF_8)) + assertEquals(inputString, os.toString(Charsets.UTF_8.name())) } @Test