Skip to content

Commit

Permalink
Support unicode symbols in strings (#122)
Browse files Browse the repository at this point in the history
Fixes #72
  • Loading branch information
Valentin Rocher authored Mar 11, 2022
1 parent baed32d commit 4d2fb18
Show file tree
Hide file tree
Showing 8 changed files with 169 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import com.akuleshov7.ktoml.exceptions.ParseException
import com.akuleshov7.ktoml.parsers.trimBrackets
import com.akuleshov7.ktoml.parsers.trimQuotes
import com.akuleshov7.ktoml.parsers.trimSingleQuotes
import com.akuleshov7.ktoml.utils.appendCodePointCompat
import kotlinx.datetime.*

/**
Expand Down Expand Up @@ -75,6 +76,12 @@ internal constructor(
) : this(content.verifyAndTrimQuotes(lineNo), lineNo)

public companion object {
// Number of hexadecimal digits read after the `\U` escape marker
private const val COMPLEX_UNICODE_LENGTH = 8
// Escape marker (the character following the backslash) of the long unicode escape
private const val COMPLEX_UNICODE_PREFIX = 'U'
// Radix used to parse the hexadecimal digits of a unicode escape
private const val HEX_RADIX = 16
// Number of hexadecimal digits read after the `\u` escape marker
private const val SIMPLE_UNICODE_LENGTH = 4
// Escape marker (the character following the backslash) of the short unicode escape
private const val SIMPLE_UNICODE_PREFIX = 'u'

private fun String.verifyAndTrimQuotes(lineNo: Int): Any =
if (startsWith("\"") && endsWith("\"")) {
trimQuotes()
Expand Down Expand Up @@ -104,41 +111,70 @@ internal constructor(

/**
 * Replaces backslash escape sequences in the receiver with the characters they denote.
 *
 * Recognised escapes are `\t`, `\b`, `\r`, `\n`, `\\`, `\'`, `\"` and the unicode forms
 * introduced by [SIMPLE_UNICODE_PREFIX] / [COMPLEX_UNICODE_PREFIX], which are delegated to
 * [appendEscapedUnicode]. A backslash that is the very last character is copied as is.
 *
 * @param lineNo line number passed to the [ParseException]
 * @return the string with escape sequences replaced
 * @throws ParseException if a backslash is followed by a character that is not a known escape
 */
// NOTE(review): this span appears to interleave the removed and the added lines of a diff
// (e.g. two `while` headers, two variants of the error message line) - confirm against the real file.
private fun String.convertSpecialCharacters(lineNo: Int): String {
val resultString = StringBuilder()
var updatedOnPreviousStep = false
var i = 0
while (i < this.length) {
val newCharacter = if (this[i] == '\\' && i != this.length - 1) {
updatedOnPreviousStep = true
when (this[i + 1]) {
// table that is used to convert escaped string literals to proper char symbols
't' -> '\t'
'b' -> '\b'
'r' -> '\r'
'n' -> '\n'
'\\' -> '\\'
'\'' -> '\''
'"' -> '"'
while (i < length) {
val currentChar = get(i)
var offset = 1
if (currentChar == '\\' && i != lastIndex) {
// Escaped
val next = get(i + 1)
offset++
when (next) {
't' -> resultString.append('\t')
'b' -> resultString.append('\b')
'r' -> resultString.append('\r')
'n' -> resultString.append('\n')
'\\' -> resultString.append('\\')
'\'' -> resultString.append('\'')
'"' -> resultString.append('"')
SIMPLE_UNICODE_PREFIX, COMPLEX_UNICODE_PREFIX ->
offset += resultString.appendEscapedUnicode(this, next, i + 2, lineNo)
else -> throw ParseException(
"According to TOML documentation unknown" +
" escape symbols are not allowed. Please check: [\\${this[i + 1]}]",
" escape symbols are not allowed. Please check: [\\$next]",
lineNo
)
}
} else {
this[i]
}
// need to skip the next character if we have processed special escaped symbol
if (updatedOnPreviousStep) {
updatedOnPreviousStep = false
i += 2
} else {
i += 1
resultString.append(currentChar)
}

resultString.append(newCharacter)
i += offset
}
return resultString.toString()
}

/**
 * Decodes a single `\uXXXX` / `\UXXXXXXXX` escape and appends the resulting code point to the receiver.
 *
 * @param fullString string that contains the escape sequence
 * @param marker character that followed the backslash; [SIMPLE_UNICODE_PREFIX] selects the
 * [SIMPLE_UNICODE_LENGTH]-digit form, anything else the [COMPLEX_UNICODE_LENGTH]-digit form
 * @param codeStartIndex index in [fullString] of the first hexadecimal digit
 * @param lineNo line number passed to the [ParseException]
 * @return number of hexadecimal digits consumed
 * @throws ParseException if there are too few characters left, if the digits are not plain
 * hexadecimal, or if the code point is rejected by [appendCodePointCompat]
 */
private fun StringBuilder.appendEscapedUnicode(
    fullString: String,
    marker: Char,
    codeStartIndex: Int,
    lineNo: Int
): Int {
    val nbUnicodeChars = if (marker == SIMPLE_UNICODE_PREFIX) {
        SIMPLE_UNICODE_LENGTH
    } else {
        COMPLEX_UNICODE_LENGTH
    }
    if (codeStartIndex + nbUnicodeChars > fullString.length) {
        val invalid = fullString.substring(codeStartIndex - 1)
        throw ParseException(
            "According to TOML documentation unknown" +
                " escape symbols are not allowed. Please check: [\\$invalid]",
            lineNo
        )
    }
    val hexCode = fullString.substring(codeStartIndex, codeStartIndex + nbUnicodeChars)
    val errorMessage = "According to TOML documentation unknown" +
        " escape symbols are not allowed. Please check: [\\$marker$hexCode]"
    // Validate the digits explicitly: a bare toInt() would leak NumberFormatException on
    // non-hex input or overflow (e.g. FFFFFFFF) and would silently accept a leading sign.
    val codePoint = hexCode
        .takeIf { code -> code.all { it in '0'..'9' || it in 'a'..'f' || it in 'A'..'F' } }
        ?.toIntOrNull(HEX_RADIX)
        ?: throw ParseException(errorMessage, lineNo)
    try {
        appendCodePointCompat(codePoint)
    } catch (e: IllegalArgumentException) {
        // the platform implementation rejected the value as not being a code point
        throw ParseException(errorMessage, lineNo)
    }
    return nbUnicodeChars
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ package com.akuleshov7.ktoml.utils
import com.akuleshov7.ktoml.tree.TomlNode
import com.akuleshov7.ktoml.tree.TomlTablePrimitive

/**
* Append a unicode code point to a [StringBuilder].
*
* This is an `expect` declaration: every platform source set supplies its own `actual` implementation.
*
* @param codePoint numeric value of the code point to append
* @return [StringBuilder] with appended code point
* @throws IllegalArgumentException if the platform implementation does not accept [codePoint]
* as a code point
*/
@Throws(IllegalArgumentException::class)
internal expect fun StringBuilder.appendCodePointCompat(codePoint: Int): StringBuilder

/**
* searching (BFS) the table with the [fullTableName]
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,21 @@ class ValueParserTest {
test = TomlKeyValuePrimitive(Pair("a", "\"hello\t\\\\\\\\world\""), 0)
assertEquals("hello\t\\\\world", test.value.content)

test = TomlKeyValuePrimitive("a" to "\"Ɣ is greek\"", 0)
assertEquals("Ɣ is greek", test.value.content)

test = TomlKeyValuePrimitive("a" to "\"\\u0194 is greek\"", 0)
assertEquals("Ɣ is greek", test.value.content)

test = TomlKeyValuePrimitive("a" to "\"\\U0001F615 is emoji\"", 0)
assertEquals("\uD83D\uDE15 is emoji", test.value.content)

test = TomlKeyValuePrimitive("a" to "\"\uD83D\uDE15 is emoji\"", 0)
assertEquals("\uD83D\uDE15 is emoji", test.value.content)

test = TomlKeyValuePrimitive("a" to "\"I'm a string. \\\"You can quote me\\\". Name\\tJos\\u00E9\\nLocation\\tSF.\"", 0)
assertEquals("I'm a string. \"You can quote me\". Name\tJosé\nLocation\tSF.", test.value.content)

// regression test related to comments with an equals symbol after it
var pairTest =
"lineCaptureGroup = 1 # index `warningTextHasLine = false`\n".splitKeyValue(0)
Expand All @@ -105,6 +120,18 @@ class ValueParserTest {
0
)
}
assertFailsWith<ParseException> {
TomlKeyValuePrimitive("a" to "val\\ue", 0)
}
assertFailsWith<ParseException> {
TomlKeyValuePrimitive("a" to "\\x33", 0)
}
assertFailsWith<ParseException> {
TomlKeyValuePrimitive("a" to "\\UFFFFFFFF", 0)
}
assertFailsWith<ParseException> {
TomlKeyValuePrimitive("a" to "\\U00D80000", 0)
}
}
}

Expand Down
21 changes: 21 additions & 0 deletions ktoml-core/src/jsMain/kotlin/com/akuleshov7/ktoml/utils/UtilsJs.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
* Specific implementation for utilities
*/

package com.akuleshov7.ktoml.utils

// Highest Unicode code point, U+10FFFF.
@Suppress("LONG_NUMERICAL_VALUES_SEPARATED")
private const val MAX_CODE_POINT = 0x10FFFF

// First code point outside the Basic Multilingual Plane; from here on a surrogate pair is needed.
private const val MIN_SUPPLEMENTARY_CODE_POINT: Int = 0x10000

/**
 * Appends [codePoint] to the receiver.
 *
 * Values below [MIN_SUPPLEMENTARY_CODE_POINT] are appended as a single [Char]; values up to
 * [MAX_CODE_POINT] are appended as a high/low surrogate pair.
 *
 * @param codePoint numeric value of the code point to append
 * @return the receiver, after the append
 * @throws IllegalArgumentException if [codePoint] is negative or greater than [MAX_CODE_POINT]
 */
@Suppress("MAGIC_NUMBER")
internal actual fun StringBuilder.appendCodePointCompat(codePoint: Int): StringBuilder = when (codePoint) {
    in 0 until MIN_SUPPLEMENTARY_CODE_POINT -> append(codePoint.toChar())
    in MIN_SUPPLEMENTARY_CODE_POINT..MAX_CODE_POINT -> {
        // high surrogate carries the top 10 bits of (codePoint - 0x10000), low surrogate the bottom 10
        append(Char.MIN_HIGH_SURROGATE + ((codePoint - 0x10000) shr 10))
        append(Char.MIN_LOW_SURROGATE + (codePoint and 0x3ff))
    }
    else -> throw IllegalArgumentException()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
/**
* Specific implementation for utilities
*/

package com.akuleshov7.ktoml.utils

/**
 * Appends [codePoint] to the receiver by delegating to the builder's own `appendCodePoint`.
 *
 * @param codePoint numeric value of the code point to append
 * @return the builder returned by `appendCodePoint`
 */
internal actual fun StringBuilder.appendCodePointCompat(codePoint: Int): StringBuilder {
    return this.appendCodePoint(codePoint)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
* Specific implementation for utilities
*/

package com.akuleshov7.ktoml.utils

/**
 * Appends [codePoint] to the receiver.
 *
 * Values below `Char.MIN_SUPPLEMENTARY_CODE_POINT` are appended as a single [Char]; values up to
 * `Char.MAX_CODE_POINT` are appended as a high/low surrogate pair.
 *
 * @param codePoint numeric value of the code point to append
 * @return the receiver, after the append
 * @throws IllegalArgumentException if [codePoint] is negative or greater than `Char.MAX_CODE_POINT`
 */
@Suppress("MAGIC_NUMBER")
internal actual fun StringBuilder.appendCodePointCompat(codePoint: Int): StringBuilder {
    if (codePoint < 0 || codePoint > Char.MAX_CODE_POINT) {
        throw IllegalArgumentException()
    }
    if (codePoint < Char.MIN_SUPPLEMENTARY_CODE_POINT) {
        return append(codePoint.toChar())
    }
    // supplementary plane: split into a surrogate pair
    val highSurrogate = Char.MIN_HIGH_SURROGATE + ((codePoint - 0x10000) shr 10)
    val lowSurrogate = Char.MIN_LOW_SURROGATE + (codePoint and 0x3ff)
    append(highSurrogate)
    return append(lowSurrogate)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
* Specific implementation for utilities
*/

package com.akuleshov7.ktoml.utils

/**
 * Appends [codePoint] to the receiver.
 *
 * Values below `Char.MIN_SUPPLEMENTARY_CODE_POINT` are appended as a single [Char]; values up to
 * `Char.MAX_CODE_POINT` are appended as a high/low surrogate pair.
 *
 * @param codePoint numeric value of the code point to append
 * @return the receiver, after the append
 * @throws IllegalArgumentException if [codePoint] is negative or greater than `Char.MAX_CODE_POINT`
 */
@Suppress("MAGIC_NUMBER")
internal actual fun StringBuilder.appendCodePointCompat(codePoint: Int): StringBuilder {
    if (codePoint < 0 || codePoint > Char.MAX_CODE_POINT) {
        throw IllegalArgumentException()
    }
    if (codePoint < Char.MIN_SUPPLEMENTARY_CODE_POINT) {
        return append(codePoint.toChar())
    }
    // supplementary plane: split into a surrogate pair
    val highSurrogate = Char.MIN_HIGH_SURROGATE + ((codePoint - 0x10000) shr 10)
    val lowSurrogate = Char.MIN_LOW_SURROGATE + (codePoint and 0x3ff)
    append(highSurrogate)
    return append(lowSurrogate)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
* Specific implementation for utilities
*/

package com.akuleshov7.ktoml.utils

/**
 * Appends [codePoint] to the receiver.
 *
 * Values below `Char.MIN_SUPPLEMENTARY_CODE_POINT` are appended as a single [Char]; values up to
 * `Char.MAX_CODE_POINT` are appended as a high/low surrogate pair.
 *
 * @param codePoint numeric value of the code point to append
 * @return the receiver, after the append
 * @throws IllegalArgumentException if [codePoint] is negative or greater than `Char.MAX_CODE_POINT`
 */
@Suppress("MAGIC_NUMBER")
internal actual fun StringBuilder.appendCodePointCompat(codePoint: Int): StringBuilder {
    if (codePoint < 0 || codePoint > Char.MAX_CODE_POINT) {
        throw IllegalArgumentException()
    }
    if (codePoint < Char.MIN_SUPPLEMENTARY_CODE_POINT) {
        return append(codePoint.toChar())
    }
    // supplementary plane: split into a surrogate pair
    val highSurrogate = Char.MIN_HIGH_SURROGATE + ((codePoint - 0x10000) shr 10)
    val lowSurrogate = Char.MIN_LOW_SURROGATE + (codePoint and 0x3ff)
    append(highSurrogate)
    return append(lowSurrogate)
}

0 comments on commit 4d2fb18

Please sign in to comment.