Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support unicode symbols in strings #122

Merged
merged 3 commits into from
Mar 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import com.akuleshov7.ktoml.exceptions.ParseException
import com.akuleshov7.ktoml.parsers.trimBrackets
import com.akuleshov7.ktoml.parsers.trimQuotes
import com.akuleshov7.ktoml.parsers.trimSingleQuotes
import com.akuleshov7.ktoml.utils.appendCodePointCompat
import kotlinx.datetime.*

/**
Expand Down Expand Up @@ -75,6 +76,12 @@ internal constructor(
) : this(content.verifyAndTrimQuotes(lineNo), lineNo)

public companion object {
private const val COMPLEX_UNICODE_LENGTH = 8
private const val COMPLEX_UNICODE_PREFIX = 'U'
private const val HEX_RADIX = 16
private const val SIMPLE_UNICODE_LENGTH = 4
private const val SIMPLE_UNICODE_PREFIX = 'u'

private fun String.verifyAndTrimQuotes(lineNo: Int): Any =
if (startsWith("\"") && endsWith("\"")) {
trimQuotes()
Expand Down Expand Up @@ -104,41 +111,70 @@ internal constructor(

/**
* Replaces backslash escape sequences in this string with the characters they denote
* and returns the converted string.
*
* A backslash that is the very last character of the string is not treated as an escape
* and is copied to the result unchanged.
*
* NOTE(review): this listing appears to interleave the pre-change (flag-based) and the
* post-change (offset-based) versions of the loop body - the +/- markers of the diff are
* missing - so it does not read as a single compilable function. Confirm against the
* repository before relying on it.
*
* @param lineNo line number that is passed to [ParseException] when an unknown escape is found
* @return the string with all recognised escape sequences converted
* @throws ParseException if a backslash is followed by a character that is not a known escape
*/
private fun String.convertSpecialCharacters(lineNo: Int): String {
val resultString = StringBuilder()
// NOTE(review): `updatedOnPreviousStep` is only used by the flag-based variant of the loop
var updatedOnPreviousStep = false
var i = 0
while (i < this.length) {
val newCharacter = if (this[i] == '\\' && i != this.length - 1) {
updatedOnPreviousStep = true
when (this[i + 1]) {
// table that is used to convert escaped string literals to proper char symbols
't' -> '\t'
'b' -> '\b'
'r' -> '\r'
'n' -> '\n'
'\\' -> '\\'
'\'' -> '\''
'"' -> '"'
// NOTE(review): the offset-based variant of the loop seems to start here
while (i < length) {
val currentChar = get(i)
// number of input characters consumed by this iteration; 1 for an ordinary character
var offset = 1
if (currentChar == '\\' && i != lastIndex) {
// Escaped: the backslash and the character after it are consumed together
val next = get(i + 1)
offset++
when (next) {
't' -> resultString.append('\t')
'b' -> resultString.append('\b')
'r' -> resultString.append('\r')
'n' -> resultString.append('\n')
'\\' -> resultString.append('\\')
'\'' -> resultString.append('\'')
'"' -> resultString.append('"')
// unicode escapes are followed by hex digits; the helper appends the decoded
// code point and returns how many extra characters it consumed
SIMPLE_UNICODE_PREFIX, COMPLEX_UNICODE_PREFIX ->
offset += resultString.appendEscapedUnicode(this, next, i + 2, lineNo)
else -> throw ParseException(
"According to TOML documentation unknown" +
" escape symbols are not allowed. Please check: [\\${this[i + 1]}]",
" escape symbols are not allowed. Please check: [\\$next]",
lineNo
)
}
} else {
this[i]
}
// need to skip the next character if we have processed special escaped symbol
if (updatedOnPreviousStep) {
updatedOnPreviousStep = false
i += 2
} else {
i += 1
resultString.append(currentChar)
}

resultString.append(newCharacter)
i += offset
}
return resultString.toString()
}

/**
 * Decodes the hexadecimal digits of a `\uXXXX` or `\UXXXXXXXX` escape sequence and appends
 * the resulting code point to this [StringBuilder].
 *
 * @param fullString string that contains the escape sequence
 * @param marker escape prefix that was found after the backslash; [SIMPLE_UNICODE_PREFIX]
 *   selects [SIMPLE_UNICODE_LENGTH] digits, anything else selects [COMPLEX_UNICODE_LENGTH] digits
 * @param codeStartIndex index in [fullString] of the first hexadecimal digit
 * @param lineNo line number that is reported in the [ParseException]
 * @return number of hexadecimal digits that were consumed from [fullString]
 * @throws ParseException if [fullString] is too short to hold all the digits, if the digits
 *   are not a hexadecimal number that fits into an [Int], or if the code point is rejected
 *   by [appendCodePointCompat]
 */
private fun StringBuilder.appendEscapedUnicode(
    fullString: String,
    marker: Char,
    codeStartIndex: Int,
    lineNo: Int
): Int {
    val nbUnicodeChars = if (marker == SIMPLE_UNICODE_PREFIX) {
        SIMPLE_UNICODE_LENGTH
    } else {
        COMPLEX_UNICODE_LENGTH
    }
    if (codeStartIndex + nbUnicodeChars > fullString.length) {
        // not enough characters left: report everything starting from the marker
        val invalid = fullString.substring(codeStartIndex - 1)
        throw ParseException(
            "According to TOML documentation unknown" +
                " escape symbols are not allowed. Please check: [\\$invalid]",
            lineNo
        )
    }
    val hexCode = fullString.substring(codeStartIndex, codeStartIndex + nbUnicodeChars)
    // The digits are validated explicitly because the stdlib number parsers also accept a
    // leading sign (so "+123" would otherwise be decoded as U+0123); toIntOrNull additionally
    // returns null instead of throwing when the value does not fit into an Int (e.g. "FFFFFFFF").
    val codePoint = hexCode
        .takeIf { digits -> digits.all { it in '0'..'9' || it in 'a'..'f' || it in 'A'..'F' } }
        ?.toIntOrNull(HEX_RADIX)
    try {
        // requireNotNull throws IllegalArgumentException, so malformed digits and code points
        // rejected by appendCodePointCompat are reported through the same handler
        appendCodePointCompat(requireNotNull(codePoint))
    } catch (e: IllegalArgumentException) {
        throw ParseException(
            "According to TOML documentation unknown" +
                " escape symbols are not allowed. Please check: [\\$marker$hexCode]",
            lineNo
        )
    }
    return nbUnicodeChars
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ package com.akuleshov7.ktoml.utils
import com.akuleshov7.ktoml.tree.TomlNode
import com.akuleshov7.ktoml.tree.TomlTablePrimitive

/**
* Appends the character(s) that represent the Unicode code point [codePoint] to this [StringBuilder].
*
* Every platform provides its own actual implementation of this function.
*
* @param codePoint code point to append
* @return [StringBuilder] with appended code point
* @throws IllegalArgumentException if [codePoint] is rejected by the platform implementation
*/
@Throws(IllegalArgumentException::class)
internal expect fun StringBuilder.appendCodePointCompat(codePoint: Int): StringBuilder

/**
* searching (BFS) the table with the [fullTableName]
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,21 @@ class ValueParserTest {
test = TomlKeyValuePrimitive(Pair("a", "\"hello\t\\\\\\\\world\""), 0)
assertEquals("hello\t\\\\world", test.value.content)

test = TomlKeyValuePrimitive("a" to "\"Ɣ is greek\"", 0)
assertEquals("Ɣ is greek", test.value.content)

test = TomlKeyValuePrimitive("a" to "\"\\u0194 is greek\"", 0)
assertEquals("Ɣ is greek", test.value.content)

test = TomlKeyValuePrimitive("a" to "\"\\U0001F615 is emoji\"", 0)
assertEquals("\uD83D\uDE15 is emoji", test.value.content)

test = TomlKeyValuePrimitive("a" to "\"\uD83D\uDE15 is emoji\"", 0)
assertEquals("\uD83D\uDE15 is emoji", test.value.content)

test = TomlKeyValuePrimitive("a" to "\"I'm a string. \\\"You can quote me\\\". Name\\tJos\\u00E9\\nLocation\\tSF.\"", 0)
assertEquals("I'm a string. \"You can quote me\". Name\tJosé\nLocation\tSF.", test.value.content)

// regression test related to comments with an equals symbol after it
var pairTest =
"lineCaptureGroup = 1 # index `warningTextHasLine = false`\n".splitKeyValue(0)
Expand All @@ -105,6 +120,18 @@ class ValueParserTest {
0
)
}
assertFailsWith<ParseException> {
TomlKeyValuePrimitive("a" to "val\\ue", 0)
}
assertFailsWith<ParseException> {
TomlKeyValuePrimitive("a" to "\\x33", 0)
}
assertFailsWith<ParseException> {
TomlKeyValuePrimitive("a" to "\\UFFFFFFFF", 0)
}
assertFailsWith<ParseException> {
TomlKeyValuePrimitive("a" to "\\U00D80000", 0)
}
}
}

Expand Down
21 changes: 21 additions & 0 deletions ktoml-core/src/jsMain/kotlin/com/akuleshov7/ktoml/utils/UtilsJs.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
* Specific implementation for utilities
*/

package com.akuleshov7.ktoml.utils

/** Highest valid Unicode code point, U+10FFFF. */
@Suppress("LONG_NUMERICAL_VALUES_SEPARATED")
private const val MAX_CODE_POINT = 0x10FFFF

/** First code point that has to be encoded as a surrogate pair in UTF-16. */
private const val MIN_SUPPLEMENTARY_CODE_POINT: Int = 0x10000

/** Numeric value of the first low (trailing) surrogate code unit. */
private const val MIN_LOW_SURROGATE: Int = '\uDC00'.code

/** Numeric value of the first high (leading) surrogate code unit. */
private const val MIN_HIGH_SURROGATE: Int = '\uD800'.code

/**
 * Appends [codePoint] to this [StringBuilder]: as a single [Char] when it is below
 * [MIN_SUPPLEMENTARY_CODE_POINT], otherwise as a high and a low surrogate.
 *
 * @param codePoint code point to append
 * @return this [StringBuilder]
 * @throws IllegalArgumentException if [codePoint] is negative or greater than [MAX_CODE_POINT]
 */
@Suppress("MAGIC_NUMBER")
internal actual fun StringBuilder.appendCodePointCompat(codePoint: Int): StringBuilder {
    if (codePoint !in 0..MAX_CODE_POINT) {
        throw IllegalArgumentException()
    }
    if (codePoint < MIN_SUPPLEMENTARY_CODE_POINT) {
        return append(codePoint.toChar())
    }
    // split the supplementary code point into its two UTF-16 code units
    val highSurrogate = Char.MIN_HIGH_SURROGATE + ((codePoint - 0x10000) shr 10)
    val lowSurrogate = Char.MIN_LOW_SURROGATE + (codePoint and 0x3ff)
    append(highSurrogate)
    return append(lowSurrogate)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
/**
* Specific implementation for utilities
*/

package com.akuleshov7.ktoml.utils

/**
 * Appends [codePoint] to this [StringBuilder] by delegating to its `appendCodePoint` member.
 *
 * @param codePoint code point to append
 * @return the result of `appendCodePoint`
 */
internal actual fun StringBuilder.appendCodePointCompat(codePoint: Int): StringBuilder {
    return this.appendCodePoint(codePoint)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
* Specific implementation for utilities
*/

package com.akuleshov7.ktoml.utils

/**
 * Appends [codePoint] to this [StringBuilder]: as a single [Char] when it is below
 * `Char.MIN_SUPPLEMENTARY_CODE_POINT`, otherwise as a high and a low surrogate.
 *
 * @param codePoint code point to append
 * @return this [StringBuilder]
 * @throws IllegalArgumentException if [codePoint] is negative or greater than `Char.MAX_CODE_POINT`
 */
@Suppress("MAGIC_NUMBER")
internal actual fun StringBuilder.appendCodePointCompat(codePoint: Int): StringBuilder {
    if (codePoint !in 0..Char.MAX_CODE_POINT) {
        throw IllegalArgumentException()
    }
    if (codePoint < Char.MIN_SUPPLEMENTARY_CODE_POINT) {
        return append(codePoint.toChar())
    }
    // split the supplementary code point into its two UTF-16 code units
    val highSurrogate = Char.MIN_HIGH_SURROGATE + ((codePoint - 0x10000) shr 10)
    val lowSurrogate = Char.MIN_LOW_SURROGATE + (codePoint and 0x3ff)
    append(highSurrogate)
    return append(lowSurrogate)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
* Specific implementation for utilities
*/

package com.akuleshov7.ktoml.utils

/**
 * Appends [codePoint] to this [StringBuilder]: as a single [Char] when it is below
 * `Char.MIN_SUPPLEMENTARY_CODE_POINT`, otherwise as a high and a low surrogate.
 *
 * @param codePoint code point to append
 * @return this [StringBuilder]
 * @throws IllegalArgumentException if [codePoint] is negative or greater than `Char.MAX_CODE_POINT`
 */
@Suppress("MAGIC_NUMBER")
internal actual fun StringBuilder.appendCodePointCompat(codePoint: Int): StringBuilder {
    if (codePoint !in 0..Char.MAX_CODE_POINT) {
        throw IllegalArgumentException()
    }
    if (codePoint < Char.MIN_SUPPLEMENTARY_CODE_POINT) {
        return append(codePoint.toChar())
    }
    // split the supplementary code point into its two UTF-16 code units
    val highSurrogate = Char.MIN_HIGH_SURROGATE + ((codePoint - 0x10000) shr 10)
    val lowSurrogate = Char.MIN_LOW_SURROGATE + (codePoint and 0x3ff)
    append(highSurrogate)
    return append(lowSurrogate)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
* Specific implementation for utilities
*/

package com.akuleshov7.ktoml.utils

/**
 * Appends [codePoint] to this [StringBuilder]: as a single [Char] when it is below
 * `Char.MIN_SUPPLEMENTARY_CODE_POINT`, otherwise as a high and a low surrogate.
 *
 * @param codePoint code point to append
 * @return this [StringBuilder]
 * @throws IllegalArgumentException if [codePoint] is negative or greater than `Char.MAX_CODE_POINT`
 */
@Suppress("MAGIC_NUMBER")
internal actual fun StringBuilder.appendCodePointCompat(codePoint: Int): StringBuilder {
    if (codePoint !in 0..Char.MAX_CODE_POINT) {
        throw IllegalArgumentException()
    }
    if (codePoint < Char.MIN_SUPPLEMENTARY_CODE_POINT) {
        return append(codePoint.toChar())
    }
    // split the supplementary code point into its two UTF-16 code units
    val highSurrogate = Char.MIN_HIGH_SURROGATE + ((codePoint - 0x10000) shr 10)
    val lowSurrogate = Char.MIN_LOW_SURROGATE + (codePoint and 0x3ff)
    append(highSurrogate)
    return append(lowSurrogate)
}