From a22a2dfb5a6ad60cc83363c84f0122ff5ba4c52e Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Mon, 28 Oct 2024 14:51:15 +0100 Subject: [PATCH 1/3] introduces FastDoubleParser and the useFastDoubleParser parameter in ParserOptions, plus tests --- core/build.gradle.kts | 1 + .../kotlinx/dataframe/api/convert.kt | 31 ++- .../jetbrains/kotlinx/dataframe/api/parse.kt | 17 ++ .../kotlinx/dataframe/documentation/utils.kt | 7 + .../kotlinx/dataframe/impl/api/parse.kt | 39 +-- .../dataframe/impl/io/FastDoubleParser.kt | 233 ++++++++++++++++++ .../dataframe/io/FastDoubleParserTests.kt | 163 ++++++++++++ .../kotlinx/dataframe/io/ParserTests.kt | 90 ++++--- gradle/libs.versions.toml | 2 + 9 files changed, 505 insertions(+), 78 deletions(-) create mode 100644 core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt create mode 100644 core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/FastDoubleParserTests.kt diff --git a/core/build.gradle.kts b/core/build.gradle.kts index d3a3d06a75..4bfbb2df6a 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -71,6 +71,7 @@ dependencies { implementation(libs.commonsIo) implementation(libs.serialization.core) implementation(libs.serialization.json) + implementation(libs.fastDoubleParser) implementation(libs.fuel) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index aa3aacd82f..f921af7d3f 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -185,21 +185,32 @@ public fun DataColumn.convertToDouble(): DataColumn = conve public fun DataColumn.convertToDouble(): DataColumn = convertTo() /** - * Parse String column to Double considering locale (number format). + * Parses a String column to Double considering locale (number format). 
* If [locale] parameter is defined, it's number format is used for parsing. - * If [locale] parameter is null, the current system locale is used. If column can not be parsed, then POSIX format is used. + * If [locale] parameter is null, the current system locale is used. + * If the column cannot be parsed, then the POSIX format is used. + * + * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now. */ @JvmName("convertToDoubleFromString") -public fun DataColumn.convertToDouble(locale: Locale? = null): DataColumn = - this.castToNullable().convertToDouble(locale).castToNotNullable() +public fun DataColumn.convertToDouble( + locale: Locale? = null, + useFastDoubleParser: Boolean = false, +): DataColumn = this.castToNullable().convertToDouble(locale, useFastDoubleParser).castToNotNullable() /** - * Parse String column to Double considering locale (number format). + * Parses a String column to Double considering locale (number format). * If [locale] parameter is defined, it's number format is used for parsing. - * If [locale] parameter is null, the current system locale is used. If column can not be parsed, then POSIX format is used. + * If [locale] parameter is null, the current system locale is used. + * If the column cannot be parsed, then the POSIX format is used. + * + * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now. */ @JvmName("convertToDoubleFromStringNullable") -public fun DataColumn.convertToDouble(locale: Locale? = null): DataColumn { +public fun DataColumn.convertToDouble( + locale: Locale? = null, + useFastDoubleParser: Boolean = false, +): DataColumn { fun applyParser(parser: (String) -> Double?): DataColumn { var currentRow = 0 try { @@ -220,14 +231,14 @@ public fun DataColumn.convertToDouble(locale: Locale? 
= null): DataColu } return if (locale != null) { - val explicitParser = Parsers.getDoubleParser(locale) + val explicitParser = Parsers.getDoubleParser(locale, useFastDoubleParser) applyParser(explicitParser) } else { try { - val defaultParser = Parsers.getDoubleParser() + val defaultParser = Parsers.getDoubleParser(useFastDoubleParser = useFastDoubleParser) applyParser(defaultParser) } catch (e: TypeConversionException) { - val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8")) + val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8"), useFastDoubleParser) applyParser(posixParser) } } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index 5d4bedcbd4..0ebef02c1f 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -40,12 +40,29 @@ public interface GlobalParserOptions { public var locale: Locale } +/** + * ### Options for parsing [String]`?` columns + * + * @param locale locale to use for parsing dates and numbers, defaults to the System default locale. + * If specified instead of [dateTimeFormatter], it will be used in combination with [dateTimePattern] + * to create a [DateTimeFormatter]. Just providing [locale] will not allow you to parse + * locale-specific dates! + * @param dateTimeFormatter a [DateTimeFormatter] to use for parsing dates, if not specified, it will be created + * from [dateTimePattern] and [locale]. If neither [dateTimeFormatter] nor [dateTimePattern] are specified, + * [DateTimeFormatter.ISO_LOCAL_DATE_TIME] will be used. + * @param dateTimePattern a pattern to use for parsing dates. If specified instead of [dateTimeFormatter], + * it will be used to create a [DateTimeFormatter]. + * @param nullStrings a set of strings that should be treated as `null` values. By default, it's + * ["null", "NULL", "NA", "N/A"]. 
+ * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now. + */ public data class ParserOptions( val locale: Locale? = null, // TODO, migrate to kotlinx.datetime.format.DateTimeFormat? https://github.com/Kotlin/dataframe/issues/876 val dateTimeFormatter: DateTimeFormatter? = null, val dateTimePattern: String? = null, val nullStrings: Set? = null, + val useFastDoubleParser: Boolean = false, ) { internal fun getDateTimeFormatter(): DateTimeFormatter? = when { diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/utils.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/utils.kt index c17fcd610c..0377567a8b 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/utils.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/utils.kt @@ -19,24 +19,31 @@ import kotlin.annotation.AnnotationTarget.VALUE_PARAMETER * {@include [Indent]} * */ +@ExcludeFromSources internal interface LineBreak /**   */ +@ExcludeFromSources internal interface QuarterIndent /**    */ +@ExcludeFromSources internal interface HalfIndent /**      */ +@ExcludeFromSources internal interface Indent /**          */ +@ExcludeFromSources internal interface DoubleIndent /**              */ +@ExcludeFromSources internal interface TripleIndent /**                  */ +@ExcludeFromSources internal interface QuadrupleIndent /** diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index c8577a2f98..147f4c275a 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -33,13 +33,13 @@ import org.jetbrains.kotlinx.dataframe.hasNulls import org.jetbrains.kotlinx.dataframe.impl.canParse import org.jetbrains.kotlinx.dataframe.impl.catchSilent import 
org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType +import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser import org.jetbrains.kotlinx.dataframe.impl.javaDurationCanParse import org.jetbrains.kotlinx.dataframe.io.isURL import org.jetbrains.kotlinx.dataframe.io.readJsonStr import org.jetbrains.kotlinx.dataframe.values import java.math.BigDecimal import java.net.URL -import java.text.NumberFormat import java.text.ParsePosition import java.time.format.DateTimeFormatter import java.time.format.DateTimeFormatterBuilder @@ -274,29 +274,6 @@ internal object Parsers : GlobalParserOptions { null } - private fun String.parseDouble(format: NumberFormat) = - when (uppercase(Locale.getDefault())) { - "NAN" -> Double.NaN - - "INF" -> Double.POSITIVE_INFINITY - - "-INF" -> Double.NEGATIVE_INFINITY - - "INFINITY" -> Double.POSITIVE_INFINITY - - "-INFINITY" -> Double.NEGATIVE_INFINITY - - else -> { - val parsePosition = ParsePosition(0) - val result: Double? = format.parse(this, parsePosition)?.toDouble() - if (parsePosition.index != this.length) { - null - } else { - result - } - } - } - inline fun stringParser( catch: Boolean = false, coveredBy: Set = emptySet(), @@ -316,11 +293,15 @@ internal object Parsers : GlobalParserOptions { ): StringParserWithFormat = StringParserWithFormat(typeOf(), coveredBy, body) private val parserToDoubleWithOptions = stringParserWithOptions { options -> - val numberFormat = NumberFormat.getInstance(options?.locale ?: Locale.getDefault()) - val parser = { it: String -> it.parseDouble(numberFormat) } + val fastDoubleParser = FastDoubleParser(options ?: ParserOptions()) + val parser = { it: String -> fastDoubleParser.parseOrNull(it) } parser } + private val posixDoubleParser = FastDoubleParser( + ParserOptions(locale = Locale.forLanguageTag("C.UTF-8")), + ) + internal val parsersOrder = listOf( // Int stringParser { it.toIntOrNull() }, @@ -383,7 +364,7 @@ internal object Parsers : GlobalParserOptions { // Double, with explicit 
number format or taken from current locale parserToDoubleWithOptions, // Double, with POSIX format - stringParser { it.parseDouble(NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8"))) }, + stringParser { posixDoubleParser.parseOrNull(it) }, // Boolean stringParser { it.toBooleanOrNull() }, // BigDecimal @@ -448,9 +429,9 @@ internal object Parsers : GlobalParserOptions { return parser.applyOptions(options) } - internal fun getDoubleParser(locale: Locale? = null): (String) -> Double? { + internal fun getDoubleParser(locale: Locale? = null, useFastDoubleParser: Boolean): (String) -> Double? { val options = if (locale != null) { - ParserOptions(locale = locale) + ParserOptions(locale = locale, useFastDoubleParser = useFastDoubleParser) } else { null } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt new file mode 100644 index 0000000000..64fbcd1312 --- /dev/null +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt @@ -0,0 +1,233 @@ +package org.jetbrains.kotlinx.dataframe.impl.io + +import ch.randelshofer.fastdoubleparser.ConfigurableDoubleParser +import ch.randelshofer.fastdoubleparser.NumberFormatSymbols +import io.github.oshai.kotlinlogging.KotlinLogging +import org.jetbrains.kotlinx.dataframe.api.ParserOptions +import java.nio.charset.Charset +import java.text.DecimalFormatSymbols +import java.text.NumberFormat +import java.text.ParsePosition +import java.util.Locale + +private val logger = KotlinLogging.logger {} + +// (lowercase) strings that are recognized to represent infinity and NaN in doubles in all locales +private val INFINITIES = arrayOf("∞", "inf", "infinity", "infty") +private val PLUS_INFINITIES = INFINITIES.map { "+$it" } +private val MINUS_INFINITIES = INFINITIES.map { "-$it" } +private val NANS = arrayOf("nan", "na", "n/a") + +/** + * Parses a [String]/[CharSequence], [CharArray], or 
[ByteArray] into a [Double]. + * + * If [ParserOptions.useFastDoubleParser] is enabled, it will try to parse the input with an _EXPERIMENTAL_ + * fast double parser, [FastDoubleParser](https://github.com/wrandelshofer/FastDoubleParser). + * If not, or if it fails, it will use [NumberFormat] to parse the input. + * + * Public, so it can be used in other modules. + * + * @param parserOptions can be supplied to configure the parser. + * We'll only use [ParserOptions.locale] and [ParserOptions.useFastDoubleParser]. + */ +@Suppress("ktlint:standard:comment-wrapping") +public class FastDoubleParser(private val parserOptions: ParserOptions) { + + private val supportedFastCharsets = setOf(Charsets.UTF_8, Charsets.ISO_8859_1, Charsets.US_ASCII) + + private val locale = parserOptions.locale ?: Locale.getDefault() + private val fallbackLocale = Locale.ROOT + + private val localDecimalFormatSymbols = DecimalFormatSymbols.getInstance(locale) + private val fallbackDecimalFormatSymbols = DecimalFormatSymbols.getInstance(fallbackLocale) + + private val parser = ConfigurableDoubleParser(/* symbols = */ setupNumberFormatSymbols(), /* ignoreCase = */ true) + + /** + * Sets up the [NumberFormatSymbols] for the [ConfigurableDoubleParser] based on + * [localDecimalFormatSymbols] with fallbacks from [fallbackDecimalFormatSymbols]. + * + * Fallback characters/strings are only added if they're not clashing with local characters/strings. 
+ */ + private fun setupNumberFormatSymbols(): NumberFormatSymbols { + // collect all chars and strings that are locale-specific such that we can check whether + // fallback chars and strings are safe to add + val localChars = with(localDecimalFormatSymbols) { + buildSet { + add(decimalSeparator.lowercaseChar()) + add(groupingSeparator.lowercaseChar()) + add(minusSign.lowercaseChar()) + add('+') + add(zeroDigit.lowercaseChar()) + } + } + val localStrings = with(localDecimalFormatSymbols) { + buildSet { + add(exponentSeparator.lowercase()) + add(infinity.lowercase()) + add(naN.lowercase()) + } + } + + /** + * Builds a set with the specified char from [localDecimalFormatSymbols] and + * its fallback char from [fallbackDecimalFormatSymbols] if it's safe to do so. + * [additionals] will be added to the set too, when they're safe to add. + */ + fun ((DecimalFormatSymbols) -> Char).fromLocalWithFallBack(vararg additionals: Char): Set = + buildSet { + val getChar = this@fromLocalWithFallBack + val char = getChar(localDecimalFormatSymbols).lowercaseChar() + add(char) + + // add fallback char if it's safe to do so + val fallbackChar = getChar(fallbackDecimalFormatSymbols).lowercaseChar() + if (fallbackChar !in localChars && !localStrings.any { fallbackChar in it }) { + add(fallbackChar) + } + + // Fixes NBSP and other whitespace characters not being recognized if the user writes space instead. + if (char.isWhitespace()) add(' ') + + // add additional chars if needed + for (additional in additionals) { + val lowercase = additional.lowercaseChar() + if (lowercase !in localChars && !localStrings.any { lowercase in it }) { + add(lowercase) + } + } + } + + /** + * Builds a set with the specified string from [localDecimalFormatSymbols] and + * its fallback string from [fallbackDecimalFormatSymbols] if it's safe to do so. + * [additionals] will be added to the set too, when they're safe to add. 
+ */ + fun ((DecimalFormatSymbols) -> String).fromLocalWithFallBack(vararg additionals: String): Set = + buildSet { + val getString = this@fromLocalWithFallBack + val string = getString(localDecimalFormatSymbols).lowercase() + add(string) + + // add fallback string if it's safe to do so + val fallbackString = getString(fallbackDecimalFormatSymbols).lowercase() + if (!fallbackString.any { it in localChars } && fallbackString !in localStrings) { + add(fallbackString) + } + + // Fixes NBSP and other whitespace characters not being recognized if the user writes space instead. + if (string.isBlank()) add(" ") + + // add additional strings if needed + for (additional in additionals) { + val lowercase = additional.lowercase() + if (!lowercase.any { it in localChars } && lowercase !in localStrings) { + add(lowercase) + } + } + } + + return NumberFormatSymbols.fromDecimalFormatSymbols(localDecimalFormatSymbols) + .withPlusSign(setOf('+')) + .withDecimalSeparator(DecimalFormatSymbols::getDecimalSeparator.fromLocalWithFallBack()) + .withGroupingSeparator(DecimalFormatSymbols::getGroupingSeparator.fromLocalWithFallBack()) + .withExponentSeparator(DecimalFormatSymbols::getExponentSeparator.fromLocalWithFallBack()) + .withMinusSign(DecimalFormatSymbols::getMinusSign.fromLocalWithFallBack()) + .withInfinity(DecimalFormatSymbols::getInfinity.fromLocalWithFallBack(*INFINITIES)) + .withNaN(DecimalFormatSymbols::getNaN.fromLocalWithFallBack(*NANS)) + } + + /** Fallback method for parsing doubles. */ + private fun String.parseToDoubleOrNullFallback(): Double? 
= + when (lowercase()) { + in INFINITIES, in PLUS_INFINITIES -> Double.POSITIVE_INFINITY + + in MINUS_INFINITIES -> Double.NEGATIVE_INFINITY + + in NANS -> Double.NaN + + else -> { + // not thread safe; must be created here + val numberFormat = NumberFormat.getInstance(locale) + val parsePosition = ParsePosition(0) + val result = numberFormat.parse(this, parsePosition)?.toDouble() + if (parsePosition.index != this.length || parsePosition.errorIndex != -1) { + null + } else { + result + } + } + }.also { + if (it == null) { + logger.debug { "Could not parse '$this' as Double with NumberFormat with locale '$locale'." } + } + } + + /** + * Parses a double value from a substring of the specified byte array. + * + * It uses the [fast double parser][ConfigurableDoubleParser] if [ParserOptions.useFastDoubleParser] is enabled, + * else, or if that fails, it uses [parseToDoubleOrNullFallback]. + */ + public fun parseOrNull( + ba: ByteArray, + offset: Int = 0, + length: Int = ba.size, + charset: Charset = Charsets.UTF_8, + ): Double? { + if (parserOptions.useFastDoubleParser && charset in supportedFastCharsets) { + try { + return parser.parseDouble(ba, offset, length) + } catch (e: Exception) { + logger.debug(e) { + "Failed to parse '${ + ba.toString(charset) + }' from a ByteArray to Double with FastDoubleParser with locale '$locale'." + } + } + } + return String(bytes = ba, offset = offset, length = length, charset = charset) + .parseToDoubleOrNullFallback() + } + + /** + * Parses a double value from the specified [CharSequence]. + * + * It uses the [fast double parser][ConfigurableDoubleParser] if [ParserOptions.useFastDoubleParser] is enabled, + * else, or if that fails, it uses [parseToDoubleOrNullFallback]. + */ + public fun parseOrNull(cs: CharSequence): Double? 
{ + if (parserOptions.useFastDoubleParser) { + try { + return parser.parseDouble(cs) + } catch (e: Exception) { + logger.debug(e) { + "Failed to parse '$cs' from a CharSequence to Double with FastDoubleParser with locale '$locale'." + } + } + } + + return cs.toString().parseToDoubleOrNullFallback() + } + + /** + * Parses a double value from the specified [CharArray]. + * + * It uses the [fast double parser][ConfigurableDoubleParser] if [ParserOptions.useFastDoubleParser] is enabled, + * else, or if that fails, it uses [parseToDoubleOrNullFallback]. + */ + public fun parseOrNull(ca: CharArray, offset: Int = 0, length: Int = ca.size): Double? { + if (parserOptions.useFastDoubleParser) { + try { + return parser.parseDouble(ca, offset, length) + } catch (e: Exception) { + logger.debug(e) { + "Failed to parse '${ + ca.joinToString("") + }' from a CharArray to Double with FastDoubleParser with locale '$locale'." + } + } + } + return String(chars = ca, offset = offset, length = length).parseToDoubleOrNullFallback() + } +} diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/FastDoubleParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/FastDoubleParserTests.kt new file mode 100644 index 0000000000..2d90652c61 --- /dev/null +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/FastDoubleParserTests.kt @@ -0,0 +1,163 @@ +package org.jetbrains.kotlinx.dataframe.io + +import io.kotest.matchers.collections.shouldContainInOrder +import org.jetbrains.kotlinx.dataframe.api.ParserOptions +import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser +import org.junit.After +import org.junit.Before +import org.junit.Test +import java.util.Locale + +private const val LOG_LEVEL = "org.slf4j.simpleLogger.defaultLogLevel" + +class FastDoubleParserTests { + + private var loggerBefore: String?
= null + + @Before + fun setLogger() { + loggerBefore = System.getProperty(LOG_LEVEL) + System.setProperty(LOG_LEVEL, "debug") + } + + @After + fun restoreLogger() { + if (loggerBefore != null) { + System.setProperty(LOG_LEVEL, loggerBefore) + } + } + + @Test + fun `can fast parse doubles`() { + val parser = FastDoubleParser(ParserOptions(locale = Locale.ROOT, useFastDoubleParser = true)) + + val numbers = listOf( + "+12.45", + "-13.35", + "100123.35", + "-204,235.23", + "1.234e3", + "3e-04", // failed with old double parser + "nAn", + "-N/a", + "inf", + "-InfinIty", + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + 3e-04, + Double.NaN, + -Double.NaN, + Double.POSITIVE_INFINITY, + Double.NEGATIVE_INFINITY, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } + + @Test + fun `can fast parse german locale`() { + val parser = FastDoubleParser(ParserOptions(locale = Locale.GERMANY, useFastDoubleParser = true)) + + val numbers = listOf( + "12,45", + "-13,35", + "100.123,35", + "-204.235,23", + "1,234e3", + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } + + @Test + fun `can fast parse french locale`() { + val parser = FastDoubleParser(ParserOptions(locale = Locale.FRANCE, useFastDoubleParser = true)) + + val numbers = listOf( + "12,45", + "-13,35", + "100 123,35", + "-204 235,23", + 
"1,234e3", + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } + + @Test + fun `can fast parse estonian locale`() { + val parser = FastDoubleParser( + ParserOptions(locale = Locale.forLanguageTag("et-EE"), useFastDoubleParser = true), + ) + + val numbers = listOf( + "12,45", + "−13,35", // note the different minus sign '−' vs '-' + "100 123,35", + "−204 235,23", // note the different minus sign '−' vs '-' + "1,234e3", + "-345,122", // check forgiving behavior with 'ordinary' minus sign + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + -345.122, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } +} diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 6ca02e2f41..c293865117 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -9,6 +9,7 @@ import kotlinx.datetime.toKotlinLocalDate import kotlinx.datetime.toKotlinLocalDateTime import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.api.cast import 
org.jetbrains.kotlinx.dataframe.api.columnOf import org.jetbrains.kotlinx.dataframe.api.convertTo @@ -129,6 +130,17 @@ class ParserTests { } @Test + fun `custom nullStrings`() { + val col by columnOf("1", "2", "null", "3", "NA", "nothing", "4.0", "5.0") + + val parsed = col.tryParse( + ParserOptions(nullStrings = setOf("null", "NA", "nothing")), + ) + parsed.type() shouldBe typeOf() + parsed.toList() shouldBe listOf(1, 2, null, 3, null, null, 4.0, 5.0) + } + + @Test // This does not yet use fastDoubleParser! fun `converting String to Double in different locales`() { val currentLocale = Locale.getDefault() try { @@ -148,68 +160,68 @@ class ParserTests { Locale.setDefault(Locale.forLanguageTag("C.UTF-8")) - columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnComma.convertTo().shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertTo() shouldBe columnOf(12.345, 67.89) + columnComma.convertTo() shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) - columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnComma.convertTo().shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertTo() shouldBe columnOf(12.345, 67.89) + columnComma.convertTo() shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) - columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) - columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) + columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67890.0) - 
columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89)) - columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67.89) + columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } - columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89)) + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } Locale.setDefault(Locale.forLanguageTag("en-US")) - columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnComma.convertTo().shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertTo() shouldBe columnOf(12.345, 67.89) + columnComma.convertTo() shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) - columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnComma.convertTo().shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertTo() shouldBe columnOf(12.345, 67.89) + columnComma.convertTo() shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) - columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) - columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) + 
columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67890.0) - columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89)) - columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67.89) + columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } - columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89)) + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } Locale.setDefault(Locale.forLanguageTag("ru-RU")) - columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnComma.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertTo() shouldBe columnOf(12.345, 67.89) + columnComma.convertTo() shouldBe columnOf(12.345, 67.89) + columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) - columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnComma.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertTo() shouldBe columnOf(12.345, 67.89) + columnComma.convertTo() shouldBe columnOf(12.345, 67.89) + columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) - columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) - columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) - 
columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) + columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) + columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67890.0) - columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89)) - columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67.89) + columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } - columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89)) + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } } finally { Locale.setDefault(currentLocale) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 4be1812d3d..123b3d83a8 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -17,6 +17,7 @@ dataframe = "0.14.1" korro = "0.1.6" kover = "0.8.3" +fastDoubleParser = "2.0.0" commonsCsv = "1.11.0" commonsCompress = "1.27.1" commonsIo = "2.16.1" @@ -65,6 +66,7 @@ kotlin-stdlib-jdk8 = { group = "org.jetbrains.kotlin", name = "kotlin-stdlib-jdk kotlin-reflect = { group = "org.jetbrains.kotlin", name = "kotlin-reflect", version.ref = "kotlin" } kotlin-scriptingJvm = { group = "org.jetbrains.kotlin", name = "kotlin-scripting-jvm", version.ref = "kotlin" } +fastDoubleParser = { group = "ch.randelshofer", name = "fastdoubleparser", version.ref = "fastDoubleParser" } commonsCsv = { 
group = "org.apache.commons", name = "commons-csv", version.ref = "commonsCsv" } commonsCompress = { group = "org.apache.commons", name = "commons-compress", version.ref = "commonsCompress" } commonsIo = { group = "commons-io", name = "commons-io", version.ref = "commonsIo" } From 57d492f05d9ffb0e56fd4965613dfe71a54bf0a9 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Tue, 29 Oct 2024 18:49:38 +0100 Subject: [PATCH 2/3] avoiding binary incompatibility issues by modifying the public API --- .../kotlinx/dataframe/api/convert.kt | 28 +++++++++---- .../jetbrains/kotlinx/dataframe/api/parse.kt | 40 +++++++++++++++++++ .../dataframe/util/deprecationMessages.kt | 4 ++ 3 files changed, 64 insertions(+), 8 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index f921af7d3f..7eed1e126b 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -24,6 +24,7 @@ import org.jetbrains.kotlinx.dataframe.columns.ColumnReference import org.jetbrains.kotlinx.dataframe.columns.toColumnSet import org.jetbrains.kotlinx.dataframe.dataTypes.IFRAME import org.jetbrains.kotlinx.dataframe.dataTypes.IMG +import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources import org.jetbrains.kotlinx.dataframe.exceptions.CellConversionException import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException import org.jetbrains.kotlinx.dataframe.impl.api.Parsers @@ -189,27 +190,38 @@ public fun DataColumn.convertToDouble(): DataColumn = con * If [locale] parameter is defined, it's number format is used for parsing. * If [locale] parameter is null, the current system locale is used. * If the column cannot be parsed, then the POSIX format is used. 
- * + */ +@ExcludeFromSources +private interface DataColumnStringConvertToDoubleDoc + +/** @include [DataColumnStringConvertToDoubleDoc] */ +@JvmName("convertToDoubleFromString") +public fun DataColumn.convertToDouble(locale: Locale? = null): DataColumn = + convertToDouble(locale = locale, useFastDoubleParser = false) + +/** + * @include [DataColumnStringConvertToDoubleDoc] * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now. */ @JvmName("convertToDoubleFromString") public fun DataColumn.convertToDouble( locale: Locale? = null, - useFastDoubleParser: Boolean = false, + useFastDoubleParser: Boolean, ): DataColumn = this.castToNullable().convertToDouble(locale, useFastDoubleParser).castToNotNullable() +/** @include [DataColumnStringConvertToDoubleDoc] */ +@JvmName("convertToDoubleFromStringNullable") +public fun DataColumn.convertToDouble(locale: Locale? = null): DataColumn = + convertToDouble(locale = locale, useFastDoubleParser = false) + /** - * Parses a String column to Double considering locale (number format). - * If [locale] parameter is defined, it's number format is used for parsing. - * If [locale] parameter is null, the current system locale is used. - * If the column cannot be parsed, then the POSIX format is used. - * + * @include [DataColumnStringConvertToDoubleDoc] * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now. */ @JvmName("convertToDoubleFromStringNullable") public fun DataColumn.convertToDouble( locale: Locale? 
= null, - useFastDoubleParser: Boolean = false, + useFastDoubleParser: Boolean, ): DataColumn { fun applyParser(parser: (String) -> Double?): DataColumn { var currentRow = 0 diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index 0ebef02c1f..c42ce19168 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -11,6 +11,8 @@ import org.jetbrains.kotlinx.dataframe.impl.api.StringParser import org.jetbrains.kotlinx.dataframe.impl.api.parseImpl import org.jetbrains.kotlinx.dataframe.impl.api.tryParseImpl import org.jetbrains.kotlinx.dataframe.typeClass +import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS +import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS_COPY import java.time.format.DateTimeFormatter import java.util.Locale import kotlin.reflect.KProperty @@ -64,6 +66,44 @@ public data class ParserOptions( val nullStrings: Set? = null, val useFastDoubleParser: Boolean = false, ) { + + /** For binary compatibility. */ + @Deprecated( + message = PARSER_OPTIONS, + level = DeprecationLevel.HIDDEN, + ) + public constructor( + locale: Locale? = null, + dateTimeFormatter: DateTimeFormatter? = null, + dateTimePattern: String? = null, + nullStrings: Set? = null, + ) : this( + locale = locale, + dateTimeFormatter = dateTimeFormatter, + dateTimePattern = dateTimePattern, + nullStrings = nullStrings, + useFastDoubleParser = false, + ) + + /** For binary compatibility. */ + @Deprecated( + message = PARSER_OPTIONS_COPY, + level = DeprecationLevel.HIDDEN, + ) + public fun copy( + locale: Locale? = this.locale, + dateTimeFormatter: DateTimeFormatter? = this.dateTimeFormatter, + dateTimePattern: String? = this.dateTimePattern, + nullStrings: Set? 
= this.nullStrings, + ): ParserOptions = + ParserOptions( + locale = locale, + dateTimeFormatter = dateTimeFormatter, + dateTimePattern = dateTimePattern, + nullStrings = nullStrings, + useFastDoubleParser = useFastDoubleParser, + ) + internal fun getDateTimeFormatter(): DateTimeFormatter? = when { dateTimeFormatter != null -> dateTimeFormatter diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/util/deprecationMessages.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/util/deprecationMessages.kt index 05951691a8..2d53370ba0 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/util/deprecationMessages.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/util/deprecationMessages.kt @@ -15,6 +15,10 @@ internal const val DF_READ_NO_CSV = "This function is deprecated and should be r internal const val DF_READ_NO_CSV_REPLACE = "this.readCSV(fileOrUrl, delimiter, header, colTypes, skipLines, readLines, duplicate, charset)" +internal const val PARSER_OPTIONS = "This constructor is only here for binary compatibility. $MESSAGE_0_16" + +internal const val PARSER_OPTIONS_COPY = "This function is only here for binary compatibility. 
$MESSAGE_0_16" + // endregion // region WARNING in 0.16, ERROR in 0.17 From f588127b42d0e84fdbff3a207a97a653f7fa3893 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Thu, 31 Oct 2024 14:42:20 +0100 Subject: [PATCH 3/3] added double parsing test for all locales and streamlined behavior for "-" in RTL languages with NumberFormat --- .../dataframe/impl/io/FastDoubleParser.kt | 30 +++++++++++++ .../dataframe/io/FastDoubleParserTests.kt | 42 +++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt index 64fbcd1312..54b584336b 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt @@ -43,6 +43,10 @@ public class FastDoubleParser(private val parserOptions: ParserOptions) { private val parser = ConfigurableDoubleParser(/* symbols = */ setupNumberFormatSymbols(), /* ignoreCase = */ true) + // Fix for Java 8 RTL languages minus sign not being recognized + private val minusSignIsFormatSymbol = + Character.getType(localDecimalFormatSymbols.minusSign) == Character.FORMAT.toInt() + /** * Sets up the [NumberFormatSymbols] for the [ConfigurableDoubleParser] based on * [localDecimalFormatSymbols] with fallbacks from [fallbackDecimalFormatSymbols]. @@ -177,6 +181,17 @@ public class FastDoubleParser(private val parserOptions: ParserOptions) { ): Double? 
{ if (parserOptions.useFastDoubleParser && charset in supportedFastCharsets) { try { + // Fixes RTL minus sign not being recognized; checked and patched at `offset`, where parsing starts + if (minusSignIsFormatSymbol && String(ba, offset, length, charset).startsWith(localDecimalFormatSymbols.minusSign)) { + val localMinusSize = localDecimalFormatSymbols.minusSign.toString().toByteArray(charset).size + val fallbackMinusSize = fallbackDecimalFormatSymbols.minusSign.toString().toByteArray(charset).size + val newOffset = offset + (localMinusSize - fallbackMinusSize).coerceAtLeast(0) + val newBa = ba.copyOf() + fallbackDecimalFormatSymbols.minusSign.toString().toByteArray(charset) + .copyInto(destination = newBa, destinationOffset = newOffset) + + return parser.parseDouble(newBa, newOffset, length - (newOffset - offset)) + } return parser.parseDouble(ba, offset, length) } catch (e: Exception) { logger.debug(e) { @@ -199,6 +214,15 @@ public class FastDoubleParser(private val parserOptions: ParserOptions) { public fun parseOrNull(cs: CharSequence): Double? { if (parserOptions.useFastDoubleParser) { try { + // Fixes RTL minus sign not being recognized + if (minusSignIsFormatSymbol && cs.startsWith(localDecimalFormatSymbols.minusSign)) { + val newCs = cs.toString().replaceFirst( + localDecimalFormatSymbols.minusSign, + fallbackDecimalFormatSymbols.minusSign, + ) + return parser.parseDouble(newCs) + } + return parser.parseDouble(cs) } catch (e: Exception) { logger.debug(e) { @@ -219,6 +243,12 @@ public class FastDoubleParser(private val parserOptions: ParserOptions) { public fun parseOrNull(ca: CharArray, offset: Int = 0, length: Int = ca.size): Double? { if (parserOptions.useFastDoubleParser) { try { + // Fixes RTL minus sign not being recognized. 
+ if (minusSignIsFormatSymbol && ca.getOrNull(offset) == localDecimalFormatSymbols.minusSign) { + val newCa = ca.copyOf() + newCa[offset] = fallbackDecimalFormatSymbols.minusSign // patch the char at `offset`, where parsing starts + return parser.parseDouble(newCa, offset, length) + } return parser.parseDouble(ca, offset, length) } catch (e: Exception) { logger.debug(e) { diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/FastDoubleParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/FastDoubleParserTests.kt index 2d90652c61..57dbdc6380 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/FastDoubleParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/FastDoubleParserTests.kt @@ -6,6 +6,7 @@ import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser import org.junit.After import org.junit.Before import org.junit.Test +import java.text.NumberFormat import java.util.Locale private const val LOG_LEVEL = "org.slf4j.simpleLogger.defaultLogLevel" @@ -160,4 +161,45 @@ class FastDoubleParserTests { // ByteArray numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) } + + @Test + fun `fast parse any locale`() { + val locales = Locale.getAvailableLocales() + val doubles = listOf( + 12.45, + -12.45, + 100_123.35, + -204_235.23, + 1.234e3, + -345.122, + 0.0, + Double.POSITIVE_INFINITY, + Double.NEGATIVE_INFINITY, + Double.NaN, + ) + + for (locale in locales) { + val parser = FastDoubleParser(ParserOptions(locale = locale, useFastDoubleParser = true)) + val formatter = NumberFormat.getInstance(locale) + for (double in doubles) { + val formatted = formatter.format(double) + val parsedByNumberFormatter = formatter.parse(formatted)?.toDouble() + + val parsedString = parser.parseOrNull(formatted) + assert(double == parsedString || double.isNaN() && parsedString?.isNaN() == true) { + "Failed to parse $formatted with locale $locale. Expected $double, got $parsedString. 
NumberFormat parsed it like: $parsedByNumberFormatter" + } + + val parsedCharArray = parser.parseOrNull(formatted.toCharArray()) + assert(double == parsedCharArray || double.isNaN() && parsedCharArray?.isNaN() == true) { + "Failed to parse $formatted with locale $locale. Expected $double, got $parsedCharArray. NumberFormat parsed it like: $parsedByNumberFormatter" + } + + val parsedByteArray = parser.parseOrNull(formatted.toByteArray()) + assert(double == parsedByteArray || double.isNaN() && parsedByteArray?.isNaN() == true) { + "Failed to parse $formatted with locale $locale. Expected $double, got $parsedByteArray. NumberFormat parsed it like: $parsedByNumberFormatter" + } + } + } + } }