Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fast double parser #935

Merged
merged 4 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions core/api/core.api
Original file line number Diff line number Diff line change
Expand Up @@ -3817,9 +3817,13 @@ public final class org/jetbrains/kotlinx/dataframe/api/ConvertKt {
public static final fun convertToByteFromT (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToDouble (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToDoubleFromString (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToDoubleFromString (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;Z)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun convertToDoubleFromString$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun convertToDoubleFromString$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToDoubleFromStringNullable (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToDoubleFromStringNullable (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;Z)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun convertToDoubleFromStringNullable$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun convertToDoubleFromStringNullable$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToDoubleFromT (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToFloat (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToFloatFromT (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
Expand Down Expand Up @@ -6482,19 +6486,25 @@ public final class org/jetbrains/kotlinx/dataframe/api/ParseKt {

public final class org/jetbrains/kotlinx/dataframe/api/ParserOptions {
public fun <init> ()V
public fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;)V
public synthetic fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;)V
public synthetic fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ILkotlin/jvm/internal/DefaultConstructorMarker;)V
public fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Z)V
public synthetic fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ZILkotlin/jvm/internal/DefaultConstructorMarker;)V
public final fun component1 ()Ljava/util/Locale;
public final fun component2 ()Ljava/time/format/DateTimeFormatter;
public final fun component3 ()Ljava/lang/String;
public final fun component4 ()Ljava/util/Set;
public final fun copy (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
public final fun component5 ()Z
public final synthetic fun copy (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
public final fun copy (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Z)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
public fun equals (Ljava/lang/Object;)Z
public final fun getDateTimeFormatter ()Ljava/time/format/DateTimeFormatter;
public final fun getDateTimePattern ()Ljava/lang/String;
public final fun getLocale ()Ljava/util/Locale;
public final fun getNullStrings ()Ljava/util/Set;
public final fun getUseFastDoubleParser ()Z
public fun hashCode ()I
public fun toString ()Ljava/lang/String;
}
Expand Down Expand Up @@ -10198,6 +10208,15 @@ public final class org/jetbrains/kotlinx/dataframe/impl/columns/UtilsKt {
public static final fun asAnyFrameColumn (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/columns/FrameColumn;
}

public final class org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser {
public fun <init> (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)V
public final fun parseOrNull (Ljava/lang/CharSequence;)Ljava/lang/Double;
public final fun parseOrNull ([BIILjava/nio/charset/Charset;)Ljava/lang/Double;
public final fun parseOrNull ([CII)Ljava/lang/Double;
public static synthetic fun parseOrNull$default (Lorg/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser;[BIILjava/nio/charset/Charset;ILjava/lang/Object;)Ljava/lang/Double;
public static synthetic fun parseOrNull$default (Lorg/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser;[CIIILjava/lang/Object;)Ljava/lang/Double;
}

public final class org/jetbrains/kotlinx/dataframe/impl/schema/DataFrameSchemaImpl : org/jetbrains/kotlinx/dataframe/schema/DataFrameSchema {
public fun <init> (Ljava/util/Map;)V
public fun compare (Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema;)Lorg/jetbrains/kotlinx/dataframe/schema/CompareResult;
Expand Down
1 change: 1 addition & 0 deletions core/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ dependencies {
implementation(libs.commonsIo)
implementation(libs.serialization.core)
implementation(libs.serialization.json)
implementation(libs.fastDoubleParser)

implementation(libs.fuel)

Expand Down
43 changes: 33 additions & 10 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
import org.jetbrains.kotlinx.dataframe.dataTypes.IFRAME
import org.jetbrains.kotlinx.dataframe.dataTypes.IMG
import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources
import org.jetbrains.kotlinx.dataframe.exceptions.CellConversionException
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
Expand Down Expand Up @@ -185,21 +186,43 @@ public fun <T : Any> DataColumn<T>.convertToDouble(): DataColumn<Double> = conve
public fun <T : Any> DataColumn<T?>.convertToDouble(): DataColumn<Double?> = convertTo()

/**
* Parse String column to Double considering locale (number format).
* Parses a String column to Double considering locale (number format).
* If [locale] parameter is defined, its number format is used for parsing.
* If [locale] parameter is null, the current system locale is used. If column can not be parsed, then POSIX format is used.
* If [locale] parameter is null, the current system locale is used.
* If the column cannot be parsed, then the POSIX format is used.
*/
@ExcludeFromSources
private interface DataColumnStringConvertToDoubleDoc

/** @include [DataColumnStringConvertToDoubleDoc] */
@JvmName("convertToDoubleFromString")
public fun DataColumn<String>.convertToDouble(locale: Locale? = null): DataColumn<Double> =
this.castToNullable().convertToDouble(locale).castToNotNullable()
convertToDouble(locale = locale, useFastDoubleParser = false)

/**
* Parse String column to Double considering locale (number format).
* If [locale] parameter is defined, its number format is used for parsing.
* If [locale] parameter is null, the current system locale is used. If column can not be parsed, then POSIX format is used.
* @include [DataColumnStringConvertToDoubleDoc]
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser. This overload requires an explicit value; the overload without this parameter passes `false` for now.
*/
@JvmName("convertToDoubleFromString")
public fun DataColumn<String>.convertToDouble(
locale: Locale? = null,
useFastDoubleParser: Boolean,
): DataColumn<Double> = this.castToNullable().convertToDouble(locale, useFastDoubleParser).castToNotNullable()

/** @include [DataColumnStringConvertToDoubleDoc] */
@JvmName("convertToDoubleFromStringNullable")
public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColumn<Double?> =
convertToDouble(locale = locale, useFastDoubleParser = false)

/**
* @include [DataColumnStringConvertToDoubleDoc]
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser. This overload requires an explicit value; the overload without this parameter passes `false` for now.
*/
@JvmName("convertToDoubleFromStringNullable")
public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColumn<Double?> {
public fun DataColumn<String?>.convertToDouble(
locale: Locale? = null,
useFastDoubleParser: Boolean,
): DataColumn<Double?> {
fun applyParser(parser: (String) -> Double?): DataColumn<Double?> {
var currentRow = 0
try {
Expand All @@ -220,14 +243,14 @@ public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColu
}

return if (locale != null) {
val explicitParser = Parsers.getDoubleParser(locale)
val explicitParser = Parsers.getDoubleParser(locale, useFastDoubleParser)
applyParser(explicitParser)
} else {
try {
val defaultParser = Parsers.getDoubleParser()
val defaultParser = Parsers.getDoubleParser(useFastDoubleParser = useFastDoubleParser)
applyParser(defaultParser)
} catch (e: TypeConversionException) {
val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8"))
val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8"), useFastDoubleParser)
applyParser(posixParser)
}
}
Expand Down
57 changes: 57 additions & 0 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import org.jetbrains.kotlinx.dataframe.impl.api.StringParser
import org.jetbrains.kotlinx.dataframe.impl.api.parseImpl
import org.jetbrains.kotlinx.dataframe.impl.api.tryParseImpl
import org.jetbrains.kotlinx.dataframe.typeClass
import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS
import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS_COPY
import java.time.format.DateTimeFormatter
import java.util.Locale
import kotlin.reflect.KProperty
Expand Down Expand Up @@ -40,13 +42,68 @@ public interface GlobalParserOptions {
public var locale: Locale
}

/**
* ### Options for parsing [String]`?` columns
*
* @param locale locale to use for parsing dates and numbers, defaults to the System default locale.
* If specified instead of [dateTimeFormatter], it will be used in combination with [dateTimePattern]
* to create a [DateTimeFormatter]. Just providing [locale] will not allow you to parse
* locale-specific dates!
* @param dateTimeFormatter a [DateTimeFormatter] to use for parsing dates, if not specified, it will be created
* from [dateTimePattern] and [locale]. If neither [dateTimeFormatter] nor [dateTimePattern] is specified,
* [DateTimeFormatter.ISO_LOCAL_DATE_TIME] will be used.
* @param dateTimePattern a pattern to use for parsing dates. If specified instead of [dateTimeFormatter],
* it will be used to create a [DateTimeFormatter].
* @param nullStrings a set of strings that should be treated as `null` values. By default, it's
* ["null", "NULL", "NA", "N/A"].
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
*/
public data class ParserOptions(
val locale: Locale? = null,
// TODO, migrate to kotlinx.datetime.format.DateTimeFormat? https://github.com/Kotlin/dataframe/issues/876
val dateTimeFormatter: DateTimeFormatter? = null,
val dateTimePattern: String? = null,
val nullStrings: Set<String>? = null,
val useFastDoubleParser: Boolean = false,
) {

/** For binary compatibility. */
@Deprecated(
message = PARSER_OPTIONS,
level = DeprecationLevel.HIDDEN,
)
public constructor(
locale: Locale? = null,
dateTimeFormatter: DateTimeFormatter? = null,
dateTimePattern: String? = null,
nullStrings: Set<String>? = null,
) : this(
locale = locale,
dateTimeFormatter = dateTimeFormatter,
dateTimePattern = dateTimePattern,
nullStrings = nullStrings,
useFastDoubleParser = false,
)

/** For binary compatibility. */
@Deprecated(
message = PARSER_OPTIONS_COPY,
level = DeprecationLevel.HIDDEN,
)
public fun copy(
locale: Locale? = this.locale,
dateTimeFormatter: DateTimeFormatter? = this.dateTimeFormatter,
dateTimePattern: String? = this.dateTimePattern,
nullStrings: Set<String>? = this.nullStrings,
): ParserOptions =
ParserOptions(
locale = locale,
dateTimeFormatter = dateTimeFormatter,
dateTimePattern = dateTimePattern,
nullStrings = nullStrings,
useFastDoubleParser = useFastDoubleParser,
)

internal fun getDateTimeFormatter(): DateTimeFormatter? =
when {
dateTimeFormatter != null -> dateTimeFormatter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,24 +19,31 @@ import kotlin.annotation.AnnotationTarget.VALUE_PARAMETER
* {@include [Indent]}
*
*/
@ExcludeFromSources
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These documentation interfaces can be excluded from the sources as they are only ever accessed from kdoc.

internal interface LineBreak

/** &nbsp; */
@ExcludeFromSources
internal interface QuarterIndent

/** &nbsp;&nbsp; */
@ExcludeFromSources
internal interface HalfIndent

/** &nbsp;&nbsp;&nbsp;&nbsp; */
@ExcludeFromSources
internal interface Indent

/** &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; */
@ExcludeFromSources
internal interface DoubleIndent

/** &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; */
@ExcludeFromSources
internal interface TripleIndent

/** &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; */
@ExcludeFromSources
internal interface QuadrupleIndent

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,13 @@ import org.jetbrains.kotlinx.dataframe.hasNulls
import org.jetbrains.kotlinx.dataframe.impl.canParse
import org.jetbrains.kotlinx.dataframe.impl.catchSilent
import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType
import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser
import org.jetbrains.kotlinx.dataframe.impl.javaDurationCanParse
import org.jetbrains.kotlinx.dataframe.io.isURL
import org.jetbrains.kotlinx.dataframe.io.readJsonStr
import org.jetbrains.kotlinx.dataframe.values
import java.math.BigDecimal
import java.net.URL
import java.text.NumberFormat
import java.text.ParsePosition
import java.time.format.DateTimeFormatter
import java.time.format.DateTimeFormatterBuilder
Expand Down Expand Up @@ -275,29 +275,6 @@ internal object Parsers : GlobalParserOptions {
null
}

private fun String.parseDouble(format: NumberFormat) =
when (uppercase(Locale.getDefault())) {
"NAN" -> Double.NaN

"INF" -> Double.POSITIVE_INFINITY

"-INF" -> Double.NEGATIVE_INFINITY

"INFINITY" -> Double.POSITIVE_INFINITY

"-INFINITY" -> Double.NEGATIVE_INFINITY

else -> {
val parsePosition = ParsePosition(0)
val result: Double? = format.parse(this, parsePosition)?.toDouble()
if (parsePosition.index != this.length) {
null
} else {
result
}
}
}

inline fun <reified T : Any> stringParser(
catch: Boolean = false,
coveredBy: Set<KType> = emptySet(),
Expand All @@ -317,11 +294,15 @@ internal object Parsers : GlobalParserOptions {
): StringParserWithFormat<T> = StringParserWithFormat(typeOf<T>(), coveredBy, body)

private val parserToDoubleWithOptions = stringParserWithOptions { options ->
val numberFormat = NumberFormat.getInstance(options?.locale ?: Locale.getDefault())
val parser = { it: String -> it.parseDouble(numberFormat) }
val fastDoubleParser = FastDoubleParser(options ?: ParserOptions())
val parser = { it: String -> fastDoubleParser.parseOrNull(it) }
parser
}

// Double parser for the "POSIX format" fallback entry in `parsersOrder`.
// It is created once and reused for every parsed value, instead of building a new
// parser object inside the per-value lambda.
// Only `locale` is passed here, so every other ParserOptions field keeps its default.
// NOTE(review): "C.UTF-8" is a POSIX locale name, not a well-formed BCP 47 language tag;
// Locale.forLanguageTag presumably ignores the ill-formed subtag and yields the root
// locale — confirm this is the intended "POSIX" number format.
private val posixDoubleParser = FastDoubleParser(
ParserOptions(locale = Locale.forLanguageTag("C.UTF-8")),
)

internal val parsersOrder = listOf(
// Int
stringParser<Int> { it.toIntOrNull() },
Expand Down Expand Up @@ -384,7 +365,7 @@ internal object Parsers : GlobalParserOptions {
// Double, with explicit number format or taken from current locale
parserToDoubleWithOptions,
// Double, with POSIX format
stringParser<Double> { it.parseDouble(NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8"))) },
stringParser<Double> { posixDoubleParser.parseOrNull(it) },
// Boolean
stringParser<Boolean> { it.toBooleanOrNull() },
// BigDecimal
Expand Down Expand Up @@ -449,9 +430,9 @@ internal object Parsers : GlobalParserOptions {
return parser.applyOptions(options)
}

internal fun getDoubleParser(locale: Locale? = null): (String) -> Double? {
internal fun getDoubleParser(locale: Locale? = null, useFastDoubleParser: Boolean): (String) -> Double? {
val options = if (locale != null) {
ParserOptions(locale = locale)
ParserOptions(locale = locale, useFastDoubleParser = useFastDoubleParser)
} else {
null
}
Expand Down
Loading