Skip to content

Commit 4ebb5a0

Browse files
committed
Removed ParserOptions.allTypesExcept; use convertTo in this case instead. Adapted both CSV implementations to use convertTo. Added DataColumn<String>.convertTo overloads that accept ParserOptions (for nullStrings etc.). Moved DataColumn<String>.convertToDouble to impl, fixed nullStrings support for it, and cleaned up the parsers. Added tests for Issue #921.
1 parent 6bb3502 commit 4ebb5a0

File tree

8 files changed

+195
-89
lines changed

8 files changed

+195
-89
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt

Lines changed: 37 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,9 @@ import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
2525
import org.jetbrains.kotlinx.dataframe.dataTypes.IFRAME
2626
import org.jetbrains.kotlinx.dataframe.dataTypes.IMG
2727
import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources
28-
import org.jetbrains.kotlinx.dataframe.exceptions.CellConversionException
29-
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException
3028
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
3129
import org.jetbrains.kotlinx.dataframe.impl.api.convertRowColumnImpl
30+
import org.jetbrains.kotlinx.dataframe.impl.api.convertToDoubleImpl
3231
import org.jetbrains.kotlinx.dataframe.impl.api.convertToTypeImpl
3332
import org.jetbrains.kotlinx.dataframe.impl.api.defaultTimeZone
3433
import org.jetbrains.kotlinx.dataframe.impl.api.toLocalDate
@@ -37,14 +36,12 @@ import org.jetbrains.kotlinx.dataframe.impl.api.toLocalTime
3736
import org.jetbrains.kotlinx.dataframe.impl.api.withRowCellImpl
3837
import org.jetbrains.kotlinx.dataframe.impl.headPlusArray
3938
import org.jetbrains.kotlinx.dataframe.io.toDataFrame
40-
import org.jetbrains.kotlinx.dataframe.path
4139
import java.math.BigDecimal
4240
import java.net.URL
4341
import java.util.Locale
4442
import kotlin.reflect.KProperty
4543
import kotlin.reflect.KType
4644
import kotlin.reflect.full.isSubtypeOf
47-
import kotlin.reflect.full.withNullability
4845
import kotlin.reflect.typeOf
4946

5047
@Interpretable("Convert0")
@@ -130,15 +127,30 @@ public inline fun <T, C, reified R> Convert<T, C>.perRowCol(
130127

131128
public inline fun <reified C> AnyCol.convertTo(): DataColumn<C> = convertTo(typeOf<C>()) as DataColumn<C>
132129

133-
public fun AnyCol.convertTo(newType: KType): AnyCol {
134-
val isTypesAreCorrect = this.type().withNullability(true).isSubtypeOf(typeOf<String?>()) &&
135-
newType.withNullability(true) == typeOf<Double?>()
130+
@Suppress("UNCHECKED_CAST")
131+
public fun AnyCol.convertTo(newType: KType): AnyCol =
132+
when {
133+
type().isSubtypeOf(typeOf<String?>()) ->
134+
(this as DataColumn<String?>).convertTo(newType)
136135

137-
if (isTypesAreCorrect) {
138-
return (this as DataColumn<String?>).convertToDouble().setNullable(newType.isMarkedNullable)
136+
else -> convertToTypeImpl(newType)
137+
}
138+
139+
public inline fun <reified C> DataColumn<String?>.convertTo(
140+
parserOptions: ParserOptions = ParserOptions(),
141+
): DataColumn<C> = convertTo(typeOf<C>(), parserOptions) as DataColumn<C>
142+
143+
public fun DataColumn<String?>.convertTo(newType: KType, parserOptions: ParserOptions = ParserOptions()): AnyCol =
144+
when {
145+
newType.isSubtypeOf(typeOf<Double?>()) ->
146+
(this as DataColumn<String?>).convertToDoubleImpl(
147+
locale = parserOptions.locale,
148+
nullStrings = parserOptions.nullStrings,
149+
useFastDoubleParser = parserOptions.useFastDoubleParser,
150+
).setNullable(newType.isMarkedNullable)
151+
152+
else -> convertToTypeImpl(newType, parserOptions)
139153
}
140-
return convertToTypeImpl(newType)
141-
}
142154

143155
@JvmName("convertToLocalDateTimeFromT")
144156
public fun <T : Any> DataColumn<T>.convertToLocalDateTime(): DataColumn<LocalDateTime> = convertTo()
@@ -197,64 +209,42 @@ private interface DataColumnStringConvertToDoubleDoc
197209
/** @include [DataColumnStringConvertToDoubleDoc] */
198210
@JvmName("convertToDoubleFromString")
199211
public fun DataColumn<String>.convertToDouble(locale: Locale? = null): DataColumn<Double> =
200-
convertToDouble(locale = locale, useFastDoubleParser = false)
212+
convertToDouble(locale = locale, nullStrings = null, useFastDoubleParser = false)
201213

202214
/**
203215
* @include [DataColumnStringConvertToDoubleDoc]
216+
* @param nullStrings a set of strings that should be treated as `null` values. By default, it's ["null", "NULL", "NA", "N/A"].
204217
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
205218
*/
206219
@JvmName("convertToDoubleFromString")
207220
public fun DataColumn<String>.convertToDouble(
208221
locale: Locale? = null,
222+
nullStrings: Set<String>?,
209223
useFastDoubleParser: Boolean,
210-
): DataColumn<Double> = this.castToNullable().convertToDouble(locale, useFastDoubleParser).castToNotNullable()
224+
): DataColumn<Double> =
225+
this.castToNullable().convertToDouble(locale, nullStrings, useFastDoubleParser).castToNotNullable()
211226

212227
/** @include [DataColumnStringConvertToDoubleDoc] */
213228
@JvmName("convertToDoubleFromStringNullable")
214229
public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColumn<Double?> =
215-
convertToDouble(locale = locale, useFastDoubleParser = false)
230+
convertToDouble(locale = locale, nullStrings = null, useFastDoubleParser = false)
216231

217232
/**
218233
* @include [DataColumnStringConvertToDoubleDoc]
234+
* @param nullStrings a set of strings that should be treated as `null` values. By default, it's ["null", "NULL", "NA", "N/A"].
219235
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
220236
*/
221237
@JvmName("convertToDoubleFromStringNullable")
222238
public fun DataColumn<String?>.convertToDouble(
223239
locale: Locale? = null,
240+
nullStrings: Set<String>?,
224241
useFastDoubleParser: Boolean,
225-
): DataColumn<Double?> {
226-
fun applyParser(parser: (String) -> Double?): DataColumn<Double?> {
227-
var currentRow = 0
228-
try {
229-
return mapIndexed { row, value ->
230-
currentRow = row
231-
value?.let {
232-
parser(value.trim()) ?: throw TypeConversionException(
233-
value = value,
234-
from = typeOf<String>(),
235-
to = typeOf<Double>(),
236-
column = path,
237-
)
238-
}
239-
}
240-
} catch (e: TypeConversionException) {
241-
throw CellConversionException(e.value, e.from, e.to, path, currentRow, e)
242-
}
243-
}
244-
245-
return if (locale != null) {
246-
val explicitParser = Parsers.getDoubleParser(locale, useFastDoubleParser)
247-
applyParser(explicitParser)
248-
} else {
249-
try {
250-
val defaultParser = Parsers.getDoubleParser(useFastDoubleParser = useFastDoubleParser)
251-
applyParser(defaultParser)
252-
} catch (e: TypeConversionException) {
253-
val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8"), useFastDoubleParser)
254-
applyParser(posixParser)
255-
}
256-
}
257-
}
242+
): DataColumn<Double?> =
243+
convertToDoubleImpl(
244+
locale = locale,
245+
nullStrings = nullStrings,
246+
useFastDoubleParser = useFastDoubleParser,
247+
)
258248

259249
@JvmName("convertToFloatFromT")
260250
public fun <T : Any> DataColumn<T>.convertToFloat(): DataColumn<Float> = convertTo()

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ public interface GlobalParserOptions {
5858
* @param nullStrings a set of strings that should be treated as `null` values. By default, it's
5959
* ["null", "NULL", "NA", "N/A"].
6060
* @param skipTypes a set of types that should be skipped during parsing. Parsing will be attempted for all other types.
61-
* By default, it's an empty set. To skip all types except some specified ones, use [allTypesExcept].
61+
* By default, it's an empty set. To skip all types except a specified one, use [convertTo] instead.
6262
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
6363
*/
6464
public data class ParserOptions(
@@ -70,14 +70,6 @@ public data class ParserOptions(
7070
val skipTypes: Set<KType> = emptySet(),
7171
val useFastDoubleParser: Boolean = false,
7272
) {
73-
public companion object {
74-
/**
75-
* Small helper function to get all types except the ones specified.
76-
* Useful in combination with the [skipTypes] parameter.
77-
*/
78-
public fun allTypesExcept(vararg types: KType): Set<KType> =
79-
Parsers.parsersOrder.map { it.type }.toSet() - types.toSet()
80-
}
8173

8274
/** For binary compatibility. */
8375
@Deprecated(

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import org.jetbrains.kotlinx.dataframe.api.Convert
2525
import org.jetbrains.kotlinx.dataframe.api.DataSchemaEnum
2626
import org.jetbrains.kotlinx.dataframe.api.Infer
2727
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
28+
import org.jetbrains.kotlinx.dataframe.api.mapIndexed
2829
import org.jetbrains.kotlinx.dataframe.api.name
2930
import org.jetbrains.kotlinx.dataframe.api.to
3031
import org.jetbrains.kotlinx.dataframe.columns.values
@@ -50,6 +51,8 @@ import kotlin.reflect.full.memberProperties
5051
import kotlin.reflect.full.primaryConstructor
5152
import kotlin.reflect.full.withNullability
5253
import kotlin.reflect.jvm.jvmErasure
54+
import kotlin.reflect.typeOf
55+
import kotlin.text.trim
5356
import java.time.Instant as JavaInstant
5457
import java.time.LocalDate as JavaLocalDate
5558
import java.time.LocalDateTime as JavaLocalDateTime
@@ -69,7 +72,64 @@ internal fun <T, C, R> Convert<T, C>.convertRowColumnImpl(
6972
rowConverter: RowColumnExpression<T, C, R>,
7073
): DataFrame<T> = to { col -> df.newColumn(type, col.name, infer) { rowConverter(it, col) } }
7174

72-
internal fun AnyCol.convertToTypeImpl(to: KType): AnyCol {
75+
/**
76+
* Specific implementation for [convertToTypeImpl] for [String] -> [Double] conversion
77+
*
78+
* This function exists because [convertToTypeImpl] can only retrieve a single parser,
79+
* whereas [Double] has two: one with the given locale (or system default) and one POSIX parser.
80+
*/
81+
internal fun DataColumn<String?>.convertToDoubleImpl(
82+
locale: Locale?,
83+
nullStrings: Set<String>?,
84+
useFastDoubleParser: Boolean,
85+
): DataColumn<Double?> {
86+
val nullStrings = nullStrings ?: Parsers.nulls
87+
88+
fun applyParser(parser: (String) -> Double?): DataColumn<Double?> {
89+
var currentRow = 0
90+
try {
91+
return mapIndexed { row, value ->
92+
currentRow = row
93+
value?.let {
94+
if (it in nullStrings) return@let null
95+
96+
parser(value.trim()) ?: throw TypeConversionException(
97+
value = value,
98+
from = typeOf<String>(),
99+
to = typeOf<Double>(),
100+
column = path,
101+
)
102+
}
103+
}
104+
} catch (e: TypeConversionException) {
105+
throw CellConversionException(e.value, e.from, e.to, path, currentRow, e)
106+
}
107+
}
108+
109+
return if (locale != null) {
110+
val explicitParser = Parsers.getDoubleParser(
111+
locale = locale,
112+
useFastDoubleParser = useFastDoubleParser,
113+
)
114+
applyParser(explicitParser)
115+
} else {
116+
try {
117+
val defaultParser =
118+
Parsers.getDoubleParser(
119+
locale = null,
120+
useFastDoubleParser = useFastDoubleParser,
121+
)
122+
applyParser(defaultParser)
123+
} catch (_: TypeConversionException) {
124+
val posixParser = Parsers.getPosixDoubleParser(
125+
useFastDoubleParser = useFastDoubleParser,
126+
)
127+
applyParser(posixParser)
128+
}
129+
}
130+
}
131+
132+
internal fun AnyCol.convertToTypeImpl(to: KType, parserOptions: ParserOptions = ParserOptions()): AnyCol {
73133
val from = type
74134

75135
val nullsAreAllowed = to.isMarkedNullable
@@ -112,7 +172,7 @@ internal fun AnyCol.convertToTypeImpl(to: KType): AnyCol {
112172
value?.let {
113173
val clazz = it.javaClass.kotlin
114174
val type = clazz.createStarProjectedType(false)
115-
val converter = getConverter(type, to, ParserOptions(locale = Locale.getDefault()))
175+
val converter = getConverter(type, to, parserOptions)
116176
?: throw TypeConverterNotFoundException(from, to, path)
117177
converter(it)
118178
}.checkNulls()
@@ -139,7 +199,7 @@ internal fun AnyCol.convertToTypeImpl(to: KType): AnyCol {
139199
}
140200
}
141201

142-
return when (val converter = getConverter(from, to, ParserOptions(locale = Locale.getDefault()))) {
202+
return when (val converter = getConverter(from, to, parserOptions)) {
143203
null -> convertPerCell()
144204
else -> applyConverter(converter)
145205
}

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -298,9 +298,13 @@ internal object Parsers : GlobalParserOptions {
298298
parser
299299
}
300300

301-
private val posixDoubleParser = FastDoubleParser(
302-
ParserOptions(locale = Locale.forLanguageTag("C.UTF-8")),
303-
)
301+
// same as parserToDoubleWithOptions, but overrides the locale to C.UTF-8
302+
private val posixParserToDoubleWithOptions = stringParserWithOptions { options ->
303+
val parserOptions = (options ?: ParserOptions()).copy(locale = Locale.forLanguageTag("C.UTF-8"))
304+
val fastDoubleParser = FastDoubleParser(parserOptions)
305+
val parser = { it: String -> fastDoubleParser.parseOrNull(it) }
306+
parser
307+
}
304308

305309
internal val parsersOrder = listOf(
306310
// Int
@@ -364,7 +368,7 @@ internal object Parsers : GlobalParserOptions {
364368
// Double, with explicit number format or taken from current locale
365369
parserToDoubleWithOptions,
366370
// Double, with POSIX format
367-
stringParser<Double> { posixDoubleParser.parseOrNull(it) },
371+
posixParserToDoubleWithOptions,
368372
// Boolean
369373
stringParser<Boolean> { it.toBooleanOrNull() },
370374
// BigDecimal
@@ -431,14 +435,13 @@ internal object Parsers : GlobalParserOptions {
431435
return parser.applyOptions(options)
432436
}
433437

434-
internal fun getDoubleParser(locale: Locale? = null, useFastDoubleParser: Boolean): (String) -> Double? {
435-
val options = if (locale != null) {
436-
ParserOptions(locale = locale, useFastDoubleParser = useFastDoubleParser)
437-
} else {
438-
null
439-
}
440-
return parserToDoubleWithOptions.applyOptions(options)
441-
}
438+
internal fun getDoubleParser(locale: Locale?, useFastDoubleParser: Boolean): (String) -> Double? =
439+
parserToDoubleWithOptions
440+
.applyOptions(ParserOptions(locale = locale, useFastDoubleParser = useFastDoubleParser))
441+
442+
internal fun getPosixDoubleParser(useFastDoubleParser: Boolean): (String) -> Double? =
443+
posixParserToDoubleWithOptions
444+
.applyOptions(ParserOptions(useFastDoubleParser = useFastDoubleParser))
442445
}
443446

444447
/**

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import org.jetbrains.kotlinx.dataframe.AnyFrame
66
import org.jetbrains.kotlinx.dataframe.DataColumn
77
import org.jetbrains.kotlinx.dataframe.DataFrame
88
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
9+
import org.jetbrains.kotlinx.dataframe.api.convertTo
910
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
1011
import org.jetbrains.kotlinx.dataframe.api.tryParse
1112
import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator
@@ -66,19 +67,15 @@ internal fun DataFrame.Companion.readDelimImpl(
6667
}
6768
}
6869
val column = DataColumn.createValueColumn(colName, values, typeOf<String>().withNullability(hasNulls))
69-
val skipTypes = when {
70+
when {
7071
colType != null ->
71-
// skip all types except the desired type
72-
ParserOptions.allTypesExcept(colType.toKType())
72+
column.convertTo(
73+
newType = colType.toKType().withNullability(true),
74+
parserOptions = parserOptions ?: ParserOptions(),
75+
)
7376

74-
else ->
75-
// respect the provided parser options
76-
parserOptions?.skipTypes ?: emptySet()
77+
else -> column.tryParse(parserOptions ?: ParserOptions())
7778
}
78-
val adjustsedParserOptions = (parserOptions ?: ParserOptions())
79-
.copy(skipTypes = skipTypes)
80-
81-
return@mapIndexed column.tryParse(adjustsedParserOptions)
8279
}
8380
return cols.toDataFrame()
8481
}

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTests.kt

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import org.jetbrains.kotlinx.dataframe.testCsv
2323
import org.jetbrains.kotlinx.dataframe.testResource
2424
import org.junit.Test
2525
import java.io.File
26+
import java.io.StringReader
2627
import java.io.StringWriter
2728
import java.net.URL
2829
import java.util.Locale
@@ -326,6 +327,36 @@ class CsvTests {
326327
emptyTsvStr shouldBe DataFrame.empty()
327328
}
328329

330+
// Issue #921
331+
@Test
332+
fun `read csv with custom null strings and given type`() {
333+
@Language("CSV")
334+
val csv =
335+
"""
336+
a,b
337+
noppes,2
338+
1.2,
339+
3,45
340+
,noppes
341+
1.3,1
342+
""".trimIndent()
343+
344+
val df = DataFrame.readDelim(
345+
reader = StringReader(csv),
346+
parserOptions = ParserOptions(
347+
nullStrings = setOf("noppes", ""),
348+
),
349+
colTypes = mapOf("a" to ColType.Double, "b" to ColType.Int),
350+
)
351+
df shouldBe dataFrameOf("a", "b")(
352+
null, 2,
353+
1.2, null,
354+
3.0, 45,
355+
null, null,
356+
1.3, 1,
357+
)
358+
}
359+
329360
companion object {
330361
private val simpleCsv = testCsv("testCSV")
331362
private val csvWithFrenchLocale = testCsv("testCSVwithFrenchLocale")

0 commit comments

Comments
 (0)