diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readJson.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readJson.kt
new file mode 100644
index 0000000000..f1053cda81
--- /dev/null
+++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readJson.kt
@@ -0,0 +1,606 @@
+package org.jetbrains.kotlinx.dataframe.impl.io
+
+import com.beust.klaxon.JsonArray
+import com.beust.klaxon.JsonObject
+import org.jetbrains.kotlinx.dataframe.AnyCol
+import org.jetbrains.kotlinx.dataframe.AnyFrame
+import org.jetbrains.kotlinx.dataframe.DataColumn
+import org.jetbrains.kotlinx.dataframe.DataFrame
+import org.jetbrains.kotlinx.dataframe.DataRow
+import org.jetbrains.kotlinx.dataframe.api.JsonPath
+import org.jetbrains.kotlinx.dataframe.api.KeyValueProperty
+import org.jetbrains.kotlinx.dataframe.api.cast
+import org.jetbrains.kotlinx.dataframe.api.columnOf
+import org.jetbrains.kotlinx.dataframe.api.concat
+import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
+import org.jetbrains.kotlinx.dataframe.api.firstOrNull
+import org.jetbrains.kotlinx.dataframe.api.getColumn
+import org.jetbrains.kotlinx.dataframe.api.mapIndexed
+import org.jetbrains.kotlinx.dataframe.api.named
+import org.jetbrains.kotlinx.dataframe.api.schema
+import org.jetbrains.kotlinx.dataframe.api.splitInto
+import org.jetbrains.kotlinx.dataframe.api.toDataFrame
+import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
+import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
+import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator
+import org.jetbrains.kotlinx.dataframe.impl.DataCollectorBase
+import org.jetbrains.kotlinx.dataframe.impl.asList
+import org.jetbrains.kotlinx.dataframe.impl.columns.createColumn
+import org.jetbrains.kotlinx.dataframe.impl.commonType
+import org.jetbrains.kotlinx.dataframe.impl.createDataCollector
+import org.jetbrains.kotlinx.dataframe.impl.guessValueType
+import org.jetbrains.kotlinx.dataframe.impl.schema.DataFrameSchemaImpl
+import org.jetbrains.kotlinx.dataframe.impl.schema.extractSchema
+import org.jetbrains.kotlinx.dataframe.impl.schema.intersectSchemas
+import org.jetbrains.kotlinx.dataframe.impl.splitByIndices
+import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic
+import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ANY_COLUMNS
+import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ARRAY_AND_VALUE_COLUMNS
+import org.jetbrains.kotlinx.dataframe.io.arrayColumnName
+import org.jetbrains.kotlinx.dataframe.io.valueColumnName
+import org.jetbrains.kotlinx.dataframe.ncol
+import org.jetbrains.kotlinx.dataframe.nrow
+import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema
+import org.jetbrains.kotlinx.dataframe.type
+import org.jetbrains.kotlinx.dataframe.typeClass
+import org.jetbrains.kotlinx.dataframe.values
+import kotlin.reflect.KType
+import kotlin.reflect.KTypeProjection
+import kotlin.reflect.full.createType
+import kotlin.reflect.typeOf
+
+private fun DataFrame<*>.unwrapUnnamedColumns() =
+    dataFrameOf(columns().map { it.unwrapUnnamedColumn() })
+
+private fun AnyCol.unwrapUnnamedColumn() = if (this is UnnamedColumn) col else this
+
+private enum class AnyColType {
+    ANY,
+    ARRAYS,
+    OBJECTS,
+}
+
+internal interface AnyKeyValueProperty : KeyValueProperty<Any?> {
+    override val value: Any?
+} + +internal fun readJson( + parsed: Any?, + header: List, + keyValuePaths: List = emptyList(), + typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS, +): DataFrame<*> { + val df: AnyFrame = when (typeClashTactic) { + ARRAY_AND_VALUE_COLUMNS -> { + when (parsed) { + is JsonArray<*> -> fromJsonListArrayAndValueColumns( + records = parsed.value, + header = header, + keyValuePaths = keyValuePaths, + ) + + else -> fromJsonListArrayAndValueColumns( + records = listOf(parsed), + keyValuePaths = keyValuePaths, + ) + } + } + + ANY_COLUMNS -> { + when (parsed) { + is JsonArray<*> -> fromJsonListAnyColumns( + records = parsed.value, + header = header, + keyValuePaths = keyValuePaths, + ) + + else -> fromJsonListAnyColumns( + records = listOf(parsed), + keyValuePaths = keyValuePaths, + ) + } + } + } + return df.unwrapUnnamedColumns() +} + +/** + * Json to DataFrame converter that creates [Any] columns. + * A.k.a. [TypeClashTactic.ANY_COLUMNS]. + * + * @param records List of json elements to be converted to a [DataFrame]. + * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[KeyValueProperty]> + * will be created. + * @param header Optional list of column names. If given, [records] will be read like an object with [header] being the keys. + * @return [DataFrame] from the given [records]. + */ +internal fun fromJsonListAnyColumns( + records: List<*>, + keyValuePaths: List = emptyList(), + header: List = emptyList(), + jsonPath: JsonPath = JsonPath(), +): AnyFrame { + var hasPrimitive = false + var hasArray = false + var hasObject = false + + // list element type can be JsonObject, JsonArray or primitive + val nameGenerator = ColumnNameGenerator() + records.forEach { + when (it) { + is JsonObject -> { + hasObject = true + it.entries.forEach { + nameGenerator.addIfAbsent(it.key) + } + } + + is JsonArray<*> -> hasArray = true + null -> Unit + else -> hasPrimitive = true + } + } + + val colType = when { + hasArray && !hasPrimitive && !hasObject -> AnyColType.ARRAYS + hasObject && !hasPrimitive && !hasArray -> AnyColType.OBJECTS + else -> AnyColType.ANY + } + val justPrimitives = hasPrimitive && !hasArray && !hasObject + val isKeyValue = keyValuePaths.any { jsonPath.matches(it) } + + if (isKeyValue && colType != AnyColType.OBJECTS) { + error("Key value path $jsonPath does not match objects.") + } + + @Suppress("KotlinConstantConditions") + val columns: List = when { + // Create one column of type Any? (or guessed primitive type) from all the records + colType == AnyColType.ANY -> { + val collector: DataCollectorBase = + if (justPrimitives) createDataCollector(records.size) // guess the type + else createDataCollector(records.size, typeOf()) // use Any? 
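Editor's sketch: the two branches of readJson above only differ in which converter they delegate to. A minimal usage sketch of that difference follows; the JSON literal and the expected column shapes are illustrative assumptions, not output copied from the library.

import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.schema
import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic
import org.jetbrains.kotlinx.dataframe.io.readJsonStr

fun main() {
    // Field "a" is a primitive in one record and an object in the other.
    val json = """[{ "a": 1 }, { "a": { "b": 2 } }]"""

    // Default tactic: "a" becomes a column group with a synthetic "value" child
    // (and an "array" child if arrays were also involved) next to "b".
    val arrayAndValue = DataFrame.readJsonStr(json, typeClashTactic = TypeClashTactic.ARRAY_AND_VALUE_COLUMNS)
    println(arrayAndValue.schema())

    // ANY_COLUMNS: "a" stays a single column typed as Any, holding 1 and a nested row.
    val anyColumns = DataFrame.readJsonStr(json, typeClashTactic = TypeClashTactic.ANY_COLUMNS)
    println(anyColumns.schema())
}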
+ + val nanIndices = mutableListOf() + records.forEachIndexed { i, v -> + when (v) { + is JsonObject -> { + val parsed = + fromJsonListAnyColumns( + records = listOf(v), + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.replaceLastWildcardWithIndex(i), + ) + collector.add( + if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.first() + else parsed.firstOrNull() ?: DataRow.empty + ) + } + + is JsonArray<*> -> { + val parsed = fromJsonListAnyColumns( + records = v, + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.replaceLastWildcardWithIndex(i).appendArrayWithWildcard(), + ) + collector.add( + if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.asList() + else parsed.unwrapUnnamedColumns() + ) + } + + "NaN" -> { + nanIndices.add(i) + collector.add(null) + } + + else -> collector.add(v) + } + } + val column = collector.toColumn(valueColumnName) + val res = if (nanIndices.isNotEmpty()) { + fun DataColumn.updateNaNs(nanValue: C): DataColumn { + var j = 0 + var nextNanIndex = nanIndices[j] + return mapIndexed(column.type) { i, v -> + if (i == nextNanIndex) { + j++ + nextNanIndex = if (j < nanIndices.size) nanIndices[j] else -1 + nanValue + } else v + } + } + when (column.typeClass) { + Double::class -> column.cast().updateNaNs(Double.NaN) + Float::class -> column.cast().updateNaNs(Float.NaN) + String::class -> column.cast().updateNaNs("NaN") + else -> column + } + } else column + listOf(UnnamedColumn(res)) + } + + // Create one column of type FrameColumn, or List<> from all the records if they are all arrays + colType == AnyColType.ARRAYS -> { + val values = mutableListOf() + val startIndices = ArrayList() + records.forEach { + startIndices.add(values.size) + when (it) { + is JsonArray<*> -> values.addAll(it.value) + null -> Unit + else -> error("Expected JsonArray, got $it") + } + } + val parsed = fromJsonListAnyColumns( + records = values, + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.appendArrayWithWildcard(), + ) + + val res = when { + parsed.isSingleUnnamedColumn() -> { + val col = (parsed.getColumn(0) as UnnamedColumn).col + val elementType = col.type + val values = col.values.asList().splitByIndices(startIndices.asSequence()).toList() + DataColumn.createValueColumn( + name = arrayColumnName, + values = values, + type = List::class.createType(listOf(KTypeProjection.invariant(elementType))), + ) + } + + else -> DataColumn.createFrameColumn( + name = arrayColumnName, // will be erased + df = parsed.unwrapUnnamedColumns(), + startIndices = startIndices, + ) + } + listOf(UnnamedColumn(res)) + } + + // Create one column of type FrameColumn + colType == AnyColType.OBJECTS && isKeyValue -> { + // collect the value types to make sure Value columns with lists and other values aren't all turned into lists + val valueTypes = mutableSetOf() + val dataFrames = records.map { + when (it) { + is JsonObject -> { + val map = it.map.mapValues { (key, value) -> + val parsed = fromJsonListAnyColumns( + records = listOf(value), + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.append(key), + ) + if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.first() + else parsed.unwrapUnnamedColumns().firstOrNull() + } + val valueType = map.values.map { + guessValueType(sequenceOf(it)) + }.commonType() + + valueTypes += valueType + + dataFrameOf( + columnOf(*map.keys.toTypedArray()).named(KeyValueProperty<*>::key.name), + createColumn(values = map.values, suggestedType = valueType, guessType = 
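Editor's note on the "NaN" branch above: klaxon has no NaN literal, so NaN reaches the reader as the string "NaN"; it is collected as null first (so it does not skew type guessing) and patched back once the column type is known. A hedged sketch of the intended effect, with expected results as an assumption:

import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.io.readJsonStr

fun main() {
    val df = DataFrame.readJsonStr("""[{ "x": 1.0 }, { "x": "NaN" }, { "x": 2.5 }]""")
    // "x" is expected to come out as a Double column whose middle value is Double.NaN,
    // not as a String column and not with a null left in the middle.
    println(df["x"].type())
    println(df["x"][1])
}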
false) + .named(KeyValueProperty<*>::value.name), + ) + } + + null -> DataFrame.emptyOf() + else -> error("Expected JsonObject, got $it") + } + } + + val valueColumns = dataFrames.map { it[KeyValueProperty<*>::value.name] } + val valueColumnSchema = when { + // in these cases we can safely combine the columns to get a single column schema + valueColumns.all { it is ColumnGroup<*> } || valueColumns.all { it is FrameColumn<*> } -> + valueColumns.concat().extractSchema() + // to avoid listification, we create the value columns schema ourselves (https://github.com/Kotlin/dataframe/issues/184) + else -> ColumnSchema.Value(valueTypes.commonType()) + } + + listOf( + UnnamedColumn( + DataColumn.createFrameColumn( + name = valueColumnName, // will be erased unless at top-level + groups = dataFrames, + schema = lazy { + DataFrameSchemaImpl( + columns = mapOf( + KeyValueProperty<*>::key.name to ColumnSchema.Value(typeOf()), + KeyValueProperty<*>::value.name to valueColumnSchema, + ) + ) + }, + ) + ) + ) + } + + // Create multiple columns from all the records if they are all objects, merging the objects in essence + colType == AnyColType.OBJECTS && !isKeyValue -> { + nameGenerator.names.map { colName -> + val values = ArrayList(records.size) + + records.forEach { + when (it) { + is JsonObject -> values.add(it[colName]) + null -> values.add(null) + else -> error("Expected JsonObject, got $it") + } + } + + val parsed = fromJsonListAnyColumns( + records = values, + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.append(colName), + ) + when { + parsed.ncol == 0 -> + DataColumn.createValueColumn( + name = colName, + values = arrayOfNulls(values.size).toList(), + type = typeOf(), + ) + + parsed.isSingleUnnamedColumn() -> + (parsed.getColumn(0) as UnnamedColumn).col.rename(colName) + + else -> + DataColumn.createColumnGroup(colName, parsed.unwrapUnnamedColumns()) as AnyCol + } + } + } + + else -> error("") + } + + return when { + columns.isEmpty() -> DataFrame.empty(records.size) + + columns.size == 1 && hasArray && header.isNotEmpty() && columns[0].typeClass == List::class -> + columns[0] + .cast>() + .splitInto(*header.toTypedArray()) + + else -> columns.toDataFrame() + } +} + +private fun AnyFrame.isSingleUnnamedColumn() = ncol == 1 && getColumn(0) is UnnamedColumn + +/** + * Json to DataFrame converter that creates allows creates `value` and `array` accessors + * instead of [Any] columns. + * A.k.a. [TypeClashTactic.ARRAY_AND_VALUE_COLUMNS]. + * + * @param records List of json elements to be converted to a [DataFrame]. + * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[KeyValueProperty]> + * will be created. + * @param header Optional list of column names. If given, [records] will be read like an object with [header] being the keys. + * @return [DataFrame] from the given [records]. + */ +internal fun fromJsonListArrayAndValueColumns( + records: List<*>, + keyValuePaths: List = emptyList(), + header: List = emptyList(), + jsonPath: JsonPath = JsonPath(), +): AnyFrame { + var hasPrimitive = false + var hasArray = false + val isKeyValue = keyValuePaths.any { jsonPath.matches(it) } + + // list element type can be JsonObject, JsonArray or primitive + // So first, we gather all properties of objects to merge including "array" and "value" if needed + // so the resulting type of a property with instances 123, ["abc"], and { "a": 1, "b": 2 } will be + // { array: List, value: Int?, a: Int?, b: Int? 
} + // and instances will look like + // { "array": [], "value": 123, "a": null, "b": null } + + val nameGenerator = ColumnNameGenerator() + records.forEach { + when (it) { + is JsonObject -> it.entries.forEach { + nameGenerator.addIfAbsent(it.key) + } + + is JsonArray<*> -> hasArray = true + null -> Unit + else -> hasPrimitive = true + } + } + if (records.all { it == null }) hasPrimitive = true + + // Add a value column to the collected names if needed + val valueColumn = if (hasPrimitive || records.isEmpty()) { + nameGenerator.addUnique(valueColumnName) + } else null + + // Add an array column to the collected names if needed + val arrayColumn = if (hasArray) { + nameGenerator.addUnique(arrayColumnName) + } else null + + // only properties that consist of just objects (or are empty) can be merged to key/value FrameColumns + if (isKeyValue && (hasPrimitive || hasArray)) { + error("Key value path $jsonPath does not match objects.") + } + + // Create columns from the collected names + val columns: List = when { + // instead of using the names, generate a single key/value frame column + isKeyValue -> { + val dataFrames = records.map { + when (it) { + is JsonObject -> { + val map = it.map.mapValues { (key, value) -> + val parsed = fromJsonListArrayAndValueColumns( + records = listOf(value), + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.append(key), + ) + if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.first() + else parsed.unwrapUnnamedColumns().firstOrNull() + } + val valueType = + map.values.map { guessValueType(sequenceOf(it)) } + .commonType() + + dataFrameOf( + columnOf(*map.keys.toTypedArray()).named(KeyValueProperty<*>::key.name), + createColumn( + values = map.values, + suggestedType = valueType, + guessType = false, + ).named(KeyValueProperty<*>::value.name), + ) + } + + null -> DataFrame.emptyOf() + else -> error("Expected JsonObject, got $it") + } + } + + listOf( + UnnamedColumn( + DataColumn.createFrameColumn( + name = valueColumnName, // will be erased unless at top-level + groups = dataFrames, + schema = lazy { + dataFrames.mapNotNull { it.takeIf { it.nrow > 0 }?.schema() }.intersectSchemas() + }, + ) + ) + ) + } + + // generate columns using the collected names + else -> + nameGenerator.names.map { colName -> + when { + // Collect primitive values from records into the `value` column if needed + colName == valueColumn && (hasPrimitive || records.isEmpty()) -> { + val collector = createDataCollector(records.size) + val nanIndices = mutableListOf() + records.forEachIndexed { i, v -> + when (v) { + is JsonObject -> collector.add(null) + is JsonArray<*> -> collector.add(null) + "NaN" -> { + nanIndices.add(i) + collector.add(null) + } + + else -> collector.add(v) + } + } + val column = collector.toColumn(colName) + val res = if (nanIndices.isNotEmpty()) { + fun DataColumn.updateNaNs(nanValue: C): DataColumn { + var j = 0 + var nextNanIndex = nanIndices[j] + return mapIndexed(column.type) { i, v -> + if (i == nextNanIndex) { + j++ + nextNanIndex = if (j < nanIndices.size) nanIndices[j] else -1 + nanValue + } else v + } + } + when (column.typeClass) { + Double::class -> column.cast().updateNaNs(Double.NaN) + Float::class -> column.cast().updateNaNs(Float.NaN) + String::class -> column.cast().updateNaNs("NaN") + else -> column + } + } else column + UnnamedColumn(res) + } + + // Collect arrays from records into the `array` column if needed + colName == arrayColumn && hasArray -> { + val values = mutableListOf() + val startIndices = 
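Editor's sketch of the property merge described in the comment above ("array" and "value" plus the object's own keys). The column names come from the arrayColumnName/valueColumnName constants introduced elsewhere in this diff; the schema spelled out in the comment is an assumption about what the converter produces for this input.

import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.schema
import org.jetbrains.kotlinx.dataframe.io.readJsonStr

fun main() {
    // "p" is a primitive, an array and an object across the three records.
    val json = """[{ "p": 123 }, { "p": ["abc"] }, { "p": { "a": 1, "b": 2 } }]"""
    val df = DataFrame.readJsonStr(json) // default ARRAY_AND_VALUE_COLUMNS

    // Expected (roughly): "p" is a column group with children
    //   value: Int?, array: List<String>, a: Int?, b: Int?
    // where "value" is set only in row 0 and "array" is non-empty only in row 1.
    println(df.schema())
}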
ArrayList() + records.forEach { + startIndices.add(values.size) + if (it is JsonArray<*>) values.addAll(it.value) + } + val parsed = fromJsonListArrayAndValueColumns( + records = values, + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.appendArrayWithWildcard(), + ) + + val res = when { + parsed.isSingleUnnamedColumn() -> { + val col = (parsed.getColumn(0) as UnnamedColumn).col + val elementType = col.type + val values = col.values.asList().splitByIndices(startIndices.asSequence()).toList() + DataColumn.createValueColumn( + name = colName, + values = values, + type = List::class.createType(listOf(KTypeProjection.invariant(elementType))), + ) + } + + else -> DataColumn.createFrameColumn(colName, parsed.unwrapUnnamedColumns(), startIndices) + } + UnnamedColumn(res) + } + + // Collect the current column name as property from the objects in records + else -> { + val values = ArrayList(records.size) + records.forEach { + when (it) { + is JsonObject -> values.add(it[colName]) + else -> values.add(null) + } + } + + val parsed = fromJsonListArrayAndValueColumns( + records = values, + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.append(colName), + ) + when { + parsed.ncol == 0 -> + DataColumn.createValueColumn( + name = colName, + values = arrayOfNulls(values.size).toList(), + type = typeOf(), + ) + + parsed.isSingleUnnamedColumn() -> + (parsed.getColumn(0) as UnnamedColumn).col.rename(colName) + + else -> + DataColumn.createColumnGroup(colName, parsed.unwrapUnnamedColumns()) as AnyCol + } + } + } + } + } + + return when { + columns.isEmpty() -> + DataFrame.empty(records.size) + + columns.size == 1 && hasArray && header.isNotEmpty() && columns[0].typeClass == List::class -> + columns[0] + .cast>() + .splitInto(*header.toTypedArray()) + + else -> + columns.toDataFrame() + } +} + +// we need it to check if AnyFrame created by recursive call has single unnamed column, +// unnamed column means this column is not created from field of a record [{"value": 1}, {"value": 2}], +// but filtered values [1, { ... 
}, []] -> [1, null, null] +// or arrays: [1, { ...}, []] -> [null, null, []] +private class UnnamedColumn(val col: DataColumn) : DataColumn by col diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt new file mode 100644 index 0000000000..63cfcb03f0 --- /dev/null +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt @@ -0,0 +1,212 @@ +package org.jetbrains.kotlinx.dataframe.impl.io + +import com.beust.klaxon.JsonArray +import com.beust.klaxon.JsonObject +import com.beust.klaxon.KlaxonJson +import org.jetbrains.kotlinx.dataframe.AnyCol +import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.ColumnsContainer +import org.jetbrains.kotlinx.dataframe.DataColumn +import org.jetbrains.kotlinx.dataframe.api.indices +import org.jetbrains.kotlinx.dataframe.api.isList +import org.jetbrains.kotlinx.dataframe.api.name +import org.jetbrains.kotlinx.dataframe.api.rows +import org.jetbrains.kotlinx.dataframe.api.take +import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup +import org.jetbrains.kotlinx.dataframe.columns.ColumnKind +import org.jetbrains.kotlinx.dataframe.columns.FrameColumn +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.COLUMNS +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.DATA +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KIND +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAME +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.METADATA +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.VERSION +import org.jetbrains.kotlinx.dataframe.io.arrayColumnName +import org.jetbrains.kotlinx.dataframe.io.valueColumnName +import org.jetbrains.kotlinx.dataframe.ncol +import org.jetbrains.kotlinx.dataframe.nrow +import org.jetbrains.kotlinx.dataframe.typeClass + +internal fun KlaxonJson.encodeRow(frame: ColumnsContainer<*>, index: Int): JsonObject? { + val values = frame.columns().map { col -> + when (col) { + is ColumnGroup<*> -> encodeRow(col, index) + is FrameColumn<*> -> encodeFrame(col[index]) + else -> encodeValue(col, index) + }.let { col.name to it } + } + if (values.isEmpty()) return null + return obj(values) +} + +internal object SerializationKeys { + const val DATA = "data" + const val METADATA = "metadata" + const val KIND = "kind" + const val NCOL = "ncol" + const val NROW = "nrow" + const val VERSION = "\$version" + const val COLUMNS = "columns" + const val KOTLIN_DATAFRAME = "kotlin_dataframe" +} + +internal const val SERIALIZATION_VERSION = "2.0.0" + +internal fun KlaxonJson.encodeRowWithMetadata( + frame: ColumnsContainer<*>, + index: Int, + rowLimit: Int? = null +): JsonObject? 
{ + val values = frame.columns().map { col -> + when (col) { + is ColumnGroup<*> -> obj( + DATA to encodeRowWithMetadata(col, index, rowLimit), + METADATA to obj(KIND to ColumnKind.Group.toString()) + ) + + is FrameColumn<*> -> { + val data = if (rowLimit == null) encodeFrameWithMetadata(col[index]) + else encodeFrameWithMetadata(col[index].take(rowLimit), rowLimit) + obj( + DATA to data, + METADATA to obj( + KIND to ColumnKind.Frame.toString(), + NCOL to col[index].ncol, + NROW to col[index].nrow + ) + ) + } + + else -> encodeValue(col, index) + }.let { col.name to it } + } + if (values.isEmpty()) return null + return obj(values) +} + +private val valueTypes = + setOf(Boolean::class, Double::class, Int::class, Float::class, Long::class, Short::class, Byte::class) + +internal fun KlaxonJson.encodeValue(col: AnyCol, index: Int): Any? = when { + col.isList() -> col[index]?.let { array(it as List<*>) } ?: array() + col.typeClass in valueTypes -> { + val v = col[index] + if ((v is Double && v.isNaN()) || (v is Float && v.isNaN())) { + v.toString() + } else v + } + + else -> col[index]?.toString() +} + +internal fun KlaxonJson.encodeFrameWithMetadata(frame: AnyFrame, rowLimit: Int? = null): JsonArray<*> { + val valueColumn = frame.extractValueColumn() + val arrayColumn = frame.extractArrayColumn() + + val arraysAreFrames = arrayColumn?.kind() == ColumnKind.Frame + + val data = frame.indices().map { rowIndex -> + valueColumn + ?.get(rowIndex) + ?: arrayColumn?.get(rowIndex) + ?.let { + if (arraysAreFrames) encodeFrameWithMetadata(it as AnyFrame, rowLimit) else null + } + ?: encodeRowWithMetadata(frame, rowIndex, rowLimit) + } + + return array(data) +} + +internal fun AnyFrame.extractValueColumn(): DataColumn<*>? { + val allColumns = columns() + + return allColumns.filter { it.name.startsWith(valueColumnName) } + .takeIf { isPossibleToFindUnnamedColumns } + ?.maxByOrNull { it.name }?.let { valueCol -> + if (valueCol.kind() != ColumnKind.Value) { // check that value in this column is not null only when other values are null + null + } else { + // check that value in this column is not null only when other values are null + val isValidValueColumn = rows().all { row -> + if (valueCol[row] != null) { + allColumns.all { col -> + if (col.name != valueCol.name) col[row] == null + else true + } + } else true + } + if (isValidValueColumn) valueCol + else null + } + } +} + +// if there is only 1 column, then `isValidValueColumn` always true. +// But at the same time, we shouldn't treat dataFrameOf("value")(1,2,3) like unnamed column +// because it was created by user. +internal val AnyFrame.isPossibleToFindUnnamedColumns: Boolean + get() = columns().size != 1 + +internal fun AnyFrame.extractArrayColumn(): DataColumn<*>? 
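Editor's sketch: extractValueColumn (and extractArrayColumn below) exist so that the synthetic columns created on read collapse back to plain JSON values on write. The commented output is an expectation under that reading of the code, not captured output.

import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.io.readJsonStr
import org.jetbrains.kotlinx.dataframe.io.toJson

fun main() {
    // Reading a top-level mix of a bare primitive and an object yields
    // a synthetic "value" column next to the real "a" column.
    val df = DataFrame.readJsonStr("""[123, { "a": 1 }]""")

    // "value" is non-null only where every other column is null, so it is treated
    // as synthetic and row 0 is written back as the bare 123 instead of an object.
    println(df.toJson()) // expected to resemble: [123,{"value":null,"a":1}]
}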
{ + val allColumns = columns() + + return columns().filter { it.name.startsWith(arrayColumnName) } + .takeIf { isPossibleToFindUnnamedColumns } + ?.maxByOrNull { it.name }?.let { arrayCol -> + if (arrayCol.kind() == ColumnKind.Group) null + else { + // check that value in this column is not null only when other values are null + val isValidArrayColumn = rows().all { row -> + if (arrayCol[row] != null) { + allColumns.all { col -> + if (col.name != arrayCol.name) col[row] == null + else true + } + } else true + } + if (isValidArrayColumn) arrayCol + else null + } + } +} + +internal fun KlaxonJson.encodeFrame(frame: AnyFrame): JsonArray<*> { + val valueColumn = frame.extractValueColumn() + val arrayColumn = frame.extractArrayColumn() + + val arraysAreFrames = arrayColumn?.kind() == ColumnKind.Frame + + val data = frame.indices().map { rowIndex -> + valueColumn + ?.get(rowIndex) + ?: arrayColumn?.get(rowIndex) + ?.let { + if (arraysAreFrames) encodeFrame(it as AnyFrame) else null + } + ?: encodeRow(frame, rowIndex) + } + + return array(data) +} + +internal fun KlaxonJson.encodeDataFrameWithMetadata( + frame: AnyFrame, + rowLimit: Int, + nestedRowLimit: Int? = null, +): JsonObject { + return obj( + VERSION to SERIALIZATION_VERSION, + METADATA to obj( + COLUMNS to frame.columnNames(), + NROW to frame.rowsCount(), + NCOL to frame.columnsCount() + ), + KOTLIN_DATAFRAME to encodeFrameWithMetadata( + frame.take(rowLimit), + rowLimit = nestedRowLimit + ), + ) +} diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt index b464ccdb5e..e648458f90 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt @@ -1,66 +1,28 @@ package org.jetbrains.kotlinx.dataframe.io -import com.beust.klaxon.JsonArray -import com.beust.klaxon.JsonObject -import com.beust.klaxon.KlaxonJson import com.beust.klaxon.Parser import com.beust.klaxon.json -import org.jetbrains.kotlinx.dataframe.AnyCol import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.AnyRow -import org.jetbrains.kotlinx.dataframe.ColumnsContainer -import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.api.JsonPath import org.jetbrains.kotlinx.dataframe.api.KeyValueProperty -import org.jetbrains.kotlinx.dataframe.api.cast -import org.jetbrains.kotlinx.dataframe.api.columnOf -import org.jetbrains.kotlinx.dataframe.api.concat -import org.jetbrains.kotlinx.dataframe.api.dataFrameOf -import org.jetbrains.kotlinx.dataframe.api.firstOrNull -import org.jetbrains.kotlinx.dataframe.api.getColumn -import org.jetbrains.kotlinx.dataframe.api.indices -import org.jetbrains.kotlinx.dataframe.api.isList -import org.jetbrains.kotlinx.dataframe.api.mapIndexed -import org.jetbrains.kotlinx.dataframe.api.name -import org.jetbrains.kotlinx.dataframe.api.named -import org.jetbrains.kotlinx.dataframe.api.rows -import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.api.single -import org.jetbrains.kotlinx.dataframe.api.splitInto -import org.jetbrains.kotlinx.dataframe.api.toDataFrame import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadJsonMethod import 
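Editor's sketch of the output shape fixed by encodeDataFrameWithMetadata above, as exposed through toJsonWithMetadata later in this diff. Key names follow the SerializationKeys constants; the exact output text and formatting are an assumption.

import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.io.toJsonWithMetadata

fun main() {
    val df = dataFrameOf("id", "name")(1, "a", 2, "b")
    val json = df.toJsonWithMetadata(rowLimit = df.rowsCount(), prettyPrint = true)

    // Expected structure (abridged):
    // {
    //   "$version": "2.0.0",
    //   "metadata": { "columns": ["id", "name"], "nrow": 2, "ncol": 2 },
    //   "kotlin_dataframe": [ { "id": 1, "name": "a" }, { "id": 2, "name": "b" } ]
    // }
    println(json)
}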
org.jetbrains.kotlinx.dataframe.columns.ColumnGroup -import org.jetbrains.kotlinx.dataframe.columns.ColumnKind import org.jetbrains.kotlinx.dataframe.columns.FrameColumn -import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator -import org.jetbrains.kotlinx.dataframe.impl.DataCollectorBase -import org.jetbrains.kotlinx.dataframe.impl.asList -import org.jetbrains.kotlinx.dataframe.impl.columns.createColumn -import org.jetbrains.kotlinx.dataframe.impl.commonType -import org.jetbrains.kotlinx.dataframe.impl.createDataCollector -import org.jetbrains.kotlinx.dataframe.impl.guessValueType -import org.jetbrains.kotlinx.dataframe.impl.schema.DataFrameSchemaImpl -import org.jetbrains.kotlinx.dataframe.impl.schema.extractSchema -import org.jetbrains.kotlinx.dataframe.impl.schema.intersectSchemas -import org.jetbrains.kotlinx.dataframe.impl.splitByIndices +import org.jetbrains.kotlinx.dataframe.impl.io.encodeDataFrameWithMetadata +import org.jetbrains.kotlinx.dataframe.impl.io.encodeFrame +import org.jetbrains.kotlinx.dataframe.impl.io.encodeRow +import org.jetbrains.kotlinx.dataframe.impl.io.readJson import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ANY_COLUMNS import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ARRAY_AND_VALUE_COLUMNS -import org.jetbrains.kotlinx.dataframe.ncol -import org.jetbrains.kotlinx.dataframe.nrow -import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema -import org.jetbrains.kotlinx.dataframe.type -import org.jetbrains.kotlinx.dataframe.typeClass -import org.jetbrains.kotlinx.dataframe.values import java.io.File import java.io.InputStream import java.net.URL -import kotlin.reflect.KType -import kotlin.reflect.KTypeProjection -import kotlin.reflect.full.createType import kotlin.reflect.typeOf public class JSON( @@ -145,6 +107,9 @@ public class JSON( } } +public const val arrayColumnName: String = "array" +public const val valueColumnName: String = "value" + /** * @param file Where to fetch the Json as [InputStream] to be converted to a [DataFrame]. 
* @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[KeyValueProperty]> @@ -295,649 +260,33 @@ public fun DataRow.Companion.readJsonStr( typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS, ): AnyRow = DataFrame.readJsonStr(text, header, keyValuePaths, typeClashTactic).single() -private fun readJson( - parsed: Any?, - header: List, - keyValuePaths: List = emptyList(), - typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS, -): DataFrame<*> { - val df: AnyFrame = when (typeClashTactic) { - ARRAY_AND_VALUE_COLUMNS -> { - when (parsed) { - is JsonArray<*> -> fromJsonListArrayAndValueColumns( - records = parsed.value, - header = header, - keyValuePaths = keyValuePaths, - ) - - else -> fromJsonListArrayAndValueColumns( - records = listOf(parsed), - keyValuePaths = keyValuePaths, - ) - } - } - - ANY_COLUMNS -> { - when (parsed) { - is JsonArray<*> -> fromJsonListAnyColumns( - records = parsed.value, - header = header, - keyValuePaths = keyValuePaths, - ) - - else -> fromJsonListAnyColumns( - records = listOf(parsed), - keyValuePaths = keyValuePaths, - ) - } - } - } - return df.unwrapUnnamedColumns() -} - -private fun DataFrame.unwrapUnnamedColumns() = - dataFrameOf(columns().map { it.unwrapUnnamedColumn() }) - -private fun AnyCol.unwrapUnnamedColumn() = if (this is UnnamedColumn) col else this - -private enum class AnyColType { - ANY, - ARRAYS, - OBJECTS, -} - -internal interface AnyKeyValueProperty : KeyValueProperty { - override val value: Any? +public fun AnyFrame.toJson(prettyPrint: Boolean = false, canonical: Boolean = false): String { + return json { + encodeFrame(this@toJson) + }.toJsonString(prettyPrint, canonical) } /** - * Json to DataFrame converter that creates [Any] columns. - * A.k.a. [TypeClashTactic.ANY_COLUMNS]. + * Converts the DataFrame to a JSON string representation with additional metadata about serialized data. + * It is heavily used to implement some integration features in Kotlin Notebook IntellJ IDEA plugin. * - * @param records List of json elements to be converted to a [DataFrame]. - * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[KeyValueProperty]> - * will be created. - * @param header Optional list of column names. If given, [records] will be read like an object with [header] being the keys. - * @return [DataFrame] from the given [records]. - */ -internal fun fromJsonListAnyColumns( - records: List<*>, - keyValuePaths: List = emptyList(), - header: List = emptyList(), - jsonPath: JsonPath = JsonPath(), -): AnyFrame { - var hasPrimitive = false - var hasArray = false - var hasObject = false - - // list element type can be JsonObject, JsonArray or primitive - val nameGenerator = ColumnNameGenerator() - records.forEach { - when (it) { - is JsonObject -> { - hasObject = true - it.entries.forEach { - nameGenerator.addIfAbsent(it.key) - } - } - - is JsonArray<*> -> hasArray = true - null -> Unit - else -> hasPrimitive = true - } - } - - val colType = when { - hasArray && !hasPrimitive && !hasObject -> AnyColType.ARRAYS - hasObject && !hasPrimitive && !hasArray -> AnyColType.OBJECTS - else -> AnyColType.ANY - } - val justPrimitives = hasPrimitive && !hasArray && !hasObject - val isKeyValue = keyValuePaths.any { jsonPath.matches(it) } - - if (isKeyValue && colType != AnyColType.OBJECTS) { - error("Key value path $jsonPath does not match objects.") - } - - @Suppress("KotlinConstantConditions") - val columns: List = when { - // Create one column of type Any? 
(or guessed primitive type) from all the records - colType == AnyColType.ANY -> { - val collector: DataCollectorBase = - if (justPrimitives) createDataCollector(records.size) // guess the type - else createDataCollector(records.size, typeOf()) // use Any? - - val nanIndices = mutableListOf() - records.forEachIndexed { i, v -> - when (v) { - is JsonObject -> { - val parsed = - fromJsonListAnyColumns( - records = listOf(v), - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.replaceLastWildcardWithIndex(i), - ) - collector.add( - if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.first() - else parsed.firstOrNull() ?: DataRow.empty - ) - } - - is JsonArray<*> -> { - val parsed = fromJsonListAnyColumns( - records = v, - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.replaceLastWildcardWithIndex(i).appendArrayWithWildcard(), - ) - collector.add( - if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.asList() - else parsed.unwrapUnnamedColumns() - ) - } - - "NaN" -> { - nanIndices.add(i) - collector.add(null) - } - - else -> collector.add(v) - } - } - val column = collector.toColumn(valueColumnName) - val res = if (nanIndices.isNotEmpty()) { - fun DataColumn.updateNaNs(nanValue: C): DataColumn { - var j = 0 - var nextNanIndex = nanIndices[j] - return mapIndexed(column.type) { i, v -> - if (i == nextNanIndex) { - j++ - nextNanIndex = if (j < nanIndices.size) nanIndices[j] else -1 - nanValue - } else v - } - } - when (column.typeClass) { - Double::class -> column.cast().updateNaNs(Double.NaN) - Float::class -> column.cast().updateNaNs(Float.NaN) - String::class -> column.cast().updateNaNs("NaN") - else -> column - } - } else column - listOf(UnnamedColumn(res)) - } - - // Create one column of type FrameColumn, or List<> from all the records if they are all arrays - colType == AnyColType.ARRAYS -> { - val values = mutableListOf() - val startIndices = ArrayList() - records.forEach { - startIndices.add(values.size) - when (it) { - is JsonArray<*> -> values.addAll(it.value) - null -> Unit - else -> error("Expected JsonArray, got $it") - } - } - val parsed = fromJsonListAnyColumns( - records = values, - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.appendArrayWithWildcard(), - ) - - val res = when { - parsed.isSingleUnnamedColumn() -> { - val col = (parsed.getColumn(0) as UnnamedColumn).col - val elementType = col.type - val values = col.values.asList().splitByIndices(startIndices.asSequence()).toList() - DataColumn.createValueColumn( - name = arrayColumnName, - values = values, - type = List::class.createType(listOf(KTypeProjection.invariant(elementType))), - ) - } - - else -> DataColumn.createFrameColumn( - name = arrayColumnName, // will be erased - df = parsed.unwrapUnnamedColumns(), - startIndices = startIndices, - ) - } - listOf(UnnamedColumn(res)) - } - - // Create one column of type FrameColumn - colType == AnyColType.OBJECTS && isKeyValue -> { - // collect the value types to make sure Value columns with lists and other values aren't all turned into lists - val valueTypes = mutableSetOf() - val dataFrames = records.map { - when (it) { - is JsonObject -> { - val map = it.map.mapValues { (key, value) -> - val parsed = fromJsonListAnyColumns( - records = listOf(value), - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.append(key), - ) - if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.first() - else parsed.unwrapUnnamedColumns().firstOrNull() - } - val valueType = 
map.values.map { - guessValueType(sequenceOf(it)) - }.commonType() - - valueTypes += valueType - - dataFrameOf( - columnOf(*map.keys.toTypedArray()).named(KeyValueProperty<*>::key.name), - createColumn(values = map.values, suggestedType = valueType, guessType = false) - .named(KeyValueProperty<*>::value.name), - ) - } - - null -> DataFrame.emptyOf() - else -> error("Expected JsonObject, got $it") - } - } - - val valueColumns = dataFrames.map { it[KeyValueProperty<*>::value.name] } - val valueColumnSchema = when { - // in these cases we can safely combine the columns to get a single column schema - valueColumns.all { it is ColumnGroup<*> } || valueColumns.all { it is FrameColumn<*> } -> - valueColumns.concat().extractSchema() - // to avoid listification, we create the value columns schema ourselves (https://github.com/Kotlin/dataframe/issues/184) - else -> ColumnSchema.Value(valueTypes.commonType()) - } - - listOf( - UnnamedColumn( - DataColumn.createFrameColumn( - name = valueColumnName, // will be erased unless at top-level - groups = dataFrames, - schema = lazy { - DataFrameSchemaImpl( - columns = mapOf( - KeyValueProperty<*>::key.name to ColumnSchema.Value(typeOf()), - KeyValueProperty<*>::value.name to valueColumnSchema, - ) - ) - }, - ) - ) - ) - } - - // Create multiple columns from all the records if they are all objects, merging the objects in essence - colType == AnyColType.OBJECTS && !isKeyValue -> { - nameGenerator.names.map { colName -> - val values = ArrayList(records.size) - - records.forEach { - when (it) { - is JsonObject -> values.add(it[colName]) - null -> values.add(null) - else -> error("Expected JsonObject, got $it") - } - } - - val parsed = fromJsonListAnyColumns( - records = values, - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.append(colName), - ) - when { - parsed.ncol == 0 -> - DataColumn.createValueColumn( - name = colName, - values = arrayOfNulls(values.size).toList(), - type = typeOf(), - ) - - parsed.isSingleUnnamedColumn() -> - (parsed.getColumn(0) as UnnamedColumn).col.rename(colName) - - else -> - DataColumn.createColumnGroup(colName, parsed.unwrapUnnamedColumns()) as AnyCol - } - } - } - - else -> error("") - } - - return when { - columns.isEmpty() -> DataFrame.empty(records.size) - - columns.size == 1 && hasArray && header.isNotEmpty() && columns[0].typeClass == List::class -> - columns[0] - .cast>() - .splitInto(*header.toTypedArray()) - - else -> columns.toDataFrame() - } -} - -public const val arrayColumnName: String = "array" -public const val valueColumnName: String = "value" - -private fun AnyFrame.isSingleUnnamedColumn() = ncol == 1 && getColumn(0) is UnnamedColumn - -/** - * Json to DataFrame converter that creates allows creates `value` and `array` accessors - * instead of [Any] columns. - * A.k.a. [TypeClashTactic.ARRAY_AND_VALUE_COLUMNS]. + * @param rowLimit The maximum number of top-level dataframe rows to include in the output JSON. + * @param nestedRowLimit The maximum number of nested frame rows to include in the output JSON. + * If null, all rows are included. + * Applied for each frame column recursively + * @param prettyPrint Specifies whether the output JSON should be formatted with indentation and line breaks. + * @param canonical Specifies whether the output JSON should be in a canonical form. * - * @param records List of json elements to be converted to a [DataFrame]. - * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[KeyValueProperty]> - * will be created. 
- * @param header Optional list of column names. If given, [records] will be read like an object with [header] being the keys. - * @return [DataFrame] from the given [records]. + * @return The DataFrame converted to a JSON string with metadata. */ -internal fun fromJsonListArrayAndValueColumns( - records: List<*>, - keyValuePaths: List = emptyList(), - header: List = emptyList(), - jsonPath: JsonPath = JsonPath(), -): AnyFrame { - var hasPrimitive = false - var hasArray = false - val isKeyValue = keyValuePaths.any { jsonPath.matches(it) } - - // list element type can be JsonObject, JsonArray or primitive - // So first, we gather all properties of objects to merge including "array" and "value" if needed - // so the resulting type of a property with instances 123, ["abc"], and { "a": 1, "b": 2 } will be - // { array: List, value: Int?, a: Int?, b: Int? } - // and instances will look like - // { "array": [], "value": 123, "a": null, "b": null } - - val nameGenerator = ColumnNameGenerator() - records.forEach { - when (it) { - is JsonObject -> it.entries.forEach { - nameGenerator.addIfAbsent(it.key) - } - - is JsonArray<*> -> hasArray = true - null -> Unit - else -> hasPrimitive = true - } - } - if (records.all { it == null }) hasPrimitive = true - - // Add a value column to the collected names if needed - val valueColumn = if (hasPrimitive || records.isEmpty()) { - nameGenerator.addUnique(valueColumnName) - } else null - - // Add an array column to the collected names if needed - val arrayColumn = if (hasArray) { - nameGenerator.addUnique(arrayColumnName) - } else null - - // only properties that consist of just objects (or are empty) can be merged to key/value FrameColumns - if (isKeyValue && (hasPrimitive || hasArray)) { - error("Key value path $jsonPath does not match objects.") - } - - // Create columns from the collected names - val columns: List = when { - // instead of using the names, generate a single key/value frame column - isKeyValue -> { - val dataFrames = records.map { - when (it) { - is JsonObject -> { - val map = it.map.mapValues { (key, value) -> - val parsed = fromJsonListArrayAndValueColumns( - records = listOf(value), - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.append(key), - ) - if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.first() - else parsed.unwrapUnnamedColumns().firstOrNull() - } - val valueType = - map.values.map { guessValueType(sequenceOf(it)) } - .commonType() - - dataFrameOf( - columnOf(*map.keys.toTypedArray()).named(KeyValueProperty<*>::key.name), - createColumn( - values = map.values, - suggestedType = valueType, - guessType = false, - ).named(KeyValueProperty<*>::value.name), - ) - } - - null -> DataFrame.emptyOf() - else -> error("Expected JsonObject, got $it") - } - } - - listOf( - UnnamedColumn( - DataColumn.createFrameColumn( - name = valueColumnName, // will be erased unless at top-level - groups = dataFrames, - schema = lazy { - dataFrames.mapNotNull { it.takeIf { it.nrow > 0 }?.schema() }.intersectSchemas() - }, - ) - ) - ) - } - - // generate columns using the collected names - else -> - nameGenerator.names.map { colName -> - when { - // Collect primitive values from records into the `value` column if needed - colName == valueColumn && (hasPrimitive || records.isEmpty()) -> { - val collector = createDataCollector(records.size) - val nanIndices = mutableListOf() - records.forEachIndexed { i, v -> - when (v) { - is JsonObject -> collector.add(null) - is JsonArray<*> -> collector.add(null) - "NaN" -> { 
- nanIndices.add(i) - collector.add(null) - } - - else -> collector.add(v) - } - } - val column = collector.toColumn(colName) - val res = if (nanIndices.isNotEmpty()) { - fun DataColumn.updateNaNs(nanValue: C): DataColumn { - var j = 0 - var nextNanIndex = nanIndices[j] - return mapIndexed(column.type) { i, v -> - if (i == nextNanIndex) { - j++ - nextNanIndex = if (j < nanIndices.size) nanIndices[j] else -1 - nanValue - } else v - } - } - when (column.typeClass) { - Double::class -> column.cast().updateNaNs(Double.NaN) - Float::class -> column.cast().updateNaNs(Float.NaN) - String::class -> column.cast().updateNaNs("NaN") - else -> column - } - } else column - UnnamedColumn(res) - } - - // Collect arrays from records into the `array` column if needed - colName == arrayColumn && hasArray -> { - val values = mutableListOf() - val startIndices = ArrayList() - records.forEach { - startIndices.add(values.size) - if (it is JsonArray<*>) values.addAll(it.value) - } - val parsed = fromJsonListArrayAndValueColumns( - records = values, - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.appendArrayWithWildcard(), - ) - - val res = when { - parsed.isSingleUnnamedColumn() -> { - val col = (parsed.getColumn(0) as UnnamedColumn).col - val elementType = col.type - val values = col.values.asList().splitByIndices(startIndices.asSequence()).toList() - DataColumn.createValueColumn( - name = colName, - values = values, - type = List::class.createType(listOf(KTypeProjection.invariant(elementType))), - ) - } - - else -> DataColumn.createFrameColumn(colName, parsed.unwrapUnnamedColumns(), startIndices) - } - UnnamedColumn(res) - } - - // Collect the current column name as property from the objects in records - else -> { - val values = ArrayList(records.size) - records.forEach { - when (it) { - is JsonObject -> values.add(it[colName]) - else -> values.add(null) - } - } - - val parsed = fromJsonListArrayAndValueColumns( - records = values, - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.append(colName), - ) - when { - parsed.ncol == 0 -> - DataColumn.createValueColumn( - name = colName, - values = arrayOfNulls(values.size).toList(), - type = typeOf(), - ) - - parsed.isSingleUnnamedColumn() -> - (parsed.getColumn(0) as UnnamedColumn).col.rename(colName) - - else -> - DataColumn.createColumnGroup(colName, parsed.unwrapUnnamedColumns()) as AnyCol - } - } - } - } - } - - return when { - columns.isEmpty() -> - DataFrame.empty(records.size) - - columns.size == 1 && hasArray && header.isNotEmpty() && columns[0].typeClass == List::class -> - columns[0] - .cast>() - .splitInto(*header.toTypedArray()) - - else -> - columns.toDataFrame() - } -} - -// we need it to check if AnyFrame created by recursive call has single unnamed column, -// unnamed column means this column is not created from field of a record [{"value": 1}, {"value": 2}], -// but filtered values [1, { ... }, []] -> [1, null, null] -// or arrays: [1, { ...}, []] -> [null, null, []] -private class UnnamedColumn(val col: DataColumn) : DataColumn by col - -private val valueTypes = - setOf(Boolean::class, Double::class, Int::class, Float::class, Long::class, Short::class, Byte::class) - -internal fun KlaxonJson.encodeRow(frame: ColumnsContainer<*>, index: Int): JsonObject? 
{ - val values = frame.columns().map { col -> - when { - col is ColumnGroup<*> -> encodeRow(col, index) - col is FrameColumn<*> -> encodeFrame(col[index]) - col.isList() -> { - col[index]?.let { array(it as List<*>) } ?: array() - } - - col.typeClass in valueTypes -> { - val v = col[index] - if ((v is Double && v.isNaN()) || (v is Float && v.isNaN())) { - v.toString() - } else v - } - - else -> col[index]?.toString() - }.let { col.name to it } - } - if (values.isEmpty()) return null - return obj(values) -} - -internal fun KlaxonJson.encodeFrame(frame: AnyFrame): JsonArray<*> { - val allColumns = frame.columns() - - // if there is only 1 column, then `isValidValueColumn` always true. - // But at the same time, we shouldn't treat dataFrameOf("value")(1,2,3) like unnamed column - // because it was created by user. - val isPossibleToFindUnnamedColumns = allColumns.size != 1 - val valueColumn = allColumns.filter { it.name.startsWith(valueColumnName) } - .takeIf { isPossibleToFindUnnamedColumns } - ?.maxByOrNull { it.name }?.let { valueCol -> - if (valueCol.kind() != ColumnKind.Value) { // check that value in this column is not null only when other values are null - null - } else { - // check that value in this column is not null only when other values are null - val isValidValueColumn = frame.rows().all { row -> - if (valueCol[row] != null) { - allColumns.all { col -> - if (col.name != valueCol.name) col[row] == null - else true - } - } else true - } - if (isValidValueColumn) valueCol - else null - } - } - - val arrayColumn = frame.columns().filter { it.name.startsWith(arrayColumnName) } - .takeIf { isPossibleToFindUnnamedColumns } - ?.maxByOrNull { it.name }?.let { arrayCol -> - if (arrayCol.kind() == ColumnKind.Group) null - else { - // check that value in this column is not null only when other values are null - val isValidArrayColumn = frame.rows().all { row -> - if (arrayCol[row] != null) { - allColumns.all { col -> - if (col.name != arrayCol.name) col[row] == null - else true - } - } else true - } - if (isValidArrayColumn) arrayCol - else null - } - } - - val arraysAreFrames = arrayColumn?.kind() == ColumnKind.Frame - - val data = frame.indices().map { rowIndex -> - valueColumn?.get(rowIndex) ?: arrayColumn?.get(rowIndex) - ?.let { if (arraysAreFrames) encodeFrame(it as AnyFrame) else null } ?: encodeRow(frame, rowIndex) - } - return array(data) -} - -public fun AnyFrame.toJson(prettyPrint: Boolean = false, canonical: Boolean = false): String { +public fun AnyFrame.toJsonWithMetadata( + rowLimit: Int, + nestedRowLimit: Int? 
= null, + prettyPrint: Boolean = false, + canonical: Boolean = false +): String { return json { - encodeFrame(this@toJson) + encodeDataFrameWithMetadata(this@toJsonWithMetadata, rowLimit, nestedRowLimit) }.toJsonString(prettyPrint, canonical) } diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt index 1486448cd1..536470fa84 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt @@ -1,12 +1,12 @@ package org.jetbrains.kotlinx.dataframe.jupyter import com.beust.klaxon.json -import org.jetbrains.kotlinx.dataframe.api.rows -import org.jetbrains.kotlinx.dataframe.api.toDataFrame +import org.jetbrains.kotlinx.dataframe.api.take +import org.jetbrains.kotlinx.dataframe.impl.io.encodeFrame import org.jetbrains.kotlinx.dataframe.io.DataFrameHtmlData import org.jetbrains.kotlinx.dataframe.io.DisplayConfiguration -import org.jetbrains.kotlinx.dataframe.io.encodeFrame import org.jetbrains.kotlinx.dataframe.io.toHTML +import org.jetbrains.kotlinx.dataframe.io.toJsonWithMetadata import org.jetbrains.kotlinx.dataframe.io.toStaticHtml import org.jetbrains.kotlinx.dataframe.jupyter.KotlinNotebookPluginUtils.convertToDataFrame import org.jetbrains.kotlinx.dataframe.nrow @@ -22,6 +22,7 @@ import org.jetbrains.kotlinx.jupyter.api.renderHtmlAsIFrameIfNeeded /** Starting from this version, dataframe integration will respond with additional data for rendering in Kotlin Notebooks plugin. */ private const val MIN_KERNEL_VERSION_FOR_NEW_TABLES_UI = "0.11.0.311" +private const val MIN_IDE_VERSION_SUPPORT_JSON_WITH_METADATA = 241 internal class JupyterHtmlRenderer( val display: DisplayConfiguration, @@ -60,21 +61,32 @@ internal inline fun JupyterHtmlRenderer.render( val staticHtml = df.toStaticHtml(reifiedDisplayConfiguration, DefaultCellRenderer).toJupyterHtmlData() if (notebook.kernelVersion >= KotlinKernelVersion.from(MIN_KERNEL_VERSION_FOR_NEW_TABLES_UI)!!) 
{ - val jsonEncodedDf = json { - obj( - "nrow" to df.size.nrow, - "ncol" to df.size.ncol, - "columns" to df.columnNames(), - "kotlin_dataframe" to encodeFrame(df.rows().take(limit).toDataFrame()), - ) - }.toJsonString() + val ideBuildNumber = KotlinNotebookPluginUtils.getKotlinNotebookIDEBuildNumber() + + val jsonEncodedDf = + if (ideBuildNumber == null || ideBuildNumber.majorVersion < MIN_IDE_VERSION_SUPPORT_JSON_WITH_METADATA) { + json { + obj( + "nrow" to df.size.nrow, + "ncol" to df.size.ncol, + "columns" to df.columnNames(), + "kotlin_dataframe" to encodeFrame(df.take(limit)), + ) + }.toJsonString() + } else { + df.toJsonWithMetadata(limit, reifiedDisplayConfiguration.rowsLimit) + } notebook.renderAsIFrameAsNeeded(html, staticHtml, jsonEncodedDf) } else { notebook.renderHtmlAsIFrameIfNeeded(html) } } -internal fun Notebook.renderAsIFrameAsNeeded(data: HtmlData, staticData: HtmlData, jsonEncodedDf: String): MimeTypedResult { +internal fun Notebook.renderAsIFrameAsNeeded( + data: HtmlData, + staticData: HtmlData, + jsonEncodedDf: String +): MimeTypedResult { val textHtml = if (jupyterClientType == JupyterClientType.KOTLIN_NOTEBOOK) { data.generateIframePlaneText(currentColorScheme) + staticData.toString(currentColorScheme) diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/KotlinNotebookPluginUtils.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/KotlinNotebookPluginUtils.kt index 0d80306d21..b0d8d28f5e 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/KotlinNotebookPluginUtils.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/KotlinNotebookPluginUtils.kt @@ -41,6 +41,8 @@ import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator * DISPLAY(KotlinNotebooksPluginUtils.getRowsSubsetForRendering(Out[...], 0, 20), "") */ public object KotlinNotebookPluginUtils { + private const val KTNB_IDE_BUILD_PROP = "KTNB_IDE_BUILD_NUMBER" + /** * Returns a subset of rows from the given dataframe for rendering. * It's used for example for dynamic pagination in Kotlin Notebook Plugin. @@ -166,4 +168,36 @@ public object KotlinNotebookPluginUtils { usedNames: List = emptyList() ): String = ColumnNameGenerator(usedNames).addUnique(preferredName) + + /** + * Retrieves the build number of the Kotlin Notebook IDE. + * + * @return The build number of the Kotlin Notebook IDE as an instance of [IdeBuildNumber], + * or null if the build number is not available. + */ + public fun getKotlinNotebookIDEBuildNumber(): IdeBuildNumber? { + val value = System.getProperty(KTNB_IDE_BUILD_PROP, null) ?: return null + return IdeBuildNumber.fromString(value) + } + + public data class IdeBuildNumber(val ideName: String, val majorVersion: Int, val buildId: Int) { + public companion object { + public fun fromString(buildNumber: String): IdeBuildNumber? { + val parts = buildNumber.split(";") + return if (parts.size >= 3) constructIdeBuildNumber(parts) else null + } + + private fun constructIdeBuildNumber(parts: List): IdeBuildNumber? 
{ + val ideName = parts[0] + val majorVersion = parts[1].toIntOrNull() + val buildId = parts[2].toIntOrNull() + + return if (majorVersion != null && buildId != null) { + IdeBuildNumber(ideName, majorVersion, buildId) + } else { + null + } + } + } + } } diff --git a/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt b/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt index f02d0060f3..a9328a214b 100644 --- a/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt +++ b/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt @@ -1,5 +1,8 @@ package org.jetbrains.kotlinx.dataframe.io +import com.beust.klaxon.JsonArray +import com.beust.klaxon.JsonObject +import com.beust.klaxon.Parser import io.kotest.matchers.collections.shouldBeIn import io.kotest.matchers.shouldBe import io.kotest.matchers.string.shouldContain @@ -22,14 +25,26 @@ import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.api.toDouble import org.jetbrains.kotlinx.dataframe.api.toMap import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup +import org.jetbrains.kotlinx.dataframe.columns.ColumnKind import org.jetbrains.kotlinx.dataframe.columns.FrameColumn import org.jetbrains.kotlinx.dataframe.columns.ValueColumn +import org.jetbrains.kotlinx.dataframe.impl.io.SERIALIZATION_VERSION +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.COLUMNS +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.DATA +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KIND +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAME +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.METADATA +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.VERSION import org.jetbrains.kotlinx.dataframe.impl.nothingType -import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.* +import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ANY_COLUMNS +import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ARRAY_AND_VALUE_COLUMNS +import org.jetbrains.kotlinx.dataframe.testJson import org.jetbrains.kotlinx.dataframe.type import org.jetbrains.kotlinx.dataframe.values import org.junit.Test -import kotlin.reflect.* +import kotlin.reflect.typeOf class JsonTests { @@ -951,4 +966,98 @@ class JsonTests { val df = dataFrameOf("a", "b")("1", null, "2", 12) df.toJson(canonical = true) shouldContain "\"b\":null" } + + @Test + @Suppress("UNCHECKED_CAST") + fun `json with metadata flat table`() { + @Language("json") + val data = """ + [{"id":3602279,"node_id":"MDEwOlJlcG9zaXRvcnkzNjAyMjc5","name":"kotlin-web-demo","full_name":"JetBrains/kotlin-web-demo"}] + """.trimIndent() + val df = DataFrame.readJsonStr(data) + val jsonStr = df.toJsonWithMetadata(df.rowsCount()).trimIndent() + val json = parseJsonStr(jsonStr) + + json[VERSION] shouldBe SERIALIZATION_VERSION + + val metadata = (json[METADATA] as JsonObject) + metadata[NROW] shouldBe 1 + metadata[NCOL] shouldBe 4 + val columns = metadata[COLUMNS] as List + columns shouldBe listOf("id", "node_id", "name", "full_name") + + val decodedData = json[KOTLIN_DATAFRAME] as JsonArray<*> + val decodedDf = DataFrame.readJsonStr(decodedData.toJsonString()) + decodedDf shouldBe df + } + + private fun parseJsonStr(jsonStr: String): JsonObject { + 
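Editor's sketch of the build-number parsing above: the string is expected to be the IDE name, major version and build id separated by semicolons. The concrete value below is a hypothetical example, not a string taken from the plugin.

import org.jetbrains.kotlinx.dataframe.jupyter.KotlinNotebookPluginUtils

fun main() {
    // Hypothetical value of the KTNB_IDE_BUILD_NUMBER system property.
    val parsed = KotlinNotebookPluginUtils.IdeBuildNumber.fromString("IC;241;14494")
    // Expected: IdeBuildNumber(ideName=IC, majorVersion=241, buildId=14494)
    println(parsed)

    // Fewer than three ";"-separated parts, or non-numeric parts, yield null.
    println(KotlinNotebookPluginUtils.IdeBuildNumber.fromString("IC;not-a-number")) // null
}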
val parser = Parser.default() + return parser.parse(StringBuilder(jsonStr)) as JsonObject + } + + @Test + fun `json with metadata column group`() { + @Language("json") + val data = """ + [{"permissions":{"admin":false,"maintain":false,"push":false,"triage":false,"pull":true}}] + """.trimIndent() + val df = DataFrame.readJsonStr(data) + val jsonStr = df.toJsonWithMetadata(df.rowsCount()).trimIndent() + val json = parseJsonStr(jsonStr) + + val row = (json[KOTLIN_DATAFRAME] as JsonArray<*>)[0] as JsonObject + + val permissions = row["permissions"] as JsonObject + val metadata = permissions[METADATA] as JsonObject + metadata[KIND] shouldBe ColumnKind.Group.toString() + + val decodedData = permissions[DATA] as JsonObject + + decodedData["admin"] shouldBe false + decodedData["maintain"] shouldBe false + decodedData["push"] shouldBe false + decodedData["triage"] shouldBe false + decodedData["pull"] shouldBe true + } + + @Test + fun `json with metadata frame column`() { + val df = DataFrame.readJson(testJson("repositories")) + val jsonStr = df.toJsonWithMetadata(df.rowsCount()).trimIndent() + val json = parseJsonStr(jsonStr) + val row = (json[KOTLIN_DATAFRAME] as JsonArray<*>)[0] as JsonObject + + val contributors = row["contributors"] as JsonObject + + val metadata = contributors[METADATA] as JsonObject + metadata[KIND] shouldBe ColumnKind.Frame.toString() + metadata[NCOL] shouldBe 8 + metadata[NROW] shouldBe 29 + + val decodedData = contributors[DATA] as JsonArray<*> + decodedData.size shouldBe 29 + + val decodedDf = DataFrame.readJsonStr(decodedData.toJsonString()) + decodedDf shouldBe df[0]["contributors"] as AnyFrame + } + + @Test + fun `json with metadata test row limit`() { + val df = DataFrame.readJson(testJson("repositories")) + val nestedFrameRowLimit = 20 + val jsonStr = df.toJsonWithMetadata(df.rowsCount(), nestedFrameRowLimit).trimIndent() + val json = parseJsonStr(jsonStr) + val row = (json[KOTLIN_DATAFRAME] as JsonArray<*>)[0] as JsonObject + + val contributors = row["contributors"] as JsonObject + + val metadata = contributors[METADATA] as JsonObject + metadata[KIND] shouldBe ColumnKind.Frame.toString() + metadata[NCOL] shouldBe 8 + metadata[NROW] shouldBe 29 + + val decodedData = contributors[DATA] as JsonArray<*> + decodedData.size shouldBe nestedFrameRowLimit + } } diff --git a/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt b/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt index 8f48073e73..a68f1ede3a 100644 --- a/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt +++ b/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt @@ -10,8 +10,12 @@ import io.kotest.matchers.shouldBe import io.kotest.matchers.string.shouldContain import io.kotest.matchers.string.shouldNotContain import org.intellij.lang.annotations.Language +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.DATA +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAME +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.METADATA import org.jetbrains.kotlinx.jupyter.api.MimeTypedResult import org.jetbrains.kotlinx.jupyter.testkit.JupyterReplTestCase +import org.junit.BeforeClass import org.junit.Test class RenderingTests : JupyterReplTestCase() { @@ -94,7 +98,7 @@ class RenderingTests : JupyterReplTestCase() { assertDataFrameDimensions(json, 30, 1) - val rows = 
json.array>("kotlin_dataframe")!! + val rows = json.array>(KOTLIN_DATAFRAME)!! rows.getObj(0).int("id") shouldBe 21 rows.getObj(rows.lastIndex).int("id") shouldBe 50 } @@ -111,8 +115,8 @@ class RenderingTests : JupyterReplTestCase() { } private fun assertDataFrameDimensions(json: JsonObject, expectedRows: Int, expectedColumns: Int) { - json.int("nrow") shouldBe expectedRows - json.int("ncol") shouldBe expectedColumns + json.obj(METADATA)!!.int("nrow") shouldBe expectedRows + json.obj(METADATA)!!.int("ncol") shouldBe expectedColumns } private fun parseDataframeJson(result: MimeTypedResult): JsonObject { @@ -120,7 +124,7 @@ class RenderingTests : JupyterReplTestCase() { return parser.parse(StringBuilder(result["application/kotlindataframe+json"]!!)) as JsonObject } - private fun JsonArray<*>.getObj(index: Int) = this.get(index) as JsonObject + private fun JsonArray<*>.getObj(index: Int) = this[index] as JsonObject @Test fun `test kotlin notebook plugin utils sort by one column asc`() { @@ -138,7 +142,7 @@ class RenderingTests : JupyterReplTestCase() { @Suppress("UNCHECKED_CAST") private fun assertSortedById(json: JsonObject, desc: Boolean) { - val rows = json["kotlin_dataframe"] as JsonArray + val rows = json[KOTLIN_DATAFRAME] as JsonArray var previousId = if (desc) 101 else 0 rows.forEach { row -> val currentId = row.int("id")!! @@ -177,7 +181,7 @@ class RenderingTests : JupyterReplTestCase() { assertDataFrameDimensions(json, 100, 2) - val rows = json["kotlin_dataframe"] as JsonArray + val rows = json[KOTLIN_DATAFRAME] as JsonArray assertSortedByCategory(rows) assertSortedById(rows) } @@ -213,16 +217,16 @@ class RenderingTests : JupyterReplTestCase() { val json = executeScriptAndParseDataframeResult( """ data class Row(val id: Int, val group: Int) - val df = (1..100).map { Row(it, if (it <= 50) 1 else 2) }.toDataFrame() + val df = (1..20).map { Row(it, if (it <= 10) 1 else 2) }.toDataFrame() KotlinNotebookPluginUtils.convertToDataFrame(df.groupBy("group")) """.trimIndent() ) assertDataFrameDimensions(json, 2, 2) - val rows = json.array>("kotlin_dataframe")!! - rows.getObj(0).array("group1")!!.size shouldBe 50 - rows.getObj(1).array("group1")!!.size shouldBe 50 + val rows = json.array>(KOTLIN_DATAFRAME)!! + (rows.getObj(0).obj("group1")!![DATA] as JsonArray<*>).size shouldBe 10 + (rows.getObj(1).obj("group1")!![DATA] as JsonArray<*>).size shouldBe 10 } // Regression KTNB-424 @@ -240,4 +244,15 @@ class RenderingTests : JupyterReplTestCase() { assertDataFrameDimensions(json, 2, 2) } } + + companion object { + /** + * Set the system property for the IDE version needed for specific serialization testing purposes. 
+ */ + @BeforeClass + @JvmStatic + internal fun setupOnce() { + System.setProperty("KTNB_IDE_BUILD_NUMBER", "IU;241;14015") + } + } } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readJson.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readJson.kt new file mode 100644 index 0000000000..f1053cda81 --- /dev/null +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readJson.kt @@ -0,0 +1,606 @@ +package org.jetbrains.kotlinx.dataframe.impl.io + +import com.beust.klaxon.JsonArray +import com.beust.klaxon.JsonObject +import org.jetbrains.kotlinx.dataframe.AnyCol +import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.DataColumn +import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.DataRow +import org.jetbrains.kotlinx.dataframe.api.JsonPath +import org.jetbrains.kotlinx.dataframe.api.KeyValueProperty +import org.jetbrains.kotlinx.dataframe.api.cast +import org.jetbrains.kotlinx.dataframe.api.columnOf +import org.jetbrains.kotlinx.dataframe.api.concat +import org.jetbrains.kotlinx.dataframe.api.dataFrameOf +import org.jetbrains.kotlinx.dataframe.api.firstOrNull +import org.jetbrains.kotlinx.dataframe.api.getColumn +import org.jetbrains.kotlinx.dataframe.api.mapIndexed +import org.jetbrains.kotlinx.dataframe.api.named +import org.jetbrains.kotlinx.dataframe.api.schema +import org.jetbrains.kotlinx.dataframe.api.splitInto +import org.jetbrains.kotlinx.dataframe.api.toDataFrame +import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup +import org.jetbrains.kotlinx.dataframe.columns.FrameColumn +import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator +import org.jetbrains.kotlinx.dataframe.impl.DataCollectorBase +import org.jetbrains.kotlinx.dataframe.impl.asList +import org.jetbrains.kotlinx.dataframe.impl.columns.createColumn +import org.jetbrains.kotlinx.dataframe.impl.commonType +import org.jetbrains.kotlinx.dataframe.impl.createDataCollector +import org.jetbrains.kotlinx.dataframe.impl.guessValueType +import org.jetbrains.kotlinx.dataframe.impl.schema.DataFrameSchemaImpl +import org.jetbrains.kotlinx.dataframe.impl.schema.extractSchema +import org.jetbrains.kotlinx.dataframe.impl.schema.intersectSchemas +import org.jetbrains.kotlinx.dataframe.impl.splitByIndices +import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic +import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ANY_COLUMNS +import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ARRAY_AND_VALUE_COLUMNS +import org.jetbrains.kotlinx.dataframe.io.arrayColumnName +import org.jetbrains.kotlinx.dataframe.io.valueColumnName +import org.jetbrains.kotlinx.dataframe.ncol +import org.jetbrains.kotlinx.dataframe.nrow +import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema +import org.jetbrains.kotlinx.dataframe.type +import org.jetbrains.kotlinx.dataframe.typeClass +import org.jetbrains.kotlinx.dataframe.values +import kotlin.reflect.KType +import kotlin.reflect.KTypeProjection +import kotlin.reflect.full.createType +import kotlin.reflect.typeOf + +private fun DataFrame.unwrapUnnamedColumns() = + dataFrameOf(columns().map { it.unwrapUnnamedColumn() }) + +private fun AnyCol.unwrapUnnamedColumn() = if (this is UnnamedColumn) col else this + +private enum class AnyColType { + ANY, + ARRAYS, + OBJECTS, +} + +internal interface AnyKeyValueProperty : KeyValueProperty { + override val value: Any? 
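A side note on the setup above: the KTNB_IDE_BUILD_NUMBER property is what JupyterHtmlRenderer.kt (later in this diff) parses to decide between the legacy payload and the new metadata payload. Below is a minimal sketch of that round trip, assuming the "ide;major;build" format used in setupOnce; the literal 241 reproduces the value of the renderer's private MIN_IDE_VERSION_SUPPORT_JSON_WITH_METADATA constant, which cannot be imported here.

import org.jetbrains.kotlinx.dataframe.jupyter.KotlinNotebookPluginUtils
import org.jetbrains.kotlinx.dataframe.jupyter.KotlinNotebookPluginUtils.IdeBuildNumber

fun main() {
    // Same value the test fixture sets: IDE product code, major version, build id, ';'-separated.
    System.setProperty("KTNB_IDE_BUILD_NUMBER", "IU;241;14015")

    val build = KotlinNotebookPluginUtils.getKotlinNotebookIDEBuildNumber()
    println(build) // IdeBuildNumber(ideName=IU, majorVersion=241, buildId=14015)

    // Per MIN_IDE_VERSION_SUPPORT_JSON_WITH_METADATA, major version 241 (2024.1) is the cutoff.
    val supportsMetadata = build != null && build.majorVersion >= 241
    println(supportsMetadata) // true

    // Malformed or missing values yield null, so the renderer falls back to the old payload.
    println(IdeBuildNumber.fromString("garbage")) // null
}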
+} + +internal fun readJson( + parsed: Any?, + header: List, + keyValuePaths: List = emptyList(), + typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS, +): DataFrame<*> { + val df: AnyFrame = when (typeClashTactic) { + ARRAY_AND_VALUE_COLUMNS -> { + when (parsed) { + is JsonArray<*> -> fromJsonListArrayAndValueColumns( + records = parsed.value, + header = header, + keyValuePaths = keyValuePaths, + ) + + else -> fromJsonListArrayAndValueColumns( + records = listOf(parsed), + keyValuePaths = keyValuePaths, + ) + } + } + + ANY_COLUMNS -> { + when (parsed) { + is JsonArray<*> -> fromJsonListAnyColumns( + records = parsed.value, + header = header, + keyValuePaths = keyValuePaths, + ) + + else -> fromJsonListAnyColumns( + records = listOf(parsed), + keyValuePaths = keyValuePaths, + ) + } + } + } + return df.unwrapUnnamedColumns() +} + +/** + * Json to DataFrame converter that creates [Any] columns. + * A.k.a. [TypeClashTactic.ANY_COLUMNS]. + * + * @param records List of json elements to be converted to a [DataFrame]. + * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[KeyValueProperty]> + * will be created. + * @param header Optional list of column names. If given, [records] will be read like an object with [header] being the keys. + * @return [DataFrame] from the given [records]. + */ +internal fun fromJsonListAnyColumns( + records: List<*>, + keyValuePaths: List = emptyList(), + header: List = emptyList(), + jsonPath: JsonPath = JsonPath(), +): AnyFrame { + var hasPrimitive = false + var hasArray = false + var hasObject = false + + // list element type can be JsonObject, JsonArray or primitive + val nameGenerator = ColumnNameGenerator() + records.forEach { + when (it) { + is JsonObject -> { + hasObject = true + it.entries.forEach { + nameGenerator.addIfAbsent(it.key) + } + } + + is JsonArray<*> -> hasArray = true + null -> Unit + else -> hasPrimitive = true + } + } + + val colType = when { + hasArray && !hasPrimitive && !hasObject -> AnyColType.ARRAYS + hasObject && !hasPrimitive && !hasArray -> AnyColType.OBJECTS + else -> AnyColType.ANY + } + val justPrimitives = hasPrimitive && !hasArray && !hasObject + val isKeyValue = keyValuePaths.any { jsonPath.matches(it) } + + if (isKeyValue && colType != AnyColType.OBJECTS) { + error("Key value path $jsonPath does not match objects.") + } + + @Suppress("KotlinConstantConditions") + val columns: List = when { + // Create one column of type Any? (or guessed primitive type) from all the records + colType == AnyColType.ANY -> { + val collector: DataCollectorBase = + if (justPrimitives) createDataCollector(records.size) // guess the type + else createDataCollector(records.size, typeOf()) // use Any? 
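Stepping out of the converter internals for a moment, here is what this ANY_COLUMNS branch means at the call site. This is a minimal sketch: it assumes DataFrame.readJsonStr exposes the same parameter names as the DataRow.readJsonStr overload shown later in this diff, and the shape described in the comments is approximate.

import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.schema
import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ANY_COLUMNS
import org.jetbrains.kotlinx.dataframe.io.readJsonStr

fun main() {
    // The property "a" clashes in type: an Int, then an object, then an array.
    val json = """[{ "a": 123 }, { "a": { "b": 1 } }, { "a": [1, 2, 3] }]"""

    val df = DataFrame.readJsonStr(json, typeClashTactic = ANY_COLUMNS)

    // With ANY_COLUMNS the clash is not split apart: "a" stays a single column typed as Any,
    // holding the primitive as-is, the object as a row, and the array as a list
    // (exactly what the collector above accumulates).
    println(df.schema())
}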
+ + val nanIndices = mutableListOf() + records.forEachIndexed { i, v -> + when (v) { + is JsonObject -> { + val parsed = + fromJsonListAnyColumns( + records = listOf(v), + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.replaceLastWildcardWithIndex(i), + ) + collector.add( + if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.first() + else parsed.firstOrNull() ?: DataRow.empty + ) + } + + is JsonArray<*> -> { + val parsed = fromJsonListAnyColumns( + records = v, + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.replaceLastWildcardWithIndex(i).appendArrayWithWildcard(), + ) + collector.add( + if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.asList() + else parsed.unwrapUnnamedColumns() + ) + } + + "NaN" -> { + nanIndices.add(i) + collector.add(null) + } + + else -> collector.add(v) + } + } + val column = collector.toColumn(valueColumnName) + val res = if (nanIndices.isNotEmpty()) { + fun DataColumn.updateNaNs(nanValue: C): DataColumn { + var j = 0 + var nextNanIndex = nanIndices[j] + return mapIndexed(column.type) { i, v -> + if (i == nextNanIndex) { + j++ + nextNanIndex = if (j < nanIndices.size) nanIndices[j] else -1 + nanValue + } else v + } + } + when (column.typeClass) { + Double::class -> column.cast().updateNaNs(Double.NaN) + Float::class -> column.cast().updateNaNs(Float.NaN) + String::class -> column.cast().updateNaNs("NaN") + else -> column + } + } else column + listOf(UnnamedColumn(res)) + } + + // Create one column of type FrameColumn, or List<> from all the records if they are all arrays + colType == AnyColType.ARRAYS -> { + val values = mutableListOf() + val startIndices = ArrayList() + records.forEach { + startIndices.add(values.size) + when (it) { + is JsonArray<*> -> values.addAll(it.value) + null -> Unit + else -> error("Expected JsonArray, got $it") + } + } + val parsed = fromJsonListAnyColumns( + records = values, + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.appendArrayWithWildcard(), + ) + + val res = when { + parsed.isSingleUnnamedColumn() -> { + val col = (parsed.getColumn(0) as UnnamedColumn).col + val elementType = col.type + val values = col.values.asList().splitByIndices(startIndices.asSequence()).toList() + DataColumn.createValueColumn( + name = arrayColumnName, + values = values, + type = List::class.createType(listOf(KTypeProjection.invariant(elementType))), + ) + } + + else -> DataColumn.createFrameColumn( + name = arrayColumnName, // will be erased + df = parsed.unwrapUnnamedColumns(), + startIndices = startIndices, + ) + } + listOf(UnnamedColumn(res)) + } + + // Create one column of type FrameColumn + colType == AnyColType.OBJECTS && isKeyValue -> { + // collect the value types to make sure Value columns with lists and other values aren't all turned into lists + val valueTypes = mutableSetOf() + val dataFrames = records.map { + when (it) { + is JsonObject -> { + val map = it.map.mapValues { (key, value) -> + val parsed = fromJsonListAnyColumns( + records = listOf(value), + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.append(key), + ) + if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.first() + else parsed.unwrapUnnamedColumns().firstOrNull() + } + val valueType = map.values.map { + guessValueType(sequenceOf(it)) + }.commonType() + + valueTypes += valueType + + dataFrameOf( + columnOf(*map.keys.toTypedArray()).named(KeyValueProperty<*>::key.name), + createColumn(values = map.values, suggestedType = valueType, guessType = 
false) + .named(KeyValueProperty<*>::value.name), + ) + } + + null -> DataFrame.emptyOf() + else -> error("Expected JsonObject, got $it") + } + } + + val valueColumns = dataFrames.map { it[KeyValueProperty<*>::value.name] } + val valueColumnSchema = when { + // in these cases we can safely combine the columns to get a single column schema + valueColumns.all { it is ColumnGroup<*> } || valueColumns.all { it is FrameColumn<*> } -> + valueColumns.concat().extractSchema() + // to avoid listification, we create the value columns schema ourselves (https://github.com/Kotlin/dataframe/issues/184) + else -> ColumnSchema.Value(valueTypes.commonType()) + } + + listOf( + UnnamedColumn( + DataColumn.createFrameColumn( + name = valueColumnName, // will be erased unless at top-level + groups = dataFrames, + schema = lazy { + DataFrameSchemaImpl( + columns = mapOf( + KeyValueProperty<*>::key.name to ColumnSchema.Value(typeOf()), + KeyValueProperty<*>::value.name to valueColumnSchema, + ) + ) + }, + ) + ) + ) + } + + // Create multiple columns from all the records if they are all objects, merging the objects in essence + colType == AnyColType.OBJECTS && !isKeyValue -> { + nameGenerator.names.map { colName -> + val values = ArrayList(records.size) + + records.forEach { + when (it) { + is JsonObject -> values.add(it[colName]) + null -> values.add(null) + else -> error("Expected JsonObject, got $it") + } + } + + val parsed = fromJsonListAnyColumns( + records = values, + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.append(colName), + ) + when { + parsed.ncol == 0 -> + DataColumn.createValueColumn( + name = colName, + values = arrayOfNulls(values.size).toList(), + type = typeOf(), + ) + + parsed.isSingleUnnamedColumn() -> + (parsed.getColumn(0) as UnnamedColumn).col.rename(colName) + + else -> + DataColumn.createColumnGroup(colName, parsed.unwrapUnnamedColumns()) as AnyCol + } + } + } + + else -> error("") + } + + return when { + columns.isEmpty() -> DataFrame.empty(records.size) + + columns.size == 1 && hasArray && header.isNotEmpty() && columns[0].typeClass == List::class -> + columns[0] + .cast>() + .splitInto(*header.toTypedArray()) + + else -> columns.toDataFrame() + } +} + +private fun AnyFrame.isSingleUnnamedColumn() = ncol == 1 && getColumn(0) is UnnamedColumn + +/** + * Json to DataFrame converter that creates `value` and `array` accessors + * instead of [Any] columns. + * A.k.a. [TypeClashTactic.ARRAY_AND_VALUE_COLUMNS]. + * + * @param records List of json elements to be converted to a [DataFrame]. + * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[KeyValueProperty]> + * will be created. + * @param header Optional list of column names. If given, [records] will be read like an object with [header] being the keys. + * @return [DataFrame] from the given [records]. + */ +internal fun fromJsonListArrayAndValueColumns( + records: List<*>, + keyValuePaths: List = emptyList(), + header: List = emptyList(), + jsonPath: JsonPath = JsonPath(), +): AnyFrame { + var hasPrimitive = false + var hasArray = false + val isKeyValue = keyValuePaths.any { jsonPath.matches(it) } + + // list element type can be JsonObject, JsonArray or primitive + // So first, we gather all properties of objects to merge including "array" and "value" if needed + // so the resulting type of a property with instances 123, ["abc"], and { "a": 1, "b": 2 } will be + // { array: List, value: Int?, a: Int?, b: Int?
} + // and instances will look like + // { "array": [], "value": 123, "a": null, "b": null } + + val nameGenerator = ColumnNameGenerator() + records.forEach { + when (it) { + is JsonObject -> it.entries.forEach { + nameGenerator.addIfAbsent(it.key) + } + + is JsonArray<*> -> hasArray = true + null -> Unit + else -> hasPrimitive = true + } + } + if (records.all { it == null }) hasPrimitive = true + + // Add a value column to the collected names if needed + val valueColumn = if (hasPrimitive || records.isEmpty()) { + nameGenerator.addUnique(valueColumnName) + } else null + + // Add an array column to the collected names if needed + val arrayColumn = if (hasArray) { + nameGenerator.addUnique(arrayColumnName) + } else null + + // only properties that consist of just objects (or are empty) can be merged to key/value FrameColumns + if (isKeyValue && (hasPrimitive || hasArray)) { + error("Key value path $jsonPath does not match objects.") + } + + // Create columns from the collected names + val columns: List = when { + // instead of using the names, generate a single key/value frame column + isKeyValue -> { + val dataFrames = records.map { + when (it) { + is JsonObject -> { + val map = it.map.mapValues { (key, value) -> + val parsed = fromJsonListArrayAndValueColumns( + records = listOf(value), + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.append(key), + ) + if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.first() + else parsed.unwrapUnnamedColumns().firstOrNull() + } + val valueType = + map.values.map { guessValueType(sequenceOf(it)) } + .commonType() + + dataFrameOf( + columnOf(*map.keys.toTypedArray()).named(KeyValueProperty<*>::key.name), + createColumn( + values = map.values, + suggestedType = valueType, + guessType = false, + ).named(KeyValueProperty<*>::value.name), + ) + } + + null -> DataFrame.emptyOf() + else -> error("Expected JsonObject, got $it") + } + } + + listOf( + UnnamedColumn( + DataColumn.createFrameColumn( + name = valueColumnName, // will be erased unless at top-level + groups = dataFrames, + schema = lazy { + dataFrames.mapNotNull { it.takeIf { it.nrow > 0 }?.schema() }.intersectSchemas() + }, + ) + ) + ) + } + + // generate columns using the collected names + else -> + nameGenerator.names.map { colName -> + when { + // Collect primitive values from records into the `value` column if needed + colName == valueColumn && (hasPrimitive || records.isEmpty()) -> { + val collector = createDataCollector(records.size) + val nanIndices = mutableListOf() + records.forEachIndexed { i, v -> + when (v) { + is JsonObject -> collector.add(null) + is JsonArray<*> -> collector.add(null) + "NaN" -> { + nanIndices.add(i) + collector.add(null) + } + + else -> collector.add(v) + } + } + val column = collector.toColumn(colName) + val res = if (nanIndices.isNotEmpty()) { + fun DataColumn.updateNaNs(nanValue: C): DataColumn { + var j = 0 + var nextNanIndex = nanIndices[j] + return mapIndexed(column.type) { i, v -> + if (i == nextNanIndex) { + j++ + nextNanIndex = if (j < nanIndices.size) nanIndices[j] else -1 + nanValue + } else v + } + } + when (column.typeClass) { + Double::class -> column.cast().updateNaNs(Double.NaN) + Float::class -> column.cast().updateNaNs(Float.NaN) + String::class -> column.cast().updateNaNs("NaN") + else -> column + } + } else column + UnnamedColumn(res) + } + + // Collect arrays from records into the `array` column if needed + colName == arrayColumn && hasArray -> { + val values = mutableListOf() + val startIndices = 
ArrayList() + records.forEach { + startIndices.add(values.size) + if (it is JsonArray<*>) values.addAll(it.value) + } + val parsed = fromJsonListArrayAndValueColumns( + records = values, + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.appendArrayWithWildcard(), + ) + + val res = when { + parsed.isSingleUnnamedColumn() -> { + val col = (parsed.getColumn(0) as UnnamedColumn).col + val elementType = col.type + val values = col.values.asList().splitByIndices(startIndices.asSequence()).toList() + DataColumn.createValueColumn( + name = colName, + values = values, + type = List::class.createType(listOf(KTypeProjection.invariant(elementType))), + ) + } + + else -> DataColumn.createFrameColumn(colName, parsed.unwrapUnnamedColumns(), startIndices) + } + UnnamedColumn(res) + } + + // Collect the current column name as property from the objects in records + else -> { + val values = ArrayList(records.size) + records.forEach { + when (it) { + is JsonObject -> values.add(it[colName]) + else -> values.add(null) + } + } + + val parsed = fromJsonListArrayAndValueColumns( + records = values, + keyValuePaths = keyValuePaths, + jsonPath = jsonPath.append(colName), + ) + when { + parsed.ncol == 0 -> + DataColumn.createValueColumn( + name = colName, + values = arrayOfNulls(values.size).toList(), + type = typeOf(), + ) + + parsed.isSingleUnnamedColumn() -> + (parsed.getColumn(0) as UnnamedColumn).col.rename(colName) + + else -> + DataColumn.createColumnGroup(colName, parsed.unwrapUnnamedColumns()) as AnyCol + } + } + } + } + } + + return when { + columns.isEmpty() -> + DataFrame.empty(records.size) + + columns.size == 1 && hasArray && header.isNotEmpty() && columns[0].typeClass == List::class -> + columns[0] + .cast>() + .splitInto(*header.toTypedArray()) + + else -> + columns.toDataFrame() + } +} + +// we need it to check if AnyFrame created by recursive call has single unnamed column, +// unnamed column means this column is not created from field of a record [{"value": 1}, {"value": 2}], +// but filtered values [1, { ... 
}, []] -> [1, null, null] +// or arrays: [1, { ...}, []] -> [null, null, []] +private class UnnamedColumn(val col: DataColumn) : DataColumn by col diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt new file mode 100644 index 0000000000..63cfcb03f0 --- /dev/null +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt @@ -0,0 +1,212 @@ +package org.jetbrains.kotlinx.dataframe.impl.io + +import com.beust.klaxon.JsonArray +import com.beust.klaxon.JsonObject +import com.beust.klaxon.KlaxonJson +import org.jetbrains.kotlinx.dataframe.AnyCol +import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.ColumnsContainer +import org.jetbrains.kotlinx.dataframe.DataColumn +import org.jetbrains.kotlinx.dataframe.api.indices +import org.jetbrains.kotlinx.dataframe.api.isList +import org.jetbrains.kotlinx.dataframe.api.name +import org.jetbrains.kotlinx.dataframe.api.rows +import org.jetbrains.kotlinx.dataframe.api.take +import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup +import org.jetbrains.kotlinx.dataframe.columns.ColumnKind +import org.jetbrains.kotlinx.dataframe.columns.FrameColumn +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.COLUMNS +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.DATA +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KIND +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAME +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.METADATA +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.VERSION +import org.jetbrains.kotlinx.dataframe.io.arrayColumnName +import org.jetbrains.kotlinx.dataframe.io.valueColumnName +import org.jetbrains.kotlinx.dataframe.ncol +import org.jetbrains.kotlinx.dataframe.nrow +import org.jetbrains.kotlinx.dataframe.typeClass + +internal fun KlaxonJson.encodeRow(frame: ColumnsContainer<*>, index: Int): JsonObject? { + val values = frame.columns().map { col -> + when (col) { + is ColumnGroup<*> -> encodeRow(col, index) + is FrameColumn<*> -> encodeFrame(col[index]) + else -> encodeValue(col, index) + }.let { col.name to it } + } + if (values.isEmpty()) return null + return obj(values) +} + +internal object SerializationKeys { + const val DATA = "data" + const val METADATA = "metadata" + const val KIND = "kind" + const val NCOL = "ncol" + const val NROW = "nrow" + const val VERSION = "\$version" + const val COLUMNS = "columns" + const val KOTLIN_DATAFRAME = "kotlin_dataframe" +} + +internal const val SERIALIZATION_VERSION = "2.0.0" + +internal fun KlaxonJson.encodeRowWithMetadata( + frame: ColumnsContainer<*>, + index: Int, + rowLimit: Int? = null +): JsonObject? 
{ + val values = frame.columns().map { col -> + when (col) { + is ColumnGroup<*> -> obj( + DATA to encodeRowWithMetadata(col, index, rowLimit), + METADATA to obj(KIND to ColumnKind.Group.toString()) + ) + + is FrameColumn<*> -> { + val data = if (rowLimit == null) encodeFrameWithMetadata(col[index]) + else encodeFrameWithMetadata(col[index].take(rowLimit), rowLimit) + obj( + DATA to data, + METADATA to obj( + KIND to ColumnKind.Frame.toString(), + NCOL to col[index].ncol, + NROW to col[index].nrow + ) + ) + } + + else -> encodeValue(col, index) + }.let { col.name to it } + } + if (values.isEmpty()) return null + return obj(values) +} + +private val valueTypes = + setOf(Boolean::class, Double::class, Int::class, Float::class, Long::class, Short::class, Byte::class) + +internal fun KlaxonJson.encodeValue(col: AnyCol, index: Int): Any? = when { + col.isList() -> col[index]?.let { array(it as List<*>) } ?: array() + col.typeClass in valueTypes -> { + val v = col[index] + if ((v is Double && v.isNaN()) || (v is Float && v.isNaN())) { + v.toString() + } else v + } + + else -> col[index]?.toString() +} + +internal fun KlaxonJson.encodeFrameWithMetadata(frame: AnyFrame, rowLimit: Int? = null): JsonArray<*> { + val valueColumn = frame.extractValueColumn() + val arrayColumn = frame.extractArrayColumn() + + val arraysAreFrames = arrayColumn?.kind() == ColumnKind.Frame + + val data = frame.indices().map { rowIndex -> + valueColumn + ?.get(rowIndex) + ?: arrayColumn?.get(rowIndex) + ?.let { + if (arraysAreFrames) encodeFrameWithMetadata(it as AnyFrame, rowLimit) else null + } + ?: encodeRowWithMetadata(frame, rowIndex, rowLimit) + } + + return array(data) +} + +internal fun AnyFrame.extractValueColumn(): DataColumn<*>? { + val allColumns = columns() + + return allColumns.filter { it.name.startsWith(valueColumnName) } + .takeIf { isPossibleToFindUnnamedColumns } + ?.maxByOrNull { it.name }?.let { valueCol -> + if (valueCol.kind() != ColumnKind.Value) { // check that value in this column is not null only when other values are null + null + } else { + // check that value in this column is not null only when other values are null + val isValidValueColumn = rows().all { row -> + if (valueCol[row] != null) { + allColumns.all { col -> + if (col.name != valueCol.name) col[row] == null + else true + } + } else true + } + if (isValidValueColumn) valueCol + else null + } + } +} + +// if there is only 1 column, then `isValidValueColumn` always true. +// But at the same time, we shouldn't treat dataFrameOf("value")(1,2,3) like unnamed column +// because it was created by user. +internal val AnyFrame.isPossibleToFindUnnamedColumns: Boolean + get() = columns().size != 1 + +internal fun AnyFrame.extractArrayColumn(): DataColumn<*>? 
{ + val allColumns = columns() + + return columns().filter { it.name.startsWith(arrayColumnName) } + .takeIf { isPossibleToFindUnnamedColumns } + ?.maxByOrNull { it.name }?.let { arrayCol -> + if (arrayCol.kind() == ColumnKind.Group) null + else { + // check that value in this column is not null only when other values are null + val isValidArrayColumn = rows().all { row -> + if (arrayCol[row] != null) { + allColumns.all { col -> + if (col.name != arrayCol.name) col[row] == null + else true + } + } else true + } + if (isValidArrayColumn) arrayCol + else null + } + } +} + +internal fun KlaxonJson.encodeFrame(frame: AnyFrame): JsonArray<*> { + val valueColumn = frame.extractValueColumn() + val arrayColumn = frame.extractArrayColumn() + + val arraysAreFrames = arrayColumn?.kind() == ColumnKind.Frame + + val data = frame.indices().map { rowIndex -> + valueColumn + ?.get(rowIndex) + ?: arrayColumn?.get(rowIndex) + ?.let { + if (arraysAreFrames) encodeFrame(it as AnyFrame) else null + } + ?: encodeRow(frame, rowIndex) + } + + return array(data) +} + +internal fun KlaxonJson.encodeDataFrameWithMetadata( + frame: AnyFrame, + rowLimit: Int, + nestedRowLimit: Int? = null, +): JsonObject { + return obj( + VERSION to SERIALIZATION_VERSION, + METADATA to obj( + COLUMNS to frame.columnNames(), + NROW to frame.rowsCount(), + NCOL to frame.columnsCount() + ), + KOTLIN_DATAFRAME to encodeFrameWithMetadata( + frame.take(rowLimit), + rowLimit = nestedRowLimit + ), + ) +} diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt index b464ccdb5e..e648458f90 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt @@ -1,66 +1,28 @@ package org.jetbrains.kotlinx.dataframe.io -import com.beust.klaxon.JsonArray -import com.beust.klaxon.JsonObject -import com.beust.klaxon.KlaxonJson import com.beust.klaxon.Parser import com.beust.klaxon.json -import org.jetbrains.kotlinx.dataframe.AnyCol import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.AnyRow -import org.jetbrains.kotlinx.dataframe.ColumnsContainer -import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.api.JsonPath import org.jetbrains.kotlinx.dataframe.api.KeyValueProperty -import org.jetbrains.kotlinx.dataframe.api.cast -import org.jetbrains.kotlinx.dataframe.api.columnOf -import org.jetbrains.kotlinx.dataframe.api.concat -import org.jetbrains.kotlinx.dataframe.api.dataFrameOf -import org.jetbrains.kotlinx.dataframe.api.firstOrNull -import org.jetbrains.kotlinx.dataframe.api.getColumn -import org.jetbrains.kotlinx.dataframe.api.indices -import org.jetbrains.kotlinx.dataframe.api.isList -import org.jetbrains.kotlinx.dataframe.api.mapIndexed -import org.jetbrains.kotlinx.dataframe.api.name -import org.jetbrains.kotlinx.dataframe.api.named -import org.jetbrains.kotlinx.dataframe.api.rows -import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.api.single -import org.jetbrains.kotlinx.dataframe.api.splitInto -import org.jetbrains.kotlinx.dataframe.api.toDataFrame import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadJsonMethod import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup -import 
org.jetbrains.kotlinx.dataframe.columns.ColumnKind import org.jetbrains.kotlinx.dataframe.columns.FrameColumn -import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator -import org.jetbrains.kotlinx.dataframe.impl.DataCollectorBase -import org.jetbrains.kotlinx.dataframe.impl.asList -import org.jetbrains.kotlinx.dataframe.impl.columns.createColumn -import org.jetbrains.kotlinx.dataframe.impl.commonType -import org.jetbrains.kotlinx.dataframe.impl.createDataCollector -import org.jetbrains.kotlinx.dataframe.impl.guessValueType -import org.jetbrains.kotlinx.dataframe.impl.schema.DataFrameSchemaImpl -import org.jetbrains.kotlinx.dataframe.impl.schema.extractSchema -import org.jetbrains.kotlinx.dataframe.impl.schema.intersectSchemas -import org.jetbrains.kotlinx.dataframe.impl.splitByIndices +import org.jetbrains.kotlinx.dataframe.impl.io.encodeDataFrameWithMetadata +import org.jetbrains.kotlinx.dataframe.impl.io.encodeFrame +import org.jetbrains.kotlinx.dataframe.impl.io.encodeRow +import org.jetbrains.kotlinx.dataframe.impl.io.readJson import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ANY_COLUMNS import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ARRAY_AND_VALUE_COLUMNS -import org.jetbrains.kotlinx.dataframe.ncol -import org.jetbrains.kotlinx.dataframe.nrow -import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema -import org.jetbrains.kotlinx.dataframe.type -import org.jetbrains.kotlinx.dataframe.typeClass -import org.jetbrains.kotlinx.dataframe.values import java.io.File import java.io.InputStream import java.net.URL -import kotlin.reflect.KType -import kotlin.reflect.KTypeProjection -import kotlin.reflect.full.createType import kotlin.reflect.typeOf public class JSON( @@ -145,6 +107,9 @@ public class JSON( } } +public const val arrayColumnName: String = "array" +public const val valueColumnName: String = "value" + /** * @param file Where to fetch the Json as [InputStream] to be converted to a [DataFrame]. * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[KeyValueProperty]> @@ -295,649 +260,33 @@ public fun DataRow.Companion.readJsonStr( typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS, ): AnyRow = DataFrame.readJsonStr(text, header, keyValuePaths, typeClashTactic).single() -private fun readJson( - parsed: Any?, - header: List, - keyValuePaths: List = emptyList(), - typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS, -): DataFrame<*> { - val df: AnyFrame = when (typeClashTactic) { - ARRAY_AND_VALUE_COLUMNS -> { - when (parsed) { - is JsonArray<*> -> fromJsonListArrayAndValueColumns( - records = parsed.value, - header = header, - keyValuePaths = keyValuePaths, - ) - - else -> fromJsonListArrayAndValueColumns( - records = listOf(parsed), - keyValuePaths = keyValuePaths, - ) - } - } - - ANY_COLUMNS -> { - when (parsed) { - is JsonArray<*> -> fromJsonListAnyColumns( - records = parsed.value, - header = header, - keyValuePaths = keyValuePaths, - ) - - else -> fromJsonListAnyColumns( - records = listOf(parsed), - keyValuePaths = keyValuePaths, - ) - } - } - } - return df.unwrapUnnamedColumns() -} - -private fun DataFrame.unwrapUnnamedColumns() = - dataFrameOf(columns().map { it.unwrapUnnamedColumn() }) - -private fun AnyCol.unwrapUnnamedColumn() = if (this is UnnamedColumn) col else this - -private enum class AnyColType { - ANY, - ARRAYS, - OBJECTS, -} - -internal interface AnyKeyValueProperty : KeyValueProperty { - override val value: Any? 
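With arrayColumnName and valueColumnName now public, here is a short sketch of where those names surface under the default ARRAY_AND_VALUE_COLUMNS tactic, using the same kind of clash as the merge comment in readJson.kt above. The nested schema in the comments is approximate, and the same caveat about the parameter names of DataFrame.readJsonStr applies as in the earlier sketch.

import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.schema
import org.jetbrains.kotlinx.dataframe.io.arrayColumnName
import org.jetbrains.kotlinx.dataframe.io.readJsonStr
import org.jetbrains.kotlinx.dataframe.io.valueColumnName

fun main() {
    // "a" is a primitive in one record, an array in another, and an object in the third.
    val json = """[{ "a": 123 }, { "a": ["abc"] }, { "a": { "b": 1, "c": 2 } }]"""

    // Default tactic: "a" becomes a column group whose sub-columns are the merged object fields
    // plus the special "value" and "array" columns for the primitive and array occurrences.
    val df = DataFrame.readJsonStr(json)
    println(df.schema())
    // a:
    //     value: Int?
    //     array: List<String>
    //     b: Int?
    //     c: Int?

    println(listOf(valueColumnName, arrayColumnName)) // [value, array]
}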
+public fun AnyFrame.toJson(prettyPrint: Boolean = false, canonical: Boolean = false): String { + return json { + encodeFrame(this@toJson) + }.toJsonString(prettyPrint, canonical) } /** - * Json to DataFrame converter that creates [Any] columns. - * A.k.a. [TypeClashTactic.ANY_COLUMNS]. + * Converts the DataFrame to a JSON string representation with additional metadata about serialized data. + * It is heavily used to implement some integration features in the Kotlin Notebook IntelliJ IDEA plugin. * - * @param records List of json elements to be converted to a [DataFrame]. - * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[KeyValueProperty]> - * will be created. - * @param header Optional list of column names. If given, [records] will be read like an object with [header] being the keys. - * @return [DataFrame] from the given [records]. - */ -internal fun fromJsonListAnyColumns( - records: List<*>, - keyValuePaths: List = emptyList(), - header: List = emptyList(), - jsonPath: JsonPath = JsonPath(), -): AnyFrame { - var hasPrimitive = false - var hasArray = false - var hasObject = false - - // list element type can be JsonObject, JsonArray or primitive - val nameGenerator = ColumnNameGenerator() - records.forEach { - when (it) { - is JsonObject -> { - hasObject = true - it.entries.forEach { - nameGenerator.addIfAbsent(it.key) - } - } - - is JsonArray<*> -> hasArray = true - null -> Unit - else -> hasPrimitive = true - } - } - - val colType = when { - hasArray && !hasPrimitive && !hasObject -> AnyColType.ARRAYS - hasObject && !hasPrimitive && !hasArray -> AnyColType.OBJECTS - else -> AnyColType.ANY - } - val justPrimitives = hasPrimitive && !hasArray && !hasObject - val isKeyValue = keyValuePaths.any { jsonPath.matches(it) } - - if (isKeyValue && colType != AnyColType.OBJECTS) { - error("Key value path $jsonPath does not match objects.") - } - - @Suppress("KotlinConstantConditions") - val columns: List = when { - // Create one column of type Any? (or guessed primitive type) from all the records - colType == AnyColType.ANY -> { - val collector: DataCollectorBase = - if (justPrimitives) createDataCollector(records.size) // guess the type - else createDataCollector(records.size, typeOf()) // use Any?
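To contrast the plain toJson above with the metadata-aware variant introduced in this hunk: toJson emits only the rows, using the encodeRow/encodeValue helpers from writeJson.kt, so lists become JSON arrays, NaN is written as the string "NaN", and other non-primitive values fall back to toString(). A minimal sketch follows; the exact whitespace of Klaxon's output may differ from the comments.

import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.io.toJson

fun main() {
    val df = dataFrameOf("name", "score")(
        "a", 1.0,
        "b", Double.NaN,
    )

    println(df.toJson())
    // Roughly: [{"name":"a","score":1.0},{"name":"b","score":"NaN"}]; NaN is stringified by encodeValue.

    println(df.toJson(prettyPrint = true)) // same content, indented
}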
- - val nanIndices = mutableListOf() - records.forEachIndexed { i, v -> - when (v) { - is JsonObject -> { - val parsed = - fromJsonListAnyColumns( - records = listOf(v), - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.replaceLastWildcardWithIndex(i), - ) - collector.add( - if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.first() - else parsed.firstOrNull() ?: DataRow.empty - ) - } - - is JsonArray<*> -> { - val parsed = fromJsonListAnyColumns( - records = v, - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.replaceLastWildcardWithIndex(i).appendArrayWithWildcard(), - ) - collector.add( - if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.asList() - else parsed.unwrapUnnamedColumns() - ) - } - - "NaN" -> { - nanIndices.add(i) - collector.add(null) - } - - else -> collector.add(v) - } - } - val column = collector.toColumn(valueColumnName) - val res = if (nanIndices.isNotEmpty()) { - fun DataColumn.updateNaNs(nanValue: C): DataColumn { - var j = 0 - var nextNanIndex = nanIndices[j] - return mapIndexed(column.type) { i, v -> - if (i == nextNanIndex) { - j++ - nextNanIndex = if (j < nanIndices.size) nanIndices[j] else -1 - nanValue - } else v - } - } - when (column.typeClass) { - Double::class -> column.cast().updateNaNs(Double.NaN) - Float::class -> column.cast().updateNaNs(Float.NaN) - String::class -> column.cast().updateNaNs("NaN") - else -> column - } - } else column - listOf(UnnamedColumn(res)) - } - - // Create one column of type FrameColumn, or List<> from all the records if they are all arrays - colType == AnyColType.ARRAYS -> { - val values = mutableListOf() - val startIndices = ArrayList() - records.forEach { - startIndices.add(values.size) - when (it) { - is JsonArray<*> -> values.addAll(it.value) - null -> Unit - else -> error("Expected JsonArray, got $it") - } - } - val parsed = fromJsonListAnyColumns( - records = values, - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.appendArrayWithWildcard(), - ) - - val res = when { - parsed.isSingleUnnamedColumn() -> { - val col = (parsed.getColumn(0) as UnnamedColumn).col - val elementType = col.type - val values = col.values.asList().splitByIndices(startIndices.asSequence()).toList() - DataColumn.createValueColumn( - name = arrayColumnName, - values = values, - type = List::class.createType(listOf(KTypeProjection.invariant(elementType))), - ) - } - - else -> DataColumn.createFrameColumn( - name = arrayColumnName, // will be erased - df = parsed.unwrapUnnamedColumns(), - startIndices = startIndices, - ) - } - listOf(UnnamedColumn(res)) - } - - // Create one column of type FrameColumn - colType == AnyColType.OBJECTS && isKeyValue -> { - // collect the value types to make sure Value columns with lists and other values aren't all turned into lists - val valueTypes = mutableSetOf() - val dataFrames = records.map { - when (it) { - is JsonObject -> { - val map = it.map.mapValues { (key, value) -> - val parsed = fromJsonListAnyColumns( - records = listOf(value), - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.append(key), - ) - if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.first() - else parsed.unwrapUnnamedColumns().firstOrNull() - } - val valueType = map.values.map { - guessValueType(sequenceOf(it)) - }.commonType() - - valueTypes += valueType - - dataFrameOf( - columnOf(*map.keys.toTypedArray()).named(KeyValueProperty<*>::key.name), - createColumn(values = map.values, suggestedType = valueType, guessType = 
false) - .named(KeyValueProperty<*>::value.name), - ) - } - - null -> DataFrame.emptyOf() - else -> error("Expected JsonObject, got $it") - } - } - - val valueColumns = dataFrames.map { it[KeyValueProperty<*>::value.name] } - val valueColumnSchema = when { - // in these cases we can safely combine the columns to get a single column schema - valueColumns.all { it is ColumnGroup<*> } || valueColumns.all { it is FrameColumn<*> } -> - valueColumns.concat().extractSchema() - // to avoid listification, we create the value columns schema ourselves (https://github.com/Kotlin/dataframe/issues/184) - else -> ColumnSchema.Value(valueTypes.commonType()) - } - - listOf( - UnnamedColumn( - DataColumn.createFrameColumn( - name = valueColumnName, // will be erased unless at top-level - groups = dataFrames, - schema = lazy { - DataFrameSchemaImpl( - columns = mapOf( - KeyValueProperty<*>::key.name to ColumnSchema.Value(typeOf()), - KeyValueProperty<*>::value.name to valueColumnSchema, - ) - ) - }, - ) - ) - ) - } - - // Create multiple columns from all the records if they are all objects, merging the objects in essence - colType == AnyColType.OBJECTS && !isKeyValue -> { - nameGenerator.names.map { colName -> - val values = ArrayList(records.size) - - records.forEach { - when (it) { - is JsonObject -> values.add(it[colName]) - null -> values.add(null) - else -> error("Expected JsonObject, got $it") - } - } - - val parsed = fromJsonListAnyColumns( - records = values, - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.append(colName), - ) - when { - parsed.ncol == 0 -> - DataColumn.createValueColumn( - name = colName, - values = arrayOfNulls(values.size).toList(), - type = typeOf(), - ) - - parsed.isSingleUnnamedColumn() -> - (parsed.getColumn(0) as UnnamedColumn).col.rename(colName) - - else -> - DataColumn.createColumnGroup(colName, parsed.unwrapUnnamedColumns()) as AnyCol - } - } - } - - else -> error("") - } - - return when { - columns.isEmpty() -> DataFrame.empty(records.size) - - columns.size == 1 && hasArray && header.isNotEmpty() && columns[0].typeClass == List::class -> - columns[0] - .cast>() - .splitInto(*header.toTypedArray()) - - else -> columns.toDataFrame() - } -} - -public const val arrayColumnName: String = "array" -public const val valueColumnName: String = "value" - -private fun AnyFrame.isSingleUnnamedColumn() = ncol == 1 && getColumn(0) is UnnamedColumn - -/** - * Json to DataFrame converter that creates allows creates `value` and `array` accessors - * instead of [Any] columns. - * A.k.a. [TypeClashTactic.ARRAY_AND_VALUE_COLUMNS]. + * @param rowLimit The maximum number of top-level dataframe rows to include in the output JSON. + * @param nestedRowLimit The maximum number of nested frame rows to include in the output JSON. + * If null, all rows are included. + * Applied for each frame column recursively + * @param prettyPrint Specifies whether the output JSON should be formatted with indentation and line breaks. + * @param canonical Specifies whether the output JSON should be in a canonical form. * - * @param records List of json elements to be converted to a [DataFrame]. - * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[KeyValueProperty]> - * will be created. - * @param header Optional list of column names. If given, [records] will be read like an object with [header] being the keys. - * @return [DataFrame] from the given [records]. + * @return The DataFrame converted to a JSON string with metadata. 
*/ -internal fun fromJsonListArrayAndValueColumns( - records: List<*>, - keyValuePaths: List = emptyList(), - header: List = emptyList(), - jsonPath: JsonPath = JsonPath(), -): AnyFrame { - var hasPrimitive = false - var hasArray = false - val isKeyValue = keyValuePaths.any { jsonPath.matches(it) } - - // list element type can be JsonObject, JsonArray or primitive - // So first, we gather all properties of objects to merge including "array" and "value" if needed - // so the resulting type of a property with instances 123, ["abc"], and { "a": 1, "b": 2 } will be - // { array: List, value: Int?, a: Int?, b: Int? } - // and instances will look like - // { "array": [], "value": 123, "a": null, "b": null } - - val nameGenerator = ColumnNameGenerator() - records.forEach { - when (it) { - is JsonObject -> it.entries.forEach { - nameGenerator.addIfAbsent(it.key) - } - - is JsonArray<*> -> hasArray = true - null -> Unit - else -> hasPrimitive = true - } - } - if (records.all { it == null }) hasPrimitive = true - - // Add a value column to the collected names if needed - val valueColumn = if (hasPrimitive || records.isEmpty()) { - nameGenerator.addUnique(valueColumnName) - } else null - - // Add an array column to the collected names if needed - val arrayColumn = if (hasArray) { - nameGenerator.addUnique(arrayColumnName) - } else null - - // only properties that consist of just objects (or are empty) can be merged to key/value FrameColumns - if (isKeyValue && (hasPrimitive || hasArray)) { - error("Key value path $jsonPath does not match objects.") - } - - // Create columns from the collected names - val columns: List = when { - // instead of using the names, generate a single key/value frame column - isKeyValue -> { - val dataFrames = records.map { - when (it) { - is JsonObject -> { - val map = it.map.mapValues { (key, value) -> - val parsed = fromJsonListArrayAndValueColumns( - records = listOf(value), - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.append(key), - ) - if (parsed.isSingleUnnamedColumn()) (parsed.getColumn(0) as UnnamedColumn).col.values.first() - else parsed.unwrapUnnamedColumns().firstOrNull() - } - val valueType = - map.values.map { guessValueType(sequenceOf(it)) } - .commonType() - - dataFrameOf( - columnOf(*map.keys.toTypedArray()).named(KeyValueProperty<*>::key.name), - createColumn( - values = map.values, - suggestedType = valueType, - guessType = false, - ).named(KeyValueProperty<*>::value.name), - ) - } - - null -> DataFrame.emptyOf() - else -> error("Expected JsonObject, got $it") - } - } - - listOf( - UnnamedColumn( - DataColumn.createFrameColumn( - name = valueColumnName, // will be erased unless at top-level - groups = dataFrames, - schema = lazy { - dataFrames.mapNotNull { it.takeIf { it.nrow > 0 }?.schema() }.intersectSchemas() - }, - ) - ) - ) - } - - // generate columns using the collected names - else -> - nameGenerator.names.map { colName -> - when { - // Collect primitive values from records into the `value` column if needed - colName == valueColumn && (hasPrimitive || records.isEmpty()) -> { - val collector = createDataCollector(records.size) - val nanIndices = mutableListOf() - records.forEachIndexed { i, v -> - when (v) { - is JsonObject -> collector.add(null) - is JsonArray<*> -> collector.add(null) - "NaN" -> { - nanIndices.add(i) - collector.add(null) - } - - else -> collector.add(v) - } - } - val column = collector.toColumn(colName) - val res = if (nanIndices.isNotEmpty()) { - fun DataColumn.updateNaNs(nanValue: C): DataColumn { - var j = 0 - var 
nextNanIndex = nanIndices[j] - return mapIndexed(column.type) { i, v -> - if (i == nextNanIndex) { - j++ - nextNanIndex = if (j < nanIndices.size) nanIndices[j] else -1 - nanValue - } else v - } - } - when (column.typeClass) { - Double::class -> column.cast().updateNaNs(Double.NaN) - Float::class -> column.cast().updateNaNs(Float.NaN) - String::class -> column.cast().updateNaNs("NaN") - else -> column - } - } else column - UnnamedColumn(res) - } - - // Collect arrays from records into the `array` column if needed - colName == arrayColumn && hasArray -> { - val values = mutableListOf() - val startIndices = ArrayList() - records.forEach { - startIndices.add(values.size) - if (it is JsonArray<*>) values.addAll(it.value) - } - val parsed = fromJsonListArrayAndValueColumns( - records = values, - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.appendArrayWithWildcard(), - ) - - val res = when { - parsed.isSingleUnnamedColumn() -> { - val col = (parsed.getColumn(0) as UnnamedColumn).col - val elementType = col.type - val values = col.values.asList().splitByIndices(startIndices.asSequence()).toList() - DataColumn.createValueColumn( - name = colName, - values = values, - type = List::class.createType(listOf(KTypeProjection.invariant(elementType))), - ) - } - - else -> DataColumn.createFrameColumn(colName, parsed.unwrapUnnamedColumns(), startIndices) - } - UnnamedColumn(res) - } - - // Collect the current column name as property from the objects in records - else -> { - val values = ArrayList(records.size) - records.forEach { - when (it) { - is JsonObject -> values.add(it[colName]) - else -> values.add(null) - } - } - - val parsed = fromJsonListArrayAndValueColumns( - records = values, - keyValuePaths = keyValuePaths, - jsonPath = jsonPath.append(colName), - ) - when { - parsed.ncol == 0 -> - DataColumn.createValueColumn( - name = colName, - values = arrayOfNulls(values.size).toList(), - type = typeOf(), - ) - - parsed.isSingleUnnamedColumn() -> - (parsed.getColumn(0) as UnnamedColumn).col.rename(colName) - - else -> - DataColumn.createColumnGroup(colName, parsed.unwrapUnnamedColumns()) as AnyCol - } - } - } - } - } - - return when { - columns.isEmpty() -> - DataFrame.empty(records.size) - - columns.size == 1 && hasArray && header.isNotEmpty() && columns[0].typeClass == List::class -> - columns[0] - .cast>() - .splitInto(*header.toTypedArray()) - - else -> - columns.toDataFrame() - } -} - -// we need it to check if AnyFrame created by recursive call has single unnamed column, -// unnamed column means this column is not created from field of a record [{"value": 1}, {"value": 2}], -// but filtered values [1, { ... }, []] -> [1, null, null] -// or arrays: [1, { ...}, []] -> [null, null, []] -private class UnnamedColumn(val col: DataColumn) : DataColumn by col - -private val valueTypes = - setOf(Boolean::class, Double::class, Int::class, Float::class, Long::class, Short::class, Byte::class) - -internal fun KlaxonJson.encodeRow(frame: ColumnsContainer<*>, index: Int): JsonObject? 
{ - val values = frame.columns().map { col -> - when { - col is ColumnGroup<*> -> encodeRow(col, index) - col is FrameColumn<*> -> encodeFrame(col[index]) - col.isList() -> { - col[index]?.let { array(it as List<*>) } ?: array() - } - - col.typeClass in valueTypes -> { - val v = col[index] - if ((v is Double && v.isNaN()) || (v is Float && v.isNaN())) { - v.toString() - } else v - } - - else -> col[index]?.toString() - }.let { col.name to it } - } - if (values.isEmpty()) return null - return obj(values) -} - -internal fun KlaxonJson.encodeFrame(frame: AnyFrame): JsonArray<*> { - val allColumns = frame.columns() - - // if there is only 1 column, then `isValidValueColumn` always true. - // But at the same time, we shouldn't treat dataFrameOf("value")(1,2,3) like unnamed column - // because it was created by user. - val isPossibleToFindUnnamedColumns = allColumns.size != 1 - val valueColumn = allColumns.filter { it.name.startsWith(valueColumnName) } - .takeIf { isPossibleToFindUnnamedColumns } - ?.maxByOrNull { it.name }?.let { valueCol -> - if (valueCol.kind() != ColumnKind.Value) { // check that value in this column is not null only when other values are null - null - } else { - // check that value in this column is not null only when other values are null - val isValidValueColumn = frame.rows().all { row -> - if (valueCol[row] != null) { - allColumns.all { col -> - if (col.name != valueCol.name) col[row] == null - else true - } - } else true - } - if (isValidValueColumn) valueCol - else null - } - } - - val arrayColumn = frame.columns().filter { it.name.startsWith(arrayColumnName) } - .takeIf { isPossibleToFindUnnamedColumns } - ?.maxByOrNull { it.name }?.let { arrayCol -> - if (arrayCol.kind() == ColumnKind.Group) null - else { - // check that value in this column is not null only when other values are null - val isValidArrayColumn = frame.rows().all { row -> - if (arrayCol[row] != null) { - allColumns.all { col -> - if (col.name != arrayCol.name) col[row] == null - else true - } - } else true - } - if (isValidArrayColumn) arrayCol - else null - } - } - - val arraysAreFrames = arrayColumn?.kind() == ColumnKind.Frame - - val data = frame.indices().map { rowIndex -> - valueColumn?.get(rowIndex) ?: arrayColumn?.get(rowIndex) - ?.let { if (arraysAreFrames) encodeFrame(it as AnyFrame) else null } ?: encodeRow(frame, rowIndex) - } - return array(data) -} - -public fun AnyFrame.toJson(prettyPrint: Boolean = false, canonical: Boolean = false): String { +public fun AnyFrame.toJsonWithMetadata( + rowLimit: Int, + nestedRowLimit: Int? 
= null, + prettyPrint: Boolean = false, + canonical: Boolean = false +): String { return json { - encodeFrame(this@toJson) + encodeDataFrameWithMetadata(this@toJsonWithMetadata, rowLimit, nestedRowLimit) }.toJsonString(prettyPrint, canonical) } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt index 1486448cd1..536470fa84 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt @@ -1,12 +1,12 @@ package org.jetbrains.kotlinx.dataframe.jupyter import com.beust.klaxon.json -import org.jetbrains.kotlinx.dataframe.api.rows -import org.jetbrains.kotlinx.dataframe.api.toDataFrame +import org.jetbrains.kotlinx.dataframe.api.take +import org.jetbrains.kotlinx.dataframe.impl.io.encodeFrame import org.jetbrains.kotlinx.dataframe.io.DataFrameHtmlData import org.jetbrains.kotlinx.dataframe.io.DisplayConfiguration -import org.jetbrains.kotlinx.dataframe.io.encodeFrame import org.jetbrains.kotlinx.dataframe.io.toHTML +import org.jetbrains.kotlinx.dataframe.io.toJsonWithMetadata import org.jetbrains.kotlinx.dataframe.io.toStaticHtml import org.jetbrains.kotlinx.dataframe.jupyter.KotlinNotebookPluginUtils.convertToDataFrame import org.jetbrains.kotlinx.dataframe.nrow @@ -22,6 +22,7 @@ import org.jetbrains.kotlinx.jupyter.api.renderHtmlAsIFrameIfNeeded /** Starting from this version, dataframe integration will respond with additional data for rendering in Kotlin Notebooks plugin. */ private const val MIN_KERNEL_VERSION_FOR_NEW_TABLES_UI = "0.11.0.311" +private const val MIN_IDE_VERSION_SUPPORT_JSON_WITH_METADATA = 241 internal class JupyterHtmlRenderer( val display: DisplayConfiguration, @@ -60,21 +61,32 @@ internal inline fun JupyterHtmlRenderer.render( val staticHtml = df.toStaticHtml(reifiedDisplayConfiguration, DefaultCellRenderer).toJupyterHtmlData() if (notebook.kernelVersion >= KotlinKernelVersion.from(MIN_KERNEL_VERSION_FOR_NEW_TABLES_UI)!!) 
{ - val jsonEncodedDf = json { - obj( - "nrow" to df.size.nrow, - "ncol" to df.size.ncol, - "columns" to df.columnNames(), - "kotlin_dataframe" to encodeFrame(df.rows().take(limit).toDataFrame()), - ) - }.toJsonString() + val ideBuildNumber = KotlinNotebookPluginUtils.getKotlinNotebookIDEBuildNumber() + + val jsonEncodedDf = + if (ideBuildNumber == null || ideBuildNumber.majorVersion < MIN_IDE_VERSION_SUPPORT_JSON_WITH_METADATA) { + json { + obj( + "nrow" to df.size.nrow, + "ncol" to df.size.ncol, + "columns" to df.columnNames(), + "kotlin_dataframe" to encodeFrame(df.take(limit)), + ) + }.toJsonString() + } else { + df.toJsonWithMetadata(limit, reifiedDisplayConfiguration.rowsLimit) + } notebook.renderAsIFrameAsNeeded(html, staticHtml, jsonEncodedDf) } else { notebook.renderHtmlAsIFrameIfNeeded(html) } } -internal fun Notebook.renderAsIFrameAsNeeded(data: HtmlData, staticData: HtmlData, jsonEncodedDf: String): MimeTypedResult { +internal fun Notebook.renderAsIFrameAsNeeded( + data: HtmlData, + staticData: HtmlData, + jsonEncodedDf: String +): MimeTypedResult { val textHtml = if (jupyterClientType == JupyterClientType.KOTLIN_NOTEBOOK) { data.generateIframePlaneText(currentColorScheme) + staticData.toString(currentColorScheme) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/KotlinNotebookPluginUtils.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/KotlinNotebookPluginUtils.kt index 0d80306d21..b0d8d28f5e 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/KotlinNotebookPluginUtils.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/KotlinNotebookPluginUtils.kt @@ -41,6 +41,8 @@ import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator * DISPLAY(KotlinNotebooksPluginUtils.getRowsSubsetForRendering(Out[...], 0, 20), "") */ public object KotlinNotebookPluginUtils { + private const val KTNB_IDE_BUILD_PROP = "KTNB_IDE_BUILD_NUMBER" + /** * Returns a subset of rows from the given dataframe for rendering. * It's used for example for dynamic pagination in Kotlin Notebook Plugin. @@ -166,4 +168,36 @@ public object KotlinNotebookPluginUtils { usedNames: List<String> = emptyList() ): String = ColumnNameGenerator(usedNames).addUnique(preferredName) + + /** + * Retrieves the build number of the Kotlin Notebook IDE. + * + * @return The build number of the Kotlin Notebook IDE as an instance of [IdeBuildNumber], + * or null if the build number is not available. + */ + public fun getKotlinNotebookIDEBuildNumber(): IdeBuildNumber? { + val value = System.getProperty(KTNB_IDE_BUILD_PROP, null) ?: return null + return IdeBuildNumber.fromString(value) + } + + public data class IdeBuildNumber(val ideName: String, val majorVersion: Int, val buildId: Int) { + public companion object { + public fun fromString(buildNumber: String): IdeBuildNumber? { + val parts = buildNumber.split(";") + return if (parts.size >= 3) constructIdeBuildNumber(parts) else null + } + + private fun constructIdeBuildNumber(parts: List<String>): IdeBuildNumber?
{ + val ideName = parts[0] + val majorVersion = parts[1].toIntOrNull() + val buildId = parts[2].toIntOrNull() + + return if (majorVersion != null && buildId != null) { + IdeBuildNumber(ideName, majorVersion, buildId) + } else { + null + } + } + } + } } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt index f02d0060f3..a9328a214b 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt @@ -1,5 +1,8 @@ package org.jetbrains.kotlinx.dataframe.io +import com.beust.klaxon.JsonArray +import com.beust.klaxon.JsonObject +import com.beust.klaxon.Parser import io.kotest.matchers.collections.shouldBeIn import io.kotest.matchers.shouldBe import io.kotest.matchers.string.shouldContain @@ -22,14 +25,26 @@ import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.api.toDouble import org.jetbrains.kotlinx.dataframe.api.toMap import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup +import org.jetbrains.kotlinx.dataframe.columns.ColumnKind import org.jetbrains.kotlinx.dataframe.columns.FrameColumn import org.jetbrains.kotlinx.dataframe.columns.ValueColumn +import org.jetbrains.kotlinx.dataframe.impl.io.SERIALIZATION_VERSION +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.COLUMNS +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.DATA +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KIND +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAME +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.METADATA +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.VERSION import org.jetbrains.kotlinx.dataframe.impl.nothingType -import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.* +import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ANY_COLUMNS +import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ARRAY_AND_VALUE_COLUMNS +import org.jetbrains.kotlinx.dataframe.testJson import org.jetbrains.kotlinx.dataframe.type import org.jetbrains.kotlinx.dataframe.values import org.junit.Test -import kotlin.reflect.* +import kotlin.reflect.typeOf class JsonTests { @@ -951,4 +966,98 @@ class JsonTests { val df = dataFrameOf("a", "b")("1", null, "2", 12) df.toJson(canonical = true) shouldContain "\"b\":null" } + + @Test + @Suppress("UNCHECKED_CAST") + fun `json with metadata flat table`() { + @Language("json") + val data = """ + [{"id":3602279,"node_id":"MDEwOlJlcG9zaXRvcnkzNjAyMjc5","name":"kotlin-web-demo","full_name":"JetBrains/kotlin-web-demo"}] + """.trimIndent() + val df = DataFrame.readJsonStr(data) + val jsonStr = df.toJsonWithMetadata(df.rowsCount()).trimIndent() + val json = parseJsonStr(jsonStr) + + json[VERSION] shouldBe SERIALIZATION_VERSION + + val metadata = (json[METADATA] as JsonObject) + metadata[NROW] shouldBe 1 + metadata[NCOL] shouldBe 4 + val columns = metadata[COLUMNS] as List<String> + columns shouldBe listOf("id", "node_id", "name", "full_name") + + val decodedData = json[KOTLIN_DATAFRAME] as JsonArray<*> + val decodedDf = DataFrame.readJsonStr(decodedData.toJsonString()) + decodedDf shouldBe df + } + + private fun parseJsonStr(jsonStr: String): JsonObject { + val parser = Parser.default() + return
parser.parse(StringBuilder(jsonStr)) as JsonObject + } + + @Test + fun `json with metadata column group`() { + @Language("json") + val data = """ + [{"permissions":{"admin":false,"maintain":false,"push":false,"triage":false,"pull":true}}] + """.trimIndent() + val df = DataFrame.readJsonStr(data) + val jsonStr = df.toJsonWithMetadata(df.rowsCount()).trimIndent() + val json = parseJsonStr(jsonStr) + + val row = (json[KOTLIN_DATAFRAME] as JsonArray<*>)[0] as JsonObject + + val permissions = row["permissions"] as JsonObject + val metadata = permissions[METADATA] as JsonObject + metadata[KIND] shouldBe ColumnKind.Group.toString() + + val decodedData = permissions[DATA] as JsonObject + + decodedData["admin"] shouldBe false + decodedData["maintain"] shouldBe false + decodedData["push"] shouldBe false + decodedData["triage"] shouldBe false + decodedData["pull"] shouldBe true + } + + @Test + fun `json with metadata frame column`() { + val df = DataFrame.readJson(testJson("repositories")) + val jsonStr = df.toJsonWithMetadata(df.rowsCount()).trimIndent() + val json = parseJsonStr(jsonStr) + val row = (json[KOTLIN_DATAFRAME] as JsonArray<*>)[0] as JsonObject + + val contributors = row["contributors"] as JsonObject + + val metadata = contributors[METADATA] as JsonObject + metadata[KIND] shouldBe ColumnKind.Frame.toString() + metadata[NCOL] shouldBe 8 + metadata[NROW] shouldBe 29 + + val decodedData = contributors[DATA] as JsonArray<*> + decodedData.size shouldBe 29 + + val decodedDf = DataFrame.readJsonStr(decodedData.toJsonString()) + decodedDf shouldBe df[0]["contributors"] as AnyFrame + } + + @Test + fun `json with metadata test row limit`() { + val df = DataFrame.readJson(testJson("repositories")) + val nestedFrameRowLimit = 20 + val jsonStr = df.toJsonWithMetadata(df.rowsCount(), nestedFrameRowLimit).trimIndent() + val json = parseJsonStr(jsonStr) + val row = (json[KOTLIN_DATAFRAME] as JsonArray<*>)[0] as JsonObject + + val contributors = row["contributors"] as JsonObject + + val metadata = contributors[METADATA] as JsonObject + metadata[KIND] shouldBe ColumnKind.Frame.toString() + metadata[NCOL] shouldBe 8 + metadata[NROW] shouldBe 29 + + val decodedData = contributors[DATA] as JsonArray<*> + decodedData.size shouldBe nestedFrameRowLimit + } } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt index 8f48073e73..a68f1ede3a 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt @@ -10,8 +10,12 @@ import io.kotest.matchers.shouldBe import io.kotest.matchers.string.shouldContain import io.kotest.matchers.string.shouldNotContain import org.intellij.lang.annotations.Language +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.DATA +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAME +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.METADATA import org.jetbrains.kotlinx.jupyter.api.MimeTypedResult import org.jetbrains.kotlinx.jupyter.testkit.JupyterReplTestCase +import org.junit.BeforeClass import org.junit.Test class RenderingTests : JupyterReplTestCase() { @@ -94,7 +98,7 @@ class RenderingTests : JupyterReplTestCase() { assertDataFrameDimensions(json, 30, 1) - val rows = json.array<JsonObject>("kotlin_dataframe")!! + val rows = json.array<JsonObject>(KOTLIN_DATAFRAME)!!
rows.getObj(0).int("id") shouldBe 21 rows.getObj(rows.lastIndex).int("id") shouldBe 50 } @@ -111,8 +115,8 @@ } private fun assertDataFrameDimensions(json: JsonObject, expectedRows: Int, expectedColumns: Int) { - json.int("nrow") shouldBe expectedRows - json.int("ncol") shouldBe expectedColumns + json.obj(METADATA)!!.int("nrow") shouldBe expectedRows + json.obj(METADATA)!!.int("ncol") shouldBe expectedColumns } private fun parseDataframeJson(result: MimeTypedResult): JsonObject { @@ -120,7 +124,7 @@ return parser.parse(StringBuilder(result["application/kotlindataframe+json"]!!)) as JsonObject } - private fun JsonArray<*>.getObj(index: Int) = this.get(index) as JsonObject + private fun JsonArray<*>.getObj(index: Int) = this[index] as JsonObject @Test fun `test kotlin notebook plugin utils sort by one column asc`() { @@ -138,7 +142,7 @@ @Suppress("UNCHECKED_CAST") private fun assertSortedById(json: JsonObject, desc: Boolean) { - val rows = json["kotlin_dataframe"] as JsonArray<JsonObject> + val rows = json[KOTLIN_DATAFRAME] as JsonArray<JsonObject> var previousId = if (desc) 101 else 0 rows.forEach { row -> val currentId = row.int("id")!! @@ -177,7 +181,7 @@ assertDataFrameDimensions(json, 100, 2) - val rows = json["kotlin_dataframe"] as JsonArray<JsonObject> + val rows = json[KOTLIN_DATAFRAME] as JsonArray<JsonObject> assertSortedByCategory(rows) assertSortedById(rows) } @@ -213,16 +217,16 @@ val json = executeScriptAndParseDataframeResult( """ data class Row(val id: Int, val group: Int) - val df = (1..100).map { Row(it, if (it <= 50) 1 else 2) }.toDataFrame() + val df = (1..20).map { Row(it, if (it <= 10) 1 else 2) }.toDataFrame() KotlinNotebookPluginUtils.convertToDataFrame(df.groupBy("group")) """.trimIndent() ) assertDataFrameDimensions(json, 2, 2) - val rows = json.array<JsonObject>("kotlin_dataframe")!! - rows.getObj(0).array("group1")!!.size shouldBe 50 - rows.getObj(1).array("group1")!!.size shouldBe 50 + val rows = json.array<JsonObject>(KOTLIN_DATAFRAME)!! + (rows.getObj(0).obj("group1")!![DATA] as JsonArray<*>).size shouldBe 10 + (rows.getObj(1).obj("group1")!![DATA] as JsonArray<*>).size shouldBe 10 } // Regression KTNB-424 @@ -240,4 +244,15 @@ assertDataFrameDimensions(json, 2, 2) } } + + companion object { + /** + * Set the system property for the IDE version needed for specific serialization testing purposes.
+ */ + @BeforeClass + @JvmStatic + internal fun setupOnce() { + System.setProperty("KTNB_IDE_BUILD_NUMBER", "IU;241;14015") + } + } } diff --git a/core/src/test/resources/repositories.json b/core/src/test/resources/repositories.json new file mode 100644 index 0000000000..078eb98a38 --- /dev/null +++ b/core/src/test/resources/repositories.json @@ -0,0 +1 @@ +[{"contributors":[{"login":"satamas","id":5521317,"node_id":"MDQ6VXNlcjU1MjEzMTc=","gravatar_id":"","url":"https://api.github.com/users/satamas","type":"User","site_admin":false,"contributions":998},{"login":"NataliaUkhorskaya","id":968879,"node_id":"MDQ6VXNlcjk2ODg3OQ==","gravatar_id":"","url":"https://api.github.com/users/NataliaUkhorskaya","type":"User","site_admin":false,"contributions":371},{"login":"AlexanderPrendota","id":10503748,"node_id":"MDQ6VXNlcjEwNTAzNzQ4","gravatar_id":"","url":"https://api.github.com/users/AlexanderPrendota","type":"User","site_admin":false,"contributions":190},{"login":"svtk","id":1447386,"node_id":"MDQ6VXNlcjE0NDczODY=","gravatar_id":"","url":"https://api.github.com/users/svtk","type":"User","site_admin":false,"contributions":53},{"login":"zarechenskiy","id":3757088,"node_id":"MDQ6VXNlcjM3NTcwODg=","gravatar_id":"","url":"https://api.github.com/users/zarechenskiy","type":"User","site_admin":false,"contributions":18},{"login":"abreslav","id":888318,"node_id":"MDQ6VXNlcjg4ODMxOA==","gravatar_id":"","url":"https://api.github.com/users/abreslav","type":"User","site_admin":false,"contributions":13},{"login":"yole","id":46553,"node_id":"MDQ6VXNlcjQ2NTUz","gravatar_id":"","url":"https://api.github.com/users/yole","type":"User","site_admin":false,"contributions":11},{"login":"zoobestik","id":242514,"node_id":"MDQ6VXNlcjI0MjUxNA==","gravatar_id":"","url":"https://api.github.com/users/zoobestik","type":"User","site_admin":false,"contributions":5},{"login":"ilya-g","id":4257577,"node_id":"MDQ6VXNlcjQyNTc1Nzc=","gravatar_id":"","url":"https://api.github.com/users/ilya-g","type":"User","site_admin":false,"contributions":5},{"login":"pTalanov","id":442640,"node_id":"MDQ6VXNlcjQ0MjY0MA==","gravatar_id":"","url":"https://api.github.com/users/pTalanov","type":"User","site_admin":false,"contributions":4},{"login":"bashor","id":485321,"node_id":"MDQ6VXNlcjQ4NTMyMQ==","gravatar_id":"","url":"https://api.github.com/users/bashor","type":"User","site_admin":false,"contributions":3},{"login":"nikpachoo","id":3338311,"node_id":"MDQ6VXNlcjMzMzgzMTE=","gravatar_id":"","url":"https://api.github.com/users/nikpachoo","type":"User","site_admin":false,"contributions":3},{"login":"udalov","id":292714,"node_id":"MDQ6VXNlcjI5MjcxNA==","gravatar_id":"","url":"https://api.github.com/users/udalov","type":"User","site_admin":false,"contributions":2},{"login":"anton-bannykh","id":1115872,"node_id":"MDQ6VXNlcjExMTU4NzI=","gravatar_id":"","url":"https://api.github.com/users/anton-bannykh","type":"User","site_admin":false,"contributions":2},{"login":"rayshade","id":5259872,"node_id":"MDQ6VXNlcjUyNTk4NzI=","gravatar_id":"","url":"https://api.github.com/users/rayshade","type":"User","site_admin":false,"contributions":2},{"login":"yu-ishicawa","id":843678,"node_id":"MDQ6VXNlcjg0MzY3OA==","gravatar_id":"","url":"https://api.github.com/users/yu-ishicawa","type":"User","site_admin":false,"contributions":2},{"login":"gildor","id":186017,"node_id":"MDQ6VXNlcjE4NjAxNw==","gravatar_id":"","url":"https://api.github.com/users/gildor","type":"User","site_admin":false,"contributions":1},{"login":"AndreOnCrypto","id":3066457,"node_id":"MDQ6VXNlcjMwNjY0NTc=","g
ravatar_id":"","url":"https://api.github.com/users/AndreOnCrypto","type":"User","site_admin":false,"contributions":1},{"login":"DipanshKhandelwal","id":24923974,"node_id":"MDQ6VXNlcjI0OTIzOTc0","gravatar_id":"","url":"https://api.github.com/users/DipanshKhandelwal","type":"User","site_admin":false,"contributions":1},{"login":"dsavvinov","id":6999635,"node_id":"MDQ6VXNlcjY5OTk2MzU=","gravatar_id":"","url":"https://api.github.com/users/dsavvinov","type":"User","site_admin":false,"contributions":1},{"login":"Noia","id":397736,"node_id":"MDQ6VXNlcjM5NzczNg==","gravatar_id":"","url":"https://api.github.com/users/Noia","type":"User","site_admin":false,"contributions":1},{"login":"gzoritchak","id":1110254,"node_id":"MDQ6VXNlcjExMTAyNTQ=","gravatar_id":"","url":"https://api.github.com/users/gzoritchak","type":"User","site_admin":false,"contributions":1},{"login":"Harmitage","id":44910736,"node_id":"MDQ6VXNlcjQ0OTEwNzM2","gravatar_id":"","url":"https://api.github.com/users/Harmitage","type":"User","site_admin":false,"contributions":1},{"login":"JLLeitschuh","id":1323708,"node_id":"MDQ6VXNlcjEzMjM3MDg=","gravatar_id":"","url":"https://api.github.com/users/JLLeitschuh","type":"User","site_admin":false,"contributions":1},{"login":"dalinaum","id":145585,"node_id":"MDQ6VXNlcjE0NTU4NQ==","gravatar_id":"","url":"https://api.github.com/users/dalinaum","type":"User","site_admin":false,"contributions":1},{"login":"robstoll","id":5557885,"node_id":"MDQ6VXNlcjU1NTc4ODU=","gravatar_id":"","url":"https://api.github.com/users/robstoll","type":"User","site_admin":false,"contributions":1},{"login":"tginsberg","id":432945,"node_id":"MDQ6VXNlcjQzMjk0NQ==","gravatar_id":"","url":"https://api.github.com/users/tginsberg","type":"User","site_admin":false,"contributions":1},{"login":"joeldudleyr3","id":24230167,"node_id":"MDQ6VXNlcjI0MjMwMTY3","gravatar_id":"","url":"https://api.github.com/users/joeldudleyr3","type":"User","site_admin":false,"contributions":1},{"login":"ligi","id":111600,"node_id":"MDQ6VXNlcjExMTYwMA==","gravatar_id":"","url":"https://api.github.com/users/ligi","type":"User","site_admin":false,"contributions":1}]}] diff --git a/docs/serialization_format.md b/docs/serialization_format.md new file mode 100644 index 0000000000..9270aad2c8 --- /dev/null +++ b/docs/serialization_format.md @@ -0,0 +1,49 @@ +## Serialization format for the Kotlin notebooks plugin +This document is an informal specification of the serialization format used for rendering Kotlin dataframes in the Kotlin notebooks plugin of IntelliJ IDEA. + +### Version 2.0.0 +### Top level json structure +```json +{ + "$version": "2.0.0", + "metadata": { + "columns": [ string, ... ], // column names + "nrow": int, + "ncol": int + }, + "kotlin_dataframe": [ Row, ... ] +} +``` +### Row +```json +{ + "": string|Boolean|Double|Int|Float|Long|Short|Byte|list,, + "": string|Boolean|Double|Int|Float|Long|Short|Byte|list,, + ... + "": ColumnGroup, + "": ColumnGroup, + ... + "": NestedFrame, + "": NestedFrame +} +``` +### ColumnGroup +```json +{ + "metadata": { + "kind": "ColumnGroup" + }, + "data": Row +} +``` +### NestedFrame +```json +{ + "metadata": { + "kind": "FrameColumn" + "nrow": int, + "ncol": int + }, + "data": [ Row, ... ] +} +```