Skip to content

DynamicDataFrameBuilder improvements #1082

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions core/api/core.api
Original file line number Diff line number Diff line change
Expand Up @@ -4351,7 +4351,10 @@ public final class org/jetbrains/kotlinx/dataframe/api/DuplicateKt {

public final class org/jetbrains/kotlinx/dataframe/api/DynamicDataFrameBuilder {
public fun <init> ()V
public fun <init> (Z)V
public synthetic fun <init> (ZILkotlin/jvm/internal/DefaultConstructorMarker;)V
public final fun add (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Ljava/lang/String;
public final fun get (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public final fun toDataFrame ()Lorg/jetbrains/kotlinx/dataframe/DataFrame;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -413,29 +413,78 @@ public class DataFrameBuilder(private val header: List<String>) {
}

/**
* Helper class for implementing operations when column names can be potentially duplicated.
* For example, operations involving multiple dataframes, computed columns or parsing some third-party data
* A builder class for dynamically constructing a DataFrame with provided columns.
* Allows adding columns manually while automatically handling duplicate column names by assigning unique names.
*
* @property checkDuplicateValues Whether to check for duplicate columns (with identical names and values)
* when adding new columns. `true` by default.
*/
public class DynamicDataFrameBuilder {
private var cols: MutableList<AnyCol> = mutableListOf()
public class DynamicDataFrameBuilder(private val checkDuplicateValues: Boolean = true) {
private var cols: MutableMap<String, AnyCol> = mutableMapOf()
private val generator = ColumnNameGenerator()

/**
* Adds a column to the builder, ensuring its name is unique.
*
* - If a column with the same name already exists, the new column is renamed to a unique name.
* - If [checkDuplicateValues] is `true`, the method checks whether the new column has identical values
* to an existing column with the same name. If the values match, the column is not added.
*
* @param col The column to add to the DataFrame builder.
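* @return The final unique name assigned to the column, or the column's original name if it was
* recognized as a duplicate of an existing column and therefore not added.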
*/
public fun add(col: AnyCol): String {
val uniqueName = if (col.name().isEmpty()) {
val originalName = col.name()
if (checkDuplicateValues && generator.contains(originalName)) {
if (cols[originalName] == col) return originalName
}
val uniqueName = if (originalName.isEmpty()) {
generator.addUnique(UNNAMED_COLUMN_PREFIX)
} else {
generator.addUnique(col.name())
generator.addUnique(originalName)
}
val renamed = if (uniqueName != col.name()) {
val renamed = if (uniqueName != originalName) {
col.rename(uniqueName)
} else {
col
}
cols.add(renamed)
cols.put(uniqueName, renamed)
return uniqueName
}

public fun toDataFrame(): DataFrame<*> = dataFrameOf(cols)
/**
* Adds a column to the builder from the given iterable of values, ensuring the column's name is unique.
*
* The method automatically converts the given iterable into a column using the specified or default name
* and infers the type of the column's elements.
*
* - If a column with the same name already exists, the new column is renamed to a unique name.
* - If the [checkDuplicateValues] property of the builder is `true`, the method checks whether the new column
* has identical values to an existing column with the same name. If the values match, the column is not added.
*
* @param T The inferred type of the elements in the column.
* @param values The iterable collection of values to be added as a new column.
* @param name The name of the new column. If empty, a unique name will be generated automatically.
* @return The final unique name assigned to the column.
*/
public inline fun <reified T> add(values: Iterable<T>, name: String = ""): String =
add(values.toColumn(name, Infer.Type))

/**
* Retrieves a column from the builder by its name.
*
* @param column The name of the column to retrieve.
* @return The column corresponding to the specified name, or `null` if no such column exists.
*/
public fun get(column: String): AnyCol? = cols[column]

/**
* Converts the current `DynamicDataFrameBuilder` instance into a `DataFrame`.
* The resulting `DataFrame` is constructed from the columns stored in the builder.
*
* @return A `DataFrame` containing the columns defined in the `DynamicDataFrameBuilder`.
*/
public fun toDataFrame(): DataFrame<*> = cols.values.toDataFrame()
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,39 @@ class ConstructorsTests {
@Test
fun `duplicated name`() {
val builder = DynamicDataFrameBuilder()
val column by columnOf(1, 2, 3)
builder.add(column)
builder.add(column)
val columnName = "columnName"
val columnA = columnOf(1, 2, 3) named columnName
val columnB = columnOf(4, 5, 6) named columnName
builder.add(columnA)
builder.add(columnB)
val df = builder.toDataFrame()
df.columnsCount() shouldBe 2
df.columnNames() shouldBe listOf(column.name(), "${column.name()}1")
df.columnNames() shouldBe listOf(columnName, "${columnName}1")
}

@Test
fun `get by new name`() {
val builder = DynamicDataFrameBuilder()
val columnName = "columnName"
val columnA = columnOf(1, 2, 3) named columnName
val columnB = columnOf(4, 5, 6) named columnName
builder.add(columnA)
val newName = builder.add(columnB)
builder.get(newName)!!.values shouldBe columnB.values
}

@Test
fun `duplicated column`() {
val builder = DynamicDataFrameBuilder()
val columnName = "columnName"
val columnA = columnOf(1, 2, 3) named columnName
val columnB = columnOf(4, 5, 6) named columnName
builder.add(columnA)
builder.add(columnB)
builder.add(columnA)
val df = builder.toDataFrame()
df.columnsCount() shouldBe 2
df.columnNames() shouldBe listOf(columnName, "${columnName}1")
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -413,29 +413,79 @@ public class DataFrameBuilder(private val header: List<String>) {
}

/**
* Helper class for implementing operations when column names can be potentially duplicated.
* For example, operations involving multiple dataframes, computed columns or parsing some third-party data
* A builder class for dynamically constructing a DataFrame with provided columns.
* Allows adding columns manually while automatically handling duplicate column names by assigning unique names.
*
* @property checkDuplicateValues Whether to check for duplicate columns (with identical names and values)
* when adding new columns. If `true`, a new column is not added if an identical one is already in the builder.
* `true` by default.
*/
public class DynamicDataFrameBuilder {
private var cols: MutableList<AnyCol> = mutableListOf()
public class DynamicDataFrameBuilder(private val checkDuplicateValues: Boolean = true) {
private var cols: MutableMap<String, AnyCol> = mutableMapOf()
private val generator = ColumnNameGenerator()

/**
* Adds a column to the builder, ensuring its name is unique.
*
* - If a column with the same name already exists, the new column is renamed to a unique name.
* - If [checkDuplicateValues] is `true`, the method checks whether the new column has identical values
* to an existing column with the same name. If the values match, the column is not added.
*
* @param col The column to add to the DataFrame builder.
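* @return The final unique name assigned to the column, or the column's original name if it was
* recognized as a duplicate of an existing column and therefore not added.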
*/
public fun add(col: AnyCol): String {
val uniqueName = if (col.name().isEmpty()) {
val originalName = col.name()
if (checkDuplicateValues && generator.contains(originalName)) {
if (cols[originalName] == col) return originalName
}
val uniqueName = if (originalName.isEmpty()) {
generator.addUnique(UNNAMED_COLUMN_PREFIX)
} else {
generator.addUnique(col.name())
generator.addUnique(originalName)
}
val renamed = if (uniqueName != col.name()) {
val renamed = if (uniqueName != originalName) {
col.rename(uniqueName)
} else {
col
}
cols.add(renamed)
cols.put(uniqueName, renamed)
return uniqueName
}

public fun toDataFrame(): DataFrame<*> = dataFrameOf(cols)
/**
* Adds a column to the builder from the given iterable of values, ensuring the column's name is unique.
*
* The method automatically converts the given iterable into a column using the specified or default name
* and infers the type of the column's elements.
*
* - If a column with the same name already exists, the new column is renamed to a unique name.
* - If the [checkDuplicateValues] property of the builder is `true`, the method checks whether the new column
* has identical values to an existing column with the same name. If the values match, the column is not added.
*
* @param T The inferred type of the elements in the column.
* @param values The iterable collection of values to be added as a new column.
* @param name The name of the new column. If empty, a unique name will be generated automatically.
* @return The final unique name assigned to the column.
*/
public inline fun <reified T> add(values: Iterable<T>, name: String = ""): String =
add(values.toColumn(name, Infer.Type))

/**
* Retrieves a column from the builder by its name.
*
* @param column The name of the column to retrieve.
* @return The column corresponding to the specified name, or `null` if no such column exists.
*/
public fun get(column: String): AnyCol? = cols[column]

/**
* Converts the current [DynamicDataFrameBuilder] instance into a [DataFrame].
* The resulting [DataFrame] is constructed from the columns stored in the builder.
*
* @return A [DataFrame] containing the columns defined in the [DynamicDataFrameBuilder].
*/
public fun toDataFrame(): DataFrame<*> = cols.values.toDataFrame()
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,39 @@ class ConstructorsTests {
@Test
fun `duplicated name`() {
val builder = DynamicDataFrameBuilder()
val column by columnOf(1, 2, 3)
builder.add(column)
builder.add(column)
val columnName = "columnName"
val columnA = columnOf(1, 2, 3) named columnName
val columnB = columnOf(4, 5, 6) named columnName
builder.add(columnA)
builder.add(columnB)
val df = builder.toDataFrame()
df.columnsCount() shouldBe 2
df.columnNames() shouldBe listOf(column.name(), "${column.name()}1")
df.columnNames() shouldBe listOf(columnName, "${columnName}1")
}

@Test
fun `get by new name`() {
val builder = DynamicDataFrameBuilder()
val columnName = "columnName"
val columnA = columnOf(1, 2, 3) named columnName
val columnB = columnOf(4, 5, 6) named columnName
builder.add(columnA)
val newName = builder.add(columnB)
builder.get(newName)!!.values shouldBe columnB.values
}

@Test
fun `duplicated column`() {
val builder = DynamicDataFrameBuilder()
val columnName = "columnName"
val columnA = columnOf(1, 2, 3) named columnName
val columnB = columnOf(4, 5, 6) named columnName
builder.add(columnA)
builder.add(columnB)
builder.add(columnA)
val df = builder.toDataFrame()
df.columnsCount() shouldBe 2
df.columnNames() shouldBe listOf(columnName, "${columnName}1")
}

@Test
Expand Down