Skip to content

Commit 6453fe5

Browse files
committed
[3.2][Kernel][Writes] Add support of inserting data into tables (#3030)
(Split from #2944) Adds support for inserting data into the table. Tests for inserting into partitioned and unpartitioned tables with various combinations of the types, partition values etc. Also tests the checkpoint is ready to create.
1 parent fe5d931 commit 6453fe5

File tree

20 files changed

+1085
-86
lines changed

20 files changed

+1085
-86
lines changed

kernel/kernel-api/src/main/java/io/delta/kernel/Transaction.java

+79-5
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,31 @@
1515
*/
1616
package io.delta.kernel;
1717

18+
import java.net.URI;
1819
import java.util.List;
1920
import java.util.Map;
2021

2122
import io.delta.kernel.annotation.Evolving;
22-
import io.delta.kernel.data.FilteredColumnarBatch;
23-
import io.delta.kernel.data.Row;
23+
import io.delta.kernel.data.*;
2424
import io.delta.kernel.engine.Engine;
2525
import io.delta.kernel.exceptions.ConcurrentWriteException;
2626
import io.delta.kernel.expressions.Literal;
2727
import io.delta.kernel.types.StructType;
2828
import io.delta.kernel.utils.*;
2929

30+
import io.delta.kernel.internal.DataWriteContextImpl;
31+
import io.delta.kernel.internal.actions.AddFile;
32+
import io.delta.kernel.internal.actions.SingleAction;
33+
import io.delta.kernel.internal.fs.Path;
34+
import static io.delta.kernel.internal.DeltaErrors.dataSchemaMismatch;
35+
import static io.delta.kernel.internal.DeltaErrors.partitionColumnMissingInData;
36+
import static io.delta.kernel.internal.TransactionImpl.getStatisticsColumns;
37+
import static io.delta.kernel.internal.data.TransactionStateRow.*;
38+
import static io.delta.kernel.internal.util.PartitionUtils.getTargetDirectory;
39+
import static io.delta.kernel.internal.util.PartitionUtils.validateAndSanitizePartitionValues;
40+
import static io.delta.kernel.internal.util.Preconditions.checkArgument;
41+
import static io.delta.kernel.internal.util.SchemaUtils.findColIndex;
42+
3043
/**
3144
* Represents a transaction to mutate a Delta table.
3245
*
@@ -104,7 +117,41 @@ static CloseableIterator<FilteredColumnarBatch> transformLogicalData(
104117
Row transactionState,
105118
CloseableIterator<FilteredColumnarBatch> dataIter,
106119
Map<String, Literal> partitionValues) {
107-
throw new UnsupportedOperationException("Not implemented yet");
120+
121+
// Note: `partitionValues` are not used as of now in this API, but taking the partition
122+
// values as input forces the connector to not pass data from multiple partitions this
123+
// API in a single call.
124+
StructType tableSchema = getLogicalSchema(engine, transactionState);
125+
List<String> partitionColNames = getPartitionColumnsList(transactionState);
126+
validateAndSanitizePartitionValues(tableSchema, partitionColNames, partitionValues);
127+
128+
// TODO: add support for:
129+
// - enforcing the constraints
130+
// - generating the default value columns
131+
// - generating the generated columns
132+
133+
// Remove the partition columns from the data as they are already part of file metadata
134+
// and are not needed in the data files. TODO: once we start supporting uniform complaint
135+
// tables, we may conditionally skip this step.
136+
137+
// TODO: set the correct schema once writing into column mapping enabled table is supported.
138+
String tablePath = getTablePath(transactionState);
139+
return dataIter.map(
140+
filteredBatch -> {
141+
ColumnarBatch data = filteredBatch.getData();
142+
if (!data.getSchema().equals(tableSchema)) {
143+
throw dataSchemaMismatch(tablePath, tableSchema, data.getSchema());
144+
}
145+
for (String partitionColName : partitionColNames) {
146+
int partitionColIndex = findColIndex(data.getSchema(), partitionColName);
147+
if (partitionColIndex < 0) {
148+
throw partitionColumnMissingInData(tablePath, partitionColName);
149+
}
150+
data = data.withDeletedColumnAt(partitionColIndex);
151+
}
152+
return new FilteredColumnarBatch(data, filteredBatch.getSelectionVector());
153+
}
154+
);
108155
}
109156

110157
/**
@@ -124,7 +171,21 @@ static DataWriteContext getWriteContext(
124171
Engine engine,
125172
Row transactionState,
126173
Map<String, Literal> partitionValues) {
127-
throw new UnsupportedOperationException("Not implemented yet");
174+
StructType tableSchema = getLogicalSchema(engine, transactionState);
175+
List<String> partitionColNames = getPartitionColumnsList(transactionState);
176+
177+
partitionValues =
178+
validateAndSanitizePartitionValues(tableSchema, partitionColNames, partitionValues);
179+
180+
String targetDirectory = getTargetDirectory(
181+
getTablePath(transactionState),
182+
partitionColNames,
183+
partitionValues);
184+
185+
return new DataWriteContextImpl(
186+
targetDirectory,
187+
partitionValues,
188+
getStatisticsColumns(engine, transactionState));
128189
}
129190

130191
/**
@@ -146,6 +207,19 @@ static CloseableIterator<Row> generateAppendActions(
146207
Row transactionState,
147208
CloseableIterator<DataFileStatus> fileStatusIter,
148209
DataWriteContext dataWriteContext) {
149-
throw new UnsupportedOperationException("Not implemented yet");
210+
checkArgument(dataWriteContext instanceof DataWriteContextImpl,
211+
"DataWriteContext is not created by the `Transaction.getWriteContext()`");
212+
213+
URI tableRoot = new Path(getTablePath(transactionState)).toUri();
214+
return fileStatusIter.map(
215+
dataFileStatus -> {
216+
Row addFileRow = AddFile.convertDataFileStatus(
217+
tableRoot,
218+
dataFileStatus,
219+
((DataWriteContextImpl) dataWriteContext).getPartitionValues(),
220+
true /* dataChange */);
221+
return SingleAction.createAddFileSingleAction(addFileRow);
222+
}
223+
);
150224
}
151225
}

kernel/kernel-api/src/main/java/io/delta/kernel/engine/JsonHandler.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,8 @@ CloseableIterator<ColumnarBatch> readJsonFiles(
110110
* <ul>
111111
* <li>Primitive types: @code boolean, byte, short, int, long, float, double, string}</li>
112112
* <li>{@code struct}: any element whose value is null is not written to file</li>
113-
* <li>{@code map}: only a {@code map} with {@code string} key type is supported</li>
113+
* <li>{@code map}: only a {@code map} with {@code string} key type is supported. If an
114+
* entry value is {@code null}, it should be written to the file.</li>
114115
* <li>{@code array}: {@code null} value elements are written to file</li>
115116
* </ul>
116117
*

kernel/kernel-api/src/main/java/io/delta/kernel/internal/DeltaErrors.java

+16
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,22 @@ public static KernelException tableAlreadyExists(String tablePath, String messag
172172
return new TableAlreadyExistsException(tablePath, message);
173173
}
174174

175+
public static KernelException dataSchemaMismatch(
176+
String tablePath,
177+
StructType tableSchema,
178+
StructType dataSchema) {
179+
String msgT = "The schema of the data to be written to the table doesn't match " +
180+
"the table schema. \nTable: %s\nTable schema: %s, \nData schema: %s";
181+
return new KernelException(format(msgT, tablePath, tableSchema, dataSchema));
182+
}
183+
184+
public static KernelException partitionColumnMissingInData(
185+
String tablePath,
186+
String partitionColumn) {
187+
String msgT = "Missing partition column '%s' in the data to be written to the table '%s'.";
188+
return new KernelException(format(msgT, partitionColumn, tablePath));
189+
}
190+
175191
/* ------------------------ HELPER METHODS ----------------------------- */
176192
private static String formatTimestamp(long millisSinceEpochUTC) {
177193
return new Timestamp(millisSinceEpochUTC).toInstant().toString();

kernel/kernel-api/src/main/java/io/delta/kernel/internal/TransactionImpl.java

+16-1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import io.delta.kernel.data.Row;
2525
import io.delta.kernel.engine.Engine;
2626
import io.delta.kernel.exceptions.ConcurrentWriteException;
27+
import io.delta.kernel.expressions.Column;
2728
import io.delta.kernel.types.StructType;
2829
import io.delta.kernel.utils.CloseableIterable;
2930
import io.delta.kernel.utils.CloseableIterator;
@@ -158,7 +159,9 @@ private boolean isReadyForCheckpoint(long newVersion) {
158159
}
159160

160161
private boolean isBlindAppend() {
161-
return isNewTable; // Later can add more conditions to determine if it is a blind append
162+
// For now, Kernel just supports blind append.
163+
// Change this when read-after-write is supported.
164+
return true;
162165
}
163166

164167
private Map<String, String> getOperationParameters() {
@@ -171,4 +174,16 @@ private Map<String, String> getOperationParameters() {
171174
}
172175
return Collections.emptyMap();
173176
}
177+
178+
/**
179+
* Get the part of the schema of the table that needs the statistics to be collected per file.
180+
*
181+
* @param engine {@link Engine} instance to use.
182+
* @param transactionState State of the transaction
183+
* @return
184+
*/
185+
public static List<Column> getStatisticsColumns(Engine engine, Row transactionState) {
186+
// TODO: implement this once we start supporting collecting stats
187+
return Collections.emptyList();
188+
}
174189
}

kernel/kernel-api/src/main/java/io/delta/kernel/internal/actions/AddFile.java

+45
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,21 @@
1515
*/
1616
package io.delta.kernel.internal.actions;
1717

18+
import java.net.URI;
19+
import java.util.HashMap;
20+
import java.util.Map;
21+
import java.util.stream.IntStream;
22+
import static java.util.stream.Collectors.toMap;
23+
24+
import io.delta.kernel.data.Row;
25+
import io.delta.kernel.expressions.Literal;
1826
import io.delta.kernel.types.*;
27+
import io.delta.kernel.utils.DataFileStatus;
28+
29+
import io.delta.kernel.internal.data.GenericRow;
30+
import io.delta.kernel.internal.fs.Path;
31+
import static io.delta.kernel.internal.util.InternalUtils.relativizePath;
32+
import static io.delta.kernel.internal.util.PartitionUtils.serializePartitionMap;
1933

2034
/**
2135
* Delta log action representing an `AddFile`
@@ -57,4 +71,35 @@ public class AddFile {
5771
true /* nullable */);
5872
// There are more fields which are added when row-id tracking and clustering is enabled.
5973
// When Kernel starts supporting row-ids and clustering, we should add those fields here.
74+
75+
private static final Map<String, Integer> COL_NAME_TO_ORDINAL =
76+
IntStream.range(0, FULL_SCHEMA.length())
77+
.boxed()
78+
.collect(toMap(i -> FULL_SCHEMA.at(i).getName(), i -> i));
79+
80+
/**
81+
* Utility to generate `AddFile` row from the given {@link DataFileStatus} and partition values.
82+
*/
83+
public static Row convertDataFileStatus(
84+
URI tableRoot,
85+
DataFileStatus dataFileStatus,
86+
Map<String, Literal> partitionValues,
87+
boolean dataChange) {
88+
Path filePath = new Path(dataFileStatus.getPath());
89+
Map<Integer, Object> valueMap = new HashMap<>();
90+
valueMap.put(COL_NAME_TO_ORDINAL.get("path"),
91+
relativizePath(filePath, tableRoot).toString());
92+
valueMap.put(COL_NAME_TO_ORDINAL.get("partitionValues"),
93+
serializePartitionMap(partitionValues));
94+
valueMap.put(COL_NAME_TO_ORDINAL.get("size"), dataFileStatus.getSize());
95+
valueMap.put(COL_NAME_TO_ORDINAL.get("modificationTime"),
96+
dataFileStatus.getModificationTime());
97+
valueMap.put(COL_NAME_TO_ORDINAL.get("dataChange"), dataChange);
98+
if (dataFileStatus.getStatistics().isPresent()) {
99+
valueMap.put(COL_NAME_TO_ORDINAL.get("stats"),
100+
dataFileStatus.getStatistics().get().serializeAsJson());
101+
}
102+
// any fields not present in the valueMap are considered null
103+
return new GenericRow(FULL_SCHEMA, valueMap);
104+
}
60105
}

kernel/kernel-api/src/main/java/io/delta/kernel/internal/data/TransactionStateRow.java

+16-1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
public class TransactionStateRow extends GenericRow {
3131
private static final StructType SCHEMA = new StructType()
32+
.add("logicalSchemaString", StringType.STRING)
3233
.add("partitionColumns", new ArrayType(StringType.STRING, false))
3334
.add("tablePath", StringType.STRING);
3435

@@ -39,6 +40,7 @@ public class TransactionStateRow extends GenericRow {
3940

4041
public static TransactionStateRow of(Metadata metadata, String tablePath) {
4142
HashMap<Integer, Object> valueMap = new HashMap<>();
43+
valueMap.put(COL_NAME_TO_ORDINAL.get("logicalSchemaString"), metadata.getSchemaString());
4244
valueMap.put(COL_NAME_TO_ORDINAL.get("partitionColumns"), metadata.getPartitionColumns());
4345
valueMap.put(COL_NAME_TO_ORDINAL.get("tablePath"), tablePath);
4446
return new TransactionStateRow(valueMap);
@@ -48,11 +50,24 @@ private TransactionStateRow(HashMap<Integer, Object> valueMap) {
4850
super(SCHEMA, valueMap);
4951
}
5052

53+
/**
54+
* Get the logical schema of the table from the transaction state {@link Row} returned by
55+
* {@link Transaction#getTransactionState(Engine)}}
56+
*
57+
* @param engine {@link Engine} instance to use for parsing the schema
58+
* @param transactionState Transaction state state {@link Row}
59+
* @return Logical schema of the table as {@link StructType}
60+
*/
61+
public static StructType getLogicalSchema(Engine engine, Row transactionState) {
62+
return engine.getJsonHandler().deserializeStructType(
63+
transactionState.getString(COL_NAME_TO_ORDINAL.get("logicalSchemaString")));
64+
}
65+
5166
/**
5267
* Get the list of partition column names from the write state {@link Row} returned by
5368
* {@link Transaction#getTransactionState(Engine)}
5469
*
55-
* @param transactionState Scan state {@link Row}
70+
* @param transactionState Transaction state state {@link Row}
5671
* @return List of partition column names according to the scan state.
5772
*/
5873
public static List<String> getPartitionColumnsList(Row transactionState) {

kernel/kernel-api/src/main/java/io/delta/kernel/internal/util/InternalUtils.java

+28
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,17 @@
1616
package io.delta.kernel.internal.util;
1717

1818
import java.io.IOException;
19+
import java.net.URI;
1920
import java.sql.Date;
2021
import java.sql.Timestamp;
2122
import java.time.LocalDate;
2223
import java.time.LocalDateTime;
2324
import java.time.ZoneOffset;
2425
import java.time.temporal.ChronoUnit;
26+
import java.util.Collection;
2527
import java.util.Optional;
28+
import java.util.Set;
29+
import java.util.stream.Collectors;
2630

2731
import io.delta.kernel.data.ColumnVector;
2832
import io.delta.kernel.data.ColumnarBatch;
@@ -31,6 +35,8 @@
3135
import io.delta.kernel.types.StringType;
3236
import io.delta.kernel.utils.CloseableIterator;
3337

38+
import io.delta.kernel.internal.fs.Path;
39+
3440
public class InternalUtils {
3541
private static final LocalDate EPOCH_DAY = LocalDate.ofEpochDay(0);
3642
private static final LocalDateTime EPOCH_DATETIME =
@@ -152,4 +158,26 @@ public static ColumnVector requireNonNull(ColumnVector vector, int rowId, String
152158
}
153159
return vector;
154160
}
161+
162+
/**
163+
* Relativize the given child path with respect to the given root URI. If the child path is
164+
* already a relative path, it is returned as is.
165+
*
166+
* @param child
167+
* @param root Root directory as URI. Relativization is done with respect to this root.
168+
* The relativize operation requires conversion to URI, so the caller is expected to
169+
* convert the root directory to URI once and use it for relativizing for multiple
170+
* child paths.
171+
* @return
172+
*/
173+
public static Path relativizePath(Path child, URI root) {
174+
if (child.isAbsolute()) {
175+
return new Path(root.relativize(child.toUri()));
176+
}
177+
return child;
178+
}
179+
180+
public static Set<String> toLowerCaseSet(Collection<String> set) {
181+
return set.stream().map(String::toLowerCase).collect(Collectors.toSet());
182+
}
155183
}

0 commit comments

Comments
 (0)