Skip to content

Commit

Permalink
[Java] Add recoverWithNull to JSONOptions and pass to Table.readJSON (#…
Browse files Browse the repository at this point in the history
…14078)

This PR exposes the recently added `json_reader_options_builder::recovery_mode` option in the JNI layer.


closes #14073

Authors:
  - Andy Grove (https://github.com/andygrove)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Gera Shegalov (https://github.com/gerashegalov)
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Raza Jafri (https://github.com/razajafri)
  - Nghia Truong (https://github.com/ttnghia)

URL: #14078
  • Loading branch information
andygrove authored Sep 12, 2023
1 parent 72c9583 commit 258e0fe
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 10 deletions.
25 changes: 24 additions & 1 deletion java/src/main/java/ai/rapids/cudf/JSONOptions.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -29,11 +29,13 @@ public final class JSONOptions extends ColumnFilterOptions {

private final boolean dayFirst;
private final boolean lines;
private final boolean recoverWithNull;

private JSONOptions(Builder builder) {
super(builder);
dayFirst = builder.dayFirst;
lines = builder.lines;
recoverWithNull = builder.recoverWithNull;
}

public boolean isDayFirst() {
Expand All @@ -44,6 +46,11 @@ public boolean isLines() {
return lines;
}

/** Return the value of the recoverWithNull option */
public boolean isRecoverWithNull() {
return recoverWithNull;
}

@Override
String[] getIncludeColumnNames() {
throw new UnsupportedOperationException("JSON reader didn't support column prune");
Expand All @@ -57,6 +64,8 @@ public static final class Builder extends ColumnFilterOptions.Builder<JSONOptio
private boolean dayFirst = false;
private boolean lines = true;

private boolean recoverWithNull = false;

/**
* Whether to parse dates as DD/MM versus MM/DD
* @param dayFirst true: DD/MM, false, MM/DD
Expand All @@ -78,6 +87,20 @@ public Builder withLines(boolean perLine) {
return this;
}

/**
* Specify how to handle invalid lines when parsing json. Setting
* recoverWithNull to true will cause null values to be returned
* for invalid lines. Setting recoverWithNull to false will cause
* the parsing to fail with an exception.
*
* @param recoverWithNull true: return nulls, false: throw exception
* @return builder for chaining
*/
public Builder withRecoverWithNull(boolean recoverWithNull) {
this.recoverWithNull = recoverWithNull;
return this;
}

@Override
public Builder includeColumn(String... names) {
throw new UnsupportedOperationException("JSON reader didn't support column prune");
Expand Down
12 changes: 7 additions & 5 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,11 @@ private static native long[] readCSV(String[] columnNames,
private static native long readJSON(String[] columnNames,
int[] dTypeIds, int[] dTypeScales,
String filePath, long address, long length,
boolean dayFirst, boolean lines) throws CudfException;
boolean dayFirst, boolean lines,
boolean recoverWithNulls) throws CudfException;

private static native long readAndInferJSON(long address, long length,
boolean dayFirst, boolean lines) throws CudfException;
boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException;

/**
* Read in Parquet formatted data.
Expand Down Expand Up @@ -1047,7 +1048,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(),
path.getAbsolutePath(),
0, 0,
opts.isDayFirst(), opts.isLines()))) {
opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()))) {

return gatherJSONColumns(schema, twm);
}
Expand Down Expand Up @@ -1099,7 +1100,7 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
assert len <= buffer.length - offset;
assert offset >= 0 && offset < buffer.length;
return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len,
opts.isDayFirst(), opts.isLines()));
opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()));
}

/**
Expand All @@ -1121,7 +1122,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
assert offset >= 0 && offset < buffer.length;
try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(),
schema.getTypeIds(), schema.getTypeScales(), null,
buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines()))) {
buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(),
opts.isRecoverWithNull()))) {
return gatherJSONColumns(schema, twm);
}
}
Expand Down
18 changes: 14 additions & 4 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1331,7 +1331,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {
JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
jboolean recover_with_null) {

JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
if (buffer_length <= 0) {
Expand All @@ -1344,9 +1345,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
auto source = cudf::io::source_info{reinterpret_cast<char *>(buffer),
static_cast<std::size_t>(buffer_length)};

auto const recovery_mode = recover_with_null ?
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
cudf::io::json_recovery_mode_t::FAIL;
cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
.dayfirst(static_cast<bool>(day_first))
.lines(static_cast<bool>(lines));
.lines(static_cast<bool>(lines))
.recovery_mode(recovery_mode);

auto result =
std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
Expand Down Expand Up @@ -1404,7 +1409,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {
jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
jboolean recover_with_null) {

bool read_buffer = true;
if (buffer == 0) {
Expand Down Expand Up @@ -1448,9 +1454,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
static_cast<std::size_t>(buffer_length)} :
cudf::io::source_info{filename.get()};

cudf::io::json_recovery_mode_t recovery_mode =
recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
cudf::io::json_recovery_mode_t::FAIL;
cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
.dayfirst(static_cast<bool>(day_first))
.lines(static_cast<bool>(lines));
.lines(static_cast<bool>(lines))
.recovery_mode(recovery_mode);

if (!n_col_names.is_null() && data_types.size() > 0) {
if (n_col_names.size() != n_types.size()) {
Expand Down
34 changes: 34 additions & 0 deletions java/src/test/java/ai/rapids/cudf/TableTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ public class TableTest extends CudfTestBase {
private static final File TEST_ALL_TYPES_PLAIN_AVRO_FILE = TestUtils.getResourceAsFile("alltypes_plain.avro");
private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv");
private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json");
private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json");

private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder()
.column(DType.INT32, "A")
Expand Down Expand Up @@ -326,6 +327,39 @@ void testReadJSONFile() {
}
}

@Test
void testReadJSONFileWithInvalidLines() {
Schema schema = Schema.builder()
.column(DType.STRING, "name")
.column(DType.INT32, "age")
.build();

// test with recoverWithNulls=true
{
JSONOptions opts = JSONOptions.builder()
.withLines(true)
.withRecoverWithNull(true)
.build();
try (Table expected = new Table.TestBuilder()
.column("Michael", "Andy", null, "Justin")
.column(null, 30, null, 19)
.build();
Table table = Table.readJSON(schema, opts, TEST_JSON_ERROR_FILE)) {
assertTablesAreEqual(expected, table);
}
}

// test with recoverWithNulls=false
{
JSONOptions opts = JSONOptions.builder()
.withLines(true)
.withRecoverWithNull(false)
.build();
assertThrows(CudfException.class, () ->
Table.readJSON(schema, opts, TEST_JSON_ERROR_FILE));
}
}

@Test
void testReadJSONFileWithDifferentColumnOrder() {
Schema schema = Schema.builder()
Expand Down
4 changes: 4 additions & 0 deletions java/src/test/resources/people_with_invalid_lines.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"name":"Michael"}
{"name":"Andy", "age":30}
this_line_is_not_valid
{"name":"Justin", "age":19}

0 comments on commit 258e0fe

Please sign in to comment.