From 258e0fef942b734af24adf612b7998cb5da523c5 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 Sep 2023 15:09:22 -0600 Subject: [PATCH] [Java] Add recoverWithNull to JSONOptions and pass to Table.readJSON (#14078) This PR exposes the recently added `json_reader_options_builder::recovery_mode` option in the JNI layer. closes #14073 Authors: - Andy Grove (https://github.com/andygrove) - Nghia Truong (https://github.com/ttnghia) Approvers: - Gera Shegalov (https://github.com/gerashegalov) - Robert (Bobby) Evans (https://github.com/revans2) - Raza Jafri (https://github.com/razajafri) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14078 --- .../main/java/ai/rapids/cudf/JSONOptions.java | 25 +++++++++++++- java/src/main/java/ai/rapids/cudf/Table.java | 12 ++++--- java/src/main/native/src/TableJni.cpp | 18 +++++++--- .../test/java/ai/rapids/cudf/TableTest.java | 34 +++++++++++++++++++ .../resources/people_with_invalid_lines.json | 4 +++ 5 files changed, 83 insertions(+), 10 deletions(-) create mode 100644 java/src/test/resources/people_with_invalid_lines.json diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 85a9eb7beb3..f98687df5fa 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,11 +29,13 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean dayFirst; private final boolean lines; + private final boolean recoverWithNull; private JSONOptions(Builder builder) { super(builder); dayFirst = builder.dayFirst; lines = builder.lines; + recoverWithNull = builder.recoverWithNull; } public boolean isDayFirst() { @@ -44,6 +46,11 @@ public boolean isLines() { return lines; } + /** Return the value of the recoverWithNull option */ + public boolean isRecoverWithNull() { + return recoverWithNull; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -57,6 +64,8 @@ public static final class Builder extends ColumnFilterOptions.Builder= 0 && offset < buffer.length; return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len, - opts.isDayFirst(), opts.isLines())); + opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull())); } /** @@ -1121,7 +1122,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b assert offset >= 0 && offset < buffer.length; try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), null, - buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines()))) { + buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(), + opts.isRecoverWithNull()))) { return gatherJSONColumns(schema, twm); } } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index b05fc9b7bc4..b208ef8f381 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1331,7 +1331,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( - JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) { + JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, + jboolean recover_with_null) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1344,9 +1345,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( auto source = cudf::io::source_info{reinterpret_cast(buffer), static_cast(buffer_length)}; + auto const recovery_mode = recover_with_null ? + cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : + cudf::io::json_recovery_mode_t::FAIL; cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)); + .lines(static_cast(lines)) + .recovery_mode(recovery_mode); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1404,7 +1409,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, - jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) { + jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, + jboolean recover_with_null) { bool read_buffer = true; if (buffer == 0) { @@ -1448,9 +1454,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( static_cast(buffer_length)} : cudf::io::source_info{filename.get()}; + cudf::io::json_recovery_mode_t recovery_mode = + recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : + cudf::io::json_recovery_mode_t::FAIL; cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)); + .lines(static_cast(lines)) + .recovery_mode(recovery_mode); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 3740328615a..59f0d180c6e 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -86,6 +86,7 @@ public class TableTest extends CudfTestBase { private static final File TEST_ALL_TYPES_PLAIN_AVRO_FILE = TestUtils.getResourceAsFile("alltypes_plain.avro"); private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv"); private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json"); + private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json"); private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder() .column(DType.INT32, "A") @@ -326,6 +327,39 @@ void testReadJSONFile() { } } + @Test + void testReadJSONFileWithInvalidLines() { + Schema schema = Schema.builder() + .column(DType.STRING, "name") + .column(DType.INT32, "age") + .build(); + + // test with recoverWithNulls=true + { + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withRecoverWithNull(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("Michael", "Andy", null, "Justin") + .column(null, 30, null, 19) + .build(); + Table table = Table.readJSON(schema, opts, TEST_JSON_ERROR_FILE)) { + assertTablesAreEqual(expected, table); + } + } + + // test with recoverWithNulls=false + { + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withRecoverWithNull(false) + .build(); + assertThrows(CudfException.class, () -> + Table.readJSON(schema, opts, TEST_JSON_ERROR_FILE)); + } + } + @Test void testReadJSONFileWithDifferentColumnOrder() { Schema schema = Schema.builder() diff --git a/java/src/test/resources/people_with_invalid_lines.json b/java/src/test/resources/people_with_invalid_lines.json new file mode 100644 index 00000000000..a99592e3eca --- /dev/null +++ b/java/src/test/resources/people_with_invalid_lines.json @@ -0,0 +1,4 @@ +{"name":"Michael"} +{"name":"Andy", "age":30} +this_line_is_not_valid +{"name":"Justin", "age":19}