GH-47756: [C++][CI] Fuzz CSV reader (#47757)

pitrou · web-flow · commit bfce5f208e24 · 2025-10-13T15:55:41.000+02:00
### Rationale for this change We are already fuzzing the IPC and Parquet reader. The CSV reader is another important user-facing component that is worth fuzzing. ### What changes are included in this PR? 1. Add fuzz target for the CSV reader (currently only fuzzing the non-streaming table reader) 2. Generate rudimentary seed corpus using CSV test files from the `arrow-testing` repo and the Pandas test suite ### Are these changes tested? Yes, by the OSS-Fuzz CI build. ### Are there any user-facing changes? No. * GitHub Issue: #47756 Authored-by: Antoine Pitrou <antoine@python.org> Signed-off-by: Antoine Pitrou <antoine@python.org>
diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh
@@ -184,6 +184,7 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then
     if [ "${ARROW_PARQUET}" == "ON" ]; then
       "${binary_output_dir}/parquet-arrow-fuzz" "${ARROW_TEST_DATA}"/parquet/fuzzing/*-testcase-*
     fi
+    # TODO run CSV fuzz regression tests once we have any
 fi
 
 popd
diff --git a/cpp/build-support/fuzzing/generate_corpuses.sh b/cpp/build-support/fuzzing/generate_corpuses.sh
@@ -27,6 +27,8 @@ fi
 set -ex
 
 CORPUS_DIR=/tmp/corpus
+PANDAS_DIR=/tmp/pandas
+
 ARROW_ROOT=$(cd $(dirname $BASH_SOURCE)/../../..; pwd)
 ARROW_CPP=$ARROW_ROOT/cpp
 OUT=$1
@@ -35,6 +37,8 @@ OUT=$1
 # where "<FUZZ TARGET>" is the exact name of the fuzz target executable the
 # seed corpus is generated for.
 
+# Arrow IPC
+
 IPC_INTEGRATION_FILES=$(find ${ARROW_ROOT}/testing/data/arrow-ipc-stream/integration -name "*.stream")
 
 rm -rf ${CORPUS_DIR}
@@ -52,9 +56,24 @@ rm -rf ${CORPUS_DIR}
 ${OUT}/arrow-ipc-generate-tensor-fuzz-corpus -stream ${CORPUS_DIR}
 ${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-ipc-tensor-stream-fuzz_seed_corpus.zip
 
+# Parquet
+
 rm -rf ${CORPUS_DIR}
 ${OUT}/parquet-arrow-generate-fuzz-corpus ${CORPUS_DIR}
 # Add Parquet testing examples
 cp ${ARROW_CPP}/submodules/parquet-testing/data/*.parquet ${CORPUS_DIR}
 cp ${ARROW_CPP}/submodules/parquet-testing/bad_data/*.parquet ${CORPUS_DIR}
 ${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/parquet-arrow-fuzz_seed_corpus.zip
+
+# CSV
+
+rm -rf ${PANDAS_DIR}
+git clone --depth=1 https://github.com/pandas-dev/pandas ${PANDAS_DIR}
+
+rm -rf ${CORPUS_DIR}
+mkdir -p ${CORPUS_DIR}
+# Add examples from arrow-testing repo
+cp ${ARROW_ROOT}/testing/data/csv/*.csv ${CORPUS_DIR}
+# Add examples from Pandas test suite
+find ${PANDAS_DIR}/ -name "*.csv" -exec cp --backup=numbered '{}' ${CORPUS_DIR} \;
+${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-csv-fuzz_seed_corpus.zip
diff --git a/cpp/build-support/fuzzing/pack_corpus.py b/cpp/build-support/fuzzing/pack_corpus.py
@@ -29,7 +29,7 @@
 def process_dir(corpus_dir, zip_output):
     seen_hashes = {}
 
-    for child in corpus_dir.iterdir():
+    for child in sorted(corpus_dir.iterdir()):
         if not child.is_file():
             raise IOError(f"Not a file: {child}")
         with child.open('rb') as f:
@@ -39,6 +39,7 @@ def process_dir(corpus_dir, zip_output):
             raise ValueError(
                 f"Duplicate hash: {arcname} (in file {child}), "
                 f"already seen in file {seen_hashes[arcname]}")
+        print(f"  {child} -> {arcname}")
         zip_output.writestr(str(arcname), data)
         seen_hashes[arcname] = child
 
diff --git a/cpp/src/arrow/csv/CMakeLists.txt b/cpp/src/arrow/csv/CMakeLists.txt
@@ -30,6 +30,8 @@ add_arrow_benchmark(converter_benchmark PREFIX "arrow-csv")
 add_arrow_benchmark(parser_benchmark PREFIX "arrow-csv")
 add_arrow_benchmark(writer_benchmark PREFIX "arrow-csv")
 
+add_arrow_fuzz_target(fuzz PREFIX "arrow-csv")
+
 arrow_install_all_headers("arrow/csv")
 
 # pkg-config support
diff --git a/cpp/src/arrow/csv/fuzz.cc b/cpp/src/arrow/csv/fuzz.cc
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/buffer.h"
+#include "arrow/csv/reader.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/io/memory.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/thread_pool.h"
+
+namespace arrow::csv {
+
+// Fuzz the non-streaming CSV table reader with an arbitrary byte string.
+//
+// `data` / `size` describe the fuzzer-provided input, which is wrapped in a Buffer
+// without being copied. Returns the first error raised while creating the reader,
+// reading the table or fully validating it, and Status::OK() otherwise.
+Status FuzzCsvReader(const uint8_t* data, int64_t size) {
+  // The fuzzer-allocated input is not owned by the Buffer created below, and the
+  // default read options enable multi-threaded reading. If reading bails out early,
+  // tasks still pending on the CPU thread pool could touch `data` after the fuzzing
+  // engine has released it. Declared first so that it is destroyed last: wait for
+  // the pool to become idle before leaving this function.
+  struct TaskGuard {
+    ~TaskGuard() { ::arrow::internal::GetCpuThreadPool()->WaitForIdle(); }
+  };
+  TaskGuard task_guard;
+
+  auto io_context = arrow::io::default_io_context();
+
+  auto read_options = ReadOptions::Defaults();
+  // Make chunking more likely
+  read_options.block_size = 4096;
+  auto parse_options = ParseOptions::Defaults();
+  auto convert_options = ConvertOptions::Defaults();
+  convert_options.auto_dict_encode = true;
+
+  auto input_stream =
+      std::make_shared<::arrow::io::BufferReader>(std::make_shared<Buffer>(data, size));
+
+  // TODO test other reader types
+  ARROW_ASSIGN_OR_RAISE(auto table_reader,
+                        TableReader::Make(io_context, input_stream, read_options,
+                                          parse_options, convert_options));
+  ARROW_ASSIGN_OR_RAISE(auto table, table_reader->Read());
+  // Also check the contents of the resulting table, not just its structure
+  RETURN_NOT_OK(table->ValidateFull());
+  return Status::OK();
+}
+
+}  // namespace arrow::csv
+
+// libFuzzer entry point: forwards the raw input bytes to the CSV reader fuzz
+// routine. The returned Status is discarded and 0 is always returned.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  const auto num_bytes = static_cast<int64_t>(size);
+  ARROW_UNUSED(arrow::csv::FuzzCsvReader(data, num_bytes));
+  return 0;
+}
diff --git a/docs/source/developers/cpp/fuzzing.rst b/docs/source/developers/cpp/fuzzing.rst
@@ -29,6 +29,7 @@ fuzz testing on several parts of the Arrow C++ feature set, currently:
 * the IPC stream format
 * the IPC file format
 * the Parquet file format
+* the CSV file format
 
 We welcome any contribution to expand the scope of fuzz testing and cover
 areas ingesting potentially invalid or malicious data.