Skip to content

Commit bfce5f2

Browse files
authored
GH-47756: [C++][CI] Fuzz CSV reader (#47757)
### Rationale for this change

We are already fuzzing the IPC and Parquet reader. The CSV reader is another important user-facing component that is worth fuzzing.

### What changes are included in this PR?

1. Add fuzz target for the CSV reader (currently only fuzzing the non-streaming table reader)
2. Generate rudimentary seed corpus using CSV test files from the `arrow-testing` repo and the Pandas test suite

### Are these changes tested?

Yes, by the OSS-Fuzz CI build.

### Are there any user-facing changes?

No.

* GitHub Issue: #47756

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent be6dddf commit bfce5f2

File tree

6 files changed

+84
-1
lines changed

6 files changed

+84
-1
lines changed

ci/scripts/cpp_test.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then
184184
if [ "${ARROW_PARQUET}" == "ON" ]; then
185185
"${binary_output_dir}/parquet-arrow-fuzz" "${ARROW_TEST_DATA}"/parquet/fuzzing/*-testcase-*
186186
fi
187+
# TODO run CSV fuzz regression tests once we have any
187188
fi
188189

189190
popd

cpp/build-support/fuzzing/generate_corpuses.sh

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ fi
2727
set -ex
2828

2929
CORPUS_DIR=/tmp/corpus
30+
PANDAS_DIR=/tmp/pandas
31+
3032
ARROW_ROOT=$(cd $(dirname $BASH_SOURCE)/../../..; pwd)
3133
ARROW_CPP=$ARROW_ROOT/cpp
3234
OUT=$1
@@ -35,6 +37,8 @@ OUT=$1
3537
# where "<FUZZ TARGET>" is the exact name of the fuzz target executable the
3638
# seed corpus is generated for.
3739

40+
# Arrow IPC
41+
3842
IPC_INTEGRATION_FILES=$(find ${ARROW_ROOT}/testing/data/arrow-ipc-stream/integration -name "*.stream")
3943

4044
rm -rf ${CORPUS_DIR}
@@ -52,9 +56,24 @@ rm -rf ${CORPUS_DIR}
5256
${OUT}/arrow-ipc-generate-tensor-fuzz-corpus -stream ${CORPUS_DIR}
5357
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-ipc-tensor-stream-fuzz_seed_corpus.zip
5458

59+
# Parquet
60+
5561
rm -rf ${CORPUS_DIR}
5662
${OUT}/parquet-arrow-generate-fuzz-corpus ${CORPUS_DIR}
5763
# Add Parquet testing examples
5864
cp ${ARROW_CPP}/submodules/parquet-testing/data/*.parquet ${CORPUS_DIR}
5965
cp ${ARROW_CPP}/submodules/parquet-testing/bad_data/*.parquet ${CORPUS_DIR}
6066
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/parquet-arrow-fuzz_seed_corpus.zip
67+
68+
# CSV
69+
70+
rm -rf ${PANDAS_DIR}
71+
git clone --depth=1 https://github.com/pandas-dev/pandas ${PANDAS_DIR}
72+
73+
rm -rf ${CORPUS_DIR}
74+
mkdir -p ${CORPUS_DIR}
75+
# Add examples from arrow-testing repo
76+
cp ${ARROW_ROOT}/testing/data/csv/*.csv ${CORPUS_DIR}
77+
# Add examples from Pandas test suite
78+
find ${PANDAS_DIR}/ -name "*.csv" -exec cp --backup=numbered '{}' ${CORPUS_DIR} \;
79+
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-csv-fuzz_seed_corpus.zip

cpp/build-support/fuzzing/pack_corpus.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
def process_dir(corpus_dir, zip_output):
3030
seen_hashes = {}
3131

32-
for child in corpus_dir.iterdir():
32+
for child in sorted(corpus_dir.iterdir()):
3333
if not child.is_file():
3434
raise IOError(f"Not a file: {child}")
3535
with child.open('rb') as f:
@@ -39,6 +39,7 @@ def process_dir(corpus_dir, zip_output):
3939
raise ValueError(
4040
f"Duplicate hash: {arcname} (in file {child}), "
4141
f"already seen in file {seen_hashes[arcname]}")
42+
print(f" {child} -> {arcname}")
4243
zip_output.writestr(str(arcname), data)
4344
seen_hashes[arcname] = child
4445

cpp/src/arrow/csv/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ add_arrow_benchmark(converter_benchmark PREFIX "arrow-csv")
3030
add_arrow_benchmark(parser_benchmark PREFIX "arrow-csv")
3131
add_arrow_benchmark(writer_benchmark PREFIX "arrow-csv")
3232

33+
add_arrow_fuzz_target(fuzz PREFIX "arrow-csv")
34+
3335
arrow_install_all_headers("arrow/csv")
3436

3537
# pkg-config support

cpp/src/arrow/csv/fuzz.cc

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include <cstdint>
19+
#include <memory>
20+
21+
#include "arrow/buffer.h"
22+
#include "arrow/csv/reader.h"
23+
#include "arrow/io/interfaces.h"
24+
#include "arrow/io/memory.h"
25+
#include "arrow/status.h"
26+
#include "arrow/table.h"
27+
#include "arrow/util/macros.h"
28+
29+
namespace arrow::csv {
30+
31+
// Fuzz helper: runs the non-streaming CSV TableReader over the `size` bytes
// starting at `data`, then fully validates the resulting table.
//
// Returns the first non-OK Status produced while creating the reader, reading
// the table or validating it; returns Status::OK() if all three steps succeed.
Status FuzzCsvReader(const uint8_t* data, int64_t size) {
32+
auto io_context = arrow::io::default_io_context();
33+
34+
auto read_options = ReadOptions::Defaults();
35+
// Make chunking more likely
36+
read_options.block_size = 4096;
37+
auto parse_options = ParseOptions::Defaults();
38+
auto convert_options = ConvertOptions::Defaults();
// Non-default setting; presumably enabled so that the dictionary-encoding
// conversion path is also exercised by the fuzzer.
39+
convert_options.auto_dict_encode = true;
40+
41+
// NOTE(review): this assumes Buffer(data, size) refers to the caller's bytes
// without copying them. If so, confirm that no task started by the reader can
// still access `data` after this function returns.
auto input_stream =
42+
std::make_shared<::arrow::io::BufferReader>(std::make_shared<Buffer>(data, size));
43+
44+
// TODO test other reader types
45+
ARROW_ASSIGN_OR_RAISE(auto table_reader,
46+
TableReader::Make(io_context, input_stream, read_options,
47+
parse_options, convert_options));
48+
ARROW_ASSIGN_OR_RAISE(auto table, table_reader->Read());
// A successful read must also yield a table that passes full validation.
49+
RETURN_NOT_OK(table->ValidateFull());
50+
return Status::OK();
51+
}
52+
53+
} // namespace arrow::csv
54+
55+
// Fuzzer entry point (C linkage): passes one input of `size` bytes at `data`
// to arrow::csv::FuzzCsvReader.
//
// Always returns 0. The Status from the reader is intentionally discarded;
// presumably an error on malformed input is an expected outcome and only
// crashes or sanitizer reports are of interest.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
56+
auto status = arrow::csv::FuzzCsvReader(data, static_cast<int64_t>(size));
57+
// Mark the result as deliberately unused.
ARROW_UNUSED(status);
58+
return 0;
59+
}

docs/source/developers/cpp/fuzzing.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ fuzz testing on several parts of the Arrow C++ feature set, currently:
2929
* the IPC stream format
3030
* the IPC file format
3131
* the Parquet file format
32+
* the CSV file format
3233

3334
We welcome any contribution to expand the scope of fuzz testing and cover
3435
areas ingesting potentially invalid or malicious data.

0 commit comments

Comments
 (0)