From f775c805597c556b35d054e8f09d2d979285a9c6 Mon Sep 17 00:00:00 2001
From: Adam Reeve
Date: Thu, 2 May 2024 14:36:07 +1200
Subject: [PATCH 1/8] Allow getting a record batch reader for a dataset or
 scanner

---
 c_glib/arrow-dataset-glib/dataset.cpp | 34 ++++++++++++++++++++++++++-
 c_glib/arrow-dataset-glib/dataset.h   |  3 +++
 c_glib/arrow-dataset-glib/scanner.cpp | 21 +++++++++++++++++
 c_glib/arrow-dataset-glib/scanner.h   |  4 ++++
 c_glib/arrow-glib/version.h.in        | 23 ++++++++++++++++++
 5 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/c_glib/arrow-dataset-glib/dataset.cpp b/c_glib/arrow-dataset-glib/dataset.cpp
index 704d6b589ee94..d8fbad02f2038 100644
--- a/c_glib/arrow-dataset-glib/dataset.cpp
+++ b/c_glib/arrow-dataset-glib/dataset.cpp
@@ -19,6 +19,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -152,12 +153,43 @@ gadataset_dataset_to_table(GADatasetDataset *dataset, GError **error)
   }
   auto arrow_scanner = *arrow_scanner_result;
   auto arrow_table_result = arrow_scanner->ToTable();
-  if (!garrow::check(error, arrow_scanner_result, "[dataset][to-table]")) {
+  if (!garrow::check(error, arrow_table_result, "[dataset][to-table]")) {
     return NULL;
   }
   return garrow_table_new_raw(&(*arrow_table_result));
 }
+/**
+ * gadataset_dataset_get_reader:
+ * @dataset: A #GADatasetDataset.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full) (nullable):
+ *   A #GArrowRecordBatchReader on success, %NULL on error.
+ *
+ * Since: 17.0.0
+ */
+GArrowRecordBatchReader *
+gadataset_dataset_get_reader(GADatasetDataset *dataset, GError **error)
+{
+  auto arrow_dataset = gadataset_dataset_get_raw(dataset);
+  auto arrow_scanner_builder_result = arrow_dataset->NewScan();
+  if (!garrow::check(error, arrow_scanner_builder_result, "[dataset][get-reader]")) {
+    return NULL;
+  }
+  auto arrow_scanner_builder = *arrow_scanner_builder_result;
+  auto arrow_scanner_result = arrow_scanner_builder->Finish();
+  if (!garrow::check(error, arrow_scanner_result, "[dataset][get-reader]")) {
+    return NULL;
+  }
+  auto arrow_scanner = *arrow_scanner_result;
+  auto arrow_reader_result = arrow_scanner->ToRecordBatchReader();
+  if (!garrow::check(error, arrow_reader_result, "[dataset][get-reader]")) {
+    return NULL;
+  }
+  return garrow_record_batch_reader_new_raw(&(*arrow_reader_result), nullptr);
+}
+
 /**
  * gadataset_dataset_get_type_name:
  * @dataset: A #GADatasetDataset.
diff --git a/c_glib/arrow-dataset-glib/dataset.h b/c_glib/arrow-dataset-glib/dataset.h
index 57f6c7729f073..ff88366d1dfc1 100644
--- a/c_glib/arrow-dataset-glib/dataset.h
+++ b/c_glib/arrow-dataset-glib/dataset.h
@@ -34,6 +34,9 @@ gadataset_dataset_to_table(GADatasetDataset *dataset, GError **error);
 GARROW_AVAILABLE_IN_5_0
 gchar *
 gadataset_dataset_get_type_name(GADatasetDataset *dataset);
+GARROW_AVAILABLE_IN_17_0
+GArrowRecordBatchReader *
+gadataset_dataset_get_reader(GADatasetDataset *dataset, GError **error);
 
 #define GADATASET_TYPE_FILE_SYSTEM_DATASET_WRITE_OPTIONS \
   (gadataset_file_system_dataset_write_options_get_type())
diff --git a/c_glib/arrow-dataset-glib/scanner.cpp b/c_glib/arrow-dataset-glib/scanner.cpp
index 717532db9220f..95640c876f733 100644
--- a/c_glib/arrow-dataset-glib/scanner.cpp
+++ b/c_glib/arrow-dataset-glib/scanner.cpp
@@ -128,6 +128,27 @@ gadataset_scanner_to_table(GADatasetScanner *scanner, GError **error)
   }
 }
+/**
+ * gadataset_scanner_get_reader:
+ * @scanner: A #GADatasetScanner.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full) (nullable):
+ *   A #GArrowRecordBatchReader on success, %NULL on error.
+ *
+ * Since: 17.0.0
+ */
+GArrowRecordBatchReader *
+gadataset_scanner_get_reader(GADatasetScanner *scanner, GError **error)
+{
+  auto arrow_scanner = gadataset_scanner_get_raw(scanner);
+  auto arrow_reader_result = arrow_scanner->ToRecordBatchReader();
+  if (!garrow::check(error, arrow_reader_result, "[scanner][get-reader]")) {
+    return NULL;
+  }
+  return garrow_record_batch_reader_new_raw(&(*arrow_reader_result), nullptr);
+}
+
 typedef struct GADatasetScannerBuilderPrivate_
 {
   std::shared_ptr scanner_builder;
diff --git a/c_glib/arrow-dataset-glib/scanner.h b/c_glib/arrow-dataset-glib/scanner.h
index 3c7432fb268e4..559697cf81ecc 100644
--- a/c_glib/arrow-dataset-glib/scanner.h
+++ b/c_glib/arrow-dataset-glib/scanner.h
@@ -35,6 +35,10 @@
 GARROW_AVAILABLE_IN_5_0
 GArrowTable *
 gadataset_scanner_to_table(GADatasetScanner *scanner, GError **error);
 
+GARROW_AVAILABLE_IN_17_0
+GArrowRecordBatchReader *
+gadataset_scanner_get_reader(GADatasetScanner *scanner, GError **error);
+
 #define GADATASET_TYPE_SCANNER_BUILDER (gadataset_scanner_builder_get_type())
 G_DECLARE_DERIVABLE_TYPE(
   GADatasetScannerBuilder, gadataset_scanner_builder, GADATASET, SCANNER_BUILDER, GObject)
diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in
index a83c68a2a16dc..6ed5806f560ae 100644
--- a/c_glib/arrow-glib/version.h.in
+++ b/c_glib/arrow-glib/version.h.in
@@ -108,6 +108,15 @@
 # define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor)
 #endif
 
+/**
+ * GARROW_VERSION_17_0:
+ *
+ * You can use this macro value for compile time API version check.
+ *
+ * Since: 17.0.0
+ */
+#define GARROW_VERSION_17_0 G_ENCODE_VERSION(17, 0)
+
 /**
  * GARROW_VERSION_16_0:
  *
@@ -362,6 +371,20 @@
 
 #define GARROW_AVAILABLE_IN_ALL
 
+#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_17_0
+# define GARROW_DEPRECATED_IN_17_0 GARROW_DEPRECATED
+# define GARROW_DEPRECATED_IN_17_0_FOR(function) GARROW_DEPRECATED_FOR(function)
+#else
+# define GARROW_DEPRECATED_IN_17_0
+# define GARROW_DEPRECATED_IN_17_0_FOR(function)
+#endif
+
+#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_17_0
+# define GARROW_AVAILABLE_IN_17_0 GARROW_UNAVAILABLE(17, 0)
+#else
+# define GARROW_AVAILABLE_IN_17_0
+#endif
+
 #if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_16_0
 # define GARROW_DEPRECATED_IN_16_0 GARROW_DEPRECATED
 # define GARROW_DEPRECATED_IN_16_0_FOR(function) GARROW_DEPRECATED_FOR(function)

From c5b890e64c75587a786fc2a42c3b2e718a9b37ba Mon Sep 17 00:00:00 2001
From: Adam Reeve
Date: Tue, 21 May 2024 19:26:51 +1200
Subject: [PATCH 2/8] Add tests

---
 .../test/dataset/test-file-system-dataset.rb | 26 ++++++++++++++++---
 c_glib/test/dataset/test-scanner.rb          | 12 +++++++++
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb
index 0e856b678f860..366af52246538 100644
--- a/c_glib/test/dataset/test-file-system-dataset.rb
+++ b/c_glib/test/dataset/test-file-system-dataset.rb
@@ -56,6 +56,24 @@ def test_partitioning
   end
 
   def test_read_write
+    dataset, expected_table = create_dataset
+    assert_equal(expected_table, dataset.to_table)
+  end
+
+  def test_record_batch_reader
+    dataset, expected_table = create_dataset
+    reader = dataset.reader
+    record_batches = []
+    loop do
+      batch = reader.read_next
+      break if batch.nil?
+      record_batches << batch
+    end
+    table_from_batches = Arrow::Table.new(reader.schema, record_batches)
+    assert_equal(expected_table, table_from_batches)
+  end
+
+  def create_dataset
     table = build_table(label: build_string_array(["a", "a", "b", "c"]),
                         count: build_int32_array([1, 10, 2, 3]))
     table_reader = Arrow::TableBatchReader.new(table)
@@ -73,7 +91,8 @@ def test_read_write
     end
     @factory.partition_base_dir = @dir
     dataset = @factory.finish
-    assert_equal(build_table(count: [
+
+    expected_table = build_table(count: [
                                 build_int32_array([1, 10]),
                                 build_int32_array([2]),
                                 build_int32_array([3]),
@@ -82,7 +101,8 @@ def test_read_write
                                 build_string_array(["a", "a"]),
                                 build_string_array(["b"]),
                                 build_string_array(["c"]),
-                              ]),
-                 dataset.to_table)
+                              ])
+
+    return dataset, expected_table
   end
 end
diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb
index f7702d4905fb6..2489803683ccb 100644
--- a/c_glib/test/dataset/test-scanner.rb
+++ b/c_glib/test/dataset/test-scanner.rb
@@ -45,4 +45,16 @@ def setup
   def test_to_table
     assert_equal(@table, @scanner.to_table)
   end
+
+  def test_record_batch_reader
+    reader = @scanner.reader
+    record_batches = []
+    loop do
+      batch = reader.read_next
+      break if batch.nil?
+      record_batches << batch
+    end
+    table_from_batches = Arrow::Table.new(reader.schema, record_batches)
+    assert_equal(@table, table_from_batches)
+  end
 end

From d1d9765661e2ffd935f837ce29838a861bc566a7 Mon Sep 17 00:00:00 2001
From: Adam Reeve
Date: Tue, 21 May 2024 19:56:44 +1200
Subject: [PATCH 3/8] Use to_reader rather than get_reader

---
 c_glib/arrow-dataset-glib/dataset.cpp           | 4 ++--
 c_glib/arrow-dataset-glib/dataset.h             | 2 +-
 c_glib/arrow-dataset-glib/scanner.cpp           | 4 ++--
 c_glib/arrow-dataset-glib/scanner.h             | 2 +-
 c_glib/test/dataset/test-file-system-dataset.rb | 2 +-
 c_glib/test/dataset/test-scanner.rb             | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/c_glib/arrow-dataset-glib/dataset.cpp b/c_glib/arrow-dataset-glib/dataset.cpp
index d8fbad02f2038..21b6a60dd8634 100644
--- a/c_glib/arrow-dataset-glib/dataset.cpp
+++ b/c_glib/arrow-dataset-glib/dataset.cpp
@@ -160,7 +160,7 @@ gadataset_dataset_to_table(GADatasetDataset *dataset, GError **error)
 }
 
 /**
- * gadataset_dataset_get_reader:
+ * gadataset_dataset_to_reader:
 * @dataset: A #GADatasetDataset.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Returns: (transfer full) (nullable):
 *   A #GArrowRecordBatchReader on success, %NULL on error.
 *
 * Since: 17.0.0
 */
 GArrowRecordBatchReader *
-gadataset_dataset_get_reader(GADatasetDataset *dataset, GError **error)
+gadataset_dataset_to_reader(GADatasetDataset *dataset, GError **error)
 {
   auto arrow_dataset = gadataset_dataset_get_raw(dataset);
   auto arrow_scanner_builder_result = arrow_dataset->NewScan();
diff --git a/c_glib/arrow-dataset-glib/dataset.h b/c_glib/arrow-dataset-glib/dataset.h
index ff88366d1dfc1..465775ca02e01 100644
--- a/c_glib/arrow-dataset-glib/dataset.h
+++ b/c_glib/arrow-dataset-glib/dataset.h
@@ -36,7 +36,7 @@
 gchar *
 gadataset_dataset_get_type_name(GADatasetDataset *dataset);
 GARROW_AVAILABLE_IN_17_0
 GArrowRecordBatchReader *
-gadataset_dataset_get_reader(GADatasetDataset *dataset, GError **error);
+gadataset_dataset_to_reader(GADatasetDataset *dataset, GError **error);
 
 #define GADATASET_TYPE_FILE_SYSTEM_DATASET_WRITE_OPTIONS \
   (gadataset_file_system_dataset_write_options_get_type())
diff --git a/c_glib/arrow-dataset-glib/scanner.cpp b/c_glib/arrow-dataset-glib/scanner.cpp
index 95640c876f733..06912e4a5439f 100644
--- a/c_glib/arrow-dataset-glib/scanner.cpp
+++ b/c_glib/arrow-dataset-glib/scanner.cpp
@@ -129,7 +129,7 @@ gadataset_scanner_to_table(GADatasetScanner *scanner, GError **error)
 }
 
 /**
- * gadataset_scanner_get_reader:
+ * gadataset_scanner_to_reader:
 * @scanner: A #GADatasetScanner.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Returns: (transfer full) (nullable):
 *   A #GArrowRecordBatchReader on success, %NULL on error.
 *
 * Since: 17.0.0
 */
 GArrowRecordBatchReader *
-gadataset_scanner_get_reader(GADatasetScanner *scanner, GError **error)
+gadataset_scanner_to_reader(GADatasetScanner *scanner, GError **error)
 {
   auto arrow_scanner = gadataset_scanner_get_raw(scanner);
   auto arrow_reader_result = arrow_scanner->ToRecordBatchReader();
diff --git a/c_glib/arrow-dataset-glib/scanner.h b/c_glib/arrow-dataset-glib/scanner.h
index 559697cf81ecc..b27732cdfba43 100644
--- a/c_glib/arrow-dataset-glib/scanner.h
+++ b/c_glib/arrow-dataset-glib/scanner.h
@@ -37,7 +37,7 @@
 gadataset_scanner_to_table(GADatasetScanner *scanner, GError **error);
 
 GARROW_AVAILABLE_IN_17_0
 GArrowRecordBatchReader *
-gadataset_scanner_get_reader(GADatasetScanner *scanner, GError **error);
+gadataset_scanner_to_reader(GADatasetScanner *scanner, GError **error);
 
 #define GADATASET_TYPE_SCANNER_BUILDER (gadataset_scanner_builder_get_type())
 G_DECLARE_DERIVABLE_TYPE(
diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb
index 366af52246538..e389f6b4fe5ec 100644
--- a/c_glib/test/dataset/test-file-system-dataset.rb
+++ b/c_glib/test/dataset/test-file-system-dataset.rb
@@ -62,7 +62,7 @@ def test_read_write
 
   def test_record_batch_reader
     dataset, expected_table = create_dataset
-    reader = dataset.reader
+    reader = dataset.to_reader
     record_batches = []
     loop do
       batch = reader.read_next
diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb
index 2489803683ccb..843c2221cebab 100644
--- a/c_glib/test/dataset/test-scanner.rb
+++ b/c_glib/test/dataset/test-scanner.rb
@@ -47,7 +47,7 @@ def test_to_table
   end
 
   def test_record_batch_reader
-    reader = @scanner.reader
+    reader = @scanner.to_reader
     record_batches = []
     loop do
       batch = reader.read_next

From 729eeb9104830f5a30b24978eb7ad0e19933f8a0 Mon Sep 17 00:00:00 2001
From: Adam Reeve
Date: Tue, 21 May 2024 20:59:56 +1200
Subject: [PATCH 4/8] Use RecordBatchReader.read_all in tests

Co-authored-by: Sutou Kouhei
---
 c_glib/test/dataset/test-file-system-dataset.rb | 9 +--------
 c_glib/test/dataset/test-scanner.rb             | 10 +---------
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb
index e389f6b4fe5ec..eb58f4d851863 100644
--- a/c_glib/test/dataset/test-file-system-dataset.rb
+++ b/c_glib/test/dataset/test-file-system-dataset.rb
@@ -63,14 +63,7 @@ def test_read_write
   def test_record_batch_reader
     dataset, expected_table = create_dataset
     reader = dataset.to_reader
-    record_batches = []
-    loop do
-      batch = reader.read_next
-      break if batch.nil?
-      record_batches << batch
-    end
-    table_from_batches = Arrow::Table.new(reader.schema, record_batches)
-    assert_equal(expected_table, table_from_batches)
+    assert_equal(expected_table, reader.read_all)
   end
 
   def create_dataset
diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb
index 843c2221cebab..dfa21f275936d 100644
--- a/c_glib/test/dataset/test-scanner.rb
+++ b/c_glib/test/dataset/test-scanner.rb
@@ -47,14 +47,6 @@ def test_to_table
   end
 
   def test_record_batch_reader
-    reader = @scanner.to_reader
-    record_batches = []
-    loop do
-      batch = reader.read_next
-      break if batch.nil?
-      record_batches << batch
-    end
-    table_from_batches = Arrow::Table.new(reader.schema, record_batches)
-    assert_equal(@table, table_from_batches)
+    assert_equal(@table, @scanner.to_reader.read_all)
   end
 end

From 7f5222bbff87c08ca644c36931714f9d21cbcb5b Mon Sep 17 00:00:00 2001
From: Adam Reeve
Date: Tue, 21 May 2024 21:09:01 +1200
Subject: [PATCH 5/8] Rename method to to_record_batch_reader

---
 c_glib/arrow-dataset-glib/dataset.cpp           | 12 +++++++-----
 c_glib/arrow-dataset-glib/dataset.h             |  2 +-
 c_glib/arrow-dataset-glib/scanner.cpp           |  6 +++---
 c_glib/arrow-dataset-glib/scanner.h             |  2 +-
 c_glib/test/dataset/test-file-system-dataset.rb |  2 +-
 c_glib/test/dataset/test-scanner.rb             |  2 +-
 6 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/c_glib/arrow-dataset-glib/dataset.cpp b/c_glib/arrow-dataset-glib/dataset.cpp
index 21b6a60dd8634..39b2c93037653 100644
--- a/c_glib/arrow-dataset-glib/dataset.cpp
+++ b/c_glib/arrow-dataset-glib/dataset.cpp
@@ -160,7 +160,7 @@ gadataset_dataset_to_table(GADatasetDataset *dataset, GError **error)
 }
 
 /**
- * gadataset_dataset_to_reader:
+ * gadataset_dataset_to_record_batch_reader:
 * @dataset: A #GADatasetDataset.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Returns: (transfer full) (nullable):
 *   A #GArrowRecordBatchReader on success, %NULL on error.
 *
 * Since: 17.0.0
 */
 GArrowRecordBatchReader *
-gadataset_dataset_to_reader(GADatasetDataset *dataset, GError **error)
+gadataset_dataset_to_record_batch_reader(GADatasetDataset *dataset, GError **error)
 {
   auto arrow_dataset = gadataset_dataset_get_raw(dataset);
   auto arrow_scanner_builder_result = arrow_dataset->NewScan();
-  if (!garrow::check(error, arrow_scanner_builder_result, "[dataset][get-reader]")) {
+  if (!garrow::check(error,
+                     arrow_scanner_builder_result,
+                     "[dataset][to-record-batch-reader]")) {
     return NULL;
   }
   auto arrow_scanner_builder = *arrow_scanner_builder_result;
   auto arrow_scanner_result = arrow_scanner_builder->Finish();
-  if (!garrow::check(error, arrow_scanner_result, "[dataset][get-reader]")) {
+  if (!garrow::check(error, arrow_scanner_result, "[dataset][to-record-batch-reader]")) {
     return NULL;
   }
   auto arrow_scanner = *arrow_scanner_result;
   auto arrow_reader_result = arrow_scanner->ToRecordBatchReader();
-  if (!garrow::check(error, arrow_reader_result, "[dataset][get-reader]")) {
+  if (!garrow::check(error, arrow_reader_result, "[dataset][to-record-batch-reader]")) {
     return NULL;
   }
   return garrow_record_batch_reader_new_raw(&(*arrow_reader_result), nullptr);
diff --git a/c_glib/arrow-dataset-glib/dataset.h b/c_glib/arrow-dataset-glib/dataset.h
index 465775ca02e01..81321e60ddced 100644
--- a/c_glib/arrow-dataset-glib/dataset.h
+++ b/c_glib/arrow-dataset-glib/dataset.h
@@ -36,7 +36,7 @@
 gchar *
 gadataset_dataset_get_type_name(GADatasetDataset *dataset);
 GARROW_AVAILABLE_IN_17_0
 GArrowRecordBatchReader *
-gadataset_dataset_to_reader(GADatasetDataset *dataset, GError **error);
+gadataset_dataset_to_record_batch_reader(GADatasetDataset *dataset, GError **error);
 
 #define GADATASET_TYPE_FILE_SYSTEM_DATASET_WRITE_OPTIONS \
   (gadataset_file_system_dataset_write_options_get_type())
diff --git a/c_glib/arrow-dataset-glib/scanner.cpp b/c_glib/arrow-dataset-glib/scanner.cpp
index 06912e4a5439f..75b7efca014c6 100644
--- a/c_glib/arrow-dataset-glib/scanner.cpp
+++ b/c_glib/arrow-dataset-glib/scanner.cpp
@@ -129,7 +129,7 @@ gadataset_scanner_to_table(GADatasetScanner *scanner, GError **error)
 }
 
 /**
- * gadataset_scanner_to_reader:
+ * gadataset_scanner_to_record_batch_reader:
 * @scanner: A #GADatasetScanner.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Returns: (transfer full) (nullable):
 *   A #GArrowRecordBatchReader on success, %NULL on error.
 *
 * Since: 17.0.0
 */
 GArrowRecordBatchReader *
-gadataset_scanner_to_reader(GADatasetScanner *scanner, GError **error)
+gadataset_scanner_to_record_batch_reader(GADatasetScanner *scanner, GError **error)
 {
   auto arrow_scanner = gadataset_scanner_get_raw(scanner);
   auto arrow_reader_result = arrow_scanner->ToRecordBatchReader();
-  if (!garrow::check(error, arrow_reader_result, "[scanner][get-reader]")) {
+  if (!garrow::check(error, arrow_reader_result, "[scanner][to-record-batch-reader]")) {
     return NULL;
   }
   return garrow_record_batch_reader_new_raw(&(*arrow_reader_result), nullptr);
diff --git a/c_glib/arrow-dataset-glib/scanner.h b/c_glib/arrow-dataset-glib/scanner.h
index b27732cdfba43..4e355444ad63a 100644
--- a/c_glib/arrow-dataset-glib/scanner.h
+++ b/c_glib/arrow-dataset-glib/scanner.h
@@ -37,7 +37,7 @@
 gadataset_scanner_to_table(GADatasetScanner *scanner, GError **error);
 
 GARROW_AVAILABLE_IN_17_0
 GArrowRecordBatchReader *
-gadataset_scanner_to_reader(GADatasetScanner *scanner, GError **error);
+gadataset_scanner_to_record_batch_reader(GADatasetScanner *scanner, GError **error);
 
 #define GADATASET_TYPE_SCANNER_BUILDER (gadataset_scanner_builder_get_type())
 G_DECLARE_DERIVABLE_TYPE(
diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb
index eb58f4d851863..a64022b618083 100644
--- a/c_glib/test/dataset/test-file-system-dataset.rb
+++ b/c_glib/test/dataset/test-file-system-dataset.rb
@@ -62,7 +62,7 @@ def test_read_write
 
   def test_record_batch_reader
     dataset, expected_table = create_dataset
-    reader = dataset.to_reader
+    reader = dataset.to_record_batch_reader
     assert_equal(expected_table, reader.read_all)
   end
 
diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb
index dfa21f275936d..714760d6a3fee 100644
--- a/c_glib/test/dataset/test-scanner.rb
+++ b/c_glib/test/dataset/test-scanner.rb
@@ -47,6 +47,6 @@ def test_to_table
   end
 
   def test_record_batch_reader
-    assert_equal(@table, @scanner.to_reader.read_all)
+    assert_equal(@table, @scanner.to_record_batch_reader.read_all)
   end
 end

From 4058984640c76be42615e16b8ae5211f226b431e Mon Sep 17 00:00:00 2001
From: Adam Reeve
Date: Wed, 22 May 2024 15:37:15 +1200
Subject: [PATCH 6/8] Ignore errors removing dataset directory and add GC.start

---
 c_glib/test/dataset/test-file-system-dataset.rb | 11 ++++++++++-
 c_glib/test/dataset/test-scanner.rb             | 11 ++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb
index a64022b618083..ac67c6d61d019 100644
--- a/c_glib/test/dataset/test-file-system-dataset.rb
+++ b/c_glib/test/dataset/test-file-system-dataset.rb
@@ -21,7 +21,8 @@ class TestDatasetFileSystemDataset < Test::Unit::TestCase
   def setup
     omit("Arrow Dataset is required") unless defined?(ArrowDataset)
-    Dir.mktmpdir do |tmpdir|
+    tmpdir = Dir.mktmpdir
+    begin
       @dir = tmpdir
       @format = ArrowDataset::IPCFileFormat.new
       @factory = ArrowDataset::FileSystemDatasetFactory.new(@format)
@@ -32,6 +33,14 @@ def setup
         ArrowDataset::DirectoryPartitioning.new(partitioning_schema)
       @factory.partitioning = @partitioning
       yield
+    ensure
+      # We have to ignore errors trying to remove the directory due to
+      # the RecordBatchReader not closing files
+      # (https://github.com/apache/arrow/issues/41771).
+      # Also request GC first which should free any remaining RecordBatchReader
+      # and close open files.
+      GC.start
+      FileUtils.remove_entry(tmpdir, force: true)
     end
   end
diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb
index 714760d6a3fee..ef834674cb8a0 100644
--- a/c_glib/test/dataset/test-scanner.rb
+++ b/c_glib/test/dataset/test-scanner.rb
@@ -21,7 +21,8 @@ class TestDatasetScanner < Test::Unit::TestCase
   def setup
     omit("Arrow Dataset is required") unless defined?(ArrowDataset)
-    Dir.mktmpdir do |tmpdir|
+    tmpdir = Dir.mktmpdir
+    begin
       path = File.join(tmpdir, "table.arrow")
       @table = build_table(visible: [
                              build_boolean_array([true, false, true]),
@@ -39,6 +40,14 @@ def setup
       builder = @dataset.begin_scan
       @scanner = builder.finish
       yield
+    ensure
+      # We have to ignore errors trying to remove the directory due to
+      # the RecordBatchReader not closing files
+      # (https://github.com/apache/arrow/issues/41771).
+      # Also request GC first which should free any remaining RecordBatchReader
+      # and close open files.
+      GC.start
+      FileUtils.remove_entry(tmpdir, force: true)
     end
   end

From 0475ad6f282b2da3e39a49876786eed4a5f6ba53 Mon Sep 17 00:00:00 2001
From: Adam Reeve
Date: Sat, 25 May 2024 21:39:37 +1200
Subject: [PATCH 7/8] Apply suggestions from code review

Co-authored-by: Sutou Kouhei
---
 c_glib/arrow-dataset-glib/dataset.cpp           | 9 +++++----
 c_glib/arrow-dataset-glib/scanner.cpp           | 5 +++--
 c_glib/test/dataset/test-file-system-dataset.rb | 2 +-
 c_glib/test/dataset/test-scanner.rb             | 2 +-
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/c_glib/arrow-dataset-glib/dataset.cpp b/c_glib/arrow-dataset-glib/dataset.cpp
index 39b2c93037653..f84e4e3db380a 100644
--- a/c_glib/arrow-dataset-glib/dataset.cpp
+++ b/c_glib/arrow-dataset-glib/dataset.cpp
@@ -177,19 +177,20 @@ gadataset_dataset_to_record_batch_reader(GADatasetDataset *dataset, GError **err
   if (!garrow::check(error,
                      arrow_scanner_builder_result,
                      "[dataset][to-record-batch-reader]")) {
-    return NULL;
+    return nullptr;
   }
   auto arrow_scanner_builder = *arrow_scanner_builder_result;
   auto arrow_scanner_result = arrow_scanner_builder->Finish();
   if (!garrow::check(error, arrow_scanner_result, "[dataset][to-record-batch-reader]")) {
-    return NULL;
+    return nullptr;
   }
   auto arrow_scanner = *arrow_scanner_result;
   auto arrow_reader_result = arrow_scanner->ToRecordBatchReader();
   if (!garrow::check(error, arrow_reader_result, "[dataset][to-record-batch-reader]")) {
-    return NULL;
+    return nullptr;
   }
-  return garrow_record_batch_reader_new_raw(&(*arrow_reader_result), nullptr);
+  auto sources = g_list_prepend(nullptr, dataset);
+  return garrow_record_batch_reader_new_raw(&(*arrow_reader_result), sources);
 }
 
 /**
diff --git a/c_glib/arrow-dataset-glib/scanner.cpp b/c_glib/arrow-dataset-glib/scanner.cpp
index 75b7efca014c6..28af1f16e5968 100644
--- a/c_glib/arrow-dataset-glib/scanner.cpp
+++ b/c_glib/arrow-dataset-glib/scanner.cpp
@@ -144,9 +144,10 @@ gadataset_scanner_to_record_batch_reader(GADatasetScanner *scanner, GError **err
   auto arrow_scanner = gadataset_scanner_get_raw(scanner);
   auto arrow_reader_result = arrow_scanner->ToRecordBatchReader();
   if (!garrow::check(error, arrow_reader_result, "[scanner][to-record-batch-reader]")) {
-    return NULL;
+    return nullptr;
   }
-  return garrow_record_batch_reader_new_raw(&(*arrow_reader_result), nullptr);
+  auto sources = g_list_prepend(nullptr, scanner);
+  return garrow_record_batch_reader_new_raw(&(*arrow_reader_result), sources);
 }
 
 typedef struct GADatasetScannerBuilderPrivate_
 {
   std::shared_ptr scanner_builder;
diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb
index ac67c6d61d019..f5f28920e0494 100644
--- a/c_glib/test/dataset/test-file-system-dataset.rb
+++ b/c_glib/test/dataset/test-file-system-dataset.rb
@@ -69,7 +69,7 @@ def test_read_write
     assert_equal(expected_table, dataset.to_table)
   end
 
-  def test_record_batch_reader
+  def test_to_record_batch_reader
     dataset, expected_table = create_dataset
     reader = dataset.to_record_batch_reader
     assert_equal(expected_table, reader.read_all)
diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb
index ef834674cb8a0..53f2787df4c5f 100644
--- a/c_glib/test/dataset/test-scanner.rb
+++ b/c_glib/test/dataset/test-scanner.rb
@@ -55,7 +55,7 @@ def test_to_table
     assert_equal(@table, @scanner.to_table)
   end
 
-  def test_record_batch_reader
+  def test_to_record_batch_reader
     assert_equal(@table, @scanner.to_record_batch_reader.read_all)
   end
 end

From 0dba622536804904eca40dc65f99dc42b59121ac Mon Sep 17 00:00:00 2001
From: Adam Reeve
Date: Sat, 25 May 2024 21:44:29 +1200
Subject: [PATCH 8/8] Use unref to free readers

---
 .../test/dataset/test-file-system-dataset.rb | 18 +++++++-----------
 c_glib/test/dataset/test-scanner.rb          | 19 ++++++++-----------
 2 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb
index f5f28920e0494..96deedf6b4eb0 100644
--- a/c_glib/test/dataset/test-file-system-dataset.rb
+++ b/c_glib/test/dataset/test-file-system-dataset.rb
@@ -21,8 +21,7 @@ class TestDatasetFileSystemDataset < Test::Unit::TestCase
   def setup
     omit("Arrow Dataset is required") unless defined?(ArrowDataset)
-    tmpdir = Dir.mktmpdir
-    begin
+    Dir.mktmpdir do |tmpdir|
       @dir = tmpdir
       @format = ArrowDataset::IPCFileFormat.new
       @factory = ArrowDataset::FileSystemDatasetFactory.new(@format)
@@ -33,14 +32,6 @@ def setup
         ArrowDataset::DirectoryPartitioning.new(partitioning_schema)
       @factory.partitioning = @partitioning
       yield
-    ensure
-      # We have to ignore errors trying to remove the directory due to
-      # the RecordBatchReader not closing files
-      # (https://github.com/apache/arrow/issues/41771).
-      # Also request GC first which should free any remaining RecordBatchReader
-      # and close open files.
-      GC.start
-      FileUtils.remove_entry(tmpdir, force: true)
     end
   end
@@ -72,7 +63,12 @@ def test_read_write
   def test_to_record_batch_reader
     dataset, expected_table = create_dataset
     reader = dataset.to_record_batch_reader
-    assert_equal(expected_table, reader.read_all)
+    begin
+      assert_equal(expected_table, reader.read_all)
+    ensure
+      # Unref to ensure the reader closes files and we can delete the temp directory
+      reader.unref
+    end
   end
 
   def create_dataset
diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb
index 53f2787df4c5f..5dc31eefc5f4c 100644
--- a/c_glib/test/dataset/test-scanner.rb
+++ b/c_glib/test/dataset/test-scanner.rb
@@ -21,8 +21,7 @@ class TestDatasetScanner < Test::Unit::TestCase
   def setup
     omit("Arrow Dataset is required") unless defined?(ArrowDataset)
-    tmpdir = Dir.mktmpdir
-    begin
+    Dir.mktmpdir do |tmpdir|
       path = File.join(tmpdir, "table.arrow")
       @table = build_table(visible: [
                              build_boolean_array([true, false, true]),
@@ -40,14 +39,6 @@ def setup
       builder = @dataset.begin_scan
       @scanner = builder.finish
       yield
-    ensure
-      # We have to ignore errors trying to remove the directory due to
-      # the RecordBatchReader not closing files
-      # (https://github.com/apache/arrow/issues/41771).
-      # Also request GC first which should free any remaining RecordBatchReader
-      # and close open files.
-      GC.start
-      FileUtils.remove_entry(tmpdir, force: true)
     end
   end
@@ -56,6 +47,12 @@ def test_to_table
   end
 
   def test_to_record_batch_reader
-    assert_equal(@table, @scanner.to_record_batch_reader.read_all)
+    reader = @scanner.to_record_batch_reader
+    begin
+      assert_equal(@table, reader.read_all)
+    ensure
+      # Unref to ensure the reader closes files and we can delete the temp directory
+      reader.unref
+    end
   end
 end
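For reference, a minimal usage sketch of the API added by this series, written against the same Ruby bindings the tests use. It assumes `dataset` is an already-built ArrowDataset::Dataset (for example from ArrowDataset::FileSystemDatasetFactory, as in test-file-system-dataset.rb) and only uses calls that appear in the patches above:

  # `dataset` is assumed to be an ArrowDataset::Dataset built elsewhere,
  # e.g. via ArrowDataset::FileSystemDatasetFactory as in the tests above.
  reader = dataset.to_record_batch_reader
  begin
    record_batches = []
    loop do
      batch = reader.read_next
      # read_next returns nil once the stream is exhausted
      break if batch.nil?
      record_batches << batch
    end
    table = Arrow::Table.new(reader.schema, record_batches)
  ensure
    # Unref promptly so the reader closes its files (see GH-41771 above).
    reader.unref
  end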