From 8dd6ff628b8e48eafade6c6e4782920e24ccb13b Mon Sep 17 00:00:00 2001 From: otegami Date: Tue, 25 Jul 2023 20:50:12 +0800 Subject: [PATCH 01/23] [Ruby] Add Arrow::Table#each_raw_record for iterable raw record access Add Arrow::Table#each_raw_record to make Arrow::Table#raw_records be iterable. --- ruby/red-arrow/ext/arrow/arrow.cpp | 3 ++ ruby/red-arrow/ext/arrow/raw-records.cpp | 60 ++++++++++++++++++++++-- ruby/red-arrow/ext/arrow/red-arrow.hpp | 1 + 3 files changed, 60 insertions(+), 4 deletions(-) diff --git a/ruby/red-arrow/ext/arrow/arrow.cpp b/ruby/red-arrow/ext/arrow/arrow.cpp index 8eb3b610090b5..8c1414978f1fc 100644 --- a/ruby/red-arrow/ext/arrow/arrow.cpp +++ b/ruby/red-arrow/ext/arrow/arrow.cpp @@ -87,6 +87,9 @@ extern "C" void Init_arrow() { rb_define_method(cArrowTable, "raw_records", reinterpret_cast(red_arrow::table_raw_records), 0); + rb_define_method(cArrowTable, "each_raw_record", + reinterpret_cast(red_arrow::table_each_raw_record), + 0); red_arrow::cDate = rb_const_get(rb_cObject, rb_intern("Date")); diff --git a/ruby/red-arrow/ext/arrow/raw-records.cpp b/ruby/red-arrow/ext/arrow/raw-records.cpp index e0326f9d2fdb7..f6ee63b0d4748 100644 --- a/ruby/red-arrow/ext/arrow/raw-records.cpp +++ b/ruby/red-arrow/ext/arrow/raw-records.cpp @@ -26,7 +26,9 @@ namespace red_arrow { explicit RawRecordsBuilder(VALUE records, int n_columns) : Converter(), records_(records), - n_columns_(n_columns) { + record_(Qnil), + n_columns_(n_columns), + is_produce_mode_(false) { } void build(const arrow::RecordBatch& record_batch) { @@ -68,6 +70,29 @@ namespace red_arrow { }); } + void produce(const arrow::Table& table) { + rb::protect([&] { + is_produce_mode_ = true; + const auto n_rows = table.num_rows(); + for (int64_t i = 0; i < n_rows; ++i) { + row_offset_ = i; + record_ = rb_ary_new_capa(n_columns_); + + for (int i = 0; i < n_columns_; ++i) { + const auto& chunked_array = table.column(i).get(); + column_index_ = i; + + for (const auto array : 
chunked_array->chunks()) { + check_status(array->Accept(this), + "[table][each-raw-record]"); + } + } + rb_yield(record_); + } + return Qnil; + }); + } + #define VISIT(TYPE) \ arrow::Status Visit(const arrow::TYPE ## Array& array) override { \ convert(array); \ @@ -125,9 +150,13 @@ namespace red_arrow { rb_ary_store(record, column_index_, value); } } else { - for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) { - auto record = rb_ary_entry(records_, ii); - rb_ary_store(record, column_index_, convert_value(array, i)); + if (is_produce_mode_) { + rb_ary_store(record_, column_index_, convert_value(array, row_offset_)); + } else { + for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) { + auto record = rb_ary_entry(records_, ii); + rb_ary_store(record, column_index_, convert_value(array, i)); + } } } } @@ -135,6 +164,9 @@ namespace red_arrow { // Destination for converted records. VALUE records_; + // Destination for converted record. + VALUE record_; + // The current column index. int column_index_; @@ -143,6 +175,8 @@ namespace red_arrow { // The number of columns. 
const int n_columns_; + + bool is_produce_mode_; }; } @@ -181,4 +215,22 @@ namespace red_arrow { return records; } + + VALUE + table_each_raw_record(VALUE rb_table) { + auto garrow_table = GARROW_TABLE(RVAL2GOBJ(rb_table)); + auto table = garrow_table_get_raw(garrow_table).get(); + const auto n_rows = table->num_rows(); + const auto n_columns = table->num_columns(); + auto records = rb_ary_new_capa(n_rows); + + try { + RawRecordsBuilder builder(records, n_columns); + builder.produce(*table); + } catch (rb::State& state) { + state.jump(); + } + + return Qnil; + } } diff --git a/ruby/red-arrow/ext/arrow/red-arrow.hpp b/ruby/red-arrow/ext/arrow/red-arrow.hpp index ba578076a7e39..da8d430f6e375 100644 --- a/ruby/red-arrow/ext/arrow/red-arrow.hpp +++ b/ruby/red-arrow/ext/arrow/red-arrow.hpp @@ -59,6 +59,7 @@ namespace red_arrow { VALUE record_batch_raw_records(VALUE obj); VALUE table_raw_records(VALUE obj); + VALUE table_each_raw_record(VALUE obj); inline VALUE time_unit_to_scale(const arrow::TimeUnit::type unit) { switch (unit) { From 3318abc8aa4e7d9e82da61a6b779d7556d97bee6 Mon Sep 17 00:00:00 2001 From: otegami Date: Mon, 14 Aug 2023 19:43:46 +0800 Subject: [PATCH 02/23] Introduce RawRecordProducer for specialized row-wise processing To enhance clarity and maintainability: - Implemented the new `RawRecordProducer` class, dedicated to processing records on a row-by-row basis. - This specialization eliminates conditional branching within the `convert` method previously present in the RawRecordsBuilder. 
--- ruby/red-arrow/ext/arrow/raw-record.cpp | 136 +++++++++++++++++++++++ ruby/red-arrow/ext/arrow/raw-records.cpp | 60 +--------- 2 files changed, 140 insertions(+), 56 deletions(-) create mode 100644 ruby/red-arrow/ext/arrow/raw-record.cpp diff --git a/ruby/red-arrow/ext/arrow/raw-record.cpp b/ruby/red-arrow/ext/arrow/raw-record.cpp new file mode 100644 index 0000000000000..9aef572520b02 --- /dev/null +++ b/ruby/red-arrow/ext/arrow/raw-record.cpp @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "converters.hpp" + +namespace red_arrow { + namespace { + class RawRecordProducer : private Converter, public arrow::ArrayVisitor { + public: + explicit RawRecordProducer(int n_columns) + : Converter(), + record_(Qnil), + n_columns_(n_columns) { + } + + void produce(const arrow::Table& table) { + rb::protect([&] { + const auto n_rows = table.num_rows(); + for (int64_t i = 0; i < n_rows; ++i) { + row_offset_ = i; + record_ = rb_ary_new_capa(n_columns_); + + for (int i = 0; i < n_columns_; ++i) { + const auto& chunked_array = table.column(i).get(); + column_index_ = i; + + for (const auto array : chunked_array->chunks()) { + check_status(array->Accept(this), + "[table][each-raw-record]"); + } + } + rb_yield(record_); + } + return Qnil; + }); + } + +#define VISIT(TYPE) \ + arrow::Status Visit(const arrow::TYPE ## Array& array) override { \ + convert(array); \ + return arrow::Status::OK(); \ + } + + VISIT(Null) + VISIT(Boolean) + VISIT(Int8) + VISIT(Int16) + VISIT(Int32) + VISIT(Int64) + VISIT(UInt8) + VISIT(UInt16) + VISIT(UInt32) + VISIT(UInt64) + VISIT(HalfFloat) + VISIT(Float) + VISIT(Double) + VISIT(Binary) + VISIT(String) + VISIT(FixedSizeBinary) + VISIT(Date32) + VISIT(Date64) + VISIT(Time32) + VISIT(Time64) + VISIT(Timestamp) + VISIT(MonthInterval) + VISIT(DayTimeInterval) + VISIT(MonthDayNanoInterval) + VISIT(List) + VISIT(Struct) + VISIT(Map) + VISIT(SparseUnion) + VISIT(DenseUnion) + VISIT(Dictionary) + VISIT(Decimal128) + VISIT(Decimal256) + // TODO + // VISIT(Extension) + +#undef VISIT + + private: + template + void convert(const ArrayType& array) { + auto value = Qnil; + if (!array.IsNull(row_offset_)) { + value = convert_value(array, row_offset_); + } + rb_ary_store(record_, column_index_, value); + } + + // Destination for converted record. + VALUE record_; + + // The current column index. + int column_index_; + + // The current row offset. + int64_t row_offset_; + + // The number of columns. 
+ const int n_columns_; + }; + } + + VALUE + table_each_raw_record(VALUE rb_table) { + auto garrow_table = GARROW_TABLE(RVAL2GOBJ(rb_table)); + auto table = garrow_table_get_raw(garrow_table).get(); + const auto n_columns = table->num_columns(); + + try { + RawRecordProducer producer(n_columns); + producer.produce(*table); + } catch (rb::State& state) { + state.jump(); + } + + return Qnil; + } +} diff --git a/ruby/red-arrow/ext/arrow/raw-records.cpp b/ruby/red-arrow/ext/arrow/raw-records.cpp index f6ee63b0d4748..e0326f9d2fdb7 100644 --- a/ruby/red-arrow/ext/arrow/raw-records.cpp +++ b/ruby/red-arrow/ext/arrow/raw-records.cpp @@ -26,9 +26,7 @@ namespace red_arrow { explicit RawRecordsBuilder(VALUE records, int n_columns) : Converter(), records_(records), - record_(Qnil), - n_columns_(n_columns), - is_produce_mode_(false) { + n_columns_(n_columns) { } void build(const arrow::RecordBatch& record_batch) { @@ -70,29 +68,6 @@ namespace red_arrow { }); } - void produce(const arrow::Table& table) { - rb::protect([&] { - is_produce_mode_ = true; - const auto n_rows = table.num_rows(); - for (int64_t i = 0; i < n_rows; ++i) { - row_offset_ = i; - record_ = rb_ary_new_capa(n_columns_); - - for (int i = 0; i < n_columns_; ++i) { - const auto& chunked_array = table.column(i).get(); - column_index_ = i; - - for (const auto array : chunked_array->chunks()) { - check_status(array->Accept(this), - "[table][each-raw-record]"); - } - } - rb_yield(record_); - } - return Qnil; - }); - } - #define VISIT(TYPE) \ arrow::Status Visit(const arrow::TYPE ## Array& array) override { \ convert(array); \ @@ -150,13 +125,9 @@ namespace red_arrow { rb_ary_store(record, column_index_, value); } } else { - if (is_produce_mode_) { - rb_ary_store(record_, column_index_, convert_value(array, row_offset_)); - } else { - for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) { - auto record = rb_ary_entry(records_, ii); - rb_ary_store(record, column_index_, convert_value(array, i)); - } + for (int64_t i 
= 0, ii = row_offset_; i < n; ++i, ++ii) { + auto record = rb_ary_entry(records_, ii); + rb_ary_store(record, column_index_, convert_value(array, i)); } } } @@ -164,9 +135,6 @@ namespace red_arrow { // Destination for converted records. VALUE records_; - // Destination for converted record. - VALUE record_; - // The current column index. int column_index_; @@ -175,8 +143,6 @@ namespace red_arrow { // The number of columns. const int n_columns_; - - bool is_produce_mode_; }; } @@ -215,22 +181,4 @@ namespace red_arrow { return records; } - - VALUE - table_each_raw_record(VALUE rb_table) { - auto garrow_table = GARROW_TABLE(RVAL2GOBJ(rb_table)); - auto table = garrow_table_get_raw(garrow_table).get(); - const auto n_rows = table->num_rows(); - const auto n_columns = table->num_columns(); - auto records = rb_ary_new_capa(n_rows); - - try { - RawRecordsBuilder builder(records, n_columns); - builder.produce(*table); - } catch (rb::State& state) { - state.jump(); - } - - return Qnil; - } } From d3e5f3f0307bb7036a02e32d26c17f8d7c68af88 Mon Sep 17 00:00:00 2001 From: otegami Date: Wed, 16 Aug 2023 07:51:42 +0800 Subject: [PATCH 03/23] Refactor RawRecordsProducer for Row-Wise Processing in raw-records.cpp Because it produces multiple raw records. --- ruby/red-arrow/ext/arrow/raw-record.cpp | 136 ----------------------- ruby/red-arrow/ext/arrow/raw-records.cpp | 112 +++++++++++++++++++ 2 files changed, 112 insertions(+), 136 deletions(-) delete mode 100644 ruby/red-arrow/ext/arrow/raw-record.cpp diff --git a/ruby/red-arrow/ext/arrow/raw-record.cpp b/ruby/red-arrow/ext/arrow/raw-record.cpp deleted file mode 100644 index 9aef572520b02..0000000000000 --- a/ruby/red-arrow/ext/arrow/raw-record.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include "converters.hpp" - -namespace red_arrow { - namespace { - class RawRecordProducer : private Converter, public arrow::ArrayVisitor { - public: - explicit RawRecordProducer(int n_columns) - : Converter(), - record_(Qnil), - n_columns_(n_columns) { - } - - void produce(const arrow::Table& table) { - rb::protect([&] { - const auto n_rows = table.num_rows(); - for (int64_t i = 0; i < n_rows; ++i) { - row_offset_ = i; - record_ = rb_ary_new_capa(n_columns_); - - for (int i = 0; i < n_columns_; ++i) { - const auto& chunked_array = table.column(i).get(); - column_index_ = i; - - for (const auto array : chunked_array->chunks()) { - check_status(array->Accept(this), - "[table][each-raw-record]"); - } - } - rb_yield(record_); - } - return Qnil; - }); - } - -#define VISIT(TYPE) \ - arrow::Status Visit(const arrow::TYPE ## Array& array) override { \ - convert(array); \ - return arrow::Status::OK(); \ - } - - VISIT(Null) - VISIT(Boolean) - VISIT(Int8) - VISIT(Int16) - VISIT(Int32) - VISIT(Int64) - VISIT(UInt8) - VISIT(UInt16) - VISIT(UInt32) - VISIT(UInt64) - VISIT(HalfFloat) - VISIT(Float) - VISIT(Double) - VISIT(Binary) - VISIT(String) - VISIT(FixedSizeBinary) - VISIT(Date32) - VISIT(Date64) - VISIT(Time32) - VISIT(Time64) - VISIT(Timestamp) - VISIT(MonthInterval) - VISIT(DayTimeInterval) - VISIT(MonthDayNanoInterval) - VISIT(List) - VISIT(Struct) - VISIT(Map) - VISIT(SparseUnion) 
- VISIT(DenseUnion) - VISIT(Dictionary) - VISIT(Decimal128) - VISIT(Decimal256) - // TODO - // VISIT(Extension) - -#undef VISIT - - private: - template - void convert(const ArrayType& array) { - auto value = Qnil; - if (!array.IsNull(row_offset_)) { - value = convert_value(array, row_offset_); - } - rb_ary_store(record_, column_index_, value); - } - - // Destination for converted record. - VALUE record_; - - // The current column index. - int column_index_; - - // The current row offset. - int64_t row_offset_; - - // The number of columns. - const int n_columns_; - }; - } - - VALUE - table_each_raw_record(VALUE rb_table) { - auto garrow_table = GARROW_TABLE(RVAL2GOBJ(rb_table)); - auto table = garrow_table_get_raw(garrow_table).get(); - const auto n_columns = table->num_columns(); - - try { - RawRecordProducer producer(n_columns); - producer.produce(*table); - } catch (rb::State& state) { - state.jump(); - } - - return Qnil; - } -} diff --git a/ruby/red-arrow/ext/arrow/raw-records.cpp b/ruby/red-arrow/ext/arrow/raw-records.cpp index e0326f9d2fdb7..048d2166cfc32 100644 --- a/ruby/red-arrow/ext/arrow/raw-records.cpp +++ b/ruby/red-arrow/ext/arrow/raw-records.cpp @@ -144,6 +144,102 @@ namespace red_arrow { // The number of columns. 
const int n_columns_; }; + + class RawRecordsProducer : private Converter, public arrow::ArrayVisitor { + public: + explicit RawRecordsProducer(int n_columns) + : Converter(), + record_(Qnil), + n_columns_(n_columns) { + } + + void produce(const arrow::Table& table) { + rb::protect([&] { + const auto n_rows = table.num_rows(); + for (int64_t i = 0; i < n_rows; ++i) { + row_offset_ = i; + record_ = rb_ary_new_capa(n_columns_); + + for (int i = 0; i < n_columns_; ++i) { + const auto& chunked_array = table.column(i).get(); + column_index_ = i; + + for (const auto array : chunked_array->chunks()) { + check_status(array->Accept(this), + "[table][each-raw-record]"); + } + } + rb_yield(record_); + } + return Qnil; + }); + } + +#define VISIT(TYPE) \ + arrow::Status Visit(const arrow::TYPE ## Array& array) override { \ + convert(array); \ + return arrow::Status::OK(); \ + } + + VISIT(Null) + VISIT(Boolean) + VISIT(Int8) + VISIT(Int16) + VISIT(Int32) + VISIT(Int64) + VISIT(UInt8) + VISIT(UInt16) + VISIT(UInt32) + VISIT(UInt64) + VISIT(HalfFloat) + VISIT(Float) + VISIT(Double) + VISIT(Binary) + VISIT(String) + VISIT(FixedSizeBinary) + VISIT(Date32) + VISIT(Date64) + VISIT(Time32) + VISIT(Time64) + VISIT(Timestamp) + VISIT(MonthInterval) + VISIT(DayTimeInterval) + VISIT(MonthDayNanoInterval) + VISIT(List) + VISIT(Struct) + VISIT(Map) + VISIT(SparseUnion) + VISIT(DenseUnion) + VISIT(Dictionary) + VISIT(Decimal128) + VISIT(Decimal256) + // TODO + // VISIT(Extension) + +#undef VISIT + + private: + template + void convert(const ArrayType& array) { + auto value = Qnil; + if (!array.IsNull(row_offset_)) { + value = convert_value(array, row_offset_); + } + rb_ary_store(record_, column_index_, value); + } + + // Destination for converted record. + VALUE record_; + + // The current column index. + int column_index_; + + // The current row offset. + int64_t row_offset_; + + // The number of columns. 
+ const int n_columns_; + }; } VALUE @@ -181,4 +277,20 @@ namespace red_arrow { return records; } + + VALUE + table_each_raw_record(VALUE rb_table) { + auto garrow_table = GARROW_TABLE(RVAL2GOBJ(rb_table)); + auto table = garrow_table_get_raw(garrow_table).get(); + const auto n_columns = table->num_columns(); + + try { + RawRecordsProducer producer(n_columns); + producer.produce(*table); + } catch (rb::State& state) { + state.jump(); + } + + return Qnil; + } } From 93adaaca4dfa44521814193e980e170ebe563fcc Mon Sep 17 00:00:00 2001 From: otegami Date: Wed, 16 Aug 2023 08:05:37 +0800 Subject: [PATCH 04/23] Add EachRawRecordBasicArraysTest#test_boolean --- .../test/each-raw-record/test-basic-arrays.rb | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb diff --git a/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb b/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb new file mode 100644 index 0000000000000..35e38eacedc7a --- /dev/null +++ b/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +module EachRawRecordBasicArraysTests + def test_boolean + records = [ + [true], + [nil], + [false], + ] + + iterated_records = [] + target = build({column: :boolean}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end +end + +class EachRawRecordBasicArraysTest < Test::Unit::TestCase + include EachRawRecordBasicArraysTests + + def build(schema, records) + Arrow::Table.new(schema, records) + end +end From a16c4c42b25a28aa4f66f460e8034986f0e51f67 Mon Sep 17 00:00:00 2001 From: otegami Date: Thu, 17 Aug 2023 21:53:09 +0800 Subject: [PATCH 05/23] Add the additional test cases about the other data types to EachRawRecordBasicArraysTests --- .../test/each-raw-record/test-basic-arrays.rb | 477 +++++++++++++++++- 1 file changed, 476 insertions(+), 1 deletion(-) diff --git a/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb b/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb index 35e38eacedc7a..958c283cf7821 100644 --- a/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb +++ b/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb @@ -16,13 +16,26 @@ # under the License. 
module EachRawRecordBasicArraysTests + def test_null + records = [ + [nil], + [nil], + [nil], + ] + iterated_records = [] + target = build({column: :null}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + def test_boolean records = [ [true], [nil], [false], ] - iterated_records = [] target = build({column: :boolean}, records) target.each_raw_record do |record| @@ -30,6 +43,468 @@ def test_boolean end assert_equal(records, iterated_records) end + + def test_int8 + records = [ + [-(2 ** 7)], + [nil], + [(2 ** 7) - 1], + ] + iterated_records = [] + target = build({column: :int8}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint8 + records = [ + [0], + [nil], + [(2 ** 8) - 1], + ] + iterated_records = [] + target = build({column: :uint8}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int16 + records = [ + [-(2 ** 15)], + [nil], + [(2 ** 15) - 1], + ] + iterated_records = [] + target = build({column: :int16}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint16 + records = [ + [0], + [nil], + [(2 ** 16) - 1], + ] + iterated_records = [] + target = build({column: :uint16}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int32 + records = [ + [-(2 ** 31)], + [nil], + [(2 ** 31) - 1], + ] + iterated_records = [] + target = build({column: :int32}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint32 + records = [ + [0], + [nil], + [(2 ** 32) - 1], + ] + iterated_records = [] + target = build({column: :uint32}, records) + 
target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int64 + records = [ + [-(2 ** 63)], + [nil], + [(2 ** 63) - 1], + ] + iterated_records = [] + target = build({column: :int64}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint64 + records = [ + [0], + [nil], + [(2 ** 64) - 1], + ] + iterated_records = [] + target = build({column: :uint64}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_half_float + records = [ + [-1.5], + [nil], + [1.5], + ] + iterated_records = [] + target = build({column: :half_float}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_float + records = [ + [-1.0], + [nil], + [1.0], + ] + iterated_records = [] + target = build({column: :float}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_double + records = [ + [-1.0], + [nil], + [1.0], + ] + iterated_records = [] + target = build({column: :double}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_binary + records = [ + ["\x00".b], + [nil], + ["\xff".b], + ] + iterated_records = [] + target = build({column: :binary}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_string + records = [ + ["Ruby"], + [nil], + ["\u3042"], # U+3042 HIRAGANA LETTER A + ] + iterated_records = [] + target = build({column: :string}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_date32 + 
records = [ + [Date.new(1960, 1, 1)], + [nil], + [Date.new(2017, 8, 23)], + ] + iterated_records = [] + target = build({column: :date32}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_date64 + records = [ + [DateTime.new(1960, 1, 1, 2, 9, 30)], + [nil], + [DateTime.new(2017, 8, 23, 14, 57, 2)], + ] + iterated_records = [] + target = build({column: :date64}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_second + records = [ + [Time.parse("1960-01-01T02:09:30Z")], + [nil], + [Time.parse("2017-08-23T14:57:02Z")], + ] + iterated_records = [] + target = build({ + column: { + type: :timestamp, + unit: :second, + } + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_milli + records = [ + [Time.parse("1960-01-01T02:09:30.123Z")], + [nil], + [Time.parse("2017-08-23T14:57:02.987Z")], + ] + iterated_records = [] + target = build({ + column: { + type: :timestamp, + unit: :milli, + } + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_micro + records = [ + [Time.parse("1960-01-01T02:09:30.123456Z")], + [nil], + [Time.parse("2017-08-23T14:57:02.987654Z")], + ] + iterated_records = [] + target = build({ + column: { + type: :timestamp, + unit: :micro, + } + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_nano + records = [ + [Time.parse("1960-01-01T02:09:30.123456789Z")], + [nil], + [Time.parse("2017-08-23T14:57:02.987654321Z")], + ] + iterated_records = [] + target = build({ + column: { + type: :timestamp, + unit: :nano, + } + }, + records) + target.each_raw_record 
do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time32_second + unit = Arrow::TimeUnit::SECOND + records = [ + [Arrow::Time.new(unit, 60 * 10)], # 00:10:00 + [nil], + [Arrow::Time.new(unit, 60 * 60 * 2 + 9)], # 02:00:09 + ] + iterated_records = [] + target = build({ + column: { + type: :time32, + unit: :second, + } + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time32_milli + unit = Arrow::TimeUnit::MILLI + records = [ + [Arrow::Time.new(unit, (60 * 10) * 1000 + 123)], # 00:10:00.123 + [nil], + [Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1000 + 987)], # 02:00:09.987 + ] + iterated_records = [] + target = build({ + column: { + type: :time32, + unit: :milli, + } + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time64_micro + unit = Arrow::TimeUnit::MICRO + records = [ + # 00:10:00.123456 + [Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456)], + [nil], + # 02:00:09.987654 + [Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000 + 987_654)], + ] + iterated_records = [] + target = build({ + column: { + type: :time64, + unit: :micro, + } + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time64_nano + unit = Arrow::TimeUnit::NANO + records = [ + # 00:10:00.123456789 + [Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789)], + [nil], + # 02:00:09.987654321 + [Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000_000 + 987_654_321)], + ] + iterated_records = [] + target = build({ + column: { + type: :time64, + unit: :nano, + } + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_decimal128 + records = [ + 
[BigDecimal("92.92")], + [nil], + [BigDecimal("29.29")], + ] + iterated_records = [] + target = build({ + column: { + type: :decimal128, + precision: 8, + scale: 2, + } + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_decimal256 + records = [ + [BigDecimal("92.92")], + [nil], + [BigDecimal("29.29")], + ] + iterated_records = [] + target = build({ + column: { + type: :decimal256, + precision: 38, + scale: 2, + } + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_month_interval + records = [ + [1], + [nil], + [12], + ] + iterated_records = [] + target = build({column: :month_interval}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_day_time_interval + records = [ + [{day: 1, millisecond: 100}], + [nil], + [{day: 2, millisecond: 300}], + ] + iterated_records = [] + target = build({column: :day_time_interval}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_month_day_nano_interval + records = [ + [{month: 1, day: 1, nanosecond: 100}], + [nil], + [{month: 2, day: 3, nanosecond: 400}], + ] + iterated_records = [] + target = build({column: :month_day_nano_interval}, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end end class EachRawRecordBasicArraysTest < Test::Unit::TestCase From b35401173653d61c900de3f2759a21b50d2c4c20 Mon Sep 17 00:00:00 2001 From: otegami Date: Thu, 17 Aug 2023 22:10:05 +0800 Subject: [PATCH 06/23] Implemented RecordBatch#each_raw_record --- ruby/red-arrow/ext/arrow/arrow.cpp | 3 ++ ruby/red-arrow/ext/arrow/raw-records.cpp | 36 +++++++++++++++++++ ruby/red-arrow/ext/arrow/red-arrow.hpp | 1 + 
.../test/each-raw-record/test-basic-arrays.rb | 10 +++++- 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/ruby/red-arrow/ext/arrow/arrow.cpp b/ruby/red-arrow/ext/arrow/arrow.cpp index 8c1414978f1fc..404ec8996f232 100644 --- a/ruby/red-arrow/ext/arrow/arrow.cpp +++ b/ruby/red-arrow/ext/arrow/arrow.cpp @@ -82,6 +82,9 @@ extern "C" void Init_arrow() { rb_define_method(cArrowRecordBatch, "raw_records", reinterpret_cast(red_arrow::record_batch_raw_records), 0); + rb_define_method(cArrowRecordBatch, "each_raw_record", + reinterpret_cast(red_arrow::record_batch_each_raw_record), + 0); auto cArrowTable = rb_const_get_at(mArrow, rb_intern("Table")); rb_define_method(cArrowTable, "raw_records", diff --git a/ruby/red-arrow/ext/arrow/raw-records.cpp b/ruby/red-arrow/ext/arrow/raw-records.cpp index 048d2166cfc32..8a30dae8d0c7a 100644 --- a/ruby/red-arrow/ext/arrow/raw-records.cpp +++ b/ruby/red-arrow/ext/arrow/raw-records.cpp @@ -153,6 +153,26 @@ namespace red_arrow { n_columns_(n_columns) { } + void produce(const arrow::RecordBatch& record_batch) { + rb::protect([&] { + const auto n_rows = record_batch.num_rows(); + for (int64_t i = 0; i < n_rows; ++i) { + record_ = rb_ary_new_capa(n_columns_); + row_offset_ = i; + + for (int i = 0; i < n_columns_; ++i) { + const auto array = record_batch.column(i).get(); + column_index_ = i; + + check_status(array->Accept(this), + "[record-batch][each-raw-record]"); + } + rb_yield(record_); + } + return Qnil; + }); + } + void produce(const arrow::Table& table) { rb::protect([&] { const auto n_rows = table.num_rows(); @@ -278,6 +298,22 @@ namespace red_arrow { return records; } + VALUE + record_batch_each_raw_record(VALUE rb_record_batch){ + auto garrow_record_batch = GARROW_RECORD_BATCH(RVAL2GOBJ(rb_record_batch)); + auto record_batch = garrow_record_batch_get_raw(garrow_record_batch).get(); + const auto n_columns = record_batch->num_columns(); + + try { + RawRecordsProducer producer(n_columns); + 
producer.produce(*record_batch); + } catch (rb::State& state) { + state.jump(); + } + + return Qnil; + } + VALUE table_each_raw_record(VALUE rb_table) { auto garrow_table = GARROW_TABLE(RVAL2GOBJ(rb_table)); diff --git a/ruby/red-arrow/ext/arrow/red-arrow.hpp b/ruby/red-arrow/ext/arrow/red-arrow.hpp index da8d430f6e375..ffc24f9844bb0 100644 --- a/ruby/red-arrow/ext/arrow/red-arrow.hpp +++ b/ruby/red-arrow/ext/arrow/red-arrow.hpp @@ -59,6 +59,7 @@ namespace red_arrow { VALUE record_batch_raw_records(VALUE obj); VALUE table_raw_records(VALUE obj); + VALUE record_batch_each_raw_record(VALUE obj); VALUE table_each_raw_record(VALUE obj); inline VALUE time_unit_to_scale(const arrow::TimeUnit::type unit) { diff --git a/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb b/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb index 958c283cf7821..8a30c6bc81126 100644 --- a/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb +++ b/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb @@ -507,7 +507,15 @@ def test_month_day_nano_interval end end -class EachRawRecordBasicArraysTest < Test::Unit::TestCase +class EachRawRecordRecordBatchBasicArraysTest< Test::Unit::TestCase + include EachRawRecordBasicArraysTests + + def build(schema, records) + Arrow::RecordBatch.new(schema, records) + end +end + +class EachRawRecordTableBasicArraysTest < Test::Unit::TestCase include EachRawRecordBasicArraysTests def build(schema, records) From 92b4b76c5f6c885a48a8bb960130e15259bafd2c Mon Sep 17 00:00:00 2001 From: otegami Date: Fri, 18 Aug 2023 07:41:51 +0800 Subject: [PATCH 07/23] Fix typo test_tring -> test_string --- ruby/red-arrow/test/raw-records/test-basic-arrays.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ruby/red-arrow/test/raw-records/test-basic-arrays.rb b/ruby/red-arrow/test/raw-records/test-basic-arrays.rb index 15cdee68209e7..9b95a30ad44d5 100644 --- a/ruby/red-arrow/test/raw-records/test-basic-arrays.rb +++ 
b/ruby/red-arrow/test/raw-records/test-basic-arrays.rb @@ -157,7 +157,7 @@ def test_binary assert_equal(records, target.raw_records) end - def test_tring + def test_string records = [ ["Ruby"], [nil], From 66ea5805a6cc023043e9dcba2c8b882629300640 Mon Sep 17 00:00:00 2001 From: otegami Date: Fri, 18 Aug 2023 20:33:05 +0800 Subject: [PATCH 08/23] Avoid needless reference count increment --- ruby/red-arrow/ext/arrow/raw-records.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ruby/red-arrow/ext/arrow/raw-records.cpp b/ruby/red-arrow/ext/arrow/raw-records.cpp index 8a30dae8d0c7a..86c0c2e5a1ba7 100644 --- a/ruby/red-arrow/ext/arrow/raw-records.cpp +++ b/ruby/red-arrow/ext/arrow/raw-records.cpp @@ -184,7 +184,7 @@ namespace red_arrow { const auto& chunked_array = table.column(i).get(); column_index_ = i; - for (const auto array : chunked_array->chunks()) { + for (const auto& array : chunked_array->chunks()) { check_status(array->Accept(this), "[table][each-raw-record]"); } From b0731a78e6ef5e30d6451db244ffd7f6f7b45a5d Mon Sep 17 00:00:00 2001 From: otegami Date: Fri, 18 Aug 2023 20:44:30 +0800 Subject: [PATCH 09/23] Refactor n_columns definition in produce methods --- ruby/red-arrow/ext/arrow/raw-records.cpp | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/ruby/red-arrow/ext/arrow/raw-records.cpp b/ruby/red-arrow/ext/arrow/raw-records.cpp index 86c0c2e5a1ba7..3460cc09614bc 100644 --- a/ruby/red-arrow/ext/arrow/raw-records.cpp +++ b/ruby/red-arrow/ext/arrow/raw-records.cpp @@ -147,20 +147,20 @@ namespace red_arrow { class RawRecordsProducer : private Converter, public arrow::ArrayVisitor { public: - explicit RawRecordsProducer(int n_columns) + explicit RawRecordsProducer() : Converter(), - record_(Qnil), - n_columns_(n_columns) { + record_(Qnil) { } void produce(const arrow::RecordBatch& record_batch) { rb::protect([&] { + auto n_columns = record_batch.num_columns(); const auto n_rows = 
record_batch.num_rows(); for (int64_t i = 0; i < n_rows; ++i) { - record_ = rb_ary_new_capa(n_columns_); + record_ = rb_ary_new_capa(n_columns); row_offset_ = i; - for (int i = 0; i < n_columns_; ++i) { + for (int i = 0; i < n_columns; ++i) { const auto array = record_batch.column(i).get(); column_index_ = i; @@ -175,12 +175,13 @@ namespace red_arrow { void produce(const arrow::Table& table) { rb::protect([&] { + auto n_columns = table.num_columns(); const auto n_rows = table.num_rows(); for (int64_t i = 0; i < n_rows; ++i) { row_offset_ = i; - record_ = rb_ary_new_capa(n_columns_); + record_ = rb_ary_new_capa(n_columns); - for (int i = 0; i < n_columns_; ++i) { + for (int i = 0; i < n_columns; ++i) { const auto& chunked_array = table.column(i).get(); column_index_ = i; @@ -256,9 +257,6 @@ namespace red_arrow { // The current row offset. int64_t row_offset_; - - // The number of columns. - const int n_columns_; }; } @@ -302,10 +300,9 @@ namespace red_arrow { record_batch_each_raw_record(VALUE rb_record_batch){ auto garrow_record_batch = GARROW_RECORD_BATCH(RVAL2GOBJ(rb_record_batch)); auto record_batch = garrow_record_batch_get_raw(garrow_record_batch).get(); - const auto n_columns = record_batch->num_columns(); try { - RawRecordsProducer producer(n_columns); + RawRecordsProducer producer; producer.produce(*record_batch); } catch (rb::State& state) { state.jump(); @@ -318,10 +315,9 @@ namespace red_arrow { table_each_raw_record(VALUE rb_table) { auto garrow_table = GARROW_TABLE(RVAL2GOBJ(rb_table)); auto table = garrow_table_get_raw(garrow_table).get(); - const auto n_columns = table->num_columns(); try { - RawRecordsProducer producer(n_columns); + RawRecordsProducer producer; producer.produce(*table); } catch (rb::State& state) { state.jump(); From 34b53e0a3ed95788cdcc617b010093636a3ad8c3 Mon Sep 17 00:00:00 2001 From: otegami Date: Sat, 19 Aug 2023 17:18:03 +0800 Subject: [PATCH 10/23] Support multi-chunk ArrowTable structures for RawRecordsProducer#produce - 
Enhanced the `produce` method to handle ArrowTables with multiple chunks. - Modified the test to simulate ArrowTable structures having multiple chunks using `Arrow::RecordBatch`. --- ruby/red-arrow/ext/arrow/raw-records.cpp | 28 +++++++++---------- .../test/each-raw-record/test-basic-arrays.rb | 6 +++- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/ruby/red-arrow/ext/arrow/raw-records.cpp b/ruby/red-arrow/ext/arrow/raw-records.cpp index 3460cc09614bc..c6714e425ece4 100644 --- a/ruby/red-arrow/ext/arrow/raw-records.cpp +++ b/ruby/red-arrow/ext/arrow/raw-records.cpp @@ -159,11 +159,9 @@ namespace red_arrow { for (int64_t i = 0; i < n_rows; ++i) { record_ = rb_ary_new_capa(n_columns); row_offset_ = i; - for (int i = 0; i < n_columns; ++i) { const auto array = record_batch.column(i).get(); column_index_ = i; - check_status(array->Accept(this), "[record-batch][each-raw-record]"); } @@ -175,23 +173,23 @@ namespace red_arrow { void produce(const arrow::Table& table) { rb::protect([&] { - auto n_columns = table.num_columns(); - const auto n_rows = table.num_rows(); - for (int64_t i = 0; i < n_rows; ++i) { - row_offset_ = i; - record_ = rb_ary_new_capa(n_columns); - - for (int i = 0; i < n_columns; ++i) { - const auto& chunked_array = table.column(i).get(); - column_index_ = i; - - for (const auto& array : chunked_array->chunks()) { + int n_columns = table.num_columns(); + const auto& base_column = table.column(0); + for (int i = 0; i < base_column->num_chunks(); ++i) { + int chunk_length = base_column->chunk(i)->length(); + for (int j = 0; j < chunk_length; ++j) { + row_offset_ = j; + record_ = rb_ary_new_capa(n_columns); + for (int k = 0; k < n_columns; ++k) { + column_index_ = k; + const auto& array = table.column(k)->chunk(i); check_status(array->Accept(this), - "[table][each-raw-record]"); + "[table][each-raw-record]"); } + rb_yield(record_); } - rb_yield(record_); } + return Qnil; }); } diff --git 
a/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb b/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb index 8a30c6bc81126..b55a2c3c10b86 100644 --- a/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb +++ b/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb @@ -519,6 +519,10 @@ class EachRawRecordTableBasicArraysTest < Test::Unit::TestCase include EachRawRecordBasicArraysTests def build(schema, records) - Arrow::Table.new(schema, records) + Arrow::Table.new(schema, + [ + Arrow::RecordBatch.new(schema, records[0, 2]), + Arrow::RecordBatch.new(schema, records[2..-1]), + ]) end end From 83059264b16cc0d32bb0de98d7e70bac2b07a663 Mon Sep 17 00:00:00 2001 From: takuya kodama Date: Mon, 21 Aug 2023 20:13:40 +0800 Subject: [PATCH 11/23] Initialized all member variables Co-authored-by: Sutou Kouhei --- ruby/red-arrow/ext/arrow/raw-records.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ruby/red-arrow/ext/arrow/raw-records.cpp b/ruby/red-arrow/ext/arrow/raw-records.cpp index c6714e425ece4..d732f44f2cae6 100644 --- a/ruby/red-arrow/ext/arrow/raw-records.cpp +++ b/ruby/red-arrow/ext/arrow/raw-records.cpp @@ -149,7 +149,9 @@ namespace red_arrow { public: explicit RawRecordsProducer() : Converter(), - record_(Qnil) { + record_(Qnil), + column_index_(0), + row_offset_(0) { } void produce(const arrow::RecordBatch& record_batch) { From c0a07e24b61008d88caddb444d9575810d8eca26 Mon Sep 17 00:00:00 2001 From: takuya kodama Date: Mon, 21 Aug 2023 20:16:04 +0800 Subject: [PATCH 12/23] Update data retrieval logic to handle varying chunk layouts Adjusted logic to support columns with different chunk structures, such as the first column having [[1], [2, 3], [4]] while the second contains [[true, false], [false, true]]. 
Co-authored-by: Sutou Kouhei --- ruby/red-arrow/ext/arrow/raw-records.cpp | 34 +++++++++++++++--------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/ruby/red-arrow/ext/arrow/raw-records.cpp b/ruby/red-arrow/ext/arrow/raw-records.cpp index d732f44f2cae6..2d595624e8687 100644 --- a/ruby/red-arrow/ext/arrow/raw-records.cpp +++ b/ruby/red-arrow/ext/arrow/raw-records.cpp @@ -175,21 +175,29 @@ namespace red_arrow { void produce(const arrow::Table& table) { rb::protect([&] { - int n_columns = table.num_columns(); - const auto& base_column = table.column(0); - for (int i = 0; i < base_column->num_chunks(); ++i) { - int chunk_length = base_column->chunk(i)->length(); - for (int j = 0; j < chunk_length; ++j) { - row_offset_ = j; - record_ = rb_ary_new_capa(n_columns); - for (int k = 0; k < n_columns; ++k) { - column_index_ = k; - const auto& array = table.column(k)->chunk(i); - check_status(array->Accept(this), - "[table][each-raw-record]"); + auto n_columns = table.num_columns(); + auto n_rows = table.num_rows(); + std::vector<int> chunk_indexes(n_columns); + std::vector<int64_t> row_offsets(n_columns); + for (int64_t i_row = 0; i_row < n_rows; ++i_row) { + record_ = rb_ary_new_capa(n_columns); + for (int i_column = 0; i_column < n_columns; ++i_column) { + column_index_ = i_column; + const auto chunked_array = table.column(i_column).get(); + auto& chunk_index = chunk_indexes[i_column]; + auto& row_offset = row_offsets[i_column]; + auto array = chunked_array->chunk(chunk_index).get(); + while (array->length() == row_offset) { + ++chunk_index; + row_offset = 0; + array = chunked_array->chunk(chunk_index).get(); } - rb_yield(record_); + row_offset_ = row_offset; + check_status(array->Accept(this), + "[table][each-raw-record]"); + ++row_offset; } + rb_yield(record_); } return Qnil; From 1089b8ccdea0b1945cc545e12d494bf402c73fbd Mon Sep 17 00:00:00 2001 From: takuya kodama Date: Mon, 21 Aug 2023 20:18:59 +0800 Subject: [PATCH 13/23] Add empty chunk test case for the edge case
Co-authored-by: Sutou Kouhei --- .../test/each-raw-record/test-basic-arrays.rb | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb b/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb index b55a2c3c10b86..dbbbd79ee0acb 100644 --- a/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb +++ b/ruby/red-arrow/test/each-raw-record/test-basic-arrays.rb @@ -519,10 +519,13 @@ class EachRawRecordTableBasicArraysTest < Test::Unit::TestCase include EachRawRecordBasicArraysTests def build(schema, records) - Arrow::Table.new(schema, - [ - Arrow::RecordBatch.new(schema, records[0, 2]), - Arrow::RecordBatch.new(schema, records[2..-1]), - ]) + record_batch = Arrow::RecordBatch.new(schema, records) + # Multiple chunks + record_batches = [ + record_batch.slice(0, 2), + record_batch.slice(2, 0), # Empty chunk + record_batch.slice(2, record_batch.length - 2), + ] + Arrow::Table.new(schema, record_batches) end end From 97ff172aa4b73e020b7f2c5690021c6ddff9f39c Mon Sep 17 00:00:00 2001 From: otegami Date: Tue, 22 Aug 2023 07:43:22 +0800 Subject: [PATCH 14/23] Refactor: Make number of columns and rows constants --- ruby/red-arrow/ext/arrow/raw-records.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ruby/red-arrow/ext/arrow/raw-records.cpp b/ruby/red-arrow/ext/arrow/raw-records.cpp index 2d595624e8687..0043ca3aaf2cc 100644 --- a/ruby/red-arrow/ext/arrow/raw-records.cpp +++ b/ruby/red-arrow/ext/arrow/raw-records.cpp @@ -156,7 +156,7 @@ namespace red_arrow { void produce(const arrow::RecordBatch& record_batch) { rb::protect([&] { - auto n_columns = record_batch.num_columns(); + const auto n_columns = record_batch.num_columns(); const auto n_rows = record_batch.num_rows(); for (int64_t i = 0; i < n_rows; ++i) { record_ = rb_ary_new_capa(n_columns); @@ -175,8 +175,8 @@ namespace red_arrow { void produce(const arrow::Table& table) { rb::protect([&] { - auto n_columns = 
table.num_columns(); - auto n_rows = table.num_rows(); + const auto n_columns = table.num_columns(); + const auto n_rows = table.num_rows(); std::vector<int> chunk_indexes(n_columns); std::vector<int64_t> row_offsets(n_columns); for (int64_t i_row = 0; i_row < n_rows; ++i_row) { From 98a35992731a1774ee9834020a738b0ed89ef0c0 Mon Sep 17 00:00:00 2001 From: otegami Date: Mon, 28 Aug 2023 07:30:45 +0800 Subject: [PATCH 15/23] Add test cases about dense union arrays --- .../each-raw-record/test-dense-union-array.rb | 706 ++++++++++++++++++ 1 file changed, 706 insertions(+) create mode 100644 ruby/red-arrow/test/each-raw-record/test-dense-union-array.rb diff --git a/ruby/red-arrow/test/each-raw-record/test-dense-union-array.rb b/ruby/red-arrow/test/each-raw-record/test-dense-union-array.rb new file mode 100644 index 0000000000000..7c784cccde3a1 --- /dev/null +++ b/ruby/red-arrow/test/each-raw-record/test-dense-union-array.rb @@ -0,0 +1,706 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+ +module EachRawRecordDenseUnionArrayTests + def build_schema(type, type_codes) + field_description = {} + if type.is_a?(Hash) + field_description = field_description.merge(type) + else + field_description[:type] = type + end + { + column: { + type: :dense_union, + fields: [ + field_description.merge(name: "0"), + field_description.merge(name: "1"), + ], + type_codes: type_codes, + }, + } + end + + # TODO: Use Arrow::RecordBatch.new(build_schema(type, type_codes), records) + def build_record_batch(type, records) + type_codes = [0, 1] + schema = Arrow::Schema.new(build_schema(type, type_codes)) + type_ids = [] + offsets = [] + arrays = schema.fields[0].data_type.fields.collect do |field| + sub_schema = Arrow::Schema.new([field]) + sub_records = [] + records.each do |record| + column = record[0] + next if column.nil? + next unless column.key?(field.name) + sub_records << [column[field.name]] + end + sub_record_batch = Arrow::RecordBatch.new(sub_schema, + sub_records) + sub_record_batch.columns[0].data + end + records.each do |record| + column = record[0] + if column.key?("0") + type_id = type_codes[0] + type_ids << type_id + offsets << (type_ids.count(type_id) - 1) + elsif column.key?("1") + type_id = type_codes[1] + type_ids << type_id + offsets << (type_ids.count(type_id) - 1) + end + end + union_array = Arrow::DenseUnionArray.new(schema.fields[0].data_type, + Arrow::Int8Array.new(type_ids), + Arrow::Int32Array.new(offsets), + arrays) + schema = Arrow::Schema.new(column: union_array.value_data_type) + Arrow::RecordBatch.new(schema, + records.size, + [union_array]) + end + + def remove_field_names(records) + records.collect do |record| + record.collect do |column| + if column.nil? 
+ column + else + column.values[0] + end + end + end + end + + def test_null + records = [ + [{"0" => nil}], + ] + iterated_records = [] + target = build(:null, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_boolean + records = [ + [{"0" => true}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:boolean, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_int8 + records = [ + [{"0" => -(2 ** 7)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:int8, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_uint8 + records = [ + [{"0" => (2 ** 8) - 1}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:uint8, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_int16 + records = [ + [{"0" => -(2 ** 15)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:int16, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_uint16 + records = [ + [{"0" => (2 ** 16) - 1}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:uint16, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_int32 + records = [ + [{"0" => -(2 ** 31)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:int32, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + 
def test_uint32 + records = [ + [{"0" => (2 ** 32) - 1}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:uint32, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_int64 + records = [ + [{"0" => -(2 ** 63)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:int64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_uint64 + records = [ + [{"0" => (2 ** 64) - 1}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:uint64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_float + records = [ + [{"0" => -1.0}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:float, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_double + records = [ + [{"0" => -1.0}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:double, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_binary + records = [ + [{"0" => "\xff".b}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:binary, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_string + records = [ + [{"0" => "Ruby"}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:string, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_date32 + records = [ + [{"0" => 
Date.new(1960, 1, 1)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:date32, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_date64 + records = [ + [{"0" => DateTime.new(1960, 1, 1, 2, 9, 30)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:date64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_timestamp_second + records = [ + [{"0" => Time.parse("1960-01-01T02:09:30Z")}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :second, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_timestamp_milli + records = [ + [{"0" => Time.parse("1960-01-01T02:09:30.123Z")}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :milli, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_timestamp_micro + records = [ + [{"0" => Time.parse("1960-01-01T02:09:30.123456Z")}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :micro, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_timestamp_nano + records = [ + [{"0" => Time.parse("1960-01-01T02:09:30.123456789Z")}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :nano, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def
test_time32_second + unit = Arrow::TimeUnit::SECOND + records = [ + # 00:10:00 + [{"0" => Arrow::Time.new(unit, 60 * 10)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :time32, + unit: :second, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_time32_milli + unit = Arrow::TimeUnit::MILLI + records = [ + # 00:10:00.123 + [{"0" => Arrow::Time.new(unit, (60 * 10) * 1000 + 123)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :time32, + unit: :milli, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_time64_micro + unit = Arrow::TimeUnit::MICRO + records = [ + # 00:10:00.123456 + [{"0" => Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :time64, + unit: :micro, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_time64_nano + unit = Arrow::TimeUnit::NANO + records = [ + # 00:10:00.123456789 + [{"0" => Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :time64, + unit: :nano, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_decimal128 + records = [ + [{"0" => BigDecimal("92.92")}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :decimal128, + precision: 8, + scale: 2, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_decimal256 + 
records = [ + [{"0" => BigDecimal("92.92")}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :decimal256, + precision: 38, + scale: 2, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_month_interval + records = [ + [{"0" => 1}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:month_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_day_time_interval + records = [ + [{"0" => {day: 1, millisecond: 100}}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:day_time_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_month_day_nano_interval + records = [ + [{"0" => {month: 1, day: 1, nanosecond: 100}}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:month_day_nano_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_list + records = [ + [{"0" => [true, nil, false]}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :list, + field: { + name: :sub_element, + type: :boolean, + }, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_struct + records = [ + [{"0" => {"sub_field" => true}}], + [{"1" => nil}], + [{"0" => {"sub_field" => nil}}], + ] + iterated_records = [] + target = build({ + type: :struct, + fields: [ + { + name: :sub_field, + type: :boolean, + }, + ], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + 
assert_equal(remove_field_names(records), + iterated_records) + end + + def test_map + records = [ + [{"0" => {"key1" => true, "key2" => nil}}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :map, + key: :string, + item: :boolean, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end + + def test_sparse_union + records = [ + [{"0" => {"field1" => true}}], + [{"1" => nil}], + [{"0" => {"field2" => 29}}], + [{"0" => {"field2" => nil}}], + ] + iterated_records = [] + target = build({ + type: :sparse_union, + fields: [ + { + name: :field1, + type: :boolean, + }, + { + name: :field2, + type: :uint8, + }, + ], + type_codes: [0, 1], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(remove_field_names(records)), + iterated_records) + end + + def test_dense_union + records = [ + [{"0" => {"field1" => true}}], + [{"1" => nil}], + [{"0" => {"field2" => 29}}], + [{"0" => {"field2" => nil}}], + ] + iterated_records = [] + target = build({ + type: :dense_union, + fields: [ + { + name: :field1, + type: :boolean, + }, + { + name: :field2, + type: :uint8, + }, + ], + type_codes: [0, 1], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(remove_field_names(records)), + iterated_records) + end + + def test_dictionary + records = [ + [{"0" => "Ruby"}], + [{"1" => nil}], + [{"0" => "GLib"}], + ] + iterated_records = [] + target = build({ + type: :dictionary, + index_data_type: :int8, + value_data_type: :string, + ordered: false, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), + iterated_records) + end +end + +class EachRawRecordRecordBatchDenseUnionArrayTest < Test::Unit::TestCase + include EachRawRecordDenseUnionArrayTests + + def 
build(type, records) + build_record_batch(type, records) + end +end + +class EachRawRecordTableDenseUnionArrayTest < Test::Unit::TestCase + include EachRawRecordDenseUnionArrayTests + + def build(type, records) + build_record_batch(type, records).to_table + end +end From 516ef62a45c1d4174442d74838e7cb9c88b183c0 Mon Sep 17 00:00:00 2001 From: otegami Date: Mon, 28 Aug 2023 08:06:14 +0800 Subject: [PATCH 16/23] Add test cases about dictionary arrays --- .../each-raw-record/test-dictionary-array.rb | 457 ++++++++++++++++++ 1 file changed, 457 insertions(+) create mode 100644 ruby/red-arrow/test/each-raw-record/test-dictionary-array.rb diff --git a/ruby/red-arrow/test/each-raw-record/test-dictionary-array.rb b/ruby/red-arrow/test/each-raw-record/test-dictionary-array.rb new file mode 100644 index 0000000000000..edc6c33cc091d --- /dev/null +++ b/ruby/red-arrow/test/each-raw-record/test-dictionary-array.rb @@ -0,0 +1,457 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +module EachRawRecordDictionaryArrayTests + def build_record_batch(array) + dictionary = array.dictionary_encode + schema = Arrow::Schema.new(column: dictionary.value_data_type) + Arrow::RecordBatch.new(schema, array.length, [dictionary]) + end + + def test_null + records = [ + [nil], + [nil], + [nil], + [nil], + ] + iterated_records = [] + target = build(Arrow::NullArray.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_boolean + records = [ + [true], + [nil], + [false], + ] + iterated_records = [] + target = build(Arrow::BooleanArray.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int8 + records = [ + [-(2 ** 7)], + [nil], + [(2 ** 7) - 1], + ] + iterated_records = [] + target = build(Arrow::Int8Array.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint8 + records = [ + [0], + [nil], + [(2 ** 8) - 1], + ] + iterated_records = [] + target = build(Arrow::UInt8Array.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int16 + records = [ + [-(2 ** 15)], + [nil], + [(2 ** 15) - 1], + ] + iterated_records = [] + target = build(Arrow::Int16Array.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint16 + records = [ + [0], + [nil], + [(2 ** 16) - 1], + ] + iterated_records = [] + target = build(Arrow::UInt16Array.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int32 + records = [ + [-(2 ** 
31)], + [nil], + [(2 ** 31) - 1], + ] + iterated_records = [] + target = build(Arrow::Int32Array.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint32 + records = [ + [0], + [nil], + [(2 ** 32) - 1], + ] + iterated_records = [] + target = build(Arrow::UInt32Array.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int64 + records = [ + [-(2 ** 63)], + [nil], + [(2 ** 63) - 1], + ] + iterated_records = [] + target = build(Arrow::Int64Array.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint64 + records = [ + [0], + [nil], + [(2 ** 64) - 1], + ] + iterated_records = [] + target = build(Arrow::UInt64Array.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_float + records = [ + [-1.0], + [nil], + [1.0], + ] + iterated_records = [] + target = build(Arrow::FloatArray.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_double + records = [ + [-1.0], + [nil], + [1.0], + ] + iterated_records = [] + target = build(Arrow::DoubleArray.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_binary + records = [ + ["\x00".b], + [nil], + ["\xff".b], + ] + iterated_records = [] + target = build(Arrow::BinaryArray.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_string + records = [ + ["Ruby"], + [nil], 
+ ["\u3042"], # U+3042 HIRAGANA LETTER A + ] + iterated_records = [] + target = build(Arrow::StringArray.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_date32 + records = [ + [Date.new(1960, 1, 1)], + [nil], + [Date.new(2017, 8, 23)], + ] + iterated_records = [] + target = build(Arrow::Date32Array.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_date64 + records = [ + [DateTime.new(1960, 1, 1, 2, 9, 30)], + [nil], + [DateTime.new(2017, 8, 23, 14, 57, 2)], + ] + iterated_records = [] + target = build(Arrow::Date64Array.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_second + records = [ + [Time.parse("1960-01-01T02:09:30Z")], + [nil], + [Time.parse("2017-08-23T14:57:02Z")], + ] + iterated_records = [] + target = build(Arrow::TimestampArray.new(:second, records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_milli + records = [ + [Time.parse("1960-01-01T02:09:30.123Z")], + [nil], + [Time.parse("2017-08-23T14:57:02.987Z")], + ] + iterated_records = [] + target = build(Arrow::TimestampArray.new(:milli, records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_micro + records = [ + [Time.parse("1960-01-01T02:09:30.123456Z")], + [nil], + [Time.parse("2017-08-23T14:57:02.987654Z")], + ] + iterated_records = [] + target = build(Arrow::TimestampArray.new(:micro, records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) 
+ end + + def test_timestamp_nano + records = [ + [Time.parse("1960-01-01T02:09:30.123456789Z")], + [nil], + [Time.parse("2017-08-23T14:57:02.987654321Z")], + ] + iterated_records = [] + target = build(Arrow::TimestampArray.new(:nano, records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time32_second + unit = Arrow::TimeUnit::SECOND + records = [ + [Arrow::Time.new(unit, 60 * 10)], # 00:10:00 + [nil], + [Arrow::Time.new(unit, 60 * 60 * 2 + 9)], # 02:00:09 + ] + iterated_records = [] + target = build(Arrow::Time32Array.new(unit, records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time32_milli + unit = Arrow::TimeUnit::MILLI + records = [ + [Arrow::Time.new(unit, (60 * 10) * 1000 + 123)], # 00:10:00.123 + [nil], + [Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1000 + 987)], # 02:00:09.987 + ] + iterated_records = [] + target = build(Arrow::Time32Array.new(unit, records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time64_micro + unit = Arrow::TimeUnit::MICRO + records = [ + # 00:10:00.123456 + [Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456)], + [nil], + # 02:00:09.987654 + [Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000 + 987_654)], + ] + iterated_records = [] + target = build(Arrow::Time64Array.new(unit, records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time64_nano + unit = Arrow::TimeUnit::NANO + records = [ + # 00:10:00.123456789 + [Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789)], + [nil], + # 02:00:09.987654321 + [Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000_000 + 987_654_321)], + ] + iterated_records = [] + 
target = build(Arrow::Time64Array.new(unit, records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_decimal128 + records = [ + [BigDecimal("92.92")], + [nil], + [BigDecimal("29.29")], + ] + iterated_records = [] + data_type = Arrow::Decimal128DataType.new(8, 2) + target = build(Arrow::Decimal128Array.new(data_type, records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_decimal256 + records = [ + [BigDecimal("92.92")], + [nil], + [BigDecimal("29.29")], + ] + iterated_records = [] + data_type = Arrow::Decimal256DataType.new(38, 2) + target = build(Arrow::Decimal256Array.new(data_type, records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, target.raw_records) + end + + def test_month_interval + records = [ + [1], + [nil], + [12], + ] + iterated_records = [] + target = build(Arrow::MonthIntervalArray.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_day_time_interval + records = [ + [{day: 1, millisecond: 100}], + [nil], + [{day: 2, millisecond: 300}], + ] + iterated_records = [] + target = build(Arrow::DayTimeIntervalArray.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_month_day_nano_interval + records = [ + [{month: 1, day: 1, nanosecond: 100}], + [nil], + [{month: 2, day: 3, nanosecond: 400}], + ] + iterated_records = [] + target = build(Arrow::MonthDayNanoIntervalArray.new(records.collect(&:first))) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end +end + +class 
EachRawRecordRecordBatchDictionaryArraysTest < Test::Unit::TestCase + include EachRawRecordDictionaryArrayTests + + def build(array) + build_record_batch(array) + end +end + +class EachRawRecordTableDictionaryArraysTest < Test::Unit::TestCase + include EachRawRecordDictionaryArrayTests + + def build(array) + build_record_batch(array).to_table + end +end From 7050dcc00acddee27864fa695892c87bf97a3702 Mon Sep 17 00:00:00 2001 From: otegami Date: Thu, 31 Aug 2023 07:43:35 +0800 Subject: [PATCH 17/23] Add test cases about list arrays --- .../test/each-raw-record/test-list-array.rb | 771 ++++++++++++++++++ 1 file changed, 771 insertions(+) create mode 100644 ruby/red-arrow/test/each-raw-record/test-list-array.rb diff --git a/ruby/red-arrow/test/each-raw-record/test-list-array.rb b/ruby/red-arrow/test/each-raw-record/test-list-array.rb new file mode 100644 index 0000000000000..64cc5839bd757 --- /dev/null +++ b/ruby/red-arrow/test/each-raw-record/test-list-array.rb @@ -0,0 +1,771 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +module EachRawRecordListArrayTests + def build_schema(type) + field_description = { + name: :element, + } + if type.is_a?(Hash) + field_description = field_description.merge(type) + else + field_description[:type] = type + end + { + column: { + type: :list, + field: field_description, + }, + } + end + + def test_null + records = [ + [[nil, nil, nil]], + [nil], + ] + iterated_records = [] + target = build(:null, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_boolean + records = [ + [[true, nil, false]], + [nil], + ] + iterated_records = [] + target = build(:boolean, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int8 + records = [ + [[-(2 ** 7), nil, (2 ** 7) - 1]], + [nil], + ] + iterated_records = [] + target = build(:int8, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint8 + records = [ + [[0, nil, (2 ** 8) - 1]], + [nil], + ] + iterated_records = [] + target = build(:uint8, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int16 + records = [ + [[-(2 ** 15), nil, (2 ** 15) - 1]], + [nil], + ] + iterated_records = [] + target = build(:int16, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint16 + records = [ + [[0, nil, (2 ** 16) - 1]], + [nil], + ] + iterated_records = [] + target = build(:uint16, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int32 + records = [ + [[-(2 ** 31), nil, (2 ** 31) - 1]], + [nil], + ] + iterated_records = [] + target = build(:int32, records) + target.each_raw_record do 
|record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint32 + records = [ + [[0, nil, (2 ** 32) - 1]], + [nil], + ] + iterated_records = [] + target = build(:uint32, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int64 + records = [ + [[-(2 ** 63), nil, (2 ** 63) - 1]], + [nil], + ] + iterated_records = [] + target = build(:int64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint64 + records = [ + [[0, nil, (2 ** 64) - 1]], + [nil], + ] + iterated_records = [] + target = build(:uint64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_float + records = [ + [[-1.0, nil, 1.0]], + [nil], + ] + iterated_records = [] + target = build(:float, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_double + records = [ + [[-1.0, nil, 1.0]], + [nil], + ] + iterated_records = [] + target = build(:double, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_binary + records = [ + [["\x00".b, nil, "\xff".b]], + [nil], + ] + iterated_records = [] + target = build(:binary, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_string + records = [ + [ + [ + "Ruby", + nil, + "\u3042", # U+3042 HIRAGANA LETTER A + ], + ], + [nil], + ] + iterated_records = [] + target = build(:string, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_date32 + records = [ + [ + [ + Date.new(1960, 1, 1), + nil, + 
Date.new(2017, 8, 23), + ], + ], + [nil], + ] + iterated_records = [] + target = build(:date32, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_date64 + records = [ + [ + [ + DateTime.new(1960, 1, 1, 2, 9, 30), + nil, + DateTime.new(2017, 8, 23, 14, 57, 2), + ], + ], + [nil], + ] + iterated_records = [] + target = build(:date64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_second + records = [ + [ + [ + Time.parse("1960-01-01T02:09:30Z"), + nil, + Time.parse("2017-08-23T14:57:02Z"), + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :second, + }, + records) + iterated_records = [] + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_milli + records = [ + [ + [ + Time.parse("1960-01-01T02:09:30.123Z"), + nil, + Time.parse("2017-08-23T14:57:02.987Z"), + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :milli, + }, + records) + iterated_records = [] + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_micro + records = [ + [ + [ + Time.parse("1960-01-01T02:09:30.123456Z"), + nil, + Time.parse("2017-08-23T14:57:02.987654Z"), + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :micro, + }, + records) + iterated_records = [] + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_nano + records = [ + [ + [ + Time.parse("1960-01-01T02:09:30.123456789Z"), + nil, + Time.parse("2017-08-23T14:57:02.987654321Z"), + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: 
:timestamp, + unit: :nano, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time32_second + unit = Arrow::TimeUnit::SECOND + records = [ + [ + [ + # 00:10:00 + Arrow::Time.new(unit, 60 * 10), + nil, + # 02:00:09 + Arrow::Time.new(unit, 60 * 60 * 2 + 9), + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :time32, + unit: :second, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time32_milli + unit = Arrow::TimeUnit::MILLI + records = [ + [ + [ + # 00:10:00.123 + Arrow::Time.new(unit, (60 * 10) * 1000 + 123), + nil, + # 02:00:09.987 + Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1000 + 987), + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :time32, + unit: :milli, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time64_micro + unit = Arrow::TimeUnit::MICRO + records = [ + [ + [ + # 00:10:00.123456 + Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456), + nil, + # 02:00:09.987654 + Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000 + 987_654), + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :time64, + unit: :micro, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time64_nano + unit = Arrow::TimeUnit::NANO + records = [ + [ + [ + # 00:10:00.123456789 + Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789), + nil, + # 02:00:09.987654321 + Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000_000 + 987_654_321), + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :time64, + unit: :nano, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + 
assert_equal(records, iterated_records) + end + + def test_decimal128 + records = [ + [ + [ + BigDecimal("92.92"), + nil, + BigDecimal("29.29"), + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :decimal128, + precision: 8, + scale: 2, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_decimal256 + records = [ + [ + [ + BigDecimal("92.92"), + nil, + BigDecimal("29.29"), + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :decimal256, + precision: 38, + scale: 2, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_month_interval + records = [ + [[1, nil, 12]], + [nil], + ] + iterated_records = [] + target = build(:month_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_day_time_interval + records = [ + [ + [ + {day: 1, millisecond: 100}, + nil, + {day: 2, millisecond: 300}, + ] + ], + [nil], + ] + iterated_records = [] + target = build(:day_time_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_month_day_nano_interval + records = [ + [ + [ + {month: 1, day: 1, nanosecond: 100}, + nil, + {month: 2, day: 3, nanosecond: 400}, + ] + ], + [nil], + ] + iterated_records = [] + target = build(:month_day_nano_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_list + records = [ + [ + [ + [ + true, + nil, + ], + nil, + [ + nil, + false, + ], + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :list, + field: { + name: :sub_element, + type: :boolean, + }, + }, + records) + target.each_raw_record do |record| + 
iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_struct + records = [ + [ + [ + {"field" => true}, + nil, + {"field" => nil}, + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :struct, + fields: [ + { + name: :field, + type: :boolean, + }, + ], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_map + records = [ + [ + [ + {"key1" => true, "key2" => nil}, + nil, + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :map, + key: :string, + item: :boolean, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def remove_union_field_names(records) + records.collect do |record| + record.collect do |column| + if column.nil? + column + else + column.collect do |value| + if value.nil? + value + else + value.values[0] + end + end + end + end + end + end + + def test_sparse_union + records = [ + [ + [ + {"field1" => true}, + nil, + {"field2" => 29}, + {"field2" => nil}, + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :sparse_union, + fields: [ + { + name: :field1, + type: :boolean, + }, + { + name: :field2, + type: :uint8, + }, + ], + type_codes: [0, 1], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_union_field_names(records), + iterated_records) + end + + def test_dense_union + records = [ + [ + [ + {"field1" => true}, + nil, + {"field2" => 29}, + {"field2" => nil}, + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :dense_union, + fields: [ + { + name: :field1, + type: :boolean, + }, + { + name: :field2, + type: :uint8, + }, + ], + type_codes: [0, 1], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_union_field_names(records), + 
iterated_records) + end + + def test_dictionary + records = [ + [ + [ + "Ruby", + nil, + "GLib", + ], + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :dictionary, + index_data_type: :int8, + value_data_type: :string, + ordered: false, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end +end + +class EachRawRecordRecordBatchListArrayTest < Test::Unit::TestCase + include EachRawRecordListArrayTests + + def build(type, records) + Arrow::RecordBatch.new(build_schema(type), records) + end +end + +class EachRawRecordTableListArrayTest < Test::Unit::TestCase + include EachRawRecordListArrayTests + + def build(type, records) + Arrow::Table.new(build_schema(type), records) + end +end From 72cc0b48d8f933951ce5d10102941db9fcbdb29e Mon Sep 17 00:00:00 2001 From: otegami Date: Thu, 31 Aug 2023 07:55:20 +0800 Subject: [PATCH 18/23] Add test cases about map arrays --- .../test/each-raw-record/test-map-array.rb | 647 ++++++++++++++++++ 1 file changed, 647 insertions(+) create mode 100644 ruby/red-arrow/test/each-raw-record/test-map-array.rb diff --git a/ruby/red-arrow/test/each-raw-record/test-map-array.rb b/ruby/red-arrow/test/each-raw-record/test-map-array.rb new file mode 100644 index 0000000000000..b802275a7f00f --- /dev/null +++ b/ruby/red-arrow/test/each-raw-record/test-map-array.rb @@ -0,0 +1,647 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module EachRawRecordMapArrayTests + def build_schema(type) + { + column: { + type: :map, + key: :string, + item: type + }, + } + end + + def test_null + records = [ + [{"key1" => nil}], + [nil], + ] + iterated_records = [] + target = build(:null, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_boolean + records = [ + [{"key1" => true, "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:boolean, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int8 + records = [ + [{"key1" => -(2 ** 7), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:int8, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint8 + records = [ + [{"key1" => (2 ** 8) - 1, "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:uint8, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int16 + records = [ + [{"key1" => -(2 ** 15), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:int16, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint16 + records = [ + [{"key1" => (2 ** 16) - 1, "key2" => nil}], + [nil], + ] + iterated_records = [] + target = 
build(:uint16, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int32 + records = [ + [{"key1" => -(2 ** 31), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:int32, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint32 + records = [ + [{"key1" => (2 ** 32) - 1, "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:uint32, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int64 + records = [ + [{"key1" => -(2 ** 63), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:int64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint64 + records = [ + [{"key1" => (2 ** 64) - 1, "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:uint64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_float + records = [ + [{"key1" => -1.0, "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:float, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_double + records = [ + [{"key1" => -1.0, "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:double, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_binary + records = [ + [{"key1" => "\xff".b, "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:binary, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, 
iterated_records) + end + + def test_string + records = [ + [{"key1" => "Ruby", "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:string, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_date32 + records = [ + [{"key1" => Date.new(1960, 1, 1), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:date32, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_date64 + records = [ + [{"key1" => DateTime.new(1960, 1, 1, 2, 9, 30), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:date64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_second + records = [ + [{"key1" => Time.parse("1960-01-01T02:09:30Z"), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :second, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_milli + records = [ + [{"key1" => Time.parse("1960-01-01T02:09:30.123Z"), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :milli, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_micro + records = [ + [{"key1" => Time.parse("1960-01-01T02:09:30.123456Z"), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :micro, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_nano + records = [ + [{"key1" => Time.parse("1960-01-01T02:09:30.123456789Z"), "key2" => 
nil}], + [nil], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :nano, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time32_second + unit = Arrow::TimeUnit::SECOND + records = [ + # 00:10:00 + [{"key1" => Arrow::Time.new(unit, 60 * 10), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build({ + type: :time32, + unit: :second, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time32_milli + unit = Arrow::TimeUnit::MILLI + records = [ + # 00:10:00.123 + [{"key1" => Arrow::Time.new(unit, (60 * 10) * 1000 + 123), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build({ + type: :time32, + unit: :milli, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time64_micro + unit = Arrow::TimeUnit::MICRO + records = [ + # 00:10:00.123456 + [{"key1" => Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build({ + type: :time64, + unit: :micro, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time64_nano + unit = Arrow::TimeUnit::NANO + records = [ + # 00:10:00.123456789 + [{"key1" => Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build({ + type: :time64, + unit: :nano, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_decimal128 + records = [ + [{"key1" => BigDecimal("92.92"), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build({ + type: :decimal128, + precision: 
8, + scale: 2, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_decimal256 + records = [ + [{"key1" => BigDecimal("92.92"), "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build({ + type: :decimal256, + precision: 38, + scale: 2, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_month_interval + records = [ + [{"key1" => 1, "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build(:month_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_day_time_interval + records = [ + [ + { + "key1" => {day: 1, millisecond: 100}, + "key2" => nil, + }, + ], + [nil], + ] + iterated_records = [] + target = build(:day_time_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_month_day_nano_interval + records = [ + [ + { + "key1" => {month: 1, day: 1, nanosecond: 100}, + "key2" => nil, + }, + ], + [nil], + ] + iterated_records = [] + target = build(:month_day_nano_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_list + records = [ + [{"key1" => [true, nil, false], "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build({ + type: :list, + field: { + name: :element, + type: :boolean, + }, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_struct + records = [ + [{"key1" => {"field" => true}, "key2" => nil, "key3" => {"field" => nil}}], + [nil], + ] + iterated_records = [] + target = build({ + type: :struct, + fields: [ + { + name: :field, + type: 
:boolean, + }, + ], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_map + records = [ + [{"key1" => {"sub_key1" => true, "sub_key2" => nil}, "key2" => nil}], + [nil], + ] + iterated_records = [] + target = build({ + type: :map, + key: :string, + item: :boolean, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def remove_union_field_names(records) + records.collect do |record| + record.collect do |column| + if column.nil? + column + else + value = {} + column.each do |k, v| + v = v.values[0] unless v.nil? + value[k] = v + end + value + end + end + end + end + + def test_sparse_union + records = [ + [ + { + "key1" => {"field1" => true}, + "key2" => nil, + "key3" => {"field2" => 29}, + "key4" => {"field2" => nil}, + }, + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :sparse_union, + fields: [ + { + name: :field1, + type: :boolean, + }, + { + name: :field2, + type: :uint8, + }, + ], + type_codes: [0, 1], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_union_field_names(records), + iterated_records) # rows yielded by each_raw_record + end + + def test_dense_union + records = [ + [ + { + "key1" => {"field1" => true}, + "key2" => nil, + "key3" => {"field2" => 29}, + "key4" => {"field2" => nil}, + }, + ], + [nil], + ] + iterated_records = [] + target = build({ + type: :dense_union, + fields: [ + { + name: :field1, + type: :boolean, + }, + { + name: :field2, + type: :uint8, + }, + ], + type_codes: [0, 1], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_union_field_names(records), + iterated_records) # rows yielded by each_raw_record + end + + def test_dictionary + records = [ + [{"key1" => "Ruby", "key2" => nil, "key3" => "GLib"}], + [nil], + ] + iterated_records = [] + target = build({ + type:
:dictionary, + index_data_type: :int8, + value_data_type: :string, + ordered: false, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end +end + +class EachRawRecordRecordBatchMapArrayTest < Test::Unit::TestCase + include EachRawRecordMapArrayTests + + def build(type, records) + Arrow::RecordBatch.new(build_schema(type), records) + end +end + +class EachRawRecordTableMapArrayTest < Test::Unit::TestCase + include EachRawRecordMapArrayTests + + def build(type, records) + Arrow::Table.new(build_schema(type), records) + end +end From 44560021d964f4a14990a53050fc3edf7501fb70 Mon Sep 17 00:00:00 2001 From: otegami Date: Thu, 31 Aug 2023 07:58:42 +0800 Subject: [PATCH 19/23] Add test cases about multiple columns --- .../each-raw-record/test-multiple-columns.rb | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 ruby/red-arrow/test/each-raw-record/test-multiple-columns.rb diff --git a/ruby/red-arrow/test/each-raw-record/test-multiple-columns.rb b/ruby/red-arrow/test/each-raw-record/test-multiple-columns.rb new file mode 100644 index 0000000000000..2d9b080dd6277 --- /dev/null +++ b/ruby/red-arrow/test/each-raw-record/test-multiple-columns.rb @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +module EachRawRecordMultipleColumnsTests + def test_3_elements + records = [ + [true, nil, "Ruby"], + [nil, 0, "GLib"], + [false, 2 ** 8 - 1, nil], + ] + iterated_records = [] + target = build([ + {name: :column0, type: :boolean}, + {name: :column1, type: :uint8}, + {name: :column2, type: :string}, + ], + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_4_elements + records = [ + [true, nil, "Ruby", -(2 ** 63)], + [nil, 0, "GLib", nil], + [false, 2 ** 8 - 1, nil, (2 ** 63) - 1], + ] + iterated_records = [] + target = build([ + {name: :column0, type: :boolean}, + {name: :column1, type: :uint8}, + {name: :column2, type: :string}, + {name: :column3, type: :int64}, + ], + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end +end + +class EachRawRecordRecordBatchMultipleColumnsTest < Test::Unit::TestCase + include EachRawRecordMultipleColumnsTests + + def build(schema, records) + Arrow::RecordBatch.new(schema, records) + end +end + +class EachRawRecordTableMultipleColumnsTest < Test::Unit::TestCase + include EachRawRecordMultipleColumnsTests + + def build(schema, records) + Arrow::Table.new(schema, records) + end +end From e1e1cd24a7af96d640ffdef8dc6af33df040a10c Mon Sep 17 00:00:00 2001 From: otegami Date: Sat, 2 Sep 2023 11:04:59 +0800 Subject: [PATCH 20/23] Add test cases about sparse union arrays --- .../test-sparse-union-array.rb | 662 ++++++++++++++++++ 1 file changed, 662 insertions(+) create mode 100644 ruby/red-arrow/test/each-raw-record/test-sparse-union-array.rb diff --git a/ruby/red-arrow/test/each-raw-record/test-sparse-union-array.rb b/ruby/red-arrow/test/each-raw-record/test-sparse-union-array.rb new file mode 100644 index 0000000000000..4b1b941fb2079 --- /dev/null +++ 
b/ruby/red-arrow/test/each-raw-record/test-sparse-union-array.rb @@ -0,0 +1,662 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module EachRawRecordSparseUnionArrayTests + def build_schema(type, type_codes) + field_description = {} + if type.is_a?(Hash) + field_description = field_description.merge(type) + else + field_description[:type] = type + end + { + column: { + type: :sparse_union, + fields: [ + field_description.merge(name: "0"), + field_description.merge(name: "1"), + ], + type_codes: type_codes, + }, + } + end + + # TODO: Use Arrow::RecordBatch.new(build_schema(type, type_codes), records) + def build_record_batch(type, records) + type_codes = [0, 1] + schema = Arrow::Schema.new(build_schema(type, type_codes)) + type_ids = [] + arrays = schema.fields[0].data_type.fields.collect do |field| + sub_schema = Arrow::Schema.new([field]) + sub_records = records.collect do |record| + [record[0].nil? ? 
nil : record[0][field.name]] + end + sub_record_batch = Arrow::RecordBatch.new(sub_schema, + sub_records) + sub_record_batch.columns[0].data + end + records.each do |record| + column = record[0] + if column.key?("0") + type_ids << type_codes[0] + elsif column.key?("1") + type_ids << type_codes[1] + end + end + union_array = Arrow::SparseUnionArray.new(schema.fields[0].data_type, + Arrow::Int8Array.new(type_ids), + arrays) + schema = Arrow::Schema.new(column: union_array.value_data_type) + Arrow::RecordBatch.new(schema, + records.size, + [union_array]) + end + + def remove_field_names(records) + records.collect do |record| + record.collect do |column| + if column.nil? + column + else + column.values[0] + end + end + end + end + + def test_null + records = [ + [{"0" => nil}], + ] + iterated_records = [] + target = build(:null, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_boolean + records = [ + [{"0" => true}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:boolean, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_int8 + records = [ + [{"0" => -(2 ** 7)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:int8, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_uint8 + records = [ + [{"0" => (2 ** 8) - 1}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:uint8, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_int16 + records = [ + [{"0" => -(2 ** 15)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:int16, records) + target.each_raw_record do |record| + 
iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_uint16 + records = [ + [{"0" => (2 ** 16) - 1}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:uint16, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_int32 + records = [ + [{"0" => -(2 ** 31)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:int32, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_uint32 + records = [ + [{"0" => (2 ** 32) - 1}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:uint32, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_int64 + records = [ + [{"0" => -(2 ** 63)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:int64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_uint64 + records = [ + [{"0" => (2 ** 64) - 1}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:uint64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_float + records = [ + [{"0" => -1.0}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:float, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_double + records = [ + [{"0" => -1.0}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:double, records) + target.each_raw_record do |record| + iterated_records << record + end + 
assert_equal(remove_field_names(records), iterated_records) + end + + def test_binary + records = [ + [{"0" => "\xff".b}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:binary, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_string + records = [ + [{"0" => "Ruby"}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:string, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_date32 + records = [ + [{"0" => Date.new(1960, 1, 1)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:date32, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_date64 + records = [ + [{"0" => DateTime.new(1960, 1, 1, 2, 9, 30)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:date64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_timestamp_second + records = [ + [{"0" => Time.parse("1960-01-01T02:09:30Z")}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :second, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_timestamp_milli + records = [ + [{"0" => Time.parse("1960-01-01T02:09:30.123Z")}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :milli, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + + end + + def test_timestamp_micro + records = [ + [{"0" => 
Time.parse("1960-01-01T02:09:30.123456Z")}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :micro, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_timestamp_nano + records = [ + [{"0" => Time.parse("1960-01-01T02:09:30.123456789Z")}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :nano, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_time32_second + unit = Arrow::TimeUnit::SECOND + records = [ + # 00:10:00 + [{"0" => Arrow::Time.new(unit, 60 * 10)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :time32, + unit: :second, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_time32_milli + unit = Arrow::TimeUnit::MILLI + records = [ + # 00:10:00.123 + [{"0" => Arrow::Time.new(unit, (60 * 10) * 1000 + 123)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :time32, + unit: :milli, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_time64_micro + unit = Arrow::TimeUnit::MICRO + records = [ + # 00:10:00.123456 + [{"0" => Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :time64, + unit: :micro, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_time64_nano + unit = Arrow::TimeUnit::NANO + records = [ + # 00:10:00.123456789 + [{"0" => Arrow::Time.new(unit, (60 * 10) 
* 1_000_000_000 + 123_456_789)}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :time64, + unit: :nano, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_decimal128 + records = [ + [{"0" => BigDecimal("92.92")}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :decimal128, + precision: 8, + scale: 2, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_decimal256 + records = [ + [{"0" => BigDecimal("92.92")}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :decimal256, + precision: 38, + scale: 2, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_month_interval + records = [ + [{"0" => 1}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:month_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_day_time_interval + records = [ + [{"0" => {day: 1, millisecond: 100}}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:day_time_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_month_day_nano_interval + records = [ + [{"0" => {month: 1, day: 1, nanosecond: 100}}], + [{"1" => nil}], + ] + iterated_records = [] + target = build(:month_day_nano_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_list + records = [ + [{"0" => [true, nil, false]}], + [{"1" => nil}], 
+ ] + iterated_records = [] + target = build({ + type: :list, + field: { + name: :sub_element, + type: :boolean, + }, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_struct + records = [ + [{"0" => {"sub_field" => true}}], + [{"1" => nil}], + [{"0" => {"sub_field" => nil}}], + ] + iterated_records = [] + target = build({ + type: :struct, + fields: [ + { + name: :sub_field, + type: :boolean, + }, + ], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_map + records = [ + [{"0" => {"key1" => true, "key2" => nil}}], + [{"1" => nil}], + ] + iterated_records = [] + target = build({ + type: :map, + key: :string, + item: :boolean, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end + + def test_sparse_union + records = [ + [{"0" => {"field1" => true}}], + [{"1" => nil}], + [{"0" => {"field2" => 29}}], + [{"0" => {"field2" => nil}}], + ] + iterated_records = [] + target = build({ + type: :sparse_union, + fields: [ + { + name: :field1, + type: :boolean, + }, + { + name: :field2, + type: :uint8, + }, + ], + type_codes: [0, 1], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(remove_field_names(records)), iterated_records) + end + + def test_dense_union + records = [ + [{"0" => {"field1" => true}}], + [{"1" => nil}], + [{"0" => {"field2" => 29}}], + [{"0" => {"field2" => nil}}], + ] + iterated_records = [] + target = build({ + type: :dense_union, + fields: [ + { + name: :field1, + type: :boolean, + }, + { + name: :field2, + type: :uint8, + }, + ], + type_codes: [0, 1], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + 
assert_equal(remove_field_names(remove_field_names(records)), iterated_records) + end + + def test_dictionary + records = [ + [{"0" => "Ruby"}], + [{"1" => nil}], + [{"0" => "GLib"}], + ] + iterated_records = [] + target = build({ + type: :dictionary, + index_data_type: :int8, + value_data_type: :string, + ordered: false, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_field_names(records), iterated_records) + end +end + +class EachRawRecordRecordBatchSparseUnionArrayTest < Test::Unit::TestCase + include EachRawRecordSparseUnionArrayTests + + def build(type, records) + build_record_batch(type, records) + end +end + +class EachRawRecordTableSparseUnionArrayTest < Test::Unit::TestCase + include EachRawRecordSparseUnionArrayTests + + def build(type, records) + build_record_batch(type, records).to_table + end +end From c7f4135e25791fbde281e8ec458d218d6cd99719 Mon Sep 17 00:00:00 2001 From: otegami Date: Sat, 2 Sep 2023 11:15:57 +0800 Subject: [PATCH 21/23] Add test cases about struct arrays --- .../test/each-raw-record/test-struct-array.rb | 669 ++++++++++++++++++ 1 file changed, 669 insertions(+) create mode 100644 ruby/red-arrow/test/each-raw-record/test-struct-array.rb diff --git a/ruby/red-arrow/test/each-raw-record/test-struct-array.rb b/ruby/red-arrow/test/each-raw-record/test-struct-array.rb new file mode 100644 index 0000000000000..de6e78aaeef4c --- /dev/null +++ b/ruby/red-arrow/test/each-raw-record/test-struct-array.rb @@ -0,0 +1,669 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module EachRawRecordStructArrayTests + def build_schema(type) + field_description = { + name: :field, + } + if type.is_a?(Hash) + field_description = field_description.merge(type) + else + field_description[:type] = type + end + { + column: { + type: :struct, + fields: [ + field_description, + ], + }, + } + end + + def test_null + records = [ + [{"field" => nil}], + [nil], + ] + iterated_records = [] + target = build(:null, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_boolean + records = [ + [{"field" => true}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:boolean, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int8 + records = [ + [{"field" => -(2 ** 7)}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:int8, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint8 + records = [ + [{"field" => (2 ** 8) - 1}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:uint8, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int16 + records = [ + [{"field" => -(2 ** 15)}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:int16, records) + target.each_raw_record do |record| + 
iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint16 + records = [ + [{"field" => (2 ** 16) - 1}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:uint16, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int32 + records = [ + [{"field" => -(2 ** 31)}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:int32, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint32 + records = [ + [{"field" => (2 ** 32) - 1}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:uint32, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_int64 + records = [ + [{"field" => -(2 ** 63)}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:int64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_uint64 + records = [ + [{"field" => (2 ** 64) - 1}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:uint64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_float + records = [ + [{"field" => -1.0}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:float, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_double + records = [ + [{"field" => -1.0}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:double, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, 
iterated_records) + end + + def test_binary + records = [ + [{"field" => "\xff".b}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:binary, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_string + records = [ + [{"field" => "Ruby"}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:string, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_date32 + records = [ + [{"field" => Date.new(1960, 1, 1)}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:date32, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_date64 + records = [ + [{"field" => DateTime.new(1960, 1, 1, 2, 9, 30)}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:date64, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_second + records = [ + [{"field" => Time.parse("1960-01-01T02:09:30Z")}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :second, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_milli + records = [ + [{"field" => Time.parse("1960-01-01T02:09:30.123Z")}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :milli, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_micro + records = [ + [{"field" => Time.parse("1960-01-01T02:09:30.123456Z")}], + [nil], + [{"field" => nil}], + ] + 
iterated_records = [] + target = build({ + type: :timestamp, + unit: :micro, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_timestamp_nano + records = [ + [{"field" => Time.parse("1960-01-01T02:09:30.123456789Z")}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build({ + type: :timestamp, + unit: :nano, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time32_second + unit = Arrow::TimeUnit::SECOND + records = [ + # 00:10:00 + [{"field" => Arrow::Time.new(unit, 60 * 10)}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build({ + type: :time32, + unit: :second, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time32_milli + unit = Arrow::TimeUnit::MILLI + records = [ + # 00:10:00.123 + [{"field" => Arrow::Time.new(unit, (60 * 10) * 1000 + 123)}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build({ + type: :time32, + unit: :milli, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time64_micro + unit = Arrow::TimeUnit::MICRO + records = [ + # 00:10:00.123456 + [{"field" => Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456)}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build({ + type: :time64, + unit: :micro, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_time64_nano + unit = Arrow::TimeUnit::NANO + records = [ + # 00:10:00.123456789 + [{"field" => Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789)}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + 
target = build({ + type: :time64, + unit: :nano, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_decimal128 + records = [ + [{"field" => BigDecimal("92.92")}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build({ + type: :decimal128, + precision: 8, + scale: 2, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_decimal256 + records = [ + [{"field" => BigDecimal("92.92")}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build({ + type: :decimal256, + precision: 38, + scale: 2, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_month_interval + records = [ + [{"field" => 1}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:month_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_day_time_interval + records = [ + [{"field" => {day: 1, millisecond: 100}}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:day_time_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_month_day_nano_interval + records = [ + [{"field" => {month: 1, day: 1, nanosecond: 100}}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build(:month_day_nano_interval, records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_list + records = [ + [{"field" => [true, nil, false]}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build({ + type: :list, + field: { + name: :sub_element, + 
type: :boolean, + }, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_struct + records = [ + [{"field" => {"sub_field" => true}}], + [nil], + [{"field" => nil}], + [{"field" => {"sub_field" => nil}}], + ] + iterated_records = [] + target = build({ + type: :struct, + fields: [ + { + name: :sub_field, + type: :boolean, + }, + ], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def test_map + records = [ + [{"field" => {"key1" => true, "key2" => nil}}], + [nil], + [{"field" => nil}], + ] + iterated_records = [] + target = build({ + type: :map, + key: :string, + item: :boolean, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end + + def remove_union_field_names(records) + records.collect do |record| + record.collect do |column| + if column.nil? + column + else + value = column["field"] + value = value.values[0] unless value.nil? 
+ {"field" => value} + end + end + end + end + + def test_sparse_union + records = [ + [{"field" => {"field1" => true}}], + [nil], + [{"field" => nil}], + [{"field" => {"field2" => 29}}], + [{"field" => {"field2" => nil}}], + ] + iterated_records = [] + target = build({ + type: :sparse_union, + fields: [ + { + name: :field1, + type: :boolean, + }, + { + name: :field2, + type: :uint8, + }, + ], + type_codes: [0, 1], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_union_field_names(records), + iterated_records) + end + + def test_dense_union + records = [ + [{"field" => {"field1" => true}}], + [nil], + [{"field" => nil}], + [{"field" => {"field2" => 29}}], + [{"field" => {"field2" => nil}}], + ] + iterated_records = [] + target = build({ + type: :dense_union, + fields: [ + { + name: :field1, + type: :boolean, + }, + { + name: :field2, + type: :uint8, + }, + ], + type_codes: [0, 1], + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(remove_union_field_names(records), + iterated_records) + end + + def test_dictionary + records = [ + [{"field" => "Ruby"}], + [nil], + [{"field" => nil}], + [{"field" => "GLib"}], + ] + iterated_records = [] + target = build({ + type: :dictionary, + index_data_type: :int8, + value_data_type: :string, + ordered: false, + }, + records) + target.each_raw_record do |record| + iterated_records << record + end + assert_equal(records, iterated_records) + end +end + +class RawRecordsRecordBatchStructArrayTest < Test::Unit::TestCase + include EachRawRecordStructArrayTests + + def build(type, records) + Arrow::RecordBatch.new(build_schema(type), records) + end +end + +class RawRecordsTableStructArrayTest < Test::Unit::TestCase + include EachRawRecordStructArrayTests + + def build(type, records) + Arrow::Table.new(build_schema(type), records) + end +end From 595f0c9dd108b1b67559f234b7d8ad8a6785dc51 Mon Sep 17 00:00:00 2001 From: otegami 
Date: Sat, 2 Sep 2023 11:22:24 +0800 Subject: [PATCH 22/23] Add test cases about table --- .../test/each-raw-record/test-table.rb | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 ruby/red-arrow/test/each-raw-record/test-table.rb diff --git a/ruby/red-arrow/test/each-raw-record/test-table.rb b/ruby/red-arrow/test/each-raw-record/test-table.rb new file mode 100644 index 0000000000000..b5bd80127c8b0 --- /dev/null +++ b/ruby/red-arrow/test/each-raw-record/test-table.rb @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class EachRawRecordTableTest < Test::Unit::TestCase + test("2 arrays") do + raw_record_batches = [ + [ + [true, nil, "Ruby"], + [nil, 0, "GLib"], + [false, 2 ** 8 - 1, nil], + ], + [ + [nil, 10, "A"], + [true, 20, "B"], + [false, nil, "C"], + [nil, 40, nil], + ] + ] + raw_records = raw_record_batches.inject do |all_records, record_batch| + all_records + record_batch + end + schema = [ + {name: :column0, type: :boolean}, + {name: :column1, type: :uint8}, + {name: :column2, type: :string}, + ] + record_batches = raw_record_batches.collect do |record_batch| + Arrow::RecordBatch.new(schema, record_batch) + end + iterated_records = [] + table = Arrow::Table.new(schema, record_batches) + table.each_raw_record do |record| + iterated_records << record + end + assert_equal(raw_records, iterated_records) + end +end From 3f173fffc192334728d34d75a6c5f1481fb5087c Mon Sep 17 00:00:00 2001 From: otegami Date: Mon, 4 Sep 2023 19:42:20 +0800 Subject: [PATCH 23/23] Improved the test case for multiple columns Also considered the multiple chunked layout --- .../test/each-raw-record/test-multiple-columns.rb | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ruby/red-arrow/test/each-raw-record/test-multiple-columns.rb b/ruby/red-arrow/test/each-raw-record/test-multiple-columns.rb index 2d9b080dd6277..c0547d324d26d 100644 --- a/ruby/red-arrow/test/each-raw-record/test-multiple-columns.rb +++ b/ruby/red-arrow/test/each-raw-record/test-multiple-columns.rb @@ -68,6 +68,13 @@ class EachRawRecordTableMultipleColumnsTest < Test::Unit::TestCase include EachRawRecordMultipleColumnsTests def build(schema, records) - Arrow::Table.new(schema, records) + record_batch = Arrow::RecordBatch.new(schema, records) + record_batches = [ + record_batch.slice(0, 2), + record_batch.slice(2, 0), + record_batch.slice(2, record_batch.length - 2), + ] + + Arrow::Table.new(schema, record_batches) end end