diff --git a/parquet_integration/write_parquet.py b/parquet_integration/write_parquet.py index 6c10f8b5f00..a4a0cb8ca0e 100644 --- a/parquet_integration/write_parquet.py +++ b/parquet_integration/write_parquet.py @@ -1,3 +1,5 @@ +from typing import Tuple + import pyarrow as pa import pyarrow.parquet import os @@ -6,7 +8,7 @@ PYARROW_PATH = "fixtures/pyarrow3" -def case_basic_nullable(size=1): +def case_basic_nullable() -> Tuple[dict, pa.Schema, str]: int64 = [0, 1, None, 3, None, 5, 6, 7, None, 9] float64 = [0.0, 1.0, None, 3.0, None, 5.0, 6.0, 7.0, None, 9.0] string = ["Hello", None, "aa", "", None, "abc", None, None, "def", "aaa"] @@ -38,27 +40,27 @@ def case_basic_nullable(size=1): return ( { - "int64": int64 * size, - "float64": float64 * size, - "string": string * size, - "bool": boolean * size, - "date": int64 * size, - "uint32": int64 * size, - "string_large": string_large * size, - "decimal_9": decimal * size, - "decimal_18": decimal * size, - "decimal_26": decimal * size, - "timestamp_us": int64 * size, - "timestamp_s": int64 * size, - "emoji": emoji * size, - "timestamp_s_utc": int64 * size, + "int64": int64, + "float64": float64, + "string": string, + "bool": boolean, + "date": int64, + "uint32": int64, + "string_large": string_large, + "decimal_9": decimal, + "decimal_18": decimal, + "decimal_26": decimal, + "timestamp_us": int64, + "timestamp_s": int64, + "emoji": emoji, + "timestamp_s_utc": int64, }, schema, - f"basic_nullable_{size*10}.parquet", + f"basic_nullable_10.parquet", ) -def case_basic_required(size=1): +def case_basic_required() -> Tuple[dict, pa.Schema, str]: int64 = [-256, -1, 0, 1, 2, 3, 4, 5, 6, 7] uint32 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] float64 = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0] @@ -87,22 +89,22 @@ def case_basic_required(size=1): return ( { - "int64": int64 * size, - "float64": float64 * size, - "string": string * size, - "bool": boolean * size, - "date": int64 * size, - "uint32": uint32 * size, - "decimal_9": decimal * size, - "decimal_18": decimal * size, - "decimal_26": decimal * size, + "int64": int64, + "float64": float64, + "string": string, + "bool": boolean, + "date": int64, + "uint32": uint32, + "decimal_9": decimal, + "decimal_18": decimal, + "decimal_26": decimal, }, schema, - f"basic_required_{size*10}.parquet", + f"basic_required_10.parquet", ) -def case_nested(size): +def case_nested() -> Tuple[dict, pa.Schema, str]: items_nullable = [[0, 1], None, [2, None, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] items_required = [[0, 1], None, [2, 0, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] all_required = [[0, 1], [], [2, 0, 3], [4, 5, 6], [], [7, 8, 9], [], [10]] @@ -178,23 +180,23 @@ def case_nested(size): schema = pa.schema(fields) return ( { - "list_int64": items_nullable * size, - "list_int64_required": items_required * size, - "list_int64_required_required": all_required * size, - "list_int16": i16 * size, - "list_bool": boolean * size, - "list_utf8": string * size, - "list_large_binary": string * size, - "list_nested_i64": items_nested * size, - "list_nested_inner_required_i64": items_required_nested * size, - "list_nested_inner_required_required_i64": items_required_nested_2 * size, + "list_int64": items_nullable, + "list_int64_required": items_required, + "list_int64_required_required": all_required, + "list_int16": i16, + "list_bool": boolean, + "list_utf8": string, + "list_large_binary": string, + "list_nested_i64": items_nested, + "list_nested_inner_required_i64": items_required_nested, + "list_nested_inner_required_required_i64": items_required_nested_2, }, schema, - f"nested_nullable_{size*10}.parquet", + f"nested_nullable_10.parquet", ) -def case_struct(size): +def case_struct() -> Tuple[dict, pa.Schema, str]: string = ["Hello", None, "aa", "", None, "abc", None, None, "def", "aaa"] boolean = [True, None, False, False, None, True, None, None, True, True] struct_fields = [ @@ -220,31 +222,30 @@ def case_struct(size): ) struct = pa.StructArray.from_arrays( - [pa.array(string * size), pa.array(boolean * size)], + [pa.array(string), pa.array(boolean)], fields=struct_fields, ) return ( { "struct": struct, "struct_struct": pa.StructArray.from_arrays( - [struct, pa.array(boolean * size)], + [struct, pa.array(boolean)], names=["f1", "f2"], ), }, schema, - f"struct_nullable_{size*10}.parquet", + f"struct_nullable_10.parquet", ) def write_pyarrow( case, - size: int, page_version: int, use_dictionary: bool, multiple_pages: bool, compression: str, ): - data, schema, path = case(size) + data, schema, path = case base_path = f"{PYARROW_PATH}/v{page_version}" if use_dictionary: @@ -279,12 +280,12 @@ def write_pyarrow( for version in [1, 2]: for use_dict in [True, False]: for compression in ["lz4", None, "snappy"]: - write_pyarrow(case, 1, version, use_dict, False, compression) + write_pyarrow(case(), version, use_dict, False, compression) def case_benches(size): assert size % 8 == 0 - data, schema, _ = case_basic_nullable(1) + data, schema, _ = case_basic_nullable() for k in data: data[k] = data[k][:8] * (size // 8) return data, schema, f"benches_{size}.parquet" @@ -292,7 +293,7 @@ def case_benches(size): def case_benches_required(size): assert size % 8 == 0 - data, schema, _ = case_basic_required(1) + data, schema, _ = case_basic_required() for k in data: data[k] = data[k][:8] * (size // 8) return data, schema, f"benches_required_{size}.parquet" @@ -301,14 +302,14 @@ def case_benches_required(size): # for read benchmarks for i in range(10, 22, 2): # two pages (dict) - write_pyarrow(case_benches, 2 ** i, 1, True, False, None) + write_pyarrow(case_benches(2 ** i), 1, True, False, None) # single page - write_pyarrow(case_benches, 2 ** i, 1, False, False, None) + write_pyarrow(case_benches(2 ** i), 1, False, False, None) # single page required - write_pyarrow(case_benches_required, 2 ** i, 1, False, False, None) + write_pyarrow(case_benches_required(2 ** i), 1, False, False, None) # multiple pages - write_pyarrow(case_benches, 2 ** i, 1, False, True, None) + write_pyarrow(case_benches(2 ** i), 1, False, True, None) # multiple compressed pages - write_pyarrow(case_benches, 2 ** i, 1, False, True, "snappy") + write_pyarrow(case_benches(2 ** i), 1, False, True, "snappy") # single compressed page - write_pyarrow(case_benches, 2 ** i, 1, False, False, "snappy") + write_pyarrow(case_benches(2 ** i), 1, False, False, "snappy") diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 0fa82f69eb8..2bf5e3372f7 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -17,11 +17,19 @@ type ArrayStats = (Arc, Option>); pub fn read_column( mut reader: R, row_group: usize, - column: usize, + column: &str, ) -> Result { let metadata = read_metadata(&mut reader)?; let schema = infer_schema(&metadata)?; + let column = schema + .fields + .iter() + .enumerate() + .filter_map(|(i, f)| if f.name == column { Some(i) } else { None }) + .next() + .unwrap(); + let mut reader = FileReader::try_new(reader, Some(&[column]), None, None, None)?; let field = &schema.fields[column]; @@ -34,11 +42,11 @@ pub fn read_column( )) } -pub fn pyarrow_nested_nullable(column: usize) -> Box { +pub fn pyarrow_nested_nullable(column: &str) -> Box { let offsets = Buffer::from_slice([0, 2, 2, 5, 8, 8, 11, 11, 12]); let values = match column { - 0 => { + "list_int64" => { // [[0, 1], None, [2, None, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] Arc::new(PrimitiveArray::::from(&[ Some(0), @@ -55,7 +63,7 @@ pub fn pyarrow_nested_nullable(column: usize) -> Box { Some(10), ])) as Arc } - 1 | 2 => { + "list_int64_required" | "list_int64_required_required" => { // [[0, 1], None, [2, 0, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] Arc::new(PrimitiveArray::::from(&[ Some(0), @@ -72,7 +80,7 @@ pub fn pyarrow_nested_nullable(column: usize) -> Box { Some(10), ])) as Arc } - 3 => Arc::new(PrimitiveArray::::from(&[ + "list_int16" => Arc::new(PrimitiveArray::::from(&[ Some(0), Some(1), Some(2), @@ -86,7 +94,7 @@ pub fn pyarrow_nested_nullable(column: usize) -> Box { Some(9), Some(10), ])) as Arc, - 4 => Arc::new(BooleanArray::from(&[ + "list_bool" => Arc::new(BooleanArray::from(&[ Some(false), Some(true), Some(true), @@ -112,7 +120,7 @@ pub fn pyarrow_nested_nullable(column: usize) -> Box { [""], ] */ - 5 => Arc::new(Utf8Array::::from(&[ + "list_utf8" => Arc::new(Utf8Array::::from(&[ Some("Hello".to_string()), Some("bbb".to_string()), Some("aa".to_string()), @@ -126,7 +134,7 @@ pub fn pyarrow_nested_nullable(column: usize) -> Box { Some("bbb".to_string()), Some("".to_string()), ])), - 6 => Arc::new(BinaryArray::::from(&[ + "list_large_binary" => Arc::new(BinaryArray::::from(&[ Some(b"Hello".to_vec()), Some(b"bbb".to_vec()), Some(b"aa".to_vec()), @@ -140,40 +148,23 @@ pub fn pyarrow_nested_nullable(column: usize) -> Box { Some(b"bbb".to_vec()), Some(b"".to_vec()), ])), - 7 | 8 | 9 => Arc::new(NullArray::from_data(DataType::Null, 1)), + "list_nested_i64" + | "list_nested_inner_required_i64" + | "list_nested_inner_required_required_i64" => { + Arc::new(NullArray::from_data(DataType::Null, 1)) + } _ => unreachable!(), }; match column { - 0 | 1 | 3 | 4 | 5 | 6 => { - let field = match column { - 0 => Field::new("item", DataType::Int64, true), - 1 => Field::new("item", DataType::Int64, false), - 3 => Field::new("item", DataType::Int16, true), - 4 => Field::new("item", DataType::Boolean, true), - 5 => Field::new("item", DataType::Utf8, true), - 6 => Field::new("item", DataType::LargeBinary, true), - _ => unreachable!(), - }; - - let validity = Some(Bitmap::from([ - true, false, true, true, true, true, false, true, - ])); - // [0, 2, 2, 5, 8, 8, 11, 11, 12] - // [[a1, a2], None, [a3, a4, a5], [a6, a7, a8], [], [a9, a10, a11], None, [a12]] - let data_type = DataType::List(Box::new(field)); - Box::new(ListArray::::from_data( - data_type, offsets, values, validity, - )) - } - 2 => { + "list_int64_required_required" => { // [[0, 1], [], [2, None, 3], [4, 5, 6], [], [7, 8, 9], [], [10]] let data_type = DataType::List(Box::new(Field::new("item", DataType::Int64, false))); Box::new(ListArray::::from_data( data_type, offsets, values, None, )) } - 7 => { + "list_nested_i64" => { // [[0, 1]], None, [[2, None], [3]], [[4, 5], [6]], [], [[7], None, [9]], [[], [None], None], [[10]] let data = [ Some(vec![Some(vec![Some(0), Some(1)])]), @@ -191,7 +182,7 @@ pub fn pyarrow_nested_nullable(column: usize) -> Box { let array: ListArray = a.into(); Box::new(array) } - 8 => { + "list_nested_inner_required_i64" => { let data = [ Some(vec![Some(vec![Some(0), Some(1)])]), None, @@ -208,7 +199,7 @@ pub fn pyarrow_nested_nullable(column: usize) -> Box { let array: ListArray = a.into(); Box::new(array) } - 9 => { + "list_nested_inner_required_required_i64" => { let data = [ Some(vec![Some(vec![Some(0), Some(1)])]), None, @@ -229,11 +220,31 @@ pub fn pyarrow_nested_nullable(column: usize) -> Box { let array: ListArray = a.into(); Box::new(array) } - _ => unreachable!(), + _ => { + let field = match column { + "list_int64" => Field::new("item", DataType::Int64, true), + "list_int64_required" => Field::new("item", DataType::Int64, false), + "list_int16" => Field::new("item", DataType::Int16, true), + "list_bool" => Field::new("item", DataType::Boolean, true), + "list_utf8" => Field::new("item", DataType::Utf8, true), + "list_large_binary" => Field::new("item", DataType::LargeBinary, true), + _ => unreachable!(), + }; + + let validity = Some(Bitmap::from([ + true, false, true, true, true, true, false, true, + ])); + // [0, 2, 2, 5, 8, 8, 11, 11, 12] + // [[a1, a2], None, [a3, a4, a5], [a6, a7, a8], [], [a9, a10, a11], None, [a12]] + let data_type = DataType::List(Box::new(field)); + Box::new(ListArray::::from_data( + data_type, offsets, values, validity, + )) + } } } -pub fn pyarrow_nullable(column: usize) -> Box { +pub fn pyarrow_nullable(column: &str) -> Box { let i64_values = &[ Some(0), Some(1), @@ -248,8 +259,8 @@ pub fn pyarrow_nullable(column: usize) -> Box { ]; match column { - 0 => Box::new(PrimitiveArray::::from(i64_values)), - 1 => Box::new(PrimitiveArray::::from(&[ + "int64" => Box::new(PrimitiveArray::::from(i64_values)), + "float64" => Box::new(PrimitiveArray::::from(&[ Some(0.0), Some(1.0), None, @@ -261,7 +272,7 @@ pub fn pyarrow_nullable(column: usize) -> Box { None, Some(9.0), ])), - 2 => Box::new(Utf8Array::::from(&[ + "string" => Box::new(Utf8Array::::from(&[ Some("Hello".to_string()), None, Some("aa".to_string()), @@ -273,7 +284,7 @@ pub fn pyarrow_nullable(column: usize) -> Box { Some("def".to_string()), Some("aaa".to_string()), ])), - 3 => Box::new(BooleanArray::from(&[ + "bool" => Box::new(BooleanArray::from(&[ Some(true), None, Some(false), @@ -285,100 +296,94 @@ pub fn pyarrow_nullable(column: usize) -> Box { Some(true), Some(true), ])), - 4 => Box::new( + "date" => Box::new( PrimitiveArray::::from(i64_values) .to(DataType::Timestamp(TimeUnit::Millisecond, None)), ), - 5 => { + "uint32" => { let values = i64_values .iter() .map(|x| x.map(|x| x as u32)) .collect::>(); Box::new(PrimitiveArray::::from(values)) } - 6 => { + "string_large" => { let keys = PrimitiveArray::::from([Some(0), Some(1), None, Some(1)]); let values = Arc::new(PrimitiveArray::::from_slice([10, 200])); Box::new(DictionaryArray::::from_data(keys, values)) } - // decimal 9 - 7 => { + "decimal_9" => { let values = i64_values .iter() .map(|x| x.map(|x| x as i128)) .collect::>(); Box::new(PrimitiveArray::::from(values).to(DataType::Decimal(9, 0))) } - // decimal 18 - 8 => { + "decimal_18" => { let values = i64_values .iter() .map(|x| x.map(|x| x as i128)) .collect::>(); Box::new(PrimitiveArray::::from(values).to(DataType::Decimal(18, 0))) } - // decimal 26 - 9 => { + "decimal_26" => { let values = i64_values .iter() .map(|x| x.map(|x| x as i128)) .collect::>(); Box::new(PrimitiveArray::::from(values).to(DataType::Decimal(26, 0))) } - 10 => Box::new( + "timestamp_us" => Box::new( PrimitiveArray::::from(i64_values) .to(DataType::Timestamp(TimeUnit::Microsecond, None)), ), - 11 => Box::new( + "timestamp_s" => Box::new( PrimitiveArray::::from(i64_values).to(DataType::Timestamp(TimeUnit::Second, None)), ), - 13 => Box::new( - PrimitiveArray::::from(i64_values).to(DataType::Timestamp( - TimeUnit::Second, - Some("UTC".to_string()), - )), - ), + "timestamp_s_utc" => Box::new(PrimitiveArray::::from(i64_values).to( + DataType::Timestamp(TimeUnit::Second, Some("UTC".to_string())), + )), _ => unreachable!(), } } -pub fn pyarrow_nullable_statistics(column: usize) -> Option> { +pub fn pyarrow_nullable_statistics(column: &str) -> Option> { Some(match column { - 0 => Box::new(PrimitiveStatistics:: { + "int64" => Box::new(PrimitiveStatistics:: { data_type: DataType::Int64, distinct_count: None, null_count: Some(3), min_value: Some(0), max_value: Some(9), }), - 1 => Box::new(PrimitiveStatistics:: { + "float64" => Box::new(PrimitiveStatistics:: { data_type: DataType::Float64, distinct_count: None, null_count: Some(3), min_value: Some(0.0), max_value: Some(9.0), }), - 2 => Box::new(Utf8Statistics { + "string" => Box::new(Utf8Statistics { null_count: Some(4), distinct_count: None, min_value: Some("".to_string()), max_value: Some("def".to_string()), }), - 3 => Box::new(BooleanStatistics { + "bool" => Box::new(BooleanStatistics { null_count: Some(4), distinct_count: None, min_value: Some(false), max_value: Some(true), }), - 4 => Box::new(PrimitiveStatistics:: { + "date" => Box::new(PrimitiveStatistics:: { data_type: DataType::Timestamp(TimeUnit::Millisecond, None), distinct_count: None, null_count: Some(3), min_value: Some(0), max_value: Some(9), }), - 5 => Box::new(PrimitiveStatistics:: { + "uint32" => Box::new(PrimitiveStatistics:: { data_type: DataType::UInt32, null_count: Some(3), distinct_count: None, @@ -386,44 +391,43 @@ pub fn pyarrow_nullable_statistics(column: usize) -> Option> min_value: Some(0), max_value: Some(9), }), - 6 => return None, - // Decimal statistics - 7 => Box::new(PrimitiveStatistics:: { + "string_large" => return None, + "decimal_9" => Box::new(PrimitiveStatistics:: { distinct_count: None, null_count: Some(3), min_value: Some(0i128), max_value: Some(9i128), data_type: DataType::Decimal(9, 0), }), - 8 => Box::new(PrimitiveStatistics:: { + "decimal_18" => Box::new(PrimitiveStatistics:: { distinct_count: None, null_count: Some(3), min_value: Some(0i128), max_value: Some(9i128), data_type: DataType::Decimal(18, 0), }), - 9 => Box::new(PrimitiveStatistics:: { + "decimal_26" => Box::new(PrimitiveStatistics:: { distinct_count: None, null_count: Some(3), min_value: Some(0i128), max_value: Some(9i128), data_type: DataType::Decimal(26, 0), }), - 10 => Box::new(PrimitiveStatistics:: { + "timestamp_us" => Box::new(PrimitiveStatistics:: { data_type: DataType::Timestamp(TimeUnit::Microsecond, None), distinct_count: None, null_count: Some(3), min_value: Some(0), max_value: Some(9), }), - 11 => Box::new(PrimitiveStatistics:: { + "timestamp_s" => Box::new(PrimitiveStatistics:: { data_type: DataType::Timestamp(TimeUnit::Second, None), distinct_count: None, null_count: Some(3), min_value: Some(0), max_value: Some(9), }), - 13 => Box::new(PrimitiveStatistics:: { + "timestamp_s_utc" => Box::new(PrimitiveStatistics:: { data_type: DataType::Timestamp(TimeUnit::Second, Some("UTC".to_string())), distinct_count: None, null_count: Some(3), @@ -435,7 +439,7 @@ pub fn pyarrow_nullable_statistics(column: usize) -> Option> } // these values match the values in `integration` -pub fn pyarrow_required(column: usize) -> Box { +pub fn pyarrow_required(column: &str) -> Box { let i64_values = &[ Some(-256), Some(-1), @@ -450,31 +454,28 @@ pub fn pyarrow_required(column: usize) -> Box { ]; match column { - 0 => Box::new(PrimitiveArray::::from(i64_values)), - 3 => Box::new(BooleanArray::from_slice(&[ + "int64" => Box::new(PrimitiveArray::::from(i64_values)), + "bool" => Box::new(BooleanArray::from_slice(&[ true, true, false, false, false, true, true, true, true, true, ])), - 2 => Box::new(Utf8Array::::from_slice(&[ + "string" => Box::new(Utf8Array::::from_slice(&[ "Hello", "bbb", "aa", "", "bbb", "abc", "bbb", "bbb", "def", "aaa", ])), - // decimal 9 - 6 => { + "decimal_9" => { let values = i64_values .iter() .map(|x| x.map(|x| x as i128)) .collect::>(); Box::new(PrimitiveArray::::from(values).to(DataType::Decimal(9, 0))) } - // decimal 18 - 7 => { + "decimal_18" => { let values = i64_values .iter() .map(|x| x.map(|x| x as i128)) .collect::>(); Box::new(PrimitiveArray::::from(values).to(DataType::Decimal(18, 0))) } - // decimal 26 - 8 => { + "decimal_26" => { let values = i64_values .iter() .map(|x| x.map(|x| x as i128)) @@ -485,45 +486,42 @@ pub fn pyarrow_required(column: usize) -> Box { } } -pub fn pyarrow_required_statistics(column: usize) -> Option> { +pub fn pyarrow_required_statistics(column: &str) -> Option> { Some(match column { - 0 => Box::new(PrimitiveStatistics:: { + "int64" => Box::new(PrimitiveStatistics:: { data_type: DataType::Int64, null_count: Some(0), distinct_count: None, min_value: Some(0), max_value: Some(9), }), - 3 => Box::new(BooleanStatistics { + "bool" => Box::new(BooleanStatistics { null_count: Some(0), distinct_count: None, min_value: Some(false), max_value: Some(true), }), - 2 => Box::new(Utf8Statistics { + "string" => Box::new(Utf8Statistics { null_count: Some(0), distinct_count: None, min_value: Some("".to_string()), max_value: Some("def".to_string()), }), - // decimal_9 - 6 => Box::new(PrimitiveStatistics:: { + "decimal_9" => Box::new(PrimitiveStatistics:: { distinct_count: None, null_count: Some(0), min_value: Some(0i128), max_value: Some(9i128), data_type: DataType::Decimal(9, 0), }), - // decimal_18 - 7 => Box::new(PrimitiveStatistics:: { + "decimal_18" => Box::new(PrimitiveStatistics:: { distinct_count: None, null_count: Some(0), min_value: Some(0i128), max_value: Some(9i128), data_type: DataType::Decimal(18, 0), }), - // decimal_26 - 8 => Box::new(PrimitiveStatistics:: { + "decimal_26" => Box::new(PrimitiveStatistics:: { distinct_count: None, null_count: Some(0), min_value: Some(0i128), @@ -534,28 +532,28 @@ pub fn pyarrow_required_statistics(column: usize) -> Option> }) } -pub fn pyarrow_nested_nullable_statistics(column: usize) -> Option> { +pub fn pyarrow_nested_nullable_statistics(column: &str) -> Option> { Some(match column { - 3 => Box::new(PrimitiveStatistics:: { + "list_int16" => Box::new(PrimitiveStatistics:: { data_type: DataType::Int16, distinct_count: None, null_count: Some(1), min_value: Some(0), max_value: Some(10), }), - 4 => Box::new(BooleanStatistics { + "list_bool" => Box::new(BooleanStatistics { distinct_count: None, null_count: Some(1), min_value: Some(false), max_value: Some(true), }), - 5 => Box::new(Utf8Statistics { + "list_utf8" => Box::new(Utf8Statistics { distinct_count: None, null_count: Some(1), min_value: Some("".to_string()), max_value: Some("def".to_string()), }), - 6 => Box::new(BinaryStatistics { + "list_large_binary" => Box::new(BinaryStatistics { distinct_count: None, null_count: Some(1), min_value: Some(b"".to_vec()), @@ -571,7 +569,7 @@ pub fn pyarrow_nested_nullable_statistics(column: usize) -> Option Box { +pub fn pyarrow_struct(column: &str) -> Box { let boolean = [ Some(true), None, @@ -590,7 +588,7 @@ pub fn pyarrow_struct(column: usize) -> Box { Field::new("f2", DataType::Boolean, true), ]; match column { - 0 => { + "struct" => { let string = [ Some("Hello"), None, @@ -613,8 +611,8 @@ pub fn pyarrow_struct(column: usize) -> Box { None, )) } - 1 => { - let struct_ = pyarrow_struct(0).into(); + "struct_struct" => { + let struct_ = pyarrow_struct("struct").into(); let values = vec![struct_, boolean]; Box::new(StructArray::from_data( DataType::Struct(vec![ @@ -629,15 +627,15 @@ pub fn pyarrow_struct(column: usize) -> Box { } } -pub fn pyarrow_struct_statistics(column: usize) -> Option> { +pub fn pyarrow_struct_statistics(column: &str) -> Option> { match column { - 0 => Some(Box::new(BooleanStatistics { + "struct" => Some(Box::new(BooleanStatistics { distinct_count: None, null_count: Some(4), min_value: Some(false), max_value: Some(true), })), - 1 => Some(Box::new(BooleanStatistics { + "struct_struct" => Some(Box::new(BooleanStatistics { distinct_count: None, null_count: Some(1), min_value: Some(false), diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs index 10a383fa590..fd0358d6cef 100644 --- a/tests/it/io/parquet/read.rs +++ b/tests/it/io/parquet/read.rs @@ -7,7 +7,7 @@ use arrow2::io::parquet::read::*; use super::*; fn test_pyarrow_integration( - column: usize, + column: &str, version: usize, type_: &str, use_dict: bool, @@ -56,327 +56,348 @@ fn test_pyarrow_integration( #[test] fn v1_int64_nullable() -> Result<()> { - test_pyarrow_integration(0, 1, "basic", false, false, None) + test_pyarrow_integration("int64", 1, "basic", false, false, None) } #[test] #[ignore] // see https://issues.apache.org/jira/browse/ARROW-15073 fn v1_int64_lz4_nullable() -> Result<()> { - test_pyarrow_integration(0, 1, "basic", false, false, Some("lz4")) + test_pyarrow_integration("int64", 1, "basic", false, false, Some("lz4")) } #[test] #[ignore] // see https://issues.apache.org/jira/browse/ARROW-15073 fn v1_int64_lz4_required() -> Result<()> { - test_pyarrow_integration(0, 1, "basic", false, true, Some("lz4")) + test_pyarrow_integration("int64", 1, "basic", false, true, Some("lz4")) } #[test] fn v1_int64_required() -> Result<()> { - test_pyarrow_integration(0, 1, "basic", false, true, None) + test_pyarrow_integration("int64", 1, "basic", false, true, None) } #[test] fn v1_float64_nullable() -> Result<()> { - test_pyarrow_integration(1, 1, "basic", false, false, None) + test_pyarrow_integration("float64", 1, "basic", false, false, None) } #[test] fn v1_utf8_nullable() -> Result<()> { - test_pyarrow_integration(2, 1, "basic", false, false, None) + test_pyarrow_integration("string", 1, "basic", false, false, None) } #[test] fn v1_utf8_required() -> Result<()> { - test_pyarrow_integration(2, 1, "basic", false, true, None) + test_pyarrow_integration("string", 1, "basic", false, true, None) } #[test] fn v1_boolean_nullable() -> Result<()> { - test_pyarrow_integration(3, 1, "basic", false, false, None) + test_pyarrow_integration("bool", 1, "basic", false, false, None) } #[test] fn v1_boolean_required() -> Result<()> { - test_pyarrow_integration(3, 1, "basic", false, true, None) + test_pyarrow_integration("bool", 1, "basic", false, true, None) } #[test] fn v1_timestamp_nullable() -> Result<()> { - test_pyarrow_integration(4, 1, "basic", false, false, None) + test_pyarrow_integration("date", 1, "basic", false, false, None) } #[test] #[ignore] // pyarrow issue; see https://issues.apache.org/jira/browse/ARROW-12201 fn v1_u32_nullable() -> Result<()> { - test_pyarrow_integration(5, 1, "basic", false, false, None) + test_pyarrow_integration("uint32", 1, "basic", false, false, None) } #[test] fn v2_int64_nullable() -> Result<()> { - test_pyarrow_integration(0, 2, "basic", false, false, None) + test_pyarrow_integration("int64", 2, "basic", false, false, None) } #[test] fn v2_int64_nullable_dict() -> Result<()> { - test_pyarrow_integration(0, 2, "basic", true, false, None) + test_pyarrow_integration("int64", 2, "basic", true, false, None) } #[test] #[ignore] // see https://issues.apache.org/jira/browse/ARROW-15073 fn v2_int64_nullable_dict_lz4() -> Result<()> { - test_pyarrow_integration(0, 2, "basic", true, false, Some("lz4")) + test_pyarrow_integration("int64", 2, "basic", true, false, Some("lz4")) } #[test] fn v1_int64_nullable_dict() -> Result<()> { - test_pyarrow_integration(0, 1, "basic", true, false, None) + test_pyarrow_integration("int64", 1, "basic", true, false, None) } #[test] fn v2_int64_required_dict() -> Result<()> { - test_pyarrow_integration(0, 2, "basic", true, true, None) + test_pyarrow_integration("int64", 2, "basic", true, true, None) } #[test] fn v1_int64_required_dict() -> Result<()> { - test_pyarrow_integration(0, 1, "basic", true, true, None) + test_pyarrow_integration("int64", 1, "basic", true, true, None) } #[test] fn v2_utf8_nullable() -> Result<()> { - test_pyarrow_integration(2, 2, "basic", false, false, None) + test_pyarrow_integration("string", 2, "basic", false, false, None) } #[test] fn v2_utf8_required() -> Result<()> { - test_pyarrow_integration(2, 2, "basic", false, true, None) + test_pyarrow_integration("string", 2, "basic", false, true, None) } #[test] fn v2_utf8_nullable_dict() -> Result<()> { - test_pyarrow_integration(2, 2, "basic", true, false, None) + test_pyarrow_integration("string", 2, "basic", true, false, None) } #[test] fn v1_utf8_nullable_dict() -> Result<()> { - test_pyarrow_integration(2, 1, "basic", true, false, None) + test_pyarrow_integration("string", 1, "basic", true, false, None) } #[test] fn v2_utf8_required_dict() -> Result<()> { - test_pyarrow_integration(2, 2, "basic", true, true, None) + test_pyarrow_integration("string", 2, "basic", true, true, None) } #[test] fn v1_utf8_required_dict() -> Result<()> { - test_pyarrow_integration(2, 1, "basic", true, true, None) + test_pyarrow_integration("string", 1, "basic", true, true, None) } #[test] fn v2_boolean_nullable() -> Result<()> { - test_pyarrow_integration(3, 2, "basic", false, false, None) + test_pyarrow_integration("bool", 2, "basic", false, false, None) } #[test] fn v2_boolean_required() -> Result<()> { - test_pyarrow_integration(3, 2, "basic", false, true, None) + test_pyarrow_integration("bool", 2, "basic", false, true, None) } #[test] fn v2_nested_int64_nullable() -> Result<()> { - test_pyarrow_integration(0, 2, "nested", false, false, None) + test_pyarrow_integration("list_int64", 2, "nested", false, false, None) } #[test] fn v1_nested_int64_nullable() -> Result<()> { - test_pyarrow_integration(0, 1, "nested", false, false, None) + test_pyarrow_integration("list_int64", 1, "nested", false, false, None) } #[test] fn v2_nested_int64_nullable_required() -> Result<()> { - test_pyarrow_integration(1, 2, "nested", false, false, None) + test_pyarrow_integration("list_int64", 2, "nested", false, false, None) } #[test] fn v1_nested_int64_nullable_required() -> Result<()> { - test_pyarrow_integration(1, 1, "nested", false, false, None) + test_pyarrow_integration("list_int64", 1, "nested", false, false, None) } #[test] fn v2_nested_int64_required_required() -> Result<()> { - test_pyarrow_integration(2, 2, "nested", false, false, None) + test_pyarrow_integration("list_int64_required", 2, "nested", false, false, None) } #[test] fn v1_nested_int64_required_required() -> Result<()> { - test_pyarrow_integration(2, 1, "nested", false, false, None) + test_pyarrow_integration("list_int64_required", 1, "nested", false, false, None) } #[test] fn v2_nested_i16() -> Result<()> { - test_pyarrow_integration(3, 2, "nested", false, false, None) + test_pyarrow_integration( + "list_int64_required_required", + 2, + "nested", + false, + false, + None, + ) } #[test] fn v1_nested_i16() -> Result<()> { - test_pyarrow_integration(3, 1, "nested", false, false, None) + test_pyarrow_integration("list_int16", 1, "nested", false, false, None) } #[test] fn v2_nested_bool() -> Result<()> { - test_pyarrow_integration(4, 2, "nested", false, false, None) + test_pyarrow_integration("list_bool", 2, "nested", false, false, None) } #[test] fn v1_nested_bool() -> Result<()> { - test_pyarrow_integration(4, 1, "nested", false, false, None) + test_pyarrow_integration("list_bool", 1, "nested", false, false, None) } #[test] fn v2_nested_utf8() -> Result<()> { - test_pyarrow_integration(5, 2, "nested", false, false, None) + test_pyarrow_integration("list_utf8", 2, "nested", false, false, None) } #[test] fn v1_nested_utf8() -> Result<()> { - test_pyarrow_integration(5, 1, "nested", false, false, None) + test_pyarrow_integration("list_utf8", 1, "nested", false, false, None) } #[test] fn v2_nested_large_binary() -> Result<()> { - test_pyarrow_integration(6, 2, "nested", false, false, None) + test_pyarrow_integration("list_large_binary", 2, "nested", false, false, None) } #[test] fn v1_nested_large_binary() -> Result<()> { - test_pyarrow_integration(6, 1, "nested", false, false, None) + test_pyarrow_integration("list_large_binary", 1, "nested", false, false, None) } #[test] fn v2_nested_nested() -> Result<()> { - test_pyarrow_integration(7, 2, "nested", false, false, None) + test_pyarrow_integration("list_nested_i64", 2, "nested", false, false, None) } #[test] fn v2_nested_nested_required() -> Result<()> { - test_pyarrow_integration(8, 2, "nested", false, false, None) + test_pyarrow_integration( + "list_nested_inner_required_i64", + 2, + "nested", + false, + false, + None, + ) } #[test] fn v2_nested_nested_required_required() -> Result<()> { - test_pyarrow_integration(9, 2, "nested", false, false, None) + test_pyarrow_integration( + "list_nested_inner_required_required_i64", + 2, + "nested", + false, + false, + None, + ) } #[test] fn v1_decimal_9_nullable() -> Result<()> { - test_pyarrow_integration(7, 1, "basic", false, false, None) + test_pyarrow_integration("decimal_9", 1, "basic", false, false, None) } #[test] fn v1_decimal_9_required() -> Result<()> { - test_pyarrow_integration(6, 1, "basic", false, true, None) + test_pyarrow_integration("decimal_9", 1, "basic", false, true, None) } #[test] fn v1_decimal_9_nullable_dict() -> Result<()> { - test_pyarrow_integration(7, 1, "basic", true, false, None) + test_pyarrow_integration("decimal_9", 1, "basic", true, false, None) } #[test] fn v1_decimal_18_nullable() -> Result<()> { - test_pyarrow_integration(8, 1, "basic", false, false, None) + test_pyarrow_integration("decimal_18", 1, "basic", false, false, None) } #[test] fn v1_decimal_18_required() -> Result<()> { - test_pyarrow_integration(7, 1, "basic", false, true, None) + test_pyarrow_integration("decimal_18", 1, "basic", false, true, None) } #[test] fn v1_decimal_26_nullable() -> Result<()> { - test_pyarrow_integration(9, 1, "basic", false, false, None) + test_pyarrow_integration("decimal_26", 1, "basic", false, false, None) } #[test] fn v1_decimal_26_required() -> Result<()> { - test_pyarrow_integration(8, 1, "basic", false, true, None) + test_pyarrow_integration("decimal_26", 1, "basic", false, true, None) } #[test] fn v2_decimal_9_nullable() -> Result<()> { - test_pyarrow_integration(7, 2, "basic", false, false, None) + test_pyarrow_integration("decimal_9", 2, "basic", false, false, None) } #[test] fn v2_decimal_9_required() -> Result<()> { - test_pyarrow_integration(6, 2, "basic", false, true, None) + test_pyarrow_integration("decimal_9", 2, "basic", false, true, None) } #[test] fn v2_decimal_9_required_dict() -> Result<()> { - test_pyarrow_integration(6, 2, "basic", true, true, None) + test_pyarrow_integration("decimal_9", 2, "basic", true, true, None) } #[test] fn v2_decimal_18_nullable() -> Result<()> { - test_pyarrow_integration(8, 2, "basic", false, false, None) + test_pyarrow_integration("decimal_18", 2, "basic", false, false, None) } #[test] fn v2_decimal_18_required() -> Result<()> { - test_pyarrow_integration(7, 2, "basic", false, true, None) + test_pyarrow_integration("decimal_18", 2, "basic", false, true, None) } #[test] fn v2_decimal_18_required_dict() -> Result<()> { - test_pyarrow_integration(7, 2, "basic", true, true, None) + test_pyarrow_integration("decimal_18", 2, "basic", true, true, None) } #[test] fn v2_decimal_26_nullable() -> Result<()> { - test_pyarrow_integration(9, 2, "basic", false, false, None) + test_pyarrow_integration("decimal_26", 2, "basic", false, false, None) } #[test] fn v1_timestamp_us_nullable() -> Result<()> { - test_pyarrow_integration(10, 1, "basic", false, false, None) + test_pyarrow_integration("timestamp_us", 1, "basic", false, false, None) } #[test] fn v1_timestamp_s_nullable() -> Result<()> { - test_pyarrow_integration(11, 1, "basic", false, false, None) + test_pyarrow_integration("timestamp_s", 1, "basic", false, false, None) } #[test] fn v1_timestamp_s_nullable_dict() -> Result<()> { - test_pyarrow_integration(11, 1, "basic", true, false, None) + test_pyarrow_integration("timestamp_s", 1, "basic", true, false, None) } #[test] fn v1_timestamp_s_utc_nullable() -> Result<()> { - test_pyarrow_integration(13, 1, "basic", false, false, None) + test_pyarrow_integration("timestamp_s_utc", 1, "basic", false, false, None) } #[test] fn v2_decimal_26_required() -> Result<()> { - test_pyarrow_integration(8, 2, "basic", false, true, None) + test_pyarrow_integration("decimal_26", 2, "basic", false, true, None) } #[test] fn v2_decimal_26_required_dict() -> Result<()> { - test_pyarrow_integration(8, 2, "basic", true, true, None) + test_pyarrow_integration("decimal_26", 2, "basic", true, true, None) } #[test] fn v1_struct_optional() -> Result<()> { - test_pyarrow_integration(0, 1, "struct", false, false, None) + test_pyarrow_integration("struct", 1, "struct", false, false, None) } #[test] #[ignore] fn v1_struct_struct_optional() -> Result<()> { - test_pyarrow_integration(1, 1, "struct", false, false, None) + test_pyarrow_integration("struct_struct", 1, "struct", false, false, None) } #[test] diff --git a/tests/it/io/parquet/write.rs b/tests/it/io/parquet/write.rs index 03f08d0faee..c9141f4d515 100644 --- a/tests/it/io/parquet/write.rs +++ b/tests/it/io/parquet/write.rs @@ -6,7 +6,7 @@ use arrow2::io::parquet::write::*; use super::*; fn round_trip( - column: usize, + column: &str, nullable: bool, nested: bool, version: Version, @@ -56,7 +56,7 @@ fn round_trip( let data = writer.into_inner(); - let (result, stats) = read_column(&mut Cursor::new(data), 0, 0)?; + let (result, stats) = read_column(&mut Cursor::new(data), 0, "a1")?; assert_eq!(array.as_ref(), result.as_ref()); assert_eq!(statistics.as_ref(), stats.as_ref()); Ok(()) @@ -65,7 +65,7 @@ fn round_trip( #[test] fn int64_optional_v1() -> Result<()> { round_trip( - 0, + "int64", true, false, Version::V1, @@ -77,7 +77,7 @@ fn int64_optional_v1() -> Result<()> { #[test] fn int64_required_v1() -> Result<()> { round_trip( - 0, + "int64", false, false, Version::V1, @@ -89,7 +89,7 @@ fn int64_required_v1() -> Result<()> { #[test] fn int64_optional_v2() -> Result<()> { round_trip( - 0, + "int64", true, false, Version::V2, @@ -101,7 +101,7 @@ fn int64_optional_v2() -> Result<()> { #[test] fn int64_optional_v2_compressed() -> Result<()> { round_trip( - 0, + "int64", true, false, Version::V2, @@ -113,7 +113,7 @@ fn int64_optional_v2_compressed() -> Result<()> { #[test] fn utf8_optional_v1() -> Result<()> { round_trip( - 2, + "string", true, false, Version::V1, @@ -125,7 +125,7 @@ fn utf8_optional_v1() -> Result<()> { #[test] fn utf8_required_v1() -> Result<()> { round_trip( - 2, + "string", false, false, Version::V1, @@ -137,7 +137,7 @@ fn utf8_required_v1() -> Result<()> { #[test] fn utf8_optional_v2() -> Result<()> { round_trip( - 2, + "string", true, false, Version::V2, @@ -149,7 +149,7 @@ fn utf8_optional_v2() -> Result<()> { #[test] fn utf8_required_v2() -> Result<()> { round_trip( - 2, + "string", false, false, Version::V2, @@ -161,7 +161,7 @@ fn utf8_required_v2() -> Result<()> { #[test] fn utf8_optional_v2_compressed() -> Result<()> { round_trip( - 2, + "string", true, false, Version::V2, @@ -173,7 +173,7 @@ fn utf8_optional_v2_compressed() -> Result<()> { #[test] fn utf8_required_v2_compressed() -> Result<()> { round_trip( - 2, + "string", false, false, Version::V2, @@ -185,7 +185,7 @@ fn utf8_required_v2_compressed() -> Result<()> { #[test] fn bool_optional_v1() -> Result<()> { round_trip( - 3, + "bool", true, false, Version::V1, @@ -197,7 +197,7 @@ fn bool_optional_v1() -> Result<()> { #[test] fn bool_required_v1() -> Result<()> { round_trip( - 3, + "bool", false, false, Version::V1, @@ -209,7 +209,7 @@ fn bool_required_v1() -> Result<()> { #[test] fn bool_optional_v2_uncompressed() -> Result<()> { round_trip( - 3, + "bool", true, false, Version::V2, @@ -221,7 +221,7 @@ fn bool_optional_v2_uncompressed() -> Result<()> { #[test] fn bool_required_v2_uncompressed() -> Result<()> { round_trip( - 3, + "bool", false, false, Version::V2, @@ -233,7 +233,7 @@ fn bool_required_v2_uncompressed() -> Result<()> { #[test] fn bool_required_v2_compressed() -> Result<()> { round_trip( - 3, + "bool", false, false, Version::V2, @@ -245,7 +245,7 @@ fn bool_required_v2_compressed() -> Result<()> { #[test] fn list_int64_optional_v2() -> Result<()> { round_trip( - 0, + "list_int64", true, true, Version::V2, @@ -257,7 +257,7 @@ fn list_int64_optional_v2() -> Result<()> { #[test] fn list_int64_optional_v1() -> Result<()> { round_trip( - 0, + "list_int64", true, true, Version::V1, @@ -269,7 +269,7 @@ fn list_int64_optional_v1() -> Result<()> { #[test] fn list_bool_optional_v2() -> Result<()> { round_trip( - 4, + "list_bool", true, true, Version::V2, @@ -281,7 +281,7 @@ fn list_bool_optional_v2() -> Result<()> { #[test] fn list_bool_optional_v1() -> Result<()> { round_trip( - 4, + "list_bool", true, true, Version::V1, @@ -293,7 +293,7 @@ fn list_bool_optional_v1() -> Result<()> { #[test] fn list_utf8_optional_v2() -> Result<()> { round_trip( - 5, + "list_utf8", true, true, Version::V2, @@ -305,7 +305,7 @@ fn list_utf8_optional_v2() -> Result<()> { #[test] fn list_utf8_optional_v1() -> Result<()> { round_trip( - 5, + "list_utf8", true, true, Version::V1, @@ -317,7 +317,7 @@ fn list_utf8_optional_v1() -> Result<()> { #[test] fn list_large_binary_optional_v2() -> Result<()> { round_trip( - 6, + "list_large_binary", true, true, Version::V2, @@ -329,7 +329,7 @@ fn list_large_binary_optional_v2() -> Result<()> { #[test] fn list_large_binary_optional_v1() -> Result<()> { round_trip( - 6, + "list_large_binary", true, true, Version::V1, @@ -342,7 +342,7 @@ fn list_large_binary_optional_v1() -> Result<()> { #[ignore] fn utf8_optional_v2_delta() -> Result<()> { round_trip( - 2, + "string", true, false, Version::V2, @@ -354,7 +354,7 @@ fn utf8_optional_v2_delta() -> Result<()> { #[test] fn i32_optional_v2_dict() -> Result<()> { round_trip( - 6, + "string_large", true, false, Version::V2, @@ -366,7 +366,7 @@ fn i32_optional_v2_dict() -> Result<()> { #[test] fn i32_optional_v2_dict_compressed() -> Result<()> { round_trip( - 6, + "string_large", true, false, Version::V2, @@ -379,7 +379,7 @@ fn i32_optional_v2_dict_compressed() -> Result<()> { #[test] fn decimal_9_optional_v1() -> Result<()> { round_trip( - 7, + "decimal_9", true, false, Version::V1, @@ -391,7 +391,7 @@ fn decimal_9_optional_v1() -> Result<()> { #[test] fn decimal_9_required_v1() -> Result<()> { round_trip( - 6, + "decimal_9", false, false, Version::V1, @@ -403,7 +403,7 @@ fn decimal_9_required_v1() -> Result<()> { #[test] fn decimal_18_optional_v1() -> Result<()> { round_trip( - 8, + "decimal_18", true, false, Version::V1, @@ -415,7 +415,7 @@ fn decimal_18_optional_v1() -> Result<()> { #[test] fn decimal_18_required_v1() -> Result<()> { round_trip( - 7, + "decimal_18", false, false, Version::V1, @@ -427,7 +427,7 @@ fn decimal_18_required_v1() -> Result<()> { #[test] fn decimal_26_optional_v1() -> Result<()> { round_trip( - 9, + "decimal_26", true, false, Version::V1, @@ -439,7 +439,7 @@ fn decimal_26_optional_v1() -> Result<()> { #[test] fn decimal_26_required_v1() -> Result<()> { round_trip( - 8, + "decimal_26", false, false, Version::V1, @@ -451,7 +451,7 @@ fn decimal_26_required_v1() -> Result<()> { #[test] fn decimal_9_optional_v2() -> Result<()> { round_trip( - 7, + "decimal_9", true, false, Version::V2, @@ -463,7 +463,7 @@ fn decimal_9_optional_v2() -> Result<()> { #[test] fn decimal_9_required_v2() -> Result<()> { round_trip( - 6, + "decimal_9", false, false, Version::V2, @@ -475,7 +475,7 @@ fn decimal_9_required_v2() -> Result<()> { #[test] fn decimal_18_optional_v2() -> Result<()> { round_trip( - 8, + "decimal_18", true, false, Version::V2, @@ -487,7 +487,7 @@ fn decimal_18_optional_v2() -> Result<()> { #[test] fn decimal_18_required_v2() -> Result<()> { round_trip( - 7, + "decimal_18", false, false, Version::V2, @@ -499,7 +499,7 @@ fn decimal_18_required_v2() -> Result<()> { #[test] fn decimal_26_optional_v2() -> Result<()> { round_trip( - 9, + "decimal_26", true, false, Version::V2, @@ -511,7 +511,7 @@ fn decimal_26_optional_v2() -> Result<()> { #[test] fn decimal_26_required_v2() -> Result<()> { round_trip( - 8, + "decimal_26", false, false, Version::V2,