@@ -1033,15 +1033,15 @@ impl ArrowColumnWriterFactory {
 
         match data_type {
             _ if data_type.is_primitive() => out.push(col(leaves.next().unwrap())?),
-            ArrowDataType::FixedSizeBinary(_) | ArrowDataType::Boolean | ArrowDataType::Null => out.push(col(leaves.next().unwrap())?),
+            ArrowDataType::FixedSizeBinary(_) | ArrowDataType::Boolean | ArrowDataType::Null => {
+                out.push(col(leaves.next().unwrap())?)
+            }
             ArrowDataType::LargeBinary
             | ArrowDataType::Binary
             | ArrowDataType::Utf8
             | ArrowDataType::LargeUtf8
             | ArrowDataType::BinaryView
-            | ArrowDataType::Utf8View => {
-                out.push(bytes(leaves.next().unwrap())?)
-            }
+            | ArrowDataType::Utf8View => out.push(bytes(leaves.next().unwrap())?),
             ArrowDataType::List(f)
             | ArrowDataType::LargeList(f)
             | ArrowDataType::FixedSizeList(f, _) => {
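For context on the first two arms above: the leading guard relies on `arrow_schema::DataType::is_primitive`, which returns true only for numeric and temporal types, so `Boolean`, `FixedSizeBinary`, and `Null` need their own arm even though they also go to the plain column writer. A standalone sketch (not part of the diff) of how that guard classifies types:

    // Standalone sketch (not part of the diff): how `is_primitive` classifies
    // the types matched above.
    use arrow_schema::DataType;

    fn main() {
        assert!(DataType::Int32.is_primitive()); // numeric: caught by the guard
        assert!(!DataType::Boolean.is_primitive()); // needs its own arm
        assert!(!DataType::FixedSizeBinary(16).is_primitive()); // needs its own arm
        assert!(!DataType::Utf8.is_primitive()); // routed to the byte-array writer
    }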
@@ -1058,21 +1058,29 @@ impl ArrowColumnWriterFactory {
                     self.get_arrow_column_writer(f[1].data_type(), props, leaves, out)?
                 }
                 _ => unreachable!("invalid map type"),
-            }
+            },
             ArrowDataType::Dictionary(_, value_type) => match value_type.as_ref() {
-                ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Binary | ArrowDataType::LargeBinary => {
-                    out.push(bytes(leaves.next().unwrap())?)
-                }
+                ArrowDataType::Utf8
+                | ArrowDataType::LargeUtf8
+                | ArrowDataType::Binary
+                | ArrowDataType::LargeBinary => out.push(bytes(leaves.next().unwrap())?),
                 ArrowDataType::Utf8View | ArrowDataType::BinaryView => {
                     out.push(bytes(leaves.next().unwrap())?)
                 }
-                ArrowDataType::FixedSizeBinary(_) => {
+                ArrowDataType::FixedSizeBinary(_) => out.push(bytes(leaves.next().unwrap())?),
+                _ => out.push(col(leaves.next().unwrap())?),
+            },
+            ArrowDataType::RunEndEncoded(_run_ends, value_type) => match value_type.data_type() {
+                ArrowDataType::Utf8
+                | ArrowDataType::LargeUtf8
+                | ArrowDataType::Binary
+                | ArrowDataType::LargeBinary => out.push(bytes(leaves.next().unwrap())?),
+                ArrowDataType::Utf8View | ArrowDataType::BinaryView => {
                     out.push(bytes(leaves.next().unwrap())?)
                 }
-                _ => {
-                    out.push(col(leaves.next().unwrap())?)
-                }
-            }
+                ArrowDataType::FixedSizeBinary(_) => out.push(bytes(leaves.next().unwrap())?),
+                _ => out.push(col(leaves.next().unwrap())?),
+            },
             _ => return Err(ParquetError::NYI(
                 format!(
                     "Attempting to write an Arrow type {data_type:?} to parquet that is not yet implemented"
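The new `RunEndEncoded` arm mirrors the `Dictionary` arm directly above it: the run-ends field is ignored and dispatch happens on the values field's data type, routing byte-like values to the byte-array writer (`bytes`) and everything else to the plain column writer (`col`). A minimal standalone sketch (not part of the diff) of the logical type that arm destructures, built with `arrow_array`'s `StringRunBuilder`:

    // Standalone sketch (not part of the diff): the DataType shape the new
    // RunEndEncoded arm destructures.
    use arrow_array::{builder::StringRunBuilder, types::Int16Type, Array, RunArray};
    use arrow_schema::DataType;

    fn main() {
        let mut builder = StringRunBuilder::<Int16Type>::new();
        builder.append_value("alpha");
        let ree: RunArray<Int16Type> = builder.finish();

        // Two fields: run ends and values. Only the values field decides
        // whether the byte-array or the plain column writer is used.
        match ree.data_type() {
            DataType::RunEndEncoded(run_ends, values) => {
                assert_eq!(run_ends.data_type(), &DataType::Int16);
                assert_eq!(values.data_type(), &DataType::Utf8);
            }
            other => unreachable!("unexpected type {other:?}"),
        }
    }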
@@ -1166,6 +1174,7 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi
                         write_primitive(typed, array.values(), levels)
                     }
                 },
+                ArrowDataType::RunEndEncoded(_run_ends, _value_type) => todo!(),
                 _ => {
                     let array = arrow_cast::cast(column, &ArrowDataType::Int32)?;
                     let array = array.as_primitive::<Int32Type>();
@@ -1248,6 +1257,7 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi
                         write_primitive(typed, array.values(), levels)
                     }
                 },
+                ArrowDataType::RunEndEncoded(_run_ends, _values) => todo!(),
                 _ => {
                     let array = arrow_cast::cast(column, &ArrowDataType::Int64)?;
                     let array = array.as_primitive::<Int64Type>();
@@ -1324,6 +1334,7 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi
                     let array = column.as_primitive::<Float16Type>();
                     get_float_16_array_slice(array, indices)
                 }
+                ArrowDataType::RunEndEncoded(_run_ends, _values) => todo!(),
                 _ => {
                     return Err(ParquetError::NYI(
                         "Attempting to write an Arrow type that is not yet implemented".to_string(),
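The three `RunEndEncoded => todo!()` arms added to `write_leaf` are placeholders: with this change, a run-end-encoded column whose values go through the Int32, Int64, or fixed-length-byte-array paths still panics rather than being written. One way such arms could eventually be filled in, sketched here purely for illustration and not taken from this PR, is to expand the runs into a flat array and hand that to the existing primitive path. The sketch assumes `RunArray::get_physical_index` and `PrimitiveRunBuilder` from `arrow_array`:

    // Illustrative sketch only (not what this PR does): expand an Int32-valued
    // run array into a flat Int32Array by resolving every logical index to its
    // physical value, so the existing write_primitive path could consume it.
    use arrow_array::builder::PrimitiveRunBuilder;
    use arrow_array::cast::AsArray;
    use arrow_array::types::{Int16Type, Int32Type};
    use arrow_array::{Array, Int32Array, RunArray};

    fn expand_int32_runs(ree: &RunArray<Int16Type>) -> Int32Array {
        let values = ree.values().as_primitive::<Int32Type>();
        (0..ree.len())
            .map(|i| {
                let physical = ree.get_physical_index(i);
                values.is_valid(physical).then(|| values.value(physical))
            })
            .collect()
    }

    fn main() {
        let mut builder = PrimitiveRunBuilder::<Int16Type, Int32Type>::new();
        builder.extend([Some(1); 3].into_iter().chain([Some(2); 2]));
        let flat = expand_int32_runs(&builder.finish());
        assert_eq!(flat.values().as_ref(), &[1, 1, 1, 2, 2]);
    }

Fully expanding defeats the memory benefit of run-end encoding, so a real implementation would more likely iterate the runs and emit repeated values directly; the sketch only shows the semantics the `todo!()` arms need to preserve.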
@@ -4293,4 +4304,50 @@ mod tests {
         assert_eq!(get_dict_page_size(col0_meta), 1024 * 1024);
         assert_eq!(get_dict_page_size(col1_meta), 1024 * 1024 * 4);
     }
+
+    #[test]
+    fn arrow_writer_run_end_encoded() {
+        // Create a run array of strings
+        let mut builder = StringRunBuilder::<Int16Type>::new();
+        builder.extend(
+            vec![Some("alpha"); 1000]
+                .into_iter()
+                .chain(vec![Some("beta"); 1000]),
+        );
+        let run_array: RunArray<Int16Type> = builder.finish();
+        println!("run_array type: {:?}", run_array.data_type());
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "ree",
+            run_array.data_type().clone(),
+            run_array.is_nullable(),
+        )]));
+
+        // Write to parquet
+        let mut parquet_bytes: Vec<u8> = Vec::new();
+        let mut writer = ArrowWriter::try_new(&mut parquet_bytes, schema.clone(), None).unwrap();
+        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(run_array)]).unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+
+        // The schema read back is plain Utf8, not dictionary- or REE-encoded
+        let expected_schema = Arc::new(Schema::new(vec![Field::new(
+            "ree",
+            arrow_schema::DataType::Utf8,
+            false,
+        )]));
+
+        // Read from parquet
+        let bytes = Bytes::from(parquet_bytes);
+        let reader = ParquetRecordBatchReaderBuilder::try_new(bytes).unwrap();
+        assert_eq!(reader.schema(), &expected_schema);
+        let batches: Vec<_> = reader
+            .build()
+            .unwrap()
+            .collect::<ArrowResult<Vec<_>>>()
+            .unwrap();
+        assert_eq!(batches.len(), 2);
+        // Count the total number of rows across batches
+        let total_rows = batches.iter().map(|b| b.num_rows()).sum::<usize>();
+        assert_eq!(total_rows, 2000);
+    }
 }
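One note on the new test's final assertions: `batches.len() == 2` falls out of the Arrow reader's default batch size of 1024 rows, so 2000 rows come back as batches of 1024 and 976. If a single batch were wanted, the reader could be configured explicitly; a sketch reusing the test's `bytes` binding:

    // Sketch only: read everything back in one batch instead of the
    // default 1024-row chunks.
    let reader = ParquetRecordBatchReaderBuilder::try_new(bytes)
        .unwrap()
        .with_batch_size(2000)
        .build()
        .unwrap();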