Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Write a two-dimensional list to parquet file failed #992

Closed
b41sh opened this issue May 18, 2022 · 3 comments · Fixed by #1007
Closed

Write a two-dimensional list to parquet file failed #992

b41sh opened this issue May 18, 2022 · 3 comments · Fixed by #1007
Assignees
Labels
bug Something isn't working

Comments

@b41sh
Copy link
Contributor

b41sh commented May 18, 2022

Writing a two-dimensional (nested) list to a Parquet file fails.

sample code:

use std::fs::File;
use std::sync::Arc;

use arrow2::{
    array::{Array, ListArray, PrimitiveArray},
    chunk::Chunk,
    datatypes::{DataType, Field, Schema},
    error::Result,
    io::parquet::write::{
        CompressionOptions, Encoding, FileWriter, RowGroupIterator, Version, WriteOptions,
    },
};

type LargeListArray = ListArray<i64>;

/// Writes a single chunk of columns to a newly created file at `path` using
/// the arrow2 Parquet writer.
///
/// The write options are fixed: statistics enabled, no compression, format
/// version 2, with `Encoding::Plain` passed as the only encoding.
///
/// # Errors
///
/// Propagates any error returned while building the row-group iterator,
/// creating the file, or starting, writing to, or finishing the writer.
fn write_batch(path: &str, schema: Schema, columns: Chunk<Arc<dyn Array>>) -> Result<()> {
    let options = WriteOptions {
        write_statistics: true,
        compression: CompressionOptions::Uncompressed,
        version: Version::V2,
    };

    // The row-group iterator consumes an iterator of fallible chunks; only one is supplied.
    let chunks = vec![Ok(columns)].into_iter();
    let row_groups = RowGroupIterator::try_new(chunks, &schema, options, vec![Encoding::Plain])?;

    // `File::create` makes a new file, truncating any existing one at `path`.
    let mut writer = FileWriter::try_new(File::create(path)?, schema, options)?;

    writer.start()?;
    for row_group in row_groups {
        writer.write(row_group?)?;
    }
    // The value returned by `end` is not needed here.
    writer.end(None)?;

    Ok(())
}

/// Builds a doubly nested `LargeList<LargeList<Int64>>` column holding the
/// single value `[[1, 2, 3], [4, 5, 6], [7, 8, 9]]`, attempts to write it to
/// `test.parquet`, and prints whether the write succeeded.
///
/// A write failure is only printed; this function itself always returns `Ok(())`.
fn main() -> Result<()> {
    // Logical types: the inner list of non-nullable Int64, and the outer list of inner lists.
    let inner_type = DataType::LargeList(Box::new(Field::new("list", DataType::Int64, false)));
    let outer_type = DataType::LargeList(Box::new(Field::new("list", inner_type.clone(), false)));
    let schema = Schema::from(vec![Field::new("test", outer_type.clone(), false)]);

    let rows: Vec<Vec<i64>> = vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8, 9]];

    // Outer offsets: a single outer entry spanning every inner list.
    let outer_offsets: Vec<i64> = vec![0, rows.len() as i64];

    // Inner offsets: running total of inner-list lengths, starting at 0.
    let mut inner_offsets: Vec<i64> = vec![0];
    let mut running_len: usize = 0;
    inner_offsets.extend(rows.iter().map(|row| {
        running_len += row.len();
        running_len as i64
    }));

    // Flattened leaf values in row order.
    let flat_values: Vec<i64> = rows.into_iter().flatten().collect();

    let leaf_array = Arc::new(PrimitiveArray::<i64>::from_data(
        DataType::Int64,
        flat_values.into(),
        None,
    ));
    let inner_array = Arc::new(LargeListArray::from_data(
        inner_type,
        inner_offsets.into(),
        leaf_array,
        None,
    ));
    let outer_array = Arc::new(LargeListArray::from_data(
        outer_type,
        outer_offsets.into(),
        inner_array,
        None,
    ));

    let columns = Chunk::new(vec![outer_array as Arc<dyn Array>]);

    match write_batch("test.parquet", schema, columns) {
        Err(e) => println!("write err: {:#?}", e),
        Ok(_) => println!("write ok"),
    }

    Ok(())
}

output

write err: ExternalFormat(
    "Invalid argument error: The GroupType {
        field_info: FieldInfo {
            name: \"list\",
            repetition: Required,
            id: None
        },
        logical_type: Some(List),
        converted_type: None,
        fields: [
            GroupType {
                field_info: FieldInfo {
                    name: \"list\",
                    repetition: Repeated,
                    id: None
                },
                logical_type: None,
                converted_type: None,
                fields: [
                    PrimitiveType(PrimitiveType {
                        field_info: FieldInfo {
                            name: \"list\",
                            repetition: Required,
                            id: None
                        },
                        logical_type: None,
                        converted_type: None,
                        physical_type: Int64
                    })
                ]
            }
        ]
    } is not a primitive type but it is trying to describe a primitive array",
)
@jorgecarleitao
Copy link
Owner

Writing structs and nested lists to Parquet is not yet supported. I am working on this, though :)

@jorgecarleitao
Copy link
Owner

This is fixed by #1007 - I get write ok on that branch :)

@b41sh
Copy link
Contributor Author

b41sh commented May 27, 2022

This is fixed by #1007 - I get write ok on that branch :)

Great job, thanks very much.

@jorgecarleitao jorgecarleitao added the bug Something isn't working label May 27, 2022
@jorgecarleitao jorgecarleitao added no-changelog Issues whose changes are covered by a PR and thus should not be shown in the changelog and removed no-changelog Issues whose changes are covered by a PR and thus should not be shown in the changelog labels Jun 5, 2022
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
bug Something isn't working
Projects
None yet
Development

Successfully merging a pull request may close this issue.

2 participants