// Minimal example for Arrow CSV parsing bug
// Only crate dependency is Arrow - tested against 43.0.0 and 46.0.0

use std::{io::Cursor, sync::Arc};

use arrow::{
    csv::reader::ReaderBuilder,
    datatypes::{DataType, Field, Schema},
};

fn main() {
    // Simple three-column schema
    let schema = Schema::new(vec![
        Field::new("field 1", DataType::Utf8, false),
        Field::new("string list", DataType::Utf8, false),
        Field::new("field 3", DataType::Utf8, false),
    ]);

    // CSV sample - the first record is OK, but the second record doesn't quote
    // the string list, so it ends up with too many columns when parsed.
    let sample = String::from(
        "something,\"this,is,a,list\",something\nsomething,this,is,another,list,whoops\n",
    )
    .into_bytes();

    // Build a CSV reader that reads the sample using the provided schema.
    let reader = ReaderBuilder::new(Arc::new(schema))
        .build(Cursor::new(sample))
        .unwrap();

    // The reader errors on row 2 because it has too many columns and doesn't
    // fit the schema. The reader doesn't move on to the next row after the
    // error; it reads in more bytes and keeps adding them to the current line.
    // This eventually causes an out-of-memory error; the reader never exits
    // because it never reaches end of stream.
    //
    // Uncomment the println! statement to show the CSV error. (Leaving the
    // println! commented out demonstrates the OOM quicker.)
    for batch in reader {
        match batch {
            Ok(next) => println!("Got a batch of {} records", next.num_rows()),
            Err(_error) => {
                //println!("{:?}", _error);
                continue;
            }
        }
    }
}
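
// Not part of the original repro: a hedged, caller-side mitigation sketch.
// Because the reader keeps returning errors for the same row without ever
// advancing, one way to avoid the runaway memory growth is to stop iterating
// on the first error instead of continuing. This assumes a partial read is
// acceptable for the caller and does not fix the underlying reader behaviour;
// substituting it for the loop in main() turns the OOM into a clean exit
// after the error is reported.
//
// for batch in reader {
//     match batch {
//         Ok(next) => println!("Got a batch of {} records", next.num_rows()),
//         Err(error) => {
//             eprintln!("stopping after CSV error: {error:?}");
//             break;
//         }
//     }
// }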