-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support null values in Avro string columns #6307
Changes from 1 commit
dcd6c64
c7e1cee
3488e22
bbf8d4b
33d8098
84b1c1f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -96,6 +96,7 @@ mod tests { | |
use crate::datasource::file_format::test_util::scan_format; | ||
use crate::physical_plan::collect; | ||
use crate::prelude::{SessionConfig, SessionContext}; | ||
use arrow::array::{as_string_array, Array}; | ||
use datafusion_common::cast::{ | ||
as_binary_array, as_boolean_array, as_float32_array, as_float64_array, | ||
as_int32_array, as_timestamp_microsecond_array, | ||
|
@@ -221,6 +222,27 @@ mod tests { | |
Ok(()) | ||
} | ||
|
||
/// Null-handling test for a boolean column of `alltypes_nulls_plain.avro`. | ||
/// | ||
/// Passes projection `[2]` to `get_exec`, collects the resulting batches, and asserts | ||
/// that there is exactly one batch with one column and one row whose value is null. | ||
#[tokio::test] | ||
async fn read_null_bool_alltypes_plain_avro() -> Result<()> { | ||
let session_ctx = SessionContext::new(); | ||
let state = session_ctx.state(); | ||
let task_ctx = state.task_ctx(); | ||
// Column index 2 is the one later cast to a boolean array below. | ||
let projection = Some(vec![2]); | ||
let exec = | ||
get_exec(&state, "alltypes_nulls_plain.avro", projection, None).await?; | ||
|
||
let batches = collect(exec, task_ctx).await?; | ||
assert_eq!(batches.len(), 1); | ||
assert_eq!(1, batches[0].num_columns()); | ||
assert_eq!(1, batches[0].num_rows()); | ||
|
||
// The cast returns a Result, so a column of a different type fails the test via `?`. | ||
let array = as_boolean_array(batches[0].column(0))?; | ||
|
||
// Checks nullness directly on the array for the single row. | ||
assert!(array.is_null(0)); | ||
|
||
Ok(()) | ||
} | ||
|
||
#[tokio::test] | ||
async fn read_i32_alltypes_plain_avro() -> Result<()> { | ||
let session_ctx = SessionContext::new(); | ||
|
@@ -245,6 +267,27 @@ mod tests { | |
Ok(()) | ||
} | ||
|
||
/// Null-handling test for an `i32` column of `alltypes_nulls_plain.avro`. | ||
/// | ||
/// Passes projection `[1]` to `get_exec`, collects the resulting batches, and asserts | ||
/// that there is exactly one batch with one column and one row whose value is null. | ||
#[tokio::test] | ||
async fn read_null_i32_alltypes_plain_avro() -> Result<()> { | ||
let session_ctx = SessionContext::new(); | ||
let state = session_ctx.state(); | ||
let task_ctx = state.task_ctx(); | ||
// Column index 1 is the one later cast to an Int32 array below. | ||
let projection = Some(vec![1]); | ||
let exec = | ||
get_exec(&state, "alltypes_nulls_plain.avro", projection, None).await?; | ||
|
||
let batches = collect(exec, task_ctx).await?; | ||
assert_eq!(batches.len(), 1); | ||
assert_eq!(1, batches[0].num_columns()); | ||
assert_eq!(1, batches[0].num_rows()); | ||
|
||
// The cast returns a Result, so a column of a different type fails the test via `?`. | ||
let array = as_int32_array(batches[0].column(0))?; | ||
|
||
// Checks nullness directly on the array for the single row. | ||
assert!(array.is_null(0)); | ||
|
||
Ok(()) | ||
} | ||
|
||
#[tokio::test] | ||
async fn read_i96_alltypes_plain_avro() -> Result<()> { | ||
let session_ctx = SessionContext::new(); | ||
|
@@ -350,6 +393,48 @@ mod tests { | |
Ok(()) | ||
} | ||
|
||
#[tokio::test] | ||
async fn read_null_binary_alltypes_plain_avro() -> Result<()> { | ||
let session_ctx = SessionContext::new(); | ||
let state = session_ctx.state(); | ||
let task_ctx = state.task_ctx(); | ||
let projection = Some(vec![6]); | ||
let exec = | ||
get_exec(&state, "alltypes_nulls_plain.avro", projection, None).await?; | ||
|
||
let batches = collect(exec, task_ctx).await?; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It might also be worth checking out the https://docs.rs/datafusion/latest/datafusion/macro.assert_batches_eq.html macro to verify the rows / columns in a more maintainable way. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I tried that as well; however, since we're explicitly checking for null values, the expected value would be something like
... making it hard to differentiate between an empty string and null, so I opted to explicitly test via `is_null`. |
||
assert_eq!(batches.len(), 1); | ||
assert_eq!(1, batches[0].num_columns()); | ||
assert_eq!(1, batches[0].num_rows()); | ||
|
||
let array = as_binary_array(batches[0].column(0))?; | ||
|
||
assert!(array.is_null(0)); | ||
|
||
Ok(()) | ||
} | ||
|
||
/// Null-handling test for a string column of `alltypes_nulls_plain.avro`. | ||
/// | ||
/// Passes projection `[0]` to `get_exec`, collects the resulting batches, and asserts | ||
/// that there is exactly one batch with one column and one row whose value is null. | ||
#[tokio::test] | ||
async fn read_null_string_alltypes_plain_avro() -> Result<()> { | ||
let session_ctx = SessionContext::new(); | ||
let state = session_ctx.state(); | ||
let task_ctx = state.task_ctx(); | ||
// Column index 0 is the one later cast to a string array below. | ||
let projection = Some(vec![0]); | ||
let exec = | ||
get_exec(&state, "alltypes_nulls_plain.avro", projection, None).await?; | ||
|
||
let batches = collect(exec, task_ctx).await?; | ||
assert_eq!(batches.len(), 1); | ||
assert_eq!(1, batches[0].num_columns()); | ||
assert_eq!(1, batches[0].num_rows()); | ||
|
||
// NOTE(review): unlike the sibling tests, this cast is not followed by `?`; it uses the | ||
// `as_string_array` imported from `arrow::array`, which presumably panics on a type | ||
// mismatch instead of returning an error - confirm against the arrow docs. | ||
let array = as_string_array(batches[0].column(0)); | ||
|
||
// Checks nullness directly, so a null is not confused with an empty string. | ||
assert!(array.is_null(0)); | ||
|
||
Ok(()) | ||
} | ||
|
||
async fn get_exec( | ||
state: &SessionState, | ||
file_name: &str, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks reasonable to me. 👍