From 6a9401aa2295d4c0fafa3006e6611d870d4f42cd Mon Sep 17 00:00:00 2001 From: Necati Demir Date: Tue, 11 Mar 2025 19:43:50 -0400 Subject: [PATCH 1/4] JSON Reader Faster Coercion of Primitives to String --- arrow-json/src/reader/string_array.rs | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/arrow-json/src/reader/string_array.rs b/arrow-json/src/reader/string_array.rs index 03d07ad8c8b3..475d7acdc2e2 100644 --- a/arrow-json/src/reader/string_array.rs +++ b/arrow-json/src/reader/string_array.rs @@ -20,6 +20,7 @@ use arrow_array::{Array, GenericStringArray, OffsetSizeTrait}; use arrow_data::ArrayData; use arrow_schema::ArrowError; use std::marker::PhantomData; +use std::io::Write; use crate::reader::tape::{Tape, TapeElement}; use crate::reader::ArrayDecoder; @@ -30,6 +31,7 @@ const FALSE: &str = "false"; pub struct StringArrayDecoder { coerce_primitive: bool, phantom: PhantomData, + number_buffer: Vec, } impl StringArrayDecoder { @@ -37,8 +39,16 @@ impl StringArrayDecoder { Self { coerce_primitive, phantom: Default::default(), + number_buffer: Vec::with_capacity(32), } } + + fn write_number(&mut self, n: T) -> &str { + self.number_buffer.clear(); + write!(&mut self.number_buffer, "{}", n).unwrap(); + // SAFETY: We just wrote valid UTF-8 using write! macro + unsafe { std::str::from_utf8_unchecked(&self.number_buffer) } + } } impl ArrayDecoder for StringArrayDecoder { @@ -103,20 +113,24 @@ impl ArrayDecoder for StringArrayDecoder { TapeElement::I64(high) if coerce_primitive => match tape.get(p + 1) { TapeElement::I32(low) => { let val = ((high as i64) << 32) | (low as u32) as i64; - builder.append_value(val.to_string()); + let s = self.write_number(val); + builder.append_value(s); } _ => unreachable!(), }, TapeElement::I32(n) if coerce_primitive => { - builder.append_value(n.to_string()); + let s = self.write_number(n); + builder.append_value(s); } TapeElement::F32(n) if coerce_primitive => { - builder.append_value(n.to_string()); + let s = self.write_number(n); + builder.append_value(s); } TapeElement::F64(high) if coerce_primitive => match tape.get(p + 1) { TapeElement::F32(low) => { let val = f64::from_bits(((high as u64) << 32) | low as u64); - builder.append_value(val.to_string()); + let s = self.write_number(val); + builder.append_value(s); } _ => unreachable!(), }, From 900ed2e064a9fe3ac4cc53407a4582672ea6d773 Mon Sep 17 00:00:00 2001 From: Necati Demir Date: Tue, 11 Mar 2025 20:12:29 -0400 Subject: [PATCH 2/4] added comment about safety --- arrow-json/src/reader/string_array.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arrow-json/src/reader/string_array.rs b/arrow-json/src/reader/string_array.rs index 475d7acdc2e2..cf19f32cd2cd 100644 --- a/arrow-json/src/reader/string_array.rs +++ b/arrow-json/src/reader/string_array.rs @@ -46,7 +46,8 @@ impl StringArrayDecoder { fn write_number(&mut self, n: T) -> &str { self.number_buffer.clear(); write!(&mut self.number_buffer, "{}", n).unwrap(); - // SAFETY: We just wrote valid UTF-8 using write! macro + // SAFETY: We only write ASCII characters (digits, signs, decimal points, + // exponent symbols) into `number_buffer`, which are guaranteed valid UTF-8. unsafe { std::str::from_utf8_unchecked(&self.number_buffer) } } } From ea3e7aed22d43054c4e43fbb8f3b2983c356c4ba Mon Sep 17 00:00:00 2001 From: Necati Demir Date: Tue, 11 Mar 2025 22:27:03 -0400 Subject: [PATCH 3/4] call write_number when TapeElement::Number --- arrow-json/src/reader/string_array.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arrow-json/src/reader/string_array.rs b/arrow-json/src/reader/string_array.rs index cf19f32cd2cd..7588a1ee157a 100644 --- a/arrow-json/src/reader/string_array.rs +++ b/arrow-json/src/reader/string_array.rs @@ -109,7 +109,8 @@ impl ArrayDecoder for StringArrayDecoder { builder.append_value(FALSE); } TapeElement::Number(idx) if coerce_primitive => { - builder.append_value(tape.get_string(idx)); + let s = self.write_number(tape.get_string(idx)); + builder.append_value(s); } TapeElement::I64(high) if coerce_primitive => match tape.get(p + 1) { TapeElement::I32(low) => { From 917bd7a92195eba568775654738799ddc1411886 Mon Sep 17 00:00:00 2001 From: Necati Demir Date: Tue, 11 Mar 2025 22:32:23 -0400 Subject: [PATCH 4/4] Revert "call write_number when TapeElement::Number" This reverts commit ea3e7aed22d43054c4e43fbb8f3b2983c356c4ba. --- arrow-json/src/reader/string_array.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arrow-json/src/reader/string_array.rs b/arrow-json/src/reader/string_array.rs index 7588a1ee157a..cf19f32cd2cd 100644 --- a/arrow-json/src/reader/string_array.rs +++ b/arrow-json/src/reader/string_array.rs @@ -109,8 +109,7 @@ impl ArrayDecoder for StringArrayDecoder { builder.append_value(FALSE); } TapeElement::Number(idx) if coerce_primitive => { - let s = self.write_number(tape.get_string(idx)); - builder.append_value(s); + builder.append_value(tape.get_string(idx)); } TapeElement::I64(high) if coerce_primitive => match tape.get(p + 1) { TapeElement::I32(low) => {