From d214dedd4e2952ed7f76eca02f35f9edde566c92 Mon Sep 17 00:00:00 2001
From: xxchan
Date: Sun, 21 Jul 2024 14:37:41 +0800
Subject: [PATCH] fix: use avro fixed to represent decimal

fix #144

Signed-off-by: xxchan
---
 crates/iceberg/src/avro/schema.rs | 66 ++++++++++++++++++++++++++++++-
 1 file changed, 65 insertions(+), 1 deletion(-)

diff --git a/crates/iceberg/src/avro/schema.rs b/crates/iceberg/src/avro/schema.rs
index 11d000cc5..b30ea5d8c 100644
--- a/crates/iceberg/src/avro/schema.rs
+++ b/crates/iceberg/src/avro/schema.rs
@@ -258,13 +258,50 @@ pub(crate) fn avro_fixed_schema(len: usize, logical_type: Option<&str>) -> Resul
 }
 
 pub(crate) fn avro_decimal_schema(precision: usize, scale: usize) -> Result<AvroSchema> {
+    // Avro decimal logical type annotates Avro bytes _or_ fixed types.
+    // https://avro.apache.org/docs/1.11.1/specification/_print/#decimal
+    // Iceberg spec: Stored as _fixed_ using the minimum number of bytes for the given precision.
+    // https://iceberg.apache.org/spec/#avro
     Ok(AvroSchema::Decimal(DecimalSchema {
         precision,
         scale,
-        inner: Box::new(AvroSchema::Bytes),
+        inner: Box::new(AvroSchema::Fixed(FixedSchema {
+            // Name is not restricted by the spec.
+            // Refer to iceberg-python https://github.com/apache/iceberg-python/blob/d8bc1ca9af7957ce4d4db99a52c701ac75db7688/pyiceberg/utils/schema_conversion.py#L574-L582
+            // `decimal_{precision}_{scale}` is always a valid Avro name, so `unwrap` cannot fail.
+            name: Name::new(&format!("decimal_{precision}_{scale}")).unwrap(),
+            aliases: None,
+            doc: None,
+            size: decimal_required_bytes(precision)?,
+            attributes: Default::default(),
+        })),
     }))
 }
 
+/// `REQUIRED_LENGTH[precision]` is the minimum number of bytes required to store the unscaled
+/// value of a decimal with the given precision as a two's-complement signed integer.
+///
+/// See the `calculate_decimal_required_bytes` test for how it's calculated.
+const REQUIRED_LENGTH: [usize; 39] = [
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9, 9, 9, 10, 10, 11, 11, 11, 12, 12,
+    13, 13, 13, 14, 14, 15, 15, 16, 16, 16,
+];
+
+/// Returns the minimum number of bytes required to store a decimal with the given precision.
+///
+/// # Errors
+///
+/// Returns an `ErrorKind::DataInvalid` error if `precision` is outside of `(0, 38]`.
+fn decimal_required_bytes(precision: usize) -> Result<usize> {
+    if precision == 0 || precision > 38 {
+        return Err(Error::new(
+            ErrorKind::DataInvalid,
+            format!("Unsupported precision, outside of (0, 38]: {precision:?}"),
+        ));
+    }
+    Ok(REQUIRED_LENGTH[precision])
+}
+
 fn avro_optional(avro_schema: AvroSchema) -> Result<AvroSchema> {
     Ok(AvroSchema::Union(UnionSchema::new(vec![
         AvroSchema::Null,
@@ -955,4 +992,31 @@ mod tests {
             converter.primitive(&AvroSchema::Date).unwrap().unwrap()
         );
     }
+
+    #[test]
+    fn calculate_decimal_required_bytes() {
+        // max_precision[n] is the max precision that can be represented by n bytes.
+        // n bytes can represent the int range [-2^(8n-1), 2^(8n-1)-1], and the largest unscaled
+        // value of a decimal with precision p is 10^p - 1,
+        // so we need to ensure 10^(max_precision[n]) - 1 <= 2^(8n-1) - 1.
+        // For 0 bytes the logarithm is negative and the cast to usize saturates to 0.
+        let max_precision = (0..24)
+            .map(|num_bytes| ((8 * num_bytes - 1) as f64).exp2().log10().floor() as usize)
+            .collect::<Vec<_>>();
+
+        let required_length = (0..=38)
+            .map(|precision| {
+                // find the first index in max_precision with value >= precision
+                (0..24)
+                    .find(|num_bytes| precision <= max_precision[*num_bytes])
+                    .unwrap()
+            })
+            .collect::<Vec<_>>();
+        itertools::assert_equal(REQUIRED_LENGTH, required_length);
+
+        // Well-known sizes: decimal(9) fits in 4 bytes, decimal(18) in 8 and decimal(38) in 16.
+        assert_eq!(REQUIRED_LENGTH[9], 4);
+        assert_eq!(REQUIRED_LENGTH[18], 8);
+        assert_eq!(REQUIRED_LENGTH[38], 16);
+    }
 }