Skip to content

Commit

Permalink
fix: use avro fixed to represent decimal
Browse files Browse the repository at this point in the history
fix #144

Signed-off-by: xxchan <xxchan22f@gmail.com>
  • Loading branch information
xxchan committed Jul 21, 2024
1 parent f30d872 commit d214ded
Showing 1 changed file with 51 additions and 1 deletion.
52 changes: 51 additions & 1 deletion crates/iceberg/src/avro/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,13 +258,43 @@ pub(crate) fn avro_fixed_schema(len: usize, logical_type: Option<&str>) -> Resul
}

/// Builds the Avro schema for a decimal with the given `precision` and `scale`.
///
/// The result is an Avro `decimal` logical type whose underlying type is a `fixed` named
/// `decimal_{precision}_{scale}`, sized by `decimal_required_bytes(precision)`.
///
/// # Errors
///
/// Propagates the error returned by `decimal_required_bytes` when it rejects `precision`.
pub(crate) fn avro_decimal_schema(precision: usize, scale: usize) -> Result<AvroSchema> {
    // Avro decimal logical type annotates Avro bytes _or_ fixed types.
    // https://avro.apache.org/docs/1.11.1/specification/_print/#decimal
    // Iceberg spec: Stored as _fixed_ using the minimum number of bytes for the given precision.
    // https://iceberg.apache.org/spec/#avro
    Ok(AvroSchema::Decimal(DecimalSchema {
        precision,
        scale,
        // `inner` must be set exactly once: the decimal is backed by `fixed`, not `bytes`.
        inner: Box::new(AvroSchema::Fixed(FixedSchema {
            // Name is not restricted by the spec.
            // Refer to iceberg-python https://github.com/apache/iceberg-python/blob/d8bc1ca9af7957ce4d4db99a52c701ac75db7688/pyiceberg/utils/schema_conversion.py#L574-L582
            // NOTE(review): the generated name only contains ASCII letters, digits and
            // underscores, so `Name::new` is presumably infallible here -- confirm.
            name: Name::new(&format!("decimal_{precision}_{scale}")).unwrap(),
            aliases: None,
            doc: None,
            size: decimal_required_bytes(precision)?,
            attributes: Default::default(),
        })),
    }))
}

/// `REQUIRED_LENGTH[precision]` is the minimum number of bytes required to store a decimal with
/// the given precision as a two's-complement signed integer.
///
/// The largest unscaled value of a decimal with precision `p` is `10^p - 1`, and `n` bytes can
/// represent the range `[-2^(8n-1), 2^(8n-1) - 1]`, so each entry is the smallest `n` satisfying
/// `10^p - 1 <= 2^(8n-1) - 1`. For example, precision 2 (max 99) fits in 1 byte and precision 38
/// fits in 16 bytes. Index 0 is a placeholder and is not a valid precision.
const REQUIRED_LENGTH: [usize; 39] = [
    0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9, 9, 9, 10, 10, 11, 11, 11, 12, 12,
    13, 13, 13, 14, 14, 15, 15, 16, 16, 16,
];
/// Compute the number of bytes required to store a precision.
fn decimal_required_bytes(precision: usize) -> Result<usize> {
if precision == 0 || precision > 38 {
return Err(Error::new(
ErrorKind::DataInvalid,
format!("Unsupported precision, outside of (0, 38]: {precision:?}"),
));
}
Ok(REQUIRED_LENGTH[precision])
}

fn avro_optional(avro_schema: AvroSchema) -> Result<AvroSchema> {
Ok(AvroSchema::Union(UnionSchema::new(vec![
AvroSchema::Null,
Expand Down Expand Up @@ -955,4 +985,24 @@ mod tests {
converter.primitive(&AvroSchema::Date).unwrap().unwrap()
);
}

#[test]
fn calculate_decimal_required_bytes() {
    // n bytes of two's complement can represent the int range [-2^(8n-1), 2^(8n-1)-1].
    // `max_signed(n)` is the upper end of that range, computed exactly in i128
    // (no floating point, so there is no rounding near the boundaries).
    let max_signed = |num_bytes: u32| -> i128 {
        if num_bytes == 0 {
            0
        } else {
            i128::MAX >> (128 - 8 * num_bytes)
        }
    };

    let required_length = (0..=38u32)
        .map(|precision| {
            // The largest unscaled value with `precision` decimal digits is 10^precision - 1.
            let max_unscaled = 10i128.pow(precision) - 1;
            // Find the first byte count whose signed range can hold that value.
            (0..=16u32)
                .find(|&num_bytes| max_unscaled <= max_signed(num_bytes))
                .expect("16 bytes can hold every precision up to 38") as usize
        })
        .collect::<Vec<_>>();
    assert_eq!(REQUIRED_LENGTH.as_slice(), required_length.as_slice());
}
}

0 comments on commit d214ded

Please sign in to comment.