Skip to content

Commit 038b790

Browse files
authored
Merge branch 'main' into iceberg_partition_test
2 parents 8e0a702 + bcf28b4 commit 038b790

File tree

11 files changed

+526
-13
lines changed

11 files changed

+526
-13
lines changed

.github/workflows/release_python.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ jobs:
6666
- { os: ubuntu-latest, target: "armv7l" }
6767
steps:
6868
- uses: actions/checkout@v4
69-
- uses: actions/setup-python@v4
69+
- uses: actions/setup-python@v5
7070
with:
7171
python-version: 3.9
7272
- name: Setup Rust toolchain

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ members = [
2323
"crates/iceberg",
2424
"crates/integration_tests",
2525
"crates/integrations/*",
26+
"crates/puffin",
2627
"crates/test_utils",
2728
]
2829
exclude = ["bindings/python"]
@@ -98,3 +99,4 @@ uuid = { version = "1.6.1", features = ["v7"] }
9899
volo-thrift = "0.10"
99100
hive_metastore = "0.1"
100101
tera = "1"
102+
zstd = "0.13.2"

crates/iceberg/src/arrow/schema.rs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -723,10 +723,21 @@ macro_rules! get_parquet_stat_as_datum {
723723
let Some(bytes) = stats.[<$limit_type _bytes_opt>]() else {
724724
return Ok(None);
725725
};
726-
727726
Some(Datum::new(
728727
primitive_type.clone(),
729-
PrimitiveLiteral::Int128(i128::from_le_bytes(bytes.try_into()?)),
728+
PrimitiveLiteral::Int128(i128::from_be_bytes(bytes.try_into()?)),
729+
))
730+
}
731+
(PrimitiveType::Decimal {
732+
precision: _,
733+
scale: _,
734+
}, Statistics::FixedLenByteArray(stats)) => {
735+
let Some(bytes) = stats.[<$limit_type _bytes_opt>]() else {
736+
return Ok(None);
737+
};
738+
Some(Datum::new(
739+
primitive_type.clone(),
740+
PrimitiveLiteral::Int128(i128::from_be_bytes(bytes.try_into()?)),
730741
))
731742
}
732743
(

crates/iceberg/src/io/storage_s3.rs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ pub const S3_ASSUME_ROLE_ARN: &str = "client.assume-role.arn";
5858
pub const S3_ASSUME_ROLE_EXTERNAL_ID: &str = "client.assume-role.external-id";
5959
/// Optional session name used to assume an IAM role.
6060
pub const S3_ASSUME_ROLE_SESSION_NAME: &str = "client.assume-role.session-name";
61+
/// Option to skip signing request (e.g. for public buckets/folders)
62+
pub const S3_ALLOW_ANONYMOUS: &str = "s3.allow-anonymous";
63+
/// Option to skip loading the credential from EC2 metadata (typically used in conjunction with
64+
/// `S3_ALLOW_ANONYMOUS`)
65+
pub const S3_DISABLE_EC2_METADATA: &str = "s3.disable-ec2-metadata";
6166

6267
/// Parse iceberg props to s3 config.
6368
pub(crate) fn s3_config_parse(mut m: HashMap<String, String>) -> Result<S3Config> {
@@ -81,7 +86,7 @@ pub(crate) fn s3_config_parse(mut m: HashMap<String, String>) -> Result<S3Config
8186
cfg.region = Some(region);
8287
};
8388
if let Some(path_style_access) = m.remove(S3_PATH_STYLE_ACCESS) {
84-
if ["true", "True", "1"].contains(&path_style_access.as_str()) {
89+
if ["true", "t", "1"].contains(&path_style_access.to_lowercase().as_str()) {
8590
cfg.enable_virtual_host_style = true;
8691
}
8792
};
@@ -126,6 +131,17 @@ pub(crate) fn s3_config_parse(mut m: HashMap<String, String>) -> Result<S3Config
126131
}
127132
};
128133

134+
if let Some(allow_anonymous) = m.remove(S3_ALLOW_ANONYMOUS) {
135+
if ["true", "t", "1", "on"].contains(&allow_anonymous.to_lowercase().as_str()) {
136+
cfg.allow_anonymous = true;
137+
}
138+
}
139+
if let Some(disable_ec2_metadata) = m.remove(S3_DISABLE_EC2_METADATA) {
140+
if ["true", "t", "1", "on"].contains(&disable_ec2_metadata.to_lowercase().as_str()) {
141+
cfg.disable_ec2_metadata = true;
142+
}
143+
};
144+
129145
Ok(cfg)
130146
}
131147

crates/iceberg/src/spec/table_metadata.rs

Lines changed: 113 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1010,10 +1010,14 @@ pub(super) mod _serde {
10101010
.collect(),
10111011
default_spec_id: v.default_spec.spec_id(),
10121012
last_partition_id: v.last_partition_id,
1013-
properties: Some(v.properties),
1014-
current_snapshot_id: v.current_snapshot_id.or(Some(-1)),
1013+
properties: if v.properties.is_empty() {
1014+
None
1015+
} else {
1016+
Some(v.properties)
1017+
},
1018+
current_snapshot_id: v.current_snapshot_id,
10151019
snapshots: if v.snapshots.is_empty() {
1016-
Some(vec![])
1020+
None
10171021
} else {
10181022
Some(
10191023
v.snapshots
@@ -1091,7 +1095,7 @@ pub(super) mod _serde {
10911095
} else {
10921096
Some(v.properties)
10931097
},
1094-
current_snapshot_id: v.current_snapshot_id.or(Some(-1)),
1098+
current_snapshot_id: v.current_snapshot_id,
10951099
snapshots: if v.snapshots.is_empty() {
10961100
None
10971101
} else {
@@ -1279,6 +1283,7 @@ mod tests {
12791283
"timestamp-ms": 1515100
12801284
}
12811285
],
1286+
"refs": {},
12821287
"sort-orders": [
12831288
{
12841289
"order-id": 0,
@@ -1349,7 +1354,11 @@ mod tests {
13491354
refs: HashMap::new(),
13501355
};
13511356

1357+
let expected_json_value = serde_json::to_value(&expected).unwrap();
13521358
check_table_metadata_serde(data, expected);
1359+
1360+
let json_value = serde_json::from_str::<serde_json::Value>(data).unwrap();
1361+
assert_eq!(json_value, expected_json_value);
13531362
}
13541363

13551364
#[test]
@@ -1519,6 +1528,106 @@ mod tests {
15191528
check_table_metadata_serde(data, expected);
15201529
}
15211530

1531+
#[test]
1532+
fn test_table_data_v2_no_snapshots() {
1533+
let data = r#"
1534+
{
1535+
"format-version" : 2,
1536+
"table-uuid": "fb072c92-a02b-11e9-ae9c-1bb7bc9eca94",
1537+
"location": "s3://b/wh/data.db/table",
1538+
"last-sequence-number" : 1,
1539+
"last-updated-ms": 1515100955770,
1540+
"last-column-id": 1,
1541+
"schemas": [
1542+
{
1543+
"schema-id" : 1,
1544+
"type" : "struct",
1545+
"fields" :[
1546+
{
1547+
"id": 1,
1548+
"name": "struct_name",
1549+
"required": true,
1550+
"type": "fixed[1]"
1551+
}
1552+
]
1553+
}
1554+
],
1555+
"current-schema-id" : 1,
1556+
"partition-specs": [
1557+
{
1558+
"spec-id": 0,
1559+
"fields": []
1560+
}
1561+
],
1562+
"refs": {},
1563+
"default-spec-id": 0,
1564+
"last-partition-id": 1000,
1565+
"metadata-log": [
1566+
{
1567+
"metadata-file": "s3://bucket/.../v1.json",
1568+
"timestamp-ms": 1515100
1569+
}
1570+
],
1571+
"sort-orders": [
1572+
{
1573+
"order-id": 0,
1574+
"fields": []
1575+
}
1576+
],
1577+
"default-sort-order-id": 0
1578+
}
1579+
"#;
1580+
1581+
let schema = Schema::builder()
1582+
.with_schema_id(1)
1583+
.with_fields(vec![Arc::new(NestedField::required(
1584+
1,
1585+
"struct_name",
1586+
Type::Primitive(PrimitiveType::Fixed(1)),
1587+
))])
1588+
.build()
1589+
.unwrap();
1590+
1591+
let partition_spec = BoundPartitionSpec::builder(schema.clone())
1592+
.with_spec_id(0)
1593+
.build()
1594+
.unwrap();
1595+
1596+
let expected = TableMetadata {
1597+
format_version: FormatVersion::V2,
1598+
table_uuid: Uuid::parse_str("fb072c92-a02b-11e9-ae9c-1bb7bc9eca94").unwrap(),
1599+
location: "s3://b/wh/data.db/table".to_string(),
1600+
last_updated_ms: 1515100955770,
1601+
last_column_id: 1,
1602+
schemas: HashMap::from_iter(vec![(1, Arc::new(schema))]),
1603+
current_schema_id: 1,
1604+
partition_specs: HashMap::from_iter(vec![(
1605+
0,
1606+
partition_spec.clone().into_schemaless().into(),
1607+
)]),
1608+
default_spec: partition_spec.into(),
1609+
last_partition_id: 1000,
1610+
default_sort_order_id: 0,
1611+
sort_orders: HashMap::from_iter(vec![(0, SortOrder::unsorted_order().into())]),
1612+
snapshots: HashMap::default(),
1613+
current_snapshot_id: None,
1614+
last_sequence_number: 1,
1615+
properties: HashMap::new(),
1616+
snapshot_log: Vec::new(),
1617+
metadata_log: vec![MetadataLog {
1618+
metadata_file: "s3://bucket/.../v1.json".to_string(),
1619+
timestamp_ms: 1515100,
1620+
}],
1621+
refs: HashMap::new(),
1622+
};
1623+
1624+
let expected_json_value = serde_json::to_value(&expected).unwrap();
1625+
check_table_metadata_serde(data, expected);
1626+
1627+
let json_value = serde_json::from_str::<serde_json::Value>(data).unwrap();
1628+
assert_eq!(json_value, expected_json_value);
1629+
}
1630+
15221631
#[test]
15231632
fn test_current_snapshot_id_must_match_main_branch() {
15241633
let data = r#"

crates/iceberg/src/spec/table_metadata_builder.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,10 +141,10 @@ impl TableMetadataBuilder {
141141
#[must_use]
142142
pub fn new_from_metadata(
143143
previous: TableMetadata,
144-
previous_file_location: Option<String>,
144+
current_file_location: Option<String>,
145145
) -> Self {
146146
Self {
147-
previous_history_entry: previous_file_location.map(|l| MetadataLog {
147+
previous_history_entry: current_file_location.map(|l| MetadataLog {
148148
metadata_file: l,
149149
timestamp_ms: previous.last_updated_ms,
150150
}),
@@ -1220,6 +1220,7 @@ mod tests {
12201220
assert_eq!(metadata.last_partition_id, 1000);
12211221
assert_eq!(metadata.last_column_id, 3);
12221222
assert_eq!(metadata.snapshots.len(), 0);
1223+
assert_eq!(metadata.current_snapshot_id, None);
12231224
assert_eq!(metadata.refs.len(), 0);
12241225
assert_eq!(metadata.properties.len(), 0);
12251226
assert_eq!(metadata.metadata_log.len(), 0);
@@ -1268,6 +1269,7 @@ mod tests {
12681269
assert_eq!(metadata.last_partition_id, UNPARTITIONED_LAST_ASSIGNED_ID);
12691270
assert_eq!(metadata.last_column_id, 0);
12701271
assert_eq!(metadata.snapshots.len(), 0);
1272+
assert_eq!(metadata.current_snapshot_id, None);
12711273
assert_eq!(metadata.refs.len(), 0);
12721274
assert_eq!(metadata.properties.len(), 0);
12731275
assert_eq!(metadata.metadata_log.len(), 0);

0 commit comments

Comments
 (0)