diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index c0e2640..66b03d0 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -14,21 +14,51 @@ jobs:
       JULIA_NUM_THREADS: 2
       # TODO iceberg_rust_ffi requires this and panics otherwise, we should fix that (test
       # bucket is publicly accessible).
-      AWS_ACCESS_KEY_ID: ""
+      AWS_ACCESS_KEY_ID: root
+      AWS_SECRET_ACCESS_KEY: password
+      AWS_ENDPOINT_URL: http://localhost:9000
+      AWS_REGION: us-east-1
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
-        version:
-          - '1.10'
-          - '1.11'
-        os:
-          - ubuntu-latest
-          - macOS-latest
-        arch:
-          - x64
+        include:
+          # Ubuntu on x64
+          - version: '1.10'
+            os: ubuntu-latest
+            arch: x64
+          - version: '1.11'
+            os: ubuntu-latest
+            arch: x64
     steps:
       - uses: actions/checkout@v3.5.0
+      - name: Initialize containers
+        uses: gacts/run-and-post-run@v1
+        with:
+          run: |
+            docker network create iceberg_net
+            docker run -d --name minio \
+              --network=iceberg_net -p 9000:9000 \
+              -e MINIO_ROOT_USER=root \
+              -e MINIO_ROOT_PASSWORD=password \
+              -e MINIO_DOMAIN=minio \
+              -v ${{ github.workspace }}/assets/tpch:/input minio/minio:latest server /data/
+            until (docker exec minio mc alias set minio http://localhost:9000 root password) do echo '... waiting ...' && sleep 1; done;
+            docker exec minio mc mb minio/warehouse
+            docker exec minio mc cp -r /input/tpch.sf01/ minio/warehouse/tpch.sf01/
+            docker run -d --name rest \
+              --network=iceberg_net \
+              -p 8181:8181 \
+              -v ${{ github.workspace }}/assets/rest:/tmp \
+              -e AWS_ACCESS_KEY_ID=root \
+              -e AWS_SECRET_ACCESS_KEY=password \
+              -e AWS_ENDPOINT_URL=http://minio:9000 \
+              -e AWS_REGION=us-east-1 \
+              -e CATALOG_S3_ENDPOINT=http://minio:9000 \
+              -e CATALOG_WAREHOUSE=s3://warehouse \
+              -e CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO apache/iceberg-rest-fixture
+          post:
+            docker stop rest minio && docker rm rest minio && docker network rm iceberg_net
       - uses: julia-actions/setup-julia@v1
         with:
           version: ${{ matrix.version }}
diff --git a/Manifest.toml b/Manifest.toml
index bbea1b7..d4f0422 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -2,7 +2,7 @@
 
 julia_version = "1.10.2"
 manifest_format = "2.0"
-project_hash = "99f2d569c75339b191aacd71e0f3e72c8ea9f4a5"
+project_hash = "4ebb16a8b4e4dee68434289ad64fac66757cf7e1"
 
 [[deps.ArgTools]]
 uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
@@ -26,6 +26,12 @@ uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
 [[deps.Base64]]
 uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
 
+[[deps.BenchmarkTools]]
+deps = ["Compat", "JSON", "Logging", "Printf", "Profile", "Statistics", "UUIDs"]
+git-tree-sha1 = "e38fbc49a620f5d0b660d7f543db1009fe0f8336"
+uuid = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+version = "1.6.0"
+
 [[deps.BitIntegers]]
 deps = ["Random"]
 git-tree-sha1 = "f98cfeaba814d9746617822032d50a31c9926604"
@@ -46,16 +52,18 @@ version = "0.8.6"
 
 [[deps.Compat]]
 deps = ["TOML", "UUIDs"]
-git-tree-sha1 = "0037835448781bb46feb39866934e243886d756a"
+git-tree-sha1 = "9d8a54ce4b17aa5bdce0ea5c34bc5e7c340d16ad"
 uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
-version = "4.18.0"
+version = "4.18.1"
+weakdeps = ["Dates", "LinearAlgebra"]
 
     [deps.Compat.extensions]
     CompatLinearAlgebraExt = "LinearAlgebra"
 
-    [deps.Compat.weakdeps]
-    Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
-    LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+[[deps.CompilerSupportLibraries_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
+version = "1.1.0+0"
 
 [[deps.ConcurrentUtilities]]
 deps = ["Serialization", "Sockets"]
@@ -100,18 +108,15
@@ deps = ["Random"] uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" [[deps.InlineStrings]] -git-tree-sha1 = "8594fac023c5ce1ef78260f24d1ad18b4327b420" +git-tree-sha1 = "8f3d257792a522b4601c24a577954b0a8cd7334d" uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" -version = "1.4.4" +version = "1.4.5" +weakdeps = ["ArrowTypes", "Parsers"] [deps.InlineStrings.extensions] ArrowTypesExt = "ArrowTypes" ParsersExt = "Parsers" - [deps.InlineStrings.weakdeps] - ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd" - Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" - [[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" @@ -127,6 +132,12 @@ git-tree-sha1 = "0533e564aae234aff59ab625543145446d8b6ec2" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" version = "1.7.1" +[[deps.JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "31e996f0a15c7b280ba9f76636b3ff9e2ae58c9a" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.4" + [[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" @@ -145,6 +156,10 @@ version = "1.11.0+1" [[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" +[[deps.LinearAlgebra]] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + [[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" @@ -180,17 +195,40 @@ version = "2023.1.10" uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" version = "1.2.0" +[[deps.OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.23+4" + +[[deps.OpenSSL_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "f19301ae653233bc88b1810ae908194f07f8db9d" +uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95" +version = "3.5.4+0" + [[deps.OrderedCollections]] git-tree-sha1 = "05868e21324cede2207c6f0f466b4bfef6d5e7ee" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" version = "1.8.1" +[[deps.Parsers]] +deps = ["Dates", "PrecompileTools", "UUIDs"] +git-tree-sha1 = "7d2f8f21da5db6a806faf7b9b292296da42b2810" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "2.8.3" + [[deps.PooledArrays]] deps = ["DataAPI", "Future"] git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3" uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" version = "1.4.3" +[[deps.PrecompileTools]] +deps = ["Preferences"] +git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f" +uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" +version = "1.2.1" + [[deps.Preferences]] deps = ["TOML"] git-tree-sha1 = "0f27480397253da18fe2c12a4ba4eb9eb208bf3d" @@ -201,6 +239,10 @@ version = "1.5.0" deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" +[[deps.Profile]] +deps = ["Printf"] +uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" + [[deps.Random]] deps = ["SHA"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -227,10 +269,25 @@ uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" [[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" +[[deps.SparseArrays]] +deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +version = "1.10.0" + +[[deps.Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +version = "1.10.0" + [[deps.StringViews]] -git-tree-sha1 = "ec4bf39f7d25db401bcab2f11d2929798c0578e5" +git-tree-sha1 = "d3c7a06c80622b8404b1105886c732abcb25cc2b" uuid = "354b36f9-a18e-4713-926e-db85100087ba" -version = "1.3.4" 
+version = "1.3.5" + +[[deps.SuiteSparse_jll]] +deps = ["Artifacts", "Libdl", "libblastrampoline_jll"] +uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" +version = "7.2.1+1" [[deps.TOML]] deps = ["Dates"] @@ -261,9 +318,9 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[deps.TimeZones]] deps = ["Artifacts", "Dates", "Downloads", "InlineStrings", "Mocking", "Printf", "Scratch", "TZJData", "Unicode", "p7zip_jll"] -git-tree-sha1 = "1f9a3f379a2ce2a213a0f606895567a08a1a2d08" +git-tree-sha1 = "06f4f1f3e8ff09e42e59b043a747332e88e01aba" uuid = "f269a46b-ccf7-5d73-abea-4c690281aa53" -version = "1.22.0" +version = "1.22.1" [deps.TimeZones.extensions] TimeZonesRecipesBaseExt = "RecipesBase" @@ -295,10 +352,15 @@ uuid = "3161d3a3-bdf6-5164-811a-617609db77b4" version = "1.5.7+1" [[deps.iceberg_rust_ffi_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "f92cda6a47dc404e19e1df5e2f01915baeb0591a" +deps = ["Artifacts", "JLLWrappers", "Libdl", "OpenSSL_jll"] +git-tree-sha1 = "c0054a4811aa38d441839c919469d1262bbbd417" uuid = "6bd5c94f-693c-53e3-983d-a09fad412f22" -version = "0.1.0+0" +version = "0.2.0+0" + +[[deps.libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.8.0+1" [[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] diff --git a/Project.toml b/Project.toml index 4d3dc66..09a95b2 100644 --- a/Project.toml +++ b/Project.toml @@ -1,10 +1,11 @@ name = "RustyIceberg" uuid = "390bdf5b-b624-43dc-a846-0ef7a3405804" authors = ["Vukasin Stefanovic "] -version = "0.1.0" +version = "0.2.0" [deps] Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" iceberg_rust_ffi_jll = "6bd5c94f-693c-53e3-983d-a09fad412f22" diff --git a/assets/.gitignore b/assets/.gitignore new file mode 100644 index 0000000..c36868e --- /dev/null +++ b/assets/.gitignore @@ -0,0 +1,4 @@ +**/.minio* +rest/*.so +rest/*.so.lck +rest/hsperfdata* \ No newline at end of file diff --git a/assets/rest/iceberg_catalog.db b/assets/rest/iceberg_catalog.db new file mode 100644 index 0000000..7e76c37 Binary files /dev/null and b/assets/rest/iceberg_catalog.db differ diff --git a/assets/tpch/tpch.sf01/customer/data/data_customer-00000.parquet b/assets/tpch/tpch.sf01/customer/data/data_customer-00000.parquet new file mode 100644 index 0000000..fd08b91 Binary files /dev/null and b/assets/tpch/tpch.sf01/customer/data/data_customer-00000.parquet differ diff --git a/assets/tpch/tpch.sf01/customer/metadata/00000-462e1567-8801-45a5-978c-b3beb53e7a2e.metadata.json b/assets/tpch/tpch.sf01/customer/metadata/00000-462e1567-8801-45a5-978c-b3beb53e7a2e.metadata.json new file mode 100644 index 0000000..1e94fe8 --- /dev/null +++ b/assets/tpch/tpch.sf01/customer/metadata/00000-462e1567-8801-45a5-978c-b3beb53e7a2e.metadata.json @@ -0,0 +1 @@ 
+{"format-version":2,"table-uuid":"51667c9a-d3d1-4389-8461-d18626ab0e60","location":"s3://warehouse/tpch.sf01/customer","last-sequence-number":0,"last-updated-ms":1757415286085,"last-column-id":8,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"c_custkey","required":true,"type":"long"},{"id":2,"name":"c_name","required":false,"type":"string"},{"id":3,"name":"c_address","required":false,"type":"string"},{"id":4,"name":"c_nationkey","required":false,"type":"int"},{"id":5,"name":"c_phone","required":false,"type":"string"},{"id":6,"name":"c_acctbal","required":false,"type":"decimal(15, 2)"},{"id":7,"name":"c_mktsegment","required":false,"type":"string"},{"id":8,"name":"c_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":-1,"refs":{},"snapshots":[],"statistics":[],"partition-statistics":[],"snapshot-log":[],"metadata-log":[]} \ No newline at end of file diff --git a/assets/tpch/tpch.sf01/customer/metadata/00001-76f6e7e4-b34f-492f-b6a1-cc9f8c8f4975.metadata.json b/assets/tpch/tpch.sf01/customer/metadata/00001-76f6e7e4-b34f-492f-b6a1-cc9f8c8f4975.metadata.json new file mode 100644 index 0000000..eca9f07 --- /dev/null +++ b/assets/tpch/tpch.sf01/customer/metadata/00001-76f6e7e4-b34f-492f-b6a1-cc9f8c8f4975.metadata.json @@ -0,0 +1 @@ +{"format-version":2,"table-uuid":"51667c9a-d3d1-4389-8461-d18626ab0e60","location":"s3://warehouse/tpch.sf01/customer","last-sequence-number":1,"last-updated-ms":1757415286199,"last-column-id":8,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"c_custkey","required":true,"type":"long"},{"id":2,"name":"c_name","required":false,"type":"string"},{"id":3,"name":"c_address","required":false,"type":"string"},{"id":4,"name":"c_nationkey","required":false,"type":"int"},{"id":5,"name":"c_phone","required":false,"type":"string"},{"id":6,"name":"c_acctbal","required":false,"type":"decimal(15, 2)"},{"id":7,"name":"c_mktsegment","required":false,"type":"string"},{"id":8,"name":"c_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":3441867730092225551,"refs":{"main":{"snapshot-id":3441867730092225551,"type":"branch"}},"snapshots":[{"sequence-number":1,"snapshot-id":3441867730092225551,"timestamp-ms":1757415286199,"summary":{"operation":"append","total-files-size":"0","total-position-deletes":"0","total-data-files":"0","total-delete-files":"0","total-records":"0","total-equality-deletes":"0"},"manifest-list":"s3://warehouse/tpch.sf01/customer/metadata/snap-3441867730092225551-0-01992e1d-25b3-7111-9dfc-38f55f99f91b.avro","schema-id":0}],"statistics":[],"partition-statistics":[],"snapshot-log":[{"timestamp-ms":1757415286199,"snapshot-id":3441867730092225551}],"metadata-log":[{"timestamp-ms":1757415286085,"metadata-file":"s3://warehouse/tpch.sf01/customer/metadata/00000-462e1567-8801-45a5-978c-b3beb53e7a2e.metadata.json"}]} \ No newline at end of file 
diff --git a/assets/tpch/tpch.sf01/customer/metadata/01992e1d-25b3-7111-9dfc-38f55f99f91b-m0.avro b/assets/tpch/tpch.sf01/customer/metadata/01992e1d-25b3-7111-9dfc-38f55f99f91b-m0.avro new file mode 100644 index 0000000..49d01fc Binary files /dev/null and b/assets/tpch/tpch.sf01/customer/metadata/01992e1d-25b3-7111-9dfc-38f55f99f91b-m0.avro differ diff --git a/assets/tpch/tpch.sf01/customer/metadata/snap-3441867730092225551-0-01992e1d-25b3-7111-9dfc-38f55f99f91b.avro b/assets/tpch/tpch.sf01/customer/metadata/snap-3441867730092225551-0-01992e1d-25b3-7111-9dfc-38f55f99f91b.avro new file mode 100644 index 0000000..528f75f Binary files /dev/null and b/assets/tpch/tpch.sf01/customer/metadata/snap-3441867730092225551-0-01992e1d-25b3-7111-9dfc-38f55f99f91b.avro differ diff --git a/assets/tpch/tpch.sf01/lineitem/data/data_lineitem-00000.parquet b/assets/tpch/tpch.sf01/lineitem/data/data_lineitem-00000.parquet new file mode 100644 index 0000000..831e0ce Binary files /dev/null and b/assets/tpch/tpch.sf01/lineitem/data/data_lineitem-00000.parquet differ diff --git a/assets/tpch/tpch.sf01/lineitem/metadata/00000-8fbbff5c-f0a5-4b43-97b3-fb31d7565ad7.metadata.json b/assets/tpch/tpch.sf01/lineitem/metadata/00000-8fbbff5c-f0a5-4b43-97b3-fb31d7565ad7.metadata.json new file mode 100644 index 0000000..0cd35ef --- /dev/null +++ b/assets/tpch/tpch.sf01/lineitem/metadata/00000-8fbbff5c-f0a5-4b43-97b3-fb31d7565ad7.metadata.json @@ -0,0 +1 @@ +{"format-version":2,"table-uuid":"3526d88c-a26c-4cab-82f1-ac119e09266d","location":"s3://warehouse/tpch.sf01/lineitem","last-sequence-number":0,"last-updated-ms":1757415280577,"last-column-id":16,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"l_orderkey","required":true,"type":"long"},{"id":2,"name":"l_partkey","required":false,"type":"long"},{"id":3,"name":"l_suppkey","required":false,"type":"long"},{"id":4,"name":"l_linenumber","required":false,"type":"long"},{"id":5,"name":"l_quantity","required":false,"type":"decimal(15, 2)"},{"id":6,"name":"l_extendedprice","required":false,"type":"decimal(15, 2)"},{"id":7,"name":"l_discount","required":false,"type":"decimal(15, 2)"},{"id":8,"name":"l_tax","required":false,"type":"decimal(15, 2)"},{"id":9,"name":"l_returnflag","required":false,"type":"string"},{"id":10,"name":"l_linestatus","required":false,"type":"string"},{"id":11,"name":"l_shipdate","required":false,"type":"date"},{"id":12,"name":"l_commitdate","required":false,"type":"date"},{"id":13,"name":"l_receiptdate","required":false,"type":"date"},{"id":14,"name":"l_shipinstruct","required":false,"type":"string"},{"id":15,"name":"l_shipmode","required":false,"type":"string"},{"id":16,"name":"l_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":-1,"refs":{},"snapshots":[],"statistics":[],"partition-statistics":[],"snapshot-log":[],"metadata-log":[]} \ No newline at end of file diff --git a/assets/tpch/tpch.sf01/lineitem/metadata/00001-b7bc27e6-dd3c-4452-a9cc-5f975a9752f5.metadata.json b/assets/tpch/tpch.sf01/lineitem/metadata/00001-b7bc27e6-dd3c-4452-a9cc-5f975a9752f5.metadata.json new file mode 100644 index 0000000..73c1cd1 --- /dev/null +++ 
b/assets/tpch/tpch.sf01/lineitem/metadata/00001-b7bc27e6-dd3c-4452-a9cc-5f975a9752f5.metadata.json @@ -0,0 +1 @@ +{"format-version":2,"table-uuid":"3526d88c-a26c-4cab-82f1-ac119e09266d","location":"s3://warehouse/tpch.sf01/lineitem","last-sequence-number":1,"last-updated-ms":1757415285459,"last-column-id":16,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"l_orderkey","required":true,"type":"long"},{"id":2,"name":"l_partkey","required":false,"type":"long"},{"id":3,"name":"l_suppkey","required":false,"type":"long"},{"id":4,"name":"l_linenumber","required":false,"type":"long"},{"id":5,"name":"l_quantity","required":false,"type":"decimal(15, 2)"},{"id":6,"name":"l_extendedprice","required":false,"type":"decimal(15, 2)"},{"id":7,"name":"l_discount","required":false,"type":"decimal(15, 2)"},{"id":8,"name":"l_tax","required":false,"type":"decimal(15, 2)"},{"id":9,"name":"l_returnflag","required":false,"type":"string"},{"id":10,"name":"l_linestatus","required":false,"type":"string"},{"id":11,"name":"l_shipdate","required":false,"type":"date"},{"id":12,"name":"l_commitdate","required":false,"type":"date"},{"id":13,"name":"l_receiptdate","required":false,"type":"date"},{"id":14,"name":"l_shipinstruct","required":false,"type":"string"},{"id":15,"name":"l_shipmode","required":false,"type":"string"},{"id":16,"name":"l_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":5219799779815904705,"refs":{"main":{"snapshot-id":5219799779815904705,"type":"branch"}},"snapshots":[{"sequence-number":1,"snapshot-id":5219799779815904705,"timestamp-ms":1757415285459,"summary":{"operation":"append","total-data-files":"0","total-files-size":"0","total-position-deletes":"0","total-equality-deletes":"0","total-delete-files":"0","total-records":"0"},"manifest-list":"s3://warehouse/tpch.sf01/lineitem/metadata/snap-5219799779815904705-0-01992e1d-22cf-79f3-b2b0-476f11d4356f.avro","schema-id":0}],"statistics":[],"partition-statistics":[],"snapshot-log":[{"timestamp-ms":1757415285459,"snapshot-id":5219799779815904705}],"metadata-log":[{"timestamp-ms":1757415280577,"metadata-file":"s3://warehouse/tpch.sf01/lineitem/metadata/00000-8fbbff5c-f0a5-4b43-97b3-fb31d7565ad7.metadata.json"}]} \ No newline at end of file diff --git a/assets/tpch/tpch.sf01/lineitem/metadata/01992e1d-22cf-79f3-b2b0-476f11d4356f-m0.avro b/assets/tpch/tpch.sf01/lineitem/metadata/01992e1d-22cf-79f3-b2b0-476f11d4356f-m0.avro new file mode 100644 index 0000000..6962be0 Binary files /dev/null and b/assets/tpch/tpch.sf01/lineitem/metadata/01992e1d-22cf-79f3-b2b0-476f11d4356f-m0.avro differ diff --git a/assets/tpch/tpch.sf01/lineitem/metadata/snap-5219799779815904705-0-01992e1d-22cf-79f3-b2b0-476f11d4356f.avro b/assets/tpch/tpch.sf01/lineitem/metadata/snap-5219799779815904705-0-01992e1d-22cf-79f3-b2b0-476f11d4356f.avro new file mode 100644 index 0000000..e9c77e8 Binary files /dev/null and b/assets/tpch/tpch.sf01/lineitem/metadata/snap-5219799779815904705-0-01992e1d-22cf-79f3-b2b0-476f11d4356f.avro differ diff --git a/assets/tpch/tpch.sf01/nation/data/data_nation-00000.parquet b/assets/tpch/tpch.sf01/nation/data/data_nation-00000.parquet new file mode 100644 index 0000000..2ab594c Binary files 
/dev/null and b/assets/tpch/tpch.sf01/nation/data/data_nation-00000.parquet differ diff --git a/assets/tpch/tpch.sf01/nation/metadata/00000-3d36be32-ac36-489e-bb18-5cb5bc83ca62.metadata.json b/assets/tpch/tpch.sf01/nation/metadata/00000-3d36be32-ac36-489e-bb18-5cb5bc83ca62.metadata.json new file mode 100644 index 0000000..21f87a4 --- /dev/null +++ b/assets/tpch/tpch.sf01/nation/metadata/00000-3d36be32-ac36-489e-bb18-5cb5bc83ca62.metadata.json @@ -0,0 +1 @@ +{"format-version":2,"table-uuid":"10784b64-d6ed-41bd-9ecc-22031716ea4a","location":"s3://warehouse/tpch.sf01/nation","last-sequence-number":0,"last-updated-ms":1757415286623,"last-column-id":4,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"n_nationkey","required":true,"type":"int"},{"id":2,"name":"n_name","required":false,"type":"string"},{"id":3,"name":"n_regionkey","required":false,"type":"int"},{"id":4,"name":"n_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":-1,"refs":{},"snapshots":[],"statistics":[],"partition-statistics":[],"snapshot-log":[],"metadata-log":[]} \ No newline at end of file diff --git a/assets/tpch/tpch.sf01/nation/metadata/00001-44f668fe-3688-49d5-851f-36e75d143321.metadata.json b/assets/tpch/tpch.sf01/nation/metadata/00001-44f668fe-3688-49d5-851f-36e75d143321.metadata.json new file mode 100644 index 0000000..67868d2 --- /dev/null +++ b/assets/tpch/tpch.sf01/nation/metadata/00001-44f668fe-3688-49d5-851f-36e75d143321.metadata.json @@ -0,0 +1 @@ +{"format-version":2,"table-uuid":"10784b64-d6ed-41bd-9ecc-22031716ea4a","location":"s3://warehouse/tpch.sf01/nation","last-sequence-number":1,"last-updated-ms":1757415286646,"last-column-id":4,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"n_nationkey","required":true,"type":"int"},{"id":2,"name":"n_name","required":false,"type":"string"},{"id":3,"name":"n_regionkey","required":false,"type":"int"},{"id":4,"name":"n_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":7279317277629245920,"refs":{"main":{"snapshot-id":7279317277629245920,"type":"branch"}},"snapshots":[{"sequence-number":1,"snapshot-id":7279317277629245920,"timestamp-ms":1757415286646,"summary":{"operation":"append","total-records":"0","total-position-deletes":"0","total-equality-deletes":"0","total-delete-files":"0","total-files-size":"0","total-data-files":"0"},"manifest-list":"s3://warehouse/tpch.sf01/nation/metadata/snap-7279317277629245920-0-01992e1d-2772-7c42-bd53-aebdf9c39aa5.avro","schema-id":0}],"statistics":[],"partition-statistics":[],"snapshot-log":[{"timestamp-ms":1757415286646,"snapshot-id":7279317277629245920}],"metadata-log":[{"timestamp-ms":1757415286623,"metadata-file":"s3://warehouse/tpch.sf01/nation/metadata/00000-3d36be32-ac36-489e-bb18-5cb5bc83ca62.metadata.json"}]} \ No newline at end of file diff --git 
a/assets/tpch/tpch.sf01/nation/metadata/01992e1d-2772-7c42-bd53-aebdf9c39aa5-m0.avro b/assets/tpch/tpch.sf01/nation/metadata/01992e1d-2772-7c42-bd53-aebdf9c39aa5-m0.avro new file mode 100644 index 0000000..5c4f195 Binary files /dev/null and b/assets/tpch/tpch.sf01/nation/metadata/01992e1d-2772-7c42-bd53-aebdf9c39aa5-m0.avro differ diff --git a/assets/tpch/tpch.sf01/nation/metadata/snap-7279317277629245920-0-01992e1d-2772-7c42-bd53-aebdf9c39aa5.avro b/assets/tpch/tpch.sf01/nation/metadata/snap-7279317277629245920-0-01992e1d-2772-7c42-bd53-aebdf9c39aa5.avro new file mode 100644 index 0000000..f877f4a Binary files /dev/null and b/assets/tpch/tpch.sf01/nation/metadata/snap-7279317277629245920-0-01992e1d-2772-7c42-bd53-aebdf9c39aa5.avro differ diff --git a/assets/tpch/tpch.sf01/orders/data/data_orders-00000.parquet b/assets/tpch/tpch.sf01/orders/data/data_orders-00000.parquet new file mode 100644 index 0000000..55e9816 Binary files /dev/null and b/assets/tpch/tpch.sf01/orders/data/data_orders-00000.parquet differ diff --git a/assets/tpch/tpch.sf01/orders/metadata/00000-f49e4c80-9350-48d3-a736-de996c21799e.metadata.json b/assets/tpch/tpch.sf01/orders/metadata/00000-f49e4c80-9350-48d3-a736-de996c21799e.metadata.json new file mode 100644 index 0000000..697180a --- /dev/null +++ b/assets/tpch/tpch.sf01/orders/metadata/00000-f49e4c80-9350-48d3-a736-de996c21799e.metadata.json @@ -0,0 +1 @@ +{"format-version":2,"table-uuid":"45f1057c-bb1a-4ef8-b125-7a1fdaad9538","location":"s3://warehouse/tpch.sf01/orders","last-sequence-number":0,"last-updated-ms":1757415285486,"last-column-id":9,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"o_orderkey","required":true,"type":"long"},{"id":2,"name":"o_custkey","required":false,"type":"long"},{"id":3,"name":"o_orderstatus","required":false,"type":"string"},{"id":4,"name":"o_totalprice","required":false,"type":"decimal(15, 2)"},{"id":5,"name":"o_orderdate","required":false,"type":"date"},{"id":6,"name":"o_orderpriority","required":false,"type":"string"},{"id":7,"name":"o_clerk","required":false,"type":"string"},{"id":8,"name":"o_shippriority","required":false,"type":"int"},{"id":9,"name":"o_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":-1,"refs":{},"snapshots":[],"statistics":[],"partition-statistics":[],"snapshot-log":[],"metadata-log":[]} \ No newline at end of file diff --git a/assets/tpch/tpch.sf01/orders/metadata/00001-9b3611ba-f4d5-4e53-b65d-a6ea5a7c9dce.metadata.json b/assets/tpch/tpch.sf01/orders/metadata/00001-9b3611ba-f4d5-4e53-b65d-a6ea5a7c9dce.metadata.json new file mode 100644 index 0000000..5f78f4f --- /dev/null +++ b/assets/tpch/tpch.sf01/orders/metadata/00001-9b3611ba-f4d5-4e53-b65d-a6ea5a7c9dce.metadata.json @@ -0,0 +1 @@ 
+{"format-version":2,"table-uuid":"45f1057c-bb1a-4ef8-b125-7a1fdaad9538","location":"s3://warehouse/tpch.sf01/orders","last-sequence-number":1,"last-updated-ms":1757415286064,"last-column-id":9,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"o_orderkey","required":true,"type":"long"},{"id":2,"name":"o_custkey","required":false,"type":"long"},{"id":3,"name":"o_orderstatus","required":false,"type":"string"},{"id":4,"name":"o_totalprice","required":false,"type":"decimal(15, 2)"},{"id":5,"name":"o_orderdate","required":false,"type":"date"},{"id":6,"name":"o_orderpriority","required":false,"type":"string"},{"id":7,"name":"o_clerk","required":false,"type":"string"},{"id":8,"name":"o_shippriority","required":false,"type":"int"},{"id":9,"name":"o_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":8364432507415766515,"refs":{"main":{"snapshot-id":8364432507415766515,"type":"branch"}},"snapshots":[{"sequence-number":1,"snapshot-id":8364432507415766515,"timestamp-ms":1757415286064,"summary":{"operation":"append","total-position-deletes":"0","total-delete-files":"0","total-data-files":"0","total-records":"0","total-files-size":"0","total-equality-deletes":"0"},"manifest-list":"s3://warehouse/tpch.sf01/orders/metadata/snap-8364432507415766515-0-01992e1d-252c-7781-ad49-d743f2d9881a.avro","schema-id":0}],"statistics":[],"partition-statistics":[],"snapshot-log":[{"timestamp-ms":1757415286064,"snapshot-id":8364432507415766515}],"metadata-log":[{"timestamp-ms":1757415285486,"metadata-file":"s3://warehouse/tpch.sf01/orders/metadata/00000-f49e4c80-9350-48d3-a736-de996c21799e.metadata.json"}]} \ No newline at end of file diff --git a/assets/tpch/tpch.sf01/orders/metadata/01992e1d-252c-7781-ad49-d743f2d9881a-m0.avro b/assets/tpch/tpch.sf01/orders/metadata/01992e1d-252c-7781-ad49-d743f2d9881a-m0.avro new file mode 100644 index 0000000..dbbad88 Binary files /dev/null and b/assets/tpch/tpch.sf01/orders/metadata/01992e1d-252c-7781-ad49-d743f2d9881a-m0.avro differ diff --git a/assets/tpch/tpch.sf01/orders/metadata/snap-8364432507415766515-0-01992e1d-252c-7781-ad49-d743f2d9881a.avro b/assets/tpch/tpch.sf01/orders/metadata/snap-8364432507415766515-0-01992e1d-252c-7781-ad49-d743f2d9881a.avro new file mode 100644 index 0000000..bdf7690 Binary files /dev/null and b/assets/tpch/tpch.sf01/orders/metadata/snap-8364432507415766515-0-01992e1d-252c-7781-ad49-d743f2d9881a.avro differ diff --git a/assets/tpch/tpch.sf01/part/data/data_part-00000.parquet b/assets/tpch/tpch.sf01/part/data/data_part-00000.parquet new file mode 100644 index 0000000..8196f74 Binary files /dev/null and b/assets/tpch/tpch.sf01/part/data/data_part-00000.parquet differ diff --git a/assets/tpch/tpch.sf01/part/metadata/00000-a671a1f7-cf20-4f76-a635-932e88a83988.metadata.json b/assets/tpch/tpch.sf01/part/metadata/00000-a671a1f7-cf20-4f76-a635-932e88a83988.metadata.json new file mode 100644 index 0000000..0310449 --- /dev/null +++ b/assets/tpch/tpch.sf01/part/metadata/00000-a671a1f7-cf20-4f76-a635-932e88a83988.metadata.json @@ -0,0 +1 @@ 
+{"format-version":2,"table-uuid":"f8691ce3-0d69-4608-81f4-75eee1f25d9d","location":"s3://warehouse/tpch.sf01/part","last-sequence-number":0,"last-updated-ms":1757415286218,"last-column-id":9,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"p_partkey","required":true,"type":"long"},{"id":2,"name":"p_name","required":false,"type":"string"},{"id":3,"name":"p_mfgr","required":false,"type":"string"},{"id":4,"name":"p_brand","required":false,"type":"string"},{"id":5,"name":"p_type","required":false,"type":"string"},{"id":6,"name":"p_size","required":false,"type":"int"},{"id":7,"name":"p_container","required":false,"type":"string"},{"id":8,"name":"p_retailprice","required":false,"type":"decimal(15, 2)"},{"id":9,"name":"p_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":-1,"refs":{},"snapshots":[],"statistics":[],"partition-statistics":[],"snapshot-log":[],"metadata-log":[]} \ No newline at end of file diff --git a/assets/tpch/tpch.sf01/part/metadata/00001-941e3774-3f57-497e-9597-f2ecfab39923.metadata.json b/assets/tpch/tpch.sf01/part/metadata/00001-941e3774-3f57-497e-9597-f2ecfab39923.metadata.json new file mode 100644 index 0000000..bbd1914 --- /dev/null +++ b/assets/tpch/tpch.sf01/part/metadata/00001-941e3774-3f57-497e-9597-f2ecfab39923.metadata.json @@ -0,0 +1 @@ +{"format-version":2,"table-uuid":"f8691ce3-0d69-4608-81f4-75eee1f25d9d","location":"s3://warehouse/tpch.sf01/part","last-sequence-number":1,"last-updated-ms":1757415286312,"last-column-id":9,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"p_partkey","required":true,"type":"long"},{"id":2,"name":"p_name","required":false,"type":"string"},{"id":3,"name":"p_mfgr","required":false,"type":"string"},{"id":4,"name":"p_brand","required":false,"type":"string"},{"id":5,"name":"p_type","required":false,"type":"string"},{"id":6,"name":"p_size","required":false,"type":"int"},{"id":7,"name":"p_container","required":false,"type":"string"},{"id":8,"name":"p_retailprice","required":false,"type":"decimal(15, 
2)"},{"id":9,"name":"p_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":6868592323279126150,"refs":{"main":{"snapshot-id":6868592323279126150,"type":"branch"}},"snapshots":[{"sequence-number":1,"snapshot-id":6868592323279126150,"timestamp-ms":1757415286312,"summary":{"operation":"append","total-position-deletes":"0","total-files-size":"0","total-data-files":"0","total-equality-deletes":"0","total-records":"0","total-delete-files":"0"},"manifest-list":"s3://warehouse/tpch.sf01/part/metadata/snap-6868592323279126150-0-01992e1d-2625-7760-bde7-22b5407604a5.avro","schema-id":0}],"statistics":[],"partition-statistics":[],"snapshot-log":[{"timestamp-ms":1757415286312,"snapshot-id":6868592323279126150}],"metadata-log":[{"timestamp-ms":1757415286218,"metadata-file":"s3://warehouse/tpch.sf01/part/metadata/00000-a671a1f7-cf20-4f76-a635-932e88a83988.metadata.json"}]} \ No newline at end of file diff --git a/assets/tpch/tpch.sf01/part/metadata/01992e1d-2625-7760-bde7-22b5407604a5-m0.avro b/assets/tpch/tpch.sf01/part/metadata/01992e1d-2625-7760-bde7-22b5407604a5-m0.avro new file mode 100644 index 0000000..a4f87bf Binary files /dev/null and b/assets/tpch/tpch.sf01/part/metadata/01992e1d-2625-7760-bde7-22b5407604a5-m0.avro differ diff --git a/assets/tpch/tpch.sf01/part/metadata/snap-6868592323279126150-0-01992e1d-2625-7760-bde7-22b5407604a5.avro b/assets/tpch/tpch.sf01/part/metadata/snap-6868592323279126150-0-01992e1d-2625-7760-bde7-22b5407604a5.avro new file mode 100644 index 0000000..538ed5e Binary files /dev/null and b/assets/tpch/tpch.sf01/part/metadata/snap-6868592323279126150-0-01992e1d-2625-7760-bde7-22b5407604a5.avro differ diff --git a/assets/tpch/tpch.sf01/partsupp/data/data_partsupp-00000.parquet b/assets/tpch/tpch.sf01/partsupp/data/data_partsupp-00000.parquet new file mode 100644 index 0000000..ffdd57d Binary files /dev/null and b/assets/tpch/tpch.sf01/partsupp/data/data_partsupp-00000.parquet differ diff --git a/assets/tpch/tpch.sf01/partsupp/metadata/00000-8b308426-4896-4b71-b515-141876d769fe.metadata.json b/assets/tpch/tpch.sf01/partsupp/metadata/00000-8b308426-4896-4b71-b515-141876d769fe.metadata.json new file mode 100644 index 0000000..63d2704 --- /dev/null +++ b/assets/tpch/tpch.sf01/partsupp/metadata/00000-8b308426-4896-4b71-b515-141876d769fe.metadata.json @@ -0,0 +1 @@ +{"format-version":2,"table-uuid":"f2cf563b-b72e-466d-8a0b-634f59405477","location":"s3://warehouse/tpch.sf01/partsupp","last-sequence-number":0,"last-updated-ms":1757415286329,"last-column-id":5,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"ps_partkey","required":true,"type":"long"},{"id":2,"name":"ps_suppkey","required":false,"type":"long"},{"id":3,"name":"ps_availqty","required":false,"type":"long"},{"id":4,"name":"ps_supplycost","required":false,"type":"decimal(15, 
2)"},{"id":5,"name":"ps_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":-1,"refs":{},"snapshots":[],"statistics":[],"partition-statistics":[],"snapshot-log":[],"metadata-log":[]} \ No newline at end of file diff --git a/assets/tpch/tpch.sf01/partsupp/metadata/00001-02c6a971-e64d-4070-87bd-4b101224cc0f.metadata.json b/assets/tpch/tpch.sf01/partsupp/metadata/00001-02c6a971-e64d-4070-87bd-4b101224cc0f.metadata.json new file mode 100644 index 0000000..8765617 --- /dev/null +++ b/assets/tpch/tpch.sf01/partsupp/metadata/00001-02c6a971-e64d-4070-87bd-4b101224cc0f.metadata.json @@ -0,0 +1 @@ +{"format-version":2,"table-uuid":"f2cf563b-b72e-466d-8a0b-634f59405477","location":"s3://warehouse/tpch.sf01/partsupp","last-sequence-number":1,"last-updated-ms":1757415286557,"last-column-id":5,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"ps_partkey","required":true,"type":"long"},{"id":2,"name":"ps_suppkey","required":false,"type":"long"},{"id":3,"name":"ps_availqty","required":false,"type":"long"},{"id":4,"name":"ps_supplycost","required":false,"type":"decimal(15, 2)"},{"id":5,"name":"ps_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":8936585034194709635,"refs":{"main":{"snapshot-id":8936585034194709635,"type":"branch"}},"snapshots":[{"sequence-number":1,"snapshot-id":8936585034194709635,"timestamp-ms":1757415286557,"summary":{"operation":"append","total-delete-files":"0","total-data-files":"0","total-files-size":"0","total-records":"0","total-position-deletes":"0","total-equality-deletes":"0"},"manifest-list":"s3://warehouse/tpch.sf01/partsupp/metadata/snap-8936585034194709635-0-01992e1d-2718-7533-bdc0-e629f4d2a5b7.avro","schema-id":0}],"statistics":[],"partition-statistics":[],"snapshot-log":[{"timestamp-ms":1757415286557,"snapshot-id":8936585034194709635}],"metadata-log":[{"timestamp-ms":1757415286329,"metadata-file":"s3://warehouse/tpch.sf01/partsupp/metadata/00000-8b308426-4896-4b71-b515-141876d769fe.metadata.json"}]} \ No newline at end of file diff --git a/assets/tpch/tpch.sf01/partsupp/metadata/01992e1d-2718-7533-bdc0-e629f4d2a5b7-m0.avro b/assets/tpch/tpch.sf01/partsupp/metadata/01992e1d-2718-7533-bdc0-e629f4d2a5b7-m0.avro new file mode 100644 index 0000000..5ec7e57 Binary files /dev/null and b/assets/tpch/tpch.sf01/partsupp/metadata/01992e1d-2718-7533-bdc0-e629f4d2a5b7-m0.avro differ diff --git a/assets/tpch/tpch.sf01/partsupp/metadata/snap-8936585034194709635-0-01992e1d-2718-7533-bdc0-e629f4d2a5b7.avro b/assets/tpch/tpch.sf01/partsupp/metadata/snap-8936585034194709635-0-01992e1d-2718-7533-bdc0-e629f4d2a5b7.avro new file mode 100644 index 0000000..0f1ce38 Binary files /dev/null and b/assets/tpch/tpch.sf01/partsupp/metadata/snap-8936585034194709635-0-01992e1d-2718-7533-bdc0-e629f4d2a5b7.avro differ diff --git a/assets/tpch/tpch.sf01/region/data/data_region-00000.parquet 
b/assets/tpch/tpch.sf01/region/data/data_region-00000.parquet new file mode 100644 index 0000000..500a89b Binary files /dev/null and b/assets/tpch/tpch.sf01/region/data/data_region-00000.parquet differ diff --git a/assets/tpch/tpch.sf01/region/metadata/00000-82d4e01a-5eb6-4089-95ee-da39375997d0.metadata.json b/assets/tpch/tpch.sf01/region/metadata/00000-82d4e01a-5eb6-4089-95ee-da39375997d0.metadata.json new file mode 100644 index 0000000..651a94f --- /dev/null +++ b/assets/tpch/tpch.sf01/region/metadata/00000-82d4e01a-5eb6-4089-95ee-da39375997d0.metadata.json @@ -0,0 +1 @@ +{"format-version":2,"table-uuid":"285b7abf-a205-44d2-ba9b-a8bf134dfeb0","location":"s3://warehouse/tpch.sf01/region","last-sequence-number":0,"last-updated-ms":1757415286661,"last-column-id":3,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"r_regionkey","required":true,"type":"int"},{"id":2,"name":"r_name","required":false,"type":"string"},{"id":3,"name":"r_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":-1,"refs":{},"snapshots":[],"statistics":[],"partition-statistics":[],"snapshot-log":[],"metadata-log":[]} \ No newline at end of file diff --git a/assets/tpch/tpch.sf01/region/metadata/00001-53fea646-6102-4ec8-aad4-65e9e96dffee.metadata.json b/assets/tpch/tpch.sf01/region/metadata/00001-53fea646-6102-4ec8-aad4-65e9e96dffee.metadata.json new file mode 100644 index 0000000..130fcee --- /dev/null +++ b/assets/tpch/tpch.sf01/region/metadata/00001-53fea646-6102-4ec8-aad4-65e9e96dffee.metadata.json @@ -0,0 +1 @@ +{"format-version":2,"table-uuid":"285b7abf-a205-44d2-ba9b-a8bf134dfeb0","location":"s3://warehouse/tpch.sf01/region","last-sequence-number":1,"last-updated-ms":1757415286686,"last-column-id":3,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"r_regionkey","required":true,"type":"int"},{"id":2,"name":"r_name","required":false,"type":"string"},{"id":3,"name":"r_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":2494555001631895663,"refs":{"main":{"snapshot-id":2494555001631895663,"type":"branch"}},"snapshots":[{"sequence-number":1,"snapshot-id":2494555001631895663,"timestamp-ms":1757415286686,"summary":{"operation":"append","total-records":"0","total-files-size":"0","total-equality-deletes":"0","total-data-files":"0","total-delete-files":"0","total-position-deletes":"0"},"manifest-list":"s3://warehouse/tpch.sf01/region/metadata/snap-2494555001631895663-0-01992e1d-279b-7f30-9aca-86956294170f.avro","schema-id":0}],"statistics":[],"partition-statistics":[],"snapshot-log":[{"timestamp-ms":1757415286686,"snapshot-id":2494555001631895663}],"metadata-log":[{"timestamp-ms":1757415286661,"metadata-file":"s3://warehouse/tpch.sf01/region/metadata/00000-82d4e01a-5eb6-4089-95ee-da39375997d0.metadata.json"}]} \ No newline at end of file diff --git 
a/assets/tpch/tpch.sf01/region/metadata/01992e1d-279b-7f30-9aca-86956294170f-m0.avro b/assets/tpch/tpch.sf01/region/metadata/01992e1d-279b-7f30-9aca-86956294170f-m0.avro new file mode 100644 index 0000000..ae070d8 Binary files /dev/null and b/assets/tpch/tpch.sf01/region/metadata/01992e1d-279b-7f30-9aca-86956294170f-m0.avro differ diff --git a/assets/tpch/tpch.sf01/region/metadata/snap-2494555001631895663-0-01992e1d-279b-7f30-9aca-86956294170f.avro b/assets/tpch/tpch.sf01/region/metadata/snap-2494555001631895663-0-01992e1d-279b-7f30-9aca-86956294170f.avro new file mode 100644 index 0000000..ab51958 Binary files /dev/null and b/assets/tpch/tpch.sf01/region/metadata/snap-2494555001631895663-0-01992e1d-279b-7f30-9aca-86956294170f.avro differ diff --git a/assets/tpch/tpch.sf01/supplier/data/data_supplier-00000.parquet b/assets/tpch/tpch.sf01/supplier/data/data_supplier-00000.parquet new file mode 100644 index 0000000..df713e9 Binary files /dev/null and b/assets/tpch/tpch.sf01/supplier/data/data_supplier-00000.parquet differ diff --git a/assets/tpch/tpch.sf01/supplier/metadata/00000-5db94572-daf2-4f6c-b7a7-d9fb8ccdf5e1.metadata.json b/assets/tpch/tpch.sf01/supplier/metadata/00000-5db94572-daf2-4f6c-b7a7-d9fb8ccdf5e1.metadata.json new file mode 100644 index 0000000..8e2f2e8 --- /dev/null +++ b/assets/tpch/tpch.sf01/supplier/metadata/00000-5db94572-daf2-4f6c-b7a7-d9fb8ccdf5e1.metadata.json @@ -0,0 +1 @@ +{"format-version":2,"table-uuid":"b3c1f9ff-de6a-4d16-9564-4abdff0bb97d","location":"s3://warehouse/tpch.sf01/supplier","last-sequence-number":0,"last-updated-ms":1757415286574,"last-column-id":7,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"s_suppkey","required":true,"type":"long"},{"id":2,"name":"s_name","required":false,"type":"string"},{"id":3,"name":"s_address","required":false,"type":"string"},{"id":4,"name":"s_nationkey","required":false,"type":"int"},{"id":5,"name":"s_phone","required":false,"type":"string"},{"id":6,"name":"s_acctbal","required":false,"type":"decimal(15, 2)"},{"id":7,"name":"s_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":-1,"refs":{},"snapshots":[],"statistics":[],"partition-statistics":[],"snapshot-log":[],"metadata-log":[]} \ No newline at end of file diff --git a/assets/tpch/tpch.sf01/supplier/metadata/00001-55144648-a61c-46b3-a7e5-fed95e61e1ae.metadata.json b/assets/tpch/tpch.sf01/supplier/metadata/00001-55144648-a61c-46b3-a7e5-fed95e61e1ae.metadata.json new file mode 100644 index 0000000..9011892 --- /dev/null +++ b/assets/tpch/tpch.sf01/supplier/metadata/00001-55144648-a61c-46b3-a7e5-fed95e61e1ae.metadata.json @@ -0,0 +1 @@ 
+{"format-version":2,"table-uuid":"b3c1f9ff-de6a-4d16-9564-4abdff0bb97d","location":"s3://warehouse/tpch.sf01/supplier","last-sequence-number":1,"last-updated-ms":1757415286606,"last-column-id":7,"current-schema-id":0,"schemas":[{"type":"struct","schema-id":0,"identifier-field-ids":[1],"fields":[{"id":1,"name":"s_suppkey","required":true,"type":"long"},{"id":2,"name":"s_name","required":false,"type":"string"},{"id":3,"name":"s_address","required":false,"type":"string"},{"id":4,"name":"s_nationkey","required":false,"type":"int"},{"id":5,"name":"s_phone","required":false,"type":"string"},{"id":6,"name":"s_acctbal","required":false,"type":"decimal(15, 2)"},{"id":7,"name":"s_comment","required":false,"type":"string"}]}],"default-spec-id":0,"partition-specs":[{"spec-id":0,"fields":[]}],"last-partition-id":999,"default-sort-order-id":0,"sort-orders":[{"order-id":0,"fields":[]}],"properties":{"write.format.default":"parquet","write.metadata.compression-codec":"none","write.parquet.compression-codec":"zstd"},"current-snapshot-id":8349235124768620733,"refs":{"main":{"snapshot-id":8349235124768620733,"type":"branch"}},"snapshots":[{"sequence-number":1,"snapshot-id":8349235124768620733,"timestamp-ms":1757415286606,"summary":{"operation":"append","total-files-size":"0","total-delete-files":"0","total-position-deletes":"0","total-records":"0","total-equality-deletes":"0","total-data-files":"0"},"manifest-list":"s3://warehouse/tpch.sf01/supplier/metadata/snap-8349235124768620733-0-01992e1d-274a-7d11-b26a-221276e8b50b.avro","schema-id":0}],"statistics":[],"partition-statistics":[],"snapshot-log":[{"timestamp-ms":1757415286606,"snapshot-id":8349235124768620733}],"metadata-log":[{"timestamp-ms":1757415286574,"metadata-file":"s3://warehouse/tpch.sf01/supplier/metadata/00000-5db94572-daf2-4f6c-b7a7-d9fb8ccdf5e1.metadata.json"}]} \ No newline at end of file diff --git a/assets/tpch/tpch.sf01/supplier/metadata/01992e1d-274a-7d11-b26a-221276e8b50b-m0.avro b/assets/tpch/tpch.sf01/supplier/metadata/01992e1d-274a-7d11-b26a-221276e8b50b-m0.avro new file mode 100644 index 0000000..2d57bab Binary files /dev/null and b/assets/tpch/tpch.sf01/supplier/metadata/01992e1d-274a-7d11-b26a-221276e8b50b-m0.avro differ diff --git a/assets/tpch/tpch.sf01/supplier/metadata/snap-8349235124768620733-0-01992e1d-274a-7d11-b26a-221276e8b50b.avro b/assets/tpch/tpch.sf01/supplier/metadata/snap-8349235124768620733-0-01992e1d-274a-7d11-b26a-221276e8b50b.avro new file mode 100644 index 0000000..817c46e Binary files /dev/null and b/assets/tpch/tpch.sf01/supplier/metadata/snap-8349235124768620733-0-01992e1d-274a-7d11-b26a-221276e8b50b.avro differ diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 0000000..67e4e55 --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,79 @@ +networks: + iceberg_net: + +services: + rest: + container_name: iceberg-rest + image: apache/iceberg-rest-fixture + environment: + - AWS_ACCESS_KEY_ID=root + - AWS_SECRET_ACCESS_KEY=password + - AWS_ENDPOINT_URL=http://minio:9000 + - AWS_REGION=us-east-1 + - CATALOG_S3_ENDPOINT=http://minio:9000 + - CATALOG_WAREHOUSE=s3://warehouse + - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO + volumes: + - ../assets/rest:/tmp + networks: + iceberg_net: + ports: + - "8181:8181" + minio: + container_name: minio + image: minio/minio:latest + ports: + - "9000:9000" # MinIO API port + - "9001:9001" # MinIO Console port + environment: + MINIO_ROOT_USER: root + MINIO_ROOT_PASSWORD: password + MINIO_DOMAIN: minio + networks: + iceberg_net: 
+ aliases: + - warehouse.minio + command: server --console-address ":9001" /data + spark-iceberg: + image: tabulario/spark-iceberg + container_name: spark-iceberg + build: spark/ + networks: + iceberg_net: + depends_on: + - rest + - minio + volumes: + - ./warehouse:/home/iceberg/warehouse + - ./notebooks:/home/iceberg/notebooks/notebooks + environment: + - AWS_ACCESS_KEY_ID=root + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + - AWS_ENDPOINT_URL=http://minio:9000 + ports: + - 8888:8888 + - 8080:8080 + - 10000:10000 + - 10001:10001 + mc: + depends_on: + - minio + image: minio/mc + container_name: mc + networks: + iceberg_net: + volumes: + - ../assets/tpch:/input + environment: + - AWS_ACCESS_KEY_ID=root + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + entrypoint: | + /bin/sh -c " + until (/usr/bin/mc alias set minio http://minio:9000 root password) do echo '...waiting...' && sleep 1; done; + /usr/bin/mc rm -r --force minio/warehouse + /usr/bin/mc mb minio/warehouse; + /usr/bin/mc cp -r /input/tpch.sf01/ minio/warehouse/tpch.sf01/; + tail -f /dev/null + " \ No newline at end of file diff --git a/examples/basic_usage.jl b/examples/basic_usage.jl index 027b268..33016aa 100755 --- a/examples/basic_usage.jl +++ b/examples/basic_usage.jl @@ -1,12 +1,13 @@ #!/usr/bin/env julia +using BenchmarkTools # Basic usage example for RustyIceberg.jl println("=== RustyIceberg.jl Basic Usage Example ===") # Load the package println("Loading RustyIceberg package...") -include("../src/RustyIceberg.jl") -using .RustyIceberg +using RustyIceberg +using RustyIceberg: init_runtime, read_table using DataFrames println("✅ Package loaded successfully!") @@ -40,106 +41,113 @@ end # Load environment variables env_loaded = load_env_file(joinpath(@__DIR__, ".env")) -# Test library loading -println("Using default library path") -load_iceberg_library() -println("✅ Library loaded!") +# Initialize Iceberg runtime +println("Initializing Iceberg runtime...") +init_runtime() +println("✅ Runtime initialized!") # Test actual table reading using the same paths as integration test println("Testing table reading with actual data...") -# Use the same table and metadata paths as in integration_test.c -# table_path = "s3://vustef-dev/tpch-sf0.1-no-part/nation" -# metadata_path = "metadata/00001-1744d9f4-1472-4f8c-ac86-b0b7c291248e.metadata.json" -table_path = "s3://vustef-dev/tpch-sf0.1-no-part/customer" -metadata_path = "metadata/00001-0789fc06-57dd-45b5-b5cc-42ef1386b497.metadata.json" +snapshot_path = "s3://warehouse/tpch.sf01/customer/metadata/00001-76f6e7e4-b34f-492f-b6a1-cc9f8c8f4975.metadata.json" -println("Table path: $table_path") -println("Metadata path: $metadata_path") +println("Snapshot path: $(snapshot_path)") -try - # Read the table using the high-level function - now returns an iterator - println("Reading Iceberg table...") - table_iterator = read_iceberg_table(table_path, metadata_path) +function _read_table(snapshot_path, benchmark::Bool=false) + try + # Read the table using the high-level function - now returns an iterator + !benchmark && println("Reading Iceberg table...") + table_iterator = read_table(snapshot_path) - println("✅ Table iterator created successfully!") + !benchmark && println("✅ Table iterator created successfully!") - # Iterate over Arrow.Table objects and convert to DataFrames - all_dataframes = DataFrame[] - batch_count = 0 + # Iterate over Arrow.Table objects and convert to DataFrames + all_dataframes = DataFrame[] + batch_count = 0 - for arrow_table in table_iterator - 
batch_count += 1 - df = DataFrame(arrow_table) - push!(all_dataframes, df) + iter = !benchmark ? Iterators.take(table_iterator, 2) : table_iterator + for arrow_table in iter + batch_count += 1 + benchmark && continue + df = copy(DataFrame((arrow_table))) + push!(all_dataframes, df) - println("📦 Batch $batch_count:") - println(" - Rows: $(nrow(df))") - println(" - Columns: $(ncol(df))") - println(" - Column names: $(names(df))") + println("📦 Batch $batch_count:") + println(" - Rows: $(nrow(df))") + println(" - Columns: $(ncol(df))") + println(" - Column names: $(names(df))") - # Show first few rows of first batch - if batch_count == 1 && nrow(df) > 0 - println("📋 First few rows:") - println(first(df, 30)) + # Show first few rows of first batch + if batch_count == 1 && nrow(df) > 0 + println("📋 First few rows:") + println(first(df, 10)) + end end - end - - # Combine all DataFrames - if !isempty(all_dataframes) - combined_df = reduce(vcat, all_dataframes) - println("\n📊 Combined DataFrame info:") - println(" - Total rows: $(nrow(combined_df))") - println(" - Total columns: $(ncol(combined_df))") - println(" - Total batches: $batch_count") - else - println("\n📊 No data found in table") - end - - # Test with specific columns - println("\nTesting column selection...") - if !isempty(all_dataframes) && !isempty(names(all_dataframes[1])) - selected_columns = names(all_dataframes[1])[1:min(2, length(names(all_dataframes[1])))] - println("Selecting columns: $selected_columns") - selected_iterator = read_iceberg_table(table_path, metadata_path, columns=selected_columns) - selected_dataframes = DataFrame[] + # Combine all DataFrames + if !benchmark + if !isempty(all_dataframes) + combined_df = reduce(vcat, all_dataframes) + println("\n📊 Combined DataFrame info:") + println(" - Total rows: $(nrow(combined_df))") + println(" - Total columns: $(ncol(combined_df))") + println(" - Total batches: $(batch_count)") + else + println("\n📊 No data found in table") + end - for arrow_table in selected_iterator - df = DataFrame(arrow_table) - push!(selected_dataframes, df) + # Test with specific columns + println("\nTesting column selection...") + if !isempty(all_dataframes) && !isempty(names(all_dataframes[1])) + selected_columns = names(all_dataframes[1])[1:min(2, length(names(all_dataframes[1])))] + println("Selecting columns: $(selected_columns)") + + selected_iterator = RustyIceberg.read_table(snapshot_path; columns=selected_columns) + selected_dataframes = DataFrame[] + + for arrow_table in Iterators.take(selected_iterator, 2) + df = copy(DataFrame(arrow_table)) + push!(selected_dataframes, df) + end + + if !isempty(selected_dataframes) + combined_selected = reduce(vcat, selected_dataframes) + println("✅ Column selection successful!") + println(" - Selected columns: $(names(combined_selected))") + println(" - Rows: $(nrow(combined_selected))") + end + end end - if !isempty(selected_dataframes) - combined_selected = reduce(vcat, selected_dataframes) - println("✅ Column selection successful!") - println(" - Selected columns: $(names(combined_selected))") - println(" - Rows: $(nrow(combined_selected))") + catch e + println("❌ Error reading table: $e") + if env_loaded + println("\n💡 Troubleshooting tips:") + println(" 1. Make sure your .env file has correct AWS credentials") + println(" 2. Verify the S3 endpoint is accessible") + println(" 3. Check that the table path and metadata path are correct") + println(" 4. 
Ensure the Rust library was built correctly") + else + println("\n💡 Please configure your .env file first:") + println(" cp env.example .env") + println(" # Then edit .env with your actual credentials and paths") end - end - -catch e - println("❌ Error reading table: $e") - if env_loaded - println("\n💡 Troubleshooting tips:") - println(" 1. Make sure your .env file has correct AWS credentials") - println(" 2. Verify the S3 endpoint is accessible") - println(" 3. Check that the table path and metadata path are correct") - println(" 4. Ensure the library path in .env points to the correct location") - else - println("\n💡 Please configure your .env file first:") - println(" cp env.example .env") - println(" # Then edit .env with your actual credentials and paths") + rethrow(e) end end +_read_table(snapshot_path) + +# Locally using minio: 51.170 ms (9308 allocations: 504.50 KiB) +@btime _read_table(snapshot_path, true) + println("\n✅ Basic usage example completed!") println("\nTo use with your own data:") println(" # Get iterator over Arrow.Table objects:") -println(" iterator = read_iceberg_table(\"your-table-path\", \"your-metadata-path\")") +println(" iterator = read_iceberg_table(\"your-snapshot-path\")") println(" for arrow_table in iterator") println(" df = DataFrame(arrow_table) # Convert to DataFrame if needed") println(" # Process your data...") println(" end") println("\n # Or with column selection:") -println(" iterator = read_iceberg_table(\"your-table-path\", \"your-metadata-path\", columns=[\"col1\", \"col2\"])") +println(" iterator = read_iceberg_table(\"your-snapshot-path\", columns=[\"col1\", \"col2\"])") diff --git a/src/RustyIceberg.jl b/src/RustyIceberg.jl index 267dea8..6b5b4c1 100644 --- a/src/RustyIceberg.jl +++ b/src/RustyIceberg.jl @@ -1,61 +1,27 @@ module RustyIceberg using Base.Libc.Libdl: dlext +using Base: @kwdef, @lock +using Base.Threads: Atomic using Libdl using Arrow using iceberg_rust_ffi_jll -export IcebergTable, IcebergScan, ArrowBatch, IcebergResult -export IcebergTableIterator, IcebergTableIteratorState -export iceberg_table_open, iceberg_table_free, iceberg_table_scan -export iceberg_scan_select_columns, iceberg_scan_free, iceberg_scan_next_batch -export iceberg_arrow_batch_free, iceberg_error_message -export load_iceberg_library, unload_iceberg_library, read_iceberg_table - -# Constants -const ICEBERG_OK = 0 -const ICEBERG_ERROR = -1 -const ICEBERG_NULL_POINTER = -2 -const ICEBERG_IO_ERROR = -3 -const ICEBERG_INVALID_TABLE = -4 -const ICEBERG_END_OF_STREAM = -5 +export Table, Scan, ArrowBatch, StaticConfig, ArrowStream +export TableIterator, TableIteratorState +export init_runtime, read_table +export IcebergException -# Opaque pointer types -const IcebergTable = Ptr{Cvoid} -const IcebergScan = Ptr{Cvoid} - -# Arrow batch structure -struct ArrowBatch - data::Ptr{UInt8} - length::Csize_t - rust_ptr::Ptr{Cvoid} -end - -# Result type -const IcebergResult = Cint - -# Global variables for dynamic loading -const lib_handle = Ref{Ptr{Cvoid}}(C_NULL) -const function_pointers = Dict{Symbol, Ptr{Cvoid}}() +const Option{T} = Union{T, Nothing} -# Function pointer types -const iceberg_table_open_func_t = Ptr{Cvoid} -const iceberg_table_free_func_t = Ptr{Cvoid} -const iceberg_table_scan_func_t = Ptr{Cvoid} -const iceberg_scan_select_columns_func_t = Ptr{Cvoid} -const iceberg_scan_free_func_t = Ptr{Cvoid} -const iceberg_scan_next_batch_func_t = Ptr{Cvoid} -const iceberg_arrow_batch_free_func_t = Ptr{Cvoid} -const iceberg_error_message_func_t = Ptr{Cvoid} - -const 
RUST_LIB = if haskey(ENV, "ICEBERG_RUST_LIB")
+const rust_lib = if haskey(ENV, "ICEBERG_RUST_LIB")
     # For development, e.g. run `cargo build --release` and point to `target/release/` dir.
     # Note this is set at precompilation time, as `ccall` needs this to be a `const`,
     # so you need to restart Julia / recompile the package if you change it.
     lib_path = realpath(joinpath(ENV["ICEBERG_RUST_LIB"], "libiceberg_rust_ffi.$(dlext)"))
     @warn """
-        Using unreleased object_store_ffi library:
-        $(repr(contractuser(lib_path)))
+        Using unreleased iceberg_rust_ffi library:
+        $(repr(replace(lib_path, homedir() => "~")))
         This is only intended for local development and should not be used in production.
         """
     lib_path
@@ -64,268 +30,569 @@ else
 end
 
 """
-    load_iceberg_library(lib_path::String=iceberg_rust_ffi_jll.libiceberg_rust_ffi)
+Runtime configuration for the Iceberg library.
+"""
+struct StaticConfig
+    n_threads::Culonglong
+end
+
+function default_panic_hook()
+    println("Rust thread panicked, exiting the process")
+    exit(1)
+end
+
+const _ICEBERG_STARTED = Atomic{Bool}(false)
+
+function iceberg_started()
+    return _ICEBERG_STARTED[]
+end
+
+const _INIT_LOCK::ReentrantLock = ReentrantLock()
+_PANIC_HOOK::Function = default_panic_hook
+
+Base.@ccallable function iceberg_panic_hook_wrapper()::Cint
+    global _PANIC_HOOK
+    _PANIC_HOOK()
+    return 0
+end
+
+# This is the callback that Rust calls to notify a Julia task of a completed operation.
+Base.@ccallable function notify_result_iceberg(event_ptr::Ptr{Nothing})::Cint
+    event = unsafe_pointer_to_objref(event_ptr)::Base.Event
+    notify(event)
+    return 0
+end
+
+# A dict of all tasks that are waiting on some result from Rust
+const tasks_in_flight = IdDict{Task, Int64}()
+const preserve_task_lock = Threads.SpinLock()
+function preserve_task(x::Task)
+    @lock preserve_task_lock begin
+        v = get(tasks_in_flight, x, 0)::Int
+        tasks_in_flight[x] = v + 1
+    end
+    nothing
+end
+function unpreserve_task(x::Task)
+    @lock preserve_task_lock begin
+        v = get(tasks_in_flight, x, 0)::Int
+        if v == 0
+            error("unbalanced call to unpreserve_task for $(typeof(x))")
+        elseif v == 1
+            pop!(tasks_in_flight, x)
+        else
+            tasks_in_flight[x] = v - 1
+        end
+    end
+    nothing
+end
 
-Load the Iceberg C API library and resolve all function symbols.
 """
-function load_iceberg_library(lib_path::String=RUST_LIB)
-    println("Loading Iceberg C API library from: $(lib_path)")
+    init_runtime()
+    init_runtime(config::StaticConfig)
+    init_runtime(config::StaticConfig; on_rust_panic::Function)
+
+Initialize the Iceberg runtime.
 
-    # Try to open the dynamic library
-    lib_handle[] = dlopen(lib_path, RTLD_LAZY)
-    if lib_handle[] == C_NULL
-        error("Failed to load library: $(lib_path)")
+This starts a `tokio` runtime for handling Iceberg requests.
+It must be called before sending a request.
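+
+A minimal usage sketch (the zero-argument form uses the default `StaticConfig(0)`; the
+iterator returned by `read_table` also calls this lazily on first use, so an explicit
+call is only needed to customize the config or the panic hook; treating `n_threads` as
+the tokio worker-thread count is an assumption about the Rust side):
+
+    init_runtime()  # default configuration
+
+    # or, with an explicit thread count and a custom panic hook:
+    init_runtime(StaticConfig(4); on_rust_panic = () -> @error("iceberg_rust_ffi panicked"))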
+""" +function init_runtime( + config::StaticConfig=StaticConfig(0); + on_rust_panic::Function=default_panic_hook +) + global _PANIC_HOOK + @lock _INIT_LOCK begin + if _ICEBERG_STARTED[] + return nothing + end + _PANIC_HOOK = on_rust_panic + panic_fn_ptr = @cfunction(iceberg_panic_hook_wrapper, Cint, ()) + fn_ptr = @cfunction(notify_result_iceberg, Cint, (Ptr{Nothing},)) + res = @ccall rust_lib.iceberg_init_runtime(config::StaticConfig, panic_fn_ptr::Ptr{Nothing}, fn_ptr::Ptr{Nothing})::Cint + if res != 0 + throw(IcebergException("Failed to initialize Iceberg runtime.", res)) + end + _ICEBERG_STARTED[] = true end + return nothing +end - println("✅ Library loaded successfully") - - # Function names to resolve - functions = [ - :iceberg_table_open, - :iceberg_table_free, - :iceberg_table_scan, - :iceberg_scan_select_columns, - :iceberg_scan_free, - :iceberg_scan_next_batch, - :iceberg_arrow_batch_free, - :iceberg_error_message - ] - - # Resolve function symbols - for func_name in functions - ptr = dlsym(lib_handle[], String(func_name)) - if ptr == C_NULL - error("Failed to resolve $func_name") +function response_error_to_string(response, operation) + err = string("failed to process ", operation, " with error: ", unsafe_string(response.error_message)) + @ccall rust_lib.iceberg_destroy_cstring(response.error_message::Ptr{Cchar})::Cint + return err +end + +macro throw_on_error(response, operation, exception) + throw_on_error(response, operation, exception) +end + +function throw_on_error(response, operation, exception) + return :( $(esc(:($response.result != 0))) ? throw($exception($response_error_to_string($(esc(response)), $operation))) : $(nothing) ) +end + +function ensure_wait(event::Base.Event) + for _ in 1:20 + try + return wait(event) + catch e + @error "cannot skip this wait point to prevent UB, ignoring exception: $(e)" + end + end + + @error "ignored too many wait exceptions, giving up" + exit(1) +end + +function wait_or_cancel(event::Base.Event, response) + try + return wait(event) + catch e + # Cancel the operation on the Rust side + if response.context != C_NULL + @ccall rust_lib.iceberg_cancel_context(response.context::Ptr{Cvoid})::Cint + end + ensure_wait(event) + if response.error_message != C_NULL + @ccall rust_lib.iceberg_destroy_cstring(response.error_message::Ptr{Cchar})::Cint + end + rethrow(e) + finally + # Always cleanup the context + if response.context != C_NULL + @ccall rust_lib.iceberg_destroy_context(response.context::Ptr{Cvoid})::Cint end - function_pointers[func_name] = ptr end +end + +# Opaque pointer types +const Table = Ptr{Cvoid} +const Scan = Ptr{Cvoid} +const ScanRef = Ref{Scan} +const ArrowStream = Ptr{Cvoid} + +# Arrow batch structure +struct ArrowBatch + data::Ptr{UInt8} + length::Csize_t + rust_ptr::Ptr{Cvoid} +end + +# Response structures for async operations +mutable struct TableResponse + result::Cint + table::Table + error_message::Ptr{Cchar} + context::Ptr{Cvoid} - println("✅ All function symbols resolved successfully") - return true + TableResponse() = new(-1, C_NULL, C_NULL, C_NULL) end +mutable struct Response + result::Cint + error_message::Ptr{Cchar} + context::Ptr{Cvoid} + + Response() = new(-1, C_NULL, C_NULL) +end + +mutable struct ArrowStreamResponse + result::Cint + stream::ArrowStream + error_message::Ptr{Cchar} + context::Ptr{Cvoid} + + ArrowStreamResponse() = new(-1, C_NULL, C_NULL, C_NULL) +end + +mutable struct BatchResponse + result::Cint + batch::Ptr{ArrowBatch} + error_message::Ptr{Cchar} + context::Ptr{Cvoid} + + BatchResponse() = 
new(-1, C_NULL, C_NULL, C_NULL)
+end
+
+# Exception types
+struct IcebergException <: Exception
+    msg::String
+    code::Union{Int,Nothing}
+end
+
+# Operation-specific exception types constructed by `@throw_on_error` above.
+struct TableOpenException <: Exception
+    msg::String
+end
+
+struct BatchException <: Exception
+    msg::String
+end
+
+# High-level functions using the async API pattern from RustyObjectStore.jl
+
 """
-    unload_iceberg_library()
+    table_open(snapshot_path::String)::Table
 
-Unload the Iceberg C API library.
+Open an Iceberg table from the given snapshot path.
 """
-function unload_iceberg_library()
-    if lib_handle[] != C_NULL
-        dlclose(lib_handle[])
-        lib_handle[] = C_NULL
-        empty!(function_pointers)
-        println("✅ Library unloaded")
+function table_open(snapshot_path::String)
+    response = TableResponse()
+    ct = current_task()
+    event = Base.Event()
+    handle = pointer_from_objref(event)
+
+    preserve_task(ct)
+    result = GC.@preserve response event try
+        result = @ccall rust_lib.iceberg_table_open(
+            snapshot_path::Cstring,
+            response::Ref{TableResponse},
+            handle::Ptr{Cvoid}
+        )::Cint
+
+        wait_or_cancel(event, response)
+
+        result
+    finally
+        unpreserve_task(ct)
     end
+
+    @throw_on_error(response, "table_open", TableOpenException)
+
+    return response.table
 end
 
-# Wrapper functions that call the C API
 """
-    iceberg_table_open(table_path::String, metadata_path::String) -> Tuple{IcebergResult, IcebergTable}
+    new_scan(table::Table) -> ScanRef
 
-Open an Iceberg table from the given path and metadata file.
+Create a scan for the given table.
 """
-function iceberg_table_open(table_path::String, metadata_path::String)
-    table_ref = Ref{IcebergTable}(C_NULL)
-    result = ccall(
-        function_pointers[:iceberg_table_open],
-        IcebergResult,
-        (Cstring, Cstring, Ref{IcebergTable}),
-        table_path, metadata_path, table_ref
-    )
-    return result, table_ref[]
+function new_scan(table::Table)
+    scan = @ccall rust_lib.iceberg_new_scan(table::Table)::Ptr{Cvoid}
+    return Ref(scan)
 end
 
 """
-    iceberg_table_free(table::IcebergTable)
+    select_columns!(scan::ScanRef, column_names::Vector{String})::Nothing
 
-Free the memory associated with an Iceberg table.
+Select specific columns for the scan.
 """
-function iceberg_table_free(table::IcebergTable)
-    ccall(
-        function_pointers[:iceberg_table_free],
-        Cvoid,
-        (IcebergTable,),
-        table
-    )
+function select_columns!(scan::ScanRef, column_names::Vector{String})
+    # Convert the String vector to an array of C string pointers; `GC.@preserve` keeps
+    # the strings (and the pointer array) alive across the foreign call.
+    c_strings = [pointer(col) for col in column_names]
+    result = GC.@preserve column_names c_strings @ccall rust_lib.iceberg_select_columns(
+        convert(Ptr{Ptr{Cvoid}}, pointer_from_objref(scan))::Ptr{Ptr{Cvoid}},
+        pointer(c_strings)::Ptr{Cstring},
+        length(column_names)::Csize_t
+    )::Cint
+
+    if result != 0
+        throw(IcebergException("Failed to select columns", result))
+    end
+    return nothing
 end
 
 """
-    iceberg_table_scan(table::IcebergTable) -> Tuple{IcebergResult, IcebergScan}
+    with_data_file_concurrency_limit!(scan::ScanRef, n::UInt)::Cint
 
-Create a scan for the given table.
+Sets the data file concurrency level for the scan.
 """
-function iceberg_table_scan(table::IcebergTable)
-    scan_ref = Ref{IcebergScan}(C_NULL)
-    result = ccall(
-        function_pointers[:iceberg_table_scan],
-        IcebergResult,
-        (IcebergTable, Ref{IcebergScan}),
-        table, scan_ref
-    )
-    return result, scan_ref[]
+function with_data_file_concurrency_limit!(scan::ScanRef, n::UInt)
+    return @ccall rust_lib.iceberg_scan_with_data_file_concurrency(
+        convert(Ptr{Ptr{Cvoid}}, pointer_from_objref(scan))::Ptr{Ptr{Cvoid}},
+        n::Csize_t
+    )::Cint
 end
 
 """
-    iceberg_scan_select_columns(scan::IcebergScan, column_names::Vector{String}) -> IcebergResult
+    with_manifest_entry_concurrency_limit!(scan::ScanRef, n::UInt)::Cint
 
-Select specific columns for the scan.
+Sets the manifest entry concurrency level for the scan.
 """
-function iceberg_scan_select_columns(scan::IcebergScan, column_names::Vector{String})
-    # Convert String vector to Cstring array
-    c_strings = [pointer(col) for col in column_names]
-    result = ccall(
-        function_pointers[:iceberg_scan_select_columns],
-        IcebergResult,
-        (IcebergScan, Ptr{Cstring}, Csize_t),
-        scan, pointer(c_strings), length(column_names)
-    )
-    return result
+function with_manifest_entry_concurrency_limit!(scan::ScanRef, n::UInt)
+    return @ccall rust_lib.iceberg_scan_with_manifest_entry_concurrency(
+        convert(Ptr{Ptr{Cvoid}}, pointer_from_objref(scan))::Ptr{Ptr{Cvoid}},
+        n::Csize_t
+    )::Cint
 end
 
 """
-    iceberg_scan_free(scan::IcebergScan)
+    with_batch_size!(scan::ScanRef, n::UInt)::Cint
 
-Free the memory associated with a scan.
+Sets the batch size for the scan.
 """
-function iceberg_scan_free(scan::IcebergScan)
-    ccall(
-        function_pointers[:iceberg_scan_free],
-        Cvoid,
-        (IcebergScan,),
-        scan
-    )
+function with_batch_size!(scan::ScanRef, n::UInt)
+    return @ccall rust_lib.iceberg_scan_with_batch_size(
+        convert(Ptr{Ptr{Cvoid}}, pointer_from_objref(scan))::Ptr{Ptr{Cvoid}},
+        n::Csize_t
+    )::Cint
 end
 
 """
-    iceberg_scan_next_batch(scan::IcebergScan) -> Tuple{IcebergResult, ArrowBatch}
+    build!(scan::ScanRef)::Cint
 
-Get the next Arrow batch from the scan.
+Build the provided table scan object.
 """
-function iceberg_scan_next_batch(scan::IcebergScan)
-    batch_ref = Ref{Ptr{ArrowBatch}}(C_NULL)
-    result = ccall(
-        function_pointers[:iceberg_scan_next_batch],
-        IcebergResult,
-        (IcebergScan, Ref{Ptr{ArrowBatch}}),
-        scan, batch_ref
-    )
-    return result, batch_ref[]
+function build!(scan::ScanRef)
+    return _build!(convert(Ptr{Ptr{Cvoid}}, pointer_from_objref(scan)))
+end
+
+function _build!(scan::Ptr{Ptr{Cvoid}})
+    return @ccall rust_lib.iceberg_scan_build(scan::Ptr{Ptr{Cvoid}})::Cint
+end
+
+"""
+    scan!(scan::ScanRef) -> ArrowStream
+
+Build the provided table scan and return an `ArrowStream` over its results.
+"""
+function scan!(scan::ScanRef)
+    result = build!(scan)
+    if result != 0
+        throw(IcebergException("Failed to build scan", result))
+    end
+
+    return arrow_stream(scan[])
+end
+
+"""
+    arrow_stream(scan::Scan)::ArrowStream
+
+Initialize an Arrow stream for the scan asynchronously.
+"""
+function arrow_stream(scan::Scan)
+    response = ArrowStreamResponse()
+    ct = current_task()
+    event = Base.Event()
+    handle = pointer_from_objref(event)
+
+    preserve_task(ct)
+    result = GC.@preserve response event try
+        result = @ccall rust_lib.iceberg_arrow_stream(
+            scan::Scan,
+            response::Ref{ArrowStreamResponse},
+            handle::Ptr{Cvoid}
+        )::Cint
+
+        wait_or_cancel(event, response)
+
+        result
+    finally
+        unpreserve_task(ct)
+    end
+
+    @throw_on_error(response, "iceberg_arrow_stream", BatchException)
+
+    return response.stream
+end
+
+"""
+    next_batch(stream::ArrowStream)::Ptr{ArrowBatch}
+
+Wait for the next batch from the initialized stream asynchronously and return it directly.
+Returns C_NULL if end of stream is reached.
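+
+A consumption sketch (this mirrors what `Base.iterate` does below; `stream` is the
+`ArrowStream` returned by `scan!`):
+
+    ptr = next_batch(stream)
+    while ptr != C_NULL
+        batch = unsafe_load(ptr)  # ArrowBatch with `data`/`length` fields
+        table = Arrow.Table(unsafe_wrap(Array, batch.data, batch.length))
+        # ... use `table` before the batch is freed ...
+        free_batch(ptr)  # the underlying buffer is Rust-owned
+        ptr = next_batch(stream)
+    end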
+""" +function next_batch(stream::ArrowStream) + response = BatchResponse() + ct = current_task() + event = Base.Event() + handle = pointer_from_objref(event) + + preserve_task(ct) + result = GC.@preserve response event try + result = @ccall rust_lib.iceberg_next_batch( + stream::ArrowStream, + response::Ref{BatchResponse}, + handle::Ptr{Cvoid} + )::Cint + + wait_or_cancel(event, response) + + result + finally + unpreserve_task(ct) + end + + @throw_on_error(response, "iceberg_next_batch", BatchException) + + # Return the batch pointer directly + return response.batch +end + +""" + free_table(table::IcebergTable) + +Free the memory associated with an Iceberg table. +""" +function free_table(table::Table) + @ccall rust_lib.iceberg_table_free(table::Table)::Cvoid end """ - iceberg_arrow_batch_free(batch::Ptr{ArrowBatch}) + free_batch(batch::Ptr{ArrowBatch}) Free the memory associated with an Arrow batch. """ -function iceberg_arrow_batch_free(batch::Ptr{ArrowBatch}) - ccall( - function_pointers[:iceberg_arrow_batch_free], - Cvoid, - (Ptr{ArrowBatch},), - batch - ) +function free_batch(batch::Ptr{ArrowBatch}) + @ccall rust_lib.iceberg_arrow_batch_free(batch::Ptr{ArrowBatch})::Cvoid end """ - iceberg_error_message() -> String + free_stream(stream::ArrowStream) -Get the last error message from the C API. +Free the memory associated with an Arrow stream. """ -function iceberg_error_message() - msg_ptr = ccall( - function_pointers[:iceberg_error_message], - Cstring, - () - ) - return msg_ptr == C_NULL ? "" : unsafe_string(msg_ptr) +function free_stream(stream::ArrowStream) + @ccall rust_lib.iceberg_arrow_stream_free(stream::ArrowStream)::Cvoid end +""" + free_scan!(scan::IcebergScanRef) + +Free the memory associated with a scan. +""" +function free_scan!(scan::ScanRef) + @ccall rust_lib.iceberg_scan_free( + convert(Ptr{Ptr{Cvoid}}, pointer_from_objref(scan))::Ptr{Ptr{Cvoid}} + )::Cvoid +end + + # Iterator type for Arrow batches -struct IcebergTableIterator - table_path::String - metadata_path::String +struct TableIterator + snapshot_path::String columns::Vector{String} + batch_size::Union{UInt,Nothing} + data_file_concurrency_limit::Union{UInt,Nothing} + manifest_entry_concurrency_limit::Union{UInt,Nothing} end # Iterator state -mutable struct IcebergTableIteratorState - table::IcebergTable - scan::IcebergScan +mutable struct TableIteratorState + table::Table + scan::Ref{Ptr{Cvoid}} + stream::ArrowStream is_open::Bool + batch_ptr::Ptr{ArrowBatch} + + function TableIteratorState(table, scan, stream, is_open) + state = new(table, scan, stream, is_open, C_NULL) + # Ensure cleanup happens even if iterator is abandoned + finalizer(_cleanup_iterator_state!, state) + return state + end +end + +function _cleanup_iterator_state!(state::TableIteratorState) + if state.is_open + try + # Only free batch if we know we have one pending to prevent double-free + state.batch_ptr != C_NULL && free_batch(state.batch_ptr) + @assert state.stream != C_NULL + @assert state.scan != C_NULL + @assert state.table != C_NULL + free_stream(state.stream) + free_scan!(state.scan) + free_table(state.table) + catch e + # Log but don't throw in finalizer + # TODO: should we keep this log or throw? + @error "Error in IcebergTableIteratorState finalizer: $(e)" + finally + state.is_open = false + state.batch_ptr = C_NULL + state.stream = C_NULL + state.scan = C_NULL + state.table = C_NULL + end + end end """ Base.iterate(iter::IcebergTableIterator, state=nothing) -Iterate over Arrow.Table objects from the Iceberg table. 
+Iterate over `Arrow.Table` objects from the Iceberg table. """ -function Base.iterate(iter::IcebergTableIterator, state=nothing) - if state === nothing - # First iteration - open table and scan - if lib_handle[] == C_NULL - load_iceberg_library() - end +function Base.iterate(iter::TableIterator, state=nothing) + local arrow_table + local should_cleanup_resources = false + + try + if isnothing(state) + # First iteration - ensure runtime is initialized + if !iceberg_started() + init_runtime() + end - # Open table - result, table = iceberg_table_open(iter.table_path, iter.metadata_path) - if result != ICEBERG_OK - error("Failed to open table: $(iceberg_error_message())") - end + table, scan, stream = nothing, nothing, nothing - # Create scan - result, scan = iceberg_table_scan(table) - if result != ICEBERG_OK - # TODO: Is everything exception free? what if we get an exception in Julia - # between FFI calls to Rust, or between iterations? How do we deallocate objects then? - iceberg_table_free(table) - error("Failed to create scan: $(iceberg_error_message())") - end + try + # Open table + table = table_open(iter.snapshot_path) - # Select columns if specified - if !isempty(iter.columns) - result = iceberg_scan_select_columns(scan, iter.columns) - if result != ICEBERG_OK - iceberg_scan_free(scan) - iceberg_table_free(table) - error("Failed to select columns: $(iceberg_error_message())") - end - end + # Create scan + scan = new_scan(table) - state = IcebergTableIteratorState(table, scan, true) - end + # Select columns if specified + if !isempty(iter.columns) + select_columns!(scan, iter.columns) + end - # Get next batch - result, batch_ptr = iceberg_scan_next_batch(state.scan) - - if result == ICEBERG_END_OF_STREAM - # End of stream - cleanup and return nothing - iceberg_scan_free(state.scan) - iceberg_table_free(state.table) - return nothing - elseif result != ICEBERG_OK - # Error - cleanup and throw - iceberg_scan_free(state.scan) - iceberg_table_free(state.table) - error("Failed to get next batch: $(iceberg_error_message())") - end + if !isnothing(iter.data_file_concurrency_limit) + with_data_file_concurrency_limit!(scan, iter.data_file_concurrency_limit) + end - if batch_ptr == C_NULL - error("Received NULL batch") - end + if !isnothing(iter.manifest_entry_concurrency_limit) + with_manifest_entry_concurrency_limit!(scan, iter.manifest_entry_concurrency_limit) + end - # Convert ArrowBatch pointer to ArrowBatch struct - batch = unsafe_load(batch_ptr) + if !isnothing(iter.batch_size) + with_batch_size!(scan, iter.batch_size) + end - # Create IOBuffer from the serialized Arrow data - io = IOBuffer(unsafe_wrap(Array, batch.data, batch.length)) + stream = scan!(scan) + state = TableIteratorState(table, scan, stream, true) + catch e + !isnothing(stream) && free_stream(stream) + !isnothing(scan) && free_scan!(scan) + !isnothing(table) && free_table(table) + rethrow(e) + end + else + # Clean up the batch from the previous iteration. 
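+            # Freeing it invalidates any Arrow data still backed by the Rust-owned
+            # buffer, so consumers that keep batches across iterations should copy them
+            # first (the example script does this via `copy(DataFrame(arrow_table))`).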
+            if state.batch_ptr != C_NULL
+                free_batch(state.batch_ptr)
+                state.batch_ptr = C_NULL
+            end
+        end
 
-    # Read Arrow data
-    arrow_table = Arrow.Table(io)
+        # Wait for next batch asynchronously
+        state.batch_ptr = next_batch(state.stream)
 
-    # Free the batch
-    iceberg_arrow_batch_free(batch_ptr)
+        if state.batch_ptr == C_NULL
+            # End of stream - mark for cleanup and return nothing
+            should_cleanup_resources = true
+            return nothing
+        end
 
-    return arrow_table, state
+        # Convert ArrowBatch pointer to ArrowBatch struct
+        batch = unsafe_load(state.batch_ptr)
+
+        # Read Arrow data
+        arrow_table = Arrow.Table(unsafe_wrap(Array, batch.data, batch.length))
+
+        return arrow_table, state
+
+    catch e
+        # On exception, mark resources for cleanup before rethrowing
+        should_cleanup_resources = true
+        rethrow(e)
+    finally
+        # Clean up scan and table resources only if needed (end of stream or exception)
+        if should_cleanup_resources && state !== nothing && state.is_open
+            state.batch_ptr != C_NULL && free_batch(state.batch_ptr)
+            free_stream(state.stream)
+            free_scan!(state.scan)
+            free_table(state.table)
+            state.stream = C_NULL
+            state.scan = C_NULL
+            state.table = C_NULL
+            # Mark as closed to prevent finalizer from double-freeing
+            state.is_open = false
+        end
+    end
 end
 
 """
@@ -333,28 +600,35 @@ end
 Return the element type of the iterator.
 """
-Base.eltype(::Type{IcebergTableIterator}) = Arrow.Table
+Base.eltype(::Type{TableIterator}) = Arrow.Table
 
 """
-    Base.IteratorSize(::Type{IcebergTableIterator})
+    Base.IteratorSize(::Type{TableIterator})
 
 Return the size trait of the iterator.
 """
-Base.IteratorSize(::Type{IcebergTableIterator}) = Base.SizeUnknown()
+Base.IteratorSize(::Type{TableIterator}) = Base.SizeUnknown()
 
 # High-level Julia interface
 
 """
-    read_iceberg_table(table_path::String, metadata_path::String; columns::Vector{String}=String[]) -> IcebergTableIterator
+    read_table(snapshot_path::String; columns::Vector{String}=String[], batch_size=nothing, data_file_concurrency_limit=nothing, manifest_entry_concurrency_limit=nothing) -> TableIterator
 
 Read an Iceberg table and return an iterator over Arrow.Table objects.
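+
+All keyword arguments are optional. A usage sketch (the snapshot path and column names
+are illustrative; `DataFrame` requires DataFrames.jl):
+
+    iter = read_table("your-snapshot-path"; columns=["col1", "col2"], batch_size=UInt(1024))
+    for arrow_table in iter
+        df = copy(DataFrame(arrow_table))  # copy if the data must outlive this iteration
+        # ... process df ...
+    end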
""" -function read_iceberg_table(table_path::String, metadata_path::String; columns::Vector{String}=String[]) - return IcebergTableIterator(table_path, metadata_path, columns) -end - -# Cleanup on module unload -function __init__() - atexit(unload_iceberg_library) +function read_table( + snapshot_path::String; + columns::Vector{String}=String[], + batch_size::Union{UInt, Nothing}=nothing, + data_file_concurrency_limit::Union{UInt, Nothing}=nothing, + manifest_entry_concurrency_limit::Union{UInt, Nothing}=nothing +) + return TableIterator( + snapshot_path, + columns, + batch_size, + data_file_concurrency_limit, + manifest_entry_concurrency_limit + ) end -end # module Iceberg +end # module RustyIceberg diff --git a/test/runtests.jl b/test/runtests.jl index b9c2b55..8703f75 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,210 +1,184 @@ using Test using RustyIceberg -using RustyIceberg: ICEBERG_OK using DataFrames using Arrow @testset "RustyIceberg.jl" begin - @testset "Library Loading" begin - # Test library loading - @test load_iceberg_library() +@testset "Runtime Initialization" begin + # Test runtime initialization - this should work + @test_nowarn init_runtime() - # Test that we can unload and reload - unload_iceberg_library() - @test load_iceberg_library() - end - - @testset "Low-level API" begin - # Test table operations - table_path = "s3://vustef-dev/tpch-sf0.1-no-part/nation" - metadata_path = "metadata/00001-894cf98a-a055-47ba-a701-327455060d32.metadata.json" - - println("Testing table operations with:") - println(" Table path: $table_path") - println(" Metadata path: $metadata_path") - - # Test table open - result, table = iceberg_table_open(table_path, metadata_path) - if result == ICEBERG_OK - @test table != C_NULL - println("✅ Table opened successfully") - - # Test scan creation - result, scan = iceberg_table_scan(table) - if result == ICEBERG_OK - @test scan != C_NULL - println("✅ Scan created successfully") - - # Test batch reading - batch_count = 0 - total_bytes = 0 - - while true - result, batch_ptr = iceberg_scan_next_batch(scan) - - if result == ICEBERG_END_OF_STREAM - println("✅ Reached end of stream") - break - end - - if result != ICEBERG_OK - println("❌ Failed to get next batch: $(iceberg_error_message())") - break - end - - if batch_ptr == C_NULL - println("❌ Received NULL batch") - break - end - - batch_count += 1 - batch = unsafe_load(batch_ptr) - total_bytes += batch.length - - println("📦 Batch $batch_count:") - println(" - Serialized size: $(batch.length) bytes") - println(" - Data pointer: $(batch.data)") - - # Print first few bytes as hex for verification - print(" - First few bytes: ") - print_len = min(batch.length, 8) - for i in 1:print_len - print(string(unsafe_load(batch.data + i - 1), base=16, pad=2), " ") - end - println() - - # Test Arrow data reading - io = IOBuffer(unsafe_wrap(Array, batch.data, batch.length)) - arrow_table = Arrow.Table(io) - df = DataFrame(arrow_table) - println(" → Arrow data: $(size(df)) rows × $(length(names(df))) columns") - println(" → Columns: $(names(df))") - - # Free the batch - iceberg_arrow_batch_free(batch_ptr) - end + # Test that we can initialize multiple times safely + @test_nowarn init_runtime() - println("📊 Summary:") - println(" - Total batches: $batch_count") - println(" - Total bytes processed: $total_bytes") - - @test batch_count > 0 - @test total_bytes > 0 + println("✅ Runtime initialization successful") +end - # Cleanup scan - iceberg_scan_free(scan) - else - println("❌ Failed to create scan: 
$(iceberg_error_message())") - @test false - end +@testset "High-level API" begin + # Test with the actual customer table that we know works + snapshot_path = "s3://warehouse/tpch.sf01/customer/metadata/00001-76f6e7e4-b34f-492f-b6a1-cc9f8c8f4975.metadata.json" - # Cleanup table - iceberg_table_free(table) - else - println("❌ Failed to open table: $(iceberg_error_message())") - println("This may be expected due to S3 permissions or missing data") - @test_broken false # Mark as broken since this might fail due to external dependencies - end - end + println("Testing high-level API...") + println(" Snapshot path: $snapshot_path") - @testset "High-level API" begin - table_path = "s3://vustef-dev/tpch-sf0.1-no-part/nation" - metadata_path = "metadata/00001-894cf98a-a055-47ba-a701-327455060d32.metadata.json" + # Test creating table iterator + table_iterator = read_table(snapshot_path) + @test table_iterator isa TableIterator + println("✅ Table iterator created successfully") - println("Testing high-level API...") + # Test iteration over Arrow.Table objects + arrow_tables = Arrow.Table[] + batch_count = 0 + total_rows = 0 - try - # Test reading entire table - now returns an iterator - table_iterator = read_iceberg_table(table_path, metadata_path) - @test table_iterator isa IcebergTableIterator - println("✅ Table iterator created successfully") + for arrow_table in table_iterator + batch_count += 1 + push!(arrow_tables, arrow_table) - # Test iteration over Arrow.Table objects - arrow_tables = Arrow.Table[] - batch_count = 0 + # Convert to DataFrame for testing + df = DataFrame(arrow_table) + @test !isempty(df) + total_rows += nrow(df) - for arrow_table in table_iterator - batch_count += 1 - push!(arrow_tables, arrow_table) + # Only print details for first few batches to avoid spam + if batch_count <= 3 + println("📦 Batch $batch_count: $(size(df)) rows × $(length(names(df))) columns") + println(" → Columns: $(names(df))") + end - # Convert to DataFrame for testing - df = DataFrame(arrow_table) - @test !isempty(df) - println("📦 Batch $batch_count: $(size(df)) rows × $(length(names(df))) columns") - end + # Stop after a few batches for testing to avoid long test times + if batch_count >= 5 + println(" ... 
stopping after $batch_count batches for testing") + break + end + end - @test batch_count > 0 - @test !isempty(arrow_tables) - println("✅ High-level API iteration test successful") - println(" - Total batches: $batch_count") - println(" - Total Arrow tables: $(length(arrow_tables))") - - # Test reading with column selection - if !isempty(arrow_tables) - # Get column names from first batch - first_df = DataFrame(arrow_tables[1]) - if !isempty(names(first_df)) - selected_columns = [names(first_df)[1]] # Select first column - selected_iterator = read_iceberg_table(table_path, metadata_path, columns=selected_columns) - @test selected_iterator isa IcebergTableIterator - - selected_arrow_tables = Arrow.Table[] - for arrow_table in selected_iterator - push!(selected_arrow_tables, arrow_table) - end - - @test !isempty(selected_arrow_tables) - - # Check that selected columns match - if !isempty(selected_arrow_tables) - selected_df = DataFrame(selected_arrow_tables[1]) - @test names(selected_df) == selected_columns - println("✅ Column selection test successful") - println(" - Selected columns: $(names(selected_df))") - end + @test batch_count > 0 + @test total_rows > 0 + @test !isempty(arrow_tables) + println("✅ High-level API iteration test successful") + println(" - Total batches processed: $batch_count") + println(" - Total rows processed: $total_rows") + println(" - Total Arrow tables: $(length(arrow_tables))") + + # Test reading with column selection + if !isempty(arrow_tables) + # Get column names from first batch + first_df = DataFrame(arrow_tables[1]) + if !isempty(names(first_df)) + # Select first two columns for testing + selected_columns = names(first_df)[1:min(2, length(names(first_df)))] + selected_iterator = read_table( + snapshot_path; columns=selected_columns, batch_size=UInt(8) + ) + @test selected_iterator isa TableIterator + + selected_arrow_tables = Arrow.Table[] + selected_batch_count = 0 + for arrow_table in selected_iterator + @test length(arrow_table) <= 8 + selected_batch_count += 1 + push!(selected_arrow_tables, arrow_table) + + # Only process first batch for column selection test + if selected_batch_count >= 1 + break end end - catch e - println("❌ High-level API test failed: $e") - println("This may be expected due to S3 permissions or missing data") - @test_broken false # Mark as broken since this might fail due to external dependencies + @test !isempty(selected_arrow_tables) + + # Check that selected columns match + if !isempty(selected_arrow_tables) + selected_df = DataFrame(selected_arrow_tables[1]) + @test names(selected_df) == selected_columns + println("✅ Column selection test successful") + println(" - Selected columns: $(names(selected_df))") + end end end +end - @testset "Iterator Properties" begin - # Test iterator type properties - table_path = "s3://vustef-dev/tpch-sf0.1-no-part/nation" - metadata_path = "metadata/00001-894cf98a-a055-47ba-a701-327455060d32.metadata.json" - - table_iterator = read_iceberg_table(table_path, metadata_path) +@testset "Iterator Properties" begin + # Test iterator type properties + snapshot_path = "s3://warehouse/tpch.sf01/customer/metadata/00001-76f6e7e4-b34f-492f-b6a1-cc9f8c8f4975.metadata.json" - # Test eltype - @test Base.eltype(table_iterator) == Arrow.Table + table_iterator = read_table(snapshot_path) - # Test IteratorSize - @test Base.IteratorSize(table_iterator) == Base.SizeUnknown() + # Test eltype + @test Base.eltype(table_iterator) == Arrow.Table - println("✅ Iterator properties test successful") - end + # Test 
IteratorSize + @test Base.IteratorSize(table_iterator) == Base.SizeUnknown() - @testset "Error Handling" begin - # Test with invalid paths - result, table = iceberg_table_open("invalid/path", "invalid/metadata.json") - @test result != ICEBERG_OK - @test table == C_NULL + println("✅ Iterator properties test successful") +end - # Test error message - error_msg = iceberg_error_message() - @test !isempty(error_msg) - println("✅ Error handling test: $error_msg") +@testset "Error Handling" begin + # Test with invalid paths - this should throw an exception in our async API + try + invalid_iterator = read_table("invalid/path", "invalid/metadata.json") + # Try to iterate - this should fail + for arrow_table in invalid_iterator + @test false # Should not reach here + break + end + catch e + @test e isa Exception + println("✅ Error handling test successful: caught expected exception") end +end - @testset "Cleanup" begin - # Test library unloading - unload_iceberg_library() - println("✅ Library cleanup successful") +@testset "Read and verify nations table" begin + # Test reading the nations table and verify contents + nations_snapshot_path = "s3://warehouse/tpch.sf01/nation/metadata/00001-44f668fe-3688-49d5-851f-36e75d143321.metadata.json" + + println("Testing reading nations table...") + nations_iterator = read_table(nations_snapshot_path; batch_size=UInt(5)) + @test nations_iterator isa TableIterator + + rows = Tuple[] + for arrow_table in nations_iterator + df = DataFrame(arrow_table) + expected_columns = ["n_nationkey", "n_name", "n_regionkey", "n_comment"] + @test names(df) == expected_columns + # Collect rows as tuples for easier verification + for row in eachrow(df) + push!(rows, Tuple(row)) + end end + + @test rows == Tuple[ + (0, "ALGERIA", 0, "furiously regular requests. platelets affix furious"), + (1, "ARGENTINA", 1, "instructions wake quickly. final deposits haggle. final, silent theodolites "), + (2, "BRAZIL", 1, "asymptotes use fluffily quickly bold instructions. slyly bold dependencies sleep carefully pending accounts"), + (3, "CANADA", 1, "ss deposits wake across the pending foxes. packages after the carefully bold requests integrate caref"), + (4, "EGYPT", 4, "usly ironic, pending foxes. even, special instructions nag. sly, final foxes detect slyly fluffily "), + (5, "ETHIOPIA", 0, "regular requests sleep carefull"), + (6, "FRANCE", 3, "oggedly. regular packages solve across"), + (7, "GERMANY", 3, "ong the regular requests: blithely silent pinto beans hagg"), + (8, "INDIA", 2, "uriously unusual deposits about the slyly final pinto beans could"), + (9, "INDONESIA", 2, "d deposits sleep quickly according to the dogged, regular dolphins. special excuses haggle furiously special reque"), + (10, "IRAN", 4, "furiously idle platelets nag. express asymptotes s"), + (11, "IRAQ", 4, "pendencies; slyly express foxes integrate carefully across the reg"), + (12, "JAPAN", 2, " quickly final packages. furiously i"), + (13, "JORDAN", 4, "the slyly regular ideas. silent Tiresias affix slyly fu"), + (14, "KENYA", 0, "lyly special foxes. slyly regular deposits sleep carefully. carefully permanent accounts slee"), + (15, "MOROCCO", 0, "ct blithely: blithely express accounts nag carefully. silent packages haggle carefully abo"), + (16, "MOZAMBIQUE", 0, " beans after the carefully regular accounts r"), + (17, "PERU", 1, "ly final foxes. blithely ironic accounts haggle. regular foxes about the regular deposits are furiously ir"), + (18, "CHINA", 2, "ckly special packages cajole slyly. 
unusual, unusual theodolites mold furiously. slyly sile"), + (19, "ROMANIA", 3, "sly blithe requests. thinly bold deposits above the blithely regular accounts nag special, final requests. care"), + (20, "SAUDI ARABIA", 4, "se slyly across the blithely regular deposits. deposits use carefully regular "), + (21, "VIETNAM", 2, "lly across the quickly even pinto beans. caref"), + (22, "RUSSIA", 3, "uctions. furiously unusual instructions sleep furiously ironic packages. slyly "), + (23, "UNITED KINGDOM", 3, "carefully pending courts sleep above the ironic, regular theo"), + (24, "UNITED STATES", 1, "ly ironic requests along the slyly bold ideas hang after the blithely special notornis; blithely even accounts") + ] + + println("✅ Nations table read and verified successfully") end +end # End of testset + println("\n🎉 All tests completed!")