Skip to content

Commit

Permalink
Fix: PolarisFileSystem uses the dataset name instead of the dataset…
Browse files Browse the repository at this point in the history
… slug, causing a 404 (#146)

* use dataset slug name in the path

* fix PosixPath JSON for printer.pretty error

* update zarr tutorial

* lint

* move sluggify to polarisfs

* update zarr tutorial

* lint

---------

Co-authored-by: zhu0619 <lu@valencelab.com>
  • Loading branch information
zhu0619 and zhu0619 authored Jul 21, 2024
1 parent 5cf6092 commit 4888a9b
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 35 deletions.
197 changes: 165 additions & 32 deletions docs/tutorials/dataset_zarr.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,16 @@
},
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/mnt/ps/home/CORP/lu.zhu/miniconda3/envs/po_datasets/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import zarr\n",
"import platformdirs\n",
Expand Down Expand Up @@ -117,6 +126,28 @@
{
"cell_type": "code",
"execution_count": 4,
"id": "05712cbd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<zarr.hierarchy.Group '/'>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Consolidate the dataset for efficient loading from the cloud bucket\n",
"zarr.consolidate_metadata(base_path)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "15df9619-e659-4558-9c69-416a186c1f3a",
"metadata": {
"editable": true,
Expand All @@ -127,15 +158,15 @@
},
"outputs": [],
"source": [
"# For performance reasons, Polaris expects all data related to a column to be saved in a single Zarr array. \n",
"# To index a specific element in that array, the pointer path can have a suffix to specify the index. \n",
"# For performance reasons, Polaris expects all data related to a column to be saved in a single Zarr array.\n",
"# To index a specific element in that array, the pointer path can have a suffix to specify the index.\n",
"train_path = f\"{inp_col_name}#0\"\n",
"test_path = f\"{inp_col_name}#1\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "16543db7",
"metadata": {
"editable": true,
Expand All @@ -158,7 +189,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "a257b09d",
"metadata": {
"editable": true,
Expand Down Expand Up @@ -197,7 +228,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"id": "19a39fab",
"metadata": {
"editable": true,
Expand All @@ -213,7 +244,7 @@
"'images#0'"
]
},
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -238,7 +269,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"id": "8189f312",
"metadata": {
"editable": true,
Expand All @@ -254,7 +285,7 @@
"(64, 64, 3)"
]
},
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -279,7 +310,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"id": "6f1c8766",
"metadata": {
"editable": true,
Expand All @@ -303,7 +334,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"id": "9a0c635c",
"metadata": {
"editable": true,
Expand Down Expand Up @@ -377,7 +408,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"id": "622287ed-16ad-484e-a0d7-ca6cf648ed5d",
"metadata": {},
"outputs": [],
Expand All @@ -388,7 +419,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"id": "12a06b89",
"metadata": {},
"outputs": [],
Expand All @@ -409,7 +440,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"id": "3c7c11ac",
"metadata": {},
"outputs": [
Expand All @@ -419,15 +450,15 @@
"'images#0'"
]
},
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from polaris.dataset import create_dataset_from_file\n",
"\n",
"# Because Polaris might restructure the Zarr archive, \n",
"# Because Polaris might restructure the Zarr archive,\n",
"# we need to specify a location to save the Zarr file to.\n",
"dataset = create_dataset_from_file(path, zarr_root_path=dm.fs.join(SAVE_DIR, \"zarr\", \"processed.zarr\"))\n",
"\n",
Expand All @@ -437,7 +468,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"id": "f8d1b42d",
"metadata": {},
"outputs": [
Expand All @@ -447,7 +478,7 @@
"(64, 64, 3)"
]
},
"execution_count": 14,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -468,30 +499,47 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"id": "1cd94077",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-07-21 13:11:49.273\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpolaris._mixins\u001b[0m:\u001b[36mmd5sum\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mComputing the checksum. This can be slow for large datasets.\u001b[0m\n",
"Finding all files in the Zarr archive: 60%|██████ | 79/131 [00:00<00:00, 375.21it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Finding all files in the Zarr archive: 100%|██████████| 131/131 [00:00<00:00, 396.17it/s]\n",
"\u001b[32m2024-07-21 13:11:49.616\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpolaris.dataset._dataset\u001b[0m:\u001b[36mto_json\u001b[0m:\u001b[36m431\u001b[0m - \u001b[1mCopying Zarr archive to /mnt/ps/home/CORP/lu.zhu/.cache/polaris-tutorials/002/json/data.zarr. This may take a while.\u001b[0m\n"
]
}
],
"source": [
"savedir = dm.fs.join(SAVE_DIR, \"json\")\n",
"json_path = dataset.to_json(savedir)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"id": "c5147684",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/home/cas/.cache/polaris-tutorials/002/zarr',\n",
" '/home/cas/.cache/polaris-tutorials/002/json',\n",
" '/home/cas/.cache/polaris-tutorials/002/data.zarr']"
"['/mnt/ps/home/CORP/lu.zhu/.cache/polaris-tutorials/002/json',\n",
" '/mnt/ps/home/CORP/lu.zhu/.cache/polaris-tutorials/002/data.zarr',\n",
" '/mnt/ps/home/CORP/lu.zhu/.cache/polaris-tutorials/002/zarr']"
]
},
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -519,14 +567,29 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"id": "33c25a55",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-07-21 13:12:16.485\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpolaris._mixins\u001b[0m:\u001b[36mmd5sum\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mComputing the checksum. This can be slow for large datasets.\u001b[0m\n",
"Finding all files in the Zarr archive: 17%|█▋ | 22/131 [00:00<00:00, 211.62it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Finding all files in the Zarr archive: 100%|██████████| 131/131 [00:00<00:00, 246.81it/s]\n"
]
},
{
"data": {
"text/html": [
"<table border=\"1\"><tr><th>name</th><td>None</td></tr><tr><th>description</th><td></td></tr><tr><th>tags</th><td></td></tr><tr><th>user_attributes</th><td></td></tr><tr><th>owner</th><td>None</td></tr><tr><th>default_adapters</th><td></td></tr><tr><th>zarr_root_path</th><td>/home/cas/.cache/polaris-tutorials/002/json/data.zarr</td></tr><tr><th>md5sum</th><td>5488b4909fd67d3208624288e720e1b8</td></tr><tr><th>readme</th><td></td></tr><tr><th>annotations</th><td><table border=\"1\"><tr><th>images</th><td><table border=\"1\"><tr><th>is_pointer</th><td>True</td></tr><tr><th>modality</th><td>UNKNOWN</td></tr><tr><th>description</th><td>None</td></tr><tr><th>user_attributes</th><td></td></tr><tr><th>dtype</th><td>object</td></tr></table></td></tr></table></td></tr><tr><th>source</th><td>None</td></tr><tr><th>license</th><td>None</td></tr><tr><th>curation_reference</th><td>None</td></tr><tr><th>cache_dir</th><td>/home/cas/.cache/polaris/datasets/None/5488b4909fd67d3208624288e720e1b8</td></tr><tr><th>artifact_id</th><td>None</td></tr><tr><th>n_rows</th><td>1000</td></tr><tr><th>n_columns</th><td>1</td></tr></table>"
"<table border=\"1\"><tr><th>name</th><td>None</td></tr><tr><th>description</th><td></td></tr><tr><th>tags</th><td></td></tr><tr><th>user_attributes</th><td></td></tr><tr><th>owner</th><td>None</td></tr><tr><th>polaris_version</th><td>dev</td></tr><tr><th>default_adapters</th><td></td></tr><tr><th>zarr_root_path</th><td>/mnt/ps/home/CORP/lu.zhu/.cache/polaris-tutorials/002/zarr/processed.zarr</td></tr><tr><th>readme</th><td></td></tr><tr><th>annotations</th><td><table border=\"1\"><tr><th>images</th><td><table border=\"1\"><tr><th>is_pointer</th><td>True</td></tr><tr><th>modality</th><td>UNKNOWN</td></tr><tr><th>description</th><td>None</td></tr><tr><th>user_attributes</th><td></td></tr><tr><th>dtype</th><td>object</td></tr></table></td></tr></table></td></tr><tr><th>source</th><td>None</td></tr><tr><th>license</th><td>None</td></tr><tr><th>curation_reference</th><td>None</td></tr><tr><th>cache_dir</th><td>/mnt/ps/home/CORP/lu.zhu/.cache/polaris/datasets/97d642a2-001c-40aa-ac98-0e24353005d2</td></tr><tr><th>md5sum</th><td>b7c52acfbda1f9bba47ae218e9c4717f</td></tr><tr><th>artifact_id</th><td>None</td></tr><tr><th>n_rows</th><td>1000</td></tr><tr><th>n_columns</th><td>1</td></tr></table>"
],
"text/plain": [
"{\n",
Expand All @@ -535,9 +598,9 @@
" \"tags\": [],\n",
" \"user_attributes\": {},\n",
" \"owner\": null,\n",
" \"polaris_version\": \"dev\",\n",
" \"default_adapters\": {},\n",
" \"zarr_root_path\": \"/home/cas/.cache/polaris-tutorials/002/json/data.zarr\",\n",
" \"md5sum\": \"5488b4909fd67d3208624288e720e1b8\",\n",
" \"zarr_root_path\": \"/mnt/ps/home/CORP/lu.zhu/.cache/polaris-tutorials/002/zarr/processed.zarr\",\n",
" \"readme\": \"\",\n",
" \"annotations\": {\n",
" \"images\": {\n",
Expand All @@ -551,14 +614,15 @@
" \"source\": null,\n",
" \"license\": null,\n",
" \"curation_reference\": null,\n",
" \"cache_dir\": \"/home/cas/.cache/polaris/datasets/None/5488b4909fd67d3208624288e720e1b8\",\n",
" \"cache_dir\": \"/mnt/ps/home/CORP/lu.zhu/.cache/polaris/datasets/97d642a2-001c-40aa-ac98-0e24353005d2\",\n",
" \"md5sum\": \"b7c52acfbda1f9bba47ae218e9c4717f\",\n",
" \"artifact_id\": null,\n",
" \"n_rows\": 1000,\n",
" \"n_columns\": 1\n",
"}"
]
},
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -567,6 +631,75 @@
"Dataset.from_json(json_path)"
]
},
{
"cell_type": "markdown",
"id": "0503a3a7",
"metadata": {},
"source": [
"### Upload the Zarr dataset to the Hub"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "cf0f7e69",
"metadata": {},
"outputs": [],
"source": [
"# Define the zarr dataset metadata before uploading\n",
"dataset.name = \"tutorial_zarr\"\n",
"dataset.license = \"CC-BY-4.0\"\n",
"dataset.source = \"https://github.com/polaris-hub/polaris\""
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "5251b027",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"⠙ Uploading dataset... "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"⠦ Uploading dataset... "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-07-21 13:19:12.188\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mupload_dataset\u001b[0m:\u001b[36m602\u001b[0m - \u001b[1mCopying Zarr archive to the Hub. This may take a while.\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ SUCCESS: \u001b[1mYour dataset has been successfully uploaded to the Hub. View it here: https://polarishub.io/datasets/polaris/tutorial_zarr\u001b[0m\n",
" \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/mnt/ps/home/CORP/lu.zhu/miniconda3/envs/po_datasets/lib/python3.12/site-packages/yaspin/core.py:228: UserWarning: color, on_color and attrs are not supported when running in jupyter\n",
" self._color = self._set_color(value) if value else value\n"
]
}
],
"source": [
"dataset.upload_to_hub(owner=\"polaris\")"
]
},
{
"cell_type": "markdown",
"id": "72767ef2",
Expand Down Expand Up @@ -598,7 +731,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
"version": "3.12.4"
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit 4888a9b

Please sign in to comment.