Skip to content

Commit

Permalink
add catalog enhancements (#56)
Browse files Browse the repository at this point in the history
  • Loading branch information
andersy005 authored Oct 21, 2024
1 parent a982034 commit 2a43792
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 44 deletions.
18 changes: 10 additions & 8 deletions docs/catalog.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,16 @@ The `catalog.yaml` file contains additional information about the dataset. The f

#### Object Properties for `stores`

| Property | Type | Description |
| ------------ | --------------------------- | ------------------------------- |
| `id` | String | ID of the store |
| `name` | String (optional) | Name of the store |
| `url` | String | URL of the store |
| `rechunking` | Array of Objects (optional) | Rechunking information |
| `public` | Boolean (optional) | Whether the store is public |
| `geospatial` | Boolean (optional) | Whether the store is geospatial |
| Property | Type | Description |
| -------------------- | --------------------------- | ------------------------------- |
| `id` | String | ID of the store |
| `name` | String (optional) | Name of the store |
| `url` | String | URL of the store |
| `rechunking` | Array of Objects (optional) | Rechunking information |
| `public` | Boolean (optional) | Whether the store is public |
| `geospatial` | Boolean (optional) | Whether the store is geospatial |
| `xarray_open_kwargs` | Object (optional) | Keyword arguments used to open the store with xarray; currently `engine` (`zarr` or `kerchunk`), defaulting to `zarr` |
| `last_updated` | String (optional) | Last updated timestamp |

### Example YAML Files

Expand Down
67 changes: 51 additions & 16 deletions leap_data_management_utils/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import re
import traceback
import typing

import cf_xarray # noqa: F401
import pydantic
Expand Down Expand Up @@ -32,13 +33,24 @@ def gs_to_https(gs_url: str) -> str:
return gs_url.replace('gs://', 'https://storage.googleapis.com/')


class XarrayOpenKwargs(pydantic.BaseModel):
    """Options describing how a store should be opened with xarray."""

    # Name of the xarray backend engine; only 'zarr' and 'kerchunk' are accepted.
    engine: typing.Literal['zarr', 'kerchunk']


default_xarray_open_kwargs = XarrayOpenKwargs(engine='zarr')


class Store(pydantic.BaseModel):
    """A single data store entry of a feedstock catalog.

    Only ``id`` and ``url`` are required. ``rechunking`` is populated from the
    ``ncviewjs:rechunking`` key of the input. ``public``, ``geospatial`` and
    ``last_updated`` default to ``None``.
    """

    id: str = pydantic.Field(..., description='ID of the store')
    # Optional field: the annotation has to admit ``None`` so that an explicit
    # null in the input validates, consistent with the other optional fields.
    name: str | None = pydantic.Field(None, description='Name of the store')
    url: str = pydantic.Field(..., description='URL of the store')
    rechunking: list[dict[str, str]] | None = pydantic.Field(None, alias='ncviewjs:rechunking')
    public: bool | None = pydantic.Field(None, description='Whether the store is public')
    geospatial: bool | None = pydantic.Field(None, description='Whether the store is geospatial')
    # Falls back to the module-level default (engine='zarr') when not given.
    xarray_open_kwargs: XarrayOpenKwargs | None = pydantic.Field(
        default_xarray_open_kwargs, description='Xarray open kwargs for the store'
    )
    last_updated: str | None = pydantic.Field(None, description='Last updated timestamp')


class Link(pydantic.BaseModel):
Expand Down Expand Up @@ -82,17 +94,23 @@ class Feedstock(pydantic.BaseModel):
tags: list[str] | None = pydantic.Field(None, description='Tags of the dataset')
links: list[Link] | None = None
stores: list[Store] | None = None
meta_yaml_url: pydantic.HttpUrl | None = pydantic.Field(None, alias='ncviewjs:meta_yaml_url')
meta_yaml_url: pydantic.HttpUrl | None = pydantic.Field(
None, alias='ncviewjs:meta_yaml_url', description='URL of the meta YAML'
)

@classmethod
def from_yaml(cls, path: str):
content = yaml.load(upath.UPath(path).read_text())
if 'ncviewjs:meta_yaml_url' in content:
meta_url = convert_to_raw_github_url(content['ncviewjs:meta_yaml_url'])

meta_url_key = next(
(key for key in ['meta_yaml_url', 'ncviewjs:meta_yaml_url'] if key in content), None
)

if meta_url_key:
meta_url = convert_to_raw_github_url(content[meta_url_key])
meta = yaml.load(upath.UPath(meta_url).read_text())
content = content | meta
data = cls.model_validate(content)
return data
return cls.model_validate(content)


def convert_to_raw_github_url(github_url):
Expand Down Expand Up @@ -168,9 +186,12 @@ def is_store_public(store) -> bool:
return False


def is_geospatial(store) -> bool:
def load_store(store: str, engine: str) -> xr.Dataset:
url = get_http_url(store)
ds = xr.open_dataset(url, engine='zarr', chunks={}, decode_cf=False)
return xr.open_dataset(url, engine=engine, chunks={}, decode_cf=False)


def is_geospatial(ds: xr.Dataset) -> bool:
cf_axes = ds.cf.axes

# Regex patterns that match 'lat', 'latitude', 'lon', 'longitude' and also allow prefixes
Expand All @@ -187,6 +208,27 @@ def is_geospatial(store) -> bool:
return ('X' in cf_axes and 'Y' in cf_axes) or (has_latitude and has_longitude)


def check_stores(feed: Feedstock) -> None:
    """Run ``check_single_store`` on every store of ``feed``, printing progress.

    Parameters
    ----------
    feed:
        Feedstock whose ``stores`` are checked in order. ``stores`` may be
        ``None`` or empty, in which case nothing is printed or checked.
    """
    # ``stores`` is optional on the model; iterating ``None`` would raise
    # TypeError, so treat it the same as an empty list.
    stores = feed.stores or []
    total = len(stores)
    for position, store in enumerate(stores, start=1):
        print(f' 🚦 {store.id} ({position}/{total})')
        check_single_store(store)


def check_single_store(store: Store) -> None:
    """Fill in ``public``, ``geospatial`` and ``last_updated`` on ``store``.

    The rechunking entry is used as the target when it is set, otherwise the
    store URL. ``public`` is always updated. The dataset is only opened, and
    the other two fields only set, when the store is reported as public.
    """
    target = store.rechunking or store.url

    publicly_readable = is_store_public(target)
    store.public = publicly_readable
    if not publicly_readable:
        return

    # Use the configured engine, or 'zarr' when no open kwargs are set.
    open_kwargs = store.xarray_open_kwargs
    engine = open_kwargs.engine if open_kwargs else 'zarr'
    dataset = load_store(target, engine)

    store.geospatial = is_geospatial(dataset)
    # The timestamp is read from the dataset attributes; None when absent.
    store.last_updated = dataset.attrs.get('pangeo_forge_build_timestamp', None)


def validate_feedstocks(*, feedstocks: list[upath.UPath]) -> list[Feedstock]:
errors = []
valid = []
Expand All @@ -197,15 +239,8 @@ def validate_feedstocks(*, feedstocks: list[upath.UPath]) -> list[Feedstock]:
feed = Feedstock.from_yaml(convert_to_raw_github_url(feedstock))
if feed.stores:
print('🔄 Checking stores')
for index, store in enumerate(feed.stores):
print(f' 🚦 {store.id} ({index + 1}/{len(feed.stores)})')
is_public = is_store_public(store.rechunking or store.url)
feed.stores[index].public = is_public
if is_public:
# check if the store is geospatial
# print('🌍 Checking geospatial')
is_geospatial_store = is_geospatial(store.rechunking or store.url)
feed.stores[index].geospatial = is_geospatial_store
check_stores(feed)

else:
print('🚀 No stores found.')
valid.append({'feedstock': str(feedstock), 'status': 'valid'})
Expand Down
28 changes: 8 additions & 20 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,46 +22,35 @@ classifiers = [
"Programming Language :: Python :: 3.11",
"Topic :: Scientific/Engineering",
]
dependencies = [
"dask",
"cftime",
"ruamel.yaml",
"xarray",
"zarr",
]
dependencies = ["dask", "cftime", "ruamel.yaml", "xarray", "zarr"]

[project.optional-dependencies]

bigquery=[
bigquery = [
"tqdm",
"google-api-core",
"google-cloud-bigquery",
"db_dtypes",
"pangeo-forge-esgf>0.3.0",
]
pangeo-forge=[
pangeo-forge = [
"pangeo-forge-recipes",
"apache-beam==2.58.0",
"dynamic-chunks",
"leap-data-management-utils[bigquery]",
]
]
catalog = [
"aiohttp",
"cf_xarray",
"pydantic-core",
"pydantic>=2",
"requests",
"universal-pathlib",
]
complete = ["leap-data-management-utils[pangeo-forge,catalog]"]
test = [
"pytest",
"leap-data-management-utils[complete]",
]
dev = [
"leap-data-management-utils[test]",
"pre-commit",
"kerchunk >=0.2.6",
]
complete = ["leap-data-management-utils[pangeo-forge,catalog]"]
test = ["pytest", "leap-data-management-utils[complete]"]
dev = ["leap-data-management-utils[test]", "pre-commit"]

[project.scripts]
leap-catalog = "leap_data_management_utils.catalog:main"
Expand Down Expand Up @@ -130,7 +119,6 @@ select = [
]



[tool.ruff.lint.mccabe]
max-complexity = 18

Expand Down

0 comments on commit 2a43792

Please sign in to comment.