From 2de612545c9300556972e5aea815b152124838b0 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Wed, 1 May 2024 14:29:02 -0400 Subject: [PATCH 01/27] Request 144 --- feedstock/iids_pr.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/feedstock/iids_pr.yaml b/feedstock/iids_pr.yaml index 03aba3d6..d90c7e0a 100644 --- a/feedstock/iids_pr.yaml +++ b/feedstock/iids_pr.yaml @@ -1 +1,4 @@ - "CMIP6.*.*.[CNRM-CM6-1,CanESM5].historical.*.Omon.[tos, so].*.*" + - 'CMIP6.HighResMIP.CMCC.CMCC-CM2-VHR4.[hist-1950,highres-future].r1i1p1f1.6hrPlevPt.[vas,uas,psl].gn.*', + - 'CMIP6.HighResMIP.EC-Earth-Consortium.EC-Earth3P-HR.[hist-1950,highres-future].r1i1p2f1.6hrPlevPt.[vas,uas,psl].gr.*', + - 'CMIP6.HighResMIP.MOHC.HadGEM3-GC31-HM.[hist-1950,highres-future].r1i1p1f1.[3hr, E3hr].[vas,uas,psl].gn.*', From 298cea74b052c9d978889107781774fcd98ab1a6 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Wed, 1 May 2024 14:30:02 -0400 Subject: [PATCH 02/27] Update iids_pr.yaml --- feedstock/iids_pr.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/feedstock/iids_pr.yaml b/feedstock/iids_pr.yaml index d90c7e0a..72a07014 100644 --- a/feedstock/iids_pr.yaml +++ b/feedstock/iids_pr.yaml @@ -1,4 +1,4 @@ - "CMIP6.*.*.[CNRM-CM6-1,CanESM5].historical.*.Omon.[tos, so].*.*" - - 'CMIP6.HighResMIP.CMCC.CMCC-CM2-VHR4.[hist-1950,highres-future].r1i1p1f1.6hrPlevPt.[vas,uas,psl].gn.*', - - 'CMIP6.HighResMIP.EC-Earth-Consortium.EC-Earth3P-HR.[hist-1950,highres-future].r1i1p2f1.6hrPlevPt.[vas,uas,psl].gr.*', - - 'CMIP6.HighResMIP.MOHC.HadGEM3-GC31-HM.[hist-1950,highres-future].r1i1p1f1.[3hr, E3hr].[vas,uas,psl].gn.*', + - 'CMIP6.HighResMIP.CMCC.CMCC-CM2-VHR4.[hist-1950,highres-future].r1i1p1f1.6hrPlevPt.[vas,uas,psl].gn.*' + - 'CMIP6.HighResMIP.EC-Earth-Consortium.EC-Earth3P-HR.[hist-1950,highres-future].r1i1p2f1.6hrPlevPt.[vas,uas,psl].gr.*' + - 'CMIP6.HighResMIP.MOHC.HadGEM3-GC31-HM.[hist-1950,highres-future].r1i1p1f1.[3hr, E3hr].[vas,uas,psl].gn.*' From b8bdda6ddd1b0c56f96a9f1a0fe95eee5b816471 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Wed, 1 May 2024 14:41:49 -0400 Subject: [PATCH 03/27] Update iids_pr.yaml --- feedstock/iids_pr.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/feedstock/iids_pr.yaml b/feedstock/iids_pr.yaml index 72a07014..22158e01 100644 --- a/feedstock/iids_pr.yaml +++ b/feedstock/iids_pr.yaml @@ -1,4 +1,3 @@ - - "CMIP6.*.*.[CNRM-CM6-1,CanESM5].historical.*.Omon.[tos, so].*.*" - 'CMIP6.HighResMIP.CMCC.CMCC-CM2-VHR4.[hist-1950,highres-future].r1i1p1f1.6hrPlevPt.[vas,uas,psl].gn.*' - 'CMIP6.HighResMIP.EC-Earth-Consortium.EC-Earth3P-HR.[hist-1950,highres-future].r1i1p2f1.6hrPlevPt.[vas,uas,psl].gr.*' - 'CMIP6.HighResMIP.MOHC.HadGEM3-GC31-HM.[hist-1950,highres-future].r1i1p1f1.[3hr, E3hr].[vas,uas,psl].gn.*' From 0df0d7aaa2f6dcd315b899d391b7ace1de7668be Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Mon, 6 May 2024 15:53:11 -0400 Subject: [PATCH 04/27] Try new esgf client --- feedstock/recipe.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 28993457..7742f85e 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -8,7 +8,7 @@ from leap_data_management_utils import CMIPBQInterface, LogCMIPToBigQuery from leap_data_management_utils.data_management_transforms import Copy, InjectAttrs from leap_data_management_utils.cmip_transforms import TestDataset, Preprocessor -from pangeo_forge_esgf.parsing import parse_instance_ids +from pangeo_forge_esgf.client import ESGFClient from pangeo_forge_recipes.patterns import pattern_from_file_sequence from pangeo_forge_recipes.transforms import ( OpenURLWithFSSpec, @@ -95,17 +95,13 @@ def parse_wildcards(iids: List[str]) -> List[str]: # parse out wildcard iids using pangeo-forge-esgf print(f"{iids_raw = }") -iids = parse_wildcards(iids_raw) +client = ESGFClient() +iids = client.expand_instance_id_list(parse_iids) print(f"{iids = }") -# exclude dupes -iids = list(set(iids)) - # Prune the url dict to only include items that have not been logged to BQ yet print("Pruning iids that already exist") - bq_interface = CMIPBQInterface(table_id=table_id) - # get lists of the iids already logged iids_in_table = bq_interface.iid_list_exists(iids) @@ -132,16 +128,16 @@ def parse_wildcards(iids: List[str]) -> List[str]: iids_filtered = iids_filtered[0:200] print(f"🚀 Requesting a total of {len(iids_filtered)} iids") - -# Get the urls from ESGF at Runtime (only for the pruned list to save time) -url_dict = asyncio.run( - get_urls_from_esgf( - iids_filtered, - limit_per_host=20, - max_concurrency=20, - max_concurrency_response=20, - ) -) +input_dict = client.get_recipe_inputs_from_iid_list(iids_filtered) +# for now conform to the way this was set up with the async client(this is where we could extract other info, +# like checksums and tracking_id too!). That will require some sort of matching between dataset and file +# level results though! +url_dict = {} +for iid, tuple_list in input_dict.items(): + sorted_tuples = sorted(tuple_list) # we are sorting by filename here (which should include the year range) + # There might be a more reliable way to do this. + urls = [s[1] for s in sorted_tuples] + url_dict[iid] = urls if prune_submission: url_dict = {iid: url_dict[iid] for iid in list(url_dict.keys())[0:10]} From ea05400e1ff8d5f5d7cd244655646d86a49a1400 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 6 May 2024 19:53:17 +0000 Subject: [PATCH 05/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/recipe.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 7742f85e..694d2605 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -4,7 +4,7 @@ import apache_beam as beam from typing import List, Dict from dask.utils import parse_bytes -from pangeo_forge_esgf import get_urls_from_esgf, setup_logging +from pangeo_forge_esgf import setup_logging from leap_data_management_utils import CMIPBQInterface, LogCMIPToBigQuery from leap_data_management_utils.data_management_transforms import Copy, InjectAttrs from leap_data_management_utils.cmip_transforms import TestDataset, Preprocessor @@ -17,7 +17,6 @@ ConsolidateMetadata, ConsolidateDimensionCoordinates, ) -import asyncio import logging import os import xarray as xr @@ -129,13 +128,15 @@ def parse_wildcards(iids: List[str]) -> List[str]: print(f"🚀 Requesting a total of {len(iids_filtered)} iids") input_dict = client.get_recipe_inputs_from_iid_list(iids_filtered) -# for now conform to the way this was set up with the async client(this is where we could extract other info, -# like checksums and tracking_id too!). That will require some sort of matching between dataset and file +# for now conform to the way this was set up with the async client(this is where we could extract other info, +# like checksums and tracking_id too!). That will require some sort of matching between dataset and file # level results though! url_dict = {} for iid, tuple_list in input_dict.items(): - sorted_tuples = sorted(tuple_list) # we are sorting by filename here (which should include the year range) - # There might be a more reliable way to do this. + sorted_tuples = sorted( + tuple_list + ) # we are sorting by filename here (which should include the year range) + # There might be a more reliable way to do this. urls = [s[1] for s in sorted_tuples] url_dict[iid] = urls From a41bd6d2c8ee2b6c65ed9d92632722efc2bbf7a3 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Mon, 6 May 2024 15:54:09 -0400 Subject: [PATCH 06/27] switch pangeo-forge-esgf dependency to pr branch --- feedstock/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/feedstock/requirements.txt b/feedstock/requirements.txt index 333e2616..5efdfa90 100644 --- a/feedstock/requirements.txt +++ b/feedstock/requirements.txt @@ -1,5 +1,6 @@ leap-data-management-utils==0.0.7 -pangeo-forge-esgf==0.2.0 +#pangeo-forge-esgf==0.2.0 +git+https://github.com/jbusecke/pangeo-forge-esgf.git@new-request-scheme dynamic-chunks==0.0.3 git+https://github.com/ranchodeluxe/xarray@ranchodeluxe-patch-1#egg=xarray git+https://github.com/ranchodeluxe/rioxarray From 214fc0783aa80b79f6f6a48a75a1fce8a773bb43 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Mon, 6 May 2024 15:59:39 -0400 Subject: [PATCH 07/27] Update recipe.py --- feedstock/recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 694d2605..f4661b11 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -95,7 +95,7 @@ def parse_wildcards(iids: List[str]) -> List[str]: # parse out wildcard iids using pangeo-forge-esgf print(f"{iids_raw = }") client = ESGFClient() -iids = client.expand_instance_id_list(parse_iids) +iids = client.expand_instance_id_list(iids_raw) print(f"{iids = }") # Prune the url dict to only include items that have not been logged to BQ yet From 17edbd4f8b042415c75f890997a85d782f7d7886 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Mon, 6 May 2024 16:21:29 -0400 Subject: [PATCH 08/27] Update recipe.py --- feedstock/recipe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index f4661b11..523a80e8 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -131,6 +131,7 @@ def parse_wildcards(iids: List[str]) -> List[str]: # for now conform to the way this was set up with the async client(this is where we could extract other info, # like checksums and tracking_id too!). That will require some sort of matching between dataset and file # level results though! +input_dict_flat = {iid:[(filename, data['url']) for filename, data in file_dict.items()] for iid, file_dict in input_dict.items()} url_dict = {} for iid, tuple_list in input_dict.items(): sorted_tuples = sorted( From 7deaa9761ff04222801cab6f9265f369c6f2c7ba Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 6 May 2024 20:21:33 +0000 Subject: [PATCH 09/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/recipe.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 523a80e8..17672a8f 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -131,7 +131,10 @@ def parse_wildcards(iids: List[str]) -> List[str]: # for now conform to the way this was set up with the async client(this is where we could extract other info, # like checksums and tracking_id too!). That will require some sort of matching between dataset and file # level results though! -input_dict_flat = {iid:[(filename, data['url']) for filename, data in file_dict.items()] for iid, file_dict in input_dict.items()} +input_dict_flat = { + iid: [(filename, data["url"]) for filename, data in file_dict.items()] + for iid, file_dict in input_dict.items() +} url_dict = {} for iid, tuple_list in input_dict.items(): sorted_tuples = sorted( From 347fb749aba39f5674f050f5622f0642f42b96ea Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Mon, 6 May 2024 16:24:29 -0400 Subject: [PATCH 10/27] Update recipe.py --- feedstock/recipe.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 17672a8f..0cb53729 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -78,21 +78,7 @@ iids_raw = yaml.safe_load(f) iids_raw = [iid for iid in iids_raw if iid] - -def parse_wildcards(iids: List[str]) -> List[str]: - """iterate through each list element and - if it contains wilcards apply the wildcard parser - """ - iids_parsed = [] - for iid in iids: - if "*" in iid: - iids_parsed += parse_instance_ids(iid) - else: - iids_parsed.append(iid) - return iids_parsed - - -# parse out wildcard iids using pangeo-forge-esgf +# parse out wildcard/square brackets using pangeo-forge-esgf print(f"{iids_raw = }") client = ESGFClient() iids = client.expand_instance_id_list(iids_raw) From 51c0ad20099987b7a8eae818d99654eea256867a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 6 May 2024 20:24:34 +0000 Subject: [PATCH 11/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 0cb53729..15c270be 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -2,7 +2,7 @@ """Modified transforms from Pangeo Forge""" import apache_beam as beam -from typing import List, Dict +from typing import Dict from dask.utils import parse_bytes from pangeo_forge_esgf import setup_logging from leap_data_management_utils import CMIPBQInterface, LogCMIPToBigQuery From fb34aa8e2d9b7ab30e6a0f16a2858f8a1b423702 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Mon, 6 May 2024 16:50:28 -0400 Subject: [PATCH 12/27] Update recipe.py --- feedstock/recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 15c270be..381a3fd8 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -122,7 +122,7 @@ for iid, file_dict in input_dict.items() } url_dict = {} -for iid, tuple_list in input_dict.items(): +for iid, tuple_list in input_dict_flat.items(): sorted_tuples = sorted( tuple_list ) # we are sorting by filename here (which should include the year range) From 029cf4921b056bd732a466ca7cb8cea3d7df7ce3 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Mon, 6 May 2024 18:03:07 -0400 Subject: [PATCH 13/27] Update recipe.py --- feedstock/recipe.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 381a3fd8..9f25e374 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -79,13 +79,13 @@ iids_raw = [iid for iid in iids_raw if iid] # parse out wildcard/square brackets using pangeo-forge-esgf -print(f"{iids_raw = }") +logger.debug(f"{iids_raw = }") client = ESGFClient() iids = client.expand_instance_id_list(iids_raw) -print(f"{iids = }") +logger.info(f"{iids = }") # Prune the url dict to only include items that have not been logged to BQ yet -print("Pruning iids that already exist") +logger.info("Pruning iids that already exist") bq_interface = CMIPBQInterface(table_id=table_id) # get lists of the iids already logged iids_in_table = bq_interface.iid_list_exists(iids) @@ -102,11 +102,11 @@ del bq_interface # Maybe I want a more finegrained check here at some point, but for now this will prevent logged iids from rerunning -print(f"{overwrite_iids =}") +logger.debug(f"{overwrite_iids =}") iids_to_skip = set(iids_in_table) - set(overwrite_iids) -print(f"{iids_to_skip =}") +logger.debug(f"{iids_to_skip =}") iids_filtered = list(set(iids) - iids_to_skip) -print(f"Pruned {len(iids) - len(iids_filtered)}/{len(iids)} iids from input list") +logger.info(f"Pruned {len(iids) - len(iids_filtered)}/{len(iids)} iids from input list") if prune_iids: @@ -136,7 +136,7 @@ print(f"🚀 Submitting a total of {len(url_dict)} iids") # Print the actual urls -print(f"{url_dict = }") +logger.debug(f"{url_dict = }") ## Dynamic Chunking Wrapper From e35c7200004dff4e5b7f9a6d61f1b4a72156bb42 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Mon, 6 May 2024 22:32:28 -0400 Subject: [PATCH 14/27] Update recipe.py --- feedstock/recipe.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 9f25e374..21731fde 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -121,22 +121,26 @@ iid: [(filename, data["url"]) for filename, data in file_dict.items()] for iid, file_dict in input_dict.items() } -url_dict = {} -for iid, tuple_list in input_dict_flat.items(): - sorted_tuples = sorted( - tuple_list - ) # we are sorting by filename here (which should include the year range) - # There might be a more reliable way to do this. - urls = [s[1] for s in sorted_tuples] - url_dict[iid] = urls +def combine_dicts(dicts): + result = {} + for d in dicts: + for key, value in d.items(): + if key in result: + result[key].append(value) + else: + result[key] = [value] + return result + +recipe_dict = {k:combine_dicts([i[1] for i in sorted(v)]) for k,v in input_dict_flat.items()} + if prune_submission: - url_dict = {iid: url_dict[iid] for iid in list(url_dict.keys())[0:10]} + recipe_dict = {iid: {k:v[0:10] for k,v in data.items()} for iid, data in recipe_dict.items()} -print(f"🚀 Submitting a total of {len(url_dict)} iids") +print(f"🚀 Submitting a total of {len(recipe_dict)} iids") # Print the actual urls -logger.debug(f"{url_dict = }") +logger.debug(f"{recipe_dict = }") ## Dynamic Chunking Wrapper @@ -218,7 +222,8 @@ def dynamic_chunking_func(ds: xr.Dataset) -> Dict[str, int]: ## Create the recipes recipes = {} -for iid, urls in url_dict.items(): +for iid, data in recipe_dict.items(): + urls = data['urls'] pattern = pattern_from_file_sequence(urls, concat_dim="time") recipes[iid] = ( f"Creating {iid}" >> beam.Create(pattern.items()) @@ -231,7 +236,9 @@ def dynamic_chunking_func(ds: xr.Dataset) -> Dict[str, int]: combine_dims=pattern.combine_dim_keys, dynamic_chunking_fn=dynamic_chunking_func, ) - | InjectAttrs() + | InjectAttrs({ + 'pangeo_forge_file_data' = data + }) | ConsolidateDimensionCoordinates() | ConsolidateMetadata() | Copy( From 951819c5d372519497f2cb877d873fd26a2b8350 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 02:32:57 +0000 Subject: [PATCH 15/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 21731fde..76fbd8b9 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -130,7 +130,7 @@ def combine_dicts(dicts): else: result[key] = [value] return result - + recipe_dict = {k:combine_dicts([i[1] for i in sorted(v)]) for k,v in input_dict_flat.items()} From 6b521b867e9f8f1afccc5317b1cb1f53afe23d36 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Mon, 6 May 2024 22:36:56 -0400 Subject: [PATCH 16/27] Update recipe.py --- feedstock/recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 76fbd8b9..80e0051e 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -237,7 +237,7 @@ def dynamic_chunking_func(ds: xr.Dataset) -> Dict[str, int]: dynamic_chunking_fn=dynamic_chunking_func, ) | InjectAttrs({ - 'pangeo_forge_file_data' = data + 'pangeo_forge_file_data':data }) | ConsolidateDimensionCoordinates() | ConsolidateMetadata() From ed871b57b7516f65fe0f6b3e8351900c0a5ac2c2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 02:37:02 +0000 Subject: [PATCH 17/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/recipe.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 80e0051e..0ce0c12a 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -121,6 +121,8 @@ iid: [(filename, data["url"]) for filename, data in file_dict.items()] for iid, file_dict in input_dict.items() } + + def combine_dicts(dicts): result = {} for d in dicts: @@ -131,11 +133,16 @@ def combine_dicts(dicts): result[key] = [value] return result -recipe_dict = {k:combine_dicts([i[1] for i in sorted(v)]) for k,v in input_dict_flat.items()} + +recipe_dict = { + k: combine_dicts([i[1] for i in sorted(v)]) for k, v in input_dict_flat.items() +} if prune_submission: - recipe_dict = {iid: {k:v[0:10] for k,v in data.items()} for iid, data in recipe_dict.items()} + recipe_dict = { + iid: {k: v[0:10] for k, v in data.items()} for iid, data in recipe_dict.items() + } print(f"🚀 Submitting a total of {len(recipe_dict)} iids") @@ -223,7 +230,7 @@ def dynamic_chunking_func(ds: xr.Dataset) -> Dict[str, int]: recipes = {} for iid, data in recipe_dict.items(): - urls = data['urls'] + urls = data["urls"] pattern = pattern_from_file_sequence(urls, concat_dim="time") recipes[iid] = ( f"Creating {iid}" >> beam.Create(pattern.items()) @@ -236,9 +243,7 @@ def dynamic_chunking_func(ds: xr.Dataset) -> Dict[str, int]: combine_dims=pattern.combine_dim_keys, dynamic_chunking_fn=dynamic_chunking_func, ) - | InjectAttrs({ - 'pangeo_forge_file_data':data - }) + | InjectAttrs({"pangeo_forge_file_data": data}) | ConsolidateDimensionCoordinates() | ConsolidateMetadata() | Copy( From 7aade7d8f6e631e328e620f57dabede4b062251b Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Mon, 6 May 2024 22:45:18 -0400 Subject: [PATCH 18/27] Update recipe.py --- feedstock/recipe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 0ce0c12a..0116f2b6 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -80,7 +80,11 @@ # parse out wildcard/square brackets using pangeo-forge-esgf logger.debug(f"{iids_raw = }") -client = ESGFClient() + +client = ESGFClient( + file_output_fields=['pid', 'tracking_id', 'further_info_url', 'citation_url', 'checksum', 'checksum_type'], + dataset_output_fields=['pid', 'tracking_id', 'further_info_url', 'citation_url'] +) iids = client.expand_instance_id_list(iids_raw) logger.info(f"{iids = }") From 2eb595d2bad30f63c82a912de0f1bb2843ff4a74 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 02:45:24 +0000 Subject: [PATCH 19/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/recipe.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 0116f2b6..e0a1ef2f 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -82,8 +82,15 @@ logger.debug(f"{iids_raw = }") client = ESGFClient( - file_output_fields=['pid', 'tracking_id', 'further_info_url', 'citation_url', 'checksum', 'checksum_type'], - dataset_output_fields=['pid', 'tracking_id', 'further_info_url', 'citation_url'] + file_output_fields=[ + "pid", + "tracking_id", + "further_info_url", + "citation_url", + "checksum", + "checksum_type", + ], + dataset_output_fields=["pid", "tracking_id", "further_info_url", "citation_url"], ) iids = client.expand_instance_id_list(iids_raw) logger.info(f"{iids = }") From 2dc65530dc6fd0634bfeee75ae6e644401c1f799 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Mon, 6 May 2024 22:50:41 -0400 Subject: [PATCH 20/27] Update recipe.py --- feedstock/recipe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index e0a1ef2f..fca2da34 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -132,6 +132,7 @@ iid: [(filename, data["url"]) for filename, data in file_dict.items()] for iid, file_dict in input_dict.items() } +logger.debug(f"{input_dict_flat=}") def combine_dicts(dicts): @@ -148,6 +149,7 @@ def combine_dicts(dicts): recipe_dict = { k: combine_dicts([i[1] for i in sorted(v)]) for k, v in input_dict_flat.items() } +logger.debug(f"{recipe_dict=}") if prune_submission: From 1e3c7973b43b25aa4e788fac7039b1208353edbf Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Mon, 6 May 2024 23:32:29 -0400 Subject: [PATCH 21/27] Update recipe.py --- feedstock/recipe.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index fca2da34..4f4cdd27 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -123,18 +123,6 @@ if prune_iids: iids_filtered = iids_filtered[0:200] -print(f"🚀 Requesting a total of {len(iids_filtered)} iids") -input_dict = client.get_recipe_inputs_from_iid_list(iids_filtered) -# for now conform to the way this was set up with the async client(this is where we could extract other info, -# like checksums and tracking_id too!). That will require some sort of matching between dataset and file -# level results though! -input_dict_flat = { - iid: [(filename, data["url"]) for filename, data in file_dict.items()] - for iid, file_dict in input_dict.items() -} -logger.debug(f"{input_dict_flat=}") - - def combine_dicts(dicts): result = {} for d in dicts: @@ -145,13 +133,14 @@ def combine_dicts(dicts): result[key] = [value] return result - -recipe_dict = { - k: combine_dicts([i[1] for i in sorted(v)]) for k, v in input_dict_flat.items() -} +print(f"🚀 Requesting a total of {len(iids_filtered)} iids") +input_dict = client.get_recipe_inputs_from_iid_list(iids_filtered) +logger.debug(f"{input_dict=}") +input_dict_flat = {iid: [(k,v) for k,v in data.items()] for iid, data in input_dict.items()} +logger.debug(f"{input_dict_flat=}") +recipe_dict = {iid:combine_dicts([i[1] for i in sorted(data)]) for iid, data in input_dict_flat.items()} logger.debug(f"{recipe_dict=}") - if prune_submission: recipe_dict = { iid: {k: v[0:10] for k, v in data.items()} for iid, data in recipe_dict.items() @@ -162,7 +151,6 @@ def combine_dicts(dicts): # Print the actual urls logger.debug(f"{recipe_dict = }") - ## Dynamic Chunking Wrapper def dynamic_chunking_func(ds: xr.Dataset) -> Dict[str, int]: import warnings From d20efa8fb1d340cbdf202db172209cbf3572f835 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 03:32:35 +0000 Subject: [PATCH 22/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/recipe.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 4f4cdd27..bcb62147 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -123,6 +123,7 @@ if prune_iids: iids_filtered = iids_filtered[0:200] + def combine_dicts(dicts): result = {} for d in dicts: @@ -133,12 +134,18 @@ def combine_dicts(dicts): result[key] = [value] return result + print(f"🚀 Requesting a total of {len(iids_filtered)} iids") input_dict = client.get_recipe_inputs_from_iid_list(iids_filtered) logger.debug(f"{input_dict=}") -input_dict_flat = {iid: [(k,v) for k,v in data.items()] for iid, data in input_dict.items()} +input_dict_flat = { + iid: [(k, v) for k, v in data.items()] for iid, data in input_dict.items() +} logger.debug(f"{input_dict_flat=}") -recipe_dict = {iid:combine_dicts([i[1] for i in sorted(data)]) for iid, data in input_dict_flat.items()} +recipe_dict = { + iid: combine_dicts([i[1] for i in sorted(data)]) + for iid, data in input_dict_flat.items() +} logger.debug(f"{recipe_dict=}") if prune_submission: @@ -151,6 +158,7 @@ def combine_dicts(dicts): # Print the actual urls logger.debug(f"{recipe_dict = }") + ## Dynamic Chunking Wrapper def dynamic_chunking_func(ds: xr.Dataset) -> Dict[str, int]: import warnings From 5bb730d0b5e4ad8f57e801543a000d863af28a44 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Mon, 6 May 2024 23:40:39 -0400 Subject: [PATCH 23/27] Update recipe.py --- feedstock/recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index bcb62147..b5daac14 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -239,7 +239,7 @@ def dynamic_chunking_func(ds: xr.Dataset) -> Dict[str, int]: recipes = {} for iid, data in recipe_dict.items(): - urls = data["urls"] + urls = data["url"] pattern = pattern_from_file_sequence(urls, concat_dim="time") recipes[iid] = ( f"Creating {iid}" >> beam.Create(pattern.items()) From e3954b723c48d16e5dcd787a68adbaac97aabb5d Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 00:02:25 -0400 Subject: [PATCH 24/27] Update iids.yaml --- feedstock/iids.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/feedstock/iids.yaml b/feedstock/iids.yaml index 91efaf7a..b4d5eb7b 100644 --- a/feedstock/iids.yaml +++ b/feedstock/iids.yaml @@ -1,5 +1,9 @@ # from https://github.com/Timh37/CMIP6cex/issues/2 - "CMIP6.*.*.*.[historical, ssp245, ssp585].*.day.[psl, pr, sfcWind].*.*" + # from + - 'CMIP6.HighResMIP.CMCC.CMCC-CM2-VHR4.[hist-1950,highres-future].r1i1p1f1.6hrPlevPt.[vas,uas,psl].gn.*' + - 'CMIP6.HighResMIP.EC-Earth-Consortium.EC-Earth3P-HR.[hist-1950,highres-future].r1i1p2f1.6hrPlevPt.[vas,uas,psl].gr.*' + - 'CMIP6.HighResMIP.MOHC.HadGEM3-GC31-HM.[hist-1950,highres-future].r1i1p1f1.[3hr, E3hr].[vas,uas,psl].gn.*' # from https://github.com/pangeo-forge/cmip6-feedstock/issues/22 - "CMIP6.*.*.*.[historical, ssp126, ssp245, ssp585].*.Omon.zmeso.*.*" # PMIP velocities From 1a1ba327fd28e655ab9e7e3d85bfbefdec844812 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 04:02:33 +0000 Subject: [PATCH 25/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/iids.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/iids.yaml b/feedstock/iids.yaml index b4d5eb7b..19d0af24 100644 --- a/feedstock/iids.yaml +++ b/feedstock/iids.yaml @@ -1,6 +1,6 @@ # from https://github.com/Timh37/CMIP6cex/issues/2 - "CMIP6.*.*.*.[historical, ssp245, ssp585].*.day.[psl, pr, sfcWind].*.*" - # from + # from - 'CMIP6.HighResMIP.CMCC.CMCC-CM2-VHR4.[hist-1950,highres-future].r1i1p1f1.6hrPlevPt.[vas,uas,psl].gn.*' - 'CMIP6.HighResMIP.EC-Earth-Consortium.EC-Earth3P-HR.[hist-1950,highres-future].r1i1p2f1.6hrPlevPt.[vas,uas,psl].gr.*' - 'CMIP6.HighResMIP.MOHC.HadGEM3-GC31-HM.[hist-1950,highres-future].r1i1p1f1.[3hr, E3hr].[vas,uas,psl].gn.*' From e701a1936ff7925c69aabe37c9465e5a3f84f272 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 00:03:17 -0400 Subject: [PATCH 26/27] Update iids_pr.yaml --- feedstock/iids_pr.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/feedstock/iids_pr.yaml b/feedstock/iids_pr.yaml index 22158e01..4552b589 100644 --- a/feedstock/iids_pr.yaml +++ b/feedstock/iids_pr.yaml @@ -1,3 +1 @@ - - 'CMIP6.HighResMIP.CMCC.CMCC-CM2-VHR4.[hist-1950,highres-future].r1i1p1f1.6hrPlevPt.[vas,uas,psl].gn.*' - - 'CMIP6.HighResMIP.EC-Earth-Consortium.EC-Earth3P-HR.[hist-1950,highres-future].r1i1p2f1.6hrPlevPt.[vas,uas,psl].gr.*' - - 'CMIP6.HighResMIP.MOHC.HadGEM3-GC31-HM.[hist-1950,highres-future].r1i1p1f1.[3hr, E3hr].[vas,uas,psl].gn.*' +- "CMIP6.*.*.[CNRM-CM6-1,CanESM5].historical.*.Omon.[tos, so].*.*" From 9323229fadfaf3638a66ec8a601fa1f39c8a5538 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 00:03:38 -0400 Subject: [PATCH 27/27] Update iids_pr.yaml --- feedstock/iids_pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/iids_pr.yaml b/feedstock/iids_pr.yaml index 4552b589..ea10b8ca 100644 --- a/feedstock/iids_pr.yaml +++ b/feedstock/iids_pr.yaml @@ -1 +1 @@ -- "CMIP6.*.*.[CNRM-CM6-1,CanESM5].historical.*.Omon.[tos, so].*.*" + - "CMIP6.*.*.[CNRM-CM6-1,CanESM5].historical.*.Omon.[tos, so].*.*"