From 01bad47f462103d237cd9f4e5fb4ddd0de9c856d Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 12:33:44 -0400 Subject: [PATCH 01/11] Testing newest changes to API client --- feedstock/recipe.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index b5daac14..d66e1249 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -90,9 +90,15 @@ "checksum", "checksum_type", ], - dataset_output_fields=["pid", "tracking_id", "further_info_url", "citation_url"], + dataset_output_fields=[ + "pid", + "tracking_id", + "further_info_url", + "citation_url" + ], ) -iids = client.expand_instance_id_list(iids_raw) +iid_info_dict = client.expand_instance_id_list(iids_raw) +iids = iid_info_dict.keys() logger.info(f"{iids = }") # Prune the url dict to only include items that have not been logged to BQ yet @@ -124,6 +130,19 @@ iids_filtered = iids_filtered[0:200] +#Now that we have the iids that are not yet ingested, we can prune the full iid_info_dict and extract the 'id' field +iid_info_dict_filtered = {k:v for k,v in iid_info_dict.items() if k in iids_filtered} +dataset_ids_filtered = [v['id'] for v in iid_info_dict_filtered.values()] + +print(f"🚀 Requesting a total of {len(dataset_ids_filtered)} datasets") +input_dict = client.get_recipe_inputs_from_dataset_ids(dataset_ids_filtered) + +logger.debug(f"{input_dict=}") +input_dict_flat = { + iid: [(k, v) for k, v in data.items()] for iid, data in input_dict.items() +} +logger.debug(f"{input_dict_flat=}") + def combine_dicts(dicts): result = {} for d in dicts: @@ -134,14 +153,6 @@ def combine_dicts(dicts): result[key] = [value] return result - -print(f"🚀 Requesting a total of {len(iids_filtered)} iids") -input_dict = client.get_recipe_inputs_from_iid_list(iids_filtered) -logger.debug(f"{input_dict=}") -input_dict_flat = { - iid: [(k, v) for k, v in data.items()] for iid, data in input_dict.items() -} -logger.debug(f"{input_dict_flat=}") recipe_dict = { iid: combine_dicts([i[1] for i in sorted(data)]) for iid, data in input_dict_flat.items() From 504baf01e13b4b48de9b2ef6bdd463e12f1d5d34 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 16:34:59 +0000 Subject: [PATCH 02/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/recipe.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index d66e1249..71a4734b 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -90,12 +90,7 @@ "checksum", "checksum_type", ], - dataset_output_fields=[ - "pid", - "tracking_id", - "further_info_url", - "citation_url" - ], + dataset_output_fields=["pid", "tracking_id", "further_info_url", "citation_url"], ) iid_info_dict = client.expand_instance_id_list(iids_raw) iids = iid_info_dict.keys() @@ -130,9 +125,9 @@ iids_filtered = iids_filtered[0:200] -#Now that we have the iids that are not yet ingested, we can prune the full iid_info_dict and extract the 'id' field -iid_info_dict_filtered = {k:v for k,v in iid_info_dict.items() if k in iids_filtered} -dataset_ids_filtered = [v['id'] for v in iid_info_dict_filtered.values()] +# Now that we have the iids that are not yet ingested, we can prune the full iid_info_dict and extract the 'id' field +iid_info_dict_filtered = {k: v for k, v in iid_info_dict.items() if k in iids_filtered} +dataset_ids_filtered = [v["id"] for v in iid_info_dict_filtered.values()] print(f"🚀 Requesting a total of {len(dataset_ids_filtered)} datasets") input_dict = client.get_recipe_inputs_from_dataset_ids(dataset_ids_filtered) @@ -143,6 +138,7 @@ } logger.debug(f"{input_dict_flat=}") + def combine_dicts(dicts): result = {} for d in dicts: @@ -153,6 +149,7 @@ def combine_dicts(dicts): result[key] = [value] return result + recipe_dict = { iid: combine_dicts([i[1] for i in sorted(data)]) for iid, data in input_dict_flat.items() From ea633c42ff00a5c7f479860f593ad86fa0404cfe Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 12:42:39 -0400 Subject: [PATCH 03/11] Update recipe.py --- feedstock/recipe.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 71a4734b..3836998f 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -21,6 +21,7 @@ import os import xarray as xr import yaml +from tqdm.auto import tqdm logger = logging.getLogger(__name__) @@ -92,15 +93,20 @@ ], dataset_output_fields=["pid", "tracking_id", "further_info_url", "citation_url"], ) -iid_info_dict = client.expand_instance_id_list(iids_raw) +iid_info_dict = client.get_instance_id_input(iids_raw) iids = iid_info_dict.keys() logger.info(f"{iids = }") # Prune the url dict to only include items that have not been logged to BQ yet logger.info("Pruning iids that already exist") bq_interface = CMIPBQInterface(table_id=table_id) -# get lists of the iids already logged -iids_in_table = bq_interface.iid_list_exists(iids) +# Since we have more than 10k iids to check against the big query database, +# we need to run this in batches (bq does not take more than 10k inputs per query). +iids_in_table = [] +iid_batches = [iids[i : i + batchsize] for i in range(0, len(iids), 10000)] +for iids_batch in tqdm(iid_batches): + iids_in_table_batch = bq.iid_list_exists(iids_batch) + iids_in_table.extend(iids_in_table_batch) # manual overrides (these will be rewritten each time as long as they exist here) overwrite_iids = [ From 62c7fd4b669ce74d1f70dcb016f3a31baf148d00 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 16:42:45 +0000 Subject: [PATCH 04/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 3836998f..2a161942 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -100,7 +100,7 @@ # Prune the url dict to only include items that have not been logged to BQ yet logger.info("Pruning iids that already exist") bq_interface = CMIPBQInterface(table_id=table_id) -# Since we have more than 10k iids to check against the big query database, +# Since we have more than 10k iids to check against the big query database, # we need to run this in batches (bq does not take more than 10k inputs per query). iids_in_table = [] iid_batches = [iids[i : i + batchsize] for i in range(0, len(iids), 10000)] From 1f759c5ab01cfdaedc73d88b53b2f9936c1a5796 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 12:49:34 -0400 Subject: [PATCH 05/11] Update recipe.py --- feedstock/recipe.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 2a161942..f359e333 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -100,10 +100,13 @@ # Prune the url dict to only include items that have not been logged to BQ yet logger.info("Pruning iids that already exist") bq_interface = CMIPBQInterface(table_id=table_id) + +#TODO: Move this back to the BQ client https://github.com/leap-stc/leap-data-management-utils/issues/33 # Since we have more than 10k iids to check against the big query database, # we need to run this in batches (bq does not take more than 10k inputs per query). iids_in_table = [] -iid_batches = [iids[i : i + batchsize] for i in range(0, len(iids), 10000)] +batchsize = 10000 +iid_batches = [iids[i : i + batchsize] for i in range(0, len(iids), batchsize)] for iids_batch in tqdm(iid_batches): iids_in_table_batch = bq.iid_list_exists(iids_batch) iids_in_table.extend(iids_in_table_batch) From 60c0c3293508996dc62d3caa1a6463b1f71caa32 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 16:49:39 +0000 Subject: [PATCH 06/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index f359e333..09906aae 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -101,7 +101,7 @@ logger.info("Pruning iids that already exist") bq_interface = CMIPBQInterface(table_id=table_id) -#TODO: Move this back to the BQ client https://github.com/leap-stc/leap-data-management-utils/issues/33 +# TODO: Move this back to the BQ client https://github.com/leap-stc/leap-data-management-utils/issues/33 # Since we have more than 10k iids to check against the big query database, # we need to run this in batches (bq does not take more than 10k inputs per query). iids_in_table = [] From 047dc73dde34b27547fdfe3c51fe3893b05ff03d Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 13:13:21 -0400 Subject: [PATCH 07/11] Fix bugs locally --- feedstock/recipe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 09906aae..060a1162 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -94,7 +94,7 @@ dataset_output_fields=["pid", "tracking_id", "further_info_url", "citation_url"], ) iid_info_dict = client.get_instance_id_input(iids_raw) -iids = iid_info_dict.keys() +iids = list(iid_info_dict.keys()) logger.info(f"{iids = }") # Prune the url dict to only include items that have not been logged to BQ yet @@ -108,7 +108,7 @@ batchsize = 10000 iid_batches = [iids[i : i + batchsize] for i in range(0, len(iids), batchsize)] for iids_batch in tqdm(iid_batches): - iids_in_table_batch = bq.iid_list_exists(iids_batch) + iids_in_table_batch = bq_interface.iid_list_exists(iids_batch) iids_in_table.extend(iids_in_table_batch) # manual overrides (these will be rewritten each time as long as they exist here) From 113e2b0530926f852c9d2cdfaa6c9fac7b5088c6 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 13:47:02 -0400 Subject: [PATCH 08/11] Update recipe.py --- feedstock/recipe.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 060a1162..16c20074 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -131,7 +131,7 @@ if prune_iids: - iids_filtered = iids_filtered[0:200] + iids_filtered = iids_filtered[0:20] # Now that we have the iids that are not yet ingested, we can prune the full iid_info_dict and extract the 'id' field @@ -166,9 +166,7 @@ def combine_dicts(dicts): logger.debug(f"{recipe_dict=}") if prune_submission: - recipe_dict = { - iid: {k: v[0:10] for k, v in data.items()} for iid, data in recipe_dict.items() - } + recipe_dict = {iid:[recipe_dict[iid] for iid in list(recipe_dict.keys())[0:10]]} print(f"🚀 Submitting a total of {len(recipe_dict)} iids") From f19e16722b934210625cbc62c5de3e26460a4aaf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 17:49:41 +0000 Subject: [PATCH 09/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 16c20074..ccd8c8c3 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -166,7 +166,7 @@ def combine_dicts(dicts): logger.debug(f"{recipe_dict=}") if prune_submission: - recipe_dict = {iid:[recipe_dict[iid] for iid in list(recipe_dict.keys())[0:10]]} + recipe_dict = {iid: [recipe_dict[iid] for iid in list(recipe_dict.keys())[0:10]]} print(f"🚀 Submitting a total of {len(recipe_dict)} iids") From e2ee3268bc2abd27d675a1878826a17b741ad0d3 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 13:58:33 -0400 Subject: [PATCH 10/11] fix final pruning --- feedstock/recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index ccd8c8c3..5407395f 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -166,7 +166,7 @@ def combine_dicts(dicts): logger.debug(f"{recipe_dict=}") if prune_submission: - recipe_dict = {iid: [recipe_dict[iid] for iid in list(recipe_dict.keys())[0:10]]} + recipe_dict = {iid: recipe_dict[iid] for iid in list(recipe_dict.keys())[0:10]} print(f"🚀 Submitting a total of {len(recipe_dict)} iids") From 22d38ceb3d204a0aaa7e46034c97bd6a344ba641 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 14:07:19 -0400 Subject: [PATCH 11/11] Update recipe.py --- feedstock/recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 5407395f..7ab6546b 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -166,7 +166,7 @@ def combine_dicts(dicts): logger.debug(f"{recipe_dict=}") if prune_submission: - recipe_dict = {iid: recipe_dict[iid] for iid in list(recipe_dict.keys())[0:10]} + recipe_dict = {iid: recipe_dict[iid] for iid in list(recipe_dict.keys())[0:5]} print(f"🚀 Submitting a total of {len(recipe_dict)} iids")