From da0f2fa2263ef6051b59583653fe24768a0d79a0 Mon Sep 17 00:00:00 2001 From: mike-gangl Date: Thu, 21 Jul 2022 15:55:57 -0700 Subject: [PATCH 01/10] added citation creation tests and functionality to subscriber and downloader --- subscriber/podaac_access.py | 48 ++++++++++++++++++++++++++++ subscriber/podaac_data_downloader.py | 8 +++++ subscriber/podaac_data_subscriber.py | 7 ++++ tests/test_subscriber.py | 43 +++++++++++++++++++++++++ 4 files changed, 106 insertions(+) diff --git a/subscriber/podaac_access.py b/subscriber/podaac_access.py index 327155a..a0eee7a 100644 --- a/subscriber/podaac_access.py +++ b/subscriber/podaac_access.py @@ -14,6 +14,7 @@ from urllib.parse import urlencode from urllib.request import Request, urlopen import hashlib +from datetime import datetime import requests @@ -436,3 +437,50 @@ def make_checksum(file_path, algorithm): for chunk in iter(lambda: f.read(4096), b""): hash_alg.update(chunk) return hash_alg.hexdigest() + +def get_cmr_collections(params, verbose=False): + query = urlencode(params) + url = "https://" + cmr + "/search/collections.umm_json?" + query + if verbose: + logging.info(url) + + # Build the request, add the search after header to it if it's not None (e.g. after the first iteration) + req = Request(url) + response = urlopen(req) + result = json.loads(response.read().decode()) + return result + + +def create_citation(collection_json, access_date): + citation_template = "{creator}. {year}. {title}. Ver. {version}. PO.DAAC, CA, USA. Dataset accessed {access_date} at {doi_authority}/{doi}" + + # Better error handling here may be needed... + doi = collection_json['DOI']["DOI"] + doi_authority = collection_json['DOI']["Authority"] + citation = collection_json["CollectionCitations"][0] + creator = citation["Creator"] + release_date = citation["ReleaseDate"] + title = citation["Title"] + version = citation["Version"] + year = datetime.strptime(release_date, "%Y-%m-%dT%H:%M:%S.000Z").year + return citation_template.format(creator=creator, year=year, title=title, version=version, doi_authority=doi_authority, doi=doi, access_date=access_date) + +def create_citation_file(short_name, provider, data_path, token=None): + # get collection umm-c METADATA + params = [ + ('provider', provider), + ('ShortName', short_name) + ] + if token is not None: + params.append(('token', token)) + + collection = get_cmr_collections(params, True)['items'][0] + + access_date = datetime.now().strftime("%Y-%m-%d") + + # create citation from umm-c metadata + citation = create_citation(collection['umm'], access_date) + # write file + + with open(data_path + "/" + short_name + ".citation.txt", "w") as text_file: + text_file.write(citation) diff --git a/subscriber/podaac_data_downloader.py b/subscriber/podaac_data_downloader.py index 158c541..aae568a 100644 --- a/subscriber/podaac_data_downloader.py +++ b/subscriber/podaac_data_downloader.py @@ -284,6 +284,14 @@ def run(args=None): logging.info("Downloaded Files: " + str(success_cnt)) logging.info("Failed Files: " + str(failure_cnt)) logging.info("Skipped Files: " + str(skip_cnt)) + + #create citation file if success > 0 + if success_cnt > 0: + try: + pa.create_citation_file(short_name, provider, data_path, token) + except: + logging.debug("Error generating citation") + pa.delete_token(token_url, token) logging.info("END\n\n") diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index 04df68d..1d0c542 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -314,6 +314,13 @@ def run(args=None): logging.info("Downloaded Files: " + str(success_cnt)) logging.info("Failed Files: " + str(failure_cnt)) logging.info("Skipped Files: " + str(skip_cnt)) + + if success_cnt > 0: + try: + pa.create_citation_file(short_name, provider, data_path, token) + except: + logging.debug("Error generating citation") + pa.delete_token(token_url, token) logging.info("END\n\n") #exit(0) diff --git a/tests/test_subscriber.py b/tests/test_subscriber.py index 2e9d4cb..b13c7e4 100644 --- a/tests/test_subscriber.py +++ b/tests/test_subscriber.py @@ -4,6 +4,9 @@ import os from pathlib import Path import shutil +import json +import tempfile +from os.path import exists def test_temporal_range(): @@ -23,6 +26,46 @@ def cleanup_update_test(): print("Cleanup...") shutil.rmtree(data_dir_with_updates) +def test_create_citation_file(): + with tempfile.TemporaryDirectory() as tmpdirname: + pa.create_citation_file("SWOT_SIMULATED_L2_KARIN_SSH_GLORYS_CALVAL_V1", "POCLOUD", tmpdirname) + assert exists(tmpdirname+"/SWOT_SIMULATED_L2_KARIN_SSH_GLORYS_CALVAL_V1.citation.txt") + +def test_citation_creation(): + collection_umm = '''{ + "DOI": { + "DOI": "10.5067/KARIN-2GLC1", + "Authority": "https://doi.org" + }, + "CollectionCitations": [ + { + "Creator": "SWOT", + "ReleasePlace": "PO.DAAC", + "Title": "SWOT Level-2 Simulated SSH from MITgcm LLC4320 Science Quality Version 1.0", + "Publisher": "PO.DAAC", + "ReleaseDate": "2022-01-31T00:00:00.000Z", + "Version": "1.0" + }, + { + "Creator": "CNES/CLS", + "ReleasePlace": "CNES/AVISO", + "Title": "Simulated SWOT products", + "OnlineResource": { + "Linkage": "http://doi.org/10.24400/527896/a01-2021.006", + "Name": " Simulated SWOT Sea Surface Height products", + "Description": "Simulated SWOT Sea Surface Height products KaRIn and Nadir.", + "MimeType": "text/html" + }, + "Publisher": "PODAAC", + "ReleaseDate": "2021-11-01T00:00:00.000Z", + "Version": "1.0" + } + ] + } + ''' + collection_umm_json = json.loads(collection_umm) + citation = pa.create_citation(collection_umm_json, "2022-07-21") + assert citation == "SWOT. 2022. SWOT Level-2 Simulated SSH from MITgcm LLC4320 Science Quality Version 1.0. Ver. 1.0. PO.DAAC, CA, USA. Dataset accessed 2022-07-21 at https://doi.org/10.5067/KARIN-2GLC1" def test_search_after(): # cmr query: https://cmr.earthdata.nasa.gov/search/granules.umm_json?page_size=2000&sort_key=-start_date&provider=POCLOUD&ShortName=JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F&temporal=2000-01-01T10%3A00%3A00Z%2C2022-04-15T00%3A00%3A00Z&bounding_box=-180%2C-90%2C180%2C90 From 660c5262c980f398b5af653625280ddf777d3fd5 Mon Sep 17 00:00:00 2001 From: mike-gangl Date: Thu, 21 Jul 2022 16:05:39 -0700 Subject: [PATCH 02/10] added verbose option to create_citation_file command, previously hard coded --- subscriber/podaac_access.py | 4 ++-- subscriber/podaac_data_downloader.py | 2 +- subscriber/podaac_data_subscriber.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/subscriber/podaac_access.py b/subscriber/podaac_access.py index a0eee7a..e921d1c 100644 --- a/subscriber/podaac_access.py +++ b/subscriber/podaac_access.py @@ -465,7 +465,7 @@ def create_citation(collection_json, access_date): year = datetime.strptime(release_date, "%Y-%m-%dT%H:%M:%S.000Z").year return citation_template.format(creator=creator, year=year, title=title, version=version, doi_authority=doi_authority, doi=doi, access_date=access_date) -def create_citation_file(short_name, provider, data_path, token=None): +def create_citation_file(short_name, provider, data_path, token=None, verbose=False): # get collection umm-c METADATA params = [ ('provider', provider), @@ -474,7 +474,7 @@ def create_citation_file(short_name, provider, data_path, token=None): if token is not None: params.append(('token', token)) - collection = get_cmr_collections(params, True)['items'][0] + collection = get_cmr_collections(params, verbose)['items'][0] access_date = datetime.now().strftime("%Y-%m-%d") diff --git a/subscriber/podaac_data_downloader.py b/subscriber/podaac_data_downloader.py index aae568a..1182e80 100644 --- a/subscriber/podaac_data_downloader.py +++ b/subscriber/podaac_data_downloader.py @@ -288,7 +288,7 @@ def run(args=None): #create citation file if success > 0 if success_cnt > 0: try: - pa.create_citation_file(short_name, provider, data_path, token) + pa.create_citation_file(short_name, provider, data_path, token, args.verbose) except: logging.debug("Error generating citation") diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index 1d0c542..fdbf03f 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -317,9 +317,9 @@ def run(args=None): if success_cnt > 0: try: - pa.create_citation_file(short_name, provider, data_path, token) + pa.create_citation_file(short_name, provider, data_path, token, args.verbose) except: - logging.debug("Error generating citation") + logging.warning("Error generating citation") pa.delete_token(token_url, token) logging.info("END\n\n") From c17d9ab1d358adff2b1d3c7400af0895e45d2429 Mon Sep 17 00:00:00 2001 From: mike-gangl Date: Wed, 3 Aug 2022 14:50:16 -0700 Subject: [PATCH 03/10] updated changelog (whoops) and fixed regression test: 1. Issue where the citation file now downloaded affected the counts 2. Issue where the logic for determining if a file modified time was changing or not was picking up the new citation file which _always_ gets rewritten to update the 'last accessed' date. --- CHANGELOG.md | 4 ++++ tests/test_downloader_regression.py | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f06ebe6..ecb9338 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) +## [unreleased] +### Added +- Added citation file creation when data are downloaded [91](https://github.com/podaac/data-subscriber/issues/91). Required some updates to the regression testing. + ## [1.10.2] ### Fixed - Fixed an issue where using a default global bounding box prevented download of data that didn't use the horizontal spatial domain [87](https://github.com/podaac/data-subscriber/issues/87) diff --git a/tests/test_downloader_regression.py b/tests/test_downloader_regression.py index 875a517..1fc9722 100644 --- a/tests/test_downloader_regression.py +++ b/tests/test_downloader_regression.py @@ -22,7 +22,8 @@ def test_downloader_limit_MUR(): args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-30T00:00:00Z --limit 1'.split()) pdd.run(args2) # count number of files downloaded... - assert len([name for name in os.listdir('./MUR25-JPL-L4-GLOB-v04.2') if os.path.isfile('./MUR25-JPL-L4-GLOB-v04.2/' + name)])==1 + # Also include the citation file here (1+1 = 2) + assert len([name for name in os.listdir('./MUR25-JPL-L4-GLOB-v04.2') if os.path.isfile('./MUR25-JPL-L4-GLOB-v04.2/' + name)])==2 shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2') #Test the downlaoder on MUR25 data for start/stop/, yyyy/mmm/dd dir structure, @@ -73,6 +74,10 @@ def test_downloader_GRACE_with_SHA_512(tmpdir): pdd.run(args) assert len( os.listdir(directory_str) ) > 0 filename = directory_str + "/" + os.listdir(directory_str)[0] + #if the citation file was chosen above, get the next file since citation file is updated on successful run + if "citation.txt" in filename: + filename = directory_str + "/" + os.listdir(directory_str)[1] + modified_time_1 = os.path.getmtime(filename) print( modified_time_1 ) From 55050b8dab7a2fee34cb213915076081c4647311 Mon Sep 17 00:00:00 2001 From: mike-gangl Date: Wed, 3 Aug 2022 15:19:09 -0700 Subject: [PATCH 04/10] updated request to include exec_info in warning; fixed issue with params not being a dictionary caused errors --- CHANGELOG.md | 2 ++ subscriber/podaac_data_downloader.py | 8 ++++++-- subscriber/podaac_data_subscriber.py | 9 +++++++-- tests/test_subscriber.py | 17 +++++++++++++++++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ecb9338..69b832d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ## [unreleased] +### Fixed +- Fixed an issue where token-refresh was expecting a dictionary, not a list of tuples ### Added - Added citation file creation when data are downloaded [91](https://github.com/podaac/data-subscriber/issues/91). Required some updates to the regression testing. diff --git a/subscriber/podaac_data_downloader.py b/subscriber/podaac_data_downloader.py index 1182e80..344c00f 100644 --- a/subscriber/podaac_data_downloader.py +++ b/subscriber/podaac_data_downloader.py @@ -193,7 +193,11 @@ def run(args=None): except HTTPError as e: if e.code == 401: token = pa.refresh_token(token, 'podaac-subscriber') - params['token'] = token + # Updated: This is not always a dictionary... + # in fact, here it's always a list of tuples + for i, p in enumerate(params) : + if p[1] == "token": + params[i] = ("token", token) results = pa.get_search_results(params, args.verbose) else: raise e @@ -290,7 +294,7 @@ def run(args=None): try: pa.create_citation_file(short_name, provider, data_path, token, args.verbose) except: - logging.debug("Error generating citation") + logging.debug("Error generating citation",exc_info=True) pa.delete_token(token_url, token) logging.info("END\n\n") diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index fdbf03f..8c495a6 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -218,7 +218,12 @@ def run(args=None): except HTTPError as e: if e.code == 401: token = pa.refresh_token(token, 'podaac-subscriber') - params['token'] = token + # Updated: This is not always a dictionary... + # in fact, here it's always a list of tuples + for i, p in enumerate(params) : + if p[1] == "token": + params[i] = ("token", token) + #params['token'] = token results = pa.get_search_results(params, args.verbose) else: raise e @@ -319,7 +324,7 @@ def run(args=None): try: pa.create_citation_file(short_name, provider, data_path, token, args.verbose) except: - logging.warning("Error generating citation") + logging.warning("Error generating citation", exc_info=True) pa.delete_token(token_url, token) logging.info("END\n\n") diff --git a/tests/test_subscriber.py b/tests/test_subscriber.py index b13c7e4..78bfc15 100644 --- a/tests/test_subscriber.py +++ b/tests/test_subscriber.py @@ -179,6 +179,23 @@ def test_validate(): # with pytest.raises(SystemExit): # a = validate(["-c", "viirs", "-d", "/data", "-m","60b"]) +def test_param_update(): + params = [ + ('sort_key', "-start_date"), + ('provider', "'POCLOUD'"), + ('token', "123"), + ] + + for i, p in enumerate(params) : + if p[1] == "token": + params[i] = ("token", "newToken") + + for i,p in enumerate(params) : + if p[1] == "token": + assert f2 == "newToken" + + + def validate(args): parser = pds.create_parser() From e34e367486ce1f0c01d97c45dd3456b0c62a64a7 Mon Sep 17 00:00:00 2001 From: mike-gangl Date: Wed, 3 Aug 2022 15:24:58 -0700 Subject: [PATCH 05/10] changed a warning to debug for citation file. fixed test issues --- subscriber/podaac_data_subscriber.py | 2 +- tests/test_subscriber.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index 8c495a6..efe5ab6 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -324,7 +324,7 @@ def run(args=None): try: pa.create_citation_file(short_name, provider, data_path, token, args.verbose) except: - logging.warning("Error generating citation", exc_info=True) + logging.debug("Error generating citation", exc_info=True) pa.delete_token(token_url, token) logging.info("END\n\n") diff --git a/tests/test_subscriber.py b/tests/test_subscriber.py index 78bfc15..2c565ab 100644 --- a/tests/test_subscriber.py +++ b/tests/test_subscriber.py @@ -192,7 +192,7 @@ def test_param_update(): for i,p in enumerate(params) : if p[1] == "token": - assert f2 == "newToken" + assert p[2] == "newToken" From 753a6ee23ab3afe114a6928f073723a758f65a89 Mon Sep 17 00:00:00 2001 From: Frank Greguska Date: Wed, 3 Aug 2022 16:44:16 -0700 Subject: [PATCH 06/10] Enable debug logging during regression tests and set max parallel workflows to 2 --- .github/workflows/python-app.yml | 5 ++++- tests/test_downloader_regression.py | 6 +++--- tests/test_subscriber_regression.py | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index dbde039..d212c98 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -13,6 +13,7 @@ jobs: build: strategy: fail-fast: false + max-parallel: 2 matrix: python-version: [ "3.7", "3.8", "3.9", "3.10" ] poetry-version: [ "1.1" ] @@ -47,5 +48,7 @@ jobs: username: ${{ secrets.EDL_OPS_USERNAME }} password: ${{ secrets.EDL_OPS_PASSWORD }} - name: Regression Test with pytest + env: + PODAAC_LOGLEVEL: "DEBUG" run: | - poetry run pytest -m "regression" + poetry run pytest -o log_cli=true --log-cli-level=DEBUG -m "regression" diff --git a/tests/test_downloader_regression.py b/tests/test_downloader_regression.py index 1fc9722..71172e7 100644 --- a/tests/test_downloader_regression.py +++ b/tests/test_downloader_regression.py @@ -19,7 +19,7 @@ def create_downloader_args(args): @pytest.mark.regression def test_downloader_limit_MUR(): shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2', ignore_errors=True) - args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-30T00:00:00Z --limit 1'.split()) + args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-30T00:00:00Z --limit 1 --verbose'.split()) pdd.run(args2) # count number of files downloaded... # Also include the citation file here (1+1 = 2) @@ -32,7 +32,7 @@ def test_downloader_limit_MUR(): @pytest.mark.regression def test_downloader_MUR(): shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2', ignore_errors=True) - args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4'.split()) + args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 --verbose'.split()) pdd.run(args2) assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') @@ -55,7 +55,7 @@ def test_downloader_MUR(): t1 = os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') # Set the args to --force to re-download those data - args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 -f'.split()) + args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 -f --verbose'.split()) pdd.run(args2) assert t1 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') assert t2 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') diff --git a/tests/test_subscriber_regression.py b/tests/test_subscriber_regression.py index 07dc2f8..ffa3685 100644 --- a/tests/test_subscriber_regression.py +++ b/tests/test_subscriber_regression.py @@ -53,7 +53,7 @@ def test_subscriber_MUR_update_file_no_redownload(): except OSError as e: print("Expecting this...") - args2 = create_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4'.split()) + args2 = create_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 --verbose'.split()) pds.run(args2) assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') From c04039d5a6852d227d6925584c2c37eb7ac25741 Mon Sep 17 00:00:00 2001 From: mike-gangl Date: Thu, 4 Aug 2022 07:03:37 -0700 Subject: [PATCH 07/10] added output to pytest --- .github/workflows/python-app.yml | 4 +++- pytest.ini | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 pytest.ini diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index dbde039..db426bf 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -47,5 +47,7 @@ jobs: username: ${{ secrets.EDL_OPS_USERNAME }} password: ${{ secrets.EDL_OPS_PASSWORD }} - name: Regression Test with pytest + env: + PODAAC_LOGLEVEL: debug run: | - poetry run pytest -m "regression" + poetry run pytest -s -m "regression" diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..47398bb --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +log_cli = True + From 5ad8e149843d5499b8601740cbc963aaac0f46f5 Mon Sep 17 00:00:00 2001 From: mike-gangl Date: Thu, 4 Aug 2022 07:09:41 -0700 Subject: [PATCH 08/10] fixed test to only look for downlaoded data files not citation file due to 'random' cmr errors when creating a citation. --- pytest.ini | 3 --- tests/test_downloader_regression.py | 6 +++--- 2 files changed, 3 insertions(+), 6 deletions(-) delete mode 100644 pytest.ini diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 47398bb..0000000 --- a/pytest.ini +++ /dev/null @@ -1,3 +0,0 @@ -[pytest] -log_cli = True - diff --git a/tests/test_downloader_regression.py b/tests/test_downloader_regression.py index 71172e7..cea7df0 100644 --- a/tests/test_downloader_regression.py +++ b/tests/test_downloader_regression.py @@ -21,9 +21,9 @@ def test_downloader_limit_MUR(): shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2', ignore_errors=True) args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-30T00:00:00Z --limit 1 --verbose'.split()) pdd.run(args2) - # count number of files downloaded... - # Also include the citation file here (1+1 = 2) - assert len([name for name in os.listdir('./MUR25-JPL-L4-GLOB-v04.2') if os.path.isfile('./MUR25-JPL-L4-GLOB-v04.2/' + name)])==2 + # So running the test in parallel, sometimes we get a 401 on the token... + # Let's ensure we're only looking for data files here + assert len([name for name in os.listdir('./MUR25-JPL-L4-GLOB-v04.2') if os.path.isfile('./MUR25-JPL-L4-GLOB-v04.2/' + name) and "citation.txt" not in name ])==1 shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2') #Test the downlaoder on MUR25 data for start/stop/, yyyy/mmm/dd dir structure, From 492db44b139d10e26d5af0f81db1aef14ba621fb Mon Sep 17 00:00:00 2001 From: mike-gangl Date: Thu, 4 Aug 2022 08:48:56 -0700 Subject: [PATCH 09/10] added mock testing and retry on 503 --- poetry.lock | 24 +++++++++++++++++++++--- pyproject.toml | 1 + subscriber/podaac_access.py | 23 +++++++++++++++++++++++ subscriber/podaac_data_downloader.py | 4 +++- subscriber/podaac_data_subscriber.py | 4 +++- tests/test_subscriber.py | 8 ++++++-- 6 files changed, 57 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index 0a666cb..94f8c70 100644 --- a/poetry.lock +++ b/poetry.lock @@ -124,8 +124,8 @@ python-versions = ">=3.6" importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} [package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] +testing = ["pytest-benchmark", "pytest"] +dev = ["tox", "pre-commit"] [[package]] name = "py" @@ -184,6 +184,20 @@ tomli = ">=1.0.0" [package.extras] testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +[[package]] +name = "pytest-mock" +version = "3.8.2" +description = "Thin-wrapper around the mock package for easier use with pytest" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +pytest = ">=5.0" + +[package.extras] +dev = ["pre-commit", "tox", "pytest-asyncio"] + [[package]] name = "requests" version = "2.27.1" @@ -257,7 +271,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" python-versions = "^3.7" -content-hash = "8ff84c55fbd8a74bae903e6216e256cea47f571d88fcfa54f0492125746244ed" +content-hash = "c5ece7741408cb266fe803842b66f646317dc3a384e9c54ecbe66a14ce895fed" [metadata.files] atomicwrites = [ @@ -328,6 +342,10 @@ pytest = [ {file = "pytest-7.1.2-py3-none-any.whl", hash = "sha256:13d0e3ccfc2b6e26be000cb6568c832ba67ba32e719443bfe725814d3c42433c"}, {file = "pytest-7.1.2.tar.gz", hash = "sha256:a06a0425453864a270bc45e71f783330a7428defb4230fb5e6a731fde06ecd45"}, ] +pytest-mock = [ + {file = "pytest-mock-3.8.2.tar.gz", hash = "sha256:77f03f4554392558700295e05aed0b1096a20d4a60a4f3ddcde58b0c31c8fca2"}, + {file = "pytest_mock-3.8.2-py3-none-any.whl", hash = "sha256:8a9e226d6c0ef09fcf20c94eb3405c388af438a90f3e39687f84166da82d5948"}, +] requests = [ {file = "requests-2.27.1-py2.py3-none-any.whl", hash = "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"}, {file = "requests-2.27.1.tar.gz", hash = "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61"}, diff --git a/pyproject.toml b/pyproject.toml index 44ae341..13d676c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ tenacity = "^8.0.1" [tool.poetry.dev-dependencies] pytest = "^7.1.2" flake8 = "^4.0.1" +pytest-mock = "^3.8.2" [tool.poetry.scripts] podaac-data-subscriber = 'subscriber.podaac_data_subscriber:main' diff --git a/subscriber/podaac_access.py b/subscriber/podaac_access.py index e921d1c..81ba9e8 100644 --- a/subscriber/podaac_access.py +++ b/subscriber/podaac_access.py @@ -10,11 +10,14 @@ from typing import Dict from urllib import request from urllib.error import HTTPError +from urllib.request import urlretrieve import subprocess from urllib.parse import urlencode from urllib.request import Request, urlopen import hashlib from datetime import datetime +import time + import requests @@ -287,6 +290,26 @@ def get_temporal_range(start, end, now): raise ValueError("One of start-date or end-date must be specified.") +def download_file(remote_file, output_path, retries=3): + failed = False + for r in range(retries): + try: + urlretrieve(remote_file, output_path) + except HTTPError as e: + if e.code == 503: + logging.warning(f'Error downloading {remote_file}. Retrying download.') + # back off on sleep time each error... + time.sleep(r) + if r >= retries: + failed = True + else: + #downlaoded fie without 503 + break + + if failed: + raise Exception("Could not download file.") + + # Retry using random exponential backoff if a 500 error is raised. Maximum 10 attempts. @tenacity.retry(wait=tenacity.wait_random_exponential(multiplier=1, max=60), stop=tenacity.stop_after_attempt(10), diff --git a/subscriber/podaac_data_downloader.py b/subscriber/podaac_data_downloader.py index 344c00f..38c97d8 100644 --- a/subscriber/podaac_data_downloader.py +++ b/subscriber/podaac_data_downloader.py @@ -272,7 +272,9 @@ def run(args=None): skip_cnt += 1 continue - urlretrieve(f, output_path) + pa.download_file(f,output_path) + #urlretrieve(f, output_path) + pa.process_file(process_cmd, output_path, args) logging.info(str(datetime.now()) + " SUCCESS: " + f) success_cnt = success_cnt + 1 diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index efe5ab6..23cdc16 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -299,7 +299,9 @@ def run(args=None): skip_cnt += 1 continue - urlretrieve(f, output_path) + #urlretrieve(f, output_path) + pa.download_file(f,output_path) + pa.process_file(process_cmd, output_path, args) logging.info(str(datetime.now()) + " SUCCESS: " + f) success_cnt = success_cnt + 1 diff --git a/tests/test_subscriber.py b/tests/test_subscriber.py index 2c565ab..983cdce 100644 --- a/tests/test_subscriber.py +++ b/tests/test_subscriber.py @@ -1,5 +1,7 @@ from subscriber import podaac_data_subscriber as pds from subscriber import podaac_access as pa + +from urllib.error import HTTPError import pytest import os from pathlib import Path @@ -194,8 +196,10 @@ def test_param_update(): if p[1] == "token": assert p[2] == "newToken" - - +def test_downloader_retry(mocker): + mck = mocker.patch('subscriber.podaac_access.urlretrieve', side_effect=HTTPError("url", 503, "msg", None, None)) + pa.download_file("myUrl", "outputPath") + assert mck.call_count == 3 def validate(args): parser = pds.create_parser() From 78aac3ffb4d6586fbc796935e85e67df41f5b53b Mon Sep 17 00:00:00 2001 From: mike-gangl Date: Thu, 4 Aug 2022 08:50:18 -0700 Subject: [PATCH 10/10] added 503 fixes --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 69b832d..e4995cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ## [unreleased] ### Fixed - Fixed an issue where token-refresh was expecting a dictionary, not a list of tuples +- Fixed an issue with 503 errors on data download not being re-tried. [97](https://github.com/podaac/data-subscriber/issues/97) ### Added - Added citation file creation when data are downloaded [91](https://github.com/podaac/data-subscriber/issues/91). Required some updates to the regression testing.