diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index dbde039..d212c98 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -13,6 +13,7 @@ jobs: build: strategy: fail-fast: false + max-parallel: 2 matrix: python-version: [ "3.7", "3.8", "3.9", "3.10" ] poetry-version: [ "1.1" ] @@ -47,5 +48,7 @@ jobs: username: ${{ secrets.EDL_OPS_USERNAME }} password: ${{ secrets.EDL_OPS_PASSWORD }} - name: Regression Test with pytest + env: + PODAAC_LOGLEVEL: "DEBUG" run: | - poetry run pytest -m "regression" + poetry run pytest -o log_cli=true --log-cli-level=DEBUG -m "regression" diff --git a/CHANGELOG.md b/CHANGELOG.md index f06ebe6..e4995cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) +## [unreleased] +### Fixed +- Fixed an issue where token-refresh was expecting a dictionary, not a list of tuples +- Fixed an issue with 503 errors on data download not being re-tried. [97](https://github.com/podaac/data-subscriber/issues/97) +### Added +- Added citation file creation when data are downloaded [91](https://github.com/podaac/data-subscriber/issues/91). Required some updates to the regression testing. + ## [1.10.2] ### Fixed - Fixed an issue where using a default global bounding box prevented download of data that didn't use the horizontal spatial domain [87](https://github.com/podaac/data-subscriber/issues/87) diff --git a/poetry.lock b/poetry.lock index 0a666cb..94f8c70 100644 --- a/poetry.lock +++ b/poetry.lock @@ -124,8 +124,8 @@ python-versions = ">=3.6" importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} [package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] +testing = ["pytest-benchmark", "pytest"] +dev = ["tox", "pre-commit"] [[package]] name = "py" @@ -184,6 +184,20 @@ tomli = ">=1.0.0" [package.extras] testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +[[package]] +name = "pytest-mock" +version = "3.8.2" +description = "Thin-wrapper around the mock package for easier use with pytest" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +pytest = ">=5.0" + +[package.extras] +dev = ["pre-commit", "tox", "pytest-asyncio"] + [[package]] name = "requests" version = "2.27.1" @@ -257,7 +271,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" python-versions = "^3.7" -content-hash = "8ff84c55fbd8a74bae903e6216e256cea47f571d88fcfa54f0492125746244ed" +content-hash = "c5ece7741408cb266fe803842b66f646317dc3a384e9c54ecbe66a14ce895fed" [metadata.files] atomicwrites = [ @@ -328,6 +342,10 @@ pytest = [ {file = "pytest-7.1.2-py3-none-any.whl", hash = "sha256:13d0e3ccfc2b6e26be000cb6568c832ba67ba32e719443bfe725814d3c42433c"}, {file = "pytest-7.1.2.tar.gz", hash = "sha256:a06a0425453864a270bc45e71f783330a7428defb4230fb5e6a731fde06ecd45"}, ] +pytest-mock = [ + {file = "pytest-mock-3.8.2.tar.gz", hash = "sha256:77f03f4554392558700295e05aed0b1096a20d4a60a4f3ddcde58b0c31c8fca2"}, + {file = "pytest_mock-3.8.2-py3-none-any.whl", hash = "sha256:8a9e226d6c0ef09fcf20c94eb3405c388af438a90f3e39687f84166da82d5948"}, +] requests = [ {file = "requests-2.27.1-py2.py3-none-any.whl", hash = "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"}, {file = "requests-2.27.1.tar.gz", hash = "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61"}, diff --git a/pyproject.toml b/pyproject.toml index 44ae341..13d676c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ tenacity = "^8.0.1" [tool.poetry.dev-dependencies] pytest = "^7.1.2" flake8 = "^4.0.1" +pytest-mock = "^3.8.2" [tool.poetry.scripts] podaac-data-subscriber = 'subscriber.podaac_data_subscriber:main' diff --git a/subscriber/podaac_access.py b/subscriber/podaac_access.py index 327155a..81ba9e8 100644 --- a/subscriber/podaac_access.py +++ b/subscriber/podaac_access.py @@ -10,10 +10,14 @@ from typing import Dict from urllib import request from urllib.error import HTTPError +from urllib.request import urlretrieve import subprocess from urllib.parse import urlencode from urllib.request import Request, urlopen import hashlib +from datetime import datetime +import time + import requests @@ -286,6 +290,26 @@ def get_temporal_range(start, end, now): raise ValueError("One of start-date or end-date must be specified.") +def download_file(remote_file, output_path, retries=3): + failed = False + for r in range(retries): + try: + urlretrieve(remote_file, output_path) + except HTTPError as e: + if e.code == 503: + logging.warning(f'Error downloading {remote_file}. Retrying download.') + # back off on sleep time each error... + time.sleep(r) + if r >= retries: + failed = True + else: + #downlaoded fie without 503 + break + + if failed: + raise Exception("Could not download file.") + + # Retry using random exponential backoff if a 500 error is raised. Maximum 10 attempts. @tenacity.retry(wait=tenacity.wait_random_exponential(multiplier=1, max=60), stop=tenacity.stop_after_attempt(10), @@ -436,3 +460,50 @@ def make_checksum(file_path, algorithm): for chunk in iter(lambda: f.read(4096), b""): hash_alg.update(chunk) return hash_alg.hexdigest() + +def get_cmr_collections(params, verbose=False): + query = urlencode(params) + url = "https://" + cmr + "/search/collections.umm_json?" + query + if verbose: + logging.info(url) + + # Build the request, add the search after header to it if it's not None (e.g. after the first iteration) + req = Request(url) + response = urlopen(req) + result = json.loads(response.read().decode()) + return result + + +def create_citation(collection_json, access_date): + citation_template = "{creator}. {year}. {title}. Ver. {version}. PO.DAAC, CA, USA. Dataset accessed {access_date} at {doi_authority}/{doi}" + + # Better error handling here may be needed... + doi = collection_json['DOI']["DOI"] + doi_authority = collection_json['DOI']["Authority"] + citation = collection_json["CollectionCitations"][0] + creator = citation["Creator"] + release_date = citation["ReleaseDate"] + title = citation["Title"] + version = citation["Version"] + year = datetime.strptime(release_date, "%Y-%m-%dT%H:%M:%S.000Z").year + return citation_template.format(creator=creator, year=year, title=title, version=version, doi_authority=doi_authority, doi=doi, access_date=access_date) + +def create_citation_file(short_name, provider, data_path, token=None, verbose=False): + # get collection umm-c METADATA + params = [ + ('provider', provider), + ('ShortName', short_name) + ] + if token is not None: + params.append(('token', token)) + + collection = get_cmr_collections(params, verbose)['items'][0] + + access_date = datetime.now().strftime("%Y-%m-%d") + + # create citation from umm-c metadata + citation = create_citation(collection['umm'], access_date) + # write file + + with open(data_path + "/" + short_name + ".citation.txt", "w") as text_file: + text_file.write(citation) diff --git a/subscriber/podaac_data_downloader.py b/subscriber/podaac_data_downloader.py index 158c541..38c97d8 100644 --- a/subscriber/podaac_data_downloader.py +++ b/subscriber/podaac_data_downloader.py @@ -193,7 +193,11 @@ def run(args=None): except HTTPError as e: if e.code == 401: token = pa.refresh_token(token, 'podaac-subscriber') - params['token'] = token + # Updated: This is not always a dictionary... + # in fact, here it's always a list of tuples + for i, p in enumerate(params) : + if p[1] == "token": + params[i] = ("token", token) results = pa.get_search_results(params, args.verbose) else: raise e @@ -268,7 +272,9 @@ def run(args=None): skip_cnt += 1 continue - urlretrieve(f, output_path) + pa.download_file(f,output_path) + #urlretrieve(f, output_path) + pa.process_file(process_cmd, output_path, args) logging.info(str(datetime.now()) + " SUCCESS: " + f) success_cnt = success_cnt + 1 @@ -284,6 +290,14 @@ def run(args=None): logging.info("Downloaded Files: " + str(success_cnt)) logging.info("Failed Files: " + str(failure_cnt)) logging.info("Skipped Files: " + str(skip_cnt)) + + #create citation file if success > 0 + if success_cnt > 0: + try: + pa.create_citation_file(short_name, provider, data_path, token, args.verbose) + except: + logging.debug("Error generating citation",exc_info=True) + pa.delete_token(token_url, token) logging.info("END\n\n") diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index 04df68d..23cdc16 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -218,7 +218,12 @@ def run(args=None): except HTTPError as e: if e.code == 401: token = pa.refresh_token(token, 'podaac-subscriber') - params['token'] = token + # Updated: This is not always a dictionary... + # in fact, here it's always a list of tuples + for i, p in enumerate(params) : + if p[1] == "token": + params[i] = ("token", token) + #params['token'] = token results = pa.get_search_results(params, args.verbose) else: raise e @@ -294,7 +299,9 @@ def run(args=None): skip_cnt += 1 continue - urlretrieve(f, output_path) + #urlretrieve(f, output_path) + pa.download_file(f,output_path) + pa.process_file(process_cmd, output_path, args) logging.info(str(datetime.now()) + " SUCCESS: " + f) success_cnt = success_cnt + 1 @@ -314,6 +321,13 @@ def run(args=None): logging.info("Downloaded Files: " + str(success_cnt)) logging.info("Failed Files: " + str(failure_cnt)) logging.info("Skipped Files: " + str(skip_cnt)) + + if success_cnt > 0: + try: + pa.create_citation_file(short_name, provider, data_path, token, args.verbose) + except: + logging.debug("Error generating citation", exc_info=True) + pa.delete_token(token_url, token) logging.info("END\n\n") #exit(0) diff --git a/tests/test_downloader_regression.py b/tests/test_downloader_regression.py index 875a517..cea7df0 100644 --- a/tests/test_downloader_regression.py +++ b/tests/test_downloader_regression.py @@ -19,10 +19,11 @@ def create_downloader_args(args): @pytest.mark.regression def test_downloader_limit_MUR(): shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2', ignore_errors=True) - args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-30T00:00:00Z --limit 1'.split()) + args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-30T00:00:00Z --limit 1 --verbose'.split()) pdd.run(args2) - # count number of files downloaded... - assert len([name for name in os.listdir('./MUR25-JPL-L4-GLOB-v04.2') if os.path.isfile('./MUR25-JPL-L4-GLOB-v04.2/' + name)])==1 + # So running the test in parallel, sometimes we get a 401 on the token... + # Let's ensure we're only looking for data files here + assert len([name for name in os.listdir('./MUR25-JPL-L4-GLOB-v04.2') if os.path.isfile('./MUR25-JPL-L4-GLOB-v04.2/' + name) and "citation.txt" not in name ])==1 shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2') #Test the downlaoder on MUR25 data for start/stop/, yyyy/mmm/dd dir structure, @@ -31,7 +32,7 @@ def test_downloader_limit_MUR(): @pytest.mark.regression def test_downloader_MUR(): shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2', ignore_errors=True) - args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4'.split()) + args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 --verbose'.split()) pdd.run(args2) assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') @@ -54,7 +55,7 @@ def test_downloader_MUR(): t1 = os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') # Set the args to --force to re-download those data - args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 -f'.split()) + args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 -f --verbose'.split()) pdd.run(args2) assert t1 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') assert t2 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') @@ -73,6 +74,10 @@ def test_downloader_GRACE_with_SHA_512(tmpdir): pdd.run(args) assert len( os.listdir(directory_str) ) > 0 filename = directory_str + "/" + os.listdir(directory_str)[0] + #if the citation file was chosen above, get the next file since citation file is updated on successful run + if "citation.txt" in filename: + filename = directory_str + "/" + os.listdir(directory_str)[1] + modified_time_1 = os.path.getmtime(filename) print( modified_time_1 ) diff --git a/tests/test_subscriber.py b/tests/test_subscriber.py index 2e9d4cb..983cdce 100644 --- a/tests/test_subscriber.py +++ b/tests/test_subscriber.py @@ -1,9 +1,14 @@ from subscriber import podaac_data_subscriber as pds from subscriber import podaac_access as pa + +from urllib.error import HTTPError import pytest import os from pathlib import Path import shutil +import json +import tempfile +from os.path import exists def test_temporal_range(): @@ -23,6 +28,46 @@ def cleanup_update_test(): print("Cleanup...") shutil.rmtree(data_dir_with_updates) +def test_create_citation_file(): + with tempfile.TemporaryDirectory() as tmpdirname: + pa.create_citation_file("SWOT_SIMULATED_L2_KARIN_SSH_GLORYS_CALVAL_V1", "POCLOUD", tmpdirname) + assert exists(tmpdirname+"/SWOT_SIMULATED_L2_KARIN_SSH_GLORYS_CALVAL_V1.citation.txt") + +def test_citation_creation(): + collection_umm = '''{ + "DOI": { + "DOI": "10.5067/KARIN-2GLC1", + "Authority": "https://doi.org" + }, + "CollectionCitations": [ + { + "Creator": "SWOT", + "ReleasePlace": "PO.DAAC", + "Title": "SWOT Level-2 Simulated SSH from MITgcm LLC4320 Science Quality Version 1.0", + "Publisher": "PO.DAAC", + "ReleaseDate": "2022-01-31T00:00:00.000Z", + "Version": "1.0" + }, + { + "Creator": "CNES/CLS", + "ReleasePlace": "CNES/AVISO", + "Title": "Simulated SWOT products", + "OnlineResource": { + "Linkage": "http://doi.org/10.24400/527896/a01-2021.006", + "Name": " Simulated SWOT Sea Surface Height products", + "Description": "Simulated SWOT Sea Surface Height products KaRIn and Nadir.", + "MimeType": "text/html" + }, + "Publisher": "PODAAC", + "ReleaseDate": "2021-11-01T00:00:00.000Z", + "Version": "1.0" + } + ] + } + ''' + collection_umm_json = json.loads(collection_umm) + citation = pa.create_citation(collection_umm_json, "2022-07-21") + assert citation == "SWOT. 2022. SWOT Level-2 Simulated SSH from MITgcm LLC4320 Science Quality Version 1.0. Ver. 1.0. PO.DAAC, CA, USA. Dataset accessed 2022-07-21 at https://doi.org/10.5067/KARIN-2GLC1" def test_search_after(): # cmr query: https://cmr.earthdata.nasa.gov/search/granules.umm_json?page_size=2000&sort_key=-start_date&provider=POCLOUD&ShortName=JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F&temporal=2000-01-01T10%3A00%3A00Z%2C2022-04-15T00%3A00%3A00Z&bounding_box=-180%2C-90%2C180%2C90 @@ -136,6 +181,25 @@ def test_validate(): # with pytest.raises(SystemExit): # a = validate(["-c", "viirs", "-d", "/data", "-m","60b"]) +def test_param_update(): + params = [ + ('sort_key', "-start_date"), + ('provider', "'POCLOUD'"), + ('token', "123"), + ] + + for i, p in enumerate(params) : + if p[1] == "token": + params[i] = ("token", "newToken") + + for i,p in enumerate(params) : + if p[1] == "token": + assert p[2] == "newToken" + +def test_downloader_retry(mocker): + mck = mocker.patch('subscriber.podaac_access.urlretrieve', side_effect=HTTPError("url", 503, "msg", None, None)) + pa.download_file("myUrl", "outputPath") + assert mck.call_count == 3 def validate(args): parser = pds.create_parser() diff --git a/tests/test_subscriber_regression.py b/tests/test_subscriber_regression.py index 07dc2f8..ffa3685 100644 --- a/tests/test_subscriber_regression.py +++ b/tests/test_subscriber_regression.py @@ -53,7 +53,7 @@ def test_subscriber_MUR_update_file_no_redownload(): except OSError as e: print("Expecting this...") - args2 = create_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4'.split()) + args2 = create_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 --verbose'.split()) pds.run(args2) assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc') assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')