Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issues/91 #92

Merged
merged 11 commits into from
Aug 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ jobs:
build:
strategy:
fail-fast: false
max-parallel: 2
matrix:
python-version: [ "3.7", "3.8", "3.9", "3.10" ]
poetry-version: [ "1.1" ]
Expand Down Expand Up @@ -47,5 +48,7 @@ jobs:
username: ${{ secrets.EDL_OPS_USERNAME }}
password: ${{ secrets.EDL_OPS_PASSWORD }}
- name: Regression Test with pytest
env:
PODAAC_LOGLEVEL: "DEBUG"
run: |
poetry run pytest -m "regression"
poetry run pytest -o log_cli=true --log-cli-level=DEBUG -m "regression"
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)

## [unreleased]
### Fixed
- Fixed an issue where token-refresh was expecting a dictionary, not a list of tuples
- Fixed an issue with 503 errors on data download not being re-tried. [97](https://github.com/podaac/data-subscriber/issues/97)
### Added
- Added citation file creation when data are downloaded [91](https://github.com/podaac/data-subscriber/issues/91). Required some updates to the regression testing.

## [1.10.2]
### Fixed
- Fixed an issue where using a default global bounding box prevented download of data that didn't use the horizontal spatial domain [87](https://github.com/podaac/data-subscriber/issues/87)
Expand Down
24 changes: 21 additions & 3 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ tenacity = "^8.0.1"
[tool.poetry.dev-dependencies]
pytest = "^7.1.2"
flake8 = "^4.0.1"
pytest-mock = "^3.8.2"

[tool.poetry.scripts]
podaac-data-subscriber = 'subscriber.podaac_data_subscriber:main'
Expand Down
71 changes: 71 additions & 0 deletions subscriber/podaac_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,14 @@
from typing import Dict
from urllib import request
from urllib.error import HTTPError
from urllib.request import urlretrieve
import subprocess
from urllib.parse import urlencode
from urllib.request import Request, urlopen
import hashlib
from datetime import datetime
import time


import requests

Expand Down Expand Up @@ -286,6 +290,26 @@ def get_temporal_range(start, end, now):
raise ValueError("One of start-date or end-date must be specified.")


def download_file(remote_file, output_path, retries=3):
    """Download ``remote_file`` to ``output_path``, retrying on HTTP 503.

    Parameters
    ----------
    remote_file : str
        URL of the granule to download.
    output_path : str
        Local filesystem path to write the file to.
    retries : int
        Maximum number of download attempts (default 3).

    Raises
    ------
    HTTPError
        Immediately, for any HTTP error other than 503.
    Exception
        When every attempt fails with a 503.
    """
    for attempt in range(retries):
        try:
            urlretrieve(remote_file, output_path)
        except HTTPError as e:
            # Only 503 (service unavailable) is transient enough to retry;
            # anything else is surfaced to the caller right away.
            if e.code != 503:
                raise
            logging.warning(f'Error downloading {remote_file}. Retrying download.')
            # back off on sleep time each error: 0s, 1s, 2s, ...
            time.sleep(attempt)
        else:
            # downloaded file without a 503
            return
    # All attempts hit a 503 — the original intent (never reached before
    # because the loop counter could not equal `retries` inside the loop).
    raise Exception("Could not download file.")


# Retry using random exponential backoff if a 500 error is raised. Maximum 10 attempts.
@tenacity.retry(wait=tenacity.wait_random_exponential(multiplier=1, max=60),
stop=tenacity.stop_after_attempt(10),
Expand Down Expand Up @@ -436,3 +460,50 @@ def make_checksum(file_path, algorithm):
for chunk in iter(lambda: f.read(4096), b""):
hash_alg.update(chunk)
return hash_alg.hexdigest()

def get_cmr_collections(params, verbose=False):
    """Search CMR for collections matching ``params``.

    Parameters
    ----------
    params : sequence of (key, value) tuples
        Query parameters for the CMR collection search (e.g. provider,
        ShortName, token).
    verbose : bool
        When True, log the request URL at INFO level.

    Returns
    -------
    dict
        The decoded ``umm_json`` collection-search response.
    """
    query = urlencode(params)
    url = "https://" + cmr + "/search/collections.umm_json?" + query
    if verbose:
        logging.info(url)

    # Build the request, add the search after header to it if it's not None (e.g. after the first iteration)
    req = Request(url)
    # Close the response explicitly instead of leaking the connection.
    with urlopen(req) as response:
        return json.loads(response.read().decode())


def create_citation(collection_json, access_date):
    """Build a PO.DAAC dataset citation string from umm-c collection metadata.

    Parameters
    ----------
    collection_json : dict
        The ``umm`` portion of a CMR collection record; must contain a
        ``DOI`` mapping and at least one entry in ``CollectionCitations``.
    access_date : str
        Date (``YYYY-MM-DD``) the data were accessed, embedded verbatim.

    Returns
    -------
    str
        A citation of the form
        "<creator>. <year>. <title>. Ver. <version>. PO.DAAC, CA, USA.
        Dataset accessed <access_date> at <doi_authority>/<doi>".

    Raises
    ------
    KeyError
        If a required metadata field is missing (no fallback handling).
    """
    doi_entry = collection_json["DOI"]
    doi = doi_entry["DOI"]
    doi_authority = doi_entry["Authority"]
    cite_meta = collection_json["CollectionCitations"][0]
    # Release year is derived from the first citation's ReleaseDate timestamp.
    year = datetime.strptime(cite_meta["ReleaseDate"], "%Y-%m-%dT%H:%M:%S.000Z").year
    return (
        f"{cite_meta['Creator']}. {year}. {cite_meta['Title']}. "
        f"Ver. {cite_meta['Version']}. PO.DAAC, CA, USA. "
        f"Dataset accessed {access_date} at {doi_authority}/{doi}"
    )

def create_citation_file(short_name, provider, data_path, token=None, verbose=False):
    """Write a ``<short_name>.citation.txt`` file into ``data_path``.

    Looks up the collection's umm-c metadata in CMR (optionally with an
    EDL token appended to the query) and renders it through
    ``create_citation`` using today's date as the access date.

    Raises whatever ``get_cmr_collections`` / ``create_citation`` raise;
    callers are expected to treat citation creation as best-effort.
    """
    # Query CMR for the collection's umm-c metadata.
    query_params = [
        ('provider', provider),
        ('ShortName', short_name),
    ]
    if token is not None:
        query_params.append(('token', token))

    collection = get_cmr_collections(query_params, verbose)['items'][0]

    accessed = datetime.now().strftime("%Y-%m-%d")

    # Render the citation from the umm-c metadata and write it out.
    citation_text = create_citation(collection['umm'], accessed)
    with open(data_path + "/" + short_name + ".citation.txt", "w") as out:
        out.write(citation_text)
18 changes: 16 additions & 2 deletions subscriber/podaac_data_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,11 @@ def run(args=None):
except HTTPError as e:
if e.code == 401:
token = pa.refresh_token(token, 'podaac-subscriber')
params['token'] = token
# Updated: This is not always a dictionary...
# in fact, here it's always a list of tuples
for i, p in enumerate(params) :
if p[1] == "token":
params[i] = ("token", token)
results = pa.get_search_results(params, args.verbose)
else:
raise e
Expand Down Expand Up @@ -268,7 +272,9 @@ def run(args=None):
skip_cnt += 1
continue

urlretrieve(f, output_path)
pa.download_file(f,output_path)
#urlretrieve(f, output_path)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove the commented out code


pa.process_file(process_cmd, output_path, args)
logging.info(str(datetime.now()) + " SUCCESS: " + f)
success_cnt = success_cnt + 1
Expand All @@ -284,6 +290,14 @@ def run(args=None):
logging.info("Downloaded Files: " + str(success_cnt))
logging.info("Failed Files: " + str(failure_cnt))
logging.info("Skipped Files: " + str(skip_cnt))

#create citation file if success > 0
if success_cnt > 0:
try:
pa.create_citation_file(short_name, provider, data_path, token, args.verbose)
except:
logging.debug("Error generating citation",exc_info=True)

pa.delete_token(token_url, token)
logging.info("END\n\n")

Expand Down
18 changes: 16 additions & 2 deletions subscriber/podaac_data_subscriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,12 @@ def run(args=None):
except HTTPError as e:
if e.code == 401:
token = pa.refresh_token(token, 'podaac-subscriber')
params['token'] = token
# Updated: This is not always a dictionary...
# in fact, here it's always a list of tuples
for i, p in enumerate(params) :
if p[1] == "token":
params[i] = ("token", token)
#params['token'] = token
results = pa.get_search_results(params, args.verbose)
else:
raise e
Expand Down Expand Up @@ -294,7 +299,9 @@ def run(args=None):
skip_cnt += 1
continue

urlretrieve(f, output_path)
#urlretrieve(f, output_path)
pa.download_file(f,output_path)

pa.process_file(process_cmd, output_path, args)
logging.info(str(datetime.now()) + " SUCCESS: " + f)
success_cnt = success_cnt + 1
Expand All @@ -314,6 +321,13 @@ def run(args=None):
logging.info("Downloaded Files: " + str(success_cnt))
logging.info("Failed Files: " + str(failure_cnt))
logging.info("Skipped Files: " + str(skip_cnt))

if success_cnt > 0:
try:
pa.create_citation_file(short_name, provider, data_path, token, args.verbose)
except:
logging.debug("Error generating citation", exc_info=True)

pa.delete_token(token_url, token)
logging.info("END\n\n")
#exit(0)
Expand Down
15 changes: 10 additions & 5 deletions tests/test_downloader_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@ def create_downloader_args(args):
@pytest.mark.regression
def test_downloader_limit_MUR():
    # Regression test: downloads real MUR25 data from PO.DAAC, so it needs
    # network access and valid EDL credentials.
    shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2', ignore_errors=True)
    # NOTE(review): the next two assignments are duplicated (this text is a
    # rendered diff carrying both the pre- and post-change line); only the
    # second one (with --verbose) takes effect.
    args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-30T00:00:00Z --limit 1'.split())
    args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-30T00:00:00Z --limit 1 --verbose'.split())
    pdd.run(args2)
    # count number of files downloaded...
    # So running the test in parallel, sometimes we get a 401 on the token...
    # Let's ensure we're only looking for data files here
    assert len([name for name in os.listdir('./MUR25-JPL-L4-GLOB-v04.2') if os.path.isfile('./MUR25-JPL-L4-GLOB-v04.2/' + name) and "citation.txt" not in name ])==1
    shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2')

#Test the downloader on MUR25 data for start/stop/, yyyy/mmm/dd dir structure,
Expand All @@ -31,7 +32,7 @@ def test_downloader_limit_MUR():
@pytest.mark.regression
def test_downloader_MUR():
shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2', ignore_errors=True)
args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4'.split())
args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 --verbose'.split())
pdd.run(args2)
assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')
assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')
Expand All @@ -54,7 +55,7 @@ def test_downloader_MUR():
t1 = os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')

# Set the args to --force to re-download those data
args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 -f'.split())
args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 -f --verbose'.split())
pdd.run(args2)
assert t1 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')
assert t2 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')
Expand All @@ -73,6 +74,10 @@ def test_downloader_GRACE_with_SHA_512(tmpdir):
pdd.run(args)
assert len( os.listdir(directory_str) ) > 0
filename = directory_str + "/" + os.listdir(directory_str)[0]
#if the citation file was chosen above, get the next file since citation file is updated on successful run
if "citation.txt" in filename:
filename = directory_str + "/" + os.listdir(directory_str)[1]

modified_time_1 = os.path.getmtime(filename)
print( modified_time_1 )

Expand Down
64 changes: 64 additions & 0 deletions tests/test_subscriber.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
from subscriber import podaac_data_subscriber as pds
from subscriber import podaac_access as pa

from urllib.error import HTTPError
import pytest
import os
from pathlib import Path
import shutil
import json
import tempfile
from os.path import exists

def test_temporal_range():

Expand All @@ -23,6 +28,46 @@ def cleanup_update_test():
print("Cleanup...")
shutil.rmtree(data_dir_with_updates)

def test_create_citation_file():
    # Integration test: queries live CMR for the SWOT simulated collection and
    # verifies a citation file is written to the temp dir (network required).
    with tempfile.TemporaryDirectory() as tmpdirname:
        pa.create_citation_file("SWOT_SIMULATED_L2_KARIN_SSH_GLORYS_CALVAL_V1", "POCLOUD", tmpdirname)
        assert exists(tmpdirname+"/SWOT_SIMULATED_L2_KARIN_SSH_GLORYS_CALVAL_V1.citation.txt")

def test_citation_creation():
    # Unit test: feeds a canned umm-c record (two CollectionCitations entries;
    # only the first should be used) through pa.create_citation and pins the
    # exact citation string, including year extraction from ReleaseDate.
    collection_umm = '''{
    "DOI": {
    "DOI": "10.5067/KARIN-2GLC1",
    "Authority": "https://doi.org"
    },
    "CollectionCitations": [
    {
    "Creator": "SWOT",
    "ReleasePlace": "PO.DAAC",
    "Title": "SWOT Level-2 Simulated SSH from MITgcm LLC4320 Science Quality Version 1.0",
    "Publisher": "PO.DAAC",
    "ReleaseDate": "2022-01-31T00:00:00.000Z",
    "Version": "1.0"
    },
    {
    "Creator": "CNES/CLS",
    "ReleasePlace": "CNES/AVISO",
    "Title": "Simulated SWOT products",
    "OnlineResource": {
    "Linkage": "http://doi.org/10.24400/527896/a01-2021.006",
    "Name": " Simulated SWOT Sea Surface Height products",
    "Description": "Simulated SWOT Sea Surface Height products KaRIn and Nadir.",
    "MimeType": "text/html"
    },
    "Publisher": "PODAAC",
    "ReleaseDate": "2021-11-01T00:00:00.000Z",
    "Version": "1.0"
    }
    ]
    }
    '''
    collection_umm_json = json.loads(collection_umm)
    citation = pa.create_citation(collection_umm_json, "2022-07-21")
    assert citation == "SWOT. 2022. SWOT Level-2 Simulated SSH from MITgcm LLC4320 Science Quality Version 1.0. Ver. 1.0. PO.DAAC, CA, USA. Dataset accessed 2022-07-21 at https://doi.org/10.5067/KARIN-2GLC1"

def test_search_after():
# cmr query: https://cmr.earthdata.nasa.gov/search/granules.umm_json?page_size=2000&sort_key=-start_date&provider=POCLOUD&ShortName=JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F&temporal=2000-01-01T10%3A00%3A00Z%2C2022-04-15T00%3A00%3A00Z&bounding_box=-180%2C-90%2C180%2C90
Expand Down Expand Up @@ -136,6 +181,25 @@ def test_validate():
# with pytest.raises(SystemExit):
# a = validate(["-c", "viirs", "-d", "/data", "-m","60b"])

def test_param_update():
    """Verify the list-of-tuples token replacement pattern used by run().

    Each param is a ``(key, value)`` tuple, so the key is index 0 and the
    value is index 1. The original test compared ``p[1]`` (the value) to
    "token" and indexed ``p[2]`` (out of range for a 2-tuple), so its
    assertions never executed and the test passed vacuously.
    """
    params = [
        ('sort_key', "-start_date"),
        ('provider', "'POCLOUD'"),
        ('token', "123"),
    ]

    # Replace the value of the 'token' entry, keyed on the tuple's first item.
    for i, p in enumerate(params):
        if p[0] == "token":
            params[i] = ("token", "newToken")

    # Confirm the replacement actually happened (guard against a vacuous pass).
    replaced = False
    for p in params:
        if p[0] == "token":
            assert p[1] == "newToken"
            replaced = True
    assert replaced

def test_downloader_retry(mocker):
    # Patch urlretrieve to always raise HTTP 503 and check that download_file
    # attempts the download exactly `retries` (default 3) times.
    # NOTE(review): this passes only because download_file currently returns
    # silently after exhausting retries instead of raising; if that is fixed
    # to raise, this call must be wrapped in pytest.raises(Exception).
    mck = mocker.patch('subscriber.podaac_access.urlretrieve', side_effect=HTTPError("url", 503, "msg", None, None))
    pa.download_file("myUrl", "outputPath")
    assert mck.call_count == 3

def validate(args):
parser = pds.create_parser()
Expand Down
Loading