Issues/91 (#92)
* added citation creation tests and functionality to subscriber and downloader

* added verbose option to create_citation_file command; previously hard-coded

* updated changelog (whoops) and fixed regression tests:
1. Issue where the newly downloaded citation file affected the download counts
2. Issue where the logic for determining whether a file's modified time had changed was picking up the new citation file, which _always_ gets rewritten to update the 'last accessed' date.

* updated request to include exc_info in warning; fixed issue where params not being a dictionary caused errors

* changed a warning to debug for the citation file; fixed test issues

* Enable debug logging during regression tests and set max parallel workflows to 2

* added output to pytest

* fixed test to only look for downloaded data files, not the citation file, due to 'random' CMR errors when creating a citation

* added mock testing and retry on 503

* added 503 fixes

Co-authored-by: Frank Greguska <Francis.Greguska@jpl.nasa.gov>
mike-gangl and Frank Greguska authored Aug 4, 2022
1 parent 368c031 commit 9793d67
Showing 10 changed files with 211 additions and 14 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/python-app.yml
@@ -13,6 +13,7 @@ jobs:
  build:
    strategy:
      fail-fast: false
      max-parallel: 2
      matrix:
        python-version: [ "3.7", "3.8", "3.9", "3.10" ]
        poetry-version: [ "1.1" ]
@@ -47,5 +48,7 @@ jobs:
          username: ${{ secrets.EDL_OPS_USERNAME }}
          password: ${{ secrets.EDL_OPS_PASSWORD }}
      - name: Regression Test with pytest
        env:
          PODAAC_LOGLEVEL: "DEBUG"
        run: |
          poetry run pytest -m "regression"
          poetry run pytest -o log_cli=true --log-cli-level=DEBUG -m "regression"
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,13 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)

## [unreleased]
### Fixed
- Fixed an issue where the token refresh expected a dictionary rather than a list of tuples
- Fixed an issue where 503 errors on data download were not retried. [97](https://github.com/podaac/data-subscriber/issues/97)
### Added
- Added citation file creation when data are downloaded [91](https://github.com/podaac/data-subscriber/issues/91). Required some updates to the regression testing.

## [1.10.2]
### Fixed
- Fixed an issue where using a default global bounding box prevented download of data that didn't use the horizontal spatial domain [87](https://github.com/podaac/data-subscriber/issues/87)
24 changes: 21 additions & 3 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -19,6 +19,7 @@ tenacity = "^8.0.1"
[tool.poetry.dev-dependencies]
pytest = "^7.1.2"
flake8 = "^4.0.1"
pytest-mock = "^3.8.2"

[tool.poetry.scripts]
podaac-data-subscriber = 'subscriber.podaac_data_subscriber:main'
71 changes: 71 additions & 0 deletions subscriber/podaac_access.py
@@ -10,10 +10,14 @@
from typing import Dict
from urllib import request
from urllib.error import HTTPError
from urllib.request import urlretrieve
import subprocess
from urllib.parse import urlencode
from urllib.request import Request, urlopen
import hashlib
from datetime import datetime
import time


import requests

@@ -286,6 +290,26 @@ def get_temporal_range(start, end, now):
    raise ValueError("One of start-date or end-date must be specified.")


def download_file(remote_file, output_path, retries=3):
    failed = False
    for r in range(retries):
        try:
            urlretrieve(remote_file, output_path)
        except HTTPError as e:
            if e.code == 503:
                logging.warning(f'Error downloading {remote_file}. Retrying download.')
                # back off on sleep time after each error...
                time.sleep(r)
            # r runs from 0 to retries - 1, so flag failure on the last attempt
            if r >= retries - 1:
                failed = True
        else:
            # downloaded file without a 503
            break

    if failed:
        raise Exception("Could not download file.")
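For orientation, a minimal usage sketch of the new helper (the granule URL and destination path are placeholders, not values from this repository):

# Hypothetical caller of the new download_file helper; URL and path are illustrative.
from subscriber import podaac_access as pa

pa.download_file("https://archive.example.com/granule.nc", "./granule.nc", retries=3)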


# Retry using random exponential backoff if a 500 error is raised. Maximum 10 attempts.
@tenacity.retry(wait=tenacity.wait_random_exponential(multiplier=1, max=60),
                stop=tenacity.stop_after_attempt(10),
@@ -436,3 +460,50 @@ def make_checksum(file_path, algorithm):
        for chunk in iter(lambda: f.read(4096), b""):
            hash_alg.update(chunk)
    return hash_alg.hexdigest()

def get_cmr_collections(params, verbose=False):
    query = urlencode(params)
    url = "https://" + cmr + "/search/collections.umm_json?" + query
    if verbose:
        logging.info(url)

    # Build and send the request
    req = Request(url)
    response = urlopen(req)
    result = json.loads(response.read().decode())
    return result


def create_citation(collection_json, access_date):
    citation_template = "{creator}. {year}. {title}. Ver. {version}. PO.DAAC, CA, USA. Dataset accessed {access_date} at {doi_authority}/{doi}"

    # Better error handling here may be needed...
    doi = collection_json['DOI']["DOI"]
    doi_authority = collection_json['DOI']["Authority"]
    citation = collection_json["CollectionCitations"][0]
    creator = citation["Creator"]
    release_date = citation["ReleaseDate"]
    title = citation["Title"]
    version = citation["Version"]
    year = datetime.strptime(release_date, "%Y-%m-%dT%H:%M:%S.000Z").year
    return citation_template.format(creator=creator, year=year, title=title, version=version, doi_authority=doi_authority, doi=doi, access_date=access_date)
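As a worked example, formatting the template with the UMM-C fixture used in test_citation_creation further down this diff yields:

# Illustrative only; collection_umm_json mirrors the test fixture below.
citation = create_citation(collection_umm_json, access_date="2022-07-21")
# citation == "SWOT. 2022. SWOT Level-2 Simulated SSH from MITgcm LLC4320 Science
#              Quality Version 1.0. Ver. 1.0. PO.DAAC, CA, USA. Dataset accessed
#              2022-07-21 at https://doi.org/10.5067/KARIN-2GLC1"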

def create_citation_file(short_name, provider, data_path, token=None, verbose=False):
    # get collection umm-c metadata
    params = [
        ('provider', provider),
        ('ShortName', short_name)
    ]
    if token is not None:
        params.append(('token', token))

    collection = get_cmr_collections(params, verbose)['items'][0]

    access_date = datetime.now().strftime("%Y-%m-%d")

    # create citation from umm-c metadata
    citation = create_citation(collection['umm'], access_date)

    # write file
    with open(data_path + "/" + short_name + ".citation.txt", "w") as text_file:
        text_file.write(citation)
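A minimal sketch of how the downloader and subscriber invoke this at the end of a successful run (the short name, provider, and data path are assumed for illustration):

# Sketch under assumed arguments: writes <data_path>/<short_name>.citation.txt
try:
    pa.create_citation_file("MUR25-JPL-L4-GLOB-v04.2", "POCLOUD", "./data", token=None, verbose=True)
except Exception:
    logging.debug("Error generating citation", exc_info=True)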
18 changes: 16 additions & 2 deletions subscriber/podaac_data_downloader.py
@@ -193,7 +193,11 @@ def run(args=None):
    except HTTPError as e:
        if e.code == 401:
            token = pa.refresh_token(token, 'podaac-subscriber')
            params['token'] = token
            # Updated: This is not always a dictionary...
            # in fact, here it's always a list of (name, value) tuples
            for i, p in enumerate(params):
                if p[0] == "token":
                    params[i] = ("token", token)
            results = pa.get_search_results(params, args.verbose)
        else:
            raise e
@@ -268,7 +272,9 @@ def run(args=None):
            skip_cnt += 1
            continue

        urlretrieve(f, output_path)
        pa.download_file(f, output_path)
        # urlretrieve(f, output_path)

        pa.process_file(process_cmd, output_path, args)
        logging.info(str(datetime.now()) + " SUCCESS: " + f)
        success_cnt = success_cnt + 1
@@ -284,6 +290,14 @@ def run(args=None):
logging.info("Downloaded Files: " + str(success_cnt))
logging.info("Failed Files: " + str(failure_cnt))
logging.info("Skipped Files: " + str(skip_cnt))

#create citation file if success > 0
if success_cnt > 0:
try:
pa.create_citation_file(short_name, provider, data_path, token, args.verbose)
except:
logging.debug("Error generating citation",exc_info=True)

pa.delete_token(token_url, token)
logging.info("END\n\n")

18 changes: 16 additions & 2 deletions subscriber/podaac_data_subscriber.py
@@ -218,7 +218,12 @@ def run(args=None):
    except HTTPError as e:
        if e.code == 401:
            token = pa.refresh_token(token, 'podaac-subscriber')
            params['token'] = token
            # Updated: This is not always a dictionary...
            # in fact, here it's always a list of (name, value) tuples
            for i, p in enumerate(params):
                if p[0] == "token":
                    params[i] = ("token", token)
            # params['token'] = token
            results = pa.get_search_results(params, args.verbose)
        else:
            raise e
@@ -294,7 +299,9 @@ def run(args=None):
            skip_cnt += 1
            continue

        urlretrieve(f, output_path)
        # urlretrieve(f, output_path)
        pa.download_file(f, output_path)

        pa.process_file(process_cmd, output_path, args)
        logging.info(str(datetime.now()) + " SUCCESS: " + f)
        success_cnt = success_cnt + 1
@@ -314,6 +321,13 @@ def run(args=None):
logging.info("Downloaded Files: " + str(success_cnt))
logging.info("Failed Files: " + str(failure_cnt))
logging.info("Skipped Files: " + str(skip_cnt))

if success_cnt > 0:
try:
pa.create_citation_file(short_name, provider, data_path, token, args.verbose)
except:
logging.debug("Error generating citation", exc_info=True)

pa.delete_token(token_url, token)
logging.info("END\n\n")
#exit(0)
15 changes: 10 additions & 5 deletions tests/test_downloader_regression.py
@@ -19,10 +19,11 @@ def create_downloader_args(args):
@pytest.mark.regression
def test_downloader_limit_MUR():
    shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2', ignore_errors=True)
    args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-30T00:00:00Z --limit 1'.split())
    args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-30T00:00:00Z --limit 1 --verbose'.split())
    pdd.run(args2)
    # count number of files downloaded...
    assert len([name for name in os.listdir('./MUR25-JPL-L4-GLOB-v04.2') if os.path.isfile('./MUR25-JPL-L4-GLOB-v04.2/' + name)]) == 1
    # Running the tests in parallel sometimes yields a 401 on the token, so only
    # count data files here, not the citation file.
    assert len([name for name in os.listdir('./MUR25-JPL-L4-GLOB-v04.2') if os.path.isfile('./MUR25-JPL-L4-GLOB-v04.2/' + name) and "citation.txt" not in name]) == 1
    shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2')

# Test the downloader on MUR25 data for start/stop, yyyy/mm/dd dir structure,
@@ -31,7 +32,7 @@ def test_downloader_limit_MUR():
@pytest.mark.regression
def test_downloader_MUR():
    shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2', ignore_errors=True)
    args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4'.split())
    args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 --verbose'.split())
    pdd.run(args2)
    assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')
    assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')
@@ -54,7 +55,7 @@ def test_downloader_MUR():
    t1 = os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')

    # Set the args to --force to re-download those data
    args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 -f'.split())
    args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 -f --verbose'.split())
    pdd.run(args2)
    assert t1 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')
    assert t2 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')
@@ -73,6 +74,10 @@ def test_downloader_GRACE_with_SHA_512(tmpdir):
    pdd.run(args)
    assert len(os.listdir(directory_str)) > 0
    filename = directory_str + "/" + os.listdir(directory_str)[0]
    # if the citation file was chosen above, get the next file, since the citation
    # file is rewritten on every successful run
    if "citation.txt" in filename:
        filename = directory_str + "/" + os.listdir(directory_str)[1]

    modified_time_1 = os.path.getmtime(filename)
    print(modified_time_1)
64 changes: 64 additions & 0 deletions tests/test_subscriber.py
@@ -1,9 +1,14 @@
from subscriber import podaac_data_subscriber as pds
from subscriber import podaac_access as pa

from urllib.error import HTTPError
import pytest
import os
from pathlib import Path
import shutil
import json
import tempfile
from os.path import exists

def test_temporal_range():

@@ -23,6 +28,46 @@ def cleanup_update_test():
print("Cleanup...")
shutil.rmtree(data_dir_with_updates)

def test_create_citation_file():
    with tempfile.TemporaryDirectory() as tmpdirname:
        pa.create_citation_file("SWOT_SIMULATED_L2_KARIN_SSH_GLORYS_CALVAL_V1", "POCLOUD", tmpdirname)
        assert exists(tmpdirname + "/SWOT_SIMULATED_L2_KARIN_SSH_GLORYS_CALVAL_V1.citation.txt")

def test_citation_creation():
    collection_umm = '''{
        "DOI": {
            "DOI": "10.5067/KARIN-2GLC1",
            "Authority": "https://doi.org"
        },
        "CollectionCitations": [
            {
                "Creator": "SWOT",
                "ReleasePlace": "PO.DAAC",
                "Title": "SWOT Level-2 Simulated SSH from MITgcm LLC4320 Science Quality Version 1.0",
                "Publisher": "PO.DAAC",
                "ReleaseDate": "2022-01-31T00:00:00.000Z",
                "Version": "1.0"
            },
            {
                "Creator": "CNES/CLS",
                "ReleasePlace": "CNES/AVISO",
                "Title": "Simulated SWOT products",
                "OnlineResource": {
                    "Linkage": "http://doi.org/10.24400/527896/a01-2021.006",
                    "Name": " Simulated SWOT Sea Surface Height products",
                    "Description": "Simulated SWOT Sea Surface Height products KaRIn and Nadir.",
                    "MimeType": "text/html"
                },
                "Publisher": "PODAAC",
                "ReleaseDate": "2021-11-01T00:00:00.000Z",
                "Version": "1.0"
            }
        ]
    }'''
    collection_umm_json = json.loads(collection_umm)
    citation = pa.create_citation(collection_umm_json, "2022-07-21")
    assert citation == "SWOT. 2022. SWOT Level-2 Simulated SSH from MITgcm LLC4320 Science Quality Version 1.0. Ver. 1.0. PO.DAAC, CA, USA. Dataset accessed 2022-07-21 at https://doi.org/10.5067/KARIN-2GLC1"

def test_search_after():
    # cmr query: https://cmr.earthdata.nasa.gov/search/granules.umm_json?page_size=2000&sort_key=-start_date&provider=POCLOUD&ShortName=JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F&temporal=2000-01-01T10%3A00%3A00Z%2C2022-04-15T00%3A00%3A00Z&bounding_box=-180%2C-90%2C180%2C90
@@ -136,6 +181,25 @@ def test_validate():
    # with pytest.raises(SystemExit):
    #     a = validate(["-c", "viirs", "-d", "/data", "-m", "60b"])

def test_param_update():
    params = [
        ('sort_key', "-start_date"),
        ('provider', "POCLOUD"),
        ('token', "123"),
    ]

    # params are (name, value) tuples, so match on p[0], not p[1]
    for i, p in enumerate(params):
        if p[0] == "token":
            params[i] = ("token", "newToken")

    for i, p in enumerate(params):
        if p[0] == "token":
            assert p[1] == "newToken"

def test_downloader_retry(mocker):
    mck = mocker.patch('subscriber.podaac_access.urlretrieve', side_effect=HTTPError("url", 503, "msg", None, None))
    with pytest.raises(Exception):
        pa.download_file("myUrl", "outputPath")
    assert mck.call_count == 3

def validate(args):
    parser = pds.create_parser()
