Issues/91 (#92)
* added citation creation tests and functionality to subscriber and downloader

* added verbose option to create_citation_file command; previously hard-coded

* updated changelog (whoops) and fixed regression tests:
1. Issue where the newly downloaded citation file affected the download counts
2. Issue where the logic for determining whether a file's modified time had changed was picking up the new citation file, which _always_ gets rewritten to update the 'last accessed' date.

* updated request to include exc_info in warning; fixed issue where params not being a dictionary caused errors

* changed a warning to debug for the citation file; fixed test issues

* Enable debug logging during regression tests and set max parallel workflows to 2

* added output to pytest

* fixed test to only look for downloaded data files, not the citation file, due to 'random' CMR errors when creating a citation

* added mock testing and retry on 503

* added 503 fixes

Co-authored-by: Frank Greguska <Francis.Greguska@jpl.nasa.gov>
mike-gangl and Frank Greguska authored Aug 4, 2022
1 parent 368c031 commit 9793d67
Showing 10 changed files with 211 additions and 14 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/python-app.yml
@@ -13,6 +13,7 @@ jobs:
  build:
    strategy:
      fail-fast: false
      max-parallel: 2
      matrix:
        python-version: [ "3.7", "3.8", "3.9", "3.10" ]
        poetry-version: [ "1.1" ]
@@ -47,5 +48,7 @@ jobs:
          username: ${{ secrets.EDL_OPS_USERNAME }}
          password: ${{ secrets.EDL_OPS_PASSWORD }}
      - name: Regression Test with pytest
        env:
          PODAAC_LOGLEVEL: "DEBUG"
        run: |
          poetry run pytest -m "regression"
          poetry run pytest -o log_cli=true --log-cli-level=DEBUG -m "regression"
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,13 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)

## [unreleased]
### Fixed
- Fixed an issue where the token refresh expected a dictionary rather than a list of tuples
- Fixed an issue where 503 errors on data download were not retried. [97](https://github.com/podaac/data-subscriber/issues/97)
### Added
- Added citation file creation when data are downloaded [91](https://github.com/podaac/data-subscriber/issues/91). Required some updates to the regression testing.

## [1.10.2]
### Fixed
- Fixed an issue where using a default global bounding box prevented download of data that didn't use the horizontal spatial domain [87](https://github.com/podaac/data-subscriber/issues/87)
24 changes: 21 additions & 3 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -19,6 +19,7 @@ tenacity = "^8.0.1"
[tool.poetry.dev-dependencies]
pytest = "^7.1.2"
flake8 = "^4.0.1"
pytest-mock = "^3.8.2"

[tool.poetry.scripts]
podaac-data-subscriber = 'subscriber.podaac_data_subscriber:main'
71 changes: 71 additions & 0 deletions subscriber/podaac_access.py
@@ -10,10 +10,14 @@
from typing import Dict
from urllib import request
from urllib.error import HTTPError
from urllib.request import urlretrieve
import subprocess
from urllib.parse import urlencode
from urllib.request import Request, urlopen
import hashlib
from datetime import datetime
import time


import requests

@@ -286,6 +290,26 @@ def get_temporal_range(start, end, now):
    raise ValueError("One of start-date or end-date must be specified.")


def download_file(remote_file, output_path, retries=3):
    failed = False
    for r in range(retries):
        try:
            urlretrieve(remote_file, output_path)
        except HTTPError as e:
            if e.code == 503:
                logging.warning(f'Error downloading {remote_file}. Retrying download.')
                # back off on sleep time after each error...
                time.sleep(r)
            # r runs from 0 to retries - 1, so flag failure on the last attempt
            if r >= retries - 1:
                failed = True
        else:
            # downloaded file without a 503
            break

    if failed:
        raise Exception("Could not download file.")
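For orientation, a minimal usage sketch of the new helper (the granule URL and destination path are placeholders, not values from this repository):

# Hypothetical caller of the new download_file helper; URL and path are illustrative.
from subscriber import podaac_access as pa

pa.download_file("https://archive.example.com/granule.nc", "./granule.nc", retries=3)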


# Retry using random exponential backoff if a 500 error is raised. Maximum 10 attempts.
@tenacity.retry(wait=tenacity.wait_random_exponential(multiplier=1, max=60),
                stop=tenacity.stop_after_attempt(10),
@@ -436,3 +460,50 @@ def make_checksum(file_path, algorithm):
        for chunk in iter(lambda: f.read(4096), b""):
            hash_alg.update(chunk)
    return hash_alg.hexdigest()

def get_cmr_collections(params, verbose=False):
    query = urlencode(params)
    url = "https://" + cmr + "/search/collections.umm_json?" + query
    if verbose:
        logging.info(url)

    # Build and send the request
    req = Request(url)
    response = urlopen(req)
    result = json.loads(response.read().decode())
    return result


def create_citation(collection_json, access_date):
    citation_template = "{creator}. {year}. {title}. Ver. {version}. PO.DAAC, CA, USA. Dataset accessed {access_date} at {doi_authority}/{doi}"

    # Better error handling here may be needed...
    doi = collection_json['DOI']["DOI"]
    doi_authority = collection_json['DOI']["Authority"]
    citation = collection_json["CollectionCitations"][0]
    creator = citation["Creator"]
    release_date = citation["ReleaseDate"]
    title = citation["Title"]
    version = citation["Version"]
    year = datetime.strptime(release_date, "%Y-%m-%dT%H:%M:%S.000Z").year
    return citation_template.format(creator=creator, year=year, title=title, version=version, doi_authority=doi_authority, doi=doi, access_date=access_date)
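As a worked example, formatting the template with the UMM-C fixture used in test_citation_creation further down this diff yields:

# Illustrative only; collection_umm_json mirrors the test fixture below.
citation = create_citation(collection_umm_json, access_date="2022-07-21")
# citation == "SWOT. 2022. SWOT Level-2 Simulated SSH from MITgcm LLC4320 Science
#              Quality Version 1.0. Ver. 1.0. PO.DAAC, CA, USA. Dataset accessed
#              2022-07-21 at https://doi.org/10.5067/KARIN-2GLC1"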

def create_citation_file(short_name, provider, data_path, token=None, verbose=False):
    # get collection umm-c metadata
    params = [
        ('provider', provider),
        ('ShortName', short_name)
    ]
    if token is not None:
        params.append(('token', token))

    collection = get_cmr_collections(params, verbose)['items'][0]

    access_date = datetime.now().strftime("%Y-%m-%d")

    # create citation from umm-c metadata
    citation = create_citation(collection['umm'], access_date)

    # write file
    with open(data_path + "/" + short_name + ".citation.txt", "w") as text_file:
        text_file.write(citation)
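A minimal sketch of how the downloader and subscriber invoke this at the end of a successful run (the short name, provider, and data path are assumed for illustration):

# Sketch under assumed arguments: writes <data_path>/<short_name>.citation.txt
try:
    pa.create_citation_file("MUR25-JPL-L4-GLOB-v04.2", "POCLOUD", "./data", token=None, verbose=True)
except Exception:
    logging.debug("Error generating citation", exc_info=True)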
18 changes: 16 additions & 2 deletions subscriber/podaac_data_downloader.py
@@ -193,7 +193,11 @@ def run(args=None):
    except HTTPError as e:
        if e.code == 401:
            token = pa.refresh_token(token, 'podaac-subscriber')
            params['token'] = token
            # Updated: This is not always a dictionary...
            # in fact, here it's always a list of (name, value) tuples
            for i, p in enumerate(params):
                if p[0] == "token":
                    params[i] = ("token", token)
            results = pa.get_search_results(params, args.verbose)
        else:
            raise e
@@ -268,7 +272,9 @@ def run(args=None):
            skip_cnt += 1
            continue

        urlretrieve(f, output_path)
        pa.download_file(f, output_path)
        # urlretrieve(f, output_path)

        pa.process_file(process_cmd, output_path, args)
        logging.info(str(datetime.now()) + " SUCCESS: " + f)
        success_cnt = success_cnt + 1
@@ -284,6 +290,14 @@ def run(args=None):
logging.info("Downloaded Files: " + str(success_cnt))
logging.info("Failed Files: " + str(failure_cnt))
logging.info("Skipped Files: " + str(skip_cnt))

#create citation file if success > 0
if success_cnt > 0:
try:
pa.create_citation_file(short_name, provider, data_path, token, args.verbose)
except:
logging.debug("Error generating citation",exc_info=True)

pa.delete_token(token_url, token)
logging.info("END\n\n")

18 changes: 16 additions & 2 deletions subscriber/podaac_data_subscriber.py
@@ -218,7 +218,12 @@ def run(args=None):
    except HTTPError as e:
        if e.code == 401:
            token = pa.refresh_token(token, 'podaac-subscriber')
            params['token'] = token
            # Updated: This is not always a dictionary...
            # in fact, here it's always a list of (name, value) tuples
            for i, p in enumerate(params):
                if p[0] == "token":
                    params[i] = ("token", token)
            # params['token'] = token
            results = pa.get_search_results(params, args.verbose)
        else:
            raise e
@@ -294,7 +299,9 @@ def run(args=None):
            skip_cnt += 1
            continue

        urlretrieve(f, output_path)
        # urlretrieve(f, output_path)
        pa.download_file(f, output_path)

        pa.process_file(process_cmd, output_path, args)
        logging.info(str(datetime.now()) + " SUCCESS: " + f)
        success_cnt = success_cnt + 1
@@ -314,6 +321,13 @@ def run(args=None):
logging.info("Downloaded Files: " + str(success_cnt))
logging.info("Failed Files: " + str(failure_cnt))
logging.info("Skipped Files: " + str(skip_cnt))

if success_cnt > 0:
try:
pa.create_citation_file(short_name, provider, data_path, token, args.verbose)
except:
logging.debug("Error generating citation", exc_info=True)

pa.delete_token(token_url, token)
logging.info("END\n\n")
#exit(0)
15 changes: 10 additions & 5 deletions tests/test_downloader_regression.py
@@ -19,10 +19,11 @@ def create_downloader_args(args):
@pytest.mark.regression
def test_downloader_limit_MUR():
    shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2', ignore_errors=True)
    args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-30T00:00:00Z --limit 1'.split())
    args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-30T00:00:00Z --limit 1 --verbose'.split())
    pdd.run(args2)
    # count number of files downloaded...
    assert len([name for name in os.listdir('./MUR25-JPL-L4-GLOB-v04.2') if os.path.isfile('./MUR25-JPL-L4-GLOB-v04.2/' + name)]) == 1
    # Running the tests in parallel sometimes yields a 401 on the token, so only
    # count data files here, not the citation file.
    assert len([name for name in os.listdir('./MUR25-JPL-L4-GLOB-v04.2') if os.path.isfile('./MUR25-JPL-L4-GLOB-v04.2/' + name) and "citation.txt" not in name]) == 1
    shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2')

# Test the downloader on MUR25 data for start/stop, yyyy/mm/dd dir structure,
@@ -31,7 +32,7 @@ def test_downloader_limit_MUR():
@pytest.mark.regression
def test_downloader_MUR():
    shutil.rmtree('./MUR25-JPL-L4-GLOB-v04.2', ignore_errors=True)
    args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4'.split())
    args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 --verbose'.split())
    pdd.run(args2)
    assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')
    assert exists('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')
@@ -54,7 +55,7 @@ def test_downloader_MUR():
    t1 = os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')

    # Set the args to --force to re-download those data
    args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 -f'.split())
    args2 = create_downloader_args('-c MUR25-JPL-L4-GLOB-v04.2 -d ./MUR25-JPL-L4-GLOB-v04.2 -sd 2020-01-01T00:00:00Z -ed 2020-01-02T00:00:00Z -dymd --offset 4 -f --verbose'.split())
    pdd.run(args2)
    assert t1 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/01/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')
    assert t2 != os.path.getmtime('./MUR25-JPL-L4-GLOB-v04.2/2020/01/02/20200102090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc')
@@ -73,6 +74,10 @@ def test_downloader_GRACE_with_SHA_512(tmpdir):
    pdd.run(args)
    assert len(os.listdir(directory_str)) > 0
    filename = directory_str + "/" + os.listdir(directory_str)[0]
    # if the citation file was chosen above, get the next file, since the citation
    # file is rewritten on every successful run
    if "citation.txt" in filename:
        filename = directory_str + "/" + os.listdir(directory_str)[1]

    modified_time_1 = os.path.getmtime(filename)
    print(modified_time_1)
64 changes: 64 additions & 0 deletions tests/test_subscriber.py
@@ -1,9 +1,14 @@
from subscriber import podaac_data_subscriber as pds
from subscriber import podaac_access as pa

from urllib.error import HTTPError
import pytest
import os
from pathlib import Path
import shutil
import json
import tempfile
from os.path import exists

def test_temporal_range():

@@ -23,6 +28,46 @@ def cleanup_update_test():
print("Cleanup...")
shutil.rmtree(data_dir_with_updates)

def test_create_citation_file():
    with tempfile.TemporaryDirectory() as tmpdirname:
        pa.create_citation_file("SWOT_SIMULATED_L2_KARIN_SSH_GLORYS_CALVAL_V1", "POCLOUD", tmpdirname)
        assert exists(tmpdirname + "/SWOT_SIMULATED_L2_KARIN_SSH_GLORYS_CALVAL_V1.citation.txt")

def test_citation_creation():
    collection_umm = '''{
        "DOI": {
            "DOI": "10.5067/KARIN-2GLC1",
            "Authority": "https://doi.org"
        },
        "CollectionCitations": [
            {
                "Creator": "SWOT",
                "ReleasePlace": "PO.DAAC",
                "Title": "SWOT Level-2 Simulated SSH from MITgcm LLC4320 Science Quality Version 1.0",
                "Publisher": "PO.DAAC",
                "ReleaseDate": "2022-01-31T00:00:00.000Z",
                "Version": "1.0"
            },
            {
                "Creator": "CNES/CLS",
                "ReleasePlace": "CNES/AVISO",
                "Title": "Simulated SWOT products",
                "OnlineResource": {
                    "Linkage": "http://doi.org/10.24400/527896/a01-2021.006",
                    "Name": " Simulated SWOT Sea Surface Height products",
                    "Description": "Simulated SWOT Sea Surface Height products KaRIn and Nadir.",
                    "MimeType": "text/html"
                },
                "Publisher": "PODAAC",
                "ReleaseDate": "2021-11-01T00:00:00.000Z",
                "Version": "1.0"
            }
        ]
    }'''
    collection_umm_json = json.loads(collection_umm)
    citation = pa.create_citation(collection_umm_json, "2022-07-21")
    assert citation == "SWOT. 2022. SWOT Level-2 Simulated SSH from MITgcm LLC4320 Science Quality Version 1.0. Ver. 1.0. PO.DAAC, CA, USA. Dataset accessed 2022-07-21 at https://doi.org/10.5067/KARIN-2GLC1"

def test_search_after():
    # cmr query: https://cmr.earthdata.nasa.gov/search/granules.umm_json?page_size=2000&sort_key=-start_date&provider=POCLOUD&ShortName=JASON_CS_S6A_L2_ALT_LR_STD_OST_NRT_F&temporal=2000-01-01T10%3A00%3A00Z%2C2022-04-15T00%3A00%3A00Z&bounding_box=-180%2C-90%2C180%2C90
@@ -136,6 +181,25 @@ def test_validate():
    # with pytest.raises(SystemExit):
    #     a = validate(["-c", "viirs", "-d", "/data", "-m", "60b"])

def test_param_update():
    params = [
        ('sort_key', "-start_date"),
        ('provider', "POCLOUD"),
        ('token', "123"),
    ]

    # params are (name, value) tuples, so match on p[0], not p[1]
    for i, p in enumerate(params):
        if p[0] == "token":
            params[i] = ("token", "newToken")

    for i, p in enumerate(params):
        if p[0] == "token":
            assert p[1] == "newToken"

def test_downloader_retry(mocker):
    mck = mocker.patch('subscriber.podaac_access.urlretrieve', side_effect=HTTPError("url", 503, "msg", None, None))
    with pytest.raises(Exception):
        pa.download_file("myUrl", "outputPath")
    assert mck.call_count == 3

def validate(args):
    parser = pds.create_parser()
