diff --git a/CHANGELOG.md b/CHANGELOG.md index 9431f5c..e63c368 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) +## Unreleased +### Added +- Added new feature allowing regex to be used in `--extensions` `-e` options. For example, using `-e PTM_\\d+` would match data files like `filename.PTM_1`, `filename.PTM_2` and `filename.PTM_10`, instead of specifying all possible combinations (`-e PTM_1, -e PTM_2, ..., -e PTM_10`) + ## 1.12.0 ### Fixed - Added EDL based token downloading, removing CMR tokens [98](https://github.com/podaac/data-subscriber/issues/98), diff --git a/Downloader.md b/Downloader.md index f83d737..e0af08f 100644 --- a/Downloader.md +++ b/Downloader.md @@ -33,7 +33,7 @@ optional arguments: -dy Flag to use start time (Year) of downloaded data for directory where data products will be downloaded. --offset OFFSET Flag used to shift timestamp. Units are in hours, e.g. 10 or -10. -e EXTENSIONS, --extensions EXTENSIONS - The extensions of products to download. Default is [.nc, .h5, .zip, .tar.gz] + Regexps of extensions of products to download. Default is [.nc, .h5, .zip, .tar.gz, .tiff] -gr GRANULE, --granule-name GRANULE The name of the granule to download. Only one granule name can be specified. Script will download all files matching similar granule name sans extension. --process PROCESS_CMD @@ -219,13 +219,23 @@ Some collections have many files. To download a specific set of files, you can s ``` -e EXTENSIONS, --extensions EXTENSIONS - The extensions of products to download. Default is [.nc, .h5, .zip] + Regexps of extensions of products to download. 
Default is [.nc, .h5, .zip, .tar.gz, .tiff] ``` An example of the -e usage- note the -e option is additive: ``` podaac-data-subscriber -c VIIRS_N20-OSPO-L2P-v2.61 -d ./data -e .nc -e .h5 -sd 2020-06-01T00:46:02Z -ed 2020-07-01T00:46:02Z ``` + +One may also specify a regular expression to select files. For example, the following are equivalent: + +`podaac-data-subscriber -c VIIRS_N20-OSPO-L2P-v2.61 -d ./data -e PTM_1, -e PTM_2, ..., -e PTM_10 -sd 2020-06-01T00:46:02Z -ed 2020-07-01T00:46:02Z` + +and + +`podaac-data-subscriber -c VIIRS_N20-OSPO-L2P-v2.61 -d ./data -e PTM_\\d+ -sd 2020-06-01T00:46:02Z -ed 2020-07-01T00:46:02Z` + + ### run a post download process Using the `--process` option, you can run a simple command agaisnt the "just" downloaded file. This will take the format of " ". This means you can run a command like `--process gzip` to gzip all downloaded files. We do not support more advanced processes at this time (piping, running a process on a directory, etc). diff --git a/Subscriber.md b/Subscriber.md index 0cd5e3f..3df3344 100644 --- a/Subscriber.md +++ b/Subscriber.md @@ -28,8 +28,8 @@ optional arguments: --offset OFFSET Flag used to shift timestamp. Units are in hours, e.g. 10 or -10. -m MINUTES, --minutes MINUTES How far back in time, in minutes, should the script look for data. If running this script as a cron, this value should be equal to or greater than how often your cron runs (default: 60 minutes). - -e EXTENSIONS, --extensions EXTENSIONS - The extensions of products to download. Default is [.nc, .h5, .zip] + -e EXTENSIONS, --extensions EXTENSIONS + Regexps of extensions of products to download. Default is [.nc, .h5, .zip, .tar.gz, .tiff] --process PROCESS_CMD Processing command to run on each downloaded file (e.g., compression). Can be specified multiple times. --version Display script version information and exit. @@ -193,13 +193,22 @@ Some collections have many files. 
To download a specific set of files, you can s ``` -e EXTENSIONS, --extensions EXTENSIONS - The extensions of products to download. Default is [.nc, .h5, .zip] + Regexps of extensions of products to download. Default is [.nc, .h5, .zip, .tar.gz, .tiff] ``` An example of the -e usage- note the -e option is additive: ``` podaac-data-subscriber -c VIIRS_N20-OSPO-L2P-v2.61 -d ./data -e .nc -e .h5 ``` + +One may also specify a regular expression to select files. For example, the following are equivalent: + +`podaac-data-subscriber -c VIIRS_N20-OSPO-L2P-v2.61 -d ./data -e PTM_1, -e PTM_2, ..., -e PTM_10 -sd 2020-06-01T00:46:02Z -ed 2020-07-01T00:46:02Z` + +and + +`podaac-data-subscriber -c VIIRS_N20-OSPO-L2P-v2.61 -d ./data -e PTM_\\d+ -sd 2020-06-01T00:46:02Z -ed 2020-07-01T00:46:02Z` + ### run a post download process Using the `--process` option, you can run a simple command agaisnt the "just" downloaded file. This will take the format of " ". This means you can run a command like `--process gzip` to gzip all downloaded files. We do not support more advanced processes at this time (piping, running a process on a directory, etc). 
diff --git a/subscriber/podaac_access.py b/subscriber/podaac_access.py index 22d4e6e..5b2a331 100644 --- a/subscriber/podaac_access.py +++ b/subscriber/podaac_access.py @@ -2,6 +2,7 @@ import logging import netrc import subprocess +import re from datetime import datetime from http.cookiejar import CookieJar from os import makedirs @@ -28,7 +29,7 @@ from datetime import datetime __version__ = "1.12.0" -extensions = [".nc", ".h5", ".zip", ".tar.gz", ".tiff"] +extensions = ["\\.nc", "\\.h5", "\\.zip", "\\.tar\\.gz", "\\.tiff"] edl = "urs.earthdata.nasa.gov" cmr = "cmr.earthdata.nasa.gov" token_url = "https://" + edl + "/api/users" @@ -531,6 +532,11 @@ def create_citation(collection_json, access_date): year = datetime.strptime(release_date, "%Y-%m-%dT%H:%M:%S.000Z").year return citation_template.format(creator=creator, year=year, title=title, version=version, doi_authority=doi_authority, doi=doi, access_date=access_date) +def search_extension(extension, filename): + if re.search(extension + "$", filename) is not None: + return True + return False + def create_citation_file(short_name, provider, data_path, token=None, verbose=False): # get collection umm-c METADATA params = [ diff --git a/subscriber/podaac_data_downloader.py b/subscriber/podaac_data_downloader.py index 6f51bca..7a41f30 100755 --- a/subscriber/podaac_data_downloader.py +++ b/subscriber/podaac_data_downloader.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import argparse import logging -import os +import os, re import sys from datetime import datetime, timedelta from os import makedirs @@ -86,7 +86,7 @@ def create_parser(): help="Flag used to shift timestamp. Units are in hours, e.g. 10 or -10.") # noqa E501 parser.add_argument("-e", "--extensions", dest="extensions", - help="The extensions of products to download. Default is [.nc, .h5, .zip, .tar.gz]", + help="Regexps of extensions of products to download. 
Default is [.nc, .h5, .zip, .tar.gz, .tiff]", default=None, action='append') # noqa E501 # Get specific granule from the search @@ -253,7 +253,7 @@ def run(args=None): filtered_downloads = [] for f in downloads: for extension in extensions: - if f.lower().endswith(extension): + if pa.search_extension(extension, f): filtered_downloads.append(f) downloads = filtered_downloads diff --git a/subscriber/podaac_data_subscriber.py b/subscriber/podaac_data_subscriber.py index eb103ac..2a0a9f3 100755 --- a/subscriber/podaac_data_subscriber.py +++ b/subscriber/podaac_data_subscriber.py @@ -14,7 +14,7 @@ # Accounts are free to create and take just a moment to set up. import argparse import logging -import os +import os, re import sys from datetime import datetime, timedelta from os import makedirs @@ -92,7 +92,7 @@ def create_parser(): help="How far back in time, in minutes, should the script look for data. If running this script as a cron, this value should be equal to or greater than how often your cron runs.", type=int, default=None) # noqa E501 parser.add_argument("-e", "--extensions", dest="extensions", - help="The extensions of products to download. Default is [.nc, .h5, .zip]", default=None, + help="Regexps of extensions of products to download. Default is [.nc, .h5, .zip, .tar.gz, .tiff]", default=None, action='append') # noqa E501 parser.add_argument("--process", dest="process_cmd", help="Processing command to run on each downloaded file (e.g., compression). 
Can be specified multiple times.", @@ -260,7 +260,7 @@ def run(args=None): filtered_downloads = [] for f in downloads: for extension in extensions: - if f.lower().endswith(extension): + if pa.search_extension(extension, f): filtered_downloads.append(f) downloads = filtered_downloads diff --git a/tests/test_subscriber.py b/tests/test_subscriber.py index 983cdce..23f8e46 100644 --- a/tests/test_subscriber.py +++ b/tests/test_subscriber.py @@ -206,3 +206,12 @@ def validate(args): args2 = parser.parse_args(args) pa.validate(args2) return args2 + +def test_extensions(): + assert pa.search_extension('\\.tiff', "myfile.tiff") == True + assert pa.search_extension('\\.tiff', "myfile.tif") == False + assert pa.search_extension('\\.tiff', "myfile.gtiff") == False + assert pa.search_extension('PTM_\\d+', "myfile.PTM_1") == True + assert pa.search_extension('PTM_\\d+', "myfile.PTM_10") == True + assert pa.search_extension('PTM_\\d+', "myfile.PTM_09") == True + assert pa.search_extension('PTM_\\d+', "myfile.PTM_9") == True