From f7f2f1e496d84805e7efdb4d4c9d95f2e9ae9e06 Mon Sep 17 00:00:00 2001 From: Michele Facchinelli Date: Wed, 12 Jun 2024 16:18:56 +0100 Subject: [PATCH 1/4] doc: fix typos in README --- README.md | 348 +++++++++++++++++++++++++++--------------------------- 1 file changed, 174 insertions(+), 174 deletions(-) diff --git a/README.md b/README.md index 552e6a4..b5daa54 100644 --- a/README.md +++ b/README.md @@ -1,174 +1,174 @@ -# IMAP Data Access Package - -This lightweight Python package allows users to download, query, and upload data from the IMAP Science Data Center (SDC). - -## Command Line Utility - -### To install - -```bash -pip install imap-data-access -imap-data-access -h -``` - -### Query / Search for data - -Find all files from the SWE instrument - -```bash -$ imap-data-access query --instrument swe -Found [2] matching files ----------------------------------------------------------------------------------------------------------------| -Instrument|Data Level|Descriptor|Start Date|Repointing|Version|Filename | ----------------------------------------------------------------------------------------------------------------| -swe |l0 |sci |20240105 | |v001 |imap_swe_l0_sci_20240105_v001.pkts | -swe |l0 |sci |20240105 | |v001 |imap_swe_l0_sci_20240105_v001.pkts | ----------------------------------------------------------------------------------------------------------------| -``` - -Find all files during the year 2024 and return the response as raw json - -```bash -$ imap-data-access query --start-date 20240101 --end-date 20241231 --output-format json -[{'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105', 'version': 'v001', 'extension': 'pkts'}, {'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105', 'version': 'v001', 'extension': 'pkts'}] -``` - -### 
Download a file - -Download a level 0 SWE file on 2024/01/05 - -```bash -$ imap-data-access download imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts -Successfully downloaded the file to: /imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts -``` - -### Upload a file - -Upload a l1a file after decoding the l0 CCSDS ".pkts" file - -```bash -$ imap-data-access upload /imap/swe/l1a/2024/01/imap_swe_l1a_sci_20240105_v001.cdf -Successfully uploaded the file to the IMAP SDC -``` - -## Importing as a package - -```python -import imap_data_access - -# Search for files -results = imap_data_access.query(instrument="mag", data_level="l0") -# results is a list of dictionaries -# [{'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105','version': 'v001', 'extension': 'pkts'}, {'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105', 'version': 'v001', 'extension': 'pkts'}] - -# Download a file that was returned from the search -imap_data_access.download("imap/mag/l0/2024/01/imap_mag_l0_raw_202040101_v001.pkts") - -# Upload a calibration file that exists locally -imap_data_access.upload("imap/swe/l1a/2024/01/imap_swe_l1a_sci_20240105_v001.cdf") -``` - -## Configuration - -### Data directory - -The folder structure for data files within the IMAP SDC is rigidly -defined, so the data access will mimic that structure to make sure -all data is stored in the same heirarchical structure as the SDC. -This will enable seamless transition between a user's local system -and the SDC. This is only used for downloads. - -A user's root data location can be specified as an environment -variable ``IMAP_DATA_DIR`` or through a configuration dictionary -within the package itself ``imap_data_access.config["DATA_DIR"]``. 
-If the ``IMAP_DATA_DIR`` variable is not set, the program defaults -to the user's current working directory + ``data/``. - -The following is the directory structure the IMAP SDC uses. - -```text -/ - imap/ - / - / - / - / - -``` - -for example, with ``IMAP_DATA_DIR=/data``: - -```text -/data/ - imap/ - swe/ - l0/ - 2024/ - 01/ - imap_swe_l0_sci_20240105_v001.pkts -``` - -### Data Access URL - -To change the default URL that the package accesses, you can set -the environment variable ``IMAP_DATA_ACCESS_URL`` or within the -package ``imap_data_access.config["DATA_ACCESS_URL"]``. The default -is the development server ``https://api.dev.imap-mission.com``. - -## Troubleshooting - -### Network issues - -#### SSL - -If you encounter SSL errors similar to the following: - -```text -urllib.error.URLError: -``` - -That generally means the Python environment you're using is not finding your system's root -certificates properly. This means you need to tell Python how to find those certificates -with the following potential solutions. - -1. **Upgrade the certifi package** - - ```bash - pip install --upgrade certifi - ``` - -2. **Install system certificates** - Depending on the Python version you installed the program with the command will look something like this: - - ```bash - /Applications/Python\ 3.10/Install\ Certificates.command - ``` - -#### HTTP Error 502: Bad Gateway - -This could mean that the service is temporarily down. If you -continue to encounter this, reach out to the IMAP SDC at -. - -#### FileNotFoundError - -This could mean that the local data directory is not set -up with the same paths as the SDC. See the [data directory](#data-directory) -section for an example of how to set this up. - -## File Validation - -This package validates filenames and paths to check they follow our standards, as defined by the filename conventions. There is also a class available for -use by other packages to create filepaths and filenames that follow the IMAP SDC conventions. 
- -To use this class, use `imap_data_access.ScienceFilepath`. - -Usage: - -```python - -science_file = imap_data_access.ScienceFilePath("imap_swe_l0_sci_20240101_v001.pkts") - -# Filepath = /imap/swe/l0/2024/01/imap_swe_l0_sci_20240101_v001.pkts -filepath = science_file.construct_file_path() -``` +# IMAP Data Access Package + +This lightweight Python package allows users to download, query, and upload data from the IMAP Science Data Center (SDC). + +## Command Line Utility + +### To install + +```bash +pip install imap-data-access +imap-data-access -h +``` + +### Query / Search for data + +Find all files from the SWE instrument + +```bash +$ imap-data-access query --instrument swe +Found [2] matching files +---------------------------------------------------------------------------------------------------------------| +Instrument|Data Level|Descriptor|Start Date|Repointing|Version|Filename | +---------------------------------------------------------------------------------------------------------------| +swe |l0 |sci |20240105 | |v001 |imap_swe_l0_sci_20240105_v001.pkts | +swe |l0 |sci |20240105 | |v001 |imap_swe_l0_sci_20240105_v001.pkts | +---------------------------------------------------------------------------------------------------------------| +``` + +Find all files during the year 2024 and return the response as raw json + +```bash +$ imap-data-access query --start-date 20240101 --end-date 20241231 --output-format json +[{'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105', 'version': 'v001', 'extension': 'pkts'}, {'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105', 'version': 'v001', 'extension': 'pkts'}] +``` + +### Download a file + +Download a level 0 SWE file on 2024/01/05 + +```bash +$ imap-data-access download 
imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts +Successfully downloaded the file to: /imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts +``` + +### Upload a file + +Upload a l1a file after decoding the l0 CCSDS ".pkts" file + +```bash +$ imap-data-access upload /imap/swe/l1a/2024/01/imap_swe_l1a_sci_20240105_v001.cdf +Successfully uploaded the file to the IMAP SDC +``` + +## Importing as a package + +```python +import imap_data_access + +# Search for files +results = imap_data_access.query(instrument="mag", data_level="l0") +# results is a list of dictionaries +# [{'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105','version': 'v001', 'extension': 'pkts'}, {'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105', 'version': 'v001', 'extension': 'pkts'}] + +# Download a file that was returned from the search +imap_data_access.download("imap/mag/l0/2024/01/imap_mag_l0_raw_202040101_v001.pkts") + +# Upload a calibration file that exists locally +imap_data_access.upload("imap/swe/l1a/2024/01/imap_swe_l1a_sci_20240105_v001.cdf") +``` + +## Configuration + +### Data directory + +The folder structure for data files within the IMAP SDC is rigidly +defined, so the data access will mimic that structure to make sure +all data is stored in the same heirarchical structure as the SDC. +This will enable seamless transition between a user's local system +and the SDC. This is only used for downloads. + +A user's root data location can be specified as an environment +variable ``IMAP_DATA_DIR`` or through a configuration dictionary +within the package itself ``imap_data_access.config["DATA_DIR"]``. +If the ``IMAP_DATA_DIR`` variable is not set, the program defaults +to the user's current working directory + ``data/``. 
+ +The following is the directory structure the IMAP SDC uses. + +```text +/ + imap/ + / + / + / + / + +``` + +for example, with ``IMAP_DATA_DIR=/data``: + +```text +/data/ + imap/ + swe/ + l0/ + 2024/ + 01/ + imap_swe_l0_sci_20240105_v001.pkts +``` + +### Data Access URL + +To change the default URL that the package accesses, you can set +the environment variable ``IMAP_DATA_ACCESS_URL`` or within the +package ``imap_data_access.config["DATA_ACCESS_URL"]``. The default +is the development server ``https://api.dev.imap-mission.com``. + +## Troubleshooting + +### Network issues + +#### SSL + +If you encounter SSL errors similar to the following: + +```text +urllib.error.URLError: +``` + +That generally means the Python environment you're using is not finding your system's root +certificates properly. This means you need to tell Python how to find those certificates +with the following potential solutions. + +1. **Upgrade the certifi package** + + ```bash + pip install --upgrade certifi + ``` + +2. **Install system certificates** + Depending on the Python version you installed the program with the command will look something like this: + + ```bash + /Applications/Python\ 3.10/Install\ Certificates.command + ``` + +#### HTTP Error 502: Bad Gateway + +This could mean that the service is temporarily down. If you +continue to encounter this, reach out to the IMAP SDC at +. + +#### FileNotFoundError + +This could mean that the local data directory is not set +up with the same paths as the SDC. See the [data directory](#data-directory) +section for an example of how to set this up. + +## File Validation + +This package validates filenames and paths to check they follow our standards, as defined by the filename conventions. There is also a class available for +use by other packages to create filepaths and filenames that follow the IMAP SDC conventions. + +To use this class, use `imap_data_access.ScienceFilePath`. 
+ +Usage: + +```python + +science_file = imap_data_access.ScienceFilePath("imap_swe_l0_sci_20240101_v001.pkts") + +# Filepath = /imap/swe/l0/2024/01/imap_swe_l0_sci_20240101_v001.pkts +filepath = science_file.construct_path() +``` From f06a457fef6661d92a7ed5df46560477141620fc Mon Sep 17 00:00:00 2001 From: Michele Facchinelli Date: Wed, 12 Jun 2024 16:19:10 +0100 Subject: [PATCH 2/4] fix: type suggestion --- imap_data_access/io.py | 402 ++++++++++++++++++++--------------------- 1 file changed, 201 insertions(+), 201 deletions(-) diff --git a/imap_data_access/io.py b/imap_data_access/io.py index 678152f..4a3b8b0 100644 --- a/imap_data_access/io.py +++ b/imap_data_access/io.py @@ -1,201 +1,201 @@ -"""Input/output capabilities for the IMAP data processing pipeline.""" - -# ruff: noqa: PLR0913 S310 -# too many arguments, but we want all of these explicitly listed -# potentially unsafe usage of urlopen, but we aren't concerned here -import contextlib -import json -import logging -import urllib.request -from pathlib import Path -from typing import Optional, Union -from urllib.error import HTTPError, URLError -from urllib.parse import urlencode - -import imap_data_access - -logger = logging.getLogger(__name__) - - -class IMAPDataAccessError(Exception): - """Base class for exceptions in this module.""" - - pass - - -@contextlib.contextmanager -def _get_url_response(request: urllib.request.Request): - """Get the response from a URL request. - - This is a helper function to make it easier to handle - the different types of errors that can occur when - opening a URL and write out the response body. 
- """ - try: - # Open the URL and yield the response - with urllib.request.urlopen(request) as response: - yield response - - except HTTPError as e: - message = ( - f"HTTP Error: {e.code} - {e.reason}\n" - f"Server Message: {e.read().decode('utf-8')}" - ) - raise IMAPDataAccessError(message) from e - except URLError as e: - message = f"URL Error: {e.reason}" - raise IMAPDataAccessError(message) from e - - -def download(file_path: Union[Path, str]) -> Path: - """Download a file from the data archive. - - Parameters - ---------- - file_path : pathlib.Path or str - Name of the file to download, optionally including the directory path - - Returns - ------- - pathlib.Path - Path to the downloaded file - """ - destination = imap_data_access.config["DATA_DIR"] - # Create the proper file path object based on the extension and filename - file_path = Path(file_path) - if file_path.suffix in imap_data_access.file_validation._SPICE_DIR_MAPPING: - # SPICE - path_obj = imap_data_access.SPICEFilePath(file_path.name) - else: - # Science - path_obj = imap_data_access.ScienceFilePath(file_path.name) - - destination = path_obj.construct_path() - - # Update the file_path with the full path for the download below - file_path = destination.relative_to(imap_data_access.config["DATA_DIR"]).as_posix() - - # Only download if the file doesn't already exist - # TODO: Do we want to verify any hashes to make sure we have the right file? 
- if destination.exists(): - logger.info("The file %s already exists, skipping download", destination) - return destination - - # encode the query parameters - url = f"{imap_data_access.config['DATA_ACCESS_URL']}" - url += f"/download/{file_path}" - logger.info("Downloading file %s from %s to %s", file_path, url, destination) - - # Create a request with the provided URL - request = urllib.request.Request(url, method="GET") - # Open the URL and download the file - with _get_url_response(request) as response: - logger.debug("Received response: %s", response) - # Save the file locally with the same filename - destination.parent.mkdir(parents=True, exist_ok=True) - with open(destination, "wb") as local_file: - local_file.write(response.read()) - - return destination - - -def query( - *, - instrument: Optional[str] = None, - data_level: Optional[str] = None, - descriptor: Optional[str] = None, - start_date: Optional[str] = None, - end_date: Optional[str] = None, - repointing: Optional[int] = None, - version: Optional[str] = None, - extension: Optional[str] = None, -) -> list[str]: - """Query the data archive for files matching the parameters. - - Parameters - ---------- - instrument : str, optional - Instrument name (e.g. ``mag``) - data_level : str, optional - Data level (e.g. ``l1a``) - descriptor : str, optional - Descriptor of the data product / product name (e.g. ``burst``) - start_date : str, optional - Start date in YYYYMMDD format. Note this is to search for all files - with start dates on or after this value. - end_date : str, optional - End date in YYYYMMDD format. Note this is to search for all files - with start dates before the requested end_date. 
- repointing : int, optional - Repointing number - version : str, optional - Data version in the format ``vXXX`` - extension : str, optional - File extension (``cdf``, ``pkts``) - - Returns - ------- - list - List of files matching the query - """ - # locals() gives us the keyword arguments passed to the function - # and allows us to filter out the None values - query_params = {key: value for key, value in locals().items() if value is not None} - if not query_params: - raise ValueError("At least one query parameter must be provided") - url = f"{imap_data_access.config['DATA_ACCESS_URL']}" - url += f"/query?{urlencode(query_params)}" - - logger.info("Querying data archive for %s with url %s", query_params, url) - request = urllib.request.Request(url, method="GET") - with _get_url_response(request) as response: - # Retrieve the response as a list of files - items = response.read().decode("utf-8") - logger.debug("Received response: %s", items) - # Decode the JSON string into a list - items = json.loads(items) - logger.debug("Decoded JSON: %s", items) - return items - - -def upload(file_path: Union[Path, str], *, api_key: Optional[str] = None) -> None: - """Upload a file to the data archive. - - Parameters - ---------- - file_path : pathlib.Path or str - Path to the file to upload. - api_key : str, optional - API key to authenticate with the data access API. If not provided, - the value from the IMAP_API_KEY environment variable will be used. 
- """ - file_path = Path(file_path).resolve() - if not file_path.exists(): - raise FileNotFoundError(file_path) - - url = f"{imap_data_access.config['DATA_ACCESS_URL']}" - # The upload name needs to be given as a path parameter - url += f"/upload/{file_path.name}" - logger.info("Uploading file %s to %s", file_path, url) - - # Create a request header with the API key - api_key = api_key or imap_data_access.config["API_KEY"] - # We send a GET request with the filename and the server - # will respond with an s3 presigned URL that we can use - # to upload the file to the data archive - headers = {"X-api-key": api_key} if api_key else {} - request = urllib.request.Request(url, method="GET", headers=headers) - - with _get_url_response(request) as response: - # Retrieve the key for the upload - s3_url = response.read().decode("utf-8") - logger.debug("Received s3 presigned URL: %s", s3_url) - s3_url = json.loads(s3_url) - - # Follow the presigned URL to upload the file with a PUT request - with open(file_path, "rb") as local_file: - request = urllib.request.Request( - s3_url, data=local_file.read(), method="PUT", headers={"Content-Type": ""} - ) - with _get_url_response(request) as response: - logger.debug("Received response: %s", response.read().decode("utf-8")) +"""Input/output capabilities for the IMAP data processing pipeline.""" + +# ruff: noqa: PLR0913 S310 +# too many arguments, but we want all of these explicitly listed +# potentially unsafe usage of urlopen, but we aren't concerned here +import contextlib +import json +import logging +import urllib.request +from pathlib import Path +from typing import Optional, Union +from urllib.error import HTTPError, URLError +from urllib.parse import urlencode + +import imap_data_access + +logger = logging.getLogger(__name__) + + +class IMAPDataAccessError(Exception): + """Base class for exceptions in this module.""" + + pass + + +@contextlib.contextmanager +def _get_url_response(request: urllib.request.Request): + """Get the 
response from a URL request. + + This is a helper function to make it easier to handle + the different types of errors that can occur when + opening a URL and write out the response body. + """ + try: + # Open the URL and yield the response + with urllib.request.urlopen(request) as response: + yield response + + except HTTPError as e: + message = ( + f"HTTP Error: {e.code} - {e.reason}\n" + f"Server Message: {e.read().decode('utf-8')}" + ) + raise IMAPDataAccessError(message) from e + except URLError as e: + message = f"URL Error: {e.reason}" + raise IMAPDataAccessError(message) from e + + +def download(file_path: Union[Path, str]) -> Path: + """Download a file from the data archive. + + Parameters + ---------- + file_path : pathlib.Path or str + Name of the file to download, optionally including the directory path + + Returns + ------- + pathlib.Path + Path to the downloaded file + """ + destination = imap_data_access.config["DATA_DIR"] + # Create the proper file path object based on the extension and filename + file_path = Path(file_path) + if file_path.suffix in imap_data_access.file_validation._SPICE_DIR_MAPPING: + # SPICE + path_obj = imap_data_access.SPICEFilePath(file_path.name) + else: + # Science + path_obj = imap_data_access.ScienceFilePath(file_path.name) + + destination = path_obj.construct_path() + + # Update the file_path with the full path for the download below + file_path = destination.relative_to(imap_data_access.config["DATA_DIR"]).as_posix() + + # Only download if the file doesn't already exist + # TODO: Do we want to verify any hashes to make sure we have the right file? 
+ if destination.exists(): + logger.info("The file %s already exists, skipping download", destination) + return destination + + # encode the query parameters + url = f"{imap_data_access.config['DATA_ACCESS_URL']}" + url += f"/download/{file_path}" + logger.info("Downloading file %s from %s to %s", file_path, url, destination) + + # Create a request with the provided URL + request = urllib.request.Request(url, method="GET") + # Open the URL and download the file + with _get_url_response(request) as response: + logger.debug("Received response: %s", response) + # Save the file locally with the same filename + destination.parent.mkdir(parents=True, exist_ok=True) + with open(destination, "wb") as local_file: + local_file.write(response.read()) + + return destination + + +def query( + *, + instrument: Optional[str] = None, + data_level: Optional[str] = None, + descriptor: Optional[str] = None, + start_date: Optional[str] = None, + end_date: Optional[str] = None, + repointing: Optional[int] = None, + version: Optional[str] = None, + extension: Optional[str] = None, +) -> list[dict[str, str]]: + """Query the data archive for files matching the parameters. + + Parameters + ---------- + instrument : str, optional + Instrument name (e.g. ``mag``) + data_level : str, optional + Data level (e.g. ``l1a``) + descriptor : str, optional + Descriptor of the data product / product name (e.g. ``burst``) + start_date : str, optional + Start date in YYYYMMDD format. Note this is to search for all files + with start dates on or after this value. + end_date : str, optional + End date in YYYYMMDD format. Note this is to search for all files + with start dates before the requested end_date. 
+ repointing : int, optional + Repointing number + version : str, optional + Data version in the format ``vXXX`` + extension : str, optional + File extension (``cdf``, ``pkts``) + + Returns + ------- + list + List of files matching the query + """ + # locals() gives us the keyword arguments passed to the function + # and allows us to filter out the None values + query_params = {key: value for key, value in locals().items() if value is not None} + if not query_params: + raise ValueError("At least one query parameter must be provided") + url = f"{imap_data_access.config['DATA_ACCESS_URL']}" + url += f"/query?{urlencode(query_params)}" + + logger.info("Querying data archive for %s with url %s", query_params, url) + request = urllib.request.Request(url, method="GET") + with _get_url_response(request) as response: + # Retrieve the response as a list of files + items = response.read().decode("utf-8") + logger.debug("Received response: %s", items) + # Decode the JSON string into a list + items = json.loads(items) + logger.debug("Decoded JSON: %s", items) + return items + + +def upload(file_path: Union[Path, str], *, api_key: Optional[str] = None) -> None: + """Upload a file to the data archive. + + Parameters + ---------- + file_path : pathlib.Path or str + Path to the file to upload. + api_key : str, optional + API key to authenticate with the data access API. If not provided, + the value from the IMAP_API_KEY environment variable will be used. 
+ """ + file_path = Path(file_path).resolve() + if not file_path.exists(): + raise FileNotFoundError(file_path) + + url = f"{imap_data_access.config['DATA_ACCESS_URL']}" + # The upload name needs to be given as a path parameter + url += f"/upload/{file_path.name}" + logger.info("Uploading file %s to %s", file_path, url) + + # Create a request header with the API key + api_key = api_key or imap_data_access.config["API_KEY"] + # We send a GET request with the filename and the server + # will respond with an s3 presigned URL that we can use + # to upload the file to the data archive + headers = {"X-api-key": api_key} if api_key else {} + request = urllib.request.Request(url, method="GET", headers=headers) + + with _get_url_response(request) as response: + # Retrieve the key for the upload + s3_url = response.read().decode("utf-8") + logger.debug("Received s3 presigned URL: %s", s3_url) + s3_url = json.loads(s3_url) + + # Follow the presigned URL to upload the file with a PUT request + with open(file_path, "rb") as local_file: + request = urllib.request.Request( + s3_url, data=local_file.read(), method="PUT", headers={"Content-Type": ""} + ) + with _get_url_response(request) as response: + logger.debug("Received response: %s", response.read().decode("utf-8")) From fcaf92ff82561684f486b7e6746be88779cdf5ad Mon Sep 17 00:00:00 2001 From: Michele Facchinelli Date: Thu, 13 Jun 2024 09:39:20 +0100 Subject: [PATCH 3/4] fix: line endings --- README.md | 348 +++++++++++++++++------------------ imap_data_access/io.py | 402 ++++++++++++++++++++--------------------- 2 files changed, 375 insertions(+), 375 deletions(-) diff --git a/README.md b/README.md index b5daa54..09b54ad 100644 --- a/README.md +++ b/README.md @@ -1,174 +1,174 @@ -# IMAP Data Access Package - -This lightweight Python package allows users to download, query, and upload data from the IMAP Science Data Center (SDC). 
- -## Command Line Utility - -### To install - -```bash -pip install imap-data-access -imap-data-access -h -``` - -### Query / Search for data - -Find all files from the SWE instrument - -```bash -$ imap-data-access query --instrument swe -Found [2] matching files ----------------------------------------------------------------------------------------------------------------| -Instrument|Data Level|Descriptor|Start Date|Repointing|Version|Filename | ----------------------------------------------------------------------------------------------------------------| -swe |l0 |sci |20240105 | |v001 |imap_swe_l0_sci_20240105_v001.pkts | -swe |l0 |sci |20240105 | |v001 |imap_swe_l0_sci_20240105_v001.pkts | ----------------------------------------------------------------------------------------------------------------| -``` - -Find all files during the year 2024 and return the response as raw json - -```bash -$ imap-data-access query --start-date 20240101 --end-date 20241231 --output-format json -[{'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105', 'version': 'v001', 'extension': 'pkts'}, {'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105', 'version': 'v001', 'extension': 'pkts'}] -``` - -### Download a file - -Download a level 0 SWE file on 2024/01/05 - -```bash -$ imap-data-access download imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts -Successfully downloaded the file to: /imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts -``` - -### Upload a file - -Upload a l1a file after decoding the l0 CCSDS ".pkts" file - -```bash -$ imap-data-access upload /imap/swe/l1a/2024/01/imap_swe_l1a_sci_20240105_v001.cdf -Successfully uploaded the file to the IMAP SDC -``` - -## Importing as a package - -```python -import imap_data_access - -# Search for files 
-results = imap_data_access.query(instrument="mag", data_level="l0") -# results is a list of dictionaries -# [{'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105','version': 'v001', 'extension': 'pkts'}, {'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105', 'version': 'v001', 'extension': 'pkts'}] - -# Download a file that was returned from the search -imap_data_access.download("imap/mag/l0/2024/01/imap_mag_l0_raw_202040101_v001.pkts") - -# Upload a calibration file that exists locally -imap_data_access.upload("imap/swe/l1a/2024/01/imap_swe_l1a_sci_20240105_v001.cdf") -``` - -## Configuration - -### Data directory - -The folder structure for data files within the IMAP SDC is rigidly -defined, so the data access will mimic that structure to make sure -all data is stored in the same heirarchical structure as the SDC. -This will enable seamless transition between a user's local system -and the SDC. This is only used for downloads. - -A user's root data location can be specified as an environment -variable ``IMAP_DATA_DIR`` or through a configuration dictionary -within the package itself ``imap_data_access.config["DATA_DIR"]``. -If the ``IMAP_DATA_DIR`` variable is not set, the program defaults -to the user's current working directory + ``data/``. - -The following is the directory structure the IMAP SDC uses. - -```text -/ - imap/ - / - / - / - / - -``` - -for example, with ``IMAP_DATA_DIR=/data``: - -```text -/data/ - imap/ - swe/ - l0/ - 2024/ - 01/ - imap_swe_l0_sci_20240105_v001.pkts -``` - -### Data Access URL - -To change the default URL that the package accesses, you can set -the environment variable ``IMAP_DATA_ACCESS_URL`` or within the -package ``imap_data_access.config["DATA_ACCESS_URL"]``. 
The default -is the development server ``https://api.dev.imap-mission.com``. - -## Troubleshooting - -### Network issues - -#### SSL - -If you encounter SSL errors similar to the following: - -```text -urllib.error.URLError: -``` - -That generally means the Python environment you're using is not finding your system's root -certificates properly. This means you need to tell Python how to find those certificates -with the following potential solutions. - -1. **Upgrade the certifi package** - - ```bash - pip install --upgrade certifi - ``` - -2. **Install system certificates** - Depending on the Python version you installed the program with the command will look something like this: - - ```bash - /Applications/Python\ 3.10/Install\ Certificates.command - ``` - -#### HTTP Error 502: Bad Gateway - -This could mean that the service is temporarily down. If you -continue to encounter this, reach out to the IMAP SDC at -. - -#### FileNotFoundError - -This could mean that the local data directory is not set -up with the same paths as the SDC. See the [data directory](#data-directory) -section for an example of how to set this up. - -## File Validation - -This package validates filenames and paths to check they follow our standards, as defined by the filename conventions. There is also a class available for -use by other packages to create filepaths and filenames that follow the IMAP SDC conventions. - -To use this class, use `imap_data_access.ScienceFilePath`. - -Usage: - -```python - -science_file = imap_data_access.ScienceFilePath("imap_swe_l0_sci_20240101_v001.pkts") - -# Filepath = /imap/swe/l0/2024/01/imap_swe_l0_sci_20240101_v001.pkts -filepath = science_file.construct_path() -``` +# IMAP Data Access Package + +This lightweight Python package allows users to download, query, and upload data from the IMAP Science Data Center (SDC). 
+ +## Command Line Utility + +### To install + +```bash +pip install imap-data-access +imap-data-access -h +``` + +### Query / Search for data + +Find all files from the SWE instrument + +```bash +$ imap-data-access query --instrument swe +Found [2] matching files +---------------------------------------------------------------------------------------------------------------| +Instrument|Data Level|Descriptor|Start Date|Repointing|Version|Filename | +---------------------------------------------------------------------------------------------------------------| +swe |l0 |sci |20240105 | |v001 |imap_swe_l0_sci_20240105_v001.pkts | +swe |l0 |sci |20240105 | |v001 |imap_swe_l0_sci_20240105_v001.pkts | +---------------------------------------------------------------------------------------------------------------| +``` + +Find all files during the year 2024 and return the response as raw json + +```bash +$ imap-data-access query --start-date 20240101 --end-date 20241231 --output-format json +[{'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105', 'version': 'v001', 'extension': 'pkts'}, {'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105', 'version': 'v001', 'extension': 'pkts'}] +``` + +### Download a file + +Download a level 0 SWE file on 2024/01/05 + +```bash +$ imap-data-access download imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts +Successfully downloaded the file to: /imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts +``` + +### Upload a file + +Upload a l1a file after decoding the l0 CCSDS ".pkts" file + +```bash +$ imap-data-access upload /imap/swe/l1a/2024/01/imap_swe_l1a_sci_20240105_v001.cdf +Successfully uploaded the file to the IMAP SDC +``` + +## Importing as a package + +```python +import imap_data_access + +# Search for files 
+results = imap_data_access.query(instrument="mag", data_level="l0") +# results is a list of dictionaries +# [{'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105','version': 'v001', 'extension': 'pkts'}, {'file_path': 'imap/swe/l0/2024/01/imap_swe_l0_sci_20240105_v001.pkts', 'instrument': 'swe', 'data_level': 'l0', 'descriptor': 'sci', 'start_date': '20240105', 'version': 'v001', 'extension': 'pkts'}] + +# Download a file that was returned from the search +imap_data_access.download("imap/mag/l0/2024/01/imap_mag_l0_raw_20240101_v001.pkts") + +# Upload a calibration file that exists locally +imap_data_access.upload("imap/swe/l1a/2024/01/imap_swe_l1a_sci_20240105_v001.cdf") +``` + +## Configuration + +### Data directory + +The folder structure for data files within the IMAP SDC is rigidly +defined, so the data access will mimic that structure to make sure +all data is stored in the same hierarchical structure as the SDC. +This will enable seamless transition between a user's local system +and the SDC. This is only used for downloads. + +A user's root data location can be specified as an environment +variable ``IMAP_DATA_DIR`` or through a configuration dictionary +within the package itself ``imap_data_access.config["DATA_DIR"]``. +If the ``IMAP_DATA_DIR`` variable is not set, the program defaults +to the user's current working directory + ``data/``. + +The following is the directory structure the IMAP SDC uses. + +```text +/ + imap/ + / + / + / + / + +``` + +for example, with ``IMAP_DATA_DIR=/data``: + +```text +/data/ + imap/ + swe/ + l0/ + 2024/ + 01/ + imap_swe_l0_sci_20240105_v001.pkts +``` + +### Data Access URL + +To change the default URL that the package accesses, you can set +the environment variable ``IMAP_DATA_ACCESS_URL`` or within the +package ``imap_data_access.config["DATA_ACCESS_URL"]``. 
The default +is the development server ``https://api.dev.imap-mission.com``. + +## Troubleshooting + +### Network issues + +#### SSL + +If you encounter SSL errors similar to the following: + +```text +urllib.error.URLError: +``` + +That generally means the Python environment you're using is not finding your system's root +certificates properly. This means you need to tell Python how to find those certificates +with the following potential solutions. + +1. **Upgrade the certifi package** + + ```bash + pip install --upgrade certifi + ``` + +2. **Install system certificates** + Depending on the Python version you installed the program with the command will look something like this: + + ```bash + /Applications/Python\ 3.10/Install\ Certificates.command + ``` + +#### HTTP Error 502: Bad Gateway + +This could mean that the service is temporarily down. If you +continue to encounter this, reach out to the IMAP SDC at +. + +#### FileNotFoundError + +This could mean that the local data directory is not set +up with the same paths as the SDC. See the [data directory](#data-directory) +section for an example of how to set this up. + +## File Validation + +This package validates filenames and paths to check they follow our standards, as defined by the filename conventions. There is also a class available for +use by other packages to create filepaths and filenames that follow the IMAP SDC conventions. + +To use this class, use `imap_data_access.ScienceFilePath`. 
+ +Usage: + +```python + +science_file = imap_data_access.ScienceFilePath("imap_swe_l0_sci_20240101_v001.pkts") + +# Filepath = /imap/swe/l0/2024/01/imap_swe_l0_sci_20240101_v001.pkts +filepath = science_file.construct_path() +``` diff --git a/imap_data_access/io.py b/imap_data_access/io.py index 4a3b8b0..ad26553 100644 --- a/imap_data_access/io.py +++ b/imap_data_access/io.py @@ -1,201 +1,201 @@ -"""Input/output capabilities for the IMAP data processing pipeline.""" - -# ruff: noqa: PLR0913 S310 -# too many arguments, but we want all of these explicitly listed -# potentially unsafe usage of urlopen, but we aren't concerned here -import contextlib -import json -import logging -import urllib.request -from pathlib import Path -from typing import Optional, Union -from urllib.error import HTTPError, URLError -from urllib.parse import urlencode - -import imap_data_access - -logger = logging.getLogger(__name__) - - -class IMAPDataAccessError(Exception): - """Base class for exceptions in this module.""" - - pass - - -@contextlib.contextmanager -def _get_url_response(request: urllib.request.Request): - """Get the response from a URL request. - - This is a helper function to make it easier to handle - the different types of errors that can occur when - opening a URL and write out the response body. - """ - try: - # Open the URL and yield the response - with urllib.request.urlopen(request) as response: - yield response - - except HTTPError as e: - message = ( - f"HTTP Error: {e.code} - {e.reason}\n" - f"Server Message: {e.read().decode('utf-8')}" - ) - raise IMAPDataAccessError(message) from e - except URLError as e: - message = f"URL Error: {e.reason}" - raise IMAPDataAccessError(message) from e - - -def download(file_path: Union[Path, str]) -> Path: - """Download a file from the data archive. 
- - Parameters - ---------- - file_path : pathlib.Path or str - Name of the file to download, optionally including the directory path - - Returns - ------- - pathlib.Path - Path to the downloaded file - """ - destination = imap_data_access.config["DATA_DIR"] - # Create the proper file path object based on the extension and filename - file_path = Path(file_path) - if file_path.suffix in imap_data_access.file_validation._SPICE_DIR_MAPPING: - # SPICE - path_obj = imap_data_access.SPICEFilePath(file_path.name) - else: - # Science - path_obj = imap_data_access.ScienceFilePath(file_path.name) - - destination = path_obj.construct_path() - - # Update the file_path with the full path for the download below - file_path = destination.relative_to(imap_data_access.config["DATA_DIR"]).as_posix() - - # Only download if the file doesn't already exist - # TODO: Do we want to verify any hashes to make sure we have the right file? - if destination.exists(): - logger.info("The file %s already exists, skipping download", destination) - return destination - - # encode the query parameters - url = f"{imap_data_access.config['DATA_ACCESS_URL']}" - url += f"/download/{file_path}" - logger.info("Downloading file %s from %s to %s", file_path, url, destination) - - # Create a request with the provided URL - request = urllib.request.Request(url, method="GET") - # Open the URL and download the file - with _get_url_response(request) as response: - logger.debug("Received response: %s", response) - # Save the file locally with the same filename - destination.parent.mkdir(parents=True, exist_ok=True) - with open(destination, "wb") as local_file: - local_file.write(response.read()) - - return destination - - -def query( - *, - instrument: Optional[str] = None, - data_level: Optional[str] = None, - descriptor: Optional[str] = None, - start_date: Optional[str] = None, - end_date: Optional[str] = None, - repointing: Optional[int] = None, - version: Optional[str] = None, - extension: Optional[str] = 
None, -) -> list[dict[str, str]]: - """Query the data archive for files matching the parameters. - - Parameters - ---------- - instrument : str, optional - Instrument name (e.g. ``mag``) - data_level : str, optional - Data level (e.g. ``l1a``) - descriptor : str, optional - Descriptor of the data product / product name (e.g. ``burst``) - start_date : str, optional - Start date in YYYYMMDD format. Note this is to search for all files - with start dates on or after this value. - end_date : str, optional - End date in YYYYMMDD format. Note this is to search for all files - with start dates before the requested end_date. - repointing : int, optional - Repointing number - version : str, optional - Data version in the format ``vXXX`` - extension : str, optional - File extension (``cdf``, ``pkts``) - - Returns - ------- - list - List of files matching the query - """ - # locals() gives us the keyword arguments passed to the function - # and allows us to filter out the None values - query_params = {key: value for key, value in locals().items() if value is not None} - if not query_params: - raise ValueError("At least one query parameter must be provided") - url = f"{imap_data_access.config['DATA_ACCESS_URL']}" - url += f"/query?{urlencode(query_params)}" - - logger.info("Querying data archive for %s with url %s", query_params, url) - request = urllib.request.Request(url, method="GET") - with _get_url_response(request) as response: - # Retrieve the response as a list of files - items = response.read().decode("utf-8") - logger.debug("Received response: %s", items) - # Decode the JSON string into a list - items = json.loads(items) - logger.debug("Decoded JSON: %s", items) - return items - - -def upload(file_path: Union[Path, str], *, api_key: Optional[str] = None) -> None: - """Upload a file to the data archive. - - Parameters - ---------- - file_path : pathlib.Path or str - Path to the file to upload. 
- api_key : str, optional - API key to authenticate with the data access API. If not provided, - the value from the IMAP_API_KEY environment variable will be used. - """ - file_path = Path(file_path).resolve() - if not file_path.exists(): - raise FileNotFoundError(file_path) - - url = f"{imap_data_access.config['DATA_ACCESS_URL']}" - # The upload name needs to be given as a path parameter - url += f"/upload/{file_path.name}" - logger.info("Uploading file %s to %s", file_path, url) - - # Create a request header with the API key - api_key = api_key or imap_data_access.config["API_KEY"] - # We send a GET request with the filename and the server - # will respond with an s3 presigned URL that we can use - # to upload the file to the data archive - headers = {"X-api-key": api_key} if api_key else {} - request = urllib.request.Request(url, method="GET", headers=headers) - - with _get_url_response(request) as response: - # Retrieve the key for the upload - s3_url = response.read().decode("utf-8") - logger.debug("Received s3 presigned URL: %s", s3_url) - s3_url = json.loads(s3_url) - - # Follow the presigned URL to upload the file with a PUT request - with open(file_path, "rb") as local_file: - request = urllib.request.Request( - s3_url, data=local_file.read(), method="PUT", headers={"Content-Type": ""} - ) - with _get_url_response(request) as response: - logger.debug("Received response: %s", response.read().decode("utf-8")) +"""Input/output capabilities for the IMAP data processing pipeline.""" + +# ruff: noqa: PLR0913 S310 +# too many arguments, but we want all of these explicitly listed +# potentially unsafe usage of urlopen, but we aren't concerned here +import contextlib +import json +import logging +import urllib.request +from pathlib import Path +from typing import Optional, Union +from urllib.error import HTTPError, URLError +from urllib.parse import urlencode + +import imap_data_access + +logger = logging.getLogger(__name__) + + +class 
IMAPDataAccessError(Exception): + """Base class for exceptions in this module.""" + + pass + + +@contextlib.contextmanager +def _get_url_response(request: urllib.request.Request): + """Get the response from a URL request. + + This is a helper function to make it easier to handle + the different types of errors that can occur when + opening a URL and write out the response body. + """ + try: + # Open the URL and yield the response + with urllib.request.urlopen(request) as response: + yield response + + except HTTPError as e: + message = ( + f"HTTP Error: {e.code} - {e.reason}\n" + f"Server Message: {e.read().decode('utf-8')}" + ) + raise IMAPDataAccessError(message) from e + except URLError as e: + message = f"URL Error: {e.reason}" + raise IMAPDataAccessError(message) from e + + +def download(file_path: Union[Path, str]) -> Path: + """Download a file from the data archive. + + Parameters + ---------- + file_path : pathlib.Path or str + Name of the file to download, optionally including the directory path + + Returns + ------- + pathlib.Path + Path to the downloaded file + """ + destination = imap_data_access.config["DATA_DIR"] + # Create the proper file path object based on the extension and filename + file_path = Path(file_path) + if file_path.suffix in imap_data_access.file_validation._SPICE_DIR_MAPPING: + # SPICE + path_obj = imap_data_access.SPICEFilePath(file_path.name) + else: + # Science + path_obj = imap_data_access.ScienceFilePath(file_path.name) + + destination = path_obj.construct_path() + + # Update the file_path with the full path for the download below + file_path = destination.relative_to(imap_data_access.config["DATA_DIR"]).as_posix() + + # Only download if the file doesn't already exist + # TODO: Do we want to verify any hashes to make sure we have the right file? 
+ if destination.exists(): + logger.info("The file %s already exists, skipping download", destination) + return destination + + # encode the query parameters + url = f"{imap_data_access.config['DATA_ACCESS_URL']}" + url += f"/download/{file_path}" + logger.info("Downloading file %s from %s to %s", file_path, url, destination) + + # Create a request with the provided URL + request = urllib.request.Request(url, method="GET") + # Open the URL and download the file + with _get_url_response(request) as response: + logger.debug("Received response: %s", response) + # Save the file locally with the same filename + destination.parent.mkdir(parents=True, exist_ok=True) + with open(destination, "wb") as local_file: + local_file.write(response.read()) + + return destination + + +def query( + *, + instrument: Optional[str] = None, + data_level: Optional[str] = None, + descriptor: Optional[str] = None, + start_date: Optional[str] = None, + end_date: Optional[str] = None, + repointing: Optional[int] = None, + version: Optional[str] = None, + extension: Optional[str] = None, +) -> list[dict[str, str]]: + """Query the data archive for files matching the parameters. + + Parameters + ---------- + instrument : str, optional + Instrument name (e.g. ``mag``) + data_level : str, optional + Data level (e.g. ``l1a``) + descriptor : str, optional + Descriptor of the data product / product name (e.g. ``burst``) + start_date : str, optional + Start date in YYYYMMDD format. Note this is to search for all files + with start dates on or after this value. + end_date : str, optional + End date in YYYYMMDD format. Note this is to search for all files + with start dates before the requested end_date. 
+ repointing : int, optional + Repointing number + version : str, optional + Data version in the format ``vXXX`` + extension : str, optional + File extension (``cdf``, ``pkts``) + + Returns + ------- + list + List of files matching the query + """ + # locals() gives us the keyword arguments passed to the function + # and allows us to filter out the None values + query_params = {key: value for key, value in locals().items() if value is not None} + if not query_params: + raise ValueError("At least one query parameter must be provided") + url = f"{imap_data_access.config['DATA_ACCESS_URL']}" + url += f"/query?{urlencode(query_params)}" + + logger.info("Querying data archive for %s with url %s", query_params, url) + request = urllib.request.Request(url, method="GET") + with _get_url_response(request) as response: + # Retrieve the response as a list of files + items = response.read().decode("utf-8") + logger.debug("Received response: %s", items) + # Decode the JSON string into a list + items = json.loads(items) + logger.debug("Decoded JSON: %s", items) + return items + + +def upload(file_path: Union[Path, str], *, api_key: Optional[str] = None) -> None: + """Upload a file to the data archive. + + Parameters + ---------- + file_path : pathlib.Path or str + Path to the file to upload. + api_key : str, optional + API key to authenticate with the data access API. If not provided, + the value from the IMAP_API_KEY environment variable will be used. 
+ """ + file_path = Path(file_path).resolve() + if not file_path.exists(): + raise FileNotFoundError(file_path) + + url = f"{imap_data_access.config['DATA_ACCESS_URL']}" + # The upload name needs to be given as a path parameter + url += f"/upload/{file_path.name}" + logger.info("Uploading file %s to %s", file_path, url) + + # Create a request header with the API key + api_key = api_key or imap_data_access.config["API_KEY"] + # We send a GET request with the filename and the server + # will respond with an s3 presigned URL that we can use + # to upload the file to the data archive + headers = {"X-api-key": api_key} if api_key else {} + request = urllib.request.Request(url, method="GET", headers=headers) + + with _get_url_response(request) as response: + # Retrieve the key for the upload + s3_url = response.read().decode("utf-8") + logger.debug("Received s3 presigned URL: %s", s3_url) + s3_url = json.loads(s3_url) + + # Follow the presigned URL to upload the file with a PUT request + with open(file_path, "rb") as local_file: + request = urllib.request.Request( + s3_url, data=local_file.read(), method="PUT", headers={"Content-Type": ""} + ) + with _get_url_response(request) as response: + logger.debug("Received response: %s", response.read().decode("utf-8")) From bb62dca7bd40b5e3c597383b110a0d2a8c28be0b Mon Sep 17 00:00:00 2001 From: Michele Facchinelli Date: Sat, 15 Jun 2024 09:32:37 +0000 Subject: [PATCH 4/4] fix: add end date to list of valid arguments for `query` --- imap_data_access/cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/imap_data_access/cli.py b/imap_data_access/cli.py index 08a0c2c..37af021 100644 --- a/imap_data_access/cli.py +++ b/imap_data_access/cli.py @@ -94,6 +94,7 @@ def _query_parser(args: argparse.Namespace): "data_level", "descriptor", "start_date", + "end_date", "repointing", "version", "extension",