From 71219d7f68b3839ae1c5ca4668320647d092a3c0 Mon Sep 17 00:00:00 2001 From: karthik venkataramani Date: Mon, 12 Feb 2024 14:20:31 -0800 Subject: [PATCH 01/13] use pathlib.PurePath for file/folder operations --- earthaccess/store.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/earthaccess/store.py b/earthaccess/store.py index 4981fe50..ee9b9033 100644 --- a/earthaccess/store.py +++ b/earthaccess/store.py @@ -1,5 +1,5 @@ import datetime -import os +from pathlib import Path import shutil import traceback from functools import lru_cache @@ -466,7 +466,7 @@ def get( List of downloaded files """ if local_path is None: - local_path = os.path.join( + local_path = Path( ".", "data", f"{datetime.datetime.today().strftime('%Y-%m-%d')}-{uuid4().hex[:6]}", @@ -526,7 +526,7 @@ def _get_urls( # TODO: make this parallel or concurrent for file in data_links: s3_fs.get(file, local_path) - file_name = os.path.join(local_path, os.path.basename(file)) + file_name = Path(local_path, Path(file).name) print(f"Downloaded: {file_name}") downloaded_files.append(file_name) return downloaded_files @@ -572,7 +572,7 @@ def _get_granules( # TODO: make this async for file in data_links: s3_fs.get(file, local_path) - file_name = os.path.join(local_path, os.path.basename(file)) + file_name = Path(local_path, Path(file).name) print(f"Downloaded: {file_name}") downloaded_files.append(file_name) return downloaded_files @@ -597,7 +597,7 @@ def _download_file(self, url: str, directory: str) -> str: local_filename = url.split("/")[-1] path = Path(directory) / Path(local_filename) local_path = str(path) - if not os.path.exists(local_path): + if not Path(local_path).exists(): try: session = self.auth.get_session() with session.get( @@ -638,8 +638,8 @@ def _download_onprem_granules( raise ValueError( "We need to be logged into NASA EDL in order to download data granules" ) - if not os.path.exists(directory): - os.makedirs(directory) + if not Path(directory).exists(): + Path(directory).mkdir(parents=True) arguments = [(url, directory) for url in urls] results = pqdm( From 2f39a736c6f778b3a2f363326e3055f9c33bcf90 Mon Sep 17 00:00:00 2001 From: karthik venkataramani Date: Mon, 12 Feb 2024 14:27:40 -0800 Subject: [PATCH 02/13] update to use pathlib --- earthaccess/auth.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/earthaccess/auth.py b/earthaccess/auth.py index a8335f73..6308bf23 100644 --- a/earthaccess/auth.py +++ b/earthaccess/auth.py @@ -2,6 +2,7 @@ import importlib.metadata import logging import os +from pathlib import Path from netrc import NetrcParseError from pathlib import Path from typing import Any, Dict, List, Optional @@ -258,7 +259,7 @@ def _netrc(self) -> bool: my_netrc = Netrc() except FileNotFoundError as err: raise FileNotFoundError( - f"No .netrc found in {os.path.expanduser('~')}" + f"No .netrc found in {Path('~').expanduser()}" ) from err except NetrcParseError as err: raise NetrcParseError("Unable to parse .netrc") from err @@ -365,7 +366,7 @@ def _persist_user_credentials(self, username: str, password: str) -> bool: try: netrc_path = Path().home().joinpath(".netrc") netrc_path.touch(exist_ok=True) - os.chmod(netrc_path.absolute(), 0o600) + netrc_path.chmod(0o600) except Exception as e: print(e) return False From 1657e63aaa19c3e78d6ae61fec7d61cae4f01286 Mon Sep 17 00:00:00 2001 From: karthik venkataramani Date: Mon, 12 Feb 2024 22:06:07 -0800 Subject: [PATCH 03/13] update to use pathlib --- tests/integration/test_kerchunk.py | 5 +++-- 1 file changed, 
3 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_kerchunk.py b/tests/integration/test_kerchunk.py index 58f93077..f15f5a1b 100644 --- a/tests/integration/test_kerchunk.py +++ b/tests/integration/test_kerchunk.py @@ -1,5 +1,6 @@ import logging import os +from pathlib import Path import unittest import earthaccess @@ -32,14 +33,14 @@ def granules(): @pytest.mark.parametrize("protocol", ["", "file://"]) def test_consolidate_metadata_outfile(tmp_path, granules, protocol): outfile = f"{protocol}{tmp_path / 'metadata.json'}" - assert not os.path.exists(outfile) + assert not Path(outfile).exists() result = earthaccess.consolidate_metadata( granules, outfile=outfile, access="indirect", kerchunk_options={"concat_dims": "Time"}, ) - assert os.path.exists(strip_protocol(outfile)) + assert Path(strip_protocol(outfile)).exists() assert result == outfile From 65320734bd5f02f43a9b3d7d7fd1c0a6691e2f87 Mon Sep 17 00:00:00 2001 From: karthik venkataramani Date: Mon, 12 Feb 2024 22:07:48 -0800 Subject: [PATCH 04/13] update to use pathlib --- tests/integration/test_api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_api.py b/tests/integration/test_api.py index 6fa1ccea..62cdb3e3 100644 --- a/tests/integration/test_api.py +++ b/tests/integration/test_api.py @@ -1,6 +1,7 @@ # package imports import logging import os +from pathlib import Path import unittest import earthaccess @@ -84,7 +85,7 @@ def test_download(tmp_path, selection, use_url): result = results[selection] files = earthaccess.download(result, str(tmp_path)) assertions.assertIsInstance(files, list) - assert all(os.path.exists(f) for f in files) + assert all(Path(f).exists() for f in files) def test_auth_environ(): From 9820912cb601f2d0eb19daabf9439c80b3aa2661 Mon Sep 17 00:00:00 2001 From: karthik venkataramani Date: Mon, 12 Feb 2024 22:11:21 -0800 Subject: [PATCH 05/13] update to use pathlib --- tests/integration/test_cloud_download.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_cloud_download.py b/tests/integration/test_cloud_download.py index 63a05b93..07c13e99 100644 --- a/tests/integration/test_cloud_download.py +++ b/tests/integration/test_cloud_download.py @@ -1,6 +1,7 @@ # package imports import logging import os +from pathlib import Path import random import shutil import unittest @@ -166,4 +167,4 @@ def test_multi_file_granule(tmp_path): urls = granules[0].data_links() assert len(urls) > 1 files = earthaccess.download(granules, str(tmp_path)) - assert set(map(os.path.basename, urls)) == set(map(os.path.basename, files)) + assert set([Path(f).name for f in urls]) == set([Path(f).name for f in files]) From 7ea651f3d6b7660eba2626b1dcb349117bbb46cd Mon Sep 17 00:00:00 2001 From: Karthik Venkataramani Date: Tue, 13 Feb 2024 14:57:13 -0800 Subject: [PATCH 06/13] Update earthaccess/auth.py Co-authored-by: Joseph H Kennedy --- earthaccess/auth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earthaccess/auth.py b/earthaccess/auth.py index 6308bf23..c6c830db 100644 --- a/earthaccess/auth.py +++ b/earthaccess/auth.py @@ -259,7 +259,7 @@ def _netrc(self) -> bool: my_netrc = Netrc() except FileNotFoundError as err: raise FileNotFoundError( - f"No .netrc found in {Path('~').expanduser()}" + f"No .netrc found in {Path.home()}" ) from err except NetrcParseError as err: raise NetrcParseError("Unable to parse .netrc") from err From 780c1f43f40be7637bdf47264dd1ffb12bd6308e Mon Sep 17 00:00:00 2001 From: Karthik Venkataramani 
Date: Tue, 13 Feb 2024 14:57:35 -0800 Subject: [PATCH 07/13] Update earthaccess/store.py Co-authored-by: Joseph H Kennedy --- earthaccess/store.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/earthaccess/store.py b/earthaccess/store.py index ee9b9033..32dd2910 100644 --- a/earthaccess/store.py +++ b/earthaccess/store.py @@ -638,8 +638,7 @@ def _download_onprem_granules( raise ValueError( "We need to be logged into NASA EDL in order to download data granules" ) - if not Path(directory).exists(): - Path(directory).mkdir(parents=True) + Path(directory).mkdir(parents=True, exist_ok=True) arguments = [(url, directory) for url in urls] results = pqdm( From 9b7928ecf0e45254eb9b417b47ab1d2f48500272 Mon Sep 17 00:00:00 2001 From: Karthik Venkataramani Date: Tue, 13 Feb 2024 14:57:40 -0800 Subject: [PATCH 08/13] Update earthaccess/store.py Co-authored-by: Joseph H Kennedy --- earthaccess/store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earthaccess/store.py b/earthaccess/store.py index 32dd2910..abe0c85c 100644 --- a/earthaccess/store.py +++ b/earthaccess/store.py @@ -572,7 +572,7 @@ def _get_granules( # TODO: make this async for file in data_links: s3_fs.get(file, local_path) - file_name = Path(local_path, Path(file).name) + file_name = Path(local_path) / Path(file).name print(f"Downloaded: {file_name}") downloaded_files.append(file_name) return downloaded_files From 6591ead54604bc58244c7301b0381af2ad236e1d Mon Sep 17 00:00:00 2001 From: Karthik Venkataramani Date: Tue, 13 Feb 2024 14:57:45 -0800 Subject: [PATCH 09/13] Update earthaccess/store.py Co-authored-by: Joseph H Kennedy --- earthaccess/store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earthaccess/store.py b/earthaccess/store.py index abe0c85c..3cbb24d1 100644 --- a/earthaccess/store.py +++ b/earthaccess/store.py @@ -526,7 +526,7 @@ def _get_urls( # TODO: make this parallel or concurrent for file in data_links: s3_fs.get(file, local_path) - file_name = Path(local_path, Path(file).name) + file_name = Path(local_path) / Path(file).name print(f"Downloaded: {file_name}") downloaded_files.append(file_name) return downloaded_files From 95ce27f472d35b9b7afe4085d97657c1c89cad1f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 Feb 2024 23:46:14 +0000 Subject: [PATCH 10/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- earthaccess/auth.py | 5 +---- earthaccess/store.py | 1 - tests/integration/test_api.py | 2 +- tests/integration/test_cloud_download.py | 1 - tests/integration/test_kerchunk.py | 2 +- 5 files changed, 3 insertions(+), 8 deletions(-) diff --git a/earthaccess/auth.py b/earthaccess/auth.py index c6c830db..2a820402 100644 --- a/earthaccess/auth.py +++ b/earthaccess/auth.py @@ -2,7 +2,6 @@ import importlib.metadata import logging import os -from pathlib import Path from netrc import NetrcParseError from pathlib import Path from typing import Any, Dict, List, Optional @@ -258,9 +257,7 @@ def _netrc(self) -> bool: try: my_netrc = Netrc() except FileNotFoundError as err: - raise FileNotFoundError( - f"No .netrc found in {Path.home()}" - ) from err + raise FileNotFoundError(f"No .netrc found in {Path.home()}") from err except NetrcParseError as err: raise NetrcParseError("Unable to parse .netrc") from err if my_netrc["urs.earthdata.nasa.gov"] is not None: diff --git a/earthaccess/store.py b/earthaccess/store.py index 3cbb24d1..9ec9c82c 100644 
--- a/earthaccess/store.py +++ b/earthaccess/store.py @@ -1,5 +1,4 @@ import datetime -from pathlib import Path import shutil import traceback from functools import lru_cache diff --git a/tests/integration/test_api.py b/tests/integration/test_api.py index 62cdb3e3..8fd45489 100644 --- a/tests/integration/test_api.py +++ b/tests/integration/test_api.py @@ -1,8 +1,8 @@ # package imports import logging import os -from pathlib import Path import unittest +from pathlib import Path import earthaccess import pytest diff --git a/tests/integration/test_cloud_download.py b/tests/integration/test_cloud_download.py index 07c13e99..a9b9432c 100644 --- a/tests/integration/test_cloud_download.py +++ b/tests/integration/test_cloud_download.py @@ -1,7 +1,6 @@ # package imports import logging import os -from pathlib import Path import random import shutil import unittest diff --git a/tests/integration/test_kerchunk.py b/tests/integration/test_kerchunk.py index f15f5a1b..2e981cce 100644 --- a/tests/integration/test_kerchunk.py +++ b/tests/integration/test_kerchunk.py @@ -1,7 +1,7 @@ import logging import os -from pathlib import Path import unittest +from pathlib import Path import earthaccess import pytest From c68e63449b611476ac3492a94a85cce47fcb871c Mon Sep 17 00:00:00 2001 From: karthik venkataramani Date: Tue, 13 Feb 2024 16:57:38 -0800 Subject: [PATCH 11/13] update typehints and code for _get methods to use Path consistently --- earthaccess/store.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/earthaccess/store.py b/earthaccess/store.py index 9ec9c82c..44853918 100644 --- a/earthaccess/store.py +++ b/earthaccess/store.py @@ -480,7 +480,7 @@ def get( def _get( self, granules: Union[List[DataGranule], List[str]], - local_path: str, + local_path: Path, provider: Optional[str] = None, threads: int = 8, ) -> List[str]: @@ -508,7 +508,7 @@ def _get( def _get_urls( self, granules: List[str], - local_path: str, + local_path: Path, provider: Optional[str] = None, threads: int = 8, ) -> List[str]: @@ -524,8 +524,8 @@ def _get_urls( s3_fs = self.get_s3fs_session(provider=provider) # TODO: make this parallel or concurrent for file in data_links: - s3_fs.get(file, local_path) - file_name = Path(local_path) / Path(file).name + s3_fs.get(file, str(local_path)) + file_name = local_path / Path(file).name print(f"Downloaded: {file_name}") downloaded_files.append(file_name) return downloaded_files @@ -538,7 +538,7 @@ def _get_urls( def _get_granules( self, granules: List[DataGranule], - local_path: str, + local_path: Path, provider: Optional[str] = None, threads: int = 8, ) -> List[str]: @@ -570,15 +570,15 @@ def _get_granules( s3_fs = self.get_s3fs_session(provider=provider) # TODO: make this async for file in data_links: - s3_fs.get(file, local_path) - file_name = Path(local_path) / Path(file).name + s3_fs.get(file, str(local_path)) + file_name = local_path / Path(file).name print(f"Downloaded: {file_name}") downloaded_files.append(file_name) return downloaded_files else: # if the data are cloud-based, but we are not in AWS, # it will be downloaded as if it was on prem - return self._download_onprem_granules(data_links, local_path, threads) + return self._download_onprem_granules(data_links, str(local_path), threads) def _download_file(self, url: str, directory: str) -> str: """Download a single file from an on-prem location, a DAAC data center. 
@@ -595,8 +595,7 @@ def _download_file(self, url: str, directory: str) -> str: url = url.replace(".html", "") local_filename = url.split("/")[-1] path = Path(directory) / Path(local_filename) - local_path = str(path) - if not Path(local_path).exists(): + if not path.exists(): try: session = self.auth.get_session() with session.get( @@ -605,7 +604,7 @@ def _download_file(self, url: str, directory: str) -> str: allow_redirects=True, ) as r: r.raise_for_status() - with open(local_path, "wb") as f: + with open(path, "wb") as f: # This is to cap memory usage for large files at 1MB per write to disk per thread # https://docs.python-requests.org/en/latest/user/quickstart/#raw-response-content shutil.copyfileobj(r.raw, f, length=1024 * 1024) @@ -615,7 +614,7 @@ def _download_file(self, url: str, directory: str) -> str: raise Exception else: print(f"File {local_filename} already downloaded") - return local_path + return str(path) def _download_onprem_granules( self, urls: List[str], directory: str, threads: int = 8 From b036811b4f1827c97c02e2af470117dcc996b659 Mon Sep 17 00:00:00 2001 From: Joseph H Kennedy Date: Fri, 16 Feb 2024 15:06:14 -0900 Subject: [PATCH 12/13] a little path vs str cleanup --- earthaccess/store.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/earthaccess/store.py b/earthaccess/store.py index 44853918..15ae4ef2 100644 --- a/earthaccess/store.py +++ b/earthaccess/store.py @@ -442,7 +442,7 @@ def _open_urls( def get( self, granules: Union[List[DataGranule], List[str]], - local_path: Optional[str] = None, + local_path: Optional[Path] = None, provider: Optional[str] = None, threads: int = 8, ) -> List[str]: @@ -465,11 +465,10 @@ def get( List of downloaded files """ if local_path is None: - local_path = Path( - ".", - "data", - f"{datetime.datetime.today().strftime('%Y-%m-%d')}-{uuid4().hex[:6]}", - ) + today = datetime.datetime.today().strftime("%Y-%m-%d") + uuid = uuid4().hex[:6] + local_path = Path.cwd() / "data" / f"{today}-{uuid}" + if len(granules): files = self._get(granules, local_path, provider, threads) return files @@ -578,9 +577,9 @@ def _get_granules( else: # if the data are cloud-based, but we are not in AWS, # it will be downloaded as if it was on prem - return self._download_onprem_granules(data_links, str(local_path), threads) + return self._download_onprem_granules(data_links, local_path, threads) - def _download_file(self, url: str, directory: str) -> str: + def _download_file(self, url: str, directory: Path) -> str: """Download a single file from an on-prem location, a DAAC data center. Parameters: @@ -594,7 +593,7 @@ def _download_file(self, url: str, directory: str) -> str: if "opendap" in url and url.endswith(".html"): url = url.replace(".html", "") local_filename = url.split("/")[-1] - path = Path(directory) / Path(local_filename) + path = directory / Path(local_filename) if not path.exists(): try: session = self.auth.get_session() @@ -617,7 +616,7 @@ def _download_file(self, url: str, directory: str) -> str: return str(path) def _download_onprem_granules( - self, urls: List[str], directory: str, threads: int = 8 + self, urls: List[str], directory: Path, threads: int = 8 ) -> List[Any]: """Downloads a list of URLS into the data directory. 
@@ -636,7 +635,7 @@ def _download_onprem_granules( raise ValueError( "We need to be logged into NASA EDL in order to download data granules" ) - Path(directory).mkdir(parents=True, exist_ok=True) + directory.mkdir(parents=True, exist_ok=True) arguments = [(url, directory) for url in urls] results = pqdm( From 453e7f968977936a4fa662d739a2ed0dab79dcb8 Mon Sep 17 00:00:00 2001 From: Joseph H Kennedy Date: Fri, 16 Feb 2024 15:13:35 -0900 Subject: [PATCH 13/13] one more os->pathlib --- tests/integration/test_auth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_auth.py b/tests/integration/test_auth.py index d1bfae1e..a4879d12 100644 --- a/tests/integration/test_auth.py +++ b/tests/integration/test_auth.py @@ -30,7 +30,7 @@ def activate_netrc(): f.write( f"machine urs.earthdata.nasa.gov login {username} password {password}\n" ) - os.chmod(NETRC_PATH, 0o600) + NETRC_PATH.chmod(0o600) def delete_netrc():
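Taken together, the thirteen patches swap the `os`/`os.path` calls in `earthaccess` and its tests for their `pathlib.Path` equivalents. The sketch below summarizes that mapping, pairing each legacy call with its replacement; the URL, directory names, and the temporary `.netrc`-style file are illustrative placeholders, not code from the repository.

```python
"""Minimal sketch of the os.path -> pathlib mapping applied by this series.

All file and directory names here are illustrative placeholders.
"""
import datetime
import os
import tempfile
from pathlib import Path
from uuid import uuid4

url = "https://example.com/granules/file.nc"

# os.path.basename(url)              -> Path(url).name
assert os.path.basename(url) == Path(url).name == "file.nc"

# os.path.join(a, b, c)              -> Path(a) / b / c
today = datetime.datetime.today().strftime("%Y-%m-%d")
local_path = Path.cwd() / "data" / f"{today}-{uuid4().hex[:6]}"
assert str(local_path) == os.path.join(os.getcwd(), "data", local_path.name)

# os.path.expanduser("~")            -> Path.home()
assert Path.home() == Path(os.path.expanduser("~"))

with tempfile.TemporaryDirectory() as tmp:
    directory = Path(tmp) / "data" / "demo"

    # "if not os.path.exists(d): os.makedirs(d)"
    #                                  -> d.mkdir(parents=True, exist_ok=True)
    directory.mkdir(parents=True, exist_ok=True)

    # os.path.exists(p)                -> Path(p).exists()
    assert directory.exists()

    # os.chmod(p, 0o600)               -> Path(p).chmod(0o600)
    netrc_like = directory / ".netrc"
    netrc_like.touch(exist_ok=True)
    netrc_like.chmod(0o600)
```

The later patches in the series (11 and 12) also push `Path` objects through the type hints of `get`, `_get`, `_get_urls`, `_get_granules`, `_download_file`, and `_download_onprem_granules`, so the `str(...)` conversion happens only at the `s3fs.get` boundary rather than being scattered through the download path.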