diff --git a/scripts/files/fs.py b/scripts/files/fs.py index ce5ec6a12..96a7a23ce 100644 --- a/scripts/files/fs.py +++ b/scripts/files/fs.py @@ -28,3 +28,17 @@ def read(path: str) -> bytes: return fs_s3.read(path) return fs_local.read(path) + + +def exists(path: str) -> bool: + """Check if path (file or directory) exists + + Args: + path: A path to a directory or file + + Returns: + True if the path exists + """ + if is_s3(path): + return fs_s3.exists(path) + return fs_local.exists(path) diff --git a/scripts/files/fs_local.py b/scripts/files/fs_local.py index 9b7749afa..657e6961f 100644 --- a/scripts/files/fs_local.py +++ b/scripts/files/fs_local.py @@ -1,3 +1,6 @@ +import os + + def write(destination: str, source: bytes) -> None: """Write the source to the local destination file. @@ -20,3 +23,15 @@ def read(path: str) -> bytes: """ with open(path, "rb") as file: return file.read() + + +def exists(path: str) -> bool: + """Check if path (file or directory) exists + + Args: + path: A local path to a directory or file + + Returns: + True if the path exists + """ + return os.path.exists(path) diff --git a/scripts/files/fs_s3.py b/scripts/files/fs_s3.py index 5ef0cb85e..1b4ab05df 100644 --- a/scripts/files/fs_s3.py +++ b/scripts/files/fs_s3.py @@ -76,6 +76,56 @@ def read(path: str, needs_credentials: bool = False) -> bytes: return file +def exists(path: str, needs_credentials: bool = False) -> bool: + """Check if s3 Object exists + + Args: + path: path to the s3 object/key + needs_credentials: if access to object needs credentials. Defaults to False. 
+ + Raises: + ce: ClientError + nsb: NoSuchBucket + + Returns: + True if the S3 Object exists + """ + s3_path, key = parse_path(path) + s3 = boto3.resource("s3") + + try: + if needs_credentials: + s3 = get_session(path).resource("s3") + + if path.endswith("/"): + bucket_name = bucket_name_from_path(s3_path) + bucket = s3.Bucket(bucket_name) + # MaxKeys limits to 1 object in the response + objects = bucket.objects.filter(Prefix=key, MaxKeys=1) + + if len(list(objects)) > 0: + return True + return False + + # load() fetches the metadata, not the data. Calls a `head` behind the scenes. + s3.Object(s3_path, key).load() + return True + except s3.meta.client.exceptions.NoSuchBucket as nsb: + get_log().debug("s3_bucket_not_found", path=path, info=f"The specified bucket does not seem to exist: {nsb}") + return False + except s3.meta.client.exceptions.ClientError as ce: + if not needs_credentials and ce.response["Error"]["Code"] == "AccessDenied": + get_log().debug("read_s3_needs_credentials", path=path) + return exists(path, True) + # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html#parsing-error-responses-and-catching-exceptions-from-aws-services + # 404 for NoSuchKey - https://github.com/boto/boto3/issues/2442 + if ce.response["Error"]["Code"] == "404": + get_log().debug("s3_key_not_found", path=path, info=f"The specified key does not seem to exist: {ce}") + return False + get_log().error("s3_client_error", path=path, error=f"ClientError raised: {ce}") + raise ce + + def bucket_name_from_path(path: str) -> str: path_parts = path.replace("s3://", "").split("/") return path_parts.pop(0) diff --git a/scripts/files/tests/fs_local_test.py b/scripts/files/tests/fs_local_test.py index 4248df41d..fc6feb9c0 100644 --- a/scripts/files/tests/fs_local_test.py +++ b/scripts/files/tests/fs_local_test.py @@ -5,7 +5,7 @@ import pytest -from scripts.files.fs_local import read, write +from scripts.files.fs_local import exists, read, write 
@pytest.fixture(name="setup", autouse=True) @@ -36,3 +36,18 @@ def test_read(setup: str) -> None: write(path, content) file_content = read(path) assert file_content == content + + +@pytest.mark.dependency(name="exists", depends=["write"]) +def test_exists(setup: str) -> None: + content = b"test content" + target = setup + path = os.path.join(target, "test.file") + write(path, content) + found = exists(path) + assert found is True + + +def test_exists_file_not_found() -> None: + found = exists("/tmp/test.file") + assert found is False diff --git a/scripts/files/tests/fs_s3_test.py b/scripts/files/tests/fs_s3_test.py index 437fc0514..8eac507b7 100644 --- a/scripts/files/tests/fs_s3_test.py +++ b/scripts/files/tests/fs_s3_test.py @@ -7,7 +7,7 @@ from moto.s3.responses import DEFAULT_REGION_NAME from pytest import CaptureFixture -from scripts.files.fs_s3 import read, write +from scripts.files.fs_s3 import exists, read, write @mock_s3 # type: ignore @@ -54,3 +54,60 @@ def test_read_key_not_found(capsys: CaptureFixture[str]) -> None: logs = json.loads(capsys.readouterr().out) assert logs["msg"] == "s3_key_not_found" + + +@mock_s3 # type: ignore +def test_exists() -> None: + s3 = boto3.resource("s3", region_name=DEFAULT_REGION_NAME) + client = boto3.client("s3", region_name=DEFAULT_REGION_NAME) + s3.create_bucket(Bucket="testbucket") + client.put_object(Bucket="testbucket", Key="test.file", Body=b"test content") + + file_exists = exists("s3://testbucket/test.file") + + assert file_exists is True + + +@mock_s3 # type: ignore +def test_directory_exists() -> None: + s3 = boto3.resource("s3", region_name=DEFAULT_REGION_NAME) + client = boto3.client("s3", region_name=DEFAULT_REGION_NAME) + s3.create_bucket(Bucket="testbucket") + client.put_object(Bucket="testbucket", Key="hello/test.file", Body=b"test content") + + directory_exists = exists("s3://testbucket/hello/") + + assert directory_exists is True + + +@mock_s3 # type: ignore +def test_exists_bucket_not_exists(capsys: 
CaptureFixture[str]) -> None: + file_exists = exists("s3://testbucket/test.file") + + logs = json.loads(capsys.readouterr().out) + assert logs["msg"] == "s3_bucket_not_found" + assert file_exists is False + + +@mock_s3 # type: ignore +def test_exists_object_not_exists() -> None: + s3 = boto3.resource("s3", region_name=DEFAULT_REGION_NAME) + client = boto3.client("s3", region_name=DEFAULT_REGION_NAME) + s3.create_bucket(Bucket="testbucket") + client.put_object(Bucket="testbucket", Key="hello/another.file", Body=b"test content") + + file_exists = exists("s3://testbucket/test.file") + + assert file_exists is False + + +@mock_s3 # type: ignore +def test_exists_object_starting_with_not_exists() -> None: + s3 = boto3.resource("s3", region_name=DEFAULT_REGION_NAME) + client = boto3.client("s3", region_name=DEFAULT_REGION_NAME) + s3.create_bucket(Bucket="testbucket") + client.put_object(Bucket="testbucket", Key="hello/another.file", Body=b"test content") + + file_exists = exists("s3://testbucket/hello/another.fi") + + assert file_exists is False