Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/anonymize #9

Merged
merged 5 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Keep it human-readable, your future self will thank you!
- Docsig precommit hooks
- Changelog merge strategy
- Codeowners file
- Create dependency on wcwidth. MIT licence.
- Add anonymize() function.

### Changed
- downstream-ci should only run for changes in src and tests
Expand Down
115 changes: 115 additions & 0 deletions src/anemoi/utils/sanetise.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.


import os
import re
from pathlib import Path
from urllib.parse import parse_qs
from urllib.parse import urlencode
from urllib.parse import urlparse
from urllib.parse import urlunparse

# Patterns used by earthkit-data for url-patterns and path-patterns.
# A path component that matches one of them is kept verbatim when a path
# is shortened, instead of being collapsed into "...".

# Matches a component beginning with a brace placeholder, e.g. "{date:strftime(%Y)}".
RE1 = re.compile(r"{([^}]*)}")
# Matches a component beginning with a parenthesised group.
# NOTE(review): the character class excludes "}" rather than ")" -- this looks
# copied from RE1; confirm whether "[^)]" was intended.
RE2 = re.compile(r"\(([^}]*)\)")


def sanetise(obj):
    """Return a copy of ``obj`` with sensitive strings scrubbed.

    Strings are cleaned individually (full paths are shortened and URL
    credentials are masked).  Dictionaries, lists and tuples are rebuilt
    recursively, with dictionary keys cleaned as well as values.  Any other
    kind of object is returned untouched.
    """

    # Leaf case first: a single string.
    if isinstance(obj, str):
        return _sanetise_string(obj)

    if isinstance(obj, dict):
        cleaned = {}
        for key, value in obj.items():
            new_key = sanetise(key)
            cleaned[new_key] = sanetise(value)
        return cleaned

    if isinstance(obj, list):
        return list(map(sanetise, obj))

    if isinstance(obj, tuple):
        return tuple(map(sanetise, obj))

    # Numbers, None, and anything else are passed through as they are.
    return obj


def _sanetise_string(obj):

parsed = urlparse(obj, allow_fragments=True)

if parsed.scheme:
return _sanetise_url(parsed)

if obj.startswith("/") or obj.startswith("~"):
return _sanetise_path(obj)

return obj


def _sanetise_url(parsed):

LIST = [
"pass",
"password",
"token",
"user",
"key",
"pwd",
"_key",
"_token",
"apikey",
"api_key",
"api_token",
"_api_token",
"_api_key",
"username",
"login",
]

scheme, netloc, path, params, query, fragment = parsed

if parsed.password or parsed.username:
_, host = netloc.split("@")
user = "user:***" if parsed.password else "user"
netloc = f"{user}@{host}"

if query:
qs = parse_qs(query)
for k in LIST:
if k in qs:
qs[k] = "hidden"
query = urlencode(qs, doseq=True)

if params:
qs = parse_qs(params)
for k in LIST:
if k in qs:
qs[k] = "hidden"
params = urlencode(qs, doseq=True)

return urlunparse([scheme, netloc, path, params, query, fragment])


def _sanetise_path(path):
bits = list(reversed(Path(path).parts))
result = [bits.pop(0)]
for bit in bits:
if RE1.match(bit) or RE2.match(bit):
result.append(bit)
continue
if result[-1] == "...":
continue
result.append("...")
result = os.path.join(*reversed(result))
if bits[-1] == "/":
result = os.path.join("/", result)

return result
10 changes: 10 additions & 0 deletions src/anemoi/utils/sanetize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

# Re-export ``sanetise`` under the alternative spelling ``sanetize`` so that
# both ``anemoi.utils.sanetise`` and ``anemoi.utils.sanetize`` can be imported.
from .sanetise import sanetise as sanetize

# The alias is the only public name of this module.
__all__ = ["sanetize"]
69 changes: 69 additions & 0 deletions tests/test_sanetise.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.


from anemoi.utils.sanetise import sanetise


def test_sanetise_urls():
    """Check that credentials are masked in URLs.

    Covers user information in the network location, then every sensitive
    key both in the query string (``?key=...``) and in the params section
    (``;key=...``).
    """
    assert sanetise("http://johndoe:password@host:port/path") == "http://user:***@host:port/path"

    sensitive_keys = (
        "pass",
        "password",
        "token",
        "user",
        "key",
        "pwd",
        "_key",
        "_token",
        "apikey",
        "api_key",
        "api_token",
        "_api_token",
        "_api_key",
        "username",
        "login",
    )
    base = "http://www.example.com/path"

    # "?" introduces the query string, ";" the params section.
    for separator in ("?", ";"):
        for key in sensitive_keys:
            given = f"{base}{separator}{key}=secret"
            expected = f"{base}{separator}{key}=hidden"
            assert sanetise(given) == expected


def test_sanetise_paths():
    """Check that absolute paths are shortened and relative ones are kept."""
    # We want to keep earthkit-data's url and path pattern
    shortened = {
        "/home/johndoe/.ssh/id_rsa": "/.../id_rsa",
        "/data/model/{date:strftime(%Y)}/{date:strftime(%m)}/{date:strftime(%d)}/analysis.grib": (
            "/.../{date:strftime(%Y)}/{date:strftime(%m)}/{date:strftime(%d)}/analysis.grib"
        ),
    }
    for given, expected in shortened.items():
        assert sanetise(given) == expected

    # Relative paths must come back unchanged.
    untouched = (
        "test.grib",
        "../test.grib",
        "./test.grib",
        "sub/folder/test.grib",
        "./folder/test.grib",
    )
    for given in untouched:
        assert sanetise(given) == given


if __name__ == "__main__":
    # Run every test function of this module when it is executed as a script.
    # The namespace is copied first because the loop variables are themselves
    # module globals, and a dict must not change size while being iterated.
    snapshot = list(globals().items())
    for name, obj in snapshot:
        if not (name.startswith("test_") and callable(obj)):
            continue
        print(f"Running {name}...")
        obj()
Loading