Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: simplify object path split #1028

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 14 additions & 54 deletions src/uproot/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import itertools
import numbers
import os
import pathlib
import platform
import re
import warnings
Expand Down Expand Up @@ -275,14 +274,10 @@ def regularize_path(path):
_windows_absolute_path_pattern = re.compile(r"^[A-Za-z]:[\\/]")
_windows_absolute_path_pattern_slash = re.compile(r"^[\\/][A-Za-z]:[\\/]")

# These schemes may not appear in fsspec if the corresponding libraries are not installed (e.g. s3fs)
_remote_schemes = ["root", "s3", "http", "https"]
_schemes = list({*_remote_schemes, *fsspec.available_protocols()})

_uri_scheme = re.compile("^(" + "|".join([re.escape(x) for x in _schemes]) + ")://")
_uri_scheme_chain = re.compile(
"^(" + "|".join([re.escape(x) for x in _schemes]) + ")::"
)


def file_object_path_split(urlpath: str) -> tuple[str, str | None]:
"""
Expand All @@ -298,54 +293,19 @@ def file_object_path_split(urlpath: str) -> tuple[str, str | None]:
"""

urlpath: str = regularize_path(urlpath).strip()
path = urlpath

def _split_path(path: str) -> list[str]:
parts = path.split(":")
if pathlib.PureWindowsPath(path).drive:
# Windows absolute path
assert len(parts) >= 2, f"could not split object from windows path {path}"
parts = [parts[0] + ":" + parts[1]] + parts[2:]
return parts

if "://" not in path:
path = "file://" + path

# replace the match of _uri_scheme_chain with "" until there is no match
while _uri_scheme_chain.match(path):
path = _uri_scheme_chain.sub("", path)

if _uri_scheme.match(path):
# if not a local path, attempt to match a URI scheme
if path.startswith("file://"):
parsed_url_path = path[7:]
else:
parsed_url_path = urlparse(path).path

if parsed_url_path.startswith("//"):
parsed_url_path = parsed_url_path[2:]

parts = _split_path(parsed_url_path)
else:
# invalid scheme
scheme = path.split("://")[0]
raise ValueError(
f"Invalid URI scheme: '{scheme}://' in {path}. Available schemes: {', '.join(_schemes)}."
)

if len(parts) == 1:
obj = None
elif len(parts) == 2:
obj = parts[1]
# remove the object from the path (including the colon)
urlpath = urlpath[: -len(obj) - 1]
# clean badly placed slashes
obj = obj.strip().lstrip("/")
while "//" in obj:
obj = obj.replace("//", "/")
else:
raise ValueError(f"could not split object from path {path}")

obj = None

separator = "::"
parts = urlpath.split(separator)
object_regex = re.compile(r"(.+\.root):(.*$)")
for i, part in enumerate(reversed(parts)):
match = object_regex.match(part)
if match:
obj = re.sub(r"/+", "/", match.group(2).strip().lstrip("/")).rstrip("/")
parts[-i - 1] = match.group(1)
break

urlpath = separator.join(parts)
return urlpath, obj


Expand Down
8 changes: 4 additions & 4 deletions tests/test_0001_source_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,13 +148,13 @@ def test_colons_and_ports():
"https://example.com:443",
None,
)
assert uproot._util.file_object_path_split("https://example.com:443/something") == (
"https://example.com:443/something",
assert uproot._util.file_object_path_split("https://example.com:443/file.root") == (
"https://example.com:443/file.root",
None,
)
assert uproot._util.file_object_path_split(
"https://example.com:443/something:else"
) == ("https://example.com:443/something", "else")
"https://example.com:443/file.root:object"
) == ("https://example.com:443/file.root", "object")


@pytest.mark.parametrize("use_threads", [True, False], indirect=True)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_0692_fsspec_reading.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def test_fsspec_zip(tmp_path):

# open with fsspec
with uproot.open(
f"zip://{filename}::file://{filename_zip}:Events/MET_pt"
f"zip://{filename}:Events/MET_pt::file://{filename_zip}"
) as branch:
data = branch.array(library="np")
assert len(data) == 40
74 changes: 60 additions & 14 deletions tests/test_0976_path_object_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,24 +64,38 @@
),
),
(
"ssh://user@host:22/path/to/file:object",
"ssh://user@host:22/path/to/file.root:/object//path",
(
"ssh://user@host:22/path/to/file",
"object",
"ssh://user@host:22/path/to/file.root",
"object/path",
),
),
(
"ssh://user@host:50230/path/to/file",
"ssh://user@host:22/path/to/file.root:/object//path:with:colon:in:path/something/",
(
"ssh://user@host:50230/path/to/file",
"ssh://user@host:22/path/to/file.root",
"object/path:with:colon:in:path/something",
),
),
(
"ssh://user@host:50230/path/to/file.root",
(
"ssh://user@host:50230/path/to/file.root",
None,
),
),
(
"s3://bucket/path/to/file:object",
"s3://bucket/path/to/file.root:/dir////object",
(
"s3://bucket/path/to/file.root",
"dir/object",
),
),
(
"s3://bucket/path/to/file.root:",
(
"s3://bucket/path/to/file",
"object",
"s3://bucket/path/to/file.root",
"",
),
),
(
Expand All @@ -98,27 +112,56 @@
None,
),
),
# https://github.com/scikit-hep/uproot5/issues/975
(
"zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip:Events/MET_pt",
"DAOD_PHYSLITE_2023-09-13T1230.art.rntuple.root:RNT:CollectionTree",
(
"DAOD_PHYSLITE_2023-09-13T1230.art.rntuple.root",
"RNT:CollectionTree",
),
),
(
"zip://uproot-issue121.root:Events/MET_pt::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
(
"zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
"Events/MET_pt",
),
),
(
"simplecache::zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip:Events/MET_pt",
"simplecache::zip://uproot-issue121.root:Events/MET_pt::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
(
"simplecache::zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip",
"Events/MET_pt",
),
),
(
r"zip://uproot-issue121.root::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip:Events/MET_pt",
r"zip://uproot-issue121.root:Events/MET_pt::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip",
(
r"zip://uproot-issue121.root::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip",
"Events/MET_pt",
),
),
(
"zip://uproot-issue121.root:Events/MET_pt::file:///some/weird/path:with:colons/file.root",
(
"zip://uproot-issue121.root::file:///some/weird/path:with:colons/file.root",
"Events/MET_pt",
),
),
(
"/some/weird/path:with:colons/file.root:Events/MET_pt",
(
"/some/weird/path:with:colons/file.root",
"Events/MET_pt",
),
),
(
"/some/weird/path:with:colons/file.root",
(
"/some/weird/path:with:colons/file.root",
None,
),
),
],
)
def test_url_split(input_value, expected_output):
Expand All @@ -131,9 +174,12 @@ def test_url_split(input_value, expected_output):
@pytest.mark.parametrize(
"input_value",
[
"local/file.root://Events",
"local/file.root.zip://Events",
"local/file.roo://Events",
"local/file://Events",
],
)
def test_url_split_invalid(input_value):
with pytest.raises(ValueError):
uproot._util.file_object_path_split(input_value)
path, obj = uproot._util.file_object_path_split(input_value)
assert obj is None
assert path == input_value
Loading