Skip to content

Commit 91a7af0

Browse files
authored
Merge pull request #2707 from Suor/urlinfo
perf: refactor URLInfo to not use ParseResult inside
2 parents 809a85a + b83564d commit 91a7af0

File tree

3 files changed

+83
-92
lines changed

3 files changed

+83
-92
lines changed

dvc/path_info.py

Lines changed: 81 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,12 @@
88

99
from dvc.utils.compat import str, builtin_str, basestring, is_py2
1010
from dvc.utils.compat import pathlib, urlparse
11+
from dvc.utils import relpath
1112

1213

1314
# On Python 2.7/Windows sys.getfilesystemencoding() is set to mbcs,
1415
# which is lossy, thus we can't use that,
1516
# see https://github.com/mcmtroffaes/pathlib2/issues/56.
16-
from dvc.utils import relpath
17-
1817
if is_py2:
1918
fs_encoding = "utf-8"
2019

@@ -112,52 +111,80 @@ class PosixPathInfo(PathInfo, pathlib.PurePosixPath):
112111
pass
113112

114113

114+
class _URLPathInfo(PosixPathInfo):
    """POSIX path info that ``URLInfo`` uses for the path part of a URL."""

    def __str__(self):
        # The string form is whatever ``__fspath__`` reports.
        return self.__fspath__()

    # The Python 2 text-conversion hook shares the ``__str__`` implementation.
    __unicode__ = __str__
119+
120+
115121
class _URLPathParents(object):
116-
def __init__(self, pathcls, scheme, netloc, path):
117-
self._scheme = scheme
118-
self._netloc = netloc
119-
self._parents = path.parents
120-
self._pathcls = pathcls
122+
def __init__(self, src):
123+
self.src = src
124+
self._parents = self.src._path.parents
121125

122126
def __len__(self):
123127
return len(self._parents)
124128

125129
def __getitem__(self, idx):
126-
return self._pathcls.from_parts(
127-
scheme=self._scheme,
128-
netloc=self._netloc,
129-
path=self._parents[idx].fspath,
130-
)
130+
return self.src.replace(path=self._parents[idx])
131131

132132
def __repr__(self):
133-
return "<{}.parents>".format(self._pathcls.__name__)
133+
return "<{}.parents>".format(self.src)
134134

135135

136136
class URLInfo(object):
137137
DEFAULT_PORTS = {"http": 80, "https": 443, "ssh": 22, "hdfs": 0}
138138

139139
def __init__(self, url):
    """Parse ``url`` and populate the instance from its components.

    Asserts that the URL has no query, params, fragment or password.
    """
    parsed = urlparse(url)
    assert not (parsed.query or parsed.params or parsed.fragment)
    assert parsed.password is None

    self.fill_parts(
        parsed.scheme,
        parsed.hostname,
        parsed.username,
        parsed.port,
        parsed.path,
    )
142145

143146
@classmethod
def from_parts(
    cls, scheme=None, host=None, user=None, port=None, path="", netloc=None
):
    """Build an instance from URL components instead of a URL string.

    Exactly one of ``host`` and ``netloc`` must be truthy.  When ``netloc``
    is given it is formatted together with ``scheme`` and ``path`` into a
    URL string and handed to the regular constructor; otherwise the parts
    are stored directly through ``fill_parts`` without any parsing.
    """
    assert bool(host) ^ bool(netloc)

    # Branch on truthiness, the same test the assert above uses.  Checking
    # ``is not None`` let an empty-string netloc passed alongside a real
    # host take this branch and silently drop the host.
    if netloc:
        return cls("{}://{}{}".format(scheme, netloc, path))

    # Bypass ``__init__`` so the already-split parts are not re-parsed.
    obj = cls.__new__(cls)
    obj.fill_parts(scheme, host, user, port, path)
    return obj
158+
159+
def fill_parts(self, scheme, host, user, port, path):
    """Store the URL components on this instance.

    ``path`` is either a string or a ``_URLPathInfo``.  A non-empty string
    path without a leading slash gets one prepended.  A falsy ``port`` falls
    back to the scheme's entry in ``DEFAULT_PORTS``.
    """
    assert scheme != "remote"
    assert isinstance(path, (basestring, _URLPathInfo))

    self.scheme = scheme
    self.host = host
    self.user = user
    if port:
        self.port = int(port)
    else:
        self.port = self.DEFAULT_PORTS.get(scheme)

    if isinstance(path, _URLPathInfo):
        # Keep both forms: the string and the already-built path object.
        self._spath = builtin_str(path)
        self._path = path
        return

    needs_slash = path and path[0] != "/"
    self._spath = "/" + path if needs_slash else path
173+
174+
@property
def _base_parts(self):
    """Tuple ``(scheme, host, user, port)``: everything except the path."""
    return self.scheme, self.host, self.user, self.port
177+
178+
@property
def parts(self):
    """Base parts followed by the parts of the path, as one sequence."""
    base = self._base_parts
    return base + self._path.parts
148181

149-
if netloc is None:
150-
netloc = host
151-
if user:
152-
netloc = user + "@" + host
153-
if port:
154-
netloc += ":" + str(port)
155-
return cls("{}://{}{}".format(scheme, netloc, path))
182+
def replace(self, path=None):
    """Return a new instance with the same base parts and a different path.

    ``path`` may be a string or a ``_URLPathInfo``.  When omitted (``None``)
    the current path is reused, so ``replace()`` yields a copy.  Previously
    the default ``None`` was passed through and failed the path type
    assertion in ``fill_parts``.
    """
    if path is None:
        path = self._spath
    return self.from_parts(*self._base_parts, path=path)
156184

157185
@cached_property
def url(self):
    """URL text assembled from the scheme, the netloc and the string path."""
    pieces = (self.scheme, self.netloc, self._spath)
    return "{}://{}{}".format(*pieces)
161188

162189
def __str__(self):
    """Return the ``url`` attribute as the string form of the object."""
    text = self.url
    return text
@@ -170,107 +197,73 @@ def __eq__(self, other):
170197
other = self.__class__(other)
171198
return (
172199
self.__class__ == other.__class__
173-
and self.scheme == other.scheme
174-
and self.netloc == other.netloc
200+
and self._base_parts == other._base_parts
175201
and self._path == other._path
176202
)
177203

178204
def __hash__(self):
    """Hash over ``parts`` (the base parts plus the path parts)."""
    key = self.parts
    return hash(key)
180206

181207
def __div__(self, other):
    """Return a new instance whose path is ``other`` joined onto this path."""
    joined = posixpath.join(self._spath, other)
    return self.replace(path=joined)


# The true-division operator reuses the same implementation.
__truediv__ = __div__
190211

191-
def __getattr__(self, name):
192-
# When deepcopy is called, it creates and object without __init__,
193-
# self.parsed is not initialized and it causes infinite recursion.
194-
# More on this special casing here:
195-
# https://stackoverflow.com/a/47300262/298182
196-
if name.startswith("__"):
197-
raise AttributeError(name)
198-
return getattr(self.parsed, name)
199-
200-
@cached_property
201-
def netloc(self):
202-
p = self.parsed
203-
netloc = p.hostname
204-
if p.username:
205-
netloc = p.username + "@" + netloc
206-
if p.port and int(p.port) != self.DEFAULT_PORTS.get(p.scheme):
207-
netloc += ":" + str(p.port)
208-
return netloc
209-
210212
@property
def path(self):
    """The path component of the URL as a plain string."""
    spath = self._spath
    return spath
221215

222216
@cached_property
def _path(self):
    """The string path wrapped in a ``_URLPathInfo`` object."""
    path_info = _URLPathInfo(self._spath)
    return path_info
225219

226220
@property
def name(self):
    """The ``name`` attribute of the underlying path object."""
    path_info = self._path
    return path_info.name
229223

230-
@property
231-
def parts(self):
232-
return (self.scheme, self.netloc) + self._path.parts
224+
@cached_property
def netloc(self):
    """Network location string: ``[user@]host[:port]``.

    The port is appended only when it is truthy and differs from the
    scheme's entry in ``DEFAULT_PORTS``.
    """
    result = self.host
    if self.user:
        result = self.user + "@" + result

    port = self.port
    default_port = self.DEFAULT_PORTS.get(self.scheme)
    if port and int(port) != default_port:
        result += ":" + str(port)
    return result
233232

234233
@property
def bucket(self):
    """Alias for ``netloc``."""
    location = self.netloc
    return location
237236

238237
@property
def parent(self):
    """A new instance pointing at the parent of the current path."""
    parent_path = self._path.parent
    return self.replace(path=parent_path)
245240

246241
@property
def parents(self):
    """A ``_URLPathParents`` view over this object's ancestors."""
    view = _URLPathParents(self)
    return view
251244

252245
def relative_to(self, other):
    """Return this path relative to the path of ``other``.

    ``other`` may be a URL string, which is first converted to an instance
    of this object's class.

    Raises:
        ValueError: if ``other`` is of a different class, or if its base
            parts (scheme, host, user, port) differ from this object's.
    """
    own_class = self.__class__
    if isinstance(other, basestring):
        other = own_class(other)

    if own_class != other.__class__:
        raise ValueError(
            "'{}' has incompatible class with '{}'".format(self, other)
        )

    if self._base_parts != other._base_parts:
        raise ValueError("'{}' does not start with '{}'".format(self, other))

    return self._path.relative_to(other._path)
260255

261256
def isin(self, other):
    """Tell whether this URL lies inside ``other``.

    A string ``other`` is converted to an instance of this object's class.
    An ``other`` of a different class, or one with different base parts,
    gives ``False``; otherwise the answer comes from the path's ``isin``.
    """
    if isinstance(other, basestring):
        other = self.__class__(other)
    elif self.__class__ != other.__class__:
        return False

    if self._base_parts != other._base_parts:
        return False
    return self._path.isin(other._path)
271264

272265

273266
class CloudURLInfo(URLInfo):
    """``URLInfo`` variant whose ``path`` has no leading slashes."""

    @property
    def path(self):
        """The string path with any leading ``/`` characters stripped."""
        spath = self._spath
        return spath.lstrip("/")

dvc/remote/s3.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
import os
66
import logging
7-
import posixpath
87
from funcy import cached_property
98

109
from dvc.progress import Tqdm
@@ -273,4 +272,4 @@ def _generate_download_url(self, path_info, expires=3600):
273272

274273
def walk_files(self, path_info, max_items=None):
    """Yield one path info per name listed under ``path_info``.

    Each name produced by ``self._list_paths`` becomes the path of a new
    object created with ``path_info.replace``.
    """
    listing = self._list_paths(path_info, max_items)
    for key in listing:
        yield path_info.replace(path=key)

dvc/remote/ssh/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import itertools
55
import io
66
import os
7-
import posixpath
87
import getpass
98
import logging
109
import threading
@@ -268,7 +267,7 @@ def list_cache_paths(self):
268267
def walk_files(self, path_info):
    """Yield one path info per file found under ``path_info.path``.

    The listing is done through the connection obtained from ``self.ssh``;
    each reported name becomes the path of a new object created with
    ``path_info.replace``.
    """
    with self.ssh(path_info) as ssh:
        remote_files = ssh.walk_files(path_info.path)
        for remote_name in remote_files:
            yield path_info.replace(path=remote_name)
272271

273272
def makedirs(self, path_info):
274273
with self.ssh(path_info) as ssh:

0 commit comments

Comments
 (0)