Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-78079: Fix UNC device path root normalization in pathlib #102003

Merged
merged 10 commits into from
Apr 14, 2023
11 changes: 8 additions & 3 deletions Lib/pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,9 +322,14 @@ def _parse_path(cls, path):
if altsep:
path = path.replace(altsep, sep)
drv, root, rel = cls._flavour.splitroot(path)
if drv.startswith(sep):
# pathlib assumes that UNC paths always have a root.
root = sep
if not root and drv.startswith(sep) and not drv.endswith(sep):
drv_parts = drv.split(sep)
if len(drv_parts) == 4 and drv_parts[2] not in '?.':
# e.g. //server/share
root = sep
elif len(drv_parts) == 6:
# e.g. //?/unc/server/share
root = sep
parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.']
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoa, we sys.intern everything? That seems like a painful memory leak. Would we be better off with a class level lru_cache'd function1?

Don't we already know that rel will be str here? If it's bytes, str(x) is the wrong conversion anyway. Maybe the intern function should be cls._flavour.str_and_intern?

Footnotes

  1. @lru_cache(max_size=REASONABLE_VALUE) def intern(x): return x

Copy link
Contributor Author

@barneygale barneygale Apr 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The interning of the path parts is longstanding pathlib behaviour. Honestly I don't know enough about the benefits/drawbacks of interning to say whether it's reasonable.

Don't we already know that rel will be str here?

We know that it's an instance of str (and not bytes), but we don't know whether it's a true str object (required by sys.intern()).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoa, we sys.intern everything? That seems like a painful memory leak.

Is it really all that bad? sys.intern() calls PyUnicode_InternInPlace(), not PyUnicode_InternImmortal(). An interned string gets deallocated based on its reference count, like any other object. For example:

>>> s = sys.intern('spam and eggs')
>>> t = sys.intern('spam and eggs')
>>> s is t
True
>>> id(s)
139835455199152
>>> del s, t
>>> s = sys.intern('spam and eggs')
>>> id(s)
139835455198912

return drv, root, parsed

Expand Down
8 changes: 8 additions & 0 deletions Lib/test/test_ntpath.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ def test_splitroot(self):

# gh-81790: support device namespace, including UNC drives.
tester('ntpath.splitroot("//?/c:")', ("//?/c:", "", ""))
tester('ntpath.splitroot("//./c:")', ("//./c:", "", ""))
tester('ntpath.splitroot("//?/c:/")', ("//?/c:", "/", ""))
tester('ntpath.splitroot("//?/c:/dir")', ("//?/c:", "/", "dir"))
tester('ntpath.splitroot("//?/UNC")', ("//?/UNC", "", ""))
Expand All @@ -179,8 +180,12 @@ def test_splitroot(self):
tester('ntpath.splitroot("//?/VOLUME{00000000-0000-0000-0000-000000000000}/spam")',
('//?/VOLUME{00000000-0000-0000-0000-000000000000}', '/', 'spam'))
tester('ntpath.splitroot("//?/BootPartition/")', ("//?/BootPartition", "/", ""))
tester('ntpath.splitroot("//./BootPartition/")', ("//./BootPartition", "/", ""))
tester('ntpath.splitroot("//./PhysicalDrive0")', ("//./PhysicalDrive0", "", ""))
tester('ntpath.splitroot("//./nul")', ("//./nul", "", ""))

tester('ntpath.splitroot("\\\\?\\c:")', ("\\\\?\\c:", "", ""))
tester('ntpath.splitroot("\\\\.\\c:")', ("\\\\.\\c:", "", ""))
tester('ntpath.splitroot("\\\\?\\c:\\")', ("\\\\?\\c:", "\\", ""))
tester('ntpath.splitroot("\\\\?\\c:\\dir")', ("\\\\?\\c:", "\\", "dir"))
tester('ntpath.splitroot("\\\\?\\UNC")', ("\\\\?\\UNC", "", ""))
Expand All @@ -193,6 +198,9 @@ def test_splitroot(self):
tester('ntpath.splitroot("\\\\?\\VOLUME{00000000-0000-0000-0000-000000000000}\\spam")',
('\\\\?\\VOLUME{00000000-0000-0000-0000-000000000000}', '\\', 'spam'))
tester('ntpath.splitroot("\\\\?\\BootPartition\\")', ("\\\\?\\BootPartition", "\\", ""))
tester('ntpath.splitroot("\\\\.\\BootPartition\\")', ("\\\\.\\BootPartition", "\\", ""))
tester('ntpath.splitroot("\\\\.\\PhysicalDrive0")', ("\\\\.\\PhysicalDrive0", "", ""))
tester('ntpath.splitroot("\\\\.\\nul")', ("\\\\.\\nul", "", ""))

# gh-96290: support partial/invalid UNC drives
tester('ntpath.splitroot("//")', ("//", "", "")) # empty server & missing share
Expand Down
24 changes: 24 additions & 0 deletions Lib/test/test_pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -810,6 +810,9 @@ def test_drive_root_parts(self):
check(('c:/a',), 'c:', '\\', ('c:\\', 'a'))
check(('/a',), '', '\\', ('\\', 'a'))
# UNC paths.
check(('//',), '\\\\', '', ('\\\\',))
check(('//a',), '\\\\a', '', ('\\\\a',))
check(('//a/',), '\\\\a\\', '', ('\\\\a\\',))
check(('//a/b',), '\\\\a\\b', '\\', ('\\\\a\\b\\',))
check(('//a/b/',), '\\\\a\\b', '\\', ('\\\\a\\b\\',))
check(('//a/b/c',), '\\\\a\\b', '\\', ('\\\\a\\b\\', 'c'))
Expand All @@ -823,12 +826,26 @@ def test_drive_root_parts(self):
# UNC paths.
check(('a', '//b/c//', 'd'), '\\\\b\\c', '\\', ('\\\\b\\c\\', 'd'))
# Extended paths.
check(('//./c:',), '\\\\.\\c:', '', ('\\\\.\\c:',))
check(('//?/c:/',), '\\\\?\\c:', '\\', ('\\\\?\\c:\\',))
check(('//?/c:/a',), '\\\\?\\c:', '\\', ('\\\\?\\c:\\', 'a'))
check(('//?/c:/a', '/b'), '\\\\?\\c:', '\\', ('\\\\?\\c:\\', 'b'))
# Extended UNC paths (format is "\\?\UNC\server\share").
check(('//?',), '\\\\?', '', ('\\\\?',))
check(('//?/',), '\\\\?\\', '', ('\\\\?\\',))
check(('//?/UNC',), '\\\\?\\UNC', '', ('\\\\?\\UNC',))
check(('//?/UNC/',), '\\\\?\\UNC\\', '', ('\\\\?\\UNC\\',))
check(('//?/UNC/b',), '\\\\?\\UNC\\b', '', ('\\\\?\\UNC\\b',))
check(('//?/UNC/b/',), '\\\\?\\UNC\\b\\', '', ('\\\\?\\UNC\\b\\',))
check(('//?/UNC/b/c',), '\\\\?\\UNC\\b\\c', '\\', ('\\\\?\\UNC\\b\\c\\',))
check(('//?/UNC/b/c/',), '\\\\?\\UNC\\b\\c', '\\', ('\\\\?\\UNC\\b\\c\\',))
check(('//?/UNC/b/c/d',), '\\\\?\\UNC\\b\\c', '\\', ('\\\\?\\UNC\\b\\c\\', 'd'))
# UNC device paths
check(('//./BootPartition/',), '\\\\.\\BootPartition', '\\', ('\\\\.\\BootPartition\\',))
check(('//?/BootPartition/',), '\\\\?\\BootPartition', '\\', ('\\\\?\\BootPartition\\',))
check(('//./PhysicalDrive0',), '\\\\.\\PhysicalDrive0', '', ('\\\\.\\PhysicalDrive0',))
check(('//?/Volume{}/',), '\\\\?\\Volume{}', '\\', ('\\\\?\\Volume{}\\',))
check(('//./nul',), '\\\\.\\nul', '', ('\\\\.\\nul',))
# Second part has a root but not drive.
check(('a', '/b', 'c'), '', '\\', ('\\', 'b', 'c'))
check(('Z:/a', '/b', 'c'), 'Z:', '\\', ('Z:\\', 'b', 'c'))
Expand Down Expand Up @@ -1371,6 +1388,13 @@ def test_join(self):
self.assertEqual(pp, P('C:/a/b/dd:s'))
pp = p.joinpath(P('E:d:s'))
self.assertEqual(pp, P('E:d:s'))
# Joining onto a UNC path with no root
pp = P('//').joinpath('server')
self.assertEqual(pp, P('//server'))
pp = P('//server').joinpath('share')
self.assertEqual(pp, P('//server/share'))
pp = P('//./BootPartition').joinpath('Windows')
self.assertEqual(pp, P('//./BootPartition/Windows'))

def test_div(self):
# Basically the same as joinpath().
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fix incorrect normalization of UNC device path roots, and partial UNC share
path roots, in :class:`pathlib.PurePath`. Pathlib no longer appends a
trailing slash to such paths.