diff --git a/3rdparty/python/requirements.txt b/3rdparty/python/requirements.txt index 1b0fcbd443b..6dcd93053c7 100644 --- a/3rdparty/python/requirements.txt +++ b/3rdparty/python/requirements.txt @@ -24,5 +24,6 @@ requests[security]>=2.20.1 responses==0.10.4 setproctitle==1.1.10 setuptools==40.6.3 +typing-extensions==3.7.4 wheel==0.31.1 www-authenticate==0.9.2 diff --git a/src/python/pants/base/BUILD b/src/python/pants/base/BUILD index 65a360afa32..183293bf976 100644 --- a/src/python/pants/base/BUILD +++ b/src/python/pants/base/BUILD @@ -98,6 +98,7 @@ python_library( name = 'hash_utils', sources = ['hash_utils.py'], dependencies = [ + '3rdparty/python:typing-extensions', ':deprecated', 'src/python/pants/util:objects', ] diff --git a/src/python/pants/base/hash_utils.py b/src/python/pants/base/hash_utils.py index 2683a3e6e40..22b9c2c6617 100644 --- a/src/python/pants/base/hash_utils.py +++ b/src/python/pants/base/hash_utils.py @@ -4,10 +4,14 @@ import hashlib import json import logging +import typing from collections import OrderedDict from collections.abc import Iterable, Mapping, Set +from pathlib import Path +from typing import Any, Optional, Type, Union from twitter.common.collections import OrderedSet +from typing_extensions import Protocol from pants.util.objects import DatatypeMixin from pants.util.strutil import ensure_binary @@ -16,7 +20,17 @@ logger = logging.getLogger(__name__) -def hash_all(strs, digest=None): +class Digest(Protocol): + """A post-hoc type stub for hashlib digest objects.""" + + def update(self, data: bytes) -> None: + ... + + def hexdigest(self) -> str: + ... + + +def hash_all(strs: typing.Iterable[Union[bytes, str]], digest: Optional[Digest] = None) -> str: """Returns a hash of the concatenation of all the strings in strs. If a hashlib message digest is not supplied a new sha1 message digest is used. 
@@ -28,7 +42,7 @@ def hash_all(strs, digest=None): return digest.hexdigest() -def hash_file(path, digest=None): +def hash_file(path: Union[str, Path], digest: Optional[Digest] = None) -> str: """Hashes the contents of the file at the given path and returns the hash digest in hex form. If a hashlib message digest is not supplied a new sha1 message digest is used. @@ -42,6 +56,26 @@ def hash_file(path, digest=None): return digest.hexdigest() +def hash_dir(path: Path, *, digest: Optional[Digest] = None) -> str: + """Hashes the recursive contents under the given directory path. + + If a hashlib message digest is not supplied a new sha1 message digest is used. + """ + if not isinstance(path, Path): + raise TypeError(f'Expected path to be a pathlib.Path, given a: {type(path)}') + + if not path.is_dir(): + raise ValueError(f'Expected path to be a directory, given: {path}') + + digest = digest or hashlib.sha1() + root = path.resolve() + for pth in sorted(p for p in root.rglob('*')): + digest.update(bytes(pth.relative_to(root))) + if not pth.is_dir(): + hash_file(pth, digest=digest) + return digest.hexdigest() + + class CoercingEncoder(json.JSONEncoder): """An encoder which performs coercions in order to serialize many otherwise illegal objects. @@ -118,7 +152,9 @@ def encode(self, o): return super().encode(self.default(o)) -def json_hash(obj, digest=None, encoder=None): +def json_hash( + obj: Any, digest: Optional[Digest] = None, encoder: Optional[Type[json.JSONEncoder]] = None +) -> str: """Hashes `obj` by dumping to JSON. :param obj: An object that can be rendered to json using the given `encoder`. @@ -135,7 +171,7 @@ def json_hash(obj, digest=None, encoder=None): # TODO(#6513): something like python 3's @lru_cache decorator could be useful here! -def stable_json_sha1(obj, digest=None): +def stable_json_sha1(obj: Any, digest: Optional[Digest] = None) -> str: """Hashes `obj` stably; ie repeated calls with the same inputs will produce the same hash. 
:param obj: An object that can be rendered to json using a :class:`CoercingEncoder`. diff --git a/tests/python/pants_test/base/test_hash_utils.py b/tests/python/pants_test/base/test_hash_utils.py index db052059541..819ac9cf8df 100644 --- a/tests/python/pants_test/base/test_hash_utils.py +++ b/tests/python/pants_test/base/test_hash_utils.py @@ -7,11 +7,13 @@ import re import unittest from collections import OrderedDict +from pathlib import Path from twitter.common.collections import OrderedSet -from pants.base.hash_utils import CoercingEncoder, Sharder, hash_all, hash_file, stable_json_sha1 -from pants.util.contextutil import temporary_file +from pants.base.hash_utils import (CoercingEncoder, Sharder, hash_all, hash_dir, hash_file, + stable_json_sha1) +from pants.util.contextutil import temporary_dir, temporary_file, temporary_file_path class TestHashUtils(unittest.TestCase): @@ -31,6 +33,65 @@ def test_hash_file(self): self.assertEqual(expected_hash.hexdigest(), hash_file(fd.name, digest=hashlib.md5())) + def test_hash_dir_invalid(self): + with temporary_file_path() as path: + with self.assertRaises(TypeError): + hash_dir(path) + with self.assertRaises(ValueError): + hash_dir(Path(path)) + + def test_hash_dir(self): + with temporary_dir() as root1: + root1_path = Path(root1) + with root1_path.joinpath('a').open(mode='wb') as fd: + fd.write(b'jake jones') + with root1_path.joinpath('b').open(mode='wb') as fd: + fd.write(b'jane george') + hash1 = hash_dir(root1_path) + + with temporary_dir() as root2: + root2_path = Path(root2) + with root2_path.joinpath('a').open(mode='wb') as fd: + fd.write(b'jake jones') + with root2_path.joinpath('b').open(mode='wb') as fd: + fd.write(b'jane george') + hash2 = hash_dir(root2_path) + + self.assertNotEqual(root1_path, root2_path, + "The path of the directory being hashed should not factor into the hash.") + self.assertEqual(hash1, hash2) + + with temporary_dir() as root3: + root3_path = Path(root3) + with 
root3_path.joinpath('a1').open(mode='wb') as fd: + fd.write(b'jake jones') + with root3_path.joinpath('b').open(mode='wb') as fd: + fd.write(b'jane george') + hash3 = hash_dir(root3_path) + + self.assertNotEqual(hash1, hash3, "File names should be included in the hash.") + + with temporary_dir() as root4: + root4_path = Path(root4) + with root4_path.joinpath('a').open(mode='wb') as fd: + fd.write(b'jake jones') + with root4_path.joinpath('b').open(mode='wb') as fd: + fd.write(b'jane george') + root4_path.joinpath("c").mkdir() + hash4 = hash_dir(root4_path) + + self.assertNotEqual(hash1, hash4, "Directory names should be included in the hash.") + + with temporary_dir() as root5: + root5_path = Path(root5) + with root5_path.joinpath('a').open(mode='wb') as fd: + fd.write(b'jake jones II') + with root5_path.joinpath('b').open(mode='wb') as fd: + fd.write(b'jane george') + hash5 = hash_dir(root5_path) + + self.assertNotEqual(hash1, hash5, "File content should be included in the hash.") + def test_compute_shard(self): # Spot-check a couple of values, to make sure compute_shard doesn't do something # completely degenerate.