Introduce hash_utils.hash_dir.
Implement a simple, stable hash of the recursive contents of a directory
for use in v1 Tasks. In v2 we have the `fs/store` crate and intrinsics
that expose its types to @rules; this tides v1 Tasks over for cases
where we only need a hash and not a Snapshot we'll never materialize
(via `self.context._scheduler.<adhoc sync apis>`). Performance is
~identical to the `self.context._scheduler.capture_snapshots` API (a
few percent faster on average).

This change supports #8263.
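
For illustration, a minimal sketch of the intended v1 Task call site (the tool directory and fingerprint wiring are hypothetical; only `hash_dir` comes from this change):

from pathlib import Path

from pants.base.hash_utils import hash_dir

# Hypothetical v1 Task fragment: fingerprint a tool's on-disk install
# directory directly, rather than capturing a Snapshot via
# self.context._scheduler that we would never materialize.
tool_root = Path('.tool/install')  # assumed location, for illustration only
fingerprint = hash_dir(tool_root)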
jsirois committed Sep 24, 2019
1 parent 33c24aa commit 6069e15
Showing 4 changed files with 105 additions and 6 deletions.
1 change: 1 addition & 0 deletions 3rdparty/python/requirements.txt
@@ -24,5 +24,6 @@ requests[security]>=2.20.1
responses==0.10.4
setproctitle==1.1.10
setuptools==40.6.3
typing-extensions==3.7.4
wheel==0.31.1
www-authenticate==0.9.2
1 change: 1 addition & 0 deletions src/python/pants/base/BUILD
@@ -98,6 +98,7 @@ python_library(
name = 'hash_utils',
sources = ['hash_utils.py'],
dependencies = [
'3rdparty/python:typing-extensions',
':deprecated',
'src/python/pants/util:objects',
]
44 changes: 40 additions & 4 deletions src/python/pants/base/hash_utils.py
@@ -4,10 +4,14 @@
import hashlib
import json
import logging
import typing
from collections import OrderedDict
from collections.abc import Iterable, Mapping, Set
from pathlib import Path
from typing import Any, Optional, Type, Union

from twitter.common.collections import OrderedSet
from typing_extensions import Protocol

from pants.util.objects import DatatypeMixin
from pants.util.strutil import ensure_binary
@@ -16,7 +20,17 @@
logger = logging.getLogger(__name__)


def hash_all(strs, digest=None):
class Digest(Protocol):
"""A post-hoc type stub for hashlib digest objects."""

def update(self, data: bytes) -> None:
...

def hexdigest(self) -> str:
...


def hash_all(strs: typing.Iterable[Union[bytes, str]], digest: Optional[Digest] = None) -> str:
"""Returns a hash of the concatenation of all the strings in strs.
If a hashlib message digest is not supplied a new sha1 message digest is used.
@@ -28,7 +42,7 @@ def hash_all(strs, digest=None):
return digest.hexdigest()
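
Since `Digest` is a structural Protocol, any object exposing `update(bytes)` and `hexdigest()` type-checks against it; in particular, plain hashlib objects need no registration. A quick sketch:

import hashlib

from pants.base.hash_utils import hash_all

# hashlib digests satisfy the Digest protocol structurally.
assert hash_all(['a', b'b'], digest=hashlib.sha256()) == hashlib.sha256(b'ab').hexdigest()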


def hash_file(path, digest=None):
def hash_file(path: Union[str, Path], digest: Optional[Digest] = None) -> str:
"""Hashes the contents of the file at the given path and returns the hash digest in hex form.
If a hashlib message digest is not supplied a new sha1 message digest is used.
@@ -42,6 +56,26 @@ def hash_file(path, digest=None):
return digest.hexdigest()


def hash_dir(path: Path, *, digest: Optional[Digest] = None) -> str:
"""Hashes the recursive contents under the given directory path.
If a hashlib message digest is not supplied a new sha1 message digest is used.
"""
if not isinstance(path, Path):
raise TypeError(f'Expected path to be a pathlib.Path, given a: {type(path)}')

if not path.is_dir():
raise ValueError(f'Expected path to be a directory, given: {path}')

digest = digest or hashlib.sha1()
root = path.resolve()
# Walk entries in sorted order so the hash is stable, mixing in root-relative
# paths so the tree's absolute location on disk does not affect the result.
for pth in sorted(root.rglob('*')):
digest.update(bytes(pth.relative_to(root)))
if not pth.is_dir():
hash_file(pth, digest=digest)
return digest.hexdigest()
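
Because only root-relative paths feed the digest, two trees with identical layout and contents hash identically regardless of where they live on disk; a quick sketch:

import shutil
import tempfile
from pathlib import Path

from pants.base.hash_utils import hash_dir

src = Path(tempfile.mkdtemp())
(src / 'a').write_bytes(b'jake jones')
copy = Path(tempfile.mkdtemp(), 'copy')
shutil.copytree(src, copy)
assert hash_dir(src) == hash_dir(copy)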


class CoercingEncoder(json.JSONEncoder):
"""An encoder which performs coercions in order to serialize many otherwise illegal objects.
@@ -118,7 +152,9 @@ def encode(self, o):
return super().encode(self.default(o))


def json_hash(obj, digest=None, encoder=None):
def json_hash(
obj: Any, digest: Optional[Digest] = None, encoder: Optional[Type[json.JSONEncoder]] = None
) -> str:
"""Hashes `obj` by dumping to JSON.
:param obj: An object that can be rendered to json using the given `encoder`.
@@ -135,7 +171,7 @@ def json_hash(obj, digest=None, encoder=None):


# TODO(#6513): something like python 3's @lru_cache decorator could be useful here!
def stable_json_sha1(obj, digest=None):
def stable_json_sha1(obj: Any, digest: Optional[Digest] = None) -> str:
"""Hashes `obj` stably; ie repeated calls with the same inputs will produce the same hash.
:param obj: An object that can be rendered to json using a :class:`CoercingEncoder`.
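For comparison, `stable_json_sha1` plays the analogous role for in-memory values rather than directory trees; a minimal sketch of the stability property:

from pants.base.hash_utils import stable_json_sha1

# Repeated calls over equal inputs yield equal digests, so the result is
# safe to use as a cache key across runs.
assert stable_json_sha1({'a': 1, 'b': [2, 3]}) == stable_json_sha1({'a': 1, 'b': [2, 3]})
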
65 changes: 63 additions & 2 deletions tests/python/pants_test/base/test_hash_utils.py
@@ -7,11 +7,13 @@
import re
import unittest
from collections import OrderedDict
from pathlib import Path

from twitter.common.collections import OrderedSet

from pants.base.hash_utils import CoercingEncoder, Sharder, hash_all, hash_file, stable_json_sha1
from pants.util.contextutil import temporary_file
from pants.base.hash_utils import (CoercingEncoder, Sharder, hash_all, hash_dir, hash_file,
stable_json_sha1)
from pants.util.contextutil import temporary_dir, temporary_file, temporary_file_path


class TestHashUtils(unittest.TestCase):
@@ -31,6 +33,65 @@ def test_hash_file(self):

self.assertEqual(expected_hash.hexdigest(), hash_file(fd.name, digest=hashlib.md5()))

def test_hash_dir_invalid(self):
with temporary_file_path() as path:
with self.assertRaises(TypeError):
hash_dir(path)
with self.assertRaises(ValueError):
hash_dir(Path(path))

def test_hash_dir(self):
with temporary_dir() as root1:
root1_path = Path(root1)
with root1_path.joinpath('a').open(mode='wb') as fd:
fd.write(b'jake jones')
with root1_path.joinpath('b').open(mode='wb') as fd:
fd.write(b'jane george')
hash1 = hash_dir(root1_path)

with temporary_dir() as root2:
root2_path = Path(root2)
with root2_path.joinpath('a').open(mode='wb') as fd:
fd.write(b'jake jones')
with root2_path.joinpath('b').open(mode='wb') as fd:
fd.write(b'jane george')
hash2 = hash_dir(root2_path)

self.assertNotEqual(root1_path, root2_path,
"The path of the directory being hashed should not factor into the hash.")
self.assertEqual(hash1, hash2)

with temporary_dir() as root3:
root3_path = Path(root3)
with root3_path.joinpath('a1').open(mode='wb') as fd:
fd.write(b'jake jones')
with root3_path.joinpath('b').open(mode='wb') as fd:
fd.write(b'jane george')
hash3 = hash_dir(root3_path)

self.assertNotEqual(hash1, hash3, "File names should be included in the hash.")

with temporary_dir() as root4:
root4_path = Path(root4)
with root4_path.joinpath('a').open(mode='wb') as fd:
fd.write(b'jake jones')
with root4_path.joinpath('b').open(mode='wb') as fd:
fd.write(b'jane george')
root4_path.joinpath("c").mkdir()
hash4 = hash_dir(root4_path)

self.assertNotEqual(hash1, hash4, "Directory names should be included in the hash.")

with temporary_dir() as root5:
root5_path = Path(root5)
with root5_path.joinpath('a').open(mode='wb') as fd:
fd.write(b'jake jones II')
with root5_path.joinpath('b').open(mode='wb') as fd:
fd.write(b'jane george')
hash5 = hash_dir(root5_path)

self.assertNotEqual(hash1, hash5, "File content should be included in the hash.")

def test_compute_shard(self):
# Spot-check a couple of values, to make sure compute_shard doesn't do something
# completely degenerate.
