
Use a (fast-fail) hash for checking if files are the same in resolve_url (gwastro#4769)

* Use a reduced hash of files for checking if files are the same in resolve_url_to_file

* Loop through the file in chunks to fail fast

* Use the reduced hashing in various places in the code

* CC, docstring

* Didn't need to add the arguments to these

* Missed this one

* Undo changes in code that doesn't need it

* Remove debugging print statements
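
For context, a minimal sketch of how the new fast-fail comparison can be called directly, assuming the hash_compare helper added to pycbc/workflow/configuration.py in the diff below; the file names are purely illustrative:

from pycbc.workflow.configuration import hash_compare

# Compare at most 10 chunks of 1 MB each; the loop stops at the first
# chunk whose sha1 digests differ, so mismatched files fail fast.
# The file names here are hypothetical.
same = hash_compare(
    'template_bank_a.hdf',
    'template_bank_b.hdf',
    chunk_size=int(1e6),
    max_chunks=10,
)
print('match (up to the compared chunks)' if same else 'different')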
GarethCabournDavies authored Jun 6, 2024
1 parent c06b684 commit c082c36
Showing 2 changed files with 68 additions and 6 deletions.
58 changes: 54 additions & 4 deletions pycbc/workflow/configuration.py
@@ -35,6 +35,7 @@
from shutil import which
import urllib.parse
from urllib.parse import urlparse
import hashlib

from pycbc.types.config import InterpolatingConfigParser

@@ -47,7 +48,52 @@
urllib.parse.uses_netloc.append('osdf')


def resolve_url(url, directory=None, permissions=None, copy_to_cwd=True):
def hash_compare(filename_1, filename_2, chunk_size=None, max_chunks=None):
"""
Calculate the sha1 hash of a file, or of part of a file
Parameters
----------
filename_1 : string or path
the first file to be hashed / compared
filename_2 : string or path
the second file to be hashed / compared
chunk_size : integer
This size of chunks to be read in and hashed. If not given, will read
the whole file (may be slow for large files).
max_chunks: integer
This many chunks to be compared. If all chunks so far have been the
same, then just assume its the same file. Default 10
Returns
-------
hash : string
The hexdigest() after a sha1 hash of (part of) the file
"""

if max_chunks is None and chunk_size is not None:
max_chunks = 10
elif chunk_size is None:
max_chunks = 1

with open(filename_1, 'rb') as f1:
with open(filename_2, 'rb') as f2:
for _ in range(max_chunks):
h1 = hashlib.sha1(f1.read(chunk_size)).hexdigest()
h2 = hashlib.sha1(f2.read(chunk_size)).hexdigest()
if h1 != h2:
return False
return True


def resolve_url(
url,
directory=None,
permissions=None,
copy_to_cwd=True,
hash_max_chunks=None,
hash_chunk_size=None,
):
"""Resolves a URL to a local file, and returns the path to that file.
If a URL is given, the file will be copied to the current working
@@ -78,9 +124,13 @@ def resolve_url(url, directory=None, permissions=None, copy_to_cwd=True):
elif copy_to_cwd:
if os.path.isfile(filename):
# check to see if src and dest are the same file
src_inode = os.stat(u.path)[stat.ST_INO]
dst_inode = os.stat(filename)[stat.ST_INO]
if src_inode != dst_inode:
same_file = hash_compare(
u.path,
filename,
chunk_size=hash_chunk_size,
max_chunks=hash_max_chunks
)
if not same_file:
shutil.copy(u.path, filename)
else:
shutil.copy(u.path, filename)
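
As a usage note, a hedged sketch of how the new keyword arguments flow into the copy check above; the URL is hypothetical, and the chunk values shown match the defaults that resolve_url_to_file in core.py passes down:

from pycbc.workflow.configuration import resolve_url

# When the destination already exists in the current working directory,
# resolve_url now compares source and destination with hash_compare and
# skips the copy if the compared chunks match (previously it compared
# inode numbers).
local_path = resolve_url(
    'file://localhost/data/project/template_bank.hdf',  # hypothetical URL
    copy_to_cwd=True,
    hash_chunk_size=int(1e6),
    hash_max_chunks=10,
)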
16 changes: 14 additions & 2 deletions pycbc/workflow/core.py
@@ -2080,7 +2080,12 @@ def __str__(self):
return msg


def resolve_url_to_file(curr_pfn, attrs=None):
def resolve_url_to_file(
curr_pfn,
attrs=None,
hash_max_chunks=10,
hash_chunk_size=int(1e6)
):
"""
Resolves a PFN into a workflow.File object.
@@ -2104,6 +2109,9 @@ def resolve_url_to_file(curr_pfn, attrs=None):
not important with input files. Exceptions include things like input
template banks, where ifos and valid times will be checked in the workflow
and used in the naming of child job output files.
    hash_max_chunks and hash_chunk_size control how much of each file is
    compared before the files are considered the same and the copy is
    skipped.
"""
cvmfsstr1 = 'file:///cvmfs/'
cvmfsstr2 = 'file://localhost/cvmfs/'
@@ -2122,7 +2130,11 @@
curr_file = file_input_from_config_dict[curr_lfn][1]
else:
# Use resolve_url to download file/symlink as appropriate
local_file_path = resolve_url(curr_pfn)
local_file_path = resolve_url(
curr_pfn,
hash_max_chunks=hash_max_chunks,
hash_chunk_size=hash_chunk_size,
)
# Create File object with default local path
curr_file = File.from_path(local_file_path, attrs=attrs)

