
Use a (fast-fail) hash for checking if files are the same in resolve_url (gwastro#4769)

* Use a reduced hash of files for checking if files are the same in resolve_url_to_file

* Loop through the file in chunks to fail fast

* Use the reduced hashing in various places in the code

* CC, docstring

* Didn't need to add the arguments to these

* Missed this one

* Undo changes in code that doesn't need it

* Remove debugging print statements
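
For context, a minimal sketch of how the new fast-fail comparison can be called directly, assuming the hash_compare helper added to pycbc/workflow/configuration.py in the diff below; the file names are purely illustrative:

from pycbc.workflow.configuration import hash_compare

# Compare at most 10 chunks of 1 MB each; the loop stops at the first
# chunk whose sha1 digests differ, so mismatched files fail fast.
# The file names here are hypothetical.
same = hash_compare(
    'template_bank_a.hdf',
    'template_bank_b.hdf',
    chunk_size=int(1e6),
    max_chunks=10,
)
print('match (up to the compared chunks)' if same else 'different')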
GarethCabournDavies authored Jun 6, 2024
1 parent c06b684 commit c082c36
Showing 2 changed files with 68 additions and 6 deletions.
58 changes: 54 additions & 4 deletions pycbc/workflow/configuration.py
@@ -35,6 +35,7 @@
from shutil import which
import urllib.parse
from urllib.parse import urlparse
import hashlib

from pycbc.types.config import InterpolatingConfigParser

@@ -47,7 +48,52 @@
urllib.parse.uses_netloc.append('osdf')


def resolve_url(url, directory=None, permissions=None, copy_to_cwd=True):
def hash_compare(filename_1, filename_2, chunk_size=None, max_chunks=None):
"""
Calculate the sha1 hash of a file, or of part of a file
Parameters
----------
filename_1 : string or path
the first file to be hashed / compared
filename_2 : string or path
the second file to be hashed / compared
chunk_size : integer
This size of chunks to be read in and hashed. If not given, will read
the whole file (may be slow for large files).
max_chunks: integer
This many chunks to be compared. If all chunks so far have been the
same, then just assume its the same file. Default 10
Returns
-------
hash : string
The hexdigest() after a sha1 hash of (part of) the file
"""

if max_chunks is None and chunk_size is not None:
max_chunks = 10
elif chunk_size is None:
max_chunks = 1

with open(filename_1, 'rb') as f1:
with open(filename_2, 'rb') as f2:
for _ in range(max_chunks):
h1 = hashlib.sha1(f1.read(chunk_size)).hexdigest()
h2 = hashlib.sha1(f2.read(chunk_size)).hexdigest()
if h1 != h2:
return False
return True


def resolve_url(
url,
directory=None,
permissions=None,
copy_to_cwd=True,
hash_max_chunks=None,
hash_chunk_size=None,
):
"""Resolves a URL to a local file, and returns the path to that file.
If a URL is given, the file will be copied to the current working
@@ -78,9 +124,13 @@ def resolve_url(url, directory=None, permissions=None, copy_to_cwd=True):
elif copy_to_cwd:
if os.path.isfile(filename):
# check to see if src and dest are the same file
src_inode = os.stat(u.path)[stat.ST_INO]
dst_inode = os.stat(filename)[stat.ST_INO]
if src_inode != dst_inode:
same_file = hash_compare(
u.path,
filename,
chunk_size=hash_chunk_size,
max_chunks=hash_max_chunks
)
if not same_file:
shutil.copy(u.path, filename)
else:
shutil.copy(u.path, filename)
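
As a usage note, a hedged sketch of how the new keyword arguments flow into the copy check above; the URL is hypothetical, and the chunk values shown match the defaults that resolve_url_to_file in core.py passes down:

from pycbc.workflow.configuration import resolve_url

# When the destination already exists in the current working directory,
# resolve_url now compares source and destination with hash_compare and
# skips the copy if the compared chunks match (previously it compared
# inode numbers).
local_path = resolve_url(
    'file://localhost/data/project/template_bank.hdf',  # hypothetical URL
    copy_to_cwd=True,
    hash_chunk_size=int(1e6),
    hash_max_chunks=10,
)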
16 changes: 14 additions & 2 deletions pycbc/workflow/core.py
@@ -2080,7 +2080,12 @@ def __str__(self):
return msg


def resolve_url_to_file(curr_pfn, attrs=None):
def resolve_url_to_file(
curr_pfn,
attrs=None,
hash_max_chunks=10,
hash_chunk_size=int(1e6)
):
"""
Resolves a PFN into a workflow.File object.
@@ -2104,6 +2109,9 @@ def resolve_url_to_file(curr_pfn, attrs=None):
not important with input files. Exceptions include things like input
template banks, where ifos and valid times will be checked in the workflow
and used in the naming of child job output files.
    hash_max_chunks and hash_chunk_size control how much of each file is
    compared before the files are considered the same and the copy is
    skipped.
"""
cvmfsstr1 = 'file:///cvmfs/'
cvmfsstr2 = 'file://localhost/cvmfs/'
@@ -2122,7 +2130,11 @@
curr_file = file_input_from_config_dict[curr_lfn][1]
else:
# Use resolve_url to download file/symlink as appropriate
local_file_path = resolve_url(curr_pfn)
local_file_path = resolve_url(
curr_pfn,
hash_max_chunks=hash_max_chunks,
hash_chunk_size=hash_chunk_size,
)
# Create File object with default local path
curr_file = File.from_path(local_file_path, attrs=attrs)

