Merge pull request #1306 from kremnik/recognition
Update internal db storage
serengil authored Aug 15, 2024
2 parents ff03dc6 + fc5c4b9 commit f3d1809
Showing 2 changed files with 59 additions and 11 deletions.
20 changes: 9 additions & 11 deletions deepface/modules/recognition.py
@@ -1,7 +1,7 @@
 # built-in dependencies
 import os
 import pickle
-from typing import List, Union, Optional, Dict, Any
+from typing import List, Union, Optional, Dict, Any, Set
 import time
 
 # 3rd party dependencies
@@ -141,7 +141,7 @@ def find(

     # check each item of representations list has required keys
     for i, current_representation in enumerate(representations):
-        missing_keys = list(set(df_cols) - set(current_representation.keys()))
+        missing_keys = set(df_cols) - set(current_representation.keys())
         if len(missing_keys) > 0:
             raise ValueError(
                 f"{i}-th item does not have some required keys - {missing_keys}."
@@ -160,9 +160,7 @@ def find(
raise ValueError(f"Nothing is found in {datastore_path}")

must_save_pickle = False
new_images = []
old_images = []
replaced_images = []
new_images, old_images, replaced_images = set(), set(), set()

if not refresh_database:
logger.info(
@@ -172,8 +170,8 @@

     # Enforce data consistency amongst on disk images and pickle file
     if refresh_database:
-        new_images = list(set(storage_images) - set(pickled_images)) # images added to storage
-        old_images = list(set(pickled_images) - set(storage_images)) # images removed from storage
+        new_images = set(storage_images) - set(pickled_images) # images added to storage
+        old_images = set(pickled_images) - set(storage_images) # images removed from storage
 
         # detect replaced images
         for current_representation in representations:
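For reference, the two set differences above classify the disk/pickle mismatch directly; a small self-contained sketch with made-up file names:

# Hypothetical state: what the pickle remembers vs. what is on disk now.
pickled_images = {"db/img1.jpg", "db/img2.jpg", "db/img3.jpg"}
storage_images = {"db/img2.jpg", "db/img3.jpg", "db/img4.jpg"}

new_images = storage_images - pickled_images  # {"db/img4.jpg"}: added to storage
old_images = pickled_images - storage_images  # {"db/img1.jpg"}: removed from storage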
@@ -184,7 +182,7 @@ def find(
             beta_hash = image_utils.find_image_hash(identity)
             if alpha_hash != beta_hash:
                 logger.debug(f"Even though {identity} represented before, it's replaced later.")
-                replaced_images.append(identity)
+                replaced_images.add(identity)
 
     if not silent and (len(new_images) > 0 or len(old_images) > 0 or len(replaced_images) > 0):
         logger.info(
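The replacement check compares a hash stored at indexing time against a freshly computed one; a self-contained sketch of the same idea, using a plain sha256 file digest as a stand-in for deepface's image_utils.find_image_hash:

import hashlib

def file_hash(path: str) -> str:
    # Hash the raw bytes: replacing the file in place changes the digest.
    with open(path, "rb") as f:
        return hashlib.sha256(f.read()).hexdigest()

# alpha_hash was stored when the image was first represented; beta_hash is
# recomputed now. A mismatch means the file was swapped in place, so the
# identity lands in replaced_images and is dropped and re-added below.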
@@ -194,8 +192,8 @@
         )
 
     # append replaced images into both old and new images. these will be dropped and re-added.
-    new_images = new_images + replaced_images
-    old_images = old_images + replaced_images
+    new_images.update(replaced_images)
+    old_images.update(replaced_images)
 
     # remove old images first
     if len(old_images) > 0:
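Besides matching the new set types, update() deduplicates for free, which list concatenation did not; a quick illustration with dummy paths:

new_images = {"db/a.jpg"}
replaced_images = {"db/a.jpg", "db/b.jpg"}

new_images.update(replaced_images)
print(new_images)  # {'db/a.jpg', 'db/b.jpg'}: a.jpg appears only once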
@@ -316,7 +314,7 @@ def find(


 def __find_bulk_embeddings(
-    employees: List[str],
+    employees: Set[str],
     model_name: str = "VGG-Face",
     detector_backend: str = "opencv",
     enforce_detection: bool = True,
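The annotation change is safe because the helper only iterates over employees; a minimal sketch under that assumption (the function below is illustrative, not deepface's implementation):

from typing import Set

def process_bulk(employees: Set[str]) -> None:
    # A set iterates just like a list; only ordering guarantees are lost,
    # and this loop does not rely on order.
    for img_path in employees:
        print(f"finding embedding for {img_path}")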
50 changes: 50 additions & 0 deletions tests/test_find.py
@@ -101,3 +101,53 @@ def test_filetype_for_find_bulk_embeddings():

     # img47 is webp even though its extension is jpg
     assert "dataset/img47.jpg" not in imgs
+
+
+def test_find_without_refresh_database():
+    import shutil, hashlib
+
+    img_path = os.path.join("dataset", "img1.jpg")
+
+    # 1. Calculate the hash of the .pkl file;
+    # 2. Move a random image to a temporarily created directory;
+    # 3. As a result, the .pkl file and the files on disk will differ;
+    # 4. With refresh_database=False, the .pkl file should not be updated;
+    #    recalculate the hash and compare it with the hash from step 1;
+    # 5. After a successful check, move the image back to its original location.
+
+    pkl_path = "dataset/ds_model_vggface_detector_opencv_aligned_normalization_base_expand_0.pkl"
+    with open(pkl_path, "rb") as f:
+        hash_before = hashlib.sha256(f.read())
+
+    image_name = "img28.jpg"
+    tmp_dir = "dataset/temp_image"
+    os.mkdir(tmp_dir)
+    shutil.move(os.path.join("dataset", image_name), os.path.join(tmp_dir, image_name))
+
+    dfs = DeepFace.find(img_path=img_path, db_path="dataset", silent=True, refresh_database=False)
+
+    with open(pkl_path, "rb") as f:
+        hash_after = hashlib.sha256(f.read())
+
+    shutil.move(os.path.join(tmp_dir, image_name), os.path.join("dataset", image_name))
+    os.rmdir(tmp_dir)
+
+    assert hash_before.hexdigest() == hash_after.hexdigest()
+
+    logger.info("✅ .pkl hashes before and after the recognition process are the same")
+
+    assert len(dfs) > 0
+    for df in dfs:
+        assert isinstance(df, pd.DataFrame)
+
+        # one is img1.jpg itself
+        identity_df = df[df["identity"] == img_path]
+        assert identity_df.shape[0] > 0
+
+        # validate reproducibility
+        assert identity_df["distance"].values[0] < threshold
+
+        df = df[df["identity"] != img_path]
+        logger.debug(df.head())
+        assert df.shape[0] > 0
+    logger.info("✅ test find without refresh database done")