Update internal db storage #1306

Merged: 2 commits, Aug 15, 2024
Changes from all commits
20 changes: 9 additions & 11 deletions deepface/modules/recognition.py
@@ -1,7 +1,7 @@
# built-in dependencies
import os
import pickle
from typing import List, Union, Optional, Dict, Any
from typing import List, Union, Optional, Dict, Any, Set
import time

# 3rd party dependencies
@@ -141,7 +141,7 @@ def find(

# check each item of representations list has required keys
for i, current_representation in enumerate(representations):
missing_keys = list(set(df_cols) - set(current_representation.keys()))
missing_keys = set(df_cols) - set(current_representation.keys())
if len(missing_keys) > 0:
raise ValueError(
f"{i}-th item does not have some required keys - {missing_keys}."
@@ -160,9 +160,7 @@ def find(
raise ValueError(f"Nothing is found in {datastore_path}")

must_save_pickle = False
new_images = []
old_images = []
replaced_images = []
new_images, old_images, replaced_images = set(), set(), set()

if not refresh_database:
logger.info(
@@ -172,8 +170,8 @@ def find(

# Enforce data consistency amongst on disk images and pickle file
if refresh_database:
new_images = list(set(storage_images) - set(pickled_images)) # images added to storage
old_images = list(set(pickled_images) - set(storage_images)) # images removed from storage
new_images = set(storage_images) - set(pickled_images) # images added to storage
old_images = set(pickled_images) - set(storage_images) # images removed from storage

# detect replaced images
for current_representation in representations:
@@ -184,7 +182,7 @@ def find(
beta_hash = image_utils.find_image_hash(identity)
if alpha_hash != beta_hash:
logger.debug(f"Even though {identity} represented before, it's replaced later.")
replaced_images.append(identity)
replaced_images.add(identity)

if not silent and (len(new_images) > 0 or len(old_images) > 0 or len(replaced_images) > 0):
logger.info(
Expand All @@ -194,8 +192,8 @@ def find(
)

# append replaced images into both old and new images. these will be dropped and re-added.
new_images = new_images + replaced_images
old_images = old_images + replaced_images
new_images.update(replaced_images)
@serengil (Owner) commented on Aug 14, 2024:


Suppose that refresh_database = False: new_images and old_images are no longer initialized in lines 163 and 164, so lines 195 and 196 will throw an exception in that case.

IMO, initializing new_images and old_images as empty sets in lines 163 and 164 will solve the problem.

Additionally, we should have a unit test for refresh_database = False.
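A minimal, self-contained sketch of the fix described above, matching the unconditional initialization that appears in the diff: the sets are created before the refresh branch, so the later update() calls are safe even when refresh_database is False. The helper name and signature below are illustrative only, not the actual recognition.find code:

from typing import List, Set, Tuple

def diff_image_sets(
    storage_images: List[str],
    pickled_images: List[str],
    replaced_images: Set[str],
    refresh_database: bool = True,
) -> Tuple[Set[str], Set[str]]:
    # Initialize unconditionally so the .update() calls below cannot raise
    # NameError when refresh_database is False.
    new_images: Set[str] = set()
    old_images: Set[str] = set()

    if refresh_database:
        new_images = set(storage_images) - set(pickled_images)  # images added to storage
        old_images = set(pickled_images) - set(storage_images)  # images removed from storage

    # Replaced images are dropped and re-added, so they belong in both sets.
    new_images.update(replaced_images)
    old_images.update(replaced_images)
    return new_images, old_images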

old_images.update(replaced_images)

# remove old images first
if len(old_images) > 0:
@@ -316,7 +314,7 @@ def find(


def __find_bulk_embeddings(
employees: List[str],
employees: Set[str],
model_name: str = "VGG-Face",
detector_backend: str = "opencv",
enforce_detection: bool = True,
50 changes: 50 additions & 0 deletions tests/test_find.py
@@ -101,3 +101,53 @@ def test_filetype_for_find_bulk_embeddings():

# img47 is webp even though its extension is jpg
assert "dataset/img47.jpg" not in imgs


def test_find_without_refresh_database():
    import shutil, hashlib

    img_path = os.path.join("dataset", "img1.jpg")

    # 1. Calculate the hash of the .pkl file;
    # 2. Move a random image to a temporarily created directory;
    # 3. As a result, there will be a difference between the .pkl file and the files on disk;
    # 4. If refresh_database=False, the .pkl file should not be updated.
    #    Recalculate the hash and compare it with the hash from step 1;
    # 5. After a successful check, move the image back to its original location;

    pkl_path = "dataset/ds_model_vggface_detector_opencv_aligned_normalization_base_expand_0.pkl"
    with open(pkl_path, "rb") as f:
        hash_before = hashlib.sha256(f.read())

    image_name = "img28.jpg"
    tmp_dir = "dataset/temp_image"
    os.mkdir(tmp_dir)
    shutil.move(os.path.join("dataset", image_name), os.path.join(tmp_dir, image_name))

    dfs = DeepFace.find(img_path=img_path, db_path="dataset", silent=True, refresh_database=False)

    with open(pkl_path, "rb") as f:
        hash_after = hashlib.sha256(f.read())

    shutil.move(os.path.join(tmp_dir, image_name), os.path.join("dataset", image_name))
    os.rmdir(tmp_dir)

    assert hash_before.hexdigest() == hash_after.hexdigest()

    logger.info("✅ .pkl hashes before and after the recognition process are the same")

    assert len(dfs) > 0
    for df in dfs:
        assert isinstance(df, pd.DataFrame)

        # one is img1.jpg itself
        identity_df = df[df["identity"] == img_path]
        assert identity_df.shape[0] > 0

        # validate reproducibility
        assert identity_df["distance"].values[0] < threshold

        df = df[df["identity"] != img_path]
        logger.debug(df.head())
        assert df.shape[0] > 0
    logger.info("✅ test find without refresh database done")
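
For context, a brief usage sketch of the flag the new test exercises. Only the arguments already shown in the test above are assumed; paths are illustrative:

from deepface import DeepFace

# Reuse the existing representations pickle as-is: with refresh_database=False,
# added/removed/replaced images on disk are not synced and the .pkl file is left untouched.
dfs = DeepFace.find(
    img_path="dataset/img1.jpg",
    db_path="dataset",
    refresh_database=False,
    silent=True,
)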