Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add-image-hash-to-detect-rename-and-optimize-reindexing #143

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,5 @@ requirements.txt

benchmarks/datasets/
benchmarks/datasets

*.png
44 changes: 37 additions & 7 deletions rclip/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,22 @@ class NewImage(ImageOmittable):
modified_at: float
size: int
vector: bytes
hash: str


class Image(NewImage):
id: int


class DB:
VERSION = 2
VERSION = 3

def __init__(self, filename: Union[str, pathlib.Path]):
self._con = sqlite3.connect(filename)
self._con.row_factory = sqlite3.Row
self.ensure_tables()
self.ensure_version()

self.ensure_tables()

def close(self):
self._con.commit()
self._con.close()
Expand All @@ -40,12 +41,19 @@ def ensure_tables(self):
filepath TEXT NOT NULL UNIQUE,
modified_at DATETIME NOT NULL,
size INTEGER NOT NULL,
vector BLOB NOT NULL
vector BLOB NOT NULL,
hash TEXT,
indexing BOOLEAN
)
''')
# Query for images
self._con.execute('CREATE UNIQUE INDEX IF NOT EXISTS existing_images ON images(filepath) WHERE deleted IS NULL')
self._con.execute('CREATE TABLE IF NOT EXISTS db_version (version INTEGER)')
# Check if 'hash' column exists before creating the index
cursor = self._con.execute("PRAGMA table_info(images)")
columns = [column[1] for column in cursor.fetchall()]
if 'hash' in columns:
self._con.execute('CREATE INDEX IF NOT EXISTS image_hash_index ON images(hash)')
self._con.commit()

def ensure_version(self):
Expand All @@ -61,6 +69,9 @@ def ensure_version(self):
if db_version < 2:
self._con.execute('ALTER TABLE images ADD COLUMN indexing BOOLEAN')
db_version = 2
if db_version < 3:
self._con.execute('ALTER TABLE images ADD COLUMN hash TEXT')
db_version = 3
if db_version < self.VERSION:
raise Exception('migration to a newer index version isn\'t implemented')
if db_version_entry:
Expand All @@ -74,10 +85,10 @@ def commit(self):

def upsert_image(self, image: NewImage, commit: bool = True):
    """Insert ``image``, or overwrite the stored row that has the same filepath.

    Args:
        image: Mapping with ``filepath``, ``modified_at``, ``size`` and
            ``vector``; ``hash``, ``deleted`` and ``indexing`` are optional.
        commit: When true, commit the connection right after the statement.
            Pass ``False`` to batch several upserts into one commit.
    """
    # Defaults are listed first so values supplied in ``image`` win.
    # ``hash`` falls back to NULL (the column is nullable) so that callers
    # which build an image without a hash do not fail on a missing binding.
    params = {'deleted': None, 'indexing': None, 'hash': None, **image}
    self._con.execute('''
        INSERT INTO images(deleted, indexing, filepath, modified_at, size, vector, hash)
        VALUES (:deleted, :indexing, :filepath, :modified_at, :size, :vector, :hash)
        ON CONFLICT(filepath) DO UPDATE SET
            deleted=:deleted, indexing=:indexing, modified_at=:modified_at, size=:size, vector=:vector, hash=:hash
    ''', params)
    if commit:
        self._con.commit()
Expand Down Expand Up @@ -113,3 +124,22 @@ def get_image_vectors_by_dir_path(self, path: str) -> sqlite3.Cursor:
return self._con.execute(
f'SELECT filepath, vector FROM images WHERE filepath LIKE ? AND deleted IS NULL', (path + f'{os.path.sep}%',)
)

def get_image_by_hash(self, hash: str) -> Optional[Image]:
    """Return one non-deleted row whose ``hash`` column equals ``hash``.

    Yields ``None`` when no such row exists. If several rows share the hash,
    which one is returned is left to SQLite (the query has no ORDER BY).
    """
    query = 'SELECT * FROM images WHERE hash = ? AND deleted IS NULL LIMIT 1'
    return self._con.execute(query, (hash,)).fetchone()

def update_image_filepath(self, old_filepath: str, new_filepath: str, commit: bool = True):
    """Re-point the row stored under ``old_filepath`` to ``new_filepath``.

    If the UPDATE violates a constraint and ``get_image`` finds a row at
    ``new_filepath``, the row at ``old_filepath`` is deleted and the one at
    ``new_filepath`` is left untouched. If no such row is found, the
    ``sqlite3.IntegrityError`` propagates to the caller.

    Args:
        old_filepath: Path currently stored in the index.
        new_filepath: Path the row should carry afterwards.
        commit: When true, commit the connection once the change is applied.
    """
    try:
        self._con.execute('UPDATE images SET filepath = ? WHERE filepath = ?', (new_filepath, old_filepath))
    except sqlite3.IntegrityError:
        # Nothing visible at the target path means the conflict is not one we
        # know how to resolve: hand the original error back to the caller.
        if not self.get_image(filepath=new_filepath):
            raise
        # The target path is already indexed, so drop the now-redundant old row.
        self._con.execute('DELETE FROM images WHERE filepath = ?', (old_filepath,))
    if commit:
        self._con.commit()
66 changes: 45 additions & 21 deletions rclip/main.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import itertools
import os
import re
import sqlite3
import sys
import threading
from typing import Iterable, List, NamedTuple, Optional, Tuple, TypedDict, cast
from typing import List, NamedTuple, Optional, Tuple, TypedDict

import numpy as np
from tqdm import tqdm
import PIL
from PIL import Image, ImageFile
import hashlib

from rclip import db, fs, model
from rclip.utils.preview import preview
Expand All @@ -27,7 +29,7 @@ class ImageMeta(TypedDict):
PathMetaVector = Tuple[str, ImageMeta, model.FeatureVector]


def get_image_meta(entry: os.DirEntry[str]) -> ImageMeta:
    """Build the ImageMeta (modification time and byte size) for a directory entry."""
    entry_stat = entry.stat()
    return ImageMeta(
        modified_at=entry_stat.st_mtime,
        size=entry_stat.st_size,
    )

Expand Down Expand Up @@ -62,31 +64,54 @@ def __init__(
excluded_dirs = '|'.join(re.escape(dir) for dir in exclude_dirs or self.EXCLUDE_DIRS_DEFAULT)
self._exclude_dir_regex = re.compile(f'^.+\\{os.path.sep}({excluded_dirs})(\\{os.path.sep}.+)?$')

def _compute_image_hash(self, image_path: str) -> str:
    """Return the hexadecimal MD5 digest of the file at ``image_path``.

    The file is fed to the digest in fixed-size chunks so that a large image
    is never held in memory as a whole. The result is identical to hashing
    the full file contents at once.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    chunk_size = 1024 * 1024
    digest = hashlib.md5()
    with open(image_path, 'rb') as f:
        # iter() with a sentinel stops at the empty bytes object read() returns at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

def _index_files(self, filepaths: List[str], metas: List[ImageMeta]):
    """Compute feature vectors for a batch of files and store them in the index.

    ``metas[i]`` must describe ``filepaths[i]``. A file that cannot be hashed
    or opened as an image is skipped together with its metadata, so the
    remaining paths, metadata, hashes and vectors stay aligned. If the model
    fails on the batch, nothing is written. Otherwise every loaded image is
    upserted and the batch is committed once at the end.

    A file whose content hash matches a stored row under a different path is
    treated as a rename only when the stored path no longer exists on disk;
    if it still exists the new file is a copy and gets its own row.
    """
    images: List[Image.Image] = []
    loaded_paths: List[str] = []
    loaded_metas: List[ImageMeta] = []
    hashes: List[str] = []
    for path, meta in zip(filepaths, metas):
        # Do both fallible steps before touching any list, so a failure in
        # either one cannot leave the parallel lists with different lengths.
        try:
            image_hash = self._compute_image_hash(path)
            image = Image.open(path)
        except PIL.UnidentifiedImageError:
            # Not an image PIL understands: skip it silently.
            continue
        except Exception as ex:
            print(f'error loading image {path}:', ex, file=sys.stderr)
            continue
        images.append(image)
        loaded_paths.append(path)
        loaded_metas.append(meta)
        hashes.append(image_hash)

    try:
        features = self._model.compute_image_features(images)
    except Exception as ex:
        print('error computing features:', ex, file=sys.stderr)
        return

    for path, meta, vector, image_hash in zip(loaded_paths, loaded_metas, features, hashes):
        existing_image = self._db.get_image_by_hash(image_hash)
        is_rename = (
            existing_image is not None
            and existing_image['filepath'] != path
            and not os.path.exists(existing_image['filepath'])
        )
        if is_rename:
            try:
                # Move the stored row so the renamed file keeps its entry
                # instead of leaving a stale row under the old path.
                self._db.update_image_filepath(existing_image['filepath'], path, commit=False)
            except sqlite3.IntegrityError:
                # The move could not be applied; the upsert below still
                # records the file under its current path.
                pass
        # Always upsert: after a move this refreshes modified_at/size and
        # resets the deleted/indexing flags, as for any other indexed file.
        self._db.upsert_image(db.NewImage(
            filepath=path,
            modified_at=meta['modified_at'],
            size=meta['size'],
            vector=vector.tobytes(),
            hash=image_hash
        ), commit=False)
    self._db.commit()

def ensure_index(self, directory: str):
print(
Expand Down Expand Up @@ -136,7 +161,6 @@ def update_total_images(count: int):
self._index_files(batch, metas)
batch = []
metas = []

if len(batch) != 0:
self._index_files(batch, metas)

Expand Down
67 changes: 67 additions & 0 deletions tests/e2e/gen_img.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import os

# OpenCV draws the shapes and text, NumPy provides the canvas, random picks the parameters
import cv2
import numpy as np
import random

# Destination folder for the generated PNG files; created on start-up if missing
# (including any missing parent directories).
output_dir = './images nested directories/generated_images'
os.makedirs(output_dir, exist_ok=True)

# Number of images to generate
num_images = 10000
# Canvas dimensions: the first value is used as the row count and the second as
# the column count of the NumPy array each image is drawn on.
image_size = (512, 512)

# Kinds of shapes that may be drawn on an image
shapes = ['circle', 'rectangle', 'line']
# Words that may be rendered as text on an image
words = ['Tree', 'Car', 'House', 'Sun', 'Sky']

def _random_color():
    """Return a random 3-channel colour tuple with each channel in 0..255."""
    return (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))


def generate_image(index: int):
    """Draw random shapes and words on a white canvas and save it as a PNG.

    The file is written to ``output_dir`` as ``image_<index>.png``.
    ``image_size`` is read as ``(height, width)``, matching the row/column
    order of the NumPy canvas. OpenCV points are ``(x, y)``, so x coordinates
    are bounded by the width and y coordinates by the height; for a square
    ``image_size`` this is the same as indexing the tuple either way round.

    Args:
        index: Number used in the output file name.

    Raises:
        OSError: if OpenCV reports that the file could not be written.
    """
    height, width = image_size[0], image_size[1]

    # Blank white canvas
    img = np.full((height, width, 3), 255, dtype=np.uint8)

    # Draw between one and five random shapes
    for _ in range(random.randint(1, 5)):
        shape = random.choice(shapes)

        if shape == 'circle':
            # Keep the centre at least 50px away from every edge
            center = (random.randint(50, width - 50), random.randint(50, height - 50))
            radius = random.randint(20, 80)
            color = _random_color()
            thickness = random.randint(1, 5)
            cv2.circle(img, center, radius, color, thickness)

        elif shape == 'rectangle':
            # First corner in the top-left quadrant, opposite corner in the bottom-right one
            pt1 = (random.randint(0, width // 2), random.randint(0, height // 2))
            pt2 = (random.randint(width // 2, width), random.randint(height // 2, height))
            color = _random_color()
            thickness = random.randint(1, 5)
            cv2.rectangle(img, pt1, pt2, color, thickness)

        elif shape == 'line':
            pt1 = (random.randint(0, width), random.randint(0, height))
            pt2 = (random.randint(0, width), random.randint(0, height))
            color = _random_color()
            thickness = random.randint(1, 5)
            cv2.line(img, pt1, pt2, color, thickness)

    # Add between one and three random words
    for _ in range(random.randint(1, 3)):
        text = random.choice(words)
        font = cv2.FONT_HERSHEY_SIMPLEX
        # Leave 150px on the right so the text has room to extend from its origin
        position = (random.randint(50, width - 150), random.randint(50, height - 50))
        font_scale = random.uniform(0.5, 1.5)
        color = _random_color()
        thickness = random.randint(1, 3)
        cv2.putText(img, text, position, font, font_scale, color, thickness, cv2.LINE_AA)

    # cv2.imwrite signals failure through its return value rather than an
    # exception, so check it instead of silently producing no file.
    target = os.path.join(output_dir, f'image_{index}.png')
    if not cv2.imwrite(target, img):
        raise OSError(f'failed to write image to {target}')

# Render image_0 .. image_<num_images - 1> one after another
for image_index in range(num_images):
    generate_image(image_index)

# Report how many files were produced and where they were written
print(f'{num_images} meaningful images generated successfully in {output_dir} directory.')