From 6a4d0fdbcc19df767da0253c874ab0ebb1734540 Mon Sep 17 00:00:00 2001 From: prayas7102 Date: Thu, 17 Oct 2024 18:09:58 +0530 Subject: [PATCH 01/11] add-image-hash-to-detect-rename-and-optimize-reindexing --- rclip/db.py | 24 +++++++++-- rclip/main.py | 113 +++++++++++++++++++++++++++++--------------------- 2 files changed, 85 insertions(+), 52 deletions(-) diff --git a/rclip/db.py b/rclip/db.py index 35ae56f3..dcab3c36 100644 --- a/rclip/db.py +++ b/rclip/db.py @@ -13,6 +13,7 @@ class NewImage(ImageOmittable): modified_at: float size: int vector: bytes + hash: str class Image(NewImage): @@ -27,11 +28,20 @@ def __init__(self, filename: Union[str, pathlib.Path]): self._con.row_factory = sqlite3.Row self.ensure_tables() self.ensure_version() + self._migrate_db() def close(self): self._con.commit() self._con.close() + def _migrate_db(self): + try: + self._con.execute('ALTER TABLE images ADD COLUMN hash TEXT') + self._con.commit() + except sqlite3.OperationalError: + # Column already exists, skip + pass + def ensure_tables(self): self._con.execute(''' CREATE TABLE IF NOT EXISTS images ( @@ -40,6 +50,7 @@ def ensure_tables(self): filepath TEXT NOT NULL UNIQUE, modified_at DATETIME NOT NULL, size INTEGER NOT NULL, + hash TEXT NOT NULL, vector BLOB NOT NULL ) ''') @@ -74,14 +85,14 @@ def commit(self): def upsert_image(self, image: NewImage, commit: bool = True): self._con.execute(''' - INSERT INTO images(deleted, indexing, filepath, modified_at, size, vector) - VALUES (:deleted, :indexing, :filepath, :modified_at, :size, :vector) + INSERT INTO images(deleted, indexing, filepath, modified_at, size, hash, vector) + VALUES (:deleted, :indexing, :filepath, :modified_at, :size, :hash, :vector) ON CONFLICT(filepath) DO UPDATE SET - deleted=:deleted, indexing=:indexing, modified_at=:modified_at, size=:size, vector=:vector + deleted=:deleted, indexing=:indexing, modified_at=:modified_at, size=:size, hash=:hash, vector=:vector ''', {'deleted': None, 'indexing': None, **image}) if commit: self._con.commit() - + def remove_indexing_flag_from_all_images(self, commit: bool = True): self._con.execute('UPDATE images SET indexing = NULL') if commit: @@ -113,3 +124,8 @@ def get_image_vectors_by_dir_path(self, path: str) -> sqlite3.Cursor: return self._con.execute( f'SELECT filepath, vector FROM images WHERE filepath LIKE ? AND deleted IS NULL', (path + f'{os.path.sep}%',) ) + + def get_image_by_hash(self, hash: str) -> Optional[Image]: + row = self._con.execute('SELECT * FROM images WHERE hash = ?', (hash,)) + row = row.fetchone() + return Image(**row) if row else None \ No newline at end of file diff --git a/rclip/main.py b/rclip/main.py index 53a9a328..f60ab9ae 100644 --- a/rclip/main.py +++ b/rclip/main.py @@ -9,6 +9,7 @@ from tqdm import tqdm import PIL from PIL import Image, ImageFile +import imagehash from rclip import db, fs, model from rclip.utils.preview import preview @@ -22,14 +23,19 @@ class ImageMeta(TypedDict): modified_at: float size: int + hash: str PathMetaVector = Tuple[str, ImageMeta, model.FeatureVector] +def compute_image_hash(image_path: str) -> str: + with Image.open(image_path) as img: + return str(imagehash.average_hash(img)) + def get_image_meta(entry: os.DirEntry) -> ImageMeta: stat = entry.stat() - return ImageMeta(modified_at=stat.st_mtime, size=stat.st_size) + return ImageMeta(modified_at=stat.st_mtime,size=stat.st_size,hash=compute_image_hash(entry.path)) def is_image_meta_equal(image: db.Image, meta: ImageMeta) -> bool: @@ -85,63 +91,74 @@ def _index_files(self, filepaths: List[str], metas: List[ImageMeta]): filepath=path, modified_at=meta['modified_at'], size=meta['size'], - vector=vector.tobytes() + vector=vector.tobytes(), + hash=meta['hash'], ), commit=False) def ensure_index(self, directory: str): print( - 'checking images in the current directory for changes;' - ' use "--no-indexing" to skip this if no images were added, changed, or removed', - file=sys.stderr, + 'checking images in the current directory for changes;' + ' use "--no-indexing" to skip this if no images were added, changed, or removed', + file=sys.stderr, ) self._db.remove_indexing_flag_from_all_images(commit=False) self._db.flag_images_in_a_dir_as_indexing(directory, commit=True) with tqdm(total=None, unit='images') as pbar: - def update_total_images(count: int): - pbar.total = count - pbar.refresh() - counter_thread = threading.Thread( - target=fs.count_files, - args=(directory, self._exclude_dir_regex, self.IMAGE_REGEX, update_total_images), - ) - counter_thread.start() - - images_processed = 0 - batch: List[str] = [] - metas: List[ImageMeta] = [] - for entry in fs.walk(directory, self._exclude_dir_regex, self.IMAGE_REGEX): - filepath = entry.path - image = self._db.get_image(filepath=filepath) - try: - meta = get_image_meta(entry) - except Exception as ex: - print(f'error getting fs metadata for {filepath}:', ex, file=sys.stderr) - continue - - if not images_processed % self.DB_IMAGES_BEFORE_COMMIT: - self._db.commit() - images_processed += 1 - pbar.update() - - if image and is_image_meta_equal(image, meta): - self._db.remove_indexing_flag(filepath, commit=False) - continue - - batch.append(filepath) - metas.append(meta) - - if len(batch) >= self._indexing_batch_size: - self._index_files(batch, metas) - batch = [] - metas = [] - - if len(batch) != 0: - self._index_files(batch, metas) - - self._db.commit() - counter_thread.join() + def update_total_images(count: int): + pbar.total = count + pbar.refresh() + counter_thread = threading.Thread( + target=fs.count_files, + args=(directory, self._exclude_dir_regex, self.IMAGE_REGEX, update_total_images), + ) + counter_thread.start() + + images_processed = 0 + batch: List[str] = [] + metas: List[ImageMeta] = [] + for entry in fs.walk(directory, self._exclude_dir_regex, self.IMAGE_REGEX): + filepath = entry.path + try: + meta = get_image_meta(entry) + except Exception as ex: + print(f'error getting fs metadata for {filepath}:', ex, file=sys.stderr) + continue + + if not images_processed % self.DB_IMAGES_BEFORE_COMMIT: + self._db.commit() + images_processed += 1 + pbar.update() + + existing_image = self._db.get_image_by_hash(meta['hash']) + + if existing_image: + if existing_image['filepath'] != filepath: + # Image was renamed, update the filepath + self._db.upsert_image(db.NewImage( + filepath=filepath, + modified_at=meta['modified_at'], + size=meta['size'], + hash=meta['hash'], + vector=existing_image['vector'] + ), commit=False) + self._db.remove_indexing_flag(filepath, commit=False) + continue + + batch.append(filepath) + metas.append(meta) + + if len(batch) >= self._indexing_batch_size: + self._index_files(batch, metas) + batch = [] + metas = [] + + if len(batch) != 0: + self._index_files(batch, metas) + + self._db.commit() + counter_thread.join() self._db.flag_indexing_images_in_a_dir_as_deleted(directory) print('', file=sys.stderr) From af53cca3d4027b2eafb479e93a00ff9f02b78d8e Mon Sep 17 00:00:00 2001 From: prayas7102 Date: Fri, 18 Oct 2024 14:36:59 +0530 Subject: [PATCH 02/11] index for hash column --- rclip/db.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rclip/db.py b/rclip/db.py index dcab3c36..735b4237 100644 --- a/rclip/db.py +++ b/rclip/db.py @@ -36,10 +36,11 @@ def close(self): def _migrate_db(self): try: - self._con.execute('ALTER TABLE images ADD COLUMN hash TEXT') + self._con.execute('ALTER TABLE images ADD COLUMN hash TEXT NOT NULL DEFAULT ""') + self._con.execute('CREATE INDEX IF NOT EXISTS idx_images_hash ON images(hash)') self._con.commit() except sqlite3.OperationalError: - # Column already exists, skip + # Column or index already exists, skip pass def ensure_tables(self): @@ -52,10 +53,11 @@ def ensure_tables(self): size INTEGER NOT NULL, hash TEXT NOT NULL, vector BLOB NOT NULL - ) + ); ''') # Query for images self._con.execute('CREATE UNIQUE INDEX IF NOT EXISTS existing_images ON images(filepath) WHERE deleted IS NULL') + self._con.execute('CREATE INDEX IF NOT EXISTS idx_images_hash ON images(hash)') self._con.execute('CREATE TABLE IF NOT EXISTS db_version (version INTEGER)') self._con.commit() From 81897e443aa70565482081d513f919245a58c3ce Mon Sep 17 00:00:00 2001 From: prayas7102 Date: Fri, 18 Oct 2024 17:25:10 +0530 Subject: [PATCH 03/11] performance test --- rclip/main.py | 11 ++++++-- tests/e2e/gen_img.py | 67 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 2 deletions(-) create mode 100644 tests/e2e/gen_img.py diff --git a/rclip/main.py b/rclip/main.py index f60ab9ae..ce277f9f 100644 --- a/rclip/main.py +++ b/rclip/main.py @@ -3,6 +3,7 @@ import re import sys import threading +import time from typing import Iterable, List, NamedTuple, Optional, Tuple, TypedDict, cast import numpy as np @@ -101,7 +102,7 @@ def ensure_index(self, directory: str): ' use "--no-indexing" to skip this if no images were added, changed, or removed', file=sys.stderr, ) - + self._db.remove_indexing_flag_from_all_images(commit=False) self._db.flag_images_in_a_dir_as_indexing(directory, commit=True) @@ -118,6 +119,8 @@ def update_total_images(count: int): images_processed = 0 batch: List[str] = [] metas: List[ImageMeta] = [] + lookup_time_sum = 0 + start_time = time.time() for entry in fs.walk(directory, self._exclude_dir_regex, self.IMAGE_REGEX): filepath = entry.path try: @@ -131,7 +134,9 @@ def update_total_images(count: int): images_processed += 1 pbar.update() + lookup_start = time.time() existing_image = self._db.get_image_by_hash(meta['hash']) + lookup_time_sum += time.time() - lookup_start if existing_image: if existing_image['filepath'] != filepath: @@ -153,7 +158,9 @@ def update_total_images(count: int): self._index_files(batch, metas) batch = [] metas = [] - + total_time = time.time() - start_time + print(f"Total indexing time: {total_time:.2f} seconds {lookup_time_sum:.2f}") + print(f"Average lookup time: {lookup_time_sum/images_processed:.5f} seconds") if len(batch) != 0: self._index_files(batch, metas) diff --git a/tests/e2e/gen_img.py b/tests/e2e/gen_img.py new file mode 100644 index 00000000..71496c76 --- /dev/null +++ b/tests/e2e/gen_img.py @@ -0,0 +1,67 @@ +import os + +# Create output directory if it doesn't exist +import cv2 +import numpy as np +import random + +# Create output directory if it doesn't exist +output_dir = './tests/e2e/generated_images' +os.makedirs(output_dir, exist_ok=True) + +# Number of images to generate +num_images = 1000 +image_size = (512, 512) # Image size + +# List of random shapes and objects to add to the image +shapes = ['circle', 'rectangle', 'line'] +words = ['Tree', 'Car', 'House', 'Sun', 'Sky'] + +def generate_image(index: int): + # Create a blank white image + img = np.ones((image_size[0], image_size[1], 3), dtype=np.uint8) * 255 + + # Draw random shapes + for _ in range(random.randint(1, 5)): # Random number of shapes + shape = random.choice(shapes) + + if shape == 'circle': + center = (random.randint(50, image_size[0]-50), random.randint(50, image_size[1]-50)) + radius = random.randint(20, 80) + color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) + thickness = random.randint(1, 5) + cv2.circle(img, center, radius, color, thickness) + + elif shape == 'rectangle': + pt1 = (random.randint(0, image_size[0]//2), random.randint(0, image_size[1]//2)) + pt2 = (random.randint(image_size[0]//2, image_size[0]), random.randint(image_size[1]//2, image_size[1])) + color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) + thickness = random.randint(1, 5) + cv2.rectangle(img, pt1, pt2, color, thickness) + + elif shape == 'line': + pt1 = (random.randint(0, image_size[0]), random.randint(0, image_size[1])) + pt2 = (random.randint(0, image_size[0]), random.randint(0, image_size[1])) + color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) + thickness = random.randint(1, 5) + cv2.line(img, pt1, pt2, color, thickness) + + # Add random text (object names) + for _ in range(random.randint(1, 3)): # Random number of text objects + text = random.choice(words) + font = cv2.FONT_HERSHEY_SIMPLEX + position = (random.randint(50, image_size[0]-150), random.randint(50, image_size[1]-50)) + font_scale = random.uniform(0.5, 1.5) + color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) + thickness = random.randint(1, 3) + cv2.putText(img, text, position, font, font_scale, color, thickness, cv2.LINE_AA) + + # Save the image + cv2.imwrite(os.path.join(output_dir, f'image_{index}.png'), img) + +# Generate the images +for i in range(num_images): + generate_image(i) + +print(f'{num_images} meaningful images generated successfully in {output_dir} directory.') + From 956ab6eb41920415a66ebf867b37e13686ca84f6 Mon Sep 17 00:00:00 2001 From: prayas7102 Date: Sun, 20 Oct 2024 17:59:42 +0530 Subject: [PATCH 04/11] Add image hash support and handle renamed images --- rclip/db.py | 49 ++++++++------ rclip/main.py | 156 +++++++++++++++++++++---------------------- tests/e2e/gen_img.py | 4 +- 3 files changed, 105 insertions(+), 104 deletions(-) diff --git a/rclip/db.py b/rclip/db.py index 735b4237..4f30c034 100644 --- a/rclip/db.py +++ b/rclip/db.py @@ -21,28 +21,18 @@ class Image(NewImage): class DB: - VERSION = 2 + VERSION = 3 def __init__(self, filename: Union[str, pathlib.Path]): self._con = sqlite3.connect(filename) self._con.row_factory = sqlite3.Row self.ensure_tables() self.ensure_version() - self._migrate_db() def close(self): self._con.commit() self._con.close() - def _migrate_db(self): - try: - self._con.execute('ALTER TABLE images ADD COLUMN hash TEXT NOT NULL DEFAULT ""') - self._con.execute('CREATE INDEX IF NOT EXISTS idx_images_hash ON images(hash)') - self._con.commit() - except sqlite3.OperationalError: - # Column or index already exists, skip - pass - def ensure_tables(self): self._con.execute(''' CREATE TABLE IF NOT EXISTS images ( @@ -51,13 +41,13 @@ def ensure_tables(self): filepath TEXT NOT NULL UNIQUE, modified_at DATETIME NOT NULL, size INTEGER NOT NULL, - hash TEXT NOT NULL, - vector BLOB NOT NULL - ); + vector BLOB NOT NULL, + hash TEXT NOT NULL + ) ''') # Query for images self._con.execute('CREATE UNIQUE INDEX IF NOT EXISTS existing_images ON images(filepath) WHERE deleted IS NULL') - self._con.execute('CREATE INDEX IF NOT EXISTS idx_images_hash ON images(hash)') + self._con.execute('CREATE INDEX IF NOT EXISTS image_hash_index ON images(hash)') # New index for hash self._con.execute('CREATE TABLE IF NOT EXISTS db_version (version INTEGER)') self._con.commit() @@ -74,6 +64,17 @@ def ensure_version(self): if db_version < 2: self._con.execute('ALTER TABLE images ADD COLUMN indexing BOOLEAN') db_version = 2 + if db_version < 3: + # Check if the 'hash' column already exists + cursor = self._con.execute("PRAGMA table_info(images)") + columns = [column[1] for column in cursor.fetchall()] + if 'hash' not in columns: + self._con.execute('ALTER TABLE images ADD COLUMN hash TEXT') + # Check if the index already exists + cursor = self._con.execute("SELECT name FROM sqlite_master WHERE type='index' AND name='image_hash_index'") + if not cursor.fetchone(): + self._con.execute('CREATE INDEX image_hash_index ON images(hash)') + db_version = 3 if db_version < self.VERSION: raise Exception('migration to a newer index version isn\'t implemented') if db_version_entry: @@ -87,14 +88,14 @@ def commit(self): def upsert_image(self, image: NewImage, commit: bool = True): self._con.execute(''' - INSERT INTO images(deleted, indexing, filepath, modified_at, size, hash, vector) - VALUES (:deleted, :indexing, :filepath, :modified_at, :size, :hash, :vector) + INSERT INTO images(deleted, indexing, filepath, modified_at, size, vector, hash) + VALUES (:deleted, :indexing, :filepath, :modified_at, :size, :vector, :hash) ON CONFLICT(filepath) DO UPDATE SET - deleted=:deleted, indexing=:indexing, modified_at=:modified_at, size=:size, hash=:hash, vector=:vector + deleted=:deleted, indexing=:indexing, modified_at=:modified_at, size=:size, vector=:vector, hash=:hash ''', {'deleted': None, 'indexing': None, **image}) if commit: self._con.commit() - + def remove_indexing_flag_from_all_images(self, commit: bool = True): self._con.execute('UPDATE images SET indexing = NULL') if commit: @@ -128,6 +129,10 @@ def get_image_vectors_by_dir_path(self, path: str) -> sqlite3.Cursor: ) def get_image_by_hash(self, hash: str) -> Optional[Image]: - row = self._con.execute('SELECT * FROM images WHERE hash = ?', (hash,)) - row = row.fetchone() - return Image(**row) if row else None \ No newline at end of file + cur = self._con.execute('SELECT * FROM images WHERE hash = ? AND deleted IS NULL LIMIT 1', (hash,)) + return cur.fetchone() + + def update_image_filepath(self, old_filepath: str, new_filepath: str, commit: bool = True): + self._con.execute('UPDATE images SET filepath = ? WHERE filepath = ?', (new_filepath, old_filepath)) + if commit: + self._con.commit() \ No newline at end of file diff --git a/rclip/main.py b/rclip/main.py index ce277f9f..986fb594 100644 --- a/rclip/main.py +++ b/rclip/main.py @@ -4,13 +4,13 @@ import sys import threading import time -from typing import Iterable, List, NamedTuple, Optional, Tuple, TypedDict, cast +from typing import List, NamedTuple, Optional, Tuple, TypedDict import numpy as np from tqdm import tqdm import PIL from PIL import Image, ImageFile -import imagehash +import hashlib from rclip import db, fs, model from rclip.utils.preview import preview @@ -24,19 +24,14 @@ class ImageMeta(TypedDict): modified_at: float size: int - hash: str PathMetaVector = Tuple[str, ImageMeta, model.FeatureVector] -def compute_image_hash(image_path: str) -> str: - with Image.open(image_path) as img: - return str(imagehash.average_hash(img)) - -def get_image_meta(entry: os.DirEntry) -> ImageMeta: +def get_image_meta(entry: os.DirEntry[str]) -> ImageMeta: stat = entry.stat() - return ImageMeta(modified_at=stat.st_mtime,size=stat.st_size,hash=compute_image_hash(entry.path)) + return ImageMeta(modified_at=stat.st_mtime, size=stat.st_size) def is_image_meta_equal(image: db.Image, meta: ImageMeta) -> bool: @@ -69,14 +64,20 @@ def __init__( excluded_dirs = '|'.join(re.escape(dir) for dir in exclude_dirs or self.EXCLUDE_DIRS_DEFAULT) self._exclude_dir_regex = re.compile(f'^.+\\{os.path.sep}({excluded_dirs})(\\{os.path.sep}.+)?$') + def _compute_image_hash(self, image_path: str) -> str: + with open(image_path, 'rb') as f: + return hashlib.md5(f.read()).hexdigest() + def _index_files(self, filepaths: List[str], metas: List[ImageMeta]): images: List[Image.Image] = [] filtered_paths: List[str] = [] + hashes: List[str] = [] for path in filepaths: try: image = Image.open(path) images.append(image) filtered_paths.append(path) + hashes.append(self._compute_image_hash(path)) except PIL.UnidentifiedImageError as ex: pass except Exception as ex: @@ -87,85 +88,80 @@ def _index_files(self, filepaths: List[str], metas: List[ImageMeta]): except Exception as ex: print('error computing features:', ex, file=sys.stderr) return - for path, meta, vector in cast(Iterable[PathMetaVector], zip(filtered_paths, metas, features)): - self._db.upsert_image(db.NewImage( - filepath=path, - modified_at=meta['modified_at'], - size=meta['size'], - vector=vector.tobytes(), - hash=meta['hash'], - ), commit=False) + for path, meta, vector, hash in zip(filtered_paths, metas, features, hashes): + existing_image = self._db.get_image_by_hash(hash) + if existing_image and existing_image['filepath'] != path: + # Image was renamed, update the filepath + self._db.update_image_filepath(existing_image['filepath'], path, commit=False) + else: + self._db.upsert_image(db.NewImage( + filepath=path, + modified_at=meta['modified_at'], + size=meta['size'], + vector=vector.tobytes(), + hash=hash + ), commit=False) def ensure_index(self, directory: str): print( - 'checking images in the current directory for changes;' - ' use "--no-indexing" to skip this if no images were added, changed, or removed', - file=sys.stderr, + 'checking images in the current directory for changes;' + ' use "--no-indexing" to skip this if no images were added, changed, or removed', + file=sys.stderr, ) - + self._db.remove_indexing_flag_from_all_images(commit=False) self._db.flag_images_in_a_dir_as_indexing(directory, commit=True) with tqdm(total=None, unit='images') as pbar: - def update_total_images(count: int): - pbar.total = count - pbar.refresh() - counter_thread = threading.Thread( - target=fs.count_files, - args=(directory, self._exclude_dir_regex, self.IMAGE_REGEX, update_total_images), - ) - counter_thread.start() - - images_processed = 0 - batch: List[str] = [] - metas: List[ImageMeta] = [] - lookup_time_sum = 0 - start_time = time.time() - for entry in fs.walk(directory, self._exclude_dir_regex, self.IMAGE_REGEX): - filepath = entry.path - try: - meta = get_image_meta(entry) - except Exception as ex: - print(f'error getting fs metadata for {filepath}:', ex, file=sys.stderr) - continue - - if not images_processed % self.DB_IMAGES_BEFORE_COMMIT: - self._db.commit() - images_processed += 1 - pbar.update() - - lookup_start = time.time() - existing_image = self._db.get_image_by_hash(meta['hash']) - lookup_time_sum += time.time() - lookup_start - - if existing_image: - if existing_image['filepath'] != filepath: - # Image was renamed, update the filepath - self._db.upsert_image(db.NewImage( - filepath=filepath, - modified_at=meta['modified_at'], - size=meta['size'], - hash=meta['hash'], - vector=existing_image['vector'] - ), commit=False) - self._db.remove_indexing_flag(filepath, commit=False) - continue - - batch.append(filepath) - metas.append(meta) - - if len(batch) >= self._indexing_batch_size: - self._index_files(batch, metas) - batch = [] - metas = [] - total_time = time.time() - start_time - print(f"Total indexing time: {total_time:.2f} seconds {lookup_time_sum:.2f}") - print(f"Average lookup time: {lookup_time_sum/images_processed:.5f} seconds") - if len(batch) != 0: - self._index_files(batch, metas) - - self._db.commit() - counter_thread.join() + def update_total_images(count: int): + pbar.total = count + pbar.refresh() + counter_thread = threading.Thread( + target=fs.count_files, + args=(directory, self._exclude_dir_regex, self.IMAGE_REGEX, update_total_images), + ) + counter_thread.start() + + images_processed = 0 + batch: List[str] = [] + metas: List[ImageMeta] = [] + lookup_time_sum = 0 + start_time = time.time() + for entry in fs.walk(directory, self._exclude_dir_regex, self.IMAGE_REGEX): + filepath = entry.path + lookup_start = time.time() + image = self._db.get_image(filepath=filepath) + lookup_time_sum += time.time() - lookup_start + try: + meta = get_image_meta(entry) + except Exception as ex: + print(f'error getting fs metadata for {filepath}:', ex, file=sys.stderr) + continue + + if not images_processed % self.DB_IMAGES_BEFORE_COMMIT: + self._db.commit() + images_processed += 1 + pbar.update() + + if image and is_image_meta_equal(image, meta): + self._db.remove_indexing_flag(filepath, commit=False) + continue + + batch.append(filepath) + metas.append(meta) + + if len(batch) >= self._indexing_batch_size: + self._index_files(batch, metas) + batch = [] + metas = [] + total_time = time.time() - start_time + print(f"Total indexing time: {total_time:.2f} seconds") + print(f"Average lookup time: {lookup_time_sum/images_processed:.5f} seconds") + if len(batch) != 0: + self._index_files(batch, metas) + + self._db.commit() + counter_thread.join() self._db.flag_indexing_images_in_a_dir_as_deleted(directory) print('', file=sys.stderr) diff --git a/tests/e2e/gen_img.py b/tests/e2e/gen_img.py index 71496c76..91c00b73 100644 --- a/tests/e2e/gen_img.py +++ b/tests/e2e/gen_img.py @@ -6,11 +6,11 @@ import random # Create output directory if it doesn't exist -output_dir = './tests/e2e/generated_images' +output_dir = './tests/e2e/generated_images/images nested directories' os.makedirs(output_dir, exist_ok=True) # Number of images to generate -num_images = 1000 +num_images = 10000 image_size = (512, 512) # Image size # List of random shapes and objects to add to the image From 7fd52c941f5aee5ea52591173f87698b7625a69a Mon Sep 17 00:00:00 2001 From: prayas7102 Date: Sun, 20 Oct 2024 18:41:44 +0530 Subject: [PATCH 05/11] gen_img output_dir, path resolved --- tests/e2e/gen_img.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/gen_img.py b/tests/e2e/gen_img.py index 91c00b73..77986c68 100644 --- a/tests/e2e/gen_img.py +++ b/tests/e2e/gen_img.py @@ -6,7 +6,7 @@ import random # Create output directory if it doesn't exist -output_dir = './tests/e2e/generated_images/images nested directories' +output_dir = './images nested directories/generated_images' os.makedirs(output_dir, exist_ok=True) # Number of images to generate From 1b2f6a3aa6bc91dc7f1006f16fc3c10719524092 Mon Sep 17 00:00:00 2001 From: prayas7102 Date: Sun, 20 Oct 2024 19:10:21 +0530 Subject: [PATCH 06/11] _migrate_db added --- rclip/db.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/rclip/db.py b/rclip/db.py index 4f30c034..54a708e8 100644 --- a/rclip/db.py +++ b/rclip/db.py @@ -28,7 +28,16 @@ def __init__(self, filename: Union[str, pathlib.Path]): self._con.row_factory = sqlite3.Row self.ensure_tables() self.ensure_version() - + self._migrate_db() + + def _migrate_db(self): + try: + self._con.execute('ALTER TABLE images ADD COLUMN hash TEXT') + self._con.commit() + except sqlite3.OperationalError: + # Column already exists, skip + pass + def close(self): self._con.commit() self._con.close() From 8e4aec6030b1b25db51c16b8f95e1582b185e73a Mon Sep 17 00:00:00 2001 From: prayas7102 Date: Sun, 20 Oct 2024 19:43:45 +0530 Subject: [PATCH 07/11] check for the hash column before creating the index --- rclip/db.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/rclip/db.py b/rclip/db.py index 54a708e8..86bd23f8 100644 --- a/rclip/db.py +++ b/rclip/db.py @@ -26,8 +26,8 @@ class DB: def __init__(self, filename: Union[str, pathlib.Path]): self._con = sqlite3.connect(filename) self._con.row_factory = sqlite3.Row - self.ensure_tables() self.ensure_version() + self.ensure_tables() self._migrate_db() def _migrate_db(self): @@ -51,13 +51,18 @@ def ensure_tables(self): modified_at DATETIME NOT NULL, size INTEGER NOT NULL, vector BLOB NOT NULL, - hash TEXT NOT NULL + hash TEXT, + indexing BOOLEAN ) ''') # Query for images self._con.execute('CREATE UNIQUE INDEX IF NOT EXISTS existing_images ON images(filepath) WHERE deleted IS NULL') - self._con.execute('CREATE INDEX IF NOT EXISTS image_hash_index ON images(hash)') # New index for hash self._con.execute('CREATE TABLE IF NOT EXISTS db_version (version INTEGER)') + # Check if 'hash' column exists before creating the index + cursor = self._con.execute("PRAGMA table_info(images)") + columns = [column[1] for column in cursor.fetchall()] + if 'hash' in columns: + self._con.execute('CREATE INDEX IF NOT EXISTS image_hash_index ON images(hash)') self._con.commit() def ensure_version(self): @@ -74,15 +79,7 @@ def ensure_version(self): self._con.execute('ALTER TABLE images ADD COLUMN indexing BOOLEAN') db_version = 2 if db_version < 3: - # Check if the 'hash' column already exists - cursor = self._con.execute("PRAGMA table_info(images)") - columns = [column[1] for column in cursor.fetchall()] - if 'hash' not in columns: - self._con.execute('ALTER TABLE images ADD COLUMN hash TEXT') - # Check if the index already exists - cursor = self._con.execute("SELECT name FROM sqlite_master WHERE type='index' AND name='image_hash_index'") - if not cursor.fetchone(): - self._con.execute('CREATE INDEX image_hash_index ON images(hash)') + self._con.execute('ALTER TABLE images ADD COLUMN hash TEXT') db_version = 3 if db_version < self.VERSION: raise Exception('migration to a newer index version isn\'t implemented') @@ -90,7 +87,7 @@ def ensure_version(self): self._con.execute('UPDATE db_version SET version=?', (self.VERSION,)) else: self._con.execute('INSERT INTO db_version(version) VALUES (?)', (self.VERSION,)) - self._con.commit() + self._con.commit() def commit(self): self._con.commit() From 6220a102aee9dd8240b8052eaacec430bd052a64 Mon Sep 17 00:00:00 2001 From: prayas7102 Date: Mon, 21 Oct 2024 07:19:56 +0530 Subject: [PATCH 08/11] provide indentation --- rclip/db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rclip/db.py b/rclip/db.py index 86bd23f8..e5f45965 100644 --- a/rclip/db.py +++ b/rclip/db.py @@ -87,7 +87,7 @@ def ensure_version(self): self._con.execute('UPDATE db_version SET version=?', (self.VERSION,)) else: self._con.execute('INSERT INTO db_version(version) VALUES (?)', (self.VERSION,)) - self._con.commit() + self._con.commit() def commit(self): self._con.commit() From 6a2f978b1c0244eacec502885b4427e49a8fef64 Mon Sep 17 00:00:00 2001 From: prayas7102 Date: Mon, 21 Oct 2024 07:38:37 +0530 Subject: [PATCH 09/11] remove redundant migrate_db --- rclip/db.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/rclip/db.py b/rclip/db.py index e5f45965..65f8b7ad 100644 --- a/rclip/db.py +++ b/rclip/db.py @@ -28,15 +28,6 @@ def __init__(self, filename: Union[str, pathlib.Path]): self._con.row_factory = sqlite3.Row self.ensure_version() self.ensure_tables() - self._migrate_db() - - def _migrate_db(self): - try: - self._con.execute('ALTER TABLE images ADD COLUMN hash TEXT') - self._con.commit() - except sqlite3.OperationalError: - # Column already exists, skip - pass def close(self): self._con.commit() From ed06f47dc57efe7bf924d91de0b3c0e549bc6036 Mon Sep 17 00:00:00 2001 From: prayas7102 Date: Mon, 21 Oct 2024 09:39:49 +0530 Subject: [PATCH 10/11] modified update_image_filepath to resolve unique constraint error --- rclip/db.py | 14 ++++++++++-- rclip/main.py | 60 ++++++++++++++++++++++++++++++--------------------- 2 files changed, 48 insertions(+), 26 deletions(-) diff --git a/rclip/db.py b/rclip/db.py index 65f8b7ad..c24ae8f1 100644 --- a/rclip/db.py +++ b/rclip/db.py @@ -130,6 +130,16 @@ def get_image_by_hash(self, hash: str) -> Optional[Image]: return cur.fetchone() def update_image_filepath(self, old_filepath: str, new_filepath: str, commit: bool = True): - self._con.execute('UPDATE images SET filepath = ? WHERE filepath = ?', (new_filepath, old_filepath)) + try: + self._con.execute('UPDATE images SET filepath = ? WHERE filepath = ?', (new_filepath, old_filepath)) + except sqlite3.IntegrityError: + # If the new filepath already exists, we need to merge the entries + existing_image = self.get_image(filepath=new_filepath) + if existing_image: + # Delete the old entry + self._con.execute('DELETE FROM images WHERE filepath = ?', (old_filepath,)) + else: + # If there's no existing image with the new filepath, re-raise the exception + raise if commit: - self._con.commit() \ No newline at end of file + self._con.commit() \ No newline at end of file diff --git a/rclip/main.py b/rclip/main.py index 986fb594..8f03d38d 100644 --- a/rclip/main.py +++ b/rclip/main.py @@ -1,6 +1,7 @@ import itertools import os import re +import sqlite3 import sys import threading import time @@ -73,34 +74,45 @@ def _index_files(self, filepaths: List[str], metas: List[ImageMeta]): filtered_paths: List[str] = [] hashes: List[str] = [] for path in filepaths: - try: - image = Image.open(path) - images.append(image) - filtered_paths.append(path) - hashes.append(self._compute_image_hash(path)) - except PIL.UnidentifiedImageError as ex: - pass - except Exception as ex: - print(f'error loading image {path}:', ex, file=sys.stderr) + try: + image = Image.open(path) + images.append(image) + filtered_paths.append(path) + hashes.append(self._compute_image_hash(path)) + except PIL.UnidentifiedImageError as ex: + pass + except Exception as ex: + print(f'error loading image {path}:', ex, file=sys.stderr) try: - features = self._model.compute_image_features(images) + features = self._model.compute_image_features(images) except Exception as ex: - print('error computing features:', ex, file=sys.stderr) - return + print('error computing features:', ex, file=sys.stderr) + return for path, meta, vector, hash in zip(filtered_paths, metas, features, hashes): - existing_image = self._db.get_image_by_hash(hash) - if existing_image and existing_image['filepath'] != path: - # Image was renamed, update the filepath - self._db.update_image_filepath(existing_image['filepath'], path, commit=False) - else: - self._db.upsert_image(db.NewImage( - filepath=path, - modified_at=meta['modified_at'], - size=meta['size'], - vector=vector.tobytes(), - hash=hash - ), commit=False) + existing_image = self._db.get_image_by_hash(hash) + if existing_image and existing_image['filepath'] != path: + # Image was renamed, update the filepath + try: + self._db.update_image_filepath(existing_image['filepath'], path, commit=False) + except sqlite3.IntegrityError: + # If updating fails, insert a new entry + self._db.upsert_image(db.NewImage( + filepath=path, + modified_at=meta['modified_at'], + size=meta['size'], + vector=vector.tobytes(), + hash=hash + ), commit=False) + else: + self._db.upsert_image(db.NewImage( + filepath=path, + modified_at=meta['modified_at'], + size=meta['size'], + vector=vector.tobytes(), + hash=hash + ), commit=False) + self._db.commit() def ensure_index(self, directory: str): print( From 2be481dbe0711a59cfdf7f97b0b352f713f3d5da Mon Sep 17 00:00:00 2001 From: prayas7102 Date: Mon, 21 Oct 2024 16:20:35 +0530 Subject: [PATCH 11/11] removed time calculation --- .gitignore | 2 ++ rclip/main.py | 8 -------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index d93036f5..8e8fad3b 100644 --- a/.gitignore +++ b/.gitignore @@ -152,3 +152,5 @@ requirements.txt benchmarks/datasets/ benchmarks/datasets + +*.png \ No newline at end of file diff --git a/rclip/main.py b/rclip/main.py index 8f03d38d..cbefc500 100644 --- a/rclip/main.py +++ b/rclip/main.py @@ -4,7 +4,6 @@ import sqlite3 import sys import threading -import time from typing import List, NamedTuple, Optional, Tuple, TypedDict import numpy as np @@ -137,13 +136,9 @@ def update_total_images(count: int): images_processed = 0 batch: List[str] = [] metas: List[ImageMeta] = [] - lookup_time_sum = 0 - start_time = time.time() for entry in fs.walk(directory, self._exclude_dir_regex, self.IMAGE_REGEX): filepath = entry.path - lookup_start = time.time() image = self._db.get_image(filepath=filepath) - lookup_time_sum += time.time() - lookup_start try: meta = get_image_meta(entry) except Exception as ex: @@ -166,9 +161,6 @@ def update_total_images(count: int): self._index_files(batch, metas) batch = [] metas = [] - total_time = time.time() - start_time - print(f"Total indexing time: {total_time:.2f} seconds") - print(f"Average lookup time: {lookup_time_sum/images_processed:.5f} seconds") if len(batch) != 0: self._index_files(batch, metas)