diff --git a/.gitignore b/.gitignore index d93036f5..8e8fad3b 100644 --- a/.gitignore +++ b/.gitignore @@ -152,3 +152,5 @@ requirements.txt benchmarks/datasets/ benchmarks/datasets + +*.png \ No newline at end of file diff --git a/rclip/db.py b/rclip/db.py index 35ae56f3..c24ae8f1 100644 --- a/rclip/db.py +++ b/rclip/db.py @@ -13,6 +13,7 @@ class NewImage(ImageOmittable): modified_at: float size: int vector: bytes + hash: str class Image(NewImage): @@ -20,14 +21,14 @@ class Image(NewImage): class DB: - VERSION = 2 + VERSION = 3 def __init__(self, filename: Union[str, pathlib.Path]): self._con = sqlite3.connect(filename) self._con.row_factory = sqlite3.Row - self.ensure_tables() self.ensure_version() - + self.ensure_tables() + def close(self): self._con.commit() self._con.close() @@ -40,12 +41,19 @@ def ensure_tables(self): filepath TEXT NOT NULL UNIQUE, modified_at DATETIME NOT NULL, size INTEGER NOT NULL, - vector BLOB NOT NULL + vector BLOB NOT NULL, + hash TEXT, + indexing BOOLEAN ) ''') # Query for images self._con.execute('CREATE UNIQUE INDEX IF NOT EXISTS existing_images ON images(filepath) WHERE deleted IS NULL') self._con.execute('CREATE TABLE IF NOT EXISTS db_version (version INTEGER)') + # Check if 'hash' column exists before creating the index + cursor = self._con.execute("PRAGMA table_info(images)") + columns = [column[1] for column in cursor.fetchall()] + if 'hash' in columns: + self._con.execute('CREATE INDEX IF NOT EXISTS image_hash_index ON images(hash)') self._con.commit() def ensure_version(self): @@ -61,6 +69,9 @@ def ensure_version(self): if db_version < 2: self._con.execute('ALTER TABLE images ADD COLUMN indexing BOOLEAN') db_version = 2 + if db_version < 3: + self._con.execute('ALTER TABLE images ADD COLUMN hash TEXT') + db_version = 3 if db_version < self.VERSION: raise Exception('migration to a newer index version isn\'t implemented') if db_version_entry: @@ -74,10 +85,10 @@ def commit(self): def upsert_image(self, image: NewImage, commit: bool = True): self._con.execute(''' - INSERT INTO images(deleted, indexing, filepath, modified_at, size, vector) - VALUES (:deleted, :indexing, :filepath, :modified_at, :size, :vector) + INSERT INTO images(deleted, indexing, filepath, modified_at, size, vector, hash) + VALUES (:deleted, :indexing, :filepath, :modified_at, :size, :vector, :hash) ON CONFLICT(filepath) DO UPDATE SET - deleted=:deleted, indexing=:indexing, modified_at=:modified_at, size=:size, vector=:vector + deleted=:deleted, indexing=:indexing, modified_at=:modified_at, size=:size, vector=:vector, hash=:hash ''', {'deleted': None, 'indexing': None, **image}) if commit: self._con.commit() @@ -113,3 +124,22 @@ def get_image_vectors_by_dir_path(self, path: str) -> sqlite3.Cursor: return self._con.execute( f'SELECT filepath, vector FROM images WHERE filepath LIKE ? AND deleted IS NULL', (path + f'{os.path.sep}%',) ) + + def get_image_by_hash(self, hash: str) -> Optional[Image]: + cur = self._con.execute('SELECT * FROM images WHERE hash = ? AND deleted IS NULL LIMIT 1', (hash,)) + return cur.fetchone() + + def update_image_filepath(self, old_filepath: str, new_filepath: str, commit: bool = True): + try: + self._con.execute('UPDATE images SET filepath = ? WHERE filepath = ?', (new_filepath, old_filepath)) + except sqlite3.IntegrityError: + # If the new filepath already exists, we need to merge the entries + existing_image = self.get_image(filepath=new_filepath) + if existing_image: + # Delete the old entry + self._con.execute('DELETE FROM images WHERE filepath = ?', (old_filepath,)) + else: + # If there's no existing image with the new filepath, re-raise the exception + raise + if commit: + self._con.commit() \ No newline at end of file diff --git a/rclip/main.py b/rclip/main.py index 53a9a328..cbefc500 100644 --- a/rclip/main.py +++ b/rclip/main.py @@ -1,14 +1,16 @@ import itertools import os import re +import sqlite3 import sys import threading -from typing import Iterable, List, NamedTuple, Optional, Tuple, TypedDict, cast +from typing import List, NamedTuple, Optional, Tuple, TypedDict import numpy as np from tqdm import tqdm import PIL from PIL import Image, ImageFile +import hashlib from rclip import db, fs, model from rclip.utils.preview import preview @@ -27,7 +29,7 @@ class ImageMeta(TypedDict): PathMetaVector = Tuple[str, ImageMeta, model.FeatureVector] -def get_image_meta(entry: os.DirEntry) -> ImageMeta: +def get_image_meta(entry: os.DirEntry[str]) -> ImageMeta: stat = entry.stat() return ImageMeta(modified_at=stat.st_mtime, size=stat.st_size) @@ -62,31 +64,54 @@ def __init__( excluded_dirs = '|'.join(re.escape(dir) for dir in exclude_dirs or self.EXCLUDE_DIRS_DEFAULT) self._exclude_dir_regex = re.compile(f'^.+\\{os.path.sep}({excluded_dirs})(\\{os.path.sep}.+)?$') + def _compute_image_hash(self, image_path: str) -> str: + with open(image_path, 'rb') as f: + return hashlib.md5(f.read()).hexdigest() + def _index_files(self, filepaths: List[str], metas: List[ImageMeta]): images: List[Image.Image] = [] filtered_paths: List[str] = [] + hashes: List[str] = [] for path in filepaths: - try: - image = Image.open(path) - images.append(image) - filtered_paths.append(path) - except PIL.UnidentifiedImageError as ex: - pass - except Exception as ex: - print(f'error loading image {path}:', ex, file=sys.stderr) + try: + image = Image.open(path) + images.append(image) + filtered_paths.append(path) + hashes.append(self._compute_image_hash(path)) + except PIL.UnidentifiedImageError as ex: + pass + except Exception as ex: + print(f'error loading image {path}:', ex, file=sys.stderr) try: - features = self._model.compute_image_features(images) + features = self._model.compute_image_features(images) except Exception as ex: - print('error computing features:', ex, file=sys.stderr) - return - for path, meta, vector in cast(Iterable[PathMetaVector], zip(filtered_paths, metas, features)): - self._db.upsert_image(db.NewImage( - filepath=path, - modified_at=meta['modified_at'], - size=meta['size'], - vector=vector.tobytes() - ), commit=False) + print('error computing features:', ex, file=sys.stderr) + return + for path, meta, vector, hash in zip(filtered_paths, metas, features, hashes): + existing_image = self._db.get_image_by_hash(hash) + if existing_image and existing_image['filepath'] != path: + # Image was renamed, update the filepath + try: + self._db.update_image_filepath(existing_image['filepath'], path, commit=False) + except sqlite3.IntegrityError: + # If updating fails, insert a new entry + self._db.upsert_image(db.NewImage( + filepath=path, + modified_at=meta['modified_at'], + size=meta['size'], + vector=vector.tobytes(), + hash=hash + ), commit=False) + else: + self._db.upsert_image(db.NewImage( + filepath=path, + modified_at=meta['modified_at'], + size=meta['size'], + vector=vector.tobytes(), + hash=hash + ), commit=False) + self._db.commit() def ensure_index(self, directory: str): print( @@ -136,7 +161,6 @@ def update_total_images(count: int): self._index_files(batch, metas) batch = [] metas = [] - if len(batch) != 0: self._index_files(batch, metas) diff --git a/tests/e2e/gen_img.py b/tests/e2e/gen_img.py new file mode 100644 index 00000000..77986c68 --- /dev/null +++ b/tests/e2e/gen_img.py @@ -0,0 +1,67 @@ +import os + +# Create output directory if it doesn't exist +import cv2 +import numpy as np +import random + +# Create output directory if it doesn't exist +output_dir = './images nested directories/generated_images' +os.makedirs(output_dir, exist_ok=True) + +# Number of images to generate +num_images = 10000 +image_size = (512, 512) # Image size + +# List of random shapes and objects to add to the image +shapes = ['circle', 'rectangle', 'line'] +words = ['Tree', 'Car', 'House', 'Sun', 'Sky'] + +def generate_image(index: int): + # Create a blank white image + img = np.ones((image_size[0], image_size[1], 3), dtype=np.uint8) * 255 + + # Draw random shapes + for _ in range(random.randint(1, 5)): # Random number of shapes + shape = random.choice(shapes) + + if shape == 'circle': + center = (random.randint(50, image_size[0]-50), random.randint(50, image_size[1]-50)) + radius = random.randint(20, 80) + color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) + thickness = random.randint(1, 5) + cv2.circle(img, center, radius, color, thickness) + + elif shape == 'rectangle': + pt1 = (random.randint(0, image_size[0]//2), random.randint(0, image_size[1]//2)) + pt2 = (random.randint(image_size[0]//2, image_size[0]), random.randint(image_size[1]//2, image_size[1])) + color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) + thickness = random.randint(1, 5) + cv2.rectangle(img, pt1, pt2, color, thickness) + + elif shape == 'line': + pt1 = (random.randint(0, image_size[0]), random.randint(0, image_size[1])) + pt2 = (random.randint(0, image_size[0]), random.randint(0, image_size[1])) + color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) + thickness = random.randint(1, 5) + cv2.line(img, pt1, pt2, color, thickness) + + # Add random text (object names) + for _ in range(random.randint(1, 3)): # Random number of text objects + text = random.choice(words) + font = cv2.FONT_HERSHEY_SIMPLEX + position = (random.randint(50, image_size[0]-150), random.randint(50, image_size[1]-50)) + font_scale = random.uniform(0.5, 1.5) + color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) + thickness = random.randint(1, 3) + cv2.putText(img, text, position, font, font_scale, color, thickness, cv2.LINE_AA) + + # Save the image + cv2.imwrite(os.path.join(output_dir, f'image_{index}.png'), img) + +# Generate the images +for i in range(num_images): + generate_image(i) + +print(f'{num_images} meaningful images generated successfully in {output_dir} directory.') +