Skip to content

Commit

Permalink
Merge pull request #14 from miketvo/release-2023-06-07
Browse files Browse the repository at this point in the history
Release 2023-06-07: v0.1.2-beta
  • Loading branch information
miketvo authored Jun 7, 2023
2 parents 5ce9e99 + bf8ac6b commit bbebfb4
Show file tree
Hide file tree
Showing 8 changed files with 264 additions and 165 deletions.
40 changes: 9 additions & 31 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,42 +10,20 @@ See [Releases](https://github.com/miketvo/imdupes-prototype/releases/) for lates
## Syntax

```text
usage: imdupes {detect,clean} [OPTIONS] DIRECTORY
usage: imdupes {scan,clean} ...
Quickly detects and removes identical images. Has two modes:
- 'detect' console prints the detected identical image paths/filenames
- 'clean' removes the detected identical images, keeping only the first copy
Warning: Deleted files are not recoverable, proceed with caution
positional arguments:
{detect,clean} run mode
directory target image directory
- 'scan' scans and console prints detected identical image paths/filenames
- 'clean' scans and removes detected identical images (keeping only the first copy by default)
See "imdupes {scan,clean} --help" for more information
options:
-h, --help show this help message and exit
-s HASH_SIZE, --hash-size HASH_SIZE
specify a preferred hash size (integer) (default: 512)*
-e REGEX, --exclude REGEX
exclude matched filenames based on REGEX pattern
-r, --recursive recursively search for images in subdirectories in addition to the specified parent directory
-V, --verbose explain what is being done
-f {absolute,prog-relative,dir-relative,filename}, --format {absolute,prog-relative,dir-relative,filename}
console output file path format, always applied to detect mode and clean mode only when verbose is enabled (default: dir-relative)
-v, --version show version information and exit
detect mode options:
-o OUTPUT, --output OUTPUT
save the console output to the specified OUTPUT file (overwriting if file already exists)
clean mode options:
-i, --interactive prompt before every file deletion and let the user choose which file to delete
Note: This program ignores any non-image file in the target directory
-h, --help show this help message and exit
-v, --version show version information and exit
*: Smaller hash sizes are better for detecting visually similar images, while larger hash sizes are
better for identifying identical images; The smaller the hash size, the better the performance
Smallest accepted hash size is 8
run modes:
{scan,clean}
Note: This program ignores any non-image file in the target directory
Algorithm: Average Hash (https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)
```
32 changes: 22 additions & 10 deletions src/imdupes/_version.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,27 @@
__version__ = '0.1.1-beta'
__version__ = '0.1.2-beta'
__app_name__ = 'imdupes'

__prog_usage__ = f'{__app_name__} {{scan,clean}} ...'
__prog_desc__ = \
'Quickly detects and removes identical images. Has two modes:\n' \
"\t- 'detect' console prints the detected identical image paths/filenames\n" \
"\t- 'clean' removes the detected identical images, keeping only the first copy\n" \
'Warning: Deleted files are not recoverable, proceed with caution'
"\t- 'scan' scans and console prints detected identical image paths/filenames\n" \
"\t- 'clean' scans and removes detected identical images (keeping only the first copy by default)\n" \
f'See "{__app_name__} {{scan,clean}} --help" for more information'
__prog_epilog__ = \
'Note: This program ignores any non-image file in the target directory\n\n' \
'*: Smaller hash sizes are better for detecting visually similar images, while larger hash sizes are\n' \
' better for identifying identical images; The smaller the hash size, the better the performance\n' \
'\n' \
' Smallest accepted hash size is 8\n' \
'\n' \
'Note: This program ignores any non-image file in the target directory\n' \
'Algorithm: Average Hash (https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)'

__scan_usage__ = f'{__app_name__} scan [options] directory [-o OUTPUT]'
__scan_desc__ = \
'scan and console print detected identical image paths/filenames'
__scan_epilog__ = \
'Note: This program ignores any non-image file in the target directory\n' \
'*: Smaller hash sizes are better for detecting visually similar images, while larger hash sizes are better for\n' \
'    identifying identical images; The smaller the hash size, the better the performance; Smallest accepted hash ' \
'size\n is 8' \

__clean_usage__ = f'{__app_name__} clean [options] input'
__clean_desc__ = \
'scan and remove detected identical images (keeping only the first copy by default); deleted files are not\n' \
'recoverable, proceed with caution'
__clean_epilog__ = __scan_epilog__
54 changes: 28 additions & 26 deletions src/imdupes/detect.py → src/imdupes/detect_dup_images.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import sys
import warnings
import imagehash
from PIL import Image
Expand All @@ -13,50 +14,63 @@
warnings.simplefilter('ignore', Image.DecompressionBombWarning)


def detect(
def detect_dup_images(
img_paths: list[str],
hash_size: int = DEFAULT_HASH_SIZE,
root_dir: str = None,
console_output: bool = True,
output_path_format: PathFormat = PathFormat.DIR_RELATIVE,
verbose: bool = False
verbose: int = 0
) -> dict[str, list[ImageFileWrapper]]:
hashed_images: dict[str, list[ImageFileWrapper]] = {}

# Image hashing
pbar = None
if verbose:
pbar = tqdm(total=len(img_paths), desc='Scanning for identical images', position=0, leave=False)
if verbose > 0:
pbar = tqdm(total=len(img_paths), desc='Scanning for identical images', file=sys.stdout, leave=False)
for img_path in img_paths:
if pbar is not None:
pbar.update()

if verbose > 1:
pbar.write(f'Scanning "{format_path(img_path, output_path_format, root_dir)}"')

im = None
try:
im = Image.open(img_path)
im.verify()
im = Image.open(img_path)

if im.format == 'PNG' and im.mode != 'RGBA':
im = im.convert('RGBA')

except (ValueError, TypeError, Image.DecompressionBombError, OSError, EOFError) as error:
image_hash = imagehash.average_hash(im, hash_size=hash_size).__str__()
if image_hash in hashed_images:
hashed_images[image_hash].append(ImageFileWrapper(im, img_path))
else:
hashed_images[image_hash] = [ImageFileWrapper(im, img_path)]

im.close()

except (ValueError, TypeError, Image.DecompressionBombError, OSError, EOFError, MemoryError) as error:
if pbar is not None:
pbar.write(
f"Error reading '{format_path(img_path, output_path_format, root_dir)}': "
f"Error hashing '{format_path(img_path, output_path_format, root_dir)}': "
f'{error.__str__()}. '
f'File skipped.'
)
else:
print(
f"Error reading '{format_path(img_path, output_path_format, root_dir)}': "
f"Error hashing '{format_path(img_path, output_path_format, root_dir)}': "
f'{error.__str__()}. '
f'File skipped.',
flush=True
)
if im is not None:
im.close()
continue

image_hash = imagehash.average_hash(im, hash_size=hash_size).__str__()
if image_hash in hashed_images:
hashed_images[image_hash].append(ImageFileWrapper(im, img_path))
else:
hashed_images[image_hash] = [ImageFileWrapper(im, img_path)]
if pbar is not None:
pbar.close()

# Remove hashes with a single path
hashed_dups: dict[str, list[ImageFileWrapper]] = {
Expand All @@ -73,25 +87,13 @@ def detect(
reverse=True
)

# Output
if verbose:
if verbose > 0:
print(
f'Scanning for identical images... '
f'Found {colored(str(len(hashed_dups.values())), attrs=["bold"])} duplication(s) '
f'across {colored(str(sum(len(lst) for lst in hashed_dups.values())), attrs=["bold"])} file(s) '
f'{colored("[DONE]", color="green", attrs=["bold"])}',
end='',
flush=True
)

if console_output:
if verbose:
print(':')
for dup_imgs in hashed_dups.values():
print()
for dup_img in dup_imgs:
print(format_path(dup_img.path, output_path_format, root_dir))
else:
print()

return hashed_dups
Loading

0 comments on commit bbebfb4

Please sign in to comment.