Skip to content

Commit

Permalink
Merge pull request #14 from miketvo/release-2023-06-07
Browse files Browse the repository at this point in the history
Release 2023-06-07: v0.1.2-beta
  • Loading branch information
miketvo authored Jun 7, 2023
2 parents 5ce9e99 + bf8ac6b commit bbebfb4
Show file tree
Hide file tree
Showing 8 changed files with 264 additions and 165 deletions.
40 changes: 9 additions & 31 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,42 +10,20 @@ See [Releases](https://github.com/miketvo/imdupes-prototype/releases/) for lates
## Syntax

```text
usage: imdupes {detect,clean} [OPTIONS] DIRECTORY
usage: imdupes {scan,clean} ...
Quickly detects and removes identical images. Has two modes:
- 'detect' console prints the detected identical image paths/filenames
- 'clean' removes the detected identical images, keeping only the first copy
Warning: Deleted files are not recoverable, proceed with caution
positional arguments:
{detect,clean} run mode
directory target image directory
- 'scan' scans and console prints detected identical image paths/filenames
- 'clean' scans and removes detected identical images (keeping only the first copy by default)
See "imdupes {scan,clean} --help" for more information
options:
-h, --help show this help message and exit
-s HASH_SIZE, --hash-size HASH_SIZE
specify a preferred hash size (integer) (default: 512)*
-e REGEX, --exclude REGEX
exclude matched filenames based on REGEX pattern
-r, --recursive recursively search for images in subdirectories in addition to the specified parent directory
-V, --verbose explain what is being done
-f {absolute,prog-relative,dir-relative,filename}, --format {absolute,prog-relative,dir-relative,filename}
console output file path format, always applied to detect mode and clean mode only when verbose is enabled (default: dir-relative)
-v, --version show version information and exit
detect mode options:
-o OUTPUT, --output OUTPUT
save the console output to the specified OUTPUT file (overwriting if file already exists)
clean mode options:
-i, --interactive prompt before every file deletion and let the user choose which file to delete
Note: This program ignores any non-image file in the target directory
-h, --help show this help message and exit
-v, --version show version information and exit
*: Smaller hash sizes are better for detecting visually similar images, while larger hash sizes are
better for identifying identical images; The smaller the hash size, the better the performance
Smallest accepted hash size is 8
run modes:
{scan,clean}
Note: This program ignores any non-image file in the target directory
Algorithm: Average Hash (https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)
```
32 changes: 22 additions & 10 deletions src/imdupes/_version.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,27 @@
__version__ = '0.1.1-beta'
__version__ = '0.1.2-beta'
__app_name__ = 'imdupes'

__prog_usage__ = f'{__app_name__} {{scan,clean}} ...'
__prog_desc__ = \
'Quickly detects and removes identical images. Has two modes:\n' \
"\t- 'detect' console prints the detected identical image paths/filenames\n" \
"\t- 'clean' removes the detected identical images, keeping only the first copy\n" \
'Warning: Deleted files are not recoverable, proceed with caution'
"\t- 'scan' scans and console prints detected identical image paths/filenames\n" \
"\t- 'clean' scans and removes detected identical images (keeping only the first copy by default)\n" \
f'See "{__app_name__} {{scan,clean}} --help" for more information'
__prog_epilog__ = \
'Note: This program ignores any non-image file in the target directory\n\n' \
'*: Smaller hash sizes are better for detecting visually similar images, while larger hash sizes are\n' \
' better for identifying identical images; The smaller the hash size, the better the performance\n' \
'\n' \
' Smallest accepted hash size is 8\n' \
'\n' \
'Note: This program ignores any non-image file in the target directory\n' \
'Algorithm: Average Hash (https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)'

__scan_usage__ = f'{__app_name__} scan [options] directory [-o OUTPUT]'
__scan_desc__ = \
'scan and console print detected identical image paths/filenames'
__scan_epilog__ = \
'Note: This program ignores any non-image file in the target directory\n' \
'*: Smaller hash sizes are better for detecting visually similar images, while larger hash sizes are better for\n' \
'    identifying identical images; The smaller the hash size, the better the performance; Smallest accepted hash ' \
'size\n is 8' \

__clean_usage__ = f'{__app_name__} clean [options] input'
__clean_desc__ = \
'scan and remove detected identical images (keeping only the first copy by default); deleted files are not\n' \
'recoverable, proceed with caution'
__clean_epilog__ = __scan_epilog__
54 changes: 28 additions & 26 deletions src/imdupes/detect.py → src/imdupes/detect_dup_images.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import sys
import warnings
import imagehash
from PIL import Image
Expand All @@ -13,50 +14,63 @@
warnings.simplefilter('ignore', Image.DecompressionBombWarning)


def detect(
def detect_dup_images(
img_paths: list[str],
hash_size: int = DEFAULT_HASH_SIZE,
root_dir: str = None,
console_output: bool = True,
output_path_format: PathFormat = PathFormat.DIR_RELATIVE,
verbose: bool = False
verbose: int = 0
) -> dict[str, list[ImageFileWrapper]]:
hashed_images: dict[str, list[ImageFileWrapper]] = {}

# Image hashing
pbar = None
if verbose:
pbar = tqdm(total=len(img_paths), desc='Scanning for identical images', position=0, leave=False)
if verbose > 0:
pbar = tqdm(total=len(img_paths), desc='Scanning for identical images', file=sys.stdout, leave=False)
for img_path in img_paths:
if pbar is not None:
pbar.update()

if verbose > 1:
pbar.write(f'Scanning "{format_path(img_path, output_path_format, root_dir)}"')

im = None
try:
im = Image.open(img_path)
im.verify()
im = Image.open(img_path)

if im.format == 'PNG' and im.mode != 'RGBA':
im = im.convert('RGBA')

except (ValueError, TypeError, Image.DecompressionBombError, OSError, EOFError) as error:
image_hash = imagehash.average_hash(im, hash_size=hash_size).__str__()
if image_hash in hashed_images:
hashed_images[image_hash].append(ImageFileWrapper(im, img_path))
else:
hashed_images[image_hash] = [ImageFileWrapper(im, img_path)]

im.close()

except (ValueError, TypeError, Image.DecompressionBombError, OSError, EOFError, MemoryError) as error:
if pbar is not None:
pbar.write(
f"Error reading '{format_path(img_path, output_path_format, root_dir)}': "
f"Error hashing '{format_path(img_path, output_path_format, root_dir)}': "
f'{error.__str__()}. '
f'File skipped.'
)
else:
print(
f"Error reading '{format_path(img_path, output_path_format, root_dir)}': "
f"Error hashing '{format_path(img_path, output_path_format, root_dir)}': "
f'{error.__str__()}. '
f'File skipped.',
flush=True
)
if im is not None:
im.close()
continue

image_hash = imagehash.average_hash(im, hash_size=hash_size).__str__()
if image_hash in hashed_images:
hashed_images[image_hash].append(ImageFileWrapper(im, img_path))
else:
hashed_images[image_hash] = [ImageFileWrapper(im, img_path)]
if pbar is not None:
pbar.close()

# Remove hashes with a single path
hashed_dups: dict[str, list[ImageFileWrapper]] = {
Expand All @@ -73,25 +87,13 @@ def detect(
reverse=True
)

# Output
if verbose:
if verbose > 0:
print(
f'Scanning for identical images... '
f'Found {colored(str(len(hashed_dups.values())), attrs=["bold"])} duplication(s) '
f'across {colored(str(sum(len(lst) for lst in hashed_dups.values())), attrs=["bold"])} file(s) '
f'{colored("[DONE]", color="green", attrs=["bold"])}',
end='',
flush=True
)

if console_output:
if verbose:
print(':')
for dup_imgs in hashed_dups.values():
print()
for dup_img in dup_imgs:
print(format_path(dup_img.path, output_path_format, root_dir))
else:
print()

return hashed_dups
Loading

0 comments on commit bbebfb4

Please sign in to comment.