diff --git a/.gitignore b/.gitignore index ff5fc581..2e5f5d6c 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ __pycache__ .lock-waf* .tox /tags +*.eggs build dist diff --git a/core/app.py b/core/app.py index f2066a75..774e1dcc 100644 --- a/core/app.py +++ b/core/app.py @@ -138,6 +138,8 @@ def __init__(self, view, portable=False): self.app_mode = AppMode.STANDARD self.discarded_file_count = 0 self.exclude_list = ExcludeList() + hash_cache_file = op.join(self.appdata, "hash_cache.db") + fs.filesdb.connect(hash_cache_file) self.directories = directories.Directories(self.exclude_list) self.results = results.Results(self) self.ignore_list = IgnoreList() @@ -293,6 +295,7 @@ def _start_job(self, jobid, func, args=()): def _job_completed(self, jobid): if jobid == JobType.SCAN: self._results_changed() + fs.filesdb.commit() if not self.results.groups: self.view.show_message(tr("No duplicates found.")) else: @@ -420,6 +423,9 @@ def clear_picture_cache(self): except FileNotFoundError: pass # we don't care + def clear_hash_cache(self): + fs.filesdb.clear() + def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType): source_path = dupe.path location_path = first(p for p in self.directories if dupe.path in p) @@ -751,6 +757,9 @@ def save(self): self.exclude_list.save_to_xml(p) self.notify("save_session") + def close(self): + fs.filesdb.close() + def save_as(self, filename): """Save results in ``filename``. diff --git a/core/fs.py b/core/fs.py index a7978c0f..9a078818 100644 --- a/core/fs.py +++ b/core/fs.py @@ -14,7 +14,11 @@ import hashlib from math import floor import logging +import sqlite3 +from threading import Lock +from typing import Any +from hscommon.path import Path from hscommon.util import nonone, get_file_ext __all__ = [ @@ -78,6 +82,82 @@ class OperationError(FSError): cls_message = "Operation on '{name}' failed." +class FilesDB: + + create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)" + drop_table_query = "DROP TABLE files;" + select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns" + insert_query = """ + INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) VALUES (:path, :size, :mtime_ns, datetime('now'), :value) + ON CONFLICT(path) DO UPDATE SET size=:size, mtime_ns=:mtime_ns, entry_dt=datetime('now'), {key}=:value; + """ + + def __init__(self): + self.conn = None + self.cur = None + self.lock = None + + def connect(self, path): + # type: (str, ) -> None + + self.conn = sqlite3.connect(path, check_same_thread=False) + self.cur = self.conn.cursor() + self.cur.execute(self.create_table_query) + self.lock = Lock() + + def clear(self): + # type: () -> None + + with self.lock: + self.cur.execute(self.drop_table_query) + self.cur.execute(self.create_table_query) + + def get(self, path, key): + # type: (Path, str) -> bytes + + stat = path.stat() + size = stat.st_size + mtime_ns = stat.st_mtime_ns + + with self.lock: + self.cur.execute(self.select_query.format(key=key), {"path": str(path), "size": size, "mtime_ns": mtime_ns}) + result = self.cur.fetchone() + + if result: + return result[0] + + return None + + def put(self, path, key, value): + # type: (Path, str, Any) -> None + + stat = path.stat() + size = stat.st_size + mtime_ns = stat.st_mtime_ns + + with self.lock: + self.cur.execute( + self.insert_query.format(key=key), + {"path": str(path), "size": size, "mtime_ns": mtime_ns, "value": value}, + ) + + def commit(self): + # type: () -> None + + with self.lock: + self.conn.commit() + + def close(self): + # type: () -> None + + with self.lock: + self.cur.close() + self.conn.close() + + +filesdb = FilesDB() # Singleton + + class File: """Represents a file and holds metadata to be used for scanning.""" @@ -107,10 +187,32 @@ def __getattribute__(self, attrname): result = self.INITIAL_INFO[attrname] return result - # This offset is where we should start reading the file to get a partial md5 - # For audio file, it should be where audio data starts - def _get_md5partial_offset_and_size(self): - return (0x4000, 0x4000) # 16Kb + def _calc_md5(self): + # type: () -> bytes + + with self.path.open("rb") as fp: + md5 = hashlib.md5() + # The goal here is to not run out of memory on really big files. However, the chunk + # size has to be large enough so that the python loop isn't too costly in terms of + # CPU. + CHUNK_SIZE = 1024 * 1024 # 1 mb + filedata = fp.read(CHUNK_SIZE) + while filedata: + md5.update(filedata) + filedata = fp.read(CHUNK_SIZE) + return md5.digest() + + def _calc_md5partial(self): + # type: () -> bytes + + # This offset is where we should start reading the file to get a partial md5 + # For audio file, it should be where audio data starts + offset, size = (0x4000, 0x4000) + + with self.path.open("rb") as fp: + fp.seek(offset) + partialdata = fp.read(size) + return hashlib.md5(partialdata).digest() def _read_info(self, field): # print(f"_read_info({field}) for {self}") @@ -120,28 +222,20 @@ def _read_info(self, field): self.mtime = nonone(stats.st_mtime, 0) elif field == "md5partial": try: - with self.path.open("rb") as fp: - offset, size = self._get_md5partial_offset_and_size() - fp.seek(offset) - partialdata = fp.read(size) - md5 = hashlib.md5(partialdata) - self.md5partial = md5.digest() - except Exception: - pass + self.md5partial = filesdb.get(self.path, "md5partial") + if self.md5partial is None: + self.md5partial = self._calc_md5partial() + filesdb.put(self.path, "md5partial", self.md5partial) + except Exception as e: + logging.warning("Couldn't get md5partial for %s: %s", self.path, e) elif field == "md5": try: - with self.path.open("rb") as fp: - md5 = hashlib.md5() - filedata = fp.read(CHUNK_SIZE) - while filedata: - md5.update(filedata) - filedata = fp.read(CHUNK_SIZE) - # FIXME For python 3.8 and later - # while filedata := fp.read(CHUNK_SIZE): - # md5.update(filedata) - self.md5 = md5.digest() - except Exception: - pass + self.md5 = filesdb.get(self.path, "md5") + if self.md5 is None: + self.md5 = self._calc_md5() + filesdb.put(self.path, "md5", self.md5) + except Exception as e: + logging.warning("Couldn't get md5 for %s: %s", self.path, e) elif field == "md5samples": try: with self.path.open("rb") as fp: diff --git a/qt/app.py b/qt/app.py index 1626a974..06d7eeef 100644 --- a/qt/app.py +++ b/qt/app.py @@ -129,11 +129,11 @@ def _setupActions(self): self.showDirectoriesWindow, ), ( - "actionClearPictureCache", + "actionClearCache", "Ctrl+Shift+P", "", - tr("Clear Picture Cache"), - self.clearPictureCacheTriggered, + tr("Clear Cache"), + self.clearCacheTriggered, ), ( "actionExcludeList", @@ -258,6 +258,7 @@ def shutdown(self): self.willSavePrefs.emit() self.prefs.save() self.model.save() + self.model.close() # Workaround for #857, hide() or close(). if self.details_dialog is not None: self.details_dialog.close() @@ -288,13 +289,14 @@ def finishedLaunching(self): self.model.load_from(results) self.recentResults.insertItem(results) - def clearPictureCacheTriggered(self): - title = tr("Clear Picture Cache") - msg = tr("Do you really want to remove all your cached picture analysis?") + def clearCacheTriggered(self): + title = tr("Clear Cache") + msg = tr("Do you really want to clear the cache? This will remove all cached file hashes and picture analysis.") if self.confirm(title, msg, QMessageBox.No): self.model.clear_picture_cache() + self.model.clear_hash_cache() active = QApplication.activeWindow() - QMessageBox.information(active, title, tr("Picture cache cleared.")) + QMessageBox.information(active, title, tr("Cache cleared.")) def ignoreListTriggered(self): if self.use_tabs: diff --git a/qt/directories_dialog.py b/qt/directories_dialog.py index 07b9a276..56a938ba 100644 --- a/qt/directories_dialog.py +++ b/qt/directories_dialog.py @@ -126,7 +126,7 @@ def _setupMenu(self): self.menuFile.addAction(self.actionLoadResults) self.menuFile.addAction(self.menuLoadRecent.menuAction()) self.menuFile.addSeparator() - self.menuFile.addAction(self.app.actionClearPictureCache) + self.menuFile.addAction(self.app.actionClearCache) self.menuFile.addSeparator() self.menuFile.addAction(self.actionLoadDirectories) self.menuFile.addAction(self.actionSaveDirectories)