Skip to content

Commit

Permalink
Merge pull request #942 from Dobatymo/hash-cache
Browse files Browse the repository at this point in the history
Implement hash cache for md5 hash based on sqlite
  • Loading branch information
arsenetar authored Nov 24, 2021
2 parents b80489f + 7746004 commit 34f41dc
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 32 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ __pycache__
.lock-waf*
.tox
/tags
*.eggs

build
dist
Expand Down
9 changes: 9 additions & 0 deletions core/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ def __init__(self, view, portable=False):
self.app_mode = AppMode.STANDARD
self.discarded_file_count = 0
self.exclude_list = ExcludeList()
hash_cache_file = op.join(self.appdata, "hash_cache.db")
fs.filesdb.connect(hash_cache_file)
self.directories = directories.Directories(self.exclude_list)
self.results = results.Results(self)
self.ignore_list = IgnoreList()
Expand Down Expand Up @@ -293,6 +295,7 @@ def _start_job(self, jobid, func, args=()):
def _job_completed(self, jobid):
if jobid == JobType.SCAN:
self._results_changed()
fs.filesdb.commit()
if not self.results.groups:
self.view.show_message(tr("No duplicates found."))
else:
Expand Down Expand Up @@ -420,6 +423,9 @@ def clear_picture_cache(self):
except FileNotFoundError:
pass # we don't care

def clear_hash_cache(self):
    """Discard every cached file hash by clearing the shared ``fs.filesdb``."""
    fs.filesdb.clear()

def copy_or_move(self, dupe, copy: bool, destination: str, dest_type: DestType):
source_path = dupe.path
location_path = first(p for p in self.directories if dupe.path in p)
Expand Down Expand Up @@ -751,6 +757,9 @@ def save(self):
self.exclude_list.save_to_xml(p)
self.notify("save_session")

def close(self):
    """Release the hash-cache database held by the shared ``fs.filesdb``."""
    fs.filesdb.close()

def save_as(self, filename):
"""Save results in ``filename``.
Expand Down
142 changes: 118 additions & 24 deletions core/fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@
import hashlib
from math import floor
import logging
import sqlite3
from threading import Lock
from typing import Any, Optional

from hscommon.path import Path
from hscommon.util import nonone, get_file_ext

__all__ = [
Expand Down Expand Up @@ -78,6 +82,82 @@ class OperationError(FSError):
cls_message = "Operation on '{name}' failed."


class FilesDB:
    """SQLite-backed cache of file hashes.

    Each row is keyed by the file path and stores the size and modification
    time (in nanoseconds) observed when the hash was written. A lookup only
    hits when path, size and mtime all still match, so a changed file is
    treated as a cache miss.

    All database access is serialized through ``self.lock`` because the
    connection is opened with ``check_same_thread=False``.

    ``connect()`` must be called before ``get``/``put``/``clear``/``commit``;
    ``close()`` is safe to call at any time, any number of times.
    """

    create_table_query = "CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, size INTEGER, mtime_ns INTEGER, entry_dt DATETIME, md5 BLOB, md5partial BLOB)"
    drop_table_query = "DROP TABLE files;"
    select_query = "SELECT {key} FROM files WHERE path=:path AND size=:size and mtime_ns=:mtime_ns"
    insert_query = """
        INSERT INTO files (path, size, mtime_ns, entry_dt, {key}) VALUES (:path, :size, :mtime_ns, datetime('now'), :value)
        ON CONFLICT(path) DO UPDATE SET size=:size, mtime_ns=:mtime_ns, entry_dt=datetime('now'), {key}=:value;
    """

    # Column names allowed in place of ``{key}``. Column names cannot be bound
    # as SQL parameters, so they are checked against this set before being
    # formatted into the query text.
    hash_columns = frozenset(("md5", "md5partial"))

    def __init__(self):
        self.conn = None
        self.cur = None
        # Created up front (not in connect()) so that close() works on an
        # unconnected instance and reconnecting never swaps the lock out from
        # under another thread.
        self.lock = Lock()

    def _check_key(self, key):
        # type: (str) -> None
        """Raise ``ValueError`` unless ``key`` names one of the hash columns."""
        if key not in self.hash_columns:
            raise ValueError("Unknown hash column: {!r}".format(key))

    def _close_locked(self):
        # type: () -> None
        """Commit and release the current connection, if any.

        The caller must already hold ``self.lock``.
        """
        if self.conn is None:
            return
        conn, cur = self.conn, self.cur
        self.conn = None
        self.cur = None
        try:
            # sqlite3 does not commit on close; without this, hashes written
            # since the last commit() would be silently discarded.
            conn.commit()
        except sqlite3.Error as e:
            # Losing cache entries is not worth aborting a shutdown for.
            logging.warning("Couldn't commit hash cache before closing: %s", e)
        finally:
            if cur is not None:
                cur.close()
            conn.close()

    def connect(self, path):
        # type: (str, ) -> None
        """Open (creating if needed) the cache database at ``path``.

        Any previously opened connection is committed and closed first.
        """
        with self.lock:
            self._close_locked()
            self.conn = sqlite3.connect(path, check_same_thread=False)
            self.cur = self.conn.cursor()
            self.cur.execute(self.create_table_query)

    def clear(self):
        # type: () -> None
        """Remove every cached entry by dropping and recreating the table."""
        with self.lock:
            self.cur.execute(self.drop_table_query)
            self.cur.execute(self.create_table_query)
            # Commit right away: if a transaction was already open, the drop
            # would otherwise be rolled back when the connection is closed.
            self.conn.commit()

    def get(self, path, key):
        # type: (Path, str) -> Optional[bytes]
        """Return the cached ``key`` hash for ``path``, or ``None`` on a miss.

        ``path`` is stat'ed; the entry only matches if the stored size and
        mtime equal the current ones.

        Raises ``ValueError`` if ``key`` is not a known hash column, and
        whatever ``path.stat()`` raises if the file cannot be stat'ed.
        """
        self._check_key(key)
        stat = path.stat()
        params = {"path": str(path), "size": stat.st_size, "mtime_ns": stat.st_mtime_ns}

        with self.lock:
            self.cur.execute(self.select_query.format(key=key), params)
            row = self.cur.fetchone()

        return row[0] if row else None

    def put(self, path, key, value):
        # type: (Path, str, Any) -> None
        """Store ``value`` as the ``key`` hash for ``path``.

        The file's current size and mtime are recorded alongside it. An
        existing row for the same path is updated in place. The write is not
        committed until ``commit()``, ``clear()`` or ``close()`` is called.

        Raises ``ValueError`` if ``key`` is not a known hash column.
        """
        self._check_key(key)
        stat = path.stat()
        params = {
            "path": str(path),
            "size": stat.st_size,
            "mtime_ns": stat.st_mtime_ns,
            "value": value,
        }

        with self.lock:
            self.cur.execute(self.insert_query.format(key=key), params)

    def commit(self):
        # type: () -> None
        """Commit all pending writes to the database file."""
        with self.lock:
            self.conn.commit()

    def close(self):
        # type: () -> None
        """Commit pending writes and close the connection.

        Does nothing if the database is not connected, so it can be called
        repeatedly or on an instance that was never connected.
        """
        with self.lock:
            self._close_locked()

filesdb = FilesDB()  # Singleton: module-wide shared hash cache; connect() must be called before it is used


class File:
"""Represents a file and holds metadata to be used for scanning."""

Expand Down Expand Up @@ -107,10 +187,32 @@ def __getattribute__(self, attrname):
result = self.INITIAL_INFO[attrname]
return result

# This offset is where we should start reading the file to get a partial md5
# For audio file, it should be where audio data starts
def _get_md5partial_offset_and_size(self):
return (0x4000, 0x4000) # 16Kb
def _calc_md5(self):
# type: () -> bytes

with self.path.open("rb") as fp:
md5 = hashlib.md5()
# The goal here is to not run out of memory on really big files. However, the chunk
# size has to be large enough so that the python loop isn't too costly in terms of
# CPU.
CHUNK_SIZE = 1024 * 1024 # 1 mb
filedata = fp.read(CHUNK_SIZE)
while filedata:
md5.update(filedata)
filedata = fp.read(CHUNK_SIZE)
return md5.digest()

def _calc_md5partial(self):
# type: () -> bytes

# This offset is where we should start reading the file to get a partial md5
# For audio file, it should be where audio data starts
offset, size = (0x4000, 0x4000)

with self.path.open("rb") as fp:
fp.seek(offset)
partialdata = fp.read(size)
return hashlib.md5(partialdata).digest()

def _read_info(self, field):
# print(f"_read_info({field}) for {self}")
Expand All @@ -120,28 +222,20 @@ def _read_info(self, field):
self.mtime = nonone(stats.st_mtime, 0)
elif field == "md5partial":
try:
with self.path.open("rb") as fp:
offset, size = self._get_md5partial_offset_and_size()
fp.seek(offset)
partialdata = fp.read(size)
md5 = hashlib.md5(partialdata)
self.md5partial = md5.digest()
except Exception:
pass
self.md5partial = filesdb.get(self.path, "md5partial")
if self.md5partial is None:
self.md5partial = self._calc_md5partial()
filesdb.put(self.path, "md5partial", self.md5partial)
except Exception as e:
logging.warning("Couldn't get md5partial for %s: %s", self.path, e)
elif field == "md5":
try:
with self.path.open("rb") as fp:
md5 = hashlib.md5()
filedata = fp.read(CHUNK_SIZE)
while filedata:
md5.update(filedata)
filedata = fp.read(CHUNK_SIZE)
# FIXME For python 3.8 and later
# while filedata := fp.read(CHUNK_SIZE):
# md5.update(filedata)
self.md5 = md5.digest()
except Exception:
pass
self.md5 = filesdb.get(self.path, "md5")
if self.md5 is None:
self.md5 = self._calc_md5()
filesdb.put(self.path, "md5", self.md5)
except Exception as e:
logging.warning("Couldn't get md5 for %s: %s", self.path, e)
elif field == "md5samples":
try:
with self.path.open("rb") as fp:
Expand Down
16 changes: 9 additions & 7 deletions qt/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,11 @@ def _setupActions(self):
self.showDirectoriesWindow,
),
(
"actionClearPictureCache",
"actionClearCache",
"Ctrl+Shift+P",
"",
tr("Clear Picture Cache"),
self.clearPictureCacheTriggered,
tr("Clear Cache"),
self.clearCacheTriggered,
),
(
"actionExcludeList",
Expand Down Expand Up @@ -258,6 +258,7 @@ def shutdown(self):
self.willSavePrefs.emit()
self.prefs.save()
self.model.save()
self.model.close()
# Workaround for #857, hide() or close().
if self.details_dialog is not None:
self.details_dialog.close()
Expand Down Expand Up @@ -288,13 +289,14 @@ def finishedLaunching(self):
self.model.load_from(results)
self.recentResults.insertItem(results)

def clearPictureCacheTriggered(self):
title = tr("Clear Picture Cache")
msg = tr("Do you really want to remove all your cached picture analysis?")
def clearCacheTriggered(self):
title = tr("Clear Cache")
msg = tr("Do you really want to clear the cache? This will remove all cached file hashes and picture analysis.")
if self.confirm(title, msg, QMessageBox.No):
self.model.clear_picture_cache()
self.model.clear_hash_cache()
active = QApplication.activeWindow()
QMessageBox.information(active, title, tr("Picture cache cleared."))
QMessageBox.information(active, title, tr("Cache cleared."))

def ignoreListTriggered(self):
if self.use_tabs:
Expand Down
2 changes: 1 addition & 1 deletion qt/directories_dialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def _setupMenu(self):
self.menuFile.addAction(self.actionLoadResults)
self.menuFile.addAction(self.menuLoadRecent.menuAction())
self.menuFile.addSeparator()
self.menuFile.addAction(self.app.actionClearPictureCache)
self.menuFile.addAction(self.app.actionClearCache)
self.menuFile.addSeparator()
self.menuFile.addAction(self.actionLoadDirectories)
self.menuFile.addAction(self.actionSaveDirectories)
Expand Down

0 comments on commit 34f41dc

Please sign in to comment.