Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

File cluster proof of concept #162

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
44b5098
Implement files data-access object
stepan-anokhin Oct 16, 2020
a527fe1
Migrate to files-dao
stepan-anokhin Oct 16, 2020
51ee027
Update server tests
stepan-anokhin Oct 16, 2020
59a8eee
Fix regression
stepan-anokhin Oct 16, 2020
b664957
Make module naming more idiomatic
stepan-anokhin Oct 16, 2020
673bcec
Resolve linting issues
stepan-anokhin Oct 16, 2020
3dcc780
Support expunging by session scope
stepan-anokhin Oct 20, 2020
8ef8a47
Draft matches extraction
stepan-anokhin Oct 20, 2020
cf817ad
Filter matches by distance
stepan-anokhin Oct 20, 2020
3f6c8cc
Test query matches with cycles
stepan-anokhin Oct 20, 2020
f17623b
Test match loading
stepan-anokhin Oct 20, 2020
9a44755
Emulate matches pagination
stepan-anokhin Oct 21, 2020
a8e2e36
Update MatchesDAO tests
stepan-anokhin Oct 21, 2020
6ce8572
Migrate server.api to MatchesDAO
stepan-anokhin Oct 21, 2020
4c3a7dc
Update api/matches tests
stepan-anokhin Oct 21, 2020
ba56b8c
Test multiple hops with cycles
stepan-anokhin Oct 21, 2020
9a33710
Update file matches page
stepan-anokhin Oct 21, 2020
682f353
Update cluster page to consume new matches format
stepan-anokhin Oct 21, 2020
1eda409
Improve graph container responsiveness
stepan-anokhin Oct 22, 2020
c7fc8a9
Fix linting issues
stepan-anokhin Oct 22, 2020
d32e51c
Improve graph responsiveness
stepan-anokhin Oct 22, 2020
9a4dd5e
Implement generic loading trigger
stepan-anokhin Oct 23, 2020
7f1ec4b
Support matches filtering
stepan-anokhin Oct 23, 2020
4a3c5c0
Implement dynamic matches loading
stepan-anokhin Oct 23, 2020
bc8d97d
Fix loading trigger message
stepan-anokhin Oct 23, 2020
4e3a8f8
Fix match reducers
stepan-anokhin Oct 23, 2020
caa7361
Improve dynamic match loading
stepan-anokhin Oct 23, 2020
5ffbc5a
Implement neighbor loading
stepan-anokhin Oct 23, 2020
ec14b86
Adjust graph style
stepan-anokhin Oct 23, 2020
d63d90f
Enable zooming
stepan-anokhin Oct 23, 2020
20b6952
Enable cluster navigation
stepan-anokhin Oct 23, 2020
1fec2c5
Fix tooltips
stepan-anokhin Oct 23, 2020
a69700f
Make edge opacity dynamic
stepan-anokhin Oct 23, 2020
d03df19
Make color scheme static
stepan-anokhin Oct 23, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions db/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,16 @@ def drop_tables(self):
self.base.metadata.drop_all(bind=self.engine)

@contextmanager
def session_scope(self):
def session_scope(self, expunge=False):
"""Provide a transactional scope."""
session = self.session()
try:
yield session
if expunge:
session.flush()
session.expunge_all()
session.commit()
except:
except Exception:
session.rollback()
raise
finally:
Expand Down
Empty file added db/access/__init__.py
Empty file.
243 changes: 243 additions & 0 deletions db/access/files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional

from sqlalchemy import or_, func, literal_column
from sqlalchemy.orm import aliased

from db.schema import Files, Matches, VideoMetadata, Exif


class FileMatchFilter:
    """String constants naming the file match filtering criteria."""

    ALL = "all"
    RELATED = "related"
    DUPLICATES = "duplicates"
    UNIQUE = "unique"

    # Set containing every constant declared above.
    values = {
        ALL,
        RELATED,
        DUPLICATES,
        UNIQUE,
    }


class FileSort:
    """String constants naming the supported result orderings."""

    DATE = "date"
    LENGTH = "length"
    RELATED = "related"
    DUPLICATES = "duplicates"

    # Set containing every constant declared above.
    values = {
        DATE,
        LENGTH,
        RELATED,
        DUPLICATES,
    }


@dataclass
class ListFilesRequest:
    """Parameters for list-files query.

    Optional fields default to ``None`` (or an empty list), in which case
    ``FilesDAO`` does not apply the corresponding filter or ordering.
    """

    # Maximum number of files returned (applied as the query LIMIT).
    limit: int = 20
    # Number of leading files skipped (applied as the query OFFSET).
    offset: int = 0
    # Substring matched case-insensitively against the file path.
    path_query: Optional[str] = None
    # Extensions (without the leading dot); a file passes if its path
    # ends with any of them, compared case-insensitively.
    extensions: List[str] = field(default_factory=list)
    # True: only files with EXIF data; False: only files without it.
    exif: Optional[bool] = None
    # True: only files whose EXIF audio duration is positive; False: the rest.
    audio: Optional[bool] = None
    # Inclusive bounds on the video length stored in the file metadata.
    min_length: Optional[int] = None
    max_length: Optional[int] = None
    # Inclusive bounds on the EXIF encoded date.
    date_from: Optional[datetime] = None
    date_to: Optional[datetime] = None
    # NOTE(review): not read by FilesDAO in this module; presumably names
    # of related entities to preload - confirm against callers.
    preload: list = field(default_factory=list)
    # One of the FileSort values, or None to leave the order unspecified.
    sort: Optional[str] = None
    # One of the FileMatchFilter values.
    match_filter: str = FileMatchFilter.ALL
    # Maximum match distance for a file to be considered related.
    related_distance: float = 0.4
    # Maximum match distance for a file to be considered a duplicate.
    duplicate_distance: float = 0.1


@dataclass
class Counts:
    """Number of files in each match category."""

    # Number of files selected by the query.
    total: int
    # Files counted as related.
    related: int
    # Files counted as duplicates.
    duplicates: int
    # Files counted as unique.
    unique: int


@dataclass
class ListFilesResults:
    """Outcome of a list-files query."""

    # Files making up the requested slice.
    items: List[Files]
    # Per-category file counts accompanying the slice.
    counts: Counts


class FilesDAO:
    """Data-access object for files."""

    # Format in which Dates are currently stored in exif table.
    _EXIF_DATE_FORMAT = " UTC %Y-%m-%d 00"

    # Label for related entities count (matches, scenes, etc.)
    _LABEL_COUNT = "hit_count"
    # Alias of Matches joined when results are ordered by match count.
    _countable_match = aliased(Matches)

    @staticmethod
    def list_files(req: ListFilesRequest, session) -> ListFilesResults:
        """Query multiple files.

        Args:
            req: filtering, ordering and pagination parameters.
            session: database session used to build and run the queries.

        Returns:
            The requested slice of files together with per-category counts.
            The counts honour the file-attribute filters only; they are
            computed before ``req.match_filter`` is applied.
        """
        # Count files
        query = session.query(Files)
        query = FilesDAO._filter_by_file_attributes(req, query)
        counts = FilesDAO.counts(query, req.related_distance, req.duplicate_distance)

        # Select files
        sortable_attributes = FilesDAO._sortable_attributes(req)
        query = session.query(Files, *sortable_attributes)
        query = FilesDAO._filter_by_file_attributes(req, query)
        query = FilesDAO._filter_by_matches(req, query)
        query = FilesDAO._sort_items(req, query)

        # Retrieve slice
        query = query.offset(req.offset).limit(req.limit)
        items = query.all()

        # With extra selected columns each row is a (file, attr, ...) tuple,
        # so keep only the file entity.
        if sortable_attributes:
            items = [item[0] for item in items]

        return ListFilesResults(items=items, counts=counts)

    @staticmethod
    def counts(query, related_distance, duplicate_distance) -> Counts:
        """Count queried files by matches.

        Args:
            query: files query to count over.
            related_distance: maximum distance for a match to make a file related.
            duplicate_distance: maximum distance for a match to make a file
                a duplicate.

        Returns:
            Counts where ``unique`` is ``total - related``.
        """
        total = query.count()
        duplicates = query.filter(FilesDAO.has_matches(duplicate_distance)).count()
        related = query.filter(FilesDAO.has_matches(related_distance)).count()
        unique = total - related
        return Counts(
            total=total,
            related=related,
            duplicates=duplicates,
            unique=unique)

    @staticmethod
    def has_matches(threshold):
        """Create a filter criteria to check if there is a match
        with distance less than or equal to the given threshold."""
        return or_(Files.source_matches.any(Matches.distance <= threshold),
                   Files.target_matches.any(Matches.distance <= threshold))

    @staticmethod
    def file_matches(file_id, session):
        """Query for all matches in which the given file is either side."""
        return session.query(Matches).filter(or_(
            Matches.query_video_file_id == file_id,
            Matches.match_video_file_id == file_id
        ))

    @staticmethod
    def _sorts_by_match_count(req: ListFilesRequest) -> bool:
        """Tell whether the requested ordering is based on the match count."""
        return req.sort in (FileSort.RELATED, FileSort.DUPLICATES)

    @staticmethod
    def _sortable_attributes(req: ListFilesRequest):
        """Get additional columns that must be selected to support ordering."""
        values = []
        if FilesDAO._sorts_by_match_count(req):
            match_count = func.count(FilesDAO._countable_match.id).label(FilesDAO._LABEL_COUNT)
            values.append(match_count)
        return values

    @staticmethod
    def _sort_items(req: ListFilesRequest, query):
        """Apply ordering.

        Every ordering uses the file id as a tie-breaker. When ``req.sort``
        is not a known FileSort value the query is returned unchanged.
        """
        if FilesDAO._sorts_by_match_count(req):
            match = FilesDAO._countable_match
            threshold = req.related_distance if req.sort == FileSort.RELATED else req.duplicate_distance
            # The threshold is inclusive so that the counted matches are
            # exactly those recognised by has_matches().
            query = query.outerjoin(match,
                                    ((match.query_video_file_id == Files.id) |
                                     (match.match_video_file_id == Files.id)) & (match.distance <= threshold))
            return query.group_by(Files.id).order_by(literal_column(FilesDAO._LABEL_COUNT).desc(), Files.id.asc())
        elif req.sort == FileSort.LENGTH:
            meta = aliased(VideoMetadata)
            return query.outerjoin(meta).order_by(meta.video_length.desc(), Files.id.asc())
        elif req.sort == FileSort.DATE:
            exif = aliased(Exif)
            return query.outerjoin(exif).order_by(exif.General_Encoded_Date.desc(), Files.id.asc())
        return query

    @staticmethod
    def _filter_path(req: ListFilesRequest, query):
        """Filter by file name (case-insensitive substring of the path)."""
        # NOTE(review): `%` and `_` in path_query reach LIKE unescaped and
        # therefore act as wildcards - confirm this is intended.
        if req.path_query:
            return query.filter(Files.file_path.ilike(f"%{req.path_query}%"))
        return query

    @staticmethod
    def _filter_extensions(req: ListFilesRequest, query):
        """Filter by file extension (path ends with any ``.ext``)."""
        # NOTE(review): `%` and `_` in an extension reach LIKE unescaped and
        # therefore act as wildcards - confirm this is intended.
        if req.extensions:
            conditions = (Files.file_path.ilike(f"%.{ext}") for ext in req.extensions)
            return query.filter(or_(*conditions))
        return query

    @staticmethod
    def _filter_exif(req: ListFilesRequest, query):
        """Filter by EXIF data presence; ``None`` disables the filter."""
        if req.exif is not None:
            has_exif = Files.exif.has()
            if req.exif:
                return query.filter(has_exif)
            else:
                return query.filter(~has_exif)
        return query

    @staticmethod
    def _filter_audio(req: ListFilesRequest, query):
        """Filter by audio presence; ``None`` disables the filter."""
        if req.audio is not None:
            # A file has audio when its EXIF record reports a positive duration.
            has_audio = Files.exif.has(Exif.Audio_Duration > 0)
            if req.audio:
                return query.filter(has_audio)
            else:
                return query.filter(~has_audio)
        return query

    @staticmethod
    def _filter_date(req: ListFilesRequest, query):
        """Filter by creation date (inclusive bounds)."""
        # The bounds are rendered with _EXIF_DATE_FORMAT before being
        # compared to the stored General_Encoded_Date value.
        if req.date_from is not None:
            query = query.filter(
                Files.exif.has(Exif.General_Encoded_Date >= req.date_from.strftime(FilesDAO._EXIF_DATE_FORMAT)))

        if req.date_to is not None:
            query = query.filter(
                Files.exif.has(Exif.General_Encoded_Date <= req.date_to.strftime(FilesDAO._EXIF_DATE_FORMAT)))

        return query

    @staticmethod
    def _filter_length(req: ListFilesRequest, query):
        """Filter by length (inclusive bounds)."""
        # Join the metadata once, and only when at least one bound is set.
        if req.min_length is not None or req.max_length is not None:
            query = query.join(Files.meta)

        if req.min_length is not None:
            query = query.filter(VideoMetadata.video_length >= req.min_length)

        if req.max_length is not None:
            query = query.filter(VideoMetadata.video_length <= req.max_length)

        return query

    @staticmethod
    def _filter_by_matches(req: ListFilesRequest, query):
        """Filter by presence of similar files."""
        if req.match_filter == FileMatchFilter.DUPLICATES:
            return query.filter(FilesDAO.has_matches(req.duplicate_distance))
        elif req.match_filter == FileMatchFilter.RELATED:
            return query.filter(FilesDAO.has_matches(req.related_distance))
        elif req.match_filter == FileMatchFilter.UNIQUE:
            return query.filter(~FilesDAO.has_matches(req.related_distance))
        # FileMatchFilter.ALL or any other value: no match-based filtering.
        return query

    @staticmethod
    def _filter_by_file_attributes(req: ListFilesRequest, query):
        """Apply filters related to the properties of video file itself."""
        query = FilesDAO._filter_path(req, query)
        query = FilesDAO._filter_extensions(req, query)
        query = FilesDAO._filter_exif(req, query)
        query = FilesDAO._filter_audio(req, query)
        query = FilesDAO._filter_date(req, query)
        query = FilesDAO._filter_length(req, query)
        return query
Loading