From beddc28ee9f3ac916b630bfb8591065971be3146 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 5 Feb 2023 15:20:20 +0000 Subject: [PATCH 1/4] sources.browser_new: initial work on using HPI + seanbreckenridge/browserexport later will rename to browser, and implement defensive fallback onto browser_old adapted from https://github.com/seanbreckenridge/promnesia/blob/master/promnesia_sean/sources/browsing.py related: https://github.com/karlicoss/promnesia/issues/339 Old vs new modules produce almost identical results (tested on various chrome & firefox databases) There are some minor differences vs the old module: - old database timestamps end with +00:00 UTC, new ones with +00:00 -- likely because browserexport is using timezone.utc instead of pytz - previously locator was pointing at the database file, now it's pointing at the URL I guess it's not necessarily in the 'spirit' of locator field, but on the other hand, not that it's very useful to point to an sqlite file either. Perhaps later it could be in some sort of extra debug field instead. --- src/promnesia/sources/browser_new.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 src/promnesia/sources/browser_new.py diff --git a/src/promnesia/sources/browser_new.py b/src/promnesia/sources/browser_new.py new file mode 100644 index 00000000..76fd6dd7 --- /dev/null +++ b/src/promnesia/sources/browser_new.py @@ -0,0 +1,22 @@ +from typing import Optional + +from promnesia.common import Results, Visit, Loc, Second + + +def index() -> Results: + from . 
import hpi + from my.browser.all import history + + for v in history(): + desc: Optional[str] = None + duration: Optional[Second] = None + metadata = v.metadata + if metadata is not None: + desc = metadata.title + duration = metadata.duration + yield Visit( + url=v.url, + dt=v.dt, + locator=Loc(title=desc or v.url, href=v.url), + duration=duration, + ) From 3dd864fd020b7cad4bcfa21c5313a0e5471e405f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 6 Feb 2023 00:33:45 +0000 Subject: [PATCH 2/4] sources.browser: move current browser module to browser_old --- src/promnesia/sources/{browser.py => browser_old.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/promnesia/sources/{browser.py => browser_old.py} (100%) diff --git a/src/promnesia/sources/browser.py b/src/promnesia/sources/browser_old.py similarity index 100% rename from src/promnesia/sources/browser.py rename to src/promnesia/sources/browser_old.py From 57cc7790e0c2f8b05025ab2a4ad93acdb556e0ea Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 9 Feb 2023 23:36:40 +0000 Subject: [PATCH 3/4] source.browser: implement fallbacks onto old browser module - if my.browser.export is available try to hack HPI config and use it - if not, fallback to promnesia.sources.browser_old --- src/promnesia/common.py | 8 +++ src/promnesia/sources/browser.py | 84 ++++++++++++++++++++++++++++ src/promnesia/sources/browser_new.py | 22 -------- src/promnesia/sources/browser_old.py | 14 +---- tox.ini | 1 + 5 files changed, 95 insertions(+), 34 deletions(-) create mode 100644 src/promnesia/sources/browser.py delete mode 100644 src/promnesia/sources/browser_new.py diff --git a/src/promnesia/common.py b/src/promnesia/common.py index 5a54b25c..000ba652 100644 --- a/src/promnesia/common.py +++ b/src/promnesia/common.py @@ -586,3 +586,11 @@ def measure(tag: str='', *, logger, unit: str='ms'): mult = {'s': 1, 'ms': 10**3, 'us': 10**6}[unit] xx = secs * mult logger.debug(f'[{tag}]: {xx:.1f}{unit} elapsed') + + +def 
is_sqlite_db(x: Path) -> bool: + return x.is_file() and mime(x) in { + 'application/x-sqlite3', + 'application/vnd.sqlite3', + # TODO this mime can also match wal files/journals, not sure + } diff --git a/src/promnesia/sources/browser.py b/src/promnesia/sources/browser.py new file mode 100644 index 00000000..337cdccc --- /dev/null +++ b/src/promnesia/sources/browser.py @@ -0,0 +1,84 @@ +import re +from typing import Optional, Iterator, Any, TYPE_CHECKING +import warnings + +from promnesia.common import Results, Visit, Loc, Second, PathIsh, logger, is_sqlite_db + + +def index(p: Optional[PathIsh]=None) -> Results: + '''Index browser visits via HPI; when a path is passed (deprecated), try HPI with an ad-hoc config and fall back onto browser_old.''' + from . import hpi + + if p is None: + from my.browser.all import history + yield from _index_new(history()) + return + + warnings.warn('Passing paths to promnesia.sources.browser is deprecated. You should switch to HPI for that. See https://github.com/seanbreckenridge/browserexport#hpi') + + # even if the user doesn't have HPI config for my.browser set up, try HPI with an ad-hoc config first + try: + yield from _index_new_with_adhoc_config(path=p) + except Exception as e: + logger.exception(e) + warnings.warn("Setting my.config.browser.export didn't work. You probably need to update HPI.") + else: + return + + logger.warning("Falling back onto legacy promnesia.sources.browser_old") + yield from _index_old(path=p) + + +def _index_old(*, path: PathIsh) -> Results: + from . 
import browser_old + yield from browser_old.index(path) + + +def _index_new_with_adhoc_config(*, path: PathIsh) -> Results: + ## previously, it was possible to index be called with multiple different db search paths + ## this would result in each subsequent call to my.browser.export.history to invalidate cache every time + ## so we hack cachew path so it's different for each call + from my.core.core_config import config as hpi_core_config + hpi_cache_dir = hpi_core_config.get_cache_dir() + sanitized_path = re.sub(r'\W', '_', str(path)) + cache_override = None if hpi_cache_dir is None else hpi_cache_dir / sanitized_path + ## + + from my.core.common import classproperty, Paths, get_files + class config: + class core: + cache_dir = cache_override + + class browser: + class export: + @classproperty + def export_path(cls) -> Paths: + return tuple([f for f in get_files(path, glob='**/*') if is_sqlite_db(f)]) + + + from my.core.cfg import tmp_config + with tmp_config(modules='my.browser.export|my.core.core_config', config=config): + from my.browser.export import history + yield from _index_new(history()) + + +if TYPE_CHECKING: + from browserexport.merge import Visit as BrowserMergeVisit +else: + BrowserMergeVisit = Any + + +def _index_new(history: Iterator[BrowserMergeVisit]) -> Results: + for v in history: + desc: Optional[str] = None + duration: Optional[Second] = None + metadata = v.metadata + if metadata is not None: + desc = metadata.title + duration = metadata.duration + yield Visit( + url=v.url, + dt=v.dt, + locator=Loc(title=desc or v.url, href=v.url), + duration=duration, + ) diff --git a/src/promnesia/sources/browser_new.py b/src/promnesia/sources/browser_new.py deleted file mode 100644 index 76fd6dd7..00000000 --- a/src/promnesia/sources/browser_new.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Optional - -from promnesia.common import Results, Visit, Loc, Second - - -def index() -> Results: - from . 
import hpi - from my.browser.all import history - - for v in history(): - desc: Optional[str] = None - duration: Optional[Second] = None - metadata = v.metadata - if metadata is not None: - desc = metadata.title - duration = metadata.duration - yield Visit( - url=v.url, - dt=v.dt, - locator=Loc(title=desc or v.url, href=v.url), - duration=duration, - ) diff --git a/src/promnesia/sources/browser_old.py b/src/promnesia/sources/browser_old.py index 82454d90..2886aa7c 100644 --- a/src/promnesia/sources/browser_old.py +++ b/src/promnesia/sources/browser_old.py @@ -6,31 +6,21 @@ import pytz -from ..common import PathIsh, Results, Visit, Loc, get_logger, Second, mime +from ..common import PathIsh, Results, Visit, Loc, logger, Second, is_sqlite_db from .. import config # todo mcachew? from cachew import cachew -logger = get_logger() - def index(p: PathIsh) -> Results: pp = Path(p) assert pp.exists(), pp # just in case of broken symlinks - # is_file check because it also returns dirs - # TODO hmm, not sure what I meant here -- which dirs? behind symlinks? - is_db = lambda x: x.is_file() and mime(x) in { - 'application/x-sqlite3', - 'application/vnd.sqlite3', - # TODO this mime can also match wal files/journals, not sure - } - # todo warn if filtered out too many? # todo wonder how quickly mimes can be computed? # todo ugh, dunno, maybe this really belongs to hpi?? need get_files etc... 
- dbs = [p for p in sorted(pp.rglob('*')) if is_db(p)] + dbs = [p for p in sorted(pp.rglob('*')) if is_sqlite_db(p)] assert len(dbs) > 0, pp logger.info('processing %d databases', len(dbs)) diff --git a/tox.ini b/tox.ini index ca358dfc..278bb11d 100644 --- a/tox.ini +++ b/tox.ini @@ -72,6 +72,7 @@ commands = hpi module install my.reddit hpi module install my.fbmessenger hpi module install my.google.takeout.parser + hpi module install my.browser.export {envpython} -m mypy --install-types --non-interactive \ -p promnesia.sources \ From 25602a9fb08453f233281d23281b00dc555a2018 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 10 Feb 2023 20:07:01 +0000 Subject: [PATCH 4/4] docs: bring SOURCES.org up-to date also switch it to use ast module -- much easier & quicker to run --- doc/SOURCES.org | 102 +++++++++++++++------------- src/promnesia/sources/auto.py | 3 + src/promnesia/sources/browser.py | 4 ++ src/promnesia/sources/hackernews.py | 3 +- src/promnesia/sources/rss.py | 4 ++ src/promnesia/sources/shellcmd.py | 4 ++ src/promnesia/sources/signal.py | 13 ++-- src/promnesia/sources/viber.py | 2 +- 8 files changed, 80 insertions(+), 55 deletions(-) diff --git a/doc/SOURCES.org b/doc/SOURCES.org index 47fea684..9e89f057 100644 --- a/doc/SOURCES.org +++ b/doc/SOURCES.org @@ -10,9 +10,11 @@ import setup for (name, description), vals in setup.DEPS_SOURCES.items(): # fuck org-ruby. promnesia[name] should be in quotes, but then it doesn't render as code. ugh. 
# https://github.com/wallyqs/org-ruby/issues/45 + vals = [v.split('>')[0] for v in vals] + if len(vals) == 0: + continue print(f"- ~pip3 install --user promnesia[{name}]~") print(f' ') - vals = [v.split('>')[0] for v in vals] print(f' {description}: {", ".join(vals)}') #+end_src @@ -35,9 +37,6 @@ for (name, description), vals in setup.DEPS_SOURCES.items(): - ~pip3 install --user promnesia[org]~ dependencies for sources.org: orgparse -- ~pip3 install --user promnesia[telegram]~ - - dependencies for sources.telegram: dataset :end: Alternatively, you can just install all of them in bulk: ~pip3 install --user promnesia[all]~. @@ -47,44 +46,41 @@ Alternatively, you can just install all of them in bulk: ~pip3 install --user pr These are included with the current Promnesia distribution: -#+begin_src python :python "with_my python3" :dir ../src :exports output :results output drawer +#+begin_src python :dir ../src :exports output :results output drawer print('\n') # fix github org-mode issue with drawers +import ast from pathlib import Path -import pkgutil -import importlib -import inspect +import os indent = lambda s: ''.join(' ' + l for l in s.splitlines(keepends=True)) git_root = Path('.').absolute().parent -from promnesia.common import Results - -import promnesia.sources as pkg -for importer, name, ispkg in sorted(pkgutil.walk_packages( - path=pkg.__path__, - prefix=pkg.__name__+'.' -), key=lambda x: x[1]): - if name in { - # TODO damn, these modules need depednencies... 
- 'promnesia.sources.browser', - 'promnesia.sources.markdown', - 'promnesia.sources.org', - 'promnesia.sources.plaintext', +src = git_root / 'src' + +for f in sorted((src / 'promnesia/sources').rglob('*.py')): + mp = f.relative_to(src) + module_name = str(mp.with_suffix('')).replace(os.sep, '.') + if module_name in { + 'promnesia.sources.browser_old', # deprecated + 'promnesia.sources.takeout_legacy', # deprecated + 'promnesia.sources.guess', + 'promnesia.sources.demo', }: continue - m = importlib.import_module(name) - public = [(k, v) for k, v in inspect.getmembers(m) if not k.startswith('_')] - indexers = [(k, v) for k, v in public if getattr(v, '__annotations__', {}).get('return') == Results] - assert len(indexers) > 0, name - for k, i in indexers: - # print(inspect.signature(i)) - link = '../' + str(Path(m.__file__).relative_to(git_root)) - print(f'- [[file:{link}][{name}]]') - d = m.__doc__ - if d is not None: - print(indent(d)) + a: ast.Module = ast.parse(f.read_text()) + has_index = False + for x in a.body: + if isinstance(x, ast.FunctionDef) and x.name == 'index': + has_index = True + if not has_index: + continue + link = '../' + str(f.relative_to(git_root)) + print(f'- [[file:{link}][{module_name}]]') + doc = ast.get_docstring(a, clean=False) + if doc is not None: + print(indent(doc)) #+end_src #+RESULTS: @@ -98,7 +94,11 @@ for importer, name, ispkg in sorted(pkgutil.walk_packages( - can index most of plaintext files, including source code! - autodetects Obsidian vault and adds `obsidian://` app protocol support [[file:../src/promnesia/sources/obsidian.py][promnesia.sources.obsidian]] - autodetects Logseq graph and adds `logseq://` app protocol support [[file:../src/promnesia/sources/logseq.py][promnesia.sources.logseq]] - + +- [[file:../src/promnesia/sources/browser.py][promnesia.sources.browser]] + + Uses [[https://github.com/karlicoss/HPI][HPI]] for visits from web browsers. 
+ - [[file:../src/promnesia/sources/fbmessenger.py][promnesia.sources.fbmessenger]] Uses [[https://github.com/karlicoss/HPI][HPI]] for the messages data. @@ -107,10 +107,9 @@ for importer, name, ispkg in sorted(pkgutil.walk_packages( Uses [[https://github.com/karlicoss/HPI][HPI]] github module -- [[file:../src/promnesia/sources/guess.py][promnesia.sources.guess]] -- [[file:../src/promnesia/sources/html.py][promnesia.sources.html]] +- [[file:../src/promnesia/sources/hackernews.py][promnesia.sources.hackernews]] - Extracts links from HTML files + Uses [[https://github.com/karlicoss/HPI][HPI]] dogsheep module to import HackerNews items. - [[file:../src/promnesia/sources/hypothesis.py][promnesia.sources.hypothesis]] @@ -133,12 +132,25 @@ for importer, name, ispkg in sorted(pkgutil.walk_packages( Uses [[https://github.com/karlicoss/HPI][HPI]] for Roam Research data - [[file:../src/promnesia/sources/rss.py][promnesia.sources.rss]] + + Uses [[https://github.com/karlicoss/HPI][HPI]] for RSS data. + - [[file:../src/promnesia/sources/shellcmd.py][promnesia.sources.shellcmd]] + Greps out URLs from an arbitrary shell command results. + +- [[file:../src/promnesia/sources/signal.py][promnesia.sources.signal]] + + Collects visits from Signal Desktop's encrypted SQLite db(s). + - [[file:../src/promnesia/sources/smscalls.py][promnesia.sources.smscalls]] Uses [[https://github.com/karlicoss/HPI][HPI]] smscalls module +- [[file:../src/promnesia/sources/stackexchange.py][promnesia.sources.stackexchange]] + + Uses [[https://github.com/karlicoss/HPI][HPI]] for Stackexchange data. 
+ - [[file:../src/promnesia/sources/takeout.py][promnesia.sources.takeout]] Uses HPI [[https://github.com/karlicoss/HPI/blob/master/doc/MODULES.org#mygoogletakeoutpaths][google.takeout]] module @@ -147,16 +159,6 @@ for importer, name, ispkg in sorted(pkgutil.walk_packages( Uses [[https://github.com/fabianonline/telegram_backup#readme][telegram_backup]] database for messages data -- [[file:../src/promnesia/sources/viber.py][promnesia.sources.viber]] - - Uses all local SQLite files found in your Viber Desktop configurations: - usually in =~/.ViberPC/**/viber.db= (one directory for each telephone number). - -- [[file:../src/promnesia/sources/signal.py][promnesia.sources.signal]] - - When path(s) given, uses the SQLite inside Signal-Desktop's configuration directory - (see the sources for more parameters & location of the db-file for each platform) - - [[file:../src/promnesia/sources/twitter.py][promnesia.sources.twitter]] Uses [[https://github.com/karlicoss/HPI][HPI]] for Twitter data. @@ -165,10 +167,18 @@ for importer, name, ispkg in sorted(pkgutil.walk_packages( Clones & indexes Git repositories (via sources.auto) +- [[file:../src/promnesia/sources/viber.py][promnesia.sources.viber]] + + Collects visits from Viber desktop app (e.g. `~/.ViberPC/XYZ123/viber.db`) + - [[file:../src/promnesia/sources/website.py][promnesia.sources.website]] Clones a website with wget and indexes via sources.auto +- [[file:../src/promnesia/sources/zulip.py][promnesia.sources.zulip]] + + Uses [[https://github.com/karlicoss/HPI][HPI]] for Zulip data. + :end: diff --git a/src/promnesia/sources/auto.py b/src/promnesia/sources/auto.py index 2558f17d..81c019dd 100644 --- a/src/promnesia/sources/auto.py +++ b/src/promnesia/sources/auto.py @@ -1,6 +1,9 @@ """ - discovers files recursively - guesses the format (orgmode/markdown/json/etc) by the extension/MIME type +- can index most of plaintext files, including source code! 
+- autodetects Obsidian vault and adds `obsidian://` app protocol support [[file:../src/promnesia/sources/obsidian.py][promnesia.sources.obsidian]] +- autodetects Logseq graph and adds `logseq://` app protocol support [[file:../src/promnesia/sources/logseq.py][promnesia.sources.logseq]] """ import csv diff --git a/src/promnesia/sources/browser.py b/src/promnesia/sources/browser.py index 337cdccc..a296d194 100644 --- a/src/promnesia/sources/browser.py +++ b/src/promnesia/sources/browser.py @@ -1,3 +1,7 @@ +''' +Uses [[https://github.com/karlicoss/HPI][HPI]] for visits from web browsers. +''' + import re from typing import Optional, Iterator, Any, TYPE_CHECKING import warnings diff --git a/src/promnesia/sources/hackernews.py b/src/promnesia/sources/hackernews.py index 06f34940..de434058 100644 --- a/src/promnesia/sources/hackernews.py +++ b/src/promnesia/sources/hackernews.py @@ -1,6 +1,5 @@ ''' -Uses [[https://github.com/karlicoss/HPI][HPI]] dogsheep module to import -Hacker News items. +Uses [[https://github.com/karlicoss/HPI][HPI]] dogsheep module to import HackerNews items. ''' import textwrap diff --git a/src/promnesia/sources/rss.py b/src/promnesia/sources/rss.py index 7d71901c..4f066ca4 100644 --- a/src/promnesia/sources/rss.py +++ b/src/promnesia/sources/rss.py @@ -1,3 +1,7 @@ +''' +Uses [[https://github.com/karlicoss/HPI][HPI]] for RSS data. +''' + from itertools import chain from ..common import Visit, Loc, extract_urls, Results, get_logger diff --git a/src/promnesia/sources/shellcmd.py b/src/promnesia/sources/shellcmd.py index 6a64d4d3..efdb1bbc 100644 --- a/src/promnesia/sources/shellcmd.py +++ b/src/promnesia/sources/shellcmd.py @@ -1,3 +1,7 @@ +""" +Greps out URLs from an arbitrary shell command results. 
+""" + from datetime import datetime import os import re diff --git a/src/promnesia/sources/signal.py b/src/promnesia/sources/signal.py index 4422b04f..878f2648 100644 --- a/src/promnesia/sources/signal.py +++ b/src/promnesia/sources/signal.py @@ -1,12 +1,13 @@ """ -Harvest visits from Signal Desktop's chiphered SQLIite db(s). +Collects visits from Signal Desktop's encrypted SQLite db(s). +""" -Functions get their defaults from module-data. +# Functions get their defaults from module-data. +# +# * Open-ciphered-db adapted from: +# https://github.com/carderne/signal-export/commit/2284c8f4 +# * Copyright (c) 2019 Chris Arderne, 2020 Kostis Anagnostopoulos -* Open-ciphered-db adapted from: - https://github.com/carderne/signal-export/commit/2284c8f4 -* Copyright (c) 2019 Chris Arderne, 2020 Kostis Anagnostopoulos -""" import json import logging diff --git a/src/promnesia/sources/viber.py b/src/promnesia/sources/viber.py index 0542b70c..23dfeca5 100644 --- a/src/promnesia/sources/viber.py +++ b/src/promnesia/sources/viber.py @@ -1,5 +1,5 @@ """ -Adapted from `telegram.py` to read from `~/.ViberPC/XYZ123/viber.db` +Collects visits from Viber desktop app (e.g. `~/.ViberPC/XYZ123/viber.db`) """ import logging