From 6a18f47c37696055c72eaa7c689c0a2ab983b229 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Wed, 18 Sep 2024 23:03:03 +0100
Subject: [PATCH] my.github.gdpr/my.zulip.organization: use kompress support
 for tar.gz if it's available, otherwise fall back to unpacking into a tmp dir
 via my.core.structure

---
 my/core/kompress.py      |  6 +--
 my/github/gdpr.py        | 96 ++++++++++++++++++++++++----------------
 my/zulip/organization.py | 91 +++++++++++++++++++++++++------------
 setup.py                 | 14 +++---
 4 files changed, 130 insertions(+), 77 deletions(-)

diff --git a/my/core/kompress.py b/my/core/kompress.py
index 6ab3228f..7cbf310c 100644
--- a/my/core/kompress.py
+++ b/my/core/kompress.py
@@ -1,4 +1,5 @@
 from .internal import assert_subpackage; assert_subpackage(__name__)
+
 from . import warnings
 
 # do this later -- for now need to transition modules to avoid using kompress directly (e.g. ZipPath)
@@ -8,10 +9,7 @@
     from kompress import *
 except ModuleNotFoundError as e:
     if e.name == 'kompress':
-        warnings.high('Please install kompress (pip3 install kompress), it will be required in the future. Falling onto vendorized kompress for now.')
+        warnings.high('Please install kompress (pip3 install kompress). Falling back onto vendorized kompress for now.')
         from ._deprecated.kompress import *  # type: ignore[assignment]
     else:
         raise e
-
-# this is deprecated in compress, keep here for backwards compatibility
-open = kopen  # noqa: F405
diff --git a/my/github/gdpr.py b/my/github/gdpr.py
index acbeb8f1..a56ff464 100644
--- a/my/github/gdpr.py
+++ b/my/github/gdpr.py
@@ -1,36 +1,42 @@
 """
 Github data (uses [[https://github.com/settings/admin][official GDPR export]])
 """
-from dataclasses import dataclass
+
+from __future__ import annotations
+
 import json
+from abc import abstractmethod
 from pathlib import Path
-import tarfile
-from typing import Iterable, Any, Sequence, Dict, Optional
+from typing import Any, Iterator, Sequence
 
-from my.core import get_files, Res, PathIsh, stat, Stats, make_logger
-from my.core.cfg import make_config
-from my.core.error import notnone, echain
+from my.core import Paths, Res, Stats, get_files, make_logger, stat, warnings
+from my.core.error import echain
 
-from .common import Event, parse_dt, EventIds
+from .common import Event, EventIds, parse_dt
 
-# TODO later, use a separate user config? (github_gdpr)
-from my.config import github as user_config
+logger = make_logger(__name__)
 
 
-@dataclass
-class github(user_config):
-    gdpr_dir: PathIsh  # path to unpacked GDPR archive
+class config:
+    @property
+    @abstractmethod
+    def gdpr_dir(self) -> Paths:
+        raise NotImplementedError
 
 
-config = make_config(github)
+def make_config() -> config:
+    # TODO later, use a separate user config? (github_gdpr)
+    from my.config import github as user_config
 
+    class combined_config(user_config, config):
+        pass
 
-logger = make_logger(__name__)
+    return combined_config()
 
 
 def inputs() -> Sequence[Path]:
-    gdir = config.gdpr_dir
-    res = get_files(gdir)
+    gdpr_dir = make_config().gdpr_dir
+    res = get_files(gdpr_dir)
     schema_json = [f for f in res if f.name == 'schema.json']
     was_unpacked = len(schema_json) > 0
     if was_unpacked:
@@ -43,22 +49,37 @@ def inputs() -> Sequence[Path]:
     return res
 
 
-def events() -> Iterable[Res[Event]]:
+def events() -> Iterator[Res[Event]]:
     last = max(inputs())
 
     logger.info(f'extracting data from {last}')
 
-    # a bit naughty and ad-hoc, but we will generify reading from tar.gz. once we have more examples
-    # another one is zulip archive
-    if last.is_dir():
-        files = sorted(last.glob('*.json'))  # looks like all files are in the root
-        open_file = lambda f: f.open()
+    root: Path | None = None
+
+    if last.is_dir():  # if it's already CPath, this will match it
+        root = last
     else:
-        # treat as .tar.gz
-        tfile = tarfile.open(last)
-        files = sorted(map(Path, tfile.getnames()))
-        files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json']
-        open_file = lambda p: notnone(tfile.extractfile(f'./{p}'))  # NOTE odd, doesn't work without ./
+        try:
+            from kompress import CPath
+
+            root = CPath(last)
+            assert len(list(root.iterdir())) > 0  # trigger to check if we have the kompress version with tar.gz support
+        except Exception as e:
+            logger.exception(e)
+            warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. Falling back to unpacking into a tmp dir.")
+
+    if root is None:
+        from my.core.structure import match_structure
+
+        with match_structure(last, expected=()) as res:  # expected=() matches it regardless of any patterns
+            [root] = res
+            yield from _process_one(root)
+    else:
+        yield from _process_one(root)
+
+
+def _process_one(root: Path) -> Iterator[Res[Event]]:
+    files = sorted(root.glob('*.json'))  # looks like all files are in the root
 
     # fmt: off
     handler_map = {
@@ -100,8 +121,7 @@ def events() -> Iterable[Res[Event]]:
             # ignored
             continue
 
-        with open_file(f) as fo:
-            j = json.load(fo)
+        j = json.loads(f.read_text())
         for r in j:
             try:
                 yield handler(r)
@@ -116,7 +136,7 @@ def stats() -> Stats:
 
 
 # TODO typing.TypedDict could be handy here..
-def _parse_common(d: Dict) -> Dict:
+def _parse_common(d: dict) -> dict:
     url = d['url']
     body = d.get('body')
     return {
@@ -126,7 +146,7 @@ def _parse_common(d: Dict) -> Dict:
     }
 
 
-def _parse_repository(d: Dict) -> Event:
+def _parse_repository(d: dict) -> Event:
     pref = 'https://github.com/'
     url = d['url']
     dts = d['created_at']
@@ -142,13 +162,13 @@ def _parse_repository(d: Dict) -> Event:
 
 
 # user may be None if the user was deleted
-def _is_bot(user: Optional[str]) -> bool:
+def _is_bot(user: str | None) -> bool:
     if user is None:
         return False
     return "[bot]" in user
 
 
-def _parse_issue_comment(d: Dict) -> Event:
+def _parse_issue_comment(d: dict) -> Event:
     url = d['url']
     return Event(
         **_parse_common(d),
@@ -158,7 +178,7 @@ def _parse_issue_comment(d: Dict) -> Event:
     )
 
 
-def _parse_issue(d: Dict) -> Event:
+def _parse_issue(d: dict) -> Event:
     url = d['url']
     title = d['title']
     return Event(
@@ -169,7 +189,7 @@ def _parse_issue(d: Dict) -> Event:
     )
 
 
-def _parse_pull_request(d: Dict) -> Event:
+def _parse_pull_request(d: dict) -> Event:
     dts = d['created_at']
     url = d['url']
     title = d['title']
@@ -183,7 +203,7 @@ def _parse_pull_request(d: Dict) -> Event:
     )
 
 
-def _parse_project(d: Dict) -> Event:
+def _parse_project(d: dict) -> Event:
     url = d['url']
     title = d['name']
     is_bot = "[bot]" in d["creator"]
@@ -198,7 +218,7 @@ def _parse_project(d: Dict) -> Event:
     )
 
 
-def _parse_release(d: Dict) -> Event:
+def _parse_release(d: dict) -> Event:
     tag = d['tag_name']
     return Event(
         **_parse_common(d),
@@ -207,7 +227,7 @@ def _parse_release(d: Dict) -> Event:
     )
 
 
-def _parse_commit_comment(d: Dict) -> Event:
+def _parse_commit_comment(d: dict) -> Event:
     url = d['url']
     return Event(
         **_parse_common(d),
diff --git a/my/zulip/organization.py b/my/zulip/organization.py
index 87254115..2e0df4bd 100644
--- a/my/zulip/organization.py
+++ b/my/zulip/organization.py
@@ -1,38 +1,55 @@
 """
 Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]]
 """
+
+from __future__ import annotations
+
+import json
+from abc import abstractmethod
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from itertools import count
-import json
 from pathlib import Path
-from typing import Sequence, Iterator, Dict, Union
+from typing import Iterator, Sequence
 
 from my.core import (
-    assert_never,
-    datetime_aware,
-    get_files,
-    stat,
     Json,
     Paths,
     Res,
     Stats,
+    assert_never,
+    datetime_aware,
+    get_files,
+    make_logger,
+    stat,
+    warnings,
 )
-from my.core.error import notnone
-import my.config
+
+logger = make_logger(__name__)
 
 
-@dataclass
-class organization(my.config.zulip.organization):
-    # paths[s]/glob to the exported JSON data
-    export_path: Paths
+class config:
+    @property
+    @abstractmethod
+    def export_path(self) -> Paths:
+        """path[s]/glob to the exported JSON data"""
+        raise NotImplementedError
+
+
+def make_config() -> config:
+    from my.config import zulip as user_config
+
+    class combined_config(user_config.organization, config):
+        pass
+
+    return combined_config()
 
 
 def inputs() -> Sequence[Path]:
     # TODO: seems like export ids are kinda random..
     # not sure what's the best way to figure out the last without renaming?
     # could use mtime perhaps?
-    return get_files(organization.export_path, sort=False)
+    return get_files(make_config().export_path, sort=False)
 
 
 @dataclass(frozen=True)
@@ -85,19 +102,39 @@ def permalink(self) -> str:
 
 
 # todo cache it
-def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
+def _entities() -> Iterator[Res[Server | Sender | _Message]]:
     last = max(inputs())
 
-    # todo would be nice to switch it to unpacked dirs as well, similar to ZipPath
-    # I guess makes sense to have a special implementation for .tar.gz considering how common are they
-    import tarfile
+    logger.info(f'extracting data from {last}')
+
+    root: Path | None = None
+
+    if last.is_dir():  # if it's already CPath, this will match it
+        root = last
+    else:
+        try:
+            from kompress import CPath
+
+            root = CPath(last)
+            assert len(list(root.iterdir())) > 0  # trigger to check if we have the kompress version with tar.gz support
+        except Exception as e:
+            logger.exception(e)
+            warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. Falling back to unpacking into a tmp dir.")
+
+    if root is None:
+        from my.core.structure import match_structure
+
+        with match_structure(last, expected=()) as res:  # expected=() matches it regardless of any patterns
+            [root] = res
+            yield from _process_one(root)
+    else:
+        yield from _process_one(root)
 
-    tfile = tarfile.open(last)
-    subdir = tfile.getnames()[0]  # there is a directory inside tar file, first name should be that
 
-    with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo:
-        rj = json.load(fo)
+def _process_one(root: Path) -> Iterator[Res[Server | Sender | _Message]]:
+    [subdir] = root.iterdir()  # there is a single directory inside the archive
+
+    rj = json.loads((subdir / 'realm.json').read_text())
 
     [sj] = rj['zerver_realm']
     server = Server(
@@ -136,12 +173,10 @@ def _parse_message(j: Json) -> _Message:
 
     for idx in count(start=1, step=1):
         fname = f'messages-{idx:06}.json'
-        fpath = f'{subdir}/{fname}'
-        if fpath not in tfile.getnames():
-            # tarfile doesn't have .exists?
+        fpath = subdir / fname
+        if not fpath.exists():
             break
-        with notnone(tfile.extractfile(fpath)) as fo:
-            mj = json.load(fo)
+        mj = json.loads(fpath.read_text())
         # TODO handle zerver_usermessage
         for j in mj['zerver_message']:
             try:
@@ -151,8 +186,8 @@ def _parse_message(j: Json) -> _Message:
 
 
 def messages() -> Iterator[Res[Message]]:
-    id2sender: Dict[int, Sender] = {}
-    id2server: Dict[int, Server] = {}
+    id2sender: dict[int, Sender] = {}
+    id2server: dict[int, Server] = {}
     for x in _entities():
         if isinstance(x, Exception):
             yield x
diff --git a/setup.py b/setup.py
index cf4b79f0..83358510 100644
--- a/setup.py
+++ b/setup.py
@@ -4,13 +4,13 @@ from setuptools import setup, find_namespace_packages # type: ignore
 
 INSTALL_REQUIRES = [
-    'pytz',  # even though it's not needed by the core, it's so common anyway...
-    'typing-extensions',  # one of the most common pypi packages, ok to depend for core
-    'appdirs',  # very common, and makes it portable
-    'more-itertools',  # it's just too useful and very common anyway
-    'decorator' ,  # less pain in writing correct decorators. very mature and stable, so worth keeping in core
-    'click>=8.1' ,  # for the CLI, printing colors, decorator-based - may allow extensions to CLI
-    'kompress' ,  # for transparent access to compressed files via pathlib.Path
+    'pytz'                    ,  # even though it's not needed by the core, it's so common anyway...
+    'typing-extensions'       ,  # one of the most common pypi packages, ok to depend for core
+    'appdirs'                 ,  # very common, and makes it portable
+    'more-itertools'          ,  # it's just too useful and very common anyway
+    'decorator'               ,  # less pain in writing correct decorators. very mature and stable, so worth keeping in core
+    'click>=8.1'              ,  # for the CLI, printing colors, decorator-based - may allow extensions to CLI
+    'kompress>=0.2.20240918'  ,  # for transparent access to compressed files via pathlib.Path
 ]
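
Both modules now resolve their input the same way: if kompress is new enough, its CPath gives transparent pathlib-style access to the .tar.gz export; otherwise the archive is unpacked into a temporary directory via my.core.structure.match_structure and processed from there. Below is a minimal, self-contained sketch of that fallback pattern; the _resolve_root helper name is hypothetical and not part of this patch, which instead inlines the equivalent logic in events() and _entities().

from contextlib import contextmanager
from pathlib import Path
from typing import Iterator


@contextmanager
def _resolve_root(last: Path) -> Iterator[Path]:
    # already an unpacked directory (or a directory-like CPath) -- use it as is
    if last.is_dir():
        yield last
        return

    root = None
    try:
        from kompress import CPath  # needs a kompress version with .tar.gz support

        root = CPath(last)
        # listing the archive is what actually exercises the tar.gz support,
        # so an older kompress fails here and triggers the fallback below
        assert len(list(root.iterdir())) > 0
    except Exception:
        root = None

    if root is not None:
        yield root
        return

    from my.core.structure import match_structure

    # expected=() matches the archive regardless of any patterns; the contents
    # are unpacked into a tmp dir that only exists for the duration of the block
    with match_structure(last, expected=()) as res:
        [unpacked] = res
        yield unpacked

A caller would then do something like: with _resolve_root(last) as root: yield from _process_one(root). That is also why both modules split the per-directory parsing into a separate _process_one() function, so the same code runs whether the data came from a CPath or from a temporary unpacked directory.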