From c136a37c127fba51ef5fa505bbdd2b280e2a3209 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Fri, 27 Oct 2023 02:14:50 +0100 Subject: [PATCH 1/2] my.zulip.organization: use UTC timestamps, support custom archive names + some cleanup --- my/zulip/organization.py | 65 ++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/my/zulip/organization.py b/my/zulip/organization.py index 64b5ae3d..87254115 100644 --- a/my/zulip/organization.py +++ b/my/zulip/organization.py @@ -2,24 +2,37 @@ Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]] """ from dataclasses import dataclass -from typing import Sequence, Iterator, Dict +from datetime import datetime, timezone +from itertools import count +import json +from pathlib import Path +from typing import Sequence, Iterator, Dict, Union + +from my.core import ( + assert_never, + datetime_aware, + get_files, + stat, + Json, + Paths, + Res, + Stats, +) +from my.core.error import notnone +import my.config -from my.config import zulip as user_config -from ..core import Paths @dataclass -class organization(user_config.organization): +class organization(my.config.zulip.organization): # paths[s]/glob to the exported JSON data export_path: Paths -from pathlib import Path -from ..core import get_files, Json def inputs() -> Sequence[Path]: - return get_files(organization.export_path) - - -from datetime import datetime + # TODO: seems like export ids are kinda random.. + # not sure what's the best way to figure out the last without renaming? + # could use mtime perhaps? + return get_files(organization.export_path, sort=False) @dataclass(frozen=True) @@ -39,16 +52,11 @@ class Sender: # from the data, seems that subjects are completely implicit and determined by name? # streams have ids (can extract from realm/zerver_stream), but unclear how to correlate messages/topics to streams? - @dataclass(frozen=True) class _Message: # todo hmm not sure what would be a good field order.. id: int - sent: datetime - # TODO hmm kinda unclear whether it uses UTC or not?? - # https://github.com/zulip/zulip/blob/0c2e4eec200d986a9a020f3e9a651d27216e0e85/zerver/models.py#L3071-L3076 - # it keeps it tz aware.. but not sure what happens after? - # https://github.com/zulip/zulip/blob/1dfddffc8dac744fd6a6fbfd937018074c8bb166/zproject/computed_settings.py#L151 + sent: datetime_aware # double checked and they are in utc subject: str sender_id: int server_id: int @@ -60,7 +68,7 @@ class _Message: @dataclass(frozen=True) class Message: id: int - sent: datetime + sent: datetime_aware subject: str sender: Sender server: Server @@ -76,23 +84,18 @@ def permalink(self) -> str: return f'https://{self.server.string_id}.zulipchat.com/#narrow/near/{self.id}' -from typing import Union -from itertools import count -import json -from ..core import Res, assert_never # todo cache it def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]: - # TODO hmm -- not sure if max lexicographically will actually be latest? last = max(inputs()) - subdir = last.with_suffix('').stem # there is a directory inside tar.gz - # todo would be nice to switch it to unpacked dirs as well, similar to ZipPath # I guess makes sense to have a special implementation for .tar.gz considering how common are they import tarfile - from ..core.error import notnone tfile = tarfile.open(last) + + subdir = tfile.getnames()[0] # there is a directory inside tar file, first name should be that + with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo: rj = json.load(fo) @@ -114,20 +117,22 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]: for j in rj['zerver_userprofile_crossrealm']: # e.g. zulip bot yield Sender( id=j['id'], - full_name=j['email'], # doesn't seem to have anything + full_name=j['email'], # doesn't seem to have anything email=j['email'], ) def _parse_message(j: Json) -> _Message: ds = j['date_sent'] + # fmt: off return _Message( id = j['id'], - sent = datetime.fromtimestamp(ds), + sent = datetime.fromtimestamp(ds, tz=timezone.utc), subject = j['subject'], sender_id = j['sender'], server_id = server.id, content = j['content'], ) + # fmt: on for idx in count(start=1, step=1): fname = f'messages-{idx:06}.json' @@ -172,9 +177,5 @@ def messages() -> Iterator[Res[Message]]: assert_never(x) -from my.core import Stats def stats() -> Stats: - from my.core import stat - return { - **stat(messages) - } + return {**stat(messages)} From a65839440dd05c270310f0eb5d673801245d2f3d Mon Sep 17 00:00:00 2001 From: karlicoss Date: Fri, 27 Oct 2023 02:27:04 +0100 Subject: [PATCH 2/2] my.hackernews.dogsheep: use utc datetime + minor cleanup --- my/hackernews/dogsheep.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/my/hackernews/dogsheep.py b/my/hackernews/dogsheep.py index aac0b1a9..de6c58da 100644 --- a/my/hackernews/dogsheep.py +++ b/my/hackernews/dogsheep.py @@ -4,18 +4,19 @@ from __future__ import annotations from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path from typing import Iterator, Sequence, Optional -from my.core import get_files, Paths, Res +from my.core import get_files, Paths, Res, datetime_aware from my.core.sqlite import sqlite_connection +import my.config -from my.config import hackernews as user_config +from .common import hackernews_link @dataclass -class config(user_config.dogsheep): +class config(my.config.hackernews.dogsheep): # paths[s]/glob to the dogsheep database export_path: Paths @@ -26,24 +27,23 @@ def inputs() -> Sequence[Path]: return get_files(config.export_path) -from .common import hackernews_link - # TODO not sure if worth splitting into Comment and Story? @dataclass(unsafe_hash=True) class Item: id: str type: str - # TODO is it urc?? - created: datetime + created: datetime_aware # checked and it's utc title: Optional[str] # only present for Story - text_html: Optional[str] # should be present for Comment and might for Story - url: Optional[str] # might be present for Story + text_html: Optional[str] # should be present for Comment and might for Story + url: Optional[str] # might be present for Story # todo process 'deleted'? fields? # todo process 'parent'? @property def permalink(self) -> str: return hackernews_link(self.id) + + # TODO hmm kinda annoying that permalink isn't getting serialized # maybe won't be such a big problem if we used hpi query directly on objects, without jsons? # so we could just take .permalink thing @@ -56,7 +56,7 @@ def items() -> Iterator[Res[Item]]: yield Item( id=r['id'], type=r['type'], - created=datetime.fromtimestamp(r['time']), + created=datetime.fromtimestamp(r['time'], tz=timezone.utc), title=r['title'], # todo hmm maybe a method to strip off html tags would be nice text_html=r['text'],