From c136a37c127fba51ef5fa505bbdd2b280e2a3209 Mon Sep 17 00:00:00 2001
From: karlicoss <karlicoss@gmail.com>
Date: Fri, 27 Oct 2023 02:14:50 +0100
Subject: [PATCH 1/2] my.zulip.organization: use UTC timestamps, support custom
 archive names + some cleanup

---
 my/zulip/organization.py | 65 ++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 32 deletions(-)

diff --git a/my/zulip/organization.py b/my/zulip/organization.py
index 64b5ae3d..87254115 100644
--- a/my/zulip/organization.py
+++ b/my/zulip/organization.py
@@ -2,24 +2,37 @@
 Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]]
 """
 from dataclasses import dataclass
-from typing import Sequence, Iterator, Dict
+from datetime import datetime, timezone
+from itertools import count
+import json
+from pathlib import Path
+from typing import Sequence, Iterator, Dict, Union
+
+from my.core import (
+    assert_never,
+    datetime_aware,
+    get_files,
+    stat,
+    Json,
+    Paths,
+    Res,
+    Stats,
+)
+from my.core.error import notnone
+import my.config
 
-from my.config import zulip as user_config
 
-from ..core import Paths
 @dataclass
-class organization(user_config.organization):
+class organization(my.config.zulip.organization):
     # paths[s]/glob to the exported JSON data
     export_path: Paths
 
 
-from pathlib import Path
-from ..core import get_files, Json
 def inputs() -> Sequence[Path]:
-    return get_files(organization.export_path)
-
-
-from datetime import datetime
+    # TODO: export ids seem to be kind of random, so names may not sort chronologically.
+    # Not sure what's the best way to figure out the latest export without renaming files;
+    # perhaps mtime could be used?
+    return get_files(organization.export_path, sort=False)
 
 
 @dataclass(frozen=True)
@@ -39,16 +52,11 @@ class Sender:
 
 # from the data, seems that subjects are completely implicit and determined by name?
 # streams have ids (can extract from realm/zerver_stream), but unclear how to correlate messages/topics to streams?
-
 @dataclass(frozen=True)
 class _Message:
     # todo hmm not sure what would be a good field order..
     id: int
-    sent: datetime
-    # TODO hmm kinda unclear whether it uses UTC or not??
-    # https://github.com/zulip/zulip/blob/0c2e4eec200d986a9a020f3e9a651d27216e0e85/zerver/models.py#L3071-L3076
-    # it keeps it tz aware.. but not sure what happens after?
-    # https://github.com/zulip/zulip/blob/1dfddffc8dac744fd6a6fbfd937018074c8bb166/zproject/computed_settings.py#L151
+    sent: datetime_aware  # double checked and they are in utc
     subject: str
     sender_id: int
     server_id: int
@@ -60,7 +68,7 @@ class _Message:
 @dataclass(frozen=True)
 class Message:
     id: int
-    sent: datetime
+    sent: datetime_aware
     subject: str
     sender: Sender
     server: Server
@@ -76,23 +84,18 @@ def permalink(self) -> str:
         return f'https://{self.server.string_id}.zulipchat.com/#narrow/near/{self.id}'
 
 
-from typing import Union
-from itertools import count
-import json
-from ..core import Res, assert_never
 # todo cache it
 def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
-    # TODO hmm -- not sure if max lexicographically will actually be latest?
     last = max(inputs())
 
-    subdir = last.with_suffix('').stem # there is a directory inside tar.gz
-
     # todo would be nice to switch it to unpacked dirs as well, similar to ZipPath
     # I guess makes sense to have a special implementation for .tar.gz considering how common are they
     import tarfile
-    from ..core.error import notnone
 
     tfile = tarfile.open(last)
+
+    subdir = tfile.getnames()[0]  # there is a directory inside tar file, first name should be that
+
     with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo:
         rj = json.load(fo)
 
@@ -114,20 +117,22 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
     for j in rj['zerver_userprofile_crossrealm']:  # e.g. zulip bot
         yield Sender(
             id=j['id'],
-            full_name=j['email'], # doesn't seem to have anything
+            full_name=j['email'],  # doesn't seem to have anything
             email=j['email'],
         )
 
     def _parse_message(j: Json) -> _Message:
         ds = j['date_sent']
+        # fmt: off
         return _Message(
             id        = j['id'],
-            sent      = datetime.fromtimestamp(ds),
+            sent      = datetime.fromtimestamp(ds, tz=timezone.utc),
             subject   = j['subject'],
             sender_id = j['sender'],
             server_id = server.id,
             content   = j['content'],
         )
+        # fmt: on
 
     for idx in count(start=1, step=1):
         fname = f'messages-{idx:06}.json'
@@ -172,9 +177,5 @@ def messages() -> Iterator[Res[Message]]:
         assert_never(x)
 
 
-from my.core import Stats
 def stats() -> Stats:
-    from my.core import stat
-    return {
-        **stat(messages)
-    }
+    return {**stat(messages)}

From a65839440dd05c270310f0eb5d673801245d2f3d Mon Sep 17 00:00:00 2001
From: karlicoss <karlicoss@gmail.com>
Date: Fri, 27 Oct 2023 02:27:04 +0100
Subject: [PATCH 2/2] my.hackernews.dogsheep: use UTC datetime + minor cleanup

---
 my/hackernews/dogsheep.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/my/hackernews/dogsheep.py b/my/hackernews/dogsheep.py
index aac0b1a9..de6c58da 100644
--- a/my/hackernews/dogsheep.py
+++ b/my/hackernews/dogsheep.py
@@ -4,18 +4,19 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from datetime import datetime
+from datetime import datetime, timezone
 from pathlib import Path
 from typing import Iterator, Sequence, Optional
 
-from my.core import get_files, Paths, Res
+from my.core import get_files, Paths, Res, datetime_aware
 from my.core.sqlite import sqlite_connection
+import my.config
 
-from my.config import hackernews as user_config
+from .common import hackernews_link
 
 
 @dataclass
-class config(user_config.dogsheep):
+class config(my.config.hackernews.dogsheep):
     # paths[s]/glob to the dogsheep database
     export_path: Paths
 
@@ -26,24 +27,23 @@ def inputs() -> Sequence[Path]:
     return get_files(config.export_path)
 
 
-from .common import hackernews_link
-
 # TODO not sure if worth splitting into Comment and Story?
 @dataclass(unsafe_hash=True)
 class Item:
     id: str
     type: str
-    # TODO is it urc??
-    created: datetime
+    created: datetime_aware  # checked and it's utc
     title: Optional[str]  # only present for Story
-    text_html: Optional[str] # should be present for Comment and might for Story
-    url: Optional[str] # might be present for Story
+    text_html: Optional[str]  # should be present for Comment and might for Story
+    url: Optional[str]  # might be present for Story
     # todo process 'deleted'? fields?
     # todo process 'parent'?
 
     @property
     def permalink(self) -> str:
         return hackernews_link(self.id)
+
+
 # TODO hmm kinda annoying that permalink isn't getting serialized
 # maybe won't be such a big problem if we used hpi query directly on objects, without jsons?
 # so we could just take .permalink thing
@@ -56,7 +56,7 @@ def items() -> Iterator[Res[Item]]:
             yield Item(
                 id=r['id'],
                 type=r['type'],
-                created=datetime.fromtimestamp(r['time']),
+                created=datetime.fromtimestamp(r['time'], tz=timezone.utc),
                 title=r['title'],
                 # todo hmm maybe a method to strip off html tags would be nice
                 text_html=r['text'],