Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

small zulip and dogsheep updates #331

Merged
merged 2 commits into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions my/hackernews/dogsheep.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,19 @@
from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator, Sequence, Optional

from my.core import get_files, Paths, Res
from my.core import get_files, Paths, Res, datetime_aware
from my.core.sqlite import sqlite_connection
import my.config

from my.config import hackernews as user_config
from .common import hackernews_link


@dataclass
class config(user_config.dogsheep):
class config(my.config.hackernews.dogsheep):
    # path[s]/glob to the dogsheep database
export_path: Paths

Expand All @@ -26,24 +27,23 @@ def inputs() -> Sequence[Path]:
return get_files(config.export_path)


from .common import hackernews_link

# TODO not sure if worth splitting into Comment and Story?
@dataclass(unsafe_hash=True)
class Item:
id: str
type: str
    # TODO is it utc??
created: datetime
created: datetime_aware # checked and it's utc
title: Optional[str] # only present for Story
text_html: Optional[str] # should be present for Comment and might for Story
url: Optional[str] # might be present for Story
text_html: Optional[str] # should be present for Comment and might for Story
url: Optional[str] # might be present for Story
# todo process 'deleted'? fields?
# todo process 'parent'?

@property
def permalink(self) -> str:
return hackernews_link(self.id)


# TODO hmm kinda annoying that permalink isn't getting serialized
# maybe won't be such a big problem if we used hpi query directly on objects, without jsons?
# so we could just take .permalink thing
Expand All @@ -56,7 +56,7 @@ def items() -> Iterator[Res[Item]]:
yield Item(
id=r['id'],
type=r['type'],
created=datetime.fromtimestamp(r['time']),
created=datetime.fromtimestamp(r['time'], tz=timezone.utc),
title=r['title'],
# todo hmm maybe a method to strip off html tags would be nice
text_html=r['text'],
Expand Down
65 changes: 33 additions & 32 deletions my/zulip/organization.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,37 @@
Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]]
"""
from dataclasses import dataclass
from typing import Sequence, Iterator, Dict
from datetime import datetime, timezone
from itertools import count
import json
from pathlib import Path
from typing import Sequence, Iterator, Dict, Union

from my.core import (
assert_never,
datetime_aware,
get_files,
stat,
Json,
Paths,
Res,
Stats,
)
from my.core.error import notnone
import my.config

from my.config import zulip as user_config

from ..core import Paths
@dataclass
class organization(user_config.organization):
class organization(my.config.zulip.organization):
    # path[s]/glob to the exported JSON data
export_path: Paths


from pathlib import Path
from ..core import get_files, Json
def inputs() -> Sequence[Path]:
return get_files(organization.export_path)


from datetime import datetime
# TODO: seems like export ids are kinda random..
# not sure what's the best way to figure out the last without renaming?
# could use mtime perhaps?
return get_files(organization.export_path, sort=False)


@dataclass(frozen=True)
Expand All @@ -39,16 +52,11 @@ class Sender:

# from the data, seems that subjects are completely implicit and determined by name?
# streams have ids (can extract from realm/zerver_stream), but unclear how to correlate messages/topics to streams?

@dataclass(frozen=True)
class _Message:
# todo hmm not sure what would be a good field order..
id: int
sent: datetime
# TODO hmm kinda unclear whether it uses UTC or not??
# https://github.com/zulip/zulip/blob/0c2e4eec200d986a9a020f3e9a651d27216e0e85/zerver/models.py#L3071-L3076
# it keeps it tz aware.. but not sure what happens after?
# https://github.com/zulip/zulip/blob/1dfddffc8dac744fd6a6fbfd937018074c8bb166/zproject/computed_settings.py#L151
sent: datetime_aware # double checked and they are in utc
subject: str
sender_id: int
server_id: int
Expand All @@ -60,7 +68,7 @@ class _Message:
@dataclass(frozen=True)
class Message:
id: int
sent: datetime
sent: datetime_aware
subject: str
sender: Sender
server: Server
Expand All @@ -76,23 +84,18 @@ def permalink(self) -> str:
return f'https://{self.server.string_id}.zulipchat.com/#narrow/near/{self.id}'


from typing import Union
from itertools import count
import json
from ..core import Res, assert_never
# todo cache it
def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
# TODO hmm -- not sure if max lexicographically will actually be latest?
last = max(inputs())

subdir = last.with_suffix('').stem # there is a directory inside tar.gz

# todo would be nice to switch it to unpacked dirs as well, similar to ZipPath
# I guess makes sense to have a special implementation for .tar.gz considering how common are they
import tarfile
from ..core.error import notnone

tfile = tarfile.open(last)

subdir = tfile.getnames()[0] # there is a directory inside tar file, first name should be that

with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo:
rj = json.load(fo)

Expand All @@ -114,20 +117,22 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
for j in rj['zerver_userprofile_crossrealm']: # e.g. zulip bot
yield Sender(
id=j['id'],
full_name=j['email'], # doesn't seem to have anything
full_name=j['email'], # doesn't seem to have anything
email=j['email'],
)

def _parse_message(j: Json) -> _Message:
ds = j['date_sent']
# fmt: off
return _Message(
id = j['id'],
sent = datetime.fromtimestamp(ds),
sent = datetime.fromtimestamp(ds, tz=timezone.utc),
subject = j['subject'],
sender_id = j['sender'],
server_id = server.id,
content = j['content'],
)
# fmt: on

for idx in count(start=1, step=1):
fname = f'messages-{idx:06}.json'
Expand Down Expand Up @@ -172,9 +177,5 @@ def messages() -> Iterator[Res[Message]]:
assert_never(x)


from my.core import Stats
def stats() -> Stats:
from my.core import stat
return {
**stat(messages)
}
return {**stat(messages)}
Loading