Skip to content

Commit

Permalink
instagram: initial module for android app data (direct messages)
Browse files Browse the repository at this point in the history
  • Loading branch information
karlicoss committed Feb 2, 2022
1 parent 823668c commit e309531
Show file tree
Hide file tree
Showing 3 changed files with 161 additions and 4 deletions.
8 changes: 4 additions & 4 deletions my/bumble/android.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def inputs() -> Sequence[Path]:
return get_files(config.export_path)


@dataclass
@dataclass(unsafe_hash=True)
class Person:
user_id: str
user_name: str
Expand All @@ -39,13 +39,13 @@ class _BaseMessage:
text: str


@dataclass
@dataclass(unsafe_hash=True)
class _Message(_BaseMessage):
conversation_id: str
reply_to_id: Optional[str]


@dataclass
@dataclass(unsafe_hash=True)
class Message(_BaseMessage):
person: Person
reply_to: Optional[Message]
Expand Down Expand Up @@ -118,4 +118,4 @@ def messages() -> Iterator[Res[Message]]:
id2msg[m.id] = m
yield m
continue
assert False # should be unreachable
assert False, type(x) # should be unreachable
5 changes: 5 additions & 0 deletions my/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,8 @@ class organization:
class bumble:
class android:
export_path: Paths


class instagram:
class android:
export_path: Paths
152 changes: 152 additions & 0 deletions my/instagram/android.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
"""
Bumble data from Android app database (in =/data/data/com.instagram.android/databases/direct.db=)
"""
from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime
from typing import Iterator, Sequence, Optional, Dict

from more_itertools import unique_everseen

from my.config import instagram as user_config


from ..core import Paths
@dataclass
class config(user_config.android):
# paths[s]/glob to the exported sqlite databases
export_path: Paths


from ..core import get_files
from pathlib import Path
def inputs() -> Sequence[Path]:
return get_files(config.export_path)


@dataclass(unsafe_hash=True)
class User:
id: str
username: str
full_name: str


# todo not sure about order of fields...
@dataclass
class _BaseMessage:
id: str
created: datetime
text: str
thread_id: str


@dataclass(unsafe_hash=True)
class _Message(_BaseMessage):
user_id: str
# TODO ugh. can't figure out if dms have proper replies?
# reply_to_id: Optional[str]


@dataclass(unsafe_hash=True)
class Message(_BaseMessage):
user: User
# TODO could also extract Thread objec? not sure if useful
# reply_to: Optional[Message]


from ..core import Json
def _parse_message(j: Json) -> Optional[_Message]:
id = j['item_id']
t = j['item_type']
tid = j['thread_key']['thread_id']
uid = j['user_id']
# TODO not sure if utc??
created = datetime.fromtimestamp(int(j['timestamp']) / 1_000_000)
text: str
if t == 'text':
text = j['text']
elif t == 'reel_share':
# TODO include reel_share -> media??
# the problem is that the links are deliberately expired by instagram..
text = j['reel_share']['text']
elif t == 'action_log':
# something like "X liked message" -- hardly useful?
return None
else:
raise RuntimeError(f"{id}: {t} isn't handled yet")

return _Message(
id=id,
created=created,
text=text,
thread_id=tid,
user_id=uid,
# reply_to_id='FIXME',
)


import json
from typing import Union
from ..core.error import Res
import sqlite3
from ..core.sqlite import sqlite_connect_immutable
def _entities() -> Iterator[Res[Union[User, _Message]]]:
# NOTE: definitely need to merge multiple, app seems to recycle old messages
# TODO: hmm hard to guarantee timestamp ordering when we use synthetic input data...
for f in inputs():
with sqlite_connect_immutable(f) as db:

for row in db.execute(f'SELECT user_id, thread_info FROM threads'):
(self_uid, js,) = row
# ugh wtf?? no easier way to extract your own user id/name??
yield User(
id=str(self_uid),
full_name='You',
username='you',
)
j = json.loads(js)
for r in j['recipients']:
yield User(
id=str(r['id']), # for some reason it's int in the db
full_name=r['full_name'],
username=r['username'],
)

for row in db.execute(f'SELECT message FROM messages ORDER BY timestamp'):
# eh, seems to contain everything in json?
(js,) = row
j = json.loads(js)
try:
m = _parse_message(j)
if m is not None:
yield m
except Exception as e:
yield e


def messages() -> Iterator[Res[Message]]:
# TODO would be nicer to use a decorator for unique_everseen?
id2user: Dict[str, User] = {}
for x in unique_everseen(_entities()):
if isinstance(x, Exception):
yield x
continue
if isinstance(x, User):
id2user[x.id] = x
continue
if isinstance(x, _Message):
try:
user = id2user[x.user_id]
except Exception as e:
yield e
continue
yield Message(
id=x.id,
created=x.created,
text=x.text,
thread_id=x.thread_id,
user=user,
)
continue
assert False, type(x) # should not happen

0 comments on commit e309531

Please sign in to comment.