Skip to content

Commit

Permalink
fix breaking changes of pocket api
Browse files Browse the repository at this point in the history
Recently, Pocket started to:
- return all items, including previously deleted ones?
  we filter them out in DAL now since they don't have any actual data
- return the created_at timestamp for highlights in ISO format ending with Z
  (however, this seems to be an incorrect timestamp -- not sure if it used to be correct previously)

Seems like the only mention of any changes is this https://x.com/smarachefr/status/1839571167373369640

Not sure if there is anything else, but at least this now results in the same data as before
  • Loading branch information
karlicoss committed Sep 30, 2024
1 parent a609880 commit 8aaed5a
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 35 deletions.
48 changes: 34 additions & 14 deletions src/pockexport/dal.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,43 @@
#!/usr/bin/env python3
from __future__ import annotations

import json
from datetime import datetime
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterator, NamedTuple, Sequence
from typing import Iterator, NamedTuple, Sequence

from .exporthelpers import dal_helper
from .exporthelpers.dal_helper import Json, PathIsh, pathify
from .exporthelpers.dal_helper import (
Json,
PathIsh,
datetime_aware,
fromisoformat,
pathify,
)

# TODO FIXME are times in utc? not mentioned anywhere...

class Highlight(NamedTuple):
json: Any
json: Json

@property
def text(self) -> str:
return self.json['quote']

@property
def created(self) -> datetime:
return datetime.strptime(self.json['created_at'], '%Y-%m-%d %H:%M:%S')
def created(self) -> datetime_aware:
created_at_s = self.json['created_at']
if created_at_s.endswith('Z'):
# FIXME not convinced timestamp is correct here?
# tested with item highlighted at 2024-09-30 at 00:53 UTC and it appeared as 2024-09-29T19:53:35.000Z in export??
return fromisoformat(created_at_s)
else:
# older format (pre September 2024)
dt = datetime.strptime(self.json['created_at'], '%Y-%m-%d %H:%M:%S')
return dt.replace(tzinfo=timezone.utc)


class Article(NamedTuple):
json: Any
json: Json

@property
def url(self) -> str:
Expand All @@ -41,8 +56,8 @@ def pocket_link(self) -> str:
return 'https://app.getpocket.com/read/' + self.json['item_id']

@property
def added(self) -> datetime:
return datetime.fromtimestamp(int(self.json['time_added']))
def added(self) -> datetime_aware:
return datetime.fromtimestamp(int(self.json['time_added']), tz=timezone.utc)

@property
def highlights(self) -> Sequence[Highlight]:
Expand All @@ -64,7 +79,12 @@ def raw(self) -> Json:
return json.loads(last.read_text())

def articles(self) -> Iterator[Article]:
yield from map(Article, self.raw()['list'].values())
for j in self.raw()['list'].values():
# means "item should be deleted" according to api?? https://getpocket.com/developer/docs/v3/retrieve
# started happening around September 2024... in this case there is no data inside except item id
if j['status'] == '2':
continue
yield Article(j)


def _get_test_sources() -> Sequence[PathIsh]:
Expand All @@ -79,10 +99,10 @@ def test() -> None:
articles = list(dal.articles())
assert len(articles) == 10
for a in articles:
assert a.url is not None
assert a.title is not None
assert a.url is not None
assert a.title is not None
assert a.pocket_link is not None
assert a.added is not None
assert a.added is not None
for h in a.highlights:
h.text # noqa: B018
h.created # noqa: B018
Expand Down
76 changes: 56 additions & 20 deletions src/pockexport/export.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,20 @@
#!/usr/bin/env python3
from __future__ import annotations

import json

import pocket # type: ignore

from .exporthelpers.export_helper import Json
from .exporthelpers.logging_helper import make_logger

## useful for debugging
# from http.client import HTTPConnection
# HTTPConnection.debuglevel = 1
###

logger = make_logger(__name__, level='debug')


class Exporter:
def __init__(self, *args, **kwargs) -> None:
Expand All @@ -15,25 +27,47 @@ def export_json(self):
def get(self, **kwargs):
pass

# apparently no pagination?
res = get(
self.api,
images=1,
videos=1,
tags=1,
rediscovery=1,
annotations=1,
authors=1,
itemOptics=1,
meta=1,
posts=1,
total=1,
forceaccount=1,
state='all',
sort='newest',
detailType='complete',
)
return res[0]
all_items: dict[str, Json] = {}

first_res: Json | None = None
total: int | None = None

while True:
offset = len(all_items)
logger.debug(f'retrieving from {offset=} (expected {total=})')
res, _headers = get(
self.api,
images=1,
videos=1,
tags=1,
rediscovery=1,
annotations=1,
authors=1,
itemOptics=1,
meta=1,
posts=1,
total=1,
forceaccount=1,
offset=offset,
count=30, # max count per request according to api docs
state='all',
sort='newest',
detailType='complete',
)
if first_res is None:
first_res = res

assert res.get('error') is None, res # just in case
total = int(res['total'])

new_items: dict[str, Json] = res['list']
if len(new_items) == 0:
break

all_items.update(new_items)

first_res['list'] = all_items # eh, hacky, but not sure what's a better way
return first_res


def get_json(**params):
Expand All @@ -54,13 +88,15 @@ def main() -> None:

def make_parser():
from .exporthelpers.export_helper import Parser, setup_parser

parser = Parser('Export your personal Pocket data, *including highlights* as JSON.')
setup_parser(
parser=parser,
params=['consumer_key', 'access_token'],
extra_usage='''
You can also import ~pockexport.export~ as a module and call ~get_json~ function directly to get raw JSON.
''')
''',
)
return parser


Expand Down
2 changes: 1 addition & 1 deletion src/pockexport/exporthelpers
Submodule exporthelpers updated 1 files
+13 −0 dal_helper.py

0 comments on commit 8aaed5a

Please sign in to comment.