From eb1f96f75fe1003eb810f832ae646a2ce52d46aa Mon Sep 17 00:00:00 2001 From: lemon24 Date: Thu, 21 Oct 2021 21:35:44 +0300 Subject: [PATCH] Allow Entry.updated to be None instead of defaulting to first_updated (typing fails). For #239/#183. --- src/reader/_parser.py | 3 -- src/reader/_storage.py | 17 +++++++---- src/reader/_types.py | 5 +++- src/reader/_updater.py | 48 ++++++++++++------------------ tests/data/full.rss.py | 7 +++-- tests/fakeparser.py | 4 +-- tests/test_reader.py | 66 ++++++++++++++++++++++++++++++++++-------- tests/test_storage.py | 2 +- 8 files changed, 94 insertions(+), 58 deletions(-) diff --git a/src/reader/_parser.py b/src/reader/_parser.py index 42e9d8de..9ac6b709 100644 --- a/src/reader/_parser.py +++ b/src/reader/_parser.py @@ -529,9 +529,6 @@ def convert(key: str) -> Any: updated = convert('updated_parsed') published = convert('published_parsed') - if published and not updated and is_rss: - updated, published = published, None - return updated, published diff --git a/src/reader/_storage.py b/src/reader/_storage.py index 8f18be4c..1bb50fa6 100644 --- a/src/reader/_storage.py +++ b/src/reader/_storage.py @@ -618,7 +618,11 @@ def _get_entries_for_update_n_queries( context = dict(feed_url=feed_url, id=id) row = self.db.execute( """ - SELECT updated, data_hash, data_hash_changed + SELECT + updated, + published, + data_hash, + data_hash_changed FROM entries WHERE feed = :feed_url AND id = :id; @@ -645,6 +649,7 @@ def _get_entries_for_update_one_query( SELECT entries.id IS NOT NULL, entries.updated, + entries.published, entries.data_hash, entries.data_hash_changed FROM input @@ -1505,15 +1510,15 @@ def make_recent_last_select(id_prefix: str = 'entries.') -> Sequence[Any]: coalesce ( CASE WHEN - coalesce(entries.published, entries.updated) + coalesce(entries.published, entries.updated, entries.first_updated) >= :recent_threshold THEN entries.first_updated_epoch END, - entries.published, entries.updated + entries.published, entries.updated, entries.first_updated ) """, ), - ('kinda_published', 'coalesce(published, updated)'), + ('kinda_published', 'coalesce(published, updated, first_updated)'), f'{id_prefix}feed', 'last_updated', ('negative_feed_order', '- feed_order'), @@ -1537,7 +1542,7 @@ def make_by_kinda_cte(recent: bool) -> str: kinda_first_updated = 'first_updated_epoch' sign = '>=' else: - kinda_first_updated = 'coalesce(published, updated)' + kinda_first_updated = 'coalesce(published, updated, first_updated)' sign = '<' return f""" @@ -1546,7 +1551,7 @@ def make_by_kinda_cte(recent: bool) -> str: id, last_updated, {kinda_first_updated} as kinda_first_updated, - coalesce(published, updated) as kinda_published, + coalesce(published, updated, first_updated) as kinda_published, - feed_order as negative_feed_order FROM entries WHERE kinda_published {sign} :recent_threshold diff --git a/src/reader/_types.py b/src/reader/_types.py index ec0f3f57..272c486f 100644 --- a/src/reader/_types.py +++ b/src/reader/_types.py @@ -179,7 +179,10 @@ class EntryForUpdate(NamedTuple): """Update-relevant information about an existing entry, from Storage.""" #: The date the entry was last updated, according to the entry. - updated: datetime + updated: Optional[datetime] + + #: The date the entry was published, according to the entry. + published: Optional[datetime] #: The hash of the corresponding EntryData. hash: Optional[bytes] diff --git a/src/reader/_updater.py b/src/reader/_updater.py index 1b4bcc9f..8495ff44 100644 --- a/src/reader/_updater.py +++ b/src/reader/_updater.py @@ -121,49 +121,37 @@ def should_update_feed(self, new: FeedData, entries_to_update: bool) -> bool: return False - def compute_entry_updated( - self, id: str, new: Optional[datetime], old: Optional[datetime] - ) -> Optional[datetime]: + def should_update_entry( + self, new: EntryData[Optional[datetime]], old: Optional[EntryForUpdate] + ) -> Tuple[Optional[EntryData[datetime]], bool]: def debug(msg: str, *args: Any) -> None: self.log.debug("entry %r: " + msg, id, *args) - if not new: + if not old: + # FIXME: this should say added + debug("entry updated") + return new, False + + new_ref_dt = new.updated or new.published + old_ref_dt = old.updated or old.published + + if not new_ref_dt: debug("has no updated, updating but not changing updated") debug("entry added/updated") - return old or self.now + return new, False if self.stale: debug("feed marked as stale, updating anyway") debug("entry added/updated") - return new + return new, False - if old and new <= old: + if old_ref_dt and new_ref_dt <= old_ref_dt: debug( "entry not updated, skipping (old updated %s, new updated %s)", old, new ) - return None - - debug("entry updated") - return new - - def should_update_entry( - self, new: EntryData[Optional[datetime]], old: Optional[EntryForUpdate] - ) -> Tuple[Optional[EntryData[datetime]], bool]: - - updated = self.compute_entry_updated(new.id, new.updated, old and old.updated) - - if updated: - return EntryData(**new._replace(updated=updated).__dict__), False - - # At this point, new should have .updated is not None. - # otherwise, compute_entry_updated() would have returned - # either old.updated or self.now. - # Remove this when it stops doing that, as proposed in this comment: - # https://github.com/lemon24/reader/issues/179#issuecomment-663840297 - assert new.updated is not None - - # If old is None, compute_entry_updated() returned something. - assert old is not None + else: + debug("entry updated") + return new, False # Check if the entry content actually changed: # https://github.com/lemon24/reader/issues/179 diff --git a/tests/data/full.rss.py b/tests/data/full.rss.py index f92530d0..4d08437b 100644 --- a/tests/data/full.rss.py +++ b/tests/data/full.rss.py @@ -20,11 +20,11 @@ EntryData( feed_url=feed.url, id='7bd204c6-1655-4c27-aeee-53f933c5395f', - updated=datetime.datetime(2009, 9, 6, 16, 20), + updated=None, title='Example entry', link='http://www.example.com/blog/post/1', author='Example editor', - published=None, + published=datetime.datetime(2009, 9, 6, 16, 20), summary='Here is some text containing an interesting description.', content=( # the text/plain type comes from feedparser @@ -40,7 +40,8 @@ EntryData( feed_url=feed.url, id='00000000-1655-4c27-aeee-00000000', - updated=datetime.datetime(2009, 9, 6, 0, 0, 0), + updated=None, + published=datetime.datetime(2009, 9, 6, 0, 0, 0), title='Example entry, again', ), ] diff --git a/tests/fakeparser.py b/tests/fakeparser.py index 02e7e940..2f64956c 100644 --- a/tests/fakeparser.py +++ b/tests/fakeparser.py @@ -20,7 +20,7 @@ def _make_feed(number, updated=None, **kwargs): ) -def _make_entry(feed_number, number, updated, **kwargs): +def _make_entry(feed_number, number, updated=None, **kwargs): return EntryData( f'{feed_number}', # evals to tuple @@ -51,7 +51,7 @@ def feed(self, number, updated=None, **kwargs): self.entries.setdefault(number, OrderedDict()) return feed - def entry(self, feed_number, number, updated, **kwargs): + def entry(self, feed_number, number, updated=None, **kwargs): entry = _make_entry(feed_number, number, updated, **kwargs) self.entries[feed_number][number] = entry return entry diff --git a/tests/test_reader.py b/tests/test_reader.py index 87429a69..90885470 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -347,15 +347,7 @@ def test_update_entry_updated(reader, call_update_method, caplog, monkeypatch): def test_update_no_updated(reader, chunk_size, call_update_method): """If a feed has updated == None, it should be treated as updated. - If an entry has updated == None, it should: - - * be updated every time, but - * have updated set to the first time it was updated until it has a new - updated != None - - This means a stored entry always has updated set. - - https://github.com/lemon24/reader/issues/88 + If an entry has updated == None, it should be updated every time. """ reader._storage.chunk_size = chunk_size @@ -374,7 +366,7 @@ def test_update_no_updated(reader, chunk_size, call_update_method): assert set(reader.get_entries()) == { entry_one.as_entry( feed=feed, - updated=datetime(2010, 1, 1), + updated=None, first_updated=datetime(2010, 1, 1), last_updated=datetime(2010, 1, 1), ) @@ -391,13 +383,11 @@ def test_update_no_updated(reader, chunk_size, call_update_method): assert set(reader.get_entries()) == { entry_one.as_entry( feed=feed, - updated=datetime(2010, 1, 1), first_updated=datetime(2010, 1, 1), last_updated=datetime(2010, 1, 2), ), entry_two.as_entry( feed=feed, - updated=datetime(2010, 1, 2), first_updated=datetime(2010, 1, 2), last_updated=datetime(2010, 1, 2), ), @@ -1219,6 +1209,8 @@ def test_get_entries_recent_order( """Entries should be sorted descending by (with decreasing priority): * entry first updated (only if newer than _storage.recent_threshold) + * as of 2021-10 / #239, this test isn't covering this; + test_get_entries_recent_first_updated_order (below) is * entry published (or entry updated if published is none) * feed URL * entry last updated @@ -1283,6 +1275,56 @@ def test_get_entries_recent_order( assert [eval(e.id) for e in call_method(reader)] == [t[:2] for t in expected] +@pytest.mark.parametrize( + 'chunk_size', + [ + # the default + Storage.chunk_size, + # rough result size for this test + 1, + 2, + 3, + 8, + # unchunked query + 0, + ], +) +@with_call_entries_recent_method +def test_get_entries_recent_first_updated_order( + reader, chunk_size, pre_stuff, call_method +): + """For entries with no published/updated, + entries should be sorted by first_updated. + + """ + reader._storage.chunk_size = chunk_size + + parser = Parser() + reader._parser = parser + feed = parser.feed(1, datetime(2010, 1, 1)) + reader.add_feed(feed.url) + + reader._now = lambda: naive_datetime(2010, 1, 2) + three = parser.entry(1, 3) + reader.update_feeds() + + reader._now = lambda: naive_datetime(2010, 1, 4) + two = parser.entry(1, 2) + reader.update_feeds() + + reader._now = lambda: naive_datetime(2010, 1, 1) + four = parser.entry(1, 4, datetime(2010, 1, 1)) + reader.update_feeds() + + reader._now = lambda: naive_datetime(2010, 1, 3) + one = parser.entry(1, 1, datetime(2010, 1, 1)) + reader.update_feeds() + + pre_stuff(reader) + + assert [eval(e.id)[1] for e in call_method(reader)] == [2, 1, 3, 4] + + @pytest.mark.parametrize( 'chunk_size', [ diff --git a/tests/test_storage.py b/tests/test_storage.py index cfa898f9..f5a5e062 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -539,7 +539,7 @@ def test_get_entries_for_update(storage_cls): ) assert list(storage.get_entries_for_update([('feed', 'one'), ('feed', 'two')])) == [ - EntryForUpdate(datetime(2010, 1, 1), entry.hash, 0), + EntryForUpdate(datetime(2010, 1, 1), None, entry.hash, 0), None, ]