Extract parsing logic from "retrievers".
For #205.
lemon24 committed Jan 25, 2021
1 parent 40b626e commit 8180be6
Showing 3 changed files with 64 additions and 21 deletions.
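In short: a mounted "parser" (to become a "retriever" as part of #205) no longer fetches *and* parses a feed. It only retrieves the raw stream, wrapped in a RetrieveResult and managed by a context manager; the actual parsing moves into Parser.__call__. A hedged before/after sketch of the contract, using the definitions introduced in the diff below (the local ParsedFeed is a stand-in for reader._types.ParsedFeed, and the Old/New names are ours):

# Sketch (not part of the commit): old vs. new shape of a mounted parser.
from typing import BinaryIO, Callable, ContextManager, Mapping, NamedTuple, Optional

class ParsedFeed: ...  # stand-in for reader._types.ParsedFeed

class RetrieveResult(NamedTuple):
    file: BinaryIO
    http_etag: Optional[str] = None
    http_last_modified: Optional[str] = None
    headers: Optional[Mapping[str, str]] = None

# Before: one callable fetched and parsed, returning a finished ParsedFeed.
OldParserType = Callable[[str, Optional[str], Optional[str]], ParsedFeed]

# After: the callable only retrieves, as a context manager that yields an
# open stream plus HTTP caching metadata; Parser.__call__ does the parsing.
NewParserType = Callable[
    [str, Optional[str], Optional[str]], ContextManager[RetrieveResult]
]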
54 changes: 41 additions & 13 deletions src/reader/_parser.py
@@ -9,8 +9,13 @@
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Any
+from typing import BinaryIO
+from typing import Callable
+from typing import ContextManager
 from typing import Iterable
+from typing import Iterator
+from typing import Mapping
 from typing import NamedTuple
 from typing import Optional
 from typing import Tuple
 
@@ -24,7 +29,6 @@
 from ._types import EntryData
 from ._types import FeedData
 from ._types import ParsedFeed
-from ._types import ParserType
 from .exceptions import _NotModified
 from .exceptions import ParseError
 from .types import Content
@@ -130,7 +134,11 @@ def _process_feed(
     updated, _ = _get_updated_published(d.feed, is_rss)
 
     feed = FeedData(
-        url, updated, d.feed.get('title'), d.feed.get('link'), d.feed.get('author'),
+        url,
+        updated,
+        d.feed.get('title'),
+        d.feed.get('link'),
+        d.feed.get('author'),
     )
     # This must be a list, not a generator expression,
     # otherwise the user may get a ParseError when calling
@@ -153,11 +161,26 @@ def parse_feed(
     # https://github.com/lemon24/reader/issues/125
     # https://github.com/lemon24/reader/issues/157
     result = feedparser.parse(
-        *args, resolve_relative_uris=True, sanitize_html=True, **kwargs,
+        *args,
+        resolve_relative_uris=True,
+        sanitize_html=True,
+        **kwargs,
     )
     return _process_feed(url, result)
 
 
+class RetrieveResult(NamedTuple):
+    file: BinaryIO
+    http_etag: Optional[str] = None
+    http_last_modified: Optional[str] = None
+    headers: Optional[Mapping[str, str]] = None
+
+
+ParserType = Callable[
+    [str, Optional[str], Optional[str]], ContextManager[RetrieveResult]
+]
+
+
 class Parser:
 
     user_agent = (
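Any context manager with this signature now plugs into mount_parser. For illustration only, a hypothetical in-memory retriever satisfying the new ParserType (the bytes_retriever name and the canned RSS payload are ours, not part of the commit):

from contextlib import contextmanager
from io import BytesIO
from typing import Iterator, Optional

from reader._parser import RetrieveResult  # added by this commit

@contextmanager
def bytes_retriever(
    url: str,
    http_etag: Optional[str] = None,
    http_last_modified: Optional[str] = None,
) -> Iterator[RetrieveResult]:
    # A real retriever would open a file or HTTP response here; the
    # stream must stay readable until the caller's with block exits.
    file = BytesIO(b"<rss version='2.0'><channel><title>t</title></channel></rss>")
    try:
        yield RetrieveResult(file)
    finally:
        file.close()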
@@ -186,7 +209,14 @@ def __call__(
         http_etag: Optional[str] = None,
         http_last_modified: Optional[str] = None,
     ) -> ParsedFeed:
-        return self.get_parser(url)(url, http_etag, http_last_modified)
+        parser = self.get_parser(url)
+        with parser(url, http_etag, http_last_modified) as result:
+            feed, entries = parse_feed(
+                url,
+                result.file,
+                response_headers=result.headers,
+            )
+        return ParsedFeed(feed, entries, result.http_etag, result.http_last_modified)
 
     def make_session(self) -> SessionWrapper:
         session = SessionWrapper(hooks=self.session_hooks.copy())
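Note that parse_feed runs inside the retriever's with block: result.file is only guaranteed readable there, while the plain fields of the RetrieveResult NamedTuple remain usable after the block. A small sketch of that lifetime rule, using the hypothetical bytes_retriever from above:

with bytes_retriever('http://example.com/feed') as result:
    data = result.file.read()   # OK: the stream is open here
etag = result.http_etag         # OK: plain data outlives the block
# result.file.read() here would raise ValueError: I/O on closed file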
@@ -223,17 +253,16 @@ def __post_init__(self) -> None:
         # give feed_root checks a chance to fail early
         self._normalize_url('known-good-feed-url')
 
-    def __call__(self, url: str, *args: Any, **kwargs: Any) -> ParsedFeed:
+    @contextmanager
+    def __call__(self, url: str, *args: Any, **kwargs: Any) -> Iterator[RetrieveResult]:
         try:
             normalized_url = self._normalize_url(url)
         except ValueError as e:
             raise ParseError(url, message=str(e)) from None
 
         with wrap_exceptions(url, "while reading feed"):
             with open(normalized_url, 'rb') as file:
-                feed, entries = parse_feed(url, file)
-
-        return ParsedFeed(feed, entries)
+                yield RetrieveResult(file)
 
     def _normalize_url(self, url: str) -> str:
         path = _extract_path(url)
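FileParser.__call__ is now a generator wrapped in contextlib.contextmanager: execution suspends at yield, so the open(...) block stays entered for exactly as long as the caller's with block runs, and the file is closed afterwards even on error. The pattern in isolation, with hypothetical names:

from contextlib import contextmanager
from typing import BinaryIO, Iterator

@contextmanager
def open_for_reading(path: str) -> Iterator[BinaryIO]:
    with open(path, 'rb') as file:  # entered when the caller's with starts
        yield file                  # the caller's with body runs here
    # leaving the inner with closes the file, even if the caller raised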
@@ -429,12 +458,13 @@ class HTTPParser:
 
     make_session: _MakeSession
 
+    @contextmanager
     def __call__(
         self,
         url: str,
         http_etag: Optional[str] = None,
         http_last_modified: Optional[str] = None,
-    ) -> ParsedFeed:
+    ) -> Iterator[RetrieveResult]:
         request_headers = {'Accept': feedparser.http.ACCEPT_HEADER, 'A-IM': 'feed'}
 
         # TODO: maybe share the session in the parser?
@@ -466,12 +496,10 @@ def __call__(
         response.raw.decode_content = True
 
         with wrap_exceptions(url, "while reading feed"), response:
-            feed, entries = parse_feed(
-                url, response.raw, response_headers=response_headers,
+            yield RetrieveResult(
+                response.raw, http_etag, http_last_modified, response_headers
            )
 
-        return ParsedFeed(feed, entries, http_etag, http_last_modified)
-
 
 def default_parser(feed_root: Optional[str] = None) -> Parser:
     parser = Parser()
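The HTTP retriever follows the same shape: it yields inside "with ... response:", so the caller consumes response.raw before the response is closed. For comparison, a hypothetical stripped-down requests-based retriever in the new shape (no conditional-GET handling, error wrapping, or session reuse, all of which the real one has):

import requests
from contextlib import contextmanager
from typing import Iterator, Optional

from reader._parser import RetrieveResult  # added by this commit

@contextmanager
def simple_http_retriever(
    url: str,
    http_etag: Optional[str] = None,
    http_last_modified: Optional[str] = None,
) -> Iterator[RetrieveResult]:
    response = requests.get(url, stream=True)
    try:
        response.raise_for_status()
        response.raw.decode_content = True  # undo gzip, as in the diff above
        yield RetrieveResult(
            response.raw,
            response.headers.get('ETag'),
            response.headers.get('Last-Modified'),
            response.headers,
        )
    finally:
        response.close()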
4 changes: 0 additions & 4 deletions src/reader/_types.py
@@ -1,6 +1,5 @@
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Callable
 from typing import Generic
 from typing import Iterable
 from typing import NamedTuple
@@ -328,6 +327,3 @@ def from_args(
             raise ValueError("updates_enabled should be one of (None, False, True)")
 
         return cls(feed_url, tag_filter, broken, updates_enabled)
-
-
-ParserType = Callable[[str, Optional[str], Optional[str]], ParsedFeed]
27 changes: 23 additions & 4 deletions tests/test_parser.py
@@ -1,5 +1,6 @@
 import io
 import logging
+from contextlib import contextmanager
 from unittest.mock import MagicMock
 
 import feedparser
@@ -11,6 +12,7 @@
 from reader._parser import default_parser
 from reader._parser import FileParser
 from reader._parser import parse_feed
+from reader._parser import RetrieveResult
 from reader._parser import SessionWrapper
 from reader.exceptions import _NotModified
 from reader.exceptions import ParseError
@@ -251,7 +253,10 @@ def callback(request, context):
 
 @pytest.mark.parametrize('feed_type', ['rss', 'atom'])
 def test_parse_sends_etag_last_modified(
-    parse, make_http_get_headers_url, data_dir, feed_type,
+    parse,
+    make_http_get_headers_url,
+    data_dir,
+    feed_type,
 ):
     feed_url = make_http_get_headers_url(data_dir.join('full.' + feed_type))
     parse(feed_url, 'etag', 'last_modified')
@@ -553,11 +558,25 @@ def test_default_response_headers(
     assert mock.call_args[1]['response_headers']['Content-Type'] == 'text/xml'
 
 
-def test_parsers(parse):
+def test_parsers(parse, monkeypatch):
     parse.parsers.clear()
 
-    parse.mount_parser('http://', lambda *args: ('generic', *args))
-    parse.mount_parser('http://specific.com', lambda *args: ('specific', *args))
+    def make_parser(name):
+        @contextmanager
+        def parser(*args):
+            # temporary, until we split parsers and retrievers
+            monkeypatch.setattr(
+                'reader._parser.parse_feed', lambda *a, **kw: (name, args[0])
+            )
+            try:
+                yield RetrieveResult(name, *args[1:])
+            finally:
+                monkeypatch.undo()
+
+        return parser
+
+    parse.mount_parser('http://', make_parser('generic'))
+    parse.mount_parser('http://specific.com', make_parser('specific'))
 
     assert parse('http://generic.com/', 'etag', None) == (
         'generic',
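The mounted test doubles can no longer be plain lambdas returning a tuple; they must be context managers yielding a RetrieveResult. Each fake retriever therefore monkeypatches reader._parser.parse_feed to report which parser handled the URL, and calls monkeypatch.undo() on exit so the 'generic' and 'specific' fakes don't clobber each other's patch. The undo() semantics in isolation (a standalone pytest sketch, not from the commit):

import os

def test_monkeypatch_undo(monkeypatch):
    original = os.getcwd
    monkeypatch.setattr('os.getcwd', lambda: '/fake')
    assert os.getcwd() == '/fake'
    monkeypatch.undo()   # restores immediately, not just at test teardown
    assert os.getcwd is original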
