diff --git a/src/reader/_parser.py b/src/reader/_parser.py
index d88d8b25..73a6fff6 100644
--- a/src/reader/_parser.py
+++ b/src/reader/_parser.py
@@ -13,6 +13,7 @@
 from typing import BinaryIO
 from typing import Callable
 from typing import ContextManager
+from typing import Dict
 from typing import Iterable
 from typing import Iterator
 from typing import List
@@ -214,6 +215,10 @@ def accept_header(self) -> str:  # pragma: no cover
         pass
 
 
+def unparse_accept_header(values: Iterable[Tuple[str, float]]) -> str:
+    return MIMEAccept(values).to_header()
+
+
 class Parser:
 
     user_agent = (
@@ -222,7 +227,7 @@ class Parser:
 
     def __init__(self) -> None:
         self.retrievers: 'OrderedDict[str, RetriverType]' = OrderedDict()
-        self.parsers_by_mime_type: 'List[Tuple[MIMEAccept, ParserType]]' = []
+        self.parsers_by_mime_type: Dict[str, List[Tuple[float, ParserType]]] = {}
         self.session_hooks = SessionHooks()
 
     def mount_retriever(self, prefix: str, retriever: RetriverType) -> None:
@@ -240,23 +245,34 @@ def get_retriever(self, url: str) -> RetriverType:
     def get_parser_by_mime_type(
         self, mime_type: str
     ) -> Optional[ParserType]:  # pragma: no cover
-        for accept, parser in self.parsers_by_mime_type:
-            if accept.best_match([mime_type]):
-                return parser
+        parsers = self.parsers_by_mime_type.get(mime_type, ())
+        if not parsers:
+            parsers = self.parsers_by_mime_type.get('*/*', ())
+        if parsers:
+            return parsers[-1][1]
         return None
 
     def mount_parser_by_mime_type(
         self, parser: ParserType, accept_header: Optional[str] = None
     ) -> None:  # pragma: no cover
-        if accept_header:
-            accept = parse_accept_header(accept_header, MIMEAccept)
-        else:
+        if not accept_header:
             if not isinstance(parser, AwareParserType):
                 raise TypeError("unaware parser type with no accept_header given")
-            accept = parse_accept_header(parser.accept_header, MIMEAccept)
+            accept_header = parser.accept_header
+
+        for mime_type, quality in parse_accept_header(accept_header):
+            if not quality:
+                continue
 
-        parsers = self.parsers_by_mime_type
-        parsers.append((accept, parser))
+            parsers = self.parsers_by_mime_type.setdefault(mime_type, [])
+
+            existing_qualities = sorted(
+                (q, i) for i, (q, _) in enumerate(parsers) if q > quality
+            )
+            # Entries are kept in ascending quality order (ties in mount order)
+            # because lookup returns the last one; with nothing better, append.
+            index = existing_qualities[0][1] if existing_qualities else len(parsers)
+            parsers.insert(index, (quality, parser))
 
     def __call__(
         self,
@@ -270,11 +286,11 @@ def __call__(
 
         http_accept: Optional[str]
         if not parser:
-            http_accept = MIMEAccept(
-                mime_type
-                for accept, _ in self.parsers_by_mime_type
-                for mime_type in accept
-            ).to_header()
+            http_accept = unparse_accept_header(
+                (mime_type, quality)
+                for mime_type, parsers in self.parsers_by_mime_type.items()
+                for quality, _ in parsers
+            )
         else:
             # URL parsers get the default session / requests Accept (*/*);
             # later, we may use parser.accept_header, if it exists, but YAGNI
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 47720f42..090248a8 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -12,6 +12,7 @@
 from reader._parser import default_parser
 from reader._parser import feedparser_parse
 from reader._parser import FileRetriever
+from reader._parser import Parser
 from reader._parser import RetrieveResult
 from reader._parser import SessionWrapper
 from reader.exceptions import _NotModified
@@ -766,4 +767,25 @@ def test_normalize_url_errors(monkeypatch, reload_module, os_name, url, reason):
     assert reason in str(excinfo.value)
 
 
+def test_parser_mount_order():
+    p = Parser()
+    p.mount_parser_by_mime_type('P0', 'one/two;q=0.0')
+    p.mount_parser_by_mime_type('P1', 'one/two')
+    p.mount_parser_by_mime_type('P2', 'one/two;q=0.1')
+    p.mount_parser_by_mime_type('P3', 'one/two;q=0.1')
+    p.mount_parser_by_mime_type('P4', 'one/two;q=0.4')
+    p.mount_parser_by_mime_type('P5', 'one/two;q=0.5')
+    p.mount_parser_by_mime_type('P6', 'one/two;q=0.3')
+    assert p.parsers_by_mime_type == {
+        'one/two': [
+            (0.1, 'P2'),
+            (0.1, 'P3'),
+            (0.3, 'P6'),
+            (0.4, 'P4'),
+            (0.5, 'P5'),
+            (1, 'P1'),
+        ]
+    }
+
+
 # FIXME: test no mimetype
(#205)