From 24d371102ef2fcd615f4d29f73e1201f8ab5c21c Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Tue, 5 Nov 2024 20:07:31 +0100 Subject: [PATCH 1/3] add `octet-stream` to decompressor --- src/fundus/scraping/url.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py index 52ac6459..38c56dc4 100644 --- a/src/fundus/scraping/url.py +++ b/src/fundus/scraping/url.py @@ -22,6 +22,7 @@ class _ArchiveDecompressor: def __init__(self): self.archive_mapping: Dict[str, Callable[[bytes], bytes]] = { + "application/octet-stream": self._decompress_gzip, "application/x-gzip": self._decompress_gzip, "gzip": self._decompress_gzip, } From bca23089e1c18f0d8c59c10cd034874d78de2282 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Tue, 5 Nov 2024 20:54:13 +0100 Subject: [PATCH 2/3] detect compression format for `octet-stream` --- src/fundus/scraping/url.py | 42 +++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py index 38c56dc4..71a368e5 100644 --- a/src/fundus/scraping/url.py +++ b/src/fundus/scraping/url.py @@ -2,6 +2,7 @@ import itertools from abc import ABC, abstractmethod from dataclasses import dataclass, field +from enum import Enum from functools import cached_property from typing import Callable, ClassVar, Dict, Iterable, Iterator, List, Optional @@ -13,20 +14,51 @@ from requests import ConnectionError, HTTPError from fundus.logging import create_logger +from fundus.parser.utility import generic_nodes_to_text from fundus.scraping.filter import URLFilter, inverse from fundus.scraping.session import _default_header, session_handler logger = create_logger(__name__) +class CompressionFormats(Enum): + GZIP = 1 + BZ2 = 2 + ZIP = 3 + LZMA = 4 + + class _ArchiveDecompressor: def __init__(self): self.archive_mapping: Dict[str, Callable[[bytes], bytes]] = { - "application/octet-stream": self._decompress_gzip, + "application/octet-stream": self._decompress_octet_stream, "application/x-gzip": self._decompress_gzip, "gzip": self._decompress_gzip, } + @staticmethod + def identify_compression_format(compressed_content: bytes) -> Optional[CompressionFormats]: + if compressed_content.startswith(b"\x1f\x8b"): + return CompressionFormats.GZIP + elif compressed_content.startswith(b"\x42\x5a"): + return CompressionFormats.BZ2 + elif compressed_content.startswith(b"PK\x03\x04"): + return CompressionFormats.ZIP + elif compressed_content.startswith(b"\x28\xb5\x2f\xfd"): + return CompressionFormats.LZMA + return None + + def _decompress_octet_stream(self, compressed_content: bytes) -> bytes: + if (compression_format := self.identify_compression_format(compressed_content)) is None: + logger.debug(f"Could not identify compression format") + raise NotImplementedError + + if compression_format == CompressionFormats.GZIP: + return self._decompress_gzip(compressed_content) + else: + logger.debug(f"Decompression not implemented for {compression_format.name!r} format") + raise NotImplementedError + @staticmethod def _decompress_gzip(compressed_content: bytes) -> bytes: decompressed_content = gzip.decompress(compressed_content) @@ -116,12 +148,16 @@ def yield_recursive(sitemap_url: str) -> Iterator[str]: return content = response.content if (content_type := response.headers.get("content-type")) in self._decompressor.supported_file_formats: - content = self._decompressor.decompress(content, content_type) + try: + content = self._decompressor.decompress(content, content_type) + except NotImplementedError: + logger.warning(f"No matching decompression found for {sitemap_url!r}") + return if not content: logger.warning(f"Warning! Empty sitemap at {sitemap_url!r}") return tree = lxml.html.fromstring(content) - urls = [node.text_content() for node in self._url_selector(tree)] + urls = generic_nodes_to_text(self._url_selector(tree), normalize=True) if urls: for new_url in reversed(urls) if self.reverse else urls: yield new_url From 74dfc8e85f91ae95a11cfb7e9e5682c665b8e649 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Fri, 8 Nov 2024 13:01:34 +0100 Subject: [PATCH 3/3] add missing decompression functions --- src/fundus/scraping/url.py | 79 +++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py index 71a368e5..0b23fe0f 100644 --- a/src/fundus/scraping/url.py +++ b/src/fundus/scraping/url.py @@ -1,8 +1,9 @@ +import bz2 import gzip import itertools +import lzma from abc import ABC, abstractmethod from dataclasses import dataclass, field -from enum import Enum from functools import cached_property from typing import Callable, ClassVar, Dict, Iterable, Iterator, List, Optional @@ -21,48 +22,64 @@ logger = create_logger(__name__) -class CompressionFormats(Enum): - GZIP = 1 - BZ2 = 2 - ZIP = 3 - LZMA = 4 +class CompressionFormat: + def __init__( + self, name: str, decompression: Optional[Callable[[bytes], bytes]] = None, *, byte_mask: Optional[bytes] = None + ) -> None: + self.name = name + self.decompression = decompression + self.byte_mask = byte_mask + + def match(self, compressed_content: bytes) -> bool: + if self.byte_mask: + return compressed_content.startswith(self.byte_mask) + return False + + def __call__(self, compressed_content: bytes) -> bytes: + if self.decompression is None: + raise NotImplementedError(f"Decompression not implemented for {self.name!r}") + return self.decompression(compressed_content) + + def __repr__(self): + if self.decompression is None: + return f"{self.name} -- Not implemented" + return self.name + + +class CompressionFormats: + GZIP = CompressionFormat("gzip", gzip.decompress, byte_mask=b"\x1f\x8b") + BZ2 = CompressionFormat("bz2", bz2.decompress, byte_mask=b"\x42\x5a") + ZIP = CompressionFormat("zip", byte_mask=b"PK\x03\x04") + LZMA = CompressionFormat("lzma", lzma.decompress, byte_mask=b"\x28\xb5\x2f\xfd") + + @classmethod + def iter_formats(cls) -> Iterator[CompressionFormat]: + for obj in cls.__dict__.values(): + if isinstance(obj, CompressionFormat): + yield obj + + @classmethod + def identify(cls, compressed_content: bytes) -> Optional[CompressionFormat]: + for compression_format in cls.iter_formats(): + if compression_format.match(compressed_content): + return compression_format + return None class _ArchiveDecompressor: def __init__(self): self.archive_mapping: Dict[str, Callable[[bytes], bytes]] = { "application/octet-stream": self._decompress_octet_stream, - "application/x-gzip": self._decompress_gzip, - "gzip": self._decompress_gzip, + "application/x-gzip": CompressionFormats.GZIP, + "gzip": CompressionFormats.GZIP, } - @staticmethod - def identify_compression_format(compressed_content: bytes) -> Optional[CompressionFormats]: - if compressed_content.startswith(b"\x1f\x8b"): - return CompressionFormats.GZIP - elif compressed_content.startswith(b"\x42\x5a"): - return CompressionFormats.BZ2 - elif compressed_content.startswith(b"PK\x03\x04"): - return CompressionFormats.ZIP - elif compressed_content.startswith(b"\x28\xb5\x2f\xfd"): - return CompressionFormats.LZMA - return None - def _decompress_octet_stream(self, compressed_content: bytes) -> bytes: - if (compression_format := self.identify_compression_format(compressed_content)) is None: + if (compression_format := CompressionFormats.identify(compressed_content)) is None: logger.debug(f"Could not identify compression format") raise NotImplementedError - if compression_format == CompressionFormats.GZIP: - return self._decompress_gzip(compressed_content) - else: - logger.debug(f"Decompression not implemented for {compression_format.name!r} format") - raise NotImplementedError - - @staticmethod - def _decompress_gzip(compressed_content: bytes) -> bytes: - decompressed_content = gzip.decompress(compressed_content) - return decompressed_content + return compression_format(compressed_content) def decompress(self, content: bytes, file_format: "str") -> bytes: decompress_function = self.archive_mapping[file_format]