Deprecated Flag for Uncrawlable Publishers #534

Merged · 12 commits · Jun 19, 2024
9 changes: 8 additions & 1 deletion docs/5_advanced_topics.md
@@ -3,6 +3,7 @@
* [How to search for publishers](#how-to-search-for-publishers)
* [Using `search()`](#using-search)
* [Saving the crawled articles](#saving-the-crawled-articles)
* [Working with deprecated publishers](#working-with-deprecated-publishers)

# Advanced Topics

@@ -28,4 +29,10 @@ crawler = Crawler(fitting_publishers)

To save all crawled articles to a file use the `save_to_file` parameter of the `crawl` method.
When given a path, the crawled articles will be saved as a JSON list using the
[default article serialization](3_the_article_class.md#saving-an-article) and `UTF-8` encoding.
[default article serialization](3_the_article_class.md#saving-an-article) and `UTF-8` encoding.
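As a minimal sketch (the US collection and a plain string path are chosen here purely for illustration):

```python
from fundus import Crawler, PublisherCollection

crawler = Crawler(PublisherCollection.us)

# Iterate the generator to drive the crawl; once it finishes, the collected
# articles are written to "articles.json" as a UTF-8 encoded JSON list.
for article in crawler.crawl(max_articles=10, save_to_file="articles.json"):
    print(article.title)
```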

## Working with deprecated publishers

When we notice that a publisher is uncrawlable for whatever reason, we mark it with a deprecated flag.
This is mostly used internally, since the default value of the `Crawler`'s `ignore_deprecated` flag is `False`.
You can alter this behaviour by setting the `ignore_deprecated` flag when instantiating the `Crawler`, as sketched below.
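A minimal sketch of opting out of deprecated publishers (the US collection is used for illustration; `Reuters` and `WashingtonTimes` are assumed to be its deprecated entries):

```python
from fundus import Crawler, PublisherCollection

# With ignore_deprecated=True, publishers carrying the deprecated flag are
# filtered out before any crawling starts.
crawler = Crawler(PublisherCollection.us, ignore_deprecated=True)

for article in crawler.crawl(max_articles=5):
    print(article.title)
```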
8 changes: 6 additions & 2 deletions docs/supported_publishers.md
@@ -1331,7 +1331,9 @@
<code>Reuters</code>
</td>
<td>
<div>Reuters</div>
<div>
<strike>Reuters</strike>
</div>
</td>
<td>
<a href="https://www.reuters.com/">
@@ -1458,7 +1460,9 @@
<code>WashingtonTimes</code>
</td>
<td>
<div>The Washington Times</div>
<div>
<strike>The Washington Times</strike>
</div>
</td>
<td>
<a href="https://www.washingtontimes.com/">
19 changes: 17 additions & 2 deletions scripts/generate_tables.py
@@ -6,7 +6,20 @@
import lxml.etree
import lxml.html
import more_itertools
from lxml.html.builder import CLASS, CODE, DIV, SPAN, TABLE, TBODY, TD, TH, THEAD, TR, A
from lxml.html.builder import (
CLASS,
CODE,
DIV,
SPAN,
STRIKE,
TABLE,
TBODY,
TD,
TH,
THEAD,
TR,
A,
)

from fundus import PublisherCollection
from fundus import __development_base_path__ as root_path
@@ -23,7 +36,9 @@ def __call__(self, spec: PublisherEnum) -> lxml.html.HtmlElement:

column_mapping: Dict[str, ColumnFactory] = {
"Class": lambda spec: TD(CODE(spec.name)),
"Source": lambda spec: TD(DIV(f"{spec.publisher_name}")),
"Source": lambda spec: TD(DIV(f"{spec.publisher_name}"))
if not spec.deprecated
else TD(DIV(STRIKE(f"{spec.publisher_name}"))),
"URL": lambda spec: TD(A(SPAN(urlparse(spec.domain).netloc), href=spec.domain)),
"Missing Attributes": lambda spec: (
TD(*[CODE(a) for a in sorted(attributes)])
4 changes: 3 additions & 1 deletion scripts/publisher_coverage.py
@@ -36,7 +36,9 @@ def main() -> None:
# skip publishers providing no sources for forward crawling
print(f"⏩ SKIPPED: {publisher_name!r} - No sources defined")
continue

if publisher.deprecated: # type: ignore[attr-defined]
print(f"⏩ SKIPPED: {publisher_name!r} - Deprecated")
continue
crawler: Crawler = Crawler(publisher, delay=0.4)

timed_next = timeout(next, time=20, silent=True)
2 changes: 2 additions & 0 deletions src/fundus/publishers/base_objects.py
@@ -19,6 +19,7 @@ class PublisherSpec:
query_parameter: Dict[str, str] = field(default_factory=dict)
url_filter: Optional[URLFilter] = field(default=None)
request_header: Dict[str, str] = field(default_factory=dict)
deprecated: bool = False


class PublisherEnumMeta(EnumMeta):
@@ -47,6 +48,7 @@ def __init__(self, spec: PublisherSpec):
self.query_parameter = spec.query_parameter
self.url_filter = spec.url_filter
self.request_header = spec.request_header
self.deprecated = spec.deprecated

# we define the dict here manually instead of using default dict so that we can control
# the order in which sources are proceeded.
2 changes: 2 additions & 0 deletions src/fundus/publishers/us/__init__.py
@@ -145,6 +145,7 @@ class US(PublisherEnum):
Sitemap("https://www.washingtontimes.com/sitemap-entries.xml"),
],
parser=WashingtonTimesParser,
deprecated=True,
)

WashingtonPost = PublisherSpec(
@@ -179,6 +180,7 @@ class US(PublisherEnum):
NewsMap("https://www.reuters.com/arc/outboundfeeds/news-sitemap-index/?outputType=xml"),
],
parser=ReutersParser,
deprecated=True,
)

OccupyDemocrats = PublisherSpec(
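With the `deprecated` field attached to the publisher specs, the status can also be inspected programmatically; a small sketch (assuming the enum members expose `deprecated` and `publisher_name` as wired up in `base_objects.py` above):

```python
from fundus import PublisherCollection

# List every US publisher that is currently marked as deprecated.
for publisher in PublisherCollection.us:
    if publisher.deprecated:
        print(f"{publisher.publisher_name} is deprecated")
```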
36 changes: 25 additions & 11 deletions src/fundus/scraping/crawler.py
@@ -128,10 +128,9 @@ def remove_query_parameters_from_url(url: str) -> str:

class CrawlerBase(ABC):
def __init__(self, *publishers: Publisher):
if not publishers:
raise ValueError("param <publishers> of <Crawler.__init__> has to be non empty")

self.publishers: List[PublisherEnum] = list(set(more_itertools.collapse(publishers)))
if not self.publishers:
raise ValueError("param <publishers> of <Crawler.__init__> must include at least one publisher.")

@abstractmethod
def _build_article_iterator(
@@ -221,8 +220,7 @@ def build_extraction_filter() -> Optional[ExtractionFilter]:
fitting_publishers = self.publishers

article_count = 0
if save_to_file is not None:
crawled_articles = list()
crawled_articles = []

try:
for article in self._build_article_iterator(
@@ -252,6 +250,7 @@ def __init__(
self,
*publishers: Publisher,
restrict_sources_to: Optional[List[Type[URLSource]]] = None,
ignore_deprecated: bool = False,
delay: Optional[Union[float, Delay]] = 1.0,
threading: bool = True,
):
@@ -266,19 +265,34 @@ def __init__(

Args:
*publishers (Union[PublisherEnum, Type[PublisherEnum], PublisherCollectionMeta]): The publishers to crawl.
restrict_sources_to (Optional[List[Type[URLSource]]]): Lets you restrict
sources defined in the publisher specs. If set, only articles from given source types
will be yielded.
restrict_sources_to (Optional[List[Type[URLSource]]]): Lets you restrict sources defined in the publisher
specs. If set, only articles from given source types will be yielded.
ignore_deprecated (bool): If set to True, Publishers marked as deprecated will be skipped.
Defaults to False.
delay (Optional[Union[float, Delay]]): Set a delay time in seconds to be used between article
downloads. You can set a delay directly using float or any callable satisfying the Delay
protocol. If set to None, no delay will be used between batches. See Delay for more
information. Defaults to None.
threading (bool): If True, the crawler will use a dedicated thread per publisher, if set to False,
the crawler will use a single thread for all publishers and load articles successively. This will greatly
influence performance, and it is highly recommended to use a threaded crawler. Deafults to True.
the crawler will use a single thread for all publishers and load articles successively. This will
greatly influence performance, and it is highly recommended to use a threaded crawler.
Defaults to True.
"""

super().__init__(*publishers)
def filter_publishers(publisher: PublisherEnum) -> bool:
if publisher.deprecated and ignore_deprecated:
logger.warning(f"Skipping deprecated publisher: {publisher.publisher_name}")
return False
return True

fitting_publishers = list(filter(filter_publishers, more_itertools.collapse(publishers)))
if not fitting_publishers:
raise ValueError(
f"All given publishers are deprecated. Either set <ignore_deprecated> to `False` or "
f"include at least one publisher that isn't deprecated."
)

super().__init__(*fitting_publishers)

self.restrict_sources_to = restrict_sources_to
self.delay = delay
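A short sketch of the resulting constructor behaviour (hypothetical usage; assumes `Reuters` is still marked deprecated):

```python
from fundus import Crawler, PublisherCollection

# Passing only deprecated publishers while also asking to ignore them leaves
# nothing to crawl, so the constructor raises a ValueError.
try:
    Crawler(PublisherCollection.us.Reuters, ignore_deprecated=True)
except ValueError as error:
    print(error)
```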
6 changes: 2 additions & 4 deletions tests/test_crawler.py
@@ -5,16 +5,14 @@

class TestPipeline:
def test_crawler_with_empty_collection(self, collection_with_empty_publisher_enum):
crawler = Crawler(collection_with_empty_publisher_enum)
assert crawler.publishers == list()
assert next(crawler.crawl(), None) is None
with pytest.raises(ValueError):
Crawler(collection_with_empty_publisher_enum)

with pytest.raises(ValueError):
Crawler(*collection_with_empty_publisher_enum)

def test_crawler_with_collection(self, collection_with_valid_publisher_enum):
crawler = Crawler(*collection_with_valid_publisher_enum)
publisher = collection_with_valid_publisher_enum.pub.value
assert len(crawler.publishers) == 1

def test_crawler_with_two_collections(