flairNLP · addie9800 · Jun 19, 2024 · Jun 12, 2024 · Jun 12, 2024 · Jun 16, 2024
diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md
@@ -1331,7 +1331,9 @@
         <code>Reuters</code>
       </td>
       <td>
-        <div>Reuters</div>
+        <div>
+          <strike>Reuters</strike>
+        </div>
       </td>
       <td>
         <a href="https://www.reuters.com/">
@@ -1458,7 +1460,9 @@
         <code>WashingtonTimes</code>
       </td>
       <td>
-        <div>The Washington Times</div>
+        <div>
+          <strike>The Washington Times</strike>
+        </div>
       </td>
       <td>
         <a href="https://www.washingtontimes.com/">

diff --git a/scripts/generate_tables.py b/scripts/generate_tables.py
@@ -6,7 +6,20 @@
 import lxml.etree
 import lxml.html
 import more_itertools
-from lxml.html.builder import CLASS, CODE, DIV, SPAN, TABLE, TBODY, TD, TH, THEAD, TR, A
+from lxml.html.builder import (
+    CLASS,
+    CODE,
+    DIV,
+    SPAN,
+    STRIKE,
+    TABLE,
+    TBODY,
+    TD,
+    TH,
+    THEAD,
+    TR,
+    A,
+)
 
 from fundus import PublisherCollection
 from fundus import __development_base_path__ as root_path
@@ -23,7 +36,9 @@ def __call__(self, spec: PublisherEnum) -> lxml.html.HtmlElement:
 
 column_mapping: Dict[str, ColumnFactory] = {
     "Class": lambda spec: TD(CODE(spec.name)),
-    "Source": lambda spec: TD(DIV(f"{spec.publisher_name}")),
+    "Source": lambda spec: TD(DIV(f"{spec.publisher_name}"))
+    if not spec.deprecated
+    else TD(DIV(STRIKE(f"{spec.publisher_name}"))),
     "URL": lambda spec: TD(A(SPAN(urlparse(spec.domain).netloc), href=spec.domain)),
     "Missing Attributes": lambda spec: (
         TD(*[CODE(a) for a in sorted(attributes)])

diff --git a/scripts/publisher_coverage.py b/scripts/publisher_coverage.py
@@ -36,7 +36,9 @@ def main() -> None:
                 # skip publishers providing no sources for forward crawling
                 print(f"⏩  SKIPPED: {publisher_name!r} - No sources defined")
                 continue
-
+            if publisher.deprecated:  # type: ignore[attr-defined]
+                print(f"⏩  SKIPPED: {publisher_name!r} - Deprecated")
+                continue
             crawler: Crawler = Crawler(publisher, delay=0.4)
 
             timed_next = timeout(next, time=20, silent=True)

diff --git a/src/fundus/publishers/base_objects.py b/src/fundus/publishers/base_objects.py
@@ -19,6 +19,7 @@ class PublisherSpec:
     query_parameter: Dict[str, str] = field(default_factory=dict)
     url_filter: Optional[URLFilter] = field(default=None)
     request_header: Dict[str, str] = field(default_factory=dict)
+    deprecated: bool = False
 
 
 class PublisherEnumMeta(EnumMeta):
@@ -47,6 +48,7 @@ def __init__(self, spec: PublisherSpec):
         self.query_parameter = spec.query_parameter
         self.url_filter = spec.url_filter
         self.request_header = spec.request_header
+        self.deprecated = spec.deprecated
 
         # we define the dict here manually instead of using default dict so that we can control
         # the order in which sources are proceeded.

diff --git a/src/fundus/publishers/us/__init__.py b/src/fundus/publishers/us/__init__.py
@@ -145,6 +145,7 @@ class US(PublisherEnum):
             Sitemap("https://www.washingtontimes.com/sitemap-entries.xml"),
         ],
         parser=WashingtonTimesParser,
+        deprecated=True,
     )
 
     WashingtonPost = PublisherSpec(
@@ -179,6 +180,7 @@ class US(PublisherEnum):
             NewsMap("https://www.reuters.com/arc/outboundfeeds/news-sitemap-index/?outputType=xml"),
         ],
         parser=ReutersParser,
+        deprecated=True,
     )
 
     OccupyDemocrats = PublisherSpec(

diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py
@@ -204,7 +204,10 @@ def build_extraction_filter() -> Optional[ExtractionFilter]:
                         f"is(are) not supported by {publisher.publisher_name}. Skipping publisher"
                     )
                 else:
-                    fitting_publishers.append(publisher)
+                    if not publisher.deprecated:
+                        fitting_publishers.append(publisher)
+                    else:
+                        logger.warning(f"Skipping deprecated publisher: {publisher.publisher_name}")
 
             if not fitting_publishers:
                 logger.error(