Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecated Flag for Uncrawlable Publishers #534

Merged
merged 12 commits into from
Jun 19, 2024
8 changes: 6 additions & 2 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -1331,7 +1331,9 @@
<code>Reuters</code>
</td>
<td>
<div>Reuters</div>
<div>
<strike>Reuters</strike>
</div>
</td>
<td>
<a href="https://www.reuters.com/">
Expand Down Expand Up @@ -1458,7 +1460,9 @@
<code>WashingtonTimes</code>
</td>
<td>
<div>The Washington Times</div>
<div>
<strike>The Washington Times</strike>
</div>
</td>
<td>
<a href="https://www.washingtontimes.com/">
Expand Down
19 changes: 17 additions & 2 deletions scripts/generate_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,20 @@
import lxml.etree
import lxml.html
import more_itertools
from lxml.html.builder import CLASS, CODE, DIV, SPAN, TABLE, TBODY, TD, TH, THEAD, TR, A
from lxml.html.builder import (
CLASS,
CODE,
DIV,
SPAN,
STRIKE,
TABLE,
TBODY,
TD,
TH,
THEAD,
TR,
A,
)

from fundus import PublisherCollection
from fundus import __development_base_path__ as root_path
Expand All @@ -23,7 +36,9 @@ def __call__(self, spec: PublisherEnum) -> lxml.html.HtmlElement:

column_mapping: Dict[str, ColumnFactory] = {
"Class": lambda spec: TD(CODE(spec.name)),
"Source": lambda spec: TD(DIV(f"{spec.publisher_name}")),
"Source": lambda spec: TD(DIV(f"{spec.publisher_name}"))
if not spec.deprecated
else TD(DIV(STRIKE(f"{spec.publisher_name}"))),
"URL": lambda spec: TD(A(SPAN(urlparse(spec.domain).netloc), href=spec.domain)),
"Missing Attributes": lambda spec: (
TD(*[CODE(a) for a in sorted(attributes)])
Expand Down
4 changes: 3 additions & 1 deletion scripts/publisher_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ def main() -> None:
# skip publishers providing no sources for forward crawling
print(f"⏩ SKIPPED: {publisher_name!r} - No sources defined")
continue

if publisher.deprecated: # type: ignore[attr-defined]
print(f"⏩ SKIPPED: {publisher_name!r} - Deprecated")
continue
crawler: Crawler = Crawler(publisher, delay=0.4)

timed_next = timeout(next, time=20, silent=True)
Expand Down
2 changes: 2 additions & 0 deletions src/fundus/publishers/base_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class PublisherSpec:
query_parameter: Dict[str, str] = field(default_factory=dict)
url_filter: Optional[URLFilter] = field(default=None)
request_header: Dict[str, str] = field(default_factory=dict)
deprecated: bool = False


class PublisherEnumMeta(EnumMeta):
Expand Down Expand Up @@ -47,6 +48,7 @@ def __init__(self, spec: PublisherSpec):
self.query_parameter = spec.query_parameter
self.url_filter = spec.url_filter
self.request_header = spec.request_header
self.deprecated = spec.deprecated

# we define the dict here manually instead of using default dict so that we can control
# the order in which sources are processed.
Expand Down
2 changes: 2 additions & 0 deletions src/fundus/publishers/us/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ class US(PublisherEnum):
Sitemap("https://www.washingtontimes.com/sitemap-entries.xml"),
],
parser=WashingtonTimesParser,
deprecated=True,
)

WashingtonPost = PublisherSpec(
Expand Down Expand Up @@ -179,6 +180,7 @@ class US(PublisherEnum):
NewsMap("https://www.reuters.com/arc/outboundfeeds/news-sitemap-index/?outputType=xml"),
],
parser=ReutersParser,
deprecated=True,
)

OccupyDemocrats = PublisherSpec(
Expand Down
5 changes: 4 additions & 1 deletion src/fundus/scraping/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,10 @@ def build_extraction_filter() -> Optional[ExtractionFilter]:
f"is(are) not supported by {publisher.publisher_name}. Skipping publisher"
)
else:
fitting_publishers.append(publisher)
if not publisher.deprecated:
fitting_publishers.append(publisher)
else:
logger.warning(f"Skipping deprecated publisher: {publisher.publisher_name}")
MaxDall marked this conversation as resolved.
Show resolved Hide resolved

if not fitting_publishers:
logger.error(
Expand Down