Deprecated Flag for Uncrawlable Publishers #534

Merged · 12 commits · Jun 19, 2024
9 changes: 8 additions & 1 deletion docs/5_advanced_topics.md
@@ -3,6 +3,7 @@
* [How to search for publishers](#how-to-search-for-publishers)
* [Using `search()`](#using-search)
* [Saving the crawled articles](#saving-the-crawled-articles)
* [Working with deprecated publishers](#working-with-deprecated-publishers)

# Advanced Topics

@@ -28,4 +29,10 @@ crawler = Crawler(fitting_publishers)

To save all crawled articles to a file use the `save_to_file` parameter of the `crawl` method.
When given a path, the crawled articles will be saved as a JSON list using the
[default article serialization](3_the_article_class.md#saving-an-article) and `UTF-8` encoding.
[default article serialization](3_the_article_class.md#saving-an-article) and `UTF-8` encoding.
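As a minimal sketch (the US collection and a plain string path are chosen here purely for illustration):

```python
from fundus import Crawler, PublisherCollection

crawler = Crawler(PublisherCollection.us)

# Iterate the generator to drive the crawl; once it finishes, the collected
# articles are written to "articles.json" as a UTF-8 encoded JSON list.
for article in crawler.crawl(max_articles=10, save_to_file="articles.json"):
    print(article.title)
```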

## Working with deprecated publishers

When we notice that a publisher is uncrawlable for whatever reason, we mark it with a deprecated flag.
This is mostly used internally, since the default value of the `Crawler`'s `ignore_deprecated` flag is `False`.
You can alter this behaviour by setting the `ignore_deprecated` flag when instantiating the `Crawler`, as sketched below.
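A minimal sketch of opting out of deprecated publishers (the US collection is used for illustration; `Reuters` and `WashingtonTimes` are assumed to be its deprecated entries):

```python
from fundus import Crawler, PublisherCollection

# With ignore_deprecated=True, publishers carrying the deprecated flag are
# filtered out before any crawling starts.
crawler = Crawler(PublisherCollection.us, ignore_deprecated=True)

for article in crawler.crawl(max_articles=5):
    print(article.title)
```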
8 changes: 6 additions & 2 deletions docs/supported_publishers.md
@@ -1331,7 +1331,9 @@
<code>Reuters</code>
</td>
<td>
<div>Reuters</div>
<div>
<strike>Reuters</strike>
</div>
</td>
<td>
<a href="https://www.reuters.com/">
@@ -1458,7 +1460,9 @@
<code>WashingtonTimes</code>
</td>
<td>
<div>The Washington Times</div>
<div>
<strike>The Washington Times</strike>
</div>
</td>
<td>
<a href="https://www.washingtontimes.com/">
19 changes: 17 additions & 2 deletions scripts/generate_tables.py
@@ -6,7 +6,20 @@
import lxml.etree
import lxml.html
import more_itertools
from lxml.html.builder import CLASS, CODE, DIV, SPAN, TABLE, TBODY, TD, TH, THEAD, TR, A
from lxml.html.builder import (
CLASS,
CODE,
DIV,
SPAN,
STRIKE,
TABLE,
TBODY,
TD,
TH,
THEAD,
TR,
A,
)

from fundus import PublisherCollection
from fundus import __development_base_path__ as root_path
@@ -23,7 +36,9 @@ def __call__(self, spec: PublisherEnum) -> lxml.html.HtmlElement:

column_mapping: Dict[str, ColumnFactory] = {
"Class": lambda spec: TD(CODE(spec.name)),
"Source": lambda spec: TD(DIV(f"{spec.publisher_name}")),
"Source": lambda spec: TD(DIV(f"{spec.publisher_name}"))
if not spec.deprecated
else TD(DIV(STRIKE(f"{spec.publisher_name}"))),
"URL": lambda spec: TD(A(SPAN(urlparse(spec.domain).netloc), href=spec.domain)),
"Missing Attributes": lambda spec: (
TD(*[CODE(a) for a in sorted(attributes)])
4 changes: 3 additions & 1 deletion scripts/publisher_coverage.py
@@ -36,7 +36,9 @@ def main() -> None:
# skip publishers providing no sources for forward crawling
print(f"⏩ SKIPPED: {publisher_name!r} - No sources defined")
continue

if publisher.deprecated: # type: ignore[attr-defined]
print(f"⏩ SKIPPED: {publisher_name!r} - Deprecated")
continue
crawler: Crawler = Crawler(publisher, delay=0.4)

timed_next = timeout(next, time=20, silent=True)
2 changes: 2 additions & 0 deletions src/fundus/publishers/base_objects.py
@@ -19,6 +19,7 @@ class PublisherSpec:
query_parameter: Dict[str, str] = field(default_factory=dict)
url_filter: Optional[URLFilter] = field(default=None)
request_header: Dict[str, str] = field(default_factory=dict)
deprecated: bool = False


class PublisherEnumMeta(EnumMeta):
@@ -47,6 +48,7 @@ def __init__(self, spec: PublisherSpec):
self.query_parameter = spec.query_parameter
self.url_filter = spec.url_filter
self.request_header = spec.request_header
self.deprecated = spec.deprecated

# we define the dict here manually instead of using default dict so that we can control
# the order in which sources are proceeded.
2 changes: 2 additions & 0 deletions src/fundus/publishers/us/__init__.py
@@ -145,6 +145,7 @@ class US(PublisherEnum):
Sitemap("https://www.washingtontimes.com/sitemap-entries.xml"),
],
parser=WashingtonTimesParser,
deprecated=True,
)

WashingtonPost = PublisherSpec(
@@ -179,6 +180,7 @@ class US(PublisherEnum):
NewsMap("https://www.reuters.com/arc/outboundfeeds/news-sitemap-index/?outputType=xml"),
],
parser=ReutersParser,
deprecated=True,
)

OccupyDemocrats = PublisherSpec(
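With the `deprecated` field attached to the publisher specs, the status can also be inspected programmatically; a small sketch (assuming the enum members expose `deprecated` and `publisher_name` as wired up in `base_objects.py` above):

```python
from fundus import PublisherCollection

# List every US publisher that is currently marked as deprecated.
for publisher in PublisherCollection.us:
    if publisher.deprecated:
        print(f"{publisher.publisher_name} is deprecated")
```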
36 changes: 25 additions & 11 deletions src/fundus/scraping/crawler.py
@@ -128,10 +128,9 @@ def remove_query_parameters_from_url(url: str) -> str:

class CrawlerBase(ABC):
def __init__(self, *publishers: Publisher):
if not publishers:
raise ValueError("param <publishers> of <Crawler.__init__> has to be non empty")

self.publishers: List[PublisherEnum] = list(set(more_itertools.collapse(publishers)))
if not self.publishers:
raise ValueError("param <publishers> of <Crawler.__init__> must include at least one publisher.")

@abstractmethod
def _build_article_iterator(
@@ -221,8 +220,7 @@ def build_extraction_filter() -> Optional[ExtractionFilter]:
fitting_publishers = self.publishers

article_count = 0
if save_to_file is not None:
crawled_articles = list()
crawled_articles = []

try:
for article in self._build_article_iterator(
@@ -252,6 +250,7 @@ def __init__(
self,
*publishers: Publisher,
restrict_sources_to: Optional[List[Type[URLSource]]] = None,
ignore_deprecated: bool = False,
delay: Optional[Union[float, Delay]] = 1.0,
threading: bool = True,
):
@@ -266,19 +265,34 @@ def __init__(

Args:
*publishers (Union[PublisherEnum, Type[PublisherEnum], PublisherCollectionMeta]): The publishers to crawl.
restrict_sources_to (Optional[List[Type[URLSource]]]): Lets you restrict
sources defined in the publisher specs. If set, only articles from given source types
will be yielded.
restrict_sources_to (Optional[List[Type[URLSource]]]): Lets you restrict sources defined in the publisher
specs. If set, only articles from given source types will be yielded.
ignore_deprecated (bool): If set to True, Publishers marked as deprecated will be skipped.
Defaults to False.
delay (Optional[Union[float, Delay]]): Set a delay time in seconds to be used between article
downloads. You can set a delay directly using float or any callable satisfying the Delay
protocol. If set to None, no delay will be used between batches. See Delay for more
information. Defaults to None.
threading (bool): If True, the crawler will use a dedicated thread per publisher, if set to False,
the crawler will use a single thread for all publishers and load articles successively. This will greatly
influence performance, and it is highly recommended to use a threaded crawler. Deafults to True.
the crawler will use a single thread for all publishers and load articles successively. This will
greatly influence performance, and it is highly recommended to use a threaded crawler.
Defaults to True.
"""

super().__init__(*publishers)
def filter_publishers(publisher: PublisherEnum) -> bool:
if publisher.deprecated and ignore_deprecated:
logger.warning(f"Skipping deprecated publisher: {publisher.publisher_name}")
return False
return True

fitting_publishers = list(filter(filter_publishers, more_itertools.collapse(publishers)))
if not fitting_publishers:
raise ValueError(
f"All given publishers are deprecated. Either set <ignore_deprecated> to `False` or "
f"include at least one publisher that isn't deprecated."
)

super().__init__(*fitting_publishers)

self.restrict_sources_to = restrict_sources_to
self.delay = delay
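A short sketch of the resulting constructor behaviour (hypothetical usage; assumes `Reuters` is still marked deprecated):

```python
from fundus import Crawler, PublisherCollection

# Passing only deprecated publishers while also asking to ignore them leaves
# nothing to crawl, so the constructor raises a ValueError.
try:
    Crawler(PublisherCollection.us.Reuters, ignore_deprecated=True)
except ValueError as error:
    print(error)
```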
6 changes: 2 additions & 4 deletions tests/test_crawler.py
@@ -5,16 +5,14 @@

class TestPipeline:
def test_crawler_with_empty_collection(self, collection_with_empty_publisher_enum):
crawler = Crawler(collection_with_empty_publisher_enum)
assert crawler.publishers == list()
assert next(crawler.crawl(), None) is None
with pytest.raises(ValueError):
Crawler(collection_with_empty_publisher_enum)

with pytest.raises(ValueError):
Crawler(*collection_with_empty_publisher_enum)

def test_crawler_with_collection(self, collection_with_valid_publisher_enum):
crawler = Crawler(*collection_with_valid_publisher_enum)
publisher = collection_with_valid_publisher_enum.pub.value
assert len(crawler.publishers) == 1

def test_crawler_with_two_collections(