Add export feature for Articles #530

Merged: 6 commits on Jun 13, 2024
2 changes: 1 addition & 1 deletion README.md
@@ -131,7 +131,7 @@ We provide **quick tutorials** to get you started with the library:
2. [**Tutorial 2: How to crawl articles from CC-NEWS**](docs/2_crawl_from_cc_news.md)
3. [**Tutorial 3: The Article Class**](docs/3_the_article_class.md)
4. [**Tutorial 4: How to filter articles**](docs/4_how_to_filter_articles.md)
-5. [**Tutorial 5: How to search for publishers**](docs/5_how_to_search_for_publishers.md)
+5. [**Tutorial 5: How to search for publishers**](docs/5_advanced_topics)

If you wish to contribute check out these tutorials:
1. [**How to contribute**](docs/how_to_contribute.md)
9 changes: 8 additions & 1 deletion docs/3_the_article_class.md
@@ -5,6 +5,7 @@
* [The articles' body](#the-articles-body)
* [HTML](#html)
* [Language detection](#language-detection)
* [Saving an Article](#saving-an-article)

# The Article class

@@ -45,7 +46,7 @@ You can find those attributes under the [**supported publisher**](supported_publ

Sometimes an attribute listed in the attribute guidelines isn't supported at all by a specific parser.
You can find this information under the `Missing Attributes` tab within the supported publisher tables.
-There is also a built-in search mechanic you can learn about [here](5_how_to_search_for_publishers)
+There is also a built-in search mechanic you can learn about [here](5_advanced_topics)

## The articles' body

@@ -137,4 +138,10 @@ Should print this:
en
```

## Saving an Article

In case you want to save some or all of the articles (refer to the [`save_to_file` parameter](5_advanced_topics.md#saving-the-crawled-articles) in the next section for the latter), the `Article` class provides a `to_json()` function.
> **Collaborator:** Just a minor thing: Tutorial 5 isn't the next section after this one.

The function accepts two boolean parameters, `include_ld` and `include_meta`, both `False` by default, which control whether the parsed `ld+json` and `meta` data, respectively, are included in the output.
It returns a dictionary object containing serializable attribute-value pairs.
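
For illustration, a minimal sketch of how `to_json()` could be used together with Python's `json` module (the publisher collection and `max_articles` value are only examples):

```python
import json

from fundus import Crawler, PublisherCollection

crawler = Crawler(PublisherCollection.us)

for article in crawler.crawl(max_articles=1):
    # Serialize the article, additionally including the parsed ld+json data.
    serialized = article.to_json(include_ld=True)
    # default=str is a safety net for values that are not natively JSON-encodable.
    print(json.dumps(serialized, indent=2, default=str))
```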

In the [**next section**](4_how_to_filter_articles.md) we will show you how to filter articles.
2 changes: 1 addition & 1 deletion docs/4_how_to_filter_articles.md
@@ -196,4 +196,4 @@ crawler = Crawler(PublisherCollection.us, restrict_sources_to=[NewsMap])
The `crawl()` method supports functionality to filter out articles with URLs previously encountered in this run.
You can alter this behavior by setting the `only_unique` parameter.
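
For illustration, a minimal sketch of allowing duplicate URLs (the publisher collection and `max_articles` value are only examples):

```python
from fundus import Crawler, PublisherCollection

crawler = Crawler(PublisherCollection.us)

# Yield articles even if their (query-stripped) URL was already seen in this run.
for article in crawler.crawl(max_articles=5, only_unique=False):
    print(article.html.responded_url)
```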

-In the [next section](5_how_to_search_for_publishers.md) we will show you how to search through publishers in the `PublisherCollection`.
+In the [next section](5_advanced_topics) we will show you how to search through publishers in the `PublisherCollection`.
@@ -2,12 +2,15 @@

* [How to search for publishers](#how-to-search-for-publishers)
* [Using `search()`](#using-search)
* [Saving the crawled articles](#saving-the-crawled-articles)

-# How to search for publishers
+# Advanced Topics

-This tutorial will show you how to search for specific publishers in the `PublisherCollection`.
+This tutorial will show further options such as searching for specific publishers in the `PublisherCollection` or saving the crawled articles.

-## Using `search()`
+## How to search for publishers
+
+### Using `search()`

There are quite a few differences between the publishers, especially in the attributes the underlying parser supports.
You can search through the collection to get only publishers fitting your use case by utilizing the `search()` method.
@@ -20,3 +23,8 @@ from fundus import Crawler, PublisherCollection, NewsMap
fitting_publishers = PublisherCollection.us.search(attributes=["topics"], source_types=[NewsMap])
crawler = Crawler(fitting_publishers)
````

## Saving the crawled articles

If you want to save all parsed articles to a single file, the `crawl()` function provides the `save_to_file` parameter.
It can be assigned a filepath specifying where the articles should be saved.
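
A short usage sketch (the publisher collection, `max_articles`, and the file name are purely illustrative):

```python
from fundus import Crawler, PublisherCollection

crawler = Crawler(PublisherCollection.us)

# Articles are yielded as usual; once crawling finishes, the collected
# articles are JSON-encoded and written to the given path.
for article in crawler.crawl(max_articles=10, save_to_file="us_articles.json"):
    print(article.title)
```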
3 changes: 3 additions & 0 deletions src/fundus/parser/data.py
@@ -49,6 +49,9 @@ def __init__(self, lds: Iterable[Dict[str, Any]] = ()):
else:
self.add_ld(ld)

def to_dict(self) -> Dict[str, Any]:
return {attribute: value for attribute, value in self.__dict__.items() if "__" not in attribute}

def add_ld(self, ld: Dict[str, Any]) -> None:
if ld_type := ld.get("@type"):
if isinstance(ld_type, list):
29 changes: 29 additions & 0 deletions src/fundus/scraping/article.py
@@ -10,7 +10,9 @@

from fundus.logging import create_logger
from fundus.parser import ArticleBody
from fundus.parser.data import LinkedDataMapping
from fundus.scraping.html import HTML
from fundus.utils.serialization import JSONVal

logger = create_logger(__name__)

@@ -39,8 +41,11 @@ def from_extracted(cls, html: HTML, extracted: Dict[str, Any], exception: Option
)

article: Article = cls(html, exception, **dict(extracted_validated))
unvalidated_attributes: List[str] = []
for attribute, value in extracted_unvalidated:
object.__setattr__(article, attribute, value) # Sets attributes on a frozen dataclass
unvalidated_attributes.append(attribute)
object.__setattr__(article, "_unvalidated_attributes", unvalidated_attributes)

return article

@@ -69,6 +74,30 @@ def lang(self) -> Optional[str]:
def __getattr__(self, item: object) -> Any:
raise AttributeError(f"{type(self).__name__!r} object has no attribute {str(item)!r}")

def to_json(self, include_ld: bool = False, include_meta: bool = False) -> Dict[str, JSONVal]:
data: Dict[str, JSONVal] = {
"title": self.title,
"plaintext": self.plaintext,
"authors": self.authors,
"publishing_date": str(self.publishing_date),
"topics": self.topics,
"free_access": self.free_access,
}

for attribute in self.__dict__.get("_unvalidated_attributes", []):
if ((not include_ld) and attribute == "ld") or ((not include_meta) and attribute == "meta"):
continue
value = getattr(self, attribute)

if value is None or isinstance(value, (bool, str, float, int, list, dict)):
data[attribute] = value
elif isinstance(value, LinkedDataMapping):
data[attribute] = value.to_dict()
else:
raise TypeError(f"{attribute} of type {type(value)!r} is not JSON serializable")

return data

def __str__(self):
# the subsequent indent here is a bit wacky, but textwrapper.dedent won't work with tabs, so we have to use
# whitespaces instead.
12 changes: 12 additions & 0 deletions src/fundus/scraping/crawler.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import gzip
import json
import os
import re
from abc import ABC, abstractmethod
@@ -148,6 +149,7 @@ def crawl(
only_complete: Union[bool, ExtractionFilter] = Requires("title", "body", "publishing_date"),
url_filter: Optional[URLFilter] = None,
only_unique: bool = True,
save_to_file: Optional[str] = None,
) -> Iterator[Article]:
"""Yields articles from initialized scrapers

@@ -169,6 +171,8 @@
URLs before download. This filter applies on both requested and responded URL. Defaults to None.
only_unique (bool): If set to True, articles yielded will be unique on the responded URL.
Always returns the first encountered article. Defaults to True.
save_to_file (Optional[str]): If set, the value should correspond to the filepath of the output file.
The articles will be collected in a list which is JSON-encoded and saved to the specified file.

Returns:
Iterator[Article]: An iterator yielding objects of type Article.
@@ -216,18 +220,26 @@ def build_extraction_filter() -> Optional[ExtractionFilter]:
fitting_publishers = self.publishers

article_count = 0
if save_to_file:
crawled_articles = list()
for article in self._build_article_iterator(
tuple(fitting_publishers), error_handling, build_extraction_filter(), url_filter
):
url_without_query_parameters = remove_query_parameters_from_url(article.html.responded_url)
if not only_unique or url_without_query_parameters not in response_cache:
response_cache.add(url_without_query_parameters)
article_count += 1
if save_to_file:
crawled_articles.append(article)
yield article
if article_count == max_articles:
break

session_handler.close_current_session()
if save_to_file:
with open(save_to_file, "w") as file:
logger.info(f"Writing crawled articles to {save_to_file}")
file.write(json.dumps(crawled_articles, default=lambda o: o.to_json()))


class Crawler(CrawlerBase):
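
For completeness, a hedged sketch of how a file produced via `save_to_file` could be read back (the file name matches the illustrative example above; this helper is not part of the PR):

```python
import json

# Read back a file produced via save_to_file: a JSON list of the dictionaries
# returned by Article.to_json().
with open("us_articles.json", "r", encoding="utf-8") as file:
    articles = json.load(file)

print(f"loaded {len(articles)} articles")
print(articles[0]["title"])
```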
5 changes: 5 additions & 0 deletions src/fundus/utils/serialization.py
@@ -0,0 +1,5 @@
from typing import Dict, Sequence, Union

from typing_extensions import TypeAlias

JSONVal: TypeAlias = Union[None, bool, str, float, int, Sequence["JSONVal"], Dict[str, "JSONVal"]]