Add export feature for Articles #530

Merged 6 commits on Jun 13, 2024
README.md (2 changes: 1 addition & 1 deletion)
@@ -131,7 +131,7 @@ We provide **quick tutorials** to get you started with the library:
2. [**Tutorial 2: How to crawl articles from CC-NEWS**](docs/2_crawl_from_cc_news.md)
3. [**Tutorial 3: The Article Class**](docs/3_the_article_class.md)
4. [**Tutorial 4: How to filter articles**](docs/4_how_to_filter_articles.md)
5. [**Tutorial 5: How to search for publishers**](docs/5_how_to_search_for_publishers.md)
5. [**Tutorial 5: How to search for publishers**](docs/5_advanced_topics)

If you wish to contribute check out these tutorials:
1. [**How to contribute**](docs/how_to_contribute.md)
docs/3_the_article_class.md (20 changes: 19 additions & 1 deletion)
@@ -5,6 +5,7 @@
* [The articles' body](#the-articles-body)
* [HTML](#html)
* [Language detection](#language-detection)
* [Saving an Article](#saving-an-article)

# The Article class

@@ -45,7 +46,7 @@ You can find those attributes under the [**supported publisher**](supported_publ

Sometimes an attribute listed in the attribute guidelines isn't supported at all by a specific parser.
You can find this information under the `Missing Attributes` tab within the supported publisher tables.
There is also a built-in search mechanic you can learn about [here](5_how_to_search_for_publishers)
There is also a built-in search mechanic you can learn about [here](5_advanced_topics)

## The articles' body

@@ -137,4 +138,21 @@ Should print this:
en
```

## Saving an Article

If you want to save an article in JSON format, the `Article` class provides a `to_json` method that returns a JSON-serializable dictionary.
The method accepts attribute names as strings to specify which attributes should be serialized.
By default, all extracted attributes and the `plaintext` attribute of `Article` are included in the serialization.

````python
for article in crawler.crawl(max_articles=10):

# use the default serialization
article_json = article.to_json()
# or only serialize specific attributes
article_json = article.to_json("title", "plaintext", "lang")
````
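
The returned dictionary can then be written to disk with Python's `json` module. A minimal sketch, assuming a `crawler` instance as in the snippet above; the file name `article.json` is only illustrative:

````python
import json

for article in crawler.crawl(max_articles=1):
    # to_json() returns a plain dictionary, so json.dump can write it directly
    with open("article.json", "w", encoding="utf-8") as file:
        json.dump(article.to_json(), file, ensure_ascii=False, indent=4)
````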

To save all articles at once, using the default serialization and only specifying a location, refer to [this section](5_advanced_topics.md#saving-the-crawled-articles).

In the [**next section**](4_how_to_filter_articles.md) we will show you how to filter articles.
docs/4_how_to_filter_articles.md (2 changes: 1 addition & 1 deletion)
@@ -196,4 +196,4 @@ crawler = Crawler(PublisherCollection.us, restrict_sources_to=[NewsMap])
The `crawl()` method supports functionality to filter out articles with URLs previously encountered in this run.
You can alter this behavior by setting the `only_unique` parameter.

In the [next section](5_how_to_search_for_publishers.md) we will show you how to search through publishers in the `PublisherCollection`.
In the [next section](5_advanced_topics) we will show you how to search through publishers in the `PublisherCollection`.
docs/5_advanced_topics.md (renamed from docs/5_how_to_search_for_publishers.md)
@@ -2,12 +2,15 @@

* [How to search for publishers](#how-to-search-for-publishers)
* [Using `search()`](#using-search)
* [Saving the crawled articles](#saving-the-crawled-articles)

# How to search for publishers
# Advanced Topics

This tutorial will show you how to search for specific publishers in the `PublisherCollection`.
This tutorial covers further options, such as searching for specific publishers in the `PublisherCollection` and saving the crawled articles.

## Using `search()`
## How to search for publishers

### Using `search()`

There are quite a few differences between the publishers, especially in the attributes the underlying parser supports.
You can search through the collection to get only publishers fitting your use case by utilizing the `search()` method.
@@ -20,3 +23,9 @@ from fundus import Crawler, PublisherCollection, NewsMap
fitting_publishers = PublisherCollection.us.search(attributes=["topics"], source_types=[NewsMap])
crawler = Crawler(fitting_publishers)
````

## Saving the crawled articles

To save all crawled articles to a file, use the `save_to_file` parameter of the `crawl()` method.
When given a path, the crawled articles will be saved as a JSON list using the
[default article serialization](3_the_article_class.md#saving-an-article) and `UTF-8` encoding.
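
A minimal sketch of how this could look; the `PublisherCollection.us` choice and the path `articles.json` are only examples:

````python
from fundus import Crawler, PublisherCollection

crawler = Crawler(PublisherCollection.us)

# articles are yielded as usual and, once the crawl finishes, additionally
# written to the given path as a JSON list
for article in crawler.crawl(max_articles=10, save_to_file="articles.json"):
    print(article.title)
````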
src/fundus/parser/data.py (3 changes: 3 additions & 0 deletions)
@@ -49,6 +49,9 @@ def __init__(self, lds: Iterable[Dict[str, Any]] = ()):
else:
self.add_ld(ld)

def serialize(self) -> Dict[str, Any]:
return {attribute: value for attribute, value in self.__dict__.items() if "__" not in attribute}

def add_ld(self, ld: Dict[str, Any]) -> None:
if ld_type := ld.get("@type"):
if isinstance(ld_type, list):
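For context, the `to_json` method added to `Article` below delegates to this `serialize()` method whenever an attribute value provides one. Requesting the linked-data mapping explicitly might therefore look like this (a sketch, assuming an already crawled `article`):

````python
# "ld" and "meta" are excluded from the default serialization but can be
# requested by name; to_json() then calls serialize() on the returned mapping
article_json = article.to_json("title", "ld")
````
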
src/fundus/scraping/article.py (40 changes: 40 additions & 0 deletions)
@@ -11,6 +11,7 @@
from fundus.logging import create_logger
from fundus.parser import ArticleBody
from fundus.scraping.html import HTML
from fundus.utils.serialization import JSONVal, is_jsonable

logger = create_logger(__name__)

@@ -39,8 +40,11 @@ def from_extracted(cls, html: HTML, extracted: Dict[str, Any], exception: Option
)

article: Article = cls(html, exception, **dict(extracted_validated))
unvalidated_attributes: Set[str] = set()
for attribute, value in extracted_unvalidated:
object.__setattr__(article, attribute, value) # Sets attributes on a frozen dataclass
unvalidated_attributes.add(attribute)
object.__setattr__(article, "_unvalidated_attributes", unvalidated_attributes)

return article

@@ -69,6 +73,42 @@ def lang(self) -> Optional[str]:
def __getattr__(self, item: object) -> Any:
raise AttributeError(f"{type(self).__name__!r} object has no attribute {str(item)!r}")

def to_json(self, *attributes: str) -> Dict[str, JSONVal]:
"""Converts article object into a JSON serializable dictionary.

One can specify which attributes should be included by passing attribute names as parameters.
Default: title, plaintext, authors, publishing_date, topics, free_access + unvalidated attributes

Args:
*attributes: The attributes to serialize. Default: see docstring.

Returns:
A json serializable dictionary
"""

# default value for attributes
if not attributes:
validated = ["title", "plaintext", "authors", "publishing_date", "topics", "free_access"]
unvalidated = list(self.__dict__.get("_unvalidated_attributes", set()) - {"meta", "ld"})
attributes = tuple(validated + unvalidated)

serialization = {}
for attribute in attributes:
if not hasattr(self, attribute):
continue
value = getattr(self, attribute)

if hasattr(value, "serialize"):
value = value.serialize()
elif isinstance(value, datetime):
value = str(value)
elif not is_jsonable(value):
raise TypeError(f"Attribute {attribute!r} of type {type(value)!r} is not JSON serializable")

serialization[attribute] = value

return serialization

def __str__(self):
# the subsequent indent here is a bit wacky, but textwrapper.dedent won't work with tabs, so we have to use
# whitespaces instead.
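As the docstring above describes, callers can either rely on the default attribute set or name attributes explicitly; values that neither provide a `serialize()` method nor pass `is_jsonable` raise a `TypeError`. A brief sketch, assuming a crawled `article`:

````python
# default set: the validated attributes plus any unvalidated extras (minus "meta"/"ld")
default_json = article.to_json()

# explicit selection; attribute names the article does not have are skipped via hasattr()
selected_json = article.to_json("title", "publishing_date")
````
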
src/fundus/scraping/crawler.py (41 changes: 29 additions & 12 deletions)
@@ -1,6 +1,7 @@
from __future__ import annotations

import gzip
import json
import os
import re
from abc import ABC, abstractmethod
@@ -9,6 +10,7 @@
from multiprocessing import Manager
from multiprocessing.context import TimeoutError
from multiprocessing.pool import MapResult, Pool, ThreadPool
from pathlib import Path
from queue import Empty, Queue
from typing import (
Any,
@@ -148,6 +150,7 @@ def crawl(
only_complete: Union[bool, ExtractionFilter] = Requires("title", "body", "publishing_date"),
url_filter: Optional[URLFilter] = None,
only_unique: bool = True,
save_to_file: Union[None, str, Path] = None,
) -> Iterator[Article]:
"""Yields articles from initialized scrapers

@@ -169,6 +172,8 @@
URLs before download. This filter applies on both requested and responded URL. Defaults to None.
only_unique (bool): If set to True, articles yielded will be unique on the responded URL.
Always returns the first encountered article. Defaults to True.
save_to_file (Union[None, str, Path]): If set, the crawled articles will be collected and saved to the
specified file as a JSON list. Defaults to None.

Returns:
Iterator[Article]: An iterator yielding objects of type Article.
@@ -216,18 +221,30 @@ def build_extraction_filter() -> Optional[ExtractionFilter]:
fitting_publishers = self.publishers

article_count = 0
for article in self._build_article_iterator(
tuple(fitting_publishers), error_handling, build_extraction_filter(), url_filter
):
url_without_query_parameters = remove_query_parameters_from_url(article.html.responded_url)
if not only_unique or url_without_query_parameters not in response_cache:
response_cache.add(url_without_query_parameters)
article_count += 1
yield article
if article_count == max_articles:
break

session_handler.close_current_session()
if save_to_file is not None:
crawled_articles = list()

try:
for article in self._build_article_iterator(
tuple(fitting_publishers), error_handling, build_extraction_filter(), url_filter
):
url_without_query_parameters = remove_query_parameters_from_url(article.html.responded_url)
if not only_unique or url_without_query_parameters not in response_cache:
response_cache.add(url_without_query_parameters)
article_count += 1
if save_to_file is not None:
crawled_articles.append(article)
yield article
if article_count == max_articles:
break
finally:
session_handler.close_current_session()
if save_to_file is not None:
with open(save_to_file, "w", encoding="utf-8") as file:
logger.info(f"Writing crawled articles to {save_to_file!r}")
file.write(
json.dumps(crawled_articles, default=lambda o: o.to_json(), ensure_ascii=False, indent=4)
)


class Crawler(CrawlerBase):
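Since the file is written via `json.dumps` with `default=lambda o: o.to_json()` and `ensure_ascii=False`, the result is a plain JSON list that can be read back without Fundus. A sketch, assuming the illustrative `articles.json` path from the tutorial:

````python
import json

with open("articles.json", "r", encoding="utf-8") as file:
    articles = json.load(file)  # a list of dictionaries, one per crawled article

for entry in articles:
    print(entry["title"])  # "title" is part of the default serialization
````
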
src/fundus/utils/serialization.py (14 changes: 14 additions & 0 deletions)
@@ -0,0 +1,14 @@
import json
from typing import Dict, Sequence, Union, cast

from typing_extensions import TypeAlias

JSONVal: TypeAlias = Union[None, bool, str, float, int, Sequence["JSONVal"], Dict[str, "JSONVal"]]


def is_jsonable(x):
try:
json.dumps(x)
return True
except (TypeError, OverflowError):
return False
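
For illustration (not part of the diff): the helper simply attempts `json.dumps`, so plain containers pass while objects such as `datetime` do not, which is why `Article.to_json` converts datetimes to strings first.

````python
from datetime import datetime

print(is_jsonable({"title": "example", "topics": ["a", "b"]}))  # True
print(is_jsonable(datetime(2024, 6, 13)))  # False: json.dumps raises TypeError
````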