feat: Non-EU packager codes download #3364

Merged
4 changes: 4 additions & 0 deletions .editorconfig
@@ -18,3 +18,7 @@ trim_trailing_whitespace = true
[*.{pm,pl,t}]
indent_style = tab
indent_size = 4

# Python files: 4 spaces indentation
[*.py]
indent_size = 4
34 changes: 34 additions & 0 deletions scripts/packager-codes/non-eu/README.md
@@ -0,0 +1,34 @@
# Non-EU Packager Codes

A Python application to download and manage non-EU packager codes, as listed on [the official page](https://webgate.ec.europa.eu/sanco/traces/output/non_eu_listsPerCountry_en.htm).

## Setup

Requires Python 3.5 or newer. Create a virtual environment with your favorite manager and activate it, for example:

```shell script
python3 -m venv ~/.pyenvs/packager-codes
source ~/.pyenvs/packager-codes/bin/activate
```

Install dependencies:

```shell script
pip install -r requirements.txt
```

## Usage

Run `python packager_codes.py --help` to see the main help.
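
For reference, it should print something like this (the exact layout may vary with the installed Click version):

```shell script
$ python packager_codes.py --help
Usage: packager_codes.py [OPTIONS] COMMAND [ARGS]...

  Manage non-EU packager code data.

Options:
  --help  Show this message and exit.

Commands:
  status  Show local data status as compared to remote source.
  sync    Sync packager code files with remote.
```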

To download or update packager code files in the directory `packager_codes_data`:

```shell script
python packager_codes.py sync
```

To display the status of the locally downloaded files compared to the remote source:

```shell script
python packager_codes.py status
```
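
In summary format, the command prints the last update time and per-bucket counts, for example (the counts here are illustrative):

```shell script
$ python packager_codes.py status
Last updated: never
New: 42, Removed: 0, Updated: 0, Unchanged: 0
```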
54 changes: 54 additions & 0 deletions scripts/packager-codes/non-eu/non_eu_spider.py
@@ -0,0 +1,54 @@
from datetime import datetime, date
from urllib.parse import urljoin
from typing import Any

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose


def get_one(values: list) -> Any:
if len(values) != 1:
raise ValueError("values list length must be equal to 1: {}".format(values))
return values[0]


def extract_publication_date(date_str: str) -> date:
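    # Dates appear on the page as "(dd/mm/yyyy)"; strip spaces and parentheses first.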
return datetime.strptime(date_str.strip(" ()"), "%d/%m/%Y").date()


class NonEuDocumentItem(scrapy.Item):
country_name = scrapy.Field(output_processor=get_one) # type: str
section = scrapy.Field(output_processor=get_one) # type: str
title = scrapy.Field(output_processor=get_one) # type: str
    publication_date = scrapy.Field(
        input_processor=MapCompose(extract_publication_date), output_processor=get_one
    ) # type: date
file_path = scrapy.Field(output_processor=get_one) # type: str
url = scrapy.Field(output_processor=get_one) # type: str


class NonEuSpider(scrapy.Spider):
name = "non_eu"
start_urls = [
"https://webgate.ec.europa.eu/sanco/traces/output/non_eu_listsPerCountry_en.htm"
]

def parse(self, response):
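        # The listing page groups document links by country; within each country,
        # documents sit in per-section lists preceded by an <h3> section heading.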
for country_cell in response.xpath("//ul[@class='country-list']/li"):
country_name = country_cell.xpath("a[@class='country-name']/text()").get()

for section_table in country_cell.xpath("ul"):
section = section_table.xpath("preceding-sibling::h3[1]/text()").get()

for doc_link in section_table.xpath("li/a"):
file_path = doc_link.xpath("@href").get()

doc_loader = ItemLoader(item=NonEuDocumentItem(), selector=doc_link)
doc_loader.add_value("country_name", country_name)
doc_loader.add_value("section", section)
doc_loader.add_xpath("title", "text()")
doc_loader.add_xpath("publication_date", "span/text()")
doc_loader.add_value("file_path", file_path)
doc_loader.add_value("url", urljoin(response.url, file_path))
yield doc_loader.load_item()
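
The spider is normally driven by `packager_codes.py`, but it can also be run on its own to inspect the scraped items; the command below mirrors the invocation used there, writing the items as JSON to stdout:

```shell script
scrapy runspider non_eu_spider.py --output - --output-format json --loglevel WARN
```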
167 changes: 167 additions & 0 deletions scripts/packager-codes/non-eu/packager_codes.py
@@ -0,0 +1,167 @@
from datetime import datetime
import json
import logging
from pathlib import Path
import subprocess
from typing import Any, List, Mapping, Sequence
from urllib.request import urlopen

import click


logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)

JSONObject = Mapping[str, Any]

# Resolve the spider relative to this file so the script works from any directory.
SCRAPY_SPIDER_FILE_PATH = (Path(__file__).parent / "non_eu_spider.py").absolute()


def scrape_document_info() -> List[JSONObject]:
"""Scrape official non-EU packager codes page and extract documents information.

Returns:
list of JSONObject: List of document information as dictionaries with the keys:
country_name, title, url, publication_date, file_path, section.
"""
logger.info("Scraping remote document information")
cmd = "scrapy runspider --output - --output-format json --loglevel WARN".split(" ")
cmd.append(str(SCRAPY_SPIDER_FILE_PATH))
cmd_res = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)
return json.loads(cmd_res.stdout.decode())


def download_documents(document_info: Sequence[JSONObject], dest_dir: Path) -> None:
logger.info("Downloading %s documents into '%s'", len(document_info), dest_dir)
dest_dir = Path(dest_dir)
for i, doc_info in enumerate(document_info):
dest_path = dest_dir / doc_info["file_path"]
logger.info(
"(%s/%s) Downloading %s", i + 1, len(document_info), doc_info["url"]
)
dest_path.parent.mkdir(parents=True, exist_ok=True)
with urlopen(doc_info["url"]) as response, dest_path.open("wb") as dest_file:
dest_file.write(response.read())


def document_info_diff(
scraped: Sequence[JSONObject], local: Sequence[JSONObject]
) -> Mapping[str, List[JSONObject]]:
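    """Compare scraped document info against local info, keyed by file_path.

    Returns four buckets of document info dicts: "new" (remote only),
    "removed" (local only), "updated" (newer publication_date upstream)
    and "unchanged".
    """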
scraped_docs = {d["file_path"]: d for d in scraped}
local_docs = {d["file_path"]: d for d in local}

new_names = set(scraped_docs.keys()).difference(local_docs.keys())
removed_names = set(local_docs.keys()).difference(scraped_docs.keys())
updated_names = [
doc_name
for doc_name, doc in local_docs.items()
if (
doc_name not in removed_names
and scraped_docs[doc_name]["publication_date"] > doc["publication_date"]
)
]
unchanged_names = (
set(local_docs.keys()).difference(removed_names).difference(updated_names)
)

return {
"new": [scraped_docs[n] for n in new_names],
"removed": [local_docs[n] for n in removed_names],
"updated": [scraped_docs[n] for n in updated_names],
"unchanged": [local_docs[n] for n in unchanged_names],
}


def load_local_meta(data_dir: Path) -> JSONObject:
meta_path = data_dir / "meta.json"
logger.info("Loading local metadata from '%s'", meta_path)
if not meta_path.exists():
return {"document_info": []}
else:
with meta_path.open("r") as meta_file:
return json.load(meta_file)


@click.group(help="Manage non-EU packager code data.")
def main():
pass


@main.command(
help="Show local data status as compared to remote source.\n\n"
"DATA_DIR is the path to the local directory containing packager code data. "
"Defaults to 'packager_codes_data'.",
)
@click.argument(
"data_dir", type=click.Path(file_okay=False), default="packager_codes_data"
)
@click.option(
"--output-format",
"-f",
type=click.Choice(["summary", "json"]),
default="summary",
help="Command output format.",
show_default=True,
)
def status(data_dir: str, output_format: str) -> None:
data_dir = Path(data_dir)

local_meta = load_local_meta(data_dir)
scraped_info = scrape_document_info()
doc_diff = document_info_diff(scraped_info, local_meta["document_info"])

if output_format == "json":
click.echo(json.dumps(doc_diff, indent=2))
else:
text = (
"Last updated: {}\nNew: {}, Removed: {}, Updated: {}, Unchanged: {}"
).format(
local_meta.get("updated", "never"),
len(doc_diff["new"]),
len(doc_diff["removed"]),
len(doc_diff["updated"]),
len(doc_diff["unchanged"]),
)
click.echo(text)


@main.command(
help="Sync packager code files with remote.\n\n"
"DATA_DIR is the path of the local directory in which to sync data. Defaults to "
"'packager_codes_data'.",
)
@click.argument(
"data_dir", type=click.Path(file_okay=False), default="packager_codes_data"
)
def sync(data_dir: str) -> None:
data_dir = Path(data_dir)
data_dir.mkdir(exist_ok=True)

local_meta = load_local_meta(data_dir)
document_info = scrape_document_info()
doc_diff = document_info_diff(document_info, local_meta["document_info"])

logger.info("Deleting %s removed documents", len(doc_diff["removed"]))
for removed_doc in doc_diff["removed"]:
doc_path = data_dir / removed_doc["file_path"]
logger.info("Deleting '%s'", doc_path)
doc_path.unlink()

changed_docs = doc_diff["new"] + doc_diff["updated"]
download_documents(changed_docs, data_dir)

meta_path = data_dir / "meta.json"
logger.info("Writing metadata in '%s'", meta_path)
meta = {
"description": "OpenFoodFacts non-EU packager codes",
"updated": datetime.now().isoformat(),
"document_info": document_info,
}
with meta_path.open("w") as meta_file:
json.dump(meta, meta_file, indent=2)


if __name__ == "__main__":
main()
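
For reference, the `meta.json` written by `sync` has the following shape (the field values here are illustrative placeholders):

```json
{
  "description": "OpenFoodFacts non-EU packager codes",
  "updated": "2020-01-01T00:00:00",
  "document_info": [
    {
      "country_name": "...",
      "section": "...",
      "title": "...",
      "publication_date": "...",
      "file_path": "...",
      "url": "..."
    }
  ]
}
```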
3 changes: 3 additions & 0 deletions scripts/packager-codes/non-eu/requirements.txt
@@ -0,0 +1,3 @@
# Requires Python >= 3.5
click
scrapy