From 22e5deccb1563a0d140e7b1af075dd6b1dd73cc7 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 13 Mar 2024 19:22:41 +0100 Subject: [PATCH 01/54] Adapt create new snapshots script --- etl/scripts/faostat/create_new_snapshots.py | 111 +++++++++----------- etl/scripts/faostat/shared.py | 2 + 2 files changed, 54 insertions(+), 59 deletions(-) diff --git a/etl/scripts/faostat/create_new_snapshots.py b/etl/scripts/faostat/create_new_snapshots.py index b03e46ed56c..a8a80628886 100644 --- a/etl/scripts/faostat/create_new_snapshots.py +++ b/etl/scripts/faostat/create_new_snapshots.py @@ -27,16 +27,14 @@ import datetime as dt import json import tempfile -from pathlib import Path from typing import Any, Dict, List, cast import requests from dateutil import parser -from etl.files import ruamel_dump -from etl.paths import SNAPSHOTS_DIR from etl.scripts.faostat.shared import ( API_BASE_URL, + ATTRIBUTION_SHORT, FAO_CATALOG_URL, FAO_DATA_URL, INCLUDED_DATASETS_CODES, @@ -47,21 +45,7 @@ VERSION, log, ) -from etl.snapshot import Snapshot, SnapshotMeta, add_snapshot, snapshot_catalog - - -def create_snapshot_metadata_file(metadata: Dict[str, Any]) -> None: - # Path to new snapshot folder. - snapshot_dir_path = Path(SNAPSHOTS_DIR / metadata["namespace"] / metadata["version"]) - # Path to new snapshot metadata file. - snapshot_file_path = (snapshot_dir_path / metadata["short_name"]).with_suffix(f".{metadata['file_extension']}.dvc") - - # Ensure new snapshot folder exists, otherwise create it. - snapshot_dir_path.mkdir(exist_ok=True) - - # Create metadata file for current domain dataset. 
- with open(snapshot_file_path, "w") as f: - f.write(ruamel_dump({"meta": metadata})) +from etl.snapshot import Snapshot, SnapshotMeta, snapshot_catalog class FAODataset: @@ -101,6 +85,18 @@ def modification_date(self) -> dt.date: def short_name(self) -> str: return f"{self.namespace}_{self._dataset_metadata['DatasetCode'].lower()}" + @property + def dataset_name(self) -> str: + return self._dataset_metadata["DatasetName"] + + @property + def dataset_description(self) -> str: + return self._dataset_metadata["DatasetDescription"] + + @property + def dataset_code(self) -> str: + return self._dataset_metadata["DatasetCode"] + @property def source_data_url(self) -> str: return self._dataset_metadata["FileLocation"] @@ -117,26 +113,24 @@ def metadata(self) -> Dict[str, Any]: log.warning(f"Description for dataset {self.short_name} is missing. Type one manually.") return { "namespace": self.namespace, - "short_name": self.short_name, - "name": (f"{self._dataset_metadata['DatasetName']} - FAO" f" ({self.publication_year})"), - "description": self._dataset_metadata["DatasetDescription"], - "source": { - "name": SOURCE_NAME, - "description": self._dataset_metadata["DatasetDescription"], - "published_by": SOURCE_NAME, - "publication_year": self.publication_year, - "publication_date": str(self.publication_date), - "date_accessed": VERSION, - "source_data_url": self.source_data_url, - }, "version": VERSION, - "url": f"{FAO_DATA_URL}/{self._dataset_metadata['DatasetCode']}", + "short_name": self.short_name, "file_extension": "zip", - "license": { - "name": LICENSE_NAME, - "url": LICENSE_URL, + "origin": { + "title": self.dataset_name, + "description": self.dataset_description, + "date_published": str(self.publication_date), + "date_accessed": VERSION, + "producer": SOURCE_NAME, + "citation_full": f"{SOURCE_NAME} - {self.dataset_name} ({self.publication_year}).", + "attribution_short": ATTRIBUTION_SHORT, + "url_main": f"{FAO_DATA_URL}/{self.dataset_code}", + "url_download": 
self.source_data_url, + "license": { + "name": LICENSE_NAME, + "url": LICENSE_URL, + }, }, - "is_public": True, } @property @@ -151,17 +145,14 @@ def to_snapshot(self) -> None: """ log.info(f"Creating snapshot for step {self.metadata['short_name']}.") - # Create metadata file for current domain dataset. - create_snapshot_metadata_file(self.metadata) - # Create a new snapshot. snap = Snapshot(self.snapshot_metadata.uri) - # Download data from source. - snap.download_from_source() + # Create metadata file for current domain dataset. + snap.path.write_text(snap.metadata.to_yaml()) - # Add file to DVC and upload to S3. - snap.dvc_add(upload=True) + # Download data from source and upload to S3. + snap.create_snapshot(upload=True) def load_faostat_catalog() -> List[Dict[str, Any]]: @@ -211,24 +202,23 @@ def metadata(self) -> Dict[str, Any]: return { "namespace": NAMESPACE, "short_name": f"{NAMESPACE}_metadata", - "name": f"Metadata and identifiers - FAO ({self.publication_year})", - "description": "Metadata and identifiers used in FAO datasets", - "source": { - "name": SOURCE_NAME, - "published_by": SOURCE_NAME, - "publication_year": self.publication_year, - "publication_date": str(self.publication_date), - "date_accessed": VERSION, - "source_data_url": None, - }, "version": VERSION, - "url": FAO_DATA_URL, "file_extension": "json", - "license": { - "name": LICENSE_NAME, - "url": LICENSE_URL, + "origin": { + "title": "Metadata and identifiers", + "description": "Metadata and identifiers used in FAOSTAT datasets.", + "date_published": str(self.publication_date), + "date_accessed": VERSION, + "producer": SOURCE_NAME, + "citation_full": f"{SOURCE_NAME} ({self.publication_year}).", + "attribution_short": ATTRIBUTION_SHORT, + "url_main": FAO_DATA_URL, + "url_download": None, + "license": { + "name": LICENSE_NAME, + "url": LICENSE_URL, + }, }, - "is_public": True, } @property @@ -261,14 +251,17 @@ def to_snapshot(self) -> None: log.info(f"Creating snapshot for step 
{self.metadata['short_name']}.") # Create metadata file for current domain dataset. - create_snapshot_metadata_file(self.metadata) + snap = Snapshot(self.snapshot_metadata.uri) + + # Create metadata file for current domain dataset. + snap.path.write_text(snap.metadata.to_yaml()) with tempfile.NamedTemporaryFile() as f: # Download data into a temporary file. self._fetch_additional_metadata_and_save(f.name) # Create snapshot. - add_snapshot(uri=self.snapshot_metadata.uri, filename=f.name, upload=True) + snap.create_snapshot(filename=f.name, upload=True) def main(read_only: bool = False) -> None: diff --git a/etl/scripts/faostat/shared.py b/etl/scripts/faostat/shared.py index 45ac45c1260..c2bf5b37535 100644 --- a/etl/scripts/faostat/shared.py +++ b/etl/scripts/faostat/shared.py @@ -22,6 +22,8 @@ FAO_DATA_URL = "http://www.fao.org/faostat/en/#data" # Metadata source name. SOURCE_NAME = "Food and Agriculture Organization of the United Nations" +# Short attribution. +ATTRIBUTION_SHORT = "FAOSTAT" # Metadata related to license. LICENSE_URL = "http://www.fao.org/contact-us/terms/db-terms-of-use/en" LICENSE_NAME = "CC BY-NC-SA 3.0 IGO" From 418274da49cfad935cef0823a0c99258187165ff Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 14 Mar 2024 09:22:14 +0100 Subject: [PATCH 02/54] Fix create_new_snapshots --- etl/scripts/faostat/create_new_snapshots.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/etl/scripts/faostat/create_new_snapshots.py b/etl/scripts/faostat/create_new_snapshots.py index a8a80628886..e6b2fc8b664 100644 --- a/etl/scripts/faostat/create_new_snapshots.py +++ b/etl/scripts/faostat/create_new_snapshots.py @@ -145,11 +145,17 @@ def to_snapshot(self) -> None: """ log.info(f"Creating snapshot for step {self.metadata['short_name']}.") - # Create a new snapshot. - snap = Snapshot(self.snapshot_metadata.uri) + # Create a new snapshot metadata. 
+ metadata = SnapshotMeta.from_dict(self.metadata) + + # Ensure parent directory exists. + metadata.path.parent.mkdir(parents=True, exist_ok=True) # Create metadata file for current domain dataset. - snap.path.write_text(snap.metadata.to_yaml()) + metadata.path.write_text(metadata.to_yaml()) + + # Create a new snapshot. + snap = Snapshot(metadata.uri) # Download data from source and upload to S3. snap.create_snapshot(upload=True) @@ -250,11 +256,14 @@ def _fetch_additional_metadata_and_save(output_filename: str) -> None: def to_snapshot(self) -> None: log.info(f"Creating snapshot for step {self.metadata['short_name']}.") - # Create metadata file for current domain dataset. - snap = Snapshot(self.snapshot_metadata.uri) + # Create new snapshot metadata object. + metadata = SnapshotMeta.from_dict(self.metadata) # Create metadata file for current domain dataset. - snap.path.write_text(snap.metadata.to_yaml()) + metadata.path.write_text(metadata.to_yaml()) + + # Create a new snapshot. + snap = Snapshot(metadata.uri) with tempfile.NamedTemporaryFile() as f: # Download data into a temporary file. 
From 7508ad46b6ce758910ccd74edfdee2f88b58f521 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 14 Mar 2024 09:49:08 +0100 Subject: [PATCH 03/54] Add snapshots and meadow steps, and remove wcad --- dag/faostat.yml | 61 ++++++ docs/data/faostat.md | 16 +- etl/scripts/faostat/shared.py | 3 +- .../meadow/faostat/2024-03-14/faostat_cahd.py | 2 + .../meadow/faostat/2024-03-14/faostat_ef.py | 2 + .../meadow/faostat/2024-03-14/faostat_ei.py | 2 + .../meadow/faostat/2024-03-14/faostat_ek.py | 2 + .../meadow/faostat/2024-03-14/faostat_el.py | 2 + .../meadow/faostat/2024-03-14/faostat_emn.py | 2 + .../meadow/faostat/2024-03-14/faostat_ep.py | 2 + .../meadow/faostat/2024-03-14/faostat_esb.py | 2 + .../meadow/faostat/2024-03-14/faostat_fa.py | 2 + .../meadow/faostat/2024-03-14/faostat_fbs.py | 2 + .../meadow/faostat/2024-03-14/faostat_fbsh.py | 2 + .../meadow/faostat/2024-03-14/faostat_fo.py | 2 + .../meadow/faostat/2024-03-14/faostat_fs.py | 2 + .../meadow/faostat/2024-03-14/faostat_gn.py | 2 + .../meadow/faostat/2024-03-14/faostat_ic.py | 2 + .../meadow/faostat/2024-03-14/faostat_lc.py | 2 + .../faostat/2024-03-14/faostat_metadata.py | 201 ++++++++++++++++++ .../meadow/faostat/2024-03-14/faostat_qcl.py | 2 + .../meadow/faostat/2024-03-14/faostat_qi.py | 2 + .../meadow/faostat/2024-03-14/faostat_qv.py | 2 + .../meadow/faostat/2024-03-14/faostat_rfb.py | 2 + .../meadow/faostat/2024-03-14/faostat_rfn.py | 2 + .../meadow/faostat/2024-03-14/faostat_rl.py | 2 + .../meadow/faostat/2024-03-14/faostat_rp.py | 2 + .../meadow/faostat/2024-03-14/faostat_rt.py | 2 + .../meadow/faostat/2024-03-14/faostat_scl.py | 2 + .../meadow/faostat/2024-03-14/faostat_sdgb.py | 2 + .../meadow/faostat/2024-03-14/faostat_tcl.py | 2 + .../meadow/faostat/2024-03-14/faostat_ti.py | 2 + .../data/meadow/faostat/2024-03-14/shared.py | 173 +++++++++++++++ .../faostat/2024-03-14/faostat_cahd.zip.dvc | 21 ++ .../faostat/2024-03-14/faostat_ei.zip.dvc | 20 ++ .../faostat/2024-03-14/faostat_ek.zip.dvc | 19 ++ 
.../faostat/2024-03-14/faostat_emn.zip.dvc | 19 ++ .../faostat/2024-03-14/faostat_esb.zip.dvc | 21 ++ .../faostat/2024-03-14/faostat_fa.zip.dvc | 19 ++ .../faostat/2024-03-14/faostat_fbs.zip.dvc | 19 ++ .../faostat/2024-03-14/faostat_fbsh.zip.dvc | 20 ++ .../faostat/2024-03-14/faostat_fo.zip.dvc | 19 ++ .../faostat/2024-03-14/faostat_fs.zip.dvc | 20 ++ .../faostat/2024-03-14/faostat_gn.zip.dvc | 20 ++ .../faostat/2024-03-14/faostat_ic.zip.dvc | 19 ++ .../faostat/2024-03-14/faostat_lc.zip.dvc | 19 ++ .../2024-03-14/faostat_metadata.json.dvc | 17 ++ .../faostat/2024-03-14/faostat_qcl.zip.dvc | 19 ++ .../faostat/2024-03-14/faostat_qi.zip.dvc | 19 ++ .../faostat/2024-03-14/faostat_qv.zip.dvc | 19 ++ .../faostat/2024-03-14/faostat_rfb.zip.dvc | 20 ++ .../faostat/2024-03-14/faostat_rfn.zip.dvc | 20 ++ .../faostat/2024-03-14/faostat_rl.zip.dvc | 19 ++ .../faostat/2024-03-14/faostat_rp.zip.dvc | 19 ++ .../faostat/2024-03-14/faostat_rt.zip.dvc | 19 ++ .../faostat/2024-03-14/faostat_scl.zip.dvc | 19 ++ .../faostat/2024-03-14/faostat_sdgb.zip.dvc | 19 ++ .../faostat/2024-03-14/faostat_tcl.zip.dvc | 19 ++ .../faostat/2024-03-14/faostat_ti.zip.dvc | 19 ++ 59 files changed, 1005 insertions(+), 7 deletions(-) create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_cahd.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_ef.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_ei.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_ek.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_el.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_emn.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_ep.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_esb.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_fa.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_fbs.py create mode 100644 
etl/steps/data/meadow/faostat/2024-03-14/faostat_fbsh.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_fo.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_fs.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_gn.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_ic.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_lc.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_qcl.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_qi.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_qv.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_rfb.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_rfn.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_rl.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_rp.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_rt.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_scl.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_sdgb.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_tcl.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_ti.py create mode 100644 etl/steps/data/meadow/faostat/2024-03-14/shared.py create mode 100644 snapshots/faostat/2024-03-14/faostat_cahd.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_ei.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_ek.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_emn.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_esb.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_fa.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_fbs.zip.dvc create mode 100644 
snapshots/faostat/2024-03-14/faostat_fbsh.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_fo.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_fs.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_gn.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_ic.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_lc.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_metadata.json.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_qcl.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_qi.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_qv.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_rfb.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_rfn.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_rl.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_rp.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_rt.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_scl.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_sdgb.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_tcl.zip.dvc create mode 100644 snapshots/faostat/2024-03-14/faostat_ti.zip.dvc diff --git a/dag/faostat.yml b/dag/faostat.yml index 56c8b794cdf..14877d0dd58 100644 --- a/dag/faostat.yml +++ b/dag/faostat.yml @@ -323,3 +323,64 @@ steps: # data://grapher/faostat/2023-06-12/additional_variables: - data://garden/faostat/2023-06-12/additional_variables + # + # FAOSTAT meadow steps for version 2024-03-14 + # + data://meadow/faostat/2024-03-14/faostat_cahd: + - snapshot://faostat/2024-03-14/faostat_cahd.zip + data://meadow/faostat/2024-03-14/faostat_ef: + - snapshot://faostat/2023-02-22/faostat_ef.zip + data://meadow/faostat/2024-03-14/faostat_ei: + - snapshot://faostat/2024-03-14/faostat_ei.zip + data://meadow/faostat/2024-03-14/faostat_ek: + - snapshot://faostat/2024-03-14/faostat_ek.zip + 
data://meadow/faostat/2024-03-14/faostat_el: + - snapshot://faostat/2023-06-12/faostat_el.zip + data://meadow/faostat/2024-03-14/faostat_emn: + - snapshot://faostat/2024-03-14/faostat_emn.zip + data://meadow/faostat/2024-03-14/faostat_ep: + - snapshot://faostat/2023-02-22/faostat_ep.zip + data://meadow/faostat/2024-03-14/faostat_esb: + - snapshot://faostat/2024-03-14/faostat_esb.zip + data://meadow/faostat/2024-03-14/faostat_fa: + - snapshot://faostat/2024-03-14/faostat_fa.zip + data://meadow/faostat/2024-03-14/faostat_fbs: + - snapshot://faostat/2024-03-14/faostat_fbs.zip + data://meadow/faostat/2024-03-14/faostat_fbsh: + - snapshot://faostat/2024-03-14/faostat_fbsh.zip + data://meadow/faostat/2024-03-14/faostat_fo: + - snapshot://faostat/2024-03-14/faostat_fo.zip + data://meadow/faostat/2024-03-14/faostat_fs: + - snapshot://faostat/2024-03-14/faostat_fs.zip + data://meadow/faostat/2024-03-14/faostat_gn: + - snapshot://faostat/2024-03-14/faostat_gn.zip + data://meadow/faostat/2024-03-14/faostat_ic: + - snapshot://faostat/2024-03-14/faostat_ic.zip + data://meadow/faostat/2024-03-14/faostat_lc: + - snapshot://faostat/2024-03-14/faostat_lc.zip + data://meadow/faostat/2024-03-14/faostat_metadata: + - snapshot://faostat/2024-03-14/faostat_metadata.json + data://meadow/faostat/2024-03-14/faostat_qcl: + - snapshot://faostat/2024-03-14/faostat_qcl.zip + data://meadow/faostat/2024-03-14/faostat_qi: + - snapshot://faostat/2024-03-14/faostat_qi.zip + data://meadow/faostat/2024-03-14/faostat_qv: + - snapshot://faostat/2024-03-14/faostat_qv.zip + data://meadow/faostat/2024-03-14/faostat_rfb: + - snapshot://faostat/2024-03-14/faostat_rfb.zip + data://meadow/faostat/2024-03-14/faostat_rfn: + - snapshot://faostat/2024-03-14/faostat_rfn.zip + data://meadow/faostat/2024-03-14/faostat_rl: + - snapshot://faostat/2024-03-14/faostat_rl.zip + data://meadow/faostat/2024-03-14/faostat_rp: + - snapshot://faostat/2024-03-14/faostat_rp.zip + data://meadow/faostat/2024-03-14/faostat_rt: + - 
snapshot://faostat/2024-03-14/faostat_rt.zip + data://meadow/faostat/2024-03-14/faostat_scl: + - snapshot://faostat/2024-03-14/faostat_scl.zip + data://meadow/faostat/2024-03-14/faostat_sdgb: + - snapshot://faostat/2024-03-14/faostat_sdgb.zip + data://meadow/faostat/2024-03-14/faostat_tcl: + - snapshot://faostat/2024-03-14/faostat_tcl.zip + data://meadow/faostat/2024-03-14/faostat_ti: + - snapshot://faostat/2024-03-14/faostat_ti.zip diff --git a/docs/data/faostat.md b/docs/data/faostat.md index 2c589ecca7a..04d18449f13 100644 --- a/docs/data/faostat.md +++ b/docs/data/faostat.md @@ -228,13 +228,17 @@ If no dataset requires an update, the workflow stops here. 2. Create new meadow steps. + !!! note + + The `-a` flag ensures a meadow step is created for all steps. In principle, we could just create steps for those domains that have new snapshots. But, given that we update this dataset yearly, it seems more convenient to simply update all of them. + ```bash - python etl/scripts/faostat/create_new_steps.py -c meadow + python etl/scripts/faostat/create_new_steps.py -c meadow -a ``` 3. Run the new etl meadow steps, to generate the meadow datasets. ```bash - etl meadow/faostat/YYYY-MM-DD + etl run meadow/faostat/YYYY-MM-DD ``` 4. Create new garden steps. @@ -246,7 +250,7 @@ If no dataset requires an update, the workflow stops here. 5. Run the new etl garden steps, to generate the garden datasets. ```bash - etl garden/faostat/YYYY-MM-DD + etl run garden/faostat/YYYY-MM-DD ``` Optionally, set `INSPECT_ANOMALIES=True`, to visualize if anomalies that were detected in the previous version of the data are still present in the current version. @@ -266,7 +270,7 @@ If no dataset requires an update, the workflow stops here. If any changes were found, re-run the garden steps. ```bash - etl garden/faostat/YYYY-MM-DD + etl run garden/faostat/YYYY-MM-DD ``` 7. Create new grapher steps. @@ -278,7 +282,7 @@ If no dataset requires an update, the workflow stops here. 8. 
Run the new etl grapher steps, to generate the grapher charts. ```bash - etl faostat/YYYY-MM-DD --grapher + etl run faostat/YYYY-MM-DD --grapher ``` 9. Generate chart revisions (showing a chart using an old version of a variable and the same chart using the new @@ -302,7 +306,7 @@ accept or reject changes. 12. Run the new etl explorers step, to generate the csv files for the global food explorer. ```bash - etl explorers/faostat/YYYY-MM-DD/food_explorer + etl run explorers/faostat/YYYY-MM-DD/food_explorer ``` Run internal sanity checks on the generated files. diff --git a/etl/scripts/faostat/shared.py b/etl/scripts/faostat/shared.py index c2bf5b37535..34af9c739f0 100644 --- a/etl/scripts/faostat/shared.py +++ b/etl/scripts/faostat/shared.py @@ -86,8 +86,9 @@ "tcl", # Trade: Trade Indices. "ti", + # Removed from the list (as they have not been used and were causing issues). # World Census of Agriculture. - "wcad", + # "wcad", ] # URL for dataset codes in FAOSTAT catalog. # This is the URL used to get the remote location of the actual data files to be downloaded, and the date of their diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_cahd.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_cahd.py new file mode 100644 index 00000000000..cc99efc2b22 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_cahd.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_cahd dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_ef.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_ef.py new file mode 100644 index 00000000000..c1b3ce5eec8 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_ef.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_ef dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_ei.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_ei.py new file mode 100644 index 00000000000..8f8c520ac1c --- 
/dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_ei.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_ei dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_ek.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_ek.py new file mode 100644 index 00000000000..8affbd5ac70 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_ek.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_ek dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_el.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_el.py new file mode 100644 index 00000000000..7cda6b5ced7 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_el.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_el dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_emn.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_emn.py new file mode 100644 index 00000000000..e0341d5f29b --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_emn.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_emn dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_ep.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_ep.py new file mode 100644 index 00000000000..de1278faacf --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_ep.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_ep dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_esb.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_esb.py new file mode 100644 index 00000000000..d90d2c0538a --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_esb.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_esb dataset.""" +from .shared import run # noqa:F401 diff --git 
a/etl/steps/data/meadow/faostat/2024-03-14/faostat_fa.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_fa.py new file mode 100644 index 00000000000..29014f1b54a --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_fa.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_fa dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_fbs.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_fbs.py new file mode 100644 index 00000000000..65cbf54e4e3 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_fbs.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_fbs dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_fbsh.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_fbsh.py new file mode 100644 index 00000000000..ef0b7233357 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_fbsh.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_fbsh dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_fo.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_fo.py new file mode 100644 index 00000000000..9932ebb4718 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_fo.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_fo dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_fs.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_fs.py new file mode 100644 index 00000000000..74f1892050e --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_fs.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_fs dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_gn.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_gn.py new file mode 100644 index 00000000000..6cc1cdd3414 --- /dev/null +++ 
b/etl/steps/data/meadow/faostat/2024-03-14/faostat_gn.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_gn dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_ic.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_ic.py new file mode 100644 index 00000000000..76a7833c6f8 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_ic.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_ic dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_lc.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_lc.py new file mode 100644 index 00000000000..a18b1892fbf --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_lc.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_lc dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py new file mode 100644 index 00000000000..2f1133f4cc8 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py @@ -0,0 +1,201 @@ +"""Load FAOSTAT (additional) metadata (snapshot ingested using the API) and create a meadow faostat_metadata dataset. + +The resulting meadow dataset has as many tables as domain-categories ('faostat_qcl_area', 'faostat_fbs_item', ...). + +All categories are defined below in 'category_structure'. + +""" + +from pathlib import Path +from typing import Any, Dict, List + +import pandas as pd +import structlog +from owid.catalog import Table +from owid.datautils.io import load_json +from shared import CURRENT_DIR, NAMESPACE + +from etl.helpers import PathFinder, create_dataset + +log = structlog.get_logger() + +# Name for new meadow dataset. +DATASET_SHORT_NAME = f"{NAMESPACE}_metadata" + +# Define the structure of the additional metadata file. 
+category_structure = { + "area": { + "index": ["Country Code"], + "short_name": "area", + }, + "areagroup": { + "index": ["Country Group Code", "Country Code"], + "short_name": "area_group", + }, + "element": { + "index": ["Element Code"], + "short_name": "element", + }, + "flag": { + "index": ["Flag"], + "short_name": "flag", + }, + "glossary": { + "index": ["Glossary Code"], + "short_name": "glossary", + }, + "item": { + "index": ["Item Code"], + "short_name": "item", + }, + "itemfactor": { + "index": ["Item Group Code", "Item Code", "Element Code"], + "short_name": "item_factor", + }, + "itemgroup": { + "index": ["Item Group Code", "Item Code"], + "short_name": "item_group", + }, + "items": { + "index": ["Item Code"], + "short_name": "item", + }, + "itemsgroup": { + "index": ["Item Group Code", "Item Code"], + "short_name": "item_group", + }, + # Specific for faostat_fa. + "recipientarea": { + "index": ["Recipient Country Code"], + "short_name": "area", + }, + "unit": { + "index": ["Unit Name"], + "short_name": "unit", + }, + # Specific for faostat_fa. + "year": { + "index": ["Year Code"], + "short_name": "year", + }, + # Specific for faostat_fs. + "year3": { + "index": ["Year Code"], + "short_name": "year", + }, + "years": { + "index": ["Year Code"], + "short_name": "year", + }, + # Specific for faostat_wcad. + "yearwca": { + "index": ["Year Code"], + "short_name": "year", + }, + # Specific for faostat_gn. + "sources": { + "index": ["Source Code"], + "short_name": "sources", + }, +} + + +def check_that_category_structure_is_well_defined(md: Dict[str, Any]) -> None: + """Check that metadata content is consistent with category_structure (defined above). + + If that is not the case, it is possible that the content of metadata has changed, and therefore category_structure + may need to be edited. + + Parameters + ---------- + md : dict + Raw FAOSTAT (additional) metadata of all datasets. 
+ + """ + for dataset in list(md): + for category in category_structure: + category_indexes = category_structure[category]["index"] + if category in md[dataset]: + category_metadata = md[dataset][category]["data"] + for entry in category_metadata: + for category_index in category_indexes: + error = ( + f"Index {category_index} not found in {category} for {dataset}. " + f"Consider redefining category_structure." + ) + assert category_index in entry, error + + +def create_tables_for_all_domain_records(additional_metadata: Dict[str, Any]) -> List[Table]: + """Create a table for each of the domain-categories (e.g. 'faostat_qcl_item'). + + Parameters + ---------- + additional_metadata : Dict[str, Any] + FAOSTAT additional metadata. + + Returns + ------- + tables: List[Table] + List of tables, each one corresponding to a specific domain-category. + + """ + # Create a new table for each domain-category (e.g. 'faostat_qcl_item'). + tables = [] + used_short_names = set() + for domain in additional_metadata: + for category in list(additional_metadata[domain]): + json_data = additional_metadata[domain][category]["data"] + df = pd.DataFrame.from_dict(json_data) + if len(df) > 0: + df.set_index( + category_structure[category]["index"], + verify_integrity=True, + inplace=True, + ) + table_short_name = f'{NAMESPACE}_{domain.lower()}_{category_structure[category]["short_name"]}' + + # there might be duplicates coming from `itemsgroup` and `itemgroup` + if table_short_name in used_short_names: + log.warning("faostat_metadata.duplicate_short_name", short_name=table_short_name) + continue + used_short_names.add(table_short_name) + + table = Table(df, short_name=table_short_name) + tables.append(table) + + return tables + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Fetch the dataset short name from dest_dir. + dataset_short_name = Path(dest_dir).name + + # Define path to current step file. 
+ current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") + + # Get paths and naming conventions for current data step. + paths = PathFinder(current_step_file.as_posix()) + + # Load snapshot. + snapshot = paths.load_dependency(short_name=dataset_short_name + ".json", channel="snapshot") + additional_metadata = load_json(snapshot.path) + + # + # Process data. + # + # Run sanity checks. + check_that_category_structure_is_well_defined(md=additional_metadata) + + # Create a new table for each domain-record (e.g. 'faostat_qcl_item'). + tables = create_tables_for_all_domain_records(additional_metadata=additional_metadata) + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir=dest_dir, tables=tables, default_metadata=snapshot.metadata) + ds_meadow.save() diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_qcl.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_qcl.py new file mode 100644 index 00000000000..d66b2edc113 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_qcl.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_qcl dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_qi.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_qi.py new file mode 100644 index 00000000000..460cc5faca5 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_qi.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_qi dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_qv.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_qv.py new file mode 100644 index 00000000000..07e74a4a95b --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_qv.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_qv dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_rfb.py 
b/etl/steps/data/meadow/faostat/2024-03-14/faostat_rfb.py new file mode 100644 index 00000000000..ae439c21964 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_rfb.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_rfb dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_rfn.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_rfn.py new file mode 100644 index 00000000000..bae546a50e5 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_rfn.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_rfn dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_rl.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_rl.py new file mode 100644 index 00000000000..cb95f2263fb --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_rl.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_rl dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_rp.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_rp.py new file mode 100644 index 00000000000..010769e5587 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_rp.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_rp dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_rt.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_rt.py new file mode 100644 index 00000000000..7254a8063e9 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_rt.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_rt dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_scl.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_scl.py new file mode 100644 index 00000000000..e9fc0ab99e4 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_scl.py @@ -0,0 +1,2 @@ 
+"""FAOSTAT meadow step for faostat_scl dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_sdgb.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_sdgb.py new file mode 100644 index 00000000000..bde23c34c06 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_sdgb.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_sdgb dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_tcl.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_tcl.py new file mode 100644 index 00000000000..c5299c892af --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_tcl.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_tcl dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_ti.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_ti.py new file mode 100644 index 00000000000..9cfc9f9af7a --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_ti.py @@ -0,0 +1,2 @@ +"""FAOSTAT meadow step for faostat_ti dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/shared.py b/etl/steps/data/meadow/faostat/2024-03-14/shared.py new file mode 100644 index 00000000000..09488c04043 --- /dev/null +++ b/etl/steps/data/meadow/faostat/2024-03-14/shared.py @@ -0,0 +1,173 @@ +"""Shared definitions in FAOSTAT meadow steps. + +Some basic processing is required to create tables from the raw data. +For example, column "Note" (present in some datasets) is skipped to avoid parsing errors. +Other minor changes can be found in the code. + +""" + +import os +import tempfile +import zipfile +from pathlib import Path + +import pandas as pd +import structlog +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Initialise log. 
+log = structlog.get_logger() + +# Define path to current folder, namespace and version of all datasets in this folder. +CURRENT_DIR = Path(__file__).parent +NAMESPACE = CURRENT_DIR.parent.name +VERSION = CURRENT_DIR.name + + +def load_data(local_path: Path) -> pd.DataFrame: + """Load snapshot data (as a dataframe) for current dataset. + + Parameters + ---------- + local_path : Path or str + Path to local snapshot file. + + Returns + ------- + data : pd.DataFrame + Snapshot data. + + """ + # Unzip data into a temporary folder. + with tempfile.TemporaryDirectory() as temp_dir: + z = zipfile.ZipFile(local_path) + z.extractall(temp_dir) + (filename,) = list(filter(lambda x: "(Normalized)" in x, os.listdir(temp_dir))) + + # Load data from main file. + data = pd.read_csv(os.path.join(temp_dir, filename), encoding="latin-1", low_memory=False) + + return data + + +def run_sanity_checks(data: pd.DataFrame) -> None: + """Run basic sanity checks on loaded data (raise assertion errors if any check fails). + + Parameters + ---------- + data : pd.DataFrame + Data to be checked. + + """ + df = data.copy() + + # Check that column "Year Code" is identical to "Year", and can therefore be dropped. + error = "Column 'Year Code' does not coincide with column 'Year'." + if "Year" not in data.columns: + pass + # Column 'Year' is not in data (this happens at least in faostat_wcad, which requires further processing). + elif df["Year"].dtype == int: + # In most cases, columns "Year Code" and "Year" are simply the year. + assert (df["Year Code"] == df["Year"]).all(), error + else: + # Sometimes (e.g. for dataset fs) there are year ranges (e.g. with "Year Code" 20002002 and "Year" "2000-2002"). + assert (df["Year Code"] == df["Year"].str.replace("-", "").astype(int)).all(), error + + # Check that there is only one element-unit for each element code. + error = "Multiple element-unit for the same element code." 
+ assert (df.groupby(["Element", "Unit"])["Element Code"].nunique() == 1).all(), error + + +def prepare_output_data(data: pd.DataFrame) -> pd.DataFrame: + """Prepare data before saving it to meadow. + + Parameters + ---------- + data : pd.DataFrame + Data. + + Returns + ------- + df : pd.DataFrame + Data ready to be stored as a table in meadow. + + """ + df = data.copy() + + # Select columns to keep. + # Note: + # * Ignore column "Year Code" (which is almost identical to "Year", and does not add information). + # * Ignore column "Note" (which is included only in faostat_fa, faostat_fs, faostat_sdgb and faostat_wcad datasets). + # This column may contain double-quoted text within double-quoted text, which becomes impossible to parse. + # E.g. faostat_wcad line 105. + # * Add "Recipient Country Code" and "Recipient Code", which are the names for "Area Code" and "Area", respectively, + # for dataset faostat_fa. + columns_to_keep = [ + "Area Code", + "Area", + "Year", + "Item Code", + "Item", + "Element Code", + "Element", + "Unit", + "Value", + "Flag", + "Recipient Country Code", + "Recipient Country", + # Additional columns for faostat_wcad. + "WCA Round", + "Census Year", + ] + # Select only columns that are found in the dataframe. + columns_to_keep = list(set(columns_to_keep) & set(df.columns)) + df = df[columns_to_keep] + + # Set index columns depending on what columns are available in the dataframe. + # Note: "Recipient Country Code" appears only in faostat_fa, and seems to replace "Area Code". + # Note: "WCA Round" and "Census Year" appear only in faostat_wcad. + index_columns = list( + {"Area Code", "Recipient Country Code", "Year", "Item Code", "Element Code", "WCA Round", "Census Year"} + & set(df.columns) + ) + if df.duplicated(subset=index_columns).any(): + log.warning("Index has duplicated keys.") + df = df.set_index(index_columns) + + return df + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Fetch the dataset short name from dest_dir. 
+ dataset_short_name = Path(dest_dir).name + + # Define path to current step file. + current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") + + # Get paths and naming conventions for current data step. + paths = PathFinder(current_step_file.as_posix()) + + # Load snapshot. + snapshot = paths.load_dependency(short_name=dataset_short_name + ".zip", channel="snapshot") + df_snapshot = load_data(snapshot.path) + + # + # Process data. + # + # Run sanity checks. + run_sanity_checks(data=df_snapshot) + + # Prepare output meadow table. + tb_meadow = Table(prepare_output_data(data=df_snapshot), short_name=dataset_short_name) + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir=dest_dir, tables=[tb_meadow], default_metadata=snapshot.metadata) + ds_meadow.save() diff --git a/snapshots/faostat/2024-03-14/faostat_cahd.zip.dvc b/snapshots/faostat/2024-03-14/faostat_cahd.zip.dvc new file mode 100644 index 00000000000..7cae8b2ca67 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_cahd.zip.dvc @@ -0,0 +1,21 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Cost and Affordability of a Healthy Diet: Cost and Affordability of a Healthy Diet (CoAHD)' + description: |- + Indicators on the cost and affordability of a healthy diet are estimated in each country and show the population’s physical and economic access to least expensive locally available foods to meet requirements for a healthy diet, as defined in food-based dietary guidelines (FBDGs). The indicators use observed retail food consumer prices and income distributions to provide an operational measure of people’s access to locally available foods in the proportions needed for health. These indicators support efforts within the framework of the Sustainable Development Goals (SDGs) to end hunger, achieve food security and improved nutrition, and promote sustainable agriculture by 2030 (SDG 2). 
They also support the monitoring of progress towards the objective of transforming agrifood systems by promoting “nutrition-sensitive agriculture”. For definitions of these indicators, see Definitions and standards. + citation_full: |- + Food and Agriculture Organization of the United Nations - Cost and Affordability of a Healthy Diet: Cost and Affordability of a Healthy Diet (CoAHD) (2023). + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/CAHD + url_download: |- + https://fenixservices.fao.org/faostat/static/bulkdownloads/Cost_Affordability_Healthy_Diet_(CoAHD)_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-07-14' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 40a63b998995cdd2f58ec1fa70bca642 + size: 38069 + path: faostat_cahd.zip diff --git a/snapshots/faostat/2024-03-14/faostat_ei.zip.dvc b/snapshots/faostat/2024-03-14/faostat_ei.zip.dvc new file mode 100644 index 00000000000..9c3e3c96596 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_ei.zip.dvc @@ -0,0 +1,20 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Climate Change: Agrifood systems emissions: Emissions intensities' + description: |- + The FAOSTAT domain Emissions intensities contains analytical data on the intensity of greenhouse gas (GHG) emissions by agricultural commodity. This indicator is defined as greenhouse gas emissions per kg of product. Data are available for a set of agricultural commodities (e.g. rice and other cereals, meat, milk, eggs), by country, with global coverage and relative to the period 1961–2020. + citation_full: |- + Food and Agriculture Organization of the United Nations - Climate Change: Agrifood systems emissions: Emissions intensities (2023). 
+ attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/EI + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Environment_Emissions_intensities_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-12-06' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 89e25bcf5ae42cc08038942cf2a9bb7e + size: 3557924 + path: faostat_ei.zip diff --git a/snapshots/faostat/2024-03-14/faostat_ek.zip.dvc b/snapshots/faostat/2024-03-14/faostat_ek.zip.dvc new file mode 100644 index 00000000000..65c4e3ed4f4 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_ek.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Land, Inputs and Sustainability: Livestock Patterns' + description: |- + The Livestock Patterns domain of FAOSTAT contains data on livestock numbers, shares of major livestock species and densities of livestock units in the agricultural land area. Values are calculated using Livestock Units (LSU), which facilitate aggregating information for different livestock types. Data are available by country, with global coverage, for the period 1961 to present, with annual updates. This methodology applies the LSU coefficients reported in the "Guidelines for the preparation of livestock sector reviews" (FAO, 2011). From this publication, LSU coefficients are computed by livestock type and by country. The reference unit used for the calculation of livestock units (=1 LSU) is the grazing equivalent of one adult dairy cow producing 3000 kg of milk annually, fed without additional concentrated foodstuffs. FAOSTAT agri-environmental indicators on livestock patterns closely follow the structure of the indicators in EUROSTAT. + citation_full: 'Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Livestock Patterns (2023).' 
+ attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/EK + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Environment_LivestockPatterns_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-08-30' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 29bc9a0c4ae76ca22dda476799eac1e4 + size: 3067161 + path: faostat_ek.zip diff --git a/snapshots/faostat/2024-03-14/faostat_emn.zip.dvc b/snapshots/faostat/2024-03-14/faostat_emn.zip.dvc new file mode 100644 index 00000000000..15f63877213 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_emn.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Land, Inputs and Sustainability: Livestock Manure' + description: |- + The Livestock Manure domain of FAOSTAT contains estimates of nitrogen (N) inputs to agricultural soils from livestock manure. Data on the N losses to air and water are also disseminated. These estimates are compiled using official FAOSTAT statistics of animal stocks and by applying the internationally approved Guidelines of the Intergovernmental Panel on Climate Change (IPCC). Data are available by country, with global coverage and relative to the period 1961–2020, with annual updates. The following elements are disseminated: 1) Stocks; 2) Amount excreted in manure (N content); 3) Manure left on pasture (N content); 4) Manure left on pasture that volatilises (N content); 5) Manure left on pasture that leaches (N content); 6) Manure treated (N content); 7) Losses from manure treated (N content); 8) Manure applied to soils (N content); 9) Manure applied to soils that volatilises (N content); 10) Manure applied to soils that leaches (N content). + citation_full: 'Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Livestock Manure (2023).' 
+ attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/EMN + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Environment_LivestockManure_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-09-18' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: f827a6c395e65d856df32b77340758f6 + size: 20791545 + path: faostat_emn.zip diff --git a/snapshots/faostat/2024-03-14/faostat_esb.zip.dvc b/snapshots/faostat/2024-03-14/faostat_esb.zip.dvc new file mode 100644 index 00000000000..d17f474386d --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_esb.zip.dvc @@ -0,0 +1,21 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Land, Inputs and Sustainability: Cropland Nutrient Balance' + description: |- + 2022 Cropland nutrient budget analytical brief. The Cropland Nutrient Budget domain contains information on the flows of nitrogen, phosphorus, and potassium from synthetic fertilizer, manure applied to soils, atmospheric deposition, crop removal, and biological fixation over cropland and per unit area of cropland. The flows are aggregated to total inputs and total outputs, from which the overall nutrient budget and nutrient use efficiency on cropland are calculated. Statistics are disseminated in units of tonnes and in kg/ha, as appropriate. Nutrient use efficiency is expressed as a fraction (%). Data are available by country, with global coverage relative to the period 1961-2020, with annual updates. + citation_full: |- + Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Cropland Nutrient Balance (2023).
+ attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/ESB + url_download: |- + https://fenixservices.fao.org/faostat/static/bulkdownloads/Environment_Cropland_nutrient_budget_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-11-08' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 7b6ac94d7ea0db34fb171e632fcdfc71 + size: 7754443 + path: faostat_esb.zip diff --git a/snapshots/faostat/2024-03-14/faostat_fa.zip.dvc b/snapshots/faostat/2024-03-14/faostat_fa.zip.dvc new file mode 100644 index 00000000000..f217437b0fc --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_fa.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Discontinued archives and data series: Food Aid Shipments (WFP)' + description: '' + citation_full: |- + Food and Agriculture Organization of the United Nations - Discontinued archives and data series: Food Aid Shipments (WFP) (2016). 
+ attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/FA + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Food_Aid_Shipments_WFP_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2016-12-22' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 09563ea5bb690febaffea50e4d05d7d0 + size: 254793 + path: faostat_fa.zip diff --git a/snapshots/faostat/2024-03-14/faostat_fbs.zip.dvc b/snapshots/faostat/2024-03-14/faostat_fbs.zip.dvc new file mode 100644 index 00000000000..8d92684a14a --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_fbs.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Food Balances: Food Balances (2010-)' + description: |- + Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. 
Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content. + citation_full: 'Food and Agriculture Organization of the United Nations - Food Balances: Food Balances (2010-) (2023).' + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/FBS + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/FoodBalanceSheets_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-09-04' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 3b08d9d7f3c9a578bc7b07d59b3e85ab + size: 57235148 + path: faostat_fbs.zip diff --git a/snapshots/faostat/2024-03-14/faostat_fbsh.zip.dvc b/snapshots/faostat/2024-03-14/faostat_fbsh.zip.dvc new file mode 100644 index 00000000000..41d13002034 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_fbsh.zip.dvc @@ -0,0 +1,20 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Food Balances: Food Balances (-2013, old methodology and population)' + description: |- + Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. 
On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content. + citation_full: |- + Food and Agriculture Organization of the United Nations - Food Balances: Food Balances (-2013, old methodology and population) (2023). + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/FBSH + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/FoodBalanceSheetsHistoric_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-03-10' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 1d201817fe05ef2a5dcb425919336962 + size: 72467143 + path: faostat_fbsh.zip diff --git a/snapshots/faostat/2024-03-14/faostat_fo.zip.dvc b/snapshots/faostat/2024-03-14/faostat_fo.zip.dvc new file mode 100644 index 00000000000..0d77ef5ed04 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_fo.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Forestry: Forestry Production and Trade' + description: |- + The database contains data on the production and trade in roundwood and in primary wood and paper products for all countries and territories in the world.The main types of primary forest products included in this database are roundwood, sawnwood, wood-based panels, pulp, and paper and paperboard. 
These products are detailed further and defined in the Joint Forest Sector Questionnaire (JFSQ) (https://www.fao.org/forestry/statistics/80572/en/). The database contains details of the following topics: - Roundwood removals (production) by coniferous and non-coniferous wood and assortments, - production and trade in industrial roundwood, sawnwood, wood-based panels, wood charcoal, pulp, paper and paperboard, and other products. More detailed information on wood products, including definitions, can be found at https://www.fao.org/forestry/statistics/80572/en + citation_full: 'Food and Agriculture Organization of the United Nations - Forestry: Forestry Production and Trade (2023).' + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/FO + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Forestry_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-12-21' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 25c7d0da13a6d09599c0c832c225ee3e + size: 16176675 + path: faostat_fo.zip diff --git a/snapshots/faostat/2024-03-14/faostat_fs.zip.dvc b/snapshots/faostat/2024-03-14/faostat_fs.zip.dvc new file mode 100644 index 00000000000..727172e82df --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_fs.zip.dvc @@ -0,0 +1,20 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Food Security and Nutrition: Suite of Food Security Indicators' + description: |- + The Suite of Food Security Indicators presents the core set of food security indicators. Following the recommendation of experts gathered in the Committee on World Food Security (CFS) Round Table on hunger measurement, hosted at FAO headquarters in September 2011, an initial set of indicators aiming to capture various aspects of food insecurity is presented here. 
The choice of the indicators has been informed by expert judgment and the availability of data with sufficient coverage to enable comparisons across regions and over time. Many of these indicators are produced and published elsewhere by FAO and other international organizations. They are reported here in a single database with the aim of building a wide food security information system. More indicators will be added to this set as more data will become available. Indicators are classified along the four dimensions of food security -- availability, access, utilization and stability. For definitions of these indicators, see Definitions and standards below (under Item). + citation_full: |- + Food and Agriculture Organization of the United Nations - Food Security and Nutrition: Suite of Food Security Indicators (2023). + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/FS + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Food_Security_Data_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-08-14' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 517e6695aafbfd3ddf69f3a6dd9a5bcc + size: 1881582 + path: faostat_fs.zip diff --git a/snapshots/faostat/2024-03-14/faostat_gn.zip.dvc b/snapshots/faostat/2024-03-14/faostat_gn.zip.dvc new file mode 100644 index 00000000000..262fce0487b --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_gn.zip.dvc @@ -0,0 +1,20 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Climate Change: Agrifood systems emissions: Emissions from Energy use in agriculture' + description: |- + Greenhouse gas (GHG) emissions from direct on-farm energy use consist of carbon dioxide, methane, and nitrous oxide gases related with fuel combustion and electricity generation in agriculture (including fisheries). 
The FAOSTAT emissions database has a global scope for the period 1970 to 2021 (with annual updates), by motor gasoline, gas-diesel oils, gasoline, natural gas, liquefied petroleum gas, residual fuel oil, coal, electricity, heat, gas-diesel oils in fisheries, residual fuel oil in fisheries, and by aggregates (total energy, energy consumed in fishery and total energy without electricity heat). Activity data (Energy use) is also provided. + citation_full: |- + Food and Agriculture Organization of the United Nations - Climate Change: Agrifood systems emissions: Emissions from Energy use in agriculture (2023). + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/GN + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Emissions_Agriculture_Energy_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-12-07' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 6637a49f9c5d4f28a7644d96d7e7469b + size: 1597810 + path: faostat_gn.zip diff --git a/snapshots/faostat/2024-03-14/faostat_ic.zip.dvc b/snapshots/faostat/2024-03-14/faostat_ic.zip.dvc new file mode 100644 index 00000000000..cca81e228e2 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_ic.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Investment: Credit to Agriculture' + description: |- + The Credit to Agriculture dataset provides national data for over 130 countries on the amount of loans provided by the private/commercial banking sector to producers in agriculture, forestry and fishing, including household producers, cooperatives, and agro-businesses. For some countries, the three subsectors of agriculture, forestry, and fishing are completely specified. In other cases, complete disaggregations are not available.
The dataset also provides statistics on the total credit to all industries, indicators on the share of credit to agricultural producers, and an agriculture orientation index (AOI) for credit that normalizes the share of credit to agriculture over total credit by dividing it by the share of agriculture in gross domestic product (GDP). As such, it can provide a more accurate indication of the relative importance that banking sectors place on financing the sector. An AOI lower than 1 indicates that the agriculture sector receives a credit share lower than its contribution to the economy, while an AOI greater than 1 indicates a credit share to the agriculture sector greater than its economic contribution. + citation_full: 'Food and Agriculture Organization of the United Nations - Investment: Credit to Agriculture (2023).' + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/IC + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Investment_CreditAgriculture_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-11-16' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: b2379bf789d2774f8085206615ca6bc4 + size: 501545 + path: faostat_ic.zip diff --git a/snapshots/faostat/2024-03-14/faostat_lc.zip.dvc b/snapshots/faostat/2024-03-14/faostat_lc.zip.dvc new file mode 100644 index 00000000000..02f13518fac --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_lc.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Land, Inputs and Sustainability: Land Cover' + description: |- + The FAOSTAT domain Land Cover under the Agri-Environmental Indicators section contains land cover information organized by the land cover classes of the international standard system for Environmental and Economic Accounting Central Framework (SEEA CF). 
The land cover information is compiled from publicly available Global Land Cover (GLC) maps: a) MODIS land cover types based on the Land Cover Classification System, LCCS (2001–2021); b) The European Spatial Agency (ESA) Climate Change Initiative (CCI) annual land cover maps (1992–2020) produced by the Université catholique de Louvain (UCL)-Geomatics and now under the European Copernicus Program; c) The annual land cover maps which were produced under the European Copernicus Global Land Service (CGLS) (CGLS land cover, containing discrete land cover categorization for the period 2015–2019), with spatial resolution 100m; and d) 4) The WorldCover maps of the European Space Agency —available for the years 2020 and 2021, produced at 10m resolution. + citation_full: 'Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Land Cover (2023).' + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/LC + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Environment_LandCover_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-06-13' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: a807127de14dacc4b321c344d12e3383 + size: 1528536 + path: faostat_lc.zip diff --git a/snapshots/faostat/2024-03-14/faostat_metadata.json.dvc b/snapshots/faostat/2024-03-14/faostat_metadata.json.dvc new file mode 100644 index 00000000000..dcf032d1347 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_metadata.json.dvc @@ -0,0 +1,17 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: Metadata and identifiers + description: Metadata and identifiers used in FAOSTAT datasets. + citation_full: Food and Agriculture Organization of the United Nations (2024). 
+ attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data + date_accessed: '2024-03-14' + date_published: '2024-03-14' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 7529f6174cd449873afc82cd0185be8c + size: 11642573 + path: faostat_metadata.json diff --git a/snapshots/faostat/2024-03-14/faostat_qcl.zip.dvc b/snapshots/faostat/2024-03-14/faostat_qcl.zip.dvc new file mode 100644 index 00000000000..7efc45a5ac8 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_qcl.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Production: Crops and livestock products' + description: |- + Crop and livestock statistics are recorded for 278 products, covering the following categories: 1) CROPS PRIMARY: Cereals, Citrus Fruit, Fibre Crops, Fruit, Oil Crops, Oil Crops and Cakes in Oil Equivalent, Pulses, Roots and Tubers, Sugar Crops, Treenuts and Vegetables. Data are expressed in terms of area harvested, production quantity and yield. Cereals: Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded. 2) CROPS PROCESSED: Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine. 3) LIVE ANIMALS: Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys. 
4) LIVESTOCK PRIMARY: Beeswax; Eggs (various types); Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); Offals, nes; Silk-worm cocoons, reelable; Skins (goat, sheep); Snails, not sea; Wool, greasy. 5) LIVESTOCK PROCESSED: Butter (of milk from sheep, goat, buffalo, cow); Cheese (of milk from goat, buffalo, sheep, cow milk); Cheese of skimmed cow milk; Cream fresh; Ghee (cow and buffalo milk); Lard; Milk (dry buttermilk, skimmed condensed, skimmed cow, skimmed dried, skimmed evaporated, whole condensed, whole dried, whole evaporated); Silk raw; Tallow; Whey (condensed and dry); Yoghurt. + citation_full: 'Food and Agriculture Organization of the United Nations - Production: Crops and livestock products (2023).' + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/QCL + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Production_Crops_Livestock_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-12-18' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 516c32dc777dd23da0c6e79a4c13f2db + size: 33327037 + path: faostat_qcl.zip diff --git a/snapshots/faostat/2024-03-14/faostat_qi.zip.dvc b/snapshots/faostat/2024-03-14/faostat_qi.zip.dvc new file mode 100644 index 00000000000..4cb825a6815 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_qi.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Production: Production Indices' + description: |- + The FAO indices of agricultural production show the relative level of the aggregate volume of agricultural production for each year in comparison with the base period 2014-2016. 
Indices for meat production are computed based on data for production from indigenous animals. + citation_full: 'Food and Agriculture Organization of the United Nations - Production: Production Indices (2024).' + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/QI + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Production_Indices_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2024-03-07' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 670b6fd31bd33084dc1e279907eecc4d + size: 15758082 + path: faostat_qi.zip diff --git a/snapshots/faostat/2024-03-14/faostat_qv.zip.dvc b/snapshots/faostat/2024-03-14/faostat_qv.zip.dvc new file mode 100644 index 00000000000..e04c7483305 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_qv.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Production: Value of Agricultural Production' + description: |- + Values of agricultural production are calculated based on production data of primary commodities from Production domain and producer prices from Prices domain. The livestock value of production is measured in terms of indigenous meat. + citation_full: 'Food and Agriculture Organization of the United Nations - Production: Value of Agricultural Production (2024).' 
+ attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/QV + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Value_of_Production_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2024-03-07' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 7bc127c8690bd3e25964d1fc75393a2c + size: 26486976 + path: faostat_qv.zip diff --git a/snapshots/faostat/2024-03-14/faostat_rfb.zip.dvc b/snapshots/faostat/2024-03-14/faostat_rfb.zip.dvc new file mode 100644 index 00000000000..34cf2b0c69f --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_rfb.zip.dvc @@ -0,0 +1,20 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Land, Inputs and Sustainability: Fertilizers by Product' + description: |- + The Fertilizers by Product dataset contains information on the Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers products, over the time series 2002-present. The fertilizer statistics data are for a set of 23 product categories. Both straight and compound fertilizers are included. There is information available about methodology at: https://fenixservices.fao.org/faostat/static/documents/RFB/RFB_EN_README.pdf. + citation_full: |- + Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Fertilizers by Product (2023). 
+ attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/RFB + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Inputs_FertilizersProduct_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-06-08' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 3ce0a4e990624c4944d38a66331712da + size: 1941499 + path: faostat_rfb.zip diff --git a/snapshots/faostat/2024-03-14/faostat_rfn.zip.dvc b/snapshots/faostat/2024-03-14/faostat_rfn.zip.dvc new file mode 100644 index 00000000000..8f4c7024951 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_rfn.zip.dvc @@ -0,0 +1,20 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Land, Inputs and Sustainability: Fertilizers by Nutrient' + description: |- + The Fertilizers by Nutrient dataset contains information on the totals in nutrients for Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers, over the time series 1961-present. The data are provided for the three primary plant nutrients: nitrogen (N), phosphorus (expressed as P2O5) and potassium (expressed as K2O). Both straight and compound fertilizers are included. There is information on the methodology available at: https://fenixservices.fao.org/faostat/static/documents/RFN/RFN_EN_README.pdf + citation_full: |- + Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Fertilizers by Nutrient (2023). 
+ attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/RFN + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Inputs_FertilizersNutrient_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-07-05' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: f0f44d270e8fa65a7efa519e14757e1d + size: 1838529 + path: faostat_rfn.zip diff --git a/snapshots/faostat/2024-03-14/faostat_rl.zip.dvc b/snapshots/faostat/2024-03-14/faostat_rl.zip.dvc new file mode 100644 index 00000000000..50472f3f795 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_rl.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Land, Inputs and Sustainability: Land Use' + description: |- + The FAOSTAT Land Use domain contains data on forty-four categories of land use, irrigation and agricultural practices and five indicators relevant to monitor agriculture, forestry and fisheries activities at national, regional and global level. Data are available by country and year, with global coverage and annual updates. + citation_full: 'Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Land Use (2024).' 
+ attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/RL + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Inputs_LandUse_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2024-02-15' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 700a9089cfba6504e92aacd25e176fd7 + size: 2602866 + path: faostat_rl.zip diff --git a/snapshots/faostat/2024-03-14/faostat_rp.zip.dvc b/snapshots/faostat/2024-03-14/faostat_rp.zip.dvc new file mode 100644 index 00000000000..a9df3aae275 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_rp.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Land, Inputs and Sustainability: Pesticides Use' + description: |- + The Pesticides Use database includes data on the use of major pesticide groups (Insecticides, Herbicides, Fungicides, Plant growth regulators and Rodenticides) and of relevant chemical families. Data report the quantities (in tonnes of active ingredients) + citation_full: 'Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Pesticides Use (2024).' 
+ attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/RP + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Inputs_Pesticides_Use_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2024-02-18' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 0d14a1888b0fb2df03c74ffba0df49d1 + size: 870260 + path: faostat_rp.zip diff --git a/snapshots/faostat/2024-03-14/faostat_rt.zip.dvc b/snapshots/faostat/2024-03-14/faostat_rt.zip.dvc new file mode 100644 index 00000000000..ca5d5143ce7 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_rt.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Land, Inputs and Sustainability: Pesticides Trade' + description: |- + This domain contains data on pesticides and covers two different categories: pesticides traded in form or packagingfor retail sale or as preparations or articles, and pesticides traded as separate chemically defined compounds (if relevant for the Rotterdam Convention on the Prior Informed Consent Procedure for Certain Hazardous Chemicals and Pesticides in International Trade). The pesticides traded for retail sale or as preparations or articles are those classified under code 38.08 in the Harmonized System Nomenclature (HS) and include: hazardous pesticides, insecticides, fungicides, herbicides, disinfectants and other. For these pesticides, this domain contains trade data (imports and exports) in values only (current 1000 US dollars), and the time series extends from 1961 onwards. The pesticides traded as separate chemically defined compounds are those listed in Annex III of the Rotterdam Convention (excluding industrial chemicals) and therefore subject to the Prior Informed Consent (PIC) procedure. The correspondence with the HS Nomenclature is shown in the table at the Related Documents section. 
For these pesticides, this domain contains trade data (imports and exports) in both value (current 1000 US dollars) and quantity (net weight in tonnes), and the time series extends from 2007 onwards. + citation_full: 'Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Pesticides Trade (2023).' + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/RT + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Inputs_Pesticides_Trade_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-07-03' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 629747f935ad45d2af3974bc7edeebc7 + size: 1656904 + path: faostat_rt.zip diff --git a/snapshots/faostat/2024-03-14/faostat_scl.zip.dvc b/snapshots/faostat/2024-03-14/faostat_scl.zip.dvc new file mode 100644 index 00000000000..b1bb9cb7eba --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_scl.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Food Balances: Supply Utilization Accounts (2010-)' + description: |- + Supply Utilization Accounts (SUA's) present a comprehensive picture of the pattern of a country's food supply during a specified reference period. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. 
The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content. + citation_full: 'Food and Agriculture Organization of the United Nations - Food Balances: Supply Utilization Accounts (2010-) (2023).' + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/SCL + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/SUA_Crops_Livestock_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-09-04' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 7c12b0bb36fe2ff93ff6a57f5a6a419f + size: 75493417 + path: faostat_scl.zip diff --git a/snapshots/faostat/2024-03-14/faostat_sdgb.zip.dvc b/snapshots/faostat/2024-03-14/faostat_sdgb.zip.dvc new file mode 100644 index 00000000000..51c00325eba --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_sdgb.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'SDG Indicators: SDG Indicators' + description: |- + As the custodian agency of 21 SDG indicators, the Food and Agriculture Organization of the United Nations (FAO) is responsible for curating and refining the methodologies of these indicators, collecting data from national sources, ensuring their quality and compatibility with applicable standards and classifications, and disseminating data at global level. This FAOSTAT domain complements the global SDG database administered by the United Nations Statistical Division, as well as FAO’s SDG indicators portal, by providing access to the available data for each of these indicators. 
Please click the metadata link on the right hand navigation column for an abridged version of the methodology for compiling each of these indicators, a description of data sources and the relevant contact persons responsible for each indicator in the Organization. For a more detailed description of the methodology, data sources and reporting procedures, please follow the link to the official SDG indicator metadata document available at the bottom of each summary metadata page in the document on the right. + citation_full: 'Food and Agriculture Organization of the United Nations - SDG Indicators: SDG Indicators (2023).' + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/SDGB + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/SDG_BulkDownloads_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-09-20' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 13ef8ed550fc5d6029f1ca154c330485 + size: 3157616 + path: faostat_sdgb.zip diff --git a/snapshots/faostat/2024-03-14/faostat_tcl.zip.dvc b/snapshots/faostat/2024-03-14/faostat_tcl.zip.dvc new file mode 100644 index 00000000000..7543a8f78e7 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_tcl.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Trade: Crops and livestock products' + description: |- + The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. 
The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world. + citation_full: 'Food and Agriculture Organization of the United Nations - Trade: Crops and livestock products (2023).' + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/TCL + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Trade_CropsLivestock_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-12-13' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: 81025b83ce4eb7e40c03b7eb0f96e391 + size: 256273086 + path: faostat_tcl.zip diff --git a/snapshots/faostat/2024-03-14/faostat_ti.zip.dvc b/snapshots/faostat/2024-03-14/faostat_ti.zip.dvc new file mode 100644 index 00000000000..628f8d1e058 --- /dev/null +++ b/snapshots/faostat/2024-03-14/faostat_ti.zip.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: Food and Agriculture Organization of the United Nations + title: 'Trade: Trade Indices' + description: |- + The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world. 
+ citation_full: 'Food and Agriculture Organization of the United Nations - Trade: Trade Indices (2023).' + attribution_short: FAOSTAT + url_main: http://www.fao.org/faostat/en/#data/TI + url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Trade_Indices_E_All_Data_(Normalized).zip + date_accessed: '2024-03-14' + date_published: '2023-12-18' + license: + name: CC BY-NC-SA 3.0 IGO + url: http://www.fao.org/contact-us/terms/db-terms-of-use/en +outs: + - md5: dcc2dc25b808c366bddfb1bec18653bc + size: 126295824 + path: faostat_ti.zip From 79e1784d660c00613031791679202014cd8fcf56 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 14 Mar 2024 10:11:59 +0100 Subject: [PATCH 04/54] Remove faostat_gn and fix issues --- dag/faostat.yml | 2 -- etl/scripts/faostat/create_new_snapshots.py | 16 +++++++++++---- etl/scripts/faostat/shared.py | 4 ++-- .../meadow/faostat/2024-03-14/faostat_gn.py | 2 -- .../faostat/2024-03-14/faostat_gn.zip.dvc | 20 ------------------- .../2024-03-14/faostat_metadata.json.dvc | 4 ++-- 6 files changed, 16 insertions(+), 32 deletions(-) delete mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_gn.py delete mode 100644 snapshots/faostat/2024-03-14/faostat_gn.zip.dvc diff --git a/dag/faostat.yml b/dag/faostat.yml index 14877d0dd58..30395680a6e 100644 --- a/dag/faostat.yml +++ b/dag/faostat.yml @@ -352,8 +352,6 @@ steps: - snapshot://faostat/2024-03-14/faostat_fo.zip data://meadow/faostat/2024-03-14/faostat_fs: - snapshot://faostat/2024-03-14/faostat_fs.zip - data://meadow/faostat/2024-03-14/faostat_gn: - - snapshot://faostat/2024-03-14/faostat_gn.zip data://meadow/faostat/2024-03-14/faostat_ic: - snapshot://faostat/2024-03-14/faostat_ic.zip data://meadow/faostat/2024-03-14/faostat_lc: diff --git a/etl/scripts/faostat/create_new_snapshots.py b/etl/scripts/faostat/create_new_snapshots.py index e6b2fc8b664..01da17e543a 100644 --- a/etl/scripts/faostat/create_new_snapshots.py +++ b/etl/scripts/faostat/create_new_snapshots.py 
@@ -185,9 +185,15 @@ def is_dataset_already_up_to_date( """ dataset_up_to_date = False for snapshot in existing_snapshots: - assert snapshot.metadata.source - snapshot_source_data_url = snapshot.metadata.source.source_data_url - snapshot_date_accessed = parser.parse(str(snapshot.metadata.source.date_accessed)).date() + assert snapshot.metadata.source or snapshot.metadata.origin + if snapshot.metadata.source: + snapshot_source_data_url = snapshot.metadata.source.source_data_url + snapshot_date_accessed = parser.parse(str(snapshot.metadata.source.date_accessed)).date() + elif snapshot.metadata.origin: + snapshot_source_data_url = snapshot.metadata.origin.url_download + snapshot_date_accessed = parser.parse(str(snapshot.metadata.origin.date_accessed)).date() + else: + raise ValueError(f"Snapshot {snapshot.metadata.short_name} does not have source or origin.") if (snapshot_source_data_url == source_data_url) and (snapshot_date_accessed > source_modification_date): dataset_up_to_date = True @@ -275,7 +281,9 @@ def to_snapshot(self) -> None: def main(read_only: bool = False) -> None: # Load list of existing snapshots related to current NAMESPACE. - existing_snapshots = list(snapshot_catalog(match=NAMESPACE)) + existing_snapshots = [ + snapshot for snapshot in list(snapshot_catalog(match=NAMESPACE)) if "backport/" not in snapshot.uri + ] # Initialise a flag that will become true if any dataset needs to be updated. any_dataset_was_updated = False diff --git a/etl/scripts/faostat/shared.py b/etl/scripts/faostat/shared.py index 34af9c739f0..6ce58a94956 100644 --- a/etl/scripts/faostat/shared.py +++ b/etl/scripts/faostat/shared.py @@ -56,8 +56,6 @@ "fo", # Food Security and Nutrition: Suite of Food Security Indicators. "fs", - # Energy use. - "gn", # Credit to Agriculture. "ic", # Land, Inputs and Sustainability: Land Cover. @@ -89,6 +87,8 @@ # Removed from the list (as they have not been used and were causing issues). # World Census of Agriculture. # "wcad", + # Energy use. 
+ # "gn", ] # URL for dataset codes in FAOSTAT catalog. # This is the URL used to get the remote location of the actual data files to be downloaded, and the date of their diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_gn.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_gn.py deleted file mode 100644 index 6cc1cdd3414..00000000000 --- a/etl/steps/data/meadow/faostat/2024-03-14/faostat_gn.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_gn dataset.""" -from .shared import run # noqa:F401 diff --git a/snapshots/faostat/2024-03-14/faostat_gn.zip.dvc b/snapshots/faostat/2024-03-14/faostat_gn.zip.dvc deleted file mode 100644 index 262fce0487b..00000000000 --- a/snapshots/faostat/2024-03-14/faostat_gn.zip.dvc +++ /dev/null @@ -1,20 +0,0 @@ -meta: - origin: - producer: Food and Agriculture Organization of the United Nations - title: 'Climate Change: Agrifood systems emissions: Emissions from Energy use in agriculture' - description: |- - Greenhouse gas (GHG) emissions from direct on-farm energy use consist of carbon dioxide, methane, and nitrous oxide gases related with fuel combustion and electricity generation in agriculture (including fisheries). The FAOSTAT emissions database has a global scope for the period 1970 to 2021 (with annual updates), by motor gasoline, gas-diesel oils, gasoline, natural gas, liquefied petroleum gas, residual fuel oil, coal, electricity, heat, gas-diesel oils in fisheries, residual fuel oil in fisheries, and by aggregates (total energy, energy consumed in fishery and total energy without electricity heat). Activity data(Energy use) is also provided. - citation_full: |- - Food and Agriculture Organization of the United Nations - Climate Change: Agrifood systems emissions: Emissions from Energy use in agriculture (2023). 
- attribution_short: FAOSTAT - url_main: http://www.fao.org/faostat/en/#data/GN - url_download: https://fenixservices.fao.org/faostat/static/bulkdownloads/Emissions_Agriculture_Energy_E_All_Data_(Normalized).zip - date_accessed: '2024-03-14' - date_published: '2023-12-07' - license: - name: CC BY-NC-SA 3.0 IGO - url: http://www.fao.org/contact-us/terms/db-terms-of-use/en -outs: - - md5: 6637a49f9c5d4f28a7644d96d7e7469b - size: 1597810 - path: faostat_gn.zip diff --git a/snapshots/faostat/2024-03-14/faostat_metadata.json.dvc b/snapshots/faostat/2024-03-14/faostat_metadata.json.dvc index dcf032d1347..39d98f2073a 100644 --- a/snapshots/faostat/2024-03-14/faostat_metadata.json.dvc +++ b/snapshots/faostat/2024-03-14/faostat_metadata.json.dvc @@ -12,6 +12,6 @@ meta: name: CC BY-NC-SA 3.0 IGO url: http://www.fao.org/contact-us/terms/db-terms-of-use/en outs: - - md5: 7529f6174cd449873afc82cd0185be8c - size: 11642573 + - md5: a8ed55bb4ac26e10651fa5517f07227d + size: 10861715 path: faostat_metadata.json From 3d0eaf279c4ba015ddd83113503b374cb103159c Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 14 Mar 2024 10:20:12 +0100 Subject: [PATCH 05/54] Fix meadow faostat_sdgb --- etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py index 2f1133f4cc8..6140fe64de6 100644 --- a/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py @@ -97,6 +97,11 @@ "index": ["Source Code"], "short_name": "sources", }, + # Specific for faostat_sdgb. 
+ "itemsSDG": { + "index": ["Item Code"], + "short_name": "item", + }, } From ac6e691f8ca083eee4f5cd5079df2a881bf5037a Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 14 Mar 2024 10:21:04 +0100 Subject: [PATCH 06/54] Update docs --- docs/data/faostat.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/data/faostat.md b/docs/data/faostat.md index 04d18449f13..6e8520f4e74 100644 --- a/docs/data/faostat.md +++ b/docs/data/faostat.md @@ -65,7 +65,6 @@ List of meadow datasets that are generated, and their titles: * `faostat_fbsh`: Food Balances: Food Balances (-2013, old methodology and population). * `faostat_fo`: Forestry: Forestry Production and Trade. * `faostat_fs`: Food Security and Nutrition: Suite of Food Security Indicators. -* `faostat_gn`: Climate Change: Energy Use. * `faostat_ic`: Investment: Credit to Agriculture. * `faostat_lc`: Land, Inputs and Sustainability: Land Cover. * `faostat_qcl`: Production: Crops and livestock products. @@ -80,7 +79,6 @@ List of meadow datasets that are generated, and their titles: * `faostat_sdgb`: SDG Indicators: SDG Indicators. * `faostat_tcl`: Trade: Crops and livestock products. * `faostat_ti`: Trade: Trade Indices. -* `faostat_wcad`: World Census of Agriculture: Structural data from agricultural censuses. 
Each dataset `faostat_*` contains only one table: From 17de3c2ab1ed806414b41546e8361520a19a94bb Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 14 Mar 2024 12:43:36 +0100 Subject: [PATCH 07/54] Add garden steps and minor improvements on meadow --- dag/faostat.yml | 156 ++ .../faostat/2024-03-14/custom_datasets.csv | 37 + .../2024-03-14/custom_elements_and_units.csv | 35 + .../faostat/2024-03-14/custom_items.csv | 255 +++ .../faostat/2024-03-14/detected_anomalies.py | 832 ++++++++ .../faostat/2024-03-14/faostat.countries.json | 324 +++ .../faostat.excluded_countries.json | 59 + .../garden/faostat/2024-03-14/faostat_cahd.py | 2 + .../garden/faostat/2024-03-14/faostat_ei.py | 2 + .../garden/faostat/2024-03-14/faostat_ek.py | 2 + .../garden/faostat/2024-03-14/faostat_emn.py | 2 + .../garden/faostat/2024-03-14/faostat_esb.py | 2 + .../garden/faostat/2024-03-14/faostat_fa.py | 2 + .../garden/faostat/2024-03-14/faostat_fbsc.py | 219 ++ .../garden/faostat/2024-03-14/faostat_fo.py | 2 + .../2024-03-14/faostat_food_explorer.py | 555 +++++ .../garden/faostat/2024-03-14/faostat_fs.py | 2 + .../garden/faostat/2024-03-14/faostat_ic.py | 2 + .../garden/faostat/2024-03-14/faostat_lc.py | 2 + .../faostat/2024-03-14/faostat_metadata.py | 1076 ++++++++++ .../garden/faostat/2024-03-14/faostat_qcl.py | 534 +++++ .../garden/faostat/2024-03-14/faostat_qi.py | 2 + .../garden/faostat/2024-03-14/faostat_qv.py | 2 + .../garden/faostat/2024-03-14/faostat_rfb.py | 2 + .../garden/faostat/2024-03-14/faostat_rfn.py | 2 + .../garden/faostat/2024-03-14/faostat_rl.py | 2 + .../garden/faostat/2024-03-14/faostat_rp.py | 2 + .../garden/faostat/2024-03-14/faostat_rt.py | 2 + .../garden/faostat/2024-03-14/faostat_scl.py | 2 + .../garden/faostat/2024-03-14/faostat_sdgb.py | 2 + .../garden/faostat/2024-03-14/faostat_tcl.py | 2 + .../garden/faostat/2024-03-14/faostat_ti.py | 2 + .../data/garden/faostat/2024-03-14/shared.py | 1893 +++++++++++++++++ .../faostat/2024-03-14/value_amendments.csv | 8 + 
.../faostat/2024-03-14/faostat_metadata.py | 2 +- .../data/meadow/faostat/2024-03-14/shared.py | 2 +- 36 files changed, 6027 insertions(+), 2 deletions(-) create mode 100644 etl/steps/data/garden/faostat/2024-03-14/custom_datasets.csv create mode 100644 etl/steps/data/garden/faostat/2024-03-14/custom_elements_and_units.csv create mode 100644 etl/steps/data/garden/faostat/2024-03-14/custom_items.csv create mode 100644 etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat.countries.json create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat.excluded_countries.json create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_cahd.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_ei.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_ek.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_emn.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_esb.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_fa.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_fo.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_fs.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_ic.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_lc.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_qi.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_qv.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_rfb.py create mode 100644 
etl/steps/data/garden/faostat/2024-03-14/faostat_rfn.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_rl.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_rp.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_rt.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_scl.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_sdgb.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_tcl.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_ti.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/shared.py create mode 100644 etl/steps/data/garden/faostat/2024-03-14/value_amendments.csv diff --git a/dag/faostat.yml b/dag/faostat.yml index 30395680a6e..7a44fd08583 100644 --- a/dag/faostat.yml +++ b/dag/faostat.yml @@ -382,3 +382,159 @@ steps: - snapshot://faostat/2024-03-14/faostat_tcl.zip data://meadow/faostat/2024-03-14/faostat_ti: - snapshot://faostat/2024-03-14/faostat_ti.zip + # + # FAOSTAT garden steps for version 2024-03-14 + # + data://garden/faostat/2024-03-14/faostat_cahd: + - data://meadow/faostat/2024-03-14/faostat_cahd + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_ei: + - data://garden/faostat/2024-03-14/faostat_metadata + - data://meadow/faostat/2024-03-14/faostat_ei + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_ek: + - data://meadow/faostat/2024-03-14/faostat_ek + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_emn: + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/wb/2024-03-11/income_groups + - 
data://grapher/demography/2023-03-31/population + - data://meadow/faostat/2024-03-14/faostat_emn + data://garden/faostat/2024-03-14/faostat_esb: + - data://garden/faostat/2024-03-14/faostat_metadata + - data://meadow/faostat/2024-03-14/faostat_esb + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_fa: + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://meadow/faostat/2024-03-14/faostat_fa + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_fbsc: + - data://meadow/faostat/2024-03-14/faostat_fbsh + - data://meadow/faostat/2024-03-14/faostat_fbs + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_fo: + - data://garden/faostat/2024-03-14/faostat_metadata + - data://meadow/faostat/2024-03-14/faostat_fo + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_food_explorer: + - data://garden/faostat/2024-03-14/faostat_qcl + - data://garden/faostat/2024-03-14/faostat_fbsc + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_fs: + - data://meadow/faostat/2024-03-14/faostat_fs + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_ic: + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://meadow/faostat/2024-03-14/faostat_ic + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_lc: + 
- data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + - data://meadow/faostat/2024-03-14/faostat_lc + data://garden/faostat/2024-03-14/faostat_metadata: + - data://meadow/faostat/2024-03-14/faostat_rt + - data://meadow/faostat/2024-03-14/faostat_scl + - data://meadow/faostat/2024-03-14/faostat_el + - data://meadow/faostat/2023-06-12/faostat_gn + - data://meadow/faostat/2024-03-14/faostat_sdgb + - data://meadow/faostat/2024-03-14/faostat_qv + - data://meadow/faostat/2024-03-14/faostat_emn + - data://meadow/faostat/2024-03-14/faostat_ek + - data://meadow/faostat/2024-03-14/faostat_qcl + - data://meadow/faostat/2024-03-14/faostat_fbsh + - data://meadow/faostat/2024-03-14/faostat_tcl + - data://meadow/faostat/2024-03-14/faostat_fa + - data://meadow/faostat/2024-03-14/faostat_fo + - data://meadow/faostat/2024-03-14/faostat_metadata + - data://meadow/faostat/2024-03-14/faostat_fs + - data://meadow/faostat/2024-03-14/faostat_ei + - data://meadow/faostat/2024-03-14/faostat_rl + - data://meadow/faostat/2024-03-14/faostat_ic + - data://meadow/faostat/2024-03-14/faostat_ef + - data://meadow/faostat/2024-03-14/faostat_qi + - data://meadow/faostat/2024-03-14/faostat_rfn + - data://meadow/faostat/2024-03-14/faostat_rfb + - data://meadow/faostat/2024-03-14/faostat_esb + - data://meadow/faostat/2024-03-14/faostat_lc + - data://meadow/faostat/2024-03-14/faostat_rp + - data://meadow/faostat/2024-03-14/faostat_cahd + - data://meadow/faostat/2024-03-14/faostat_fbs + - data://meadow/faostat/2024-03-14/faostat_ti + - data://meadow/faostat/2023-06-12/faostat_wcad + - data://meadow/faostat/2024-03-14/faostat_ep + data://garden/faostat/2024-03-14/faostat_qcl: + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://meadow/faostat/2024-03-14/faostat_qcl + - data://grapher/wb/2024-03-11/income_groups + 
data://garden/faostat/2024-03-14/faostat_qi: + - data://meadow/faostat/2024-03-14/faostat_qi + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_qv: + - data://meadow/faostat/2024-03-14/faostat_qv + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_rfb: + - data://garden/faostat/2024-03-14/faostat_metadata + - data://meadow/faostat/2024-03-14/faostat_rfb + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_rfn: + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://meadow/faostat/2024-03-14/faostat_rfn + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_rl: + - data://garden/faostat/2024-03-14/faostat_metadata + - data://meadow/faostat/2024-03-14/faostat_rl + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_rp: + - data://meadow/faostat/2024-03-14/faostat_rp + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_rt: + - data://garden/faostat/2024-03-14/faostat_metadata + - data://meadow/faostat/2024-03-14/faostat_rt + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_scl: + - data://meadow/faostat/2024-03-14/faostat_scl + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + 
data://garden/faostat/2024-03-14/faostat_sdgb: + - data://garden/faostat/2024-03-14/faostat_metadata + - data://meadow/faostat/2024-03-14/faostat_sdgb + - data://grapher/wb/2024-03-11/income_groups + - data://grapher/demography/2023-03-31/population + data://garden/faostat/2024-03-14/faostat_tcl: + - data://meadow/faostat/2024-03-14/faostat_tcl + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://grapher/wb/2024-03-11/income_groups + data://garden/faostat/2024-03-14/faostat_ti: + - data://garden/faostat/2024-03-14/faostat_metadata + - data://grapher/demography/2023-03-31/population + - data://meadow/faostat/2024-03-14/faostat_ti + - data://grapher/wb/2024-03-11/income_groups diff --git a/etl/steps/data/garden/faostat/2024-03-14/custom_datasets.csv b/etl/steps/data/garden/faostat/2024-03-14/custom_datasets.csv new file mode 100644 index 00000000000..66d40136c44 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/custom_datasets.csv @@ -0,0 +1,37 @@ +dataset,fao_dataset_title,owid_dataset_title,fao_dataset_description,owid_dataset_description +faostat_cahd,Cost and Affordability of a Healthy Diet: Cost and Affordability of a Healthy Diet (CoAHD) - FAO (2023),,"Indicators on the cost and affordability of a healthy diet are estimated in each country and show the population’s physical and economic access to least expensive locally available foods to meet requirements for a healthy diet, as defined in food-based dietary guidelines (FBDGs). The indicators use observed retail food consumer prices and income distributions to provide an operational measure of people’s access to locally available foods in the proportions needed for health. These indicators support efforts within the framework of the Sustainable Development Goals (SDGs) to end hunger, achieve food security and improved nutrition, and promote sustainable agriculture by 2030 (SDG 2). 
They also support the monitoring of progress towards the objective of transforming agrifood systems by promoting “nutrition-sensitive agriculture”. For definitions of these indicators, see Definitions and standards.", +faostat_ef,"Land, Inputs and Sustainability: Fertilizers indicators - FAO (2022)",Agri-Environmental Indicators: Fertilizers indicators,"The FAOSTAT domain Fertilizers Indicators provides information on three rations: a) the ratio between the totals by nutrient of agricultural use of chemical or mineral fertilizers, reported in the FAOSTAT domain “Inputs/Fertilizers by Nutrient” for nitrogen (N), phosphorus (expressed as P2O5) and potassium (expressed as K2O) and the area of cropland reported in the FAOSTAT domain “Inputs/Land Use”; b) The ratio of fertilizers use and the annual population reported in the FAOSTAT domain “Population and Employment/Population”; and c) The ratio of fertilizers use and the value of agricultural production reported in the FAOSTAT domain “Production/Value of Agricultural Production.Data are available at national, regional, and global level over the time series 1961-present.","Agri-Environmental Indicators: Fertilizers indicators + +This dataset describes the use of chemical and mineral fertilizers per area of cropland (which corresponds to the sum of arable land and permanent crops) at national, regional, and global level." +faostat_ei,Climate Change: Emissions intensities - FAO (2022),Agri-Environmental Indicators: Emissions intensities,"The FAOSTAT domain Emissions intensities contains analytical data on the intensity of greenhouse gas (GHG) emissions by agricultural commodity. This indicator is defined as greenhouse gas emissions per kg of product. Data are available for a set of agricultural commodities (e.g. 
rice and other cereals, meat, milk, eggs), by country, with global coverage and relative to the period 1961–2020.",Agri-Environmental Indicators: Emissions intensities +faostat_ek,"Land, Inputs and Sustainability: Livestock Patterns - FAO (2022)",Agri-Environmental Indicators: Livestock Patterns,"The Livestock Patterns domain of FAOSTAT contains data on livestock numbers, shares of major livestock species and densities of livestock units in the agricultural land area. Values are calculated using Livestock Units (LSU), which facilitate aggregating information for different livestock types. Data are available by country, with global coverage, for the period 1961 to present, with annual updates. This methodology applies the LSU coefficients reported in the ""Guidelines for the preparation of livestock sector reviews"" (FAO, 2011). From this publication, LSU coefficients are computed by livestock type and by country. The reference unit used for the calculation of livestock units (=1 LSU) is the grazing equivalent of one adult dairy cow producing 3000 kg of milk annually, fed without additional concentrated foodstuffs. 
FAOSTAT agri-environmental indicators on livestock patterns closely follow the structure of the indicators in EUROSTAT.",Agri-Environmental Indicators: Livestock Patterns +faostat_el,"Land, Inputs and Sustainability: Land use indicators - FAO (2022)",Agri-Environmental Indicators: Land use indicators,"The Agri-environmental Indicators—Land Use domain provides information on the distribution of agricultural and forest land, and their sub-components, including irrigated areas and areas under organic agriculture, at national, regional and global levels.Per capita values are included in this update.",Agri-Environmental Indicators: Land use indicators +faostat_emn,"Land, Inputs and Sustainability: Livestock Manure - FAO (2022)",Agri-Environmental Indicators: Livestock Manure,"The Livestock Manure domain of FAOSTAT contains estimates of nitrogen (N) inputs to agricultural soils from livestock manure. Data on the N losses to air and water are also disseminated. These estimates are compiled using official FAOSTAT statistics of animal stocks and by applying the internationally approved Guidelines of the Intergovernmental Panel on Climate Change (IPCC). Data are available by country, with global coverage and relative to the period 1961–2020, with annual updates. 
The following elements are disseminated: 1) Stocks; 2) Amount excreted in manure (N content); 3) Manure left on pasture (N content); 4) Manure left on pasture that volatilises (N content); 5) Manure left on pasture that leaches (N content); 6) Manure treated (N content); 7) Losses from manure treated (N content); 8) Manure applied to soils (N content); 9) Manure applied to soils that volatilises (N content); 10) Manure applied to soils that leaches (N content).",Agri-Environmental Indicators: Livestock Manure +faostat_ep,"Land, Inputs and Sustainability: Pesticides indicators - FAO (2022)",Agri-Environmental Indicators: Pesticides indicators,Agri-environmental indicator on the Use of pesticides per area of cropland (which is the sum of arable land and land under permanent crops) at national level for the period 1990 to 2016.,Agri-Environmental Indicators: Pesticides indicators +faostat_esb,"Land, Inputs and Sustainability: Cropland Nutrient Budget - FAO (2022)","Land, Inputs and Sustainability: Soil nutrient budget","2022 Cropland nutrient budget analytical briefThe Cropland Nutrient Budget domain contains information on the flows of nitrogen, phosphorus, and potassium from synthetic fertilizer, manure applied to soils, atmospheric deposition, crop removal, and biological fixation over cropland and per unit area of cropland. The flows are aggregated to total inputs and total outputs, from which the overall nutrient budget and nutrient use efficiency on cropland are calculated. Statistics are disseminated in units of tonnes and in kg/ha, as appropriate. Nutrient use efficiency is expressed as a fraction (%). 
Data are available by country, with global coverage relative to the period 1961-2020, with annual updates.","Land, Inputs and Sustainability: Soil nutrient budget" +faostat_fa,Discontinued archives and data series: Food Aid Shipments (WFP) - FAO (2016),Discontinued archives and data series: Food Aid Shipments (WFP),,Discontinued archives and data series: Food Aid Shipments (WFP) +faostat_fbs,Food Balances: Food Balances (2010-) - FAO (2023),Food Balance: New Food Balances,"Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. 
Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balance: New Food Balances +faostat_fbsc,Food Balances: Food Balances (2010-) - FAO (2022),"Food Balances (old methodology before 2010, and new from 2010 onwards)","Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. 
Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.","Food Balances (old methodology before 2010, and new from 2010 onwards)" +faostat_fbsh,"Food Balances: Food Balances (-2013, old methodology and population) - FAO (2023)",Food Balance: Food Balances (old methodology and population),"Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. 
Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balance: Food Balances (old methodology and population) +faostat_fo,Forestry: Forestry Production and Trade - FAO (2023),Forestry: Forestry Production and Trade,"The database contains data on the production and trade in roundwood and in primary wood and paper products for all countries and territories in the world.The main types of primary forest products included in this database are roundwood, sawnwood, wood-based panels, pulp, and paper and paperboard. These products are detailed further and defined in the Joint Forest Sector Questionnaire (JFSQ) (http://www.fao.org/forestry/statistics/80572/en/). The database contains details of the following topics: - Roundwood removals (production) by coniferous and non-coniferous wood, - production and trade in industrial Roundwood, sawnwood, wood-based panels, wood charcoal, pulp, paper paperboard, and other products. More detailed information on wood products, including definitions, can be found at http://www.fao.org/forestry/statistics/80572/en/",Forestry: Forestry Production and Trade +faostat_fs,Food Security and Nutrition: Suite of Food Security Indicators - FAO (2022),Food Security: Suite of Food Security Indicators,"The Suite of Food Security Indicators presents the core set of food security indicators. Following the recommendation of experts gathered in the Committee on World Food Security (CFS) Round Table on hunger measurement, hosted at FAO headquarters in September 2011, an initial set of indicators aiming to capture various aspects of food insecurity is presented here. The choice of the indicators has been informed by expert judgment and the availability of data with sufficient coverage to enable comparisons across regions and over time. 
Many of these indicators are produced and published elsewhere by FAO and other international organizations. They are reported here in a single database with the aim of building a wide food security information system. More indicators will be added to this set as more data will become available. Indicators are classified along the four dimensions of food security -- availability, access, utilization and stability. For definitions of these indicators, see Definitions and standards below (under Item).",Food Security: Suite of Food Security Indicators +faostat_gn,Climate Change: Agrifood systems emissions: Emissions from Energy use in agriculture - FAO (2023),Climate Change: Energy Use,"Greenhouse gas (GHG) emissions from direct on-farm energy use consist of carbon dioxide, methane, and nitrous oxide gases related with fuel combustion and electricity generation in agriculture (including fisheries). The FAOSTAT emissions database has a global scope for the period 1970 to 2021 (with annual updates), by motor gasoline, gas-diesel oils, gasoline, natural gas, liquefied petroleum gas, residual fuel oil, coal, electricity, heat, gas-diesel oils in fisheries, residual fuel oil in fisheries, and by aggregates (total energy, energy consumed in fishery and total energy without electricity heat). Activity data(Energy use) is also provided.",Climate Change: Energy Use +faostat_ic,Investment: Credit to Agriculture - FAO (2022),Investment: Credit to Agriculture,"The Credit to Agriculture dataset provides national data for over 130 countries on the amount of loans provided by the private/commercial banking sector to producers in agriculture, forestry and fishing, including household producers, cooperatives, and agro-businesses. For some countries, the three subsectors of agriculture, forestry, and fishing are completely specified. In other cases, complete disaggregations are not available. 
The dataset also provides statistics on the total credit to all industries, indicators on the share of credit to agricultural producers, and an agriculture orientation index (AOI) for credit that normalizes the share of credit to agriculture over total credit by dividing it by the share of agriculture in gross domestic product (GDP). As such, it can provide a more accurate indication of the relative importance that banking sectors place on financing the sector. An AOI lower than 1 indicates that the agriculture sector receives a credit share lower than its contribution to the economy, while an AOI greater than 1 indicates a credit share to the agriculture sector greater than its economic contribution.",Investment: Credit to Agriculture +faostat_lc,"Land, Inputs and Sustainability: Land Cover - FAO (2022)",Agri-Environmental Indicators: Land Cover,"The FAOSTAT domain Land Cover under the Agri-Environmental Indicators section contains land cover information organized by the land cover classes of the international standard system for Environmental and Economic Accounting Central Framework (SEEA CF). The land cover information is compiled from publicly available Global Land Cover (GLC) maps: a) MODIS land cover types based on the Land Cover Classification System, LCCS (2001–2018) and b) the European Spatial Agency (ESA) Climate Change Initiative (CCI) annual land cover maps (1992–2018) produced by the Université catholique de Louvain (UCL)-Geomatics and now under the European Copernicus Program.",Agri-Environmental Indicators: Land Cover +faostat_qcl,Production: Crops and livestock products - FAO (2023),Production: Crops and livestock products,"Crop and livestock statistics are recorded for 278 products, covering the following categories: 1) CROPS PRIMARY: Cereals, Citrus Fruit, Fibre Crops, Fruit, Oil Crops, Oil Crops and Cakes in Oil Equivalent, Pulses, Roots and Tubers, Sugar Crops, Treenuts and Vegetables. 
Data are expressed in terms of area harvested, production quantity and yield. Cereals: Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded. 2) CROPS PROCESSED: Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine. 3) LIVE ANIMALS: Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys. 4) LIVESTOCK PRIMARY: Beeswax; Eggs (various types); Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); Offals, nes; Silk-worm cocoons, reelable; Skins (goat, sheep); Snails, not sea; Wool, greasy. 
5) LIVESTOCK PROCESSED: Butter (of milk from sheep, goat, buffalo, cow); Cheese (of milk from goat, buffalo, sheep, cow milk); Cheese of skimmed cow milk; Cream fresh; Ghee (cow and buffalo milk); Lard; Milk (dry buttermilk, skimmed condensed, skimmed cow, skimmed dried, skimmed evaporated, whole condensed, whole dried, whole evaporated); Silk raw; Tallow; Whey (condensed and dry); Yoghurt",Production: Crops and livestock products +faostat_qi,Production: Production Indices - FAO (2023),Production: Production Indices,"Crop and livestock statistics are recorded for 278 products, covering the following categories:1) CROPS PRIMARY:Cereals, Citrus Fruit, Fibre Crops, Fruit, Oil Crops, Oil Crops and Cakes in Oil Equivalent, Pulses, Roots and Tubers, Sugar Crops, Treenuts and Vegetables. Data are expressed in terms of area harvested, production quantity and yield. Cereals: Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded.2) CROPS PROCESSED:Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine.3) LIVE ANIMALS:Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys.4) LIVESTOCK PRIMARY:Beeswax; Eggs (various types); Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); 
Offals, nes; Silk-worm cocoons, reelable; Skins (goat, sheep); Snails, not sea; Wool, greasy.5) LIVESTOCK PROCESSED:Butter (of milk from sheep, goat, buffalo, cow); Cheese (of milk from goat, buffalo, sheep, cow milk); Cheese of skimmed cow milk; Cream fresh; Ghee (cow and buffalo milk); Lard; Milk (dry buttermilk, skimmed condensed, skimmed cow, skimmed dried, skimmed evaporated, whole condensed, whole dried, whole evaporated); Silk raw; Tallow; Whey (condensed and dry); Yoghurt","Production: Production Indices + +This dataset includes gross and net production indices for various food and agriculture aggregates expressed in both totals and per capita." +faostat_qv,Production: Value of Agricultural Production - FAO (2023),Production: Value of Agricultural Production,"Crop and livestock statistics are recorded for 278 products, covering the following categories:1) CROPS PRIMARY:Cereals, Citrus Fruit, Fibre Crops, Fruit, Oil Crops, Oil Crops and Cakes in Oil Equivalent, Pulses, Roots and Tubers, Sugar Crops, Treenuts and Vegetables. Data are expressed in terms of area harvested, production quantity and yield. Cereals: Area and production data on cereals relate to crops harvested for dry grain only. 
Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded.2) CROPS PROCESSED:Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine.3) LIVE ANIMALS:Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys.4) LIVESTOCK PRIMARY:Beeswax; Eggs (various types); Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); Offals, nes; Silk-worm cocoons, reelable; Skins (goat, sheep); Snails, not sea; Wool, greasy.5) LIVESTOCK PROCESSED:Butter (of milk from sheep, goat, buffalo, cow); Cheese (of milk from goat, buffalo, sheep, cow milk); Cheese of skimmed cow milk; Cream fresh; Ghee (cow and buffalo milk); Lard; Milk (dry buttermilk, skimmed condensed, skimmed cow, skimmed dried, skimmed evaporated, whole condensed, whole dried, whole evaporated); Silk raw; Tallow; Whey (condensed and dry); Yoghurt","Production: Value of Agricultural Production + +This dataset includes gross and net production values, in constant international US$, and gross production values, in constant and current US$ and Local Currency Units, for various food and agriculture commodities and aggregates thereof, expressed in both total value and value per capita." 
+faostat_rfb,"Land, Inputs and Sustainability: Fertilizers by Product - FAO (2022)",Inputs: Fertilizers by Product,"The Fertilizers by Product dataset contains information on the Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers products, over the time series 2002-present. The fertilizer statistics data are for a set of 23 product categories. Both straight and compound fertilizers are included. There is information available about methodology at: http://fenixservices.fao.org/faostat/static/documents/RFB/RFB_EN_README.pdf.",Inputs: Fertilizers by Product +faostat_rfn,"Land, Inputs and Sustainability: Fertilizers by Nutrient - FAO (2022)",Inputs: Fertilizers by Nutrient,"The Fertilizers by Nutrient dataset contains information on the totals in nutrients for Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers, over the time series 1961-present. The data are provided for the three primary plant nutrients: nitrogen (N), phosphorus (expressed as P2O5) and potassium (expressed as K2O). Both straight and compound fertilizers are included. There is information on the methodology available at: http://fenixservices.fao.org/faostat/static/documents/RFN/RFN_EN_README.pdf",Inputs: Fertilizers by Nutrient +faostat_rl,"Land, Inputs and Sustainability: Land Use - FAO (2022)",Inputs: Land Use,"The FAOSTAT Land Use domain contains data on forty-four categories of land use, irrigation and agricultural practices, relevant to monitor agriculture, forestry and fisheries activities at national, regional and global level. Data are available by country and year, with global coverage and annual updates.",Inputs: Land Use +faostat_rp,"Land, Inputs and Sustainability: Pesticides Use - FAO (2022)",Inputs: Pesticides Use,"The Pesticides Use database includes data on the use of major pesticide groups (Insecticides, Herbicides, Fungicides, Plant growth regulators and Rodenticides) and of relevant chemical families. 
Data report the quantities (in tonnes of active ingredients)",Inputs: Pesticides Use +faostat_rt,"Land, Inputs and Sustainability: Pesticides Trade - FAO (2023)",Inputs: Pesticides Trade,"This domain contains data on pesticides and covers two different categories: pesticides traded in form or packagingfor retail sale or as preparations or articles, and pesticides traded as separate chemically defined compounds (if relevant for the Rotterdam Convention on the Prior Informed Consent Procedure for Certain Hazardous Chemicals and Pesticides in International Trade). The pesticides traded for retail sale or as preparations or articles are those classified under code 38.08 in the Harmonized System Nomenclature (HS) and include: hazardous pesticides, insecticides, fungicides, herbicides, disinfectants and other. For these pesticides, this domain contains trade data (imports and exports) in values only (current 1000 US dollars), and the time series extends from 1961 onwards. The pesticides traded as separate chemically defined compounds are those listed in Annex III of the Rotterdam Convention (excluding industrial chemicals) and therefore subject to the Prior Informed Consent (PIC) procedure. The correspondence with the HS Nomenclature is shown in the table at the Related Documents section. For these pesticides, this domain contains trade data (imports and exports) in both value (current 1000 US dollars) and quantity (net weight in tonnes), and the time series extends from 2007 onwards.",Inputs: Pesticides Trade +faostat_scl,Food Balances: Supply Utilization Accounts (2010-) - FAO (2023),Food Balances: Supply Utilization Accounts,"Supply Utilization Accounts and Food Balance Sheet present a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. 
each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balances: Supply Utilization Accounts +faostat_sdgb,SDG Indicators: SDG Indicators - FAO (2023),SDG Indicators: SDG Indicators,"As the custodian agency of 21 SDG indicators, the Food and Agriculture Organization of the United Nations (FAO) is responsible for curating and refining the methodologies of these indicators, collecting data from national sources, ensuring their quality and compatibility with applicable standards and classifications, and disseminating data at global level. This FAOSTAT domain complements the global SDG database administered by the United Nations Statistical Division, as well as FAO’s SDG indicators portal, by providing access to the available data for each of these indicators. 
Please click the metadata link on the right hand navigation column for an abridged version of the methodology for compiling each of these indicators, a description of data sources and the relevant contact persons responsible for each indicator in the Organization. For a more detailed description of the methodology, data sources and reporting procedures, please follow the link to the official SDG indicator metadata document available at the bottom of each summary metadata page in the document on the right. ",SDG Indicators: SDG Indicators +faostat_tcl,Trade: Crops and livestock products - FAO (2023),Trade: Crops and livestock products,"The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world.",Trade: Crops and livestock products +faostat_ti,Trade: Trade Indices - FAO (2023),Trade: Trade Indices,"The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. 
The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world.",Trade: Trade Indices +faostat_wcad,World Census of Agriculture: Structural data from agricultural censuses - FAO (2023),World Census of Agriculture: Structural data from agricultural censuses,"Data from censuses of agriculture are collected at holding level and provide information about the structure of agriculture of a country or a territory (e.g. size and number of holdings, land tenure, legal status, and holder gender). An agricultural holding is an economic unit of agricultural production under single management comprising all livestock kept and all land used wholly or partly for agricultural production purposes. Member countries provided census data to FAO under the World Programme for the Census of Agriculture (WCA). National censuses are conducted at least once every ten years in an internationally comparable way. The ‘’Structural data from agricultural censuses’’ domain in FAOSTAT provides structural data from the last four WCA rounds (WCA 2020, 2010, 2000 and 1990) for each participating country and territory, to the extent possible. For earlier rounds (WCA 1930, 1950, 1960, 1970 and 1980) data are provided only on the number and area of holdings. 
The data are prepared based on the national census reports, later disseminated by FAO through the publications SDS 17, SDS 12 and SDS 9 and 9a, and recent methodological review of the available country census data of WCA 2020 round.",World Census of Agriculture: Structural data from agricultural censuses diff --git a/etl/steps/data/garden/faostat/2024-03-14/custom_elements_and_units.csv b/etl/steps/data/garden/faostat/2024-03-14/custom_elements_and_units.csv new file mode 100644 index 00000000000..de8f21f9ce0 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/custom_elements_and_units.csv @@ -0,0 +1,35 @@ +dataset,element_code,fao_element,owid_element,fao_unit,fao_unit_short_name,owid_unit,owid_unit_short_name,owid_unit_factor,fao_element_description,owid_element_description,owid_aggregation,was_per_capita,make_per_capita +faostat_rl,005110,Area,Area,thousand Hectares,1000 ha,hectares,ha,1000,Extent of surface of land or water. Source: FAO Statistics Division,,sum,0,1 +faostat_qcl,005312,Area harvested,Area harvested,Hectares,ha,hectares,ha,,"Data refer to the area from which a crop is gathered. Area harvested, therefore, excludes the area from which, although sown or planted, there was no harvest due to damage, failure, etc. It is usually net for temporary crops and some times gross for permanent crops. Net area differs from gross area insofar as the latter includes uncultivated patches, footpaths, ditches, headlands, shoulders, shelterbelts, etc.If the crop under consideration is harvested more than once during the year as a consequence of successive cropping (i.e. the same crop is sown or planted more than once in the same field during the year), the area is counted as many times as harvested. On the contrary, area harvested will be recorded only once in the case of successive gathering of the crop during the year from the same standing crops. With regard to mixed and associated crops, the area sown relating to each crop should be reported separately. 
When the mixture refers to particular crops, generally grains, it is recommended to treat the mixture as if it were a single crop; therefore, area sown is recorded only for the crop reported. Source: FAO Statistics Division",,sum,0,1 +faostat_fbsc,005301,Domestic supply quantity,Domestic supply,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Production + imports - exports + changes in stocks (decrease or increase) = supply for domestic utilization in the new methodology. There are various ways of defining supply and, in fact, various concepts are in use. The elements involved are production, imports, exports and changes in stocks (increase or decrease). There is no doubt that production, imports and stock changes (either decrease or increase in stocks) are genuine supply elements. Source: FAO Statistics Division",,sum,0,1 +faostat_fbsc,005911,Export Quantity,Exports,1000 tonnes,1000 tonnes,tonnes,t,1000.0,,,sum,0,1 +faostat_fbsc,000684,Fat supply quantity (g/capita/day),Food available for consumption,g/capita/day,g/capita/day,grams of fat per day,g/day,,,,sum,1,1 +faostat_fbsc,005521,Feed,Feed,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Data refer to the quantity of the commodity in question available for feeding to the livestock and poultry during the reference period, whether domestically produced or imported. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. Rome.",,sum,0,1 +faostat_fbsc,005142,Food,Food,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Data refer to the total amount of the commodity available as human food during the reference period. Data include the commodity in question, as well as any commodity derived therefrom as a result of further processing. Food from maize, for example, comprises the amount of maize, maize meal and any other products derived therefrom available for human consumption. 
Food from milk relates to the amounts of milk as such, as well as the fresh milk equivalent of dairy products. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. Rome.",,sum,0,1 +faostat_fbsc,000664,Food supply (kcal/capita/day),Food available for consumption,kcal/capita/day,kcal/capita/day,kilocalories per day,kcal/day,,,,sum,1,1 +faostat_fbsc,000645,Food supply quantity (kg/capita/yr),Food available for consumption,Kilograms,kg,kilograms per year,kg,,,,sum,1,1 +faostat_fbsc,005611,Import Quantity,Imports,1000 tonnes,1000 tonnes,tonnes,t,1000.0,,,sum,0,1 +faostat_qcl,005313,Laying,,1000 Head,1000 Head,animals,animals,1000.0,,,sum,0,0 +faostat_fbsc,005123,Losses,Waste in Supply Chain,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Amount of the commodity in question lost through wastage (waste) during the year at all stages between the level at which production is recorded and the household, i.e. storage and transportation. Losses occurring before and during harvest are excluded. Waste from both edible and inedible parts of the commodity occurring in the household is also excluded. Quantities lost during the transformation of primary commodities into processed products are taken into account in the assessment of respective extraction/conversion rates. Distribution wastes tend to be considerable in countries with hot humid climate, difficult transportation and inadequate storage or processing facilities. This applies to the more perishable foodstuffs, and especially to those which have to be transported or stored for a long time in a tropical climate. Waste is often estimated as a fixed percentage of availability, the latter being defined as production plus imports plus stock withdrawals. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. 
Rome.",,sum,0,1 +faostat_qcl,005318,Milk Animals,,Head,Head,,,,,,sum,0,0 +faostat_fbsc,005154,Other uses (non-food),Other uses,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Data refer to quantities of commodities used for non-food purposes, e.g. oil for soap. In order not to distort the picture of the national food pattern quantities of the commodity in question consumed mainly by tourists are included here (see also ""Per capita supply""). In addition, this variable covers pet food. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. Rome.",,sum,0,1 +faostat_fbsc,005131,Processing,,1000 tonnes,1000 tonnes,tonnes,t,1000.0,,,sum,0,0 +faostat_qcl,005314,Prod Popultn,,Number,No,,,,,,sum,0,0 +faostat_qcl,005320,Producing Animals/Slaughtered,Producing or slaughtered animals,Head,Head,animals,animals,,,,sum,0,1 +faostat_qcl,005321,Producing Animals/Slaughtered,Producing or slaughtered animals,1000 Head,1000 Head,animals,animals,1000.0,,,sum,0,1 +faostat_fbsc,005511,Production,,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Figures relate to the total domestic production whether inside or outside the agricultural sector, i.e. it includes non-commercial production and production from kitchen gardens. Unless otherwise indicated, production is reported at the farm level for crop and livestock products (i.e. in the case of crops, excluding harvesting losses) and in terms of live weight for fish items (i.e. the actual ex-water weight at the time of the catch). All data shown relate to total meat production from both commercial and farm slaughter. Data are expressed in terms of dressed carcass weight, excluding offal and slaughter fats. Production of beef and buffalo meat includes veal; mutton and goat meat includes meat from lambs and kids; pig meat includes bacon and ham in fresh equivalent. Poultry meat includes meat from all domestic birds and refers, wherever possible, to ready-to-cook weight. 
Source: FAO Statistics Division",,sum,0,0 +faostat_qcl,005513,Production,,thousand Number,1000 No,number,number,1000.0,"Figures relate to the total domestic production whether inside or outside the agricultural sector, i.e. it includes non-commercial production and production from kitchen gardens. Unless otherwise indicated, production is reported at the farm level for crop and livestock products (i.e. in the case of crops, excluding harvesting losses) and in terms of live weight for fish items (i.e. the actual ex-water weight at the time of the catch). All data shown relate to total meat production from both commercial and farm slaughter. Data are expressed in terms of dressed carcass weight, excluding offal and slaughter fats. Production of beef and buffalo meat includes veal; mutton and goat meat includes meat from lambs and kids; pig meat includes bacon and ham in fresh equivalent. Poultry meat includes meat from all domestic birds and refers, wherever possible, to ready-to-cook weight. Source: FAO Statistics Division",,sum,0,0 +faostat_qcl,005510,Production,Production,tonnes,tonnes,tonnes,t,,Amount produced in the year,,sum,0,1 +faostat_fbsc,000674,Protein supply quantity (g/capita/day),Food available for consumption,g/capita/day,g/capita/day,grams of protein per day,g/day,,,,sum,1,1 +faostat_fbsc,005170,Residuals,,1000 tonnes,1000 tonnes,tonnes,t,1000.0,It is defined as the imbalance (positive or negative) in the supply and utilization equation. It occures mainly due to the inconsitencies of national data provided by countries.,,sum,0,0 +faostat_fbsc,005527,Seed,,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Data include the amounts of the commodity in question set aside for sowing or planting (or generally for reproduction purposes, e.g. sugar cane planted, potatoes for seed, eggs for hatching and fish for bait, whether domestically produced or imported) during the reference period. Account is taken of double or successive sowing or planting whenever it occurs. 
The data of seed include also, when it is the case, the quantities necessary for sowing or planting the area relating to crops harvested green for fodder or for food.(e.g. green peas, green beans, maize for forage)  Data for seed element are stored in tonnes (t). Whenever official data were not available, seed figures have been estimated either as a percentage of supply (e.g. eggs for hatching) or by multiplying a seed rate with the area under the crop of the subsequent year. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. Rome.",,sum,0,0 +faostat_fbsc,005072,Stock Variation,,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Comprises changes in stocks occurring during the reference period at all levels between the production and the retail levels, i.e. it comprises changes in government stocks, in stocks with manufacturers, importers, exporters, other wholesale and retail merchants, transport and storage enterprises and in stocks on farms. In actual fact, however, the information available often relates only to stocks held by governments and even these are not available for a number of countries and important commodities. In the absence of information on opening and closing stocks changes in stocks are also used for shifting production from the calendar year in which it is harvested to the year in which it is consumed. Net increases in stocks (add to stock) are generally indicated by the sign ""-"". No sign denotes net decreases (from stock). Source: FAO Statistics Division",,sum,0,0 +faostat_qcl,005112,Stocks,,1000 Head,1000 Head,animals,animals,1000.0,"This variable indicates the number of animals of the species present in the country at the time of enumeration. It includes animals raised either for draft purposes or for meat, eggs and dairy production or kept for breeding. Live animals in captivity for fur or skin such as foxes, minks etc. 
are not included in the system although furskin trade is reported. The enumeration to be chosen, when more than one survey is taken, is the closest to the beginning of the calendar year. Livestock data are reported in number of heads (units) except for poultry, rabbits and other rodents which are reported in thousand units. Source: FAO Statistics Division",,sum,0,0 +faostat_qcl,005111,Stocks,,Head,Head,animals,animals,,"This variable indicates the number of animals of the species present in the country at the time of enumeration. It includes animals raised either for draft purposes or for meat, eggs and dairy production or kept for breeding. Live animals in captivity for fur or skin such as foxes, minks etc. are not included in the system although furskin trade is reported. The enumeration to be chosen, when more than one survey is taken, is the closest to the beginning of the calendar year. Livestock data are reported in number of heads (units) except for poultry, rabbits and other rodents which are reported in thousand units. Source: FAO Statistics Division",,sum,0,0 +faostat_fbsc,005171,Tourist consumption,,1000 tonnes,1000 tonnes,tonnes,t,1000.0,,,sum,0,0 +faostat_qcl,005422,Yield,,hg,hg,,,,,,sum,0,0 +faostat_qcl,005410,Yield,Yield,100mg/An,100mg/An,kilograms per animal,kg/animal,0.0001,,,,0,0 +faostat_qcl,005420,Yield,Yield,hg/An,hg/An,kilograms per animal,kg/animal,0.1,,,,0,0 +faostat_qcl,005419,Yield,Yield,hg/ha,hg/ha,tonnes per hectare,t/ha,0.0001,"Harvested production per unit of harvested area for crop products. In most of the cases yield data are not recorded but obtained by dividing the production data by the data on area harvested. Data on yields of permanent crops are not as reliable as those for temporary crops either because most of the area information may correspond to planted area, as for grapes, or because of the scarcity and unreliability of the area figures reported by the countries, as for example for cocoa and coffee. 
Source: FAO Statistics Division",,,0,0 +faostat_qcl,005417,Yield/Carcass Weight,Yield,hg/An,hg/An,kilograms per animal,kg/animal,0.1,,,,0,0 +faostat_qcl,005424,Yield/Carcass Weight,Yield,0.1g/An,0.1g/An,kilograms per animal,kg/animal,0.0001,,,,0,0 diff --git a/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv b/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv new file mode 100644 index 00000000000..1064ca37308 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv @@ -0,0 +1,255 @@ +dataset,item_code,fao_item,owid_item,fao_item_description,owid_item_description +faostat_qcl,00000221,"Almonds, in shell",Almonds,"Almonds, in shell This subclass is defined through the following headings/subheadings of the HS 2007: 0802.11.", +faostat_fbsc,00002946,Animal fats,Animal fats group,, +faostat_qcl,00000711,"Anise, badian, coriander, cumin, caraway, fennel and juniper berries, raw",Herbs (e.g. fennel),"Anise, badian, coriander, cumin, caraway, fennel and juniper berries, raw This subclass includes: - aniseed, Pimpinella anisum, raw - star anise (badian) or Chinese star anise, Illicium verum, raw - fennel, Foeniculum vulgare, raw (when used as spice) - coriander (cilantro), Coriandrum sativum, raw - cumin, Cuminum cyminum, raw - caraway seeds, Carum carvi, raw - juniper berries, Juniperus communis, raw This subclass does not include: - fennel (when used as a vegetable), cf. 01290 - processed anise, fennel, coriander, cumin, caraway and juniper berries, cf. 
23924", +faostat_qcl,00000515,Apples,,Apples This subclass is defined through the following headings/subheadings of the HS 2007: 0808.10., +faostat_fbsc,00002617,Apples and products,Apples,"Default composition: 515 Apples, 518 Juice, apple, single strength, 519 Juice, apple, concentrated", +faostat_qcl,00000526,Apricots,,Apricots This subclass is defined through the following headings/subheadings of the HS 2007: 0809.10., +faostat_qcl,00000226,Areca nuts,,"Areca nuts, species of Areca catechu, also known as betel nut, in shell, produced mainly in the Far East. Areca nuts are used mainly as masticatory. These nuts contain alkaloids (arecoline and arecaidine). (Unofficial definition)", +faostat_qcl,00000366,Artichokes,,"Artichokes This subclass includes: - artichokes, Cynara scolymus This subclass does not include: - Jerusalem artichokes, cf. 01599", +faostat_qcl,00000367,Asparagus,,Asparagus This subclass is defined through the following headings/subheadings of the HS 2007: 0709.20., +faostat_qcl,00000572,Avocados,,"Avocados This subclass includes: - avocados, Persea americana", +faostat_qcl,00000486,Bananas,,"Bananas This subclass includes: - sweet/dessert bananas, Musa sapientum, M. cavendishii, M. nana, i.e. bananas that can be eaten without further preparation This subclass does not include: - plantains, cooking bananas, Musa paradisiaca, cf. 01313", +faostat_fbsc,00002615,Bananas,,Default composition: 486 Bananas, +faostat_qcl,00000044,Barley,,Barley, +faostat_fbsc,00002513,Barley and products,Barley,"Default composition: 44 Barley, 45 Barley, pot, 46 Barley, pearled, 47 Bran, barley, 48 Flour, barley and grits, 49 Malt, 50 Malt extract", +faostat_fbsc,00002546,Beans,"Beans, dry","Default composition: 176 Beans, dry", +faostat_qcl,00000176,"Beans, dry",,"Beans, dry This subclass includes: - beans, species of Phaseolus (vulgaris, lunatus, angularis, aureus, etc.) - beans, species of Vigna (angularis, mungo, radiata, unguiculata, etc.) 
This subclass does not include: - soya beans, cf. 0141 - green beans, cf. 01241 - lentils, green, cf. 01249 - bean shoots and sprouts, cf. 01290 - locust beans (carobs), cf. 01356 - castor beans, cf. 01449 - broad beans and horse beans, dry, cf. 01702 - garbanzo beans (chickpeas), dry, cf. 01703 - lentils, dry, cf. 01704", +faostat_qcl,00001806,"Beef and Buffalo Meat, primary","Meat, beef and buffalo","The term ""LIVESTOCK"" is used in a broad sense to cover all grown animals regardless of age, location or purpose of breeding. Non-domesticated animals are excluded under this definition unless they are kept or raised in captivity. Domestic animals included are large and small quadrupeds, poultry, insects (bees) and larvae of insects (silkworms). Figures on livestock numbers should refer to live animals enumerated on a given day or on several consecutive days. The FAO practice is that figures for an indicated year relate to animals reported by countries for any day between October of the previous year and September of the year indicated. Statistics on live animals by age, sex and utilization generally are not included in the list that follows, even though such breakdowns are extremely desirable in terms of national statistics. For each animal species FAO proposes that information be maintained on changes in national herds during the year according to the following equation: initial herd + animals born + imports of live animals - exports of live animals - natural losses - slaughter = closing herd.FAO defines meat as the flesh of animals used for food. In production data, meat is normally reported inclusive of bone and exclusive of meat that is unfit for human consumption. 
As reported by individual countries, meat production data may refer either to commercial production (meat entering marketing channels), inspected production (from animals slaughtered under sanitary inspection), or total production (the total of the above- mentioned categories plus slaughter for personal consumption). All FAO annual production data refer to total production.Country statistics on meat production adhere to one or more of the following concepts: 1. Live weight: the weight of the animal immediately before slaughter. 2. Killed weight: the live weight less the uncollected blood lost during slaughter. 3. Dressed carcass weight: weight minus all parts - edible and inedible - that are removed in dressing the carcass. The concept varies widely from country to country and according to the various species of livestock. Edible parts generally include edible offals (head or head meat, tongue, brains, heart, liver, spleen, stomach or tripes and, in a few countries, other parts such as feet, throat and lungs. Slaughter fats (the unrendered fats that fall in the course of dressing the carcasses) are recorded as either edible or inedible according to country practice. Inedible parts generally include hides and skins (except in the case of pigs), as well as hoofs and stomach contents.Among individual countries, one of the following three concepts issued to measure production:A. Production from all animals, of both indigenous and foreign origin, that are slaughtered within national boundaries. B. Production from the slaughter of indigenous animals plus exports of live indigenous animals during the reference period. Derived from meat production as follows: production from slaughtered animals plus the meat equivalent of all animals exported alive, minus the meat equivalent of all animals imported alive. As imports/exports of live animals are recorded by FAO in numbers, not weight, animal type and size are of significance. C. 
The biological production concept covers indigenous animals that are either slaughtered or exported live, plus net additions to the stock during the reference period. Derived from indigenous productions follows: indigenous production plus (or minus) the meat equivalent of the change in the stock numbers during the reference period. Production is expressed in terms of live weight. Changes in the total live weight of all animals are not taken into account.FAO uses the first concept of meat production in the construction of its food balance sheets and for related indicators. The second concept, indigenous meat production, in measuring the output of the national livestock sector, is useful mainly in the construction of index numbers of agricultural production. The third concept, biological production, would be the most complete as it also reflects changes in the livestock herd, but it is not used because of difficulties in obtaining information from national reporting offices. The prices applied to indigenous meat production are derived from prices of live animals. This covers not only the value of meat, but also the value of offals, fats, hides and skins.Beef and Veal, Dried, Salted, Smok Meat of bovine animals, whether salted, in brine, dried or smoked. Includes edible flours and meals.Beef and Veal Preparations nes Meat and offal (o/t liver) that are boiled, steamed, grilled, fried, roasted or otherwise cooked. Includes prepared meals that contain more than 20% of meat and offal by weight.BUFFALO MEAT Fresh, chilled or frozen, with bone in or boneless.OFFALS OF BUFFALO, EDIBLE Fresh, chilled or frozen.", +faostat_qcl,00001183,Beeswax,,"The substance bees use to build the hexagonal cells of the combs of beehives. Includes other insect waxes as well, i.e. lac and Chinese wax. 
(Unofficial definition)", +faostat_qcl,00000552,Blueberries,,"This subclass includes blueberries, species of Vaccinium myrtillus (European blueberry, wild bilberry, whortleberry) and Vaccinium corymbosum (American blueberry) (Unofficial definition)", +faostat_fbsc,00002731,Bovine Meat,"Meat, beef","Default composition: 867 Meat, cattle, 870 Meat, cattle, boneless (beef & veal), 872 Meat, beef, dried, salted, smoked, 873 Meat, extracts, 874 Meat, beef and veal sausages, 875 Meat, beef, preparations, 876 Meat, beef, canned, 877 Meat, homogenized preparations, 947 Meat, buffalo", +faostat_qcl,00000216,"Brazil nuts, in shell","Brazil nuts, with shell","Brazil nuts, in shell This subclass is defined through the following headings/subheadings of the HS 2007: 0801.21.", +faostat_qcl,00000181,"Broad beans and horse beans, dry",Broad beans,"Broad beans and horse beans, dry This subclass is defined through the following headings/subheadings of the HS 2007: 0713.50.", +faostat_qcl,00000089,Buckwheat,,Buckwheat This subclass is defined through the following headings/subheadings of the HS 2007: 1008.10., +faostat_qcl,00000949,"Buffalo fat, unrendered","Fat, buffaloes","Buffalo fat, unrendered This subclass includes: - fat from buffalos, animals of subclass 02112, unrendered This subclass does not include: - tallow, fat from buffalos, rendered, cf. 21523 - marrow, bone or foot oil and their fractions, from buffalo, cf. 
21529", +faostat_fbsc,00002740,"Butter, Ghee",Butter and ghee,"Default composition: 886 Butter, cow milk, 887 Ghee, butteroil of cow milk, 952 Butter, buffalo milk, 953 Ghee, of buffalo milk, 983 Butter and ghee, sheep milk, 1022 Butter of goat milk", +faostat_qcl,00000358,Cabbages,Cabbages,"Cabbages This subclass is defined through the following headings/subheadings of the HS 2007: 0704.20, .90.", +faostat_qcl,00000101,Canary seed,,Canary seed This subclass is defined through the following headings/subheadings of the HS 2007: 1008.30., +faostat_qcl,00000568,Cantaloupes and other melons,Melon,Cantaloupes and other melons This subclass is defined through the following headings/subheadings of the HS 2007: 0807.19., +faostat_qcl,00000426,Carrots and turnips,,Carrots and turnips This subclass is defined through the following headings/subheadings of the HS 2007: 0706.10., +faostat_qcl,00000217,"Cashew nuts, in shell",Cashew nuts,"Cashew nuts, in shell This subclass is defined through the following headings/subheadings of the HS 2007: 0801.31.", +faostat_fbsc,00002532,Cassava and products,Cassava,"Default composition: 125 Cassava, 126 Flour, cassava, 127 Tapioca, cassava, 128 Cassava dried, 129 Starch, cassava", +faostat_qcl,00000125,"Cassava, fresh",Cassava,"Cassava, species of Manihot esculenta; Manihot utilissima (manioc, mandioca, yuca) and Manihot palmata; Manihot dulcis (yuca dulce), fresh, chilled, frozen, is a semi-permanent highly perishable tuberous crop grown in tropical and subtropical regions. Sometimes bitter and sweet cassavas are referred to as separate species, the former being M. esculenta and the latter M. palmata, but this is incorrect since the toxicity varies according to location. Cassava is the staple food in many tropical countries. It is not traded internationally in its fresh state because tubers deteriorate very rapidly. 
(Unofficial definition)", +faostat_qcl,00000265,Castor oil seeds,Castor oil seed,Castor oil seeds This subclass is defined through the following headings/subheadings of the HS 2007: 1207.30., +faostat_qcl,00000869,"Cattle fat, unrendered","Fat, cattle","Cattle fat, unrendered This subclass includes: - fat from cattle, animals of subclass 02111, unrendered This subclass does not include: - tallow, fat from cattle, rendered, cf. 21523 - greaves fat, cf. 21523 - marrow, bone or foot oil and their fractions, from cattle, cf. 21529", +faostat_qcl,00000393,Cauliflowers and broccoli,,Cauliflowers and broccoli This subclass is defined through the following headings/subheadings of the HS 2007: 0704.10., +faostat_qcl,00001717,"Cereals, primary",Cereals,"Cereals are generally of the gramineous family and, in the FAO concept, refer to crops harvested for dry grain only. Crops harvested green for forage, silage or grazingare classified as fodder crops. Also excluded are industrial crops, e.g. broom sorghum (Crude organic materials nes) and sweet sorghum when grown for syrup (Sugar crops nes). For international trade classifications, fresh cereals (other than sweet corn), whether or not suitable for use as fresh vegetables, are classified as cereals. Cereals are identified according to their genus. However, when two or more genera are sown and harvested as a mixture they should be classified and reported as ""mixed grains"". Production data are reported in terms of clean, dry weight of grains (12-14 percent moisture) in the form usually marketed. Rice, however, is reported in terms of paddy. Apart from moisture content and inedible substances such as cellulose, cereal grains contain, along with traces of minerals and vitamins, carbohydrates - mainly starches - (comprising 65-75 percent of their total weight), as well as proteins (6-12 percent) and fat (1-5 percent). The FAO definitions cover 17 primary cereals, of which one - white maize - is a component of maize. 
Each definition is listed along with its code, botanical name or names, and a short description. Cereal products derive either from the processing of grain through one or more mechanical or chemical operations, or from the processing of flour, meal or starch. Each cereal product is listed after the cereal from which it is derived.", +faostat_qcl,00001745,Cheese (All Kinds),Cheese,, +faostat_qcl,00000531,Cherries,,"This subclass includes cherries, species of Prunus avium (whiteheart cherries); Cerasus avium (mazzard, sweet cherry, morello cherries), var. duracina (hard-fleshed cherry) and var. Juliana (heart cherry). (Unofficial definition)", +faostat_qcl,00000220,"Chestnuts, in shell",Chestnut,"Chestnuts, in shell This subclass includes: - chestnuts, nuts of Castanea This subclass does not include: - Chinese water chestnuts, Eleocharis dulcis, cf. 01290 - water chestnuts, Trapa natans, cf. 01379 - shelled chestnuts, cf. 21429 - horse chestnuts, cf. 39120", +faostat_qcl,00000191,"Chick peas, dry",Chickpeas,"Chick peas, dry This subclass is defined through the following headings/subheadings of the HS 2007: 0713.20.", +faostat_qcl,00000689,"Chillies and peppers, dry (Capsicum spp., Pimenta spp.), raw",Chillies and peppers,"Chillies and peppers, dry (Capsicum spp., Pimenta spp.), raw This subclass includes: - raw, dried fruits of plants of the genus Capsicum capsicum, sweet pepper chilli pepper cultivars of Capsicum annuum cayenne pepper jalapeno pepper anaheim pepperpaprika, bell pepper ancho pepperpimento - raw, dried fruits of plants of the genus Pimenta allspice, Jamaica pepper Pimenta dioica This subclass does not include: - chillies and peppers, green, cf. 01231 - dry chillies and peppers, crushed, ground or otherwise processed, cf. 
23922", +faostat_qcl,00001804,"Citrus Fruit, Total",Citrus Fruit,, +faostat_fbsc,00002633,Cocoa Beans and products,Cocoa beans,"Default composition: 661 Cocoa, beans, 662 Cocoa, paste, 665 Cocoa, powder & cake, 666 Chocolate products nes", +faostat_fbsc,00002578,Coconut Oil,Coconut oil,"Default composition: 252 Oil, coconut (copra)", +faostat_qcl,00000252,Coconut oil,Coconut oil,Coconut oil, +faostat_fbsc,00002560,Coconuts - Incl Copra,Coconuts,"Default composition: 249 Coconuts, 250 Coconuts, desiccated, 251 Copra", +faostat_qcl,00000249,"Coconuts, in shell",Coconuts,"Coconuts, in shell This subclass includes: - coconuts, Cocos nucifera, in the shell, i.e. the inner shell (endocarp) or the outer shell (mesocarp or exocarp) This subclass does not include: - shelled coconuts, cf. 21429", +faostat_qcl,00000656,"Coffee, green",,"Coffee, green This subclass is defined through the following headings/subheadings of the HS 2007: 0901.11.", +faostat_qcl,00000767,"Cotton lint, ginned",Cotton,"Gossypium spp Fibres from ginning seed cotton that have not been carded or combed. Trade data also include fibres that have been cleaned, bleached, dyed or rendered absorbent. (Unofficial definition)", +faostat_qcl,00000329,Cotton seed,Cottonseed,Cottonseed, +faostat_fbsc,00002559,Cottonseed,,Default composition: 329 Cottonseed, +faostat_fbsc,00002575,Cottonseed Oil,Cottonseed oil,"Default composition: 331 Oil, cottonseed", +faostat_qcl,00000331,Cottonseed oil,Cottonseed oil,Cottonseed oil, +faostat_qcl,00000195,"Cow peas, dry",Cow peas,"Cow peas, dry This subclass is defined through the following headings/subheadings of the HS 2007: 0713.35.", +faostat_qcl,00000554,Cranberries,,"This subclass includes cranberries, species of Vaccinium macrocarpon (American cranberry) and Vaccinium oxycoccus (European cranberry). 
(Unofficial definition)", +faostat_qcl,00000397,Cucumbers and gherkins,,Cucumbers and gherkins This subclass is defined through the following headings/subheadings of the HS 2007: 0707., +faostat_qcl,00000550,Currants,,"Currants, species of Ribes nigrum (Black) and Ribes rubrum (red and white). (Unofficial definition)", +faostat_fbsc,00002619,Dates,,Default composition: 577 Dates, +faostat_qcl,00000577,Dates,,"Dates This subclass includes: - dates, fruit of the date palm, Phoenix dactylifera", +faostat_qcl,00000948,"Edible offal of buffalo, fresh, chilled or frozen","Offals, buffaloes","Edible offal of buffalo, fresh, chilled or frozen This subclass includes: - edible offal of buffalo, animals of subclass 02112, fresh, chilled or frozen This subclass does not include: - edible offal of cattle, cf. 21151", +faostat_qcl,00000868,"Edible offal of cattle, fresh, chilled or frozen","Offals, cattle","Edible offal of cattle, fresh, chilled or frozen This subclass includes: - edible offal of cattle, animals of subclass 02111, fresh, chilled or frozen This subclass does not include: - edible offal of buffalo, cf. 
21152", +faostat_qcl,00001018,"Edible offal of goat, fresh, chilled or frozen","Offals, goats","Edible offal of goat, fresh, chilled or frozen This subclass includes: - edible offal of goat, animals of subclass 02123, fresh, chilled or frozen", +faostat_qcl,00001036,"Edible offal of pigs, fresh, chilled or frozen","Offals, pigs","Edible offal of pigs, fresh, chilled or frozen This subclass includes: - edible offal of pigs, animals of subclass 02140, fresh, chilled or frozen", +faostat_qcl,00000978,"Edible offal of sheep, fresh, chilled or frozen","Offals, sheep","Edible offal of sheep, fresh, chilled or frozen This subclass includes: - edible offal of sheep, animals of subclass 02122, fresh, chilled or frozen", +faostat_qcl,00001128,"Edible offals of camels and other camelids, fresh, chilled or frozen","Offals, camels","Edible offals of camels and other camelids, fresh, chilled or frozen (Unofficial definition)", +faostat_qcl,00001098,"Edible offals of horses and other equines, fresh, chilled or frozen","Offals, horses","Edible offals of horses and other equines, fresh, chilled or frozen (Unofficial definition)", +faostat_qcl,00000399,Eggplants (aubergines),Eggplants,Eggplants (aubergines) This subclass is defined through the following headings/subheadings of the HS 2007: 0709.30., +faostat_fbsc,00002744,Eggs,All egg products,"Default composition: 916 Egg albumine, 1062 Eggs, hen, in shell, 1063 Eggs, liquid, 1064 Eggs, dried, 1091 Eggs, other bird, in shell", +faostat_fbsc,00002949,Eggs,,, +faostat_qcl,00001783,Eggs Primary,Eggs,"Eggs and Egg products. Egg production by type of poultry should refer to the total production of eggs in the shell by all types of hens in both the traditional sector (individually owned small flocks) and the modern sector (large-scale, intensive commercial poultry farms).Total productions include eggs for hatching but exclude waste of farms. 
Countries should report in terms of both numbers and weight.FAO lists seven egg and egg products items, including four primary and three processed products.", +faostat_qcl,00001091,"Eggs from other birds in shell, fresh, n.e.c.",Eggs from other birds (excl. hens),"Eggs from other birds in shell, fresh, n.e.c.", +faostat_qcl,00001129,Fat of camels,"Fat, camels",Unrendered slaughter fats (Unofficial definition), +faostat_qcl,00001037,Fat of pigs,"Fat, pigs","Unrendered slaughter fats of pigs, including edible and inedible fats that are removed in the course of dressing the carcass. (Unofficial definition)", +faostat_fbsc,00002737,"Fats, Animals, Raw",Animal fats,"Default composition: 869 Fat, cattle, 871 Fat, cattle butcher, 949 Fat, buffaloes, 979 Fat, sheep, 994 Grease incl. lanolin wool, 1019 Fat, goats, 1037 Fat, pigs, 1040 Fat, pig butcher, 1043 Lard, 1065 Fat, poultry, 1066 Fat, poultry, rendered, 1129 Fat, camels, 1160 Fat, other camelids, 1168 Oils, fats of animal nes, 1221 Lard stearine oil, 1222 Degras, 1225 Tallow, 1243 Fat, nes, prepared", +faostat_fbsc,00002960,"Fish, Seafood",Fish and seafood,, +faostat_qcl,00000773,"Flax, processed but not spun",Flax fibre,"Broken, scutched, hackled etc. but not spun. Traditionally, FAO has used this commodity to identify production in its raw state; in reality, the primary agricultural product is the commodity 01929.01 (Flax, raw or retted) which can either be used for the production of fibre or for other purposes (Unofficial definition)", +faostat_qcl,00001738,Fruit Primary,Fruit,"Fruit Crops consist of fruits and berries that, with few exceptions, are characterized by their sweet taste. Nearly all are permanent crops, mainly from trees, bushes and shrubs, as well as vines and palms. Fruits and berries grow on branches, stalks or the trunks of plants, usually singly, but sometimes grouped in bunches or clusters (e.g. bananas and grapes). 
Commercial crops are cultivated in plantations, but significant quantities of fruits are also collected from scattered plants that may or may not be cultivated. Although melons and watermelons are generally considered to be fruits, FAO groups them with vegetables because they are temporary crops. Fruit crops are highly perishable. Their shelf life may be extended through the application of chemical substances that inhibit the growth of micro-organisms and through careful control of the surrounding temperature, pressure and humidity once the fruit has been picked. Fruits and berries have a very high water content accounting for some 70- 90 percent of their weight. They contain, in various degrees, minerals, vitamins and organic acids, some of which reside in the peel or skin. Some fruits have a high fibre content and other inedible components, so that wastage is high, e.g. 60 percent for passion fruit and 35-45 percent for pineapples. The waste in temperate zone fruit is lower, generally of the order of 10-15 percent, while berries contain very little waste. The carbohydrate content of fruits varies widely. Protein content is very low, averaging less than 1 percent, or below that in vegetables. Fat content in fruit is negligible, with the notable exception of avocados. Fruit crops are consumed directly as food and are processed into dried fruit, fruit juice, canned fruit, frozen fruit, jam, alcoholic beverages, etc. Fruit crops are not normally grown for animal feed, although significant quantities of diseased and substandard fruits, as well as certain by-products of the fruit processing industry, are fed to animals. Production data for fruit crops should relate to fruits actually harvested. Data on bananas and plantains should relate to the weight of single bananas or banana hands, excluding the weight of the central stalk. 
FAO lists 36 primary fruit crops.", +faostat_fbsc,00002919,Fruits - Excluding Wine,Fruit,, +faostat_qcl,00001163,"Game meat, fresh, chilled or frozen","Meat, game","Meat and offals of wild animals, whether fresh, chilled or frozen. (Unofficial definition)", +faostat_qcl,00001019,"Goat fat, unrendered","Fat, goats","Goat fat, unrendered This subclass includes: - fat from goats, animals of subclass 02123, unrendered This subclass does not include: - tallow, fat from goats, rendered, cf. 21523 - marrow, bone or foot oil and their fractions, from goats, cf. 21529", +faostat_fbsc,00002901,Grand Total,Total,, +faostat_fbsc,00002613,Grapefruit and products,Grapefruit,"Default composition: 507 Grapefruit (inc. pomelos), 509 Juice, grapefruit, 510 Juice, grapefruit, concentrated", +faostat_qcl,00000560,Grapes,,Grapes This subclass is defined through the following headings/subheadings of the HS 2007: 0806.10., +faostat_qcl,00000446,Green corn (maize),Green maize,"Green corn (maize), species of Zea mays, particularly var. Saccharata, includes maize harvested green for food. Saccharata variety is commonly known as sweet corn, whether or not in the cob. 
(Unofficial definition)", +faostat_qcl,00000406,Green garlic,Garlic,Green garlic This subclass is defined through the following headings/subheadings of the HS 2007: 0703.20., +faostat_fbsc,00002572,Groundnut Oil,Groundnut oil,"Default composition: 244 Oil, groundnut", +faostat_qcl,00000244,Groundnut oil,Groundnut oil,Groundnut oil, +faostat_fbsc,00002552,Groundnuts,,"Default composition: 242 Groundnuts, with shell, 243 Groundnuts, shelled, 246 Groundnuts, prepared, 247 Peanut butter", +faostat_qcl,00000242,"Groundnuts, excluding shelled",Groundnuts,"Groundnuts, excluding shelled", +faostat_qcl,00000225,"Hazelnuts, in shell",Hazelnuts,"Hazelnuts, in shell This subclass is defined through the following headings/subheadings of the HS 2007: 0802.21.", +faostat_qcl,00000336,Hempseed,,"Hempseed, species of Cannabis sativa, is an annual herbaceous that is cultivated for its fibre as well as its oil. In major producing countries oil is extracted from the seeds. (Unofficial definition)", +faostat_qcl,00001062,"Hen eggs in shell, fresh",Eggs from hens,"Hen eggs in shell, fresh", +faostat_fbsc,00002745,Honey,,"Default composition: 1182 Honey, natural", +faostat_qcl,00001097,"Horse meat, fresh or chilled","Meat, horse","Horse meat, fresh or chilled (Unofficial definition)", +faostat_qcl,00000780,"Jute, raw or retted",Jute,"White jute (Corchorus capsularis); red jute, tossa (C. olitorius) Trade data cover raw or processed jute (but not spun), tow and waste, yarn waste and garnetted stock and may include jute-like fibres. (Unofficial definition)", +faostat_qcl,00000263,Karite nuts (sheanuts),Karite nuts,"Karite nuts (sheanuts), species of Butyrospermum parkii, includes only the nut contained in the fruit although the pulp around the nut is also edible. 
(Unofficial definition)", +faostat_qcl,00000592,Kiwi fruit,Kiwi,Kiwi fruit This subclass is defined through the following headings/subheadings of the HS 2007: 0810.50., +faostat_qcl,00000224,Kola nuts,,"Kola nuts, species of Cola nitida; Cola vera; Cola acuminata (kola, cola, Sudan cola nut), produced mainly in Africa. Kola nuts, containing 2.4 to 2.6% caffeine, are commonly chewed by the local population. Much used in Europe and America in the production of beverages. (Unofficial definition)", +faostat_qcl,00000407,Leeks and other alliaceous vegetables,Leeks,Leeks and other alliaceous vegetables This subclass is defined through the following headings/subheadings of the HS 2007: 0703.90., +faostat_qcl,00000497,Lemons and limes,,"Lemons and limes This subclass includes: - lemons, fruit of Citrus limon - limes, fruit of Citrus aurantifolia and Citrus latifolia This subclass does not include: - citrons, cf. 01329", +faostat_fbsc,00002612,"Lemons, Limes and products",Lemons and limes,"Default composition: 497 Lemons and limes, 498 Juice, lemon, single strength, 499 Juice, lemon, concentrated", +faostat_qcl,00000201,"Lentils, dry",Lentils,"Lentils, dry This subclass is defined through the following headings/subheadings of the HS 2007: 0713.40.", +faostat_qcl,00000372,Lettuce and chicory,Lettuce,Lettuce and chicory This subclass is defined through the following headings/subheadings of the HS 2007: 0705., +faostat_qcl,00000333,Linseed,,Linseed This subclass is defined through the following headings/subheadings of the HS 2007: 1204., +faostat_qcl,00000056,Maize (corn),Maize,Maize (corn) This class includes: - maize harvested for their dry grains only, +faostat_fbsc,00002582,Maize Germ Oil,Maize oil,"Default composition: 60 Oil, maize", +faostat_fbsc,00002514,Maize and products,Maize,"Default composition: 56 Maize, 57 Germ, maize, 58 Flour, maize, 59 Bran, maize, 63 Gluten, maize, 64 Starch, maize, 846 Feed and meal, gluten", +faostat_qcl,00000571,"Mangoes, guavas and 
mangosteens",Mangoes,"Mangoes, guavas and mangosteens This subclass includes: - mangoes, fruit of Mangifera indica - guavas, fruit of the trees of the genus Psidium, in particular Psidium cattleianum and Psidium guajava - mangosteens, Garcinia mangostana", +faostat_qcl,00001242,Margarine and shortening,Margarine,"Margarine is made principally from one or more hydrogenated vegetable or animal fats or oils in which is dispersed an aqueous portion containing milk products, salt, flavouring agents and other additives. Shortening is a product similar to margarine, but with a higher animal fat content. Shortening and compound fats are used primarily for baking and frying. The fat content of margarine and shortening varies from 70 to 90% (Unofficial definition)", +faostat_fbsc,00002943,Meat,"Meat, total",, +faostat_qcl,00001108,"Meat of asses, fresh or chilled","Meat, ass","Meat of asses, fresh or chilled (Unofficial definition)", +faostat_qcl,00000947,"Meat of buffalo, fresh or chilled","Meat, buffalo","Meat of buffalo, fresh or chilled This subclass includes: - meat of buffalo, animals of subclass 02112, fresh or chilled This subclass does not include: - meat of buffalo, frozen, cf. 21132 - edible offal of buffalo, cf. 21152", +faostat_qcl,00001127,"Meat of camels, fresh or chilled","Meat, camel","Meat of camels, fresh or chilled (Unofficial definition)", +faostat_qcl,00001058,"Meat of chickens, fresh or chilled","Meat, chicken","Meat of chickens, fresh or chilled This subclass includes: - meat of chickens, Gallus domesticus, birds of subclass 02151, fresh or chilled This subclass does not include: - meat of chickens, frozen, cf. 21141 - edible offal of chicken, cf. 21160", +faostat_qcl,00001069,"Meat of ducks, fresh or chilled","Meat, duck","Meat of ducks, fresh or chilled This subclass includes: - meat of ducks, birds of subclass 02154, fresh or chilled This subclass does not include: - meat of ducks, frozen, cf. 21142 - fatty liver of duck, cf. 
21160 - edible offal of duck, cf. 21160", +faostat_qcl,00001073,"Meat of geese, fresh or chilled","Meat, goose and guinea fowl","Meat of geese, fresh or chilled This subclass includes: - meat of geese, birds of subclass 02153, fresh or chilled This subclass does not include: - meat of geese, frozen, cf. 21143 - fatty liver of geese, cf. 21160 - edible offal of geese, cf. 21160", +faostat_qcl,00001017,"Meat of goat, fresh or chilled","Meat, goat","Meat of goat, fresh or chilled This subclass includes: - meat of goat, animals of subclass 02123, fresh or chilled This subclass does not include: - meat of goat, frozen, cf. 21136 - edible offal of goat, cf. 21156", +faostat_qcl,00001111,"Meat of mules, fresh or chilled","Meat, mule","Meat of mules, fresh or chilled (Unofficial definition)", +faostat_qcl,00001035,"Meat of pig with the bone, fresh or chilled","Meat, pig","Meat, with the bone in, of domestic or wild pigs (e.g. wild boars), whether fresh or chilled. (Unofficial definition)", +faostat_qcl,00001141,"Meat of rabbits and hares, fresh or chilled","Meat, rabbit","Meat of rabbits and hares, fresh or chilled This subclass includes: - meat of rabbits and hares, animals of subclass 0219 1, fresh or chilled This subclass does not include: - meat of rabbits and hares, frozen, cf. 21134 - edible offal of rabbits and hares, cf. 21159", +faostat_qcl,00000977,"Meat of sheep, fresh or chilled","Meat, lamb and mutton","Meat of sheep, fresh or chilled This subclass is defined through the following headings/subheadings of the HS 2007: 0204.10 - .23.", +faostat_qcl,00001080,"Meat of turkeys, fresh or chilled","Meat, turkey","Meat of turkeys, fresh or chilled This subclass includes: - meat of turkeys, birds of subclass 02152, fresh or chilled This subclass does not include: - meat of turkeys, frozen, cf. 21144 - edible offal of turkey, cf. 
21160", +faostat_qcl,00001808,"Meat, Poultry","Meat, poultry",, +faostat_qcl,00001765,"Meat, Total","Meat, total","FAO defines meat as the flesh of animals used for food. In production data, meat is normally reported inclusive of bone and exclusive of meat that is unfit for human consumption. As reported by individual countries, meat production data may refer either to commercial production (meat entering marketing channels), inspected production (from animals slaughtered under sanitary inspection), or total production (the total of the above- mentioned categories plus slaughter for personal consumption). All FAO annual production data refer to total production.Country statistics on meat production adhere to one or more of the following concepts: 1. Live weight: the weight of the animal immediately before slaughter. 2. Killed weight: the live weight less the uncollected blood lost during slaughter. 3. Dressed carcass weight: weight minus all parts - edible and inedible - that are removed in dressing the carcass. The concept varies widely from country to country and according to the various species of livestock. Edible parts generally include edible offals (head or head meat, tongue, brains, heart, liver, spleen, stomach or tripes and, in a few countries, other parts such as feet, throat and lungs. Slaughter fats (the unrendered fats that fall in the course of dressing the carcasses) are recorded as either edible or inedible according to country practice. Inedible parts generally include hides and skins (except in the case of pigs), as well as hoofs and stomach contents.Meat production data for minor animals (poultry, rabbits, etc.) are reported in one of the following three ways: ready-to-cook weight (giblets are sometimes included and sometimes excluded); eviscerated weight (including the feet and head); or dressed weight, i.e. 
the live weight less the blood, feathers and skin.FAO data relate to dressed carcass weight for livestock and, wherever possible, ready-to- cook weight for poultry.Among individual countries, one of the following three concepts issued to measure production:A. Production from all animals, of both indigenous and foreign origin, that are slaughtered within national boundaries. B. Production from the slaughter of indigenous animals plus exports of live indigenous animals during the reference period. Derived from meat production as follows: production from slaughtered animals plus the meat equivalent of all animals exported alive, minus the meat equivalent of all animals imported alive. As imports/exports of live animals are recorded by FAO in numbers, not weight, animal type and size are of significance. C. The biological production concept covers indigenous animals that are either slaughtered or exported live, plus net additions to the stock during the reference period. Derived from indigenous productions follows: indigenous production plus (or minus) the meat equivalent of the change in the stock numbers during the reference period. Production is expressed in terms of live weight. Changes in the total live weight of all animals are not taken into account.FAO uses the first concept of meat production in the construction of its food balance sheets and for related indicators. The second concept, indigenous meat production, in measuring the output of the national livestock sector, is useful mainly in the construction of index numbers of agricultural production. The third concept, biological production, would be the most complete as it also reflects changes in the livestock herd, but it is not used because of difficulties in obtaining information from national reporting offices. The prices applied to indigenous meat production are derived from prices of live animals. 
This covers not only the value of meat, but also the value of offals, fats, hides and skins.", +faostat_qcl,00000299,Melonseed,,"This subclass covers melonseed, species of Cucumis melo. It may include seeds of other Cucurbitaceae (Unofficial definition)", +faostat_fbsc,00002948,Milk - Excluding Butter,Milk,, +faostat_qcl,00001780,"Milk, Total",Milk,"Milk, eggs, honey and beeswax are included as products of live animals. Fibres of animal origin (mainly wool and silk) are included with fibres of vegetal and animal origin. Milk and dairy products. Estimates of milk production as reported by countries refer to one or more of the following three concepts. Gross production is milk production plus milk sucked by young animals. Production available for consumption is net production less milk fed to animals, milk retained by farmers for food and feed, direct sales to consumers and farm waste. The FAO concept relates to net milk production. Data should be reported by kind of milking animal (cow, sheep, goat, etc.) in terms of whole milk and by weight. In most developed countries only 5-10 percent of whole milk is used directly for human consumption. The bulk of milk production is processed before being marketed as liquid milk (e. G. standardized, pasteurized, skimmed, etc.) or is manufactured in to products such as cream, butter, cheese, evaporated and condensed milk, milk powder, casein, yogurt, ice cream, etc. About 70 percent of whole milk is processed into dairy products; the by-products of these processes (e. G. skim milk, buttermilk and whey) are used either for feed or are manufactured into other dairy products, e. G. dry skim milk and low-fat cheese. Processed milk and dairy products are often supplemented with vitamins, mineral and various additives. FAO list 50 milk and dairy product items in the list that follows, of which five are primary products. Some food products containing milk are not listed separately by FAO, e. G. 
eggnog, sherbet, malted milk, chocolate milk drink and mellorine.", +faostat_qcl,00000079,Millet,,Millet, +faostat_fbsc,00002517,Millet and products,Millet,"Default composition: 79 Millet, 80 Flour, millet, 81 Bran, millet", +faostat_fbsc,00002928,Miscellaneous,Miscellaneous group,, +faostat_qcl,00000103,Mixed grain,Mixed grains,Mixed grains is a mixture of cereal species that are sown and harvested together. It does not include meslin (a mixture wheat/rye) (c.f. 0111). (Unofficial definition), +faostat_qcl,00000165,Molasses,,Molasses This subclass is defined through the following headings/subheadings of the HS 2007: 1703., +faostat_qcl,00000449,Mushrooms and truffles,Mushrooms,Mushrooms and truffles This subclass is defined through the following headings/subheadings of the HS 2007: 0709.51 - .59., +faostat_qcl,00000292,Mustard seed,,Mustard seed This subclass is defined through the following headings/subheadings of the HS 2007: 1207.50., +faostat_fbsc,00002732,Mutton & Goat Meat,"Meat, sheep and goat","Default composition: 977 Meat, sheep, 1017 Meat, goat", +faostat_qcl,00001182,Natural honey,Honey,Natural honey This subclass is defined through the following headings/subheadings of the HS 2007: 0409., +faostat_fbsc,00002551,Nuts and products,Nuts,"Default composition: 216 Brazil nuts, with shell, 217 Cashew nuts, with shell, 220 Chestnut, 221 Almonds, with shell, 222 Walnuts, with shell, 223 Pistachios, 224 Kola nuts, 225 Hazelnuts, with shell, 226 Areca nuts, 229 Brazil nuts, shelled, 230 Cashew nuts, shelled, 231 Almonds shelled, 232 Walnuts, shelled, 233 Hazelnuts, shelled, 234 Nuts, nes, 235 Nuts, prepared (exc. 
groundnuts)", +faostat_fbsc,00002516,Oats,,"Default composition: 75 Oats, 76 Oats rolled, 77 Bran, oats", +faostat_qcl,00000075,Oats,,Oats, +faostat_fbsc,00002945,Offals,Offals group,, +faostat_fbsc,00002736,"Offals, Edible",Offals,"Default composition: 868 Offals, edible, cattle, 878 Liver prep., 948 Offals, edible, buffaloes, 978 Offals, sheep,edible, 1018 Offals, edible, goats, 1036 Offals, pigs, edible, 1059 Offals, liver chicken, 1074 Offals, liver geese, 1075 Offals, liver duck, 1081 Offals, liver turkeys, 1098 Offals, horses, 1128 Offals, edible, camels, 1159 Offals, other camelids, 1167 Offals, nes", +faostat_qcl,00000334,Oil of linseed,Linseed oil,Obtained by pressure extraction. Used mainly in non-food items (Unofficial definition), +faostat_qcl,00000060,Oil of maize,Maize oil,Extracted from germ by pressure or by solvents (Unofficial definition), +faostat_qcl,00000258,Oil of palm kernel,Palm kernel oil,Obtained from the kernel of the nut of the fruits of the oil palm by pressure in two or three stages at different temperatures. Including oil of babassu kernels (Unofficial definition), +faostat_qcl,00000290,Oil of sesame seed,Sesame oil,Obtained by pressure extraction in two or three stages at different temperatures. Sometimes the oil is also extracted by solvent from the residue of the pressure extraction. Used mainly for food (Unofficial definition), +faostat_qcl,00000254,Oil palm fruit,Palm fruit oil,"This subclass covers oil palm fruit, species of Elaeis guineensis. The oil palm produces bunches containing a large number of fruits with the fleshy mesocarp enclosing a kernel that is covered by a very hard shell. FAO considers palm oil (coming from the pulp) and palm kernels as primary products. The oil extraction rate from a bunch varies from 17 to 27% for palm oil, and from 4 to 10% for palm kernels. 
(Unofficial definition)", +faostat_fbsc,00002913,Oilcrops,,, +faostat_qcl,00001841,"Oilcrops, Cake Equivalent",,, +faostat_qcl,00001732,"Oilcrops, Oil Equivalent",,"Oil-Bearing Crops or Oil Crops include both annual (usually called oilseeds) and perennial plants whose seeds, fruits or mesocarp and nuts are valued mainly for the edible or industrial oils that are extracted from them. Dessert and table nuts, although rich in oil, are listed under Nuts (see Chapter .). Annual oilseed plants tha are either harvested green or are used for grazing and for green manure are included with Fodder Crops (see Chapter 11.). Some of the crops included in this chapter are also fibre crops in that both the seeds and the fibres are harvested from the same plant. Such crops include: coconuts, yielding coir from the mesocarp; kapok fruit; seed cotton; linseed; and hempseed. In the case of several other crops, both the pulp of the fruit and the kernels are used for oil. The main crops of this type are oil-palm fruit and tallow tree seeds. Production data are reported in terms of dry products as marketed. Exceptions to this general rule include: groundnuts, which are reported as groundnuts in the shell; coconuts, which are reported on the basis of the weight of the nut including the woody shell, but excluding the fibrous outer husk; and palm oil, which is reported in terms of oil, by weight. Because of the very different nature of the various oil crops, the primary products cannot be aggregated in their natural weight to obtain total oil crops. For this reason, FAO converts the crops to either an oil equivalent or an oilcake equivalent before aggregating them. Only 5-6 percent of the world production of oil crops is used for seed (oilseeds) and animal feed, while about 8 percent is used for food. The remaining 86 percent is processed into oil. The fat content of oil crops varies widely. 
Fat content ranges from as low as 10-15 percent of the weight of coconuts to over 50 percent of the weight of sesame seeds and palm kernels. Carbohydrates, mainly polysaccharides, range from 15 to 30 percent in the oilseeds, but are generally lower in other oil-bearing crops. The protein content is very high in soybeans, at up to 40 percent, but is much lower in many other oilseeds, at 15-25 percent, and is lower still in some other oil-bearing crops.", +faostat_qcl,00000430,Okra,,"Okra, species of Abelmoschus esculentus and Hibiscus esculentus, also called gombo, fresh or chilled. (Unofficial definition)", +faostat_fbsc,00002580,Olive Oil,Olive oil,"Default composition: 261 Oil, olive, virgin, 274 Oil, olive residues", +faostat_qcl,00000261,Olive oil,Olive oil,Olive oil, +faostat_qcl,00000260,Olives,,"Olives This subclass includes: - olives, Olea europaea", +faostat_fbsc,00002563,Olives (including preserved),Olives,"Default composition: 260 Olives, 262 Olives preserved", +faostat_fbsc,00002602,Onions,,"Default composition: 403 Onions, dry", +faostat_qcl,00000403,"Onions and shallots, dry (excluding dehydrated)",Onions,"Onions and shallots, dry (excluding dehydrated), species of Allium cepa, includes onions at a mature stage, but not dehydrated onions. (Unofficial definition)", +faostat_qcl,00000490,Oranges,,"Oranges This subclass includes: - oranges, Citrus sirensis - bitter oranges, Citrus aurantium This subclass does not include: - bergamots, cf. 01329 - chinottos, cf. 01329", +faostat_fbsc,00002611,"Oranges, Mandarines",Oranges,"Default composition: 490 Oranges, 491 Juice, orange, single strength, 492 Juice, orange, concentrated, 495 Tangerines, mandarins, clementines, satsumas, 496 Juice, tangerine", +faostat_qcl,00000414,"Other beans, green","Beans, green","Other beans, green, species of Phaseolus spp. And Vigna spp., for shelling, fresh or chilled. 
(Unofficial definition)", +faostat_qcl,00000821,"Other fibre crops, raw, n.e.c.",Fibre crops,"Including inter alia: alfa, esparto (Lygeum spartum; Stipa tenacissima); bowstring hemp (Sansevieria spp.); caroa (Neoglaziovia variegata); fuque fibre (Furcraea macrophylla); Mauritius hemp (F. gigantea); New Zealand flax (Phormium tenax); palma ixtle (Samuela carnerosana) Other fibres that are not identified separately because of their minor relevance at the international level. Because of their limited local importance, some countries report vegetable fibres under this commodity heading that are classified individually by FAO. The fibre is obtained from the leaves, stalks or fruit of the plant. In instances where the fibrous part is normally used for other purposes, data cover only those fibres intended for spinning. (Unofficial definition)", +faostat_qcl,00000256,Palm kernels,,"Palm kernels are the Seeds of the oil palm. Babassu kernels, species of Orbignya speciosa, are often reported as palm kernels. (Unofficial definition)", +faostat_fbsc,00002562,Palm kernels,,"Default composition: 254 Oil, palm fruit, 256 Palm kernels", +faostat_qcl,00000257,Palm oil,Palm oil,Palm oil, +faostat_fbsc,00002576,Palmkernel Oil,Palm kernel oil,"Default composition: 258 Oil, palm kernel", +faostat_qcl,00000600,Papayas,,Papayas This subclass is defined through the following headings/subheadings of the HS 2007: 0807.20., +faostat_qcl,00000534,Peaches and nectarines,,Peaches and nectarines This subclass is defined through the following headings/subheadings of the HS 2007: 0809.30., +faostat_qcl,00000521,Pears,,"Pears, species of Pyrus communis, for beverages production (e.g. perry), dessert or industrial purposes (e.g. production of paste, jam or jelly, and pectin). 
(Unofficial definition)", +faostat_fbsc,00002547,Peas,"Peas, dry","Default composition: 187 Peas, dry", +faostat_qcl,00000187,"Peas, dry",,"Peas, dry This subclass is defined through the following headings/subheadings of the HS 2007: 0713.10.", +faostat_qcl,00000417,"Peas, green",,"Peas, green This subclass is defined through the following headings/subheadings of the HS 2007: 0708.10.", +faostat_fbsc,00002640,Pepper,,Default composition: 687 Pepper (piper spp.), +faostat_qcl,00000687,"Pepper (Piper spp.), raw",Pepper,"Pepper (Piper spp.), raw This subclass is defined through the following headings/subheadings of the HS 2007: 0904.11.", +faostat_qcl,00000197,"Pigeon peas, dry",Pigeon peas,"Pigeon peas, dry This subclass is defined through the following headings/subheadings of the HS 2007: 0713.60.", +faostat_fbsc,00002733,Pigmeat,"Meat, pig","Default composition: 1035 Meat, pig, 1038 Meat, pork, 1039 Bacon and ham, 1041 Meat, pig sausages, 1042 Meat, pig, preparations", +faostat_fbsc,00002641,Pimento,Chillies and peppers,"Default composition: 689 Chillies and peppers, dry", +faostat_qcl,00000574,Pineapples,,"Pineapples This subclass includes: - pineapples, Ananas comosus", +faostat_fbsc,00002618,Pineapples and products,Pineapples,"Default composition: 574 Pineapples, 575 Pineapples canned, 576 Juice, pineapple, 580 Juice, pineapple, concentrated", +faostat_qcl,00000223,"Pistachios, in shell",Pistachios,"Pistachios, in shell This subclass includes: - pistachios, seeds from the fruit of Pistacia vera", +faostat_fbsc,00002616,Plantains,,Default composition: 489 Plantains, +faostat_qcl,00000489,Plantains and cooking bananas,Plantains,"Plantains and cooking bananas This subclass includes: - plantains - cooking bananas, Musa paradisiaca, i.e. bananas that need to be prepared (cooked or boiled) for consumption This subclass does not include: - sweet/dessert bananas, Musa sapientum, M. cavendishii, M. nana, cf. 
01312", +faostat_qcl,00000536,Plums and sloes,Plums,Plums and sloes This subclass is defined through the following headings/subheadings of the HS 2007: 0809.40., +faostat_qcl,00000507,Pomelos and grapefruits,Grapefruit,"Pomelos and grapefruits This subclass includes: - grapefruits, fruit of the grapefruit tree, Citrus paradisi - pomelos, shaddocks, Citrus maxima, or Citrus grandis", +faostat_qcl,00000296,Poppy seed,Poppy seeds,Poppy seed This subclass is defined through the following headings/subheadings of the HS 2007: 1207.91., +faostat_qcl,00000116,Potatoes,,Potatoes This subclass is defined through the following headings/subheadings of the HS 2007: 0701., +faostat_fbsc,00002531,Potatoes and products,Potatoes,"Default composition: 116 Potatoes, 117 Flour, potatoes, 118 Potatoes, frozen, 119 Starch, potatoes, 121 Tapioca, potatoes", +faostat_qcl,00002029,Poultry Birds,Poultry,, +faostat_fbsc,00002734,Poultry Meat,"Meat, poultry","Default composition: 1058 Meat, chicken, 1060 Fat, liver prepared (foie gras), 1061 Meat, chicken, canned, 1069 Meat, duck, 1073 Meat, goose and guinea fowl, 1080 Meat, turkey", +faostat_fbsc,00002911,Pulses,,, +faostat_qcl,00001726,"Pulses, Total",Pulses,"Pulses are annual leguminous crops yielding from one to 12 grains or seeds of variable size, shape and color within a pod. They are used for both food and feed. The term ""pulses"" is limited to crops harvested solely for dry grain, thereby excluding crops harvested green for food (green peas, green beans, etc.) which are classified as vegetable crops. Also excluded are those crops used mainly for oil extraction (e.g. soybean and groundnuts) and leguminous crops (e.g. seeds of clover and alfalfa) that are used exclusively for sowing purposes. In addition to their food value, pulses also play an important role in cropping systems because of their ability to produce nitrogen and thereby enrich the soil. 
Pulses contain carbohydrates, mainly starches (55-65 percent of the total weight); proteins, including essential amino acids (18-25 percent, and much higher than cereals); and fat (1 - 4 percent). The remainder consists of water and inedible substances. Production data should be reported in terms of dry clean weight, excluding the weightof the pods. Certain kinds of pulses can be skinned and partially crushed or split toremove the seed-coat, but the resulting products are still considered raw for classification purposes. FAO covers 11 primary pulses. Each is listed below, along with its code, its botanical name, or names, and a short description. Only two processed products are included in the FAO list, namely flour of pulses and bran of pulses.", +faostat_qcl,00000092,Quinoa,,Quinoa This subclass is defined through the following headings/subheadings of the HS 2007: 1008.50., +faostat_qcl,00001140,Rabbits and hares,Rabbits,"Rabbits and hares This subclass includes: - rabbits, Oryctolagus cuniculus - hares, species of Lepus", +faostat_qcl,00000270,Rape or colza seed,Rapeseed,Rape or colza seed This subclass is defined through the following headings/subheadings of the HS 2007: 1205., +faostat_qcl,00000271,"Rapeseed or canola oil, crude",Rapeseed oil,Obtained by pressure extraction for food use. Oil recovered with solvent from the residues of the pressure extraction is used for industrial purposes. Canola oil is produced from new varieties of rapeseed (Unofficial definition), +faostat_qcl,00000547,Raspberries,,"This subclass covers raspberries, species of Rubus idaeus.", +faostat_qcl,00000162,Raw cane or beet sugar (centrifugal only),Sugar (raw),The sum of codes 23511.01 and 23512. Processed further to obtain refined sugar (Unofficial definition), +faostat_qcl,00000957,Raw hides and skins of buffaloes,Buffalo hides,Green hide or skin as removed from the carcass of the animal. Used for production data only. Both adult and young animals. 
(Unofficial definition), +faostat_qcl,00000919,Raw hides and skins of cattle,Cattle hides,Green hide or skin as removed from the carcass of the animal. Used for production data only. (Unofficial definition), +faostat_qcl,00001025,Raw hides and skins of goats or kids,"Skins, goat","Raw hides and skins of goats or kids This subclass includes: - raw hides and skins (fresh or preserved, but not further prepared) of goats and kids, animals of subclass 02123", +faostat_qcl,00000995,Raw hides and skins of sheep or lambs,"Skins, sheep",Raw hides and skins of sheep or lambs This subclass is defined through the following headings/subheadings of the HS 2007: 4102., +faostat_qcl,00001186,Raw silk (not thrown),Silk,Raw silk (not thrown) This subclass is defined through the following headings/subheadings of the HS 2007: 5002., +faostat_qcl,00000027,Rice,Rice,Rice, +faostat_fbsc,00002807,Rice and products,Rice,"Default composition: 27 Rice, paddy, 28 Rice, husked, 29 Rice, milled/husked, 31 Rice, milled, 32 Rice, broken, 33 Gluten, rice, 34 Starch, rice, 35 Bran, rice, 38 Flour, rice", +faostat_qcl,00001720,"Roots and Tubers, Total",Roots and tubers,"Roots and Tubers are plants yielding starchy roots, tubers, rhizomes, corms and stems. They are used mainly for human food (as such or in processed form), for animal feed and for manufacturing starch, alcohol and fermented beverages including beer. The denomination ""roots and tubers"" excludes crops which are cultivated mainly for feed (mangolds, swedes) or for processing into sugar (sugar beets), and those classified as ""roots, bulb and tuberous vegetables"" (onions, garlic and beets). It does include starch and the starchy pith and flour obtained from the trunk of the sago palm and the stem of the Abyssinian banana (Musa ensete). Certain root crops, notably bitter cassava, contain toxic substances, particularly in the skins. As a result, certain processes must be undertaken to make the product safe for human consumption. 
Apart from their high water content (70-80 percent), these crops contain mainly carbohydrates (largely starches that account for 16-24 percent of their total weight) with very little protein and fat (0-2 percent each). Methods of propagating root crops vary. A live potato tuber or seed must be planted but only part of the live yam tuber and a piece of the stalk (not the root) in the case of cassava. Production data of root crops should be reported in terms of clean weight, i.e. free of earth and mud. FAO distinguishes among seven primary root and tuber crops. The code and name of each one appears in the list that follows, along with its botanical name, or names, and a short description. The processed products of roots and tubers are listed together with their parent primary crops.", +faostat_qcl,00000071,Rye,,Rye, +faostat_fbsc,00002515,Rye and products,Rye,"Default composition: 71 Rye, 72 Flour, rye, 73 Bran, rye", +faostat_qcl,00000280,Safflower seed,,"Safflower seed This subclass includes: - safflower seed, Carthamus tinctorius", +faostat_qcl,00000281,"Safflower-seed oil, crude",Safflower oil,Obtained either by pressure or by solvent. Has both food and industrial uses (Unofficial definition), +faostat_qcl,00000328,"Seed cotton, unginned",Seed cotton,"Unginned cotton, Gossypium spp. Grown for both seed and for fibre. FAO considers cottonseed, cotton lint and linters to be primary products. Lint content ranges from 30 to 40%, seed 55 to 65%, and linters 2 to 5% though they are not always separated. 
(Unofficial definition)", +faostat_qcl,00000289,Sesame seed,,Sesame seed This subclass is defined through the following headings/subheadings of the HS 2007: 1207.40., +faostat_fbsc,00002561,Sesame seed,,Default composition: 289 Sesame seed, +faostat_fbsc,00002579,Sesameseed Oil,Sesame oil,"Default composition: 290 Oil, sesame", +faostat_qcl,00001807,Sheep and Goat Meat,"Meat, sheep and goat",, +faostat_qcl,00001749,Sheep and Goats,Sheep and goats,, +faostat_qcl,00000979,"Sheep fat, unrendered","Fat, sheep","Sheep fat, unrendered This subclass includes: - fat from sheep, animals of subclass 02122, unrendered This subclass does not include: - tallow, fat from sheep, rendered, cf. 21523 - marrow, bone or foot oil and their fractions, from sheep, cf. 21529", +faostat_qcl,00000987,"Shorn wool, greasy, including fleece-washed shorn wool",Wool,"Shorn wool, greasy, including fleece-washed shorn wool This subclass is defined through the following headings/subheadings of the HS 2007: 5101.11.", +faostat_qcl,00000083,Sorghum,,Sorghum, +faostat_fbsc,00002518,Sorghum and products,Sorghum,"Default composition: 83 Sorghum, 84 Flour, sorghum, 85 Bran, sorghum", +faostat_qcl,00000237,Soya bean oil,Soybean oil,Soya bean oil, +faostat_qcl,00000236,Soya beans,Soybeans,Soya beans, +faostat_fbsc,00002571,Soyabean Oil,Soybean oil,"Default composition: 237 Oil, soybean", +faostat_fbsc,00002555,Soyabeans,Soybeans,"Default composition: 236 Soybeans, 239 Soya sauce, 240 Soya paste, 241 Soya curd", +faostat_qcl,00000373,Spinach,,Spinach This subclass is defined through the following headings/subheadings of the HS 2007: 0709.70., +faostat_qcl,00000544,Strawberries,,Strawberries This subclass is defined through the following headings/subheadings of the HS 2007: 0810.10., +faostat_qcl,00000423,String beans,,"String beans, species of Phaseolus vulgaris. String beans for shelling are excluded. 
(Unofficial definition)", +faostat_fbsc,00002908,Sugar Crops,Sugar crops,, +faostat_qcl,00001723,Sugar Crops Primary,Sugar crops,"Sugar Crops and Sweeteners and derived products: In addition to providing the source for the manufacture of sugar, SUGAR CROPS are used to produce alcohol and ethanol. In certain countries, sugar cane is eaten raw in minor quantities. It also is used in the preparation of juices and for animal feed. There are two major sugar crops: sugar beets and sugar cane. However, sugar and syrups are also produced from the sap of certain species of maple trees, from sweet sorghum when cultivated explicitly for making syrup and from sugar palm. Sugar beets that are cultivated solely as a fodder crop and red or garden beets that are classified as vegetable crops are excluded from the FAO list of sugar crops. Sugar cane is a perennial grass (replanted at intervals using pieces of the cane stalks) that is cultivated mainly in the tropics. Sugar beet is an annual crop that is propagated by the seeds of the flowers. It is cultivated in cooler climates than sugar cane, mainly above the 35th parallel of the Northern Hemisphere. Both sugar beets and sugar cane have high water content, accounting for about 75 percent of the total weight of the plants. The sugar content of sugar cane ranges from 10 to 15 percent of the total weight, while that of sugar beets is between 13 and 18 percent. The protein and fat content of both beets and cane is almost nil. Production data on sugar beets and sugar cane relate to the harvested crop, free of soil, plant tops and leaves. FAO lists three primary sugar crops. Under the name SWEETENERS, FAO includes products used for sweetening that are derived from sugar crops, cereals, fruits or milk, or that are produced by insects. This category includes a wide variety of monosaccharide (glucose and fructose) and disaccharides (sucrose and saccharose). 
They exist either in a crystallized state as sugar, or in thick liquid form as syrups. The traditional sources of sugar are sugar cane and sugar beets. But in recent years, ever larger quantities of cereals (mainly maize) have been used to produce sweeteners derived from starch. OTHER DERIVED PRODUCTS. In addition to sugar, molasses is also obtained with various degrees of sugar content. The by-product obtained from the extraction of sugar is called bagasse in the case of sugar cane, and beet pulp in the case of sugar beets.", +faostat_fbsc,00002537,Sugar beet,,Default composition: 157 Sugar beet, +faostat_qcl,00000157,Sugar beet,,Sugar beet This subclass is defined through the following headings/subheadings of the HS 2007: 1212.91., +faostat_fbsc,00002536,Sugar cane,,Default composition: 156 Sugar cane, +faostat_qcl,00000156,Sugar cane,,"Sugar cane This subclass includes: - sugar cane, Saccharum officinarum", +faostat_fbsc,00002557,Sunflower seed,,Default composition: 267 Sunflower seed, +faostat_qcl,00000267,Sunflower seed,,Sunflower seed This subclass is defined through the following headings/subheadings of the HS 2007: 1206., +faostat_qcl,00000268,"Sunflower-seed oil, crude",Sunflower oil,Obtained by pressure extraction. 
Mainly for food use (Unofficial definition), +faostat_fbsc,00002573,Sunflowerseed Oil,Sunflower oil,"Default composition: 268 Oil, sunflower", +faostat_qcl,00000122,Sweet potatoes,,Sweet potatoes This subclass is defined through the following headings/subheadings of the HS 2007: 0714.20., +faostat_fbsc,00002533,Sweet potatoes,,Default composition: 122 Sweet potatoes, +faostat_qcl,00000495,"Tangerines, mandarins, clementines",Tangerines,"Tangerines, mandarins, clementines This subclass includes: - tangerines, mandarins and clementines, fruits of Citrus reticulata", +faostat_qcl,00000667,Tea leaves,Tea,"Tea leaves This subclass is defined through the following headings/subheadings of the HS 2007: 0902.20, .40.", +faostat_qcl,00000388,Tomatoes,,Tomatoes This subclass is defined through the following headings/subheadings of the HS 2007: 0702., +faostat_fbsc,00002601,Tomatoes and products,Tomatoes,"Default composition: 388 Tomatoes, 389 Juice, tomato, concentrated, 390 Juice, tomato, 391 Tomatoes, paste, 392 Tomatoes, peeled", +faostat_fbsc,00002912,Treenuts,,, +faostat_qcl,00001729,"Treenuts, Total",Treenuts,"Tree NUTS are dry fruits or kernels enclosed in woody shells or hard husks, which in turn are generally covered by a thick, fleshy/fibrous outer husk that is removed during harvest. Similar products, such as groundnuts, sunflower seeds and melon seeds, although often used for similar purposes, are included with oil-bearing crops.FAO includes in this group only dessert or table nuts. Nuts that are used mainly for flavouring beverages and masticatory and stimulant nuts should be excluded. An exception is made for areca nuts and kola nuts, which FAO considers to be inedible nuts, but which are included with the nut and derived products group to be consistent with international trade classifications. Nuts used mainly for the extraction of oil or butter, (e.g. sheanuts) as well as nuts contained in other fruits (e.g. peaches) are excluded. 
It should be noted that some countries report certain nut crops (chestnuts, pignolia nuts) with forestry products. Production data relate to the weight of nuts in the shell or husk, but without the outer husk. The weight of the kernel contained in the nut ranges from as low as 30 percent for cashew nuts to as high as 80 percent in the case of chestnuts. The edible portion of nut kernels is, with the major exception of chestnuts, very rich in fat content at between 50 percent and 65 percent. Protein content makes up 15-20 percent and carbohydrate content is between 10 percent and 15 percent. Starch and saccharose are the main components of dry chestnuts, accounting for about 75 percent. FAO covers ten primary nut crops. Each is listed below along with its code, its botanical name, or names, and a short description. NUT PRODUCTS include shelled nuts, whole or split, and further processed products, including roasted nuts, meal/flour, paste, oil, etc. Nut oils are not separately identified in the FAO classification; instead they are included under the heading ""oil of vegetable origin nes"". The most commonly marketed oils are almon oil and cashew nut oil and its derivative cardol.", +faostat_qcl,00000826,Unmanufactured tobacco,Tobacco,Unmanufactured tobacco This subclass is defined through the following headings/subheadings of the HS 2007: 2401.10., +faostat_fbsc,00002918,Vegetables,,, +faostat_qcl,00001735,Vegetables Primary,Vegetables,"Vegetables, as classified in this group, are mainly annual plants cultivated as field and garden crops in the open and under glass, and used almost exclusively for food. Vegetables grown principally for animal feed or seed should be excluded. Certain plants, normally classified as cereals and pulses, belong to this group when harvested green, such as green maize, green peas, etc. 
This grouping differs from international trade classifications for vegetables in that it includes melons and watermelons, which are normally considered to be fruit crops. But, whereas fruit crops are virtually all permanent crops, melons and watermelons are similar to vegetables in that they are temporary crops. Chillies and green peppers are included in this grouping when they are harvested for consumption as vegetables and not processed into spices. FAO production data for green peas and green beans refer to the total weight including pods, although some countries report on a shelled weight basis. The weight of the pods ranges from 40 to 50 percent for peas to up to 70 percent for broad beans. Area data on small vegetable gardens are often omitted in agricultural surveys, although production estimates may be reported. Trade data for fresh vegetables also include chilled vegetables, meaning the temperature of the products has been reduced to around 0øC without the products being frozen. Vegetables contain principally water, accounting for between 70 percent and 95 percent of their weight. They are low in nutrients, but contain minerals and vitamins. FAO covers 27 primary vegetable products. Each is listed along with its code, botanical name, or names, and a short description. PRODUCTS DERIVED FROM VEGETABLES refer to processed products. Apart from a few main products, international trade classifications do not permit a sufficiently detailed classification of processed products according to the primary commodity used in the preparation. 
A similar situation prevails for frozen vegetables.", +faostat_qcl,00000222,"Walnuts, in shell",Walnuts,"Walnuts, in shell This subclass is defined through the following headings/subheadings of the HS 2007: 0802.31.", +faostat_qcl,00000567,Watermelons,,Watermelons This subclass is defined through the following headings/subheadings of the HS 2007: 0807.11., +faostat_qcl,00000015,Wheat,,Wheat, +faostat_fbsc,00002511,Wheat and products,Wheat,"Default composition: 15 Wheat, 16 Flour, wheat, 17 Bran, wheat, 18 Macaroni, 19 Germ, wheat, 20 Bread, 21 Bulgur, 22 Pastry, 23 Starch, wheat, 24 Gluten, wheat, 41 Cereals, breakfast, 110 Wafers, 114 Mixes and doughs, 115 Food preparations, flour, malt extract", +faostat_qcl,00000900,"Whey, dry",Whey,Used in both food and animal feed (Unofficial definition), +faostat_fbsc,00002655,Wine,,"Default composition: 564 Wine, 565 Vermouths & similar", +faostat_qcl,00000564,Wine,,"Wines of fresh grapes of all qualities, including sparkling, fortified and dessert wines (Unofficial definition)", +faostat_qcl,00000137,Yams,,"Yams This subclass includes: - yams, tubers from vines of the genus Dioscorea D. batatas D. trifida D. alata D. bulbifera D. rotunda D. cayenensis D. exculenta D. dumetorum This subclass does not include: - sweet potatoes, cf. 01530", +faostat_fbsc,00002535,Yams,,Default composition: 137 Yams, diff --git a/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py b/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py new file mode 100644 index 00000000000..22d10b928cd --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py @@ -0,0 +1,832 @@ +"""This module contains a class for each type of data anomaly detected. + +If after a data update an anomaly is no longer in the data, remove the corresponding class from this module. + +See documentation of class DataAnomaly below for more details on how anomaly classes are structured. 
+ +""" +import abc +import os +from typing import Tuple + +import pandas as pd +import plotly.express as px +from structlog import get_logger + +log = get_logger() + +# Sentence to add before describing data anomalies (if there is any). +ANOMALY_DESCRIPTION_INTRODUCTION = "\n\nProcessing of possible data anomalies by Our World in Data:" + +# If environment variable INSPECT_ANOMALIES is set to True, run the step in interactive mode. +INSPECT_ANOMALIES = bool(os.getenv("INSPECT_ANOMALIES", False)) + + +class DataAnomaly(abc.ABC): + """Abstract class for a certain type of data anomaly.""" + + def __init__(self) -> None: + pass + + @property + @abc.abstractmethod + def description(self) -> str: + """A human-readable text that describes the anomaly. + + NOTE: The description will be added to the dataset metadata description, and hence will be shown in grapher. + """ + raise NotImplementedError + + @abc.abstractmethod + def check(self, df: pd.DataFrame) -> None: + """A method that ensures the anomaly exists in the data. + + This is useful to detect if an anomaly has been corrected after a data update. + + Parameters + ---------- + df : pd.DataFrame + Data containing anomalies. + """ + raise NotImplementedError + + @abc.abstractmethod + def fix(self, df: pd.DataFrame) -> pd.DataFrame: + """A method that removes the anomaly. + + Parameters + ---------- + df : pd.DataFrame + Data that contains anomalies to be removed. + + Returns + ------- + df_fixed : pd.DataFrame + Data after removing anomalies. + """ + raise NotImplementedError + + def inspect(self, df: pd.DataFrame) -> None: + """An optional method that plots (in the browser) a visualization that shows the anomaly. + + It can be used before and after removing the anomalies. + + Parameters + ---------- + df : pd.DataFrame + Data to be inspected (before or after having anomalies removed). 
+ """ + raise NotImplementedError + + def handle_anomalies(self, df: pd.DataFrame, inspect_anomalies: bool = INSPECT_ANOMALIES) -> pd.DataFrame: + """A helper method that uses all the previous methods in the usual order. + + Parameters + ---------- + df : pd.DataFrame + Data with anomalies. + inspect_anomalies : bool, optional + True to open charts in the browser to visualize the data before and after removing the anomalies. + + Returns + ------- + df_fixed : pd.DataFrame + Data after removing anomalies. + """ + log.info(f"Handling anomaly: {self.description}") + log.info("Checking that known data anomalies are present in the data") + self.check(df=df) + + if inspect_anomalies: + log.info("Inspect anomaly before fixing.") + self.inspect(df=df) + + log.info("Fixing anomalies.") + df_fixed = self.fix(df=df) + + if inspect_anomalies: + log.info("Inspect anomaly after fixing.") + self.inspect(df=df_fixed) + + return df_fixed + + +def _split_long_title(text: str) -> str: + """Split a given text to have at most 100 characters per line, where line breaks are noted by the HTML tag
.""" + # I couldn't find an easy way to show a figure below a long text, so I split the title into lines of a fixed width. + line_length = 100 + html_text = "
".join([text[i : i + line_length] for i in range(0, len(text), line_length)]) + + return html_text + + +class SpinachAreaHarvestedAnomaly(DataAnomaly): + description = ( # type: ignore + "The area harvested of spinach for China (which refers to mainland) in 1984 is missing. " + "This causes that other regions that are aggregates which include China mainland have a spurious reduction in " + "area harvested of spinach in that year, and a spurious increase in yield. " + "Therefore, we remove those spurious aggregate values." + ) + + affected_item_codes = [ + "00000373", + ] + affected_element_codes = [ + "005312", + "5312pc", + "005419", + ] + affected_years = [ + 1984, + ] + affected_countries = [ + "China", + "China (FAO)", + "Asia", + "Asia (FAO)", + "Upper-middle-income countries", + "Eastern Asia (FAO)", + "World", + ] + + def check(self, df): + # Check that the data point is indeed missing. + assert df[ + (df["country"] == "China") + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & (df["year"].isin(self.affected_years)) + ].empty + # For consistency, check that other years do have data for the same item and element. + assert not df[ + (df["country"] == "China") + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & ~(df["year"].isin(self.affected_years)) + ].empty + + def inspect(self, df): + log.info( + "The anomaly causes: " + "\n* A dip in area harvested of spinach in that year (element code 005312). " + "\n* In terms of per capita area (element code 005312pc), the dip is not as visible. " + "\n* A big increase in yield (element code 005419) for that year." 
+ ) + for element_code in self.affected_element_codes: + selection = ( + (df["country"].isin(self.affected_countries)) + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"] == element_code) + ) + df_affected = df[selection].astype({"country": str}) + title = _split_long_title(self.description + f"Element code {element_code}") + fig = px.line(df_affected, x="year", y="value", color="country", title=title) + fig.show() + + def fix(self, df): + indexes_to_drop = df[ + ( + (df["country"].isin(self.affected_countries)) + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & (df["year"].isin(self.affected_years)) + ) + ].index + df_fixed = df.drop(indexes_to_drop).reset_index(drop=True) + + return df_fixed + + +class CocoaBeansFoodAvailableAnomaly(DataAnomaly): + description = ( # type: ignore + "Food available for consumption for cocoa beans from 2010 onwards presents many zeros for different countries. " + "These zeros are likely to correspond to missing data. " + "This issue may be caused by a change in FAO methodology precisely on 2010. " + "Therefore, to be conservative, we eliminate those zeros and treat them as missing values. " + "For aggregate regions (like continents), data from 2010 onwards is not zero, but a small number (resulting " + "from summing many spurious zeros). " + "Therefore, we also remove data for region aggregates from 2010 onwards." + ) + + affected_item_codes = [ + "00002633", + ] + affected_element_codes = [ + "000645", + "0645pc", + "005142", + "5142pc", + ] + # List of countries with value of exactly zero for all years after 2010. + # This list does not need to include all countries with that problem (it's used just to check they are still zero). 
+ expected_countries_with_all_zero = [ + "United States", + "China", + "Norway", + ] + + def check(self, df): + assert ( + df[ + ( + (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & (df["year"] >= 2010) + & (df["country"].isin(self.expected_countries_with_all_zero)) + ) + ]["value"] + == 0 + ).all() + # Check that, for the same countries, there is at least one value prior to 2010 where value is not zero. + assert ( + df[ + ( + (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & (df["year"] < 2010) + & (df["country"].isin(self.expected_countries_with_all_zero)) + ) + ]["value"] + > 0 + ).any() + + def inspect(self, df): + log.info( + "The anomaly causes: " + "\n* Zeros from 2010 onwards. " + "\n* I's usually zero all years, but some countries also have single non-zero values (e.g. Afghanistan)." + ) + for element_code in self.affected_element_codes: + selection = (df["item_code"].isin(self.affected_item_codes)) & (df["element_code"] == element_code) + df_affected = df[selection].astype({"country": str}).sort_values(["country", "year"]) + title = _split_long_title(self.description + f"Element code {element_code}") + fig = px.line(df_affected, x="year", y="value", color="country", title=title, markers=True) + fig.show() + + def fix(self, df): + # Remove all possibly spurious zeros from 2010 onwards in all countries. + indexes_to_drop = df[ + ( + (df["year"] > 2010) + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & (df["value"] == 0) + ) + ].index.tolist() + # Additionally, remove all data for region aggregates from 2010 onwards. + # List of possibly affected region aggregates, including all original FAO region aggregates. 
+ aggregates = [ + "North America", + "South America", + "Europe", + "European Union (27)", + "Africa", + "Asia", + "Oceania", + "Low-income countries", + "Upper-middle-income countries", + "Lower-middle-income countries", + "High-income countries", + "World", + ] + sorted(set(df[df["country"].str.contains("FAO")]["country"])) + indexes_to_drop.extend( + df[ + (df["country"].isin(aggregates)) + & (df["year"] >= 2010) + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + ].index.tolist() + ) + + df_fixed = df.drop(indexes_to_drop).reset_index(drop=True) + + return df_fixed + + +class EggYieldNorthernEuropeAnomaly(DataAnomaly): + description = ( # type: ignore + "The amount of eggs produced per bird for Northern Europe (FAO) is unreasonably high before 1973, with values " + "between 50kg and 115kg, while from 1973 on it has more reasonable values, below 20kg. " + "Therefore, we remove all values for Northern Europe (FAO) between 1961 and 1972." + ) + + affected_item_codes = [ + "00001783", + ] + affected_element_codes = [ + "005410", + ] + affected_years = [ + 1961, + 1962, + 1963, + 1964, + 1965, + 1966, + 1967, + 1968, + 1969, + 1970, + 1971, + 1972, + ] + affected_countries = [ + "Northern Europe (FAO)", + ] + + def check(self, df): + # Check that the data prior to 1973 is indeed higher than expected, and significantly lower from then on. 
+ assert ( + df[ + (df["country"].isin(self.affected_countries)) + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & (df["year"].isin(self.affected_years)) + ]["value"] + > 40 + ).all() + assert ( + df[ + (df["country"].isin(self.affected_countries)) + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & ~(df["year"].isin(self.affected_years)) + ]["value"] + < 40 + ).all() + + def inspect(self, df): + log.info( + "The anomaly causes: " + "\n* The egg yield of Northern Europe (FAO) before 1973 much higher than any other year." + ) + for element_code in self.affected_element_codes: + selection = (df["item_code"].isin(self.affected_item_codes)) & (df["element_code"] == element_code) + df_affected = df[selection].astype({"country": str}).sort_values(["country", "year"]) + title = _split_long_title(self.description + f"Element code {element_code}") + fig = px.line(df_affected, x="year", y="value", color="country", title=title) + fig.show() + + def fix(self, df): + indexes_to_drop = df[ + ( + (df["country"].isin(self.affected_countries)) + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & (df["year"].isin(self.affected_years)) + ) + ].index + df_fixed = df.drop(indexes_to_drop).reset_index(drop=True) + + return df_fixed + + +class TeaProductionAnomaly(DataAnomaly): + description = ( # type: ignore + "Tea production in FAO data increased dramatically from 1990 to 1991 for many different countries (including " + "some of the main producers, like China and India). However, data from 1991 was flagged as 'Estimated value' " + "(while data prior to 1991 is flagged as 'Official figure'). This potentially anomalous increase was not " + "present in the previous version of the data. Therefore, we removed tea production data (as well as " + "per-capita production and yield) from 1991 onwards." 
+ ) + + affected_item_codes = [ + "00000667", + ] + affected_element_codes = [ + "005510", # Production. + "5510pc", # Per capita production. + "005419", # Yield. + ] + # Countries affected by the anomaly. + # NOTE: All countries will be removed (since some of them are the main contributors to tea production), but these + # ones will be used to check for the anomaly. + affected_countries = [ + "Africa", + "Africa (FAO)", + "Americas (FAO)", + "Argentina", + "Asia", + "Asia (FAO)", + "Bangladesh", + "China", + "China (FAO)", + "Eastern Africa (FAO)", + "Eastern Asia (FAO)", + "India", + "Indonesia", + "Iran", + "Kenya", + "Land Locked Developing Countries (FAO)", + "Least Developed Countries (FAO)", + "Low-income countries", + "Low Income Food Deficit Countries (FAO)", + "Lower-middle-income countries", + "Malawi", + "Net Food Importing Developing Countries (FAO)", + "Rwanda", + "South America", + "South America (FAO)", + "South-eastern Asia (FAO)", + "Southern Asia (FAO)", + "Sri Lanka", + "Tanzania", + "Turkey", + "Uganda", + "Upper-middle-income countries", + "Vietnam", + "Western Asia (FAO)", + "World", + "Zimbabwe", + ] + + def check(self, df): + # Check that the data on 1990 has the flag "A" (Official figure) for each of the affected countries. + flagged_official = df[ + (df["country"].isin(self.affected_countries)) + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & (df["year"] == 1990) + ] + flagged_estimate = df[ + (df["country"].isin(self.affected_countries)) + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & (df["year"] > 1990) + ] + # Check that all affected countries have official data on 1990, and estimated on 1991. + for country in self.affected_countries: + # Assert it for each individual country. + # Check that tea production increases by at least a factor of 3. 
+ high_value = flagged_estimate[ + (flagged_estimate["country"] == country) & (flagged_estimate["element_code"] == "005510") + ]["value"].iloc[0] + low_value = flagged_official[ + (flagged_official["country"] == country) & (flagged_official["element_code"] == "005510") + ]["value"].iloc[0] + assert high_value / low_value > 3 + + def inspect(self, df): + log.info("The anomaly causes: " "\n* The production of tea to increase dramatically from 1990 to 1991.") + for element_code in self.affected_element_codes: + selection = (df["item_code"].isin(self.affected_item_codes)) & (df["element_code"] == element_code) + df_affected = df[selection].astype({"country": str}).sort_values(["country", "year"]) + title = _split_long_title(self.description + f"Element code {element_code}") + fig = px.line(df_affected, x="year", y="value", color="country", title=title) + fig.show() + + def fix(self, df): + indexes_to_drop = df[ + ( + (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & (df["year"] > 1990) + ) + ].index + df_fixed = df.drop(indexes_to_drop).reset_index(drop=True) + + return df_fixed + + +class HighYieldAnomaly(DataAnomaly): + description = () # type: ignore + + affected_item_codes = [] + affected_element_codes = [] + affected_years = [] + affected_countries = [] + + def check(self, df): + # Check that the data in the affected years is higher than expected, and significantly lower from then on. 
+ assert ( + df[ + (df["country"].isin(self.affected_countries)) + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & (df["year"].isin(self.affected_years)) + ]["value"] + > 100 + ).all() + assert ( + df[ + (df["country"].isin(self.affected_countries)) + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & ~(df["year"].isin(self.affected_years)) + ]["value"] + < 100 + ).all() + + def inspect(self, df): + log.info("The anomaly causes: " "\n* The yield of certain items, countries and years to be unreasonably high.") + for element_code in self.affected_element_codes: + selection = (df["item_code"].isin(self.affected_item_codes)) & (df["element_code"] == element_code) + df_affected = df[selection].astype({"country": str}).sort_values(["country", "year"]) + title = _split_long_title(self.description + f"Element code {element_code}") + fig = px.line(df_affected, x="year", y="value", color="country", title=title) + fig.show() + + def fix(self, df): + indexes_to_drop = df[ + ( + (df["country"].isin(self.affected_countries)) + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & (df["year"].isin(self.affected_years)) + ) + ].index + df_fixed = df.drop(indexes_to_drop).reset_index(drop=True) + + return df_fixed + + +class FruitYieldAnomaly(HighYieldAnomaly): + description = ( # type: ignore + "Yields are unreasonably high (possibly by a factor of 1000) for some items, countries and years. " + "For example, the yield of item 'Fruit Primary' in Denmark prior to 1985 is larger than 6000 tonnes/ha. " + "Similar issues happen to Antigua and Barbuda and Burkina Faso. " + "For Netherlands, a similar anomaly is found but prior to 1984 (which will be considered separately). " + "Therefore, we remove these possibly spurious values." + ) + + affected_item_codes = [ + # Item code for "Fruit Primary". 
+ "00001738", + ] + affected_element_codes = [ + # Element code for "Yield". + "005419", + ] + affected_years = [ + 1961, + 1962, + 1963, + 1964, + 1965, + 1966, + 1967, + 1968, + 1969, + 1970, + 1971, + 1972, + 1973, + 1974, + 1975, + 1976, + 1977, + 1978, + 1979, + 1980, + 1981, + 1982, + 1983, + 1984, + ] + affected_countries = [ + "Antigua and Barbuda", + "Burkina Faso", + "Denmark", + ] + + +class FruitYieldNetherlandsAnomaly(HighYieldAnomaly): + description = ( # type: ignore + "Yields are unreasonably high (possibly by a factor of 1000) for some items, countries and years. " + "This happens to item 'Fruit Primary' in Netherlands prior to 1984. " + "Therefore, we remove these possibly spurious values." + ) + + affected_item_codes = [ + # Item code for "Fruit Primary". + "00001738", + ] + affected_element_codes = [ + # Element code for "Yield". + "005419", + ] + affected_years = [ + 1961, + 1962, + 1963, + 1964, + 1965, + 1966, + 1967, + 1968, + 1969, + 1970, + 1971, + 1972, + 1973, + 1974, + 1975, + 1976, + 1977, + 1978, + 1979, + 1980, + 1981, + 1982, + 1983, + ] + affected_countries = [ + "Netherlands", + ] + + +class LocustBeansYieldAnomaly(HighYieldAnomaly): + description = ( # type: ignore + "Yields are unreasonably high (possibly by a factor of 1000) for some items, countries and years. " + "This happens to item 'Locust beans (carobs)' for region 'Net Food Importing Developing Countries (FAO)'. " + "Therefore, we remove these possibly spurious values." + ) + + affected_item_codes = [ + # Item code for "Locust beans (carobs)". + "00000461", + ] + affected_element_codes = [ + # Element code for "Yield". 
+ "005419", + ] + affected_years = [ + 1961, + 1962, + 1963, + 1964, + 1965, + 1966, + 1967, + 1968, + 1969, + 1970, + 1971, + 1972, + 1973, + 1974, + 1975, + 1976, + 1977, + 1978, + 1979, + 1980, + 1981, + 1982, + 1983, + 1984, + ] + affected_countries = [ + "Net Food Importing Developing Countries (FAO)", + ] + + +class WalnutsYieldAnomaly(HighYieldAnomaly): + description = ( # type: ignore + "Yields are unreasonably high (possibly by a factor of 1000) for some items, countries and years. " + "This happens to item 'Walnuts, in shell' for region 'Eastern Asia (FAO)'. " + "Therefore, we remove these possibly spurious values." + ) + + affected_item_codes = [ + # Item code for "Walnuts, in shell". + "00000222", + ] + affected_element_codes = [ + # Element code for "Yield". + "005419", + ] + affected_years = [ + 1961, + 1962, + 1963, + 1964, + 1965, + 1966, + 1967, + 1968, + 1969, + 1970, + 1971, + 1972, + 1973, + 1974, + 1975, + 1976, + 1977, + 1978, + 1979, + 1980, + 1981, + 1982, + 1983, + 1984, + ] + affected_countries = [ + "Eastern Asia (FAO)", + ] + + +class OtherTropicalFruitYieldNorthernAfricaAnomaly(HighYieldAnomaly): + description = ( # type: ignore + "Yields are unreasonably high (possibly by a factor of 1000) for some items, countries and years. " + "This happens to item 'Other tropical fruits, n.e.c.' for region 'Northern Africa (FAO)'. " + "Therefore, we remove these possibly spurious values." + ) + + affected_item_codes = [ + # Item code for "Other tropical fruits, n.e.c.". + "00000603", + ] + affected_element_codes = [ + # Element code for "Yield". 
+ "005419", + ] + affected_years = [ + 1961, + 1962, + 1963, + 1964, + 1965, + 1966, + 1967, + 1968, + 1969, + 1970, + 1971, + 1972, + 1973, + 1974, + 1975, + 1976, + ] + affected_countries = [ + "Northern Africa (FAO)", + ] + + +class OtherTropicalFruitYieldSouthAmericaAnomaly(HighYieldAnomaly): + description = ( # type: ignore + "Yields are unreasonably high (possibly by a factor of 1000) for some items, countries and years. " + "This happens to item 'Other tropical fruits, n.e.c.' for South America. " + "Therefore, we remove these possibly spurious values." + ) + + affected_item_codes = [ + # Item code for "Other tropical fruits, n.e.c.". + "00000603", + ] + affected_element_codes = [ + # Element code for "Yield". + "005419", + ] + affected_years = [ + 1961, + 1962, + 1963, + 1964, + 1965, + 1966, + 1967, + 1968, + 1969, + ] + affected_countries = [ + "South America (FAO)", + "South America", + ] + + +detected_anomalies = { + "faostat_qcl": [ + SpinachAreaHarvestedAnomaly, + EggYieldNorthernEuropeAnomaly, + TeaProductionAnomaly, + FruitYieldAnomaly, + FruitYieldNetherlandsAnomaly, + LocustBeansYieldAnomaly, + WalnutsYieldAnomaly, + OtherTropicalFruitYieldNorthernAfricaAnomaly, + OtherTropicalFruitYieldSouthAmericaAnomaly, + ], + "faostat_fbsc": [ + CocoaBeansFoodAvailableAnomaly, + ], +} + + +def handle_anomalies(dataset_short_name: str, data: pd.DataFrame) -> Tuple[pd.DataFrame, str]: + if dataset_short_name not in detected_anomalies: + # If there is no anomaly class for a given dataset, return the same data and an empty anomaly description. + return data, "" + else: + # If there are anomalies, fix them, and return the fixed data and a text describing all anomalies. 
+ data_fixed = data.copy() + anomaly_descriptions = ANOMALY_DESCRIPTION_INTRODUCTION + + for anomaly_class in detected_anomalies[dataset_short_name]: + anomaly = anomaly_class() + anomaly_descriptions += "\n\n+" + anomaly.description + data_fixed = anomaly.handle_anomalies(df=data_fixed) + + return data_fixed, anomaly_descriptions diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat.countries.json b/etl/steps/data/garden/faostat/2024-03-14/faostat.countries.json new file mode 100644 index 00000000000..99202add9cb --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat.countries.json @@ -0,0 +1,324 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "American Samoa": "American Samoa", + "Andorra": "Andorra", + "Angola": "Angola", + "Anguilla": "Anguilla", + "Antarctica": "Antarctica", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Åland Islands": "Aland Islands", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bermuda": "Bermuda", + "Bhutan": "Bhutan", + "Bolivia (Plurinational State of)": "Bolivia", + "Bonaire, Sint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Bouvet Island": "Bouvet Island", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cayman Islands": "Cayman Islands", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Channel 
Islands": "Channel Islands", + "Chile": "Chile", + "China (excluding Hong Kong & Macao)": "China", + "China, Hong Kong SAR": "Hong Kong", + "China, Macao SAR": "Macao", + "China, Taiwan Province of": "Taiwan", + "China, mainland": "China", + "Christmas Island": "Christmas Island", + "Cocos (Keeling) Islands": "Cocos Islands", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cura\u00e7ao": "Curacao", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "Czechoslovakia": "Czechoslovakia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic People's Republic of Korea": "North Korea", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Ethiopia PDR": "Ethiopia (former)", + "Falkland Islands (Malvinas)": "Falkland Islands", + "Faroe Islands": "Faroe Islands", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Guyana": "French Guiana", + "French Polynesia": "French Polynesia", + "French Southern Territories": "French Southern Territories", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Gaza Strip (Palestine)": "Gaza Strip", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Gibraltar": "Gibraltar", + "Greece": "Greece", + "Greenland": "Greenland", + "Grenada": "Grenada", + "Guadeloupe": "Guadeloupe", + "Guam": "Guam", + "Guatemala": "Guatemala", + "Guernsey": "Guernsey", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Heard and McDonald Islands": "Heard Island and McDonald Islands", + "Holy 
See": "Vatican", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Isle of Man": "Isle of Man", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jersey": "Jersey", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Martinique": "Martinique", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mayotte": "Mayotte", + "Melanesia": "Melanesia", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Montserrat": "Montserrat", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "Netherlands Antilles (former)": "Netherlands Antilles", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "Norfolk Island": "Norfolk Island", + "North Macedonia": "North Macedonia", + "Northern Mariana Islands": "Northern Mariana Islands", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Palestine": "Palestine", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": 
"Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Pitcairn": "Pitcairn", + "Poland": "Poland", + "Polynesia": "Polynesia", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "R\u00e9union": "Reunion", + "Saint Barth\u00e9lemy": "Saint Barthlemy", + "Saint Helena, Ascension and Tristan da Cunha": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Saint-Martin (French part)": "Saint Martin (French part)", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Serbia and Montenegro": "Serbia and Montenegro", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Georgia and the South Sandwich Islands": "South Georgia and the South Sandwich Islands", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Sudan (former)": "Sudan (former)", + "Suriname": "Suriname", + "Svalbard and Jan Mayen Islands": "Svalbard and Jan Mayen", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tokelau": "Tokelau", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Türkiye": "Turkey", + 
"Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "USSR": "USSR", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States Minor Outlying Islands": "United States Minor Outlying Islands", + "United States Virgin Islands": "United States Virgin Islands", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "Wallis and Futuna Islands": "Wallis and Futuna", + "Western Sahara": "Western Sahara", + "World": "World", + "Yemen": "Yemen", + "Yugoslav SFR": "Yugoslavia", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Africa": "Africa (FAO)", + "Americas": "Americas (FAO)", + "Asia": "Asia (FAO)", + "Belgium-Luxembourg": "Belgium-Luxembourg (FAO)", + "Caribbean": "Caribbean (FAO)", + "Central America": "Central America (FAO)", + "Central Asia": "Central Asia (FAO)", + "China": "China (FAO)", + "Eastern Africa": "Eastern Africa (FAO)", + "Eastern Asia": "Eastern Asia (FAO)", + "Eastern Asia and South-eastern Asia": "Eastern Asia and South-eastern Asia (FAO)", + "Eastern Europe": "Eastern Europe (FAO)", + "Europe": "Europe (FAO)", + "European Union (27)": "European Union (27) (FAO)", + "High-income economies": "High-income economies (FAO)", + "Land Locked Developing Countries": "Land Locked Developing Countries (FAO)", + "Latin America and the Caribbean": "Latin America and the Caribbean (FAO)", + "Least Developed Countries": "Least Developed Countries (FAO)", + "Low Income Food Deficit Countries": "Low Income Food Deficit Countries (FAO)", + "Low income economies": "Low income economies (FAO)", + "Lower-middle-income economies": "Lower-middle-income economies (FAO)", + "Melanesia 
(excluding intra-trade)": "Melanesia (excluding intra-trade) (FAO)", + "Micronesia": "Micronesia (FAO)", + "Middle Africa": "Middle Africa (FAO)", + "Midway Island": "Midway Island (FAO)", + "Net Food Importing Developing Countries": "Net Food Importing Developing Countries (FAO)", + "North and Central America": "North and Central America (FAO)", + "Northern Africa": "Northern Africa (FAO)", + "Northern America": "Northern America (FAO)", + "Northern America and Europe": "Northern America and Europe (FAO)", + "Northern Europe": "Northern Europe (FAO)", + "OECD": "OECD (FAO)", + "Oceania": "Oceania (FAO)", + "Sark": "Sark (FAO)", + "Small Island Developing States": "Small Island Developing States (FAO)", + "South America": "South America (FAO)", + "South-eastern Asia": "South-eastern Asia (FAO)", + "Southern Africa": "Southern Africa (FAO)", + "Southern Asia": "Southern Asia (FAO)", + "Southern Europe": "Southern Europe (FAO)", + "Sub-Saharan Africa": "Sub-Saharan Africa (FAO)", + "Upper-middle-income economies": "Upper-middle-income economies (FAO)", + "Wake Island": "Wake Island (FAO)", + "Western Africa": "Western Africa (FAO)", + "Western Asia": "Western Asia (FAO)", + "Western Europe": "Western Europe (FAO)", + "FAO Major Fishing Area: Atlantic, Eastern Central (14.4.1)": "Eastern Central Atlantic (FAO)", + "FAO Major Fishing Area: Atlantic, Northeast (14.4.1)": "Northeast Atlantic (FAO)", + "FAO Major Fishing Area: Atlantic, Northwest (14.4.1)": "Northwest Atlantic (FAO)", + "FAO Major Fishing Area: Atlantic, Southeast (14.4.1)": "Southeast Atlantic (FAO)", + "FAO Major Fishing Area: Atlantic, Southwest (14.4.1)": "Southwest Atlantic (FAO)", + "FAO Major Fishing Area: Atlantic, Western Central (14.4.1)": "Western Central Atlantic (FAO)", + "FAO Major Fishing Area: Indian Ocean, Eastern (14.4.1)": "Eastern Indian Ocean (FAO)", + "FAO Major Fishing Area: Indian Ocean, Western (14.4.1)": "Western Indian Ocean (FAO)", + "FAO Major Fishing Area: Mediterranean and 
Black Sea (14.4.1)": "Mediterranean and Black Sea (FAO)", + "FAO Major Fishing Area: Pacific, Eastern Central (14.4.1)": "Eastern Central Pacific (FAO)", + "FAO Major Fishing Area: Pacific, Northeast (14.4.1)": "Northeast Pacific (FAO)", + "FAO Major Fishing Area: Pacific, Northwest (14.4.1)": "Northwest Pacific (FAO)", + "FAO Major Fishing Area: Pacific, Southeast (14.4.1)": "Southeast Pacific (FAO)", + "FAO Major Fishing Area: Pacific, Southwest (14.4.1)": "Southwest Pacific (FAO)", + "FAO Major Fishing Area: Pacific, Western Central (14.4.1)": "Western Central Pacific (FAO)" +} diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat.excluded_countries.json b/etl/steps/data/garden/faostat/2024-03-14/faostat.excluded_countries.json new file mode 100644 index 00000000000..60aab73b9b1 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat.excluded_countries.json @@ -0,0 +1,59 @@ +[ + "Africa (excluding intra-trade)", + "Americas (excluding intra-trade)", + "Annex I countries", + "Antarctic Region", + "Asia (excluding intra-trade)", + "Australia and New Zealand", + "Australia and New Zealand (excluding intra-trade)", + "Caribbean (excluding intra-trade)", + "Caucasus and Central Asia", + "Central America (excluding intra-trade)", + "Central Asia (excluding intra-trade)", + "Central Asia and Southern Asia", + "Chagos Archipelago", + "China (excluding intra-trade)", + "Eastern Africa (excluding intra-trade)", + "Eastern Asia (excluding Japan and China)", + "Eastern Asia (excluding intra-trade)", + "Eastern Europe (excluding intra-trade)", + "Europe (excluding intra-trade)", + "Europe, Northern America, Australia and New Zealand", + "European Union (12) (excluding intra-trade)", + "European Union (15) (excluding intra-trade)", + "European Union (25) (excluding intra-trade)", + "European Union (27) (excluding Croatia) (excluding intra-trade)", + "European Union (27) (excluding intra-trade)", + "European Union (28) (excluding intra-trade)", + 
"International Centres (FAO) (2.5.1.a)", + "Johnston Island", + "Land Locked Developing Countries (excluding intra-trade)", + "Least Developed Countries (excluding intra-trade)", + "Low Income Food Deficit Countries (excluding intra-trade)", + "Micronesia (excluding intra-trade)", + "Middle Africa (excluding intra-trade)", + "Net Food Importing Developing Countries (excluding intra-trade)", + "Non-Annex I countries", + "Northern Africa (excluding Sudan)", + "Northern Africa (excluding intra-trade)", + "Northern America (excluding intra-trade)", + "Northern Europe (excluding intra-trade)", + "Oceania (excluding intra-trade)", + "Oceania excluding Australia and New Zealand", + "Pacific Islands Trust Territory", + "Polynesia (excluding intra-trade)", + "Regional Centres (FAO) (2.5.1.a)", + "Small Island Developing States (excluding intra-trade)", + "South America (excluding intra-trade)", + "South-Eastern Asia (excluding intra-trade)", + "Southern Africa (excluding intra-trade)", + "Southern Asia (excluding India)", + "Southern Asia (excluding intra-trade)", + "Southern Europe (excluding intra-trade)", + "Sub-Saharan Africa (including Sudan)", + "Western Africa (excluding intra-trade)", + "Western Asia (exc. 
Armenia, Azerbaijan, Cyprus, Israel and Georgia)", + "Western Asia (excluding intra-trade)", + "Western Asia and Northern Africa", + "Western Europe (excluding intra-trade)" +] diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_cahd.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_cahd.py new file mode 100644 index 00000000000..048afae3560 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_cahd.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_cahd dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_ei.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_ei.py new file mode 100644 index 00000000000..5bd7d23db88 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_ei.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_ei dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_ek.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_ek.py new file mode 100644 index 00000000000..c6ec4c862e8 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_ek.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_ek dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_emn.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_emn.py new file mode 100644 index 00000000000..5f12637ec70 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_emn.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_emn dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_esb.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_esb.py new file mode 100644 index 00000000000..2a4896b9edf --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_esb.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_esb dataset.""" +from .shared import run # noqa:F401 diff --git 
a/etl/steps/data/garden/faostat/2024-03-14/faostat_fa.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_fa.py new file mode 100644 index 00000000000..e594773a567 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_fa.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_fa dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py new file mode 100644 index 00000000000..7ac8c6d544c --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py @@ -0,0 +1,219 @@ +"""FAOSTAT garden step for faostat_fbsc dataset (Food Balances Combined). + +Combine the old and new food balances datasets: +* `faostat_fbsh`: Old (historical) dataset. +* `faostat_fbs`: Current dataset. + +A new (combined) dataset will be generated: "faostat_fbsc". + +This is because a new version of the Food Balances dataset was launched in 2014 with a slightly new methodology: +https://fenixservices.fao.org/faostat/static/documents/FBS/New%20FBS%20methodology.pdf + +NOTE: It seems that FAOSTAT is possibly extending the coverage of the new methodology. So the year of intersection of +both datasets will be earlier and earlier. The global variable `FBS_FIRST_YEAR` may have to be redefined in a future +update. + +""" + +from pathlib import Path +from typing import cast + +import pandas as pd +from owid import catalog +from owid.datautils import dataframes +from shared import ( + ADDED_TITLE_TO_WIDE_TABLE, + CURRENT_DIR, + NAMESPACE, + add_per_capita_variables, + add_regions, + clean_data, + handle_anomalies, + harmonize_elements, + harmonize_items, + log, + parse_amendments_table, + prepare_long_table, + prepare_wide_table, +) + +from etl.helpers import PathFinder, create_dataset + +# First year for which we have data in fbs dataset (it defines the first year when new methodology is used). 
+FBS_FIRST_YEAR = 2010 +DATASET_TITLE = f"Food Balances (old methodology before {FBS_FIRST_YEAR}, and new from {FBS_FIRST_YEAR} onwards)" + + +def combine_fbsh_and_fbs_datasets( + fbsh_dataset: catalog.Dataset, + fbs_dataset: catalog.Dataset, +) -> pd.DataFrame: + """Combine `faostat_fbsh` and `faostat_fbs` meadow datasets. + + Parameters + ---------- + fbsh_dataset : catalog.Dataset + Meadow `faostat_fbsh` dataset. + fbs_dataset : catalog.Dataset + Meadow `faostat_fbs` dataset. + + Returns + ------- + fbsc : pd.DataFrame + Combination of the tables of the two input datasets (as a dataframe, not a dataset). + + """ + # Sanity checks. + error = "Description of fbs and fbsh datasets is different." + assert fbsh_dataset.metadata.description == fbs_dataset.metadata.description, error + error = "Licenses of fbsh and fbs are different." + assert fbsh_dataset.metadata.licenses == fbs_dataset.metadata.licenses, error + + # Load dataframes for fbs and fbsh datasets. + fbsh = pd.DataFrame(fbsh_dataset["faostat_fbsh"]).reset_index() + fbs = pd.DataFrame(fbs_dataset["faostat_fbs"]).reset_index() + + # Harmonize items and elements in both datasets. + fbsh = harmonize_items(df=fbsh, dataset_short_name="faostat_fbsh") + fbsh = harmonize_elements(df=fbsh) + fbs = harmonize_items(df=fbs, dataset_short_name="faostat_fbs") + fbs = harmonize_elements(df=fbs) + + # Ensure there is no overlap in data between the two datasets, and that there is no gap between them. + assert fbs["year"].min() == FBS_FIRST_YEAR, f"First year of fbs dataset is not {FBS_FIRST_YEAR}" + if fbsh["year"].max() >= fbs["year"].min(): + # There is overlapping data between fbsh and fbs datasets. Prioritising fbs over fbsh." + fbsh = fbsh.loc[fbsh["year"] < fbs["year"].min()].reset_index(drop=True) + if (fbsh["year"].max() + 1) < fbs["year"].min(): + log.warning("Data is missing for one or more years between fbsh and fbs datasets.") + + # Sanity checks. 
+ # Ensure the items that are in fbsh but not in fbs are covered by shared.ITEM_AMENDMENTS. + error = "Mismatch between items in fbsh and fbs. Redefine shared.ITEM_AMENDMENTS." + assert set(fbsh["item"]) == set(fbs["item"]), error + # Some elements are found in fbs but not in fbsh. This is understandable, since fbs is + # more recent and may have additional elements. However, ensure that there are no + # elements in fbsh that are not in fbs. + error = "There are elements in fbsh that are not in fbs." + assert set(fbsh["element"]) < set(fbs["element"]), error + + # Concatenate old and new dataframes using function that keeps categoricals. + fbsc = dataframes.concatenate([fbsh, fbs]).sort_values(["area", "year"]).reset_index(drop=True) + + # Ensure that each element has only one unit. + error = "Some elements in the combined dataset have more than one unit." + assert fbsc.groupby("element")["unit"].nunique().max() == 1, error + + return cast(pd.DataFrame, fbsc) + + +def _assert_df_size(df: pd.DataFrame, size_mb: float) -> None: + """Check that dataframe is smaller than given size to prevent OOM errors.""" + real_size_mb = df.memory_usage(deep=True).sum() / 1e6 + assert real_size_mb <= size_mb, f"DataFrame size is too big: {real_size_mb} MB > {size_mb} MB" + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Fetch the dataset short name from dest_dir. + dataset_short_name = Path(dest_dir).name + + # Define path to current step file. + current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") + + # Get paths and naming conventions for current data step. + paths = PathFinder(current_step_file.as_posix()) + + # Load fbsh and fbs. + log.info("faostat_fbsc.loading_datasets") + fbsh_dataset: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_fbsh") + fbs_dataset: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_fbs") + + # Load dataset of FAOSTAT metadata. 
+ metadata: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_metadata") + + # Load dataset, items, element-units, and countries metadata. + dataset_metadata = pd.DataFrame(metadata["datasets"]).loc[dataset_short_name].to_dict() + items_metadata = pd.DataFrame(metadata["items"]).reset_index() + items_metadata = items_metadata[items_metadata["dataset"] == dataset_short_name].reset_index(drop=True) + elements_metadata = pd.DataFrame(metadata["elements"]).reset_index() + elements_metadata = elements_metadata[elements_metadata["dataset"] == dataset_short_name].reset_index(drop=True) + countries_metadata = pd.DataFrame(metadata["countries"]).reset_index() + amendments = parse_amendments_table(amendments=metadata["amendments"], dataset_short_name=dataset_short_name) + + # + # Process data. + # + # Combine fbsh and fbs datasets. + log.info( + "faostat_fbsc.combine_fbsh_and_fbs_datasets", + fbsh_shape=fbsh_dataset["faostat_fbsh"].shape, + fbs_shape=fbs_dataset["faostat_fbs"].shape, + ) + data = combine_fbsh_and_fbs_datasets(fbsh_dataset, fbs_dataset) + + _assert_df_size(data, 2000) + + # Prepare data. + data = clean_data( + data=data, + items_metadata=items_metadata, + elements_metadata=elements_metadata, + countries_metadata=countries_metadata, + amendments=amendments, + ) + + # Add data for aggregate regions. + data = add_regions(data=data, elements_metadata=elements_metadata) + + # Add per-capita variables. + data = add_per_capita_variables(data=data, elements_metadata=elements_metadata) + + # Handle detected anomalies in the data. + data, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, data=data) + + # Avoid objects as they would explode memory, use categoricals instead. + for col in data.columns: + assert data[col].dtype != object, f"Column {col} should not have object type" + + _assert_df_size(data, 2000) + + # Create a long table (with item code and element code as part of the index). 
+ log.info("faostat_fbsc.prepare_long_table", shape=data.shape) + data_table_long = prepare_long_table(data=data) + + _assert_df_size(data_table_long, 2000) + + # Create a wide table (with only country and year as index). + log.info("faostat_fbsc.prepare_wide_table", shape=data.shape) + data_table_wide = prepare_wide_table(data=data) + + # + # Save outputs. + # + # Update tables metadata. + data_table_long.metadata.short_name = dataset_short_name + data_table_long.metadata.title = dataset_metadata["owid_dataset_title"] + data_table_wide.metadata.short_name = f"{dataset_short_name}_flat" + data_table_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE + + # Initialise new garden dataset. + ds_garden = create_dataset( + dest_dir=dest_dir, tables=[data_table_long, data_table_wide], default_metadata=fbs_dataset.metadata + ) + + # Check that the title assigned here coincides with the one in custom_datasets.csv (for consistency). + error = "Dataset title given to fbsc is different to the one in custom_datasets.csv. Update the latter file." + assert DATASET_TITLE == dataset_metadata["owid_dataset_title"], error + + # Update dataset metadata and add description of anomalies (if any) to the dataset description. + ds_garden.metadata.description = dataset_metadata["owid_dataset_description"] + anomaly_descriptions + ds_garden.metadata.title = dataset_metadata["owid_dataset_title"] + + # Update the main source's metadata description (which will be shown in charts). + ds_garden.metadata.sources[0].description = ds_garden.metadata.description + + # Create garden dataset. 
+ ds_garden.save() diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_fo.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_fo.py new file mode 100644 index 00000000000..bba98a5e224 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_fo.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_fo dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py new file mode 100644 index 00000000000..b620f241446 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py @@ -0,0 +1,555 @@ +"""Dataset feeding the global food explorer. + +Load the qcl and fbsc (combination of fbsh and fbs) datasets, and create a combined dataset of food items (now called +products). + +The resulting dataset will later be loaded by the `explorer/food_explorer` which feeds our +[Global food explorer](https://ourworldindata.org/explorers/global-food). + +""" + +from pathlib import Path +from typing import cast + +import pandas as pd +from owid import catalog +from owid.datautils import dataframes +from shared import CURRENT_DIR, NAMESPACE + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Dataset name and title. +DATASET_TITLE = "Food Explorer" +DATASET_DESCRIPTION = ( + "This dataset has been created by Our World in Data, merging existing FAOSTAT datasets. In " + "particular, we have used 'Crops and livestock products' (QCL) and 'Food Balances' (FBSH and " + "FBS) datasets. Each row contains all the metrics for a specific combination of (country, " + "product, year). The metrics may come from different datasets." 
+) + +# The names of the products to include in the food explorer will be further edited in owid-content, following to the +# following file: +# https://github.com/owid/owid-content/blob/master/scripts/global-food-explorer/foods.csv +ITEM_CODES_QCL = [ + "00000060", # From faostat_qcl - 'Maize oil' (previously 'Maize oil'). + "00000567", # From faostat_qcl - 'Watermelons' (previously 'Watermelons'). + "00000075", # From faostat_qcl - 'Oats' (previously 'Oats'). + "00000191", # From faostat_qcl - 'Chickpeas' (previously 'Chickpeas'). + "00001069", # From faostat_qcl - 'Meat of ducks, fresh or chilled' (previously 'Meat, duck'). + "00000957", # From faostat_qcl - 'Buffalo hides' (previously 'Buffalo hides'). + "00000176", # From faostat_qcl - 'Beans, dry' (previously 'Beans, dry'). + "00001182", # From faostat_qcl - 'Honey' (previously 'Honey'). + "00000399", # From faostat_qcl - 'Eggplants' (previously 'Eggplants'). + "00000554", # From faostat_qcl - 'Cranberries' (previously 'Cranberries'). + "00000296", # From faostat_qcl - 'Poppy seeds' (previously 'Poppy seeds'). + "00000201", # From faostat_qcl - 'Lentils, dry' (previously 'Lentils'). + "00000268", # From faostat_qcl - 'Sunflower oil' (previously 'Sunflower oil'). + "00001806", # From faostat_qcl - 'Meat, beef and buffalo' (previously 'Meat, beef and buffalo'). + "00000600", # From faostat_qcl - 'Papayas' (previously 'Papayas'). + "00000334", # From faostat_qcl - 'Linseed oil' (previously 'Linseed oil'). + "00001097", # From faostat_qcl - 'Horse meat, fresh or chilled' (previously 'Meat, horse'). + "00000165", # From faostat_qcl - 'Molasses' (previously 'Molasses'). + "00000426", # From faostat_qcl - 'Carrots and turnips' (previously 'Carrots and turnips'). + "00000216", # From faostat_qcl - 'Brazil nuts, in shell' (previously 'Brazil nuts, with shell'). + "00000137", # From faostat_qcl - 'Yams' (previously 'Yams'). + "00000222", # From faostat_qcl - 'Walnuts' (previously 'Walnuts'). 
+ "00000289", # From faostat_qcl - 'Sesame seed' (previously 'Sesame seed'). + "00000122", # From faostat_qcl - 'Sweet potatoes' (previously 'Sweet potatoes'). + "00001738", # From faostat_qcl - 'Fruit' (previously 'Fruit'). + "00001780", # From faostat_qcl - 'Milk' (previously 'Milk'). + "00001804", # From faostat_qcl - 'Citrus Fruit' (previously 'Citrus Fruit'). + "00000656", # From faostat_qcl - 'Coffee, green' (previously 'Coffee, green'). + "00001019", # From faostat_qcl - 'Goat fat, unrendered' (previously 'Fat, goats'). + "00000225", # From faostat_qcl - 'Hazelnuts' (previously 'Hazelnuts'). + "00000406", # From faostat_qcl - 'Green garlic' (previously 'Garlic'). + "00000995", # From faostat_qcl - 'Skins, sheep' (previously 'Skins, sheep'). + "00000244", # From faostat_qcl - 'Groundnut oil' (previously 'Groundnut oil'). + "00000281", # From faostat_qcl - 'Safflower oil' (previously 'Safflower oil'). + "00000267", # From faostat_qcl - 'Sunflower seed' (previously 'Sunflower seed'). + "00001025", # From faostat_qcl - 'Skins, goat' (previously 'Skins, goat'). + "00000252", # From faostat_qcl - 'Coconut oil' (previously 'Coconut oil'). + "00000256", # From faostat_qcl - 'Palm kernels' (previously 'Palm kernels'). + "00000868", # From faostat_qcl - 'Offals, cattle' (previously 'Offals, cattle'). + "00000292", # From faostat_qcl - 'Mustard seed' (previously 'Mustard seed'). + "00000101", # From faostat_qcl - 'Canary seed' (previously 'Canary seed'). + "00001098", # From faostat_qcl - 'Edible offals of horses and other equines, fresh, chilled or frozen' (previously 'Offals, horses'). + "00001062", # From faostat_qcl - 'Eggs from hens' (previously 'Eggs from hens'). + "00001808", # From faostat_qcl - 'Meat, poultry' (previously 'Meat, poultry'). + "00000258", # From faostat_qcl - 'Palm kernel oil' (previously 'Palm kernel oil'). + "00000156", # From faostat_qcl - 'Sugar cane' (previously 'Sugar cane'). 
+ "00000373", # From faostat_qcl - 'Spinach' (previously 'Spinach'). + "00000773", # From faostat_qcl - 'Flax fibre' (previously 'Flax fibre'). + "00000116", # From faostat_qcl - 'Potatoes' (previously 'Potatoes'). + "00000869", # From faostat_qcl - 'Cattle fat, unrendered' (previously 'Fat, cattle'). + "00000358", # From faostat_qcl - 'Cabbages' (previously 'Cabbages'). + "00000767", # From faostat_qcl - 'Cotton' (previously 'Cotton'). + "00000388", # From faostat_qcl - 'Tomatoes' (previously 'Tomatoes'). + "00000220", # From faostat_qcl - 'Chestnuts, in shell' (previously 'Chestnut'). + "00000027", # From faostat_qcl - 'Rice' (previously 'Rice'). + "00000367", # From faostat_qcl - 'Asparagus' (previously 'Asparagus'). + "00000977", # From faostat_qcl - 'Meat, lamb and mutton' (previously 'Meat, lamb and mutton'). + "00000015", # From faostat_qcl - 'Wheat' (previously 'Wheat'). + "00001127", # From faostat_qcl - 'Meat of camels, fresh or chilled' (previously 'Meat, camel'). + "00001183", # From faostat_qcl - 'Beeswax' (previously 'Beeswax'). + "00001720", # From faostat_qcl - 'Roots and tubers' (previously 'Roots and tubers'). + "00001186", # From faostat_qcl - 'Silk' (previously 'Silk'). + "00000826", # From faostat_qcl - 'Tobacco' (previously 'Tobacco'). + "00000978", # From faostat_qcl - 'Offals, sheep' (previously 'Offals, sheep'). + "00000948", # From faostat_qcl - 'Offals, buffaloes' (previously 'Offals, buffaloes'). + "00000226", # From faostat_qcl - 'Areca nuts' (previously 'Areca nuts'). + "00000417", # From faostat_qcl - 'Peas, green' (previously 'Peas, green'). + "00000407", # From faostat_qcl - 'Leeks' (previously 'Leeks'). + "00000224", # From faostat_qcl - 'Kola nuts' (previously 'Kola nuts'). + "00000079", # From faostat_qcl - 'Millet' (previously 'Millet'). + "00000568", # From faostat_qcl - 'Melon' (previously 'Melon'). + "00000900", # From faostat_qcl - 'Whey' (previously 'Whey'). 
+ "00000544", # From faostat_qcl - 'Strawberries' (previously 'Strawberries'). + "00000333", # From faostat_qcl - 'Linseed' (previously 'Linseed'). + "00000571", # From faostat_qcl - 'Mangoes' (previously 'Mangoes'). + "00000534", # From faostat_qcl - 'Peaches and nectarines' (previously 'Peaches and nectarines'). + "00000372", # From faostat_qcl - 'Lettuce' (previously 'Lettuce'). + "00001080", # From faostat_qcl - 'Meat of turkeys, fresh or chilled' (previously 'Meat, turkey'). + "00000083", # From faostat_qcl - 'Sorghum' (previously 'Sorghum'). + "00001732", # From faostat_qcl - 'Oilcrops, Oil Equivalent' (previously 'Oilcrops, Oil Equivalent'). + "00000336", # From faostat_qcl - 'Hempseed' (previously 'Hempseed'). + "00000397", # From faostat_qcl - 'Cucumbers and gherkins' (previously 'Cucumbers and gherkins'). + "00000223", # From faostat_qcl - 'Pistachios, in shell' (previously 'Pistachios'). + "00000242", # From faostat_qcl - 'Groundnuts' (previously 'Groundnuts'). + "00000489", # From faostat_qcl - 'Plantains' (previously 'Plantains'). + "00000495", # From faostat_qcl - 'Tangerines' (previously 'Tangerines'). + "00000195", # From faostat_qcl - 'Cow peas' (previously 'Cow peas'). + "00000290", # From faostat_qcl - 'Sesame oil' (previously 'Sesame oil'). + "00000497", # From faostat_qcl - 'Lemons and limes' (previously 'Lemons and limes'). + "00000711", # From faostat_qcl - 'Herbs (e.g. fennel)' (previously 'Herbs (e.g. fennel)'). + "00001129", # From faostat_qcl - 'Fat of camels' (previously 'Fat, camels'). + "00000577", # From faostat_qcl - 'Dates' (previously 'Dates'). + "00001108", # From faostat_qcl - 'Meat of asses, fresh or chilled' (previously 'Meat, ass'). + "00000071", # From faostat_qcl - 'Rye' (previously 'Rye'). + "00001073", # From faostat_qcl - 'Meat of geese, fresh or chilled' (previously 'Meat, goose and guinea fowl'). + "00000687", # From faostat_qcl - 'Pepper' (previously 'Pepper'). 
+ "00000280", # From faostat_qcl - 'Safflower seed' (previously 'Safflower seed'). + "00000157", # From faostat_qcl - 'Sugar beet' (previously 'Sugar beet'). + "00000271", # From faostat_qcl - 'Rapeseed oil' (previously 'Rapeseed oil'). + "00001735", # From faostat_qcl - 'Vegetables' (previously 'Vegetables'). + "00001035", # From faostat_qcl - 'Meat of pig with the bone, fresh or chilled' (previously 'Meat, pig'). + "00001128", # From faostat_qcl - 'Offals, camels' (previously 'Offals, camels'). + "00000564", # From faostat_qcl - 'Wine' (previously 'Wine'). + "00000092", # From faostat_qcl - 'Quinoa' (previously 'Quinoa'). + "00000507", # From faostat_qcl - 'Grapefruit' (previously 'Grapefruit'). + "00000089", # From faostat_qcl - 'Buckwheat' (previously 'Buckwheat'). + "00000949", # From faostat_qcl - 'Buffalo fat, unrendered' (previously 'Fat, buffaloes'). + "00000821", # From faostat_qcl - 'Fibre crops' (previously 'Fibre crops'). + "00000221", # From faostat_qcl - 'Almonds' (previously 'Almonds'). + "00000328", # From faostat_qcl - 'Seed cotton, unginned' (previously 'Seed cotton'). + "00001717", # From faostat_qcl - 'Cereals' (previously 'Cereals'). + "00000547", # From faostat_qcl - 'Raspberries' (previously 'Raspberries'). + "00000187", # From faostat_qcl - 'Peas, dry' (previously 'Peas, dry'). + "00000560", # From faostat_qcl - 'Grapes' (previously 'Grapes'). + "00000689", # From faostat_qcl - 'Chillies and peppers' (previously 'Chillies and peppers'). + "00001091", # From faostat_qcl - 'Eggs from other birds (excl. hens)' (previously 'Eggs from other birds (excl. hens)'). + "00001163", # From faostat_qcl - 'Game meat, fresh, chilled or frozen' (previously 'Meat, game'). + "00001807", # From faostat_qcl - 'Meat, sheep and goat' (previously 'Meat, sheep and goat'). + "00001141", # From faostat_qcl - 'Meat of rabbits and hares, fresh or chilled' (previously 'Meat, rabbit'). + "00000490", # From faostat_qcl - 'Oranges' (previously 'Oranges'). 
+ "00001841", # From faostat_qcl - 'Oilcrops, Cake Equivalent' (previously 'Oilcrops, Cake Equivalent'). + "00000552", # From faostat_qcl - 'Blueberries' (previously 'Blueberries'). + "00001783", # From faostat_qcl - 'Eggs' (previously 'Eggs'). + "00000254", # From faostat_qcl - 'Palm fruit oil' (previously 'Palm fruit oil'). + "00000263", # From faostat_qcl - 'Karite nuts' (previously 'Karite nuts'). + "00000044", # From faostat_qcl - 'Barley' (previously 'Barley'). + "00001036", # From faostat_qcl - 'Offals, pigs' (previously 'Offals, pigs'). + "00000446", # From faostat_qcl - 'Green maize' (previously 'Green maize'). + "00001745", # From faostat_qcl - 'Cheese' (previously 'Cheese'). + "00000261", # From faostat_qcl - 'Olive oil' (previously 'Olive oil'). + "00000236", # From faostat_qcl - 'Soya beans' (previously 'Soybeans'). + "00000125", # From faostat_qcl - 'Cassava, fresh' (previously 'Cassava'). + "00000260", # From faostat_qcl - 'Olives' (previously 'Olives'). + "00000329", # From faostat_qcl - 'Cotton seed' (previously 'Cottonseed'). + "00000521", # From faostat_qcl - 'Pears' (previously 'Pears'). + "00001018", # From faostat_qcl - 'Offals, goats' (previously 'Offals, goats'). + "00001765", # From faostat_qcl - 'Meat, total' (previously 'Meat, total'). + "00000550", # From faostat_qcl - 'Currants' (previously 'Currants'). + "00001058", # From faostat_qcl - 'Meat of chickens, fresh or chilled' (previously 'Meat, chicken'). + "00000197", # From faostat_qcl - 'Pigeon peas, dry' (previously 'Pigeon peas'). + "00000270", # From faostat_qcl - 'Rape or colza seed' (previously 'Rapeseed'). + "00000526", # From faostat_qcl - 'Apricots' (previously 'Apricots'). + "00000592", # From faostat_qcl - 'Kiwi' (previously 'Kiwi'). + "00000237", # From faostat_qcl - 'Soybean oil' (previously 'Soybean oil'). + "00000947", # From faostat_qcl - 'Meat of buffalo, fresh or chilled' (previously 'Meat, buffalo'). 
+ "00000265", # From faostat_qcl - 'Castor oil seeds' (previously 'Castor oil seed'). + "00000430", # From faostat_qcl - 'Okra' (previously 'Okra'). + "00000331", # From faostat_qcl - 'Cottonseed oil' (previously 'Cottonseed oil'). + "00000103", # From faostat_qcl - 'Mixed grains' (previously 'Mixed grains'). + "00000486", # From faostat_qcl - 'Bananas' (previously 'Bananas'). + "00000919", # From faostat_qcl - 'Cattle hides' (previously 'Cattle hides'). + "00001242", # From faostat_qcl - 'Margarine' (previously 'Margarine'). + "00000449", # From faostat_qcl - 'Mushrooms' (previously 'Mushrooms'). + "00001037", # From faostat_qcl - 'Fat of pigs' (previously 'Fat, pigs'). + "00001729", # From faostat_qcl - 'Treenuts' (previously 'Treenuts'). + "00000366", # From faostat_qcl - 'Artichokes' (previously 'Artichokes'). + "00000217", # From faostat_qcl - 'Cashew nuts' (previously 'Cashew nuts'). + "00000299", # From faostat_qcl - 'Melonseed' (previously 'Melonseed'). + "00000574", # From faostat_qcl - 'Pineapples' (previously 'Pineapples'). + "00000979", # From faostat_qcl - 'Sheep fat, unrendered' (previously 'Fat, sheep'). + "00000987", # From faostat_qcl - 'Wool' (previously 'Wool'). + "00000423", # From faostat_qcl - 'String beans' (previously 'String beans'). + "00000249", # From faostat_qcl - 'Coconuts, in shell' (previously 'Coconuts'). + "00000780", # From faostat_qcl - 'Jute, raw or retted' (previously 'Jute'). + "00000536", # From faostat_qcl - 'Plums' (previously 'Plums'). + "00001111", # From faostat_qcl - 'Meat of mules, fresh or chilled' (previously 'Meat, mule'). + "00001723", # From faostat_qcl - 'Sugar crops' (previously 'Sugar crops'). + "00001726", # From faostat_qcl - 'Pulses' (previously 'Pulses'). + "00000162", # From faostat_qcl - 'Sugar (raw)' (previously 'Sugar (raw)'). + "00000667", # From faostat_qcl - 'Tea leaves' (previously 'Tea'). + "00000056", # From faostat_qcl - 'Maize (corn)' (previously 'Maize'). 
+ "00000257", # From faostat_qcl - 'Palm oil' (previously 'Palm oil'). + "00000393", # From faostat_qcl - 'Cauliflowers and broccoli' (previously 'Cauliflowers and broccoli'). + "00000531", # From faostat_qcl - 'Cherries' (previously 'Cherries'). + "00000572", # From faostat_qcl - 'Avocados' (previously 'Avocados'). + "00000403", # From faostat_qcl - 'Onions' (previously 'Onions'). + "00000515", # From faostat_qcl - 'Apples' (previously 'Apples'). + "00000414", # From faostat_qcl - 'Other beans, green' (previously 'Beans, green'). + "00001017", # From faostat_qcl - 'Meat of goat, fresh or chilled' (previously 'Meat, goat'). + "00000181", # From faostat_qcl - 'Broad beans' (previously 'Broad beans'). +] + +ITEM_CODES_FBSC = [ + "00002576", # From faostat_fbsc - 'Palm kernel oil' (previously 'Palm kernel oil'). + "00002516", # From faostat_fbsc - 'Oats' (previously 'Oats'). + "00002562", # From faostat_fbsc - 'Palm kernels' (previously 'Palm kernels'). + "00002551", # From faostat_fbsc - 'Nuts' (previously 'Nuts'). + "00002913", # From faostat_fbsc - 'Oilcrops' (previously 'Oilcrops'). + "00002533", # From faostat_fbsc - 'Sweet potatoes' (previously 'Sweet potatoes'). + "00002560", # From faostat_fbsc - 'Coconuts' (previously 'Coconuts'). + "00002511", # From faostat_fbsc - 'Wheat' (previously 'Wheat'). + "00002557", # From faostat_fbsc - 'Sunflower seed' (previously 'Sunflower seed'). + "00002602", # From faostat_fbsc - 'Onions' (previously 'Onions'). + "00002734", # From faostat_fbsc - 'Meat, poultry' (previously 'Meat, poultry'). + "00002572", # From faostat_fbsc - 'Groundnut oil' (previously 'Groundnut oil'). + "00002736", # From faostat_fbsc - 'Offals' (previously 'Offals'). + "00002579", # From faostat_fbsc - 'Sesame oil' (previously 'Sesame oil'). + "00002552", # From faostat_fbsc - 'Groundnuts' (previously 'Groundnuts'). + "00002943", # From faostat_fbsc - 'Meat, total' (previously 'Meat, total'). 
+ "00002912", # From faostat_fbsc - 'Treenuts' (previously 'Treenuts'). + "00002611", # From faostat_fbsc - 'Oranges' (previously 'Oranges'). + "00002616", # From faostat_fbsc - 'Plantains' (previously 'Plantains'). + "00002617", # From faostat_fbsc - 'Apples' (previously 'Apples'). + "00002563", # From faostat_fbsc - 'Olives' (previously 'Olives'). + "00002513", # From faostat_fbsc - 'Barley' (previously 'Barley'). + "00002532", # From faostat_fbsc - 'Cassava' (previously 'Cassava'). + "00002918", # From faostat_fbsc - 'Vegetables' (previously 'Vegetables'). + "00002948", # From faostat_fbsc - 'Milk' (previously 'Milk'). + "00002613", # From faostat_fbsc - 'Grapefruit' (previously 'Grapefruit'). + "00002555", # From faostat_fbsc - 'Soybeans' (previously 'Soybeans'). + "00002537", # From faostat_fbsc - 'Sugar beet' (previously 'Sugar beet'). + "00002640", # From faostat_fbsc - 'Pepper' (previously 'Pepper'). + "00002536", # From faostat_fbsc - 'Sugar cane' (previously 'Sugar cane'). + "00002633", # From faostat_fbsc - 'Cocoa beans' (previously 'Cocoa beans'). + "00002561", # From faostat_fbsc - 'Sesame seed' (previously 'Sesame seed'). + "00002546", # From faostat_fbsc - 'Beans, dry' (previously 'Beans, dry'). + "00002740", # From faostat_fbsc - 'Butter and ghee' (previously 'Butter and ghee'). + "00002514", # From faostat_fbsc - 'Maize' (previously 'Maize'). + "00002575", # From faostat_fbsc - 'Cottonseed oil' (previously 'Cottonseed oil'). + "00002641", # From faostat_fbsc - 'Chillies and peppers' (previously 'Chillies and peppers'). + "00002733", # From faostat_fbsc - 'Pork' (previously 'Pork'). + "00002919", # From faostat_fbsc - 'Fruit' (previously 'Fruit'). + "00002655", # From faostat_fbsc - 'Wine' (previously 'Wine'). + "00002618", # From faostat_fbsc - 'Pineapples' (previously 'Pineapples'). + "00002612", # From faostat_fbsc - 'Lemons and limes' (previously 'Lemons and limes'). + "00002580", # From faostat_fbsc - 'Olive oil' (previously 'Olive oil'). 
+ "00002515", # From faostat_fbsc - 'Rye' (previously 'Rye'). + "00002582", # From faostat_fbsc - 'Maize oil' (previously 'Maize oil'). + "00002731", # From faostat_fbsc - 'Meat, beef' (previously 'Meat, beef'). + "00002518", # From faostat_fbsc - 'Sorghum' (previously 'Sorghum'). + "00002949", # From faostat_fbsc - 'Eggs' (previously 'Eggs'). + "00002531", # From faostat_fbsc - 'Potatoes' (previously 'Potatoes'). + "00002615", # From faostat_fbsc - 'Bananas' (previously 'Bananas'). + "00002573", # From faostat_fbsc - 'Sunflower oil' (previously 'Sunflower oil'). + "00002578", # From faostat_fbsc - 'Coconut oil' (previously 'Coconut oil'). + "00002601", # From faostat_fbsc - 'Tomatoes' (previously 'Tomatoes'). + "00002571", # From faostat_fbsc - 'Soybean oil' (previously 'Soybean oil'). + "00002559", # From faostat_fbsc - 'Cottonseed' (previously 'Cottonseed'). + "00002732", # From faostat_fbsc - 'Meat, sheep and goat' (previously 'Meat, sheep and goat'). + "00002901", # From faostat_fbsc - 'Total' (previously 'Total'). + "00002619", # From faostat_fbsc - 'Dates' (previously 'Dates'). + "00002911", # From faostat_fbsc - 'Pulses' (previously 'Pulses'). + "00002535", # From faostat_fbsc - 'Yams' (previously 'Yams'). + "00002745", # From faostat_fbsc - 'Honey' (previously 'Honey'). + "00002737", # From faostat_fbsc - 'Animal fats' (previously 'Animal fats'). + "00002517", # From faostat_fbsc - 'Millet' (previously 'Millet'). + "00002547", # From faostat_fbsc - 'Peas, dry' (previously 'Peas, dry'). + "00002807", # From faostat_fbsc - 'Rice' (previously 'Rice'). + "00002960", # From faostat_fbsc - 'Fish and seafood' (previously 'Fish and seafood'). + "00002908", # From faostat_fbsc - 'Sugar crops' (previously 'Sugar crops'). +] + +# OWID item name, element name, and unit name for population (as given in faostat_qcl and faostat_fbsc datasets). 
+FAO_POPULATION_ITEM_NAME = "Population" +FAO_POPULATION_ELEMENT_NAME = "Total Population - Both sexes" +FAO_POPULATION_UNIT = "1000 persons" + +# List of element codes to consider from faostat_qcl. +ELEMENT_CODES_QCL = [ + "005312", + "005313", + "005314", + "005318", + "005320", + "005321", + "005410", + "005413", + "005417", + "005419", + "005420", + "005422", + "005424", + "005510", + "005513", + "5312pc", + "5320pc", + "5321pc", + "5510pc", +] +# List of element codes to consider from faostat_fbsc. +ELEMENT_CODES_FBSC = [ + "000645", + "000664", + "000674", + "000684", + "005072", + "005123", + "005131", + "005142", + "005154", + "005170", + "005171", + "005301", + # Element 'Production' (in tonnes, originally given in 1000 tonnes) is taken from qcl. + # Although fbsc has items for this element that are not in qcl, they overlap in a number of items with slightly + # different values. To avoid this issue, we ignore the element from fbsc and use only the one in qcl. + # '005511', + "005521", + "005527", + "005611", + "005911", + "0645pc", + "0664pc", + "0674pc", + "0684pc", + "5123pc", + "5142pc", + "5154pc", + "5301pc", + "5521pc", + "5611pc", + "5911pc", + # The following element code is for population. + "000511", +] + + +def combine_qcl_and_fbsc(qcl_table: catalog.Table, fbsc_table: catalog.Table) -> pd.DataFrame: + """Combine garden `faostat_qcl` and `faostat_fbsc` datasets. + + Parameters + ---------- + qcl_table : catalog.Table + Main table (in long format) of the `faostat_qcl` dataset. + fbsc_table : catalog.Table + Main table (in long format) of the `faostat_fbsc` dataset. + + Returns + ------- + combined : pd.DataFrame + Combined data (as a dataframe, not a table). + + """ + columns = [ + "country", + "year", + "item_code", + "element_code", + "item", + "element", + "unit", + "unit_short_name", + "value", + "population_with_data", + ] + qcl = pd.DataFrame(qcl_table).reset_index()[columns] + # Select relevant element codes. 
+ qcl = qcl[qcl["element_code"].isin(ELEMENT_CODES_QCL)].reset_index(drop=True) + qcl["value"] = qcl["value"].astype(float) + qcl["element"] = [element for element in qcl["element"]] + qcl["unit"] = [unit for unit in qcl["unit"]] + qcl["item"] = [item for item in qcl["item"]] + fbsc = pd.DataFrame(fbsc_table).reset_index()[columns] + # Select relevant element codes. + fbsc = fbsc[fbsc["element_code"].isin(ELEMENT_CODES_FBSC)].reset_index(drop=True) + fbsc["value"] = fbsc["value"].astype(float) + fbsc["element"] = [element for element in fbsc["element"]] + fbsc["unit"] = [unit for unit in fbsc["unit"]] + fbsc["item"] = [item for item in fbsc["item"]] + + rename_columns = {"item": "product"} + combined = ( + dataframes.concatenate([qcl, fbsc], ignore_index=True).rename(columns=rename_columns).reset_index(drop=True) + ) + + # Sanity checks. + assert len(combined) == (len(qcl) + len(fbsc)), "Unexpected number of rows after combining qcl and fbsc datasets." + + assert len(combined[combined["value"].isnull()]) == 0, "Unexpected nan values." + + n_items_per_item_code = combined.groupby("item_code")["product"].transform("nunique") + assert combined[n_items_per_item_code > 1].empty, "There are item codes with multiple items." + + n_elements_per_element_code = combined.groupby("element_code")["element"].transform("nunique") + assert combined[n_elements_per_element_code > 1].empty, "There are element codes with multiple elements." + + n_units_per_element_code = combined.groupby("element_code")["unit"].transform("nunique") + assert combined[n_units_per_element_code > 1].empty, "There are element codes with multiple units." + + error = "There are unexpected duplicate rows. Rename items in custom_items.csv to avoid clashes." 
+ assert combined[combined.duplicated(subset=["product", "country", "year", "element", "unit"])].empty, error + + return cast(pd.DataFrame, combined) + + +def get_fao_population(combined: pd.DataFrame) -> pd.DataFrame: + """Extract the FAO population data from data (in long format). + + Parameters + ---------- + combined : pd.DataFrame + Combination of `faostat_qcl` and `faostat_fbsc` data (although this function could also be applied to just + `faostat_fbsc` data, since `faostat_qcl` does not contain FAO population data). + + Returns + ------- + fao_population : pd.DataFrame + Population (by country and year) according to FAO, extracted from the `faostat_fbsc` dataset. + + """ + # Select the item and element that corresponds to population values. + fao_population = combined[ + (combined["product"] == FAO_POPULATION_ITEM_NAME) & (combined["element"] == FAO_POPULATION_ELEMENT_NAME) + ].reset_index(drop=True) + + # Check that population is given in "1000 persons" and convert to persons. + error = "FAOSTAT population changed item, element, or unit." + assert list(fao_population["unit"].unique()) == [FAO_POPULATION_UNIT], error + fao_population["value"] *= 1000 + + # Drop missing values and prepare output dataframe. + fao_population = ( + fao_population[["country", "year", "value"]].dropna(how="any").rename(columns={"value": "fao_population"}) + ) + + return fao_population + + +def process_combined_data(combined: pd.DataFrame) -> pd.DataFrame: + """Process combined data (combination of `faostat_qcl` and `faostat_fbsc` data) to have the content and format + required by the food explorer. + + Parameters + ---------- + combined : pd.DataFrame + Combination of `faostat_qcl` and `faostat_fbsc` data. + + Returns + ------- + data_wide : pd.DataFrame + Processed data (in wide format). + + """ + combined = combined.copy() + + # Get FAO population from data (it is given as another item). 
+ fao_population = get_fao_population(combined=combined) + + # List of all item codes to select. + selected_item_codes = sorted(set(ITEM_CODES_FBSC).union(ITEM_CODES_QCL)) + + # Check that all expected products are included in the data. + missing_products = sorted(set(selected_item_codes) - set(set(combined["item_code"]))) + assert len(missing_products) == 0, f"{len(missing_products)} missing products for food explorer." + + # Select relevant products for the food explorer. + combined = combined[combined["item_code"].isin(selected_item_codes)].reset_index(drop=True) + + # Join element and unit into one title column. + combined["title"] = combined["element"] + " (" + combined["unit"] + ")" + + # This will create a table with just one column and country-year as index. + index_columns = ["product", "country", "year"] + data_wide = combined.pivot(index=index_columns, columns=["title"], values="value").reset_index() + + # Add column for FAO population. + data_wide = pd.merge(data_wide, fao_population, on=["country", "year"], how="left") + + # Add column for OWID population. + data_wide = geo.add_population_to_dataframe(df=data_wide, warn_on_missing_countries=False) + + # Fill gaps in OWID population with FAO population (for "* (FAO)" countries, i.e. countries that were not + # harmonized and for which there is no OWID population). + # Then drop "fao_population", since it is no longer needed. + data_wide["population"] = data_wide["population"].fillna(data_wide["fao_population"]) + data_wide = data_wide.drop(columns="fao_population") + + assert len(data_wide.columns[data_wide.isnull().all(axis=0)]) == 0, "Unexpected columns with only nan values." + + # Set a reasonable index. + data_wide = data_wide.set_index(index_columns, verify_integrity=True) + + return data_wide + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Fetch the dataset short name from dest_dir. + dataset_short_name = Path(dest_dir).name + + # Define path to current step file. 
+ current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") + + # Get paths and naming conventions for current data step. + paths = PathFinder(current_step_file.as_posix()) + + # Load latest qcl and fbsc datasets from garden. + qcl_dataset: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_qcl") + fbsc_dataset: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_fbsc") + + # Get main long tables from qcl and fbsc datasets. + qcl_table = qcl_dataset[f"{NAMESPACE}_qcl"] + fbsc_table = fbsc_dataset[f"{NAMESPACE}_fbsc"] + + # + # Process data. + # + # Combine qcl and fbsc data. + data = combine_qcl_and_fbsc(qcl_table=qcl_table, fbsc_table=fbsc_table) + + # Prepare data in the format required by the food explorer. + data = process_combined_data(combined=data) + + # Create table of products. + table = catalog.Table(data, short_name=dataset_short_name) + + # + # Save outputs. + # + # Initialise new garden dataset. + ds_garden = create_dataset(dest_dir=dest_dir, tables=[table], default_metadata=fbsc_dataset.metadata) + + # Update dataset metadata and combine sources from qcl and fbsc datasets. + ds_garden.metadata.title = DATASET_TITLE + ds_garden.metadata.description = DATASET_DESCRIPTION + ds_garden.metadata.sources = fbsc_dataset.metadata.sources + qcl_dataset.metadata.sources + + # Create new dataset in garden. 
+ ds_garden.save() diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_fs.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_fs.py new file mode 100644 index 00000000000..a836381fb94 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_fs.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_fs dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_ic.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_ic.py new file mode 100644 index 00000000000..5e86234ddde --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_ic.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_ic dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_lc.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_lc.py new file mode 100644 index 00000000000..ab508fd95ad --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_lc.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_lc dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py new file mode 100644 index 00000000000..95e9c56d545 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py @@ -0,0 +1,1076 @@ +"""FAOSTAT garden step for faostat_metadata dataset. + +This step reads from: +* The (additional) metadata dataset. The only crucial ingredients from here (that will be used later on in other garden + steps are element, item and units descriptions, and country groups (used to check that we do not double count + countries when aggregating data for regions). +* Custom datasets file ('./custom_datasets.csv'). +* Custom elements and units file ('./custom_elements_and_units.csv'). +* Custom items file ('./custom_items.csv'). +* Value amendments file ('./value_amendments.csv'). +* Each of the individual meadow datasets. 
They are loaded to extract their countries, items, elements and units, and + some sanity checks are performed. + +This step will: +* Output a dataset (to be loaded by all garden datasets) with tables 'countries', 'datasets', 'elements', 'items' + and 'amendments'. +* Apply sanity checks to countries, elements, items, and units. +* Apply custom names and descriptions to datasets, elements, items and units. +* Check that spurious values in value_amendments.csv are in the data, and whether there are new spurious values. +* Harmonize country names. +* Find countries that correspond to aggregates of other countries (e.g. 'Melanesia'). +* Ensure there are no degeneracies within a dataset (i.e. ensure each index is unique). +* Ensure there are no degeneracies between datasets (using dataset, item_code, element_code as keys). + +There are some non-trivial issues with the definitions of items at FAOSTAT: +* Some item codes in the data are missing in the metadata, and vice versa. +* The mapping item_code -> item in the data files is sometimes different from the mapping item_code -> item + in the (additional) metadata dataset. Some examples: + * In dataset qv, item code 221 in the data corresponds to item "Almonds, in shell", whereas in the metadata, + item code 221 corresponds to item "Almonds, with shell", which is the same item, but with a slightly different + name. This happens with many items. On the website (https://www.fao.org/faostat/en/?#data/QV) they seem to be + using the naming from the metadata. We can safely ignore this issue, and stick to the names in the data. + +There are several cases in which one or a few item codes in the data are missing in the metadata. Also, there are +several cases in which an item code in the data has an item name slightly different in the metadata. But these are not +important issues (since we use item_code to merge different datasets, and we use metadata only to fetch descriptions).
+ +""" + +import json +import sys +from copy import deepcopy +from pathlib import Path +from typing import Dict, List, Tuple, cast + +import pandas as pd +from owid import catalog +from owid.datautils import dataframes, io +from shared import ( + CURRENT_DIR, + FAOSTAT_METADATA_SHORT_NAME, + FLAGS_RANKING, + N_CHARACTERS_ELEMENT_CODE, + N_CHARACTERS_ITEM_CODE, + N_CHARACTERS_ITEM_CODE_SDGB, + NAMESPACE, + harmonize_elements, + harmonize_items, + log, + optimize_table_dtypes, + prepare_dataset_description, +) +from tqdm.auto import tqdm + +from etl.helpers import PathFinder + +# Minimum number of issues in the comparison of items and item codes from data and metadata to raise a warning. +N_ISSUES_ON_ITEMS_FOR_WARNING = 1 + + +def create_dataset_descriptions_dataframe_for_domain(table: catalog.Table, dataset_short_name: str) -> pd.DataFrame: + """Create a single row dataframe with the dataset name, title and description, for a given domain. + + Parameters + ---------- + table : catalog.Table + Latest table for considered domain. + dataset_short_name : str + Dataset short name (e.g. 'faostat_qcl'). + + Returns + ------- + dataset_descriptions_df : pd.DataFrame + Dataframe of name, title and description of a domain. + + """ + dataset_descriptions_df = pd.DataFrame( + { + "dataset": [dataset_short_name], + "fao_dataset_title": [table.metadata.dataset.title], + "fao_dataset_description": [table.metadata.dataset.description], + } + ) + + return dataset_descriptions_df + + +def clean_global_dataset_descriptions_dataframe( + datasets_df: pd.DataFrame, custom_datasets: pd.DataFrame +) -> pd.DataFrame: + """Apply sanity checks to the dataframe gathered from the data of each individual datasets, and add custom dataset + titles and descriptions. + + Parameters + ---------- + datasets_df : pd.DataFrame + Dataframe of descriptions gathered from the data of each individual dataset. + custom_datasets : pd.DataFrame + Data from the custom_datasets.csv file. 
+ + Returns + ------- + datasets_df : pd.Dataframe + Clean dataframe of dataset titles and descriptions (customized and original FAO ones). + + """ + datasets_df = datasets_df.copy() + + # Check that the dataset descriptions of fbsh and fbs are identical. + error = ( + "Datasets fbsh and fbs have different descriptions. " + "This may happen in the future: Simply check that nothing significant has changed and remove this assertion." + ) + assert ( + datasets_df[datasets_df["dataset"] == "faostat_fbsh"]["fao_dataset_description"].item() + == datasets_df[datasets_df["dataset"] == "faostat_fbs"]["fao_dataset_description"].item() + ), error + # Drop row for fbsh, and rename "fbs" to "fbsc" (since this will be the name for the combined dataset). + datasets_df = datasets_df[datasets_df["dataset"] != "faostat_fbsh"].reset_index(drop=True) + datasets_df.loc[datasets_df["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" + + # Add custom dataset titles. + datasets_df = pd.merge( + datasets_df, + custom_datasets, + on="dataset", + how="left", + suffixes=("_new", "_old"), + ) + + changed_titles = datasets_df[ + datasets_df["fao_dataset_title_old"].fillna("") != datasets_df["fao_dataset_title_new"].fillna("") + ] + changed_descriptions = datasets_df[ + datasets_df["fao_dataset_description_old"].fillna("") != datasets_df["fao_dataset_description_new"].fillna("") + ] + + if len(changed_titles) > 0: + log.warning(f"{len(changed_titles)} domains have changed titles, consider updating custom_datasets.csv.") + if len(changed_descriptions) > 0: + log.warning( + f"{len(changed_descriptions)} domains have changed descriptions. " f"Consider updating custom_datasets.csv." 
+ ) + datasets_df = datasets_df.drop(columns=["fao_dataset_title_old", "fao_dataset_description_old"]).rename( + columns={ + "fao_dataset_title_new": "fao_dataset_title", + "fao_dataset_description_new": "fao_dataset_description", + } + ) + + datasets_df["owid_dataset_title"] = datasets_df["owid_dataset_title"].fillna(datasets_df["fao_dataset_title"]) + error = "Custom titles for different datasets are equal. Edit custom_datasets.csv file." + assert len(set(datasets_df["dataset"])) == len(set(datasets_df["owid_dataset_title"])), error + + # The final description will be the owid description (if there is any) followed by the original FAO description + # (if there is any). + datasets_df["owid_dataset_description"] = [ + prepare_dataset_description( + fao_description=dataset["fao_dataset_description"], + owid_description=dataset["owid_dataset_description"], + ) + for _, dataset in datasets_df.fillna("").iterrows() + ] + + # Reorder columns. + datasets_df = datasets_df[ + [ + "dataset", + "fao_dataset_title", + "owid_dataset_title", + "fao_dataset_description", + "owid_dataset_description", + ] + ] + + return datasets_df + + +def check_that_item_and_element_harmonization_does_not_trim_codes( + data: pd.DataFrame, dataset_short_name: str, category: str +) -> None: + # Ensure that the number of digits of all item and element codes is smaller than the limits defined + # at the beginning of the garden shared module, by N_CHARACTERS_ITEM_CODE and N_CHARACTERS_ELEMENT_CODE, + # respectively. + + # Set the maximum number of characters for item_code. + if dataset_short_name == f"{NAMESPACE}_sdgb": + n_characters_item_code = N_CHARACTERS_ITEM_CODE_SDGB + else: + n_characters_item_code = N_CHARACTERS_ITEM_CODE + + n_characters = {"element": N_CHARACTERS_ELEMENT_CODE, "item": n_characters_item_code} + error = ( + f"{category.capitalize()} codes found with more characters than expected for {dataset_short_name}. 
" + f"This parameter (N_CHARACTERS_*_CODE*) is defined in garden shared module and may need to be increased. " + f"This would change how {category} codes are harmonized, increasing the length of variable names. " + f"It may have further unwanted consequences, so do it with caution." + ) + assert all([len(str(code)) <= n_characters[category] for code in data[f"{category}_code"].unique()]), error + + +def create_items_dataframe_for_domain( + table: catalog.Table, metadata: catalog.Dataset, dataset_short_name: str +) -> pd.DataFrame: + """Apply sanity checks to the items of a table in a dataset, and to the items from the metadata, harmonize all item + codes and items, and add item descriptions. + + Parameters + ---------- + table : catalog.Table + Data for a given domain. + metadata: catalog.Dataset + Metadata dataset from meadow. + dataset_short_name : str + Dataset short name (e.g. 'faostat_qcl'). + + Returns + ------- + items_from_data : pd.Dataframe + Item names and descriptions (customized ones and FAO original ones) for a particular domain. + + """ + df = pd.DataFrame(table).reset_index() + + # Load items from data. + items_from_data = ( + df.rename(columns={"item": "fao_item"})[["item_code", "fao_item"]].drop_duplicates().reset_index(drop=True) + ) + # Sanity check. + check_that_item_and_element_harmonization_does_not_trim_codes( + data=df, dataset_short_name=dataset_short_name, category="item" + ) + # Ensure items are well constructed and amend already known issues (defined in shared.ITEM_AMENDMENTS). + items_from_data = harmonize_items(df=items_from_data, dataset_short_name=dataset_short_name, item_col="fao_item") + + # Load items from metadata. 
+ items_columns = { + "item_code": "item_code", + "item": "fao_item", + "description": "fao_item_description", + } + _items_df = ( + metadata[f"{dataset_short_name}_item"] + .reset_index()[list(items_columns)] + .rename(columns=items_columns) + .drop_duplicates() + .sort_values(list(items_columns.values())) + .reset_index(drop=True) + ) + _items_df = harmonize_items(df=_items_df, dataset_short_name=dataset_short_name, item_col="fao_item") + _items_df["fao_item_description"] = _items_df["fao_item_description"].astype("string") + + # Add descriptions (from metadata) to items (from data). + items_from_data = ( + pd.merge(items_from_data, _items_df, on=["item_code", "fao_item"], how="left") + .sort_values(["item_code", "fao_item"]) + .reset_index(drop=True) + ) + items_from_data["dataset"] = dataset_short_name + items_from_data["fao_item_description"] = items_from_data["fao_item_description"].fillna("") + + # Sanity checks for items in current dataset: + + # Check that in data, there is only one item per item code. + n_items_per_item_code = items_from_data.groupby("item_code")["fao_item"].transform("nunique") + error = f"Multiple items for a given item code in dataset {dataset_short_name}." + assert items_from_data[n_items_per_item_code > 1].empty, error + + # Check that all item codes in data are defined in metadata, and check that the mapping item code -> item in + # the data is the same as in the metadata (which often is not the case). 
+ compared = pd.merge( + items_from_data[["item_code", "fao_item"]], + _items_df[["item_code", "fao_item"]], + on="item_code", + how="left", + suffixes=("_in_data", "_in_metadata"), + ) + different_items = compared[compared["fao_item_in_data"] != compared["fao_item_in_metadata"]] + missing_item_codes = set(items_from_data["item_code"]) - set(_items_df["item_code"]) + if (len(different_items) + len(missing_item_codes)) > N_ISSUES_ON_ITEMS_FOR_WARNING: + log.warning( + f"{len(missing_item_codes)} item codes in {dataset_short_name} missing in metadata. " + f"{len(different_items)} item codes in data mapping to different items in metadata." + ) + + return items_from_data + + +def clean_global_items_dataframe(items_df: pd.DataFrame, custom_items: pd.DataFrame) -> pd.DataFrame: + """Apply global sanity checks to items gathered from all datasets, and create a clean global items dataframe. + + Parameters + ---------- + items_df : pd.DataFrame + Items dataframe gathered from all domains. + custom_items : pd.DataFrame + Data from custom_items.csv file. + + Returns + ------- + items_df : pd.DataFrame + Clean global items dataframe. + + """ + items_df = items_df.copy() + + # Check that fbs and fbsh have the same contributions, remove one of them, and rename the other to fbsc. + check = pd.merge( + items_df[items_df["dataset"] == "faostat_fbsh"].reset_index(drop=True)[["item_code", "fao_item"]], + items_df[items_df["dataset"] == "faostat_fbs"].reset_index(drop=True)[["item_code", "fao_item"]], + how="outer", + on=["item_code"], + suffixes=("_fbsh", "_fbs"), + ) + assert (check["fao_item_fbsh"] == check["fao_item_fbs"]).all() + # Drop all rows for fbsh, and rename "fbs" to "fbsc" (since this will be the name for the combined dataset). + items_df = items_df[items_df["dataset"] != "faostat_fbsh"].reset_index(drop=True) + items_df.loc[items_df["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" + + # Add custom item names. 
+ items_df = pd.merge( + items_df, + custom_items.rename(columns={"fao_item": "fao_item_check"}), + on=["dataset", "item_code"], + how="left", + suffixes=("_new", "_old"), + ) + + changed_descriptions = items_df[ + (items_df["fao_item_description_old"] != items_df["fao_item_description_new"]) + & (items_df["fao_item_description_old"].notnull()) + ] + if len(changed_descriptions) > 0: + log.warning( + f"{len(changed_descriptions)} domains have changed item descriptions. " + f"Consider updating custom_items.csv." + ) + + items_df = items_df.drop(columns="fao_item_description_old").rename( + columns={"fao_item_description_new": "fao_item_description"} + ) + + # Check that item names have not changed. + # NOTE: This condition used to raise an error if not fulfilled. Consider making it an assertion. + if not ( + items_df[items_df["fao_item_check"].notnull()]["fao_item_check"] + == items_df[items_df["fao_item_check"].notnull()]["fao_item"] + ).all(): + log.warning("Item names may have changed with respect to custom items file. Update custom items file.") + items_df = items_df.drop(columns=["fao_item_check"]) + + # Assign original FAO name to all owid items that do not have a custom name. + items_df["owid_item"] = items_df["owid_item"].fillna(items_df["fao_item"]) + + # Add custom item descriptions, and assign original FAO descriptions to items that do not have a custom description. + items_df["owid_item_description"] = items_df["owid_item_description"].fillna(items_df["fao_item_description"]) + + # Check that we have not introduced ambiguities when assigning custom item names. + n_owid_items_per_item_code = items_df.groupby(["dataset", "item_code"])["owid_item"].transform("nunique") + error = "Multiple owid items for a given item code in a dataset." 
+ assert items_df[n_owid_items_per_item_code > 1].empty, error + + items_df = ( + items_df[ + [ + "dataset", + "item_code", + "fao_item", + "owid_item", + "fao_item_description", + "owid_item_description", + ] + ] + .sort_values(["dataset", "item_code"]) + .reset_index(drop=True) + ) + + return items_df + + +def create_elements_dataframe_for_domain( + table: catalog.Table, metadata: catalog.Dataset, dataset_short_name: str +) -> pd.DataFrame: + """Apply sanity checks to the elements and units of a table in a dataset, and to the elements and units from the + metadata, harmonize all element code, and add descriptions. + + Parameters + ---------- + table : catalog.Table + Data for a given domain. + metadata: catalog.Dataset + Additional metadata dataset from meadow. + dataset_short_name : str + Dataset short name (e.g. 'faostat_qcl'). + + Returns + ------- + elements_from_data : pd.Dataframe + Element names and descriptions and unit names and descriptions (customized ones and FAO original ones) for a + particular domain. + + """ + + df = pd.DataFrame(table).reset_index() + # Load elements from data. + elements_from_data = ( + df.rename(columns={"element": "fao_element", "unit": "fao_unit_short_name"})[ + ["element_code", "fao_element", "fao_unit_short_name"] + ] + .drop_duplicates() + .reset_index(drop=True) + ) + # Sanity check. + check_that_item_and_element_harmonization_does_not_trim_codes( + data=df, dataset_short_name=dataset_short_name, category="element" + ) + # Ensure element_code is always a string of a fix number of characters. + elements_from_data = harmonize_elements(df=elements_from_data, element_col="fao_element") + + # Load elements from metadata. 
+ elements_columns = { + "element_code": "element_code", + "element": "fao_element", + "description": "fao_element_description", + } + _elements_df = ( + metadata[f"{dataset_short_name}_element"] + .reset_index()[list(elements_columns)] + .rename(columns=elements_columns) + .drop_duplicates() + .sort_values(list(elements_columns.values())) + .reset_index(drop=True) + ) + _elements_df = harmonize_elements(df=_elements_df, element_col="fao_element") + _elements_df["fao_element_description"] = _elements_df["fao_element_description"].astype("string") + + # Load units metadata. + units_columns = { + "unit_name": "fao_unit_short_name", + "description": "fao_unit", + } + _units_df = ( + metadata[f"{dataset_short_name}_unit"] + .reset_index()[list(units_columns)] + .rename(columns=units_columns) + .drop_duplicates() + .sort_values(list(units_columns.values())) + .reset_index(drop=True) + ) + _units_df["fao_unit"] = _units_df["fao_unit"].astype("string") + + # Add element descriptions (from metadata). + elements_from_data = ( + pd.merge( + elements_from_data, + _elements_df, + on=["element_code", "fao_element"], + how="left", + ) + .sort_values(["element_code", "fao_element"]) + .reset_index(drop=True) + ) + elements_from_data["dataset"] = dataset_short_name + elements_from_data["fao_element_description"] = elements_from_data["fao_element_description"].fillna("") + + # Add unit descriptions (from metadata). + elements_from_data = ( + pd.merge(elements_from_data, _units_df, on=["fao_unit_short_name"], how="left") + .sort_values(["fao_unit_short_name"]) + .reset_index(drop=True) + ) + elements_from_data["fao_unit"] = elements_from_data["fao_unit"].fillna(elements_from_data["fao_unit_short_name"]) + + # Sanity checks: + + # Check that in data, there is only one unit per element code. + n_units_per_element_code = df.groupby("element_code")["unit"].transform("nunique") + error = f"Multiple units for a given element code in dataset {dataset_short_name}." 
+ assert df[n_units_per_element_code > 1].empty, error + + # Check that in data, there is only one element per element code. + n_elements_per_element_code = elements_from_data.groupby("element_code")["fao_element"].transform("nunique") + error = f"Multiple elements for a given element code in dataset {dataset_short_name}." + assert elements_from_data[n_elements_per_element_code > 1].empty, error + + return elements_from_data + + +def clean_global_elements_dataframe(elements_df: pd.DataFrame, custom_elements: pd.DataFrame) -> pd.DataFrame: + """Apply global sanity checks to elements and units gathered from all datasets, and create a clean global elements + and units dataframe. + + Parameters + ---------- + elements_df : pd.DataFrame + Elements and units dataframe gathered from all domains. + custom_elements : pd.DataFrame + Data from custom_element_and_units.csv file. + + Returns + ------- + elements_df : pd.DataFrame + Clean global elements and units dataframe. + + """ + elements_df = elements_df.copy() + + # Check that all elements of fbsh are in fbs (although fbs may contain additional elements). + assert set(elements_df[elements_df["dataset"] == "faostat_fbsh"]["element_code"]) <= set( + elements_df[elements_df["dataset"] == "faostat_fbs"]["element_code"] + ) + # Drop all rows for fbsh, and rename "fbs" to "fbsc" (since this will be the name for the combined dataset). 
+ elements_df = elements_df[elements_df["dataset"] != "faostat_fbsh"].reset_index(drop=True) + elements_df.loc[elements_df["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" + + elements_df = pd.merge( + elements_df, + custom_elements.rename( + columns={ + "fao_element": "fao_element_check", + "fao_unit_short_name": "fao_unit_short_name_check", + } + ), + on=["dataset", "element_code"], + how="left", + suffixes=("_new", "_old"), + ) + + changed_units = elements_df[ + (elements_df["fao_unit_new"] != elements_df["fao_unit_old"]) & (elements_df["fao_unit_old"].notnull()) + ] + if len(changed_units) > 0: + log.warning(f"{len(changed_units)} domains have changed units, consider updating custom_elements.csv.") + + changed_descriptions = elements_df[ + (elements_df["fao_element_description_new"] != elements_df["fao_element_description_old"]) + & (elements_df["fao_element_description_old"].notnull()) + ] + if len(changed_descriptions) > 0: + log.warning( + f"{len(changed_descriptions)} domains have changed element descriptions. " + f"Consider updating custom_elements.csv." + ) + + elements_df = elements_df.drop(columns=["fao_unit_old", "fao_element_description_old"]).rename( + columns={ + "fao_element_description_new": "fao_element_description", + "fao_unit_new": "fao_unit", + } + ) + + error = "Element names have changed with respect to custom elements file. Update custom elements file." + assert ( + elements_df[elements_df["fao_element_check"].notnull()]["fao_element_check"] + == elements_df[elements_df["fao_element_check"].notnull()]["fao_element"] + ).all(), error + elements_df = elements_df.drop(columns=["fao_element_check"]) + + error = "Unit names have changed with respect to custom elements file. Update custom elements file." 
+ assert (
+ elements_df[elements_df["fao_unit_short_name_check"].notnull()]["fao_unit_short_name_check"]
+ == elements_df[elements_df["fao_unit_short_name_check"].notnull()]["fao_unit_short_name"]
+ ).all(), error
+ elements_df = elements_df.drop(columns=["fao_unit_short_name_check"])
+
+ # Assign original FAO names where there is no custom one.
+ elements_df["owid_element"] = elements_df["owid_element"].fillna(elements_df["fao_element"])
+ elements_df["owid_unit"] = elements_df["owid_unit"].fillna(elements_df["fao_unit"])
+ elements_df["owid_element_description"] = elements_df["owid_element_description"].fillna(
+ elements_df["fao_element_description"]
+ )
+ elements_df["owid_unit_short_name"] = elements_df["owid_unit_short_name"].fillna(elements_df["fao_unit_short_name"])
+
+ # Assume variables were not per capita, if was_per_capita is not informed, and make boolean.
+ elements_df["was_per_capita"] = elements_df["was_per_capita"].fillna("0").replace({"0": False, "1": True})
+
+ # Idem for variables to make per capita.
+ elements_df["make_per_capita"] = elements_df["make_per_capita"].fillna("0").replace({"0": False, "1": True})
+
+ # Check that we have not introduced ambiguities when assigning custom element or unit names.
+ n_owid_elements_per_element_code = elements_df.groupby(["dataset", "element_code"])["owid_element"].transform(
+ "nunique"
+ )
+ error = "Multiple owid elements for a given element code in a dataset."
+ assert elements_df[n_owid_elements_per_element_code > 1].empty, error
+
+ # Check that we have not introduced ambiguities when assigning custom element or unit names.
+ n_owid_units_per_element_code = elements_df.groupby(["dataset", "element_code"])["owid_unit"].transform("nunique")
+ error = "Multiple owid units for a given element code in a dataset."
+ assert elements_df[n_owid_units_per_element_code > 1].empty, error
+
+ # NOTE: We assert that there is one element for each element code.
But the opposite may not be true: there can be
+ # multiple element codes with the same element. And idem for items.
+
+ return elements_df
+
+
+def check_countries_to_exclude_or_harmonize(
+ countries_in_data: pd.DataFrame, excluded_countries: List[str], countries_harmonization: Dict[str, str]
+) -> None:
+ # Check that all excluded countries are in the data.
+ unknown_excluded_countries = set(excluded_countries) - set(countries_in_data["fao_country"])
+ error = (
+ f"Unknown excluded countries (to be removed from faostat.excluded_countries.json): {unknown_excluded_countries}"
+ )
+ assert len(unknown_excluded_countries) == 0, error
+
+ # Check that all countries to be harmonized are in the data.
+ unknown_countries_to_harmonize = set(countries_harmonization) - set(countries_in_data["fao_country"])
+ error = f"Unknown countries to be harmonized (to be removed or edited in faostat.countries.json): {unknown_countries_to_harmonize}"
+ assert len(unknown_countries_to_harmonize) == 0, error
+
+ # Check that all countries in the data are either to be excluded or to be harmonized.
+ unknown_countries = set(countries_in_data["fao_country"]) - set(excluded_countries) - set(countries_harmonization)
+ error = f"Unknown countries in the data (to be added either to faostat.excluded_countries.json or to faostat.countries.json): {unknown_countries}"
+ assert len(unknown_countries) == 0, error
+
+
+def clean_global_countries_dataframe(
+ countries_in_data: pd.DataFrame,
+ country_groups: Dict[str, List[str]],
+ countries_harmonization: Dict[str, str],
+ excluded_countries: List[str],
+) -> pd.DataFrame:
+ """Clean dataframe of countries gathered from the data of the individual domains, harmonize country names (and
+ country names of members of regions), and create a clean global countries dataframe.
+
+ Parameters
+ ----------
+ countries_in_data : pd.DataFrame
+ Countries gathered from the data of all domains.
+ country_groups : dict + Countries and their members, gathered from the data. + countries_harmonization : dict + Mapping of country names (from FAO names to OWID names). + excluded_countries : list + Country names to be ignored. + + Returns + ------- + countries_df : pd.DataFrame + Clean global countries dataframe. + + """ + countries_df = countries_in_data.copy() + + # Sanity checks. + check_countries_to_exclude_or_harmonize( + countries_in_data=countries_in_data, + excluded_countries=excluded_countries, + countries_harmonization=countries_harmonization, + ) + + # Harmonize country groups and members. + country_groups_harmonized = { + countries_harmonization[group]: sorted([countries_harmonization[member] for member in country_groups[group]]) + for group in country_groups + if group in countries_harmonization + } + + # Harmonize country names. + countries_df["country"] = dataframes.map_series( + series=countries_df["fao_country"], + mapping=countries_harmonization, + warn_on_missing_mappings=False, + warn_on_unused_mappings=False, + make_unmapped_values_nan=True, + show_full_warning=True, + ) + + # Add country members to countries dataframe. + countries_df["members"] = dataframes.map_series( + series=countries_df["country"], + mapping=country_groups_harmonized, + make_unmapped_values_nan=True, + ) + + # Feather does not support object types, so convert column of lists to column of strings. + countries_df["members"] = [ + json.dumps(members) if isinstance(members, list) else members for members in countries_df["members"] + ] + + return countries_df + + +def create_table(df: pd.DataFrame, short_name: str, index_cols: List[str]) -> catalog.Table: + """Create a table with optimal format and basic metadata, out of a dataframe. + + Parameters + ---------- + df : pd.DataFrame + Input dataframe. + short_name : str + Short name to add in the metadata of the new table. + index_cols : list + Columns to use as indexes of the new table. 
+ + Returns + ------- + table : catalog.Table + New table. + + """ + table = catalog.Table(df).copy() + + # Optimize column dtypes before storing feather file, and ensure codes are categories (instead of ints). + table = optimize_table_dtypes(table) + + # Set indexes and other necessary metadata. + table = table.set_index(index_cols, verify_integrity=True) + + table.metadata.short_name = short_name + table.metadata.primary_key = index_cols + + return cast(catalog.Table, table) + + +def check_that_flag_definitions_in_dataset_agree_with_those_in_flags_ranking( + metadata: catalog.Dataset, +) -> None: + """Check that the definition of flags in the additional metadata for current dataset agree with the ones we have + manually written down in our flags ranking (raise error otherwise). + + Parameters + ---------- + metadata : catalog.Dataset + Additional metadata dataset (that must contain one table for current dataset). + + """ + for table_name in metadata.table_names: + if "flag" in table_name: + flag_df = metadata[table_name].reset_index() + comparison = pd.merge(FLAGS_RANKING, flag_df, on="flag", how="inner") + error_message = ( + f"Flag definitions in file {table_name} are different to those in our flags ranking. " + f"Redefine shared.FLAGS_RANKING." + ) + assert (comparison["description"] == comparison["flags"]).all(), error_message + + +def check_that_all_flags_in_dataset_are_in_ranking(table: catalog.Table, metadata_for_flags: catalog.Table) -> None: + """Check that all flags found in current dataset are defined in our flags ranking (raise error otherwise). + + Parameters + ---------- + table : pd.DataFrame + Data table for current dataset. + metadata_for_flags : catalog.Table + Flags for current dataset, as defined in dataset of additional metadata. 
+ + """ + if not set(table["flag"]) < set(FLAGS_RANKING["flag"]): + missing_flags = set(table["flag"]) - set(FLAGS_RANKING["flag"]) + flags_data = pd.DataFrame(metadata_for_flags).reset_index() + if set(missing_flags) < set(flags_data["flag"]): + message = "Missing flags. Copy the following lines to FLAGS_RANKING (and put them in the right order):" + for i, j in pd.DataFrame(metadata_for_flags).loc[list(missing_flags)].iterrows(): + message += f"\n{(i, j['flags'])}," + log.warning(message) + else: + log.warning( + f"Missing flags. {missing_flags} are not defined in additional metadata. Get definition from " + f"https://www.fao.org/faostat/en/#definitions" + ) + raise AssertionError("Flags in dataset not found in FLAGS_RANKING. Manually add those flags.") + + +def check_definitions_in_value_amendments( + table: catalog.Table, dataset_short_name: str, value_amendments: pd.DataFrame +) -> None: + """Check definitions in the value_amendments.csv file. + + This function will assert that: + * All spurious values defined in the file are still found in the data. + * There are no unexpected spurious values in the data. + + Spurious values are only searched for in "value" column if it has "category" dtype. + See regular expression below used to search for spurious values. + + Parameters + ---------- + table : catalog.Table + _description_ + dataset_short_name : str + _description_ + value_amendments : pd.DataFrame + _description_ + """ + # Regular expression used to search for spurious values in the "value" column. + regex_spurious_values = "<|,|N" + + # Select value amendments for the specified dataset. + _value_amendments = value_amendments[value_amendments["dataset"] == dataset_short_name] + if not _value_amendments.empty: + # Check that spurious values defined in value_amendments.csv are indeed found in the data. 
+ expected_spurious_values_not_found = set(_value_amendments["spurious_value"]) - set(table["value"]) + error = ( + f"Expected spurious values {expected_spurious_values_not_found} not found in {dataset_short_name}. " + f"Remove them from value_amendments.csv." + ) + assert len(expected_spurious_values_not_found) == 0, error + + # Search for additional spurious values (only if data values are of "category" type). + if table["value"].dtype == "category": + # Find any possible spurious values in the data. + spurious_values = ( + table[table["value"].astype(str).str.contains(regex_spurious_values, regex=True)]["value"].unique().tolist() + ) + # Find if any of those were not accounted for already in value_amendments. + new_spurious_values = set(spurious_values) - set(_value_amendments["spurious_value"]) + error = f"Unexpected spurious values found in {dataset_short_name}. Add the following values to value_amendments.csv: {new_spurious_values}" + assert len(new_spurious_values) == 0, error + + +def process_metadata( + paths: PathFinder, + metadata: catalog.Dataset, + custom_datasets: pd.DataFrame, + custom_elements: pd.DataFrame, + custom_items: pd.DataFrame, + countries_harmonization: Dict[str, str], + excluded_countries: List[str], + value_amendments: pd.DataFrame, +) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """Apply various sanity checks, gather data (about dataset, item, element and unit names and descriptions) from all + domains, compare with data from its corresponding metadata file, and create clean dataframes of metadata about + dataset, elements, units, items, and countries. + + Parameters + ---------- + metadata : catalog.Dataset + Additional metadata dataset from meadow. + custom_datasets : pd.DataFrame + Data from custom_datasets.csv file. + custom_elements : pd.DataFrame + Data from custom_elements_and_units.csv file. + custom_items : pd.DataFrame + Data from custom_items.csv file. 
+ countries_harmonization : dict + Data from faostat.countries.json file. + excluded_countries : list + Data from faostat.excluded_countries.json file. + value_amendments : pd.DataFrame + Data from value_amendments.csv file. + + Returns + ------- + countries_df : pd.DataFrame + Clean dataframe of global countries. + datasets_df : pd.DataFrame + Clean dataframe of global dataset names and descriptions. + elements_df : pd.DataFrame + Clean dataframe of global element and unit names and descriptions. + items_df : pd.DataFrame + Clean dataframe of global item names and descriptions. + + """ + # Check if flags definitions need to be updated. + check_that_flag_definitions_in_dataset_agree_with_those_in_flags_ranking(metadata) + + # List all FAOSTAT dataset short names. + dataset_short_names = sorted( + set([NAMESPACE + "_" + table_name.split("_")[1] for table_name in metadata.table_names]) + ) + + # Initialise dataframe of dataset descriptions, items, and element-units. + # We cannot remove "dataset" from the items and elements dataframes, because it can happen that, for a given + # item code, the item name is slightly different in two different datasets. + datasets_df = pd.DataFrame({"dataset": [], "fao_dataset_title": [], "fao_dataset_description": []}) + items_df = pd.DataFrame({"dataset": [], "item_code": [], "fao_item": [], "fao_item_description": []}) + elements_df = pd.DataFrame( + { + "dataset": [], + "element_code": [], + "fao_element": [], + "fao_element_description": [], + "fao_unit": [], + "fao_unit_short_name": [], + } + ) + + # Initialise list of all countries in all datasets, and all country groups. + countries_in_data = pd.DataFrame({"area_code": [], "fao_country": []}).astype({"area_code": "Int64"}) + country_groups_in_data: Dict[str, List[str]] = {} + + # Gather all variables from the latest version of each meadow dataset. 
+ for dataset_short_name in tqdm(dataset_short_names, file=sys.stdout): + print(dataset_short_name) + # Load latest meadow table for current dataset. + ds_latest: catalog.Dataset = paths.load_dependency(dataset_short_name) + table = ds_latest[dataset_short_name] + df = pd.DataFrame(table.reset_index()).rename( + columns={ + "area": "fao_country", + "recipient_country": "fao_country", + "recipient_country_code": "area_code", + } + )[["area_code", "fao_country"]] + + df["area_code"] = df["area_code"].astype("Int64") + + ################################################################################################################ + # Temporary patch. + # Some areas are defined with different names (but same area codes) in different domains. + # This causes some issues at a later stage. + # For now, manually rename those areas here. + if "French Guiana" in df["fao_country"].unique(): + df["fao_country"] = dataframes.map_series(df["fao_country"], mapping={"French Guiana": "French Guyana"}) + if "Netherlands (Kingdom of the)" in df["fao_country"].unique(): + df["fao_country"] = dataframes.map_series( + df["fao_country"], mapping={"Netherlands (Kingdom of the)": "Netherlands"} + ) + if "Saint Martin (French part)" in df["fao_country"].unique(): + df["fao_country"] = dataframes.map_series( + df["fao_country"], mapping={"Saint Martin (French part)": "Saint-Martin (French part)"} + ) + + ################################################################################################################ + + if f"{dataset_short_name}_flag" in metadata.table_names: + check_that_all_flags_in_dataset_are_in_ranking( + table=table, metadata_for_flags=metadata[f"{dataset_short_name}_flag"] + ) + + # Check if spurious values defined in value_amendments.csv are still in the data, + # and whether there are new spurious values to be amended. 
+ check_definitions_in_value_amendments( + table=table, dataset_short_name=dataset_short_name, value_amendments=value_amendments + ) + + # Gather dataset descriptions, items, and element-units for current domain. + datasets_from_data = create_dataset_descriptions_dataframe_for_domain( + table=table, dataset_short_name=dataset_short_name + ) + + items_from_data = create_items_dataframe_for_domain( + table=table, metadata=metadata, dataset_short_name=dataset_short_name + ) + + elements_from_data = create_elements_dataframe_for_domain( + table=table, metadata=metadata, dataset_short_name=dataset_short_name + ) + + # Add countries in this dataset to the list of all countries. + countries_in_data = pd.concat([countries_in_data, df]).drop_duplicates() + + # Get country groups in this dataset. + area_group_table_name = f"{dataset_short_name}_area_group" + if area_group_table_name in metadata: + country_groups = ( + metadata[f"{dataset_short_name}_area_group"] + .reset_index() + .drop_duplicates(subset=["country_group", "country"]) + .groupby("country_group") + .agg({"country": list}) + .to_dict()["country"] + ) + # Add new groups to country_groups_in_data; if they are already there, ensure they contain all members. + for group in list(country_groups): + if group not in countries_in_data["fao_country"]: + # This should not happen, but skip just in case. + continue + if group in list(country_groups_in_data): + all_members = set(country_groups_in_data[group]) | set(country_groups[group]) + country_groups_in_data[group] = list(all_members) + else: + country_groups_in_data[group] = country_groups[group] + + # Add dataset descriptions, items, and element-units from current dataset to global dataframes. 
+ datasets_df = dataframes.concatenate([datasets_df, datasets_from_data], ignore_index=True) + items_df = dataframes.concatenate([items_df, items_from_data], ignore_index=True) + elements_df = dataframes.concatenate([elements_df, elements_from_data], ignore_index=True) + + datasets_df = clean_global_dataset_descriptions_dataframe(datasets_df=datasets_df, custom_datasets=custom_datasets) + items_df = clean_global_items_dataframe(items_df=items_df, custom_items=custom_items) + + elements_df = clean_global_elements_dataframe(elements_df=elements_df, custom_elements=custom_elements) + countries_df = clean_global_countries_dataframe( + countries_in_data=countries_in_data, + country_groups=country_groups_in_data, + countries_harmonization=countries_harmonization, + excluded_countries=excluded_countries, + ) + + return countries_df, datasets_df, elements_df, items_df + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Fetch the dataset short name from dest_dir. + dataset_short_name = Path(dest_dir).name + + # Define path to current step file. + current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") + + # Get paths and naming conventions for current data step. + paths = PathFinder(current_step_file.as_posix()) + + # Path to file with custom dataset titles and descriptions. + custom_datasets_file = paths.directory / "custom_datasets.csv" + # Path to file with custom item names and descriptions. + custom_items_file = paths.directory / "custom_items.csv" + # Path to file with custom element and unit names and descriptions. + custom_elements_and_units_file = paths.directory / "custom_elements_and_units.csv" + # Path to file with mapping from FAO names to OWID harmonized country names. + countries_file = paths.directory / f"{NAMESPACE}.countries.json" + # Path to file with list of excluded countries and regions. 
+ excluded_countries_file = paths.directory / f"{NAMESPACE}.excluded_countries.json" + # Path to file with spurious values and amendments. + value_amendments_file = paths.directory / "value_amendments.csv" + + # Load metadata from meadow. + metadata = paths.load_dataset() + + # Load custom dataset names, items, element-unit names, and value amendments. + custom_datasets = pd.read_csv(custom_datasets_file, dtype=str) + custom_elements = pd.read_csv(custom_elements_and_units_file, dtype=str) + custom_items = pd.read_csv(custom_items_file, dtype=str) + value_amendments = pd.read_csv(value_amendments_file, dtype=str) + + # Load country mapping and excluded countries files. + countries_harmonization = io.load_json(countries_file) + excluded_countries = io.load_json(excluded_countries_file) + + # + # Process data. + # + countries_df, datasets_df, elements_df, items_df = process_metadata( + paths=paths, + metadata=metadata, + custom_datasets=custom_datasets, + custom_elements=custom_elements, + custom_items=custom_items, + countries_harmonization=countries_harmonization, + excluded_countries=excluded_countries, + value_amendments=value_amendments, + ) + + # + # Save outputs. + # + # Initialize new garden dataset. + dataset_garden = catalog.Dataset.create_empty(dest_dir) + dataset_garden.short_name = FAOSTAT_METADATA_SHORT_NAME + # Keep original dataset's metadata from meadow. + dataset_garden.metadata = deepcopy(metadata.metadata) + # Create new dataset in garden. + dataset_garden.save() + + # Create new garden dataset with all dataset descriptions, items, element-units, and countries. 
+ datasets_table = create_table(df=datasets_df, short_name="datasets", index_cols=["dataset"]) + items_table = create_table(df=items_df, short_name="items", index_cols=["dataset", "item_code"]) + elements_table = create_table(df=elements_df, short_name="elements", index_cols=["dataset", "element_code"]) + countries_table = create_table(df=countries_df, short_name="countries", index_cols=["area_code"]) + amendments_table = catalog.Table(value_amendments, short_name="amendments").set_index( + ["dataset", "spurious_value"], verify_integrity=True + ) + + # Add tables to dataset. + dataset_garden.add(datasets_table, repack=False) + dataset_garden.add(items_table, repack=False) + dataset_garden.add(elements_table, repack=False) + dataset_garden.add(countries_table, repack=False) + dataset_garden.add(amendments_table, repack=False) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py new file mode 100644 index 00000000000..a2db317cbd5 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py @@ -0,0 +1,534 @@ +"""FAOSTAT garden step for faostat_qcl dataset.""" + +from pathlib import Path + +import numpy as np +import pandas as pd +from owid import catalog +from owid.datautils import dataframes +from shared import ( + ADDED_TITLE_TO_WIDE_TABLE, + CURRENT_DIR, + FLAG_MULTIPLE_FLAGS, + NAMESPACE, + REGIONS_TO_ADD, + add_per_capita_variables, + add_regions, + clean_data, + handle_anomalies, + harmonize_elements, + harmonize_items, + log, + parse_amendments_table, + prepare_long_table, + prepare_wide_table, +) + +from etl.helpers import PathFinder, create_dataset + +# Item and item code for 'Meat, poultry'. +ITEM_POULTRY = "Meat, poultry" +ITEM_CODE_MEAT_POULTRY = "00001808" +# Item code for 'Meat, chicken'. +ITEM_CODE_MEAT_CHICKEN = "00001058" +# List item codes to sum as part of "Meat, total" (avoiding double-counting items). 
+MEAT_TOTAL_ITEM_CODES = [ + "00000977", # 'Meat, lamb and mutton' (previously 'Meat, lamb and mutton') + "00001035", # 'Meat of pig with the bone, fresh or chilled' (previously 'Meat, pig') + "00001097", # 'Horse meat, fresh or chilled' (previously 'Meat, horse') + "00001108", # 'Meat of asses, fresh or chilled' (previously 'Meat, ass') + "00001111", # 'Meat of mules, fresh or chilled' (previously 'Meat, mule') + "00001127", # 'Meat of camels, fresh or chilled' (previously 'Meat, camel') + "00001141", # 'Meat of rabbits and hares, fresh or chilled' (previously 'Meat, rabbit') + "00001806", # 'Meat, beef and buffalo' (previously 'Meat, beef and buffalo') + "00001807", # 'Meat, sheep and goat' (previously 'Meat, sheep and goat') + ITEM_CODE_MEAT_POULTRY, # 'Meat, poultry' (previously 'Meat, poultry') +] + +# List of element codes for "Producing or slaughtered animals" (they have different items assigned). +SLAUGHTERED_ANIMALS_ELEMENT_CODES = ["005320", "005321"] +# For the resulting dataframe, we arbitrarily assign the first of those codes. +SLAUGHTERED_ANIMALS_ELEMENT_CODE = SLAUGHTERED_ANIMALS_ELEMENT_CODES[0] +# Item code for 'Meat, total'. +TOTAL_MEAT_ITEM_CODE = "00001765" +# OWID item name for total meat. +TOTAL_MEAT_ITEM = "Meat, total" +# OWID element name, unit name, and unit short name for number of slaughtered animals. +SLAUGHTERED_ANIMALS_ELEMENT = "Producing or slaughtered animals" +SLAUGHTERED_ANIMALS_UNIT = "animals" +SLAUGHTERED_ANIMALS_UNIT_SHORT_NAME = "animals" +# Text to be added to the dataset description (after the description of anomalies). +SLAUGHTERED_ANIMALS_ADDITIONAL_DESCRIPTION = ( + "\n\nFAO does not provide data for the total number of slaughtered animals " + "to produce meat. We calculate this metric by adding up the number of slaughtered animals of all meat groups. 
"
+ "However, when data for slaughtered poultry (which usually outnumbers other meat groups) is not provided, we do "
+ "not calculate the total (to avoid spurious dips in the data)."
+)
+
+
+def fill_slaughtered_poultry_with_slaughtered_chicken(data: pd.DataFrame) -> pd.DataFrame:
+ """Fill missing data on slaughtered poultry with slaughtered chicken.
+
+ Most of poultry meat comes from chicken. However, sometimes chicken is informed, but the rest of poultry isn't,
+ which causes poultry data to be empty (e.g. Spain in 2018).
+ Therefore, we fill missing data for poultry with chicken data.
+ """
+ data = data.copy()
+
+ # Prepare a slice of the data to extract additional data fields.
+ additional_fields = (
+ data[(data["item_code"] == ITEM_CODE_MEAT_POULTRY) & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT)][
+ ["fao_item", "item_description", "fao_unit_short_name"]
+ ]
+ .drop_duplicates()
+ .iloc[0]
+ )
+
+ # Select data for the number of slaughtered chicken.
+ chickens_slaughtered = data[
+ (data["item_code"] == ITEM_CODE_MEAT_CHICKEN)
+ & (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT)
+ & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT)
+ ]
+
+ # Select data for the number of slaughtered poultry.
+ poultry_slaughtered = data[
+ (data["item_code"] == ITEM_CODE_MEAT_POULTRY)
+ & (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT)
+ & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT)
+ ][["country", "year", "value"]]
+
+ # Combine poultry and chicken data.
+ compared = pd.merge(
+ chickens_slaughtered,
+ poultry_slaughtered,
+ on=["country", "year"],
+ how="outer",
+ indicator=True,
+ suffixes=("_chicken", "_poultry"),
+ )
+
+ error = "There are cases where slaughtered poultry is informed, but slaughtered chicken is not."
+ assert compared[compared["_merge"] == "right_only"].empty, error
+
+ error = "There are rows where there is more slaughtered poultry than slaughtered chicken."
+ assert compared[compared["value_poultry"] < compared["value_chicken"]].empty, error + + # Prepare a replacement dataframe for missing data on slaughtered poultry. + poultry_slaughtered_missing_data = ( + compared[compared["_merge"] == "left_only"] + .assign( + **{ + "item_code": ITEM_CODE_MEAT_POULTRY, + "item": ITEM_POULTRY, + "fao_item": additional_fields["fao_item"], + "fao_unit_short_name": additional_fields["fao_unit_short_name"], + "item_description": additional_fields["item_description"], + } + ) + .drop(columns=["_merge", "value_poultry"]) + .rename(columns={"value_chicken": "value"}) + ) + + log.info( + f"Filling {len(poultry_slaughtered_missing_data)} rows of missing data for slaughtered poultry with " + "slaughtered chicken." + ) + # Add chicken data to the full dataframe. + data = pd.concat([data, poultry_slaughtered_missing_data], ignore_index=True) + + return data + + +def add_slaughtered_animals_to_meat_total(data: pd.DataFrame) -> pd.DataFrame: + """Add number of slaughtered animals to meat total. + + There is no FAOSTAT data on slaughtered animals for total meat. We construct this data by aggregating that element + for the items specified in items_to_aggregate (which corresponds to all meat items after removing redundancies). + + If the number of slaughtered poultry is not informed, we remove the number of total animals slaughtered + (since poultry are by far the most commonly slaughtered animals). + + Parameters + ---------- + data : pd.DataFrame + Processed data where meat total does not have number of slaughtered animals. + + Returns + ------- + combined_data : pd.DataFrame + Data after adding the new variable. + + """ + data = data.copy() + + error = f"Some items required to get the aggregate '{TOTAL_MEAT_ITEM}' are missing in data." 
+ assert set(MEAT_TOTAL_ITEM_CODES) < set(data["item_code"]), error + assert SLAUGHTERED_ANIMALS_ELEMENT in data["element"].unique() + assert SLAUGHTERED_ANIMALS_UNIT in data["unit"].unique() + + # Check that, indeed, the number of slaughtered animals for total meat is not given in the original data. + assert data[ + (data["item"] == TOTAL_MEAT_ITEM) + & (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) + & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT) + ].empty + + # There are two element codes for the same element (they have different items assigned). + error = "Element codes for 'Producing or slaughtered animals' may have changed." + assert ( + data[(data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) & ~(data["element_code"].str.contains("pc"))][ + "element_code" + ] + .unique() + .tolist() + == SLAUGHTERED_ANIMALS_ELEMENT_CODES + ), error + + # Check that the items assigned to each the two element codes do not overlap. + error = "Element codes for 'Producing or slaughtered animals' have overlapping items." + items_for_different_elements = ( + data[(data["element_code"].isin(SLAUGHTERED_ANIMALS_ELEMENT_CODES))] + .groupby("element_code", observed=True) + .agg({"item_code": lambda x: list(x.unique())}) + .to_dict()["item_code"] + ) + assert set.intersection(*[set(x) for x in items_for_different_elements.values()]) == set(), error + + # Confirm the item code for total meat. + error = f"Item code for '{TOTAL_MEAT_ITEM}' may have changed." + assert list(data[data["item"] == TOTAL_MEAT_ITEM]["item_code"].unique()) == [TOTAL_MEAT_ITEM_CODE], error + + # Select the subset of data to aggregate. + data_to_aggregate = ( + data[ + (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) + & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT) + & (data["item_code"].isin(MEAT_TOTAL_ITEM_CODES)) + ] + .dropna(subset="value") + .reset_index(drop=True) + ) + + # Create a dataframe with the total number of animals used for meat. 
+ animals = dataframes.groupby_agg( + data_to_aggregate, + groupby_columns=[ + "area_code", + "fao_country", + "fao_element", + "country", + "year", + "population_with_data", + ], + aggregations={ + "value": "sum", + "flag": lambda x: x if len(x) == 1 else FLAG_MULTIPLE_FLAGS, + }, + # TODO: Consider relaxing this assumption, and letting it be None (and impose min_num_values=1). + num_allowed_nans=0, + ).reset_index() + + # Get element description for selected element code (so far it's always been an empty string). + _slaughtered_animals_element_description = data[data["element_code"].isin(SLAUGHTERED_ANIMALS_ELEMENT_CODES)][ + "element_description" + ].unique() + assert len(_slaughtered_animals_element_description) == 1 + slaughtered_animals_element_description = _slaughtered_animals_element_description[0] + + # Get item description for selected item code. + _total_meat_item_description = data[data["item_code"] == TOTAL_MEAT_ITEM_CODE]["item_description"].unique() + assert len(_total_meat_item_description) == 1 + total_meat_item_description = _total_meat_item_description[0] + + # Get FAO item name for selected item code. + _total_meat_fao_item = data[data["item_code"] == TOTAL_MEAT_ITEM_CODE]["fao_item"].unique() + assert len(_total_meat_fao_item) == 1 + total_meat_fao_item = _total_meat_fao_item[0] + + # Get FAO unit for selected item code. + _total_meat_fao_unit = data[data["item_code"] == TOTAL_MEAT_ITEM_CODE]["fao_unit_short_name"].unique() + assert len(_total_meat_fao_unit) == 1 + total_meat_fao_unit = _total_meat_fao_unit[0] + + # Manually include the rest of columns. + animals["element"] = SLAUGHTERED_ANIMALS_ELEMENT + animals["element_description"] = slaughtered_animals_element_description + animals["unit"] = SLAUGHTERED_ANIMALS_UNIT + animals["unit_short_name"] = SLAUGHTERED_ANIMALS_UNIT_SHORT_NAME + # We arbitrarily assign the first element code (out of the two available) to the resulting variables. 
+ animals["element_code"] = SLAUGHTERED_ANIMALS_ELEMENT_CODE + animals["item_code"] = TOTAL_MEAT_ITEM_CODE + animals["item"] = TOTAL_MEAT_ITEM + animals["item_description"] = total_meat_item_description + animals["fao_item"] = total_meat_fao_item + animals["fao_unit_short_name"] = total_meat_fao_unit + + log.info(f"Adding {len(animals)} rows with the total number of slaughtered animals for meat.") + + # For each year, we are adding up the number of animals slaughtered to compute the total, regardless of how many + # of those animals have data. + # However, some years do not have data for a particular animal; this is acceptable except if the animal is poultry, + # which is the most commonly slaughtered animal. Therefore, if data is missing for poultry, the total will show a + # significant (and spurious) decrease (this happens, e.g. in Estonia in 2019). + # Therefore, we remove data points for which poultry is not informed. + + # Find country-years for which we have the number of poultry slaughtered. + country_years_with_poultry_data = ( + data[ + (data["item_code"] == ITEM_CODE_MEAT_POULTRY) + & (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) + & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT) + ] + .dropna(subset="value")[["country", "year"]] + .drop_duplicates() + .reset_index(drop=True) + ) + + # Add a column to inform of all those rows for which we don't have poultry data. + compared = pd.merge(animals, country_years_with_poultry_data, how="outer", indicator=True) + + assert compared[compared["_merge"] == "right_only"].empty, "Expected 'left_only' or 'both', not 'right_only'." + + log.info( + f"Removed {len(compared[compared['_merge'] == 'left_only'])} rows for which we don't have the number of " + "poultry slaughtered." + ) + + animals_corrected = compared[compared["_merge"] == "both"].reset_index(drop=True).drop(columns=["_merge"]) + + # Check that we are not missing any column. 
def add_yield_to_aggregate_regions(data: pd.DataFrame) -> pd.DataFrame:
    """Compute yield (production / area harvested) for aggregate regions and append it to the data.

    Yield is not included in aggregate regions because it is not additive: summing the yields of the
    individual countries in a region is meaningless. Instead, production is aggregated over the
    region, area harvested is aggregated over the region, and one is divided by the other.

    Note: The list of countries contributing to the production aggregate may differ from the list
    contributing to the area aggregate. Imposing identical country lists would create gaps in the
    resulting series, and FAO appears to construct yield in the same way (checked by comparing the
    resulting yield curves for 'Almonds' for all aggregate regions with their corresponding *(FAO)
    regions; they were identical).

    Parameters
    ----------
    data : pd.DataFrame
        Data that does not contain yield for aggregate regions.

    Returns
    -------
    combined_data : pd.DataFrame
        Data after adding yield.

    """
    # Element codes of production, area harvested, and yield.
    code_production = "005510"
    code_area = "005312"
    code_yield = "005419"

    # Sanity check: aggregate regions must not already contain yield data.
    is_region = data["country"].isin(REGIONS_TO_ADD)
    assert data[is_region & (data["element_code"] == code_yield)].empty

    # Fields that should be copied verbatim from the existing yield variable.
    constant_fields = data[data["element_code"] == code_yield][
        [
            "element",
            "element_description",
            "fao_element",
            "fao_unit_short_name",
            "unit",
            "unit_short_name",
        ]
    ].drop_duplicates()
    assert len(constant_fields) == 1

    # Production and area harvested of aggregate regions.
    region_production = data[is_region & (data["element_code"] == code_production)]
    region_area = data[is_region & (data["element_code"] == code_area)]

    # Columns on which production rows are matched with area rows.
    key_columns = [
        "area_code",
        "year",
        "item_code",
        "fao_country",
        "fao_item",
        "item",
        "item_description",
        "country",
    ]
    combined = pd.merge(
        region_production,
        region_area[key_columns + ["flag", "value"]],
        on=key_columns,
        how="inner",
        suffixes=("_production", "_area"),
    )

    # Yield is production divided by area; dividing by zero produces inf, which is mapped to nan.
    combined["value"] = (combined["value_production"] / combined["value_area"]).replace(np.inf, np.nan)

    # Keep the common flag when production and area agree; otherwise use the multiple-flags marker.
    combined["flag"] = [
        production_flag if production_flag == area_flag else FLAG_MULTIPLE_FLAGS
        for production_flag, area_flag in zip(combined["flag_production"], combined["flag_area"])
    ]

    # Drop auxiliary columns and rows without a value.
    combined = combined.drop(columns=["flag_production", "flag_area", "value_production", "value_area"])
    combined = combined.dropna(subset="value").reset_index(drop=True)

    # Assign the yield element code, and copy the remaining fields from the existing yield variable.
    combined["element_code"] = code_yield
    for column in constant_fields.columns:
        combined[column] = constant_fields[column].item()
    assert set(data.columns) == set(combined.columns)

    # Append the new yield rows to the original data and restore categorical dtypes.
    combined_data = (
        pd.concat([data, combined], ignore_index=True)
        .reset_index(drop=True)
        .astype(
            {
                "element_code": "category",
                "fao_element": "category",
                "fao_unit_short_name": "category",
                "flag": "category",
                "element": "category",
                "unit": "category",
                "element_description": "category",
                "unit_short_name": "category",
            }
        )
    )

    return combined_data


def run(dest_dir: str) -> None:
    """Run the garden step for the FAOSTAT dataset whose short name is given by dest_dir.

    Loads the meadow dataset and the FAOSTAT metadata dataset, cleans and enriches the data (region
    aggregates, per-capita variables, yield for aggregate regions, anomaly handling), and saves the
    result as a garden dataset containing a long table and a flat (wide) table.
    """
    #
    # Load data.
    #
    # By convention, the dataset short name is the name of the destination directory.
    dataset_short_name = Path(dest_dir).name

    # Path to the current step file, used to resolve dependencies via PathFinder.
    current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py")
    paths = PathFinder(current_step_file.as_posix())

    # Latest meadow dataset (whose metadata will be reused) and its main table.
    ds_meadow: catalog.Dataset = paths.load_dependency(dataset_short_name)
    tb_meadow = ds_meadow[dataset_short_name]
    data = pd.DataFrame(tb_meadow).reset_index()

    # FAOSTAT metadata dataset, and its tables restricted to the current dataset.
    metadata: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_metadata")
    dataset_metadata = pd.DataFrame(metadata["datasets"]).loc[dataset_short_name].to_dict()
    items_metadata = pd.DataFrame(metadata["items"]).reset_index()
    items_metadata = items_metadata[items_metadata["dataset"] == dataset_short_name].reset_index(drop=True)
    elements_metadata = pd.DataFrame(metadata["elements"]).reset_index()
    elements_metadata = elements_metadata[elements_metadata["dataset"] == dataset_short_name].reset_index(drop=True)
    countries_metadata = pd.DataFrame(metadata["countries"]).reset_index()
    amendments = parse_amendments_table(amendments=metadata["amendments"], dataset_short_name=dataset_short_name)

    #
    # Process data.
    #
    # Harmonize items and elements, and clean data.
    data = harmonize_items(df=data, dataset_short_name=dataset_short_name)
    data = harmonize_elements(df=data)
    data = clean_data(
        data=data,
        items_metadata=items_metadata,
        elements_metadata=elements_metadata,
        countries_metadata=countries_metadata,
        amendments=amendments,
    )

    # Fill missing data for slaughtered poultry with slaughtered chicken.
    data = fill_slaughtered_poultry_with_slaughtered_chicken(data=data)

    # Include number of slaughtered animals in total meat (which is missing).
    data = add_slaughtered_animals_to_meat_total(data=data)

    # Add data for aggregate regions, per-capita variables, and yield for aggregate regions.
    data = add_regions(data=data, elements_metadata=elements_metadata)
    data = add_per_capita_variables(data=data, elements_metadata=elements_metadata)
    data = add_yield_to_aggregate_regions(data)

    # Handle detected anomalies in the data.
    data, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, data=data)

    # Long table (item code and element code in the index) and wide table (country-year index only).
    data_table_long = prepare_long_table(data=data)
    data_table_wide = prepare_wide_table(data=data)

    #
    # Save outputs.
    #
    # Update tables metadata.
    data_table_long.metadata.short_name = dataset_short_name
    data_table_long.metadata.title = dataset_metadata["owid_dataset_title"]
    data_table_wide.metadata.short_name = f"{dataset_short_name}_flat"
    data_table_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE

    # Initialise new garden dataset, reusing the meadow dataset's metadata.
    ds_garden = create_dataset(
        dest_dir=dest_dir, tables=[data_table_long, data_table_wide], default_metadata=ds_meadow.metadata
    )

    # Update dataset metadata and add description of anomalies (if any) to the dataset description.
    ds_garden.metadata.description = (
        dataset_metadata["owid_dataset_description"] + anomaly_descriptions + SLAUGHTERED_ANIMALS_ADDITIONAL_DESCRIPTION
    )
    ds_garden.metadata.title = dataset_metadata["owid_dataset_title"]

    # Update the main source's metadata description (which will be shown in charts).
    ds_garden.metadata.sources[0].description = ds_garden.metadata.description

    # Create garden dataset.
    ds_garden.save()
+++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_rp.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_rp dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_rt.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_rt.py new file mode 100644 index 00000000000..8b7a9257526 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_rt.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_rt dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_scl.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_scl.py new file mode 100644 index 00000000000..00d0d6eb376 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_scl.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_scl dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_sdgb.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_sdgb.py new file mode 100644 index 00000000000..67932fa7aaf --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_sdgb.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_sdgb dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_tcl.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_tcl.py new file mode 100644 index 00000000000..2df286d3992 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_tcl.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_tcl dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_ti.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_ti.py new file mode 100644 index 00000000000..682199d79d9 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_ti.py @@ -0,0 +1,2 @@ +"""FAOSTAT garden step for faostat_ti dataset.""" +from .shared import run # noqa:F401 diff --git 
a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py new file mode 100644 index 00000000000..1953069445b --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -0,0 +1,1893 @@ +"""Shared definitions in FAOSTAT garden steps. + +This module contains: +* Common functions used in garden steps. +* Definitions related to elements and items (e.g. item amendments). +* Definitions related to countries and regions (e.g. aggregate regions to generate and definition of historic regions). +* Definitions of flags (found in the original FAOSTAT data) and their ranking (i.e. the priority of data points when + there are duplicates). +* Other additional definitions (e.g. texts to include in the definitions of generated per-capita variables). + +""" + +import json +import sys +from pathlib import Path +from typing import Dict, List, cast + +import numpy as np +import pandas as pd +import structlog +from detected_anomalies import handle_anomalies +from owid import catalog, repack # type: ignore +from owid.datautils import dataframes +from tqdm.auto import tqdm + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset +from etl.paths import DATA_DIR + +# Initialise log. +log = structlog.get_logger() + +# Define path to current folder, namespace and version of all datasets in this folder. +CURRENT_DIR = Path(__file__).parent +NAMESPACE = CURRENT_DIR.parent.name +VERSION = CURRENT_DIR.name + +# Name of FAOSTAT metadata dataset. +FAOSTAT_METADATA_SHORT_NAME = f"{NAMESPACE}_metadata" + +# Elements and items. + +# Maximum number of characters for item_code. +# FAOSTAT "item_code" is usually an integer number, however sometimes it has decimals and sometimes it contains letters. +# So we will convert it into a string of this number of characters (integers will be prepended with zeros). 
+N_CHARACTERS_ITEM_CODE = 8 +# Maximum number of characters for item_code for faostat_sdgb, which has a different kind of item codes, +# e.g. '24002-F-Y_GE15', '24002-M-Y_GE15', etc. +N_CHARACTERS_ITEM_CODE_SDGB = 14 +# Maximum number of characters for element_code (integers will be prepended with zeros). +N_CHARACTERS_ELEMENT_CODE = 6 +# Manual fixes to item codes to avoid ambiguities. +ITEM_AMENDMENTS = { + "faostat_fbsh": [ + # Mappings to harmonize item names of fbsh with those of fbs. + { + "item_code": "00002556", + "fao_item": "Groundnuts (Shelled Eq)", + "new_item_code": "00002552", + "new_fao_item": "Groundnuts", + }, + { + "item_code": "00002805", + "fao_item": "Rice (Milled Equivalent)", + "new_item_code": "00002807", + "new_fao_item": "Rice and products", + }, + ], +} + + +# Countries and regions. + +# When creating region aggregates for a certain variable in a certain year, we want to ensure that we have enough +# data to create the aggregate. There is no straightforward way to do so. Our criterion is to: +# * sum the data of all countries in the region, and then +# * remove rows such that the sum of the population of countries with data (for a given year) is too small, compared +# to the total population of the region. +# For example, if for a certain variable in a certain year, only a few countries with little population have data, +# then assign nan to that region-variable-year. +# Define here that minimum fraction of population that must have data to create an aggregate. +# A fraction of 0 means that we do accept aggregates even if only a few countries contribute (which seems to be the +# default approach by FAOSTAT). +MIN_FRAC_POPULATION_WITH_DATA = 0.0 +# Reference year to build the list of mandatory countries. 
+REFERENCE_YEAR = 2018 +REGIONS_TO_ADD = { + "North America": { + "area_code": "OWID_NAM", + "min_frac_population_with_data": MIN_FRAC_POPULATION_WITH_DATA, + }, + "South America": { + "area_code": "OWID_SAM", + "min_frac_population_with_data": MIN_FRAC_POPULATION_WITH_DATA, + }, + "Europe": { + "area_code": "OWID_EUR", + "min_frac_population_with_data": MIN_FRAC_POPULATION_WITH_DATA, + }, + "European Union (27)": { + "area_code": "OWID_EU27", + "min_frac_population_with_data": MIN_FRAC_POPULATION_WITH_DATA, + }, + "Africa": { + "area_code": "OWID_AFR", + "min_frac_population_with_data": MIN_FRAC_POPULATION_WITH_DATA, + }, + "Asia": { + "area_code": "OWID_ASI", + "min_frac_population_with_data": MIN_FRAC_POPULATION_WITH_DATA, + }, + "Oceania": { + "area_code": "OWID_OCE", + "min_frac_population_with_data": MIN_FRAC_POPULATION_WITH_DATA, + }, + "Low-income countries": { + "area_code": "OWID_LIC", + "min_frac_population_with_data": MIN_FRAC_POPULATION_WITH_DATA, + }, + "Upper-middle-income countries": { + "area_code": "OWID_UMC", + "min_frac_population_with_data": MIN_FRAC_POPULATION_WITH_DATA, + }, + "Lower-middle-income countries": { + "area_code": "OWID_LMC", + "min_frac_population_with_data": MIN_FRAC_POPULATION_WITH_DATA, + }, + "High-income countries": { + "area_code": "OWID_HIC", + "min_frac_population_with_data": MIN_FRAC_POPULATION_WITH_DATA, + }, +} + +# When creating region aggregates, we need to ignore geographical regions that contain aggregate data from other +# countries, to avoid double-counting the data of those countries. +# Note: This list does not contain all country groups, but only those that are in our list of harmonized countries +# (without the *(FAO) suffix). +REGIONS_TO_IGNORE_IN_AGGREGATES = [ + "Melanesia", + "Polynesia", +] + +# When creating region aggregates, decide how to distribute historical regions. +# The following decisions are based on the current location of the countries that succeeded the region, and their income +# group. 
Continent and income group assigned corresponds to the continent and income group of the majority of the +# population in the member countries. +HISTORIC_TO_CURRENT_REGION = { + "Czechoslovakia": { + "continent": "Europe", + "income_group": "High-income countries", + "members": [ + # Europe - High-income countries. + "Czechia", + "Slovakia", + ], + }, + "Ethiopia (former)": { + "continent": "Africa", + "income_group": "Low-income countries", + "members": [ + # Africa - Low-income countries. + "Ethiopia", + "Eritrea", + ], + }, + "Netherlands Antilles": { + "continent": "North America", + "income_group": "High-income countries", + "members": [ + # North America - High-income countries. + "Aruba", + "Curacao", + "Sint Maarten (Dutch part)", + ], + }, + "Serbia and Montenegro": { + "continent": "Europe", + "income_group": "Upper-middle-income countries", + "members": [ + # Europe - Upper-middle-income countries. + "Serbia", + "Montenegro", + ], + }, + "Sudan (former)": { + "continent": "Africa", + "income_group": "Low-income countries", + "members": [ + # Africa - Low-income countries. + "Sudan", + "South Sudan", + ], + }, + "USSR": { + "continent": "Europe", + "income_group": "Upper-middle-income countries", + "members": [ + # Europe - High-income countries. + "Lithuania", + "Estonia", + "Latvia", + # Europe - Upper-middle-income countries. + "Moldova", + "Belarus", + "Russia", + # Europe - Lower-middle-income countries. + "Ukraine", + # Asia - Upper-middle-income countries. + "Georgia", + "Armenia", + "Azerbaijan", + "Turkmenistan", + "Kazakhstan", + # Asia - Lower-middle-income countries. + "Kyrgyzstan", + "Uzbekistan", + "Tajikistan", + ], + }, + "Yugoslavia": { + "continent": "Europe", + "income_group": "Upper-middle-income countries", + "members": [ + # Europe - High-income countries. + "Croatia", + "Slovenia", + # Europe - Upper-middle-income countries. + "North Macedonia", + "Bosnia and Herzegovina", + "Serbia", + "Montenegro", + ], + }, +} + + +# Flags. 
+ +# We have created a manual ranking of FAOSTAT flags. These flags are only used when there is ambiguity in the data, +# namely, when there is more than one data value for a certain country-year-item-element-unit. +# NOTES: +# * We check that the definitions in our manual ranking agree with the ones provided by FAOSTAT. +# * We do not include all flags: We include only the ones that solve an ambiguity in a particular case, and add more +# flags as we see need. +# * We have found flags that appeared in a dataset, but were not included in the additional metadata +# (namely flag "R", found in qcl dataset, and "W" in rt dataset). These flags were added manually, using the +# definition in List / Flags in: +# https://www.fao.org/faostat/en/#definitions +# * Other flags (namely "B", in rl dataset and "w" in rt dataset) were not found either in the additional metadata or in +# the website definitions. They have been assigned the description "Unknown flag". +# * Unfortunately, flags do not remove all ambiguities: remaining duplicates are dropped without any meaningful +# criterion. +# Flag to assign to data points with nan flag (which by definition is considered official data). +FLAG_OFFICIAL_DATA = "official_data" +# Flag to assign to data points for regions that are the result of aggregating data points with different flags. +FLAG_MULTIPLE_FLAGS = "multiple_flags" +# Rank flags by priority (where lowest index is highest priority). +FLAGS_RANKING = ( + pd.DataFrame.from_records( + columns=["flag", "description"], + data=[ + # FAO uses nan flag for official data; in our datasets we will replace nans by FLAG_OFFICIAL_DATA. 
def check_that_countries_are_well_defined(data: pd.DataFrame) -> None:
    """Sanity-check the mapping between area codes and harmonized country names.

    After harmonizing country names, the relation between "area_code" and "country" must be
    one-to-one in both directions; otherwise the countries file needs to be redefined.

    Parameters
    ----------
    data : pd.DataFrame
        Data, right after harmonizing country names.

    """
    # Check both directions of the mapping: a single area code must not map to several countries,
    # and a single country must not map to several area codes.
    checks = [
        ("area_code", "country", "countries for the same area code"),
        ("country", "area_code", "area codes for the same countries"),
    ]
    for group_column, counted_column, description in checks:
        n_unique = data.groupby(group_column)[counted_column].transform("nunique")
        ambiguous = (
            data.loc[n_unique > 1][["area_code", "country"]]
            .drop_duplicates()
            .set_index("area_code")["country"]
            .to_dict()
        )
        error = f"There cannot be multiple {description}. " f"Redefine countries file for:\n{ambiguous}."
        assert len(ambiguous) == 0, error
def harmonize_items(df: pd.DataFrame, dataset_short_name: str, item_col: str = "item") -> pd.DataFrame:
    """Harmonize item codes and apply amendments to faulty items.

    Item codes are turned into fixed-length strings of digits (prepended with zeros), faulty items
    listed in ITEM_AMENDMENTS are fixed, and both item codes and items are made categorical.

    Parameters
    ----------
    df : pd.DataFrame
        Data before harmonizing item codes.
    dataset_short_name : str
        Dataset short name.
    item_col : str
        Name of items column.

    Returns
    -------
    df : pd.DataFrame
        Data after harmonizing item codes.

    """
    df = df.copy()

    # The sdgb dataset has a different (longer, alphanumeric) kind of item codes.
    width = N_CHARACTERS_ITEM_CODE_SDGB if dataset_short_name == f"{NAMESPACE}_sdgb" else N_CHARACTERS_ITEM_CODE

    # Zero-pad item codes. Note: Here list comprehension is faster than .astype(str).str.zfill(...).
    df["item_code"] = [str(code).zfill(width) for code in df["item_code"]]

    # Convert both columns to category to reduce memory.
    df = df.astype({"item_code": "category", item_col: "category"})

    # Fix those few cases where there is more than one item per item code within a given dataset.
    if dataset_short_name in ITEM_AMENDMENTS:
        for fix in ITEM_AMENDMENTS[dataset_short_name]:
            # Register the new item code and item name as categories first, to avoid errors.
            if fix["new_item_code"] not in df["item_code"].cat.categories:
                df["item_code"] = df["item_code"].cat.add_categories(fix["new_item_code"])
            if fix["new_fao_item"] not in df[item_col].cat.categories:
                df[item_col] = df[item_col].cat.add_categories(fix["new_fao_item"])

            # Rewrite the faulty item code and item name.
            selection = (df["item_code"] == fix["item_code"]) & (df[item_col] == fix["fao_item"])
            df.loc[selection, ("item_code", item_col)] = (fix["new_item_code"], fix["new_fao_item"])

        # Drop categories that are no longer used after the amendments.
        df["item_code"] = df["item_code"].cat.remove_unused_categories()
        df[item_col] = df[item_col].cat.remove_unused_categories()

    return df
def prepare_dataset_description(fao_description: str, owid_description: str) -> str:
    """Combine the original FAO dataset description with an (optional) OWID description.

    Parameters
    ----------
    fao_description : str
        Original FAOSTAT dataset description.
    owid_description : str
        Optional OWID dataset description.

    Returns
    -------
    description : str
        Dataset description.
    """
    parts = []
    if len(owid_description) > 0:
        parts.append(owid_description + "\n\n")
    if len(fao_description) > 0:
        parts.append(f"Original dataset description by FAOSTAT:\n{fao_description}")

    # Remove empty spaces at the beginning and end.
    return "".join(parts).strip()
def remove_rows_with_nan_value(data: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """Drop rows for which column "value" is nan.

    Only the "value" column is considered: removing rows with a nan in any column would first
    require assigning a value to nan flags.

    Parameters
    ----------
    data : pd.DataFrame
        Data for current dataset.
    verbose : bool
        True to display information about the number and fraction of rows removed.

    Returns
    -------
    data : pd.DataFrame
        Data after removing nan values.

    """
    data = data.copy()
    n_rows_with_nan_value = len(data[data["value"].isnull()])
    if n_rows_with_nan_value == 0:
        # Nothing to drop; return the copy unchanged (original index included).
        return data

    frac_nan_rows = n_rows_with_nan_value / len(data)
    if verbose:
        log.info(f"Removing {n_rows_with_nan_value} rows ({frac_nan_rows: .2%}) " f"with nan in column 'value'.")
    # Warn if a suspiciously large fraction of the data is being dropped.
    if frac_nan_rows > 0.15:
        log.warning(f"{frac_nan_rows: .0%} rows of nan values removed.")

    return data.dropna(subset="value").reset_index(drop=True)
+ + In principle, it should not be possible that columns have only nan values, but we use this function just in case. + + Parameters + ---------- + data : pd.DataFrame + Data for current dataset. + verbose : bool + True to display information about the removal of columns with nan values. + + Returns + ------- + data : pd.DataFrame + Data after removing columns of nans. + + """ + data = data.copy() + # Remove columns that only have nans. + columns_of_nans = data.columns[data.isnull().all(axis=0)] + if len(columns_of_nans) > 0: + if verbose: + log.info( + f"Removing {len(columns_of_nans)} columns ({len(columns_of_nans) / len(data.columns): .2%}) " + f"that have only nans." + ) + data = data.drop(columns=columns_of_nans) + + return data + + +def remove_duplicates(data: pd.DataFrame, index_columns: List[str], verbose: bool = True) -> pd.DataFrame: + """Remove rows with duplicated index (country, year, item, element, unit). + + First attempt to use flags to remove duplicates. If there are still duplicates, remove in whatever way possible. + + Parameters + ---------- + data : pd.DataFrame + Data for current dataset. + index_columns : list + Columns expected to be used as index of the data. + verbose : bool + True to display a summary of the removed duplicates. + + Returns + ------- + data : pd.DataFrame + Data (with a dummy numerical index) after removing duplicates. + + """ + data = data.copy() + + # Select columns that will be used as indexes. + _index_columns = [column for column in index_columns if column in data.columns] + # Number of ambiguous indexes (those that have multiple data values). + n_ambiguous_indexes = len(data[data.duplicated(subset=_index_columns, keep="first")]) + if n_ambiguous_indexes > 0: + # Add flag ranking to dataset. 
+ flags_ranking = FLAGS_RANKING.copy() + flags_ranking["flag"] = flags_ranking["flag"].fillna(FLAG_OFFICIAL_DATA) + data = pd.merge( + data, + flags_ranking[["flag", "ranking"]].rename(columns={"ranking": "flag_ranking"}), + on="flag", + how="left", + ).astype({"flag": "category"}) + + # Number of ambiguous indexes that cannot be solved using flags. + n_ambiguous_indexes_unsolvable = len( + data[data.duplicated(subset=_index_columns + ["flag_ranking"], keep="first")] + ) + # Remove ambiguous indexes (those that have multiple data values). + # When possible, use flags to prioritise among duplicates. + data = data.sort_values(_index_columns + ["flag_ranking"]).drop_duplicates(subset=_index_columns, keep="first") + frac_ambiguous = n_ambiguous_indexes / len(data) + frac_ambiguous_solved_by_flags = 1 - (n_ambiguous_indexes_unsolvable / n_ambiguous_indexes) + if verbose: + log.info( + f"Removing {n_ambiguous_indexes} ambiguous indexes ({frac_ambiguous: .2%}). " + f"{frac_ambiguous_solved_by_flags: .2%} of ambiguities were solved with flags." + ) + + data = data.drop(columns=["flag_ranking"]) + + return data + + +def clean_year_column(year_column: pd.Series) -> pd.Series: + """Clean year column. + + Year is given almost always as an integer value. But sometimes (e.g. in the faostat_fs dataset) it is a range of + years (that differ by exactly 2 years, e.g. "2010-2012"). This function returns a series of integer years, which, in + the cases where the original year was a range, corresponds to the mean of the range. + + Parameters + ---------- + year_column : pd.Series + Original column of year values (which may be integer, or ranges of values). + + Returns + ------- + year_clean_series : pd.Series + Clean column of years, as integer values. 
+ + """ + year_clean = [] + for year in year_column: + if "-" in str(year): + year_range = year.split("-") + year_min = int(year_range[0]) + year_max = int(year_range[1]) + assert year_max - year_min == 2 + year_clean.append(year_min + 1) + else: + year_clean.append(int(year)) + + # Prepare series of integer year values. + year_clean_series = pd.Series(year_clean) + year_clean_series.name = "year" + + return year_clean_series + + +def add_custom_names_and_descriptions( + data: pd.DataFrame, items_metadata: pd.DataFrame, elements_metadata: pd.DataFrame +) -> pd.DataFrame: + """Add columns with custom names, descriptions and conversion factors for elements, items and units. + + The returned dataframe will have the same number of rows as the ingested data, but: + * Column 'element' will become the customized OWID element name. + * A new column 'fao_element' will be added, with the original FAO element name. + * A new column 'element_description' will be added, with the customized OWID element description. + * Column 'item' will become the customized OWID item name. + * A new column 'fao_item' will be added, with the original FAO item name. + * A new column 'item_description' will be added, with the customized OWID item description. + * Column 'unit' will become the customized OWID unit name (long version). + * A new column 'unit_short_name' will be added, with the customized OWID unit name (short version). + * A new column 'fao_unit_short_name' will be added, with the original FAO unit name (short version). + * A new column 'unit_factor' will be added, with the custom factor that values have to be multiplied by (which is + not done by this function). + + NOTE: + * Given that an item code can have different item names in different datasets, it is important that items_metadata + argument contains only item codes only for the relevant domain. For example, if data comes from the faostat_qcl + dataset, items_metadata should contain only items from that dataset. 
This can be achieved by selecting + `items_metadata["dataset"] == 'faostat_qcl']` before passing it to this function. + * The same applies to elements_metadata: For safety, it should only contain elements of the relevant domain. + + Parameters + ---------- + data : pd.DataFrame + Data for a particular domain, with harmonized item codes and element codes. + items_metadata : pd.DataFrame + Table 'items' from the garden faostat_metadata dataset, after selecting items for the current dataset. + elements_metadata : pd.DataFrame + Table 'elements' from the garden faostat_metadata dataset, after selecting elements for the current dataset. + + Returns + ------- + data : pd.DataFrame + Data after adding and editing its columns as described above. + + """ + data = data.copy() + + error = "There are missing item codes in metadata." + assert set(data["item_code"]) <= set(items_metadata["item_code"]), error + + error = "There are missing element codes in metadata." + assert set(data["element_code"]) <= set(elements_metadata["element_code"]), error + + _expected_n_rows = len(data) + data = pd.merge( + data.rename(columns={"item": "fao_item"}), + items_metadata[["item_code", "owid_item", "owid_item_description"]], + on="item_code", + how="left", + ) + assert len(data) == _expected_n_rows, "Something went wrong when merging data with items metadata." + + data = pd.merge( + data.rename(columns={"element": "fao_element", "unit": "fao_unit_short_name"}), + elements_metadata[ + [ + "element_code", + "owid_element", + "owid_unit", + "owid_unit_factor", + "owid_element_description", + "owid_unit_short_name", + ] + ], + on=["element_code"], + how="left", + ) + assert len(data) == _expected_n_rows, "Something went wrong when merging data with elements metadata." + + # `category` type was lost during merge, convert it back + data = data.astype( + { + "element_code": "category", + "item_code": "category", + } + ) + + # Remove "owid_" from column names. 
+ data = data.rename(columns={column: column.replace("owid_", "") for column in data.columns}) + + return data + + +def remove_regions_from_countries_regions_members( + countries_regions: pd.DataFrame, regions_to_remove: List[str] +) -> pd.DataFrame: + """Remove regions that have to be ignored from the lists of members in the countries-regions dataset. + + Parameters + ---------- + countries_regions : pd.DataFrame + Countries-regions dataset (from the OWID catalog). + regions_to_remove : list + Regions to ignore. + + Returns + ------- + countries_regions : pd.DataFrame + Countries-regions dataset after removing regions from the lists of members of each country or region. + + """ + countries_regions = countries_regions.copy() + countries_regions["members"] = countries_regions["members"].dropna().astype(str) + + # Get the owid code for each region that needs to be ignored when creating region aggregates. + regions_to_ignore_codes = [] + for region in set(regions_to_remove): + selected_region = countries_regions[countries_regions["name"] == region] + assert len(selected_region) == 1, f"Region {region} ambiguous or not found in countries_regions dataset." + regions_to_ignore_codes.append(selected_region.index[0]) + + # Remove those regions to ignore from lists of members of each region. + regions_mask = countries_regions["members"].notnull() + countries_regions.loc[regions_mask, "members"] = [ + json.dumps(list(set(json.loads(members)) - set(regions_to_ignore_codes))) + for members in countries_regions[regions_mask]["members"] + ] + + return countries_regions + + +def load_population() -> pd.DataFrame: + """Load OWID population dataset, and add historical regions to it. + + Returns + ------- + population : pd.DataFrame + Population dataset. + + """ + # Load population dataset. 
+ population = catalog.Dataset(DATA_DIR / "garden/owid/latest/key_indicators/")["population"].reset_index()[ + ["country", "year", "population"] + ] + + # Add data for historical regions (if not in population) by adding the population of its current successors. + countries_with_population = population["country"].unique() + missing_countries = [country for country in HISTORIC_TO_CURRENT_REGION if country not in countries_with_population] + for country in missing_countries: + members = HISTORIC_TO_CURRENT_REGION[country]["members"] + _population = ( + population[population["country"].isin(members)] + .groupby("year") + .agg({"population": "sum", "country": "nunique"}) + .reset_index() + ) + # Select only years for which we have data for all member countries. + _population = _population[_population["country"] == len(members)].reset_index(drop=True) + _population["country"] = country + population = pd.concat([population, _population], ignore_index=True).reset_index(drop=True) + + error = "Duplicate country-years found in population. Check if historical regions changed." + assert population[population.duplicated(subset=["country", "year"])].empty, error + + return cast(pd.DataFrame, population) + + +def load_countries_regions() -> pd.DataFrame: + """Load countries-regions dataset from the OWID catalog, and remove certain regions (defined in + REGIONS_TO_IGNORE_IN_AGGREGATES) from the lists of members of countries or regions. + + Returns + ------- + countries_regions : pd.DataFrame + Countries-regions dataset. + + """ + # Load dataset of countries and regions. + countries_regions = catalog.Dataset(DATA_DIR / "garden/regions/2023-01-01/regions")["regions"] + + countries_regions = remove_regions_from_countries_regions_members( + countries_regions, regions_to_remove=REGIONS_TO_IGNORE_IN_AGGREGATES + ) + + return cast(pd.DataFrame, countries_regions) + + +def load_income_groups() -> pd.DataFrame: + """Load dataset of income groups and add historical regions to it. 
+ + Returns + ------- + income_groups : pd.DataFrame + Income groups data. + + """ + # Load the WorldBank dataset for income grups. + income_groups = catalog.Dataset(DATA_DIR / "garden/wb/2021-07-01/wb_income")["wb_income_group"].reset_index() + + # Add historical regions to income groups. + for historic_region in HISTORIC_TO_CURRENT_REGION: + historic_region_income_group = HISTORIC_TO_CURRENT_REGION[historic_region]["income_group"] + if historic_region not in income_groups["country"]: + historic_region_df = pd.DataFrame( + { + "country": [historic_region], + "income_group": [historic_region_income_group], + } + ) + income_groups = pd.concat([income_groups, historic_region_df], ignore_index=True) + + return cast(pd.DataFrame, income_groups) + + +def list_countries_in_region(region: str, countries_regions: pd.DataFrame, income_groups: pd.DataFrame) -> List[str]: + """List all countries in a specific region or income group. + + Parameters + ---------- + region : str + Name of the region. + countries_regions : pd.DataFrame + Countries-regions dataset (after removing certain regions from the lists of members). + income_groups : pd.DataFrame + Dataset of income groups, which includes historical regions. + + Returns + ------- + countries_in_regions : list + List of countries in the given region or income group. + + """ + # Number of attempts to fetch countries regions data. + attempts = 5 + attempt = 0 + countries_in_region = list() + while attempt < attempts: + try: + # List countries in region. + countries_in_region = geo.list_countries_in_region( + region=region, + countries_regions=countries_regions, + income_groups=income_groups, + ) + break + except ConnectionResetError: + attempt += 1 + finally: + assert len(countries_in_region) > 0, "Unable to fetch countries-regions data." 
+ + return countries_in_region + + +def remove_overlapping_data_between_historical_regions_and_successors( + data_region: pd.DataFrame, +) -> pd.DataFrame: + """Remove overlapping data between a historical region and any of its successors (if there is any overlap), to avoid + double-counting those regions when aggregating data. + + Data for historical regions (e.g. USSR) could overlap with data of the successor countries (e.g. Russia). If this + happens, remove data (on the overlapping element-item-years) of the historical country. + + Parameters + ---------- + data_region : pd.DataFrame + Data (after selecting the countries of a certain relevant region). + + Returns + ------- + data_region : pd.DataFrame + Data after removing data with overlapping regions. + + """ + data_region = data_region.copy() + + columns = ["item_code", "element_code", "year"] + indexes_to_drop = [] + for historical_region in HISTORIC_TO_CURRENT_REGION: + # Successors of the current historical region. + historical_successors = HISTORIC_TO_CURRENT_REGION[historical_region]["members"] + # Unique combinations of item codes, element codes, and years for which historical region has data. + historical_region_years = data_region[(data_region["country"] == historical_region)][columns].drop_duplicates() + # Unique combinations of item codes, element codes, and years for which successors have data. + historical_successors_years = data_region[(data_region["country"].isin(historical_successors))][ + columns + ].drop_duplicates() + # Find unique years where the above combinations of item-element-years of region and successors overlap. + overlapping_years = pd.concat([historical_region_years, historical_successors_years], ignore_index=True) + overlapping_years = overlapping_years[overlapping_years.duplicated()] + if not overlapping_years.empty: + log.warning( + f"Removing rows where historical region {historical_region} overlaps with its successors " + f"(years {sorted(set(overlapping_years['year']))})." 
+ ) + # Select rows in data_region to drop. + overlapping_years["country"] = historical_region + indexes_to_drop.extend( + pd.merge( + data_region.reset_index(), + overlapping_years, + how="inner", + on=["country"] + columns, + )["index"].tolist() + ) + + if len(indexes_to_drop) > 0: + # Remove rows of data of the historical region where its data overlaps with data from its successors. + data_region = data_region.drop(index=indexes_to_drop) + + return data_region + + +def add_regions(data: pd.DataFrame, elements_metadata: pd.DataFrame) -> pd.DataFrame: + """Add region aggregates (i.e. aggregate data for continents and income groups). + + Regions to be created are defined above, in REGIONS_TO_ADD, and the variables for which data will be aggregated are + those that, in the custom_elements_and_units.csv file, have a non-empty 'owid_aggregation' field (usually with + 'sum', or 'mean'). The latter field determines the type of aggregation to create. + + Historical regions (if any) will be included in the aggregations, after ensuring that there is no overlap between + the data for the region, and the data of any of its successor countries (for each item-element-year). + + Parameters + ---------- + data : pd.DataFrame + Clean data (after harmonizing items, element and countries). + elements_metadata : pd.DataFrame + Table 'elements' from the garden faostat_metadata dataset, after selecting elements for the current domain. + + Returns + ------- + data : pd.DataFrame + Data after adding rows for aggregate regions. + + """ + data = data.copy() + + # Create a dictionary of aggregations, specifying the operation to use when creating regions. + # These aggregations are defined in the custom_elements_and_units.csv file, and added to the metadata dataset. 
+ aggregations = ( + elements_metadata[(elements_metadata["owid_aggregation"].notnull())] + .set_index("element_code") + .to_dict()["owid_aggregation"] + ) + if len(aggregations) > 0: + log.info("add_regions", shape=data.shape) + + # Load population dataset, countries-regions, and income groups datasets. + population = load_population() + countries_regions = load_countries_regions() + income_groups = load_income_groups() + + # Invert dictionary of aggregations to have the aggregation as key, and the list of element codes as value. + aggregations_inverted = { + unique_value: pd.unique([item for item, value in aggregations.items() if value == unique_value]).tolist() + for unique_value in aggregations.values() + } + for region in tqdm(REGIONS_TO_ADD, file=sys.stdout): + countries_in_region = list_countries_in_region( + region, countries_regions=countries_regions, income_groups=income_groups + ) + region_code = REGIONS_TO_ADD[region]["area_code"] + region_population = population[population["country"] == region][["year", "population"]].reset_index( + drop=True + ) + region_min_frac_population_with_data = REGIONS_TO_ADD[region]["min_frac_population_with_data"] + for aggregation in aggregations_inverted: + # List of element codes for which the same aggregate method (e.g. "sum") will be applied. + element_codes = aggregations_inverted[aggregation] + + # Select relevant rows in the data. + data_region = data[ + (data["country"].isin(countries_in_region)) & (data["element_code"].isin(element_codes)) + ] + + # Ensure there is no overlap between historical regions and their successors. 
+ data_region = remove_overlapping_data_between_historical_regions_and_successors(data_region) + + if len(data_region) > 0: + data_region = ( + dataframes.groupby_agg( + df=data_region.dropna(subset="value"), + groupby_columns=[ + "year", + "item_code", + "element_code", + "item", + "element", + "fao_element", + "fao_item", + "item_description", + "unit", + "unit_short_name", + "fao_unit_short_name", + "element_description", + ], + num_allowed_nans=None, + frac_allowed_nans=None, + aggregations={ + "value": aggregation, + "flag": lambda x: x if len(x) == 1 else FLAG_MULTIPLE_FLAGS, + "population_with_data": "sum", + }, + ) + .reset_index() + .dropna(subset="element") + ) + + # Add total population of the region (for each year) to the relevant data. + data_region = pd.merge(data_region, region_population, on="year", how="left") + + # Keep only rows for which we have sufficient data. + data_region = data_region[ + (data_region["population_with_data"] / data_region["population"]) + >= region_min_frac_population_with_data + ].reset_index(drop=True) + + # Add region's name and area code. + data_region["country"] = region + data_region["area_code"] = region_code + + # Use category type which is more efficient than using strings + data_region = data_region.astype( + { + "flag": "category", + "country": "category", + } + ) + + # Add data for current region to data. + data = dataframes.concatenate( + [data[data["country"] != region], data_region], + ignore_index=True, + ) + + # Check that the fraction of population with data is as high as expected. + frac_population = data["population_with_data"] / data["population"] + assert frac_population[frac_population.notnull()].min() >= region_min_frac_population_with_data + + # Drop column of total population (we will still keep population_with_data). + data = data.drop(columns=["population"]) + + # Make area_code of category type (it contains integers and strings, and feather does not support object types). 
+ data["area_code"] = data["area_code"].astype(str).astype("category") + + # Sort conveniently. + data = data.sort_values(["country", "year"]).reset_index(drop=True) + + check_that_countries_are_well_defined(data) + + return data + + +def add_fao_population_if_given(data: pd.DataFrame) -> pd.DataFrame: + """Add a new column for FAO population, if population values are given in the data. + + Some datasets (e.g. faostat_fbsh and faostat_fbs) include per-capita variables from the beginning. When this + happens, FAO population may be given as another item-element. To be able to convert those per-capita variables into + total values, we need to extract that population data and make it a new column. + + Parameters + ---------- + data : pd.DataFrame + Data (after harmonizing elements and items, but before harmonizing countries). + + Returns + ------- + data : pd.DataFrame + Data, after adding a column 'fao_population', if FAO population was found in the data. + + """ + # Select rows that correspond to FAO population. + fao_population_item_name = "Population" + fao_population_element_name = "Total Population - Both sexes" + population_rows_mask = (data["fao_item"] == fao_population_item_name) & ( + data["fao_element"] == fao_population_element_name + ) + + if population_rows_mask.any(): + data = data.copy() + + fao_population = data[population_rows_mask].reset_index(drop=True) + + # Check that population is given in "1000 persons" and convert to persons. + assert list(fao_population["unit"].unique()) == ["1000 persons"], "FAO population may have changed units." + fao_population["value"] *= 1000 + + # Note: Here we will dismiss the flags related to population. But they are only relevant for those columns + # that were given as per capita variables. + fao_population = ( + fao_population[["area_code", "year", "value"]] + .drop_duplicates() + .dropna(how="any") + .rename(columns={"value": "fao_population"}) + ) + + # Add FAO population as a new column in data. 
+ data = pd.merge(data, fao_population, how="left", on=["area_code", "year"]) + + return data + + +def add_population( + df: pd.DataFrame, + country_col: str = "country", + year_col: str = "year", + population_col: str = "population", + warn_on_missing_countries: bool = True, + show_full_warning: bool = True, +) -> pd.DataFrame: + """Add a column of OWID population to the countries in the data, including population of historical regions. + + This function has been adapted from datautils.geo, because population currently does not include historic regions. + We include them in this function. + + Parameters + ---------- + df : pd.DataFrame + Data without a column for population (after harmonizing elements, items and country names). + country_col : str + Name of country column in data. + year_col : str + Name of year column in data. + population_col : str + Name for new population column in data. + warn_on_missing_countries : bool + True to warn if population is not found for any of the countries in the data. + show_full_warning : bool + True to show affected countries if the previous warning is raised. + + Returns + ------- + df_with_population : pd.DataFrame + Data after adding a column for population for all countries in the data. + + """ + + # Load population dataset. + population = load_population().rename( + columns={ + "country": country_col, + "year": year_col, + "population": population_col, + } + )[[country_col, year_col, population_col]] + + # Check if there is any missing country. + missing_countries = set(df[country_col]) - set(population[country_col]) + if len(missing_countries) > 0: + if warn_on_missing_countries: + geo.warn_on_list_of_entities( + list_of_entities=missing_countries, + warning_message=( + f"{len(missing_countries)} countries not found in population" + " dataset. They will remain in the dataset, but have nan" + " population." + ), + show_list=show_full_warning, + ) + + # Add population to original dataframe. 
+ df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left") + + return df_with_population + + +def convert_variables_given_per_capita_to_total_value( + data: pd.DataFrame, elements_metadata: pd.DataFrame +) -> pd.DataFrame: + """Replace variables given per capita in the original data by total values. + + NOTE: + * Per-capita variables to be replaced by their total values are those with 'was_per_capita' equal to 1 in the + custom_elements_and_units.csv file. + * The new variables will have the same element codes as the original per-capita variables. + + Parameters + ---------- + data : pd.DataFrame + Data (after harmonizing elements and items, but before harmonizing countries). + elements_metadata : pd.DataFrame + Table 'elements' from the garden faostat_metadata dataset, after selecting the elements of the relevant domain. + + Returns + ------- + data : pd.DataFrame + Data, after converting per-capita variables to total value. + + """ + # Select element codes that were originally given as per capita variables (if any), and, if FAO population is + # given, make them total variables instead of per capita. + # All variables in the custom_elements_and_units.csv file with "was_per_capita" True will be converted into + # total (non-per-capita) values. + element_codes_that_were_per_capita = list( + elements_metadata[elements_metadata["was_per_capita"]]["element_code"].unique() + ) + if len(element_codes_that_were_per_capita) > 0: + data = data.copy() + + assert "fao_population" in data.columns, "fao_population not found, maybe it changed item, element." + + # Select variables that were given as per capita variables in the original data and that need to be converted. + per_capita_mask = data["element_code"].isin(element_codes_that_were_per_capita) + + # Multiply them by the FAO population to convert them into total value. 
+ data.loc[per_capita_mask, "value"] = data[per_capita_mask]["value"] * data[per_capita_mask]["fao_population"] + + # Include an additional description to all elements that were converted from per capita to total variables. + if "" not in data["element_description"].cat.categories: + data["element_description"] = data["element_description"].cat.add_categories([""]) + data.loc[per_capita_mask, "element_description"] = data.loc[per_capita_mask, "element_description"].fillna("") + data["element_description"] = dataframes.apply_on_categoricals( + [data.element_description, per_capita_mask.astype("category")], + lambda desc, mask: f"{desc} {WAS_PER_CAPITA_ADDED_ELEMENT_DESCRIPTION}".lstrip() if mask else f"{desc}", + ) + + return data + + +def add_per_capita_variables(data: pd.DataFrame, elements_metadata: pd.DataFrame) -> pd.DataFrame: + """Add per-capita variables to data in a long format (and keep original variables as well). + + NOTE: + * Variables for which new per-capita rows will be created are those with 'make_per_capita' equal to 1 in the + custom_elements_and_units.csv file. + * The new variables will have the same element codes as the original per-capita variables, with 'pc' appended to + the number. + + Parameters + ---------- + data : pd.DataFrame + Clean data (after harmonizing item codes and element codes, and countries, and adding aggregate regions). + elements_metadata : pd.DataFrame + Elements table from the garden faostat_metadata dataset, after selecting elements for the relevant domain. + + Returns + ------- + data : pd.DataFrame + Data with per-capita variables. + + """ + data = data.copy() + + # Find element codes that have to be made per capita. + element_codes_to_make_per_capita = list( + elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() + ) + if len(element_codes_to_make_per_capita) > 0: + log.info("add_per_capita_variables", shape=data.shape) + + # Create a new dataframe that will have all per capita variables. 
+ per_capita_data = data[data["element_code"].isin(element_codes_to_make_per_capita)].reset_index(drop=True) + + # Change element codes of per capita variables. + per_capita_data["element_code"] = per_capita_data["element_code"].cat.rename_categories( + lambda c: (c.lstrip("0") + "pc").zfill(N_CHARACTERS_ELEMENT_CODE) + ) + + # Create a mask that selects FAO regions (regions that, in the countries.json file, were not harmonized, and + # have '(FAO)' at the end of the name). + fao_regions_mask = per_capita_data["country"].str.contains("(FAO)", regex=False) + # Create a mask that selects all other regions (i.e. harmonized countries). + owid_regions_mask = ~fao_regions_mask + + # Create per capita variables for FAO regions (this can only be done if a column for FAO population is given). + if "fao_population" in per_capita_data.columns: + per_capita_data.loc[fao_regions_mask, "value"] = ( + per_capita_data[fao_regions_mask]["value"] / per_capita_data[fao_regions_mask]["fao_population"] + ) + else: + # Per capita variables can't be created for FAO regions, since we don't have FAO population. + # Remove these regions from the per capita dataframe; only OWID harmonized countries will be kept. + per_capita_data = per_capita_data[~fao_regions_mask].reset_index(drop=True) + owid_regions_mask = np.ones(len(per_capita_data), dtype=bool) + + # Add per capita values to all other regions that are not FAO regions. + per_capita_data.loc[owid_regions_mask, "value"] = ( + per_capita_data[owid_regions_mask]["value"] / per_capita_data[owid_regions_mask]["population_with_data"] # type: ignore + ) + + # Remove nans (which may have been created because of missing FAO population). + per_capita_data = per_capita_data.dropna(subset="value").reset_index(drop=True) # type: ignore + + # Add "per capita" to all units. + per_capita_data["unit"] = per_capita_data["unit"].cat.rename_categories(lambda c: f"{c} per capita") + # Include an additional note in the description on affected elements. 
+ per_capita_data["element_description"] = per_capita_data["element_description"].cat.rename_categories( + lambda c: f"{c} {NEW_PER_CAPITA_ADDED_ELEMENT_DESCRIPTION}" + ) + # Add new rows with per capita variables to data. + data = dataframes.concatenate([data, per_capita_data], ignore_index=True).reset_index(drop=True) + + return data + + +def clean_data_values(values: pd.Series, amendments: Dict[str, str]) -> pd.Series: + """Fix spurious data values (defined in value_amendments.csv) and make values a float column. + + Parameters + ---------- + values : pd.Series + Content of the "value" column in the original data. + + Returns + ------- + values_clean : pd.Series + Original values after fixing known issues and converting to float. + + """ + values_clean = values.copy() + if len(amendments) > 0: + values_clean = dataframes.map_series( + series=values_clean, + mapping=amendments, + warn_on_missing_mappings=False, + warn_on_unused_mappings=True, + show_full_warning=True, + ) + + # Convert all numbers into numeric. + # Note: If this step fails with a ValueError, it may be because other spurious values have been introduced. + # If so, add them to value_amendments.csv and re-run faostat_metadata. + values_clean = values_clean.astype(float) + + return values_clean + + +def clean_data( + data: pd.DataFrame, + items_metadata: pd.DataFrame, + elements_metadata: pd.DataFrame, + countries_metadata: pd.DataFrame, + amendments: Dict[str, str], +) -> pd.DataFrame: + """Process data (with already harmonized item codes and element codes), before adding aggregate regions and + per-capita variables. + + NOTE: + * Given that an item code can have different item names in different datasets, it is important that items_metadata + argument contains only item codes only for the relevant domain. For example, if data comes from the faostat_qcl + dataset, items_metadata should contain only items from that dataset. 
This can be achieved by selecting + `items_metadata["dataset"] == 'faostat_qcl']` before passing it to this function. + * The same applies to elements_metadata: For safety, it should only contain elements of the relevant domain. + + Parameters + ---------- + data : pd.DataFrame + Unprocessed data for current dataset (with harmonized item codes and element codes). + items_metadata : pd.DataFrame + Items metadata (from the metadata dataset) after selecting items for only the relevant domain. + elements_metadata : pd.DataFrame + Elements metadata (from the metadata dataset) after selecting elements for only the relevant domain. + countries_metadata : pd.DataFrame + Countries metadata (from the metadata dataset). + amendments : dict + Value amendments (if any). + + Returns + ------- + data : pd.DataFrame + Processed data, ready to be made into a table for a garden dataset. + + """ + data = data.copy() + + # Fix spurious data values (applying mapping in value_amendments.csv) and ensure column of values is float. + data["value"] = clean_data_values(data["value"], amendments=amendments) + + # Convert nan flags into "official" (to avoid issues later on when dealing with flags). + data["flag"] = pd.Series( + [flag if not pd.isnull(flag) else FLAG_OFFICIAL_DATA for flag in data["flag"]], + dtype="category", + ) + + # Some datasets (at least faostat_fa) use "recipient_country" instead of "area". For consistency, change this. + data = data.rename( + columns={ + "area": "fao_country", + "recipient_country": "fao_country", + "recipient_country_code": "area_code", + } + ) + + # Dataset faostat_wcad doesn't have a year column, but a "census_year", which has intervals like "2002-2003" + if "census_year" in data.columns: + if data["census_year"].astype(str).str.contains("-.{4}/", regex=True).any(): + log.warning( + "Column 'census_year' in dataset 'faostat_wcad' contains values that need to be properly analysed " + "and processed, e.g. 1976-1977/1980-1981. 
For the moment, we take the first 4 digits as the year." + ) + + # Remove rows that don't have a census year, and take the first 4 digits + data = data.dropna(subset="census_year").reset_index(drop=True) + data["year"] = data["census_year"].astype(str).str[0:4].astype(int) + + # Ensure year column is integer (sometimes it is given as a range of years, e.g. 2013-2015). + data["year"] = clean_year_column(data["year"]) + + # Remove rows with nan value. + data = remove_rows_with_nan_value(data) + + # Use custom names for items, elements and units (and keep original names in "fao_*" columns). + data = add_custom_names_and_descriptions(data, items_metadata, elements_metadata) + + # Multiply data values by their corresponding unit factor, if any was given, and then drop unit_factor column. + unit_factor_mask = data["unit_factor"].notnull() + data.loc[unit_factor_mask, "value"] = data[unit_factor_mask]["value"] * data[unit_factor_mask]["unit_factor"] + data = data.drop(columns=["unit_factor"]) + + # Add FAO population as an additional column (if given in the original data). + data = add_fao_population_if_given(data) + + # Convert variables that were given per-capita to total value. + data = convert_variables_given_per_capita_to_total_value(data, elements_metadata=elements_metadata) + + # Harmonize country names. + data = harmonize_countries(data=data, countries_metadata=countries_metadata) + + # Remove duplicated data points (if any) keeping the one with lowest ranking flag (i.e. highest priority). + data = remove_duplicates( + data=data, + index_columns=["area_code", "year", "item_code", "element_code"], + verbose=True, + ) + + # Add column for population; when creating region aggregates, this column will have the population of the countries + # for which there was data. For example, for Europe in a specific year, the population may differ from item to item, + # because for one item we may have more European countries informed than for the other. 
+ data = add_population(df=data, population_col="population_with_data", warn_on_missing_countries=False) + + # Convert back to categorical columns (maybe this should be handled automatically in `add_population_to_dataframe`) + data = data.astype({"country": "category"}) + + return data + + +def optimize_table_dtypes(table: catalog.Table) -> catalog.Table: + """Optimize the dtypes of the columns in a table. + + NOTE: Using `.astype` in a loop over different columns is slow. Instead, it is better to map all columns at once or + call `repack_frame` with dtypes arg + + Parameters + ---------- + table : catalog.Table + Table with possibly non-optimal column dtypes. + + Returns + ------- + optimized_table : catalog.Table + Table with optimized dtypes. + + """ + dtypes = {c: "category" for c in ["area_code", "item_code", "element_code"] if c in table.columns} + + # Store variables metadata before optimizing table dtypes (otherwise they will be lost). + variables_metadata = {variable: table[variable].metadata for variable in table.columns} + + optimized_table = repack.repack_frame(table, dtypes=dtypes) + + # Recover variable metadata (that was lost when optimizing table dtypes). + for variable in variables_metadata: + optimized_table[variable].metadata = variables_metadata[variable] + + return optimized_table + + +def prepare_long_table(data: pd.DataFrame) -> catalog.Table: + """Prepare a data table in long format. + + Parameters + ---------- + data : pd.DataFrame + Data (as a dataframe) in long format. + + Returns + ------- + data_table_long : catalog.Table + Data (as a table) in long format. + + """ + # Create new table with long data. + data_table_long = catalog.Table(data) + + # Ensure table has the optimal dtypes before storing it as feather file. + data_table_long = optimize_table_dtypes(table=data_table_long) + + # Set appropriate indexes. 
+ index_columns = ["area_code", "year", "item_code", "element_code"] + data_table_long = data_table_long.set_index(index_columns, verify_integrity=True).sort_index() + + # Sanity check. + number_of_infinities = len(data_table_long[data_table_long["value"] == np.inf]) + assert number_of_infinities == 0, f"There are {number_of_infinities} infinity values in the long table." + + return cast(catalog.Table, data_table_long) + + +def create_variable_short_names(variable_name: str) -> str: + """Create lower-snake-case short names for the columns in the wide (flatten) output table, ensuring that they are + not too long (to avoid issues when inserting variable in grapher). + + If a new name is too long, the ending of the item name will be reduced. + If the item name is not long enough to solve the problem, this function will raise an assertion error. + + Parameters + ---------- + variable_name : str + Variable name. + + Returns + ------- + new_name : str + New variable name. + + """ + # Extract all the necessary fields from the variable name. + item, item_code, element, element_code, unit = variable_name.replace("||", "|").split(" | ") + + # Check that the extraction was correct by constructing the variable name again and comparing with the original. + assert variable_name == f"{item} | {item_code} || {element} | {element_code} || {unit}" + + new_name = catalog.utils.underscore(variable_name) + + # Check that the number of characters of the short name is not too long. + n_char = len(new_name) + if n_char > 255: + # This name will cause an issue when uploading to grapher (because of a limit of 255 characters in short name). + # Remove the extra characters from the ending of the item name (if possible). + n_char_to_be_removed = n_char - 255 + # It could happen that it is not the item name that is long, but the element name, dataset, or unit. + # But for the moment, assume it is the item name. 
+ assert len(item) > n_char_to_be_removed, "Variable name is too long, but it is not due to item name." + new_item = catalog.utils.underscore(item)[0:-n_char_to_be_removed] + new_name = catalog.utils.underscore(f"{new_item} | {item_code} || {element} | {element_code} || {unit}") + + # Check that now the new name now fulfils the length requirement. + error = "Variable short name is too long. Improve create_variable_names function to account for this case." + assert len(new_name) <= 255, error + + return cast(str, new_name) + + +def prepare_wide_table(data: pd.DataFrame) -> catalog.Table: + """Flatten a long table to obtain a wide table with ["country", "year"] as index. + + The input table will be pivoted to have [country, year] as index, and as many columns as combinations of + item-element-unit entities. + + Parameters + ---------- + data : pd.DataFrame + Data for current domain. + + Returns + ------- + wide_table : catalog.Table + Data table with index [country, year]. + + """ + data = data.copy(deep=False) + + # Ensure "item" exists in data (there are some datasets where it may be missing). + if "item" not in data.columns: + data["item"] = "" + + # Construct a variable name that will not yield any possible duplicates. + # This will be used as column names (which will then be formatted properly with underscores and lower case), + # and also as the variable titles in grapher. + # Also, for convenience, keep a similar structure as in the previous OWID dataset release. + # Finally, ensure that the short name version of the variable is not too long + # (which would cause issues when uploading to grapher). + data["variable_name"] = dataframes.apply_on_categoricals( + [data.item, data.item_code, data.element, data.element_code, data.unit], + lambda item, + item_code, + element, + element_code, + unit: f"{item} | {item_code} || {element} | {element_code} || {unit}", + ) + + # Construct a human-readable variable display name (which will be shown in grapher charts). 
+ data["variable_display_name"] = dataframes.apply_on_categoricals( + [data.item, data.element, data.unit], + lambda item, element, unit: f"{item} - {element} ({unit})", + ) + + # Construct a human-readable variable description (for the variable metadata). + data["variable_description"] = dataframes.apply_on_categoricals( + [data.item, data.element, data.item_description, data.element_description], + prepare_variable_description, + ) + + # Pivot over long dataframe to generate a wide dataframe with country-year as index, and as many columns as + # unique elements in "variable_name" (which should be as many as combinations of item-elements). + # Note: We include area_code in the index for completeness, but by construction country-year should not have + # duplicates. + # Note: `pivot` operation is usually faster on categorical columns + log.info("prepare_wide_table.pivot", shape=data.shape) + # Create a wide table with just the data values. + wide_table = catalog.Table( + data.pivot( + index=["area_code", "country", "year"], + columns=["variable_name"], + values="value", + ) + ) + + # Add metadata to each new variable in the wide data table. + log.info("prepare_wide_table.adding_metadata", shape=wide_table.shape) + + # Add variable name. + for column in wide_table.columns: + wide_table[column].metadata.title = column + + # Add variable unit (long name). + variable_name_mapping = _variable_name_map(data, "unit") + for column in wide_table.columns: + wide_table[column].metadata.unit = variable_name_mapping[column] + + # Add variable unit (short name). + variable_name_mapping = _variable_name_map(data, "unit_short_name") + for column in wide_table.columns: + wide_table[column].metadata.short_unit = variable_name_mapping[column] + + # Add variable description. 
+ variable_name_mapping = _variable_name_map(data, "variable_description") + for column in wide_table.columns: + wide_table[column].metadata.description = variable_name_mapping[column] + + # Add display parameters (for grapher). + for column in wide_table.columns: + wide_table[column].metadata.display = {} + + # Display name. + variable_name_mapping = _variable_name_map(data, "variable_display_name") + for column in wide_table.columns: + wide_table[column].metadata.display["name"] = variable_name_mapping[column] + + # Ensure columns have the optimal dtypes, but codes are categories. + log.info("prepare_wide_table.optimize_table_dtypes", shape=wide_table.shape) + wide_table = optimize_table_dtypes(table=wide_table.reset_index()) + + # Sort columns and rows conveniently. + wide_table = wide_table.set_index(["country", "year"], verify_integrity=True) + wide_table = wide_table[["area_code"] + sorted([column for column in wide_table.columns if column != "area_code"])] + wide_table = wide_table.sort_index(level=["country", "year"]).sort_index() + + # Make all column names snake_case. + variable_to_short_name = { + column: create_variable_short_names(variable_name=wide_table[column].metadata.title) + for column in wide_table.columns + if wide_table[column].metadata.title is not None + } + wide_table = wide_table.rename(columns=variable_to_short_name, errors="raise") + + # Sanity check. + number_of_infinities = np.isinf(wide_table.select_dtypes(include=np.number).fillna(0)).values.sum() + assert number_of_infinities == 0, f"There are {number_of_infinities} infinity values in the wide table." + + return wide_table + + +def _variable_name_map(data: pd.DataFrame, column: str) -> Dict[str, str]: + """Extract map {variable name -> column} from dataframe and make sure it is unique (i.e. 
ensure that one variable + does not map to two distinct values).""" + pivot = data.dropna(subset=[column]).groupby(["variable_name"], observed=True)[column].apply(set) + assert all(pivot.map(len) == 1) + return pivot.map(lambda x: list(x)[0]).to_dict() # type: ignore + + +def parse_amendments_table(amendments: catalog.Table, dataset_short_name: str): + amendments = pd.DataFrame(amendments).reset_index() + # Create a dictionary mapping spurious values to amended values. + amendments = ( + amendments[amendments["dataset"] == dataset_short_name] + .drop(columns="dataset") + .set_index("spurious_value") + .to_dict()["new_value"] + ) + # For some reason, empty values are loaded in the table as None. Change them to nan. + amendments = {old: new if new is not None else np.nan for old, new in amendments.items()} + + return amendments + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Fetch the dataset short name from dest_dir. + dataset_short_name = Path(dest_dir).name + + # Define path to current step file. + current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") + + # Get paths and naming conventions for current data step. + paths = PathFinder(current_step_file.as_posix()) + + # Load latest meadow dataset and keep its metadata. + ds_meadow: catalog.Dataset = paths.load_dependency(dataset_short_name) + # Load main table from dataset. + tb_meadow = ds_meadow[dataset_short_name] + data = pd.DataFrame(tb_meadow).reset_index() + + # Load dataset of FAOSTAT metadata. + metadata: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_metadata") + + # Load dataset, items, element-units, countries metadata, and value amendments. 
+ dataset_metadata = pd.DataFrame(metadata["datasets"]).loc[dataset_short_name].to_dict() + items_metadata = pd.DataFrame(metadata["items"]).reset_index() + items_metadata = items_metadata[items_metadata["dataset"] == dataset_short_name].reset_index(drop=True) + elements_metadata = pd.DataFrame(metadata["elements"]).reset_index() + elements_metadata = elements_metadata[elements_metadata["dataset"] == dataset_short_name].reset_index(drop=True) + countries_metadata = pd.DataFrame(metadata["countries"]).reset_index() + amendments = parse_amendments_table(amendments=metadata["amendments"], dataset_short_name=dataset_short_name) + + # + # Process data. + # + # Harmonize items and elements, and clean data. + data = harmonize_items(df=data, dataset_short_name=dataset_short_name) + data = harmonize_elements(df=data) + + # Prepare data. + data = clean_data( + data=data, + items_metadata=items_metadata, + elements_metadata=elements_metadata, + countries_metadata=countries_metadata, + amendments=amendments, + ) + + # Add data for aggregate regions. + data = add_regions(data=data, elements_metadata=elements_metadata) + + # Add per-capita variables. + data = add_per_capita_variables(data=data, elements_metadata=elements_metadata) + + # Handle detected anomalies in the data. + data, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, data=data) + + # Create a long table (with item code and element code as part of the index). + data_table_long = prepare_long_table(data=data) + + # Create a wide table (with only country and year as index). + data_table_wide = prepare_wide_table(data=data) + + # + # Save outputs. + # + # Update tables metadata. 
+ data_table_long.metadata.short_name = dataset_short_name + data_table_long.metadata.title = dataset_metadata["owid_dataset_title"] + data_table_wide.metadata.short_name = f"{dataset_short_name}_flat" + data_table_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE + + # Initialise new garden dataset. + ds_garden = create_dataset( + dest_dir=dest_dir, tables=[data_table_long, data_table_wide], default_metadata=ds_meadow.metadata + ) + # Update dataset metadata. + # Add description of anomalies (if any) to the dataset description. + ds_garden.metadata.description = dataset_metadata["owid_dataset_description"] + anomaly_descriptions + ds_garden.metadata.title = dataset_metadata["owid_dataset_title"] + + # Update the main source's metadata description (which will be shown in charts). + ds_garden.metadata.sources[0].description = ds_garden.metadata.description + + # Create garden dataset. + ds_garden.save() diff --git a/etl/steps/data/garden/faostat/2024-03-14/value_amendments.csv b/etl/steps/data/garden/faostat/2024-03-14/value_amendments.csv new file mode 100644 index 00000000000..2848b3da2c6 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/value_amendments.csv @@ -0,0 +1,8 @@ +dataset,spurious_value,new_value +faostat_fs,"<0.1","0.1" +faostat_fs,"<2.5","2.5" +faostat_fs,"<0.5","0.5" +faostat_sdgb,"<0.1","0.1" +faostat_sdgb,"<0.5","0.5" +faostat_sdgb,"<2.5","2.5" +faostat_sdgb,"<100","100" diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py index 6140fe64de6..4990dcf0fa5 100644 --- a/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py @@ -186,7 +186,7 @@ def run(dest_dir: str) -> None: paths = PathFinder(current_step_file.as_posix()) # Load snapshot. 
- snapshot = paths.load_dependency(short_name=dataset_short_name + ".json", channel="snapshot") + snapshot = paths.load_snapshot() additional_metadata = load_json(snapshot.path) # diff --git a/etl/steps/data/meadow/faostat/2024-03-14/shared.py b/etl/steps/data/meadow/faostat/2024-03-14/shared.py index 09488c04043..9fdd9e12f9b 100644 --- a/etl/steps/data/meadow/faostat/2024-03-14/shared.py +++ b/etl/steps/data/meadow/faostat/2024-03-14/shared.py @@ -153,7 +153,7 @@ def run(dest_dir: str) -> None: paths = PathFinder(current_step_file.as_posix()) # Load snapshot. - snapshot = paths.load_dependency(short_name=dataset_short_name + ".zip", channel="snapshot") + snapshot = paths.load_snapshot() df_snapshot = load_data(snapshot.path) # From 273de4f6633d46dc973cd3eeb39e3a5133f8b6ef Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 14 Mar 2024 13:09:57 +0100 Subject: [PATCH 08/54] Fix issues with item code changes in faostat_fs --- etl/scripts/faostat/create_chart_revisions.py | 18 ++++++++---------- etl/scripts/faostat/shared.py | 9 +++++++++ .../faostat/2024-03-14/faostat_metadata.py | 8 ++++---- .../data/garden/faostat/2024-03-14/shared.py | 6 +++--- .../faostat/2024-03-14/value_amendments.csv | 3 +++ 5 files changed, 27 insertions(+), 17 deletions(-) diff --git a/etl/scripts/faostat/create_chart_revisions.py b/etl/scripts/faostat/create_chart_revisions.py index f040bc5c759..3bfd1979ea9 100644 --- a/etl/scripts/faostat/create_chart_revisions.py +++ b/etl/scripts/faostat/create_chart_revisions.py @@ -21,8 +21,14 @@ from etl import db from etl.chart_revision.v1.revision import create_and_submit_charts_revisions from etl.paths import DATA_DIR -from etl.scripts.faostat.shared import NAMESPACE +from etl.scripts.faostat.shared import ( + N_CHARACTERS_ELEMENT_CODE, + N_CHARACTERS_ITEM_CODE, + N_CHARACTERS_ITEM_CODE_EXTENDED, + NAMESPACE, +) +# Initialize logger. log = get_logger() # Channel from which the dataset versions and variables will be loaded. 
@@ -31,14 +37,6 @@ # Columns to not take as variables. COLUMNS_TO_IGNORE = ["country", "year", "index"] -# WARNING: These definitions should coincide with those given in the shared module of the garden step. -# So we will convert it into a string of this number of characters (integers will be prepended with zeros). -N_CHARACTERS_ITEM_CODE = 8 -# Idem for faostat_sdgb (that has different item codes). -N_CHARACTERS_ITEM_CODE_SDGB = 14 -# Maximum number of characters for element_code (integers will be prepended with zeros). -N_CHARACTERS_ELEMENT_CODE = 6 - # This regex should extract item codes and element codes, which are made of numbers, sometimes "pc" # (for per capita variables), and "M" and "F" (for male and female, only for certain domains, like fs and sdgb). REGEX_TO_EXTRACT_ITEM_AND_ELEMENT = ( @@ -46,7 +44,7 @@ ) # Idem for faostat_sdgb. REGEX_TO_EXTRACT_ITEM_AND_ELEMENT_SDGB = ( - rf".*([0-9A-Z]{{{N_CHARACTERS_ITEM_CODE_SDGB}}}).*([0-9pcMF]{{{N_CHARACTERS_ELEMENT_CODE}}})" + rf".*([0-9A-Z]{{{N_CHARACTERS_ITEM_CODE_EXTENDED}}}).*([0-9pcMF]{{{N_CHARACTERS_ELEMENT_CODE}}})" ) diff --git a/etl/scripts/faostat/shared.py b/etl/scripts/faostat/shared.py index 6ce58a94956..6230e2c69c6 100644 --- a/etl/scripts/faostat/shared.py +++ b/etl/scripts/faostat/shared.py @@ -27,6 +27,15 @@ # Metadata related to license. LICENSE_URL = "http://www.fao.org/contact-us/terms/db-terms-of-use/en" LICENSE_NAME = "CC BY-NC-SA 3.0 IGO" +# Maximum number of characters for item_code. +# WARNING: These definitions should coincide with those given in the shared module of the garden step. +# FAOSTAT "item_code" is usually an integer number, however sometimes it has decimals and sometimes it contains letters. +# So we will convert it into a string of this number of characters (integers will be prepended with zeros). +N_CHARACTERS_ITEM_CODE = 8 +# Idem for faostat_sdgb and faostat_fs (that have different, longer item codes with digits and letters). 
+N_CHARACTERS_ITEM_CODE_EXTENDED = 14 +# Maximum number of characters for element_code (integers will be prepended with zeros). +N_CHARACTERS_ELEMENT_CODE = 6 # Codes of FAOSTAT domains to download from FAO and upload to walden bucket. # This is the list that will determine the datasets (faostat_*) to be created in all further etl data steps. INCLUDED_DATASETS_CODES = [ diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py index 95e9c56d545..4cbebcae85b 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py @@ -52,7 +52,7 @@ FLAGS_RANKING, N_CHARACTERS_ELEMENT_CODE, N_CHARACTERS_ITEM_CODE, - N_CHARACTERS_ITEM_CODE_SDGB, + N_CHARACTERS_ITEM_CODE_EXTENDED, NAMESPACE, harmonize_elements, harmonize_items, @@ -194,8 +194,8 @@ def check_that_item_and_element_harmonization_does_not_trim_codes( # respectively. # Set the maximum number of characters for item_code. - if dataset_short_name == f"{NAMESPACE}_sdgb": - n_characters_item_code = N_CHARACTERS_ITEM_CODE_SDGB + if dataset_short_name in [f"{NAMESPACE}_sdgb", f"{NAMESPACE}_fs"]: + n_characters_item_code = N_CHARACTERS_ITEM_CODE_EXTENDED else: n_characters_item_code = N_CHARACTERS_ITEM_CODE @@ -897,7 +897,7 @@ def process_metadata( for dataset_short_name in tqdm(dataset_short_names, file=sys.stdout): print(dataset_short_name) # Load latest meadow table for current dataset. 
- ds_latest: catalog.Dataset = paths.load_dependency(dataset_short_name) + ds_latest = paths.load_dataset(dataset_short_name) table = ds_latest[dataset_short_name] df = pd.DataFrame(table.reset_index()).rename( columns={ diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index 1953069445b..ecd50658dc4 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -44,9 +44,9 @@ # FAOSTAT "item_code" is usually an integer number, however sometimes it has decimals and sometimes it contains letters. # So we will convert it into a string of this number of characters (integers will be prepended with zeros). N_CHARACTERS_ITEM_CODE = 8 -# Maximum number of characters for item_code for faostat_sdgb, which has a different kind of item codes, +# Maximum number of characters for item_code for faostat_sdgb and faostat_fs, which have a different kind of item codes, # e.g. '24002-F-Y_GE15', '24002-M-Y_GE15', etc. -N_CHARACTERS_ITEM_CODE_SDGB = 14 +N_CHARACTERS_ITEM_CODE_EXTENDED = 14 # Maximum number of characters for element_code (integers will be prepended with zeros). N_CHARACTERS_ELEMENT_CODE = 6 # Manual fixes to item codes to avoid ambiguities. @@ -395,7 +395,7 @@ def harmonize_items(df: pd.DataFrame, dataset_short_name: str, item_col: str = " # Set the maximum number of characters for item_code. 
if dataset_short_name == f"{NAMESPACE}_sdgb": - n_characters_item_code = N_CHARACTERS_ITEM_CODE_SDGB + n_characters_item_code = N_CHARACTERS_ITEM_CODE_EXTENDED else: n_characters_item_code = N_CHARACTERS_ITEM_CODE diff --git a/etl/steps/data/garden/faostat/2024-03-14/value_amendments.csv b/etl/steps/data/garden/faostat/2024-03-14/value_amendments.csv index 2848b3da2c6..fafd114ae9e 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/value_amendments.csv +++ b/etl/steps/data/garden/faostat/2024-03-14/value_amendments.csv @@ -1,6 +1,9 @@ dataset,spurious_value,new_value faostat_fs,"<0.1","0.1" +faostat_fs,"<0.2","0.2" faostat_fs,"<2.5","2.5" +faostat_fs,"<0.3","0.3" +faostat_fs,"<0.4","0.4" faostat_fs,"<0.5","0.5" faostat_sdgb,"<0.1","0.1" faostat_sdgb,"<0.5","0.5" From 6277a021a0cf9410f9c6d9b780c41e21e5ec0257 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 14 Mar 2024 13:18:53 +0100 Subject: [PATCH 09/54] Add new flag, and increase number of characters for item codes of faostat_sdgb --- etl/scripts/faostat/shared.py | 2 +- etl/steps/data/garden/faostat/2024-03-14/shared.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/etl/scripts/faostat/shared.py b/etl/scripts/faostat/shared.py index 6230e2c69c6..84ac02cfe73 100644 --- a/etl/scripts/faostat/shared.py +++ b/etl/scripts/faostat/shared.py @@ -33,7 +33,7 @@ # So we will convert it into a string of this number of characters (integers will be prepended with zeros). N_CHARACTERS_ITEM_CODE = 8 # Idem for faostat_sdgb and faostat_fs (that have different, longer item codes with digits and letters). -N_CHARACTERS_ITEM_CODE_EXTENDED = 14 +N_CHARACTERS_ITEM_CODE_EXTENDED = 15 # Maximum number of characters for element_code (integers will be prepended with zeros). N_CHARACTERS_ELEMENT_CODE = 6 # Codes of FAOSTAT domains to download from FAO and upload to walden bucket. 
diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index ecd50658dc4..50be7b8f4a7 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -46,7 +46,7 @@ N_CHARACTERS_ITEM_CODE = 8 # Maximum number of characters for item_code for faostat_sdgb and faostat_fs, which have a different kind of item codes, # e.g. '24002-F-Y_GE15', '24002-M-Y_GE15', etc. -N_CHARACTERS_ITEM_CODE_EXTENDED = 14 +N_CHARACTERS_ITEM_CODE_EXTENDED = 15 # Maximum number of characters for element_code (integers will be prepended with zeros). N_CHARACTERS_ELEMENT_CODE = 6 # Manual fixes to item codes to avoid ambiguities. @@ -272,6 +272,7 @@ ("B", "Time series break"), ("N", "Not significant (negligible)"), ("U", "Low reliability"), + ("G", "Experimental value"), ("L", "Missing value; data exist"), ("O", "Missing value"), ("M", "Missing value (data cannot exist, not applicable)"), From 660ed7fd5cd04ef03bf0795914f3df7df2021686 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 14 Mar 2024 16:10:36 +0100 Subject: [PATCH 10/54] Remove unused excluded country, fix new element mismatch between fbsh and fbs, and replace assertions with log errors --- .../faostat.excluded_countries.json | 1 - .../faostat/2024-03-14/faostat_metadata.py | 43 ++++++++++++------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat.excluded_countries.json b/etl/steps/data/garden/faostat/2024-03-14/faostat.excluded_countries.json index 60aab73b9b1..776ed419406 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat.excluded_countries.json +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat.excluded_countries.json @@ -2,7 +2,6 @@ "Africa (excluding intra-trade)", "Americas (excluding intra-trade)", "Annex I countries", - "Antarctic Region", "Asia (excluding intra-trade)", "Australia and New Zealand", "Australia and New 
Zealand (excluding intra-trade)", diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py index 4cbebcae85b..30663f2186e 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py @@ -40,7 +40,6 @@ import json import sys from copy import deepcopy -from pathlib import Path from typing import Dict, List, Tuple, cast import pandas as pd @@ -516,8 +515,16 @@ def clean_global_elements_dataframe(elements_df: pd.DataFrame, custom_elements: elements_df = elements_df.copy() # Check that all elements of fbsh are in fbs (although fbs may contain additional elements). - assert set(elements_df[elements_df["dataset"] == "faostat_fbsh"]["element_code"]) <= set( - elements_df[elements_df["dataset"] == "faostat_fbs"]["element_code"] + # The only exception is "Stock Variation", which have slightly different definitions: + # On fbs "Stock Variation" (005072), "Net decreases (from stock) are generally indicated by the sign "-". No sign denotes net increases (add to stock)". + # On fbsh "Stock Variation" (005074), "Net increases in stocks (add to stock) are generally indicated by the sign "-". No sign denotes net decreases (from stock).". + # Given that they have different definitions, we should not map one to the other. + # So, for now, simply ignore it. + ELEMENTS_IN_FBSH_MISSING_IN_FBS = {"005074"} + assert ( + set(elements_df[elements_df["dataset"] == "faostat_fbsh"]["element_code"]) + - set(elements_df[elements_df["dataset"] == "faostat_fbs"]["element_code"]) + == ELEMENTS_IN_FBSH_MISSING_IN_FBS ) # Drop all rows for fbsh, and rename "fbs" to "fbsc" (since this will be the name for the combined dataset). 
elements_df = elements_df[elements_df["dataset"] != "faostat_fbsh"].reset_index(drop=True) @@ -559,18 +566,25 @@ def clean_global_elements_dataframe(elements_df: pd.DataFrame, custom_elements: } ) - error = "Element names have changed with respect to custom elements file. Update custom elements file." - assert ( - elements_df[elements_df["fao_element_check"].notnull()]["fao_element_check"] - == elements_df[elements_df["fao_element_check"].notnull()]["fao_element"] - ).all(), error + # Check if element or unit names have changed with respect to the custom elements and units file. + # NOTE: This raises an error instead of a warning because further steps will (certainly?) fail. + changed_elements = elements_df[ + elements_df["fao_element_check"].notnull() & (elements_df["fao_element_check"] != elements_df["fao_element"]) + ][["fao_element_check", "fao_element"]] + if len(changed_elements) > 0: + log.error( + f"{len(changed_elements)} element names have changed with respect to custom elements file. Use `update_custom_metadata.py` to update custom elements file." + ) elements_df = elements_df.drop(columns=["fao_element_check"]) - error = "Unit names have changed with respect to custom elements file. Update custom elements file." - assert ( - elements_df[elements_df["fao_unit_short_name_check"].notnull()]["fao_unit_short_name_check"] - == elements_df[elements_df["fao_unit_short_name_check"].notnull()]["fao_unit_short_name"] - ).all(), error + changed_units = elements_df[ + elements_df["fao_unit_short_name_check"].notnull() + & (elements_df["fao_unit_short_name_check"] != elements_df["fao_unit_short_name"]) + ][["fao_unit_short_name_check", "fao_unit_short_name"]] + if len(changed_units) > 0: + log.error( + f"{len(changed_units)} unit names have changed with respect to custom elements file. Use `update_custom_metadata.py` to update custom elements file." 
+ ) elements_df = elements_df.drop(columns=["fao_unit_short_name_check"]) # Assign original FAO names where there is no custom one. @@ -895,7 +909,6 @@ def process_metadata( # Gather all variables from the latest version of each meadow dataset. for dataset_short_name in tqdm(dataset_short_names, file=sys.stdout): - print(dataset_short_name) # Load latest meadow table for current dataset. ds_latest = paths.load_dataset(dataset_short_name) table = ds_latest[dataset_short_name] @@ -1000,7 +1013,7 @@ def run(dest_dir: str) -> None: # Load data. # # Fetch the dataset short name from dest_dir. - dataset_short_name = Path(dest_dir).name + dataset_short_name = f"{NAMESPACE}_metadata" # Define path to current step file. current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") From a9c5fc49d9e9131c0848200a966d1586fec83c31 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 14 Mar 2024 16:44:15 +0100 Subject: [PATCH 11/54] Fix issue with categorical columns in script that updates custom metadata --- etl/scripts/faostat/update_custom_metadata.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/etl/scripts/faostat/update_custom_metadata.py b/etl/scripts/faostat/update_custom_metadata.py index 55c392382c4..f4e739ee36d 100644 --- a/etl/scripts/faostat/update_custom_metadata.py +++ b/etl/scripts/faostat/update_custom_metadata.py @@ -248,15 +248,20 @@ def update_custom_items_file(interactive=False, version=VERSION, read_only=False suffixes=("_old", "_new"), ) - if interactive: - for field in fields_to_compare: - n_changes = len(compared[compared[f"{field}_old"].fillna("") != compared[f"{field}_new"].fillna("")]) - tqdm.write(f"\nNumber of changes in {field} to review: {n_changes}") + # Ensure no column is of categorical type. + compared = compared.astype(object) # Go one by one on the datasets for which at least one custom item was defined. 
for field in tqdm(fields_to_compare): - _compared = compared[compared[f"{field}_old"].fillna("") != compared[f"{field}_new"].fillna("")].reset_index() - for i, row in tqdm(_compared.iterrows(), total=len(_compared)): + _compared = compared.copy() + _compared[f"{field}_old"] = _compared[f"{field}_old"].fillna("") + _compared[f"{field}_new"] = _compared[f"{field}_new"].fillna("") + _compared = _compared[_compared[f"{field}_old"] != _compared[f"{field}_new"]].reset_index() + if interactive: + n_changes = len(_compared) + tqdm.write(f"\nNumber of changes in {field} to review: {n_changes}") + + for _, row in tqdm(_compared.iterrows(), total=len(_compared)): dataset_short_name = row["dataset"] item_code = row["item_code"] old = row[f"{field}_old"] From c4169f8e447aa2da7d0d7c0b7ab55317bf0dc940 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 14 Mar 2024 16:44:27 +0100 Subject: [PATCH 12/54] Update custom metadata files --- .../faostat/2024-03-14/custom_datasets.csv | 52 ++++++++--------- .../2024-03-14/custom_elements_and_units.csv | 58 +++++++++---------- .../faostat/2024-03-14/custom_items.csv | 2 +- 3 files changed, 55 insertions(+), 57 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/custom_datasets.csv b/etl/steps/data/garden/faostat/2024-03-14/custom_datasets.csv index 66d40136c44..c6fbabd9086 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/custom_datasets.csv +++ b/etl/steps/data/garden/faostat/2024-03-14/custom_datasets.csv @@ -1,37 +1,35 @@ dataset,fao_dataset_title,owid_dataset_title,fao_dataset_description,owid_dataset_description -faostat_cahd,Cost and Affordability of a Healthy Diet: Cost and Affordability of a Healthy Diet (CoAHD) - FAO (2023),,"Indicators on the cost and affordability of a healthy diet are estimated in each country and show the population’s physical and economic access to least expensive locally available foods to meet requirements for a healthy diet, as defined in food-based dietary guidelines (FBDGs). 
The indicators use observed retail food consumer prices and income distributions to provide an operational measure of people’s access to locally available foods in the proportions needed for health. These indicators support efforts within the framework of the Sustainable Development Goals (SDGs) to end hunger, achieve food security and improved nutrition, and promote sustainable agriculture by 2030 (SDG 2). They also support the monitoring of progress towards the objective of transforming agrifood systems by promoting “nutrition-sensitive agriculture”. For definitions of these indicators, see Definitions and standards.", +faostat_cahd,Cost and Affordability of a Healthy Diet: Cost and Affordability of a Healthy Diet (CoAHD),,"Indicators on the cost and affordability of a healthy diet are estimated in each country and show the population’s physical and economic access to least expensive locally available foods to meet requirements for a healthy diet, as defined in food-based dietary guidelines (FBDGs). The indicators use observed retail food consumer prices and income distributions to provide an operational measure of people’s access to locally available foods in the proportions needed for health. These indicators support efforts within the framework of the Sustainable Development Goals (SDGs) to end hunger, achieve food security and improved nutrition, and promote sustainable agriculture by 2030 (SDG 2). They also support the monitoring of progress towards the objective of transforming agrifood systems by promoting “nutrition-sensitive agriculture”. 
For definitions of these indicators, see Definitions and standards.", faostat_ef,"Land, Inputs and Sustainability: Fertilizers indicators - FAO (2022)",Agri-Environmental Indicators: Fertilizers indicators,"The FAOSTAT domain Fertilizers Indicators provides information on three rations: a) the ratio between the totals by nutrient of agricultural use of chemical or mineral fertilizers, reported in the FAOSTAT domain “Inputs/Fertilizers by Nutrient” for nitrogen (N), phosphorus (expressed as P2O5) and potassium (expressed as K2O) and the area of cropland reported in the FAOSTAT domain “Inputs/Land Use”; b) The ratio of fertilizers use and the annual population reported in the FAOSTAT domain “Population and Employment/Population”; and c) The ratio of fertilizers use and the value of agricultural production reported in the FAOSTAT domain “Production/Value of Agricultural Production.Data are available at national, regional, and global level over the time series 1961-present.","Agri-Environmental Indicators: Fertilizers indicators This dataset describes the use of chemical and mineral fertilizers per area of cropland (which corresponds to the sum of arable land and permanent crops) at national, regional, and global level." -faostat_ei,Climate Change: Emissions intensities - FAO (2022),Agri-Environmental Indicators: Emissions intensities,"The FAOSTAT domain Emissions intensities contains analytical data on the intensity of greenhouse gas (GHG) emissions by agricultural commodity. This indicator is defined as greenhouse gas emissions per kg of product. Data are available for a set of agricultural commodities (e.g. 
rice and other cereals, meat, milk, eggs), by country, with global coverage and relative to the period 1961–2020.",Agri-Environmental Indicators: Emissions intensities -faostat_ek,"Land, Inputs and Sustainability: Livestock Patterns - FAO (2022)",Agri-Environmental Indicators: Livestock Patterns,"The Livestock Patterns domain of FAOSTAT contains data on livestock numbers, shares of major livestock species and densities of livestock units in the agricultural land area. Values are calculated using Livestock Units (LSU), which facilitate aggregating information for different livestock types. Data are available by country, with global coverage, for the period 1961 to present, with annual updates. This methodology applies the LSU coefficients reported in the ""Guidelines for the preparation of livestock sector reviews"" (FAO, 2011). From this publication, LSU coefficients are computed by livestock type and by country. The reference unit used for the calculation of livestock units (=1 LSU) is the grazing equivalent of one adult dairy cow producing 3000 kg of milk annually, fed without additional concentrated foodstuffs. FAOSTAT agri-environmental indicators on livestock patterns closely follow the structure of the indicators in EUROSTAT.",Agri-Environmental Indicators: Livestock Patterns +faostat_ei,Climate Change: Agrifood systems emissions: Emissions intensities,Agri-Environmental Indicators: Emissions intensities,"The FAOSTAT domain Emissions intensities contains analytical data on the intensity of greenhouse gas (GHG) emissions by agricultural commodity. This indicator is defined as greenhouse gas emissions per kg of product. Data are available for a set of agricultural commodities (e.g. 
rice and other cereals, meat, milk, eggs), by country, with global coverage and relative to the period 1961–2020.",Agri-Environmental Indicators: Emissions intensities +faostat_ek,"Land, Inputs and Sustainability: Livestock Patterns",Agri-Environmental Indicators: Livestock Patterns,"The Livestock Patterns domain of FAOSTAT contains data on livestock numbers, shares of major livestock species and densities of livestock units in the agricultural land area. Values are calculated using Livestock Units (LSU), which facilitate aggregating information for different livestock types. Data are available by country, with global coverage, for the period 1961 to present, with annual updates. This methodology applies the LSU coefficients reported in the ""Guidelines for the preparation of livestock sector reviews"" (FAO, 2011). From this publication, LSU coefficients are computed by livestock type and by country. The reference unit used for the calculation of livestock units (=1 LSU) is the grazing equivalent of one adult dairy cow producing 3000 kg of milk annually, fed without additional concentrated foodstuffs. 
FAOSTAT agri-environmental indicators on livestock patterns closely follow the structure of the indicators in EUROSTAT.",Agri-Environmental Indicators: Livestock Patterns faostat_el,"Land, Inputs and Sustainability: Land use indicators - FAO (2022)",Agri-Environmental Indicators: Land use indicators,"The Agri-environmental Indicators—Land Use domain provides information on the distribution of agricultural and forest land, and their sub-components, including irrigated areas and areas under organic agriculture, at national, regional and global levels.Per capita values are included in this update.",Agri-Environmental Indicators: Land use indicators -faostat_emn,"Land, Inputs and Sustainability: Livestock Manure - FAO (2022)",Agri-Environmental Indicators: Livestock Manure,"The Livestock Manure domain of FAOSTAT contains estimates of nitrogen (N) inputs to agricultural soils from livestock manure. Data on the N losses to air and water are also disseminated. These estimates are compiled using official FAOSTAT statistics of animal stocks and by applying the internationally approved Guidelines of the Intergovernmental Panel on Climate Change (IPCC). Data are available by country, with global coverage and relative to the period 1961–2020, with annual updates. 
The following elements are disseminated: 1) Stocks; 2) Amount excreted in manure (N content); 3) Manure left on pasture (N content); 4) Manure left on pasture that volatilises (N content); 5) Manure left on pasture that leaches (N content); 6) Manure treated (N content); 7) Losses from manure treated (N content); 8) Manure applied to soils (N content); 9) Manure applied to soils that volatilises (N content); 10) Manure applied to soils that leaches (N content).",Agri-Environmental Indicators: Livestock Manure +faostat_emn,"Land, Inputs and Sustainability: Livestock Manure",Agri-Environmental Indicators: Livestock Manure,"The Livestock Manure domain of FAOSTAT contains estimates of nitrogen (N) inputs to agricultural soils from livestock manure. Data on the N losses to air and water are also disseminated. These estimates are compiled using official FAOSTAT statistics of animal stocks and by applying the internationally approved Guidelines of the Intergovernmental Panel on Climate Change (IPCC). Data are available by country, with global coverage and relative to the period 1961–2020, with annual updates. 
The following elements are disseminated: 1) Stocks; 2) Amount excreted in manure (N content); 3) Manure left on pasture (N content); 4) Manure left on pasture that volatilises (N content); 5) Manure left on pasture that leaches (N content); 6) Manure treated (N content); 7) Losses from manure treated (N content); 8) Manure applied to soils (N content); 9) Manure applied to soils that volatilises (N content); 10) Manure applied to soils that leaches (N content).",Agri-Environmental Indicators: Livestock Manure faostat_ep,"Land, Inputs and Sustainability: Pesticides indicators - FAO (2022)",Agri-Environmental Indicators: Pesticides indicators,Agri-environmental indicator on the Use of pesticides per area of cropland (which is the sum of arable land and land under permanent crops) at national level for the period 1990 to 2016.,Agri-Environmental Indicators: Pesticides indicators -faostat_esb,"Land, Inputs and Sustainability: Cropland Nutrient Budget - FAO (2022)","Land, Inputs and Sustainability: Soil nutrient budget","2022 Cropland nutrient budget analytical briefThe Cropland Nutrient Budget domain contains information on the flows of nitrogen, phosphorus, and potassium from synthetic fertilizer, manure applied to soils, atmospheric deposition, crop removal, and biological fixation over cropland and per unit area of cropland. The flows are aggregated to total inputs and total outputs, from which the overall nutrient budget and nutrient use efficiency on cropland are calculated. Statistics are disseminated in units of tonnes and in kg/ha, as appropriate. Nutrient use efficiency is expressed as a fraction (%). 
Data are available by country, with global coverage relative to the period 1961-2020, with annual updates.","Land, Inputs and Sustainability: Soil nutrient budget" -faostat_fa,Discontinued archives and data series: Food Aid Shipments (WFP) - FAO (2016),Discontinued archives and data series: Food Aid Shipments (WFP),,Discontinued archives and data series: Food Aid Shipments (WFP) -faostat_fbs,Food Balances: Food Balances (2010-) - FAO (2023),Food Balance: New Food Balances,"Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. 
Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balance: New Food Balances -faostat_fbsc,Food Balances: Food Balances (2010-) - FAO (2022),"Food Balances (old methodology before 2010, and new from 2010 onwards)","Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. 
Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.","Food Balances (old methodology before 2010, and new from 2010 onwards)" -faostat_fbsh,"Food Balances: Food Balances (-2013, old methodology and population) - FAO (2023)",Food Balance: Food Balances (old methodology and population),"Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. 
Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balance: Food Balances (old methodology and population) -faostat_fo,Forestry: Forestry Production and Trade - FAO (2023),Forestry: Forestry Production and Trade,"The database contains data on the production and trade in roundwood and in primary wood and paper products for all countries and territories in the world.The main types of primary forest products included in this database are roundwood, sawnwood, wood-based panels, pulp, and paper and paperboard. These products are detailed further and defined in the Joint Forest Sector Questionnaire (JFSQ) (http://www.fao.org/forestry/statistics/80572/en/). The database contains details of the following topics: - Roundwood removals (production) by coniferous and non-coniferous wood, - production and trade in industrial Roundwood, sawnwood, wood-based panels, wood charcoal, pulp, paper paperboard, and other products. More detailed information on wood products, including definitions, can be found at http://www.fao.org/forestry/statistics/80572/en/",Forestry: Forestry Production and Trade -faostat_fs,Food Security and Nutrition: Suite of Food Security Indicators - FAO (2022),Food Security: Suite of Food Security Indicators,"The Suite of Food Security Indicators presents the core set of food security indicators. Following the recommendation of experts gathered in the Committee on World Food Security (CFS) Round Table on hunger measurement, hosted at FAO headquarters in September 2011, an initial set of indicators aiming to capture various aspects of food insecurity is presented here. The choice of the indicators has been informed by expert judgment and the availability of data with sufficient coverage to enable comparisons across regions and over time. 
Many of these indicators are produced and published elsewhere by FAO and other international organizations. They are reported here in a single database with the aim of building a wide food security information system. More indicators will be added to this set as more data will become available. Indicators are classified along the four dimensions of food security -- availability, access, utilization and stability. For definitions of these indicators, see Definitions and standards below (under Item).",Food Security: Suite of Food Security Indicators -faostat_gn,Climate Change: Agrifood systems emissions: Emissions from Energy use in agriculture - FAO (2023),Climate Change: Energy Use,"Greenhouse gas (GHG) emissions from direct on-farm energy use consist of carbon dioxide, methane, and nitrous oxide gases related with fuel combustion and electricity generation in agriculture (including fisheries). The FAOSTAT emissions database has a global scope for the period 1970 to 2021 (with annual updates), by motor gasoline, gas-diesel oils, gasoline, natural gas, liquefied petroleum gas, residual fuel oil, coal, electricity, heat, gas-diesel oils in fisheries, residual fuel oil in fisheries, and by aggregates (total energy, energy consumed in fishery and total energy without electricity heat). Activity data(Energy use) is also provided.",Climate Change: Energy Use -faostat_ic,Investment: Credit to Agriculture - FAO (2022),Investment: Credit to Agriculture,"The Credit to Agriculture dataset provides national data for over 130 countries on the amount of loans provided by the private/commercial banking sector to producers in agriculture, forestry and fishing, including household producers, cooperatives, and agro-businesses. For some countries, the three subsectors of agriculture, forestry, and fishing are completely specified. In other cases, complete disaggregations are not available. 
The dataset also provides statistics on the total credit to all industries, indicators on the share of credit to agricultural producers, and an agriculture orientation index (AOI) for credit that normalizes the share of credit to agriculture over total credit by dividing it by the share of agriculture in gross domestic product (GDP). As such, it can provide a more accurate indication of the relative importance that banking sectors place on financing the sector. An AOI lower than 1 indicates that the agriculture sector receives a credit share lower than its contribution to the economy, while an AOI greater than 1 indicates a credit share to the agriculture sector greater than its economic contribution.",Investment: Credit to Agriculture -faostat_lc,"Land, Inputs and Sustainability: Land Cover - FAO (2022)",Agri-Environmental Indicators: Land Cover,"The FAOSTAT domain Land Cover under the Agri-Environmental Indicators section contains land cover information organized by the land cover classes of the international standard system for Environmental and Economic Accounting Central Framework (SEEA CF). The land cover information is compiled from publicly available Global Land Cover (GLC) maps: a) MODIS land cover types based on the Land Cover Classification System, LCCS (2001–2018) and b) the European Spatial Agency (ESA) Climate Change Initiative (CCI) annual land cover maps (1992–2018) produced by the Université catholique de Louvain (UCL)-Geomatics and now under the European Copernicus Program.",Agri-Environmental Indicators: Land Cover -faostat_qcl,Production: Crops and livestock products - FAO (2023),Production: Crops and livestock products,"Crop and livestock statistics are recorded for 278 products, covering the following categories: 1) CROPS PRIMARY: Cereals, Citrus Fruit, Fibre Crops, Fruit, Oil Crops, Oil Crops and Cakes in Oil Equivalent, Pulses, Roots and Tubers, Sugar Crops, Treenuts and Vegetables. 
Data are expressed in terms of area harvested, production quantity and yield. Cereals: Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded. 2) CROPS PROCESSED: Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine. 3) LIVE ANIMALS: Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys. 4) LIVESTOCK PRIMARY: Beeswax; Eggs (various types); Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); Offals, nes; Silk-worm cocoons, reelable; Skins (goat, sheep); Snails, not sea; Wool, greasy. 
5) LIVESTOCK PROCESSED: Butter (of milk from sheep, goat, buffalo, cow); Cheese (of milk from goat, buffalo, sheep, cow milk); Cheese of skimmed cow milk; Cream fresh; Ghee (cow and buffalo milk); Lard; Milk (dry buttermilk, skimmed condensed, skimmed cow, skimmed dried, skimmed evaporated, whole condensed, whole dried, whole evaporated); Silk raw; Tallow; Whey (condensed and dry); Yoghurt",Production: Crops and livestock products -faostat_qi,Production: Production Indices - FAO (2023),Production: Production Indices,"Crop and livestock statistics are recorded for 278 products, covering the following categories:1) CROPS PRIMARY:Cereals, Citrus Fruit, Fibre Crops, Fruit, Oil Crops, Oil Crops and Cakes in Oil Equivalent, Pulses, Roots and Tubers, Sugar Crops, Treenuts and Vegetables. Data are expressed in terms of area harvested, production quantity and yield. Cereals: Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded.2) CROPS PROCESSED:Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine.3) LIVE ANIMALS:Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys.4) LIVESTOCK PRIMARY:Beeswax; Eggs (various types); Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); 
Offals, nes; Silk-worm cocoons, reelable; Skins (goat, sheep); Snails, not sea; Wool, greasy.5) LIVESTOCK PROCESSED:Butter (of milk from sheep, goat, buffalo, cow); Cheese (of milk from goat, buffalo, sheep, cow milk); Cheese of skimmed cow milk; Cream fresh; Ghee (cow and buffalo milk); Lard; Milk (dry buttermilk, skimmed condensed, skimmed cow, skimmed dried, skimmed evaporated, whole condensed, whole dried, whole evaporated); Silk raw; Tallow; Whey (condensed and dry); Yoghurt","Production: Production Indices +faostat_esb,"Land, Inputs and Sustainability: Cropland Nutrient Balance","Land, Inputs and Sustainability: Soil nutrient budget","2022 Cropland nutrient budget analytical briefThe Cropland Nutrient Budget domain contains information on the flows of nitrogen, phosphorus, and potassium from synthetic fertilizer, manure applied to soils, atmospheric deposition, crop removal, and biological fixation over cropland and per unit area of cropland. The flows are aggregated to total inputs and total outputs, from which the overall nutrient budget and nutrient use efficiency on cropland are calculated. Statistics are disseminated in units of tonnes and in kg/ha, as appropriate. Nutrient use efficiency is expressed as a fraction (%). Data are available by country, with global coverage relative to the period 1961-2020, with annual updates.","Land, Inputs and Sustainability: Soil nutrient budget" +faostat_fa,Discontinued archives and data series: Food Aid Shipments (WFP),Discontinued archives and data series: Food Aid Shipments (WFP),,Discontinued archives and data series: Food Aid Shipments (WFP) +faostat_fbs,Food Balances: Food Balances (2010-),Food Balance: New Food Balances,"Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. 
each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balance: New Food Balances +faostat_fbsc,Food Balances: Food Balances (2010-),"Food Balances (old methodology before 2010, and new from 2010 onwards)","Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. 
On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.","Food Balances (old methodology before 2010, and new from 2010 onwards)" +faostat_fbsh,"Food Balances: Food Balances (-2013, old methodology and population)",Food Balance: Food Balances (old methodology and population),"Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. 
Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balance: Food Balances (old methodology and population) +faostat_fo,Forestry: Forestry Production and Trade,Forestry: Forestry Production and Trade,"The database contains data on the production and trade in roundwood and in primary wood and paper products for all countries and territories in the world.The main types of primary forest products included in this database are roundwood, sawnwood, wood-based panels, pulp, and paper and paperboard. These products are detailed further and defined in the Joint Forest Sector Questionnaire (JFSQ) (https://www.fao.org/forestry/statistics/80572/en/). The database contains details of the following topics: - Roundwood removals (production) by coniferous and non-coniferous wood and assortments, - production and trade in industrial roundwood, sawnwood, wood-based panels, wood charcoal, pulp, paper and paperboard, and other products. More detailed information on wood products, including definitions, can be found at https://www.fao.org/forestry/statistics/80572/en",Forestry: Forestry Production and Trade +faostat_fs,Food Security and Nutrition: Suite of Food Security Indicators,Food Security: Suite of Food Security Indicators,"The Suite of Food Security Indicators presents the core set of food security indicators. Following the recommendation of experts gathered in the Committee on World Food Security (CFS) Round Table on hunger measurement, hosted at FAO headquarters in September 2011, an initial set of indicators aiming to capture various aspects of food insecurity is presented here. The choice of the indicators has been informed by expert judgment and the availability of data with sufficient coverage to enable comparisons across regions and over time. 
Many of these indicators are produced and published elsewhere by FAO and other international organizations. They are reported here in a single database with the aim of building a wide food security information system. More indicators will be added to this set as more data will become available. Indicators are classified along the four dimensions of food security -- availability, access, utilization and stability. For definitions of these indicators, see Definitions and standards below (under Item).",Food Security: Suite of Food Security Indicators +faostat_ic,Investment: Credit to Agriculture,Investment: Credit to Agriculture,"The Credit to Agriculture dataset provides national data for over 130 countries on the amount of loans provided by the private/commercial banking sector to producers in agriculture, forestry and fishing, including household producers, cooperatives, and agro-businesses. For some countries, the three subsectors of agriculture, forestry, and fishing are completely specified. In other cases, complete disaggregations are not available. The dataset also provides statistics on the total credit to all industries, indicators on the share of credit to agricultural producers, and an agriculture orientation index (AOI) for credit that normalizes the share of credit to agriculture over total credit by dividing it by the share of agriculture in gross domestic product (GDP). As such, it can provide a more accurate indication of the relative importance that banking sectors place on financing the sector. 
An AOI lower than 1 indicates that the agriculture sector receives a credit share lower than its contribution to the economy, while an AOI greater than 1 indicates a credit share to the agriculture sector greater than its economic contribution.",Investment: Credit to Agriculture
+faostat_lc,"Land, Inputs and Sustainability: Land Cover",Agri-Environmental Indicators: Land Cover,"The FAOSTAT domain Land Cover under the Agri-Environmental Indicators section contains land cover information organized by the land cover classes of the international standard system for Environmental and Economic Accounting Central Framework (SEEA CF). The land cover information is compiled from publicly available Global Land Cover (GLC) maps: a) MODIS land cover types based on the Land Cover Classification System, LCCS (2001–2021); b) The European Space Agency (ESA) Climate Change Initiative (CCI) annual land cover maps (1992–2020) produced by the Université catholique de Louvain (UCL)-Geomatics and now under the European Copernicus Program; c) The annual land cover maps which were produced under the European Copernicus Global Land Service (CGLS) (CGLS land cover, containing discrete land cover categorization for the period 2015–2019), with spatial resolution 100m; and d) The WorldCover maps of the European Space Agency —available for the years 2020 and 2021, produced at 10m resolution.",Agri-Environmental Indicators: Land Cover
+faostat_qcl,Production: Crops and livestock products,Production: Crops and livestock products,"Crop and livestock statistics are recorded for 278 products, covering the following categories: 1) CROPS PRIMARY: Cereals, Citrus Fruit, Fibre Crops, Fruit, Oil Crops, Oil Crops and Cakes in Oil Equivalent, Pulses, Roots and Tubers, Sugar Crops, Treenuts and Vegetables. Data are expressed in terms of area harvested, production quantity and yield. Cereals: Area and production data on cereals relate to crops harvested for dry grain only. 
Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded. 2) CROPS PROCESSED: Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine. 3) LIVE ANIMALS: Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys. 4) LIVESTOCK PRIMARY: Beeswax; Eggs (various types); Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); Offals, nes; Silk-worm cocoons, reelable; Skins (goat, sheep); Snails, not sea; Wool, greasy. 5) LIVESTOCK PROCESSED: Butter (of milk from sheep, goat, buffalo, cow); Cheese (of milk from goat, buffalo, sheep, cow milk); Cheese of skimmed cow milk; Cream fresh; Ghee (cow and buffalo milk); Lard; Milk (dry buttermilk, skimmed condensed, skimmed cow, skimmed dried, skimmed evaporated, whole condensed, whole dried, whole evaporated); Silk raw; Tallow; Whey (condensed and dry); Yoghurt.",Production: Crops and livestock products +faostat_qi,Production: Production Indices,Production: Production Indices,The FAO indices of agricultural production show the relative level of the aggregate volume of agricultural production for each year in comparison with the base period 2014-2016. 
Indices for meat production are computed based on data for production from indigenous animals.,"Production: Production Indices This dataset includes gross and net production indices for various food and agriculture aggregates expressed in both totals and per capita." -faostat_qv,Production: Value of Agricultural Production - FAO (2023),Production: Value of Agricultural Production,"Crop and livestock statistics are recorded for 278 products, covering the following categories:1) CROPS PRIMARY:Cereals, Citrus Fruit, Fibre Crops, Fruit, Oil Crops, Oil Crops and Cakes in Oil Equivalent, Pulses, Roots and Tubers, Sugar Crops, Treenuts and Vegetables. Data are expressed in terms of area harvested, production quantity and yield. Cereals: Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded.2) CROPS PROCESSED:Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine.3) LIVE ANIMALS:Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys.4) LIVESTOCK PRIMARY:Beeswax; Eggs (various types); Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); Offals, nes; Silk-worm cocoons, reelable; Skins (goat, sheep); Snails, not sea; Wool, greasy.5) LIVESTOCK PROCESSED:Butter (of milk from sheep, 
goat, buffalo, cow); Cheese (of milk from goat, buffalo, sheep, cow milk); Cheese of skimmed cow milk; Cream fresh; Ghee (cow and buffalo milk); Lard; Milk (dry buttermilk, skimmed condensed, skimmed cow, skimmed dried, skimmed evaporated, whole condensed, whole dried, whole evaporated); Silk raw; Tallow; Whey (condensed and dry); Yoghurt","Production: Value of Agricultural Production +faostat_qv,Production: Value of Agricultural Production,Production: Value of Agricultural Production,Values of agricultural production are calculated based on production data of primary commodities from Production domain and producer prices from Prices domain. The livestock value of production is measured in terms of indigenous meat.,"Production: Value of Agricultural Production This dataset includes gross and net production values, in constant international US$, and gross production values, in constant and current US$ and Local Currency Units, for various food and agriculture commodities and aggregates thereof, expressed in both total value and value per capita." -faostat_rfb,"Land, Inputs and Sustainability: Fertilizers by Product - FAO (2022)",Inputs: Fertilizers by Product,"The Fertilizers by Product dataset contains information on the Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers products, over the time series 2002-present. The fertilizer statistics data are for a set of 23 product categories. Both straight and compound fertilizers are included. There is information available about methodology at: http://fenixservices.fao.org/faostat/static/documents/RFB/RFB_EN_README.pdf.",Inputs: Fertilizers by Product -faostat_rfn,"Land, Inputs and Sustainability: Fertilizers by Nutrient - FAO (2022)",Inputs: Fertilizers by Nutrient,"The Fertilizers by Nutrient dataset contains information on the totals in nutrients for Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers, over the time series 1961-present. 
The data are provided for the three primary plant nutrients: nitrogen (N), phosphorus (expressed as P2O5) and potassium (expressed as K2O). Both straight and compound fertilizers are included. There is information on the methodology available at: http://fenixservices.fao.org/faostat/static/documents/RFN/RFN_EN_README.pdf",Inputs: Fertilizers by Nutrient -faostat_rl,"Land, Inputs and Sustainability: Land Use - FAO (2022)",Inputs: Land Use,"The FAOSTAT Land Use domain contains data on forty-four categories of land use, irrigation and agricultural practices, relevant to monitor agriculture, forestry and fisheries activities at national, regional and global level. Data are available by country and year, with global coverage and annual updates.",Inputs: Land Use -faostat_rp,"Land, Inputs and Sustainability: Pesticides Use - FAO (2022)",Inputs: Pesticides Use,"The Pesticides Use database includes data on the use of major pesticide groups (Insecticides, Herbicides, Fungicides, Plant growth regulators and Rodenticides) and of relevant chemical families. Data report the quantities (in tonnes of active ingredients)",Inputs: Pesticides Use -faostat_rt,"Land, Inputs and Sustainability: Pesticides Trade - FAO (2023)",Inputs: Pesticides Trade,"This domain contains data on pesticides and covers two different categories: pesticides traded in form or packagingfor retail sale or as preparations or articles, and pesticides traded as separate chemically defined compounds (if relevant for the Rotterdam Convention on the Prior Informed Consent Procedure for Certain Hazardous Chemicals and Pesticides in International Trade). The pesticides traded for retail sale or as preparations or articles are those classified under code 38.08 in the Harmonized System Nomenclature (HS) and include: hazardous pesticides, insecticides, fungicides, herbicides, disinfectants and other. 
For these pesticides, this domain contains trade data (imports and exports) in values only (current 1000 US dollars), and the time series extends from 1961 onwards. The pesticides traded as separate chemically defined compounds are those listed in Annex III of the Rotterdam Convention (excluding industrial chemicals) and therefore subject to the Prior Informed Consent (PIC) procedure. The correspondence with the HS Nomenclature is shown in the table at the Related Documents section. For these pesticides, this domain contains trade data (imports and exports) in both value (current 1000 US dollars) and quantity (net weight in tonnes), and the time series extends from 2007 onwards.",Inputs: Pesticides Trade -faostat_scl,Food Balances: Supply Utilization Accounts (2010-) - FAO (2023),Food Balances: Supply Utilization Accounts,"Supply Utilization Accounts and Food Balance Sheet present a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. 
Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balances: Supply Utilization Accounts -faostat_sdgb,SDG Indicators: SDG Indicators - FAO (2023),SDG Indicators: SDG Indicators,"As the custodian agency of 21 SDG indicators, the Food and Agriculture Organization of the United Nations (FAO) is responsible for curating and refining the methodologies of these indicators, collecting data from national sources, ensuring their quality and compatibility with applicable standards and classifications, and disseminating data at global level. This FAOSTAT domain complements the global SDG database administered by the United Nations Statistical Division, as well as FAO’s SDG indicators portal, by providing access to the available data for each of these indicators. Please click the metadata link on the right hand navigation column for an abridged version of the methodology for compiling each of these indicators, a description of data sources and the relevant contact persons responsible for each indicator in the Organization. For a more detailed description of the methodology, data sources and reporting procedures, please follow the link to the official SDG indicator metadata document available at the bottom of each summary metadata page in the document on the right. ",SDG Indicators: SDG Indicators -faostat_tcl,Trade: Crops and livestock products - FAO (2023),Trade: Crops and livestock products,"The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. 
This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world.",Trade: Crops and livestock products -faostat_ti,Trade: Trade Indices - FAO (2023),Trade: Trade Indices,"The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world.",Trade: Trade Indices -faostat_wcad,World Census of Agriculture: Structural data from agricultural censuses - FAO (2023),World Census of Agriculture: Structural data from agricultural censuses,"Data from censuses of agriculture are collected at holding level and provide information about the structure of agriculture of a country or a territory (e.g. size and number of holdings, land tenure, legal status, and holder gender). An agricultural holding is an economic unit of agricultural production under single management comprising all livestock kept and all land used wholly or partly for agricultural production purposes. Member countries provided census data to FAO under the World Programme for the Census of Agriculture (WCA). 
National censuses are conducted at least once every ten years in an internationally comparable way. The ‘’Structural data from agricultural censuses’’ domain in FAOSTAT provides structural data from the last four WCA rounds (WCA 2020, 2010, 2000 and 1990) for each participating country and territory, to the extent possible. For earlier rounds (WCA 1930, 1950, 1960, 1970 and 1980) data are provided only on the number and area of holdings. The data are prepared based on the national census reports, later disseminated by FAO through the publications SDS 17, SDS 12 and SDS 9 and 9a, and recent methodological review of the available country census data of WCA 2020 round.",World Census of Agriculture: Structural data from agricultural censuses +faostat_rfb,"Land, Inputs and Sustainability: Fertilizers by Product",Inputs: Fertilizers by Product,"The Fertilizers by Product dataset contains information on the Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers products, over the time series 2002-present. The fertilizer statistics data are for a set of 23 product categories. Both straight and compound fertilizers are included. There is information available about methodology at: https://fenixservices.fao.org/faostat/static/documents/RFB/RFB_EN_README.pdf.",Inputs: Fertilizers by Product +faostat_rfn,"Land, Inputs and Sustainability: Fertilizers by Nutrient",Inputs: Fertilizers by Nutrient,"The Fertilizers by Nutrient dataset contains information on the totals in nutrients for Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers, over the time series 1961-present. The data are provided for the three primary plant nutrients: nitrogen (N), phosphorus (expressed as P2O5) and potassium (expressed as K2O). Both straight and compound fertilizers are included. 
There is information on the methodology available at: https://fenixservices.fao.org/faostat/static/documents/RFN/RFN_EN_README.pdf",Inputs: Fertilizers by Nutrient
+faostat_rl,"Land, Inputs and Sustainability: Land Use",Inputs: Land Use,"The FAOSTAT Land Use domain contains data on forty-four categories of land use, irrigation and agricultural practices and five indicators relevant to monitor agriculture, forestry and fisheries activities at national, regional and global level. Data are available by country and year, with global coverage and annual updates.",Inputs: Land Use
+faostat_rp,"Land, Inputs and Sustainability: Pesticides Use",Inputs: Pesticides Use,"The Pesticides Use database includes data on the use of major pesticide groups (Insecticides, Herbicides, Fungicides, Plant growth regulators and Rodenticides) and of relevant chemical families. Data report the quantities (in tonnes of active ingredients)",Inputs: Pesticides Use
+faostat_rt,"Land, Inputs and Sustainability: Pesticides Trade",Inputs: Pesticides Trade,"This domain contains data on pesticides and covers two different categories: pesticides traded in form or packaging for retail sale or as preparations or articles, and pesticides traded as separate chemically defined compounds (if relevant for the Rotterdam Convention on the Prior Informed Consent Procedure for Certain Hazardous Chemicals and Pesticides in International Trade). The pesticides traded for retail sale or as preparations or articles are those classified under code 38.08 in the Harmonized System Nomenclature (HS) and include: hazardous pesticides, insecticides, fungicides, herbicides, disinfectants and other. For these pesticides, this domain contains trade data (imports and exports) in values only (current 1000 US dollars), and the time series extends from 1961 onwards. 
The pesticides traded as separate chemically defined compounds are those listed in Annex III of the Rotterdam Convention (excluding industrial chemicals) and therefore subject to the Prior Informed Consent (PIC) procedure. The correspondence with the HS Nomenclature is shown in the table at the Related Documents section. For these pesticides, this domain contains trade data (imports and exports) in both value (current 1000 US dollars) and quantity (net weight in tonnes), and the time series extends from 2007 onwards.",Inputs: Pesticides Trade +faostat_scl,Food Balances: Supply Utilization Accounts (2010-),Food Balances: Supply Utilization Accounts,"Supply Utilization Accounts (SUA's) present a comprehensive picture of the pattern of a country's food supply during a specified reference period. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. 
Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balances: Supply Utilization Accounts +faostat_sdgb,SDG Indicators: SDG Indicators,SDG Indicators: SDG Indicators,"As the custodian agency of 21 SDG indicators, the Food and Agriculture Organization of the United Nations (FAO) is responsible for curating and refining the methodologies of these indicators, collecting data from national sources, ensuring their quality and compatibility with applicable standards and classifications, and disseminating data at global level. This FAOSTAT domain complements the global SDG database administered by the United Nations Statistical Division, as well as FAO’s SDG indicators portal, by providing access to the available data for each of these indicators. Please click the metadata link on the right hand navigation column for an abridged version of the methodology for compiling each of these indicators, a description of data sources and the relevant contact persons responsible for each indicator in the Organization. For a more detailed description of the methodology, data sources and reporting procedures, please follow the link to the official SDG indicator metadata document available at the bottom of each summary metadata page in the document on the right.",SDG Indicators: SDG Indicators +faostat_tcl,Trade: Crops and livestock products,Trade: Crops and livestock products,"The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. 
This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world.",Trade: Crops and livestock products +faostat_ti,Trade: Trade Indices,Trade: Trade Indices,"The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world.",Trade: Trade Indices diff --git a/etl/steps/data/garden/faostat/2024-03-14/custom_elements_and_units.csv b/etl/steps/data/garden/faostat/2024-03-14/custom_elements_and_units.csv index de8f21f9ce0..c937223f26a 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/custom_elements_and_units.csv +++ b/etl/steps/data/garden/faostat/2024-03-14/custom_elements_and_units.csv @@ -1,35 +1,35 @@ dataset,element_code,fao_element,owid_element,fao_unit,fao_unit_short_name,owid_unit,owid_unit_short_name,owid_unit_factor,fao_element_description,owid_element_description,owid_aggregation,was_per_capita,make_per_capita faostat_rl,005110,Area,Area,thousand Hectares,1000 ha,hectares,ha,1000,Extent of surface of land or water. 
Source: FAO Statistics Division,,sum,0,1 faostat_qcl,005312,Area harvested,Area harvested,Hectares,ha,hectares,ha,,"Data refer to the area from which a crop is gathered. Area harvested, therefore, excludes the area from which, although sown or planted, there was no harvest due to damage, failure, etc. It is usually net for temporary crops and some times gross for permanent crops. Net area differs from gross area insofar as the latter includes uncultivated patches, footpaths, ditches, headlands, shoulders, shelterbelts, etc.If the crop under consideration is harvested more than once during the year as a consequence of successive cropping (i.e. the same crop is sown or planted more than once in the same field during the year), the area is counted as many times as harvested. On the contrary, area harvested will be recorded only once in the case of successive gathering of the crop during the year from the same standing crops. With regard to mixed and associated crops, the area sown relating to each crop should be reported separately. When the mixture refers to particular crops, generally grains, it is recommended to treat the mixture as if it were a single crop; therefore, area sown is recorded only for the crop reported. Source: FAO Statistics Division",,sum,0,1 -faostat_fbsc,005301,Domestic supply quantity,Domestic supply,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Production + imports - exports + changes in stocks (decrease or increase) = supply for domestic utilization in the new methodology. There are various ways of defining supply and, in fact, various concepts are in use. The elements involved are production, imports, exports and changes in stocks (increase or decrease). There is no doubt that production, imports and stock changes (either decrease or increase in stocks) are genuine supply elements. 
Source: FAO Statistics Division",,sum,0,1 -faostat_fbsc,005911,Export Quantity,Exports,1000 tonnes,1000 tonnes,tonnes,t,1000.0,,,sum,0,1 -faostat_fbsc,000684,Fat supply quantity (g/capita/day),Food available for consumption,g/capita/day,g/capita/day,grams of fat per day,g/day,,,,sum,1,1 -faostat_fbsc,005521,Feed,Feed,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Data refer to the quantity of the commodity in question available for feeding to the livestock and poultry during the reference period, whether domestically produced or imported. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. Rome.",,sum,0,1 -faostat_fbsc,005142,Food,Food,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Data refer to the total amount of the commodity available as human food during the reference period. Data include the commodity in question, as well as any commodity derived therefrom as a result of further processing. Food from maize, for example, comprises the amount of maize, maize meal and any other products derived therefrom available for human consumption. Food from milk relates to the amounts of milk as such, as well as the fresh milk equivalent of dairy products. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. Rome.",,sum,0,1 -faostat_fbsc,000664,Food supply (kcal/capita/day),Food available for consumption,kcal/capita/day,kcal/capita/day,kilocalories per day,kcal/day,,,,sum,1,1 +faostat_fbsc,005301,Domestic supply quantity,Domestic supply,thousand Tonnes,1000 t,tonnes,t,1000.0,"Production + imports - exports + changes in stocks (decrease or increase) = supply for domestic utilization in the new methodology. There are various ways of defining supply and, in fact, various concepts are in use. The elements involved are production, imports, exports and changes in stocks (increase or decrease). 
There is no doubt that production, imports and stock changes (either decrease or increase in stocks) are genuine supply elements. Source: FAO Statistics Division",,sum,0,1 +faostat_fbsc,005911,Export Quantity,Exports,thousand Tonnes,1000 t,tonnes,t,1000.0,,,sum,0,1 +faostat_fbsc,000684,Fat supply quantity (g/capita/day),Food available for consumption,Grams per capita per day,g/cap/d,grams of fat per day,g/day,,,,sum,1,1 +faostat_fbsc,005521,Feed,Feed,thousand Tonnes,1000 t,tonnes,t,1000.0,"Data refer to the quantity of the commodity in question available for feeding to the livestock and poultry during the reference period, whether domestically produced or imported. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. Rome.",,sum,0,1 +faostat_fbsc,005142,Food,Food,thousand Tonnes,1000 t,tonnes,t,1000.0,"Data refer to the total amount of the commodity available as human food during the reference period. Data include the commodity in question, as well as any commodity derived therefrom as a result of further processing. Food from maize, for example, comprises the amount of maize, maize meal and any other products derived therefrom available for human consumption. Food from milk relates to the amounts of milk as such, as well as the fresh milk equivalent of dairy products. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. 
Rome.",,sum,0,1 +faostat_fbsc,000664,Food supply (kcal/capita/day),Food available for consumption,Kilocalories per capita per day,kcal/cap/d,kilocalories per day,kcal/day,,,,sum,1,1 faostat_fbsc,000645,Food supply quantity (kg/capita/yr),Food available for consumption,Kilograms,kg,kilograms per year,kg,,,,sum,1,1 -faostat_fbsc,005611,Import Quantity,Imports,1000 tonnes,1000 tonnes,tonnes,t,1000.0,,,sum,0,1 -faostat_qcl,005313,Laying,,1000 Head,1000 Head,animals,animals,1000.0,,,sum,0,0 -faostat_fbsc,005123,Losses,Waste in Supply Chain,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Amount of the commodity in question lost through wastage (waste) during the year at all stages between the level at which production is recorded and the household, i.e. storage and transportation. Losses occurring before and during harvest are excluded. Waste from both edible and inedible parts of the commodity occurring in the household is also excluded. Quantities lost during the transformation of primary commodities into processed products are taken into account in the assessment of respective extraction/conversion rates. Distribution wastes tend to be considerable in countries with hot humid climate, difficult transportation and inadequate storage or processing facilities. This applies to the more perishable foodstuffs, and especially to those which have to be transported or stored for a long time in a tropical climate. Waste is often estimated as a fixed percentage of availability, the latter being defined as production plus imports plus stock withdrawals. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. Rome.",,sum,0,1 -faostat_qcl,005318,Milk Animals,,Head,Head,,,,,,sum,0,0 -faostat_fbsc,005154,Other uses (non-food),Other uses,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Data refer to quantities of commodities used for non-food purposes, e.g. oil for soap. 
In order not to distort the picture of the national food pattern quantities of the commodity in question consumed mainly by tourists are included here (see also ""Per capita supply""). In addition, this variable covers pet food. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. Rome.",,sum,0,1 -faostat_fbsc,005131,Processing,,1000 tonnes,1000 tonnes,tonnes,t,1000.0,,,sum,0,0 +faostat_fbsc,005611,Import Quantity,Imports,thousand Tonnes,1000 t,tonnes,t,1000.0,,,sum,0,1 +faostat_qcl,005313,Laying,,thousand Animals,1000 An,animals,animals,1000.0,,,sum,0,0 +faostat_fbsc,005123,Losses,Waste in Supply Chain,thousand Tonnes,1000 t,tonnes,t,1000.0,"Amount of the commodity in question lost through wastage (waste) during the year at all stages between the level at which production is recorded and the household, i.e. storage and transportation. Losses occurring before and during harvest are excluded. Waste from both edible and inedible parts of the commodity occurring in the household is also excluded. Quantities lost during the transformation of primary commodities into processed products are taken into account in the assessment of respective extraction/conversion rates. Distribution wastes tend to be considerable in countries with hot humid climate, difficult transportation and inadequate storage or processing facilities. This applies to the more perishable foodstuffs, and especially to those which have to be transported or stored for a long time in a tropical climate. Waste is often estimated as a fixed percentage of availability, the latter being defined as production plus imports plus stock withdrawals. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. 
Rome.",,sum,0,1 +faostat_qcl,005318,Milk Animals,,Animals,An,,,,,,sum,0,0 +faostat_fbsc,005154,Other uses (non-food),Other uses,thousand Tonnes,1000 t,tonnes,t,1000.0,"Data refer to quantities of commodities used for non-food purposes, e.g. oil for soap. In order not to distort the picture of the national food pattern quantities of the commodity in question consumed mainly by tourists are included here (see also ""Per capita supply""). In addition, this variable covers pet food. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. Rome.",,sum,0,1 +faostat_fbsc,005131,Processing,,thousand Tonnes,1000 t,tonnes,t,1000.0,,,sum,0,0 faostat_qcl,005314,Prod Popultn,,Number,No,,,,,,sum,0,0 -faostat_qcl,005320,Producing Animals/Slaughtered,Producing or slaughtered animals,Head,Head,animals,animals,,,,sum,0,1 -faostat_qcl,005321,Producing Animals/Slaughtered,Producing or slaughtered animals,1000 Head,1000 Head,animals,animals,1000.0,,,sum,0,1 -faostat_fbsc,005511,Production,,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Figures relate to the total domestic production whether inside or outside the agricultural sector, i.e. it includes non-commercial production and production from kitchen gardens. Unless otherwise indicated, production is reported at the farm level for crop and livestock products (i.e. in the case of crops, excluding harvesting losses) and in terms of live weight for fish items (i.e. the actual ex-water weight at the time of the catch). All data shown relate to total meat production from both commercial and farm slaughter. Data are expressed in terms of dressed carcass weight, excluding offal and slaughter fats. Production of beef and buffalo meat includes veal; mutton and goat meat includes meat from lambs and kids; pig meat includes bacon and ham in fresh equivalent. Poultry meat includes meat from all domestic birds and refers, wherever possible, to ready-to-cook weight. 
Source: FAO Statistics Division",,sum,0,0 +faostat_qcl,005321,Producing Animals/Slaughtered,Producing or slaughtered animals,thousand Animals,1000 An,animals,animals,1000.0,,,sum,0,1 +faostat_qcl,005320,Producing Animals/Slaughtered,Producing or slaughtered animals,Animals,An,animals,animals,,,,sum,0,1 +faostat_fbsc,005511,Production,,thousand Tonnes,1000 t,tonnes,t,1000.0,"Figures relate to the total domestic production whether inside or outside the agricultural sector, i.e. it includes non-commercial production and production from kitchen gardens. Unless otherwise indicated, production is reported at the farm level for crop and livestock products (i.e. in the case of crops, excluding harvesting losses) and in terms of live weight for fish items (i.e. the actual ex-water weight at the time of the catch). All data shown relate to total meat production from both commercial and farm slaughter. Data are expressed in terms of dressed carcass weight, excluding offal and slaughter fats. Production of beef and buffalo meat includes veal; mutton and goat meat includes meat from lambs and kids; pig meat includes bacon and ham in fresh equivalent. Poultry meat includes meat from all domestic birds and refers, wherever possible, to ready-to-cook weight. Source: FAO Statistics Division",,sum,0,0 faostat_qcl,005513,Production,,thousand Number,1000 No,number,number,1000.0,"Figures relate to the total domestic production whether inside or outside the agricultural sector, i.e. it includes non-commercial production and production from kitchen gardens. Unless otherwise indicated, production is reported at the farm level for crop and livestock products (i.e. in the case of crops, excluding harvesting losses) and in terms of live weight for fish items (i.e. the actual ex-water weight at the time of the catch). All data shown relate to total meat production from both commercial and farm slaughter. 
Data are expressed in terms of dressed carcass weight, excluding offal and slaughter fats. Production of beef and buffalo meat includes veal; mutton and goat meat includes meat from lambs and kids; pig meat includes bacon and ham in fresh equivalent. Poultry meat includes meat from all domestic birds and refers, wherever possible, to ready-to-cook weight. Source: FAO Statistics Division",,sum,0,0 -faostat_qcl,005510,Production,Production,tonnes,tonnes,tonnes,t,,Amount produced in the year,,sum,0,1 -faostat_fbsc,000674,Protein supply quantity (g/capita/day),Food available for consumption,g/capita/day,g/capita/day,grams of protein per day,g/day,,,,sum,1,1 -faostat_fbsc,005170,Residuals,,1000 tonnes,1000 tonnes,tonnes,t,1000.0,It is defined as the imbalance (positive or negative) in the supply and utilization equation. It occures mainly due to the inconsitencies of national data provided by countries.,,sum,0,0 -faostat_fbsc,005527,Seed,,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Data include the amounts of the commodity in question set aside for sowing or planting (or generally for reproduction purposes, e.g. sugar cane planted, potatoes for seed, eggs for hatching and fish for bait, whether domestically produced or imported) during the reference period. Account is taken of double or successive sowing or planting whenever it occurs. The data of seed include also, when it is the case, the quantities necessary for sowing or planting the area relating to crops harvested green for fodder or for food.(e.g. green peas, green beans, maize for forage)  Data for seed element are stored in tonnes (t). Whenever official data were not available, seed figures have been estimated either as a percentage of supply (e.g. eggs for hatching) or by multiplying a seed rate with the area under the crop of the subsequent year. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. 
Rome.",,sum,0,0 -faostat_fbsc,005072,Stock Variation,,1000 tonnes,1000 tonnes,tonnes,t,1000.0,"Comprises changes in stocks occurring during the reference period at all levels between the production and the retail levels, i.e. it comprises changes in government stocks, in stocks with manufacturers, importers, exporters, other wholesale and retail merchants, transport and storage enterprises and in stocks on farms. In actual fact, however, the information available often relates only to stocks held by governments and even these are not available for a number of countries and important commodities. In the absence of information on opening and closing stocks changes in stocks are also used for shifting production from the calendar year in which it is harvested to the year in which it is consumed. Net increases in stocks (add to stock) are generally indicated by the sign ""-"". No sign denotes net decreases (from stock). Source: FAO Statistics Division",,sum,0,0 -faostat_qcl,005112,Stocks,,1000 Head,1000 Head,animals,animals,1000.0,"This variable indicates the number of animals of the species present in the country at the time of enumeration. It includes animals raised either for draft purposes or for meat, eggs and dairy production or kept for breeding. Live animals in captivity for fur or skin such as foxes, minks etc. are not included in the system although furskin trade is reported. The enumeration to be chosen, when more than one survey is taken, is the closest to the beginning of the calendar year. Livestock data are reported in number of heads (units) except for poultry, rabbits and other rodents which are reported in thousand units. Source: FAO Statistics Division",,sum,0,0 -faostat_qcl,005111,Stocks,,Head,Head,animals,animals,,"This variable indicates the number of animals of the species present in the country at the time of enumeration. It includes animals raised either for draft purposes or for meat, eggs and dairy production or kept for breeding. 
Live animals in captivity for fur or skin such as foxes, minks etc. are not included in the system although furskin trade is reported. The enumeration to be chosen, when more than one survey is taken, is the closest to the beginning of the calendar year. Livestock data are reported in number of heads (units) except for poultry, rabbits and other rodents which are reported in thousand units. Source: FAO Statistics Division",,sum,0,0 -faostat_fbsc,005171,Tourist consumption,,1000 tonnes,1000 tonnes,tonnes,t,1000.0,,,sum,0,0 -faostat_qcl,005422,Yield,,hg,hg,,,,,,sum,0,0 -faostat_qcl,005410,Yield,Yield,100mg/An,100mg/An,kilograms per animal,kg/animal,0.0001,,,,0,0 -faostat_qcl,005420,Yield,Yield,hg/An,hg/An,kilograms per animal,kg/animal,0.1,,,,0,0 -faostat_qcl,005419,Yield,Yield,hg/ha,hg/ha,tonnes per hectare,t/ha,0.0001,"Harvested production per unit of harvested area for crop products. In most of the cases yield data are not recorded but obtained by dividing the production data by the data on area harvested. Data on yields of permanent crops are not as reliable as those for temporary crops either because most of the area information may correspond to planted area, as for grapes, or because of the scarcity and unreliability of the area figures reported by the countries, as for example for cocoa and coffee. 
Source: FAO Statistics Division",,,0,0 -faostat_qcl,005417,Yield/Carcass Weight,Yield,hg/An,hg/An,kilograms per animal,kg/animal,0.1,,,,0,0 -faostat_qcl,005424,Yield/Carcass Weight,Yield,0.1g/An,0.1g/An,kilograms per animal,kg/animal,0.0001,,,,0,0 +faostat_qcl,005510,Production,Production,Tonnes,t,tonnes,t,,Amount produced in the year,,sum,0,1 +faostat_fbsc,000674,Protein supply quantity (g/capita/day),Food available for consumption,Grams per capita per day,g/cap/d,grams of protein per day,g/day,,,,sum,1,1 +faostat_fbsc,005170,Residuals,,thousand Tonnes,1000 t,tonnes,t,1000.0,It is defined as the imbalance (positive or negative) in the supply and utilization equation. It occures mainly due to the inconsitencies of national data provided by countries.,,sum,0,0 +faostat_fbsc,005527,Seed,,thousand Tonnes,1000 t,tonnes,t,1000.0,"Data include the amounts of the commodity in question set aside for sowing or planting (or generally for reproduction purposes, e.g. sugar cane planted, potatoes for seed, eggs for hatching and fish for bait, whether domestically produced or imported) during the reference period. Account is taken of double or successive sowing or planting whenever it occurs. The data of seed include also, when it is the case, the quantities necessary for sowing or planting the area relating to crops harvested green for fodder or for food.(e.g. green peas, green beans, maize for forage)  Data for seed element are stored in tonnes (t). Whenever official data were not available, seed figures have been estimated either as a percentage of supply (e.g. eggs for hatching) or by multiplying a seed rate with the area under the crop of the subsequent year. Source: FAO. 1986. The ICS users' manual. Interlinked computer strorage and processing system of food and agricultural commodity data. 
Rome.",,sum,0,0 +faostat_fbsc,005072,Stock Variation,,thousand Tonnes,1000 t,tonnes,t,1000.0,"Comprises changes in stocks occurring during the reference period at all levels between the production and the retail levels, i.e. it comprises changes in government stocks, in stocks with manufacturers, importers, exporters, other wholesale and retail merchants, transport and storage enterprises and in stocks on farms. In actual fact, however, the information available often relates only to stocks held by governments and even these are not available for a number of countries and important commodities. In the absence of information on opening and closing stocks changes in stocks are also used for shifting production from the calendar year in which it is harvested to the year in which it is consumed. Net decreases (from stock) are generally indicated by the sign ""-"". No sign denotes net increases (add to stock). Source: FAO Statistics Division",,sum,0,0 +faostat_qcl,005112,Stocks,,thousand Animals,1000 An,animals,animals,1000.0,"This variable indicates the number of animals of the species present in the country at the time of enumeration. It includes animals raised either for draft purposes or for meat, eggs and dairy production or kept for breeding. Live animals in captivity for fur or skin such as foxes, minks etc. are not included in the system although furskin trade is reported. The enumeration to be chosen, when more than one survey is taken, is the closest to the beginning of the calendar year. Livestock data are reported in number of heads (units) except for poultry, rabbits and other rodents which are reported in thousand units. Source: FAO Statistics Division",,sum,0,0 +faostat_qcl,005111,Stocks,,Animals,An,animals,animals,,"This variable indicates the number of animals of the species present in the country at the time of enumeration. It includes animals raised either for draft purposes or for meat, eggs and dairy production or kept for breeding. 
Live animals in captivity for fur or skin such as foxes, minks etc. are not included in the system although furskin trade is reported. The enumeration to be chosen, when more than one survey is taken, is the closest to the beginning of the calendar year. Livestock data are reported in number of heads (units) except for poultry, rabbits and other rodents which are reported in thousand units. Source: FAO Statistics Division",,sum,0,0 +faostat_fbsc,005171,Tourist consumption,,thousand Tonnes,1000 t,tonnes,t,1000.0,,,sum,0,0 +faostat_qcl,005422,Yield,,hundred Grams,100 g,,,,,,sum,0,0 +faostat_qcl,005410,Yield,Yield,hundred Milligrams per animal,100 mg/An,kilograms per animal,kg/animal,0.0001,,,,0,0 +faostat_qcl,005420,Yield,Yield,hundred Grams per animal,100 g/An,kilograms per animal,kg/animal,0.1,,,,0,0 +faostat_qcl,005419,Yield,Yield,hundred Grams per hectare,100 g/ha,tonnes per hectare,t/ha,0.0001,"Harvested production per unit of harvested area for crop products. In most of the cases yield data are not recorded but obtained by dividing the production data by the data on area harvested. Data on yields of permanent crops are not as reliable as those for temporary crops either because most of the area information may correspond to planted area, as for grapes, or because of the scarcity and unreliability of the area figures reported by the countries, as for example for cocoa and coffee. 
Source: FAO Statistics Division",,,0,0 +faostat_qcl,005417,Yield/Carcass Weight,Yield,hundred Grams per animal,100 g/An,kilograms per animal,kg/animal,0.1,,,,0,0 +faostat_qcl,005424,Yield/Carcass Weight,Yield,tenth Grams per animal,0.1 g/An,kilograms per animal,kg/animal,0.0001,,,,0,0 diff --git a/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv b/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv index 1064ca37308..e870867288b 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv +++ b/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv @@ -1,4 +1,5 @@ dataset,item_code,fao_item,owid_item,fao_item_description,owid_item_description +faostat_qcl,00000773,,Flax fibre,, faostat_qcl,00000221,"Almonds, in shell",Almonds,"Almonds, in shell This subclass is defined through the following headings/subheadings of the HS 2007: 0802.11.", faostat_fbsc,00002946,Animal fats,Animal fats group,, faostat_qcl,00000711,"Anise, badian, coriander, cumin, caraway, fennel and juniper berries, raw",Herbs (e.g. fennel),"Anise, badian, coriander, cumin, caraway, fennel and juniper berries, raw This subclass includes: - aniseed, Pimpinella anisum, raw - star anise (badian) or Chinese star anise, Illicium verum, raw - fennel, Foeniculum vulgare, raw (when used as spice) - coriander (cilantro), Coriandrum sativum, raw - cumin, Cuminum cyminum, raw - caraway seeds, Carum carvi, raw - juniper berries, Juniperus communis, raw This subclass does not include: - fennel (when used as a vegetable), cf. 01290 - processed anise, fennel, coriander, cumin, caraway and juniper berries, cf. 23924", @@ -74,7 +75,6 @@ faostat_qcl,00001129,Fat of camels,"Fat, camels",Unrendered slaughter fats (Unof faostat_qcl,00001037,Fat of pigs,"Fat, pigs","Unrendered slaughter fats of pigs, including edible and inedible fats that are removed in the course of dressing the carcass. 
(Unofficial definition)", faostat_fbsc,00002737,"Fats, Animals, Raw",Animal fats,"Default composition: 869 Fat, cattle, 871 Fat, cattle butcher, 949 Fat, buffaloes, 979 Fat, sheep, 994 Grease incl. lanolin wool, 1019 Fat, goats, 1037 Fat, pigs, 1040 Fat, pig butcher, 1043 Lard, 1065 Fat, poultry, 1066 Fat, poultry, rendered, 1129 Fat, camels, 1160 Fat, other camelids, 1168 Oils, fats of animal nes, 1221 Lard stearine oil, 1222 Degras, 1225 Tallow, 1243 Fat, nes, prepared", faostat_fbsc,00002960,"Fish, Seafood",Fish and seafood,, -faostat_qcl,00000773,"Flax, processed but not spun",Flax fibre,"Broken, scutched, hackled etc. but not spun. Traditionally, FAO has used this commodity to identify production in its raw state; in reality, the primary agricultural product is the commodity 01929.01 (Flax, raw or retted) which can either be used for the production of fibre or for other purposes (Unofficial definition)", faostat_qcl,00001738,Fruit Primary,Fruit,"Fruit Crops consist of fruits and berries that, with few exceptions, are characterized by their sweet taste. Nearly all are permanent crops, mainly from trees, bushes and shrubs, as well as vines and palms. Fruits and berries grow on branches, stalks or the trunks of plants, usually singly, but sometimes grouped in bunches or clusters (e.g. bananas and grapes). Commercial crops are cultivated in plantations, but significant quantities of fruits are also collected from scattered plants that may or may not be cultivated. Although melons and watermelons are generally considered to be fruits, FAO groups them with vegetables because they are temporary crops. Fruit crops are highly perishable. Their shelf life may be extended through the application of chemical substances that inhibit the growth of micro-organisms and through careful control of the surrounding temperature, pressure and humidity once the fruit has been picked. 
Fruits and berries have a very high water content accounting for some 70- 90 percent of their weight. They contain, in various degrees, minerals, vitamins and organic acids, some of which reside in the peel or skin. Some fruits have a high fibre content and other inedible components, so that wastage is high, e.g. 60 percent for passion fruit and 35-45 percent for pineapples. The waste in temperate zone fruit is lower, generally of the order of 10-15 percent, while berries contain very little waste. The carbohydrate content of fruits varies widely. Protein content is very low, averaging less than 1 percent, or below that in vegetables. Fat content in fruit is negligible, with the notable exception of avocados. Fruit crops are consumed directly as food and are processed into dried fruit, fruit juice, canned fruit, frozen fruit, jam, alcoholic beverages, etc. Fruit crops are not normally grown for animal feed, although significant quantities of diseased and substandard fruits, as well as certain by-products of the fruit processing industry, are fed to animals. Production data for fruit crops should relate to fruits actually harvested. Data on bananas and plantains should relate to the weight of single bananas or banana hands, excluding the weight of the central stalk. FAO lists 36 primary fruit crops.", faostat_fbsc,00002919,Fruits - Excluding Wine,Fruit,, faostat_qcl,00001163,"Game meat, fresh, chilled or frozen","Meat, game","Meat and offals of wild animals, whether fresh, chilled or frozen. 
(Unofficial definition)", From c7dd40ab6caaec6186385040fe34b9239c666501 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 14 Mar 2024 18:33:04 +0100 Subject: [PATCH 13/54] Add element amendments for fbsc --- .../garden/faostat/2024-03-14/faostat_fbsc.py | 24 +++++--- .../faostat/2024-03-14/faostat_metadata.py | 20 +++--- .../garden/faostat/2024-03-14/faostat_qcl.py | 2 +- .../data/garden/faostat/2024-03-14/shared.py | 61 +++++++++++++++++-- 4 files changed, 84 insertions(+), 23 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py index 7ac8c6d544c..4d90e249e4b 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py @@ -24,6 +24,7 @@ from shared import ( ADDED_TITLE_TO_WIDE_TABLE, CURRENT_DIR, + ELEMENTS_IN_FBSH_MISSING_IN_FBS, NAMESPACE, add_per_capita_variables, add_regions, @@ -75,9 +76,9 @@ def combine_fbsh_and_fbs_datasets( # Harmonize items and elements in both datasets. fbsh = harmonize_items(df=fbsh, dataset_short_name="faostat_fbsh") - fbsh = harmonize_elements(df=fbsh) + fbsh = harmonize_elements(df=fbsh, dataset_short_name="faostat_fbsh") fbs = harmonize_items(df=fbs, dataset_short_name="faostat_fbs") - fbs = harmonize_elements(df=fbs) + fbs = harmonize_elements(df=fbs, dataset_short_name="faostat_fbs") # Ensure there is no overlap in data between the two datasets, and that there is no gap between them. assert fbs["year"].min() == FBS_FIRST_YEAR, f"First year of fbs dataset is not {FBS_FIRST_YEAR}" @@ -91,18 +92,27 @@ def combine_fbsh_and_fbs_datasets( # Ensure the elements that are in fbsh but not in fbs are covered by ITEMS_MAPPING. error = "Mismatch between items in fbsh and fbs. Redefine shared.ITEM_AMENDMENTS." assert set(fbsh["item"]) == set(fbs["item"]), error + assert set(fbsh["item_code"]) == set(fbs["item_code"]), error # Some elements are found in fbs but not in fbsh. 
This is understandable, since fbs is # more recent and may have additional elements. However, ensure that there are no # elements in fbsh that are not in fbs. error = "There are elements in fbsh that are not in fbs." assert set(fbsh["element"]) < set(fbs["element"]), error + assert set(fbsh["element_code"]) - set(fbs["element_code"]) == ELEMENTS_IN_FBSH_MISSING_IN_FBS, error + + # Remove elements from fbsh that are not in fbs (since they have different meanings and hence should not be + # combined as if they were the same element). + fbsh = fbsh[~fbsh["element_code"].isin(ELEMENTS_IN_FBSH_MISSING_IN_FBS)].reset_index(drop=True) # Concatenate old and new dataframes using function that keeps categoricals. fbsc = dataframes.concatenate([fbsh, fbs]).sort_values(["area", "year"]).reset_index(drop=True) # Ensure that each element has only one unit and one description. - error = "Some elements in the combined dataset have more than one unit." - assert fbsc.groupby("element")["unit"].nunique().max() == 1, error + error = "Some elements in the combined dataset have more than one unit. Manually check them and consider adding them to ELEMENT_AMENDMENTS." + units_per_element = fbsc.groupby("element", as_index=False, observed=True)["unit"].nunique() + elements_with_ambiguous_units = units_per_element[units_per_element["unit"] > 1]["element"].tolist() + fbsc[fbsc["element"].isin(elements_with_ambiguous_units)].drop_duplicates(subset=["element", "unit"]) + assert len(elements_with_ambiguous_units) == 0, error return cast(pd.DataFrame, fbsc) @@ -128,11 +138,11 @@ def run(dest_dir: str) -> None: # Load fbsh and fbs. log.info("faostat_fbsc.loading_datasets") - fbsh_dataset: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_fbsh") - fbs_dataset: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_fbs") + fbsh_dataset = paths.load_dataset(f"{NAMESPACE}_fbsh") + fbs_dataset = paths.load_dataset(f"{NAMESPACE}_fbs") # Load dataset of FAOSTAT metadata. 
- metadata: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_metadata") + metadata = paths.load_dataset(f"{NAMESPACE}_metadata") # Load dataset, items, element-units, and countries metadata. dataset_metadata = pd.DataFrame(metadata["datasets"]).loc[dataset_short_name].to_dict() diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py index 30663f2186e..d0fdab1bb1e 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py @@ -47,6 +47,7 @@ from owid.datautils import dataframes, io from shared import ( CURRENT_DIR, + ELEMENTS_IN_FBSH_MISSING_IN_FBS, FAOSTAT_METADATA_SHORT_NAME, FLAGS_RANKING, N_CHARACTERS_ELEMENT_CODE, @@ -424,7 +425,12 @@ def create_elements_dataframe_for_domain( data=df, dataset_short_name=dataset_short_name, category="element" ) # Ensure element_code is always a string of a fix number of characters. - elements_from_data = harmonize_elements(df=elements_from_data, element_col="fao_element") + elements_from_data = harmonize_elements( + df=elements_from_data, + dataset_short_name=dataset_short_name, + element_col="fao_element", + unit_col="fao_unit_short_name", + ) # Load elements from metadata. elements_columns = { @@ -440,7 +446,9 @@ def create_elements_dataframe_for_domain( .sort_values(list(elements_columns.values())) .reset_index(drop=True) ) - _elements_df = harmonize_elements(df=_elements_df, element_col="fao_element") + _elements_df = harmonize_elements( + df=_elements_df, dataset_short_name=dataset_short_name, element_col="fao_element", unit_col=None + ) _elements_df["fao_element_description"] = _elements_df["fao_element_description"].astype("string") # Load units metadata. 
@@ -515,17 +523,11 @@ def clean_global_elements_dataframe(elements_df: pd.DataFrame, custom_elements: elements_df = elements_df.copy() # Check that all elements of fbsh are in fbs (although fbs may contain additional elements). - # The only exception is "Stock Variation", which have slightly different definitions: - # On fbs "Stock Variation" (005072), "Net decreases (from stock) are generally indicated by the sign "-". No sign denotes net increases (add to stock)". - # On fbsh "Stock Variation" (005074), "Net increases in stocks (add to stock) are generally indicated by the sign "-". No sign denotes net decreases (from stock).". - # Given that they have different definitions, we should not map one to the other. - # So, for now, simply ignore it. - ELEMENTS_IN_FBSH_MISSING_IN_FBS = {"005074"} assert ( set(elements_df[elements_df["dataset"] == "faostat_fbsh"]["element_code"]) - set(elements_df[elements_df["dataset"] == "faostat_fbs"]["element_code"]) == ELEMENTS_IN_FBSH_MISSING_IN_FBS - ) + ), "There are new elements in fbsh that are not in fbs. Add them to ELEMENTS_IN_FBSH_MISSING_IN_FBS." # Drop all rows for fbsh, and rename "fbs" to "fbsc" (since this will be the name for the combined dataset). elements_df = elements_df[elements_df["dataset"] != "faostat_fbsh"].reset_index(drop=True) elements_df.loc[elements_df["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py index a2db317cbd5..10360f6a588 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py @@ -472,7 +472,7 @@ def run(dest_dir: str) -> None: # # Harmonize items and elements, and clean data. data = harmonize_items(df=data, dataset_short_name=dataset_short_name) - data = harmonize_elements(df=data) + data = harmonize_elements(df=data, dataset_short_name=dataset_short_name) # Prepare data. 
data = clean_data( diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index 50be7b8f4a7..5dc145dcd61 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -13,7 +13,7 @@ import json import sys from pathlib import Path -from typing import Dict, List, cast +from typing import Dict, List, Optional, cast import numpy as np import pandas as pd @@ -67,7 +67,27 @@ }, ], } - +# Manual fixes to elements and units to avoid ambiguities. +ELEMENT_AMENDMENTS = { + "faostat_fbsh": [ + # Mappings to harmonize element and unit names of fbsh with those of fbs. + { + "element_code": "000645", + "fao_element": "Food supply quantity (kg/capita/yr)", + "fao_unit": "kg/cap", + "new_element_code": "000645", + "new_fao_element": "Food supply quantity (kg/capita/yr)", + "new_fao_unit": "kg", + }, + ], +} +# Ideally, all elements of fbsh should be in fbs (although fbs may contain additional elements). +# The only exception is "Stock Variation", which have slightly different definitions: +# On fbs "Stock Variation" (005072), "Net decreases (from stock) are generally indicated by the sign "-". No sign denotes net increases (add to stock)". +# On fbsh "Stock Variation" (005074), "Net increases in stocks (add to stock) are generally indicated by the sign "-". No sign denotes net decreases (from stock).". +# Given that they have different definitions, we should not map one to the other. +# So, for now, simply ignore it. +ELEMENTS_IN_FBSH_MISSING_IN_FBS = {"005074"} # Countries and regions. 
@@ -428,13 +448,18 @@ def harmonize_items(df: pd.DataFrame, dataset_short_name: str, item_col: str = " return df -def harmonize_elements(df: pd.DataFrame, element_col: str = "element") -> pd.DataFrame: +def harmonize_elements( + df: pd.DataFrame, dataset_short_name: str, element_col: str = "element", unit_col: Optional[str] = "unit" +) -> pd.DataFrame: """Harmonize element codes (by ensuring they are strings of numbers with a fixed length, prepended with zeros), and make element codes and elements of categorical dtype. Parameters ---------- df : pd.DataFrame + Data before harmonizing element codes. + dataset_short_name : str + Dataset short name. element_col : str Name of element column (this is only necessary to convert element column into categorical dtype). @@ -450,6 +475,30 @@ def harmonize_elements(df: pd.DataFrame, element_col: str = "element") -> pd.Dat # Convert both columns to category to reduce memory df = df.astype({"element_code": "category", element_col: "category"}) + # Fix those few cases where there is more than one item per item code within a given dataset. + if dataset_short_name in ELEMENT_AMENDMENTS: + for amendment in ELEMENT_AMENDMENTS[dataset_short_name]: + # Ensure new item code and item name are added as categories, to avoid errors. + if amendment["new_element_code"] not in df["element_code"].cat.categories: + df["element_code"] = df["element_code"].cat.add_categories(amendment["new_element_code"]) + if amendment["new_fao_element"] not in df[element_col].cat.categories: + df[element_col] = df[element_col].cat.add_categories(amendment["new_fao_element"]) + if unit_col is not None and amendment["new_fao_unit"] not in df[unit_col].cat.categories: + df[unit_col] = df[unit_col].cat.add_categories(amendment["new_fao_unit"]) + + if unit_col is not None: + # Update element code, element name, and unit name. 
+ df.loc[ + (df["element_code"] == amendment["element_code"]) & (df[element_col] == amendment["fao_element"]), + ("element_code", element_col, unit_col), + ] = (amendment["new_element_code"], amendment["new_fao_element"], amendment["new_fao_unit"]) + else: + # Update element code, and element name. + df.loc[ + (df["element_code"] == amendment["element_code"]) & (df[element_col] == amendment["fao_element"]), + ("element_code", element_col), + ] = (amendment["new_element_code"], amendment["new_fao_element"]) + return df @@ -1821,13 +1870,13 @@ def run(dest_dir: str) -> None: paths = PathFinder(current_step_file.as_posix()) # Load latest meadow dataset and keep its metadata. - ds_meadow: catalog.Dataset = paths.load_dependency(dataset_short_name) + ds_meadow = paths.load_dataset(dataset_short_name) # Load main table from dataset. tb_meadow = ds_meadow[dataset_short_name] data = pd.DataFrame(tb_meadow).reset_index() # Load dataset of FAOSTAT metadata. - metadata: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_metadata") + metadata = paths.load_dataset(f"{NAMESPACE}_metadata") # Load dataset, items, element-units, countries metadata, and value amendments. dataset_metadata = pd.DataFrame(metadata["datasets"]).loc[dataset_short_name].to_dict() @@ -1843,7 +1892,7 @@ def run(dest_dir: str) -> None: # # Harmonize items and elements, and clean data. data = harmonize_items(df=data, dataset_short_name=dataset_short_name) - data = harmonize_elements(df=data) + data = harmonize_elements(df=data, dataset_short_name=dataset_short_name) # Prepare data. 
data = clean_data( From 8a39331ad1d11e2abf43ceb5e1892e8e64da8cbf Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Fri, 15 Mar 2024 13:28:44 +0100 Subject: [PATCH 14/54] Fix various issues --- .../garden/faostat/2024-03-14/faostat_fbsc.py | 3 -- .../garden/faostat/2024-03-14/faostat_qcl.py | 3 -- .../data/garden/faostat/2024-03-14/shared.py | 32 ++++++++++--------- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py index 4d90e249e4b..7ea82be59f9 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py @@ -222,8 +222,5 @@ def run(dest_dir: str) -> None: ds_garden.metadata.description = dataset_metadata["owid_dataset_description"] + anomaly_descriptions ds_garden.metadata.title = dataset_metadata["owid_dataset_title"] - # Update the main source's metadata description (which will be shown in charts). - ds_garden.metadata.sources[0].description = ds_garden.metadata.description - # Create garden dataset. ds_garden.save() diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py index 10360f6a588..20011f19d5e 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py @@ -527,8 +527,5 @@ def run(dest_dir: str) -> None: ) ds_garden.metadata.title = dataset_metadata["owid_dataset_title"] - # Update the main source's metadata description (which will be shown in charts). - ds_garden.metadata.sources[0].description = ds_garden.metadata.description - # Create garden dataset. 
ds_garden.save() diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index 5dc145dcd61..3310f51d719 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -1241,9 +1241,12 @@ def add_fao_population_if_given(data: pd.DataFrame) -> pd.DataFrame: Data, after adding a column 'fao_population', if FAO population was found in the data. """ - # Select rows that correspond to FAO population. + # Name of item and element of FAO population (used to select population in the data). fao_population_item_name = "Population" fao_population_element_name = "Total Population - Both sexes" + # Expected name of unit of FAO population. + fao_population_unit_name = "thousand Number" + # Select rows that correspond to FAO population. population_rows_mask = (data["fao_item"] == fao_population_item_name) & ( data["fao_element"] == fao_population_element_name ) @@ -1254,7 +1257,9 @@ def add_fao_population_if_given(data: pd.DataFrame) -> pd.DataFrame: fao_population = data[population_rows_mask].reset_index(drop=True) # Check that population is given in "1000 persons" and convert to persons. - assert list(fao_population["unit"].unique()) == ["1000 persons"], "FAO population may have changed units." + assert list(fao_population["unit"].unique()) == [ + fao_population_unit_name + ], "FAO population may have changed units." fao_population["value"] *= 1000 # Note: Here we will dismiss the flags related to population. But they are only relevant for those columns @@ -1553,17 +1558,17 @@ def clean_data( } ) - # Dataset faostat_wcad doesn't have a year column, but a "census_year", which has intervals like "2002-2003" - if "census_year" in data.columns: - if data["census_year"].astype(str).str.contains("-.{4}/", regex=True).any(): - log.warning( - "Column 'census_year' in dataset 'faostat_wcad' contains values that need to be properly analysed " - "and processed, e.g. 
1976-1977/1980-1981. For the moment, we take the first 4 digits as the year." - ) + # # Dataset faostat_wcad doesn't have a year column, but a "census_year", which has intervals like "2002-2003" + # if "census_year" in data.columns: + # if data["census_year"].astype(str).str.contains("-.{4}/", regex=True).any(): + # log.warning( + # "Column 'census_year' in dataset 'faostat_wcad' contains values that need to be properly analysed " + # "and processed, e.g. 1976-1977/1980-1981. For the moment, we take the first 4 digits as the year." + # ) - # Remove rows that don't have a census year, and take the first 4 digits - data = data.dropna(subset="census_year").reset_index(drop=True) - data["year"] = data["census_year"].astype(str).str[0:4].astype(int) + # # Remove rows that don't have a census year, and take the first 4 digits + # data = data.dropna(subset="census_year").reset_index(drop=True) + # data["year"] = data["census_year"].astype(str).str[0:4].astype(int) # Ensure year column is integer (sometimes it is given as a range of years, e.g. 2013-2015). data["year"] = clean_year_column(data["year"]) @@ -1936,8 +1941,5 @@ def run(dest_dir: str) -> None: ds_garden.metadata.description = dataset_metadata["owid_dataset_description"] + anomaly_descriptions ds_garden.metadata.title = dataset_metadata["owid_dataset_title"] - # Update the main source's metadata description (which will be shown in charts). - ds_garden.metadata.sources[0].description = ds_garden.metadata.description - # Create garden dataset. 
ds_garden.save() From 9ff00b52a0c84392ca47710008155341d8604732 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Fri, 15 Mar 2024 15:13:44 +0100 Subject: [PATCH 15/54] Remove anomaly fix on cocoa beans, which is no longer in the data --- docs/data/faostat.md | 2 +- .../faostat/2024-03-14/detected_anomalies.py | 110 +----------------- 2 files changed, 2 insertions(+), 110 deletions(-) diff --git a/docs/data/faostat.md b/docs/data/faostat.md index 6e8520f4e74..0eb222d43d0 100644 --- a/docs/data/faostat.md +++ b/docs/data/faostat.md @@ -254,7 +254,7 @@ If no dataset requires an update, the workflow stops here. Optionally, set `INSPECT_ANOMALIES=True`, to visualize if anomalies that were detected in the previous version of the data are still present in the current version. ```bash - INSPECT_ANOMALIES=True etl garden/faostat/YYYY-MM-DD + INSPECT_ANOMALIES=True etl run garden/faostat/YYYY-MM-DD ``` !!! note diff --git a/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py b/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py index 22d10b928cd..380c24e2ddc 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py +++ b/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py @@ -197,112 +197,6 @@ def fix(self, df): return df_fixed -class CocoaBeansFoodAvailableAnomaly(DataAnomaly): - description = ( # type: ignore - "Food available for consumption for cocoa beans from 2010 onwards presents many zeros for different countries. " - "These zeros are likely to correspond to missing data. " - "This issue may be caused by a change in FAO methodology precisely on 2010. " - "Therefore, to be conservative, we eliminate those zeros and treat them as missing values. " - "For aggregate regions (like continents), data from 2010 onwards is not zero, but a small number (resulting " - "from summing many spurious zeros). " - "Therefore, we also remove data for region aggregates from 2010 onwards." 
- ) - - affected_item_codes = [ - "00002633", - ] - affected_element_codes = [ - "000645", - "0645pc", - "005142", - "5142pc", - ] - # List of countries with value of exactly zero for all years after 2010. - # This list does not need to include all countries with that problem (it's used just to check they are still zero). - expected_countries_with_all_zero = [ - "United States", - "China", - "Norway", - ] - - def check(self, df): - assert ( - df[ - ( - (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & (df["year"] >= 2010) - & (df["country"].isin(self.expected_countries_with_all_zero)) - ) - ]["value"] - == 0 - ).all() - # Check that, for the same countries, there is at least one value prior to 2010 where value is not zero. - assert ( - df[ - ( - (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & (df["year"] < 2010) - & (df["country"].isin(self.expected_countries_with_all_zero)) - ) - ]["value"] - > 0 - ).any() - - def inspect(self, df): - log.info( - "The anomaly causes: " - "\n* Zeros from 2010 onwards. " - "\n* I's usually zero all years, but some countries also have single non-zero values (e.g. Afghanistan)." - ) - for element_code in self.affected_element_codes: - selection = (df["item_code"].isin(self.affected_item_codes)) & (df["element_code"] == element_code) - df_affected = df[selection].astype({"country": str}).sort_values(["country", "year"]) - title = _split_long_title(self.description + f"Element code {element_code}") - fig = px.line(df_affected, x="year", y="value", color="country", title=title, markers=True) - fig.show() - - def fix(self, df): - # Remove all possibly spurious zeros from 2010 onwards in all countries. 
- indexes_to_drop = df[ - ( - (df["year"] > 2010) - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & (df["value"] == 0) - ) - ].index.tolist() - # Additionally, remove all data for region aggregates from 2010 onwards. - # List of possibly affected region aggregates, including all original FAO region aggregates. - aggregates = [ - "North America", - "South America", - "Europe", - "European Union (27)", - "Africa", - "Asia", - "Oceania", - "Low-income countries", - "Upper-middle-income countries", - "Lower-middle-income countries", - "High-income countries", - "World", - ] + sorted(set(df[df["country"].str.contains("FAO")]["country"])) - indexes_to_drop.extend( - df[ - (df["country"].isin(aggregates)) - & (df["year"] >= 2010) - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - ].index.tolist() - ) - - df_fixed = df.drop(indexes_to_drop).reset_index(drop=True) - - return df_fixed - - class EggYieldNorthernEuropeAnomaly(DataAnomaly): description = ( # type: ignore "The amount of eggs produced per bird for Northern Europe (FAO) is unreasonably high before 1973, with values " @@ -809,9 +703,7 @@ class OtherTropicalFruitYieldSouthAmericaAnomaly(HighYieldAnomaly): OtherTropicalFruitYieldNorthernAfricaAnomaly, OtherTropicalFruitYieldSouthAmericaAnomaly, ], - "faostat_fbsc": [ - CocoaBeansFoodAvailableAnomaly, - ], + "faostat_fbsc": [], } From eb45fabfeef1e79ef4cf26ba6950d1e5e56215db Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Fri, 15 Mar 2024 16:00:41 +0100 Subject: [PATCH 16/54] Fix issues with region aggregates in fbsc --- dag/faostat.yml | 49 +++---- .../garden/faostat/2024-03-14/faostat_fbsc.py | 18 ++- .../data/garden/faostat/2024-03-14/shared.py | 132 ++++-------------- 3 files changed, 70 insertions(+), 129 deletions(-) diff --git a/dag/faostat.yml b/dag/faostat.yml index 7a44fd08583..0a68aaec371 100644 --- a/dag/faostat.yml +++ 
b/dag/faostat.yml @@ -389,63 +389,64 @@ steps: - data://meadow/faostat/2024-03-14/faostat_cahd - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_ei: - data://garden/faostat/2024-03-14/faostat_metadata - data://meadow/faostat/2024-03-14/faostat_ei - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_ek: - data://meadow/faostat/2024-03-14/faostat_ek - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_emn: - data://garden/faostat/2024-03-14/faostat_metadata - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups - data://grapher/demography/2023-03-31/population - data://meadow/faostat/2024-03-14/faostat_emn data://garden/faostat/2024-03-14/faostat_esb: - data://garden/faostat/2024-03-14/faostat_metadata - data://meadow/faostat/2024-03-14/faostat_esb - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_fa: - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - data://meadow/faostat/2024-03-14/faostat_fa - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_fbsc: - data://meadow/faostat/2024-03-14/faostat_fbsh - data://meadow/faostat/2024-03-14/faostat_fbs - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - - 
data://grapher/wb/2024-03-11/income_groups + - data://garden/regions/2023-01-01/regions + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_fo: - data://garden/faostat/2024-03-14/faostat_metadata - data://meadow/faostat/2024-03-14/faostat_fo - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_food_explorer: - data://garden/faostat/2024-03-14/faostat_qcl - data://garden/faostat/2024-03-14/faostat_fbsc - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_fs: - data://meadow/faostat/2024-03-14/faostat_fs - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_ic: - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - data://meadow/faostat/2024-03-14/faostat_ic - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_lc: - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups - data://meadow/faostat/2024-03-14/faostat_lc data://garden/faostat/2024-03-14/faostat_metadata: - data://meadow/faostat/2024-03-14/faostat_rt @@ -482,59 +483,59 @@ steps: - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - data://meadow/faostat/2024-03-14/faostat_qcl - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups 
data://garden/faostat/2024-03-14/faostat_qi: - data://meadow/faostat/2024-03-14/faostat_qi - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_qv: - data://meadow/faostat/2024-03-14/faostat_qv - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_rfb: - data://garden/faostat/2024-03-14/faostat_metadata - data://meadow/faostat/2024-03-14/faostat_rfb - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_rfn: - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - data://meadow/faostat/2024-03-14/faostat_rfn - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_rl: - data://garden/faostat/2024-03-14/faostat_metadata - data://meadow/faostat/2024-03-14/faostat_rl - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_rp: - data://meadow/faostat/2024-03-14/faostat_rp - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_rt: - data://garden/faostat/2024-03-14/faostat_metadata - data://meadow/faostat/2024-03-14/faostat_rt - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - 
data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_scl: - data://meadow/faostat/2024-03-14/faostat_scl - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_sdgb: - data://garden/faostat/2024-03-14/faostat_metadata - data://meadow/faostat/2024-03-14/faostat_sdgb - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups - data://grapher/demography/2023-03-31/population data://garden/faostat/2024-03-14/faostat_tcl: - data://meadow/faostat/2024-03-14/faostat_tcl - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_ti: - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - data://meadow/faostat/2024-03-14/faostat_ti - - data://grapher/wb/2024-03-11/income_groups + - data://garden/wb/2024-03-11/income_groups diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py index 7ea82be59f9..90e1e429c02 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py @@ -153,6 +153,15 @@ def run(dest_dir: str) -> None: countries_metadata = pd.DataFrame(metadata["countries"]).reset_index() amendments = parse_amendments_table(amendments=metadata["amendments"], dataset_short_name=dataset_short_name) + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") + + # Lod population dataset. + ds_population = paths.load_dataset("population") + # # Process data. 
# @@ -169,6 +178,7 @@ def run(dest_dir: str) -> None: # Prepare data. data = clean_data( data=data, + ds_population=ds_population, items_metadata=items_metadata, elements_metadata=elements_metadata, countries_metadata=countries_metadata, @@ -176,7 +186,13 @@ def run(dest_dir: str) -> None: ) # Add data for aggregate regions. - data = add_regions(data=data, elements_metadata=elements_metadata) + data = add_regions( + data=data, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + ds_population=ds_population, + elements_metadata=elements_metadata, + ) # Add per-capita variables. data = add_per_capita_variables(data=data, elements_metadata=elements_metadata) diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index 3310f51d719..c397e4e779c 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -25,7 +25,6 @@ from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset -from etl.paths import DATA_DIR # Initialise log. log = structlog.get_logger() @@ -155,10 +154,7 @@ # countries, to avoid double-counting the data of those countries. # Note: This list does not contain all country groups, but only those that are in our list of harmonized countries # (without the *(FAO) suffix). -REGIONS_TO_IGNORE_IN_AGGREGATES = [ - "Melanesia", - "Polynesia", -] +REGIONS_TO_IGNORE_IN_AGGREGATES = [] # When creating region aggregates, decide how to distribute historical regions. # The following decisions are based on the current location of the countries that succeeded the region, and their income @@ -901,7 +897,7 @@ def remove_regions_from_countries_regions_members( return countries_regions -def load_population() -> pd.DataFrame: +def load_population(ds_population: catalog.Dataset) -> pd.DataFrame: """Load OWID population dataset, and add historical regions to it. 
Returns @@ -911,9 +907,7 @@ def load_population() -> pd.DataFrame: """ # Load population dataset. - population = catalog.Dataset(DATA_DIR / "garden/owid/latest/key_indicators/")["population"].reset_index()[ - ["country", "year", "population"] - ] + population = ds_population["population"].reset_index()[["country", "year", "population"]] # Add data for historical regions (if not in population) by adding the population of its current successors. countries_with_population = population["country"].unique() @@ -937,92 +931,6 @@ def load_population() -> pd.DataFrame: return cast(pd.DataFrame, population) -def load_countries_regions() -> pd.DataFrame: - """Load countries-regions dataset from the OWID catalog, and remove certain regions (defined in - REGIONS_TO_IGNORE_IN_AGGREGATES) from the lists of members of countries or regions. - - Returns - ------- - countries_regions : pd.DataFrame - Countries-regions dataset. - - """ - # Load dataset of countries and regions. - countries_regions = catalog.Dataset(DATA_DIR / "garden/regions/2023-01-01/regions")["regions"] - - countries_regions = remove_regions_from_countries_regions_members( - countries_regions, regions_to_remove=REGIONS_TO_IGNORE_IN_AGGREGATES - ) - - return cast(pd.DataFrame, countries_regions) - - -def load_income_groups() -> pd.DataFrame: - """Load dataset of income groups and add historical regions to it. - - Returns - ------- - income_groups : pd.DataFrame - Income groups data. - - """ - # Load the WorldBank dataset for income grups. - income_groups = catalog.Dataset(DATA_DIR / "garden/wb/2021-07-01/wb_income")["wb_income_group"].reset_index() - - # Add historical regions to income groups. 
- for historic_region in HISTORIC_TO_CURRENT_REGION: - historic_region_income_group = HISTORIC_TO_CURRENT_REGION[historic_region]["income_group"] - if historic_region not in income_groups["country"]: - historic_region_df = pd.DataFrame( - { - "country": [historic_region], - "income_group": [historic_region_income_group], - } - ) - income_groups = pd.concat([income_groups, historic_region_df], ignore_index=True) - - return cast(pd.DataFrame, income_groups) - - -def list_countries_in_region(region: str, countries_regions: pd.DataFrame, income_groups: pd.DataFrame) -> List[str]: - """List all countries in a specific region or income group. - - Parameters - ---------- - region : str - Name of the region. - countries_regions : pd.DataFrame - Countries-regions dataset (after removing certain regions from the lists of members). - income_groups : pd.DataFrame - Dataset of income groups, which includes historical regions. - - Returns - ------- - countries_in_regions : list - List of countries in the given region or income group. - - """ - # Number of attempts to fetch countries regions data. - attempts = 5 - attempt = 0 - countries_in_region = list() - while attempt < attempts: - try: - # List countries in region. - countries_in_region = geo.list_countries_in_region( - region=region, - countries_regions=countries_regions, - income_groups=income_groups, - ) - break - except ConnectionResetError: - attempt += 1 - finally: - assert len(countries_in_region) > 0, "Unable to fetch countries-regions data." 
- - return countries_in_region - - def remove_overlapping_data_between_historical_regions_and_successors( data_region: pd.DataFrame, ) -> pd.DataFrame: @@ -1082,7 +990,13 @@ def remove_overlapping_data_between_historical_regions_and_successors( return data_region -def add_regions(data: pd.DataFrame, elements_metadata: pd.DataFrame) -> pd.DataFrame: +def add_regions( + data: pd.DataFrame, + ds_regions: catalog.Dataset, + ds_income_groups: catalog.Dataset, + ds_population: catalog.Dataset, + elements_metadata: pd.DataFrame, +) -> pd.DataFrame: """Add region aggregates (i.e. aggregate data for continents and income groups). Regions to be created are defined above, in REGIONS_TO_ADD, and the variables for which data will be aggregated are @@ -1118,9 +1032,7 @@ def add_regions(data: pd.DataFrame, elements_metadata: pd.DataFrame) -> pd.DataF log.info("add_regions", shape=data.shape) # Load population dataset, countries-regions, and income groups datasets. - population = load_population() - countries_regions = load_countries_regions() - income_groups = load_income_groups() + population = load_population(ds_population=ds_population) # Invert dictionary of aggregations to have the aggregation as key, and the list of element codes as value. 
aggregations_inverted = { @@ -1128,8 +1040,12 @@ def add_regions(data: pd.DataFrame, elements_metadata: pd.DataFrame) -> pd.DataF for unique_value in aggregations.values() } for region in tqdm(REGIONS_TO_ADD, file=sys.stdout): - countries_in_region = list_countries_in_region( - region, countries_regions=countries_regions, income_groups=income_groups + countries_in_region = geo.list_members_of_region( + region, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + excluded_regions=REGIONS_TO_IGNORE_IN_AGGREGATES, + include_historical_regions_in_income_groups=True, ) region_code = REGIONS_TO_ADD[region]["area_code"] region_population = population[population["country"] == region][["year", "population"]].reset_index( @@ -1201,7 +1117,7 @@ def add_regions(data: pd.DataFrame, elements_metadata: pd.DataFrame) -> pd.DataF # Add data for current region to data. data = dataframes.concatenate( - [data[data["country"] != region], data_region], + [data[data["country"] != region].reset_index(drop=True), data_region], ignore_index=True, ) @@ -1279,6 +1195,7 @@ def add_fao_population_if_given(data: pd.DataFrame) -> pd.DataFrame: def add_population( df: pd.DataFrame, + ds_population: catalog.Dataset, country_col: str = "country", year_col: str = "year", population_col: str = "population", @@ -1313,7 +1230,7 @@ def add_population( """ # Load population dataset. - population = load_population().rename( + population = load_population(ds_population=ds_population).rename( columns={ "country": country_col, "year": year_col, @@ -1504,6 +1421,7 @@ def clean_data_values(values: pd.Series, amendments: Dict[str, str]) -> pd.Serie def clean_data( data: pd.DataFrame, + ds_population: catalog.Dataset, items_metadata: pd.DataFrame, elements_metadata: pd.DataFrame, countries_metadata: pd.DataFrame, @@ -1603,7 +1521,9 @@ def clean_data( # Add column for population; when creating region aggregates, this column will have the population of the countries # for which there was data. 
For example, for Europe in a specific year, the population may differ from item to item, # because for one item we may have more European countries informed than for the other. - data = add_population(df=data, population_col="population_with_data", warn_on_missing_countries=False) + data = add_population( + df=data, ds_population=ds_population, population_col="population_with_data", warn_on_missing_countries=False + ) # Convert back to categorical columns (maybe this should be handled automatically in `add_population_to_dataframe`) data = data.astype({"country": "category"}) @@ -1892,6 +1812,9 @@ def run(dest_dir: str) -> None: countries_metadata = pd.DataFrame(metadata["countries"]).reset_index() amendments = parse_amendments_table(amendments=metadata["amendments"], dataset_short_name=dataset_short_name) + # Load population dataset. + ds_population = paths.load_dataset("population") + # # Process data. # @@ -1902,6 +1825,7 @@ def run(dest_dir: str) -> None: # Prepare data. data = clean_data( data=data, + ds_population=ds_population, items_metadata=items_metadata, elements_metadata=elements_metadata, countries_metadata=countries_metadata, From d4606b85880b5018277d4b6d045d5fc521bb081a Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 18 Mar 2024 11:24:29 +0100 Subject: [PATCH 17/54] Explicitly use regions, population and income groups datasets --- dag/faostat.yml | 23 +++++++++++++++++++ .../garden/faostat/2024-03-14/faostat_qcl.py | 23 +++++++++++++++---- .../data/garden/faostat/2024-03-14/shared.py | 14 ++++++++++- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/dag/faostat.yml b/dag/faostat.yml index 0a68aaec371..742aa274ef4 100644 --- a/dag/faostat.yml +++ b/dag/faostat.yml @@ -389,31 +389,37 @@ steps: - data://meadow/faostat/2024-03-14/faostat_cahd - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups 
data://garden/faostat/2024-03-14/faostat_ei: - data://garden/faostat/2024-03-14/faostat_metadata - data://meadow/faostat/2024-03-14/faostat_ei - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_ek: - data://meadow/faostat/2024-03-14/faostat_ek - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_emn: - data://garden/faostat/2024-03-14/faostat_metadata - data://garden/wb/2024-03-11/income_groups - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://meadow/faostat/2024-03-14/faostat_emn data://garden/faostat/2024-03-14/faostat_esb: - data://garden/faostat/2024-03-14/faostat_metadata - data://meadow/faostat/2024-03-14/faostat_esb - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_fa: - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - data://meadow/faostat/2024-03-14/faostat_fa + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_fbsc: - data://meadow/faostat/2024-03-14/faostat_fbsh @@ -426,26 +432,31 @@ steps: - data://garden/faostat/2024-03-14/faostat_metadata - data://meadow/faostat/2024-03-14/faostat_fo - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_food_explorer: - data://garden/faostat/2024-03-14/faostat_qcl - data://garden/faostat/2024-03-14/faostat_fbsc - data://garden/faostat/2024-03-14/faostat_metadata - 
data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_fs: - data://meadow/faostat/2024-03-14/faostat_fs - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_ic: - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - data://meadow/faostat/2024-03-14/faostat_ic + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_lc: - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups - data://meadow/faostat/2024-03-14/faostat_lc data://garden/faostat/2024-03-14/faostat_metadata: @@ -483,59 +494,71 @@ steps: - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - data://meadow/faostat/2024-03-14/faostat_qcl + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_qi: - data://meadow/faostat/2024-03-14/faostat_qi - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_qv: - data://meadow/faostat/2024-03-14/faostat_qv - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_rfb: - data://garden/faostat/2024-03-14/faostat_metadata - 
data://meadow/faostat/2024-03-14/faostat_rfb - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_rfn: - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - data://meadow/faostat/2024-03-14/faostat_rfn + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_rl: - data://garden/faostat/2024-03-14/faostat_metadata - data://meadow/faostat/2024-03-14/faostat_rl - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_rp: - data://meadow/faostat/2024-03-14/faostat_rp - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_rt: - data://garden/faostat/2024-03-14/faostat_metadata - data://meadow/faostat/2024-03-14/faostat_rt - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_scl: - data://meadow/faostat/2024-03-14/faostat_scl - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://garden/wb/2024-03-11/income_groups data://garden/faostat/2024-03-14/faostat_sdgb: - data://garden/faostat/2024-03-14/faostat_metadata - data://meadow/faostat/2024-03-14/faostat_sdgb - data://garden/wb/2024-03-11/income_groups - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions data://garden/faostat/2024-03-14/faostat_tcl: - data://meadow/faostat/2024-03-14/faostat_tcl - 
data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population - data://garden/wb/2024-03-11/income_groups + - data://garden/regions/2023-01-01/regions data://garden/faostat/2024-03-14/faostat_ti: - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions - data://meadow/faostat/2024-03-14/faostat_ti - data://garden/wb/2024-03-11/income_groups diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py index 20011f19d5e..a760a09a8cd 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py @@ -4,7 +4,6 @@ import numpy as np import pandas as pd -from owid import catalog from owid.datautils import dataframes from shared import ( ADDED_TITLE_TO_WIDE_TABLE, @@ -450,13 +449,13 @@ def run(dest_dir: str) -> None: paths = PathFinder(current_step_file.as_posix()) # Load latest meadow dataset and keep its metadata. - ds_meadow: catalog.Dataset = paths.load_dependency(dataset_short_name) + ds_meadow = paths.load_dataset(dataset_short_name) # Load main table from dataset. tb_meadow = ds_meadow[dataset_short_name] data = pd.DataFrame(tb_meadow).reset_index() # Load dataset of FAOSTAT metadata. - metadata: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_metadata") + metadata = paths.load_dataset(f"{NAMESPACE}_metadata") # Load dataset, items, element-units, and countries metadata. dataset_metadata = pd.DataFrame(metadata["datasets"]).loc[dataset_short_name].to_dict() @@ -467,6 +466,15 @@ def run(dest_dir: str) -> None: countries_metadata = pd.DataFrame(metadata["countries"]).reset_index() amendments = parse_amendments_table(amendments=metadata["amendments"], dataset_short_name=dataset_short_name) + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + + # Load population dataset. 
+ ds_population = paths.load_dataset("population") + + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") + # # Process data. # @@ -477,6 +485,7 @@ def run(dest_dir: str) -> None: # Prepare data. data = clean_data( data=data, + ds_population=ds_population, items_metadata=items_metadata, elements_metadata=elements_metadata, countries_metadata=countries_metadata, @@ -490,7 +499,13 @@ def run(dest_dir: str) -> None: data = add_slaughtered_animals_to_meat_total(data=data) # Add data for aggregate regions. - data = add_regions(data=data, elements_metadata=elements_metadata) + data = add_regions( + data=data, + ds_regions=ds_regions, + ds_population=ds_population, + ds_income_groups=ds_income_groups, + elements_metadata=elements_metadata, + ) # Add per-capita variables. data = add_per_capita_variables(data=data, elements_metadata=elements_metadata) diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index c397e4e779c..2899b1dc3cc 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -1815,6 +1815,12 @@ def run(dest_dir: str) -> None: # Load population dataset. ds_population = paths.load_dataset("population") + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") + # # Process data. # @@ -1833,7 +1839,13 @@ def run(dest_dir: str) -> None: ) # Add data for aggregate regions. - data = add_regions(data=data, elements_metadata=elements_metadata) + data = add_regions( + data=data, + ds_regions=ds_regions, + ds_population=ds_population, + ds_income_groups=ds_income_groups, + elements_metadata=elements_metadata, + ) # Add per-capita variables. 
data = add_per_capita_variables(data=data, elements_metadata=elements_metadata) From 29663767193128d24d5af37f8ab318cede6fa3be Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 18 Mar 2024 11:36:07 +0100 Subject: [PATCH 18/54] Fix anomaly of spinach area harvested in China --- .../faostat/2024-03-14/detected_anomalies.py | 36 +++++++++++-------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py b/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py index 380c24e2ddc..218db8eca53 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py +++ b/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py @@ -122,7 +122,7 @@ def _split_long_title(text: str) -> str: class SpinachAreaHarvestedAnomaly(DataAnomaly): description = ( # type: ignore - "The area harvested of spinach for China (which refers to mainland) in 1984 is missing. " + "The area harvested of spinach for China (which refers to mainland) in 1984 is zero. " "This causes that other regions that are aggregates which include China mainland have a spurious reduction in " "area harvested of spinach in that year, and a spurious increase in yield. " "Therefore, we remove those spurious aggregate values." @@ -150,20 +150,26 @@ class SpinachAreaHarvestedAnomaly(DataAnomaly): ] def check(self, df): - # Check that the data point is indeed missing. - assert df[ - (df["country"] == "China") - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & (df["year"].isin(self.affected_years)) - ].empty - # For consistency, check that other years do have data for the same item and element. - assert not df[ - (df["country"] == "China") - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & ~(df["year"].isin(self.affected_years)) - ].empty + # Check that the data point is indeed zero. 
+ assert ( + df[ + (df["country"] == "China") + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & (df["year"].isin(self.affected_years)) + ]["value"] + == 0 + ).all() + # For consistency, check that other years do have non-zero data for the same item and element. + assert ( + df[ + (df["country"] == "China") + & (df["item_code"].isin(self.affected_item_codes)) + & (df["element_code"].isin(self.affected_element_codes)) + & ~(df["year"].isin(self.affected_years)) + ]["value"] + > 0 + ).all() def inspect(self, df): log.info( From 91f8f43baba3a42297138ef5474b4e7c0515fa00 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 18 Mar 2024 12:48:23 +0100 Subject: [PATCH 19/54] Fix errors on missing units --- .../data/garden/faostat/2024-03-14/shared.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index 2899b1dc3cc..06bb361dde0 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -856,6 +856,13 @@ def add_custom_names_and_descriptions( # Remove "owid_" from column names. data = data.rename(columns={column: column.replace("owid_", "") for column in data.columns}) + # Fill missing unit and short_unit columns with empty strings. 
+ for column in ["unit", "unit_short_name"]: + missing_unit_mask = data[column].isnull() + if not data[missing_unit_mask].empty: + log.warning(f"Missing {column} for elements: {set(data[missing_unit_mask]['element'])}") + data[column] = data[column].cat.add_categories("").fillna("") + return data @@ -1476,18 +1483,6 @@ def clean_data( } ) - # # Dataset faostat_wcad doesn't have a year column, but a "census_year", which has intervals like "2002-2003" - # if "census_year" in data.columns: - # if data["census_year"].astype(str).str.contains("-.{4}/", regex=True).any(): - # log.warning( - # "Column 'census_year' in dataset 'faostat_wcad' contains values that need to be properly analysed " - # "and processed, e.g. 1976-1977/1980-1981. For the moment, we take the first 4 digits as the year." - # ) - - # # Remove rows that don't have a census year, and take the first 4 digits - # data = data.dropna(subset="census_year").reset_index(drop=True) - # data["year"] = data["census_year"].astype(str).str[0:4].astype(int) - # Ensure year column is integer (sometimes it is given as a range of years, e.g. 2013-2015). 
data["year"] = clean_year_column(data["year"]) From 649303d9446ba3805d6fe24f22ddb2b84c7c039f Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 18 Mar 2024 15:22:55 +0100 Subject: [PATCH 20/54] Minor fixes --- dag/faostat.yml | 2 -- etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/dag/faostat.yml b/dag/faostat.yml index 742aa274ef4..95029874a19 100644 --- a/dag/faostat.yml +++ b/dag/faostat.yml @@ -463,7 +463,6 @@ steps: - data://meadow/faostat/2024-03-14/faostat_rt - data://meadow/faostat/2024-03-14/faostat_scl - data://meadow/faostat/2024-03-14/faostat_el - - data://meadow/faostat/2023-06-12/faostat_gn - data://meadow/faostat/2024-03-14/faostat_sdgb - data://meadow/faostat/2024-03-14/faostat_qv - data://meadow/faostat/2024-03-14/faostat_emn @@ -488,7 +487,6 @@ steps: - data://meadow/faostat/2024-03-14/faostat_cahd - data://meadow/faostat/2024-03-14/faostat_fbs - data://meadow/faostat/2024-03-14/faostat_ti - - data://meadow/faostat/2023-06-12/faostat_wcad - data://meadow/faostat/2024-03-14/faostat_ep data://garden/faostat/2024-03-14/faostat_qcl: - data://garden/faostat/2024-03-14/faostat_metadata diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py index 4990dcf0fa5..ea4d16fe6c3 100644 --- a/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py @@ -6,7 +6,6 @@ """ -from pathlib import Path from typing import Any, Dict, List import pandas as pd @@ -177,7 +176,7 @@ def run(dest_dir: str) -> None: # Load data. # # Fetch the dataset short name from dest_dir. - dataset_short_name = Path(dest_dir).name + dataset_short_name = f"{NAMESPACE}_metadata" # Define path to current step file. 
current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") From d906ba82bcbba5a5db538a2015687781df90fd6f Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 18 Mar 2024 17:44:24 +0100 Subject: [PATCH 21/54] Fix missing metadata in latest faostat_qv, big refactor --- .../faostat/2024-03-14/faostat_metadata.py | 26 ++++++----- .../data/garden/faostat/2024-03-14/shared.py | 45 ++++++++++++------- .../faostat/2024-03-14/faostat_metadata.py | 34 +++++++------- 3 files changed, 62 insertions(+), 43 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py index d0fdab1bb1e..b7faa7bc61a 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py @@ -249,9 +249,13 @@ def create_items_dataframe_for_domain( "item": "fao_item", "description": "fao_item_description", } + _metadata = metadata[f"{dataset_short_name}_item"].reset_index() + if not (set(items_columns) <= set(_metadata.columns)): + # This is the case for faostat_qv since last version. + return items_from_data + _items_df = ( - metadata[f"{dataset_short_name}_item"] - .reset_index()[list(items_columns)] + _metadata[list(items_columns)] .rename(columns=items_columns) .drop_duplicates() .sort_values(list(items_columns.values())) @@ -438,9 +442,13 @@ def create_elements_dataframe_for_domain( "element": "fao_element", "description": "fao_element_description", } + _metadata = metadata[f"{dataset_short_name}_element"].reset_index() + if not (set(elements_columns) <= set(_metadata.columns)): + # This is the case for faostat_qv since last version. 
+ return elements_from_data + _elements_df = ( - metadata[f"{dataset_short_name}_element"] - .reset_index()[list(elements_columns)] + _metadata[list(elements_columns)] .rename(columns=elements_columns) .drop_duplicates() .sort_values(list(elements_columns.values())) @@ -754,7 +762,7 @@ def check_that_flag_definitions_in_dataset_agree_with_those_in_flags_ranking( """ for table_name in metadata.table_names: - if "flag" in table_name: + if ("flag" in table_name) and ("flags" in metadata[table_name].columns): flag_df = metadata[table_name].reset_index() comparison = pd.merge(FLAGS_RANKING, flag_df, on="flag", how="inner") error_message = ( @@ -970,12 +978,10 @@ def process_metadata( countries_in_data = pd.concat([countries_in_data, df]).drop_duplicates() # Get country groups in this dataset. - area_group_table_name = f"{dataset_short_name}_area_group" - if area_group_table_name in metadata: + _metadata = metadata[f"{dataset_short_name}_area_group"].reset_index() + if set(["country_group", "country"]) <= set(_metadata.columns): country_groups = ( - metadata[f"{dataset_short_name}_area_group"] - .reset_index() - .drop_duplicates(subset=["country_group", "country"]) + _metadata.drop_duplicates(subset=["country_group", "country"]) .groupby("country_group") .agg({"country": list}) .to_dict()["country"] diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index 06bb361dde0..9d18c0417d7 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -1489,16 +1489,18 @@ def clean_data( # Remove rows with nan value. data = remove_rows_with_nan_value(data) - # Use custom names for items, elements and units (and keep original names in "fao_*" columns). - data = add_custom_names_and_descriptions(data, items_metadata, elements_metadata) + if len(items_metadata) > 0 and len(elements_metadata) > 0: + # This is not fulfilled for faostat_qv since the last update. 
+ # Use custom names for items, elements and units (and keep original names in "fao_*" columns). + data = add_custom_names_and_descriptions(data, items_metadata, elements_metadata) - # Multiply data values by their corresponding unit factor, if any was given, and then drop unit_factor column. - unit_factor_mask = data["unit_factor"].notnull() - data.loc[unit_factor_mask, "value"] = data[unit_factor_mask]["value"] * data[unit_factor_mask]["unit_factor"] - data = data.drop(columns=["unit_factor"]) + # Multiply data values by their corresponding unit factor, if any was given, and then drop unit_factor column. + unit_factor_mask = data["unit_factor"].notnull() + data.loc[unit_factor_mask, "value"] = data[unit_factor_mask]["value"] * data[unit_factor_mask]["unit_factor"] + data = data.drop(columns=["unit_factor"]) - # Add FAO population as an additional column (if given in the original data). - data = add_fao_population_if_given(data) + # Add FAO population as an additional column (if given in the original data). + data = add_fao_population_if_given(data) # Convert variables that were given per-capita to total value. data = convert_variables_given_per_capita_to_total_value(data, elements_metadata=elements_metadata) @@ -1677,11 +1679,15 @@ def prepare_wide_table(data: pd.DataFrame) -> catalog.Table: lambda item, element, unit: f"{item} - {element} ({unit})", ) - # Construct a human-readable variable description (for the variable metadata). - data["variable_description"] = dataframes.apply_on_categoricals( - [data.item, data.element, data.item_description, data.element_description], - prepare_variable_description, - ) + if "item_description" in data.columns: + # Construct a human-readable variable description (for the variable metadata). 
+ data["variable_description"] = dataframes.apply_on_categoricals( + [data.item, data.element, data.item_description, data.element_description], + prepare_variable_description, + ) + else: + # This is the case for faostat_qv since the last update. + data["variable_description"] = "" # Pivot over long dataframe to generate a wide dataframe with country-year as index, and as many columns as # unique elements in "variable_name" (which should be as many as combinations of item-elements). @@ -1710,10 +1716,15 @@ def prepare_wide_table(data: pd.DataFrame) -> catalog.Table: for column in wide_table.columns: wide_table[column].metadata.unit = variable_name_mapping[column] - # Add variable unit (short name). - variable_name_mapping = _variable_name_map(data, "unit_short_name") - for column in wide_table.columns: - wide_table[column].metadata.short_unit = variable_name_mapping[column] + if "unit_short_name" in data.columns: + # Add variable unit (short name). + variable_name_mapping = _variable_name_map(data, "unit_short_name") + for column in wide_table.columns: + wide_table[column].metadata.short_unit = variable_name_mapping[column] + else: + # This is the case for faostat_qv since the last update. + for column in wide_table.columns: + wide_table[column].metadata.short_unit = "" # Add variable description. 
variable_name_mapping = _variable_name_map(data, "variable_description") diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py index ea4d16fe6c3..ecaab7a72a0 100644 --- a/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py @@ -151,22 +151,24 @@ def create_tables_for_all_domain_records(additional_metadata: Dict[str, Any]) -> for category in list(additional_metadata[domain]): json_data = additional_metadata[domain][category]["data"] df = pd.DataFrame.from_dict(json_data) - if len(df) > 0: - df.set_index( - category_structure[category]["index"], - verify_integrity=True, - inplace=True, - ) - table_short_name = f'{NAMESPACE}_{domain.lower()}_{category_structure[category]["short_name"]}' - - # there might be duplicates coming from `itemsgroup` and `itemgroup` - if table_short_name in used_short_names: - log.warning("faostat_metadata.duplicate_short_name", short_name=table_short_name) - continue - used_short_names.add(table_short_name) - - table = Table(df, short_name=table_short_name) - tables.append(table) + if len(df) == 0: + # This is the case for lc flag, rfb flag, scl itemfactor, and, since last version, all qv categories. 
+ df = df.assign(**{col: [] for col in category_structure[category]["index"]}) + df.set_index( + category_structure[category]["index"], + verify_integrity=True, + inplace=True, + ) + table_short_name = f'{NAMESPACE}_{domain.lower()}_{category_structure[category]["short_name"]}' + + # there might be duplicates coming from `itemsgroup` and `itemgroup` + if table_short_name in used_short_names: + log.warning("faostat_metadata.duplicate_short_name", short_name=table_short_name) + continue + used_short_names.add(table_short_name) + + table = Table(df, short_name=table_short_name) + tables.append(table) return tables From a1092067e49ee7efa63ff98cfbbba1e1ba95d81a Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 19 Mar 2024 09:39:57 +0100 Subject: [PATCH 22/54] Fix changed item in qcl and global food explorer --- dag/faostat.yml | 5 + docs/data/faostat.md | 10 +- .../explorers/faostat/latest/food_explorer.py | 117 ++++++++++++++++++ .../faostat/2024-03-14/custom_items.csv | 2 +- .../2024-03-14/faostat_food_explorer.py | 21 ++-- .../data/garden/faostat/2024-03-14/shared.py | 4 + 6 files changed, 144 insertions(+), 15 deletions(-) create mode 100644 etl/steps/data/explorers/faostat/latest/food_explorer.py diff --git a/dag/faostat.yml b/dag/faostat.yml index 95029874a19..ac9d4bff484 100644 --- a/dag/faostat.yml +++ b/dag/faostat.yml @@ -560,3 +560,8 @@ steps: - data://garden/regions/2023-01-01/regions - data://meadow/faostat/2024-03-14/faostat_ti - data://garden/wb/2024-03-11/income_groups + # + # FAOSTAT food explorer step + # + data://explorers/faostat/latest/food_explorer: + - data://garden/faostat/2024-03-14/faostat_food_explorer diff --git a/docs/data/faostat.md b/docs/data/faostat.md index 0eb222d43d0..44b3d5cd36d 100644 --- a/docs/data/faostat.md +++ b/docs/data/faostat.md @@ -297,18 +297,20 @@ version) for each dataset, to replace variables of a dataset from its second lat 10. 
Use OWID's internal approval tool to visually inspect changes between the old and new versions of updated charts, and accept or reject changes. -11. Create a new explorers step. For the moment, this has to be done manually: - * Duplicate the latest step in `etl/etl/steps/data/explorers/faostat/` and use the current date as the new version. - * Duplicate entry of explorers step in the dag, and replace versions (of the step itself and its dependencies) by the corresponding latest versions. +11. Update the explorers step `data://explorers/faostat/latest/food_explorer` (for the moment, this has to be done manually): Edit the version of its only dependency in the dag, so that it loads the latest garden step. It should be `data://garden/faostat/YYYY-MM-DD/faostat_food_explorer`. 12. Run the new etl explorers step, to generate the csv files for the global food explorer. ```bash - etl run explorers/faostat/YYYY-MM-DD/food_explorer + etl run explorers/faostat/latest/food_explorer ``` Run internal sanity checks on the generated files. + !!! note + + Sometimes items change in FAOSTAT. If that's the case, you may need to edit a file in the `owid-content` repository, namely `scripts/global-food-explorer/foods.csv`. Then, follow the instructions in `scripts/global-food-explorer/README.md`. + 13. Manually create a new garden dataset of additional variables `additional_variables` for the new version, and update its metadata. Then create a new grapher dataset too. Manually update all other datasets that use any faostat dataset as a dependency. !!! note diff --git a/etl/steps/data/explorers/faostat/latest/food_explorer.py b/etl/steps/data/explorers/faostat/latest/food_explorer.py new file mode 100644 index 00000000000..374282bdc04 --- /dev/null +++ b/etl/steps/data/explorers/faostat/latest/food_explorer.py @@ -0,0 +1,117 @@ +"""Food explorer data step. + +Loads the faostat_food_explorer dataset from garden and stores a table (as a csv file) for each food product. 
+ +""" + +import sys +from typing import List + +from owid.catalog import Table, utils +from tqdm.auto import tqdm + +from etl.helpers import PathFinder, create_dataset + +paths = PathFinder(__file__) + +# Rename columns to be used by the food explorer. +# Note: Include here all columns, even if the name is not changed. +EXPECTED_COLUMNS = { + "population": "population", + "area_harvested__hectares": "area_harvested__ha", + "area_harvested__hectares_per_capita": "area_harvested__ha__per_capita", + "domestic_supply__tonnes": "domestic_supply__tonnes", + "domestic_supply__tonnes_per_capita": "domestic_supply__tonnes__per_capita", + "exports__tonnes": "exports__tonnes", + "exports__tonnes_per_capita": "exports__tonnes__per_capita", + "feed__tonnes": "feed__tonnes", + "feed__tonnes_per_capita": "feed__tonnes__per_capita", + "food__tonnes": "food__tonnes", + "food__tonnes_per_capita": "food__tonnes__per_capita", + "food_available_for_consumption__grams_of_fat_per_day_per_capita": "food_available_for_consumption__fat_g_per_day__per_capita", + "food_available_for_consumption__kilocalories_per_day_per_capita": "food_available_for_consumption__kcal_per_day__per_capita", + "food_available_for_consumption__kilograms_per_year_per_capita": "food_available_for_consumption__kg_per_year__per_capita", + "food_available_for_consumption__grams_of_protein_per_day_per_capita": "food_available_for_consumption__protein_g_per_day__per_capita", + "imports__tonnes": "imports__tonnes", + "imports__tonnes_per_capita": "imports__tonnes__per_capita", + "other_uses__tonnes": "other_uses__tonnes", + "other_uses__tonnes_per_capita": "other_uses__tonnes__per_capita", + "producing_or_slaughtered_animals__animals": "producing_or_slaughtered_animals__animals", + "producing_or_slaughtered_animals__animals_per_capita": "producing_or_slaughtered_animals__animals__per_capita", + "production__tonnes": "production__tonnes", + "production__tonnes_per_capita": "production__tonnes__per_capita", + 
"waste_in_supply_chain__tonnes": "waste_in_supply_chain__tonnes", + "waste_in_supply_chain__tonnes_per_capita": "waste_in_supply_chain__tonnes__per_capita", + "yield__kilograms_per_animal": "yield__kg_per_animal", + "yield__tonnes_per_hectare": "yield__tonnes_per_ha", +} + + +def create_table_for_each_product(tb_garden: Table) -> List[Table]: + """Create a list of tables, one for each product found in a garden table. + + Parameters + ---------- + tb_garden : Table + Table of products from garden dataset. + + Returns + ------- + tables : List[Table] + List of tables, one for each product. + + """ + # List all products in table + products = sorted(tb_garden.index.get_level_values("product").unique().tolist()) + + tables = [] + for product in tqdm(products, file=sys.stdout): + # Save a table for each food product. + table_product = tb_garden.loc[product].copy() + + # Update table metadata. + table_product.title = product + + # Rename columns, select the required ones, and sort columns and rows conveniently. + table_product = table_product[list(EXPECTED_COLUMNS)].rename(columns=EXPECTED_COLUMNS) + table_product = table_product[ + ["population"] + [column for column in sorted(table_product.columns) if column not in ["population"]] + ] + table_product = table_product.sort_index() + + table_product.metadata.short_name = ( + utils.underscore(name=product, validate=True).replace("__", "_").replace("_e_g_", "_eg_") + ) + + # Add table to list of all tables to include in the explorers dataset. + tables.append(table_product) + + return tables + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load the dataset for FAOSTAT food explorer from garden. + ds_garden = paths.load_dataset("faostat_food_explorer") + + # Get the table of all food products. + tb_garden = ds_garden["faostat_food_explorer"] + + # + # Process data. + # + tables = create_table_for_each_product(tb_garden=tb_garden) + + # + # Save outputs. + # + # Initialize new explorers dataset. 
+ ds_explorers = create_dataset( + dest_dir=dest_dir, tables=tables, default_metadata=ds_garden.metadata, formats=["csv"] + ) + ds_explorers.metadata.short_name = "food_explorer" + + # Create new explorers dataset. + ds_explorers.save() diff --git a/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv b/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv index e870867288b..a0e6a57cb06 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv +++ b/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv @@ -1,5 +1,5 @@ dataset,item_code,fao_item,owid_item,fao_item_description,owid_item_description -faostat_qcl,00000773,,Flax fibre,, +faostat_qcl,00000771,,"Flax, raw or retted",, faostat_qcl,00000221,"Almonds, in shell",Almonds,"Almonds, in shell This subclass is defined through the following headings/subheadings of the HS 2007: 0802.11.", faostat_fbsc,00002946,Animal fats,Animal fats group,, faostat_qcl,00000711,"Anise, badian, coriander, cumin, caraway, fennel and juniper berries, raw",Herbs (e.g. fennel),"Anise, badian, coriander, cumin, caraway, fennel and juniper berries, raw This subclass includes: - aniseed, Pimpinella anisum, raw - star anise (badian) or Chinese star anise, Illicium verum, raw - fennel, Foeniculum vulgare, raw (when used as spice) - coriander (cilantro), Coriandrum sativum, raw - cumin, Cuminum cyminum, raw - caraway seeds, Carum carvi, raw - juniper berries, Juniperus communis, raw This subclass does not include: - fennel (when used as a vegetable), cf. 01290 - processed anise, fennel, coriander, cumin, caraway and juniper berries, cf. 
23924", diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py index b620f241446..dfbf6f9ccbe 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py @@ -14,7 +14,13 @@ import pandas as pd from owid import catalog from owid.datautils import dataframes -from shared import CURRENT_DIR, NAMESPACE +from shared import ( + CURRENT_DIR, + FAO_POPULATION_ELEMENT_NAME, + FAO_POPULATION_ITEM_NAME, + FAO_POPULATION_UNIT_NAME, + NAMESPACE, +) from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset @@ -79,7 +85,7 @@ "00000258", # From faostat_qcl - 'Palm kernel oil' (previously 'Palm kernel oil'). "00000156", # From faostat_qcl - 'Sugar cane' (previously 'Sugar cane'). "00000373", # From faostat_qcl - 'Spinach' (previously 'Spinach'). - "00000773", # From faostat_qcl - 'Flax fibre' (previously 'Flax fibre'). + "00000771", # From faostat_qcl - 'Flax, raw or retted' (previously 'Flax fibre'). "00000116", # From faostat_qcl - 'Potatoes' (previously 'Potatoes'). "00000869", # From faostat_qcl - 'Cattle fat, unrendered' (previously 'Fat, cattle'). "00000358", # From faostat_qcl - 'Cabbages' (previously 'Cabbages'). @@ -284,11 +290,6 @@ "00002908", # From faostat_fbsc - 'Sugar crops' (previously 'Sugar crops'). ] -# OWID item name, element name, and unit name for population (as given in faostat_qcl and faostat_fbsc datasets). -FAO_POPULATION_ITEM_NAME = "Population" -FAO_POPULATION_ELEMENT_NAME = "Total Population - Both sexes" -FAO_POPULATION_UNIT = "1000 persons" - # List of element codes to consider from faostat_qcl. ELEMENT_CODES_QCL = [ "005312", @@ -439,7 +440,7 @@ def get_fao_population(combined: pd.DataFrame) -> pd.DataFrame: # Check that population is given in "1000 persons" and convert to persons. error = "FAOSTAT population changed item, element, or unit." 
- assert list(fao_population["unit"].unique()) == [FAO_POPULATION_UNIT], error + assert list(fao_population["unit"].unique()) == [FAO_POPULATION_UNIT_NAME], error fao_population["value"] *= 1000 # Drop missing values and prepare output dataframe. @@ -521,8 +522,8 @@ def run(dest_dir: str) -> None: paths = PathFinder(current_step_file.as_posix()) # Load latest qcl and fbsc datasets from garden. - qcl_dataset: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_qcl") - fbsc_dataset: catalog.Dataset = paths.load_dependency(f"{NAMESPACE}_fbsc") + qcl_dataset = paths.load_dataset(f"{NAMESPACE}_qcl") + fbsc_dataset = paths.load_dataset(f"{NAMESPACE}_fbsc") # Get main long tables from qcl and fbsc datasets. qcl_table = qcl_dataset[f"{NAMESPACE}_qcl"] diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index 9d18c0417d7..36d19f04da5 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -316,6 +316,10 @@ # Additional text to include in the metadata title of the output wide table. ADDED_TITLE_TO_WIDE_TABLE = " - Flattened table indexed by country-year." +# Name of item, element and unit of FAO population (used to select population in the data). +FAO_POPULATION_ITEM_NAME = "Population" +FAO_POPULATION_ELEMENT_NAME = "Total Population - Both sexes" +FAO_POPULATION_UNIT_NAME = "thousand Number" # Shared functions. 
From e4cc4d7b6a0622efd30a0b22d7d52c04db752b68 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 19 Mar 2024 11:15:12 +0100 Subject: [PATCH 23/54] Add grapher steps (WIP) --- dag/faostat.yml | 49 +++++++++++++++++++ .../faostat/2024-03-14/faostat_cahd.py | 2 + .../grapher/faostat/2024-03-14/faostat_ei.py | 2 + .../grapher/faostat/2024-03-14/faostat_ek.py | 2 + .../grapher/faostat/2024-03-14/faostat_emn.py | 2 + .../grapher/faostat/2024-03-14/faostat_esb.py | 2 + .../grapher/faostat/2024-03-14/faostat_fa.py | 2 + .../faostat/2024-03-14/faostat_fbsc.py | 2 + .../grapher/faostat/2024-03-14/faostat_fo.py | 2 + .../grapher/faostat/2024-03-14/faostat_fs.py | 2 + .../grapher/faostat/2024-03-14/faostat_ic.py | 2 + .../grapher/faostat/2024-03-14/faostat_lc.py | 2 + .../grapher/faostat/2024-03-14/faostat_qcl.py | 2 + .../grapher/faostat/2024-03-14/faostat_qi.py | 2 + .../grapher/faostat/2024-03-14/faostat_qv.py | 2 + .../grapher/faostat/2024-03-14/faostat_rfb.py | 2 + .../grapher/faostat/2024-03-14/faostat_rfn.py | 2 + .../grapher/faostat/2024-03-14/faostat_rl.py | 2 + .../grapher/faostat/2024-03-14/faostat_rp.py | 2 + .../grapher/faostat/2024-03-14/faostat_rt.py | 2 + .../grapher/faostat/2024-03-14/faostat_scl.py | 2 + .../faostat/2024-03-14/faostat_sdgb.py | 2 + .../grapher/faostat/2024-03-14/faostat_tcl.py | 2 + .../grapher/faostat/2024-03-14/faostat_ti.py | 2 + .../data/grapher/faostat/2024-03-14/shared.py | 43 ++++++++++++++++ 25 files changed, 138 insertions(+) create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_cahd.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_ei.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_ek.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_emn.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_esb.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_fa.py create mode 100644 
etl/steps/data/grapher/faostat/2024-03-14/faostat_fbsc.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_fo.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_fs.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_ic.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_lc.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_qcl.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_qi.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_qv.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_rfb.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_rfn.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_rl.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_rp.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_rt.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_scl.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_sdgb.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_tcl.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/faostat_ti.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/shared.py diff --git a/dag/faostat.yml b/dag/faostat.yml index ac9d4bff484..c2d01acbc18 100644 --- a/dag/faostat.yml +++ b/dag/faostat.yml @@ -561,6 +561,55 @@ steps: - data://meadow/faostat/2024-03-14/faostat_ti - data://garden/wb/2024-03-11/income_groups # + # FAOSTAT grapher steps for version 2024-03-14 + # + data://grapher/faostat/2024-03-14/faostat_cahd: + - data://garden/faostat/2024-03-14/faostat_cahd + data://grapher/faostat/2024-03-14/faostat_ei: + - data://garden/faostat/2024-03-14/faostat_ei + data://grapher/faostat/2024-03-14/faostat_ek: + - data://garden/faostat/2024-03-14/faostat_ek + data://grapher/faostat/2024-03-14/faostat_emn: + - 
data://garden/faostat/2024-03-14/faostat_emn + data://grapher/faostat/2024-03-14/faostat_esb: + - data://garden/faostat/2024-03-14/faostat_esb + data://grapher/faostat/2024-03-14/faostat_fa: + - data://garden/faostat/2024-03-14/faostat_fa + data://grapher/faostat/2024-03-14/faostat_fbsc: + - data://garden/faostat/2024-03-14/faostat_fbsc + data://grapher/faostat/2024-03-14/faostat_fo: + - data://garden/faostat/2024-03-14/faostat_fo + data://grapher/faostat/2024-03-14/faostat_fs: + - data://garden/faostat/2024-03-14/faostat_fs + data://grapher/faostat/2024-03-14/faostat_ic: + - data://garden/faostat/2024-03-14/faostat_ic + data://grapher/faostat/2024-03-14/faostat_lc: + - data://garden/faostat/2024-03-14/faostat_lc + data://grapher/faostat/2024-03-14/faostat_qcl: + - data://garden/faostat/2024-03-14/faostat_qcl + data://grapher/faostat/2024-03-14/faostat_qi: + - data://garden/faostat/2024-03-14/faostat_qi + data://grapher/faostat/2024-03-14/faostat_qv: + - data://garden/faostat/2024-03-14/faostat_qv + data://grapher/faostat/2024-03-14/faostat_rfb: + - data://garden/faostat/2024-03-14/faostat_rfb + data://grapher/faostat/2024-03-14/faostat_rfn: + - data://garden/faostat/2024-03-14/faostat_rfn + data://grapher/faostat/2024-03-14/faostat_rl: + - data://garden/faostat/2024-03-14/faostat_rl + data://grapher/faostat/2024-03-14/faostat_rp: + - data://garden/faostat/2024-03-14/faostat_rp + data://grapher/faostat/2024-03-14/faostat_rt: + - data://garden/faostat/2024-03-14/faostat_rt + data://grapher/faostat/2024-03-14/faostat_scl: + - data://garden/faostat/2024-03-14/faostat_scl + data://grapher/faostat/2024-03-14/faostat_sdgb: + - data://garden/faostat/2024-03-14/faostat_sdgb + data://grapher/faostat/2024-03-14/faostat_tcl: + - data://garden/faostat/2024-03-14/faostat_tcl + data://grapher/faostat/2024-03-14/faostat_ti: + - data://garden/faostat/2024-03-14/faostat_ti + # # FAOSTAT food explorer step # data://explorers/faostat/latest/food_explorer: diff --git 
a/etl/steps/data/grapher/faostat/2024-03-14/faostat_cahd.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_cahd.py new file mode 100644 index 00000000000..eb6916c2187 --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_cahd.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_cahd dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_ei.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_ei.py new file mode 100644 index 00000000000..d5ca840e309 --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_ei.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_ei dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_ek.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_ek.py new file mode 100644 index 00000000000..1438bef60af --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_ek.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_ek dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_emn.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_emn.py new file mode 100644 index 00000000000..a4ec711b24f --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_emn.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_emn dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_esb.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_esb.py new file mode 100644 index 00000000000..9443efd4c2f --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_esb.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_esb dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_fa.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_fa.py new file mode 100644 index 00000000000..68c503b33fb --- /dev/null 
+++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_fa.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_fa dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_fbsc.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_fbsc.py new file mode 100644 index 00000000000..a96693ea59e --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_fbsc.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_fbsc dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_fo.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_fo.py new file mode 100644 index 00000000000..52d47d9693b --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_fo.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_fo dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_fs.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_fs.py new file mode 100644 index 00000000000..9ac98d46d8b --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_fs.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_fs dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_ic.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_ic.py new file mode 100644 index 00000000000..3bb8b297f9b --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_ic.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_ic dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_lc.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_lc.py new file mode 100644 index 00000000000..9e55fe697eb --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_lc.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_lc dataset.""" +from .shared import run # noqa:F401 diff --git 
a/etl/steps/data/grapher/faostat/2024-03-14/faostat_qcl.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_qcl.py new file mode 100644 index 00000000000..17ea29863b0 --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_qcl.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_qcl dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_qi.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_qi.py new file mode 100644 index 00000000000..ec1e351be6d --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_qi.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_qi dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_qv.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_qv.py new file mode 100644 index 00000000000..a8ad501a473 --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_qv.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_qv dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_rfb.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_rfb.py new file mode 100644 index 00000000000..9203ba8a494 --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_rfb.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_rfb dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_rfn.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_rfn.py new file mode 100644 index 00000000000..006af8bb6ce --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_rfn.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_rfn dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_rl.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_rl.py new file mode 100644 index 00000000000..95550785095 --- /dev/null +++ 
b/etl/steps/data/grapher/faostat/2024-03-14/faostat_rl.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_rl dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_rp.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_rp.py new file mode 100644 index 00000000000..b552b8f0035 --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_rp.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_rp dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_rt.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_rt.py new file mode 100644 index 00000000000..709d69ac2d5 --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_rt.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_rt dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_scl.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_scl.py new file mode 100644 index 00000000000..95725e189c9 --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_scl.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_scl dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_sdgb.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_sdgb.py new file mode 100644 index 00000000000..7e0187e34ba --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_sdgb.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_sdgb dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/faostat_tcl.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_tcl.py new file mode 100644 index 00000000000..0babbf0b2e3 --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_tcl.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_tcl dataset.""" +from .shared import run # noqa:F401 diff --git 
a/etl/steps/data/grapher/faostat/2024-03-14/faostat_ti.py b/etl/steps/data/grapher/faostat/2024-03-14/faostat_ti.py new file mode 100644 index 00000000000..94634f7e505 --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/faostat_ti.py @@ -0,0 +1,2 @@ +"""FAOSTAT grapher step for faostat_ti dataset.""" +from .shared import run # noqa:F401 diff --git a/etl/steps/data/grapher/faostat/2024-03-14/shared.py b/etl/steps/data/grapher/faostat/2024-03-14/shared.py new file mode 100644 index 00000000000..95b623b6fff --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/shared.py @@ -0,0 +1,43 @@ +"""Common grapher step for all FAOSTAT domains. + +""" +from pathlib import Path + +from etl.helpers import PathFinder, create_dataset + +# Define path to current folder, namespace and version of all datasets in this folder. +CURRENT_DIR = Path(__file__).parent +VERSION = CURRENT_DIR.name + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Fetch the dataset short name from dest_dir. + dataset_short_name = Path(dest_dir).name + + # Define path to current step file. + current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") + + # Get paths and naming conventions for current data step. + paths = PathFinder(current_step_file.as_posix()) + + # Load latest garden dataset. + ds_garden = paths.load_dataset(dataset_short_name) + + # Load wide table from dataset. + tb_garden = ds_garden[f"{dataset_short_name}_flat"] + + # + # Process data. + # + # Remove unnecessary columns. + tb_garden = tb_garden.drop(columns="area_code") + + # + # Save outputs. + # + # Create a new grapher dataset. 
+ ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) + ds_grapher.save() From 3da102e742c4099563d2356e87832b4d59a46563 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 19 Mar 2024 12:09:28 +0100 Subject: [PATCH 24/54] Use tables instead of dataframes --- .../garden/faostat/2024-03-14/faostat_fbsc.py | 18 +- .../faostat/2024-03-14/faostat_metadata.py | 8 +- .../garden/faostat/2024-03-14/faostat_qcl.py | 14 +- .../data/garden/faostat/2024-03-14/shared.py | 653 +++++++++--------- 4 files changed, 343 insertions(+), 350 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py index 90e1e429c02..e82fff42ee8 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py @@ -75,10 +75,10 @@ def combine_fbsh_and_fbs_datasets( fbs = pd.DataFrame(fbs_dataset["faostat_fbs"]).reset_index() # Harmonize items and elements in both datasets. - fbsh = harmonize_items(df=fbsh, dataset_short_name="faostat_fbsh") - fbsh = harmonize_elements(df=fbsh, dataset_short_name="faostat_fbsh") - fbs = harmonize_items(df=fbs, dataset_short_name="faostat_fbs") - fbs = harmonize_elements(df=fbs, dataset_short_name="faostat_fbs") + fbsh = harmonize_items(tb=fbsh, dataset_short_name="faostat_fbsh") + fbsh = harmonize_elements(tb=fbsh, dataset_short_name="faostat_fbsh") + fbs = harmonize_items(tb=fbs, dataset_short_name="faostat_fbs") + fbs = harmonize_elements(tb=fbs, dataset_short_name="faostat_fbs") # Ensure there is no overlap in data between the two datasets, and that there is no gap between them. assert fbs["year"].min() == FBS_FIRST_YEAR, f"First year of fbs dataset is not {FBS_FIRST_YEAR}" @@ -177,7 +177,7 @@ def run(dest_dir: str) -> None: # Prepare data. 
data = clean_data( - data=data, + tb=data, ds_population=ds_population, items_metadata=items_metadata, elements_metadata=elements_metadata, @@ -187,7 +187,7 @@ def run(dest_dir: str) -> None: # Add data for aggregate regions. data = add_regions( - data=data, + tb=data, ds_regions=ds_regions, ds_income_groups=ds_income_groups, ds_population=ds_population, @@ -195,7 +195,7 @@ def run(dest_dir: str) -> None: ) # Add per-capita variables. - data = add_per_capita_variables(data=data, elements_metadata=elements_metadata) + data = add_per_capita_variables(tb=data, elements_metadata=elements_metadata) # Handle detected anomalies in the data. data, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, data=data) @@ -208,13 +208,13 @@ def run(dest_dir: str) -> None: # Create a long table (with item code and element code as part of the index). log.info("faostat_fbsc.prepare_long_table", shape=data.shape) - data_table_long = prepare_long_table(data=data) + data_table_long = prepare_long_table(tb=data) _assert_df_size(data_table_long, 2000) # Create a wide table (with only country and year as index). log.info("faostat_fbsc.prepare_wide_table", shape=data.shape) - data_table_wide = prepare_wide_table(data=data) + data_table_wide = prepare_wide_table(tb=data) # # Save outputs. diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py index b7faa7bc61a..59f415471c8 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py @@ -241,7 +241,7 @@ def create_items_dataframe_for_domain( data=df, dataset_short_name=dataset_short_name, category="item" ) # Ensure items are well constructed and amend already known issues (defined in shared.ITEM_AMENDMENTS). 
- items_from_data = harmonize_items(df=items_from_data, dataset_short_name=dataset_short_name, item_col="fao_item") + items_from_data = harmonize_items(tb=items_from_data, dataset_short_name=dataset_short_name, item_col="fao_item") # Load items from metadata. items_columns = { @@ -261,7 +261,7 @@ def create_items_dataframe_for_domain( .sort_values(list(items_columns.values())) .reset_index(drop=True) ) - _items_df = harmonize_items(df=_items_df, dataset_short_name=dataset_short_name, item_col="fao_item") + _items_df = harmonize_items(tb=_items_df, dataset_short_name=dataset_short_name, item_col="fao_item") _items_df["fao_item_description"] = _items_df["fao_item_description"].astype("string") # Add descriptions (from metadata) to items (from data). @@ -430,7 +430,7 @@ def create_elements_dataframe_for_domain( ) # Ensure element_code is always a string of a fix number of characters. elements_from_data = harmonize_elements( - df=elements_from_data, + tb=elements_from_data, dataset_short_name=dataset_short_name, element_col="fao_element", unit_col="fao_unit_short_name", @@ -455,7 +455,7 @@ def create_elements_dataframe_for_domain( .reset_index(drop=True) ) _elements_df = harmonize_elements( - df=_elements_df, dataset_short_name=dataset_short_name, element_col="fao_element", unit_col=None + tb=_elements_df, dataset_short_name=dataset_short_name, element_col="fao_element", unit_col=None ) _elements_df["fao_element_description"] = _elements_df["fao_element_description"].astype("string") diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py index a760a09a8cd..537c4bbba97 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py @@ -479,12 +479,12 @@ def run(dest_dir: str) -> None: # Process data. # # Harmonize items and elements, and clean data. 
- data = harmonize_items(df=data, dataset_short_name=dataset_short_name) - data = harmonize_elements(df=data, dataset_short_name=dataset_short_name) + data = harmonize_items(tb=data, dataset_short_name=dataset_short_name) + data = harmonize_elements(tb=data, dataset_short_name=dataset_short_name) # Prepare data. data = clean_data( - data=data, + tb=data, ds_population=ds_population, items_metadata=items_metadata, elements_metadata=elements_metadata, @@ -500,7 +500,7 @@ def run(dest_dir: str) -> None: # Add data for aggregate regions. data = add_regions( - data=data, + tb=data, ds_regions=ds_regions, ds_population=ds_population, ds_income_groups=ds_income_groups, @@ -508,7 +508,7 @@ def run(dest_dir: str) -> None: ) # Add per-capita variables. - data = add_per_capita_variables(data=data, elements_metadata=elements_metadata) + data = add_per_capita_variables(tb=data, elements_metadata=elements_metadata) # Add yield (production per area) to aggregate regions. data = add_yield_to_aggregate_regions(data) @@ -517,10 +517,10 @@ def run(dest_dir: str) -> None: data, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, data=data) # Create a long table (with item code and element code as part of the index). - data_table_long = prepare_long_table(data=data) + data_table_long = prepare_long_table(tb=data) # Create a wide table (with only country and year as index). - data_table_wide = prepare_wide_table(data=data) + data_table_wide = prepare_wide_table(tb=data) # # Save outputs. 
diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index 36d19f04da5..11a5d7efb68 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -13,13 +13,17 @@ import json import sys from pathlib import Path -from typing import Dict, List, Optional, cast +from typing import Dict, List, Optional import numpy as np +import owid.catalog.processing as pr import pandas as pd import structlog from detected_anomalies import handle_anomalies -from owid import catalog, repack # type: ignore +from owid import repack # type: ignore +from owid.catalog import Dataset, Table, Variable, VariablePresentationMeta +from owid.catalog.tables import read_from_records +from owid.catalog.utils import underscore from owid.datautils import dataframes from tqdm.auto import tqdm @@ -272,7 +276,7 @@ FLAG_MULTIPLE_FLAGS = "multiple_flags" # Rank flags by priority (where lowest index is highest priority). FLAGS_RANKING = ( - pd.DataFrame.from_records( + read_from_records( columns=["flag", "description"], data=[ # FAO uses nan flag for official data; in our datasets we will replace nans by FLAG_OFFICIAL_DATA. @@ -324,19 +328,19 @@ # Shared functions. -def check_that_countries_are_well_defined(data: pd.DataFrame) -> None: +def check_that_countries_are_well_defined(tb: Table) -> None: """Apply sanity checks related to the definition of countries. Parameters ---------- - data : pd.DataFrame + tb : Table Data, right after harmonizing country names. """ # Ensure area codes and countries are well defined, and no ambiguities were introduced when mapping country names. 
- n_countries_per_area_code = data.groupby("area_code")["country"].transform("nunique") + n_countries_per_area_code = tb.groupby("area_code")["country"].transform("nunique") ambiguous_area_codes = ( - data.loc[n_countries_per_area_code > 1][["area_code", "country"]] + tb.loc[n_countries_per_area_code > 1][["area_code", "country"]] .drop_duplicates() .set_index("area_code")["country"] .to_dict() @@ -346,9 +350,9 @@ def check_that_countries_are_well_defined(data: pd.DataFrame) -> None: f"Redefine countries file for:\n{ambiguous_area_codes}." ) assert len(ambiguous_area_codes) == 0, error - n_area_codes_per_country = data.groupby("country")["area_code"].transform("nunique") + n_area_codes_per_country = tb.groupby("country")["area_code"].transform("nunique") ambiguous_countries = ( - data.loc[n_area_codes_per_country > 1][["area_code", "country"]] + tb.loc[n_area_codes_per_country > 1][["area_code", "country"]] .drop_duplicates() .set_index("area_code")["country"] .to_dict() @@ -361,14 +365,14 @@ def check_that_countries_are_well_defined(data: pd.DataFrame) -> None: def check_that_regions_with_subregions_are_ignored_when_constructing_aggregates( - countries_metadata: pd.DataFrame, + countries_metadata: Table, ) -> None: """Check that regions that contain subregions are ignored when constructing region aggregates, to avoid double-counting those subregions. Parameters ---------- - countries_metadata : pd.DataFrame + countries_metadata : Table Table 'countries' from garden faostat_metadata dataset. 
""" @@ -393,13 +397,13 @@ def check_that_regions_with_subregions_are_ignored_when_constructing_aggregates( assert len(countries_with_subregions) == 0, error -def harmonize_items(df: pd.DataFrame, dataset_short_name: str, item_col: str = "item") -> pd.DataFrame: +def harmonize_items(tb: Table, dataset_short_name: str, item_col: str = "item") -> Table: """Harmonize item codes (by ensuring they are strings of numbers with a fixed length, prepended with zeros), make amendments to faulty items, and make item codes and items of categorical dtype. Parameters ---------- - df : pd.DataFrame + tb : Table Data before harmonizing item codes. dataset_short_name : str Dataset short name. @@ -408,11 +412,11 @@ def harmonize_items(df: pd.DataFrame, dataset_short_name: str, item_col: str = " Returns ------- - df : pd.DataFrame + tb : Table Data after harmonizing item codes. """ - df = df.copy() + tb = tb.copy() # Set the maximum number of characters for item_code. if dataset_short_name == f"{NAMESPACE}_sdgb": @@ -421,42 +425,42 @@ def harmonize_items(df: pd.DataFrame, dataset_short_name: str, item_col: str = " n_characters_item_code = N_CHARACTERS_ITEM_CODE # Note: Here list comprehension is faster than doing .astype(str).str.zfill(...). - df["item_code"] = [str(item_code).zfill(n_characters_item_code) for item_code in df["item_code"]] + tb["item_code"] = [str(item_code).zfill(n_characters_item_code) for item_code in tb["item_code"]] # Convert both columns to category to reduce memory. - df = df.astype({"item_code": "category", item_col: "category"}) + tb = tb.astype({"item_code": "category", item_col: "category"}) # Fix those few cases where there is more than one item per item code within a given dataset. if dataset_short_name in ITEM_AMENDMENTS: for amendment in ITEM_AMENDMENTS[dataset_short_name]: # Ensure new item code and item name are added as categories, to avoid errors. 
- if amendment["new_item_code"] not in df["item_code"].cat.categories: - df["item_code"] = df["item_code"].cat.add_categories(amendment["new_item_code"]) - if amendment["new_fao_item"] not in df[item_col].cat.categories: - df[item_col] = df[item_col].cat.add_categories(amendment["new_fao_item"]) + if amendment["new_item_code"] not in tb["item_code"].cat.categories: + tb["item_code"] = tb["item_code"].cat.add_categories(amendment["new_item_code"]) + if amendment["new_fao_item"] not in tb[item_col].cat.categories: + tb[item_col] = tb[item_col].cat.add_categories(amendment["new_fao_item"]) # Update item code and item name. - df.loc[ - (df["item_code"] == amendment["item_code"]) & (df[item_col] == amendment["fao_item"]), + tb.loc[ + (tb["item_code"] == amendment["item_code"]) & (tb[item_col] == amendment["fao_item"]), ("item_code", item_col), ] = (amendment["new_item_code"], amendment["new_fao_item"]) # Remove unused categories. - df["item_code"] = df["item_code"].cat.remove_unused_categories() - df[item_col] = df[item_col].cat.remove_unused_categories() + tb["item_code"] = tb["item_code"].cat.remove_unused_categories() + tb[item_col] = tb[item_col].cat.remove_unused_categories() - return df + return tb def harmonize_elements( - df: pd.DataFrame, dataset_short_name: str, element_col: str = "element", unit_col: Optional[str] = "unit" -) -> pd.DataFrame: + tb: Table, dataset_short_name: str, element_col: str = "element", unit_col: Optional[str] = "unit" +) -> Table: """Harmonize element codes (by ensuring they are strings of numbers with a fixed length, prepended with zeros), and make element codes and elements of categorical dtype. Parameters ---------- - df : pd.DataFrame + tb : Table Data before harmonizing element codes. dataset_short_name : str Dataset short name. @@ -465,44 +469,44 @@ def harmonize_elements( Returns ------- - df : pd.DataFrame + tb : Table Data after harmonizing element codes. 
""" - df = df.copy() - df["element_code"] = [str(element_code).zfill(N_CHARACTERS_ELEMENT_CODE) for element_code in df["element_code"]] + tb = tb.copy() + tb["element_code"] = [str(element_code).zfill(N_CHARACTERS_ELEMENT_CODE) for element_code in tb["element_code"]] # Convert both columns to category to reduce memory - df = df.astype({"element_code": "category", element_col: "category"}) + tb = tb.astype({"element_code": "category", element_col: "category"}) # Fix those few cases where there is more than one item per item code within a given dataset. if dataset_short_name in ELEMENT_AMENDMENTS: for amendment in ELEMENT_AMENDMENTS[dataset_short_name]: # Ensure new item code and item name are added as categories, to avoid errors. - if amendment["new_element_code"] not in df["element_code"].cat.categories: - df["element_code"] = df["element_code"].cat.add_categories(amendment["new_element_code"]) - if amendment["new_fao_element"] not in df[element_col].cat.categories: - df[element_col] = df[element_col].cat.add_categories(amendment["new_fao_element"]) - if unit_col is not None and amendment["new_fao_unit"] not in df[unit_col].cat.categories: - df[unit_col] = df[unit_col].cat.add_categories(amendment["new_fao_unit"]) + if amendment["new_element_code"] not in tb["element_code"].cat.categories: + tb["element_code"] = tb["element_code"].cat.add_categories(amendment["new_element_code"]) + if amendment["new_fao_element"] not in tb[element_col].cat.categories: + tb[element_col] = tb[element_col].cat.add_categories(amendment["new_fao_element"]) + if unit_col is not None and amendment["new_fao_unit"] not in tb[unit_col].cat.categories: + tb[unit_col] = tb[unit_col].cat.add_categories(amendment["new_fao_unit"]) if unit_col is not None: # Update element code, element name, and unit name. 
- df.loc[ - (df["element_code"] == amendment["element_code"]) & (df[element_col] == amendment["fao_element"]), + tb.loc[ + (tb["element_code"] == amendment["element_code"]) & (tb[element_col] == amendment["fao_element"]), ("element_code", element_col, unit_col), ] = (amendment["new_element_code"], amendment["new_fao_element"], amendment["new_fao_unit"]) else: # Update element code, and element name. - df.loc[ - (df["element_code"] == amendment["element_code"]) & (df[element_col] == amendment["fao_element"]), + tb.loc[ + (tb["element_code"] == amendment["element_code"]) & (tb[element_col] == amendment["fao_element"]), ("element_code", element_col), ] = (amendment["new_element_code"], amendment["new_fao_element"]) - return df + return tb -def harmonize_countries(data: pd.DataFrame, countries_metadata: pd.DataFrame) -> pd.DataFrame: +def harmonize_countries(tb: Table, countries_metadata: Table) -> Table: """Harmonize country names. A new column 'country' will be added, with the harmonized country names. Column 'fao_country' will remain, to have @@ -510,21 +514,20 @@ def harmonize_countries(data: pd.DataFrame, countries_metadata: pd.DataFrame) -> Parameters ---------- - data : pd.DataFrame + tb : Table Data before harmonizing country names. - countries_metadata : pd.DataFrame + countries_metadata : Table Table 'countries' from garden faostat_metadata dataset. Returns ------- - data : pd.DataFrame + tb : Table Data after harmonizing country names. """ - data = data.copy() + tb = tb.copy() # Add harmonized country names (from countries metadata) to data. - data = pd.merge( - data, + tb = tb.merge( countries_metadata[["area_code", "fao_country", "country"]].rename( columns={"fao_country": "fao_country_check"} ), @@ -533,26 +536,26 @@ def harmonize_countries(data: pd.DataFrame, countries_metadata: pd.DataFrame) -> ) # area_code should always be an int - data["area_code"] = data["area_code"].astype(int) + tb["area_code"] = tb["area_code"].astype(int) # Sanity check. 
- country_mismatch = data[(data["fao_country"].astype(str) != data["fao_country_check"])] + country_mismatch = tb[(tb["fao_country"].astype(str) != tb["fao_country_check"])] if len(country_mismatch) > 0: faulty_mapping = country_mismatch.set_index("fao_country").to_dict()["fao_country_check"] log.warning(f"Mismatch between fao_country in data and in metadata: {faulty_mapping}") - data = data.drop(columns="fao_country_check") + tb = tb.drop(columns="fao_country_check") # Remove unmapped countries. - data = data[data["country"].notnull()].reset_index(drop=True) + tb = tb[tb["country"].notnull()].reset_index(drop=True) # Further sanity checks. - check_that_countries_are_well_defined(data) + check_that_countries_are_well_defined(tb) check_that_regions_with_subregions_are_ignored_when_constructing_aggregates(countries_metadata) # Set appropriate dtypes. - data = data.astype({"country": "category", "fao_country": "category"}) + tb = tb.astype({"country": "category", "fao_country": "category"}) - return data + return tb def prepare_dataset_description(fao_description: str, owid_description: str) -> str: @@ -619,77 +622,77 @@ def prepare_variable_description(item: str, element: str, item_description: str, return description -def remove_rows_with_nan_value(data: pd.DataFrame, verbose: bool = False) -> pd.DataFrame: +def remove_rows_with_nan_value(tb: Table, verbose: bool = False) -> Table: """Remove rows for which column "value" is nan. Parameters ---------- - data : pd.DataFrame + tb : Table Data for current dataset. verbose : bool True to display information about the number and fraction of rows removed. Returns ------- - data : pd.DataFrame + tb : Table Data after removing nan values. """ - data = data.copy() + tb = tb.copy() # Number of rows with a nan in column "value". # We could also remove rows with any nan, however, before doing that, we would need to assign a value to nan flags. 
- n_rows_with_nan_value = len(data[data["value"].isnull()]) + n_rows_with_nan_value = len(tb[tb["value"].isnull()]) if n_rows_with_nan_value > 0: - frac_nan_rows = n_rows_with_nan_value / len(data) + frac_nan_rows = n_rows_with_nan_value / len(tb) if verbose: log.info(f"Removing {n_rows_with_nan_value} rows ({frac_nan_rows: .2%}) " f"with nan in column 'value'.") if frac_nan_rows > 0.15: log.warning(f"{frac_nan_rows: .0%} rows of nan values removed.") - data = data.dropna(subset="value").reset_index(drop=True) + tb = tb.dropna(subset="value").reset_index(drop=True) - return data + return tb -def remove_columns_with_only_nans(data: pd.DataFrame, verbose: bool = True) -> pd.DataFrame: +def remove_columns_with_only_nans(tb: Table, verbose: bool = True) -> Table: """Remove columns that only have nans. In principle, it should not be possible that columns have only nan values, but we use this function just in case. Parameters ---------- - data : pd.DataFrame + tb : Table Data for current dataset. verbose : bool True to display information about the removal of columns with nan values. Returns ------- - data : pd.DataFrame + tb : Table Data after removing columns of nans. """ - data = data.copy() + tb = tb.copy() # Remove columns that only have nans. - columns_of_nans = data.columns[data.isnull().all(axis=0)] + columns_of_nans = tb.columns[tb.isnull().all(axis=0)] if len(columns_of_nans) > 0: if verbose: log.info( - f"Removing {len(columns_of_nans)} columns ({len(columns_of_nans) / len(data.columns): .2%}) " + f"Removing {len(columns_of_nans)} columns ({len(columns_of_nans) / len(tb.columns): .2%}) " f"that have only nans." 
) - data = data.drop(columns=columns_of_nans) + tb = tb.drop(columns=columns_of_nans) - return data + return tb -def remove_duplicates(data: pd.DataFrame, index_columns: List[str], verbose: bool = True) -> pd.DataFrame: +def remove_duplicates(tb: Table, index_columns: List[str], verbose: bool = True) -> Table: """Remove rows with duplicated index (country, year, item, element, unit). First attempt to use flags to remove duplicates. If there are still duplicates, remove in whatever way possible. Parameters ---------- - data : pd.DataFrame + tb : Table Data for current dataset. index_columns : list Columns expected to be used as index of the data. @@ -698,35 +701,32 @@ def remove_duplicates(data: pd.DataFrame, index_columns: List[str], verbose: boo Returns ------- - data : pd.DataFrame + tb : Table Data (with a dummy numerical index) after removing duplicates. """ - data = data.copy() + tb = tb.copy() # Select columns that will be used as indexes. - _index_columns = [column for column in index_columns if column in data.columns] + _index_columns = [column for column in index_columns if column in tb.columns] # Number of ambiguous indexes (those that have multiple data values). - n_ambiguous_indexes = len(data[data.duplicated(subset=_index_columns, keep="first")]) + n_ambiguous_indexes = len(tb[tb.duplicated(subset=_index_columns, keep="first")]) if n_ambiguous_indexes > 0: # Add flag ranking to dataset. flags_ranking = FLAGS_RANKING.copy() flags_ranking["flag"] = flags_ranking["flag"].fillna(FLAG_OFFICIAL_DATA) - data = pd.merge( - data, + tb = tb.merge( flags_ranking[["flag", "ranking"]].rename(columns={"ranking": "flag_ranking"}), on="flag", how="left", ).astype({"flag": "category"}) # Number of ambiguous indexes that cannot be solved using flags. 
- n_ambiguous_indexes_unsolvable = len( - data[data.duplicated(subset=_index_columns + ["flag_ranking"], keep="first")] - ) + n_ambiguous_indexes_unsolvable = len(tb[tb.duplicated(subset=_index_columns + ["flag_ranking"], keep="first")]) # Remove ambiguous indexes (those that have multiple data values). # When possible, use flags to prioritise among duplicates. - data = data.sort_values(_index_columns + ["flag_ranking"]).drop_duplicates(subset=_index_columns, keep="first") - frac_ambiguous = n_ambiguous_indexes / len(data) + tb = tb.sort_values(_index_columns + ["flag_ranking"]).drop_duplicates(subset=_index_columns, keep="first") + frac_ambiguous = n_ambiguous_indexes / len(tb) frac_ambiguous_solved_by_flags = 1 - (n_ambiguous_indexes_unsolvable / n_ambiguous_indexes) if verbose: log.info( @@ -734,12 +734,12 @@ def remove_duplicates(data: pd.DataFrame, index_columns: List[str], verbose: boo f"{frac_ambiguous_solved_by_flags: .2%} of ambiguities were solved with flags." ) - data = data.drop(columns=["flag_ranking"]) + tb = tb.drop(columns=["flag_ranking"]) - return data + return tb -def clean_year_column(year_column: pd.Series) -> pd.Series: +def clean_year_column(year_column: Variable) -> Variable: """Clean year column. Year is given almost always as an integer value. But sometimes (e.g. in the faostat_fs dataset) it is a range of @@ -748,12 +748,12 @@ def clean_year_column(year_column: pd.Series) -> pd.Series: Parameters ---------- - year_column : pd.Series + year_column : Variable Original column of year values (which may be integer, or ranges of values). Returns ------- - year_clean_series : pd.Series + year_clean_series : Variable Clean column of years, as integer values. """ @@ -769,15 +769,12 @@ def clean_year_column(year_column: pd.Series) -> pd.Series: year_clean.append(int(year)) # Prepare series of integer year values. 
- year_clean_series = pd.Series(year_clean) - year_clean_series.name = "year" + year_clean_series = Variable(year_clean, name="year") return year_clean_series -def add_custom_names_and_descriptions( - data: pd.DataFrame, items_metadata: pd.DataFrame, elements_metadata: pd.DataFrame -) -> pd.DataFrame: +def add_custom_names_and_descriptions(tb: Table, items_metadata: Table, elements_metadata: Table) -> Table: """Add columns with custom names, descriptions and conversion factors for elements, items and units. The returned dataframe will have the same number of rows as the ingested data, but: @@ -802,38 +799,36 @@ def add_custom_names_and_descriptions( Parameters ---------- - data : pd.DataFrame + tb : Table Data for a particular domain, with harmonized item codes and element codes. - items_metadata : pd.DataFrame + items_metadata : Table Table 'items' from the garden faostat_metadata dataset, after selecting items for the current dataset. - elements_metadata : pd.DataFrame + elements_metadata : Table Table 'elements' from the garden faostat_metadata dataset, after selecting elements for the current dataset. Returns ------- - data : pd.DataFrame + tb : Table Data after adding and editing its columns as described above. """ - data = data.copy() + tb = tb.copy() error = "There are missing item codes in metadata." - assert set(data["item_code"]) <= set(items_metadata["item_code"]), error + assert set(tb["item_code"]) <= set(items_metadata["item_code"]), error error = "There are missing element codes in metadata." 
- assert set(data["element_code"]) <= set(elements_metadata["element_code"]), error + assert set(tb["element_code"]) <= set(elements_metadata["element_code"]), error - _expected_n_rows = len(data) - data = pd.merge( - data.rename(columns={"item": "fao_item"}), + _expected_n_rows = len(tb) + tb = tb.rename(columns={"item": "fao_item"}, errors="raise").merge( items_metadata[["item_code", "owid_item", "owid_item_description"]], on="item_code", how="left", ) - assert len(data) == _expected_n_rows, "Something went wrong when merging data with items metadata." + assert len(tb) == _expected_n_rows, "Something went wrong when merging data with items metadata." - data = pd.merge( - data.rename(columns={"element": "fao_element", "unit": "fao_unit_short_name"}), + tb = tb.rename(columns={"element": "fao_element", "unit": "fao_unit_short_name"}, errors="raise").merge( elements_metadata[ [ "element_code", @@ -847,10 +842,10 @@ def add_custom_names_and_descriptions( on=["element_code"], how="left", ) - assert len(data) == _expected_n_rows, "Something went wrong when merging data with elements metadata." + assert len(tb) == _expected_n_rows, "Something went wrong when merging data with elements metadata." # `category` type was lost during merge, convert it back - data = data.astype( + tb = tb.astype( { "element_code": "category", "item_code": "category", @@ -858,33 +853,31 @@ def add_custom_names_and_descriptions( ) # Remove "owid_" from column names. - data = data.rename(columns={column: column.replace("owid_", "") for column in data.columns}) + tb = tb.rename(columns={column: column.replace("owid_", "") for column in tb.columns}) # Fill missing unit and short_unit columns with empty strings. 
for column in ["unit", "unit_short_name"]: - missing_unit_mask = data[column].isnull() - if not data[missing_unit_mask].empty: - log.warning(f"Missing {column} for elements: {set(data[missing_unit_mask]['element'])}") - data[column] = data[column].cat.add_categories("").fillna("") + missing_unit_mask = tb[column].isnull() + if not tb[missing_unit_mask].empty: + log.warning(f"Missing {column} for elements: {set(tb[missing_unit_mask]['element'])}") + tb[column] = tb[column].cat.add_categories("").fillna("") - return data + return tb -def remove_regions_from_countries_regions_members( - countries_regions: pd.DataFrame, regions_to_remove: List[str] -) -> pd.DataFrame: +def remove_regions_from_countries_regions_members(countries_regions: Table, regions_to_remove: List[str]) -> Table: """Remove regions that have to be ignored from the lists of members in the countries-regions dataset. Parameters ---------- - countries_regions : pd.DataFrame + countries_regions : Table Countries-regions dataset (from the OWID catalog). regions_to_remove : list Regions to ignore. Returns ------- - countries_regions : pd.DataFrame + countries_regions : Table Countries-regions dataset after removing regions from the lists of members of each country or region. """ @@ -908,12 +901,12 @@ def remove_regions_from_countries_regions_members( return countries_regions -def load_population(ds_population: catalog.Dataset) -> pd.DataFrame: +def load_population(ds_population: Dataset) -> Table: """Load OWID population dataset, and add historical regions to it. Returns ------- - population : pd.DataFrame + population : Table Population dataset. """ @@ -934,17 +927,17 @@ def load_population(ds_population: catalog.Dataset) -> pd.DataFrame: # Select only years for which we have data for all member countries. 
_population = _population[_population["country"] == len(members)].reset_index(drop=True) _population["country"] = country - population = pd.concat([population, _population], ignore_index=True).reset_index(drop=True) + population = pr.concat([population, _population], ignore_index=True).reset_index(drop=True) error = "Duplicate country-years found in population. Check if historical regions changed." assert population[population.duplicated(subset=["country", "year"])].empty, error - return cast(pd.DataFrame, population) + return population def remove_overlapping_data_between_historical_regions_and_successors( - data_region: pd.DataFrame, -) -> pd.DataFrame: + data_region: Table, +) -> Table: """Remove overlapping data between a historical region and any of its successors (if there is any overlap), to avoid double-counting those regions when aggregating data. @@ -953,12 +946,12 @@ def remove_overlapping_data_between_historical_regions_and_successors( Parameters ---------- - data_region : pd.DataFrame + data_region : Table Data (after selecting the countries of a certain relevant region). Returns ------- - data_region : pd.DataFrame + data_region : Table Data after removing data with overlapping regions. """ @@ -976,7 +969,7 @@ def remove_overlapping_data_between_historical_regions_and_successors( columns ].drop_duplicates() # Find unique years where the above combinations of item-element-years of region and successors overlap. - overlapping_years = pd.concat([historical_region_years, historical_successors_years], ignore_index=True) + overlapping_years = pr.concat([historical_region_years, historical_successors_years], ignore_index=True) overlapping_years = overlapping_years[overlapping_years.duplicated()] if not overlapping_years.empty: log.warning( @@ -986,12 +979,13 @@ def remove_overlapping_data_between_historical_regions_and_successors( # Select rows in data_region to drop. 
overlapping_years["country"] = historical_region indexes_to_drop.extend( - pd.merge( - data_region.reset_index(), + data_region.reset_index() + .merge( overlapping_years, how="inner", on=["country"] + columns, - )["index"].tolist() + )["index"] + .tolist() ) if len(indexes_to_drop) > 0: @@ -1002,12 +996,12 @@ def remove_overlapping_data_between_historical_regions_and_successors( def add_regions( - data: pd.DataFrame, - ds_regions: catalog.Dataset, - ds_income_groups: catalog.Dataset, - ds_population: catalog.Dataset, - elements_metadata: pd.DataFrame, -) -> pd.DataFrame: + tb: Table, + ds_regions: Dataset, + ds_income_groups: Dataset, + ds_population: Dataset, + elements_metadata: Table, +) -> Table: """Add region aggregates (i.e. aggregate data for continents and income groups). Regions to be created are defined above, in REGIONS_TO_ADD, and the variables for which data will be aggregated are @@ -1019,18 +1013,18 @@ def add_regions( Parameters ---------- - data : pd.DataFrame + tb : Table Clean data (after harmonizing items, element and countries). - elements_metadata : pd.DataFrame + elements_metadata : Table Table 'elements' from the garden faostat_metadata dataset, after selecting elements for the current domain. Returns ------- - data : pd.DataFrame + tb : Table Data after adding rows for aggregate regions. """ - data = data.copy() + tb = tb.copy() # Create a dictionary of aggregations, specifying the operation to use when creating regions. # These aggregations are defined in the custom_elements_and_units.csv file, and added to the metadata dataset. @@ -1040,7 +1034,7 @@ def add_regions( .to_dict()["owid_aggregation"] ) if len(aggregations) > 0: - log.info("add_regions", shape=data.shape) + log.info("add_regions", shape=tb.shape) # Load population dataset, countries-regions, and income groups datasets. 
population = load_population(ds_population=ds_population) @@ -1068,9 +1062,7 @@ def add_regions( element_codes = aggregations_inverted[aggregation] # Select relevant rows in the data. - data_region = data[ - (data["country"].isin(countries_in_region)) & (data["element_code"].isin(element_codes)) - ] + data_region = tb[(tb["country"].isin(countries_in_region)) & (tb["element_code"].isin(element_codes))] # Ensure there is no overlap between historical regions and their successors. data_region = remove_overlapping_data_between_historical_regions_and_successors(data_region) @@ -1106,7 +1098,7 @@ def add_regions( ) # Add total population of the region (for each year) to the relevant data. - data_region = pd.merge(data_region, region_population, on="year", how="left") + data_region = data_region.merge(region_population, on="year", how="left") # Keep only rows for which we have sufficient data. data_region = data_region[ @@ -1127,30 +1119,30 @@ def add_regions( ) # Add data for current region to data. - data = dataframes.concatenate( - [data[data["country"] != region].reset_index(drop=True), data_region], + tb = dataframes.concatenate( + [tb[tb["country"] != region].reset_index(drop=True), data_region], ignore_index=True, ) # Check that the fraction of population with data is as high as expected. - frac_population = data["population_with_data"] / data["population"] + frac_population = tb["population_with_data"] / tb["population"] assert frac_population[frac_population.notnull()].min() >= region_min_frac_population_with_data # Drop column of total population (we will still keep population_with_data). - data = data.drop(columns=["population"]) + tb = tb.drop(columns=["population"]) # Make area_code of category type (it contains integers and strings, and feather does not support object types). - data["area_code"] = data["area_code"].astype(str).astype("category") + tb["area_code"] = tb["area_code"].astype(str).astype("category") # Sort conveniently. 
- data = data.sort_values(["country", "year"]).reset_index(drop=True) + tb = tb.sort_values(["country", "year"]).reset_index(drop=True) - check_that_countries_are_well_defined(data) + check_that_countries_are_well_defined(tb) - return data + return tb -def add_fao_population_if_given(data: pd.DataFrame) -> pd.DataFrame: +def add_fao_population_if_given(tb: Table) -> Table: """Add a new column for FAO population, if population values are given in the data. Some datasets (e.g. faostat_fbsh and faostat_fbs) include per-capita variables from the beginning. When this @@ -1159,12 +1151,12 @@ def add_fao_population_if_given(data: pd.DataFrame) -> pd.DataFrame: Parameters ---------- - data : pd.DataFrame + tb : Table Data (after harmonizing elements and items, but before harmonizing countries). Returns ------- - data : pd.DataFrame + tb : Table Data, after adding a column 'fao_population', if FAO population was found in the data. """ @@ -1174,14 +1166,14 @@ def add_fao_population_if_given(data: pd.DataFrame) -> pd.DataFrame: # Expected name of unit of FAO population. fao_population_unit_name = "thousand Number" # Select rows that correspond to FAO population. - population_rows_mask = (data["fao_item"] == fao_population_item_name) & ( - data["fao_element"] == fao_population_element_name + population_rows_mask = (tb["fao_item"] == fao_population_item_name) & ( + tb["fao_element"] == fao_population_element_name ) if population_rows_mask.any(): - data = data.copy() + tb = tb.copy() - fao_population = data[population_rows_mask].reset_index(drop=True) + fao_population = tb[population_rows_mask].reset_index(drop=True) # Check that population is given in "1000 persons" and convert to persons. assert list(fao_population["unit"].unique()) == [ @@ -1199,20 +1191,20 @@ def add_fao_population_if_given(data: pd.DataFrame) -> pd.DataFrame: ) # Add FAO population as a new column in data. 
- data = pd.merge(data, fao_population, how="left", on=["area_code", "year"]) + tb = tb.merge(fao_population, how="left", on=["area_code", "year"]) - return data + return tb def add_population( - df: pd.DataFrame, - ds_population: catalog.Dataset, + tb: Table, + ds_population: Dataset, country_col: str = "country", year_col: str = "year", population_col: str = "population", warn_on_missing_countries: bool = True, show_full_warning: bool = True, -) -> pd.DataFrame: +) -> Table: """Add a column of OWID population to the countries in the data, including population of historical regions. This function has been adapted from datautils.geo, because population currently does not include historic regions. @@ -1220,7 +1212,7 @@ def add_population( Parameters ---------- - df : pd.DataFrame + tb : Table Data without a column for population (after harmonizing elements, items and country names). country_col : str Name of country column in data. @@ -1235,7 +1227,7 @@ def add_population( Returns ------- - df_with_population : pd.DataFrame + tb_with_population : Table Data after adding a column for population for all countries in the data. """ @@ -1250,7 +1242,7 @@ def add_population( )[[country_col, year_col, population_col]] # Check if there is any missing country. - missing_countries = set(df[country_col]) - set(population[country_col]) + missing_countries = set(tb[country_col]) - set(population[country_col]) if len(missing_countries) > 0: if warn_on_missing_countries: geo.warn_on_list_of_entities( @@ -1264,14 +1256,12 @@ def add_population( ) # Add population to original dataframe. 
- df_with_population = pd.merge(df, population, on=[country_col, year_col], how="left") + tb_with_population = tb.merge(population, on=[country_col, year_col], how="left") - return df_with_population + return tb_with_population -def convert_variables_given_per_capita_to_total_value( - data: pd.DataFrame, elements_metadata: pd.DataFrame -) -> pd.DataFrame: +def convert_variables_given_per_capita_to_total_value(tb: Table, elements_metadata: Table) -> Table: """Replace variables given per capita in the original data by total values. NOTE: @@ -1281,14 +1271,14 @@ def convert_variables_given_per_capita_to_total_value( Parameters ---------- - data : pd.DataFrame + tb : Table Data (after harmonizing elements and items, but before harmonizing countries). - elements_metadata : pd.DataFrame + elements_metadata : Table Table 'elements' from the garden faostat_metadata dataset, after selecting the elements of the relevant domain. Returns ------- - data : pd.DataFrame + tb : Table Data, after converting per-capita variables to total value. """ @@ -1300,29 +1290,29 @@ def convert_variables_given_per_capita_to_total_value( elements_metadata[elements_metadata["was_per_capita"]]["element_code"].unique() ) if len(element_codes_that_were_per_capita) > 0: - data = data.copy() + tb = tb.copy() - assert "fao_population" in data.columns, "fao_population not found, maybe it changed item, element." + assert "fao_population" in tb.columns, "fao_population not found, maybe it changed item, element." # Select variables that were given as per capita variables in the original data and that need to be converted. - per_capita_mask = data["element_code"].isin(element_codes_that_were_per_capita) + per_capita_mask = tb["element_code"].isin(element_codes_that_were_per_capita) # Multiply them by the FAO population to convert them into total value. 
- data.loc[per_capita_mask, "value"] = data[per_capita_mask]["value"] * data[per_capita_mask]["fao_population"] + tb.loc[per_capita_mask, "value"] = tb[per_capita_mask]["value"] * tb[per_capita_mask]["fao_population"] # Include an additional description to all elements that were converted from per capita to total variables. - if "" not in data["element_description"].cat.categories: - data["element_description"] = data["element_description"].cat.add_categories([""]) - data.loc[per_capita_mask, "element_description"] = data.loc[per_capita_mask, "element_description"].fillna("") - data["element_description"] = dataframes.apply_on_categoricals( - [data.element_description, per_capita_mask.astype("category")], + if "" not in tb["element_description"].cat.categories: + tb["element_description"] = tb["element_description"].cat.add_categories([""]) + tb.loc[per_capita_mask, "element_description"] = tb.loc[per_capita_mask, "element_description"].fillna("") + tb["element_description"] = dataframes.apply_on_categoricals( + [tb.element_description, per_capita_mask.astype("category")], lambda desc, mask: f"{desc} {WAS_PER_CAPITA_ADDED_ELEMENT_DESCRIPTION}".lstrip() if mask else f"{desc}", ) - return data + return tb -def add_per_capita_variables(data: pd.DataFrame, elements_metadata: pd.DataFrame) -> pd.DataFrame: +def add_per_capita_variables(tb: Table, elements_metadata: Table) -> Table: """Add per-capita variables to data in a long format (and keep original variables as well). NOTE: @@ -1333,28 +1323,28 @@ def add_per_capita_variables(data: pd.DataFrame, elements_metadata: pd.DataFrame Parameters ---------- - data : pd.DataFrame + tb : Table Clean data (after harmonizing item codes and element codes, and countries, and adding aggregate regions). - elements_metadata : pd.DataFrame + elements_metadata : Table Elements table from the garden faostat_metadata dataset, after selecting elements for the relevant domain. 
Returns ------- - data : pd.DataFrame + tb : Table Data with per-capita variables. """ - data = data.copy() + tb = tb.copy() # Find element codes that have to be made per capita. element_codes_to_make_per_capita = list( elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() ) if len(element_codes_to_make_per_capita) > 0: - log.info("add_per_capita_variables", shape=data.shape) + log.info("add_per_capita_variables", shape=tb.shape) # Create a new dataframe that will have all per capita variables. - per_capita_data = data[data["element_code"].isin(element_codes_to_make_per_capita)].reset_index(drop=True) + per_capita_data = tb[tb["element_code"].isin(element_codes_to_make_per_capita)].reset_index(drop=True) # Change element codes of per capita variables. per_capita_data["element_code"] = per_capita_data["element_code"].cat.rename_categories( @@ -1393,33 +1383,36 @@ def add_per_capita_variables(data: pd.DataFrame, elements_metadata: pd.DataFrame lambda c: f"{c} {NEW_PER_CAPITA_ADDED_ELEMENT_DESCRIPTION}" ) # Add new rows with per capita variables to data. - data = dataframes.concatenate([data, per_capita_data], ignore_index=True).reset_index(drop=True) + tb = dataframes.concatenate([tb, per_capita_data], ignore_index=True).reset_index(drop=True) - return data + return tb -def clean_data_values(values: pd.Series, amendments: Dict[str, str]) -> pd.Series: +def clean_data_values(values: Variable, amendments: Dict[str, str]) -> Variable: """Fix spurious data values (defined in value_amendments.csv) and make values a float column. Parameters ---------- - values : pd.Series + values : Variable Content of the "value" column in the original data. Returns ------- - values_clean : pd.Series + values_clean : Variable Original values after fixing known issues and converting to float. 
""" values_clean = values.copy() if len(amendments) > 0: - values_clean = dataframes.map_series( - series=values_clean, - mapping=amendments, - warn_on_missing_mappings=False, - warn_on_unused_mappings=True, - show_full_warning=True, + values_clean = Variable( + dataframes.map_series( + series=values_clean, + mapping=amendments, + warn_on_missing_mappings=False, + warn_on_unused_mappings=True, + show_full_warning=True, + ), + name="value", ) # Convert all numbers into numeric. @@ -1431,13 +1424,13 @@ def clean_data_values(values: pd.Series, amendments: Dict[str, str]) -> pd.Serie def clean_data( - data: pd.DataFrame, - ds_population: catalog.Dataset, - items_metadata: pd.DataFrame, - elements_metadata: pd.DataFrame, - countries_metadata: pd.DataFrame, + tb: Table, + ds_population: Dataset, + items_metadata: Table, + elements_metadata: Table, + countries_metadata: Table, amendments: Dict[str, str], -) -> pd.DataFrame: +) -> Table: """Process data (with already harmonized item codes and element codes), before adding aggregate regions and per-capita variables. @@ -1450,36 +1443,37 @@ def clean_data( Parameters ---------- - data : pd.DataFrame + tb : Table Unprocessed data for current dataset (with harmonized item codes and element codes). - items_metadata : pd.DataFrame + items_metadata : Table Items metadata (from the metadata dataset) after selecting items for only the relevant domain. - elements_metadata : pd.DataFrame + elements_metadata : Table Elements metadata (from the metadata dataset) after selecting elements for only the relevant domain. - countries_metadata : pd.DataFrame + countries_metadata : Table Countries metadata (from the metadata dataset). amendments : dict Value amendments (if any). Returns ------- - data : pd.DataFrame + tb : Table Processed data, ready to be made into a table for a garden dataset. """ - data = data.copy() + tb = tb.copy() # Fix spurious data values (applying mapping in value_amendments.csv) and ensure column of values is float. 
- data["value"] = clean_data_values(data["value"], amendments=amendments) + tb["value"] = clean_data_values(tb["value"], amendments=amendments) # Convert nan flags into "official" (to avoid issues later on when dealing with flags). - data["flag"] = pd.Series( - [flag if not pd.isnull(flag) else FLAG_OFFICIAL_DATA for flag in data["flag"]], + tb["flag"] = Variable( + [flag if not pd.isnull(flag) else FLAG_OFFICIAL_DATA for flag in tb["flag"]], dtype="category", + name="flag", ) # Some datasets (at least faostat_fa) use "recipient_country" instead of "area". For consistency, change this. - data = data.rename( + tb = tb.rename( columns={ "area": "fao_country", "recipient_country": "fao_country", @@ -1488,33 +1482,33 @@ def clean_data( ) # Ensure year column is integer (sometimes it is given as a range of years, e.g. 2013-2015). - data["year"] = clean_year_column(data["year"]) + tb["year"] = clean_year_column(tb["year"]) # Remove rows with nan value. - data = remove_rows_with_nan_value(data) + tb = remove_rows_with_nan_value(tb) if len(items_metadata) > 0 and len(elements_metadata) > 0: # This is not fulfilled for faostat_qv since the last update. # Use custom names for items, elements and units (and keep original names in "fao_*" columns). - data = add_custom_names_and_descriptions(data, items_metadata, elements_metadata) + tb = add_custom_names_and_descriptions(tb, items_metadata, elements_metadata) # Multiply data values by their corresponding unit factor, if any was given, and then drop unit_factor column. 
- unit_factor_mask = data["unit_factor"].notnull() - data.loc[unit_factor_mask, "value"] = data[unit_factor_mask]["value"] * data[unit_factor_mask]["unit_factor"] - data = data.drop(columns=["unit_factor"]) + unit_factor_mask = tb["unit_factor"].notnull() + tb.loc[unit_factor_mask, "value"] = tb[unit_factor_mask]["value"] * tb[unit_factor_mask]["unit_factor"] + tb = tb.drop(columns=["unit_factor"]) # Add FAO population as an additional column (if given in the original data). - data = add_fao_population_if_given(data) + tb = add_fao_population_if_given(tb) # Convert variables that were given per-capita to total value. - data = convert_variables_given_per_capita_to_total_value(data, elements_metadata=elements_metadata) + tb = convert_variables_given_per_capita_to_total_value(tb, elements_metadata=elements_metadata) # Harmonize country names. - data = harmonize_countries(data=data, countries_metadata=countries_metadata) + tb = harmonize_countries(tb=tb, countries_metadata=countries_metadata) # Remove duplicated data points (if any) keeping the one with lowest ranking flag (i.e. highest priority). - data = remove_duplicates( - data=data, + tb = remove_duplicates( + tb=tb, index_columns=["area_code", "year", "item_code", "element_code"], verbose=True, ) @@ -1522,17 +1516,17 @@ def clean_data( # Add column for population; when creating region aggregates, this column will have the population of the countries # for which there was data. For example, for Europe in a specific year, the population may differ from item to item, # because for one item we may have more European countries informed than for the other. 
- data = add_population( - df=data, ds_population=ds_population, population_col="population_with_data", warn_on_missing_countries=False + tb = add_population( + tb=tb, ds_population=ds_population, population_col="population_with_data", warn_on_missing_countries=False ) # Convert back to categorical columns (maybe this should be handled automatically in `add_population_to_dataframe`) - data = data.astype({"country": "category"}) + tb = tb.astype({"country": "category"}) - return data + return tb -def optimize_table_dtypes(table: catalog.Table) -> catalog.Table: +def optimize_table_dtypes(table: Table) -> Table: """Optimize the dtypes of the columns in a table. NOTE: Using `.astype` in a loop over different columns is slow. Instead, it is better to map all columns at once or @@ -1540,12 +1534,12 @@ def optimize_table_dtypes(table: catalog.Table) -> catalog.Table: Parameters ---------- - table : catalog.Table + table : Table Table with possibly non-optimal column dtypes. Returns ------- - optimized_table : catalog.Table + optimized_table : Table Table with optimized dtypes. """ @@ -1563,35 +1557,35 @@ def optimize_table_dtypes(table: catalog.Table) -> catalog.Table: return optimized_table -def prepare_long_table(data: pd.DataFrame) -> catalog.Table: +def prepare_long_table(tb: Table) -> Table: """Prepare a data table in long format. Parameters ---------- - data : pd.DataFrame + tb : Table Data (as a dataframe) in long format. Returns ------- - data_table_long : catalog.Table + tb_long : Table Data (as a table) in long format. """ # Create new table with long data. - data_table_long = catalog.Table(data) + tb_long = Table(tb) # Ensure table has the optimal dtypes before storing it as feather file. - data_table_long = optimize_table_dtypes(table=data_table_long) + tb_long = optimize_table_dtypes(table=tb_long) # Set appropriate indexes. 
index_columns = ["area_code", "year", "item_code", "element_code"] - data_table_long = data_table_long.set_index(index_columns, verify_integrity=True).sort_index() + tb_long = tb_long.set_index(index_columns, verify_integrity=True).sort_index() # Sanity check. - number_of_infinities = len(data_table_long[data_table_long["value"] == np.inf]) + number_of_infinities = len(tb_long[tb_long["value"] == np.inf]) assert number_of_infinities == 0, f"There are {number_of_infinities} infinity values in the long table." - return cast(catalog.Table, data_table_long) + return tb_long def create_variable_short_names(variable_name: str) -> str: @@ -1618,7 +1612,7 @@ def create_variable_short_names(variable_name: str) -> str: # Check that the extraction was correct by constructing the variable name again and comparing with the original. assert variable_name == f"{item} | {item_code} || {element} | {element_code} || {unit}" - new_name = catalog.utils.underscore(variable_name) + new_name = underscore(variable_name) # Check that the number of characters of the short name is not too long. n_char = len(new_name) @@ -1629,17 +1623,17 @@ def create_variable_short_names(variable_name: str) -> str: # It could happen that it is not the item name that is long, but the element name, dataset, or unit. # But for the moment, assume it is the item name. assert len(item) > n_char_to_be_removed, "Variable name is too long, but it is not due to item name." - new_item = catalog.utils.underscore(item)[0:-n_char_to_be_removed] - new_name = catalog.utils.underscore(f"{new_item} | {item_code} || {element} | {element_code} || {unit}") + new_item = underscore(item)[0:-n_char_to_be_removed] + new_name = underscore(f"{new_item} | {item_code} || {element} | {element_code} || {unit}") # Check that now the new name now fulfils the length requirement. error = "Variable short name is too long. Improve create_variable_names function to account for this case." 
assert len(new_name) <= 255, error - return cast(str, new_name) + return new_name -def prepare_wide_table(data: pd.DataFrame) -> catalog.Table: +def prepare_wide_table(tb: Table) -> Table: """Flatten a long table to obtain a wide table with ["country", "year"] as index. The input table will be pivoted to have [country, year] as index, and as many columns as combinations of @@ -1647,20 +1641,20 @@ def prepare_wide_table(data: pd.DataFrame) -> catalog.Table: Parameters ---------- - data : pd.DataFrame + tb : Table Data for current domain. Returns ------- - wide_table : catalog.Table + tb_wide : Table Data table with index [country, year]. """ - data = data.copy(deep=False) + tb = tb.copy(deep=False) # Ensure "item" exists in data (there are some datasets where it may be missing). - if "item" not in data.columns: - data["item"] = "" + if "item" not in tb.columns: + tb["item"] = "" # Construct a variable name that will not yield any possible duplicates. # This will be used as column names (which will then be formatted properly with underscores and lower case), @@ -1668,8 +1662,8 @@ def prepare_wide_table(data: pd.DataFrame) -> catalog.Table: # Also, for convenience, keep a similar structure as in the previous OWID dataset release. # Finally, ensure that the short name version of the variable is not too long # (which would cause issues when uploading to grapher). - data["variable_name"] = dataframes.apply_on_categoricals( - [data.item, data.item_code, data.element, data.element_code, data.unit], + tb["variable_name"] = dataframes.apply_on_categoricals( + [tb.item, tb.item_code, tb.element, tb.element_code, tb.unit], lambda item, item_code, element, @@ -1678,30 +1672,30 @@ def prepare_wide_table(data: pd.DataFrame) -> catalog.Table: ) # Construct a human-readable variable display name (which will be shown in grapher charts). 
- data["variable_display_name"] = dataframes.apply_on_categoricals( - [data.item, data.element, data.unit], + tb["variable_display_name"] = dataframes.apply_on_categoricals( + [tb.item, tb.element, tb.unit], lambda item, element, unit: f"{item} - {element} ({unit})", ) - if "item_description" in data.columns: + if "item_description" in tb.columns: # Construct a human-readable variable description (for the variable metadata). - data["variable_description"] = dataframes.apply_on_categoricals( - [data.item, data.element, data.item_description, data.element_description], + tb["variable_description"] = dataframes.apply_on_categoricals( + [tb.item, tb.element, tb.item_description, tb.element_description], prepare_variable_description, ) else: # This is the case for faostat_qv since the last update. - data["variable_description"] = "" + tb["variable_description"] = "" # Pivot over long dataframe to generate a wide dataframe with country-year as index, and as many columns as # unique elements in "variable_name" (which should be as many as combinations of item-elements). # Note: We include area_code in the index for completeness, but by construction country-year should not have # duplicates. # Note: `pivot` operation is usually faster on categorical columns - log.info("prepare_wide_table.pivot", shape=data.shape) + log.info("prepare_wide_table.pivot", shape=tb.shape) # Create a wide table with just the data values. - wide_table = catalog.Table( - data.pivot( + tb_wide = Table( + tb.pivot( index=["area_code", "country", "year"], columns=["variable_name"], values="value", @@ -1709,66 +1703,68 @@ def prepare_wide_table(data: pd.DataFrame) -> catalog.Table: ) # Add metadata to each new variable in the wide data table. - log.info("prepare_wide_table.adding_metadata", shape=wide_table.shape) + log.info("prepare_wide_table.adding_metadata", shape=tb_wide.shape) # Add variable name. 
- for column in wide_table.columns: - wide_table[column].metadata.title = column + for column in tb_wide.columns: + tb_wide[column].metadata.title = column # Add variable unit (long name). - variable_name_mapping = _variable_name_map(data, "unit") - for column in wide_table.columns: - wide_table[column].metadata.unit = variable_name_mapping[column] + variable_name_mapping = _variable_name_map(tb, "unit") + for column in tb_wide.columns: + tb_wide[column].metadata.unit = variable_name_mapping[column] - if "unit_short_name" in data.columns: + if "unit_short_name" in tb.columns: # Add variable unit (short name). - variable_name_mapping = _variable_name_map(data, "unit_short_name") - for column in wide_table.columns: - wide_table[column].metadata.short_unit = variable_name_mapping[column] + variable_name_mapping = _variable_name_map(tb, "unit_short_name") + for column in tb_wide.columns: + tb_wide[column].metadata.short_unit = variable_name_mapping[column] else: # This is the case for faostat_qv since the last update. - for column in wide_table.columns: - wide_table[column].metadata.short_unit = "" + for column in tb_wide.columns: + tb_wide[column].metadata.short_unit = "" # Add variable description. - variable_name_mapping = _variable_name_map(data, "variable_description") - for column in wide_table.columns: - wide_table[column].metadata.description = variable_name_mapping[column] + variable_name_mapping = _variable_name_map(tb, "variable_description") + for column in tb_wide.columns: + tb_wide[column].metadata.description = variable_name_mapping[column] - # Add display parameters (for grapher). - for column in wide_table.columns: - wide_table[column].metadata.display = {} + # Add display and presentation parameters (for grapher). + for column in tb_wide.columns: + tb_wide[column].metadata.display = {} + tb_wide[column].metadata.presentation = VariablePresentationMeta() # Display name. 
- variable_name_mapping = _variable_name_map(data, "variable_display_name") - for column in wide_table.columns: - wide_table[column].metadata.display["name"] = variable_name_mapping[column] + variable_name_mapping = _variable_name_map(tb, "variable_display_name") + for column in tb_wide.columns: + tb_wide[column].metadata.display["name"] = variable_name_mapping[column] + tb_wide[column].metadata.presentation.title_public = variable_name_mapping[column] # Ensure columns have the optimal dtypes, but codes are categories. - log.info("prepare_wide_table.optimize_table_dtypes", shape=wide_table.shape) - wide_table = optimize_table_dtypes(table=wide_table.reset_index()) + log.info("prepare_wide_table.optimize_table_dtypes", shape=tb_wide.shape) + tb_wide = optimize_table_dtypes(table=tb_wide.reset_index()) # Sort columns and rows conveniently. - wide_table = wide_table.set_index(["country", "year"], verify_integrity=True) - wide_table = wide_table[["area_code"] + sorted([column for column in wide_table.columns if column != "area_code"])] - wide_table = wide_table.sort_index(level=["country", "year"]).sort_index() + tb_wide = tb_wide.set_index(["country", "year"], verify_integrity=True) + tb_wide = tb_wide[["area_code"] + sorted([column for column in tb_wide.columns if column != "area_code"])] + tb_wide = tb_wide.sort_index(level=["country", "year"]).sort_index() # Make all column names snake_case. variable_to_short_name = { - column: create_variable_short_names(variable_name=wide_table[column].metadata.title) - for column in wide_table.columns - if wide_table[column].metadata.title is not None + column: create_variable_short_names(variable_name=tb_wide[column].metadata.title) + for column in tb_wide.columns + if tb_wide[column].metadata.title is not None } - wide_table = wide_table.rename(columns=variable_to_short_name, errors="raise") + tb_wide = tb_wide.rename(columns=variable_to_short_name, errors="raise") # Sanity check. 
- number_of_infinities = np.isinf(wide_table.select_dtypes(include=np.number).fillna(0)).values.sum() + number_of_infinities = np.isinf(tb_wide.select_dtypes(include=np.number).fillna(0)).values.sum() assert number_of_infinities == 0, f"There are {number_of_infinities} infinity values in the wide table." - return wide_table + return tb_wide -def _variable_name_map(data: pd.DataFrame, column: str) -> Dict[str, str]: +def _variable_name_map(data: Table, column: str) -> Dict[str, str]: """Extract map {variable name -> column} from dataframe and make sure it is unique (i.e. ensure that one variable does not map to two distinct values).""" pivot = data.dropna(subset=[column]).groupby(["variable_name"], observed=True)[column].apply(set) @@ -1776,8 +1772,8 @@ def _variable_name_map(data: pd.DataFrame, column: str) -> Dict[str, str]: return pivot.map(lambda x: list(x)[0]).to_dict() # type: ignore -def parse_amendments_table(amendments: catalog.Table, dataset_short_name: str): - amendments = pd.DataFrame(amendments).reset_index() +def parse_amendments_table(amendments: Table, dataset_short_name: str): + amendments = Table(amendments).reset_index() # Create a dictionary mapping spurious values to amended values. amendments = ( amendments[amendments["dataset"] == dataset_short_name] @@ -1807,19 +1803,18 @@ def run(dest_dir: str) -> None: # Load latest meadow dataset and keep its metadata. ds_meadow = paths.load_dataset(dataset_short_name) # Load main table from dataset. - tb_meadow = ds_meadow[dataset_short_name] - data = pd.DataFrame(tb_meadow).reset_index() + tb = ds_meadow[dataset_short_name].reset_index() # Load dataset of FAOSTAT metadata. metadata = paths.load_dataset(f"{NAMESPACE}_metadata") # Load dataset, items, element-units, countries metadata, and value amendments. 
- dataset_metadata = pd.DataFrame(metadata["datasets"]).loc[dataset_short_name].to_dict() - items_metadata = pd.DataFrame(metadata["items"]).reset_index() + dataset_metadata = metadata["datasets"].loc[dataset_short_name].to_dict() + items_metadata = metadata["items"].reset_index() items_metadata = items_metadata[items_metadata["dataset"] == dataset_short_name].reset_index(drop=True) - elements_metadata = pd.DataFrame(metadata["elements"]).reset_index() + elements_metadata = metadata["elements"].reset_index() elements_metadata = elements_metadata[elements_metadata["dataset"] == dataset_short_name].reset_index(drop=True) - countries_metadata = pd.DataFrame(metadata["countries"]).reset_index() + countries_metadata = metadata["countries"].reset_index() amendments = parse_amendments_table(amendments=metadata["amendments"], dataset_short_name=dataset_short_name) # Load population dataset. @@ -1835,12 +1830,12 @@ def run(dest_dir: str) -> None: # Process data. # # Harmonize items and elements, and clean data. - data = harmonize_items(df=data, dataset_short_name=dataset_short_name) - data = harmonize_elements(df=data, dataset_short_name=dataset_short_name) + tb = harmonize_items(tb=tb, dataset_short_name=dataset_short_name) + tb = harmonize_elements(tb=tb, dataset_short_name=dataset_short_name) # Prepare data. - data = clean_data( - data=data, + tb = clean_data( + tb=tb, ds_population=ds_population, items_metadata=items_metadata, elements_metadata=elements_metadata, @@ -1849,8 +1844,8 @@ def run(dest_dir: str) -> None: ) # Add data for aggregate regions. - data = add_regions( - data=data, + tb = add_regions( + tb=tb, ds_regions=ds_regions, ds_population=ds_population, ds_income_groups=ds_income_groups, @@ -1858,30 +1853,28 @@ def run(dest_dir: str) -> None: ) # Add per-capita variables. 
- data = add_per_capita_variables(data=data, elements_metadata=elements_metadata) + tb = add_per_capita_variables(tb=tb, elements_metadata=elements_metadata) # Handle detected anomalies in the data. - data, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, data=data) + tb, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, data=tb) # Create a long table (with item code and element code as part of the index). - data_table_long = prepare_long_table(data=data) + tb_long = prepare_long_table(tb=tb) # Create a wide table (with only country and year as index). - data_table_wide = prepare_wide_table(data=data) + tb_wide = prepare_wide_table(tb=tb) # # Save outputs. # # Update tables metadata. - data_table_long.metadata.short_name = dataset_short_name - data_table_long.metadata.title = dataset_metadata["owid_dataset_title"] - data_table_wide.metadata.short_name = f"{dataset_short_name}_flat" - data_table_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE + tb_long.metadata.short_name = dataset_short_name + tb_long.metadata.title = dataset_metadata["owid_dataset_title"] + tb_wide.metadata.short_name = f"{dataset_short_name}_flat" + tb_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE # Initialise new garden dataset. - ds_garden = create_dataset( - dest_dir=dest_dir, tables=[data_table_long, data_table_wide], default_metadata=ds_meadow.metadata - ) + ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_long, tb_wide], default_metadata=ds_meadow.metadata) # Update dataset metadata. # Add description of anomalies (if any) to the dataset description. 
ds_garden.metadata.description = dataset_metadata["owid_dataset_description"] + anomaly_descriptions From 4aad908f12cbb2d4a0fc3bd1d499c55dfd45bfee Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 19 Mar 2024 12:29:19 +0100 Subject: [PATCH 25/54] Use tables instead of dataframes in fbsc and qcl steps --- .../garden/faostat/2024-03-14/faostat_fbsc.py | 76 +++++++++---------- .../2024-03-14/faostat_food_explorer.py | 41 +++++----- .../garden/faostat/2024-03-14/faostat_qcl.py | 57 +++++++------- 3 files changed, 85 insertions(+), 89 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py index e82fff42ee8..907f38311a7 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py @@ -16,10 +16,8 @@ """ from pathlib import Path -from typing import cast -import pandas as pd -from owid import catalog +from owid.catalog import Dataset, Table from owid.datautils import dataframes from shared import ( ADDED_TITLE_TO_WIDE_TABLE, @@ -46,80 +44,80 @@ def combine_fbsh_and_fbs_datasets( - fbsh_dataset: catalog.Dataset, - fbs_dataset: catalog.Dataset, -) -> pd.DataFrame: + ds_fbsh: Dataset, + ds_fbs: Dataset, +) -> Table: """Combine `faostat_fbsh` and `faostat_fbs` meadow datasets. Parameters ---------- - fbsh_dataset : catalog.Dataset + ds_fbsh : Dataset Meadow `faostat_fbsh` dataset. - fbs_dataset : catalog.Dataset + ds_fbs : Dataset Meadow `faostat_fbs` dataset. Returns ------- - fbsc : pd.DataFrame + tb_fbsc : Table Combination of the tables of the two input datasets (as a dataframe, not a dataset). """ # Sanity checks. error = "Description of fbs and fbsh datasets is different." - assert fbsh_dataset.metadata.description == fbs_dataset.metadata.description, error + assert ds_fbsh.metadata.description == ds_fbs.metadata.description, error error = "Licenses of fbsh and fbs are different." 
- assert fbsh_dataset.metadata.licenses == fbs_dataset.metadata.licenses, error + assert ds_fbsh.metadata.licenses == ds_fbs.metadata.licenses, error # Load dataframes for fbs and fbsh datasets. - fbsh = pd.DataFrame(fbsh_dataset["faostat_fbsh"]).reset_index() - fbs = pd.DataFrame(fbs_dataset["faostat_fbs"]).reset_index() + tb_fbsh = ds_fbsh["faostat_fbsh"].reset_index() + tb_fbs = ds_fbs["faostat_fbs"].reset_index() # Harmonize items and elements in both datasets. - fbsh = harmonize_items(tb=fbsh, dataset_short_name="faostat_fbsh") - fbsh = harmonize_elements(tb=fbsh, dataset_short_name="faostat_fbsh") - fbs = harmonize_items(tb=fbs, dataset_short_name="faostat_fbs") - fbs = harmonize_elements(tb=fbs, dataset_short_name="faostat_fbs") + tb_fbsh = harmonize_items(tb=tb_fbsh, dataset_short_name="faostat_fbsh") + tb_fbsh = harmonize_elements(tb=tb_fbsh, dataset_short_name="faostat_fbsh") + tb_fbs = harmonize_items(tb=tb_fbs, dataset_short_name="faostat_fbs") + tb_fbs = harmonize_elements(tb=tb_fbs, dataset_short_name="faostat_fbs") # Ensure there is no overlap in data between the two datasets, and that there is no gap between them. - assert fbs["year"].min() == FBS_FIRST_YEAR, f"First year of fbs dataset is not {FBS_FIRST_YEAR}" - if fbsh["year"].max() >= fbs["year"].min(): + assert tb_fbs["year"].min() == FBS_FIRST_YEAR, f"First year of fbs dataset is not {FBS_FIRST_YEAR}" + if tb_fbsh["year"].max() >= tb_fbs["year"].min(): # There is overlapping data between fbsh and fbs datasets. Prioritising fbs over fbsh." - fbsh = fbsh.loc[fbsh["year"] < fbs["year"].min()].reset_index(drop=True) - if (fbsh["year"].max() + 1) < fbs["year"].min(): + tb_fbsh = tb_fbsh.loc[tb_fbsh["year"] < tb_fbs["year"].min()].reset_index(drop=True) + if (tb_fbsh["year"].max() + 1) < tb_fbs["year"].min(): log.warning("Data is missing for one or more years between fbsh and fbs datasets.") # Sanity checks. # Ensure the elements that are in fbsh but not in fbs are covered by ITEMS_MAPPING. 
error = "Mismatch between items in fbsh and fbs. Redefine shared.ITEM_AMENDMENTS." - assert set(fbsh["item"]) == set(fbs["item"]), error - assert set(fbsh["item_code"]) == set(fbs["item_code"]), error + assert set(tb_fbsh["item"]) == set(tb_fbs["item"]), error + assert set(tb_fbsh["item_code"]) == set(tb_fbs["item_code"]), error # Some elements are found in fbs but not in fbsh. This is understandable, since fbs is # more recent and may have additional elements. However, ensure that there are no # elements in fbsh that are not in fbs. error = "There are elements in fbsh that are not in fbs." - assert set(fbsh["element"]) < set(fbs["element"]), error - assert set(fbsh["element_code"]) - set(fbs["element_code"]) == ELEMENTS_IN_FBSH_MISSING_IN_FBS, error + assert set(tb_fbsh["element"]) < set(tb_fbs["element"]), error + assert set(tb_fbsh["element_code"]) - set(tb_fbs["element_code"]) == ELEMENTS_IN_FBSH_MISSING_IN_FBS, error # Remove elements from fbsh that are not in fbs (since they have different meanings and hence should not be # combined as if they were the same element). - fbsh = fbsh[~fbsh["element_code"].isin(ELEMENTS_IN_FBSH_MISSING_IN_FBS)].reset_index(drop=True) + tb_fbsh = tb_fbsh[~tb_fbsh["element_code"].isin(ELEMENTS_IN_FBSH_MISSING_IN_FBS)].reset_index(drop=True) # Concatenate old and new dataframes using function that keeps categoricals. - fbsc = dataframes.concatenate([fbsh, fbs]).sort_values(["area", "year"]).reset_index(drop=True) + tb_fbsc = dataframes.concatenate([tb_fbsh, tb_fbs]).sort_values(["area", "year"]).reset_index(drop=True) # Ensure that each element has only one unit and one description. error = "Some elements in the combined dataset have more than one unit. Manually check them and consider adding them to ELEMENT_AMENDMENTS." 
- units_per_element = fbsc.groupby("element", as_index=False, observed=True)["unit"].nunique() + units_per_element = tb_fbsc.groupby("element", as_index=False, observed=True)["unit"].nunique() elements_with_ambiguous_units = units_per_element[units_per_element["unit"] > 1]["element"].tolist() - fbsc[fbsc["element"].isin(elements_with_ambiguous_units)].drop_duplicates(subset=["element", "unit"]) + tb_fbsc[tb_fbsc["element"].isin(elements_with_ambiguous_units)].drop_duplicates(subset=["element", "unit"]) assert len(elements_with_ambiguous_units) == 0, error - return cast(pd.DataFrame, fbsc) + return tb_fbsc -def _assert_df_size(df: pd.DataFrame, size_mb: float) -> None: +def _assert_tb_size(tb: Table, size_mb: float) -> None: """Check that dataframe is smaller than given size to prevent OOM errors.""" - real_size_mb = df.memory_usage(deep=True).sum() / 1e6 + real_size_mb = tb.memory_usage(deep=True).sum() / 1e6 assert real_size_mb <= size_mb, f"DataFrame size is too big: {real_size_mb} MB > {size_mb} MB" @@ -145,12 +143,12 @@ def run(dest_dir: str) -> None: metadata = paths.load_dataset(f"{NAMESPACE}_metadata") # Load dataset, items, element-units, and countries metadata. 
- dataset_metadata = pd.DataFrame(metadata["datasets"]).loc[dataset_short_name].to_dict() - items_metadata = pd.DataFrame(metadata["items"]).reset_index() + dataset_metadata = metadata["datasets"].loc[dataset_short_name].to_dict() + items_metadata = metadata["items"].reset_index() items_metadata = items_metadata[items_metadata["dataset"] == dataset_short_name].reset_index(drop=True) - elements_metadata = pd.DataFrame(metadata["elements"]).reset_index() + elements_metadata = metadata["elements"].reset_index() elements_metadata = elements_metadata[elements_metadata["dataset"] == dataset_short_name].reset_index(drop=True) - countries_metadata = pd.DataFrame(metadata["countries"]).reset_index() + countries_metadata = metadata["countries"].reset_index() amendments = parse_amendments_table(amendments=metadata["amendments"], dataset_short_name=dataset_short_name) # Load regions dataset. @@ -173,7 +171,7 @@ def run(dest_dir: str) -> None: ) data = combine_fbsh_and_fbs_datasets(fbsh_dataset, fbs_dataset) - _assert_df_size(data, 2000) + _assert_tb_size(data, 2000) # Prepare data. data = clean_data( @@ -204,13 +202,13 @@ def run(dest_dir: str) -> None: for col in data.columns: assert data[col].dtype != object, f"Column {col} should not have object type" - _assert_df_size(data, 2000) + _assert_tb_size(data, 2000) # Create a long table (with item code and element code as part of the index). log.info("faostat_fbsc.prepare_long_table", shape=data.shape) data_table_long = prepare_long_table(tb=data) - _assert_df_size(data_table_long, 2000) + _assert_tb_size(data_table_long, 2000) # Create a wide table (with only country and year as index). 
log.info("faostat_fbsc.prepare_wide_table", shape=data.shape) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py index dfbf6f9ccbe..9e4e3b9683f 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py @@ -9,10 +9,9 @@ """ from pathlib import Path -from typing import cast import pandas as pd -from owid import catalog +from owid.catalog import Table from owid.datautils import dataframes from shared import ( CURRENT_DIR, @@ -350,20 +349,20 @@ ] -def combine_qcl_and_fbsc(qcl_table: catalog.Table, fbsc_table: catalog.Table) -> pd.DataFrame: +def combine_qcl_and_fbsc(tb_qcl: Table, tb_fbsc: Table) -> Table: """Combine garden `faostat_qcl` and `faostat_fbsc` datasets. Parameters ---------- - qcl_table : catalog.Table + tb_qcl : Table Main table (in long format) of the `faostat_qcl` dataset. - fbsc_table : catalog.Table + tb_fbsc : Table Main table (in long format) of the `faostat_fbsc` dataset. Returns ------- - combined : pd.DataFrame - Combined data (as a dataframe, not a table). + combined : Table + Combined data. """ columns = [ @@ -378,14 +377,14 @@ def combine_qcl_and_fbsc(qcl_table: catalog.Table, fbsc_table: catalog.Table) -> "value", "population_with_data", ] - qcl = pd.DataFrame(qcl_table).reset_index()[columns] + qcl = tb_qcl.reset_index()[columns] # Select relevant element codes. qcl = qcl[qcl["element_code"].isin(ELEMENT_CODES_QCL)].reset_index(drop=True) qcl["value"] = qcl["value"].astype(float) qcl["element"] = [element for element in qcl["element"]] qcl["unit"] = [unit for unit in qcl["unit"]] qcl["item"] = [item for item in qcl["item"]] - fbsc = pd.DataFrame(fbsc_table).reset_index()[columns] + fbsc = tb_fbsc.reset_index()[columns] # Select relevant element codes. 
fbsc = fbsc[fbsc["element_code"].isin(ELEMENT_CODES_FBSC)].reset_index(drop=True) fbsc["value"] = fbsc["value"].astype(float) @@ -415,21 +414,21 @@ def combine_qcl_and_fbsc(qcl_table: catalog.Table, fbsc_table: catalog.Table) -> error = "There are unexpected duplicate rows. Rename items in custom_items.csv to avoid clashes." assert combined[combined.duplicated(subset=["product", "country", "year", "element", "unit"])].empty, error - return cast(pd.DataFrame, combined) + return combined -def get_fao_population(combined: pd.DataFrame) -> pd.DataFrame: +def get_fao_population(combined: Table) -> Table: """Extract the FAO population data from data (in long format). Parameters ---------- - combined : pd.DataFrame + combined : Table Combination of `faostat_qcl` and `faostat_fbsc` data (although this function could also be applied to just `faostat_fbsc` data, since `faostat_qcl` does not contain FAO population data). Returns ------- - fao_population : pd.DataFrame + fao_population : Table Population (by country and year) according to FAO, extracted from the `faostat_fbsc` dataset. """ @@ -443,7 +442,7 @@ def get_fao_population(combined: pd.DataFrame) -> pd.DataFrame: assert list(fao_population["unit"].unique()) == [FAO_POPULATION_UNIT_NAME], error fao_population["value"] *= 1000 - # Drop missing values and prepare output dataframe. + # Drop missing values and prepare output table. fao_population = ( fao_population[["country", "year", "value"]].dropna(how="any").rename(columns={"value": "fao_population"}) ) @@ -451,18 +450,18 @@ def get_fao_population(combined: pd.DataFrame) -> pd.DataFrame: return fao_population -def process_combined_data(combined: pd.DataFrame) -> pd.DataFrame: +def process_combined_data(combined: Table) -> Table: """Process combined data (combination of `faostat_qcl` and `faostat_fbsc` data) to have the content and format required by the food explorer. 
Parameters ---------- - combined : pd.DataFrame + combined : Table Combination of `faostat_qcl` and `faostat_fbsc` data. Returns ------- - data_wide : pd.DataFrame + data_wide : Table Processed data (in wide format). """ @@ -526,20 +525,20 @@ def run(dest_dir: str) -> None: fbsc_dataset = paths.load_dataset(f"{NAMESPACE}_fbsc") # Get main long tables from qcl and fbsc datasets. - qcl_table = qcl_dataset[f"{NAMESPACE}_qcl"] - fbsc_table = fbsc_dataset[f"{NAMESPACE}_fbsc"] + tb_qcl = qcl_dataset[f"{NAMESPACE}_qcl"] + tb_fbsc = fbsc_dataset[f"{NAMESPACE}_fbsc"] # # Process data. # # Combine qcl and fbsc data. - data = combine_qcl_and_fbsc(qcl_table=qcl_table, fbsc_table=fbsc_table) + data = combine_qcl_and_fbsc(tb_qcl=tb_qcl, tb_fbsc=tb_fbsc) # Prepare data in the format required by the food explorer. data = process_combined_data(combined=data) # Create table of products. - table = catalog.Table(data, short_name=dataset_short_name) + table = Table(data, short_name=dataset_short_name) # # Save outputs. diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py index 537c4bbba97..05f42206644 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py @@ -3,7 +3,8 @@ from pathlib import Path import numpy as np -import pandas as pd +import owid.catalog.processing as pr +from owid.catalog import Table from owid.datautils import dataframes from shared import ( ADDED_TITLE_TO_WIDE_TABLE, @@ -46,7 +47,7 @@ # List of element codes for "Producing or slaughtered animals" (they have different items assigned). SLAUGHTERED_ANIMALS_ELEMENT_CODES = ["005320", "005321"] -# For the resulting dataframe, we arbitrarily assign the first of those codes. +# For the resulting table, we arbitrarily assign the first of those codes. SLAUGHTERED_ANIMALS_ELEMENT_CODE = SLAUGHTERED_ANIMALS_ELEMENT_CODES[0] # Item code for 'Meat, total'. 
TOTAL_MEAT_ITEM_CODE = "00001765" @@ -65,7 +66,7 @@ ) -def fill_slaughtered_poultry_with_slaughtered_chicken(data: pd.DataFrame) -> pd.DataFrame: +def fill_slaughtered_poultry_with_slaughtered_chicken(data: Table) -> Table: """Fill missing data on slaughtered poultry with slaughtered chicken. Most of poultry meat comes from chicken. However, sometimes chicken is informed, but the rest of poultry isn't, @@ -98,8 +99,7 @@ def fill_slaughtered_poultry_with_slaughtered_chicken(data: pd.DataFrame) -> pd. ][["country", "year", "value"]] # Combine poultry and chicken data. - compared = pd.merge( - chickens_slaughtered, + compared = chickens_slaughtered.merge( poultry_slaughtered, on=["country", "year"], how="outer", @@ -113,7 +113,7 @@ def fill_slaughtered_poultry_with_slaughtered_chicken(data: pd.DataFrame) -> pd. error = "There are rows where there is more slaughtered poultry than slaughtered chicken." assert compared[compared["value_poultry"] < compared["value_chicken"]].empty, error - # Prepare a replacement dataframe for missing data on slaughtered poultry. + # Prepare a replacement table for missing data on slaughtered poultry. poultry_slaughtered_missing_data = ( compared[compared["_merge"] == "left_only"] .assign( @@ -133,13 +133,13 @@ def fill_slaughtered_poultry_with_slaughtered_chicken(data: pd.DataFrame) -> pd. f"Filling {len(poultry_slaughtered_missing_data)} rows of missing data for slaughtered poultry with " "slaughtered chicken." ) - # Add chicken data to the full dataframe. - data = pd.concat([data, poultry_slaughtered_missing_data], ignore_index=True) + # Add chicken data to the full table. + data = pr.concat([data, poultry_slaughtered_missing_data], ignore_index=True) return data -def add_slaughtered_animals_to_meat_total(data: pd.DataFrame) -> pd.DataFrame: +def add_slaughtered_animals_to_meat_total(data: Table) -> Table: """Add number of slaughtered animals to meat total. There is no FAOSTAT data on slaughtered animals for total meat. 
We construct this data by aggregating that element @@ -150,12 +150,12 @@ def add_slaughtered_animals_to_meat_total(data: pd.DataFrame) -> pd.DataFrame: Parameters ---------- - data : pd.DataFrame + data : Table Processed data where meat total does not have number of slaughtered animals. Returns ------- - combined_data : pd.DataFrame + combined_data : Table Data after adding the new variable. """ @@ -209,7 +209,7 @@ def add_slaughtered_animals_to_meat_total(data: pd.DataFrame) -> pd.DataFrame: .reset_index(drop=True) ) - # Create a dataframe with the total number of animals used for meat. + # Create a table with the total number of animals used for meat. animals = dataframes.groupby_agg( data_to_aggregate, groupby_columns=[ @@ -285,7 +285,7 @@ def add_slaughtered_animals_to_meat_total(data: pd.DataFrame) -> pd.DataFrame: ) # Add a column to inform of all those rows for which we don't have poultry data. - compared = pd.merge(animals, country_years_with_poultry_data, how="outer", indicator=True) + compared = animals.merge(country_years_with_poultry_data, how="outer", indicator=True) assert compared[compared["_merge"] == "right_only"].empty, "Expected 'left_only' or 'both', not 'right_only'." @@ -299,9 +299,9 @@ def add_slaughtered_animals_to_meat_total(data: pd.DataFrame) -> pd.DataFrame: # Check that we are not missing any column. assert set(data.columns) == set(animals_corrected.columns) - # Add animals data to the original dataframe. + # Add animals data to the original table. 
combined_data = ( - pd.concat([data, animals_corrected], ignore_index=True) + pr.concat([data, animals_corrected], ignore_index=True) .reset_index(drop=True) .astype( { @@ -323,7 +323,7 @@ def add_slaughtered_animals_to_meat_total(data: pd.DataFrame) -> pd.DataFrame: return combined_data -def add_yield_to_aggregate_regions(data: pd.DataFrame) -> pd.DataFrame: +def add_yield_to_aggregate_regions(data: Table) -> Table: """Add yield (production / area harvested) to data for aggregate regions (i.e. continents and income groups). This data is not included in aggregate regions because it cannot be aggregated by simply summing the contribution of @@ -339,12 +339,12 @@ def add_yield_to_aggregate_regions(data: pd.DataFrame) -> pd.DataFrame: Parameters ---------- - data : pd.DataFrame + data : Table Data that does not contain yield for aggregate regions. Returns ------- - combined_data : pd.DataFrame + combined_data : Table Data after adding yield. """ @@ -369,13 +369,13 @@ def add_yield_to_aggregate_regions(data: pd.DataFrame) -> pd.DataFrame: ].drop_duplicates() assert len(additional_fields) == 1 - # Create a dataframe of production of regions. + # Create a table of production of regions. data_production = data[(data["country"].isin(REGIONS_TO_ADD)) & (data["element_code"] == production_element_code)] - # Create a dataframe of area of regions. + # Create a table of area of regions. data_area = data[(data["country"].isin(REGIONS_TO_ADD)) & (data["element_code"] == area_element_code)] - # Merge the two dataframes and create the new yield variable. + # Merge the two tables and create the new yield variable. 
merge_cols = [ "area_code", "year", @@ -386,8 +386,7 @@ def add_yield_to_aggregate_regions(data: pd.DataFrame) -> pd.DataFrame: "item_description", "country", ] - combined = pd.merge( - data_production, + combined = data_production.merge( data_area[merge_cols + ["flag", "value"]], on=merge_cols, how="inner", @@ -416,7 +415,7 @@ def add_yield_to_aggregate_regions(data: pd.DataFrame) -> pd.DataFrame: combined[field] = additional_fields[field].item() assert set(data.columns) == set(combined.columns) combined_data = ( - pd.concat([data, combined], ignore_index=True) + pr.concat([data, combined], ignore_index=True) .reset_index(drop=True) .astype( { @@ -452,18 +451,18 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset(dataset_short_name) # Load main table from dataset. tb_meadow = ds_meadow[dataset_short_name] - data = pd.DataFrame(tb_meadow).reset_index() + data = tb_meadow.reset_index() # Load dataset of FAOSTAT metadata. metadata = paths.load_dataset(f"{NAMESPACE}_metadata") # Load dataset, items, element-units, and countries metadata. - dataset_metadata = pd.DataFrame(metadata["datasets"]).loc[dataset_short_name].to_dict() - items_metadata = pd.DataFrame(metadata["items"]).reset_index() + dataset_metadata = metadata["datasets"].loc[dataset_short_name].to_dict() + items_metadata = metadata["items"].reset_index() items_metadata = items_metadata[items_metadata["dataset"] == dataset_short_name].reset_index(drop=True) - elements_metadata = pd.DataFrame(metadata["elements"]).reset_index() + elements_metadata = metadata["elements"].reset_index() elements_metadata = elements_metadata[elements_metadata["dataset"] == dataset_short_name].reset_index(drop=True) - countries_metadata = pd.DataFrame(metadata["countries"]).reset_index() + countries_metadata = metadata["countries"].reset_index() amendments = parse_amendments_table(amendments=metadata["amendments"], dataset_short_name=dataset_short_name) # Load regions dataset. 
From 38d5a4a5ded23c2f3c157521e86b8a1ec5935531 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 19 Mar 2024 12:51:53 +0100 Subject: [PATCH 26/54] Use tables instead of dataframes in garden metadata step --- .../faostat/2024-03-14/faostat_metadata.py | 484 +++++++++--------- .../data/garden/faostat/2024-03-14/shared.py | 3 +- 2 files changed, 238 insertions(+), 249 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py index 59f415471c8..57c9027e128 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py @@ -40,10 +40,10 @@ import json import sys from copy import deepcopy -from typing import Dict, List, Tuple, cast +from typing import Dict, List, Tuple -import pandas as pd -from owid import catalog +import owid.catalog.processing as pr +from owid.catalog import Dataset, Table from owid.datautils import dataframes, io from shared import ( CURRENT_DIR, @@ -68,23 +68,23 @@ N_ISSUES_ON_ITEMS_FOR_WARNING = 1 -def create_dataset_descriptions_dataframe_for_domain(table: catalog.Table, dataset_short_name: str) -> pd.DataFrame: - """Create a single row dataframe with the dataset name, title and description, for a given domain. +def create_dataset_descriptions_table_for_domain(table: Table, dataset_short_name: str) -> Table: + """Create a single row table with the dataset name, title and description, for a given domain. Parameters ---------- - table : catalog.Table + table : Table Latest table for considered domain. dataset_short_name : str Dataset short name (e.g. 'faostat_qcl'). Returns ------- - dataset_descriptions_df : pd.DataFrame - Dataframe of name, title and description of a domain. + tb_dataset_descriptions : Table + Table of name, title and description of a domain. 
""" - dataset_descriptions_df = pd.DataFrame( + tb_dataset_descriptions = Table( { "dataset": [dataset_short_name], "fao_dataset_title": [table.metadata.dataset.title], @@ -92,29 +92,27 @@ def create_dataset_descriptions_dataframe_for_domain(table: catalog.Table, datas } ) - return dataset_descriptions_df + return tb_dataset_descriptions -def clean_global_dataset_descriptions_dataframe( - datasets_df: pd.DataFrame, custom_datasets: pd.DataFrame -) -> pd.DataFrame: - """Apply sanity checks to the dataframe gathered from the data of each individual datasets, and add custom dataset +def clean_global_dataset_descriptions_table(tb_datasets: Table, tb_custom_datasets: Table) -> Table: + """Apply sanity checks to the table gathered from the data of each individual datasets, and add custom dataset titles and descriptions. Parameters ---------- - datasets_df : pd.DataFrame - Dataframe of descriptions gathered from the data of each individual dataset. - custom_datasets : pd.DataFrame + tb_datasets : Table + Table of descriptions gathered from the data of each individual dataset. + tb_custom_datasets : Table Data from the custom_datasets.csv file. Returns ------- - datasets_df : pd.Dataframe - Clean dataframe of dataset titles and descriptions (customized and original FAO ones). + tb_datasets : Table + Clean table of dataset titles and descriptions (customized and original FAO ones). """ - datasets_df = datasets_df.copy() + tb_datasets = tb_datasets.copy() # Check that the dataset descriptions of fbsh and fbs are identical. error = ( @@ -122,27 +120,26 @@ def clean_global_dataset_descriptions_dataframe( "This may happen in the future: Simply check that nothing significant has changed and remove this assertion." 
) assert ( - datasets_df[datasets_df["dataset"] == "faostat_fbsh"]["fao_dataset_description"].item() - == datasets_df[datasets_df["dataset"] == "faostat_fbs"]["fao_dataset_description"].item() + tb_datasets[tb_datasets["dataset"] == "faostat_fbsh"]["fao_dataset_description"].item() + == tb_datasets[tb_datasets["dataset"] == "faostat_fbs"]["fao_dataset_description"].item() ), error # Drop row for fbsh, and rename "fbs" to "fbsc" (since this will be the name for the combined dataset). - datasets_df = datasets_df[datasets_df["dataset"] != "faostat_fbsh"].reset_index(drop=True) - datasets_df.loc[datasets_df["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" + tb_datasets = tb_datasets[tb_datasets["dataset"] != "faostat_fbsh"].reset_index(drop=True) + tb_datasets.loc[tb_datasets["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" # Add custom dataset titles. - datasets_df = pd.merge( - datasets_df, - custom_datasets, + tb_datasets = tb_datasets.merge( + tb_custom_datasets, on="dataset", how="left", suffixes=("_new", "_old"), ) - changed_titles = datasets_df[ - datasets_df["fao_dataset_title_old"].fillna("") != datasets_df["fao_dataset_title_new"].fillna("") + changed_titles = tb_datasets[ + tb_datasets["fao_dataset_title_old"].fillna("") != tb_datasets["fao_dataset_title_new"].fillna("") ] - changed_descriptions = datasets_df[ - datasets_df["fao_dataset_description_old"].fillna("") != datasets_df["fao_dataset_description_new"].fillna("") + changed_descriptions = tb_datasets[ + tb_datasets["fao_dataset_description_old"].fillna("") != tb_datasets["fao_dataset_description_new"].fillna("") ] if len(changed_titles) > 0: @@ -151,29 +148,29 @@ def clean_global_dataset_descriptions_dataframe( log.warning( f"{len(changed_descriptions)} domains have changed descriptions. " f"Consider updating custom_datasets.csv." 
) - datasets_df = datasets_df.drop(columns=["fao_dataset_title_old", "fao_dataset_description_old"]).rename( + tb_datasets = tb_datasets.drop(columns=["fao_dataset_title_old", "fao_dataset_description_old"]).rename( columns={ "fao_dataset_title_new": "fao_dataset_title", "fao_dataset_description_new": "fao_dataset_description", } ) - datasets_df["owid_dataset_title"] = datasets_df["owid_dataset_title"].fillna(datasets_df["fao_dataset_title"]) + tb_datasets["owid_dataset_title"] = tb_datasets["owid_dataset_title"].fillna(tb_datasets["fao_dataset_title"]) error = "Custom titles for different datasets are equal. Edit custom_datasets.csv file." - assert len(set(datasets_df["dataset"])) == len(set(datasets_df["owid_dataset_title"])), error + assert len(set(tb_datasets["dataset"])) == len(set(tb_datasets["owid_dataset_title"])), error # The final description will be the owid description (if there is any) followed by the original FAO description # (if there is any). - datasets_df["owid_dataset_description"] = [ + tb_datasets["owid_dataset_description"] = [ prepare_dataset_description( fao_description=dataset["fao_dataset_description"], owid_description=dataset["owid_dataset_description"], ) - for _, dataset in datasets_df.fillna("").iterrows() + for _, dataset in tb_datasets.fillna("").iterrows() ] # Reorder columns. 
- datasets_df = datasets_df[ + tb_datasets = tb_datasets[ [ "dataset", "fao_dataset_title", @@ -183,11 +180,11 @@ def clean_global_dataset_descriptions_dataframe( ] ] - return datasets_df + return tb_datasets def check_that_item_and_element_harmonization_does_not_trim_codes( - data: pd.DataFrame, dataset_short_name: str, category: str + data: Table, dataset_short_name: str, category: str ) -> None: # Ensure that the number of digits of all item and element codes is smaller than the limits defined # at the beginning of the garden shared module, by N_CHARACTERS_ITEM_CODE and N_CHARACTERS_ELEMENT_CODE, @@ -209,36 +206,34 @@ def check_that_item_and_element_harmonization_does_not_trim_codes( assert all([len(str(code)) <= n_characters[category] for code in data[f"{category}_code"].unique()]), error -def create_items_dataframe_for_domain( - table: catalog.Table, metadata: catalog.Dataset, dataset_short_name: str -) -> pd.DataFrame: +def create_items_table_for_domain(table: Table, metadata: Dataset, dataset_short_name: str) -> Table: """Apply sanity checks to the items of a table in a dataset, and to the items from the metadata, harmonize all item codes and items, and add item descriptions. Parameters ---------- - table : catalog.Table + table : Table Data for a given domain. - metadata: catalog.Dataset + metadata: Dataset Metadata dataset from meadow. dataset_short_name : str Dataset short name (e.g. 'faostat_qcl'). Returns ------- - items_from_data : pd.Dataframe + items_from_data : Table Item names and descriptions (customized ones and FAO original ones) for a particular domain. """ - df = pd.DataFrame(table).reset_index() + tb = table.reset_index() # Load items from data. items_from_data = ( - df.rename(columns={"item": "fao_item"})[["item_code", "fao_item"]].drop_duplicates().reset_index(drop=True) + tb.rename(columns={"item": "fao_item"})[["item_code", "fao_item"]].drop_duplicates().reset_index(drop=True) ) # Sanity check. 
check_that_item_and_element_harmonization_does_not_trim_codes( - data=df, dataset_short_name=dataset_short_name, category="item" + data=tb, dataset_short_name=dataset_short_name, category="item" ) # Ensure items are well constructed and amend already known issues (defined in shared.ITEM_AMENDMENTS). items_from_data = harmonize_items(tb=items_from_data, dataset_short_name=dataset_short_name, item_col="fao_item") @@ -254,19 +249,19 @@ def create_items_dataframe_for_domain( # This is the case for faostat_qv since last version. return items_from_data - _items_df = ( + _tb_items = ( _metadata[list(items_columns)] .rename(columns=items_columns) .drop_duplicates() .sort_values(list(items_columns.values())) .reset_index(drop=True) ) - _items_df = harmonize_items(tb=_items_df, dataset_short_name=dataset_short_name, item_col="fao_item") - _items_df["fao_item_description"] = _items_df["fao_item_description"].astype("string") + _tb_items = harmonize_items(tb=_tb_items, dataset_short_name=dataset_short_name, item_col="fao_item") + _tb_items["fao_item_description"] = _tb_items["fao_item_description"].astype("string") # Add descriptions (from metadata) to items (from data). items_from_data = ( - pd.merge(items_from_data, _items_df, on=["item_code", "fao_item"], how="left") + items_from_data.merge(_tb_items, on=["item_code", "fao_item"], how="left") .sort_values(["item_code", "fao_item"]) .reset_index(drop=True) ) @@ -282,15 +277,14 @@ def create_items_dataframe_for_domain( # Check that all item codes in data are defined in metadata, and check that the mapping item code -> item in # the data is the same as in the metadata (which often is not the case). 
- compared = pd.merge( - items_from_data[["item_code", "fao_item"]], - _items_df[["item_code", "fao_item"]], + compared = items_from_data[["item_code", "fao_item"]].merge( + _tb_items[["item_code", "fao_item"]], on="item_code", how="left", suffixes=("_in_data", "_in_metadata"), ) different_items = compared[compared["fao_item_in_data"] != compared["fao_item_in_metadata"]] - missing_item_codes = set(items_from_data["item_code"]) - set(_items_df["item_code"]) + missing_item_codes = set(items_from_data["item_code"]) - set(_tb_items["item_code"]) if (len(different_items) + len(missing_item_codes)) > N_ISSUES_ON_ITEMS_FOR_WARNING: log.warning( f"{len(missing_item_codes)} item codes in {dataset_short_name} missing in metadata. " @@ -300,49 +294,51 @@ def create_items_dataframe_for_domain( return items_from_data -def clean_global_items_dataframe(items_df: pd.DataFrame, custom_items: pd.DataFrame) -> pd.DataFrame: - """Apply global sanity checks to items gathered from all datasets, and create a clean global items dataframe. +def clean_global_items_table(tb_items: Table, custom_items: Table) -> Table: + """Apply global sanity checks to items gathered from all datasets, and create a clean global items table. Parameters ---------- - items_df : pd.DataFrame - Items dataframe gathered from all domains. - custom_items : pd.DataFrame + tb_items : Table + Items table gathered from all domains. + custom_items : Table Data from custom_items.csv file. Returns ------- - items_df : pd.DataFrame - Clean global items dataframe. + tb_items : Table + Clean global items table. """ - items_df = items_df.copy() + tb_items = tb_items.copy() # Check that fbs and fbsh have the same contributions, remove one of them, and rename the other to fbsc. 
- check = pd.merge( - items_df[items_df["dataset"] == "faostat_fbsh"].reset_index(drop=True)[["item_code", "fao_item"]], - items_df[items_df["dataset"] == "faostat_fbs"].reset_index(drop=True)[["item_code", "fao_item"]], - how="outer", - on=["item_code"], - suffixes=("_fbsh", "_fbs"), + check = ( + tb_items[tb_items["dataset"] == "faostat_fbsh"] + .reset_index(drop=True)[["item_code", "fao_item"]] + .merge( + tb_items[tb_items["dataset"] == "faostat_fbs"].reset_index(drop=True)[["item_code", "fao_item"]], + how="outer", + on=["item_code"], + suffixes=("_fbsh", "_fbs"), + ) ) assert (check["fao_item_fbsh"] == check["fao_item_fbs"]).all() # Drop all rows for fbsh, and rename "fbs" to "fbsc" (since this will be the name for the combined dataset). - items_df = items_df[items_df["dataset"] != "faostat_fbsh"].reset_index(drop=True) - items_df.loc[items_df["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" + tb_items = tb_items[tb_items["dataset"] != "faostat_fbsh"].reset_index(drop=True) + tb_items.loc[tb_items["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" # Add custom item names. - items_df = pd.merge( - items_df, + tb_items = tb_items.merge( custom_items.rename(columns={"fao_item": "fao_item_check"}), on=["dataset", "item_code"], how="left", suffixes=("_new", "_old"), ) - changed_descriptions = items_df[ - (items_df["fao_item_description_old"] != items_df["fao_item_description_new"]) - & (items_df["fao_item_description_old"].notnull()) + changed_descriptions = tb_items[ + (tb_items["fao_item_description_old"] != tb_items["fao_item_description_new"]) + & (tb_items["fao_item_description_old"].notnull()) ] if len(changed_descriptions) > 0: log.warning( @@ -350,32 +346,32 @@ def clean_global_items_dataframe(items_df: pd.DataFrame, custom_items: pd.DataFr f"Consider updating custom_items.csv." 
) - items_df = items_df.drop(columns="fao_item_description_old").rename( + tb_items = tb_items.drop(columns="fao_item_description_old").rename( columns={"fao_item_description_new": "fao_item_description"} ) # Check that item names have not changed. # NOTE: This condition used to raise an error if not fulfilled. Consider making it an assertion. if not ( - items_df[items_df["fao_item_check"].notnull()]["fao_item_check"] - == items_df[items_df["fao_item_check"].notnull()]["fao_item"] + tb_items[tb_items["fao_item_check"].notnull()]["fao_item_check"] + == tb_items[tb_items["fao_item_check"].notnull()]["fao_item"] ).all(): log.warning("Item names may have changed with respect to custom items file. Update custom items file.") - items_df = items_df.drop(columns=["fao_item_check"]) + tb_items = tb_items.drop(columns=["fao_item_check"]) # Assign original FAO name to all owid items that do not have a custom name. - items_df["owid_item"] = items_df["owid_item"].fillna(items_df["fao_item"]) + tb_items["owid_item"] = tb_items["owid_item"].fillna(tb_items["fao_item"]) # Add custom item descriptions, and assign original FAO descriptions to items that do not have a custom description. - items_df["owid_item_description"] = items_df["owid_item_description"].fillna(items_df["fao_item_description"]) + tb_items["owid_item_description"] = tb_items["owid_item_description"].fillna(tb_items["fao_item_description"]) # Check that we have not introduced ambiguities when assigning custom item names. - n_owid_items_per_item_code = items_df.groupby(["dataset", "item_code"])["owid_item"].transform("nunique") + n_owid_items_per_item_code = tb_items.groupby(["dataset", "item_code"])["owid_item"].transform("nunique") error = "Multiple owid items for a given item code in a dataset." 
- assert items_df[n_owid_items_per_item_code > 1].empty, error + assert tb_items[n_owid_items_per_item_code > 1].empty, error - items_df = ( - items_df[ + tb_items = ( + tb_items[ [ "dataset", "item_code", @@ -389,36 +385,34 @@ def clean_global_items_dataframe(items_df: pd.DataFrame, custom_items: pd.DataFr .reset_index(drop=True) ) - return items_df + return tb_items -def create_elements_dataframe_for_domain( - table: catalog.Table, metadata: catalog.Dataset, dataset_short_name: str -) -> pd.DataFrame: +def create_elements_table_for_domain(table: Table, metadata: Dataset, dataset_short_name: str) -> Table: """Apply sanity checks to the elements and units of a table in a dataset, and to the elements and units from the metadata, harmonize all element code, and add descriptions. Parameters ---------- - table : catalog.Table + table : Table Data for a given domain. - metadata: catalog.Dataset + metadata: Dataset Additional metadata dataset from meadow. dataset_short_name : str Dataset short name (e.g. 'faostat_qcl'). Returns ------- - elements_from_data : pd.Dataframe + elements_from_data : Table Element names and descriptions and unit names and descriptions (customized ones and FAO original ones) for a particular domain. """ - df = pd.DataFrame(table).reset_index() + tb = table.reset_index() # Load elements from data. elements_from_data = ( - df.rename(columns={"element": "fao_element", "unit": "fao_unit_short_name"})[ + tb.rename(columns={"element": "fao_element", "unit": "fao_unit_short_name"})[ ["element_code", "fao_element", "fao_unit_short_name"] ] .drop_duplicates() @@ -426,7 +420,7 @@ def create_elements_dataframe_for_domain( ) # Sanity check. check_that_item_and_element_harmonization_does_not_trim_codes( - data=df, dataset_short_name=dataset_short_name, category="element" + data=tb, dataset_short_name=dataset_short_name, category="element" ) # Ensure element_code is always a string of a fix number of characters. 
elements_from_data = harmonize_elements( @@ -447,24 +441,24 @@ def create_elements_dataframe_for_domain( # This is the case for faostat_qv since last version. return elements_from_data - _elements_df = ( + _tb_elements = ( _metadata[list(elements_columns)] .rename(columns=elements_columns) .drop_duplicates() .sort_values(list(elements_columns.values())) .reset_index(drop=True) ) - _elements_df = harmonize_elements( - tb=_elements_df, dataset_short_name=dataset_short_name, element_col="fao_element", unit_col=None + _tb_elements = harmonize_elements( + tb=_tb_elements, dataset_short_name=dataset_short_name, element_col="fao_element", unit_col=None ) - _elements_df["fao_element_description"] = _elements_df["fao_element_description"].astype("string") + _tb_elements["fao_element_description"] = _tb_elements["fao_element_description"].astype("string") # Load units metadata. units_columns = { "unit_name": "fao_unit_short_name", "description": "fao_unit", } - _units_df = ( + _tb_units = ( metadata[f"{dataset_short_name}_unit"] .reset_index()[list(units_columns)] .rename(columns=units_columns) @@ -472,13 +466,12 @@ def create_elements_dataframe_for_domain( .sort_values(list(units_columns.values())) .reset_index(drop=True) ) - _units_df["fao_unit"] = _units_df["fao_unit"].astype("string") + _tb_units["fao_unit"] = _tb_units["fao_unit"].astype("string") # Add element descriptions (from metadata). elements_from_data = ( - pd.merge( - elements_from_data, - _elements_df, + elements_from_data.merge( + _tb_elements, on=["element_code", "fao_element"], how="left", ) @@ -490,7 +483,7 @@ def create_elements_dataframe_for_domain( # Add unit descriptions (from metadata). 
elements_from_data = ( - pd.merge(elements_from_data, _units_df, on=["fao_unit_short_name"], how="left") + elements_from_data.merge(_tb_units, on=["fao_unit_short_name"], how="left") .sort_values(["fao_unit_short_name"]) .reset_index(drop=True) ) @@ -499,9 +492,9 @@ def create_elements_dataframe_for_domain( # Sanity checks: # Check that in data, there is only one unit per element code. - n_units_per_element_code = df.groupby("element_code")["unit"].transform("nunique") + n_units_per_element_code = tb.groupby("element_code")["unit"].transform("nunique") error = f"Multiple units for a given element code in dataset {dataset_short_name}." - assert df[n_units_per_element_code > 1].empty, error + assert tb[n_units_per_element_code > 1].empty, error # Check that in data, there is only one element per element code. n_elements_per_element_code = elements_from_data.groupby("element_code")["fao_element"].transform("nunique") @@ -511,37 +504,36 @@ def create_elements_dataframe_for_domain( return elements_from_data -def clean_global_elements_dataframe(elements_df: pd.DataFrame, custom_elements: pd.DataFrame) -> pd.DataFrame: +def clean_global_elements_table(tb_elements: Table, custom_elements: Table) -> Table: """Apply global sanity checks to elements and units gathered from all datasets, and create a clean global elements - and units dataframe. + and units table. Parameters ---------- - elements_df : pd.DataFrame - Elements and units dataframe gathered from all domains. - custom_elements : pd.DataFrame + tb_elements : Table + Elements and units table gathered from all domains. + custom_elements : Table Data from custom_element_and_units.csv file. Returns ------- - elements_df : pd.DataFrame - Clean global elements and units dataframe. + tb_elements : Table + Clean global elements and units table. """ - elements_df = elements_df.copy() + tb_elements = tb_elements.copy() # Check that all elements of fbsh are in fbs (although fbs may contain additional elements). 
assert ( - set(elements_df[elements_df["dataset"] == "faostat_fbsh"]["element_code"]) - - set(elements_df[elements_df["dataset"] == "faostat_fbs"]["element_code"]) + set(tb_elements[tb_elements["dataset"] == "faostat_fbsh"]["element_code"]) + - set(tb_elements[tb_elements["dataset"] == "faostat_fbs"]["element_code"]) == ELEMENTS_IN_FBSH_MISSING_IN_FBS ), "There are new elements in fbsh that are not in fbs. Add them to ELEMENTS_IN_FBSH_MISSING_IN_FBS." # Drop all rows for fbsh, and rename "fbs" to "fbsc" (since this will be the name for the combined dataset). - elements_df = elements_df[elements_df["dataset"] != "faostat_fbsh"].reset_index(drop=True) - elements_df.loc[elements_df["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" + tb_elements = tb_elements[tb_elements["dataset"] != "faostat_fbsh"].reset_index(drop=True) + tb_elements.loc[tb_elements["dataset"] == "faostat_fbs", "dataset"] = "faostat_fbsc" - elements_df = pd.merge( - elements_df, + tb_elements = tb_elements.merge( custom_elements.rename( columns={ "fao_element": "fao_element_check", @@ -553,15 +545,15 @@ def clean_global_elements_dataframe(elements_df: pd.DataFrame, custom_elements: suffixes=("_new", "_old"), ) - changed_units = elements_df[ - (elements_df["fao_unit_new"] != elements_df["fao_unit_old"]) & (elements_df["fao_unit_old"].notnull()) + changed_units = tb_elements[ + (tb_elements["fao_unit_new"] != tb_elements["fao_unit_old"]) & (tb_elements["fao_unit_old"].notnull()) ] if len(changed_units) > 0: log.warning(f"{len(changed_units)} domains have changed units, consider updating custom_elements.csv.") - changed_descriptions = elements_df[ - (elements_df["fao_element_description_new"] != elements_df["fao_element_description_old"]) - & (elements_df["fao_element_description_old"].notnull()) + changed_descriptions = tb_elements[ + (tb_elements["fao_element_description_new"] != tb_elements["fao_element_description_old"]) + & (tb_elements["fao_element_description_old"].notnull()) ] if 
len(changed_descriptions) > 0: log.warning( @@ -569,7 +561,7 @@ def clean_global_elements_dataframe(elements_df: pd.DataFrame, custom_elements: f"Consider updating custom_elements.csv." ) - elements_df = elements_df.drop(columns=["fao_unit_old", "fao_element_description_old"]).rename( + tb_elements = tb_elements.drop(columns=["fao_unit_old", "fao_element_description_old"]).rename( columns={ "fao_element_description_new": "fao_element_description", "fao_unit_new": "fao_unit", @@ -578,59 +570,59 @@ def clean_global_elements_dataframe(elements_df: pd.DataFrame, custom_elements: # Check if element or unit names have changed with respect to the custom elements and units file. # NOTE: This raises an error instead of a warning because further steps will (certainly?) fail. - changed_elements = elements_df[ - elements_df["fao_element_check"].notnull() & (elements_df["fao_element_check"] != elements_df["fao_element"]) + changed_elements = tb_elements[ + tb_elements["fao_element_check"].notnull() & (tb_elements["fao_element_check"] != tb_elements["fao_element"]) ][["fao_element_check", "fao_element"]] if len(changed_elements) > 0: log.error( f"{len(changed_elements)} element names have changed with respect to custom elements file. Use `update_custom_metadata.py` to update custom elements file." ) - elements_df = elements_df.drop(columns=["fao_element_check"]) + tb_elements = tb_elements.drop(columns=["fao_element_check"]) - changed_units = elements_df[ - elements_df["fao_unit_short_name_check"].notnull() - & (elements_df["fao_unit_short_name_check"] != elements_df["fao_unit_short_name"]) + changed_units = tb_elements[ + tb_elements["fao_unit_short_name_check"].notnull() + & (tb_elements["fao_unit_short_name_check"] != tb_elements["fao_unit_short_name"]) ][["fao_unit_short_name_check", "fao_unit_short_name"]] if len(changed_units) > 0: log.error( f"{len(changed_units)} unit names have changed with respect to custom elements file. 
Use `update_custom_metadata.py` to update custom elements file." ) - elements_df = elements_df.drop(columns=["fao_unit_short_name_check"]) + tb_elements = tb_elements.drop(columns=["fao_unit_short_name_check"]) # Assign original FAO names where there is no custom one. - elements_df["owid_element"] = elements_df["owid_element"].fillna(elements_df["fao_element"]) - elements_df["owid_unit"] = elements_df["owid_unit"].fillna(elements_df["fao_unit"]) - elements_df["owid_element_description"] = elements_df["owid_element_description"].fillna( - elements_df["fao_element_description"] + tb_elements["owid_element"] = tb_elements["owid_element"].fillna(tb_elements["fao_element"]) + tb_elements["owid_unit"] = tb_elements["owid_unit"].fillna(tb_elements["fao_unit"]) + tb_elements["owid_element_description"] = tb_elements["owid_element_description"].fillna( + tb_elements["fao_element_description"] ) - elements_df["owid_unit_short_name"] = elements_df["owid_unit_short_name"].fillna(elements_df["fao_unit_short_name"]) + tb_elements["owid_unit_short_name"] = tb_elements["owid_unit_short_name"].fillna(tb_elements["fao_unit_short_name"]) # Assume variables were not per capita, if was_per_capita is not informed, and make boolean. - elements_df["was_per_capita"] = elements_df["was_per_capita"].fillna("0").replace({"0": False, "1": True}) + tb_elements["was_per_capita"] = tb_elements["was_per_capita"].fillna("0").replace({"0": False, "1": True}) # Idem for variables to make per capita. - elements_df["make_per_capita"] = elements_df["make_per_capita"].fillna("0").replace({"0": False, "1": True}) + tb_elements["make_per_capita"] = tb_elements["make_per_capita"].fillna("0").replace({"0": False, "1": True}) # Check that we have not introduced ambiguities when assigning custom element or unit names. 
- n_owid_elements_per_element_code = elements_df.groupby(["dataset", "element_code"])["owid_element"].transform( + n_owid_elements_per_element_code = tb_elements.groupby(["dataset", "element_code"])["owid_element"].transform( "nunique" ) error = "Multiple owid elements for a given element code in a dataset." - assert elements_df[n_owid_elements_per_element_code > 1].empty, error + assert tb_elements[n_owid_elements_per_element_code > 1].empty, error # Check that we have not introduced ambiguities when assigning custom element or unit names. - n_owid_units_per_element_code = elements_df.groupby(["dataset", "element_code"])["owid_unit"].transform("nunique") + n_owid_units_per_element_code = tb_elements.groupby(["dataset", "element_code"])["owid_unit"].transform("nunique") error = "Multiple owid elements for a given element code in a dataset." - assert elements_df[n_owid_units_per_element_code > 1].empty, error + assert tb_elements[n_owid_units_per_element_code > 1].empty, error # NOTE: We assert that there is one element for each element code. But the opposite may not be true: there can be # multiple element codes with the same element. And idem for items. - return elements_df + return tb_elements def check_countries_to_exclude_or_harmonize( - countries_in_data: pd.DataFrame, excluded_countries: List[str], countries_harmonization: Dict[str, str] + countries_in_data: Table, excluded_countries: List[str], countries_harmonization: Dict[str, str] ) -> None: # Check that all excluded countries are in the data. 
unknown_excluded_countries = set(excluded_countries) - set(countries_in_data["fao_country"]) @@ -650,18 +642,18 @@ def check_countries_to_exclude_or_harmonize( assert len(unknown_countries) == 0, error -def clean_global_countries_dataframe( - countries_in_data: pd.DataFrame, +def clean_global_countries_table( + countries_in_data: Table, country_groups: Dict[str, List[str]], countries_harmonization: Dict[str, str], excluded_countries: List[str], -) -> pd.DataFrame: - """Clean dataframe of countries gathered from the data of the individual domains, harmonize country names (and - country names of members of regions), and create a clean global countries dataframe. +) -> Table: + """Clean table of countries gathered from the data of the individual domains, harmonize country names (and + country names of members of regions), and create a clean global countries table. Parameters ---------- - countries_in_data : pd.DataFrame + countries_in_data : Table Countries gathered from the data of all domains. country_groups : dict Countries and their members, gathered from the data. @@ -672,11 +664,11 @@ def clean_global_countries_dataframe( Returns ------- - countries_df : pd.DataFrame - Clean global countries dataframe. + tb_countries : Table + Clean global countries table. """ - countries_df = countries_in_data.copy() + tb_countries = countries_in_data.copy() # Sanity checks. check_countries_to_exclude_or_harmonize( @@ -693,8 +685,8 @@ def clean_global_countries_dataframe( } # Harmonize country names. - countries_df["country"] = dataframes.map_series( - series=countries_df["fao_country"], + tb_countries["country"] = dataframes.map_series( + series=tb_countries["fao_country"], mapping=countries_harmonization, warn_on_missing_mappings=False, warn_on_unused_mappings=False, @@ -702,28 +694,28 @@ def clean_global_countries_dataframe( show_full_warning=True, ) - # Add country members to countries dataframe. 
- countries_df["members"] = dataframes.map_series( - series=countries_df["country"], + # Add country members to countries table. + tb_countries["members"] = dataframes.map_series( + series=tb_countries["country"], mapping=country_groups_harmonized, make_unmapped_values_nan=True, ) # Feather does not support object types, so convert column of lists to column of strings. - countries_df["members"] = [ - json.dumps(members) if isinstance(members, list) else members for members in countries_df["members"] + tb_countries["members"] = [ + json.dumps(members) if isinstance(members, list) else members for members in tb_countries["members"] ] - return countries_df + return tb_countries -def create_table(df: pd.DataFrame, short_name: str, index_cols: List[str]) -> catalog.Table: - """Create a table with optimal format and basic metadata, out of a dataframe. +def create_table(tb: Table, short_name: str, index_cols: List[str]) -> Table: + """Create a table with optimal format and basic metadata, out of a table. Parameters ---------- - df : pd.DataFrame - Input dataframe. + tb : Table + Input table. short_name : str Short name to add in the metadata of the new table. index_cols : list @@ -731,11 +723,11 @@ def create_table(df: pd.DataFrame, short_name: str, index_cols: List[str]) -> ca Returns ------- - table : catalog.Table + table : Table New table. """ - table = catalog.Table(df).copy() + table = Table(tb).copy() # Optimize column dtypes before storing feather file, and ensure codes are categories (instead of ints). 
table = optimize_table_dtypes(table) @@ -746,25 +738,25 @@ def create_table(df: pd.DataFrame, short_name: str, index_cols: List[str]) -> ca table.metadata.short_name = short_name table.metadata.primary_key = index_cols - return cast(catalog.Table, table) + return table def check_that_flag_definitions_in_dataset_agree_with_those_in_flags_ranking( - metadata: catalog.Dataset, + metadata: Dataset, ) -> None: """Check that the definition of flags in the additional metadata for current dataset agree with the ones we have manually written down in our flags ranking (raise error otherwise). Parameters ---------- - metadata : catalog.Dataset + metadata : Dataset Additional metadata dataset (that must contain one table for current dataset). """ for table_name in metadata.table_names: if ("flag" in table_name) and ("flags" in metadata[table_name].columns): - flag_df = metadata[table_name].reset_index() - comparison = pd.merge(FLAGS_RANKING, flag_df, on="flag", how="inner") + tb_flag = metadata[table_name].reset_index() + comparison = FLAGS_RANKING.merge(tb_flag, on="flag", how="inner") error_message = ( f"Flag definitions in file {table_name} are different to those in our flags ranking. " f"Redefine shared.FLAGS_RANKING." @@ -772,23 +764,23 @@ def check_that_flag_definitions_in_dataset_agree_with_those_in_flags_ranking( assert (comparison["description"] == comparison["flags"]).all(), error_message -def check_that_all_flags_in_dataset_are_in_ranking(table: catalog.Table, metadata_for_flags: catalog.Table) -> None: +def check_that_all_flags_in_dataset_are_in_ranking(table: Table, metadata_for_flags: Table) -> None: """Check that all flags found in current dataset are defined in our flags ranking (raise error otherwise). Parameters ---------- - table : pd.DataFrame + table : Table Data table for current dataset. - metadata_for_flags : catalog.Table + metadata_for_flags : Table Flags for current dataset, as defined in dataset of additional metadata. 
""" if not set(table["flag"]) < set(FLAGS_RANKING["flag"]): missing_flags = set(table["flag"]) - set(FLAGS_RANKING["flag"]) - flags_data = pd.DataFrame(metadata_for_flags).reset_index() + flags_data = metadata_for_flags.reset_index() if set(missing_flags) < set(flags_data["flag"]): message = "Missing flags. Copy the following lines to FLAGS_RANKING (and put them in the right order):" - for i, j in pd.DataFrame(metadata_for_flags).loc[list(missing_flags)].iterrows(): + for i, j in metadata_for_flags.loc[list(missing_flags)].iterrows(): message += f"\n{(i, j['flags'])}," log.warning(message) else: @@ -799,9 +791,7 @@ def check_that_all_flags_in_dataset_are_in_ranking(table: catalog.Table, metadat raise AssertionError("Flags in dataset not found in FLAGS_RANKING. Manually add those flags.") -def check_definitions_in_value_amendments( - table: catalog.Table, dataset_short_name: str, value_amendments: pd.DataFrame -) -> None: +def check_definitions_in_value_amendments(table: Table, dataset_short_name: str, value_amendments: Table) -> None: """Check definitions in the value_amendments.csv file. This function will assert that: @@ -813,11 +803,11 @@ def check_definitions_in_value_amendments( Parameters ---------- - table : catalog.Table + table : Table _description_ dataset_short_name : str _description_ - value_amendments : pd.DataFrame + value_amendments : Table _description_ """ # Regular expression used to search for spurious values in the "value" column. 
@@ -848,45 +838,45 @@ def check_definitions_in_value_amendments( def process_metadata( paths: PathFinder, - metadata: catalog.Dataset, - custom_datasets: pd.DataFrame, - custom_elements: pd.DataFrame, - custom_items: pd.DataFrame, + metadata: Dataset, + custom_datasets: Table, + custom_elements: Table, + custom_items: Table, countries_harmonization: Dict[str, str], excluded_countries: List[str], - value_amendments: pd.DataFrame, -) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: + value_amendments: Table, +) -> Tuple[Table, Table, Table, Table]: """Apply various sanity checks, gather data (about dataset, item, element and unit names and descriptions) from all - domains, compare with data from its corresponding metadata file, and create clean dataframes of metadata about + domains, compare with data from its corresponding metadata file, and create clean tables of metadata about dataset, elements, units, items, and countries. Parameters ---------- - metadata : catalog.Dataset + metadata : Dataset Additional metadata dataset from meadow. - custom_datasets : pd.DataFrame + custom_datasets : Table Data from custom_datasets.csv file. - custom_elements : pd.DataFrame + custom_elements : Table Data from custom_elements_and_units.csv file. - custom_items : pd.DataFrame + custom_items : Table Data from custom_items.csv file. countries_harmonization : dict Data from faostat.countries.json file. excluded_countries : list Data from faostat.excluded_countries.json file. - value_amendments : pd.DataFrame + value_amendments : Table Data from value_amendments.csv file. Returns ------- - countries_df : pd.DataFrame - Clean dataframe of global countries. - datasets_df : pd.DataFrame - Clean dataframe of global dataset names and descriptions. - elements_df : pd.DataFrame - Clean dataframe of global element and unit names and descriptions. - items_df : pd.DataFrame - Clean dataframe of global item names and descriptions. 
+ tb_countries : Table + Clean table of global countries. + tb_datasets : Table + Clean table of global dataset names and descriptions. + tb_elements : Table + Clean table of global element and unit names and descriptions. + tb_items : Table + Clean table of global item names and descriptions. """ # Check if flags definitions need to be updated. @@ -897,12 +887,12 @@ def process_metadata( set([NAMESPACE + "_" + table_name.split("_")[1] for table_name in metadata.table_names]) ) - # Initialise dataframe of dataset descriptions, items, and element-units. - # We cannot remove "dataset" from the items and elements dataframes, because it can happen that, for a given + # Initialise table of dataset descriptions, items, and element-units. + # We cannot remove "dataset" from the items and elements tables, because it can happen that, for a given # item code, the item name is slightly different in two different datasets. - datasets_df = pd.DataFrame({"dataset": [], "fao_dataset_title": [], "fao_dataset_description": []}) - items_df = pd.DataFrame({"dataset": [], "item_code": [], "fao_item": [], "fao_item_description": []}) - elements_df = pd.DataFrame( + tb_datasets = Table({"dataset": [], "fao_dataset_title": [], "fao_dataset_description": []}) + tb_items = Table({"dataset": [], "item_code": [], "fao_item": [], "fao_item_description": []}) + tb_elements = Table( { "dataset": [], "element_code": [], @@ -914,7 +904,7 @@ def process_metadata( ) # Initialise list of all countries in all datasets, and all country groups. - countries_in_data = pd.DataFrame({"area_code": [], "fao_country": []}).astype({"area_code": "Int64"}) + countries_in_data = Table({"area_code": [], "fao_country": []}).astype({"area_code": "Int64"}) country_groups_in_data: Dict[str, List[str]] = {} # Gather all variables from the latest version of each meadow dataset. @@ -922,7 +912,7 @@ def process_metadata( # Load latest meadow table for current dataset. 
ds_latest = paths.load_dataset(dataset_short_name) table = ds_latest[dataset_short_name] - df = pd.DataFrame(table.reset_index()).rename( + tb = table.reset_index().rename( columns={ "area": "fao_country", "recipient_country": "fao_country", @@ -930,22 +920,22 @@ def process_metadata( } )[["area_code", "fao_country"]] - df["area_code"] = df["area_code"].astype("Int64") + tb["area_code"] = tb["area_code"].astype("Int64") ################################################################################################################ # Temporary patch. # Some areas are defined with different names (but same area codes) in different domains. # This causes some issues at a later stage. # For now, manually rename those areas here. - if "French Guiana" in df["fao_country"].unique(): - df["fao_country"] = dataframes.map_series(df["fao_country"], mapping={"French Guiana": "French Guyana"}) - if "Netherlands (Kingdom of the)" in df["fao_country"].unique(): - df["fao_country"] = dataframes.map_series( - df["fao_country"], mapping={"Netherlands (Kingdom of the)": "Netherlands"} + if "French Guiana" in tb["fao_country"].unique(): + tb["fao_country"] = dataframes.map_series(tb["fao_country"], mapping={"French Guiana": "French Guyana"}) + if "Netherlands (Kingdom of the)" in tb["fao_country"].unique(): + tb["fao_country"] = dataframes.map_series( + tb["fao_country"], mapping={"Netherlands (Kingdom of the)": "Netherlands"} ) - if "Saint Martin (French part)" in df["fao_country"].unique(): - df["fao_country"] = dataframes.map_series( - df["fao_country"], mapping={"Saint Martin (French part)": "Saint-Martin (French part)"} + if "Saint Martin (French part)" in tb["fao_country"].unique(): + tb["fao_country"] = dataframes.map_series( + tb["fao_country"], mapping={"Saint Martin (French part)": "Saint-Martin (French part)"} ) ################################################################################################################ @@ -962,20 +952,20 @@ def process_metadata( ) # 
Gather dataset descriptions, items, and element-units for current domain. - datasets_from_data = create_dataset_descriptions_dataframe_for_domain( + datasets_from_data = create_dataset_descriptions_table_for_domain( table=table, dataset_short_name=dataset_short_name ) - items_from_data = create_items_dataframe_for_domain( + items_from_data = create_items_table_for_domain( table=table, metadata=metadata, dataset_short_name=dataset_short_name ) - elements_from_data = create_elements_dataframe_for_domain( + elements_from_data = create_elements_table_for_domain( table=table, metadata=metadata, dataset_short_name=dataset_short_name ) # Add countries in this dataset to the list of all countries. - countries_in_data = pd.concat([countries_in_data, df]).drop_duplicates() + countries_in_data = pr.concat([countries_in_data, tb]).drop_duplicates() # Get country groups in this dataset. _metadata = metadata[f"{dataset_short_name}_area_group"].reset_index() @@ -997,23 +987,23 @@ def process_metadata( else: country_groups_in_data[group] = country_groups[group] - # Add dataset descriptions, items, and element-units from current dataset to global dataframes. - datasets_df = dataframes.concatenate([datasets_df, datasets_from_data], ignore_index=True) - items_df = dataframes.concatenate([items_df, items_from_data], ignore_index=True) - elements_df = dataframes.concatenate([elements_df, elements_from_data], ignore_index=True) + # Add dataset descriptions, items, and element-units from current dataset to global tables. 
+ tb_datasets = dataframes.concatenate([tb_datasets, datasets_from_data], ignore_index=True) + tb_items = dataframes.concatenate([tb_items, items_from_data], ignore_index=True) + tb_elements = dataframes.concatenate([tb_elements, elements_from_data], ignore_index=True) - datasets_df = clean_global_dataset_descriptions_dataframe(datasets_df=datasets_df, custom_datasets=custom_datasets) - items_df = clean_global_items_dataframe(items_df=items_df, custom_items=custom_items) + tb_datasets = clean_global_dataset_descriptions_table(tb_datasets=tb_datasets, tb_custom_datasets=custom_datasets) + tb_items = clean_global_items_table(tb_items=tb_items, custom_items=custom_items) - elements_df = clean_global_elements_dataframe(elements_df=elements_df, custom_elements=custom_elements) - countries_df = clean_global_countries_dataframe( + tb_elements = clean_global_elements_table(tb_elements=tb_elements, custom_elements=custom_elements) + tb_countries = clean_global_countries_table( countries_in_data=countries_in_data, country_groups=country_groups_in_data, countries_harmonization=countries_harmonization, excluded_countries=excluded_countries, ) - return countries_df, datasets_df, elements_df, items_df + return tb_countries, tb_datasets, tb_elements, tb_items def run(dest_dir: str) -> None: @@ -1046,10 +1036,10 @@ def run(dest_dir: str) -> None: metadata = paths.load_dataset() # Load custom dataset names, items, element-unit names, and value amendments. 
- custom_datasets = pd.read_csv(custom_datasets_file, dtype=str) - custom_elements = pd.read_csv(custom_elements_and_units_file, dtype=str) - custom_items = pd.read_csv(custom_items_file, dtype=str) - value_amendments = pd.read_csv(value_amendments_file, dtype=str) + custom_datasets = pr.read_csv(custom_datasets_file, dtype=str) + custom_elements = pr.read_csv(custom_elements_and_units_file, dtype=str) + custom_items = pr.read_csv(custom_items_file, dtype=str) + value_amendments = pr.read_csv(value_amendments_file, dtype=str) # Load country mapping and excluded countries files. countries_harmonization = io.load_json(countries_file) @@ -1058,7 +1048,7 @@ def run(dest_dir: str) -> None: # # Process data. # - countries_df, datasets_df, elements_df, items_df = process_metadata( + tb_countries, tb_datasets, tb_elements, tb_items = process_metadata( paths=paths, metadata=metadata, custom_datasets=custom_datasets, @@ -1073,7 +1063,7 @@ def run(dest_dir: str) -> None: # Save outputs. # # Initialize new garden dataset. - dataset_garden = catalog.Dataset.create_empty(dest_dir) + dataset_garden = Dataset.create_empty(dest_dir) dataset_garden.short_name = FAOSTAT_METADATA_SHORT_NAME # Keep original dataset's metadata from meadow. dataset_garden.metadata = deepcopy(metadata.metadata) @@ -1081,11 +1071,11 @@ def run(dest_dir: str) -> None: dataset_garden.save() # Create new garden dataset with all dataset descriptions, items, element-units, and countries. 
- datasets_table = create_table(df=datasets_df, short_name="datasets", index_cols=["dataset"]) - items_table = create_table(df=items_df, short_name="items", index_cols=["dataset", "item_code"]) - elements_table = create_table(df=elements_df, short_name="elements", index_cols=["dataset", "element_code"]) - countries_table = create_table(df=countries_df, short_name="countries", index_cols=["area_code"]) - amendments_table = catalog.Table(value_amendments, short_name="amendments").set_index( + datasets_table = create_table(tb=tb_datasets, short_name="datasets", index_cols=["dataset"]) + items_table = create_table(tb=tb_items, short_name="items", index_cols=["dataset", "item_code"]) + elements_table = create_table(tb=tb_elements, short_name="elements", index_cols=["dataset", "element_code"]) + countries_table = create_table(tb=tb_countries, short_name="countries", index_cols=["area_code"]) + amendments_table = Table(value_amendments, short_name="amendments").set_index( ["dataset", "spurious_value"], verify_integrity=True ) diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index 11a5d7efb68..ba08cc1167b 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -22,7 +22,6 @@ from detected_anomalies import handle_anomalies from owid import repack # type: ignore from owid.catalog import Dataset, Table, Variable, VariablePresentationMeta -from owid.catalog.tables import read_from_records from owid.catalog.utils import underscore from owid.datautils import dataframes from tqdm.auto import tqdm @@ -276,7 +275,7 @@ FLAG_MULTIPLE_FLAGS = "multiple_flags" # Rank flags by priority (where lowest index is highest priority). FLAGS_RANKING = ( - read_from_records( + pr.read_from_records( columns=["flag", "description"], data=[ # FAO uses nan flag for official data; in our datasets we will replace nans by FLAG_OFFICIAL_DATA. 
From ef0a19a2b64efa9901f78a8db266484376f98a84 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 19 Mar 2024 15:04:20 +0100 Subject: [PATCH 27/54] Improve metadata handling of meadow steps --- .../faostat/2024-03-14/faostat_metadata.py | 19 ++--- .../data/meadow/faostat/2024-03-14/shared.py | 74 +++++++++---------- 2 files changed, 47 insertions(+), 46 deletions(-) diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py index ecaab7a72a0..38a6b36b570 100644 --- a/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/meadow/faostat/2024-03-14/faostat_metadata.py @@ -8,13 +8,14 @@ from typing import Any, Dict, List -import pandas as pd +import owid.catalog.processing as pr import structlog from owid.catalog import Table from owid.datautils.io import load_json from shared import CURRENT_DIR, NAMESPACE from etl.helpers import PathFinder, create_dataset +from etl.snapshot import Snapshot log = structlog.get_logger() @@ -130,7 +131,7 @@ def check_that_category_structure_is_well_defined(md: Dict[str, Any]) -> None: assert category_index in entry, error -def create_tables_for_all_domain_records(additional_metadata: Dict[str, Any]) -> List[Table]: +def create_tables_for_all_domain_records(additional_metadata: Dict[str, Any], snapshot: Snapshot) -> List[Table]: """Create a table for each of the domain-categories (e.g. 'faostat_qcl_item'). 
Parameters @@ -150,11 +151,11 @@ def create_tables_for_all_domain_records(additional_metadata: Dict[str, Any]) -> for domain in additional_metadata: for category in list(additional_metadata[domain]): json_data = additional_metadata[domain][category]["data"] - df = pd.DataFrame.from_dict(json_data) - if len(df) == 0: + tb = pr.read_from_dict(json_data, metadata=snapshot.to_table_metadata(), origin=snapshot.metadata.origin) + if len(tb) == 0: # This is the case for lc flag, rfb flag, scl itemfactor, and, since last version, all qv categories. - df = df.assign(**{col: [] for col in category_structure[category]["index"]}) - df.set_index( + tb = tb.assign(**{col: [] for col in category_structure[category]["index"]}) + tb.set_index( category_structure[category]["index"], verify_integrity=True, inplace=True, @@ -167,8 +168,8 @@ def create_tables_for_all_domain_records(additional_metadata: Dict[str, Any]) -> continue used_short_names.add(table_short_name) - table = Table(df, short_name=table_short_name) - tables.append(table) + tb.metadata.short_name = table_short_name + tables.append(tb) return tables @@ -197,7 +198,7 @@ def run(dest_dir: str) -> None: check_that_category_structure_is_well_defined(md=additional_metadata) # Create a new table for each domain-record (e.g. 'faostat_qcl_item'). - tables = create_tables_for_all_domain_records(additional_metadata=additional_metadata) + tables = create_tables_for_all_domain_records(additional_metadata=additional_metadata, snapshot=snapshot) # # Save outputs. 
diff --git a/etl/steps/data/meadow/faostat/2024-03-14/shared.py b/etl/steps/data/meadow/faostat/2024-03-14/shared.py index 9fdd9e12f9b..d5cd5539a0d 100644 --- a/etl/steps/data/meadow/faostat/2024-03-14/shared.py +++ b/etl/steps/data/meadow/faostat/2024-03-14/shared.py @@ -11,11 +11,12 @@ import zipfile from pathlib import Path -import pandas as pd +import owid.catalog.processing as pr import structlog from owid.catalog import Table from etl.helpers import PathFinder, create_dataset +from etl.snapshot import Snapshot # Initialise log. log = structlog.get_logger() @@ -26,8 +27,8 @@ VERSION = CURRENT_DIR.name -def load_data(local_path: Path) -> pd.DataFrame: - """Load snapshot data (as a dataframe) for current dataset. +def load_data(snapshot: Snapshot) -> Table: + """Load snapshot data (as a table) for current dataset. Parameters ---------- @@ -36,65 +37,71 @@ def load_data(local_path: Path) -> pd.DataFrame: Returns ------- - data : pd.DataFrame + data : Table Snapshot data. """ # Unzip data into a temporary folder. with tempfile.TemporaryDirectory() as temp_dir: - z = zipfile.ZipFile(local_path) + z = zipfile.ZipFile(snapshot.path) z.extractall(temp_dir) (filename,) = list(filter(lambda x: "(Normalized)" in x, os.listdir(temp_dir))) # Load data from main file. - data = pd.read_csv(os.path.join(temp_dir, filename), encoding="latin-1", low_memory=False) + data = pr.read_csv( + os.path.join(temp_dir, filename), + encoding="latin-1", + low_memory=False, + origin=snapshot.metadata.origin, + metadata=snapshot.to_table_metadata(), + ) return data -def run_sanity_checks(data: pd.DataFrame) -> None: +def run_sanity_checks(tb: Table) -> None: """Run basic sanity checks on loaded data (raise assertion errors if any check fails). Parameters ---------- - data : pd.DataFrame + tb : Table Data to be checked. """ - df = data.copy() + tb = tb.copy() # Check that column "Year Code" is identical to "Year", and can therefore be dropped. 
error = "Column 'Year Code' does not coincide with column 'Year'." - if "Year" not in data.columns: + if "Year" not in tb.columns: pass # Column 'Year' is not in data (this happens at least in faostat_wcad, which requires further processing). - elif df["Year"].dtype == int: + elif tb["Year"].dtype == int: # In most cases, columns "Year Code" and "Year" are simply the year. - assert (df["Year Code"] == df["Year"]).all(), error + assert (tb["Year Code"] == tb["Year"]).all(), error else: # Sometimes (e.g. for dataset fs) there are year ranges (e.g. with "Year Code" 20002002 and "Year" "2000-2002"). - assert (df["Year Code"] == df["Year"].str.replace("-", "").astype(int)).all(), error + assert (tb["Year Code"] == tb["Year"].str.replace("-", "").astype(int)).all(), error # Check that there is only one element-unit for each element code. error = "Multiple element-unit for the same element code." - assert (df.groupby(["Element", "Unit"])["Element Code"].nunique() == 1).all(), error + assert (tb.groupby(["Element", "Unit"])["Element Code"].nunique() == 1).all(), error -def prepare_output_data(data: pd.DataFrame) -> pd.DataFrame: +def prepare_output_data(tb: Table) -> Table: """Prepare data before saving it to meadow. Parameters ---------- - data : pd.DataFrame + tb : Table Data. Returns ------- - df : pd.DataFrame + tb : Table Data ready to be stored as a table in meadow. """ - df = data.copy() + tb = tb.copy() # Select columns to keep. # Note: @@ -117,26 +124,19 @@ def prepare_output_data(data: pd.DataFrame) -> pd.DataFrame: "Flag", "Recipient Country Code", "Recipient Country", - # Additional columns for faostat_wcad. - "WCA Round", - "Census Year", ] - # Select only columns that are found in the dataframe. - columns_to_keep = list(set(columns_to_keep) & set(df.columns)) - df = df[columns_to_keep] + # Select only columns that are found in the table. 
+ columns_to_keep = list(set(columns_to_keep) & set(tb.columns)) + tb = tb[columns_to_keep] - # Set index columns depending on what columns are available in the dataframe. + # Set index columns depending on what columns are available in the table. # Note: "Recipient Country Code" appears only in faostat_fa, and seems to replace "Area Code". - # Note: "WCA Round" and "Census Year" appear only in faostat_wcad. - index_columns = list( - {"Area Code", "Recipient Country Code", "Year", "Item Code", "Element Code", "WCA Round", "Census Year"} - & set(df.columns) - ) - if df.duplicated(subset=index_columns).any(): + index_columns = list({"Area Code", "Recipient Country Code", "Year", "Item Code", "Element Code"} & set(tb.columns)) + if tb.duplicated(subset=index_columns).any(): log.warning("Index has duplicated keys.") - df = df.set_index(index_columns) + tb = tb.set_index(index_columns) - return df + return tb def run(dest_dir: str) -> None: @@ -154,20 +154,20 @@ def run(dest_dir: str) -> None: # Load snapshot. snapshot = paths.load_snapshot() - df_snapshot = load_data(snapshot.path) + tb_snapshot = load_data(snapshot) # # Process data. # # Run sanity checks. - run_sanity_checks(data=df_snapshot) + run_sanity_checks(tb=tb_snapshot) # Prepare output meadow table. - tb_meadow = Table(prepare_output_data(data=df_snapshot), short_name=dataset_short_name) + tb = prepare_output_data(tb=tb_snapshot) # # Save outputs. # # Create a new meadow dataset. 
- ds_meadow = create_dataset(dest_dir=dest_dir, tables=[tb_meadow], default_metadata=snapshot.metadata) + ds_meadow = create_dataset(dest_dir=dest_dir, tables=[tb]) ds_meadow.save() From deffc1dd645fb618a590713956980c56e5e9e9ae Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 19 Mar 2024 16:06:12 +0100 Subject: [PATCH 28/54] Ensure origins are properly propagated --- .../faostat/2024-03-14/detected_anomalies.py | 228 +++++++++--------- .../garden/faostat/2024-03-14/faostat_fbsc.py | 69 +++--- .../garden/faostat/2024-03-14/faostat_qcl.py | 132 +++++----- .../data/garden/faostat/2024-03-14/shared.py | 62 +++-- 4 files changed, 247 insertions(+), 244 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py b/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py index 218db8eca53..991d0a30602 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py +++ b/etl/steps/data/garden/faostat/2024-03-14/detected_anomalies.py @@ -9,8 +9,8 @@ import os from typing import Tuple -import pandas as pd import plotly.express as px +from owid.catalog import Table from structlog import get_logger log = get_logger() @@ -38,77 +38,77 @@ def description(self) -> str: raise NotImplementedError @abc.abstractmethod - def check(self, df: pd.DataFrame) -> None: + def check(self, tb: Table) -> None: """A method that ensures the anomaly exists in the data. This is useful to detect if an anomaly has been corrected after a data update. Parameters ---------- - df : pd.DataFrame + tb : Table Data containing anomalies. """ raise NotImplementedError @abc.abstractmethod - def fix(self, df: pd.DataFrame) -> pd.DataFrame: + def fix(self, tb: Table) -> Table: """A method that removes the anomaly. Parameters ---------- - df : pd.DataFrame + tb : Table Data that contains anomalies to be removed. Returns ------- - df_fixed : pd.DataFrame + tb_fixed : Table Data after removing anomalies. 
""" raise NotImplementedError - def inspect(self, df: pd.DataFrame) -> None: + def inspect(self, tb: Table) -> None: """An optional method that plots (in the browser) a visualization that shows the anomaly. It can be used before and after removing the anomalies. Parameters ---------- - df : pd.DataFrame + tb : Table Data to be inspected (before or after having anomalies removed). """ raise NotImplementedError - def handle_anomalies(self, df: pd.DataFrame, inspect_anomalies: bool = INSPECT_ANOMALIES) -> pd.DataFrame: + def handle_anomalies(self, tb: Table, inspect_anomalies: bool = INSPECT_ANOMALIES) -> Table: """A helper method that uses all the previous methods in the usual order. Parameters ---------- - df : pd.DataFrame + tb : Table Data with anomalies. inspect_anomalies : bool, optional True to open charts in the browser to visualize the data before and after removing the anomalies. Returns ------- - df_fixed : pd.DataFrame + tb_fixed : Table Data after removing anomalies. """ log.info(f"Handling anomaly: {self.description}") log.info("Checking that known data anomalies are present in the data") - self.check(df=df) + self.check(tb=tb) if inspect_anomalies: log.info("Inspect anomaly before fixing.") - self.inspect(df=df) + self.inspect(tb=tb) log.info("Fixing anomalies.") - df_fixed = self.fix(df=df) + tb_fixed = self.fix(tb=tb) if inspect_anomalies: log.info("Inspect anomaly after fixing.") - self.inspect(df=df_fixed) + self.inspect(tb=tb_fixed) - return df_fixed + return tb_fixed def _split_long_title(text: str) -> str: @@ -149,29 +149,29 @@ class SpinachAreaHarvestedAnomaly(DataAnomaly): "World", ] - def check(self, df): + def check(self, tb): # Check that the data point is indeed zero. 
assert ( - df[ - (df["country"] == "China") - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & (df["year"].isin(self.affected_years)) + tb[ + (tb["country"] == "China") + & (tb["item_code"].isin(self.affected_item_codes)) + & (tb["element_code"].isin(self.affected_element_codes)) + & (tb["year"].isin(self.affected_years)) ]["value"] == 0 ).all() # For consistency, check that other years do have non-zero data for the same item and element. assert ( - df[ - (df["country"] == "China") - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & ~(df["year"].isin(self.affected_years)) + tb[ + (tb["country"] == "China") + & (tb["item_code"].isin(self.affected_item_codes)) + & (tb["element_code"].isin(self.affected_element_codes)) + & ~(tb["year"].isin(self.affected_years)) ]["value"] > 0 ).all() - def inspect(self, df): + def inspect(self, tb): log.info( "The anomaly causes: " "\n* A dip in area harvested of spinach in that year (element code 005312). 
" @@ -180,27 +180,27 @@ def inspect(self, df): ) for element_code in self.affected_element_codes: selection = ( - (df["country"].isin(self.affected_countries)) - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"] == element_code) + (tb["country"].isin(self.affected_countries)) + & (tb["item_code"].isin(self.affected_item_codes)) + & (tb["element_code"] == element_code) ) - df_affected = df[selection].astype({"country": str}) + tb_affected = tb[selection].astype({"country": str}) title = _split_long_title(self.description + f"Element code {element_code}") - fig = px.line(df_affected, x="year", y="value", color="country", title=title) + fig = px.line(tb_affected, x="year", y="value", color="country", title=title) fig.show() - def fix(self, df): - indexes_to_drop = df[ + def fix(self, tb): + indexes_to_drop = tb[ ( - (df["country"].isin(self.affected_countries)) - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & (df["year"].isin(self.affected_years)) + (tb["country"].isin(self.affected_countries)) + & (tb["item_code"].isin(self.affected_item_codes)) + & (tb["element_code"].isin(self.affected_element_codes)) + & (tb["year"].isin(self.affected_years)) ) ].index - df_fixed = df.drop(indexes_to_drop).reset_index(drop=True) + tb_fixed = tb.drop(indexes_to_drop).reset_index(drop=True) - return df_fixed + return tb_fixed class EggYieldNorthernEuropeAnomaly(DataAnomaly): @@ -234,51 +234,51 @@ class EggYieldNorthernEuropeAnomaly(DataAnomaly): "Northern Europe (FAO)", ] - def check(self, df): + def check(self, tb): # Check that the data prior to 1973 is indeed higher than expected, and significantly lower from then on. 
assert ( - df[ - (df["country"].isin(self.affected_countries)) - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & (df["year"].isin(self.affected_years)) + tb[ + (tb["country"].isin(self.affected_countries)) + & (tb["item_code"].isin(self.affected_item_codes)) + & (tb["element_code"].isin(self.affected_element_codes)) + & (tb["year"].isin(self.affected_years)) ]["value"] > 40 ).all() assert ( - df[ - (df["country"].isin(self.affected_countries)) - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & ~(df["year"].isin(self.affected_years)) + tb[ + (tb["country"].isin(self.affected_countries)) + & (tb["item_code"].isin(self.affected_item_codes)) + & (tb["element_code"].isin(self.affected_element_codes)) + & ~(tb["year"].isin(self.affected_years)) ]["value"] < 40 ).all() - def inspect(self, df): + def inspect(self, tb): log.info( "The anomaly causes: " "\n* The egg yield of Northern Europe (FAO) before 1973 much higher than any other year." 
) for element_code in self.affected_element_codes: - selection = (df["item_code"].isin(self.affected_item_codes)) & (df["element_code"] == element_code) - df_affected = df[selection].astype({"country": str}).sort_values(["country", "year"]) + selection = (tb["item_code"].isin(self.affected_item_codes)) & (tb["element_code"] == element_code) + tb_affected = tb[selection].astype({"country": str}).sort_values(["country", "year"]) title = _split_long_title(self.description + f"Element code {element_code}") - fig = px.line(df_affected, x="year", y="value", color="country", title=title) + fig = px.line(tb_affected, x="year", y="value", color="country", title=title) fig.show() - def fix(self, df): - indexes_to_drop = df[ + def fix(self, tb): + indexes_to_drop = tb[ ( - (df["country"].isin(self.affected_countries)) - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & (df["year"].isin(self.affected_years)) + (tb["country"].isin(self.affected_countries)) + & (tb["item_code"].isin(self.affected_item_codes)) + & (tb["element_code"].isin(self.affected_element_codes)) + & (tb["year"].isin(self.affected_years)) ) ].index - df_fixed = df.drop(indexes_to_drop).reset_index(drop=True) + tb_fixed = tb.drop(indexes_to_drop).reset_index(drop=True) - return df_fixed + return tb_fixed class TeaProductionAnomaly(DataAnomaly): @@ -340,19 +340,19 @@ class TeaProductionAnomaly(DataAnomaly): "Zimbabwe", ] - def check(self, df): + def check(self, tb): # Check that the data on 1990 has the flag "A" (Official figure) for each of the affected countries. 
- flagged_official = df[ - (df["country"].isin(self.affected_countries)) - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & (df["year"] == 1990) + flagged_official = tb[ + (tb["country"].isin(self.affected_countries)) + & (tb["item_code"].isin(self.affected_item_codes)) + & (tb["element_code"].isin(self.affected_element_codes)) + & (tb["year"] == 1990) ] - flagged_estimate = df[ - (df["country"].isin(self.affected_countries)) - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & (df["year"] > 1990) + flagged_estimate = tb[ + (tb["country"].isin(self.affected_countries)) + & (tb["item_code"].isin(self.affected_item_codes)) + & (tb["element_code"].isin(self.affected_element_codes)) + & (tb["year"] > 1990) ] # Check that all affected countries have official data on 1990, and estimated on 1991. for country in self.affected_countries: @@ -366,26 +366,26 @@ def check(self, df): ]["value"].iloc[0] assert high_value / low_value > 3 - def inspect(self, df): + def inspect(self, tb): log.info("The anomaly causes: " "\n* The production of tea to increase dramatically from 1990 to 1991.") for element_code in self.affected_element_codes: - selection = (df["item_code"].isin(self.affected_item_codes)) & (df["element_code"] == element_code) - df_affected = df[selection].astype({"country": str}).sort_values(["country", "year"]) + selection = (tb["item_code"].isin(self.affected_item_codes)) & (tb["element_code"] == element_code) + tb_affected = tb[selection].astype({"country": str}).sort_values(["country", "year"]) title = _split_long_title(self.description + f"Element code {element_code}") - fig = px.line(df_affected, x="year", y="value", color="country", title=title) + fig = px.line(tb_affected, x="year", y="value", color="country", title=title) fig.show() - def fix(self, df): - indexes_to_drop = df[ + def fix(self, tb): + indexes_to_drop = tb[ ( - 
(df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & (df["year"] > 1990) + (tb["item_code"].isin(self.affected_item_codes)) + & (tb["element_code"].isin(self.affected_element_codes)) + & (tb["year"] > 1990) ) ].index - df_fixed = df.drop(indexes_to_drop).reset_index(drop=True) + tb_fixed = tb.drop(indexes_to_drop).reset_index(drop=True) - return df_fixed + return tb_fixed class HighYieldAnomaly(DataAnomaly): @@ -396,48 +396,48 @@ class HighYieldAnomaly(DataAnomaly): affected_years = [] affected_countries = [] - def check(self, df): + def check(self, tb): # Check that the data in the affected years is higher than expected, and significantly lower from then on. assert ( - df[ - (df["country"].isin(self.affected_countries)) - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & (df["year"].isin(self.affected_years)) + tb[ + (tb["country"].isin(self.affected_countries)) + & (tb["item_code"].isin(self.affected_item_codes)) + & (tb["element_code"].isin(self.affected_element_codes)) + & (tb["year"].isin(self.affected_years)) ]["value"] > 100 ).all() assert ( - df[ - (df["country"].isin(self.affected_countries)) - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & ~(df["year"].isin(self.affected_years)) + tb[ + (tb["country"].isin(self.affected_countries)) + & (tb["item_code"].isin(self.affected_item_codes)) + & (tb["element_code"].isin(self.affected_element_codes)) + & ~(tb["year"].isin(self.affected_years)) ]["value"] < 100 ).all() - def inspect(self, df): + def inspect(self, tb): log.info("The anomaly causes: " "\n* The yield of certain items, countries and years to be unreasonably high.") for element_code in self.affected_element_codes: - selection = (df["item_code"].isin(self.affected_item_codes)) & (df["element_code"] == element_code) - df_affected = df[selection].astype({"country": 
str}).sort_values(["country", "year"]) + selection = (tb["item_code"].isin(self.affected_item_codes)) & (tb["element_code"] == element_code) + tb_affected = tb[selection].astype({"country": str}).sort_values(["country", "year"]) title = _split_long_title(self.description + f"Element code {element_code}") - fig = px.line(df_affected, x="year", y="value", color="country", title=title) + fig = px.line(tb_affected, x="year", y="value", color="country", title=title) fig.show() - def fix(self, df): - indexes_to_drop = df[ + def fix(self, tb): + indexes_to_drop = tb[ ( - (df["country"].isin(self.affected_countries)) - & (df["item_code"].isin(self.affected_item_codes)) - & (df["element_code"].isin(self.affected_element_codes)) - & (df["year"].isin(self.affected_years)) + (tb["country"].isin(self.affected_countries)) + & (tb["item_code"].isin(self.affected_item_codes)) + & (tb["element_code"].isin(self.affected_element_codes)) + & (tb["year"].isin(self.affected_years)) ) ].index - df_fixed = df.drop(indexes_to_drop).reset_index(drop=True) + tb_fixed = tb.drop(indexes_to_drop).reset_index(drop=True) - return df_fixed + return tb_fixed class FruitYieldAnomaly(HighYieldAnomaly): @@ -713,18 +713,18 @@ class OtherTropicalFruitYieldSouthAmericaAnomaly(HighYieldAnomaly): } -def handle_anomalies(dataset_short_name: str, data: pd.DataFrame) -> Tuple[pd.DataFrame, str]: +def handle_anomalies(dataset_short_name: str, tb: Table) -> Tuple[Table, str]: if dataset_short_name not in detected_anomalies: # If there is no anomaly class for a given dataset, return the same data and an empty anomaly description. - return data, "" + return tb, "" else: # If there are anomalies, fix them, and return the fixed data and a text describing all anomalies. 
- data_fixed = data.copy() + tb_fixed = tb.copy() anomaly_descriptions = ANOMALY_DESCRIPTION_INTRODUCTION for anomaly_class in detected_anomalies[dataset_short_name]: anomaly = anomaly_class() anomaly_descriptions += "\n\n+" + anomaly.description - data_fixed = anomaly.handle_anomalies(df=data_fixed) + tb_fixed = anomaly.handle_anomalies(tb=tb_fixed) - return data_fixed, anomaly_descriptions + return tb_fixed, anomaly_descriptions diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py index 907f38311a7..d6008b82f91 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py @@ -17,8 +17,8 @@ from pathlib import Path +import owid.catalog.processing as pr from owid.catalog import Dataset, Table -from owid.datautils import dataframes from shared import ( ADDED_TITLE_TO_WIDE_TABLE, CURRENT_DIR, @@ -59,7 +59,7 @@ def combine_fbsh_and_fbs_datasets( Returns ------- tb_fbsc : Table - Combination of the tables of the two input datasets (as a dataframe, not a dataset). + Combination of the tables of the two input datasets (as a table, not a dataset). """ # Sanity checks. @@ -68,7 +68,7 @@ def combine_fbsh_and_fbs_datasets( error = "Licenses of fbsh and fbs are different." assert ds_fbsh.metadata.licenses == ds_fbs.metadata.licenses, error - # Load dataframes for fbs and fbsh datasets. + # Load tables for fbs and fbsh datasets. tb_fbsh = ds_fbsh["faostat_fbsh"].reset_index() tb_fbs = ds_fbs["faostat_fbs"].reset_index() @@ -102,23 +102,24 @@ def combine_fbsh_and_fbs_datasets( # combined as if they were the same element). tb_fbsh = tb_fbsh[~tb_fbsh["element_code"].isin(ELEMENTS_IN_FBSH_MISSING_IN_FBS)].reset_index(drop=True) - # Concatenate old and new dataframes using function that keeps categoricals. 
- tb_fbsc = dataframes.concatenate([tb_fbsh, tb_fbs]).sort_values(["area", "year"]).reset_index(drop=True) + # Concatenate old and new tables. + # tb_fbsc = dataframes.concatenate([tb_fbsh, tb_fbs]).sort_values(["area", "year"]).reset_index(drop=True) + tb_fbsc = pr.concat([tb_fbsh, tb_fbs]).sort_values(["area", "year"]).reset_index(drop=True) # Ensure that each element has only one unit and one description. error = "Some elements in the combined dataset have more than one unit. Manually check them and consider adding them to ELEMENT_AMENDMENTS." units_per_element = tb_fbsc.groupby("element", as_index=False, observed=True)["unit"].nunique() elements_with_ambiguous_units = units_per_element[units_per_element["unit"] > 1]["element"].tolist() - tb_fbsc[tb_fbsc["element"].isin(elements_with_ambiguous_units)].drop_duplicates(subset=["element", "unit"]) + # tb_fbsc[tb_fbsc["element"].isin(elements_with_ambiguous_units)].drop_duplicates(subset=["element", "unit"]) assert len(elements_with_ambiguous_units) == 0, error return tb_fbsc def _assert_tb_size(tb: Table, size_mb: float) -> None: - """Check that dataframe is smaller than given size to prevent OOM errors.""" + """Check that table is smaller than given size to prevent OOM errors.""" real_size_mb = tb.memory_usage(deep=True).sum() / 1e6 - assert real_size_mb <= size_mb, f"DataFrame size is too big: {real_size_mb} MB > {size_mb} MB" + assert real_size_mb <= size_mb, f"Table size is too big: {real_size_mb} MB > {size_mb} MB" def run(dest_dir: str) -> None: @@ -136,8 +137,8 @@ def run(dest_dir: str) -> None: # Load fbsh and fbs. log.info("faostat_fbsc.loading_datasets") - fbsh_dataset = paths.load_dataset(f"{NAMESPACE}_fbsh") - fbs_dataset = paths.load_dataset(f"{NAMESPACE}_fbs") + ds_fbsh = paths.load_dataset(f"{NAMESPACE}_fbsh") + ds_fbs = paths.load_dataset(f"{NAMESPACE}_fbs") # Load dataset of FAOSTAT metadata. 
metadata = paths.load_dataset(f"{NAMESPACE}_metadata") @@ -166,16 +167,16 @@ def run(dest_dir: str) -> None: # Combine fbsh and fbs datasets. log.info( "faostat_fbsc.combine_fbsh_and_fbs_datasets", - fbsh_shape=fbsh_dataset["faostat_fbsh"].shape, - fbs_shape=fbs_dataset["faostat_fbs"].shape, + fbsh_shape=ds_fbsh["faostat_fbsh"].shape, + fbs_shape=ds_fbs["faostat_fbs"].shape, ) - data = combine_fbsh_and_fbs_datasets(fbsh_dataset, fbs_dataset) + tb = combine_fbsh_and_fbs_datasets(ds_fbsh=ds_fbsh, ds_fbs=ds_fbs) - _assert_tb_size(data, 2000) + _assert_tb_size(tb, 2000) # Prepare data. - data = clean_data( - tb=data, + tb = clean_data( + tb=tb, ds_population=ds_population, items_metadata=items_metadata, elements_metadata=elements_metadata, @@ -184,8 +185,8 @@ def run(dest_dir: str) -> None: ) # Add data for aggregate regions. - data = add_regions( - tb=data, + tb = add_regions( + tb=tb, ds_regions=ds_regions, ds_income_groups=ds_income_groups, ds_population=ds_population, @@ -193,40 +194,38 @@ def run(dest_dir: str) -> None: ) # Add per-capita variables. - data = add_per_capita_variables(tb=data, elements_metadata=elements_metadata) + tb = add_per_capita_variables(tb=tb, elements_metadata=elements_metadata) # Handle detected anomalies in the data. - data, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, data=data) + tb, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, tb=tb) # Avoid objects as they would explode memory, use categoricals instead. - for col in data.columns: - assert data[col].dtype != object, f"Column {col} should not have object type" + for col in tb.columns: + assert tb[col].dtype != object, f"Column {col} should not have object type" - _assert_tb_size(data, 2000) + _assert_tb_size(tb, 2000) # Create a long table (with item code and element code as part of the index). 
- log.info("faostat_fbsc.prepare_long_table", shape=data.shape) - data_table_long = prepare_long_table(tb=data) + log.info("faostat_fbsc.prepare_long_table", shape=tb.shape) + tb_long = prepare_long_table(tb=tb) - _assert_tb_size(data_table_long, 2000) + _assert_tb_size(tb_long, 2000) # Create a wide table (with only country and year as index). - log.info("faostat_fbsc.prepare_wide_table", shape=data.shape) - data_table_wide = prepare_wide_table(tb=data) + log.info("faostat_fbsc.prepare_wide_table", shape=tb.shape) + tb_wide = prepare_wide_table(tb=tb) # # Save outputs. # # Update tables metadata. - data_table_long.metadata.short_name = dataset_short_name - data_table_long.metadata.title = dataset_metadata["owid_dataset_title"] - data_table_wide.metadata.short_name = f"{dataset_short_name}_flat" - data_table_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE + tb_long.metadata.short_name = dataset_short_name + tb_long.metadata.title = dataset_metadata["owid_dataset_title"] + tb_wide.metadata.short_name = f"{dataset_short_name}_flat" + tb_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE # Initialise new garden dataset. - ds_garden = create_dataset( - dest_dir=dest_dir, tables=[data_table_long, data_table_wide], default_metadata=fbs_dataset.metadata - ) + ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_long, tb_wide], default_metadata=ds_fbs.metadata) # Check that the title assigned here coincides with the one in custom_datasets.csv (for consistency). error = "Dataset title given to fbsc is different to the one in custom_datasets.csv. Update the latter file." 
diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py index 05f42206644..85d899e4b26 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py @@ -66,18 +66,18 @@ ) -def fill_slaughtered_poultry_with_slaughtered_chicken(data: Table) -> Table: +def fill_slaughtered_poultry_with_slaughtered_chicken(tb: Table) -> Table: """Fill missing data on slaughtered poultry with slaughtered chicken. Most of poultry meat comes from chicken. However, sometimes chicken is informed, but the rest of poultry isn't, which causes poultry data to be empty (e.g. Spain in 2018). Therefore, we fill missing data for poultry with chicken data. """ - data = data.copy() + tb = tb.copy() # Prepare a slice of the data to extract additional data fields. additional_fields = ( - data[(data["item_code"] == ITEM_CODE_MEAT_POULTRY) & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT)][ + tb[(tb["item_code"] == ITEM_CODE_MEAT_POULTRY) & (tb["unit"] == SLAUGHTERED_ANIMALS_UNIT)][ ["fao_item", "item_description", "fao_unit_short_name"] ] .drop_duplicates() @@ -85,17 +85,17 @@ def fill_slaughtered_poultry_with_slaughtered_chicken(data: Table) -> Table: ) # Select data for the number of slaughtered chicken. - chickens_slaughtered = data[ - (data["item_code"] == ITEM_CODE_MEAT_CHICKEN) - & (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) - & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT) + chickens_slaughtered = tb[ + (tb["item_code"] == ITEM_CODE_MEAT_CHICKEN) + & (tb["element"] == SLAUGHTERED_ANIMALS_ELEMENT) + & (tb["unit"] == SLAUGHTERED_ANIMALS_UNIT) ] # Select data for the number of slaughtered poultry. 
- poultry_slaughtered = data[ - (data["item_code"] == ITEM_CODE_MEAT_POULTRY) - & (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) - & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT) + poultry_slaughtered = tb[ + (tb["item_code"] == ITEM_CODE_MEAT_POULTRY) + & (tb["element"] == SLAUGHTERED_ANIMALS_ELEMENT) + & (tb["unit"] == SLAUGHTERED_ANIMALS_UNIT) ][["country", "year", "value"]] # Combine poultry and chicken data. @@ -134,12 +134,12 @@ def fill_slaughtered_poultry_with_slaughtered_chicken(data: Table) -> Table: "slaughtered chicken." ) # Add chicken data to the full table. - data = pr.concat([data, poultry_slaughtered_missing_data], ignore_index=True) + tb = pr.concat([tb, poultry_slaughtered_missing_data], ignore_index=True) - return data + return tb -def add_slaughtered_animals_to_meat_total(data: Table) -> Table: +def add_slaughtered_animals_to_meat_total(tb: Table) -> Table: """Add number of slaughtered animals to meat total. There is no FAOSTAT data on slaughtered animals for total meat. We construct this data by aggregating that element @@ -150,35 +150,33 @@ def add_slaughtered_animals_to_meat_total(data: Table) -> Table: Parameters ---------- - data : Table + tb : Table Processed data where meat total does not have number of slaughtered animals. Returns ------- - combined_data : Table + tb_combined : Table Data after adding the new variable. """ - data = data.copy() + tb = tb.copy() error = f"Some items required to get the aggregate '{TOTAL_MEAT_ITEM}' are missing in data." - assert set(MEAT_TOTAL_ITEM_CODES) < set(data["item_code"]), error - assert SLAUGHTERED_ANIMALS_ELEMENT in data["element"].unique() - assert SLAUGHTERED_ANIMALS_UNIT in data["unit"].unique() + assert set(MEAT_TOTAL_ITEM_CODES) < set(tb["item_code"]), error + assert SLAUGHTERED_ANIMALS_ELEMENT in tb["element"].unique() + assert SLAUGHTERED_ANIMALS_UNIT in tb["unit"].unique() # Check that, indeed, the number of slaughtered animals for total meat is not given in the original data. 
- assert data[ - (data["item"] == TOTAL_MEAT_ITEM) - & (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) - & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT) + assert tb[ + (tb["item"] == TOTAL_MEAT_ITEM) + & (tb["element"] == SLAUGHTERED_ANIMALS_ELEMENT) + & (tb["unit"] == SLAUGHTERED_ANIMALS_UNIT) ].empty # There are two element codes for the same element (they have different items assigned). error = "Element codes for 'Producing or slaughtered animals' may have changed." assert ( - data[(data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) & ~(data["element_code"].str.contains("pc"))][ - "element_code" - ] + tb[(tb["element"] == SLAUGHTERED_ANIMALS_ELEMENT) & ~(tb["element_code"].str.contains("pc"))]["element_code"] .unique() .tolist() == SLAUGHTERED_ANIMALS_ELEMENT_CODES @@ -187,7 +185,7 @@ def add_slaughtered_animals_to_meat_total(data: Table) -> Table: # Check that the items assigned to each the two element codes do not overlap. error = "Element codes for 'Producing or slaughtered animals' have overlapping items." items_for_different_elements = ( - data[(data["element_code"].isin(SLAUGHTERED_ANIMALS_ELEMENT_CODES))] + tb[(tb["element_code"].isin(SLAUGHTERED_ANIMALS_ELEMENT_CODES))] .groupby("element_code", observed=True) .agg({"item_code": lambda x: list(x.unique())}) .to_dict()["item_code"] @@ -196,14 +194,14 @@ def add_slaughtered_animals_to_meat_total(data: Table) -> Table: # Confirm the item code for total meat. error = f"Item code for '{TOTAL_MEAT_ITEM}' may have changed." - assert list(data[data["item"] == TOTAL_MEAT_ITEM]["item_code"].unique()) == [TOTAL_MEAT_ITEM_CODE], error + assert list(tb[tb["item"] == TOTAL_MEAT_ITEM]["item_code"].unique()) == [TOTAL_MEAT_ITEM_CODE], error # Select the subset of data to aggregate. 
data_to_aggregate = ( - data[ - (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) - & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT) - & (data["item_code"].isin(MEAT_TOTAL_ITEM_CODES)) + tb[ + (tb["element"] == SLAUGHTERED_ANIMALS_ELEMENT) + & (tb["unit"] == SLAUGHTERED_ANIMALS_UNIT) + & (tb["item_code"].isin(MEAT_TOTAL_ITEM_CODES)) ] .dropna(subset="value") .reset_index(drop=True) @@ -229,24 +227,24 @@ def add_slaughtered_animals_to_meat_total(data: Table) -> Table: ).reset_index() # Get element description for selected element code (so far it's always been an empty string). - _slaughtered_animals_element_description = data[data["element_code"].isin(SLAUGHTERED_ANIMALS_ELEMENT_CODES)][ + _slaughtered_animals_element_description = tb[tb["element_code"].isin(SLAUGHTERED_ANIMALS_ELEMENT_CODES)][ "element_description" ].unique() assert len(_slaughtered_animals_element_description) == 1 slaughtered_animals_element_description = _slaughtered_animals_element_description[0] # Get item description for selected item code. - _total_meat_item_description = data[data["item_code"] == TOTAL_MEAT_ITEM_CODE]["item_description"].unique() + _total_meat_item_description = tb[tb["item_code"] == TOTAL_MEAT_ITEM_CODE]["item_description"].unique() assert len(_total_meat_item_description) == 1 total_meat_item_description = _total_meat_item_description[0] # Get FAO item name for selected item code. - _total_meat_fao_item = data[data["item_code"] == TOTAL_MEAT_ITEM_CODE]["fao_item"].unique() + _total_meat_fao_item = tb[tb["item_code"] == TOTAL_MEAT_ITEM_CODE]["fao_item"].unique() assert len(_total_meat_fao_item) == 1 total_meat_fao_item = _total_meat_fao_item[0] # Get FAO unit for selected item code. 
- _total_meat_fao_unit = data[data["item_code"] == TOTAL_MEAT_ITEM_CODE]["fao_unit_short_name"].unique() + _total_meat_fao_unit = tb[tb["item_code"] == TOTAL_MEAT_ITEM_CODE]["fao_unit_short_name"].unique() assert len(_total_meat_fao_unit) == 1 total_meat_fao_unit = _total_meat_fao_unit[0] @@ -274,10 +272,10 @@ def add_slaughtered_animals_to_meat_total(data: Table) -> Table: # Find country-years for which we have the number of poultry slaughtered. country_years_with_poultry_data = ( - data[ - (data["item_code"] == ITEM_CODE_MEAT_POULTRY) - & (data["element"] == SLAUGHTERED_ANIMALS_ELEMENT) - & (data["unit"] == SLAUGHTERED_ANIMALS_UNIT) + tb[ + (tb["item_code"] == ITEM_CODE_MEAT_POULTRY) + & (tb["element"] == SLAUGHTERED_ANIMALS_ELEMENT) + & (tb["unit"] == SLAUGHTERED_ANIMALS_UNIT) ] .dropna(subset="value")[["country", "year"]] .drop_duplicates() @@ -297,11 +295,11 @@ def add_slaughtered_animals_to_meat_total(data: Table) -> Table: animals_corrected = compared[compared["_merge"] == "both"].reset_index(drop=True).drop(columns=["_merge"]) # Check that we are not missing any column. - assert set(data.columns) == set(animals_corrected.columns) + assert set(tb.columns) == set(animals_corrected.columns) # Add animals data to the original table. - combined_data = ( - pr.concat([data, animals_corrected], ignore_index=True) + tb_combined = ( + pr.concat([tb, animals_corrected], ignore_index=True) .reset_index(drop=True) .astype( { @@ -320,7 +318,7 @@ def add_slaughtered_animals_to_meat_total(data: Table) -> Table: ) ) - return combined_data + return tb_combined def add_yield_to_aggregate_regions(data: Table) -> Table: @@ -447,11 +445,9 @@ def run(dest_dir: str) -> None: # Get paths and naming conventions for current data step. paths = PathFinder(current_step_file.as_posix()) - # Load latest meadow dataset and keep its metadata. + # Load latest meadow dataset and read its main table. ds_meadow = paths.load_dataset(dataset_short_name) - # Load main table from dataset. 
- tb_meadow = ds_meadow[dataset_short_name] - data = tb_meadow.reset_index() + tb = ds_meadow[dataset_short_name].reset_index() # Load dataset of FAOSTAT metadata. metadata = paths.load_dataset(f"{NAMESPACE}_metadata") @@ -478,12 +474,12 @@ def run(dest_dir: str) -> None: # Process data. # # Harmonize items and elements, and clean data. - data = harmonize_items(tb=data, dataset_short_name=dataset_short_name) - data = harmonize_elements(tb=data, dataset_short_name=dataset_short_name) + tb = harmonize_items(tb=tb, dataset_short_name=dataset_short_name) + tb = harmonize_elements(tb=tb, dataset_short_name=dataset_short_name) # Prepare data. - data = clean_data( - tb=data, + tb = clean_data( + tb=tb, ds_population=ds_population, items_metadata=items_metadata, elements_metadata=elements_metadata, @@ -492,14 +488,14 @@ def run(dest_dir: str) -> None: ) # Fill missing data for slaughtered poultry with slaughtered chicken. - data = fill_slaughtered_poultry_with_slaughtered_chicken(data=data) + tb = fill_slaughtered_poultry_with_slaughtered_chicken(tb=tb) # Include number of slaughtered animals in total meat (which is missing). - data = add_slaughtered_animals_to_meat_total(data=data) + tb = add_slaughtered_animals_to_meat_total(tb=tb) # Add data for aggregate regions. - data = add_regions( - tb=data, + tb = add_regions( + tb=tb, ds_regions=ds_regions, ds_population=ds_population, ds_income_groups=ds_income_groups, @@ -507,33 +503,31 @@ def run(dest_dir: str) -> None: ) # Add per-capita variables. - data = add_per_capita_variables(tb=data, elements_metadata=elements_metadata) + tb = add_per_capita_variables(tb=tb, elements_metadata=elements_metadata) # Add yield (production per area) to aggregate regions. - data = add_yield_to_aggregate_regions(data) + tb = add_yield_to_aggregate_regions(tb) # Handle detected anomalies in the data. 
- data, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, data=data) + tb, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, tb=tb) # Create a long table (with item code and element code as part of the index). - data_table_long = prepare_long_table(tb=data) + tb_long = prepare_long_table(tb=tb) # Create a wide table (with only country and year as index). - data_table_wide = prepare_wide_table(tb=data) + tb_wide = prepare_wide_table(tb=tb) # # Save outputs. # # Update tables metadata. - data_table_long.metadata.short_name = dataset_short_name - data_table_long.metadata.title = dataset_metadata["owid_dataset_title"] - data_table_wide.metadata.short_name = f"{dataset_short_name}_flat" - data_table_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE + tb_long.metadata.short_name = dataset_short_name + tb_long.metadata.title = dataset_metadata["owid_dataset_title"] + tb_wide.metadata.short_name = f"{dataset_short_name}_flat" + tb_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE # Initialise new garden dataset. - ds_garden = create_dataset( - dest_dir=dest_dir, tables=[data_table_long, data_table_wide], default_metadata=ds_meadow.metadata - ) + ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_long, tb_wide], default_metadata=ds_meadow.metadata) # Update dataset metadata and add description of anomalies (if any) to the dataset description. ds_garden.metadata.description = ( diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index ba08cc1167b..8b52341e391 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -1019,11 +1019,11 @@ def add_regions( Returns ------- - tb : Table + tb_with_regions : Table Data after adding rows for aggregate regions. 
""" - tb = tb.copy() + tb_with_regions = tb.copy() # Create a dictionary of aggregations, specifying the operation to use when creating regions. # These aggregations are defined in the custom_elements_and_units.csv file, and added to the metadata dataset. @@ -1033,7 +1033,7 @@ def add_regions( .to_dict()["owid_aggregation"] ) if len(aggregations) > 0: - log.info("add_regions", shape=tb.shape) + log.info("add_regions", shape=tb_with_regions.shape) # Load population dataset, countries-regions, and income groups datasets. population = load_population(ds_population=ds_population) @@ -1061,7 +1061,10 @@ def add_regions( element_codes = aggregations_inverted[aggregation] # Select relevant rows in the data. - data_region = tb[(tb["country"].isin(countries_in_region)) & (tb["element_code"].isin(element_codes))] + data_region = tb_with_regions[ + (tb_with_regions["country"].isin(countries_in_region)) + & (tb_with_regions["element_code"].isin(element_codes)) + ] # Ensure there is no overlap between historical regions and their successors. data_region = remove_overlapping_data_between_historical_regions_and_successors(data_region) @@ -1118,27 +1121,30 @@ def add_regions( ) # Add data for current region to data. - tb = dataframes.concatenate( - [tb[tb["country"] != region].reset_index(drop=True), data_region], + tb_with_regions = dataframes.concatenate( + [tb_with_regions[tb_with_regions["country"] != region].reset_index(drop=True), data_region], ignore_index=True, ) # Check that the fraction of population with data is as high as expected. - frac_population = tb["population_with_data"] / tb["population"] + frac_population = tb_with_regions["population_with_data"] / tb_with_regions["population"] assert frac_population[frac_population.notnull()].min() >= region_min_frac_population_with_data # Drop column of total population (we will still keep population_with_data). 
- tb = tb.drop(columns=["population"]) + tb_with_regions = tb_with_regions.drop(columns=["population"]) # Make area_code of category type (it contains integers and strings, and feather does not support object types). - tb["area_code"] = tb["area_code"].astype(str).astype("category") + tb_with_regions["area_code"] = tb_with_regions["area_code"].astype(str).astype("category") # Sort conveniently. - tb = tb.sort_values(["country", "year"]).reset_index(drop=True) + tb_with_regions = tb_with_regions.sort_values(["country", "year"]).reset_index(drop=True) - check_that_countries_are_well_defined(tb) + check_that_countries_are_well_defined(tb_with_regions) - return tb + # Copy metadata of the original table (including indicators metadata). + tb_with_regions = tb_with_regions.copy_metadata(from_table=tb) + + return tb_with_regions def add_fao_population_if_given(tb: Table) -> Table: @@ -1333,17 +1339,19 @@ def add_per_capita_variables(tb: Table, elements_metadata: Table) -> Table: Data with per-capita variables. """ - tb = tb.copy() + tb_with_pc_variables = tb.copy() # Find element codes that have to be made per capita. element_codes_to_make_per_capita = list( elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() ) if len(element_codes_to_make_per_capita) > 0: - log.info("add_per_capita_variables", shape=tb.shape) + log.info("add_per_capita_variables", shape=tb_with_pc_variables.shape) # Create a new dataframe that will have all per capita variables. - per_capita_data = tb[tb["element_code"].isin(element_codes_to_make_per_capita)].reset_index(drop=True) + per_capita_data = tb_with_pc_variables[ + tb_with_pc_variables["element_code"].isin(element_codes_to_make_per_capita) + ].reset_index(drop=True) # Change element codes of per capita variables. 
per_capita_data["element_code"] = per_capita_data["element_code"].cat.rename_categories( @@ -1382,7 +1390,12 @@ def add_per_capita_variables(tb: Table, elements_metadata: Table) -> Table: lambda c: f"{c} {NEW_PER_CAPITA_ADDED_ELEMENT_DESCRIPTION}" ) # Add new rows with per capita variables to data. - tb = dataframes.concatenate([tb, per_capita_data], ignore_index=True).reset_index(drop=True) + tb_with_pc_variables = dataframes.concatenate( + [tb_with_pc_variables, per_capita_data], ignore_index=True + ).reset_index(drop=True) + + # Copy metadata of the original table (including indicators metadata). + tb_with_pc_variables = tb_with_pc_variables.copy_metadata(from_table=tb) return tb @@ -1571,7 +1584,7 @@ def prepare_long_table(tb: Table) -> Table: """ # Create new table with long data. - tb_long = Table(tb) + tb_long = tb.copy() # Ensure table has the optimal dtypes before storing it as feather file. tb_long = optimize_table_dtypes(table=tb_long) @@ -1693,12 +1706,10 @@ def prepare_wide_table(tb: Table) -> Table: # Note: `pivot` operation is usually faster on categorical columns log.info("prepare_wide_table.pivot", shape=tb.shape) # Create a wide table with just the data values. - tb_wide = Table( - tb.pivot( - index=["area_code", "country", "year"], - columns=["variable_name"], - values="value", - ) + tb_wide = tb.pivot( + index=["area_code", "country", "year"], + columns=["variable_name"], + values="value", ) # Add metadata to each new variable in the wide data table. @@ -1799,9 +1810,8 @@ def run(dest_dir: str) -> None: # Get paths and naming conventions for current data step. paths = PathFinder(current_step_file.as_posix()) - # Load latest meadow dataset and keep its metadata. + # Load latest meadow dataset and read its main table. ds_meadow = paths.load_dataset(dataset_short_name) - # Load main table from dataset. tb = ds_meadow[dataset_short_name].reset_index() # Load dataset of FAOSTAT metadata. 
@@ -1855,7 +1865,7 @@ def run(dest_dir: str) -> None: tb = add_per_capita_variables(tb=tb, elements_metadata=elements_metadata) # Handle detected anomalies in the data. - tb, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, data=tb) + tb, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, tb=tb) # Create a long table (with item code and element code as part of the index). tb_long = prepare_long_table(tb=tb) From 210f05904865bbe70ee975c31c93b789f3e09c72 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 19 Mar 2024 16:38:28 +0100 Subject: [PATCH 29/54] Fix various issues to ensure origins are propagated --- .../garden/faostat/2024-03-14/faostat_fbsc.py | 10 ++-- .../2024-03-14/faostat_food_explorer.py | 53 ++++++++++--------- .../data/garden/faostat/2024-03-14/shared.py | 2 +- 3 files changed, 33 insertions(+), 32 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py index d6008b82f91..879ce0d474a 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py @@ -172,7 +172,7 @@ def run(dest_dir: str) -> None: ) tb = combine_fbsh_and_fbs_datasets(ds_fbsh=ds_fbsh, ds_fbs=ds_fbs) - _assert_tb_size(tb, 2000) + # _assert_tb_size(tb, 2000) # Prepare data. tb = clean_data( @@ -200,16 +200,16 @@ def run(dest_dir: str) -> None: tb, anomaly_descriptions = handle_anomalies(dataset_short_name=dataset_short_name, tb=tb) # Avoid objects as they would explode memory, use categoricals instead. - for col in tb.columns: - assert tb[col].dtype != object, f"Column {col} should not have object type" + # for col in tb.columns: + # assert tb[col].dtype != object, f"Column {col} should not have object type" - _assert_tb_size(tb, 2000) + # _assert_tb_size(tb, 2000) # Create a long table (with item code and element code as part of the index). 
log.info("faostat_fbsc.prepare_long_table", shape=tb.shape) tb_long = prepare_long_table(tb=tb) - _assert_tb_size(tb_long, 2000) + # _assert_tb_size(tb_long, 2000) # Create a wide table (with only country and year as index). log.info("faostat_fbsc.prepare_wide_table", shape=tb.shape) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py index 9e4e3b9683f..ecdc689ed7a 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py @@ -10,8 +10,7 @@ from pathlib import Path -import pandas as pd -from owid.catalog import Table +from owid.catalog import Dataset, Table from owid.datautils import dataframes from shared import ( CURRENT_DIR, @@ -450,7 +449,7 @@ def get_fao_population(combined: Table) -> Table: return fao_population -def process_combined_data(combined: Table) -> Table: +def process_combined_data(tb: Table, ds_population: Dataset) -> Table: """Process combined data (combination of `faostat_qcl` and `faostat_fbsc` data) to have the content and format required by the food explorer. @@ -465,46 +464,46 @@ def process_combined_data(combined: Table) -> Table: Processed data (in wide format). """ - combined = combined.copy() + tb = tb.copy() # Get FAO population from data (it is given as another item). - fao_population = get_fao_population(combined=combined) + fao_population = get_fao_population(combined=tb) # List of all item codes to select. selected_item_codes = sorted(set(ITEM_CODES_FBSC).union(ITEM_CODES_QCL)) # Check that all expected products are included in the data. - missing_products = sorted(set(selected_item_codes) - set(set(combined["item_code"]))) + missing_products = sorted(set(selected_item_codes) - set(set(tb["item_code"]))) assert len(missing_products) == 0, f"{len(missing_products)} missing products for food explorer." # Select relevant products for the food explorer. 
- combined = combined[combined["item_code"].isin(selected_item_codes)].reset_index(drop=True) + tb = tb[tb["item_code"].isin(selected_item_codes)].reset_index(drop=True) # Join element and unit into one title column. - combined["title"] = combined["element"] + " (" + combined["unit"] + ")" + tb["title"] = tb["element"] + " (" + tb["unit"] + ")" # This will create a table with just one column and country-year as index. index_columns = ["product", "country", "year"] - data_wide = combined.pivot(index=index_columns, columns=["title"], values="value").reset_index() + tb_wide = tb.pivot(index=index_columns, columns=["title"], values="value").reset_index() # Add column for FAO population. - data_wide = pd.merge(data_wide, fao_population, on=["country", "year"], how="left") + tb_wide = tb_wide.merge(fao_population, on=["country", "year"], how="left") # Add column for OWID population. - data_wide = geo.add_population_to_dataframe(df=data_wide, warn_on_missing_countries=False) + tb_wide = geo.add_population_to_table(tb=tb_wide, ds_population=ds_population, warn_on_missing_countries=False) # Fill gaps in OWID population with FAO population (for "* (FAO)" countries, i.e. countries that were not # harmonized and for which there is no OWID population). # Then drop "fao_population", since it is no longer needed. - data_wide["population"] = data_wide["population"].fillna(data_wide["fao_population"]) - data_wide = data_wide.drop(columns="fao_population") + tb_wide["population"] = tb_wide["population"].fillna(tb_wide["fao_population"]) + tb_wide = tb_wide.drop(columns="fao_population") - assert len(data_wide.columns[data_wide.isnull().all(axis=0)]) == 0, "Unexpected columns with only nan values." + assert len(tb_wide.columns[tb_wide.isnull().all(axis=0)]) == 0, "Unexpected columns with only nan values." # Set a reasonable index. 
- data_wide = data_wide.set_index(index_columns, verify_integrity=True) + tb_wide = tb_wide.set_index(index_columns, verify_integrity=True) - return data_wide + return tb_wide def run(dest_dir: str) -> None: @@ -521,35 +520,37 @@ def run(dest_dir: str) -> None: paths = PathFinder(current_step_file.as_posix()) # Load latest qcl and fbsc datasets from garden. - qcl_dataset = paths.load_dataset(f"{NAMESPACE}_qcl") - fbsc_dataset = paths.load_dataset(f"{NAMESPACE}_fbsc") + ds_qcl = paths.load_dataset(f"{NAMESPACE}_qcl") + ds_fbsc = paths.load_dataset(f"{NAMESPACE}_fbsc") # Get main long tables from qcl and fbsc datasets. - tb_qcl = qcl_dataset[f"{NAMESPACE}_qcl"] - tb_fbsc = fbsc_dataset[f"{NAMESPACE}_fbsc"] + tb_qcl = ds_qcl[f"{NAMESPACE}_qcl"] + tb_fbsc = ds_fbsc[f"{NAMESPACE}_fbsc"] + + # Load population dataset. + ds_population = paths.load_dataset("population") # # Process data. # # Combine qcl and fbsc data. - data = combine_qcl_and_fbsc(tb_qcl=tb_qcl, tb_fbsc=tb_fbsc) + tb = combine_qcl_and_fbsc(tb_qcl=tb_qcl, tb_fbsc=tb_fbsc) # Prepare data in the format required by the food explorer. - data = process_combined_data(combined=data) + tb = process_combined_data(tb=tb, ds_population=ds_population) - # Create table of products. - table = Table(data, short_name=dataset_short_name) + # Rename table of products. + tb.metadata.short_name = dataset_short_name # # Save outputs. # # Initialise new garden dataset. - ds_garden = create_dataset(dest_dir=dest_dir, tables=[table], default_metadata=fbsc_dataset.metadata) + ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb], default_metadata=ds_fbsc.metadata) # Update dataset metadata and combine sources from qcl and fbsc datasets. ds_garden.metadata.title = DATASET_TITLE ds_garden.metadata.description = DATASET_DESCRIPTION - ds_garden.metadata.sources = fbsc_dataset.metadata.sources + qcl_dataset.metadata.sources # Create new dataset in garden. 
ds_garden.save() diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index 8b52341e391..bc6add0290b 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -1397,7 +1397,7 @@ def add_per_capita_variables(tb: Table, elements_metadata: Table) -> Table: # Copy metadata of the original table (including indicators metadata). tb_with_pc_variables = tb_with_pc_variables.copy_metadata(from_table=tb) - return tb + return tb_with_pc_variables def clean_data_values(values: Variable, amendments: Dict[str, str]) -> Variable: From da3d6a713dc0b97a0642e202caf6e3ae4c71b91d Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 19 Mar 2024 16:48:00 +0100 Subject: [PATCH 30/54] Fix missing origins --- etl/steps/data/garden/faostat/2024-03-14/shared.py | 4 ++-- etl/steps/data/meadow/faostat/2024-03-14/shared.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index bc6add0290b..0cf4357bc9f 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -1425,7 +1425,7 @@ def clean_data_values(values: Variable, amendments: Dict[str, str]) -> Variable: show_full_warning=True, ), name="value", - ) + ).copy_metadata(from_variable=values) # Convert all numbers into numeric. # Note: If this step fails with a ValueError, it may be because other spurious values have been introduced. @@ -1883,7 +1883,7 @@ def run(dest_dir: str) -> None: tb_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE # Initialise new garden dataset. 
- ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_long, tb_wide], default_metadata=ds_meadow.metadata) + ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_long, tb_wide], default_metadata=ds_meadow.metadata, check_variables_metadata=True) # Update dataset metadata. # Add description of anomalies (if any) to the dataset description. ds_garden.metadata.description = dataset_metadata["owid_dataset_description"] + anomaly_descriptions diff --git a/etl/steps/data/meadow/faostat/2024-03-14/shared.py b/etl/steps/data/meadow/faostat/2024-03-14/shared.py index d5cd5539a0d..9facd29c7b9 100644 --- a/etl/steps/data/meadow/faostat/2024-03-14/shared.py +++ b/etl/steps/data/meadow/faostat/2024-03-14/shared.py @@ -169,5 +169,5 @@ def run(dest_dir: str) -> None: # Save outputs. # # Create a new meadow dataset. - ds_meadow = create_dataset(dest_dir=dest_dir, tables=[tb]) + ds_meadow = create_dataset(dest_dir=dest_dir, tables=[tb], check_variables_metadata=True) ds_meadow.save() From 424f50e46cb5535b250ea6fd829a56006af9a8f0 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 20 Mar 2024 10:01:13 +0100 Subject: [PATCH 31/54] Raise warning if domains do not exist any longer in FAOSTAT, and remove those --- etl/scripts/faostat/create_new_snapshots.py | 8 ++++++++ etl/scripts/faostat/shared.py | 13 +++++++------ etl/steps/data/garden/faostat/2024-03-14/shared.py | 4 +++- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/etl/scripts/faostat/create_new_snapshots.py b/etl/scripts/faostat/create_new_snapshots.py index 01da17e543a..d0d07c2a6e3 100644 --- a/etl/scripts/faostat/create_new_snapshots.py +++ b/etl/scripts/faostat/create_new_snapshots.py @@ -289,6 +289,14 @@ def main(read_only: bool = False) -> None: any_dataset_was_updated = False # Fetch dataset codes from FAOSTAT catalog. faostat_catalog = load_faostat_catalog() + + # Check if any of the domains that we want to download are missing. 
+ missing_domains = sorted( + set(INCLUDED_DATASETS_CODES) - set([entry["DatasetCode"].lower() for entry in faostat_catalog]) + ) + if len(missing_domains) > 0: + log.warning(f"The following domains cannot be found in FAOSTAT anymore: {missing_domains}") + for description in faostat_catalog: # Build FAODataset instance. dataset_code = description["DatasetCode"].lower() diff --git a/etl/scripts/faostat/shared.py b/etl/scripts/faostat/shared.py index 84ac02cfe73..b0c0c881f9d 100644 --- a/etl/scripts/faostat/shared.py +++ b/etl/scripts/faostat/shared.py @@ -41,18 +41,12 @@ INCLUDED_DATASETS_CODES = [ # Cost and Affordability of a Healthy Diet. "cahd", - # Land, Inputs and Sustainability: Fertilizers indicators. - "ef", # Climate Change: Emissions intensities. "ei", # Land, Inputs and Sustainability: Livestock Patterns. "ek", - # Land, Inputs and Sustainability: Land use indicators. - "el", # Land, Inputs and Sustainability: Livestock Manure. "emn", - # Land, Inputs and Sustainability: Pesticides indicators. - "ep", # Land, Inputs and Sustainability: Soil nutrient budget. "esb", # Discontinued archives and data series: Food Aid Shipments (WFP). @@ -98,6 +92,13 @@ # "wcad", # Energy use. # "gn", + # The following domains used to exist in FAOSTAT, but they have been removed. + # Land, Inputs and Sustainability: Fertilizers indicators. + # "ef", + # Land, Inputs and Sustainability: Land use indicators. + # "el", + # Land, Inputs and Sustainability: Pesticides indicators. + # "ep", ] # URL for dataset codes in FAOSTAT catalog. 
# This is the URL used to get the remote location of the actual data files to be downloaded, and the date of their diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index 0cf4357bc9f..eb173b67371 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -1883,7 +1883,9 @@ def run(dest_dir: str) -> None: tb_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE # Initialise new garden dataset. - ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_long, tb_wide], default_metadata=ds_meadow.metadata, check_variables_metadata=True) + ds_garden = create_dataset( + dest_dir=dest_dir, tables=[tb_long, tb_wide], default_metadata=ds_meadow.metadata, check_variables_metadata=True + ) # Update dataset metadata. # Add description of anomalies (if any) to the dataset description. ds_garden.metadata.description = dataset_metadata["owid_dataset_description"] + anomaly_descriptions From 1f5c48d00f88fa65d17482f6b07c6fb41113f1a0 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 20 Mar 2024 10:24:49 +0100 Subject: [PATCH 32/54] Update docs --- docs/data/faostat.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/data/faostat.md b/docs/data/faostat.md index 44b3d5cd36d..50845348bdb 100644 --- a/docs/data/faostat.md +++ b/docs/data/faostat.md @@ -224,6 +224,16 @@ If no dataset requires an update, the workflow stops here. python etl/scripts/faostat/create_new_snapshots.py ``` + !!! note + + It has already happened a few times that FAOSTAT changes an indicator from one domain to another. If + `create_new_snapshot` raises a warning because a domain is no longer found, usually the indicators of the old + domain can be found in another domain. Go to the grapher dataset of the old domain and gather all pairs of + item code + element code that were used in charts. 
Then go to [the FAOSTAT definitions table](https://www.fao.org/faostat/en/#definitions) + and find the domain where that combination of item code and element code can be found. If we were not + downloading this domain, add it to the list `INCLUDED_DATASETS_CODES`. Then replace variables used in those + charts with the new ones. + 2. Create new meadow steps. !!! note From 8e2431c70979d55929d8fe4dceb0c126b0132556 Mon Sep 17 00:00:00 2001 From: Marigold Date: Wed, 20 Mar 2024 11:11:56 +0100 Subject: [PATCH 33/54] :bug: retry on SSLError --- apps/backport/datasync/datasync.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/backport/datasync/datasync.py b/apps/backport/datasync/datasync.py index d1d01923af6..63805898031 100644 --- a/apps/backport/datasync/datasync.py +++ b/apps/backport/datasync/datasync.py @@ -3,7 +3,7 @@ from typing import Any, Dict import structlog -from botocore.exceptions import EndpointConnectionError +from botocore.exceptions import EndpointConnectionError, SSLError from owid.catalog import s3_utils from tenacity import Retrying from tenacity.retry import retry_if_exception_type @@ -32,7 +32,7 @@ def upload_gzip_dict(d: Dict[str, Any], s3_path: str, private: bool = False) -> for attempt in Retrying( wait=wait_exponential(min=5, max=100), stop=stop_after_attempt(7), - retry=retry_if_exception_type(EndpointConnectionError), + retry=retry_if_exception_type((EndpointConnectionError, SSLError)), ): with attempt: client.put_object( From 3b0c8e4a4b4906636fdfc324f81570e9f670c659 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 20 Mar 2024 11:37:23 +0100 Subject: [PATCH 34/54] Remove steps that do not exist anymore in FAOSTAT --- dag/faostat.yml | 9 --------- etl/steps/data/meadow/faostat/2024-03-14/faostat_ef.py | 2 -- etl/steps/data/meadow/faostat/2024-03-14/faostat_el.py | 2 -- etl/steps/data/meadow/faostat/2024-03-14/faostat_ep.py | 2 -- 4 files changed, 15 deletions(-) delete mode 100644 
etl/steps/data/meadow/faostat/2024-03-14/faostat_ef.py delete mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_el.py delete mode 100644 etl/steps/data/meadow/faostat/2024-03-14/faostat_ep.py diff --git a/dag/faostat.yml b/dag/faostat.yml index c2d01acbc18..f5c4787c6d7 100644 --- a/dag/faostat.yml +++ b/dag/faostat.yml @@ -328,18 +328,12 @@ steps: # data://meadow/faostat/2024-03-14/faostat_cahd: - snapshot://faostat/2024-03-14/faostat_cahd.zip - data://meadow/faostat/2024-03-14/faostat_ef: - - snapshot://faostat/2023-02-22/faostat_ef.zip data://meadow/faostat/2024-03-14/faostat_ei: - snapshot://faostat/2024-03-14/faostat_ei.zip data://meadow/faostat/2024-03-14/faostat_ek: - snapshot://faostat/2024-03-14/faostat_ek.zip - data://meadow/faostat/2024-03-14/faostat_el: - - snapshot://faostat/2023-06-12/faostat_el.zip data://meadow/faostat/2024-03-14/faostat_emn: - snapshot://faostat/2024-03-14/faostat_emn.zip - data://meadow/faostat/2024-03-14/faostat_ep: - - snapshot://faostat/2023-02-22/faostat_ep.zip data://meadow/faostat/2024-03-14/faostat_esb: - snapshot://faostat/2024-03-14/faostat_esb.zip data://meadow/faostat/2024-03-14/faostat_fa: @@ -462,7 +456,6 @@ steps: data://garden/faostat/2024-03-14/faostat_metadata: - data://meadow/faostat/2024-03-14/faostat_rt - data://meadow/faostat/2024-03-14/faostat_scl - - data://meadow/faostat/2024-03-14/faostat_el - data://meadow/faostat/2024-03-14/faostat_sdgb - data://meadow/faostat/2024-03-14/faostat_qv - data://meadow/faostat/2024-03-14/faostat_emn @@ -477,7 +470,6 @@ steps: - data://meadow/faostat/2024-03-14/faostat_ei - data://meadow/faostat/2024-03-14/faostat_rl - data://meadow/faostat/2024-03-14/faostat_ic - - data://meadow/faostat/2024-03-14/faostat_ef - data://meadow/faostat/2024-03-14/faostat_qi - data://meadow/faostat/2024-03-14/faostat_rfn - data://meadow/faostat/2024-03-14/faostat_rfb @@ -487,7 +479,6 @@ steps: - data://meadow/faostat/2024-03-14/faostat_cahd - data://meadow/faostat/2024-03-14/faostat_fbs 
- data://meadow/faostat/2024-03-14/faostat_ti - - data://meadow/faostat/2024-03-14/faostat_ep data://garden/faostat/2024-03-14/faostat_qcl: - data://garden/faostat/2024-03-14/faostat_metadata - data://grapher/demography/2023-03-31/population diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_ef.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_ef.py deleted file mode 100644 index c1b3ce5eec8..00000000000 --- a/etl/steps/data/meadow/faostat/2024-03-14/faostat_ef.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_ef dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_el.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_el.py deleted file mode 100644 index 7cda6b5ced7..00000000000 --- a/etl/steps/data/meadow/faostat/2024-03-14/faostat_el.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_el dataset.""" -from .shared import run # noqa:F401 diff --git a/etl/steps/data/meadow/faostat/2024-03-14/faostat_ep.py b/etl/steps/data/meadow/faostat/2024-03-14/faostat_ep.py deleted file mode 100644 index de1278faacf..00000000000 --- a/etl/steps/data/meadow/faostat/2024-03-14/faostat_ep.py +++ /dev/null @@ -1,2 +0,0 @@ -"""FAOSTAT meadow step for faostat_ep dataset.""" -from .shared import run # noqa:F401 From 632a27ee6c68cffaf216ce79f52ac531d5d34866 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 20 Mar 2024 11:54:20 +0100 Subject: [PATCH 35/54] Ensure origins are propagated --- .../garden/faostat/2024-03-14/faostat_fbsc.py | 12 +++++++++++- .../faostat/2024-03-14/faostat_food_explorer.py | 11 ++++++++--- .../data/garden/faostat/2024-03-14/faostat_qcl.py | 15 ++++++++++++++- .../data/garden/faostat/2024-03-14/shared.py | 13 ++++++++++++- .../data/meadow/faostat/2024-03-14/shared.py | 10 ++++++++-- 5 files changed, 53 insertions(+), 8 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py 
b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py index 879ce0d474a..af42c1e0104 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py @@ -215,6 +215,14 @@ def run(dest_dir: str) -> None: log.info("faostat_fbsc.prepare_wide_table", shape=tb.shape) tb_wide = prepare_wide_table(tb=tb) + # Check that column "value" has two origins (other columns are not as important and may not have origins). + error = f"Column 'value' of the long table of {dataset_short_name} must have two origins." + assert len(tb_long["value"].metadata.origins) == 2, error + error = f"All value columns of the wide table of {dataset_short_name} must have two origins." + assert all( + [len(tb_wide[column].metadata.origins) == 2 for column in tb_wide.columns if column not in ["area_code"]] + ), error + # # Save outputs. # @@ -225,7 +233,9 @@ def run(dest_dir: str) -> None: tb_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE # Initialise new garden dataset. - ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_long, tb_wide], default_metadata=ds_fbs.metadata) + ds_garden = create_dataset( + dest_dir=dest_dir, tables=[tb_long, tb_wide], default_metadata=ds_fbs.metadata, check_variables_metadata=False + ) # Check that the title assigned here coincides with the one in custom_datasets.csv (for consistency). error = "Dataset title given to fbsc is different to the one in custom_datasets.csv. Update the latter file." 
diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py index ecdc689ed7a..326485e94b8 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_food_explorer.py @@ -10,8 +10,8 @@ from pathlib import Path +import owid.catalog.processing as pr from owid.catalog import Dataset, Table -from owid.datautils import dataframes from shared import ( CURRENT_DIR, FAO_POPULATION_ELEMENT_NAME, @@ -392,8 +392,11 @@ def combine_qcl_and_fbsc(tb_qcl: Table, tb_fbsc: Table) -> Table: fbsc["item"] = [item for item in fbsc["item"]] rename_columns = {"item": "product"} + # combined = ( + # dataframes.concatenate([qcl, fbsc], ignore_index=True).rename(columns=rename_columns).reset_index(drop=True) + # ) combined = ( - dataframes.concatenate([qcl, fbsc], ignore_index=True).rename(columns=rename_columns).reset_index(drop=True) + pr.concat([qcl, fbsc], ignore_index=True).rename(columns=rename_columns, errors="raise").reset_index(drop=True) ) # Sanity checks. @@ -546,7 +549,9 @@ def run(dest_dir: str) -> None: # Save outputs. # # Initialise new garden dataset. - ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb], default_metadata=ds_fbsc.metadata) + ds_garden = create_dataset( + dest_dir=dest_dir, tables=[tb], default_metadata=ds_fbsc.metadata, check_variables_metadata=True + ) # Update dataset metadata and combine sources from qcl and fbsc datasets. ds_garden.metadata.title = DATASET_TITLE diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py index 85d899e4b26..5ac19596d9b 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py @@ -517,6 +517,14 @@ def run(dest_dir: str) -> None: # Create a wide table (with only country and year as index). 
tb_wide = prepare_wide_table(tb=tb) + # Check that column "value" has an origin (other columns are not as important and may not have origins). + error = f"Column 'value' of the long table of {dataset_short_name} must have one origin." + assert len(tb_long["value"].metadata.origins) == 1, error + error = f"All value columns of the wide table of {dataset_short_name} must have one origin." + assert all( + [len(tb_wide[column].metadata.origins) == 1 for column in tb_wide.columns if column not in ["area_code"]] + ), error + # # Save outputs. # @@ -527,7 +535,12 @@ def run(dest_dir: str) -> None: tb_wide.metadata.title = dataset_metadata["owid_dataset_title"] + ADDED_TITLE_TO_WIDE_TABLE # Initialise new garden dataset. - ds_garden = create_dataset(dest_dir=dest_dir, tables=[tb_long, tb_wide], default_metadata=ds_meadow.metadata) + ds_garden = create_dataset( + dest_dir=dest_dir, + tables=[tb_long, tb_wide], + default_metadata=ds_meadow.metadata, + check_variables_metadata=False, + ) # Update dataset metadata and add description of anomalies (if any) to the dataset description. ds_garden.metadata.description = ( diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index eb173b67371..08e0606214c 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -1873,6 +1873,14 @@ def run(dest_dir: str) -> None: # Create a wide table (with only country and year as index). tb_wide = prepare_wide_table(tb=tb) + # Check that column "value" has an origin (other columns are not as important and may not have origins). + error = f"Column 'value' of the long table of {dataset_short_name} must have one origin." + assert len(tb_long["value"].metadata.origins) == 1, error + error = f"All value columns of the wide table of {dataset_short_name} must have one origin." 
+ assert all( + [len(tb_wide[column].metadata.origins) == 1 for column in tb_wide.columns if column not in ["area_code"]] + ), error + # # Save outputs. # @@ -1884,7 +1892,10 @@ def run(dest_dir: str) -> None: # Initialise new garden dataset. ds_garden = create_dataset( - dest_dir=dest_dir, tables=[tb_long, tb_wide], default_metadata=ds_meadow.metadata, check_variables_metadata=True + dest_dir=dest_dir, + tables=[tb_long, tb_wide], + default_metadata=ds_meadow.metadata, + check_variables_metadata=False, ) # Update dataset metadata. # Add description of anomalies (if any) to the dataset description. diff --git a/etl/steps/data/meadow/faostat/2024-03-14/shared.py b/etl/steps/data/meadow/faostat/2024-03-14/shared.py index 9facd29c7b9..e744a283231 100644 --- a/etl/steps/data/meadow/faostat/2024-03-14/shared.py +++ b/etl/steps/data/meadow/faostat/2024-03-14/shared.py @@ -134,7 +134,9 @@ def prepare_output_data(tb: Table) -> Table: index_columns = list({"Area Code", "Recipient Country Code", "Year", "Item Code", "Element Code"} & set(tb.columns)) if tb.duplicated(subset=index_columns).any(): log.warning("Index has duplicated keys.") - tb = tb.set_index(index_columns) + + # Ensure all columns are snake-case, and set an index. + tb = tb.set_index(index_columns).underscore() return tb @@ -165,9 +167,13 @@ def run(dest_dir: str) -> None: # Prepare output meadow table. tb = prepare_output_data(tb=tb_snapshot) + # Check that column "value" has an origin (other columns are not as important and may not have origins). + assert len(tb["value"].metadata.origins) == 1, f"Column 'value' of {dataset_short_name} must have one origin." + # # Save outputs. # # Create a new meadow dataset. - ds_meadow = create_dataset(dest_dir=dest_dir, tables=[tb], check_variables_metadata=True) + # NOTE: Do not check if all variables have metadata. We asserted above that "value" has an origin. 
+ ds_meadow = create_dataset(dest_dir=dest_dir, tables=[tb], check_variables_metadata=False) ds_meadow.save() From 5963b81bbcf3745c919053c0e33bbe0390716246 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 20 Mar 2024 12:33:12 +0100 Subject: [PATCH 36/54] Fix changed country names --- .../faostat/2024-03-14/faostat.countries.json | 6 +++--- .../faostat/2024-03-14/faostat_metadata.py | 18 ------------------ 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat.countries.json b/etl/steps/data/garden/faostat/2024-03-14/faostat.countries.json index 99202add9cb..98a4f1855c8 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat.countries.json +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat.countries.json @@ -84,7 +84,7 @@ "Fiji": "Fiji", "Finland": "Finland", "France": "France", - "French Guyana": "French Guiana", + "French Guiana": "French Guiana", "French Polynesia": "French Polynesia", "French Southern Territories": "French Southern Territories", "Gabon": "Gabon", @@ -160,7 +160,7 @@ "Namibia": "Namibia", "Nauru": "Nauru", "Nepal": "Nepal", - "Netherlands": "Netherlands", + "Netherlands (Kingdom of the)": "Netherlands", "Netherlands Antilles (former)": "Netherlands Antilles", "New Caledonia": "New Caledonia", "New Zealand": "New Zealand", @@ -199,7 +199,7 @@ "Saint Lucia": "Saint Lucia", "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Saint-Martin (French part)": "Saint Martin (French part)", + "Saint Martin (French part)": "Saint Martin (French part)", "Samoa": "Samoa", "San Marino": "San Marino", "Sao Tome and Principe": "Sao Tome and Principe", diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py index 57c9027e128..98bfabc943d 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py +++ 
b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py @@ -922,24 +922,6 @@ def process_metadata( tb["area_code"] = tb["area_code"].astype("Int64") - ################################################################################################################ - # Temporary patch. - # Some areas are defined with different names (but same area codes) in different domains. - # This causes some issues at a later stage. - # For now, manually rename those areas here. - if "French Guiana" in tb["fao_country"].unique(): - tb["fao_country"] = dataframes.map_series(tb["fao_country"], mapping={"French Guiana": "French Guyana"}) - if "Netherlands (Kingdom of the)" in tb["fao_country"].unique(): - tb["fao_country"] = dataframes.map_series( - tb["fao_country"], mapping={"Netherlands (Kingdom of the)": "Netherlands"} - ) - if "Saint Martin (French part)" in tb["fao_country"].unique(): - tb["fao_country"] = dataframes.map_series( - tb["fao_country"], mapping={"Saint Martin (French part)": "Saint-Martin (French part)"} - ) - - ################################################################################################################ - if f"{dataset_short_name}_flag" in metadata.table_names: check_that_all_flags_in_dataset_are_in_ranking( table=table, metadata_for_flags=metadata[f"{dataset_short_name}_flag"] From 5b63f9e6494dd396f20d60cdb5620c5708b05440 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 20 Mar 2024 12:51:58 +0100 Subject: [PATCH 37/54] Avoid repetitive warning on missing units --- etl/steps/data/garden/faostat/2024-03-14/shared.py | 8 +++++++- etl/steps/data/grapher/faostat/2024-03-14/shared.py | 4 +++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index 08e0606214c..f515b744707 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -855,11 +855,17 @@ def 
add_custom_names_and_descriptions(tb: Table, items_metadata: Table, elements tb = tb.rename(columns={column: column.replace("owid_", "") for column in tb.columns}) # Fill missing unit and short_unit columns with empty strings. + missing_fields = {"fields": [], "elements": []} for column in ["unit", "unit_short_name"]: missing_unit_mask = tb[column].isnull() if not tb[missing_unit_mask].empty: - log.warning(f"Missing {column} for elements: {set(tb[missing_unit_mask]['element'])}") tb[column] = tb[column].cat.add_categories("").fillna("") + missing_fields["fields"].append(column) + missing_fields["elements"] = sorted(set(missing_fields["elements"]) | set(tb[missing_unit_mask]["element"])) + if missing_fields["fields"]: + log.info( + f"Filling missing fields {missing_fields['fields']} with ''. Affected elements: {missing_fields['elements']}" + ) return tb diff --git a/etl/steps/data/grapher/faostat/2024-03-14/shared.py b/etl/steps/data/grapher/faostat/2024-03-14/shared.py index 95b623b6fff..bf7d941e000 100644 --- a/etl/steps/data/grapher/faostat/2024-03-14/shared.py +++ b/etl/steps/data/grapher/faostat/2024-03-14/shared.py @@ -39,5 +39,7 @@ def run(dest_dir: str) -> None: # Save outputs. # # Create a new grapher dataset. 
- ds_grapher = create_dataset(dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata) + ds_grapher = create_dataset( + dest_dir=dest_dir, tables=[tb_garden], default_metadata=ds_garden.metadata, check_variables_metadata=True + ) ds_grapher.save() From c8f00c55713d6286328d1e4faddc69c8caa2815b Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 20 Mar 2024 13:17:04 +0100 Subject: [PATCH 38/54] Check discrepant item name definitions and improve warning messages and documentation --- docs/data/faostat.md | 6 +++++- .../data/garden/faostat/2024-03-14/faostat_metadata.py | 8 +++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/data/faostat.md b/docs/data/faostat.md index 50845348bdb..9c221ab430d 100644 --- a/docs/data/faostat.md +++ b/docs/data/faostat.md @@ -268,7 +268,7 @@ If no dataset requires an update, the workflow stops here. ``` !!! note - If a new domain has been added to this version, you may need to manually add its meadow step as a dependency of garden/faostat/YYYY-MM-DD/faostat_metadata in the dag (this is a known bug). + If a new domain has been added to this version, you may need to manually add its meadow step as a dependency of garden/faostat/YYYY-MM-DD/faostat_metadata in the dag (this is a known bug). 6. Inspect and update any possible changes of dataset/item/element/unit names and descriptions. @@ -281,6 +281,10 @@ If no dataset requires an update, the workflow stops here. etl run garden/faostat/YYYY-MM-DD ``` + !!! note + + Sometimes `garden/faostat/YYYY-MM-DD/faostat_metadata` raises the warning "X item codes in data mapping to different items in metadata.". This used to happen often. In the latest version, only once (`faostat_rp`, with 38 discrepant items). It usually means that there are small differences between the item name in FAO data and the item name (for the same item code) in FAO metadata. There usually are small differences, like "Rodenticides Other" and "Rodenticides – Other". 
But after every update, if there are new (or many) discrepant items, check the content of `compared` inside `create_items_table_for_domain`, in the `faostat_metadata`step. + 7. Create new grapher steps. ```bash diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py index 98bfabc943d..982e9499459 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py @@ -285,10 +285,12 @@ def create_items_table_for_domain(table: Table, metadata: Dataset, dataset_short ) different_items = compared[compared["fao_item_in_data"] != compared["fao_item_in_metadata"]] missing_item_codes = set(items_from_data["item_code"]) - set(_tb_items["item_code"]) - if (len(different_items) + len(missing_item_codes)) > N_ISSUES_ON_ITEMS_FOR_WARNING: + if len(missing_item_codes) > 0: + log.warning(f"{len(missing_item_codes)} item codes in {dataset_short_name} missing in metadata. ") + if len(different_items) > 0: + _frac_different = len(different_items) / len(set(compared["fao_item_in_data"])) log.warning( - f"{len(missing_item_codes)} item codes in {dataset_short_name} missing in metadata. " - f"{len(different_items)} item codes in data mapping to different items in metadata." + f"{len(different_items)} item codes in data ({_frac_different:.2%}) mapping to different items in metadata." 
) return items_from_data From 5aee3626577ef0fbce51378d226e4a60ab6ae95f Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 20 Mar 2024 13:20:39 +0100 Subject: [PATCH 39/54] Improve warning messages --- etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py index 982e9499459..d5405097545 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py @@ -64,9 +64,6 @@ from etl.helpers import PathFinder -# Minimum number of issues in the comparison of items and item codes from data and metadata to raise a warning. -N_ISSUES_ON_ITEMS_FOR_WARNING = 1 - def create_dataset_descriptions_table_for_domain(table: Table, dataset_short_name: str) -> Table: """Create a single row table with the dataset name, title and description, for a given domain. @@ -290,7 +287,7 @@ def create_items_table_for_domain(table: Table, metadata: Dataset, dataset_short if len(different_items) > 0: _frac_different = len(different_items) / len(set(compared["fao_item_in_data"])) log.warning( - f"{len(different_items)} item codes in data ({_frac_different:.2%}) mapping to different items in metadata." + f"{len(different_items)} item codes of {dataset_short_name} in data ({_frac_different:.2%}) mapping to different items in metadata." 
) return items_from_data From 7ac08af2a61aaa91cd87ce6d575d36daf0f0c5eb Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 20 Mar 2024 14:22:35 +0100 Subject: [PATCH 40/54] Inspect remaining warnings and improve warning handling --- docs/data/faostat.md | 13 +++++++----- .../faostat/2024-03-14/faostat_metadata.py | 20 +++++++++++++++---- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/docs/data/faostat.md b/docs/data/faostat.md index 9c221ab430d..85458ca1c54 100644 --- a/docs/data/faostat.md +++ b/docs/data/faostat.md @@ -261,11 +261,18 @@ If no dataset requires an update, the workflow stops here. etl run garden/faostat/YYYY-MM-DD ``` - Optionally, set `INSPECT_ANOMALIES=True`, to visualize if anomalies that were detected in the previous version of the data are still present in the current version. + The first time running the steps after an update, set `INSPECT_ANOMALIES=True`, to visualize if anomalies that were detected in the previous version of the data are still present in the current version. ```bash INSPECT_ANOMALIES=True etl run garden/faostat/YYYY-MM-DD ``` + + If warnings are shown, set `SHOW_WARNING_DETAILS=True` and run the `faostat_metadata` step again. Check that differences between item names in the data and metadata are small (e.g. "Rodenticides Other" -> "Rodenticides – Other"). If differences are not small, investigate the issue. + + ```bash + SHOW_WARNING_DETAILS=True etl run garden/faostat/YYYY-MM-DD/faostat_metadata + ``` + !!! note If a new domain has been added to this version, you may need to manually add its meadow step as a dependency of garden/faostat/YYYY-MM-DD/faostat_metadata in the dag (this is a known bug). @@ -281,10 +288,6 @@ If no dataset requires an update, the workflow stops here. etl run garden/faostat/YYYY-MM-DD ``` - !!! note - - Sometimes `garden/faostat/YYYY-MM-DD/faostat_metadata` raises the warning "X item codes in data mapping to different items in metadata.". This used to happen often. 
In the latest version, only once (`faostat_rp`, with 38 discrepant items). It usually means that there are small differences between the item name in FAO data and the item name (for the same item code) in FAO metadata. There usually are small differences, like "Rodenticides Other" and "Rodenticides – Other". But after every update, if there are new (or many) discrepant items, check the content of `compared` inside `create_items_table_for_domain`, in the `faostat_metadata`step. - 7. Create new grapher steps. ```bash diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py index d5405097545..17514fea34c 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py @@ -38,6 +38,7 @@ """ import json +import os import sys from copy import deepcopy from typing import Dict, List, Tuple @@ -64,6 +65,10 @@ from etl.helpers import PathFinder +# If environment variable SHOW_WARNING_DETAILS is set to True, show a detailed warning (e.g. when comparing discrepant +# item names from data and metadata). +SHOW_WARNING_DETAILS = bool(os.getenv("SHOW_WARNING_DETAILS", False)) + def create_dataset_descriptions_table_for_domain(table: Table, dataset_short_name: str) -> Table: """Create a single row table with the dataset name, title and description, for a given domain. @@ -283,12 +288,19 @@ def create_items_table_for_domain(table: Table, metadata: Dataset, dataset_short different_items = compared[compared["fao_item_in_data"] != compared["fao_item_in_metadata"]] missing_item_codes = set(items_from_data["item_code"]) - set(_tb_items["item_code"]) if len(missing_item_codes) > 0: - log.warning(f"{len(missing_item_codes)} item codes in {dataset_short_name} missing in metadata. ") + warning_message = f"{len(missing_item_codes)} item codes of {dataset_short_name} data missing in metadata. 
" + if SHOW_WARNING_DETAILS: + for item_code in missing_item_codes: + warning_message += f"\n* Item {item_code} from data: {items_from_data[items_from_data['item_code'] == item_code]['fao_item'].item()}" + log.warning(warning_message) if len(different_items) > 0: _frac_different = len(different_items) / len(set(compared["fao_item_in_data"])) - log.warning( - f"{len(different_items)} item codes of {dataset_short_name} in data ({_frac_different:.2%}) mapping to different items in metadata." - ) + warning_message = f"{len(different_items)} item codes of {dataset_short_name} in data ({_frac_different:.2%}) mapping to different items in metadata." + if SHOW_WARNING_DETAILS: + for item_code, item_in_data, item_in_metadata in different_items.values: + warning_message += f"\n\n* Item {item_code} from data: {item_in_data}" + warning_message += f"\n* Item {item_code} from metadata: {item_in_metadata}" + log.warning(warning_message) return items_from_data From acd456aaa1c0d51f7dba431c824df49a8468159d Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 20 Mar 2024 14:45:47 +0100 Subject: [PATCH 41/54] Add garden and grapher step of additional variables --- dag/faostat.yml | 15 + .../2024-03-14/additional_variables.meta.yml | 711 +++++++++ .../2024-03-14/additional_variables.py | 1291 +++++++++++++++++ .../2024-03-14/additional_variables.py | 204 +++ 4 files changed, 2221 insertions(+) create mode 100644 etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml create mode 100644 etl/steps/data/garden/faostat/2024-03-14/additional_variables.py create mode 100644 etl/steps/data/grapher/faostat/2024-03-14/additional_variables.py diff --git a/dag/faostat.yml b/dag/faostat.yml index f5c4787c6d7..06e1174721c 100644 --- a/dag/faostat.yml +++ b/dag/faostat.yml @@ -605,3 +605,18 @@ steps: # data://explorers/faostat/latest/food_explorer: - data://garden/faostat/2024-03-14/faostat_food_explorer + # + # FAOSTAT garden step for additional variables + # + 
data://garden/faostat/2024-03-14/additional_variables: + - data://garden/faostat/2024-03-14/faostat_rl + - data://garden/faostat/2024-03-14/faostat_qi + - data://garden/faostat/2024-03-14/faostat_qcl + - data://garden/faostat/2024-03-14/faostat_sdgb + - data://garden/faostat/2024-03-14/faostat_fbsc + - data://garden/faostat/2024-03-14/faostat_rfn + # + # FAOSTAT grapher step for additional variables + # + data://grapher/faostat/2024-03-14/additional_variables: + - data://garden/faostat/2024-03-14/additional_variables diff --git a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml new file mode 100644 index 00000000000..2849223f629 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml @@ -0,0 +1,711 @@ +all_sources: + - faostat: &faostat_source + name: Food and Agriculture Organization of the United Nations + published_by: Food and Agriculture Organization of the United Nations + url: http://www.fao.org/faostat/en/#data/ + date_accessed: "2023-06-12" + publication_date: "2023-06-12" + publication_year: 2023 + +dataset: + title: Additional variables (FAOSTAT, 2023b) + description: | + Additional variables created using data from different FAOSTAT datasets. + sources: + - *faostat_source + +tables: + arable_land_per_crop_output: + variables: + area: + title: 'Arable land' + unit: 'hectares' + short_unit: 'ha' + description: | + Arable land is the total of areas (extent of surface of land or water) under temporary crops, temporary meadows and pastures, and land with temporary fallow. Arable land does not include land that is potentially cultivable but is not normally cultivated. + index: + title: 'Gross Production Index Number' + unit: '' + short_unit: '' + description: | + Gross Production Index Number (2014-2016 = 100). 
+ arable_land_per_crop_output: + title: Arable land needed to produce a fixed quantity of crops + unit: '' + short_unit: '' + description: | + Index of arable land needed to produce a fixed quantity of crops (where values in 1961 are equal to 1.0). This is calculated as arable land divided by the crop production index (PIN). The crop production index here is the sum of crop commodities produced (after deductions of quantities used as seed and feed). It is weighted by the commodity prices. + + This metric measures the index of arable land needed to produce a fixed quantity of crops (where values in 1961 are equal to 1.0). + + Arable land is the total of areas (extent of surface of land or water) under temporary crops, temporary meadows and pastures, and land with temporary fallow. Arable land does not include land that is potentially cultivable but is not normally cultivated. + area_used_per_crop_type: + variables: + area_used_for_production: + title: "Area used for production" + unit: "hectares" + short_unit: "ha" + # Description will be fetched from the original FAOSTAT item and element descriptions. + # description: | + share_of_sustainable_and_overexploited_fish: + variables: + sustainable_fish: + title: "Percentage of fish stocks within biologically sustainable levels" + unit: "%" + short_unit: "%" + description: | + Fish stock are subpopulations of a particular species of fish which have common parameters such as location, growth and mortality which define their population dynamics. Fish stocks are within biologically sustainable levels when fish catch does not exceed the maximum sustainable yield (MSY) - the rate at which fish populations can regenerate. + overexploited_fish: + title : "Percentage of overexploited fish stocks" + unit: "%" + short_unit: "%" + description: | + Fish stock are subpopulations of a particular species of fish which have common parameters such as location, growth and mortality which define their population dynamics. 
Fish stocks are overexploited when fish catch exceeds the maximum sustainable yield (MSY) - the rate at which fish populations can regenerate. + land_spared_by_increased_crop_yields: + variables: + area: + title: "Actual cropland area today" + unit: "hectares" + short_unit: "ha" + description: | + Total cropland area on a given year, calculated by dividing the total production by the crop yield. + area_with_yield_of_1961: + title: "Cropland area needed if yields stagnated in 1961" + unit: "hectares" + short_unit: "ha" + description: | + Total cropland area that would be necessary if crop yields stagnated in 1961. + + This area is calculated by dividing the total production on a given year by the crop yield of 1961. + spared_land: + title: "Land spared due to crop yield increases" + unit: "hectares" + short_unit: "ha" + description: | + Land spared since 1961 due to the increase of crop yields. + + This area is calculated as the cropland area that would be necessary if crop yields stagnated in 1961 (the total production on a given year divided by the crop yield of 1961), minus the true cropland area on a given year. + spared_land__pct: + title: "Percentage reduction in area needed due to crop yield increases" + unit: "hectares" + short_unit: "ha" + description: | + Land spared since 1961 due to the increase of crop yields, as a percentage of the total land that would be necessary if crop yields had not increased since then. + # All metadata for food_available_for_consumption is prepared via script. + # food_available_for_consumption: + macronutrient_compositions: + variables: + energy_from_animal_products: + title: "Daily caloric intake per person from animal products" + unit: "kilocalories per day per capita" + short_unit: "kcal" + description: &macronutrient_composition_variable_description |
To calculate the daily per capita supply of carbohydrates, we assume an energy density by macronutrient of 4 kcal per gram of both protein and carbohydrate and 9 kcal per gram of fat (based on established nutritional guidelines reported by the FAO). The daily supply of carbohydrates is therefore calculated as: + + ((Daily supply of kcal)-(Daily supply of protein * 4 + Daily supply of fat * 9)) / 4 + + The quantity of calories from each macronutrient is then calculated based on the energy density figures given above (e.g. calories from protein is calculated by multiplying the daily supply of protein in grams by 4). + + For an explanation of these conversion factors, see "Chapter 3: Calculation Of The Energy Content Of Foods - Energy Conversion Factors", available at: http://www.fao.org/docrep/006/Y5022E/y5022e04.htm + + The share of calories derived from each macronutrient is then calculated by dividing the number of calories derived from a given macronutrient by the total daily caloric supply. + + Protein of animal origin includes protein supplied in the form of all meat commodities, eggs and dairy products, and fish & seafood. 
+ energy_from_animal_protein: + title: "Daily caloric intake per person that comes from animal protein" + unit: "kilocalories per day per capita" + short_unit: "kcal" + description: *macronutrient_composition_variable_description + energy_from_vegetal_products: + title: "Daily caloric intake per person from vegetal products" + unit: "kilocalories per day per capita" + short_unit: "kcal" + description: *macronutrient_composition_variable_description + energy_from_vegetal_protein: + title: "Daily caloric intake per person that comes from vegetal protein" + unit: "kilocalories per day per capita" + short_unit: "kcal" + description: *macronutrient_composition_variable_description + fat_from_animal_products: + title: "Daily fat intake per person from animal products" + unit: "grams per day per capita" + short_unit: "g" + description: *macronutrient_composition_variable_description + fat_from_vegetal_products: + title: "Daily fat intake per person from vegetal products" + unit: "grams per day per capita" + short_unit: "g" + description: *macronutrient_composition_variable_description + protein_from_animal_products: + title: "Daily protein intake from animal products" + unit: "grams per day per capita" + short_unit: "g" + description: *macronutrient_composition_variable_description + protein_from_vegetal_products: + title: "Daily protein intake per person from vegetal products" + unit: "grams per day per capita" + short_unit: "g" + description: *macronutrient_composition_variable_description + share_of_energy_from_animal_protein: + title: "Share of the daily caloric intake that comes from animal protein" + unit: "%" + short_unit: "%" + description: *macronutrient_composition_variable_description + share_of_energy_from_carbohydrates: + title: "Share of the daily caloric intake that comes from carbohydrates" + unit: "%" + short_unit: "%" + description: *macronutrient_composition_variable_description + share_of_energy_from_fat: + title: "Share of the daily caloric intake 
that comes from fat" + unit: "%" + short_unit: "%" + description: *macronutrient_composition_variable_description + share_of_energy_from_protein: + title: "Share of the daily caloric intake that comes from protein" + unit: "%" + short_unit: "%" + description: *macronutrient_composition_variable_description + share_of_energy_from_vegetal_protein: + title: "Share of the daily caloric intake that comes from vegetal protein" + unit: "%" + short_unit: "%" + description: *macronutrient_composition_variable_description + total_carbohydrates: + title: "Daily carbohydrates intake per person" + unit: "grams per day per capita" + short_unit: "g" + description: *macronutrient_composition_variable_description + total_energy: + title: "Daily caloric intake per person" + unit: "kilocalories per day per capita" + short_unit: "kcal" + description: *macronutrient_composition_variable_description + total_energy_from_carbohydrates: + title: "Daily caloric intake per person from carbohydrates" + unit: "kilocalories per day per capita" + short_unit: "kcal" + description: *macronutrient_composition_variable_description + total_energy_from_fat: + title: "Daily caloric intake per person from fat" + unit: "kilocalories per day per capita" + short_unit: "kcal" + description: *macronutrient_composition_variable_description + total_energy_from_protein: + title: "Daily caloric intake per person from protein" + unit: "kilocalories per day per capita" + short_unit: "kcal" + description: *macronutrient_composition_variable_description + total_fat: + title: "Daily fat intake per person" + unit: "grams per day per capita" + short_unit: "g" + description: *macronutrient_composition_variable_description + total_protein: + title: "Daily protein intake per person" + unit: "grams per day per capita" + short_unit: "g" + description: *macronutrient_composition_variable_description + fertilizers: + variables: + nitrogen_per_cropland: + title: Nitrogen use per area of cropland + unit: kilograms per hectare + 
short_unit: kg/ha + description: | + Nutrient nitrogen (N) from all fertilizer products per area of cropland, which corresponds to the sum of arable land and permanent crops. + phosphate_per_cropland: + title: Phosphate use per area of cropland + unit: kilograms per hectare + short_unit: kg/ha + description: | + Nutrient phosphate (P2O5) from all fertilizer products per area of cropland, which corresponds to the sum of arable land and permanent crops. + potash_per_cropland: + title: Potash use per area of cropland + unit: kilograms per hectare + short_unit: kg/ha + description: | + Nutrient potash (K2O) from all fertilizer products per area of cropland, which corresponds to the sum of arable land and permanent crops. + all_fertilizers_per_cropland: + title: All fertilizers use per area of cropland + unit: kilograms per hectare + short_unit: kg/ha + description: | + Agricultural use of all fertilizer products (including nitrogenous, potash, and phosphate fertilizers) per area of cropland, which corresponds to the sum of arable land and permanent crops. + cropland: + title: Area of cropland + unit: hectares + short_unit: ha + description: + Surface area of cropland, which corresponds to the sum of arable land and permanent crops. + nitrogen_use: + title: Nitrogen use + unit: tonnes + short_unit: t + description: | + Agricultural use of nutrient nitrogen (N) from all fertilizer products. + phosphate_use: + title: Phosphate use + unit: tonnes + short_unit: t + description: | + Agricultural use of nutrient phosphate (P2O5) from all fertilizer products. + potash_use: + title: Potash use + unit: tonnes + short_unit: t + description: | + Agricultural use of nutrient potash (K2O) from all fertilizer products. + all_fertilizers_use: + title: All fertilizers use + unit: tonnes + short_unit: t + description: | + Agricultural use from all fertilizer products (including nitrogenous, potash, and phosphate fertilizers). 
+ vegetable_oil_yields: + variables: + sunflower_production: + title: Production of sunflower oil + unit: tonnes + short_unit: t + description: | + Amount of sunflower oil produced. + soybean_production: + title: Production of soybean oil + unit: tonnes + short_unit: t + description: | + Amount of soybean oil produced. + groundnut_production: + title: Production of groundnut oil + unit: tonnes + short_unit: t + description: | + Amount of groundnut oil produced. + coconut_production: + title: Production of coconut oil + unit: tonnes + short_unit: t + description: | + Amount of coconut oil produced. + olive_production: + title: Production of olive oil + unit: tonnes + short_unit: t + description: | + Amount of olive oil produced. + cottonseed_production: + title: Production of cottonseed oil + unit: tonnes + short_unit: t + description: | + Amount of cottonseed oil produced. + sesame_production: + title: Production of sesame oil + unit: tonnes + short_unit: t + description: | + Amount of sesame oil produced. + rapeseed_production: + title: Production of rapeseed oil + unit: tonnes + short_unit: t + description: | + Amount of rapeseed oil produced. + palm_production: + title: Production of palm oil + unit: tonnes + short_unit: t + description: | + Amount of palm oil produced. Palm oil includes palm kernel oil. + sunflower_area: + title: Area harvested for sunflower crops + unit: hectares + short_unit: ha + description: | + Land area used to harvest sunflower crops. + cottonseed_area: + title: Area harvested for cottonseed crops + unit: hectares + short_unit: ha + description: | + Land area used to harvest cottonseed crops. + soybean_area: + title: Area harvested for soybean crops + unit: hectares + short_unit: ha + description: | + Land area used to harvest soybean crops. + groundnut_area: + title: Area harvested for groundnut crops + unit: hectares + short_unit: ha + description: | + Land area used to harvest groundnut crops. 
+ olive_area: + title: Area harvested for olive crops + unit: hectares + short_unit: ha + description: | + Land area used to harvest olive crops. + rapeseed_area: + title: Area harvested for rapeseed crops + unit: hectares + short_unit: ha + description: | + Land area used to harvest rapeseed crops. + coconut_area: + title: Area harvested for coconut crops + unit: hectares + short_unit: ha + description: | + Land area used to harvest coconut crops. + sesame_area: + title: Area harvested for sesame crops + unit: hectares + short_unit: ha + description: | + Land area used to harvest sesame crops. + palm_area: + title: Area harvested for palm fruit crops + unit: hectares + short_unit: ha + description: | + Land area used to harvest palm fruit crops. Palm oil includes palm kernel oil. + vegetable_oils_production: + title: Global production of vegetable oils + unit: tonnes + short_unit: t + description: | + Amount of vegetable oils produced worldwide. + palm_tonnes_per_hectare: + title: Palm oil yield per crop + unit: tonnes per hectare + short_unit: tonnes/ha + description: | + Average amount of palm oil produced per hectare of palm fruit crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. Palm oil includes palm kernel oil. + palm_hectares_per_tonne: + title: Area of palm fruit crops harvested to produce a tonne of palm oil + unit: hectares per tonne + short_unit: hectares/tonne + description: | + Area of palm fruit crops harvested to produce a tonne of palm oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. Palm oil includes palm kernel oil. 
+ palm_area_to_meet_global_oil_demand: + title: Area needed to meet the global vegetable oil demand with only palm oil + unit: hectares + short_unit: ha + description: | + Amount of land that would need to be devoted to grow palm fruit crops if it was to meet global vegetable oil demand alone. Palm oil includes palm kernel oil. + sunflower_tonnes_per_hectare: + title: Sunflower oil yield per crop + unit: tonnes per hectare + short_unit: tonnes/ha + description: | + Average amount of sunflower oil produced per hectare of sunflower crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. + sunflower_hectares_per_tonne: + title: Area of sunflower crops harvested to produce a tonne of sunflower oil + unit: hectares per tonne + short_unit: hectares/tonne + description: | + Area of sunflower crops harvested to produce a tonne of sunflower oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. + sunflower_area_to_meet_global_oil_demand: + title: Area needed to meet the global vegetable oil demand with only sunflower oil + unit: hectares + short_unit: ha + description: | + Amount of land that would need to be devoted to grow sunflower crops if it was to meet global vegetable oil demand alone. + rapeseed_tonnes_per_hectare: + title: Rapeseed oil yield per crop + unit: tonnes per hectare + short_unit: tonnes/ha + description: | + Average amount of rapeseed oil produced per hectare of rapeseed crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. 
+ rapeseed_hectares_per_tonne: + title: Area of rapeseed crops harvested to produce a tonne of rapeseed oil + unit: hectares per tonne + short_unit: hectares/tonne + description: | + Area of rapeseed crops harvested to produce a tonne of rapeseed oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. + rapeseed_area_to_meet_global_oil_demand: + title: Area needed to meet the global vegetable oil demand with only rapeseed oil + unit: hectares + short_unit: ha + description: | + Amount of land that would need to be devoted to grow rapeseed crops if it was to meet global vegetable oil demand alone. + soybean_tonnes_per_hectare: + title: Soybean oil yield per crop + unit: tonnes per hectare + short_unit: tonnes/ha + description: | + Average amount of soybean oil produced per hectare of soybean crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. + soybean_hectares_per_tonne: + title: Area of soybean crops harvested to produce a tonne of soybean oil + unit: hectares per tonne + short_unit: hectares/tonne + description: | + Area of soybean crops harvested to produce a tonne of soybean oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. + soybean_area_to_meet_global_oil_demand: + title: Area needed to meet the global vegetable oil demand with only soybean oil + unit: hectares + short_unit: ha + description: | + Amount of land that would need to be devoted to grow soybean crops if it was to meet global vegetable oil demand alone. + olive_tonnes_per_hectare: + title: Olive oil yield per crop + unit: tonnes per hectare + short_unit: tonnes/ha + description: | + Average amount of olive oil produced per hectare of olive crops harvested. 
Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. + olive_hectares_per_tonne: + title: Area of olive crops harvested to produce a tonne of olive oil + unit: hectares per tonne + short_unit: hectares/tonne + description: | + Area of olive crops harvested to produce a tonne of olive oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. + olive_area_to_meet_global_oil_demand: + title: Area needed to meet the global vegetable oil demand with only olive oil + unit: hectares + short_unit: ha + description: | + Amount of land that would need to be devoted to grow olive crops if it was to meet global vegetable oil demand alone. + coconut_tonnes_per_hectare: + title: Coconut oil yield per crop + unit: tonnes per hectare + short_unit: tonnes/ha + description: | + Average amount of coconut oil produced per hectare of coconut crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. + coconut_hectares_per_tonne: + title: Area of coconut crops harvested to produce a tonne of coconut oil + unit: hectares per tonne + short_unit: hectares/tonne + description: | + Area of coconut crops harvested to produce a tonne of coconut oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. + coconut_area_to_meet_global_oil_demand: + title: Area needed to meet the global vegetable oil demand with only coconut oil + unit: hectares + short_unit: ha + description: | + Amount of land that would need to be devoted to grow coconut crops if it was to meet global vegetable oil demand alone. 
+ groundnut_tonnes_per_hectare: + title: Groundnut oil yield per crop + unit: tonnes per hectare + short_unit: tonnes/ha + description: | + Average amount of groundnut oil produced per hectare of groundnut crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. + groundnut_hectares_per_tonne: + title: Area of groundnut crops harvested to produce a tonne of groundnut oil + unit: hectares per tonne + short_unit: hectares/tonne + description: | + Area of groundnut crops harvested to produce a tonne of groundnut oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. + groundnut_area_to_meet_global_oil_demand: + title: Area needed to meet the global vegetable oil demand with only groundnut oil + unit: hectares + short_unit: ha + description: | + Amount of land that would need to be devoted to grow groundnut crops if it was to meet global vegetable oil demand alone. + cottonseed_tonnes_per_hectare: + title: Cottonseed oil yield per crop + unit: tonnes per hectare + short_unit: tonnes/ha + description: | + Average amount of cottonseed oil produced per hectare of cottonseed crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. + cottonseed_hectares_per_tonne: + title: Area of cottonseed crops harvested to produce a tonne of cottonseed oil + unit: hectares per tonne + short_unit: hectares/tonne + description: | + Area of cottonseed crops harvested to produce a tonne of cottonseed oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. 
+ cottonseed_area_to_meet_global_oil_demand: + title: Area needed to meet the global vegetable oil demand with only cottonseed oil + unit: hectares + short_unit: ha + description: | + Amount of land that would need to be devoted to grow cottonseed crops if it was to meet global vegetable oil demand alone. + sesame_tonnes_per_hectare: + title: Sesame oil yield per crop + unit: tonnes per hectare + short_unit: tonnes/ha + description: | + Average amount of sesame oil produced per hectare of sesame crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. + sesame_hectares_per_tonne: + title: Area of sesame crops harvested to produce a tonne of sesame oil + unit: hectares per tonne + short_unit: hectares/tonne + description: | + Area of sesame crops harvested to produce a tonne of sesame oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. + sesame_area_to_meet_global_oil_demand: + title: Area needed to meet the global vegetable oil demand with only sesame oil + unit: hectares + short_unit: ha + description: | + Amount of land that would need to be devoted to grow sesame crops if it was to meet global vegetable oil demand alone. + agriculture_land_use_evolution: + variables: + agriculture_area: + title: Area used for agriculture + unit: hectares + short_unit: ha + description: | + Surface area devoted to agriculture on a given year. + agriculture_area_one_decade_back: + title: Area used for agriculture one decade back + unit: hectares + short_unit: ha + description: | + Surface area devoted to agriculture one decade before a given year. For example, for year 2020, this variable gives the extent of agricultural land in 2010. 
+ cropland_area: + title: Area used for croplands + unit: hectares + short_unit: ha + description: | + Surface area devoted to croplands on a given year. + cropland_area_one_decade_back: + title: Area used for croplands one decade back + unit: hectares + short_unit: ha + description: | + Surface area devoted to croplands one decade before a given year. For example, for year 2020, this variable gives the extent of croplands in 2010. + pasture_area: + title: Area used for pastures + unit: hectares + short_unit: ha + description: | + Surface area devoted to pastures on a given year. + pasture_area_one_decade_back: + title: Area used for pastures one decade back + unit: hectares + short_unit: ha + description: | + Surface area devoted to pastures one decade before a given year. For example, for year 2020, this variable gives the extent of pastures in 2010. + year_one_decade_back: + title: Year one decade back + unit: "" + short_unit: "" + description: | + Year one decade before a given year. For example, for year 2020, this variable would be 2010. + agriculture_area_change: + title: Change in agriculture area with respect to one decade back + unit: "%" + short_unit: "%" + description: | + Percentage change in surface area devoted to agriculture with respect to 10 years before. Negative values imply that surface area has decreased with respect to the previous decade. + + This data is used to assess which countries may have already peaked in their agricultural land use. + + Assessing this by looking at annual land use data is difficult because there can be significant year-to-year variability. That land use for one or two years was lower than previous years would be insufficient to conclude that a country had peaked. + + For this reason we look at decadal changes in agricultural land. We look at land use in the latest year relative to 10 years before. + + If land use is lower in the latest year then we suggest that land use may have peaked. 
If land use is the same or higher than a decade back, we suggest that it hasn't, or this is uncertain.
+ + If land use is lower in the latest year then we suggest that land use may have peaked. If land use it the same or higher than a decade back, we suggest that it hasn't, or this is uncertain. + hypothetical_meat_consumption: + variables: + animals_global: + title: Number of slaughtered animals to produce meat worldwide + unit: "animals" + short_unit: "" + animals_global_hypothetical: + title: Hypothetical number of slaughtered animals if everyone ate like the average citizen of a given country + unit: "animals" + short_unit: "" + description: | + Hypothetical number of slaughtered animals worldwide if everyone in the world ate the same quantity as the average citizen of a given country. + + This is a hypothetical variable derived by Our World in Data which answers the question: "How many animals would need to be slaughtered if everyone in the world consumed the average per capita amount of a given country?". For example: "How many animals would need to be slaughtered if everyone in the world consumed the same amount of meat as the average UK citizen?". + + This was derived by multiplying global population by the per capita number of slaughtered animals of a given country. + animals_per_capita: + title: Number of slaughtered animals per person in each country + unit: "animals per person" + short_unit: "" + global_population: + title: World population + unit: "people" + short_unit: "" + production_global: + title: Total amount of meat produced worldwide + unit: "tonnes" + short_unit: "t" + production_global_hypothetical: + title: Hypothetical global meat demand if everyone ate like the average citizen of a given country + unit: "tonnes" + short_unit: "t" + description: | + Hypothetical global meat demand if everyone in the world ate the same quantity as the average citizen of a given country. 
+ + This is a hypothetical variable derived by Our World in Data which answers the question: "What would global meat production have to be if everyone in the world consumed the average per capita amount of a given country?". For example: "How much meat would we need to produce if everyone in the world consumed the same amount of meat as the average UK citizen?". + + This was derived by multiplying global population by per capita meat supply of a given country. + production_per_capita: + title: Per-capita production of meat in each country + unit: "tonnes per person" + short_unit: "t/person" + cereal_allocation: + variables: + cereals_allocated_to_animal_feed: + title: Cereals allocated to animal feed + unit: tonnes + short_unit: t + description: | + Quantity of cereal crops allocated to animal feed (and not human food or other uses, such as biofuel production). + cereals_allocated_to_food: + title: Cereals allocated to human food + unit: tonnes + short_unit: t + description: | + Quantity of cereal crops allocated to human food (and not animal feed or other uses, such as biofuel production). + cereals_allocated_to_other_uses: + title: Cereals allocated to other uses + unit: tonnes + short_unit: t + description: | + Quantity of cereal crops allocated to other uses (and not to human food or animal feed), predominantly industrial uses such as biofuel production. + share_of_cereals_allocated_to_animal_feed: + title: Share of cereals that are allocated to animal feed + unit: "%" + short_unit: "%" + description: | + This is calculated by dividing the amount of cereals allocated to animal feed by the sum of all cereal uses considered (namely human food, animal feed, and other uses such us biofuel production). This corresponds to cereals available domestically (after trade) excluding supply chain losses and seed resown from the crop. 
+ share_of_cereals_allocated_to_food: + title: Share of cereals that are allocated to human food + unit: "%" + short_unit: "%" + description: | + This is calculated by dividing the amount of cereals allocated to human food by the sum of all cereal uses considered (namely human food, animal feed, and other uses such us biofuel production). This corresponds to cereals available domestically (after trade) excluding supply chain losses and seed resown from the crop. + share_of_cereals_allocated_to_other_uses: + title: Share of cereals that are allocated to other uses such as biofuel production + unit: "%" + short_unit: "%" + description: | + This is calculated by dividing the amount of cereals allocated to other uses (predominantly industrial uses such as biofuel production) by the sum of all cereal uses considered (namely human food, animal feed, and other uses). This corresponds to cereals available domestically (after trade) excluding supply chain losses and seed resown from the crop. + # All metadata for maize_and_wheat and fertilizer_exports is prepared via script. + # maize_and_wheat: + # fertilizer_exports: diff --git a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py new file mode 100644 index 00000000000..1c7d75796ec --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py @@ -0,0 +1,1291 @@ +"""Dataset that combines different variables of other FAOSTAT datasets. + +""" + +import numpy as np +import pandas as pd +from owid.catalog import Table +from owid.catalog.utils import underscore +from owid.datautils.dataframes import multi_merge +from shared import NAMESPACE + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +def generate_arable_land_per_crop_output(df_rl: pd.DataFrame, df_qi: pd.DataFrame) -> Table: + # Item code for item "Arable land" of faostat_rl dataset. + ITEM_CODE_FOR_ARABLE_LAND = "00006621" + # Element code for element "Area" of faostat_rl dataset. + ELEMENT_CODE_FOR_AREA = "005110" + # Item code for item "Crops" of faostat_qi dataset. + ITEM_CODE_FOR_CROPS = "00002041" + # Element code for "Gross Production Index Number (2014-2016 = 100)" of faostat_qi dataset. + ELEMENT_CODE_PRODUCTION_INDEX = "000432" + # Reference year for production index (values of area/index will be divided by the value on this year). + PRODUCTION_INDEX_REFERENCE_YEAR = 1961 + + # Select the necessary item and element of the land use dataset. + df_rl = df_rl[ + (df_rl["item_code"] == ITEM_CODE_FOR_ARABLE_LAND) & (df_rl["element_code"] == ELEMENT_CODE_FOR_AREA) + ].reset_index(drop=True) + # Sanity check. + error = "Unit for element 'Area' in faostat_rl has changed." + assert list(df_rl["unit"].unique()) == ["hectares"], error + # Rename columns and select only necessary columns. + df_rl = df_rl[["country", "year", "value"]].rename(columns={"value": "area"}).reset_index(drop=True) + + # Select the necessary item and element of the production index dataset. + df_qi = df_qi[ + (df_qi["element_code"] == ELEMENT_CODE_PRODUCTION_INDEX) & (df_qi["item_code"] == ITEM_CODE_FOR_CROPS) + ].reset_index(drop=True) + # Sanity check. + error = "Unit for element 'Gross Production Index Number (2014-2016 = 100)' in faostat_qi has changed." + assert list(df_qi["unit"].unique()) == [""], error + # Rename columns and select only necessary columns. + df_qi = df_qi[["country", "year", "value"]].rename(columns={"value": "index"}) + + # Combine both dataframes. + combined = pd.merge(df_rl, df_qi, on=["country", "year"], how="inner", validate="one_to_one") + + # Create the new variable of arable land per crop output. 
+ combined["value"] = combined["area"] / combined["index"] + + # Add a column of a reference value for each country, and normalize data by dividing by the reference value. + reference = combined[combined["year"] == PRODUCTION_INDEX_REFERENCE_YEAR][["country", "value"]].reset_index( + drop=True + ) + combined = pd.merge( + combined, reference[["country", "value"]], on=["country"], how="left", suffixes=("", "_reference") + ) + combined["value"] /= combined["value_reference"] + + # Remove all countries for which we did not have data for the reference year. + combined = combined.dropna(subset="value").reset_index(drop=True) + + # Remove unnecessary columns and rename conveniently. + combined = combined.drop(columns=["value_reference"]).rename(columns={"value": "arable_land_per_crop_output"}) + + # Set an appropriate index and sort conveniently. + tb_combined = Table( + combined.set_index(["country", "year"], verify_integrity=True).sort_index(), + short_name="arable_land_per_crop_output", + ) + + return tb_combined + + +def generate_area_used_for_production_per_crop_type(df_qcl: pd.DataFrame) -> Table: + # Element code for "Area harvested" of faostat_qcl dataset. + ELEMENT_CODE_FOR_AREA_HARVESTED = "005312" + + # List of items belonging to item group "Coarse Grain, Total", according to + # https://www.fao.org/faostat/en/#definitions + ITEM_CODES_COARSE_GRAINS = [ + "00000044", # Barley + "00000089", # Buckwheat + "00000101", # Canary seed + "00000108", # Cereals n.e.c. + "00000108", # Cereals nes + "00000094", # Fonio + "00000103", # Grain, mixed + "00000056", # Maize + "00000056", # Maize (corn) + "00000079", # Millet + "00000103", # Mixed grain + "00000075", # Oats + "00000092", # Quinoa + "00000071", # Rye + "00000083", # Sorghum + "00000097", # Triticale + ] + + # Item codes for croup groups from faostat_qcl. 
+ ITEM_CODES_OF_CROP_GROUPS = [ + "00001717", # Cereals + "00001804", # Citrus Fruit + "00001738", # Fruit + "00000780", # Jute + "00001732", # Oilcrops, Oil Equivalent + "00001726", # Pulses + "00001720", # Roots and tubers + "00001729", # Treenuts + "00001735", # Vegetables + "00001814", # Coarse Grain + ] + + error = "Not all expected item codes were found in QCL." + assert set(ITEM_CODES_COARSE_GRAINS) < set(df_qcl["item_code"]), error + + # Select the world and the element code for area harvested. + area_by_crop_type = df_qcl[ + (df_qcl["country"] == "World") & (df_qcl["element_code"] == ELEMENT_CODE_FOR_AREA_HARVESTED) + ].reset_index(drop=True) + error = "Unit for element 'Area harvested' in faostat_qcl has changed." + assert list(area_by_crop_type["unit"].unique()) == ["hectares"], error + + # Add items for item group "Coarse Grain, Total". + coarse_grains = ( + area_by_crop_type[(area_by_crop_type["item_code"].isin(ITEM_CODES_COARSE_GRAINS))] + .groupby("year", as_index=False) + .agg({"value": "sum"}) + .assign(**{"item": "Coarse Grain", "item_code": "00001814"}) + ) + area_by_crop_type = pd.concat( + [area_by_crop_type[~area_by_crop_type["item_code"].isin(ITEM_CODES_COARSE_GRAINS)], coarse_grains], + ignore_index=True, + ) + + area_by_crop_type = area_by_crop_type[area_by_crop_type["item_code"].isin(ITEM_CODES_OF_CROP_GROUPS)].reset_index( + drop=True + ) + + # Prepare variable description. 
+ descriptions = "Definitions by FAOSTAT:" + for item in sorted(set(area_by_crop_type["item"])): + descriptions += f"\n\nItem: {item}" + item_description = area_by_crop_type[area_by_crop_type["item"] == item]["item_description"].fillna("").iloc[0] + if len(item_description) > 0: + descriptions += f"\nDescription: {item_description}" + + descriptions += f"\n\nMetric: {area_by_crop_type['element'].iloc[0]}" + descriptions += f"\nDescription: {area_by_crop_type['element_description'].iloc[0]}" + + # Create a table with the necessary columns, set an appropriate index, and sort conveniently. + tb_area_by_crop_type = Table( + area_by_crop_type[["item", "year", "value"]] + .rename(columns={"value": "area_used_for_production"}) + .set_index(["item", "year"], verify_integrity=True) + .sort_index(), + short_name="area_used_per_crop_type", + ) + + # Add a table description. + tb_area_by_crop_type["area_used_for_production"].metadata.description = descriptions + + return tb_area_by_crop_type + + +def generate_percentage_of_sustainable_and_overexploited_fish(df_sdgb: pd.DataFrame) -> Table: + # "14.4.1 Proportion of fish stocks within biologically sustainable levels" + ITEM_CODE_SUSTAINABLE_FISH = "000000000024029" + + # Select the necessary item. + df_sdgb = df_sdgb[df_sdgb["item_code"] == ITEM_CODE_SUSTAINABLE_FISH].reset_index(drop=True) + error = "Unit for fish data has changed." + assert list(df_sdgb["unit"].unique()) == ["Percent"], error + error = "Element for fish data has changed." + assert list(df_sdgb["element"].unique()) == ["Value"], error + + # Select necessary columns (item and element descriptions are empty in the current version). + df_sdgb = df_sdgb[["country", "year", "value"]].rename(columns={"value": "sustainable_fish"}) + + error = "Percentage of sustainable fish larger than 100%." + assert (df_sdgb["sustainable_fish"] <= 100).all(), error + + # Add column of percentage of overexploited fish. 
+ df_sdgb["overexploited_fish"] = 100 - df_sdgb["sustainable_fish"] + + # Create a table with the necessary columns, set an appropriate index, and sort conveniently. + tb_fish = ( + Table(df_sdgb, short_name="share_of_sustainable_and_overexploited_fish") + .set_index(["country", "year"], verify_integrity=True) + .sort_index() + ) + + return tb_fish + + +def generate_spared_land_from_increased_yields(df_qcl: pd.DataFrame) -> Table: + # Reference year (to see how much land we spare from increased yields). + REFERENCE_YEAR = 1961 + # Element code for "Yield" of faostat_qcl dataset. + ELEMENT_CODE_FOR_YIELD = "005419" + # Element code for "Production" of faostat_qcl dataset. + ELEMENT_CODE_FOR_PRODUCTION = "005510" + + # Item codes for crop groups from faostat_qcl. + ITEM_CODES_OF_CROP_GROUPS = [ + "00001717", # Cereals + "00001738", # Fruit + "00001726", # Pulses + "00001720", # Roots and tubers + "00001735", # Vegetables + "00001723", # Sugar Crops + "00001729", # Treenuts + # Data for fibre crops has changed significantly since last version, and is also significantly smaller than + # other crop groups, so we omit it. + # "00000821", # Fibre crops. + ] + + # Select necessary items and elements. + spared_land = df_qcl[ + (df_qcl["item_code"].isin(ITEM_CODES_OF_CROP_GROUPS)) + & (df_qcl["element_code"].isin([ELEMENT_CODE_FOR_PRODUCTION, ELEMENT_CODE_FOR_YIELD])) + ].reset_index(drop=True) + + # Sanity check. + error = "Units for production and yield have changed." + assert set(spared_land["unit"]) == set(["tonnes per hectare", "tonnes"]), error + + # Transpose dataframe. + spared_land = spared_land.pivot( + index=["country", "year", "item"], columns=["element"], values="value" + ).reset_index() + + # Fix spurious index name after pivotting. + spared_land.columns = list(spared_land.columns) + + # Add columns for production and yield for a given reference year. 
+ reference_values = spared_land[spared_land["year"] == REFERENCE_YEAR].drop(columns=["year"]) + spared_land = pd.merge( + spared_land, reference_values, on=["country", "item"], how="left", suffixes=("", f" in {REFERENCE_YEAR}") + ) + + # Drop countries for which we did not have data in the reference year. + spared_land = spared_land.dropna().reset_index(drop=True) + + # Calculate area harvested that would be required given current production, with the yield of the reference year. + spared_land[f"Area with yield of {REFERENCE_YEAR}"] = ( + spared_land["Production"] / spared_land[f"Yield in {REFERENCE_YEAR}"] + ) + # Calculate the real area harvested (given the current production and yield). + spared_land["Area"] = spared_land["Production"] / spared_land["Yield"] + + # Keep only required columns + spared_land = spared_land[["country", "year", "item", "Area", f"Area with yield of {REFERENCE_YEAR}"]].reset_index( + drop=True + ) + + # Add total area for all crops. + all_crops = ( + spared_land.groupby(["country", "year"], as_index=False, observed=True) + .agg({"Area": sum, f"Area with yield of {REFERENCE_YEAR}": sum}) + .assign(**{"item": "All crops"}) + ) + spared_land = pd.concat([spared_land, all_crops], ignore_index=True) + + # Calculate the spared land in total value, and as a percentage of land we would have used with no yield increase. + spared_land["Spared land"] = spared_land[f"Area with yield of {REFERENCE_YEAR}"] - spared_land["Area"] + spared_land["Spared land (%)"] = ( + 100 * spared_land["Spared land"] / spared_land[f"Area with yield of {REFERENCE_YEAR}"] + ) + + # Create a table with the necessary columns, set an appropriate index, and sort conveniently. 
+ tb_spared_land = Table( + spared_land.set_index(["country", "year", "item"], verify_integrity=True).sort_index(), + short_name="land_spared_by_increased_crop_yields", + underscore=True, + ) + + return tb_spared_land + + +def generate_food_available_for_consumption(df_fbsc: pd.DataFrame) -> Table: + # Element code for "Food available for consumption" of faostat_fbsc (in kilocalories per day per capita). + ELEMENT_CODE_FOR_PER_CAPITA_FOOD = "0664pc" + # Expected unit. + CONSUMPTION_UNIT = "kilocalories per day per capita" + + # Select relevant metric. + df_fbsc = df_fbsc[(df_fbsc["element_code"] == ELEMENT_CODE_FOR_PER_CAPITA_FOOD)].reset_index(drop=True) + + # Sanity check. + error = "Units for food available for consumption have changed." + assert list(df_fbsc["unit"].unique()) == [CONSUMPTION_UNIT], error + + # List of food groups created by OWID. + # Each food group contains one or more "item groups", defined by FAOSTAT. + # Each item group contains one or more "item", defined by FAOSTAT. + # The complete list of items coincides exactly with the complete list of items of FAOSTAT item group "Grand Total" + # (with item group code 2901). + # So all existing food items in FBSC are contained here, and there are no repetitions. + # Notes: + # * There are a few item groups that are not included here, namely "Vegetal Products" (item group code 2903), + # and "Animal Products" (item group code 2941). But their items are contained in other item groups, so including + # them would cause unnecessary repetition of items. 
+ # * To check for the components of an individual item group: + # from etl.paths import DATA_DIR + # metadata = Dataset(DATA_DIR / "meadow/faostat/2023-02-22/faostat_metadata") + # item_groups = metadata["faostat_fbs_item_group"] + # set(item_groups.loc[2941]["item"]) + FOOD_GROUPS = { + "Cereals and grains": [ + "00002905", # Cereals, Excluding Beer + # Item group contains: + # 'Barley and products', + # 'Cereals, Other', + # 'Maize and products', + # 'Millet and products', + # 'Oats', + # 'Rice and products', + # 'Rye and products', + # 'Sorghum and products', + # 'Wheat and products', + ], + "Pulses": [ + "00002911", # Pulses + # Item group contains: + # 'Beans', + # 'Peas', + # 'Pulses, Other and products', + ], + "Starchy roots": [ + "00002907", # Starchy Roots + # Item group contains: + # 'Cassava and products', + # 'Potatoes and products', + # 'Roots, Other', + # 'Sweet potatoes', + # 'Yams', + ], + "Fruits and vegetables": [ + "00002919", # Fruits - Excluding Wine + # Item group contains: + # 'Apples and products', + # 'Bananas', + # 'Citrus, Other', + # 'Dates', + # 'Fruits, other', + # 'Grapefruit and products', + # 'Grapes and products (excl wine)', + # 'Lemons, Limes and products', + # 'Oranges, Mandarines', + # 'Pineapples and products', + # 'Plantains', + "00002918", # Vegetables + # Item group contains: + # 'Onions', + # 'Tomatoes and products', + # 'Vegetables, other', + ], + "Oils and fats": [ + "00002914", # Vegetable Oils + # Item group contains: + # 'Coconut Oil', + # 'Cottonseed Oil', + # 'Groundnut Oil', + # 'Maize Germ Oil', + # 'Oilcrops Oil, Other', + # 'Olive Oil', + # 'Palm Oil', + # 'Palmkernel Oil', + # 'Rape and Mustard Oil', + # 'Ricebran Oil', + # 'Sesameseed Oil', + # 'Soyabean Oil', + # 'Sunflowerseed Oil' + "00002946", # Animal fats group + # Item group contains: + # 'Butter, Ghee', + # 'Cream', + # 'Fats, Animals, Raw', + # 'Fish, Body Oil', + # 'Fish, Liver Oil' + "00002913", # Oilcrops + # Item group contains: + # 'Coconuts - 
Incl Copra', + # 'Cottonseed', + # 'Groundnuts', + # 'Oilcrops, Other', + # 'Olives (including preserved)', + # 'Palm kernels', + # 'Rape and Mustardseed', + # 'Sesame seed', + # 'Soyabeans', + # 'Sunflower seed' + "00002912", # Treenuts + # Item group contains: + # 'Nuts and products', + ], + "Sugar": [ + "00002909", # Sugar & Sweeteners + # Item group contains: + # 'Honey', + # 'Sugar (Raw Equivalent)', + # 'Sugar non-centrifugal', + # 'Sweeteners, Other', + "00002908", # Sugar crops + # Item group contains: + # 'Sugar beet', + # 'Sugar cane', + ], + "Meat": [ + "00002960", # Fish and seafood + # Item group contains: + # 'Aquatic Animals, Others', + # 'Cephalopods', + # 'Crustaceans', + # 'Demersal Fish', + # 'Freshwater Fish', + # 'Marine Fish, Other', + # 'Molluscs, Other', + # 'Pelagic Fish', + "00002943", # Meat, total + # Item group contains: + # 'Bovine Meat', + # 'Meat, Other', + # 'Mutton & Goat Meat', + # 'Pigmeat', + # 'Poultry Meat', + ], + "Dairy and eggs": [ + "00002948", # Milk - Excluding Butter + # Item group contains: + # 'Milk - Excluding Butter', + "00002949", # Eggs + # Item group contains: + # 'Eggs', + ], + "Alcoholic beverages": [ + "00002924", # Alcoholic Beverages + # Item group contains: + # 'Alcohol, Non-Food', + # 'Beer', + # 'Beverages, Alcoholic', + # 'Beverages, Fermented', + # 'Wine', + ], + "Other": [ + "00002928", # Miscellaneous + # Item group contains: + # 'Infant food', + # 'Miscellaneous', + "00002923", # Spices + # Item group contains: + # 'Cloves', + # 'Pepper', + # 'Pimento', + # 'Spices, Other', + "00002922", # Stimulants + # Item group contains: + # 'Cocoa Beans and products', + # 'Coffee and products', + # 'Tea (including mate)', + "00002945", # Offals + # Item group contains: + # 'Offals, Edible', + "00002961", # Aquatic Products, Other + # 'Aquatic Plants', + # 'Meat, Aquatic Mammals', + ], + } + + # Sanity check. + error = "Not all expected item codes are found in the data." 
+ assert set([item_code for group in FOOD_GROUPS.values() for item_code in group]) <= set(df_fbsc["item_code"]), error + + # Create a list of dataframes, one for each food group. + dfs = [ + df_fbsc[df_fbsc["item_code"].isin(FOOD_GROUPS[group])] + .groupby(["country", "year"], as_index=False, observed=True) + .agg({"value": "sum"}) + .rename(columns={"value": group}) + for group in FOOD_GROUPS + ] + combined = multi_merge(dfs=dfs, on=["country", "year"], how="outer") + + # Create a table, set an appropriate index, and sort conveniently. + tb_food_available_for_consumption = Table( + combined.set_index(["country", "year"], verify_integrity=True).sort_index(), + short_name="food_available_for_consumption", + underscore=True, + ) + + # Prepare variable metadata. + common_description = ( + "Data represents the average daily per capita supply of calories from the full range of " + "commodities, grouped by food categories. Note that these figures do not correct for waste at the " + "household/consumption level so may not directly reflect the quantity of food finally consumed by a given " + "individual.\n\nSpecific food commodities have been grouped into higher-level categories." + ) + for group in FOOD_GROUPS: + item_names = list(df_fbsc[df_fbsc["item_code"].isin(FOOD_GROUPS[group])]["item"].unique()) + description = ( + common_description + + f" Food group '{group}' includes the FAO item groups: '" + + "', '".join(item_names) + + "'." 
+ ) + tb_food_available_for_consumption[ + underscore(group) + ].metadata.title = f"Daily caloric intake per person from {group.lower().replace('other', 'other commodities')}" + tb_food_available_for_consumption[underscore(group)].metadata.unit = CONSUMPTION_UNIT + tb_food_available_for_consumption[underscore(group)].metadata.short_unit = "kcal" + tb_food_available_for_consumption[underscore(group)].metadata.description = description + + return tb_food_available_for_consumption + + +def generate_macronutrient_compositions(df_fbsc: pd.DataFrame) -> Table: + # Item code for "Total" of faostat_fbsc. + ITEM_CODE_ALL_PRODUCTS = "00002901" + # Item code for "Vegetal Products" of faostat_fbsc. + ITEM_CODE_VEGETAL_PRODUCTS = "00002903" + # Item code for "Animal Products" of faostat_fbsc. + ITEM_CODE_ANIMAL_PRODUCTS = "00002941" + + # Element code for "Food available for consumption" of faostat_fbsc (in kilocalories per day per capita). + ELEMENT_CODE_FOR_ENERGY_PER_DAY = "0664pc" + # Element code for "Food available for consumption" of faostat_fbsc (in grams of protein per day per capita). + ELEMENT_CODE_FOR_PROTEIN_PER_DAY = "0674pc" + # Element code for "Food available for consumption" of faostat_fbsc (in grams of fat per day per capita). + ELEMENT_CODE_FOR_FAT_PER_DAY = "0684pc" + + # Assumed energy density by macronutrient, in kilocalories per gram of fat, protein or carbohydrates. + KCAL_PER_GRAM_OF_FAT = 9 + KCAL_PER_GRAM_OF_PROTEIN = 4 + KCAL_PER_GRAM_OF_CARBOHYDRATES = 4 + + # Select relevant items and elements. + df = df_fbsc[ + (df_fbsc["item_code"].isin([ITEM_CODE_ALL_PRODUCTS, ITEM_CODE_ANIMAL_PRODUCTS, ITEM_CODE_VEGETAL_PRODUCTS])) + & ( + df_fbsc["element_code"].isin( + [ELEMENT_CODE_FOR_ENERGY_PER_DAY, ELEMENT_CODE_FOR_PROTEIN_PER_DAY, ELEMENT_CODE_FOR_FAT_PER_DAY] + ) + ) + ].reset_index(drop=True) + + # Sanity check. + error = "One or more of the units of food available for consumption has changed." 
+ assert list(df["unit"].unique()) == [ + "kilocalories per day per capita", + "grams of protein per day per capita", + "grams of fat per day per capita", + ], error + + # Food contents and element code for the metric of their consumption per day per capita. + food_contents = { + "energy": ELEMENT_CODE_FOR_ENERGY_PER_DAY, + "fat": ELEMENT_CODE_FOR_FAT_PER_DAY, + "protein": ELEMENT_CODE_FOR_PROTEIN_PER_DAY, + } + + # Initialize a list of dataframes, one for each food content (energy, fat or protein). + dfs = [] + for content in food_contents: + # Create a dataframe for each food content, and add it to the list. + df_content = df[df["element_code"] == food_contents[content]].pivot( + index=["country", "year"], columns=["item"], values=["value"] + ) # .reset_index() + df_content.columns = df_content.columns.droplevel(0) + df_content = df_content.reset_index().rename( + columns={ + "Total": f"Total {content}", + "Vegetal Products": f"{content.capitalize()} from vegetal products", + "Animal Products": f"{content.capitalize()} from animal products", + } + ) + dfs.append(df_content) + + # Sanity check. + error = f"The sum of animal and vegetable {content} does not add up to the total." + assert ( + 100 + * abs( + df_content[f"{content.capitalize()} from animal products"] + + df_content[f"{content.capitalize()} from vegetal products"] + - df_content[f"Total {content}"] + ) + / df_content[f"Total {content}"] + < 1 + ).all(), error + + # Combine all dataframes. + combined = multi_merge(dfs=dfs, on=["country", "year"], how="outer") + + # Daily caloric intake from fat, per person. + combined["Total energy from fat"] = combined["Total fat"] * KCAL_PER_GRAM_OF_FAT + # Daily caloric intake from protein, per person. + combined["Total energy from protein"] = combined["Total protein"] * KCAL_PER_GRAM_OF_PROTEIN + # Daily caloric intake from carbohydrates (assumed to be the rest of the daily caloric intake), per person. 
+ # This is the difference between the total caloric intake minus the caloric intake from protein and fat. + combined["Total energy from carbohydrates"] = ( + combined["Total energy"] - combined["Total energy from fat"] - combined["Total energy from protein"] + ) + + # Daily intake of carbohydrates per person. + combined["Total carbohydrates"] = combined["Total energy from carbohydrates"] / KCAL_PER_GRAM_OF_CARBOHYDRATES + + # Caloric intake from fat as a percentage of the total daily caloric intake. + combined["Share of energy from fat"] = 100 * combined["Total energy from fat"] / combined["Total energy"] + # Caloric intake from protein as a percentage of the total daily caloric intake. + combined["Share of energy from protein"] = 100 * combined["Total energy from protein"] / combined["Total energy"] + # Caloric intake from carbohydrates as a percentage of the total daily caloric intake. + combined["Share of energy from carbohydrates"] = ( + 100 * combined["Total energy from carbohydrates"] / combined["Total energy"] + ) + + # Daily caloric intake from animal protein. + combined["Energy from animal protein"] = combined["Protein from animal products"] * KCAL_PER_GRAM_OF_PROTEIN + # Caloric intake from animal protein as a percentage of the total daily caloric intake. + combined["Share of energy from animal protein"] = ( + 100 * combined["Energy from animal protein"] / combined["Total energy"] + ) + # Daily caloric intake from vegetal protein. + combined["Energy from vegetal protein"] = combined["Protein from vegetal products"] * KCAL_PER_GRAM_OF_PROTEIN + # Caloric intake from vegetal protein as a percentage of the total daily caloric intake. + combined["Share of energy from vegetal protein"] = ( + 100 * combined["Energy from vegetal protein"] / combined["Total energy"] + ) + + # Create a table, set an appropriate index, and sort conveniently. 
+ tb_combined = Table( + combined.set_index(["country", "year"], verify_integrity=True).sort_index(), + short_name="macronutrient_compositions", + underscore=True, + ) + + return tb_combined + + +def generate_fertilizers(df_rfn: pd.DataFrame, df_rl: pd.DataFrame) -> Table: + # Item code for "Cropland" (which includes arable land and permanent crops). + ITEM_CODE_FOR_CROPLAND = "00006620" + + # Element code for element "Area" of faostat_rl dataset. + ELEMENT_CODE_FOR_AREA = "005110" + + # Item codes for fertilizers in faostat_rfn (namely nitrogen, phosphate and potash). + ITEM_CODES_FOR_FERTILIZERS = ["00003102", "00003103", "00003104"] + + # Element code for use per area of cropland. + ELEMENT_CODE_FOR_USE_PER_AREA = "005159" + + # Convert units from kilograms to tonnes. + KG_TO_TONNES = 1e-3 + + # Select necessary element (use per area). + fertilizers = df_rfn[(df_rfn["element_code"] == ELEMENT_CODE_FOR_USE_PER_AREA)].reset_index(drop=True) + + # Sanity checks. + error = "Unit for use per area has changed." + assert list(fertilizers["unit"].unique()) == ["Kilograms per hectare"], error + + error = "Unexpected list of item codes for fertilizers (maybe another was added to faostat_rfn)." + assert set(fertilizers["item_code"]) == set(ITEM_CODES_FOR_FERTILIZERS), error + + # Transpose fertilizers data. + fertilizers = fertilizers.pivot(index=["country", "year"], columns=["item"], values=["value"]) + + # Fix spurious index names after pivoting, and rename columns conveniently. + fertilizers.columns = [column[1] for column in fertilizers.columns] + + fertilizers = fertilizers.rename( + columns={ + "Nutrient nitrogen N (total)": "nitrogen_per_cropland", + "Nutrient phosphate P2O5 (total)": "phosphate_per_cropland", + "Nutrient potash K2O (total)": "potash_per_cropland", + }, + errors="raise", + ) + + # Add column for total fertilizers per area cropland. 
+ fertilizers["all_fertilizers_per_cropland"] = fertilizers[ + ["nitrogen_per_cropland", "phosphate_per_cropland", "potash_per_cropland"] + ].sum(axis=1) + + # To get total agricultural use of fertilizers, we need cropland area. + area = df_rl[ + (df_rl["element_code"] == ELEMENT_CODE_FOR_AREA) & (df_rl["item_code"] == ITEM_CODE_FOR_CROPLAND) + ].reset_index(drop=True) + + # Sanity check. + error = "Unit for area has changed." + assert list(area["unit"].unique()) == ["hectares"], error + + # Transpose area data. + area = area.pivot(index=["country", "year"], columns=["item"], values=["value"]).reset_index() + area.columns = ["country", "year", "cropland"] + + # Combine fertilizers and area. + combined = pd.merge(fertilizers, area, on=["country", "year"], how="outer", validate="one_to_one") + + # Add variables for total fertilizer use. + for fertilizer in ["nitrogen", "phosphate", "potash", "all_fertilizers"]: + combined[f"{fertilizer}_use"] = combined[f"{fertilizer}_per_cropland"] * combined["cropland"] * KG_TO_TONNES + + # Create a table, set an appropriate index, and sort conveniently. + tb_fertilizers = Table( + combined.set_index(["country", "year"], verify_integrity=True).sort_index(), + short_name="fertilizers", + underscore=True, + ) + + return tb_fertilizers + + +def generate_vegetable_oil_yields(df_qcl: pd.DataFrame, df_fbsc: pd.DataFrame) -> Table: + # Element code for "Production" in faostat_qcl. + ELEMENT_CODE_FOR_PRODUCTION_QCL = "005510" + # Element code for "Production" in faostat_fbsc. + ELEMENT_CODE_FOR_PRODUCTION_FBSC = "005511" + # Unit for "Production". + UNIT_FOR_PRODUCTION = "tonnes" + # Element code for "Area harvested". + ELEMENT_CODE_FOR_AREA = "005312" + # Unit for "Area harvested". + UNIT_FOR_AREA = "hectares" + # Item code for "Vegetable Oils" (required to get the global production of vegetable oils on a given year). 
+ ITEM_CODE_FOR_VEGETABLE_OILS_TOTAL = "00002914" + # Item codes in faostat_qcl for the area of the crops (we don't need the production of the crops). + ITEM_CODE_FOR_EACH_CROP_AREA = { + # The item "Palm fruit oil" refers to the fruit that contains both the pulp (that leads to palm oil) + # as well as the kernel (that leads to palm kernel oil). + "palm": "00000254", # Palm fruit oil + "sunflower": "00000267", # Sunflower seed + "rapeseed": "00000270", # Rapeseed + "soybean": "00000236", # Soybeans + "olive": "00000260", # Olives + "coconut": "00000249", # Coconuts + "groundnut": "00000242", # Groundnuts + "cottonseed": "00000328", # Seed cotton + "sesame": "00000289", # Sesame seed + # Item "Maize" has the description "[...] This class includes: - maize harvested for their dry grains only" + # So it's not clear whether it includes area used for maize oil, and therefore I won't consider it. + # "maize": "00000056", # Maize + # Other vegetable oils not considered. + # "safflower": "00000280", # Safflower seed + # "linseed": "00000333", # Linseed + } + # Item codes in faostat_qcl for the production of the oils (there is no area harvested data for oils). + ITEM_CODE_FOR_EACH_CROP_PRODUCTION = { + # The item "Palm oil" doesn't have a description, but it probably refers to only the oil from the pulp of the + # palm fruit (therefore it does not include the kernel). + "palm": "00000257", # Palm oil + # The item "Palm kernel oil" clearly refers to only the oil produced from the kernel of the palm fruit. + # Therefore, "Palm oil" and "Palm kernel oil" will need to be combined to account for all oils produced from + # the palm fruit (item "Palm fruit oil" for which we have the area harvested). 
+ "palm_kernel": "00000258", # Palm kernel oil + "sunflower": "00000268", # Sunflower oil + "rapeseed": "00000271", # Rapeseed oil + "soybean": "00000237", # Soybean oil + "olive": "00000261", # Olive oil + "coconut": "00000252", # Coconut oil + "groundnut": "00000244", # Groundnut oil + "cottonseed": "00000331", # Cottonseed oil + "sesame": "00000290", # Sesame oil + # Item "maize" is not included (see comment above). + # "maize": "00000060", # Maize oil + # Other vegetable oils not considered. + # "safflower": "00000281", # Safflower oil + # "linseed": "00000334", # Linseed oil + } + + # Extract the total production of vegetable oil. This is given in fbsc but not qcl. + total_production = df_fbsc[ + (df_fbsc["country"] == "World") + & (df_fbsc["item_code"] == ITEM_CODE_FOR_VEGETABLE_OILS_TOTAL) + & (df_fbsc["element_code"] == ELEMENT_CODE_FOR_PRODUCTION_FBSC) + & (df_fbsc["unit"] == UNIT_FOR_PRODUCTION) + ].reset_index(drop=True) + + # Transpose data. + total_production = total_production.pivot( + index=["country", "year"], columns=["item_code"], values=["value"] + ).rename(columns={ITEM_CODE_FOR_VEGETABLE_OILS_TOTAL: "vegetable_oils_production"}) + + # Fix column names after pivoting. + total_production.columns = [column[1] for column in total_production.columns] + total_production = total_production.reset_index().drop(columns=["country"]) + + # Select relevant items, elements and units for the production of crops. + production = df_qcl[ + (df_qcl["item_code"].isin(ITEM_CODE_FOR_EACH_CROP_PRODUCTION.values())) + & (df_qcl["unit"] == UNIT_FOR_PRODUCTION) + & (df_qcl["element_code"] == ELEMENT_CODE_FOR_PRODUCTION_QCL) + ].reset_index(drop=True) + + # Transpose data. + production = production.pivot(index=["country", "year"], columns=["item_code"], values=["value"]) + + # Fix column names after pivoting. + production.columns = np.array(production.columns.tolist())[:, 1] + + # Assign a convenient name to each crop. 
+ CROP_NAME_FOR_ITEM_CODE = { + ITEM_CODE_FOR_EACH_CROP_PRODUCTION[item_code]: item_code for item_code in ITEM_CODE_FOR_EACH_CROP_PRODUCTION + } + production = production.rename( + columns={item_code: CROP_NAME_FOR_ITEM_CODE[item_code] + "_production" for item_code in production.columns} + ).reset_index() + + # Select relevant items, elements and units for the area of crops. + area = df_qcl[ + (df_qcl["item_code"].isin(ITEM_CODE_FOR_EACH_CROP_AREA.values())) + & (df_qcl["unit"] == UNIT_FOR_AREA) + & (df_qcl["element_code"] == ELEMENT_CODE_FOR_AREA) + ].reset_index(drop=True) + + # Transpose data. + area = area.pivot(index=["country", "year"], columns=["item_code"], values=["value"]) + + # Fix column names after pivoting. + area.columns = np.array(area.columns.tolist())[:, 1] + + # Assign a convenient name to each crop. + CROP_NAME_FOR_ITEM_CODE = { + ITEM_CODE_FOR_EACH_CROP_AREA[item_code]: item_code for item_code in ITEM_CODE_FOR_EACH_CROP_AREA + } + area = area.rename( + columns={item_code: CROP_NAME_FOR_ITEM_CODE[item_code] + "_area" for item_code in area.columns} + ).reset_index() + + # Combine production and area. + combined = pd.merge(production, area, on=["country", "year"], how="outer") + + # Add column for global vegetable oil production. + combined = pd.merge(combined, total_production, on=["year"], how="left") + + # Combine the production of palm oil and palm kernel oil, since we have the area harvested for the palm fruit + # (which leads to the production of both palm oil and palm kernel oil). + combined["palm_production"] += combined["palm_kernel_production"] + combined = combined.drop(columns=["palm_kernel_production"]) + + # For each crop, create three relevant metrics. + for crop in ITEM_CODE_FOR_EACH_CROP_AREA: + # Vegetable oil yield, which is the amount of oil produced per area harvested of the original crop. 
+ combined[f"{crop}_tonnes_per_hectare"] = combined[f"{crop}_production"] / combined[f"{crop}_area"] + # Hectares of the original crop harvested per tonne of oil produced (inverse of the previous). + combined[f"{crop}_hectares_per_tonne"] = combined[f"{crop}_area"] / combined[f"{crop}_production"] + # Area required to produce the total demand of vegetable oils using only one specific crop. + combined[f"{crop}_area_to_meet_global_oil_demand"] = ( + combined[f"{crop}_hectares_per_tonne"] * combined["vegetable_oils_production"] + ) + + # Replace infinite values (obtained when dividing by a null area) by nans. + combined = combined.replace(np.inf, np.nan) + + # Create a table, set an appropriate index, and sort conveniently. + tb_vegetable_oil_yields = Table( + combined.set_index(["country", "year"], verify_integrity=True).sort_index(), + short_name="vegetable_oil_yields", + underscore=True, + ) + + return tb_vegetable_oil_yields + + +def generate_agriculture_land_evolution(df_rl: pd.DataFrame) -> Table: + # Element code for "Area". + ELEMENT_CODE_FOR_AREA = "005110" + # Unit for element of area. + UNIT_FOR_AREA = "hectares" + # Item code for "Land under perm. meadows and pastures". + ITEM_CODE_FOR_PASTURES = "00006655" + # Item code for "Cropland". + ITEM_CODE_FOR_CROPLAND = "00006620" + # Item code for "Agricultural land". + ITEM_CODE_FOR_AGRICULTURAL_LAND = "00006610" + + # Select the relevant items, elements and units. + land = df_rl[ + (df_rl["unit"] == UNIT_FOR_AREA) + & (df_rl["element_code"] == ELEMENT_CODE_FOR_AREA) + & (df_rl["item_code"].isin([ITEM_CODE_FOR_AGRICULTURAL_LAND, ITEM_CODE_FOR_CROPLAND, ITEM_CODE_FOR_PASTURES])) + ].reset_index(drop=True) + + # Transpose data and rename columns conveniently. 
+ land = land.pivot(index=["country", "year"], columns=["item_code"], values="value").reset_index() + land.columns = list(land.columns) + land = land.rename( + columns={ + ITEM_CODE_FOR_AGRICULTURAL_LAND: "agriculture_area", + ITEM_CODE_FOR_CROPLAND: "cropland_area", + ITEM_CODE_FOR_PASTURES: "pasture_area", + }, + errors="raise", + ) + + # Add columns corresponding to the values of one decade before. + _land = land.copy() + _land["_year"] = _land["year"] + 10 + combined = pd.merge( + land, + _land, + left_on=["country", "year"], + right_on=["country", "_year"], + how="inner", + suffixes=("", "_one_decade_back"), + ).drop(columns=["_year"]) + + # For each item, add the percentage change of land use this year with respect to one decade back. + for item in ["agriculture_area", "cropland_area", "pasture_area"]: + combined[f"{item}_change"] = ( + 100 * (combined[f"{item}"] - combined[f"{item}_one_decade_back"]) / combined[f"{item}_one_decade_back"] + ) + + # Set an appropriate index and sort conveniently. + combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Create a table. + tb_land_use_evolution = Table(combined, short_name="agriculture_land_use_evolution", underscore=True) + + return tb_land_use_evolution + + +def generate_hypothetical_meat_consumption(df_qcl: pd.DataFrame) -> Table: + # Element code and unit for "Production". + ELEMENT_CODE_FOR_PRODUCTION = "005510" + UNIT_FOR_PRODUCTION = "tonnes" + # Element code and unit for per-capita "Production". + ELEMENT_CODE_FOR_PRODUCTION_PER_CAPITA = "5510pc" + UNIT_FOR_PRODUCTION_PER_CAPITA = "tonnes per capita" + # Element code and unit for "Producing or slaughtered animals". + ELEMENT_CODE_FOR_ANIMALS = "005320" + UNIT_FOR_ANIMALS = "animals" + # Element code and unit for per-capita "Producing or slaughtered animals". + ELEMENT_CODE_FOR_ANIMALS_PER_CAPITA = "5320pc" + UNIT_FOR_ANIMALS_PER_CAPITA = "animals per capita" + # Item code for "Meat, total". 
+ ITEM_CODE_FOR_MEAT_TOTAL = "00001765" + + # Select the required items/elements/units to get national data on per-capita production and slaughtered animals. + meat = df_qcl[ + (df_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) + & (df_qcl["element_code"].isin([ELEMENT_CODE_FOR_PRODUCTION_PER_CAPITA, ELEMENT_CODE_FOR_ANIMALS_PER_CAPITA])) + & (df_qcl["unit"].isin([UNIT_FOR_PRODUCTION_PER_CAPITA, UNIT_FOR_ANIMALS_PER_CAPITA])) + ].reset_index(drop=True) + meat = meat.pivot(index=["country", "year"], columns="element_code", values="value").reset_index() + meat = meat.rename( + columns={ + ELEMENT_CODE_FOR_ANIMALS_PER_CAPITA: "animals_per_capita", + ELEMENT_CODE_FOR_PRODUCTION_PER_CAPITA: "production_per_capita", + } + ) + + # Take data for global population from the "population_with_data" column for the production of total meat. + # This should coincide with the true world population. + # Note that "population_with_data" may differ with the total population for certain items/elements for region + # aggregates (e.g. "Africa"). For slaughtered animals, population with data may also differ, since it's + # built for all countries (in the garden faostat_qcl step) by aggregating. + # But this does not happen with total meat/production for the "World", since this data was extracted directly from FAOSTAT. + # TODO: Confirm this by checking qcl code, especially the one about animals slaughtered + global_population = ( + df_qcl[ + (df_qcl["country"] == "World") + & (df_qcl["element_code"] == ELEMENT_CODE_FOR_PRODUCTION) + & (df_qcl["unit"] == UNIT_FOR_PRODUCTION) + & (df_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) + ][["year", "population_with_data"]] + .reset_index(drop=True) + .rename(columns={"population_with_data": "global_population"}) + ).astype({"global_population": int}) + + # Just for reference, extract global production and number of slaughtered animals. 
+ global_production = ( + df_qcl[ + (df_qcl["country"] == "World") + & (df_qcl["element_code"] == ELEMENT_CODE_FOR_PRODUCTION) + & (df_qcl["unit"] == UNIT_FOR_PRODUCTION) + & (df_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) + ][["year", "value"]] + .reset_index(drop=True) + .rename(columns={"value": "production_global"}) + ) + global_animals = ( + df_qcl[ + (df_qcl["country"] == "World") + & (df_qcl["element_code"] == ELEMENT_CODE_FOR_ANIMALS) + & (df_qcl["unit"] == UNIT_FOR_ANIMALS) + & (df_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) + ][["year", "value"]] + .reset_index(drop=True) + .rename(columns={"value": "animals_global"}) + ) + + # Combine national with global data. + combined = multi_merge(dfs=[meat, global_population, global_production, global_animals], on=["year"], how="left") + + # Sanity check. + error = "Rows have changed after merging national data with global data." + assert len(combined) == len(meat), error + + # Add columns for hypothetical global production and number of slaughtered animals. + # This is the production (or number of slaughtered animals) that would be needed worldwide to meet the demand of a given country. + combined["production_global_hypothetical"] = combined["production_per_capita"] * combined["global_population"] + combined["animals_global_hypothetical"] = combined["animals_per_capita"] * combined["global_population"] + + # Set an appropriate index and sort conveniently. + combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Create a table with the combined data. + tb_hypothetical_meat_consumption = Table(combined, short_name="hypothetical_meat_consumption", underscore=True) + + return tb_hypothetical_meat_consumption + + +def generate_cereal_allocation(df_fbsc: pd.DataFrame) -> Table: + # Item code for "Cereals - Excluding Beer". 
+ ITEM_CODE_FOR_CEREALS = "00002905" + # Note: We disregard the contribution from "00002520" ("Cereals, Other"), which is usually negligible compared to the total. + # Element code and unit for "Food". + # Note: The element code for "Food available for consumption" is "000645"; this should be the same data, except that + # it is given in kilograms (originally it was given per capita). Therefore, we use "Food", which is more convenient. + ELEMENT_CODE_FOR_FOOD = "005142" + UNIT_FOR_FOOD = "tonnes" + # Element code and unit for "Feed". + ELEMENT_CODE_FOR_FEED = "005521" + UNIT_FOR_FEED = "tonnes" + # Element code and unit for "Other uses". + ELEMENT_CODE_FOR_OTHER_USES = "005154" + UNIT_FOR_OTHER_USES = "tonnes" + + # Select the relevant items/elements. + cereals = df_fbsc[ + (df_fbsc["item_code"] == ITEM_CODE_FOR_CEREALS) + & (df_fbsc["element_code"].isin([ELEMENT_CODE_FOR_FOOD, ELEMENT_CODE_FOR_FEED, ELEMENT_CODE_FOR_OTHER_USES])) + ].reset_index(drop=True) + + # Sanity check. + error = "Units have changed" + assert set(cereals["unit"]) == set([UNIT_FOR_FOOD, UNIT_FOR_FEED, UNIT_FOR_OTHER_USES]), error + + # Transpose data and rename columns conveniently. + cereals = ( + cereals.pivot(index=["country", "year"], columns="element_code", values="value") + .reset_index() + .rename( + columns={ + ELEMENT_CODE_FOR_FOOD: "cereals_allocated_to_food", + ELEMENT_CODE_FOR_FEED: "cereals_allocated_to_animal_feed", + ELEMENT_CODE_FOR_OTHER_USES: "cereals_allocated_to_other_uses", + } + ) + ) + + # Add variables for the share of cereals allocated to each use. + all_cereal_uses = ["food", "animal_feed", "other_uses"] + for item in all_cereal_uses: + cereals[f"share_of_cereals_allocated_to_{item}"] = ( + 100 + * cereals[f"cereals_allocated_to_{item}"] + / cereals[[f"cereals_allocated_to_{use}" for use in all_cereal_uses]].sum(axis=1) + ) + + # Set an appropriate index and sort conveniently. 
+ cereals = cereals.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Create a table with the generated data. + tb_cereal_allocation = Table(cereals, short_name="cereal_allocation", underscore=True) + + return tb_cereal_allocation + + +def generate_maize_and_wheat(df_fbsc: pd.DataFrame) -> Table: + # Item code for "Wheat". + ITEM_CODE_FOR_WHEAT = "00002511" + # Item code for "Maize". + ITEM_CODE_FOR_MAIZE = "00002514" + # Element code for "Exports". + ELEMENT_CODE_FOR_EXPORTS = "005911" + # Element code for "Feed". + ELEMENT_CODE_FOR_FEED = "005521" + # Element code for "Other uses". + ELEMENT_CODE_FOR_OTHER_USES = "005154" + + # Select the relevant items/elements. + maize_and_wheat = df_fbsc[ + (df_fbsc["item_code"].isin([ITEM_CODE_FOR_MAIZE, ITEM_CODE_FOR_WHEAT])) + & (df_fbsc["element_code"].isin([ELEMENT_CODE_FOR_EXPORTS, ELEMENT_CODE_FOR_FEED, ELEMENT_CODE_FOR_OTHER_USES])) + ] + + # Sanity check. + error = "Units have changed." + assert list(maize_and_wheat["unit"].unique()) == ["tonnes"], error + + # Transpose data and rename columns conveniently. + maize_and_wheat = maize_and_wheat.pivot( + index=["country", "year"], columns=["item_code", "element_code"], values="value" + ) + maize_and_wheat = maize_and_wheat.rename( + columns={ITEM_CODE_FOR_MAIZE: "maize", ITEM_CODE_FOR_WHEAT: "wheat"}, level=0 + ).rename( + columns={ + ELEMENT_CODE_FOR_EXPORTS: "exports", + ELEMENT_CODE_FOR_FEED: "animal_feed", + ELEMENT_CODE_FOR_OTHER_USES: "other_uses", + } + ) + maize_and_wheat.columns = [column[0] + "_" + column[1] for column in maize_and_wheat.columns] + + # Set an appropriate index and sort conveniently. + maize_and_wheat = ( + maize_and_wheat.reset_index() + .set_index(["country", "year"], verify_integrity=True) + .sort_index() + .sort_index(axis=1) + ) + + # Create a table with the generated data. 
+ tb_maize_and_wheat = Table(maize_and_wheat, short_name="maize_and_wheat", underscore=True) + + # Add minimal variable metadata (more metadata will be added at the grapher step). + for column in tb_maize_and_wheat.columns: + tb_maize_and_wheat[column].metadata.unit = "tonnes" + tb_maize_and_wheat[column].metadata.short_unit = "t" + + return tb_maize_and_wheat + + +def generate_fertilizer_exports(df_rfn: pd.DataFrame) -> Table: + # Element code for "Export Quantity". + ELEMENT_CODE_FOR_EXPORTS = "005910" + # Item code for "Nutrient nitrogen N (total)". + ITEM_CODE_FOR_NITROGEN = "00003102" + # Item code for "Nutrient phosphate P2O5 (total)". + ITEM_CODE_FOR_PHOSPHATE = "00003103" + # Item code for "Nutrient potash K2O (total)". + ITEM_CODE_FOR_POTASH = "00003104" + + # Select the relevant items and elements. + fertilizer_exports = df_rfn[ + (df_rfn["element_code"] == ELEMENT_CODE_FOR_EXPORTS) + & (df_rfn["item_code"].isin([ITEM_CODE_FOR_NITROGEN, ITEM_CODE_FOR_PHOSPHATE, ITEM_CODE_FOR_POTASH])) + ].reset_index(drop=True) + + # Sanity check. + error = "Units have changed." + assert list(fertilizer_exports["unit"].unique()) == ["Tonnes"], error + + # Rename columns and items conveniently. + fertilizer_exports = fertilizer_exports[["country", "year", "item_code", "value"]].rename( + columns={"item_code": "item", "value": "exports"} + ) + fertilizer_exports["item"] = fertilizer_exports["item"].replace( + {ITEM_CODE_FOR_NITROGEN: "Nitrogen", ITEM_CODE_FOR_PHOSPHATE: "Phosphorous", ITEM_CODE_FOR_POTASH: "Potassium"} + ) + + # Add column of global exports. + global_exports = ( + fertilizer_exports[fertilizer_exports["country"] == "World"].drop(columns=["country"]).reset_index(drop=True) + ) + fertilizer_exports = pd.merge( + fertilizer_exports, global_exports, how="left", on=["year", "item"], suffixes=("", "_global") + ) + + # Create columns for the share of exports. 
+ fertilizer_exports["share_of_exports"] = 100 * fertilizer_exports["exports"] / fertilizer_exports["exports_global"] + + # Drop column of global exports. + fertilizer_exports = fertilizer_exports.drop(columns=["exports_global"]) + + # Set an appropriate index and sort conveniently. + fertilizer_exports = ( + fertilizer_exports.set_index(["country", "year", "item"], verify_integrity=True).sort_index().sort_index(axis=1) + ) + + # Create a table with the generated data. + tb_fertilizer_exports = Table(fertilizer_exports, short_name="fertilizer_exports", underscore=True) + + # Add minimal variable metadata (more metadata will be added at the grapher step). + tb_fertilizer_exports["share_of_exports"].metadata.unit = "%" + tb_fertilizer_exports["share_of_exports"].metadata.short_unit = "%" + tb_fertilizer_exports["exports"].metadata.unit = "tonnes" + tb_fertilizer_exports["exports"].metadata.short_unit = "t" + + return tb_fertilizer_exports + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load dataset about land use, load its main (long-format) table, and create a convenient dataframe. + ds_rl = paths.load_dataset(f"{NAMESPACE}_rl") + tb_rl = ds_rl[f"{NAMESPACE}_rl"] + df_rl = pd.DataFrame(tb_rl).reset_index() + + # Load dataset about production indices, load its main (long-format) table, and create a convenient dataframe. + ds_qi = paths.load_dataset(f"{NAMESPACE}_qi") + tb_qi = ds_qi[f"{NAMESPACE}_qi"] + df_qi = pd.DataFrame(tb_qi).reset_index() + + # Load dataset about crops and livestock, load its main (long-format) table, and create a convenient dataframe. + ds_qcl = paths.load_dataset(f"{NAMESPACE}_qcl") + tb_qcl = ds_qcl[f"{NAMESPACE}_qcl"] + df_qcl = pd.DataFrame(tb_qcl).reset_index() + + # Load dataset about SDG indicators, load its main (long-format) table, and create a convenient dataframe. 
+ ds_sdgb = paths.load_dataset(f"{NAMESPACE}_sdgb") + tb_sdgb = ds_sdgb[f"{NAMESPACE}_sdgb"] + df_sdgb = pd.DataFrame(tb_sdgb).reset_index() + + # Load dataset about food balances, load its main (long-format) table, and create a convenient dataframe. + ds_fbsc = paths.load_dataset(f"{NAMESPACE}_fbsc") + tb_fbsc = ds_fbsc[f"{NAMESPACE}_fbsc"] + df_fbsc = pd.DataFrame(tb_fbsc).reset_index() + + # Load dataset about fertilizers by nutrient, load its main (long-format) table, and create a convenient dataframe. + ds_rfn = paths.load_dataset(f"{NAMESPACE}_rfn") + tb_rfn = ds_rfn[f"{NAMESPACE}_rfn"] + df_rfn = pd.DataFrame(tb_rfn).reset_index() + + # + # Process data. + # + # Create table for arable land per crop output. + tb_arable_land_per_crop_output = generate_arable_land_per_crop_output(df_rl=df_rl, df_qi=df_qi) + + # Create table for area used for production per crop type. + tb_area_by_crop_type = generate_area_used_for_production_per_crop_type(df_qcl=df_qcl) + + # Create table for the share of sustainable and overexploited fish. + tb_sustainable_and_overexploited_fish = generate_percentage_of_sustainable_and_overexploited_fish(df_sdgb=df_sdgb) + + # Create table for spared land due to increased yields. + tb_spared_land_from_increased_yields = generate_spared_land_from_increased_yields(df_qcl=df_qcl) + + # Create table for dietary compositions by commodity group. + tb_food_available_for_consumption = generate_food_available_for_consumption(df_fbsc=df_fbsc) + + # Create table for macronutrient compositions. + tb_macronutrient_compositions = generate_macronutrient_compositions(df_fbsc=df_fbsc) + + # Create table for fertilizers data. + tb_fertilizers = generate_fertilizers(df_rfn=df_rfn, df_rl=df_rl) + + # Create table for vegetable oil yields. + tb_vegetable_oil_yields = generate_vegetable_oil_yields(df_qcl=df_qcl, df_fbsc=df_fbsc) + + # Create table for peak agricultural land. 
+ tb_agriculture_land_use_evolution = generate_agriculture_land_evolution(df_rl=df_rl) + + # Create table for hypothetical meat consumption + tb_hypothetical_meat_consumption = generate_hypothetical_meat_consumption(df_qcl=df_qcl) + + # Create table for cereal allocation. + tb_cereal_allocation = generate_cereal_allocation(df_fbsc=df_fbsc) + + # Create table for maize and wheat data (used in the context of the Ukraine war). + tb_maize_and_wheat = generate_maize_and_wheat(df_fbsc=df_fbsc) + + # Create table for fertilizer exports (used in the context of the Ukraine war). + tb_fertilizer_exports = generate_fertilizer_exports(df_rfn=df_rfn) + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset( + dest_dir, + tables=[ + tb_arable_land_per_crop_output, + tb_area_by_crop_type, + tb_sustainable_and_overexploited_fish, + tb_spared_land_from_increased_yields, + tb_food_available_for_consumption, + tb_macronutrient_compositions, + tb_fertilizers, + tb_vegetable_oil_yields, + tb_agriculture_land_use_evolution, + tb_hypothetical_meat_consumption, + tb_cereal_allocation, + tb_maize_and_wheat, + tb_fertilizer_exports, + ], + ) + ds_garden.save() diff --git a/etl/steps/data/grapher/faostat/2024-03-14/additional_variables.py b/etl/steps/data/grapher/faostat/2024-03-14/additional_variables.py new file mode 100644 index 00000000000..4fe600516f1 --- /dev/null +++ b/etl/steps/data/grapher/faostat/2024-03-14/additional_variables.py @@ -0,0 +1,204 @@ +"""Load a garden dataset and create a grapher dataset.""" + +import pandas as pd +from owid.catalog import Table +from owid.catalog.utils import underscore_table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def prepare_maize_and_wheat_in_the_context_of_the_ukraine_war(tb_maize_and_wheat: Table) -> Table: + # Prepare groupings that will be shown in a stacked discrete bar chart. 
+ # Ukraine and Russia exports of maize and wheat. + ukraine_and_russia_exports = ( + pd.merge( + tb_maize_and_wheat[["maize_exports", "wheat_exports"]].loc["Ukraine"], + tb_maize_and_wheat[["maize_exports", "wheat_exports"]].loc["Russia"], + left_index=True, + right_index=True, + suffixes=(" Ukraine", " Russia"), + ) + .assign(**{"country": "Ukraine and Russia exports"}) + .reset_index() + ) + # EU and UK maize and wheat used for animal feed. + eu_and_uk_feed = ( + pd.merge( + tb_maize_and_wheat[["maize_animal_feed", "wheat_animal_feed"]].loc["European Union (27)"], + tb_maize_and_wheat[["maize_animal_feed", "wheat_animal_feed"]].loc["United Kingdom"], + left_index=True, + right_index=True, + suffixes=(" EU", " UK"), + ) + .assign(**{"country": "EU and UK animal feed"}) + .reset_index() + ) + # EU and UK maize and wheat devoted to other uses (predominantly biofuels). + eu_and_uk_biofuels = ( + pd.merge( + tb_maize_and_wheat[["maize_other_uses", "wheat_other_uses"]].loc["European Union (27)"], + tb_maize_and_wheat[["maize_other_uses", "wheat_other_uses"]].loc["United Kingdom"], + left_index=True, + right_index=True, + suffixes=(" EU", " UK"), + ) + .assign(**{"country": "EU and UK biofuels"}) + .reset_index() + ) + # US maize and wheat used for animal feed. + us_feed = ( + tb_maize_and_wheat[["maize_animal_feed", "wheat_animal_feed"]] + .loc["United States"] + .rename(columns={"maize_animal_feed": "maize_animal_feed US", "wheat_animal_feed": "wheat_animal_feed US"}) + .assign(**{"country": "US animal feed"}) + .reset_index() + ) + # US maize and wheat devoted to other uses (predominantly biofuels). + us_biofuels = ( + tb_maize_and_wheat[["maize_other_uses", "wheat_other_uses"]] + .loc["United States"] + .rename(columns={"maize_other_uses": "maize_other_uses US", "wheat_other_uses": "wheat_other_uses US"}) + .assign(**{"country": "US biofuels"}) + .reset_index() + ) + + # Combine all groupings. 
+ combined = pd.concat( + [ukraine_and_russia_exports, eu_and_uk_feed, eu_and_uk_biofuels, us_feed, us_biofuels], ignore_index=True + ) + + # Set an appropriate index and sort conveniently. + combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Adapt metadata. + combined.metadata.short_name = "maize_and_wheat_in_the_context_of_the_ukraine_war" + for column in combined.columns: + title = ( + column.replace("maize_", "Maize ") + .replace("wheat_", "Wheat ") + .replace("animal_feed", "used for animal feed in") + .replace("exports", "exported by") + .replace("other_uses", "used for biofuels in") + ) + combined[column].metadata.title = title + combined[column].metadata.unit = "tonnes" + combined[column].metadata.short_unit = "t" + combined = underscore_table(combined) + + return combined + + +def prepare_fertilizer_exports_in_the_context_of_the_ukraine_war(tb_fertilizer_exports: Table) -> Table: + # Select the relevant countries for the chart. + fertilizer_exports = tb_fertilizer_exports.loc[["Ukraine", "Russia", "Belarus"]].reset_index() + + # Transpose data. + fertilizer_exports = fertilizer_exports.pivot( + index=["item", "year"], columns="country", values=["exports", "share_of_exports"] + ) + + fertilizer_exports.columns = [column[0] + " " + column[1] for column in fertilizer_exports.columns] + + # To be able to work in grapher, rename "item" column to "country". + fertilizer_exports.index.names = ["country", "year"] + + # Adapt metadata. 
+ fertilizer_exports.metadata.short_name = "fertilizer_exports_in_the_context_of_the_ukraine_war" + for column in fertilizer_exports.columns: + element, country = column.split(" ") + title = element.capitalize().replace("_", " ") + " from " + country + fertilizer_exports[column].metadata.title = title + if "share" in column: + fertilizer_exports[column].metadata.unit = "%" + fertilizer_exports[column].metadata.short_unit = "%" + else: + fertilizer_exports[column].metadata.unit = "tonnes" + fertilizer_exports[column].metadata.short_unit = "t" + fertilizer_exports = underscore_table(fertilizer_exports) + + return fertilizer_exports + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("additional_variables") + + # Read tables from garden dataset. + tb_arable_land_per_crop_output = ds_garden["arable_land_per_crop_output"] + tb_area_used_per_crop_type = ds_garden["area_used_per_crop_type"] + tb_sustainable_and_overexploited_fish = ds_garden["share_of_sustainable_and_overexploited_fish"] + tb_land_spared_by_increased_crop_yields = ds_garden["land_spared_by_increased_crop_yields"] + tb_food_available_for_consumption = ds_garden["food_available_for_consumption"] + tb_macronutrient_compositions = ds_garden["macronutrient_compositions"] + tb_fertilizers = ds_garden["fertilizers"] + tb_vegetable_oil_yields = ds_garden["vegetable_oil_yields"] + tb_agriculture_land_use_evolution = ds_garden["agriculture_land_use_evolution"] + tb_hypothetical_meat_consumption = ds_garden["hypothetical_meat_consumption"] + tb_cereal_allocation = ds_garden["cereal_allocation"] + tb_maize_and_wheat = ds_garden["maize_and_wheat"] + tb_fertilizer_exports = ds_garden["fertilizer_exports"] + + # + # Process data. + # + # To insert table into grapher DB, change "item" column to "country" (which will be changed back in the admin). 
+ tb_area_used_per_crop_type = ( + tb_area_used_per_crop_type.reset_index() + .rename(columns={"item": "country"}) + .set_index(["country", "year"], verify_integrity=True) + .sort_index() + ) + + # For land spared by increased crop yields, for the moment we only need global data, by crop type. + # And again, change "item" to "country" to fit grapher DB needs. + tb_land_spared_by_increased_crop_yields = tb_land_spared_by_increased_crop_yields.reset_index() + tb_land_spared_by_increased_crop_yields = ( + tb_land_spared_by_increased_crop_yields[tb_land_spared_by_increased_crop_yields["country"] == "World"] + .drop(columns=["country"]) + .rename(columns={"item": "country"}) + .set_index(["country", "year"], verify_integrity=True) + .sort_index() + ) + + # Prepare maize and what data in the context of the Ukraine war. + tb_maize_and_wheat_in_the_context_of_the_ukraine_war = prepare_maize_and_wheat_in_the_context_of_the_ukraine_war( + tb_maize_and_wheat=tb_maize_and_wheat + ) + + # Prepare fertilizer exports data in the context of the Ukraine war. + tb_fertilizer_exports_in_the_context_of_the_ukraine_war = ( + prepare_fertilizer_exports_in_the_context_of_the_ukraine_war(tb_fertilizer_exports=tb_fertilizer_exports) + ) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, + tables=[ + tb_arable_land_per_crop_output, + tb_area_used_per_crop_type, + tb_sustainable_and_overexploited_fish, + tb_land_spared_by_increased_crop_yields, + tb_food_available_for_consumption, + tb_macronutrient_compositions, + tb_fertilizers, + tb_vegetable_oil_yields, + tb_agriculture_land_use_evolution, + tb_hypothetical_meat_consumption, + tb_cereal_allocation, + tb_maize_and_wheat_in_the_context_of_the_ukraine_war, + tb_fertilizer_exports_in_the_context_of_the_ukraine_war, + ], + default_metadata=ds_garden.metadata, + ) + + # Save changes in the new grapher dataset. 
+ ds_grapher.save() From 5a54af14df891aa1a6f95f39fce1806bbe51f6ea Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 20 Mar 2024 16:47:30 +0100 Subject: [PATCH 42/54] Adapt garden and grapher steps of additional variables --- .../2024-03-14/additional_variables.py | 586 +++++++++--------- .../2024-03-14/additional_variables.py | 75 ++- 2 files changed, 327 insertions(+), 334 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py index 1c7d75796ec..58ddbc2594e 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py +++ b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py @@ -3,10 +3,9 @@ """ import numpy as np -import pandas as pd +import owid.catalog.processing as pr from owid.catalog import Table from owid.catalog.utils import underscore -from owid.datautils.dataframes import multi_merge from shared import NAMESPACE from etl.helpers import PathFinder, create_dataset @@ -15,7 +14,7 @@ paths = PathFinder(__file__) -def generate_arable_land_per_crop_output(df_rl: pd.DataFrame, df_qi: pd.DataFrame) -> Table: +def generate_arable_land_per_crop_output(tb_rl: Table, tb_qi: Table) -> Table: # Item code for item "Arable land" of faostat_rl dataset. ITEM_CODE_FOR_ARABLE_LAND = "00006621" # Element code for element "Area" of faostat_rl dataset. @@ -28,27 +27,27 @@ def generate_arable_land_per_crop_output(df_rl: pd.DataFrame, df_qi: pd.DataFram PRODUCTION_INDEX_REFERENCE_YEAR = 1961 # Select the necessary item and element of the land use dataset. - df_rl = df_rl[ - (df_rl["item_code"] == ITEM_CODE_FOR_ARABLE_LAND) & (df_rl["element_code"] == ELEMENT_CODE_FOR_AREA) + tb_rl = tb_rl[ + (tb_rl["item_code"] == ITEM_CODE_FOR_ARABLE_LAND) & (tb_rl["element_code"] == ELEMENT_CODE_FOR_AREA) ].reset_index(drop=True) # Sanity check. error = "Unit for element 'Area' in faostat_rl has changed." 
- assert list(df_rl["unit"].unique()) == ["hectares"], error + assert list(tb_rl["unit"].unique()) == ["hectares"], error # Rename columns and select only necessary columns. - df_rl = df_rl[["country", "year", "value"]].rename(columns={"value": "area"}).reset_index(drop=True) + tb_rl = tb_rl[["country", "year", "value"]].rename(columns={"value": "area"}, errors="raise").reset_index(drop=True) # Select the necessary item and element of the production index dataset. - df_qi = df_qi[ - (df_qi["element_code"] == ELEMENT_CODE_PRODUCTION_INDEX) & (df_qi["item_code"] == ITEM_CODE_FOR_CROPS) + tb_qi = tb_qi[ + (tb_qi["element_code"] == ELEMENT_CODE_PRODUCTION_INDEX) & (tb_qi["item_code"] == ITEM_CODE_FOR_CROPS) ].reset_index(drop=True) # Sanity check. error = "Unit for element 'Gross Production Index Number (2014-2016 = 100)' in faostat_qi has changed." - assert list(df_qi["unit"].unique()) == [""], error + assert list(tb_qi["unit"].unique()) == [""], error # Rename columns and select only necessary columns. - df_qi = df_qi[["country", "year", "value"]].rename(columns={"value": "index"}) + tb_qi = tb_qi[["country", "year", "value"]].rename(columns={"value": "index"}, errors="raise") - # Combine both dataframes. - combined = pd.merge(df_rl, df_qi, on=["country", "year"], how="inner", validate="one_to_one") + # Combine both tables. + combined = tb_rl.merge(tb_qi, on=["country", "year"], how="inner", validate="one_to_one") # Create the new variable of arable land per crop output. 
combined["value"] = combined["area"] / combined["index"] @@ -57,27 +56,25 @@ def generate_arable_land_per_crop_output(df_rl: pd.DataFrame, df_qi: pd.DataFram reference = combined[combined["year"] == PRODUCTION_INDEX_REFERENCE_YEAR][["country", "value"]].reset_index( drop=True ) - combined = pd.merge( - combined, reference[["country", "value"]], on=["country"], how="left", suffixes=("", "_reference") - ) + combined = combined.merge(reference[["country", "value"]], on=["country"], how="left", suffixes=("", "_reference")) combined["value"] /= combined["value_reference"] # Remove all countries for which we did not have data for the reference year. combined = combined.dropna(subset="value").reset_index(drop=True) # Remove unnecessary columns and rename conveniently. - combined = combined.drop(columns=["value_reference"]).rename(columns={"value": "arable_land_per_crop_output"}) + combined = combined.drop(columns=["value_reference"], errors="raise").rename( + columns={"value": "arable_land_per_crop_output"}, errors="raise" + ) # Set an appropriate index and sort conveniently. - tb_combined = Table( - combined.set_index(["country", "year"], verify_integrity=True).sort_index(), - short_name="arable_land_per_crop_output", - ) + tb_combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index() + tb_combined.metadata.short_name = "arable_land_per_crop_output" return tb_combined -def generate_area_used_for_production_per_crop_type(df_qcl: pd.DataFrame) -> Table: +def generate_area_used_for_production_per_crop_type(tb_qcl: Table) -> Table: # Element code for "Area harvested" of faostat_qcl dataset. ELEMENT_CODE_FOR_AREA_HARVESTED = "005312" @@ -117,11 +114,11 @@ def generate_area_used_for_production_per_crop_type(df_qcl: pd.DataFrame) -> Tab ] error = "Not all expected item codes were found in QCL." 
- assert set(ITEM_CODES_COARSE_GRAINS) < set(df_qcl["item_code"]), error + assert set(ITEM_CODES_COARSE_GRAINS) < set(tb_qcl["item_code"]), error # Select the world and the element code for area harvested. - area_by_crop_type = df_qcl[ - (df_qcl["country"] == "World") & (df_qcl["element_code"] == ELEMENT_CODE_FOR_AREA_HARVESTED) + area_by_crop_type = tb_qcl[ + (tb_qcl["country"] == "World") & (tb_qcl["element_code"] == ELEMENT_CODE_FOR_AREA_HARVESTED) ].reset_index(drop=True) error = "Unit for element 'Area harvested' in faostat_qcl has changed." assert list(area_by_crop_type["unit"].unique()) == ["hectares"], error @@ -133,7 +130,7 @@ def generate_area_used_for_production_per_crop_type(df_qcl: pd.DataFrame) -> Tab .agg({"value": "sum"}) .assign(**{"item": "Coarse Grain", "item_code": "00001814"}) ) - area_by_crop_type = pd.concat( + area_by_crop_type = pr.concat( [area_by_crop_type[~area_by_crop_type["item_code"].isin(ITEM_CODES_COARSE_GRAINS)], coarse_grains], ignore_index=True, ) @@ -153,52 +150,49 @@ def generate_area_used_for_production_per_crop_type(df_qcl: pd.DataFrame) -> Tab descriptions += f"\n\nMetric: {area_by_crop_type['element'].iloc[0]}" descriptions += f"\nDescription: {area_by_crop_type['element_description'].iloc[0]}" - # Create a table with the necessary columns, set an appropriate index, and sort conveniently. - tb_area_by_crop_type = Table( + # Select the necessary columns, set an appropriate index, and sort conveniently. + tb_area_by_crop_type = ( area_by_crop_type[["item", "year", "value"]] - .rename(columns={"value": "area_used_for_production"}) + .rename(columns={"value": "area_used_for_production"}, errors="raise") .set_index(["item", "year"], verify_integrity=True) - .sort_index(), - short_name="area_used_per_crop_type", + .sort_index() ) + tb_area_by_crop_type.metadata.short_name = "area_used_per_crop_type" - # Add a table description. 
- tb_area_by_crop_type["area_used_for_production"].metadata.description = descriptions + # Add table description to the indicator's key description. + tb_area_by_crop_type["area_used_for_production"].metadata.description_key = descriptions return tb_area_by_crop_type -def generate_percentage_of_sustainable_and_overexploited_fish(df_sdgb: pd.DataFrame) -> Table: +def generate_percentage_of_sustainable_and_overexploited_fish(tb_sdgb: Table) -> Table: # "14.4.1 Proportion of fish stocks within biologically sustainable levels" ITEM_CODE_SUSTAINABLE_FISH = "000000000024029" # Select the necessary item. - df_sdgb = df_sdgb[df_sdgb["item_code"] == ITEM_CODE_SUSTAINABLE_FISH].reset_index(drop=True) + tb_sdgb = tb_sdgb[tb_sdgb["item_code"] == ITEM_CODE_SUSTAINABLE_FISH].reset_index(drop=True) error = "Unit for fish data has changed." - assert list(df_sdgb["unit"].unique()) == ["Percent"], error + assert list(tb_sdgb["unit"].unique()) == ["Percent"], error error = "Element for fish data has changed." - assert list(df_sdgb["element"].unique()) == ["Value"], error + assert list(tb_sdgb["element"].unique()) == ["Value"], error # Select necessary columns (item and element descriptions are empty in the current version). - df_sdgb = df_sdgb[["country", "year", "value"]].rename(columns={"value": "sustainable_fish"}) + tb_sdgb = tb_sdgb[["country", "year", "value"]].rename(columns={"value": "sustainable_fish"}, errors="raise") error = "Percentage of sustainable fish larger than 100%." - assert (df_sdgb["sustainable_fish"] <= 100).all(), error + assert (tb_sdgb["sustainable_fish"] <= 100).all(), error # Add column of percentage of overexploited fish. - df_sdgb["overexploited_fish"] = 100 - df_sdgb["sustainable_fish"] + tb_sdgb["overexploited_fish"] = 100 - tb_sdgb["sustainable_fish"] - # Create a table with the necessary columns, set an appropriate index, and sort conveniently. 
- tb_fish = ( - Table(df_sdgb, short_name="share_of_sustainable_and_overexploited_fish") - .set_index(["country", "year"], verify_integrity=True) - .sort_index() - ) + # Set an appropriate index and sort conveniently. + tb_fish = tb_sdgb.set_index(["country", "year"], verify_integrity=True).sort_index() + tb_fish.metadata.short_name = "share_of_sustainable_and_overexploited_fish" return tb_fish -def generate_spared_land_from_increased_yields(df_qcl: pd.DataFrame) -> Table: +def generate_spared_land_from_increased_yields(tb_qcl: Table) -> Table: # Reference year (to see how much land we spare from increased yields). REFERENCE_YEAR = 1961 # Element code for "Yield" of faostat_qcl dataset. @@ -221,27 +215,24 @@ def generate_spared_land_from_increased_yields(df_qcl: pd.DataFrame) -> Table: ] # Select necessary items and elements. - spared_land = df_qcl[ - (df_qcl["item_code"].isin(ITEM_CODES_OF_CROP_GROUPS)) - & (df_qcl["element_code"].isin([ELEMENT_CODE_FOR_PRODUCTION, ELEMENT_CODE_FOR_YIELD])) + spared_land = tb_qcl[ + (tb_qcl["item_code"].isin(ITEM_CODES_OF_CROP_GROUPS)) + & (tb_qcl["element_code"].isin([ELEMENT_CODE_FOR_PRODUCTION, ELEMENT_CODE_FOR_YIELD])) ].reset_index(drop=True) # Sanity check. error = "Units for production and yield have changed." assert set(spared_land["unit"]) == set(["tonnes per hectare", "tonnes"]), error - # Transpose dataframe. + # Transpose table. spared_land = spared_land.pivot( - index=["country", "year", "item"], columns=["element"], values="value" - ).reset_index() - - # Fix spurious index name after pivotting. - spared_land.columns = list(spared_land.columns) + index=["country", "year", "item"], columns=["element"], values="value", join_column_levels_with="_" + ) # Add columns for production and yield for a given reference year. 
- reference_values = spared_land[spared_land["year"] == REFERENCE_YEAR].drop(columns=["year"]) - spared_land = pd.merge( - spared_land, reference_values, on=["country", "item"], how="left", suffixes=("", f" in {REFERENCE_YEAR}") + reference_values = spared_land[spared_land["year"] == REFERENCE_YEAR].drop(columns=["year"], errors="raise") + spared_land = spared_land.merge( + reference_values, on=["country", "item"], how="left", suffixes=("", f" in {REFERENCE_YEAR}") ) # Drop countries for which we did not have data in the reference year. @@ -265,7 +256,7 @@ def generate_spared_land_from_increased_yields(df_qcl: pd.DataFrame) -> Table: .agg({"Area": sum, f"Area with yield of {REFERENCE_YEAR}": sum}) .assign(**{"item": "All crops"}) ) - spared_land = pd.concat([spared_land, all_crops], ignore_index=True) + spared_land = pr.concat([spared_land, all_crops], ignore_index=True) # Calculate the spared land in total value, and as a percentage of land we would have used with no yield increase. spared_land["Spared land"] = spared_land[f"Area with yield of {REFERENCE_YEAR}"] - spared_land["Area"] @@ -273,28 +264,25 @@ def generate_spared_land_from_increased_yields(df_qcl: pd.DataFrame) -> Table: 100 * spared_land["Spared land"] / spared_land[f"Area with yield of {REFERENCE_YEAR}"] ) - # Create a table with the necessary columns, set an appropriate index, and sort conveniently. - tb_spared_land = Table( - spared_land.set_index(["country", "year", "item"], verify_integrity=True).sort_index(), - short_name="land_spared_by_increased_crop_yields", - underscore=True, - ) + # Set an appropriate index and sort conveniently. 
+ tb_spared_land = spared_land.set_index(["country", "year", "item"], verify_integrity=True).sort_index() + tb_spared_land.metadata.short_name = "land_spared_by_increased_crop_yields" return tb_spared_land -def generate_food_available_for_consumption(df_fbsc: pd.DataFrame) -> Table: +def generate_food_available_for_consumption(tb_fbsc: Table) -> Table: # Element code for "Food available for consumption" of faostat_fbsc (in kilocalories per day per capita). ELEMENT_CODE_FOR_PER_CAPITA_FOOD = "0664pc" # Expected unit. CONSUMPTION_UNIT = "kilocalories per day per capita" # Select relevant metric. - df_fbsc = df_fbsc[(df_fbsc["element_code"] == ELEMENT_CODE_FOR_PER_CAPITA_FOOD)].reset_index(drop=True) + tb_fbsc = tb_fbsc[(tb_fbsc["element_code"] == ELEMENT_CODE_FOR_PER_CAPITA_FOOD)].reset_index(drop=True) # Sanity check. error = "Units for food available for consumption have changed." - assert list(df_fbsc["unit"].unique()) == [CONSUMPTION_UNIT], error + assert list(tb_fbsc["unit"].unique()) == [CONSUMPTION_UNIT], error # List of food groups created by OWID. # Each food group contains one or more "item groups", defined by FAOSTAT. @@ -475,24 +463,23 @@ def generate_food_available_for_consumption(df_fbsc: pd.DataFrame) -> Table: # Sanity check. error = "Not all expected item codes are found in the data." - assert set([item_code for group in FOOD_GROUPS.values() for item_code in group]) <= set(df_fbsc["item_code"]), error + assert set([item_code for group in FOOD_GROUPS.values() for item_code in group]) <= set(tb_fbsc["item_code"]), error - # Create a list of dataframes, one for each food group. - dfs = [ - df_fbsc[df_fbsc["item_code"].isin(FOOD_GROUPS[group])] + # Create a list of tables, one for each food group. 
+ tables = [ + tb_fbsc[tb_fbsc["item_code"].isin(FOOD_GROUPS[group])] .groupby(["country", "year"], as_index=False, observed=True) .agg({"value": "sum"}) - .rename(columns={"value": group}) + .rename(columns={"value": group}, errors="raise") for group in FOOD_GROUPS ] - combined = multi_merge(dfs=dfs, on=["country", "year"], how="outer") + combined = pr.multi_merge(tables=tables, on=["country", "year"], how="outer") - # Create a table, set an appropriate index, and sort conveniently. - tb_food_available_for_consumption = Table( - combined.set_index(["country", "year"], verify_integrity=True).sort_index(), - short_name="food_available_for_consumption", - underscore=True, + # Ensure all column names are snake-case, set an appropriate index and and sort conveniently. + tb_food_available_for_consumption = ( + combined.underscore().set_index(["country", "year"], verify_integrity=True).sort_index() ) + tb_food_available_for_consumption.metadata.short_name = "food_available_for_consumption" # Prepare variable metadata. common_description = ( @@ -502,7 +489,7 @@ def generate_food_available_for_consumption(df_fbsc: pd.DataFrame) -> Table: "individual.\n\nSpecific food commodities have been grouped into higher-level categories." 
) for group in FOOD_GROUPS: - item_names = list(df_fbsc[df_fbsc["item_code"].isin(FOOD_GROUPS[group])]["item"].unique()) + item_names = list(tb_fbsc[tb_fbsc["item_code"].isin(FOOD_GROUPS[group])]["item"].unique()) description = ( common_description + f" Food group '{group}' includes the FAO item groups: '" @@ -514,12 +501,12 @@ def generate_food_available_for_consumption(df_fbsc: pd.DataFrame) -> Table: ].metadata.title = f"Daily caloric intake per person from {group.lower().replace('other', 'other commodities')}" tb_food_available_for_consumption[underscore(group)].metadata.unit = CONSUMPTION_UNIT tb_food_available_for_consumption[underscore(group)].metadata.short_unit = "kcal" - tb_food_available_for_consumption[underscore(group)].metadata.description = description + tb_food_available_for_consumption[underscore(group)].metadata.description_key = description return tb_food_available_for_consumption -def generate_macronutrient_compositions(df_fbsc: pd.DataFrame) -> Table: +def generate_macronutrient_compositions(tb_fbsc: Table) -> Table: # Item code for "Total" of faostat_fbsc. ITEM_CODE_ALL_PRODUCTS = "00002901" # Item code for "Vegetal Products" of faostat_fbsc. @@ -540,10 +527,10 @@ def generate_macronutrient_compositions(df_fbsc: pd.DataFrame) -> Table: KCAL_PER_GRAM_OF_CARBOHYDRATES = 4 # Select relevant items and elements. - df = df_fbsc[ - (df_fbsc["item_code"].isin([ITEM_CODE_ALL_PRODUCTS, ITEM_CODE_ANIMAL_PRODUCTS, ITEM_CODE_VEGETAL_PRODUCTS])) + tb = tb_fbsc[ + (tb_fbsc["item_code"].isin([ITEM_CODE_ALL_PRODUCTS, ITEM_CODE_ANIMAL_PRODUCTS, ITEM_CODE_VEGETAL_PRODUCTS])) & ( - df_fbsc["element_code"].isin( + tb_fbsc["element_code"].isin( [ELEMENT_CODE_FOR_ENERGY_PER_DAY, ELEMENT_CODE_FOR_PROTEIN_PER_DAY, ELEMENT_CODE_FOR_FAT_PER_DAY] ) ) @@ -551,7 +538,7 @@ def generate_macronutrient_compositions(df_fbsc: pd.DataFrame) -> Table: # Sanity check. error = "One or more of the units of food available for consumption has changed." 
- assert list(df["unit"].unique()) == [ + assert list(tb["unit"].unique()) == [ "kilocalories per day per capita", "grams of protein per day per capita", "grams of fat per day per capita", @@ -564,38 +551,38 @@ def generate_macronutrient_compositions(df_fbsc: pd.DataFrame) -> Table: "protein": ELEMENT_CODE_FOR_PROTEIN_PER_DAY, } - # Initialize a list of dataframes, one for each food content (energy, fat or protein). - dfs = [] + # Initialize a list of tables, one for each food content (energy, fat or protein). + tables = [] for content in food_contents: - # Create a dataframe for each food content, and add it to the list. - df_content = df[df["element_code"] == food_contents[content]].pivot( - index=["country", "year"], columns=["item"], values=["value"] - ) # .reset_index() - df_content.columns = df_content.columns.droplevel(0) - df_content = df_content.reset_index().rename( + # Create a table for each food content, and add it to the list. + tb_content = tb[tb["element_code"] == food_contents[content]].pivot( + index=["country", "year"], columns=["item"], values=["value"], join_column_levels_with="_" + ) + tb_content = tb_content.rename( columns={ - "Total": f"Total {content}", - "Vegetal Products": f"{content.capitalize()} from vegetal products", - "Animal Products": f"{content.capitalize()} from animal products", - } + "value_Total": f"Total {content}", + "value_Vegetal Products": f"{content.capitalize()} from vegetal products", + "value_Animal Products": f"{content.capitalize()} from animal products", + }, + errors="raise", ) - dfs.append(df_content) + tables.append(tb_content) # Sanity check. error = f"The sum of animal and vegetable {content} does not add up to the total." 
assert ( 100 * abs( - df_content[f"{content.capitalize()} from animal products"] - + df_content[f"{content.capitalize()} from vegetal products"] - - df_content[f"Total {content}"] + tb_content[f"{content.capitalize()} from animal products"] + + tb_content[f"{content.capitalize()} from vegetal products"] + - tb_content[f"Total {content}"] ) - / df_content[f"Total {content}"] + / tb_content[f"Total {content}"] < 1 ).all(), error - # Combine all dataframes. - combined = multi_merge(dfs=dfs, on=["country", "year"], how="outer") + # Combine all tables. + combined = pr.multi_merge(tables=tables, on=["country", "year"], how="outer") # Daily caloric intake from fat, per person. combined["Total energy from fat"] = combined["Total fat"] * KCAL_PER_GRAM_OF_FAT @@ -632,17 +619,14 @@ def generate_macronutrient_compositions(df_fbsc: pd.DataFrame) -> Table: 100 * combined["Energy from vegetal protein"] / combined["Total energy"] ) - # Create a table, set an appropriate index, and sort conveniently. - tb_combined = Table( - combined.set_index(["country", "year"], verify_integrity=True).sort_index(), - short_name="macronutrient_compositions", - underscore=True, - ) + # Ensure all column names are snake-case, set an appropriate index, and sort conveniently. + tb_combined = combined.underscore().set_index(["country", "year"], verify_integrity=True).sort_index() + tb_combined.metadata.short_name = "macronutrient_compositions" return tb_combined -def generate_fertilizers(df_rfn: pd.DataFrame, df_rl: pd.DataFrame) -> Table: +def generate_fertilizers(tb_rfn: Table, tb_rl: Table) -> Table: # Item code for "Cropland" (which includes arable land and permanent crops). ITEM_CODE_FOR_CROPLAND = "00006620" @@ -659,7 +643,7 @@ def generate_fertilizers(df_rfn: pd.DataFrame, df_rl: pd.DataFrame) -> Table: KG_TO_TONNES = 1e-3 # Select necessary element (use per area). 
- fertilizers = df_rfn[(df_rfn["element_code"] == ELEMENT_CODE_FOR_USE_PER_AREA)].reset_index(drop=True) + fertilizers = tb_rfn[(tb_rfn["element_code"] == ELEMENT_CODE_FOR_USE_PER_AREA)].reset_index(drop=True) # Sanity checks. error = "Unit for use per area has changed." @@ -669,16 +653,16 @@ def generate_fertilizers(df_rfn: pd.DataFrame, df_rl: pd.DataFrame) -> Table: assert set(fertilizers["item_code"]) == set(ITEM_CODES_FOR_FERTILIZERS), error # Transpose fertilizers data. - fertilizers = fertilizers.pivot(index=["country", "year"], columns=["item"], values=["value"]) - - # Fix spurious index names after pivoting, and rename columns conveniently. - fertilizers.columns = [column[1] for column in fertilizers.columns] + fertilizers = fertilizers.pivot( + index=["country", "year"], columns=["item"], values=["value"], join_column_levels_with="_" + ) + # Rename columns conveniently. fertilizers = fertilizers.rename( columns={ - "Nutrient nitrogen N (total)": "nitrogen_per_cropland", - "Nutrient phosphate P2O5 (total)": "phosphate_per_cropland", - "Nutrient potash K2O (total)": "potash_per_cropland", + "value_Nutrient nitrogen N (total)": "nitrogen_per_cropland", + "value_Nutrient phosphate P2O5 (total)": "phosphate_per_cropland", + "value_Nutrient potash K2O (total)": "potash_per_cropland", }, errors="raise", ) @@ -689,8 +673,8 @@ def generate_fertilizers(df_rfn: pd.DataFrame, df_rl: pd.DataFrame) -> Table: ].sum(axis=1) # To get total agricultural use of fertilizers, we need cropland area. - area = df_rl[ - (df_rl["element_code"] == ELEMENT_CODE_FOR_AREA) & (df_rl["item_code"] == ITEM_CODE_FOR_CROPLAND) + area = tb_rl[ + (tb_rl["element_code"] == ELEMENT_CODE_FOR_AREA) & (tb_rl["item_code"] == ITEM_CODE_FOR_CROPLAND) ].reset_index(drop=True) # Sanity check. @@ -698,27 +682,25 @@ def generate_fertilizers(df_rfn: pd.DataFrame, df_rl: pd.DataFrame) -> Table: assert list(area["unit"].unique()) == ["hectares"], error # Transpose area data. 
- area = area.pivot(index=["country", "year"], columns=["item"], values=["value"]).reset_index() - area.columns = ["country", "year", "cropland"] + area = area.pivot( + index=["country", "year"], columns=["item"], values=["value"], join_column_levels_with="_" + ).rename(columns={"value_Cropland": "cropland"}, errors="raise") # Combine fertilizers and area. - combined = pd.merge(fertilizers, area, on=["country", "year"], how="outer", validate="one_to_one") + combined = fertilizers.merge(area, on=["country", "year"], how="outer", validate="one_to_one") # Add variables for total fertilizer use. for fertilizer in ["nitrogen", "phosphate", "potash", "all_fertilizers"]: combined[f"{fertilizer}_use"] = combined[f"{fertilizer}_per_cropland"] * combined["cropland"] * KG_TO_TONNES - # Create a table, set an appropriate index, and sort conveniently. - tb_fertilizers = Table( - combined.set_index(["country", "year"], verify_integrity=True).sort_index(), - short_name="fertilizers", - underscore=True, - ) + # Set an appropriate index and sort conveniently. + tb_fertilizers = combined.set_index(["country", "year"], verify_integrity=True).sort_index() + tb_fertilizers.metadata.short_name = "fertilizers" return tb_fertilizers -def generate_vegetable_oil_yields(df_qcl: pd.DataFrame, df_fbsc: pd.DataFrame) -> Table: +def generate_vegetable_oil_yields(tb_qcl: Table, tb_fbsc: Table) -> Table: # Element code for "Production" in faostat_qcl. ELEMENT_CODE_FOR_PRODUCTION_QCL = "005510" # Element code for "Production" in faostat_fbsc. @@ -776,74 +758,90 @@ def generate_vegetable_oil_yields(df_qcl: pd.DataFrame, df_fbsc: pd.DataFrame) - } # Extract the total production of vegetable oil. This is given in fbsc but not qcl. 
- total_production = df_fbsc[ - (df_fbsc["country"] == "World") - & (df_fbsc["item_code"] == ITEM_CODE_FOR_VEGETABLE_OILS_TOTAL) - & (df_fbsc["element_code"] == ELEMENT_CODE_FOR_PRODUCTION_FBSC) - & (df_fbsc["unit"] == UNIT_FOR_PRODUCTION) + total_production = tb_fbsc[ + (tb_fbsc["country"] == "World") + & (tb_fbsc["item_code"] == ITEM_CODE_FOR_VEGETABLE_OILS_TOTAL) + & (tb_fbsc["element_code"] == ELEMENT_CODE_FOR_PRODUCTION_FBSC) + & (tb_fbsc["unit"] == UNIT_FOR_PRODUCTION) ].reset_index(drop=True) # Transpose data. - total_production = total_production.pivot( - index=["country", "year"], columns=["item_code"], values=["value"] - ).rename(columns={ITEM_CODE_FOR_VEGETABLE_OILS_TOTAL: "vegetable_oils_production"}) - - # Fix column names after pivoting. - total_production.columns = [column[1] for column in total_production.columns] - total_production = total_production.reset_index().drop(columns=["country"]) + total_production = ( + total_production.pivot( + index=["country", "year"], columns=["item_code"], values=["value"], join_column_levels_with="_" + ) + .rename(columns={"value_" + ITEM_CODE_FOR_VEGETABLE_OILS_TOTAL: "vegetable_oils_production"}, errors="raise") + .drop(columns=["country"], errors="raise") + ) # Select relevant items, elements and units for the production of crops. - production = df_qcl[ - (df_qcl["item_code"].isin(ITEM_CODE_FOR_EACH_CROP_PRODUCTION.values())) - & (df_qcl["unit"] == UNIT_FOR_PRODUCTION) - & (df_qcl["element_code"] == ELEMENT_CODE_FOR_PRODUCTION_QCL) + production = tb_qcl[ + (tb_qcl["item_code"].isin(ITEM_CODE_FOR_EACH_CROP_PRODUCTION.values())) + & (tb_qcl["unit"] == UNIT_FOR_PRODUCTION) + & (tb_qcl["element_code"] == ELEMENT_CODE_FOR_PRODUCTION_QCL) ].reset_index(drop=True) # Transpose data. - production = production.pivot(index=["country", "year"], columns=["item_code"], values=["value"]) - - # Fix column names after pivoting. 
- production.columns = np.array(production.columns.tolist())[:, 1] + production = production.pivot( + index=["country", "year"], columns=["item_code"], values=["value"], join_column_levels_with="_" + ) + production = production.rename( + columns={ + column: column.replace("value_", "") for column in production.columns if column not in ["country", "year"] + }, + errors="raise", + ) # Assign a convenient name to each crop. CROP_NAME_FOR_ITEM_CODE = { ITEM_CODE_FOR_EACH_CROP_PRODUCTION[item_code]: item_code for item_code in ITEM_CODE_FOR_EACH_CROP_PRODUCTION } production = production.rename( - columns={item_code: CROP_NAME_FOR_ITEM_CODE[item_code] + "_production" for item_code in production.columns} - ).reset_index() + columns={ + item_code: CROP_NAME_FOR_ITEM_CODE[item_code] + "_production" + for item_code in production.columns + if item_code not in ["country", "year"] + }, + errors="raise", + ) # Select relevant items, elements and units for the area of crops. - area = df_qcl[ - (df_qcl["item_code"].isin(ITEM_CODE_FOR_EACH_CROP_AREA.values())) - & (df_qcl["unit"] == UNIT_FOR_AREA) - & (df_qcl["element_code"] == ELEMENT_CODE_FOR_AREA) + area = tb_qcl[ + (tb_qcl["item_code"].isin(ITEM_CODE_FOR_EACH_CROP_AREA.values())) + & (tb_qcl["unit"] == UNIT_FOR_AREA) + & (tb_qcl["element_code"] == ELEMENT_CODE_FOR_AREA) ].reset_index(drop=True) # Transpose data. - area = area.pivot(index=["country", "year"], columns=["item_code"], values=["value"]) - - # Fix column names after pivoting. - area.columns = np.array(area.columns.tolist())[:, 1] + area = area.pivot(index=["country", "year"], columns=["item_code"], values=["value"], join_column_levels_with="_") + area = area.rename( + columns={column: column.replace("value_", "") for column in area.columns if column not in ["country", "year"]}, + errors="raise", + ) # Assign a convenient name to each crop. 
CROP_NAME_FOR_ITEM_CODE = { ITEM_CODE_FOR_EACH_CROP_AREA[item_code]: item_code for item_code in ITEM_CODE_FOR_EACH_CROP_AREA } area = area.rename( - columns={item_code: CROP_NAME_FOR_ITEM_CODE[item_code] + "_area" for item_code in area.columns} - ).reset_index() + columns={ + item_code: CROP_NAME_FOR_ITEM_CODE[item_code] + "_area" + for item_code in area.columns + if item_code not in ["country", "year"] + }, + errors="raise", + ) # Combine production and area. - combined = pd.merge(production, area, on=["country", "year"], how="outer") + combined = production.merge(area, on=["country", "year"], how="outer") # Add column for global vegetable oil production. - combined = pd.merge(combined, total_production, on=["year"], how="left") + combined = combined.merge(total_production, on=["year"], how="left") # Combine the production of palm oil and palm kernel oil, since we have the area harvested for the palm fruit # (which leads to the production of both palm oil and palm kernel oil). combined["palm_production"] += combined["palm_kernel_production"] - combined = combined.drop(columns=["palm_kernel_production"]) + combined = combined.drop(columns=["palm_kernel_production"], errors="raise") # For each crop, create three relevant metrics. for crop in ITEM_CODE_FOR_EACH_CROP_AREA: @@ -859,17 +857,14 @@ def generate_vegetable_oil_yields(df_qcl: pd.DataFrame, df_fbsc: pd.DataFrame) - # Replace infinite values (obtained when dividing by a null area) by nans. combined = combined.replace(np.inf, np.nan) - # Create a table, set an appropriate index, and sort conveniently. - tb_vegetable_oil_yields = Table( - combined.set_index(["country", "year"], verify_integrity=True).sort_index(), - short_name="vegetable_oil_yields", - underscore=True, - ) + # Set an appropriate index and sort conveniently. 
+ tb_vegetable_oil_yields = combined.set_index(["country", "year"], verify_integrity=True).sort_index() + tb_vegetable_oil_yields.metadata.short_name = "vegetable_oil_yields" return tb_vegetable_oil_yields -def generate_agriculture_land_evolution(df_rl: pd.DataFrame) -> Table: +def generate_agriculture_land_evolution(tb_rl: Table) -> Table: # Element code for "Area". ELEMENT_CODE_FOR_AREA = "005110" # Unit for element of area. @@ -882,15 +877,14 @@ def generate_agriculture_land_evolution(df_rl: pd.DataFrame) -> Table: ITEM_CODE_FOR_AGRICULTURAL_LAND = "00006610" # Select the relevant items, elements and units. - land = df_rl[ - (df_rl["unit"] == UNIT_FOR_AREA) - & (df_rl["element_code"] == ELEMENT_CODE_FOR_AREA) - & (df_rl["item_code"].isin([ITEM_CODE_FOR_AGRICULTURAL_LAND, ITEM_CODE_FOR_CROPLAND, ITEM_CODE_FOR_PASTURES])) + land = tb_rl[ + (tb_rl["unit"] == UNIT_FOR_AREA) + & (tb_rl["element_code"] == ELEMENT_CODE_FOR_AREA) + & (tb_rl["item_code"].isin([ITEM_CODE_FOR_AGRICULTURAL_LAND, ITEM_CODE_FOR_CROPLAND, ITEM_CODE_FOR_PASTURES])) ].reset_index(drop=True) # Transpose data and rename columns conveniently. - land = land.pivot(index=["country", "year"], columns=["item_code"], values="value").reset_index() - land.columns = list(land.columns) + land = land.pivot(index=["country", "year"], columns=["item_code"], values="value", join_column_levels_with="_") land = land.rename( columns={ ITEM_CODE_FOR_AGRICULTURAL_LAND: "agriculture_area", @@ -903,14 +897,13 @@ def generate_agriculture_land_evolution(df_rl: pd.DataFrame) -> Table: # Add columns corresponding to the values of one decade before. 
_land = land.copy() _land["_year"] = _land["year"] + 10 - combined = pd.merge( - land, + combined = land.merge( _land, left_on=["country", "year"], right_on=["country", "_year"], how="inner", suffixes=("", "_one_decade_back"), - ).drop(columns=["_year"]) + ).drop(columns=["_year"], errors="raise") # For each item, add the percentage change of land use this year with respect to one decade back. for item in ["agriculture_area", "cropland_area", "pasture_area"]: @@ -918,16 +911,19 @@ def generate_agriculture_land_evolution(df_rl: pd.DataFrame) -> Table: 100 * (combined[f"{item}"] - combined[f"{item}_one_decade_back"]) / combined[f"{item}_one_decade_back"] ) - # Set an appropriate index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + # To avoid warnings, copy metadata of "year" to the new column "year_one_decade_back". + combined["year_one_decade_back"] = combined["year_one_decade_back"].copy_metadata(combined["year"]) - # Create a table. - tb_land_use_evolution = Table(combined, short_name="agriculture_land_use_evolution", underscore=True) + # Set an appropriate index and sort conveniently. + tb_agriculture_land_use_evolution = ( + combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + ) + tb_agriculture_land_use_evolution.metadata.short_name = "agriculture_land_use_evolution" - return tb_land_use_evolution + return tb_agriculture_land_use_evolution -def generate_hypothetical_meat_consumption(df_qcl: pd.DataFrame) -> Table: +def generate_hypothetical_meat_consumption(tb_qcl: Table) -> Table: # Element code and unit for "Production". 
ELEMENT_CODE_FOR_PRODUCTION = "005510" UNIT_FOR_PRODUCTION = "tonnes" @@ -944,17 +940,18 @@ def generate_hypothetical_meat_consumption(df_qcl: pd.DataFrame) -> Table: ITEM_CODE_FOR_MEAT_TOTAL = "00001765" # Select the required items/elements/units to get national data on per-capita production and slaughtered animals. - meat = df_qcl[ - (df_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) - & (df_qcl["element_code"].isin([ELEMENT_CODE_FOR_PRODUCTION_PER_CAPITA, ELEMENT_CODE_FOR_ANIMALS_PER_CAPITA])) - & (df_qcl["unit"].isin([UNIT_FOR_PRODUCTION_PER_CAPITA, UNIT_FOR_ANIMALS_PER_CAPITA])) + meat = tb_qcl[ + (tb_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) + & (tb_qcl["element_code"].isin([ELEMENT_CODE_FOR_PRODUCTION_PER_CAPITA, ELEMENT_CODE_FOR_ANIMALS_PER_CAPITA])) + & (tb_qcl["unit"].isin([UNIT_FOR_PRODUCTION_PER_CAPITA, UNIT_FOR_ANIMALS_PER_CAPITA])) ].reset_index(drop=True) - meat = meat.pivot(index=["country", "year"], columns="element_code", values="value").reset_index() + meat = meat.pivot(index=["country", "year"], columns="element_code", values="value", join_column_levels_with="_") meat = meat.rename( columns={ ELEMENT_CODE_FOR_ANIMALS_PER_CAPITA: "animals_per_capita", ELEMENT_CODE_FOR_PRODUCTION_PER_CAPITA: "production_per_capita", - } + }, + errors="raise", ) # Take data for global population from the "population_with_data" column for the production of total meat. @@ -965,40 +962,42 @@ def generate_hypothetical_meat_consumption(df_qcl: pd.DataFrame) -> Table: # But this does not happen with total meat/production for the "World", since this data was extracted directly from FAOSTAT. 
# TODO: Confirm this by checking qcl code, especially the one about animals slaughtered global_population = ( - df_qcl[ - (df_qcl["country"] == "World") - & (df_qcl["element_code"] == ELEMENT_CODE_FOR_PRODUCTION) - & (df_qcl["unit"] == UNIT_FOR_PRODUCTION) - & (df_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) + tb_qcl[ + (tb_qcl["country"] == "World") + & (tb_qcl["element_code"] == ELEMENT_CODE_FOR_PRODUCTION) + & (tb_qcl["unit"] == UNIT_FOR_PRODUCTION) + & (tb_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) ][["year", "population_with_data"]] .reset_index(drop=True) - .rename(columns={"population_with_data": "global_population"}) + .rename(columns={"population_with_data": "global_population"}, errors="raise") ).astype({"global_population": int}) # Just for reference, extract global production and number of slaughtered animals. global_production = ( - df_qcl[ - (df_qcl["country"] == "World") - & (df_qcl["element_code"] == ELEMENT_CODE_FOR_PRODUCTION) - & (df_qcl["unit"] == UNIT_FOR_PRODUCTION) - & (df_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) + tb_qcl[ + (tb_qcl["country"] == "World") + & (tb_qcl["element_code"] == ELEMENT_CODE_FOR_PRODUCTION) + & (tb_qcl["unit"] == UNIT_FOR_PRODUCTION) + & (tb_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) ][["year", "value"]] .reset_index(drop=True) - .rename(columns={"value": "production_global"}) + .rename(columns={"value": "production_global"}, errors="raise") ) global_animals = ( - df_qcl[ - (df_qcl["country"] == "World") - & (df_qcl["element_code"] == ELEMENT_CODE_FOR_ANIMALS) - & (df_qcl["unit"] == UNIT_FOR_ANIMALS) - & (df_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) + tb_qcl[ + (tb_qcl["country"] == "World") + & (tb_qcl["element_code"] == ELEMENT_CODE_FOR_ANIMALS) + & (tb_qcl["unit"] == UNIT_FOR_ANIMALS) + & (tb_qcl["item_code"] == ITEM_CODE_FOR_MEAT_TOTAL) ][["year", "value"]] .reset_index(drop=True) - .rename(columns={"value": "animals_global"}) + .rename(columns={"value": "animals_global"}, errors="raise") ) # 
Combine national with global data. - combined = multi_merge(dfs=[meat, global_population, global_production, global_animals], on=["year"], how="left") + combined = pr.multi_merge( + tables=[meat, global_population, global_production, global_animals], on=["year"], how="left" + ) # Sanity check. error = "Rows have changed after merging national data with global data." @@ -1010,15 +1009,15 @@ def generate_hypothetical_meat_consumption(df_qcl: pd.DataFrame) -> Table: combined["animals_global_hypothetical"] = combined["animals_per_capita"] * combined["global_population"] # Set an appropriate index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Create a table with the combined data. - tb_hypothetical_meat_consumption = Table(combined, short_name="hypothetical_meat_consumption", underscore=True) + tb_hypothetical_meat_consumption = ( + combined.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + ) + tb_hypothetical_meat_consumption.metadata.short_name = "hypothetical_meat_consumption" return tb_hypothetical_meat_consumption -def generate_cereal_allocation(df_fbsc: pd.DataFrame) -> Table: +def generate_cereal_allocation(tb_fbsc: Table) -> Table: # Item code for "Cereals - Excluding Beer". ITEM_CODE_FOR_CEREALS = "00002905" # Note: We disregard the contribution from "00002520" ("Cereals, Other"), which is usually negligible compared to the total. @@ -1035,9 +1034,9 @@ def generate_cereal_allocation(df_fbsc: pd.DataFrame) -> Table: UNIT_FOR_OTHER_USES = "tonnes" # Select the relevant items/elements. 
- cereals = df_fbsc[ - (df_fbsc["item_code"] == ITEM_CODE_FOR_CEREALS) - & (df_fbsc["element_code"].isin([ELEMENT_CODE_FOR_FOOD, ELEMENT_CODE_FOR_FEED, ELEMENT_CODE_FOR_OTHER_USES])) + cereals = tb_fbsc[ + (tb_fbsc["item_code"] == ITEM_CODE_FOR_CEREALS) + & (tb_fbsc["element_code"].isin([ELEMENT_CODE_FOR_FOOD, ELEMENT_CODE_FOR_FEED, ELEMENT_CODE_FOR_OTHER_USES])) ].reset_index(drop=True) # Sanity check. @@ -1045,16 +1044,15 @@ def generate_cereal_allocation(df_fbsc: pd.DataFrame) -> Table: assert set(cereals["unit"]) == set([UNIT_FOR_FOOD, UNIT_FOR_FEED, UNIT_FOR_OTHER_USES]), error # Transpose data and rename columns conveniently. - cereals = ( - cereals.pivot(index=["country", "year"], columns="element_code", values="value") - .reset_index() - .rename( - columns={ - ELEMENT_CODE_FOR_FOOD: "cereals_allocated_to_food", - ELEMENT_CODE_FOR_FEED: "cereals_allocated_to_animal_feed", - ELEMENT_CODE_FOR_OTHER_USES: "cereals_allocated_to_other_uses", - } - ) + cereals = cereals.pivot( + index=["country", "year"], columns="element_code", values="value", join_column_levels_with="_" + ).rename( + columns={ + ELEMENT_CODE_FOR_FOOD: "cereals_allocated_to_food", + ELEMENT_CODE_FOR_FEED: "cereals_allocated_to_animal_feed", + ELEMENT_CODE_FOR_OTHER_USES: "cereals_allocated_to_other_uses", + }, + errors="raise", ) # Add variables for the share of cereals allocated to each use. @@ -1067,15 +1065,13 @@ def generate_cereal_allocation(df_fbsc: pd.DataFrame) -> Table: ) # Set an appropriate index and sort conveniently. - cereals = cereals.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) - - # Create a table with the generated data. 
- tb_cereal_allocation = Table(cereals, short_name="cereal_allocation", underscore=True) + tb_cereal_allocation = cereals.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + tb_cereal_allocation.metadata.short_name = "cereal_allocation" return tb_cereal_allocation -def generate_maize_and_wheat(df_fbsc: pd.DataFrame) -> Table: +def generate_maize_and_wheat(tb_fbsc: Table) -> Table: # Item code for "Wheat". ITEM_CODE_FOR_WHEAT = "00002511" # Item code for "Maize". @@ -1088,9 +1084,9 @@ def generate_maize_and_wheat(df_fbsc: pd.DataFrame) -> Table: ELEMENT_CODE_FOR_OTHER_USES = "005154" # Select the relevant items/elements. - maize_and_wheat = df_fbsc[ - (df_fbsc["item_code"].isin([ITEM_CODE_FOR_MAIZE, ITEM_CODE_FOR_WHEAT])) - & (df_fbsc["element_code"].isin([ELEMENT_CODE_FOR_EXPORTS, ELEMENT_CODE_FOR_FEED, ELEMENT_CODE_FOR_OTHER_USES])) + maize_and_wheat = tb_fbsc[ + (tb_fbsc["item_code"].isin([ITEM_CODE_FOR_MAIZE, ITEM_CODE_FOR_WHEAT])) + & (tb_fbsc["element_code"].isin([ELEMENT_CODE_FOR_EXPORTS, ELEMENT_CODE_FOR_FEED, ELEMENT_CODE_FOR_OTHER_USES])) ] # Sanity check. @@ -1099,29 +1095,24 @@ def generate_maize_and_wheat(df_fbsc: pd.DataFrame) -> Table: # Transpose data and rename columns conveniently. 
maize_and_wheat = maize_and_wheat.pivot( - index=["country", "year"], columns=["item_code", "element_code"], values="value" - ) - maize_and_wheat = maize_and_wheat.rename( - columns={ITEM_CODE_FOR_MAIZE: "maize", ITEM_CODE_FOR_WHEAT: "wheat"}, level=0 + index=["country", "year"], columns=["item_code", "element_code"], values="value", join_column_levels_with="_" ).rename( columns={ - ELEMENT_CODE_FOR_EXPORTS: "exports", - ELEMENT_CODE_FOR_FEED: "animal_feed", - ELEMENT_CODE_FOR_OTHER_USES: "other_uses", - } + f"{ITEM_CODE_FOR_MAIZE}_{ELEMENT_CODE_FOR_EXPORTS}": "maize_exports", + f"{ITEM_CODE_FOR_MAIZE}_{ELEMENT_CODE_FOR_FEED}": "maize_animal_feed", + f"{ITEM_CODE_FOR_MAIZE}_{ELEMENT_CODE_FOR_OTHER_USES}": "maize_other_uses", + f"{ITEM_CODE_FOR_WHEAT}_{ELEMENT_CODE_FOR_EXPORTS}": "wheat_exports", + f"{ITEM_CODE_FOR_WHEAT}_{ELEMENT_CODE_FOR_FEED}": "wheat_animal_feed", + f"{ITEM_CODE_FOR_WHEAT}_{ELEMENT_CODE_FOR_OTHER_USES}": "wheat_other_uses", + }, + errors="raise", ) - maize_and_wheat.columns = [column[0] + "_" + column[1] for column in maize_and_wheat.columns] # Set an appropriate index and sort conveniently. - maize_and_wheat = ( - maize_and_wheat.reset_index() - .set_index(["country", "year"], verify_integrity=True) - .sort_index() - .sort_index(axis=1) + tb_maize_and_wheat = ( + maize_and_wheat.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) ) - - # Create a table with the generated data. - tb_maize_and_wheat = Table(maize_and_wheat, short_name="maize_and_wheat", underscore=True) + tb_maize_and_wheat.metadata.short_name = "maize_and_wheat" # Add minimal variable metadata (more metadata will be added at the grapher step). 
for column in tb_maize_and_wheat.columns: @@ -1131,7 +1122,7 @@ def generate_maize_and_wheat(df_fbsc: pd.DataFrame) -> Table: return tb_maize_and_wheat -def generate_fertilizer_exports(df_rfn: pd.DataFrame) -> Table: +def generate_fertilizer_exports(tb_rfn: Table) -> Table: # Element code for "Export Quantity". ELEMENT_CODE_FOR_EXPORTS = "005910" # Item code for "Nutrient nitrogen N (total)". @@ -1142,9 +1133,9 @@ def generate_fertilizer_exports(df_rfn: pd.DataFrame) -> Table: ITEM_CODE_FOR_POTASH = "00003104" # Select the relevant items and elements. - fertilizer_exports = df_rfn[ - (df_rfn["element_code"] == ELEMENT_CODE_FOR_EXPORTS) - & (df_rfn["item_code"].isin([ITEM_CODE_FOR_NITROGEN, ITEM_CODE_FOR_PHOSPHATE, ITEM_CODE_FOR_POTASH])) + fertilizer_exports = tb_rfn[ + (tb_rfn["element_code"] == ELEMENT_CODE_FOR_EXPORTS) + & (tb_rfn["item_code"].isin([ITEM_CODE_FOR_NITROGEN, ITEM_CODE_FOR_PHOSPHATE, ITEM_CODE_FOR_POTASH])) ].reset_index(drop=True) # Sanity check. @@ -1153,7 +1144,7 @@ def generate_fertilizer_exports(df_rfn: pd.DataFrame) -> Table: # Rename columns and items conveniently. 
fertilizer_exports = fertilizer_exports[["country", "year", "item_code", "value"]].rename( - columns={"item_code": "item", "value": "exports"} + columns={"item_code": "item", "value": "exports"}, errors="raise" ) fertilizer_exports["item"] = fertilizer_exports["item"].replace( {ITEM_CODE_FOR_NITROGEN: "Nitrogen", ITEM_CODE_FOR_PHOSPHATE: "Phosphorous", ITEM_CODE_FOR_POTASH: "Potassium"} @@ -1163,23 +1154,21 @@ def generate_fertilizer_exports(df_rfn: pd.DataFrame) -> Table: global_exports = ( fertilizer_exports[fertilizer_exports["country"] == "World"].drop(columns=["country"]).reset_index(drop=True) ) - fertilizer_exports = pd.merge( - fertilizer_exports, global_exports, how="left", on=["year", "item"], suffixes=("", "_global") + fertilizer_exports = fertilizer_exports.merge( + global_exports, how="left", on=["year", "item"], suffixes=("", "_global") ) # Create columns for the share of exports. fertilizer_exports["share_of_exports"] = 100 * fertilizer_exports["exports"] / fertilizer_exports["exports_global"] # Drop column of global exports. - fertilizer_exports = fertilizer_exports.drop(columns=["exports_global"]) + fertilizer_exports = fertilizer_exports.drop(columns=["exports_global"], errors="raise") # Set an appropriate index and sort conveniently. - fertilizer_exports = ( + tb_fertilizer_exports = ( fertilizer_exports.set_index(["country", "year", "item"], verify_integrity=True).sort_index().sort_index(axis=1) ) - - # Create a table with the generated data. - tb_fertilizer_exports = Table(fertilizer_exports, short_name="fertilizer_exports", underscore=True) + tb_fertilizer_exports.metadata.short_name = "fertilizer_exports" # Add minimal variable metadata (more metadata will be added at the grapher step). tb_fertilizer_exports["share_of_exports"].metadata.unit = "%" @@ -1194,77 +1183,71 @@ def run(dest_dir: str) -> None: # # Load inputs. # - # Load dataset about land use, load its main (long-format) table, and create a convenient dataframe. 
+ # Load dataset about land use and load its main (long-format) table. ds_rl = paths.load_dataset(f"{NAMESPACE}_rl") - tb_rl = ds_rl[f"{NAMESPACE}_rl"] - df_rl = pd.DataFrame(tb_rl).reset_index() + tb_rl = ds_rl[f"{NAMESPACE}_rl"].reset_index() - # Load dataset about production indices, load its main (long-format) table, and create a convenient dataframe. + # Load dataset about production indices and load its main (long-format) table. ds_qi = paths.load_dataset(f"{NAMESPACE}_qi") - tb_qi = ds_qi[f"{NAMESPACE}_qi"] - df_qi = pd.DataFrame(tb_qi).reset_index() + tb_qi = ds_qi[f"{NAMESPACE}_qi"].reset_index() - # Load dataset about crops and livestock, load its main (long-format) table, and create a convenient dataframe. + # Load dataset about crops and livestock and load its main (long-format) table. ds_qcl = paths.load_dataset(f"{NAMESPACE}_qcl") - tb_qcl = ds_qcl[f"{NAMESPACE}_qcl"] - df_qcl = pd.DataFrame(tb_qcl).reset_index() + tb_qcl = ds_qcl[f"{NAMESPACE}_qcl"].reset_index() - # Load dataset about SDG indicators, load its main (long-format) table, and create a convenient dataframe. + # Load dataset about SDG indicators and load its main (long-format) table. ds_sdgb = paths.load_dataset(f"{NAMESPACE}_sdgb") - tb_sdgb = ds_sdgb[f"{NAMESPACE}_sdgb"] - df_sdgb = pd.DataFrame(tb_sdgb).reset_index() + tb_sdgb = ds_sdgb[f"{NAMESPACE}_sdgb"].reset_index() - # Load dataset about food balances, load its main (long-format) table, and create a convenient dataframe. + # Load dataset about food balances and load its main (long-format) table. ds_fbsc = paths.load_dataset(f"{NAMESPACE}_fbsc") - tb_fbsc = ds_fbsc[f"{NAMESPACE}_fbsc"] - df_fbsc = pd.DataFrame(tb_fbsc).reset_index() + tb_fbsc = ds_fbsc[f"{NAMESPACE}_fbsc"].reset_index() - # Load dataset about fertilizers by nutrient, load its main (long-format) table, and create a convenient dataframe. + # Load dataset about fertilizers by nutrient and load its main (long-format) table. 
ds_rfn = paths.load_dataset(f"{NAMESPACE}_rfn") - tb_rfn = ds_rfn[f"{NAMESPACE}_rfn"] - df_rfn = pd.DataFrame(tb_rfn).reset_index() + tb_rfn = ds_rfn[f"{NAMESPACE}_rfn"].reset_index() # # Process data. # # Create table for arable land per crop output. - tb_arable_land_per_crop_output = generate_arable_land_per_crop_output(df_rl=df_rl, df_qi=df_qi) + tb_arable_land_per_crop_output = generate_arable_land_per_crop_output(tb_rl=tb_rl, tb_qi=tb_qi) # Create table for area used for production per crop type. - tb_area_by_crop_type = generate_area_used_for_production_per_crop_type(df_qcl=df_qcl) + tb_area_by_crop_type = generate_area_used_for_production_per_crop_type(tb_qcl=tb_qcl) # Create table for the share of sustainable and overexploited fish. - tb_sustainable_and_overexploited_fish = generate_percentage_of_sustainable_and_overexploited_fish(df_sdgb=df_sdgb) + tb_sustainable_and_overexploited_fish = generate_percentage_of_sustainable_and_overexploited_fish(tb_sdgb=tb_sdgb) # Create table for spared land due to increased yields. - tb_spared_land_from_increased_yields = generate_spared_land_from_increased_yields(df_qcl=df_qcl) + tb_spared_land_from_increased_yields = generate_spared_land_from_increased_yields(tb_qcl=tb_qcl) # Create table for dietary compositions by commodity group. - tb_food_available_for_consumption = generate_food_available_for_consumption(df_fbsc=df_fbsc) + tb_food_available_for_consumption = generate_food_available_for_consumption(tb_fbsc=tb_fbsc) # Create table for macronutrient compositions. - tb_macronutrient_compositions = generate_macronutrient_compositions(df_fbsc=df_fbsc) + tb_macronutrient_compositions = generate_macronutrient_compositions(tb_fbsc=tb_fbsc) # Create table for fertilizers data. - tb_fertilizers = generate_fertilizers(df_rfn=df_rfn, df_rl=df_rl) + tb_fertilizers = generate_fertilizers(tb_rfn=tb_rfn, tb_rl=tb_rl) # Create table for vegetable oil yields. 
- tb_vegetable_oil_yields = generate_vegetable_oil_yields(df_qcl=df_qcl, df_fbsc=df_fbsc) + tb_vegetable_oil_yields = generate_vegetable_oil_yields(tb_qcl=tb_qcl, tb_fbsc=tb_fbsc) # Create table for peak agricultural land. - tb_agriculture_land_use_evolution = generate_agriculture_land_evolution(df_rl=df_rl) + tb_agriculture_land_use_evolution = generate_agriculture_land_evolution(tb_rl=tb_rl) # Create table for hypothetical meat consumption - tb_hypothetical_meat_consumption = generate_hypothetical_meat_consumption(df_qcl=df_qcl) + tb_hypothetical_meat_consumption = generate_hypothetical_meat_consumption(tb_qcl=tb_qcl) # Create table for cereal allocation. - tb_cereal_allocation = generate_cereal_allocation(df_fbsc=df_fbsc) + tb_cereal_allocation = generate_cereal_allocation(tb_fbsc=tb_fbsc) # Create table for maize and wheat data (used in the context of the Ukraine war). - tb_maize_and_wheat = generate_maize_and_wheat(df_fbsc=df_fbsc) + tb_maize_and_wheat = generate_maize_and_wheat(tb_fbsc=tb_fbsc) # Create table for fertilizer exports (used in the context of the Ukraine war). - tb_fertilizer_exports = generate_fertilizer_exports(df_rfn=df_rfn) + tb_fertilizer_exports = generate_fertilizer_exports(tb_rfn=tb_rfn) # # Save outputs. 
@@ -1287,5 +1270,6 @@ def run(dest_dir: str) -> None: tb_maize_and_wheat, tb_fertilizer_exports, ], + check_variables_metadata=True, ) ds_garden.save() diff --git a/etl/steps/data/grapher/faostat/2024-03-14/additional_variables.py b/etl/steps/data/grapher/faostat/2024-03-14/additional_variables.py index 4fe600516f1..fe862342b8d 100644 --- a/etl/steps/data/grapher/faostat/2024-03-14/additional_variables.py +++ b/etl/steps/data/grapher/faostat/2024-03-14/additional_variables.py @@ -1,6 +1,6 @@ """Load a garden dataset and create a grapher dataset.""" -import pandas as pd +import owid.catalog.processing as pr from owid.catalog import Table from owid.catalog.utils import underscore_table @@ -14,59 +14,67 @@ def prepare_maize_and_wheat_in_the_context_of_the_ukraine_war(tb_maize_and_wheat # Prepare groupings that will be shown in a stacked discrete bar chart. # Ukraine and Russia exports of maize and wheat. ukraine_and_russia_exports = ( - pd.merge( - tb_maize_and_wheat[["maize_exports", "wheat_exports"]].loc["Ukraine"], - tb_maize_and_wheat[["maize_exports", "wheat_exports"]].loc["Russia"], - left_index=True, - right_index=True, + tb_maize_and_wheat[tb_maize_and_wheat["country"] == "Ukraine"][["year", "maize_exports", "wheat_exports"]] + .merge( + tb_maize_and_wheat[tb_maize_and_wheat["country"] == "Russia"][["year", "maize_exports", "wheat_exports"]], + on="year", suffixes=(" Ukraine", " Russia"), ) .assign(**{"country": "Ukraine and Russia exports"}) - .reset_index() ) # EU and UK maize and wheat used for animal feed. 
eu_and_uk_feed = ( - pd.merge( - tb_maize_and_wheat[["maize_animal_feed", "wheat_animal_feed"]].loc["European Union (27)"], - tb_maize_and_wheat[["maize_animal_feed", "wheat_animal_feed"]].loc["United Kingdom"], - left_index=True, - right_index=True, + tb_maize_and_wheat[tb_maize_and_wheat["country"] == "European Union (27)"][ + ["year", "maize_animal_feed", "wheat_animal_feed"] + ] + .merge( + tb_maize_and_wheat[tb_maize_and_wheat["country"] == "United Kingdom"][ + ["year", "maize_animal_feed", "wheat_animal_feed"] + ], + on="year", suffixes=(" EU", " UK"), ) .assign(**{"country": "EU and UK animal feed"}) - .reset_index() ) # EU and UK maize and wheat devoted to other uses (predominantly biofuels). eu_and_uk_biofuels = ( - pd.merge( - tb_maize_and_wheat[["maize_other_uses", "wheat_other_uses"]].loc["European Union (27)"], - tb_maize_and_wheat[["maize_other_uses", "wheat_other_uses"]].loc["United Kingdom"], - left_index=True, - right_index=True, + tb_maize_and_wheat[tb_maize_and_wheat["country"] == "European Union (27)"][ + ["year", "maize_other_uses", "wheat_other_uses"] + ] + .merge( + tb_maize_and_wheat[tb_maize_and_wheat["country"] == "United Kingdom"][ + ["year", "maize_other_uses", "wheat_other_uses"] + ], + on="year", suffixes=(" EU", " UK"), ) .assign(**{"country": "EU and UK biofuels"}) - .reset_index() ) # US maize and wheat used for animal feed. us_feed = ( - tb_maize_and_wheat[["maize_animal_feed", "wheat_animal_feed"]] - .loc["United States"] - .rename(columns={"maize_animal_feed": "maize_animal_feed US", "wheat_animal_feed": "wheat_animal_feed US"}) + tb_maize_and_wheat[tb_maize_and_wheat["country"] == "United States"][ + ["year", "maize_animal_feed", "wheat_animal_feed"] + ] + .rename( + columns={"maize_animal_feed": "maize_animal_feed US", "wheat_animal_feed": "wheat_animal_feed US"}, + errors="raise", + ) .assign(**{"country": "US animal feed"}) - .reset_index() ) # US maize and wheat devoted to other uses (predominantly biofuels). 
us_biofuels = ( - tb_maize_and_wheat[["maize_other_uses", "wheat_other_uses"]] - .loc["United States"] - .rename(columns={"maize_other_uses": "maize_other_uses US", "wheat_other_uses": "wheat_other_uses US"}) + tb_maize_and_wheat[tb_maize_and_wheat["country"] == "United States"][ + ["year", "maize_other_uses", "wheat_other_uses"] + ] + .rename( + columns={"maize_other_uses": "maize_other_uses US", "wheat_other_uses": "wheat_other_uses US"}, + errors="raise", + ) .assign(**{"country": "US biofuels"}) - .reset_index() ) # Combine all groupings. - combined = pd.concat( + combined = pr.concat( [ukraine_and_russia_exports, eu_and_uk_feed, eu_and_uk_biofuels, us_feed, us_biofuels], ignore_index=True ) @@ -86,7 +94,7 @@ def prepare_maize_and_wheat_in_the_context_of_the_ukraine_war(tb_maize_and_wheat combined[column].metadata.title = title combined[column].metadata.unit = "tonnes" combined[column].metadata.short_unit = "t" - combined = underscore_table(combined) + combined = combined.underscore() return combined @@ -141,7 +149,7 @@ def run(dest_dir: str) -> None: tb_agriculture_land_use_evolution = ds_garden["agriculture_land_use_evolution"] tb_hypothetical_meat_consumption = ds_garden["hypothetical_meat_consumption"] tb_cereal_allocation = ds_garden["cereal_allocation"] - tb_maize_and_wheat = ds_garden["maize_and_wheat"] + tb_maize_and_wheat = ds_garden["maize_and_wheat"].reset_index() tb_fertilizer_exports = ds_garden["fertilizer_exports"] # @@ -150,7 +158,7 @@ def run(dest_dir: str) -> None: # To insert table into grapher DB, change "item" column to "country" (which will be changed back in the admin). 
tb_area_used_per_crop_type = ( tb_area_used_per_crop_type.reset_index() - .rename(columns={"item": "country"}) + .rename(columns={"item": "country"}, errors="raise") .set_index(["country", "year"], verify_integrity=True) .sort_index() ) @@ -160,8 +168,8 @@ def run(dest_dir: str) -> None: tb_land_spared_by_increased_crop_yields = tb_land_spared_by_increased_crop_yields.reset_index() tb_land_spared_by_increased_crop_yields = ( tb_land_spared_by_increased_crop_yields[tb_land_spared_by_increased_crop_yields["country"] == "World"] - .drop(columns=["country"]) - .rename(columns={"item": "country"}) + .drop(columns=["country"], errors="raise") + .rename(columns={"item": "country"}, errors="raise") .set_index(["country", "year"], verify_integrity=True) .sort_index() ) @@ -198,6 +206,7 @@ def run(dest_dir: str) -> None: tb_fertilizer_exports_in_the_context_of_the_ukraine_war, ], default_metadata=ds_garden.metadata, + check_variables_metadata=True, ) # Save changes in the new grapher dataset. From 06ae67320f6aaf96cd7451be34e09e812ae804d6 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 20 Mar 2024 16:54:07 +0100 Subject: [PATCH 43/54] Add multi_merge function to owid catalog --- lib/catalog/owid/catalog/processing.py | 2 ++ lib/catalog/owid/catalog/tables.py | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/lib/catalog/owid/catalog/processing.py b/lib/catalog/owid/catalog/processing.py index e645654ae54..8a04b38ab06 100755 --- a/lib/catalog/owid/catalog/processing.py +++ b/lib/catalog/owid/catalog/processing.py @@ -6,6 +6,7 @@ concat, melt, merge, + multi_merge, pivot, read_csv, read_excel, @@ -24,6 +25,7 @@ "concat", "melt", "merge", + "multi_merge", "pivot", "read_csv", "read_feather", diff --git a/lib/catalog/owid/catalog/tables.py b/lib/catalog/owid/catalog/tables.py index c8c1f60448c..497455861c3 100644 --- a/lib/catalog/owid/catalog/tables.py +++ b/lib/catalog/owid/catalog/tables.py @@ -1648,6 +1648,29 @@ def _resolve_collisions( return 
new_cols +def multi_merge(tables: List[Table], *args, **kwargs) -> Table: + """Merge multiple tables. + + This is a helper function when merging more than two tables on common columns. + + Parameters + ---------- + tables : List[Table] + Tables to merge. + + Returns + ------- + combined : Table + Merged table. + + """ + combined = tables[0].copy() + for table in tables[1:]: + combined = combined.merge(table, *args, **kwargs) + + return combined + + def _extract_variables(t: Table, cols: Optional[Union[List[str], str]]) -> List[variables.Variable]: if not cols: return [] From 751594c2863845bf0d1dc0b2a416ee567c590c38 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 20 Mar 2024 17:19:03 +0100 Subject: [PATCH 44/54] Update metadata of additional variables dataset --- .../2024-03-14/additional_variables.meta.yml | 298 ++++++++---------- 1 file changed, 127 insertions(+), 171 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml index 2849223f629..afd8367972f 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml +++ b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml @@ -1,18 +1,24 @@ -all_sources: - - faostat: &faostat_source - name: Food and Agriculture Organization of the United Nations - published_by: Food and Agriculture Organization of the United Nations - url: http://www.fao.org/faostat/en/#data/ - date_accessed: "2023-06-12" - publication_date: "2023-06-12" - publication_year: 2023 +definitions: + common: + macronutrient_compositions: + description_processing: | + - The FAO provide annual figures from 1961 by country on daily caloric supply, fat supply (in grams), and protein supply (in grams). 
To calculate the daily per capita supply of carbohydrates, we assume an energy density by macronutrient of 4 kcal per gram of both protein and carbohydrate and 9 kcal per gram of fat (based on established nutritional guidelines reported by the FAO). The daily supply of carbohydrates is therefore calculated as: + + ((Daily supply of kcal)-(Daily supply of protein * 4 + Daily supply of fat * 9)) / 4 + + - The quantity of calories from each macronutrient is then calculated based on the energy density figures given above (e.g. calories from protein is calculated by multiplying the daily supply of protein in grams by 4). + + - For an explanation of these conversion factors, see "Chapter 3: Calculation Of The Energy Content Of Foods - Energy Conversion Factors", available at: http://www.fao.org/docrep/006/Y5022E/y5022e04.htm + + - The share of calories derived from each macronutrient is then calculated by dividing the number of calories derived from a given macronutrient by the total daily caloric supply. + + - Protein of animal origin includes protein supplied in the form of all meat commodities, eggs and dairy products, and fish & seafood. + dataset: - title: Additional variables (FAOSTAT, 2023b) + title: Additional FAOSTAT variables description: | Additional variables created using data from different FAOSTAT datasets. - sources: - - *faostat_source tables: arable_land_per_crop_output: @@ -21,24 +27,24 @@ tables: title: 'Arable land' unit: 'hectares' short_unit: 'ha' - description: | + description_short: | Arable land is the total of areas (extent of surface of land or water) under temporary crops, temporary meadows and pastures, and land with temporary fallow. Arable land does not include land that is potentially cultivable but is not normally cultivated. index: title: 'Gross Production Index Number' unit: '' short_unit: '' - description: | + description_short: | Gross Production Index Number (2014-2016 = 100). 
arable_land_per_crop_output: title: Arable land needed to produce a fixed quantity of crops unit: '' short_unit: '' - description: | - Index of arable land needed to produce a fixed quantity of crops (where values in 1961 are equal to 1.0). This is calculated as arable land divided by the crop production index (PIN). The crop production index here is the sum of crop commodities produced (after deductions of quantities used as seed and feed). It is weighted by the commodity prices. - - This metric measures the index of arable land needed to produce a fixed quantity of crops (where values in 1961 are equal to 1.0). - - Arable land is the total of areas (extent of surface of land or water) under temporary crops, temporary meadows and pastures, and land with temporary fallow. Arable land does not include land that is potentially cultivable but is not normally cultivated. + description_short: | + Index of arable land needed to produce a fixed quantity of crops (where values in 1961 are equal to 1.0). + description_key: + - Arable land is the total of areas (extent of surface of land or water) under temporary crops, temporary meadows and pastures, and land with temporary fallow. Arable land does not include land that is potentially cultivable but is not normally cultivated. + - This is calculated as arable land divided by the crop production index (PIN). The crop production index is the sum of crop commodities produced after deductions of quantities used as seed and feed. + - It is weighted by the commodity prices. area_used_per_crop_type: variables: area_used_for_production: @@ -53,13 +59,13 @@ tables: title: "Percentage of fish stocks within biologically sustainable levels" unit: "%" short_unit: "%" - description: | + description_short: | Fish stock are subpopulations of a particular species of fish which have common parameters such as location, growth and mortality which define their population dynamics. 
Fish stocks are within biologically sustainable levels when fish catch does not exceed the maximum sustainable yield (MSY) - the rate at which fish populations can regenerate. overexploited_fish: title : "Percentage of overexploited fish stocks" unit: "%" short_unit: "%" - description: | + description_short: | Fish stock are subpopulations of a particular species of fish which have common parameters such as location, growth and mortality which define their population dynamics. Fish stocks are overexploited when fish catch exceeds the maximum sustainable yield (MSY) - the rate at which fish populations can regenerate. land_spared_by_increased_crop_yields: variables: @@ -67,29 +73,29 @@ tables: title: "Actual cropland area today" unit: "hectares" short_unit: "ha" - description: | + description_short: | Total cropland area on a given year, calculated by dividing the total production by the crop yield. area_with_yield_of_1961: title: "Cropland area needed if yields stagnated in 1961" unit: "hectares" short_unit: "ha" - description: | + description_short: | Total cropland area that would be necessary if crop yields stagnated in 1961. - - This area is calculated by dividing the total production on a given year by the crop yield of 1961. + description_processing: | + - This area is calculated by dividing the total production on a given year by the crop yield of 1961. spared_land: title: "Land spared due to crop yield increases" unit: "hectares" short_unit: "ha" - description: | + description_short: | Land spared since 1961 due to the increase of crop yields. - - This area is calculated as the cropland area that would be necessary if crop yields stagnated in 1961 (the total production on a given year divided by the crop yield of 1961), minus the true cropland area on a given year. 
+ description_processing: | + - This area is calculated as the cropland area that would be necessary if crop yields stagnated in 1961 (the total production on a given year divided by the crop yield of 1961), minus the true cropland area on a given year. spared_land__pct: title: "Percentage reduction in area needed due to crop yield increases" unit: "hectares" short_unit: "ha" - description: | + description_short: | Land spared since 1961 due to the increase of crop yields, as a percentage of the total land that would be necessary if crop yields had not increased since then. # All metadata for food_available_for_consumption is prepared via script. # food_available_for_consumption: @@ -99,168 +105,137 @@ tables: title: "Daily caloric intake per person from animal products" unit: "kilocalories per day per capita" short_unit: "kcal" - description: ¯onutrient_composition_variable_description | - The FAO provide annual figures from 1961 by country on daily caloric supply, fat supply (in grams), and protein supply (in grams). To calculate the daily per capita supply of carbohydrates, we assume an energy density by macronutrient of 4 kcal per gram of both protein and carbohydrate and 9 kcal per gram of fat (based on established nutritional guidelines reported by the FAO). The daily supply of carbohydrates is therefore calculated as: - - ((Daily supply of kcal)-(Daily supply of protein * 4 + Daily supply of fat * 9)) / 4 - - The quantity of calories from each macronutrient is then calculated based on the energy density figures given above (e.g. calories from protein is calculated by multiplying the daily supply of protein in grams by 4). 
- - For an explanation of these conversion factors, see "Chapter 3: Calculation Of The Energy Content Of Foods - Energy Conversion Factors", available at: http://www.fao.org/docrep/006/Y5022E/y5022e04.htm - - The share of calories derived from each macronutrient is then calculated by dividing the number of calories derived from a given macronutrient by the total daily caloric supply. - - Protein of animal origin includes protein supplied in the form of all meat commodities, eggs and dairy products, and fish & seafood. energy_from_animal_protein: title: "Daily caloric intake per person that comes from animal protein" unit: "kilocalories per day per capita" short_unit: "kcal" - description: *macronutrient_composition_variable_description energy_from_vegetal_products: title: "Daily caloric intake per person from vegetal products" unit: "kilocalories per day per capita" short_unit: "kcal" - description: *macronutrient_composition_variable_description energy_from_vegetal_protein: title: "Daily caloric intake per person that comes from vegetal protein" unit: "kilocalories per day per capita" short_unit: "kcal" - description: *macronutrient_composition_variable_description fat_from_animal_products: title: "Daily fat intake per person from animal products" unit: "grams per day per capita" short_unit: "g" - description: *macronutrient_composition_variable_description fat_from_vegetal_products: title: "Daily fat intake per person from vegetal products" unit: "grams per day per capita" short_unit: "g" - description: *macronutrient_composition_variable_description protein_from_animal_products: title: "Daily protein intake from animal products" unit: "grams per day per capita" short_unit: "g" - description: *macronutrient_composition_variable_description protein_from_vegetal_products: title: "Daily protein intake per person from vegetal products" unit: "grams per day per capita" short_unit: "g" - description: *macronutrient_composition_variable_description 
share_of_energy_from_animal_protein: title: "Share of the daily caloric intake that comes from animal protein" unit: "%" short_unit: "%" - description: *macronutrient_composition_variable_description share_of_energy_from_carbohydrates: title: "Share of the daily caloric intake that comes from carbohydrates" unit: "%" short_unit: "%" - description: *macronutrient_composition_variable_description share_of_energy_from_fat: title: "Share of the daily caloric intake that comes from fat" unit: "%" short_unit: "%" - description: *macronutrient_composition_variable_description share_of_energy_from_protein: title: "Share of the daily caloric intake that comes from protein" unit: "%" short_unit: "%" - description: *macronutrient_composition_variable_description share_of_energy_from_vegetal_protein: title: "Share of the daily caloric intake that comes from vegetal protein" unit: "%" short_unit: "%" - description: *macronutrient_composition_variable_description total_carbohydrates: title: "Daily carbohydrates intake per person" unit: "grams per day per capita" short_unit: "g" - description: *macronutrient_composition_variable_description total_energy: title: "Daily caloric intake per person" unit: "kilocalories per day per capita" short_unit: "kcal" - description: *macronutrient_composition_variable_description total_energy_from_carbohydrates: title: "Daily caloric intake per person from carbohydrates" unit: "kilocalories per day per capita" short_unit: "kcal" - description: *macronutrient_composition_variable_description total_energy_from_fat: title: "Daily caloric intake per person from fat" unit: "kilocalories per day per capita" short_unit: "kcal" - description: *macronutrient_composition_variable_description total_energy_from_protein: title: "Daily caloric intake per person from protein" unit: "kilocalories per day per capita" short_unit: "kcal" - description: *macronutrient_composition_variable_description total_fat: title: "Daily fat intake per person" unit: "grams per 
day per capita" short_unit: "g" - description: *macronutrient_composition_variable_description total_protein: title: "Daily protein intake per person" unit: "grams per day per capita" short_unit: "g" - description: *macronutrient_composition_variable_description fertilizers: variables: nitrogen_per_cropland: title: Nitrogen use per area of cropland unit: kilograms per hectare short_unit: kg/ha - description: | + description_short: | Nutrient nitrogen (N) from all fertilizer products per area of cropland, which corresponds to the sum of arable land and permanent crops. phosphate_per_cropland: title: Phosphate use per area of cropland unit: kilograms per hectare short_unit: kg/ha - description: | + description_short: | Nutrient phosphate (P2O5) from all fertilizer products per area of cropland, which corresponds to the sum of arable land and permanent crops. potash_per_cropland: title: Potash use per area of cropland unit: kilograms per hectare short_unit: kg/ha - description: | + description_short: | Nutrient potash (K2O) from all fertilizer products per area of cropland, which corresponds to the sum of arable land and permanent crops. all_fertilizers_per_cropland: title: All fertilizers use per area of cropland unit: kilograms per hectare short_unit: kg/ha - description: | + description_short: | Agricultural use of all fertilizer products (including nitrogenous, potash, and phosphate fertilizers) per area of cropland, which corresponds to the sum of arable land and permanent crops. cropland: title: Area of cropland unit: hectares short_unit: ha - description: + description_short: Surface area of cropland, which corresponds to the sum of arable land and permanent crops. nitrogen_use: title: Nitrogen use unit: tonnes short_unit: t - description: | + description_short: | Agricultural use of nutrient nitrogen (N) from all fertilizer products. 
phosphate_use: title: Phosphate use unit: tonnes short_unit: t - description: | + description_short: | Agricultural use of nutrient phosphate (P2O5) from all fertilizer products. potash_use: title: Potash use unit: tonnes short_unit: t - description: | + description_short: | Agricultural use of nutrient potash (K2O) from all fertilizer products. all_fertilizers_use: title: All fertilizers use unit: tonnes short_unit: t - description: | + description_short: | Agricultural use from all fertilizer products (including nitrogenous, potash, and phosphate fertilizers). vegetable_oil_yields: variables: @@ -268,277 +243,277 @@ tables: title: Production of sunflower oil unit: tonnes short_unit: t - description: | + description_short: | Amount of sunflower oil produced. soybean_production: title: Production of soybean oil unit: tonnes short_unit: t - description: | + description_short: | Amount of soybean oil produced. groundnut_production: title: Production of groundnut oil unit: tonnes short_unit: t - description: | + description_short: | Amount of groundnut oil produced. coconut_production: title: Production of coconut oil unit: tonnes short_unit: t - description: | + description_short: | Amount of coconut oil produced. olive_production: title: Production of olive oil unit: tonnes short_unit: t - description: | + description_short: | Amount of olive oil produced. cottonseed_production: title: Production of cottonseed oil unit: tonnes short_unit: t - description: | + description_short: | Amount of cottonseed oil produced. sesame_production: title: Production of sesame oil unit: tonnes short_unit: t - description: | + description_short: | Amount of sesame oil produced. rapeseed_production: title: Production of rapeseed oil unit: tonnes short_unit: t - description: | + description_short: | Amount of rapeseed oil produced. palm_production: title: Production of palm oil unit: tonnes short_unit: t - description: | + description_short: | Amount of palm oil produced. 
Palm oil includes palm kernel oil. sunflower_area: title: Area harvested for sunflower crops unit: hectares short_unit: ha - description: | + description_short: | Land area used to harvest sunflower crops. cottonseed_area: title: Area harvested for cottonseed crops unit: hectares short_unit: ha - description: | + description_short: | Land area used to harvest cottonseed crops. soybean_area: title: Area harvested for soybean crops unit: hectares short_unit: ha - description: | + description_short: | Land area used to harvest soybean crops. groundnut_area: title: Area harvested for groundnut crops unit: hectares short_unit: ha - description: | + description_short: | Land area used to harvest groundnut crops. olive_area: title: Area harvested for olive crops unit: hectares short_unit: ha - description: | + description_short: | Land area used to harvest olive crops. rapeseed_area: title: Area harvested for rapeseed crops unit: hectares short_unit: ha - description: | + description_short: | Land area used to harvest rapeseed crops. coconut_area: title: Area harvested for coconut crops unit: hectares short_unit: ha - description: | + description_short: | Land area used to harvest coconut crops. sesame_area: title: Area harvested for sesame crops unit: hectares short_unit: ha - description: | + description_short: | Land area used to harvest sesame crops. palm_area: title: Area harvested for palm fruit crops unit: hectares short_unit: ha - description: | + description_short: | Land area used to harvest palm fruit crops. Palm oil includes palm kernel oil. vegetable_oils_production: title: Global production of vegetable oils unit: tonnes short_unit: t - description: | + description_short: | Amount of vegetable oils produced worldwide. palm_tonnes_per_hectare: title: Palm oil yield per crop unit: tonnes per hectare short_unit: tonnes/ha - description: | + description_short: | Average amount of palm oil produced per hectare of palm fruit crops harvested. 
Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. Palm oil includes palm kernel oil. palm_hectares_per_tonne: title: Area of palm fruit crops harvested to produce a tonne of palm oil unit: hectares per tonne short_unit: hectares/tonne - description: | + description_short: | Area of palm fruit crops harvested to produce a tonne of palm oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. Palm oil includes palm kernel oil. palm_area_to_meet_global_oil_demand: title: Area needed to meet the global vegetable oil demand with only palm oil unit: hectares short_unit: ha - description: | + description_short: | Amount of land that would need to be devoted to grow palm fruit crops if it was to meet global vegetable oil demand alone. Palm oil includes palm kernel oil. sunflower_tonnes_per_hectare: title: Sunflower oil yield per crop unit: tonnes per hectare short_unit: tonnes/ha - description: | + description_short: | Average amount of sunflower oil produced per hectare of sunflower crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. sunflower_hectares_per_tonne: title: Area of sunflower crops harvested to produce a tonne of sunflower oil unit: hectares per tonne short_unit: hectares/tonne - description: | + description_short: | Area of sunflower crops harvested to produce a tonne of sunflower oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. 
sunflower_area_to_meet_global_oil_demand: title: Area needed to meet the global vegetable oil demand with only sunflower oil unit: hectares short_unit: ha - description: | + description_short: | Amount of land that would need to be devoted to grow sunflower crops if it was to meet global vegetable oil demand alone. rapeseed_tonnes_per_hectare: title: Rapeseed oil yield per crop unit: tonnes per hectare short_unit: tonnes/ha - description: | + description_short: | Average amount of rapeseed oil produced per hectare of rapeseed crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. rapeseed_hectares_per_tonne: title: Area of rapeseed crops harvested to produce a tonne of rapeseed oil unit: hectares per tonne short_unit: hectares/tonne - description: | + description_short: | Area of rapeseed crops harvested to produce a tonne of rapeseed oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. rapeseed_area_to_meet_global_oil_demand: title: Area needed to meet the global vegetable oil demand with only rapeseed oil unit: hectares short_unit: ha - description: | + description_short: | Amount of land that would need to be devoted to grow rapeseed crops if it was to meet global vegetable oil demand alone. soybean_tonnes_per_hectare: title: Soybean oil yield per crop unit: tonnes per hectare short_unit: tonnes/ha - description: | + description_short: | Average amount of soybean oil produced per hectare of soybean crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. 
soybean_hectares_per_tonne: title: Area of soybean crops harvested to produce a tonne of soybean oil unit: hectares per tonne short_unit: hectares/tonne - description: | + description_short: | Area of soybean crops harvested to produce a tonne of soybean oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. soybean_area_to_meet_global_oil_demand: title: Area needed to meet the global vegetable oil demand with only soybean oil unit: hectares short_unit: ha - description: | + description_short: | Amount of land that would need to be devoted to grow soybean crops if it was to meet global vegetable oil demand alone. olive_tonnes_per_hectare: title: Olive oil yield per crop unit: tonnes per hectare short_unit: tonnes/ha - description: | + description_short: | Average amount of olive oil produced per hectare of olive crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. olive_hectares_per_tonne: title: Area of olive crops harvested to produce a tonne of olive oil unit: hectares per tonne short_unit: hectares/tonne - description: | + description_short: | Area of olive crops harvested to produce a tonne of olive oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. olive_area_to_meet_global_oil_demand: title: Area needed to meet the global vegetable oil demand with only olive oil unit: hectares short_unit: ha - description: | + description_short: | Amount of land that would need to be devoted to grow olive crops if it was to meet global vegetable oil demand alone. 
coconut_tonnes_per_hectare: title: Coconut oil yield per crop unit: tonnes per hectare short_unit: tonnes/ha - description: | + description_short: | Average amount of coconut oil produced per hectare of coconut crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. coconut_hectares_per_tonne: title: Area of coconut crops harvested to produce a tonne of coconut oil unit: hectares per tonne short_unit: hectares/tonne - description: | + description_short: | Area of coconut crops harvested to produce a tonne of coconut oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. coconut_area_to_meet_global_oil_demand: title: Area needed to meet the global vegetable oil demand with only coconut oil unit: hectares short_unit: ha - description: | + description_short: | Amount of land that would need to be devoted to grow coconut crops if it was to meet global vegetable oil demand alone. groundnut_tonnes_per_hectare: title: Groundnut oil yield per crop unit: tonnes per hectare short_unit: tonnes/ha - description: | + description_short: | Average amount of groundnut oil produced per hectare of groundnut crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. groundnut_hectares_per_tonne: title: Area of groundnut crops harvested to produce a tonne of groundnut oil unit: hectares per tonne short_unit: hectares/tonne - description: | + description_short: | Area of groundnut crops harvested to produce a tonne of groundnut oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. 
groundnut_area_to_meet_global_oil_demand: title: Area needed to meet the global vegetable oil demand with only groundnut oil unit: hectares short_unit: ha - description: | + description_short: | Amount of land that would need to be devoted to grow groundnut crops if it was to meet global vegetable oil demand alone. cottonseed_tonnes_per_hectare: title: Cottonseed oil yield per crop unit: tonnes per hectare short_unit: tonnes/ha - description: | + description_short: | Average amount of cottonseed oil produced per hectare of cottonseed crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. cottonseed_hectares_per_tonne: title: Area of cottonseed crops harvested to produce a tonne of cottonseed oil unit: hectares per tonne short_unit: hectares/tonne - description: | + description_short: | Area of cottonseed crops harvested to produce a tonne of cottonseed oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. cottonseed_area_to_meet_global_oil_demand: title: Area needed to meet the global vegetable oil demand with only cottonseed oil unit: hectares short_unit: ha - description: | + description_short: | Amount of land that would need to be devoted to grow cottonseed crops if it was to meet global vegetable oil demand alone. sesame_tonnes_per_hectare: title: Sesame oil yield per crop unit: tonnes per hectare short_unit: tonnes/ha - description: | + description_short: | Average amount of sesame oil produced per hectare of sesame crops harvested. Note that this calculates the oil yield per hectare, which is different from the yield of the total crop, because not all of the crop can be used for oil. 
sesame_hectares_per_tonne: title: Area of sesame crops harvested to produce a tonne of sesame oil unit: hectares per tonne short_unit: hectares/tonne - description: | + description_short: | Area of sesame crops harvested to produce a tonne of sesame oil. This metric is the inverse of oil yields, and represents the amount of land that would need to be devoted to grow a given crop to produce one tonne of oil. sesame_area_to_meet_global_oil_demand: title: Area needed to meet the global vegetable oil demand with only sesame oil unit: hectares short_unit: ha - description: | + description_short: | Amount of land that would need to be devoted to grow sesame crops if it was to meet global vegetable oil demand alone. agriculture_land_use_evolution: variables: @@ -546,86 +521,67 @@ tables: title: Area used for agriculture unit: hectares short_unit: ha - description: | + description_short: | Surface area devoted to agriculture on a given year. agriculture_area_one_decade_back: title: Area used for agriculture one decade back unit: hectares short_unit: ha - description: | + description_short: | Surface area devoted to agriculture one decade before a given year. For example, for year 2020, this variable gives the extent of agricultural land in 2010. cropland_area: title: Area used for croplands unit: hectares short_unit: ha - description: | + description_short: | Surface area devoted to croplands on a given year. cropland_area_one_decade_back: title: Area used for croplands one decade back unit: hectares short_unit: ha - description: | + description_short: | Surface area devoted to croplands one decade before a given year. For example, for year 2020, this variable gives the extent of croplands in 2010. pasture_area: title: Area used for pastures unit: hectares short_unit: ha - description: | + description_short: | Surface area devoted to pastures on a given year. 
pasture_area_one_decade_back: title: Area used for pastures one decade back unit: hectares short_unit: ha - description: | + description_short: | Surface area devoted to pastures one decade before a given year. For example, for year 2020, this variable gives the extent of pastures in 2010. year_one_decade_back: title: Year one decade back unit: "" short_unit: "" - description: | + description_short: | Year one decade before a given year. For example, for year 2020, this variable would be 2010. agriculture_area_change: title: Change in agriculture area with respect to one decade back unit: "%" short_unit: "%" - description: | - Percentage change in surface area devoted to agriculture with respect to 10 years before. Negative values imply that surface area has decreased with respect to the previous decade. - - This data is used to assess which countries may have already peaked in their agricultural land use. - - Assessing this by looking at annual land use data is difficult because there can be significant year-to-year variability. That land use for one or two years was lower than previous years would be insufficient to conclude that a country had peaked. - - For this reason we look at decadal changes in agricultural land. We look at land use in the latest year relative to 10 years before. - - If land use is lower in the latest year then we suggest that land use may have peaked. If land use it the same or higher than a decade back, we suggest that it hasn't, or this is uncertain. + description_short: | + Percentage change in surface area devoted to agriculture with respect to 10 years before. Negative values imply that surface area has decreased with respect to the previous decade. This data is used to assess which countries may have already peaked in their agricultural land use. 
+ description_processing: &agricultural_land_use_description_processing | + - Assessing this by looking at annual land use data is difficult because there can be significant year-to-year variability. That land use for one or two years was lower than previous years would be insufficient to conclude that a country had peaked. For this reason we look at decadal changes in agricultural land. We look at land use in the latest year relative to 10 years before. + - If land use is lower in the latest year then we suggest that land use may have peaked. If land use is the same or higher than a decade back, we suggest that it hasn't, or this is uncertain. cropland_area_change: title: Change in cropland area with respect to one decade back unit: "%" short_unit: "%" - description: | - Percentage change in surface area devoted to croplands with respect to 10 years before. Negative values imply that surface area has decreased with respect to the previous decade. - - This data is used to assess which countries may have already peaked in their agricultural land use. - - Assessing this by looking at annual land use data is difficult because there can be significant year-to-year variability. That land use for one or two years was lower than previous years would be insufficient to conclude that a country had peaked. - - For this reason we look at decadal changes in agricultural land. We look at land use in the latest year relative to 10 years before. - - If land use is lower in the latest year then we suggest that land use may have peaked. If land use it the same or higher than a decade back, we suggest that it hasn't, or this is uncertain. + description_short: | + Percentage change in surface area devoted to croplands with respect to 10 years before. Negative values imply that surface area has decreased with respect to the previous decade. This data is used to assess which countries may have already peaked in their agricultural land use. 
+ description_processing: *agricultural_land_use_description_processing pasture_area_change: title: Change in pasture area with respect to one decade back unit: "%" short_unit: "%" - description: | - Percentage change in surface area devoted to pastures with respect to 10 years before. Negative values imply that surface area has decreased with respect to the previous decade. - - This data is used to assess which countries may have already peaked in their agricultural land use. - - Assessing this by looking at annual land use data is difficult because there can be significant year-to-year variability. That land use for one or two years was lower than previous years would be insufficient to conclude that a country had peaked. - - For this reason we look at decadal changes in agricultural land. We look at land use in the latest year relative to 10 years before. - - If land use is lower in the latest year then we suggest that land use may have peaked. If land use it the same or higher than a decade back, we suggest that it hasn't, or this is uncertain. + description_short: | + Percentage change in surface area devoted to pastures with respect to 10 years before. Negative values imply that surface area has decreased with respect to the previous decade. This data is used to assess which countries may have already peaked in their agricultural land use. + description_processing: *agricultural_land_use_description_processing hypothetical_meat_consumption: variables: animals_global: @@ -636,12 +592,12 @@ tables: title: Hypothetical number of slaughtered animals if everyone ate like the average citizen of a given country unit: "animals" short_unit: "" - description: | + description_short: | Hypothetical number of slaughtered animals worldwide if everyone in the world ate the same quantity as the average citizen of a given country. 
- - This is a hypothetical variable derived by Our World in Data which answers the question: "How many animals would need to be slaughtered if everyone in the world consumed the average per capita amount of a given country?". For example: "How many animals would need to be slaughtered if everyone in the world consumed the same amount of meat as the average UK citizen?". - - This was derived by multiplying global population by the per capita number of slaughtered animals of a given country. + description_key: + - 'This is a hypothetical variable derived by Our World in Data which answers the question: "How many animals would need to be slaughtered if everyone in the world consumed the average per capita amount of a given country?". For example: "How many animals would need to be slaughtered if everyone in the world consumed the same amount of meat as the average UK citizen?".' + description_processing: | + - This indicator was calculated by multiplying global population by the per capita number of slaughtered animals of a given country. animals_per_capita: title: Number of slaughtered animals per person in each country unit: "animals per person" @@ -658,12 +614,12 @@ tables: title: Hypothetical global meat demand if everyone ate like the average citizen of a given country unit: "tonnes" short_unit: "t" - description: | + description_short: | Hypothetical global meat demand if everyone in the world ate the same quantity as the average citizen of a given country. - - This is a hypothetical variable derived by Our World in Data which answers the question: "What would global meat production have to be if everyone in the world consumed the average per capita amount of a given country?". For example: "How much meat would we need to produce if everyone in the world consumed the same amount of meat as the average UK citizen?". - - This was derived by multiplying global population by per capita meat supply of a given country. 
+ description_key: + - 'This is a hypothetical variable derived by Our World in Data which answers the question: "What would global meat production have to be if everyone in the world consumed the average per capita amount of a given country?". For example: "How much meat would we need to produce if everyone in the world consumed the same amount of meat as the average UK citizen?".' + description_processing: | + - This indicator was calculated by multiplying global population by per capita meat supply of a given country. production_per_capita: title: Per-capita production of meat in each country unit: "tonnes per person" @@ -674,38 +630,38 @@ tables: title: Cereals allocated to animal feed unit: tonnes short_unit: t - description: | + description_short: | Quantity of cereal crops allocated to animal feed (and not human food or other uses, such as biofuel production). cereals_allocated_to_food: title: Cereals allocated to human food unit: tonnes short_unit: t - description: | + description_short: | Quantity of cereal crops allocated to human food (and not animal feed or other uses, such as biofuel production). cereals_allocated_to_other_uses: title: Cereals allocated to other uses unit: tonnes short_unit: t - description: | + description_short: | Quantity of cereal crops allocated to other uses (and not to human food or animal feed), predominantly industrial uses such as biofuel production. share_of_cereals_allocated_to_animal_feed: title: Share of cereals that are allocated to animal feed unit: "%" short_unit: "%" - description: | - This is calculated by dividing the amount of cereals allocated to animal feed by the sum of all cereal uses considered (namely human food, animal feed, and other uses such us biofuel production). This corresponds to cereals available domestically (after trade) excluding supply chain losses and seed resown from the crop. 
+ description_processing: | + - This indicator is calculated by dividing the amount of cereals allocated to animal feed by the sum of all cereal uses considered (namely human food, animal feed, and other uses such as biofuel production). This corresponds to cereals available domestically (after trade) excluding supply chain losses and seed resown from the crop. share_of_cereals_allocated_to_food: title: Share of cereals that are allocated to human food unit: "%" short_unit: "%" - description: | - This is calculated by dividing the amount of cereals allocated to human food by the sum of all cereal uses considered (namely human food, animal feed, and other uses such us biofuel production). This corresponds to cereals available domestically (after trade) excluding supply chain losses and seed resown from the crop. + description_processing: | + - This indicator is calculated by dividing the amount of cereals allocated to human food by the sum of all cereal uses considered (namely human food, animal feed, and other uses such as biofuel production). This corresponds to cereals available domestically (after trade) excluding supply chain losses and seed resown from the crop. share_of_cereals_allocated_to_other_uses: title: Share of cereals that are allocated to other uses such as biofuel production unit: "%" short_unit: "%" - description: | - This is calculated by dividing the amount of cereals allocated to other uses (predominantly industrial uses such as biofuel production) by the sum of all cereal uses considered (namely human food, animal feed, and other uses). This corresponds to cereals available domestically (after trade) excluding supply chain losses and seed resown from the crop. + description_processing: | + - This indicator is calculated by dividing the amount of cereals allocated to other uses (predominantly industrial uses such as biofuel production) by the sum of all cereal uses considered (namely human food, animal feed, and other uses). 
This corresponds to cereals available domestically (after trade) excluding supply chain losses and seed resown from the crop. # All metadata for maize_and_wheat and fertilizer_exports is prepared via script. # maize_and_wheat: # fertilizer_exports: From 5de64996cbbc522ee327287e8a76d8019243946c Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 20 Mar 2024 17:25:25 +0100 Subject: [PATCH 45/54] Improve metadata of additional variables dataset --- .../2024-03-14/additional_variables.meta.yml | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml index afd8367972f..76cfff23706 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml +++ b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.meta.yml @@ -1,20 +1,3 @@ -definitions: - common: - macronutrient_compositions: - description_processing: | - - The FAO provide annual figures from 1961 by country on daily caloric supply, fat supply (in grams), and protein supply (in grams). To calculate the daily per capita supply of carbohydrates, we assume an energy density by macronutrient of 4 kcal per gram of both protein and carbohydrate and 9 kcal per gram of fat (based on established nutritional guidelines reported by the FAO). The daily supply of carbohydrates is therefore calculated as: - - ((Daily supply of kcal)-(Daily supply of protein * 4 + Daily supply of fat * 9)) / 4 - - - The quantity of calories from each macronutrient is then calculated based on the energy density figures given above (e.g. calories from protein is calculated by multiplying the daily supply of protein in grams by 4). 
- - - For an explanation of these conversion factors, see "Chapter 3: Calculation Of The Energy Content Of Foods - Energy Conversion Factors", available at: http://www.fao.org/docrep/006/Y5022E/y5022e04.htm - - - The share of calories derived from each macronutrient is then calculated by dividing the number of calories derived from a given macronutrient by the total daily caloric supply. - - - Protein of animal origin includes protein supplied in the form of all meat commodities, eggs and dairy products, and fish & seafood. - - dataset: title: Additional FAOSTAT variables description: | @@ -100,6 +83,19 @@ tables: # All metadata for food_available_for_consumption is prepared via script. # food_available_for_consumption: macronutrient_compositions: + common: + description_processing: | + - The FAO provide annual figures from 1961 by country on daily caloric supply, fat supply (in grams), and protein supply (in grams). To calculate the daily per capita supply of carbohydrates, we assume an energy density by macronutrient of 4 kcal per gram of both protein and carbohydrate and 9 kcal per gram of fat (based on established nutritional guidelines reported by the FAO). The daily supply of carbohydrates is therefore calculated as: + + ((Daily supply of kcal)-(Daily supply of protein * 4 + Daily supply of fat * 9)) / 4 + + - The quantity of calories from each macronutrient is then calculated based on the energy density figures given above (e.g. calories from protein is calculated by multiplying the daily supply of protein in grams by 4). + + - For an explanation of these conversion factors, see "Chapter 3: Calculation Of The Energy Content Of Foods - Energy Conversion Factors", available at: http://www.fao.org/docrep/006/Y5022E/y5022e04.htm + + - The share of calories derived from each macronutrient is then calculated by dividing the number of calories derived from a given macronutrient by the total daily caloric supply. 
+ + - Protein of animal origin includes protein supplied in the form of all meat commodities, eggs and dairy products, and fish & seafood. variables: energy_from_animal_products: title: "Daily caloric intake per person from animal products" From 1b5febdb5fdda3d8f811ca8add14901b914675b8 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 21 Mar 2024 15:39:07 +0100 Subject: [PATCH 46/54] Improve function that finds dataset id from name --- etl/db.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/etl/db.py b/etl/db.py index 17160a361ba..850a1bca289 100644 --- a/etl/db.py +++ b/etl/db.py @@ -81,7 +81,9 @@ def open_db() -> Generator[DBUtils, None, None]: connection.close() -def get_dataset_id(dataset_name: str, db_conn: Optional[MySQLdb.Connection] = None) -> Any: +def get_dataset_id( + dataset_name: str, db_conn: Optional[MySQLdb.Connection] = None, version: Optional[str] = None +) -> Any: """Get the dataset ID of a specific dataset name from database. If more than one dataset is found for the same name, or if no dataset is found, an error is raised. @@ -92,6 +94,9 @@ def get_dataset_id(dataset_name: str, db_conn: Optional[MySQLdb.Connection] = No Dataset name. db_conn : MySQLdb.Connection Connection to database. Defaults to None, in which case a default connection is created (uses etl.config). + version : str + ETL version of the dataset. This is necessary when multiple datasets have the same title. In such a case, if + version is not given, the function will raise an error. 
Returns ------- @@ -107,6 +112,10 @@ def get_dataset_id(dataset_name: str, db_conn: Optional[MySQLdb.Connection] = No FROM datasets WHERE name = '{dataset_name}' """ + + if version: + query += f" AND version = '{version}'" + with db_conn.cursor() as cursor: cursor.execute(query) result = cursor.fetchall() From 95c11e4b6ee37a4729a2b66992aaf6b474b3ce44 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 21 Mar 2024 15:40:32 +0100 Subject: [PATCH 47/54] Improve script to submit faostat chart revisions --- etl/scripts/faostat/create_chart_revisions.py | 80 +++++++------------ 1 file changed, 31 insertions(+), 49 deletions(-) diff --git a/etl/scripts/faostat/create_chart_revisions.py b/etl/scripts/faostat/create_chart_revisions.py index 3bfd1979ea9..0b726ad5919 100644 --- a/etl/scripts/faostat/create_chart_revisions.py +++ b/etl/scripts/faostat/create_chart_revisions.py @@ -5,6 +5,9 @@ chart that is already in revision (when another variable previously triggered a revision for the same chart). When this happens, simply go through the approval tool and run this script again until it produces no more revisions. +NOTE: This script assumes both old and new ETL steps involved have been executed already in the present environment. +If not, execute them (e.g. `etl run faostat`, which may take a while). + """ import argparse @@ -21,12 +24,7 @@ from etl import db from etl.chart_revision.v1.revision import create_and_submit_charts_revisions from etl.paths import DATA_DIR -from etl.scripts.faostat.shared import ( - N_CHARACTERS_ELEMENT_CODE, - N_CHARACTERS_ITEM_CODE, - N_CHARACTERS_ITEM_CODE_EXTENDED, - NAMESPACE, -) +from etl.scripts.faostat.shared import NAMESPACE # Initialize logger. log = get_logger() @@ -37,15 +35,8 @@ # Columns to not take as variables. 
COLUMNS_TO_IGNORE = ["country", "year", "index"] -# This regex should extract item codes and element codes, which are made of numbers, sometimes "pc" -# (for per capita variables), and "M" and "F" (for male and female, only for certain domains, like fs and sdgb). -REGEX_TO_EXTRACT_ITEM_AND_ELEMENT = ( - rf".*([0-9pcMF]{{{N_CHARACTERS_ITEM_CODE}}}).*([0-9pcMF]{{{N_CHARACTERS_ELEMENT_CODE}}})" -) -# Idem for faostat_sdgb. -REGEX_TO_EXTRACT_ITEM_AND_ELEMENT_SDGB = ( - rf".*([0-9A-Z]{{{N_CHARACTERS_ITEM_CODE_EXTENDED}}}).*([0-9pcMF]{{{N_CHARACTERS_ELEMENT_CODE}}})" -) +# This regex should extract item codes and element codes from variable names. +REGEX_TO_EXTRACT_ITEM_AND_ELEMENT = r"\|\s*([^|]+)\s*\|\|\s*[^|]+\|\s*([^|]+)\s*\|\|" def extract_variables_from_dataset(dataset_short_name: str, version: str) -> List[str]: @@ -62,11 +53,8 @@ def extract_variables_from_dataset(dataset_short_name: str, version: str) -> Lis return variable_titles -def extract_identifiers_from_variable_name(variable: str, dataset_short_name: str) -> Dict[str, Any]: - if dataset_short_name == "faostat_sdgb": - matches = re.findall(REGEX_TO_EXTRACT_ITEM_AND_ELEMENT_SDGB, variable) - else: - matches = re.findall(REGEX_TO_EXTRACT_ITEM_AND_ELEMENT, variable) +def extract_identifiers_from_variable_name(variable: str) -> Dict[str, Any]: + matches = re.findall(REGEX_TO_EXTRACT_ITEM_AND_ELEMENT, variable) error = f"Item code or element code could not be extracted for variable: {variable}" assert np.shape(matches) == (1, 2), error item_code, element_code = matches[0] @@ -75,22 +63,10 @@ def extract_identifiers_from_variable_name(variable: str, dataset_short_name: st return variable_codes -def map_old_to_new_variable_names( - variables_old: List[str], variables_new: List[str], dataset_short_name: str -) -> Dict[str, str]: +def map_old_to_new_variable_names(variables_old: List[str], variables_new: List[str]) -> Dict[str, str]: # Extract item codes and element codes from variable names. 
- codes_old = pd.DataFrame( - [ - extract_identifiers_from_variable_name(variable=variable, dataset_short_name=dataset_short_name) - for variable in variables_old - ] - ) - codes_new = pd.DataFrame( - [ - extract_identifiers_from_variable_name(variable=variable, dataset_short_name=dataset_short_name) - for variable in variables_new - ] - ) + codes_old = pd.DataFrame([extract_identifiers_from_variable_name(variable=variable) for variable in variables_old]) + codes_new = pd.DataFrame([extract_identifiers_from_variable_name(variable=variable) for variable in variables_new]) variables_matched = pd.merge( codes_old, codes_new, how="outer", on=["item_code", "element_code"], suffixes=("_old", "_new") @@ -116,11 +92,23 @@ def map_old_to_new_variable_names( def get_grapher_data_for_old_and_new_variables( dataset_old: Dataset, dataset_new: Dataset -) -> Tuple[pd.DataFrame, pd.DataFrame]: +) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame]]: with db.get_connection() as db_conn: - # Get old and new dataset ids. - dataset_id_old = db.get_dataset_id(db_conn=db_conn, dataset_name=dataset_old.metadata.title) - dataset_id_new = db.get_dataset_id(db_conn=db_conn, dataset_name=dataset_new.metadata.title) + try: + # Get old and new dataset ids. + dataset_id_old = db.get_dataset_id( + db_conn=db_conn, dataset_name=dataset_old.metadata.title, version=dataset_old.metadata.version + ) + except AssertionError: + log.error(f"Dataset {dataset_old.metadata.title} not found in grapher DB.") + return None, None + try: + dataset_id_new = db.get_dataset_id( + db_conn=db_conn, dataset_name=dataset_new.metadata.title, version=dataset_new.metadata.version + ) + except AssertionError: + log.error(f"Dataset {dataset_new.metadata.title} not found in grapher DB.") + return None, None # Get variables from old dataset that have been used in at least one chart. 
grapher_variables_old = db.get_variables_in_dataset( @@ -204,15 +192,16 @@ def get_grapher_variable_id_mapping_for_two_dataset_versions( variables_new = extract_variables_from_dataset(dataset_short_name=dataset_short_name, version=version_new) # Map old to new variable names. - variables_mapping = map_old_to_new_variable_names( - variables_old=variables_old, variables_new=variables_new, dataset_short_name=dataset_short_name - ) + variables_mapping = map_old_to_new_variable_names(variables_old=variables_old, variables_new=variables_new) # Get data for old and new variables from grapher db. grapher_variables_old, grapher_variables_new = get_grapher_data_for_old_and_new_variables( dataset_old=dataset_old, dataset_new=dataset_new ) + if (grapher_variables_old is None) or (grapher_variables_new is None): + return {} + # Check that variable titles in ETL match those found in grapher DB. error = "Mismatch between expected old variable titles in ETL and grapher DB." # NOTE: grapher_variables_old includes only variables that have been used in charts, whereas variables_old @@ -249,13 +238,6 @@ def main( # List all datasets to map. dataset_short_names = [f"{NAMESPACE}_{domain.lower()}" for domain in domains] - #################################################################################################################### - # Temporarily ignore faostat_sdgb, which has changed item codes significantly. - dataset_short_names = [ - dataset_short_name for dataset_short_name in dataset_short_names if dataset_short_name != "faostat_sdgb" - ] - #################################################################################################################### - for dataset_short_name in dataset_short_names: log.info(f"Checking available versions for dataset {dataset_short_name}.") # Ensure a dataset exist for each of the specified versions. 
From c98ef240f1ebd1799c0afa5893846fa12fde0d4e Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Thu, 21 Mar 2024 17:25:05 +0100 Subject: [PATCH 48/54] Fix errors in script that creates chart revisions --- etl/scripts/faostat/create_chart_revisions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/scripts/faostat/create_chart_revisions.py b/etl/scripts/faostat/create_chart_revisions.py index 0b726ad5919..5ac258a8c5d 100644 --- a/etl/scripts/faostat/create_chart_revisions.py +++ b/etl/scripts/faostat/create_chart_revisions.py @@ -199,7 +199,7 @@ def get_grapher_variable_id_mapping_for_two_dataset_versions( dataset_old=dataset_old, dataset_new=dataset_new ) - if (grapher_variables_old is None) or (grapher_variables_new is None): + if (grapher_variables_old is None) or (grapher_variables_new is None) or (len(variables_mapping) == 0): return {} # Check that variable titles in ETL match those found in grapher DB. From cfc3a5eddc887298a8724d8b99d3d0859e4faa18 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Fri, 22 Mar 2024 08:56:22 +0100 Subject: [PATCH 49/54] Delete unused faostat script --- .../faostat/archive/map_sdgb_variables.py | 197 ------------------ 1 file changed, 197 deletions(-) delete mode 100644 etl/scripts/faostat/archive/map_sdgb_variables.py diff --git a/etl/scripts/faostat/archive/map_sdgb_variables.py b/etl/scripts/faostat/archive/map_sdgb_variables.py deleted file mode 100644 index 11ca2306638..00000000000 --- a/etl/scripts/faostat/archive/map_sdgb_variables.py +++ /dev/null @@ -1,197 +0,0 @@ -"""Map old grapher variables to new ones, and create chart revisions, specifically for faostat_sdgb. - -This script has been adapted from create_chart_revisions.py. -We do this separately to the rest of datasets because the item codes of this dataset used to be totally different -to the usual item code. But in the latest version, this issue has been corrected. -Therefore this script will not be necessary after the current update. 
- -""" - -import argparse -from typing import Any, Dict, List, Optional - -import pandas as pd -from MySQLdb import IntegrityError -from owid.catalog import Dataset -from structlog import get_logger - -from etl.chart_revision.v1.revision import create_and_submit_charts_revisions -from etl.paths import DATA_DIR -from etl.scripts.faostat.create_chart_revisions import ( - extract_variables_from_dataset, - find_and_check_available_versions_for_dataset, - get_grapher_data_for_old_and_new_variables, - map_old_to_new_grapher_variable_ids, -) -from etl.scripts.faostat.shared import NAMESPACE - -log = get_logger() - -# Channel from which the dataset versions and variables will be loaded. -CHANNEL = "grapher" - -# Domain and versions of the relevant datasets. -DOMAIN = "sdgb" -VERSION_OLD = "2022-05-17" -VERSION_NEW = "2023-02-22" - -# Columns to not take as variables. -COLUMNS_TO_IGNORE = ["country", "year", "index"] - -# Some items changed their name, so they have to be manually mapped. -MANUAL_VARIABLES_MAPPING = { - "12.3.1 Food Loss Percentage (%) | AG_FLS_IDX || Value | 006121 || percent": "12.3.1(a) Food Loss Percentage (%) | 00024044 || Value | 006121 || percent", - "2.5.2 Proportion of local breeds classified as being at risk as a share of local breeds with known level of extinction risk (%) | ER_RSK_LBREDS || Value | 006121 || percent": "2.5.2 Proportion of local breeds classified as being at risk of extinction (%) | 00024018 || Value | 006121 || percent", - "6.4.1 Industrial Water Use Efficiency [US$/m3] | 00240281 || Value | 006178 || United States dollars per cubic metre": "6.4.1 Water Use Efficiency [US$/m3] (Industries) | 00240281 || Value | 006178 || United States dollars per cubic metre", - "6.4.1 Irrigated Agriculture Water Use Efficiency [US$/m3] | 00240280 || Value | 006178 || United States dollars per cubic metre": "6.4.1 Water Use Efficiency [US$/m3] (Agriculture) | 00240280 || Value | 006178 || United States dollars per cubic metre", - "6.4.1 Services 
Water Use Efficiency [US$/m3] | 00240282 || Value | 006178 || United States dollars per cubic metre": "6.4.1 Water Use Efficiency [US$/m3] (Services) | 00240282 || Value | 006178 || United States dollars per cubic metre", - "6.4.1 Total Water Use Efficiency [US$/m3] | 00240283 || Value | 006178 || United States dollars per cubic metre": "6.4.1 Water Use Efficiency [US$/m3] (Total) | 00240283 || Value | 006178 || United States dollars per cubic metre", - # Level of water stress is disaggregated in Agriculture, Industries, Services, and Total. - # We assume the old variable (where the aggregate was not specified) corresponds to total. - "6.4.2 Level of water stress: freshwater withdrawal as a proportion of available freshwater resources (%) | ER_H2O_STRESS || Value | 006121 || percent": "6.4.2 Level of water stress: freshwater withdrawal as a proportion of available freshwater resources (%) (Total) | 00240273 || Value | 006121 || percent", -} - - -def extract_identifiers_from_variable_name(variable: str) -> Dict[str, Any]: - # Instead of matching by extracting item and element codes from old and new versions, we do the opposite. - # We match variables by their name, omitting the codes. - item, element, unit = variable.split("||") - item = item.split("|")[0].strip() - element = element.split("|")[0].strip() - unit = unit.strip() - - error = f"Item, element or unit could not be extracted for variable: {variable}" - assert len(item) * len(element) * len(unit) > 0, error - variable_codes = {"variable": variable, "item": item, "element": element, "unit": unit} - - return variable_codes - - -def map_old_to_new_variable_names(variables_old: List[str], variables_new: List[str]) -> Dict[str, str]: - # Extract identifiers from variable names. 
- codes_old = pd.DataFrame([extract_identifiers_from_variable_name(variable) for variable in variables_old]) - codes_new = pd.DataFrame([extract_identifiers_from_variable_name(variable) for variable in variables_new]) - - variables_matched = pd.merge( - codes_old, codes_new, how="outer", on=["item", "element", "unit"], suffixes=("_old", "_new") - ) - - # Find if any of the old variables are not found in the new dataset. - unmatched_old_variables = variables_matched[variables_matched["variable_new"].isnull()]["variable_old"].tolist() - - # Find if there are new variables that did not exist. - # They could also correspond to old variables that were not successfully matched. - possible_new_variables = variables_matched[variables_matched["variable_old"].isnull()]["variable_new"].tolist() - - if len(possible_new_variables) > 0: - log.info(f"There are {len(possible_new_variables)} unknown new variables.") - if len(unmatched_old_variables) > 0: - log.info(f"There are {len(unmatched_old_variables)} old variables not matched to any new variables.") - - # Map old variable names to new variable names. - variables_name_mapping = variables_matched.dropna().set_index("variable_old").to_dict()["variable_new"] - - # Add manually mapped variables. - variables_name_mapping.update(MANUAL_VARIABLES_MAPPING) - - return variables_name_mapping - - -def get_grapher_variable_id_mapping_for_two_dataset_versions( - dataset_short_name: str, version_old: str, version_new: str -) -> Dict[int, int]: - # Load old and new datasets. - dataset_old = Dataset(DATA_DIR / "grapher" / NAMESPACE / version_old / dataset_short_name) - dataset_new = Dataset(DATA_DIR / "grapher" / NAMESPACE / version_new / dataset_short_name) - - # Get all variable names from the old and new datasets. 
- variables_old = extract_variables_from_dataset(dataset_short_name=dataset_short_name, version=version_old) - variables_new = extract_variables_from_dataset(dataset_short_name=dataset_short_name, version=version_new) - - # Map old to new variable names. - variables_mapping = map_old_to_new_variable_names(variables_old=variables_old, variables_new=variables_new) - - # Get data for old and new variables from grapher db. - grapher_variables_old, grapher_variables_new = get_grapher_data_for_old_and_new_variables( - dataset_old=dataset_old, dataset_new=dataset_new - ) - - # Check that variable titles in ETL match those found in grapher DB. - error = "Mismatch between expected old variable titles in ETL and grapher DB." - # NOTE: grapher_variables_old includes only variables that have been used in charts, whereas variables_old - # includes all variables. Therefore, we check that the former is fully contained in the latter. - assert set(grapher_variables_old["name"]) <= set(variables_old), error - error = "Mismatch between expected new variable titles in ETL and grapher DB." - # NOTE: Both grapher_variables_new and variables_new should contain all variables. - assert set(grapher_variables_new["name"]) == set(variables_new), error - - grapher_variable_ids_mapping = map_old_to_new_grapher_variable_ids( - grapher_variables_old, grapher_variables_new, variables_mapping - ) - - return grapher_variable_ids_mapping - - -def main( - domains: Optional[List[str]] = None, - version_old: Optional[str] = None, - version_new: Optional[str] = None, - execute_revisions: bool = False, -) -> None: - if domains is None: - # If domains is not specified, gather all domains found in all steps for the considered channel. - domains = sorted( - set( - [ - dataset_path.name.split("_")[-1] - for dataset_path in list((DATA_DIR / CHANNEL / NAMESPACE).glob("*/*")) - ] - ) - ) - - # List all datasets to map. 
- dataset_short_names = [f"{NAMESPACE}_{domain.lower()}" for domain in domains] - - for dataset_short_name in dataset_short_names: - log.info(f"Checking available versions for dataset {dataset_short_name}.") - # Ensure a dataset exist for each of the specified versions. - # And if a version is not specified, assume the latest for the new dataset, or second latest for the old. - version_old, version_new = find_and_check_available_versions_for_dataset( - dataset_short_name=dataset_short_name, version_old=version_old, version_new=version_new - ) - - # Get mapping of old grapher id variable to new grapher id variable. - grapher_variable_ids_mapping = get_grapher_variable_id_mapping_for_two_dataset_versions( - dataset_short_name=dataset_short_name, - version_old=version_old, # type: ignore - version_new=version_new, # type: ignore - ) - - if execute_revisions and len(grapher_variable_ids_mapping) > 0: - # Submit revisions to grapher db. - log.info(f"Creating chart revisions to map {len(grapher_variable_ids_mapping)} old variables to new ones.") - try: - create_and_submit_charts_revisions(mapping=grapher_variable_ids_mapping) - except IntegrityError: - log.error( - "Execution failed because some of the charts are already awaiting revision. " - "Go through the approval tool and re-run this script." - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "-e", - "--execute_revisions", - default=False, - action="store_true", - help="If given, execute chart revisions. 
Otherwise, simply print the log without starting any revisions.", - ) - args = parser.parse_args() - main( - domains=[DOMAIN], - version_old=VERSION_OLD, - version_new=VERSION_NEW, - execute_revisions=args.execute_revisions, - ) From ed19a50877c4c2767fe28faf5e30b86afd235fbf Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Fri, 22 Mar 2024 16:53:13 +0100 Subject: [PATCH 50/54] Fix bad format of description_key --- .../faostat/2024-03-14/additional_variables.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py index 58ddbc2594e..8617d95e0e7 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py +++ b/etl/steps/data/garden/faostat/2024-03-14/additional_variables.py @@ -140,15 +140,13 @@ def generate_area_used_for_production_per_crop_type(tb_qcl: Table) -> Table: ) # Prepare variable description. - descriptions = "Definitions by FAOSTAT:" + descriptions = "" for item in sorted(set(area_by_crop_type["item"])): - descriptions += f"\n\nItem: {item}" item_description = area_by_crop_type[area_by_crop_type["item"] == item]["item_description"].fillna("").iloc[0] if len(item_description) > 0: - descriptions += f"\nDescription: {item_description}" + descriptions += f"\n\n- {item}: {item_description}" - descriptions += f"\n\nMetric: {area_by_crop_type['element'].iloc[0]}" - descriptions += f"\nDescription: {area_by_crop_type['element_description'].iloc[0]}" + descriptions += f"\n\n- {area_by_crop_type['element'].iloc[0]}: {area_by_crop_type['element_description'].iloc[0]}" # Select the necessary columns, set an appropriate index, and sort conveniently. 
tb_area_by_crop_type = ( @@ -160,7 +158,7 @@ def generate_area_used_for_production_per_crop_type(tb_qcl: Table) -> Table: tb_area_by_crop_type.metadata.short_name = "area_used_per_crop_type" # Add table description to the indicator's key description. - tb_area_by_crop_type["area_used_for_production"].metadata.description_key = descriptions + tb_area_by_crop_type["area_used_for_production"].metadata.description_from_producer = descriptions return tb_area_by_crop_type @@ -483,10 +481,10 @@ def generate_food_available_for_consumption(tb_fbsc: Table) -> Table: # Prepare variable metadata. common_description = ( - "Data represents the average daily per capita supply of calories from the full range of " + "This data represents the average daily per capita supply of calories from the full range of " "commodities, grouped by food categories. Note that these figures do not correct for waste at the " - "household/consumption level so may not directly reflect the quantity of food finally consumed by a given " - "individual.\n\nSpecific food commodities have been grouped into higher-level categories." + "household or consumption level, so they may not directly reflect the quantity of food finally consumed by a " + "given individual.\n\nSpecific food commodities have been grouped into higher-level categories." 
) for group in FOOD_GROUPS: item_names = list(tb_fbsc[tb_fbsc["item_code"].isin(FOOD_GROUPS[group])]["item"].unique()) @@ -501,7 +499,7 @@ def generate_food_available_for_consumption(tb_fbsc: Table) -> Table: ].metadata.title = f"Daily caloric intake per person from {group.lower().replace('other', 'other commodities')}" tb_food_available_for_consumption[underscore(group)].metadata.unit = CONSUMPTION_UNIT tb_food_available_for_consumption[underscore(group)].metadata.short_unit = "kcal" - tb_food_available_for_consumption[underscore(group)].metadata.description_key = description + tb_food_available_for_consumption[underscore(group)].metadata.description_key = [description] return tb_food_available_for_consumption From c029fd34c05157e1beb520531f5b5bfb049b8bcd Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 25 Mar 2024 09:57:05 +0100 Subject: [PATCH 51/54] Improve snapshot descriptions --- docs/data/faostat.md | 11 ++++++++++- snapshots/faostat/2024-03-14/faostat_cahd.zip.dvc | 6 +++++- snapshots/faostat/2024-03-14/faostat_ei.zip.dvc | 4 +++- snapshots/faostat/2024-03-14/faostat_ek.zip.dvc | 6 +++++- snapshots/faostat/2024-03-14/faostat_emn.zip.dvc | 6 +++++- snapshots/faostat/2024-03-14/faostat_esb.zip.dvc | 6 +++++- snapshots/faostat/2024-03-14/faostat_fbs.zip.dvc | 6 +++++- snapshots/faostat/2024-03-14/faostat_fbsh.zip.dvc | 6 +++++- snapshots/faostat/2024-03-14/faostat_fo.zip.dvc | 6 +++++- snapshots/faostat/2024-03-14/faostat_fs.zip.dvc | 8 +++++++- snapshots/faostat/2024-03-14/faostat_ic.zip.dvc | 6 +++++- snapshots/faostat/2024-03-14/faostat_lc.zip.dvc | 4 +++- snapshots/faostat/2024-03-14/faostat_qcl.zip.dvc | 12 +++++++++++- snapshots/faostat/2024-03-14/faostat_qi.zip.dvc | 4 +++- snapshots/faostat/2024-03-14/faostat_qv.zip.dvc | 4 +++- snapshots/faostat/2024-03-14/faostat_rfb.zip.dvc | 4 +++- snapshots/faostat/2024-03-14/faostat_rfn.zip.dvc | 4 +++- snapshots/faostat/2024-03-14/faostat_rl.zip.dvc | 4 +++- snapshots/faostat/2024-03-14/faostat_rp.zip.dvc 
| 4 +++- snapshots/faostat/2024-03-14/faostat_rt.zip.dvc | 8 +++++++- snapshots/faostat/2024-03-14/faostat_scl.zip.dvc | 8 +++++++- snapshots/faostat/2024-03-14/faostat_sdgb.zip.dvc | 4 +++- snapshots/faostat/2024-03-14/faostat_tcl.zip.dvc | 6 +++++- snapshots/faostat/2024-03-14/faostat_ti.zip.dvc | 6 +++++- 24 files changed, 119 insertions(+), 24 deletions(-) diff --git a/docs/data/faostat.md b/docs/data/faostat.md index 85458ca1c54..16b8d8183b6 100644 --- a/docs/data/faostat.md +++ b/docs/data/faostat.md @@ -334,7 +334,16 @@ accept or reject changes. In the future this could be handled automatically by one of the existing scripts. -14. Archive unnecessary DB datasets, and move old, unnecessary etl steps in the dag to the archive dag. +14. Manually improve the metadata, if needed. For example, inspect the snapshot metadata files, to improve the dataset descriptions: + - Insert line break after first sentence (which usually is the general description of the dataset). + - Remove spurious symbols. + - Insert spaces where missing (e.g. "end of sentence.Start of next sentence"). + - Remove double spaces (e.g. "end of sentence. Start of next sentence"). + - Insert line breaks to create paragraphs (by context). + - Remove incomplete sentences (sometimes there are half sentences that may have been added by mistake). + - Remove mentions to links in FAOSTAT page (since they will not be seen from grapher). + +15. Archive unnecessary DB datasets, and move old, unnecessary etl steps in the dag to the archive dag. 
```bash python etl/scripts/faostat/archive_old_datasets.py -e ``` diff --git a/snapshots/faostat/2024-03-14/faostat_cahd.zip.dvc b/snapshots/faostat/2024-03-14/faostat_cahd.zip.dvc index 7cae8b2ca67..01045a2b5c0 100644 --- a/snapshots/faostat/2024-03-14/faostat_cahd.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_cahd.zip.dvc @@ -3,7 +3,11 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Cost and Affordability of a Healthy Diet: Cost and Affordability of a Healthy Diet (CoAHD)' description: |- - Indicators on the cost and affordability of a healthy diet are estimated in each country and show the population’s physical and economic access to least expensive locally available foods to meet requirements for a healthy diet, as defined in food-based dietary guidelines (FBDGs). The indicators use observed retail food consumer prices and income distributions to provide an operational measure of people’s access to locally available foods in the proportions needed for health. These indicators support efforts within the framework of the Sustainable Development Goals (SDGs) to end hunger, achieve food security and improved nutrition, and promote sustainable agriculture by 2030 (SDG 2). They also support the monitoring of progress towards the objective of transforming agrifood systems by promoting “nutrition-sensitive agriculture”. For definitions of these indicators, see Definitions and standards. + Indicators on the cost and affordability of a healthy diet are estimated in each country and show the population's physical and economic access to least expensive locally available foods to meet requirements for a healthy diet, as defined in food-based dietary guidelines (FBDGs). + + The indicators use observed retail food consumer prices and income distributions to provide an operational measure of people's access to locally available foods in the proportions needed for health. 
+ + These indicators support efforts within the framework of the Sustainable Development Goals (SDGs) to end hunger, achieve food security and improved nutrition, and promote sustainable agriculture by 2030 (SDG 2). They also support the monitoring of progress towards the objective of transforming agrifood systems by promoting “nutrition-sensitive agriculture”. For definitions of these indicators, see Definitions and standards. citation_full: |- Food and Agriculture Organization of the United Nations - Cost and Affordability of a Healthy Diet: Cost and Affordability of a Healthy Diet (CoAHD) (2023). attribution_short: FAOSTAT diff --git a/snapshots/faostat/2024-03-14/faostat_ei.zip.dvc b/snapshots/faostat/2024-03-14/faostat_ei.zip.dvc index 9c3e3c96596..de0fa86b2f5 100644 --- a/snapshots/faostat/2024-03-14/faostat_ei.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_ei.zip.dvc @@ -3,7 +3,9 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Climate Change: Agrifood systems emissions: Emissions intensities' description: |- - The FAOSTAT domain Emissions intensities contains analytical data on the intensity of greenhouse gas (GHG) emissions by agricultural commodity. This indicator is defined as greenhouse gas emissions per kg of product. Data are available for a set of agricultural commodities (e.g. rice and other cereals, meat, milk, eggs), by country, with global coverage and relative to the period 1961–2020. + The FAOSTAT domain Emissions intensities contains analytical data on the intensity of greenhouse gas (GHG) emissions by agricultural commodity. + + This indicator is defined as greenhouse gas emissions per kg of product. Data are available for a set of agricultural commodities (e.g. rice and other cereals, meat, milk, eggs), by country, with global coverage and relative to the period 1961-2020. 
citation_full: |- Food and Agriculture Organization of the United Nations - Climate Change: Agrifood systems emissions: Emissions intensities (2023). attribution_short: FAOSTAT diff --git a/snapshots/faostat/2024-03-14/faostat_ek.zip.dvc b/snapshots/faostat/2024-03-14/faostat_ek.zip.dvc index 65c4e3ed4f4..329c8e1552a 100644 --- a/snapshots/faostat/2024-03-14/faostat_ek.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_ek.zip.dvc @@ -3,7 +3,11 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Land, Inputs and Sustainability: Livestock Patterns' description: |- - The Livestock Patterns domain of FAOSTAT contains data on livestock numbers, shares of major livestock species and densities of livestock units in the agricultural land area. Values are calculated using Livestock Units (LSU), which facilitate aggregating information for different livestock types. Data are available by country, with global coverage, for the period 1961 to present, with annual updates. This methodology applies the LSU coefficients reported in the "Guidelines for the preparation of livestock sector reviews" (FAO, 2011). From this publication, LSU coefficients are computed by livestock type and by country. The reference unit used for the calculation of livestock units (=1 LSU) is the grazing equivalent of one adult dairy cow producing 3000 kg of milk annually, fed without additional concentrated foodstuffs. FAOSTAT agri-environmental indicators on livestock patterns closely follow the structure of the indicators in EUROSTAT. + The Livestock Patterns domain of FAOSTAT contains data on livestock numbers, shares of major livestock species and densities of livestock units in the agricultural land area. + + Values are calculated using Livestock Units (LSU), which facilitate aggregating information for different livestock types. Data are available by country, with global coverage, for the period 1961 to present, with annual updates. 
This methodology applies the LSU coefficients reported in the "Guidelines for the preparation of livestock sector reviews" (FAO, 2011). From this publication, LSU coefficients are computed by livestock type and by country. The reference unit used for the calculation of livestock units (=1 LSU) is the grazing equivalent of one adult dairy cow producing 3000 kg of milk annually, fed without additional concentrated foodstuffs. + + FAOSTAT agri-environmental indicators on livestock patterns closely follow the structure of the indicators in EUROSTAT. citation_full: 'Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Livestock Patterns (2023).' attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/EK diff --git a/snapshots/faostat/2024-03-14/faostat_emn.zip.dvc b/snapshots/faostat/2024-03-14/faostat_emn.zip.dvc index 15f63877213..27dd317f84b 100644 --- a/snapshots/faostat/2024-03-14/faostat_emn.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_emn.zip.dvc @@ -3,7 +3,11 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Land, Inputs and Sustainability: Livestock Manure' description: |- - The Livestock Manure domain of FAOSTAT contains estimates of nitrogen (N) inputs to agricultural soils from livestock manure. Data on the N losses to air and water are also disseminated. These estimates are compiled using official FAOSTAT statistics of animal stocks and by applying the internationally approved Guidelines of the Intergovernmental Panel on Climate Change (IPCC). Data are available by country, with global coverage and relative to the period 1961–2020, with annual updates. 
The following elements are disseminated: 1) Stocks; 2) Amount excreted in manure (N content); 3) Manure left on pasture (N content); 4) Manure left on pasture that volatilises (N content); 5) Manure left on pasture that leaches (N content); 6) Manure treated (N content); 7) Losses from manure treated (N content); 8) Manure applied to soils (N content); 9) Manure applied to soils that volatilises (N content); 10) Manure applied to soils that leaches (N content). + The Livestock Manure domain of FAOSTAT contains estimates of nitrogen (N) inputs to agricultural soils from livestock manure. + + Data on the N losses to air and water are also disseminated. These estimates are compiled using official FAOSTAT statistics of animal stocks and by applying the internationally approved Guidelines of the Intergovernmental Panel on Climate Change (IPCC). Data are available by country, with global coverage and relative to the period 1961-2020, with annual updates. + + The following elements are disseminated: 1) Stocks; 2) Amount excreted in manure (N content); 3) Manure left on pasture (N content); 4) Manure left on pasture that volatilizes (N content); 5) Manure left on pasture that leaches (N content); 6) Manure treated (N content); 7) Losses from manure treated (N content); 8) Manure applied to soils (N content); 9) Manure applied to soils that volatilizes (N content); 10) Manure applied to soils that leaches (N content). citation_full: 'Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Livestock Manure (2023).' 
attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/EMN diff --git a/snapshots/faostat/2024-03-14/faostat_esb.zip.dvc b/snapshots/faostat/2024-03-14/faostat_esb.zip.dvc index d17f474386d..8032c088af7 100644 --- a/snapshots/faostat/2024-03-14/faostat_esb.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_esb.zip.dvc @@ -3,7 +3,11 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Land, Inputs and Sustainability: Cropland Nutrient Balance' description: |- - 2022 Cropland nutrient budget analytical briefThe Cropland Nutrient Budget domain contains information on the flows of nitrogen, phosphorus, and potassium from synthetic fertilizer, manure applied to soils, atmospheric deposition, crop removal, and biological fixation over cropland and per unit area of cropland. The flows are aggregated to total inputs and total outputs, from which the overall nutrient budget and nutrient use efficiency on cropland are calculated. Statistics are disseminated in units of tonnes and in kg/ha, as appropriate. Nutrient use efficiency is expressed as a fraction (%). Data are available by country, with global coverage relative to the period 1961-2020, with annual updates. + The Cropland Nutrient Budget domain contains information on the flows of nitrogen, phosphorus, and potassium from synthetic fertilizer, manure applied to soils, atmospheric deposition, crop removal, and biological fixation over cropland and per unit area of cropland. + + The flows are aggregated to total inputs and total outputs, from which the overall nutrient budget and nutrient use efficiency on cropland are calculated. Statistics are disseminated in units of tonnes and in kg/ha, as appropriate. Nutrient use efficiency is expressed as a fraction (%). + + Data are available by country, with global coverage relative to the period 1961-2020, with annual updates. 
citation_full: |- Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Cropland Nutrient Balance (2023). attribution_short: FAOSTAT diff --git a/snapshots/faostat/2024-03-14/faostat_fbs.zip.dvc b/snapshots/faostat/2024-03-14/faostat_fbs.zip.dvc index 8d92684a14a..3d2c6db6383 100644 --- a/snapshots/faostat/2024-03-14/faostat_fbs.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_fbs.zip.dvc @@ -3,7 +3,11 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Food Balances: Food Balances (2010-)' description: |- - Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content. + Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. 
+ + The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. + + On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content. citation_full: 'Food and Agriculture Organization of the United Nations - Food Balances: Food Balances (2010-) (2023).' attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/FBS diff --git a/snapshots/faostat/2024-03-14/faostat_fbsh.zip.dvc b/snapshots/faostat/2024-03-14/faostat_fbsh.zip.dvc index 41d13002034..d2b1dbaa6da 100644 --- a/snapshots/faostat/2024-03-14/faostat_fbsh.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_fbsh.zip.dvc @@ -3,7 +3,11 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Food Balances: Food Balances (-2013, old methodology and population)' description: |- - Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. 
each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content. + Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. + + The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. + + On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. 
The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content. citation_full: |- Food and Agriculture Organization of the United Nations - Food Balances: Food Balances (-2013, old methodology and population) (2023). attribution_short: FAOSTAT diff --git a/snapshots/faostat/2024-03-14/faostat_fo.zip.dvc b/snapshots/faostat/2024-03-14/faostat_fo.zip.dvc index 0d77ef5ed04..d653d5379be 100644 --- a/snapshots/faostat/2024-03-14/faostat_fo.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_fo.zip.dvc @@ -3,7 +3,11 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Forestry: Forestry Production and Trade' description: |- - The database contains data on the production and trade in roundwood and in primary wood and paper products for all countries and territories in the world.The main types of primary forest products included in this database are roundwood, sawnwood, wood-based panels, pulp, and paper and paperboard. These products are detailed further and defined in the Joint Forest Sector Questionnaire (JFSQ) (https://www.fao.org/forestry/statistics/80572/en/). The database contains details of the following topics: - Roundwood removals (production) by coniferous and non-coniferous wood and assortments, - production and trade in industrial roundwood, sawnwood, wood-based panels, wood charcoal, pulp, paper and paperboard, and other products. 
More detailed information on wood products, including definitions, can be found at https://www.fao.org/forestry/statistics/80572/en + The database contains data on the production and trade in roundwood and in primary wood and paper products for all countries and territories in the world. + + The main types of primary forest products included in this database are roundwood, sawnwood, wood-based panels, pulp, and paper and paperboard. These products are detailed further and defined in [the Joint Forest Sector Questionnaire (JFSQ)](https://www.fao.org/forestry/statistics/80572/en/). + + The database contains details of the following topics: - Roundwood removals (production) by coniferous and non-coniferous wood and assortments, - production and trade in industrial roundwood, sawnwood, wood-based panels, wood charcoal, pulp, paper and paperboard, and other products. More detailed information on wood products, including definitions, can be found at: https://www.fao.org/forestry/statistics/80572/en citation_full: 'Food and Agriculture Organization of the United Nations - Forestry: Forestry Production and Trade (2023).' attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/FO diff --git a/snapshots/faostat/2024-03-14/faostat_fs.zip.dvc b/snapshots/faostat/2024-03-14/faostat_fs.zip.dvc index 727172e82df..4278ced6489 100644 --- a/snapshots/faostat/2024-03-14/faostat_fs.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_fs.zip.dvc @@ -3,7 +3,13 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Food Security and Nutrition: Suite of Food Security Indicators' description: |- - The Suite of Food Security Indicators presents the core set of food security indicators. 
Following the recommendation of experts gathered in the Committee on World Food Security (CFS) Round Table on hunger measurement, hosted at FAO headquarters in September 2011, an initial set of indicators aiming to capture various aspects of food insecurity is presented here. The choice of the indicators has been informed by expert judgment and the availability of data with sufficient coverage to enable comparisons across regions and over time. Many of these indicators are produced and published elsewhere by FAO and other international organizations. They are reported here in a single database with the aim of building a wide food security information system. More indicators will be added to this set as more data will become available. Indicators are classified along the four dimensions of food security -- availability, access, utilization and stability. For definitions of these indicators, see Definitions and standards below (under Item). + The Suite of Food Security Indicators presents the core set of food security indicators. + + Following the recommendation of experts gathered in the Committee on World Food Security (CFS) Round Table on hunger measurement, hosted at FAO headquarters in September 2011, an initial set of indicators aiming to capture various aspects of food insecurity is presented here. The choice of the indicators has been informed by expert judgment and the availability of data with sufficient coverage to enable comparisons across regions and over time. + + Many of these indicators are produced and published elsewhere by FAO and other international organizations. They are reported here in a single database with the aim of building a wide food security information system. More indicators will be added to this set as more data will become available. + + Indicators are classified along the four dimensions of food security -- availability, access, utilization and stability. 
citation_full: |- Food and Agriculture Organization of the United Nations - Food Security and Nutrition: Suite of Food Security Indicators (2023). attribution_short: FAOSTAT diff --git a/snapshots/faostat/2024-03-14/faostat_ic.zip.dvc b/snapshots/faostat/2024-03-14/faostat_ic.zip.dvc index cca81e228e2..74e4b0d651f 100644 --- a/snapshots/faostat/2024-03-14/faostat_ic.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_ic.zip.dvc @@ -3,7 +3,11 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Investment: Credit to Agriculture' description: |- - The Credit to Agriculture dataset provides national data for over 130 countries on the amount of loans provided by the private/commercial banking sector to producers in agriculture, forestry and fishing, including household producers, cooperatives, and agro-businesses. For some countries, the three subsectors of agriculture, forestry, and fishing are completely specified. In other cases, complete disaggregations are not available. The dataset also provides statistics on the total credit to all industries, indicators on the share of credit to agricultural producers, and an agriculture orientation index (AOI) for credit that normalizes the share of credit to agriculture over total credit by dividing it by the share of agriculture in gross domestic product (GDP). As such, it can provide a more accurate indication of the relative importance that banking sectors place on financing the sector. An AOI lower than 1 indicates that the agriculture sector receives a credit share lower than its contribution to the economy, while an AOI greater than 1 indicates a credit share to the agriculture sector greater than its economic contribution. + The Credit to Agriculture dataset provides national data for over 130 countries on the amount of loans provided by the private/commercial banking sector to producers in agriculture, forestry and fishing, including household producers, cooperatives, and agro-businesses. 
+ + For some countries, the three subsectors of agriculture, forestry, and fishing are completely specified. In other cases, complete disaggregations are not available. + + The dataset also provides statistics on the total credit to all industries, indicators on the share of credit to agricultural producers, and an agriculture orientation index (AOI) for credit that normalizes the share of credit to agriculture over total credit by dividing it by the share of agriculture in gross domestic product (GDP). As such, it can provide a more accurate indication of the relative importance that banking sectors place on financing the sector. An AOI lower than 1 indicates that the agriculture sector receives a credit share lower than its contribution to the economy, while an AOI greater than 1 indicates a credit share to the agriculture sector greater than its economic contribution. citation_full: 'Food and Agriculture Organization of the United Nations - Investment: Credit to Agriculture (2023).' attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/IC diff --git a/snapshots/faostat/2024-03-14/faostat_lc.zip.dvc b/snapshots/faostat/2024-03-14/faostat_lc.zip.dvc index 02f13518fac..3f37f5d6e92 100644 --- a/snapshots/faostat/2024-03-14/faostat_lc.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_lc.zip.dvc @@ -3,7 +3,9 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Land, Inputs and Sustainability: Land Cover' description: |- - The FAOSTAT domain Land Cover under the Agri-Environmental Indicators section contains land cover information organized by the land cover classes of the international standard system for Environmental and Economic Accounting Central Framework (SEEA CF). 
The land cover information is compiled from publicly available Global Land Cover (GLC) maps: a) MODIS land cover types based on the Land Cover Classification System, LCCS (2001–2021); b) The European Spatial Agency (ESA) Climate Change Initiative (CCI) annual land cover maps (1992–2020) produced by the Université catholique de Louvain (UCL)-Geomatics and now under the European Copernicus Program; c) The annual land cover maps which were produced under the European Copernicus Global Land Service (CGLS) (CGLS land cover, containing discrete land cover categorization for the period 2015–2019), with spatial resolution 100m; and d) 4) The WorldCover maps of the European Space Agency —available for the years 2020 and 2021, produced at 10m resolution. + The FAOSTAT domain Land Cover under the Agri-Environmental Indicators section contains land cover information organized by the land cover classes of the international standard system for Environmental and Economic Accounting Central Framework (SEEA CF). + + The land cover information is compiled from publicly available Global Land Cover (GLC) maps: a) MODIS land cover types based on the Land Cover Classification System, LCCS (2001-2021); b) The European Space Agency (ESA) Climate Change Initiative (CCI) annual land cover maps (1992-2020) produced by the Université catholique de Louvain (UCL)-Geomatics and now under the European Copernicus Program; c) The annual land cover maps which were produced under the European Copernicus Global Land Service (CGLS) (CGLS land cover, containing discrete land cover categorization for the period 2015-2019), with spatial resolution 100m; and d) The WorldCover maps of the European Space Agency — available for the years 2020 and 2021, produced at 10m resolution. citation_full: 'Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Land Cover (2023).'
attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/LC diff --git a/snapshots/faostat/2024-03-14/faostat_qcl.zip.dvc b/snapshots/faostat/2024-03-14/faostat_qcl.zip.dvc index 7efc45a5ac8..ba0daad737b 100644 --- a/snapshots/faostat/2024-03-14/faostat_qcl.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_qcl.zip.dvc @@ -3,7 +3,17 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Production: Crops and livestock products' description: |- - Crop and livestock statistics are recorded for 278 products, covering the following categories: 1) CROPS PRIMARY: Cereals, Citrus Fruit, Fibre Crops, Fruit, Oil Crops, Oil Crops and Cakes in Oil Equivalent, Pulses, Roots and Tubers, Sugar Crops, Treenuts and Vegetables. Data are expressed in terms of area harvested, production quantity and yield. Cereals: Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded. 2) CROPS PROCESSED: Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine. 3) LIVE ANIMALS: Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys. 
4) LIVESTOCK PRIMARY: Beeswax; Eggs (various types); Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); Offals, nes; Silk-worm cocoons, reelable; Skins (goat, sheep); Snails, not sea; Wool, greasy. 5) LIVESTOCK PROCESSED: Butter (of milk from sheep, goat, buffalo, cow); Cheese (of milk from goat, buffalo, sheep, cow milk); Cheese of skimmed cow milk; Cream fresh; Ghee (cow and buffalo milk); Lard; Milk (dry buttermilk, skimmed condensed, skimmed cow, skimmed dried, skimmed evaporated, whole condensed, whole dried, whole evaporated); Silk raw; Tallow; Whey (condensed and dry); Yoghurt. + Crop and livestock statistics are recorded for 278 products, covering the following categories: + + 1) Crops primary: Cereals, Citrus Fruit, Fibre Crops, Fruit, Oil Crops, Oil Crops and Cakes in Oil Equivalent, Pulses, Roots and Tubers, Sugar Crops, Treenuts and Vegetables. Data are expressed in terms of area harvested, production quantity and yield. Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded. + + 2) Crops processed: Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine. + + 3) Live animals: Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys. 
+ + 4) Livestock primary: Beeswax; Eggs (various types); Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); Offals, nes; Silk-worm cocoons, reelable; Skins (goat, sheep); Snails, not sea; Wool, greasy. + + 5) Livestock processed: Butter (of milk from sheep, goat, buffalo, cow); Cheese (of milk from goat, buffalo, sheep, cow milk); Cheese of skimmed cow milk; Cream fresh; Ghee (cow and buffalo milk); Lard; Milk (dry buttermilk, skimmed condensed, skimmed cow, skimmed dried, skimmed evaporated, whole condensed, whole dried, whole evaporated); Silk raw; Tallow; Whey (condensed and dry); Yoghurt. citation_full: 'Food and Agriculture Organization of the United Nations - Production: Crops and livestock products (2023).' attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/QCL diff --git a/snapshots/faostat/2024-03-14/faostat_qi.zip.dvc b/snapshots/faostat/2024-03-14/faostat_qi.zip.dvc index 4cb825a6815..7e1bab9b998 100644 --- a/snapshots/faostat/2024-03-14/faostat_qi.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_qi.zip.dvc @@ -3,7 +3,9 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Production: Production Indices' description: |- - The FAO indices of agricultural production show the relative level of the aggregate volume of agricultural production for each year in comparison with the base period 2014-2016. Indices for meat production are computed based on data for production from indigenous animals. + The FAO indices of agricultural production show the relative level of the aggregate volume of agricultural production for each year in comparison with the base period 2014-2016. + + Indices for meat production are computed based on data for production from indigenous animals. 
citation_full: 'Food and Agriculture Organization of the United Nations - Production: Production Indices (2024).' attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/QI diff --git a/snapshots/faostat/2024-03-14/faostat_qv.zip.dvc b/snapshots/faostat/2024-03-14/faostat_qv.zip.dvc index e04c7483305..0e8313668fb 100644 --- a/snapshots/faostat/2024-03-14/faostat_qv.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_qv.zip.dvc @@ -3,7 +3,9 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Production: Value of Agricultural Production' description: |- - Values of agricultural production are calculated based on production data of primary commodities from Production domain and producer prices from Prices domain. The livestock value of production is measured in terms of indigenous meat. + Values of agricultural production are calculated based on production data of primary commodities from Production domain and producer prices from Prices domain. + + The livestock value of production is measured in terms of indigenous meat. citation_full: 'Food and Agriculture Organization of the United Nations - Production: Value of Agricultural Production (2024).' attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/QV diff --git a/snapshots/faostat/2024-03-14/faostat_rfb.zip.dvc b/snapshots/faostat/2024-03-14/faostat_rfb.zip.dvc index 34cf2b0c69f..e2cefacf391 100644 --- a/snapshots/faostat/2024-03-14/faostat_rfb.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_rfb.zip.dvc @@ -3,7 +3,9 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Land, Inputs and Sustainability: Fertilizers by Product' description: |- - The Fertilizers by Product dataset contains information on the Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers products, over the time series 2002-present. The fertilizer statistics data are for a set of 23 product categories. 
Both straight and compound fertilizers are included. There is information available about methodology at: https://fenixservices.fao.org/faostat/static/documents/RFB/RFB_EN_README.pdf. + The Fertilizers by Product dataset contains information on the Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers products, over the time series 2002-present. + + The fertilizer statistics data are for a set of 23 product categories. Both straight and compound fertilizers are included. There is information available about methodology at: https://fenixservices.fao.org/faostat/static/documents/RFB/RFB_EN_README.pdf citation_full: |- Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Fertilizers by Product (2023). attribution_short: FAOSTAT diff --git a/snapshots/faostat/2024-03-14/faostat_rfn.zip.dvc b/snapshots/faostat/2024-03-14/faostat_rfn.zip.dvc index 8f4c7024951..f245a1aed1f 100644 --- a/snapshots/faostat/2024-03-14/faostat_rfn.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_rfn.zip.dvc @@ -3,7 +3,9 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Land, Inputs and Sustainability: Fertilizers by Nutrient' description: |- - The Fertilizers by Nutrient dataset contains information on the totals in nutrients for Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers, over the time series 1961-present. The data are provided for the three primary plant nutrients: nitrogen (N), phosphorus (expressed as P2O5) and potassium (expressed as K2O). Both straight and compound fertilizers are included. There is information on the methodology available at: https://fenixservices.fao.org/faostat/static/documents/RFN/RFN_EN_README.pdf + The Fertilizers by Nutrient dataset contains information on the totals in nutrients for Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers, over the time series 1961-present. 
+ + The data are provided for the three primary plant nutrients: nitrogen (N), phosphorus (expressed as P2O5) and potassium (expressed as K2O). Both straight and compound fertilizers are included. There is information on the methodology available at: https://fenixservices.fao.org/faostat/static/documents/RFN/RFN_EN_README.pdf citation_full: |- Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Fertilizers by Nutrient (2023). attribution_short: FAOSTAT diff --git a/snapshots/faostat/2024-03-14/faostat_rl.zip.dvc b/snapshots/faostat/2024-03-14/faostat_rl.zip.dvc index 50472f3f795..3b75829f69f 100644 --- a/snapshots/faostat/2024-03-14/faostat_rl.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_rl.zip.dvc @@ -3,7 +3,9 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Land, Inputs and Sustainability: Land Use' description: |- - The FAOSTAT Land Use domain contains data on forty-four categories of land use, irrigation and agricultural practices and five indicators relevant to monitor agriculture, forestry and fisheries activities at national, regional and global level. Data are available by country and year, with global coverage and annual updates. + The FAOSTAT Land Use domain contains data on forty-four categories of land use, irrigation and agricultural practices and five indicators relevant to monitor agriculture, forestry and fisheries activities at national, regional and global level. + + Data are available by country and year, with global coverage and annual updates. citation_full: 'Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Land Use (2024).' 
attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/RL diff --git a/snapshots/faostat/2024-03-14/faostat_rp.zip.dvc b/snapshots/faostat/2024-03-14/faostat_rp.zip.dvc index a9df3aae275..b2a30c128d2 100644 --- a/snapshots/faostat/2024-03-14/faostat_rp.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_rp.zip.dvc @@ -3,7 +3,9 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Land, Inputs and Sustainability: Pesticides Use' description: |- - The Pesticides Use database includes data on the use of major pesticide groups (Insecticides, Herbicides, Fungicides, Plant growth regulators and Rodenticides) and of relevant chemical families. Data report the quantities (in tonnes of active ingredients) + The Pesticides Use database includes data on the use of major pesticide groups (Insecticides, Herbicides, Fungicides, Plant growth regulators and Rodenticides) and of relevant chemical families. + + Data report the quantities (in tonnes of active ingredients). citation_full: 'Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Pesticides Use (2024).' 
attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/RP diff --git a/snapshots/faostat/2024-03-14/faostat_rt.zip.dvc b/snapshots/faostat/2024-03-14/faostat_rt.zip.dvc index ca5d5143ce7..b2d0ab0b4cd 100644 --- a/snapshots/faostat/2024-03-14/faostat_rt.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_rt.zip.dvc @@ -3,7 +3,13 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Land, Inputs and Sustainability: Pesticides Trade' description: |- - This domain contains data on pesticides and covers two different categories: pesticides traded in form or packagingfor retail sale or as preparations or articles, and pesticides traded as separate chemically defined compounds (if relevant for the Rotterdam Convention on the Prior Informed Consent Procedure for Certain Hazardous Chemicals and Pesticides in International Trade). The pesticides traded for retail sale or as preparations or articles are those classified under code 38.08 in the Harmonized System Nomenclature (HS) and include: hazardous pesticides, insecticides, fungicides, herbicides, disinfectants and other. For these pesticides, this domain contains trade data (imports and exports) in values only (current 1000 US dollars), and the time series extends from 1961 onwards. The pesticides traded as separate chemically defined compounds are those listed in Annex III of the Rotterdam Convention (excluding industrial chemicals) and therefore subject to the Prior Informed Consent (PIC) procedure. The correspondence with the HS Nomenclature is shown in the table at the Related Documents section. For these pesticides, this domain contains trade data (imports and exports) in both value (current 1000 US dollars) and quantity (net weight in tonnes), and the time series extends from 2007 onwards. 
+ This domain contains data on pesticides and covers two different categories: pesticides traded in form or packaging for retail sale or as preparations or articles, and pesticides traded as separate chemically defined compounds (if relevant for the Rotterdam Convention on the Prior Informed Consent Procedure for Certain Hazardous Chemicals and Pesticides in International Trade). + + The pesticides traded for retail sale or as preparations or articles are those classified under code 38.08 in the Harmonized System Nomenclature (HS) and include: hazardous pesticides, insecticides, fungicides, herbicides, disinfectants and other. For these pesticides, this domain contains trade data (imports and exports) in values only (current 1000 US dollars), and the time series extends from 1961 onwards. + + The pesticides traded as separate chemically defined compounds are those listed in Annex III of the Rotterdam Convention (excluding industrial chemicals) and therefore subject to the Prior Informed Consent (PIC) procedure. + + The correspondence with the HS Nomenclature is shown in the table at the Related Documents section. For these pesticides, this domain contains trade data (imports and exports) in both value (current 1000 US dollars) and quantity (net weight in tonnes), and the time series extends from 2007 onwards. citation_full: 'Food and Agriculture Organization of the United Nations - Land, Inputs and Sustainability: Pesticides Trade (2023).' 
attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/RT diff --git a/snapshots/faostat/2024-03-14/faostat_scl.zip.dvc b/snapshots/faostat/2024-03-14/faostat_scl.zip.dvc index b1bb9cb7eba..30c0ac81643 100644 --- a/snapshots/faostat/2024-03-14/faostat_scl.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_scl.zip.dvc @@ -3,7 +3,13 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Food Balances: Supply Utilization Accounts (2010-)' description: |- - Supply Utilization Accounts (SUA's) present a comprehensive picture of the pattern of a country's food supply during a specified reference period. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content. + Supply Utilization Accounts (SUA's) present a comprehensive picture of the pattern of a country's food supply during a specified reference period. + + The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. 
+ + On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. + + The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content. citation_full: 'Food and Agriculture Organization of the United Nations - Food Balances: Supply Utilization Accounts (2010-) (2023).' attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/SCL diff --git a/snapshots/faostat/2024-03-14/faostat_sdgb.zip.dvc b/snapshots/faostat/2024-03-14/faostat_sdgb.zip.dvc index 51c00325eba..86940094cf8 100644 --- a/snapshots/faostat/2024-03-14/faostat_sdgb.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_sdgb.zip.dvc @@ -3,7 +3,9 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'SDG Indicators: SDG Indicators' description: |- - As the custodian agency of 21 SDG indicators, the Food and Agriculture Organization of the United Nations (FAO) is responsible for curating and refining the methodologies of these indicators, collecting data from national sources, ensuring their quality and compatibility with applicable standards and classifications, and disseminating data at global level. This FAOSTAT domain complements the global SDG database administered by the United Nations Statistical Division, as well as FAO’s SDG indicators portal, by providing access to the available data for each of these indicators. 
Please click the metadata link on the right hand navigation column for an abridged version of the methodology for compiling each of these indicators, a description of data sources and the relevant contact persons responsible for each indicator in the Organization. For a more detailed description of the methodology, data sources and reporting procedures, please follow the link to the official SDG indicator metadata document available at the bottom of each summary metadata page in the document on the right. + As the custodian agency of 21 SDG indicators, the Food and Agriculture Organization of the United Nations (FAO) is responsible for curating and refining the methodologies of these indicators, collecting data from national sources, ensuring their quality and compatibility with applicable standards and classifications, and disseminating data at global level. + + This FAOSTAT domain complements the global SDG database administered by the United Nations Statistical Division, as well as FAO's SDG indicators portal, by providing access to the available data for each of these indicators. citation_full: 'Food and Agriculture Organization of the United Nations - SDG Indicators: SDG Indicators (2023).' attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/SDGB diff --git a/snapshots/faostat/2024-03-14/faostat_tcl.zip.dvc b/snapshots/faostat/2024-03-14/faostat_tcl.zip.dvc index 7543a8f78e7..9758e2ab935 100644 --- a/snapshots/faostat/2024-03-14/faostat_tcl.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_tcl.zip.dvc @@ -3,7 +3,11 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Trade: Crops and livestock products' description: |- - The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. 
This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world. + The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. + + The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. + + The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world. citation_full: 'Food and Agriculture Organization of the United Nations - Trade: Crops and livestock products (2023).' attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/TCL diff --git a/snapshots/faostat/2024-03-14/faostat_ti.zip.dvc b/snapshots/faostat/2024-03-14/faostat_ti.zip.dvc index 628f8d1e058..72ece2d50bf 100644 --- a/snapshots/faostat/2024-03-14/faostat_ti.zip.dvc +++ b/snapshots/faostat/2024-03-14/faostat_ti.zip.dvc @@ -3,7 +3,11 @@ meta: producer: Food and Agriculture Organization of the United Nations title: 'Trade: Trade Indices' description: |- - The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. 
The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world. + The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. + + The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. + + The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world. citation_full: 'Food and Agriculture Organization of the United Nations - Trade: Trade Indices (2023).' 
attribution_short: FAOSTAT url_main: http://www.fao.org/faostat/en/#data/TI From f0924be040d4512ccf1fdc1395a4fb833239bcd3 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 25 Mar 2024 11:05:47 +0100 Subject: [PATCH 52/54] Update custom metadata files and documentation --- docs/data/faostat.md | 43 +++--- .../faostat/2024-03-14/custom_datasets.csv | 132 +++++++++++++++--- .../faostat/2024-03-14/custom_items.csv | 4 +- 3 files changed, 133 insertions(+), 46 deletions(-) diff --git a/docs/data/faostat.md b/docs/data/faostat.md index 16b8d8183b6..7603e24ce37 100644 --- a/docs/data/faostat.md +++ b/docs/data/faostat.md @@ -234,7 +234,16 @@ If no dataset requires an update, the workflow stops here. downloading this domain, add it to the list `INCLUDED_DATASETS_CODES`. Then replace variables used in those charts with the new ones. -2. Create new meadow steps. +2. Manually inspect the snapshot metadata files, and fix common issues in dataset descriptions: + - Insert line break after first sentence (which usually is the general description of the dataset). + - Remove spurious symbols. + - Insert spaces where missing (e.g. "end of sentence.Start of next sentence"). + - Remove double spaces (e.g. "end of sentence. Start of next sentence"). + - Insert line breaks to create paragraphs (by context). + - Remove incomplete sentences (sometimes there are half sentences that may have been added by mistake). + - Remove mentions to links in FAOSTAT page (since they will not be seen from grapher). + +3. Create new meadow steps. !!! note @@ -243,19 +252,19 @@ If no dataset requires an update, the workflow stops here. ```bash python etl/scripts/faostat/create_new_steps.py -c meadow -a ``` -3. Run the new etl meadow steps, to generate the meadow datasets. +4. Run the new etl meadow steps, to generate the meadow datasets. ```bash etl run meadow/faostat/YYYY-MM-DD ``` -4. Create new garden steps. +5. Create new garden steps. 
```bash python etl/scripts/faostat/create_new_steps.py -c garden ``` -5. Run the new etl garden steps, to generate the garden datasets. +6. Run the new etl garden steps, to generate the garden datasets. ```bash etl run garden/faostat/YYYY-MM-DD @@ -277,7 +286,7 @@ If no dataset requires an update, the workflow stops here. If a new domain has been added to this version, you may need to manually add its meadow step as a dependency of garden/faostat/YYYY-MM-DD/faostat_metadata in the dag (this is a known bug). -6. Inspect and update any possible changes of dataset/item/element/unit names and descriptions. +7. Inspect and update any possible changes of dataset/item/element/unit names and descriptions. ```bash python etl/scripts/faostat/update_custom_metadata.py @@ -288,19 +297,19 @@ If no dataset requires an update, the workflow stops here. etl run garden/faostat/YYYY-MM-DD ``` -7. Create new grapher steps. +8. Create new grapher steps. ```bash python etl/scripts/faostat/create_new_steps.py -c grapher ``` -8. Run the new etl grapher steps, to generate the grapher charts. +9. Run the new etl grapher steps, to generate the grapher charts. ```bash etl run faostat/YYYY-MM-DD --grapher ``` -9. Generate chart revisions (showing a chart using an old version of a variable and the same chart using the new +10. Generate chart revisions (showing a chart using an old version of a variable and the same chart using the new version) for each dataset, to replace variables of a dataset from its second latest version to its latest version. ```bash @@ -311,12 +320,12 @@ version) for each dataset, to replace variables of a dataset from its second lat This step may raise errors (because of limitations in our chart revision tool). If so, continue to the next step and come back to this one again. Keep repeating these two steps until there are no more errors (which may happen after two iterations). -10. 
Use OWID's internal approval tool to visually inspect changes between the old and new versions of updated charts, and +11. Use OWID's internal approval tool to visually inspect changes between the old and new versions of updated charts, and accept or reject changes. -11. Update the explorers step `data://explorers/faostat/latest/food_explorer` (for the moment, this has to be done manually): Edit the version of its only dependency in the dag, so that it loads the latest garden step. It should be `data://garden/faostat/YYYY-MM-DD/faostat_food_explorer`. +12. Update the explorers step `data://explorers/faostat/latest/food_explorer` (for the moment, this has to be done manually): Edit the version of its only dependency in the dag, so that it loads the latest garden step. It should be `data://garden/faostat/YYYY-MM-DD/faostat_food_explorer`. -12. Run the new etl explorers step, to generate the csv files for the global food explorer. +13. Run the new etl explorers step, to generate the csv files for the global food explorer. ```bash etl run explorers/faostat/latest/food_explorer @@ -328,21 +337,12 @@ accept or reject changes. Sometimes items change in FAOSTAT. If that's the case, you may need to edit a file in the `owid-content` repository, namely `scripts/global-food-explorer/foods.csv`. Then, follow the instructions in `scripts/global-food-explorer/README.md`. -13. Manually create a new garden dataset of additional variables `additional_variables` for the new version, and update its metadata. Then create a new grapher dataset too. Manually update all other datasets that use any faostat dataset as a dependency. +14. Manually create a new garden dataset of additional variables `additional_variables` for the new version, and update its metadata. Then create a new grapher dataset too. Manually update all other datasets that use any faostat dataset as a dependency. !!! note In the future this could be handled automatically by one of the existing scripts. -14. 
Manually improve the metadata, if needed. For example, inspect the snapshot metadata files, to improve the dataset descriptions: - - Insert line break after first sentence (which usually is the general description of the dataset). - - Remove spurious symbols. - - Insert spaces where missing (e.g. "end of sentence.Start of next sentence"). - - Remove double spaces (e.g. "end of sentence. Start of next sentence"). - - Insert line breaks to create paragraphs (by context). - - Remove incomplete sentences (sometimes there are half sentences that may have been added by mistake). - - Remove mentions to links in FAOSTAT page (since they will not be seen from grapher). - 15. Archive unnecessary DB datasets, and move old, unnecessary etl steps in the dag to the archive dag. ```bash python etl/scripts/faostat/archive_old_datasets.py -e @@ -410,6 +410,7 @@ which contains the following columns: * In the `custom_datasets.csv` file, all datasets are included (unlike other `custom_*.csv` files, where only customized fields are included). * Any empty `owid_*` field in the file will be assumed to be replaced with its corresponding `fao_*` field. + * The `owid_dataset_description` will only be used in combination with `fao_dataset_description` (see function `prepare_dataset_description` in the garden `shared` module). Its output is currently not visible anywhere in the website. It can only be seen by accessing `dataset.metadata.description`. What is publicly visible is the snapshot description. 
#### Customizing item names and descriptions diff --git a/etl/steps/data/garden/faostat/2024-03-14/custom_datasets.csv b/etl/steps/data/garden/faostat/2024-03-14/custom_datasets.csv index c6fbabd9086..5c9d2f76c96 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/custom_datasets.csv +++ b/etl/steps/data/garden/faostat/2024-03-14/custom_datasets.csv @@ -1,35 +1,121 @@ dataset,fao_dataset_title,owid_dataset_title,fao_dataset_description,owid_dataset_description -faostat_cahd,Cost and Affordability of a Healthy Diet: Cost and Affordability of a Healthy Diet (CoAHD),,"Indicators on the cost and affordability of a healthy diet are estimated in each country and show the population’s physical and economic access to least expensive locally available foods to meet requirements for a healthy diet, as defined in food-based dietary guidelines (FBDGs). The indicators use observed retail food consumer prices and income distributions to provide an operational measure of people’s access to locally available foods in the proportions needed for health. These indicators support efforts within the framework of the Sustainable Development Goals (SDGs) to end hunger, achieve food security and improved nutrition, and promote sustainable agriculture by 2030 (SDG 2). They also support the monitoring of progress towards the objective of transforming agrifood systems by promoting “nutrition-sensitive agriculture”. For definitions of these indicators, see Definitions and standards.", +faostat_cahd,Cost and Affordability of a Healthy Diet: Cost and Affordability of a Healthy Diet (CoAHD),,"Indicators on the cost and affordability of a healthy diet are estimated in each country and show the population's physical and economic access to least expensive locally available foods to meet requirements for a healthy diet, as defined in food-based dietary guidelines (FBDGs). 
+ +The indicators use observed retail food consumer prices and income distributions to provide an operational measure of people's access to locally available foods in the proportions needed for health. + +These indicators support efforts within the framework of the Sustainable Development Goals (SDGs) to end hunger, achieve food security and improved nutrition, and promote sustainable agriculture by 2030 (SDG 2). They also support the monitoring of progress towards the objective of transforming agrifood systems by promoting “nutrition-sensitive agriculture”. For definitions of these indicators, see Definitions and standards.", faostat_ef,"Land, Inputs and Sustainability: Fertilizers indicators - FAO (2022)",Agri-Environmental Indicators: Fertilizers indicators,"The FAOSTAT domain Fertilizers Indicators provides information on three ratios: a) the ratio between the totals by nutrient of agricultural use of chemical or mineral fertilizers, reported in the FAOSTAT domain “Inputs/Fertilizers by Nutrient” for nitrogen (N), phosphorus (expressed as P2O5) and potassium (expressed as K2O) and the area of cropland reported in the FAOSTAT domain “Inputs/Land Use”; b) The ratio of fertilizers use and the annual population reported in the FAOSTAT domain “Population and Employment/Population”; and c) The ratio of fertilizers use and the value of agricultural production reported in the FAOSTAT domain “Production/Value of Agricultural Production”. Data are available at national, regional, and global level over the time series 1961-present.","Agri-Environmental Indicators: Fertilizers indicators This dataset describes the use of chemical and mineral fertilizers per area of cropland (which corresponds to the sum of arable land and permanent crops) at national, regional, and global level."
-faostat_ei,Climate Change: Agrifood systems emissions: Emissions intensities,Agri-Environmental Indicators: Emissions intensities,"The FAOSTAT domain Emissions intensities contains analytical data on the intensity of greenhouse gas (GHG) emissions by agricultural commodity. This indicator is defined as greenhouse gas emissions per kg of product. Data are available for a set of agricultural commodities (e.g. rice and other cereals, meat, milk, eggs), by country, with global coverage and relative to the period 1961–2020.",Agri-Environmental Indicators: Emissions intensities -faostat_ek,"Land, Inputs and Sustainability: Livestock Patterns",Agri-Environmental Indicators: Livestock Patterns,"The Livestock Patterns domain of FAOSTAT contains data on livestock numbers, shares of major livestock species and densities of livestock units in the agricultural land area. Values are calculated using Livestock Units (LSU), which facilitate aggregating information for different livestock types. Data are available by country, with global coverage, for the period 1961 to present, with annual updates. This methodology applies the LSU coefficients reported in the ""Guidelines for the preparation of livestock sector reviews"" (FAO, 2011). From this publication, LSU coefficients are computed by livestock type and by country. The reference unit used for the calculation of livestock units (=1 LSU) is the grazing equivalent of one adult dairy cow producing 3000 kg of milk annually, fed without additional concentrated foodstuffs. FAOSTAT agri-environmental indicators on livestock patterns closely follow the structure of the indicators in EUROSTAT.",Agri-Environmental Indicators: Livestock Patterns +faostat_ei,Climate Change: Agrifood systems emissions: Emissions intensities,Agri-Environmental Indicators: Emissions intensities,"The FAOSTAT domain Emissions intensities contains analytical data on the intensity of greenhouse gas (GHG) emissions by agricultural commodity. 
+ +This indicator is defined as greenhouse gas emissions per kg of product. Data are available for a set of agricultural commodities (e.g. rice and other cereals, meat, milk, eggs), by country, with global coverage and relative to the period 1961-2020.",Agri-Environmental Indicators: Emissions intensities +faostat_ek,"Land, Inputs and Sustainability: Livestock Patterns",Agri-Environmental Indicators: Livestock Patterns,"The Livestock Patterns domain of FAOSTAT contains data on livestock numbers, shares of major livestock species and densities of livestock units in the agricultural land area. + +Values are calculated using Livestock Units (LSU), which facilitate aggregating information for different livestock types. Data are available by country, with global coverage, for the period 1961 to present, with annual updates. This methodology applies the LSU coefficients reported in the ""Guidelines for the preparation of livestock sector reviews"" (FAO, 2011). From this publication, LSU coefficients are computed by livestock type and by country. The reference unit used for the calculation of livestock units (=1 LSU) is the grazing equivalent of one adult dairy cow producing 3000 kg of milk annually, fed without additional concentrated foodstuffs. 
+ +FAOSTAT agri-environmental indicators on livestock patterns closely follow the structure of the indicators in EUROSTAT.",Agri-Environmental Indicators: Livestock Patterns faostat_el,"Land, Inputs and Sustainability: Land use indicators - FAO (2022)",Agri-Environmental Indicators: Land use indicators,"The Agri-environmental Indicators—Land Use domain provides information on the distribution of agricultural and forest land, and their sub-components, including irrigated areas and areas under organic agriculture, at national, regional and global levels. Per capita values are included in this update.",Agri-Environmental Indicators: Land use indicators -faostat_emn,"Land, Inputs and Sustainability: Livestock Manure",Agri-Environmental Indicators: Livestock Manure,"The Livestock Manure domain of FAOSTAT contains estimates of nitrogen (N) inputs to agricultural soils from livestock manure. Data on the N losses to air and water are also disseminated. These estimates are compiled using official FAOSTAT statistics of animal stocks and by applying the internationally approved Guidelines of the Intergovernmental Panel on Climate Change (IPCC). Data are available by country, with global coverage and relative to the period 1961–2020, with annual updates. 
The following elements are disseminated: 1) Stocks; 2) Amount excreted in manure (N content); 3) Manure left on pasture (N content); 4) Manure left on pasture that volatilises (N content); 5) Manure left on pasture that leaches (N content); 6) Manure treated (N content); 7) Losses from manure treated (N content); 8) Manure applied to soils (N content); 9) Manure applied to soils that volatilises (N content); 10) Manure applied to soils that leaches (N content).",Agri-Environmental Indicators: Livestock Manure +faostat_emn,"Land, Inputs and Sustainability: Livestock Manure",Agri-Environmental Indicators: Livestock Manure,"The Livestock Manure domain of FAOSTAT contains estimates of nitrogen (N) inputs to agricultural soils from livestock manure. + +Data on the N losses to air and water are also disseminated. These estimates are compiled using official FAOSTAT statistics of animal stocks and by applying the internationally approved Guidelines of the Intergovernmental Panel on Climate Change (IPCC). Data are available by country, with global coverage and relative to the period 1961-2020, with annual updates. 
+ +The following elements are disseminated: 1) Stocks; 2) Amount excreted in manure (N content); 3) Manure left on pasture (N content); 4) Manure left on pasture that volatilizes (N content); 5) Manure left on pasture that leaches (N content); 6) Manure treated (N content); 7) Losses from manure treated (N content); 8) Manure applied to soils (N content); 9) Manure applied to soils that volatilizes (N content); 10) Manure applied to soils that leaches (N content).",Agri-Environmental Indicators: Livestock Manure faostat_ep,"Land, Inputs and Sustainability: Pesticides indicators - FAO (2022)",Agri-Environmental Indicators: Pesticides indicators,Agri-environmental indicator on the Use of pesticides per area of cropland (which is the sum of arable land and land under permanent crops) at national level for the period 1990 to 2016.,Agri-Environmental Indicators: Pesticides indicators -faostat_esb,"Land, Inputs and Sustainability: Cropland Nutrient Balance","Land, Inputs and Sustainability: Soil nutrient budget","2022 Cropland nutrient budget analytical briefThe Cropland Nutrient Budget domain contains information on the flows of nitrogen, phosphorus, and potassium from synthetic fertilizer, manure applied to soils, atmospheric deposition, crop removal, and biological fixation over cropland and per unit area of cropland. The flows are aggregated to total inputs and total outputs, from which the overall nutrient budget and nutrient use efficiency on cropland are calculated. Statistics are disseminated in units of tonnes and in kg/ha, as appropriate. Nutrient use efficiency is expressed as a fraction (%). 
Data are available by country, with global coverage relative to the period 1961-2020, with annual updates.","Land, Inputs and Sustainability: Soil nutrient budget" +faostat_esb,"Land, Inputs and Sustainability: Cropland Nutrient Balance","Land, Inputs and Sustainability: Soil nutrient budget","The Cropland Nutrient Budget domain contains information on the flows of nitrogen, phosphorus, and potassium from synthetic fertilizer, manure applied to soils, atmospheric deposition, crop removal, and biological fixation over cropland and per unit area of cropland. + +The flows are aggregated to total inputs and total outputs, from which the overall nutrient budget and nutrient use efficiency on cropland are calculated. Statistics are disseminated in units of tonnes and in kg/ha, as appropriate. Nutrient use efficiency is expressed as a fraction (%). + +Data are available by country, with global coverage relative to the period 1961-2020, with annual updates.","Land, Inputs and Sustainability: Soil nutrient budget" faostat_fa,Discontinued archives and data series: Food Aid Shipments (WFP),Discontinued archives and data series: Food Aid Shipments (WFP),,Discontinued archives and data series: Food Aid Shipments (WFP) -faostat_fbs,Food Balances: Food Balances (2010-),Food Balance: New Food Balances,"Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. 
On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balance: New Food Balances +faostat_fbs,Food Balances: Food Balances (2010-),Food Balance: New Food Balances,"Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. + +The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. + +On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. 
Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balance: New Food Balances faostat_fbsc,Food Balances: Food Balances (2010-),"Food Balances (old methodology before 2010, and new from 2010 onwards)","Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.","Food Balances (old methodology before 2010, and new from 2010 onwards)" -faostat_fbsh,"Food Balances: Food Balances (-2013, old methodology and population)",Food Balance: Food Balances (old methodology and population),"Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. 
The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balance: Food Balances (old methodology and population) -faostat_fo,Forestry: Forestry Production and Trade,Forestry: Forestry Production and Trade,"The database contains data on the production and trade in roundwood and in primary wood and paper products for all countries and territories in the world.The main types of primary forest products included in this database are roundwood, sawnwood, wood-based panels, pulp, and paper and paperboard. These products are detailed further and defined in the Joint Forest Sector Questionnaire (JFSQ) (https://www.fao.org/forestry/statistics/80572/en/). 
The database contains details of the following topics: - Roundwood removals (production) by coniferous and non-coniferous wood and assortments, - production and trade in industrial roundwood, sawnwood, wood-based panels, wood charcoal, pulp, paper and paperboard, and other products. More detailed information on wood products, including definitions, can be found at https://www.fao.org/forestry/statistics/80572/en",Forestry: Forestry Production and Trade -faostat_fs,Food Security and Nutrition: Suite of Food Security Indicators,Food Security: Suite of Food Security Indicators,"The Suite of Food Security Indicators presents the core set of food security indicators. Following the recommendation of experts gathered in the Committee on World Food Security (CFS) Round Table on hunger measurement, hosted at FAO headquarters in September 2011, an initial set of indicators aiming to capture various aspects of food insecurity is presented here. The choice of the indicators has been informed by expert judgment and the availability of data with sufficient coverage to enable comparisons across regions and over time. Many of these indicators are produced and published elsewhere by FAO and other international organizations. They are reported here in a single database with the aim of building a wide food security information system. More indicators will be added to this set as more data will become available. Indicators are classified along the four dimensions of food security -- availability, access, utilization and stability. 
For definitions of these indicators, see Definitions and standards below (under Item).",Food Security: Suite of Food Security Indicators -faostat_ic,Investment: Credit to Agriculture,Investment: Credit to Agriculture,"The Credit to Agriculture dataset provides national data for over 130 countries on the amount of loans provided by the private/commercial banking sector to producers in agriculture, forestry and fishing, including household producers, cooperatives, and agro-businesses. For some countries, the three subsectors of agriculture, forestry, and fishing are completely specified. In other cases, complete disaggregations are not available. The dataset also provides statistics on the total credit to all industries, indicators on the share of credit to agricultural producers, and an agriculture orientation index (AOI) for credit that normalizes the share of credit to agriculture over total credit by dividing it by the share of agriculture in gross domestic product (GDP). As such, it can provide a more accurate indication of the relative importance that banking sectors place on financing the sector. An AOI lower than 1 indicates that the agriculture sector receives a credit share lower than its contribution to the economy, while an AOI greater than 1 indicates a credit share to the agriculture sector greater than its economic contribution.",Investment: Credit to Agriculture -faostat_lc,"Land, Inputs and Sustainability: Land Cover",Agri-Environmental Indicators: Land Cover,"The FAOSTAT domain Land Cover under the Agri-Environmental Indicators section contains land cover information organized by the land cover classes of the international standard system for Environmental and Economic Accounting Central Framework (SEEA CF). 
The land cover information is compiled from publicly available Global Land Cover (GLC) maps: a) MODIS land cover types based on the Land Cover Classification System, LCCS (2001–2021); b) The European Space Agency (ESA) Climate Change Initiative (CCI) annual land cover maps (1992–2020) produced by the Université catholique de Louvain (UCL)-Geomatics and now under the European Copernicus Program; c) The annual land cover maps which were produced under the European Copernicus Global Land Service (CGLS) (CGLS land cover, containing discrete land cover categorization for the period 2015–2019), with spatial resolution 100m; and d) The WorldCover maps of the European Space Agency — available for the years 2020 and 2021, produced at 10m resolution.",Agri-Environmental Indicators: Land Cover -faostat_qcl,Production: Crops and livestock products,Production: Crops and livestock products,"Crop and livestock statistics are recorded for 278 products, covering the following categories: 1) CROPS PRIMARY: Cereals, Citrus Fruit, Fibre Crops, Fruit, Oil Crops, Oil Crops and Cakes in Oil Equivalent, Pulses, Roots and Tubers, Sugar Crops, Treenuts and Vegetables. Data are expressed in terms of area harvested, production quantity and yield. Cereals: Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded. 2) CROPS PROCESSED: Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine. 
3) LIVE ANIMALS: Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys. 4) LIVESTOCK PRIMARY: Beeswax; Eggs (various types); Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); Offals, nes; Silk-worm cocoons, reelable; Skins (goat, sheep); Snails, not sea; Wool, greasy. 5) LIVESTOCK PROCESSED: Butter (of milk from sheep, goat, buffalo, cow); Cheese (of milk from goat, buffalo, sheep, cow milk); Cheese of skimmed cow milk; Cream fresh; Ghee (cow and buffalo milk); Lard; Milk (dry buttermilk, skimmed condensed, skimmed cow, skimmed dried, skimmed evaporated, whole condensed, whole dried, whole evaporated); Silk raw; Tallow; Whey (condensed and dry); Yoghurt.",Production: Crops and livestock products -faostat_qi,Production: Production Indices,Production: Production Indices,The FAO indices of agricultural production show the relative level of the aggregate volume of agricultural production for each year in comparison with the base period 2014-2016. Indices for meat production are computed based on data for production from indigenous animals.,"Production: Production Indices +faostat_fbsh,"Food Balances: Food Balances (-2013, old methodology and population)",Food Balance: Food Balances (old methodology and population),"Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. + +The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. 
The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. + +On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balance: Food Balances (old methodology and population) +faostat_fo,Forestry: Forestry Production and Trade,Forestry: Forestry Production and Trade,"The database contains data on the production and trade in roundwood and in primary wood and paper products for all countries and territories in the world. + +The main types of primary forest products included in this database are roundwood, sawnwood, wood-based panels, pulp, and paper and paperboard. These products are detailed further and defined in [the Joint Forest Sector Questionnaire (JFSQ)](https://www.fao.org/forestry/statistics/80572/en/). + +The database contains details of the following topics: - Roundwood removals (production) by coniferous and non-coniferous wood and assortments, - production and trade in industrial roundwood, sawnwood, wood-based panels, wood charcoal, pulp, paper and paperboard, and other products. 
More detailed information on wood products, including definitions, can be found at: https://www.fao.org/forestry/statistics/80572/en",Forestry: Forestry Production and Trade +faostat_fs,Food Security and Nutrition: Suite of Food Security Indicators,Food Security: Suite of Food Security Indicators,"The Suite of Food Security Indicators presents the core set of food security indicators. + +Following the recommendation of experts gathered in the Committee on World Food Security (CFS) Round Table on hunger measurement, hosted at FAO headquarters in September 2011, an initial set of indicators aiming to capture various aspects of food insecurity is presented here. The choice of the indicators has been informed by expert judgment and the availability of data with sufficient coverage to enable comparisons across regions and over time. + +Many of these indicators are produced and published elsewhere by FAO and other international organizations. They are reported here in a single database with the aim of building a wide food security information system. More indicators will be added to this set as more data will become available. + +Indicators are classified along the four dimensions of food security -- availability, access, utilization and stability.",Food Security: Suite of Food Security Indicators +faostat_ic,Investment: Credit to Agriculture,Investment: Credit to Agriculture,"The Credit to Agriculture dataset provides national data for over 130 countries on the amount of loans provided by the private/commercial banking sector to producers in agriculture, forestry and fishing, including household producers, cooperatives, and agro-businesses. + +For some countries, the three subsectors of agriculture, forestry, and fishing are completely specified. In other cases, complete disaggregations are not available. 
+ +The dataset also provides statistics on the total credit to all industries, indicators on the share of credit to agricultural producers, and an agriculture orientation index (AOI) for credit that normalizes the share of credit to agriculture over total credit by dividing it by the share of agriculture in gross domestic product (GDP). As such, it can provide a more accurate indication of the relative importance that banking sectors place on financing the sector. An AOI lower than 1 indicates that the agriculture sector receives a credit share lower than its contribution to the economy, while an AOI greater than 1 indicates a credit share to the agriculture sector greater than its economic contribution.",Investment: Credit to Agriculture +faostat_lc,"Land, Inputs and Sustainability: Land Cover",Agri-Environmental Indicators: Land Cover,"The FAOSTAT domain Land Cover under the Agri-Environmental Indicators section contains land cover information organized by the land cover classes of the international standard system for Environmental and Economic Accounting Central Framework (SEEA CF). 
+ 
+The land cover information is compiled from publicly available Global Land Cover (GLC) maps: a) MODIS land cover types based on the Land Cover Classification System, LCCS (2001-2021); b) The European Space Agency (ESA) Climate Change Initiative (CCI) annual land cover maps (1992-2020) produced by the Université catholique de Louvain (UCL)-Geomatics and now under the European Copernicus Program; c) The annual land cover maps which were produced under the European Copernicus Global Land Service (CGLS) (CGLS land cover, containing discrete land cover categorization for the period 2015-2019), with spatial resolution 100m; and d) The WorldCover maps of the European Space Agency — available for the years 2020 and 2021, produced at 10m resolution.",Agri-Environmental Indicators: Land Cover
+faostat_qcl,Production: Crops and livestock products,Production: Crops and livestock products,"Crop and livestock statistics are recorded for 278 products, covering the following categories: 
+
+1) Crops primary: Cereals, Citrus Fruit, Fibre Crops, Fruit, Oil Crops, Oil Crops and Cakes in Oil Equivalent, Pulses, Roots and Tubers, Sugar Crops, Treenuts and Vegetables. Data are expressed in terms of area harvested, production quantity and yield. Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded. 
+
+2) Crops processed: Beer of barley; Cotton lint; Cottonseed; Margarine, short; Molasses; Oil, coconut (copra); Oil, cottonseed; Oil, groundnut; Oil, linseed; Oil, maize; Oil, olive, virgin; Oil, palm; Oil, palm kernel; Oil, rapeseed; Oil, safflower; Oil, sesame; Oil, soybean; Oil, sunflower; Palm kernels; Sugar Raw Centrifugal; Wine. 
+ +3) Live animals: Animals live n.e.s.; Asses; Beehives; Buffaloes; Camelids, other; Camels; Cattle; Chickens; Ducks; Geese and guinea fowls; Goats; Horses; Mules; Pigeons, other birds; Pigs; Rabbits and hares; Rodents, other; Sheep; Turkeys. + +4) Livestock primary: Beeswax; Eggs (various types); Hides buffalo, fresh; Hides, cattle, fresh; Honey, natural; Meat (ass, bird nes, buffalo, camel, cattle, chicken, duck, game, goat, goose and guinea fowl, horse, mule, Meat nes, meat other camelids, Meat other rodents, pig, rabbit, sheep, turkey); Milk (buffalo, camel, cow, goat, sheep); Offals, nes; Silk-worm cocoons, reelable; Skins (goat, sheep); Snails, not sea; Wool, greasy. + +5) Livestock processed: Butter (of milk from sheep, goat, buffalo, cow); Cheese (of milk from goat, buffalo, sheep, cow milk); Cheese of skimmed cow milk; Cream fresh; Ghee (cow and buffalo milk); Lard; Milk (dry buttermilk, skimmed condensed, skimmed cow, skimmed dried, skimmed evaporated, whole condensed, whole dried, whole evaporated); Silk raw; Tallow; Whey (condensed and dry); Yoghurt.",Production: Crops and livestock products +faostat_qi,Production: Production Indices,Production: Production Indices,"The FAO indices of agricultural production show the relative level of the aggregate volume of agricultural production for each year in comparison with the base period 2014-2016. + +Indices for meat production are computed based on data for production from indigenous animals.","Production: Production Indices This dataset includes gross and net production indices for various food and agriculture aggregates expressed in both totals and per capita." -faostat_qv,Production: Value of Agricultural Production,Production: Value of Agricultural Production,Values of agricultural production are calculated based on production data of primary commodities from Production domain and producer prices from Prices domain. 
The livestock value of production is measured in terms of indigenous meat.,"Production: Value of Agricultural Production +faostat_qv,Production: Value of Agricultural Production,Production: Value of Agricultural Production,"Values of agricultural production are calculated based on production data of primary commodities from Production domain and producer prices from Prices domain. + +The livestock value of production is measured in terms of indigenous meat.","Production: Value of Agricultural Production This dataset includes gross and net production values, in constant international US$, and gross production values, in constant and current US$ and Local Currency Units, for various food and agriculture commodities and aggregates thereof, expressed in both total value and value per capita." -faostat_rfb,"Land, Inputs and Sustainability: Fertilizers by Product",Inputs: Fertilizers by Product,"The Fertilizers by Product dataset contains information on the Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers products, over the time series 2002-present. The fertilizer statistics data are for a set of 23 product categories. Both straight and compound fertilizers are included. There is information available about methodology at: https://fenixservices.fao.org/faostat/static/documents/RFB/RFB_EN_README.pdf.",Inputs: Fertilizers by Product -faostat_rfn,"Land, Inputs and Sustainability: Fertilizers by Nutrient",Inputs: Fertilizers by Nutrient,"The Fertilizers by Nutrient dataset contains information on the totals in nutrients for Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers, over the time series 1961-present. The data are provided for the three primary plant nutrients: nitrogen (N), phosphorus (expressed as P2O5) and potassium (expressed as K2O). Both straight and compound fertilizers are included. 
There is information on the methodology available at: https://fenixservices.fao.org/faostat/static/documents/RFN/RFN_EN_README.pdf",Inputs: Fertilizers by Nutrient -faostat_rl,"Land, Inputs and Sustainability: Land Use",Inputs: Land Use,"The FAOSTAT Land Use domain contains data on forty-four categories of land use, irrigation and agricultural practices and five indicators relevant to monitor agriculture, forestry and fisheries activities at national, regional and global level. Data are available by country and year, with global coverage and annual updates.",Inputs: Land Use -faostat_rp,"Land, Inputs and Sustainability: Pesticides Use",Inputs: Pesticides Use,"The Pesticides Use database includes data on the use of major pesticide groups (Insecticides, Herbicides, Fungicides, Plant growth regulators and Rodenticides) and of relevant chemical families. Data report the quantities (in tonnes of active ingredients)",Inputs: Pesticides Use -faostat_rt,"Land, Inputs and Sustainability: Pesticides Trade",Inputs: Pesticides Trade,"This domain contains data on pesticides and covers two different categories: pesticides traded in form or packagingfor retail sale or as preparations or articles, and pesticides traded as separate chemically defined compounds (if relevant for the Rotterdam Convention on the Prior Informed Consent Procedure for Certain Hazardous Chemicals and Pesticides in International Trade). The pesticides traded for retail sale or as preparations or articles are those classified under code 38.08 in the Harmonized System Nomenclature (HS) and include: hazardous pesticides, insecticides, fungicides, herbicides, disinfectants and other. For these pesticides, this domain contains trade data (imports and exports) in values only (current 1000 US dollars), and the time series extends from 1961 onwards. 
The pesticides traded as separate chemically defined compounds are those listed in Annex III of the Rotterdam Convention (excluding industrial chemicals) and therefore subject to the Prior Informed Consent (PIC) procedure. The correspondence with the HS Nomenclature is shown in the table at the Related Documents section. For these pesticides, this domain contains trade data (imports and exports) in both value (current 1000 US dollars) and quantity (net weight in tonnes), and the time series extends from 2007 onwards.",Inputs: Pesticides Trade -faostat_scl,Food Balances: Supply Utilization Accounts (2010-),Food Balances: Supply Utilization Accounts,"Supply Utilization Accounts (SUA's) present a comprehensive picture of the pattern of a country's food supply during a specified reference period. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. 
Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balances: Supply Utilization Accounts -faostat_sdgb,SDG Indicators: SDG Indicators,SDG Indicators: SDG Indicators,"As the custodian agency of 21 SDG indicators, the Food and Agriculture Organization of the United Nations (FAO) is responsible for curating and refining the methodologies of these indicators, collecting data from national sources, ensuring their quality and compatibility with applicable standards and classifications, and disseminating data at global level. This FAOSTAT domain complements the global SDG database administered by the United Nations Statistical Division, as well as FAO’s SDG indicators portal, by providing access to the available data for each of these indicators. Please click the metadata link on the right hand navigation column for an abridged version of the methodology for compiling each of these indicators, a description of data sources and the relevant contact persons responsible for each indicator in the Organization. For a more detailed description of the methodology, data sources and reporting procedures, please follow the link to the official SDG indicator metadata document available at the bottom of each summary metadata page in the document on the right.",SDG Indicators: SDG Indicators -faostat_tcl,Trade: Crops and livestock products,Trade: Crops and livestock products,"The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. 
This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world.",Trade: Crops and livestock products -faostat_ti,Trade: Trade Indices,Trade: Trade Indices,"The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world.",Trade: Trade Indices +faostat_rfb,"Land, Inputs and Sustainability: Fertilizers by Product",Inputs: Fertilizers by Product,"The Fertilizers by Product dataset contains information on the Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers products, over the time series 2002-present. + +The fertilizer statistics data are for a set of 23 product categories. Both straight and compound fertilizers are included. 
There is information available about methodology at: https://fenixservices.fao.org/faostat/static/documents/RFB/RFB_EN_README.pdf",Inputs: Fertilizers by Product +faostat_rfn,"Land, Inputs and Sustainability: Fertilizers by Nutrient",Inputs: Fertilizers by Nutrient,"The Fertilizers by Nutrient dataset contains information on the totals in nutrients for Production, Trade and Agriculture Use of inorganic (chemical or mineral) fertilizers, over the time series 1961-present. + +The data are provided for the three primary plant nutrients: nitrogen (N), phosphorus (expressed as P2O5) and potassium (expressed as K2O). Both straight and compound fertilizers are included. There is information on the methodology available at: https://fenixservices.fao.org/faostat/static/documents/RFN/RFN_EN_README.pdf",Inputs: Fertilizers by Nutrient +faostat_rl,"Land, Inputs and Sustainability: Land Use",Inputs: Land Use,"The FAOSTAT Land Use domain contains data on forty-four categories of land use, irrigation and agricultural practices and five indicators relevant to monitor agriculture, forestry and fisheries activities at national, regional and global level. + +Data are available by country and year, with global coverage and annual updates.",Inputs: Land Use +faostat_rp,"Land, Inputs and Sustainability: Pesticides Use",Inputs: Pesticides Use,"The Pesticides Use database includes data on the use of major pesticide groups (Insecticides, Herbicides, Fungicides, Plant growth regulators and Rodenticides) and of relevant chemical families. 
+ +Data report the quantities (in tonnes of active ingredients).",Inputs: Pesticides Use +faostat_rt,"Land, Inputs and Sustainability: Pesticides Trade",Inputs: Pesticides Trade,"This domain contains data on pesticides and covers two different categories: pesticides traded in form or packaging for retail sale or as preparations or articles, and pesticides traded as separate chemically defined compounds (if relevant for the Rotterdam Convention on the Prior Informed Consent Procedure for Certain Hazardous Chemicals and Pesticides in International Trade). + +The pesticides traded for retail sale or as preparations or articles are those classified under code 38.08 in the Harmonized System Nomenclature (HS) and include: hazardous pesticides, insecticides, fungicides, herbicides, disinfectants and other. For these pesticides, this domain contains trade data (imports and exports) in values only (current 1000 US dollars), and the time series extends from 1961 onwards. + +The pesticides traded as separate chemically defined compounds are those listed in Annex III of the Rotterdam Convention (excluding industrial chemicals) and therefore subject to the Prior Informed Consent (PIC) procedure. + +The correspondence with the HS Nomenclature is shown in the table at the Related Documents section. For these pesticides, this domain contains trade data (imports and exports) in both value (current 1000 US dollars) and quantity (net weight in tonnes), and the time series extends from 2007 onwards.",Inputs: Pesticides Trade +faostat_scl,Food Balances: Supply Utilization Accounts (2010-),Food Balances: Supply Utilization Accounts,"Supply Utilization Accounts (SUA's) present a comprehensive picture of the pattern of a country's food supply during a specified reference period. 
+ +The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. + +On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. + +The per caput supply of each such food item available for human consumption is then obtained by dividing the respective quantity by the related data on the population actually partaking of it. Data on per caput food supplies are expressed in terms of quantity and - by applying appropriate food composition factors for all primary and processed products - also in terms of caloric value and protein and fat content.",Food Balances: Supply Utilization Accounts +faostat_sdgb,SDG Indicators: SDG Indicators,SDG Indicators: SDG Indicators,"As the custodian agency of 21 SDG indicators, the Food and Agriculture Organization of the United Nations (FAO) is responsible for curating and refining the methodologies of these indicators, collecting data from national sources, ensuring their quality and compatibility with applicable standards and classifications, and disseminating data at global level. + +This FAOSTAT domain complements the global SDG database administered by the United Nations Statistical Division, as well as FAO's SDG indicators portal, by providing access to the available data for each of these indicators.",SDG Indicators: SDG Indicators +faostat_tcl,Trade: Crops and livestock products,Trade: Crops and livestock products,"The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. 
+ +The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. + +The trade database includes the following variables: export quantity, export value, import quantity, and import value. The trade database includes all food and agricultural products imported/exported annually by all the countries in the world.",Trade: Crops and livestock products +faostat_ti,Trade: Trade Indices,Trade: Trade Indices,"The food and agricultural trade dataset is collected, processed and disseminated by FAO according to the standard International Merchandise Trade Statistics (IMTS) Methodology. + +The data is mainly provided by UNSD, Eurostat, and other national authorities as needed. This source data is checked for outliers, trade partner data is used for non-reporting countries or missing cells, and data on food aid is added to take into account total cross-border trade flows. + +The trade database includes the following variables: export quantity, export value, import quantity, and import value. 
The trade database includes all food and agricultural products imported/exported annually by all the countries in the world.",Trade: Trade Indices diff --git a/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv b/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv index a0e6a57cb06..e50dfbd9f82 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv +++ b/etl/steps/data/garden/faostat/2024-03-14/custom_items.csv @@ -1,5 +1,4 @@ dataset,item_code,fao_item,owid_item,fao_item_description,owid_item_description -faostat_qcl,00000771,,"Flax, raw or retted",, faostat_qcl,00000221,"Almonds, in shell",Almonds,"Almonds, in shell This subclass is defined through the following headings/subheadings of the HS 2007: 0802.11.", faostat_fbsc,00002946,Animal fats,Animal fats group,, faostat_qcl,00000711,"Anise, badian, coriander, cumin, caraway, fennel and juniper berries, raw",Herbs (e.g. fennel),"Anise, badian, coriander, cumin, caraway, fennel and juniper berries, raw This subclass includes: - aniseed, Pimpinella anisum, raw - star anise (badian) or Chinese star anise, Illicium verum, raw - fennel, Foeniculum vulgare, raw (when used as spice) - coriander (cilantro), Coriandrum sativum, raw - cumin, Cuminum cyminum, raw - caraway seeds, Carum carvi, raw - juniper berries, Juniperus communis, raw This subclass does not include: - fennel (when used as a vegetable), cf. 01290 - processed anise, fennel, coriander, cumin, caraway and juniper berries, cf. 23924", @@ -57,8 +56,8 @@ faostat_qcl,00000195,"Cow peas, dry",Cow peas,"Cow peas, dry This subclass is de faostat_qcl,00000554,Cranberries,,"This subclass includes cranberries, species of Vaccinium macrocarpon (American cranberry) and Vaccinium oxycoccus (European cranberry). 
(Unofficial definition)", faostat_qcl,00000397,Cucumbers and gherkins,,Cucumbers and gherkins This subclass is defined through the following headings/subheadings of the HS 2007: 0707., faostat_qcl,00000550,Currants,,"Currants, species of Ribes nigrum (Black) and Ribes rubrum (red and white). (Unofficial definition)", -faostat_fbsc,00002619,Dates,,Default composition: 577 Dates, faostat_qcl,00000577,Dates,,"Dates This subclass includes: - dates, fruit of the date palm, Phoenix dactylifera", +faostat_fbsc,00002619,Dates,,Default composition: 577 Dates, faostat_qcl,00000948,"Edible offal of buffalo, fresh, chilled or frozen","Offals, buffaloes","Edible offal of buffalo, fresh, chilled or frozen This subclass includes: - edible offal of buffalo, animals of subclass 02112, fresh, chilled or frozen This subclass does not include: - edible offal of cattle, cf. 21151", faostat_qcl,00000868,"Edible offal of cattle, fresh, chilled or frozen","Offals, cattle","Edible offal of cattle, fresh, chilled or frozen This subclass includes: - edible offal of cattle, animals of subclass 02111, fresh, chilled or frozen This subclass does not include: - edible offal of buffalo, cf. 21152", faostat_qcl,00001018,"Edible offal of goat, fresh, chilled or frozen","Offals, goats","Edible offal of goat, fresh, chilled or frozen This subclass includes: - edible offal of goat, animals of subclass 02123, fresh, chilled or frozen", @@ -75,6 +74,7 @@ faostat_qcl,00001129,Fat of camels,"Fat, camels",Unrendered slaughter fats (Unof faostat_qcl,00001037,Fat of pigs,"Fat, pigs","Unrendered slaughter fats of pigs, including edible and inedible fats that are removed in the course of dressing the carcass. (Unofficial definition)", faostat_fbsc,00002737,"Fats, Animals, Raw",Animal fats,"Default composition: 869 Fat, cattle, 871 Fat, cattle butcher, 949 Fat, buffaloes, 979 Fat, sheep, 994 Grease incl. 
lanolin wool, 1019 Fat, goats, 1037 Fat, pigs, 1040 Fat, pig butcher, 1043 Lard, 1065 Fat, poultry, 1066 Fat, poultry, rendered, 1129 Fat, camels, 1160 Fat, other camelids, 1168 Oils, fats of animal nes, 1221 Lard stearine oil, 1222 Degras, 1225 Tallow, 1243 Fat, nes, prepared", faostat_fbsc,00002960,"Fish, Seafood",Fish and seafood,, +faostat_qcl,00000771,"Flax, raw or retted","Flax, raw or retted","Flax Straw, spp. Linum usitatissimum. Flax is cultivated for seed as well as for fibre. The fibre is obtained from the stem of the plant. Data are reported in terms of straw. (Unofficial definition)", faostat_qcl,00001738,Fruit Primary,Fruit,"Fruit Crops consist of fruits and berries that, with few exceptions, are characterized by their sweet taste. Nearly all are permanent crops, mainly from trees, bushes and shrubs, as well as vines and palms. Fruits and berries grow on branches, stalks or the trunks of plants, usually singly, but sometimes grouped in bunches or clusters (e.g. bananas and grapes). Commercial crops are cultivated in plantations, but significant quantities of fruits are also collected from scattered plants that may or may not be cultivated. Although melons and watermelons are generally considered to be fruits, FAO groups them with vegetables because they are temporary crops. Fruit crops are highly perishable. Their shelf life may be extended through the application of chemical substances that inhibit the growth of micro-organisms and through careful control of the surrounding temperature, pressure and humidity once the fruit has been picked. Fruits and berries have a very high water content accounting for some 70- 90 percent of their weight. They contain, in various degrees, minerals, vitamins and organic acids, some of which reside in the peel or skin. Some fruits have a high fibre content and other inedible components, so that wastage is high, e.g. 60 percent for passion fruit and 35-45 percent for pineapples. 
The waste in temperate zone fruit is lower, generally of the order of 10-15 percent, while berries contain very little waste. The carbohydrate content of fruits varies widely. Protein content is very low, averaging less than 1 percent, or below that in vegetables. Fat content in fruit is negligible, with the notable exception of avocados. Fruit crops are consumed directly as food and are processed into dried fruit, fruit juice, canned fruit, frozen fruit, jam, alcoholic beverages, etc. Fruit crops are not normally grown for animal feed, although significant quantities of diseased and substandard fruits, as well as certain by-products of the fruit processing industry, are fed to animals. Production data for fruit crops should relate to fruits actually harvested. Data on bananas and plantains should relate to the weight of single bananas or banana hands, excluding the weight of the central stalk. FAO lists 36 primary fruit crops.", faostat_fbsc,00002919,Fruits - Excluding Wine,Fruit,, faostat_qcl,00001163,"Game meat, fresh, chilled or frozen","Meat, game","Meat and offals of wild animals, whether fresh, chilled or frozen. 
(Unofficial definition)", From 6289859642abd072fb07e8f514d3495ed6ad6c1f Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 25 Mar 2024 11:51:28 +0100 Subject: [PATCH 53/54] Improve metadata --- .../garden/faostat/2024-03-14/faostat_fbsc.py | 6 ++- .../faostat/2024-03-14/faostat_qcl.meta.yml | 47 +++++++++++++++++++ .../garden/faostat/2024-03-14/faostat_qcl.py | 6 ++- .../data/garden/faostat/2024-03-14/shared.py | 5 +- 4 files changed, 58 insertions(+), 6 deletions(-) create mode 100644 etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.meta.yml diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py index af42c1e0104..35d9f1471c9 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_fbsc.py @@ -241,9 +241,11 @@ def run(dest_dir: str) -> None: error = "Dataset title given to fbsc is different to the one in custom_datasets.csv. Update the latter file." assert DATASET_TITLE == dataset_metadata["owid_dataset_title"], error - # Update dataset metadata and add description of anomalies (if any) to the dataset description. - ds_garden.metadata.description = dataset_metadata["owid_dataset_description"] + anomaly_descriptions + # Update dataset metadata. + ds_garden.metadata.update_period_days = 365 ds_garden.metadata.title = dataset_metadata["owid_dataset_title"] + # The following description is not publicly shown in charts; it is only visible when accessing the catalog. + ds_garden.metadata.description = dataset_metadata["owid_dataset_description"] + anomaly_descriptions # Create garden dataset. 
ds_garden.save() diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.meta.yml b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.meta.yml new file mode 100644 index 00000000000..bad3cb8a3a8 --- /dev/null +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.meta.yml @@ -0,0 +1,47 @@ +tables: + faostat_qcl_flat: + variables: + cereals__00001717__yield__005419__tonnes_per_hectare: + title: Cereals | 00001717 || Yield | 005419 || tonnes per hectare + unit: tonnes per hectare + short_unit: t/ha + description_short: Measured in tonnes per hectare. + description_key: + - Cereals include wheat, rice, maize, barley, oats, rye, millet, sorghum, buckwheat, and mixed grains. + description_from_producer: | + Cereals are generally of the gramineous family and, in the FAO concept, refer to crops harvested for dry grain only. Crops harvested green for forage, silage or grazingare classified as fodder crops. Also excluded are industrial crops, e.g. broom sorghum (Crude organic materials nes) and sweet sorghum when grown for syrup (Sugar crops nes). For international trade classifications, fresh cereals (other than sweet corn), whether or not suitable for use as fresh vegetables, are classified as cereals. + + Cereals are identified according to their genus. However, when two or more genera are sown and harvested as a mixture they should be classified and reported as "mixed grains". Production data are reported in terms of clean, dry weight of grains (12-14 percent moisture) in the form usually marketed. Rice, however, is reported in terms of paddy. Apart from moisture content and inedible substances such as cellulose, cereal grains contain, along with traces of minerals and vitamins, carbohydrates - mainly starches - (comprising 65-75 percent of their total weight), as well as proteins (6-12 percent) and fat (1-5 percent). + + The FAO definitions cover 17 primary cereals, of which one - white maize - is a component of maize. 
Each definition is listed along with its code, botanical name or names, and a short description. Cereal products derive either from the processing of grain through one or more mechanical or chemical operations, or from the processing of flour, meal or starch. Each cereal product is listed after the cereal from which it is derived. + + Yield: Harvested production per unit of harvested area for crop products. In most of the cases yield data are not recorded but obtained by dividing the production data by the data on area harvested. Data on yields of permanent crops are not as reliable as those for temporary crops either because most of the area information may correspond to planted area, as for grapes, or because of the scarcity and unreliability of the area figures reported by the countries, as for example for cocoa and coffee. Source: FAO Statistics Division. + processing_level: major + presentation: + attribution_short: FAO + topic_tags: + - Agricultural Production + - Crop Yields + title_public: Cereal yields + grapher_config: + title: Cereal yields + subtitle: |- + Yields are measured in tonnes per hectare. Cereals include wheat, rice, maize, barley, oats, rye, millet, sorghum, buckwheat, and mixed grains. 
+ originUrl: https://ourworldindata.org/crop-yields + hasMapTab: true + tab: map + yAxis: + max: 0 + min: 0 + facetDomain: shared + hideAnnotationFieldsInTitle: + time: true + entity: true + changeInPrefix: true + selectedEntityNames: + - World + - Asia + - Africa + - Northern America + - Europe + - South America diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py index 5ac19596d9b..0f1d6d2a8db 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py @@ -542,11 +542,13 @@ def run(dest_dir: str) -> None: check_variables_metadata=False, ) - # Update dataset metadata and add description of anomalies (if any) to the dataset description. + # Update dataset metadata. + ds_garden.metadata.update_period_days = 365 + ds_garden.metadata.title = dataset_metadata["owid_dataset_title"] + # The following description is not publicly shown in charts; it is only visible when accessing the catalog. ds_garden.metadata.description = ( dataset_metadata["owid_dataset_description"] + anomaly_descriptions + SLAUGHTERED_ANIMALS_ADDITIONAL_DESCRIPTION ) - ds_garden.metadata.title = dataset_metadata["owid_dataset_title"] # Create garden dataset. ds_garden.save() diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index f515b744707..3aa7e153cfc 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -1904,9 +1904,10 @@ def run(dest_dir: str) -> None: check_variables_metadata=False, ) # Update dataset metadata. - # Add description of anomalies (if any) to the dataset description. 
- ds_garden.metadata.description = dataset_metadata["owid_dataset_description"] + anomaly_descriptions + ds_garden.metadata.update_period_days = 365 ds_garden.metadata.title = dataset_metadata["owid_dataset_title"] + # The following description is not publicly shown in charts; it is only visible when accessing the catalog. + ds_garden.metadata.description = dataset_metadata["owid_dataset_description"] + anomaly_descriptions # Create garden dataset. ds_garden.save() From c389835000fa2a5062f5e9856af2398b184e9a1c Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 25 Mar 2024 15:00:39 +0100 Subject: [PATCH 54/54] Fix issue in StepUpdater where archive steps are not really ignored --- apps/step_update/cli.py | 2 +- etl/version_tracker.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/apps/step_update/cli.py b/apps/step_update/cli.py index 2cd3fd11689..5a48b6978f9 100644 --- a/apps/step_update/cli.py +++ b/apps/step_update/cli.py @@ -36,7 +36,7 @@ def _load_version_tracker(self) -> None: # It can be used when initializing StepUpdater, but also to reload steps_df after making changes to the dag. # Initialize version tracker. - self.tracker = VersionTracker() + self.tracker = VersionTracker(ignore_archive=True) # Update the temporary dag. _update_temporary_dag(dag_active=self.tracker.dag_active, dag_all_reverse=self.tracker.dag_all_reverse) diff --git a/etl/version_tracker.py b/etl/version_tracker.py index ffaacf3afd0..6500cafb6d2 100644 --- a/etl/version_tracker.py +++ b/etl/version_tracker.py @@ -264,13 +264,17 @@ class VersionTracker: "snapshot://dummy/2020-01-01/dummy_full.csv", ] - def __init__(self, connect_to_db: bool = True, warn_on_archivable: bool = True): - # Load dag of active and archive steps (a dictionary where each item is step: set of dependencies). 
- self.dag_all = load_dag(paths.DAG_ARCHIVE_FILE) + def __init__(self, connect_to_db: bool = True, warn_on_archivable: bool = True, ignore_archive: bool = False): + # Load dag of active steps (a dictionary step: set of dependencies). + self.dag_active = load_dag(paths.DAG_FILE) + if ignore_archive: + # Fully ignore the archive dag (so that all steps are only active steps, and there are no archive steps). + self.dag_all = self.dag_active.copy() + else: + # Load dag of active and archive steps. + self.dag_all = load_dag(paths.DAG_ARCHIVE_FILE) # Create a reverse dag (a dictionary where each item is step: set of usages). self.dag_all_reverse = reverse_graph(graph=self.dag_all) - # Load dag of active steps. - self.dag_active = load_dag(paths.DAG_FILE) # Create a reverse dag (a dictionary where each item is step: set of usages) of active steps. self.dag_active_reverse = reverse_graph(graph=self.dag_active) # Generate the dag of only archive steps.