add projections

owid · Dec 18, 2024 · 68b61fb · 68b61fb
1 parent 748c2f3
commit 68b61fb
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 16 deletions.
diff --git a/dag/demography.yml b/dag/demography.yml
@@ -145,6 +145,9 @@ steps:
     - snapshot://un/2024-12-02/un_wpp_lt_m.csv
     - snapshot://un/2024-12-02/un_wpp_lt_all.csv
     - snapshot://un/2024-12-02/un_wpp_lt_f.csv
+    - snapshot://un/2024-12-02/un_wpp_lt_proj_m.csv
+    - snapshot://un/2024-12-02/un_wpp_lt_proj_all.csv
+    - snapshot://un/2024-12-02/un_wpp_lt_proj_f.csv
   data://garden/un/2024-12-02/un_wpp_lt:
     - data://meadow/un/2024-12-02/un_wpp_lt
 

diff --git a/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.meta.yml b/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.meta.yml
@@ -24,7 +24,7 @@ dataset:
 # Learn more about the available fields:
 # http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/
 tables:
-  un_wpp_lt:
+  un_wpp_lt: &table_metadata
     variables:
       central_death_rate:
         title: Central death rate
@@ -102,3 +102,5 @@ tables:
             <%- else -%>
             It refers to the remaining life expectancy for people who have already survived to the given age.
             <%- endif -%>
+
+  un_wpp_lt_proj: *table_metadata
diff --git a/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.py b/etl/steps/data/garden/un/2024-12-02/un_wpp_lt.py
@@ -12,12 +12,7 @@
     "time": "year",
     "agegrpstart": "age",
 }
-COLUMNS_INDEX = [
-    "location",
-    "year",
-    "sex",
-    "age",
-]
+COLUMNS_INDEX = ["location", "year", "sex", "age", "variant"]
 COLUMNS_INDICATORS = [
     "central_death_rate",
     "probability_of_death",
@@ -30,6 +25,8 @@
     "life_expectancy",
     "average_survival_length",
 ]
+# First projected year: rows with year >= this value go to the projections table, earlier years to the historical one.
+YEAR_PROJ_START = 2024
 
 
 def run(dest_dir: str) -> None:
@@ -43,12 +40,16 @@ def run(dest_dir: str) -> None:
     paths.log.info("load tables, concatenate.")
     tb = pr.concat(
         [
-            ds_meadow["un_wpp_lt_all"].reset_index(),
-            ds_meadow["un_wpp_lt_f"].reset_index(),
-            ds_meadow["un_wpp_lt_m"].reset_index(),
+            ds_meadow.read("un_wpp_lt_all"),
+            ds_meadow.read("un_wpp_lt_f"),
+            ds_meadow.read("un_wpp_lt_m"),
+            ds_meadow.read("un_wpp_lt_proj_all"),
+            ds_meadow.read("un_wpp_lt_proj_f"),
+            ds_meadow.read("un_wpp_lt_proj_m"),
         ],
         short_name=paths.short_name,
-    ).reset_index()
+        ignore_index=True,
+    )
 
     #
     # Process data.
@@ -64,7 +65,7 @@ def run(dest_dir: str) -> None:
     # DTypes
     tb = tb.astype(
         {
-            "age": str,
+            "age": "string",
         }
     )
 
@@ -79,21 +80,35 @@ def run(dest_dir: str) -> None:
 
     # Harmonize country names.
     paths.log.info("harmonise country names.")
-    tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path, country_col="location")
+    tb = geo.harmonize_countries(
+        df=tb,
+        countries_file=paths.country_mapping_path,
+        country_col="location",
+    )
 
     # Harmonize sex values
     tb["sex"] = tb["sex"].map({"Total": "total", "Male": "male", "Female": "female"})
     assert tb["sex"].notna().all(), "NaNs detected after mapping sex values!"
 
+    # Historical and Projection-only tables
+    tb_hist = tb.loc[tb["year"] < YEAR_PROJ_START]
+    tb_future = tb.loc[tb["year"] >= YEAR_PROJ_START]
+
     # Set index
-    tb = tb.set_index(COLUMNS_INDEX, verify_integrity=True)[COLUMNS_INDICATORS]
+    tables = [
+        tb_hist.format(COLUMNS_INDEX, short_name="un_wpp_lt"),
+        tb_future.format(COLUMNS_INDEX, short_name="un_wpp_lt_proj"),
+    ]
 
     #
     # Save outputs.
     #
     # Create a new garden dataset with the same metadata as the meadow dataset.
     ds_garden = create_dataset(
-        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+        dest_dir,
+        tables=tables,
+        check_variables_metadata=True,
+        default_metadata=ds_meadow.metadata,
     )
 
     # Save changes in the new garden dataset.

diff --git a/etl/steps/data/meadow/un/2024-12-02/un_wpp_lt.py b/etl/steps/data/meadow/un/2024-12-02/un_wpp_lt.py
@@ -31,6 +31,9 @@ def run(dest_dir: str) -> None:
         "un_wpp_lt_all",  # ALL
         "un_wpp_lt_f",  # FEMALE
         "un_wpp_lt_m",  # MALE
+        "un_wpp_lt_proj_all",  # PROJECTIONS, ALL
+        "un_wpp_lt_proj_f",  # PROJECTIONS, FEMALE
+        "un_wpp_lt_proj_m",  # PROJECTIONS, MALE
     ]
 
     tables = []
@@ -53,7 +56,7 @@ def run(dest_dir: str) -> None:
             tb["LocTypeName"].isin(["Geographic region", "Income group", "Country/Area", "World", "Development group"])
         ]
         # Set index
-        tb = tb.format(["location", "time", "sex", "agegrp", "loctypename"])
+        tb = tb.format(["location", "time", "sex", "agegrp", "loctypename", "variant"])
         # Add to tables list
         tables.append(tb)