Merge branch 'master' into data-fertility-rate-effective

owid · Dec 19, 2024 · 212e752 · 212e752
2 parents 68b61fb + 7ae6528
commit 212e752
Show file tree

Hide file tree

Showing 20 changed files with 658 additions and 53 deletions.
diff --git a/dag/demography.yml b/dag/demography.yml
@@ -287,6 +287,13 @@ steps:
   data://grapher/demography/2024-12-17/efr_malani_jacob:
     - data://garden/demography/2024-12-17/efr_malani_jacob
 
+  # Mean age at childbearing (HFD + UN WPP)
+  data://garden/demography/2024-12-18/mean_age_childbearing:
+    - data://garden/un/2024-07-12/un_wpp
+    - data://garden/hmd/2024-11-19/hfd
+  data://grapher/demography/2024-12-18/mean_age_childbearing:
+    - data://garden/demography/2024-12-18/mean_age_childbearing
+
   ########################################################################
   # OTHERS
   ########################################################################

diff --git a/etl/config.py b/etl/config.py
@@ -44,14 +44,30 @@ def _normalise_branch(branch_name):
     return re.sub(r"[\/\._]", "-", branch_name)
 
 
+# NOTE: If you edit this function, make sure to update `get_container_name` in ops repo as well
 def get_container_name(branch_name):
     normalized_branch = _normalise_branch(branch_name)
 
     # Strip staging-site- prefix to add it back later
     normalized_branch = normalized_branch.replace("staging-site-", "")
 
     # Ensure the container name is less than 63 characters
-    container_name = f"staging-site-{normalized_branch[:50]}"
+    # however, we truncate it to 28 characters to be consistent with Cloudflare's
+    # 28 character limit (see https://community.cloudflare.com/t/algorithm-to-generate-a-preview-dns-subdomain-from-a-branch-name/477633)
+    # TODO: these ifs were added to be backward compatible with existing branches that are longer than 28 characters
+    #   remove them once they get merged
+    if normalized_branch in (
+        "variable-selector-catalog-path",
+        "grapher-page-dynamic-thumbnail",
+        "data-fertility-rate-effective",
+        "add-reset-metadata-origin-option",
+        "data-battery-cell-prices-private",
+    ):
+        limit = 50
+    else:
+        limit = 28
+
+    container_name = f"staging-site-{normalized_branch[:limit]}"
     # Remove trailing hyphens
     return container_name.rstrip("-")
 

diff --git a/etl/steps/data/garden/demography/2024-12-18/mean_age_childbearing.meta.yml b/etl/steps/data/garden/demography/2024-12-18/mean_age_childbearing.meta.yml
@@ -0,0 +1,51 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Fertility Rate
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+dataset:
+  update_period_days: 365
+  title: Mean age at childbearing (HFD; UN WPP)
+
+tables:
+  mean_age_childbearing:
+    variables:
+      mean_age_childbearing:
+        title: Mean age at childbearing
+        unit: years
+        description_short: &cb_description_short |-
+          Mean age of mothers at the birth of their children if women were subject throughout their lives to the age-specific fertility rates observed in a given year. UN Medium projections for 2024-2100.
+        description_processing: |-
+          This indicator is constructed by combining data from multiple sources:
+
+          - Before 1950: Historical estimates by Human Fertility Database (2024).
+
+          - 1950-2023: Population records by the UN World Population Prospects (2024 revision).
+
+          - 2024-2100: Projections based on Medium variant by the UN World Population Prospects (2024 revision).
+        presentation:
+          title_public: |-
+            Mean age at childbearing
+          grapher_config:
+            subtitle: *cb_description_short
+
+      mean_age_childbearing_hist:
+        title: Mean age at childbearing, historical
+        unit: years
+        description_short: &cb_description_short_hist |-
+          Mean age of mothers at the birth of their children if women were subject throughout their lives to the age-specific fertility rates observed in a given year.
+        description_processing: |-
+          This indicator is constructed by combining data from multiple sources:
+
+          - Before 1950: Historical estimates by Human Fertility Database (2024).
+
+          - 1950-2023: Population records by the UN World Population Prospects (2024 revision).
+        presentation:
+          title_public: |-
+            Mean age at childbearing
+          grapher_config:
+            subtitle: *cb_description_short_hist
diff --git a/etl/steps/data/garden/demography/2024-12-18/mean_age_childbearing.py b/etl/steps/data/garden/demography/2024-12-18/mean_age_childbearing.py
@@ -0,0 +1,69 @@
+"""Combine the HFD and UN WPP garden datasets into a garden dataset on mean age at childbearing."""
+
+import pandas as pd
+from owid.catalog import processing as pr
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+# Year constants (NOTE: YEAR_WPP_PROJ_START is the last year of UN estimates; years after it are projections)
+YEAR_WPP_START = 1950
+YEAR_WPP_PROJ_START = 2023
+# Table names
+TABLE_NAME_WPP = "mean_age_childbearing"
+TABLE_NAME_HFD = "period"
+TABLE_NAME_NEW = "mean_age_childbearing"
+# Metric names
+COLUMN_NAME_WPP = "mean_age_childbearing"
+COLUMN_NAME_HFD = "mab"
+COLUMN_NEW_NAME = "mean_age_childbearing"
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden datasets (HFD and UN WPP).
+    ds_hfd = paths.load_dataset("hfd")
+    ds_un = paths.load_dataset("un_wpp")
+
+    # Read the relevant table from each garden dataset.
+    tb_hfd = ds_hfd.read(TABLE_NAME_HFD)
+    tb_un = ds_un.read(TABLE_NAME_WPP)
+
+    # UN: keep estimates + medium variant, sex=all, age=all.
+    tb_un = tb_un.loc[
+        (tb_un["sex"] == "all") & (tb_un["variant"].isin(["medium", "estimates"]) & (tb_un["age"] == "all")),
+        ["country", "year", COLUMN_NAME_WPP],
+    ].rename(columns={COLUMN_NAME_WPP: COLUMN_NEW_NAME})
+
+    # HFD: mab, birth_order=total, only years before YEAR_WPP_START.
+    tb_hfd = tb_hfd.loc[
+        ((tb_hfd["birth_order"] == "total") & (tb_hfd["year"] < YEAR_WPP_START)), ["country", "year", COLUMN_NAME_HFD]
+    ].rename(columns={COLUMN_NAME_HFD: COLUMN_NEW_NAME})
+
+    # Concatenate
+    tb = pr.concat([tb_hfd, tb_un], ignore_index=True, short_name=TABLE_NAME_NEW)
+
+    # Add historical variant
+    tb[f"{COLUMN_NEW_NAME}_hist"] = tb[COLUMN_NEW_NAME].copy()
+    tb.loc[tb["year"] > YEAR_WPP_PROJ_START, f"{COLUMN_NEW_NAME}_hist"] = pd.NA
+
+    # Format
+    tb = tb.format(["country", "year"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset from the combined table.
+    ds_garden = create_dataset(
+        dest_dir,
+        tables=[tb],
+        check_variables_metadata=True,
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
diff --git a/etl/steps/data/garden/hmd/2024-11-19/hfd.py b/etl/steps/data/garden/hmd/2024-11-19/hfd.py
@@ -529,13 +529,17 @@ def add_shifted_to_cohort(tb):
         }
     )
 
-    # TODO: move elsewhere
-    # Build special table
-    years = list(range(1925, tb_period_ages["year"].max() + 1, 5))
-    tb_period_years = tb_period_ages.loc[
-        tb_period_ages["year"].isin(years) & (tb_period_ages["birth_order"] == "total")
-    ].drop(columns=["birth_order"])
+    # Special table: Distribution of period metrics
+    ## Keep only birth_order = total
+    tb_period_years = tb_period_ages.loc[(tb_period_ages["birth_order"] == "total")].drop(columns=["birth_order"])
+    ## Keep only years that are multiples of 5 (from year_min to year_max)
+    year_min = tb_period_years.loc[tb_period_years["year"] % 5 == 0, "year"].min()
+    year_max = tb_period_years["year"].max() + 1
+    years = list(range(year_min, year_max, 5))
+    tb_period_years = tb_period_years.loc[tb_period_years["year"].isin(years)]
+    ## Change age group names 12- -> 12, 55+ -> 55
     tb_period_years["age"] = tb_period_years["age"].str.replace("-", "").str.replace("+", "").astype("UInt8")
+    ## HOTFIX: Name of the dimension
     tb_period_years = tb_period_years.rename(
         columns={
             "year": "year_as_dimension",
@@ -579,14 +583,19 @@ def add_shifted_to_cohort(tb):
         }
     )
 
-    # TODO: move elsewhere
-    # Build special table
-    years = list(range(1925, tb_cohort_ages["cohort"].max() + 1, 5))
-    tb_cohort_years = tb_cohort_ages.loc[
-        tb_cohort_ages["cohort"].isin(years) & (tb_cohort_ages["birth_order"] == "total")
-    ].drop(columns=["birth_order"])
+    # Special table: Distribution of cohort metrics
+    ## Keep only birth_order = total
+    tb_cohort_years = tb_cohort_ages.loc[(tb_cohort_ages["birth_order"] == "total")].drop(columns=["birth_order"])
+    ## Keep only cohorts that are multiples of 5 (from year_min to year_max)
+    year_min = tb_cohort_years.loc[tb_cohort_years["cohort"] % 5 == 0, "cohort"].min()
+    year_max = tb_cohort_years["cohort"].max() + 1
+    years = list(range(year_min, year_max, 5))
+    tb_cohort_years = tb_cohort_years.loc[tb_cohort_years["cohort"].isin(years)]
+    ## Change age group names 12- -> 12, 55+ -> 55
     tb_cohort_years["age"] = tb_cohort_years["age"].str.replace("-", "").str.replace("+", "").astype("UInt8")
-    # Fix 12- vs 12, 55+ vs 55 etc.
+    # 'asfr_cohort' and 'ccfr_cohort' don't always use the same names for the same age groups. E.g. 12- vs 12, 55+ vs 55 etc.
+    # Therefore, these age groups are not aligned after the merge. We fix this by grouping + averaging.
+    # The following check ensures that this is actually the case, so that the groupby.mean makes sense!
     assert tb_cohort_years.groupby(["country", "cohort", "age"])["asfr_cohort"].nunique().max() == 1
     assert tb_cohort_years.groupby(["country", "cohort", "age"])["ccfr_cohort"].nunique().max() == 1
     tb_cohort_years = tb_cohort_years.groupby(["country", "cohort", "age"], as_index=False).mean()

diff --git a/etl/steps/data/garden/un/2024-07-12/un_wpp.meta.yml b/etl/steps/data/garden/un/2024-07-12/un_wpp.meta.yml
@@ -1,8 +1,7 @@
 definitions:
   global:
-    projections:
-      <%- if (variant is defined) and (variant != 'estimates') -%>
-        Projections from 2024 onwards are based on the UN's << variant >> scenario.
+    projections: <%- if (variant is defined) and (variant != 'estimates') -%>
+      Projections from 2024 onwards are based on the UN's << variant >> scenario.
       <%- endif -%>
     dimensions:
       title: |-
@@ -65,13 +64,10 @@ definitions:
         subtitle: "{definitions.global.projections}"
         originUrl: "https://ourworldindata.org/population-growth"
 
-
 # this metadata file is not used in garden step, but in grapher step
 tables:
   population:
-
     variables:
-
       population:
         title: Population
         unit: people
@@ -83,7 +79,6 @@ tables:
           grapher_config:
             note: "Values as of 1 July of the indicated year."
 
-
       population_change:
         title: Population change
         unit: people
@@ -153,7 +148,6 @@ tables:
             subtitle: |-
               The natural growth rate is the population change determined by births and deaths. Migration flows are not taken into account. {definitions.global.projections}
 
-
   fertility_rate:
     variables:
       fertility_rate:
@@ -232,8 +226,7 @@ tables:
           title_public: |-
             Death rate, {definitions.global.dimensions.title}
           grapher_config:
-            subtitle:
-              The number of deaths occurring during the year, per 1,000 people. {definitions.global.projections}
+            subtitle: The number of deaths occurring during the year, per 1,000 people. {definitions.global.projections}
 
   births:
     variables:
@@ -273,18 +266,18 @@ tables:
             subtitle: |-
               The median age divides the population into two parts of equal size; that is, there are as many people with ages above the median age as there are with ages below. {definitions.global.projections}
 
-  # childbearing_age:
-  #   variables:
-  #     childbearing_age:
-  #       title: Mean age at childbearing
-  #       unit: years
-  #       description_short: &cb_description_short |-
-  #         Mean age of mothers at the birth of their children if women were subject throughout their lives to the age-specific fertility rates observed in a given year. {definitions.global.dimensions.description_short}
-  #       presentation:
-  #         title_public: |-
-  #           Mean age at childbearing, {definitions.global.dimensions.title}
-  #         grapher_config:
-  #           subtitle: *cb_description_short
+  mean_age_childbearing:
+    variables:
+      mean_age_childbearing:
+        title: Mean age at childbearing
+        unit: years
+        description_short: &cb_description_short |-
+          Mean age of mothers at the birth of their children if women were subject throughout their lives to the age-specific fertility rates observed in a given year. {definitions.global.dimensions.description_short}
+        presentation:
+          title_public: |-
+            Mean age at childbearing, {definitions.global.dimensions.title}
+          grapher_config:
+            subtitle: *cb_description_short
 
   life_expectancy:
     variables:
@@ -384,7 +377,6 @@ tables:
             Old-age dependency ratio
             <%-endif -%>, {definitions.global.dimensions.title}
 
-
   mortality_rate:
     variables:
       mortality_rate:

diff --git a/etl/steps/data/garden/un/2024-07-12/un_wpp.py b/etl/steps/data/garden/un/2024-07-12/un_wpp.py
@@ -41,7 +41,7 @@ def run(dest_dir: str) -> None:
     tb_median_age = ds_meadow["median_age"].reset_index()
     tb_le = ds_meadow["life_expectancy"].reset_index()
     tb_mortality = ds_meadow["mortality_rate"].reset_index()
-    # tb_childbearing_age = ds_meadow["childbearing_age"].reset_index()
+    tb_childbearing_age = ds_meadow["mean_age_childbearing"].reset_index()
 
     #
     # Process data.
@@ -107,6 +107,11 @@ def run(dest_dir: str) -> None:
     tb_mortality = set_variant_to_estimates(tb_mortality)
     tb_mortality = tb_mortality.format(COLUMNS_INDEX)
 
+    ## Mean age at childbearing
+    tb_childbearing_age = process_standard(tb_childbearing_age)
+    tb_childbearing_age = set_variant_to_estimates(tb_childbearing_age)
+    tb_childbearing_age = tb_childbearing_age.format(COLUMNS_INDEX)
+
     # Build tables list for dataset
     tables = [
         tb_population,
@@ -121,6 +126,7 @@ def run(dest_dir: str) -> None:
         tb_sex_ratio,
         tb_mortality,
         tb_dependency,
+        tb_childbearing_age,
     ]
 
     #

diff --git a/etl/steps/data/grapher/demography/2024-12-18/mean_age_childbearing.py b/etl/steps/data/grapher/demography/2024-12-18/mean_age_childbearing.py
@@ -0,0 +1,28 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden dataset.
+    ds_garden = paths.load_dataset("mean_age_childbearing")
+
+    # Read table from garden dataset.
+    tb = ds_garden.read("mean_age_childbearing", reset_index=False)
+
+    #
+    # Save outputs.
+    #
+    # Create a new grapher dataset with the same metadata as the garden dataset.
+    ds_grapher = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
+    )
+
+    # Save changes in the new grapher dataset.
+    ds_grapher.save()
diff --git a/etl/steps/data/grapher/un/2024-07-12/un_wpp.py b/etl/steps/data/grapher/un/2024-07-12/un_wpp.py
@@ -26,7 +26,7 @@ def run(dest_dir: str) -> None:
         ds_garden["sex_ratio"],
         ds_garden["mortality_rate"],
         ds_garden["dependency_ratio"],
-        # ds_garden["childbearing_age"],
+        ds_garden["mean_age_childbearing"],
     ]
     #
     # Save outputs.

diff --git a/etl/steps/data/meadow/un/2024-07-12/un_wpp.py b/etl/steps/data/meadow/un/2024-07-12/un_wpp.py
@@ -84,6 +84,7 @@ def run(dest_dir: str) -> None:
     tb_death_rate = clean_table_standard_xlsx(tb_main, "Crude Death Rate (deaths per 1,000 population)", "death_rate")
     tb_birth_rate = clean_table_standard_xlsx(tb_main, "Crude Birth Rate (births per 1,000 population)", "birth_rate")
     tb_median_age = clean_table_standard_xlsx(tb_main, "Median Age, as of 1 July (years)", "median_age")
+    tb_macb = clean_table_standard_xlsx(tb_main, "Mean Age Childbearing (years)", "mean_age_childbearing")
     tb_mortality = make_tb_mortality(tb_main)
     tb_le = make_tb_life_expectancy(tb_main)
 
@@ -115,7 +116,7 @@ def run(dest_dir: str) -> None:
         tb_median_age,
         tb_le,
         tb_mortality,
-        # tb_childbearing_age,
+        tb_macb,
         # tb_population_doubling,
     ]
     # Create a new meadow dataset with the same metadata as the snapshot.