diff --git a/dag/poverty_inequality.yml b/dag/poverty_inequality.yml index d44b0e1f3b0..d156dd48257 100644 --- a/dag/poverty_inequality.yml +++ b/dag/poverty_inequality.yml @@ -7,24 +7,24 @@ steps: # Poverty and inequality file for Joe's PhD data://explorers/poverty_inequality/latest/poverty_inequality_export: - - data://garden/wb/2024-01-17/world_bank_pip + - data://garden/wb/2024-03-27/world_bank_pip - data://garden/wid/2023-08-24/world_inequality_database - data://garden/lis/2023-08-30/luxembourg_income_study - data://garden/wb/2024-01-22/thousand_bins_distribution - data://garden/worldbank_wdi/2023-05-29/wdi # World Bank Poverty and Inequality Platform - data://meadow/wb/2024-01-17/world_bank_pip: - - snapshot://wb/2024-01-17/world_bank_pip.csv - - snapshot://wb/2024-01-17/world_bank_pip_percentiles.csv - data://garden/wb/2024-01-17/world_bank_pip: - - data://meadow/wb/2024-01-17/world_bank_pip - data://grapher/wb/2024-01-17/world_bank_pip_2011ppp: - - data://garden/wb/2024-01-17/world_bank_pip - data://grapher/wb/2024-01-17/world_bank_pip_2017ppp: - - data://garden/wb/2024-01-17/world_bank_pip + data://meadow/wb/2024-03-27/world_bank_pip: + - snapshot://wb/2024-03-27/world_bank_pip.csv + - snapshot://wb/2024-03-27/world_bank_pip_percentiles.csv + data://garden/wb/2024-03-27/world_bank_pip: + - data://meadow/wb/2024-03-27/world_bank_pip + data://grapher/wb/2024-03-27/world_bank_pip_2011ppp: + - data://garden/wb/2024-03-27/world_bank_pip + data://grapher/wb/2024-03-27/world_bank_pip_2017ppp: + - data://garden/wb/2024-03-27/world_bank_pip data://explorers/wb/latest/world_bank_pip: - - data://garden/wb/2024-01-17/world_bank_pip + - data://garden/wb/2024-03-27/world_bank_pip # World Inequality Database data://meadow/wid/2023-08-24/world_inequality_database: diff --git a/etl/steps/data/garden/wb/2024-03-27/shared.py b/etl/steps/data/garden/wb/2024-03-27/shared.py new file mode 100644 index 00000000000..d2de965bd94 --- /dev/null +++ b/etl/steps/data/garden/wb/2024-03-27/shared.py @@ -0,0 +1,843 @@ +""" +This file includes functions to get variables metadata in the `world_bank_pip` garden step +If new poverty lines or indicators are included, they need to be addressed here +""" + +from owid.catalog import Table, VariableMeta, VariablePresentationMeta + +# This is text to include in description_key and description_processing fields + +non_market_income_description = "Non-market sources of income, including food grown by subsistence farmers for their own consumption, are taken into account." + +processing_description_relative_poverty = "Measures of relative poverty are not directly available in the World Bank PIP data. To calculate this metric we take the median income or consumption for the country and year, calculate a relative poverty line – in this case {povline} of the median – and then run a specific query on the PIP API to return the share of population below that line." + +processing_description_thr = "Income and consumption thresholds by decile are not directly available in the World Bank PIP API. We extract the metric primarily from [auxiliary percentiles data provided by the World Bank](https://datacatalog.worldbank.org/search/dataset/0063646). Missing country values and regional aggregations of the indicator are calculated by running multiple queries on the API to obtain the closest poverty line to each threshold." + +processing_description_avg = "Income and consumption averages by decile are not directly available in the World Bank PIP API. 
We calculate the metric by multiplying the share of each decile by the mean income or consumption of the distribution and dividing by the population share of the decile (10%)." + +relative_poverty_description = "This is a measure of _relative_ poverty – it captures the share of people whose income is low by the standards typical in their own country." + +ppp_description = "The data is measured in international-$ at {ppp} prices – this adjusts for inflation and for differences in the cost of living between countries." + +processing_description_thr_percentiles = "Missing country values and regional aggregations of the threshold indicator are calculated by running multiple queries on the API to obtain the closest poverty line to each threshold. This data is merged with the percentile files [provided by the World Bank](https://datacatalog.worldbank.org/search/dataset/0063646/_poverty_and_inequality_platform_pip_percentiles)." + + +# Define default tolerance for each variable +TOLERANCE = 5 + +# These are parameters specifically defined for each type of variable +var_dict = { + # POVERTY + "headcount": { + "title": "Number in poverty", + "description": "Number of people in households with an {inc_cons_dict[wel]['name']} per person below {povline}", + "unit": "people", + "short_unit": "", + "numDecimalPlaces": 0, + }, + "headcount_ratio": { + "title": "Share of population in poverty", + "description": "Percentage of population living in households with an {inc_cons_dict[wel]['name']} per person below {povline}", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "total_shortfall": { + "title": "Total daily shortfall", + "description": "This is the amount of money that would be theoretically needed to lift the {inc_cons_dict[wel]['name']} of all people in poverty up to {povline}. However, this is not a measure of the actual cost of eliminating poverty, since it does not take into account the costs involved in making the necessary transfers nor any changes in behaviour they would bring about.", + "unit": "international-$ in {ppp} prices", + "short_unit": "$", + "numDecimalPlaces": 2, + }, + "avg_shortfall": { + "title": "Average shortfall ($)", + "description": "This is the amount of money that would be theoretically needed to lift the {inc_cons_dict[wel]['name']} of all people in poverty up to {povline}, averaged across the population in poverty.", + "unit": "international-$ in {ppp} prices", + "short_unit": "$", + "numDecimalPlaces": 2, + }, + "income_gap_ratio": { + "title": "Average shortfall (%)", + "description": "This is the average shortfall expressed as a share of the poverty line, sometimes called the 'income gap ratio'. It captures the depth of poverty of those living on less than {povline}.", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "poverty_gap_index": { + "title": "Poverty gap index", + "description": "The poverty gap index is a poverty measure that reflects both the prevalence and the depth of poverty. 
It is calculated as the share of population in poverty multiplied by the average shortfall from the poverty line (expressed as a % of the poverty line).", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "watts": { + "title": "Watts index", + "description": "This is the mean across the population of the proportionate poverty gaps, as measured by the log of the ratio of the poverty line to income, where the mean is formed over the whole population, counting the nonpoor as having a zero poverty gap.", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "poverty_severity": { + "title": "Poverty severity", + "description": "It is calculated as the square of the income gap ratio, the average shortfall expressed as a share of the poverty line.", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + # INEQUALITY + "gini": { + "title": "Gini coefficient", + "description": "The [Gini coefficient](#dod:gini) measures inequality on a scale from 0 to 1. Higher values indicate higher inequality.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 2, + }, + "palma_ratio": { + "title": "Palma ratio", + "description": "The Palma ratio is a measure of inequality that divides the share received by the richest 10% by the share of the poorest 40%. Higher values indicate higher inequality.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "s80_s20_ratio": { + "title": "S80/S20 ratio", + "description": "The S80/S20 ratio is a measure of inequality that divides the share received by the richest 20% by the share of the poorest 20%. Higher values indicate higher inequality.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "p90_p10_ratio": { + "title": "P90/P10 ratio", + "description": "P90 and P10 are the levels of {inc_cons_dict[wel]['name']} below which 90% and 10% of the population live, respectively. This variable gives the ratio of the two. It is a measure of inequality that indicates the gap between the richest and poorest tenth of the population.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "p90_p50_ratio": { + "title": "P90/P50 ratio", + "description": "The P90/P50 ratio measures the degree of inequality within the richest half of the population. A ratio of 2 means that someone just falling in the richest tenth of the population has twice the median {inc_cons_dict[wel]['name']}.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "p50_p10_ratio": { + "title": "P50/P10 ratio", + "description": "The P50/P10 ratio measures the degree of inequality within the poorest half of the population. A ratio of 2 means that the median {inc_cons_dict[wel]['name']} is two times higher than that of someone just falling in the poorest tenth of the population.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "mld": { + "title": "Mean log deviation", + "description": "The mean log deviation (MLD) is a measure of inequality. An MLD of zero indicates perfect equality and it takes on larger positive values as incomes become more unequal.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 2, + }, + "polarization": { + "title": "Polarization index", + "description": "The polarization index, also known as the Wolfson polarization index, measures the extent to which the distribution of income or consumption is “spread out” and bi-modal. 
Like the Gini coefficient, the polarization index ranges from 0 (no polarization) to 1 (complete polarization).", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 2, + }, + # DISTRIBUTIONAL INDICATORS + "mean": { + "title": "Mean", + "description": "Mean {inc_cons_dict[wel]['name']}.", + "unit": "international-$ in {ppp} prices", + "short_unit": "$", + "numDecimalPlaces": 2, + }, + "median": { + "title": "Median", + "description": "Median {inc_cons_dict[wel]['name']}.", + "unit": "international-$ in {ppp} prices", + "short_unit": "$", + "numDecimalPlaces": 2, + }, + "avg": { + "title": "Average", + "description": "The mean {inc_cons_dict[wel]['name_distribution']} per year within the {pct_dict[pct]['decile10']} (tenth of the population).", + "unit": "international-$ in {ppp} prices", + "short_unit": "$", + "numDecimalPlaces": 2, + }, + "share": { + "title": "Share", + "description": "The share of {inc_cons_dict[wel]['name_distribution']} {inc_cons_dict[wel]['verb']} by the {pct_dict[pct]['decile10']} (tenth of the population).", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "thr": { + "title": "Threshold", + "description": "The level of {inc_cons_dict[wel]['name_distribution']} per year below which {str(pct)}% of the population falls.", + "unit": "international-$ in {ppp} prices", + "short_unit": "$", + "numDecimalPlaces": 2, + }, + "bottom50_share": { + "title": "Share of the bottom 50%", + "description": "The share of {inc_cons_dict[wel]['name_distribution']} {inc_cons_dict[wel]['verb']} by the poorest 50%.", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "middle40_share": { + "title": "Share of the middle 40%", + "description": "The share of {inc_cons_dict[wel]['name_distribution']} {inc_cons_dict[wel]['verb']} by the middle 40%. The middle 40% is the share of the population whose {inc_cons_dict[wel]['name']} lies between the poorest 50% and the richest 10%.", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, +} + +# Details for each consumption or income variable +inc_cons_dict = { + "income": { + "name": "income", + "name_distribution": "after tax income", + "verb": "received", + "description": "The data relates to income measured after taxes and benefits per capita. 'Per capita' means that the income of each household is attributed equally to each member of the household (including children).", + "processing_description": """To construct a global dataset, the World Bank combines estimates based on income data and estimates based on consumption data. Here we only include the estimates based on income data. + +You can find the data with all available income and consumption data points in our [complete dataset](https://github.com/owid/poverty-data#a-global-dataset-of-poverty-and-inequality-measures-prepared-by-our-world-in-data-from-the-world-banks-poverty-and-inequality-platform-pip-database) of the World Bank PIP data.""", + }, + "consumption": { + "name": "consumption", + "name_distribution": "consumption", + "verb": "spent", + "description": "The data relates to consumption per capita. 'Per capita' means that the consumption of each household is attributed equally to each member of the household (including children).", + "processing_description": """To construct a global dataset, the World Bank combines estimates based on income data and estimates based on consumption data. Here we only include the estimates based on consumption data. 
+ +You can find the data with all available income and consumption data points in our [complete dataset](https://github.com/owid/poverty-data#a-global-dataset-of-poverty-and-inequality-measures-prepared-by-our-world-in-data-from-the-world-banks-poverty-and-inequality-platform-pip-database) of the World Bank PIP data.""", + }, + "income_consumption": { + "name": "income or consumption", + "name_distribution": "after tax income or consumption", + "verb": "received", + "description": "Depending on the country and year, the data relates to income measured after taxes and benefits, or to consumption, per capita. 'Per capita' means that the income of each household is attributed equally to each member of the household (including children).", + "processing_description": """For a small number of country-year observations, the World Bank PIP data contains two estimates: one based on income data and one based on consumption data. In these cases we keep only the consumption estimate in order to obtain a single series for each country. + +You can find the data with all available income and consumption data points, including these overlapping estimates, in our [complete dataset](https://github.com/owid/poverty-data#a-global-dataset-of-poverty-and-inequality-measures-prepared-by-our-world-in-data-from-the-world-banks-poverty-and-inequality-platform-pip-database) of the World Bank PIP data.""", + }, +} + +# Details for each relative poverty line +rel_dict = {40: "40% of the median", 50: "50% of the median", 60: "60% of the median"} + +# Details for each absolute poverty line +abs_dict = { + 2011: { + 100: {"title": "$1 a day", "title_between": "$1", "description_key": ""}, + 190: { + "title": "$1.90 a day", + "title_between": "$1.90", + "description_key": "Extreme poverty here is defined as living below the International Poverty Line of $1.90 per day.", + }, + 320: { + "title": "$3.20 a day", + "title_between": "$3.20", + "description_key": "A poverty line of $3.20 a day represents definitions of national poverty lines in lower-middle-income countries.", + }, + 550: { + "title": "$5.50 a day", + "title_between": "$5.50", + "description_key": "A poverty line of $5.50 a day represents definitions of national poverty lines in upper-middle-income countries.", + }, + 1000: { + "title": "$10 a day", + "title_between": "$10", + "description_key": "", + }, + 2000: { + "title": "$20 a day", + "title_between": "$20", + "description_key": "", + }, + 3000: { + "title": "$30 a day", + "title_between": "$30", + "description_key": "A poverty line of $30 a day represents definitions of national poverty lines in high-income countries.", + }, + 4000: { + "title": "$40 a day", + "title_between": "$40", + "description_key": "", + }, + }, + 2017: { + 100: {"title": "$1 a day", "title_between": "$1", "description_key": ""}, + 215: { + "title": "$2.15 a day", + "title_between": "$2.15", + "description_key": "Extreme poverty here is defined as living below the International Poverty Line of $2.15 per day.", + }, + 365: { + "title": "$3.65 a day", + "title_between": "$3.65", + "description_key": "A poverty line of $3.65 a day represents definitions of national poverty lines in lower-middle-income countries.", + }, + 685: { + "title": "$6.85 a day", + "title_between": "$6.85", + "description_key": "A poverty line of $6.85 a day represents definitions of national poverty lines in upper-middle-income countries.", + }, + 1000: { + "title": "$10 a day", + "title_between": "$10", + "description_key": "", + }, + 2000: { + "title": "$20 
a day", + "title_between": "$20", + "description_key": "", + }, + 3000: { + "title": "$30 a day", + "title_between": "$30", + "description_key": "A poverty line of $30 a day represents definitions of national poverty lines in high-income countries.", + }, + 4000: { + "title": "$40 a day", + "title_between": "$40", + "description_key": "", + }, + }, +} + +# Details for naming each decile/percentile +pct_dict = { + 1: {"decile10": "Poorest decile", "decile9": "Poorest decile"}, + 2: {"decile10": "2nd decile", "decile9": "2nd decile"}, + 3: {"decile10": "3rd decile", "decile9": "3rd decile"}, + 4: {"decile10": "4th decile", "decile9": "4th decile"}, + 5: {"decile10": "5th decile", "decile9": "5th decile"}, + 6: {"decile10": "6th decile", "decile9": "6th decile"}, + 7: {"decile10": "7th decile", "decile9": "7th decile"}, + 8: {"decile10": "8th decile", "decile9": "8th decile"}, + 9: {"decile10": "9th decile", "decile9": "Richest decile"}, + 10: {"decile10": "Richest decile", "decile9": ""}, +} + + +def add_metadata_vars(tb_garden: Table, ppp_version: int, welfare_type: str) -> Table: + """ + Add metadata for each variable in the dataset, using the dictionaries above and the functions below + """ + + # Add short name + tb_garden.metadata.short_name = f"{welfare_type}_{ppp_version}" + + # Create a list from abs_dict + povline_list = list(abs_dict[ppp_version].keys()) + + # Get a list of all the variables available + cols = list(tb_garden.columns) + + for var in var_dict: + # For variables uniquely defined for each country-year-welfare type-reporting level (mostly inequality indicators + mean and median) + col_name = f"{var}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_inequality_mean_median(var, origins, welfare_type) + + # Replace placeholders + tb_garden[col_name].metadata.description_short = ( + tb_garden[col_name] + .metadata.description_short.replace("{inc_cons_dict[wel]['name']}", inc_cons_dict[welfare_type]["name"]) + .replace("{inc_cons_dict[wel]['name_distribution']}", inc_cons_dict[welfare_type]["name_distribution"]) + .replace("{inc_cons_dict[wel]['verb']}", inc_cons_dict[welfare_type]["verb"]) + ) + + tb_garden[col_name].metadata.description_key = [ + ppp.replace("{ppp}", str(ppp_version)) for ppp in tb_garden[col_name].metadata.description_key + ] + + tb_garden[col_name].metadata.unit = tb_garden[col_name].metadata.unit.replace("{ppp}", str(ppp_version)) + + for povline in povline_list: + # For variables that use absolute poverty lines + col_name = f"{var}_{povline}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_absolute_povlines( + var, povline, origins, ppp_version, welfare_type + ) + + # Replace placeholders + tb_garden[col_name].metadata.description_short = ( + tb_garden[col_name] + .metadata.description_short.replace("{povline}", abs_dict[ppp_version][povline]["title"]) + .replace("{inc_cons_dict[wel]['name']}", inc_cons_dict[welfare_type]["name"]) + ) + + tb_garden[col_name].metadata.description_key = [ + ppp.replace("{ppp}", str(ppp_version)) for ppp in tb_garden[col_name].metadata.description_key + ] + + tb_garden[col_name].metadata.unit = tb_garden[col_name].metadata.unit.replace("{ppp}", str(ppp_version)) + + # For variables above poverty lines + col_name = 
f"{var}_above_{povline}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_absolute_povlines( + var, povline, origins, ppp_version, welfare_type + ) + + # Replace placeholders + tb_garden[col_name].metadata.description_short = ( + tb_garden[col_name] + .metadata.description_short.replace("{povline}", abs_dict[ppp_version][povline]["title"]) + .replace("{inc_cons_dict[wel]['name']}", inc_cons_dict[welfare_type]["name"]) + ) + + tb_garden[col_name].metadata.description_key = [ + ppp.replace("{ppp}", str(ppp_version)) for ppp in tb_garden[col_name].metadata.description_key + ] + + tb_garden[col_name].metadata.unit = tb_garden[col_name].metadata.unit.replace("{ppp}", str(ppp_version)) + + # Replace "below" with "above" in the description + tb_garden[col_name].metadata.description_short = tb_garden[col_name].metadata.description_short.replace( + "below", "above" + ) + + # Replace "in poverty" with "not in poverty" in the title + tb_garden[col_name].metadata.title = tb_garden[col_name].metadata.title.replace( + "in poverty", "not in poverty" + ) + + # Replicate the title in the display name and title_public + tb_garden[col_name].metadata.display["name"] = tb_garden[col_name].metadata.title + tb_garden[col_name].metadata.presentation = VariablePresentationMeta( + title_public=tb_garden[col_name].metadata.title + ) + + for i in range(len(povline_list)): + if i != 0: + # For variables between poverty lines + col_name = f"{var}_between_{povline_list[i-1]}_{povline_list[i]}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_between_absolute_povlines( + var, povline_list[i - 1], povline_list[i], origins, ppp_version, welfare_type + ) + + # For variables between poverty lines that jump the original order + col_name = f"{var}_between_{povline_list[1]}_{povline_list[4]}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_between_absolute_povlines( + var, povline_list[1], povline_list[4], origins, ppp_version, welfare_type + ) + + col_name = f"{var}_between_{povline_list[4]}_{povline_list[6]}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_between_absolute_povlines( + var, povline_list[4], povline_list[6], origins, ppp_version, welfare_type + ) + + for rel in rel_dict: + # For variables that use relative poverty lines + col_name = f"{var}_{rel}_median" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_relative_povlines(var, rel, origins, welfare_type) + + # Replace placeholders + tb_garden[col_name].metadata.description_short = ( + tb_garden[col_name] + .metadata.description_short.replace("{povline}", rel_dict[rel]) + .replace("{inc_cons_dict[wel]['name']}", inc_cons_dict[welfare_type]["name"]) + ) + + for pct in pct_dict: + # For variables that use percentiles (deciles) + col_name = f"decile{pct}_{var}" + + if col_name in cols: + # Get the origins of the variable + origins = 
tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_percentiles(var, pct, origins, ppp_version, welfare_type) + + # Replace placeholders + tb_garden[col_name].metadata.description_short = ( + tb_garden[col_name] + .metadata.description_short.replace("{str(pct)}", f"{str(pct)}0") + .replace( + "{inc_cons_dict[wel]['name_distribution']}", + inc_cons_dict[welfare_type]["name_distribution"], + ) + .replace("{inc_cons_dict[wel]['verb']}", inc_cons_dict[welfare_type]["verb"]) + .replace("{pct_dict[pct]['decile10']}", pct_dict[pct]["decile10"].lower()) + ) + + tb_garden[col_name].metadata.description_key = [ + ppp.replace("{ppp}", str(ppp_version)) for ppp in tb_garden[col_name].metadata.description_key + ] + tb_garden[col_name].metadata.unit = tb_garden[col_name].metadata.unit.replace("{ppp}", str(ppp_version)) + + return tb_garden + + +# Metadata functions to show a clearer main code +def var_metadata_inequality_mean_median(var, origins, welfare_type) -> VariableMeta: + """ + Create metadata for defined uniquely by their name + """ + # For monetary variables I include PPP description + if var in ["mean", "median"]: + meta = VariableMeta( + title=f"{var_dict[var]['title']} {inc_cons_dict[welfare_type]['name']}", + description_short=var_dict[var]["description"], + description_key=[ + ppp_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + meta.display = { + "name": meta.title, + "numDecimalPlaces": var_dict[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + else: + meta = VariableMeta( + title=f"{var_dict[var]['title']}", + description_short=var_dict[var]["description"], + description_key=[ + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + meta.display = { + "name": meta.title, + "numDecimalPlaces": var_dict[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + return meta + + +def var_metadata_absolute_povlines(var, povline, origins, ppp_version, welfare_type) -> VariableMeta: + """ + Create metadata for variables with absolute poverty lines + """ + # Define the list of description_key, to then remove the empty ones + description_key_list = [ + abs_dict[ppp_version][povline]["description_key"], + ppp_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ] + + # Remove empty strings from the list + description_key_list = list(filter(None, description_key_list)) + + meta = VariableMeta( + title=f"{abs_dict[ppp_version][povline]['title']} - {var_dict[var]['title']}", + description_short=var_dict[var]["description"], + description_key=description_key_list, + description_processing=f"""{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + meta.display = { + "name": meta.title, + "numDecimalPlaces": var_dict[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = 
VariablePresentationMeta(title_public=meta.title) + + return meta + + +def var_metadata_between_absolute_povlines(var, povline1, povline2, origins, ppp_version, welfare_type) -> VariableMeta: + """ + Create metadata for variables between poverty lines + """ + + meta = VariableMeta( + title=f"{abs_dict[ppp_version][povline1]['title_between']}-{abs_dict[ppp_version][povline2]['title_between']} - {var_dict[var]['title']}", + description_short=var_dict[var]["description"], + description_key=[ + ppp_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"].replace("{ppp}", str(ppp_version)), + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + meta.display = { + "name": meta.title, + "numDecimalPlaces": var_dict[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + meta.description_short = meta.description_short.replace( + "{povline}", + f"living between {abs_dict[ppp_version][povline1]['title_between']} and {abs_dict[ppp_version][povline2]['title_between']} a day", + ).replace("{inc_cons_dict[wel]['name']}", inc_cons_dict[welfare_type]["name"]) + + meta.description_key = [ppp.replace("{ppp}", str(ppp_version)) for ppp in meta.description_key] + + meta.unit = meta.unit.replace("{ppp}", str(ppp_version)) + + return meta + + +def var_metadata_relative_povlines(var, rel, origins, welfare_type) -> VariableMeta: + """ + Create metadata for variables with relative poverty lines + """ + + meta = VariableMeta( + title=f"{rel_dict[rel]} - {var_dict[var]['title']}", + description_short=var_dict[var]["description"], + description_key=[ + relative_poverty_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{processing_description_relative_poverty} + +{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + meta.display = { + "name": meta.title, + "numDecimalPlaces": var_dict[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + return meta + + +def var_metadata_percentiles(var, pct, origins, ppp_version, welfare_type) -> VariableMeta: + """ + Create metadata for variables with percentiles + """ + + if var == "thr": + meta = VariableMeta( + title=f"{pct_dict[pct]['decile9']} - {var_dict[var]['title']}", + description_short=var_dict[var]["description"], + description_key=[ + ppp_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{processing_description_thr} + +{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + + elif var == "avg": + meta = VariableMeta( + title=f"{pct_dict[pct]['decile10']} - {var_dict[var]['title']}", + description_short=var_dict[var]["description"], + description_key=[ + ppp_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{processing_description_avg} + +{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + + # For shares + else: + meta = 
VariableMeta( + title=f"{pct_dict[pct]['decile10']} - {var_dict[var]['title']}", + description_short=var_dict[var]["description"], + description_key=[ + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + + meta.display = { + "name": meta.title, + "numDecimalPlaces": var_dict[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + return meta + + +# FOR PERCENTILES +def add_metadata_vars_percentiles(tb_garden: Table, ppp_version: int, welfare_type: str) -> Table: + """ + Add metadata for each variable in the dataset, using the dictionaries above and the functions below + This is done for the percentile tables + """ + + # Add short name + tb_garden.metadata.short_name = f"percentiles_{welfare_type}_{ppp_version}" + + # Get a list of all the variables available + cols = list(tb_garden.columns) + + for var in var_dict: + # For variables uniquely defined for each country-year-welfare type-reporting level (mostly inequality indicators + mean and median) + col_name = f"{var}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_percentile_table(var, origins, welfare_type) + + # Replace placeholders + tb_garden[col_name].metadata.description_short = ( + tb_garden[col_name] + .metadata.description_short.replace("{str(pct)}", "each 1") + .replace( + "{inc_cons_dict[wel]['name_distribution']}", + inc_cons_dict[welfare_type]["name_distribution"], + ) + .replace("{inc_cons_dict[wel]['verb']}", inc_cons_dict[welfare_type]["verb"]) + .replace( + "the {pct_dict[pct]['decile10']} (tenth of the population)", + "each percentile (hundredth of the population)", + ) + ) + tb_garden[col_name].metadata.description_key = [ + ppp.replace("{ppp}", str(ppp_version)) for ppp in tb_garden[col_name].metadata.description_key + ] + + tb_garden[col_name].metadata.unit = tb_garden[col_name].metadata.unit.replace("{ppp}", str(ppp_version)) + + return tb_garden + + +def var_metadata_percentile_table(var, origins, welfare_type) -> VariableMeta: + """ + Create metadata for variables with percentiles + """ + + if var == "thr": + meta = VariableMeta( + title=f"{inc_cons_dict[welfare_type]['name'].capitalize()} {var_dict[var]['title'].lower()}", + description_short=var_dict[var]["description"], + description_key=[ + ppp_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{processing_description_thr_percentiles}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + + elif var == "avg": + meta = VariableMeta( + title=f"{inc_cons_dict[welfare_type]['name'].capitalize()} {var_dict[var]['title'].lower()}", + description_short=var_dict[var]["description"], + description_key=[ + ppp_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing="", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + + # For shares + else: + meta = VariableMeta( + title=f"{inc_cons_dict[welfare_type]['name'].capitalize()} {var_dict[var]['title'].lower()}", + 
description_short=var_dict[var]["description"], + description_key=[ + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing="", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + + meta.display = { + "name": meta.title, + "numDecimalPlaces": var_dict[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + return meta diff --git a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json new file mode 100644 index 00000000000..73342a8a395 --- /dev/null +++ b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json @@ -0,0 +1,181 @@ +{ + "Albania": "Albania", + "Algeria": "Algeria", + "Angola": "Angola", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bangladesh": "Bangladesh", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo, Dem. Rep.": "Democratic Republic of Congo", + "Congo, Rep.": "Congo", + "Costa Rica": "Costa Rica", + "Cote d'Ivoire": "Cote d'Ivoire", + "Croatia": "Croatia", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt, Arab Rep.": "Egypt", + "El Salvador": "El Salvador", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia, The": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran, Islamic Rep.": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Korea, Rep.": "South Korea", + "Kosovo": "Kosovo", + "Kyrgyz Republic": "Kyrgyzstan", + "Lao PDR": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia, Fed. 
Sts.": "Micronesia (country)", + "Moldova": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Pakistan": "Pakistan", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Samoa": "Samoa", + "Sao Tome and Principe": "Sao Tome and Principe", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Slovak Republic": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "St. Lucia": "Saint Lucia", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United States": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela, RB": "Venezuela", + "Viet Nam": "Vietnam", + "West Bank and Gaza": "Palestine", + "World": "World", + "Yemen, Rep.": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "East Asia & Pacific": "East Asia and Pacific (PIP)", + "Eastern and Southern Africa": "Eastern and Southern Africa (PIP)", + "Europe & Central Asia": "Europe and Central Asia (PIP)", + "Latin America & Caribbean": "Latin America and the Caribbean (PIP)", + "Middle East & North Africa": "Middle East and North Africa (PIP)", + "Other High Income Countries": "Other high income countries (PIP)", + "South Asia": "South Asia (PIP)", + "Sub-Saharan Africa": "Sub-Saharan Africa (PIP)", + "Taiwan, China": "Taiwan", + "Turkiye": "Turkey", + "Western and Central Africa": "Western and Central Africa (PIP)" +} \ No newline at end of file diff --git a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.meta.yml b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.meta.yml new file mode 100644 index 00000000000..a6eba01d529 --- /dev/null +++ b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.meta.yml @@ -0,0 +1,473 @@ +# NOTE: To learn more about the fields, hover over their names. 
+definitions: + common: + presentation: + topic_tags: + - Poverty + - Economic Inequality + - Economic Growth + attribution_short: World Bank + grapher_config: + originUrl: https://ourworldindata.org/poverty + $schema: https://files.ourworldindata.org/schemas/grapher-schema.003.json + processing_level: major + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 180 + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + income_consumption_2017: + # Learn more about the available fields: + # http://docs.owid.io/projects/etl/architecture/metadata/reference/indicator/ + variables: + headcount_ratio_215: + presentation: + title_public: Share of population living in extreme poverty + faqs: + - fragment_id: poverty-international-poverty-line + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-international-dollars + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-regional-estimates + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: Share of population living in extreme poverty + subtitle: >- + Extreme poverty is defined as living below the International Poverty Line of + $2.15 per day. This data is adjusted for inflation and for differences in the + cost of living between countries. + note: >- + This data is expressed in [international-$](#dod:int_dollar_abbreviation) at + 2017 prices. Depending on the country and year, it relates to income measured + after taxes and benefits, or to consumption, [per capita](#dod:per-capita). + hasMapTab: true + tab: map + variantName: Line chart + yAxis: + min: 0 + map: + time: 2019 + colorScale: + baseColorScheme: OrRd + binningStrategy: manual + customNumericValues: + - 3 + - 10 + - 20 + - 30 + - 40 + - 50 + - 60 + - 70 + - 80 + - 90 + - 100 + selectedEntityNames: + - Bangladesh + - Bolivia + - Madagascar + - India + - China + - Ethiopia + + headcount_ratio_365: + presentation: + title_public: "Poverty: Share of population living on less than $3.65 a day" + faqs: + - fragment_id: poverty-international-poverty-line + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-international-dollars + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-regional-estimates + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: "Poverty: Share of population living on less than $3.65 a day" + subtitle: >- + The poverty line of $3.65 per day is set by the World Bank to be representative of the definitions of poverty adopted in lower-middle-income countries. This data is adjusted for inflation and for differences in the cost of living between countries. + note: >- + This data is expressed in [international-$](#dod:int_dollar_abbreviation) at + 2017 prices. Depending on the country and year, it relates to income measured + after taxes and benefits, or to consumption, [per capita](#dod:per-capita). 
+ hasMapTab: true + tab: map + variantName: Line chart + yAxis: + min: 0 + map: + time: 2019 + colorScale: + baseColorScheme: OrRd + binningStrategy: manual + customNumericValues: + - 10 + - 20 + - 30 + - 40 + - 50 + - 60 + - 70 + - 80 + - 90 + - 100 + selectedEntityNames: + - Bangladesh + - Bolivia + - Madagascar + - India + - China + - Ethiopia + + headcount_ratio_685: + presentation: + title_public: "Poverty: Share of population living on less than $6.85 a day" + faqs: + - fragment_id: poverty-international-poverty-line + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-international-dollars + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-regional-estimates + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: "Poverty: Share of population living on less than $6.85 a day" + subtitle: >- + The poverty line of $6.85 per day is set by the World Bank to be representative of the definitions of poverty adopted in upper-middle-income countries. This data is adjusted for inflation and for differences in the cost of living between countries. + note: >- + This data is expressed in [international-$](#dod:int_dollar_abbreviation) at + 2017 prices. Depending on the country and year, it relates to income measured + after taxes and benefits, or to consumption, [per capita](#dod:per-capita). + hasMapTab: true + tab: map + variantName: Line chart + yAxis: + min: 0 + map: + time: 2019 + colorScale: + baseColorScheme: OrRd + binningStrategy: manual + customNumericValues: + - 10 + - 20 + - 30 + - 40 + - 50 + - 60 + - 70 + - 80 + - 90 + - 100 + selectedEntityNames: + - Bangladesh + - Bolivia + - Madagascar + - India + - China + - Ethiopia + + + headcount_ratio_3000: + presentation: + title_public: Share of population living on less than $30 a day + faqs: + - fragment_id: poverty-international-dollars + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-regional-estimates + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: "Poverty: Share of population living on less than $30 a day" + subtitle: >- + This data is adjusted for inflation and for differences in the cost of living between countries. + note: >- + This data is expressed in [international-$](#dod:int_dollar_abbreviation) at 2017 prices. Depending on the country and year, it relates to income measured after taxes and benefits, or to consumption, [per capita](#dod:per-capita). 
+ hasMapTab: true + tab: map + variantName: Line chart + yAxis: + min: 0 + map: + time: 2019 + colorScale: + baseColorScheme: OrRd + binningStrategy: manual + customNumericValues: + - 10 + - 20 + - 30 + - 40 + - 50 + - 60 + - 70 + - 80 + - 90 + - 100 + selectedEntityNames: + - Bangladesh + - Bolivia + - Madagascar + - India + - China + - Ethiopia + + headcount_ratio_60_median: + presentation: + title_public: Share of population below 60% of median income or consumption + topic_tags: + - Poverty + - Economic Inequality + faqs: + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: 'Relative poverty: Share of people below 60% of median income' + subtitle: >- + Relative poverty is measured in terms of a poverty line that rises and falls + over time with average incomes — in this case set at 60% of median income. + note: >- + Depending on the country and year, the data relates to income measured after + taxes and benefits, or to consumption, [per capita](#dod:per-capita). + hasMapTab: true + tab: map + yAxis: + min: 0 + colorScale: + baseColorScheme: OwidDistinctLines + map: + time: 2019 + colorScale: + baseColorScheme: YlOrBr + binningStrategy: manual + customNumericValues: + - 5 + - 10 + - 15 + - 20 + - 25 + - 30 + - 35 + selectedEntityNames: + - Bangladesh + - Bolivia + - Madagascar + - India + - China + - Ethiopia + + headcount_ratio_50_median: + presentation: + title_public: Share of population below 50% of median income or consumption + topic_tags: + - Poverty + - Economic Inequality + faqs: + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: 'Relative poverty: Share of people below 50% of median income' + subtitle: Relative poverty is measured in terms of a poverty line that rises and falls over time with average incomes – in this case set at 50% of median income. + note: >- + Depending on the country and year, the data relates to income measured after + taxes and benefits, or to consumption, [per capita](#dod:per-capita). + hasMapTab: true + tab: map + yAxis: + min: 0 + colorScale: + baseColorScheme: OwidDistinctLines + map: + time: 2019 + colorScale: + baseColorScheme: YlOrBr + binningStrategy: manual + customNumericValues: + - 3 + - 6 + - 9 + - 12 + - 15 + - 18 + - 21 + - 24 + - 27 + selectedEntityNames: + - Bangladesh + - Bolivia + - Madagascar + - India + - China + - Ethiopia + + headcount_ratio_40_median: + presentation: + title_public: Share of population below 40% of median income or consumption + topic_tags: + - Poverty + - Economic Inequality + faqs: + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: 'Relative poverty: Share of people below 40% of median income' + subtitle: Relative poverty is measured in terms of a poverty line that rises and falls over time with average incomes – in this case set at 40% of median income. + note: >- + Depending on the country and year, the data relates to income measured after + taxes and benefits, or to consumption, [per capita](#dod:per-capita). 
+ hasMapTab: true + tab: map + yAxis: + min: 0 + colorScale: + baseColorScheme: OwidDistinctLines + map: + time: 2019 + colorScale: + baseColorScheme: YlOrBr + binningStrategy: manual + customNumericValues: + - 2 + - 4 + - 6 + - 8 + - 10 + - 12 + - 14 + - 16 + - 18 + - 20 + selectedEntityNames: + - Bangladesh + - Bolivia + - Madagascar + - India + - China + - Ethiopia + + gini: + presentation: + title_public: Gini Coefficient + topic_tags: + - Poverty + - Economic Inequality + faqs: + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: 'Income inequality: Gini coefficient' + subtitle: >- + The [Gini coefficient](#dod:gini) measures inequality on a scale from 0 to 1. + Higher values indicate higher inequality. Depending on the country and year, + the data relates to income measured after taxes and benefits, or to + consumption, [per capita](#dod:per-capita). + note: >- + Income and consumption estimates are available separately in this [Data + Explorer](https://ourworldindata.org/explorers/pip-inequality-explorer). + hasMapTab: true + tab: map + variantName: World Bank + originUrl: https://ourworldindata.org/economic-inequality + yAxis: + min: 0 + map: + time: 2019 + colorScale: + baseColorScheme: Oranges + binningStrategy: manual + customNumericMinValue: 1 + customNumericValues: + - 0.3 + - 0.35 + - 0.4 + - 0.45 + - 0.5 + - 0.55 + - 0.6 + selectedEntityNames: + - Chile + - Brazil + - South Africa + - United States + - France + - China + + headcount_215_regions: + title: $2.15 a day - Number in poverty (Regional aggregates) + unit: "people" + short_unit: "" + description_short: Number of people in households with an income or consumption per person below $2.15 a day. + description_key: + - Extreme poverty here is defined as living below the International Poverty Line of $2.15 per day. + - The data is measured in international-$ at 2017 prices – this adjusts for inflation and for differences in the cost of living between countries. + - Depending on the country and year, the data relates to income measured after taxes and benefits, or to consumption, per capita. 'Per capita' means that the income of each household is attributed equally to each member of the household (including children). + - Non-market sources of income, including food grown by subsistence farmers for their own consumption, are taken into account. + description_processing: |- + PIP provides regional aggregate figures for the number of people living below the International Poverty Line. Unfortunately, for certain regions and years the data survey coverage is too low and the results are suppressed. From 1990 onwards, it is only for South Asia and Sub-Saharan Africa (on different years) that regional estimates are sometimes missing. + + For these years we calculate the number of poor in the region as the difference between the estimated total number of poor across the world and the sum of the number of poor across all other regions. + + Prior to 1990 estimates for more than one region are missing, precluding this method. 
+        display:
+          numDecimalPlaces: 0
+        presentation:
+          title_public: Total population living in extreme poverty by world region
+          topic_tags:
+            - Poverty
+            - Economic Growth
+            - Economic Inequality
+          faqs:
+            - fragment_id: poverty-international-poverty-line
+              gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw
+            - fragment_id: poverty-international-dollars
+              gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw
+            - fragment_id: poverty-comparability
+              gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw
+            - fragment_id: poverty-regional-estimates
+              gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw
+          grapher_config:
+            title: Total population living in extreme poverty by world region
+            subtitle: >-
+              Extreme poverty is defined as living below the International Poverty Line of
+              $2.15 per day. This data is adjusted for inflation and for differences in the
+              cost of living between countries.
+            note: >-
+              This data is expressed in [international-$](#dod:int_dollar_abbreviation) at
+              2017 prices. Depending on the country and year, it relates to income measured
+              after taxes and benefits, or to consumption, [per capita](#dod:per-capita).
+            type: StackedArea
+            addCountryMode: disabled
+            hideRelativeToggle: false
+            originUrl: https://ourworldindata.org/poverty
+            baseColorScheme: OwidCategoricalC
+            invertColorScheme: true
+            yAxis:
+              min: 0
+            selectedEntityNames:
+              - Other high income countries (PIP)
+              - Latin America and the Caribbean (PIP)
+              - East Asia and Pacific (PIP)
+              - South Asia (PIP)
+              - Middle East and North Africa (PIP)
+              - Europe and Central Asia (PIP)
+              - Sub-Saharan Africa (PIP)
+
+      surveys_past_decade:
+        title: Number of surveys in the past decade
+        unit: "surveys"
+        short_unit: ""
+        description_short: The number of income or consumption surveys available in the past decade. Each decade comprises the current year and the nine years before.
+        description_processing: |-
+          For a small number of country-year observations, the World Bank PIP data contains two estimates: one based on income data and one based on consumption data. In these cases we keep only the consumption estimate in order to obtain a single series for each country. This means the indicator estimates the number of years in the past decade in which at least one survey was conducted, rather than the number of surveys.
+        display:
+          numDecimalPlaces: 0
\ No newline at end of file
diff --git a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py
new file mode 100644
index 00000000000..065f31c6a0f
--- /dev/null
+++ b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py
@@ -0,0 +1,1106 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from typing import Tuple
+
+import numpy as np
+import owid.catalog.processing as pr
+from owid.catalog import Table
+from shared import add_metadata_vars, add_metadata_vars_percentiles
+from structlog import get_logger
+from tabulate import tabulate
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
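# A minimal, self-contained sketch of the rolling ten-year count that the
# `surveys_past_decade` indicator (defined in world_bank_pip.meta.yml above)
# describes. The actual `survey_count` function lives further down this file,
# outside this excerpt, so the logic below is an illustrative assumption rather
# than the PR's implementation: for each country-year, count the years in
# [year - 9, year] with at least one income or consumption survey.
import pandas as pd


def surveys_past_decade_sketch(df: pd.DataFrame) -> pd.DataFrame:
    """`df` has one row per country-year in which a survey was conducted."""
    survey_years = df.drop_duplicates(subset=["country", "year"])
    rows = []
    for country, group in survey_years.groupby("country"):
        years = set(group["year"])
        for year in range(min(years), max(years) + 1):
            # A decade comprises the current year and the nine years before it.
            n_surveys = sum(1 for y in range(year - 9, year + 1) if y in years)
            rows.append({"country": country, "year": year, "surveys_past_decade": n_surveys})
    return pd.DataFrame(rows)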
+
+# Initialize logger.
+log = get_logger()
+
+# Define absolute poverty lines used depending on PPP version
+# NOTE: Modify if poverty lines are updated from source
+povlines_dict = {
+    2011: [100, 190, 320, 550, 1000, 2000, 3000, 4000],
+    2017: [100, 215, 365, 685, 1000, 2000, 3000, 4000],
+}
+
+# Define regions in the dataset
+regions_list = [
+    "East Asia and Pacific (PIP)",
+    "Eastern and Southern Africa (PIP)",
+    "Europe and Central Asia (PIP)",
+    "Latin America and the Caribbean (PIP)",
+    "Middle East and North Africa (PIP)",
+    "Other high income countries (PIP)",
+    "South Asia (PIP)",
+    "Sub-Saharan Africa (PIP)",
+    "Western and Central Africa (PIP)",
+    "World",
+]
+
+# Set table format when printing
+TABLEFMT = "pretty"
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("world_bank_pip")
+
+    # Read tables from meadow dataset.
+    # Key indicators
+    tb = ds_meadow["world_bank_pip"].reset_index()
+
+    # Percentiles
+    tb_percentiles = ds_meadow["world_bank_pip_percentiles"].reset_index()
+
+    # Process data
+    # Make table wide and change column names
+    tb = process_data(tb)
+
+    # Calculate inequality measures
+    tb = calculate_inequality(tb)
+
+    # Harmonize country names
+    tb: Table = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)
+    tb_percentiles: Table = geo.harmonize_countries(df=tb_percentiles, countries_file=paths.country_mapping_path)
+
+    # Show regional data from 1990 onwards
+    tb = regional_data_from_1990(tb, regions_list)
+    tb_percentiles = regional_data_from_1990(tb_percentiles, regions_list)
+
+    # Amend the entity to reflect if data refers to urban or rural only
+    tb = identify_rural_urban(tb)
+
+    # Separate out ppp and filled data from the main dataset
+    tb_2011, tb_2017 = separate_ppp_data(tb)
+    tb_percentiles_2011, tb_percentiles_2017 = separate_ppp_data(tb_percentiles)
+
+    # Create stacked variables from headcount and headcount_ratio
+    tb_2011, col_stacked_n_2011, col_stacked_pct_2011 = create_stacked_variables(
+        tb_2011, povlines_dict, ppp_version=2011
+    )
+    tb_2017, col_stacked_n_2017, col_stacked_pct_2017 = create_stacked_variables(
+        tb_2017, povlines_dict, ppp_version=2017
+    )
+
+    # Sanity checks. These are not run for the percentile tables, because those checks were already done during extraction.
+    tb_2011 = sanity_checks(
+        tb_2011, povlines_dict, ppp_version=2011, col_stacked_n=col_stacked_n_2011, col_stacked_pct=col_stacked_pct_2011
+    )
+    tb_2017 = sanity_checks(
+        tb_2017, povlines_dict, ppp_version=2017, col_stacked_n=col_stacked_n_2017, col_stacked_pct=col_stacked_pct_2017
+    )
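# `create_stacked_variables` is also defined further down this file, outside
# this excerpt. A sketch of the differencing it implies, assuming cumulative
# `headcount_{line}` columns already exist; the output column names follow the
# `{var}_between_{p1}_{p2}` pattern documented in shared.py, everything else
# here is illustrative:


def stacked_between_sketch(tb, povlines):
    """Derive the number of people living between consecutive poverty lines."""
    col_stacked_n = []
    for low, high in zip(povlines[:-1], povlines[1:]):
        col = f"headcount_between_{low}_{high}"
        # People below `high` who are not also below `low`.
        tb[col] = tb[f"headcount_{high}"] - tb[f"headcount_{low}"]
        col_stacked_n.append(col)
    return tb, col_stacked_n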
Also, create a table with both income and consumption + tb_inc_2011, tb_cons_2011, tb_inc_or_cons_2011 = inc_or_cons_data(tb_2011) + tb_inc_2017, tb_cons_2017, tb_inc_or_cons_2017 = inc_or_cons_data(tb_2017) + + # Create regional headcount variable, by patching missing values with the difference between world and regional headcount + tb_inc_or_cons_2017 = regional_headcount(tb_inc_or_cons_2017) + + # Create survey count dataset, by counting the number of surveys available for each country in the past decade + tb_inc_or_cons_2017 = survey_count(tb_inc_or_cons_2017) + + # Add metadata by code + tb_inc_2011 = add_metadata_vars(tb_garden=tb_inc_2011, ppp_version=2011, welfare_type="income") + tb_cons_2011 = add_metadata_vars(tb_garden=tb_cons_2011, ppp_version=2011, welfare_type="consumption") + tb_inc_or_cons_2011 = add_metadata_vars( + tb_garden=tb_inc_or_cons_2011, + ppp_version=2011, + welfare_type="income_consumption", + ) + + tb_inc_2017 = add_metadata_vars(tb_garden=tb_inc_2017, ppp_version=2017, welfare_type="income") + tb_cons_2017 = add_metadata_vars(tb_garden=tb_cons_2017, ppp_version=2017, welfare_type="consumption") + tb_inc_or_cons_2017 = add_metadata_vars( + tb_garden=tb_inc_or_cons_2017, + ppp_version=2017, + welfare_type="income_consumption", + ) + + tb_percentiles_2011 = add_metadata_vars_percentiles( + tb_garden=tb_percentiles_2011, + ppp_version=2011, + welfare_type="income_consumption", + ) + tb_percentiles_2017 = add_metadata_vars_percentiles( + tb_garden=tb_percentiles_2017, + ppp_version=2017, + welfare_type="income_consumption", + ) + + # Set index and sort + # Define index cols + index_cols = ["country", "year"] + index_cols_percentiles = ["country", "year", "reporting_level", "welfare_type", "percentile"] + tb_inc_2011 = set_index_and_sort(tb=tb_inc_2011, index_cols=index_cols) + tb_cons_2011 = set_index_and_sort(tb=tb_cons_2011, index_cols=index_cols) + tb_inc_or_cons_2011 = set_index_and_sort(tb=tb_inc_or_cons_2011, index_cols=index_cols) + + tb_inc_2017 = set_index_and_sort(tb=tb_inc_2017, index_cols=index_cols) + tb_cons_2017 = set_index_and_sort(tb=tb_cons_2017, index_cols=index_cols) + tb_inc_or_cons_2017 = set_index_and_sort(tb=tb_inc_or_cons_2017, index_cols=index_cols) + + tb_percentiles_2011 = set_index_and_sort(tb=tb_percentiles_2011, index_cols=index_cols_percentiles) + tb_percentiles_2017 = set_index_and_sort(tb=tb_percentiles_2017, index_cols=index_cols_percentiles) + + # Create spell tables to separate different survey spells in the explorers + spell_tables_inc = create_survey_spells(tb=tb_inc_2017) + spell_tables_cons = create_survey_spells(tb=tb_cons_2017) + spell_tables_inc_or_cons = create_survey_spells(tb=tb_inc_or_cons_2017) + + # Drop columns not needed + tb_inc_2011 = drop_columns(tb_inc_2011) + tb_cons_2011 = drop_columns(tb_cons_2011) + tb_inc_or_cons_2011 = drop_columns(tb_inc_or_cons_2011) + + tb_inc_2017 = drop_columns(tb_inc_2017) + tb_cons_2017 = drop_columns(tb_cons_2017) + tb_inc_or_cons_2017 = drop_columns(tb_inc_or_cons_2017) + + # Merge tables for PPP comparison explorer + tb_inc_2011_2017 = combine_tables_2011_2017(tb_2011=tb_inc_2011, tb_2017=tb_inc_2017, short_name="income_2011_2017") + tb_cons_2011_2017 = combine_tables_2011_2017( + tb_2011=tb_cons_2011, tb_2017=tb_cons_2017, short_name="consumption_2011_2017" + ) + tb_inc_or_cons_2011_2017 = combine_tables_2011_2017( + tb_2011=tb_inc_or_cons_2011, tb_2017=tb_inc_or_cons_2017, short_name="income_consumption_2011_2017" + ) + + # Define tables to upload + # The ones we need in 
Grapher admin would be tb_inc_or_cons_2011, tb_inc_or_cons_2017 + tables = ( + [ + tb_inc_2011, + tb_cons_2011, + tb_inc_or_cons_2011, + tb_inc_2017, + tb_cons_2017, + tb_inc_or_cons_2017, + tb_inc_2011_2017, + tb_cons_2011_2017, + tb_inc_or_cons_2011_2017, + tb_percentiles_2011, + tb_percentiles_2017, + ] + + spell_tables_inc + + spell_tables_cons + + spell_tables_inc_or_cons + ) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=tables, + check_variables_metadata=True, + default_metadata=ds_meadow.metadata, + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def process_data(tb: Table) -> Table: + # rename columns + tb = tb.rename(columns={"headcount": "headcount_ratio", "poverty_gap": "poverty_gap_index"}) + + # Changing the decile(i) variables for decile(i)_share + for i in range(1, 11): + tb = tb.rename(columns={f"decile{i}": f"decile{i}_share"}) + + # Calculate number in poverty + tb["headcount"] = tb["headcount_ratio"] * tb["reporting_pop"] + tb["headcount"] = tb["headcount"].round(0) + + # Calculate shortfall of incomes + tb["total_shortfall"] = tb["poverty_gap_index"] * tb["poverty_line"] * tb["reporting_pop"] + + # Calculate average shortfall of incomes (averaged across population in poverty) + tb["avg_shortfall"] = tb["total_shortfall"] / tb["headcount"] + + # Calculate income gap ratio (according to Ravallion's definition) + tb["income_gap_ratio"] = (tb["total_shortfall"] / tb["headcount"]) / tb["poverty_line"] + + # Same for relative poverty + for pct in [40, 50, 60]: + tb[f"headcount_{pct}_median"] = tb[f"headcount_ratio_{pct}_median"] * tb["reporting_pop"] + tb[f"headcount_{pct}_median"] = tb[f"headcount_{pct}_median"].round(0) + tb[f"total_shortfall_{pct}_median"] = ( + tb[f"poverty_gap_index_{pct}_median"] * tb["median"] * pct / 100 * tb["reporting_pop"] + ) + tb[f"avg_shortfall_{pct}_median"] = tb[f"total_shortfall_{pct}_median"] / tb[f"headcount_{pct}_median"] + tb[f"income_gap_ratio_{pct}_median"] = (tb[f"total_shortfall_{pct}_median"] / tb[f"headcount_{pct}_median"]) / ( + tb["median"] * pct / 100 + ) + + # Shares to percentages + # executing the function over list of vars + pct_indicators = [ + "headcount_ratio", + "income_gap_ratio", + "poverty_gap_index", + "headcount_ratio_40_median", + "headcount_ratio_50_median", + "headcount_ratio_60_median", + "income_gap_ratio_40_median", + "income_gap_ratio_50_median", + "income_gap_ratio_60_median", + "poverty_gap_index_40_median", + "poverty_gap_index_50_median", + "poverty_gap_index_60_median", + ] + tb.loc[:, pct_indicators] = tb[pct_indicators] * 100 + + # Create a new column for the poverty line in cents and string + tb["poverty_line_cents"] = round(tb["poverty_line"] * 100).astype(int).astype(str) + + # Make the table wide, with poverty_line_cents as columns + tb = tb.pivot( + index=[ + "ppp_version", + "country", + "year", + "reporting_level", + "welfare_type", + "survey_comparability", + "comparable_spell", + "reporting_pop", + "mean", + "median", + "mld", + "gini", + "polarization", + "decile1_share", + "decile2_share", + "decile3_share", + "decile4_share", + "decile5_share", + "decile6_share", + "decile7_share", + "decile8_share", + "decile9_share", + "decile10_share", + "decile1_thr", + "decile2_thr", + "decile3_thr", + "decile4_thr", + "decile5_thr", + "decile6_thr", + "decile7_thr", + "decile8_thr", + "decile9_thr", + "is_interpolated", + "distribution_type", + "estimation_type", + 
"headcount_40_median", + "headcount_50_median", + "headcount_60_median", + "headcount_ratio_40_median", + "headcount_ratio_50_median", + "headcount_ratio_60_median", + "income_gap_ratio_40_median", + "income_gap_ratio_50_median", + "income_gap_ratio_60_median", + "poverty_gap_index_40_median", + "poverty_gap_index_50_median", + "poverty_gap_index_60_median", + "avg_shortfall_40_median", + "avg_shortfall_50_median", + "avg_shortfall_60_median", + "total_shortfall_40_median", + "total_shortfall_50_median", + "total_shortfall_60_median", + "poverty_severity_40_median", + "poverty_severity_50_median", + "poverty_severity_60_median", + "watts_40_median", + "watts_50_median", + "watts_60_median", + ], + columns="poverty_line_cents", + values=[ + "headcount", + "headcount_ratio", + "income_gap_ratio", + "poverty_gap_index", + "avg_shortfall", + "total_shortfall", + "poverty_severity", + "watts", + ], + ) + + # Flatten column names + tb.columns = ["_".join(col).strip() for col in tb.columns.values] + + # Reset index + tb = tb.reset_index() + + return tb + + +def create_stacked_variables(tb: Table, povlines_dict: dict, ppp_version: int) -> Tuple[Table, list, list]: + """ + Create stacked variables from the indicators to plot them as stacked area/bar charts + """ + # Select poverty lines between 2011 and 2017 and sort in case they are not in order + povlines = povlines_dict[ppp_version] + povlines.sort() + + # Above variables + + col_above_n = [] + col_above_pct = [] + + for p in povlines: + varname_n = f"headcount_above_{p}" + varname_pct = f"headcount_ratio_above_{p}" + + tb[varname_n] = tb["reporting_pop"] - tb[f"headcount_{p}"] + tb[varname_pct] = tb[varname_n] / tb["reporting_pop"] + + col_above_n.append(varname_n) + col_above_pct.append(varname_pct) + + tb.loc[:, col_above_pct] = tb[col_above_pct] * 100 + + # Stacked variables + + col_stacked_n = [] + col_stacked_pct = [] + + for i in range(len(povlines)): + # if it's the first value only continue + if i == 0: + continue + + # If it's the last value calculate the people between this value and the previous + # and also the people over this poverty line (and percentages) + elif i == len(povlines) - 1: + varname_n = f"headcount_between_{povlines[i-1]}_{povlines[i]}" + varname_pct = f"headcount_ratio_between_{povlines[i-1]}_{povlines[i]}" + tb[varname_n] = tb[f"headcount_{povlines[i]}"] - tb[f"headcount_{povlines[i-1]}"] + tb[varname_pct] = tb[varname_n] / tb["reporting_pop"] + col_stacked_n.append(varname_n) + col_stacked_pct.append(varname_pct) + varname_n = f"headcount_above_{povlines[i]}" + varname_pct = f"headcount_ratio_above_{povlines[i]}" + tb[varname_n] = tb["reporting_pop"] - tb[f"headcount_{povlines[i]}"] + tb[varname_pct] = tb[varname_n] / tb["reporting_pop"] + col_stacked_n.append(varname_n) + col_stacked_pct.append(varname_pct) + + # If it's any value between the first and the last calculate the people between this value and the previous (and percentage) + else: + varname_n = f"headcount_between_{povlines[i-1]}_{povlines[i]}" + varname_pct = f"headcount_ratio_between_{povlines[i-1]}_{povlines[i]}" + tb[varname_n] = tb[f"headcount_{povlines[i]}"] - tb[f"headcount_{povlines[i-1]}"] + tb[varname_pct] = tb[varname_n] / tb["reporting_pop"] + col_stacked_n.append(varname_n) + col_stacked_pct.append(varname_pct) + + tb.loc[:, col_stacked_pct] = tb[col_stacked_pct] * 100 + + # Add variables below first poverty line to the stacked variables + col_stacked_n.append(f"headcount_{povlines[0]}") + 
col_stacked_pct.append(f"headcount_ratio_{povlines[0]}")
+
+    # Calculate stacked variables which "jump" the original order
+
+    tb[f"headcount_between_{povlines[1]}_{povlines[4]}"] = (
+        tb[f"headcount_{povlines[4]}"] - tb[f"headcount_{povlines[1]}"]
+    )
+    tb[f"headcount_between_{povlines[4]}_{povlines[6]}"] = (
+        tb[f"headcount_{povlines[6]}"] - tb[f"headcount_{povlines[4]}"]
+    )
+
+    tb[f"headcount_ratio_between_{povlines[1]}_{povlines[4]}"] = (
+        tb[f"headcount_ratio_{povlines[4]}"] - tb[f"headcount_ratio_{povlines[1]}"]
+    )
+    tb[f"headcount_ratio_between_{povlines[4]}_{povlines[6]}"] = (
+        tb[f"headcount_ratio_{povlines[6]}"] - tb[f"headcount_ratio_{povlines[4]}"]
+    )
+
+    return tb, col_stacked_n, col_stacked_pct
+
+
+def calculate_inequality(tb: Table) -> Table:
+    """
+    Calculate inequality measures: decile averages and ratios
+    """
+
+    col_decile_share = []
+    col_decile_avg = []
+    col_decile_thr = []
+
+    for i in range(1, 11):
+        if i != 10:
+            varname_thr = f"decile{i}_thr"
+            col_decile_thr.append(varname_thr)
+
+        varname_share = f"decile{i}_share"
+        varname_avg = f"decile{i}_avg"
+        tb[varname_avg] = tb[varname_share] * tb["mean"] / 0.1
+
+        col_decile_share.append(varname_share)
+        col_decile_avg.append(varname_avg)
+
+    # Multiply decile share columns by 100
+    tb.loc[:, col_decile_share] = tb[col_decile_share] * 100
+
+    # Create bottom 50% and middle 40% shares
+    tb["bottom50_share"] = (
+        tb["decile1_share"] + tb["decile2_share"] + tb["decile3_share"] + tb["decile4_share"] + tb["decile5_share"]
+    )
+    tb["middle40_share"] = tb["decile6_share"] + tb["decile7_share"] + tb["decile8_share"] + tb["decile9_share"]
+
+    # Palma ratio and other average/share ratios
+    tb["palma_ratio"] = tb["decile10_share"] / (
+        tb["decile1_share"] + tb["decile2_share"] + tb["decile3_share"] + tb["decile4_share"]
+    )
+    tb["s80_s20_ratio"] = (tb["decile9_share"] + tb["decile10_share"]) / (tb["decile1_share"] + tb["decile2_share"])
+    tb["p90_p10_ratio"] = tb["decile9_thr"] / tb["decile1_thr"]
+    tb["p90_p50_ratio"] = tb["decile9_thr"] / tb["decile5_thr"]
+    tb["p50_p10_ratio"] = tb["decile5_thr"] / tb["decile1_thr"]
+
+    # Replace infinite values with nulls
+    tb = tb.replace([np.inf, -np.inf], np.nan)
+    return tb
+
+
+def identify_rural_urban(tb: Table) -> Table:
+    """
+    Amend the entity to reflect if data refers to urban or rural only
+    """
+
+    # Make country and reporting_level columns into strings
+    tb["country"] = tb["country"].astype(str)
+    tb["reporting_level"] = tb["reporting_level"].astype(str)
+    ix = tb["reporting_level"].isin(["urban", "rural"])
+    tb.loc[(ix), "country"] = tb.loc[(ix), "country"] + " (" + tb.loc[(ix), "reporting_level"] + ")"
+
+    return tb
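A toy illustration of the decile-based ratios computed in `calculate_inequality` above, with made-up shares and thresholds already expressed in the units the function produces:

```python
# Toy example of the ratio formulas used above (values are invented).
import pandas as pd

tb = pd.DataFrame(
    {
        "decile1_share": [2.0],
        "decile2_share": [3.0],
        "decile3_share": [4.0],
        "decile4_share": [5.0],
        "decile10_share": [35.0],
        "decile1_thr": [5.0],
        "decile5_thr": [20.0],
        "decile9_thr": [60.0],
    }
)

# Palma ratio: share of the top 10% over the share of the bottom 40%.
tb["palma_ratio"] = tb["decile10_share"] / (
    tb["decile1_share"] + tb["decile2_share"] + tb["decile3_share"] + tb["decile4_share"]
)
# P90/P10: ratio of the ninth to the first decile threshold.
tb["p90_p10_ratio"] = tb["decile9_thr"] / tb["decile1_thr"]
print(tb[["palma_ratio", "p90_p10_ratio"]])  # 2.5 and 12.0
```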
def sanity_checks(
+    tb: Table, povlines_dict: dict, ppp_version: int, col_stacked_n: list, col_stacked_pct: list
+) -> Table:
+    """
+    Sanity checks for the table
+    """
+
+    # Select the poverty lines for the given PPP version and sort them in case they are not in order
+    povlines = povlines_dict[ppp_version]
+    povlines.sort()
+
+    # Save the number of observations before the checks
+    obs_before_checks = len(tb)
+
+    # Create lists of variables to check
+    col_headcount = []
+    col_headcount_ratio = []
+    col_povertygap = []
+    col_tot_shortfall = []
+    col_watts = []
+    col_poverty_severity = []
+    col_decile_share = []
+    col_decile_thr = []
+
+    for p in povlines:
+        col_headcount.append(f"headcount_{p}")
+        col_headcount_ratio.append(f"headcount_ratio_{p}")
+        col_povertygap.append(f"poverty_gap_index_{p}")
+        col_tot_shortfall.append(f"total_shortfall_{p}")
+        col_watts.append(f"watts_{p}")
+        col_poverty_severity.append(f"poverty_severity_{p}")
+
+    for i in range(1, 11):
+        col_decile_share.append(f"decile{i}_share")
+        if i != 10:
+            col_decile_thr.append(f"decile{i}_thr")
+
+    ############################
+    # Negative values
+    mask = (
+        tb[
+            col_headcount
+            + col_headcount_ratio
+            + col_povertygap
+            + col_tot_shortfall
+            + col_watts
+            + col_poverty_severity
+            + col_decile_share
+            + col_decile_thr
+            + ["mean", "median", "mld", "gini", "polarization"]
+        ]
+        .lt(0)
+        .any(axis=1)
+    )
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.fatal(
+            f"""There are {len(tb_error)} observations with negative values! In
+            {tabulate(tb_error[['country', 'year', 'reporting_level', 'welfare_type']], headers = 'keys', tablefmt = TABLEFMT)}"""
+        )
+        # NOTE: Check if we want to delete these observations
+        # tb = tb[~mask].reset_index(drop=True)
+
+    ############################
+    # Stacked values not adding up to 100%
+    tb["sum_pct"] = tb[col_stacked_pct].sum(axis=1)
+    mask = (tb["sum_pct"] >= 100.1) | (tb["sum_pct"] <= 99.9)
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.warning(
+            f"""{len(tb_error)} observations where stacked values do not add up to 100% will be deleted:
+            {tabulate(tb_error[['country', 'year', 'reporting_level', 'welfare_type', 'sum_pct']], headers = 'keys', tablefmt = TABLEFMT, floatfmt=".1f")}"""
+        )
+        tb = tb[~mask].reset_index(drop=True).copy()
+
+    ############################
+    # Missing poverty values (headcount, poverty gap, total shortfall)
+    cols_to_check = (
+        col_headcount + col_headcount_ratio + col_povertygap + col_tot_shortfall + col_stacked_n + col_stacked_pct
+    )
+    mask = tb[cols_to_check].isna().any(axis=1)
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.warning(
+            f"""There are {len(tb_error)} observations with missing poverty values; they will be deleted:
+            {tabulate(tb_error[['country', 'year', 'reporting_level', 'welfare_type'] + col_headcount], headers = 'keys', tablefmt = TABLEFMT)}"""
+        )
+        tb = tb[~mask].reset_index(drop=True)
+
+    ############################
+    # Missing median
+    mask = tb["median"].isna()
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.info(f"""There are {len(tb_error)} observations with missing median. They will not be deleted.""")
+
+    ############################
+    # Missing mean
+    mask = tb["mean"].isna()
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.info(f"""There are {len(tb_error)} observations with missing mean. They will not be deleted.""")
+
+    ############################
+    # Missing gini
+    mask = tb["gini"].isna()
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.info(f"""There are {len(tb_error)} observations with missing gini. They will not be deleted.""")
+
+    ############################
+    # Missing decile shares
+    mask = tb[col_decile_share].isna().any(axis=1)
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.info(f"""There are {len(tb_error)} observations with missing decile shares. They will not be deleted.""")
+
+    ############################
+    # Missing decile thresholds
+    mask = tb[col_decile_thr].isna().any(axis=1)
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.info(
+            f"""There are {len(tb_error)} observations with missing decile thresholds.
+            They will not be deleted."""
+        )
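The monotonicity checks that follow build one boolean `m_check_*` column per poverty line. An equivalent, more compact formulation, shown only as a sketch with toy data rather than a suggested change:

```python
# Sketch: headcounts at increasing poverty lines must be non-decreasing.
# diff(axis=1) compares each line's column with the previous one in a
# single call, mirroring the m_check_* construction below.
import pandas as pd

col_headcount = ["headcount_100", "headcount_215", "headcount_365"]
tb = pd.DataFrame(
    {
        "headcount_100": [10, 5],
        "headcount_215": [20, 4],  # second row violates monotonicity
        "headcount_365": [30, 8],
    }
)

is_monotonic = tb[col_headcount].diff(axis=1).iloc[:, 1:].ge(0).all(axis=1)
print(is_monotonic.tolist())  # [True, False]
```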
+
+    ############################
+    # Headcount monotonicity check
+    m_check_vars = []
+    for i in range(len(col_headcount)):
+        if i > 0:
+            check_varname = f"m_check_{i}"
+            tb[check_varname] = tb[f"{col_headcount[i]}"] >= tb[f"{col_headcount[i-1]}"]
+            m_check_vars.append(check_varname)
+    tb["check_total"] = tb[m_check_vars].all(axis=1)
+
+    tb_error = tb[~tb["check_total"]].reset_index(drop=True)
+
+    if not tb_error.empty:
+        log.warning(
+            f"""There are {len(tb_error)} observations with headcount not monotonically increasing; they will be deleted:
+            {tabulate(tb_error[['country', 'year', 'reporting_level', 'welfare_type'] + col_headcount], headers = 'keys', tablefmt = TABLEFMT, floatfmt="0.0f")}"""
+        )
+        tb = tb[tb["check_total"]].reset_index(drop=True)
+
+    ############################
+    # Threshold monotonicity check
+    m_check_vars = []
+    for i in range(1, 10):
+        if i > 1:
+            check_varname = f"m_check_{i}"
+            tb[check_varname] = tb[f"decile{i}_thr"] >= tb[f"decile{i-1}_thr"]
+            m_check_vars.append(check_varname)
+
+    tb["check_total"] = tb[m_check_vars].all(axis=1)
+
+    # Flag rows that fail the check, unless all decile thresholds are null (those rows are kept)
+    mask = (~tb["check_total"]) & (tb[col_decile_thr].notnull().any(axis=1))
+
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.warning(
+            f"""There are {len(tb_error)} observations with thresholds not monotonically increasing; they will be deleted:
+            {tabulate(tb_error[['country', 'year', 'reporting_level', 'welfare_type']], headers = 'keys', tablefmt = TABLEFMT)}"""
+        )
+        tb = tb[~mask].reset_index(drop=True)
+
+    ############################
+    # Shares monotonicity check
+    m_check_vars = []
+    for i in range(1, 11):
+        if i > 1:
+            check_varname = f"m_check_{i}"
+            tb[check_varname] = tb[f"decile{i}_share"] >= tb[f"decile{i-1}_share"]
+            m_check_vars.append(check_varname)
+
+    tb["check_total"] = tb[m_check_vars].all(axis=1)
+
+    # Flag rows that fail the check, unless all decile shares are null (those rows are kept)
+    mask = (~tb["check_total"]) & (tb[col_decile_share].notnull().any(axis=1))
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.warning(
+            f"""There are {len(tb_error)} observations with shares not monotonically increasing; they will be deleted:
+            {tabulate(tb_error[['country', 'year', 'reporting_level', 'welfare_type'] + col_decile_share], headers = 'keys', tablefmt = TABLEFMT, floatfmt=".1f")}"""
+        )
+        tb = tb[~mask].reset_index(drop=True)
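One pitfall in the share-sum check below: in pandas, `&` binds more tightly than `|`, so the combined mask needs explicit parentheses around the `|` expression (the mask below is written accordingly). A small demonstration with toy series:

```python
# `a | b & c` parses as `a | (b & c)`, not `(a | b) & c`.
import pandas as pd

sum_pct = pd.Series([100.5, 99.0])
has_data = pd.Series([False, False])

wrong = (sum_pct >= 100.1) | (sum_pct <= 99.9) & has_data
right = ((sum_pct >= 100.1) | (sum_pct <= 99.9)) & has_data
print(wrong.tolist())  # [True, False]: the first row slips past the has_data guard
print(right.tolist())  # [False, False]
```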
+
+    ############################
+    # Shares not adding up to 100%
+
+    tb["sum_pct"] = tb[col_decile_share].sum(axis=1)
+
+    # Flag rows where shares do not add up to 100%, unless all decile shares are null (those rows are kept)
+    mask = ((tb["sum_pct"] >= 100.1) | (tb["sum_pct"] <= 99.9)) & (tb[col_decile_share].notnull().any(axis=1))
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.warning(
+            f"""{len(tb_error)} observations where decile shares do not add up to 100% will be deleted:
+            {tabulate(tb_error[['country', 'year', 'reporting_level', 'welfare_type', 'sum_pct']], headers = 'keys', tablefmt = TABLEFMT, floatfmt=".1f")}"""
+        )
+        tb = tb[~mask].reset_index(drop=True)
+
+    ############################
+    # Delete columns created for the checks
+    tb = tb.drop(columns=m_check_vars + ["m_check_1", "check_total", "sum_pct"])
+
+    obs_after_checks = len(tb)
+    log.info(f"Sanity checks deleted {obs_before_checks - obs_after_checks} observations for {ppp_version} PPPs.")
+
+    return tb
+
+
+def separate_ppp_data(tb: Table) -> Tuple[Table, Table]:
+    """
+    Separate out ppp data from the main dataset
+    """
+
+    # Filter table to include only the right ppp_version
+    # Also, drop columns with all NaNs (which are the ones that are not relevant for the ppp_version)
+    tb_2011 = tb[tb["ppp_version"] == 2011].dropna(axis=1, how="all").reset_index(drop=True).copy()
+    tb_2017 = tb[tb["ppp_version"] == 2017].dropna(axis=1, how="all").reset_index(drop=True).copy()
+
+    return tb_2011, tb_2017
+
+
+def inc_or_cons_data(tb: Table) -> Tuple[Table, Table, Table]:
+    """
+    Separate income and consumption data
+    """
+
+    # Separate out consumption-only, income-only. Also, create a table with both income and consumption
+    tb_inc = tb[tb["welfare_type"] == "income"].reset_index(drop=True).copy()
+    tb_cons = tb[tb["welfare_type"] == "consumption"].reset_index(drop=True).copy()
+    tb_inc_or_cons = tb.copy()
+
+    # If both inc and cons are available in a given year, drop inc
+
+    # Flag duplicates – indicating multiple welfare_types
+    # Sort values to ensure the welfare_type consumption is marked as False when there are multiple welfare types
+    tb_inc_or_cons = tb_inc_or_cons.sort_values(
+        by=["ppp_version", "country", "year", "reporting_level", "welfare_type"], ignore_index=True
+    )
+    tb_inc_or_cons["duplicate_flag"] = tb_inc_or_cons.duplicated(
+        subset=["ppp_version", "country", "year", "reporting_level"]
+    )
+
+    # Drop income where income and consumption are available
+    tb_inc_or_cons = tb_inc_or_cons[
+        (~tb_inc_or_cons["duplicate_flag"]) | (tb_inc_or_cons["welfare_type"] == "consumption")
+    ]
+    tb_inc_or_cons.drop(columns=["duplicate_flag"], inplace=True)
+
+    # print(f'After dropping duplicates there were {len(tb_inc_or_cons)} rows.')
+
+    tb_inc_or_cons = check_jumps_in_grapher_dataset(tb_inc_or_cons)
+
+    return tb_inc, tb_cons, tb_inc_or_cons
+
+
+def regional_headcount(tb: Table) -> Table:
+    """
+    Create regional headcount dataset, by patching missing values with the difference between world and regional headcount
+    """
+
+    # Keep only regional data: for regions, these are the reporting_level rows not in ['national', 'urban', 'rural']
+    tb_regions = tb[~tb["reporting_level"].isin(["national", "urban", "rural"])].reset_index(drop=True).copy()
+
+    # Remove Western and Central and Eastern and Southern Africa. 
It's redundant with Sub-Saharan Africa (PIP) + tb_regions = tb_regions[ + ~tb_regions["country"].isin(["Western and Central Africa (PIP)", "Eastern and Southern Africa (PIP)"]) + ].reset_index(drop=True) + + # Select needed columns and pivot + tb_regions = tb_regions[["country", "year", "headcount_215"]] + tb_regions = tb_regions.pivot(index="year", columns="country", values="headcount_215") + + # Drop rows with more than one region with null headcount + tb_regions["check_total"] = tb_regions[tb_regions.columns].isnull().sum(axis=1) + mask = tb_regions["check_total"] > 1 + + tb_out = tb_regions[mask].reset_index() + if len(tb_out) > 0: + log.info( + f"""There are {len(tb_out)} years with more than one null region value and will be deleted: + {list(tb_out.year.unique())}""" + ) + tb_regions = tb_regions[~mask].reset_index() + tb_regions = tb_regions.drop(columns="check_total") + + # Get difference between world and (total) regional headcount, to patch rows with one missing value + cols_to_sum = [e for e in list(tb_regions.columns) if e not in ["year", "World"]] + tb_regions["sum_regions"] = tb_regions[cols_to_sum].sum(axis=1) + + tb_regions["diff_world_regions"] = tb_regions["World"] - tb_regions["sum_regions"] + + # Fill null values with the difference and drop aux variables + col_dictionary = dict.fromkeys(cols_to_sum, tb_regions["diff_world_regions"]) + tb_regions.loc[:, cols_to_sum] = tb_regions[cols_to_sum].fillna(col_dictionary) + tb_regions = tb_regions.drop(columns=["World", "sum_regions", "diff_world_regions"]) + + # NOTE: I am not extracting data for China and India at least for now, because we are only extracting non filled data + # The data originally came from filled data to plot properly. + + # # Get headcount values for China and India + # df_chn_ind = tb[(tb["country"].isin(["China", "India"])) & (tb["reporting_level"] == "national")].reset_index( + # drop=True + # ) + # df_chn_ind = df_chn_ind[["country", "year", "headcount_215"]] + + # # Make table wide and merge with regional data + # df_chn_ind = df_chn_ind.pivot(index="year", columns="country", values="headcount_215").reset_index() + # tb_regions = pr.merge(tb_regions, df_chn_ind, on="year", how="left") + + # tb_regions["East Asia and Pacific excluding China"] = ( + # tb_regions["East Asia and Pacific (PIP)"] - tb_regions["China"] + # ) + # tb_regions["South Asia excluding India"] = tb_regions["South Asia (PIP)"] - tb_regions["India"] + + tb_regions = pr.melt(tb_regions, id_vars=["year"], var_name="country", value_name="headcount_215") + tb_regions = tb_regions[["country", "year", "headcount_215"]] + + # Rename headcount_215 to headcount_215_region, to distinguish it from the original headcount_215 when merging + tb_regions = tb_regions.rename(columns={"headcount_215": "headcount_215_regions"}) + + # Merge with original table + tb = pr.merge(tb, tb_regions, on=["country", "year"], how="outer") + + return tb + + +def survey_count(tb: Table) -> Table: + """ + Create survey count indicator, by counting the number of surveys available for each country in the past decade + """ + # Remove regions from the table + tb_survey = tb[~tb["reporting_level"].isnull()].reset_index(drop=True).copy() + + min_year = int(tb_survey["year"].min()) + max_year = int(tb_survey["year"].max()) + year_list = list(range(min_year, max_year + 1)) + country_list = list(tb_survey["country"].unique()) + + # Create two tables with all the years and entities + year_tb_survey = Table(year_list) + entity_tb_survey = Table(country_list) + + # Make a 
cartesian product of both dataframes: join all the combinations between all the entities and all the years + cross = pr.merge(entity_tb_survey, year_tb_survey, how="cross") + cross = cross.rename(columns={"0_x": "country", "0_y": "year"}) + + # Merge cross and df_country, to include all the possible rows in the dataset + tb_survey = pr.merge(cross, tb_survey[["country", "year"]], on=["country", "year"], how="left", indicator=True) + + # Mark with 1 if there are surveys available, 0 if not (this is done by checking if the row is in both datasets) + tb_survey["survey_available"] = 0 + tb_survey.loc[tb_survey["_merge"] == "both", "survey_available"] = 1 + + # Sum for each entity the surveys available for the previous 9 years and the current year + tb_survey["surveys_past_decade"] = ( + tb_survey["survey_available"] + .groupby(tb_survey["country"], sort=False) + .rolling(min_periods=1, window=10) + .sum() + .values + ) + + # Copy metadata + tb_survey["surveys_past_decade"] = tb_survey["surveys_past_decade"].copy_metadata(tb["reporting_level"]) + + # Keep columns needed + tb_survey = tb_survey[["country", "year", "surveys_past_decade"]] + + # Merge with original table + tb = pr.merge(tb_survey, tb, on=["country", "year"], how="left") + + return tb + + +def set_index_and_sort(tb: Table, index_cols: list) -> Table: + """ + Add index and sort + """ + + tb = tb.set_index(index_cols, verify_integrity=True).sort_index() + + return tb + + +def drop_columns(tb: Table) -> Table: + """ + Drop columns not needed + """ + + # Remove columns + tb = tb.drop( + columns=[ + "ppp_version", + "reporting_pop", + "is_interpolated", + "distribution_type", + "estimation_type", + "survey_comparability", + "comparable_spell", + ] + ) + + return tb + + +def create_survey_spells(tb: Table) -> list: + """ + Create tables for each indicator and survey spells, to be able to graph them in explorers. + """ + + tb = tb.copy() + + # drop rows where survey coverage = nan (This is just regions) + tb = tb[tb["survey_comparability"].notna()].reset_index() + + # Add 1 to make comparability var run from 1, not from 0 + tb["survey_comparability"] += 1 + + # Note the welfare type in the comparability spell + tb["survey_comparability"] = ( + tb["welfare_type"].astype(str) + "_spell_" + tb["survey_comparability"].astype(int).astype(str) + ) + + # Remove columns not needed: stacked, above, etc + drop_list = ["above", "between", "poverty_severity", "watts"] + for var in drop_list: + tb = tb[tb.columns.drop(list(tb.filter(like=var)))] + + vars = [ + i + for i in tb.columns + if i + not in [ + "country", + "year", + "ppp_version", + "reporting_level", + "welfare_type", + "reporting_pop", + "is_interpolated", + "distribution_type", + "estimation_type", + "survey_comparability", + "comparable_spell", + "headcount_215_regions", + "surveys_past_decade", + ] + ] + + # Define spell table list + spell_tables = [] + + # Loop over the variables in the main dataset + for select_var in vars: + tb_var = tb[["country", "year", select_var, "survey_comparability"]].copy() + + # convert to wide + tb_var = pr.pivot( + tb_var, + index=["country", "year"], + columns=["survey_comparability"], + values=select_var, + ) + + tb_var.metadata.short_name = f"{tb_var.metadata.short_name}_{select_var}" + + spell_tables.append(tb_var) + + return spell_tables + + +def combine_tables_2011_2017(tb_2011: Table, tb_2017: Table, short_name: str) -> Table: + """ + Combine income and consumption tables from 2011 and 2017 PPPs. 
+    We will use this table for the Poverty Data Explorer: World Bank data - 2011 vs. 2017 prices.
+    """
+
+    # Identify columns to use (ID + indicators)
+    id_cols = ["country", "year"]
+
+    tb_2011 = define_columns_for_ppp_comparison(tb=tb_2011, id_cols=id_cols, ppp_version=2011)
+    tb_2017 = define_columns_for_ppp_comparison(tb=tb_2017, id_cols=id_cols, ppp_version=2017)
+
+    # Rename all the non-id columns with the suffix _ppp(year)
+    # (the suffix option in merge only adds suffix when columns coincide)
+    tb_2011 = tb_2011.rename(columns={c: c + "_ppp2011" for c in tb_2011.columns if c not in id_cols})
+    tb_2017 = tb_2017.rename(columns={c: c + "_ppp2017" for c in tb_2017.columns if c not in id_cols})
+
+    # Merge the two files (it's OK to have an inner join, because we want to keep country-year pairs that are in both files)
+    tb_2011_2017 = pr.merge(tb_2011, tb_2017, on=id_cols, validate="one_to_one", short_name=short_name)
+
+    # Add index and sort
+    tb_2011_2017 = tb_2011_2017.set_index(["country", "year"], verify_integrity=True).sort_index()
+
+    return tb_2011_2017
+
+
+def define_columns_for_ppp_comparison(tb: Table, id_cols: list, ppp_version: int) -> Table:
+    """
+    Define columns to use for the comparison of 2011 and 2017 PPPs
+    """
+
+    tb = tb.reset_index()
+    # Define poverty lines
+    povlines_list = povlines_dict[ppp_version]
+
+    # Define groups of columns
+    headcount_absolute_cols = [f"headcount_{p}" for p in povlines_list]
+    headcount_ratio_absolute_cols = [f"headcount_ratio_{p}" for p in povlines_list]
+
+    headcount_relative_cols = [f"headcount_{rel}_median" for rel in [40, 50, 60]]
+    headcount_ratio_relative_cols = [f"headcount_ratio_{rel}_median" for rel in [40, 50, 60]]
+
+    # Define all the columns to filter
+
+    cols_list = (
+        id_cols
+        + headcount_absolute_cols
+        + headcount_ratio_absolute_cols
+        + headcount_relative_cols
+        + headcount_ratio_relative_cols
+        + ["mean", "median", "decile1_thr", "decile9_thr"]
+    )
+
+    # Filter columns
+    tb = tb[cols_list]
+
+    return tb
+
+
+def regional_data_from_1990(tb: Table, regions_list: list) -> Table:
+    """
+    Select regional data only from 1990 onwards, due to the uncertainty in 1980s data
+    """
+    # Create a regions table
+    tb_regions = tb[(tb["year"] >= 1990) & (tb["country"].isin(regions_list))].reset_index(drop=True).copy()
+
+    # Remove regions from tb
+    tb = tb[~tb["country"].isin(regions_list)].reset_index(drop=True).copy()
+
+    # Concatenate both tables
+    tb = pr.concat([tb, tb_regions], ignore_index=True)
+    return tb
+
+
+def check_jumps_in_grapher_dataset(tb: Table) -> Table:
+    """
+    Check for jumps in the dataset, which can be caused by combining income and consumption estimates for one country series. 
+ """ + # For each country, year, welfare_type and reporting_level, check if the difference between the columns is too high + + # Define columns to check: all the headcount ratio columns + cols_to_check = [ + col for col in tb.columns if "headcount_ratio" in col and "above" not in col and "between" not in col + ] + + for col in cols_to_check: + # Create a new column, shift_col, that is the same as col but shifted one row down for each country, year, welfare_type and reporting_level + tb["shift_col"] = tb.groupby(["country", "reporting_level"])[col].shift(1) + + # Create shift_year column + tb["shift_year"] = tb.groupby(["country", "reporting_level"])["year"].shift(1) + + # Create shift_welfare_type column + tb["shift_welfare_type"] = tb.groupby(["country", "reporting_level"])["welfare_type"].shift(1) + + # Calculate the difference between col and shift_col + tb["check_diff_column"] = tb[col] - tb["shift_col"] + + # Calculate the difference between years + tb["check_diff_year"] = tb["year"] - tb["shift_year"] + + # Calculate if the welfare type is the same + tb["check_diff_welfare_type"] = tb["welfare_type"] == tb["shift_welfare_type"] + + # Check if the difference is too high + mask = (abs(tb["check_diff_column"]) > 20) & (tb["check_diff_year"] <= 5) & ~tb["check_diff_welfare_type"] + tb_error = tb[mask].reset_index(drop=True).copy() + + if not tb_error.empty: + log.warning( + f"""There are {len(tb_error)} observations with abnormal jumps for {col}: + {tabulate(tb_error[['ppp_version', 'country', 'year', 'reporting_level', col, 'check_diff_column', 'check_diff_year']].sort_values('year'), headers = 'keys', tablefmt = TABLEFMT, floatfmt=".1f")}""" + ) + # tb = tb[~mask].reset_index(drop=True) + + # Drop the columns created for the check + tb = tb.drop( + columns=[ + "shift_col", + "shift_year", + "shift_welfare_type", + "check_diff_column", + "check_diff_year", + "check_diff_welfare_type", + ] + ) + + return tb diff --git a/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2011ppp.meta.yml b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2011ppp.meta.yml new file mode 100644 index 00000000000..c25d010b2fc --- /dev/null +++ b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2011ppp.meta.yml @@ -0,0 +1,4 @@ +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + title: World Bank Poverty and Inequality Platform (PIP) (2011 prices) diff --git a/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2011ppp.py b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2011ppp.py new file mode 100644 index 00000000000..92ee99a0383 --- /dev/null +++ b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2011ppp.py @@ -0,0 +1,34 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("world_bank_pip") + + # Read tables from garden dataset. + tb = ds_garden["income_consumption_2011"] + + # + # Process data. + # + # Drop reporting_level and welfare_type columns + tb = tb.drop(columns=["reporting_level", "welfare_type"]) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. 
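Returning briefly to `check_jumps_in_grapher_dataset` from the garden step above: its core is a groupby/shift comparison between consecutive observations of the same country and reporting level. A sketch with toy data (the welfare-type condition is omitted here for brevity):

```python
# Sketch of shift-based jump detection: flag changes of more than
# 20 percentage points within 5 years for the same country series.
import pandas as pd

tb = pd.DataFrame(
    {
        "country": ["A", "A", "A"],
        "reporting_level": ["national"] * 3,
        "year": [2000, 2002, 2003],
        "headcount_ratio_215": [10.0, 12.0, 45.0],  # big jump in 2003
    }
)

g = tb.groupby(["country", "reporting_level"])
diff_value = tb["headcount_ratio_215"] - g["headcount_ratio_215"].shift(1)
diff_year = tb["year"] - g["year"].shift(1)

print(tb[(diff_value.abs() > 20) & (diff_year <= 5)])  # flags the 2003 row
```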
+ ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2017ppp.meta.yml b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2017ppp.meta.yml new file mode 100644 index 00000000000..4afca360dd5 --- /dev/null +++ b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2017ppp.meta.yml @@ -0,0 +1,4 @@ +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + title: World Bank Poverty and Inequality Platform (PIP) (2017 prices) diff --git a/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2017ppp.py b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2017ppp.py new file mode 100644 index 00000000000..319cf11c36e --- /dev/null +++ b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2017ppp.py @@ -0,0 +1,34 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("world_bank_pip") + + # Read tables from garden dataset. + tb = ds_garden["income_consumption_2017"] + + # + # Process data. + # + # Drop reporting_level and welfare_type columns + tb = tb.drop(columns=["reporting_level", "welfare_type"]) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/meadow/wb/2024-03-27/world_bank_pip.py b/etl/steps/data/meadow/wb/2024-03-27/world_bank_pip.py new file mode 100644 index 00000000000..90c84c0726d --- /dev/null +++ b/etl/steps/data/meadow/wb/2024-03-27/world_bank_pip.py @@ -0,0 +1,51 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshots. + # For key indicators + snap = paths.load_snapshot("world_bank_pip.csv") + tb = snap.read() + + # For percentiles + snap_percentiles = paths.load_snapshot("world_bank_pip_percentiles.csv") + tb_percentiles = snap_percentiles.read() + + # + # Process data. + # + + # Make reporting_level and welfare_type strings + tb["reporting_level"] = tb["reporting_level"].astype(str) + tb["welfare_type"] = tb["welfare_type"].astype(str) + tb_percentiles["reporting_level"] = tb_percentiles["reporting_level"].astype(str) + tb_percentiles["welfare_type"] = tb_percentiles["welfare_type"].astype(str) + + # Set index and sort + tb = tb.set_index( + ["ppp_version", "poverty_line", "country", "year", "reporting_level", "welfare_type"], verify_integrity=True + ).sort_index() + + tb_percentiles = tb_percentiles.set_index( + ["ppp_version", "country", "year", "reporting_level", "welfare_type", "percentile"], + verify_integrity=True, + ).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. 
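One detail worth highlighting in the meadow step above: `verify_integrity=True` turns duplicated index keys into a hard error, which is how the step guards against duplicate (country, year, ...) rows. A tiny demonstration:

```python
# set_index(..., verify_integrity=True) raises on duplicated keys.
import pandas as pd

df = pd.DataFrame({"country": ["A", "A"], "year": [2000, 2000], "v": [1, 2]})

try:
    df.set_index(["country", "year"], verify_integrity=True)
except ValueError as e:
    print(f"Duplicate index rejected: {e}")
```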
+    ds_meadow = create_dataset(
+        dest_dir, tables=[tb, tb_percentiles], check_variables_metadata=True, default_metadata=snap.metadata
+    )
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
diff --git a/snapshots/wb/2024-03-27/pip_api.py b/snapshots/wb/2024-03-27/pip_api.py
new file mode 100644
index 00000000000..8b4d17d1c94
--- /dev/null
+++ b/snapshots/wb/2024-03-27/pip_api.py
@@ -0,0 +1,1573 @@
+"""
+DATA EXTRACTION FOR THE WORLD BANK POVERTY AND INEQUALITY PLATFORM (PIP) API
+
+This code generates key indicators and percentiles from the World Bank PIP API.
+This is done by combining the results of several queries to the API:
+    - A set of poverty lines (8) to obtain key indicators per PPP year (2011, 2017) and for countries and regions.
+    - 2298 poverty lines to construct percentiles for a group of countries.
+    - 5148 poverty lines to construct percentiles for all the regions.
+    - 8217 poverty lines to construct estimates of relative poverty.
+
+Percentiles are partially constructed because the data officially published by the World Bank is missing some countries and all the regions.
+
+To run this code from scratch,
+    - Connect to the staging server of this pull request:
+        - Hit Cmd + Shift + P and select Remote-SSH: Connect to Host
+        - Type in owid@staging-site-{pull_request_name}
+    - Delete the files in the cache folder:
+        rm -rf .cache/*
+    - Check if you need to update the poverty lines in the functions `poverty_lines_countries` and `poverty_lines_regions`. Run
+        - https://api.worldbank.org/pip/v1/pip?country=CHN&year=all&povline=80&fill_gaps=false&welfare_type=all&reporting_level=all&additional_ind=false&ppp_version=2017&identity=PROD&format=csv
+        - https://api.worldbank.org/pip/v1/pip-grp?country=OHI&year=all&povline=300&group_by=wb&welfare_type=all&reporting_level=all&additional_ind=false&ppp_version=2017&format=csv
+        - And see if any of the `headcount` values is lower than 0.99. If so, you need to add more poverty lines to the functions.
+    - Run the code. You have two options to see the output, in the terminal or in the background:
+        python snapshots/wb/{version}/pip_api.py
+        nohup poetry run python snapshots/wb/{version}/pip_api.py > output.log 2>&1 &
+
+When the code finishes, you will have the following files in the cache folder:
+    - world_bank_pip.csv: file with the results of the queries for key indicators (8 for countries and 8 for regions), plus some additional indicators (thresholds, relative poverty).
+    - pip_percentiles.csv: file with the percentiles taken from WB Databank and the ones constructed from the API.
+
+Copy these files to this folder and run in the terminal:
+    python snapshots/wb/{version}/world_bank_pip.py --path-to-file snapshots/wb/{version}/world_bank_pip.csv
+    python snapshots/wb/{version}/world_bank_pip_percentiles.py --path-to-file snapshots/wb/{version}/pip_percentiles.csv
+
+You can delete the files after this.
+
+"""
+
+
+import io
+import time
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+
+import click
+import numpy as np
+import pandas as pd
+import requests
+from botocore.exceptions import ClientError
+from joblib import Memory
+from structlog import get_logger
+from tenacity import retry
+from tenacity.stop import stop_after_attempt
+from tenacity.wait import wait_random_exponential
+
+from etl.files import checksum_str
+from etl.paths import CACHE_DIR
+from etl.publish import connect_s3_cached
+
+# Initialize logger.
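The update checklist in the docstring above (checking that `headcount` reaches 0.99 at the top poverty line) could be scripted. A hypothetical sketch, using the first URL from the checklist verbatim; the helper name and approach are assumptions, not part of this PR:

```python
# Hypothetical helper for the docstring's manual check: fetch the API
# response at the highest country poverty line and verify the reported
# headcount is effectively 1 (>= 0.99).
import io

import pandas as pd
import requests

URL = (
    "https://api.worldbank.org/pip/v1/pip?country=CHN&year=all&povline=80"
    "&fill_gaps=false&welfare_type=all&reporting_level=all"
    "&additional_ind=false&ppp_version=2017&identity=PROD&format=csv"
)


def top_line_is_high_enough(url: str = URL) -> bool:
    df = pd.read_csv(io.StringIO(requests.get(url, timeout=500).text))
    return bool((df["headcount"] >= 0.99).all())
```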
+log = get_logger() + +memory = Memory(CACHE_DIR, verbose=0) + +# Basic parameters to use in the functions +MAX_REPEATS = 15 +TIMEOUT = 500 +FILL_GAPS = "false" +# NOTE: Although the number of workers is set to MAX_WORKERS, the actual number of workers for regional queries is half of that, because the API (`pip-grp`) is less able to handle concurrent requests. +MAX_WORKERS = 2 +TOLERANCE_PERCENTILES = 1 + + +# Select live (1) or internal (0) API +LIVE_API = 1 + + +# Constants +def poverty_lines_countries(): + """ + These poverty lines are used to calculate percentiles for countries that are not in the percentile file. + # We only extract to $80 because the highest P99 not available is China, with $64.5 + # NOTE: In future updates, check if these poverty lines are enough for the extraction + """ + # Define poverty lines and their increase + + under_2_dollars = list(range(1, 200, 1)) + between_2_and_5_dollars = list(range(200, 500, 2)) + between_5_and_10_dollars = list(range(500, 1000, 5)) + between_10_and_20_dollars = list(range(1000, 2000, 10)) + between_20_and_30_dollars = list(range(2000, 3000, 10)) + between_30_and_55_dollars = list(range(3000, 5500, 10)) + between_55_and_80_dollars = list(range(5500, 8000, 10)) + between_80_and_100_dollars = list(range(8000, 10000, 10)) + between_100_and_150_dollars = list(range(10000, 15000, 10)) + + # povlines is all these lists together + povlines = ( + under_2_dollars + + between_2_and_5_dollars + + between_5_and_10_dollars + + between_10_and_20_dollars + + between_20_and_30_dollars + + between_30_and_55_dollars + + between_55_and_80_dollars + + between_80_and_100_dollars + + between_100_and_150_dollars + ) + + return povlines + + +def poverty_lines_regions(): + """ + These poverty lines are used to calculate percentiles for regions. None of them are in the percentile file. + # We only extract to $300 because the highest P99 not available is Other High Income Countries, with $280 + # NOTE: In future updates, check if these poverty lines are enough for the extraction + """ + # Define poverty lines and their increase + + under_2_dollars = list(range(1, 200, 1)) + between_2_and_5_dollars = list(range(200, 500, 2)) + between_5_and_10_dollars = list(range(500, 1000, 5)) + between_10_and_20_dollars = list(range(1000, 2000, 10)) + between_20_and_30_dollars = list(range(2000, 3000, 10)) + between_30_and_55_dollars = list(range(3000, 5500, 10)) + between_55_and_80_dollars = list(range(5500, 8000, 10)) + between_80_and_100_dollars = list(range(8000, 10000, 10)) + between_100_and_150_dollars = list(range(10000, 15000, 10)) + between_150_and_175_dollars = list(range(15000, 17500, 10)) + between_175_and_250_dollars = list(range(17500, 25000, 20)) + between_250_and_300_dollars = list(range(25000, 30000, 50)) + + # povlines is all these lists together + povlines = ( + under_2_dollars + + between_2_and_5_dollars + + between_5_and_10_dollars + + between_10_and_20_dollars + + between_20_and_30_dollars + + between_30_and_55_dollars + + between_55_and_80_dollars + + between_80_and_100_dollars + + between_100_and_150_dollars + + between_150_and_175_dollars + + between_175_and_250_dollars + + between_250_and_300_dollars + ) + + return povlines + + +# Define poverty lines for key indicators, depending on the PPP version. +# It includes the international poverty line, lower and upper-middle income lines, and some other lines. 
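A quick aside before the dictionary below: poverty lines throughout this script are stored as integer cents and divided by 100 at query time (see `pip_query_country` further down), so the 2017 entries correspond to these dollar values:

```python
# Illustrative only: cents-to-dollars conversion for the 2017 PPP
# key-indicator lines defined just below.
lines_2017_cents = [100, 215, 365, 685, 1000, 2000, 3000, 4000]
print([c / 100 for c in lines_2017_cents])
# [1.0, 2.15, 3.65, 6.85, 10.0, 20.0, 30.0, 40.0]
```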
+POVLINES_DICT = {
+    2011: [100, 190, 320, 550, 1000, 2000, 3000, 4000],
+    2017: [100, 215, 365, 685, 1000, 2000, 3000, 4000],
+}
+
+
+PPP_VERSIONS = [2011, 2017]
+POV_LINES_COUNTRIES = poverty_lines_countries()
+POV_LINES_REGIONS = poverty_lines_regions()
+
+# # DEBUGGING
+# PPP_VERSIONS = [2017]
+# POV_LINES_COUNTRIES = [1, 1000, 25000, 50000]
+# POV_LINES_REGIONS = [1, 1000, 25000, 50000]
+
+
+@click.command()
+@click.option(
+    "--live-api/--internal-api",
+    default=True,
+    type=bool,
+    help="Select live (1) or internal (0) API",
+)
+# @click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.")
+def run(live_api: bool) -> None:
+    if live_api:
+        wb_api = WB_API("https://api.worldbank.org/pip/v1")
+    else:
+        wb_api = WB_API("https://apiv2qa.worldbank.org/pip/v1")
+
+    # Generate percentiles by extracting the raw files and processing them afterward
+    df_percentiles = generate_consolidated_percentiles(generate_percentiles_raw(wb_api), wb_api)
+
+    # Generate relative poverty indicators file
+    df_relative = generate_relative_poverty(wb_api)
+
+    # Generate key indicators file and patch medians
+    df = generate_key_indicators(wb_api)
+    df = median_patch(df, country_or_region="country")
+
+    # Add relative poverty indicators and decile thresholds to the key indicators file
+    df = add_relative_poverty_and_decile_threholds(df, df_relative, df_percentiles)
+
+
+class WB_API:
+    def __init__(self, api_address, check_health=False):
+        self.api_address = api_address
+        self.check_health = check_health
+
+    def health_check(self):
+        return pd.read_json(f"{self.api_address}/health-check")[0][0]
+
+    def api_health(self):
+        """
+        Check if the API is running.
+        """
+        if not self.check_health:
+            return
+
+        # Initialize repeat counter
+        repeat = 0
+
+        # health comes from a json containing the status
+        health = self.health_check()
+
+        # If the status is different to "PIP API is running", repeat the request until MAX_REPEATS
+        while health != "PIP API is running" and repeat < MAX_REPEATS:
+            repeat += 1
+            health = self.health_check()
+
+        if repeat >= MAX_REPEATS:
+            # If the status is still not "PIP API is running" after MAX_REPEATS, raise an error
+            raise AssertionError(f"Health check: {health} (repeated {repeat} times)")
+
+    def versions(self):
+        return memory.cache(pd.read_csv)(f"{self.api_address}/versions?format=csv")
+
+    def get_table(self, table):
+        return pd.read_csv(f"{self.api_address}/aux?table={table}&long_format=false&format=csv")
+
+    def fetch_csv(self, url):
+        return _fetch_csv(f"{self.api_address}{url}")
+
+
+@retry(wait=wait_random_exponential(multiplier=1), stop=stop_after_attempt(MAX_REPEATS))
+def _get_request(url: str) -> requests.Response:
+    response = requests.get(url, timeout=TIMEOUT)
+    if response.status_code != 200:
+        log.info("fetch_csv.retry", url=url)
+        raise Exception("API timed out")
+
+    if b"Server Error" in response.content:
+        raise Exception("API returned server error")
+
+    return response
+
+
+@memory.cache
+def _fetch_csv(url: str) -> pd.DataFrame:
+    r2 = connect_s3_cached()
+    r2_bucket = "owid-private"
+    r2_key = "cache/pip_api/" + checksum_str(url)
+
+    # try to get it from cache
+    try:
+        obj = r2.get_object(Bucket=r2_bucket, Key=r2_key)
+        s = obj["Body"].read().decode("utf-8")
+        # we might have cached invalid responses, in that case fetch it again
+        if "Server Error" not in s:
+            df = pd.read_csv(io.StringIO(s))
+            log.info("fetch_csv.cache_hit", url=url)
+            return df
+        else:
+            log.info("fetch_csv.cache_with_error", url=url)
+    except ClientError:
+        pass
+
+    log.info("fetch_csv.start", 
url=url) + response = _get_request(url) + log.info("fetch_csv.success", url=url, t=response.elapsed.total_seconds()) + + # save the result to R2 cache + r2.put_object( + Body=response.content, + Bucket=r2_bucket, + Key=r2_key, + ) + + df = pd.read_csv(io.StringIO(response.content.decode("utf-8"))) + return df + + +@memory.cache +def _fetch_percentiles(version: int) -> pd.DataFrame: + # These URLs were copied from https://datacatalog.worldbank.org/search/dataset/0063646/_poverty_and_inequality_platform_pip_percentiles + if version == 2011: + url = "https://datacatalogfiles.worldbank.org/ddh-published/0063646/DR0090357/world_100bin.csv" + elif version == 2017: + url = "https://datacatalogfiles.worldbank.org/ddh-published/0063646/DR0090251/world_100bin.csv" + else: + raise ValueError(f"Version {version} is not supported") + return pd.read_csv(url) + + +############################################################################################################ +# FUNCTIONS + + +def pip_aux_tables(wb_api: WB_API, table="all"): + """ + Download aux tables if the API is running. + """ + + wb_api.api_health() + + if table == "all": + aux_tables_list = [ + "aux_versions", + "countries", + "country_coverage", + "country_list", + "cpi", + "decomposition", + "dictionary", + "framework", + "gdp", + "incgrp_coverage", + "indicators", + "interpolated_means", + "missing_data", + "national_poverty_lines", + "pce", + "pop", + "pop_region", + "poverty_lines", + "ppp", + "region_coverage", + "regions", + "spl", + "survey_means", + ] + # Create a list of dataframes + df_dict = {} + + # Download each table and append it to the list + for tab in aux_tables_list: + df = wb_api.get_table(tab) + + # Add table to df_dict + df_dict[tab] = df + + else: + df = wb_api.get_table(table) + + # Add table to df_dict + df_dict = {table: df} + + log.info(f'Auxiliary tables downloaded ("{table}")') + + return df_dict + + +def pip_versions(wb_api) -> dict: + """ + Download latest PIP data versions if the API is running. + """ + + wb_api.api_health() + + df = wb_api.versions() + df = df[["ppp_version", "release_version", "version"]] + + # Obtain the max release_version + max_release_version = df["release_version"].max() + + # Get the version for ppp_versions 2011 and 2017 + versions = df[df["release_version"] == max_release_version] + + # Set index and convert to dict + versions = versions.set_index("ppp_version", verify_integrity=True).sort_index().to_dict(orient="index") + + version_2011 = versions[2011]["version"] + version_2017 = versions[2017]["version"] + + log.info(f"PIP dataset versions extracted: 2011 = {version_2011}, 2017 = {version_2017}") + + return versions + + +def pip_query_country( + wb_api: WB_API, + popshare_or_povline, + value, + versions, + country_code="all", + year="all", + fill_gaps="true", + welfare_type="all", + reporting_level="all", + ppp_version=2017, + download="false", +) -> pd.DataFrame: + """ + Query country data from the PIP API. 
+ """ + + # Test health of the API + wb_api.api_health() + + # Round povline (popshare) to 2 decimals to work with cents as the minimum unit + value = round(value, 2) + + # Extract version and release_version from versions dict + version = versions[ppp_version]["version"] + release_version = versions[ppp_version]["release_version"] + + # Build query + df = wb_api.fetch_csv( + f"/pip?{popshare_or_povline}={value}&country={country_code}&year={year}&fill_gaps={fill_gaps}&welfare_type={welfare_type}&reporting_level={reporting_level}&ppp_version={ppp_version}&version={version}&release_version={release_version}&format=csv" + ) + + # Add PPP version as column + df["ppp_version"] = ppp_version + + # Replace names of columns and drop redundancies + df = df.rename(columns={"country_name": "country", "reporting_year": "year"}) + df = df.drop(columns=["region_name", "region_code"]) + + # Reorder columns: ppp_version, country, year, povline and the rest + first_columns = ["ppp_version", "country", "year", "poverty_line"] + df = df[first_columns + [column for column in df.columns if column not in first_columns]] + + if download == "true": + # make sure the directory exists. If not, create it + Path(f"{CACHE_DIR}/pip_country_data").mkdir(parents=True, exist_ok=True) + # Save to csv + df.to_csv( + f"{CACHE_DIR}/pip_country_data/pip_country_{country_code}_year_{year}_{popshare_or_povline}_{int(round(value*100))}_welfare_{welfare_type}_rep_{reporting_level}_fillgaps_{fill_gaps}_ppp_{ppp_version}.csv", + index=False, + ) + + if country_code == "all": + log.info(f"Country data extracted for {popshare_or_povline} = {value} ({ppp_version} PPPs)") + else: + log.info( + f"Country data extracted for {popshare_or_povline} = {value} ({ppp_version} PPPs) in {country_code} (year = {year})" + ) + + return df + + +def pip_query_region( + wb_api: WB_API, + popshare_or_povline, + value, + versions, + country_code="all", + year="all", + welfare_type="all", + reporting_level="all", + ppp_version=2017, + download="false", +) -> pd.DataFrame: + """ + Query regional data from the PIP API. + """ + + # Test health of the API + wb_api.api_health() + + # Round povline (popshare) to 2 decimals to work with cents as the minimum unit + value = round(value, 2) + + # Extract version and release_version from versions dict + version = versions[ppp_version]["version"] + release_version = versions[ppp_version]["release_version"] + + # Build query + df = wb_api.fetch_csv( + f"/pip-grp?{popshare_or_povline}={value}&country={country_code}&year={year}&welfare_type={welfare_type}&reporting_level={reporting_level}&ppp_version={ppp_version}&version={version}&release_version={release_version}&format=csv" + ) + + # Add PPP version as column + df["ppp_version"] = ppp_version + + # Replace names of columns and drop redundancies + df = df.rename(columns={"region_name": "country", "reporting_year": "year", "region_code": "country_code"}) + + # Reorder columns: ppp_version, country, year, povline and the rest + first_columns = ["ppp_version", "country", "year", "poverty_line"] + df = df[first_columns + [column for column in df.columns if column not in first_columns]] + + if download == "true": + # make sure the directory exists. 
If not, create it + Path(f"{CACHE_DIR}/pip_region_data").mkdir(parents=True, exist_ok=True) + # Save to csv + df.to_csv( + f"{CACHE_DIR}/pip_region_data/pip_region_{country_code}_year_{year}_{popshare_or_povline}_{int(round(value*100))}_ppp_{ppp_version}.csv", + index=False, + ) + + if country_code == "all": + log.info(f"Regional data extracted for {popshare_or_povline} = {value} ({ppp_version} PPPs)") + else: + log.info( + f"Regional data extracted for {popshare_or_povline} = {value} ({ppp_version} PPPs) in {country_code} (year = {year})" + ) + + return df + + +# GENERATE PERCENTILES FILES +# This is data not given directly by the query, but we can get it by querying a huge set of poverty lines and assign percentiles according to headcount ratio results. + + +def generate_percentiles_raw(wb_api: WB_API): + """ + Generates percentiles data from query results. This is the raw data to get the percentiles. + Uses concurrent.futures to speed up the process. + """ + start_time = time.time() + + def get_percentiles_data(povline, versions, ppp_version, country_code): + """ + Check if country percentiles data exists. If not, run the query. + """ + if Path( + f"{CACHE_DIR}/pip_country_data/pip_country_{country_code}_year_all_povline_{povline}_welfare_all_rep_all_fillgaps_{FILL_GAPS}_ppp_{ppp_version}.csv" + ).is_file(): + return + + else: + return pip_query_country( + wb_api, + popshare_or_povline="povline", + value=povline / 100, + versions=versions, + country_code=country_code, + year="all", + fill_gaps=FILL_GAPS, + welfare_type="all", + reporting_level="all", + ppp_version=ppp_version, + download="true", + ) + + def concurrent_percentiles_function(country_code): + """ + Executes get_percentiles_data concurrently. + """ + # Make sure the directory exists. If not, create it + Path(f"{CACHE_DIR}/pip_country_data").mkdir(parents=True, exist_ok=True) + + with ThreadPool(MAX_WORKERS) as pool: + tasks = [ + (povline, versions, ppp_version, country_code) + for ppp_version in PPP_VERSIONS + for povline in POV_LINES_COUNTRIES + ] + pool.starmap(get_percentiles_data, tasks) + + def get_percentiles_data_region(povline, versions, ppp_version): + """ + Check if region percentiles data exists. If not, run the query. + """ + if Path( + f"{CACHE_DIR}/pip_region_data/pip_region_all_year_all_povline_{povline}_ppp_{ppp_version}.csv" + ).is_file(): + return + else: + return pip_query_region( + wb_api, + popshare_or_povline="povline", + value=povline / 100, + versions=versions, + country_code="all", + year="all", + welfare_type="all", + reporting_level="all", + ppp_version=ppp_version, + download="true", + ) + + def concurrent_percentiles_region_function(): + """ + Executes get_percentiles_data_region concurrently. + """ + # Make sure the directory exists. If not, create it + Path(f"{CACHE_DIR}/pip_region_data").mkdir(parents=True, exist_ok=True) + with ThreadPool(MAX_WORKERS) as pool: + tasks = [(povline, versions, ppp_version) for ppp_version in PPP_VERSIONS for povline in POV_LINES_REGIONS] + pool.starmap(get_percentiles_data_region, tasks) + + def get_query_country(povline, ppp_version, country_code): + """ + Here I check if the country file exists even after the original extraction. If it does, I read it. If not, I start the queries again. 
+ """ + file_path_country = f"{CACHE_DIR}/pip_country_data/pip_country_{country_code}_year_all_povline_{povline}_welfare_all_rep_all_fillgaps_{FILL_GAPS}_ppp_{ppp_version}.csv" + if Path(file_path_country).is_file(): + df_query_country = pd.read_csv(file_path_country) + else: + # Run the main function to get the data + log.warning( + f"We need to come back to the extraction! countries = {country_code}, {povline}, {ppp_version} PPPs)" + ) + get_percentiles_data(povline, versions, ppp_version, country_code) + df_query_country = pd.read_csv(file_path_country) + + return df_query_country + + def get_query_region(povline, ppp_version): + """ + Here I check if the regional file exists even after the original extraction. If it does, I read it. If not, I start the queries again. + """ + file_path_region = ( + f"{CACHE_DIR}/pip_region_data/pip_region_all_year_all_povline_{povline}_ppp_{ppp_version}.csv" + ) + if Path(file_path_region).is_file(): + df_query_region = pd.read_csv(file_path_region) + else: + # Run the main function to get the data + log.warning(f"We need to come back to the extraction! regions, {povline}, {ppp_version} PPPs)") + get_percentiles_data_region(povline, versions, ppp_version) + df_query_region = pd.read_csv(file_path_region) + + return df_query_region + + def get_list_of_missing_countries(): + """ + Compare the list of countries in a common query (reference file) and the list of countries in the percentile file. + It generates missing_countries, which is a string with all the elements of the list, in the format for querying multiple countries in the API. + And also missing_countries_list, which is a list of the countries. + """ + # Obtain the percentile files the World Bank publishes in their Databank + + df_percentiles_published_2017 = _fetch_percentiles(2017) + + # FOR COUNTRIES + # Get data from the most common query + df_reference = pip_query_country( + wb_api, + popshare_or_povline="povline", + value=2.15, + versions=versions, + country_code="all", + year="all", + fill_gaps=FILL_GAPS, + welfare_type="all", + reporting_level="all", + ppp_version=2017, + ) + + # Edit percentile file to get the list of different countries + df_percentiles_pub = df_percentiles_published_2017.copy() + df_percentiles_pub = df_percentiles_pub.drop( + columns=["percentile", "avg_welfare", "pop_share", "welfare_share", "quantile"] + ).drop_duplicates() + + # Merge the two files + df_merge = pd.merge( + df_reference, + df_percentiles_pub, + on=["country_code", "year", "reporting_level", "welfare_type"], + how="outer", + indicator=True, + ) + + # Obtain the list of countries that are in the reference file but not in the percentile file + list_missing_countries = df_merge.loc[df_merge["_merge"] == "left_only", "country_code"].unique().tolist() + + # Generate a string with all the elements of the list, in the format for querying multiple countries in the API + missing_countries = "&country=".join(list_missing_countries) + + return missing_countries, list_missing_countries + + # Obtain latest versions of the PIP dataset + versions = pip_versions(wb_api) + + # Run the main function + missing_countries, list_missing_countries = get_list_of_missing_countries() + log.info( + f"These countries are available in a common query but not in the percentile file: {list_missing_countries}" + ) + + concurrent_percentiles_function(country_code=missing_countries) + log.info("Country files downloaded") + concurrent_percentiles_region_function() + log.info("Region files downloaded") + + log.info("Now we are 
+
+    with ThreadPool(MAX_WORKERS) as pool:
+        tasks = [
+            (povline, ppp_version, missing_countries)
+            for ppp_version in PPP_VERSIONS
+            for povline in POV_LINES_COUNTRIES
+        ]
+        dfs = pool.starmap(get_query_country, tasks)
+
+    df_country = pd.concat(dfs, ignore_index=True)
+    log.info("Country files concatenated")
+
+    with ThreadPool(MAX_WORKERS) as pool:
+        tasks = [(povline, ppp_version) for ppp_version in PPP_VERSIONS for povline in POV_LINES_REGIONS]
+        dfs = pool.starmap(get_query_region, tasks)
+
+    df_region = pd.concat(dfs, ignore_index=True)
+    log.info("Region files concatenated")
+
+    # Create poverty_line_cents column, multiplying by 100, rounding and making it an integer
+    df_country["poverty_line_cents"] = round(df_country["poverty_line"] * 100).astype(int)
+    df_region["poverty_line_cents"] = round(df_region["poverty_line"] * 100).astype(int)
+
+    log.info("Checking if all the poverty lines are in the concatenated files")
+
+    # Check that all the poverty lines are present in the country and region dataframes
+    assert set(df_country["poverty_line_cents"].unique()) == set(POV_LINES_COUNTRIES), (
+        "Not all poverty lines are in the country file!"
+    )
+    assert set(df_region["poverty_line_cents"].unique()) == set(POV_LINES_REGIONS), (
+        "Not all poverty lines are in the region file!"
+    )
+
+    # Drop poverty_line_cents column
+    df_country = df_country.drop(columns=["poverty_line_cents"])
+    df_region = df_region.drop(columns=["poverty_line_cents"])
+
+    log.info("Checking if the set of countries and regions is the same as in PIP")
+
+    # Check that the set of countries in the df matches the list of missing countries
+    assert set(df_country["country_code"].unique()) == set(list_missing_countries), (
+        f"List of countries is different from the one we needed to extract! ({list_missing_countries})"
+    )
+
+    # Check that the set of regions in the df matches the aux table (list of regions)
+    aux_dict = pip_aux_tables(wb_api, table="regions")
+    assert set(df_region["country"].unique()) == set(aux_dict["regions"]["region"].unique()), (
+        "List of regions is not the same as the one defined in PIP!"
+    )
+
+    log.info("Concatenating the raw percentile data for countries and regions")
+
+    # Concatenate df_country and df_region
+    df = pd.concat([df_country, df_region], ignore_index=True)
+
+    end_time = time.time()
+    elapsed_time = round(end_time - start_time, 2)
+    log.info(
+        f"Concatenation of raw percentile data for countries and regions completed. Execution time: {elapsed_time} seconds"
+    )
+
+    return df
+
+
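+# For each target percentile p, the closest available poverty line is the one
+# minimizing |headcount * 100 - p| within each distribution. As an illustrative
+# example with made-up numbers: if the queried lines 2.10, 2.15 and 2.20 return
+# headcount ratios of 49.1%, 49.9% and 50.6%, the 50th percentile (the median)
+# gets the threshold 2.15, with distance_to_p = 0.1.
+def calculate_percentile(p, df):
+    """
+    Calculates a single percentile and returns a DataFrame with the results.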
+ """ + df["distance_to_p"] = abs(df["headcount"] * 100 - p) + df_closest = ( + df.sort_values("distance_to_p") + .groupby( + ["ppp_version", "country", "year", "reporting_level", "welfare_type"], + as_index=False, + sort=False, + dropna=False, # This is to avoid dropping rows with NaNs (reporting_level and welfare_type for regions) + ) + .first() + ) + df_closest["target_percentile"] = p + df_closest = df_closest[ + [ + "ppp_version", + "country", + "year", + "reporting_level", + "welfare_type", + "target_percentile", + "poverty_line", + "headcount", + "distance_to_p", + ] + ] + log.info(f"Percentile {p}: calculated") + return df_closest + + +def format_official_percentiles(year, wb_api: WB_API): + """ + Download percentiles from the World Bank Databank and format them to the same format as the constructed percentiles + """ + # Load percentile files from the World Bank Databank + df_percentiles_published = _fetch_percentiles(year) + + # Obtain country names from the aux table + aux_dict = pip_aux_tables(wb_api, table="countries") + df_countries = aux_dict["countries"] + + # Merge the two files to get country names + df_percentiles_published = pd.merge( + df_percentiles_published, + df_countries[["country_code", "country_name"]], + on="country_code", + how="left", + ) + + # Rename columns + df_percentiles_published = df_percentiles_published.rename( + columns={ + "country_name": "country", + "percentile": "target_percentile", + "avg_welfare": "avg", + "welfare_share": "share", + "quantile": "thr", + } + ) + + # Drop pop_share + df_percentiles_published = df_percentiles_published.drop(columns=["pop_share"]) + + # Make thr null if target_percentile is 100 + df_percentiles_published.loc[df_percentiles_published["target_percentile"] == 100, "thr"] = np.nan + + # Add ppp_version column + df_percentiles_published["ppp_version"] = year + + return df_percentiles_published + + +def generate_consolidated_percentiles(df, wb_api: WB_API): + """ + Generates percentiles from the raw data. This is the final file with percentiles. + """ + start_time = time.time() + + path_file_percentiles = f"{CACHE_DIR}/pip_percentiles_before_checks.csv" + + if Path(path_file_percentiles).is_file(): + log.info("Percentiles file already exists. No need to consolidate.") + df_percentiles = pd.read_csv(path_file_percentiles) + + else: + log.info("Consolidating percentiles") + + # Define percentiles, from 1 to 99 + percentiles = range(1, 100, 1) + df_percentiles = pd.DataFrame() + + # Estimate percentiles + dfs = [calculate_percentile(p, df) for p in percentiles] + + df_percentiles = pd.concat(dfs, ignore_index=True) + + log.info("Percentiles calculated and consolidated") + + # Rename headcount to estimated_percentile and poverty_line to thr + df_percentiles = df_percentiles.rename(columns={"headcount": "estimated_percentile", "poverty_line": "thr"}) # type: ignore + + # Add official percentiles from the World Bank Databank + df_percentiles_published_2011 = format_official_percentiles(2011, wb_api) + df_percentiles_published_2017 = format_official_percentiles(2017, wb_api) + + df_percentiles = pd.concat( + [df_percentiles, df_percentiles_published_2011, df_percentiles_published_2017], ignore_index=True + ) + + # Drop duplicates. 
Keep the second one (the official one) + df_percentiles = df_percentiles.drop_duplicates( + subset=["ppp_version", "country", "year", "reporting_level", "welfare_type", "target_percentile"], + keep="last", + ) + + # Sort by ppp_version, country, year, reporting_level, welfare_type and target_percentile + df_percentiles = df_percentiles.sort_values( + by=["ppp_version", "country", "year", "reporting_level", "welfare_type", "target_percentile"] + ) + + # Save to csv + df_percentiles.to_csv(f"{CACHE_DIR}/pip_percentiles_before_checks.csv", index=False) + + # SANITY CHECKS + df_percentiles = sanity_checks(df_percentiles) + + # Drop distance_to_p, estimated_percentile, country_code + df_percentiles = df_percentiles.drop(columns=["distance_to_p", "estimated_percentile", "country_code"]) + + # Rename target_percentile to percentile + df_percentiles = df_percentiles.rename(columns={"target_percentile": "percentile"}) + + # Save to csv + df_percentiles.to_csv(f"{CACHE_DIR}/pip_percentiles.csv", index=False) + + end_time = time.time() + elapsed_time = round(end_time - start_time, 2) + log.info(f"Percentiles calculated and checked. Execution time: {elapsed_time} seconds") + + return df_percentiles + + +def sanity_checks(df_percentiles): + """ + Run different sanity checks to the percentiles file. + """ + log.info("Starting sanity checks") + + # Count number of rows before checks + rows_before = len(df_percentiles) + + # Consecutive percentiles (1, 2, 3, etc) + # Create a column called check that is True if target_percentile is consecutive for each ppp_version, country, year, reporting_level, and welfare_type + df_percentiles["check"] = ( + df_percentiles.groupby(["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False)[ + "target_percentile" + ].diff() + == 1 + ) + + # Replace check with True if target_percentile is 1 + df_percentiles.loc[df_percentiles["target_percentile"] == 1, "check"] = True + + # Assign the boolean value to the entire group + df_percentiles["check"] = df_percentiles.groupby( + ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False + )["check"].transform("all") + + # Define mask + mask = ~df_percentiles["check"] + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""Percentiles are not consecutive! These distributions will not be used: + {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}""" + ) + # Drop faulty distributions + df_percentiles = df_percentiles[~mask].reset_index(drop=True) + + ############################################################################################################ + # Distance_to_p is higher than TOLERANCE_PERCENTILES + df_percentiles["check"] = df_percentiles["distance_to_p"] > TOLERANCE_PERCENTILES + + # Assign the boolean value to the entire group + df_percentiles["check"] = df_percentiles.groupby( + ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False + )["check"].transform("any") + + # Define mask + mask = df_percentiles["check"] + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""Percentiles are not accurate! 
These distributions will not be used: + {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}""" + ) + # Drop faulty distributions + df_percentiles = df_percentiles[~mask].reset_index(drop=True) + + ############################################################################################################ + # Nulls for thr, avg and share for the entire group of ppp_version, country, year, reporting_level, and welfare_type + df_percentiles["check_thr"] = df_percentiles.groupby( + ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False + )["thr"].transform(lambda x: x.isnull().all()) + df_percentiles["check_avg"] = df_percentiles.groupby( + ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False + )["avg"].transform(lambda x: x.isnull().all()) + df_percentiles["check_share"] = df_percentiles.groupby( + ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False + )["share"].transform(lambda x: x.isnull().all()) + + df_percentiles["check"] = df_percentiles["check_thr"] & df_percentiles["check_avg"] & df_percentiles["check_share"] + + # Define mask + mask = df_percentiles["check"] + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""There are null values for thr, avg and share! These distributions need to be corrected: + {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}""" + ) + # Drop distributions with null values for thr, avg and share + df_percentiles = df_percentiles[~mask].reset_index(drop=True) + + ############################################################################################################ + # Find negative values for thr + df_percentiles["check"] = df_percentiles["thr"] < 0 + + # Define mask + mask = df_percentiles["check"] + + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""There are negative values for thr! These distributions need to be corrected: + {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}""" + ) + # Correct cases where thr, avg and share are negative, by assigning 0 + df_percentiles.loc[mask, "thr"] = 0 + + ############################################################################################################ + # Find negative values for avg + df_percentiles["check"] = df_percentiles["avg"] < 0 + + # Define mask + mask = df_percentiles["check"] + + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""There are negative values for avg! These distributions need to be corrected: + {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}""" + ) + # Correct cases where thr, avg and share are negative, by assigning 0 + df_percentiles.loc[mask, "avg"] = 0 + + ############################################################################################################ + # Find negative values for share + df_percentiles["check"] = df_percentiles["share"] < 0 + + # Define mask + mask = df_percentiles["check"] + + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""There are negative values for share! 
These distributions need to be corrected:
+            {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}"""
+        )
+        # Correct cases where share is negative, by assigning 0
+        df_percentiles.loc[mask, "share"] = 0
+
+    ############################################################################################################
+    # thr is increasing for each ppp_version, country, year, reporting_level, and welfare_type
+    df_percentiles["check"] = (
+        df_percentiles.groupby(["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False)[
+            "thr"
+        ]
+        .diff()
+        .round(2)
+        >= 0
+    )
+
+    # Replace check with True if thr is NaN
+    df_percentiles.loc[df_percentiles["thr"].isna(), "check"] = True
+
+    # Replace check with True if target_percentile is 1
+    df_percentiles.loc[(df_percentiles["target_percentile"] == 1), "check"] = True
+
+    # Define mask
+    mask = ~df_percentiles["check"]
+    df_error = df_percentiles[mask].reset_index(drop=True).copy()
+
+    if len(df_error) > 0:
+        log.warning(
+            f"""Thresholds are not increasing! These distributions need to be corrected:
+            {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}"""
+        )
+        # Correct cases where thr is not increasing, by carrying the running maximum forward within each distribution (i.e. repeating the previous threshold)
+        df_percentiles["thr"] = df_percentiles.groupby(
+            ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False
+        )["thr"].cummax()
+
+    ############################################################################################################
+    # avg is increasing for each ppp_version, country, year, reporting_level, and welfare_type
+    df_percentiles["check"] = (
+        df_percentiles.groupby(["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False)[
+            "avg"
+        ]
+        .diff()
+        .round(2)
+        >= 0
+    )
+
+    # Replace check with True if avg is NaN
+    df_percentiles.loc[df_percentiles["avg"].isna(), "check"] = True
+
+    # Replace check with True if target_percentile is 1
+    df_percentiles.loc[(df_percentiles["target_percentile"] == 1), "check"] = True
+
+    # Define mask
+    mask = ~df_percentiles["check"]
+    df_error = df_percentiles[mask].reset_index(drop=True).copy()
+
+    if len(df_error) > 0:
+        log.warning(
+            f"""Averages are not increasing! These distributions need to be corrected:
+            {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}"""
+        )
+        # Correct cases where avg is not increasing, by carrying the running maximum forward within each distribution (i.e. repeating the previous average)
+        df_percentiles["avg"] = df_percentiles.groupby(
+            ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False
+        )["avg"].cummax()
+
+    ############################################################################################################
+    # Check that avg lies between consecutive thresholds
+    # Create thr_lower, which is the threshold for the previous percentile
+    df_percentiles["thr_lower"] = df_percentiles.groupby(
+        ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False
+    )["thr"].shift(1)
+    df_percentiles["check"] = (round(df_percentiles["avg"] - df_percentiles["thr_lower"], 2) >= 0) & (
+        round(df_percentiles["thr"] - df_percentiles["avg"], 2) >= 0
+    )
+
+    # Assign True if target_percentile is 1
+    df_percentiles.loc[df_percentiles["target_percentile"] == 1, "check"] = True
+
+    # Assign True if target_percentile is 100 and avg is greater than thr_lower
+    df_percentiles.loc[
+        (df_percentiles["target_percentile"] == 100)
+        & (round(df_percentiles["avg"] - df_percentiles["thr_lower"], 2) >= 0),
+        "check",
+    ] = True
+
+    # Assign True if avg is null
+    df_percentiles.loc[df_percentiles["avg"].isnull(), "check"] = True
+
+    # Assign the boolean value to the entire group
+    df_percentiles["check"] = df_percentiles.groupby(
+        ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False
+    )["check"].transform("all")
+
+    # Define mask
+    mask = ~df_percentiles["check"]
+    df_error = df_percentiles[mask].reset_index(drop=True).copy()
+
+    if len(df_error) > 0:
+        log.warning(
+            f"""Averages are not between thresholds! These distributions need to be corrected:
+            {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}"""
+        )
+        # Correct cases where avg is not between thresholds, by averaging the two thresholds
+        df_percentiles.loc[mask, "avg"] = (df_percentiles.loc[mask, "thr_lower"] + df_percentiles.loc[mask, "thr"]) / 2
+
+    # Drop check columns
+    df_percentiles = df_percentiles.drop(columns=["check", "check_thr", "check_avg", "check_share", "thr_lower"])
+
+    # Count number of rows after checks
+    rows_after = len(df_percentiles)
+
+    log.info(f"Percentiles file generated. {rows_before - rows_after} rows have been deleted.")
+
+    return df_percentiles
+
+
+# GENERATE RELATIVE POVERTY INDICATORS FILE
+# This is data not given directly by the query, but we can get it by calculating 40, 50 and 60% of the median and querying those values as poverty lines
+# NOTE: Medians need to be patched first in order to get data for all country-years (there are several missing values)
+
+
+def generate_relative_poverty(wb_api: WB_API):
+    """
+    Generates relative poverty indicators from query results. Uses concurrent.futures to speed up the process.
+    """
+    start_time = time.time()
+
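+    # Relative poverty lines are country-specific: 40%, 50% and 60% of each
+    # country-year's (patched) median. As an illustrative example with made-up
+    # numbers, a country-year with a median of $10 a day gets relative poverty
+    # lines of $4, $5 and $6 a day, and the API is queried for the share of the
+    # population below each of those lines.
+    def get_relative_data(df_row, pct, versions):
+        """
+        This function is structured in a way to make it work with concurrent.futures.
+        It checks if the country file related to the row exists. If not, it runs the query.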
+ """ + if ~np.isnan(df_row["median"]): + if Path( + f"{CACHE_DIR}/pip_country_data/pip_country_{df_row['country_code']}_year_{df_row['year']}_povline_{int(round(df_row['median'] * pct))}_welfare_{df_row['welfare_type']}_rep_{df_row['reporting_level']}_fillgaps_{FILL_GAPS}_ppp_2017.csv" + ).is_file(): + return + else: + return pip_query_country( + wb_api, + popshare_or_povline="povline", + value=df_row["median"] * pct / 100, + versions=versions, + country_code=df_row["country_code"], + year=df_row["year"], + fill_gaps=FILL_GAPS, + welfare_type=df_row["welfare_type"], + reporting_level=df_row["reporting_level"], + ppp_version=2017, + download="true", + ) + + def concurrent_relative_function(df): + """ + This is the main function to make concurrency work for country data. + """ + # Make sure the directory exists. If not, create it + Path(f"{CACHE_DIR}/pip_country_data").mkdir(parents=True, exist_ok=True) + with ThreadPool(MAX_WORKERS) as pool: + tasks = [(df.iloc[i], pct, versions) for pct in [40, 50, 60] for i in range(len(df))] + pool.starmap(get_relative_data, tasks) + + def get_relative_data_region(df_row, pct, versions): + """ + This function is structured in a way to make it work with concurrent.futures. + It checks if the regional file related to the row exists. If not, it runs the query. + """ + if ~np.isnan(df_row["median"]): + if Path( + f"{CACHE_DIR}/pip_region_data/pip_region_{df_row['country_code']}_year_{df_row['year']}_povline_{int(round(df_row['median']*pct))}_ppp_2017.csv" + ).is_file(): + return + else: + return pip_query_region( + wb_api, + popshare_or_povline="povline", + value=df_row["median"] * pct / 100, + versions=versions, + country_code=df_row["country_code"], + year=df_row["year"], + welfare_type="all", + reporting_level="all", + ppp_version=2017, + download="true", + ) + + def concurrent_relative_region_function(df): + """ + This is the main function to make concurrency work for regional data. + """ + # Make sure the directory exists. If not, create it + Path(f"{CACHE_DIR}/pip_region_data").mkdir(parents=True, exist_ok=True) + with ThreadPool(int(round(MAX_WORKERS / 2))) as pool: + tasks = [(df.iloc[i], pct, versions) for pct in [40, 50, 60] for i in range(len(df))] + pool.starmap(get_relative_data_region, tasks) + + def add_relative_indicators(df, country_or_region): + """ + Integrates the relative indicators to the df. + """ + for pct in [40, 50, 60]: + # Initialize lists + headcount_ratio_list = [] + pgi_list = [] + pov_severity_list = [] + watts_list = [] + for i in range(len(df)): + if ~np.isnan(df["median"].iloc[i]): + if country_or_region == "country": + # Here I check if the file exists even after the original extraction. If it does, I read it. If not, I start the queries again. + file_path = f"{CACHE_DIR}/pip_country_data/pip_country_{df.iloc[i]['country_code']}_year_{df.iloc[i]['year']}_povline_{int(round(df.iloc[i]['median']*pct))}_welfare_{df.iloc[i]['welfare_type']}_rep_{df.iloc[i]['reporting_level']}_fillgaps_{FILL_GAPS}_ppp_2017.csv" + if Path(file_path).is_file(): + results = pd.read_csv(file_path) + else: + # Run the main function to get the data + get_relative_data(df.iloc[i], pct, versions) + results = pd.read_csv(file_path) + + elif country_or_region == "region": + # Here I check if the file exists even after the original extraction. If it does, I read it. If not, I start the queries again. 
+ file_path = f"{CACHE_DIR}/pip_region_data/pip_region_{df.iloc[i]['country_code']}_year_{df.iloc[i]['year']}_povline_{int(round(df.iloc[i]['median']*pct))}_ppp_2017.csv" + if Path(file_path).is_file(): + results = pd.read_csv(file_path) + else: + # Run the main function to get the data + get_relative_data_region(df.iloc[i], pct, versions) + results = pd.read_csv(file_path) + else: + raise ValueError("country_or_region must be 'country' or 'region'") + + headcount_ratio_value = results["headcount"].iloc[0] + headcount_ratio_list.append(headcount_ratio_value) + + pgi_value = results["poverty_gap"].iloc[0] + pgi_list.append(pgi_value) + + pov_severity_value = results["poverty_severity"].iloc[0] + pov_severity_list.append(pov_severity_value) + + watts_value = results["watts"].iloc[0] + watts_list.append(watts_value) + + else: + headcount_ratio_list.append(np.nan) + pgi_list.append(np.nan) + pov_severity_list.append(np.nan) + watts_list.append(np.nan) + + # Add the lists as columns to the df + df[f"headcount_ratio_{pct}_median"] = headcount_ratio_list + df[f"poverty_gap_index_{pct}_median"] = pgi_list + df[f"poverty_severity_{pct}_median"] = pov_severity_list + df[f"watts_{pct}_median"] = watts_list + + return df + + # Obtain versions + versions = pip_versions(wb_api) + + # FOR COUNTRIES + # Get data from the most common query + df_country = pip_query_country( + wb_api, + popshare_or_povline="povline", + value=2.15, + versions=versions, + country_code="all", + year="all", + fill_gaps=FILL_GAPS, + welfare_type="all", + reporting_level="all", + ppp_version=2017, + ) + + # Patch medians + df_country = median_patch(df_country, country_or_region="country") + + # Run the main function to get the data + concurrent_relative_function(df_country) + + # Add relative indicators from the results above + df_country = add_relative_indicators(df=df_country, country_or_region="country") + + # FOR REGIONS + # Get data from the most common query + df_region = pip_query_region( + wb_api, + popshare_or_povline="povline", + value=2.15, + versions=versions, + country_code="all", + year="all", + welfare_type="all", + reporting_level="all", + ppp_version=2017, + ) + + # Patch medians + df_region = median_patch(df_region, country_or_region="region") + + # Run the main function to get the data + concurrent_relative_region_function(df_region) + + # Add relative indicators from the results above + df_region = add_relative_indicators(df=df_region, country_or_region="region") + + # Concatenate df_country and df_region + df = pd.concat([df_country, df_region], ignore_index=True) + + # Save to csv + df.to_csv(f"{CACHE_DIR}/pip_relative.csv", index=False) + + end_time = time.time() + elapsed_time = round(end_time - start_time, 2) + log.info(f"Relative poverty indicators calculated. Execution time: {elapsed_time} seconds") + + return df + + +# GENERATE MAIN INDICATORS FILE + + +def generate_key_indicators(wb_api: WB_API): + """ + Generate the main indicators file, from a set of poverty lines and PPP versions. Uses concurrent.futures to speed up the process. + """ + start_time = time.time() + + def get_country_data(povline, ppp_version, versions): + """ + This function is defined inside the main function because it needs to be called by concurrent.futures. + For country data. 
+ """ + return pip_query_country( + wb_api, + popshare_or_povline="povline", + value=povline / 100, + versions=versions, + country_code="all", + year="all", + fill_gaps=FILL_GAPS, + welfare_type="all", + reporting_level="all", + ppp_version=ppp_version, + download="false", + ) + + def get_region_data(povline, ppp_version, versions): + """ + This function is defined inside the main function because it needs to be called by concurrent.futures. + For regional data. + """ + return pip_query_region( + wb_api, + popshare_or_povline="povline", + value=povline / 100, + versions=versions, + country_code="all", + year="all", + welfare_type="all", + reporting_level="all", + ppp_version=ppp_version, + download="false", + ) + + def concurrent_function(): + """ + This function makes concurrency work for country data. + """ + with ThreadPool(MAX_WORKERS) as pool: + tasks = [ + (povline, ppp_version, versions) + for ppp_version, povlines in POVLINES_DICT.items() + for povline in povlines + ] + results = pool.starmap(get_country_data, tasks) + + # Concatenate list of dataframes + results = pd.concat(results, ignore_index=True) + + return results + + def concurrent_region_function(): + """ + This function makes concurrency work for regional data. + """ + with ThreadPool(int(round(MAX_WORKERS / 2))) as pool: + tasks = [ + (povline, ppp_version, versions) + for ppp_version, povlines in POVLINES_DICT.items() + for povline in povlines + ] + results = pool.starmap(get_region_data, tasks) + + # Concatenate list of dataframes + results = pd.concat(results, ignore_index=True) + + return results + + # Obtain latest versions of the PIP dataset + versions = pip_versions(wb_api) + + # Run the main function + results = concurrent_function() + results_region = concurrent_region_function() + + # If country is nan but country_code is TWN, replace country with Taiwan, China + results.loc[results["country"].isnull() & (results["country_code"] == "TWN"), "country"] = "Taiwan, China" + + # I check if the set of countries is the same in the df and in the aux table (list of countries) + aux_dict = pip_aux_tables(wb_api, table="countries") + assert set(results["country"].unique()) == set(aux_dict["countries"]["country_name"].unique()), log.fatal( + f"List of countries is not the same! Differences: {set(results['country'].unique()) - set(aux_dict['countries']['country_name'].unique())}" + ) + + # I check if the set of regions is the same in the df and in the aux table (list of regions) + aux_dict = pip_aux_tables(wb_api, table="regions") + assert set(results_region["country"].unique()) == set(aux_dict["regions"]["region"].unique()), log.fatal( + f"List of regions is not the same! Differences: {set(results_region['country'].unique()) - set(aux_dict['regions']['region'].unique())}" + ) + + # Concatenate df_country and df_region + df = pd.concat([results, results_region], ignore_index=True) + + # Sort ppp_version, country, year and poverty_line + df = df.sort_values(by=["ppp_version", "country", "year", "poverty_line"]) # type: ignore + + # Save to csv + df.to_csv(f"{CACHE_DIR}/pip_raw.csv", index=False) + + end_time = time.time() + elapsed_time = round(end_time - start_time, 2) + log.info(f"Key indicators calculated. Execution time: {elapsed_time} seconds") + + return df + + +def median_patch(df, country_or_region): + """ + Patch missing values in the median column. + PIP queries do not return all the medians, so they are patched with the results of the percentile file. 
+ """ + + # Read percentile file + df_percentiles = pd.read_csv(f"{CACHE_DIR}/pip_percentiles.csv") + + # In df_percentiles, keep only the rows with percentile = 50 + df_percentiles = df_percentiles[df_percentiles["percentile"] == 50].reset_index() + + # If I want to patch the median for regions, I need to drop reporting_level and welfare_type columns + if country_or_region == "country": + # Merge df and df_percentiles + df = pd.merge( + df, + df_percentiles[["ppp_version", "country", "year", "reporting_level", "welfare_type", "thr"]], + on=["ppp_version", "country", "year", "reporting_level", "welfare_type"], + how="left", + ) + + # Replace missing values in median with thr + df["median"] = df["median"].fillna(df["thr"]) + + # Drop thr column + df = df.drop(columns=["thr"]) + + elif country_or_region == "region": + # Merge df and df_percentiles + df = pd.merge( + df, + df_percentiles[["ppp_version", "country", "year", "thr"]], + on=["ppp_version", "country", "year"], + how="left", + ) + + # Rename thr to median + df = df.rename(columns={"thr": "median"}) + + else: + raise ValueError("country_or_region must be 'country' or 'region'") + + log.info("Medians patched!") + + return df + + +def add_relative_poverty_and_decile_threholds(df, df_relative, df_percentiles): + """ + Add relative poverty indicators and decile thresholds to the key indicators file. + """ + + # Add relative poverty indicators + # They don't change with the PPP version, so we can use the 2017 version I estimated before. + df = pd.merge( + df, + df_relative[ + [ + "country", + "year", + "reporting_level", + "welfare_type", + "headcount_ratio_40_median", + "poverty_gap_index_40_median", + "poverty_severity_40_median", + "watts_40_median", + "headcount_ratio_50_median", + "poverty_gap_index_50_median", + "poverty_severity_50_median", + "watts_50_median", + "headcount_ratio_60_median", + "poverty_gap_index_60_median", + "poverty_severity_60_median", + "watts_60_median", + ] + ], + on=["country", "year", "reporting_level", "welfare_type"], + how="left", + ) + + # In df_percentiles, keep only the rows with percentile = 10, 20, 30, ... 90 + df_percentiles = df_percentiles[ + (df_percentiles["percentile"] % 10 == 0) & (df_percentiles["percentile"] != 100) + ].reset_index() + + # Make tb_percentile wide, with percentile as columns + df_percentiles = df_percentiles.pivot( + index=["ppp_version", "country", "year", "reporting_level", "welfare_type"], + columns="percentile", + values="thr", + ) + + # Rename columns + df_percentiles.columns = ["decile" + str(int(round(col / 10))) + "_thr" for col in df_percentiles.columns] + + # Reset index + df_percentiles = df_percentiles.reset_index() + + # Merge df and df_percentiles + df = pd.merge( + df, + df_percentiles, + on=["ppp_version", "country", "year", "reporting_level", "welfare_type"], + how="left", + ) + + # Save key indicators file + df.to_csv(f"{CACHE_DIR}/world_bank_pip.csv", index=False) + + log.info("Relative poverty indicators and decile thresholds added. 
Key indicators file done :)") + + return df + + +if __name__ == "__main__": + run() diff --git a/snapshots/wb/2024-03-27/world_bank_pip.csv.dvc b/snapshots/wb/2024-03-27/world_bank_pip.csv.dvc new file mode 100644 index 00000000000..4e5434ee522 --- /dev/null +++ b/snapshots/wb/2024-03-27/world_bank_pip.csv.dvc @@ -0,0 +1,31 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: World Bank Poverty and Inequality Platform (PIP) + description: |- + The Poverty and Inequality Platform (PIP) is an interactive computational tool that offers users quick access to the World Bank’s estimates of poverty, inequality, and shared prosperity. PIP provides a comprehensive view of global, regional, and country-level trends for more than 160 economies around the world. + date_published: 2024-03-26 + version_producer: 20240326_2017, 20240326_2011 + title_snapshot: Key indicators + + # Citation + producer: World Bank Poverty and Inequality Platform + citation_full: |- + World Bank (2024). Poverty and Inequality Platform (version 20240326_2017 and 20240326_2011) [Data set]. World Bank Group. https://pip.worldbank.org/. Accessed March 27, 2024. + + # Files + url_main: https://pip.worldbank.org + date_accessed: 2024-03-27 + + # License + license: + name: CC0 + url: https://datacatalog.worldbank.org/search/dataset/0063646 + +wdir: ../../../data/snapshots/wb/2024-01-17 +outs: + - md5: 5fb032d2de430f79f25e1bdf1259c9bf + size: 35764784 + path: world_bank_pip.csv diff --git a/snapshots/wb/2024-03-27/world_bank_pip.py b/snapshots/wb/2024-03-27/world_bank_pip.py new file mode 100644 index 00000000000..c3db74c58bc --- /dev/null +++ b/snapshots/wb/2024-03-27/world_bank_pip.py @@ -0,0 +1,36 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"wb/{SNAPSHOT_VERSION}/world_bank_pip.csv") + + # Ensure destination folder exists. + snap.path.parent.mkdir(exist_ok=True, parents=True) + + # Copy local data file to snapshots data folder. + snap.path.write_bytes(Path(path_to_file).read_bytes()) + + # Add file to DVC and upload to S3. + snap.dvc_add(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc b/snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc new file mode 100644 index 00000000000..d7c1982d021 --- /dev/null +++ b/snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc @@ -0,0 +1,33 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: 'World Bank Poverty and Inequality Platform (PIP): Percentiles' + description: |- + The Poverty and Inequality Platform: Percentiles database reports 100 points ranked according to the consumption or income distributions for country-year survey data available in the World Bank’s Poverty and Inequality Platform (PIP). 
There are, as of March 26, 2024, a total of 2,367 country-survey-year data points, which include 2,201 distributions based on microdata or binned data, and 166 based on grouped data. For the grouped data, the percentiles are derived by fitting a parametric Lorenz distribution following Datt (1998). For ease of communication, all distributions are referred to as survey data henceforth, and the welfare variable is referred to as income. + + We modified the original files available in [World Bank's Databank](https://datacatalog.worldbank.org/search/dataset/0063646/_poverty_and_inequality_platform_pip_percentiles) to include distributions from missing countries and regions with data available in PIP's API. + date_published: 2024-04-08 + version_producer: Version 10 + + # Citation + producer: World Bank Poverty and Inequality Platform + citation_full: |- + - World Bank (2024). Poverty and Inequality Platform: percentiles [Data set]. World Bank Group. https://pip.worldbank.org/. Accessed 09 April 2024. + - World Bank (2024). Poverty and Inequality Platform (version 20240326_2017 and 20240326_2011) [Data set]. World Bank Group. https://pip.worldbank.org/. Accessed March 27, 2024. + + # Files + url_main: https://datacatalog.worldbank.org/search/dataset/0063646/_poverty_and_inequality_platform_pip_percentiles + date_accessed: 2024-04-09 + + # License + license: + name: CC0 + url: https://datacatalog.worldbank.org/search/dataset/0063646/_poverty_and_inequality_platform_pip_percentiles + +wdir: ../../../data/snapshots/wb/2024-01-17 +outs: + - md5: f5bb53372a6fd0f563d20d04b3c897c7 + size: 49972432 + path: world_bank_pip_percentiles.csv diff --git a/snapshots/wb/2024-03-27/world_bank_pip_percentiles.py b/snapshots/wb/2024-03-27/world_bank_pip_percentiles.py new file mode 100644 index 00000000000..17eb2bd88e3 --- /dev/null +++ b/snapshots/wb/2024-03-27/world_bank_pip_percentiles.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"wb/{SNAPSHOT_VERSION}/world_bank_pip_percentiles.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main()