Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse CSV output for all parameter values read outside read parameters method #1541

Merged
merged 7 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions src/tlo/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""This file contains helpful utility functions."""
import ast
import hashlib
from collections import defaultdict
from pathlib import Path
Expand Down Expand Up @@ -531,3 +532,40 @@ def clean_dataframe(dataframes_dict: dict[str, DataFrame]) -> None:
# return a dictionary if return_dict flag is set to True else return a dataframe
return all_data if return_dict else next(iter(all_data.values()))


def parse_csv_values_for_columns_with_mixed_datatypes(value: Any) -> Any:
    """Parse a single value read from a mixed-datatype CSV column to its best-fitting Python type.

    Pandas :py:func:`pandas.read_csv` function handles columns with mixed data types by defaulting to the object
    data type, which often results in values being interpreted as strings. The most common place for this in TLO is
    when we are reading parameters. This is not a problem when the parameters are read in read parameters method
    using load_parameters_from_dataframe method as parameter values are mapped to their defined datatypes.

    Problems arise when the output from the csv files is used directly, as it is within a few files in TLO. This
    function provides a fix by parsing the parameter values in those few places to their best possible data types.

    Conversions are attempted in this order: boolean (``'true'``/``'false'``, case-insensitive), integer, float,
    list literal. The first conversion that succeeds wins.

    :param value: mixed datatype column value
    :return: ``value`` unchanged if it is not a string; otherwise the parsed ``bool``, ``int``, ``float`` or
        ``list``, or the whitespace-stripped string if no other type fits
    """
    # non-string values have already been given a type by pandas, so leave them alone
    if not isinstance(value, str):
        return value

    value = value.strip()  # remove leading/trailing whitespace

    # Booleans need an explicit check: neither int() nor float() recognises 'true'/'false', so without it they
    # would fall through every conversion below and stay as strings.
    lowered = value.lower()
    if lowered in ('true', 'false'):
        return lowered == 'true'

    # try an integer first, then a float; both raise ValueError when the text does not fit
    for numeric_type in (int, float):
        try:
            return numeric_type(value)
        except ValueError:
            continue

    # Check if it's a list using `ast.literal_eval`. Besides ValueError and SyntaxError, literal_eval can raise
    # TypeError (e.g. '{[1]: 2}', an unhashable dict key), MemoryError and RecursionError on malformed input; in
    # every such case the value is simply not a list, so keep it as a string.
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return value

    # only lists are accepted; any other literal (tuple, dict, quoted string, ...) is returned as a string
    return parsed if isinstance(parsed, list) else value
27 changes: 26 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
from tlo import Date, Simulation
from tlo.analysis.utils import parse_log_file
from tlo.methods import demography
from tlo.util import DEFAULT_MOTHER_ID, convert_excel_files_to_csv, read_csv_files
from tlo.util import (
DEFAULT_MOTHER_ID,
convert_excel_files_to_csv,
parse_csv_values_for_columns_with_mixed_datatypes,
read_csv_files,
)

path_to_files = Path(os.path.dirname(__file__))

Expand Down Expand Up @@ -498,3 +503,23 @@ def check_logic_of_converting_excel_files_to_csv_files(folder: Path, files: list
# check behaviours are as expected. New folders containing csv files should be created with names resembling the
# Excel file they were created from
check_logic_of_converting_excel_files_to_csv_files(tmpdir_resourcefilepath, excel_files)

def test_parse_values_in_mixed_datatypes_columns():
    """Check that string values from a mixed-datatype column are parsed to the expected Python types.

    The dataframe built here resembles the output of read csv when it is presented with a column holding
    several datatypes: every entry arrives as a string.
    """
    raw_values = ['54', 'inc_malaria', '[1,2,3]', '0.2']
    # the type each raw value should have once parsed, in the same order as ``raw_values``
    exp_dtypes = [int, str, list, float]

    mixed_data_df = pd.DataFrame(data={'param_values': raw_values})

    # before parsing, every entry must be a plain string
    for raw in mixed_data_df['param_values']:
        assert isinstance(raw, str)

    # parse the column value by value and write the result back
    parsed_column = mixed_data_df['param_values'].apply(parse_csv_values_for_columns_with_mixed_datatypes)
    mixed_data_df['param_values'] = parsed_column

    # after parsing, each row must hold a value of the expected type
    for row_label, exp_dtype in zip(mixed_data_df.index, exp_dtypes):
        assert isinstance(mixed_data_df.loc[row_label, 'param_values'], exp_dtype)
Loading