Skip to content

Commit

Permalink
Parse CSV output for all parameter values read outside read parameter…
Browse files Browse the repository at this point in the history
…s method (#1541)
  • Loading branch information
mnjowe authored Dec 11, 2024
1 parent b673a4a commit d6fcffd
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 1 deletion.
38 changes: 38 additions & 0 deletions src/tlo/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""This file contains helpful utility functions."""
import ast
import hashlib
from collections import defaultdict
from pathlib import Path
Expand Down Expand Up @@ -531,3 +532,40 @@ def clean_dataframe(dataframes_dict: dict[str, DataFrame]) -> None:
# return a dictionary if return_dict flag is set to True else return a dataframe
return all_data if return_dict else next(iter(all_data.values()))


def parse_csv_values_for_columns_with_mixed_datatypes(value: Any) -> Any:
    """Parse a single value read from a mixed-datatype CSV column into its best-fitting Python type.

    Pandas :py:func:`pandas.read_csv` handles columns with mixed data types by defaulting to the object data type,
    which often results in values being interpreted as strings. The most common place for this in TLO is when
    parameters are read. This is not a problem when parameters are read in the read parameters method using
    load_parameters_from_dataframe, as parameter values are mapped to their defined datatypes there. Problems arise
    when the output from the csv files is used directly. This function provides a fix for those places by parsing
    each value to its best possible data type.

    Conversion is attempted in this order: boolean (``'true'``/``'false'``, case-insensitive), integer, float,
    list literal. Non-string inputs are returned unchanged.

    :param value: mixed datatype column value
    :return: the value converted to ``bool``, ``int``, ``float`` or ``list`` when it can be parsed as one of these;
        otherwise the whitespace-stripped string (or the original object if it was not a string)
    """
    # Anything that is not a string has already been typed by pandas; leave it alone
    if not isinstance(value, str):
        return value

    value = value.strip()  # Remove leading/trailing whitespace

    # Booleans are handled explicitly and first so that 'True'/'False' are returned as real booleans rather than
    # being left as strings by the numeric parsing below
    lowered = value.lower()
    if lowered in ('true', 'false'):
        return lowered == 'true'

    # Try the numeric types from the most to the least restrictive: integer first, then float
    for converter in (int, float):
        try:
            return converter(value)
        except ValueError:
            continue

    # Check if it's a list using `ast.literal_eval`. Besides ValueError and SyntaxError, literal_eval can raise
    # TypeError (e.g. an unhashable dictionary key such as '{[1]: 2}'), MemoryError or RecursionError on malformed
    # input; in all of those cases the value is simply not a list, so fall back to the string.
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return value

    # Only lists are converted; any other literal (tuple, dict, quoted string, ...) is returned as the string
    return parsed if isinstance(parsed, list) else value
27 changes: 26 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
from tlo import Date, Simulation
from tlo.analysis.utils import parse_log_file
from tlo.methods import demography
from tlo.util import DEFAULT_MOTHER_ID, convert_excel_files_to_csv, read_csv_files
from tlo.util import (
DEFAULT_MOTHER_ID,
convert_excel_files_to_csv,
parse_csv_values_for_columns_with_mixed_datatypes,
read_csv_files,
)

path_to_files = Path(os.path.dirname(__file__))

Expand Down Expand Up @@ -498,3 +503,23 @@ def check_logic_of_converting_excel_files_to_csv_files(folder: Path, files: list
# check behaviours are as expected. New folders containing csv files should be created with names resembling the
# Excel file they were created from
check_logic_of_converting_excel_files_to_csv_files(tmpdir_resourcefilepath, excel_files)

def test_parse_values_in_mixed_datatypes_columns():
    """Check that values from a mixed datatype column are parsed to the expected Python types.

    The dataframe built here mimics what read csv returns for a column holding mixed datatypes: every entry is a
    string.
    """
    # one string entry per datatype we expect to recover: int, str, list and float
    mixed_data_df = pd.DataFrame(data={'param_values': ['54', 'inc_malaria', '[1,2,3]', '0.2']})

    # before parsing, every value must be a string
    for raw_value in mixed_data_df.param_values:
        assert isinstance(raw_value, str)

    # datatypes expected after parsing, in row order
    exp_dtypes = [int, str, list, float]

    # parse the column value by value
    parsed_column = mixed_data_df['param_values'].apply(parse_csv_values_for_columns_with_mixed_datatypes)
    mixed_data_df['param_values'] = parsed_column

    # each row should now hold a value of its expected datatype
    for _index, exp_dtype in enumerate(exp_dtypes):
        parsed_value = mixed_data_df.loc[_index, "param_values"]
        assert isinstance(parsed_value, exp_dtype)

0 comments on commit d6fcffd

Please sign in to comment.