Parse CSV output for all parameter values read outside read parameter…

…s method (#1541)
UCL · Dec 11, 2024 · d6fcffd · d6fcffd
1 parent b673a4a
commit d6fcffd
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 1 deletion.
diff --git a/src/tlo/util.py b/src/tlo/util.py
@@ -1,4 +1,5 @@
 """This file contains helpful utility functions."""
+import ast
 import hashlib
 from collections import defaultdict
 from pathlib import Path
@@ -531,3 +532,40 @@ def clean_dataframe(dataframes_dict: dict[str, DataFrame]) -> None:
     # return a dictionary if return_dict flag is set to True else return a dataframe
     return all_data if return_dict else next(iter(all_data.values()))
 
+
+def parse_csv_values_for_columns_with_mixed_datatypes(value: Any):
+    """ Pandas :py:func:`pandas.read_csv` function handles columns with mixed data types by defaulting to the object
+    data type, which often results in values being interpreted as strings. The most common place for this in TLO is
+    when we are reading parameters. This is not a problem when the parameters are read in the read parameters method
+    using the load_parameters_from_dataframe method, because parameter values are mapped to their defined datatypes.
+
+    Problems arise when the output from the csv files is used directly, as is done in a few files in TLO. This
+    function tries to provide a fix by parsing the parameter values in those few places to their best possible
+    data types.
+
+    :param value: mixed datatype column value
+    """
+    # if value is not a string then return value
+    if not isinstance(value, str):
+        return value
+
+    value = value.strip()  # Remove leading/trailing whitespace
+    # Catch booleans explicitly: int(), float() and the list check below do not turn the strings 'true'/'false'
+    # into booleans, so without this check they would be returned unchanged as strings
+    if value.lower() in ['true', 'false']:
+        return value.lower() == 'true'
+
+    try:
+        return int(value)  # try converting the value to an integer; raises ValueError otherwise
+    except ValueError:
+        try:
+            return float(value)  # try converting the value to a float; raises ValueError otherwise
+        except ValueError:
+            # Check if it's a list using `ast.literal_eval`
+            try:
+                parsed = ast.literal_eval(value)
+                if isinstance(parsed, list):
+                    return parsed
+            except (ValueError, SyntaxError):
+                pass
+            return value  # Return as a string if no other type fits
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -15,7 +15,12 @@
 from tlo import Date, Simulation
 from tlo.analysis.utils import parse_log_file
 from tlo.methods import demography
-from tlo.util import DEFAULT_MOTHER_ID, convert_excel_files_to_csv, read_csv_files
+from tlo.util import (
+    DEFAULT_MOTHER_ID,
+    convert_excel_files_to_csv,
+    parse_csv_values_for_columns_with_mixed_datatypes,
+    read_csv_files,
+)
 
 path_to_files = Path(os.path.dirname(__file__))
 
@@ -498,3 +503,23 @@ def check_logic_of_converting_excel_files_to_csv_files(folder: Path, files: list
     # check behaviours are as expected. New folders containing csv files should be created with names resembling the
     # Excel file they were created from
     check_logic_of_converting_excel_files_to_csv_files(tmpdir_resourcefilepath, excel_files)
+
+def test_parse_values_in_mixed_datatypes_columns():
+    """ parse values from a mixed datatype column. Here we create a dataframe with a column that resembles output from
+    read csv when presented with a mixed datatype column
+    """
+
+    # define a dataframe with a mixed-type column, setting all values as strings (this is the default behaviour when
+    # reading csv files that contain a column with mixed datatypes)
+    mixed_data_df = pd.DataFrame(data={'param_values':['54', 'inc_malaria', '[1,2,3]', '0.2']})
+    # confirm all values are strings
+    for value in mixed_data_df.param_values:
+        assert isinstance(value, str)
+    # expected datatypes
+    exp_dtypes = [int, str, list, float]
+    # parse values
+    mixed_data_df['param_values'] = mixed_data_df[
+        'param_values'].apply(parse_csv_values_for_columns_with_mixed_datatypes)
+    # confirm value data type is now as expected
+    for _index, exp_dtype in enumerate(exp_dtypes):
+        assert isinstance(mixed_data_df.loc[_index, "param_values"], exp_dtype)