From be21396f85cfa6c49a82a06267ecfc5629619154 Mon Sep 17 00:00:00 2001
From: Wenyi Kuang
Date: Fri, 1 Nov 2024 14:23:37 -0600
Subject: [PATCH] Aggregate the failure summaries for each upgrade.

---
 postprocessing/comstockpostproc/comstock.py | 34 ++++++++++++++++++---
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py
index d6db25b8..0f1c9517 100644
--- a/postprocessing/comstockpostproc/comstock.py
+++ b/postprocessing/comstockpostproc/comstock.py
@@ -242,11 +242,12 @@ def __init__(self, s3_base_dir, comstock_run_name, comstock_run_version, comstoc
         # Now, we have self.data is one huge LazyFrame
         # which is exactly like self.data was before because it includes all upgrades
         self.data = pl.concat(up_lazyframes)
+        self._aggregate_failure_summaries()
         # logger.info(f'comstock data schema: {self.data.dtypes()}')
         # logger.debug('\nComStock columns after adding all data:')
         # for c in self.data.columns:
         #     logger.debug(c)
-
+
     def download_data(self):
         # Get data on the s3 resource to download data from:
         if self.s3_inpath is None:
@@ -545,14 +546,14 @@ def load_data(self, upgrade_id, acceptable_failure_percentage=0.01, drop_failed_
         # Fill Nulls in measure-within-upgrade applicability columns with False
         for c, dt in up_res.schema.items():
             if 'applicable' in c:
-                if dt == pl.Null:
+                if dt == pl.Null or dt == pl.Boolean:
                     logger.debug(f'For {c}: Nulls set to False (Boolean) in baseline')
                     up_res = up_res.with_columns([pl.col(c).fill_null(pl.lit(False))])
                 elif dt == pl.Utf8:
                     logger.debug(f'For {c}: Nulls set to "False" (String) in baseline')
                     up_res = up_res.with_columns([pl.col(c).fill_null(pl.lit("False"))])
                     up_res = up_res.with_columns([pl.when(pl.col(c).str.lengths() == 0).then(pl.lit('False')).otherwise(pl.col(c)).keep_name()])
-
+                assert up_res.get_column(c).null_count() == 0, f'Column {c} contains null values'
         # Convert columns with only 'True' and/or 'False' strings to Boolean
         for col, dt in up_res.schema.items():
             if not dt == pl.Utf8:
@@ -738,7 +739,7 @@ def load_data(self, upgrade_id, acceptable_failure_percentage=0.01, drop_failed_
             ST_FAIL_NO_STATUS,
         ]
         failure_summaries = failure_summaries.select(fs_cols)
-        file_name = f'failure_summary.csv'
+        file_name = f'failure_summary_{upgrade_id}.csv'
         file_path = os.path.abspath(os.path.join(self.output_dir, file_name))
         logger.info(f'Exporting to: {file_path}')
         failure_summaries.write_csv(file_path)
@@ -2972,3 +2973,28 @@ def export_data_and_enumeration_dictionary(self):
         file_path = os.path.abspath(os.path.join(self.output_dir, file_name))
         logger.info(f'Exporting enumeration dictionary to: {file_path}')
         enum_dictionary.write_csv(file_path, separator='\t')
+
+    def _aggregate_failure_summaries(self):
+        # Since load_data() writes a separate summary of failures for
+        # each upgrade_id, aggregate the per-upgrade summaries into a
+        # single file covering the whole run.
+
+        path = self.output_dir
+
+        all_lines = []
+        # Find all the per-upgrade failure summary files, named like
+        # failure_summary_0.csv, failure_summary_1.csv ... failure_summary_k.csv
+        for file in sorted(os.listdir(path)):
+            if file.startswith("failure_summary_") and file.endswith(".csv"):
+                # Read the file, keeping each distinct line once so the repeated CSV header is deduplicated
+                with open(os.path.join(path, file), 'r') as f:
+                    for line in f:
+                        if line not in all_lines:
+                            all_lines.append(line)
+                # Optionally delete the per-upgrade file once aggregated
+                # os.remove(os.path.join(path, file))
+
+        # Write the aggregated summary of failures to a new file
+        with open(os.path.join(path, "failure_summary_aggregated.csv"), 'w') as f:
+            for line in all_lines:
+                f.write(line)
\ No newline at end of file
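
For comparison, the same aggregation could be sketched with polars, which comstock.py already imports as pl. This is only an illustrative alternative, not what the patch does: the function name, the remove_inputs flag, and the glob pattern are assumptions, and it presumes every per-upgrade summary shares the schema that load_data() writes.

import glob
import os

import polars as pl


def aggregate_failure_summaries(output_dir, remove_inputs=False):
    """Sketch: combine per-upgrade failure_summary_<id>.csv files into one CSV."""
    # Sort the per-upgrade files so the aggregated row order is deterministic
    files = sorted(glob.glob(os.path.join(output_dir, 'failure_summary_*.csv')))
    # Skip the output of a previous aggregation run, which the glob also matches
    files = [f for f in files if not f.endswith('failure_summary_aggregated.csv')]
    if not files:
        raise FileNotFoundError(f'No failure_summary_*.csv files in {output_dir}')
    # how='diagonal' unions the columns, so a column missing from one
    # upgrade's summary becomes null rather than raising a schema error
    combined = pl.concat([pl.read_csv(f) for f in files], how='diagonal')
    out_path = os.path.join(output_dir, 'failure_summary_aggregated.csv')
    combined.write_csv(out_path)
    if remove_inputs:
        for f in files:
            os.remove(f)
    return out_path

Compared with the line-membership test in the patch, reading the CSVs into DataFrames keeps exactly one header without scanning a growing list, and it cannot silently drop a failure row that happens to be byte-identical across two different upgrades.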