From be21396f85cfa6c49a82a06267ecfc5629619154 Mon Sep 17 00:00:00 2001
From: Wenyi Kuang
Date: Fri, 1 Nov 2024 14:23:37 -0600
Subject: [PATCH] Aggregate the failure summaries for each upgrade.

---
 postprocessing/comstockpostproc/comstock.py | 34 ++++++++++++++++++---
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py
index d6db25b8..0f1c9517 100644
--- a/postprocessing/comstockpostproc/comstock.py
+++ b/postprocessing/comstockpostproc/comstock.py
@@ -242,11 +242,12 @@ def __init__(self, s3_base_dir, comstock_run_name, comstock_run_version, comstoc
         # Now, we have self.data is one huge LazyFrame
         # which is exactly like self.data was before because it includes all upgrades
         self.data = pl.concat(up_lazyframes)
+        self._aggregate_failure_summaries()
         # logger.info(f'comstock data schema: {self.data.dtypes()}')
         # logger.debug('\nComStock columns after adding all data:')
         # for c in self.data.columns:
         #     logger.debug(c)
-
+
     def download_data(self):
         # Get data on the s3 resource to download data from:
         if self.s3_inpath is None:
@@ -545,14 +546,14 @@ def load_data(self, upgrade_id, acceptable_failure_percentage=0.01, drop_failed_
         # Fill Nulls in measure-within-upgrade applicability columns with False
         for c, dt in up_res.schema.items():
             if 'applicable' in c:
-                if dt == pl.Null:
+                if dt == pl.Null or dt == pl.Boolean:
                     logger.debug(f'For {c}: Nulls set to False (Boolean) in baseline')
                     up_res = up_res.with_columns([pl.col(c).fill_null(pl.lit(False))])
                 elif dt == pl.Utf8:
                     logger.debug(f'For {c}: Nulls set to "False" (String) in baseline')
                     up_res = up_res.with_columns([pl.col(c).fill_null(pl.lit("False"))])
                     up_res = up_res.with_columns([pl.when(pl.col(c).str.lengths() == 0).then(pl.lit('False')).otherwise(pl.col(c)).keep_name()])
-
+                assert up_res.get_column(c).null_count() == 0, f'Column {c} contains null values'
         # Convert columns with only 'True' and/or 'False' strings to Boolean
         for col, dt in up_res.schema.items():
             if not dt == pl.Utf8:
@@ -738,7 +739,7 @@ def load_data(self, upgrade_id, acceptable_failure_percentage=0.01, drop_failed_
             ST_FAIL_NO_STATUS,
         ]
         failure_summaries = failure_summaries.select(fs_cols)
-        file_name = f'failure_summary.csv'
+        file_name = f'failure_summary_{upgrade_id}.csv'
         file_path = os.path.abspath(os.path.join(self.output_dir, file_name))
         logger.info(f'Exporting to: {file_path}')
         failure_summaries.write_csv(file_path)
@@ -2972,3 +2973,28 @@ def export_data_and_enumeration_dictionary(self):
         file_path = os.path.abspath(os.path.join(self.output_dir, file_name))
         logger.info(f'Exporting enumeration dictionary to: {file_path}')
         enum_dictionary.write_csv(file_path, separator='\t')
+
+    def _aggregate_failure_summaries(self):
+        # Since load_data() writes a separate summary of failures for
+        # each upgrade_id, aggregate the per-upgrade summaries into a
+        # single file covering the whole run.
+
+        path = self.output_dir
+
+        all_lines = []
+        # Find all the per-upgrade failure summary files, named like
+        # failure_summary_0.csv, failure_summary_1.csv ... failure_summary_k.csv
+        for file in sorted(os.listdir(path)):
+            if file.startswith("failure_summary_") and file.endswith(".csv"):
+                # Read the file, keeping each distinct line once so the repeated CSV header is deduplicated
+                with open(os.path.join(path, file), 'r') as f:
+                    for line in f:
+                        if line not in all_lines:
+                            all_lines.append(line)
+                # Optionally delete the per-upgrade file once aggregated
+                # os.remove(os.path.join(path, file))
+
+        # Write the aggregated summary of failures to a new file
+        with open(os.path.join(path, "failure_summary_aggregated.csv"), 'w') as f:
+            for line in all_lines:
+                f.write(line)
\ No newline at end of file
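
For comparison, the same aggregation could be sketched with polars, which comstock.py already imports as pl. This is only an illustrative alternative, not what the patch does: the function name, the remove_inputs flag, and the glob pattern are assumptions, and it presumes every per-upgrade summary shares the schema that load_data() writes.

import glob
import os

import polars as pl


def aggregate_failure_summaries(output_dir, remove_inputs=False):
    """Sketch: combine per-upgrade failure_summary_<id>.csv files into one CSV."""
    # Sort the per-upgrade files so the aggregated row order is deterministic
    files = sorted(glob.glob(os.path.join(output_dir, 'failure_summary_*.csv')))
    # Skip the output of a previous aggregation run, which the glob also matches
    files = [f for f in files if not f.endswith('failure_summary_aggregated.csv')]
    if not files:
        raise FileNotFoundError(f'No failure_summary_*.csv files in {output_dir}')
    # how='diagonal' unions the columns, so a column missing from one
    # upgrade's summary becomes null rather than raising a schema error
    combined = pl.concat([pl.read_csv(f) for f in files], how='diagonal')
    out_path = os.path.join(output_dir, 'failure_summary_aggregated.csv')
    combined.write_csv(out_path)
    if remove_inputs:
        for f in files:
            os.remove(f)
    return out_path

Compared with the line-membership test in the patch, reading the CSVs into DataFrames keeps exactly one header without scanning a growing list, and it cannot silently drop a failure row that happens to be byte-identical across two different upgrades.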