Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Aggregate the error summaries for each upgrade. #248

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 30 additions & 4 deletions postprocessing/comstockpostproc/comstock.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,11 +242,12 @@ def __init__(self, s3_base_dir, comstock_run_name, comstock_run_version, comstoc
# Now, we have self.data is one huge LazyFrame
# which is exactly like self.data was before because it includes all upgrades
self.data = pl.concat(up_lazyframes)
self._aggregate_failure_summaries()
# logger.info(f'comstock data schema: {self.data.dtypes()}')
# logger.debug('\nComStock columns after adding all data:')
# for c in self.data.columns:
# logger.debug(c)

def download_data(self):
# Get data on the s3 resource to download data from:
if self.s3_inpath is None:
Expand Down Expand Up @@ -545,14 +546,14 @@ def load_data(self, upgrade_id, acceptable_failure_percentage=0.01, drop_failed_
# Fill Nulls in measure-within-upgrade applicability columns with False
for c, dt in up_res.schema.items():
if 'applicable' in c:
if dt == pl.Null:
if dt == pl.Null or dt == pl.Boolean:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Small patch to handle the case when type of column is pl.Boolean

logger.debug(f'For {c}: Nulls set to False (Boolean) in baseline')
up_res = up_res.with_columns([pl.col(c).fill_null(pl.lit(False))])
elif dt == pl.Utf8:
logger.debug(f'For {c}: Nulls set to "False" (String) in baseline')
up_res = up_res.with_columns([pl.col(c).fill_null(pl.lit("False"))])
up_res = up_res.with_columns([pl.when(pl.col(c).str.lengths() == 0).then(pl.lit('False')).otherwise(pl.col(c)).keep_name()])

assert up_res.get_column(c).null_count() == 0, f'Column {c} contains null values'
# Convert columns with only 'True' and/or 'False' strings to Boolean
for col, dt in up_res.schema.items():
if not dt == pl.Utf8:
Expand Down Expand Up @@ -738,7 +739,7 @@ def load_data(self, upgrade_id, acceptable_failure_percentage=0.01, drop_failed_
ST_FAIL_NO_STATUS,
]
failure_summaries = failure_summaries.select(fs_cols)
file_name = f'failure_summary.csv'
file_name = f'failure_summary_{upgrade_id}.csv'
file_path = os.path.abspath(os.path.join(self.output_dir, file_name))
logger.info(f'Exporting to: {file_path}')
failure_summaries.write_csv(file_path)
Expand Down Expand Up @@ -2972,3 +2973,28 @@ def export_data_and_enumeration_dictionary(self):
file_path = os.path.abspath(os.path.join(self.output_dir, file_name))
logger.info(f'Exporting enumeration dictionary to: {file_path}')
enum_dictionary.write_csv(file_path, separator='\t')

def _aggregate_failure_summaries(self):
#sinece we are generating summary of falures based on
#each upgrade_id(in load_data()), we should aggregate
#the summary of failures for each upgrade_id into one

path = os.path.join(self.output_dir)

alLines = list()
#find all the failure_summary files like with failure_summary_0.csv
# failure_summary_1.csv ... failure_summary_k.csv
for file in os.listdir(path):
if file.startswith("failure_summary_") and file.endswith(".csv"):
#open the file and read the content
with open(os.path.join(path, file), 'r') as f:
for line in f:
if line not in alLines:
alLines.append(line)
#delete the file
# os.remove(os.path.join(path, file))
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you don't want to keep the err_summary per each upgrade, just uncomment this line.


#write the aggregated summary of failures to a new file
with open(os.path.join(path, "failure_summary_aggregated.csv"), 'w') as f:
for line in alLines:
f.write(line)