Merge pull request #144 from bptlab/improvement/139-last-cleansing

Improvement/139 last cleansing
bptlab · May 31, 2024 · f84eaaf · f84eaaf
2 parents 127c998 + 7431423
commit f84eaaf
Show file tree

Hide file tree

Showing 36 changed files with 1,251 additions and 1,093 deletions.
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 
 ## Key Points
 
-This bachelorproject focuses on event log extraction from patient journeys using large-language models.
+This bachelor project focuses on event log extraction from Patient Journeys using large language models.
 
 Our project partner is mamahealth. More information about them can be found here: [mamahealth](https://www.mamahealth.io/)
 

diff --git a/tracex_project/db.sqlite3 b/tracex_project/db.sqlite3
diff --git a/tracex_project/db_results/forms.py b/tracex_project/db_results/forms.py
@@ -9,9 +9,11 @@
 
 
 class PatientJourneySelectForm(forms.Form):
-    """Form for selecting a patient journey."""
+    """Form for selecting a Patient Journey."""
 
-    selected_patient_journey = forms.ChoiceField(choices=[])
+    selected_patient_journey = forms.ChoiceField(
+        choices=[], label="Selected Patient Journey:"
+    )
 
     def __init__(self, *args, **kwargs):
         """Initializes the PatientJourneySelectForm."""
@@ -20,10 +22,14 @@ def __init__(self, *args, **kwargs):
             "selected_patient_journey"
         ].choices = self.get_patient_journey_choices()
 
-    def get_patient_journey_choices(self) -> List[Tuple[str, str]]:
-        """Retrieves the available patient journey choices with existing metrics from the database."""
-        patient_journeys = PatientJourney.manager.filter(
-            trace__events__metrics__isnull=False
+    @staticmethod
+    def get_patient_journey_choices() -> List[Tuple[str, str]]:
+        """Retrieves the available Patient Journey choices with existing metrics from the database."""
+        patient_journeys: List[PatientJourney] = PatientJourney.manager.filter(
+            trace__events__metrics__isnull=False,
+            trace__events__metrics__activity_relevance__isnull=False,
+            trace__events__metrics__timestamp_correctness__isnull=False,
+            trace__events__metrics__correctness_confidence__isnull=False,
         ).distinct()
         choices = [(pj.name, pj.name) for pj in patient_journeys]
 

diff --git a/tracex_project/db_results/templates/metrics_pj_overview.html b/tracex_project/db_results/templates/metrics_pj_overview.html
@@ -10,7 +10,7 @@
 <h2>Welcome to the Metrics Dashboard</h2>
 <p>The Metrics Dashboard is a comprehensive visual interface that displays key metrics tracked by the metrics analyzer
     module during the extraction of a trace.</p>
-<p>Please select one patient journey for that you want to see the dashboard.</p>
+<p>Please select the Patient Journey for which you want to see the dashboard.</p>
 
 <form method="post">
     {% csrf_token %}

diff --git a/tracex_project/db_results/views.py b/tracex_project/db_results/views.py
@@ -29,7 +29,7 @@ def get_context_data(self, **kwargs):
 
 
 class MetricsOverviewView(FormView):
-    """View for selecting a patient journey for showing metrics."""
+    """View for selecting a Patient Journey for showing metrics."""
 
     form_class = PatientJourneySelectForm
     template_name = "metrics_pj_overview.html"
@@ -50,11 +50,11 @@ class MetricsDashboardView(TemplateView):
 
     def get_context_data(self, **kwargs):
         """
-        Extend the existing context with additional metrics relevant to the patient journey.
+        Extend the existing context with additional metrics relevant to the Patient Journey.
 
-        This method retrieves the patient journey name from the session, fetches the corresponding
+        This method retrieves the Patient Journey name from the session, fetches the corresponding
         data frame, and updates the context object with various metrics and visualizations such as
-        counts, charts, and data tables related to the patient journey.
+        counts, charts, and data tables related to the Patient Journey.
         """
 
         context = super().get_context_data(**kwargs)
@@ -68,10 +68,10 @@ def get_context_data(self, **kwargs):
 
     def get_latest_trace_df(self) -> pd.DataFrame:
         """
-        Fetch the DataFrame for the latest trace of a specific patient journey stored in the session.
+        Fetch the DataFrame for the latest trace of a specific Patient Journey stored in the session.
 
         This method constructs a query to fetch the ID of the latest trace entry related to a
-        patient journey. It considers only those entries where activity relevance, timestamp correctness,
+        Patient Journey. It considers only those entries where activity relevance, timestamp correctness,
         and correctness confidence metrics are not null. It then retrieves the DataFrame for these
         events.
         """
@@ -90,57 +90,86 @@ def get_latest_trace_df(self) -> pd.DataFrame:
         return u.DataFrameUtilities.get_events_df(query_last_trace)
 
     def update_context_with_counts(self, context, trace_df: pd.DataFrame):
-        """Update the given context dictionary with count statistics related to patient journeys and traces."""
+        """Update the given context dictionary with count statistics related to Patient Journeys and traces."""
         patient_journey_name = self.request.session["patient_journey_name"]
 
-        context.update({
-            "patient_journey_name": patient_journey_name,
-            "total_patient_journeys": PatientJourney.manager.count(),
-            "total_traces": Trace.manager.count(),
-            "total_activities": trace_df.shape[0],
-            "traces_count": Trace.manager.filter(patient_journey__name=patient_journey_name).count()
-        })
+        context.update(
+            {
+                "patient_journey_name": patient_journey_name,
+                "total_patient_journeys": PatientJourney.manager.count(),
+                "total_traces": Trace.manager.count(),
+                "total_activities": trace_df.shape[0],
+                "traces_count": Trace.manager.filter(
+                    patient_journey__name=patient_journey_name
+                ).count(),
+            }
+        )
 
     def update_context_with_charts(self, context, trace_df: pd.DataFrame):
         """Update the context dictionary with chart visualizations."""
         relevance_counts = trace_df["activity_relevance"].value_counts()
         timestamp_correctness_counts = trace_df["timestamp_correctness"].value_counts()
 
-        context.update({
-            "activity_relevance_pie_chart": self.create_pie_chart(relevance_counts),
-            "timestamp_correctness_pie_chart": self.create_pie_chart(timestamp_correctness_counts),
-            "activity_relevance_bar_chart": self.create_bar_chart(relevance_counts, "Activity Relevance", "Count"),
-            "timestamp_correctness_bar_chart": self.create_bar_chart(timestamp_correctness_counts,
-                                                                     "Timestamp Correctness", "Count"),
-            "most_frequent_category": relevance_counts.index[0],
-            "most_frequent_category_count": relevance_counts.values[0],
-            "most_frequent_timestamp_correctness": timestamp_correctness_counts.index[0],
-            "most_frequent_timestamp_correctness_count": timestamp_correctness_counts.values[0],
-            "average_timestamp_correctness": round(trace_df["correctness_confidence"].mean(), 2)
-        })
+        context.update(
+            {
+                "activity_relevance_pie_chart": self.create_pie_chart(relevance_counts),
+                "timestamp_correctness_pie_chart": self.create_pie_chart(
+                    timestamp_correctness_counts
+                ),
+                "activity_relevance_bar_chart": self.create_bar_chart(
+                    relevance_counts, "Activity Relevance", "Count"
+                ),
+                "timestamp_correctness_bar_chart": self.create_bar_chart(
+                    timestamp_correctness_counts, "Timestamp Correctness", "Count"
+                ),
+                "most_frequent_category": relevance_counts.index[0],
+                "most_frequent_category_count": relevance_counts.values[0],
+                "most_frequent_timestamp_correctness": timestamp_correctness_counts.index[
+                    0
+                ],
+                "most_frequent_timestamp_correctness_count": timestamp_correctness_counts.values[
+                    0
+                ],
+                "average_timestamp_correctness": round(
+                    trace_df["correctness_confidence"].mean(), 2
+                ),
+            }
+        )
 
     def update_context_with_data_tables(self, context, trace_df: pd.DataFrame):
         """Format trace data into styled HTML tables and add them to the context."""
 
         # Apply renaming, styling, and convert to HTML, then update the context
         relevance_columns = ["activity", "activity_relevance"]
-        timestamp_columns = ["activity", "time:timestamp", "time:end_timestamp", "timestamp_correctness",
-                             "correctness_confidence"]
+        timestamp_columns = [
+            "activity",
+            "time:timestamp",
+            "time:end_timestamp",
+            "timestamp_correctness",
+            "correctness_confidence",
+        ]
 
         relevance_df = trace_df[relevance_columns]
         relevance_df = u.Conversion.rename_columns(relevance_df)
-        relevance_styled = relevance_df.style.set_table_attributes('class="dataframe"').apply(self.color_relevance,
-                                                                                              axis=1).hide().to_html()
+        relevance_styled = (
+            relevance_df.style.set_table_attributes('class="dataframe"')
+            .apply(self.color_relevance, axis=1)
+            .hide()
+            .to_html()
+        )
 
         timestamp_df = trace_df[timestamp_columns]
         timestamp_df = u.Conversion.rename_columns(timestamp_df)
-        timestamp_styled = timestamp_df.style.set_table_attributes('class="dataframe"').apply(
-            self.color_timestamp_correctness, axis=1).hide().to_html()
+        timestamp_styled = (
+            timestamp_df.style.set_table_attributes('class="dataframe"')
+            .apply(self.color_timestamp_correctness, axis=1)
+            .hide()
+            .to_html()
+        )
 
-        context.update({
-            "relevance_df": relevance_styled,
-            "timestamp_df": timestamp_styled
-        })
+        context.update(
+            {"relevance_df": relevance_styled, "timestamp_df": timestamp_styled}
+        )
 
     @staticmethod
     def color_relevance(row: pd.Series) -> List[str]:
@@ -169,9 +198,9 @@ def color_timestamp_correctness(row: pd.Series) -> List[str]:
             styles = ["background-color: tan"] * len(row)
 
         if (
-                low_confidence_threshold
-                <= correctness_confidence
-                <= high_confidence_threshold
+            low_confidence_threshold
+            <= correctness_confidence
+            <= high_confidence_threshold
         ):
             styles[confidence_index] = "background-color: orange"
         elif correctness_confidence < low_confidence_threshold:
@@ -244,8 +273,12 @@ def get_context_data(self, **kwargs):
         cohorts_df = self.get_cohorts_data(traces)
 
         if not event_log_df.empty:
-            event_log_df = self.filter_and_cleanup_event_log(event_log_df, filter_settings)
-            context.update(self.generate_dfg_and_tables(event_log_df, cohorts_df, filter_settings))
+            event_log_df = self.filter_and_cleanup_event_log(
+                event_log_df, filter_settings
+            )
+            context.update(
+                self.generate_dfg_and_tables(event_log_df, cohorts_df, filter_settings)
+            )
 
         context.update({"form": EvaluationForm(initial=filter_settings)})
         self.request.session["event_log"] = event_log_df.to_json()
@@ -275,27 +308,39 @@ def get_traces_and_events(self) -> Tuple[QuerySet, pd.DataFrame]:
     def get_cohorts_data(traces: QuerySet) -> pd.DataFrame:
         """Extract and format cohort data from given traces for further processing and visualization."""
         cohorts = Cohort.manager.filter(trace__in=traces)
-        cohorts_data = list(cohorts.values("trace", "age", "sex", "origin", "condition", "preexisting_condition"))
+        cohorts_data = list(
+            cohorts.values(
+                "trace", "age", "sex", "origin", "condition", "preexisting_condition"
+            )
+        )
         cohorts_df = pd.DataFrame(cohorts_data)
         if not cohorts_df.empty:
             cohorts_df["age"] = cohorts_df["age"].astype(pd.Int64Dtype())
         return cohorts_df
 
     @staticmethod
-    def filter_and_cleanup_event_log(event_log_df: pd.DataFrame, filter_settings: dict) -> pd.DataFrame:
+    def filter_and_cleanup_event_log(
+        event_log_df: pd.DataFrame, filter_settings: dict
+    ) -> pd.DataFrame:
         """Apply user-defined filters to the event log data and clean up unnecessary columns."""
         filter_dict = {
             "event_type": filter_settings.get("event_types"),
             "attribute_location": filter_settings.get("locations"),
         }
         event_log_df = u.DataFrameUtilities.filter_dataframe(event_log_df, filter_dict)
         event_log_df = event_log_df.drop(
-            columns=["activity_relevance", "timestamp_correctness", "correctness_confidence"])
+            columns=[
+                "activity_relevance",
+                "timestamp_correctness",
+                "correctness_confidence",
+            ]
+        )
         return event_log_df
 
     @staticmethod
-    def generate_dfg_and_tables(event_log_df: pd.DataFrame, cohorts_df: pd.DataFrame,
-                                filter_settings: dict) -> dict:
+    def generate_dfg_and_tables(
+        event_log_df: pd.DataFrame, cohorts_df: pd.DataFrame, filter_settings: dict
+    ) -> dict:
         """Generate visualizations and HTML tables for the provided event log and cohort data."""
         activity_key = filter_settings.get("activity_key")
         return {