✨ anomalist: GP support, refactor functions, add dfReduced (#3416)

* ✨ anomalist: nits
* abstract df parsing logic
* add GP outlier
* add dfReduced to table
* reset index
* incorporate GP
* re-arrange functions, add link to indicator
* stop reducing dfScore
owid · Oct 16, 2024 · fe3027b
1 parent aab0dc2
commit fe3027b
Show file tree

Hide file tree

Showing 5 changed files with 186 additions and 161 deletions.
diff --git a/apps/anomalist/anomalist_api.py b/apps/anomalist/anomalist_api.py
@@ -284,6 +284,15 @@ def anomaly_detection(
             )
             anomaly.dfScore = df_score_long
 
+            # Reduce dataframe
+            df_score_long_reduced = (
+                df_score_long.sort_values("anomaly_score", ascending=False)
+                .drop_duplicates(subset=["entity_name", "variable_id"], keep="first")
+                .reset_index(drop=True)
+            )
+            anomaly.dfReduced = df_score_long_reduced
+
+            ##################################################################
             # TODO: Use this as an alternative to storing binary files in the DB
             # anomaly = gm.Anomaly(
             #     datasetId=dataset_id,
@@ -293,6 +302,7 @@ def anomaly_detection(
 
             # # Export anomaly file
             # anomaly.path_file = export_anomalies_file(df_score, dataset_id, detector.anomaly_type)
+            ##################################################################
 
             if not dry_run:
                 with Session(engine) as session:

diff --git a/apps/anomalist/detectors.py b/apps/anomalist/detectors.py
@@ -41,13 +41,6 @@ def get_long_format_score_df(df_score: pd.DataFrame) -> pd.DataFrame:
     # Drop zero anomalies.
     df_score_long = df_score_long[df_score_long["anomaly_score"] != 0]
 
-    # For now, keep only the highest anomaly score for each country-indicator.
-    df_score_long = (
-        df_score_long.sort_values("anomaly_score", ascending=False)
-        .drop_duplicates(subset=["variable_id", "entity_name"], keep="first")
-        .reset_index(drop=True)
-    )
-
     # Save memory by converting to categoricals.
     df_score_long = df_score_long.astype({"entity_name": "category", "year": "category", "variable_id": "category"})