Skip to content

Commit

Permalink
✨ anomalist: GP support, refactor functions, add dfReduced (#3416)
Browse files Browse the repository at this point in the history
* ✨ anomalist: nits

* abstract df parsing logic

* add GP outlier

* add dfReduced to table

* reset index

* incorporate GP

* re-arrange functions, add link to indicator

* stop reducing dfScore
  • Loading branch information
lucasrodes authored Oct 16, 2024
1 parent aab0dc2 commit fe3027b
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 161 deletions.
10 changes: 10 additions & 0 deletions apps/anomalist/anomalist_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,15 @@ def anomaly_detection(
)
anomaly.dfScore = df_score_long

# Reduce dataframe: keep only the row with the highest anomaly score for each (entity_name, variable_id) pair.
df_score_long_reduced = (
df_score_long.sort_values("anomaly_score", ascending=False)
.drop_duplicates(subset=["entity_name", "variable_id"], keep="first")
.reset_index(drop=True)
)
anomaly.dfReduced = df_score_long_reduced

##################################################################
# TODO: Use this as an alternative to storing binary files in the DB
# anomaly = gm.Anomaly(
# datasetId=dataset_id,
Expand All @@ -293,6 +302,7 @@ def anomaly_detection(

# # Export anomaly file
# anomaly.path_file = export_anomalies_file(df_score, dataset_id, detector.anomaly_type)
##################################################################

if not dry_run:
with Session(engine) as session:
Expand Down
7 changes: 0 additions & 7 deletions apps/anomalist/detectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,6 @@ def get_long_format_score_df(df_score: pd.DataFrame) -> pd.DataFrame:
# Drop zero anomalies.
df_score_long = df_score_long[df_score_long["anomaly_score"] != 0]

# For now, keep only the highest anomaly score for each country-indicator.
df_score_long = (
df_score_long.sort_values("anomaly_score", ascending=False)
.drop_duplicates(subset=["variable_id", "entity_name"], keep="first")
.reset_index(drop=True)
)

# Save memory by converting to categoricals.
df_score_long = df_score_long.astype({"entity_name": "category", "year": "category", "variable_id": "category"})

Expand Down
Loading

0 comments on commit fe3027b

Please sign in to comment.