From 25143bc87e2ed452980024824b1f94125ae3b21f Mon Sep 17 00:00:00 2001
From: Kashif Rasul
Date: Sat, 21 Sep 2024 10:28:41 +0200
Subject: [PATCH] 1 - gpd.cdf(score) < threshold

Flag a score as anomalous when its exceedance probability under the
fitted GPD, 1 - cdf(score), falls below the threshold; the previous
test compared the cdf itself, which flagged the least extreme
exceedances instead of the most extreme ones. Also record the
predicted distribution mean alongside the anomaly flags, pass the
dataset's static categorical features to the estimator, persist the
results with to_pickle (the columns hold whole arrays, which a CSV
round-trip would stringify), and lower the default top-score
percentage for GPD fitting from 0.2 to 0.1.
---
 examples/anomaly_detection_pytorch.py | 33 +++++++++++++++++++++------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/examples/anomaly_detection_pytorch.py b/examples/anomaly_detection_pytorch.py
index d3b8c24fc8..ac8ccd61d3 100644
--- a/examples/anomaly_detection_pytorch.py
+++ b/examples/anomaly_detection_pytorch.py
@@ -90,6 +90,12 @@ def main(args):
         prediction_length=dataset.metadata.prediction_length,
         context_length=args.context_length,
         freq=dataset.metadata.freq,
+        num_feat_static_cat=len(dataset.metadata.feat_static_cat),
+        cardinality=[
+            int(cat_feat_info.cardinality)
+            for cat_feat_info in dataset.metadata.feat_static_cat
+        ],
+        embedding_dimension=[3],
         trainer_kwargs=dict(
             max_epochs=args.max_epochs,
         ),
@@ -108,6 +114,7 @@ def main(args):
     )
 
     anomalies = []
+    means = []
     model.eval()
     with torch.no_grad():
         for batch in tqdm(test_data_loader, desc="Processing batches"):
@@ -154,6 +161,7 @@ def main(args):
             )
             scaled_past_target = inputs["past_target"] / scale
             batch_anomalies = []
+            batch_means = []
             for i in tqdm(
                 range(inputs["future_target"].shape[1]),
                 desc="Processing prediction length",
@@ -161,12 +169,12 @@ def main(args):
             ):
                 target = inputs["future_target"][:, i : i + 1]
                 score = -distr.log_prob(target)
-
+                batch_means.append(distr.mean)
                 # only check if its an anomaly for scores greater than gpd.loc for each entry in the batch
                 is_anomaly = torch.where(
                     score < gpd.loc,
                     False,
-                    gpd.cdf(score) < args.anomaly_threshold,
+                    1 - gpd.cdf(score) < args.anomaly_threshold,
                 )
 
                 batch_anomalies.append(is_anomaly)
@@ -193,14 +201,19 @@ def main(args):
                 distr = model.output_distribution(params, scale=scale)
             # stack the batch_anomalies along the prediction length dimension
             anomalies.append(torch.stack(batch_anomalies, dim=1))
+            means.append(torch.stack(batch_means, dim=1))
 
     # concat the anomalies along the batch dimension
     anomalies = torch.cat(anomalies, dim=0).cpu().numpy()
+    means = torch.cat(means, dim=0).cpu().numpy()
-    # save as csv
+    # save as pkl
     all_dates = []
    all_flags = []
     all_targets = []
-    for i, (entry, flags) in enumerate(zip(dataset.test, anomalies)):
+    all_means = []
+    for i, (entry, flags, mean) in enumerate(
+        zip(dataset.test, anomalies, means)
+    ):
         start_date = entry["start"].to_timestamp()
         target = entry["target"]
         dates = pd.date_range(
@@ -212,13 +225,19 @@ def main(args):
         all_dates.append(date_index)
         all_flags.append(flags.flatten().astype(bool))
         all_targets.append(target_slice)
+        all_means.append(mean.flatten())
 
     # create a dataframe with the date_index and the flags
     anomaly_df = pd.DataFrame(
-        {"date": all_dates, "is_anomaly": all_flags, "target": all_targets}
+        {
+            "date": all_dates,
+            "is_anomaly": all_flags,
+            "target": all_targets,
+            "mean": all_means,
+        }
     )
     anomaly_df.set_index("date", inplace=True)
-    anomaly_df.to_csv(f"anomalies_{args.dataset}.csv")
+    anomaly_df.to_pickle(f"anomalies_{args.dataset}.pkl")
 
 
 if __name__ == "__main__":
@@ -246,7 +265,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--top_score_percentage",
         type=float,
-        default=0.2,
+        default=0.1,
         help="Percentage of top scores to consider for GPD fitting",
     )
     parser.add_argument(
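
Note (editorial, not part of the patch): the change above flags a step as
anomalous when the exceedance probability of its score under the fitted GPD,
1 - cdf(score), falls below the threshold. Below is a minimal self-contained
sketch of that test, using scipy.stats.genpareto as a stand-in for the
script's fitted GPD; the exponential scores, the 0.05 threshold, and the
variable names are illustrative, not taken from the example script:

    import numpy as np
    from scipy.stats import genpareto

    rng = np.random.default_rng(0)
    # Stand-in anomaly scores, e.g. per-step negative log-likelihoods.
    scores = rng.exponential(scale=1.0, size=10_000)

    # Peaks-over-threshold fit: keep the top 10% of scores (the new
    # --top_score_percentage default) and fit a GPD to their excesses.
    tail = np.sort(scores)[-int(0.1 * scores.size):]
    loc = tail.min()
    shape, _, scale = genpareto.fit(tail - loc, floc=0.0)

    threshold = 0.05  # illustrative --anomaly_threshold value
    new_scores = rng.exponential(scale=1.0, size=100)

    # Mirror the patched test: scores below gpd.loc are never flagged;
    # above it, flag when the exceedance probability 1 - cdf(score)
    # drops below the threshold (sf is scipy's 1 - cdf).
    tail_prob = genpareto.sf(new_scores, shape, loc=loc, scale=scale)
    is_anomaly = (new_scores >= loc) & (tail_prob < threshold)
    print(f"{int(is_anomaly.sum())} of {new_scores.size} scores flagged")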
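
On the to_csv -> to_pickle switch: each DataFrame row holds whole per-series
arrays (dates, flags, targets, means), which a CSV round-trip would flatten
to strings, while a pickle preserves them. Reading the output back is a
one-liner; "electricity" is an illustrative dataset name here, the script
writes f"anomalies_{args.dataset}.pkl":

    import pandas as pd

    df = pd.read_pickle("anomalies_electricity.pkl")
    first = df.iloc[0]  # one row per series; cells are numpy arrays
    print(int(first["is_anomaly"].sum()), "flagged steps in the first series")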