diff --git a/Java/README.md b/Java/README.md index 61c34938..e2759c77 100644 --- a/Java/README.md +++ b/Java/README.md @@ -157,7 +157,7 @@ vector data point, scores the data point, and then updates the model with this point. The program output appends a column of anomaly scores to the input: ```text -$ java -cp core/target/randomcutforest-core-4.1.0.jar com.amazon.randomcutforest.runner.AnomalyScoreRunner < ../example-data/rcf-paper.csv > example_output.csv +$ java -cp core/target/randomcutforest-core-4.2.0.jar com.amazon.randomcutforest.runner.AnomalyScoreRunner < ../example-data/rcf-paper.csv > example_output.csv $ tail example_output.csv -5.0029,0.0170,-0.0057,0.8129401629464965 -4.9975,-0.0102,-0.0065,0.6591046054520615 @@ -176,8 +176,8 @@ read additional usage instructions, including options for setting model hyperparameters, using the `--help` flag: ```text -$ java -cp core/target/randomcutforest-core-4.1.0.jar com.amazon.randomcutforest.runner.AnomalyScoreRunner --help -Usage: java -cp target/random-cut-forest-4.1.0.jar com.amazon.randomcutforest.runner.AnomalyScoreRunner [options] < input_file > output_file +$ java -cp core/target/randomcutforest-core-4.2.0.jar com.amazon.randomcutforest.runner.AnomalyScoreRunner --help +Usage: java -cp target/random-cut-forest-4.2.0.jar com.amazon.randomcutforest.runner.AnomalyScoreRunner [options] < input_file > output_file Compute scalar anomaly scores from the input rows and append them to the output rows. @@ -239,14 +239,14 @@ framework. Build an executable jar containing the benchmark code by running To invoke the full benchmark suite: ```text -% java -jar benchmark/target/randomcutforest-benchmark-4.1.0-jar-with-dependencies.jar +% java -jar benchmark/target/randomcutforest-benchmark-4.2.0-jar-with-dependencies.jar ``` The full benchmark suite takes a long time to run. You can also pass a regex at the command-line, then only matching benchmark methods will be executed. 
```text -% java -jar benchmark/target/randomcutforest-benchmark-4.1.0-jar-with-dependencies.jar RandomCutForestBenchmark\.updateAndGetAnomalyScore +% java -jar benchmark/target/randomcutforest-benchmark-4.2.0-jar-with-dependencies.jar RandomCutForestBenchmark\.updateAndGetAnomalyScore ``` [rcf-paper]: http://proceedings.mlr.press/v48/guha16.pdf diff --git a/Java/benchmark/pom.xml b/Java/benchmark/pom.xml index 73d45069..02e05265 100644 --- a/Java/benchmark/pom.xml +++ b/Java/benchmark/pom.xml @@ -6,7 +6,7 @@ software.amazon.randomcutforest randomcutforest-parent - 4.1.0 + 4.2.0 randomcutforest-benchmark diff --git a/Java/core/pom.xml b/Java/core/pom.xml index 6f607062..e7262ef0 100644 --- a/Java/core/pom.xml +++ b/Java/core/pom.xml @@ -6,7 +6,7 @@ software.amazon.randomcutforest randomcutforest-parent - 4.1.0 + 4.2.0 randomcutforest-core diff --git a/Java/core/src/main/java/com/amazon/randomcutforest/preprocessor/ImputePreprocessor.java b/Java/core/src/main/java/com/amazon/randomcutforest/preprocessor/ImputePreprocessor.java index 4a22e042..e87fecd8 100644 --- a/Java/core/src/main/java/com/amazon/randomcutforest/preprocessor/ImputePreprocessor.java +++ b/Java/core/src/main/java/com/amazon/randomcutforest/preprocessor/ImputePreprocessor.java @@ -142,7 +142,7 @@ protected void updateTimestamps(long timestamp) { * continuously since we are always counting missing values that should * eventually be reset to zero. To address the issue, we add code in method * updateForest to decrement numberOfImputed when we move to a new timestamp, - * provided there is no imputation. This ensures th e imputation fraction does + * provided there is no imputation. This ensures the imputation fraction does * not increase as long as the imputation is continuing. This also ensures that * the forest update decision, which relies on the imputation fraction, * functions correctly. 
The forest is updated only when the imputation fraction diff --git a/Java/core/src/test/java/com/amazon/randomcutforest/SampleSummaryTest.java b/Java/core/src/test/java/com/amazon/randomcutforest/SampleSummaryTest.java index a6948da2..bd078bd0 100644 --- a/Java/core/src/test/java/com/amazon/randomcutforest/SampleSummaryTest.java +++ b/Java/core/src/test/java/com/amazon/randomcutforest/SampleSummaryTest.java @@ -20,6 +20,7 @@ import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -343,7 +344,11 @@ public void ParallelTest(BiFunction distance) { assertEquals(summary2.weightOfSamples, summary1.weightOfSamples, " sampling inconsistent"); assertEquals(summary2.summaryPoints.length, summary1.summaryPoints.length, " incorrect length of typical points"); - assertEquals(clusters.size(), summary1.summaryPoints.length); + // due to randomization, the two counts may not be equal, so allow a difference of at most 1 + assertTrue( + Math.abs(clusters.size() - summary1.summaryPoints.length) <= 1, + "The difference between clusters.size() and summary1.summaryPoints.length should be at most 1" + ); double total = clusters.stream().map(ICluster::getWeight).reduce(0.0, Double::sum); assertEquals(total, summary1.weightOfSamples, 1e-3); // parallelization can produce reordering of merges diff --git a/Java/examples/pom.xml b/Java/examples/pom.xml index 2a8805b0..6a840d78 100644 --- a/Java/examples/pom.xml +++ b/Java/examples/pom.xml @@ -7,7 +7,7 @@ software.amazon.randomcutforest randomcutforest-parent - 4.1.0 + 4.2.0 randomcutforest-examples diff --git a/Java/parkservices/pom.xml b/Java/parkservices/pom.xml index 103425f4..203b0916 100644 --- a/Java/parkservices/pom.xml +++ b/Java/parkservices/pom.xml @@ -6,7 +6,7 
@@ software.amazon.randomcutforest randomcutforest-parent - 4.1.0 + 4.2.0 randomcutforest-parkservices diff --git a/Java/parkservices/src/main/java/com/amazon/randomcutforest/parkservices/PredictorCorrector.java b/Java/parkservices/src/main/java/com/amazon/randomcutforest/parkservices/PredictorCorrector.java index 08abbace..84a23742 100644 --- a/Java/parkservices/src/main/java/com/amazon/randomcutforest/parkservices/PredictorCorrector.java +++ b/Java/parkservices/src/main/java/com/amazon/randomcutforest/parkservices/PredictorCorrector.java @@ -464,19 +464,74 @@ protected

DiVector constructUncertaintyBox(float[] double[] gapLow = new double[baseDimensions]; double[] gapHigh = new double[baseDimensions]; for (int y = 0; y < baseDimensions; y++) { + // 'a' represents the scaled value of the current point for dimension 'y'. + // Given that 'point[startPosition + y]' is the normalized value of the actual + // data point (x - mean) / std, + // and 'scale[y]' is the standard deviation (std), we have: + // a = std * ((x - mean) / std) = x - mean double a = scale[y] * point[startPosition + y]; + + // 'shiftBase' is the shift value for dimension 'y', i.e. the mean double shiftBase = shift[y]; + + // Initialize 'shiftAmount' to zero. This will account for numerical precision + // adjustments later double shiftAmount = 0; + + // If the mean ('shiftBase') is not zero, adjust 'shiftAmount' to account for + // numerical precision if (shiftBase != 0) { + // 'shiftAmount' accounts for potential numerical errors due to shifting and + // scaling shiftAmount += DEFAULT_NORMALIZATION_PRECISION * (scale[y] + Math.abs(shiftBase)); } + + // Calculate the average L1 deviation along the path for dimension 'y'. + // This function computes the average absolute difference between successive + // values in the shingle, + // helping to capture recent fluctuations or trends in the data. double pathGap = calculatePathDeviation(point, startPosition, y, baseDimension, differenced); + + // 'noiseGap' is calculated based on the noise factor and the deviation for + // dimension 'y'. + // It represents the expected variation due to noise, scaled appropriately. double noiseGap = noiseFactor * result.getDeviations()[baseDimension + y]; + + // 'gap' is the maximum of the scaled 'pathGap' and 'noiseGap', adjusted by + // 'shiftAmount' + // and a small constant to ensure it's not zero. This gap accounts for recent + // deviations and noise, + // and serves as a baseline threshold for detecting anomalies. 
double gap = max(scale[y] * pathGap, noiseGap) + shiftAmount + DEFAULT_NORMALIZATION_PRECISION; - gapLow[y] = max(max(ignoreNearExpectedFromBelow[y], ignoreNearExpectedFromBelowByRatio[y] * Math.abs(a)), - gap); - gapHigh[y] = max(max(ignoreNearExpectedFromAbove[y], ignoreNearExpectedFromAboveByRatio[y] * Math.abs(a)), - gap); + + // Compute 'gapLow[y]' and 'gapHigh[y]', which are thresholds to determine if + // the deviation is significant + // Since 'a = x - mean' and 'shiftBase = mean', then 'a + shiftBase = x - mean + + // mean = x' + // Therefore, 'Math.abs(a + shiftBase)' simplifies to the absolute value of the + // actual data point |x| + // For 'gapLow[y]', calculate the maximum of: + // - 'ignoreNearExpectedFromBelow[y]', an absolute threshold for ignoring small + // deviations below expected + // - 'ignoreNearExpectedFromBelowByRatio[y] * |x|', a relative threshold based + // on the actual value x + // - 'gap', the calculated deviation adjusted for noise and precision + // This ensures that minor deviations within the specified ratio or fixed + // threshold are ignored, + // reducing false positives. + gapLow[y] = max(max(ignoreNearExpectedFromBelow[y], + ignoreNearExpectedFromBelowByRatio[y] * (Math.abs(a + shiftBase))), gap); + + // Similarly, for 'gapHigh[y]': + // - 'ignoreNearExpectedFromAbove[y]', an absolute threshold for ignoring small + // deviations above expected + // - 'ignoreNearExpectedFromAboveByRatio[y] * |x|', a relative threshold based + // on the actual value x + // - 'gap', the calculated deviation adjusted for noise and precision + // This threshold helps in ignoring anomalies that are within an acceptable + // deviation ratio from the expected value. 
+ gapHigh[y] = max(max(ignoreNearExpectedFromAbove[y], + ignoreNearExpectedFromAboveByRatio[y] * (Math.abs(a + shiftBase))), gap); } return new DiVector(gapHigh, gapLow); } diff --git a/Java/parkservices/src/test/java/com/amazon/randomcutforest/parkservices/IgnoreTest.java b/Java/parkservices/src/test/java/com/amazon/randomcutforest/parkservices/IgnoreTest.java new file mode 100644 index 00000000..15517cea --- /dev/null +++ b/Java/parkservices/src/test/java/com/amazon/randomcutforest/parkservices/IgnoreTest.java @@ -0,0 +1,153 @@ +/* + * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * or in the "license" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +package com.amazon.randomcutforest.parkservices; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.time.LocalDateTime; +import java.time.temporal.ChronoUnit; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.Set; +import java.util.TreeSet; + +import org.junit.jupiter.api.Test; + +import com.amazon.randomcutforest.config.ForestMode; +import com.amazon.randomcutforest.config.Precision; +import com.amazon.randomcutforest.config.TransformMethod; + +public class IgnoreTest { + @Test + public void testAnomalies() { + // Initialize the forest parameters + int shingleSize = 8; + int numberOfTrees = 50; + int sampleSize = 256; + Precision precision = Precision.FLOAT_32; + int baseDimensions = 1; + + long count = 0; + int dimensions = baseDimensions * shingleSize; + + // Build the ThresholdedRandomCutForest + ThresholdedRandomCutForest forest = new ThresholdedRandomCutForest.Builder<>().compact(true) + .dimensions(dimensions).randomSeed(0).numberOfTrees(numberOfTrees).shingleSize(shingleSize) + .sampleSize(sampleSize).precision(precision).anomalyRate(0.01).forestMode(ForestMode.STREAMING_IMPUTE) + .transformMethod(TransformMethod.NORMALIZE).autoAdjust(true) + .ignoreNearExpectedFromAboveByRatio(new double[] { 0.1 }) + .ignoreNearExpectedFromBelowByRatio(new double[] { 0.1 }).build(); + + // Generate the list of doubles + List randomDoubles = generateUniformRandomDoubles(); + + // List to store detected anomaly indices + List anomalies = new ArrayList<>(); + + // Process each data point through the forest + for (double val : randomDoubles) { + double[] point = new double[] { val }; + long newStamp = 100 * count; + + AnomalyDescriptor result = forest.process(point, newStamp); + + if (result.getAnomalyGrade() != 0) { + anomalies.add((int) count); + } + ++count; + } + + // Expected anomalies + List expectedAnomalies = Arrays.asList(273, 283, 505, 1323); + + 
System.out.println("Anomalies detected at indices: " + anomalies); + + // Verify that all expected anomalies are detected + assertTrue(anomalies.containsAll(expectedAnomalies), + "Anomalies detected do not contain all expected anomalies"); + } + + public static List generateUniformRandomDoubles() { + // Set fixed times for reproducibility + LocalDateTime startTime = LocalDateTime.of(2020, 1, 1, 0, 0, 0); + LocalDateTime endTime = LocalDateTime.of(2020, 1, 2, 0, 0, 0); + long totalIntervals = ChronoUnit.MINUTES.between(startTime, endTime); + + // Generate timestamps (not used but kept for completeness) + List timestamps = new ArrayList<>(); + for (int i = 0; i < totalIntervals; i++) { + timestamps.add(startTime.plusMinutes(i)); + } + + // Initialize variables + Random random = new Random(0); // For reproducibility + double level = 0; + List logCounts = new ArrayList<>(); + + // Decide random change points where level will change + int numChanges = random.nextInt(6) + 5; // Random number between 5 and 10 inclusive + + Set changeIndicesSet = new TreeSet<>(); + changeIndicesSet.add(0); // Ensure the first index is included + + while (changeIndicesSet.size() < numChanges) { + int idx = random.nextInt((int) totalIntervals - 1) + 1; // Random index between 1 and totalIntervals -1 + changeIndicesSet.add(idx); + } + + List changeIndices = new ArrayList<>(changeIndicesSet); + + // Generate levels at each change point + List levels = new ArrayList<>(); + for (int i = 0; i < changeIndices.size(); i++) { + if (i == 0) { + level = random.nextDouble() * 10; // Starting level between 0 and 10 + } else { + double increment = -2 + random.nextDouble() * 7; // Random increment between -2 and 5 + level = Math.max(0, level + increment); + } + levels.add(level); + } + + // Now generate logCounts for each timestamp with even smoother transitions + int currentLevelIndex = 0; + for (int idx = 0; idx < totalIntervals; idx++) { + if (currentLevelIndex + 1 < changeIndices.size() && idx >= 
changeIndices.get(currentLevelIndex + 1)) { + currentLevelIndex++; + } + level = levels.get(currentLevelIndex); + double sineWave = Math.sin((idx % 300) * (Math.PI / 150)) * 0.05 * level; + double noise = (-0.01 * level) + random.nextDouble() * (0.02 * level); // Noise between -0.01*level and + // 0.01*level + double count = Math.max(0, level + sineWave + noise); + logCounts.add(count); + } + + // Introduce controlled changes for anomaly detection testing + for (int changeIdx : changeIndices) { + if (changeIdx + 10 < totalIntervals) { + logCounts.set(changeIdx + 5, logCounts.get(changeIdx + 5) * 1.05); // 5% increase + logCounts.set(changeIdx + 10, logCounts.get(changeIdx + 10) * 1.10); // 10% increase + } + } + + // Output the generated logCounts + System.out.println("Generated logCounts of size: " + logCounts.size()); + return logCounts; + } +} diff --git a/Java/pom.xml b/Java/pom.xml index af47a502..78eba704 100644 --- a/Java/pom.xml +++ b/Java/pom.xml @@ -4,7 +4,7 @@ software.amazon.randomcutforest randomcutforest-parent - 4.1.0 + 4.2.0 pom software.amazon.randomcutforest:randomcutforest diff --git a/Java/serialization/pom.xml b/Java/serialization/pom.xml index fa070062..9053bd42 100644 --- a/Java/serialization/pom.xml +++ b/Java/serialization/pom.xml @@ -7,7 +7,7 @@ software.amazon.randomcutforest randomcutforest-parent - 4.1.0 + 4.2.0 randomcutforest-serialization diff --git a/Java/testutils/pom.xml b/Java/testutils/pom.xml index 7de80649..8c5395e7 100644 --- a/Java/testutils/pom.xml +++ b/Java/testutils/pom.xml @@ -4,7 +4,7 @@ randomcutforest-parent software.amazon.randomcutforest - 4.1.0 + 4.2.0 randomcutforest-testutils