Skip to content

Commit

Permalink
[ML] add new normalize_above parameter to p_value significant terms h…
Browse files Browse the repository at this point in the history
…euristic (elastic#78833)

This commit adds the new normalize_above parameter to the p_value significant
terms heuristic.

This parameter allows for consistent significance results at various scales. When a set's total count (for the documents in the class or for the documents not in the class) is above the normalize_above parameter, both that total count and the count of documents containing the term in that set are scaled by normalize_above/total_count.
  • Loading branch information
benwtrent authored Oct 12, 2021
1 parent 5428c3c commit 843fa42
Show file tree
Hide file tree
Showing 4 changed files with 151 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,10 @@ the foreground set of "ended in failure" versus "NOT ended in failure".
`"background_is_superset": false` indicates that the background set does
not contain the counts of the foreground set as they are filtered out.

`"normalize_above": 1000` facilitates returning consistent significance results
at various scales. `1000` indicates that when the total count of a set is greater
than `1000`, that total and its term count are both scaled down by a factor of
`1000/total_count`.

[source,console]
--------------------------------------------------
GET /_search
Expand Down Expand Up @@ -466,7 +470,7 @@ GET /_search
]
}
},
"p_value": {"background_is_superset": false}
"p_value": {"background_is_superset": false, "normalize_above": 1000}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.InternalAggregationTestCase;
import org.elasticsearch.test.VersionUtils;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
Expand All @@ -52,7 +53,6 @@
import static java.util.Collections.emptyMap;
import static java.util.Collections.singletonList;
import static org.elasticsearch.search.aggregations.AggregationBuilders.significantTerms;
import static org.elasticsearch.test.VersionUtils.randomVersion;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
Expand All @@ -69,9 +69,13 @@ public abstract class AbstractSignificanceHeuristicTestCase extends ESTestCase {
*/
protected abstract SignificanceHeuristic getHeuristic();

/**
 * Supplies the version that {@code testStreamResponse} serializes with. The method is
 * {@code protected} and non-final, so a subclass can substitute its own choice.
 *
 * @return the result of {@code VersionUtils.randomVersion} driven by this test's {@code random()}
 */
protected Version randomVersion() {
    final Version chosen = VersionUtils.randomVersion(random());
    return chosen;
}

// test that stream output can actually be read - does not replace bwc test
public void testStreamResponse() throws Exception {
Version version = randomVersion(random());
Version version = randomVersion();
InternalMappedSignificantTerms<?, ?> sigTerms = getRandomSignificantTerms(getHeuristic());

// write
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,57 +10,92 @@


import org.apache.commons.math3.util.FastMath;
import org.elasticsearch.Version;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.xcontent.ConstructingObjectParser;
import org.elasticsearch.xcontent.ParseField;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser;
import org.elasticsearch.search.aggregations.AggregationExecutionException;
import org.elasticsearch.search.aggregations.bucket.terms.heuristic.NXYSignificanceHeuristic;
import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;

import java.io.IOException;
import java.util.Objects;

import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg;

/**
* Significant terms heuristic that calculates the p-value between the term existing in foreground and background sets.
*
* The p-value is the probability of obtaining test results at least as extreme as
* the results actually observed, under the assumption that the null hypothesis is
* correct. The p-value is calculated assuming that the foreground set and the
* background set are independent Bernoulli trials (https://en.wikipedia.org/wiki/Bernoulli_trial),
* with the null hypothesis that the probabilities are the same.
*/
public class PValueScore extends NXYSignificanceHeuristic {
public static final String NAME = "p_value";
public static final ParseField NORMALIZE_ABOVE = new ParseField("normalize_above");
public static final ConstructingObjectParser<PValueScore, Void> PARSER = new ConstructingObjectParser<>(NAME, args -> {
boolean backgroundIsSuperset = args[0] == null || (boolean) args[0];
return new PValueScore(backgroundIsSuperset);
return new PValueScore(backgroundIsSuperset, (Long)args[1]);
});
// Registers the optional constructor arguments on PARSER. The declaration order is significant:
// the PARSER lambda consumes them positionally, reading args[0] as the BACKGROUND_IS_SUPERSET
// boolean and args[1] as the NORMALIZE_ABOVE long. Both are optional, so either may arrive as null.
static {
PARSER.declareBoolean(optionalConstructorArg(), BACKGROUND_IS_SUPERSET);
PARSER.declareLong(optionalConstructorArg(), NORMALIZE_ABOVE);
}

private static final MlChiSquaredDistribution CHI_SQUARED_DISTRIBUTION = new MlChiSquaredDistribution(1);

public PValueScore(boolean backgroundIsSuperset) {
// NOTE: `0` is a magic value indicating no normalization occurs
private final long normalizeAbove;

/**
* @param backgroundIsSuperset Does the background contain the foreground docs?
* @param normalizeAbove Should the results be normalized when above the given value.
* Note: `0` is a special value which means no normalization (set as such when `null` is provided)
*/
public PValueScore(boolean backgroundIsSuperset, Long normalizeAbove) {
super(true, backgroundIsSuperset);
if (normalizeAbove != null && normalizeAbove <= 0) {
throw new IllegalArgumentException(
"[" + NORMALIZE_ABOVE.getPreferredName() + "] must be a positive value, provided [" + normalizeAbove + "]"
);
}
this.normalizeAbove = normalizeAbove == null ? 0L : normalizeAbove;
}

/**
 * Reads the heuristic from a stream: first the background-is-superset flag, then, when the
 * stream version is on or after 8.0.0, the normalize-above threshold as a variable-length long.
 *
 * @param in the stream to read from
 * @throws IOException if reading from the stream fails
 */
public PValueScore(StreamInput in) throws IOException {
    super(true, in.readBoolean());
    // Streams older than 8.0.0 do not carry the threshold; use `0`, i.e. no normalization.
    this.normalizeAbove = in.getVersion().onOrAfter(Version.V_8_0_0) ? in.readVLong() : 0L;
}

/**
 * Writes the heuristic to a stream: the background-is-superset flag always, and the
 * normalize-above threshold only when the stream version is on or after 8.0.0.
 *
 * @param out the stream to write to
 * @throws IOException if writing to the stream fails
 */
@Override
public void writeTo(StreamOutput out) throws IOException {
    out.writeBoolean(backgroundIsSuperset);
    final boolean streamCarriesThreshold = out.getVersion().onOrAfter(Version.V_8_0_0);
    if (streamCarriesThreshold == false) {
        // nothing more to send for streams older than 8.0.0
        return;
    }
    out.writeVLong(normalizeAbove);
}

@Override
public boolean equals(Object obj) {
if ((obj instanceof PValueScore) == false) {
return false;
}
return super.equals(obj);
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
if (super.equals(o) == false) return false;
PValueScore that = (PValueScore) o;
return normalizeAbove == that.normalizeAbove;
}

@Override
public int hashCode() {
int result = NAME.hashCode();
result = 31 * result + super.hashCode();
return result;
return Objects.hash(super.hashCode(), normalizeAbove);
}

@Override
Expand All @@ -72,6 +107,9 @@ public String getWriteableName() {
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    // Renders this heuristic as an object named NAME. The normalize-above field is emitted only
    // when a positive threshold is set; `0` (no normalization) is left out of the output.
    final boolean normalizationConfigured = normalizeAbove > 0;
    builder.startObject(NAME);
    builder.field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset);
    if (normalizationConfigured) {
        builder.field(NORMALIZE_ABOVE.getPreferredName(), normalizeAbove);
    }
    builder.endObject();
    return builder;
}
Expand Down Expand Up @@ -113,6 +151,19 @@ public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long
return 0.0;
}

if (normalizeAbove > 0L) {
if (allDocsInClass > normalizeAbove) {
double factor = (double) normalizeAbove / allDocsInClass;
allDocsInClass = (long)(allDocsInClass * factor);
docsContainTermInClass = (long)(docsContainTermInClass * factor);
}
if (allDocsNotInClass > normalizeAbove) {
double factor = (double) normalizeAbove / allDocsNotInClass;
allDocsNotInClass = (long)(allDocsNotInClass * factor);
docsContainTermNotInClass = (long)(docsContainTermNotInClass * factor);
}
}

// casting to `long` to round down to nearest whole number
double epsAllDocsInClass = (long)eps(allDocsInClass);
double epsAllDocsNotInClass = (long)eps(allDocsNotInClass);
Expand Down Expand Up @@ -164,15 +215,25 @@ private double eps(double value) {
}

public static class PValueScoreBuilder extends NXYBuilder {
private final long normalizeAbove;

public PValueScoreBuilder(boolean backgroundIsSuperset) {
public PValueScoreBuilder(boolean backgroundIsSuperset, Long normalizeAbove) {
super(true, backgroundIsSuperset);
this.normalizeAbove = normalizeAbove == null ? 0L : normalizeAbove;
if (normalizeAbove != null && normalizeAbove <= 0) {
throw new IllegalArgumentException(
"[" + NORMALIZE_ABOVE.getPreferredName() + "] must be a positive value, provided [" + normalizeAbove + "]"
);
}
}

/**
 * Renders the builder as an object named {@code NAME} holding the background-is-superset flag
 * and, only when a positive threshold is set, the normalize-above value.
 *
 * @param builder the builder to write into
 * @param params  serialization parameters (not read by this method)
 * @return the same {@code builder} that was passed in
 * @throws IOException if writing to the builder fails
 */
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    final String supersetKey = BACKGROUND_IS_SUPERSET.getPreferredName();
    builder.startObject(NAME);
    builder.field(supersetKey, backgroundIsSuperset);
    if (normalizeAbove > 0) {
        final String thresholdKey = NORMALIZE_ABOVE.getPreferredName();
        builder.field(thresholdKey, normalizeAbove);
    }
    builder.endObject();
    return builder;
}
Expand Down
Loading

0 comments on commit 843fa42

Please sign in to comment.