Skip to content

Commit

Permalink
Retry timeout tests for aggs (elastic#122031)
Browse files Browse the repository at this point in the history
The aggs timeout test waits for the agg to return and then double checks
that the agg is stopped using the tasks API. We're seeing some failures
where the tasks API reports that the agg is still running. I can't
reproduce them because computers. This adds two things:
1. Logs the hot_threads so we can see if the query is indeed still
   running.
2. Retries the _tasks API for a minute. If it goes away soon after the
   _search returns that's *fine*. If it sticks around for more than a
   few seconds then the cancel isn't working. We wait for a minute
   because CI can't be trusted to do anything quickly.

Closes elastic#121993
  • Loading branch information
nik9000 committed Feb 7, 2025
1 parent 3a52906 commit 80c6503
Showing 1 changed file with 28 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
package org.elasticsearch.multiterms;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.util.EntityUtils;
import org.elasticsearch.action.admin.indices.create.CreateIndexResponse;
import org.elasticsearch.client.Request;
import org.elasticsearch.common.Strings;
Expand All @@ -30,6 +31,7 @@
import java.net.SocketTimeoutException;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.TimeUnit;

import static org.elasticsearch.test.ListMatcher.matchesList;
import static org.elasticsearch.test.MapMatcher.assertMap;
Expand Down Expand Up @@ -287,14 +289,33 @@ private void setTimeout(Request request) {
request.setOptions(request.getOptions().toBuilder().setRequestConfig(config.build()));
}

/**
* Asserts that within a minute the _search has left the _tasks api.
* <p>
* It'd sure be more convenient if the _tasks API never listed the _search
* once the _search has returned to us. But sometimes it still does. So long
* as the task stops <strong>eventually</strong>, that's still indicative of
* the interrupt code working.
* </p>
*/
private void assertNoSearchesRunning() throws Exception {
Request tasks = new Request("GET", "/_tasks");
tasks.addParameter("actions", "*search");
tasks.addParameter("detailed", "");
assertBusy(() -> {
Map<?, ?> response = responseAsMap(client().performRequest(tasks));
// If there are running searches the map in `nodes` is non-empty.
assertMap(response, matchesMap().entry("nodes", matchesMap()));
});
Request tasks = new Request("GET", "/_tasks");
tasks.addParameter("actions", "*search");
tasks.addParameter("detailed", "");
assertBusy(() -> {
Map<?, ?> response = responseAsMap(client().performRequest(tasks));
// If there are running searches the map in `nodes` is non-empty.
if (response.isEmpty() == false) {
logger.warn("search still running, hot threads:\n{}", hotThreads());
}
assertMap(response, matchesMap().entry("nodes", matchesMap()));
});
}, 1, TimeUnit.MINUTES);
}

/**
 * Performs {@code GET /_nodes/hot_threads} and returns the response body as a string.
 */
private String hotThreads() throws IOException {
    final Request hotThreadsRequest = new Request("GET", "/_nodes/hot_threads");
    final String body = EntityUtils.toString(client().performRequest(hotThreadsRequest).getEntity());
    return body;
}
}

0 comments on commit 80c6503

Please sign in to comment.