-
Notifications
You must be signed in to change notification settings - Fork 36
Cancel query if given detector already have one #54
Changes from all commits
3375396
87badb1
4c18971
08dfb54
8c64199
99e9487
df38d00
d797543
8b9d699
fd1883b
0673741
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -87,7 +87,6 @@ public void run() { | |
} | ||
) | ||
); | ||
|
||
deleteUtil.deleteDetectorResult(client); | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,41 +17,58 @@ | |
|
||
import static com.amazon.opendistroforelasticsearch.ad.settings.AnomalyDetectorSettings.REQUEST_TIMEOUT; | ||
|
||
import java.util.List; | ||
import java.util.Optional; | ||
import java.util.concurrent.CountDownLatch; | ||
import java.util.concurrent.TimeUnit; | ||
import java.util.concurrent.atomic.AtomicReference; | ||
import java.util.function.BiConsumer; | ||
import java.util.function.Function; | ||
|
||
import com.amazon.opendistroforelasticsearch.ad.common.exception.EndRunException; | ||
import com.amazon.opendistroforelasticsearch.ad.common.exception.InternalFailure; | ||
import com.amazon.opendistroforelasticsearch.ad.constant.CommonName; | ||
import com.amazon.opendistroforelasticsearch.ad.model.AnomalyDetector; | ||
|
||
import org.apache.logging.log4j.Logger; | ||
import org.elasticsearch.ElasticsearchException; | ||
import org.elasticsearch.ElasticsearchTimeoutException; | ||
import org.elasticsearch.action.ActionType; | ||
import org.elasticsearch.action.ActionFuture; | ||
import org.elasticsearch.action.ActionListener; | ||
import org.elasticsearch.action.ActionRequest; | ||
import org.elasticsearch.action.ActionResponse; | ||
import org.elasticsearch.action.LatchedActionListener; | ||
import org.elasticsearch.action.TaskOperationFailure; | ||
import org.elasticsearch.action.admin.cluster.node.tasks.cancel.CancelTasksAction; | ||
import org.elasticsearch.action.admin.cluster.node.tasks.cancel.CancelTasksRequest; | ||
import org.elasticsearch.action.admin.cluster.node.tasks.cancel.CancelTasksResponse; | ||
import org.elasticsearch.action.admin.cluster.node.tasks.list.ListTasksAction; | ||
import org.elasticsearch.action.admin.cluster.node.tasks.list.ListTasksRequest; | ||
import org.elasticsearch.action.admin.cluster.node.tasks.list.ListTasksResponse; | ||
import org.elasticsearch.client.Client; | ||
import org.elasticsearch.common.inject.Inject; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.common.unit.TimeValue; | ||
|
||
import com.amazon.opendistroforelasticsearch.ad.constant.CommonErrorMessages; | ||
import org.elasticsearch.common.util.concurrent.ThreadContext; | ||
import org.elasticsearch.tasks.Task; | ||
import org.elasticsearch.tasks.TaskId; | ||
import org.elasticsearch.tasks.TaskInfo; | ||
import org.elasticsearch.threadpool.ThreadPool; | ||
|
||
public class ClientUtil { | ||
private volatile TimeValue requestTimeout; | ||
private Client client; | ||
private final Throttler throttler; | ||
private ThreadPool threadPool; | ||
|
||
@Inject | ||
public ClientUtil(Settings setting, Client client, Throttler throttler) { | ||
public ClientUtil(Settings setting, Client client, Throttler throttler, ThreadPool threadPool) { | ||
this.requestTimeout = REQUEST_TIMEOUT.get(setting); | ||
this.client = client; | ||
this.throttler = throttler; | ||
this.threadPool = threadPool; | ||
} | ||
|
||
/** | ||
|
@@ -159,17 +176,19 @@ public <Request extends ActionRequest, Response extends ActionResponse> Response | |
} | ||
|
||
/** | ||
* Send a nonblocking request with a timeout and return response. The request will first be put into | ||
* the negative cache. Once the request complete, it will be removed from the negative cache. | ||
* | ||
* Send a nonblocking request with a timeout and return response. | ||
* If there is already a query running on given detector, it will try to | ||
* cancel the query. Otherwise it will add this query to the negative cache | ||
* and then attach the AnomalyDetection specific header to the request. | ||
* Once the request complete, it will be removed from the negative cache. | ||
* @param <Request> ActionRequest | ||
* @param <Response> ActionResponse | ||
* @param request request like index/search/get | ||
* @param LOG log | ||
* @param consumer functional interface to operate as a client request like client::get | ||
* @param <Request> ActionRequest | ||
* @param <Response> ActionResponse | ||
* @param detector Anomaly Detector | ||
* @return the response | ||
* @throws EndRunException when there is already a query running | ||
* @throws InternalFailure when there is already a query running | ||
* @throws ElasticsearchTimeoutException when we cannot get response within time. | ||
* @throws IllegalStateException when the waiting thread is interrupted | ||
*/ | ||
|
@@ -179,28 +198,32 @@ public <Request extends ActionRequest, Response extends ActionResponse> Optional | |
BiConsumer<Request, ActionListener<Response>> consumer, | ||
AnomalyDetector detector | ||
) { | ||
|
||
try { | ||
// if key already exist, reject the request and throws exception | ||
if (!throttler.insertFilteredQuery(detector.getDetectorId(), request)) { | ||
LOG.error("There is one query running for detectorId: {}", detector.getDetectorId()); | ||
throw new EndRunException(detector.getDetectorId(), "There is one query running on AnomalyDetector", true); | ||
String detectorId = detector.getDetectorId(); | ||
if (!throttler.insertFilteredQuery(detectorId, request)) { | ||
LOG.info("There is one query running for detectorId: {}. Trying to cancel the long running query", detectorId); | ||
cancelRunningQuery(client, detectorId, LOG); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Return after cancelling since we don't know when the cancel would actually happen? We might keep piling up new queries when the previous old queries are not cancelled. Also, we need to send InternalFailure not EndRunException. EndRunException is used for scenarios when we might need to terminate AD job running soon. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree. Return after cancelling will be more safer. Will update. |
||
throw new InternalFailure(detector.getDetectorId(), "There is already a query running on AnomalyDetector"); | ||
} | ||
AtomicReference<Response> respReference = new AtomicReference<>(); | ||
final CountDownLatch latch = new CountDownLatch(1); | ||
|
||
try { | ||
try (ThreadContext.StoredContext context = threadPool.getThreadContext().stashContext()) { | ||
assert context != null; | ||
threadPool.getThreadContext().putHeader(Task.X_OPAQUE_ID, CommonName.ANOMALY_DETECTOR + ":" + detectorId); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just asking, will the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it will. I can see both parent and children tasks have the same header from the manual test. |
||
consumer.accept(request, new LatchedActionListener<Response>(ActionListener.wrap(response -> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's possible the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is similar with Kaituo's comments. For safety concern, I will not start new request if we need to cancel the running one, just in case the cancel failed somehow and we keep adding new requests. We can revisit it later. |
||
// clear negative cache | ||
throttler.clearFilteredQuery(detector.getDetectorId()); | ||
throttler.clearFilteredQuery(detectorId); | ||
respReference.set(response); | ||
}, exception -> { | ||
// clear negative cache | ||
throttler.clearFilteredQuery(detector.getDetectorId()); | ||
throttler.clearFilteredQuery(detectorId); | ||
LOG.error("Cannot get response for request {}, error: {}", request, exception); | ||
}), latch)); | ||
} catch (Exception e) { | ||
LOG.error("Failed to process the request for detectorId: {}.", detector.getDetectorId()); | ||
throttler.clearFilteredQuery(detector.getDetectorId()); | ||
LOG.error("Failed to process the request for detectorId: {}.", detectorId); | ||
throttler.clearFilteredQuery(detectorId); | ||
throw e; | ||
} | ||
|
||
|
@@ -222,4 +245,94 @@ public <Request extends ActionRequest, Response extends ActionResponse> Optional | |
public boolean hasRunningQuery(AnomalyDetector detector) { | ||
return throttler.getFilteredQuery(detector.getDetectorId()).isPresent(); | ||
} | ||
|
||
/** | ||
* Cancel long running query for given detectorId | ||
* @param client Elasticsearch client | ||
* @param detectorId Anomaly Detector Id | ||
* @param LOG Logger | ||
*/ | ||
private void cancelRunningQuery(Client client, String detectorId, Logger LOG) { | ||
ListTasksRequest listTasksRequest = new ListTasksRequest(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can add some parameters to speed up task search:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the advice. For the group_by=parents, it's a little weird. For the api it supports this feature(https://www.elastic.co/guide/en/elasticsearch/reference/current/tasks.html#_task_grouping). However when it comes to java api, it's not supported as far as I see. For actions=*search, I will added it as "actions=search" since I can see some child query has something like "indices:data/read/search[phase/query]" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you meant we need to use "actions=*search*", right? Yes, please do that. Your current code uses "*search". |
||
listTasksRequest.setActions("*search*"); | ||
client | ||
.execute( | ||
ListTasksAction.INSTANCE, | ||
listTasksRequest, | ||
ActionListener.wrap(response -> { onListTaskResponse(response, detectorId, LOG); }, exception -> { | ||
LOG.error("List Tasks failed.", exception); | ||
throw new InternalFailure(detectorId, "Failed to list current tasks", exception); | ||
}) | ||
); | ||
} | ||
|
||
/** | ||
* Helper function to handle ListTasksResponse | ||
* @param listTasksResponse ListTasksResponse | ||
* @param detectorId Anomaly Detector Id | ||
* @param LOG Logger | ||
*/ | ||
private void onListTaskResponse(ListTasksResponse listTasksResponse, String detectorId, Logger LOG) { | ||
List<TaskInfo> tasks = listTasksResponse.getTasks(); | ||
TaskId matchedParentTaskId = null; | ||
TaskId matchedSingleTaskId = null; | ||
for (TaskInfo task : tasks) { | ||
if (!task.getHeaders().isEmpty() | ||
&& task.getHeaders().get(Task.X_OPAQUE_ID).equals(CommonName.ANOMALY_DETECTOR + ":" + detectorId)) { | ||
if (!task.getParentTaskId().equals(TaskId.EMPTY_TASK_ID)) { | ||
// we found the parent task, don't need to check more | ||
matchedParentTaskId = task.getParentTaskId(); | ||
break; | ||
} else { | ||
// we found one task, keep checking other tasks | ||
matchedSingleTaskId = task.getTaskId(); | ||
} | ||
} | ||
} | ||
// case 1: given detectorId is not in current task list | ||
if (matchedParentTaskId == null && matchedSingleTaskId == null) { | ||
// log and then clear negative cache | ||
LOG.info("Couldn't find task for detectorId: {}. Clean this entry from Throttler", detectorId); | ||
throttler.clearFilteredQuery(detectorId); | ||
return; | ||
} | ||
// case 2: we can find the task for given detectorId | ||
CancelTasksRequest cancelTaskRequest = new CancelTasksRequest(); | ||
if (matchedParentTaskId != null) { | ||
cancelTaskRequest.setParentTaskId(matchedParentTaskId); | ||
LOG.info("Start to cancel task for parentTaskId: {}", matchedParentTaskId.toString()); | ||
} else { | ||
cancelTaskRequest.setTaskId(matchedSingleTaskId); | ||
LOG.info("Start to cancel task for taskId: {}", matchedSingleTaskId.toString()); | ||
} | ||
|
||
client | ||
.execute( | ||
CancelTasksAction.INSTANCE, | ||
cancelTaskRequest, | ||
ActionListener.wrap(response -> { onCancelTaskResponse(response, detectorId, LOG); }, exception -> { | ||
LOG.error("Failed to cancel task for detectorId: " + detectorId, exception); | ||
throw new InternalFailure(detectorId, "Failed to cancel current tasks", exception); | ||
}) | ||
); | ||
} | ||
|
||
/** | ||
* Helper function to handle CancelTasksResponse | ||
* @param cancelTasksResponse CancelTasksResponse | ||
* @param detectorId Anomaly Detector Id | ||
* @param LOG Logger | ||
*/ | ||
private void onCancelTaskResponse(CancelTasksResponse cancelTasksResponse, String detectorId, Logger LOG) { | ||
// todo: adding retry mechanism | ||
List<ElasticsearchException> nodeFailures = cancelTasksResponse.getNodeFailures(); | ||
List<TaskOperationFailure> taskFailures = cancelTasksResponse.getTaskFailures(); | ||
if (nodeFailures.isEmpty() && taskFailures.isEmpty()) { | ||
LOG.info("Cancelling query for detectorId: {} succeeds. Clear entry from Throttler", detectorId); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better to add some retry for these failed tasks. Otherwise, will wait for next detector run to cancel again. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will add a todo comment for now. |
||
throttler.clearFilteredQuery(detectorId); | ||
return; | ||
} | ||
LOG.error("Failed to cancel task for detectorId: " + detectorId); | ||
throw new InternalFailure(detectorId, "Failed to cancel current tasks due to node or task failures"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. log failures? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a unit/integration test for the cancel mechanism? If not, I strongly suggest we add one.
You can add an ESIntegTestCase where we
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is also one of my concern. I noticed we don't have any unit test for clientUtil and tried to add one but it's too complicated. When manual testing, I use similar listener which will delay the search to make up the long running query. Not sure if that can be done in integration test, I will sync up with you offline.