
Cancel query if given detector already has one #54

Merged
@@ -221,7 +221,7 @@ public Collection<Object> createComponents(
Settings settings = environment.settings();
Member:

Is there a unit/integration test for the cancel mechanism? If not, I strongly suggest we add one.

You can add an ESIntegTestCase where we

  1. define a SearchOperationListener such that index operations are delayed to simulate long running queries;
  2. create a fake plugin to use the listener defined in 1);
  3. add the AD plugin and the fake plugin together;
  4. ... automate what you did in manual testing ...
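
A rough sketch of what that test scaffolding could look like (illustrative only — DelayedSearchPlugin and the sleep duration are hypothetical names, not code from this PR):

```java
import org.elasticsearch.index.IndexModule;
import org.elasticsearch.index.shard.SearchOperationListener;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.search.internal.SearchContext;

// Test-only plugin that delays the query phase so a search stays running
// long enough for the cancel mechanism to kick in.
public class DelayedSearchPlugin extends Plugin {
    @Override
    public void onIndexModule(IndexModule indexModule) {
        indexModule.addSearchOperationListener(new SearchOperationListener() {
            @Override
            public void onPreQueryPhase(SearchContext searchContext) {
                try {
                    Thread.sleep(15_000); // simulate a long running query
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        });
    }
}
```

The ESIntegTestCase would then register this plugin together with the AD plugin via nodePlugins() and assert that a second detector run cancels the stalled search.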

Contributor Author:

This is also one of my concerns. I noticed we don't have any unit tests for clientUtil and tried to add one, but it's too complicated. For manual testing, I used a similar listener that delays the search to simulate a long-running query. Not sure if that can be done in an integration test; I will sync up with you offline.

Clock clock = Clock.systemUTC();
Throttler throttler = new Throttler(clock);
-ClientUtil clientUtil = new ClientUtil(settings, client, throttler);
+ClientUtil clientUtil = new ClientUtil(settings, client, throttler, threadPool);
IndexUtils indexUtils = new IndexUtils(client, clientUtil, clusterService);
anomalyDetectionIndices = new AnomalyDetectionIndices(client, clusterService, threadPool, settings, clientUtil);
this.clusterService = clusterService;
@@ -87,7 +87,6 @@ public void run() {
}
)
);

deleteUtil.deleteDetectorResult(client);
}

@@ -26,4 +26,9 @@ public class CommonName {
// Format name
// ======================================
public static final String EPOCH_MILLIS_FORMAT = "epoch_millis";

// ======================================
// Anomaly Detector name for X-Opaque-Id header
// ======================================
public static final String ANOMALY_DETECTOR = "[Anomaly Detector]";
}
@@ -249,11 +249,6 @@ protected void doExecute(Task task, ActionRequest actionRequest, ActionListener<
return;
}
AnomalyDetector anomalyDetector = detector.get();
-if (stateManager.hasRunningQuery(anomalyDetector)) {
-LOG.error("There is one query running for detectorId: {}", anomalyDetector.getDetectorId());
-listener.onFailure(new EndRunException(adID, "There is one query running on AnomalyDetector", true));
-return;
-}

String thresholdModelID = modelManager.getThresholdModelId(adID);
Optional<DiscoveryNode> thresholdNode = hashRing.getOwningNode(thresholdModelID);
@@ -17,41 +17,58 @@

import static com.amazon.opendistroforelasticsearch.ad.settings.AnomalyDetectorSettings.REQUEST_TIMEOUT;

import java.util.List;
import java.util.Optional;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.BiConsumer;
import java.util.function.Function;

import com.amazon.opendistroforelasticsearch.ad.common.exception.EndRunException;
import com.amazon.opendistroforelasticsearch.ad.common.exception.InternalFailure;
import com.amazon.opendistroforelasticsearch.ad.constant.CommonName;
import com.amazon.opendistroforelasticsearch.ad.model.AnomalyDetector;

import org.apache.logging.log4j.Logger;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchTimeoutException;
import org.elasticsearch.action.ActionType;
import org.elasticsearch.action.ActionFuture;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionRequest;
import org.elasticsearch.action.ActionResponse;
import org.elasticsearch.action.LatchedActionListener;
import org.elasticsearch.action.TaskOperationFailure;
import org.elasticsearch.action.admin.cluster.node.tasks.cancel.CancelTasksAction;
import org.elasticsearch.action.admin.cluster.node.tasks.cancel.CancelTasksRequest;
import org.elasticsearch.action.admin.cluster.node.tasks.cancel.CancelTasksResponse;
import org.elasticsearch.action.admin.cluster.node.tasks.list.ListTasksAction;
import org.elasticsearch.action.admin.cluster.node.tasks.list.ListTasksRequest;
import org.elasticsearch.action.admin.cluster.node.tasks.list.ListTasksResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;

import com.amazon.opendistroforelasticsearch.ad.constant.CommonErrorMessages;
import org.elasticsearch.common.util.concurrent.ThreadContext;
import org.elasticsearch.tasks.Task;
import org.elasticsearch.tasks.TaskId;
import org.elasticsearch.tasks.TaskInfo;
import org.elasticsearch.threadpool.ThreadPool;

public class ClientUtil {
private volatile TimeValue requestTimeout;
private Client client;
private final Throttler throttler;
private ThreadPool threadPool;

@Inject
-public ClientUtil(Settings setting, Client client, Throttler throttler) {
+public ClientUtil(Settings setting, Client client, Throttler throttler, ThreadPool threadPool) {
this.requestTimeout = REQUEST_TIMEOUT.get(setting);
this.client = client;
this.throttler = throttler;
this.threadPool = threadPool;
}

/**
@@ -159,17 +176,19 @@ public <Request extends ActionRequest, Response extends ActionResponse> Response
}

/**
-* Send a nonblocking request with a timeout and return response. The request will first be put into
-* the negative cache. Once the request complete, it will be removed from the negative cache.
-*
+* Send a nonblocking request with a timeout and return response.
+* If there is already a query running on the given detector, it will try to
+* cancel that query. Otherwise it will add this query to the negative cache
+* and then attach the AnomalyDetection-specific header to the request.
+* Once the request completes, it will be removed from the negative cache.
+* @param <Request> ActionRequest
+* @param <Response> ActionResponse
* @param request request like index/search/get
* @param LOG log
* @param consumer functional interface to operate as a client request like client::get
-* @param <Request> ActionRequest
-* @param <Response> ActionResponse
* @param detector Anomaly Detector
* @return the response
-* @throws EndRunException when there is already a query running
+* @throws InternalFailure when there is already a query running
* @throws ElasticsearchTimeoutException when we cannot get response within time.
* @throws IllegalStateException when the waiting thread is interrupted
*/
@@ -179,28 +198,32 @@ public <Request extends ActionRequest, Response extends ActionResponse> Optional
BiConsumer<Request, ActionListener<Response>> consumer,
AnomalyDetector detector
) {

try {
-// if key already exist, reject the request and throws exception
-if (!throttler.insertFilteredQuery(detector.getDetectorId(), request)) {
-LOG.error("There is one query running for detectorId: {}", detector.getDetectorId());
-throw new EndRunException(detector.getDetectorId(), "There is one query running on AnomalyDetector", true);
+String detectorId = detector.getDetectorId();
+if (!throttler.insertFilteredQuery(detectorId, request)) {
+LOG.info("There is one query running for detectorId: {}. Trying to cancel the long running query", detectorId);
+cancelRunningQuery(client, detectorId, LOG);
Member:

Return after cancelling, since we don't know when the cancel would actually happen? We might keep piling up new queries while the previous ones are not yet cancelled.

Also, we need to send InternalFailure, not EndRunException. EndRunException is used for scenarios where we might need to terminate the AD job soon.

Contributor Author:

Agreed. Returning after cancelling will be safer. Will update.

+throw new InternalFailure(detector.getDetectorId(), "There is already a query running on AnomalyDetector");
}
AtomicReference<Response> respReference = new AtomicReference<>();
final CountDownLatch latch = new CountDownLatch(1);

-try {
+try (ThreadContext.StoredContext context = threadPool.getThreadContext().stashContext()) {
+assert context != null;
+threadPool.getThreadContext().putHeader(Task.X_OPAQUE_ID, CommonName.ANOMALY_DETECTOR + ":" + detectorId);
Contributor:

Just asking, will the X_OPAQUE_ID header be passed to child tasks?

Contributor Author:

Yes, it will. I can see both parent and child tasks have the same header in the manual test.
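
For reference, the manual check can be reproduced with the same task APIs this PR already imports — a minimal sketch (not part of the PR) that lists search tasks and logs each task's X-Opaque-Id, which shows up identically on the parent search task and its [phase/query] children:

```java
ListTasksRequest listRequest = new ListTasksRequest().setDetailed(true).setActions("*search*");
client.execute(ListTasksAction.INSTANCE, listRequest, ActionListener.wrap(response -> {
    for (TaskInfo task : response.getTasks()) {
        String opaqueId = task.getHeaders().get(Task.X_OPAQUE_ID);
        if (opaqueId != null) {
            // parent and child search tasks report the same X-Opaque-Id
            LOG.info("task: {}, parent: {}, X-Opaque-Id: {}", task.getTaskId(), task.getParentTaskId(), opaqueId);
        }
    }
}, exception -> LOG.error("Failed to list tasks", exception)));
```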

consumer.accept(request, new LatchedActionListener<Response>(ActionListener.wrap(response -> {
Contributor:

It's possible that cancelRunningQuery is still in progress, or has failed, when we start a new request. If cancelRunningQuery is not time-consuming, it's better to start a new request once we get the response of cancelRunningQuery. If it's a heavy action, we may need to monitor the cancellation status and retry if it failed, so we can terminate unnecessary AD queries to protect cluster performance. It's OK to add some TODOs/comments here and refactor later if you think the change would be big.

Contributor Author:

I think this is similar to Kaituo's comment. As a safety measure, I will not start a new request if we need to cancel the running one, just in case the cancel somehow fails and we keep adding new requests. We can revisit it later.

// clear negative cache
-throttler.clearFilteredQuery(detector.getDetectorId());
+throttler.clearFilteredQuery(detectorId);
respReference.set(response);
}, exception -> {
// clear negative cache
-throttler.clearFilteredQuery(detector.getDetectorId());
+throttler.clearFilteredQuery(detectorId);
LOG.error("Cannot get response for request {}, error: {}", request, exception);
}), latch));
} catch (Exception e) {
LOG.error("Failed to process the request for detectorId: {}.", detector.getDetectorId());
throttler.clearFilteredQuery(detector.getDetectorId());
LOG.error("Failed to process the request for detectorId: {}.", detectorId);
throttler.clearFilteredQuery(detectorId);
throw e;
}

@@ -222,4 +245,94 @@ public <Request extends ActionRequest, Response extends ActionResponse> Optional
public boolean hasRunningQuery(AnomalyDetector detector) {
return throttler.getFilteredQuery(detector.getDetectorId()).isPresent();
}

/**
* Cancel long running query for given detectorId
* @param client Elasticsearch client
* @param detectorId Anomaly Detector Id
* @param LOG Logger
*/
private void cancelRunningQuery(Client client, String detectorId, Logger LOG) {
ListTasksRequest listTasksRequest = new ListTasksRequest();
Member:

You can add some parameters to speed up task search:

  1. group_by=parents: so for each group you only need to check the header once
  2. actions=*search: since our queries are searches. We don't care about write or update.

Contributor Author:

Thanks for the advice. group_by=parents is a little weird: the REST API supports it (https://www.elastic.co/guide/en/elasticsearch/reference/current/tasks.html#_task_grouping), but as far as I can see it's not supported in the Java API.

For actions=*search, I added it as "*search" since I can see some child queries have actions like "indices:data/read/search[phase/query]"

Member (@kaituo, Mar 13, 2020):

You meant we need to use "*search*", right? Yes, please do that. Your current code uses "*search".

listTasksRequest.setActions("*search*");
client
.execute(
ListTasksAction.INSTANCE,
listTasksRequest,
ActionListener.wrap(response -> { onListTaskResponse(response, detectorId, LOG); }, exception -> {
LOG.error("List Tasks failed.", exception);
throw new InternalFailure(detectorId, "Failed to list current tasks", exception);
})
);
}

/**
* Helper function to handle ListTasksResponse
* @param listTasksResponse ListTasksResponse
* @param detectorId Anomaly Detector Id
* @param LOG Logger
*/
private void onListTaskResponse(ListTasksResponse listTasksResponse, String detectorId, Logger LOG) {
List<TaskInfo> tasks = listTasksResponse.getTasks();
TaskId matchedParentTaskId = null;
TaskId matchedSingleTaskId = null;
for (TaskInfo task : tasks) {
if (!task.getHeaders().isEmpty()
&& task.getHeaders().get(Task.X_OPAQUE_ID).equals(CommonName.ANOMALY_DETECTOR + ":" + detectorId)) {
if (!task.getParentTaskId().equals(TaskId.EMPTY_TASK_ID)) {
// we found the parent task, don't need to check more
matchedParentTaskId = task.getParentTaskId();
break;
} else {
// we found one task, keep checking other tasks
matchedSingleTaskId = task.getTaskId();
}
}
}
// case 1: given detectorId is not in current task list
if (matchedParentTaskId == null && matchedSingleTaskId == null) {
// log and then clear negative cache
LOG.info("Couldn't find task for detectorId: {}. Clean this entry from Throttler", detectorId);
throttler.clearFilteredQuery(detectorId);
return;
}
// case 2: we can find the task for given detectorId
CancelTasksRequest cancelTaskRequest = new CancelTasksRequest();
if (matchedParentTaskId != null) {
cancelTaskRequest.setParentTaskId(matchedParentTaskId);
LOG.info("Start to cancel task for parentTaskId: {}", matchedParentTaskId.toString());
} else {
cancelTaskRequest.setTaskId(matchedSingleTaskId);
LOG.info("Start to cancel task for taskId: {}", matchedSingleTaskId.toString());
}

client
.execute(
CancelTasksAction.INSTANCE,
cancelTaskRequest,
ActionListener.wrap(response -> { onCancelTaskResponse(response, detectorId, LOG); }, exception -> {
LOG.error("Failed to cancel task for detectorId: " + detectorId, exception);
throw new InternalFailure(detectorId, "Failed to cancel current tasks", exception);
})
);
}

/**
* Helper function to handle CancelTasksResponse
* @param cancelTasksResponse CancelTasksResponse
* @param detectorId Anomaly Detector Id
* @param LOG Logger
*/
private void onCancelTaskResponse(CancelTasksResponse cancelTasksResponse, String detectorId, Logger LOG) {
// todo: adding retry mechanism
List<ElasticsearchException> nodeFailures = cancelTasksResponse.getNodeFailures();
List<TaskOperationFailure> taskFailures = cancelTasksResponse.getTaskFailures();
if (nodeFailures.isEmpty() && taskFailures.isEmpty()) {
LOG.info("Cancelling query for detectorId: {} succeeds. Clear entry from Throttler", detectorId);
Contributor (@ylwu-amzn, Mar 10, 2020):

Better to add some retry for these failed tasks. Otherwise, we will wait for the next detector run to cancel again.

Contributor Author:

Will add a todo comment for now.

throttler.clearFilteredQuery(detectorId);
return;
}
LOG.error("Failed to cancel task for detectorId: " + detectorId);
throw new InternalFailure(detectorId, "Failed to cancel current tasks due to node or task failures");
Contributor:

log failures?

Contributor Author:

added

}
}
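
One possible shape for the retry TODO above — an illustrative sketch only (cancelWithRetry and the attempt counter are hypothetical, not part of this PR), reusing the same client call and response checks the merged code already makes:

```java
// Hypothetical retry helper: re-issue the cancel a bounded number of times
// before giving up, instead of waiting for the next detector run.
private void cancelWithRetry(CancelTasksRequest request, String detectorId, Logger LOG, int attemptsLeft) {
    client.execute(CancelTasksAction.INSTANCE, request, ActionListener.wrap(response -> {
        if (response.getNodeFailures().isEmpty() && response.getTaskFailures().isEmpty()) {
            LOG.info("Cancelling query for detectorId: {} succeeded. Clear entry from Throttler", detectorId);
            throttler.clearFilteredQuery(detectorId);
        } else if (attemptsLeft > 1) {
            LOG.warn("Cancel had failures for detectorId: {}; retrying, {} attempts left", detectorId, attemptsLeft - 1);
            cancelWithRetry(request, detectorId, LOG, attemptsLeft - 1);
        } else {
            LOG.error("Giving up cancelling tasks for detectorId: {}", detectorId);
        }
    }, exception -> LOG.error("Cancel request failed for detectorId: " + detectorId, exception)));
}
```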
@@ -15,10 +15,10 @@

package com.amazon.opendistroforelasticsearch.ad;

+import com.amazon.opendistroforelasticsearch.ad.model.AnomalyDetector;
import com.amazon.opendistroforelasticsearch.ad.model.AnomalyDetectorExecutionInput;
import com.amazon.opendistroforelasticsearch.ad.model.AnomalyDetectorJob;
import com.amazon.opendistroforelasticsearch.ad.model.AnomalyResult;
-import com.amazon.opendistroforelasticsearch.ad.model.AnomalyDetector;
import com.amazon.opendistroforelasticsearch.ad.model.Feature;
import com.amazon.opendistroforelasticsearch.ad.model.FeatureData;
import com.amazon.opendistroforelasticsearch.ad.model.IntervalTimeConfiguration;
@@ -48,6 +48,7 @@
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.concurrent.ThreadContext;
import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentBuilder;
@@ -78,6 +79,8 @@
import static org.elasticsearch.test.ESTestCase.randomDouble;
import static org.elasticsearch.test.ESTestCase.randomInt;
import static org.elasticsearch.test.ESTestCase.randomLong;
import static org.powermock.api.mockito.PowerMockito.mock;
import static org.powermock.api.mockito.PowerMockito.when;

public class TestHelpers {

@@ -290,4 +293,18 @@ public static ClusterService createClusterService(ThreadPool threadPool, Cluster
);
return ClusterServiceUtils.createClusterService(threadPool, discoveryNode, clusterSettings);
}

public static ThreadContext createThreadContext() {
Settings build = Settings.builder().put("request.headers.default", "1").build();
ThreadContext context = new ThreadContext(build);
context.putHeader("foo", "bar");
context.putTransient("x", 1);
return context;
}

public static ThreadPool createThreadPool() {
ThreadPool pool = mock(ThreadPool.class);
when(pool.getThreadContext()).thenReturn(createThreadContext());
return pool;
}
}
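
A quick illustration (plain assertions, not from the PR) of what these helpers give a unit test: the mocked pool hands ClientUtil a real ThreadContext, so the stashContext()/putHeader() sequence in asyncRequest has something to write to and restore:

```java
ThreadPool pool = TestHelpers.createThreadPool();
ThreadContext threadContext = pool.getThreadContext();
try (ThreadContext.StoredContext stored = threadContext.stashContext()) {
    threadContext.putHeader(Task.X_OPAQUE_ID, "[Anomaly Detector]:detector-1");
    // header is visible while the stashed scope is open
    assert "[Anomaly Detector]:detector-1".equals(threadContext.getHeader(Task.X_OPAQUE_ID));
} // closing the StoredContext restores the previous thread context
assert threadContext.getHeader(Task.X_OPAQUE_ID) == null;
```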
@@ -83,6 +83,7 @@ private void templateDailyCron(DailyCronTestExecutionMode mode) {
return null;
}).when(clientUtil).execute(eq(DeleteByQueryAction.INSTANCE), any(), any());

// those tests are covered by each util class
doNothing().when(deleteUtil).deleteDetectorResult(eq(client));

cron.run();
@@ -37,6 +37,7 @@
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.threadpool.ThreadPool;
import org.junit.Before;

import java.io.IOException;
@@ -52,6 +53,7 @@ public class AnomalyDetectionIndicesTests extends ESIntegTestCase {
private Settings settings;
private ClusterService clusterService;
private Client client;
private ThreadPool context;

@Before
public void setup() {
@@ -71,10 +73,11 @@ public void setup() {
clusterSettings.add(AnomalyDetectorSettings.REQUEST_TIMEOUT);
clusterSetting = new ClusterSettings(settings, clusterSettings);
clusterService = TestHelpers.createClusterService(client().threadPool(), clusterSetting);
context = TestHelpers.createThreadPool();
client = mock(Client.class);
Clock clock = Clock.systemUTC();
Throttler throttler = new Throttler(clock);
-requestUtil = new ClientUtil(settings, client, throttler);
+requestUtil = new ClientUtil(settings, client, throttler, context);
indices = new AnomalyDetectionIndices(client(), clusterService, client().threadPool(), settings, requestUtil);
}

@@ -56,6 +56,7 @@
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.threadpool.ThreadPool;
import org.junit.After;
import org.junit.Before;

@@ -68,6 +69,7 @@ public class ADStateManagerTests extends ESTestCase {
private Clock clock;
private Duration duration;
private Throttler throttler;
private ThreadPool context;

@Override
protected NamedXContentRegistry xContentRegistry() {
@@ -91,12 +93,13 @@ public void setUp() throws Exception {
clock = mock(Clock.class);
duration = Duration.ofHours(1);
throttler = new Throttler(clock);

stateManager = new ADStateManager(
client,
xContentRegistry(),
modelManager,
settings,
-new ClientUtil(settings, client, throttler),
+new ClientUtil(settings, client, throttler, context),
clock,
duration
);
@@ -29,6 +29,7 @@
import org.elasticsearch.client.Client;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.TransportService;
import org.junit.Before;
import org.junit.Test;
@@ -59,7 +60,8 @@ public void setUp() throws Exception {
Client client = client();
Clock clock = mock(Clock.class);
Throttler throttler = new Throttler(clock);
-IndexUtils indexUtils = new IndexUtils(client, new ClientUtil(Settings.EMPTY, client, throttler), clusterService());
+ThreadPool threadPool = mock(ThreadPool.class);
+IndexUtils indexUtils = new IndexUtils(client, new ClientUtil(Settings.EMPTY, client, throttler, threadPool), clusterService());
ModelManager modelManager = mock(ModelManager.class);

clusterStatName1 = "clusterStat1";