apache
diff --git a/‎be/src/common/config.cpp‎
Lines changed: 1 addition & 0 deletions b/‎be/src/common/config.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎be/src/common/config.h‎
Lines changed: 6 additions & 0 deletions b/‎be/src/common/config.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎be/src/http/action/health_action.cpp‎
Lines changed: 24 additions & 3 deletions b/‎be/src/http/action/health_action.cpp‎
Lines changed: 24 additions & 3 deletions
diff --git a/‎be/src/runtime/exec_env.cpp‎
Lines changed: 7 additions & 0 deletions b/‎be/src/runtime/exec_env.cpp‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎be/src/runtime/exec_env.h‎
Lines changed: 3 additions & 0 deletions b/‎be/src/runtime/exec_env.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎be/src/service/doris_main.cpp‎
Lines changed: 3 additions & 0 deletions b/‎be/src/service/doris_main.cpp‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎be/test/http/http_client_test.cpp‎
Lines changed: 2 additions & 2 deletions b/‎be/test/http/http_client_test.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎fe/fe-core/src/main/java/org/apache/doris/DorisFE.java‎
Lines changed: 50 additions & 3 deletions b/‎fe/fe-core/src/main/java/org/apache/doris/DorisFE.java‎
Lines changed: 50 additions & 3 deletions
diff --git a/‎fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudReplica.java‎
Lines changed: 2 additions & 2 deletions b/‎fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudReplica.java‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎fe/fe-core/src/main/java/org/apache/doris/common/FeConstants.java‎
Lines changed: 0 additions & 26 deletions b/‎fe/fe-core/src/main/java/org/apache/doris/common/FeConstants.java‎
Lines changed: 0 additions & 26 deletions
@@ -1295,6 +1295,7 @@ DEFINE_String(user_files_secure_path, "${DORIS_HOME}");
 DEFINE_Int32(fe_expire_duration_seconds, "60");
 
 DEFINE_Int32(grace_shutdown_wait_seconds, "120");
+DEFINE_Int32(grace_shutdown_post_delay_seconds, "30");
 
 DEFINE_Int16(bitmap_serialize_version, "1");
 
 
@@ -1356,6 +1356,12 @@ DECLARE_Int32(fe_expire_duration_seconds);
 // , but if the waiting time exceed the limit, then be will exit directly.
 // During this period, FE will not send any queries to BE and waiting for all running queries to stop.
 DECLARE_Int32(grace_shutdown_wait_seconds);
+// When using the graceful stop feature, after the main process waits for
+// all currently running tasks to finish, it will continue to wait for
+// an additional period to ensure that queries still running on other nodes have also completed.
+// Since a BE node cannot detect the task execution status on other BE nodes,
+// you may need to increase this threshold to allow for a longer waiting time.
+DECLARE_Int32(grace_shutdown_post_delay_seconds);
 
 // BitmapValue serialize version.
 DECLARE_Int16(bitmap_serialize_version);
 
@@ -24,21 +24,42 @@
 #include "http/http_headers.h"
 #include "http/http_request.h"
 #include "http/http_status.h"
+#include "runtime/exec_env.h"
 
 namespace doris {
 
 const static std::string HEADER_JSON = "application/json";
 
 void HealthAction::handle(HttpRequest* req) {
+    // Reports BE health as JSON {"status": ..., "msg": ...}. The HTTP status is
+    // always OK: in k8s we don't want the pod to be removed from service
+    // during shutdown, so clients must read the body for the real state.
+    const HttpStatus st = HttpStatus::OK;
+    std::string status;
+    std::string msg;
+    // Check the exit flag first: main() clears k_is_server_ready as soon as it
+    // observes k_doris_exit, so testing readiness first would report a
+    // draining BE as "not ready" instead of "shutting down".
+    if (doris::k_doris_exit) {
+        status = "Server is not available";
+        msg = "Server is shutting down";
+    } else if (!doris::k_is_server_ready) {
+        status = "Server is not available";
+        msg = "Server is not ready";
+    } else {
+        status = "OK";
+        msg = "OK";
+    }
+
     std::stringstream ss;
     ss << "{";
-    ss << "\"status\": \"OK\",";
-    ss << "\"msg\": \"To Be Added\"";
+    ss << "\"status\": \"" << status << "\",";
+    ss << "\"msg\": \"" << msg << "\"";
     ss << "}";
     std::string result = ss.str();
 
     req->add_output_header(HttpHeaders::CONTENT_TYPE, HEADER_JSON.c_str());
-    HttpChannel::send_reply(req, HttpStatus::OK, result);
+    HttpChannel::send_reply(req, st, result);
 }
 
 } // end namespace doris
@@ -176,6 +176,13 @@ void ExecEnv::wait_for_all_tasks_done() {
         sleep(1);
         ++wait_seconds_passed;
     }
+    // This is a conservative strategy.
+    // Because a query might still have fragments running on other BE nodes.
+    // In other words, the query hasn't truly terminated.
+    // If the current BE is shut down at this point,
+    // the FE will detect the downtime of a related BE and cancel the entire query,
+    // defeating the purpose of a graceful stop.
+    sleep(config::grace_shutdown_post_delay_seconds);
 }
 
 bool ExecEnv::check_auth_token(const std::string& auth_token) {
 
@@ -131,7 +131,10 @@ class IndexPolicyMgr;
 struct SyncRowsetStats;
 class DeleteBitmapAggCache;
 
+// set to true when BE is shutting down
 inline bool k_doris_exit = false;
+// set to true once BE startup has completed; reset to false when BE begins exiting
+inline bool k_is_server_ready = false;
 
 // Execution environment for queries/plan fragments.
 // Contains all required global structures, and handles to
 
@@ -602,12 +602,15 @@ int main(int argc, char** argv) {
 
     exec_env->storage_engine().notify_listeners();
 
+    doris::k_is_server_ready = true;
+
     while (!doris::k_doris_exit) {
 #if defined(LEAK_SANITIZER)
         __lsan_do_leak_check();
 #endif
         sleep(3);
     }
+    doris::k_is_server_ready = false;
     LOG(INFO) << "Doris main exiting.";
 #if defined(LLVM_PROFILE)
     __llvm_profile_write_file();
 
@@ -391,7 +391,7 @@ TEST_F(HttpClientTest, enable_http_auth) {
         st = client.execute(&response);
         EXPECT_TRUE(st.ok());
         std::cout << "response = " << response << "\n";
-        EXPECT_TRUE(response.find("To Be Added") != std::string::npos);
+        EXPECT_TRUE(response.find("Server is not ready") != std::string::npos);
     }
 
     {
@@ -422,7 +422,7 @@ TEST_F(HttpClientTest, enable_http_auth) {
         st = client.execute(&response);
         EXPECT_TRUE(st.ok());
         std::cout << "response = " << response << "\n";
-        EXPECT_TRUE(response.find("To Be Added") != std::string::npos);
+        EXPECT_TRUE(response.find("Server is not ready") != std::string::npos);
     }
 
     {
 
@@ -35,6 +35,8 @@
 import org.apache.doris.journal.bdbje.BDBTool;
 import org.apache.doris.journal.bdbje.BDBToolOptions;
 import org.apache.doris.persist.meta.MetaReader;
+import org.apache.doris.qe.Coordinator;
+import org.apache.doris.qe.QeProcessorImpl;
 import org.apache.doris.qe.QeService;
 import org.apache.doris.qe.SimpleScheduler;
 import org.apache.doris.service.ExecuteEnv;
@@ -61,7 +63,10 @@
 import java.nio.channels.FileLock;
 import java.nio.channels.OverlappingFileLockException;
 import java.nio.file.StandardOpenOption;
+
+import java.util.List;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 public class DorisFE {
     private static final Logger LOG = LogManager.getLogger(DorisFE.class);
@@ -80,6 +85,12 @@ public class DorisFE {
     private static FileChannel processLockFileChannel;
     private static FileLock processFileLock;
 
+    // set to true when all servers are ready.
+    private static final AtomicBoolean serverReady = new AtomicBoolean(false);
+
+    // HTTP server instance, used for graceful shutdown
+    private static HttpServer httpServer;
+
     public static void main(String[] args) {
         // Every doris version should have a final meta version, it should not change
         // between small releases. Add a check here to avoid mistake.
@@ -142,7 +153,19 @@ public static void start(String dorisHomeDir, String pidDir, String[] args, Star
             }
 
             Log4jConfig.initLogging(dorisHomeDir + "/conf/");
-            Runtime.getRuntime().addShutdownHook(new Thread(LogManager::shutdown));
+            // Add shutdown hook for graceful exit
+            Runtime.getRuntime().addShutdownHook(new Thread(() -> {
+                LOG.info("Received shutdown signal, starting graceful shutdown...");
+                serverReady.set(false);
+                gracefulShutdown();
+
+                // Shutdown HTTP server after main process graceful shutdown is complete
+                if (httpServer != null) {
+                    httpServer.shutdown();
+                }
+
+                LogManager.shutdown();
+            }));
 
             // set dns cache ttl
             java.security.Security.setProperty("networkaddress.cache.ttl", "60");
@@ -195,7 +218,7 @@ public static void start(String dorisHomeDir, String pidDir, String[] args, Star
             feServer.start();
 
             if (options.enableHttpServer) {
-                HttpServer httpServer = new HttpServer();
+                httpServer = new HttpServer();
                 httpServer.setPort(Config.http_port);
                 httpServer.setHttpsPort(Config.https_port);
                 httpServer.setMaxHttpPostSize(Config.jetty_server_max_http_post_size);
@@ -224,11 +247,14 @@ public static void start(String dorisHomeDir, String pidDir, String[] args, Star
 
             ThreadPoolManager.registerAllThreadPoolMetric();
             startMonitor();
+
+            serverReady.set(true);
+            // JVM will exit when shutdown hook is completed
             while (true) {
                 Thread.sleep(2000);
             }
         } catch (Throwable e) {
-            // Some exception may thrown before LOG is inited.
+            // Some exceptions may be thrown before LOG is initialized.
             // So need to print to stdout
             e.printStackTrace();
             LOG.error("", e);
@@ -538,4 +564,25 @@ public static class StartupOptions {
         public boolean enableHttpServer = true;
         public boolean enableQeService = true;
     }
+
+    public static boolean isServerReady() {
+        return serverReady.get(); // set at the end of start(), cleared when the shutdown hook runs
+    }
+
+    private static void gracefulShutdown() {
+        // wait for all queries to finish
+        try {
+            long now = System.currentTimeMillis();
+            List<Coordinator> allCoordinators = QeProcessorImpl.INSTANCE.getAllCoordinators();
+            while (!allCoordinators.isEmpty() && System.currentTimeMillis() - now < 300 * 1000L) {
+                Thread.sleep(1000);
+                allCoordinators = QeProcessorImpl.INSTANCE.getAllCoordinators();
+                LOG.info("waiting {} queries to finish before shutdown", allCoordinators.size());
+            }
+        } catch (Throwable t) {
+            LOG.error("", t);
+        }
+
+        LOG.info("graceful shutdown finished");
+    }
 }
@@ -102,7 +102,7 @@ private boolean isColocated() {
     public long getColocatedBeId(String clusterId) throws ComputeGroupException {
         CloudSystemInfoService infoService = ((CloudSystemInfoService) Env.getCurrentSystemInfo());
         List<Backend> bes = infoService.getBackendsByClusterId(clusterId).stream()
-                .filter(be -> !be.isQueryDisabled()).collect(Collectors.toList());
+                .filter(be -> be.isQueryAvailable()).collect(Collectors.toList());
         String clusterName = infoService.getClusterNameByClusterId(clusterId);
         if (bes.isEmpty()) {
             LOG.warn("failed to get available be, cluster: {}-{}", clusterName, clusterId);
@@ -418,7 +418,7 @@ public long hashReplicaToBe(String clusterId, boolean isBackGround) throws Compu
             long lastUpdateMs = be.getLastUpdateMs();
             long missTimeMs = Math.abs(lastUpdateMs - System.currentTimeMillis());
             // be core or restart must in heartbeat_interval_second
-            if ((be.isAlive() || missTimeMs <= Config.heartbeat_interval_second * 1000L)
+            if ((be.isQueryAvailable() || missTimeMs <= Config.heartbeat_interval_second * 1000L)
                     && !be.isSmoothUpgradeSrc()) {
                 if (be.isDecommissioned()) {
                     decommissionAvailBes.add(be);
 
@@ -36,16 +36,11 @@ public class FeConstants {
 
     public static int checkpoint_interval_second = 60; // 1 minutes
 
-    // dpp version
-    public static String dpp_version = "3_2_0";
-
     // bloom filter false positive probability
     public static double default_bloom_filter_fpp = 0.05;
 
     // set to true to skip some step when running FE unit test
     public static boolean runningUnitTest = false;
-    // use to set some mocked values for FE unit test
-    public static Object unitTestConstant = null;
 
     // set to false to disable internal schema db
     public static boolean enableInternalSchemaDb = true;
@@ -66,30 +61,9 @@ public class FeConstants {
     // use for copy into test
     public static boolean disablePreHeat = false;
 
-    public static final String FS_PREFIX_S3 = "s3";
-    public static final String FS_PREFIX_S3A = "s3a";
-    public static final String FS_PREFIX_S3N = "s3n";
-    public static final String FS_PREFIX_OSS = "oss";
-    public static final String FS_PREFIX_GCS = "gs";
-    public static final String FS_PREFIX_BOS = "bos";
-    public static final String FS_PREFIX_COS = "cos";
-    public static final String FS_PREFIX_COSN = "cosn";
-    public static final String FS_PREFIX_LAKEFS = "lakefs";
-    public static final String FS_PREFIX_OBS = "obs";
-    public static final String FS_PREFIX_OFS = "ofs";
-    public static final String FS_PREFIX_GFS = "gfs";
-    public static final String FS_PREFIX_JFS = "jfs";
-    public static final String FS_PREFIX_HDFS = "hdfs";
-    public static final String FS_PREFIX_VIEWFS = "viewfs";
-    public static final String FS_PREFIX_FILE = "file";
-
     public static final String INTERNAL_DB_NAME = "__internal_schema";
     public static final String INTERNAL_FILE_CACHE_HOTSPOT_TABLE_NAME = "cloud_cache_hotspot";
     public static String TEMP_MATERIZLIZE_DVIEW_PREFIX = "internal_tmp_materialized_view_";
 
     public static String METADATA_FAILURE_RECOVERY_KEY = "metadata_failure_recovery";
-
-    public static String CLOUD_RETRY_E230 = "E-230";
-
-    public static String BUILT_IN_STORAGE_VAULT_NAME = "built_in_storage_vault";
 }
Original file line number	Diff line number	Diff line change
`@@ -176,6 +176,13 @@ void ExecEnv::wait_for_all_tasks_done() {`
`176`	`176`	`sleep(1);`
`177`	`177`	`++wait_seconds_passed;`
`178`	`178`	`}`
	`179`	`+ // This is a conservative strategy.`
	`180`	`+ // Because a query might still have fragments running on other BE nodes.`
	`181`	`+ // In other words, the query hasn't truly terminated.`
	`182`	`+ // If the current BE is shut down at this point,`
	`183`	`+ // the FE will detect the downtime of a related BE and cancel the entire query,`
	`184`	`+ // defeating the purpose of a graceful stop.`
	`185`	`+ sleep(config::grace_shutdown_post_delay_seconds);`
`179`	`186`	`}`
`180`	`187`
`181`	`188`	`bool ExecEnv::check_auth_token(const std::string& auth_token) {`
Original file line number	Diff line number	Diff line change
`@@ -391,7 +391,7 @@ TEST_F(HttpClientTest, enable_http_auth) {`
`391`	`391`	`st = client.execute(&response);`
`392`	`392`	`EXPECT_TRUE(st.ok());`
`393`	`393`	`std::cout << "response = " << response << "\n";`
`394`		`- EXPECT_TRUE(response.find("To Be Added") != std::string::npos);`
	`394`	`+ EXPECT_TRUE(response.find("Server is not ready") != std::string::npos);`
`395`	`395`	`}`
`396`	`396`
`397`	`397`	`{`
`@@ -422,7 +422,7 @@ TEST_F(HttpClientTest, enable_http_auth) {`
`422`	`422`	`st = client.execute(&response);`
`423`	`423`	`EXPECT_TRUE(st.ok());`
`424`	`424`	`std::cout << "response = " << response << "\n";`
`425`		`- EXPECT_TRUE(response.find("To Be Added") != std::string::npos);`
	`425`	`+ EXPECT_TRUE(response.find("Server is not ready") != std::string::npos);`
`426`	`426`	`}`
`427`	`427`
`428`	`428`	`{`