From 3099cb05386b3214d3cb0bc41cb94b9ae7e64b95 Mon Sep 17 00:00:00 2001 From: camby <104178625@qq.com> Date: Sat, 5 Jul 2025 16:02:42 +0800 Subject: [PATCH] [fix](be java) be hang while enable_java_support (#52818) ### What problem does this PR solve? ### Problem After enabling Java support, BE cannot start correctly; it hangs with the following stack: ``` (gdb) bt #0 0x00007f5fb1e97ce6 in __futex_abstimed_wait_common () from /lib64/libc.so.6 #1 0x00007f5fb1e9a798 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libc.so.6 #2 0x00007f5fb2c98bf3 in os::PlatformEvent::park() () from /usr/lib/jvm/java-17//lib/server/libjvm.so #3 0x00007f5fb2c693a5 in ObjectMonitor::wait(long, bool, JavaThread*) () from /usr/lib/jvm/java-17//lib/server/libjvm.so #4 0x00007f5fb2e8b316 in ObjectSynchronizer::wait(Handle, long, JavaThread*) () from /usr/lib/jvm/java-17//lib/server/libjvm.so #5 0x00007f5fb2934a97 in JVM_MonitorWait () from /usr/lib/jvm/java-17//lib/server/libjvm.so #6 0x00007f5f9de245ba in ?? () #7 0x00007f5f49446158 in ?? () #8 0x00007f5faeb9fa00 in ?? () #9 0x00007ffc37f91178 in ?? () #10 0x00007f5f9de304bd in ?? () #11 0x00007ffc37f910a0 in ?? () #12 0x0000000000000000 in ?? 
() ``` jstack of BE: ``` "main" #1 prio=5 os_prio=0 cpu=931.38ms elapsed=66.08s tid=0x00007fab8e12c400 nid=0x3e68aa in Object.wait() [0x00007ffdb3d4c000] java.lang.Thread.State: WAITING (on object monitor) at java.lang.Object.wait(java.base@17.0.15-ga/Native Method) - waiting on <0x00000000bd183b68> (a java.lang.ProcessImpl) at java.lang.Object.wait(java.base@17.0.15-ga/Object.java:338) at java.lang.ProcessImpl.waitFor(java.base@17.0.15-ga/ProcessImpl.java:434) - locked <0x00000000bd183b68> (a java.lang.ProcessImpl) at org.apache.hadoop.util.Shell.runCommand(Shell.java:1061) at org.apache.hadoop.util.Shell.run(Shell.java:957) at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1282) at org.apache.hadoop.util.Shell.isSetsidSupported(Shell.java:853) at org.apache.hadoop.util.Shell.<clinit>(Shell.java:838) at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79) at org.apache.hadoop.conf.Configuration.getBoolean(Configuration.java:1713) at org.apache.hadoop.security.SecurityUtil.setConfigurationInternal(SecurityUtil.java:103) at org.apache.hadoop.security.SecurityUtil.<clinit>(SecurityUtil.java:92) at org.apache.hadoop.security.UserGroupInformation.initialize(UserGroupInformation.java:312) - locked <0x00000000bd5fd708> (a java.lang.Class for org.apache.hadoop.security.UserGroupInformation) at org.apache.hadoop.security.UserGroupInformation.ensureInitialized(UserGroupInformation.java:300) - locked <0x00000000bd5fd708> (a java.lang.Class for org.apache.hadoop.security.UserGroupInformation) at org.apache.hadoop.security.UserGroupInformation.getCurrentUser(UserGroupInformation.java:575) at org.apache.hadoop.fs.viewfs.ViewFileSystem.<init>(ViewFileSystem.java:279) at jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(java.base@17.0.15-ga/Native Method) at jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(java.base@17.0.15-ga/NativeConstructorAccessorImpl.java:77) at 
jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(java.base@17.0.15-ga/DelegatingConstructorAccessorImpl.java:45) at java.lang.reflect.Constructor.newInstanceWithCaller(java.base@17.0.15-ga/Constructor.java:500) at java.lang.reflect.Constructor.newInstance(java.base@17.0.15-ga/Constructor.java:481) at java.util.ServiceLoader$ProviderImpl.newInstance(java.base@17.0.15-ga/ServiceLoader.java:789) at java.util.ServiceLoader$ProviderImpl.get(java.base@17.0.15-ga/ServiceLoader.java:729) at java.util.ServiceLoader$3.next(java.base@17.0.15-ga/ServiceLoader.java:1403) at org.apache.hadoop.fs.FileSystem.loadFileSystems(FileSystem.java:3534) - locked <0x00000000826529f0> (a java.lang.Class for org.apache.hadoop.fs.FileSystem) ``` This problem was introduced by: https://github.com/apache/doris/pull/45287 After this fix, we can enable Java support: https://github.com/apache/doris/pull/52412 ### Alternative Fix Method Adding `-Djdk.lang.processReaperUseDefaultStackSize=true` to JAVA_OPTS in be.conf also fixes this problem: https://bugs.openjdk.org/browse/JDK-8153057 From Gemini: ![Clipboard_Screenshot_1751683706](https://github.com/user-attachments/assets/a54ffb3a-cb57-4f8a-bc85-5366081c2d9b) --- be/src/olap/rowset/rowset_meta.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/olap/rowset/rowset_meta.cpp b/be/src/olap/rowset/rowset_meta.cpp index c9851cdc5fc64b..0da212ea8812d4 100644 --- a/be/src/olap/rowset/rowset_meta.cpp +++ b/be/src/olap/rowset/rowset_meta.cpp @@ -229,7 +229,7 @@ void RowsetMeta::set_segments_key_bounds(const std::vector& segment int32_t truncation_threshold = config::segments_key_bounds_truncation_threshold; if (config::random_segments_key_bounds_truncation) { - static thread_local std::mt19937 generator(std::random_device {}()); + std::mt19937 generator(std::random_device {}()); std::uniform_int_distribution distribution(-10, 40); truncation_threshold = distribution(generator); }