Skip to content

Conversation

@github-actions
Copy link
Contributor

Cherry-picked from #53046

### What problem does this PR solve?

Issue Number: close #xxx

Related PR: #xxx

Problem Summary:

1. Fix an LRU queue crash caused by a use-after-free.
2. Fix extra LRU queue info when the 'need_to_move' flag is unset.
3. Use a concurrent queue to record queue change info for thread safety.

```
ERROR: AddressSanitizer: heap-use-after-free on address 0x603005548c40 at pc 0x55f28e8c4785 bp 0x7f603582e1f0 sp 0x7f603582e1e8
READ of size 8 at 0x603005548c40 thread T201
    #0 0x55f28e8c4784 in std::_Head_base<0ul, doris::io::CacheLRULog*, false>::_Head_base<doris::io::CacheLRULog*>(doris::io::CacheLRULog*&&) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/tuple:190:17
    #1 0x55f28e8c4784 in std::_Tuple_impl<0ul, doris::io::CacheLRULog*, std::default_delete<doris::io::CacheLRULog>>::_Tuple_impl(std::_Tuple_impl<0ul, doris::io::CacheLRULog*, std::default_delete<doris::io::CacheLRULog>>&&) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/tuple:292:2
    #2 0x55f28e8c4784 in std::tuple<doris::io::CacheLRULog*, std::default_delete<doris::io::CacheLRULog>>::tuple(std::tuple<doris::io::CacheLRULog*, std::default_delete<doris::io::CacheLRULog>>&&) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/tuple:1079:17
    #3 0x55f28e8c4784 in std::__uniq_ptr_impl<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>::__uniq_ptr_impl(std::__uniq_ptr_impl<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>&&) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:162:9
    #4 0x55f28e8c4784 in std::__uniq_ptr_data<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>, true, true>::__uniq_ptr_data(std::__uniq_ptr_data<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>, true, true>&&) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:211:7
    #5 0x55f28e8c4784 in std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>::unique_ptr(std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>&&) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/unique_ptr.h:327:7
    #6 0x55f28e8c4784 in doris::io::LRUQueueRecorder::replay_queue_event(doris::io::FileCacheType) /root/doris/be/src/io/cache/lru_queue_recorder.cpp:40:20
    #7 0x55f28e82d620 in doris::io::BlockFileCache::run_background_lru_log_replay() /root/doris/be/src/io/cache/block_file_cache.cpp:2242:24
    #8 0x55f2cdc2720f in execute_native_thread_routine /data/gcc-11.1.0/build/x86_64-pc-linux-gnu/libstdc++-v3/src/c++11/../../../../../libstdc++-v3/src/c++11/thread.cc:82:18
    #9 0x7f61f1842608 in start_thread /build/glibc-SzIz7B/glibc-2.31/nptl/pthread_create.c:477:8
    #10 0x7f61f1aef132 in __clone /build/glibc-SzIz7B/glibc-2.31/misc/../sysdeps/unix/sysv/linux/x86_64/clone.S:95

0x603005548c40 is located 16 bytes inside of 24-byte region [0x603005548c30,0x603005548c48)
freed by thread T201 here:
    #0 0x55f28e51680d in operator delete(void*) (/home/work/unlimit_teamcity/TeamCity/Agents/20250708205944agent_172.16.0.48_1/work/60183217f6ee2a9c/output/be/lib/doris_be+0x3975a80d) (BuildId: 8b6ba6101e736655)
    #1 0x55f28e8c3ce0 in std::__cxx11::list<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>, std::allocator<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>>>::pop_front() /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_list.h:1198:15
    #2 0x55f28e8c3ce0 in doris::io::LRUQueueRecorder::replay_queue_event(doris::io::FileCacheType) /root/doris/be/src/io/cache/lru_queue_recorder.cpp:41:19
    #3 0x55f28e82d620 in doris::io::BlockFileCache::run_background_lru_log_replay() /root/doris/be/src/io/cache/block_file_cache.cpp:2242:24
    #4 0x55f2cdc2720f in execute_native_thread_routine /data/gcc-11.1.0/build/x86_64-pc-linux-gnu/libstdc++-v3/src/c++11/../../../../../libstdc++-v3/src/c++11/thread.cc:82:18

previously allocated by thread T607 (CumuCompactionT) here:
    #0 0x55f28e515fad in operator new(unsigned long) (/home/work/unlimit_teamcity/TeamCity/Agents/20250708205944agent_172.16.0.48_1/work/60183217f6ee2a9c/output/be/lib/doris_be+0x39759fad) (BuildId: 8b6ba6101e736655)
    #1 0x55f28e8c660d in __gnu_cxx::new_allocator<std::_List_node<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>>>::allocate(unsigned long, void const*) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/ext/new_allocator.h:121:27
    #2 0x55f28e8c660d in std::allocator<std::_List_node<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>>>::allocate(unsigned long) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/allocator.h:173:32
    #3 0x55f28e8c660d in std::allocator_traits<std::allocator<std::_List_node<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>>>>::allocate(std::allocator<std::_List_node<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>>>&, unsigned long) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/alloc_traits.h:460:20
    #4 0x55f28e8c660d in std::__cxx11::_List_base<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>, std::allocator<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>>>::_M_get_node() /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_list.h:442:16
    #5 0x55f28e8c660d in std::_List_node<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>>* std::__cxx11::list<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>, std::allocator<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>>>::_M_create_node<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>>(std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>&&) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_list.h:634:21
    #6 0x55f28e8c660d in void std::__cxx11::list<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>, std::allocator<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>>>::_M_insert<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>>(std::_List_iterator<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>>, std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>&&) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_list.h:1911:18
    #7 0x55f28e8c3522 in std::__cxx11::list<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>, std::allocator<std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>>>::push_back(std::unique_ptr<doris::io::CacheLRULog, std::default_delete<doris::io::CacheLRULog>>&&) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_list.h:1217:15
    #8 0x55f28e8c3522 in doris::io::LRUQueueRecorder::record_queue_event(doris::io::FileCacheType, doris::io::CacheLRULogType, doris::io::UInt128Wrapper, unsigned long, unsigned long) /root/doris/be/src/io/cache/lru_queue_recorder.cpp:29:15
    #9 0x55f28e82f09b in doris::io::BlockFileCache::use_cell(doris::io::BlockFileCache::FileBlockCell const&, std::__cxx11::list<std::shared_ptr<doris::io::FileBlock>, std::allocator<std::shared_ptr<doris::io::FileBlock>>>*, bool, std::lock_guard<std::mutex>&) /root/doris/be/src/io/cache/block_file_cache.cpp:380:20
    #10 0x55f28e833d1b in doris::io::BlockFileCache::get_impl[abi:cxx11](doris::io::UInt128Wrapper const&, doris::io::CacheContext const&, doris::io::FileBlock::Range const&, std::lock_guard<std::mutex>&) /root/doris/be/src/io/cache/block_file_cache.cpp:572:13
    #11 0x55f28e83b4ef in doris::io::BlockFileCache::get_or_set(doris::io::UInt128Wrapper const&, unsigned long, unsigned long, doris::io::CacheContext&) /root/doris/be/src/io/cache/block_file_cache.cpp:762:27
    #12 0x55f28e7ffcee in doris::io::CachedRemoteFileReader::read_at_impl(unsigned long, doris::Slice, unsigned long*, doris::io::IOContext const*) /root/doris/be/src/io/cache/cached_remote_file_reader.cpp:191:21
    #13 0x55f28e7f8017 in doris::io::FileReader::read_at(unsigned long, doris::Slice, unsigned long*, doris::io::IOContext const*) /root/doris/be/src/io/fs/file_reader.cpp:34:17
```

### Release note

None

### Check List (For Author)

- Test <!-- At least one of them must be included. -->
    - [ ] Regression test
    - [ ] Unit Test
    - [ ] Manual test (add detailed scripts or steps below)
    - [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
        - [x] Previous test can cover this change.
        - [ ] No code files have been changed.
        - [ ] Other reason <!-- Add your reason?  -->

- Behavior changed:
    - [x] No.
    - [ ] Yes. <!-- Explain the behavior change -->

- Does this need documentation?
    - [x] No.
    - [ ] Yes. <!-- Add document PR link here. eg: https://github.com/apache/doris-website/pull/1214 -->

### Check List (For Reviewer who merge this PR)

- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->

---------

Signed-off-by: zhengyu <zhangzhengyu@selectdb.com>
@github-actions github-actions bot requested a review from morrySnow as a code owner July 14, 2025 11:21
@Thearas
Copy link
Contributor

Thearas commented Jul 14, 2025

Thank you for your contribution to Apache Doris.
Don't know what should be done next? See How to process your PR.

Please clearly describe your PR:

  1. What problem was fixed (it's best to include specific error reporting information). How it was fixed.
  2. Which behaviors were modified. What was the previous behavior, what is it now, why was it modified, and what possible impacts might there be.
  3. What features were added. Why was this function added?
  4. Which code was refactored and why was this part of the code refactored?
  5. Which functions were optimized and what is the difference before and after the optimization?

@dataroaring dataroaring reopened this Jul 14, 2025
@Thearas
Copy link
Contributor

Thearas commented Jul 14, 2025

run buildall

@doris-robot
Copy link

TPC-H: Total hot run time: 39922 ms
machine: 'aliyun_ecs.c7a.8xlarge_32C64G'
scripts: https://github.com/apache/doris/tree/master/tools/tpch-tools
Tpch sf100 test result on commit 461f54f6b46d6dd076f1d1f60211374eafed9e29, data reload: false

------ Round 1 ----------------------------------
q1	17581	6727	6599	6599
q2	2066	216	197	197
q3	11100	1165	1196	1165
q4	12017	764	667	667
q5	8257	2999	2852	2852
q6	218	135	133	133
q7	979	635	631	631
q8	9349	1946	2014	1946
q9	6729	6402	6430	6402
q10	7044	2246	2298	2246
q11	454	270	256	256
q12	395	217	207	207
q13	17782	2962	2948	2948
q14	239	206	209	206
q15	515	478	467	467
q16	483	383	373	373
q17	1015	627	583	583
q18	7367	6825	6754	6754
q19	1316	952	1000	952
q20	487	201	205	201
q21	4060	3133	3218	3133
q22	1074	1010	1004	1004
Total cold run time: 110527 ms
Total hot run time: 39922 ms

----- Round 2, with runtime_filter_mode=off -----
q1	6621	6645	6545	6545
q2	342	235	235	235
q3	3184	3039	3007	3007
q4	2031	1811	1735	1735
q5	5703	5721	5697	5697
q6	203	126	123	123
q7	2224	1778	1782	1778
q8	3370	3564	3469	3469
q9	8883	8773	8825	8773
q10	3558	3515	3491	3491
q11	582	482	494	482
q12	813	575	611	575
q13	4531	3157	3121	3121
q14	307	266	287	266
q15	527	464	463	463
q16	501	442	433	433
q17	1839	1598	1599	1598
q18	8236	7712	7723	7712
q19	1685	1651	1543	1543
q20	2087	1884	1897	1884
q21	5048	4860	4819	4819
q22	1105	1000	1008	1000
Total cold run time: 63380 ms
Total hot run time: 58749 ms

@doris-robot
Copy link

TPC-DS: Total hot run time: 190462 ms
machine: 'aliyun_ecs.c7a.8xlarge_32C64G'
scripts: https://github.com/apache/doris/tree/master/tools/tpcds-tools
TPC-DS sf100 test result on commit 461f54f6b46d6dd076f1d1f60211374eafed9e29, data reload: false

query1	964	372	365	365
query2	6531	1988	1846	1846
query3	6717	227	224	224
query4	34058	23823	23478	23478
query5	4426	466	443	443
query6	286	174	198	174
query7	4629	316	329	316
query8	295	241	240	240
query9	9706	2596	2598	2596
query10	500	281	255	255
query11	18208	15059	15120	15059
query12	162	105	103	103
query13	1650	430	432	430
query14	9645	7210	6710	6710
query15	258	172	187	172
query16	8141	460	459	459
query17	1636	587	571	571
query18	2167	310	311	310
query19	310	172	163	163
query20	121	111	108	108
query21	207	108	111	108
query22	4449	4073	4118	4073
query23	34564	33383	33720	33383
query24	11775	2879	2861	2861
query25	718	421	418	418
query26	1789	176	173	173
query27	2901	343	351	343
query28	7843	2143	2134	2134
query29	1039	462	453	453
query30	332	161	162	161
query31	1030	802	802	802
query32	102	61	61	61
query33	810	319	309	309
query34	925	506	528	506
query35	849	714	701	701
query36	1115	931	941	931
query37	133	74	73	73
query38	3884	3859	3825	3825
query39	1485	1432	1478	1432
query40	297	103	105	103
query41	55	54	54	54
query42	122	106	106	106
query43	514	476	477	476
query44	1254	810	815	810
query45	186	178	174	174
query46	1147	717	710	710
query47	1962	1817	1870	1817
query48	437	357	359	357
query49	1241	405	411	405
query50	820	433	417	417
query51	7270	7084	7149	7084
query52	110	95	94	94
query53	272	192	196	192
query54	1232	497	485	485
query55	82	82	84	82
query56	275	262	256	256
query57	1286	1190	1171	1171
query58	254	223	230	223
query59	3152	2940	2839	2839
query60	284	259	272	259
query61	114	118	115	115
query62	833	675	674	674
query63	219	190	188	188
query64	5225	668	644	644
query65	3266	3208	3193	3193
query66	1271	323	338	323
query67	16060	15673	15788	15673
query68	4767	583	590	583
query69	420	280	264	264
query70	1175	1074	1108	1074
query71	384	268	260	260
query72	6364	4085	4065	4065
query73	757	348	374	348
query74	10321	9262	9271	9262
query75	3424	2653	2669	2653
query76	2993	1034	1010	1010
query77	426	278	272	272
query78	10624	9605	9545	9545
query79	1182	584	600	584
query80	976	452	444	444
query81	535	225	222	222
query82	1215	92	89	89
query83	228	155	148	148
query84	236	81	86	81
query85	1179	319	307	307
query86	342	290	307	290
query87	4431	4244	4230	4230
query88	3715	2432	2396	2396
query89	402	299	287	287
query90	1891	188	189	188
query91	143	118	119	118
query92	61	53	59	53
query93	1079	548	571	548
query94	753	301	303	301
query95	354	256	258	256
query96	609	286	285	285
query97	3305	3122	3166	3122
query98	215	203	193	193
query99	1502	1335	1291	1291
Total cold run time: 301608 ms
Total hot run time: 190462 ms

@doris-robot
Copy link

ClickBench: Total hot run time: 29.86 s
machine: 'aliyun_ecs.c7a.8xlarge_32C64G'
scripts: https://github.com/apache/doris/tree/master/tools/clickbench-tools
ClickBench test result on commit 461f54f6b46d6dd076f1d1f60211374eafed9e29, data reload: false

query1	0.03	0.03	0.03
query2	0.07	0.03	0.03
query3	0.23	0.07	0.07
query4	1.64	0.11	0.10
query5	0.52	0.50	0.52
query6	1.15	0.73	0.73
query7	0.02	0.02	0.02
query8	0.05	0.05	0.03
query9	0.56	0.50	0.51
query10	0.56	0.56	0.56
query11	0.15	0.10	0.10
query12	0.14	0.11	0.12
query13	0.61	0.60	0.60
query14	0.76	0.80	0.80
query15	0.85	0.84	0.84
query16	0.38	0.37	0.38
query17	1.00	1.05	1.00
query18	0.23	0.22	0.22
query19	1.93	1.87	1.88
query20	0.02	0.01	0.00
query21	15.43	0.59	0.58
query22	2.41	2.12	1.63
query23	17.03	0.94	0.83
query24	2.69	1.04	0.79
query25	0.28	0.24	0.07
query26	0.32	0.14	0.14
query27	0.05	0.04	0.04
query28	10.91	0.48	0.45
query29	12.56	3.25	3.19
query30	0.25	0.06	0.06
query31	2.88	0.38	0.38
query32	3.23	0.46	0.46
query33	2.99	2.97	3.04
query34	16.91	4.50	4.52
query35	4.50	4.50	4.51
query36	0.67	0.48	0.48
query37	0.09	0.06	0.06
query38	0.05	0.04	0.04
query39	0.03	0.02	0.03
query40	0.17	0.12	0.12
query41	0.07	0.02	0.03
query42	0.04	0.02	0.02
query43	0.04	0.03	0.03
Total cold run time: 104.5 s
Total hot run time: 29.86 s

@doris-robot
Copy link

BE UT Coverage Report

Increment line coverage 100.00% (14/14) 🎉

Increment coverage report
Complete coverage report

Category Coverage
Function Coverage 45.22% (12509/27662)
Line Coverage 36.13% (111107/307554)
Region Coverage 35.25% (57478/163076)
Branch Coverage 32.36% (31216/96470)

@morrySnow morrySnow merged commit 9e3067a into branch-3.1 Jul 15, 2025
20 of 22 checks passed
@github-actions github-actions bot deleted the auto-pick-53046-branch-3.1 branch July 15, 2025 02:22
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

6 participants