Another CXL/MMIO tcg tlb corner case Hi, While doing some testing using numactl-based interleaving of application memory across regular memory and CXL-based memory using QEMU with tcg, I ran into an issue similar to what we saw a while back - link to old issue: https://lore.kernel.org/qemu-devel/CAFEAcA_a_AyQ=Epz3_+CheAT8Crsk9mOu894wbNW_FywamkZiw@mail.gmail.com/#t. When running: numactl --interleave 0,1 ./cachebench … I hit the following: numactl --interleave 0,1 ./cachebench --json_test_config ../test_configs/hit_ratio/graph_cache_follower_assocs/config.json qemu: fatal: cpu_io_recompile: could not find TB for pc=0x7fffc3926dd4 RAX=00007f65df55ba18 RBX=00007f65df55ba60 RCX=00007f65df221620 RDX=0000000000000000 RSI=00000000011c0260 RDI=00007f65df55ba60 RBP=00007ffdb4b4b280 RSP=00007ffdb4b4b1d0 R8 =00000000011c02c0 R9 =00007f65debf6b20 R10=00000000011bf5d0 R11=00007f65deb7d300 R12=00007ffdb4b4b260 R13=00007ffdb4b4b200 R14=00007ffdb4b4b220 R15=00000000011bf5a0 RIP=00007f65df18affc RFL=00000246 [---Z-P-] CPL=3 II=0 A20=1 SMM=0 HLT=0 ES =0000 0000000000000000 00000000 00000000 CS =0033 0000000000000000 ffffffff 00affb00 DPL=3 CS64 [-RA] SS =002b 0000000000000000 ffffffff 00cff300 DPL=3 DS [-WA] DS =0000 0000000000000000 00000000 00000000 FS =0000 00007f65de2f64c0 00000000 00000000 GS =0000 0000000000000000 00000000 00000000 LDT=0000 0000000000000000 00000000 00008200 DPL=0 LDT TR =0040 fffffe6c37990000 00004087 00008900 DPL=0 TSS64-avl GDT= fffffe6c3798e000 0000007f IDT= fffffe0000000000 00000fff CR0=80050033 CR2=00007f65df1b3eb0 CR3=0000000152a1e000 CR4=00350ef0 DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000 DR6=00000000ffff0ff0 DR7=0000000000000400 CCS=0000000000000000 CCD=0000000000000001 CCO=CLR EFER=0000000000000d01 FCW=037f FSW=0000 [ST=0] FTW=00 MXCSR=00001f80 FPR0=0000000000000000 0000 FPR1=0000000000000000 0000 FPR2=0000000000000000 0000 FPR3=0000000000000000 0000 FPR4=0000000000000000 0000 FPR5=0000000000000000 0000 FPR6=0000000000000000 0000 FPR7=0000000000000000 0000 YMM00=0000000000000000 0000000000000000 00007f65df2233e0 00007f65df221620 YMM01=0000000000000000 0000000000000000 0000000000000000 43e0000000000000 YMM02=0000000000000000 0000000000000000 0000000000000000 0000000000000000 YMM03=0000000000000000 0000000000000000 0000000000000000 0000000000000000 YMM04=0000000000000000 0000000000000000 0000000000000000 3ff0000000000000 YMM05=0000000000000000 0000000000000000 0000000000000000 00007f65df2233e0 YMM06=0000000000000000 0000000000000000 0000000000000000 00007f65df2233b0 YMM07=0000000000000000 0000000000000000 62694c6568636143 2f65636170736b72 YMM08=0000000000000000 0000000000000000 6d622070656d7320 327876612031696d YMM09=0000000000000000 0000000000000000 0000000000000004 0000000000000004 YMM10=0000000000000000 0000000000000000 0000000000000002 0000000000000002 YMM11=0000000000000000 0000000000000000 0000000000000010 0000000000000010 YMM12=0000000000000000 0000000000000000 0000000000ff00fb 0000000000fe00fa YMM13=0000000000000000 0000000000000000 0000000000000000 00ff00fd00fb00f9 YMM14=0000000000000000 0000000000000000 0000000000000000 0000000000000000 YMM15=0000000000000000 0000000000000000 0000000000000000 0000000000000000 The backtrace is (using Jonathans cxl-2024-03-05 branch): (gdb) bt #0 __pthread_kill_implementation (no_tid=0, signo=6, threadid=140737297516096) at ./nptl/pthread_kill.c:44 #1 __pthread_kill_internal (signo=6, threadid=140737297516096) at ./nptl/pthread_kill.c:78 #2 __GI___pthread_kill (threadid=140737297516096, signo=signo@entry=6) at ./nptl/pthread_kill.c:89 #3 0x00007ffff7642476 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26 #4 0x00007ffff76287f3 in __GI_abort () at ./stdlib/abort.c:79 #5 0x0000555555c5a9df in cpu_abort (cpu=cpu@entry=0x5555578c19c0, fmt=fmt@entry=0x55555605d100 "cpu_io_recompile: could not find TB for pc=%p") at ../cpu-target.c:371 #6 0x0000555555caa065 in cpu_io_recompile (cpu=cpu@entry=0x5555578c19c0, retaddr=140736474541524) at ../accel/tcg/translate-all.c:610 #7 0x0000555555cacee7 in io_prepare (retaddr=140736474541524, addr=140075515361944, attrs=..., xlat=, cpu=0x5555578c19c0, out_offset=) at ../accel/tcg/cputlb.c:1336 #8 do_st_mmio_leN (cpu=0x5555578c19c0, full=0x7ffd1a1554d0, val_le=140075515361816, addr=140075515361944, size=8, mmu_idx=3, ra=140736474541524) at ../accel/tcg/cputlb.c:2591 #9 0x0000555555cb179d in do_st_8 (ra=, memop=, mmu_idx=, val=140075515361816, p=, cpu=) at ../accel/tcg/cputlb.c:2784 #10 do_st8_mmu (cpu=0x5555578c19c0, addr=39050, val=140075515361816, oi=6, ra=140736474541524) at ../accel/tcg/cputlb.c:2862 #11 0x00007fffc3926e15 in code_gen_buffer () #12 0x0000555555ca0e5b in cpu_tb_exec (cpu=cpu@entry=0x5555578c19c0, itb=itb@entry=0x7fffc3926cc0 , tb_exit=tb_exit@entry=0x7ffff49ff6d8) at ../accel/tcg/cpu-exec.c:449 #13 0x0000555555ca13ac in cpu_loop_exec_tb (tb_exit=0x7ffff49ff6d8, last_tb=, pc=, tb=0x7fffc3926cc0 , cpu=0x5555578c19c0) at ../accel/tcg/cpu-exec.c:904 #14 cpu_exec_loop (cpu=cpu@entry=0x5555578c19c0, sc=sc@entry=0x7ffff49ff770) at ../accel/tcg/cpu-exec.c:1019 #15 0x0000555555ca1bb1 in cpu_exec_setjmp (cpu=cpu@entry=0x5555578c19c0, sc=sc@entry=0x7ffff49ff770) at ../accel/tcg/cpu-exec.c:1036 #16 0x0000555555ca2388 in cpu_exec (cpu=cpu@entry=0x5555578c19c0) at ../accel/tcg/cpu-exec.c:1062 #17 0x0000555555cc65c4 in tcg_cpu_exec (cpu=cpu@entry=0x5555578c19c0) at ../accel/tcg/tcg-accel-ops.c:76 #18 0x0000555555cc671f in mttcg_cpu_thread_fn (arg=arg@entry=0x5555578c19c0) at ../accel/tcg/tcg-accel-ops-mttcg.c:95 #19 0x0000555555e61261 in qemu_thread_start (args=) at ../util/qemu-thread-posix.c:541 #20 0x00007ffff7694ac3 in start_thread (arg=) at ./nptl/pthread_create.c:442 #21 0x00007ffff7726850 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 Looking at the tb being executed, it looks like it is a single instruction tb, so with my _very_ limited understanding of tcg, it shouldn’t be necessary to do the IO recompile: (gdb) up 13 #13 0x0000555555ca13ac in cpu_loop_exec_tb (tb_exit=0x7ffff49ff6d8, last_tb=, pc=, tb=0x7fffc3926cc0 , cpu=0x5555578c19c0) at ../accel/tcg/cpu-exec.c:904 904 tb = cpu_tb_exec(cpu, tb, tb_exit); (gdb) print *tb $1 = {pc = 0, cs_base = 0, flags = 415285939, cflags = 4278321152, size = 7, icount = 1, tc = {ptr = 0x7fffc3926d80 , size = 176}, page_next = {0, 0}, page_addr = {18446744073709551615, 18446744073709551615}, jmp_lock = {value = 0}, jmp_reset_offset = {65535, 65535}, jmp_insn_offset = {65535, 65535}, jmp_target_addr = {0, 0}, jmp_list_head = 140736474540928, jmp_list_next = {0, 0}, jmp_dest = {0, 0}} If the application is run entirely out of MMIO memory, things work fine (the previous patches related to this is in Jonathans branch), so one thought is that it is related to having the code on a mix of regular and CXL memory. Since we previously had issues with code crossing page boundaries where only the second page is MMIO, I tried out the following change to the fix introduced for that issue thinking that reverting to the slow path in the middle of the translation might not correctly update can_do_io: Message-Id: <33748BB7-E617-4661-BDE3-5D29780FC9FF@wdc.com>

VirtSnd: Driver MUST NOT place device writable buffers in tx queue #3

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions