diff --git a/2022/01/06/linux-kernel-stack.html b/2022/01/06/linux-kernel-stack.html new file mode 100644 index 0000000..e1638ec --- /dev/null +++ b/2022/01/06/linux-kernel-stack.html @@ -0,0 +1,39 @@ + + + +
+ + + +I was enjoying my quiet afternoon, when an interesting application core has been submitted to me for research.
-And what made it interesting is that I did not have to load proprietary/private symbols fort the application that crashed.
+And what made it interesting is that I did not have to load proprietary/private symbols for the application that crashed.
The first thing to check is whether the file is indeed a core file, and it turns out it is not.
rusty@nuc10:~/futex$ file veeamagent.0.crash
veeamagent.0.crash: ASCII text, with very long lines (44565)
@@ -105,22 +108,22 @@
void
__lll_lock_wait (int *futex, int private)
{
- if (atomic_load_relaxed (futex) == 2)
+ if (atomic_load_relaxed (futex) == 2)
goto futex;
- while (atomic_exchange_acquire (futex, 2) != 0)
+ while (atomic_exchange_acquire (futex, 2) != 0)
{
futex:
- LIBC_PROBE (lll_lock_wait, 1, futex);
- futex_wait ((unsigned int *) futex, 2, private); /* Wait if *futex == 2. */
+ LIBC_PROBE (lll_lock_wait, 1, futex);
+ futex_wait ((unsigned int *) futex, 2, private); /* Wait if *futex == 2. */
}
}
-libc_hidden_def (__lll_lock_wait)
+libc_hidden_def (__lll_lock_wait)
futex_wait() is always inlined, that is why it is not in the stack, and futex_wait() calls futex_fatal_error(), which subsequently calls abort()
static __always_inline int
futex_wait (unsigned int *futex_word, unsigned int expected, int private)
{
- int err = lll_futex_timed_wait (futex_word, expected, NULL, private);
+ int err = lll_futex_timed_wait (futex_word, expected, NULL, private);
switch (err)
{
case 0:
@@ -136,7 +139,7 @@
case -ENOSYS: /* Must have been caused by a glibc bug. */
/* No other errors are documented at this time. */
default:
- futex_fatal_error ();
+ futex_fatal_error ();
}
}
After reading the comments, let’s check the easiest thing first, wrong alignment.
@@ -162,14 +165,14 @@
#include <unistd.h>
#include <pthread.h>
-pthread_mutex_t mutex __attribute__((section(".reproducer"))) __attribute__ ((aligned(1)));
+pthread_mutex_t mutex __attribute__((section(".reproducer"))) __attribute__ ((aligned(1)));
int main(int argc, char **argv)
{
- printf("mutex %p\n", &mutex);
- pthread_mutex_lock(&mutex);
- sleep(1);
- pthread_mutex_unlock(&mutex);
+ printf("mutex %p\n", &mutex);
+ pthread_mutex_lock(&mutex);
+ sleep(1);
+ pthread_mutex_unlock(&mutex);
return 0;
}
Now, let’s compile it and test it, it should crash with the same stack trace as we have in our core.
@@ -201,13 +204,13 @@Let’s run this:
rusty@nuc10:~/futex$ gcc -g futex0.c -lpthread -Wl,--section-start=.reproducer=0x8001ea1
@@ -249,7 +252,7 @@
* The futex address must be "naturally" aligned.
*/
key->both.offset = address % PAGE_SIZE;
- if (unlikely((address % sizeof(u32)) != 0))
+ if (unlikely((address % sizeof(u32)) != 0))
return -EINVAL;
And there we have it again, the futex address must be aligned.
And with that, we can conclude our research and send this bug report to Veeam for fixing.
@@ -258,6 +261,10 @@Debugging a kernel bug could be a challenging task.
+Let’s say a bug lies within the memory management subsystem, slub allocator or page allocator.
+Depending on the debugging tools that you have chosen for troubleshooting, these tools might be calling into the mm subsystem, because they themselves need to allocate memory to show you the trace/debug output that you are interested in, and, therefore, they might be hitting the very same bug and not producing any trace/debug output at all.
But the crash file that has been submitted to me for research, luckily, turned out to be a relatively easy one.
+And not a kernel bug per se, but an out-of-tree 3rd party kernel module bug.
+First things first, let’s check the file’s magic number:
+rusty@thinkpad:~$ file 202310190528/dump.202310190528
+202310190528/dump.202310190528: Flattened kdump compressed dump v6, system Linux, node debian11, release 5.10.0-26-amd64, version #1 SMP Debian 5.10.197-1 (2023-09-29), machine x86_64, domain (none)
+
Now we need the debug symbols for the debian kernel 5.10.0-26-amd64.
+Since I am opening the kdump crash file on my Arch Linux machine, I need to manually fetch the deb package from debian repo, extract the contents, and get vmlinux file.
+Now we are ready to research it, let’s open it in the crash utility:
+rusty@thinkpad:~$ crash 202310190528/dump.202310190528 vmlinux-5.10.0-26-amd64
+...
+ KERNEL: vmlinux-5.10.0-26-amd64 [TAINTED]
+ DUMPFILE: 202310190528/dump.202310190528 [PARTIAL DUMP]
+ CPUS: 2
+ DATE: Thu Oct 19 05:28:06 PDT 2023
+ UPTIME: 04:28:07
+LOAD AVERAGE: 0.00, 0.00, 0.00
+ TASKS: 144
+ NODENAME: debian11
+ RELEASE: 5.10.0-26-amd64
+ VERSION: #1 SMP Debian 5.10.197-1 (2023-09-29)
+ MACHINE: x86_64 (2112 Mhz)
+ MEMORY: 4 GB
+ PANIC: "Oops: 0010 [#1] SMP NOPTI" (check log for details)
+ PID: 9462
+ COMMAND: "veeamdeferio8_1"
+ TASK: ffff966246d7b100 [THREAD_INFO: ffff966246d7b100]
+ CPU: 1
+ STATE: TASK_RUNNING (PANIC)
+
+crash> ps -p 9462
+PID: 0 TASK: ffffffffbc613940 CPU: 0 COMMAND: "swapper/0"
+ PID: 2 TASK: ffff9662401eb100 CPU: 1 COMMAND: "kthreadd"
+ PID: 9462 TASK: ffff966246d7b100 CPU: 1 COMMAND: "veeamdeferio8_1"
+
+crash> bt 9462
+PID: 9462 TASK: ffff966246d7b100 CPU: 1 COMMAND: "veeamdeferio8_1"
+ #0 [ffffafd080c7bc70] machine_kexec at ffffffffbb067ca1
+ #1 [ffffafd080c7bcc8] __crash_kexec at ffffffffbb14a52a
+ #2 [ffffafd080c7bd88] crash_kexec at ffffffffbb14b669
+ #3 [ffffafd080c7bd98] oops_end at ffffffffbb02ed3f
+ #4 [ffffafd080c7bdb8] exc_page_fault at ffffffffbb8ffb59
+ #5 [ffffafd080c7bde0] asm_exc_page_fault at ffffffffbba00b4e
+ #6 [ffffafd080c7bf10] kthread at ffffffffbb0b5b48
+ #7 [ffffafd080c7bf50] ret_from_fork at ffffffffbb00451f
+
+crash> lsmod |grep -i veeam
+crash>
+
The kernel crashed in the context of a kernel-space process - a kthread with PID 9462 called veeamdeferio.
+At the moment of the crash, nothing with the name veeam was loaded into the kernel.
+The next step would be to review the kernel ring buffer right before the crash:
+crash> log
+...
+[16086.429347] veeamsnap:tracker | ERR | Tracker is already released for device [8:1]
+[16086.447893] veeamsnap:container | ERR | CRITICAL ERROR: Container is not empty. cnt=2
+[16086.447901] veeamsnap:snapstore | ERR | Cleanup snapstore devices
+[16086.460295] veeamsnap:container | ERR | CRITICAL ERROR: Container is not empty. cnt=1
+[16086.460301] veeamsnap:snapstore | ERR | Unable to perform snapstore cleanup: container is not empty
+[16086.529734] veeamsnap:container | ERR | CRITICAL ERROR: Container is not empty. cnt=1
+[16086.529757] veeamsnap:ctrl_pipe | ERR | Unable to perform ctrl pipes cleanup: container is not empty
+[16086.529967] veeamsnap:main | ERR | container_alloc_counter=3
+[16086.529972] veeamsnap:main container_alloc_counter=3
+[16086.529976] veeamsnap:main | ERR | mem_cnt=65608
+[16086.529980] veeamsnap:main mem_cnt=65608
+[16087.147849] BUG: unable to handle page fault for address: ffffffffc08a65d3
+[16087.149722] #PF: supervisor instruction fetch in kernel mode
+[16087.151349] #PF: error_code(0x0010) - not-present page
+[16087.153012] PGD 6580e067 P4D 6580e067 PUD 65810067 PMD 1047e6067 PTE 0
+[16087.154702] Oops: 0010 [#1] SMP NOPTI
+[16087.156282] CPU: 1 PID: 9462 Comm: veeamdeferio8_1 Kdump: loaded Tainted: G OE 5.10.0-26-amd64 #1 Debian 5.10.197-1
+[16087.157927] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.2-2-2 04/01/2014
+[16087.159468] RIP: 0010:0xffffffffc08a65d3
+[16087.160915] Code: Unable to access opcode bytes at RIP 0xffffffffc08a65a9.
+[16087.162371] RSP: 0018:ffffafd080c7be98 EFLAGS: 00010282
+[16087.163864] RAX: 0000000000000000 RBX: ffff966242162000 RCX: 0000000000000000
+[16087.165331] RDX: 0000000000000000 RSI: 0000000000000246 RDI: 0000000000000246
+[16087.166742] RBP: 00000000000000fa R08: 0000000000000000 R09: 0000000000000000
+[16087.168145] R10: 0000000000000000 R11: 0000000000000000 R12: ffff966242162018
+[16087.169546] R13: ffffafd08410fcf0 R14: ffff966242162000 R15: ffff966246d7b100
+[16087.170909] FS: 0000000000000000(0000) GS:ffff96627bd00000(0000) knlGS:0000000000000000
+[16087.172305] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[16087.174165] CR2: ffffffffc08a65a9 CR3: 000000010342e005 CR4: 0000000000370ee0
+[16087.175734] Call Trace:
+[16087.177402] ? __die_body.cold+0x1a/0x1f
+[16087.178577] ? no_context+0x1a6/0x3c0
+[16087.179462] ? exc_page_fault+0xd9/0x160
+[16087.180338] ? asm_exc_page_fault+0x1e/0x30
+[16087.181203] ? add_wait_queue_exclusive+0x70/0x70
+[16087.182084] ? kthread+0x118/0x140
+[16087.182872] ? __kthread_bind_mask+0x60/0x60
+[16087.183491] ? ret_from_fork+0x1f/0x30
+[16087.184105] Modules linked in: binfmt_misc dm_mod intel_rapl_msr intel_rapl_common intel_pmc_core kvm_intel kvm irqbypass ghash_clmulni_intel aesni_intel libaes crypto_simd cryptd glue_helper rapl ppdev bochs_drm drm_vram_helper drm_ttm_helper ttm joydev evdev parport_pc drm_kms_helper serio_raw pcspkr parport qemu_fw_cfg cec sg button drm configfs fuse ip_tables x_tables autofs4 ext4 crc16 mbcache jbd2 crc32c_generic sd_mod t10_pi crc_t10dif crct10dif_generic sr_mod cdrom ata_generic crct10dif_pclmul crct10dif_common ata_piix crc32_pclmul libata floppy e1000 scsi_mod crc32c_intel psmouse i2c_piix4 [last unloaded: veeamsnap]
+[16087.186898] CR2: ffffffffc08a65d3
+
After carefully going over the oops message, we now have the full picture.
+The instruction pointer register (RIP) for the veeamdeferio kthread was pointing to 0xffffffffc08a65a9, which is an unmapped page, accessing which caused the page fault.
+The reason why the page got unmapped is because it has been freed when the veeamsnap kernel module was unloaded.
+And the problem is that veeamsnap did not stop the veeamdeferio kthread when it was being unloaded.
+At this point we can submit a bug report to Veeam, bug we can research the issue a bit further.
+Why? Because we can and it is open-source, we have the code1.
+The key error message explaining the veeamsnap bug is in the output above:
+++[16086.429347] veeamsnap:tracker | ERR | Tracker is already released for device [8:1]
+
This corresponds to _tracker_release_snapshot() in tracker.c, this function did not call defer_io_stop():
+int _tracker_release_snapshot(tracker_t* tracker)
+{
+ int result = SUCCESS;
+#ifdef VEEAMSNAP_BLK_FREEZE
+ struct super_block* superblock = NULL;
+#endif
+ defer_io_t* defer_io = tracker->defer_io;
+
+ if (!tracker->defer_io) {
+ log_err_dev_t("Tracker is already released for device ", tracker->original_dev_id);
+ return SUCCESS;
+ }
+ ...
+
And the reason for it lies within another code path, in _tracker_capture_snapshot() in the same file:
+int _tracker_capture_snapshot(tracker_t* tracker)
+{
+ int result;
+ defer_io_t* defer_io = NULL;
+
+ result = defer_io_create(tracker->original_dev_id, tracker->target_dev, &defer_io);
+ if (result != SUCCESS) {
+ log_err("Failed to create defer IO processor");
+ return result;
+ }
+
+ tracker->defer_io = defer_io_get_resource(defer_io);
+ ...
+
Here, on the last line, tracker->defer_io pointer gets assigned a new value.
+This assignment overwrites and leaks the existing non-null pointer.
+And this unaccounted-for veeamdeferio kthread never gets stopped, and, subsequently, causes the kernel to crash later on during module removal.
+And now we are good to submit this RCA to Veeam for fixing.
+