From ada018b15ccecbdb95df46db7121516edb906bf6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 14 Feb 2020 18:32:43 +0100 Subject: [PATCH 001/190] x86/mce/amd: Do proper cleanup on error paths Drop kobject reference counts properly on error in the banks and blocks allocation functions. [ bp: Write commit message. ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200403161943.1458-2-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 52de616a806559..477cf773cf1cb6 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1267,13 +1267,12 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb if (b) kobject_uevent(&b->kobj, KOBJ_ADD); - return err; + return 0; out_free: if (b) { - kobject_put(&b->kobj); list_del(&b->miscj); - kfree(b); + kobject_put(&b->kobj); } return err; } @@ -1339,6 +1338,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } + /* Associate the bank with the per-CPU MCE device */ b->kobj = kobject_create_and_add(name, &dev->kobj); if (!b->kobj) { err = -EINVAL; @@ -1357,16 +1357,17 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank)); if (err) - goto out_free; + goto out_kobj; per_cpu(threshold_banks, cpu)[bank] = b; return 0; - out_free: +out_kobj: + kobject_put(b->kobj); +out_free: kfree(b); - - out: +out: return err; } From c9bf318f77b3a78483e656e609d005c52aadc86d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Feb 2020 00:34:01 +0100 Subject: [PATCH 002/190] x86/mce/amd: Init thresholding machinery only on relevant vendors ... and not unconditionally. [ bp: Add a new vendor_flags bit for that. 
] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200403161943.1458-3-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 12 ++++++++++-- arch/x86/kernel/cpu/mce/core.c | 1 + arch/x86/kernel/cpu/mce/internal.h | 9 ++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 477cf773cf1cb6..c3b3326ad4ac06 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1442,15 +1442,20 @@ static void threshold_remove_bank(unsigned int cpu, int bank) int mce_threshold_remove_device(unsigned int cpu) { + struct threshold_bank **bp = this_cpu_read(threshold_banks); unsigned int bank; + if (!bp) + return 0; + for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; threshold_remove_bank(cpu, bank); } - kfree(per_cpu(threshold_banks, cpu)); - per_cpu(threshold_banks, cpu) = NULL; + /* Clear the pointer before freeing the memory */ + this_cpu_write(threshold_banks, NULL); + kfree(bp); return 0; } @@ -1461,6 +1466,9 @@ int mce_threshold_create_device(unsigned int cpu) struct threshold_bank **bp; int err = 0; + if (!mce_flags.amd_threshold) + return 0; + bp = per_cpu(threshold_banks, cpu); if (bp) return 0; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 54165f3569e882..43ca91e14a77c9 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1756,6 +1756,7 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); + mce_flags.amd_threshold = 1; if (mce_flags.smca) { msr_ops.ctl = smca_ctl_reg; diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 3b008172ad73d5..74a01829c4f4a8 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -148,7 +148,7 @@ struct mce_vendor_flags { * Recovery. It indicates support for data poisoning in HW and deferred * error interrupts. */ - succor : 1, + succor : 1, /* * (AMD) SMCA: This bit indicates support for Scalable MCA which expands @@ -156,9 +156,12 @@ struct mce_vendor_flags { * banks. Also, to accommodate the new banks and registers, the MCA * register space is moved to a new MSR range. */ - smca : 1, + smca : 1, - __reserved_0 : 61; + /* AMD-style error thresholding banks present. */ + amd_threshold : 1, + + __reserved_0 : 60; }; extern struct mce_vendor_flags mce_flags; From cca9cc05fe98f3eb0cfb58ec6739cfc9d0b4ccbf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Mar 2020 20:05:43 +0100 Subject: [PATCH 003/190] x86/mce/amd: Protect a not-fully initialized bank from the thresholding interrupt Make sure the thresholding bank descriptor is fully initialized when the thresholding interrupt fires after a hotplug event. [ bp: Write commit message and document long-forgotten bank_map. 
] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200403161943.1458-4-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index c3b3326ad4ac06..56394215775869 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -192,7 +192,12 @@ EXPORT_SYMBOL_GPL(smca_banks); static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); -static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ + +/* + * A list of the banks enabled on each logical CPU. Controls which respective + * descriptors to initialize later in mce_threshold_create_device(). + */ +static DEFINE_PER_CPU(unsigned int, bank_map); /* Map of banks that have more than MCA_MISC0 available. */ static DEFINE_PER_CPU(u32, smca_misc_banks_map); @@ -1016,13 +1021,22 @@ static void log_and_reset_block(struct threshold_block *block) static void amd_threshold_interrupt(void) { struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; + struct threshold_bank **bp = this_cpu_read(threshold_banks); unsigned int bank, cpu = smp_processor_id(); + /* + * Validate that the threshold bank has been initialized already. The + * handler is installed at boot time, but on a hotplug event the + * interrupt might fire before the data has been initialized. + */ + if (!bp) + return; + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; - first_block = per_cpu(threshold_banks, cpu)[bank]->blocks; + first_block = bp[bank]->blocks; if (!first_block) continue; @@ -1247,6 +1261,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb INIT_LIST_HEAD(&b->miscj); + /* This is safe as @tb is not visible yet */ if (tb->blocks) list_add(&b->miscj, &tb->blocks->miscj); else From 6e7a41c63abcfee28734c4c8872dae8d642329b6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 30 Mar 2020 16:21:54 +0200 Subject: [PATCH 004/190] x86/mce/amd: Sanitize thresholding device creation hotplug path Drop the stupid threshold_init_device() initcall iterating over all online CPUs in favor of properly setting up everything on the CPU hotplug path, when each CPU's callback is invoked. [ bp: Write commit message. ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200403161943.1458-5-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 57 ++++++++++------------------------ arch/x86/kernel/cpu/mce/core.c | 11 +++++++ 2 files changed, 27 insertions(+), 41 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 56394215775869..d3c416b6052a17 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1474,12 +1474,22 @@ int mce_threshold_remove_device(unsigned int cpu) return 0; } -/* create dir/files for all valid threshold banks */ +/** + * mce_threshold_create_device - Create the per-CPU MCE threshold device + * @cpu: The plugged in CPU + * + * Create directories and files for all valid threshold banks. + * + * This is invoked from the CPU hotplug callback which was installed in + * mcheck_init_device(). The invocation happens in context of the hotplug + * thread running on @cpu. The callback is invoked on all CPUs which are + * online when the callback is installed or during a real hotplug event. 
+ */ int mce_threshold_create_device(unsigned int cpu) { unsigned int bank; struct threshold_bank **bp; - int err = 0; + int err; if (!mce_flags.amd_threshold) return 0; @@ -1500,49 +1510,14 @@ int mce_threshold_create_device(unsigned int cpu) continue; err = threshold_create_bank(cpu, bank); if (err) - goto err; - } - return err; -err: - mce_threshold_remove_device(cpu); - return err; -} - -static __init int threshold_init_device(void) -{ - unsigned lcpu = 0; - - /* to hit CPUs online before the notifier is up */ - for_each_online_cpu(lcpu) { - int err = mce_threshold_create_device(lcpu); - - if (err) - return err; + goto out_err; } if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; return 0; +out_err: + mce_threshold_remove_device(cpu); + return err; } -/* - * there are 3 funcs which need to be _initcalled in a logic sequence: - * 1. xen_late_init_mcelog - * 2. mcheck_init_device - * 3. threshold_init_device - * - * xen_late_init_mcelog must register xen_mce_chrdev_device before - * native mce_chrdev_device registration if running under xen platform; - * - * mcheck_init_device should be inited before threshold_init_device to - * initialize mce_device, otherwise a NULL ptr dereference will cause panic. - * - * so we use following _initcalls - * 1. device_initcall(xen_late_init_mcelog); - * 2. device_initcall_sync(mcheck_init_device); - * 3. late_initcall(threshold_init_device); - * - * when running under xen, the initcall order is 1,2,3; - * on baremetal, we skip 1 and we do only 2 and 3. - */ -late_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 43ca91e14a77c9..a6009efdfe2b3d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2481,6 +2481,13 @@ static __init void mce_init_banks(void) } } +/* + * When running on XEN, this initcall is ordered against the XEN mcelog + * initcall: + * + * device_initcall(xen_late_init_mcelog); + * device_initcall_sync(mcheck_init_device); + */ static __init int mcheck_init_device(void) { int err; @@ -2512,6 +2519,10 @@ static __init int mcheck_init_device(void) if (err) goto err_out_mem; + /* + * Invokes mce_cpu_online() on all CPUs which are online when + * the state is installed. + */ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online", mce_cpu_online, mce_cpu_pre_down); if (err < 0) From 6458de97fc15530b54477c4e2b70af653e8ac3d9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 30 Mar 2020 20:30:45 +0200 Subject: [PATCH 005/190] x86/mce/amd: Straighten CPU hotplug path mce_threshold_create_device() hotplug callback runs on the plugged-in CPU, so: - use this_cpu_read(), which is faster - pass in struct threshold_bank **bp to threshold_create_bank() instead of doing per-CPU accesses - use rdmsr_safe() instead of rdmsr_safe_on_cpu(), which avoids an IPI. No functional changes.
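For illustration only - not part of the patch - this is the general pattern being applied: code which is guaranteed to run on the CPU it operates on can use the local per-CPU accessors and MSR helpers instead of the cross-CPU variants which take an explicit CPU number and, in the MSR case, may send an IPI. A minimal sketch with a hypothetical per-CPU variable and hotplug callback:

  #include <linux/percpu.h>
  #include <asm/msr.h>

  static DEFINE_PER_CPU(unsigned int, demo_counter);

  /* Hotplug callback: runs on the plugged-in CPU @cpu itself. */
  static int demo_cpu_online(unsigned int cpu)
  {
          u32 lo, hi;
          unsigned int val;

          /* Local access suffices, no per_cpu(demo_counter, cpu) needed. */
          val = this_cpu_read(demo_counter);

          /* Local MSR read: rdmsr_safe_on_cpu(cpu, ...) would IPI needlessly. */
          if (rdmsr_safe(MSR_IA32_MCG_CAP, &lo, &hi))
                  return -ENODEV;

          this_cpu_write(demo_counter, val + 1);
          return 0;
  }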
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200403161943.1458-6-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index d3c416b6052a17..a33d9a1caf3694 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1223,10 +1223,10 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb u32 low, high; int err; - if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) + if ((bank >= this_cpu_read(mce_num_banks)) || (block >= NR_BLOCKS)) return 0; - if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) + if (rdmsr_safe(address, &low, &high)) return 0; if (!(high & MASK_VALID_HI)) { @@ -1316,9 +1316,10 @@ static int __threshold_add_blocks(struct threshold_bank *b) return err; } -static int threshold_create_bank(unsigned int cpu, unsigned int bank) +static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, + unsigned int bank) { - struct device *dev = per_cpu(mce_device, cpu); + struct device *dev = this_cpu_read(mce_device); struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; const char *name = get_name(bank, NULL); @@ -1338,7 +1339,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out; - per_cpu(threshold_banks, cpu)[bank] = b; + bp[bank] = b; refcount_inc(&b->cpus); err = __threshold_add_blocks(b); @@ -1374,8 +1375,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out_kobj; - per_cpu(threshold_banks, cpu)[bank] = b; - + bp[bank] = b; return 0; out_kobj: @@ -1487,35 +1487,33 @@ int mce_threshold_remove_device(unsigned int cpu) */ int mce_threshold_create_device(unsigned int cpu) { - unsigned int bank; + unsigned int numbanks, bank; struct threshold_bank **bp; int err; if (!mce_flags.amd_threshold) return 0; - bp = per_cpu(threshold_banks, cpu); + bp = this_cpu_read(threshold_banks); if (bp) return 0; - bp = kcalloc(per_cpu(mce_num_banks, cpu), sizeof(struct threshold_bank *), - GFP_KERNEL); + numbanks = this_cpu_read(mce_num_banks); + bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL); if (!bp) return -ENOMEM; - per_cpu(threshold_banks, cpu) = bp; - - for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) + for (bank = 0; bank < numbanks; ++bank) { + if (!(this_cpu_read(bank_map) & (1 << bank))) continue; - err = threshold_create_bank(cpu, bank); + err = threshold_create_bank(bp, cpu, bank); if (err) goto out_err; } + this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; - return 0; out_err: mce_threshold_remove_device(cpu); From f26d2580a7ddc84aa9e51e47fdbb5ad63dbee5a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Mar 2020 10:53:18 +0200 Subject: [PATCH 006/190] x86/mce/amd: Cleanup threshold device remove path Pass in the bank pointer directly to the cleaning up functions, obviating the need for per-CPU accesses. Make the clean up path interrupt-safe by cleaning the bank pointer first so that the rest of the teardown happens safe from the thresholding interrupt. No functional changes. [ bp: Write commit message and reverse bank->shared test to save an indentation level in threshold_remove_bank(). 
] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200403161943.1458-7-bp@alien8.de --- arch/x86/include/asm/amd_nb.h | 1 + arch/x86/kernel/cpu/mce/amd.c | 79 ++++++++++++++++------------------- 2 files changed, 38 insertions(+), 42 deletions(-) diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index c7df20e78b0956..455066a06f607e 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -57,6 +57,7 @@ struct threshold_bank { /* initialized to the number of CPUs on the node sharing this bank */ refcount_t cpus; + unsigned int shared; }; struct amd_northbridge { diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index a33d9a1caf3694..16e7aea86ab119 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1362,6 +1362,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, } if (is_shared_bank(bank)) { + b->shared = 1; refcount_set(&b->cpus, 1); /* nb is already initialized, see above */ @@ -1391,21 +1392,16 @@ static void threshold_block_release(struct kobject *kobj) kfree(to_block(kobj)); } -static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) +static void deallocate_threshold_blocks(struct threshold_bank *bank) { - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; - - if (!head) - return; + struct threshold_block *pos, *tmp; - list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { + list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) { list_del(&pos->miscj); kobject_put(&pos->kobj); } - kobject_put(&head->blocks->kobj); + kobject_put(&bank->blocks->kobj); } static void __threshold_remove_blocks(struct threshold_bank *b) @@ -1419,57 +1415,56 @@ static void __threshold_remove_blocks(struct threshold_bank *b) kobject_del(&pos->kobj); } -static void threshold_remove_bank(unsigned int cpu, int bank) +static void threshold_remove_bank(struct threshold_bank *bank) { struct amd_northbridge *nb; - struct threshold_bank *b; - b = per_cpu(threshold_banks, cpu)[bank]; - if (!b) - return; + if (!bank->blocks) + goto out_free; - if (!b->blocks) - goto free_out; + if (!bank->shared) + goto out_dealloc; - if (is_shared_bank(bank)) { - if (!refcount_dec_and_test(&b->cpus)) { - __threshold_remove_blocks(b); - per_cpu(threshold_banks, cpu)[bank] = NULL; - return; - } else { - /* - * the last CPU on this node using the shared bank is - * going away, remove that bank now. - */ - nb = node_to_amd_nb(amd_get_nb_id(cpu)); - nb->bank4 = NULL; - } + if (!refcount_dec_and_test(&bank->cpus)) { + __threshold_remove_blocks(bank); + return; + } else { + /* + * The last CPU on this node using the shared bank is going + * away, remove that bank now. 
+ */ + nb = node_to_amd_nb(amd_get_nb_id(smp_processor_id())); + nb->bank4 = NULL; } - deallocate_threshold_block(cpu, bank); +out_dealloc: + deallocate_threshold_blocks(bank); -free_out: - kobject_del(b->kobj); - kobject_put(b->kobj); - kfree(b); - per_cpu(threshold_banks, cpu)[bank] = NULL; +out_free: + kobject_put(bank->kobj); + kfree(bank); } int mce_threshold_remove_device(unsigned int cpu) { struct threshold_bank **bp = this_cpu_read(threshold_banks); - unsigned int bank; + unsigned int bank, numbanks = this_cpu_read(mce_num_banks); if (!bp) return 0; - for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - threshold_remove_bank(cpu, bank); - } - /* Clear the pointer before freeing the memory */ + /* + * Clear the pointer before cleaning up, so that the interrupt won't + * touch anything of this. + */ this_cpu_write(threshold_banks, NULL); + + for (bank = 0; bank < numbanks; bank++) { + if (bp[bank]) { + threshold_remove_bank(bp[bank]); + bp[bank] = NULL; + } + } kfree(bp); return 0; } From a037f3ca0ea0a660e3f961431095a88674b8f3c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Mar 2020 13:16:44 +0200 Subject: [PATCH 007/190] x86/mce/amd: Make threshold bank setting hotplug robust Handle the cases when the CPU goes offline before the bank setting/reading happens. [ bp: Write commit message. ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200403161943.1458-8-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 16e7aea86ab119..15c87b87b901a1 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -386,6 +386,10 @@ static void threshold_restart_bank(void *_tr) struct thresh_restart *tr = _tr; u32 hi, lo; + /* sysfs write might race against an offline operation */ + if (this_cpu_read(threshold_banks)) + return; + rdmsr(tr->b->address, lo, hi); if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) @@ -1085,7 +1089,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) memset(&tr, 0, sizeof(tr)); tr.b = b; - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + return -ENODEV; return size; } @@ -1109,7 +1114,8 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) b->threshold_limit = new; tr.b = b; - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + return -ENODEV; return size; } @@ -1118,7 +1124,9 @@ static ssize_t show_error_count(struct threshold_block *b, char *buf) { u32 lo, hi; - rdmsr_on_cpu(b->cpu, b->address, &lo, &hi); + /* CPU might be offline by now */ + if (rdmsr_on_cpu(b->cpu, b->address, &lo, &hi)) + return -ENODEV; return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - (THRESHOLD_MAX - b->threshold_limit))); From 3e0fdec858d82c829774f271e88b5ceb17051551 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 7 Apr 2020 09:55:10 +0200 Subject: [PATCH 008/190] x86/mce/amd, edac: Remove report_gart_errors ... because no one should be interested in spurious MCEs anyway. Make the filtering unconditional and move it to amd_filter_mce(). 
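For reference, the check keys off MCA bank 4 and extended error code 5 - the northbridge GART TLB walk error on pre-SMCA parts. The extended error code lives in bits 16 and up of MCA_STATUS and is extracted by the XEC() helper which this patch moves into <asm/mce.h>. A small, self-contained sketch of the decoding (the status value below is made up for the example):

  #include <stdint.h>
  #include <stdio.h>

  /* Same shape as the kernel's XEC(): extended error code from MCA_STATUS. */
  #define XEC(x, mask)    (((x) >> 16) & (mask))

  int main(void)
  {
          uint64_t status = 0x9400000000050151ULL; /* hypothetical MCA_STATUS */
          unsigned int bank = 4;

          /* Pre-SMCA AMD: bank 4, extended error code 5 == GART TLB error. */
          if (bank == 4 && XEC(status, 0x1f) == 0x5)
                  printf("spurious GART TLB error, filtered out\n");

          return 0;
  }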
Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200407163414.18058-2-bp@alien8.de --- arch/x86/include/asm/mce.h | 3 ++- arch/x86/kernel/cpu/mce/amd.c | 9 +++++++-- drivers/edac/amd64_edac.c | 8 -------- drivers/edac/mce_amd.c | 24 ------------------------ drivers/edac/mce_amd.h | 2 -- 5 files changed, 9 insertions(+), 37 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index f9cea081c05b01..83b6ddafa032cf 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -127,6 +127,8 @@ #define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) +#define XEC(x, mask) (((x) >> 16) & mask) + /* * This structure contains all data related to the MCE log. Also * carries a signature to make it easier to find from external @@ -347,5 +349,4 @@ umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return #endif static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); } - #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 15c87b87b901a1..ea3cf714b7ad42 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -577,14 +577,19 @@ bool amd_filter_mce(struct mce *m) { enum smca_bank_types bank_type = smca_get_bank_type(m->bank); struct cpuinfo_x86 *c = &boot_cpu_data; - u8 xec = (m->status >> 16) & 0x3F; /* See Family 17h Models 10h-2Fh Erratum #1114. */ if (c->x86 == 0x17 && c->x86_model >= 0x10 && c->x86_model <= 0x2F && - bank_type == SMCA_IF && xec == 10) + bank_type == SMCA_IF && XEC(m->status, 0x3f) == 10) return true; + /* NB GART TLB error reporting is disabled by default. */ + if (c->x86 < 0x17) { + if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5) + return true; + } + return false; } diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index f91f3bc1e0b28b..6bdc5bb8c8bc97 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -4,9 +4,6 @@ static struct edac_pci_ctl_info *pci_ctl; -static int report_gart_errors; -module_param(report_gart_errors, int, 0644); - /* * Set by command line parameter. If BIOS has enabled the ECC, this override is * cleared to prevent re-enabling the hardware by this driver. @@ -3681,9 +3678,6 @@ static int __init amd64_edac_init(void) } /* register stuff with EDAC MCE */ - if (report_gart_errors) - amd_report_gart_errors(true); - if (boot_cpu_data.x86 >= 0x17) amd_register_ecc_decoder(decode_umc_error); else @@ -3718,8 +3712,6 @@ static void __exit amd64_edac_exit(void) edac_pci_release_generic_ctl(pci_ctl); /* unregister from EDAC MCE */ - amd_report_gart_errors(false); - if (boot_cpu_data.x86 >= 0x17) amd_unregister_ecc_decoder(decode_umc_error); else diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 8874b7722b2f26..e58644d9c92b7f 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -10,15 +10,8 @@ static struct amd_decoder_ops fam_ops; static u8 xec_mask = 0xf; -static bool report_gart_errors; static void (*decode_dram_ecc)(int node_id, struct mce *m); -void amd_report_gart_errors(bool v) -{ - report_gart_errors = v; -} -EXPORT_SYMBOL_GPL(amd_report_gart_errors); - void amd_register_ecc_decoder(void (*f)(int, struct mce *)) { decode_dram_ecc = f; @@ -1030,20 +1023,6 @@ static inline void amd_decode_err_code(u16 ec) pr_cont("\n"); } -/* - * Filter out unwanted MCE signatures here. 
- */ -static bool ignore_mce(struct mce *m) -{ - /* - * NB GART TLB error reporting is disabled by default. - */ - if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5 && !report_gart_errors) - return true; - - return false; -} - static const char *decode_error_status(struct mce *m) { if (m->status & MCI_STATUS_UC) { @@ -1067,9 +1046,6 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) unsigned int fam = x86_family(m->cpuid); int ecc; - if (ignore_mce(m)) - return NOTIFY_STOP; - pr_emerg(HW_ERR "%s\n", decode_error_status(m)); pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 4e9c5e596c6c20..4811b18d9606bb 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -7,7 +7,6 @@ #include #define EC(x) ((x) & 0xffff) -#define XEC(x, mask) (((x) >> 16) & mask) #define LOW_SYNDROME(x) (((x) >> 15) & 0xff) #define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) @@ -77,7 +76,6 @@ struct amd_decoder_ops { bool (*mc2_mce)(u16, u8); }; -void amd_report_gart_errors(bool); void amd_register_ecc_decoder(void (*f)(int, struct mce *)); void amd_unregister_ecc_decoder(void (*f)(int, struct mce *)); From c9c6d216ed28be6e2c91e3651af169eca284813a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:14 -0800 Subject: [PATCH 009/190] x86/mce: Rename "first" function as "early" It isn't going to be first on the notifier chain when the CEC is moved to be a normal user of the notifier chain. Fix the enum for the MCE_PRIO symbols to list them in reverse order so that the compiler can give them numbers from low to high priority. Add an entry for MCE_PRIO_CEC as the highest priority. [ bp: Use passive voice, add comments. ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-2-tony.luck@intel.com --- arch/x86/include/asm/mce.h | 16 +++++++++------- arch/x86/kernel/cpu/mce/core.c | 10 +++++----- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 83b6ddafa032cf..689ac6e9c65f07 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -144,14 +144,16 @@ struct mce_log_buffer { struct mce entry[]; }; +/* Highest last */ enum mce_notifier_prios { - MCE_PRIO_FIRST = INT_MAX, - MCE_PRIO_UC = INT_MAX - 1, - MCE_PRIO_EXTLOG = INT_MAX - 2, - MCE_PRIO_NFIT = INT_MAX - 3, - MCE_PRIO_EDAC = INT_MAX - 4, - MCE_PRIO_MCELOG = 1, - MCE_PRIO_LOWEST = 0, + MCE_PRIO_LOWEST, + MCE_PRIO_MCELOG, + MCE_PRIO_EDAC, + MCE_PRIO_NFIT, + MCE_PRIO_EXTLOG, + MCE_PRIO_UC, + MCE_PRIO_EARLY, + MCE_PRIO_CEC }; struct notifier_block; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index a6009efdfe2b3d..43b1519ad4e5bb 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -559,7 +559,7 @@ static bool cec_add_mce(struct mce *m) return false; } -static int mce_first_notifier(struct notifier_block *nb, unsigned long val, +static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; @@ -580,9 +580,9 @@ static int mce_first_notifier(struct notifier_block *nb, unsigned long val, return NOTIFY_DONE; } -static struct notifier_block first_nb = { - .notifier_call = mce_first_notifier, - .priority = MCE_PRIO_FIRST, +static struct notifier_block early_nb = { + .notifier_call = mce_early_notifier, + .priority = MCE_PRIO_EARLY, }; static int 
uc_decode_notifier(struct notifier_block *nb, unsigned long val, @@ -2041,7 +2041,7 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); - mce_register_decode_chain(&first_nb); + mce_register_decode_chain(&early_nb); mce_register_decode_chain(&mce_uc_nb); mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); From 9554bfe403bdfc084823df8695a01f28c680af61 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:15 -0800 Subject: [PATCH 010/190] x86/mce: Convert the CEC to use the MCE notifier The CEC code has its claws in a couple of routines in mce/core.c. Convert it to just register itself on the normal MCE notifier chain. [ bp: Make cec_add_elem() and cec_init() static. ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-3-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 19 ------------------- drivers/ras/cec.c | 30 ++++++++++++++++++++++++++++-- include/linux/ras.h | 5 ----- 3 files changed, 28 insertions(+), 26 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 43b1519ad4e5bb..b033b35896300b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -544,21 +544,6 @@ bool mce_is_correctable(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_correctable); -static bool cec_add_mce(struct mce *m) -{ - if (!m) - return false; - - /* We eat only correctable DRAM errors with usable addresses. */ - if (mce_is_memory_error(m) && - mce_is_correctable(m) && - mce_usable_address(m)) - if (!cec_add_elem(m->addr >> PAGE_SHIFT)) - return true; - - return false; -} - static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -567,9 +552,6 @@ static int mce_early_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - if (cec_add_mce(m)) - return NOTIFY_STOP; - /* Emit the trace record: */ trace_mce_record(m); @@ -2612,7 +2594,6 @@ static int __init mcheck_late_init(void) static_branch_inc(&mcsafe_key); mcheck_debugfs_init(); - cec_init(); /* * Flush out everything that has been logged during early boot, now that diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c index c09cf55e2d2040..6b42040bf956e9 100644 --- a/drivers/ras/cec.c +++ b/drivers/ras/cec.c @@ -309,7 +309,7 @@ static bool sanity_check(struct ce_array *ca) return ret; } -int cec_add_elem(u64 pfn) +static int cec_add_elem(u64 pfn) { struct ce_array *ca = &ce_arr; unsigned int to = 0; @@ -527,7 +527,30 @@ static int __init create_debugfs_nodes(void) return 1; } -void __init cec_init(void) +static int cec_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + /* We eat only correctable DRAM errors with usable addresses. 
*/ + if (mce_is_memory_error(m) && + mce_is_correctable(m) && + mce_usable_address(m)) + if (!cec_add_elem(m->addr >> PAGE_SHIFT)) + return NOTIFY_STOP; + + return NOTIFY_DONE; +} + +static struct notifier_block cec_nb = { + .notifier_call = cec_notifier, + .priority = MCE_PRIO_CEC, +}; + +static void __init cec_init(void) { if (ce_arr.disabled) return; @@ -546,8 +569,11 @@ void __init cec_init(void) INIT_DELAYED_WORK(&cec_work, cec_work_fn); schedule_delayed_work(&cec_work, CEC_DECAY_DEFAULT_INTERVAL); + mce_register_decode_chain(&cec_nb); + pr_info("Correctable Errors collector initialized.\n"); } +late_initcall(cec_init); int __init parse_cec_param(char *str) { diff --git a/include/linux/ras.h b/include/linux/ras.h index 7c3debb47c87ab..1f4048bf2674de 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -17,12 +17,7 @@ static inline int ras_add_daemon_trace(void) { return 0; } #endif #ifdef CONFIG_RAS_CEC -void __init cec_init(void); int __init parse_cec_param(char *str); -int cec_add_elem(u64 pfn); -#else -static inline void __init cec_init(void) { } -static inline int cec_add_elem(u64 pfn) { return -ENODEV; } #endif #ifdef CONFIG_RAS From 1de08dccd383482a3e88845d3554094d338f5ff9 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:16 -0800 Subject: [PATCH 011/190] x86/mce: Add a struct mce.kflags field There can be many different subsystems register on the mce handler chain. Add a new bitmask field and define values so that handlers can indicate whether they took any action to log or otherwise handle an error. The default handler at the end of the chain can use this information to decide whether to print to the console log. Boris suggested a generic name and leaving plenty of spare bits for possible future use. [ bp: Move flag bits to the internal mce.h header and use BIT_ULL(). ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-4-tony.luck@intel.com --- arch/x86/include/asm/mce.h | 8 ++++++++ arch/x86/include/uapi/asm/mce.h | 1 + 2 files changed, 9 insertions(+) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 689ac6e9c65f07..5f04a24f30eaa9 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -129,6 +129,14 @@ #define XEC(x, mask) (((x) >> 16) & mask) +/* mce.kflags flag bits for logging etc. */ +#define MCE_HANDLED_CEC BIT_ULL(0) +#define MCE_HANDLED_UC BIT_ULL(1) +#define MCE_HANDLED_EXTLOG BIT_ULL(2) +#define MCE_HANDLED_NFIT BIT_ULL(3) +#define MCE_HANDLED_EDAC BIT_ULL(4) +#define MCE_HANDLED_MCELOG BIT_ULL(5) + /* * This structure contains all data related to the MCE log. Also * carries a signature to make it easier to find from external diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 955c2a2e1cf9d2..5b59d80f1d4e1e 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -35,6 +35,7 @@ struct mce { __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ __u64 ppin; /* Protected Processor Inventory Number */ __u32 microcode; /* Microcode revision */ + __u64 kflags; /* Internal kernel use. 
See below */ }; #define MCE_GET_RECORD_LEN _IOR('M', 1, int) From 23ba710a0864108910c7531dc4c73ef65eca5568 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:17 -0800 Subject: [PATCH 012/190] x86/mce: Fix all mce notifiers to update the mce->kflags bitmask If the handler took any action to log or deal with the error, set a bit in mce->kflags so that the default handler on the end of the machine check chain can see what has been done. Get rid of NOTIFY_STOP returns. Make the EDAC and dev-mcelog handlers skip over errors already processed by CEC. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-5-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 4 +++- arch/x86/kernel/cpu/mce/dev-mcelog.c | 5 +++++ drivers/acpi/acpi_extlog.c | 5 +++-- drivers/acpi/nfit/mce.c | 1 + drivers/edac/i7core_edac.c | 5 +++-- drivers/edac/mce_amd.c | 6 +++++- drivers/edac/pnd2_edac.c | 5 +++-- drivers/edac/sb_edac.c | 5 ++++- drivers/edac/skx_common.c | 4 ++++ drivers/ras/cec.c | 9 ++++++--- 10 files changed, 37 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b033b35896300b..5666a48a4bc9a0 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -581,8 +581,10 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, return NOTIFY_DONE; pfn = mce->addr >> PAGE_SHIFT; - if (!memory_failure(pfn, 0)) + if (!memory_failure(pfn, 0)) { set_mce_nospec(pfn); + mce->kflags |= MCE_HANDLED_UC; + } return NOTIFY_OK; } diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index d089567a9ce822..c033e7ea9e3c37 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -39,6 +39,9 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, struct mce *mce = (struct mce *)data; unsigned int entry; + if (mce->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; + mutex_lock(&mce_chrdev_read_mutex); entry = mcelog->next; @@ -56,6 +59,7 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, memcpy(mcelog->entry + entry, mce, sizeof(struct mce)); mcelog->entry[entry].finished = 1; + mcelog->entry[entry].kflags = 0; /* wake processes polling /dev/mcelog */ wake_up_interruptible(&mce_chrdev_wait); @@ -63,6 +67,7 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, unlock: mutex_unlock(&mce_chrdev_read_mutex); + mce->kflags |= MCE_HANDLED_MCELOG; return NOTIFY_OK; } diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index 8596a106a933c1..9cc3c1f92db561 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -146,7 +146,7 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, static u32 err_seq; estatus = extlog_elog_entry_check(cpu, bank); - if (estatus == NULL) + if (estatus == NULL || (mce->kflags & MCE_HANDLED_CEC)) return NOTIFY_DONE; memcpy(elog_buf, (void *)estatus, ELOG_ENTRY_LEN); @@ -176,7 +176,8 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, } out: - return NOTIFY_STOP; + mce->kflags |= MCE_HANDLED_EXTLOG; + return NOTIFY_OK; } static bool __init extlog_get_l1addr(void) diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index f0ae48515b48fc..ee8d9973f60b88 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -76,6 +76,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long 
val, */ acpi_nfit_ars_rescan(acpi_desc, 0); } + mce->kflags |= MCE_HANDLED_NFIT; break; } diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index b3135b208f9a0e..5860ca41185cf1 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c @@ -1815,7 +1815,7 @@ static int i7core_mce_check_error(struct notifier_block *nb, unsigned long val, struct mem_ctl_info *mci; i7_dev = get_i7core_dev(mce->socketid); - if (!i7_dev) + if (!i7_dev || (mce->kflags & MCE_HANDLED_CEC)) return NOTIFY_DONE; mci = i7_dev->mci; @@ -1834,7 +1834,8 @@ static int i7core_mce_check_error(struct notifier_block *nb, unsigned long val, i7core_check_error(mci, mce); /* Advise mcelog that the errors were handled */ - return NOTIFY_STOP; + mce->kflags |= MCE_HANDLED_EDAC; + return NOTIFY_OK; } static struct notifier_block i7_mce_dec = { diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index e58644d9c92b7f..2b5401db56ad9d 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1046,6 +1046,9 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) unsigned int fam = x86_family(m->cpuid); int ecc; + if (m->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; + pr_emerg(HW_ERR "%s\n", decode_error_status(m)); pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", @@ -1146,7 +1149,8 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) err_code: amd_decode_err_code(m->status & 0xffff); - return NOTIFY_STOP; + m->kflags |= MCE_HANDLED_EDAC; + return NOTIFY_OK; } static struct notifier_block amd_mce_dec_nb = { diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index bc47328eb48588..1929a5dc8f94d4 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -1400,7 +1400,7 @@ static int pnd2_mce_check_error(struct notifier_block *nb, unsigned long val, vo return NOTIFY_DONE; mci = pnd2_mci; - if (!mci) + if (!mci || (mce->kflags & MCE_HANDLED_CEC)) return NOTIFY_DONE; /* @@ -1429,7 +1429,8 @@ static int pnd2_mce_check_error(struct notifier_block *nb, unsigned long val, vo pnd2_mce_output_error(mci, mce, &daddr); /* Advice mcelog that the error were handled */ - return NOTIFY_STOP; + mce->kflags |= MCE_HANDLED_EDAC; + return NOTIFY_OK; } static struct notifier_block pnd2_mce_dec = { diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 7d51c82be62ba3..f790f7d086880e 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -3136,6 +3136,8 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, if (edac_get_report_status() == EDAC_REPORTING_DISABLED) return NOTIFY_DONE; + if (mce->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; /* * Just let mcelog handle it if the error is @@ -3183,7 +3185,8 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, sbridge_mce_output_error(mci, mce); /* Advice mcelog that the error were handled */ - return NOTIFY_STOP; + mce->kflags |= MCE_HANDLED_EDAC; + return NOTIFY_OK; } static struct notifier_block sbridge_mce_dec = { diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 99bbaf629b8d90..6f08a12f6b1103 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -577,6 +577,9 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, if (edac_get_report_status() == EDAC_REPORTING_DISABLED) return NOTIFY_DONE; + if (mce->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; + /* ignore unless this is memory related with an address */ if ((mce->status & 
0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV)) return NOTIFY_DONE; @@ -616,6 +619,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, skx_mce_output_error(mci, mce, &res); + mce->kflags |= MCE_HANDLED_EDAC; return NOTIFY_DONE; } diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c index 6b42040bf956e9..569d9ad2c59428 100644 --- a/drivers/ras/cec.c +++ b/drivers/ras/cec.c @@ -538,9 +538,12 @@ static int cec_notifier(struct notifier_block *nb, unsigned long val, /* We eat only correctable DRAM errors with usable addresses. */ if (mce_is_memory_error(m) && mce_is_correctable(m) && - mce_usable_address(m)) - if (!cec_add_elem(m->addr >> PAGE_SHIFT)) - return NOTIFY_STOP; + mce_usable_address(m)) { + if (!cec_add_elem(m->addr >> PAGE_SHIFT)) { + m->kflags |= MCE_HANDLED_CEC; + return NOTIFY_OK; + } + } return NOTIFY_DONE; } From 925946cfa715a5a71639528f82b98e58f14dd4cb Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:18 -0800 Subject: [PATCH 013/190] x86/mce: Change default MCE logger to check mce->kflags Instead of keeping count of how many handlers are registered on the MCE notifier chain and printing if below some magic value, look at mce->kflags to see if anyone claims to have handled/logged this error. [ bp: Do not print ->kflags in __print_mce(). ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-6-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 5666a48a4bc9a0..fc879b6669d5fb 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -158,29 +158,17 @@ void mce_log(struct mce *m) } EXPORT_SYMBOL_GPL(mce_log); -/* - * We run the default notifier if we have only the UC, the first and the - * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS - * notifiers registered on the chain. - */ -#define NUM_DEFAULT_NOTIFIERS 3 -static atomic_t num_notifiers; - void mce_register_decode_chain(struct notifier_block *nb) { if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) return; - atomic_inc(&num_notifiers); - blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); void mce_unregister_decode_chain(struct notifier_block *nb) { - atomic_dec(&num_notifiers); - blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); @@ -263,6 +251,7 @@ static void __print_mce(struct mce *m) } pr_cont("\n"); + /* * Note this output is parsed by external tools and old fields * should not be changed. @@ -602,10 +591,8 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS) - return NOTIFY_DONE; - - __print_mce(m); + if (!m->kflags) + __print_mce(m); return NOTIFY_DONE; } From 43505646941bee217b91d064756975aa1ab6ee3b Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:19 -0800 Subject: [PATCH 014/190] x86/mce: Add mce=print_all option Sometimes, when logs are getting lost, it's nice to just have everything dumped to the serial console. 
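For example, the option can be set on the kernel command line at boot, and the patch below also exposes it as a writable sysfs attribute on the per-CPU machinecheck devices, so it can be toggled at runtime (sysfs path assumed to follow the existing machinecheck attributes):

  mce=print_all
  echo 1 > /sys/devices/system/machinecheck/machinecheck0/print_all

With it set, the default notifier prints every record even when another notifier on the chain has already marked it as handled in mce->kflags.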
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-7-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 7 ++++++- arch/x86/kernel/cpu/mce/internal.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index fc879b6669d5fb..4efe6c12888780 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -591,7 +591,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - if (!m->kflags) + if (mca_cfg.print_all || !m->kflags) __print_mce(m); return NOTIFY_DONE; @@ -1962,6 +1962,7 @@ void mce_disable_bank(int bank) * mce=no_cmci Disables CMCI * mce=no_lmce Disables LMCE * mce=dont_log_ce Clears corrected events silently, no log created for CEs. + * mce=print_all Print all machine check logs to console * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) * monarchtimeout is how long to wait for other CPUs on machine @@ -1990,6 +1991,8 @@ static int __init mcheck_enable(char *str) cfg->lmce_disabled = 1; else if (!strcmp(str, "dont_log_ce")) cfg->dont_log_ce = true; + else if (!strcmp(str, "print_all")) + cfg->print_all = true; else if (!strcmp(str, "ignore_ce")) cfg->ignore_ce = true; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) @@ -2256,6 +2259,7 @@ static ssize_t store_int_with_restart(struct device *s, static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); +static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all); static struct dev_ext_attribute dev_attr_check_interval = { __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), @@ -2280,6 +2284,7 @@ static struct device_attribute *mce_device_attrs[] = { #endif &dev_attr_monarch_timeout.attr, &dev_attr_dont_log_ce.attr, + &dev_attr_print_all.attr, &dev_attr_ignore_ce.attr, &dev_attr_cmci_disabled.attr, NULL diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 74a01829c4f4a8..55f5c7b755f236 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -119,6 +119,7 @@ struct mca_config { bool dont_log_ce; bool cmci_disabled; bool ignore_ce; + bool print_all; __u64 lmce_disabled : 1, disabled : 1, From 7fc0b9b995f222646ece8d5bca528060c098ee88 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:20 -0800 Subject: [PATCH 015/190] EDAC: Drop the EDAC report status checks When acpi_extlog was added, we were worried that the same error would be reported more than once by different subsystems. But in the ensuing years I've seen complaints that people could not find an error log (because this mechanism suppressed the log they were looking for). Rip it all out. People are smart enough to notice the same address from different reporting mechanisms. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-8-tony.luck@intel.com --- drivers/acpi/acpi_extlog.c | 14 --------- drivers/edac/edac_mc.c | 61 -------------------------------------- drivers/edac/pnd2_edac.c | 3 -- drivers/edac/sb_edac.c | 4 --- drivers/edac/skx_common.c | 3 -- include/linux/edac.h | 8 ----- 6 files changed, 93 deletions(-) diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index 9cc3c1f92db561..f138e12b7b8237 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -42,8 +42,6 @@ struct extlog_l1_head { u8 rev1[12]; }; -static int old_edac_report_status; - static u8 extlog_dsm_uuid[] __initdata = "663E35AF-CC10-41A4-88EA-5470AF055295"; /* L1 table related physical address */ @@ -229,11 +227,6 @@ static int __init extlog_init(void) if (!(cap & MCG_ELOG_P) || !extlog_get_l1addr()) return -ENODEV; - if (edac_get_report_status() == EDAC_REPORTING_FORCE) { - pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n"); - return -EPERM; - } - rc = -EINVAL; /* get L1 header to fetch necessary information */ l1_hdr_size = sizeof(struct extlog_l1_head); @@ -281,12 +274,6 @@ static int __init extlog_init(void) if (elog_buf == NULL) goto err_release_elog; - /* - * eMCA event report method has higher priority than EDAC method, - * unless EDAC event report method is mandatory. - */ - old_edac_report_status = edac_get_report_status(); - edac_set_report_status(EDAC_REPORTING_DISABLED); mce_register_decode_chain(&extlog_mce_dec); /* enable OS to be involved to take over management from BIOS */ ((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN; @@ -308,7 +295,6 @@ static int __init extlog_init(void) static void __exit extlog_exit(void) { - edac_set_report_status(old_edac_report_status); mce_unregister_decode_chain(&extlog_mce_dec); ((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN; if (extlog_l1_addr) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 75ede27bdf6aa5..5813e931f2f00f 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -43,8 +43,6 @@ int edac_op_state = EDAC_OPSTATE_INVAL; EXPORT_SYMBOL_GPL(edac_op_state); -static int edac_report = EDAC_REPORTING_ENABLED; - /* lock to memory controller's control array */ static DEFINE_MUTEX(mem_ctls_mutex); static LIST_HEAD(mc_devices); @@ -60,65 +58,6 @@ static struct mem_ctl_info *error_desc_to_mci(struct edac_raw_error_desc *e) return container_of(e, struct mem_ctl_info, error_desc); } -int edac_get_report_status(void) -{ - return edac_report; -} -EXPORT_SYMBOL_GPL(edac_get_report_status); - -void edac_set_report_status(int new) -{ - if (new == EDAC_REPORTING_ENABLED || - new == EDAC_REPORTING_DISABLED || - new == EDAC_REPORTING_FORCE) - edac_report = new; -} -EXPORT_SYMBOL_GPL(edac_set_report_status); - -static int edac_report_set(const char *str, const struct kernel_param *kp) -{ - if (!str) - return -EINVAL; - - if (!strncmp(str, "on", 2)) - edac_report = EDAC_REPORTING_ENABLED; - else if (!strncmp(str, "off", 3)) - edac_report = EDAC_REPORTING_DISABLED; - else if (!strncmp(str, "force", 5)) - edac_report = EDAC_REPORTING_FORCE; - - return 0; -} - -static int edac_report_get(char *buffer, const struct kernel_param *kp) -{ - int ret = 0; - - switch (edac_report) { - case EDAC_REPORTING_ENABLED: - ret = sprintf(buffer, "on"); - break; - case EDAC_REPORTING_DISABLED: - ret = sprintf(buffer, "off"); - break; - case EDAC_REPORTING_FORCE: - ret = 
sprintf(buffer, "force"); - break; - default: - ret = -EINVAL; - break; - } - - return ret; -} - -static const struct kernel_param_ops edac_report_ops = { - .set = edac_report_set, - .get = edac_report_get, -}; - -module_param_cb(edac_report, &edac_report_ops, &edac_report, 0644); - unsigned int edac_dimm_info_location(struct dimm_info *dimm, char *buf, unsigned int len) { diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index 1929a5dc8f94d4..c1f2e6deb021ad 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -1396,9 +1396,6 @@ static int pnd2_mce_check_error(struct notifier_block *nb, unsigned long val, vo struct dram_addr daddr; char *type; - if (edac_get_report_status() == EDAC_REPORTING_DISABLED) - return NOTIFY_DONE; - mci = pnd2_mci; if (!mci || (mce->kflags & MCE_HANDLED_CEC)) return NOTIFY_DONE; diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index f790f7d086880e..d414698ca32421 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -3134,8 +3134,6 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, struct mem_ctl_info *mci; char *type; - if (edac_get_report_status() == EDAC_REPORTING_DISABLED) - return NOTIFY_DONE; if (mce->kflags & MCE_HANDLED_CEC) return NOTIFY_DONE; @@ -3526,8 +3524,6 @@ static int __init sbridge_init(void) if (rc >= 0) { mce_register_decode_chain(&sbridge_mce_dec); - if (edac_get_report_status() == EDAC_REPORTING_DISABLED) - sbridge_printk(KERN_WARNING, "Loading driver, error reporting disabled.\n"); return 0; } diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 6f08a12f6b1103..423d33aef54f0b 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -574,9 +574,6 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, struct mem_ctl_info *mci; char *type; - if (edac_get_report_status() == EDAC_REPORTING_DISABLED) - return NOTIFY_DONE; - if (mce->kflags & MCE_HANDLED_CEC) return NOTIFY_DONE; diff --git a/include/linux/edac.h b/include/linux/edac.h index 0f20b986b0abd0..6eb7d55d7c3d24 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -31,14 +31,6 @@ struct device; extern int edac_op_state; struct bus_type *edac_get_sysfs_subsys(void); -int edac_get_report_status(void); -void edac_set_report_status(int new); - -enum { - EDAC_REPORTING_ENABLED, - EDAC_REPORTING_DISABLED, - EDAC_REPORTING_FORCE -}; static inline void opstate_init(void) { From 1df73b2131e3b33d518609769636b41ce00212de Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 7 Apr 2020 13:49:58 +0200 Subject: [PATCH 016/190] x86/mce: Fixup exception only for the correct MCEs The severity grading code returns IN_KERNEL_RECOV error context for errors which have happened in kernel space but from which the kernel can recover. Whether the recovery can happen is determined by the exception table entry having as handler ex_handler_fault() and which has been declared at build time using _ASM_EXTABLE_FAULT(). IN_KERNEL_RECOV is used in mce_severity_intel() to lookup the corresponding error severity in the severities table. However, the mapping back from error severity to whether the error is IN_KERNEL_RECOV is ambiguous and in the very paranoid case - which might not be possible right now - but be better safe than sorry later, an exception fixup could be attempted for another MCE whose address is in the exception table and has the proper severity. Which would be unfortunate, to say the least. 
Therefore, mark such MCEs explicitly as MCE_IN_KERNEL_RECOV so that the recovery attempt is done only for them. Document the whole handling, while at it, as it is not trivial. Reported-by: Thomas Gleixner Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200407163414.18058-10-bp@alien8.de --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mce/core.c | 15 +++++++++++++-- arch/x86/kernel/cpu/mce/severity.c | 6 +++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5f04a24f30eaa9..c598aaab071b26 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -136,6 +136,7 @@ #define MCE_HANDLED_NFIT BIT_ULL(3) #define MCE_HANDLED_EDAC BIT_ULL(4) #define MCE_HANDLED_MCELOG BIT_ULL(5) +#define MCE_IN_KERNEL_RECOV BIT_ULL(6) /* * This structure contains all data related to the MCE log. Also diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 4efe6c12888780..02e1f165f14838 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1331,8 +1331,19 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) local_irq_disable(); ist_end_non_atomic(); } else { - if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) - mce_panic("Failed kernel mode recovery", &m, msg); + /* + * Handle an MCE which has happened in kernel space but from + * which the kernel can recover: ex_has_fault_handler() has + * already verified that the rIP at which the error happened is + * a rIP from which the kernel can recover (by jumping to + * recovery code specified in _ASM_EXTABLE_FAULT()) and the + * corresponding exception handler which would do that is the + * proper one. + */ + if (m.kflags & MCE_IN_KERNEL_RECOV) { + if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) + mce_panic("Failed kernel mode recovery", &m, msg); + } } out_ist: diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 87bcdc6dc2f0c5..e1da619add1921 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -213,8 +213,12 @@ static int error_context(struct mce *m) { if ((m->cs & 3) == 3) return IN_USER; - if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) + + if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) { + m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; + } + return IN_KERNEL; } From f82cdff1aa7f6dcf6625c7219342a6e5c977098d Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 15 Apr 2020 12:58:26 -0700 Subject: [PATCH 017/190] x86/mce: Drop bogus comment about mce.kflags The bit definitions for kflags are for internal use only. A late edit moved them from uapi/asm/mce.h to the internal x86 , but the comment saying "See below" was accidentally left here. Delete "See below". Just labelling this field as internal kernel use is sufficient. 
Fixes: 1de08dccd383 ("x86/mce: Add a struct mce.kflags field") Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200415195826.GA13681@agluck-desk2.amr.corp.intel.com --- arch/x86/include/uapi/asm/mce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 5b59d80f1d4e1e..db9adc081c5af3 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -35,7 +35,7 @@ struct mce { __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ __u64 ppin; /* Protected Processor Inventory Number */ __u32 microcode; /* Microcode revision */ - __u64 kflags; /* Internal kernel use. See below */ + __u64 kflags; /* Internal kernel use */ }; #define MCE_GET_RECORD_LEN _IOR('M', 1, int) From 3b4ff4eb904fef04c36b39052ca8eb31fa41fad0 Mon Sep 17 00:00:00 2001 From: He Zhe Date: Wed, 4 Mar 2020 14:39:07 +0800 Subject: [PATCH 018/190] x86/mcelog: Add compat_ioctl for 32-bit mcelog support A 32-bit version of mcelog issuing ioctls on /dev/mcelog causes errors like the following: MCE_GET_RECORD_LEN: Inappropriate ioctl for device This is due to a missing compat_ioctl callback. Assign to it compat_ptr_ioctl() as a generic implementation of the .compat_ioctl file operation to ioctl functions that either ignore the argument or pass a pointer to a compatible data type. [ bp: Massage commit message. ] Signed-off-by: He Zhe Signed-off-by: Borislav Petkov Acked-by: Tony Luck Link: https://lkml.kernel.org/r/1583303947-49858-1-git-send-email-zhe.he@windriver.com --- arch/x86/kernel/cpu/mce/dev-mcelog.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index c033e7ea9e3c37..a4fd5287f02f06 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -329,6 +329,7 @@ static const struct file_operations mce_chrdev_ops = { .write = mce_chrdev_write, .poll = mce_chrdev_poll, .unlocked_ioctl = mce_chrdev_ioctl, + .compat_ioctl = compat_ptr_ioctl, .llseek = no_llseek, }; From 0858caa419e6cf9d31e734d09d70b34f64443ef6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2020 13:58:35 +0000 Subject: [PATCH 019/190] uapi: General notification queue definitions Add UAPI definitions for the general notification queue, including the following pieces: (*) struct watch_notification. This is the metadata header for notification messages. It includes a type and subtype that indicate the source of the message (eg. WATCH_TYPE_MOUNT_NOTIFY) and the kind of the message (eg. NOTIFY_MOUNT_NEW_MOUNT). The header also contains an information field that conveys the following information: - WATCH_INFO_LENGTH. The size of the entry (entries are variable length). - WATCH_INFO_ID. The watch ID specified when the watchpoint was set. - WATCH_INFO_TYPE_INFO. (Sub)type-specific information. - WATCH_INFO_FLAG_*. Flag bits overlain on the type-specific information. For use by the type. All the information in the header can be used in filtering messages at the point of writing into the buffer. (*) struct watch_notification_removal This is an extended watch-removal notification record that includes an 'id' field that can indicate the identifier of the object being removed if available (for instance, a keyring serial number). 
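To make the header layout concrete, here is a hedged userspace sketch (not part of the patch) that unpacks an incoming record using only the masks and shifts defined above; describe_note() is a hypothetical helper:

	#include <stdio.h>
	#include <linux/watch_queue.h>

	static void describe_note(const struct watch_notification *n)
	{
		unsigned int len  = (n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT;
		unsigned int id   = (n->info & WATCH_INFO_ID) >> WATCH_INFO_ID__SHIFT;
		unsigned int tinf = (n->info & WATCH_INFO_TYPE_INFO) >> WATCH_INFO_TYPE_INFO__SHIFT;

		printf("type=%u subtype=%u len=%u watch-id=%u type-info=%x\n",
		       n->type, n->subtype, len, id, tinf);
	}

The length is in bytes and covers the whole record, so a reader can use it to step from one variable-length message to the next.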
Signed-off-by: David Howells --- include/uapi/linux/watch_queue.h | 55 ++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 include/uapi/linux/watch_queue.h diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h new file mode 100644 index 00000000000000..5f3d21e8a34b0a --- /dev/null +++ b/include/uapi/linux/watch_queue.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_WATCH_QUEUE_H +#define _UAPI_LINUX_WATCH_QUEUE_H + +#include <linux/types.h> + +enum watch_notification_type { + WATCH_TYPE_META = 0, /* Special record */ + WATCH_TYPE__NR = 1 +}; + +enum watch_meta_notification_subtype { + WATCH_META_REMOVAL_NOTIFICATION = 0, /* Watched object was removed */ + WATCH_META_LOSS_NOTIFICATION = 1, /* Data loss occurred */ +}; + +/* + * Notification record header. This is aligned to 64-bits so that subclasses + * can contain __u64 fields. + */ +struct watch_notification { + __u32 type:24; /* enum watch_notification_type */ + __u32 subtype:8; /* Type-specific subtype (filterable) */ + __u32 info; +#define WATCH_INFO_LENGTH 0x0000007f /* Length of record */ +#define WATCH_INFO_LENGTH__SHIFT 0 +#define WATCH_INFO_ID 0x0000ff00 /* ID of watchpoint */ +#define WATCH_INFO_ID__SHIFT 8 +#define WATCH_INFO_TYPE_INFO 0xffff0000 /* Type-specific info */ +#define WATCH_INFO_TYPE_INFO__SHIFT 16 +#define WATCH_INFO_FLAG_0 0x00010000 /* Type-specific info, flag bit 0 */ +#define WATCH_INFO_FLAG_1 0x00020000 /* ... */ +#define WATCH_INFO_FLAG_2 0x00040000 +#define WATCH_INFO_FLAG_3 0x00080000 +#define WATCH_INFO_FLAG_4 0x00100000 +#define WATCH_INFO_FLAG_5 0x00200000 +#define WATCH_INFO_FLAG_6 0x00400000 +#define WATCH_INFO_FLAG_7 0x00800000 +}; + + +/* + * Extended watch removal notification. This is used optionally if the type + * wants to indicate an identifier for the object being watched, if there is + * such. This can be distinguished by the length. + * + * type -> WATCH_TYPE_META + * subtype -> WATCH_META_REMOVAL_NOTIFICATION + */ +struct watch_notification_removal { + struct watch_notification watch; + __u64 id; /* Type-dependent identifier */ +}; + +#endif /* _UAPI_LINUX_WATCH_QUEUE_H */ From 344fa64ef8f6740e99b32ab788b6e3742d7284b3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2020 13:58:35 +0000 Subject: [PATCH 020/190] security: Add a hook for the point of notification insertion Add a security hook that allows an LSM to rule on whether a notification message is allowed to be inserted into a particular watch queue. The hook is given the following information: (1) The credentials of the triggerer (which may be init_cred for a system notification, eg. a hardware error). (2) The credentials of whoever set the watch. (3) The notification message.
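For illustration, a hedged sketch of what an LSM's implementation of this hook might look like; the "example" LSM and its toy policy (only same-EUID posting) are hypothetical:

	#include <linux/lsm_hooks.h>
	#include <linux/cred.h>
	#include <linux/watch_queue.h>

	static int example_post_notification(const struct cred *w_cred,
					     const struct cred *cred,
					     struct watch_notification *n)
	{
		/* Toy policy: only queues owned by the triggerer's EUID
		 * may receive this notification.
		 */
		return uid_eq(cred->euid, w_cred->euid) ? 0 : -EPERM;
	}

	static struct security_hook_list example_hooks[] = {
		LSM_HOOK_INIT(post_notification, example_post_notification),
	};

The hooks array would be registered with security_add_hooks() during LSM init. A nonzero return suppresses the message for that one queue only; other queues watching the same source are still considered.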
Signed-off-by: David Howells Acked-by: James Morris cc: Casey Schaufler cc: Stephen Smalley cc: linux-security-module@vger.kernel.org --- include/linux/lsm_hook_defs.h | 5 +++++ include/linux/lsm_hooks.h | 9 +++++++++ include/linux/security.h | 15 +++++++++++++++ security/security.c | 9 +++++++++ 4 files changed, 38 insertions(+) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 5616b2567aa7fb..e0def45b5cc562 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -253,6 +253,11 @@ LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen) LSM_HOOK(int, 0, inode_getsecctx, struct inode *inode, void **ctx, u32 *ctxlen) +#if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) +LSM_HOOK(int, 0, post_notification, const struct cred *w_cred, + const struct cred *cred, struct watch_notification *n) +#endif /* CONFIG_SECURITY && CONFIG_KEY_NOTIFICATIONS */ + #ifdef CONFIG_SECURITY_NETWORK LSM_HOOK(int, 0, unix_stream_connect, struct sock *sock, struct sock *other, struct sock *newsk) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 988ca0df7824c5..0b5e5769b83670 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1437,6 +1437,15 @@ * @ctx is a pointer in which to place the allocated security context. * @ctxlen points to the place to put the length of @ctx. * + * Security hooks for the general notification queue: + * + * @post_notification: + * Check to see if a watch notification can be posted to a particular + * queue. + * @w_cred: The credentials of the whoever set the watch. + * @cred: The event-triggerer's credentials + * @n: The notification being posted + * * Security hooks for using the eBPF maps and programs functionalities through * eBPF syscalls. 
* diff --git a/include/linux/security.h b/include/linux/security.h index a8d9310472dfad..9a5d12ab491bc9 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -56,6 +56,8 @@ struct mm_struct; struct fs_context; struct fs_parameter; enum fs_value_type; +struct watch; +struct watch_notification; /* Default (no) options for the capable function */ #define CAP_OPT_NONE 0x0 @@ -1275,6 +1277,19 @@ static inline int security_locked_down(enum lockdown_reason what) } #endif /* CONFIG_SECURITY */ +#if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) +int security_post_notification(const struct cred *w_cred, + const struct cred *cred, + struct watch_notification *n); +#else +static inline int security_post_notification(const struct cred *w_cred, + const struct cred *cred, + struct watch_notification *n) +{ + return 0; +} +#endif + #ifdef CONFIG_SECURITY_NETWORK int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk); diff --git a/security/security.c b/security/security.c index 7fed24b9d57e62..7d55607120b487 100644 --- a/security/security.c +++ b/security/security.c @@ -2007,6 +2007,15 @@ int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) } EXPORT_SYMBOL(security_inode_getsecctx); +#ifdef CONFIG_WATCH_QUEUE +int security_post_notification(const struct cred *w_cred, + const struct cred *cred, + struct watch_notification *n) +{ + return call_int_hook(post_notification, 0, w_cred, cred, n); +} +#endif /* CONFIG_WATCH_QUEUE */ + #ifdef CONFIG_SECURITY_NETWORK int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) From b580b93664f91db8cb503429030df0f1c1e53528 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2020 13:58:35 +0000 Subject: [PATCH 021/190] pipe: Add O_NOTIFICATION_PIPE Add an O_NOTIFICATION_PIPE flag that can be passed to pipe2() to indicate that the pipe being created is going to be used for notifications. This suppresses the use of splice(), vmsplice(), tee() and sendfile() on the pipe as calling iov_iter_revert() on a pipe when a kernel notification message has been inserted into the middle of a multi-buffer splice will be messy. The flag is given the same value as O_EXCL as it seems unlikely that this flag will ever be applicable to pipes and I don't want to use up another O_* bit unnecessarily. An alternative could be to add a pipe3() system call. Signed-off-by: David Howells --- include/uapi/linux/watch_queue.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h index 5f3d21e8a34b0a..9df72227f5154e 100644 --- a/include/uapi/linux/watch_queue.h +++ b/include/uapi/linux/watch_queue.h @@ -3,6 +3,9 @@ #define _UAPI_LINUX_WATCH_QUEUE_H #include +#include + +#define O_NOTIFICATION_PIPE O_EXCL /* Parameter to pipe2() selecting notification pipe */ enum watch_notification_type { WATCH_TYPE_META = 0, /* Special record */ From c73be61cede5882f9605a852414db559c0ebedfd Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:11 +0000 Subject: [PATCH 022/190] pipe: Add general notification queue support Make it possible to have a general notification queue built on top of a standard pipe. Notifications are 'spliced' into the pipe and then read out. splice(), vmsplice() and sendfile() are forbidden on pipes used for notifications as post_one_notification() cannot take pipe->mutex. 
This means that notifications could be posted in between individual pipe buffers, making iov_iter_revert() difficult to effect. The way the notification queue is used is: (1) An application opens a pipe with a special flag and indicates the number of messages it wishes to be able to queue at once (this can only be set once): pipe2(fds, O_NOTIFICATION_PIPE); ioctl(fds[0], IOC_WATCH_QUEUE_SET_SIZE, queue_depth); (2) The application then uses poll() and read() as normal to extract data from the pipe. read() will return multiple notifications if the buffer is big enough, but it will not split a notification across buffers - rather it will return a short read or EMSGSIZE. Notification messages include a length in the header so that the caller can split them up. Each message has a header that describes it: struct watch_notification { __u32 type:24; __u32 subtype:8; __u32 info; }; The type indicates the source (eg. mount tree changes, superblock events, keyring changes, block layer events) and the subtype indicates the event type (eg. mount, unmount; EIO, EDQUOT; link, unlink). The info field indicates a number of things, including the entry length, an ID assigned to a watchpoint contributing to this buffer and type-specific flags. Supplementary data, such as the key ID that generated an event, can be attached in additional slots. The maximum message size is 127 bytes. Messages may not be padded or aligned, so there is no guarantee, for example, that the notification type will be on a 4-byte boundary. Signed-off-by: David Howells --- .../userspace-api/ioctl/ioctl-number.rst | 1 + Documentation/watch_queue.rst | 339 +++++++++ fs/pipe.c | 206 ++++-- fs/splice.c | 12 +- include/linux/pipe_fs_i.h | 19 +- include/linux/watch_queue.h | 127 ++++ include/uapi/linux/watch_queue.h | 20 + init/Kconfig | 12 + kernel/Makefile | 1 + kernel/watch_queue.c | 657 ++++++++++++++++++ 10 files changed, 1318 insertions(+), 76 deletions(-) create mode 100644 Documentation/watch_queue.rst create mode 100644 include/linux/watch_queue.h create mode 100644 kernel/watch_queue.c diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index f759edafd938a5..9425377615ce48 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -201,6 +201,7 @@ Code Seq# Include File Comments 'W' 00-1F linux/wanrouter.h conflict! (pre 3.9) 'W' 00-3F sound/asound.h conflict! 'W' 40-5F drivers/pci/switch/switchtec.c +'W' 60-61 linux/watch_queue.h 'X' all fs/xfs/xfs_fs.h, conflict! fs/xfs/linux-2.6/xfs_ioctl32.h, include/linux/falloc.h, diff --git a/Documentation/watch_queue.rst b/Documentation/watch_queue.rst new file mode 100644 index 00000000000000..849fad6893efa6 --- /dev/null +++ b/Documentation/watch_queue.rst @@ -0,0 +1,339 @@ +============================== +General notification mechanism +============================== + +The general notification mechanism is built on top of the standard pipe driver +whereby it effectively splices notification messages from the kernel into pipes +opened by userspace. This can be used in conjunction with:: + + * Key/keyring notifications + + +The notification buffers can be enabled by: + + "General setup"/"General notification queue" + (CONFIG_WATCH_QUEUE) + +This document has the following sections: + +.. contents:: :local: + + +Overview +======== + +This facility appears as a pipe that is opened in a special mode.
The pipe's +internal ring buffer is used to hold messages that are generated by the kernel. +These messages are then read out by read(). Splice and similar are disabled on +such pipes due to them wanting to, under some circumstances, revert their +additions to the ring - which might end up interleaved with notification +messages. + +The owner of the pipe has to tell the kernel which sources it would like to +watch through that pipe. Only sources that have been connected to a pipe will +insert messages into it. Note that a source may be bound to multiple pipes and +insert messages into all of them simultaneously. + +Filters may also be emplaced on a pipe so that certain source types and +subevents can be ignored if they're not of interest. + +A message will be discarded if there isn't a slot available in the ring or if +no preallocated message buffer is available. In both of these cases, read() +will insert a WATCH_META_LOSS_NOTIFICATION message into the output buffer after +the last message currently in the buffer has been read. + +Note that when producing a notification, the kernel does not wait for the +consumers to collect it, but rather just continues on. This means that +notifications can be generated whilst spinlocks are held and also protects the +kernel from being held up indefinitely by a userspace malfunction. + + +Message Structure +================= + +Notification messages begin with a short header:: + + struct watch_notification { + __u32 type:24; + __u32 subtype:8; + __u32 info; + }; + +"type" indicates the source of the notification record and "subtype" indicates +the type of record from that source (see the Watch Sources section below). The +type may also be "WATCH_TYPE_META". This is a special record type generated +internally by the watch queue itself. There are two subtypes: + + * WATCH_META_REMOVAL_NOTIFICATION + * WATCH_META_LOSS_NOTIFICATION + +The first indicates that an object on which a watch was installed was removed +or destroyed and the second indicates that some messages have been lost. + +"info" indicates a bunch of things, including: + + * The length of the message in bytes, including the header (mask with + WATCH_INFO_LENGTH and shift by WATCH_INFO_LENGTH__SHIFT). This indicates + the size of the record, which may be between 8 and 127 bytes. + + * The watch ID (mask with WATCH_INFO_ID and shift by WATCH_INFO_ID__SHIFT). + This indicates the caller's ID of the watch, which may be between 0 + and 255. Multiple watches may share a queue, and this provides a means to + distinguish them. + + * A type-specific field (WATCH_INFO_TYPE_INFO). This is set by the + notification producer to indicate some meaning specific to the type and + subtype. + +Everything in info apart from the length can be used for filtering. + +The header can be followed by supplementary information. The format of this is +at the discretion of the type and subtype. + + +Watch List (Notification Source) API +==================================== + +A "watch list" is a list of watchers that are subscribed to a source of +notifications. A list may be attached to an object (say a key or a superblock) +or may be global (say for device events). From a userspace perspective, a +non-global watch list is typically referred to by reference to the object it +belongs to (such as using KEYCTL_NOTIFY and giving it a key serial number to +watch that specific key).
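By way of a hedged example (not in the patch), a hypothetical "foo" driver might embed a watch list in its object and post events through it using the helpers described below; WATCH_TYPE_FOO_NOTIFY stands in for a type value the driver would have to add to the UAPI enum:

	#include <linux/cred.h>
	#include <linux/watch_queue.h>

	#define WATCH_TYPE_FOO_NOTIFY 1	/* hypothetical type value */

	struct foo {
		struct watch_list watchers;	/* subscribers to this object */
		u64 id;				/* matched against watch->id */
	};

	static void foo_init(struct foo *foo, u64 id)
	{
		init_watch_list(&foo->watchers, NULL);
		foo->id = id;
	}

	static void foo_notify_event(struct foo *foo, u32 subtype)
	{
		struct watch_notification n = {
			.type	 = WATCH_TYPE_FOO_NOTIFY,
			.subtype = subtype,
			.info	 = watch_sizeof(struct watch_notification),
		};

		post_watch_notification(&foo->watchers, &n, current_cred(), foo->id);
	}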
+ +To manage a watch list, the following functions are provided: + + * ``void init_watch_list(struct watch_list *wlist, + void (*release_watch)(struct watch *wlist));`` + + Initialise a watch list. If ``release_watch`` is not NULL, then this + indicates a function that should be called when the watch_list object is + destroyed to discard any references the watch list holds on the watched + object. + + * ``void remove_watch_list(struct watch_list *wlist);`` + + This removes all of the watches subscribed to a watch_list and frees them + and then destroys the watch_list object itself. + + +Watch Queue (Notification Output) API +===================================== + +A "watch queue" is the buffer allocated by an application that notification +records will be written into. The workings of this are hidden entirely inside +of the pipe device driver, but it is necessary to gain a reference to it to set +a watch. These can be managed with: + + * ``struct watch_queue *get_watch_queue(int fd);`` + + Since watch queues are indicated to the kernel by the fd of the pipe that + implements the buffer, userspace must hand that fd through a system call. + This can be used to look up an opaque pointer to the watch queue from the + system call. + + * ``void put_watch_queue(struct watch_queue *wqueue);`` + + This discards the reference obtained from ``get_watch_queue()``. + + +Watch Subscription API +====================== + +A "watch" is a subscription on a watch list, indicating the watch queue, and +thus the buffer, into which notification records should be written. The watch +queue object may also carry filtering rules for that object, as set by +userspace. Some parts of the watch struct can be set by the driver:: + + struct watch { + union { + u32 info_id; /* ID to be OR'd in to info field */ + ... + }; + void *private; /* Private data for the watched object */ + u64 id; /* Internal identifier */ + ... + }; + +The ``info_id`` value should be an 8-bit number obtained from userspace and +shifted by WATCH_INFO_ID__SHIFT. This is OR'd into the WATCH_INFO_ID field of +struct watch_notification::info when and if the notification is written into +the associated watch queue buffer. + +The ``private`` field is the driver's data associated with the watch_list and +is cleaned up by the ``watch_list::release_watch()`` method. + +The ``id`` field is the source's ID. Notifications that are posted with a +different ID are ignored. + +The following functions are provided to manage watches: + + * ``void init_watch(struct watch *watch, struct watch_queue *wqueue);`` + + Initialise a watch object, setting its pointer to the watch queue, using + appropriate barriering to avoid lockdep complaints. + + * ``int add_watch_to_object(struct watch *watch, struct watch_list *wlist);`` + + Subscribe a watch to a watch list (notification source). The + driver-settable fields in the watch struct must have been set before this + is called. + + * ``int remove_watch_from_object(struct watch_list *wlist, + struct watch_queue *wqueue, + u64 id, false);`` + + Remove a watch from a watch list, where the watch must match the specified + watch queue (``wqueue``) and object identifier (``id``). A notification + (``WATCH_META_REMOVAL_NOTIFICATION``) is sent to the watch queue to + indicate that the watch got removed. + + * ``int remove_watch_from_object(struct watch_list *wlist, NULL, 0, true);`` + + Remove all the watches from a watch list. 
It is expected that this will be + called preparatory to destruction and that the watch list will be + inaccessible to new watches by this point. A notification + (``WATCH_META_REMOVAL_NOTIFICATION``) is sent to the watch queue of each + subscribed watch to indicate that the watch got removed. + + +Notification Posting API +======================== + +To post a notification to watch list so that the subscribed watches can see it, +the following function should be used:: + + void post_watch_notification(struct watch_list *wlist, + struct watch_notification *n, + const struct cred *cred, + u64 id); + +The notification should be preformatted and a pointer to the header (``n``) +should be passed in. The notification may be larger than this and the size in +units of buffer slots is noted in ``n->info & WATCH_INFO_LENGTH``. + +The ``cred`` struct indicates the credentials of the source (subject) and is +passed to the LSMs, such as SELinux, to allow or suppress the recording of the +note in each individual queue according to the credentials of that queue +(object). + +The ``id`` is the ID of the source object (such as the serial number on a key). +Only watches that have the same ID set in them will see this notification. + + +Watch Sources +============= + +Any particular buffer can be fed from multiple sources. Sources include: + + * WATCH_TYPE_KEY_NOTIFY + + Notifications of this type indicate changes to keys and keyrings, including + the changes of keyring contents or the attributes of keys. + + See Documentation/security/keys/core.rst for more information. + + +Event Filtering +=============== + +Once a watch queue has been created, a set of filters can be applied to limit +the events that are received using:: + + struct watch_notification_filter filter = { + ... + }; + ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) + +The filter description is a variable of type:: + + struct watch_notification_filter { + __u32 nr_filters; + __u32 __reserved; + struct watch_notification_type_filter filters[]; + }; + +Where "nr_filters" is the number of filters in filters[] and "__reserved" +should be 0. The "filters" array has elements of the following type:: + + struct watch_notification_type_filter { + __u32 type; + __u32 info_filter; + __u32 info_mask; + __u32 subtype_filter[8]; + }; + +Where: + + * ``type`` is the event type to filter for and should be something like + "WATCH_TYPE_KEY_NOTIFY" + + * ``info_filter`` and ``info_mask`` act as a filter on the info field of the + notification record. The notification is only written into the buffer if:: + + (watch.info & info_mask) == info_filter + + This could be used, for example, to ignore events that are not exactly on + the watched point in a mount tree. + + * ``subtype_filter`` is a bitmask indicating the subtypes that are of + interest. Bit 0 of subtype_filter[0] corresponds to subtype 0, bit 1 to + subtype 1, and so on. + +If the argument to the ioctl() is NULL, then the filters will be removed and +all events from the watched sources will come through. 
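As a hedged illustration (assuming the WATCH_TYPE_KEY_NOTIFY type added later in this series), a caller might install a filter that keeps only subtypes 0-2 of key events; a wrapper struct is used because filters[] is a flexible array member:

	#include <sys/ioctl.h>
	#include <linux/watch_queue.h>

	static int install_key_filter(int fd)
	{
		struct {
			struct watch_notification_filter f;
			struct watch_notification_type_filter tf[1];
		} filter = {
			.f.nr_filters		 = 1,
			.tf[0].type		 = WATCH_TYPE_KEY_NOTIFY,
			.tf[0].subtype_filter[0] = 0x07,	/* subtypes 0, 1 and 2 */
		};

		return ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter.f);
	}

Leaving info_filter and info_mask zero makes the info test match every record, since (info & 0) == 0 always holds.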
+ + +Userspace Code Example +====================== + +A buffer is created with something like the following:: + + pipe2(fds, O_NOTIFICATION_PIPE); + ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256); + +It can then be set to receive keyring change notifications:: + + keyctl(KEYCTL_WATCH_KEY, KEY_SPEC_SESSION_KEYRING, fds[1], 0x01); + +The notifications can then be consumed by something like the following:: + + static void consumer(int rfd) + { + unsigned char buffer[128]; + ssize_t buf_len; + + while (buf_len = read(rfd, buffer, sizeof(buffer)), + buf_len > 0 + ) { + void *p = buffer; + void *end = buffer + buf_len; + while (p < end) { + union { + struct watch_notification n; + unsigned char buf1[128]; + } n; + size_t largest, len; + + largest = end - p; + if (largest > 128) + largest = 128; + memcpy(&n, p, largest); + + len = (n.n.info & WATCH_INFO_LENGTH) >> + WATCH_INFO_LENGTH__SHIFT; + if (len == 0 || len > largest) + return; + + switch (n.n.type) { + case WATCH_TYPE_META: + got_meta(&n.n); + break; + case WATCH_TYPE_KEY_NOTIFY: + saw_key_change(&n.n); + break; + } + + p += len; + } + } + } diff --git a/fs/pipe.c b/fs/pipe.c index 16fb72e9abf7fd..da9bc1f21fd19a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -24,6 +24,7 @@ #include #include #include +#include <linux/watch_queue.h> #include #include @@ -459,6 +460,13 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) goto out; } +#ifdef CONFIG_WATCH_QUEUE + if (pipe->watch_queue) { + ret = -EXDEV; + goto out; + } +#endif + /* * Only wake up if the pipe started out empty, since * otherwise there should be no readers waiting. @@ -628,22 +636,37 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) int count, head, tail, mask; switch (cmd) { - case FIONREAD: - __pipe_lock(pipe); - count = 0; - head = pipe->head; - tail = pipe->tail; - mask = pipe->ring_size - 1; + case FIONREAD: + __pipe_lock(pipe); + count = 0; + head = pipe->head; + tail = pipe->tail; + mask = pipe->ring_size - 1; - while (tail != head) { - count += pipe->bufs[tail & mask].len; - tail++; - } - __pipe_unlock(pipe); + while (tail != head) { + count += pipe->bufs[tail & mask].len; + tail++; + } + __pipe_unlock(pipe); - return put_user(count, (int __user *)arg); - default: - return -ENOIOCTLCMD; + return put_user(count, (int __user *)arg); + +#ifdef CONFIG_WATCH_QUEUE + case IOC_WATCH_QUEUE_SET_SIZE: { + int ret; + __pipe_lock(pipe); + ret = watch_queue_set_size(pipe, arg); + __pipe_unlock(pipe); + return ret; + } + + case IOC_WATCH_QUEUE_SET_FILTER: + return watch_queue_set_filter( + pipe, (struct watch_notification_filter __user *)arg); +#endif + + default: + return -ENOIOCTLCMD; } } @@ -754,27 +777,27 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } -static unsigned long account_pipe_buffers(struct user_struct *user, - unsigned long old, unsigned long new) +unsigned long account_pipe_buffers(struct user_struct *user, + unsigned long old, unsigned long new) { return atomic_long_add_return(new - old, &user->pipe_bufs); } -static bool too_many_pipe_buffers_soft(unsigned long user_bufs) +bool too_many_pipe_buffers_soft(unsigned long user_bufs) { unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft); return soft_limit && user_bufs > soft_limit; } -static bool too_many_pipe_buffers_hard(unsigned long user_bufs) +bool too_many_pipe_buffers_hard(unsigned long user_bufs) { unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard); return hard_limit && user_bufs > hard_limit; } -static bool is_unprivileged_user(void) +bool
pipe_is_unprivileged_user(void) { return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); } @@ -796,12 +819,12 @@ struct pipe_inode_info *alloc_pipe_info(void) user_bufs = account_pipe_buffers(user, 0, pipe_bufs); - if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) { + if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) { user_bufs = account_pipe_buffers(user, pipe_bufs, 1); pipe_bufs = 1; } - if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user()) + if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user()) goto out_revert_acct; pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), @@ -813,6 +836,7 @@ struct pipe_inode_info *alloc_pipe_info(void) pipe->r_counter = pipe->w_counter = 1; pipe->max_usage = pipe_bufs; pipe->ring_size = pipe_bufs; + pipe->nr_accounted = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); return pipe; @@ -830,7 +854,14 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; - (void) account_pipe_buffers(pipe->user, pipe->ring_size, 0); +#ifdef CONFIG_WATCH_QUEUE + if (pipe->watch_queue) { + watch_queue_clear(pipe->watch_queue); + put_watch_queue(pipe->watch_queue); + } +#endif + + (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0); free_uid(pipe->user); for (i = 0; i < pipe->ring_size; i++) { struct pipe_buffer *buf = pipe->bufs + i; @@ -906,6 +937,17 @@ int create_pipe_files(struct file **res, int flags) if (!inode) return -ENFILE; + if (flags & O_NOTIFICATION_PIPE) { +#ifdef CONFIG_WATCH_QUEUE + if (watch_queue_init(inode->i_pipe) < 0) { + iput(inode); + return -ENOMEM; + } +#else + return -ENOPKG; +#endif + } + f = alloc_file_pseudo(inode, pipe_mnt, "", O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), &pipefifo_fops); @@ -936,7 +978,7 @@ static int __do_pipe_flags(int *fd, struct file **files, int flags) int error; int fdw, fdr; - if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) + if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) return -EINVAL; error = create_pipe_files(files, flags); @@ -1184,42 +1226,12 @@ unsigned int round_pipe_size(unsigned long size) } /* - * Allocate a new array of pipe buffers and copy the info over. Returns the - * pipe size if successful, or return -ERROR on error. + * Resize the pipe ring to a number of slots. */ -static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) { struct pipe_buffer *bufs; - unsigned int size, nr_slots, head, tail, mask, n; - unsigned long user_bufs; - long ret = 0; - - size = round_pipe_size(arg); - nr_slots = size >> PAGE_SHIFT; - - if (!nr_slots) - return -EINVAL; - - /* - * If trying to increase the pipe capacity, check that an - * unprivileged user is not trying to exceed various limits - * (soft limit check here, hard limit check just below). - * Decreasing the pipe capacity is always permitted, even - * if the user is currently over a limit. - */ - if (nr_slots > pipe->ring_size && - size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots); - - if (nr_slots > pipe->ring_size && - (too_many_pipe_buffers_hard(user_bufs) || - too_many_pipe_buffers_soft(user_bufs)) && - is_unprivileged_user()) { - ret = -EPERM; - goto out_revert_acct; - } + unsigned int head, tail, mask, n; /* * We can shrink the pipe, if arg is greater than the ring occupancy. 
@@ -1231,17 +1243,13 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) head = pipe->head; tail = pipe->tail; n = pipe_occupancy(pipe->head, pipe->tail); - if (nr_slots < n) { - ret = -EBUSY; - goto out_revert_acct; - } + if (nr_slots < n) + return -EBUSY; bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); - if (unlikely(!bufs)) { - ret = -ENOMEM; - goto out_revert_acct; - } + if (unlikely(!bufs)) + return -ENOMEM; /* * The pipe array wraps around, so just start the new one at zero @@ -1269,16 +1277,68 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) kfree(pipe->bufs); pipe->bufs = bufs; pipe->ring_size = nr_slots; - pipe->max_usage = nr_slots; + if (pipe->max_usage > nr_slots) + pipe->max_usage = nr_slots; pipe->tail = tail; pipe->head = head; /* This might have made more room for writers */ wake_up_interruptible(&pipe->wr_wait); + return 0; +} + +/* + * Allocate a new array of pipe buffers and copy the info over. Returns the + * pipe size if successful, or return -ERROR on error. + */ +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +{ + unsigned long user_bufs; + unsigned int nr_slots, size; + long ret = 0; + +#ifdef CONFIG_WATCH_QUEUE + if (pipe->watch_queue) + return -EBUSY; +#endif + + size = round_pipe_size(arg); + nr_slots = size >> PAGE_SHIFT; + + if (!nr_slots) + return -EINVAL; + + /* + * If trying to increase the pipe capacity, check that an + * unprivileged user is not trying to exceed various limits + * (soft limit check here, hard limit check just below). + * Decreasing the pipe capacity is always permitted, even + * if the user is currently over a limit. + */ + if (nr_slots > pipe->max_usage && + size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots); + + if (nr_slots > pipe->max_usage && + (too_many_pipe_buffers_hard(user_bufs) || + too_many_pipe_buffers_soft(user_bufs)) && + pipe_is_unprivileged_user()) { + ret = -EPERM; + goto out_revert_acct; + } + + ret = pipe_resize_ring(pipe, nr_slots); + if (ret < 0) + goto out_revert_acct; + + pipe->max_usage = nr_slots; + pipe->nr_accounted = nr_slots; return pipe->max_usage * PAGE_SIZE; out_revert_acct: - (void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size); + (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted); return ret; } @@ -1287,9 +1347,17 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) * location, so checking ->i_pipe is not enough to verify that this is a * pipe. */ -struct pipe_inode_info *get_pipe_info(struct file *file) +struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { - return file->f_op == &pipefifo_fops ? 
file->private_data : NULL; + struct pipe_inode_info *pipe = file->private_data; + + if (file->f_op != &pipefifo_fops || !pipe) + return NULL; +#ifdef CONFIG_WATCH_QUEUE + if (for_splice && pipe->watch_queue) + return NULL; +#endif + return pipe; } long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) @@ -1297,7 +1365,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) struct pipe_inode_info *pipe; long ret; - pipe = get_pipe_info(file); + pipe = get_pipe_info(file, false); if (!pipe) return -EBADF; diff --git a/fs/splice.c b/fs/splice.c index fd0a1e7e5959ad..50f3c0260c005b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1122,8 +1122,8 @@ long do_splice(struct file *in, loff_t __user *off_in, !(out->f_mode & FMODE_WRITE))) return -EBADF; - ipipe = get_pipe_info(in); - opipe = get_pipe_info(out); + ipipe = get_pipe_info(in, true); + opipe = get_pipe_info(out, true); if (ipipe && opipe) { if (off_in || off_out) @@ -1273,7 +1273,7 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, static long vmsplice_to_user(struct file *file, struct iov_iter *iter, unsigned int flags) { - struct pipe_inode_info *pipe = get_pipe_info(file); + struct pipe_inode_info *pipe = get_pipe_info(file, true); struct splice_desc sd = { .total_len = iov_iter_count(iter), .flags = flags, @@ -1308,7 +1308,7 @@ static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, if (flags & SPLICE_F_GIFT) buf_flag = PIPE_BUF_FLAG_GIFT; - pipe = get_pipe_info(file); + pipe = get_pipe_info(file, true); if (!pipe) return -EBADF; @@ -1757,8 +1757,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, static long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct pipe_inode_info *ipipe = get_pipe_info(in); - struct pipe_inode_info *opipe = get_pipe_info(out); + struct pipe_inode_info *ipipe = get_pipe_info(in, true); + struct pipe_inode_info *opipe = get_pipe_info(out, true); int ret = -EINVAL; if (unlikely(!(in->f_mode & FMODE_READ) || diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index ae58fad7f1e0d8..1d3eaa233f4a85 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -35,6 +35,7 @@ struct pipe_buffer { * @tail: The point of buffer consumption * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) + * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs * @tmp_page: cached released page * @readers: number of current readers of this pipe * @writers: number of current writers of this pipe @@ -45,6 +46,7 @@ struct pipe_buffer { * @fasync_writers: writer side fasync * @bufs: the circular array of pipe buffers * @user: the user who created this pipe + * @watch_queue: If this pipe is a watch_queue, this is the stuff for that **/ struct pipe_inode_info { struct mutex mutex; @@ -53,6 +55,7 @@ struct pipe_inode_info { unsigned int tail; unsigned int max_usage; unsigned int ring_size; + unsigned int nr_accounted; unsigned int readers; unsigned int writers; unsigned int files; @@ -63,6 +66,9 @@ struct pipe_inode_info { struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; struct user_struct *user; +#ifdef CONFIG_WATCH_QUEUE + struct watch_queue *watch_queue; +#endif }; /* @@ -237,9 +243,20 @@ void pipe_buf_mark_unmergeable(struct pipe_buffer *buf); extern const struct pipe_buf_operations nosteal_pipe_buf_ops; +#ifdef CONFIG_WATCH_QUEUE +unsigned long account_pipe_buffers(struct 
user_struct *user, + unsigned long old, unsigned long new); +bool too_many_pipe_buffers_soft(unsigned long user_bufs); +bool too_many_pipe_buffers_hard(unsigned long user_bufs); +bool pipe_is_unprivileged_user(void); +#endif + /* for F_SETPIPE_SZ and F_GETPIPE_SZ */ +#ifdef CONFIG_WATCH_QUEUE +int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots); +#endif long pipe_fcntl(struct file *, unsigned int, unsigned long arg); -struct pipe_inode_info *get_pipe_info(struct file *file); +struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice); int create_pipe_files(struct file **, int); unsigned int round_pipe_size(unsigned long size); diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h new file mode 100644 index 00000000000000..5e08db2adc3199 --- /dev/null +++ b/include/linux/watch_queue.h @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +/* User-mappable watch queue + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * See Documentation/watch_queue.rst + */ + +#ifndef _LINUX_WATCH_QUEUE_H +#define _LINUX_WATCH_QUEUE_H + +#include +#include +#include + +#ifdef CONFIG_WATCH_QUEUE + +struct cred; + +struct watch_type_filter { + enum watch_notification_type type; + __u32 subtype_filter[1]; /* Bitmask of subtypes to filter on */ + __u32 info_filter; /* Filter on watch_notification::info */ + __u32 info_mask; /* Mask of relevant bits in info_filter */ +}; + +struct watch_filter { + union { + struct rcu_head rcu; + unsigned long type_filter[2]; /* Bitmask of accepted types */ + }; + u32 nr_filters; /* Number of filters */ + struct watch_type_filter filters[]; +}; + +struct watch_queue { + struct rcu_head rcu; + struct watch_filter __rcu *filter; + struct pipe_inode_info *pipe; /* The pipe we're using as a buffer */ + struct hlist_head watches; /* Contributory watches */ + struct page **notes; /* Preallocated notifications */ + unsigned long *notes_bitmap; /* Allocation bitmap for notes */ + struct kref usage; /* Object usage count */ + spinlock_t lock; + unsigned int nr_notes; /* Number of notes */ + unsigned int nr_pages; /* Number of pages in notes[] */ + bool defunct; /* T when queues closed */ +}; + +/* + * Representation of a watch on an object. + */ +struct watch { + union { + struct rcu_head rcu; + u32 info_id; /* ID to be OR'd in to info field */ + }; + struct watch_queue __rcu *queue; /* Queue to post events to */ + struct hlist_node queue_node; /* Link in queue->watches */ + struct watch_list __rcu *watch_list; + struct hlist_node list_node; /* Link in watch_list->watchers */ + const struct cred *cred; /* Creds of the owner of the watch */ + void *private; /* Private data for the watched object */ + u64 id; /* Internal identifier */ + struct kref usage; /* Object usage count */ +}; + +/* + * List of watches on an object. 
+ */ +struct watch_list { + struct rcu_head rcu; + struct hlist_head watchers; + void (*release_watch)(struct watch *); + spinlock_t lock; +}; + +extern void __post_watch_notification(struct watch_list *, + struct watch_notification *, + const struct cred *, + u64); +extern struct watch_queue *get_watch_queue(int); +extern void put_watch_queue(struct watch_queue *); +extern void init_watch(struct watch *, struct watch_queue *); +extern int add_watch_to_object(struct watch *, struct watch_list *); +extern int remove_watch_from_object(struct watch_list *, struct watch_queue *, u64, bool); +extern long watch_queue_set_size(struct pipe_inode_info *, unsigned int); +extern long watch_queue_set_filter(struct pipe_inode_info *, + struct watch_notification_filter __user *); +extern int watch_queue_init(struct pipe_inode_info *); +extern void watch_queue_clear(struct watch_queue *); + +static inline void init_watch_list(struct watch_list *wlist, + void (*release_watch)(struct watch *)) +{ + INIT_HLIST_HEAD(&wlist->watchers); + spin_lock_init(&wlist->lock); + wlist->release_watch = release_watch; +} + +static inline void post_watch_notification(struct watch_list *wlist, + struct watch_notification *n, + const struct cred *cred, + u64 id) +{ + if (unlikely(wlist)) + __post_watch_notification(wlist, n, cred, id); +} + +static inline void remove_watch_list(struct watch_list *wlist, u64 id) +{ + if (wlist) { + remove_watch_from_object(wlist, NULL, id, true); + kfree_rcu(wlist, rcu); + } +} + +/** + * watch_sizeof - Calculate the information part of the size of a watch record, + * given the structure size. + */ +#define watch_sizeof(STRUCT) (sizeof(STRUCT) << WATCH_INFO_LENGTH__SHIFT) + +#endif + +#endif /* _LINUX_WATCH_QUEUE_H */ diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h index 9df72227f5154e..3a5790f1f05de4 100644 --- a/include/uapi/linux/watch_queue.h +++ b/include/uapi/linux/watch_queue.h @@ -4,9 +4,13 @@ #include #include +#include #define O_NOTIFICATION_PIPE O_EXCL /* Parameter to pipe2() selecting notification pipe */ +#define IOC_WATCH_QUEUE_SET_SIZE _IO('W', 0x60) /* Set the size in pages */ +#define IOC_WATCH_QUEUE_SET_FILTER _IO('W', 0x61) /* Set the filter */ + enum watch_notification_type { WATCH_TYPE_META = 0, /* Special record */ WATCH_TYPE__NR = 1 @@ -41,6 +45,22 @@ struct watch_notification { #define WATCH_INFO_FLAG_7 0x00800000 }; +/* + * Notification filtering rules (IOC_WATCH_QUEUE_SET_FILTER). + */ +struct watch_notification_type_filter { + __u32 type; /* Type to apply filter to */ + __u32 info_filter; /* Filter on watch_notification::info */ + __u32 info_mask; /* Mask of relevant bits in info_filter */ + __u32 subtype_filter[8]; /* Bitmask of subtypes to filter on */ +}; + +struct watch_notification_filter { + __u32 nr_filters; /* Number of filters */ + __u32 __reserved; /* Must be 0 */ + struct watch_notification_type_filter filters[]; +}; + /* * Extended watch removal notification. This is used optionally if the type diff --git a/init/Kconfig b/init/Kconfig index 74a5ac65644f3f..c95a2a5654a906 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -326,6 +326,18 @@ config POSIX_MQUEUE_SYSCTL depends on SYSCTL default y +config WATCH_QUEUE + bool "General notification queue" + default n + help + + This is a general notification queue for the kernel to pass events to + userspace by splicing them into pipes. It can be used in conjunction + with watches for key/keyring change notifications and device + notifications. 
+ + See Documentation/watch_queue.rst + config CROSS_MEMORY_ATTACH bool "Enable process_vm_readv/writev syscalls" depends on MMU diff --git a/kernel/Makefile b/kernel/Makefile index 4cb4130ced3293..41e7e3ae07ec1d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -115,6 +115,7 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_HAS_IOMEM) += iomem.o obj-$(CONFIG_RSEQ) += rseq.o +obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c new file mode 100644 index 00000000000000..c103e34f870568 --- /dev/null +++ b/kernel/watch_queue.c @@ -0,0 +1,657 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Watch queue and general notification mechanism, built on pipes + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * See Documentation/watch_queue.rst + */ + +#define pr_fmt(fmt) "watchq: " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_DESCRIPTION("Watch queue"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +#define WATCH_QUEUE_NOTE_SIZE 128 +#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) + +static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct watch_queue *wqueue = (struct watch_queue *)buf->private; + struct page *page; + unsigned int bit; + + /* We need to work out which note within the page this refers to, but + * the note might have been maximum size, so merely ANDing the offset + * off doesn't work. OTOH, the note must've been more than zero size. + */ + bit = buf->offset + buf->len; + if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0) + bit -= WATCH_QUEUE_NOTE_SIZE; + bit /= WATCH_QUEUE_NOTE_SIZE; + + page = buf->page; + bit += page->index; + + set_bit(bit, wqueue->notes_bitmap); +} + +static int watch_queue_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return -1; /* No. */ +} + +/* New data written to a pipe may be appended to a buffer with this type. */ +static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { + .confirm = generic_pipe_buf_confirm, + .release = watch_queue_pipe_buf_release, + .steal = watch_queue_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +/* + * Post a notification to a watch queue. 
+ */ +static bool post_one_notification(struct watch_queue *wqueue, + struct watch_notification *n) +{ + void *p; + struct pipe_inode_info *pipe = wqueue->pipe; + struct pipe_buffer *buf; + struct page *page; + unsigned int head, tail, mask, note, offset, len; + bool done = false; + + if (!pipe) + return false; + + spin_lock_irq(&pipe->rd_wait.lock); + + if (wqueue->defunct) + goto out; + + mask = pipe->ring_size - 1; + head = pipe->head; + tail = pipe->tail; + if (pipe_full(head, tail, pipe->ring_size)) + goto lost; + + note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes); + if (note >= wqueue->nr_notes) + goto lost; + + page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE]; + offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; + get_page(page); + len = n->info & WATCH_INFO_LENGTH; + p = kmap_atomic(page); + memcpy(p + offset, n, len); + kunmap_atomic(p); + + buf = &pipe->bufs[head & mask]; + buf->page = page; + buf->private = (unsigned long)wqueue; + buf->ops = &watch_queue_pipe_buf_ops; + buf->offset = offset; + buf->len = len; + buf->flags = 0; + pipe->head = head + 1; + + if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { + spin_unlock_irq(&pipe->rd_wait.lock); + BUG(); + } + wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); + done = true; + +out: + spin_unlock_irq(&pipe->rd_wait.lock); + if (done) + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + return done; + +lost: + goto out; +} + +/* + * Apply filter rules to a notification. + */ +static bool filter_watch_notification(const struct watch_filter *wf, + const struct watch_notification *n) +{ + const struct watch_type_filter *wt; + unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8; + unsigned int st_index = n->subtype / st_bits; + unsigned int st_bit = 1U << (n->subtype % st_bits); + int i; + + if (!test_bit(n->type, wf->type_filter)) + return false; + + for (i = 0; i < wf->nr_filters; i++) { + wt = &wf->filters[i]; + if (n->type == wt->type && + (wt->subtype_filter[st_index] & st_bit) && + (n->info & wt->info_mask) == wt->info_filter) + return true; + } + + return false; /* If there is a filter, the default is to reject. */ +} + +/** + * __post_watch_notification - Post an event notification + * @wlist: The watch list to post the event to. + * @n: The notification record to post. + * @cred: The creds of the process that triggered the notification. + * @id: The ID to match on the watch. + * + * Post a notification of an event into a set of watch queues and let the users + * know. + * + * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and + * should be in units of sizeof(*n). 
+ */ +void __post_watch_notification(struct watch_list *wlist, + struct watch_notification *n, + const struct cred *cred, + u64 id) +{ + const struct watch_filter *wf; + struct watch_queue *wqueue; + struct watch *watch; + + if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) { + WARN_ON(1); + return; + } + + rcu_read_lock(); + + hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) { + if (watch->id != id) + continue; + n->info &= ~WATCH_INFO_ID; + n->info |= watch->info_id; + + wqueue = rcu_dereference(watch->queue); + wf = rcu_dereference(wqueue->filter); + if (wf && !filter_watch_notification(wf, n)) + continue; + + if (security_post_notification(watch->cred, cred, n) < 0) + continue; + + post_one_notification(wqueue, n); + } + + rcu_read_unlock(); +} +EXPORT_SYMBOL(__post_watch_notification); + +/* + * Allocate sufficient pages to preallocation for the requested number of + * notifications. + */ +long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) +{ + struct watch_queue *wqueue = pipe->watch_queue; + struct page **pages; + unsigned long *bitmap; + unsigned long user_bufs; + unsigned int bmsize; + int ret, i, nr_pages; + + if (!wqueue) + return -ENODEV; + if (wqueue->notes) + return -EBUSY; + + if (nr_notes < 1 || + nr_notes > 512) /* TODO: choose a better hard limit */ + return -EINVAL; + + nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1); + nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE; + user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages); + + if (nr_pages > pipe->max_usage && + (too_many_pipe_buffers_hard(user_bufs) || + too_many_pipe_buffers_soft(user_bufs)) && + pipe_is_unprivileged_user()) { + ret = -EPERM; + goto error; + } + + ret = pipe_resize_ring(pipe, nr_notes); + if (ret < 0) + goto error; + + pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); + if (!pages) + goto error; + + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto error_p; + pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; + } + + bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG; + bmsize *= sizeof(unsigned long); + bitmap = kmalloc(bmsize, GFP_KERNEL); + if (!bitmap) + goto error_p; + + memset(bitmap, 0xff, bmsize); + wqueue->notes = pages; + wqueue->notes_bitmap = bitmap; + wqueue->nr_pages = nr_pages; + wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; + return 0; + +error_p: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kfree(pages); +error: + (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted); + return ret; +} + +/* + * Set the filter on a watch queue. 
+ */ +long watch_queue_set_filter(struct pipe_inode_info *pipe, + struct watch_notification_filter __user *_filter) +{ + struct watch_notification_type_filter *tf; + struct watch_notification_filter filter; + struct watch_type_filter *q; + struct watch_filter *wfilter; + struct watch_queue *wqueue = pipe->watch_queue; + int ret, nr_filter = 0, i; + + if (!wqueue) + return -ENODEV; + + if (!_filter) { + /* Remove the old filter */ + wfilter = NULL; + goto set; + } + + /* Grab the user's filter specification */ + if (copy_from_user(&filter, _filter, sizeof(filter)) != 0) + return -EFAULT; + if (filter.nr_filters == 0 || + filter.nr_filters > 16 || + filter.__reserved != 0) + return -EINVAL; + + tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf)); + if (IS_ERR(tf)) + return PTR_ERR(tf); + + ret = -EINVAL; + for (i = 0; i < filter.nr_filters; i++) { + if ((tf[i].info_filter & ~tf[i].info_mask) || + tf[i].info_mask & WATCH_INFO_LENGTH) + goto err_filter; + /* Ignore any unknown types */ + if (tf[i].type >= sizeof(wfilter->type_filter) * 8) + continue; + nr_filter++; + } + + /* Now we need to build the internal filter from only the relevant + * user-specified filters. + */ + ret = -ENOMEM; + wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL); + if (!wfilter) + goto err_filter; + wfilter->nr_filters = nr_filter; + + q = wfilter->filters; + for (i = 0; i < filter.nr_filters; i++) { + if (tf[i].type >= sizeof(wfilter->type_filter) * 8) + continue; + + q->type = tf[i].type; + q->info_filter = tf[i].info_filter; + q->info_mask = tf[i].info_mask; + q->subtype_filter[0] = tf[i].subtype_filter[0]; + __set_bit(q->type, wfilter->type_filter); + q++; + } + + kfree(tf); +set: + pipe_lock(pipe); + wfilter = rcu_replace_pointer(wqueue->filter, wfilter, + lockdep_is_held(&pipe->mutex)); + pipe_unlock(pipe); + if (wfilter) + kfree_rcu(wfilter, rcu); + return 0; + +err_filter: + kfree(tf); + return ret; +} + +static void __put_watch_queue(struct kref *kref) +{ + struct watch_queue *wqueue = + container_of(kref, struct watch_queue, usage); + struct watch_filter *wfilter; + int i; + + for (i = 0; i < wqueue->nr_pages; i++) + __free_page(wqueue->notes[i]); + + wfilter = rcu_access_pointer(wqueue->filter); + if (wfilter) + kfree_rcu(wfilter, rcu); + kfree_rcu(wqueue, rcu); +} + +/** + * put_watch_queue - Dispose of a ref on a watchqueue. + * @wqueue: The watch queue to unref. + */ +void put_watch_queue(struct watch_queue *wqueue) +{ + kref_put(&wqueue->usage, __put_watch_queue); +} +EXPORT_SYMBOL(put_watch_queue); + +static void free_watch(struct rcu_head *rcu) +{ + struct watch *watch = container_of(rcu, struct watch, rcu); + + put_watch_queue(rcu_access_pointer(watch->queue)); + put_cred(watch->cred); +} + +static void __put_watch(struct kref *kref) +{ + struct watch *watch = container_of(kref, struct watch, usage); + + call_rcu(&watch->rcu, free_watch); +} + +/* + * Discard a watch. + */ +static void put_watch(struct watch *watch) +{ + kref_put(&watch->usage, __put_watch); +} + +/** + * init_watch - Initialise a watch + * @watch: The watch to initialise. + * @wqueue: The queue to assign. + * + * Initialise a watch and set the watch queue.
+ */ +void init_watch(struct watch *watch, struct watch_queue *wqueue) +{ + kref_init(&watch->usage); + INIT_HLIST_NODE(&watch->list_node); + INIT_HLIST_NODE(&watch->queue_node); + rcu_assign_pointer(watch->queue, wqueue); +} + +/** + * add_watch_to_object - Add a watch on an object to a watch list + * @watch: The watch to add + * @wlist: The watch list to add to + * + * @watch->queue must have been set to point to the queue to post notifications + * to and the watch list of the object to be watched. @watch->cred must also + * have been set to the appropriate credentials and a ref taken on them. + * + * The caller must pin the queue and the list both and must hold the list + * locked against racing watch additions/removals. + */ +int add_watch_to_object(struct watch *watch, struct watch_list *wlist) +{ + struct watch_queue *wqueue = rcu_access_pointer(watch->queue); + struct watch *w; + + hlist_for_each_entry(w, &wlist->watchers, list_node) { + struct watch_queue *wq = rcu_access_pointer(w->queue); + if (wqueue == wq && watch->id == w->id) + return -EBUSY; + } + + watch->cred = get_current_cred(); + rcu_assign_pointer(watch->watch_list, wlist); + + spin_lock_bh(&wqueue->lock); + kref_get(&wqueue->usage); + kref_get(&watch->usage); + hlist_add_head(&watch->queue_node, &wqueue->watches); + spin_unlock_bh(&wqueue->lock); + + hlist_add_head(&watch->list_node, &wlist->watchers); + return 0; +} +EXPORT_SYMBOL(add_watch_to_object); + +/** + * remove_watch_from_object - Remove a watch or all watches from an object. + * @wlist: The watch list to remove from + * @wq: The watch queue of interest (ignored if @all is true) + * @id: The ID of the watch to remove (ignored if @all is true) + * @all: True to remove all objects + * + * Remove a specific watch or all watches from an object. A notification is + * sent to the watcher to tell them that this happened. + */ +int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq, + u64 id, bool all) +{ + struct watch_notification_removal n; + struct watch_queue *wqueue; + struct watch *watch; + int ret = -EBADSLT; + + rcu_read_lock(); + +again: + spin_lock(&wlist->lock); + hlist_for_each_entry(watch, &wlist->watchers, list_node) { + if (all || + (watch->id == id && rcu_access_pointer(watch->queue) == wq)) + goto found; + } + spin_unlock(&wlist->lock); + goto out; + +found: + ret = 0; + hlist_del_init_rcu(&watch->list_node); + rcu_assign_pointer(watch->watch_list, NULL); + spin_unlock(&wlist->lock); + + /* We now own the reference on watch that used to belong to wlist. */ + + n.watch.type = WATCH_TYPE_META; + n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION; + n.watch.info = watch->info_id | watch_sizeof(n.watch); + n.id = id; + if (id != 0) + n.watch.info = watch->info_id | watch_sizeof(n); + + wqueue = rcu_dereference(watch->queue); + + /* We don't need the watch list lock for the next bit as RCU is + * protecting *wqueue from deallocation. 
+ */ + if (wqueue) { + post_one_notification(wqueue, &n.watch); + + spin_lock_bh(&wqueue->lock); + + if (!hlist_unhashed(&watch->queue_node)) { + hlist_del_init_rcu(&watch->queue_node); + put_watch(watch); + } + + spin_unlock_bh(&wqueue->lock); + } + + if (wlist->release_watch) { + void (*release_watch)(struct watch *); + + release_watch = wlist->release_watch; + rcu_read_unlock(); + (*release_watch)(watch); + rcu_read_lock(); + } + put_watch(watch); + + if (all && !hlist_empty(&wlist->watchers)) + goto again; +out: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(remove_watch_from_object); + +/* + * Remove all the watches that are contributory to a queue. This has the + * potential to race with removal of the watches by the destruction of the + * objects being watched or with the distribution of notifications. + */ +void watch_queue_clear(struct watch_queue *wqueue) +{ + struct watch_list *wlist; + struct watch *watch; + bool release; + + rcu_read_lock(); + spin_lock_bh(&wqueue->lock); + + /* Prevent new additions and prevent notifications from happening */ + wqueue->defunct = true; + + while (!hlist_empty(&wqueue->watches)) { + watch = hlist_entry(wqueue->watches.first, struct watch, queue_node); + hlist_del_init_rcu(&watch->queue_node); + /* We now own a ref on the watch. */ + spin_unlock_bh(&wqueue->lock); + + /* We can't do the next bit under the queue lock as we need to + * get the list lock - which would cause a deadlock if someone + * was removing from the opposite direction at the same time or + * posting a notification. + */ + wlist = rcu_dereference(watch->watch_list); + if (wlist) { + void (*release_watch)(struct watch *); + + spin_lock(&wlist->lock); + + release = !hlist_unhashed(&watch->list_node); + if (release) { + hlist_del_init_rcu(&watch->list_node); + rcu_assign_pointer(watch->watch_list, NULL); + + /* We now own a second ref on the watch. */ + } + + release_watch = wlist->release_watch; + spin_unlock(&wlist->lock); + + if (release) { + if (release_watch) { + rcu_read_unlock(); + /* This might need to call dput(), so + * we have to drop all the locks. + */ + (*release_watch)(watch); + rcu_read_lock(); + } + put_watch(watch); + } + } + + put_watch(watch); + spin_lock_bh(&wqueue->lock); + } + + spin_unlock_bh(&wqueue->lock); + rcu_read_unlock(); +} + +/** + * get_watch_queue - Get a watch queue from its file descriptor. + * @fd: The fd to query. + */ +struct watch_queue *get_watch_queue(int fd) +{ + struct pipe_inode_info *pipe; + struct watch_queue *wqueue = ERR_PTR(-EINVAL); + struct fd f; + + f = fdget(fd); + if (f.file) { + pipe = get_pipe_info(f.file, false); + if (pipe && pipe->watch_queue) { + wqueue = pipe->watch_queue; + kref_get(&wqueue->usage); + } + fdput(f); + } + + return wqueue; +} +EXPORT_SYMBOL(get_watch_queue); + +/* + * Initialise a watch queue + */ +int watch_queue_init(struct pipe_inode_info *pipe) +{ + struct watch_queue *wqueue; + + wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL); + if (!wqueue) + return -ENOMEM; + + wqueue->pipe = pipe; + kref_init(&wqueue->usage); + spin_lock_init(&wqueue->lock); + INIT_HLIST_HEAD(&wqueue->watches); + + pipe->watch_queue = wqueue; + return 0; +} From 998f50407ffc9370565c7ed3fcd1366adccdfbbf Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2020 13:58:35 +0000 Subject: [PATCH 023/190] security: Add hooks to rule on setting a watch Add security hooks that will allow an LSM to rule on whether or not a watch may be set. More than one hook is required as the watches watch different types of object. 
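[ ed: For illustration only, a minimal sketch of what wiring an LSM into the
  new hook could look like; "example_watch_key" and its policy are
  hypothetical and not part of this patch (the real SELinux and Smack
  implementations follow later in the series): ]

	/* Hypothetical LSM callback: only permit watching keys that the
	 * caller's fs-UID owns.
	 */
	static int example_watch_key(struct key *key)
	{
		return uid_eq(key->uid, current_fsuid()) ? 0 : -EACCES;
	}

	static struct security_hook_list example_hooks[] __lsm_ro_after_init = {
		LSM_HOOK_INIT(watch_key, example_watch_key),
	};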
Signed-off-by: David Howells Acked-by: James Morris cc: Casey Schaufler cc: Stephen Smalley cc: linux-security-module@vger.kernel.org --- include/linux/lsm_hook_defs.h | 4 ++++ include/linux/lsm_hooks.h | 5 +++++ include/linux/security.h | 9 +++++++++ security/security.c | 7 +++++++ 4 files changed, 25 insertions(+) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index e0def45b5cc562..a54f49e9570887 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -256,6 +256,10 @@ LSM_HOOK(int, 0, inode_getsecctx, struct inode *inode, void **ctx, #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) LSM_HOOK(int, 0, post_notification, const struct cred *w_cred, const struct cred *cred, struct watch_notification *n) +#endif /* CONFIG_SECURITY && CONFIG_WATCH_QUEUE */ + +#if defined(CONFIG_SECURITY) && defined(CONFIG_KEY_NOTIFICATIONS) +LSM_HOOK(int, 0, watch_key, struct key *key) #endif /* CONFIG_SECURITY && CONFIG_KEY_NOTIFICATIONS */ #ifdef CONFIG_SECURITY_NETWORK diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 0b5e5769b83670..3f1374cffb76cb 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1446,6 +1446,11 @@ * @cred: The event-triggerer's credentials * @n: The notification being posted * + * @watch_key: + * Check to see if a process is allowed to watch for event notifications + * from a key or keyring. + * @key: The key to watch. + * * Security hooks for using the eBPF maps and programs functionalities through * eBPF syscalls. * diff --git a/include/linux/security.h b/include/linux/security.h index 9a5d12ab491bc9..e7914e4e0b0266 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1290,6 +1290,15 @@ static inline int security_post_notification(const struct cred *w_cred, } #endif +#if defined(CONFIG_SECURITY) && defined(CONFIG_KEY_NOTIFICATIONS) +int security_watch_key(struct key *key); +#else +static inline int security_watch_key(struct key *key) +{ + return 0; +} +#endif + #ifdef CONFIG_SECURITY_NETWORK int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk); diff --git a/security/security.c b/security/security.c index 7d55607120b487..c73334ab2882bc 100644 --- a/security/security.c +++ b/security/security.c @@ -2016,6 +2016,13 @@ int security_post_notification(const struct cred *w_cred, } #endif /* CONFIG_WATCH_QUEUE */ +#ifdef CONFIG_KEY_NOTIFICATIONS +int security_watch_key(struct key *key) +{ + return call_int_hook(watch_key, 0, key); +} +#endif + #ifdef CONFIG_SECURITY_NETWORK int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) From f7e47677e39a03057dcced2016c92a9c868693ec Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:11 +0000 Subject: [PATCH 024/190] watch_queue: Add a key/keyring notification facility Add a key/keyring change notification facility whereby notifications about changes in key and keyring content and attributes can be received. 
Firstly, an event queue needs to be created:

	pipe2(fds, O_NOTIFICATION_PIPE);
	ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256);

then a notification can be set up to report notifications via that queue:

	struct watch_notification_filter filter = {
		.nr_filters = 1,
		.filters = {
			[0] = {
				.type = WATCH_TYPE_KEY_NOTIFY,
				.subtype_filter[0] = UINT_MAX,
			},
		},
	};
	ioctl(fds[1], IOC_WATCH_QUEUE_SET_FILTER, &filter);
	keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fds[1], 0x01);

After that, records will be placed into the queue when events occur in which
keys are changed in some way.  Records are of the following format:

	struct key_notification {
		struct watch_notification watch;
		__u32	key_id;
		__u32	aux;
	} *n;

Where:

	n->watch.type will be WATCH_TYPE_KEY_NOTIFY.

	n->watch.subtype will indicate the type of event, such as
	NOTIFY_KEY_REVOKED.

	n->watch.info & WATCH_INFO_LENGTH will indicate the length of the
	record.

	n->watch.info & WATCH_INFO_ID will be the second argument to
	keyctl_watch_key(), shifted.

	n->key_id will be the ID of the affected key.

	n->aux will hold subtype-dependent information, such as the key being
	linked into the keyring specified by n->key_id in the case of
	NOTIFY_KEY_LINKED.

Note that it is permissible for event records to be of variable length, or,
at least, the length may be dependent on the subtype.  Note also that the
queue can be shared between multiple notifications of various types.

Signed-off-by: David Howells
Reviewed-by: James Morris
---
 Documentation/security/keys/core.rst |  57 ++++++++++++++++
 include/linux/key.h                  |   3 +
 include/uapi/linux/keyctl.h          |   2 +
 include/uapi/linux/watch_queue.h     |  28 +++++++-
 security/keys/Kconfig                |   9 +++
 security/keys/compat.c               |   3 +
 security/keys/gc.c                   |   5 ++
 security/keys/internal.h             |  30 ++++++++-
 security/keys/key.c                  |  38 +++++++----
 security/keys/keyctl.c               |  99 +++++++++++++++++++++++++++-
 security/keys/keyring.c              |  20 ++++--
 security/keys/request_key.c          |   4 +-
 12 files changed, 270 insertions(+), 28 deletions(-)

diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst
index d9b0b859018b42..6cff2b5f88edc4 100644
--- a/Documentation/security/keys/core.rst
+++ b/Documentation/security/keys/core.rst
@@ -1026,6 +1026,63 @@ The keyctl syscall functions are:
      written into the output buffer.  Verification returns 0 on success.
 
+  *  Watch a key or keyring for changes::
+
+	long keyctl(KEYCTL_WATCH_KEY, key_serial_t key, int queue_fd,
+		    const struct watch_notification_filter *filter);
+
+     This will set or remove a watch for changes on the specified key or
+     keyring.
+
+     "key" is the ID of the key to be watched.
+
+     "queue_fd" is a file descriptor referring to an open notification pipe
+     (created with pipe2() and O_NOTIFICATION_PIPE) which manages the buffer
+     into which notifications will be delivered.
+
+     "filter" is either NULL to remove a watch or a filter specification to
+     indicate what events are required from the key.
+
+     See Documentation/watch_queue.rst for more information.
+
+     Note that only one watch may be emplaced for any particular { key,
+     queue_fd } combination.
+
+     Notification records look like::
+
+	struct key_notification {
+		struct watch_notification watch;
+		__u32	key_id;
+		__u32	aux;
+	};
+
+     In this, watch::type will be "WATCH_TYPE_KEY_NOTIFY" and subtype will be
+     one of::
+
+	NOTIFY_KEY_INSTANTIATED
+	NOTIFY_KEY_UPDATED
+	NOTIFY_KEY_LINKED
+	NOTIFY_KEY_UNLINKED
+	NOTIFY_KEY_CLEARED
+	NOTIFY_KEY_REVOKED
+	NOTIFY_KEY_INVALIDATED
+	NOTIFY_KEY_SETATTR
+
+     Where these indicate a key being instantiated/rejected, updated, a link
+     being made in a keyring, a link being removed from a keyring, a keyring
+     being cleared, a key being revoked, a key being invalidated or a key
+     having one of its attributes changed (user, group, perm, timeout,
+     restriction).
+
+     If a watched key is deleted, a basic watch_notification will be issued
+     with "type" set to WATCH_TYPE_META and "subtype" set to
+     WATCH_META_REMOVAL_NOTIFICATION.  The watchpoint ID will be set in the
+     "info" field.
+
+     This needs to be configured by enabling:
+
+	"Provide key/keyring change notifications" (KEY_NOTIFICATIONS)
+
 
 Kernel Services
 ===============
diff --git a/include/linux/key.h b/include/linux/key.h
index 6cf8e71cf8b7c8..b99b40db08fc69 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -176,6 +176,9 @@ struct key {
 		struct list_head graveyard_link;
 		struct rb_node	serial_node;
 	};
+#ifdef CONFIG_KEY_NOTIFICATIONS
+	struct watch_list	*watchers;	/* Entities watching this key for changes */
+#endif
 	struct rw_semaphore	sem;		/* change vs change sem */
 	struct key_user		*user;		/* owner of this key */
 	void			*security;	/* security data for this key */
diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h
index ed3d5893830df5..4c8884eea8084e 100644
--- a/include/uapi/linux/keyctl.h
+++ b/include/uapi/linux/keyctl.h
@@ -69,6 +69,7 @@
 #define KEYCTL_RESTRICT_KEYRING		29 /* Restrict keys allowed to link to a keyring */
 #define KEYCTL_MOVE			30 /* Move keys between keyrings */
 #define KEYCTL_CAPABILITIES		31 /* Find capabilities of keyrings subsystem */
+#define KEYCTL_WATCH_KEY		32 /* Watch a key or ring of keys for changes */
 
 /* keyctl structures */
 struct keyctl_dh_params {
@@ -130,5 +131,6 @@ struct keyctl_pkey_params {
 #define KEYCTL_CAPS0_MOVE		0x80 /* KEYCTL_MOVE supported */
 #define KEYCTL_CAPS1_NS_KEYRING_NAME	0x01 /* Keyring names are per-user_namespace */
 #define KEYCTL_CAPS1_NS_KEY_TAG		0x02 /* Key indexing can include a namespace tag */
+#define KEYCTL_CAPS1_NOTIFICATIONS	0x04 /* Keys generate watchable notifications */
 
 #endif /* _LINUX_KEYCTL_H */
diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h
index 3a5790f1f05de4..c3d8320b5d3a6a 100644
--- a/include/uapi/linux/watch_queue.h
+++ b/include/uapi/linux/watch_queue.h
@@ -13,7 +13,8 @@
 enum watch_notification_type {
 	WATCH_TYPE_META		= 0,	/* Special record */
-	WATCH_TYPE__NR		= 1
+	WATCH_TYPE_KEY_NOTIFY	= 1,	/* Key change event notification */
+	WATCH_TYPE__NR		= 2
 };
 
 enum watch_meta_notification_subtype {
@@ -75,4 +76,29 @@ struct watch_notification_removal {
 	__u64	id;		/* Type-dependent identifier */
 };
 
+/*
+ * Type of key/keyring change notification.
+ */
+enum key_notification_subtype {
+	NOTIFY_KEY_INSTANTIATED	= 0, /* Key was instantiated (aux is error code) */
+	NOTIFY_KEY_UPDATED	= 1, /* Key was updated */
+	NOTIFY_KEY_LINKED	= 2, /* Key (aux) was added to watched keyring */
+	NOTIFY_KEY_UNLINKED	= 3, /* Key (aux) was removed from watched keyring */
+	NOTIFY_KEY_CLEARED	= 4, /* Keyring was cleared */
+	NOTIFY_KEY_REVOKED	= 5, /* Key was revoked */
+	NOTIFY_KEY_INVALIDATED	= 6, /* Key was invalidated */
+	NOTIFY_KEY_SETATTR	= 7, /* Key's attributes got changed */
+};
+
+/*
+ * Key/keyring notification record.
+ * - watch.type = WATCH_TYPE_KEY_NOTIFY
+ * - watch.subtype = enum key_notification_subtype
+ */
+struct key_notification {
+	struct watch_notification watch;
+	__u32	key_id;		/* The key/keyring affected */
+	__u32	aux;		/* Per-type auxiliary data */
+};
+
 #endif /* _UAPI_LINUX_WATCH_QUEUE_H */
diff --git a/security/keys/Kconfig b/security/keys/Kconfig
index 47c041563d41c5..d4dc5ea208af18 100644
--- a/security/keys/Kconfig
+++ b/security/keys/Kconfig
@@ -116,3 +116,12 @@ config KEY_DH_OPERATIONS
 	  in the kernel.
 
 	  If you are unsure as to whether this is required, answer N.
+
+config KEY_NOTIFICATIONS
+	bool "Provide key/keyring change notifications"
+	depends on KEYS && WATCH_QUEUE
+	help
+	  This option provides support for getting change notifications on keys
+	  and keyrings on which the caller has View permission.  This makes use
+	  of pipes opened with O_NOTIFICATION_PIPE to handle the notification
+	  buffer and provides KEYCTL_WATCH_KEY to enable/disable watches.
diff --git a/security/keys/compat.c b/security/keys/compat.c
index b975f8f11124b7..6ee9d8f6a4a5bb 100644
--- a/security/keys/compat.c
+++ b/security/keys/compat.c
@@ -156,6 +156,9 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option,
 	case KEYCTL_CAPABILITIES:
 		return keyctl_capabilities(compat_ptr(arg2), arg3);
 
+	case KEYCTL_WATCH_KEY:
+		return keyctl_watch_key(arg2, arg3, arg4);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/gc.c b/security/keys/gc.c
index 671dd730ecfc9e..3c90807476eb0e 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -131,6 +131,11 @@ static noinline void key_gc_unused_keys(struct list_head *keys)
 		kdebug("- %u", key->serial);
 		key_check(key);
 
+#ifdef CONFIG_KEY_NOTIFICATIONS
+		remove_watch_list(key->watchers, key->serial);
+		key->watchers = NULL;
+#endif
+
 		/* Throw away the key data if the key is instantiated */
 		if (state == KEY_IS_POSITIVE && key->type->destroy)
 			key->type->destroy(key);
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 6d0ca48ae9a50c..28e17f4f3328fb 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include <linux/watch_queue.h>
 #include
 #include
 #include
@@ -99,7 +100,8 @@ extern int __key_link_begin(struct key *keyring,
 			    const struct keyring_index_key *index_key,
 			    struct assoc_array_edit **_edit);
 extern int __key_link_check_live_key(struct key *keyring, struct key *key);
-extern void __key_link(struct key *key, struct assoc_array_edit **_edit);
+extern void __key_link(struct key *keyring, struct key *key,
+		       struct assoc_array_edit **_edit);
 extern void __key_link_end(struct key *keyring,
 			   const struct keyring_index_key *index_key,
 			   struct assoc_array_edit *edit);
@@ -183,6 +185,23 @@ extern int key_task_permission(const key_ref_t key_ref,
 			       const struct cred *cred,
 			       key_perm_t perm);
 
+static inline void notify_key(struct key *key,
+			      enum key_notification_subtype subtype, u32 aux)
+{
+#ifdef CONFIG_KEY_NOTIFICATIONS
+	struct key_notification n = {
+		.watch.type	=
WATCH_TYPE_KEY_NOTIFY, + .watch.subtype = subtype, + .watch.info = watch_sizeof(n), + .key_id = key_serial(key), + .aux = aux, + }; + + post_watch_notification(key->watchers, &n.watch, current_cred(), + n.key_id); +#endif +} + /* * Check to see whether permission is granted to use a key in the desired way. */ @@ -333,6 +352,15 @@ static inline long keyctl_pkey_e_d_s(int op, extern long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen); +#ifdef CONFIG_KEY_NOTIFICATIONS +extern long keyctl_watch_key(key_serial_t, int, int); +#else +static inline long keyctl_watch_key(key_serial_t key_id, int watch_fd, int watch_id) +{ + return -EOPNOTSUPP; +} +#endif + /* * Debugging key validation */ diff --git a/security/keys/key.c b/security/keys/key.c index e959b3c96b4870..e282c6179b21d5 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -444,6 +444,7 @@ static int __key_instantiate_and_link(struct key *key, /* mark the key as being instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, 0); + notify_key(key, NOTIFY_KEY_INSTANTIATED, 0); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) awaken = 1; @@ -453,7 +454,7 @@ static int __key_instantiate_and_link(struct key *key, if (test_bit(KEY_FLAG_KEEP, &keyring->flags)) set_bit(KEY_FLAG_KEEP, &key->flags); - __key_link(key, _edit); + __key_link(keyring, key, _edit); } /* disable the authorisation key */ @@ -601,6 +602,7 @@ int key_reject_and_link(struct key *key, /* mark the key as being negatively instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, -error); + notify_key(key, NOTIFY_KEY_INSTANTIATED, -error); key->expiry = ktime_get_real_seconds() + timeout; key_schedule_gc(key->expiry + key_gc_delay); @@ -611,7 +613,7 @@ int key_reject_and_link(struct key *key, /* and link it into the destination keyring */ if (keyring && link_ret == 0) - __key_link(key, &edit); + __key_link(keyring, key, &edit); /* disable the authorisation key */ if (authkey) @@ -764,9 +766,11 @@ static inline key_ref_t __key_update(key_ref_t key_ref, down_write(&key->sem); ret = key->type->update(key, prep); - if (ret == 0) + if (ret == 0) { /* Updating a negative key positively instantiates it */ mark_key_instantiated(key, 0); + notify_key(key, NOTIFY_KEY_UPDATED, 0); + } up_write(&key->sem); @@ -1023,9 +1027,11 @@ int key_update(key_ref_t key_ref, const void *payload, size_t plen) down_write(&key->sem); ret = key->type->update(key, &prep); - if (ret == 0) + if (ret == 0) { /* Updating a negative key positively instantiates it */ mark_key_instantiated(key, 0); + notify_key(key, NOTIFY_KEY_UPDATED, 0); + } up_write(&key->sem); @@ -1057,15 +1063,17 @@ void key_revoke(struct key *key) * instantiated */ down_write_nested(&key->sem, 1); - if (!test_and_set_bit(KEY_FLAG_REVOKED, &key->flags) && - key->type->revoke) - key->type->revoke(key); - - /* set the death time to no more than the expiry time */ - time = ktime_get_real_seconds(); - if (key->revoked_at == 0 || key->revoked_at > time) { - key->revoked_at = time; - key_schedule_gc(key->revoked_at + key_gc_delay); + if (!test_and_set_bit(KEY_FLAG_REVOKED, &key->flags)) { + notify_key(key, NOTIFY_KEY_REVOKED, 0); + if (key->type->revoke) + key->type->revoke(key); + + /* set the death time to no more than the expiry time */ + time = ktime_get_real_seconds(); + if (key->revoked_at == 0 || key->revoked_at > time) { + key->revoked_at = time; + key_schedule_gc(key->revoked_at + key_gc_delay); + } } up_write(&key->sem); @@ -1087,8 +1095,10 @@ void 
key_invalidate(struct key *key) if (!test_bit(KEY_FLAG_INVALIDATED, &key->flags)) { down_write_nested(&key->sem, 1); - if (!test_and_set_bit(KEY_FLAG_INVALIDATED, &key->flags)) + if (!test_and_set_bit(KEY_FLAG_INVALIDATED, &key->flags)) { + notify_key(key, NOTIFY_KEY_INVALIDATED, 0); key_schedule_gc_links(); + } up_write(&key->sem); } } diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 5e01192e222a0b..7d8de1c9a47816 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -37,7 +37,9 @@ static const unsigned char keyrings_capabilities[2] = { KEYCTL_CAPS0_MOVE ), [1] = (KEYCTL_CAPS1_NS_KEYRING_NAME | - KEYCTL_CAPS1_NS_KEY_TAG), + KEYCTL_CAPS1_NS_KEY_TAG | + (IS_ENABLED(CONFIG_KEY_NOTIFICATIONS) ? KEYCTL_CAPS1_NOTIFICATIONS : 0) + ), }; static int key_get_type_from_user(char *type, @@ -1039,6 +1041,7 @@ long keyctl_chown_key(key_serial_t id, uid_t user, gid_t group) if (group != (gid_t) -1) key->gid = gid; + notify_key(key, NOTIFY_KEY_SETATTR, 0); ret = 0; error_put: @@ -1089,6 +1092,7 @@ long keyctl_setperm_key(key_serial_t id, key_perm_t perm) /* if we're not the sysadmin, we can only change a key that we own */ if (capable(CAP_SYS_ADMIN) || uid_eq(key->uid, current_fsuid())) { key->perm = perm; + notify_key(key, NOTIFY_KEY_SETATTR, 0); ret = 0; } @@ -1480,10 +1484,12 @@ long keyctl_set_timeout(key_serial_t id, unsigned timeout) okay: key = key_ref_to_ptr(key_ref); ret = 0; - if (test_bit(KEY_FLAG_KEEP, &key->flags)) + if (test_bit(KEY_FLAG_KEEP, &key->flags)) { ret = -EPERM; - else + } else { key_set_timeout(key, timeout); + notify_key(key, NOTIFY_KEY_SETATTR, 0); + } key_put(key); error: @@ -1757,6 +1763,90 @@ long keyctl_restrict_keyring(key_serial_t id, const char __user *_type, return ret; } +#ifdef CONFIG_KEY_NOTIFICATIONS +/* + * Watch for changes to a key. + * + * The caller must have View permission to watch a key or keyring. + */ +long keyctl_watch_key(key_serial_t id, int watch_queue_fd, int watch_id) +{ + struct watch_queue *wqueue; + struct watch_list *wlist = NULL; + struct watch *watch = NULL; + struct key *key; + key_ref_t key_ref; + long ret; + + if (watch_id < -1 || watch_id > 0xff) + return -EINVAL; + + key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE, KEY_NEED_VIEW); + if (IS_ERR(key_ref)) + return PTR_ERR(key_ref); + key = key_ref_to_ptr(key_ref); + + wqueue = get_watch_queue(watch_queue_fd); + if (IS_ERR(wqueue)) { + ret = PTR_ERR(wqueue); + goto err_key; + } + + if (watch_id >= 0) { + ret = -ENOMEM; + if (!key->watchers) { + wlist = kzalloc(sizeof(*wlist), GFP_KERNEL); + if (!wlist) + goto err_wqueue; + init_watch_list(wlist, NULL); + } + + watch = kzalloc(sizeof(*watch), GFP_KERNEL); + if (!watch) + goto err_wlist; + + init_watch(watch, wqueue); + watch->id = key->serial; + watch->info_id = (u32)watch_id << WATCH_INFO_ID__SHIFT; + + ret = security_watch_key(key); + if (ret < 0) + goto err_watch; + + down_write(&key->sem); + if (!key->watchers) { + key->watchers = wlist; + wlist = NULL; + } + + ret = add_watch_to_object(watch, key->watchers); + up_write(&key->sem); + + if (ret == 0) + watch = NULL; + } else { + ret = -EBADSLT; + if (key->watchers) { + down_write(&key->sem); + ret = remove_watch_from_object(key->watchers, + wqueue, key_serial(key), + false); + up_write(&key->sem); + } + } + +err_watch: + kfree(watch); +err_wlist: + kfree(wlist); +err_wqueue: + put_watch_queue(wqueue); +err_key: + key_put(key); + return ret; +} +#endif /* CONFIG_KEY_NOTIFICATIONS */ + /* * Get keyrings subsystem capabilities. 
*/ @@ -1926,6 +2016,9 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, case KEYCTL_CAPABILITIES: return keyctl_capabilities((unsigned char __user *)arg2, (size_t)arg3); + case KEYCTL_WATCH_KEY: + return keyctl_watch_key((key_serial_t)arg2, (int)arg3, (int)arg4); + default: return -EOPNOTSUPP; } diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 5ca620d31cd308..14abfe765b7e78 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -1056,12 +1056,14 @@ int keyring_restrict(key_ref_t keyring_ref, const char *type, down_write(&keyring->sem); down_write(&keyring_serialise_restrict_sem); - if (keyring->restrict_link) + if (keyring->restrict_link) { ret = -EEXIST; - else if (keyring_detect_restriction_cycle(keyring, restrict_link)) + } else if (keyring_detect_restriction_cycle(keyring, restrict_link)) { ret = -EDEADLK; - else + } else { keyring->restrict_link = restrict_link; + notify_key(keyring, NOTIFY_KEY_SETATTR, 0); + } up_write(&keyring_serialise_restrict_sem); up_write(&keyring->sem); @@ -1362,12 +1364,14 @@ int __key_link_check_live_key(struct key *keyring, struct key *key) * holds at most one link to any given key of a particular type+description * combination. */ -void __key_link(struct key *key, struct assoc_array_edit **_edit) +void __key_link(struct key *keyring, struct key *key, + struct assoc_array_edit **_edit) { __key_get(key); assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key)); assoc_array_apply_edit(*_edit); *_edit = NULL; + notify_key(keyring, NOTIFY_KEY_LINKED, key_serial(key)); } /* @@ -1451,7 +1455,7 @@ int key_link(struct key *keyring, struct key *key) if (ret == 0) ret = __key_link_check_live_key(keyring, key); if (ret == 0) - __key_link(key, &edit); + __key_link(keyring, key, &edit); error_end: __key_link_end(keyring, &key->index_key, edit); @@ -1483,7 +1487,7 @@ static int __key_unlink_begin(struct key *keyring, struct key *key, struct assoc_array_edit *edit; BUG_ON(*_edit != NULL); - + edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops, &key->index_key); if (IS_ERR(edit)) @@ -1503,6 +1507,7 @@ static void __key_unlink(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { assoc_array_apply_edit(*_edit); + notify_key(keyring, NOTIFY_KEY_UNLINKED, key_serial(key)); *_edit = NULL; key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } @@ -1621,7 +1626,7 @@ int key_move(struct key *key, goto error; __key_unlink(from_keyring, key, &from_edit); - __key_link(key, &to_edit); + __key_link(to_keyring, key, &to_edit); error: __key_link_end(to_keyring, &key->index_key, to_edit); __key_unlink_end(from_keyring, key, from_edit); @@ -1655,6 +1660,7 @@ int keyring_clear(struct key *keyring) } else { if (edit) assoc_array_apply_edit(edit); + notify_key(keyring, NOTIFY_KEY_CLEARED, 0); key_payload_reserve(keyring, 0); ret = 0; } diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 957b9e3e14924d..e1b9f1a80676e5 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -418,7 +418,7 @@ static int construct_alloc_key(struct keyring_search_context *ctx, goto key_already_present; if (dest_keyring) - __key_link(key, &edit); + __key_link(dest_keyring, key, &edit); mutex_unlock(&key_construction_mutex); if (dest_keyring) @@ -437,7 +437,7 @@ static int construct_alloc_key(struct keyring_search_context *ctx, if (dest_keyring) { ret = __key_link_check_live_key(dest_keyring, key); if (ret == 0) - __key_link(key, &edit); + 
__key_link(dest_keyring, key, &edit);
 		__key_link_end(dest_keyring, &ctx->index_key, edit);
 		if (ret < 0)
 			goto link_check_failed;

From f5b5a164f9a11aab5b225f082b33a8f03c07516c Mon Sep 17 00:00:00 2001
From: David Howells
Date: Tue, 14 Jan 2020 17:07:11 +0000
Subject: [PATCH 025/190] Add sample notification program

The sample program is run like:

	./samples/watch_queue/watch_test

and watches the current session keyring for key changes:

	# keyctl add user a a @s
	1035096409
	# keyctl unlink 1035096409 @s

producing:

	# ./watch_test
	read() = 16
	NOTIFY[000]: ty=000001 sy=02 i=00000110
	KEY 2ffc2e5d change=2[linked] aux=1035096409
	read() = 16
	NOTIFY[000]: ty=000001 sy=02 i=00000110
	KEY 2ffc2e5d change=3[unlinked] aux=1035096409

Other events may be produced if further notification sources are wired up
(none are added by this patch), such as with a failing disk:

	read() = 22
	NOTIFY[000]: ty=000003 sy=02 i=00000416
	USB 3-7.7 dev-reset e=0 r=0
	read() = 24
	NOTIFY[000]: ty=000002 sy=06 i=00000418
	BLOCK 00800050 e=6[critical medium] s=64000ef8

This corresponds to:

	blk_update_request: critical medium error, dev sdf, sector 1677725432 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0

in dmesg.

Signed-off-by: David Howells
---
 samples/Kconfig                  |   6 +
 samples/Makefile                 |   1 +
 samples/watch_queue/Makefile     |   7 ++
 samples/watch_queue/watch_test.c | 183 +++++++++++++++++++++++++++++++
 4 files changed, 197 insertions(+)
 create mode 100644 samples/watch_queue/Makefile
 create mode 100644 samples/watch_queue/watch_test.c

diff --git a/samples/Kconfig b/samples/Kconfig
index 9d236c346de506..5c31971a5745c8 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -190,5 +190,11 @@ config SAMPLE_INTEL_MEI
 	help
 	  Build a sample program to work with mei device.
 
+config SAMPLE_WATCH_QUEUE
+	bool "Build example watch_queue notification consumer"
+	depends on HEADERS_INSTALL
+	help
+	  Build example userspace program to use the KEYCTL_WATCH_KEY
+	  keyctl() function and general notification queues.
 
 endif # SAMPLES
diff --git a/samples/Makefile b/samples/Makefile
index f8f847b4f61f55..8617edf7f19a30 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/
 obj-y					+= vfio-mdev/
 subdir-$(CONFIG_SAMPLE_VFS)		+= vfs
 obj-$(CONFIG_SAMPLE_INTEL_MEI)		+= mei/
+subdir-$(CONFIG_SAMPLE_WATCH_QUEUE)	+= watch_queue
diff --git a/samples/watch_queue/Makefile b/samples/watch_queue/Makefile
new file mode 100644
index 00000000000000..8511fb6c53d2b7
--- /dev/null
+++ b/samples/watch_queue/Makefile
@@ -0,0 +1,7 @@
+# List of programs to build
+hostprogs := watch_test
+
+# Tell kbuild to always build the programs
+always-y := $(hostprogs)
+
+HOSTCFLAGS_watch_test.o += -I$(objtree)/usr/include
diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c
new file mode 100644
index 00000000000000..4a311790d4ca18
--- /dev/null
+++ b/samples/watch_queue/watch_test.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Use a watch_queue notification pipe to watch for notifications.
+ *
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com) + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef KEYCTL_WATCH_KEY +#define KEYCTL_WATCH_KEY -1 +#endif +#ifndef __NR_keyctl +#define __NR_keyctl -1 +#endif + +#define BUF_SIZE 256 + +static long keyctl_watch_key(int key, int watch_fd, int watch_id) +{ + return syscall(__NR_keyctl, KEYCTL_WATCH_KEY, key, watch_fd, watch_id); +} + +static const char *key_subtypes[256] = { + [NOTIFY_KEY_INSTANTIATED] = "instantiated", + [NOTIFY_KEY_UPDATED] = "updated", + [NOTIFY_KEY_LINKED] = "linked", + [NOTIFY_KEY_UNLINKED] = "unlinked", + [NOTIFY_KEY_CLEARED] = "cleared", + [NOTIFY_KEY_REVOKED] = "revoked", + [NOTIFY_KEY_INVALIDATED] = "invalidated", + [NOTIFY_KEY_SETATTR] = "setattr", +}; + +static void saw_key_change(struct watch_notification *n, size_t len) +{ + struct key_notification *k = (struct key_notification *)n; + + if (len != sizeof(struct key_notification)) { + fprintf(stderr, "Incorrect key message length\n"); + return; + } + + printf("KEY %08x change=%u[%s] aux=%u\n", + k->key_id, n->subtype, key_subtypes[n->subtype], k->aux); +} + +/* + * Consume and display events. + */ +static void consumer(int fd) +{ + unsigned char buffer[4096], *p, *end; + union { + struct watch_notification n; + unsigned char buf1[128]; + } n; + ssize_t buf_len; + + for (;;) { + buf_len = read(fd, buffer, sizeof(buffer)); + if (buf_len == -1) { + perror("read"); + exit(1); + } + + if (buf_len == 0) { + printf("-- END --\n"); + return; + } + + if (buf_len > sizeof(buffer)) { + fprintf(stderr, "Read buffer overrun: %zd\n", buf_len); + return; + } + + printf("read() = %zd\n", buf_len); + + p = buffer; + end = buffer + buf_len; + while (p < end) { + size_t largest, len; + + largest = end - p; + if (largest > 128) + largest = 128; + if (largest < sizeof(struct watch_notification)) { + fprintf(stderr, "Short message header: %zu\n", largest); + return; + } + memcpy(&n, p, largest); + + printf("NOTIFY[%03zx]: ty=%06x sy=%02x i=%08x\n", + p - buffer, n.n.type, n.n.subtype, n.n.info); + + len = n.n.info & WATCH_INFO_LENGTH; + if (len < sizeof(n.n) || len > largest) { + fprintf(stderr, "Bad message length: %zu/%zu\n", len, largest); + exit(1); + } + + switch (n.n.type) { + case WATCH_TYPE_META: + switch (n.n.subtype) { + case WATCH_META_REMOVAL_NOTIFICATION: + printf("REMOVAL of watchpoint %08x\n", + (n.n.info & WATCH_INFO_ID) >> + WATCH_INFO_ID__SHIFT); + break; + default: + printf("other meta record\n"); + break; + } + break; + case WATCH_TYPE_KEY_NOTIFY: + saw_key_change(&n.n, len); + break; + default: + printf("other type\n"); + break; + } + + p += len; + } + } +} + +static struct watch_notification_filter filter = { + .nr_filters = 1, + .filters = { + [0] = { + .type = WATCH_TYPE_KEY_NOTIFY, + .subtype_filter[0] = UINT_MAX, + }, + }, +}; + +int main(int argc, char **argv) +{ + int pipefd[2], fd; + + if (pipe2(pipefd, O_NOTIFICATION_PIPE) == -1) { + perror("pipe2"); + exit(1); + } + fd = pipefd[0]; + + if (ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, BUF_SIZE) == -1) { + perror("watch_queue(size)"); + exit(1); + } + + if (ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) == -1) { + perror("watch_queue(filter)"); + exit(1); + } + + if (keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fd, 0x01) == -1) { + perror("keyctl"); + exit(1); + } + + if (keyctl_watch_key(KEY_SPEC_USER_KEYRING, fd, 0x02) == -1) { + perror("keyctl"); + exit(1); + } + + consumer(fd); + exit(0); +} From 
8cfba76383e902acbed95092163052b1572f17a8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:11 +0000 Subject: [PATCH 026/190] pipe: Allow buffers to be marked read-whole-or-error for notifications Allow a buffer to be marked such that read() must return the entire buffer in one go or return ENOBUFS. Multiple buffers can be amalgamated into a single read, but a short read will occur if the next "whole" buffer won't fit. This is useful for watch queue notifications to make sure we don't split a notification across multiple reads, especially given that we need to fabricate an overrun record under some circumstances - and that isn't in the buffers. Signed-off-by: David Howells --- fs/pipe.c | 8 +++++++- include/linux/pipe_fs_i.h | 1 + kernel/watch_queue.c | 2 +- samples/watch_queue/watch_test.c | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index da9bc1f21fd19a..0f9fd897ceb562 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -320,8 +320,14 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) size_t written; int error; - if (chars > total_len) + if (chars > total_len) { + if (buf->flags & PIPE_BUF_FLAG_WHOLE) { + if (ret == 0) + ret = -ENOBUFS; + break; + } chars = total_len; + } error = pipe_buf_confirm(pipe, buf); if (error) { diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 1d3eaa233f4a85..eaff59a2f07433 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -8,6 +8,7 @@ #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ #define PIPE_BUF_FLAG_PACKET 0x08 /* read() as a packet */ +#define PIPE_BUF_FLAG_WHOLE 0x10 /* read() must return entire buffer or error */ /** * struct pipe_buffer - a linux kernel pipe buffer diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index c103e34f870568..ad64ea300f6df3 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -115,7 +115,7 @@ static bool post_one_notification(struct watch_queue *wqueue, buf->ops = &watch_queue_pipe_buf_ops; buf->offset = offset; buf->len = len; - buf->flags = 0; + buf->flags = PIPE_BUF_FLAG_WHOLE; pipe->head = head + 1; if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c index 4a311790d4ca18..8628b4c5d56765 100644 --- a/samples/watch_queue/watch_test.c +++ b/samples/watch_queue/watch_test.c @@ -63,7 +63,7 @@ static void saw_key_change(struct watch_notification *n, size_t len) */ static void consumer(int fd) { - unsigned char buffer[4096], *p, *end; + unsigned char buffer[433], *p, *end; union { struct watch_notification n; unsigned char buf1[128]; From e7d553d69cf63aec7de0f38fed49ccbb30922e1e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:12 +0000 Subject: [PATCH 027/190] pipe: Add notification lossage handling Add handling for loss of notifications by having read() insert a loss-notification message after it has read the pipe buffer that was last in the ring when the loss occurred. Lossage can come about either by running out of notification descriptors or by running out of space in the pipe ring. 
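[ ed: For reference, a sketch of the fabricated record as a reader would see
  it, built from the constants used in the diff below; the variable name is
  illustrative only: ]

	/* What read() synthesises at the loss point: a bare meta record
	 * with no type-specific payload and no watchpoint ID bits set in
	 * the info field.
	 */
	struct watch_notification loss = {
		.type    = WATCH_TYPE_META,
		.subtype = WATCH_META_LOSS_NOTIFICATION,
		.info    = watch_sizeof(loss),	/* length field only */
	};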
Signed-off-by: David Howells --- fs/pipe.c | 28 ++++++++++++++++++++++++++++ include/linux/pipe_fs_i.h | 7 +++++++ kernel/watch_queue.c | 2 ++ samples/watch_queue/watch_test.c | 3 +++ 4 files changed, 40 insertions(+) diff --git a/fs/pipe.c b/fs/pipe.c index 0f9fd897ceb562..620a113f92eb95 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -314,6 +314,30 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; +#ifdef CONFIG_WATCH_QUEUE + if (pipe->note_loss) { + struct watch_notification n; + + if (total_len < 8) { + if (ret == 0) + ret = -ENOBUFS; + break; + } + + n.type = WATCH_TYPE_META; + n.subtype = WATCH_META_LOSS_NOTIFICATION; + n.info = watch_sizeof(n); + if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) { + if (ret == 0) + ret = -EFAULT; + break; + } + ret += sizeof(n); + total_len -= sizeof(n); + pipe->note_loss = false; + } +#endif + if (!pipe_empty(head, tail)) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t chars = buf->len; @@ -355,6 +379,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (!buf->len) { pipe_buf_release(pipe, buf); spin_lock_irq(&pipe->rd_wait.lock); +#ifdef CONFIG_WATCH_QUEUE + if (buf->flags & PIPE_BUF_FLAG_LOSS) + pipe->note_loss = true; +#endif tail++; pipe->tail = tail; spin_unlock_irq(&pipe->rd_wait.lock); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index eaff59a2f07433..6626f511de6fba 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -9,6 +9,9 @@ #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ #define PIPE_BUF_FLAG_PACKET 0x08 /* read() as a packet */ #define PIPE_BUF_FLAG_WHOLE 0x10 /* read() must return entire buffer or error */ +#ifdef CONFIG_WATCH_QUEUE +#define PIPE_BUF_FLAG_LOSS 0x20 /* Message loss happened after this buffer */ +#endif /** * struct pipe_buffer - a linux kernel pipe buffer @@ -34,6 +37,7 @@ struct pipe_buffer { * @wr_wait: writer wait point in case of full pipe * @head: The point of buffer production * @tail: The point of buffer consumption + * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs @@ -56,6 +60,9 @@ struct pipe_inode_info { unsigned int tail; unsigned int max_usage; unsigned int ring_size; +#ifdef CONFIG_WATCH_QUEUE + bool note_loss; +#endif unsigned int nr_accounted; unsigned int readers; unsigned int writers; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index ad64ea300f6df3..9a9699c06709b1 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -132,6 +132,8 @@ static bool post_one_notification(struct watch_queue *wqueue, return done; lost: + buf = &pipe->bufs[(head - 1) & mask]; + buf->flags |= PIPE_BUF_FLAG_LOSS; goto out; } diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c index 8628b4c5d56765..46e618a897fef9 100644 --- a/samples/watch_queue/watch_test.c +++ b/samples/watch_queue/watch_test.c @@ -120,6 +120,9 @@ static void consumer(int fd) (n.n.info & WATCH_INFO_ID) >> WATCH_INFO_ID__SHIFT); break; + case WATCH_META_LOSS_NOTIFICATION: + printf("-- LOSS --\n"); + break; default: printf("other meta record\n"); break; From 8c0637e950d68933a67f7438f779d79b049b5e5c Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2020 15:16:29 +0100 Subject: [PATCH 028/190] keys: Make the KEY_NEED_* perms an enum rather than a 
mask Since the meaning of combining the KEY_NEED_* constants is undefined, make it so that you can't do that by turning them into an enum. The enum is also given some extra values to represent special circumstances, such as: (1) The '0' value is reserved and causes a warning to trap the parameter being unset. (2) The key is to be unlinked and we require no permissions on it, only the keyring, (this replaces the KEY_LOOKUP_FOR_UNLINK flag). (3) An override due to CAP_SYS_ADMIN. (4) An override due to an instantiation token being present. (5) The permissions check is being deferred to later key_permission() calls. The extra values give the opportunity for LSMs to audit these situations. [Note: This really needs overhauling so that lookup_user_key() tells key_task_permission() and the LSM what operation is being done and leaves it to those functions to decide how to map that onto the available permits. However, I don't really want to make these change in the middle of the notifications patchset.] Signed-off-by: David Howells cc: Jarkko Sakkinen cc: Paul Moore cc: Stephen Smalley cc: Casey Schaufler cc: keyrings@vger.kernel.org cc: selinux@vger.kernel.org --- include/linux/key.h | 30 +++++++++++++---------- include/linux/security.h | 6 ++--- security/keys/internal.h | 8 +++---- security/keys/keyctl.c | 16 +++++++------ security/keys/permission.c | 31 ++++++++++++++++++------ security/keys/process_keys.c | 46 +++++++++++++++++------------------- security/security.c | 6 ++--- security/selinux/hooks.c | 37 +++++++++++++++++++++++------ security/smack/smack_lsm.c | 29 +++++++++++++++++------ 9 files changed, 135 insertions(+), 74 deletions(-) diff --git a/include/linux/key.h b/include/linux/key.h index b99b40db08fc69..0f2e24f13c2bdf 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -71,6 +71,23 @@ struct net; #define KEY_PERM_UNDEF 0xffffffff +/* + * The permissions required on a key that we're looking up. + */ +enum key_need_perm { + KEY_NEED_UNSPECIFIED, /* Needed permission unspecified */ + KEY_NEED_VIEW, /* Require permission to view attributes */ + KEY_NEED_READ, /* Require permission to read content */ + KEY_NEED_WRITE, /* Require permission to update / modify */ + KEY_NEED_SEARCH, /* Require permission to search (keyring) or find (key) */ + KEY_NEED_LINK, /* Require permission to link */ + KEY_NEED_SETATTR, /* Require permission to change attributes */ + KEY_NEED_UNLINK, /* Require permission to unlink key */ + KEY_SYSADMIN_OVERRIDE, /* Special: override by CAP_SYS_ADMIN */ + KEY_AUTHTOKEN_OVERRIDE, /* Special: override by possession of auth token */ + KEY_DEFER_PERM_CHECK, /* Special: permission check is deferred */ +}; + struct seq_file; struct user_struct; struct signal_struct; @@ -420,20 +437,9 @@ static inline key_serial_t key_serial(const struct key *key) extern void key_set_timeout(struct key *, unsigned); extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags, - key_perm_t perm); + enum key_need_perm need_perm); extern void key_free_user_ns(struct user_namespace *); -/* - * The permissions required on a key that we're looking up. 
- */ -#define KEY_NEED_VIEW 0x01 /* Require permission to view attributes */ -#define KEY_NEED_READ 0x02 /* Require permission to read content */ -#define KEY_NEED_WRITE 0x04 /* Require permission to update / modify */ -#define KEY_NEED_SEARCH 0x08 /* Require permission to search (keyring) or find (key) */ -#define KEY_NEED_LINK 0x10 /* Require permission to link */ -#define KEY_NEED_SETATTR 0x20 /* Require permission to change attributes */ -#define KEY_NEED_ALL 0x3f /* All the above permissions */ - static inline short key_read_state(const struct key *key) { /* Barrier versus mark_key_instantiated(). */ diff --git a/include/linux/security.h b/include/linux/security.h index e7914e4e0b0266..57aac14e341861 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1767,8 +1767,8 @@ static inline int security_path_chroot(const struct path *path) int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags); void security_key_free(struct key *key); -int security_key_permission(key_ref_t key_ref, - const struct cred *cred, unsigned perm); +int security_key_permission(key_ref_t key_ref, const struct cred *cred, + enum key_need_perm need_perm); int security_key_getsecurity(struct key *key, char **_buffer); #else @@ -1786,7 +1786,7 @@ static inline void security_key_free(struct key *key) static inline int security_key_permission(key_ref_t key_ref, const struct cred *cred, - unsigned perm) + enum key_need_perm need_perm) { return 0; } diff --git a/security/keys/internal.h b/security/keys/internal.h index 28e17f4f3328fb..1fc17cb317a998 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -167,7 +167,6 @@ extern bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data); #define KEY_LOOKUP_CREATE 0x01 #define KEY_LOOKUP_PARTIAL 0x02 -#define KEY_LOOKUP_FOR_UNLINK 0x04 extern long join_session_keyring(const char *name); extern void key_change_session_keyring(struct callback_head *twork); @@ -183,7 +182,7 @@ extern void key_gc_keytype(struct key_type *ktype); extern int key_task_permission(const key_ref_t key_ref, const struct cred *cred, - key_perm_t perm); + enum key_need_perm need_perm); static inline void notify_key(struct key *key, enum key_notification_subtype subtype, u32 aux) @@ -205,9 +204,10 @@ static inline void notify_key(struct key *key, /* * Check to see whether permission is granted to use a key in the desired way. 
*/ -static inline int key_permission(const key_ref_t key_ref, unsigned perm) +static inline int key_permission(const key_ref_t key_ref, + enum key_need_perm need_perm) { - return key_task_permission(key_ref, current_cred(), perm); + return key_task_permission(key_ref, current_cred(), need_perm); } extern struct key_type key_type_request_key_auth; diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 7d8de1c9a47816..6763ee45e04d1e 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -434,7 +434,7 @@ long keyctl_invalidate_key(key_serial_t id) /* Root is permitted to invalidate certain special keys */ if (capable(CAP_SYS_ADMIN)) { - key_ref = lookup_user_key(id, 0, 0); + key_ref = lookup_user_key(id, 0, KEY_SYSADMIN_OVERRIDE); if (IS_ERR(key_ref)) goto error; if (test_bit(KEY_FLAG_ROOT_CAN_INVAL, @@ -479,7 +479,8 @@ long keyctl_keyring_clear(key_serial_t ringid) /* Root is permitted to invalidate certain special keyrings */ if (capable(CAP_SYS_ADMIN)) { - keyring_ref = lookup_user_key(ringid, 0, 0); + keyring_ref = lookup_user_key(ringid, 0, + KEY_SYSADMIN_OVERRIDE); if (IS_ERR(keyring_ref)) goto error; if (test_bit(KEY_FLAG_ROOT_CAN_CLEAR, @@ -563,7 +564,7 @@ long keyctl_keyring_unlink(key_serial_t id, key_serial_t ringid) goto error; } - key_ref = lookup_user_key(id, KEY_LOOKUP_FOR_UNLINK, 0); + key_ref = lookup_user_key(id, KEY_LOOKUP_PARTIAL, KEY_NEED_UNLINK); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; @@ -663,7 +664,7 @@ long keyctl_describe_key(key_serial_t keyid, key_put(instkey); key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, - 0); + KEY_AUTHTOKEN_OVERRIDE); if (!IS_ERR(key_ref)) goto okay; } @@ -833,7 +834,7 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) size_t key_data_len; /* find the key first */ - key_ref = lookup_user_key(keyid, 0, 0); + key_ref = lookup_user_key(keyid, 0, KEY_DEFER_PERM_CHECK); if (IS_ERR(key_ref)) { ret = -ENOKEY; goto out; @@ -1471,7 +1472,7 @@ long keyctl_set_timeout(key_serial_t id, unsigned timeout) key_put(instkey); key_ref = lookup_user_key(id, KEY_LOOKUP_PARTIAL, - 0); + KEY_AUTHTOKEN_OVERRIDE); if (!IS_ERR(key_ref)) goto okay; } @@ -1579,7 +1580,8 @@ long keyctl_get_security(key_serial_t keyid, return PTR_ERR(instkey); key_put(instkey); - key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, 0); + key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, + KEY_AUTHTOKEN_OVERRIDE); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); } diff --git a/security/keys/permission.c b/security/keys/permission.c index 085f907b64ac5c..4a61f804e80f61 100644 --- a/security/keys/permission.c +++ b/security/keys/permission.c @@ -13,7 +13,7 @@ * key_task_permission - Check a key can be used * @key_ref: The key to check. * @cred: The credentials to use. - * @perm: The permissions to check for. + * @need_perm: The permission required. * * Check to see whether permission is granted to use a key in the desired way, * but permit the security modules to override. @@ -24,12 +24,30 @@ * permissions bits or the LSM check. 
*/ int key_task_permission(const key_ref_t key_ref, const struct cred *cred, - unsigned perm) + enum key_need_perm need_perm) { struct key *key; - key_perm_t kperm; + key_perm_t kperm, mask; int ret; + switch (need_perm) { + default: + WARN_ON(1); + return -EACCES; + case KEY_NEED_UNLINK: + case KEY_SYSADMIN_OVERRIDE: + case KEY_AUTHTOKEN_OVERRIDE: + case KEY_DEFER_PERM_CHECK: + goto lsm; + + case KEY_NEED_VIEW: mask = KEY_OTH_VIEW; break; + case KEY_NEED_READ: mask = KEY_OTH_READ; break; + case KEY_NEED_WRITE: mask = KEY_OTH_WRITE; break; + case KEY_NEED_SEARCH: mask = KEY_OTH_SEARCH; break; + case KEY_NEED_LINK: mask = KEY_OTH_LINK; break; + case KEY_NEED_SETATTR: mask = KEY_OTH_SETATTR; break; + } + key = key_ref_to_ptr(key_ref); /* use the second 8-bits of permissions for keys the caller owns */ @@ -64,13 +82,12 @@ int key_task_permission(const key_ref_t key_ref, const struct cred *cred, if (is_key_possessed(key_ref)) kperm |= key->perm >> 24; - kperm = kperm & perm & KEY_NEED_ALL; - - if (kperm != perm) + if ((kperm & mask) != mask) return -EACCES; /* let LSM be the final arbiter */ - return security_key_permission(key_ref, cred, perm); +lsm: + return security_key_permission(key_ref, cred, need_perm); } EXPORT_SYMBOL(key_task_permission); diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 09541de31f2f14..7e0232db1707e5 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -609,7 +609,7 @@ bool lookup_user_key_possessed(const struct key *key, * returned key reference. */ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, - key_perm_t perm) + enum key_need_perm need_perm) { struct keyring_search_context ctx = { .match_data.cmp = lookup_user_key_possessed, @@ -773,35 +773,33 @@ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, /* unlink does not use the nominated key in any way, so can skip all * the permission checks as it is only concerned with the keyring */ - if (lflags & KEY_LOOKUP_FOR_UNLINK) { - ret = 0; - goto error; - } - - if (!(lflags & KEY_LOOKUP_PARTIAL)) { - ret = wait_for_key_construction(key, true); - switch (ret) { - case -ERESTARTSYS: - goto invalid_key; - default: - if (perm) + if (need_perm != KEY_NEED_UNLINK) { + if (!(lflags & KEY_LOOKUP_PARTIAL)) { + ret = wait_for_key_construction(key, true); + switch (ret) { + case -ERESTARTSYS: + goto invalid_key; + default: + if (need_perm != KEY_AUTHTOKEN_OVERRIDE && + need_perm != KEY_DEFER_PERM_CHECK) + goto invalid_key; + case 0: + break; + } + } else if (need_perm != KEY_DEFER_PERM_CHECK) { + ret = key_validate(key); + if (ret < 0) goto invalid_key; - case 0: - break; } - } else if (perm) { - ret = key_validate(key); - if (ret < 0) + + ret = -EIO; + if (!(lflags & KEY_LOOKUP_PARTIAL) && + key_read_state(key) == KEY_IS_UNINSTANTIATED) goto invalid_key; } - ret = -EIO; - if (!(lflags & KEY_LOOKUP_PARTIAL) && - key_read_state(key) == KEY_IS_UNINSTANTIATED) - goto invalid_key; - /* check the permissions */ - ret = key_task_permission(key_ref, ctx.cred, perm); + ret = key_task_permission(key_ref, ctx.cred, need_perm); if (ret < 0) goto invalid_key; diff --git a/security/security.c b/security/security.c index c73334ab2882bc..af32d4cd0462a3 100644 --- a/security/security.c +++ b/security/security.c @@ -2398,10 +2398,10 @@ void security_key_free(struct key *key) call_void_hook(key_free, key); } -int security_key_permission(key_ref_t key_ref, - const struct cred *cred, unsigned perm) +int security_key_permission(key_ref_t key_ref, const struct cred 
*cred, + enum key_need_perm need_perm) { - return call_int_hook(key_permission, 0, key_ref, cred, perm); + return call_int_hook(key_permission, 0, key_ref, cred, need_perm); } int security_key_getsecurity(struct key *key, char **_buffer) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 4c037c2545c161..196acaccbfddf5 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6561,20 +6561,43 @@ static void selinux_key_free(struct key *k) static int selinux_key_permission(key_ref_t key_ref, const struct cred *cred, - unsigned perm) + enum key_need_perm need_perm) { struct key *key; struct key_security_struct *ksec; - u32 sid; + u32 perm, sid; - /* if no specific permissions are requested, we skip the - permission check. No serious, additional covert channels - appear to be created. */ - if (perm == 0) + switch (need_perm) { + case KEY_NEED_VIEW: + perm = KEY__VIEW; + break; + case KEY_NEED_READ: + perm = KEY__READ; + break; + case KEY_NEED_WRITE: + perm = KEY__WRITE; + break; + case KEY_NEED_SEARCH: + perm = KEY__SEARCH; + break; + case KEY_NEED_LINK: + perm = KEY__LINK; + break; + case KEY_NEED_SETATTR: + perm = KEY__SETATTR; + break; + case KEY_NEED_UNLINK: + case KEY_SYSADMIN_OVERRIDE: + case KEY_AUTHTOKEN_OVERRIDE: + case KEY_DEFER_PERM_CHECK: return 0; + default: + WARN_ON(1); + return -EPERM; - sid = cred_sid(cred); + } + sid = cred_sid(cred); key = key_ref_to_ptr(key_ref); ksec = key->security; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 8c61d175e19543..0d6bb53efe745f 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4230,13 +4230,14 @@ static void smack_key_free(struct key *key) * smack_key_permission - Smack access on a key * @key_ref: gets to the object * @cred: the credentials to use - * @perm: requested key permissions + * @need_perm: requested key permission * * Return 0 if the task has read and write to the object, * an error code otherwise */ static int smack_key_permission(key_ref_t key_ref, - const struct cred *cred, unsigned perm) + const struct cred *cred, + enum key_need_perm need_perm) { struct key *keyp; struct smk_audit_info ad; @@ -4247,8 +4248,26 @@ static int smack_key_permission(key_ref_t key_ref, /* * Validate requested permissions */ - if (perm & ~KEY_NEED_ALL) + switch (need_perm) { + case KEY_NEED_READ: + case KEY_NEED_SEARCH: + case KEY_NEED_VIEW: + request |= MAY_READ; + break; + case KEY_NEED_WRITE: + case KEY_NEED_LINK: + case KEY_NEED_SETATTR: + request |= MAY_WRITE; + break; + case KEY_NEED_UNSPECIFIED: + case KEY_NEED_UNLINK: + case KEY_SYSADMIN_OVERRIDE: + case KEY_AUTHTOKEN_OVERRIDE: + case KEY_DEFER_PERM_CHECK: + return 0; + default: return -EINVAL; + } keyp = key_ref_to_ptr(key_ref); if (keyp == NULL) @@ -4273,10 +4292,6 @@ static int smack_key_permission(key_ref_t key_ref, ad.a.u.key_struct.key = keyp->serial; ad.a.u.key_struct.key_desc = keyp->description; #endif - if (perm & (KEY_NEED_READ | KEY_NEED_SEARCH | KEY_NEED_VIEW)) - request |= MAY_READ; - if (perm & (KEY_NEED_WRITE | KEY_NEED_LINK | KEY_NEED_SETATTR)) - request |= MAY_WRITE; rc = smk_access(tkp, keyp->security, request, &ad); rc = smk_bu_note("key access", tkp, keyp->security, request, rc); return rc; From 3e412ccc22e25666772094fb5ca01af056c54471 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:13 +0000 Subject: [PATCH 029/190] selinux: Implement the watch_key security hook Implement the watch_key security hook to make sure that a key grants the caller View permission 
in order to set a watch on a key. For the moment, the watch_devices security hook is left unimplemented as it's not obvious what the object should be since the queue is global and didn't previously exist. Signed-off-by: David Howells Acked-by: Stephen Smalley Reviewed-by: James Morris --- security/selinux/hooks.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 196acaccbfddf5..5b3191bd6130bf 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6619,6 +6619,17 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer) *_buffer = context; return rc; } + +#ifdef CONFIG_KEY_NOTIFICATIONS +static int selinux_watch_key(struct key *key) +{ + struct key_security_struct *ksec = key->security; + u32 sid = current_sid(); + + return avc_has_perm(&selinux_state, + sid, ksec->sid, SECCLASS_KEY, KEY__VIEW, NULL); +} +#endif #endif #ifdef CONFIG_SECURITY_INFINIBAND @@ -7134,6 +7145,9 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(key_free, selinux_key_free), LSM_HOOK_INIT(key_permission, selinux_key_permission), LSM_HOOK_INIT(key_getsecurity, selinux_key_getsecurity), +#ifdef CONFIG_KEY_NOTIFICATIONS + LSM_HOOK_INIT(watch_key, selinux_watch_key), +#endif #endif #ifdef CONFIG_AUDIT From a8478a602913dc89a7cd2060e613edecd07e1dbd Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:13 +0000 Subject: [PATCH 030/190] smack: Implement the watch_key and post_notification hooks Implement the watch_key security hook in Smack to make sure that a key grants the caller Read permission in order to set a watch on a key. Also implement the post_notification security hook to make sure that the notification source is granted Write permission by the watch queue. For the moment, the watch_devices security hook is left unimplemented as it's not obvious what the object should be since the queue is global and didn't previously exist. Signed-off-by: David Howells Acked-by: Casey Schaufler --- include/linux/lsm_audit.h | 1 + security/smack/smack_lsm.c | 83 +++++++++++++++++++++++++++++++++++++- 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 99d629fd994457..28f23b341c1ca4 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -75,6 +75,7 @@ struct common_audit_data { #define LSM_AUDIT_DATA_IBPKEY 13 #define LSM_AUDIT_DATA_IBENDPORT 14 #define LSM_AUDIT_DATA_LOCKDOWN 15 +#define LSM_AUDIT_DATA_NOTIFICATION 16 union { struct path path; struct dentry *dentry; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 0d6bb53efe745f..3c4d4836da4a0f 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "smack.h" #define TRANS_TRUE "TRUE" @@ -4284,7 +4285,7 @@ static int smack_key_permission(key_ref_t key_ref, if (tkp == NULL) return -EACCES; - if (smack_privileged_cred(CAP_MAC_OVERRIDE, cred)) + if (smack_privileged(CAP_MAC_OVERRIDE)) return 0; #ifdef CONFIG_AUDIT @@ -4326,8 +4327,81 @@ static int smack_key_getsecurity(struct key *key, char **_buffer) return length; } + +#ifdef CONFIG_KEY_NOTIFICATIONS +/** + * smack_watch_key - Smack access to watch a key for notifications. + * @key: The key to be watched + * + * Return 0 if the @watch->cred has permission to read from the key object and + * an error otherwise. 
+ */ +static int smack_watch_key(struct key *key) +{ + struct smk_audit_info ad; + struct smack_known *tkp = smk_of_current(); + int rc; + + if (key == NULL) + return -EINVAL; + /* + * If the key hasn't been initialized give it access so that + * it may do so. + */ + if (key->security == NULL) + return 0; + /* + * This should not occur + */ + if (tkp == NULL) + return -EACCES; + + if (smack_privileged_cred(CAP_MAC_OVERRIDE, current_cred())) + return 0; + +#ifdef CONFIG_AUDIT + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY); + ad.a.u.key_struct.key = key->serial; + ad.a.u.key_struct.key_desc = key->description; +#endif + rc = smk_access(tkp, key->security, MAY_READ, &ad); + rc = smk_bu_note("key watch", tkp, key->security, MAY_READ, rc); + return rc; +} +#endif /* CONFIG_KEY_NOTIFICATIONS */ #endif /* CONFIG_KEYS */ +#ifdef CONFIG_WATCH_QUEUE +/** + * smack_post_notification - Smack access to post a notification to a queue + * @w_cred: The credentials of the watcher. + * @cred: The credentials of the event source (may be NULL). + * @n: The notification message to be posted. + */ +static int smack_post_notification(const struct cred *w_cred, + const struct cred *cred, + struct watch_notification *n) +{ + struct smk_audit_info ad; + struct smack_known *subj, *obj; + int rc; + + /* Always let maintenance notifications through. */ + if (n->type == WATCH_TYPE_META) + return 0; + + if (!cred) + return 0; + subj = smk_of_task(smack_cred(cred)); + obj = smk_of_task(smack_cred(w_cred)); + + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NOTIFICATION); + rc = smk_access(subj, obj, MAY_WRITE, &ad); + rc = smk_bu_note("notification", subj, obj, MAY_WRITE, rc); + return rc; +} +#endif /* CONFIG_WATCH_QUEUE */ + /* * Smack Audit hooks * @@ -4716,8 +4790,15 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(key_free, smack_key_free), LSM_HOOK_INIT(key_permission, smack_key_permission), LSM_HOOK_INIT(key_getsecurity, smack_key_getsecurity), +#ifdef CONFIG_KEY_NOTIFICATIONS + LSM_HOOK_INIT(watch_key, smack_watch_key), +#endif #endif /* CONFIG_KEYS */ +#ifdef CONFIG_WATCH_QUEUE + LSM_HOOK_INIT(post_notification, smack_post_notification), +#endif + /* Audit hooks */ #ifdef CONFIG_AUDIT LSM_HOOK_INIT(audit_rule_init, smack_audit_rule_init), From a4e91825d7e1252f7cba005f1451e5464b23c15d Mon Sep 17 00:00:00 2001 From: Alexander Monakov Date: Sun, 10 May 2020 20:48:40 +0000 Subject: [PATCH 031/190] x86/amd_nb: Add AMD family 17h model 60h PCI IDs Add PCI IDs for AMD Renoir (4000-series Ryzen CPUs). This is necessary to enable support for temperature sensors via the k10temp module. 
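[ Illustration, not part of the patch: roughly how a driver advertises PCI device IDs so the PCI core can bind it. The table name here is made up; 0x144b is the model 60h DF function 3 ID that these patches add as PCI_DEVICE_ID_AMD_17H_M60H_DF_F3.

#include <linux/module.h>
#include <linux/pci.h>

/* Match the family 17h model 60h data fabric function 3 device. */
static const struct pci_device_id renoir_df_ids[] = {
	{ PCI_VDEVICE(AMD, 0x144b) },	/* PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 */
	{ }				/* all-zero terminating entry */
};
MODULE_DEVICE_TABLE(pci, renoir_df_ids);
]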
Signed-off-by: Alexander Monakov Signed-off-by: Borislav Petkov Acked-by: Yazen Ghannam Acked-by: Guenter Roeck Link: https://lkml.kernel.org/r/20200510204842.2603-2-amonakov@ispras.ru --- arch/x86/kernel/amd_nb.c | 5 +++++ include/linux/pci_ids.h | 1 + 2 files changed, 6 insertions(+) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index b6b3297851f37d..18f6b7c4bd79fa 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -18,9 +18,11 @@ #define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 #define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 #define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 +#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 +#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 #define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 @@ -33,6 +35,7 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) }, {} }; @@ -50,6 +53,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, @@ -65,6 +69,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 1dfc4e1dcb94c1..3155f5ada02ebc 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -550,6 +550,7 @@ #define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F3 0x15eb #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F3 0x1493 +#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 0x144b #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443 #define PCI_DEVICE_ID_AMD_19H_DF_F3 0x1653 #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 From 279f0b3a4b80660fba6faadc2ca2fa426bf3f7e9 Mon Sep 17 00:00:00 2001 From: Alexander Monakov Date: Sun, 10 May 2020 20:48:41 +0000 Subject: [PATCH 032/190] hwmon: (k10temp) Add AMD family 17h model 60h PCI match Add support for retrieving Tdie and Tctl on AMD Renoir (4000-series Ryzen CPUs). It appears SMU offsets for reading current/voltage and CCD temperature have changed for this generation (reads from currently used offsets yield zeros), so those features cannot be enabled so trivially. 
Signed-off-by: Alexander Monakov Signed-off-by: Borislav Petkov Acked-by: Guenter Roeck Link: https://lkml.kernel.org/r/20200510204842.2603-3-amonakov@ispras.ru --- drivers/hwmon/k10temp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index 3f37d5d81fe40e..7ba82e0efbeba2 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -632,6 +632,7 @@ static const struct pci_device_id k10temp_id_table[] = { { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, { PCI_VDEVICE(HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} From b6bea24d41519e8c31e4798f1c1a3f67e540c5d0 Mon Sep 17 00:00:00 2001 From: Alexander Monakov Date: Sun, 10 May 2020 20:48:42 +0000 Subject: [PATCH 033/190] EDAC/amd64: Add AMD family 17h model 60h PCI IDs Add support for AMD Renoir (4000-series Ryzen CPUs). Signed-off-by: Alexander Monakov Signed-off-by: Borislav Petkov Acked-by: Yazen Ghannam Link: https://lkml.kernel.org/r/20200510204842.2603-4-amonakov@ispras.ru --- drivers/edac/amd64_edac.c | 14 ++++++++++++++ drivers/edac/amd64_edac.h | 3 +++ 2 files changed, 17 insertions(+) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 6bdc5bb8c8bc97..42024a8d6a5811 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2316,6 +2316,16 @@ static struct amd64_family_type family_types[] = { .dbam_to_cs = f17_addr_mask_to_cs_size, } }, + [F17_M60H_CPUS] = { + .ctl_name = "F17h_M60h", + .f0_id = PCI_DEVICE_ID_AMD_17H_M60H_DF_F0, + .f6_id = PCI_DEVICE_ID_AMD_17H_M60H_DF_F6, + .max_mcs = 2, + .ops = { + .early_channel_count = f17_early_channel_count, + .dbam_to_cs = f17_addr_mask_to_cs_size, + } + }, [F17_M70H_CPUS] = { .ctl_name = "F17h_M70h", .f0_id = PCI_DEVICE_ID_AMD_17H_M70H_DF_F0, @@ -3354,6 +3364,10 @@ static struct amd64_family_type *per_family_init(struct amd64_pvt *pvt) fam_type = &family_types[F17_M30H_CPUS]; pvt->ops = &family_types[F17_M30H_CPUS].ops; break; + } else if (pvt->model >= 0x60 && pvt->model <= 0x6f) { + fam_type = &family_types[F17_M60H_CPUS]; + pvt->ops = &family_types[F17_M60H_CPUS].ops; + break; } else if (pvt->model >= 0x70 && pvt->model <= 0x7f) { fam_type = &family_types[F17_M70H_CPUS]; pvt->ops = &family_types[F17_M70H_CPUS].ops; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index abbf3c274d7498..52b5d03eeba007 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -120,6 +120,8 @@ #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F6 0x15ee #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F0 0x1490 #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F6 0x1496 +#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F0 0x1448 +#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F6 0x144e #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F0 0x1440 #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F6 0x1446 #define PCI_DEVICE_ID_AMD_19H_DF_F0 0x1650 @@ -293,6 +295,7 @@ enum amd_families { F17_CPUS, F17_M10H_CPUS, F17_M30H_CPUS, + F17_M60H_CPUS, F17_M70H_CPUS, F19_CPUS, NUM_FAMILIES, From 6bd140e14d9aaa734ec37985b8b20a96c0ece948 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Wed, 22 Apr 2020 20:24:11 +0900 Subject: [PATCH 034/190] openrisc: Fix issue with argument clobbering for clone/fork While working on the OpenRISC glibc port, I found that clone sometimes behaved strangely: the TLS data argument passed in r7 was always wrong. Further investigation revealed that the arguments were getting clobbered in the entry code. This patch removes the code that writes to the argument registers, which was likely left over from some old code, and fixes this up for both clone and fork. The fork clobber is harmless but also useless, so remove it as well. Signed-off-by: Stafford Horne --- arch/openrisc/kernel/entry.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S index e4a78571f88335..c6481cfc5220f8 100644 --- a/arch/openrisc/kernel/entry.S +++ b/arch/openrisc/kernel/entry.S @@ -1166,13 +1166,13 @@ ENTRY(__sys_clone) l.movhi r29,hi(sys_clone) l.ori r29,r29,lo(sys_clone) l.j _fork_save_extra_regs_and_call - l.addi r7,r1,0 + l.nop ENTRY(__sys_fork) l.movhi r29,hi(sys_fork) l.ori r29,r29,lo(sys_fork) l.j _fork_save_extra_regs_and_call - l.addi r3,r1,0 + l.nop ENTRY(sys_rt_sigreturn) l.jal _sys_rt_sigreturn From 26fa1263b0ba35f01a25f4f7e70ba7c44455d1e3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 19 Apr 2020 14:40:49 +0000 Subject: [PATCH 035/190] x86/entry/64: Remove an unused label The label .Lcommon_\sym was introduced by commit 39e9543344fa ("x86-64: Reduce amount of redundant code generated for invalidate_interruptNN"), and all the code that referenced it was removed by commit 52aec3308db8 ("x86/tlb: replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR"), leaving the label unused. Signed-off-by: Lai Jiangshan Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200419144049.1906-4-laijs@linux.alibaba.com --- arch/x86/entry/entry_64.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index eead1e2bebd5c1..609c71e3dba1fd 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -798,7 +798,6 @@ _ASM_NOKPROBE(common_interrupt) SYM_CODE_START(\sym) UNWIND_HINT_IRET_REGS pushq $~(\num) -.Lcommon_\sym: call interrupt_entry UNWIND_HINT_REGS indirect=1 call \do_sym /* rdi points to pt_regs */ From c75890700455113c366f795f3d22ee03623835e8 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 19 Apr 2020 14:40:47 +0000 Subject: [PATCH 036/190] x86/entry/64: Remove unneeded kernel CR3 switching When native_load_gs_index() fails on .Lgs_change, CR3 must be kernel CR3. No need to switch it. Signed-off-by: Lai Jiangshan Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200419144049.1906-2-laijs@linux.alibaba.com --- arch/x86/entry/entry_64.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 609c71e3dba1fd..87ffa792bc99ee 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1334,7 +1334,6 @@ SYM_CODE_START_LOCAL(error_entry) */ SWAPGS FENCE_SWAPGS_USER_ENTRY - SWITCH_TO_KERNEL_CR3 scratch_reg=%rax jmp .Lerror_entry_done .Lbstep_iret: From fbaed278a3cc72a46aadae667b8c6754b78640a6 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 19 Apr 2020 14:40:48 +0000 Subject: [PATCH 037/190] x86/idt: Remove address operator on function machine_check() machine_check is a function address; the address operator on it is a nop for the compiler. Remove it to make this consistent with the other function addresses in the same file.
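[ Illustration, not part of the patch: a standalone user-space C sketch of the language rule relied on above. For a function designator f, both f and &f evaluate to the same function pointer, so the & is purely cosmetic.

#include <stdio.h>

static void machine_check_stub(void) { }

int main(void)
{
	/* A function name decays to a pointer; & only makes that explicit. */
	printf("%d\n", machine_check_stub == &machine_check_stub); /* prints 1 */
	return 0;
}
]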
Signed-off-by: Lai Jiangshan Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200419144049.1906-3-laijs@linux.alibaba.com --- arch/x86/kernel/idt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 87ef69a72c52ef..98bcb502f96757 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -93,7 +93,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DB, debug), #ifdef CONFIG_X86_MCE - INTG(X86_TRAP_MC, &machine_check), + INTG(X86_TRAP_MC, machine_check), #endif SYSG(X86_TRAP_OF, overflow), @@ -186,7 +186,7 @@ static const __initconst struct idt_data ist_idts[] = { ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE), + ISTG(X86_TRAP_MC, machine_check, IST_INDEX_MCE), #endif }; From a0bb51f2638e0810c347024679239fd10a8f7990 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 28 Apr 2020 11:38:22 +0200 Subject: [PATCH 038/190] x86/xen: Split HVM vector callback setup and interrupt gate allocation As a preparatory change for making alloc_intr_gate() __init, split xen_callback_vector() into callback vector setup via hypercall (xen_setup_callback_vector()) and interrupt gate allocation (xen_alloc_callback_vector()). xen_setup_callback_vector() is being called twice: on init and upon system resume from xen_hvm_post_suspend(). alloc_intr_gate() only needs to be called once. Suggested-by: Thomas Gleixner Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200428093824.1451532-2-vkuznets@redhat.com --- arch/x86/xen/suspend_hvm.c | 2 +- arch/x86/xen/xen-ops.h | 2 +- drivers/xen/events/events_base.c | 28 +++++++++++++++++----------- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c index e666b614cf6d2c..5152afe1687669 100644 --- a/arch/x86/xen/suspend_hvm.c +++ b/arch/x86/xen/suspend_hvm.c @@ -13,6 +13,6 @@ void xen_hvm_post_suspend(int suspend_cancelled) xen_hvm_init_shared_info(); xen_vcpu_restore(); } - xen_callback_vector(); + xen_setup_callback_vector(); xen_unplug_emulated_devices(); } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 45a441c33d6dc4..1cc1568bfe048b 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -55,7 +55,7 @@ void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); -void xen_callback_vector(void); +void xen_setup_callback_vector(void); void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 3a791c8485d090..eb35c3cda9a6fd 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1639,26 +1639,30 @@ EXPORT_SYMBOL_GPL(xen_set_callback_via); /* Vector callbacks are better than PCI interrupts to receive event * channel notifications because we can receive vector callbacks on any * vcpu and we don't need PCI support or APIC interactions.
*/ -void xen_callback_vector(void) +void xen_setup_callback_vector(void) { - int rc; uint64_t callback_via; if (xen_have_vector_callback) { callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); - rc = xen_set_callback_via(callback_via); - if (rc) { + if (xen_set_callback_via(callback_via)) { pr_err("Request for Xen HVM callback vector failed\n"); xen_have_vector_callback = 0; - return; } - pr_info_once("Xen HVM callback vector for event delivery is enabled\n"); - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, - xen_hvm_callback_vector); } } + +static __init void xen_alloc_callback_vector(void) +{ + if (!xen_have_vector_callback) + return; + + pr_info("Xen HVM callback vector for event delivery is enabled\n"); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, xen_hvm_callback_vector); +} #else -void xen_callback_vector(void) {} +void xen_setup_callback_vector(void) {} +static inline void xen_alloc_callback_vector(void) {} #endif #undef MODULE_PARAM_PREFIX @@ -1692,8 +1696,10 @@ void __init xen_init_IRQ(void) if (xen_initial_domain()) pci_xen_initial_domain(); } - if (xen_feature(XENFEAT_hvm_callback_vector)) - xen_callback_vector(); + if (xen_feature(XENFEAT_hvm_callback_vector)) { + xen_setup_callback_vector(); + xen_alloc_callback_vector(); + } if (xen_hvm_domain()) { native_init_IRQ(); From 06184325a1cce27a02a688d98740f90fe06e0133 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 28 Apr 2020 11:38:23 +0200 Subject: [PATCH 039/190] x86/idt: Annotate alloc_intr_gate() with __init There seems to be no reason to allocate interrupt gates after init. Mark alloc_intr_gate() as __init and add WARN_ON() checks making sure it is only used before idt_setup_apic_and_irq_gates() finalizes IDT setup and maps all un-allocated entries to spurious entries. Suggested-by: Thomas Gleixner Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200428093824.1451532-3-vkuznets@redhat.com --- arch/x86/kernel/idt.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 98bcb502f96757..0e9205137de8a8 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -51,6 +51,9 @@ struct idt_data { #define TSKG(_vector, _gdt) \ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) + +static bool idt_setup_done __initdata; + /* * Early traps running on the DEFAULT_STACK because the other interrupt * stacks work only after cpu_init(). 
@@ -323,6 +326,7 @@ void __init idt_setup_apic_and_irq_gates(void) set_intr_gate(i, entry); } #endif + idt_setup_done = true; } /** @@ -352,6 +356,7 @@ void idt_invalidate(void *addr) load_idt(&idt); } +/* This goes away once ASYNC_PF is sanitized */ void __init update_intr_gate(unsigned int n, const void *addr) { if (WARN_ON_ONCE(!test_bit(n, system_vectors))) @@ -359,9 +364,14 @@ void __init update_intr_gate(unsigned int n, const void *addr) set_intr_gate(n, addr); } -void alloc_intr_gate(unsigned int n, const void *addr) +void __init alloc_intr_gate(unsigned int n, const void *addr) { - BUG_ON(n < FIRST_SYSTEM_VECTOR); - if (!test_and_set_bit(n, system_vectors)) + if (WARN_ON(n < FIRST_SYSTEM_VECTOR)) + return; + + if (WARN_ON(idt_setup_done)) + return; + + if (!WARN_ON(test_and_set_bit(n, system_vectors))) set_intr_gate(n, addr); } From 1f1fbc70c10e81f70e9fbe2102d439c883269811 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 28 Apr 2020 11:38:24 +0200 Subject: [PATCH 040/190] x86/idt: Keep spurious entries unset in system_vectors With commit dc20b2d52653 ("x86/idt: Move interrupt gate initialization to IDT code"), non-assigned system vectors are also marked as used in the 'used_vectors' (now 'system_vectors') bitmap. This makes the check in arch_show_interrupts() of whether a particular system vector is allocated always pass, so e.g. the 'Hyper-V reenlightenment interrupts' entry always shows up in /proc/interrupts. Another side effect of having all unassigned system vectors marked as used is that irq_matrix_debug_show() will wrongly count them among 'System' vectors. As it is now ensured that alloc_intr_gate() is not called after init, it is possible to leave unused entries in 'system_vectors' unset to fix these issues. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200428093824.1451532-4-vkuznets@redhat.com --- arch/x86/kernel/idt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 0e9205137de8a8..36fef90a38e796 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -321,7 +321,11 @@ void __init idt_setup_apic_and_irq_gates(void) #ifdef CONFIG_X86_LOCAL_APIC for_each_clear_bit_from(i, system_vectors, NR_VECTORS) { - set_bit(i, system_vectors); + /* + * Don't set the non assigned system vectors in the + * system_vectors bitmap. Otherwise they show up in + * /proc/interrupts. + */ entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR); set_intr_gate(i, entry); } From 24ae0c91cbc57c2deb0401bd653453a508acdcee Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 24 Feb 2020 13:24:58 +0100 Subject: [PATCH 041/190] x86/hw_breakpoint: Prevent data breakpoints on cpu_entry_area A data breakpoint near the top of an IST stack will cause unrecoverable recursion. A data breakpoint on the GDT, IDT, or TSS is terrifying. Prevent either of these from happening.
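[ Illustration, not part of the patch: a self-contained sketch of the two checks the diff below adds, wraparound rejection and inclusive-range overlap. The AREA_* bounds are placeholders; the real code uses CPU_ENTRY_AREA_BASE and CPU_ENTRY_AREA_TOTAL_SIZE.

#include <stdbool.h>
#include <stdint.h>

#define AREA_BASE 0xfffffe0000000000UL	/* placeholder value */
#define AREA_SIZE 0x400000UL		/* placeholder value */

static bool bp_overlaps_area(uint64_t addr, uint64_t len)
{
	uint64_t end = addr + len - 1;	/* inclusive end of the breakpoint */

	if (end < addr)			/* the length wrapped the address space */
		return true;		/* treat as overlapping: reject */
	return end >= AREA_BASE && addr < AREA_BASE + AREA_SIZE;
}
]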
Co-developed-by: Peter Zijlstra Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Lai Jiangshan Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134058.272448010@linutronix.de --- arch/x86/kernel/hw_breakpoint.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 4d8d53ed02c953..d42fc0eaf193c8 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -227,10 +227,35 @@ int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); } +/* + * Checks whether the range from addr to end, inclusive, overlaps the CPU + * entry area range. + */ +static inline bool within_cpu_entry_area(unsigned long addr, unsigned long end) +{ + return end >= CPU_ENTRY_AREA_BASE && + addr < (CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_TOTAL_SIZE); +} + static int arch_build_bp_info(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) { + unsigned long bp_end; + + bp_end = attr->bp_addr + attr->bp_len - 1; + if (bp_end < attr->bp_addr) + return -EINVAL; + + /* + * Prevent any breakpoint of any type that overlaps the + * cpu_entry_area. This protects the IST stacks and also + * reduces the chance that we ever find out what happens if + * there's a data breakpoint on the GDT, IDT, or TSS. + */ + if (within_cpu_entry_area(attr->bp_addr, bp_end)) + return -EINVAL; + hw->address = attr->bp_addr; hw->mask = 0; From e9660391d0ebd174b169af3d6de680c2f836027c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 20 Feb 2020 13:17:27 +0100 Subject: [PATCH 042/190] x86/doublefault: Remove memmove() call Use of memmove() in #DF is problematic considering tracing and other instrumentation. Remove the memmove() call and simply write out what needs doing; this even clarifies the code, win-win! The code copies from the espfix64 stack to the normal task stack, there is no possible way for that to overlap. Survives selftests/x86, specifically sigreturn_64. Suggested-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134058.863038566@linutronix.de --- arch/x86/kernel/traps.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4cc541051994d5..48468f61202cb9 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -299,6 +299,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign regs->ip == (unsigned long)native_irq_return_iret) { struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + unsigned long *p = (unsigned long *)regs->sp; /* * regs->sp points to the failing IRET frame on the @@ -306,7 +307,11 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * in gpregs->ss through gpregs->ip. * */ - memmove(&gpregs->ip, (void *)regs->sp, 5*8); + gpregs->ip = p[0]; + gpregs->cs = p[1]; + gpregs->flags = p[2]; + gpregs->sp = p[3]; + gpregs->ss = p[4]; gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ /* From 725005897ec4ba07d6227a1ac3121048153eb3ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:08 +0100 Subject: [PATCH 043/190] x86/entry/64: Avoid pointless code when CONTEXT_TRACKING=n GAS cannot optimize out the test and conditional jump when context tracking is disabled and CALL_enter_from_user_mode is an empty macro. Wrap it in #ifdeffery. Will go away once all this is moved to C. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Frederic Weisbecker Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134058.955968069@linutronix.de --- arch/x86/entry/entry_64.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 87ffa792bc99ee..a15b70ac87b53e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -889,12 +889,14 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt TRACE_IRQS_OFF .endif +#ifdef CONFIG_CONTEXT_TRACKING .if \paranoid == 0 testb $3, CS(%rsp) jz .Lfrom_kernel_no_context_tracking_\@ CALL_enter_from_user_mode .Lfrom_kernel_no_context_tracking_\@: .endif +#endif movq %rsp, %rdi /* pt_regs pointer */ From 44d7e4fbc08eca153ea4b436a1440639dff2c771 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 5 Mar 2020 11:16:49 +0100 Subject: [PATCH 044/190] x86/entry: Remove the unused LOCKDEP_SYSEXIT cruft No users left for two years, since commit 21d375b6b34f ("x86/entry/64: Remove the SYSCALL64 fast path"). Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.061301403@linutronix.de --- arch/x86/entry/thunk_64.S | 5 ----- arch/x86/include/asm/irqflags.h | 24 ------------------------ 2 files changed, 29 deletions(-) diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index dbe4493b534e9c..34f980c9b766f5 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -42,10 +42,6 @@ SYM_FUNC_END(\name) THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC - THUNK lockdep_sys_exit_thunk,lockdep_sys_exit -#endif - #ifdef CONFIG_PREEMPTION THUNK preempt_schedule_thunk, preempt_schedule THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace @@ -54,7 +50,6 @@ SYM_FUNC_END(\name) #endif #if defined(CONFIG_TRACE_IRQFLAGS) \ - || defined(CONFIG_DEBUG_LOCK_ALLOC) \ || defined(CONFIG_PREEMPTION) SYM_CODE_START_LOCAL_NOALIGN(.L_restore) popq %r11 diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 8a0e56e1dcc9cd..e00f064b009ecc 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -180,30 +180,6 @@ static inline int arch_irqs_disabled(void) # define TRACE_IRQS_ON # define TRACE_IRQS_OFF #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_X86_64 -# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk -# define LOCKDEP_SYS_EXIT_IRQ \ - TRACE_IRQS_ON; \ - sti; \ - call lockdep_sys_exit_thunk; \ - cli; \ - TRACE_IRQS_OFF; -# else -# define LOCKDEP_SYS_EXIT \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call lockdep_sys_exit; \ - popl %edx; \ - popl %ecx; \ - popl %eax; -# define LOCKDEP_SYS_EXIT_IRQ -# endif -#else -# define LOCKDEP_SYS_EXIT -# define LOCKDEP_SYS_EXIT_IRQ -#endif #endif /* __ASSEMBLY__ */ #endif From 20355e5f73a75e58cee4c80d4cd92ce0d1628023 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2020 14:01:25 +0100 Subject: [PATCH 045/190] x86/entry: Exclude low level entry code from sanitizing The sanitizers are not really applicable to the fragile low level entry code. Entry code needs to carefully set up a normal 'runtime' environment. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134059.970057117@linutronix.de --- arch/x86/entry/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 85eb381259c207..cdf45ff92dc8b0 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -3,6 +3,14 @@ # Makefile for the x86 low level entry code # +KASAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong +CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong +CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong + OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) From 0372007f5a79d61d3cb48a507717b9afb5d6addd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Mar 2020 11:05:22 +0100 Subject: [PATCH 046/190] context_tracking: Ensure that the critical path cannot be instrumented Context tracking lacks a few protection mechanisms against instrumentation: - While the core functions are marked NOKPROBE they lack protection against function tracing, which is required as the function entry/exit points can be utilized by BPF. - static functions invoked from the protected functions need to be marked as well, as they can otherwise be instrumented. - using plain inline allows the compiler to emit traceable and probeable functions. Fix this by marking the functions noinstr and converting the plain inlines to __always_inline. The NOKPROBE_SYMBOL() annotations are removed as the .noinstr.text section is already excluded from being probed. Cures the following objtool warnings: vmlinux.o: warning: objtool: enter_from_user_mode()+0x34: call to __context_tracking_exit() leaves .noinstr.text section vmlinux.o: warning: objtool: prepare_exit_to_usermode()+0x29: call to __context_tracking_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_return_slowpath()+0x29: call to __context_tracking_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: do_syscall_64()+0x7f: call to __context_tracking_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: do_int80_syscall_32()+0x3d: call to __context_tracking_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: do_fast_syscall_32()+0x9c: call to __context_tracking_enter() leaves .noinstr.text section and generates new ones...
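[ Illustration, not part of the patch: a minimal sketch of the noinstr bracketing pattern this change applies. critical_state_update() and trace_example() are hypothetical stand-ins for the real RCU/vtime/trace calls.

void noinstr example_enter(void)
{
	critical_state_update();	/* hypothetical; must stay non-instrumentable */

	instrumentation_begin();
	trace_example();		/* hypothetical; instrumentable calls are safe here */
	instrumentation_end();
}
]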
Signed-off-by: Thomas Gleixner Reviewed-by: Masami Hiramatsu Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134340.811520478@linutronix.de --- include/linux/context_tracking.h | 6 +++--- include/linux/context_tracking_state.h | 6 +++--- kernel/context_tracking.c | 14 ++++++++------ 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 8cac62ee6add1d..981b880d5b605e 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -33,13 +33,13 @@ static inline void user_exit(void) } /* Called with interrupts disabled. */ -static inline void user_enter_irqoff(void) +static __always_inline void user_enter_irqoff(void) { if (context_tracking_enabled()) __context_tracking_enter(CONTEXT_USER); } -static inline void user_exit_irqoff(void) +static __always_inline void user_exit_irqoff(void) { if (context_tracking_enabled()) __context_tracking_exit(CONTEXT_USER); @@ -75,7 +75,7 @@ static inline void exception_exit(enum ctx_state prev_ctx) * is enabled. If context tracking is disabled, returns * CONTEXT_DISABLED. This should be used primarily for debugging. */ -static inline enum ctx_state ct_state(void) +static __always_inline enum ctx_state ct_state(void) { return context_tracking_enabled() ? this_cpu_read(context_tracking.state) : CONTEXT_DISABLED; diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index e7fe6678b7ad13..65a60d3313b006 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -26,12 +26,12 @@ struct context_tracking { extern struct static_key_false context_tracking_key; DECLARE_PER_CPU(struct context_tracking, context_tracking); -static inline bool context_tracking_enabled(void) +static __always_inline bool context_tracking_enabled(void) { return static_branch_unlikely(&context_tracking_key); } -static inline bool context_tracking_enabled_cpu(int cpu) +static __always_inline bool context_tracking_enabled_cpu(int cpu) { return context_tracking_enabled() && per_cpu(context_tracking.active, cpu); } @@ -41,7 +41,7 @@ static inline bool context_tracking_enabled_this_cpu(void) return context_tracking_enabled() && __this_cpu_read(context_tracking.active); } -static inline bool context_tracking_in_user(void) +static __always_inline bool context_tracking_in_user(void) { return __this_cpu_read(context_tracking.state) == CONTEXT_USER; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index ce430885c26c6d..36a98c48aedc7c 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(context_tracking_key); DEFINE_PER_CPU(struct context_tracking, context_tracking); EXPORT_SYMBOL_GPL(context_tracking); -static bool context_tracking_recursion_enter(void) +static noinstr bool context_tracking_recursion_enter(void) { int recursion; @@ -45,7 +45,7 @@ static bool context_tracking_recursion_enter(void) return false; } -static void context_tracking_recursion_exit(void) +static __always_inline void context_tracking_recursion_exit(void) { __this_cpu_dec(context_tracking.recursion); } @@ -59,7 +59,7 @@ static void context_tracking_recursion_exit(void) * instructions to execute won't use any RCU read side critical section * because this function sets RCU in extended quiescent state. 
 */ -void __context_tracking_enter(enum ctx_state state) +void noinstr __context_tracking_enter(enum ctx_state state) { /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); @@ -77,8 +77,10 @@ void __context_tracking_enter(enum ctx_state state) * on the tick. */ if (state == CONTEXT_USER) { + instrumentation_begin(); trace_user_enter(0); vtime_user_enter(current); + instrumentation_end(); } rcu_user_enter(); } @@ -99,7 +101,6 @@ void __context_tracking_enter(enum ctx_state state) } context_tracking_recursion_exit(); } -NOKPROBE_SYMBOL(__context_tracking_enter); EXPORT_SYMBOL_GPL(__context_tracking_enter); void context_tracking_enter(enum ctx_state state) @@ -142,7 +143,7 @@ NOKPROBE_SYMBOL(context_tracking_user_enter); * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void __context_tracking_exit(enum ctx_state state) +void noinstr __context_tracking_exit(enum ctx_state state) { if (!context_tracking_recursion_enter()) return; @@ -155,15 +156,16 @@ void __context_tracking_exit(enum ctx_state state) */ rcu_user_exit(); if (state == CONTEXT_USER) { + instrumentation_begin(); vtime_user_exit(current); trace_user_exit(0); + instrumentation_end(); } } __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } context_tracking_recursion_exit(); } -NOKPROBE_SYMBOL(__context_tracking_exit); EXPORT_SYMBOL_GPL(__context_tracking_exit); void context_tracking_exit(enum ctx_state state) From 126f21f0e8d46e2c0e9daafb67546dbfb316d325 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Mar 2020 23:47:39 +0100 Subject: [PATCH 047/190] lib/smp_processor_id: Move it into noinstr section That code is already not traceable. Move it into the noinstr section so the objtool section validation does not trigger. Annotate the warning code as "safe". While it might not be safe under all circumstances, getting the information out is important enough. Should this ever trigger from the sensitive code which is shielded against instrumentation, e.g. low level entry, then the printk is the least of the worries.
Addresses the objtool warnings: vmlinux.o: warning: objtool: context_tracking_recursion_enter()+0x7: call to __this_cpu_preempt_check() leaves .noinstr.text section vmlinux.o: warning: objtool: __context_tracking_exit()+0x17: call to __this_cpu_preempt_check() leaves .noinstr.text section vmlinux.o: warning: objtool: __context_tracking_enter()+0x2a: call to __this_cpu_preempt_check() leaves .noinstr.text section Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134340.902709267@linutronix.de --- lib/smp_processor_id.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index bd957165328890..525222e4f40994 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -8,7 +8,7 @@ #include #include -notrace static nokprobe_inline +noinstr static unsigned int check_preemption_disabled(const char *what1, const char *what2) { int this_cpu = raw_smp_processor_id(); @@ -37,6 +37,7 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) */ preempt_disable_notrace(); + instrumentation_begin(); if (!printk_ratelimit()) goto out_enable; @@ -45,6 +46,7 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) printk("caller is %pS\n", __builtin_return_address(0)); dump_stack(); + instrumentation_end(); out_enable: preempt_enable_no_resched_notrace(); @@ -52,16 +54,14 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) return this_cpu; } -notrace unsigned int debug_smp_processor_id(void) +noinstr unsigned int debug_smp_processor_id(void) { return check_preemption_disabled("smp_processor_id", ""); } EXPORT_SYMBOL(debug_smp_processor_id); -NOKPROBE_SYMBOL(debug_smp_processor_id); -notrace void __this_cpu_preempt_check(const char *op) +noinstr void __this_cpu_preempt_check(const char *op) { check_preemption_disabled("__this_cpu_", op); } EXPORT_SYMBOL(__this_cpu_preempt_check); -NOKPROBE_SYMBOL(__this_cpu_preempt_check); From 5916d5f9b3347344a3d96ba6b54cf8e290eba96a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Mar 2020 13:49:51 +0100 Subject: [PATCH 048/190] bug: Annotate WARN/BUG/stackfail as noinstr safe Warnings, bugs and stack protection fails from noinstr sections, e.g. low level and early entry code, are likely to be fatal. Mark them as "safe" to be invoked from noinstr protected code to avoid annotating all usage sites. Getting the information out is important. 
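[ Illustration, not part of the patch: the practical effect, assuming the annotations in the diff below. A noinstr function can now use WARN_ON() directly, since the macro brackets itself with instrumentation_begin()/end().

noinstr void example_check(int cpu)
{
	WARN_ON(cpu < 0);	/* expands with instrumentation_begin()/end() */
}
]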
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134100.376598577@linutronix.de --- arch/x86/include/asm/bug.h | 3 +++ include/asm-generic/bug.h | 9 +++++++-- kernel/panic.c | 4 +++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index facba9bc30caa4..fb34ff641e0a1b 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -70,14 +70,17 @@ do { \ #define HAVE_ARCH_BUG #define BUG() \ do { \ + instrumentation_begin(); \ _BUG_FLAGS(ASM_UD2, 0); \ unreachable(); \ } while (0) #define __WARN_FLAGS(flags) \ do { \ + instrumentation_begin(); \ _BUG_FLAGS(ASM_UD2, BUGFLAG_WARNING|(flags)); \ annotate_reachable(); \ + instrumentation_end(); \ } while (0) #include diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 384b5c835ced33..c94e33ae3e7b81 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -83,14 +83,19 @@ extern __printf(4, 5) void warn_slowpath_fmt(const char *file, const int line, unsigned taint, const char *fmt, ...); #define __WARN() __WARN_printf(TAINT_WARN, NULL) -#define __WARN_printf(taint, arg...) \ - warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) +#define __WARN_printf(taint, arg...) do { \ + instrumentation_begin(); \ + warn_slowpath_fmt(__FILE__, __LINE__, taint, arg); \ + instrumentation_end(); \ + } while (0) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #define __WARN() __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN)) #define __WARN_printf(taint, arg...) do { \ + instrumentation_begin(); \ __warn_printk(arg); \ __WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\ + instrumentation_end(); \ } while (0) #define WARN_ON_ONCE(condition) ({ \ int __ret_warn_on = !!(condition); \ diff --git a/kernel/panic.c b/kernel/panic.c index 85568bbfb12b7f..e2157ca387c832 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -680,10 +680,12 @@ device_initcall(register_warn_debugfs); * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ -__visible void __stack_chk_fail(void) +__visible noinstr void __stack_chk_fail(void) { + instrumentation_begin(); panic("stack-protector: Kernel stack is corrupted in: %pB", __builtin_return_address(0)); + instrumentation_end(); } EXPORT_SYMBOL(__stack_chk_fail); From fba8dbeaf30e2c8db2c2cfeb38f6dbffcbf86bba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 May 2020 17:39:05 +0200 Subject: [PATCH 049/190] x86/idt: Remove update_intr_gate() No more users. 
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/desc.h | 1 - arch/x86/kernel/idt.c | 8 -------- 2 files changed, 9 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 68a99d2a5f335d..085a2dd312b47d 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -386,7 +386,6 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit1 = (limit >> 16) & 0xf; } -void update_intr_gate(unsigned int n, const void *addr); void alloc_intr_gate(unsigned int n, const void *addr); extern unsigned long system_vectors[]; diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 36fef90a38e796..95609ee4c8b3e1 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -360,14 +360,6 @@ void idt_invalidate(void *addr) load_idt(&idt); } -/* This goes away once ASYNC_PF is sanitized */ -void __init update_intr_gate(unsigned int n, const void *addr) -{ - if (WARN_ON_ONCE(!test_bit(n, system_vectors))) - return; - set_intr_gate(n, addr); -} - void __init alloc_intr_gate(unsigned int n, const void *addr) { if (WARN_ON(n < FIRST_SYSTEM_VECTOR)) From b9f6976bfb949121bb6e1e6f4fd9909735729148 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2020 19:45:26 +0100 Subject: [PATCH 050/190] x86/entry/64: Move non entry code into .text section All ASM code which is not part of the entry functionality can move out into the .text section. No reason to keep it in the non-instrumentable entry section. Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134340.227579223@linutronix.de --- arch/x86/entry/entry_64.S | 18 ++++++++++++++---- arch/x86/kernel/ftrace_64.S | 2 +- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a15b70ac87b53e..b199f43cff28d0 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -279,6 +279,7 @@ SYM_CODE_END(entry_SYSCALL_64) * %rdi: prev task * %rsi: next task */ +.pushsection .text, "ax" SYM_FUNC_START(__switch_to_asm) /* * Save callee-saved registers @@ -321,6 +322,7 @@ SYM_FUNC_START(__switch_to_asm) jmp __switch_to SYM_FUNC_END(__switch_to_asm) +.popsection /* * A newly forked process directly context switches into this address. @@ -329,6 +331,7 @@ SYM_FUNC_END(__switch_to_asm) * rbx: kernel thread func (NULL for user thread) * r12: kernel thread arg */ +.pushsection .text, "ax" SYM_CODE_START(ret_from_fork) UNWIND_HINT_EMPTY movq %rax, %rdi @@ -357,6 +360,7 @@ SYM_CODE_START(ret_from_fork) movq $0, RAX(%rsp) jmp 2b SYM_CODE_END(ret_from_fork) +.popsection /* * Build the entry stubs with some assembler magic. @@ -1037,10 +1041,12 @@ idtentry alignment_check do_alignment_check has_error_code=1 idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 - /* - * Reload gs selector with exception handling - * edi: new selector - */ +/* + * Reload gs selector with exception handling + * edi: new selector + * + * Is in entry.text as it shouldn't be instrumented. + */ SYM_FUNC_START(native_load_gs_index) FRAME_BEGIN pushfq @@ -1076,6 +1082,7 @@ SYM_CODE_END(.Lbad_gs) .previous /* Call softirq on interrupt stack. Interrupts are off. 
*/ +.pushsection .text, "ax" SYM_FUNC_START(do_softirq_own_stack) pushq %rbp mov %rsp, %rbp @@ -1085,6 +1092,7 @@ SYM_FUNC_START(do_softirq_own_stack) leaveq ret SYM_FUNC_END(do_softirq_own_stack) +.popsection #ifdef CONFIG_XEN_PV idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 @@ -1728,6 +1736,7 @@ SYM_CODE_START(ignore_sysret) SYM_CODE_END(ignore_sysret) #endif +.pushsection .text, "ax" SYM_CODE_START(rewind_stack_do_exit) UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ @@ -1739,3 +1748,4 @@ SYM_CODE_START(rewind_stack_do_exit) call do_exit SYM_CODE_END(rewind_stack_do_exit) +.popsection diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index aa5d28aeb31e4c..083a3da7bb738a 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -12,7 +12,7 @@ #include .code64 - .section .entry.text, "ax" + .section .text, "ax" #ifdef CONFIG_FRAME_POINTER /* Save parent and function stack frames (rip and rbp) */ From 8c0fa8a036cd9c000bcf761413b565b429f629fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2020 19:47:40 +0100 Subject: [PATCH 051/190] x86/entry/32: Move non entry code into .text section All ASM code which is not part of the entry functionality can move out into the .text section. No reason to keep it in the non-instrumentable entry section. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134340.320164650@linutronix.de --- arch/x86/entry/entry_32.S | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index a5eed844e948bd..bf0082bb07a1ae 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -730,6 +730,7 @@ * %eax: prev task * %edx: next task */ +.pushsection .text, "ax" SYM_CODE_START(__switch_to_asm) /* * Save callee-saved registers @@ -776,6 +777,7 @@ SYM_CODE_START(__switch_to_asm) jmp __switch_to SYM_CODE_END(__switch_to_asm) +.popsection /* * The unwinder expects the last frame on the stack to always be at the same @@ -784,6 +786,7 @@ SYM_CODE_END(__switch_to_asm) * asmlinkage function so its argument has to be pushed on the stack. This * wrapper creates a proper "end of stack" frame header before the call. */ +.pushsection .text, "ax" SYM_FUNC_START(schedule_tail_wrapper) FRAME_BEGIN @@ -794,6 +797,8 @@ SYM_FUNC_START(schedule_tail_wrapper) FRAME_END ret SYM_FUNC_END(schedule_tail_wrapper) +.popsection + /* * A newly forked process directly context switches into this address. * @@ -801,6 +806,7 @@ SYM_FUNC_END(schedule_tail_wrapper) * ebx: kernel thread func (NULL for user thread) * edi: kernel thread arg */ +.pushsection .text, "ax" SYM_CODE_START(ret_from_fork) call schedule_tail_wrapper @@ -825,6 +831,7 @@ SYM_CODE_START(ret_from_fork) movl $0, PT_EAX(%esp) jmp 2b SYM_CODE_END(ret_from_fork) +.popsection /* * Return to user mode is not as complex as all this looks, @@ -1691,6 +1698,7 @@ SYM_CODE_START(general_protection) jmp common_exception SYM_CODE_END(general_protection) +.pushsection .text, "ax" SYM_CODE_START(rewind_stack_do_exit) /* Prevent any naive code from trying to unwind to our caller. 
 */ xorl %ebp, %ebp @@ -1701,3 +1709,4 @@ SYM_CODE_START(rewind_stack_do_exit) call do_exit 1: jmp 1b SYM_CODE_END(rewind_stack_do_exit) +.popsection From 1723be30e46fbda0c5971d3a19a37a7c2499bc90 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Feb 2020 15:12:33 +0100 Subject: [PATCH 052/190] x86/entry: Mark enter_from_user_mode() noinstr Both the callers in the low level ASM code and __context_tracking_exit() which is invoked from enter_from_user_mode() via user_exit_irqoff() are marked NOKPROBE. Allowing enter_from_user_mode() to be probed is inconsistent at best. Aside from that, while function tracing per se is safe, the function trace entry/exit points can be used via BPF as well, which is not safe before context tracking has reached CONTEXT_KERNEL and adjusted RCU. Mark it noinstr which moves it into the instrumentation protected text section and includes notrace. Note, this needs further fixups in context tracking to ensure that the full call chain is protected. Will be addressed in follow up changes. Signed-off-by: Thomas Gleixner Reviewed-by: Masami Hiramatsu Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134340.429059405@linutronix.de --- arch/x86/entry/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 76735ec813e6bf..d862add7918b7d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -41,7 +41,7 @@ #ifdef CONFIG_CONTEXT_TRACKING /* Called on entry from user mode with IRQs off. */ -__visible inline void enter_from_user_mode(void) +__visible inline noinstr void enter_from_user_mode(void) { CT_WARN_ON(ct_state() != CONTEXT_USER); user_exit_irqoff(); From 8f159f1dfa1ea29d70a84335fe6a8bd501a9eecd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Mar 2020 14:46:27 +0100 Subject: [PATCH 053/190] x86/entry/common: Protect against instrumentation Mark the various syscall entries with noinstr to protect them against instrumentation and add the instrumentation_begin()/end() annotations to mark the parts of the functions which are safe to call out into instrumentable code. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134340.520277507@linutronix.de --- arch/x86/entry/common.c | 133 +++++++++++++++++++++++++------------- 1 file changed, 89 insertions(+), 44 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d862add7918b7d..9892fb7c9d4400 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -41,15 +41,26 @@ #ifdef CONFIG_CONTEXT_TRACKING /* Called on entry from user mode with IRQs off. */ -__visible inline noinstr void enter_from_user_mode(void) +__visible noinstr void enter_from_user_mode(void) { - CT_WARN_ON(ct_state() != CONTEXT_USER); + enum ctx_state state = ct_state(); + user_exit_irqoff(); + + instrumentation_begin(); + CT_WARN_ON(state != CONTEXT_USER); + instrumentation_end(); } #else static inline void enter_from_user_mode(void) {} #endif +static noinstr void exit_to_user_mode(void) +{ + user_enter_irqoff(); + mds_user_clear_cpu_buffers(); +} + static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) { #ifdef CONFIG_X86_64 @@ -179,8 +190,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) } } -/* Called with IRQs disabled.
*/ -__visible inline void prepare_exit_to_usermode(struct pt_regs *regs) +static void __prepare_exit_to_usermode(struct pt_regs *regs) { struct thread_info *ti = current_thread_info(); u32 cached_flags; @@ -219,10 +229,14 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) */ ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); #endif +} - user_enter_irqoff(); - - mds_user_clear_cpu_buffers(); +__visible noinstr void prepare_exit_to_usermode(struct pt_regs *regs) +{ + instrumentation_begin(); + __prepare_exit_to_usermode(regs); + instrumentation_end(); + exit_to_user_mode(); } #define SYSCALL_EXIT_WORK_FLAGS \ @@ -251,11 +265,7 @@ static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) tracehook_report_syscall_exit(regs, step); } -/* - * Called with IRQs on and fully valid regs. Returns with IRQs off in a - * state such that we can immediately switch to user mode. - */ -__visible inline void syscall_return_slowpath(struct pt_regs *regs) +static void __syscall_return_slowpath(struct pt_regs *regs) { struct thread_info *ti = current_thread_info(); u32 cached_flags = READ_ONCE(ti->flags); @@ -276,15 +286,29 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) syscall_slow_exit_work(regs, cached_flags); local_irq_disable(); - prepare_exit_to_usermode(regs); + __prepare_exit_to_usermode(regs); +} + +/* + * Called with IRQs on and fully valid regs. Returns with IRQs off in a + * state such that we can immediately switch to user mode. + */ +__visible noinstr void syscall_return_slowpath(struct pt_regs *regs) +{ + instrumentation_begin(); + __syscall_return_slowpath(regs); + instrumentation_end(); + exit_to_user_mode(); } #ifdef CONFIG_X86_64 -__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) +__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) { struct thread_info *ti; enter_from_user_mode(); + instrumentation_begin(); + local_irq_enable(); ti = current_thread_info(); if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) @@ -301,8 +325,10 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) regs->ax = x32_sys_call_table[nr](regs); #endif } + __syscall_return_slowpath(regs); - syscall_return_slowpath(regs); + instrumentation_end(); + exit_to_user_mode(); } #endif @@ -313,7 +339,7 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) * extremely hot in workloads that use it, and it's usually called from * do_fast_syscall_32, so forcibly inline it to improve performance. */ -static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) +static void do_syscall_32_irqs_on(struct pt_regs *regs) { struct thread_info *ti = current_thread_info(); unsigned int nr = (unsigned int)regs->orig_ax; @@ -337,27 +363,62 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) regs->ax = ia32_sys_call_table[nr](regs); } - syscall_return_slowpath(regs); + __syscall_return_slowpath(regs); } /* Handles int $0x80 */ -__visible void do_int80_syscall_32(struct pt_regs *regs) +__visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { enter_from_user_mode(); + instrumentation_begin(); + local_irq_enable(); do_syscall_32_irqs_on(regs); + + instrumentation_end(); + exit_to_user_mode(); +} + +static bool __do_fast_syscall_32(struct pt_regs *regs) +{ + int res; + + /* Fetch EBP from where the vDSO stashed it. */ + if (IS_ENABLED(CONFIG_X86_64)) { + /* + * Micro-optimization: the pointer we're following is + * explicitly 32 bits, so it can't be out of range. 
+ */ + res = __get_user(*(u32 *)®s->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp); + } else { + res = get_user(*(u32 *)®s->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp); + } + + if (res) { + /* User code screwed up. */ + regs->ax = -EFAULT; + local_irq_disable(); + __prepare_exit_to_usermode(regs); + return false; + } + + /* Now this is just like a normal syscall. */ + do_syscall_32_irqs_on(regs); + return true; } /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ -__visible long do_fast_syscall_32(struct pt_regs *regs) +__visible noinstr long do_fast_syscall_32(struct pt_regs *regs) { /* * Called using the internal vDSO SYSENTER/SYSCALL32 calling * convention. Adjust regs so it looks like we entered using int80. */ - unsigned long landing_pad = (unsigned long)current->mm->context.vdso + - vdso_image_32.sym_int80_landing_pad; + vdso_image_32.sym_int80_landing_pad; + bool success; /* * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward @@ -367,33 +428,17 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) regs->ip = landing_pad; enter_from_user_mode(); + instrumentation_begin(); local_irq_enable(); + success = __do_fast_syscall_32(regs); - /* Fetch EBP from where the vDSO stashed it. */ - if ( -#ifdef CONFIG_X86_64 - /* - * Micro-optimization: the pointer we're following is explicitly - * 32 bits, so it can't be out of range. - */ - __get_user(*(u32 *)®s->bp, - (u32 __user __force *)(unsigned long)(u32)regs->sp) -#else - get_user(*(u32 *)®s->bp, - (u32 __user __force *)(unsigned long)(u32)regs->sp) -#endif - ) { - - /* User code screwed up. */ - local_irq_disable(); - regs->ax = -EFAULT; - prepare_exit_to_usermode(regs); - return 0; /* Keep it simple: use IRET. */ - } + instrumentation_end(); + exit_to_user_mode(); - /* Now this is just like a normal syscall. */ - do_syscall_32_irqs_on(regs); + /* If it failed, keep it simple: use IRET. */ + if (!success) + return 0; #ifdef CONFIG_X86_64 /* From dd8e2d9ae64fa4348530df3e45e9f874d807a1c2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:08:05 +0100 Subject: [PATCH 054/190] x86/entry: Move irq tracing on syscall entry to C-code Now that the C entry points are safe, move the irq flags tracing code into the entry helper: - Invoke lockdep before calling into context tracking - Use the safe trace_hardirqs_on_prepare() trace function after context tracking established state and RCU is watching. enter_from_user_mode() is also still invoked from the exception/interrupt entry code which still contains the ASM irq flags tracing. So this is just a redundant and harmless invocation of tracing / lockdep until these are removed as well. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134340.611961721@linutronix.de --- arch/x86/entry/common.c | 21 +++++++++++++++++++-- arch/x86/entry/entry_32.S | 12 ------------ arch/x86/entry/entry_64.S | 2 -- arch/x86/entry/entry_64_compat.S | 18 ------------------ 4 files changed, 19 insertions(+), 34 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 9892fb7c9d4400..7473c1297a8419 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -40,19 +40,36 @@ #include #ifdef CONFIG_CONTEXT_TRACKING -/* Called on entry from user mode with IRQs off. */ +/** + * enter_from_user_mode - Establish state when coming from user mode + * + * Syscall entry disables interrupts, but user mode is traced as interrupts + * enabled. 
Also with NO_HZ_FULL RCU might be idle. + * + * 1) Tell lockdep that interrupts are disabled + * 2) Invoke context tracking if enabled to reactivate RCU + * 3) Trace interrupts off state + */ __visible noinstr void enter_from_user_mode(void) { enum ctx_state state = ct_state(); + lockdep_hardirqs_off(CALLER_ADDR0); user_exit_irqoff(); instrumentation_begin(); CT_WARN_ON(state != CONTEXT_USER); + trace_hardirqs_off_prepare(); instrumentation_end(); } #else -static inline void enter_from_user_mode(void) {} +static __always_inline void enter_from_user_mode(void) +{ + lockdep_hardirqs_off(CALLER_ADDR0); + instrumentation_begin(); + trace_hardirqs_off_prepare(); + instrumentation_end(); +} #endif static noinstr void exit_to_user_mode(void) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index bf0082bb07a1ae..65704e02e1e22b 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -967,12 +967,6 @@ SYM_FUNC_START(entry_SYSENTER_32) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: - /* - * User mode is traced as though IRQs are on, and SYSENTER - * turned them off. - */ - TRACE_IRQS_OFF - movl %esp, %eax call do_fast_syscall_32 /* XEN PV guests always use IRET path */ @@ -1082,12 +1076,6 @@ SYM_FUNC_START(entry_INT80_32) SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */ - /* - * User mode is traced as though IRQs are on, and the interrupt gate - * turned them off. - */ - TRACE_IRQS_OFF - movl %esp, %eax call do_int80_syscall_32 .Lsyscall_32_done: diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b199f43cff28d0..9e34fe849a4e5b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -167,8 +167,6 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) PUSH_AND_CLEAR_REGS rax=$-ENOSYS - TRACE_IRQS_OFF - /* IRQs are off. */ movq %rax, %rdi movq %rsp, %rsi diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index f1d3ccae5dd5e7..e2e8bd77dc2729 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -129,12 +129,6 @@ SYM_FUNC_START(entry_SYSENTER_compat) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: - /* - * User mode is traced as though IRQs are on, and SYSENTER - * turned them off. - */ - TRACE_IRQS_OFF - movq %rsp, %rdi call do_fast_syscall_32 /* XEN PV guests always use IRET path */ @@ -247,12 +241,6 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) pushq $0 /* pt_regs->r15 = 0 */ xorl %r15d, %r15d /* nospec r15 */ - /* - * User mode is traced as though IRQs are on, and SYSENTER - * turned them off. - */ - TRACE_IRQS_OFF - movq %rsp, %rdi call do_fast_syscall_32 /* XEN PV guests always use IRET path */ @@ -403,12 +391,6 @@ SYM_CODE_START(entry_INT80_compat) xorl %r15d, %r15d /* nospec r15 */ cld - /* - * User mode is traced as though IRQs are on, and the interrupt - * gate turned them off. - */ - TRACE_IRQS_OFF - movq %rsp, %rdi call do_int80_syscall_32 .Lsyscall_32_done: From 4983e5d74c821780d518232eea4acdc4a8f0b44d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Mar 2020 12:51:59 +0100 Subject: [PATCH 055/190] x86/entry: Move irq flags tracing to prepare_exit_to_usermode() This is another step towards more C-code and less convoluted ASM. Similar to the entry path, invoke the tracer before context tracking which might turn off RCU and invoke lockdep as the last step before going back to user space. 
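[ ed: In sketch form (helper names as used in this series), the ordering this establishes on the way out is:

	static __always_inline void exit_to_user_mode(void)
	{
		/*
		 * Tracer and lockdep prepare work first: both may end up in
		 * instrumentable code, and RCU is still watching here.
		 */
		instrumentation_begin();
		trace_hardirqs_on_prepare();
		lockdep_hardirqs_on_prepare(CALLER_ADDR0);
		instrumentation_end();

		user_enter_irqoff();		/* context tracking: RCU may stop watching */
		mds_user_clear_cpu_buffers();
		lockdep_hardirqs_on(CALLER_ADDR0); /* last step before user space */
	}

  The hunk below is the authoritative version. ]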
Annotate the code sections in exit_to_user_mode() accordingly so objtool won't complain about the tracer invocation. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134340.703783926@linutronix.de --- arch/x86/entry/common.c | 19 ++++++++++++++++++- arch/x86/entry/entry_32.S | 12 ++++-------- arch/x86/entry/entry_64.S | 4 ---- arch/x86/entry/entry_64_compat.S | 14 +++++---------- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 7473c1297a8419..e4f9f5f2c21b06 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -72,10 +72,27 @@ static __always_inline void enter_from_user_mode(void) } #endif -static noinstr void exit_to_user_mode(void) +/** + * exit_to_user_mode - Fixup state when exiting to user mode + * + * Syscall exit enables interrupts, but the kernel state is interrupts + * disabled when this is invoked. Also tell RCU about it. + * + * 1) Trace interrupts on state + * 2) Invoke context tracking if enabled to adjust RCU state + * 3) Clear CPU buffers if CPU is affected by MDS and the mitigation is on. + * 4) Tell lockdep that interrupts are enabled + */ +static __always_inline void exit_to_user_mode(void) { + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + user_enter_irqoff(); mds_user_clear_cpu_buffers(); + lockdep_hardirqs_on(CALLER_ADDR0); } static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 65704e02e1e22b..d9da0b7f38ff92 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -817,8 +817,7 @@ SYM_CODE_START(ret_from_fork) /* When we fork, we trace the syscall return in the child, too. */ movl %esp, %eax call syscall_return_slowpath - STACKLEAK_ERASE - jmp restore_all + jmp .Lsyscall_32_done /* kernel thread */ 1: movl %edi, %eax @@ -862,7 +861,7 @@ ret_from_intr: TRACE_IRQS_OFF movl %esp, %eax call prepare_exit_to_usermode - jmp restore_all + jmp restore_all_switch_stack SYM_CODE_END(ret_from_exception) SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE) @@ -975,8 +974,7 @@ SYM_FUNC_START(entry_SYSENTER_32) STACKLEAK_ERASE -/* Opportunistic SYSEXIT */ - TRACE_IRQS_ON /* User mode traces as IRQs on. */ + /* Opportunistic SYSEXIT */ /* * Setup entry stack - we keep the pointer in %eax and do the @@ -1079,11 +1077,9 @@ SYM_FUNC_START(entry_INT80_32) movl %esp, %eax call do_int80_syscall_32 .Lsyscall_32_done: - STACKLEAK_ERASE -restore_all: - TRACE_IRQS_ON +restore_all_switch_stack: SWITCH_TO_ENTRY_STACK CHECK_AND_APPLY_ESPFIX diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9e34fe849a4e5b..9866b54d9acc92 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -172,8 +172,6 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) movq %rsp, %rsi call do_syscall_64 /* returns with IRQs disabled */ - TRACE_IRQS_ON /* return enables interrupts */ - /* * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context.
If we're not, @@ -342,7 +340,6 @@ SYM_CODE_START(ret_from_fork) UNWIND_HINT_REGS movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ - TRACE_IRQS_ON /* user mode is traced as IRQS on */ jmp swapgs_restore_regs_and_return_to_usermode 1: @@ -620,7 +617,6 @@ ret_from_intr: .Lretint_user: mov %rsp,%rdi call prepare_exit_to_usermode - TRACE_IRQS_ON SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e2e8bd77dc2729..7c29ed89803322 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -132,8 +132,8 @@ SYM_FUNC_START(entry_SYSENTER_compat) movq %rsp, %rdi call do_fast_syscall_32 /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ - "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ + "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV jmp sysret32_from_system_call .Lsysenter_fix_flags: @@ -244,8 +244,8 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) movq %rsp, %rdi call do_fast_syscall_32 /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ - "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ + "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV /* Opportunistic SYSRET */ sysret32_from_system_call: @@ -254,7 +254,7 @@ sysret32_from_system_call: * stack. So let's erase the thread stack right now. */ STACKLEAK_ERASE - TRACE_IRQS_ON /* User mode traces as IRQs on. */ + movq RBX(%rsp), %rbx /* pt_regs->rbx */ movq RBP(%rsp), %rbp /* pt_regs->rbp */ movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ @@ -393,9 +393,5 @@ SYM_CODE_START(entry_INT80_compat) movq %rsp, %rdi call do_int80_syscall_32 -.Lsyscall_32_done: - - /* Go back to user mode. */ - TRACE_IRQS_ON jmp swapgs_restore_regs_and_return_to_usermode SYM_CODE_END(entry_INT80_compat) From a7ef9ba986b5fae9d80f8a7b31db0423687efe4e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Mar 2020 12:49:18 +0100 Subject: [PATCH 056/190] x86/speculation/mds: Mark mds_user_clear_cpu_buffers() __always_inline Prevent the compiler from uninlining and creating traceable/probable functions as this is invoked _after_ context tracking switched to CONTEXT_USER and rcu idle. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134340.902709267@linutronix.de --- arch/x86/include/asm/nospec-branch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index d52d1aacdd971c..e7752b4038ff88 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -262,7 +262,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear); * combination with microcode which triggers a CPU buffer flush when the * instruction is executed. 
*/ -static inline void mds_clear_cpu_buffers(void) +static __always_inline void mds_clear_cpu_buffers(void) { static const u16 ds = __KERNEL_DS; @@ -283,7 +283,7 @@ static inline void mds_clear_cpu_buffers(void) * * Clear CPU buffers if the corresponding static key is enabled */ -static inline void mds_user_clear_cpu_buffers(void) +static __always_inline void mds_user_clear_cpu_buffers(void) { if (static_branch_likely(&mds_user_clear)) mds_clear_cpu_buffers(); From 1c3e5d3f60e26415d4227aa1193cf9e2db4df834 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 May 2020 18:17:12 +0200 Subject: [PATCH 057/190] x86/entry: Make entry_64_compat.S objtool clean Currently entry_64_compat is exempt from objtool, but with vmlinux mode there is no hiding it. Make the following changes to make it pass: - change entry_SYSENTER_compat to STT_NOTYPE; it's not a function and doesn't have function type stack setup. - mark all STT_NOTYPE symbols with UNWIND_HINT_EMPTY; so we do validate them and don't treat them as unreachable. - don't abuse RSP as a temp register, this confuses objtool mightily as it (rightfully) thinks we're doing unspeakable things to the stack. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134341.272248024@linutronix.de --- arch/x86/entry/Makefile | 2 -- arch/x86/entry/entry_64_compat.S | 25 ++++++++++++++++++++----- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index cdf45ff92dc8b0..b7a5790d8d63e4 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -11,8 +11,6 @@ CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector- CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong -OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y - CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,) obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 7c29ed89803322..0f974ae01e62b8 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -46,12 +46,14 @@ * ebp user stack * 0(%ebp) arg6 */ -SYM_FUNC_START(entry_SYSENTER_compat) +SYM_CODE_START(entry_SYSENTER_compat) + UNWIND_HINT_EMPTY /* Interrupts are off on entry. */ SWAPGS - /* We are about to clobber %rsp anyway, clobbering here is OK */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + pushq %rax + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + popq %rax movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -104,6 +106,9 @@ SYM_FUNC_START(entry_SYSENTER_compat) xorl %r14d, %r14d /* nospec r14 */ pushq $0 /* pt_regs->r15 = 0 */ xorl %r15d, %r15d /* nospec r15 */ + + UNWIND_HINT_REGS + cld /* @@ -141,7 +146,7 @@ SYM_FUNC_START(entry_SYSENTER_compat) popfq jmp .Lsysenter_flags_fixed SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL) -SYM_FUNC_END(entry_SYSENTER_compat) +SYM_CODE_END(entry_SYSENTER_compat) /* * 32-bit SYSCALL entry. @@ -191,6 +196,7 @@ SYM_FUNC_END(entry_SYSENTER_compat) * 0(%esp) arg6 */ SYM_CODE_START(entry_SYSCALL_compat) + UNWIND_HINT_EMPTY /* Interrupts are off on entry. 
*/ swapgs @@ -241,6 +247,8 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) pushq $0 /* pt_regs->r15 = 0 */ xorl %r15d, %r15d /* nospec r15 */ + UNWIND_HINT_REGS + movq %rsp, %rdi call do_fast_syscall_32 /* XEN PV guests always use IRET path */ @@ -328,6 +336,7 @@ SYM_CODE_END(entry_SYSCALL_compat) * ebp arg6 */ SYM_CODE_START(entry_INT80_compat) + UNWIND_HINT_EMPTY /* * Interrupts are off on entry. */ @@ -349,8 +358,11 @@ SYM_CODE_START(entry_INT80_compat) /* Need to switch before accessing the thread stack. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi + /* In the Xen PV case we already run on the thread stack. */ - ALTERNATIVE "movq %rsp, %rdi", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV + ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV + + movq %rsp, %rdi movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp pushq 6*8(%rdi) /* regs->ss */ @@ -389,6 +401,9 @@ SYM_CODE_START(entry_INT80_compat) xorl %r14d, %r14d /* nospec r14 */ pushq %r15 /* pt_regs->r15 */ xorl %r15d, %r15d /* nospec r15 */ + + UNWIND_HINT_REGS + cld movq %rsp, %rdi From d73a332936a6d33be3aa3fa4bee959efab09e431 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2020 19:53:38 +0100 Subject: [PATCH 058/190] x86/traps: Mark fixup_bad_iret() noinstr This is called from deep entry ASM in a situation where instrumentation will cause more harm than providing useful information. Switch from memmove() to memcpy() because memmove() can't be called from noinstr code. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Masami Hiramatsu Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134903.346741553@linutronix.de --- arch/x86/kernel/traps.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 48468f61202cb9..b2b36561a56951 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -578,7 +578,7 @@ struct bad_iret_stack { struct pt_regs regs; }; -asmlinkage __visible notrace +asmlinkage __visible noinstr struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) { /* @@ -589,19 +589,21 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. */ - struct bad_iret_stack *new_stack = - (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + struct bad_iret_stack tmp, *new_stack = + (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; - /* Copy the IRET target to the new stack. */ - memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); + /* Copy the IRET target to the temporary storage. */ + memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8); /* Copy the remainder of the stack from the current stack. */ - memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); + memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip)); + + /* Update the entry stack */ + memcpy(new_stack, &tmp, sizeof(tmp)); BUG_ON(!user_mode(&new_stack->regs)); return new_stack; } -NOKPROBE_SYMBOL(fixup_bad_iret); #endif static bool is_sysenter_singlestep(struct pt_regs *regs) From daf7a69787b587d454adea73377a904e09fd54a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2020 23:47:51 +0100 Subject: [PATCH 059/190] x86/traps: Mark sync_regs() noinstr Replace the notrace and NOKPROBE annotations with noinstr. 
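[ ed: A before/after sketch of what this amounts to (the hunk below is authoritative):

	/* Before: tracing and kprobes excluded via two separate mechanisms */
	asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
	NOKPROBE_SYMBOL(sync_regs);

	/* After: noinstr covers both by moving the function into the
	 * .noinstr.text section
	 */
	asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs);
]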
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Masami Hiramatsu Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134903.439765290@linutronix.de --- arch/x86/kernel/traps.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b2b36561a56951..adcc62380ece2e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -564,14 +564,13 @@ NOKPROBE_SYMBOL(do_int3); * to switch to the normal thread stack if the interrupted code was in * user mode. The actual stack switch is done in entry_64.S */ -asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; if (regs != eregs) *regs = *eregs; return regs; } -NOKPROBE_SYMBOL(sync_regs); struct bad_iret_stack { void *error_entry_ret; From 410367e321b5cbd4a616161142a7d162cf55885e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Mar 2020 23:32:15 +0100 Subject: [PATCH 060/190] x86/entry: Disable interrupts for native_load_gs_index() in C code There is absolutely no point in doing this in ASM code. Move it to C. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134903.531534675@linutronix.de --- arch/x86/entry/entry_64.S | 11 +++-------- arch/x86/include/asm/special_insns.h | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9866b54d9acc92..be8ed3a84cafa2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1041,22 +1041,17 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 * * Is in entry.text as it shouldn't be instrumented. 
*/ -SYM_FUNC_START(native_load_gs_index) +SYM_FUNC_START(asm_load_gs_index) FRAME_BEGIN - pushfq - DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) - TRACE_IRQS_OFF SWAPGS .Lgs_change: movl %edi, %gs 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS - TRACE_IRQS_FLAGS (%rsp) - popfq FRAME_END ret -SYM_FUNC_END(native_load_gs_index) -EXPORT_SYMBOL(native_load_gs_index) +SYM_FUNC_END(asm_load_gs_index) +EXPORT_SYMBOL(asm_load_gs_index) _ASM_EXTABLE(.Lgs_change, .Lbad_gs) .section .fixup, "ax" diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 6d37b8fcfc778c..82436cb04ccf5f 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -7,6 +7,7 @@ #include #include +#include #include /* @@ -129,7 +130,16 @@ static inline void native_wbinvd(void) asm volatile("wbinvd": : :"memory"); } -extern asmlinkage void native_load_gs_index(unsigned); +extern asmlinkage void asm_load_gs_index(unsigned int selector); + +static inline void native_load_gs_index(unsigned int selector) +{ + unsigned long flags; + + local_irq_save(flags); + asm_load_gs_index(selector); + local_irq_restore(flags); +} static inline unsigned long __read_cr4(void) { @@ -186,7 +196,7 @@ static inline void wbinvd(void) #ifdef CONFIG_X86_64 -static inline void load_gs_index(unsigned selector) +static inline void load_gs_index(unsigned int selector) { native_load_gs_index(selector); } From c9317202af70ee03d44fdd68abebdb640b8ab411 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 May 2020 14:54:14 +0200 Subject: [PATCH 061/190] x86/entry/64: Use native swapgs in asm_load_gs_index() When PARAVIRT_XXL is in use, then load_gs_index() uses xen_load_gs_index() and asm_load_gs_index() is unused. It's therefore pointless to use the paravirtualized SWAPGS implementation in asm_load_gs_index(). Switch it to a plain swapgs. Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Acked-by: Juergen Gross Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200512213809.583980272@linutronix.de --- arch/x86/entry/entry_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index be8ed3a84cafa2..9747b42fedd559 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1043,11 +1043,11 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 */ SYM_FUNC_START(asm_load_gs_index) FRAME_BEGIN - SWAPGS + swapgs .Lgs_change: movl %edi, %gs 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE - SWAPGS + swapgs FRAME_END ret SYM_FUNC_END(asm_load_gs_index) @@ -1057,7 +1057,7 @@ EXPORT_SYMBOL(asm_load_gs_index) .section .fixup, "ax" /* running with kernelgs */ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs) - SWAPGS /* switch back to user gs */ + swapgs /* switch back to user gs */ .macro ZAP_GS /* This can't be a string because the preprocessor needs to see it. */ movl $__USER_DS, %eax From ca4c6a9858c23b4f330113f391f2eadc983e780f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Oct 2019 14:27:10 +0200 Subject: [PATCH 062/190] x86/traps: Make interrupt enable/disable symmetric in C code Traps enable interrupts conditionally but rely on the ASM return code to disable them again. That results in redundant interrupt disable and trace calls. Make the trap handlers disable interrupts before returning to avoid that, which allows simplification of the ASM entry code in follow up changes. 
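[ ed: The resulting handler shape, sketched with a hypothetical trap handler; the actual conversions are in the diff below:

	dotraplinkage void do_example_trap(struct pt_regs *regs, long error_code)
	{
		cond_local_irq_enable(regs);	/* enable only if the interrupted
						 * context had interrupts enabled */
		/* ... handle the trap ... */
		cond_local_irq_disable(regs);	/* symmetric: don't rely on the
						 * ASM return path to disable */
	}
]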
Originally-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134903.622702796@linutronix.de --- arch/x86/kernel/traps.c | 28 +++++++++++++++++++--------- arch/x86/mm/fault.c | 15 +++++++++++++-- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index adcc62380ece2e..f5f4a76fb51605 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -201,6 +201,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, NOTIFY_STOP) { cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, sicode, addr); + cond_local_irq_disable(regs); } } @@ -397,6 +398,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) die("bounds", regs, error_code); do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL); + + cond_local_irq_disable(regs); } enum kernel_gp_hint { @@ -456,12 +459,13 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) if (static_cpu_has(X86_FEATURE_UMIP)) { if (user_mode(regs) && fixup_umip_exception(regs)) - return; + goto exit; } if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + local_irq_disable(); return; } @@ -473,12 +477,11 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) show_signal(tsk, SIGSEGV, "", desc, regs, error_code); force_sig(SIGSEGV); - - return; + goto exit; } if (fixup_exception(regs, X86_TRAP_GP, error_code, 0)) - return; + goto exit; tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; @@ -490,11 +493,11 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) if (!preemptible() && kprobe_running() && kprobe_fault_handler(regs, X86_TRAP_GP)) - return; + goto exit; ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV); if (ret == NOTIFY_STOP) - return; + goto exit; if (error_code) snprintf(desc, sizeof(desc), "segment-related " GPFSTR); @@ -516,6 +519,8 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) die_addr(desc, regs, error_code, gp_addr); +exit: + cond_local_irq_disable(regs); } NOKPROBE_SYMBOL(do_general_protection); @@ -773,7 +778,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) if (!user_mode(regs)) { if (fixup_exception(regs, trapnr, error_code, 0)) - return; + goto exit; task->thread.error_code = error_code; task->thread.trap_nr = trapnr; @@ -781,7 +786,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) != NOTIFY_STOP) die(str, regs, error_code); - return; + goto exit; } /* @@ -795,10 +800,12 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) si_code = fpu__exception_code(fpu, trapnr); /* Retry when we get spurious exceptions: */ if (!si_code) - return; + goto exit; force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); +exit: + cond_local_irq_disable(regs); } dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) @@ -853,6 +860,8 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); + + cond_local_irq_disable(regs); return; } #endif @@ -883,6 +892,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) do_trap(X86_TRAP_IRET, 
SIGILL, "iret exception", regs, error_code, ILL_BADSTK, (void __user *)NULL); } + local_irq_disable(); } #endif diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 0b03ae8c39cdb2..53db18615f31c5 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -786,6 +786,8 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, force_sig_fault(SIGSEGV, si_code, (void __user *)address); + local_irq_disable(); + return; } @@ -1384,9 +1386,18 @@ do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, return; /* Was the fault on kernel-controlled part of the address space? */ - if (unlikely(fault_in_kernel_space(address))) + if (unlikely(fault_in_kernel_space(address))) { do_kern_addr_fault(regs, hw_error_code, address); - else + } else { do_user_addr_fault(regs, hw_error_code, address); + /* + * User address page fault handling might have reenabled + * interrupts. Fixing up all potential exit points of + * do_user_addr_fault() and its leaf functions is just not + * doable w/o creating an unholy mess or turning the code + * upside down. + */ + local_irq_disable(); + } } NOKPROBE_SYMBOL(do_page_fault); From 877f183f83cc33b1b09313b9c18ab7ee5abda44f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:07 +0100 Subject: [PATCH 063/190] x86/traps: Split trap numbers out in a separate header So they can be used in ASM code. For this it is also necessary to convert them to defines. Will be used for the rework of the entry code. Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134903.731004084@linutronix.de --- arch/x86/include/asm/trapnr.h | 31 +++++++++++++++++++++++++++++++ arch/x86/include/asm/traps.h | 26 +------------------------- 2 files changed, 32 insertions(+), 25 deletions(-) create mode 100644 arch/x86/include/asm/trapnr.h diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h new file mode 100644 index 00000000000000..082f45631fa955 --- /dev/null +++ b/arch/x86/include/asm/trapnr.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_TRAPNR_H +#define _ASM_X86_TRAPNR_H + +/* Interrupts/Exceptions */ + +#define X86_TRAP_DE 0 /* Divide-by-zero */ +#define X86_TRAP_DB 1 /* Debug */ +#define X86_TRAP_NMI 2 /* Non-maskable Interrupt */ +#define X86_TRAP_BP 3 /* Breakpoint */ +#define X86_TRAP_OF 4 /* Overflow */ +#define X86_TRAP_BR 5 /* Bound Range Exceeded */ +#define X86_TRAP_UD 6 /* Invalid Opcode */ +#define X86_TRAP_NM 7 /* Device Not Available */ +#define X86_TRAP_DF 8 /* Double Fault */ +#define X86_TRAP_OLD_MF 9 /* Coprocessor Segment Overrun */ +#define X86_TRAP_TS 10 /* Invalid TSS */ +#define X86_TRAP_NP 11 /* Segment Not Present */ +#define X86_TRAP_SS 12 /* Stack Segment Fault */ +#define X86_TRAP_GP 13 /* General Protection Fault */ +#define X86_TRAP_PF 14 /* Page Fault */ +#define X86_TRAP_SPURIOUS 15 /* Spurious Interrupt */ +#define X86_TRAP_MF 16 /* x87 Floating-Point Exception */ +#define X86_TRAP_AC 17 /* Alignment Check */ +#define X86_TRAP_MC 18 /* Machine Check */ +#define X86_TRAP_XF 19 /* SIMD Floating-Point Exception */ +#define X86_TRAP_VE 20 /* Virtualization Exception */ +#define X86_TRAP_CP 21 /* Control Protection Exception */ +#define X86_TRAP_IRET 32 /* IRET Exception */ + +#endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 2ae904bf25e423..2376620dc66f3e 100644 --- a/arch/x86/include/asm/traps.h +++ 
b/arch/x86/include/asm/traps.h @@ -7,6 +7,7 @@ #include #include /* TRAP_TRACE, ... */ +#include #define dotraplinkage __visible @@ -122,31 +123,6 @@ void __noreturn handle_stack_overflow(const char *message, unsigned long fault_address); #endif -/* Interrupts/Exceptions */ -enum { - X86_TRAP_DE = 0, /* 0, Divide-by-zero */ - X86_TRAP_DB, /* 1, Debug */ - X86_TRAP_NMI, /* 2, Non-maskable Interrupt */ - X86_TRAP_BP, /* 3, Breakpoint */ - X86_TRAP_OF, /* 4, Overflow */ - X86_TRAP_BR, /* 5, Bound Range Exceeded */ - X86_TRAP_UD, /* 6, Invalid Opcode */ - X86_TRAP_NM, /* 7, Device Not Available */ - X86_TRAP_DF, /* 8, Double Fault */ - X86_TRAP_OLD_MF, /* 9, Coprocessor Segment Overrun */ - X86_TRAP_TS, /* 10, Invalid TSS */ - X86_TRAP_NP, /* 11, Segment Not Present */ - X86_TRAP_SS, /* 12, Stack Segment Fault */ - X86_TRAP_GP, /* 13, General Protection Fault */ - X86_TRAP_PF, /* 14, Page Fault */ - X86_TRAP_SPURIOUS, /* 15, Spurious Interrupt */ - X86_TRAP_MF, /* 16, x87 Floating-Point Exception */ - X86_TRAP_AC, /* 17, Alignment Check */ - X86_TRAP_MC, /* 18, Machine Check */ - X86_TRAP_XF, /* 19, SIMD Floating-Point Exception */ - X86_TRAP_IRET = 32, /* 32, IRET Exception */ -}; - /* * Page fault error code bits: * From 67f1386616dc0c70f30f214245521c582666edac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:09 +0100 Subject: [PATCH 064/190] x86/entry/64: Reorder idtentries Move them all together so verifying the cleanup patches for binary equivalence will be easier. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134903.841853522@linutronix.de --- arch/x86/entry/entry_64.S | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9747b42fedd559..e62061e02b21c5 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1020,20 +1020,36 @@ _ASM_NOKPROBE(\sym) SYM_CODE_END(\sym) .endm + idtentry divide_error do_divide_error has_error_code=0 idtentry overflow do_overflow has_error_code=0 +idtentry int3 do_int3 has_error_code=0 create_gap=1 idtentry bounds do_bounds has_error_code=0 idtentry invalid_op do_invalid_op has_error_code=0 idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1 idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 idtentry invalid_TSS do_invalid_TSS has_error_code=1 idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry stack_segment do_stack_segment has_error_code=1 +idtentry general_protection do_general_protection has_error_code=1 idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 idtentry coprocessor_error do_coprocessor_error has_error_code=0 idtentry alignment_check do_alignment_check has_error_code=1 idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 +idtentry page_fault do_page_fault has_error_code=1 read_cr2=1 + +#ifdef CONFIG_X86_MCE +idtentry machine_check do_mce has_error_code=0 paranoid=1 +#endif +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET +idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1 + +#ifdef CONFIG_XEN_PV +idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 +idtentry xennmi do_nmi has_error_code=0 
+idtentry xendebug do_debug has_error_code=0 +#endif /* * Reload gs selector with exception handling @@ -1084,8 +1100,6 @@ SYM_FUNC_END(do_softirq_own_stack) .popsection #ifdef CONFIG_XEN_PV -idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 - /* * A note on the "critical region" in our callback handler. * We want to avoid stacking callback handlers due to events occurring @@ -1188,22 +1202,6 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ acrn_hv_callback_vector acrn_hv_vector_handler #endif -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET -idtentry int3 do_int3 has_error_code=0 create_gap=1 -idtentry stack_segment do_stack_segment has_error_code=1 - -#ifdef CONFIG_XEN_PV -idtentry xennmi do_nmi has_error_code=0 -idtentry xendebug do_debug has_error_code=0 -#endif - -idtentry general_protection do_general_protection has_error_code=1 -idtentry page_fault do_page_fault has_error_code=1 read_cr2=1 - -#ifdef CONFIG_X86_MCE -idtentry machine_check do_mce has_error_code=0 paranoid=1 -#endif - /* * Save all registers in pt_regs, and switch gs if needed. * Use slow, but surefire "are we in kernel?" check. From cfa82a00533f7074011a3a49fbb6ed1b1f6fa010 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:10 +0100 Subject: [PATCH 065/190] x86/entry: Disentangle idtentry idtentry is a completely unreadable maze. Split it into distinct idtentry variants which only contain the minimal code: - idtentry for regular exceptions - idtentry_mce_db for #MC and #DB - idtentry_df for #DF The generated binary code is equivalent. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134903.949227617@linutronix.de --- arch/x86/entry/entry_64.S | 403 +++++++++++++++++++++----------------- 1 file changed, 220 insertions(+), 183 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e62061e02b21c5..01bfe7f1bea5b6 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -493,6 +494,202 @@ SYM_CODE_END(spurious_entries_start) decl PER_CPU_VAR(irq_count) .endm +/** + * idtentry_body - Macro to emit code calling the C function + * @vector: Vector number + * @cfunc: C function to be called + * @has_error_code: Hardware pushed error code on stack + */ +.macro idtentry_body vector cfunc has_error_code:req + + call error_entry + UNWIND_HINT_REGS + + .if \vector == X86_TRAP_PF + /* + * Store CR2 early so subsequent faults cannot clobber it. Use R12 as + * intermediate storage as RDX can be clobbered in enter_from_user_mode(). + * GET_CR2_INTO can clobber RAX.
+ */ + GET_CR2_INTO(%r12); + .endif + + TRACE_IRQS_OFF + +#ifdef CONFIG_CONTEXT_TRACKING + testb $3, CS(%rsp) + jz .Lfrom_kernel_no_ctxt_tracking_\@ + CALL_enter_from_user_mode +.Lfrom_kernel_no_ctxt_tracking_\@: +#endif + + movq %rsp, %rdi /* pt_regs pointer into 1st argument*/ + + .if \has_error_code == 1 + movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi, %esi /* Clear the error code */ + .endif + + .if \vector == X86_TRAP_PF + movq %r12, %rdx /* Move CR2 into 3rd argument */ + .endif + + call \cfunc + + jmp error_exit +.endm + +/** + * idtentry - Macro to generate entry stubs for simple IDT entries + * @vector: Vector number + * @asmsym: ASM symbol for the entry point + * @cfunc: C function to be called + * @has_error_code: Hardware pushed error code on stack + * + * The macro emits code to set up the kernel context for straight forward + * and simple IDT entries. No IST stack, no paranoid entry checks. + */ +.macro idtentry vector asmsym cfunc has_error_code:req +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS offset=\has_error_code*8 + ASM_CLAC + + .if \has_error_code == 0 + pushq $-1 /* ORIG_RAX: no syscall to restart */ + .endif + + .if \vector == X86_TRAP_BP + /* + * If coming from kernel space, create a 6-word gap to allow the + * int3 handler to emulate a call instruction. + */ + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_no_gap_\@ + .rept 6 + pushq 5*8(%rsp) + .endr + UNWIND_HINT_IRET_REGS offset=8 +.Lfrom_usermode_no_gap_\@: + .endif + + idtentry_body \vector \cfunc \has_error_code + +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) +.endm + +/* + * MCE and DB exceptions + */ +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) + +/** + * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB + * @vector: Vector number + * @asmsym: ASM symbol for the entry point + * @cfunc: C function to be called + * + * The macro emits code to set up the kernel context for #MC and #DB + * + * If the entry comes from user space it uses the normal entry path + * including the return to user space work and preemption checks on + * exit. + * + * If hits in kernel mode then it needs to go through the paranoid + * entry as the exception can hit any random state. No preemption + * check on exit to keep the paranoid path simple. + * + * If the trap is #DB then the interrupt stack entry in the IST is + * moved to the second stack, so a potential recursion will have a + * fresh IST. + */ +.macro idtentry_mce_db vector asmsym cfunc +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS + ASM_CLAC + + pushq $-1 /* ORIG_RAX: no syscall to restart */ + + /* + * If the entry is from userspace, switch stacks and treat it as + * a normal entry. + */ + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_switch_stack_\@ + + /* + * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. 
+ * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS + */ + call paranoid_entry + + UNWIND_HINT_REGS + + .if \vector == X86_TRAP_DB + TRACE_IRQS_OFF_DEBUG + .else + TRACE_IRQS_OFF + .endif + + movq %rsp, %rdi /* pt_regs pointer */ + xorl %esi, %esi /* Clear the error code */ + + .if \vector == X86_TRAP_DB + subq $DB_STACK_OFFSET, CPU_TSS_IST(IST_INDEX_DB) + .endif + + call \cfunc + + .if \vector == X86_TRAP_DB + addq $DB_STACK_OFFSET, CPU_TSS_IST(IST_INDEX_DB) + .endif + + jmp paranoid_exit + + /* Switch to the regular task stack and use the noist entry point */ +.Lfrom_usermode_switch_stack_\@: + idtentry_body vector \cfunc, has_error_code=0 + +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) +.endm + +/* + * Double fault entry. Straight paranoid. No checks from which context + * this comes because for the espfix induced #DF this would do the wrong + * thing. + */ +.macro idtentry_df vector asmsym cfunc +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS offset=8 + ASM_CLAC + + /* + * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. + * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS + */ + call paranoid_entry + UNWIND_HINT_REGS + + /* Read CR2 early */ + GET_CR2_INTO(%r12); + + TRACE_IRQS_OFF + + movq %rsp, %rdi /* pt_regs pointer into first argument */ + movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + movq %r12, %rdx /* Move CR2 into 3rd argument */ + call \cfunc + + jmp paranoid_exit + +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) +.endm + /* * Interrupt entry helper function. * @@ -860,195 +1057,35 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /* * Exception entry points. */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) - -.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0 - - .if \paranoid - call paranoid_entry - /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - .else - call error_entry - .endif - UNWIND_HINT_REGS - - .if \read_cr2 - /* - * Store CR2 early so subsequent faults cannot clobber it. Use R12 as - * intermediate storage as RDX can be clobbered in enter_from_user_mode(). - * GET_CR2_INTO can clobber RAX. - */ - GET_CR2_INTO(%r12); - .endif - - .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ - .else - TRACE_IRQS_OFF - .endif - -#ifdef CONFIG_CONTEXT_TRACKING - .if \paranoid == 0 - testb $3, CS(%rsp) - jz .Lfrom_kernel_no_context_tracking_\@ - CALL_enter_from_user_mode -.Lfrom_kernel_no_context_tracking_\@: - .endif -#endif - - movq %rsp, %rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp), %rsi /* get error code */ - movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi, %esi /* no error code */ - .endif - - .if \shift_ist != -1 - subq $\ist_offset, CPU_TSS_IST(\shift_ist) - .endif - - .if \read_cr2 - movq %r12, %rdx /* Move CR2 into 3rd argument */ - .endif - - call \do_sym - - .if \shift_ist != -1 - addq $\ist_offset, CPU_TSS_IST(\shift_ist) - .endif - - .if \paranoid - /* this procedure expect "no swapgs" flag in ebx */ - jmp paranoid_exit - .else - jmp error_exit - .endif - -.endm - -/** - * idtentry - Generate an IDT entry stub - * @sym: Name of the generated entry point - * @do_sym: C function to be called - * @has_error_code: True if this IDT vector has an error code on the stack - * @paranoid: non-zero means that this vector may be invoked from - * kernel mode with user GSBASE and/or user CR3. 
- * 2 is special -- see below. - * @shift_ist: Set to an IST index if entries from kernel mode should - * decrement the IST stack so that nested entries get a - * fresh stack. (This is for #DB, which has a nasty habit - * of recursing.) - * @create_gap: create a 6-word stack gap when coming from kernel mode. - * @read_cr2: load CR2 into the 3rd argument; done before calling any C code - * - * idtentry generates an IDT stub that sets up a usable kernel context, - * creates struct pt_regs, and calls @do_sym. The stub has the following - * special behaviors: - * - * On an entry from user mode, the stub switches from the trampoline or - * IST stack to the normal thread stack. On an exit to user mode, the - * normal exit-to-usermode path is invoked. - * - * On an exit to kernel mode, if @paranoid == 0, we check for preemption, - * whereas we omit the preemption check if @paranoid != 0. This is purely - * because the implementation is simpler this way. The kernel only needs - * to check for asynchronous kernel preemption when IRQ handlers return. - * - * If @paranoid == 0, then the stub will handle IRET faults by pretending - * that the fault came from user mode. It will handle gs_change faults by - * pretending that the fault happened with kernel GSBASE. Since this handling - * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have - * @paranoid == 0. This special handling will do the wrong thing for - * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0. - * - * @paranoid == 2 is special: the stub will never switch stacks. This is for - * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. - */ -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0 -SYM_CODE_START(\sym) - UNWIND_HINT_IRET_REGS offset=\has_error_code*8 - - /* Sanity check */ - .if \shift_ist != -1 && \paranoid != 1 - .error "using shift_ist requires paranoid=1" - .endif - - .if \create_gap && \paranoid - .error "using create_gap requires paranoid=0" - .endif - - ASM_CLAC - - .if \has_error_code == 0 - pushq $-1 /* ORIG_RAX: no syscall to restart */ - .endif - - .if \paranoid == 1 - testb $3, CS-ORIG_RAX(%rsp) /* If coming from userspace, switch stacks */ - jnz .Lfrom_usermode_switch_stack_\@ - .endif - - .if \create_gap == 1 - /* - * If coming from kernel space, create a 6-word gap to allow the - * int3 handler to emulate a call instruction. - */ - testb $3, CS-ORIG_RAX(%rsp) - jnz .Lfrom_usermode_no_gap_\@ - .rept 6 - pushq 5*8(%rsp) - .endr - UNWIND_HINT_IRET_REGS offset=8 -.Lfrom_usermode_no_gap_\@: - .endif - - idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset - - .if \paranoid == 1 - /* - * Entry from userspace. Switch stacks and treat it - * as a normal entry. This means that paranoid handlers - * run in real process context if user_mode(regs). 
- */ -.Lfrom_usermode_switch_stack_\@: - idtentry_part \do_sym, \has_error_code, \read_cr2, paranoid=0 - .endif - -_ASM_NOKPROBE(\sym) -SYM_CODE_END(\sym) -.endm - -idtentry divide_error do_divide_error has_error_code=0 -idtentry overflow do_overflow has_error_code=0 -idtentry int3 do_int3 has_error_code=0 create_gap=1 -idtentry bounds do_bounds has_error_code=0 -idtentry invalid_op do_invalid_op has_error_code=0 -idtentry device_not_available do_device_not_available has_error_code=0 -idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 -idtentry invalid_TSS do_invalid_TSS has_error_code=1 -idtentry segment_not_present do_segment_not_present has_error_code=1 -idtentry stack_segment do_stack_segment has_error_code=1 -idtentry general_protection do_general_protection has_error_code=1 -idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 -idtentry coprocessor_error do_coprocessor_error has_error_code=0 -idtentry alignment_check do_alignment_check has_error_code=1 -idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 - -idtentry page_fault do_page_fault has_error_code=1 read_cr2=1 +idtentry X86_TRAP_DE divide_error do_divide_error has_error_code=0 +idtentry X86_TRAP_OF overflow do_overflow has_error_code=0 +idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 +idtentry X86_TRAP_BR bounds do_bounds has_error_code=0 +idtentry X86_TRAP_UD invalid_op do_invalid_op has_error_code=0 +idtentry X86_TRAP_NM device_not_available do_device_not_available has_error_code=0 +idtentry X86_TRAP_OLD_MF coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry X86_TRAP_TS invalid_TSS do_invalid_TSS has_error_code=1 +idtentry X86_TRAP_NP segment_not_present do_segment_not_present has_error_code=1 +idtentry X86_TRAP_SS stack_segment do_stack_segment has_error_code=1 +idtentry X86_TRAP_GP general_protection do_general_protection has_error_code=1 +idtentry X86_TRAP_SPURIOUS spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry X86_TRAP_MF coprocessor_error do_coprocessor_error has_error_code=0 +idtentry X86_TRAP_AC alignment_check do_alignment_check has_error_code=1 +idtentry X86_TRAP_XF simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 + +idtentry X86_TRAP_PF page_fault do_page_fault has_error_code=1 #ifdef CONFIG_X86_MCE -idtentry machine_check do_mce has_error_code=0 paranoid=1 +idtentry_mce_db X86_TRAP_MCE machine_check do_mce #endif -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1 +idtentry_mce_db X86_TRAP_DB debug do_debug +idtentry_df X86_TRAP_DF double_fault do_double_fault #ifdef CONFIG_XEN_PV -idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 -idtentry xennmi do_nmi has_error_code=0 -idtentry xendebug do_debug has_error_code=0 +idtentry 512 /* dummy */ hypervisor_callback xen_do_hypervisor_callback has_error_code=0 +idtentry X86_TRAP_NMI xennmi do_nmi has_error_code=0 +idtentry X86_TRAP_DB xendebug do_debug has_error_code=0 #endif /* From 424c7d0a9a396ba93815b8861033e62791622cc3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 26 Mar 2020 16:56:20 +0100 Subject: [PATCH 066/190] x86/entry/64: Provide sane error entry/exit For gradual conversion provide a macro parameter and the required code which allows to handle instrumentation and interrupt flags tracking in C. 
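[ ed: With sane=1 the stub omits TRACE_IRQS_OFF and the context tracking call and returns through error_return; the C entry point then has to do the equivalent work itself. Roughly (idtentry_enter()/idtentry_exit() are introduced by later patches in this series; the handler name is illustrative):

	__visible noinstr void example_exception_handler(struct pt_regs *regs)
	{
		idtentry_enter(regs);		/* lockdep, context tracking, tracing */
		instrumentation_begin();
		/* ... exception handling ... */
		instrumentation_end();
		idtentry_exit(regs);		/* return-to-user/kernel work */
	}
]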
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134904.058904490@linutronix.de --- arch/x86/entry/entry_64.S | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 01bfe7f1bea5b6..96ad26f1bcf37a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -499,8 +499,9 @@ SYM_CODE_END(spurious_entries_start) * @vector: Vector number * @cfunc: C function to be called * @has_error_code: Hardware pushed error code on stack + * @sane: Sane variant which handles irq tracing, context tracking in C */ -.macro idtentry_body vector cfunc has_error_code:req +.macro idtentry_body vector cfunc has_error_code:req sane=0 call error_entry UNWIND_HINT_REGS @@ -514,6 +515,7 @@ SYM_CODE_END(spurious_entries_start) GET_CR2_INTO(%r12); .endif + .if \sane == 0 TRACE_IRQS_OFF #ifdef CONFIG_CONTEXT_TRACKING @@ -522,6 +524,7 @@ SYM_CODE_END(spurious_entries_start) CALL_enter_from_user_mode .Lfrom_kernel_no_ctxt_tracking_\@: #endif + .endif movq %rsp, %rdi /* pt_regs pointer into 1st argument*/ @@ -538,7 +541,11 @@ SYM_CODE_END(spurious_entries_start) call \cfunc + .if \sane == 0 jmp error_exit + .else + jmp error_return + .endif .endm /** @@ -547,11 +554,12 @@ SYM_CODE_END(spurious_entries_start) * @asmsym: ASM symbol for the entry point * @cfunc: C function to be called * @has_error_code: Hardware pushed error code on stack + * @sane: Sane variant which handles irq tracing, context tracking in C * * The macro emits code to set up the kernel context for straight forward * and simple IDT entries. No IST stack, no paranoid entry checks. */ -.macro idtentry vector asmsym cfunc has_error_code:req +.macro idtentry vector asmsym cfunc has_error_code:req sane=0 SYM_CODE_START(\asmsym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 ASM_CLAC @@ -574,7 +582,7 @@ SYM_CODE_START(\asmsym) .Lfrom_usermode_no_gap_\@: .endif - idtentry_body \vector \cfunc \has_error_code + idtentry_body \vector \cfunc \has_error_code \sane _ASM_NOKPROBE(\asmsym) SYM_CODE_END(\asmsym) @@ -1403,6 +1411,14 @@ SYM_CODE_START_LOCAL(error_exit) jmp .Lretint_user SYM_CODE_END(error_exit) +SYM_CODE_START_LOCAL(error_return) + UNWIND_HINT_REGS + DEBUG_ENTRY_ASSERT_IRQS_OFF + testb $3, CS(%rsp) + jz restore_regs_and_return_to_kernel + jmp swapgs_restore_regs_and_return_to_usermode +SYM_CODE_END(error_return) + /* * Runs on exception stack. Xen PV does not go through this path at all, * so we can use real assembly here. From 60400677e1280dae7d903e5997fb1cfabb22d4bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:11 +0100 Subject: [PATCH 067/190] x86/entry/32: Provide macro to emit IDT entry stubs 32 and 64 bit have unnecessarily different ways to populate the exception entry code. Provide an idtentry macro which allows consolidating all of that.
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134904.166735365@linutronix.de --- arch/x86/entry/entry_32.S | 68 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index d9da0b7f38ff92..eb64e78052e17c 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -44,6 +44,7 @@ #include #include #include +#include #include #include "calling.h" @@ -726,6 +727,31 @@ .Lend_\@: .endm + +/** + * idtentry - Macro to generate entry stubs for simple IDT entries + * @vector: Vector number + * @asmsym: ASM symbol for the entry point + * @cfunc: C function to be called + * @has_error_code: Hardware pushed error code on stack + * @sane: Compatibility flag with 64bit + */ +.macro idtentry vector asmsym cfunc has_error_code:req sane=0 +SYM_CODE_START(\asmsym) + ASM_CLAC + cld + + .if \has_error_code == 0 + pushl $0 /* Clear the error code */ + .endif + + /* Push the C-function address into the GS slot */ + pushl $\cfunc + /* Invoke the common exception entry */ + jmp handle_exception +SYM_CODE_END(\asmsym) +.endm + /* * %eax: prev task * %edx: next task @@ -1517,6 +1543,48 @@ SYM_CODE_START_LOCAL_NOALIGN(common_exception) jmp ret_from_exception SYM_CODE_END(common_exception) +SYM_CODE_START_LOCAL_NOALIGN(handle_exception) + /* the function address is in %gs's slot on the stack */ + SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 + ENCODE_FRAME_POINTER + + /* fixup %gs */ + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + + /* fixup orig %eax */ + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + + movl %esp, %eax # pt_regs pointer + CALL_NOSPEC edi + +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else + /* + * We can be coming here from child spawned by kernel_thread(). + */ + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax +#endif + cmpl $USER_RPL, %eax # returning to v8086 or userspace ? + jnb ret_to_user + + PARANOID_EXIT_TO_KERNEL_MODE + BUG_IF_WRONG_CR3 + RESTORE_REGS 4 + jmp .Lirq_return + +ret_to_user: + movl %esp, %eax + jmp restore_all_switch_stack +SYM_CODE_END(handle_exception) + SYM_CODE_START(debug) /* * Entry from sysenter is now handled in common_exception From 53aaf262c66ee237e4163f1af347939ebd9c51c2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:12 +0100 Subject: [PATCH 068/190] x86/idtentry: Provide macros to define/declare IDT entry points Provide DECLARE/DEFINE_IDTENTRY() macros. DEFINE_IDTENTRY() provides a wrapper which acts as the function definition. The exception handler body is just appended to it with curly brackets. The entry point is marked noinstr so that irq tracing and the enter_from_user_mode() can be moved into the C-entry point. As all C-entries use the same macro (or a later variant) the necessary entry handling can be implemented at one central place. DECLARE_IDTENTRY() provides the function prototypes: - The C entry point cfunc - The ASM entry point asm_cfunc - The XEN/PV entry point xen_asm_cfunc They all follow the same naming convention. When included from ASM code DECLARE_IDTENTRY() is a macro which emits the low level entry point in assembly by instantiating idtentry. 
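[ ed: Usage sketch; the handler name and body follow the conversions done later in the series and are shown here for illustration only:

	/* Header: emits prototypes for exc_divide_error, asm_exc_divide_error
	 * and xen_asm_exc_divide_error
	 */
	DECLARE_IDTENTRY(X86_TRAP_DE, exc_divide_error);

	/* traps.c: the body runs between idtentry_enter() and idtentry_exit()
	 * with instrumentation enabled
	 */
	DEFINE_IDTENTRY(exc_divide_error)
	{
		do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE,
			      FPE_INTDIV, error_get_trap_addr(regs));
	}
]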
IDTENTRY is the simplest variant which just has a pt_regs argument. It's going to be used for all exceptions which have no error code. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134904.273363275@linutronix.de --- arch/x86/entry/entry_32.S | 6 +++ arch/x86/entry/entry_64.S | 6 +++ arch/x86/include/asm/idtentry.h | 67 +++++++++++++++++++++++++++++++++ arch/x86/include/asm/traps.h | 2 +- 4 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/idtentry.h diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index eb64e78052e17c..8c0e07e2860f00 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -752,6 +752,12 @@ SYM_CODE_START(\asmsym) SYM_CODE_END(\asmsym) .endm +/* + * Include the defines which emit the idt entries which are shared + * shared between 32 and 64 bit. + */ +#include + /* * %eax: prev task * %edx: next task diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 96ad26f1bcf37a..ee071629055747 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -698,6 +698,12 @@ _ASM_NOKPROBE(\asmsym) SYM_CODE_END(\asmsym) .endm +/* + * Include the defines which emit the idt entries which are shared + * shared between 32 and 64 bit. + */ +#include + /* * Interrupt entry helper function. * diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h new file mode 100644 index 00000000000000..bbd81e200b32c7 --- /dev/null +++ b/arch/x86/include/asm/idtentry.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_IDTENTRY_H +#define _ASM_X86_IDTENTRY_H + +/* Interrupts/Exceptions */ +#include + +#ifndef __ASSEMBLY__ + +/** + * DECLARE_IDTENTRY - Declare functions for simple IDT entry points + * No error code pushed by hardware + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Declares three functions: + * - The ASM entry point: asm_##func + * - The XEN PV trap entry point: xen_##func (maybe unused) + * - The C handler called from the ASM entry point + * + * Note: This is the C variant of DECLARE_IDTENTRY(). As the name says it + * declares the entry points for usage in C code. There is an ASM variant + * as well which is used to emit the entry stubs in entry_32/64.S. + */ +#define DECLARE_IDTENTRY(vector, func) \ + asmlinkage void asm_##func(void); \ + asmlinkage void xen_asm_##func(void); \ + __visible void func(struct pt_regs *regs) + +/** + * DEFINE_IDTENTRY - Emit code for simple IDT entry points + * @func: Function name of the entry point + * + * @func is called from ASM entry code with interrupts disabled. + * + * The macro is written so it acts as function definition. Append the + * body with a pair of curly brackets. + * + * idtentry_enter() contains common code which has to be invoked before + * arbitrary code in the body. idtentry_exit() contains common code + * which has to run before returning to the low level assembly code. + */ +#define DEFINE_IDTENTRY(func) \ +static __always_inline void __##func(struct pt_regs *regs); \ + \ +__visible noinstr void func(struct pt_regs *regs) \ +{ \ + idtentry_enter(regs); \ + instrumentation_begin(); \ + __##func (regs); \ + instrumentation_end(); \ + idtentry_exit(regs); \ +} \ + \ +static __always_inline void __##func(struct pt_regs *regs) + +#else /* !__ASSEMBLY__ */ + +/* + * The ASM variants for DECLARE_IDTENTRY*() which emit the ASM entry stubs. 
+ */ +#define DECLARE_IDTENTRY(vector, func) \ + idtentry vector asm_##func func has_error_code=0 sane=1 + +#endif /* __ASSEMBLY__ */ + +#endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 2376620dc66f3e..814273f8b166ac 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -6,8 +6,8 @@ #include #include +#include #include /* TRAP_TRACE, ... */ -#include #define dotraplinkage __visible From 0ba50e861ae9f91ffb6589e3a8ecbd47859b0275 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 26 Mar 2020 16:28:52 +0100 Subject: [PATCH 069/190] x86/entry/common: Provide idtentry_enter/exit() Provide functions which handle the low level entry and exit similar to enter/exit from user mode. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134904.457578656@linutronix.de --- arch/x86/entry/common.c | 99 +++++++++++++++++++++++++++++++++ arch/x86/include/asm/idtentry.h | 3 + 2 files changed, 102 insertions(+) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index e4f9f5f2c21b06..9ebe33485428f8 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -510,3 +510,102 @@ SYSCALL_DEFINE0(ni_syscall) { return -ENOSYS; } + +/** + * idtentry_enter - Handle state tracking on idtentry + * @regs: Pointer to pt_regs of interrupted context + * + * Invokes: + * - lockdep irqflag state tracking as low level ASM entry disabled + * interrupts. + * + * - Context tracking if the exception hit user mode. + * + * - RCU notification if the exception hit kernel mode. + * + * - The hardirq tracer to keep the state consistent as low level ASM + * entry disabled interrupts. + */ +void noinstr idtentry_enter(struct pt_regs *regs) +{ + if (user_mode(regs)) { + enter_from_user_mode(); + } else { + lockdep_hardirqs_off(CALLER_ADDR0); + rcu_irq_enter(); + instrumentation_begin(); + trace_hardirqs_off_prepare(); + instrumentation_end(); + } +} + +/** + * idtentry_exit - Common code to handle return from exceptions + * @regs: Pointer to pt_regs (exception entry regs) + * + * Depending on the return target (kernel/user) this runs the necessary + * preemption and work checks if possible and required and returns to + * the caller with interrupts disabled and no further work pending. + * + * This is the last action before returning to the low level ASM code which + * just needs to return to the appropriate context. + * + * Invoked by all exception/interrupt IDTENTRY handlers which are not + * returning through the paranoid exit path (all except NMI, #DF and the IST + * variants of #MC and #DB) and are therefore on the thread stack. + */ +void noinstr idtentry_exit(struct pt_regs *regs) +{ + lockdep_assert_irqs_disabled(); + + /* Check whether this returns to user mode */ + if (user_mode(regs)) { + prepare_exit_to_usermode(regs); + } else if (regs->flags & X86_EFLAGS_IF) { + /* Check kernel preemption, if enabled */ + if (IS_ENABLED(CONFIG_PREEMPTION)) { + /* + * This needs to be done very carefully. + * idtentry_enter() invoked rcu_irq_enter(). This + * needs to be undone before scheduling. + * + * Preemption is disabled inside of RCU idle + * sections. When the task returns from + * preempt_schedule_irq(), RCU is still watching. 
+ * + * rcu_irq_exit_preempt() has additional state + * checking if CONFIG_PROVE_RCU=y. + */ + if (!preempt_count()) { + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + WARN_ON_ONCE(!on_thread_stack()); + instrumentation_begin(); + rcu_irq_exit_preempt(); + if (need_resched()) + preempt_schedule_irq(); + /* Covers both tracing and lockdep */ + trace_hardirqs_on(); + instrumentation_end(); + return; + } + } + /* + * If preemption is disabled then this needs to be done + * carefully with respect to RCU. The exception might come + * from an RCU idle section in the idle task due to the fact + * that safe_halt() enables interrupts. So this needs the + * same ordering of lockdep/tracing and RCU as the return + * to user mode path. + */ + instrumentation_begin(); + /* Tell the tracer that IRET will enable interrupts */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + rcu_irq_exit(); + lockdep_hardirqs_on(CALLER_ADDR0); + } else { + /* IRQ flags state is correct already. Just tell RCU */ + rcu_irq_exit(); + } +}
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index bbd81e200b32c7..2adfd80ea2f39a 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -7,6 +7,9 @@ #ifndef __ASSEMBLY__ +void idtentry_enter(struct pt_regs *regs); +void idtentry_exit(struct pt_regs *regs); + /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware
From 218e31b6e7a33c9b5e5d608aa79d51665bb84e62 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:13 +0100 Subject: [PATCH 070/190] x86/traps: Prepare for using DEFINE_IDTENTRY Prepare for using IDTENTRY to define the C exception/trap entry points. It would be possible to glue this into the existing macro maze, but in the end it is simpler and easier to read to just make them distinct. Provide a trivial inline helper to read the trap address and add a comment explaining the logic behind it. The existing macros will be removed once all instances are converted. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134904.556327833@linutronix.de --- arch/x86/kernel/traps.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f5f4a76fb51605..3857c0fd3306d4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -205,6 +205,21 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, } } +/* + * POSIX requires the address of the faulting instruction to be provided + * for SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t. + * + * This address is usually regs->ip, but when an uprobe moved the code out + * of line, regs->ip points to the XOL code which would confuse + * anything which analyzes the fault address vs. the unmodified binary. If + * a trap happened in XOL code, the uprobe code maps regs->ip back to the + * original instruction address.
+ */ +static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) +{ + return (void __user *)uprobe_get_trap_addr(regs); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ From 9d06c4027f21fcfa60221bd7203eda3c82568467 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:14 +0100 Subject: [PATCH 071/190] x86/entry: Convert Divide Error to IDTENTRY Convert #DE to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134904.663914713@linutronix.de --- arch/x86/entry/entry_32.S | 7 ------- arch/x86/entry/entry_64.S | 1 - arch/x86/include/asm/idtentry.h | 12 ++++++++++++ arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 7 ++++++- arch/x86/xen/enlighten_pv.c | 7 ++++++- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 26 insertions(+), 15 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 8c0e07e2860f00..398fd3c5922848 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1377,13 +1377,6 @@ SYM_CODE_START(alignment_check) jmp common_exception SYM_CODE_END(alignment_check) -SYM_CODE_START(divide_error) - ASM_CLAC - pushl $0 # no error code - pushl $do_divide_error - jmp common_exception -SYM_CODE_END(divide_error) - #ifdef CONFIG_X86_MCE SYM_CODE_START(machine_check) ASM_CLAC diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ee071629055747..e18594d2dffd19 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1072,7 +1072,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * Exception entry points. */ -idtentry X86_TRAP_DE divide_error do_divide_error has_error_code=0 idtentry X86_TRAP_OF overflow do_overflow has_error_code=0 idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 idtentry X86_TRAP_BR bounds do_bounds has_error_code=0 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 2adfd80ea2f39a..e6f6e296bfec07 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -67,4 +67,16 @@ static __always_inline void __##func(struct pt_regs *regs) #endif /* __ASSEMBLY__ */ +/* + * The actual entry points. Note that DECLARE_IDTENTRY*() serves two + * purposes: + * - provide the function declarations when included from C-Code + * - emit the ASM stubs when included from entry_32/64.S + * + * This avoids duplicate defines and ensures that everything is consistent. + */ + +/* Simple exception entry points. 
No hardware error code */ +DECLARE_IDTENTRY(X86_TRAP_DE, exc_divide_error); + #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 814273f8b166ac..e68cbf87fe4a09 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -11,7 +11,6 @@ #define dotraplinkage __visible -asmlinkage void divide_error(void); asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); @@ -38,7 +37,6 @@ asmlinkage void machine_check(void); asmlinkage void simd_coprocessor_error(void); #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) -asmlinkage void xen_divide_error(void); asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_int3(void); @@ -62,7 +60,6 @@ asmlinkage void xen_machine_check(void); asmlinkage void xen_simd_coprocessor_error(void); #endif -dotraplinkage void do_divide_error(struct pt_regs *regs, long error_code); dotraplinkage void do_debug(struct pt_regs *regs, long error_code); dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 95609ee4c8b3e1..f2a751b10a01aa 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -73,7 +73,7 @@ static const __initconst struct idt_data early_idts[] = { * set up TSS. */ static const __initconst struct idt_data def_idts[] = { - INTG(X86_TRAP_DE, divide_error), + INTG(X86_TRAP_DE, asm_exc_divide_error), INTG(X86_TRAP_NMI, nmi), INTG(X86_TRAP_BR, bounds), INTG(X86_TRAP_UD, invalid_op), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3857c0fd3306d4..37092f74ec42ac 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -220,6 +220,12 @@ static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) return (void __user *)uprobe_get_trap_addr(regs); } +DEFINE_IDTENTRY(exc_divide_error) +{ + do_error_trap(regs, 0, "divide_error", X86_TRAP_DE, SIGFPE, + FPE_INTDIV, error_get_trap_addr(regs)); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ @@ -227,7 +233,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ } -DO_ERROR(X86_TRAP_DE, SIGFPE, FPE_INTDIV, IP, "divide error", divide_error) DO_ERROR(X86_TRAP_OF, SIGSEGV, 0, NULL, "overflow", overflow) DO_ERROR(X86_TRAP_UD, SIGILL, ILL_ILLOPN, IP, "invalid opcode", invalid_op) DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index c2c97faaf004d9..06c53c94c813e7 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -604,6 +604,11 @@ struct trap_array_entry { bool ist_okay; }; +#define TRAP_ENTRY(func, ist_ok) { \ + .orig = asm_##func, \ + .xen = xen_asm_##func, \ + .ist_okay = ist_ok } + static struct trap_array_entry trap_array[] = { { debug, xen_xendebug, true }, { double_fault, xen_double_fault, true }, @@ -617,7 +622,7 @@ static struct trap_array_entry trap_array[] = { { entry_INT80_compat, xen_entry_INT80_compat, false }, #endif { page_fault, xen_page_fault, false }, - { divide_error, xen_divide_error, false }, + TRAP_ENTRY(exc_divide_error, false ), { bounds, xen_bounds, false }, { invalid_op, xen_invalid_op, false }, { 
device_not_available, xen_device_not_available, false }, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 0a0fd168683a75..48ac67ef094a8e 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -28,7 +28,7 @@ SYM_CODE_END(xen_\name) _ASM_NOKPROBE(xen_\name) .endm -xen_pv_trap divide_error +xen_pv_trap asm_exc_divide_error xen_pv_trap debug xen_pv_trap xendebug xen_pv_trap int3 From 4b6b9111c0b9aa4c3b319f1c5a3b1d5850792167 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:15 +0100 Subject: [PATCH 072/190] x86/entry: Convert Overflow exception to IDTENTRY Convert #OF to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134904.771457898@linutronix.de --- arch/x86/entry/entry_32.S | 7 ------- arch/x86/entry/entry_64.S | 1 - arch/x86/include/asm/idtentry.h | 1 + arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 6 +++++- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 9 insertions(+), 15 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 398fd3c5922848..e5d57f6754ed71 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1325,13 +1325,6 @@ SYM_CODE_START(native_iret) SYM_CODE_END(native_iret) #endif -SYM_CODE_START(overflow) - ASM_CLAC - pushl $0 - pushl $do_overflow - jmp common_exception -SYM_CODE_END(overflow) - SYM_CODE_START(bounds) ASM_CLAC pushl $0 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e18594d2dffd19..eb503b0b0cffe8 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1072,7 +1072,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * Exception entry points. */ -idtentry X86_TRAP_OF overflow do_overflow has_error_code=0 idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 idtentry X86_TRAP_BR bounds do_bounds has_error_code=0 idtentry X86_TRAP_UD invalid_op do_invalid_op has_error_code=0 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index e6f6e296bfec07..54c41b3a292584 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -78,5 +78,6 @@ static __always_inline void __##func(struct pt_regs *regs) /* Simple exception entry points. 
No hardware error code */ DECLARE_IDTENTRY(X86_TRAP_DE, exc_divide_error); +DECLARE_IDTENTRY(X86_TRAP_OF, exc_overflow); #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index e68cbf87fe4a09..1b0452c4704487 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -14,7 +14,6 @@ asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); -asmlinkage void overflow(void); asmlinkage void bounds(void); asmlinkage void invalid_op(void); asmlinkage void device_not_available(void); @@ -40,7 +39,6 @@ asmlinkage void simd_coprocessor_error(void); asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_int3(void); -asmlinkage void xen_overflow(void); asmlinkage void xen_bounds(void); asmlinkage void xen_invalid_op(void); asmlinkage void xen_device_not_available(void); @@ -63,7 +61,6 @@ asmlinkage void xen_simd_coprocessor_error(void); dotraplinkage void do_debug(struct pt_regs *regs, long error_code); dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); -dotraplinkage void do_overflow(struct pt_regs *regs, long error_code); dotraplinkage void do_bounds(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code); dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index f2a751b10a01aa..f8d79629535eb2 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -99,7 +99,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_MC, machine_check), #endif - SYSG(X86_TRAP_OF, overflow), + SYSG(X86_TRAP_OF, asm_exc_overflow), #if defined(CONFIG_IA32_EMULATION) SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), #elif defined(CONFIG_X86_32) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 37092f74ec42ac..b522e2aa9e9afb 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -226,6 +226,11 @@ DEFINE_IDTENTRY(exc_divide_error) FPE_INTDIV, error_get_trap_addr(regs)); } +DEFINE_IDTENTRY(exc_overflow) +{ + do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ @@ -233,7 +238,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ } -DO_ERROR(X86_TRAP_OF, SIGSEGV, 0, NULL, "overflow", overflow) DO_ERROR(X86_TRAP_UD, SIGILL, ILL_ILLOPN, IP, "invalid opcode", invalid_op) DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 06c53c94c813e7..25cf701f916b91 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -617,7 +617,7 @@ static struct trap_array_entry trap_array[] = { #endif { nmi, xen_xennmi, true }, { int3, xen_int3, false }, - { overflow, xen_overflow, false }, + TRAP_ENTRY(exc_overflow, false ), #ifdef CONFIG_IA32_EMULATION { entry_INT80_compat, xen_entry_INT80_compat, false }, #endif diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 48ac67ef094a8e..c9c86a06b25d39 100644 --- a/arch/x86/xen/xen-asm_64.S +++ 
b/arch/x86/xen/xen-asm_64.S @@ -33,7 +33,7 @@ xen_pv_trap debug xen_pv_trap xendebug xen_pv_trap int3 xen_pv_trap xennmi -xen_pv_trap overflow +xen_pv_trap asm_exc_overflow xen_pv_trap bounds xen_pv_trap invalid_op xen_pv_trap device_not_available From 58d9c81facf55dbd1836d114ce360a048e3a0582 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:17 +0100 Subject: [PATCH 073/190] x86/entry: Convert Bounds exception to IDTENTRY Convert #BR to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes - Remove the RCU warning as the new entry macro ensures correctness No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134904.863001309@linutronix.de --- arch/x86/entry/entry_32.S | 7 ------- arch/x86/entry/entry_64.S | 1 - arch/x86/include/asm/idtentry.h | 1 + arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 9 ++++----- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 8 insertions(+), 19 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index e5d57f6754ed71..31a64df56e3d1b 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1325,13 +1325,6 @@ SYM_CODE_START(native_iret) SYM_CODE_END(native_iret) #endif -SYM_CODE_START(bounds) - ASM_CLAC - pushl $0 - pushl $do_bounds - jmp common_exception -SYM_CODE_END(bounds) - SYM_CODE_START(invalid_op) ASM_CLAC pushl $0 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index eb503b0b0cffe8..a3e484da55d1b2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1073,7 +1073,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 -idtentry X86_TRAP_BR bounds do_bounds has_error_code=0 idtentry X86_TRAP_UD invalid_op do_invalid_op has_error_code=0 idtentry X86_TRAP_NM device_not_available do_device_not_available has_error_code=0 idtentry X86_TRAP_OLD_MF coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 54c41b3a292584..d7c160b249817a 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -79,5 +79,6 @@ static __always_inline void __##func(struct pt_regs *regs) /* Simple exception entry points. 
No hardware error code */ DECLARE_IDTENTRY(X86_TRAP_DE, exc_divide_error); DECLARE_IDTENTRY(X86_TRAP_OF, exc_overflow); +DECLARE_IDTENTRY(X86_TRAP_BR, exc_bounds); #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 1b0452c4704487..73fe4eb3157c93 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -14,7 +14,6 @@ asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); -asmlinkage void bounds(void); asmlinkage void invalid_op(void); asmlinkage void device_not_available(void); #ifdef CONFIG_X86_64 @@ -39,7 +38,6 @@ asmlinkage void simd_coprocessor_error(void); asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_int3(void); -asmlinkage void xen_bounds(void); asmlinkage void xen_invalid_op(void); asmlinkage void xen_device_not_available(void); asmlinkage void xen_double_fault(void); @@ -61,7 +59,6 @@ asmlinkage void xen_simd_coprocessor_error(void); dotraplinkage void do_debug(struct pt_regs *regs, long error_code); dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); -dotraplinkage void do_bounds(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code); dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index f8d79629535eb2..87583b69cbc21c 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -75,7 +75,7 @@ static const __initconst struct idt_data early_idts[] = { static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DE, asm_exc_divide_error), INTG(X86_TRAP_NMI, nmi), - INTG(X86_TRAP_BR, bounds), + INTG(X86_TRAP_BR, asm_exc_bounds), INTG(X86_TRAP_UD, invalid_op), INTG(X86_TRAP_NM, device_not_available), INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b522e2aa9e9afb..7a9fb8b9e1a8e8 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -410,18 +410,17 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign panic("Machine halted."); } -dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_bounds) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - if (notify_die(DIE_TRAP, "bounds", regs, error_code, + if (notify_die(DIE_TRAP, "bounds", regs, 0, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; cond_local_irq_enable(regs); if (!user_mode(regs)) - die("bounds", regs, error_code); + die("bounds", regs, 0); - do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL); + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL); cond_local_irq_disable(regs); } diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 25cf701f916b91..29a52233b81df6 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -623,7 +623,7 @@ static struct trap_array_entry trap_array[] = { #endif { page_fault, xen_page_fault, false }, TRAP_ENTRY(exc_divide_error, false ), - { bounds, xen_bounds, false }, + TRAP_ENTRY(exc_bounds, false ), { invalid_op, xen_invalid_op, false }, { device_not_available, xen_device_not_available, false }, { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false }, diff --git a/arch/x86/xen/xen-asm_64.S 
b/arch/x86/xen/xen-asm_64.S index c9c86a06b25d39..2cdbf6a37e169b 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -34,7 +34,7 @@ xen_pv_trap xendebug xen_pv_trap int3 xen_pv_trap xennmi xen_pv_trap asm_exc_overflow -xen_pv_trap bounds +xen_pv_trap asm_exc_bounds xen_pv_trap invalid_op xen_pv_trap device_not_available xen_pv_trap double_fault
From 49893c5cb281f8691dcbe53e6f85a963f47a4b9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:18 +0100 Subject: [PATCH 074/190] x86/entry: Convert Invalid Opcode exception to IDTENTRY Convert #UD to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Fixup the F00F bug call in fault.c - Remove the old prototypes No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134904.955511913@linutronix.de --- arch/x86/entry/entry_32.S | 7 ------- arch/x86/entry/entry_64.S | 1 - arch/x86/include/asm/idtentry.h | 1 + arch/x86/include/asm/traps.h | 8 +++++--- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 16 +++++++++++++++- arch/x86/mm/fault.c | 2 +- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 9 files changed, 25 insertions(+), 16 deletions(-)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 31a64df56e3d1b..95a96020f9e48c 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1325,13 +1325,6 @@ SYM_CODE_START(native_iret) SYM_CODE_END(native_iret) #endif -SYM_CODE_START(invalid_op) - ASM_CLAC - pushl $0 - pushl $do_invalid_op - jmp common_exception -SYM_CODE_END(invalid_op) - SYM_CODE_START(coprocessor_segment_overrun) ASM_CLAC pushl $0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a3e484da55d1b2..ebd5f9fa4f5c82 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1073,7 +1073,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 -idtentry X86_TRAP_UD invalid_op do_invalid_op has_error_code=0 idtentry X86_TRAP_NM device_not_available do_device_not_available has_error_code=0 idtentry X86_TRAP_OLD_MF coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 idtentry X86_TRAP_TS invalid_TSS do_invalid_TSS has_error_code=1
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index d7c160b249817a..f34630f098646a 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -80,5 +80,6 @@ DECLARE_IDTENTRY(X86_TRAP_DE, exc_divide_error); DECLARE_IDTENTRY(X86_TRAP_OF, exc_overflow); DECLARE_IDTENTRY(X86_TRAP_BR, exc_bounds); +DECLARE_IDTENTRY(X86_TRAP_UD, exc_invalid_op); #endif
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 73fe4eb3157c93..71a4a7e6ba8985 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -14,7 +14,6 @@ asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); -asmlinkage void invalid_op(void); asmlinkage void device_not_available(void); #ifdef CONFIG_X86_64 asmlinkage void double_fault(void); @@ -38,7 +37,6 @@ asmlinkage void simd_coprocessor_error(void); asmlinkage void xen_xennmi(void); asmlinkage void
xen_xendebug(void); asmlinkage void xen_int3(void); -asmlinkage void xen_invalid_op(void); asmlinkage void xen_device_not_available(void); asmlinkage void xen_double_fault(void); asmlinkage void xen_coprocessor_segment_overrun(void); @@ -59,7 +57,6 @@ asmlinkage void xen_simd_coprocessor_error(void); dotraplinkage void do_debug(struct pt_regs *regs, long error_code); dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); -dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code); dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code); @@ -84,6 +81,11 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s); void __init trap_init(void); #endif +#ifdef CONFIG_X86_F00F_BUG +/* For handling the F00F bug */ +void handle_invalid_op(struct pt_regs *regs); +#endif + static inline int get_si_code(unsigned long condition) { if (condition & DR_STEP)
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 87583b69cbc21c..8b48f54aeff61b 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -76,7 +76,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DE, asm_exc_divide_error), INTG(X86_TRAP_NMI, nmi), INTG(X86_TRAP_BR, asm_exc_bounds), - INTG(X86_TRAP_UD, invalid_op), + INTG(X86_TRAP_UD, asm_exc_invalid_op), INTG(X86_TRAP_NM, device_not_available), INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), INTG(X86_TRAP_TS, invalid_TSS),
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7a9fb8b9e1a8e8..71ac43dc036a17 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -231,6 +231,21 @@ DEFINE_IDTENTRY(exc_overflow) do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); } +#ifdef CONFIG_X86_F00F_BUG +void handle_invalid_op(struct pt_regs *regs) +#else +static inline void handle_invalid_op(struct pt_regs *regs) +#endif +{ + do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL, + ILL_ILLOPN, error_get_trap_addr(regs)); +} + +DEFINE_IDTENTRY(exc_invalid_op) +{ + handle_invalid_op(regs); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ @@ -238,7 +253,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ } -DO_ERROR(X86_TRAP_UD, SIGILL, ILL_ILLOPN, IP, "invalid opcode", invalid_op) DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 53db18615f31c5..d7b52a2a1bce49 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -426,7 +426,7 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) nr = (address - idt_descr.address) >> 3; if (nr == 6) { - do_invalid_op(regs, 0); + handle_invalid_op(regs); return 1; } }
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 29a52233b81df6..5a0e60986e1964 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -624,7 +624,7
@@ static struct trap_array_entry trap_array[] = { { page_fault, xen_page_fault, false }, TRAP_ENTRY(exc_divide_error, false ), TRAP_ENTRY(exc_bounds, false ), - { invalid_op, xen_invalid_op, false }, + TRAP_ENTRY(exc_invalid_op, false ), { device_not_available, xen_device_not_available, false }, { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false }, { invalid_TSS, xen_invalid_TSS, false }, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 2cdbf6a37e169b..999f09e8bb09e6 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -35,7 +35,7 @@ xen_pv_trap int3 xen_pv_trap xennmi xen_pv_trap asm_exc_overflow xen_pv_trap asm_exc_bounds -xen_pv_trap invalid_op +xen_pv_trap asm_exc_invalid_op xen_pv_trap device_not_available xen_pv_trap double_fault xen_pv_trap coprocessor_segment_overrun From 866ae2ccee4ac092fea14f18d537205e14c5a904 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:19 +0100 Subject: [PATCH 075/190] x86/entry: Convert Device not available exception to IDTENTRY Convert #NM to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes - Remove the RCU warning as the new entry macro ensures correctness No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134905.056243863@linutronix.de --- arch/x86/entry/entry_32.S | 7 ------- arch/x86/entry/entry_64.S | 1 - arch/x86/include/asm/idtentry.h | 1 + arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 8 ++------ arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 6 insertions(+), 20 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 95a96020f9e48c..7d7f2836c0716a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1311,13 +1311,6 @@ SYM_CODE_START(simd_coprocessor_error) jmp common_exception SYM_CODE_END(simd_coprocessor_error) -SYM_CODE_START(device_not_available) - ASM_CLAC - pushl $0 - pushl $do_device_not_available - jmp common_exception -SYM_CODE_END(device_not_available) - #ifdef CONFIG_PARAVIRT SYM_CODE_START(native_iret) iret diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ebd5f9fa4f5c82..d7cf00026d3317 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1073,7 +1073,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 -idtentry X86_TRAP_NM device_not_available do_device_not_available has_error_code=0 idtentry X86_TRAP_OLD_MF coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 idtentry X86_TRAP_TS invalid_TSS do_invalid_TSS has_error_code=1 idtentry X86_TRAP_NP segment_not_present do_segment_not_present has_error_code=1 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index f34630f098646a..fd6f996ce5845c 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -81,5 +81,6 @@ DECLARE_IDTENTRY(X86_TRAP_DE, exc_divide_error); DECLARE_IDTENTRY(X86_TRAP_OF, exc_overflow); DECLARE_IDTENTRY(X86_TRAP_BR, exc_bounds); DECLARE_IDTENTRY(X86_TRAP_UD, exc_invalid_op); +DECLARE_IDTENTRY(X86_TRAP_NM, exc_device_not_available); #endif 
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 71a4a7e6ba8985..e5f2c90c0e8bcb 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -14,7 +14,6 @@ asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); -asmlinkage void device_not_available(void); #ifdef CONFIG_X86_64 asmlinkage void double_fault(void); #endif @@ -37,7 +36,6 @@ asmlinkage void simd_coprocessor_error(void); asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_int3(void); -asmlinkage void xen_device_not_available(void); asmlinkage void xen_double_fault(void); asmlinkage void xen_coprocessor_segment_overrun(void); asmlinkage void xen_invalid_TSS(void); @@ -57,7 +55,6 @@ asmlinkage void xen_simd_coprocessor_error(void); dotraplinkage void do_debug(struct pt_regs *regs, long error_code); dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); -dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 8b48f54aeff61b..cdc2d8bbd338e3 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -77,7 +77,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_NMI, nmi), INTG(X86_TRAP_BR, asm_exc_bounds), INTG(X86_TRAP_UD, asm_exc_invalid_op), - INTG(X86_TRAP_NM, device_not_available), + INTG(X86_TRAP_NM, asm_exc_device_not_available), INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), INTG(X86_TRAP_TS, invalid_TSS), INTG(X86_TRAP_NP, segment_not_present), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 71ac43dc036a17..b8af5eb6a92911 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -882,13 +882,10 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) */ } -dotraplinkage void -do_device_not_available(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_device_not_available) { unsigned long cr0 = read_cr0(); - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; @@ -913,10 +910,9 @@ do_device_not_available(struct pt_regs *regs, long error_code) * to kill the task than getting stuck in a never-ending * loop of #NM faults. 
*/ - die("unexpected #NM exception", regs, error_code); + die("unexpected #NM exception", regs, 0); } } -NOKPROBE_SYMBOL(do_device_not_available); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5a0e60986e1964..94e2f918403ec9 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -625,7 +625,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_divide_error, false ), TRAP_ENTRY(exc_bounds, false ), TRAP_ENTRY(exc_invalid_op, false ), - { device_not_available, xen_device_not_available, false }, + TRAP_ENTRY(exc_device_not_available, false ), { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false }, { invalid_TSS, xen_invalid_TSS, false }, { segment_not_present, xen_segment_not_present, false }, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 999f09e8bb09e6..5215e5724cfb61 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -36,7 +36,7 @@ xen_pv_trap xennmi xen_pv_trap asm_exc_overflow xen_pv_trap asm_exc_bounds xen_pv_trap asm_exc_invalid_op -xen_pv_trap device_not_available +xen_pv_trap asm_exc_device_not_available xen_pv_trap double_fault xen_pv_trap coprocessor_segment_overrun xen_pv_trap invalid_TSS From f95658fdb575233f79e3e7ed7ecf990389d31319 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:20 +0100 Subject: [PATCH 076/190] x86/entry: Convert Coprocessor segment overrun exception to IDTENTRY Convert #OLD_MF to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change. 
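Schematically, the conversion trades one line in the DO_ERROR() macro maze for a distinct, readable entry point; the hunks below contain the actual change:

  /* Before */
  DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL,
           "coprocessor segment overrun", coprocessor_segment_overrun)

  /* After */
  DEFINE_IDTENTRY(exc_coproc_segment_overrun)
  {
          do_error_trap(regs, 0, "coprocessor segment overrun",
                        X86_TRAP_OLD_MF, SIGFPE, 0, NULL);
  }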
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134905.838823510@linutronix.de --- arch/x86/entry/entry_32.S | 7 ------- arch/x86/entry/entry_64.S | 1 - arch/x86/include/asm/idtentry.h | 1 + arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 7 ++++++- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 10 insertions(+), 15 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 7d7f2836c0716a..916ce827f606c7 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1318,13 +1318,6 @@ SYM_CODE_START(native_iret) SYM_CODE_END(native_iret) #endif -SYM_CODE_START(coprocessor_segment_overrun) - ASM_CLAC - pushl $0 - pushl $do_coprocessor_segment_overrun - jmp common_exception -SYM_CODE_END(coprocessor_segment_overrun) - SYM_CODE_START(invalid_TSS) ASM_CLAC pushl $do_invalid_TSS diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d7cf00026d3317..423c3a9783af31 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1073,7 +1073,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 -idtentry X86_TRAP_OLD_MF coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 idtentry X86_TRAP_TS invalid_TSS do_invalid_TSS has_error_code=1 idtentry X86_TRAP_NP segment_not_present do_segment_not_present has_error_code=1 idtentry X86_TRAP_SS stack_segment do_stack_segment has_error_code=1 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index fd6f996ce5845c..0c3374012a5024 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -82,5 +82,6 @@ DECLARE_IDTENTRY(X86_TRAP_OF, exc_overflow); DECLARE_IDTENTRY(X86_TRAP_BR, exc_bounds); DECLARE_IDTENTRY(X86_TRAP_UD, exc_invalid_op); DECLARE_IDTENTRY(X86_TRAP_NM, exc_device_not_available); +DECLARE_IDTENTRY(X86_TRAP_OLD_MF, exc_coproc_segment_overrun); #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index e5f2c90c0e8bcb..5e9d402e6d6c83 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -17,7 +17,6 @@ asmlinkage void int3(void); #ifdef CONFIG_X86_64 asmlinkage void double_fault(void); #endif -asmlinkage void coprocessor_segment_overrun(void); asmlinkage void invalid_TSS(void); asmlinkage void segment_not_present(void); asmlinkage void stack_segment(void); @@ -37,7 +36,6 @@ asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_int3(void); asmlinkage void xen_double_fault(void); -asmlinkage void xen_coprocessor_segment_overrun(void); asmlinkage void xen_invalid_TSS(void); asmlinkage void xen_segment_not_present(void); asmlinkage void xen_stack_segment(void); @@ -56,7 +54,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code); dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); -dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code); dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code); dotraplinkage void do_stack_segment(struct pt_regs *regs, 
long error_code); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index cdc2d8bbd338e3..758d325103e8da 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -78,7 +78,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_BR, asm_exc_bounds), INTG(X86_TRAP_UD, asm_exc_invalid_op), INTG(X86_TRAP_NM, asm_exc_device_not_available), - INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), + INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun), INTG(X86_TRAP_TS, invalid_TSS), INTG(X86_TRAP_NP, segment_not_present), INTG(X86_TRAP_SS, stack_segment), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b8af5eb6a92911..3ce1f667d0781c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -246,6 +246,12 @@ DEFINE_IDTENTRY(exc_invalid_op) handle_invalid_op(regs); } +DEFINE_IDTENTRY(exc_coproc_segment_overrun) +{ + do_error_trap(regs, 0, "coprocessor segment overrun", + X86_TRAP_OLD_MF, SIGFPE, 0, NULL); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ @@ -253,7 +259,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ } -DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 94e2f918403ec9..027cc3d9ad5215 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -626,7 +626,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_bounds, false ), TRAP_ENTRY(exc_invalid_op, false ), TRAP_ENTRY(exc_device_not_available, false ), - { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false }, + TRAP_ENTRY(exc_coproc_segment_overrun, false ), { invalid_TSS, xen_invalid_TSS, false }, { segment_not_present, xen_segment_not_present, false }, { stack_segment, xen_stack_segment, false }, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 5215e5724cfb61..7ac9c269b994b4 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -38,7 +38,7 @@ xen_pv_trap asm_exc_bounds xen_pv_trap asm_exc_invalid_op xen_pv_trap asm_exc_device_not_available xen_pv_trap double_fault -xen_pv_trap coprocessor_segment_overrun +xen_pv_trap asm_exc_coproc_segment_overrun xen_pv_trap invalid_TSS xen_pv_trap segment_not_present xen_pv_trap stack_segment From aabfe5383ec7b480ca222ac05d49eb3c83dc022a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:21 +0100 Subject: [PATCH 077/190] x86/idtentry: Provide IDTENTRY_ERRORCODE Same as IDTENTRY but the C entry point has an error code argument. 
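A handler built on top of it would look roughly like this (hypothetical vector and handler names, for illustration only):

  DEFINE_IDTENTRY_ERRORCODE(exc_example)
  {
          /*
           * error_code is the hardware pushed value handed through
           * by the ASM stub. The body is wrapped in idtentry_enter()
           * and idtentry_exit() exactly like the plain IDTENTRY
           * variant.
           */
          do_error_trap(regs, error_code, "example", X86_TRAP_XX,
                        SIGSEGV, 0, NULL);
  }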
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134905.258989060@linutronix.de --- arch/x86/include/asm/idtentry.h | 46 +++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+)
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 0c3374012a5024..f35d5c97ddfb54 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -57,6 +57,49 @@ __visible noinstr void func(struct pt_regs *regs) \ \ static __always_inline void __##func(struct pt_regs *regs) +/** + * DECLARE_IDTENTRY_ERRORCODE - Declare functions for simple IDT entry points + * Error code pushed by hardware + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Declares three functions: + * - The ASM entry point: asm_##func + * - The XEN PV trap entry point: xen_asm_##func (maybe unused) + * - The C handler called from the ASM entry point + * + * Same as DECLARE_IDTENTRY, but has an extra error_code argument for the + * C-handler. + */ +#define DECLARE_IDTENTRY_ERRORCODE(vector, func) \ + asmlinkage void asm_##func(void); \ + asmlinkage void xen_asm_##func(void); \ + __visible void func(struct pt_regs *regs, unsigned long error_code) + +/** + * DEFINE_IDTENTRY_ERRORCODE - Emit code for simple IDT entry points + * Error code pushed by hardware + * @func: Function name of the entry point + * + * Same as DEFINE_IDTENTRY, but has an extra error_code argument + */ +#define DEFINE_IDTENTRY_ERRORCODE(func) \ +static __always_inline void __##func(struct pt_regs *regs, \ + unsigned long error_code); \ + \ +__visible noinstr void func(struct pt_regs *regs, \ + unsigned long error_code) \ +{ \ + idtentry_enter(regs); \ + instrumentation_begin(); \ + __##func (regs, error_code); \ + instrumentation_end(); \ + idtentry_exit(regs); \ +} \ + \ +static __always_inline void __##func(struct pt_regs *regs, \ + unsigned long error_code) + #else /* !__ASSEMBLY__ */ /* @@ -65,6 +108,9 @@ static __always_inline void __##func(struct pt_regs *regs) #define DECLARE_IDTENTRY(vector, func) \ idtentry vector asm_##func func has_error_code=0 sane=1 +#define DECLARE_IDTENTRY_ERRORCODE(vector, func) \ + idtentry vector asm_##func func has_error_code=1 sane=1 + #endif /* __ASSEMBLY__ */ /*
From 97b3d290b865cf9115f7d37d40b7482efba4d46d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:22 +0100 Subject: [PATCH 078/190] x86/entry: Convert Invalid TSS exception to IDTENTRY Convert #TS to IDTENTRY_ERRORCODE: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change.
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134905.350676449@linutronix.de --- arch/x86/entry/entry_32.S | 6 ------ arch/x86/entry/entry_64.S | 1 - arch/x86/include/asm/idtentry.h | 3 +++ arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 7 ++++++- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 12 insertions(+), 14 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 916ce827f606c7..2143a6208afc1c 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1318,12 +1318,6 @@ SYM_CODE_START(native_iret) SYM_CODE_END(native_iret) #endif -SYM_CODE_START(invalid_TSS) - ASM_CLAC - pushl $do_invalid_TSS - jmp common_exception -SYM_CODE_END(invalid_TSS) - SYM_CODE_START(segment_not_present) ASM_CLAC pushl $do_segment_not_present diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 423c3a9783af31..07307cf08c2d38 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1073,7 +1073,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 -idtentry X86_TRAP_TS invalid_TSS do_invalid_TSS has_error_code=1 idtentry X86_TRAP_NP segment_not_present do_segment_not_present has_error_code=1 idtentry X86_TRAP_SS stack_segment do_stack_segment has_error_code=1 idtentry X86_TRAP_GP general_protection do_general_protection has_error_code=1 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index f35d5c97ddfb54..aa0d4656684fd4 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -130,4 +130,7 @@ DECLARE_IDTENTRY(X86_TRAP_UD, exc_invalid_op); DECLARE_IDTENTRY(X86_TRAP_NM, exc_device_not_available); DECLARE_IDTENTRY(X86_TRAP_OLD_MF, exc_coproc_segment_overrun); +/* Simple exception entries with error code pushed by hardware */ +DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_TS, exc_invalid_tss); + #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 5e9d402e6d6c83..30bc589e9f5af6 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -17,7 +17,6 @@ asmlinkage void int3(void); #ifdef CONFIG_X86_64 asmlinkage void double_fault(void); #endif -asmlinkage void invalid_TSS(void); asmlinkage void segment_not_present(void); asmlinkage void stack_segment(void); asmlinkage void general_protection(void); @@ -36,7 +35,6 @@ asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_int3(void); asmlinkage void xen_double_fault(void); -asmlinkage void xen_invalid_TSS(void); asmlinkage void xen_segment_not_present(void); asmlinkage void xen_stack_segment(void); asmlinkage void xen_general_protection(void); @@ -54,7 +52,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code); dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); -dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code); dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code); dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code); dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code); diff --git 
a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 758d325103e8da..caa740df1404d7 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -79,7 +79,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_UD, asm_exc_invalid_op), INTG(X86_TRAP_NM, asm_exc_device_not_available), INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun), - INTG(X86_TRAP_TS, invalid_TSS), + INTG(X86_TRAP_TS, asm_exc_invalid_tss), INTG(X86_TRAP_NP, segment_not_present), INTG(X86_TRAP_SS, stack_segment), INTG(X86_TRAP_GP, general_protection), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3ce1f667d0781c..10ab0837668fba 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -252,6 +252,12 @@ DEFINE_IDTENTRY(exc_coproc_segment_overrun) X86_TRAP_OLD_MF, SIGFPE, 0, NULL); } +DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss) +{ + do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV, + 0, NULL); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ @@ -259,7 +265,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ } -DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment) #undef IP diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 027cc3d9ad5215..650ce235585c59 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -627,7 +627,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_invalid_op, false ), TRAP_ENTRY(exc_device_not_available, false ), TRAP_ENTRY(exc_coproc_segment_overrun, false ), - { invalid_TSS, xen_invalid_TSS, false }, + TRAP_ENTRY(exc_invalid_tss, false ), { segment_not_present, xen_segment_not_present, false }, { stack_segment, xen_stack_segment, false }, { general_protection, xen_general_protection, false }, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 7ac9c269b994b4..f7a890c95db782 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -39,7 +39,7 @@ xen_pv_trap asm_exc_invalid_op xen_pv_trap asm_exc_device_not_available xen_pv_trap double_fault xen_pv_trap asm_exc_coproc_segment_overrun -xen_pv_trap invalid_TSS +xen_pv_trap asm_exc_invalid_tss xen_pv_trap segment_not_present xen_pv_trap stack_segment xen_pv_trap general_protection From 99a3fb8d01af1085f16a417a748e0a462dc92d29 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:23 +0100 Subject: [PATCH 079/190] x86/entry: Convert Segment not present exception to IDTENTRY Convert #NP to IDTENTRY_ERRORCODE: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change. 
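For reference, the Xen table update uses the TRAP_ENTRY() helper introduced with the #DE conversion, which expands the new naming convention roughly like this:

  TRAP_ENTRY(exc_segment_not_present, false )

  /* expands to:
   * { .orig     = asm_exc_segment_not_present,
   *   .xen      = xen_asm_exc_segment_not_present,
   *   .ist_okay = false },
   */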
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134905.443591450@linutronix.de --- arch/x86/entry/entry_32.S | 6 ------ arch/x86/entry/entry_64.S | 1 - arch/x86/include/asm/idtentry.h | 1 + arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 7 ++++++- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 10 insertions(+), 14 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2143a6208afc1c..b01dbb3f616b78 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1318,12 +1318,6 @@ SYM_CODE_START(native_iret) SYM_CODE_END(native_iret) #endif -SYM_CODE_START(segment_not_present) - ASM_CLAC - pushl $do_segment_not_present - jmp common_exception -SYM_CODE_END(segment_not_present) - SYM_CODE_START(stack_segment) ASM_CLAC pushl $do_stack_segment diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 07307cf08c2d38..367a207603a63a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1073,7 +1073,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 -idtentry X86_TRAP_NP segment_not_present do_segment_not_present has_error_code=1 idtentry X86_TRAP_SS stack_segment do_stack_segment has_error_code=1 idtentry X86_TRAP_GP general_protection do_general_protection has_error_code=1 idtentry X86_TRAP_SPURIOUS spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index aa0d4656684fd4..d517c09e8d0b81 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -132,5 +132,6 @@ DECLARE_IDTENTRY(X86_TRAP_OLD_MF, exc_coproc_segment_overrun); /* Simple exception entries with error code pushed by hardware */ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_TS, exc_invalid_tss); +DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_NP, exc_segment_not_present); #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 30bc589e9f5af6..970c88894fc4a6 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -17,7 +17,6 @@ asmlinkage void int3(void); #ifdef CONFIG_X86_64 asmlinkage void double_fault(void); #endif -asmlinkage void segment_not_present(void); asmlinkage void stack_segment(void); asmlinkage void general_protection(void); asmlinkage void page_fault(void); @@ -35,7 +34,6 @@ asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_int3(void); asmlinkage void xen_double_fault(void); -asmlinkage void xen_segment_not_present(void); asmlinkage void xen_stack_segment(void); asmlinkage void xen_general_protection(void); asmlinkage void xen_page_fault(void); @@ -52,7 +50,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code); dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); -dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code); dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code); dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code); dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); diff 
--git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index caa740df1404d7..b9acc7f5684aa5 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -80,7 +80,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_NM, asm_exc_device_not_available), INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun), INTG(X86_TRAP_TS, asm_exc_invalid_tss), - INTG(X86_TRAP_NP, segment_not_present), + INTG(X86_TRAP_NP, asm_exc_segment_not_present), INTG(X86_TRAP_SS, stack_segment), INTG(X86_TRAP_GP, general_protection), INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 10ab0837668fba..88ba5f0400fd35 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -258,6 +258,12 @@ DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss) 0, NULL); } +DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present) +{ + do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP, + SIGBUS, 0, NULL); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ @@ -265,7 +271,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ } -DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment) #undef IP diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 650ce235585c59..4e2a41ebc369a9 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -628,7 +628,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_device_not_available, false ), TRAP_ENTRY(exc_coproc_segment_overrun, false ), TRAP_ENTRY(exc_invalid_tss, false ), - { segment_not_present, xen_segment_not_present, false }, + TRAP_ENTRY(exc_segment_not_present, false ), { stack_segment, xen_stack_segment, false }, { general_protection, xen_general_protection, false }, { spurious_interrupt_bug, xen_spurious_interrupt_bug, false }, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index f7a890c95db782..c8ce7ad7120285 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -40,7 +40,7 @@ xen_pv_trap asm_exc_device_not_available xen_pv_trap double_fault xen_pv_trap asm_exc_coproc_segment_overrun xen_pv_trap asm_exc_invalid_tss -xen_pv_trap segment_not_present +xen_pv_trap asm_exc_segment_not_present xen_pv_trap stack_segment xen_pv_trap general_protection xen_pv_trap page_fault From fd9689bf91131c4bea5ea54f828af5267f5ed6a0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:24 +0100 Subject: [PATCH 080/190] x86/entry: Convert Stack segment exception to IDTENTRY Convert #SS to IDTENTRY_ERRORCODE: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change. 
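For readers tracking the mechanics: the DECLARE/DEFINE pair used by these conversions expands roughly as sketched below. This is an illustrative approximation, not the literal preprocessor output; the idtentry_enter/exit() helper names come from the idtentry machinery referenced later in this series, and the instrumentation markers are an assumption about the surrounding plumbing.

  /*
   * Approximate expansion of DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment).
   * The { ... } body written after the macro becomes __exc_stack_segment();
   * the wrapper establishes RCU/lockdep state before the body runs, which
   * is why the open coded checks in the old do_*() handlers can go.
   */
  static __always_inline void __exc_stack_segment(struct pt_regs *regs,
						  unsigned long error_code);

  __visible noinstr void exc_stack_segment(struct pt_regs *regs,
					   unsigned long error_code)
  {
	idtentry_enter(regs);
	instrumentation_begin();
	__exc_stack_segment(regs, error_code);
	instrumentation_end();
	idtentry_exit(regs);
  }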
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134905.539867572@linutronix.de --- arch/x86/entry/entry_32.S | 6 ------ arch/x86/entry/entry_64.S | 1 - arch/x86/include/asm/idtentry.h | 1 + arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 12 ++++-------- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 8 insertions(+), 21 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b01dbb3f616b78..ffe43d2f8fe908 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1318,12 +1318,6 @@ SYM_CODE_START(native_iret) SYM_CODE_END(native_iret) #endif -SYM_CODE_START(stack_segment) - ASM_CLAC - pushl $do_stack_segment - jmp common_exception -SYM_CODE_END(stack_segment) - SYM_CODE_START(alignment_check) ASM_CLAC pushl $do_alignment_check diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 367a207603a63a..c92592de337072 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1073,7 +1073,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 -idtentry X86_TRAP_SS stack_segment do_stack_segment has_error_code=1 idtentry X86_TRAP_GP general_protection do_general_protection has_error_code=1 idtentry X86_TRAP_SPURIOUS spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 idtentry X86_TRAP_MF coprocessor_error do_coprocessor_error has_error_code=0 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index d517c09e8d0b81..4c0abd31db5b1b 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -133,5 +133,6 @@ DECLARE_IDTENTRY(X86_TRAP_OLD_MF, exc_coproc_segment_overrun); /* Simple exception entries with error code pushed by hardware */ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_TS, exc_invalid_tss); DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_NP, exc_segment_not_present); +DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_SS, exc_stack_segment); #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 970c88894fc4a6..5e580ff1dcf98d 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -17,7 +17,6 @@ asmlinkage void int3(void); #ifdef CONFIG_X86_64 asmlinkage void double_fault(void); #endif -asmlinkage void stack_segment(void); asmlinkage void general_protection(void); asmlinkage void page_fault(void); asmlinkage void async_page_fault(void); @@ -34,7 +33,6 @@ asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_int3(void); asmlinkage void xen_double_fault(void); -asmlinkage void xen_stack_segment(void); asmlinkage void xen_general_protection(void); asmlinkage void xen_page_fault(void); asmlinkage void xen_spurious_interrupt_bug(void); @@ -50,7 +48,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code); dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); -dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code); dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code); dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); dotraplinkage void 
do_spurious_interrupt_bug(struct pt_regs *regs, long error_code); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index b9acc7f5684aa5..8d95cbf56624aa 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -81,7 +81,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun), INTG(X86_TRAP_TS, asm_exc_invalid_tss), INTG(X86_TRAP_NP, asm_exc_segment_not_present), - INTG(X86_TRAP_SS, stack_segment), + INTG(X86_TRAP_SS, asm_exc_stack_segment), INTG(X86_TRAP_GP, general_protection), INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), INTG(X86_TRAP_MF, coprocessor_error), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 88ba5f0400fd35..3dfdc4d3de8741 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -264,16 +264,12 @@ DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present) SIGBUS, 0, NULL); } -#define IP ((void __user *)uprobe_get_trap_addr(regs)) -#define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ +DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment) +{ + do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS, + 0, NULL); } -DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment) -#undef IP - dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code) { char *str = "alignment check"; diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 4e2a41ebc369a9..8290f39d28391a 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -629,7 +629,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_coproc_segment_overrun, false ), TRAP_ENTRY(exc_invalid_tss, false ), TRAP_ENTRY(exc_segment_not_present, false ), - { stack_segment, xen_stack_segment, false }, + TRAP_ENTRY(exc_stack_segment, false ), { general_protection, xen_general_protection, false }, { spurious_interrupt_bug, xen_spurious_interrupt_bug, false }, { coprocessor_error, xen_coprocessor_error, false }, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index c8ce7ad7120285..0ecc0559f65776 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -41,7 +41,7 @@ xen_pv_trap double_fault xen_pv_trap asm_exc_coproc_segment_overrun xen_pv_trap asm_exc_invalid_tss xen_pv_trap asm_exc_segment_not_present -xen_pv_trap stack_segment +xen_pv_trap asm_exc_stack_segment xen_pv_trap general_protection xen_pv_trap page_fault xen_pv_trap spurious_interrupt_bug From be4c11afbb6d5317274e61fda0edf744080fb72b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:25 +0100 Subject: [PATCH 081/190] x86/entry: Convert General protection exception to IDTENTRY Convert #GP to IDTENTRY_ERRORCODE: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes - Remove the RCU warning as the new entry macro ensures correctness No functional change. 
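The XEN/PV fixup in this and the surrounding patches replaces an open coded { native, xen, ist } triple with the TRAP_ENTRY() macro. Its definition is not visible in this excerpt; presumably it derives both entry points from the IDTENTRY name, along these lines:

  /*
   * Assumed shape of TRAP_ENTRY() in arch/x86/xen/enlighten_pv.c, inferred
   * from the asm_exc_* stubs and the xen_pv_trap lines in these hunks.
   * With it, TRAP_ENTRY(exc_general_protection, false) stands in for the
   * removed { general_protection, xen_general_protection, false }.
   */
  #define TRAP_ENTRY(func, ist_ok) {		\
	.orig		= asm_##func,		\
	.xen		= xen_asm_##func,	\
	.ist_okay	= ist_ok }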
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134905.637269946@linutronix.de --- arch/x86/entry/entry_32.S | 8 +------- arch/x86/entry/entry_64.S | 3 +-- arch/x86/include/asm/idtentry.h | 1 + arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 8 +++----- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 9 insertions(+), 20 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index ffe43d2f8fe908..9d94a036bf355b 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1302,7 +1302,7 @@ SYM_CODE_START(simd_coprocessor_error) pushl $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ - ALTERNATIVE "pushl $do_general_protection", \ + ALTERNATIVE "pushl $exc_general_protection", \ "pushl $do_simd_coprocessor_error", \ X86_FEATURE_XMM #else @@ -1690,12 +1690,6 @@ SYM_CODE_START(int3) jmp common_exception SYM_CODE_END(int3) -SYM_CODE_START(general_protection) - ASM_CLAC - pushl $do_general_protection - jmp common_exception -SYM_CODE_END(general_protection) - .pushsection .text, "ax" SYM_CODE_START(rewind_stack_do_exit) /* Prevent any naive code from trying to unwind to our caller. */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index c92592de337072..5cecdd18f54857 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1073,7 +1073,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 -idtentry X86_TRAP_GP general_protection do_general_protection has_error_code=1 idtentry X86_TRAP_SPURIOUS spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 idtentry X86_TRAP_MF coprocessor_error do_coprocessor_error has_error_code=0 idtentry X86_TRAP_AC alignment_check do_alignment_check has_error_code=1 @@ -1209,7 +1208,7 @@ SYM_CODE_START(xen_failsafe_callback) addq $0x30, %rsp pushq $0 /* RIP */ UNWIND_HINT_IRET_REGS offset=8 - jmp general_protection + jmp asm_exc_general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. 
*/ movq (%rsp), %rcx movq 8(%rsp), %r11 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 4c0abd31db5b1b..986fc655d2abf2 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -134,5 +134,6 @@ DECLARE_IDTENTRY(X86_TRAP_OLD_MF, exc_coproc_segment_overrun); DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_TS, exc_invalid_tss); DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_NP, exc_segment_not_present); DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_SS, exc_stack_segment); +DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP, exc_general_protection); #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 5e580ff1dcf98d..3a096a49b3431c 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -17,7 +17,6 @@ asmlinkage void int3(void); #ifdef CONFIG_X86_64 asmlinkage void double_fault(void); #endif -asmlinkage void general_protection(void); asmlinkage void page_fault(void); asmlinkage void async_page_fault(void); asmlinkage void spurious_interrupt_bug(void); @@ -33,7 +32,6 @@ asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_int3(void); asmlinkage void xen_double_fault(void); -asmlinkage void xen_general_protection(void); asmlinkage void xen_page_fault(void); asmlinkage void xen_spurious_interrupt_bug(void); asmlinkage void xen_coprocessor_error(void); @@ -48,7 +46,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code); dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); -dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code); dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code); dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 8d95cbf56624aa..6f0af12f08c869 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -82,7 +82,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_TS, asm_exc_invalid_tss), INTG(X86_TRAP_NP, asm_exc_segment_not_present), INTG(X86_TRAP_SS, asm_exc_stack_segment), - INTG(X86_TRAP_GP, general_protection), + INTG(X86_TRAP_GP, asm_exc_general_protection), INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), INTG(X86_TRAP_MF, coprocessor_error), INTG(X86_TRAP_AC, alignment_check), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3dfdc4d3de8741..e65c7612ecf324 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -145,7 +145,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, * process no chance to handle the signal and notice the * kernel fault information, so that won't result in polluting * the information about previously queued, but not yet - * delivered, faults. See also do_general_protection below. + * delivered, faults. See also exc_general_protection below. */ tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; @@ -375,7 +375,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * which is what the stub expects, given that the faulting * RIP will be the IRET instruction. 
*/ - regs->ip = (unsigned long)general_protection; + regs->ip = (unsigned long)asm_exc_general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; return; @@ -494,7 +494,7 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, #define GPFSTR "general protection fault" -dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; @@ -502,7 +502,6 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) unsigned long gp_addr; int ret; - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); cond_local_irq_enable(regs); if (static_cpu_has(X86_FEATURE_UMIP)) { @@ -570,7 +569,6 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) exit: cond_local_irq_disable(regs); } -NOKPROBE_SYMBOL(do_general_protection); dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 8290f39d28391a..ca441c7c3222b6 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -630,7 +630,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_invalid_tss, false ), TRAP_ENTRY(exc_segment_not_present, false ), TRAP_ENTRY(exc_stack_segment, false ), - { general_protection, xen_general_protection, false }, + TRAP_ENTRY(exc_general_protection, false ), { spurious_interrupt_bug, xen_spurious_interrupt_bug, false }, { coprocessor_error, xen_coprocessor_error, false }, { alignment_check, xen_alignment_check, false }, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 0ecc0559f65776..802ec00b2f9cd2 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -42,7 +42,7 @@ xen_pv_trap asm_exc_coproc_segment_overrun xen_pv_trap asm_exc_invalid_tss xen_pv_trap asm_exc_segment_not_present xen_pv_trap asm_exc_stack_segment -xen_pv_trap general_protection +xen_pv_trap asm_exc_general_protection xen_pv_trap page_fault xen_pv_trap spurious_interrupt_bug xen_pv_trap coprocessor_error From dad7106f8194df1b096666c5499ef732497ddb15 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:26 +0100 Subject: [PATCH 082/190] x86/entry: Convert Spurious interrupt bug exception to IDTENTRY Convert #SPURIOUS to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change.
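For contrast with the error-code conversions above: #SPURIOUS pushes no hardware error code, so the plain DECLARE/DEFINE_IDTENTRY() flavor is used. The generated signatures differ only in the error_code argument (a sketch, matching the declarations collected in idtentry.h):

  /* No hardware error code: the C body sees only pt_regs */
  DECLARE_IDTENTRY(X86_TRAP_SPURIOUS, exc_spurious_interrupt_bug);
  /* emits: asm_exc_spurious_interrupt_bug() stub,
   *        exc_spurious_interrupt_bug(struct pt_regs *regs) */

  /* Hardware error code: the stub hands it through to the C body */
  DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP, exc_general_protection);
  /* emits: asm_exc_general_protection() stub,
   *        exc_general_protection(struct pt_regs *regs,
   *                               unsigned long error_code) */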
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134905.728077036@linutronix.de --- arch/x86/entry/entry_32.S | 7 ------- arch/x86/entry/entry_64.S | 1 - arch/x86/include/asm/idtentry.h | 1 + arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 3 +-- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 5 insertions(+), 16 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 9d94a036bf355b..f7610e1608d342 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1333,13 +1333,6 @@ SYM_CODE_START(machine_check) SYM_CODE_END(machine_check) #endif -SYM_CODE_START(spurious_interrupt_bug) - ASM_CLAC - pushl $0 - pushl $do_spurious_interrupt_bug - jmp common_exception -SYM_CODE_END(spurious_interrupt_bug) - #ifdef CONFIG_XEN_PV SYM_FUNC_START(xen_hypervisor_callback) /* diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5cecdd18f54857..1a677eec4d6d19 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1073,7 +1073,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 -idtentry X86_TRAP_SPURIOUS spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 idtentry X86_TRAP_MF coprocessor_error do_coprocessor_error has_error_code=0 idtentry X86_TRAP_AC alignment_check do_alignment_check has_error_code=1 idtentry X86_TRAP_XF simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 986fc655d2abf2..d309b060422bfb 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -129,6 +129,7 @@ DECLARE_IDTENTRY(X86_TRAP_BR, exc_bounds); DECLARE_IDTENTRY(X86_TRAP_UD, exc_invalid_op); DECLARE_IDTENTRY(X86_TRAP_NM, exc_device_not_available); DECLARE_IDTENTRY(X86_TRAP_OLD_MF, exc_coproc_segment_overrun); +DECLARE_IDTENTRY(X86_TRAP_SPURIOUS, exc_spurious_interrupt_bug); /* Simple exception entries with error code pushed by hardware */ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_TS, exc_invalid_tss); diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 3a096a49b3431c..4450f3b43f9713 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -19,7 +19,6 @@ asmlinkage void double_fault(void); #endif asmlinkage void page_fault(void); asmlinkage void async_page_fault(void); -asmlinkage void spurious_interrupt_bug(void); asmlinkage void coprocessor_error(void); asmlinkage void alignment_check(void); #ifdef CONFIG_X86_MCE @@ -33,7 +32,6 @@ asmlinkage void xen_xendebug(void); asmlinkage void xen_int3(void); asmlinkage void xen_double_fault(void); asmlinkage void xen_page_fault(void); -asmlinkage void xen_spurious_interrupt_bug(void); asmlinkage void xen_coprocessor_error(void); asmlinkage void xen_alignment_check(void); #ifdef CONFIG_X86_MCE @@ -47,7 +45,6 @@ dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); -dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code); dotraplinkage void do_coprocessor_error(struct 
pt_regs *regs, long error_code); dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code); dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code); diff --git a/arch/x86/kernel/idt.c index 6f0af12f08c869..8e8936dcf6e46d 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -83,7 +83,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_NP, asm_exc_segment_not_present), INTG(X86_TRAP_SS, asm_exc_stack_segment), INTG(X86_TRAP_GP, asm_exc_general_protection), - INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), + INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug), INTG(X86_TRAP_MF, coprocessor_error), INTG(X86_TRAP_AC, alignment_check), INTG(X86_TRAP_XF, simd_coprocessor_error), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e65c7612ecf324..2c638b9bc82725 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -867,8 +867,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) math_error(regs, error_code, X86_TRAP_XF); } -dotraplinkage void -do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_spurious_interrupt_bug) { /* * This addresses a Pentium Pro Erratum: diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index ca441c7c3222b6..5b2dfb027fe267 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -631,7 +631,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_segment_not_present, false ), TRAP_ENTRY(exc_stack_segment, false ), TRAP_ENTRY(exc_general_protection, false ), - { spurious_interrupt_bug, xen_spurious_interrupt_bug, false }, + TRAP_ENTRY(exc_spurious_interrupt_bug, false ), { coprocessor_error, xen_coprocessor_error, false }, { alignment_check, xen_alignment_check, false }, { simd_coprocessor_error, xen_simd_coprocessor_error, false }, }; diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 802ec00b2f9cd2..698a9c50d8771d 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -44,7 +44,7 @@ xen_pv_trap asm_exc_segment_not_present xen_pv_trap asm_exc_stack_segment xen_pv_trap asm_exc_general_protection xen_pv_trap page_fault -xen_pv_trap spurious_interrupt_bug +xen_pv_trap asm_exc_spurious_interrupt_bug xen_pv_trap coprocessor_error xen_pv_trap alignment_check #ifdef CONFIG_X86_MCE From 14a8bd2aa7c355b3a8879618a4f70f9c2b0004f7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:27 +0100 Subject: [PATCH 083/190] x86/entry: Convert Coprocessor error exception to IDTENTRY Convert #MF to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes - Remove the RCU warning as the new entry macro ensures correctness No functional change.
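Several conversions in this block drop an open coded RCU_LOCKDEP_WARN() from the handler. The check becomes redundant because the generated entry point establishes the correct state before the body runs; a simplified sketch of the helper follows (the real version does more work, e.g. irq tracing, and its details here are an assumption):

  noinstr void idtentry_enter(struct pt_regs *regs)
  {
	if (user_mode(regs)) {
		enter_from_user_mode();	/* context tracking, RCU */
	} else {
		lockdep_hardirqs_off(CALLER_ADDR0);
		rcu_irq_enter();	/* RCU is watching from here on */
	}
  }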
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134905.838823510@linutronix.de --- arch/x86/entry/entry_32.S | 7 ------- arch/x86/entry/entry_64.S | 1 - arch/x86/include/asm/idtentry.h | 1 + arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 5 ++--- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 6 insertions(+), 17 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index f7610e1608d342..66d4683026f0d4 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1290,13 +1290,6 @@ SYM_FUNC_END(name) /* The include is where all of the SMP etc. interrupts come from */ #include -SYM_CODE_START(coprocessor_error) - ASM_CLAC - pushl $0 - pushl $do_coprocessor_error - jmp common_exception -SYM_CODE_END(coprocessor_error) - SYM_CODE_START(simd_coprocessor_error) ASM_CLAC pushl $0 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1a677eec4d6d19..f1f126bb4945ab 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1073,7 +1073,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 -idtentry X86_TRAP_MF coprocessor_error do_coprocessor_error has_error_code=0 idtentry X86_TRAP_AC alignment_check do_alignment_check has_error_code=1 idtentry X86_TRAP_XF simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index d309b060422bfb..ed44ba6210f2df 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -130,6 +130,7 @@ DECLARE_IDTENTRY(X86_TRAP_UD, exc_invalid_op); DECLARE_IDTENTRY(X86_TRAP_NM, exc_device_not_available); DECLARE_IDTENTRY(X86_TRAP_OLD_MF, exc_coproc_segment_overrun); DECLARE_IDTENTRY(X86_TRAP_SPURIOUS, exc_spurious_interrupt_bug); +DECLARE_IDTENTRY(X86_TRAP_MF, exc_coprocessor_error); /* Simple exception entries with error code pushed by hardware */ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_TS, exc_invalid_tss); diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 4450f3b43f9713..e84677b685c56b 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -19,7 +19,6 @@ asmlinkage void double_fault(void); #endif asmlinkage void page_fault(void); asmlinkage void async_page_fault(void); -asmlinkage void coprocessor_error(void); asmlinkage void alignment_check(void); #ifdef CONFIG_X86_MCE asmlinkage void machine_check(void); @@ -32,7 +31,6 @@ asmlinkage void xen_xendebug(void); asmlinkage void xen_int3(void); asmlinkage void xen_double_fault(void); asmlinkage void xen_page_fault(void); -asmlinkage void xen_coprocessor_error(void); asmlinkage void xen_alignment_check(void); #ifdef CONFIG_X86_MCE asmlinkage void xen_machine_check(void); @@ -45,7 +43,6 @@ dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); -dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code); dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code); dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, 
long error_code); #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 8e8936dcf6e46d..2bde50d4cfa10d 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -84,7 +84,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_SS, asm_exc_stack_segment), INTG(X86_TRAP_GP, asm_exc_general_protection), INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug), - INTG(X86_TRAP_MF, coprocessor_error), + INTG(X86_TRAP_MF, asm_exc_coprocessor_error), INTG(X86_TRAP_AC, alignment_check), INTG(X86_TRAP_XF, simd_coprocessor_error), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 2c638b9bc82725..ba26bebfed7263 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -854,10 +854,9 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) cond_local_irq_disable(regs); } -dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_coprocessor_error) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - math_error(regs, error_code, X86_TRAP_MF); + math_error(regs, 0, X86_TRAP_MF); } dotraplinkage void diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5b2dfb027fe267..678c50888db22d 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -632,7 +632,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_stack_segment, false ), TRAP_ENTRY(exc_general_protection, false ), TRAP_ENTRY(exc_spurious_interrupt_bug, false ), - { coprocessor_error, xen_coprocessor_error, false }, + TRAP_ENTRY(exc_coprocessor_error, false ), { alignment_check, xen_alignment_check, false }, { simd_coprocessor_error, xen_simd_coprocessor_error, false }, }; diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 698a9c50d8771d..589de186d8cd3c 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -45,7 +45,7 @@ xen_pv_trap asm_exc_stack_segment xen_pv_trap asm_exc_general_protection xen_pv_trap page_fault xen_pv_trap asm_exc_spurious_interrupt_bug -xen_pv_trap coprocessor_error +xen_pv_trap asm_exc_coprocessor_error xen_pv_trap alignment_check #ifdef CONFIG_X86_MCE xen_pv_trap machine_check From 436608bb00a59f5457cee26f416067860ca88d9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:28 +0100 Subject: [PATCH 084/190] x86/entry: Convert Alignment check exception to IDTENTRY Convert #AC to IDTENTRY_ERRORCODE: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes - Remove the RCU warning as the new entry macro ensures correctness No functional change. 
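Taken together, every conversion in this block applies the same three-step recipe, plus the matching TRAP_ENTRY()/xen_pv_trap updates for XEN/PV. Using #AC as the worked example, with the names exactly as they appear in the hunks:

  /* 1) arch/x86/include/asm/idtentry.h: C prototype + ASM stub emission */
  DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC, exc_alignment_check);

  /* 2) arch/x86/kernel/traps.c: the handler body */
  DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
  {
	/* runs with entry state already established by the wrapper */
  }

  /* 3) arch/x86/kernel/idt.c: point the gate at the generated stub */
  INTG(X86_TRAP_AC, asm_exc_alignment_check),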
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134905.928967113@linutronix.de --- arch/x86/entry/entry_32.S | 6 ------ arch/x86/entry/entry_64.S | 1 - arch/x86/include/asm/idtentry.h | 1 + arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 4 +--- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 5 insertions(+), 16 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 66d4683026f0d4..740289017179e5 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1311,12 +1311,6 @@ SYM_CODE_START(native_iret) SYM_CODE_END(native_iret) #endif -SYM_CODE_START(alignment_check) - ASM_CLAC - pushl $do_alignment_check - jmp common_exception -SYM_CODE_END(alignment_check) - #ifdef CONFIG_X86_MCE SYM_CODE_START(machine_check) ASM_CLAC diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f1f126bb4945ab..3c95a6307d723d 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1073,7 +1073,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 -idtentry X86_TRAP_AC alignment_check do_alignment_check has_error_code=1 idtentry X86_TRAP_XF simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 idtentry X86_TRAP_PF page_fault do_page_fault has_error_code=1 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index ed44ba6210f2df..531dbc0ee365c6 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -137,5 +137,6 @@ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_TS, exc_invalid_tss); DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_NP, exc_segment_not_present); DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_SS, exc_stack_segment); DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP, exc_general_protection); +DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC, exc_alignment_check); #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index e84677b685c56b..0f755e156f05de 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -19,7 +19,6 @@ asmlinkage void double_fault(void); #endif asmlinkage void page_fault(void); asmlinkage void async_page_fault(void); -asmlinkage void alignment_check(void); #ifdef CONFIG_X86_MCE asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ @@ -31,7 +30,6 @@ asmlinkage void xen_xendebug(void); asmlinkage void xen_int3(void); asmlinkage void xen_double_fault(void); asmlinkage void xen_page_fault(void); -asmlinkage void xen_alignment_check(void); #ifdef CONFIG_X86_MCE asmlinkage void xen_machine_check(void); #endif /* CONFIG_X86_MCE */ @@ -43,7 +41,6 @@ dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); -dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code); dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 2bde50d4cfa10d..af481961078333 100644 --- a/arch/x86/kernel/idt.c +++ 
b/arch/x86/kernel/idt.c @@ -85,7 +85,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_GP, asm_exc_general_protection), INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug), INTG(X86_TRAP_MF, asm_exc_coprocessor_error), - INTG(X86_TRAP_AC, alignment_check), + INTG(X86_TRAP_AC, asm_exc_alignment_check), INTG(X86_TRAP_XF, simd_coprocessor_error), #ifdef CONFIG_X86_32 TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ba26bebfed7263..9f156c84195dd8 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -270,12 +270,10 @@ DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment) 0, NULL); } -dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check) { char *str = "alignment check"; - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP) return; diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 678c50888db22d..1097e35d6afb04 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -633,7 +633,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_general_protection, false ), TRAP_ENTRY(exc_spurious_interrupt_bug, false ), TRAP_ENTRY(exc_coprocessor_error, false ), - { alignment_check, xen_alignment_check, false }, + TRAP_ENTRY(exc_alignment_check, false ), { simd_coprocessor_error, xen_simd_coprocessor_error, false }, }; diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 589de186d8cd3c..a591becaee2b1d 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -46,7 +46,7 @@ xen_pv_trap asm_exc_general_protection xen_pv_trap page_fault xen_pv_trap asm_exc_spurious_interrupt_bug xen_pv_trap asm_exc_coprocessor_error -xen_pv_trap alignment_check +xen_pv_trap asm_exc_alignment_check #ifdef CONFIG_X86_MCE xen_pv_trap machine_check #endif /* CONFIG_X86_MCE */ From 48227e21f7430e31042f63e078a45cd230e9fdfc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:29 +0100 Subject: [PATCH 085/190] x86/entry: Convert SIMD coprocessor error exception to IDTENTRY Convert #XF to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Handle INVD_BUG in C - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes - Remove the RCU warning as the new entry macro ensures correctness No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134906.021552202@linutronix.de --- arch/x86/entry/entry_32.S | 14 -------------- arch/x86/entry/entry_64.S | 1 - arch/x86/include/asm/idtentry.h | 1 + arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 29 +++++++++++++++++------------ arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 21 insertions(+), 33 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 740289017179e5..c93fb73af03982 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1290,20 +1290,6 @@ SYM_FUNC_END(name) /* The include is where all of the SMP etc.
interrupts come from */ #include -SYM_CODE_START(simd_coprocessor_error) - ASM_CLAC - pushl $0 -#ifdef CONFIG_X86_INVD_BUG - /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ - ALTERNATIVE "pushl $exc_general_protection", \ - "pushl $do_simd_coprocessor_error", \ - X86_FEATURE_XMM -#else - pushl $do_simd_coprocessor_error -#endif - jmp common_exception -SYM_CODE_END(simd_coprocessor_error) - #ifdef CONFIG_PARAVIRT SYM_CODE_START(native_iret) iret diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3c95a6307d723d..1bada7b2621072 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1073,7 +1073,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 -idtentry X86_TRAP_XF simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 idtentry X86_TRAP_PF page_fault do_page_fault has_error_code=1 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 531dbc0ee365c6..99d4759bd9147f 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -131,6 +131,7 @@ DECLARE_IDTENTRY(X86_TRAP_NM, exc_device_not_available); DECLARE_IDTENTRY(X86_TRAP_OLD_MF, exc_coproc_segment_overrun); DECLARE_IDTENTRY(X86_TRAP_SPURIOUS, exc_spurious_interrupt_bug); DECLARE_IDTENTRY(X86_TRAP_MF, exc_coprocessor_error); +DECLARE_IDTENTRY(X86_TRAP_XF, exc_simd_coprocessor_error); /* Simple exception entries with error code pushed by hardware */ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_TS, exc_invalid_tss); diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0f755e156f05de..e7eb7532233d3e 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -22,7 +22,6 @@ asmlinkage void async_page_fault(void); #ifdef CONFIG_X86_MCE asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ -asmlinkage void simd_coprocessor_error(void); #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) asmlinkage void xen_xennmi(void); @@ -33,7 +32,6 @@ asmlinkage void xen_page_fault(void); #ifdef CONFIG_X86_MCE asmlinkage void xen_machine_check(void); #endif /* CONFIG_X86_MCE */ -asmlinkage void xen_simd_coprocessor_error(void); #endif dotraplinkage void do_debug(struct pt_regs *regs, long error_code); @@ -41,7 +39,6 @@ dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); -dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code); #endif diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index af481961078333..38b565b7e5b893 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -86,7 +86,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug), INTG(X86_TRAP_MF, asm_exc_coprocessor_error), INTG(X86_TRAP_AC, asm_exc_alignment_check), - INTG(X86_TRAP_XF, simd_coprocessor_error), + INTG(X86_TRAP_XF, asm_exc_simd_coprocessor_error), #ifdef CONFIG_X86_32 TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9f156c84195dd8..1702922ebd9c1b 100644 --- a/arch/x86/kernel/traps.c +++ 
b/arch/x86/kernel/traps.c @@ -810,7 +810,7 @@ NOKPROBE_SYMBOL(do_debug); * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -static void math_error(struct pt_regs *regs, int error_code, int trapnr) +static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; struct fpu *fpu = &task->thread.fpu; @@ -821,15 +821,15 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) cond_local_irq_enable(regs); if (!user_mode(regs)) { - if (fixup_exception(regs, trapnr, error_code, 0)) + if (fixup_exception(regs, trapnr, 0, 0)) goto exit; - task->thread.error_code = error_code; + task->thread.error_code = 0; task->thread.trap_nr = trapnr; - if (notify_die(DIE_TRAP, str, regs, error_code, - trapnr, SIGFPE) != NOTIFY_STOP) - die(str, regs, error_code); + if (notify_die(DIE_TRAP, str, regs, 0, trapnr, + SIGFPE) != NOTIFY_STOP) + die(str, regs, 0); goto exit; } @@ -839,7 +839,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) fpu__save(fpu); task->thread.trap_nr = trapnr; - task->thread.error_code = error_code; + task->thread.error_code = 0; si_code = fpu__exception_code(fpu, trapnr); /* Retry when we get spurious exceptions: */ @@ -854,14 +854,19 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) DEFINE_IDTENTRY(exc_coprocessor_error) { - math_error(regs, 0, X86_TRAP_MF); + math_error(regs, X86_TRAP_MF); } -dotraplinkage void -do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_simd_coprocessor_error) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - math_error(regs, error_code, X86_TRAP_XF); + if (IS_ENABLED(CONFIG_X86_INVD_BUG)) { + /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */ + if (!static_cpu_has(X86_FEATURE_XMM)) { + __exc_general_protection(regs, 0); + return; + } + } + math_error(regs, X86_TRAP_XF); } DEFINE_IDTENTRY(exc_spurious_interrupt_bug) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 1097e35d6afb04..0a30fc0fe0faa7 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -634,7 +634,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_spurious_interrupt_bug, false ), TRAP_ENTRY(exc_coprocessor_error, false ), TRAP_ENTRY(exc_alignment_check, false ), - { simd_coprocessor_error, xen_simd_coprocessor_error, false }, + TRAP_ENTRY(exc_simd_coprocessor_error, false ), }; static bool __ref get_trap_addr(void **addr, unsigned int ist) diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index a591becaee2b1d..6a91157d5b5ca3 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -50,7 +50,7 @@ xen_pv_trap asm_exc_alignment_check #ifdef CONFIG_X86_MCE xen_pv_trap machine_check #endif /* CONFIG_X86_MCE */ -xen_pv_trap simd_coprocessor_error +xen_pv_trap asm_exc_simd_coprocessor_error #ifdef CONFIG_IA32_EMULATION xen_pv_trap entry_INT80_compat #endif From d77290507ab2ac691d50389e255ebd11a6cbc35a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:30 +0100 Subject: [PATCH 086/190] x86/entry/32: Convert IRET exception to IDTENTRY_SW Convert the IRET exception handler to IDTENTRY_SW. This is slightly different than the conversions of hardware exceptions as the IRET exception is invoked via an exception table when IRET faults. So it just uses the IDTENTRY_SW mechanism for consistency. It does not emit ASM code as it does not fit the other idtentry exceptions. 
- Implement the C entry point with DEFINE_IDTENTRY_SW() which maps to DEFINE_IDTENTRY() - Fixup the XEN/PV code - Remove the old prototypes - Remove the RCU warning as the new entry macro ensures correctness No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134906.128769226@linutronix.de --- arch/x86/entry/entry_32.S | 14 +++++++------- arch/x86/include/asm/idtentry.h | 10 ++++++++++ arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/traps.c | 8 +++----- arch/x86/xen/xen-asm_32.S | 2 +- 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index c93fb73af03982..f7a5f1cda058aa 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1147,9 +1147,9 @@ restore_all_kernel: jmp .Lirq_return .section .fixup, "ax" -SYM_CODE_START(iret_exc) +SYM_CODE_START(asm_iret_error) pushl $0 # no error code - pushl $do_iret_error + pushl $iret_error #ifdef CONFIG_DEBUG_ENTRY /* @@ -1163,10 +1163,10 @@ SYM_CODE_START(iret_exc) popl %eax #endif - jmp common_exception -SYM_CODE_END(iret_exc) + jmp handle_exception +SYM_CODE_END(asm_iret_error) .previous - _ASM_EXTABLE(.Lirq_return, iret_exc) + _ASM_EXTABLE(.Lirq_return, asm_iret_error) SYM_FUNC_END(entry_INT80_32) .macro FIXUP_ESPFIX_STACK @@ -1293,7 +1293,7 @@ SYM_FUNC_END(name) #ifdef CONFIG_PARAVIRT SYM_CODE_START(native_iret) iret - _ASM_EXTABLE(native_iret, iret_exc) + _ASM_EXTABLE(native_iret, asm_iret_error) SYM_CODE_END(native_iret) #endif @@ -1358,7 +1358,7 @@ SYM_FUNC_START(xen_failsafe_callback) popl %eax lea 16(%esp), %esp jz 5f - jmp iret_exc + jmp asm_iret_error 5: pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL ENCODE_FRAME_POINTER diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 99d4759bd9147f..ee6ebfef7e57dd 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -57,6 +57,10 @@ __visible noinstr void func(struct pt_regs *regs) \ \ static __always_inline void __##func(struct pt_regs *regs) +/* Special case for 32bit IRET 'trap' */ +#define DECLARE_IDTENTRY_SW DECLARE_IDTENTRY +#define DEFINE_IDTENTRY_SW DEFINE_IDTENTRY + /** * DECLARE_IDTENTRY_ERRORCODE - Declare functions for simple IDT entry points * Error code pushed by hardware @@ -111,6 +115,9 @@ static __always_inline void __##func(struct pt_regs *regs, \ #define DECLARE_IDTENTRY_ERRORCODE(vector, func) \ idtentry vector asm_##func func has_error_code=1 sane=1 +/* Special case for 32bit IRET 'trap'. Do not emit ASM code */ +#define DECLARE_IDTENTRY_SW(vector, func) + #endif /* __ASSEMBLY__ */ /* @@ -133,6 +140,9 @@ DECLARE_IDTENTRY(X86_TRAP_SPURIOUS, exc_spurious_interrupt_bug); DECLARE_IDTENTRY(X86_TRAP_MF, exc_coprocessor_error); DECLARE_IDTENTRY(X86_TRAP_XF, exc_simd_coprocessor_error); +/* 32bit software IRET trap. 
Do not emit ASM code */ +DECLARE_IDTENTRY_SW(X86_TRAP_IRET, iret_error); + /* Simple exception entries with error code pushed by hardware */ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_TS, exc_invalid_tss); DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_NP, exc_segment_not_present); diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index e7eb7532233d3e..5774d0b6cf77d3 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -39,9 +39,6 @@ dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_int3(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); -#ifdef CONFIG_X86_32 -dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code); -#endif dotraplinkage void do_mce(struct pt_regs *regs, long error_code); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1702922ebd9c1b..b28a64d7691fbc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -925,14 +925,12 @@ DEFINE_IDTENTRY(exc_device_not_available) } #ifdef CONFIG_X86_32 -dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_SW(iret_error) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); local_irq_enable(); - - if (notify_die(DIE_TRAP, "iret exception", regs, error_code, + if (notify_die(DIE_TRAP, "iret exception", regs, 0, X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { - do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, + do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0, ILL_BADSTK, (void __user *)NULL); } local_irq_disable(); diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 2712e915530632..812ff01e4e3414 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -117,7 +117,7 @@ iret_restore_end: 1: iret xen_iret_end_crit: - _ASM_EXTABLE(1b, iret_exc) + _ASM_EXTABLE(1b, asm_iret_error) hyper_iret: /* put this out of line since its very rarely used */ From 4979fb53ab0ed35eddd20a73c25a5597bc22a57f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jan 2020 15:53:09 +0100 Subject: [PATCH 087/190] x86/int3: Ensure that poke_int3_handler() is not traced In order to ensure poke_int3_handler() is completely self contained -- this is called while modifying other text, imagine the fun of hitting another INT3 -- ensure that everything it uses is not traced. The primary means here is to force inlining; bsearch() is notrace because all of lib/ is. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135313.410702173@linutronix.de --- arch/x86/include/asm/ptrace.h | 2 +- arch/x86/include/asm/text-patching.h | 11 +++++++---- arch/x86/kernel/alternative.c | 13 ++++++------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 6d6475fdd32789..ebedeab4870435 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -123,7 +123,7 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) * On x86_64, vm86 mode is mercifully nonexistent, and we don't need * the extra check. 
*/ -static inline int user_mode(struct pt_regs *regs) +static __always_inline int user_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL; diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 67315fa3956a19..6593b42cb37906 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -64,7 +64,7 @@ extern void text_poke_finish(void); #define DISP32_SIZE 4 -static inline int text_opcode_size(u8 opcode) +static __always_inline int text_opcode_size(u8 opcode) { int size = 0; @@ -118,12 +118,14 @@ extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr; #ifndef CONFIG_UML_X86 -static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) +static __always_inline +void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) { regs->ip = ip; } -static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) +static __always_inline +void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* * The int3 handler in entry_64.S adds a gap between the @@ -138,7 +140,8 @@ static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) *(unsigned long *)regs->sp = val; } -static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func) +static __always_inline +void int3_emulate_call(struct pt_regs *regs, unsigned long func) { int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE); int3_emulate_jmp(regs, func); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a9195ce8265d72..dd81ed5beecab4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1011,7 +1011,8 @@ struct bp_patching_desc { static struct bp_patching_desc *bp_desc; -static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) +static __always_inline +struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) { struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */ @@ -1021,18 +1022,18 @@ static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **de return desc; } -static inline void put_desc(struct bp_patching_desc *desc) +static __always_inline void put_desc(struct bp_patching_desc *desc) { smp_mb__before_atomic(); atomic_dec(&desc->refs); } -static inline void *text_poke_addr(struct text_poke_loc *tp) +static __always_inline void *text_poke_addr(struct text_poke_loc *tp) { return _stext + tp->rel_addr; } -static int notrace patch_cmp(const void *key, const void *elt) +static int noinstr patch_cmp(const void *key, const void *elt) { struct text_poke_loc *tp = (struct text_poke_loc *) elt; @@ -1042,9 +1043,8 @@ static int notrace patch_cmp(const void *key, const void *elt) return 1; return 0; } -NOKPROBE_SYMBOL(patch_cmp); -int notrace poke_int3_handler(struct pt_regs *regs) +int noinstr poke_int3_handler(struct pt_regs *regs) { struct bp_patching_desc *desc; struct text_poke_loc *tp; @@ -1118,7 +1118,6 @@ int notrace poke_int3_handler(struct pt_regs *regs) put_desc(desc); return ret; } -NOKPROBE_SYMBOL(poke_int3_handler); #define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) static struct text_poke_loc tp_vec[TP_VEC_MAX]; From ef882bfef933408360e4d9d0c2c83a1e2fc996f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jan 2020 22:08:45 +0100 Subject: [PATCH 088/190] x86/int3: Avoid atomic instrumentation Use 
arch_atomic_*() and __READ_ONCE() to ensure nothing untoward creeps in and ruins things. That is: this is the INT3 text poke handler; strictly limit the code that runs in it, lest it inadvertently hit yet another INT3. Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Masami Hiramatsu Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135313.517429268@linutronix.de --- arch/x86/kernel/alternative.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index dd81ed5beecab4..50a8d24a417ed6 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1014,9 +1014,9 @@ static struct bp_patching_desc *bp_desc; static __always_inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) { - struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */ + struct bp_patching_desc *desc = __READ_ONCE(*descp); /* rcu_dereference */ - if (!desc || !atomic_inc_not_zero(&desc->refs)) + if (!desc || !arch_atomic_inc_not_zero(&desc->refs)) return NULL; return desc; @@ -1025,7 +1025,7 @@ struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) static __always_inline void put_desc(struct bp_patching_desc *desc) { smp_mb__before_atomic(); - atomic_dec(&desc->refs); + arch_atomic_dec(&desc->refs); } static __always_inline void *text_poke_addr(struct text_poke_loc *tp) From df65bba1dcd8ffadd922a71196b78c6d7630c33b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Feb 2020 18:25:09 +0100 Subject: [PATCH 089/190] lib/bsearch: Provide __always_inline variant For code that needs the ultimate performance (it can inline the @cmp function too) or simply needs to avoid calling external functions for whatever reason, provide an __always_inline variant of bsearch(). [ tglx: Renamed to __inline_bsearch() as suggested by Andy ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135313.624443814@linutronix.de --- include/linux/bsearch.h | 26 ++++++++++++++++++++++++-- lib/bsearch.c | 22 ++-------------------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/include/linux/bsearch.h b/include/linux/bsearch.h index 8ed53d7524ea69..e66b711d091e92 100644 --- a/include/linux/bsearch.h +++ b/include/linux/bsearch.h @@ -4,7 +4,29 @@ #include -void *bsearch(const void *key, const void *base, size_t num, size_t size, - cmp_func_t cmp); +static __always_inline +void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp) +{ + const char *pivot; + int result; + + while (num > 0) { + pivot = base + (num >> 1) * size; + result = cmp(key, pivot); + + if (result == 0) + return (void *)pivot; + + if (result > 0) { + base = pivot + size; + num--; + } + num >>= 1; + } + + return NULL; +} + +extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp); #endif /* _LINUX_BSEARCH_H */ diff --git a/lib/bsearch.c b/lib/bsearch.c index 8b3aae5ae77a20..bf86aa66f2b275 100644 --- a/lib/bsearch.c +++ b/lib/bsearch.c @@ -28,27 +28,9 @@ * the key and elements in the array are of the same type, you can use * the same comparison function for both sort() and bsearch().
*/ -void *bsearch(const void *key, const void *base, size_t num, size_t size, - cmp_func_t cmp) +void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp) { - const char *pivot; - int result; - - while (num > 0) { - pivot = base + (num >> 1) * size; - result = cmp(key, pivot); - - if (result == 0) - return (void *)pivot; - - if (result > 0) { - base = pivot + size; - num--; - } - num >>= 1; - } - - return NULL; + return __inline_bsearch(key, base, num, size, cmp); } EXPORT_SYMBOL(bsearch); NOKPROBE_SYMBOL(bsearch); From f64366efd8c60b93138b813d071d2cd201fd0f6e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 20 Feb 2020 13:28:06 +0100 Subject: [PATCH 090/190] x86/int3: Inline bsearch() Avoid calling out to bsearch() by inlining it, for normal kernel configs this was the last external call and poke_int3_handler() is now fully self sufficient -- no calls to external code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135313.731774429@linutronix.de --- arch/x86/kernel/alternative.c | 8 ++++---- arch/x86/kernel/traps.c | 5 +++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 50a8d24a417ed6..8fd39ff74a499b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1033,7 +1033,7 @@ static __always_inline void *text_poke_addr(struct text_poke_loc *tp) return _stext + tp->rel_addr; } -static int noinstr patch_cmp(const void *key, const void *elt) +static __always_inline int patch_cmp(const void *key, const void *elt) { struct text_poke_loc *tp = (struct text_poke_loc *) elt; @@ -1077,9 +1077,9 @@ int noinstr poke_int3_handler(struct pt_regs *regs) * Skip the binary search if there is a single member in the vector. */ if (unlikely(desc->nr_entries > 1)) { - tp = bsearch(ip, desc->vec, desc->nr_entries, - sizeof(struct text_poke_loc), - patch_cmp); + tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, + sizeof(struct text_poke_loc), + patch_cmp); if (!tp) goto out_put; } else { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b28a64d7691fbc..280c290f414f00 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -570,6 +570,11 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) { + /* + * poke_int3_handler() is completely self contained code; it does (and + * must) *NOT* call out to anything, lest it hits upon yet another + * INT3. + */ if (poke_int3_handler(regs)) return; From 0dc6cdc21b94eed8cdacf34eabb4175cebd13775 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Mar 2020 15:22:09 +0100 Subject: [PATCH 091/190] x86/idtentry: Provide IDTENTRY_RAW Some exception handlers need to do extra work before any of the entry helpers are invoked. Provide IDTENTRY_RAW for this. 
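A raw entry point then has to do the enter/exit work itself. As a rough sketch of the intended use (illustration only; exc_xyz and the helpers it calls are made-up names, the real users follow in subsequent patches):

  DEFINE_IDTENTRY_RAW(exc_xyz)
  {
          /*
           * Extra work which has to run before any of the regular
           * entry helpers can be invoked.
           */
          if (xyz_early_fixup(regs))
                  return;

          idtentry_enter(regs);
          handle_xyz(regs);
          idtentry_exit(regs);
  }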
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135313.830540017@linutronix.de --- arch/x86/include/asm/idtentry.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index ee6ebfef7e57dd..2f31d03f3e57c6 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -104,6 +104,34 @@ __visible noinstr void func(struct pt_regs *regs, \ static __always_inline void __##func(struct pt_regs *regs, \ unsigned long error_code) +/** + * DECLARE_IDTENTRY_RAW - Declare functions for raw IDT entry points + * No error code pushed by hardware + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Maps to DECLARE_IDTENTRY(). + */ +#define DECLARE_IDTENTRY_RAW(vector, func) \ + DECLARE_IDTENTRY(vector, func) + +/** + * DEFINE_IDTENTRY_RAW - Emit code for raw IDT entry points + * @func: Function name of the entry point + * + * @func is called from ASM entry code with interrupts disabled. + * + * The macro is written so it acts as function definition. Append the + * body with a pair of curly brackets. + * + * Contrary to DEFINE_IDTENTRY() this does not invoke the + * idtentry_enter/exit() helpers before and after the body invocation. This + * needs to be done in the body itself if applicable. Use if extra work + * is required before the enter/exit() helpers are invoked. + */ +#define DEFINE_IDTENTRY_RAW(func) \ +__visible noinstr void func(struct pt_regs *regs) + #else /* !__ASSEMBLY__ */ /* @@ -118,6 +146,9 @@ static __always_inline void __##func(struct pt_regs *regs, \ /* Special case for 32bit IRET 'trap'. Do not emit ASM code */ #define DECLARE_IDTENTRY_SW(vector, func) +#define DECLARE_IDTENTRY_RAW(vector, func) \ + DECLARE_IDTENTRY(vector, func) + #endif /* __ASSEMBLY__ */ /* From 8edd7e37aed8b9df938a63f0b0259c70569ce3d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:16 +0100 Subject: [PATCH 092/190] x86/entry: Convert INT3 exception to IDTENTRY_RAW Convert #BP to IDTENTRY_RAW: - Implement the C entry point with DEFINE_IDTENTRY_RAW - Invoke idtentry_enter/exit() from the function body - Emit the ASM stub with DECLARE_IDTENTRY_RAW - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change. This could be a plain IDTENTRY, but as Peter pointed out INT3 is broken vs. the static key in the context tracking code as this static key might be in the state of being patched and has an int3 which would recurse forever. IDTENTRY_RAW is therefore chosen to allow addressing this issue without lots of code churn. 
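The crucial part is the ordering inside the C entry point; roughly (compressed from the diff below):

  DEFINE_IDTENTRY_RAW(exc_int3)
  {
          /* Must run first: no static keys, no instrumentation */
          if (poke_int3_handler(regs))
                  return;

          /* Only now are the regular entry helpers safe to use */
          if (user_mode(regs))
                  idtentry_enter(regs);
          else
                  nmi_enter();

          /* ... handle the breakpoint ... */
  }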
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135313.938474960@linutronix.de --- arch/x86/entry/entry_32.S | 7 ------- arch/x86/entry/entry_64.S | 2 -- arch/x86/include/asm/idtentry.h | 3 +++ arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 28 +++++++++++++++++----------- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 23 insertions(+), 26 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index f7a5f1cda058aa..b9b0ddb53a0828 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1649,13 +1649,6 @@ SYM_CODE_START(nmi) #endif SYM_CODE_END(nmi) -SYM_CODE_START(int3) - ASM_CLAC - pushl $0 - pushl $do_int3 - jmp common_exception -SYM_CODE_END(int3) - .pushsection .text, "ax" SYM_CODE_START(rewind_stack_do_exit) /* Prevent any naive code from trying to unwind to our caller. */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1bada7b2621072..69ddd052aef211 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1072,8 +1072,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * Exception entry points. */ -idtentry X86_TRAP_BP int3 do_int3 has_error_code=0 - idtentry X86_TRAP_PF page_fault do_page_fault has_error_code=1 #ifdef CONFIG_X86_MCE diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 2f31d03f3e57c6..3dc4d5b246d315 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -181,4 +181,7 @@ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_SS, exc_stack_segment); DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP, exc_general_protection); DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC, exc_alignment_check); +/* Raw exception entries which need extra work */ +DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); + #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 5774d0b6cf77d3..698285a2b6608b 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -13,7 +13,6 @@ asmlinkage void debug(void); asmlinkage void nmi(void); -asmlinkage void int3(void); #ifdef CONFIG_X86_64 asmlinkage void double_fault(void); #endif @@ -26,7 +25,6 @@ asmlinkage void machine_check(void); #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); -asmlinkage void xen_int3(void); asmlinkage void xen_double_fault(void); asmlinkage void xen_page_fault(void); #ifdef CONFIG_X86_MCE @@ -36,7 +34,6 @@ asmlinkage void xen_machine_check(void); dotraplinkage void do_debug(struct pt_regs *regs, long error_code); dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); -dotraplinkage void do_int3(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); dotraplinkage void do_mce(struct pt_regs *regs, long error_code); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 38b565b7e5b893..9ca8af65a2123f 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -60,7 +60,7 @@ static bool idt_setup_done __initdata; */ static const __initconst struct idt_data early_idts[] = { INTG(X86_TRAP_DB, debug), - SYSG(X86_TRAP_BP, int3), + SYSG(X86_TRAP_BP, asm_exc_int3), #ifdef CONFIG_X86_32 
INTG(X86_TRAP_PF, page_fault), #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 280c290f414f00..0ad12dffde22ce 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -568,7 +568,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) cond_local_irq_disable(regs); } -dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_RAW(exc_int3) { /* * poke_int3_handler() is completely self contained code; it does (and @@ -579,16 +579,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) return; /* - * Unlike any other non-IST entry, we can be called from pretty much - * any location in the kernel through kprobes -- text_poke() will most - * likely be handled by poke_int3_handler() above. This means this - * handler is effectively NMI-like. + * idtentry_enter() uses static_branch_{,un}likely() and therefore + * can trigger INT3, hence poke_int3_handler() must be done + * before. If the entry came from kernel mode, then use nmi_enter() + * because the INT3 could have been hit in any context including + * NMI. */ - if (!user_mode(regs)) + if (user_mode(regs)) + idtentry_enter(regs); + else nmi_enter(); + instrumentation_begin(); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP - if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, + if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) goto exit; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ @@ -598,19 +602,21 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) goto exit; #endif - if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, + if (notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) goto exit; cond_local_irq_enable(regs); - do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL); + do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); cond_local_irq_disable(regs); exit: - if (!user_mode(regs)) + instrumentation_end(); + if (user_mode(regs)) + idtentry_exit(regs); + else nmi_exit(); } -NOKPROBE_SYMBOL(do_int3); #ifdef CONFIG_X86_64 /* diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 0a30fc0fe0faa7..5bcd86c4dff045 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -616,7 +616,7 @@ static struct trap_array_entry trap_array[] = { { machine_check, xen_machine_check, true }, #endif { nmi, xen_xennmi, true }, - { int3, xen_int3, false }, + TRAP_ENTRY(exc_int3, false ), TRAP_ENTRY(exc_overflow, false ), #ifdef CONFIG_IA32_EMULATION { entry_INT80_compat, xen_entry_INT80_compat, false }, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 6a91157d5b5ca3..44f55569a37f56 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -31,7 +31,7 @@ _ASM_NOKPROBE(xen_\name) xen_pv_trap asm_exc_divide_error xen_pv_trap debug xen_pv_trap xendebug -xen_pv_trap int3 +xen_pv_trap asm_exc_int3 xen_pv_trap xennmi xen_pv_trap asm_exc_overflow xen_pv_trap asm_exc_bounds From 21e28290b31708b72763641604e239eb369c230d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2020 16:09:52 +0100 Subject: [PATCH 093/190] x86/traps: Split int3 handler up For code simplicity split up the int3 handler into a kernel and user part which makes the code flow simpler to understand. 
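The entry point then reduces to two symmetric paths (condensed from the diff below):

  DEFINE_IDTENTRY_RAW(exc_int3)
  {
          if (poke_int3_handler(regs))
                  return;

          if (user_mode(regs)) {
                  idtentry_enter(regs);
                  instrumentation_begin();
                  do_int3_user(regs);
                  instrumentation_end();
                  idtentry_exit(regs);
          } else {
                  nmi_enter();
                  instrumentation_begin();
                  if (!do_int3(regs))
                          die("int3", regs, 0);
                  instrumentation_end();
                  nmi_exit();
          }
  }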
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505135314.045220765@linutronix.de --- arch/x86/kernel/traps.c | 68 ++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 0ad12dffde22ce..21c8cfce24d314 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -568,6 +568,35 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) cond_local_irq_disable(regs); } +static bool do_int3(struct pt_regs *regs) +{ + int res; + +#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP + if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, + SIGTRAP) == NOTIFY_STOP) + return true; +#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ + +#ifdef CONFIG_KPROBES + if (kprobe_int3_handler(regs)) + return true; +#endif + res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP); + + return res == NOTIFY_STOP; +} + +static void do_int3_user(struct pt_regs *regs) +{ + if (do_int3(regs)) + return; + + cond_local_irq_enable(regs); + do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); + cond_local_irq_disable(regs); +} + DEFINE_IDTENTRY_RAW(exc_int3) { /* @@ -585,37 +614,20 @@ DEFINE_IDTENTRY_RAW(exc_int3) * because the INT3 could have been hit in any context including * NMI. */ - if (user_mode(regs)) + if (user_mode(regs)) { idtentry_enter(regs); - else - nmi_enter(); - - instrumentation_begin(); -#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP - if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, - SIGTRAP) == NOTIFY_STOP) - goto exit; -#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ - -#ifdef CONFIG_KPROBES - if (kprobe_int3_handler(regs)) - goto exit; -#endif - - if (notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, - SIGTRAP) == NOTIFY_STOP) - goto exit; - - cond_local_irq_enable(regs); - do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); - cond_local_irq_disable(regs); - -exit: - instrumentation_end(); - if (user_mode(regs)) + instrumentation_begin(); + do_int3_user(regs); + instrumentation_end(); idtentry_exit(regs); - else + } else { + nmi_enter(); + instrumentation_begin(); + if (!do_int3(regs)) + die("int3", regs, 0); + instrumentation_end(); nmi_exit(); + } } #ifdef CONFIG_X86_64 From 2c058b03cc06ba485169778a271f87e5ac57dd83 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:22 +0100 Subject: [PATCH 094/190] x86/idtentry: Provide IDTENTRY_IST Same as IDTENTRY but for exceptions which run on Interrupt Stacks (IST) on 64bit. For 32bit this maps to IDTENTRY. There are 3 variants which will be used: IDTENTRY_MCE, IDTENTRY_DB and IDTENTRY_NMI. These map to IDTENTRY_IST, but only the MCE and DB variants are emitting ASM code as the NMI entry needs hand-crafted ASM still. The function definitions do not contain any idtentry_enter/exit() calls as these exceptions need special treatment. 
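To illustrate how the pieces fit together once a user exists (abbreviated sketch; the names match the #MC conversion done later in this series):

  DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check);

expands on 64bit to, among other declarations, the C prototype

  __visible void exc_machine_check(struct pt_regs *regs);

and on the ASM side to

  idtentry_mce_db X86_TRAP_MC asm_exc_machine_check exc_machine_check

while the handler body itself is written as

  DEFINE_IDTENTRY_MCE(exc_machine_check) { ... }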
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.137125609@linutronix.de --- arch/x86/include/asm/idtentry.h | 54 +++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 3dc4d5b246d315..3edd6d011ecd59 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -132,6 +132,42 @@ static __always_inline void __##func(struct pt_regs *regs, \ #define DEFINE_IDTENTRY_RAW(func) \ __visible noinstr void func(struct pt_regs *regs) +#ifdef CONFIG_X86_64 +/** + * DECLARE_IDTENTRY_IST - Declare functions for IST handling IDT entry points + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Maps to DECLARE_IDTENTRY_RAW + */ +#define DECLARE_IDTENTRY_IST(vector, func) \ + DECLARE_IDTENTRY_RAW(vector, func) + +/** + * DEFINE_IDTENTRY_IST - Emit code for IST entry points + * @func: Function name of the entry point + * + * Maps to DEFINE_IDTENTRY_RAW + */ +#define DEFINE_IDTENTRY_IST(func) \ + DEFINE_IDTENTRY_RAW(func) + +#else /* CONFIG_X86_64 */ +/* Maps to a regular IDTENTRY on 32bit for now */ +# define DECLARE_IDTENTRY_IST DECLARE_IDTENTRY +# define DEFINE_IDTENTRY_IST DEFINE_IDTENTRY +#endif /* !CONFIG_X86_64 */ + +/* C-Code mapping */ +#define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST +#define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST + +#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_IST +#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_IST + +#define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST +#define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST + #else /* !__ASSEMBLY__ */ /* @@ -149,6 +185,24 @@ __visible noinstr void func(struct pt_regs *regs) #define DECLARE_IDTENTRY_RAW(vector, func) \ DECLARE_IDTENTRY(vector, func) +#ifdef CONFIG_X86_64 +# define DECLARE_IDTENTRY_MCE(vector, func) \ + idtentry_mce_db vector asm_##func func + +# define DECLARE_IDTENTRY_DEBUG(vector, func) \ + idtentry_mce_db vector asm_##func func + +#else +# define DECLARE_IDTENTRY_MCE(vector, func) \ + DECLARE_IDTENTRY(vector, func) + +# define DECLARE_IDTENTRY_DEBUG(vector, func) \ + DECLARE_IDTENTRY(vector, func) +#endif + +/* No ASM code emitted for NMI */ +#define DECLARE_IDTENTRY_NMI(vector, func) + #endif /* __ASSEMBLY__ */ /* From 94a46d316f2b54e3de8a4fa884cb16383db7fcd8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Apr 2020 22:37:31 +0200 Subject: [PATCH 095/190] x86/mce: Move nmi_enter/exit() into the entry point There is no reason to have nmi_enter/exit() in the actual MCE handlers. Move it to the entry point. This also covers the until now uncovered initial handler which only prints. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.243936614@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 26 +++++++++++++------------- arch/x86/kernel/cpu/mce/p5.c | 4 ---- arch/x86/kernel/cpu/mce/winchip.c | 4 ---- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index e9265e2f28c963..f5993ed6e16b86 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1100,8 +1100,10 @@ static void mce_clear_state(unsigned long *toclear) * kdump kernel establishing a new #MC handler where a broadcasted MCE * might not get handled properly. 
*/ -static bool __mc_check_crashing_cpu(int cpu) +static noinstr bool mce_check_crashing_cpu(void) { + unsigned int cpu = smp_processor_id(); + if (cpu_is_offline(cpu) || (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; @@ -1235,7 +1237,6 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); struct mca_config *cfg = &mca_cfg; - int cpu = smp_processor_id(); struct mce m, *final; char *msg = NULL; int worst = 0; @@ -1264,11 +1265,6 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code) */ int lmce = 1; - if (__mc_check_crashing_cpu(cpu)) - return; - - nmi_enter(); - this_cpu_inc(mce_exception_count); mce_gather_info(&m, regs); @@ -1356,7 +1352,7 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code) sync_core(); if (worst != MCE_AR_SEVERITY && !kill_it) - goto out_ist; + return; /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { @@ -1373,9 +1369,6 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code) if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) mce_panic("Failed kernel mode recovery", &m, msg); } - -out_ist: - nmi_exit(); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1912,11 +1905,18 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; -dotraplinkage notrace void do_mce(struct pt_regs *regs, long error_code) +dotraplinkage noinstr void do_mce(struct pt_regs *regs, long error_code) { + if (machine_check_vector == do_machine_check && + mce_check_crashing_cpu()) + return; + + nmi_enter(); + machine_check_vector(regs, error_code); + + nmi_exit(); } -NOKPROBE_SYMBOL(do_mce); /* * Called for each booted CPU to set up machine checks. 
diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index 5ee94aa1b7668b..dc29f0f7b3ed8b 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -25,8 +25,6 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) { u32 loaddr, hi, lotype; - nmi_enter(); - rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -39,8 +37,6 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) } add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - - nmi_exit(); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index b3938c195365c1..3f8f84ba0f51c9 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -19,12 +19,8 @@ /* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { - nmi_enter(); - pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - - nmi_exit(); } /* Set up machine check reporting on the Winchip C6 series */ From 8cd501c1facc159dff6db63775151c9200a3ea1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:23 +0100 Subject: [PATCH 096/190] x86/entry: Convert Machine Check to IDTENTRY_IST Convert #MC to IDTENTRY_MCE: - Implement the C entry points with DEFINE_IDTENTRY_MCE - Emit the ASM stub with DECLARE_IDTENTRY_MCE - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes - Remove the error code from *machine_check_vector() as it is always 0 and not used by any of the functions it can point to. Fixup all the functions as well. No functional change. 
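The entry point then boils down to (condensed from the diff below):

  DEFINE_IDTENTRY_MCE(exc_machine_check)
  {
          if (machine_check_vector == do_machine_check &&
              mce_check_crashing_cpu())
                  return;

          if (user_mode(regs))
                  idtentry_enter(regs);
          else
                  nmi_enter();

          machine_check_vector(regs);

          if (user_mode(regs))
                  idtentry_exit(regs);
          else
                  nmi_exit();
  }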
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.334980426@linutronix.de --- arch/x86/entry/entry_32.S | 9 --------- arch/x86/entry/entry_64.S | 3 --- arch/x86/include/asm/idtentry.h | 4 ++++ arch/x86/include/asm/mce.h | 2 +- arch/x86/include/asm/traps.h | 7 ------- arch/x86/kernel/cpu/mce/core.c | 23 ++++++++++++++--------- arch/x86/kernel/cpu/mce/inject.c | 4 ++-- arch/x86/kernel/cpu/mce/internal.h | 2 +- arch/x86/kernel/cpu/mce/p5.c | 2 +- arch/x86/kernel/cpu/mce/winchip.c | 2 +- arch/x86/kernel/idt.c | 10 +++++----- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 15 files changed, 33 insertions(+), 43 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b9b0ddb53a0828..4dd3d706d9fcbb 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1297,15 +1297,6 @@ SYM_CODE_START(native_iret) SYM_CODE_END(native_iret) #endif -#ifdef CONFIG_X86_MCE -SYM_CODE_START(machine_check) - ASM_CLAC - pushl $0 - pushl $do_mce - jmp common_exception -SYM_CODE_END(machine_check) -#endif - #ifdef CONFIG_XEN_PV SYM_FUNC_START(xen_hypervisor_callback) /* diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 69ddd052aef211..5007b975ca7cc5 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1074,9 +1074,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt idtentry X86_TRAP_PF page_fault do_page_fault has_error_code=1 -#ifdef CONFIG_X86_MCE -idtentry_mce_db X86_TRAP_MCE machine_check do_mce -#endif idtentry_mce_db X86_TRAP_DB debug do_debug idtentry_df X86_TRAP_DF double_fault do_double_fault diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 3edd6d011ecd59..36fe964c0d53ee 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -238,4 +238,8 @@ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC, exc_alignment_check); /* Raw exception entries which need extra work */ DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); +#ifdef CONFIG_X86_MCE +DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); +#endif + #endif diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index f9cea081c05b01..a00130112b0245 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -238,7 +238,7 @@ extern void mce_disable_bank(int bank); /* * Exception handler */ -void do_machine_check(struct pt_regs *, long); +void do_machine_check(struct pt_regs *pt_regs); /* * Threshold handler diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 698285a2b6608b..6096db912625fc 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -18,25 +18,18 @@ asmlinkage void double_fault(void); #endif asmlinkage void page_fault(void); asmlinkage void async_page_fault(void); -#ifdef CONFIG_X86_MCE -asmlinkage void machine_check(void); -#endif /* CONFIG_X86_MCE */ #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_double_fault(void); asmlinkage void xen_page_fault(void); -#ifdef CONFIG_X86_MCE -asmlinkage void xen_machine_check(void); -#endif /* CONFIG_X86_MCE */ #endif dotraplinkage void do_debug(struct pt_regs *regs, long error_code); dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct 
pt_regs *regs, long error_code, unsigned long cr2); dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); -dotraplinkage void do_mce(struct pt_regs *regs, long error_code); #ifdef CONFIG_X86_64 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index f5993ed6e16b86..842dd03c3918cc 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1232,7 +1232,7 @@ static void kill_me_maybe(struct callback_head *cb) * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. */ -void noinstr do_machine_check(struct pt_regs *regs, long error_code) +void noinstr do_machine_check(struct pt_regs *regs) { DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); @@ -1366,7 +1366,7 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code) current->mce_kill_me.func = kill_me_now; task_work_add(current, ¤t->mce_kill_me, true); } else { - if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) + if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) mce_panic("Failed kernel mode recovery", &m, msg); } } @@ -1895,27 +1895,32 @@ bool filter_mce(struct mce *m) } /* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) +static void unexpected_machine_check(struct pt_regs *regs) { pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", smp_processor_id()); } /* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; +void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; -dotraplinkage noinstr void do_mce(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_MCE(exc_machine_check) { if (machine_check_vector == do_machine_check && mce_check_crashing_cpu()) return; - nmi_enter(); + if (user_mode(regs)) + idtentry_enter(regs); + else + nmi_enter(); - machine_check_vector(regs, error_code); + machine_check_vector(regs); - nmi_exit(); + if (user_mode(regs)) + idtentry_exit(regs); + else + nmi_exit(); } /* diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 3413b41b8d55f2..0593b192eb8fae 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -146,9 +146,9 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) regs.cs = m->cs; pregs = ®s; } - /* in mcheck exeception handler, irq will be disabled */ + /* do_machine_check() expects interrupts disabled -- at least */ local_irq_save(flags); - do_machine_check(pregs, 0); + do_machine_check(pregs); local_irq_restore(flags); m->finished = 0; } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 3b008172ad73d5..b74ca4a28c6670 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -9,7 +9,7 @@ #include /* Pointer to the installed machine check handler for this CPU setup. 
*/ -extern void (*machine_check_vector)(struct pt_regs *, long error_code); +extern void (*machine_check_vector)(struct pt_regs *); enum severity_level { MCE_NO_SEVERITY, diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index dc29f0f7b3ed8b..eaebc4ce7398db 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -21,7 +21,7 @@ int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ -static void pentium_machine_check(struct pt_regs *regs, long error_code) +static void pentium_machine_check(struct pt_regs *regs) { u32 loaddr, hi, lotype; diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index 3f8f84ba0f51c9..90e3d60c645e95 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -17,7 +17,7 @@ #include "internal.h" /* Machine check handler for WinChip C6: */ -static void winchip_machine_check(struct pt_regs *regs, long error_code) +static void winchip_machine_check(struct pt_regs *regs) { pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 9ca8af65a2123f..6b93840784d527 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -96,7 +96,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DB, debug), #ifdef CONFIG_X86_MCE - INTG(X86_TRAP_MC, machine_check), + INTG(X86_TRAP_MC, asm_exc_machine_check), #endif SYSG(X86_TRAP_OF, asm_exc_overflow), @@ -185,11 +185,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), - ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), - ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), + ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, machine_check, IST_INDEX_MCE), + ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), #endif }; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9e333b91ff7828..7502cd65528fb3 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1837,7 +1837,7 @@ static void kvm_machine_check(void) .flags = X86_EFLAGS_IF, }; - do_machine_check(®s, 0); + do_machine_check(®s); #endif } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 170cc76a581f4a..2b5ba6063a2d62 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4683,7 +4683,7 @@ static void kvm_machine_check(void) .flags = X86_EFLAGS_IF, }; - do_machine_check(®s, 0); + do_machine_check(®s); #endif } diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5bcd86c4dff045..267583f9b20749 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -613,7 +613,7 @@ static struct trap_array_entry trap_array[] = { { debug, xen_xendebug, true }, { double_fault, xen_double_fault, true }, #ifdef CONFIG_X86_MCE - { machine_check, xen_machine_check, true }, + TRAP_ENTRY(exc_machine_check, true ), #endif { nmi, xen_xennmi, true }, TRAP_ENTRY(exc_int3, false ), diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 44f55569a37f56..617ef3f981604e 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -48,7 +48,7 @@ xen_pv_trap asm_exc_spurious_interrupt_bug xen_pv_trap asm_exc_coprocessor_error xen_pv_trap 
asm_exc_alignment_check #ifdef CONFIG_X86_MCE -xen_pv_trap machine_check +xen_pv_trap asm_exc_machine_check #endif /* CONFIG_X86_MCE */ xen_pv_trap asm_exc_simd_coprocessor_error #ifdef CONFIG_IA32_EMULATION From aedbdeab00dcfcc6d751f9fb1b4896b01911d494 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Apr 2020 15:39:13 +0200 Subject: [PATCH 097/190] x86/mce: Use untraced rd/wrmsr in the MCE offline/crash check mce_check_crashing_cpu() is called right at the entry of the MCE handler. It uses mce_rdmsr() and mce_wrmsr() which are wrappers around rdmsr() and wrmsr() to handle the MCE error injection mechanism, which is pointless in this context, i.e. when the MCE hits an offline CPU or the system is already marked crashing. The MSR access can also be traced, so use the untraceable variants. This is also safe vs. XEN paravirt as these MSRs are not affected by XEN PV modifications. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.426347351@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 842dd03c3918cc..3177652451905e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1108,7 +1108,7 @@ static noinstr bool mce_check_crashing_cpu(void) (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; - mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS); if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { if (mcgstatus & MCG_STATUS_LMCES) @@ -1116,7 +1116,7 @@ static noinstr bool mce_check_crashing_cpu(void) } if (mcgstatus & MCG_STATUS_RIPV) { - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + __wrmsr(MSR_IA32_MCG_STATUS, 0, 0); return true; } } From 9cce81cff748ef0e79b41c6e37d7137267f1212f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:24 +0100 Subject: [PATCH 098/190] x86/idtentry: Provide IDTENTRY_XEN for XEN/PV XEN/PV has special wrappers for NMI and DB exceptions. They redirect these exceptions through regular IDTENTRY points. Provide the necessary IDTENTRY macros to make this work Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.518622698@linutronix.de --- arch/x86/include/asm/idtentry.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 36fe964c0d53ee..2315eecf04fdca 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -168,6 +168,18 @@ __visible noinstr void func(struct pt_regs *regs) #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST +/** + * DECLARE_IDTENTRY_XEN - Declare functions for XEN redirect IDT entry points + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Used for xennmi and xendebug redirections. No DEFINE as this is all ASM + * indirection magic. 
+ */ +#define DECLARE_IDTENTRY_XEN(vector, func) \ + asmlinkage void xen_asm_exc_xen##func(void); \ + asmlinkage void asm_exc_xen##func(void) + #else /* !__ASSEMBLY__ */ /* @@ -203,6 +215,10 @@ __visible noinstr void func(struct pt_regs *regs) /* No ASM code emitted for NMI */ #define DECLARE_IDTENTRY_NMI(vector, func) +/* XEN NMI and DB wrapper */ +#define DECLARE_IDTENTRY_XEN(vector, func) \ + idtentry vector asm_exc_xen##func exc_##func has_error_code=0 sane=1 + #endif /* __ASSEMBLY__ */ /* From 6271fef00b3489690e52ce95edbc378357513547 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:25 +0100 Subject: [PATCH 099/190] x86/entry: Convert NMI to IDTENTRY_NMI Convert #NMI to IDTENTRY_NMI: - Implement the C entry point with DEFINE_IDTENTRY_NMI - Fixup the XEN/PV code - Remove the old prototypes No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.609932306@linutronix.de --- arch/x86/entry/entry_32.S | 8 ++++---- arch/x86/entry/entry_64.S | 15 +++++++-------- arch/x86/include/asm/idtentry.h | 4 ++++ arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 4 ++-- arch/x86/kernel/nmi.c | 4 +--- arch/x86/xen/enlighten_pv.c | 7 ++++++- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 25 insertions(+), 22 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4dd3d706d9fcbb..d4961cac73f64d 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1545,7 +1545,7 @@ SYM_CODE_END(double_fault) * switched stacks. We handle both conditions by simply checking whether we * interrupted kernel code running on the SYSENTER stack. */ -SYM_CODE_START(nmi) +SYM_CODE_START(asm_exc_nmi) ASM_CLAC #ifdef CONFIG_X86_ESPFIX32 @@ -1574,7 +1574,7 @@ SYM_CODE_START(nmi) jb .Lnmi_from_sysenter_stack /* Not on SYSENTER stack. */ - call do_nmi + call exc_nmi jmp .Lnmi_return .Lnmi_from_sysenter_stack: @@ -1584,7 +1584,7 @@ SYM_CODE_START(nmi) */ movl %esp, %ebx movl PER_CPU_VAR(cpu_current_top_of_stack), %esp - call do_nmi + call exc_nmi movl %ebx, %esp .Lnmi_return: @@ -1638,7 +1638,7 @@ SYM_CODE_START(nmi) lss (1+5+6)*4(%esp), %esp # back to espfix stack jmp .Lirq_return #endif -SYM_CODE_END(nmi) +SYM_CODE_END(asm_exc_nmi) .pushsection .text, "ax" SYM_CODE_START(rewind_stack_do_exit) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5007b975ca7cc5..3d7f2cc29be370 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1079,7 +1079,6 @@ idtentry_df X86_TRAP_DF double_fault do_double_fault #ifdef CONFIG_XEN_PV idtentry 512 /* dummy */ hypervisor_callback xen_do_hypervisor_callback has_error_code=0 -idtentry X86_TRAP_NMI xennmi do_nmi has_error_code=0 idtentry X86_TRAP_DB xendebug do_debug has_error_code=0 #endif @@ -1414,7 +1413,7 @@ SYM_CODE_END(error_return) * %r14: Used to save/restore the CR3 of the interrupted context * when PAGE_TABLE_ISOLATION is in use. Do not clobber. */ -SYM_CODE_START(nmi) +SYM_CODE_START(asm_exc_nmi) UNWIND_HINT_IRET_REGS /* @@ -1499,7 +1498,7 @@ SYM_CODE_START(nmi) movq %rsp, %rdi movq $-1, %rsi - call do_nmi + call exc_nmi /* * Return back to user mode. We must *not* do the normal exit @@ -1556,7 +1555,7 @@ SYM_CODE_START(nmi) * end_repeat_nmi, then we are a nested NMI. We must not * modify the "iret" frame because it's being written by * the outer NMI. 
That's okay; the outer NMI handler is - * about to about to call do_nmi anyway, so we can just + * about to about to call exc_nmi() anyway, so we can just * resume the outer NMI. */ @@ -1675,7 +1674,7 @@ repeat_nmi: * RSP is pointing to "outermost RIP". gsbase is unknown, but, if * we're repeating an NMI, gsbase has the same value that it had on * the first iteration. paranoid_entry will load the kernel - * gsbase if needed before we call do_nmi. "NMI executing" + * gsbase if needed before we call exc_nmi(). "NMI executing" * is zero. */ movq $1, 10*8(%rsp) /* Set "NMI executing". */ @@ -1709,10 +1708,10 @@ end_repeat_nmi: call paranoid_entry UNWIND_HINT_REGS - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + /* paranoidentry exc_nmi(), 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi movq $-1, %rsi - call do_nmi + call exc_nmi /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 @@ -1749,7 +1748,7 @@ nmi_restore: * about espfix64 on the way back to kernel mode. */ iretq -SYM_CODE_END(nmi) +SYM_CODE_END(asm_exc_nmi) #ifndef CONFIG_IA32_EMULATION /* diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 2315eecf04fdca..1f067e6c4051ad 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -258,4 +258,8 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); #endif +/* NMI */ +DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); +DECLARE_IDTENTRY_XEN(X86_TRAP_NMI, nmi); + #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 6096db912625fc..57b83ae19c15f2 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -12,7 +12,6 @@ #define dotraplinkage __visible asmlinkage void debug(void); -asmlinkage void nmi(void); #ifdef CONFIG_X86_64 asmlinkage void double_fault(void); #endif @@ -20,14 +19,12 @@ asmlinkage void page_fault(void); asmlinkage void async_page_fault(void); #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) -asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_double_fault(void); asmlinkage void xen_page_fault(void); #endif dotraplinkage void do_debug(struct pt_regs *regs, long error_code); -dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 6b93840784d527..d3fecd88677c46 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -74,7 +74,7 @@ static const __initconst struct idt_data early_idts[] = { */ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DE, asm_exc_divide_error), - INTG(X86_TRAP_NMI, nmi), + INTG(X86_TRAP_NMI, asm_exc_nmi), INTG(X86_TRAP_BR, asm_exc_bounds), INTG(X86_TRAP_UD, asm_exc_invalid_op), INTG(X86_TRAP_NM, asm_exc_device_not_available), @@ -186,7 +186,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; */ static const __initconst struct idt_data ist_idts[] = { ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), - ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 
bdcc5146de96cd..3b05cc802abbaa 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -503,8 +503,7 @@ static bool notrace is_debug_stack(unsigned long addr) NOKPROBE_SYMBOL(is_debug_stack); #endif -dotraplinkage notrace void -do_nmi(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_NMI(exc_nmi) { if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id())) return; @@ -554,7 +553,6 @@ do_nmi(struct pt_regs *regs, long error_code) if (user_mode(regs)) mds_user_clear_cpu_buffers(); } -NOKPROBE_SYMBOL(do_nmi); void stop_nmi(void) { diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 267583f9b20749..0d6c2789e6768c 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -609,13 +609,18 @@ struct trap_array_entry { .xen = xen_asm_##func, \ .ist_okay = ist_ok } +#define TRAP_ENTRY_REDIR(func, xenfunc, ist_ok) { \ + .orig = asm_##func, \ + .xen = xen_asm_##xenfunc, \ + .ist_okay = ist_ok } + static struct trap_array_entry trap_array[] = { { debug, xen_xendebug, true }, { double_fault, xen_double_fault, true }, #ifdef CONFIG_X86_MCE TRAP_ENTRY(exc_machine_check, true ), #endif - { nmi, xen_xennmi, true }, + TRAP_ENTRY_REDIR(exc_nmi, exc_xennmi, true ), TRAP_ENTRY(exc_int3, false ), TRAP_ENTRY(exc_overflow, false ), #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 617ef3f981604e..04fa01b096ee3a 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -32,7 +32,7 @@ xen_pv_trap asm_exc_divide_error xen_pv_trap debug xen_pv_trap xendebug xen_pv_trap asm_exc_int3 -xen_pv_trap xennmi +xen_pv_trap asm_exc_xennmi xen_pv_trap asm_exc_overflow xen_pv_trap asm_exc_bounds xen_pv_trap asm_exc_invalid_op From f051f697955049c7cf10a635ab8149aa619243b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Apr 2020 15:55:06 +0200 Subject: [PATCH 100/190] x86/nmi: Protect NMI entry against instrumentation Mark all functions in the fragile code parts noinstr or force inlining so they can't be instrumented. Also make the hardware latency tracer invocation explicit outside of the non-instrumentable section. 
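The resulting pattern is always the same (sketch; default_do_nmi() in the diff below is the real instance):

  static noinstr void handler(struct pt_regs *regs)
  {
          /* Fragile work: no tracing, no kprobes, no instrumentation */

          instrumentation_begin();
          /* The bulk of the handler; instrumentation is fine here */
          instrumentation_end();

          /* Fragile again until the handler returns */
  }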
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.716186134@linutronix.de --- arch/x86/include/asm/desc.h | 8 ++++---- arch/x86/kernel/cpu/common.c | 6 ++---- arch/x86/kernel/nmi.c | 15 +++++++++------ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 085a2dd312b47d..d6c3d346c63af4 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -214,7 +214,7 @@ static inline void native_load_gdt(const struct desc_ptr *dtr) asm volatile("lgdt %0"::"m" (*dtr)); } -static inline void native_load_idt(const struct desc_ptr *dtr) +static __always_inline void native_load_idt(const struct desc_ptr *dtr) { asm volatile("lidt %0"::"m" (*dtr)); } @@ -392,7 +392,7 @@ extern unsigned long system_vectors[]; #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u32, debug_idt_ctr); -static inline bool is_debug_idt_enabled(void) +static __always_inline bool is_debug_idt_enabled(void) { if (this_cpu_read(debug_idt_ctr)) return true; @@ -400,7 +400,7 @@ static inline bool is_debug_idt_enabled(void) return false; } -static inline void load_debug_idt(void) +static __always_inline void load_debug_idt(void) { load_idt((const struct desc_ptr *)&debug_idt_descr); } @@ -422,7 +422,7 @@ static inline void load_debug_idt(void) * that doesn't need to disable interrupts, as nothing should be * bothering the CPU then. */ -static inline void load_current_idt(void) +static __always_inline void load_current_idt(void) { if (is_debug_idt_enabled()) load_debug_idt(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8be042df12c30b..f4645f9ff9cb55 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1709,21 +1709,19 @@ void syscall_init(void) DEFINE_PER_CPU(int, debug_stack_usage); DEFINE_PER_CPU(u32, debug_idt_ctr); -void debug_stack_set_zero(void) +noinstr void debug_stack_set_zero(void) { this_cpu_inc(debug_idt_ctr); load_current_idt(); } -NOKPROBE_SYMBOL(debug_stack_set_zero); -void debug_stack_reset(void) +noinstr void debug_stack_reset(void) { if (WARN_ON(!this_cpu_read(debug_idt_ctr))) return; if (this_cpu_dec_return(debug_idt_ctr) == 0) load_current_idt(); } -NOKPROBE_SYMBOL(debug_stack_reset); #else /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 3b05cc802abbaa..3052c78f03aa27 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -303,7 +303,7 @@ NOKPROBE_SYMBOL(unknown_nmi_error); static DEFINE_PER_CPU(bool, swallow_nmi); static DEFINE_PER_CPU(unsigned long, last_nmi_rip); -static void default_do_nmi(struct pt_regs *regs) +static noinstr void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int handled; @@ -329,6 +329,8 @@ static void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); + instrumentation_begin(); + handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); if (handled) { @@ -342,7 +344,7 @@ static void default_do_nmi(struct pt_regs *regs) */ if (handled > 1) __this_cpu_write(swallow_nmi, true); - return; + goto out; } /* @@ -374,7 +376,7 @@ static void default_do_nmi(struct pt_regs *regs) #endif __this_cpu_add(nmi_stats.external, 1); raw_spin_unlock(&nmi_reason_lock); - return; + goto out; } raw_spin_unlock(&nmi_reason_lock); @@ -412,8 +414,10 @@ static void default_do_nmi(struct pt_regs *regs) __this_cpu_add(nmi_stats.swallow, 1); else 
unknown_nmi_error(reason, regs); + +out: + instrumentation_end(); } -NOKPROBE_SYMBOL(default_do_nmi); /* * NMIs can page fault or hit breakpoints which will cause it to lose @@ -485,7 +489,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_cr2); */ static DEFINE_PER_CPU(int, update_debug_stack); -static bool notrace is_debug_stack(unsigned long addr) +static noinstr bool is_debug_stack(unsigned long addr) { struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks); unsigned long top = CEA_ESTACK_TOP(cs, DB); @@ -500,7 +504,6 @@ static bool notrace is_debug_stack(unsigned long addr) */ return addr >= bot && addr < top; } -NOKPROBE_SYMBOL(is_debug_stack); #endif DEFINE_IDTENTRY_NMI(exc_nmi) From 9f58fdde95c9509a4ded27a6d0035e79294002b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2020 21:02:56 +0200 Subject: [PATCH 101/190] x86/db: Split out dr6/7 handling DR6/7 should be handled before nmi_enter() is invoked and restored after nmi_exit() to minimize the exposure. Split it out into helper inlines and bring it into the correct order. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.808628211@linutronix.de --- arch/x86/kernel/hw_breakpoint.c | 6 +-- arch/x86/kernel/traps.c | 75 ++++++++++++++++++++++++--------- 2 files changed, 57 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index d42fc0eaf193c8..9ddf441ccaa87d 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -464,7 +464,7 @@ static int hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; struct perf_event *bp; - unsigned long dr7, dr6; + unsigned long dr6; unsigned long *dr6_p; /* The DR6 value is pointed by args->err */ @@ -479,9 +479,6 @@ static int hw_breakpoint_handler(struct die_args *args) if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; - get_debugreg(dr7, 7); - /* Disable breakpoints during exception handling */ - set_debugreg(0UL, 7); /* * Assert that local interrupts are disabled * Reset the DRn bits in the virtualized register value. @@ -538,7 +535,6 @@ static int hw_breakpoint_handler(struct die_args *args) (dr6 & (~DR_TRAP_BITS))) rc = NOTIFY_DONE; - set_debugreg(dr7, 7); put_cpu(); return rc; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 21c8cfce24d314..de5120e2fbe126 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -700,6 +700,57 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) #endif } +static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7) +{ + /* + * Disable breakpoints during exception handling; recursive exceptions + * are exceedingly 'fun'. + * + * Since this function is NOKPROBE, and that also applies to + * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a + * HW_BREAKPOINT_W on our stack) + * + * Entry text is excluded for HW_BP_X and cpu_entry_area, which + * includes the entry stack is excluded for everything. + */ + get_debugreg(*dr7, 7); + set_debugreg(0, 7); + + /* + * Ensure the compiler doesn't lower the above statements into + * the critical section; disabling breakpoints late would not + * be good. + */ + barrier(); + + /* + * The Intel SDM says: + * + * Certain debug exceptions may clear bits 0-3. The remaining + * contents of the DR6 register are never cleared by the + * processor. 
To avoid confusion in identifying debug + * exceptions, debug handlers should clear the register before + * returning to the interrupted task. + * + * Keep it simple: clear DR6 immediately. + */ + get_debugreg(*dr6, 6); + set_debugreg(0, 6); + /* Filter out all the reserved bits which are preset to 1 */ + *dr6 &= ~DR6_RESERVED; +} + +static __always_inline void debug_exit(unsigned long dr7) +{ + /* + * Ensure the compiler doesn't raise this statement into + * the critical section; enabling breakpoints early would + * not be good. + */ + barrier(); + set_debugreg(dr7, 7); +} + /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore @@ -727,28 +778,13 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) dotraplinkage void do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; + unsigned long dr6, dr7; int user_icebp = 0; - unsigned long dr6; int si_code; - nmi_enter(); - - get_debugreg(dr6, 6); - /* - * The Intel SDM says: - * - * Certain debug exceptions may clear bits 0-3. The remaining - * contents of the DR6 register are never cleared by the - * processor. To avoid confusion in identifying debug - * exceptions, debug handlers should clear the register before - * returning to the interrupted task. - * - * Keep it simple: clear DR6 immediately. - */ - set_debugreg(0, 6); + debug_enter(&dr6, &dr7); - /* Filter out all the reserved bits which are preset to 1 */ - dr6 &= ~DR6_RESERVED; + nmi_enter(); /* * The SDM says "The processor clears the BTF flag when it @@ -786,7 +822,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) #endif if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code, - SIGTRAP) == NOTIFY_STOP) + SIGTRAP) == NOTIFY_STOP) goto exit; /* @@ -825,6 +861,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) exit: nmi_exit(); + debug_exit(dr7); } NOKPROBE_SYMBOL(do_debug); From 2bbc68f8373c0631ebf137f376fbea00e8086be7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:26 +0100 Subject: [PATCH 102/190] x86/entry: Convert Debug exception to IDTENTRY_DB Convert #DB to IDTENTRY_DB: - Implement the C entry point with DEFINE_IDTENTRY_DB - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change. 
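The entry point gets the same user/kernel split as #BP and #MC, with the DR6/DR7 handling from the previous patch wrapped around it (condensed from the diff below):

  DEFINE_IDTENTRY_DEBUG(exc_debug)
  {
          unsigned long dr6, dr7;

          debug_enter(&dr6, &dr7);        /* read/clear DR6, disable DR7 */

          if (user_mode(regs))
                  idtentry_enter(regs);
          else
                  nmi_enter();

          /* ... kgdb/kprobes/notify_die, SIGTRAP delivery ... */

          if (user_mode(regs))
                  idtentry_exit(regs);
          else
                  nmi_exit();

          debug_exit(dr7);                /* restore DR7 */
  }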
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.900297476@linutronix.de --- arch/x86/entry/entry_32.S | 10 ---------- arch/x86/entry/entry_64.S | 2 -- arch/x86/include/asm/idtentry.h | 4 ++++ arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/idt.c | 8 ++++---- arch/x86/kernel/traps.c | 21 +++++++++++++-------- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 4 ++-- 8 files changed, 24 insertions(+), 30 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index d4961cac73f64d..30c6ed3d7c52c7 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1488,16 +1488,6 @@ ret_to_user: jmp restore_all_switch_stack SYM_CODE_END(handle_exception) -SYM_CODE_START(debug) - /* - * Entry from sysenter is now handled in common_exception - */ - ASM_CLAC - pushl $0 - pushl $do_debug - jmp common_exception -SYM_CODE_END(debug) - SYM_CODE_START(double_fault) 1: /* diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3d7f2cc29be370..f47629a7f8e641 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1074,12 +1074,10 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt idtentry X86_TRAP_PF page_fault do_page_fault has_error_code=1 -idtentry_mce_db X86_TRAP_DB debug do_debug idtentry_df X86_TRAP_DF double_fault do_double_fault #ifdef CONFIG_XEN_PV idtentry 512 /* dummy */ hypervisor_callback xen_do_hypervisor_callback has_error_code=0 -idtentry X86_TRAP_DB xendebug do_debug has_error_code=0 #endif /* diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 1f067e6c4051ad..fcd4230a979b68 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -262,4 +262,8 @@ DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); DECLARE_IDTENTRY_XEN(X86_TRAP_NMI, nmi); +/* #DB */ +DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug); +DECLARE_IDTENTRY_XEN(X86_TRAP_DB, debug); + #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 57b83ae19c15f2..9bd602d0130d55 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -11,7 +11,6 @@ #define dotraplinkage __visible -asmlinkage void debug(void); #ifdef CONFIG_X86_64 asmlinkage void double_fault(void); #endif @@ -19,12 +18,10 @@ asmlinkage void page_fault(void); asmlinkage void async_page_fault(void); #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) -asmlinkage void xen_xendebug(void); asmlinkage void xen_double_fault(void); asmlinkage void xen_page_fault(void); #endif -dotraplinkage void do_debug(struct pt_regs *regs, long error_code); dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index d3fecd88677c46..ddf3f3db3235c2 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -59,7 +59,7 @@ static bool idt_setup_done __initdata; * stacks work only after cpu_init(). 
*/ static const __initconst struct idt_data early_idts[] = { - INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_DB, asm_exc_debug), SYSG(X86_TRAP_BP, asm_exc_int3), #ifdef CONFIG_X86_32 INTG(X86_TRAP_PF, page_fault), @@ -93,7 +93,7 @@ static const __initconst struct idt_data def_idts[] = { #else INTG(X86_TRAP_DF, double_fault), #endif - INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_DB, asm_exc_debug), #ifdef CONFIG_X86_MCE INTG(X86_TRAP_MC, asm_exc_machine_check), @@ -164,7 +164,7 @@ static const __initconst struct idt_data early_pf_idts[] = { * stack set to DEFAULT_STACK (0). Required for NMI trap handling. */ static const __initconst struct idt_data dbg_idts[] = { - INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_DB, asm_exc_debug), }; #endif @@ -185,7 +185,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), + ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index de5120e2fbe126..569408a681b6a5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -775,7 +775,7 @@ static __always_inline void debug_exit(unsigned long dr7) * * May run on IST stack. */ -dotraplinkage void do_debug(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_DEBUG(exc_debug) { struct task_struct *tsk = current; unsigned long dr6, dr7; @@ -784,7 +784,10 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_enter(&dr6, &dr7); - nmi_enter(); + if (user_mode(regs)) + idtentry_enter(regs); + else + nmi_enter(); /* * The SDM says "The processor clears the BTF flag when it @@ -821,7 +824,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) goto exit; #endif - if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code, + if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0, SIGTRAP) == NOTIFY_STOP) goto exit; @@ -835,8 +838,8 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) cond_local_irq_enable(regs); if (v8086_mode(regs)) { - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, - X86_TRAP_DB); + handle_vm86_trap((struct kernel_vm86_regs *) regs, 0, + X86_TRAP_DB); cond_local_irq_disable(regs); debug_stack_usage_dec(); goto exit; @@ -855,15 +858,17 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) } si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) - send_sigtrap(regs, error_code, si_code); + send_sigtrap(regs, 0, si_code); cond_local_irq_disable(regs); debug_stack_usage_dec(); exit: - nmi_exit(); + if (user_mode(regs)) + idtentry_exit(regs); + else + nmi_exit(); debug_exit(dr7); } -NOKPROBE_SYMBOL(do_debug); /* * Note that we play around with the 'TS' bit in an attempt to get diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 0d6c2789e6768c..376851d1039a29 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -615,7 +615,7 @@ struct trap_array_entry { .ist_okay = ist_ok } static struct trap_array_entry trap_array[] = { - { debug, xen_xendebug, true }, + TRAP_ENTRY_REDIR(exc_debug, exc_xendebug, true ), { double_fault, xen_double_fault, true }, #ifdef CONFIG_X86_MCE TRAP_ENTRY(exc_machine_check, true ), diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 
04fa01b096ee3a..9999ea3774762e 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -29,8 +29,8 @@ _ASM_NOKPROBE(xen_\name) .endm xen_pv_trap asm_exc_divide_error -xen_pv_trap debug -xen_pv_trap xendebug +xen_pv_trap asm_exc_debug +xen_pv_trap asm_exc_xendebug xen_pv_trap asm_exc_int3 xen_pv_trap asm_exc_xennmi xen_pv_trap asm_exc_overflow From df7ccaffd2035ebd4bfbb2d980e5817c31f4a891 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:27 +0100 Subject: [PATCH 103/190] x86/entry/64: Remove error code clearing from #DB and #MCE ASM stub The C entry points do not expect an error code. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.992621707@linutronix.de --- arch/x86/entry/entry_64.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f47629a7f8e641..eeb428582d3da6 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -642,7 +642,6 @@ SYM_CODE_START(\asmsym) .endif movq %rsp, %rdi /* pt_regs pointer */ - xorl %esi, %esi /* Clear the error code */ .if \vector == X86_TRAP_DB subq $DB_STACK_OFFSET, CPU_TSS_IST(IST_INDEX_DB) From f08e32ec3cfcf9e6d3640007de590c225ab2e201 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:28 +0100 Subject: [PATCH 104/190] x86/idtentry: Provide IDTENTRY_NOIST variants for #DB and #MC Provide NOIST entry point macros which allow implementing NOIST variants of the C entry points. These are invoked when #DB or #MC enter from user space. This allows explicit handling of the difference between user mode and kernel mode entry later. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135315.084882104@linutronix.de --- arch/x86/include/asm/idtentry.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index fcd4230a979b68..060f9e358b1c74 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -138,10 +138,12 @@ __visible noinstr void func(struct pt_regs *regs) * @vector: Vector number (ignored for C) * @func: Function name of the entry point * - * Maps to DECLARE_IDTENTRY_RAW + * Maps to DECLARE_IDTENTRY_RAW, but declares also the NOIST C handler + * which is called from the ASM entry point on user mode entry */ #define DECLARE_IDTENTRY_IST(vector, func) \ - DECLARE_IDTENTRY_RAW(vector, func) + DECLARE_IDTENTRY_RAW(vector, func); \ + __visible void noist_##func(struct pt_regs *regs) /** * DEFINE_IDTENTRY_IST - Emit code for IST entry points @@ -152,6 +154,17 @@ __visible noinstr void func(struct pt_regs *regs) #define DEFINE_IDTENTRY_IST(func) \ DEFINE_IDTENTRY_RAW(func) +/** + * DEFINE_IDTENTRY_NOIST - Emit code for NOIST entry points which + * belong to a IST entry point (MCE, DB) + * @func: Function name of the entry point. Must be the same as + * the function name of the corresponding IST variant + * + * Maps to DEFINE_IDTENTRY_RAW().
+ */ +#define DEFINE_IDTENTRY_NOIST(func) \ + DEFINE_IDTENTRY_RAW(noist_##func) + #else /* CONFIG_X86_64 */ /* Maps to a regular IDTENTRY on 32bit for now */ # define DECLARE_IDTENTRY_IST DECLARE_IDTENTRY @@ -161,12 +174,14 @@ __visible noinstr void func(struct pt_regs *regs) /* C-Code mapping */ #define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST +#define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST #define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_IST #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST +#define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST /** * DECLARE_IDTENTRY_XEN - Declare functions for XEN redirect IDT entry points From 4c0dcd8350a03cb65f645a039f2772be880ee74a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:29 +0100 Subject: [PATCH 105/190] x86/entry: Implement user mode C entry points for #DB and #MCE The MCE entry point uses the same mechanism as the IST entry point for now. For #DB split the inner workings and just keep the nmi_enter/exit() magic in the IST variant. Fixup the ASM code to emit the proper noist_##cfunc call. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135315.177564104@linutronix.de --- arch/x86/entry/entry_64.S | 2 +- arch/x86/kernel/cpu/mce/core.c | 40 +++++++++++++++---- arch/x86/kernel/traps.c | 70 ++++++++++++++++++++++++++-------- 3 files changed, 88 insertions(+), 24 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index eeb428582d3da6..d302839b9b3c6a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -657,7 +657,7 @@ SYM_CODE_START(\asmsym) /* Switch to the regular task stack and use the noist entry point */ .Lfrom_usermode_switch_stack_\@: - idtentry_body vector \cfunc, has_error_code=0 + idtentry_body vector noist_\cfunc, has_error_code=0 sane=1 _ASM_NOKPROBE(\asmsym) SYM_CODE_END(\asmsym) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 3177652451905e..a72c0135a5ec7e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1904,24 +1904,50 @@ static void unexpected_machine_check(struct pt_regs *regs) /* Call the installed machine check handler for this CPU setup. */ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; -DEFINE_IDTENTRY_MCE(exc_machine_check) +static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { + /* + * Only required when from kernel mode. See + * mce_check_crashing_cpu() for details. + */ if (machine_check_vector == do_machine_check && mce_check_crashing_cpu()) return; - if (user_mode(regs)) - idtentry_enter(regs); - else - nmi_enter(); + nmi_enter(); + machine_check_vector(regs); + nmi_exit(); +} +static __always_inline void exc_machine_check_user(struct pt_regs *regs) +{ + idtentry_enter(regs); machine_check_vector(regs); + idtentry_exit(regs); +} +#ifdef CONFIG_X86_64 +/* MCE hit kernel mode */ +DEFINE_IDTENTRY_MCE(exc_machine_check) +{ + exc_machine_check_kernel(regs); +} + +/* The user mode variant. 
*/ +DEFINE_IDTENTRY_MCE_USER(exc_machine_check) +{ + exc_machine_check_user(regs); +} +#else +/* 32bit unified entry point */ +DEFINE_IDTENTRY_MCE(exc_machine_check) +{ if (user_mode(regs)) - idtentry_exit(regs); + exc_machine_check_user(regs); else - nmi_exit(); + exc_machine_check_kernel(regs); } +#endif /* * Called for each booted CPU to set up machine checks. diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 569408a681b6a5..4f248c5d5cabb5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -775,20 +775,12 @@ static __always_inline void debug_exit(unsigned long dr7) * * May run on IST stack. */ -DEFINE_IDTENTRY_DEBUG(exc_debug) +static noinstr void handle_debug(struct pt_regs *regs, unsigned long dr6) { struct task_struct *tsk = current; - unsigned long dr6, dr7; int user_icebp = 0; int si_code; - debug_enter(&dr6, &dr7); - - if (user_mode(regs)) - idtentry_enter(regs); - else - nmi_enter(); - /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." Clear TIF_BLOCKSTEP to keep @@ -800,7 +792,7 @@ DEFINE_IDTENTRY_DEBUG(exc_debug) is_sysenter_singlestep(regs))) { dr6 &= ~DR_STEP; if (!dr6) - goto exit; + return; /* * else we might have gotten a single-step trap and hit a * watchpoint at the same time, in which case we should fall @@ -821,12 +813,12 @@ DEFINE_IDTENTRY_DEBUG(exc_debug) #ifdef CONFIG_KPROBES if (kprobe_debug_handler(regs)) - goto exit; + return; #endif if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0, SIGTRAP) == NOTIFY_STOP) - goto exit; + return; /* * Let others (NMI) know that the debug stack is in use @@ -842,7 +834,7 @@ DEFINE_IDTENTRY_DEBUG(exc_debug) X86_TRAP_DB); cond_local_irq_disable(regs); debug_stack_usage_dec(); - goto exit; + return; } if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) { @@ -861,14 +853,60 @@ DEFINE_IDTENTRY_DEBUG(exc_debug) send_sigtrap(regs, 0, si_code); cond_local_irq_disable(regs); debug_stack_usage_dec(); +} + +static __always_inline void exc_debug_kernel(struct pt_regs *regs, + unsigned long dr6) +{ + nmi_enter(); + handle_debug(regs, dr6); + nmi_exit(); +} + +static __always_inline void exc_debug_user(struct pt_regs *regs, + unsigned long dr6) +{ + idtentry_enter(regs); + handle_debug(regs, dr6); + idtentry_exit(regs); +} + +#ifdef CONFIG_X86_64 +/* IST stack entry */ +DEFINE_IDTENTRY_DEBUG(exc_debug) +{ + unsigned long dr6, dr7; + + debug_enter(&dr6, &dr7); + exc_debug_kernel(regs, dr6); + debug_exit(dr7); +} + +/* User entry, runs on regular task stack */ +DEFINE_IDTENTRY_DEBUG_USER(exc_debug) +{ + unsigned long dr6, dr7; + + debug_enter(&dr6, &dr7); + exc_debug_user(regs, dr6); + debug_exit(dr7); +} +#else +/* 32 bit does not have separate entry points. */ +DEFINE_IDTENTRY_DEBUG(exc_debug) +{ + unsigned long dr6, dr7; + + debug_enter(&dr6, &dr7); -exit: if (user_mode(regs)) - idtentry_exit(regs); + exc_debug_user(regs, dr6); else - nmi_exit(); + exc_debug_kernel(regs, dr6); + debug_exit(dr7); } +#endif /* * Note that we play around with the 'TS' bit in an attempt to get From 9347f41352181bf4a7e663f7b5f4a4bb32244d73 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 May 2020 19:56:26 +0200 Subject: [PATCH 106/190] x86/traps: Restructure #DB handling Now that there are separate entry points, move the kernel/user_mode specific checks into the entry functions so the common handling code does not need the extra mode checks. Make the code more readable while at it.
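[ Illustrative sketch, condensed from the diff below rather than quoted verbatim — the kernel/user split this patch creates:

	static __always_inline void exc_debug_kernel(struct pt_regs *regs,
						     unsigned long dr6)
	{
		nmi_enter();
		clear_thread_flag(TIF_BLOCKSTEP);  /* keep BTF in sync */
		/* SYSENTER with TF set: drop DR_STEP, keep watchpoint bits */
		if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
			dr6 &= ~DR_STEP;
		if (dr6)  /* the kernel does not use INT1 */
			handle_debug(regs, dr6, false);
		nmi_exit();
	}

	static __always_inline void exc_debug_user(struct pt_regs *regs,
						   unsigned long dr6)
	{
		idtentry_enter(regs);
		clear_thread_flag(TIF_BLOCKSTEP);
		/* dr6 == 0 most likely means icebp/int01: user gets SIGTRAP */
		handle_debug(regs, dr6, !dr6);
		idtentry_exit(regs);
	}
]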
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135315.283276272@linutronix.de --- arch/x86/kernel/traps.c | 69 +++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4f248c5d5cabb5..b62e962871f2bb 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -775,39 +775,12 @@ static __always_inline void debug_exit(unsigned long dr7) * * May run on IST stack. */ -static noinstr void handle_debug(struct pt_regs *regs, unsigned long dr6) +static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6, + bool user_icebp) { struct task_struct *tsk = current; - int user_icebp = 0; int si_code; - /* - * The SDM says "The processor clears the BTF flag when it - * generates a debug exception." Clear TIF_BLOCKSTEP to keep - * TIF_BLOCKSTEP in sync with the hardware BTF flag. - */ - clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); - - if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) && - is_sysenter_singlestep(regs))) { - dr6 &= ~DR_STEP; - if (!dr6) - return; - /* - * else we might have gotten a single-step trap and hit a - * watchpoint at the same time, in which case we should fall - * through and handle the watchpoint. - */ - } - - /* - * If dr6 has no reason to give us about the origin of this trap, - * then it's very likely the result of an icebp/int01 trap. - * User wants a sigtrap for that. - */ - if (!dr6 && user_mode(regs)) - user_icebp = 1; - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; @@ -832,9 +805,7 @@ static noinstr void handle_debug(struct pt_regs *regs, unsigned long dr6) if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, 0, X86_TRAP_DB); - cond_local_irq_disable(regs); - debug_stack_usage_dec(); - return; + goto out; } if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) { @@ -848,9 +819,12 @@ static noinstr void handle_debug(struct pt_regs *regs, unsigned long dr6) set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; } + si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(regs, 0, si_code); + +out: cond_local_irq_disable(regs); debug_stack_usage_dec(); } @@ -859,7 +833,27 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { nmi_enter(); - handle_debug(regs, dr6); + /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." Clear TIF_BLOCKSTEP to keep + * TIF_BLOCKSTEP in sync with the hardware BTF flag. + */ + clear_thread_flag(TIF_BLOCKSTEP); + + /* + * Catch SYSENTER with TF set and clear DR_STEP. If this hit a + * watchpoint at the same time then that will still be handled. + */ + if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) + dr6 &= ~DR_STEP; + + /* + * If DR6 is zero, no point in trying to handle it. The kernel is + * not using INT1. + */ + if (dr6) + handle_debug(regs, dr6, false); + nmi_exit(); } @@ -867,7 +861,14 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { idtentry_enter(regs); - handle_debug(regs, dr6); + clear_thread_flag(TIF_BLOCKSTEP); + + /* + * If dr6 has no reason to give us about the origin of this trap, + * then it's very likely the result of an icebp/int01 trap. + * User wants a sigtrap for that. 
+ */ + handle_debug(regs, dr6, !dr6); idtentry_exit(regs); } From 75347bb2535a6d5549cc3e436467b7c40d7bb874 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Apr 2020 11:07:20 +0200 Subject: [PATCH 107/190] x86/traps: Address objtool noinstr complaints in #DB The functions invoked from handle_debug() can be instrumented. Tell objtool about it. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135315.380927730@linutronix.de --- arch/x86/kernel/traps.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b62e962871f2bb..41bb0cb9df84ef 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -784,14 +784,19 @@ static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6, /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; + instrumentation_begin(); #ifdef CONFIG_KPROBES - if (kprobe_debug_handler(regs)) + if (kprobe_debug_handler(regs)) { + instrumentation_end(); return; + } #endif if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0, - SIGTRAP) == NOTIFY_STOP) + SIGTRAP) == NOTIFY_STOP) { + instrumentation_end(); return; + } /* * Let others (NMI) know that the debug stack is in use @@ -827,6 +832,7 @@ static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6, out: cond_local_irq_disable(regs); debug_stack_usage_dec(); + instrumentation_end(); } static __always_inline void exc_debug_kernel(struct pt_regs *regs, From 865d3a9afe7eddf320e7f61a442864d6efe27505 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 21:22:36 +0200 Subject: [PATCH 108/190] x86/mce: Address objtools noinstr complaints Mark the relevant functions noinstr, use the plain non-instrumented MSR accessors. The only odd part is the instrumentation_begin()/end() pair around the indirect machine_check_vector() call as objtool can't figure that out. The possible invoked functions are annotated correctly. Also use notrace variant of nmi_enter/exit(). If MCEs happen then hardware latency tracing is the least of the worries. 
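[ Condensed sketch of the annotation pattern described above; the verbatim change is in the core.c hunk below:

	nmi_enter();
	/*
	 * machine_check_vector() is an indirect call. All possible targets
	 * are noinstr, but objtool cannot prove that, so the call site is
	 * wrapped in an instrumentation_begin/end() pair.
	 */
	instrumentation_begin();
	machine_check_vector(regs);
	instrumentation_end();
	nmi_exit();
]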
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135315.476734898@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 20 +++++++++++++++----- arch/x86/kernel/cpu/mce/p5.c | 4 +++- arch/x86/kernel/cpu/mce/winchip.c | 4 +++- kernel/time/timekeeping.c | 2 +- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index a72c0135a5ec7e..a32a7e236bb16d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -130,7 +130,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) +noinstr void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); @@ -140,12 +140,12 @@ void mce_setup(struct mce *m) m->cpuid = cpuid_eax(1); m->socketid = cpu_data(m->extcpu).phys_proc_id; m->apicid = cpu_data(m->extcpu).initial_apicid; - rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); + m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) - rdmsrl(MSR_PPIN, m->ppin); + m->ppin = __rdmsr(MSR_PPIN); else if (this_cpu_has(X86_FEATURE_AMD_PPIN)) - rdmsrl(MSR_AMD_PPIN, m->ppin); + m->ppin = __rdmsr(MSR_AMD_PPIN); m->microcode = boot_cpu_data.microcode; } @@ -1895,10 +1895,12 @@ bool filter_mce(struct mce *m) } /* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs) +static noinstr void unexpected_machine_check(struct pt_regs *regs) { + instrumentation_begin(); pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", smp_processor_id()); + instrumentation_end(); } /* Call the installed machine check handler for this CPU setup. */ @@ -1915,14 +1917,22 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) return; nmi_enter(); + /* + * The call targets are marked noinstr, but objtool can't figure + * that out because it's an indirect call. Annotate it. 
+ */ + instrumentation_begin(); machine_check_vector(regs); + instrumentation_end(); nmi_exit(); } static __always_inline void exc_machine_check_user(struct pt_regs *regs) { idtentry_enter(regs); + instrumentation_begin(); machine_check_vector(regs); + instrumentation_end(); idtentry_exit(regs); } diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index eaebc4ce7398db..19e90cae8e97d3 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -21,10 +21,11 @@ int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ -static void pentium_machine_check(struct pt_regs *regs) +static noinstr void pentium_machine_check(struct pt_regs *regs) { u32 loaddr, hi, lotype; + instrumentation_begin(); rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -37,6 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs) } add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + instrumentation_end(); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index 90e3d60c645e95..9c9f0abd2d7fd3 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -17,10 +17,12 @@ #include "internal.h" /* Machine check handler for WinChip C6: */ -static void winchip_machine_check(struct pt_regs *regs) +static noinstr void winchip_machine_check(struct pt_regs *regs) { + instrumentation_begin(); pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + instrumentation_end(); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9ebaab13339d4c..d20d489841c816 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -953,7 +953,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds); * but without the sequence counter protect. This internal function * is called just when timekeeping lock is already held. */ -time64_t __ktime_get_real_seconds(void) +noinstr time64_t __ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; From 6a8dfa8e4053adfcf02ee4d96287943064166beb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:30 +0100 Subject: [PATCH 109/190] x86/idtentry: Provide IDTENTRY_DF Provide a separate macro for #DF as this needs to emit paranoid only code and has also a special ASM stub in 32bit. 
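[ For orientation, a sketch of the C signatures the new macro emits, derived from the idtentry.h hunk below:

	/* 64bit: raw error code entry; the handler reads CR2 itself, which
	 * the follow-up conversion patch does first thing. */
	__visible noinstr void exc_double_fault(struct pt_regs *regs,
						unsigned long error_code);

	/* 32bit: reached via the doublefault shim, which supplies CR2 */
	__visible noinstr void exc_double_fault(struct pt_regs *regs,
						unsigned long error_code,
						unsigned long address);
]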
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135315.583415264@linutronix.de --- arch/x86/include/asm/idtentry.h | 87 +++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 060f9e358b1c74..9521f329bbbe51 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -132,6 +132,35 @@ static __always_inline void __##func(struct pt_regs *regs, \ #define DEFINE_IDTENTRY_RAW(func) \ __visible noinstr void func(struct pt_regs *regs) +/** + * DECLARE_IDTENTRY_RAW_ERRORCODE - Declare functions for raw IDT entry points + * Error code pushed by hardware + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Maps to DECLARE_IDTENTRY_ERRORCODE() + */ +#define DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func) \ + DECLARE_IDTENTRY_ERRORCODE(vector, func) + +/** + * DEFINE_IDTENTRY_RAW_ERRORCODE - Emit code for raw IDT entry points + * @func: Function name of the entry point + * + * @func is called from ASM entry code with interrupts disabled. + * + * The macro is written so it acts as function definition. Append the + * body with a pair of curly brackets. + * + * Contrary to DEFINE_IDTENTRY_ERRORCODE() this does not invoke the + * idtentry_enter/exit() helpers before and after the body invocation. This + * needs to be done in the body itself if applicable. Use if extra work + * is required before the enter/exit() helpers are invoked. + */ +#define DEFINE_IDTENTRY_RAW_ERRORCODE(func) \ +__visible noinstr void func(struct pt_regs *regs, unsigned long error_code) + + #ifdef CONFIG_X86_64 /** * DECLARE_IDTENTRY_IST - Declare functions for IST handling IDT entry points @@ -165,10 +194,58 @@ __visible noinstr void func(struct pt_regs *regs) #define DEFINE_IDTENTRY_NOIST(func) \ DEFINE_IDTENTRY_RAW(noist_##func) +/** + * DECLARE_IDTENTRY_DF - Declare functions for double fault + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Maps to DECLARE_IDTENTRY_RAW_ERRORCODE + */ +#define DECLARE_IDTENTRY_DF(vector, func) \ + DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func) + +/** + * DEFINE_IDTENTRY_DF - Emit code for double fault + * @func: Function name of the entry point + * + * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE + */ +#define DEFINE_IDTENTRY_DF(func) \ + DEFINE_IDTENTRY_RAW_ERRORCODE(func) + #else /* CONFIG_X86_64 */ + /* Maps to a regular IDTENTRY on 32bit for now */ # define DECLARE_IDTENTRY_IST DECLARE_IDTENTRY # define DEFINE_IDTENTRY_IST DEFINE_IDTENTRY + +/** + * DECLARE_IDTENTRY_DF - Declare functions for double fault 32bit variant + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Declares two functions: + * - The ASM entry point: asm_##func + * - The C handler called from the C shim + */ +#define DECLARE_IDTENTRY_DF(vector, func) \ + asmlinkage void asm_##func(void); \ + __visible void func(struct pt_regs *regs, \ + unsigned long error_code, \ + unsigned long address) + +/** + * DEFINE_IDTENTRY_DF - Emit code for double fault on 32bit + * @func: Function name of the entry point + * + * This is called through the doublefault shim which already provides + * cr2 in the address argument. 
+ */ +#define DEFINE_IDTENTRY_DF(func) \ +__visible noinstr void func(struct pt_regs *regs, \ + unsigned long error_code, \ + unsigned long address) + #endif /* !CONFIG_X86_64 */ /* C-Code mapping */ @@ -212,6 +289,9 @@ __visible noinstr void func(struct pt_regs *regs) #define DECLARE_IDTENTRY_RAW(vector, func) \ DECLARE_IDTENTRY(vector, func) +#define DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func) \ + DECLARE_IDTENTRY_ERRORCODE(vector, func) + #ifdef CONFIG_X86_64 # define DECLARE_IDTENTRY_MCE(vector, func) \ idtentry_mce_db vector asm_##func func @@ -219,12 +299,19 @@ __visible noinstr void func(struct pt_regs *regs) # define DECLARE_IDTENTRY_DEBUG(vector, func) \ idtentry_mce_db vector asm_##func func +# define DECLARE_IDTENTRY_DF(vector, func) \ + idtentry_df vector asm_##func func + #else # define DECLARE_IDTENTRY_MCE(vector, func) \ DECLARE_IDTENTRY(vector, func) # define DECLARE_IDTENTRY_DEBUG(vector, func) \ DECLARE_IDTENTRY(vector, func) + +/* No ASM emitted for DF as this goes through a C shim */ +# define DECLARE_IDTENTRY_DF(vector, func) + #endif /* No ASM code emitted for NMI */ From c29c775a554f7060b6fb31b68f88a3c9087cf1c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:31 +0100 Subject: [PATCH 110/190] x86/entry: Convert double fault exception to IDTENTRY_DF Convert #DF to IDTENTRY_DF - Implement the C entry point with DEFINE_IDTENTRY_DF - Emit the ASM stub with DECLARE_IDTENTRY_DF on 64bit - Remove the ASM idtentry in 64bit - Adjust the 32bit shim code - Fixup the XEN/PV code - Remove the old prototypes No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135315.583415264@linutronix.de --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/entry/entry_64.S | 10 +--------- arch/x86/include/asm/idtentry.h | 3 +++ arch/x86/include/asm/traps.h | 5 ----- arch/x86/kernel/doublefault_32.c | 10 ++++------ arch/x86/kernel/idt.c | 4 ++-- arch/x86/kernel/traps.c | 17 ++++++++++++++--- arch/x86/xen/enlighten_pv.c | 4 ++-- arch/x86/xen/xen-asm_64.S | 2 +- 9 files changed, 29 insertions(+), 30 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 30c6ed3d7c52c7..28d13f07b84d2d 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1488,7 +1488,7 @@ ret_to_user: jmp restore_all_switch_stack SYM_CODE_END(handle_exception) -SYM_CODE_START(double_fault) +SYM_CODE_START(asm_exc_double_fault) 1: /* * This is a task gate handler, not an interrupt gate handler. @@ -1526,7 +1526,7 @@ SYM_CODE_START(double_fault) 1: hlt jmp 1b -SYM_CODE_END(double_fault) +SYM_CODE_END(asm_exc_double_fault) /* * NMI is doubly nasty. It can happen on the first instruction of diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d302839b9b3c6a..d983a0d4bc73a5 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -680,15 +680,9 @@ SYM_CODE_START(\asmsym) call paranoid_entry UNWIND_HINT_REGS - /* Read CR2 early */ - GET_CR2_INTO(%r12); - - TRACE_IRQS_OFF - movq %rsp, %rdi /* pt_regs pointer into first argument */ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - movq %r12, %rdx /* Move CR2 into 3rd argument */ call \cfunc jmp paranoid_exit @@ -918,7 +912,7 @@ SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL) /* * This may fault. Non-paranoid faults on return to userspace are * handled by fixup_bad_iret. 
These include #SS, #GP, and #NP. - * Double-faults due to espfix64 are handled in do_double_fault. + * Double-faults due to espfix64 are handled in exc_double_fault. * Other faults here are fatal. */ iretq @@ -1073,8 +1067,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt idtentry X86_TRAP_PF page_fault do_page_fault has_error_code=1 -idtentry_df X86_TRAP_DF double_fault do_double_fault - #ifdef CONFIG_XEN_PV idtentry 512 /* dummy */ hypervisor_callback xen_do_hypervisor_callback has_error_code=0 #endif diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 9521f329bbbe51..ce97478ffc409d 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -368,4 +368,7 @@ DECLARE_IDTENTRY_XEN(X86_TRAP_NMI, nmi); DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug); DECLARE_IDTENTRY_XEN(X86_TRAP_DB, debug); +/* #DF */ +DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault); + #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 9bd602d0130d55..f5a2e438a8784d 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -11,18 +11,13 @@ #define dotraplinkage __visible -#ifdef CONFIG_X86_64 -asmlinkage void double_fault(void); -#endif asmlinkage void page_fault(void); asmlinkage void async_page_fault(void); #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) -asmlinkage void xen_double_fault(void); asmlinkage void xen_page_fault(void); #endif -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 2ccc57f152a4fb..759d392cbe9f0e 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -10,7 +10,6 @@ #include #include -extern void double_fault(void); #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) #define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x) @@ -21,7 +20,7 @@ static void set_df_gdt_entry(unsigned int cpu); * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks * we're running the doublefault task. Cannot return. */ -asmlinkage notrace void __noreturn doublefault_shim(void) +asmlinkage noinstr void __noreturn doublefault_shim(void) { unsigned long cr2; struct pt_regs regs; @@ -40,7 +39,7 @@ asmlinkage notrace void __noreturn doublefault_shim(void) * Fill in pt_regs. A downside of doing this in C is that the unwinder * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump * won't successfully unwind to the source of the double fault. - * The main dump from do_double_fault() is fine, though, since it + * The main dump from exc_double_fault() is fine, though, since it * uses these regs directly. * * If anyone ever cares, this could be moved to asm. @@ -70,7 +69,7 @@ asmlinkage notrace void __noreturn doublefault_shim(void) regs.cx = TSS(cx); regs.bx = TSS(bx); - do_double_fault(®s, 0, cr2); + exc_double_fault(®s, 0, cr2); /* * x86_32 does not save the original CR3 anywhere on a task switch. 
@@ -84,7 +83,6 @@ asmlinkage notrace void __noreturn doublefault_shim(void) */ panic("cannot return from double fault\n"); } -NOKPROBE_SYMBOL(doublefault_shim); DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { .tss = { @@ -95,7 +93,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { .ldt = 0, .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, - .ip = (unsigned long) double_fault, + .ip = (unsigned long) asm_exc_double_fault, .flags = X86_EFLAGS_FIXED, .es = __USER_DS, .cs = __KERNEL_CS, diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index ddf3f3db3235c2..ec55479e1dd154 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -91,7 +91,7 @@ static const __initconst struct idt_data def_idts[] = { #ifdef CONFIG_X86_32 TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), #else - INTG(X86_TRAP_DF, double_fault), + INTG(X86_TRAP_DF, asm_exc_double_fault), #endif INTG(X86_TRAP_DB, asm_exc_debug), @@ -187,7 +187,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; static const __initconst struct idt_data ist_idts[] = { ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), - ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), + ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 41bb0cb9df84ef..35298c1df32f6e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -319,12 +319,19 @@ __visible void __noreturn handle_stack_overflow(const char *message, * from the TSS. Returning is, in principle, okay, but changes to regs will * be lost. If, for some reason, we need to return to a context with modified * regs, the shim code could be adjusted to synchronize the registers. + * + * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs + * to be read before doing anything else. */ -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2) +DEFINE_IDTENTRY_DF(exc_double_fault) { static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_X86_64 + unsigned long address = read_cr2(); +#endif + #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; @@ -381,6 +388,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign #endif nmi_enter(); + instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; @@ -424,13 +432,16 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * stack even if the actual trigger for the double fault was * something else. 
*/ - if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE) - handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); + if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) { + handle_stack_overflow("kernel stack overflow (double-fault)", + regs, address); + } #endif pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); die("double fault", regs, error_code); panic("Machine halted."); + instrumentation_end(); } DEFINE_IDTENTRY(exc_bounds) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 376851d1039a29..008291121cb4ee 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -616,7 +616,7 @@ struct trap_array_entry { static struct trap_array_entry trap_array[] = { TRAP_ENTRY_REDIR(exc_debug, exc_xendebug, true ), - { double_fault, xen_double_fault, true }, + TRAP_ENTRY(exc_double_fault, true ), #ifdef CONFIG_X86_MCE TRAP_ENTRY(exc_machine_check, true ), #endif @@ -651,7 +651,7 @@ static bool __ref get_trap_addr(void **addr, unsigned int ist) * Replace trap handler addresses by Xen specific ones. * Check for known traps using IST and whitelist them. * The debugger ones are the only ones we care about. - * Xen will handle faults like double_fault, * so we should never see + * Xen will handle faults like double_fault, so we should never see * them. Warn if there's an unexpected IST-using fault handler. */ for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) { diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 9999ea3774762e..e46d863bcaa4a8 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -37,7 +37,7 @@ xen_pv_trap asm_exc_overflow xen_pv_trap asm_exc_bounds xen_pv_trap asm_exc_invalid_op xen_pv_trap asm_exc_device_not_available -xen_pv_trap double_fault +xen_pv_trap asm_exc_double_fault xen_pv_trap asm_exc_coproc_segment_overrun xen_pv_trap asm_exc_invalid_tss xen_pv_trap asm_exc_segment_not_present From 7102cb07132624cdc09aa8e40c03ae34b4cbb74a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2020 09:42:41 +0200 Subject: [PATCH 111/190] x86/entry: Fix allnoconfig build warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following commit: 095b7a3e7745 ("x86/entry: Convert double fault exception to IDTENTRY_DF") introduced a new build warning on 64-bit allnoconfig kernels, that have CONFIG_VMAP_STACK disabled: arch/x86/kernel/traps.c:332:16: warning: unused variable ‘address’ [-Wunused-variable] This variable is only used if CONFIG_VMAP_STACK is defined, so make it dependent on that, not CONFIG_X86_64. Signed-off-by: Ingo Molnar Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Borislav Petkov --- arch/x86/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 35298c1df32f6e..9e5d81cb94bad7 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -328,7 +328,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) static const char str[] = "double fault"; struct task_struct *tsk = current; -#ifdef CONFIG_X86_64 +#ifdef CONFIG_VMAP_STACK unsigned long address = read_cr2(); #endif From 2ab70319bc1f79228da4dce7b9d604740c9beeef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:14 +0200 Subject: [PATCH 112/190] nmi, tracing: Make hardware latency tracing noinstr safe The hardware latency tracer calls into instrumentable functions. 
Move the calls into the RCU watching sections and annotate them. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Cc: Paul E. McKenney Link: https://lore.kernel.org/r/20200521202116.904176298@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/hardirq.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index e07cf853aa161a..29b862aba740db 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -87,20 +87,24 @@ extern void rcu_nmi_exit(void); arch_nmi_enter(); \ printk_nmi_enter(); \ lockdep_off(); \ - ftrace_nmi_enter(); \ BUG_ON(in_nmi() == NMI_MASK); \ __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ rcu_nmi_enter(); \ lockdep_hardirq_enter(); \ + instrumentation_begin(); \ + ftrace_nmi_enter(); \ + instrumentation_end(); \ } while (0) #define nmi_exit() \ do { \ + instrumentation_begin(); \ + ftrace_nmi_exit(); \ + instrumentation_end(); \ lockdep_hardirq_exit(); \ rcu_nmi_exit(); \ BUG_ON(!in_nmi()); \ __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ - ftrace_nmi_exit(); \ lockdep_on(); \ printk_nmi_exit(); \ arch_nmi_exit(); \ From 3eeec385848855c8109eb72b8b309078d5507968 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:17 +0200 Subject: [PATCH 113/190] x86/entry: Provide idtentry_entry/exit_cond_rcu() After a lengthy discussion [1] it turned out that RCU does not need a full rcu_irq_enter/exit() when RCU is already watching. All it needs if NOHZ_FULL is active is to check whether the tick needs to be restarted. This allows to avoid a separate variant for the pagefault handler which cannot invoke rcu_irq_enter() on a kernel pagefault which might sleep. The cond_rcu argument is only temporary and will be removed once the existing users of idtentry_enter/exit() have been cleaned up. After that the code can be significantly simplified. [ mingo: Simplified the control flow ] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: "Paul E. McKenney" Acked-by: Andy Lutomirski Link: [1] https://lkml.kernel.org/r/20200515235125.628629605@linutronix.de Link: https://lore.kernel.org/r/20200521202117.181397835@linutronix.de --- arch/x86/entry/common.c | 79 ++++++++++++++++++++++++++------- arch/x86/include/asm/idtentry.h | 14 +++++- 2 files changed, 76 insertions(+), 17 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 9ebe33485428f8..a7f5846a4102c3 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -512,8 +512,10 @@ SYSCALL_DEFINE0(ni_syscall) } /** - * idtentry_enter - Handle state tracking on idtentry + * idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional + * RCU handling * @regs: Pointer to pt_regs of interrupted context + * @cond_rcu: Invoke rcu_irq_enter() only if RCU is not watching * * Invokes: * - lockdep irqflag state tracking as low level ASM entry disabled @@ -521,40 +523,84 @@ SYSCALL_DEFINE0(ni_syscall) * * - Context tracking if the exception hit user mode. * - * - RCU notification if the exception hit kernel mode. - * * - The hardirq tracer to keep the state consistent as low level ASM * entry disabled interrupts. + * + * For kernel mode entries RCU handling is done conditional. If RCU is + * watching then the only RCU requirement is to check whether the tick has + * to be restarted. If RCU is not watching then rcu_irq_enter() has to be + * invoked on entry and rcu_irq_exit() on exit. 
+ * + * Avoiding the rcu_irq_enter/exit() calls is an optimization but also + * solves the problem of kernel mode pagefaults which can schedule, which + * is not possible after invoking rcu_irq_enter() without undoing it. + * + * For user mode entries enter_from_user_mode() must be invoked to + * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit + * would not be possible. + * + * Returns: True if RCU has been adjusted on a kernel entry + * False otherwise + * + * The return value must be fed into the rcu_exit argument of + * idtentry_exit_cond_rcu(). */ -void noinstr idtentry_enter(struct pt_regs *regs) +bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs, bool cond_rcu) { if (user_mode(regs)) { enter_from_user_mode(); - } else { + return false; + } + + if (!cond_rcu || !__rcu_is_watching()) { + /* + * If RCU is not watching then the same careful + * sequence vs. lockdep and tracing is required + * as in enter_from_user_mode(). + * + * This only happens for IRQs that hit the idle + * loop, i.e. if idle is not using MWAIT. + */ lockdep_hardirqs_off(CALLER_ADDR0); rcu_irq_enter(); instrumentation_begin(); trace_hardirqs_off_prepare(); instrumentation_end(); + + return true; } + + /* + * If RCU is watching then RCU only wants to check + * whether it needs to restart the tick in NOHZ + * mode. + */ + instrumentation_begin(); + rcu_irq_enter_check_tick(); + /* Use the combo lockdep/tracing function */ + trace_hardirqs_off(); + instrumentation_end(); + + return false; } /** - * idtentry_exit - Common code to handle return from exceptions + * idtentry_exit_cond_rcu - Handle return from exception with conditional RCU + * handling * @regs: Pointer to pt_regs (exception entry regs) + * @rcu_exit: Invoke rcu_irq_exit() if true * * Depending on the return target (kernel/user) this runs the necessary - * preemption and work checks if possible and required and returns to + * the caller with interrupts disabled and no further work pending. * * This is the last action before returning to the low level ASM code which * just needs to return to the appropriate context. * - * Invoked by all exception/interrupt IDTENTRY handlers which are not - * returning through the paranoid exit path (all except NMI, #DF and the IST - * variants of #MC and #DB) and are therefore on the thread stack. + * Counterpart to idtentry_enter_cond_rcu(). The return value of the entry + * function must be fed into the @rcu_exit argument. */ -void noinstr idtentry_exit(struct pt_regs *regs) +void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) { lockdep_assert_irqs_disabled(); @@ -580,7 +626,8 @@ void noinstr idtentry_exit(struct pt_regs *regs) if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); instrumentation_begin(); - rcu_irq_exit_preempt(); + if (rcu_exit) + rcu_irq_exit_preempt(); if (need_resched()) preempt_schedule_irq(); /* Covers both tracing and lockdep */ @@ -602,10 +649,12 @@ void noinstr idtentry_exit(struct pt_regs *regs) trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(CALLER_ADDR0); instrumentation_end(); - rcu_irq_exit(); + if (rcu_exit) + rcu_irq_exit(); lockdep_hardirqs_on(CALLER_ADDR0); } else { - /* IRQ flags state is correct already. Just tell RCU */ - rcu_irq_exit(); + /* IRQ flags state is correct already. Just tell RCU.
*/ + if (rcu_exit) + rcu_irq_exit(); } } diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index ce97478ffc409d..a116b80662d4c2 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -7,8 +7,18 @@ #ifndef __ASSEMBLY__ -void idtentry_enter(struct pt_regs *regs); -void idtentry_exit(struct pt_regs *regs); +bool idtentry_enter_cond_rcu(struct pt_regs *regs, bool cond_rcu); +void idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit); + +static __always_inline void idtentry_enter(struct pt_regs *regs) +{ + idtentry_enter_cond_rcu(regs, false); +} + +static __always_inline void idtentry_exit(struct pt_regs *regs) +{ + idtentry_exit_cond_rcu(regs, true); +} /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points From 9f9781b60dfa68d5094a41982f1efa75215a62b1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:18 +0200 Subject: [PATCH 114/190] x86/entry: Provide idtentry_enter/exit_user() As there are exceptions which already handle entry from user mode and from kernel mode separately, providing explicit user entry/exit handling callbacks makes sense and makes the code easier to understand. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202117.289548561@linutronix.de --- arch/x86/entry/common.c | 31 +++++++++++++++++++++++++++++++ arch/x86/include/asm/idtentry.h | 3 +++ 2 files changed, 34 insertions(+) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index a7f5846a4102c3..b7fcb1355adff4 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -658,3 +658,34 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) rcu_irq_exit(); } } + +/** + * idtentry_enter_user - Handle state tracking on idtentry from user mode + * @regs: Pointer to pt_regs of interrupted context + * + * Invokes enter_from_user_mode() to establish the proper context for + * NOHZ_FULL. Otherwise scheduling on exit would not be possible. + */ +void noinstr idtentry_enter_user(struct pt_regs *regs) +{ + enter_from_user_mode(); +} + +/** + * idtentry_exit_user - Handle return from exception to user mode + * @regs: Pointer to pt_regs (exception entry regs) + * + * Runs the necessary preemption and work checks and returns to the caller + * with interrupts disabled and no further work pending. + * + * This is the last action before returning to the low level ASM code which + * just needs to return to the appropriate context. + * + * Counterpart to idtentry_enter_user(). 
+ */ +void noinstr idtentry_exit_user(struct pt_regs *regs) +{ + lockdep_assert_irqs_disabled(); + + prepare_exit_to_usermode(regs); +} diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index a116b80662d4c2..b3aca728f2fbd4 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -7,6 +7,9 @@ #ifndef __ASSEMBLY__ +void idtentry_enter_user(struct pt_regs *regs); +void idtentry_exit_user(struct pt_regs *regs); + bool idtentry_enter_cond_rcu(struct pt_regs *regs, bool cond_rcu); void idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit); From fa95d7dc1abceb288db2959badb9aaf558eb0530 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:19 +0200 Subject: [PATCH 115/190] x86/idtentry: Switch to conditional RCU handling Switch all idtentry_enter/exit() users over to the new conditional RCU handling scheme and make the user mode entries in #DB, #INT3 and #MCE use the user mode idtentry functions. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202117.382387286@linutronix.de --- arch/x86/include/asm/idtentry.h | 10 ++++++---- arch/x86/kernel/cpu/mce/core.c | 4 ++-- arch/x86/kernel/traps.c | 10 +++++----- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index b3aca728f2fbd4..0f974e52e13b2e 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -61,11 +61,12 @@ static __always_inline void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - idtentry_enter(regs); \ + bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + \ instrumentation_begin(); \ __##func (regs); \ instrumentation_end(); \ - idtentry_exit(regs); \ + idtentry_exit_cond_rcu(regs, rcu_exit); \ } \ \ static __always_inline void __##func(struct pt_regs *regs) @@ -107,11 +108,12 @@ static __always_inline void __##func(struct pt_regs *regs, \ __visible noinstr void func(struct pt_regs *regs, \ unsigned long error_code) \ { \ - idtentry_enter(regs); \ + bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + \ instrumentation_begin(); \ __##func (regs, error_code); \ instrumentation_end(); \ - idtentry_exit(regs); \ + idtentry_exit_cond_rcu(regs, rcu_exit); \ } \ \ static __always_inline void __##func(struct pt_regs *regs, \ diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index a32a7e236bb16d..c47f004f623178 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1929,11 +1929,11 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) static __always_inline void exc_machine_check_user(struct pt_regs *regs) { - idtentry_enter(regs); + idtentry_enter_user(regs); instrumentation_begin(); machine_check_vector(regs); instrumentation_end(); - idtentry_exit(regs); + idtentry_exit_user(regs); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9e5d81cb94bad7..f28be3e51cca4f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -619,18 +619,18 @@ DEFINE_IDTENTRY_RAW(exc_int3) return; /* - * idtentry_enter() uses static_branch_{,un}likely() and therefore + * idtentry_enter_user() uses static_branch_{,un}likely() and therefore * can trigger INT3, hence poke_int3_handler() must be done * before. 
If the entry came from kernel mode, then use nmi_enter() * because the INT3 could have been hit in any context including * NMI. */ if (user_mode(regs)) { - idtentry_enter(regs); + idtentry_enter_user(regs); instrumentation_begin(); do_int3_user(regs); instrumentation_end(); - idtentry_exit(regs); + idtentry_exit_user(regs); } else { nmi_enter(); instrumentation_begin(); @@ -877,7 +877,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, static __always_inline void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { - idtentry_enter(regs); + idtentry_enter_user(regs); clear_thread_flag(TIF_BLOCKSTEP); /* @@ -886,7 +886,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, * User wants a sigtrap for that. */ handle_debug(regs, dr6, !dr6); - idtentry_exit(regs); + idtentry_exit_user(regs); } #ifdef CONFIG_X86_64 From 9ee01e0f69a925b6ff7d5f39441413c55132b167 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:20 +0200 Subject: [PATCH 116/190] x86/entry: Clean up idtentry_enter/exit() leftovers Now that everything is converted to conditional RCU handling remove idtentry_enter/exit() and tidy up the conditional functions. This does not remove rcu_irq_exit_preempt(), to avoid conflicts with the RCU tree. Will be removed once all of this hits Linus's tree. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202117.473597954@linutronix.de --- arch/x86/entry/common.c | 67 ++++++++++++++------------------- arch/x86/include/asm/idtentry.h | 12 +----- 2 files changed, 30 insertions(+), 49 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index b7fcb1355adff4..2a80e4e1b4c15b 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -515,7 +515,6 @@ SYSCALL_DEFINE0(ni_syscall) * idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional * RCU handling * @regs: Pointer to pt_regs of interrupted context - * @cond_rcu: Invoke rcu_irq_enter() only if RCU is not watching * * Invokes: * - lockdep irqflag state tracking as low level ASM entry disabled @@ -545,14 +544,14 @@ SYSCALL_DEFINE0(ni_syscall) * The return value must be fed into the rcu_exit argument of * idtentry_exit_cond_rcu(). */ -bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs, bool cond_rcu) +bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) { if (user_mode(regs)) { enter_from_user_mode(); return false; } - if (!cond_rcu || !__rcu_is_watching()) { + if (!__rcu_is_watching()) { /* * If RCU is not watching then the same careful * sequence vs. lockdep and tracing is required @@ -608,52 +607,44 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) if (user_mode(regs)) { prepare_exit_to_usermode(regs); } else if (regs->flags & X86_EFLAGS_IF) { + /* + * If RCU was not watching on entry this needs to be done + * carefully and needs the same ordering of lockdep/tracing + * and RCU as the return to user mode path. + */ + if (rcu_exit) { + instrumentation_begin(); + /* Tell the tracer that IRET will enable interrupts */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + rcu_irq_exit(); + lockdep_hardirqs_on(CALLER_ADDR0); + return; + } + + instrumentation_begin(); + /* Check kernel preemption, if enabled */ if (IS_ENABLED(CONFIG_PREEMPTION)) { - /* - * This needs to be done very carefully. - * idtentry_enter() invoked rcu_irq_enter(). 
This - * needs to be undone before scheduling. - * - * Preemption is disabled inside of RCU idle - * sections. When the task returns from - * preempt_schedule_irq(), RCU is still watching. - * - * rcu_irq_exit_preempt() has additional state - * checking if CONFIG_PROVE_RCU=y - */ if (!preempt_count()) { + /* Sanity check RCU and thread stack */ + rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - instrumentation_begin(); - if (rcu_exit) - rcu_irq_exit_preempt(); if (need_resched()) preempt_schedule_irq(); - /* Covers both tracing and lockdep */ - trace_hardirqs_on(); - instrumentation_end(); - return; } } - /* - * If preemption is disabled then this needs to be done - * carefully with respect to RCU. The exception might come - * from a RCU idle section in the idle task due to the fact - * that safe_halt() enables interrupts. So this needs the - * same ordering of lockdep/tracing and RCU as the return - * to user mode path. - */ - instrumentation_begin(); - /* Tell the tracer that IRET will enable interrupts */ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + /* Covers both tracing and lockdep */ + trace_hardirqs_on(); + instrumentation_end(); - if (rcu_exit) - rcu_irq_exit(); - lockdep_hardirqs_on(CALLER_ADDR0); } else { - /* IRQ flags state is correct already. Just tell RCU. */ + /* + * IRQ flags state is correct already. Just tell RCU if it + * was not watching on entry. + */ if (rcu_exit) rcu_irq_exit(); } diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 0f974e52e13b2e..b05688973b92ad 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -10,19 +10,9 @@ void idtentry_enter_user(struct pt_regs *regs); void idtentry_exit_user(struct pt_regs *regs); -bool idtentry_enter_cond_rcu(struct pt_regs *regs, bool cond_rcu); +bool idtentry_enter_cond_rcu(struct pt_regs *regs); void idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit); -static __always_inline void idtentry_enter(struct pt_regs *regs) -{ - idtentry_enter_cond_rcu(regs, false); -} - -static __always_inline void idtentry_exit(struct pt_regs *regs) -{ - idtentry_exit_cond_rcu(regs, true); -} - /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware From 8a6bc4787f05d087fda8e11ead225c8830250703 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:21 +0200 Subject: [PATCH 117/190] genirq: Provide irq_enter/exit_rcu() irq_enter()/exit() currently include RCU handling. To properly separate the RCU handling code, provide variants which contain only the non-RCU related functionality. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202117.567023613@linutronix.de --- include/linux/hardirq.h | 13 +++++++++++-- kernel/softirq.c | 35 +++++++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 29b862aba740db..3dc9102d16cf75 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -40,7 +40,11 @@ static __always_inline void rcu_irq_enter_check_tick(void) /* * Enter irq context (on NO_HZ, update jiffies): */ -extern void irq_enter(void); +void irq_enter(void); +/* + * Like irq_enter(), but RCU is already watching. 
+ */ +void irq_enter_rcu(void); /* * Exit irq context without processing softirqs: @@ -55,7 +59,12 @@ extern void irq_enter(void); /* * Exit irq context and process softirqs if needed: */ -extern void irq_exit(void); +void irq_exit(void); + +/* + * Like irq_exit(), but return with RCU watching. + */ +void irq_exit_rcu(void); #ifndef arch_nmi_enter #define arch_nmi_enter() do { } while (0) diff --git a/kernel/softirq.c b/kernel/softirq.c index a47c6dd574520d..beb8e3a66c7c44 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -339,12 +339,11 @@ asmlinkage __visible void do_softirq(void) local_irq_restore(flags); } -/* - * Enter an interrupt context. +/** + * irq_enter_rcu - Enter an interrupt context with RCU watching */ -void irq_enter(void) +void irq_enter_rcu(void) { - rcu_irq_enter(); if (is_idle_task(current) && !in_interrupt()) { /* * Prevent raise_softirq from needlessly waking up ksoftirqd @@ -354,10 +353,18 @@ void irq_enter(void) tick_irq_enter(); _local_bh_enable(); } - __irq_enter(); } +/** + * irq_enter - Enter an interrupt context including RCU update + */ +void irq_enter(void) +{ + rcu_irq_enter(); + irq_enter_rcu(); +} + static inline void invoke_softirq(void) { if (ksoftirqd_running(local_softirq_pending())) @@ -397,10 +404,12 @@ static inline void tick_irq_exit(void) #endif } -/* - * Exit an interrupt context. Process softirqs if needed and possible: +/** + * irq_exit_rcu() - Exit an interrupt context without updating RCU + * + * Also processes softirqs if needed and possible. */ -void irq_exit(void) +void irq_exit_rcu(void) { #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED local_irq_disable(); @@ -413,6 +422,16 @@ void irq_exit(void) invoke_softirq(); tick_irq_exit(); +} + +/** + * irq_exit - Exit an interrupt context, update RCU and lockdep + * + * Also processes softirqs if needed and possible. + */ +void irq_exit(void) +{ + irq_exit_rcu(); rcu_irq_exit(); /* must be last! */ lockdep_hardirq_exit(); From 98a3bf195e1a14755da3d2b83e1dbb4a3158866d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:22 +0200 Subject: [PATCH 118/190] genirq: Provide __irq_enter/exit_raw() Like __irq_enter/exit() but without time accounting. To be used for "empty" system vectors like the scheduler IPI to avoid the overhead. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202117.671682341@linutronix.de --- include/linux/hardirq.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 3dc9102d16cf75..03c9fece7d43af 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -37,6 +37,17 @@ static __always_inline void rcu_irq_enter_check_tick(void) lockdep_hardirq_enter(); \ } while (0) +/* + * Like __irq_enter() without time accounting for fast + * interrupts, e.g. reschedule IPI where time accounting + * is more expensive than the actual interrupt. 
+ */ +#define __irq_enter_raw() \ + do { \ + preempt_count_add(HARDIRQ_OFFSET); \ + lockdep_hardirq_enter(); \ + } while (0) + /* * Enter irq context (on NO_HZ, update jiffies): */ @@ -56,6 +67,15 @@ void irq_enter_rcu(void); preempt_count_sub(HARDIRQ_OFFSET); \ } while (0) +/* + * Like __irq_exit() without time accounting + */ +#define __irq_exit_raw() \ + do { \ + lockdep_hardirq_exit(); \ + preempt_count_sub(HARDIRQ_OFFSET); \ + } while (0) + /* * Exit irq context and process softirqs if needed: */ From 931b94145981e411bd2c934657649347ba8a9083 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:23 +0200 Subject: [PATCH 119/190] x86/entry: Provide helpers for executing on the irqstack Device interrupt handlers and system vector handlers are executed on the interrupt stack. The stack switch happens in the low level assembly entry code. This conflicts with the efforts to consolidate the exit code in C to ensure correctness vs. RCU and tracing. As there is no way to move #DB away from IST due to the MOV SS issue, the requirements vs. #DB and NMI for switching to the interrupt stack do not exist anymore. The only requirement is that interrupts are disabled. That allows the moving of the stack switching to C code, which simplifies the entry/exit handling further, because it allows the switching of stacks after handling the entry and on exit before handling RCU, returning to usermode and kernel preemption in the same way as for regular exceptions. The initial attempt of having the stack switching in inline ASM caused too much headache vs. objtool and the unwinder. After analysing the use cases it was agreed on that having the stack switch in ASM for the price of an indirect call is acceptable, as the main users are indirect call heavy anyway and the few system vectors which are empty shells (scheduler IPI and KVM posted interrupt vectors) can run from the regular stack. Provide helper functions to check whether the interrupt stack is already active and whether stack switching is required. 64-bit only for now, as 32-bit has a variant of that already. Once this is cleaned up, the two implementations might be consolidated as an additional cleanup on top. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202117.763775313@linutronix.de --- arch/x86/entry/entry_64.S | 39 +++++++++++++++++++++++ arch/x86/include/asm/irq_stack.h | 53 ++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 arch/x86/include/asm/irq_stack.h diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d983a0d4bc73a5..1597370626115b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1106,6 +1106,45 @@ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs) SYM_CODE_END(.Lbad_gs) .previous +/* + * rdi: New stack pointer points to the top word of the stack + * rsi: Function pointer + * rdx: Function argument (can be NULL if none) + */ +SYM_FUNC_START(asm_call_on_stack) + /* + * Save the frame pointer unconditionally. This allows the ORC + * unwinder to handle the stack switch. + */ + pushq %rbp + mov %rsp, %rbp + + /* + * The unwinder relies on the word at the top of the new stack + * page linking back to the previous RSP. + */ + mov %rsp, (%rdi) + mov %rdi, %rsp + /* Move the argument to the right place */ + mov %rdx, %rdi + +1: + .pushsection .discard.instr_begin + .long 1b - . + .popsection + + CALL_NOSPEC rsi + +2: + .pushsection .discard.instr_end + .long 2b - . 
+ .popsection + + /* Restore the previous stack pointer from RBP. */ + leaveq + ret +SYM_FUNC_END(asm_call_on_stack) + /* Call softirq on interrupt stack. Interrupts are off. */ .pushsection .text, "ax" SYM_FUNC_START(do_softirq_own_stack) diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h new file mode 100644 index 00000000000000..4ae66f097101d1 --- /dev/null +++ b/arch/x86/include/asm/irq_stack.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_IRQ_STACK_H +#define _ASM_X86_IRQ_STACK_H + +#include + +#include + +#ifdef CONFIG_X86_64 +static __always_inline bool irqstack_active(void) +{ + return __this_cpu_read(irq_count) != -1; +} + +void asm_call_on_stack(void *sp, void *func, void *arg); + +static __always_inline void __run_on_irqstack(void *func, void *arg) +{ + void *tos = __this_cpu_read(hardirq_stack_ptr); + + __this_cpu_add(irq_count, 1); + asm_call_on_stack(tos - 8, func, arg); + __this_cpu_sub(irq_count, 1); +} + +#else /* CONFIG_X86_64 */ +static inline bool irqstack_active(void) { return false; } +static inline void __run_on_irqstack(void *func, void *arg) { } +#endif /* !CONFIG_X86_64 */ + +static __always_inline bool irq_needs_irq_stack(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return false; + if (!regs) + return !irqstack_active(); + return !user_mode(regs) && !irqstack_active(); +} + +static __always_inline void run_on_irqstack_cond(void *func, void *arg, + struct pt_regs *regs) +{ + void (*__func)(void *arg) = func; + + lockdep_assert_irqs_disabled(); + + if (irq_needs_irq_stack(regs)) + __run_on_irqstack(__func, arg); + else + __func(arg); +} + +#endif From eb6555c83933ce8e094d5429d57970aaa9f0591e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:24 +0200 Subject: [PATCH 120/190] x86/entry/64: Move do_softirq_own_stack() to C The first step to get rid of the ENTER/LEAVE_IRQ_STACK ASM macro maze. Use the new C code helpers to move do_softirq_own_stack() out of ASM code. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202117.870911120@linutronix.de --- arch/x86/entry/entry_64.S | 13 ------------- arch/x86/kernel/irq_64.c | 6 ++++++ 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1597370626115b..6b518be4da0a9b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1145,19 +1145,6 @@ SYM_FUNC_START(asm_call_on_stack) ret SYM_FUNC_END(asm_call_on_stack) -/* Call softirq on interrupt stack. Interrupts are off. */ -.pushsection .text, "ax" -SYM_FUNC_START(do_softirq_own_stack) - pushq %rbp - mov %rsp, %rbp - ENTER_IRQ_STACK regs=0 old_rsp=%r11 - call __do_softirq - LEAVE_IRQ_STACK regs=0 - leaveq - ret -SYM_FUNC_END(do_softirq_own_stack) -.popsection - #ifdef CONFIG_XEN_PV /* * A note on the "critical region" in our callback handler. 
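The decision logic behind run_on_irqstack_cond() above can be illustrated with a stand-alone C model. This is a sketch only: the per-CPU irq_count and struct pt_regs are replaced by plain variables, the real stack switch is reduced to a printf(), and the kernel names are kept purely for readability.

#include <stdbool.h>
#include <stdio.h>

/* Simulated per-CPU state: -1 means "the irq stack is not in use". */
static int irq_count = -1;

struct regs { bool user_mode; };        /* stand-in for struct pt_regs */

static bool irqstack_active(void)
{
        return irq_count != -1;
}

/*
 * Mirrors irq_needs_irq_stack(): switch stacks only for a kernel-mode
 * entry while the irq stack is not already active. A NULL regs pointer
 * models the softirq case where no interrupted context is available.
 */
static bool irq_needs_irq_stack(const struct regs *regs)
{
        if (!regs)
                return !irqstack_active();
        return !regs->user_mode && !irqstack_active();
}

static void run_on_irqstack_cond(void (*func)(void *), void *arg,
                                 const struct regs *regs)
{
        if (irq_needs_irq_stack(regs)) {
                irq_count++;            /* nesting guard, like __this_cpu_add() */
                printf("  [switched to the irq stack]\n");
                func(arg);
                irq_count--;
        } else {
                func(arg);              /* the current stack is already safe */
        }
}

static void handler(void *arg)
{
        printf("  handler(%s)\n", (const char *)arg);
}

int main(void)
{
        struct regs user = { .user_mode = true };
        struct regs kern = { .user_mode = false };

        puts("entry from user mode (task stack is fine):");
        run_on_irqstack_cond(handler, "user", &user);

        puts("entry from kernel mode (must move off the current stack):");
        run_on_irqstack_cond(handler, "kernel", &kern);

        puts("nested kernel entry (irq stack already active):");
        irq_count = 0;
        run_on_irqstack_cond(handler, "nested", &kern);
        return 0;
}

The interesting property is the nesting guard: once irq_count is no longer -1, further entries stay on the current stack, which is why the real helper can serve both device interrupts and do_softirq_own_stack().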
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 6b32ab009c197a..1b4fe93a86c5ce 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -70,3 +71,8 @@ int irq_init_percpu_irqstack(unsigned int cpu) return 0; return map_irq_stack(cpu); } + +void do_softirq_own_stack(void) +{ + run_on_irqstack_cond(__do_softirq, NULL, NULL); +} From 1de16e0c17155d138282f3a9f65914a9a5da757e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:25 +0200 Subject: [PATCH 121/190] x86/entry: Split out idtentry_exit_cond_resched() The XEN PV hypercall requires the ability of conditional rescheduling when preemption is disabled because some hypercalls take ages. Split out the rescheduling code from idtentry_exit_cond_rcu() so it can be reused for that. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202117.962199649@linutronix.de --- arch/x86/entry/common.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 2a80e4e1b4c15b..066215a243b4c7 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -583,6 +583,20 @@ bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) return false; } +static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched) +{ + if (may_sched && !preempt_count()) { + /* Sanity check RCU and thread stack */ + rcu_irq_exit_check_preempt(); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + WARN_ON_ONCE(!on_thread_stack()); + if (need_resched()) + preempt_schedule_irq(); + } + /* Covers both tracing and lockdep */ + trace_hardirqs_on(); +} + /** * idtentry_exit_cond_rcu - Handle return from exception with conditional RCU * handling @@ -624,21 +638,7 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) } instrumentation_begin(); - - /* Check kernel preemption, if enabled */ - if (IS_ENABLED(CONFIG_PREEMPTION)) { - if (!preempt_count()) { - /* Sanity check RCU and thread stack */ - rcu_irq_exit_check_preempt(); - if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) - WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) - preempt_schedule_irq(); - } - } - /* Covers both tracing and lockdep */ - trace_hardirqs_on(); - + idtentry_exit_cond_resched(regs, IS_ENABLED(CONFIG_PREEMPTION)); instrumentation_end(); } else { /* From 2f6474e4636bcc68af6c44abb2703f12d7f083da Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:26 +0200 Subject: [PATCH 122/190] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY Convert the XEN/PV hypercall to IDTENTRY: - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64-bit - Remove the open coded ASM entry code in 32-bit - Remove the old prototypes The handler stubs need to stay in ASM code as they need corner case handling and adjustment of the stack pointer. Provide a new C function which invokes the entry/exit handling and calls into the XEN handler on the interrupt stack if required. The exit code is slightly different from the regular idtentry_exit() on non-preemptible kernels. If the hypercall is preemptible and need_resched() is set then XEN provides a preempt hypercall scheduling function. Move this functionality into the entry code so it can use the existing idtentry functionality. [ mingo: Build fixes. 
] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Acked-by: Juergen Gross Tested-by: Juergen Gross Link: https://lore.kernel.org/r/20200521202118.055270078@linutronix.de --- arch/x86/entry/common.c | 78 +++++++++++++++++++++++++++++++++ arch/x86/entry/entry_32.S | 20 +++++---- arch/x86/entry/entry_64.S | 20 +++------ arch/x86/include/asm/idtentry.h | 34 ++++++++++++++ arch/x86/xen/setup.c | 4 +- arch/x86/xen/smp_pv.c | 3 +- arch/x86/xen/xen-asm_32.S | 12 ++--- arch/x86/xen/xen-asm_64.S | 2 +- arch/x86/xen/xen-ops.h | 1 - drivers/xen/Makefile | 2 +- drivers/xen/preempt.c | 42 ------------------ include/xen/xen-ops.h | 19 +++----- 12 files changed, 151 insertions(+), 86 deletions(-) delete mode 100644 drivers/xen/preempt.c diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 066215a243b4c7..a0f8c3cb130a38 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -27,6 +27,11 @@ #include #include +#ifdef CONFIG_XEN_PV +#include +#include +#endif + #include #include #include @@ -35,6 +40,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -680,3 +686,75 @@ void noinstr idtentry_exit_user(struct pt_regs *regs) prepare_exit_to_usermode(regs); } + +#ifdef CONFIG_XEN_PV +#ifndef CONFIG_PREEMPTION +/* + * Some hypercalls issued by the toolstack can take many 10s of + * seconds. Allow tasks running hypercalls via the privcmd driver to + * be voluntarily preempted even if full kernel preemption is + * disabled. + * + * Such preemptible hypercalls are bracketed by + * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() + * calls. + */ +DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); +EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); + +/* + * In case of scheduling the flag must be cleared and restored after + * returning from schedule as the task might move to a different CPU. 
+ */ +static __always_inline bool get_and_clear_inhcall(void) +{ + bool inhcall = __this_cpu_read(xen_in_preemptible_hcall); + + __this_cpu_write(xen_in_preemptible_hcall, false); + return inhcall; +} + +static __always_inline void restore_inhcall(bool inhcall) +{ + __this_cpu_write(xen_in_preemptible_hcall, inhcall); +} +#else +static __always_inline bool get_and_clear_inhcall(void) { return false; } +static __always_inline void restore_inhcall(bool inhcall) { } +#endif + +static void __xen_pv_evtchn_do_upcall(void) +{ + irq_enter_rcu(); + inc_irq_stat(irq_hv_callback_count); + + xen_hvm_evtchn_do_upcall(); + + irq_exit_rcu(); +} + +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) +{ + struct pt_regs *old_regs; + bool inhcall, rcu_exit; + + rcu_exit = idtentry_enter_cond_rcu(regs); + old_regs = set_irq_regs(regs); + + instrumentation_begin(); + run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, NULL, regs); + instrumentation_end(); + + set_irq_regs(old_regs); + + inhcall = get_and_clear_inhcall(); + if (inhcall && !WARN_ON_ONCE(rcu_exit)) { + instrumentation_begin(); + idtentry_exit_cond_resched(regs, true); + instrumentation_end(); + restore_inhcall(inhcall); + } else { + idtentry_exit_cond_rcu(regs, rcu_exit); + } +} +#endif /* CONFIG_XEN_PV */ diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 28d13f07b84d2d..3ab04dca9aab02 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1298,7 +1298,13 @@ SYM_CODE_END(native_iret) #endif #ifdef CONFIG_XEN_PV -SYM_FUNC_START(xen_hypervisor_callback) +/* + * See comment in entry_64.S for further explanation + * + * Note: This is not an actual IDT entry point. It's a XEN specific entry + * point and therefore named to match the 64-bit trampoline counterpart. + */ +SYM_FUNC_START(xen_asm_exc_xen_hypervisor_callback) /* * Check to see if we got the event in the critical * region in xen_iret_direct, after we've reenabled @@ -1315,14 +1321,11 @@ SYM_FUNC_START(xen_hypervisor_callback) pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL ENCODE_FRAME_POINTER - TRACE_IRQS_OFF + mov %esp, %eax - call xen_evtchn_do_upcall -#ifndef CONFIG_PREEMPTION - call xen_maybe_preempt_hcall -#endif - jmp ret_from_intr -SYM_FUNC_END(xen_hypervisor_callback) + call xen_pv_evtchn_do_upcall + jmp handle_exception_return +SYM_FUNC_END(xen_asm_exc_xen_hypervisor_callback) /* * Hypervisor uses this for application faults while it executes. @@ -1464,6 +1467,7 @@ SYM_CODE_START_LOCAL_NOALIGN(handle_exception) movl %esp, %eax # pt_regs pointer CALL_NOSPEC edi +handle_exception_return: #ifdef CONFIG_VM86 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS movb PT_CS(%esp), %al diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 6b518be4da0a9b..dadb37d0726642 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1067,10 +1067,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt idtentry X86_TRAP_PF page_fault do_page_fault has_error_code=1 -#ifdef CONFIG_XEN_PV -idtentry 512 /* dummy */ hypervisor_callback xen_do_hypervisor_callback has_error_code=0 -#endif - /* * Reload gs selector with exception handling * edi: new selector @@ -1158,9 +1154,10 @@ SYM_FUNC_END(asm_call_on_stack) * So, on entry to the handler we detect whether we interrupted an * existing activation in its critical region -- if so, we pop the current * activation and restart the handler using the previous one.
+ * + * C calling convention: exc_xen_hypervisor_callback(struct *pt_regs) */ -/* do_hypervisor_callback(struct *pt_regs) */ -SYM_CODE_START_LOCAL(xen_do_hypervisor_callback) +SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback) /* * Since we don't modify %rdi, evtchn_do_upcall(struct *pt_regs) will @@ -1170,15 +1167,10 @@ movq %rdi, %rsp /* we don't return, adjust the stack frame */ UNWIND_HINT_REGS - ENTER_IRQ_STACK old_rsp=%r10 - call xen_evtchn_do_upcall - LEAVE_IRQ_STACK + call xen_pv_evtchn_do_upcall -#ifndef CONFIG_PREEMPTION - call xen_maybe_preempt_hcall -#endif - jmp error_exit -SYM_CODE_END(xen_do_hypervisor_callback) + jmp error_return +SYM_CODE_END(exc_xen_hypervisor_callback) /* * Hypervisor uses this for application faults while it executes. diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index b05688973b92ad..b2a5fe02dcf09e 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -165,6 +165,21 @@ __visible noinstr void func(struct pt_regs *regs) #define DEFINE_IDTENTRY_RAW_ERRORCODE(func) \ __visible noinstr void func(struct pt_regs *regs, unsigned long error_code) +/** + * DECLARE_IDTENTRY_XENCB - Declare functions for XEN HV callback entry point + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Declares three functions: + * - The ASM entry point: asm_##func + * - The XEN PV trap entry point: xen_##func (maybe unused) + * - The C handler called from the ASM entry point + * + * Maps to DECLARE_IDTENTRY(). Distinct entry point to handle the 32/64-bit + * difference + */ +#define DECLARE_IDTENTRY_XENCB(vector, func) \ + DECLARE_IDTENTRY(vector, func) #ifdef CONFIG_X86_64 /** @@ -307,6 +322,9 @@ __visible noinstr void func(struct pt_regs *regs, \ # define DECLARE_IDTENTRY_DF(vector, func) \ idtentry_df vector asm_##func func +# define DECLARE_IDTENTRY_XENCB(vector, func) \ + DECLARE_IDTENTRY(vector, func) + #else # define DECLARE_IDTENTRY_MCE(vector, func) \ DECLARE_IDTENTRY(vector, func) @@ -317,6 +335,9 @@ __visible noinstr void func(struct pt_regs *regs, \ /* No ASM emitted for DF as this goes through a C shim */ # define DECLARE_IDTENTRY_DF(vector, func) +/* No ASM emitted for XEN hypervisor callback */ +# define DECLARE_IDTENTRY_XENCB(vector, func) + #endif /* No ASM code emitted for NMI */ @@ -337,6 +358,13 @@ __visible noinstr void func(struct pt_regs *regs, \ * This avoids duplicate defines and ensures that everything is consistent. */ +/* + * Dummy trap number so the low level ASM macro vector number checks do not + * match which results in emitting plain IDTENTRY stubs without bells and + * whistles. + */ +#define X86_TRAP_OTHER 0xFFFF + /* Simple exception entry points.
No hardware error code */ DECLARE_IDTENTRY(X86_TRAP_DE, exc_divide_error); DECLARE_IDTENTRY(X86_TRAP_OF, exc_overflow); @@ -376,4 +404,10 @@ DECLARE_IDTENTRY_XEN(X86_TRAP_DB, debug); /* #DF */ DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault); +#ifdef CONFIG_XEN_PV +DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback); +#endif + +#undef X86_TRAP_OTHER + #endif diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1a2d8a50dac4b3..3566e37241d7e3 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -993,7 +994,8 @@ static void __init xen_pvmmu_arch_setup(void) HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); - if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || + if (register_callback(CALLBACKTYPE_event, + xen_asm_exc_xen_hypervisor_callback) || register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) BUG(); diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 8fa01c5454601e..171aff1b11f24e 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -348,7 +349,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif ctxt->event_callback_eip = - (unsigned long)xen_hypervisor_callback; + (unsigned long)xen_asm_exc_xen_hypervisor_callback; ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 812ff01e4e3414..4757cec33abeb5 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -93,7 +93,7 @@ xen_iret_start_crit: /* * If there's something pending, mask events again so we can - * jump back into xen_hypervisor_callback. Otherwise do not + * jump back into exc_xen_hypervisor_callback. Otherwise do not * touch XEN_vcpu_info_mask. */ jne 1f @@ -113,7 +113,7 @@ iret_restore_end: * Events are masked, so jumping out of the critical region is * OK. */ - je xen_hypervisor_callback + je xen_asm_exc_xen_hypervisor_callback 1: iret xen_iret_end_crit: @@ -127,7 +127,7 @@ SYM_CODE_END(xen_iret) .globl xen_iret_start_crit, xen_iret_end_crit /* - * This is called by xen_hypervisor_callback in entry_32.S when it sees + * This is called by xen_asm_exc_xen_hypervisor_callback in entry_32.S when it sees * that the EIP at the time of interrupt was between * xen_iret_start_crit and xen_iret_end_crit. * @@ -144,7 +144,7 @@ SYM_CODE_END(xen_iret) * eflags } * cs } nested exception info * eip } - * return address : (into xen_hypervisor_callback) + * return address : (into xen_asm_exc_xen_hypervisor_callback) * * In order to deliver the nested exception properly, we need to discard the * nested exception frame such that when we handle the exception, we do it @@ -152,7 +152,8 @@ SYM_CODE_END(xen_iret) * * The only caveat is that if the outer eax hasn't been restored yet (i.e. * it's still on stack), we need to restore its value here. - */ +*/ +.pushsection .noinstr.text, "ax" SYM_CODE_START(xen_iret_crit_fixup) /* * Paranoia: Make sure we're really coming from kernel space. 
@@ -181,3 +182,4 @@ SYM_CODE_START(xen_iret_crit_fixup) 2: ret SYM_CODE_END(xen_iret_crit_fixup) +.popsection diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index e46d863bcaa4a8..19fbbdbcbde920 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -54,7 +54,7 @@ xen_pv_trap asm_exc_simd_coprocessor_error #ifdef CONFIG_IA32_EMULATION xen_pv_trap entry_INT80_compat #endif -xen_pv_trap hypervisor_callback +xen_pv_trap asm_exc_xen_hypervisor_callback __INIT SYM_CODE_START(xen_early_idt_handler_array) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1cc1568bfe048b..ad05d058938180 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -8,7 +8,6 @@ #include /* These are code, but not functions. Defined in entry.S */ -extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; void xen_sysenter_target(void); diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 0c4efa6fe450d1..0d322f3d90cdbc 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o -obj-y += grant-table.o features.o balloon.o manage.o preempt.o time.o +obj-y += grant-table.o features.o balloon.o manage.o time.o obj-y += mem-reservation.o obj-y += events/ obj-y += xenbus/ diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c deleted file mode 100644 index 17240c5325a30c..00000000000000 --- a/drivers/xen/preempt.c +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Preemptible hypercalls - * - * Copyright (C) 2014 Citrix Systems R&D ltd. - */ - -#include -#include - -#ifndef CONFIG_PREEMPTION - -/* - * Some hypercalls issued by the toolstack can take many 10s of - * seconds. Allow tasks running hypercalls via the privcmd driver to - * be voluntarily preempted even if full kernel preemption is - * disabled. - * - * Such preemptible hypercalls are bracketed by - * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() - * calls. - */ - -DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); -EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); - -asmlinkage __visible void xen_maybe_preempt_hcall(void) -{ - if (unlikely(__this_cpu_read(xen_in_preemptible_hcall) - && need_resched())) { - /* - * Clear flag as we may be rescheduled on a different - * cpu. 
- */ - __this_cpu_write(xen_in_preemptible_hcall, false); - local_irq_enable(); - cond_resched(); - local_irq_disable(); - __this_cpu_write(xen_in_preemptible_hcall, true); - } -} -#endif /* CONFIG_PREEMPTION */ diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 095be1d66f31c8..39a5580f8feb08 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -215,17 +215,7 @@ bool xen_running_on_version_or_later(unsigned int major, unsigned int minor); void xen_efi_runtime_setup(void); -#ifdef CONFIG_PREEMPTION - -static inline void xen_preemptible_hcall_begin(void) -{ -} - -static inline void xen_preemptible_hcall_end(void) -{ -} - -#else +#if defined(CONFIG_XEN_PV) && !defined(CONFIG_PREEMPTION) DECLARE_PER_CPU(bool, xen_in_preemptible_hcall); @@ -239,6 +229,11 @@ static inline void xen_preemptible_hcall_end(void) __this_cpu_write(xen_in_preemptible_hcall, false); } -#endif /* CONFIG_PREEMPTION */ +#else + +static inline void xen_preemptible_hcall_begin(void) { } +static inline void xen_preemptible_hcall_end(void) { } + +#endif /* CONFIG_XEN_PV && !CONFIG_PREEMPTION */ #endif /* INCLUDE_XEN_OPS_H */ From 00cf8baf9c2af3c17f9d77bb9d07d44d330d0df2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:27 +0200 Subject: [PATCH 123/190] x86/entry/64: Simplify idtentry_body All C functions which do not have an error code have been converted to the new IDTENTRY interface which does not expect an error code in the arguments. Spare the XORL. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202118.145811853@linutronix.de --- arch/x86/entry/entry_64.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index dadb37d0726642..b70c7788ef0894 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -531,8 +531,6 @@ SYM_CODE_END(spurious_entries_start) .if \has_error_code == 1 movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi, %esi /* Clear the error code */ .endif .if \vector == X86_TRAP_PF From 91eeafea1e4b7c95cc4f38af186d7d48fceef89a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:28 +0200 Subject: [PATCH 124/190] x86/entry: Switch page fault exception to IDTENTRY_RAW Convert page fault exceptions to IDTENTRY_RAW: - Implement the C entry point with DEFINE_IDTENTRY_RAW - Add the CR2 read into the exception handler - Add the idtentry_enter/exit_cond_rcu() invocations in the regular page fault handler and in the async PF part. - Emit the ASM stub with DECLARE_IDTENTRY_RAW - Remove the ASM idtentry in 64-bit - Remove the CR2 read from 64-bit - Remove the open coded ASM entry code in 32-bit - Fix up the XEN/PV code - Remove the old prototypes No functional change.
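As an illustration of the conversion pattern, the contract between idtentry_enter_cond_rcu() and idtentry_exit_cond_rcu() can be modelled in a few lines of stand-alone C. This is a sketch, not kernel code: rcu_watching is a plain bool standing in for the per-CPU RCU state, and the user-mode leg is omitted.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for "is RCU watching this CPU right now?" */
static bool rcu_watching;

/*
 * Model of idtentry_enter_cond_rcu(): returns true only when it had to
 * switch RCU on, so the exit path knows whether to switch it back off.
 */
static bool enter_cond_rcu(void)
{
        if (!rcu_watching) {
                rcu_watching = true;    /* rcu_irq_enter() */
                return true;
        }
        return false;                   /* RCU was already watching */
}

static void exit_cond_rcu(bool rcu_exit)
{
        if (rcu_exit)
                rcu_watching = false;   /* rcu_irq_exit() */
}

static void exception(void)
{
        bool rcu_exit = enter_cond_rcu();

        printf("handler runs, rcu_watching=%d\n", rcu_watching);

        /* Must be fed the enter function's return value, as in exc_page_fault(). */
        exit_cond_rcu(rcu_exit);
}

int main(void)
{
        rcu_watching = false;   /* e.g. a fault out of an RCU-idle region */
        exception();
        printf("after exit: rcu_watching=%d (undone)\n", rcu_watching);

        rcu_watching = true;    /* normal kernel context */
        exception();
        printf("after exit: rcu_watching=%d (left alone)\n", rcu_watching);
        return 0;
}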
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202118.238455120@linutronix.de --- arch/x86/entry/entry_32.S | 30 -------------- arch/x86/entry/entry_64.S | 19 --------- arch/x86/include/asm/idtentry.h | 3 +- arch/x86/include/asm/traps.h | 11 ------ arch/x86/kernel/idt.c | 4 +- arch/x86/kernel/kvm.c | 15 ++++--- arch/x86/mm/fault.c | 69 +++++++++++++++++++++++---------- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 9 files changed, 63 insertions(+), 92 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 3ab04dca9aab02..660ed3ed37dc50 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1398,36 +1398,6 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, #endif /* CONFIG_HYPERV */ -SYM_CODE_START(page_fault) - ASM_CLAC - pushl $do_page_fault - jmp common_exception_read_cr2 -SYM_CODE_END(page_fault) - -SYM_CODE_START_LOCAL_NOALIGN(common_exception_read_cr2) - /* the function address is in %gs's slot on the stack */ - SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 - - ENCODE_FRAME_POINTER - - /* fixup %gs */ - GS_TO_REG %ecx - movl PT_GS(%esp), %edi - REG_TO_PTGS %ecx - SET_KERNEL_GS %ecx - - GET_CR2_INTO(%ecx) # might clobber %eax - - /* fixup orig %eax */ - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - - TRACE_IRQS_OFF - movl %esp, %eax # pt_regs pointer - CALL_NOSPEC edi - jmp ret_from_exception -SYM_CODE_END(common_exception_read_cr2) - SYM_CODE_START_LOCAL_NOALIGN(common_exception) /* the function address is in %gs's slot on the stack */ SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b70c7788ef0894..5789f76932b6cb 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -506,15 +506,6 @@ SYM_CODE_END(spurious_entries_start) call error_entry UNWIND_HINT_REGS - .if \vector == X86_TRAP_PF - /* - * Store CR2 early so subsequent faults cannot clobber it. Use R12 as - * intermediate storage as RDX can be clobbered in enter_from_user_mode(). - * GET_CR2_INTO can clobber RAX. - */ - GET_CR2_INTO(%r12); - .endif - .if \sane == 0 TRACE_IRQS_OFF @@ -533,10 +524,6 @@ SYM_CODE_END(spurious_entries_start) movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .endif - .if \vector == X86_TRAP_PF - movq %r12, %rdx /* Move CR2 into 3rd argument */ - .endif - call \cfunc .if \sane == 0 @@ -1059,12 +1046,6 @@ apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt #endif -/* - * Exception entry points. 
- */ - -idtentry X86_TRAP_PF page_fault do_page_fault has_error_code=1 - /* * Reload gs selector with exception handling * edi: new selector diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index b2a5fe02dcf09e..9ec5466e4c0515 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -387,7 +387,8 @@ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP, exc_general_protection); DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC, exc_alignment_check); /* Raw exception entries which need extra work */ -DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); +DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); +DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault); #ifdef CONFIG_X86_MCE DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index f5a2e438a8784d..d7de360eec74cd 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -9,17 +9,6 @@ #include #include /* TRAP_TRACE, ... */ -#define dotraplinkage __visible - -asmlinkage void page_fault(void); -asmlinkage void async_page_fault(void); - -#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) -asmlinkage void xen_page_fault(void); -#endif - -dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); - #ifdef CONFIG_X86_64 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index ec55479e1dd154..ddb11154aeeeb3 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -62,7 +62,7 @@ static const __initconst struct idt_data early_idts[] = { INTG(X86_TRAP_DB, asm_exc_debug), SYSG(X86_TRAP_BP, asm_exc_int3), #ifdef CONFIG_X86_32 - INTG(X86_TRAP_PF, page_fault), + INTG(X86_TRAP_PF, asm_exc_page_fault), #endif }; @@ -156,7 +156,7 @@ static const __initconst struct idt_data apic_idts[] = { * stacks work only after cpu_init(). */ static const __initconst struct idt_data early_pf_idts[] = { - INTG(X86_TRAP_PF, page_fault), + INTG(X86_TRAP_PF, asm_exc_page_fault), }; /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d6f22a3a1f7da4..d00f7c430e65a9 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -218,7 +218,7 @@ void kvm_async_pf_task_wake(u32 token) } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); -u32 kvm_read_and_reset_apf_flags(void) +noinstr u32 kvm_read_and_reset_apf_flags(void) { u32 flags = 0; @@ -230,11 +230,11 @@ u32 kvm_read_and_reset_apf_flags(void) return flags; } EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); -NOKPROBE_SYMBOL(kvm_read_and_reset_apf_flags); -bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) +noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { u32 reason = kvm_read_and_reset_apf_flags(); + bool rcu_exit; switch (reason) { case KVM_PV_REASON_PAGE_NOT_PRESENT: @@ -244,6 +244,9 @@ bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) return false; } + rcu_exit = idtentry_enter_cond_rcu(regs); + instrumentation_begin(); + /* * If the host managed to inject an async #PF into an interrupt * disabled region, then die hard as this is not going to end well @@ -258,13 +261,13 @@ bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) /* Page is swapped out by the host. 
*/ kvm_async_pf_task_wait_schedule(token); } else { - rcu_irq_enter(); kvm_async_pf_task_wake(token); - rcu_irq_exit(); } + + instrumentation_end(); + idtentry_exit_cond_rcu(regs, rcu_exit); return true; } -NOKPROBE_SYMBOL(__kvm_handle_async_pf); static void __init paravirt_ops_setup(void) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index d7b52a2a1bce49..eef29bb53cd0b9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1357,11 +1357,38 @@ trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, trace_page_fault_kernel(address, regs, error_code); } -dotraplinkage void -do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, - unsigned long address) +static __always_inline void +handle_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + trace_page_fault_entries(regs, error_code, address); + + if (unlikely(kmmio_fault(regs, address))) + return; + + /* Was the fault on kernel-controlled part of the address space? */ + if (unlikely(fault_in_kernel_space(address))) { + do_kern_addr_fault(regs, error_code, address); + } else { + do_user_addr_fault(regs, error_code, address); + /* + * User address page fault handling might have reenabled + * interrupts. Fixing up all potential exit points of + * do_user_addr_fault() and its leaf functions is just not + * doable w/o creating an unholy mess or turning the code + * upside down. + */ + local_irq_disable(); + } +} + +DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { + unsigned long address = read_cr2(); + bool rcu_exit; + prefetchw(¤t->mm->mmap_lock); + /* * KVM has two types of events that are, logically, interrupts, but * are unfortunately delivered using the #PF vector. These events are @@ -1376,28 +1403,28 @@ do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, * getting values from real and async page faults mixed up. * * Fingers crossed. + * + * The async #PF handling code takes care of idtentry handling + * itself. */ if (kvm_handle_async_pf(regs, (u32)address)) return; - trace_page_fault_entries(regs, hw_error_code, address); + /* + * Entry handling for valid #PF from kernel mode is slightly + * different: RCU is already watching and rcu_irq_enter() must not + * be invoked because a kernel fault on a user space address might + * sleep. + * + * In case the fault hit a RCU idle region the conditional entry + * code reenabled RCU to avoid subsequent wreckage which helps + * debugability. + */ + rcu_exit = idtentry_enter_cond_rcu(regs); - if (unlikely(kmmio_fault(regs, address))) - return; + instrumentation_begin(); + handle_page_fault(regs, error_code, address); + instrumentation_end(); - /* Was the fault on kernel-controlled part of the address space? */ - if (unlikely(fault_in_kernel_space(address))) { - do_kern_addr_fault(regs, hw_error_code, address); - } else { - do_user_addr_fault(regs, hw_error_code, address); - /* - * User address page fault handling might have reenabled - * interrupts. Fixing up all potential exit points of - * do_user_addr_fault() and its leaf functions is just not - * doable w/o creating an unholy mess or turning the code - * upside down. 
- */ - local_irq_disable(); - } + idtentry_exit_cond_rcu(regs, rcu_exit); } -NOKPROBE_SYMBOL(do_page_fault); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 008291121cb4ee..33b309d659559e 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -626,7 +626,7 @@ static struct trap_array_entry trap_array[] = { #ifdef CONFIG_IA32_EMULATION { entry_INT80_compat, xen_entry_INT80_compat, false }, #endif - { page_fault, xen_page_fault, false }, + TRAP_ENTRY(exc_page_fault, false ), TRAP_ENTRY(exc_divide_error, false ), TRAP_ENTRY(exc_bounds, false ), TRAP_ENTRY(exc_invalid_op, false ), diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 19fbbdbcbde920..5d252aaeade8b3 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -43,7 +43,7 @@ xen_pv_trap asm_exc_invalid_tss xen_pv_trap asm_exc_segment_not_present xen_pv_trap asm_exc_stack_segment xen_pv_trap asm_exc_general_protection -xen_pv_trap page_fault +xen_pv_trap asm_exc_page_fault xen_pv_trap asm_exc_spurious_interrupt_bug xen_pv_trap asm_exc_coprocessor_error xen_pv_trap asm_exc_alignment_check From e2dcb5f1390715244aec12dbd6f294863ca37b88 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:29 +0200 Subject: [PATCH 125/190] x86/entry: Remove the transition leftovers Now that all exceptions are converted over, the sane flag is no longer needed. Also the vector argument of idtentry_body on 64-bit is pointless now. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202118.331115895@linutronix.de --- arch/x86/entry/entry_32.S | 3 +-- arch/x86/entry/entry_64.S | 26 ++++---------------------- arch/x86/include/asm/idtentry.h | 6 +++--- 3 files changed, 8 insertions(+), 27 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 660ed3ed37dc50..6c6ae3a8c1fc02 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -734,9 +734,8 @@ * @asmsym: ASM symbol for the entry point * @cfunc: C function to be called * @has_error_code: Hardware pushed error code on stack - * @sane: Compatibility flag with 64bit */ -.macro idtentry vector asmsym cfunc has_error_code:req sane=0 +.macro idtentry vector asmsym cfunc has_error_code:req SYM_CODE_START(\asmsym) ASM_CLAC cld diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5789f76932b6cb..2e476f488acef6 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -496,27 +496,14 @@ SYM_CODE_END(spurious_entries_start) /** * idtentry_body - Macro to emit code calling the C function - * @vector: Vector number * @cfunc: C function to be called * @has_error_code: Hardware pushed error code on stack - * @sane: Sane variant which handles irq tracing, context tracking in C */ -.macro idtentry_body vector cfunc has_error_code:req sane=0 +.macro idtentry_body cfunc has_error_code:req call error_entry UNWIND_HINT_REGS - .if \sane == 0 - TRACE_IRQS_OFF - -#ifdef CONFIG_CONTEXT_TRACKING - testb $3, CS(%rsp) - jz .Lfrom_kernel_no_ctxt_tracking_\@ - CALL_enter_from_user_mode -.Lfrom_kernel_no_ctxt_tracking_\@: -#endif - .endif - movq %rsp, %rdi /* pt_regs pointer into 1st argument*/ .if \has_error_code == 1 @@ -526,11 +513,7 @@ SYM_CODE_END(spurious_entries_start) call \cfunc - .if \sane == 0 - jmp error_exit - .else jmp error_return - .endif .endm /** @@ -539,12 +522,11 @@ SYM_CODE_END(spurious_entries_start) * @asmsym: ASM symbol for the entry point * @cfunc: C
function to be called * @has_error_code: Hardware pushed error code on stack - * @sane: Sane variant which handles irq tracing, context tracking in C * * The macro emits code to set up the kernel context for straight forward * and simple IDT entries. No IST stack, no paranoid entry checks. */ -.macro idtentry vector asmsym cfunc has_error_code:req sane=0 +.macro idtentry vector asmsym cfunc has_error_code:req SYM_CODE_START(\asmsym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 ASM_CLAC @@ -567,7 +549,7 @@ SYM_CODE_START(\asmsym) .Lfrom_usermode_no_gap_\@: .endif - idtentry_body \vector \cfunc \has_error_code \sane + idtentry_body \cfunc \has_error_code _ASM_NOKPROBE(\asmsym) SYM_CODE_END(\asmsym) @@ -642,7 +624,7 @@ SYM_CODE_START(\asmsym) /* Switch to the regular task stack and use the noist entry point */ .Lfrom_usermode_switch_stack_\@: - idtentry_body vector noist_\cfunc, has_error_code=0 sane=1 + idtentry_body noist_\cfunc, has_error_code=0 _ASM_NOKPROBE(\asmsym) SYM_CODE_END(\asmsym) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 9ec5466e4c0515..36e5b929389b6c 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -298,10 +298,10 @@ __visible noinstr void func(struct pt_regs *regs, \ * The ASM variants for DECLARE_IDTENTRY*() which emit the ASM entry stubs. */ #define DECLARE_IDTENTRY(vector, func) \ - idtentry vector asm_##func func has_error_code=0 sane=1 + idtentry vector asm_##func func has_error_code=0 #define DECLARE_IDTENTRY_ERRORCODE(vector, func) \ - idtentry vector asm_##func func has_error_code=1 sane=1 + idtentry vector asm_##func func has_error_code=1 /* Special case for 32bit IRET 'trap'. Do not emit ASM code */ #define DECLARE_IDTENTRY_SW(vector, func) @@ -345,7 +345,7 @@ __visible noinstr void func(struct pt_regs *regs, \ /* XEN NMI and DB wrapper */ #define DECLARE_IDTENTRY_XEN(vector, func) \ - idtentry vector asm_exc_xen##func exc_##func has_error_code=0 sane=1 + idtentry vector asm_exc_xen##func exc_##func has_error_code=0 #endif /* __ASSEMBLY__ */ From e88d974136dbb5d6962eeb63075900603e737a1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:30 +0200 Subject: [PATCH 126/190] x86/entry: Change exit path of xen_failsafe_callback xen_failsafe_callback() is invoked from XEN for two cases: 1. Fault while reloading DS, ES, FS or GS 2. Fault while executing IRET #1 retries the IRET after XEN has fixed up the segments. #2 injects a #GP which kills the task For #1 there is no reason to go through the full exception return path because the tasks TIF state is still the same. So just going straight to the IRET path is good enough. 
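For reference, the hypercall exit dance introduced by patches 121 and 122 above hinges on one small piece of state handling: clear the per-CPU "inside a preemptible hypercall" flag before any possible reschedule, then put it back afterwards. A stand-alone C sketch of that dance, with the per-CPU flag and the scheduler reduced to plain variables and the kernel names kept only for readability:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the per-CPU flag and the scheduler state. */
static bool xen_in_preemptible_hcall;
static bool need_resched_flag = true;

static bool get_and_clear_inhcall(void)
{
        bool inhcall = xen_in_preemptible_hcall;

        /*
         * Clear before a possible reschedule: the task may resume on a
         * different CPU, whose own copy of the flag must stay untouched.
         */
        xen_in_preemptible_hcall = false;
        return inhcall;
}

static void restore_inhcall(bool inhcall)
{
        xen_in_preemptible_hcall = inhcall;
}

/* Models idtentry_exit_cond_resched(): reschedule only when allowed. */
static void exit_cond_resched(bool may_sched, int preempt_count)
{
        if (may_sched && !preempt_count && need_resched_flag) {
                printf("  preempt_schedule_irq()\n");
                need_resched_flag = false;
        }
}

static void upcall_exit(void)
{
        bool inhcall = get_and_clear_inhcall();

        if (inhcall)
                exit_cond_resched(true, 0);
        else
                printf("  plain exit, no voluntary preemption\n");

        restore_inhcall(inhcall);
}

int main(void)
{
        xen_in_preemptible_hcall = true;        /* xen_preemptible_hcall_begin() */
        puts("exit inside a preemptible hypercall:");
        upcall_exit();
        xen_in_preemptible_hcall = false;       /* xen_preemptible_hcall_end() */

        need_resched_flag = true;
        puts("exit outside any hypercall:");
        upcall_exit();
        return 0;
}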
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202118.423224507@linutronix.de --- arch/x86/entry/entry_32.S | 2 +- arch/x86/entry/entry_64.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 6c6ae3a8c1fc02..6fcdee9feba0e8 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1355,7 +1355,7 @@ SYM_FUNC_START(xen_failsafe_callback) 5: pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL ENCODE_FRAME_POINTER - jmp ret_from_exception + jmp handle_exception_return .section .fixup, "ax" 6: xorl %eax, %eax diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2e476f488acef6..a526fb57b65d51 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1175,7 +1175,7 @@ SYM_CODE_START(xen_failsafe_callback) pushq $-1 /* orig_ax = -1 => not a system call */ PUSH_AND_CLEAR_REGS ENCODE_FRAME_POINTER - jmp error_exit + jmp error_return SYM_CODE_END(xen_failsafe_callback) #endif /* CONFIG_XEN_PV */ From 23d73f2ad4e7e7ce8bebdd1e138c8c79439dc301 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:31 +0200 Subject: [PATCH 127/190] x86/entry/64: Remove error_exit() No more users. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202118.516757524@linutronix.de --- arch/x86/entry/entry_64.S | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a526fb57b65d51..76993591fdf60d 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1356,15 +1356,6 @@ SYM_CODE_START_LOCAL(error_entry) jmp .Lerror_entry_from_usermode_after_swapgs SYM_CODE_END(error_entry) -SYM_CODE_START_LOCAL(error_exit) - UNWIND_HINT_REGS - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - testb $3, CS(%rsp) - jz retint_kernel - jmp .Lretint_user -SYM_CODE_END(error_exit) - SYM_CODE_START_LOCAL(error_return) UNWIND_HINT_REGS DEBUG_ENTRY_ASSERT_IRQS_OFF From 74ebed3193aa4964b6cb9d146c9c01c7759ef4f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:32 +0200 Subject: [PATCH 128/190] x86/entry/32: Remove common_exception() No more users. 
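Looking ahead one patch: the x86-private irq_regs accessors about to be removed implement a textbook save/install/restore pattern, so losing the private copy costs nothing. A user-space sketch of the pattern follows; a plain global stands in for the per-CPU pointer and the names are kept only for readability.

#include <stdio.h>

struct pt_regs { unsigned long ip; };

/* Stand-in for the per-CPU __irq_regs pointer of the generic code. */
static struct pt_regs *irq_regs;

static struct pt_regs *get_irq_regs(void)
{
        return irq_regs;
}

/*
 * Install the regs of the interrupted context and hand back the previous
 * value so a nested interrupt can restore it on the way out.
 */
static struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
{
        struct pt_regs *old_regs = get_irq_regs();

        irq_regs = new_regs;
        return old_regs;
}

static void irq_handler(struct pt_regs *regs)
{
        struct pt_regs *old_regs = set_irq_regs(regs);

        printf("handling irq, interrupted at ip=0x%lx\n", get_irq_regs()->ip);

        set_irq_regs(old_regs);         /* restore for the outer context */
}

int main(void)
{
        struct pt_regs outer = { .ip = 0x1000 };
        struct pt_regs nested = { .ip = 0x2000 };

        irq_handler(&outer);
        irq_handler(&nested);
        printf("after the handlers: irq_regs=%p (restored)\n",
               (void *)get_irq_regs());
        return 0;
}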
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202118.611906966@linutronix.de --- arch/x86/entry/entry_32.S | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 6fcdee9feba0e8..158a5250ebc57f 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1397,27 +1397,6 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, #endif /* CONFIG_HYPERV */ -SYM_CODE_START_LOCAL_NOALIGN(common_exception) - /* the function address is in %gs's slot on the stack */ - SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 - ENCODE_FRAME_POINTER - - /* fixup %gs */ - GS_TO_REG %ecx - movl PT_GS(%esp), %edi # get the function address - REG_TO_PTGS %ecx - SET_KERNEL_GS %ecx - - /* fixup orig %eax */ - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - - TRACE_IRQS_OFF - movl %esp, %eax # pt_regs pointer - CALL_NOSPEC edi - jmp ret_from_exception -SYM_CODE_END(common_exception) - SYM_CODE_START_LOCAL_NOALIGN(handle_exception) /* the function address is in %gs's slot on the stack */ SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 From 79b9c183021ef3f5ca2d5168cd3fd442580eca09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:33 +0200 Subject: [PATCH 129/190] x86/irq: Use generic irq_regs implementation The only difference is the name of the per-CPU variable: irq_regs vs. __irq_regs, but the accessor functions are identical. Remove the pointless copy and use the generic variant. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202118.704169051@linutronix.de --- arch/x86/include/asm/irq_regs.h | 32 -------------------------------- arch/x86/kernel/irq.c | 3 --- 2 files changed, 35 deletions(-) delete mode 100644 arch/x86/include/asm/irq_regs.h diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h deleted file mode 100644 index 187ce59aea28e4..00000000000000 --- a/arch/x86/include/asm/irq_regs.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Per-cpu current frame pointer - the location of the last exception frame on - * the stack, stored in the per-cpu area. 
- * - * Jeremy Fitzhardinge -*/ -#ifndef _ASM_X86_IRQ_REGS_H -#define _ASM_X86_IRQ_REGS_H - -#include - -#define ARCH_HAS_OWN_IRQ_REGS - -DECLARE_PER_CPU(struct pt_regs *, irq_regs); - -static inline struct pt_regs *get_irq_regs(void) -{ - return __this_cpu_read(irq_regs); -} - -static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) -{ - struct pt_regs *old_regs; - - old_regs = get_irq_regs(); - __this_cpu_write(irq_regs, new_regs); - - return old_regs; -} - -#endif /* _ASM_X86_IRQ_REGS_32_H */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c7965ff429c56e..252065d32ab534 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -26,9 +26,6 @@ DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - atomic_t irq_err_count; /* From 633260fa143bbed05e65dc557a492667dfdc45bb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:34 +0200 Subject: [PATCH 130/190] x86/irq: Convey vector as argument and not in ptregs Device interrupts which go through do_IRQ() or the spurious interrupt handler have their separate entry code on 64 bit for no good reason. Both 32 and 64 bit transport the vector number through ORIG_[RE]AX in pt_regs. Further, the vector number is forced to fit into a u8 and is complemented and offset by 0x80 so it's in the signed character range. Otherwise GAS would expand the pushq to a 5 byte instruction for any vector > 0x7F. Treat the vector number like an error code and hand it to the C function as an argument. This allows getting rid of the extra entry code in a later step. Simplify the error code push magic by implementing the pushq imm8 via a '.byte 0x6a, vector' sequence so GAS is not able to screw it up. As the pushq imm8 is sign extending, the resulting error code needs to be truncated to 8 bits in C code. Originally-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202118.796915981@linutronix.de --- arch/x86/entry/calling.h | 5 +++- arch/x86/entry/entry_32.S | 33 +++------------------ arch/x86/entry/entry_64.S | 40 +++++--------------------- arch/x86/include/asm/entry_arch.h | 2 +- arch/x86/include/asm/hw_irq.h | 1 + arch/x86/include/asm/idtentry.h | 48 +++++++++++++++++++++++++++++++ arch/x86/include/asm/irq.h | 2 +- arch/x86/include/asm/traps.h | 3 +- arch/x86/kernel/apic/apic.c | 31 ++++++++++++++++---- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/irq.c | 14 +++++---- 11 files changed, 103 insertions(+), 78 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 1c7f13bb672862..98da0d3c0b1a76 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -341,7 +341,10 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm -#endif /* CONFIG_X86_64 */ +#else /* CONFIG_X86_64 */ +# undef UNWIND_HINT_IRET_REGS +# define UNWIND_HINT_IRET_REGS +#endif /* !CONFIG_X86_64 */ .macro STACKLEAK_ERASE #ifdef CONFIG_GCC_PLUGIN_STACKLEAK diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 158a5250ebc57f..40092c81dcb81f 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1215,40 +1215,15 @@ SYM_FUNC_END(entry_INT80_32) #endif .endm -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block.
- */ - .align 8 -SYM_CODE_START(irq_entries_start) - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushl $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 - jmp common_interrupt - .align 8 - .endr -SYM_CODE_END(irq_entries_start) - #ifdef CONFIG_X86_LOCAL_APIC - .align 8 -SYM_CODE_START(spurious_entries_start) - vector=FIRST_SYSTEM_VECTOR - .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) - pushl $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 - jmp common_spurious - .align 8 - .endr -SYM_CODE_END(spurious_entries_start) - SYM_CODE_START_LOCAL(common_spurious) ASM_CLAC - addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ SAVE_ALL switch_stacks=1 ENCODE_FRAME_POINTER TRACE_IRQS_OFF movl %esp, %eax + movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */ + movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */ call smp_spurious_interrupt jmp ret_from_intr SYM_CODE_END(common_spurious) @@ -1261,12 +1236,12 @@ SYM_CODE_END(common_spurious) .p2align CONFIG_X86_L1_CACHE_SHIFT SYM_CODE_START_LOCAL(common_interrupt) ASM_CLAC - addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ - SAVE_ALL switch_stacks=1 ENCODE_FRAME_POINTER TRACE_IRQS_OFF movl %esp, %eax + movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */ + movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */ call do_IRQ jmp ret_from_intr SYM_CODE_END(common_interrupt) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 76993591fdf60d..e7434cda9a381e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -358,34 +358,6 @@ SYM_CODE_START(ret_from_fork) SYM_CODE_END(ret_from_fork) .popsection -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block. - */ - .align 8 -SYM_CODE_START(irq_entries_start) - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - UNWIND_HINT_IRET_REGS - pushq $(~vector+0x80) /* Note: always in signed byte range */ - jmp common_interrupt - .align 8 - vector=vector+1 - .endr -SYM_CODE_END(irq_entries_start) - - .align 8 -SYM_CODE_START(spurious_entries_start) - vector=FIRST_SYSTEM_VECTOR - .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) - UNWIND_HINT_IRET_REGS - pushq $(~vector+0x80) /* Note: always in signed byte range */ - jmp common_spurious - .align 8 - vector=vector+1 - .endr -SYM_CODE_END(spurious_entries_start) - .macro DEBUG_ENTRY_ASSERT_IRQS_OFF #ifdef CONFIG_DEBUG_ENTRY pushq %rax @@ -755,13 +727,14 @@ _ASM_NOKPROBE(interrupt_entry) /* Interrupt entry/exit. */ /* - * The interrupt stubs push (~vector+0x80) onto the stack and + * The interrupt stubs push vector onto the stack and * then jump to common_spurious/interrupt. */ SYM_CODE_START_LOCAL(common_spurious) - addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ call interrupt_entry UNWIND_HINT_REGS indirect=1 + movq ORIG_RAX(%rdi), %rsi /* get vector from stack */ + movq $-1, ORIG_RAX(%rdi) /* no syscall to restart */ call smp_spurious_interrupt /* rdi points to pt_regs */ jmp ret_from_intr SYM_CODE_END(common_spurious) @@ -770,10 +743,11 @@ _ASM_NOKPROBE(common_spurious) /* common_interrupt is a hotpath. 
Align it */ .p2align CONFIG_X86_L1_CACHE_SHIFT SYM_CODE_START_LOCAL(common_interrupt) - addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ call interrupt_entry UNWIND_HINT_REGS indirect=1 - call do_IRQ /* rdi points to pt_regs */ + movq ORIG_RAX(%rdi), %rsi /* get vector from stack */ + movq $-1, ORIG_RAX(%rdi) /* no syscall to restart */ + call do_IRQ /* rdi points to pt_regs */ /* 0(%rsp): old RSP */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_ANY) @@ -1022,7 +996,7 @@ apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt #endif apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt -apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR spurious_apic_interrupt smp_spurious_apic_interrupt #ifdef CONFIG_IRQ_WORK apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 416422762845b3..cd57ce6134c912 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -35,7 +35,7 @@ BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR) BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) -BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) +BUILD_INTERRUPT(spurious_apic_interrupt,SPURIOUS_APIC_VECTOR) BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_IRQ_WORK diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 4154bc5f6a4ed0..0ffe80792b2d3e 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -39,6 +39,7 @@ extern asmlinkage void irq_work_interrupt(void); extern asmlinkage void uv_bau_message_intr1(void); extern asmlinkage void spurious_interrupt(void); +extern asmlinkage void spurious_apic_interrupt(void); extern asmlinkage void thermal_interrupt(void); extern asmlinkage void reschedule_interrupt(void); diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 36e5b929389b6c..2fc0dc8af2a4c4 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -347,6 +347,54 @@ __visible noinstr void func(struct pt_regs *regs, \ #define DECLARE_IDTENTRY_XEN(vector, func) \ idtentry vector asm_exc_xen##func exc_##func has_error_code=0 +/* + * ASM code to emit the common vector entry stubs where each stub is + * packed into 8 bytes. + * + * Note, that the 'pushq imm8' is emitted via '.byte 0x6a, vector' because + * GCC treats the local vector variable as unsigned int and would expand + * all vectors above 0x7F to a 5 byte push. The original code did an + * adjustment of the vector number to be in the signed byte range to avoid + * this. While clever it's mindboggling counterintuitive and requires the + * odd conversion back to a real vector number in the C entry points. Using + * .byte achieves the same thing and the only fixup needed in the C entry + * point is to mask off the bits above bit 7 because the push is sign + * extending. + */ + .align 8 +SYM_CODE_START(irq_entries_start) + vector=FIRST_EXTERNAL_VECTOR + pos = . + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + UNWIND_HINT_IRET_REGS + .byte 0x6a, vector + jmp common_interrupt + nop + /* Ensure that the above is 8 bytes max */ + . 
= pos + 8 + pos=pos+8 + vector=vector+1 + .endr +SYM_CODE_END(irq_entries_start) + +#ifdef CONFIG_X86_LOCAL_APIC + .align 8 +SYM_CODE_START(spurious_entries_start) + vector=FIRST_SYSTEM_VECTOR + pos = . + .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) + UNWIND_HINT_IRET_REGS + .byte 0x6a, vector + jmp common_spurious + nop + /* Ensure that the above is 8 bytes max */ + . = pos + 8 + pos=pos+8 + vector=vector+1 + .endr +SYM_CODE_END(spurious_entries_start) +#endif + #endif /* __ASSEMBLY__ */ /* diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 72fba0eeeb3074..74690a373c5879 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -36,7 +36,7 @@ extern void native_init_IRQ(void); extern void handle_irq(struct irq_desc *desc, struct pt_regs *regs); -extern __visible void do_IRQ(struct pt_regs *regs); +extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector); extern void init_ISA_irqs(void); diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index d7de360eec74cd..32b2becf7806cc 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -41,8 +41,9 @@ asmlinkage void smp_deferred_error_interrupt(struct pt_regs *regs); #endif void smp_apic_timer_interrupt(struct pt_regs *regs); -void smp_spurious_interrupt(struct pt_regs *regs); void smp_error_interrupt(struct pt_regs *regs); +void smp_spurious_apic_interrupt(struct pt_regs *regs); +void smp_spurious_interrupt(struct pt_regs *regs, unsigned long vector); asmlinkage void smp_irq_move_cleanup_interrupt(void); #ifdef CONFIG_VMAP_STACK diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4b1d31be50b4a1..6c2b807a7eaead 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2120,15 +2120,29 @@ void __init register_lapic_address(unsigned long address) * Local APIC interrupts */ -/* - * This interrupt should _never_ happen with our APIC/SMP architecture +/** + * smp_spurious_interrupt - Catch all for interrupts raised on unused vectors + * @regs: Pointer to pt_regs on stack + * @error_code: The vector number is in the lower 8 bits + * + * This is invoked from ASM entry code to catch all interrupts which + * trigger on an entry which is routed to the common_spurious idtentry + * point. + * + * Also called from smp_spurious_apic_interrupt(). */ -__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs, + unsigned long vector) { - u8 vector = ~regs->orig_ax; u32 v; entering_irq(); + /* + * The push in the entry ASM code which stores the vector number on + * the stack in the error code slot is sign expanding. Just use the + * lower 8 bits. + */ + vector &= 0xFF; trace_spurious_apic_entry(vector); inc_irq_stat(irq_spurious_count); @@ -2149,11 +2163,11 @@ __visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) */ v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1)); if (v & (1 << (vector & 0x1f))) { - pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n", + pr_info("Spurious interrupt (vector 0x%02lx) on CPU#%d. Acked\n", vector, smp_processor_id()); ack_APIC_irq(); } else { - pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n", + pr_info("Spurious interrupt (vector 0x%02lx) on CPU#%d. 
Not pending!\n", vector, smp_processor_id()); } out: @@ -2161,6 +2175,11 @@ __visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) exiting_irq(); } +__visible void smp_spurious_apic_interrupt(struct pt_regs *regs) +{ + smp_spurious_interrupt(regs, SPURIOUS_APIC_VECTOR); +} + /* * This interrupt should never happen with our APIC/SMP architecture */ diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index ddb11154aeeeb3..20408e31c18de3 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -145,7 +145,7 @@ static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_X86_UV INTG(UV_BAU_MESSAGE, uv_bau_message_intr1), #endif - INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt), + INTG(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt), INTG(ERROR_APIC_VECTOR, error_interrupt), #endif }; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 252065d32ab534..c7669363251a2b 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -227,14 +227,18 @@ u64 arch_irq_stat(void) * SMP cross-CPU interrupts have their own specific * handlers). */ -__visible void __irq_entry do_IRQ(struct pt_regs *regs) +__visible void __irq_entry do_IRQ(struct pt_regs *regs, unsigned long vector) { struct pt_regs *old_regs = set_irq_regs(regs); - struct irq_desc * desc; - /* high bit used in ret_from_ code */ - unsigned vector = ~regs->orig_ax; + struct irq_desc *desc; entering_irq(); + /* + * The push in the entry ASM code which stores the vector number on + * the stack in the error code slot is sign expanding. Just use the + * lower 8 bits. + */ + vector &= 0xFF; /* entering_irq() tells RCU that we're not quiescent. Check it. */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); @@ -249,7 +253,7 @@ __visible void __irq_entry do_IRQ(struct pt_regs *regs) ack_APIC_irq(); if (desc == VECTOR_UNUSED) { - pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n", + pr_emerg_ratelimited("%s: %d.%lu No irq handler for vector\n", __func__, smp_processor_id(), vector); } else { From 7c2a57364cae0f2e070a27d728f1df6844ffff56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:35 +0200 Subject: [PATCH 131/190] x86/irq: Rework handle_irq() for 64-bit To consolidate the interrupt entry/exit code vs. the other exceptions make handle_irq() an inline and handle both 64-bit and 32-bit mode. Preparatory change to move irq stack switching for 64-bit to C which allows consolidating the entry/exit handling by reusing the idtentry machinery both in ASM and C.
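[ Note: run_on_irqstack_cond() is provided by an earlier patch in this
  series. As an illustrative sketch only (approximate, details elided),
  it behaves like:

	static __always_inline void run_on_irqstack_cond(void *func, void *arg,
							 struct pt_regs *regs)
	{
		void (*__func)(void *arg) = func;

		/*
		 * Switch to the hard IRQ stack only when the entry came
		 * from kernel mode and the IRQ stack is not already in
		 * use; otherwise the current stack is fine.
		 */
		if (!user_mode(regs) && !irqstack_active())
			__run_on_irqstack(__func, arg);
		else
			__func(arg);
	}

  With that, the new inline handle_irq() below merely selects between
  the 64-bit path (conditional stack switch in C) and the 32-bit path
  (__handle_irq(), which does its own stack switching). ]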
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202118.889972748@linutronix.de --- arch/x86/include/asm/irq.h | 2 +- arch/x86/kernel/irq.c | 11 ++++++++++- arch/x86/kernel/irq_32.c | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 74690a373c5879..67aa1e2a5b4aeb 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -34,7 +34,7 @@ extern __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs); extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); -extern void handle_irq(struct irq_desc *desc, struct pt_regs *regs); +extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c7669363251a2b..5495ea4debba3a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -221,6 +222,14 @@ u64 arch_irq_stat(void) return sum; } +static __always_inline void handle_irq(struct irq_desc *desc, + struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_X86_64)) + run_on_irqstack_cond(desc->handle_irq, desc, regs); + else + __handle_irq(desc, regs); +} /* * do_IRQ handles all normal device IRQ's (the special @@ -246,7 +255,7 @@ __visible void __irq_entry do_IRQ(struct pt_regs *regs, unsigned long vector) desc = __this_cpu_read(vector_irq[vector]); if (likely(!IS_ERR_OR_NULL(desc))) { if (IS_ENABLED(CONFIG_X86_32)) - handle_irq(desc, regs); + __handle_irq(desc, regs); else generic_handle_irq_desc(desc); } else { diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index a759ca97cd01cb..0b79efc87be527 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -148,7 +148,7 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } -void handle_irq(struct irq_desc *desc, struct pt_regs *regs) +void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { int overflow = check_stack_overflow(); From 0bf7c314ff68622468945a24ea2f7ebc1edf0a6b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:36 +0200 Subject: [PATCH 132/190] x86/entry: Add IRQENTRY_IRQ macro Provide a separate IDTENTRY macro for device interrupts. Similar to IDTENTRY_ERRORCODE with the addition of invoking irq_enter/exit_rcu() and providing the errorcode as a 'u8' argument to the C function, which truncates the sign-extended vector number.
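[ Note: usage sketch of the new macro pair, as used by the follow-up
  patches in this series:

	/* Header: emits the ASM stub via the idtentry_irq macro */
	DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, common_interrupt);

	/*
	 * C file: the macro provides 'regs' and the vector already
	 * truncated to u8, plus irq_enter/exit_rcu() around the body.
	 */
	DEFINE_IDTENTRY_IRQ(common_interrupt)
	{
		struct irq_desc *desc;

		desc = __this_cpu_read(vector_irq[vector]);
		/* ... dispatch to the device handler ... */
	}
]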
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202118.984573165@linutronix.de --- arch/x86/entry/entry_32.S | 14 ++++++++++ arch/x86/entry/entry_64.S | 14 ++++++++++ arch/x86/include/asm/idtentry.h | 49 +++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 40092c81dcb81f..ba2a70d7118a1e 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -751,6 +751,20 @@ SYM_CODE_START(\asmsym) SYM_CODE_END(\asmsym) .endm +.macro idtentry_irq vector cfunc + .p2align CONFIG_X86_L1_CACHE_SHIFT +SYM_CODE_START_LOCAL(asm_\cfunc) + ASM_CLAC + SAVE_ALL switch_stacks=1 + ENCODE_FRAME_POINTER + movl %esp, %eax + movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */ + movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */ + call \cfunc + jmp handle_exception_return +SYM_CODE_END(asm_\cfunc) +.endm + /* * Include the defines which emit the idt entries which are shared * shared between 32 and 64 bit. diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e7434cda9a381e..9162a073e524b1 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -527,6 +527,20 @@ _ASM_NOKPROBE(\asmsym) SYM_CODE_END(\asmsym) .endm +/* + * Interrupt entry/exit. + * + + The interrupt stubs push (vector) onto the stack, which is the error_code + * position of idtentry exceptions, and jump to one of the two idtentry points + * (common/spurious). + * + * common_interrupt is a hotpath, align it to a cache line + */ +.macro idtentry_irq vector cfunc + .p2align CONFIG_X86_L1_CACHE_SHIFT + idtentry \vector asm_\cfunc \cfunc has_error_code=1 +.endm + /* * MCE and DB exceptions */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 2fc0dc8af2a4c4..eaee48bd6a1958 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -165,6 +165,51 @@ __visible noinstr void func(struct pt_regs *regs) #define DEFINE_IDTENTRY_RAW_ERRORCODE(func) \ __visible noinstr void func(struct pt_regs *regs, unsigned long error_code) +/** + * DECLARE_IDTENTRY_IRQ - Declare functions for device interrupt IDT entry + * points (common/spurious) + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Maps to DECLARE_IDTENTRY_ERRORCODE() + */ +#define DECLARE_IDTENTRY_IRQ(vector, func) \ + DECLARE_IDTENTRY_ERRORCODE(vector, func) + +/** + * DEFINE_IDTENTRY_IRQ - Emit code for device interrupt IDT entry points + * @func: Function name of the entry point + * + * The vector number is pushed by the low level entry stub and handed + * to the function as error_code argument which needs to be truncated + * to an u8 because the push is sign extending. + * + * On 64-bit idtentry_enter/exit() are invoked in the ASM entry code before + * and after switching to the interrupt stack. On 32-bit this happens in C. + * + * irq_enter/exit_rcu() are invoked before the function body and the + * KVM L1D flush request is set. 
+ */ +#define DEFINE_IDTENTRY_IRQ(func) \ +static __always_inline void __##func(struct pt_regs *regs, u8 vector); \ + \ +__visible noinstr void func(struct pt_regs *regs, \ + unsigned long error_code) \ +{ \ + bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + \ + instrumentation_begin(); \ + irq_enter_rcu(); \ + kvm_set_cpu_l1tf_flush_l1d(); \ + __##func (regs, (u8)error_code); \ + irq_exit_rcu(); \ + lockdep_hardirq_exit(); \ + instrumentation_end(); \ + idtentry_exit_cond_rcu(regs, rcu_exit); \ +} \ + \ +static __always_inline void __##func(struct pt_regs *regs, u8 vector) + /** * DECLARE_IDTENTRY_XENCB - Declare functions for XEN HV callback entry point * @vector: Vector number (ignored for C) @@ -312,6 +357,10 @@ __visible noinstr void func(struct pt_regs *regs, \ #define DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func) \ DECLARE_IDTENTRY_ERRORCODE(vector, func) +/* Entries for common/spurious (device) interrupts */ +#define DECLARE_IDTENTRY_IRQ(vector, func) \ + idtentry_irq vector func + #ifdef CONFIG_X86_64 # define DECLARE_IDTENTRY_MCE(vector, func) \ idtentry_mce_db vector asm_##func func From fa5e5c409213265da8a188b4a5e4e641b1382eb4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:37 +0200 Subject: [PATCH 133/190] x86/entry: Use idtentry for interrupts Replace the extra interrupt handling code and reuse the existing idtentry machinery. This moves the irq stack switching on 64-bit from ASM to C code; 32-bit already does the stack switching in C. This requires removing HAVE_IRQ_EXIT_ON_IRQ_STACK as the stack switch is no longer in the low-level entry code. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.078690991@linutronix.de --- arch/x86/Kconfig | 1 - arch/x86/entry/entry_32.S | 31 ------------------------------- arch/x86/entry/entry_64.S | 31 +++----------------------------- arch/x86/include/asm/hw_irq.h | 1 - arch/x86/include/asm/idtentry.h | 10 ++++++++-- arch/x86/include/asm/traps.h | 1 - arch/x86/kernel/apic/apic.c | 23 ++++++++--------------- arch/x86/kernel/apic/msi.c | 3 ++- arch/x86/kernel/irq.c | 27 +++++++-------------------- 9 files changed, 28 insertions(+), 100 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 10dae8b96ed57d..a16c45460f1b2c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -181,7 +181,6 @@ config X86 select HAVE_HW_BREAKPOINT select HAVE_IDE select HAVE_IOREMAP_PROT - select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index ba2a70d7118a1e..b47b7b2238111f 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1229,37 +1229,6 @@ SYM_FUNC_END(entry_INT80_32) #endif .endm -#ifdef CONFIG_X86_LOCAL_APIC -SYM_CODE_START_LOCAL(common_spurious) - ASM_CLAC - SAVE_ALL switch_stacks=1 - ENCODE_FRAME_POINTER - TRACE_IRQS_OFF - movl %esp, %eax - movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */ - movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */ - call smp_spurious_interrupt - jmp ret_from_intr -SYM_CODE_END(common_spurious) -#endif - -/* - * the CPU automatically disables interrupts when executing an IRQ vector, - * so IRQ-flags tracing has to follow that: - */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -SYM_CODE_START_LOCAL(common_interrupt) - ASM_CLAC - SAVE_ALL switch_stacks=1 - ENCODE_FRAME_POINTER - TRACE_IRQS_OFF - movl %esp, %eax - movl PT_ORIG_EAX(%esp), 
%edx /* get the vector from stack */ - movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */ - call do_IRQ - jmp ret_from_intr -SYM_CODE_END(common_interrupt) - #define BUILD_INTERRUPT3(name, nr, fn) \ SYM_FUNC_START(name) \ ASM_CLAC; \ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9162a073e524b1..e54bcd3244f8a2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -737,32 +737,7 @@ SYM_CODE_START(interrupt_entry) SYM_CODE_END(interrupt_entry) _ASM_NOKPROBE(interrupt_entry) - -/* Interrupt entry/exit. */ - -/* - * The interrupt stubs push vector onto the stack and - * then jump to common_spurious/interrupt. - */ -SYM_CODE_START_LOCAL(common_spurious) - call interrupt_entry - UNWIND_HINT_REGS indirect=1 - movq ORIG_RAX(%rdi), %rsi /* get vector from stack */ - movq $-1, ORIG_RAX(%rdi) /* no syscall to restart */ - call smp_spurious_interrupt /* rdi points to pt_regs */ - jmp ret_from_intr -SYM_CODE_END(common_spurious) -_ASM_NOKPROBE(common_spurious) - -/* common_interrupt is a hotpath. Align it */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -SYM_CODE_START_LOCAL(common_interrupt) - call interrupt_entry - UNWIND_HINT_REGS indirect=1 - movq ORIG_RAX(%rdi), %rsi /* get vector from stack */ - movq $-1, ORIG_RAX(%rdi) /* no syscall to restart */ - call do_IRQ /* rdi points to pt_regs */ - /* 0(%rsp): old RSP */ +SYM_CODE_START_LOCAL(common_interrupt_return) ret_from_intr: DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF @@ -945,8 +920,8 @@ native_irq_return_ldt: */ jmp native_irq_return_iret #endif -SYM_CODE_END(common_interrupt) -_ASM_NOKPROBE(common_interrupt) +SYM_CODE_END(common_interrupt_return) +_ASM_NOKPROBE(common_interrupt_return) /* * APIC interrupts. diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 0ffe80792b2d3e..3213d36b92d337 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -38,7 +38,6 @@ extern asmlinkage void error_interrupt(void); extern asmlinkage void irq_work_interrupt(void); extern asmlinkage void uv_bau_message_intr1(void); -extern asmlinkage void spurious_interrupt(void); extern asmlinkage void spurious_apic_interrupt(void); extern asmlinkage void thermal_interrupt(void); extern asmlinkage void reschedule_interrupt(void); diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index eaee48bd6a1958..341888184d1c9e 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -417,7 +417,7 @@ SYM_CODE_START(irq_entries_start) .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) UNWIND_HINT_IRET_REGS .byte 0x6a, vector - jmp common_interrupt + jmp asm_common_interrupt nop /* Ensure that the above is 8 bytes max */ . = pos + 8 @@ -434,7 +434,7 @@ SYM_CODE_START(spurious_entries_start) .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) UNWIND_HINT_IRET_REGS .byte 0x6a, vector - jmp common_spurious + jmp asm_spurious_interrupt nop /* Ensure that the above is 8 bytes max */ . 
= pos + 8 @@ -506,6 +506,12 @@ DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault); DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback); #endif +/* Device interrupts common/spurious */ +DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, common_interrupt); +#ifdef CONFIG_X86_LOCAL_APIC +DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, spurious_interrupt); +#endif + #undef X86_TRAP_OTHER #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 32b2becf7806cc..97e6945bfce8e7 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -43,7 +43,6 @@ asmlinkage void smp_deferred_error_interrupt(struct pt_regs *regs); void smp_apic_timer_interrupt(struct pt_regs *regs); void smp_error_interrupt(struct pt_regs *regs); void smp_spurious_apic_interrupt(struct pt_regs *regs); -void smp_spurious_interrupt(struct pt_regs *regs, unsigned long vector); asmlinkage void smp_irq_move_cleanup_interrupt(void); #ifdef CONFIG_VMAP_STACK diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6c2b807a7eaead..b7bfd3a1abb763 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2121,9 +2121,9 @@ void __init register_lapic_address(unsigned long address) */ /** - * smp_spurious_interrupt - Catch all for interrupts raised on unused vectors + * spurious_interrupt - Catch all for interrupts raised on unused vectors * @regs: Pointer to pt_regs on stack - * @error_code: The vector number is in the lower 8 bits + * @vector: The vector number * * This is invoked from ASM entry code to catch all interrupts which * trigger on an entry which is routed to the common_spurious idtentry @@ -2131,18 +2131,10 @@ void __init register_lapic_address(unsigned long address) * * Also called from smp_spurious_apic_interrupt(). */ -__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs, - unsigned long vector) +DEFINE_IDTENTRY_IRQ(spurious_interrupt) { u32 v; - entering_irq(); - /* - * The push in the entry ASM code which stores the vector number on - * the stack in the error code slot is sign expanding. Just use the - * lower 8 bits. - */ - vector &= 0xFF; trace_spurious_apic_entry(vector); inc_irq_stat(irq_spurious_count); @@ -2163,21 +2155,22 @@ __visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs, */ v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1)); if (v & (1 << (vector & 0x1f))) { - pr_info("Spurious interrupt (vector 0x%02lx) on CPU#%d. Acked\n", + pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n", vector, smp_processor_id()); ack_APIC_irq(); } else { - pr_info("Spurious interrupt (vector 0x%02lx) on CPU#%d. Not pending!\n", + pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n", vector, smp_processor_id()); } out: trace_spurious_apic_exit(vector); - exiting_irq(); } __visible void smp_spurious_apic_interrupt(struct pt_regs *regs) { - smp_spurious_interrupt(regs, SPURIOUS_APIC_VECTOR); + entering_irq(); + __spurious_interrupt(regs, SPURIOUS_APIC_VECTOR); + exiting_irq(); } /* diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 159bd0cb854865..5cbaca58af950e 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -115,7 +115,8 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) * denote it as spurious which is no harm as this is a rare event * and interrupt handlers have to cope with spurious interrupts * anyway. 
If the vector is unused, then it is marked so it won't - trigger the 'No irq handler for vector' warning in do_IRQ(). + trigger the 'No irq handler for vector' warning in + common_interrupt(). * * This requires to hold vector lock to prevent concurrent updates to * the affected vector. diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 5495ea4debba3a..c449b843403630 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -20,6 +20,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -232,37 +233,25 @@ static __always_inline void handle_irq(struct irq_desc *desc, } /* - * do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). + * common_interrupt() handles all normal device IRQ's (the special SMP + * cross-CPU interrupts have their own entry points). */ -__visible void __irq_entry do_IRQ(struct pt_regs *regs, unsigned long vector) +DEFINE_IDTENTRY_IRQ(common_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; - entering_irq(); - /* - * The push in the entry ASM code which stores the vector number on - * the stack in the error code slot is sign expanding. Just use the - * lower 8 bits. - */ - vector &= 0xFF; - - /* entering_irq() tells RCU that we're not quiescent. Check it. */ + /* entry code tells RCU that we're not quiescent. Check it. */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); desc = __this_cpu_read(vector_irq[vector]); if (likely(!IS_ERR_OR_NULL(desc))) { - if (IS_ENABLED(CONFIG_X86_32)) - __handle_irq(desc, regs); - else - generic_handle_irq_desc(desc); + handle_irq(desc, regs); } else { ack_APIC_irq(); if (desc == VECTOR_UNUSED) { - pr_emerg_ratelimited("%s: %d.%lu No irq handler for vector\n", + pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", __func__, smp_processor_id(), vector); } else { @@ -270,8 +259,6 @@ __visible void __irq_entry do_IRQ(struct pt_regs *regs, unsigned long vector) } } - exiting_irq(); - set_irq_regs(old_regs); } From 6368558c37107bed35950cfbd994f49de07236dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:38 +0200 Subject: [PATCH 134/190] x86/entry: Provide IDTENTRY_SYSVEC Provide IDTENTRY variants for system vectors to consolidate the different mechanisms to emit the ASM stubs for 32- and 64-bit. On 64-bit this also moves the stack switching from ASM to C code. 32-bit will execute the system vectors w/o stack switching as before. The simple variant is meant for "empty" system vectors like scheduler IPI and KVM posted interrupt vectors. These do not need the full glory of irq enter/exit handling with softirq processing and more. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.185317067@linutronix.de --- arch/x86/entry/entry_32.S | 4 ++ arch/x86/entry/entry_64.S | 8 ++++ arch/x86/include/asm/idtentry.h | 79 +++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b47b7b2238111f..a8803aa3a07bfd 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -765,6 +765,10 @@ SYM_CODE_START_LOCAL(asm_\cfunc) SYM_CODE_END(asm_\cfunc) .endm +.macro idtentry_sysvec vector cfunc + idtentry \vector asm_\cfunc \cfunc has_error_code=0 +.endm + /* * Include the defines which emit the idt entries which are shared * shared between 32 and 64 bit. 
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e54bcd3244f8a2..9b7183dac202c9 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -541,6 +541,14 @@ SYM_CODE_END(\asmsym) idtentry \vector asm_\cfunc \cfunc has_error_code=1 .endm +/* + * System vectors which invoke their handlers directly and are not + * going through the regular common device interrupt handling code. + */ +.macro idtentry_sysvec vector cfunc + idtentry \vector asm_\cfunc \cfunc has_error_code=0 +.endm + /* * MCE and DB exceptions */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 341888184d1c9e..63f7b99703cf87 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -6,6 +6,9 @@ #include #ifndef __ASSEMBLY__ +#include + +#include void idtentry_enter_user(struct pt_regs *regs); void idtentry_exit_user(struct pt_regs *regs); @@ -210,6 +213,78 @@ __visible noinstr void func(struct pt_regs *regs, \ \ static __always_inline void __##func(struct pt_regs *regs, u8 vector) +/** + * DECLARE_IDTENTRY_SYSVEC - Declare functions for system vector entry points + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Declares three functions: + * - The ASM entry point: asm_##func + * - The XEN PV trap entry point: xen_##func (maybe unused) + * - The C handler called from the ASM entry point + * + * Maps to DECLARE_IDTENTRY(). + */ +#define DECLARE_IDTENTRY_SYSVEC(vector, func) \ + DECLARE_IDTENTRY(vector, func) + +/** + * DEFINE_IDTENTRY_SYSVEC - Emit code for system vector IDT entry points + * @func: Function name of the entry point + * + * idtentry_enter/exit() and irq_enter/exit_rcu() are invoked before the + * function body. KVM L1D flush request is set. + * + * Runs the function on the interrupt stack if the entry hit kernel mode + */ +#define DEFINE_IDTENTRY_SYSVEC(func) \ +static void __##func(struct pt_regs *regs); \ + \ +__visible noinstr void func(struct pt_regs *regs) \ +{ \ + bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + \ + instrumentation_begin(); \ + irq_enter_rcu(); \ + kvm_set_cpu_l1tf_flush_l1d(); \ + run_on_irqstack_cond(__##func, regs, regs); \ + irq_exit_rcu(); \ + lockdep_hardirq_exit(); \ + instrumentation_end(); \ + idtentry_exit_cond_rcu(regs, rcu_exit); \ +} \ + \ +static noinline void __##func(struct pt_regs *regs) + +/** + * DEFINE_IDTENTRY_SYSVEC_SIMPLE - Emit code for simple system vector IDT + * entry points + * @func: Function name of the entry point + * + * Runs the function on the interrupted stack. No switch to IRQ stack and + * only the minimal __irq_enter/exit() handling. + * + * Only use for 'empty' vectors like reschedule IPI and KVM posted + * interrupt vectors. 
+ */ +#define DEFINE_IDTENTRY_SYSVEC_SIMPLE(func) \ +static __always_inline void __##func(struct pt_regs *regs); \ + \ +__visible noinstr void func(struct pt_regs *regs) \ +{ \ + bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + \ + instrumentation_begin(); \ + __irq_enter_raw(); \ + kvm_set_cpu_l1tf_flush_l1d(); \ + __##func (regs); \ + __irq_exit_raw(); \ + instrumentation_end(); \ + idtentry_exit_cond_rcu(regs, rcu_exit); \ +} \ + \ +static __always_inline void __##func(struct pt_regs *regs) + /** * DECLARE_IDTENTRY_XENCB - Declare functions for XEN HV callback entry point * @vector: Vector number (ignored for C) @@ -361,6 +436,10 @@ __visible noinstr void func(struct pt_regs *regs, \ #define DECLARE_IDTENTRY_IRQ(vector, func) \ idtentry_irq vector func +/* System vector entries */ +#define DECLARE_IDTENTRY_SYSVEC(vector, func) \ + idtentry_sysvec vector func + #ifdef CONFIG_X86_64 # define DECLARE_IDTENTRY_MCE(vector, func) \ idtentry_mce_db vector asm_##func func From db0338eec5836eea3bd1b274212234d04bac2034 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:39 +0200 Subject: [PATCH 135/190] x86/entry: Convert APIC interrupts to IDTENTRY_SYSVEC Convert APIC interrupts to IDTENTRY_SYSVEC: - Implement the C entry point with DEFINE_IDTENTRY_SYSVEC - Emit the ASM stub with DECLARE_IDTENTRY_SYSVEC - Remove the ASM idtentries in 64-bit - Remove the BUILD_INTERRUPT entries in 32-bit - Remove the old prototypes No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.280728850@linutronix.de --- arch/x86/entry/entry_64.S | 6 ------ arch/x86/include/asm/entry_arch.h | 5 ----- arch/x86/include/asm/hw_irq.h | 4 ---- arch/x86/include/asm/idtentry.h | 8 ++++++++ arch/x86/include/asm/irq.h | 1 - arch/x86/include/asm/traps.h | 3 --- arch/x86/kernel/apic/apic.c | 23 +++++------------------ arch/x86/kernel/idt.c | 8 ++++---- arch/x86/kernel/irq.c | 5 ++--- 9 files changed, 19 insertions(+), 44 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9b7183dac202c9..25f71a0c9d3e44 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -965,9 +965,6 @@ apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt #endif -apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi - #ifdef CONFIG_HAVE_KVM apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi @@ -992,9 +989,6 @@ apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_i apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt #endif -apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt -apicinterrupt SPURIOUS_APIC_VECTOR spurious_apic_interrupt smp_spurious_apic_interrupt - #ifdef CONFIG_IRQ_WORK apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt #endif diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index cd57ce6134c912..d10d6d807e7362 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -33,11 +33,6 @@ BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR) */ #ifdef CONFIG_X86_LOCAL_APIC 
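[ Note: the conversion pattern, illustrated on the local timer vector
  (abbreviated sketch of the apic.c hunk below); the manual irq
  enter/exit bookkeeping moves into the wrapper emitted by
  DEFINE_IDTENTRY_SYSVEC:

  before:
	__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
	{
		entering_ack_irq();	/* irq_enter() plus APIC ACK by hand */
		local_apic_timer_interrupt();
		exiting_irq();		/* irq_exit() by hand */
	}

  after:
	DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt)
	{
		ack_APIC_irq();		/* only the ACK remains in the handler */
		local_apic_timer_interrupt();
	}

  The real hunk below additionally keeps the irq_regs and tracepoint
  handling; RCU, irq_enter/exit_rcu() and the IRQ stack switch come
  from the idtentry machinery. ]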
-BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) -BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) -BUILD_INTERRUPT(spurious_apic_interrupt,SPURIOUS_APIC_VECTOR) -BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) - #ifdef CONFIG_IRQ_WORK BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) #endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 3213d36b92d337..1765993360e77c 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -29,16 +29,12 @@ #include /* Interrupt handlers registered during init_IRQ */ -extern asmlinkage void apic_timer_interrupt(void); -extern asmlinkage void x86_platform_ipi(void); extern asmlinkage void kvm_posted_intr_ipi(void); extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); extern asmlinkage void kvm_posted_intr_nested_ipi(void); -extern asmlinkage void error_interrupt(void); extern asmlinkage void irq_work_interrupt(void); extern asmlinkage void uv_bau_message_intr1(void); -extern asmlinkage void spurious_apic_interrupt(void); extern asmlinkage void thermal_interrupt(void); extern asmlinkage void reschedule_interrupt(void); diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 63f7b99703cf87..b95f36276c6c53 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -591,6 +591,14 @@ DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, common_interrupt); DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, spurious_interrupt); #endif +/* System vector entry points */ +#ifdef CONFIG_X86_LOCAL_APIC +DECLARE_IDTENTRY_SYSVEC(ERROR_APIC_VECTOR, sysvec_error_interrupt); +DECLARE_IDTENTRY_SYSVEC(SPURIOUS_APIC_VECTOR, sysvec_spurious_apic_interrupt); +DECLARE_IDTENTRY_SYSVEC(LOCAL_TIMER_VECTOR, sysvec_apic_timer_interrupt); +DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI_VECTOR, sysvec_x86_platform_ipi); +#endif + #undef X86_TRAP_OTHER #endif diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 67aa1e2a5b4aeb..c7c43e86805a61 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -46,7 +46,6 @@ extern void __init init_IRQ(void); void arch_trigger_cpumask_backtrace(const struct cpumask *mask, bool exclude_self); -extern __visible void smp_x86_platform_ipi(struct pt_regs *regs); #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 97e6945bfce8e7..933934c3e17351 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -40,9 +40,6 @@ asmlinkage void smp_threshold_interrupt(struct pt_regs *regs); asmlinkage void smp_deferred_error_interrupt(struct pt_regs *regs); #endif -void smp_apic_timer_interrupt(struct pt_regs *regs); -void smp_error_interrupt(struct pt_regs *regs); -void smp_spurious_apic_interrupt(struct pt_regs *regs); asmlinkage void smp_irq_move_cleanup_interrupt(void); #ifdef CONFIG_VMAP_STACK diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b7bfd3a1abb763..9244377ed454b2 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1088,23 +1088,14 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ -__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); - /* - * NOTE! 
We'd better ACK the irq immediately, - * because timer handling can be slow. - * - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - entering_ack_irq(); + ack_APIC_irq(); trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); trace_local_timer_exit(LOCAL_TIMER_VECTOR); - exiting_irq(); set_irq_regs(old_regs); } @@ -2129,7 +2120,7 @@ void __init register_lapic_address(unsigned long address) * trigger on an entry which is routed to the common_spurious idtentry * point. * - * Also called from smp_spurious_apic_interrupt(). + * Also called from sysvec_spurious_apic_interrupt(). */ DEFINE_IDTENTRY_IRQ(spurious_interrupt) { @@ -2166,17 +2157,15 @@ DEFINE_IDTENTRY_IRQ(spurious_interrupt) trace_spurious_apic_exit(vector); } -__visible void smp_spurious_apic_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt) { - entering_irq(); __spurious_interrupt(regs, SPURIOUS_APIC_VECTOR); - exiting_irq(); } /* * This interrupt should never happen with our APIC/SMP architecture */ -__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt) { static const char * const error_interrupt_reason[] = { "Send CS error", /* APIC Error Bit 0 */ @@ -2190,7 +2179,6 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) }; u32 v, i = 0; - entering_irq(); trace_error_apic_entry(ERROR_APIC_VECTOR); /* First tickle the hardware, only then report what went on. -- REW */ @@ -2214,7 +2202,6 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT "\n"); trace_error_apic_exit(ERROR_APIC_VECTOR); - exiting_irq(); } /** diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 20408e31c18de3..93c1b27f40f41f 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -132,8 +132,8 @@ static const __initconst struct idt_data apic_idts[] = { #endif #ifdef CONFIG_X86_LOCAL_APIC - INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt), - INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), + INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), + INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), # ifdef CONFIG_HAVE_KVM INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), @@ -145,8 +145,8 @@ static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_X86_UV INTG(UV_BAU_MESSAGE, uv_bau_message_intr1), #endif - INTG(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt), - INTG(ERROR_APIC_VECTOR, error_interrupt), + INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), + INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), #endif }; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c449b843403630..7e3005274f8327 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -268,17 +268,16 @@ void (*x86_platform_ipi_callback)(void) = NULL; /* * Handler for X86_PLATFORM_IPI_VECTOR. 
*/ -__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) { struct pt_regs *old_regs = set_irq_regs(regs); - entering_ack_irq(); + ack_APIC_irq(); trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) x86_platform_ipi_callback(); trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); - exiting_irq(); set_irq_regs(old_regs); } #endif From 582f9191231b994582ad5349a7b06b3255c926fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:40 +0200 Subject: [PATCH 136/190] x86/entry: Convert SMP system vectors to IDTENTRY_SYSVEC Convert SMP system vectors to IDTENTRY_SYSVEC: - Implement the C entry point with DEFINE_IDTENTRY_SYSVEC - Emit the ASM stub with DECLARE_IDTENTRY_SYSVEC - Remove the ASM idtentries in 64-bit - Remove the BUILD_INTERRUPT entries in 32-bit - Remove the old prototypes No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.372234635@linutronix.de --- arch/x86/entry/entry_64.S | 7 ------- arch/x86/include/asm/entry_arch.h | 4 ---- arch/x86/include/asm/hw_irq.h | 5 ----- arch/x86/include/asm/idtentry.h | 7 +++++++ arch/x86/include/asm/traps.h | 2 -- arch/x86/kernel/apic/vector.c | 5 ++--- arch/x86/kernel/idt.c | 10 +++++----- arch/x86/kernel/smp.c | 18 +++++++----------- 8 files changed, 21 insertions(+), 37 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 25f71a0c9d3e44..f3ccb27c028ea1 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -956,11 +956,6 @@ apicinterrupt3 \num \sym \do_sym POP_SECTION_IRQENTRY .endm -#ifdef CONFIG_SMP -apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt -#endif - #ifdef CONFIG_X86_UV apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt #endif @@ -984,8 +979,6 @@ apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt #endif #ifdef CONFIG_SMP -apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt -apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt #endif diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index d10d6d807e7362..2e2055bcfeb2e0 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -12,10 +12,6 @@ */ #ifdef CONFIG_SMP BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) -BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) -BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) -BUILD_INTERRUPT(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR) -BUILD_INTERRUPT(reboot_interrupt, REBOOT_VECTOR) #endif #ifdef CONFIG_HAVE_KVM diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 1765993360e77c..36a38695f27f2a 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -38,14 +38,9 @@ extern asmlinkage void uv_bau_message_intr1(void); extern asmlinkage void thermal_interrupt(void); extern asmlinkage void reschedule_interrupt(void); -extern asmlinkage void irq_move_cleanup_interrupt(void); -extern asmlinkage void reboot_interrupt(void); extern asmlinkage void 
threshold_interrupt(void); extern asmlinkage void deferred_error_interrupt(void); -extern asmlinkage void call_function_interrupt(void); -extern asmlinkage void call_function_single_interrupt(void); - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; struct pci_dev; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index b95f36276c6c53..b44f4ac22af6ff 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -599,6 +599,13 @@ DECLARE_IDTENTRY_SYSVEC(LOCAL_TIMER_VECTOR, sysvec_apic_timer_interrupt); DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI_VECTOR, sysvec_x86_platform_ipi); #endif +#ifdef CONFIG_SMP +DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR, sysvec_irq_move_cleanup); +DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot); +DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single); +DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function); +#endif + #undef X86_TRAP_OTHER #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 933934c3e17351..0c40f37f8cb7df 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -40,8 +40,6 @@ asmlinkage void smp_threshold_interrupt(struct pt_regs *regs); asmlinkage void smp_deferred_error_interrupt(struct pt_regs *regs); #endif -asmlinkage void smp_irq_move_cleanup_interrupt(void); - #ifdef CONFIG_VMAP_STACK void __noreturn handle_stack_overflow(const char *message, struct pt_regs *regs, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 67768e54438bde..c48be6e1f6764c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -861,13 +861,13 @@ static void free_moved_vector(struct apic_chip_data *apicd) apicd->move_in_progress = 0; } -asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) +DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup) { struct hlist_head *clhead = this_cpu_ptr(&cleanup_list); struct apic_chip_data *apicd; struct hlist_node *tmp; - entering_ack_irq(); + ack_APIC_irq(); /* Prevent vectors vanishing under us */ raw_spin_lock(&vector_lock); @@ -892,7 +892,6 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) } raw_spin_unlock(&vector_lock); - exiting_irq(); } static void __send_cleanup_vector(struct apic_chip_data *apicd) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 93c1b27f40f41f..018a5424b57450 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -112,11 +112,11 @@ static const __initconst struct idt_data def_idts[] = { */ static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_SMP - INTG(RESCHEDULE_VECTOR, reschedule_interrupt), - INTG(CALL_FUNCTION_VECTOR, call_function_interrupt), - INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt), - INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt), - INTG(REBOOT_VECTOR, reboot_interrupt), + INTG(RESCHEDULE_VECTOR, reschedule_interrupt), + INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function), + INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single), + INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup), + INTG(REBOOT_VECTOR, asm_sysvec_reboot), #endif #ifdef CONFIG_X86_THERMAL_VECTOR diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index b8d4e9c3c070c0..e5647daa7e9687 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -130,13 +131,11 @@ static int 
smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) /* * this function calls the 'stop' function on all other CPUs in the system. */ - -asmlinkage __visible void smp_reboot_interrupt(void) +DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { - ipi_entering_ack_irq(); + ack_APIC_irq(); cpu_emergency_vmxoff(); stop_this_cpu(NULL); - irq_exit(); } static int register_stop_handler(void) @@ -227,7 +226,6 @@ __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); - kvm_set_cpu_l1tf_flush_l1d(); if (trace_resched_ipi_enabled()) { /* @@ -244,24 +242,22 @@ __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) scheduler_ipi(); } -__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) { - ipi_entering_ack_irq(); + ack_APIC_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_interrupt(); trace_call_function_exit(CALL_FUNCTION_VECTOR); - exiting_irq(); } -__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r) +DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single) { - ipi_entering_ack_irq(); + ack_APIC_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); - exiting_irq(); } static int __init nonmi_ipi_setup(char *str) From 720909a7abd351535bfb485a0ecce03c2e4467e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:41 +0200 Subject: [PATCH 137/190] x86/entry: Convert various system vectors Convert various system vectors to IDTENTRY_SYSVEC: - Implement the C entry point with DEFINE_IDTENTRY_SYSVEC - Emit the ASM stub with DECLARE_IDTENTRY_SYSVEC - Remove the ASM idtentries in 64-bit - Remove the BUILD_INTERRUPT entries in 32-bit - Remove the old prototypes No functional change. 
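[ Note: each converted vector pairs a SYSVEC declaration with the
  matching asm_sysvec_* IDT registration, e.g. for the thermal vector
  (both hunks appear below):

	/* arch/x86/include/asm/idtentry.h */
	DECLARE_IDTENTRY_SYSVEC(THERMAL_APIC_VECTOR, sysvec_thermal);

	/* arch/x86/kernel/idt.c */
	INTG(THERMAL_APIC_VECTOR, asm_sysvec_thermal),
]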
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.464812973@linutronix.de --- arch/x86/entry/entry_64.S | 19 ------------------ arch/x86/include/asm/apic.h | 13 ------------- arch/x86/include/asm/entry_arch.h | 25 ------------------------ arch/x86/include/asm/hw_irq.h | 6 ------ arch/x86/include/asm/idtentry.h | 22 +++++++++++++++++++++ arch/x86/include/asm/irq_work.h | 1 - arch/x86/include/asm/traps.h | 5 ----- arch/x86/include/asm/uv/uv_bau.h | 8 ++------ arch/x86/kernel/cpu/mce/amd.c | 5 ++--- arch/x86/kernel/cpu/mce/therm_throt.c | 5 ++--- arch/x86/kernel/cpu/mce/threshold.c | 5 ++--- arch/x86/kernel/idt.c | 28 +++++++++++++-------------- arch/x86/kernel/irq_work.c | 6 +++--- arch/x86/platform/uv/tlb_uv.c | 2 +- 14 files changed, 48 insertions(+), 102 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f3ccb27c028ea1..2301f62d08e6dd 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -956,9 +956,6 @@ apicinterrupt3 \num \sym \do_sym POP_SECTION_IRQENTRY .endm -#ifdef CONFIG_X86_UV -apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt -#endif #ifdef CONFIG_HAVE_KVM apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi @@ -966,26 +963,10 @@ apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_post apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi #endif -#ifdef CONFIG_X86_MCE_THRESHOLD -apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt -#endif - -#ifdef CONFIG_X86_MCE_AMD -apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt -#endif - -#ifdef CONFIG_X86_THERMAL_VECTOR -apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt -#endif - #ifdef CONFIG_SMP apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt #endif -#ifdef CONFIG_IRQ_WORK -apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt -#endif - /* * Reload gs selector with exception handling * edi: new selector diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 19e94af9cc5d76..a5416865b6fa77 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -534,24 +534,11 @@ static inline void entering_ack_irq(void) ack_APIC_irq(); } -static inline void ipi_entering_ack_irq(void) -{ - irq_enter(); - ack_APIC_irq(); - kvm_set_cpu_l1tf_flush_l1d(); -} - static inline void exiting_irq(void) { irq_exit(); } -static inline void exiting_ack_irq(void) -{ - ack_APIC_irq(); - irq_exit(); -} - extern void ioapic_zap_locks(void); #endif /* _ASM_X86_APIC_H */ diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 2e2055bcfeb2e0..69a5320a46735e 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -20,28 +20,3 @@ BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR) BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR) #endif -/* - * every pentium local APIC has two 'local interrupts', with a - * soft-definable vector attached to both interrupts, one of - * which is a timer interrupt, the other one is error counter - * overflow. 
Linux uses the local APIC timer interrupt to get - * a much simpler SMP time architecture: - */ -#ifdef CONFIG_X86_LOCAL_APIC - -#ifdef CONFIG_IRQ_WORK -BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) -#endif - -#ifdef CONFIG_X86_THERMAL_VECTOR -BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) -#endif - -#ifdef CONFIG_X86_MCE_THRESHOLD -BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) -#endif - -#ifdef CONFIG_X86_MCE_AMD -BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR) -#endif -#endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 36a38695f27f2a..7281c7e3a0f6ec 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -32,15 +32,9 @@ extern asmlinkage void kvm_posted_intr_ipi(void); extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); extern asmlinkage void kvm_posted_intr_nested_ipi(void); -extern asmlinkage void irq_work_interrupt(void); -extern asmlinkage void uv_bau_message_intr1(void); -extern asmlinkage void thermal_interrupt(void); extern asmlinkage void reschedule_interrupt(void); -extern asmlinkage void threshold_interrupt(void); -extern asmlinkage void deferred_error_interrupt(void); - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; struct pci_dev; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index b44f4ac22af6ff..cd752e6cf5af04 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -606,6 +606,28 @@ DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function); #endif +#ifdef CONFIG_X86_LOCAL_APIC +# ifdef CONFIG_X86_UV +DECLARE_IDTENTRY_SYSVEC(UV_BAU_MESSAGE, sysvec_uv_bau_message); +# endif + +# ifdef CONFIG_X86_MCE_THRESHOLD +DECLARE_IDTENTRY_SYSVEC(THRESHOLD_APIC_VECTOR, sysvec_threshold); +# endif + +# ifdef CONFIG_X86_MCE_AMD +DECLARE_IDTENTRY_SYSVEC(DEFERRED_ERROR_VECTOR, sysvec_deferred_error); +# endif + +# ifdef CONFIG_X86_THERMAL_VECTOR +DECLARE_IDTENTRY_SYSVEC(THERMAL_APIC_VECTOR, sysvec_thermal); +# endif + +# ifdef CONFIG_IRQ_WORK +DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); +# endif +#endif + #undef X86_TRAP_OTHER #endif diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index 80b35e3adf039c..800ffce0db29e3 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -10,7 +10,6 @@ static inline bool arch_irq_work_has_interrupt(void) return boot_cpu_has(X86_FEATURE_APIC); } extern void arch_irq_work_raise(void); -extern __visible void smp_irq_work_interrupt(struct pt_regs *regs); #else static inline bool arch_irq_work_has_interrupt(void) { diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0c40f37f8cb7df..714b1a30e7b09a 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -34,11 +34,6 @@ static inline int get_si_code(unsigned long condition) extern int panic_on_unrecovered_nmi; void math_emulate(struct math_emu_info *); -#ifndef CONFIG_X86_32 -asmlinkage void smp_thermal_interrupt(struct pt_regs *regs); -asmlinkage void smp_threshold_interrupt(struct pt_regs *regs); -asmlinkage void smp_deferred_error_interrupt(struct pt_regs *regs); -#endif #ifdef CONFIG_VMAP_STACK void __noreturn handle_stack_overflow(const char *message, diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 13687bf0e0a941..f1188bd4765804 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ 
b/arch/x86/include/asm/uv/uv_bau.h @@ -12,6 +12,8 @@ #define _ASM_X86_UV_UV_BAU_H #include +#include + #define BITSPERBYTE 8 /* @@ -799,12 +801,6 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits) bitmap_zero(&dstp->bits, nbits); } -extern void uv_bau_message_intr1(void); -#ifdef CONFIG_TRACING -#define trace_uv_bau_message_intr1 uv_bau_message_intr1 -#endif -extern void uv_bau_timeout_intr1(void); - struct atomic_short { short counter; }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 52de616a806559..a906d68a18a2b1 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -907,14 +907,13 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) mce_log(&m); } -asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) { - entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); inc_irq_stat(irq_deferred_error_count); deferred_error_int_vector(); trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); - exiting_ack_irq(); + ack_APIC_irq(); } /* diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c index f36dc07420851b..a7cd2d203ceda6 100644 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ b/arch/x86/kernel/cpu/mce/therm_throt.c @@ -614,14 +614,13 @@ static void unexpected_thermal_interrupt(void) static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; -asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) { - entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); trace_thermal_apic_exit(THERMAL_APIC_VECTOR); - exiting_ack_irq(); + ack_APIC_irq(); } /* Thermal monitoring depends on APIC, ACPI and clock modulation */ diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index 28812cc15300d9..6a059a035021d8 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -21,12 +21,11 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -asmlinkage __visible void __irq_entry smp_threshold_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) { - entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); - exiting_ack_irq(); + ack_APIC_irq(); } diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 018a5424b57450..3d811d058f2eec 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -120,33 +120,33 @@ static const __initconst struct idt_data apic_idts[] = { #endif #ifdef CONFIG_X86_THERMAL_VECTOR - INTG(THERMAL_APIC_VECTOR, thermal_interrupt), + INTG(THERMAL_APIC_VECTOR, asm_sysvec_thermal), #endif #ifdef CONFIG_X86_MCE_THRESHOLD - INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt), + INTG(THRESHOLD_APIC_VECTOR, asm_sysvec_threshold), #endif #ifdef CONFIG_X86_MCE_AMD - INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt), + INTG(DEFERRED_ERROR_VECTOR, asm_sysvec_deferred_error), #endif #ifdef CONFIG_X86_LOCAL_APIC - INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), - INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), + INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), + INTG(X86_PLATFORM_IPI_VECTOR, 
asm_sysvec_x86_platform_ipi), # ifdef CONFIG_HAVE_KVM - INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), - INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), - INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), + INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), + INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), + INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), # endif # ifdef CONFIG_IRQ_WORK - INTG(IRQ_WORK_VECTOR, irq_work_interrupt), + INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), # endif -#ifdef CONFIG_X86_UV - INTG(UV_BAU_MESSAGE, uv_bau_message_intr1), -#endif - INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), - INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), +# ifdef CONFIG_X86_UV + INTG(UV_BAU_MESSAGE, asm_sysvec_uv_bau_message), +# endif + INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), + INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), #endif }; diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 80bee7695a2040..890d4778cd352c 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -9,18 +9,18 @@ #include #include #include +#include <asm/idtentry.h> #include #include #ifdef CONFIG_X86_LOCAL_APIC -__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work) { - ipi_entering_ack_irq(); + ack_APIC_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); inc_irq_stat(apic_irq_work_irqs); irq_work_run(); trace_irq_work_exit(IRQ_WORK_VECTOR); - exiting_irq(); } void arch_irq_work_raise(void) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 4ea69690c3e49a..0ac96ca304c7b5 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1272,7 +1272,7 @@ static void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) * (the resource will not be freed until noninterruptable cpus see this * interrupt; hardware may timeout the s/w ack and reply ERROR) */ -void uv_bau_message_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_uv_bau_message) { int count = 0; cycles_t time_start; From 9c3b1f4975c46fc2932fd6d53e63c14f0ddf985f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:42 +0200 Subject: [PATCH 138/190] x86/entry: Convert KVM vectors to IDTENTRY_SYSVEC* Convert KVM-specific system vectors to IDTENTRY_SYSVEC*: The two empty stub handlers which only increment the stats counter do not need to run on the interrupt stack. Use IDTENTRY_SYSVEC_SIMPLE for them. The wakeup handler does more work and runs on the interrupt stack. None of these handlers need to save and restore the irq_regs pointer. 
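For illustration, the split between the two macro variants used in this patch looks like the following sketch (the sysvec_example_* names are made up; the macros, stat counters and helpers are the ones used throughout this series):

    /* Empty stub: only counts the event, no interrupt stack needed */
    DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_example_ipi)
    {
            ack_APIC_irq();
            inc_irq_stat(kvm_posted_intr_ipis);
    }

    /* Does real work, so it runs on the interrupt stack */
    DEFINE_IDTENTRY_SYSVEC(sysvec_example_wakeup_ipi)
    {
            ack_APIC_irq();
            inc_irq_stat(kvm_posted_intr_wakeup_ipis);
            kvm_posted_intr_wakeup_handler();
    }

Since none of the handler bodies consult irq_regs, the set_irq_regs() save/restore can simply be dropped, as the irq.c hunk below shows.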
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Paolo Bonzini Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.555715519@linutronix.de --- arch/x86/entry/entry_64.S | 7 ------- arch/x86/include/asm/entry_arch.h | 7 ------- arch/x86/include/asm/hw_irq.h | 4 ---- arch/x86/include/asm/idtentry.h | 6 ++++++ arch/x86/include/asm/irq.h | 3 --- arch/x86/kernel/idt.c | 6 +++--- arch/x86/kernel/irq.c | 24 ++++++------------------ 7 files changed, 15 insertions(+), 42 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2301f62d08e6dd..ea1f2930876c4c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -956,13 +956,6 @@ apicinterrupt3 \num \sym \do_sym POP_SECTION_IRQENTRY .endm - -#ifdef CONFIG_HAVE_KVM -apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi -apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi -apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi -#endif - #ifdef CONFIG_SMP apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt #endif diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 69a5320a46735e..a01bb74244ac17 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -13,10 +13,3 @@ #ifdef CONFIG_SMP BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) #endif - -#ifdef CONFIG_HAVE_KVM -BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR) -BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR) -BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR) -#endif - diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 7281c7e3a0f6ec..fd5e7c8825e18e 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -29,10 +29,6 @@ #include /* Interrupt handlers registered during init_IRQ */ -extern asmlinkage void kvm_posted_intr_ipi(void); -extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); -extern asmlinkage void kvm_posted_intr_nested_ipi(void); - extern asmlinkage void reschedule_interrupt(void); #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index cd752e6cf5af04..7f7c80bd272c69 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -628,6 +628,12 @@ DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); # endif #endif +#ifdef CONFIG_HAVE_KVM +DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi); +DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi); +DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi); +#endif + #undef X86_TRAP_OTHER #endif diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index c7c43e86805a61..f73dd3f8b043f4 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -26,9 +26,6 @@ extern void fixup_irqs(void); #ifdef CONFIG_HAVE_KVM extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); -extern __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs); -extern __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs); -extern __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs); #endif extern void (*x86_platform_ipi_callback)(void); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 
3d811d058f2eec..faaadd430882d7 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -135,9 +135,9 @@ static const __initconst struct idt_data apic_idts[] = { INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), # ifdef CONFIG_HAVE_KVM - INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), - INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), - INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), + INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi), + INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), + INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi), # endif # ifdef CONFIG_IRQ_WORK INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 7e3005274f8327..181060247e3cb7 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -298,41 +298,29 @@ EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); /* * Handler for POSTED_INTERRUPT_VECTOR. */ -__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi) { - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(kvm_posted_intr_ipis); - exiting_irq(); - set_irq_regs(old_regs); } /* * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. */ -__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi) { - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(kvm_posted_intr_wakeup_ipis); kvm_posted_intr_wakeup_handler(); - exiting_irq(); - set_irq_regs(old_regs); } /* * Handler for POSTED_INTERRUPT_NESTED_VECTOR. */ -__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) { - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(kvm_posted_intr_nested_ipis); - exiting_irq(); - set_irq_regs(old_regs); } #endif From a16be368dd3fb695077cc9bc59c988b548955eec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:43 +0200 Subject: [PATCH 139/190] x86/entry: Convert various hypervisor vectors to IDTENTRY_SYSVEC Convert various hypervisor vectors to IDTENTRY_SYSVEC: - Implement the C entry point with DEFINE_IDTENTRY_SYSVEC - Emit the ASM stub with DECLARE_IDTENTRY_SYSVEC - Remove the ASM idtentries in 64-bit - Remove the BUILD_INTERRUPT entries in 32-bit - Remove the old prototypes No functional change. 
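Each conversion in this patch follows the same three-step pattern. A sketch with a hypothetical callback (sysvec_example_callback is a made-up name; everything else is the machinery introduced earlier in this series):

    /* 1) Declare once: the C prototype plus the asm_sysvec_* ASM stub */
    DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_example_callback);

    /* 2) Define the C entry point; entry/exit glue comes from the macro */
    DEFINE_IDTENTRY_SYSVEC(sysvec_example_callback)
    {
            ack_APIC_irq();
            inc_irq_stat(irq_hv_callback_count);
    }

    /* 3) Register the emitted ASM stub instead of the open-coded vector */
    alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_example_callback);

Once all callers are converted, the entering_irq()/exiting_irq() helpers in apic.h lose their last users and can be removed, as the hunks below show.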
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Reviewed-by: Wei Liu Link: https://lore.kernel.org/r/20200521202119.647997594@linutronix.de --- arch/x86/entry/entry_32.S | 14 -------------- arch/x86/entry/entry_64.S | 17 ----------------- arch/x86/hyperv/hv_init.c | 9 +++------ arch/x86/include/asm/acrn.h | 11 ----------- arch/x86/include/asm/apic.h | 20 -------------------- arch/x86/include/asm/idtentry.h | 10 ++++++++++ arch/x86/include/asm/mshyperv.h | 13 ------------- arch/x86/kernel/cpu/acrn.c | 9 ++++----- arch/x86/kernel/cpu/mshyperv.c | 22 ++++++++++------------ 9 files changed, 27 insertions(+), 98 deletions(-) delete mode 100644 arch/x86/include/asm/acrn.h diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index a8803aa3a07bfd..bb9ebd2df05e89 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1345,20 +1345,6 @@ BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, xen_evtchn_do_upcall) #endif - -#if IS_ENABLED(CONFIG_HYPERV) - -BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - hyperv_vector_handler) - -BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR, - hyperv_reenlightenment_intr) - -BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, - hv_stimer0_vector_handler) - -#endif /* CONFIG_HYPERV */ - SYM_CODE_START_LOCAL_NOALIGN(handle_exception) /* the function address is in %gs's slot on the stack */ SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ea1f2930876c4c..b97fcda28019e3 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1116,23 +1116,6 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ xen_hvm_callback_vector xen_evtchn_do_upcall #endif - -#if IS_ENABLED(CONFIG_HYPERV) -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - hyperv_callback_vector hyperv_vector_handler - -apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \ - hyperv_reenlightenment_vector hyperv_reenlightenment_intr - -apicinterrupt3 HYPERV_STIMER0_VECTOR \ - hv_stimer0_callback_vector hv_stimer0_vector_handler -#endif /* CONFIG_HYPERV */ - -#if IS_ENABLED(CONFIG_ACRN_GUEST) -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - acrn_hv_callback_vector acrn_hv_vector_handler -#endif - /* * Save all registers in pt_regs, and switch gs if needed. * Use slow, but surefire "are we in kernel?" check. 
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index e2137070386a58..a54c6a401581dd 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -152,15 +153,11 @@ static inline bool hv_reenlightenment_available(void) ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT; } -__visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_reenlightenment) { - entering_ack_irq(); - + ack_APIC_irq(); inc_irq_stat(irq_hv_reenlightenment_count); - schedule_delayed_work(&hv_reenlightenment_work, HZ/10); - - exiting_irq(); } void set_hv_tscchange_cb(void (*cb)(void)) diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h deleted file mode 100644 index 4adb13f08af71c..00000000000000 --- a/arch/x86/include/asm/acrn.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_ACRN_H -#define _ASM_X86_ACRN_H - -extern void acrn_hv_callback_vector(void); -#ifdef CONFIG_TRACING -#define trace_acrn_hv_callback_vector acrn_hv_callback_vector -#endif - -extern void acrn_hv_vector_handler(struct pt_regs *regs); -#endif /* _ASM_X86_ACRN_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a5416865b6fa77..2cc44e957c318f 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -519,26 +519,6 @@ static inline bool apic_id_is_primary_thread(unsigned int id) { return false; } static inline void apic_smt_update(void) { } #endif -extern void irq_enter(void); -extern void irq_exit(void); - -static inline void entering_irq(void) -{ - irq_enter(); - kvm_set_cpu_l1tf_flush_l1d(); -} - -static inline void entering_ack_irq(void) -{ - entering_irq(); - ack_APIC_irq(); -} - -static inline void exiting_irq(void) -{ - irq_exit(); -} - extern void ioapic_zap_locks(void); #endif /* _ASM_X86_APIC_H */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 7f7c80bd272c69..1b6d3ea1fc96bd 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -634,6 +634,16 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi); #endif +#if IS_ENABLED(CONFIG_HYPERV) +DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); +DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); +DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_STIMER0_VECTOR, sysvec_hyperv_stimer0); +#endif + +#if IS_ENABLED(CONFIG_ACRN_GUEST) +DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback); +#endif + #undef X86_TRAP_OTHER #endif diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index d30805ed323ef9..60b944dd2df1a6 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -54,20 +54,8 @@ typedef int (*hyperv_fill_flush_list_func)( vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); #define hv_get_raw_timer() rdtsc_ordered() -void hyperv_callback_vector(void); -void hyperv_reenlightenment_vector(void); -#ifdef CONFIG_TRACING -#define trace_hyperv_callback_vector hyperv_callback_vector -#endif void hyperv_vector_handler(struct pt_regs *regs); -/* - * Routines for stimer0 Direct Mode handling. - * On x86/x64, there are no percpu actions to take. 
- */ -void hv_stimer0_vector_handler(struct pt_regs *regs); -void hv_stimer0_callback_vector(void); - static inline void hv_enable_stimer0_percpu_irq(int irq) {} static inline void hv_disable_stimer0_percpu_irq(int irq) {} @@ -226,7 +214,6 @@ void hyperv_setup_mmu_ops(void); void *hv_alloc_hyperv_page(void); void *hv_alloc_hyperv_zeroed_page(void); void hv_free_hyperv_page(unsigned long addr); -void hyperv_reenlightenment_intr(struct pt_regs *regs); void set_hv_tscchange_cb(void (*cb)(void)); void clear_hv_tscchange_cb(void); void hyperv_stop_tsc_emulation(void); diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index 676022e7179195..1da9b1c9a2dbf9 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -10,10 +10,10 @@ */ #include -#include #include #include #include +#include #include static uint32_t __init acrn_detect(void) @@ -24,7 +24,7 @@ static uint32_t __init acrn_detect(void) static void __init acrn_init_platform(void) { /* Setup the IDT for ACRN hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, acrn_hv_callback_vector); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); } static bool acrn_x2apic_available(void) @@ -39,7 +39,7 @@ static bool acrn_x2apic_available(void) static void (*acrn_intr_handler)(void); -__visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -50,13 +50,12 @@ __visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs) * will block the interrupt whose vector is lower than * HYPERVISOR_CALLBACK_VECTOR. */ - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(irq_hv_callback_count); if (acrn_intr_handler) acrn_intr_handler(); - exiting_irq(); set_irq_regs(old_regs); } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index ebf34c7bc8bc0d..af94f05a5c668a 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -40,11 +41,10 @@ static void (*hv_stimer0_handler)(void); static void (*hv_kexec_handler)(void); static void (*hv_crash_handler)(struct pt_regs *regs); -__visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) { struct pt_regs *old_regs = set_irq_regs(regs); - entering_irq(); inc_irq_stat(irq_hv_callback_count); if (vmbus_handler) vmbus_handler(); @@ -52,7 +52,6 @@ __visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs) if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) ack_APIC_irq(); - exiting_irq(); set_irq_regs(old_regs); } @@ -73,19 +72,16 @@ EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); * Routines to do per-architecture handling of stimer0 * interrupts when in Direct Mode */ - -__visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) { struct pt_regs *old_regs = set_irq_regs(regs); - entering_irq(); inc_irq_stat(hyperv_stimer0_count); if (hv_stimer0_handler) hv_stimer0_handler(); add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0); ack_APIC_irq(); - exiting_irq(); set_irq_regs(old_regs); } @@ -331,17 +327,19 @@ static void __init ms_hyperv_init_platform(void) x86_platform.apic_post_init = hyperv_init; hyperv_setup_mmu_ops(); /* Setup the IDT for hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); + 
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); /* Setup the IDT for reenlightenment notifications */ - if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) + if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) { alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, - hyperv_reenlightenment_vector); + asm_sysvec_hyperv_reenlightenment); + } /* Setup the IDT for stimer0 */ - if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) + if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) { alloc_intr_gate(HYPERV_STIMER0_VECTOR, - hv_stimer0_callback_vector); + asm_sysvec_hyperv_stimer0); + } # ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; From cb09ea2924cbf1a42da59bd30a59cc1836240bcb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:44 +0200 Subject: [PATCH 140/190] x86/entry: Convert XEN hypercall vector to IDTENTRY_SYSVEC Convert the last oldstyle defined vector to IDTENTRY_SYSVEC: - Implement the C entry point with DEFINE_IDTENTRY_SYSVEC - Emit the ASM stub with DECLARE_IDTENTRY_SYSVEC - Remove the ASM idtentries in 64-bit - Remove the BUILD_INTERRUPT entries in 32-bit - Remove the old prototypes Fixup the related XEN code by providing the primary C entry point in x86 to avoid cluttering the generic code with X86'isms. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.741950104@linutronix.de --- arch/x86/entry/entry_32.S | 5 ----- arch/x86/entry/entry_64.S | 5 ----- arch/x86/include/asm/idtentry.h | 4 ++++ arch/x86/xen/enlighten_hvm.c | 12 ++++++++++++ drivers/xen/events/events_base.c | 6 ++---- include/xen/events.h | 7 ------- 6 files changed, 18 insertions(+), 21 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index bb9ebd2df05e89..f8e8aeb10ba4b6 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1340,11 +1340,6 @@ SYM_FUNC_START(xen_failsafe_callback) SYM_FUNC_END(xen_failsafe_callback) #endif /* CONFIG_XEN_PV */ -#ifdef CONFIG_XEN_PVHVM -BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - xen_evtchn_do_upcall) -#endif - SYM_CODE_START_LOCAL_NOALIGN(handle_exception) /* the function address is in %gs's slot on the stack */ SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b97fcda28019e3..fd7efb8dededb3 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1111,11 +1111,6 @@ SYM_CODE_START(xen_failsafe_callback) SYM_CODE_END(xen_failsafe_callback) #endif /* CONFIG_XEN_PV */ -#ifdef CONFIG_XEN_PVHVM -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - xen_hvm_callback_vector xen_evtchn_do_upcall -#endif - /* * Save all registers in pt_regs, and switch gs if needed. * Use slow, but surefire "are we in kernel?" check. 
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 1b6d3ea1fc96bd..71cf82bf24ba01 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -644,6 +644,10 @@ DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_STIMER0_VECTOR, sysvec_hyperv_stimer0); DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback); #endif +#ifdef CONFIG_XEN_PVHVM +DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback); +#endif + #undef X86_TRAP_OTHER #endif diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index e138f7de52d200..3e89b0067ff0ca 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -118,6 +119,17 @@ static void __init init_hvm_pv_info(void) this_cpu_write(xen_vcpu_id, smp_processor_id()); } +DEFINE_IDTENTRY_SYSVEC(sysvec_xen_hvm_callback) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + inc_irq_stat(irq_hv_callback_count); + + xen_hvm_evtchn_do_upcall(); + + set_irq_regs(old_regs); +} + #ifdef CONFIG_KEXEC_CORE static void xen_hvm_shutdown(void) { diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index eb35c3cda9a6fd..140c7bf33a9899 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -37,6 +37,7 @@ #ifdef CONFIG_X86 #include #include +#include #include #include #include @@ -1236,9 +1237,6 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); irq_enter(); -#ifdef CONFIG_X86 - inc_irq_stat(irq_hv_callback_count); -#endif __xen_evtchn_do_upcall(); @@ -1658,7 +1656,7 @@ static __init void xen_alloc_callback_vector(void) return; pr_info("Xen HVM callback vector for event delivery is enabled\n"); - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, xen_hvm_callback_vector); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_xen_hvm_callback); } #else void xen_setup_callback_vector(void) {} diff --git a/include/xen/events.h b/include/xen/events.h index 12b0dcb6a12018..df1e6391f63ff6 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -90,13 +90,6 @@ unsigned int irq_from_evtchn(evtchn_port_t evtchn); int irq_from_virq(unsigned int cpu, unsigned int virq); evtchn_port_t evtchn_from_irq(unsigned irq); -#ifdef CONFIG_XEN_PVHVM -/* Xen HVM evtchn vector callback */ -void xen_hvm_callback_vector(void); -#ifdef CONFIG_TRACING -#define trace_xen_hvm_callback_vector xen_hvm_callback_vector -#endif -#endif int xen_set_callback_via(uint64_t via); void xen_evtchn_do_upcall(struct pt_regs *regs); void xen_hvm_evtchn_do_upcall(void); From 13cad9851ef1d004640991d45227dd35c08f45fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:45 +0200 Subject: [PATCH 141/190] x86/entry: Convert reschedule interrupt to IDTENTRY_SYSVEC_SIMPLE The scheduler IPI does not need the full interrupt entry handling logic when the entry is from kernel mode. Use IDTENTRY_SYSVEC_SIMPLE and spare all the overhead. 
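The conversion also retires the reschedule-specific tracepoint plumbing: the old handler had to wrap its trace events in a static-key-guarded irq_enter()/irq_exit() pair, whereas the IDTENTRY_SYSVEC_SIMPLE entry code is assumed to establish a valid tracing context before the body runs. The handler then collapses to the shape shown in the smp.c hunk below, roughly:

    DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi)
    {
            ack_APIC_irq();
            trace_reschedule_entry(RESCHEDULE_VECTOR); /* plain event, no static key */
            inc_irq_stat(irq_resched_count);
            scheduler_ipi();
            trace_reschedule_exit(RESCHEDULE_VECTOR);
    }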
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.835425642@linutronix.de --- arch/x86/entry/entry_64.S | 4 ---- arch/x86/include/asm/entry_arch.h | 3 --- arch/x86/include/asm/hw_irq.h | 3 --- arch/x86/include/asm/idtentry.h | 1 + arch/x86/include/asm/trace/common.h | 4 ---- arch/x86/include/asm/trace/irq_vectors.h | 17 +---------------- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/smp.c | 19 ++++--------------- arch/x86/kernel/tracepoint.c | 17 ----------------- 9 files changed, 7 insertions(+), 63 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index fd7efb8dededb3..9c0722e725ef5a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -956,10 +956,6 @@ apicinterrupt3 \num \sym \do_sym POP_SECTION_IRQENTRY .endm -#ifdef CONFIG_SMP -apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt -#endif - /* * Reload gs selector with exception handling * edi: new selector diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index a01bb74244ac17..3e841ed5c17a40 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -10,6 +10,3 @@ * is no hardware IRQ pin equivalent for them, they are triggered * through the ICC by us (IPIs) */ -#ifdef CONFIG_SMP -BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) -#endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index fd5e7c8825e18e..74c12437401eb2 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -28,9 +28,6 @@ #include #include -/* Interrupt handlers registered during init_IRQ */ -extern asmlinkage void reschedule_interrupt(void); - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; struct pci_dev; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 71cf82bf24ba01..38b672ded40bd1 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -600,6 +600,7 @@ DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI_VECTOR, sysvec_x86_platform_ipi); #endif #ifdef CONFIG_SMP +DECLARE_IDTENTRY_SYSVEC(RESCHEDULE_VECTOR, sysvec_reschedule_ipi); DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR, sysvec_irq_move_cleanup); DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single); diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h index 57c8da027d9923..f0f9bcdb74d9ca 100644 --- a/arch/x86/include/asm/trace/common.h +++ b/arch/x86/include/asm/trace/common.h @@ -5,12 +5,8 @@ DECLARE_STATIC_KEY_FALSE(trace_pagefault_key); #define trace_pagefault_enabled() \ static_branch_unlikely(&trace_pagefault_key) -DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key); -#define trace_resched_ipi_enabled() \ - static_branch_unlikely(&trace_resched_ipi_key) #else static inline bool trace_pagefault_enabled(void) { return false; } -static inline bool trace_resched_ipi_enabled(void) { return false; } #endif #endif diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 33b9d0f0aafe59..88e7f0f3bf6275 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -10,9 +10,6 @@ #ifdef CONFIG_X86_LOCAL_APIC -extern int trace_resched_ipi_reg(void); -extern void trace_resched_ipi_unreg(void); - DECLARE_EVENT_CLASS(x86_irq_vector, TP_PROTO(int vector), @@ -37,18 +34,6 @@ 
DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ TP_PROTO(int vector), \ TP_ARGS(vector), NULL, NULL); -#define DEFINE_RESCHED_IPI_EVENT(name) \ -DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ - TP_PROTO(int vector), \ - TP_ARGS(vector), \ - trace_resched_ipi_reg, \ - trace_resched_ipi_unreg); \ -DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ - TP_PROTO(int vector), \ - TP_ARGS(vector), \ - trace_resched_ipi_reg, \ - trace_resched_ipi_unreg); - /* * local_timer - called when entering/exiting a local timer interrupt * vector handler @@ -99,7 +84,7 @@ TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0); /* * reschedule - called when entering/exiting a reschedule vector handler */ -DEFINE_RESCHED_IPI_EVENT(reschedule); +DEFINE_IRQ_VECTOR_EVENT(reschedule); /* * call_function - called when entering/exiting a call function interrupt diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index faaadd430882d7..bc9b0d1d7bb8ac 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -112,7 +112,7 @@ static const __initconst struct idt_data def_idts[] = { */ static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_SMP - INTG(RESCHEDULE_VECTOR, reschedule_interrupt), + INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi), INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function), INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single), INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup), diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index e5647daa7e9687..eff4ce3b10da71 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -220,26 +220,15 @@ static void native_stop_other_cpus(int wait) /* * Reschedule call back. KVM uses this interrupt to force a cpu out of - * guest mode + * guest mode. */ -__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi) { ack_APIC_irq(); + trace_reschedule_entry(RESCHEDULE_VECTOR); inc_irq_stat(irq_resched_count); - - if (trace_resched_ipi_enabled()) { - /* - * scheduler_ipi() might call irq_enter() as well, but - * nested calls are fine. - */ - irq_enter(); - trace_reschedule_entry(RESCHEDULE_VECTOR); - scheduler_ipi(); - trace_reschedule_exit(RESCHEDULE_VECTOR); - irq_exit(); - return; - } scheduler_ipi(); + trace_reschedule_exit(RESCHEDULE_VECTOR); } DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 496748ed266a97..fcfc077afe2dcf 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -25,20 +25,3 @@ void trace_pagefault_unreg(void) { static_branch_dec(&trace_pagefault_key); } - -#ifdef CONFIG_SMP - -DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key); - -int trace_resched_ipi_reg(void) -{ - static_branch_inc(&trace_resched_ipi_key); - return 0; -} - -void trace_resched_ipi_unreg(void) -{ - static_branch_dec(&trace_resched_ipi_key); -} - -#endif From 75da04f7f3cb416a68475e040175dc013da32de2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:46 +0200 Subject: [PATCH 142/190] x86/entry: Remove the apic/BUILD interrupt leftovers Remove all the code which was there to emit the system vector stubs. All users are gone. Move the now unused GET_CR2_INTO macro muck to head_64.S where the last user is. Fixup the eye hurting comment there while at it. 
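For reference, the macro being relocated merely selects between the paravirt and the native way of reading CR2; its definition is visible verbatim in the head_64.S hunk below, and a use site in the early exception path would look roughly like this sketch (the %rdi destination is illustrative, not quoted from the patch):

    #ifdef CONFIG_PARAVIRT_XXL
    #define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
    #else
    #define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
    #endif

    /* early fault path: fetch the faulting address */
    GET_CR2_INTO(%rdi)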
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.927433002@linutronix.de --- arch/x86/entry/calling.h | 20 ----- arch/x86/entry/entry_32.S | 18 ---- arch/x86/entry/entry_64.S | 143 ------------------------------ arch/x86/include/asm/entry_arch.h | 12 --- arch/x86/kernel/head_64.S | 7 +- 5 files changed, 4 insertions(+), 196 deletions(-) delete mode 100644 arch/x86/include/asm/entry_arch.h diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 98da0d3c0b1a76..4208c1e3f601a1 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -351,23 +351,3 @@ For 32-bit we have the following conventions - kernel is built with call stackleak_erase #endif .endm - -/* - * This does 'call enter_from_user_mode' unless we can avoid it based on - * kernel config or using the static jump infrastructure. - */ -.macro CALL_enter_from_user_mode -#ifdef CONFIG_CONTEXT_TRACKING -#ifdef CONFIG_JUMP_LABEL - STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_key, def=0 -#endif - call enter_from_user_mode -.Lafter_call_\@: -#endif -.endm - -#ifdef CONFIG_PARAVIRT_XXL -#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg -#else -#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg -#endif diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index f8e8aeb10ba4b6..c8f176c88a3cce 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1233,24 +1233,6 @@ SYM_FUNC_END(entry_INT80_32) #endif .endm -#define BUILD_INTERRUPT3(name, nr, fn) \ -SYM_FUNC_START(name) \ - ASM_CLAC; \ - pushl $~(nr); \ - SAVE_ALL switch_stacks=1; \ - ENCODE_FRAME_POINTER; \ - TRACE_IRQS_OFF \ - movl %esp, %eax; \ - call fn; \ - jmp ret_from_intr; \ -SYM_FUNC_END(name) - -#define BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(name, nr, smp_##name); \ - -/* The include is where all of the SMP etc. interrupts come from */ -#include - #ifdef CONFIG_PARAVIRT SYM_CODE_START(native_iret) iret diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9c0722e725ef5a..389f97faee456c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -658,108 +658,7 @@ SYM_CODE_END(\asmsym) */ #include -/* - * Interrupt entry helper function. - * - * Entry runs with interrupts off. Stack layout at entry: - * +----------------------------------------------------+ - * | regs->ss | - * | regs->rsp | - * | regs->eflags | - * | regs->cs | - * | regs->ip | - * +----------------------------------------------------+ - * | regs->orig_ax = ~(interrupt number) | - * +----------------------------------------------------+ - * | return address | - * +----------------------------------------------------+ - */ -SYM_CODE_START(interrupt_entry) - UNWIND_HINT_IRET_REGS offset=16 - ASM_CLAC - cld - - testb $3, CS-ORIG_RAX+8(%rsp) - jz 1f - SWAPGS - FENCE_SWAPGS_USER_ENTRY - /* - * Switch to the thread stack. The IRET frame and orig_ax are - * on the stack, as well as the return address. RDI..R12 are - * not (yet) on the stack and space has not (yet) been - * allocated for them. - */ - pushq %rdi - - /* Need to switch before accessing the thread stack. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi - movq %rsp, %rdi - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - - /* - * We have RDI, return address, and orig_ax on the stack on - * top of the IRET frame. 
That means offset=24 - */ - UNWIND_HINT_IRET_REGS base=%rdi offset=24 - - pushq 7*8(%rdi) /* regs->ss */ - pushq 6*8(%rdi) /* regs->rsp */ - pushq 5*8(%rdi) /* regs->eflags */ - pushq 4*8(%rdi) /* regs->cs */ - pushq 3*8(%rdi) /* regs->ip */ - UNWIND_HINT_IRET_REGS - pushq 2*8(%rdi) /* regs->orig_ax */ - pushq 8(%rdi) /* return address */ - - movq (%rdi), %rdi - jmp 2f -1: - FENCE_SWAPGS_KERNEL_ENTRY -2: - PUSH_AND_CLEAR_REGS save_ret=1 - ENCODE_FRAME_POINTER 8 - - testb $3, CS+8(%rsp) - jz 1f - - /* - * IRQ from user mode. - * - * We need to tell lockdep that IRQs are off. We can't do this until - * we fix gsbase, and we should do it before enter_from_user_mode - * (which can take locks). Since TRACE_IRQS_OFF is idempotent, - * the simplest way to handle it is to just call it twice if - * we enter from user mode. There's no reason to optimize this since - * TRACE_IRQS_OFF is a no-op if lockdep is off. - */ - TRACE_IRQS_OFF - - CALL_enter_from_user_mode - -1: - ENTER_IRQ_STACK old_rsp=%rdi save_ret=1 - /* We entered an interrupt context - irqs are off: */ - TRACE_IRQS_OFF - - ret -SYM_CODE_END(interrupt_entry) -_ASM_NOKPROBE(interrupt_entry) - SYM_CODE_START_LOCAL(common_interrupt_return) -ret_from_intr: - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - - LEAVE_IRQ_STACK - - testb $3, CS(%rsp) - jz retint_kernel - - /* Interrupt came from user space */ -.Lretint_user: - mov %rsp,%rdi - call prepare_exit_to_usermode - SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates user mode. */ @@ -802,23 +701,6 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) INTERRUPT_RETURN -/* Returning to kernel space */ -retint_kernel: -#ifdef CONFIG_PREEMPTION - /* Interrupts are off */ - /* Check if we need preemption */ - btl $9, EFLAGS(%rsp) /* were interrupts off? */ - jnc 1f - cmpl $0, PER_CPU_VAR(__preempt_count) - jnz 1f - call preempt_schedule_irq -1: -#endif - /* - * The iretq could re-enable interrupts: - */ - TRACE_IRQS_IRETQ - SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates kernel mode. */ @@ -931,31 +813,6 @@ native_irq_return_ldt: SYM_CODE_END(common_interrupt_return) _ASM_NOKPROBE(common_interrupt_return) -/* - * APIC interrupts. - */ -.macro apicinterrupt3 num sym do_sym -SYM_CODE_START(\sym) - UNWIND_HINT_IRET_REGS - pushq $~(\num) - call interrupt_entry - UNWIND_HINT_REGS indirect=1 - call \do_sym /* rdi points to pt_regs */ - jmp ret_from_intr -SYM_CODE_END(\sym) -_ASM_NOKPROBE(\sym) -.endm - -/* Make sure APIC interrupt handlers end up in the irqentry section: */ -#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" -#define POP_SECTION_IRQENTRY .popsection - -.macro apicinterrupt num sym do_sym -PUSH_SECTION_IRQENTRY -apicinterrupt3 \num \sym \do_sym -POP_SECTION_IRQENTRY -.endm - /* * Reload gs selector with exception handling * edi: new selector diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h deleted file mode 100644 index 3e841ed5c17a40..00000000000000 --- a/arch/x86/include/asm/entry_arch.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * This file is designed to contain the BUILD_INTERRUPT specifications for - * all of the extra named interrupt vectors used by the architecture. 
- * Usually this is the Inter Process Interrupts (IPIs) - */ - -/* - * The following vectors are part of the Linux architecture, there - * is no hardware IRQ pin equivalent for them, they are triggered - * through the ICC by us (IPIs) - */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 4fc33fdf0f1616..16da4ac015971c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -29,15 +29,16 @@ #ifdef CONFIG_PARAVIRT_XXL #include #include +#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg #else #define INTERRUPT_RETURN iretq +#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg #endif -/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE +/* + * We are not able to switch in one step to the final KERNEL ADDRESS SPACE * because we need identity-mapped pages. - * */ - #define l4_index(x) (((x) >> 39) & 511) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) From e3e5c64ea1f5a81ace6984e7abbdd369ab631c93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:47 +0200 Subject: [PATCH 143/190] x86/entry/64: Remove IRQ stack switching ASM No more users. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202120.021462159@linutronix.de --- arch/x86/entry/entry_64.S | 96 --------------------------------------- 1 file changed, 96 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 389f97faee456c..29a8a83a6f5fdb 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -370,102 +370,6 @@ SYM_CODE_END(ret_from_fork) #endif .endm -/* - * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers - * flags and puts old RSP into old_rsp, and leaves all other GPRs alone. - * Requires kernel GSBASE. - * - * The invariant is that, if irq_count != -1, then the IRQ stack is in use. - */ -.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0 - DEBUG_ENTRY_ASSERT_IRQS_OFF - - .if \save_ret - /* - * If save_ret is set, the original stack contains one additional - * entry -- the return address. Therefore, move the address one - * entry below %rsp to \old_rsp. - */ - leaq 8(%rsp), \old_rsp - .else - movq %rsp, \old_rsp - .endif - - .if \regs - UNWIND_HINT_REGS base=\old_rsp - .endif - - incl PER_CPU_VAR(irq_count) - jnz .Lirq_stack_push_old_rsp_\@ - - /* - * Right now, if we just incremented irq_count to zero, we've - * claimed the IRQ stack but we haven't switched to it yet. - * - * If anything is added that can interrupt us here without using IST, - * it must be *extremely* careful to limit its stack usage. This - * could include kprobes and a hypothetical future IST-less #DB - * handler. - * - * The OOPS unwinder relies on the word at the top of the IRQ - * stack linking back to the previous RSP for the entire time we're - * on the IRQ stack. For this to work reliably, we need to write - * it before we actually move ourselves to the IRQ stack. - */ - - movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8) - movq PER_CPU_VAR(hardirq_stack_ptr), %rsp - -#ifdef CONFIG_DEBUG_ENTRY - /* - * If the first movq above becomes wrong due to IRQ stack layout - * changes, the only way we'll notice is if we try to unwind right - * here. Assert that we set up the stack right to catch this type - * of bug quickly. 
- */ - cmpq -8(%rsp), \old_rsp - je .Lirq_stack_okay\@ - ud2 - .Lirq_stack_okay\@: -#endif - -.Lirq_stack_push_old_rsp_\@: - pushq \old_rsp - - .if \regs - UNWIND_HINT_REGS indirect=1 - .endif - - .if \save_ret - /* - * Push the return address to the stack. This return address can - * be found at the "real" original RSP, which was offset by 8 at - * the beginning of this macro. - */ - pushq -8(\old_rsp) - .endif -.endm - -/* - * Undoes ENTER_IRQ_STACK. - */ -.macro LEAVE_IRQ_STACK regs=1 - DEBUG_ENTRY_ASSERT_IRQS_OFF - /* We need to be off the IRQ stack before decrementing irq_count. */ - popq %rsp - - .if \regs - UNWIND_HINT_REGS - .endif - - /* - * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming - * the irq stack but we're not on it. - */ - - decl PER_CPU_VAR(irq_count) -.endm - /** * idtentry_body - Macro to emit code calling the C function * @cfunc: C function to be called From 3b6c9bf69ef34c5ca36c78aad4ff76b9d9afc92c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:48 +0200 Subject: [PATCH 144/190] x86/entry: Make enter_from_user_mode() static The ASM users are gone. All callers are local. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202120.129232680@linutronix.de --- arch/x86/entry/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index a0f8c3cb130a38..b0b1c3cf0e6e5c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -56,7 +56,7 @@ * 2) Invoke context tracking if enabled to reactivate RCU * 3) Trace interrupts off state */ -__visible noinstr void enter_from_user_mode(void) +static noinstr void enter_from_user_mode(void) { enum ctx_state state = ct_state(); From fa95a0cb0423661eb615b6ac1e9882ce9a75719a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:49 +0200 Subject: [PATCH 145/190] x86/entry/32: Remove redundant irq disable code All exceptions/interrupts return with interrupts disabled now. No point in doing this in ASM again. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202120.221223450@linutronix.de --- arch/x86/entry/entry_32.S | 76 --------------------------------------- 1 file changed, 76 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index c8f176c88a3cce..2d29f77a360122 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -51,34 +51,6 @@ .section .entry.text, "ax" -/* - * We use macros for low-level operations which need to be overridden - * for paravirtualization. The following will never clobber any registers: - * INTERRUPT_RETURN (aka. "iret") - * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). - * - * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must - * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). - * Allowing a register to be clobbered can shrink the paravirt replacement - * enough to patch inline, increasing performance. - */ - -#ifdef CONFIG_PREEMPTION -# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF -#else -# define preempt_stop(clobbers) -#endif - -.macro TRACE_IRQS_IRET -#ifdef CONFIG_TRACE_IRQFLAGS - testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off? 
- jz 1f - TRACE_IRQS_ON -1: -#endif -.endm - #define PTI_SWITCH_MASK (1 << PAGE_SHIFT) /* @@ -881,38 +853,6 @@ SYM_CODE_START(ret_from_fork) SYM_CODE_END(ret_from_fork) .popsection -/* - * Return to user mode is not as complex as all this looks, - * but we want the default path for a system call return to - * go as quickly as possible which is why some of this is - * less clear than it otherwise should be. - */ - - # userspace resumption stub bypassing syscall exit tracing -SYM_CODE_START_LOCAL(ret_from_exception) - preempt_stop(CLBR_ANY) -ret_from_intr: -#ifdef CONFIG_VM86 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax -#else - /* - * We can be coming here from child spawned by kernel_thread(). - */ - movl PT_CS(%esp), %eax - andl $SEGMENT_RPL_MASK, %eax -#endif - cmpl $USER_RPL, %eax - jb restore_all_kernel # not returning to v8086 or userspace - - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl %esp, %eax - call prepare_exit_to_usermode - jmp restore_all_switch_stack -SYM_CODE_END(ret_from_exception) - SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE) /* * All code from here through __end_SYSENTER_singlestep_region is subject @@ -1147,22 +1087,6 @@ restore_all_switch_stack: */ INTERRUPT_RETURN -restore_all_kernel: -#ifdef CONFIG_PREEMPTION - DISABLE_INTERRUPTS(CLBR_ANY) - cmpl $0, PER_CPU_VAR(__preempt_count) - jnz .Lno_preempt - testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz .Lno_preempt - call preempt_schedule_irq -.Lno_preempt: -#endif - TRACE_IRQS_IRET - PARANOID_EXIT_TO_KERNEL_MODE - BUG_IF_WRONG_CR3 - RESTORE_REGS 4 - jmp .Lirq_return - .section .fixup, "ax" SYM_CODE_START(asm_iret_error) pushl $0 # no error code From 9628f26baef262a49d877e3785e8b88d241bc064 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:50 +0200 Subject: [PATCH 146/190] x86/entry/64: Remove TRACE_IRQS_*_DEBUG Since INT3/#BP no longer runs on an IST, this workaround is no longer required. Tested by running lockdep+ftrace as described in the initial commit: 5963e317b1e9 ("ftrace/x86: Do not change stacks in DEBUG when calling lockdep") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Steven Rostedt (VMware) Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202120.319418546@linutronix.de --- arch/x86/entry/entry_64.S | 48 +++------------------------------------ 1 file changed, 3 insertions(+), 45 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 29a8a83a6f5fdb..fb7f1262834657 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -67,44 +67,6 @@ SYM_CODE_END(native_usergs_sysret64) TRACE_IRQS_FLAGS EFLAGS(%rsp) .endm -/* - * When dynamic function tracer is enabled it will add a breakpoint - * to all locations that it is about to modify, sync CPUs, update - * all the code, sync CPUs, then remove the breakpoints. In this time - * if lockdep is enabled, it might jump back into the debug handler - * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). - * - * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to - * make sure the stack pointer does not get reset back to the top - * of the debug stack, and instead just reuses the current stack. 
- */ -#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) - -.macro TRACE_IRQS_OFF_DEBUG - call debug_stack_set_zero - TRACE_IRQS_OFF - call debug_stack_reset -.endm - -.macro TRACE_IRQS_ON_DEBUG - call debug_stack_set_zero - TRACE_IRQS_ON - call debug_stack_reset -.endm - -.macro TRACE_IRQS_IRETQ_DEBUG - btl $9, EFLAGS(%rsp) /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON_DEBUG -1: -.endm - -#else -# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF -# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON -# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ -#endif - /* * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * @@ -500,11 +462,7 @@ SYM_CODE_START(\asmsym) UNWIND_HINT_REGS - .if \vector == X86_TRAP_DB - TRACE_IRQS_OFF_DEBUG - .else - TRACE_IRQS_OFF - .endif + TRACE_IRQS_OFF movq %rsp, %rdi /* pt_regs pointer */ @@ -924,7 +882,7 @@ SYM_CODE_END(paranoid_entry) SYM_CODE_START_LOCAL(paranoid_exit) UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF_DEBUG + TRACE_IRQS_OFF testl %ebx, %ebx /* swapgs needed? */ jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ @@ -933,7 +891,7 @@ SYM_CODE_START_LOCAL(paranoid_exit) SWAPGS_UNSAFE_STACK jmp restore_regs_and_return_to_kernel .Lparanoid_exit_no_swapgs: - TRACE_IRQS_IRETQ_DEBUG + TRACE_IRQS_IRETQ /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 jmp restore_regs_and_return_to_kernel From 3ffdfdcec1bae39b68b990762350b3cd3127f23f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:51 +0200 Subject: [PATCH 147/190] x86/entry: Move paranoid irq tracing out of ASM code The last step to remove the irq tracing cruft from ASM. Ignore #DF as the machine is going to die anyway. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202120.414043330@linutronix.de --- arch/x86/entry/entry_64.S | 13 ------------- arch/x86/kernel/cpu/mce/core.c | 3 +++ arch/x86/kernel/nmi.c | 3 +++ arch/x86/kernel/traps.c | 11 +++++++++++ 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index fb7f1262834657..2566554fe04e71 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -16,7 +16,6 @@ * * Some macro usage: * - SYM_FUNC_START/END:Define functions in the symbol table. - * - TRACE_IRQ_*: Trace hardirq state for lock debugging. * - idtentry: Define exception entry points. */ #include @@ -107,11 +106,6 @@ SYM_CODE_START(entry_SYSCALL_64) UNWIND_HINT_EMPTY - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ swapgs /* tss.sp2 is scratch space. */ @@ -462,8 +456,6 @@ SYM_CODE_START(\asmsym) UNWIND_HINT_REGS - TRACE_IRQS_OFF - movq %rsp, %rdi /* pt_regs pointer */ .if \vector == X86_TRAP_DB @@ -881,17 +873,13 @@ SYM_CODE_END(paranoid_entry) */ SYM_CODE_START_LOCAL(paranoid_exit) UNWIND_HINT_REGS - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF testl %ebx, %ebx /* swapgs needed? 
*/ jnz .Lparanoid_exit_no_swapgs - TRACE_IRQS_IRETQ /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK jmp restore_regs_and_return_to_kernel .Lparanoid_exit_no_swapgs: - TRACE_IRQS_IRETQ /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 jmp restore_regs_and_return_to_kernel @@ -1292,7 +1280,6 @@ end_repeat_nmi: call paranoid_entry UNWIND_HINT_REGS - /* paranoidentry exc_nmi(), 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi movq $-1, %rsi call exc_nmi diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index c47f004f623178..068e6cab1286c0 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1922,7 +1922,10 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) * that out because it's an indirect call. Annotate it. */ instrumentation_begin(); + trace_hardirqs_off_prepare(); machine_check_vector(regs); + if (regs->flags & X86_EFLAGS_IF) + trace_hardirqs_on_prepare(); instrumentation_end(); nmi_exit(); } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 3052c78f03aa27..5df4e7f5836961 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -330,6 +330,7 @@ static noinstr void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); instrumentation_begin(); + trace_hardirqs_off_prepare(); handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); @@ -416,6 +417,8 @@ static noinstr void default_do_nmi(struct pt_regs *regs) unknown_nmi_error(reason, regs); out: + if (regs->flags & X86_EFLAGS_IF) + trace_hardirqs_on_prepare(); instrumentation_end(); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f28be3e51cca4f..50fb9cd5be97b0 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -634,8 +634,11 @@ DEFINE_IDTENTRY_RAW(exc_int3) } else { nmi_enter(); instrumentation_begin(); + trace_hardirqs_off_prepare(); if (!do_int3(regs)) die("int3", regs, 0); + if (regs->flags & X86_EFLAGS_IF) + trace_hardirqs_on_prepare(); instrumentation_end(); nmi_exit(); } @@ -850,6 +853,10 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { nmi_enter(); + instrumentation_begin(); + trace_hardirqs_off_prepare(); + instrumentation_end(); + /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." Clear TIF_BLOCKSTEP to keep @@ -871,6 +878,10 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, if (dr6) handle_debug(regs, dr6, false); + instrumentation_begin(); + if (regs->flags & X86_EFLAGS_IF) + trace_hardirqs_on_prepare(); + instrumentation_end(); nmi_exit(); } From 320100a5ffe5ec781ec3dc190a57ce5e32885855 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:52 +0200 Subject: [PATCH 148/190] x86/entry: Remove the TRACE_IRQS cruft No more users. 
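With the thunks gone, hardirq state tracing for the remaining special entry paths is done entirely from C. Condensing the traps.c/nmi.c hunks of the previous patch, the pattern is (a sketch, not new code):

    nmi_enter();
    instrumentation_begin();
    trace_hardirqs_off_prepare();           /* was TRACE_IRQS_OFF in ASM */

    /* ... handle the exception ... */

    if (regs->flags & X86_EFLAGS_IF)
            trace_hardirqs_on_prepare();    /* was TRACE_IRQS_IRETQ */
    instrumentation_end();
    nmi_exit();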
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202120.523289762@linutronix.de --- arch/x86/entry/entry_64.S | 13 ------------- arch/x86/entry/thunk_64.S | 9 +-------- arch/x86/include/asm/irqflags.h | 10 ---------- 3 files changed, 1 insertion(+), 31 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2566554fe04e71..265ff97b396171 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -53,19 +53,6 @@ SYM_CODE_START(native_usergs_sysret64) SYM_CODE_END(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ -.macro TRACE_IRQS_FLAGS flags:req -#ifdef CONFIG_TRACE_IRQFLAGS - btl $9, \flags /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON -1: -#endif -.endm - -.macro TRACE_IRQS_IRETQ - TRACE_IRQS_FLAGS EFLAGS(%rsp) -.endm - /* * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index 34f980c9b766f5..ccd32877a3c414 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -3,7 +3,6 @@ * Save registers before calling assembly functions. This avoids * disturbance of register allocation in some inline assembly constructs. * Copyright 2001,2002 by Andi Kleen, SuSE Labs. - * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. */ #include #include "calling.h" @@ -37,11 +36,6 @@ SYM_FUNC_END(\name) _ASM_NOKPROBE(\name) .endm -#ifdef CONFIG_TRACE_IRQFLAGS - THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 - THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 -#endif - #ifdef CONFIG_PREEMPTION THUNK preempt_schedule_thunk, preempt_schedule THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace @@ -49,8 +43,7 @@ SYM_FUNC_END(\name) EXPORT_SYMBOL(preempt_schedule_notrace_thunk) #endif -#if defined(CONFIG_TRACE_IRQFLAGS) \ - || defined(CONFIG_PREEMPTION) +#ifdef CONFIG_PREEMPTION SYM_CODE_START_LOCAL_NOALIGN(.L_restore) popq %r11 popq %r10 diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index e00f064b009ecc..8ddff8dbaed5ab 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -172,14 +172,4 @@ static inline int arch_irqs_disabled(void) } #endif /* !__ASSEMBLY__ */ -#ifdef __ASSEMBLY__ -#ifdef CONFIG_TRACE_IRQFLAGS -# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; -# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; -#else -# define TRACE_IRQS_ON -# define TRACE_IRQS_OFF -#endif -#endif /* __ASSEMBLY__ */ - #endif From 998c2034c6a36bd48284e8c8f945a6c6ffc0e3f0 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 20 May 2020 18:16:00 +0200 Subject: [PATCH 149/190] xen: Move xen_setup_callback_vector() definition to include/xen/hvm.h Kbuild test robot reports the following problem on ARM: warning: no previous prototype for 'xen_setup_callback_vector' [-Wmissing-prototypes] 1664 | void xen_setup_callback_vector(void) {} | ^~~~~~~~~~~~~~~~~~~~~~~~~ The problem is that xen_setup_callback_vector is an x86-only thing, its definition is present in arch/x86/xen/xen-ops.h but not on ARM. In events_base.c there is a stub for !CONFIG_XEN_PVHVM but it is not declared as 'static'. On x86 the situation is hardly better: drivers/xen/events/events_base.c doesn't include 'xen-ops.h' from arch/x86/xen/, it includes its namesake from include/xen/ which also results in a 'no previous prototype' warning. 
Currently, xen_setup_callback_vector() has two call sites: one in drivers/xen/events/events_base.c and another in arch/x86/xen/suspend_hvm.c. The former is placed under #ifdef CONFIG_X86 and the latter is only compiled in when CONFIG_XEN_PVHVM is enabled. Resolve the issue by moving the xen_setup_callback_vector() declaration to the arch-neutral 'include/xen/hvm.h', as the implementation lives in the arch-neutral drivers/xen/events/events_base.c. Reported-by: kbuild test robot Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Link: https://lkml.kernel.org/r/20200520161600.361895-1-vkuznets@redhat.com --- arch/x86/xen/suspend_hvm.c | 1 + arch/x86/xen/xen-ops.h | 1 - include/xen/hvm.h | 2 ++ include/xen/interface/hvm/hvm_op.h | 2 ++ 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c index 5152afe1687669..9d548b0c772f15 100644 --- a/arch/x86/xen/suspend_hvm.c +++ b/arch/x86/xen/suspend_hvm.c @@ -2,6 +2,7 @@ #include #include +#include #include #include diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index ad05d058938180..53b224fd617706 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -54,7 +54,6 @@ void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); -void xen_setup_callback_vector(void); void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); diff --git a/include/xen/hvm.h b/include/xen/hvm.h index 0b15f8cb17fcbc..b7fd7fc9ad414a 100644 --- a/include/xen/hvm.h +++ b/include/xen/hvm.h @@ -58,4 +58,6 @@ static inline int hvm_get_parameter(int idx, uint64_t *value) #define HVM_CALLBACK_VECTOR(x) (((uint64_t)HVM_CALLBACK_VIA_TYPE_VECTOR)<<\ HVM_CALLBACK_VIA_TYPE_SHIFT | (x)) +void xen_setup_callback_vector(void); + #endif /* XEN_HVM_H__ */ diff --git a/include/xen/interface/hvm/hvm_op.h b/include/xen/interface/hvm/hvm_op.h index 956a04682865bb..25d945ef17de0e 100644 --- a/include/xen/interface/hvm/hvm_op.h +++ b/include/xen/interface/hvm/hvm_op.h @@ -21,6 +21,8 @@ #ifndef __XEN_PUBLIC_HVM_HVM_OP_H__ #define __XEN_PUBLIC_HVM_HVM_OP_H__ +#include + /* Get/set subcommands: the second argument of the hypercall is a * pointer to a xen_hvm_param struct. */ #define HVMOP_set_param 0 From d390e6de89d30402bd06056c40cea72328aec9b1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 29 May 2020 23:27:29 +0200 Subject: [PATCH 150/190] x86/hw_breakpoint: Add within_area() to check data breakpoints Add a within_area() helper for checking whether data breakpoints overlap with cpu_entry_area. It will be used to completely prevent data breakpoints on GDT, IDT, or TSS. Signed-off-by: Lai Jiangshan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200526014221.2119-2-laijs@linux.alibaba.com Link: https://lkml.kernel.org/r/20200529213320.784524504@infradead.org --- arch/x86/kernel/hw_breakpoint.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 9ddf441ccaa87d..c149c7b29ac318 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -227,14 +227,23 @@ int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); } +/* + * Checks whether the range [addr, end], overlaps the area [base, base + size).
+ */ +static inline bool within_area(unsigned long addr, unsigned long end, + unsigned long base, unsigned long size) +{ + return end >= base && addr < (base + size); +} + /* * Checks whether the range from addr to end, inclusive, overlaps the CPU * entry area range. */ static inline bool within_cpu_entry_area(unsigned long addr, unsigned long end) { - return end >= CPU_ENTRY_AREA_BASE && - addr < (CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_TOTAL_SIZE); + return within_area(addr, end, CPU_ENTRY_AREA_BASE, + CPU_ENTRY_AREA_TOTAL_SIZE); } static int arch_build_bp_info(struct perf_event *bp, From 97417cb9ad4ed052d7a4c5c0d75db1ff1b0981fb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 29 May 2020 23:27:30 +0200 Subject: [PATCH 151/190] x86/hw_breakpoint: Prevent data breakpoints on direct GDT A data breakpoint on the GDT can be fatal and must be avoided. The GDT in the CPU entry area is already protected, but not the direct GDT. Add the necessary protection. Signed-off-by: Lai Jiangshan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200526014221.2119-3-laijs@linux.alibaba.com Link: https://lkml.kernel.org/r/20200529213320.840953950@infradead.org --- arch/x86/kernel/hw_breakpoint.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index c149c7b29ac318..f859095c1b6c25 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -32,6 +32,7 @@ #include #include #include +#include /* Per cpu debug control register value */ DEFINE_PER_CPU(unsigned long, cpu_dr7); @@ -237,13 +238,26 @@ static inline bool within_area(unsigned long addr, unsigned long end, } /* - * Checks whether the range from addr to end, inclusive, overlaps the CPU - * entry area range. + * Checks whether the range from addr to end, inclusive, overlaps the fixed + * mapped CPU entry area range or other ranges used for CPU entry. */ -static inline bool within_cpu_entry_area(unsigned long addr, unsigned long end) +static inline bool within_cpu_entry(unsigned long addr, unsigned long end) { - return within_area(addr, end, CPU_ENTRY_AREA_BASE, - CPU_ENTRY_AREA_TOTAL_SIZE); + int cpu; + + /* CPU entry erea is always used for CPU entry */ + if (within_area(addr, end, CPU_ENTRY_AREA_BASE, + CPU_ENTRY_AREA_TOTAL_SIZE)) + return true; + + for_each_possible_cpu(cpu) { + /* The original rw GDT is being used after load_direct_gdt() */ + if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu), + GDT_SIZE)) + return true; + } + + return false; } static int arch_build_bp_info(struct perf_event *bp, @@ -257,12 +271,12 @@ static int arch_build_bp_info(struct perf_event *bp, return -EINVAL; /* - * Prevent any breakpoint of any type that overlaps the - * cpu_entry_area. This protects the IST stacks and also + * Prevent any breakpoint of any type that overlaps the CPU + * entry area and data. This protects the IST stacks and also * reduces the chance that we ever find out what happens if * there's a data breakpoint on the GDT, IDT, or TSS. 
*/ - if (within_cpu_entry_area(attr->bp_addr, bp_end)) + if (within_cpu_entry(attr->bp_addr, bp_end)) return -EINVAL; hw->address = attr->bp_addr; From f9fe0b89f05441c6e4034e024c2c75a0d93024c1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 29 May 2020 23:27:31 +0200 Subject: [PATCH 152/190] x86/hw_breakpoint: Prevent data breakpoints on per_cpu cpu_tss_rw cpu_tss_rw is not directly referenced by hardware, but cpu_tss_rw is accessed in CPU entry code, especially when #DB shifts its stacks. If a data breakpoint would be set on cpu_tss_rw.x86_tss.ist[IST_INDEX_DB], it would cause recursive #DB ending up in a double fault. Add it to the list of protected items. Signed-off-by: Lai Jiangshan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200526014221.2119-4-laijs@linux.alibaba.com Link: https://lkml.kernel.org/r/20200529213320.897976479@infradead.org --- arch/x86/kernel/hw_breakpoint.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index f859095c1b6c25..f311bbfda1ba93 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -255,6 +255,15 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end) if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu), GDT_SIZE)) return true; + + /* + * cpu_tss_rw is not directly referenced by hardware, but + * cpu_tss_rw is also used in CPU entry code, + */ + if (within_area(addr, end, + (unsigned long)&per_cpu(cpu_tss_rw, cpu), + sizeof(struct tss_struct))) + return true; } return false; From fdef24dfccb7be06e6ebe11d6c6c56987421870f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 29 May 2020 23:27:32 +0200 Subject: [PATCH 153/190] x86/hw_breakpoint: Prevent data breakpoints on user_pcid_flush_mask The per-CPU user_pcid_flush_mask is used in the low level entry code. A data breakpoint can cause #DB recursion. Protect the full cpu_tlbstate structure for simplicity. Signed-off-by: Lai Jiangshan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200526014221.2119-5-laijs@linux.alibaba.com Link: https://lkml.kernel.org/r/20200529213320.955117574@infradead.org --- arch/x86/kernel/hw_breakpoint.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index f311bbfda1ba93..fc1743a2b0e959 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -33,6 +33,7 @@ #include #include #include +#include /* Per cpu debug control register value */ DEFINE_PER_CPU(unsigned long, cpu_dr7); @@ -264,6 +265,16 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end) (unsigned long)&per_cpu(cpu_tss_rw, cpu), sizeof(struct tss_struct))) return true; + + /* + * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry. + * If a data breakpoint on it, it will cause an unwanted #DB. + * Protect the full cpu_tlbstate structure to be sure. + */ + if (within_area(addr, end, + (unsigned long)&per_cpu(cpu_tlbstate, cpu), + sizeof(struct tlb_state))) + return true; } return false; From e1de11d4d1a64ac1b90b9833f1a3629dae18facb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 23:27:33 +0200 Subject: [PATCH 154/190] x86/entry: Introduce local_db_{save,restore}() In order to allow other exceptions than #DB to disable breakpoints, provide common helpers. 
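[ ed: The helpers land in the diff below; as a hedged usage sketch of the bracketing pattern they are designed for (exc_fragile() is a hypothetical name, the real call sites arrive in the follow-up NMI and #MC patches):

	static noinstr void exc_fragile(struct pt_regs *regs)
	{
		/*
		 * Disable all breakpoints. DR7 bit 10 always reads as 1 and
		 * is masked out, so "nothing armed" is returned as plain 0.
		 */
		unsigned long dr7 = local_db_save();

		/* ... work that must not recurse through #DB ... */

		local_db_restore(dr7);	/* writes DR7 only if it was non-zero */
	}
]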
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.012060983@infradead.org --- arch/x86/include/asm/debugreg.h | 30 ++++++++++++++++++++++++++++++ arch/x86/kernel/traps.c | 18 ++---------------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 1a8609a15856f6..4ef8690be8a76e 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -113,6 +113,36 @@ static inline void debug_stack_usage_inc(void) { } static inline void debug_stack_usage_dec(void) { } #endif /* X86_64 */ +static __always_inline unsigned long local_db_save(void) +{ + unsigned long dr7; + + get_debugreg(dr7, 7); + dr7 &= ~0x400; /* architecturally set bit */ + if (dr7) + set_debugreg(0, 7); + /* + * Ensure the compiler doesn't lower the above statements into + * the critical section; disabling breakpoints late would not + * be good. + */ + barrier(); + + return dr7; +} + +static __always_inline void local_db_restore(unsigned long dr7) +{ + /* + * Ensure the compiler doesn't raise this statement into + * the critical section; enabling breakpoints early would + * not be good. + */ + barrier(); + if (dr7) + set_debugreg(dr7, 7); +} + #ifdef CONFIG_CPU_SUP_AMD extern void set_dr_addr_mask(unsigned long mask, int dr); #else diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 50fb9cd5be97b0..bcb9dd961c6d60 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -727,15 +727,7 @@ static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7) * Entry text is excluded for HW_BP_X and cpu_entry_area, which * includes the entry stack is excluded for everything. */ - get_debugreg(*dr7, 7); - set_debugreg(0, 7); - - /* - * Ensure the compiler doesn't lower the above statements into - * the critical section; disabling breakpoints late would not - * be good. - */ - barrier(); + *dr7 = local_db_save(); /* * The Intel SDM says: @@ -756,13 +748,7 @@ static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7) static __always_inline void debug_exit(unsigned long dr7) { - /* - * Ensure the compiler doesn't raise this statement into - * the critical section; enabling breakpoints early would - * not be good. - */ - barrier(); - set_debugreg(dr7, 7); + local_db_restore(dr7); } /* From fd338e3564b0b8597a89f83941a0eda3e5092cc0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 23:27:34 +0200 Subject: [PATCH 155/190] x86/entry, nmi: Disable #DB Instead of playing stupid games with IST stacks, fully disallow #DB during NMIs. There is absolutely no reason to allow them, and killing this saves a heap of trouble. #DB is already forbidden on noinstr and CEA, so there can't be a #DB before this. Disabling it right after nmi_enter() ensures that the full NMI code is protected. 
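[ ed: A condensed, hedged sketch of the resulting shape of exc_nmi() per the diff below. Note that the diff places local_db_save() just before nmi_enter(), so breakpoints are already dead by the time the NMI context is established; nmi_dr7 is a per-CPU slot, matching the existing nmi_cr2 handling. CR2 and nesting details are elided:

	DEFINE_IDTENTRY_NMI(exc_nmi)
	{
		this_cpu_write(nmi_dr7, local_db_save());	/* no #DB from here on */
		nmi_enter();
		/* ... default_do_nmi(regs) ... */
		nmi_exit();
		local_db_restore(this_cpu_read(nmi_dr7));	/* re-arm breakpoints */
	}
]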
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.069223695@infradead.org --- arch/x86/kernel/nmi.c | 55 +++---------------------------------------- 1 file changed, 3 insertions(+), 52 deletions(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 5df4e7f5836961..873a8c040b86d3 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -474,40 +474,7 @@ enum nmi_states { }; static DEFINE_PER_CPU(enum nmi_states, nmi_state); static DEFINE_PER_CPU(unsigned long, nmi_cr2); - -#ifdef CONFIG_X86_64 -/* - * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without - * some care, the inner breakpoint will clobber the outer breakpoint's - * stack. - * - * If a breakpoint is being processed, and the debug stack is being - * used, if an NMI comes in and also hits a breakpoint, the stack - * pointer will be set to the same fixed address as the breakpoint that - * was interrupted, causing that stack to be corrupted. To handle this - * case, check if the stack that was interrupted is the debug stack, and - * if so, change the IDT so that new breakpoints will use the current - * stack and not switch to the fixed address. On return of the NMI, - * switch back to the original IDT. - */ -static DEFINE_PER_CPU(int, update_debug_stack); - -static noinstr bool is_debug_stack(unsigned long addr) -{ - struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks); - unsigned long top = CEA_ESTACK_TOP(cs, DB); - unsigned long bot = CEA_ESTACK_BOT(cs, DB1); - - if (__this_cpu_read(debug_stack_usage)) - return true; - /* - * Note, this covers the guard page between DB and DB1 as well to - * avoid two checks. But by all means @addr can never point into - * the guard page. - */ - return addr >= bot && addr < top; -} -#endif +static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_NMI(exc_nmi) { @@ -522,18 +489,7 @@ DEFINE_IDTENTRY_NMI(exc_nmi) this_cpu_write(nmi_cr2, read_cr2()); nmi_restart: -#ifdef CONFIG_X86_64 - /* - * If we interrupted a breakpoint, it is possible that - * the nmi handler will have breakpoints too. We need to - * change the IDT such that breakpoints that happen here - * continue to use the NMI stack. - */ - if (unlikely(is_debug_stack(regs->sp))) { - debug_stack_set_zero(); - this_cpu_write(update_debug_stack, 1); - } -#endif + this_cpu_write(nmi_dr7, local_db_save()); nmi_enter(); @@ -544,12 +500,7 @@ DEFINE_IDTENTRY_NMI(exc_nmi) nmi_exit(); -#ifdef CONFIG_X86_64 - if (unlikely(this_cpu_read(update_debug_stack))) { - debug_stack_reset(); - this_cpu_write(update_debug_stack, 0); - } -#endif + local_db_restore(this_cpu_read(nmi_dr7)); if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) write_cr2(this_cpu_read(nmi_cr2)); From cd840e424f27fcc1ae8d14b7ec3ec4560ee6561a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 23:27:35 +0200 Subject: [PATCH 156/190] x86/entry, mce: Disallow #DB during #MC #MC is fragile as heck, don't tempt fate. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.131187767@infradead.org --- arch/x86/kernel/cpu/mce/core.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 068e6cab1286c0..be499267bbb47a 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1943,22 +1943,34 @@ static __always_inline void exc_machine_check_user(struct pt_regs *regs) /* MCE hit kernel mode */ DEFINE_IDTENTRY_MCE(exc_machine_check) { + unsigned long dr7; + + dr7 = local_db_save(); exc_machine_check_kernel(regs); + local_db_restore(dr7); } /* The user mode variant. */ DEFINE_IDTENTRY_MCE_USER(exc_machine_check) { + unsigned long dr7; + + dr7 = local_db_save(); exc_machine_check_user(regs); + local_db_restore(dr7); } #else /* 32bit unified entry point */ DEFINE_IDTENTRY_MCE(exc_machine_check) { + unsigned long dr7; + + dr7 = local_db_save(); if (user_mode(regs)) exc_machine_check_user(regs); else exc_machine_check_kernel(regs); + local_db_restore(dr7); } #endif From 84b6a3491567a540f955e18d8e615493afa36df0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 23:27:36 +0200 Subject: [PATCH 157/190] x86/entry: Optimize local_db_save() for virt DRn access is 'difficult' with virt, while on native a DR7 read is cheaper than a cacheline miss. Add a virt-specific fast path to local_db_save() so that DRn is not touched at all when breakpoints are not in use. Suggested-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.187833200@infradead.org --- arch/x86/include/asm/debugreg.h | 5 ++++- arch/x86/kernel/hw_breakpoint.c | 26 ++++++++++++++++++++++---- arch/x86/kvm/vmx/nested.c | 2 +- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 4ef8690be8a76e..3e1c5021d0f8d8 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -85,7 +85,7 @@ static inline void hw_breakpoint_disable(void) set_debugreg(0UL, 3); } -static inline int hw_breakpoint_active(void) +static inline bool hw_breakpoint_active(void) { return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; } @@ -117,6 +117,9 @@ static __always_inline unsigned long local_db_save(void) { unsigned long dr7; + if (static_cpu_has(X86_FEATURE_HYPERVISOR) && !hw_breakpoint_active()) + return 0; + get_debugreg(dr7, 7); dr7 &= ~0x400; /* architecturally set bit */ if (dr7) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index fc1743a2b0e959..8cdf29ffd95f17 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -99,6 +99,8 @@ int arch_install_hw_breakpoint(struct perf_event *bp) unsigned long *dr7; int i; + lockdep_assert_irqs_disabled(); + for (i = 0; i < HBP_NUM; i++) { struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); @@ -117,6 +119,12 @@ int arch_install_hw_breakpoint(struct perf_event *bp) dr7 = this_cpu_ptr(&cpu_dr7); *dr7 |= encode_dr7(i, info->len, info->type); + /* + * Ensure we first write cpu_dr7 before we set the DR7 register. + * This ensures an NMI never see cpu_dr7 0 when DR7 is not.
+ */ + barrier(); + set_debugreg(*dr7, 7); if (info->mask) set_dr_addr_mask(info->mask, i); @@ -136,9 +144,11 @@ int arch_install_hw_breakpoint(struct perf_event *bp) void arch_uninstall_hw_breakpoint(struct perf_event *bp) { struct arch_hw_breakpoint *info = counter_arch_bp(bp); - unsigned long *dr7; + unsigned long dr7; int i; + lockdep_assert_irqs_disabled(); + for (i = 0; i < HBP_NUM; i++) { struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); @@ -151,12 +161,20 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) return; - dr7 = this_cpu_ptr(&cpu_dr7); - *dr7 &= ~__encode_dr7(i, info->len, info->type); + dr7 = this_cpu_read(cpu_dr7); + dr7 &= ~__encode_dr7(i, info->len, info->type); - set_debugreg(*dr7, 7); + set_debugreg(dr7, 7); if (info->mask) set_dr_addr_mask(0, i); + + /* + * Ensure the write to cpu_dr7 is after we've set the DR7 register. + * This ensures an NMI never see cpu_dr7 0 when DR7 is not. + */ + barrier(); + + this_cpu_write(cpu_dr7, dr7); } static int arch_bp_generic_len(int x86_len) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 9c74a732b08d8f..2e7238a57fc1f3 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3087,9 +3087,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) /* * VMExit clears RFLAGS.IF and DR7, even on a consistency check. */ - local_irq_enable(); if (hw_breakpoint_active()) set_debugreg(__this_cpu_read(cpu_dr7), 7); + local_irq_enable(); preempt_enable(); /* From f9912ada82862df341b3e86864cbd532d0d24b84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 23:27:37 +0200 Subject: [PATCH 158/190] x86/entry: Remove debug IDT frobbing This is all unused now. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.245019500@infradead.org --- arch/x86/include/asm/debugreg.h | 19 ------------------ arch/x86/include/asm/desc.h | 34 +-------------------------------- arch/x86/kernel/cpu/common.c | 17 ----------------- arch/x86/kernel/idt.c | 30 ----------------------------- arch/x86/kernel/traps.c | 9 --------- 5 files changed, 1 insertion(+), 108 deletions(-) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 3e1c5021d0f8d8..42fc35d8653570 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -94,25 +94,6 @@ extern void aout_dump_debugregs(struct user *dump); extern void hw_breakpoint_restore(void); -#ifdef CONFIG_X86_64 -DECLARE_PER_CPU(int, debug_stack_usage); -static inline void debug_stack_usage_inc(void) -{ - __this_cpu_inc(debug_stack_usage); -} -static inline void debug_stack_usage_dec(void) -{ - __this_cpu_dec(debug_stack_usage); -} -void debug_stack_set_zero(void); -void debug_stack_reset(void); -#else /* !X86_64 */ -static inline void debug_stack_set_zero(void) { } -static inline void debug_stack_reset(void) { } -static inline void debug_stack_usage_inc(void) { } -static inline void debug_stack_usage_dec(void) { } -#endif /* X86_64 */ - static __always_inline unsigned long local_db_save(void) { unsigned long dr7; diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index d6c3d346c63af4..07632f31147aed 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -42,8 +42,6 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in extern struct desc_ptr idt_descr; extern gate_desc idt_table[]; -extern const struct 
desc_ptr debug_idt_descr; -extern gate_desc debug_idt_table[]; struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; @@ -390,31 +388,6 @@ void alloc_intr_gate(unsigned int n, const void *addr); extern unsigned long system_vectors[]; -#ifdef CONFIG_X86_64 -DECLARE_PER_CPU(u32, debug_idt_ctr); -static __always_inline bool is_debug_idt_enabled(void) -{ - if (this_cpu_read(debug_idt_ctr)) - return true; - - return false; -} - -static __always_inline void load_debug_idt(void) -{ - load_idt((const struct desc_ptr *)&debug_idt_descr); -} -#else -static inline bool is_debug_idt_enabled(void) -{ - return false; -} - -static inline void load_debug_idt(void) -{ -} -#endif - /* * The load_current_idt() must be called with interrupts disabled * to avoid races. That way the IDT will always be set back to the expected @@ -424,10 +397,7 @@ static inline void load_debug_idt(void) */ static __always_inline void load_current_idt(void) { - if (is_debug_idt_enabled()) - load_debug_idt(); - else - load_idt((const struct desc_ptr *)&idt_descr); + load_idt((const struct desc_ptr *)&idt_descr); } extern void idt_setup_early_handler(void); @@ -438,11 +408,9 @@ extern void idt_setup_apic_and_irq_gates(void); #ifdef CONFIG_X86_64 extern void idt_setup_early_pf(void); extern void idt_setup_ist_traps(void); -extern void idt_setup_debugidt_traps(void); #else static inline void idt_setup_early_pf(void) { } static inline void idt_setup_ist_traps(void) { } -static inline void idt_setup_debugidt_traps(void) { } #endif extern void idt_invalidate(void *addr); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f4645f9ff9cb55..043d93cdcaad29 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1706,23 +1706,6 @@ void syscall_init(void) X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } -DEFINE_PER_CPU(int, debug_stack_usage); -DEFINE_PER_CPU(u32, debug_idt_ctr); - -noinstr void debug_stack_set_zero(void) -{ - this_cpu_inc(debug_idt_ctr); - load_current_idt(); -} - -noinstr void debug_stack_reset(void) -{ - if (WARN_ON(!this_cpu_read(debug_idt_ctr))) - return; - if (this_cpu_dec_return(debug_idt_ctr) == 0) - load_current_idt(); -} - #else /* CONFIG_X86_64 */ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index bc9b0d1d7bb8ac..226c9922988659 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -158,14 +158,6 @@ static const __initconst struct idt_data apic_idts[] = { static const __initconst struct idt_data early_pf_idts[] = { INTG(X86_TRAP_PF, asm_exc_page_fault), }; - -/* - * Override for the debug_idt. Same as the default, but with interrupt - * stack set to DEFAULT_STACK (0). Required for NMI trap handling. - */ -static const __initconst struct idt_data dbg_idts[] = { - INTG(X86_TRAP_DB, asm_exc_debug), -}; #endif /* Must be page-aligned because the real IDT is used in a fixmap. */ @@ -177,9 +169,6 @@ struct desc_ptr idt_descr __ro_after_init = { }; #ifdef CONFIG_X86_64 -/* No need to be aligned, but done to keep all IDTs defined the same way. */ -gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; - /* * The exceptions which use Interrupt stacks. They are setup after * cpu_init() when the TSS has been initialized. @@ -192,15 +181,6 @@ static const __initconst struct idt_data ist_idts[] = { ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), #endif }; - -/* - * Override for the debug_idt. Same as the default, but with interrupt - * stack set to DEFAULT_STACK (0). 
Required for NMI trap handling. - */ -const struct desc_ptr debug_idt_descr = { - .size = IDT_ENTRIES * 16 - 1, - .address = (unsigned long) debug_idt_table, -}; #endif static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) @@ -292,16 +272,6 @@ void __init idt_setup_ist_traps(void) { idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true); } - -/** - * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps - */ -void __init idt_setup_debugidt_traps(void) -{ - memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); - - idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false); -} #endif /** diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bcb9dd961c6d60..6f887be1ac0c85 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -798,12 +798,6 @@ static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6, return; } - /* - * Let others (NMI) know that the debug stack is in use - * as we may switch to the interrupt stack. - */ - debug_stack_usage_inc(); - /* It's safe to allow irq's after DR6 has been saved */ cond_local_irq_enable(regs); @@ -831,7 +825,6 @@ static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6, out: cond_local_irq_disable(regs); - debug_stack_usage_dec(); instrumentation_end(); } @@ -1077,6 +1070,4 @@ void __init trap_init(void) cpu_init(); idt_setup_ist_traps(); - - idt_setup_debugidt_traps(); } From fd501d4f0399700011acde486576c7c1eb8e7a61 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 23:27:38 +0200 Subject: [PATCH 159/190] x86/entry: Remove DBn stacks Both #DB itself, as all other IST users (NMI, #MC) now clear DR7 on entry. Combined with not allowing breakpoints on entry/noinstr/NOKPROBE text and no single step (EFLAGS.TF) inside the #DB handler should guarantee no nested #DB. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.303027161@infradead.org --- arch/x86/entry/entry_64.S | 17 ----------------- arch/x86/include/asm/cpu_entry_area.h | 12 +++--------- arch/x86/kernel/asm-offsets_64.c | 3 --- arch/x86/kernel/dumpstack_64.c | 7 ++----- arch/x86/mm/cpu_entry_area.c | 1 - 5 files changed, 5 insertions(+), 35 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 265ff97b396171..8ecaeee5365383 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -396,11 +396,6 @@ SYM_CODE_END(\asmsym) idtentry \vector asm_\cfunc \cfunc has_error_code=0 .endm -/* - * MCE and DB exceptions - */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) - /** * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB * @vector: Vector number @@ -416,10 +411,6 @@ SYM_CODE_END(\asmsym) * If hits in kernel mode then it needs to go through the paranoid * entry as the exception can hit any random state. No preemption * check on exit to keep the paranoid path simple. - * - * If the trap is #DB then the interrupt stack entry in the IST is - * moved to the second stack, so a potential recursion will have a - * fresh IST. 
*/ .macro idtentry_mce_db vector asmsym cfunc SYM_CODE_START(\asmsym) @@ -445,16 +436,8 @@ SYM_CODE_START(\asmsym) movq %rsp, %rdi /* pt_regs pointer */ - .if \vector == X86_TRAP_DB - subq $DB_STACK_OFFSET, CPU_TSS_IST(IST_INDEX_DB) - .endif - call \cfunc - .if \vector == X86_TRAP_DB - addq $DB_STACK_OFFSET, CPU_TSS_IST(IST_INDEX_DB) - .endif - jmp paranoid_exit /* Switch to the regular task stack and use the noist entry point */ diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 02c0078d3787b4..8902fdb7de13e1 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -11,15 +11,11 @@ #ifdef CONFIG_X86_64 /* Macro to enforce the same ordering and stack sizes */ -#define ESTACKS_MEMBERS(guardsize, db2_holesize)\ +#define ESTACKS_MEMBERS(guardsize) \ char DF_stack_guard[guardsize]; \ char DF_stack[EXCEPTION_STKSZ]; \ char NMI_stack_guard[guardsize]; \ char NMI_stack[EXCEPTION_STKSZ]; \ - char DB2_stack_guard[guardsize]; \ - char DB2_stack[db2_holesize]; \ - char DB1_stack_guard[guardsize]; \ - char DB1_stack[EXCEPTION_STKSZ]; \ char DB_stack_guard[guardsize]; \ char DB_stack[EXCEPTION_STKSZ]; \ char MCE_stack_guard[guardsize]; \ @@ -28,12 +24,12 @@ /* The exception stacks' physical storage. No guard pages required */ struct exception_stacks { - ESTACKS_MEMBERS(0, 0) + ESTACKS_MEMBERS(0) }; /* The effective cpu entry area mapping with guard pages. */ struct cea_exception_stacks { - ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ) + ESTACKS_MEMBERS(PAGE_SIZE) }; /* @@ -42,8 +38,6 @@ struct cea_exception_stacks { enum exception_stack_ordering { ESTACK_DF, ESTACK_NMI, - ESTACK_DB2, - ESTACK_DB1, ESTACK_DB, ESTACK_MCE, N_EXCEPTION_STACKS diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index c2a47016f24334..828be792231e9d 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -57,9 +57,6 @@ int main(void) BLANK(); #undef ENTRY - OFFSET(TSS_ist, tss_struct, x86_tss.ist); - DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) - - offsetof(struct cea_exception_stacks, DB1_stack)); BLANK(); #ifdef CONFIG_STACKPROTECTOR diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 460ae7f66818c7..4a94d38cd14147 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -22,15 +22,13 @@ static const char * const exception_stack_names[] = { [ ESTACK_DF ] = "#DF", [ ESTACK_NMI ] = "NMI", - [ ESTACK_DB2 ] = "#DB2", - [ ESTACK_DB1 ] = "#DB1", [ ESTACK_DB ] = "#DB", [ ESTACK_MCE ] = "#MC", }; const char *stack_type_name(enum stack_type type) { - BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); if (type == STACK_TYPE_IRQ) return "IRQ"; @@ -79,7 +77,6 @@ static const struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { EPAGERANGE(DF), EPAGERANGE(NMI), - EPAGERANGE(DB1), EPAGERANGE(DB), EPAGERANGE(MCE), }; @@ -91,7 +88,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) struct pt_regs *regs; unsigned int k; - BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); begin = (unsigned long)__this_cpu_read(cea_exception_stacks); /* diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 6f8b48f545f4e3..770b613790b3db 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -107,7 +107,6 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) 
*/ cea_map_stack(DF); cea_map_stack(NMI); - cea_map_stack(DB1); cea_map_stack(DB); cea_map_stack(MCE); } From 59bc300b712998d10254ee20e24f2e7ec09c560a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 23:27:39 +0200 Subject: [PATCH 160/190] x86/entry: Clarify irq_{enter,exit}_rcu() Because: irq_enter_rcu() includes lockdep_hardirq_enter() irq_exit_rcu() does *NOT* include lockdep_hardirq_exit() Which resulted in two 'stray' lockdep_hardirq_exit() calls in idtentry.h, and me spending a long time trying to find the matching enter calls. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.359433429@infradead.org --- arch/x86/include/asm/idtentry.h | 2 -- kernel/softirq.c | 21 ++++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 38b672ded40bd1..d203c541a65a4e 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -206,7 +206,6 @@ __visible noinstr void func(struct pt_regs *regs, \ kvm_set_cpu_l1tf_flush_l1d(); \ __##func (regs, (u8)error_code); \ irq_exit_rcu(); \ - lockdep_hardirq_exit(); \ instrumentation_end(); \ idtentry_exit_cond_rcu(regs, rcu_exit); \ } \ @@ -249,7 +248,6 @@ __visible noinstr void func(struct pt_regs *regs) \ kvm_set_cpu_l1tf_flush_l1d(); \ run_on_irqstack_cond(__##func, regs, regs); \ irq_exit_rcu(); \ - lockdep_hardirq_exit(); \ instrumentation_end(); \ idtentry_exit_cond_rcu(regs, rcu_exit); \ } \ diff --git a/kernel/softirq.c b/kernel/softirq.c index beb8e3a66c7c44..c4201b7f42b1f9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -404,12 +404,7 @@ static inline void tick_irq_exit(void) #endif } -/** - * irq_exit_rcu() - Exit an interrupt context without updating RCU - * - * Also processes softirqs if needed and possible. - */ -void irq_exit_rcu(void) +static inline void __irq_exit_rcu(void) { #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED local_irq_disable(); @@ -424,6 +419,18 @@ void irq_exit_rcu(void) tick_irq_exit(); } +/** + * irq_exit_rcu() - Exit an interrupt context without updating RCU + * + * Also processes softirqs if needed and possible. + */ +void irq_exit_rcu(void) +{ + __irq_exit_rcu(); + /* must be last! */ + lockdep_hardirq_exit(); +} + /** * irq_exit - Exit an interrupt context, update RCU and lockdep * @@ -431,7 +438,7 @@ void irq_exit_rcu(void) */ void irq_exit(void) { - irq_exit_rcu(); + __irq_exit_rcu(); rcu_irq_exit(); /* must be last! */ lockdep_hardirq_exit(); From bf2b3008440072068580c609d79a079656af0588 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 23:27:40 +0200 Subject: [PATCH 161/190] x86/entry: Rename trace_hardirqs_off_prepare() The typical pattern for trace_hardirqs_off_prepare() is: ENTRY lockdep_hardirqs_off(); // because hardware ... do entry magic instrumentation_begin(); trace_hardirqs_off_prepare(); ... do actual work trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); instrumentation_end(); ... do exit magic lockdep_hardirqs_on(); which shows that it's named wrong, rename it to trace_hardirqs_off_finish(), as it concludes the hardirq_off transition. Also, given that the above is the only correct order, make the traditional all-in-one trace_hardirqs_off() follow suit. 
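[ ed: A hedged sketch making the new naming concrete; idtentry_body_model() is a made-up name, and the sequence simply mirrors the pattern quoted in the commit message above with the renamed helper filled in:

	static noinstr void idtentry_body_model(struct pt_regs *regs)
	{
		lockdep_hardirqs_off(CALLER_ADDR0);	/* hardware already disabled IRQs */

		instrumentation_begin();
		trace_hardirqs_off_finish();		/* concludes the off-transition */

		/* ... the actual exception work ... */

		trace_hardirqs_on_prepare();		/* starts the on-transition */
		lockdep_hardirqs_on_prepare(CALLER_ADDR0);
		instrumentation_end();

		lockdep_hardirqs_on(CALLER_ADDR0);	/* the exit path re-enables IRQs */
	}
]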
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.415774872@infradead.org --- arch/x86/entry/common.c | 6 +++--- arch/x86/kernel/cpu/mce/core.c | 2 +- arch/x86/kernel/nmi.c | 2 +- arch/x86/kernel/traps.c | 4 ++-- include/linux/irqflags.h | 4 ++-- kernel/trace/trace_preemptirq.c | 10 +++++----- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index b0b1c3cf0e6e5c..f4d57782c14bb4 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -65,7 +65,7 @@ static noinstr void enter_from_user_mode(void) instrumentation_begin(); CT_WARN_ON(state != CONTEXT_USER); - trace_hardirqs_off_prepare(); + trace_hardirqs_off_finish(); instrumentation_end(); } #else @@ -73,7 +73,7 @@ static __always_inline void enter_from_user_mode(void) { lockdep_hardirqs_off(CALLER_ADDR0); instrumentation_begin(); - trace_hardirqs_off_prepare(); + trace_hardirqs_off_finish(); instrumentation_end(); } #endif @@ -569,7 +569,7 @@ bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) lockdep_hardirqs_off(CALLER_ADDR0); rcu_irq_enter(); instrumentation_begin(); - trace_hardirqs_off_prepare(); + trace_hardirqs_off_finish(); instrumentation_end(); return true; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index be499267bbb47a..b9cb381b4019ca 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1922,7 +1922,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) * that out because it's an indirect call. Annotate it. */ instrumentation_begin(); - trace_hardirqs_off_prepare(); + trace_hardirqs_off_finish(); machine_check_vector(regs); if (regs->flags & X86_EFLAGS_IF) trace_hardirqs_on_prepare(); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 873a8c040b86d3..3a98ff36f4115f 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -330,7 +330,7 @@ static noinstr void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); instrumentation_begin(); - trace_hardirqs_off_prepare(); + trace_hardirqs_off_finish(); handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6f887be1ac0c85..79af913e78a388 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -634,7 +634,7 @@ DEFINE_IDTENTRY_RAW(exc_int3) } else { nmi_enter(); instrumentation_begin(); - trace_hardirqs_off_prepare(); + trace_hardirqs_off_finish(); if (!do_int3(regs)) die("int3", regs, 0); if (regs->flags & X86_EFLAGS_IF) @@ -833,7 +833,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, { nmi_enter(); instrumentation_begin(); - trace_hardirqs_off_prepare(); + trace_hardirqs_off_finish(); instrumentation_end(); /* diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index d7f7e436c3afc7..6384d2813deda8 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -32,7 +32,7 @@ #ifdef CONFIG_TRACE_IRQFLAGS extern void trace_hardirqs_on_prepare(void); - extern void trace_hardirqs_off_prepare(void); + extern void trace_hardirqs_off_finish(void); extern void trace_hardirqs_on(void); extern void trace_hardirqs_off(void); # define lockdep_hardirq_context(p) ((p)->hardirq_context) @@ -101,7 +101,7 @@ do { \ #else # define trace_hardirqs_on_prepare() do { } while (0) -# define trace_hardirqs_off_prepare() do { } while (0) +# define trace_hardirqs_off_finish() do 
{ } while (0) # define trace_hardirqs_on() do { } while (0) # define trace_hardirqs_off() do { } while (0) # define lockdep_hardirq_context(p) 0 diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index fb0691b8a88de6..f10073e6260305 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -58,7 +58,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on); * and lockdep uses a staged approach which splits the lockdep hardirq * tracking into a RCU on and a RCU off section. */ -void trace_hardirqs_off_prepare(void) +void trace_hardirqs_off_finish(void) { if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); @@ -68,19 +68,19 @@ void trace_hardirqs_off_prepare(void) } } -EXPORT_SYMBOL(trace_hardirqs_off_prepare); -NOKPROBE_SYMBOL(trace_hardirqs_off_prepare); +EXPORT_SYMBOL(trace_hardirqs_off_finish); +NOKPROBE_SYMBOL(trace_hardirqs_off_finish); void trace_hardirqs_off(void) { + lockdep_hardirqs_off(CALLER_ADDR0); + if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); if (!in_nmi()) trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); } - - lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off); NOKPROBE_SYMBOL(trace_hardirqs_off); From bdf5bde8aec7b53d0ea3a44d880a4e5106ff37f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 May 2020 16:53:16 +0200 Subject: [PATCH 162/190] x86/idt: Mark init only functions __init Since 8175cfbbbfcb ("x86/idt: Remove update_intr_gate()") set_intr_gate() and idt_setup_from_table() are only called from __init functions. Mark them as well. Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200528145522.715816477@linutronix.de --- arch/x86/kernel/idt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 226c9922988659..4b99f7bec38443 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -197,7 +197,7 @@ static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) #endif } -static void +static __init void idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) { gate_desc desc; @@ -210,7 +210,7 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sy } } -static void set_intr_gate(unsigned int n, const void *addr) +static __init void set_intr_gate(unsigned int n, const void *addr) { struct idt_data data; From 94438af40d06c110988fc9e30baf801f38b1491a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 May 2020 16:53:17 +0200 Subject: [PATCH 163/190] x86/idt: Add comments about early #PF handling The difference between 32 and 64 bit vs. early #PF handling is not documented. Replace the FIXME at idt_setup_early_pf() with proper comments. Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200528145522.807135882@linutronix.de --- arch/x86/kernel/idt.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 4b99f7bec38443..5ef82fcf333d06 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -61,7 +61,11 @@ static bool idt_setup_done __initdata; static const __initconst struct idt_data early_idts[] = { INTG(X86_TRAP_DB, asm_exc_debug), SYSG(X86_TRAP_BP, asm_exc_int3), + #ifdef CONFIG_X86_32 + /* + * Not possible on 64-bit. See idt_setup_early_pf() for details. 
+ */ INTG(X86_TRAP_PF, asm_exc_page_fault), #endif }; @@ -256,8 +260,10 @@ void __init idt_setup_traps(void) * cpu_init() is invoked and sets up TSS. The IST variant is installed * after that. * - * FIXME: Why is 32bit and 64bit installing the PF handler at different - * places in the early setup code? + * Note, that X86_64 cannot install the real #PF handler in + * idt_setup_early_traps() because the memory intialization needs the #PF + * handler from the early_idt_handler_array to initialize the early page + * tables. */ void __init idt_setup_early_pf(void) { From 5a2bafca1b0675a126143eea3610143130347783 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 May 2020 16:53:18 +0200 Subject: [PATCH 164/190] x86/idt: Use proper constants for table size Use the actual struct size to calculate the IDT table size instead of hardcoded values. Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200528145522.898591501@linutronix.de --- arch/x86/kernel/idt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 5ef82fcf333d06..b6e1a87f082222 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -51,6 +51,7 @@ struct idt_data { #define TSKG(_vector, _gdt) \ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) +#define IDT_TABLE_SIZE (IDT_ENTRIES * sizeof(gate_desc)) static bool idt_setup_done __initdata; @@ -168,7 +169,7 @@ static const __initconst struct idt_data early_pf_idts[] = { gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; struct desc_ptr idt_descr __ro_after_init = { - .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1, + .size = IDT_TABLE_SIZE - 1, .address = (unsigned long) idt_table, }; From 00229a54300108502f68c8777faca2d13f805f1a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 May 2020 16:53:19 +0200 Subject: [PATCH 165/190] x86/idt: Cleanup trap_init() No point in having all the IDT cruft in trap_init(). Move it into the IDT code and fixup the comments. Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200528145522.992376498@linutronix.de --- arch/x86/kernel/idt.c | 18 ++++++++++++++++++ arch/x86/kernel/traps.c | 9 --------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index b6e1a87f082222..902cdd00631314 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -281,6 +282,19 @@ void __init idt_setup_ist_traps(void) } #endif +static void __init idt_map_in_cea(void) +{ + /* + * Set the IDT descriptor to a fixed read-only location in the cpu + * entry area, so that the "sidt" instruction will not leak the + * location of the kernel, and to defend the IDT against arbitrary + * memory write vulnerabilities. + */ + cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), + PAGE_KERNEL_RO); + idt_descr.address = CPU_ENTRY_AREA_RO_IDT; +} + /** * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates */ @@ -307,6 +321,10 @@ void __init idt_setup_apic_and_irq_gates(void) set_intr_gate(i, entry); } #endif + /* Map IDT into CPU entry area and reload it. 
*/ + idt_map_in_cea(); + load_idt(&idt_descr); + idt_setup_done = true; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 79af913e78a388..5566fe50ef98fe 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1055,15 +1055,6 @@ void __init trap_init(void) idt_setup_traps(); - /* - * Set the IDT descriptor to a fixed read-only location, so that the - * "sidt" instruction will not leak the location of the kernel, and - * to defend the IDT against arbitrary memory write vulnerabilities. - * It will be reloaded in cpu_init() */ - cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), - PAGE_KERNEL_RO); - idt_descr.address = CPU_ENTRY_AREA_RO_IDT; - /* * Should be a barrier for any external CPU state: */ From 3e77abda65b1cec10ef6b18b1ccfee0beaf400f1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 May 2020 16:53:20 +0200 Subject: [PATCH 166/190] x86/idt: Consolidate idt functionality - Move load_current_idt() out of line and replace the hideous comment with a lockdep assert. This allows to make idt_table and idt_descr static. - Mark idt_table read only after the IDT initialization is complete. - Shuffle code around to consolidate the #ifdef sections into one. - Adapt the F00F bug code. Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200528145523.084915381@linutronix.de --- arch/x86/include/asm/desc.h | 17 ++-------- arch/x86/kernel/idt.c | 63 ++++++++++++++++++++++--------------- arch/x86/mm/fault.c | 16 +++------- 3 files changed, 44 insertions(+), 52 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 07632f31147aed..1ced11d3193249 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -40,9 +40,6 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->l = 0; } -extern struct desc_ptr idt_descr; -extern gate_desc idt_table[]; - struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; } __attribute__((aligned(PAGE_SIZE))); @@ -388,22 +385,12 @@ void alloc_intr_gate(unsigned int n, const void *addr); extern unsigned long system_vectors[]; -/* - * The load_current_idt() must be called with interrupts disabled - * to avoid races. That way the IDT will always be set back to the expected - * descriptor. It's also called when a CPU is being initialized, and - * that doesn't need to disable interrupts, as nothing should be - * bothering the CPU then. - */ -static __always_inline void load_current_idt(void) -{ - load_idt((const struct desc_ptr *)&idt_descr); -} - +extern void load_current_idt(void); extern void idt_setup_early_handler(void); extern void idt_setup_early_traps(void); extern void idt_setup_traps(void); extern void idt_setup_apic_and_irq_gates(void); +extern bool idt_is_f00f_address(unsigned long address); #ifdef CONFIG_X86_64 extern void idt_setup_early_pf(void); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 902cdd00631314..0db21206f2f33a 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -156,37 +157,25 @@ static const __initconst struct idt_data apic_idts[] = { #endif }; -#ifdef CONFIG_X86_64 -/* - * Early traps running on the DEFAULT_STACK because the other interrupt - * stacks work only after cpu_init(). - */ -static const __initconst struct idt_data early_pf_idts[] = { - INTG(X86_TRAP_PF, asm_exc_page_fault), -}; -#endif - -/* Must be page-aligned because the real IDT is used in a fixmap. 
*/ -gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; +/* Must be page-aligned because the real IDT is used in the cpu entry area */ +static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; struct desc_ptr idt_descr __ro_after_init = { .size = IDT_TABLE_SIZE - 1, .address = (unsigned long) idt_table, }; -#ifdef CONFIG_X86_64 -/* - * The exceptions which use Interrupt stacks. They are setup after - * cpu_init() when the TSS has been initialized. - */ -static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), - ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), - ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), -#ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), -#endif -}; +void load_current_idt(void) +{ + lockdep_assert_irqs_disabled(); + load_idt(&idt_descr); +} + +#ifdef CONFIG_X86_F00F_BUG +bool idt_is_f00f_address(unsigned long address) +{ + return ((address - idt_descr.address) >> 3) == 6; +} #endif static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) @@ -255,6 +244,27 @@ void __init idt_setup_traps(void) } #ifdef CONFIG_X86_64 +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initconst struct idt_data early_pf_idts[] = { + INTG(X86_TRAP_PF, asm_exc_page_fault), +}; + +/* + * The exceptions which use Interrupt stacks. They are setup after + * cpu_init() when the TSS has been initialized. + */ +static const __initconst struct idt_data ist_idts[] = { + ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), +#ifdef CONFIG_X86_MCE + ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), +#endif +}; + /** * idt_setup_early_pf - Initialize the idt table with early pagefault handler * @@ -325,6 +335,9 @@ void __init idt_setup_apic_and_irq_gates(void) idt_map_in_cea(); load_idt(&idt_descr); + /* Make the IDT table read only */ + set_memory_ro((unsigned long)&idt_table, 1); + idt_setup_done = true; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index eef29bb53cd0b9..66be9bd60307d2 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -414,21 +414,13 @@ static int is_errata100(struct pt_regs *regs, unsigned long address) return 0; } +/* Pentium F0 0F C7 C8 bug workaround: */ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG - unsigned long nr; - - /* - * Pentium F0 0F C7 C8 bug workaround: - */ - if (boot_cpu_has_bug(X86_BUG_F00F)) { - nr = (address - idt_descr.address) >> 3; - - if (nr == 6) { - handle_invalid_op(regs); - return 1; - } + if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) { + handle_invalid_op(regs); + return 1; } #endif return 0; From 28eaf87121abfb574fabfb08e21322b4dc377b10 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2020 13:40:17 +0200 Subject: [PATCH 167/190] x86/entry: __always_inline debugreg for noinstr vmlinux.o: warning: objtool: exc_debug()+0x21: call to native_get_debugreg() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200603114051.954401211@infradead.org --- arch/x86/include/asm/debugreg.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 42fc35d8653570..e89558a3fe4af8 100644 --- 
a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -18,7 +18,7 @@ DECLARE_PER_CPU(unsigned long, cpu_dr7); native_set_debugreg(register, value) #endif -static inline unsigned long native_get_debugreg(int regno) +static __always_inline unsigned long native_get_debugreg(int regno) { unsigned long val = 0; /* Damn you, gcc! */ @@ -47,7 +47,7 @@ static inline unsigned long native_get_debugreg(int regno) return val; } -static inline void native_set_debugreg(int regno, unsigned long value) +static __always_inline void native_set_debugreg(int regno, unsigned long value) { switch (regno) { case 0: @@ -85,7 +85,7 @@ static inline void hw_breakpoint_disable(void) set_debugreg(0UL, 3); } -static inline bool hw_breakpoint_active(void) +static __always_inline bool hw_breakpoint_active(void) { return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; } From 7a745be1cc902d7376fdc29d6b5533eb46532be1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2020 13:40:18 +0200 Subject: [PATCH 168/190] x86/entry: __always_inline irqflags for noinstr vmlinux.o: warning: objtool: lockdep_hardirqs_on()+0x65: call to arch_local_save_flags() leaves .noinstr.text section vmlinux.o: warning: objtool: lockdep_hardirqs_off()+0x5d: call to arch_local_save_flags() leaves .noinstr.text section vmlinux.o: warning: objtool: lock_is_held_type()+0x35: call to arch_local_irq_save() leaves .noinstr.text section vmlinux.o: warning: objtool: check_preemption_disabled()+0x31: call to arch_local_save_flags() leaves .noinstr.text section vmlinux.o: warning: objtool: check_preemption_disabled()+0x33: call to arch_irqs_disabled_flags() leaves .noinstr.text section vmlinux.o: warning: objtool: lock_is_held_type()+0x2f: call to native_irq_disable() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200603114052.012171668@infradead.org --- arch/x86/include/asm/irqflags.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 8ddff8dbaed5ab..02a0cf547d7b48 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -17,7 +17,7 @@ /* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */ extern inline unsigned long native_save_fl(void); -extern inline unsigned long native_save_fl(void) +extern __always_inline unsigned long native_save_fl(void) { unsigned long flags; @@ -44,12 +44,12 @@ extern inline void native_restore_fl(unsigned long flags) :"memory", "cc"); } -static inline void native_irq_disable(void) +static __always_inline void native_irq_disable(void) { asm volatile("cli": : :"memory"); } -static inline void native_irq_enable(void) +static __always_inline void native_irq_enable(void) { asm volatile("sti": : :"memory"); } @@ -74,22 +74,22 @@ static inline __cpuidle void native_halt(void) #ifndef __ASSEMBLY__ #include -static inline notrace unsigned long arch_local_save_flags(void) +static __always_inline unsigned long arch_local_save_flags(void) { return native_save_fl(); } -static inline notrace void arch_local_irq_restore(unsigned long flags) +static __always_inline void arch_local_irq_restore(unsigned long flags) { native_restore_fl(flags); } -static inline notrace void arch_local_irq_disable(void) +static __always_inline void arch_local_irq_disable(void) { native_irq_disable(); } -static inline notrace void arch_local_irq_enable(void) +static __always_inline void 
arch_local_irq_enable(void) { native_irq_enable(); } @@ -115,7 +115,7 @@ static inline __cpuidle void halt(void) /* * For spinlocks, etc: */ -static inline notrace unsigned long arch_local_irq_save(void) +static __always_inline unsigned long arch_local_irq_save(void) { unsigned long flags = arch_local_save_flags(); arch_local_irq_disable(); @@ -159,12 +159,12 @@ static inline notrace unsigned long arch_local_irq_save(void) #endif /* CONFIG_PARAVIRT_XXL */ #ifndef __ASSEMBLY__ -static inline int arch_irqs_disabled_flags(unsigned long flags) +static __always_inline int arch_irqs_disabled_flags(unsigned long flags) { return !(flags & X86_EFLAGS_IF); } -static inline int arch_irqs_disabled(void) +static __always_inline int arch_irqs_disabled(void) { unsigned long flags = arch_local_save_flags(); From 4b281e541bba74bf9574335289484c577f41eaf7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2020 13:40:19 +0200 Subject: [PATCH 169/190] x86/entry: __always_inline arch_atomic_* for noinstr vmlinux.o: warning: objtool: rcu_dynticks_eqs_exit()+0x33: call to arch_atomic_and.constprop.0() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200603114052.070166551@infradead.org --- arch/x86/include/asm/atomic.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index a9ae5882607475..bf35e476a77613 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -205,13 +205,13 @@ static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int n } #define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg -static inline int arch_atomic_xchg(atomic_t *v, int new) +static __always_inline int arch_atomic_xchg(atomic_t *v, int new) { return arch_xchg(&v->counter, new); } #define arch_atomic_xchg arch_atomic_xchg -static inline void arch_atomic_and(int i, atomic_t *v) +static __always_inline void arch_atomic_and(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "andl %1,%0" : "+m" (v->counter) @@ -219,7 +219,7 @@ static inline void arch_atomic_and(int i, atomic_t *v) : "memory"); } -static inline int arch_atomic_fetch_and(int i, atomic_t *v) +static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v) { int val = arch_atomic_read(v); @@ -229,7 +229,7 @@ static inline int arch_atomic_fetch_and(int i, atomic_t *v) } #define arch_atomic_fetch_and arch_atomic_fetch_and -static inline void arch_atomic_or(int i, atomic_t *v) +static __always_inline void arch_atomic_or(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "orl %1,%0" : "+m" (v->counter) @@ -237,7 +237,7 @@ static inline void arch_atomic_or(int i, atomic_t *v) : "memory"); } -static inline int arch_atomic_fetch_or(int i, atomic_t *v) +static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v) { int val = arch_atomic_read(v); @@ -247,7 +247,7 @@ static inline int arch_atomic_fetch_or(int i, atomic_t *v) } #define arch_atomic_fetch_or arch_atomic_fetch_or -static inline void arch_atomic_xor(int i, atomic_t *v) +static __always_inline void arch_atomic_xor(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "xorl %1,%0" : "+m" (v->counter) @@ -255,7 +255,7 @@ static inline void arch_atomic_xor(int i, atomic_t *v) : "memory"); } -static inline int arch_atomic_fetch_xor(int i, atomic_t *v) +static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v) { int val = arch_atomic_read(v); From 5ef227933117085f1320b3421ef43a26bf624b4c Mon Sep 
17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2020 13:40:20 +0200 Subject: [PATCH 170/190] x86/entry: Re-order #DB handler to avoid *SAN instrumentation vmlinux.o: warning: objtool: exc_debug()+0xbb: call to clear_ti_thread_flag.constprop.0() leaves .noinstr.text section vmlinux.o: warning: objtool: noist_exc_debug()+0x55: call to clear_ti_thread_flag.constprop.0() leaves .noinstr.text section Rework things so that handle_debug() loses the noinstr annotation and move the clear_thread_flag() into it. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200603114052.127756554@infradead.org --- arch/x86/kernel/traps.c | 55 ++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5566fe50ef98fe..7febae381b911a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -775,26 +775,44 @@ static __always_inline void debug_exit(unsigned long dr7) * * May run on IST stack. */ -static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6, - bool user_icebp) +static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user) { struct task_struct *tsk = current; + bool user_icebp; int si_code; + /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." Clear TIF_BLOCKSTEP to keep + * TIF_BLOCKSTEP in sync with the hardware BTF flag. + */ + clear_thread_flag(TIF_BLOCKSTEP); + + /* + * If DR6 is zero, no point in trying to handle it. The kernel is + * not using INT1. + */ + if (!user && !dr6) + return; + + /* + * If dr6 has no reason to give us about the origin of this trap, + * then it's very likely the result of an icebp/int01 trap. + * User wants a sigtrap for that. + */ + user_icebp = user && !dr6; + /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; - instrumentation_begin(); #ifdef CONFIG_KPROBES if (kprobe_debug_handler(regs)) { - instrumentation_end(); return; } #endif if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0, SIGTRAP) == NOTIFY_STOP) { - instrumentation_end(); return; } @@ -825,7 +843,6 @@ static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6, out: cond_local_irq_disable(regs); - instrumentation_end(); } static __always_inline void exc_debug_kernel(struct pt_regs *regs, @@ -834,14 +851,6 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, nmi_enter(); instrumentation_begin(); trace_hardirqs_off_finish(); - instrumentation_end(); - - /* - * The SDM says "The processor clears the BTF flag when it - * generates a debug exception." Clear TIF_BLOCKSTEP to keep - * TIF_BLOCKSTEP in sync with the hardware BTF flag. - */ - clear_thread_flag(TIF_BLOCKSTEP); /* * Catch SYSENTER with TF set and clear DR_STEP. If this hit a @@ -850,14 +859,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; - /* - * If DR6 is zero, no point in trying to handle it. The kernel is - * not using INT1.
- */ - if (dr6) - handle_debug(regs, dr6, false); + handle_debug(regs, dr6, false); - instrumentation_begin(); if (regs->flags & X86_EFLAGS_IF) trace_hardirqs_on_prepare(); instrumentation_end(); @@ -868,14 +871,10 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { idtentry_enter_user(regs); - clear_thread_flag(TIF_BLOCKSTEP); + instrumentation_begin(); - /* - * If dr6 has no reason to give us about the origin of this trap, - * then it's very likely the result of an icebp/int01 trap. - * User wants a sigtrap for that. - */ - handle_debug(regs, dr6, !dr6); + handle_debug(regs, dr6, true); + instrumentation_end(); idtentry_exit_user(regs); } From 6eebad1ad303db360ebe3e51c2b9656c3d407157 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2020 13:40:21 +0200 Subject: [PATCH 171/190] lockdep: __always_inline more for noinstr vmlinux.o: warning: objtool: debug_locks_off()+0xd: call to __debug_locks_off() leaves .noinstr.text section vmlinux.o: warning: objtool: match_held_lock()+0x6a: call to look_up_lock_class.isra.0() leaves .noinstr.text section vmlinux.o: warning: objtool: lock_is_held_type()+0x90: call to lockdep_recursion_finish() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200603114052.185201076@infradead.org --- include/linux/debug_locks.h | 2 +- kernel/locking/lockdep.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 257ab3c92cb8ad..e7e45f0cc7da51 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -12,7 +12,7 @@ extern int debug_locks __read_mostly; extern int debug_locks_silent __read_mostly; -static inline int __debug_locks_off(void) +static __always_inline int __debug_locks_off(void) { return xchg(&debug_locks, 0); } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 38cce34d03dc38..29a8de4c50b90e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -393,7 +393,7 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } -static inline void lockdep_recursion_finish(void) +static __always_inline void lockdep_recursion_finish(void) { if (WARN_ON_ONCE(--current->lockdep_recursion)) current->lockdep_recursion = 0; @@ -801,7 +801,7 @@ static int count_matching_names(struct lock_class *new_class) } /* used from NMI context -- must be lockless */ -static inline struct lock_class * +static __always_inline struct lock_class * look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; From 2823e83a3dc0f54d23db67ca07d74b9c8bb1fdda Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2020 13:40:22 +0200 Subject: [PATCH 172/190] x86/entry: __always_inline CR2 for noinstr vmlinux.o: warning: objtool: exc_page_fault()+0x9: call to read_cr2() leaves .noinstr.text section vmlinux.o: warning: objtool: exc_page_fault()+0x24: call to prefetchw() leaves .noinstr.text section vmlinux.o: warning: objtool: exc_page_fault()+0x21: call to kvm_handle_async_pf.isra.0() leaves .noinstr.text section vmlinux.o: warning: objtool: exc_nmi()+0x1cc: call to write_cr2() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200603114052.243227806@infradead.org --- arch/x86/include/asm/kvm_para.h | 2 +- arch/x86/include/asm/processor.h | 2 +- 
arch/x86/include/asm/special_insns.h | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 57fd1966c4ea9a..49d3a9edb06f64 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -141,7 +141,7 @@ static inline void kvm_disable_steal_time(void) return; } -static inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) +static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) { return false; } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 29ee0c088009d0..42cd333616c487 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -823,7 +823,7 @@ static inline void prefetch(const void *x) * Useful for spinlocks to avoid one state transition in the * cache coherency protocol: */ -static inline void prefetchw(const void *x) +static __always_inline void prefetchw(const void *x) { alternative_input(BASE_PREFETCH, "prefetchw %P1", X86_FEATURE_3DNOWPREFETCH, diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 82436cb04ccf5f..eb8e781c435394 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -28,14 +28,14 @@ static inline unsigned long native_read_cr0(void) return val; } -static inline unsigned long native_read_cr2(void) +static __always_inline unsigned long native_read_cr2(void) { unsigned long val; asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order)); return val; } -static inline void native_write_cr2(unsigned long val) +static __always_inline void native_write_cr2(unsigned long val) { asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); } @@ -160,12 +160,12 @@ static inline void write_cr0(unsigned long x) native_write_cr0(x); } -static inline unsigned long read_cr2(void) +static __always_inline unsigned long read_cr2(void) { return native_read_cr2(); } -static inline void write_cr2(unsigned long x) +static __always_inline void write_cr2(unsigned long x) { native_write_cr2(x); } From f0178fc01fe46bab6a95415f5647d1a74efcad1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Jun 2020 08:37:01 +0200 Subject: [PATCH 173/190] x86/entry: Unbreak __irqentry_text_start/end magic The entry rework moved interrupt entry code from the irqentry to the noinstr section which made the irqentry section empty. This breaks boundary checks which rely on the __irqentry_text_start/end markers to find out whether a function in a stack trace is interrupt/exception entry code. This affects the function graph tracer and filter_irq_stacks(). As the IDT entry points are all sequentially emitted, this is rather simple to unbreak by injecting __irqentry_text_start/end as global labels.
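For reference, the boundary checks in question are plain address-range comparisons against those labels. A minimal sketch (the helper name is made up; filter_irq_stacks() and the function graph tracer open-code the equivalent):

extern char __irqentry_text_start[], __irqentry_text_end[];

static bool in_irqentry_text(unsigned long ip)
{
	/* True if ip falls inside the emitted IDT entry points */
	return ip >= (unsigned long)__irqentry_text_start &&
	       ip <  (unsigned long)__irqentry_text_end;
}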
To make this work correctly: - Remove the IRQENTRY_TEXT section from the x86 linker script - Define __irqentry so it breaks the build if it's used - Adjust the entry mirroring in PTI - Remove the redundant kprobes and unwinder bound checks Reported-by: Qian Cai Signed-off-by: Thomas Gleixner --- arch/x86/entry/entry_32.S | 11 ++++++++++- arch/x86/entry/entry_64.S | 11 ++++++++++- arch/x86/include/asm/irq.h | 7 +++++++ arch/x86/kernel/kprobes/core.c | 7 ------- arch/x86/kernel/kprobes/opt.c | 4 +--- arch/x86/kernel/unwind_frame.c | 8 +------- arch/x86/kernel/vmlinux.lds.S | 1 - arch/x86/mm/pti.c | 4 ++-- include/linux/interrupt.h | 8 +++++--- 9 files changed, 36 insertions(+), 25 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2d29f77a360122..024d7d276cd40b 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -743,10 +743,19 @@ SYM_CODE_END(asm_\cfunc) /* * Include the defines which emit the idt entries which are shared - * shared between 32 and 64 bit. + * shared between 32 and 64 bit and emit the __irqentry_text_* markers + * so the stacktrace boundary checks work. */ + .align 16 + .globl __irqentry_text_start +__irqentry_text_start: + #include + .align 16 + .globl __irqentry_text_end +__irqentry_text_end: + /* * %eax: prev task * %edx: next task diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 8ecaeee5365383..d2a00c97e53f60 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -478,10 +478,19 @@ SYM_CODE_END(\asmsym) /* * Include the defines which emit the idt entries which are shared - * shared between 32 and 64 bit. + * shared between 32 and 64 bit and emit the __irqentry_text_* markers + * so the stacktrace boundary checks work. */ + .align 16 + .globl __irqentry_text_start +__irqentry_text_start: + #include + .align 16 + .globl __irqentry_text_end +__irqentry_text_end: + SYM_CODE_START_LOCAL(common_interrupt_return) SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index f73dd3f8b043f4..528c8a71fe7f72 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -11,6 +11,13 @@ #include #include +/* + * The irq entry code is in the noinstr section and the start/end of + * __irqentry_text is emitted via labels. Make the build fail if + * something moves a C function into the __irq_entry section. + */ +#define __irq_entry __invalid_section + static inline int irq_canonicalize(int irq) { return ((irq == 2) ? 9 : irq); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 85de8fa69b241c..3bafe1bd4dc702 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1073,13 +1073,6 @@ NOKPROBE_SYMBOL(kprobe_fault_handler); int __init arch_populate_kprobe_blacklist(void) { - int ret; - - ret = kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, - (unsigned long)__irqentry_text_end); - if (ret) - return ret; - return kprobe_add_area_blacklist((unsigned long)__entry_text_start, (unsigned long)__entry_text_end); } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 234f58e0fe8cc6..321c1995028504 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -286,9 +286,7 @@ static int can_optimize(unsigned long paddr) * stack handling and registers setup. 
*/ if (((paddr >= (unsigned long)__entry_text_start) && - (paddr < (unsigned long)__entry_text_end)) || - ((paddr >= (unsigned long)__irqentry_text_start) && - (paddr < (unsigned long)__irqentry_text_end))) + (paddr < (unsigned long)__entry_text_end))) return 0; /* Check there is enough space for a relative jump. */ diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 54226110bc7fd7..722a85f3b2dd09 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -74,13 +74,7 @@ static bool in_entry_code(unsigned long ip) { char *addr = (char *)ip; - if (addr >= __entry_text_start && addr < __entry_text_end) - return true; - - if (addr >= __irqentry_text_start && addr < __irqentry_text_end) - return true; - - return false; + return addr >= __entry_text_start && addr < __entry_text_end; } static inline unsigned long *last_frame(struct unwind_state *state) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1bf7e312361f4f..b4c6b6f35548af 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -134,7 +134,6 @@ SECTIONS KPROBES_TEXT ALIGN_ENTRY_TEXT_BEGIN ENTRY_TEXT - IRQENTRY_TEXT ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT *(.fixup) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index a3c6757a65c75f..a8a924b3c33589 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -492,12 +492,12 @@ static void __init pti_setup_espfix64(void) } /* - * Clone the populated PMDs of the entry and irqentry text and force it RO. + * Clone the populated PMDs of the entry text and force it RO. */ static void pti_clone_entry_text(void) { pti_clone_pgtable((unsigned long) __entry_text_start, - (unsigned long) __irqentry_text_end, + (unsigned long) __entry_text_end, PTI_CLONE_PMD); } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 80f637c3a6f38d..5db970b6615afd 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -760,8 +760,10 @@ extern int arch_early_irq_init(void); /* * We want to know which function is an entrypoint of a hardirq or a softirq. */ -#define __irq_entry __attribute__((__section__(".irqentry.text"))) -#define __softirq_entry \ - __attribute__((__section__(".softirqentry.text"))) +#ifndef __irq_entry +# define __irq_entry __attribute__((__section__(".irqentry.text"))) +#endif + +#define __softirq_entry __attribute__((__section__(".softirqentry.text"))) #endif From 17fae1294ad9d711b2c3dd0edef479d40c76a5e8 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 20 May 2020 09:35:46 -0700 Subject: [PATCH 174/190] x86/{mce,mm}: Unmap the entire page if the whole page is affected and poisoned An interesting thing happened when a guest Linux instance took a machine check. The VMM unmapped the bad page from guest physical space and passed the machine check to the guest. Linux took all the normal actions to offline the page from the process that was using it. But then guest Linux crashed because it said there was a second machine check inside the kernel with this stack trace: do_memory_failure set_mce_nospec set_memory_uc _set_memory_uc change_page_attr_set_clr cpa_flush clflush_cache_range_opt This was odd, because a CLFLUSH instruction shouldn't raise a machine check (it isn't consuming the data). Further investigation showed that the VMM had passed in another machine check because it appeared that the guest was accessing the bad page. The fix is to check the scope of the poison by checking the MCi_MISC register. If the entire page is affected, then unmap the page.
If only part of the page is affected, then mark the page as uncacheable. This assumes that VMMs will do the logical thing and pass in the "whole page scope" via the MCi_MISC register (since they unmapped the entire page). [ bp: Adjust to x86/entry changes. ] Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()") Reported-by: Jue Wang Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Jue Wang Cc: Link: https://lkml.kernel.org/r/20200520163546.GA7977@agluck-desk2.amr.corp.intel.com --- arch/x86/include/asm/set_memory.h | 19 +++++++++++++------ arch/x86/kernel/cpu/mce/core.c | 18 ++++++++++++++---- include/linux/sched.h | 4 +++- include/linux/set_memory.h | 2 +- 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index ec2c0a094b5da3..5948218f35c584 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -86,28 +86,35 @@ int set_direct_map_default_noflush(struct page *page); extern int kernel_set_to_readonly; #ifdef CONFIG_X86_64 -static inline int set_mce_nospec(unsigned long pfn) +/* + * Prevent speculative access to the page by either unmapping + * it (if we do not require access to any part of the page) or + * marking it uncacheable (if we want to try to retrieve data + * from non-poisoned lines in the page). + */ +static inline int set_mce_nospec(unsigned long pfn, bool unmap) { unsigned long decoy_addr; int rc; /* - * Mark the linear address as UC to make sure we don't log more - * errors because of speculative access to the page. * We would like to just call: - * set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1); + * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); * but doing that would radically increase the odds of a * speculative access to the poison page because we'd have * the virtual address of the kernel 1:1 mapping sitting * around in registers. * Instead we get tricky. We create a non-canonical address * that looks just like the one we want, but has bit 63 flipped. - * This relies on set_memory_uc() properly sanitizing any __pa() + * This relies on set_memory_XX() properly sanitizing any __pa() * results with __PHYSICAL_MASK or PTE_PFN_MASK. 
*/ decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); - rc = set_memory_uc(decoy_addr, 1); + if (unmap) + rc = set_memory_np(decoy_addr, 1); + else + rc = set_memory_uc(decoy_addr, 1); if (rc) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); return rc; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 30413325de22f1..ce9120c4f74094 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -520,6 +520,14 @@ bool mce_is_memory_error(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_memory_error); +static bool whole_page(struct mce *m) +{ + if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV)) + return true; + + return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT; +} + bool mce_is_correctable(struct mce *m) { if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED) @@ -573,7 +581,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, pfn = mce->addr >> PAGE_SHIFT; if (!memory_failure(pfn, 0)) { - set_mce_nospec(pfn); + set_mce_nospec(pfn, whole_page(mce)); mce->kflags |= MCE_HANDLED_UC; } @@ -1173,11 +1181,12 @@ static void kill_me_maybe(struct callback_head *cb) int flags = MF_ACTION_REQUIRED; pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); - if (!(p->mce_status & MCG_STATUS_RIPV)) + + if (!p->mce_ripv) flags |= MF_MUST_KILL; if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) { - set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); return; } @@ -1331,7 +1340,8 @@ void noinstr do_machine_check(struct pt_regs *regs) BUG_ON(!on_thread_stack() || !user_mode(regs)); current->mce_addr = m.addr; - current->mce_status = m.mcgstatus; + current->mce_ripv = !!(m.mcgstatus & MCG_STATUS_RIPV); + current->mce_whole_page = whole_page(&m); current->mce_kill_me.func = kill_me_maybe; if (kill_it) current->mce_kill_me.func = kill_me_now; diff --git a/include/linux/sched.h b/include/linux/sched.h index c5d96e3e7fff42..62c1de522fc5d3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1304,7 +1304,9 @@ struct task_struct { #ifdef CONFIG_X86_MCE u64 mce_addr; - u64 mce_status; + __u64 mce_ripv : 1, + mce_whole_page : 1, + __mce_reserved : 62; struct callback_head mce_kill_me; #endif diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 86281ac7c30574..860e0f843c12bf 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -26,7 +26,7 @@ static inline int set_direct_map_default_noflush(struct page *page) #endif #ifndef set_mce_nospec -static inline int set_mce_nospec(unsigned long pfn) +static inline int set_mce_nospec(unsigned long pfn, bool unmap) { return 0; } From 7ccddc4613db446dc3cbb69a3763ba60ec651d13 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 27 May 2020 11:28:08 -0700 Subject: [PATCH 175/190] x86/mce/dev-mcelog: Fix -Wstringop-truncation warning about strncpy() The kbuild test robot reported this warning: arch/x86/kernel/cpu/mce/dev-mcelog.c: In function 'dev_mcelog_init_device': arch/x86/kernel/cpu/mce/dev-mcelog.c:346:2: warning: 'strncpy' output \ truncated before terminating nul copying 12 bytes from a string of the \ same length [-Wstringop-truncation] This is accurate, but I don't care that the trailing NUL character isn't copied. The string being copied is just a magic number signature so that crash dump tools can be sure they are decoding the right blob of memory. Use memcpy() instead of strncpy(). 
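To illustrate the point, a reduced sketch (the struct is cut down to the one relevant field, and the 12-byte signature is assumed): the destination is a fixed-size magic field that is deliberately not NUL-terminated, so copying exactly sizeof() bytes with memcpy() expresses the intent and gives -Wstringop-truncation nothing to warn about:

#include <linux/string.h>

#define MCE_LOG_SIGNATURE "MACHINECHECK"	/* 12 bytes, no NUL stored */

struct mcelog_hdr {
	char signature[12];	/* magic bytes, deliberately not a C string */
};

static void set_signature(struct mcelog_hdr *hdr)
{
	/* was: strncpy(hdr->signature, MCE_LOG_SIGNATURE, sizeof(hdr->signature)); */
	memcpy(hdr->signature, MCE_LOG_SIGNATURE, sizeof(hdr->signature));
}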
Fixes: d8ecca4043f2 ("x86/mce/dev-mcelog: Dynamically allocate space for machine check records") Reported-by: kbuild test robot Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200527182808.27737-1-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/dev-mcelog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index a4fd5287f02f06..43c466020ed5b0 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -349,7 +349,7 @@ static __init int dev_mcelog_init_device(void) if (!mcelog) return -ENOMEM; - strncpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature)); + memcpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature)); mcelog->len = mce_log_len; mcelog->recordlen = sizeof(struct mce); From e881bfaf5a5f409390973e076333281465f2b0d9 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 11 Jun 2020 13:05:59 +1000 Subject: [PATCH 176/190] KVM: PPC: Fix nested guest RC bits update Before commit 6cdf30375f82 ("powerpc/kvm/book3s: Use kvm helpers to walk shadow or secondary table") we called __find_linux_pte() with a page table pointer from a kvm_nested_guest struct, but now we rely on kvmhv_find_nested(), which takes an L1 LPID and returns a kvm_nested_guest pointer; however, we pass an L0 LPID there and the L2 guest hangs. This fixes the LPID passed to kvmppc_hv_handle_set_rc(). Fixes: 6cdf30375f82 ("powerpc/kvm/book3s: Use kvm helpers to walk shadow or secondary table") Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200611030559.75257-1-aik@ozlabs.ru --- arch/powerpc/kvm/book3s_hv_nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 66c38ee37fd54d..75993f44519b22 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -1234,7 +1234,7 @@ static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu, /* Set the rc bit in the pte of the shadow_pgtable for the nest guest */ ret = kvmppc_hv_handle_set_rc(kvm, true, writing, - n_gpa, gp->shadow_lpid); + n_gpa, gp->l1_lpid); if (!ret) ret = -EINVAL; else From 15a416e8aaa758b5534f64a3972dae05275bc225 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 11 Jun 2020 20:26:38 -0700 Subject: [PATCH 177/190] x86/entry: Treat BUG/WARN as NMI-like entries BUG/WARN are cleverly optimized using UD2 to handle the BUG/WARN out of line in an exception fixup. But if BUG or WARN is issued in a funny RCU context, then the idtentry_enter...() path might helpfully WARN that the RCU context is invalid, which results in infinite recursion. Split the BUG/WARN handling into an nmi_enter()/nmi_exit() path in exc_invalid_op() to increase the chance to survive the experience.
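The WARN leg of that machinery is small enough to sketch (illustrative only; the actual handler in the diff below additionally brackets this in nmi_enter()/nmi_exit() and the tracing annotations). A WARN compiles to a two-byte ud2 plus a bug-table entry, and report_bug() tells the #UD handler whether to resume past it:

#include <linux/bug.h>	/* report_bug(), BUG_TRAP_TYPE_WARN, LEN_UD2 */

static bool try_handle_warn(struct pt_regs *regs)
{
	if (report_bug(regs->ip, regs) != BUG_TRAP_TYPE_WARN)
		return false;

	regs->ip += LEN_UD2;	/* skip the ud2 and carry on */
	return true;
}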
[ tglx: Make the declaration match the implementation ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/f8fe40e0088749734b4435b554f73eee53dcf7a8.1591932307.git.luto@kernel.org --- arch/x86/include/asm/idtentry.h | 2 +- arch/x86/kernel/traps.c | 64 +++++++++++++++++++-------------- arch/x86/mm/extable.c | 15 ++++++-- 3 files changed, 52 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index d203c541a65a4e..2fc6b0c583413b 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -543,7 +543,6 @@ SYM_CODE_END(spurious_entries_start) DECLARE_IDTENTRY(X86_TRAP_DE, exc_divide_error); DECLARE_IDTENTRY(X86_TRAP_OF, exc_overflow); DECLARE_IDTENTRY(X86_TRAP_BR, exc_bounds); -DECLARE_IDTENTRY(X86_TRAP_UD, exc_invalid_op); DECLARE_IDTENTRY(X86_TRAP_NM, exc_device_not_available); DECLARE_IDTENTRY(X86_TRAP_OLD_MF, exc_coproc_segment_overrun); DECLARE_IDTENTRY(X86_TRAP_SPURIOUS, exc_spurious_interrupt_bug); @@ -561,6 +560,7 @@ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP, exc_general_protection); DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC, exc_alignment_check); /* Raw exception entries which need extra work */ +DECLARE_IDTENTRY_RAW(X86_TRAP_UD, exc_invalid_op); DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7febae381b911a..af75109485c263 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -97,24 +97,6 @@ int is_valid_bugaddr(unsigned long addr) return ud == INSN_UD0 || ud == INSN_UD2; } -int fixup_bug(struct pt_regs *regs, int trapnr) -{ - if (trapnr != X86_TRAP_UD) - return 0; - - switch (report_bug(regs->ip, regs)) { - case BUG_TRAP_TYPE_NONE: - case BUG_TRAP_TYPE_BUG: - break; - - case BUG_TRAP_TYPE_WARN: - regs->ip += LEN_UD2; - return 1; - } - - return 0; -} - static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, struct pt_regs *regs, long error_code) @@ -190,13 +172,6 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, { RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - /* - * WARN*()s end up here; fix them up before we call the - * notifier chain. - */ - if (!user_mode(regs) && fixup_bug(regs, trapnr)) - return; - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { cond_local_irq_enable(regs); @@ -241,9 +216,46 @@ static inline void handle_invalid_op(struct pt_regs *regs) ILL_ILLOPN, error_get_trap_addr(regs)); } -DEFINE_IDTENTRY(exc_invalid_op) +DEFINE_IDTENTRY_RAW(exc_invalid_op) { + bool rcu_exit; + + /* + * Handle BUG/WARN like NMIs instead of like normal idtentries: + * if we bugged/warned in a bad RCU context, for example, the last + * thing we want is to BUG/WARN again in the idtentry code, ad + * infinitum. + */ + if (!user_mode(regs) && is_valid_bugaddr(regs->ip)) { + enum bug_trap_type type; + + nmi_enter(); + instrumentation_begin(); + trace_hardirqs_off_finish(); + type = report_bug(regs->ip, regs); + if (regs->flags & X86_EFLAGS_IF) + trace_hardirqs_on_prepare(); + instrumentation_end(); + nmi_exit(); + + if (type == BUG_TRAP_TYPE_WARN) { + /* Skip the ud2. */ + regs->ip += LEN_UD2; + return; + } + + /* + * Else, if this was a BUG and report_bug returns or if this + * was just a normal #UD, we want to continue onward and + * crash. 
+ */ + } + + rcu_exit = idtentry_enter_cond_rcu(regs); + instrumentation_begin(); handle_invalid_op(regs); + instrumentation_end(); + idtentry_exit_cond_rcu(regs, rcu_exit); } DEFINE_IDTENTRY(exc_coproc_segment_overrun) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index b991aa4bdfaee9..1d6cb07f4f86f5 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -204,8 +204,19 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) if (fixup_exception(regs, trapnr, regs->orig_ax, 0)) return; - if (fixup_bug(regs, trapnr)) - return; + if (trapnr == X86_TRAP_UD) { + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { + /* Skip the ud2. */ + regs->ip += LEN_UD2; + return; + } + + /* + * If this was a BUG and report_bug returns or if this + * was just a normal #UD, we want to continue onward and + * crash. + */ + } fail: early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n", From 71ed49d8fb33023f242419a77ecb1141c029cac4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 Jun 2020 14:02:27 +0200 Subject: [PATCH 178/190] x86/entry: Make NMI use IDTENTRY_RAW For no reason other than beginning brainmelt, IDTENTRY_NMI was mapped to IDTENTRY_IST. This is not a problem on 64bit because the IST default entry point maps to IDTENTRY_RAW, which does not do any entry handling. The surplus function declaration for the noist C entry point is unused and, as there is no ASM code emitted for NMI, this went unnoticed. On 32bit, IDTENTRY_IST maps to a regular IDTENTRY which does the normal entry handling. That is clearly the wrong thing to do for NMI. Map it to IDTENTRY_RAW to unbreak it. The IDTENTRY_NMI mapping needs to stay to avoid emitting ASM code. Fixes: 6271fef00b34 ("x86/entry: Convert NMI to IDTENTRY_NMI") Reported-by: Naresh Kamboju Debugged-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/CA+G9fYvF3cyrY+-iw_SZtpN-i2qA2BruHg4M=QYECU2-dNdsMw@mail.gmail.com --- arch/x86/include/asm/idtentry.h | 4 ++-- arch/x86/kernel/nmi.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 2fc6b0c583413b..cf51c50eb356dd 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -391,8 +391,8 @@ __visible noinstr void func(struct pt_regs *regs, \ #define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST -#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_IST -#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_IST +#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW +#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 3a98ff36f4115f..2de365f15684cd 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -476,7 +476,7 @@ static DEFINE_PER_CPU(enum nmi_states, nmi_state); static DEFINE_PER_CPU(unsigned long, nmi_cr2); static DEFINE_PER_CPU(unsigned long, nmi_dr7); -DEFINE_IDTENTRY_NMI(exc_nmi) +DEFINE_IDTENTRY_RAW(exc_nmi) { if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id())) return; From 0bf3924bfabd13ba21aa702344fc00b3b3263e5a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 Jun 2020 15:55:00 +0200 Subject: [PATCH 179/190] x86/entry: Force rcu_irq_enter() when in idle task The idea of conditionally calling into rcu_irq_enter() only when RCU is not watching turned out to be not completely
thought through. Paul noticed occasional premature end of grace periods in RCU torture testing. Bisection led to the commit which made the invocation of rcu_irq_enter() conditional on !rcu_is_watching(). It turned out that this conditional breaks RCU assumptions about the idle task when the scheduler tick happens to be a nested interrupt. Nested interrupts can happen when the first interrupt invokes softirq processing on return which enables interrupts. If that nested tick interrupt does not invoke rcu_irq_enter() then the RCU's irq-nesting checks will believe that this interrupt came directly from idle, which will cause RCU to report a quiescent state. Because this interrupt instead came from a softirq handler which might have been executing an RCU read-side critical section, this can cause the grace period to end prematurely. Change the condition from !rcu_is_watching() to is_idle_task(current) which enforces that interrupts in the idle task unconditionally invoke rcu_irq_enter() independent of the RCU state. This is also correct vs. user mode entries in NOHZ full scenarios because user mode entries bring RCU out of EQS and force the RCU irq nesting state accounting to nested. As only the first interrupt can enter from user mode a nested tick interrupt will enter from kernel mode and as the nesting state accounting is forced to nesting it will not do anything stupid even if rcu_irq_enter() has not been invoked. Fixes: 3eeec3858488 ("x86/entry: Provide idtentry_entry/exit_cond_rcu()") Reported-by: "Paul E. McKenney" Signed-off-by: Thomas Gleixner Tested-by: "Paul E. McKenney" Reviewed-by: "Paul E. McKenney" Acked-by: Andy Lutomirski Acked-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/87wo4cxubv.fsf@nanos.tec.linutronix.de --- arch/x86/entry/common.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index f4d57782c14bb4..bd3f14175193c3 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -557,14 +557,34 @@ bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) return false; } - if (!__rcu_is_watching()) { + /* + * If this entry hit the idle task invoke rcu_irq_enter() whether + * RCU is watching or not. + * + * Interrupts can nest when the first interrupt invokes softirq + * processing on return which enables interrupts. + * + * Scheduler ticks in the idle task can mark quiescent state and + * terminate a grace period, if and only if the timer interrupt is + * not nested into another interrupt. + * + * Checking for __rcu_is_watching() here would prevent the nesting + * interrupt from invoking rcu_irq_enter(). If that nested interrupt is + * the tick then rcu_flavor_sched_clock_irq() would wrongfully + * assume that it is the first interrupt and eventually claim + * quiescent state and end grace periods prematurely. + * + * Unconditionally invoke rcu_irq_enter() so RCU state stays + * consistent. + * + * TINY_RCU does not support EQS, so let the compiler eliminate + * this part when enabled. + */ + if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { /* * If RCU is not watching then the same careful * sequence vs. lockdep and tracing is required * as in enter_from_user_mode(). - * - * This only happens for IRQs that hit the idle - * loop, i.e. if idle is not using MWAIT.
*/ lockdep_hardirqs_off(CALLER_ADDR0); rcu_irq_enter(); @@ -576,9 +596,10 @@ bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) } /* - * If RCU is watching then RCU only wants to check - * whether it needs to restart the tick in NOHZ - * mode. + * If RCU is watching then RCU only wants to check whether it needs + * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick() + * already contains a warning when RCU is not watching, so no point + * in having another one here. */ instrumentation_begin(); rcu_irq_enter_check_tick(); From 8b3ebda6d81af05b4827f18086d27bee9000f67c Mon Sep 17 00:00:00 2001 From: "Enrico Weigelt, metux IT consult" Date: Mon, 11 Mar 2019 14:42:08 +0100 Subject: [PATCH 180/190] alpha: Kconfig: pedantic formatting Formatting of Kconfig files doesn't look so pretty, so let the Great White Handkerchief come around and clean it up. Signed-off-by: Enrico Weigelt, metux IT consult Signed-off-by: Matt Turner --- arch/alpha/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index ef179033a7c214..48f6e22b644799 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -545,7 +545,7 @@ config NR_CPUS default "4" if !ALPHA_GENERIC && !ALPHA_MARVEL help MARVEL support can handle a maximum of 32 CPUs, all the others - with working support have a maximum of 4 CPUs. + with working support have a maximum of 4 CPUs. config ARCH_DISCONTIGMEM_ENABLE bool "Discontiguous Memory Support" @@ -657,7 +657,7 @@ choice endchoice config HZ - int + int default 32 if HZ_32 default 64 if HZ_64 default 128 if HZ_128 From 5bea3044a74acb3d780288c0fecd0eb25bdda24d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 5 Apr 2019 07:16:31 -0400 Subject: [PATCH 181/190] alpha: fix rtc port ranges Alpha incorrectly reports "0070-0080 : rtc" in /proc/ioports. Fix this, so that it is "0070-007f". Signed-off-by: Mikulas Patocka Signed-off-by: Matt Turner --- arch/alpha/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index f5c42a8fcf9c83..c3934ef703d8fa 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -253,7 +253,7 @@ reserve_std_resources(void) /* Fix up for the Jensen's queer RTC placement. */ standard_io_resources[0].start = RTC_PORT(0); - standard_io_resources[0].end = RTC_PORT(0) + 0x10; + standard_io_resources[0].end = RTC_PORT(0) + 0x0f; for (i = 0; i < ARRAY_SIZE(standard_io_resources); ++i) request_resource(io, standard_io_resources+i); From 5f14596e55de458987ee38043019b3d5cd636af1 Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Tue, 30 Jul 2019 11:02:39 +0800 Subject: [PATCH 182/190] alpha: Replace strncmp with str_has_prefix In commit b6b2735514bc ("tracing: Use str_has_prefix() instead of using fixed sizes") the newly introduced str_has_prefix() was used to replace error-prone strncmp(str, const, len). Here, fix code with the same pattern.
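The appeal, sketched below, is that str_has_prefix() derives the length from the prefix itself, so the hand-maintained byte count in strncmp(str, const, len) goes away; it returns the prefix length on a match and 0 otherwise, hence the negation in boolean contexts:

#include <linux/string.h>

/* Sketch only: this mirrors the shape of the SRM/MILO detection below. */
static bool booted_from_milo(const char *ssn)
{
	/* old form: strncmp(ssn, "MILO", 4) == 0, where the 4 must track "MILO" */
	return str_has_prefix(ssn, "MILO");
}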
Signed-off-by: Chuhong Yuan Signed-off-by: Matt Turner --- arch/alpha/boot/tools/objstrip.c | 2 +- arch/alpha/kernel/setup.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/alpha/boot/tools/objstrip.c b/arch/alpha/boot/tools/objstrip.c index 825a16f5f62246..08b430d25a315f 100644 --- a/arch/alpha/boot/tools/objstrip.c +++ b/arch/alpha/boot/tools/objstrip.c @@ -148,7 +148,7 @@ main (int argc, char *argv[]) #ifdef __ELF__ elf = (struct elfhdr *) buf; - if (elf->e_ident[0] == 0x7f && strncmp((char *)elf->e_ident + 1, "ELF", 3) == 0) { + if (elf->e_ident[0] == 0x7f && str_has_prefix((char *)elf->e_ident + 1, "ELF")) { if (elf->e_type != ET_EXEC) { fprintf(stderr, "%s: %s is not an ELF executable\n", prog_name, inname); diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index c3934ef703d8fa..d1fba112bdb4f5 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -474,7 +474,7 @@ setup_arch(char **cmdline_p) #ifndef alpha_using_srm /* Assume that we've booted from SRM if we haven't booted from MILO. Detect the later by looking for "MILO" in the system serial nr. */ - alpha_using_srm = strncmp((const char *)hwrpb->ssn, "MILO", 4) != 0; + alpha_using_srm = !str_has_prefix((const char *)hwrpb->ssn, "MILO"); #endif #ifndef alpha_using_qemu /* Similarly, look for QEMU. */ From a466a5cfbb5665a0595cbd4cbe05140d57346990 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Tue, 28 Apr 2020 14:32:35 +0800 Subject: [PATCH 183/190] alpha: remove unneeded semicolon in osf_sys.c Fix the following coccicheck warning: arch/alpha/kernel/osf_sys.c:680:2-3: Unneeded semicolon Signed-off-by: Jason Yan Signed-off-by: Matt Turner --- arch/alpha/kernel/osf_sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 94e4cde8071ac4..d5367a1c6300c1 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -677,7 +677,7 @@ SYSCALL_DEFINE2(osf_proplist_syscall, enum pl_code, code, default: error = -EOPNOTSUPP; break; - }; + } return error; } From c0ebf71506f11c5f66f3b47fb27f8c7d5e176baa Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Tue, 28 Apr 2020 14:32:25 +0800 Subject: [PATCH 184/190] alpha: remove unneeded semicolon in sys_eiger.c Fix the following coccicheck warning: arch/alpha/kernel/sys_eiger.c:179:2-3: Unneeded semicolon Signed-off-by: Jason Yan Signed-off-by: Matt Turner --- arch/alpha/kernel/sys_eiger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/alpha/kernel/sys_eiger.c b/arch/alpha/kernel/sys_eiger.c index bf99dcfd40c4e3..aea8a54da4bcc7 100644 --- a/arch/alpha/kernel/sys_eiger.c +++ b/arch/alpha/kernel/sys_eiger.c @@ -175,7 +175,7 @@ eiger_swizzle(struct pci_dev *dev, u8 *pinp) case 0x03: bridge_count = 2; break; /* 2 */ case 0x07: bridge_count = 3; break; /* 3 */ case 0x0f: bridge_count = 4; break; /* 4 */ - }; + } slot = PCI_SLOT(dev->devfn); while (dev->bus->self) { From 54505a1e2083fc54cbe8779b97479f969cd30a00 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 26 May 2020 10:47:49 -0400 Subject: [PATCH 185/190] alpha: fix memory barriers so that they conform to the specification The commits cd0e00c10672 and 92d7223a7423 broke boot on the Alpha Avanti platform. Those patches moved the memory barriers that used to follow a write to before the write. The result is that an iowrite followed by an ioread has no barrier between them.
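The failing shape, as a sketch against a hypothetical ioremap()ed register pair rather than code from an affected driver:

#include <linux/io.h>

static u32 kick_and_poll(void __iomem *regs)
{
	writel(1, regs + 0x00);		/* barrier was moved to before this write */
	return readl(regs + 0x04);	/* nothing orders this read after the write */
}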
The Alpha architecture allows reordering of the accesses to the I/O space, and the missing barrier between write and read causes a hang with the serial port and real-time clock. This patch makes barriers conform to the specification. 1. We add mb() before readX_relaxed and writeX_relaxed - memory-barriers.txt claims that these functions must be ordered w.r.t. each other. Alpha doesn't order them, so we need an explicit barrier. 2. We add mb() before reads from the I/O space - so that if there's a write followed by a read, there should be a barrier between them. Signed-off-by: Mikulas Patocka Fixes: cd0e00c10672 ("alpha: io: reorder barriers to guarantee writeX() and iowriteX() ordering") Fixes: 92d7223a7423 ("alpha: io: reorder barriers to guarantee writeX() and iowriteX() ordering #2") Cc: stable@vger.kernel.org # v4.17+ Acked-by: Ivan Kokshaysky Reviewed-by: Maciej W. Rozycki Signed-off-by: Matt Turner --- arch/alpha/include/asm/io.h | 74 +++++++++++++++++++++++++++++-------- arch/alpha/kernel/io.c | 60 ++++++++++++++++++++++++++---- 2 files changed, 112 insertions(+), 22 deletions(-) diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index 13bea465f1c063..a4d0c19f1e796a 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -309,14 +309,18 @@ static inline int __is_mmio(const volatile void __iomem *addr) #if IO_CONCAT(__IO_PREFIX,trivial_io_bw) extern inline unsigned int ioread8(void __iomem *addr) { - unsigned int ret = IO_CONCAT(__IO_PREFIX,ioread8)(addr); + unsigned int ret; + mb(); + ret = IO_CONCAT(__IO_PREFIX,ioread8)(addr); mb(); return ret; } extern inline unsigned int ioread16(void __iomem *addr) { - unsigned int ret = IO_CONCAT(__IO_PREFIX,ioread16)(addr); + unsigned int ret; + mb(); + ret = IO_CONCAT(__IO_PREFIX,ioread16)(addr); mb(); return ret; } @@ -357,7 +361,9 @@ extern inline void outw(u16 b, unsigned long port) #if IO_CONCAT(__IO_PREFIX,trivial_io_lq) extern inline unsigned int ioread32(void __iomem *addr) { - unsigned int ret = IO_CONCAT(__IO_PREFIX,ioread32)(addr); + unsigned int ret; + mb(); + ret = IO_CONCAT(__IO_PREFIX,ioread32)(addr); mb(); return ret; } @@ -402,14 +408,18 @@ extern inline void __raw_writew(u16 b, volatile void __iomem *addr) extern inline u8 readb(const volatile void __iomem *addr) { - u8 ret = __raw_readb(addr); + u8 ret; + mb(); + ret = __raw_readb(addr); mb(); return ret; } extern inline u16 readw(const volatile void __iomem *addr) { - u16 ret = __raw_readw(addr); + u16 ret; + mb(); + ret = __raw_readw(addr); mb(); return ret; } @@ -450,14 +460,18 @@ extern inline void __raw_writeq(u64 b, volatile void __iomem *addr) extern inline u32 readl(const volatile void __iomem *addr) { - u32 ret = __raw_readl(addr); + u32 ret; + mb(); + ret = __raw_readl(addr); mb(); return ret; } extern inline u64 readq(const volatile void __iomem *addr) { - u64 ret = __raw_readq(addr); + u64 ret; + mb(); + ret = __raw_readq(addr); mb(); return ret; } @@ -486,14 +500,44 @@ extern inline void writeq(u64 b, volatile void __iomem *addr) #define outb_p outb #define outw_p outw #define outl_p outl -#define readb_relaxed(addr) __raw_readb(addr) -#define readw_relaxed(addr) __raw_readw(addr) -#define readl_relaxed(addr) __raw_readl(addr) -#define readq_relaxed(addr) __raw_readq(addr) -#define writeb_relaxed(b, addr) __raw_writeb(b, addr) -#define writew_relaxed(b, addr) __raw_writew(b, addr) -#define writel_relaxed(b, addr) __raw_writel(b, addr) -#define writeq_relaxed(b, addr) __raw_writeq(b, addr) + +extern u8 readb_relaxed(const
volatile void __iomem *addr); +extern u16 readw_relaxed(const volatile void __iomem *addr); +extern u32 readl_relaxed(const volatile void __iomem *addr); +extern u64 readq_relaxed(const volatile void __iomem *addr); + +#if IO_CONCAT(__IO_PREFIX,trivial_io_bw) +extern inline u8 readb_relaxed(const volatile void __iomem *addr) +{ + mb(); + return __raw_readb(addr); +} + +extern inline u16 readw_relaxed(const volatile void __iomem *addr) +{ + mb(); + return __raw_readw(addr); +} +#endif + +#if IO_CONCAT(__IO_PREFIX,trivial_io_lq) +extern inline u32 readl_relaxed(const volatile void __iomem *addr) +{ + mb(); + return __raw_readl(addr); +} + +extern inline u64 readq_relaxed(const volatile void __iomem *addr) +{ + mb(); + return __raw_readq(addr); +} +#endif + +#define writeb_relaxed writeb +#define writew_relaxed writew +#define writel_relaxed writel +#define writeq_relaxed writeq /* * String version of IO memory access ops: diff --git a/arch/alpha/kernel/io.c b/arch/alpha/kernel/io.c index c025a3e5e3578b..938de13adfbfe4 100644 --- a/arch/alpha/kernel/io.c +++ b/arch/alpha/kernel/io.c @@ -16,21 +16,27 @@ unsigned int ioread8(void __iomem *addr) { - unsigned int ret = IO_CONCAT(__IO_PREFIX,ioread8)(addr); + unsigned int ret; + mb(); + ret = IO_CONCAT(__IO_PREFIX,ioread8)(addr); mb(); return ret; } unsigned int ioread16(void __iomem *addr) { - unsigned int ret = IO_CONCAT(__IO_PREFIX,ioread16)(addr); + unsigned int ret; + mb(); + ret = IO_CONCAT(__IO_PREFIX,ioread16)(addr); mb(); return ret; } unsigned int ioread32(void __iomem *addr) { - unsigned int ret = IO_CONCAT(__IO_PREFIX,ioread32)(addr); + unsigned int ret; + mb(); + ret = IO_CONCAT(__IO_PREFIX,ioread32)(addr); mb(); return ret; } @@ -148,28 +154,36 @@ EXPORT_SYMBOL(__raw_writeq); u8 readb(const volatile void __iomem *addr) { - u8 ret = __raw_readb(addr); + u8 ret; + mb(); + ret = __raw_readb(addr); mb(); return ret; } u16 readw(const volatile void __iomem *addr) { - u16 ret = __raw_readw(addr); + u16 ret; + mb(); + ret = __raw_readw(addr); mb(); return ret; } u32 readl(const volatile void __iomem *addr) { - u32 ret = __raw_readl(addr); + u32 ret; + mb(); + ret = __raw_readl(addr); mb(); return ret; } u64 readq(const volatile void __iomem *addr) { - u64 ret = __raw_readq(addr); + u64 ret; + mb(); + ret = __raw_readq(addr); mb(); return ret; } @@ -207,6 +221,38 @@ EXPORT_SYMBOL(writew); EXPORT_SYMBOL(writel); EXPORT_SYMBOL(writeq); +/* + * The _relaxed functions must be ordered w.r.t. each other, but they don't + * have to be ordered w.r.t. other memory accesses. + */ +u8 readb_relaxed(const volatile void __iomem *addr) +{ + mb(); + return __raw_readb(addr); +} + +u16 readw_relaxed(const volatile void __iomem *addr) +{ + mb(); + return __raw_readw(addr); +} + +u32 readl_relaxed(const volatile void __iomem *addr) +{ + mb(); + return __raw_readl(addr); +} + +u64 readq_relaxed(const volatile void __iomem *addr) +{ + mb(); + return __raw_readq(addr); +} + +EXPORT_SYMBOL(readb_relaxed); +EXPORT_SYMBOL(readw_relaxed); +EXPORT_SYMBOL(readl_relaxed); +EXPORT_SYMBOL(readq_relaxed); /* * Read COUNT 8-bit bytes from port PORT into memory starting at SRC. From e66dd01e33bdea1c580bf037feec39aae6946ade Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Wed, 3 Jun 2020 02:31:59 +0000 Subject: [PATCH 186/190] alpha: Replace sg++ with sg = sg_next(sg) Replace sg++ with sg = sg_next(sg). 
Signed-off-by: Xu Wang Signed-off-by: Matt Turner --- arch/alpha/kernel/pci_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index 7f1925a32c992c..81037907268d5c 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -638,7 +638,7 @@ sg_fill(struct device *dev, struct scatterlist *leader, struct scatterlist *end, while (sg+1 < end && (int) sg[1].dma_address == -1) { size += sg[1].length; - sg++; + sg = sg_next(sg); } npages = iommu_num_pages(paddr, size, PAGE_SIZE); From 7812193ca88bcb6c7bddea61c8797e2d5a6df5bd Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Wed, 10 Jun 2020 16:59:30 -0700 Subject: [PATCH 187/190] alpha: c_next should increase position index Signed-off-by: Matt Turner --- arch/alpha/kernel/setup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index d1fba112bdb4f5..e6d8aad15b22cf 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -1420,6 +1420,7 @@ c_start(struct seq_file *f, loff_t *pos) static void * c_next(struct seq_file *f, void *v, loff_t *pos) { + (*pos)++; return NULL; } From 777747f634ba765085373f851e9c48dccb12ad52 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 11 Jun 2020 11:11:39 +0200 Subject: [PATCH 188/190] alpha: Fix build around srm_sysrq_reboot_op The patch introducing the struct was probably never compile tested, because it sets a handler with a wrong function signature. Wrap the handler in a function with the correct signature to fix the build. Fixes: 0f1c9688a194 ("tty/sysrq: alpha: export and use __sysrq_get_key_op()") Cc: Emil Velikov Reviewed-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Joerg Roedel Signed-off-by: Matt Turner --- arch/alpha/kernel/setup.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index e6d8aad15b22cf..916e42d74a8695 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -430,8 +430,13 @@ register_cpus(void) arch_initcall(register_cpus); #ifdef CONFIG_MAGIC_SYSRQ +static void sysrq_reboot_handler(int unused) +{ + machine_halt(); +} + static const struct sysrq_key_op srm_sysrq_reboot_op = { - .handler = machine_halt, + .handler = sysrq_reboot_handler, .help_msg = "reboot(b)", .action_msg = "Resetting", .enable_mask = SYSRQ_ENABLE_BOOT, From 39c3e304567a013ac096ca8747fe53b44a76e44b Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Tue, 9 Jun 2020 03:28:14 +0100 Subject: [PATCH 189/190] ARM: 8984/1: Kconfig: set default ZBOOT_ROM_TEXT/BSS value to 0x0 ZBOOT_ROM_TEXT and ZBOOT_ROM_BSS are defined as 'hex' but had a default of "0". Kconfig will helpfully expand a text entry of 0 to 0x0 but because this is not the same as the default value it was treated as being explicitly set when running 'make savedefconfig' so most arm defconfigs have CONFIG_ZBOOT_ROM_TEXT=0x0 and CONFIG_ZBOOT_ROM_BSS=0x0. Change the default to 0x0 which will mean next time the defconfigs are re-generated the spurious config entries will be removed. Signed-off-by: Chris Packham Signed-off-by: Russell King --- arch/arm/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index bbac2867062d87..8789e0e96787dd 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1749,7 +1749,7 @@ config DEPRECATED_PARAM_STRUCT # TEXT and BSS so we preserve their values in the config files.
config ZBOOT_ROM_TEXT hex "Compressed ROM boot loader base address" - default "0" + default 0x0 help The physical address at which the ROM-able zImage is to be placed in the target. Platforms which normally make use of @@ -1760,7 +1760,7 @@ config ZBOOT_ROM_TEXT config ZBOOT_ROM_BSS hex "Compressed ROM boot loader BSS address" - default "0" + default 0x0 help The base address of an area of read/write memory in the target for the ROM-able zImage which must be available while the From db227c19e68db353e4cc6c99b6bc86bb24736943 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 12 Jun 2020 11:21:35 +0100 Subject: [PATCH 190/190] ARM: 8985/1: efi/decompressor: deal with HYP mode boot gracefully EFI on ARM only supports short descriptors, and given that it mandates that the MMU and caches are on, it is implied that booting in HYP mode is not supported. However, implementations of EFI exist (e.g., U-Boot) that ignore this requirement, which is not entirely unreasonable, given that it makes HYP mode inaccessible to the operating system. So let's make sure that we can deal with this condition gracefully. We already tolerate booting the EFI stub with the caches off (even though this violates the EFI spec as well), and so we should deal with HYP mode boot with MMU and caches either on or off. - When the MMU and caches are on, we can ignore the HYP stub altogether, since we can carry on executing at HYP. We do need to ensure that we disable the MMU at HYP before entering the kernel proper. - When the MMU and caches are off, we have to drop to SVC mode so that we can set up the page tables using short descriptors. In this case, we need to install the HYP stub as usual, so that we can return to HYP mode before handing over to the kernel proper. Tested-by: Heinrich Schuchardt Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King --- arch/arm/boot/compressed/head.S | 62 +++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index c79db44ba1284b..434a16982e344f 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -1410,7 +1410,11 @@ memdump: mov r12, r0 __hyp_reentry_vectors: W(b) . @ reset W(b) . @ undef +#ifdef CONFIG_EFI_STUB + W(b) __enter_kernel_from_hyp @ hvc from HYP +#else W(b) . @ svc +#endif W(b) . @ pabort W(b) . @ dabort W(b) __enter_kernel @ hyp @@ -1429,14 +1433,72 @@ __enter_kernel: reloc_code_end: #ifdef CONFIG_EFI_STUB +__enter_kernel_from_hyp: + mrc p15, 4, r0, c1, c0, 0 @ read HSCTLR + bic r0, r0, #0x5 @ disable MMU and caches + mcr p15, 4, r0, c1, c0, 0 @ write HSCTLR + isb + b __enter_kernel + ENTRY(efi_enter_kernel) mov r4, r0 @ preserve image base mov r8, r1 @ preserve DT pointer + ARM( adrl r0, call_cache_fn ) + THUMB( adr r0, call_cache_fn ) + adr r1, 0f @ clean the region of code we + bl cache_clean_flush @ may run with the MMU off + +#ifdef CONFIG_ARM_VIRT_EXT + @ + @ The EFI spec does not support booting on ARM in HYP mode, + @ since it mandates that the MMU and caches are on, with all + @ 32-bit addressable DRAM mapped 1:1 using short descriptors. + @ + @ While the EDK2 reference implementation adheres to this, + @ U-Boot might decide to enter the EFI stub in HYP mode + @ anyway, with the MMU and caches either on or off. + @ + mrs r0, cpsr @ get the current mode + msr spsr_cxsf, r0 @ record boot mode + and r0, r0, #MODE_MASK @ are we running in HYP mode?
+ cmp r0, #HYP_MODE + bne .Lefi_svc + + mrc p15, 4, r1, c1, c0, 0 @ read HSCTLR + tst r1, #0x1 @ MMU enabled at HYP? + beq 1f + + @ + @ When running in HYP mode with the caches on, we're better + @ off just carrying on using the cached 1:1 mapping that the + @ firmware provided. Set up the HYP vectors so HVC instructions + @ issued from HYP mode take us to the correct handler code. We + @ will disable the MMU before jumping to the kernel proper. + @ + adr r0, __hyp_reentry_vectors + mcr p15, 4, r0, c12, c0, 0 @ set HYP vector base (HVBAR) + isb + b .Lefi_hyp + + @ + @ When running in HYP mode with the caches off, we need to drop + @ into SVC mode now, and let the decompressor set up its cached + @ 1:1 mapping as usual. + @ +1: mov r9, r4 @ preserve image base + bl __hyp_stub_install @ install HYP stub vectors + safe_svcmode_maskall r1 @ drop to SVC mode + msr spsr_cxsf, r0 @ record boot mode + orr r4, r9, #1 @ restore image base and set LSB + b .Lefi_hyp +.Lefi_svc: +#endif mrc p15, 0, r0, c1, c0, 0 @ read SCTLR tst r0, #0x1 @ MMU enabled? orreq r4, r4, #1 @ set LSB if not +.Lefi_hyp: mov r0, r8 @ DT start add r1, r8, r2 @ DT end bl cache_clean_flush