From 9e85adc66a520b1c8b4c72ad5c50587d64033137 Mon Sep 17 00:00:00 2001 From: Steve Beattie Date: Tue, 10 May 2016 12:44:04 +0100 Subject: [PATCH 1/9] UBUNTU: SAUCE: (no-up) disable -pie when gcc has it enabled by default In Ubuntu 16.10, gcc's defaults have been set to build Position Independent Executables (PIE) on amd64 and ppc64le (gcc was configured this way for s390x in Ubuntu 16.04 LTS). This breaks the kernel build on amd64. The following patch disables pie for x86 builds (though not yet verified to work with gcc configured to build PIE by default i386 -- we're not planning to enable it for that architecture). The intent is for this patch to go upstream after expanding it to additional architectures where needed, but I wanted to ensure that we could build 16.10 kernels first. I've successfully built kernels and booted them with this patch applied using the 16.10 compiler. Patch is against yakkety.git, but also applies with minor movement (no fuzz) against current linus.git. Signed-off-by: Steve Beattie [apw@canonical.com: shifted up so works in arch/ Acked-by: Tim Gardner Acked-by: Stefan Bader Signed-off-by: Kamal Mostafa Signed-off-by: Andy Whitcroft Signed-off-by: Laurent Dufour --- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index 80b8671d5c46f7..714163aed0b633 100644 --- a/Makefile +++ b/Makefile @@ -612,6 +612,12 @@ endif # $(dot-config) # Defaults to vmlinux, but the arch makefile usually adds further targets all: vmlinux +# force no-pie for distro compilers that enable pie by default +KBUILD_CFLAGS += $(call cc-option, -fno-pie) +KBUILD_CFLAGS += $(call cc-option, -no-pie) +KBUILD_AFLAGS += $(call cc-option, -fno-pie) +KBUILD_CPPFLAGS += $(call cc-option, -fno-pie) + # The arch Makefile can set ARCH_{CPP,A,C}FLAGS to override the default # values of the respective KBUILD_* variables ARCH_CPPFLAGS := From fb8d94667ee4a47fc63e300cc293224da47f6fe8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Oct 2014 23:56:34 +0200 Subject: [PATCH 2/9] mm: Dont assume page-table invariance during faults One of the side effects of speculating on faults (without holding mmap_sem) is that we can race with free_pgtables() and therefore we cannot assume the page-tables will stick around. Remove the relyance on the pte pointer. Signed-off-by: Peter Zijlstra (Intel) --- mm/memory.c | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 793fe0f9841c09..53e0abb35c2eb8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1962,30 +1962,6 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL_GPL(apply_to_page_range); -/* - * handle_pte_fault chooses page fault handler according to an entry which was - * read non-atomically. Before making any commitment, on those architectures - * or configurations (e.g. i386 with PAE) which might give a mix of unmatched - * parts, do_swap_page must check under lock before unmapping the pte and - * proceeding (but do_wp_page is only called after already making such a check; - * and do_anonymous_page can safely check later on). 
- */ -static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, - pte_t *page_table, pte_t orig_pte) -{ - int same = 1; -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) - if (sizeof(pte_t) > sizeof(unsigned long)) { - spinlock_t *ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - same = pte_same(*page_table, orig_pte); - spin_unlock(ptl); - } -#endif - pte_unmap(page_table); - return same; -} - static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { debug_dma_assert_idle(src); @@ -2524,9 +2500,6 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) int exclusive = 0; int ret = 0; - if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) - goto out; - entry = pte_to_swp_entry(orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { From a07d2d6952e6904ce6bbabfd549f397f3c1c631d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Oct 2014 23:56:35 +0200 Subject: [PATCH 3/9] mm: Prepare for FAULT_FLAG_SPECULATIVE When speculating faults (without holding mmap_sem) we need to validate that the vma against which we loaded pages is still valid when we're ready to install the new PTE. Therefore, replace the pte_offset_map_lock() calls that (re)take the PTL with pte_map_lock() which can fail in case we find the VMA changed since we started the fault. Instead of passing around the endless list of function arguments, replace the lot with a single structure so we can change context without endless function signature changes. Signed-off-by: Peter Zijlstra (Intel) [port to 4.8 kernel] Signed-off-by: Laurent Dufour --- include/linux/mm.h | 1 + mm/memory.c | 73 +++++++++++++++++++++++++++++++++------------- 2 files changed, 54 insertions(+), 20 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index ef815b9cd42696..e8e9e3dc4a0dbd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -280,6 +280,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */ #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ +#define FAULT_FLAG_SPECULATIVE 0x200 /* Speculative fault, not holding mmap_sem */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's diff --git a/mm/memory.c b/mm/memory.c index 53e0abb35c2eb8..08922b34575dd7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2095,6 +2095,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, return VM_FAULT_WRITE; } +static bool pte_map_lock(struct fault_env *fe) +{ + fe->pte = pte_offset_map_lock(fe->vma->vm_mm, fe->pmd, fe->address, &fe->ptl); + return true; +} + /* * Handle the case of a page which we actually need to copy to a new page. 
* @@ -2122,6 +2128,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, const unsigned long mmun_start = fe->address & PAGE_MASK; const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; + int ret = VM_FAULT_OOM; if (unlikely(anon_vma_prepare(vma))) goto oom; @@ -2148,7 +2155,11 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, /* * Re-check the pte - we dropped the lock */ - fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); + if (!pte_map_lock(fe)) { + mem_cgroup_cancel_charge(new_page, memcg, false); + ret = VM_FAULT_RETRY; + goto oom_free_new; + } if (likely(pte_same(*fe->pte, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { @@ -2236,7 +2247,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, oom: if (old_page) put_page(old_page); - return VM_FAULT_OOM; + return ret; } /* @@ -2261,8 +2272,12 @@ static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); if (ret & VM_FAULT_ERROR) return ret; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + + if (!pte_map_lock(fe)) { + ret |= VM_FAULT_RETRY; + return ret; + } + /* * We might have raced with another page fault while we * released the pte_offset_map_lock. @@ -2300,8 +2315,11 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, * they did, we just return, as we can count on the * MMU to tell us if they didn't also make it writable. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + if (!pte_map_lock(fe)) { + unlock_page(old_page); + put_page(old_page); + return VM_FAULT_RETRY; + } if (!pte_same(*fe->pte, orig_pte)) { unlock_page(old_page); pte_unmap_unlock(fe->pte, fe->ptl); @@ -2365,8 +2383,11 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) get_page(old_page); pte_unmap_unlock(fe->pte, fe->ptl); lock_page(old_page); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); + if (!pte_map_lock(fe)) { + unlock_page(old_page); + put_page(old_page); + return VM_FAULT_RETRY; + } if (!pte_same(*fe->pte, orig_pte)) { unlock_page(old_page); pte_unmap_unlock(fe->pte, fe->ptl); @@ -2522,8 +2543,10 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) * Back out if somebody else faulted in this pte * while we released the pte lock. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); + if (!pte_map_lock(fe)) { + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + return VM_FAULT_RETRY; + } if (likely(pte_same(*fe->pte, orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); @@ -2579,8 +2602,11 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) /* * Back out if somebody else already faulted in this pte. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + if (!pte_map_lock(fe)) { + ret = VM_FAULT_RETRY; + mem_cgroup_cancel_charge(page, memcg, false); + goto out_page; + } if (unlikely(!pte_same(*fe->pte, orig_pte))) goto out_nomap; @@ -2712,6 +2738,7 @@ static int do_anonymous_page(struct fault_env *fe) struct mem_cgroup *memcg; struct page *page; pte_t entry; + int ret = 0; /* File mapping without ->vm_ops ? 
*/ if (vma->vm_flags & VM_SHARED) @@ -2743,8 +2770,8 @@ static int do_anonymous_page(struct fault_env *fe) !mm_forbids_zeropage(vma->vm_mm)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), vma->vm_page_prot)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + if (!pte_map_lock(fe)) + return VM_FAULT_RETRY; if (!pte_none(*fe->pte)) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ @@ -2776,8 +2803,12 @@ static int do_anonymous_page(struct fault_env *fe) if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + if (!pte_map_lock(fe)) { + /* XXX: should be factorized */ + mem_cgroup_cancel_charge(page, memcg, false); + put_page(page); + return VM_FAULT_RETRY; + } if (!pte_none(*fe->pte)) goto release; @@ -2800,7 +2831,7 @@ static int do_anonymous_page(struct fault_env *fe) update_mmu_cache(vma, fe->address, fe->pte); unlock: pte_unmap_unlock(fe->pte, fe->ptl); - return 0; + return ret; release: mem_cgroup_cancel_charge(page, memcg, false); put_page(page); @@ -2842,7 +2873,7 @@ static int __do_fault(struct fault_env *fe, pgoff_t pgoff, if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); put_page(vmf.page); - return VM_FAULT_HWPOISON; + return ret | VM_FAULT_HWPOISON; } if (unlikely(!(ret & VM_FAULT_LOCKED))) @@ -2889,8 +2920,9 @@ static int pte_alloc_one_map(struct fault_env *fe) if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) return VM_FAULT_NOPAGE; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + if (!pte_map_lock(fe)) + return VM_FAULT_RETRY; + return 0; } @@ -3152,6 +3184,7 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { + /* XXX: is a call to pte_map_lock(fe) required here ? */ ret = do_fault_around(fe, pgoff); if (ret) return ret; From b8a92f60f93aef8e29f3a408137400ebbda0b797 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 10 Nov 2016 11:56:31 +0100 Subject: [PATCH 4/9] mm: Introduce pte_spinlock This is need because in handle_pte_fault() pte_offset_map() called and then fe->ptl is fetched and spin_locked. This was previously embedded in the call to pte_offset_map_lock(). --- mm/memory.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 08922b34575dd7..d19800904272cd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2095,6 +2095,13 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, return VM_FAULT_WRITE; } +static bool pte_spinlock(struct fault_env *fe) +{ + fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); + spin_lock(fe->ptl); + return true; +} + static bool pte_map_lock(struct fault_env *fe) { fe->pte = pte_offset_map_lock(fe->vma->vm_mm, fe->pmd, fe->address, &fe->ptl); @@ -3366,8 +3373,8 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) * page table entry is not accessible, so there would be no * concurrent hardware modifications to the PTE. 
*/ - fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); + if (!pte_spinlock(fe)) + return VM_FAULT_RETRY; if (unlikely(!pte_same(*fe->pte, pte))) { pte_unmap_unlock(fe->pte, fe->ptl); goto out; @@ -3535,8 +3542,8 @@ static int handle_pte_fault(struct fault_env *fe) if (pte_protnone(entry) && vma_is_accessible(fe->vma)) return do_numa_page(fe, entry); - fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); + if (!pte_spinlock(fe)) + return VM_FAULT_RETRY; if (unlikely(!pte_same(*fe->pte, entry))) goto unlock; if (fe->flags & FAULT_FLAG_WRITE) { From 964906c78d585e1c5f323ae6aae53689a881b312 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Oct 2014 23:56:36 +0200 Subject: [PATCH 5/9] mm: VMA sequence count Wrap the VMA modifications (vma_adjust/unmap_page_range) with sequence counts such that we can easily test if a VMA is changed. The unmap_page_range() one allows us to make assumptions about page-tables; when we find the seqcount hasn't changed we can assume page-tables are still valid. The flip side is that we cannot distinguish between a vma_adjust() and the unmap_page_range() -- where with the former we could have re-checked the vma bounds against the address. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/mm_types.h | 1 + mm/memory.c | 2 ++ mm/mmap.c | 12 ++++++++++++ 3 files changed, 15 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 903200f4ec41ce..620719bef8087c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -358,6 +358,7 @@ struct vm_area_struct { struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; + seqcount_t vm_sequence; }; struct core_thread { diff --git a/mm/memory.c b/mm/memory.c index d19800904272cd..ec32cf710403bb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1290,6 +1290,7 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long next; BUG_ON(addr >= end); + write_seqcount_begin(&vma->vm_sequence); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { @@ -1299,6 +1300,7 @@ void unmap_page_range(struct mmu_gather *tlb, next = zap_pud_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); + write_seqcount_end(&vma->vm_sequence); } diff --git a/mm/mmap.c b/mm/mmap.c index ca9d91bca0d6c6..c2be9bd0ad922e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -514,6 +514,8 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, else mm->highest_vm_end = vma->vm_end; + seqcount_init(&vma->vm_sequence); + /* * vma->vm_prev wasn't known when we followed the rbtree to find the * correct insertion point for that vma. As a result, we could not @@ -629,6 +631,10 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, long adjust_next = 0; int remove_next = 0; + write_seqcount_begin(&vma->vm_sequence); + if (next) + write_seqcount_begin_nested(&next->vm_sequence, SINGLE_DEPTH_NESTING); + if (next && !insert) { struct vm_area_struct *exporter = NULL, *importer = NULL; @@ -802,7 +808,9 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, * we must remove another next too. It would clutter * up the code too much to do both in one go. 
*/ + write_seqcount_end(&next->vm_sequence); next = vma->vm_next; + write_seqcount_begin_nested(&next->vm_sequence, SINGLE_DEPTH_NESTING); if (remove_next == 2) { remove_next = 1; end = next->vm_end; @@ -816,6 +824,10 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, if (insert && file) uprobe_mmap(insert); + if (next) + write_seqcount_end(&next->vm_sequence); + write_seqcount_end(&vma->vm_sequence); + validate_mm(mm); return 0; From 538abc35443176fa5c895bc4525fca460d176a83 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Oct 2014 23:56:37 +0200 Subject: [PATCH 6/9] SRCU free VMAs Manage the VMAs with SRCU such that we can do a lockless VMA lookup. We put the fput(vma->vm_file) in the SRCU callback, this keeps files valid during speculative faults, this is possible due to the delayed fput work by Al Viro -- do we need srcu_barrier() in unmount someplace? We guard the mm_rb tree with a seqlock (XXX could be a seqcount but we'd have to disable preemption around the write side in order to make the retry loop in __read_seqcount_begin() work) such that we can know if the rb tree walk was correct. We cannot trust the restult of a lockless tree walk in the face of concurrent tree rotations; although we can trust on the termination of such walks -- tree rotations guarantee the end result is a tree again after all. Furthermore, we rely on the WMB implied by the write_seqlock/count_begin() to separate the VMA initialization and the publishing stores, analogous to the RELEASE in rcu_assign_pointer(). We also rely on the RMB from read_seqretry() to separate the vma load from further loads like the smp_read_barrier_depends() in regular RCU. We must not touch the vmacache while doing SRCU lookups as that is not properly serialized against changes. We update gap information after publishing the VMA, but A) we don't use that and B) the seqlock read side would fix that anyhow. We clear vma->vm_rb for nodes removed from the vma tree such that we can easily detect such 'dead' nodes, we rely on the WMB from write_sequnlock() to separate the tree removal and clearing the node. Provide find_vma_srcu() which wraps the required magic. XXX: mmap()/munmap() heavy workloads might suffer from the global lock in call_srcu() -- this is fixable with a 'better' SRCU implementation. 
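As an editor's aside, not part of the patch itself: the seqlock-guarded lockless lookup described above is the classic sequence-counter retry pattern. The stand-alone user-space C sketch below is only an analogue of that pattern -- C11 atomics and a plain integer stand in for seqlock_t and the mm_rb tree, and it makes no claim about the kernel's exact memory-ordering primitives.

/*
 * Illustrative user-space analogue of the retry loop in find_vma_srcu():
 * the writer makes the sequence odd while it modifies the shared state,
 * the reader redoes its lockless walk until it sees the same even
 * sequence before and after the walk.
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int seq;	/* even: stable, odd: writer active */
static _Atomic int tree_state;		/* stands in for the mm_rb tree */

static void writer_update(int v)
{
	atomic_fetch_add(&seq, 1);				/* now odd */
	atomic_thread_fence(memory_order_seq_cst);
	atomic_store_explicit(&tree_state, v, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	atomic_fetch_add(&seq, 1);				/* even again */
}

static int reader_lookup(void)
{
	unsigned int s;
	int v;

	do {
		do {	/* wait for an even (stable) sequence, snapshot it */
			s = atomic_load(&seq);
		} while (s & 1);
		v = atomic_load_explicit(&tree_state, memory_order_relaxed);
		atomic_thread_fence(memory_order_seq_cst);
	} while (atomic_load(&seq) != s);	/* writer raced with us: retry */

	return v;	/* only returned once the walk was provably undisturbed */
}

int main(void)
{
	writer_update(42);
	printf("lockless lookup saw %d\n", reader_lookup());
	return 0;
}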
Signed-off-by: Peter Zijlstra (Intel) --- include/linux/mm_types.h | 2 + kernel/fork.c | 1 + mm/init-mm.c | 1 + mm/internal.h | 18 +++++++++ mm/mmap.c | 87 +++++++++++++++++++++++++++++++--------- 5 files changed, 89 insertions(+), 20 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 620719bef8087c..eac866b0987ffb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -359,6 +359,7 @@ struct vm_area_struct { #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; seqcount_t vm_sequence; + struct rcu_head vm_rcu_head; }; struct core_thread { @@ -397,6 +398,7 @@ struct kioctx_table; struct mm_struct { struct vm_area_struct *mmap; /* list of VMAs */ struct rb_root mm_rb; + seqlock_t mm_seq; u32 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, diff --git a/kernel/fork.c b/kernel/fork.c index beb31725f7e274..a15f5fdf129c2d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -603,6 +603,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->mmap = NULL; mm->mm_rb = RB_ROOT; mm->vmacache_seqnum = 0; + seqlock_init(&mm->mm_seq); atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); diff --git a/mm/init-mm.c b/mm/init-mm.c index a56a851908d245..5ef625bbb33428 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -15,6 +15,7 @@ struct mm_struct init_mm = { .mm_rb = RB_ROOT, + .mm_seq = __SEQLOCK_UNLOCKED(init_mm.mm_seq), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), diff --git a/mm/internal.h b/mm/internal.h index 1501304f87a41a..2f6c700e237534 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -38,6 +38,24 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte); +extern struct srcu_struct vma_srcu; + +extern struct vm_area_struct *find_vma_srcu(struct mm_struct *mm, unsigned long addr); + +static inline bool vma_is_dead(struct vm_area_struct *vma, unsigned int sequence) +{ + int ret = RB_EMPTY_NODE(&vma->vm_rb); + unsigned seq = ACCESS_ONCE(vma->vm_sequence.sequence); + + /* + * Matches both the wmb in write_seqlock_{begin,end}() and + * the wmb in vma_rb_erase(). + */ + smp_rmb(); + + return ret || seq != sequence; +} + void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/mmap.c b/mm/mmap.c index c2be9bd0ad922e..fb769f4243d62c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -152,6 +152,23 @@ void unlink_file_vma(struct vm_area_struct *vma) } } +DEFINE_SRCU(vma_srcu); + +static void __free_vma(struct rcu_head *head) +{ + struct vm_area_struct *vma = + container_of(head, struct vm_area_struct, vm_rcu_head); + + if (vma->vm_file) + fput(vma->vm_file); + kmem_cache_free(vm_area_cachep, vma); +} + +static void free_vma(struct vm_area_struct *vma) +{ + call_srcu(&vma_srcu, &vma->vm_rcu_head, __free_vma); +} + /* * Close a vm structure and free it, returning the next. 
*/ @@ -162,10 +179,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) - fput(vma->vm_file); mpol_put(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); return next; } @@ -386,17 +401,19 @@ static void vma_gap_update(struct vm_area_struct *vma) vma_gap_callbacks_propagate(&vma->vm_rb, NULL); } -static inline void vma_rb_insert(struct vm_area_struct *vma, - struct rb_root *root) +static inline void vma_rb_insert(struct vm_area_struct *vma, struct mm_struct *mm) { + struct rb_root *root = &mm->mm_rb; + /* All rb_subtree_gap values must be consistent prior to insertion */ validate_mm_rb(root, NULL); rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); } -static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) +static void vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm) { + struct rb_root *root = &mm->mm_rb; /* * All rb_subtree_gap values must be consistent prior to erase, * with the possible exception of the vma being erased. @@ -408,7 +425,15 @@ static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) * so make sure we instantiate it only once with our desired * augmented rbtree callbacks. */ + write_seqlock(&mm->mm_seq); rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); + write_sequnlock(&mm->mm_seq); /* wmb */ + + /* + * Ensure the removal is complete before clearing the node. + * Matched by vma_is_dead()/handle_speculative_fault(). + */ + RB_CLEAR_NODE(&vma->vm_rb); } /* @@ -525,10 +550,12 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, * immediately update the gap to the correct value. Finally we * rebalance the rbtree after all augmented values have been set. */ + write_seqlock(&mm->mm_seq); rb_link_node(&vma->vm_rb, rb_parent, rb_link); vma->rb_subtree_gap = 0; vma_gap_update(vma); - vma_rb_insert(vma, &mm->mm_rb); + vma_rb_insert(vma, mm); + write_sequnlock(&mm->mm_seq); } static void __vma_link_file(struct vm_area_struct *vma) @@ -602,7 +629,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, { struct vm_area_struct *next; - vma_rb_erase(vma, &mm->mm_rb); + vma_rb_erase(vma, mm); prev->vm_next = next = vma->vm_next; if (next) next->vm_prev = prev; @@ -794,15 +821,13 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (remove_next) { - if (file) { + if (file) uprobe_munmap(next, next->vm_start, next->vm_end); - fput(file); - } if (next->anon_vma) anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); - kmem_cache_free(vm_area_cachep, next); + free_vma(next); /* * In mprotect's case 6 (see comments on vma_merge), * we must remove another next too. It would clutter @@ -1949,16 +1974,11 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +static struct vm_area_struct *__find_vma(struct mm_struct *mm, unsigned long addr) { struct rb_node *rb_node; struct vm_area_struct *vma; - /* Check the cache first. 
*/ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; - rb_node = mm->mm_rb.rb_node; while (rb_node) { @@ -1975,13 +1995,40 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) rb_node = rb_node->rb_right; } + return vma; +} + +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + + /* Check the cache first. */ + vma = vmacache_find(mm, addr); + if (likely(vma)) + return vma; + + vma = __find_vma(mm, addr); if (vma) vmacache_update(addr, vma); return vma; } - EXPORT_SYMBOL(find_vma); +struct vm_area_struct *find_vma_srcu(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + unsigned int seq; + + WARN_ON_ONCE(!srcu_read_lock_held(&vma_srcu)); + + do { + seq = read_seqbegin(&mm->mm_seq); + vma = __find_vma(mm, addr); + } while (read_seqretry(&mm->mm_seq, seq)); + + return vma; +} + /* * Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ @@ -2336,7 +2383,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; do { - vma_rb_erase(vma, &mm->mm_rb); + vma_rb_erase(vma, mm); mm->map_count--; tail_vma = vma; vma = vma->vm_next; From b7b7cc5765bdbcb3f96acf2c41356c24b8c112f1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Oct 2014 23:56:38 +0200 Subject: [PATCH 7/9] mm: Provide speculative fault infrastructure Provide infrastructure to do a speculative fault (not holding mmap_sem). The not holding of mmap_sem means we can race against VMA change/removal and page-table destruction. We use the SRCU VMA freeing to keep the VMA around. We use the VMA seqcount to detect change (including umapping / page-table deletion) and we use gup_fast() style page-table walking to deal with page-table races. Once we've obtained the page and are ready to update the PTE, we validate if the state we started the fault with is still valid, if not, we'll fail the fault with VM_FAULT_RETRY, otherwise we update the PTE and we're done. 
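To make the validate-before-commit step concrete (an editor's sketch, not patch content): the pattern below mirrors what the speculative pte_map_lock() path does -- snapshot the sequence, do the expensive work without the lock, then take the "page-table lock" and re-check the sequence before installing anything; any mismatch is reported as retry so the caller falls back to the classic path. The struct, helper name and flag value are hypothetical, a pthread mutex stands in for the PTL, and IRQ disabling is not modelled.

/*
 * User-space sketch of the speculate/validate/commit pattern described
 * above.  "sequence" plays the role of vma->vm_sequence, the mutex plays
 * the role of the pte lock, and FAULT_RETRY mimics VM_FAULT_RETRY.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define FAULT_DONE	0
#define FAULT_RETRY	1

struct object {
	_Atomic unsigned int sequence;	/* bumped around every modification */
	pthread_mutex_t lock;		/* stands in for the pte lock */
	int state;			/* stands in for the pte to install */
};

static int speculative_install(struct object *obj, int new_state)
{
	unsigned int seq = atomic_load(&obj->sequence);

	if (seq & 1)			/* writer mid-update: don't even try */
		return FAULT_RETRY;

	/* ...the expensive part (page allocation, I/O) happens lockless... */

	pthread_mutex_lock(&obj->lock);			/* like pte_map_lock() */
	if (atomic_load(&obj->sequence) != seq) {	/* changed under us */
		pthread_mutex_unlock(&obj->lock);
		return FAULT_RETRY;
	}
	obj->state = new_state;				/* "install the pte" */
	pthread_mutex_unlock(&obj->lock);
	return FAULT_DONE;
}

int main(void)
{
	struct object obj = { .sequence = 0,
			      .lock = PTHREAD_MUTEX_INITIALIZER, .state = 0 };

	if (speculative_install(&obj, 42) == FAULT_RETRY)
		puts("fall back to the locked slow path");
	else
		printf("installed %d without the big lock\n", obj.state);
	return 0;
}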
Signed-off-by: Peter Zijlstra (Intel) --- include/linux/mm.h | 3 ++ mm/memory.c | 122 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 123 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e8e9e3dc4a0dbd..6d4285c0df65f8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -320,6 +320,7 @@ struct fault_env { struct vm_area_struct *vma; /* Target VMA */ unsigned long address; /* Faulting virtual address */ unsigned int flags; /* FAULT_FLAG_xxx flags */ + unsigned int sequence; pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ @@ -1258,6 +1259,8 @@ int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags); +extern int handle_speculative_fault(struct mm_struct *mm, + unsigned long address, unsigned int flags); extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); diff --git a/mm/memory.c b/mm/memory.c index ec32cf710403bb..05fff1feba690d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2106,8 +2106,37 @@ static bool pte_spinlock(struct fault_env *fe) static bool pte_map_lock(struct fault_env *fe) { - fe->pte = pte_offset_map_lock(fe->vma->vm_mm, fe->pmd, fe->address, &fe->ptl); - return true; + bool ret = false; + + if (!(fe->flags & FAULT_FLAG_SPECULATIVE)) { + fe->pte = pte_offset_map_lock(fe->vma->vm_mm, fe->pmd, + fe->address, &fe->ptl); + return true; + } + + /* + * The first vma_is_dead() guarantees the page-tables are still valid, + * having IRQs disabled ensures they stay around, hence the second + * vma_is_dead() to make sure they are still valid once we've got the + * lock. After that a concurrent zap_pte_range() will block on the PTL + * and thus we're safe. + */ + local_irq_disable(); + if (vma_is_dead(fe->vma, fe->sequence)) + goto out; + + fe->pte = pte_offset_map_lock(fe->vma->vm_mm, fe->pmd, + fe->address, &fe->ptl); + + if (vma_is_dead(fe->vma, fe->sequence)) { + pte_unmap_unlock(fe->pte, fe->ptl); + goto out; + } + + ret = true; +out: + local_irq_enable(); + return ret; } /* @@ -2533,6 +2562,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) entry = pte_to_swp_entry(orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { + /* XXX fe->pmd might be dead */ migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; @@ -3625,6 +3655,94 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, return handle_pte_fault(&fe); } +int handle_speculative_fault(struct mm_struct *mm, unsigned long address, unsigned int flags) +{ + struct fault_env fe = { + .address = address, + .flags = flags | FAULT_FLAG_SPECULATIVE, + }; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + int dead, seq, idx, ret = VM_FAULT_RETRY; + struct vm_area_struct *vma; + + idx = srcu_read_lock(&vma_srcu); + vma = find_vma_srcu(mm, address); + if (!vma) + goto unlock; + + /* + * Validate the VMA found by the lockless lookup. + */ + dead = RB_EMPTY_NODE(&vma->vm_rb); + seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> seqlock,vma_rb_erase() */ + if ((seq & 1) || dead) /* XXX wait for !&1 instead? */ + goto unlock; + + if (address < vma->vm_start || vma->vm_end <= address) + goto unlock; + + /* + * We need to re-validate the VMA after checking the bounds, otherwise + * we might have a false positive on the bounds. 
+ */ + if (read_seqcount_retry(&vma->vm_sequence, seq)) + goto unlock; + + /* + * Do a speculative lookup of the PTE entry. + */ + local_irq_disable(); + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out_walk; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out_walk; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto out_walk; + + /* + * The above does not allocate/instantiate page-tables because doing so + * would lead to the possibility of instantiating page-tables after + * free_pgtables() -- and consequently leaking them. + * + * The result is that we take at least one !speculative fault per PMD + * in order to instantiate it. + * + * XXX try and fix that.. should be possible somehow. + */ + + if (pmd_huge(*pmd)) /* XXX no huge support */ + goto out_walk; + + fe.vma = vma; + fe.pmd = pmd; + fe.sequence = seq; + +#if 0 +#warning This is done in handle_pte_fault()... + pte = pte_offset_map(pmd, address); + fe.entry = ACCESS_ONCE(pte); /* XXX gup_get_pte() */ + pte_unmap(pte); +#endif + local_irq_enable(); + + ret = handle_pte_fault(&fe); + +unlock: + srcu_read_unlock(&vma_srcu, idx); + return ret; + +out_walk: + local_irq_enable(); + goto unlock; +} + /* * By the time we get here, we already hold the mm semaphore * From 02983e39a474bc7183287f40d22ecba373b13006 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Wed, 9 Nov 2016 15:36:12 +0100 Subject: [PATCH 8/9] mm: Fix pte_spinlock for speculative page fault --- mm/memory.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 05fff1feba690d..1c06b45c609713 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2099,9 +2099,31 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, static bool pte_spinlock(struct fault_env *fe) { + bool ret = false; + + /* Check if vma is still valid */ + if (!(fe->flags & FAULT_FLAG_SPECULATIVE)) { + fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); + spin_lock(fe->ptl); + return true; + } + + local_irq_disable(); + if (vma_is_dead(fe->vma, fe->sequence)) + goto out; + fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); spin_lock(fe->ptl); - return true; + + if (vma_is_dead(fe->vma, fe->sequence)) { + spin_unlock(fe->ptl); + goto out; + } + + ret = true; +out: + local_irq_enable(); + return ret; } static bool pte_map_lock(struct fault_env *fe) From cb682a51bc88a8b3ffb72cc6ddca7d3ec0de36d6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Oct 2014 23:56:39 +0200 Subject: [PATCH 9/9] mm,x86: Add speculative pagefault handling Try a speculative fault before acquiring mmap_sem, if it returns with VM_FAULT_RETRY continue with the mmap_sem acquisition and do the traditional fault. Signed-off-by: Peter Zijlstra (Intel) --- arch/x86/mm/fault.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index dc802306045653..5313ec9ac57ed1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1276,6 +1276,16 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (error_code & PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; + if (error_code & PF_USER) { + fault = handle_speculative_fault(mm, address, + flags & ~FAULT_FLAG_ALLOW_RETRY); + + if (fault & VM_FAULT_RETRY) + goto retry; + + goto done; + } + /* * When running in the kernel we expect faults to occur only to * addresses in user space. 
All other faults represent errors in @@ -1379,7 +1389,15 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, return; } + if (unlikely(fault & VM_FAULT_RETRY)) { + if (fatal_signal_pending(current)) + return; + + goto done; + } + up_read(&mm->mmap_sem); +done: if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, vma, fault); return;
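For orientation (an editor's sketch, not part of the patch above): the change to __do_page_fault() amounts to "try the lockless path for user-mode faults, and only fall back to taking mmap_sem when it returns VM_FAULT_RETRY". The stand-alone C sketch below shows that shape; try_speculative(), locked_fault() and the flag value are hypothetical stand-ins, not the kernel's symbols.

/*
 * Condensed sketch of the fault-entry logic added by the last patch.
 * try_speculative() and locked_fault() are stand-ins for
 * handle_speculative_fault() and the existing mmap_sem-protected path.
 */
#include <stdbool.h>
#include <stdio.h>

#define VM_FAULT_RETRY	0x1	/* illustrative value, not the kernel's */

static unsigned int try_speculative(unsigned long address)
{
	(void)address;
	return VM_FAULT_RETRY;	/* pretend the VMA changed under us */
}

static unsigned int locked_fault(unsigned long address)
{
	printf("handled %#lx with mmap_sem held\n", address);
	return 0;
}

static void do_fault(unsigned long address, bool user_mode)
{
	if (user_mode && !(try_speculative(address) & VM_FAULT_RETRY))
		return;		/* fast path succeeded: mmap_sem never taken */

	locked_fault(address);	/* classic path, unchanged by the series */
}

int main(void)
{
	do_fault(0x7f0000001000UL, true);
	return 0;
}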