
Commit 56b2744 (parent: ec50023)

Liu Shixin authored and gregkh committed
mm: hugetlb: independent PMD page table shared count
[ Upstream commit 59d9094 ]

The folio refcount may be increased unexpectedly through try_get_folio()
by callers such as split_huge_pages. In huge_pmd_unshare(), we use the
refcount to check whether a pmd page table is shared. The check is
incorrect if the refcount is increased by the above callers, and this can
cause the page table to be leaked:

 BUG: Bad page state in process sh pfn:109324
 page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324
 flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff)
 page_type: f2(table)
 raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000
 raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000
 page dumped because: nonzero mapcount
 ...
 CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7
 Tainted: [B]=BAD_PAGE
 Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
 Call trace:
  show_stack+0x20/0x38 (C)
  dump_stack_lvl+0x80/0xf8
  dump_stack+0x18/0x28
  bad_page+0x8c/0x130
  free_page_is_bad_report+0xa4/0xb0
  free_unref_page+0x3cc/0x620
  __folio_put+0xf4/0x158
  split_huge_pages_all+0x1e0/0x3e8
  split_huge_pages_write+0x25c/0x2d8
  full_proxy_write+0x64/0xd8
  vfs_write+0xcc/0x280
  ksys_write+0x70/0x110
  __arm64_sys_write+0x24/0x38
  invoke_syscall+0x50/0x120
  el0_svc_common.constprop.0+0xc8/0xf0
  do_el0_svc+0x24/0x38
  el0_svc+0x34/0x128
  el0t_64_sync_handler+0xc8/0xd0
  el0t_64_sync+0x190/0x198

The issue may be triggered by damon, offline_page, page_idle, etc., which
will increase the refcount of the page table. When that happens:

1. The page table itself will be discarded after reporting the
   "nonzero mapcount".
2. The HugeTLB page mapped by the page table misses being freed, since we
   treat the page table as shared, and a shared page table will not be
   unmapped.

Fix it by introducing an independent PMD page table shared count. As
described by the comment, pt_index/pt_mm/pt_frag_refcount are used for
s390 gmap, x86 pgds and powerpc, and pt_share_count is used for
x86/arm64/riscv pmds, so we can reuse the field as pt_share_count.

Link: https://lkml.kernel.org/r/20241216071147.3984217-1-liushixin2@huawei.com
Fixes: 39dde65 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Ken Chen <kenneth.w.chen@intel.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nanyong Sun <sunnanyong@huawei.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
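To make the failure mode concrete, here is a minimal userspace model — a
sketch, not kernel code: the C11 atomic stands in for the folio refcount,
and the event ordering is invented for illustration — of why a bare
refcount cannot distinguish "another mm shares this PMD table" from
"someone briefly holds a folio reference":

/*
 * Minimal userspace model of the bug (illustrative only).
 * Build: cc -std=c11 pts_bug.c && ./a.out
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

int main(void)
{
        /* Old scheme: the allocator's own reference makes the count start at 1. */
        atomic_int refcount = 1;

        atomic_fetch_add(&refcount, 1);   /* a second mm shares the table */
        atomic_fetch_sub(&refcount, 1);   /* ...and detaches again        */

        /* A speculative walker in the try_get_folio() style grabs a
         * transient reference at exactly the wrong moment. */
        atomic_fetch_add(&refcount, 1);

        /* huge_pmd_unshare()'s old test now misfires: the table has no
         * sharers, yet count != 1 makes it look shared, so the PMD entry
         * is left in place and the mapping is never torn down. */
        if (atomic_load(&refcount) == 1)
                printf("unshared: unmap the table\n");
        else
                printf("looks shared: skip unmap (wrong here -> leak)\n");

        /* New scheme: a dedicated share count that only sharers touch;
         * the transient folio reference cannot perturb it. */
        atomic_int pt_share_count = 0;
        assert(atomic_load(&pt_share_count) == 0);  /* safe to unmap */
        return 0;
}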

3 files changed: +38, -9 lines

Diff for: include/linux/mm.h

@@ -3031,6 +3031,7 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
 	if (!pmd_ptlock_init(ptdesc))
 		return false;
 	__folio_set_pgtable(folio);
+	ptdesc_pmd_pts_init(ptdesc);
 	lruvec_stat_add_folio(folio, NR_PAGETABLE);
 	return true;
 }

Diff for: include/linux/mm_types.h

@@ -399,6 +399,7 @@ FOLIO_MATCH(compound_head, _head_2a);
  * @__page_mapping: Aliases with page->mapping. Unused for page tables.
  * @pt_mm: Used for x86 pgds.
  * @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
+ * @pt_share_count: Used for HugeTLB PMD page table share count.
  * @_pt_pad_2: Padding to ensure proper alignment.
  * @ptl: Lock for the page table.
  * @__page_type: Same as page->page_type. Unused for page tables.
@@ -424,6 +425,9 @@ struct ptdesc {
 	union {
 		struct mm_struct *pt_mm;
 		atomic_t pt_frag_refcount;
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+		atomic_t pt_share_count;
+#endif
 	};
 
 	union {
@@ -468,6 +472,32 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
 		const struct page *: (const struct ptdesc *)(p), \
 		struct page *: (struct ptdesc *)(p)))
 
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
+{
+	atomic_set(&ptdesc->pt_share_count, 0);
+}
+
+static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc)
+{
+	atomic_inc(&ptdesc->pt_share_count);
+}
+
+static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc)
+{
+	atomic_dec(&ptdesc->pt_share_count);
+}
+
+static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc)
+{
+	return atomic_read(&ptdesc->pt_share_count);
+}
+#else
+static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
+{
+}
+#endif
+
 /*
  * Used for sizing the vmemmap region on some architectures
  */
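Note that the #else branch stubs out only ptdesc_pmd_pts_init(), evidently
because it is the one helper invoked from generic code
(pagetable_pmd_ctor() in include/linux/mm.h above), while inc/dec/count
are referenced only from the hugetlb PMD-sharing paths that are compiled
out when the option is disabled. Below is a compilable userspace sketch of
how the four helpers pair up over a table's lifetime — struct ptdesc and
the atomics are stubbed with C11 <stdatomic.h>; only the helper names and
the call sites named in the comments come from the patch:

/* Userspace sketch of the pt_share_count lifecycle (stubbed types). */
#include <assert.h>
#include <stdatomic.h>

struct ptdesc { atomic_int pt_share_count; };

static void ptdesc_pmd_pts_init(struct ptdesc *p)  { atomic_store(&p->pt_share_count, 0); }
static void ptdesc_pmd_pts_inc(struct ptdesc *p)   { atomic_fetch_add(&p->pt_share_count, 1); }
static void ptdesc_pmd_pts_dec(struct ptdesc *p)   { atomic_fetch_sub(&p->pt_share_count, 1); }
static int  ptdesc_pmd_pts_count(struct ptdesc *p) { return atomic_load(&p->pt_share_count); }

int main(void)
{
        struct ptdesc pmd_table;

        ptdesc_pmd_pts_init(&pmd_table);  /* pagetable_pmd_ctor(): count = 0 */
        ptdesc_pmd_pts_inc(&pmd_table);   /* huge_pmd_share(): mm B attaches */
        ptdesc_pmd_pts_inc(&pmd_table);   /* huge_pmd_share(): mm C attaches */
        ptdesc_pmd_pts_dec(&pmd_table);   /* huge_pmd_unshare(): mm C leaves */
        ptdesc_pmd_pts_dec(&pmd_table);   /* huge_pmd_unshare(): mm B leaves */

        /* 0 again: no other mm shares the table, so it is safe to unmap. */
        assert(ptdesc_pmd_pts_count(&pmd_table) == 0);
        return 0;
}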

Diff for: mm/hugetlb.c

@@ -7014,7 +7014,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 			spte = hugetlb_walk(svma, saddr,
 					    vma_mmu_pagesize(svma));
 			if (spte) {
-				get_page(virt_to_page(spte));
+				ptdesc_pmd_pts_inc(virt_to_ptdesc(spte));
 				break;
 			}
 		}
@@ -7029,7 +7029,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 			  (pmd_t *)((unsigned long)spte & PAGE_MASK));
 		mm_inc_nr_pmds(mm);
 	} else {
-		put_page(virt_to_page(spte));
+		ptdesc_pmd_pts_dec(virt_to_ptdesc(spte));
 	}
 	spin_unlock(&mm->page_table_lock);
 out:
@@ -7041,10 +7041,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 /*
  * unmap huge page backed by shared pte.
  *
- * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
  * Called with page table lock held.
  *
  * returns: 1 successfully unmapped a shared pte page
@@ -7053,18 +7049,20 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 		     unsigned long addr, pte_t *ptep)
 {
+	unsigned long sz = huge_page_size(hstate_vma(vma));
 	pgd_t *pgd = pgd_offset(mm, addr);
 	p4d_t *p4d = p4d_offset(pgd, addr);
 	pud_t *pud = pud_offset(p4d, addr);
 
 	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
 	hugetlb_vma_assert_locked(vma);
-	BUG_ON(page_count(virt_to_page(ptep)) == 0);
-	if (page_count(virt_to_page(ptep)) == 1)
+	if (sz != PMD_SIZE)
+		return 0;
+	if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
 		return 0;
 
 	pud_clear(pud);
-	put_page(virt_to_page(ptep));
+	ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
 	mm_dec_nr_pmds(mm);
 	return 1;
 }
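Two details of the new huge_pmd_unshare() are worth spelling out. First,
the baseline shifts: the old page_count() included the allocator's own
reference, so "unshared" meant count == 1 (and the removed BUG_ON guarded
against a refcount underflow that can no longer occur); pt_share_count
counts sharers only, so "unshared" is count == 0. Second, the new
sz != PMD_SIZE early return: the share count lives in the PMD table's
ptdesc (initialized in pagetable_pmd_ctor()), and PMD sharing applies only
to PMD-sized mappings, so other hugepage sizes bail out before the counter
is consulted. A runnable sketch of the baseline shift, using plain ints
with no kernel context:

/* Sketch of the invariant shift (single-threaded, illustrative only). */
#include <assert.h>

int main(void)
{
        int page_count = 1;     /* old scheme: allocator reference included */
        int pt_share_count = 0; /* new scheme: sharers only                 */

        page_count++;           /* another mm shares the PMD table */
        pt_share_count++;

        page_count--;           /* ...and unshares it */
        pt_share_count--;

        assert(page_count == 1);     /* old test for "unshared" (fragile)   */
        assert(pt_share_count == 0); /* new test: transient folio refs
                                      * cannot perturb it                   */
        return 0;
}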
