From 4cba5db0c645a13dedf88471b70046bc575e63b7 Mon Sep 17 00:00:00 2001
From: Nick Hollinghurst
Date: Fri, 29 Aug 2025 15:50:03 +0100
Subject: [PATCH 1/2] iommu: bcm2712-iommu: Allocate tables on demand; add OF
 properties

Allocate space for level-2 IOMMU translation tables on demand. This
should save memory in most cases, but means that map_pages() can now
fail with -ENOMEM. Unused pages are retained for re-use.

Move all dma_sync* calls into the map and unmap functions rather than
batching them up. This makes it easier to ensure they are safely
balanced, now that the tables are held as separate pages.

Add OF properties to override the default aperture size (2GB) and base
address (40GB); these don't include any dma-iova-offset. The internal
representation of aperture limits *does* include dma_iova_offset, as
that is more commonly useful.

Various tidy-ups. Clarify the distinction between Linux pages and
IOMMU table pages. Fix wrong definition of the MMMU_CTRL_PT_INVALID_EN
flag.

Signed-off-by: Nick Hollinghurst
---
 drivers/iommu/bcm2712-iommu.c | 442 ++++++++++++++++++++--------------
 drivers/iommu/bcm2712-iommu.h |  12 +-
 2 files changed, 262 insertions(+), 192 deletions(-)

diff --git a/drivers/iommu/bcm2712-iommu.c b/drivers/iommu/bcm2712-iommu.c
index 4e6aa0352f5a57..5ca171863d4435 100644
--- a/drivers/iommu/bcm2712-iommu.c
+++ b/drivers/iommu/bcm2712-iommu.c
@@ -2,14 +2,16 @@
 /*
  * IOMMU driver for BCM2712
  *
- * Copyright (c) 2023 Raspberry Pi Ltd.
+ * Copyright (c) 2023-2025 Raspberry Pi Ltd.
  */
 
 #include "bcm2712-iommu.h"
+#include "iommu-pages.h"
 
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -26,8 +28,9 @@
 #define MMMU_CTRL_CAP_EXCEEDED_EXCEPTION_EN BIT(24)
 #define MMMU_CTRL_PT_INVALID                BIT(20)
 #define MMMU_CTRL_PT_INVALID_ABORT_EN       BIT(19)
-#define MMMU_CTRL_PT_INVALID_EXCEPTION_EN   BIT(18)
-#define MMMU_CTRL_PT_INVALID_EN             BIT(17)
+#define MMMU_CTRL_PT_INVALID_INT_EN         BIT(18)
+#define MMMU_CTRL_PT_INVALID_EXCEPTION_EN   BIT(17)
+#define MMMU_CTRL_PT_INVALID_EN             BIT(16)
 #define MMMU_CTRL_WRITE_VIOLATION           BIT(12)
 #define MMMU_CTRL_WRITE_VIOLATION_ABORT_EN  BIT(11)
 #define MMMU_CTRL_WRITE_VIOLATION_INT_EN    BIT(10)
@@ -66,6 +69,8 @@
 #define MMMU_ILLEGAL_ADR_OFFSET             0x30
 #define MMMU_ILLEGAL_ADR_ENABLE             BIT(31)
 
+#define MMMU_VIO_ADDR_OFFSET                0x34
+
 #define MMMU_DEBUG_INFO_OFFSET              0x38
 #define MMMU_DEBUG_INFO_VERSION_MASK        0x0000000Fu
 #define MMMU_DEBUG_INFO_VA_WIDTH_MASK       0x000000F0u
@@ -80,41 +85,67 @@
 #define MMMU_PTE_VALID                      BIT(28)
 
 /*
- * BCM2712 IOMMU is organized around 4Kbyte pages (MMU_PAGE_SIZE).
+ * BCM2712 IOMMU is organized around 4Kbyte pages (IOMMU_PAGE_SIZE).
  * Linux PAGE_SIZE must not be smaller but may be larger (e.g. 4K, 16K).
  *
  * Unlike many larger MMUs, this one uses a 4-byte word size, allowing
  * 1024 entries within each 4K table page, and two-level translation.
+ * To avoid needing CMA, we require the top-level table to fit within
+ * a single Linux page. This restricts IOVA space to (PAGE_SIZE << 20).
+ * A "default" page is allocated to catch illegal reads and writes.
  *
- * Let's allocate enough table space for 2GB of translated memory (IOVA).
- * This requires 512 4K pages (2MB) of level-2 tables, one page of
- * top-level table (only half-filled in this particular configuration),
- * plus one "default" page to catch illegal requests.
- *
- * The translated virtual address region is between 40GB and 42GB;
- * addresses below this range pass straight through to the SDRAM.
+ * To deal with multiple blocks connected to an IOMMU when only some
+ * of them are IOMMU clients, by default we start the IOVA region at
+ * 40GB; requests to lower addresses pass straight through to SDRAM.
+ * Both the base and size of the IOVA region are configurable.
  *
  * Currently we assume a 1:1:1 correspondence of IOMMU, group and domain.
  */
 
-#define MMU_PAGE_SHIFT        12
-#define MMU_PAGE_SIZE         BIT(MMU_PAGE_SHIFT)
+#define IOMMU_PAGE_SHIFT      12
+#define IOMMU_PAGE_SIZE       BIT(IOMMU_PAGE_SHIFT)
 
-#define PAGEWORDS_SHIFT       (MMU_PAGE_SHIFT - 2)
-#define HUGEPAGE_SHIFT        (MMU_PAGE_SHIFT + PAGEWORDS_SHIFT)
-#define L1_CHUNK_SHIFT        (MMU_PAGE_SHIFT + 2 * PAGEWORDS_SHIFT)
+#define LX_PAGEWORDS_SHIFT    (PAGE_SHIFT - 2)
+#define LX_PAGEWORDS_MASK     ((1u << LX_PAGEWORDS_SHIFT) - 1u)
+#define IOMMU_PAGEWORDS_SHIFT (IOMMU_PAGE_SHIFT - 2)
+#define IOMMU_HUGEPAGE_SHIFT  (IOMMU_PAGE_SHIFT + IOMMU_PAGEWORDS_SHIFT)
+#define L1_AP_BASE_SHIFT      (IOMMU_PAGE_SHIFT + 2 * IOMMU_PAGEWORDS_SHIFT)
+#define IOVA_TO_LXPAGE_SHIFT  (IOMMU_PAGE_SHIFT + LX_PAGEWORDS_SHIFT)
 
-#define APERTURE_BASE         (40ul << 30)
-#define APERTURE_SIZE         (2ul << 30)
-#define APERTURE_TOP          (APERTURE_BASE + APERTURE_SIZE)
-#define TRANSLATED_PAGES      (APERTURE_SIZE >> MMU_PAGE_SHIFT)
-#define L2_PAGES              (TRANSLATED_PAGES >> PAGEWORDS_SHIFT)
-#define TABLES_ALLOC_SIZE     (L2_PAGES * MMU_PAGE_SIZE + 2 * PAGE_SIZE)
+#define DEFAULT_APERTURE_BASE (40ul << 30)
+#define DEFAULT_APERTURE_SIZE (2ul << 30)
 
-static void bcm2712_iommu_init(struct bcm2712_iommu *mmu)
+/*
+ * Allocate a Linux page for tables; return the index of its first IOMMU page.
+ * To avoid having to store DMA addresses, we assume that a DMA address used
+ * for IOMMU tables is always a physical address -- this is true for BCM2712.
+ * Note that iommu_alloc_page() will zero the page contents.
+ */
+static u32 bcm2712_iommu_get_page(struct bcm2712_iommu *mmu, u32 **ptr)
+{
+	*ptr = iommu_alloc_page(GFP_KERNEL);
+	if (*ptr) {
+		dma_addr_t dma = dma_map_single(mmu->dev, *ptr,
+						PAGE_SIZE, DMA_TO_DEVICE);
+
+		WARN_ON(dma_mapping_error(mmu->dev, dma) ||
+			dma != virt_to_phys(*ptr));
+		return (u32)(dma >> IOMMU_PAGE_SHIFT);
+	}
+	dev_err(mmu->dev, "Failed to allocate a page\n");
+	return 0;
+}
+
+static void bcm2712_iommu_free_page(struct bcm2712_iommu *mmu, void *ptr)
+{
+	if (ptr) {
+		dma_unmap_single(mmu->dev, virt_to_phys(ptr),
+				 PAGE_SIZE, DMA_TO_DEVICE);
+		iommu_free_page(ptr);
+	}
+}
+
+static int bcm2712_iommu_init(struct bcm2712_iommu *mmu)
 {
-	unsigned int i, bypass_shift;
-	struct sg_dma_page_iter it;
 	u32 u = MMU_RD(MMMU_DEBUG_INFO_OFFSET);
 
 	/*
@@ -130,12 +161,14 @@ static void bcm2712_iommu_init(struct bcm2712_iommu *mmu)
 	       FIELD_GET(MMMU_DEBUG_INFO_PA_WIDTH_MASK, u) < 6 ||
 	       !(u & MMMU_DEBUG_INFO_BYPASS));
 
+	dma_set_mask_and_coherent(mmu->dev,
+				  DMA_BIT_MASK(FIELD_GET(MMMU_DEBUG_INFO_PA_WIDTH_MASK, u) + 30u));
 	mmu->bigpage_mask =
-		((1u << FIELD_GET(MMMU_DEBUG_INFO_BIGPAGE_WIDTH_MASK, u)) - 1u) << MMU_PAGE_SHIFT;
+		((1u << FIELD_GET(MMMU_DEBUG_INFO_BIGPAGE_WIDTH_MASK, u)) - 1u) <<
+		IOMMU_PAGE_SHIFT;
 	mmu->superpage_mask =
-		((1u << FIELD_GET(MMMU_DEBUG_INFO_SUPERPAGE_WIDTH_MASK, u)) - 1u) << MMU_PAGE_SHIFT;
-	bypass_shift = (u & MMMU_DEBUG_INFO_BYPASS_4M) ?
-			HUGEPAGE_SHIFT : ADDR_CAP_SHIFT;
+		((1u << FIELD_GET(MMMU_DEBUG_INFO_SUPERPAGE_WIDTH_MASK, u)) - 1u) <<
+		IOMMU_PAGE_SHIFT;
 
 	/* Disable MMU and clear sticky flags; meanwhile flush the TLB */
 	MMU_WR(MMMU_CTRL_OFFSET,
@@ -148,7 +181,7 @@ static void bcm2712_iommu_init(struct bcm2712_iommu *mmu)
 	/*
 	 * Put MMU into 2-level mode; set address cap and "bypass" range
 	 * (note that some of these registers have unintuitive off-by-ones).
-	 * Addresses below APERTURE_BASE are passed unchanged: this is
+	 * Addresses below the "aperture" are passed unchanged: this is
 	 * useful for blocks which share an IOMMU with other blocks
 	 * whose drivers are not IOMMU-aware.
 	 */
@@ -156,49 +189,36 @@ static void bcm2712_iommu_init(struct bcm2712_iommu *mmu)
 	       MMU_RD(MMMU_MISC_OFFSET) & ~MMMU_MISC_SINGLE_TABLE);
 	MMU_WR(MMMU_ADDR_CAP_OFFSET,
 	       MMMU_ADDR_CAP_ENABLE +
-	       (APERTURE_TOP >> ADDR_CAP_SHIFT) - 1);
-	if (APERTURE_BASE > 0) {
-		MMU_WR(MMMU_BYPASS_START_OFFSET,
-		       MMMU_BYPASS_START_ENABLE + MMMU_BYPASS_START_INVERT +
-		       (APERTURE_BASE >> bypass_shift) - 1);
+	       ((mmu->aperture_end - mmu->dma_iova_offset) >> ADDR_CAP_SHIFT) - 1);
+	MMU_WR(MMMU_BYPASS_START_OFFSET, 0);
+	if (mmu->aperture_base > mmu->dma_iova_offset) {
+		unsigned int byp_shift = (u & MMMU_DEBUG_INFO_BYPASS_4M) ?
+			IOMMU_HUGEPAGE_SHIFT : ADDR_CAP_SHIFT;
+
 		MMU_WR(MMMU_BYPASS_END_OFFSET,
 		       MMMU_BYPASS_END_ENABLE +
-		       (APERTURE_TOP >> bypass_shift));
+		       ((mmu->aperture_base - mmu->dma_iova_offset) >> byp_shift));
 	} else {
-		MMU_WR(MMMU_BYPASS_START_OFFSET, 0);
 		MMU_WR(MMMU_BYPASS_END_OFFSET, 0);
 	}
 
-	/* Ensure tables are zeroed (which marks all pages as invalid) */
-	dma_sync_sgtable_for_cpu(mmu->dev, mmu->sgt, DMA_TO_DEVICE);
-	memset(mmu->tables, 0, TABLES_ALLOC_SIZE);
-	mmu->nmapped_pages = 0;
-
-	/* Initialize the high-level table to point to the low-level pages */
-	__sg_page_iter_start(&it.base, mmu->sgt->sgl, mmu->sgt->nents, 0);
-	for (i = 0; i < L2_PAGES; i++) {
-		if (!(i % (PAGE_SIZE / MMU_PAGE_SIZE))) {
-			__sg_page_iter_dma_next(&it);
-			u = (sg_page_iter_dma_address(&it) >> MMU_PAGE_SHIFT);
-		} else {
-			u++;
-		}
-		mmu->tables[TRANSLATED_PAGES + i] = MMMU_PTE_VALID + u;
-	}
-
 	/*
 	 * Configure the addresses of the top-level table (offset because
 	 * the aperture does not start from zero), and of the default page.
 	 * For simplicity, both these regions are whole Linux pages.
 	 */
-	__sg_page_iter_dma_next(&it);
-	u = (sg_page_iter_dma_address(&it) >> MMU_PAGE_SHIFT);
-	MMU_WR(MMMU_PT_PA_BASE_OFFSET, u - (APERTURE_BASE >> L1_CHUNK_SHIFT));
-	__sg_page_iter_dma_next(&it);
-	u = (sg_page_iter_dma_address(&it) >> MMU_PAGE_SHIFT);
+	u = bcm2712_iommu_get_page(mmu, &mmu->top_table);
+	if (!u)
+		return -ENOMEM;
+	MMU_WR(MMMU_PT_PA_BASE_OFFSET,
+	       u - ((mmu->aperture_base - mmu->dma_iova_offset) >> L1_AP_BASE_SHIFT));
+	u = bcm2712_iommu_get_page(mmu, &mmu->default_page);
+	if (!u) {
+		bcm2712_iommu_free_page(mmu, mmu->top_table);
+		return -ENOMEM;
+	}
 	MMU_WR(MMMU_ILLEGAL_ADR_OFFSET, MMMU_ILLEGAL_ADR_ENABLE + u);
-	dma_sync_sgtable_for_device(mmu->dev, mmu->sgt, DMA_TO_DEVICE);
-	mmu->dirty = false;
+	mmu->nmapped_pages = 0;
 
 	/* Flush (and enable) the shared TLB cache; enable this MMU. */
 	if (mmu->cache)
@@ -206,9 +226,12 @@
 	MMU_WR(MMMU_CTRL_OFFSET,
 	       MMMU_CTRL_CAP_EXCEEDED_ABORT_EN |
 	       MMMU_CTRL_PT_INVALID_ABORT_EN |
+	       MMMU_CTRL_PT_INVALID_EN |
 	       MMMU_CTRL_WRITE_VIOLATION_ABORT_EN |
 	       MMMU_CTRL_STATS_ENABLE |
 	       MMMU_CTRL_ENABLE);
+
+	return 0;
 }
 
 static int bcm2712_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
@@ -223,13 +246,9 @@
 	if (mmu) {
 		mydomain->mmu = mmu;
 		mmu->domain = mydomain;
-
-		if (mmu->dma_iova_offset) {
-			domain->geometry.aperture_start =
-				mmu->dma_iova_offset + APERTURE_BASE;
-			domain->geometry.aperture_end =
-				mmu->dma_iova_offset + APERTURE_TOP - 1ul;
-		}
+		domain->geometry.aperture_start = mmu->aperture_base;
+		domain->geometry.aperture_end = mmu->aperture_end - 1ul;
+		domain->geometry.force_aperture = true;
 
 		return 0;
 	}
@@ -241,25 +260,62 @@ static int bcm2712_iommu_map(struct iommu_domain *domain, unsigned long iova,
 			   int prot, gfp_t gfp, size_t *mapped)
 {
 	struct bcm2712_iommu *mmu = domain_to_mmu(domain);
-	u32 entry = MMMU_PTE_VALID | (pa >> MMU_PAGE_SHIFT);
+	u32 entry = MMMU_PTE_VALID | (pa >> IOMMU_PAGE_SHIFT);
 	u32 align = (u32)(iova | pa | bytes);
-	unsigned int p;
 	unsigned long flags;
+	unsigned int p, p_last, lxp;
+	bool dirty_top = false;
 
 	/* Reject if not entirely within our aperture (should never happen) */
 	bytes *= count;
-	if (iova < mmu->dma_iova_offset + APERTURE_BASE ||
-	    iova + bytes > mmu->dma_iova_offset + APERTURE_TOP) {
-		dev_warn(mmu->dev, "%s: iova=0x%lx pa=0x%llx bytes=0x%lx OUT OF RANGE\n",
-			 __func__, iova,
-			 (unsigned long long)pa, (unsigned long)bytes);
+	if (iova < mmu->aperture_base || iova + bytes > mmu->aperture_end) {
 		*mapped = 0;
-
 		return -EINVAL;
 	}
 
+	/* DMA addresses -> page numbers */
+	iova -= mmu->aperture_base;
+	p = iova >> IOMMU_PAGE_SHIFT;
+	p_last = (iova + bytes - 1) >> IOMMU_PAGE_SHIFT;
+
+	/*
+	 * Check we have allocated the required Level-2 tables (in units of
+	 * whole Linux pages, each covering one or more IOMMU-table pages).
+	 * Any new L2 pages need to be linked to the top-level table.
+	 * A failure here will cause the entire map_pages() call to fail.
+	 */
+	spin_lock_irqsave(&mmu->hw_lock, flags);
+	for (lxp = (p >> LX_PAGEWORDS_SHIFT);
+	     lxp <= (p_last >> LX_PAGEWORDS_SHIFT); lxp++) {
+		if (!mmu->tables[lxp]) {
+			unsigned int i, u;
+
+			u = bcm2712_iommu_get_page(mmu, &mmu->tables[lxp]);
+			if (!u)
+				break;
+			u |= MMMU_PTE_VALID;
+			if (!dirty_top) {
+				dma_sync_single_for_cpu(mmu->dev, virt_to_phys(mmu->top_table),
+							PAGE_SIZE, DMA_TO_DEVICE);
+				dirty_top = true;
+			}
+			for (i = lxp << (PAGE_SHIFT - IOMMU_PAGE_SHIFT);
+			     i < (lxp + 1) << (PAGE_SHIFT - IOMMU_PAGE_SHIFT); i++) {
+				mmu->top_table[i] = u++;
+			}
+		}
+	}
+	if (dirty_top)
+		dma_sync_single_for_device(mmu->dev, virt_to_phys(mmu->top_table),
+					   PAGE_SIZE, DMA_TO_DEVICE);
+	if (lxp <= (p_last >> LX_PAGEWORDS_SHIFT)) {
+		spin_unlock_irqrestore(&mmu->hw_lock, flags);
+		*mapped = 0;
+		return -ENOMEM;
+	}
+
 	/* large page and write enable flags */
-	if (!(align & ((1 << HUGEPAGE_SHIFT) - 1)))
+	if (!(align & ((1 << IOMMU_HUGEPAGE_SHIFT) - 1)))
 		entry |= FIELD_PREP(MMMU_PTE_PAGESIZE_MASK, 3);
 	else if (!(align & mmu->superpage_mask) && mmu->superpage_mask)
 		entry |= FIELD_PREP(MMMU_PTE_PAGESIZE_MASK, 2);
@@ -268,26 +324,27 @@ static int bcm2712_iommu_map(struct iommu_domain *domain, unsigned long iova,
 	if (prot & IOMMU_WRITE)
 		entry |= MMMU_PTE_WRITEABLE;
 
-	/* Ensure tables are cache-coherent with CPU */
-	spin_lock_irqsave(&mmu->hw_lock, flags);
-	if (!mmu->dirty) {
-		dma_sync_sgtable_for_cpu(mmu->dev, mmu->sgt, DMA_TO_DEVICE);
-		mmu->dirty = true;
-	}
-
-	/* Make iova relative to table base */
-	iova -= (mmu->dma_iova_offset + APERTURE_BASE);
-
-	/* Iterate over table by smallest native IOMMU page size */
-	for (p = iova >> MMU_PAGE_SHIFT;
-	     p < (iova + bytes) >> MMU_PAGE_SHIFT; p++) {
-		mmu->nmapped_pages += !(mmu->tables[p]);
-		mmu->tables[p] = entry++;
+	/*
+	 * Again iterate over table pages and bring them into CPU ownership.
+	 * Fill in the required PT entries, then give them back to the device.
+	 */
+	while (p <= p_last) {
+		u32 *tbl = mmu->tables[p >> LX_PAGEWORDS_SHIFT];
+
+		dma_sync_single_for_cpu(mmu->dev, virt_to_phys(tbl),
+					PAGE_SIZE, DMA_TO_DEVICE);
+		while (p <= p_last) {
+			mmu->nmapped_pages += !tbl[p & LX_PAGEWORDS_MASK];
+			tbl[p & LX_PAGEWORDS_MASK] = entry++;
+			if (IS_ALIGNED(++p, (1u << LX_PAGEWORDS_SHIFT)))
+				break;
+		}
+		dma_sync_single_for_device(mmu->dev, virt_to_phys(tbl),
+					   PAGE_SIZE, DMA_TO_DEVICE);
 	}
 	spin_unlock_irqrestore(&mmu->hw_lock, flags);
 
 	*mapped = bytes;
-
 	return 0;
 }
 
@@ -297,33 +354,44 @@ static size_t bcm2712_iommu_unmap(struct iommu_domain *domain, unsigned long iov
 {
 	struct bcm2712_iommu *mmu = domain_to_mmu(domain);
 	unsigned long flags;
-	unsigned int p;
+	unsigned int p, p_last;
 
 	/* Reject if not entirely within our aperture (should never happen) */
 	bytes *= count;
-	if (iova < mmu->dma_iova_offset + APERTURE_BASE ||
-	    iova + bytes > mmu->dma_iova_offset + APERTURE_TOP)
+	if (iova < mmu->aperture_base || iova + bytes > mmu->aperture_end)
 		return 0;
 
 	/* Record just the lower and upper bounds in "gather" */
 	spin_lock_irqsave(&mmu->hw_lock, flags);
 	iommu_iotlb_gather_add_range(gather, iova, bytes);
 
-	/* Ensure tables are cache-coherent with CPU */
-	if (!mmu->dirty) {
-		dma_sync_sgtable_for_cpu(mmu->dev, mmu->sgt, DMA_TO_DEVICE);
-		mmu->dirty = true;
-	}
+	/* DMA addresses -> page numbers */
+	iova -= mmu->aperture_base;
+	p = iova >> IOMMU_PAGE_SHIFT;
+	p_last = (iova + bytes - 1) >> IOMMU_PAGE_SHIFT;
 
-	/* Make iova relative to table base */
-	iova -= (mmu->dma_iova_offset + APERTURE_BASE);
-
-	/* Clear table entries, this marks the addresses as illegal */
-	for (p = iova >> MMU_PAGE_SHIFT;
-	     p < (iova + bytes) >> MMU_PAGE_SHIFT;
-	     p++) {
-		mmu->nmapped_pages -= !!(mmu->tables[p]);
-		mmu->tables[p] = 0;
+	/*
+	 * Iterate over tables in Linux pages and bring them into CPU ownership.
+	 * Clear the required PT entries, then give them back to the device.
+	 */
+	while (p <= p_last) {
+		u32 *tbl = mmu->tables[p >> LX_PAGEWORDS_SHIFT];
+
+		if (tbl) {
+			dma_sync_single_for_cpu(mmu->dev, virt_to_phys(tbl),
+						PAGE_SIZE, DMA_TO_DEVICE);
+			while (p <= p_last) {
+				mmu->nmapped_pages -= !!tbl[p & LX_PAGEWORDS_MASK];
+				tbl[p & LX_PAGEWORDS_MASK] = 0;
+				if (IS_ALIGNED(++p, (1u << LX_PAGEWORDS_SHIFT)))
+					break;
+			}
+			dma_sync_single_for_device(mmu->dev, virt_to_phys(tbl),
+						   PAGE_SIZE, DMA_TO_DEVICE);
+		} else {
+			/* No L2 table here; skip to the next Linux-page boundary */
+			p = (p | LX_PAGEWORDS_MASK) + 1u;
+		}
 	}
 	spin_unlock_irqrestore(&mmu->hw_lock, flags);
 
@@ -337,31 +402,30 @@ static int bcm2712_iommu_sync_range(struct iommu_domain *domain,
 	unsigned long flags, iova_end;
 	unsigned int i, p4;
 
-	if (!mmu || !mmu->dirty)
+	if (!mmu)
 		return 0;
 
-	/* Ensure tables are cleaned from CPU cache or write-buffer */
-	spin_lock_irqsave(&mmu->hw_lock, flags);
-	dma_sync_sgtable_for_device(mmu->dev, mmu->sgt, DMA_TO_DEVICE);
-	mmu->dirty = false;
+	iova_end = min(mmu->aperture_end, iova + size);
+	iova = max(mmu->aperture_base, iova);
+	if (iova_end <= iova)
+		return 0;
 
 	/* Flush the shared TLB cache */
+	spin_lock_irqsave(&mmu->hw_lock, flags);
 	if (mmu->cache)
 		bcm2712_iommu_cache_flush(mmu->cache);
 
 	/*
 	 * When flushing a large range or when nothing needs to be kept,
-	 * it's quicker to use the"TLB_CLEAR" flag. Otherwise, invalidate
+	 * it's quicker to use the "TLB_CLEAR" flag. Otherwise, invalidate
 	 * TLB entries in lines of 4 words each. Each flush/clear operation
 	 * should complete almost instantaneously.
 	 */
-	iova -= mmu->dma_iova_offset;
-	iova_end = min(APERTURE_TOP, iova + size);
-	iova = max(APERTURE_BASE, iova);
-	if (mmu->nmapped_pages == 0 || iova_end - iova >= APERTURE_SIZE / 8) {
+	if (mmu->nmapped_pages == 0 || iova_end >= iova + (1ul << 24)) {
 		MMU_WR(MMMU_CTRL_OFFSET,
 		       MMMU_CTRL_CAP_EXCEEDED_ABORT_EN |
 		       MMMU_CTRL_PT_INVALID_ABORT_EN |
+		       MMMU_CTRL_PT_INVALID_EN |
 		       MMMU_CTRL_WRITE_VIOLATION_ABORT_EN |
 		       MMMU_CTRL_TLB_CLEAR |
 		       MMMU_CTRL_STATS_ENABLE |
@@ -372,9 +436,10 @@
 			cpu_relax();
 		}
 	} else {
-		for (p4 = iova >> (MMU_PAGE_SHIFT + 2);
-		     p4 < (iova_end + 3 * MMU_PAGE_SIZE) >> (MMU_PAGE_SHIFT + 2);
-		     p4++) {
+		iova -= mmu->dma_iova_offset;	/* DMA address -> IOMMU virtual address */
+		iova_end -= (mmu->dma_iova_offset + 1);	/* last byte in range */
+		for (p4 = iova >> (IOMMU_PAGE_SHIFT + 2);
+		     p4 <= iova_end >> (IOMMU_PAGE_SHIFT + 2); p4++) {
 			MMU_WR(MMMU_SHOOT_DOWN_OFFSET,
 			       MMMU_SHOOT_DOWN_SHOOT + (p4 << 2));
 			for (i = 0; i < 1024; i++) {
@@ -403,33 +468,32 @@
 
 	if (mmu)
 		bcm2712_iommu_sync_range(domain,
-					 mmu->dma_iova_offset + APERTURE_BASE,
-					 APERTURE_SIZE);
+					 mmu->aperture_base,
+					 mmu->aperture_end - mmu->aperture_base);
 }
 
 static phys_addr_t bcm2712_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
 {
+	phys_addr_t addr = 0;
 	struct bcm2712_iommu *mmu = domain_to_mmu(domain);
-	u32 p;
 
-	iova -= mmu->dma_iova_offset;
-	if (iova >= APERTURE_BASE && iova < APERTURE_TOP) {
+	if (iova >= mmu->aperture_base && iova < mmu->aperture_end) {
 		unsigned long flags;
-		phys_addr_t addr;
+		u32 *ptr;
+		u32 p;
 
 		spin_lock_irqsave(&mmu->hw_lock, flags);
-		p = (iova - APERTURE_BASE) >> MMU_PAGE_SHIFT;
-		p = mmu->tables[p] & 0x0FFFFFFFu;
-		addr = (((phys_addr_t)p) << MMU_PAGE_SHIFT) +
-			(iova & (MMU_PAGE_SIZE - 1u));
-
+		p = (iova - mmu->aperture_base) >> IOMMU_PAGE_SHIFT;
+		ptr = mmu->tables[p >> LX_PAGEWORDS_SHIFT];
+		if (ptr) {
+			p = ptr[p & LX_PAGEWORDS_MASK] & 0x0FFFFFFFu;
+			addr = (((phys_addr_t)p) << IOMMU_PAGE_SHIFT) +
+				(iova & (IOMMU_PAGE_SIZE - 1u));
+		}
 		spin_unlock_irqrestore(&mmu->hw_lock, flags);
-
-		return addr;
-	} else if (iova < APERTURE_BASE) {
-		return (phys_addr_t)iova;
-	} else {
-		return (phys_addr_t)-EINVAL;
 	}
+
+	return addr;
 }
 
 static void bcm2712_iommu_domain_free(struct iommu_domain *domain)
@@ -464,8 +528,6 @@
 	domain->base.type = type;
 	domain->base.ops = &bcm2712_iommu_domain_ops;
-	domain->base.geometry.aperture_start = APERTURE_BASE;
-	domain->base.geometry.aperture_end = APERTURE_TOP - 1ul;
 	domain->base.geometry.force_aperture = true;
 	return &domain->base;
 }
@@ -558,6 +620,7 @@ static int bcm2712_iommu_probe(struct platform_device *pdev)
 {
 	struct bcm2712_iommu *mmu;
 	struct bcm2712_iommu_cache *cache = NULL;
+	unsigned int num_pages;
 	int ret;
 
 	/* First of all, check for an IOMMU shared cache */
@@ -588,38 +651,46 @@
 	spin_lock_init(&mmu->hw_lock);
 
 	/*
-	 * XXX When an IOMMU is downstream of a PCIe RC or some other chip/bus
+	 * Configurable properties. The IOVA base address may be any
+	 * multiple of 4GB; when it is zero, pass-through will be disabled.
+	 * Its size may be any multiple of 256MB up to (PAGE_SIZE << 20).
+	 *
+	 * When the IOMMU is downstream of a PCIe RC or some other chip/bus
 	 * and serves some of the masters thereon (others using pass-through),
-	 * we seem to fumble and lose the "dma-ranges" address offset for
-	 * masters using IOMMU. This property restores it, where needed.
-	 */
-	if (!pdev->dev.of_node ||
-	    of_property_read_u64(pdev->dev.of_node, "dma-iova-offset",
-				 &mmu->dma_iova_offset))
-		mmu->dma_iova_offset = 0;
-
-	/*
-	 * The IOMMU is itself a device that allocates DMA-able memory
-	 * to hold its translation tables. Provided the IOVA aperture
-	 * is no larger than 4 GBytes (so that the L1 table fits within
-	 * a single 4K page), we don't need the tables to be contiguous.
-	 * Assume we can address at least 36 bits (64 GB).
+	 * we seem to fumble and lose the "dma-ranges" address offset for DMA
+	 * masters on the far bus. The "dma-iova-offset" property optionally
+	 * restores it. Where present, we add the offset to all DMA addresses
+	 * (but we don't expect to see any offset on the IOMMU's AXI bus).
 	 */
-	ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(36));
-	WARN_ON(ret);
-	mmu->sgt = dma_alloc_noncontiguous(&pdev->dev, TABLES_ALLOC_SIZE,
-					   DMA_TO_DEVICE, GFP_KERNEL,
-					   DMA_ATTR_ALLOC_SINGLE_PAGES);
-	if (!mmu->sgt) {
-		ret = -ENOMEM;
-		goto done_err;
-	}
-	mmu->tables = dma_vmap_noncontiguous(&pdev->dev, TABLES_ALLOC_SIZE,
-					     mmu->sgt);
-	if (!mmu->tables) {
-		ret = -ENOMEM;
-		goto done_err;
+	mmu->aperture_base = DEFAULT_APERTURE_BASE;
+	mmu->aperture_end = DEFAULT_APERTURE_SIZE;
+	mmu->dma_iova_offset = 0;
+	if (pdev->dev.of_node) {
+		if (!of_property_read_u64(pdev->dev.of_node,
+					  "aperture-base", &mmu->aperture_base))
+			mmu->aperture_base &= ~((1ul << L1_AP_BASE_SHIFT) - 1);
+		if (!of_property_read_u64(pdev->dev.of_node,
+					  "aperture-size", &mmu->aperture_end))
+			mmu->aperture_end = clamp(mmu->aperture_end &
+						  ~((1ul << ADDR_CAP_SHIFT) - 1),
+						  (1ul << ADDR_CAP_SHIFT),
+						  (PAGE_SIZE << 20));
+		if (!of_property_read_u64(pdev->dev.of_node,
+					  "dma-iova-offset", &mmu->dma_iova_offset))
+			mmu->aperture_base += mmu->dma_iova_offset;
 	}
+	num_pages = mmu->aperture_end >> IOVA_TO_LXPAGE_SHIFT;
+	mmu->aperture_end += mmu->aperture_base;
+	dev_info(&pdev->dev, "IOVA aperture 0x%llx..0x%llx including DMA offset 0x%llx\n",
+		 (unsigned long long)mmu->aperture_base,
+		 (unsigned long long)mmu->aperture_end,
+		 (unsigned long long)mmu->dma_iova_offset);
+
+	/* Allocate pointers for each page */
+	mmu->tables = devm_kzalloc(&pdev->dev,
+				   num_pages * sizeof(u32 *), GFP_KERNEL);
+	if (!mmu->tables)
+		return -ENOMEM;
 
 	/* Get IOMMU registers */
 	mmu->reg_base = devm_platform_ioremap_resource(pdev, 0);
@@ -640,38 +711,35 @@
 	if (ret)
 		goto done_err;
 
-	/* Initialize table and hardware */
-	bcm2712_iommu_init(mmu);
-	ret = iommu_device_register(&mmu->iommu, &bcm2712_iommu_ops, &pdev->dev);
+	/* Initialize hardware -- this will try to allocate 2 pages */
+	ret = bcm2712_iommu_init(mmu);
+	if (ret)
+		goto done_err;
 
-	dev_info(&pdev->dev, "%s: Success\n", __func__);
-	return 0;
+	ret = iommu_device_register(&mmu->iommu, &bcm2712_iommu_ops, &pdev->dev);
+	if (!ret)
+		return 0;
 
 done_err:
 	dev_info(&pdev->dev, "%s: Failure %d\n", __func__, ret);
 	if (mmu->group)
 		iommu_group_put(mmu->group);
-	if (mmu->tables)
-		dma_vunmap_noncontiguous(&pdev->dev,
-					 (void *)(mmu->tables));
-	mmu->tables = NULL;
-	if (mmu->sgt)
-		dma_free_noncontiguous(&pdev->dev, TABLES_ALLOC_SIZE,
-				       mmu->sgt, DMA_TO_DEVICE);
-	mmu->sgt = NULL;
-	kfree(mmu);
+
 	return ret;
 }
 
 static void bcm2712_iommu_remove(struct platform_device *pdev)
 {
 	struct bcm2712_iommu *mmu = platform_get_drvdata(pdev);
+	unsigned int i;
+
+	for (i = 0; i < (mmu->aperture_end - mmu->aperture_base) >> IOVA_TO_LXPAGE_SHIFT; i++)
+		bcm2712_iommu_free_page(mmu, mmu->tables[i]);
+	bcm2712_iommu_free_page(mmu, mmu->top_table);
+	bcm2712_iommu_free_page(mmu, mmu->default_page);
 
 	if (mmu->reg_base)
 		MMU_WR(MMMU_CTRL_OFFSET, 0); /* disable the MMU */
-	if (mmu->sgt)
-		dma_free_noncontiguous(&pdev->dev, TABLES_ALLOC_SIZE,
-				       mmu->sgt, DMA_TO_DEVICE);
 }
 
 static const struct of_device_id bcm2712_iommu_of_match[] = {
diff --git a/drivers/iommu/bcm2712-iommu.h b/drivers/iommu/bcm2712-iommu.h
index 31b811e426ddb8..0597b4f0da87cc 100644
--- a/drivers/iommu/bcm2712-iommu.h
+++ b/drivers/iommu/bcm2712-iommu.h
@@ -2,7 +2,7 @@
 /*
  * IOMMU driver for BCM2712
  *
- * Copyright (c) 2023 Raspberry Pi Ltd.
+ * Copyright (c) 2023-2025 Raspberry Pi Ltd.
  */
 
 #ifndef _BCM2712_IOMMU_H
@@ -25,16 +25,18 @@ struct bcm2712_iommu {
 	struct iommu_group *group;
 	struct bcm2712_iommu_domain *domain;
 	char const *name;
-	struct sg_table *sgt;	/* allocated memory for page tables */
-	u32 *tables;		/* kernel mapping for page tables */
+	u64 dma_iova_offset;	/* Hack for IOMMU attached to PCIe RC, included below */
+	u64 aperture_base;	/* Base of IOVA region as passed to DMA peripherals */
+	u64 aperture_end;	/* End of IOVA region as passed to DMA peripherals */
+	u32 **tables;		/* For each Linux page of L2 tables, kernel virtual address */
+	u32 *top_table;		/* Top-level table holding IOMMU page-numbers of L2 tables */
+	u32 *default_page;	/* Page used to trap illegal reads and writes */
 	struct bcm2712_iommu_cache *cache;
 	spinlock_t hw_lock;	/* to protect HW registers */
 	void __iomem *reg_base;
-	u64 dma_iova_offset;	/* Hack for IOMMU attached to PCIe RC */
 	u32 bigpage_mask;
 	u32 superpage_mask;
 	unsigned int nmapped_pages;
-	bool dirty;		/* true when tables are oriented towards CPU */
 };
 
 struct bcm2712_iommu_domain {

From 01363b8126c65baebf51d06ebf82596dc2c8cc17 Mon Sep 17 00:00:00 2001
From: Nick Hollinghurst
Date: Fri, 29 Aug 2025 15:57:19 +0100
Subject: [PATCH 2/2] DT: bcm2712: Increase IOMMU2 (ISP-BE, HEVC) IOVA aperture
 to 4GBytes

This is largely to test a previous change that made the IOMMU aperture
configurable and allocated lazily; it may be useful in its own right.
We expect IOMMU2 to be well-utilized, e.g. when using 64MPix cameras.

Signed-off-by: Nick Hollinghurst
---
 arch/arm64/boot/dts/broadcom/bcm2712-ds.dtsi | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/broadcom/bcm2712-ds.dtsi b/arch/arm64/boot/dts/broadcom/bcm2712-ds.dtsi
index 95482c40a9eb57..f4e6cbc02ecea3 100644
--- a/arch/arm64/boot/dts/broadcom/bcm2712-ds.dtsi
+++ b/arch/arm64/boot/dts/broadcom/bcm2712-ds.dtsi
@@ -285,6 +285,7 @@
 			compatible = "brcm,bcm2712-iommu";
 			reg = <0x10 0x5100 0x0 0x80>;
 			cache = <&iommuc>;
+			aperture-size = <0x1 0x00000000>; // Increase IOVA aperture size to 4GBytes
 			#iommu-cells = <0>;
 		};
 
@@ -497,4 +498,4 @@
 
 &pcie2 {
 	brcm,vdm-qos-map = /bits/ 8 <8 8 8 9 10 10 11 11>;
-};
\ No newline at end of file
+};
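
A note on the two-level index arithmetic in patch 1: once an IOVA has been
made aperture-relative, map_pages() and unmap() split the IOMMU page number
into a Linux-page index (lxp) and a PTE index within that page. The following
standalone sketch (an illustration, not part of the patches; it assumes 4K
Linux pages, PAGE_SHIFT = 12, whereas the driver takes PAGE_SHIFT from the
kernel) mirrors that decomposition:

/* Standalone sketch of the driver's two-level index arithmetic. */
#include <stdint.h>
#include <stdio.h>

#define IOMMU_PAGE_SHIFT     12			/* 4K IOMMU table pages */
#define PAGE_SHIFT           12			/* assumed Linux page size */
#define LX_PAGEWORDS_SHIFT   (PAGE_SHIFT - 2)	/* 32-bit PTEs per Linux page */
#define LX_PAGEWORDS_MASK    ((1u << LX_PAGEWORDS_SHIFT) - 1u)
#define IOVA_TO_LXPAGE_SHIFT (IOMMU_PAGE_SHIFT + LX_PAGEWORDS_SHIFT)

int main(void)
{
	uint64_t aperture_base = 40ull << 30;		/* default base, 40GB */
	uint64_t iova = aperture_base + 0x12345678;	/* address to translate */

	/* Aperture-relative IOMMU page number, as in map_pages()/unmap() */
	uint32_t p   = (uint32_t)((iova - aperture_base) >> IOMMU_PAGE_SHIFT);
	uint32_t lxp = p >> LX_PAGEWORDS_SHIFT;	/* which Linux page of L2 PTEs */
	uint32_t idx = p & LX_PAGEWORDS_MASK;	/* PTE index within that page */

	printf("IOMMU page %u -> mmu->tables[%u][%u]\n",
	       (unsigned)p, (unsigned)lxp, (unsigned)idx);
	printf("each L2 Linux page covers 0x%llx bytes of IOVA\n",
	       (unsigned long long)(1ull << IOVA_TO_LXPAGE_SHIFT));
	return 0;
}

With 4K pages, each Linux page of PTEs covers 4MB of IOVA, so the default 2GB
aperture needs at most 512 such pages (matching the 512 L2 pages the old code
preallocated), now allocated only as they are first touched.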
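When PAGE_SIZE is larger than the 4K IOMMU page, one newly allocated Linux
page holds several consecutive 4K IOMMU table pages, so map_pages() must point
several consecutive top-level entries at it. A minimal sketch of just that
linking step (again illustrative: 16K Linux pages are assumed, and
link_l2_page() is a hypothetical stand-in for the inline loop in the patch):

/* Sketch of the top-level linking, assuming PAGE_SHIFT = 14 (16K pages). */
#include <stdint.h>
#include <stdio.h>

#define IOMMU_PAGE_SHIFT 12
#define PAGE_SHIFT       14
#define MMMU_PTE_VALID   (1u << 28)

static uint32_t top_table[1024];	/* stand-in for mmu->top_table */

static void link_l2_page(unsigned int lxp, uint32_t first_iommu_page)
{
	uint32_t u = first_iommu_page | MMMU_PTE_VALID;
	unsigned int i;

	/* 16K / 4K = 4 consecutive top-level entries per Linux page */
	for (i = lxp << (PAGE_SHIFT - IOMMU_PAGE_SHIFT);
	     i < (lxp + 1u) << (PAGE_SHIFT - IOMMU_PAGE_SHIFT); i++)
		top_table[i] = u++;
}

int main(void)
{
	unsigned int i;

	/* Pretend L2 Linux page 3 landed at IOMMU page number 0x1000 */
	link_l2_page(3, 0x1000);
	for (i = 12; i < 16; i++)
		printf("top_table[%u] = 0x%08x\n", i, (unsigned)top_table[i]);
	return 0;
}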
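For completeness, a hypothetical device-tree node exercising all three new
properties at once (the values are illustrative only; patch 2 above shows the
one real override). Per the driver code, aperture-base is truncated to a 4GB
multiple, aperture-size is truncated to a 256MB multiple and clamped to at
most (PAGE_SIZE << 20) bytes, and any dma-iova-offset is added to the base
internally:

	iommu2: iommu@5100 {
		compatible = "brcm,bcm2712-iommu";
		reg = <0x10 0x5100 0x0 0x80>;
		cache = <&iommuc>;
		aperture-base = <0x14 0x00000000>;	// hypothetical: IOVA region at 80GB
		aperture-size = <0x1 0x00000000>;	// 4GB of translated IOVA space
		dma-iova-offset = <0x10 0x00000000>;	// hypothetical 64GB bus offset
		#iommu-cells = <0>;
	};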