
Commit f728c17

Farah-kassabri authored and ogabbay committed
accel/habanalabs/gaudi2: move HMMU page tables to device memory
Currently the HMMU page tables reside in host memory, which causes a host access from the device for every page walk. This can affect PCIe bandwidth in certain scenarios. To prevent that problem, move the HMMU page tables to device memory so that a miss transaction reads the hops from there instead of going to the host.

Signed-off-by: Farah Kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
1 parent 246d8b6 commit f728c17
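
The gist of the change, as a minimal hedged sketch (the wrapper below is hypothetical and only strings together helpers this commit adds in common/mmu/mmu.c): the driver keeps a host-side shadow copy of each hop for its own software walks, writes the real PTE into device memory through the ASIC write_pte callback, and flushes, so the HMMU walker resolves misses from device memory instead of crossing PCIe to the host.

/* Hypothetical illustration only - not part of the commit. */
static void example_dr_map_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 pte_val)
{
        /* Write pte_val into the device-resident hop (via the ASIC write_pte
         * callback) and mirror it into the host shadow copy used for walks.
         */
        hl_mmu_dr_write_final_pte(ctx, shadow_pte_addr, pte_val);

        /* Barrier plus a dummy device read so the HMMU sees the update. */
        hl_mmu_dr_flush(ctx);
}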

File tree

10 files changed (+836, -382 lines)


drivers/accel/habanalabs/common/habanalabs.h

Lines changed: 26 additions & 0 deletions
@@ -443,18 +443,22 @@ enum hl_collective_mode {
  *                   a CB handle can be provided for jobs on this queue.
  *                   Otherwise, a CB address must be provided.
  * @collective_mode: collective mode of current queue
+ * @q_dram_bd_address: PQ dram address, used when PQ need to reside in DRAM.
  * @driver_only: true if only the driver is allowed to send a job to this queue,
  *               false otherwise.
  * @binned: True if the queue is binned out and should not be used
  * @supports_sync_stream: True if queue supports sync stream
+ * @dram_bd: True if the bd should be copied to dram, needed for PQ which has been allocated on dram
  */
 struct hw_queue_properties {
         enum hl_queue_type type;
         enum queue_cb_alloc_flags cb_alloc_flags;
         enum hl_collective_mode collective_mode;
+        u64 q_dram_bd_address;
         u8 driver_only;
         u8 binned;
         u8 supports_sync_stream;
+        u8 dram_bd;
 };
 
 /**
@@ -1052,6 +1056,8 @@ struct hl_encaps_signals_mgr {
  * @collective_mode: collective mode of current queue
  * @kernel_address: holds the queue's kernel virtual address.
  * @bus_address: holds the queue's DMA address.
+ * @pq_dram_address: hold the dram address when the PQ is allocated, used when dram_bd is true in
+ *                   queue properites.
  * @pi: holds the queue's pi value.
  * @ci: holds the queue's ci value, AS CALCULATED BY THE DRIVER (not real ci).
  * @hw_queue_id: the id of the H/W queue.
@@ -1061,6 +1067,7 @@ struct hl_encaps_signals_mgr {
  * @valid: is the queue valid (we have array of 32 queues, not all of them
  *         exist).
  * @supports_sync_stream: True if queue supports sync stream
+ * @dram_bd: True if the bd should be copied to dram, needed for PQ which has been allocated on dram
  */
 struct hl_hw_queue {
         struct hl_cs_job **shadow_queue;
@@ -1069,6 +1076,7 @@ struct hl_hw_queue {
         enum hl_collective_mode collective_mode;
         void *kernel_address;
         dma_addr_t bus_address;
+        u64 pq_dram_address;
         u32 pi;
         atomic_t ci;
         u32 hw_queue_id;
@@ -1077,6 +1085,7 @@ struct hl_hw_queue {
         u16 int_queue_len;
         u8 valid;
         u8 supports_sync_stream;
+        u8 dram_bd;
 };
 
 /**
@@ -3889,13 +3898,30 @@ int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_
                 struct hl_hr_mmu_funcs *hr_func);
 int hl_mmu_if_set_funcs(struct hl_device *hdev);
 void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
+void hl_mmu_v2_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
 void hl_mmu_v2_hr_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
 int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr);
 int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
                         struct hl_mmu_hop_info *hops);
 u64 hl_mmu_scramble_addr(struct hl_device *hdev, u64 addr);
 u64 hl_mmu_descramble_addr(struct hl_device *hdev, u64 addr);
 bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr);
+struct pgt_info *hl_mmu_dr_get_pgt_info(struct hl_ctx *ctx, u64 hop_addr);
+void hl_mmu_dr_free_hop(struct hl_ctx *ctx, u64 hop_addr);
+void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info);
+u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx);
+u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx);
+void hl_mmu_dr_write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val);
+void hl_mmu_dr_write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val);
+void hl_mmu_dr_clear_pte(struct hl_ctx *ctx, u64 pte_addr);
+u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr);
+void hl_mmu_dr_get_pte(struct hl_ctx *ctx, u64 hop_addr);
+int hl_mmu_dr_put_pte(struct hl_ctx *ctx, u64 hop_addr);
+u64 hl_mmu_dr_get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, bool *is_new_hop);
+u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx);
+void hl_mmu_dr_flush(struct hl_ctx *ctx);
+int hl_mmu_dr_init(struct hl_device *hdev);
+void hl_mmu_dr_fini(struct hl_device *hdev);
 
 int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
                                 void __iomem *dst, u32 src_offset, u32 size);
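
The two new hw_queue_properties fields are consumed generically by hw_queue.c below; a hedged sketch of how an ASIC back-end could populate them for a queue whose PQ lives in DRAM (the function name and base address here are hypothetical, and the real per-ASIC wiring is in parts of the commit not shown in this excerpt):

/* Hypothetical example - illustrates the new fields only. */
static void example_set_dram_pq_props(struct hw_queue_properties *props,
                                        u64 dram_pq_base)
{
        props->dram_bd = 1;                       /* BDs must be copied to DRAM */
        props->q_dram_bd_address = dram_pq_base;  /* where this queue's PQ resides */
}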

drivers/accel/habanalabs/common/hw_queue.c

Lines changed: 17 additions & 0 deletions
@@ -84,14 +84,25 @@ void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
                 u32 ctl, u32 len, u64 ptr)
 {
         struct hl_bd *bd;
+        u64 addr;
+        int i;
 
         bd = q->kernel_address;
         bd += hl_pi_2_offset(q->pi);
         bd->ctl = cpu_to_le32(ctl);
         bd->len = cpu_to_le32(len);
         bd->ptr = cpu_to_le64(ptr);
 
+        if (q->dram_bd)
+                for (i = 0 ; i < 2 ; i++) {
+                        addr = q->pq_dram_address +
+                                ((hl_pi_2_offset(q->pi) * sizeof(struct hl_bd)) + (i * sizeof(u64)));
+                        hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM, addr,
+                                                        (u64 *)(bd) + i, DEBUGFS_WRITE64);
+                }
+
         q->pi = hl_queue_inc_ptr(q->pi);
+
         hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
 }
 
@@ -1087,12 +1098,18 @@ int hl_hw_queues_create(struct hl_device *hdev)
                 q->supports_sync_stream =
                                 asic->hw_queues_props[i].supports_sync_stream;
                 q->collective_mode = asic->hw_queues_props[i].collective_mode;
+                q->dram_bd = asic->hw_queues_props[i].dram_bd;
+
                 rc = queue_init(hdev, q, i);
                 if (rc) {
                         dev_err(hdev->dev,
                                 "failed to initialize queue %d\n", i);
                         goto release_queues;
                 }
+
+                /* Set DRAM PQ address for the queue if it should be at DRAM */
+                if (q->dram_bd)
+                        q->pq_dram_address = asic->hw_queues_props[i].q_dram_bd_address;
         }
 
         return 0;
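
The two-iteration copy in hl_hw_queue_submit_bd() above relies on struct hl_bd spanning two 64-bit words (16 bytes); a hedged sketch of the address math it performs, factored into a hypothetical helper:

/* Hypothetical helper mirroring the address computation above. */
static u64 example_bd_dram_word_addr(const struct hl_hw_queue *q, int word)
{
        /* DRAM base of the PQ + offset of the BD at the current pi
         * + offset of the 64-bit word being written (word 0 or 1).
         */
        return q->pq_dram_address +
                hl_pi_2_offset(q->pi) * sizeof(struct hl_bd) +
                word * sizeof(u64);
}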
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 HL_COMMON_MMU_FILES := common/mmu/mmu.o common/mmu/mmu_v1.o \
-		common/mmu/mmu_v2_hr.o
+		common/mmu/mmu_v2.o common/mmu/mmu_v2_hr.o

drivers/accel/habanalabs/common/mmu/mmu.c

Lines changed: 221 additions & 2 deletions
@@ -585,6 +585,8 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
 
 int hl_mmu_if_set_funcs(struct hl_device *hdev)
 {
+        struct asic_fixed_properties *prop = &hdev->asic_prop;
+
         if (hdev->mmu_disable)
                 return 0;
 
@@ -597,8 +599,9 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev)
         case ASIC_GAUDI2:
         case ASIC_GAUDI2B:
         case ASIC_GAUDI2C:
-                /* MMUs in Gaudi2 are always host resident */
-                hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]);
+                hl_mmu_v2_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]);
+                if (prop->pmmu.host_resident)
+                        hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]);
                 break;
         default:
                 dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
@@ -1209,3 +1212,219 @@ int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_
         return 0;
 }
 
+struct pgt_info *hl_mmu_dr_get_pgt_info(struct hl_ctx *ctx, u64 hop_addr)
+{
+        struct pgt_info *pgt_info = NULL;
+
+        hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node,
+                                (unsigned long) hop_addr)
+                if (hop_addr == pgt_info->shadow_addr)
+                        break;
+
+        return pgt_info;
+}
+
+void hl_mmu_dr_free_hop(struct hl_ctx *ctx, u64 hop_addr)
+{
+        struct pgt_info *pgt_info = hl_mmu_dr_get_pgt_info(ctx, hop_addr);
+
+        hl_mmu_dr_free_pgt_node(ctx, pgt_info);
+}
+
+void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info)
+{
+        struct hl_device *hdev = ctx->hdev;
+
+        gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr,
+                        hdev->asic_prop.mmu_hop_table_size);
+        hash_del(&pgt_info->node);
+        kfree((u64 *) (uintptr_t) pgt_info->shadow_addr);
+        kfree(pgt_info);
+}
+
+u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx)
+{
+        return ctx->hdev->asic_prop.mmu_pgt_addr +
+                        (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
+}
+
+u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx)
+{
+        return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 +
+                        (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
+}
+
+u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr)
+{
+        u64 page_mask = ctx->hdev->asic_prop.mmu_hop_table_size - 1;
+        u64 shadow_hop_addr = shadow_addr & (~page_mask);
+        u64 pte_offset = shadow_addr & page_mask;
+        u64 phys_hop_addr;
+
+        if (shadow_hop_addr != hl_mmu_dr_get_hop0_addr(ctx))
+                phys_hop_addr = hl_mmu_dr_get_pgt_info(ctx, shadow_hop_addr)->phys_addr;
+        else
+                phys_hop_addr = hl_mmu_dr_get_phys_hop0_addr(ctx);
+
+        return phys_hop_addr + pte_offset;
+}
+
+void hl_mmu_dr_write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
+{
+        u64 phys_val = hl_mmu_dr_get_phys_addr(ctx, val);
+
+        ctx->hdev->asic_funcs->write_pte(ctx->hdev, hl_mmu_dr_get_phys_addr(ctx, shadow_pte_addr),
+                                        phys_val);
+
+        *(u64 *) (uintptr_t) shadow_pte_addr = val;
+}
+
+void hl_mmu_dr_write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
+{
+        ctx->hdev->asic_funcs->write_pte(ctx->hdev,
+                        hl_mmu_dr_get_phys_addr(ctx, shadow_pte_addr), val);
+        *(u64 *) (uintptr_t) shadow_pte_addr = val;
+}
+
+void hl_mmu_dr_clear_pte(struct hl_ctx *ctx, u64 pte_addr)
+{
+        hl_mmu_dr_write_final_pte(ctx, pte_addr, 0);
+}
+
+void hl_mmu_dr_get_pte(struct hl_ctx *ctx, u64 hop_addr)
+{
+        hl_mmu_dr_get_pgt_info(ctx, hop_addr)->num_of_ptes++;
+}
+
+int hl_mmu_dr_put_pte(struct hl_ctx *ctx, u64 hop_addr)
+{
+        struct pgt_info *pgt_info = hl_mmu_dr_get_pgt_info(ctx, hop_addr);
+        int num_of_ptes_left;
+
+        pgt_info->num_of_ptes--;
+
+        /*
+         * Need to save the number of ptes left because hl_mmu_free_hop might free
+         * the pgt_info
+         */
+        num_of_ptes_left = pgt_info->num_of_ptes;
+        if (!num_of_ptes_left)
+                hl_mmu_dr_free_pgt_node(ctx, pgt_info);
+
+        return num_of_ptes_left;
+}
+
+u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx)
+{
+        struct hl_device *hdev = ctx->hdev;
+        struct asic_fixed_properties *prop = &hdev->asic_prop;
+        struct pgt_info *pgt_info;
+        u64 phys_addr, shadow_addr;
+
+        pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL);
+        if (!pgt_info)
+                return ULLONG_MAX;
+
+        phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool,
+                                        prop->mmu_hop_table_size);
+        if (!phys_addr) {
+                dev_err(hdev->dev, "failed to allocate page\n");
+                goto pool_add_err;
+        }
+
+        shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size,
+                                                GFP_KERNEL);
+        if (!shadow_addr)
+                goto shadow_err;
+
+        pgt_info->phys_addr = phys_addr;
+        pgt_info->shadow_addr = shadow_addr;
+        pgt_info->ctx = ctx;
+        pgt_info->num_of_ptes = 0;
+        hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr);
+
+        return shadow_addr;
+
+shadow_err:
+        gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool,
+                        phys_addr, prop->mmu_hop_table_size);
+pool_add_err:
+        kfree(pgt_info);
+
+        return ULLONG_MAX;
+}
+
+u64 hl_mmu_dr_get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, bool *is_new_hop)
+{
+        u64 hop_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte);
+
+        if (hop_addr == ULLONG_MAX) {
+                hop_addr = hl_mmu_dr_alloc_hop(ctx);
+                *is_new_hop = (hop_addr != ULLONG_MAX);
+        }
+
+        return hop_addr;
+}
+
+void hl_mmu_dr_flush(struct hl_ctx *ctx)
+{
+        /* flush all writes from all cores to reach PCI */
+        mb();
+        ctx->hdev->asic_funcs->read_pte(ctx->hdev, hl_mmu_dr_get_phys_hop0_addr(ctx));
+}
+
+int hl_mmu_dr_init(struct hl_device *hdev)
+{
+        struct asic_fixed_properties *prop = &hdev->asic_prop;
+        int rc;
+
+        hdev->mmu_priv.dr.mmu_pgt_pool =
+                        gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
+
+        if (!hdev->mmu_priv.dr.mmu_pgt_pool) {
+                dev_err(hdev->dev, "Failed to create page gen pool\n");
+                return -ENOMEM;
+        }
+
+        rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr +
+                        prop->mmu_hop0_tables_total_size,
+                        prop->dmmu.pgt_size - prop->mmu_hop0_tables_total_size,
+                        -1);
+        if (rc) {
+                dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
+                goto err_pool_add;
+        }
+
+        hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid,
+                                                prop->mmu_hop_table_size, GFP_KERNEL);
+        if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) {
+                rc = -ENOMEM;
+                goto err_pool_add;
+        }
+
+        /* MMU H/W init will be done in device hw_init() */
+
+        return 0;
+
+err_pool_add:
+        gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool);
+
+        return rc;
+}
+
+void hl_mmu_dr_fini(struct hl_device *hdev)
+{
+        /* MMU H/W fini was already done in device hw_fini() */
+
+        if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0))
+                return;
+
+        kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0);
+        gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool);
+
+        /* Make sure that if we arrive here again without init was
+         * called we won't cause kernel panic. This can happen for
+         * example if we fail during hard reset code at certain points
+         */
+        hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL;
+}
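
Taken together, the hl_mmu_dr_* helpers give the v2 MMU code a device-resident counterpart to the host-resident walk infrastructure. A hedged sketch of how a map path could use them (the flow and function below are hypothetical; the real Gaudi2 walk code lives in common/mmu/mmu_v2.c, added elsewhere in this commit):

/* Hypothetical, simplified map step - for illustration only. */
static int example_dr_map_step(struct hl_ctx *ctx, u64 parent_hop_addr,
                                u64 parent_shadow_pte, u64 curr_pte)
{
        bool is_new_hop = false;
        u64 hop_addr;

        /* Reuse the next hop if curr_pte already points at one, otherwise
         * allocate a hop from the device-memory gen pool plus a host shadow.
         */
        hop_addr = hl_mmu_dr_get_alloc_next_hop_addr(ctx, curr_pte, &is_new_hop);
        if (hop_addr == ULLONG_MAX)
                return -ENOMEM;

        if (is_new_hop) {
                /* Point the parent PTE at the new hop (real code also ORs in
                 * the present/valid bits) and take a reference on the parent.
                 */
                hl_mmu_dr_write_pte(ctx, parent_shadow_pte, hop_addr);
                hl_mmu_dr_get_pte(ctx, parent_hop_addr);
        }

        /* Make the device-resident updates visible to the HMMU walker. */
        hl_mmu_dr_flush(ctx);

        return 0;
}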
