Skip to content

Commit ac2021f

Browse files
dayatsin-amd authored and gregkh committed
drm/amdkfd: Fix checkpoint-restore on multi-xcc
commit f6c0f3d upstream.

GPUs with multi-xcc have multiple MQDs per queue. This patch saves and restores all the MQDs within the partition.

Signed-off-by: David Yat Sin <David.YatSin@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit a578f2a)
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent 910735d commit ac2021f

File tree

3 files changed

+67
-16
lines changed

3 files changed

+67
-16
lines changed

drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2716,7 +2716,7 @@ static void get_queue_checkpoint_info(struct device_queue_manager *dqm,
27162716

27172717
dqm_lock(dqm);
27182718
mqd_mgr = dqm->mqd_mgrs[mqd_type];
2719-
*mqd_size = mqd_mgr->mqd_size;
2719+
*mqd_size = mqd_mgr->mqd_size * NUM_XCC(mqd_mgr->dev->xcc_mask);
27202720
*ctl_stack_size = 0;
27212721

27222722
if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE && mqd_mgr->get_checkpoint_info)

drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c

Lines changed: 51 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -373,7 +373,7 @@ static void get_checkpoint_info(struct mqd_manager *mm, void *mqd, u32 *ctl_stac
373373
{
374374
struct v9_mqd *m = get_mqd(mqd);
375375

376-
*ctl_stack_size = m->cp_hqd_cntl_stack_size;
376+
*ctl_stack_size = m->cp_hqd_cntl_stack_size * NUM_XCC(mm->dev->xcc_mask);
377377
}
378378

379379
static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, void *ctl_stack_dst)
@@ -388,6 +388,24 @@ static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, voi
388388
memcpy(ctl_stack_dst, ctl_stack, m->cp_hqd_cntl_stack_size);
389389
}
390390

391+
static void checkpoint_mqd_v9_4_3(struct mqd_manager *mm,
392+
void *mqd,
393+
void *mqd_dst,
394+
void *ctl_stack_dst)
395+
{
396+
struct v9_mqd *m;
397+
int xcc;
398+
uint64_t size = get_mqd(mqd)->cp_mqd_stride_size;
399+
400+
for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) {
401+
m = get_mqd(mqd + size * xcc);
402+
403+
checkpoint_mqd(mm, m,
404+
(uint8_t *)mqd_dst + sizeof(*m) * xcc,
405+
(uint8_t *)ctl_stack_dst + m->cp_hqd_cntl_stack_size * xcc);
406+
}
407+
}
408+
391409
static void restore_mqd(struct mqd_manager *mm, void **mqd,
392410
struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
393411
struct queue_properties *qp,
@@ -764,13 +782,35 @@ static void restore_mqd_v9_4_3(struct mqd_manager *mm, void **mqd,
764782
const void *mqd_src,
765783
const void *ctl_stack_src, u32 ctl_stack_size)
766784
{
767-
restore_mqd(mm, mqd, mqd_mem_obj, gart_addr, qp, mqd_src, ctl_stack_src, ctl_stack_size);
768-
if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) {
769-
struct v9_mqd *m;
785+
struct kfd_mem_obj xcc_mqd_mem_obj;
786+
u32 mqd_ctl_stack_size;
787+
struct v9_mqd *m;
788+
u32 num_xcc;
789+
int xcc;
770790

771-
m = (struct v9_mqd *) mqd_mem_obj->cpu_ptr;
772-
m->cp_hqd_pq_doorbell_control |= 1 <<
773-
CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT;
791+
uint64_t offset = mm->mqd_stride(mm, qp);
792+
793+
mm->dev->dqm->current_logical_xcc_start++;
794+
795+
num_xcc = NUM_XCC(mm->dev->xcc_mask);
796+
mqd_ctl_stack_size = ctl_stack_size / num_xcc;
797+
798+
memset(&xcc_mqd_mem_obj, 0x0, sizeof(struct kfd_mem_obj));
799+
800+
/* Set the MQD pointer and gart address to XCC0 MQD */
801+
*mqd = mqd_mem_obj->cpu_ptr;
802+
if (gart_addr)
803+
*gart_addr = mqd_mem_obj->gpu_addr;
804+
805+
for (xcc = 0; xcc < num_xcc; xcc++) {
806+
get_xcc_mqd(mqd_mem_obj, &xcc_mqd_mem_obj, offset * xcc);
807+
restore_mqd(mm, (void **)&m,
808+
&xcc_mqd_mem_obj,
809+
NULL,
810+
qp,
811+
(uint8_t *)mqd_src + xcc * sizeof(*m),
812+
(uint8_t *)ctl_stack_src + xcc * mqd_ctl_stack_size,
813+
mqd_ctl_stack_size);
774814
}
775815
}
776816
static int destroy_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
@@ -906,7 +946,6 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
906946
mqd->free_mqd = kfd_free_mqd_cp;
907947
mqd->is_occupied = kfd_is_occupied_cp;
908948
mqd->get_checkpoint_info = get_checkpoint_info;
909-
mqd->checkpoint_mqd = checkpoint_mqd;
910949
mqd->mqd_size = sizeof(struct v9_mqd);
911950
mqd->mqd_stride = mqd_stride_v9;
912951
#if defined(CONFIG_DEBUG_FS)
@@ -918,16 +957,18 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
918957
mqd->init_mqd = init_mqd_v9_4_3;
919958
mqd->load_mqd = load_mqd_v9_4_3;
920959
mqd->update_mqd = update_mqd_v9_4_3;
921-
mqd->restore_mqd = restore_mqd_v9_4_3;
922960
mqd->destroy_mqd = destroy_mqd_v9_4_3;
923961
mqd->get_wave_state = get_wave_state_v9_4_3;
962+
mqd->checkpoint_mqd = checkpoint_mqd_v9_4_3;
963+
mqd->restore_mqd = restore_mqd_v9_4_3;
924964
} else {
925965
mqd->init_mqd = init_mqd;
926966
mqd->load_mqd = load_mqd;
927967
mqd->update_mqd = update_mqd;
928-
mqd->restore_mqd = restore_mqd;
929968
mqd->destroy_mqd = kfd_destroy_mqd_cp;
930969
mqd->get_wave_state = get_wave_state;
970+
mqd->checkpoint_mqd = checkpoint_mqd;
971+
mqd->restore_mqd = restore_mqd;
931972
}
932973
break;
933974
case KFD_MQD_TYPE_HIQ:

drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -914,7 +914,10 @@ static int criu_checkpoint_queues_device(struct kfd_process_device *pdd,
914914

915915
q_data = (struct kfd_criu_queue_priv_data *)q_private_data;
916916

917-
/* data stored in this order: priv_data, mqd, ctl_stack */
917+
/*
918+
* data stored in this order:
919+
* priv_data, mqd[xcc0], mqd[xcc1],..., ctl_stack[xcc0], ctl_stack[xcc1]...
920+
*/
918921
q_data->mqd_size = mqd_size;
919922
q_data->ctl_stack_size = ctl_stack_size;
920923

@@ -963,7 +966,7 @@ int kfd_criu_checkpoint_queues(struct kfd_process *p,
963966
}
964967

965968
static void set_queue_properties_from_criu(struct queue_properties *qp,
966-
struct kfd_criu_queue_priv_data *q_data)
969+
struct kfd_criu_queue_priv_data *q_data, uint32_t num_xcc)
967970
{
968971
qp->is_interop = false;
969972
qp->queue_percent = q_data->q_percent;
@@ -976,7 +979,11 @@ static void set_queue_properties_from_criu(struct queue_properties *qp,
976979
qp->eop_ring_buffer_size = q_data->eop_ring_buffer_size;
977980
qp->ctx_save_restore_area_address = q_data->ctx_save_restore_area_address;
978981
qp->ctx_save_restore_area_size = q_data->ctx_save_restore_area_size;
979-
qp->ctl_stack_size = q_data->ctl_stack_size;
982+
if (q_data->type == KFD_QUEUE_TYPE_COMPUTE)
983+
qp->ctl_stack_size = q_data->ctl_stack_size / num_xcc;
984+
else
985+
qp->ctl_stack_size = q_data->ctl_stack_size;
986+
980987
qp->type = q_data->type;
981988
qp->format = q_data->format;
982989
}
@@ -1036,12 +1043,15 @@ int kfd_criu_restore_queue(struct kfd_process *p,
10361043
goto exit;
10371044
}
10381045

1039-
/* data stored in this order: mqd, ctl_stack */
1046+
/*
1047+
* data stored in this order:
1048+
* mqd[xcc0], mqd[xcc1],..., ctl_stack[xcc0], ctl_stack[xcc1]...
1049+
*/
10401050
mqd = q_extra_data;
10411051
ctl_stack = mqd + q_data->mqd_size;
10421052

10431053
memset(&qp, 0, sizeof(qp));
1044-
set_queue_properties_from_criu(&qp, q_data);
1054+
set_queue_properties_from_criu(&qp, q_data, NUM_XCC(pdd->dev->adev->gfx.xcc_mask));
10451055

10461056
print_queue_properties(&qp);
10471057

0 commit comments

Comments
 (0)