Skip to content

Commit 2a909ae

Browse files
rajbhar authored and alexdeucher committed
drm/amdkfd: CRIU resume shared virtual memory ranges
In the CRIU resume stage, resume all the shared virtual memory ranges from the data stored inside the resuming kfd process during the CRIU restore phase. Also set up xnack mode and free up the resources. KFD_IOCTL_SVM_ATTR_CLR_FLAGS is not available for querying via the get_attr interface, but we must clear the flags during restore as there might be some default flags set when the prange is created. Also handle the invalid PREFETCH attribute values saved during checkpoint by replacing them with another dummy KFD_IOCTL_SVM_ATTR_SET_FLAGS attribute. (rajneesh: Fixed the checkpatch reported problems) Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent c2db32c commit 2a909ae

File tree

3 files changed

+119
-0
lines changed

3 files changed

+119
-0
lines changed

drivers/gpu/drm/amd/amdkfd/kfd_chardev.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2766,7 +2766,17 @@ static int criu_resume(struct file *filep,
27662766
}
27672767

27682768
mutex_lock(&target->mutex);
2769+
ret = kfd_criu_resume_svm(target);
2770+
if (ret) {
2771+
pr_err("kfd_criu_resume_svm failed for %i\n", args->pid);
2772+
goto exit;
2773+
}
2774+
27692775
ret = amdgpu_amdkfd_criu_resume(target->kgd_process_info);
2776+
if (ret)
2777+
pr_err("amdgpu_amdkfd_criu_resume failed for %i\n", args->pid);
2778+
2779+
exit:
27702780
mutex_unlock(&target->mutex);
27712781

27722782
kfd_unref_process(target);

drivers/gpu/drm/amd/amdkfd/kfd_svm.c

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3487,6 +3487,109 @@ svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm,
34873487
return 0;
34883488
}
34893489

3490+
int kfd_criu_resume_svm(struct kfd_process *p)
3491+
{
3492+
struct kfd_ioctl_svm_attribute *set_attr_new, *set_attr = NULL;
3493+
int nattr_common = 4, nattr_accessibility = 1;
3494+
struct criu_svm_metadata *criu_svm_md = NULL;
3495+
struct svm_range_list *svms = &p->svms;
3496+
struct criu_svm_metadata *next = NULL;
3497+
uint32_t set_flags = 0xffffffff;
3498+
int i, j, num_attrs, ret = 0;
3499+
uint64_t set_attr_size;
3500+
struct mm_struct *mm;
3501+
3502+
if (list_empty(&svms->criu_svm_metadata_list)) {
3503+
pr_debug("No SVM data from CRIU restore stage 2\n");
3504+
return ret;
3505+
}
3506+
3507+
mm = get_task_mm(p->lead_thread);
3508+
if (!mm) {
3509+
pr_err("failed to get mm for the target process\n");
3510+
return -ESRCH;
3511+
}
3512+
3513+
num_attrs = nattr_common + (nattr_accessibility * p->n_pdds);
3514+
3515+
i = j = 0;
3516+
list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) {
3517+
pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx (npages)\n",
3518+
i, criu_svm_md->data.start_addr, criu_svm_md->data.size);
3519+
3520+
for (j = 0; j < num_attrs; j++) {
3521+
pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x \ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n",
3522+
i, j, criu_svm_md->data.attrs[j].type,
3523+
i, j, criu_svm_md->data.attrs[j].value);
3524+
switch (criu_svm_md->data.attrs[j].type) {
3525+
/* During Checkpoint operation, the query for
3526+
* KFD_IOCTL_SVM_ATTR_PREFETCH_LOC attribute might
3527+
* return KFD_IOCTL_SVM_LOCATION_UNDEFINED if they were
3528+
* not used by the range which was checkpointed. Care
3529+
* must be taken to not restore with an invalid value
3530+
* otherwise the gpuidx value will be invalid and
3531+
* set_attr would eventually fail so just replace those
3532+
* with another dummy attribute such as
3533+
* KFD_IOCTL_SVM_ATTR_SET_FLAGS.
3534+
*/
3535+
case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3536+
if (criu_svm_md->data.attrs[j].value ==
3537+
KFD_IOCTL_SVM_LOCATION_UNDEFINED) {
3538+
criu_svm_md->data.attrs[j].type =
3539+
KFD_IOCTL_SVM_ATTR_SET_FLAGS;
3540+
criu_svm_md->data.attrs[j].value = 0;
3541+
}
3542+
break;
3543+
case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3544+
set_flags = criu_svm_md->data.attrs[j].value;
3545+
break;
3546+
default:
3547+
break;
3548+
}
3549+
}
3550+
3551+
/* CLR_FLAGS is not available via get_attr during checkpoint but
3552+
* it needs to be inserted before restoring the ranges so
3553+
* allocate extra space for it before calling set_attr
3554+
*/
3555+
set_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
3556+
(num_attrs + 1);
3557+
set_attr_new = krealloc(set_attr, set_attr_size,
3558+
GFP_KERNEL);
3559+
if (!set_attr_new) {
3560+
ret = -ENOMEM;
3561+
goto exit;
3562+
}
3563+
set_attr = set_attr_new;
3564+
3565+
memcpy(set_attr, criu_svm_md->data.attrs, num_attrs *
3566+
sizeof(struct kfd_ioctl_svm_attribute));
3567+
set_attr[num_attrs].type = KFD_IOCTL_SVM_ATTR_CLR_FLAGS;
3568+
set_attr[num_attrs].value = ~set_flags;
3569+
3570+
ret = svm_range_set_attr(p, mm, criu_svm_md->data.start_addr,
3571+
criu_svm_md->data.size, num_attrs + 1,
3572+
set_attr);
3573+
if (ret) {
3574+
pr_err("CRIU: failed to set range attributes\n");
3575+
goto exit;
3576+
}
3577+
3578+
i++;
3579+
}
3580+
exit:
3581+
kfree(set_attr);
3582+
list_for_each_entry_safe(criu_svm_md, next, &svms->criu_svm_metadata_list, list) {
3583+
pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n",
3584+
criu_svm_md->data.start_addr);
3585+
kfree(criu_svm_md);
3586+
}
3587+
3588+
mmput(mm);
3589+
return ret;
3590+
3591+
}
3592+
34903593
int kfd_criu_restore_svm(struct kfd_process *p,
34913594
uint8_t __user *user_priv_ptr,
34923595
uint64_t *priv_data_offset,

drivers/gpu/drm/amd/amdkfd/kfd_svm.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,7 @@ int kfd_criu_restore_svm(struct kfd_process *p,
192192
uint8_t __user *user_priv_ptr,
193193
uint64_t *priv_data_offset,
194194
uint64_t max_priv_data_size);
195+
int kfd_criu_resume_svm(struct kfd_process *p);
195196
struct kfd_process_device *
196197
svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev);
197198
void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_struct *mm);
@@ -253,6 +254,11 @@ static inline int kfd_criu_restore_svm(struct kfd_process *p,
253254
return -EINVAL;
254255
}
255256

257+
/*
 * Stub variant of kfd_criu_resume_svm(): does nothing and reports
 * success (0), so callers need no special-casing.
 * NOTE(review): presumably this is the variant built when
 * CONFIG_HSA_AMD_SVM is disabled, going by the #endif comment that
 * follows -- confirm against the enclosing #if.
 */
static inline int kfd_criu_resume_svm(struct kfd_process *p)
258+
{
259+
return 0;
260+
}
261+
256262
#define KFD_IS_SVM_API_SUPPORTED(dev) false
257263

258264
#endif /* IS_ENABLED(CONFIG_HSA_AMD_SVM) */

0 commit comments

Comments
 (0)