@@ -144,31 +144,79 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 	}
 }
 
-static void mr_leaf_free_action(struct work_struct *work)
+/*
+ * This must be called after the mr has been removed from implicit_children
+ * and odp_mkeys and the SRCU synchronized.  NOTE: The MR does not necessarily
+ * have to be empty here, parallel page faults could have raced with the free
+ * process and added pages to it.
+ */
+static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
 {
-	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
-	int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
-	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+	struct mlx5_ib_mr *imr = mr->parent;
 	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
 	int srcu_key;
 
-	mr->parent = NULL;
-	synchronize_srcu(&mr->dev->odp_srcu);
+	/* implicit_child_mr's are not allowed to have deferred work */
+	WARN_ON(atomic_read(&mr->num_deferred_work));
 
-	if (xa_load(&mr->dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key))) {
+	if (need_imr_xlt) {
 		srcu_key = srcu_read_lock(&mr->dev->odp_srcu);
 		mutex_lock(&odp_imr->umem_mutex);
-		mlx5_ib_update_xlt(imr, idx, 1, 0,
+		mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
 				   MLX5_IB_UPD_XLT_INDIRECT |
 				   MLX5_IB_UPD_XLT_ATOMIC);
 		mutex_unlock(&odp_imr->umem_mutex);
 		srcu_read_unlock(&mr->dev->odp_srcu, srcu_key);
 	}
-	ib_umem_odp_release(odp);
+
+	mr->parent = NULL;
 	mlx5_mr_cache_free(mr->dev, mr);
+	ib_umem_odp_release(odp);
+	atomic_dec(&imr->num_deferred_work);
+}
+
+static void free_implicit_child_mr_work(struct work_struct *work)
+{
+	struct mlx5_ib_mr *mr =
+		container_of(work, struct mlx5_ib_mr, odp_destroy.work);
+
+	free_implicit_child_mr(mr, true);
+}
+
+static void free_implicit_child_mr_rcu(struct rcu_head *head)
+{
+	struct mlx5_ib_mr *mr =
+		container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);
+
+	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
+	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
+	queue_work(system_unbound_wq, &mr->odp_destroy.work);
+}
+
+static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
+{
+	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+	struct mlx5_ib_mr *imr = mr->parent;
 
-	if (atomic_dec_and_test(&imr->num_leaf_free))
-		wake_up(&imr->q_leaf_free);
+	xa_lock(&imr->implicit_children);
+	/*
+	 * This can race with mlx5_ib_free_implicit_mr(), the first one to
+	 * reach the xa lock wins the race and destroys the MR.
+	 */
+	if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
+	    mr)
+		goto out_unlock;
+
+	__xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
+	atomic_inc(&imr->num_deferred_work);
+	call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu,
+		  free_implicit_child_mr_rcu);
+
+out_unlock:
+	xa_unlock(&imr->implicit_children);
 }
 
 void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
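
The new free_implicit_child_mr_rcu() above uses a common kernel idiom: an RCU (here SRCU) callback runs in softirq context and must not sleep, so the sleeping destruction is bounced to a workqueue. Below is a minimal, self-contained sketch of that idiom; my_obj and the function names are hypothetical, not from the patch. The odp_destroy.rcu/.work accessors suggest the real struct overlays both members in a union, which is safe because the RCU core is finished with the rcu_head by the time its callback is invoked.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_obj {
	union {
		struct rcu_head rcu;
		struct work_struct work;
	} destroy;
};

static void my_obj_free_work(struct work_struct *work)
{
	struct my_obj *obj = container_of(work, struct my_obj, destroy.work);

	kfree(obj);	/* sleeping teardown is legal in process context */
}

static void my_obj_free_rcu(struct rcu_head *head)
{
	struct my_obj *obj = container_of(head, struct my_obj, destroy.rcu);

	/*
	 * RCU callbacks run in softirq context and must not sleep, so
	 * bounce the real destruction to a workqueue. Reusing the union
	 * storage here is fine: the rcu_head is done once this runs.
	 */
	INIT_WORK(&obj->destroy.work, my_obj_free_work);
	queue_work(system_unbound_wq, &obj->destroy.work);
}

In the patch the callback is queued with call_srcu(&mr->dev->odp_srcu, ...), which places the same no-sleep constraint on the callback as call_rcu() does.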
@@ -240,15 +288,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 
 	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
-	if (unlikely(!umem_odp->npages && mr->parent &&
-		     !umem_odp->dying)) {
-		xa_erase(&mr->parent->implicit_children,
-			 ib_umem_start(umem_odp) >> MLX5_IMR_MTT_SHIFT);
-		xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
-		umem_odp->dying = 1;
-		atomic_inc(&mr->parent->num_leaf_free);
-		schedule_work(&umem_odp->work);
-	}
+	if (unlikely(!umem_odp->npages && mr->parent))
+		destroy_unused_implicit_child_mr(mr);
 	mutex_unlock(&umem_odp->umem_mutex);
 }
 
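
This invalidate path and mlx5_ib_free_implicit_mr() can both try to tear down the same child, and the dying flag that used to arbitrate is gone: destroy_unused_implicit_child_mr() now resolves the race with __xa_cmpxchg() under xa_lock(), so exactly one path observes the old entry and owns the destruction. A minimal sketch of that claim idiom follows; claim_child and its arguments are hypothetical names, not part of the patch.

#include <linux/xarray.h>

/*
 * Returns true for exactly one caller per (idx, child) pair: whoever
 * swaps the entry to NULL first wins and must do the teardown; every
 * later caller sees a different value and backs off.
 */
static bool claim_child(struct xarray *children, unsigned long idx,
			void *child)
{
	void *old;

	xa_lock(children);
	old = __xa_cmpxchg(children, idx, child, NULL, GFP_ATOMIC);
	xa_unlock(children);
	return old == child;
}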
@@ -375,7 +416,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE;
 	mr->parent = imr;
 	odp->private = mr;
-	INIT_WORK(&odp->work, mr_leaf_free_action);
 
 	err = mlx5_ib_update_xlt(mr, 0,
 				 MLX5_IMR_MTT_ENTRIES,
@@ -391,7 +431,11 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	 * Once the store to either xarray completes any error unwind has to
 	 * use synchronize_srcu(). Avoid this with xa_reserve()
 	 */
-	ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr, GFP_KERNEL);
+	ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
+			 GFP_KERNEL);
+	if (likely(!ret))
+		xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+			 &mr->mmkey, GFP_ATOMIC);
 	if (unlikely(ret)) {
 		if (xa_is_err(ret)) {
 			ret = ERR_PTR(xa_err(ret));
@@ -404,9 +448,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 		goto out_release;
 	}
 
-	xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
-		 &mr->mmkey, GFP_ATOMIC);
-
 	mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr);
 	return mr;
 
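
These two hunks enforce the ordering the comment asks for: the mkey is published in odp_mkeys only after the implicit_children cmpxchg has succeeded, so a failed insert never has to unpublish anything and the error unwind never needs synchronize_srcu(). The comment also names xa_reserve(); a minimal sketch of that idiom, under hypothetical names (publish_child is not part of the patch), looks like this.

#include <linux/xarray.h>

/*
 * Reserve the slot before doing any fallible setup. Storing into a
 * reserved slot cannot fail for lack of memory, so nothing ever becomes
 * visible to SRCU readers until every step that can fail has succeeded.
 */
static int publish_child(struct xarray *xa, unsigned long index, void *child)
{
	int err;

	err = xa_reserve(xa, index, GFP_KERNEL);
	if (err)
		return err;

	/*
	 * ... fallible setup goes here; on error call
	 * xa_release(xa, index) and return without ever publishing ...
	 */

	xa_store(xa, index, child, GFP_ATOMIC);
	return 0;
}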
@@ -445,9 +486,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	imr->ibmr.lkey = imr->mmkey.key;
 	imr->ibmr.rkey = imr->mmkey.key;
 	imr->umem = &umem_odp->umem;
-	init_waitqueue_head(&imr->q_leaf_free);
-	atomic_set(&imr->num_leaf_free, 0);
-	atomic_set(&imr->num_pending_prefetch, 0);
+	atomic_set(&imr->num_deferred_work, 0);
 	xa_init(&imr->implicit_children);
 
 	err = mlx5_ib_update_xlt(imr, 0,
@@ -477,35 +516,48 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 {
 	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+	struct mlx5_ib_dev *dev = imr->dev;
+	struct list_head destroy_list;
 	struct mlx5_ib_mr *mtt;
+	struct mlx5_ib_mr *tmp;
 	unsigned long idx;
 
-	mutex_lock(&odp_imr->umem_mutex);
-	xa_for_each (&imr->implicit_children, idx, mtt) {
-		struct ib_umem_odp *umem_odp = to_ib_umem_odp(mtt->umem);
+	INIT_LIST_HEAD(&destroy_list);
 
-		xa_erase(&imr->implicit_children, idx);
+	xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
+	/*
+	 * This stops the SRCU protected page fault path from touching either
+	 * the imr or any children. The page fault path can only reach the
+	 * children xarray via the imr.
+	 */
+	synchronize_srcu(&dev->odp_srcu);
 
-		mutex_lock(&umem_odp->umem_mutex);
-		ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
-					    ib_umem_end(umem_odp));
+	xa_lock(&imr->implicit_children);
+	xa_for_each (&imr->implicit_children, idx, mtt) {
+		__xa_erase(&imr->implicit_children, idx);
+		__xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mtt->mmkey.key));
+		list_add(&mtt->odp_destroy.elm, &destroy_list);
+	}
+	xa_unlock(&imr->implicit_children);
 
-		if (umem_odp->dying) {
-			mutex_unlock(&umem_odp->umem_mutex);
-			continue;
-		}
+	/* Fence access to the child pointers via the pagefault thread */
+	synchronize_srcu(&dev->odp_srcu);
 
-		umem_odp->dying = 1;
-		atomic_inc(&imr->num_leaf_free);
-		schedule_work(&umem_odp->work);
-		mutex_unlock(&umem_odp->umem_mutex);
+	/*
+	 * num_deferred_work can only be incremented inside the odp_srcu, or
+	 * under xa_lock while the child is in the xarray. Thus at this point
+	 * it is only decreasing, and all work holding it is now on the wq.
+	 */
+	if (atomic_read(&imr->num_deferred_work)) {
+		flush_workqueue(system_unbound_wq);
+		WARN_ON(atomic_read(&imr->num_deferred_work));
 	}
-	mutex_unlock(&odp_imr->umem_mutex);
 
-	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
-	WARN_ON(!xa_empty(&imr->implicit_children));
-	/* Remove any left over reserved elements */
-	xa_destroy(&imr->implicit_children);
+	list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm)
+		free_implicit_child_mr(mtt, false);
+
+	mlx5_mr_cache_free(dev, imr);
+	ib_umem_odp_release(odp_imr);
 }
 
 #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
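
The rewritten mlx5_ib_free_implicit_mr() replaces the waitqueue dance with a fixed shutdown sequence: unpublish the parent, fence readers, claim all children under the xa lock, fence again, drain deferred work, then free everything synchronously. A condensed, self-contained model of that ordering follows; my_dev, my_parent, my_child and teardown are hypothetical stand-ins for the real structures, not the driver's API.

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/workqueue.h>
#include <linux/xarray.h>

struct my_child {
	struct list_head elm;
};

struct my_parent {
	unsigned long key;
	struct xarray children;
	atomic_t num_deferred_work;
};

struct my_dev {
	struct xarray lookup;		/* stands in for odp_mkeys */
	struct srcu_struct srcu;	/* stands in for odp_srcu */
};

static void teardown(struct my_dev *dev, struct my_parent *p)
{
	LIST_HEAD(destroy_list);
	struct my_child *c, *tmp;
	unsigned long idx;

	/* 1) Unpublish the parent so no new reader can find it. */
	xa_erase(&dev->lookup, p->key);
	/* 2) Wait out readers that found it before the erase. */
	synchronize_srcu(&dev->srcu);

	/* 3) Claim every remaining child atomically under the xa lock. */
	xa_lock(&p->children);
	xa_for_each(&p->children, idx, c) {
		__xa_erase(&p->children, idx);
		list_add(&c->elm, &destroy_list);
	}
	xa_unlock(&p->children);

	/* 4) Wait out readers that may still hold a child pointer. */
	synchronize_srcu(&dev->srcu);

	/*
	 * 5) The counter can now only fall; any remaining holders are
	 *    work items already queued, so a flush drains them all.
	 */
	if (atomic_read(&p->num_deferred_work))
		flush_workqueue(system_unbound_wq);

	/* 6) Nothing can reach the children any more; free synchronously. */
	list_for_each_entry_safe(c, tmp, &destroy_list, elm)
		kfree(c);
	kfree(p);
}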
@@ -1579,7 +1631,7 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
 	u32 i;
 
 	for (i = 0; i < work->num_sge; ++i)
-		atomic_dec(&work->frags[i].mr->num_pending_prefetch);
+		atomic_dec(&work->frags[i].mr->num_deferred_work);
 	kvfree(work);
 }
 
@@ -1658,7 +1710,7 @@ static bool init_prefetch_work(struct ib_pd *pd,
 		}
 
 		/* Keep the MR pointer valid outside the SRCU */
-		atomic_inc(&work->frags[i].mr->num_pending_prefetch);
+		atomic_inc(&work->frags[i].mr->num_deferred_work);
 	}
 	work->num_sge = num_sge;
 	return true;
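
These last two hunks fold the prefetch-only num_pending_prefetch counter into the same num_deferred_work counter the destroy path uses, so one counter now covers every deferred reference to an MR. The discipline that makes the flush in mlx5_ib_free_implicit_mr() sufficient: increments may only happen while the MR is still reachable (inside the SRCU read side, or under xa_lock while the entry is in the xarray), and each work item drops its count when it finishes. A minimal sketch with hypothetical names follows (tracked and defer_one are not from the patch; one outstanding item per object, for brevity).

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct tracked {
	atomic_t num_deferred_work;
	struct work_struct work;
};

static void deferred_fn(struct work_struct *w)
{
	struct tracked *t = container_of(w, struct tracked, work);

	/* ... deferred processing (prefetch, destroy, ...) ... */
	atomic_dec(&t->num_deferred_work);	/* drop the reference */
}

/*
 * Caller must hold the SRCU read lock (or the lock that keeps t
 * published), so t is still reachable at the instant the reference is
 * taken; that is what makes the destroy-side flush a complete wait.
 */
static void defer_one(struct tracked *t)
{
	atomic_inc(&t->num_deferred_work);
	INIT_WORK(&t->work, deferred_fn);
	queue_work(system_unbound_wq, &t->work);
}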