@@ -68,11 +68,8 @@ struct bpf_cpu_map_entry {
         struct bpf_cpumap_val value;
         struct bpf_prog *prog;
 
-        atomic_t refcnt; /* Control when this struct can be free'ed */
-        struct rcu_head rcu;
-
-        struct work_struct kthread_stop_wq;
         struct completion kthread_running;
+        struct rcu_work free_work;
 };
 
 struct bpf_cpu_map {
@@ -117,11 +114,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
         return &cmap->map;
 }
 
-static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
-        atomic_inc(&rcpu->refcnt);
-}
-
 static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 {
         /* The tear-down procedure should have made sure that queue is
@@ -142,35 +134,6 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
         }
 }
 
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
-        if (atomic_dec_and_test(&rcpu->refcnt)) {
-                if (rcpu->prog)
-                        bpf_prog_put(rcpu->prog);
-                /* The queue should be empty at this point */
-                __cpu_map_ring_cleanup(rcpu->queue);
-                ptr_ring_cleanup(rcpu->queue, NULL);
-                kfree(rcpu->queue);
-                kfree(rcpu);
-        }
-}
-
-/* called from workqueue, to workaround syscall using preempt_disable */
-static void cpu_map_kthread_stop(struct work_struct *work)
-{
-        struct bpf_cpu_map_entry *rcpu;
-
-        rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
-
-        /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
-         * as it waits until all in-flight call_rcu() callbacks complete.
-         */
-        rcu_barrier();
-
-        /* kthread_stop will wake_up_process and wait for it to complete */
-        kthread_stop(rcpu->kthread);
-}
-
 static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
                                      struct list_head *listp,
                                      struct xdp_cpumap_stats *stats)
@@ -395,7 +358,6 @@ static int cpu_map_kthread_run(void *data)
         }
         __set_current_state(TASK_RUNNING);
 
-        put_cpu_map_entry(rcpu);
         return 0;
 }
 
@@ -472,9 +434,6 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
         if (IS_ERR(rcpu->kthread))
                 goto free_prog;
 
-        get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
-        get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
-
         /* Make sure kthread runs on a single CPU */
         kthread_bind(rcpu->kthread, cpu);
         wake_up_process(rcpu->kthread);
@@ -501,40 +460,40 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
         return NULL;
 }
 
-static void __cpu_map_entry_free(struct rcu_head *rcu)
+static void __cpu_map_entry_free(struct work_struct *work)
 {
         struct bpf_cpu_map_entry *rcpu;
 
         /* This cpu_map_entry have been disconnected from map and one
-         * RCU grace-period have elapsed.  Thus, XDP cannot queue any
+         * RCU grace-period have elapsed. Thus, XDP cannot queue any
          * new packets and cannot change/set flush_needed that can
          * find this entry.
         */
-        rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+        rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work);
 
+        /* kthread_stop will wake_up_process and wait for it to complete.
+         * cpu_map_kthread_run() makes sure the pointer ring is empty
+         * before exiting.
+         */
+        kthread_stop(rcpu->kthread);
+
+        if (rcpu->prog)
+                bpf_prog_put(rcpu->prog);
+        /* The queue should be empty at this point */
+        __cpu_map_ring_cleanup(rcpu->queue);
+        ptr_ring_cleanup(rcpu->queue, NULL);
+        kfree(rcpu->queue);
         free_percpu(rcpu->bulkq);
-        /* Cannot kthread_stop() here, last put free rcpu resources */
-        put_cpu_map_entry(rcpu);
+        kfree(rcpu);
 }
 
-/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
- * ensure any driver rcu critical sections have completed, but this
- * does not guarantee a flush has happened yet. Because driver side
- * rcu_read_lock/unlock only protects the running XDP program. The
- * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
- * pending flush op doesn't fail.
- *
- * The bpf_cpu_map_entry is still used by the kthread, and there can
- * still be pending packets (in queue and percpu bulkq). A refcnt
- * makes sure to last user (kthread_stop vs. call_rcu) free memory
- * resources.
- *
- * The rcu callback __cpu_map_entry_free flush remaining packets in
- * percpu bulkq to queue. Due to caller map_delete_elem() disable
- * preemption, cannot call kthread_stop() to make sure queue is empty.
- * Instead a work_queue is started for stopping kthread,
- * cpu_map_kthread_stop, which waits for an RCU grace period before
- * stopping kthread, emptying the queue.
+/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old
+ * entry is no longer in use before freeing. We use queue_rcu_work() to call
+ * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace
+ * period. This means that (a) all pending enqueue and flush operations have
+ * completed (because of the RCU callback), and (b) we are in a workqueue
+ * context where we can stop the kthread and wait for it to exit before freeing
+ * everything.
  */
 static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
                                     u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
@@ -543,9 +502,8 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
 
         old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
         if (old_rcpu) {
-                call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
-                INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
-                schedule_work(&old_rcpu->kthread_stop_wq);
+                INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
+                queue_rcu_work(system_wq, &old_rcpu->free_work);
         }
 }
 
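For readers less familiar with the rcu_work interface that the two hunks above switch to, here is a minimal standalone sketch of the same pattern. The names my_entry, my_entry_free and my_entry_schedule_free are made-up stand-ins, not part of the patch; only INIT_RCU_WORK(), queue_rcu_work(), to_rcu_work() and system_wq are the real kernel interfaces used above.

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

/* Illustrative object embedding an rcu_work, mirroring bpf_cpu_map_entry. */
struct my_entry {
        int id;
        struct rcu_work free_work;
};

/* Work handler: runs in workqueue (process) context only after an RCU grace
 * period has elapsed, so it is allowed to sleep, e.g. in kthread_stop().
 */
static void my_entry_free(struct work_struct *work)
{
        struct my_entry *e = container_of(to_rcu_work(work),
                                          struct my_entry, free_work);

        kfree(e);
}

/* Delete path, mirroring __cpu_map_entry_replace(): the caller has already
 * unpublished the entry (xchg of the map slot), so no new RCU reader can find
 * it. queue_rcu_work() waits for the grace period and then runs
 * my_entry_free() on system_wq.
 */
static void my_entry_schedule_free(struct my_entry *e)
{
        INIT_RCU_WORK(&e->free_work, my_entry_free);
        queue_rcu_work(system_wq, &e->free_work);
}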
@@ -557,7 +515,7 @@ static long cpu_map_delete_elem(struct bpf_map *map, void *key)
         if (key_cpu >= map->max_entries)
                 return -EINVAL;
 
-        /* notice caller map_delete_elem() use preempt_disable() */
+        /* notice caller map_delete_elem() uses rcu_read_lock() */
         __cpu_map_entry_replace(cmap, key_cpu, NULL);
         return 0;
 }
@@ -608,16 +566,15 @@ static void cpu_map_free(struct bpf_map *map)
         /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
          * so the bpf programs (can be more than one that used this map) were
          * disconnected from events. Wait for outstanding critical sections in
-         * these programs to complete. The rcu critical section only guarantees
-         * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
-         * It does __not__ ensure pending flush operations (if any) are
-         * complete.
+         * these programs to complete. synchronize_rcu() below not only
+         * guarantees no further "XDP/bpf-side" reads against
+         * bpf_cpu_map->cpu_map, but also ensure pending flush operations
+         * (if any) are completed.
          */
-
         synchronize_rcu();
 
-        /* For cpu_map the remote CPUs can still be using the entries
-         * (struct bpf_cpu_map_entry).
+        /* The only possible user of bpf_cpu_map_entry is
+         * cpu_map_kthread_run().
          */
         for (i = 0; i < cmap->map.max_entries; i++) {
                 struct bpf_cpu_map_entry *rcpu;
@@ -626,8 +583,8 @@ static void cpu_map_free(struct bpf_map *map)
                 if (!rcpu)
                         continue;
 
-                /* bq flush and cleanup happens after RCU grace-period */
-                __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+                /* Stop kthread and cleanup entry directly */
+                __cpu_map_entry_free(&rcpu->free_work.work);
         }
         bpf_map_area_free(cmap->cpu_map);
         bpf_map_area_free(cmap);
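The map-free hunk relies on synchronize_rcu() making the synchronous call safe: once the grace period has passed, nothing can still reach the entries, so the free handler can be invoked directly on the work_struct embedded in the rcu_work instead of going through queue_rcu_work(). A condensed sketch of that ordering, reusing the illustrative my_entry/my_entry_free names from the sketch above (my_map_free and the slots array are likewise made up):

/* Teardown-time variant, mirroring cpu_map_free(). */
static void my_map_free(struct my_entry **slots, unsigned int nr)
{
        unsigned int i;

        /* After this, no rcu_read_lock() reader can still hold a pointer
         * to any entry published in slots[].
         */
        synchronize_rcu();

        for (i = 0; i < nr; i++) {
                struct my_entry *e = slots[i];

                if (!e)
                        continue;

                /* No further grace period needed: call the handler
                 * synchronously on the embedded work_struct.
                 */
                my_entry_free(&e->free_work.work);
        }
}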