diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index 869115805b2882..d6a947bc981517 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -46,6 +46,11 @@
 #include <scx/common.bpf.h>
 #include "scx_flatcg.h"
 
+/*
+ * Maximum number of retries to find a valid cgroup.
+ */
+#define CGROUP_MAX_RETRIES	1024
+
 char _license[] SEC("license") = "GPL";
 
 const volatile u32 nr_cpus = 32;	/* !0 for veristat, set during init */
@@ -302,6 +307,17 @@ static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
 	bpf_spin_unlock(&cgv_tree_lock);
 }
 
+static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc)
+{
+	/*
+	 * Tell fcg_stopping() that this task bypassed the regular scheduling
+	 * path and should be force charged to the cgroup. 0 is used to
+	 * indicate that the task isn't bypassing, so if the current runtime is
+	 * 0, go back by one nanosecond.
+	 */
+	taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
+}
+
 s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
 {
 	struct fcg_task_ctx *taskc;
@@ -319,35 +335,12 @@ s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake
 	/*
 	 * If select_cpu_dfl() is recommending local enqueue, the target CPU is
 	 * idle. Follow it and charge the cgroup later in fcg_stopping() after
-	 * the fact. Use the same mechanism to deal with tasks with custom
-	 * affinities so that we don't have to worry about per-cgroup dq's
-	 * containing tasks that can't be executed from some CPUs.
+	 * the fact.
 	 */
-	if (is_idle || p->nr_cpus_allowed != nr_cpus) {
-		/*
-		 * Tell fcg_stopping() that this bypassed the regular scheduling
-		 * path and should be force charged to the cgroup. 0 is used to
-		 * indicate that the task isn't bypassing, so if the current
-		 * runtime is 0, go back by one nanosecond.
-		 */
-		taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
-
-		/*
-		 * The global dq is deprioritized as we don't want to let tasks
-		 * to boost themselves by constraining its cpumask. The
-		 * deprioritization is rather severe, so let's not apply that to
-		 * per-cpu kernel threads. This is ham-fisted. We probably wanna
-		 * implement per-cgroup fallback dq's instead so that we have
-		 * more control over when tasks with custom cpumask get issued.
-		 */
-		if (is_idle ||
-		    (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) {
-			stat_inc(FCG_STAT_LOCAL);
-			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
-		} else {
-			stat_inc(FCG_STAT_GLOBAL);
-			scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
-		}
+	if (is_idle) {
+		set_bypassed_at(p, taskc);
+		stat_inc(FCG_STAT_LOCAL);
+		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
 	}
 
 	return cpu;
@@ -365,6 +358,32 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
 		return;
 	}
 
+	/*
+	 * Use direct dispatching and force charging to deal with tasks with
+	 * custom affinities so that we don't have to worry about per-cgroup
+	 * dq's containing tasks that can't be executed from some CPUs.
+	 */
+	if (p->nr_cpus_allowed != nr_cpus) {
+		set_bypassed_at(p, taskc);
+
+		/*
+		 * The global dq is deprioritized as we don't want to let tasks
+		 * boost themselves by constraining their cpumask. The
+		 * deprioritization is rather severe, so let's not apply that to
+		 * per-cpu kernel threads. This is ham-fisted. We probably wanna
+		 * implement per-cgroup fallback dq's instead so that we have
+		 * more control over when tasks with custom cpumasks get issued.
+		 */
+		if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) {
+			stat_inc(FCG_STAT_LOCAL);
+			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+		} else {
+			stat_inc(FCG_STAT_GLOBAL);
+			scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+		}
+		return;
+	}
+
 	cgrp = scx_bpf_task_cgroup(p);
 	cgc = find_cgrp_ctx(cgrp);
 	if (!cgc)
@@ -691,6 +710,7 @@ static bool try_pick_next_cgroup(u64 *cgidp)
 		bpf_spin_lock(&cgv_tree_lock);
 		bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
 		bpf_spin_unlock(&cgv_tree_lock);
+		stat_inc(FCG_STAT_PNC_RACE);
 	} else {
 		cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
 		if (cgv_node) {
@@ -712,6 +732,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
 	struct fcg_cgrp_ctx *cgc;
 	struct cgroup *cgrp;
 	u64 now = bpf_ktime_get_ns();
+	bool picked_next = false;
 
 	cpuc = find_cpu_ctx();
 	if (!cpuc)
@@ -766,10 +787,21 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
 		return;
 	}
 
-	bpf_repeat(BPF_MAX_LOOPS) {
-		if (try_pick_next_cgroup(&cpuc->cur_cgid))
+	bpf_repeat(CGROUP_MAX_RETRIES) {
+		if (try_pick_next_cgroup(&cpuc->cur_cgid)) {
+			picked_next = true;
 			break;
+		}
 	}
+
+	/*
+	 * This only happens if try_pick_next_cgroup() races against the
+	 * enqueue path more than CGROUP_MAX_RETRIES times, which is extremely
+	 * unlikely and likely indicates an underlying bug. There shouldn't be
+	 * any stall risk as the race is against enqueue.
+	 */
+	if (!picked_next)
+		stat_inc(FCG_STAT_PNC_FAIL);
 }
 
 s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,
diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
index b326b2d3ec3501..6c2f9715f69255 100644
--- a/tools/sched_ext/scx_flatcg.c
+++ b/tools/sched_ext/scx_flatcg.c
@@ -186,29 +186,31 @@ int main(int argc, char **argv)
 
 		printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n",
 		       seq++, cpu_util * 100.0, skel->data->hweight_gen);
-		printf("       act:%6llu  deact:%6llu local:%6llu global:%6llu\n",
+		printf("       act:%6llu  deact:%6llu global:%6llu local:%6llu\n",
 		       stats[FCG_STAT_ACT],
 		       stats[FCG_STAT_DEACT],
-		       stats[FCG_STAT_LOCAL],
-		       stats[FCG_STAT_GLOBAL]);
-		printf("HWT   skip:%6llu   race:%6llu  cache:%6llu update:%6llu\n",
-		       stats[FCG_STAT_HWT_SKIP],
-		       stats[FCG_STAT_HWT_RACE],
+		       stats[FCG_STAT_GLOBAL],
+		       stats[FCG_STAT_LOCAL]);
+		printf("HWT  cache:%6llu update:%6llu   skip:%6llu  race:%6llu\n",
 		       stats[FCG_STAT_HWT_CACHE],
-		       stats[FCG_STAT_HWT_UPDATES]);
+		       stats[FCG_STAT_HWT_UPDATES],
+		       stats[FCG_STAT_HWT_SKIP],
+		       stats[FCG_STAT_HWT_RACE]);
 		printf("ENQ   skip:%6llu   race:%6llu\n",
 		       stats[FCG_STAT_ENQ_SKIP],
 		       stats[FCG_STAT_ENQ_RACE]);
-		printf("CNS   keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n",
+		printf("CNS   keep:%6llu expire:%6llu empty:%6llu  gone:%6llu\n",
 		       stats[FCG_STAT_CNS_KEEP],
 		       stats[FCG_STAT_CNS_EXPIRE],
 		       stats[FCG_STAT_CNS_EMPTY],
 		       stats[FCG_STAT_CNS_GONE]);
-		printf("PNC nocgrp:%6llu   next:%6llu empty:%6llu gone:%6llu\n",
-		       stats[FCG_STAT_PNC_NO_CGRP],
+		printf("PNC   next:%6llu  empty:%6llu nocgrp:%6llu  gone:%6llu race:%6llu fail:%6llu\n",
 		       stats[FCG_STAT_PNC_NEXT],
 		       stats[FCG_STAT_PNC_EMPTY],
-		       stats[FCG_STAT_PNC_GONE]);
+		       stats[FCG_STAT_PNC_NO_CGRP],
+		       stats[FCG_STAT_PNC_GONE],
+		       stats[FCG_STAT_PNC_RACE],
+		       stats[FCG_STAT_PNC_FAIL]);
 		printf("BAD remove:%6llu\n",
 		       acc_stats[FCG_STAT_BAD_REMOVAL]);
 		fflush(stdout);
diff --git a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h
index 490758ed41f0fb..6f2ea50acb1cb3 100644
--- a/tools/sched_ext/scx_flatcg.h
+++ b/tools/sched_ext/scx_flatcg.h
@@ -28,6 +28,8 @@ enum fcg_stat_idx {
 	FCG_STAT_PNC_NEXT,
 	FCG_STAT_PNC_EMPTY,
 	FCG_STAT_PNC_GONE,
+	FCG_STAT_PNC_RACE,
+	FCG_STAT_PNC_FAIL,
 	FCG_STAT_BAD_REMOVAL,
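
A note on the sentinel encoding that the new set_bypassed_at() helper centralizes: 0 doubles as the "not bypassing" flag, so a task whose sum_exec_runtime is exactly 0 is stored as (u64)-1, i.e. moved back by one nanosecond modulo 2^64. The stand-alone user-space sketch below illustrates just that encoding; encode_bypassed_at() is a hypothetical stand-in for the assignment in set_bypassed_at(), not scheduler code.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the assignment in set_bypassed_at(). */
static uint64_t encode_bypassed_at(uint64_t sum_exec_runtime)
{
	/* GNU "a ?: b" is shorthand for "a ? a : b" */
	return sum_exec_runtime ? sum_exec_runtime : (uint64_t)-1;
}

int main(void)
{
	/* 0 would read as "not bypassing", so it becomes 2^64 - 1 ... */
	printf("%llu\n", (unsigned long long)encode_bypassed_at(0));
	/* ... while any nonzero runtime is stored unchanged */
	printf("%llu\n", (unsigned long long)encode_bypassed_at(12345));
	return 0;
}

Since fcg_stopping() charges relative to the stored value, the wrapped sentinel presumably perturbs the charge by only that one nanosecond, matching the "go back by one nanosecond" wording in the comment.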
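Likewise, the fcg_dispatch() change turns an effectively unbounded bpf_repeat(BPF_MAX_LOOPS) into a bounded retry whose exhaustion is accounted via FCG_STAT_PNC_FAIL instead of being retried forever. A compilable user-space sketch of the pattern, with try_pick() as a hypothetical stand-in for try_pick_next_cgroup() and a plain counter for the stat:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define CGROUP_MAX_RETRIES 1024

static unsigned long long nr_pnc_fail; /* stand-in for FCG_STAT_PNC_FAIL */

/* Hypothetical stand-in for try_pick_next_cgroup(); fails randomly here to
 * model losing the race against the enqueue path. */
static bool try_pick(unsigned long long *cgidp)
{
	if (rand() % 4 == 0)
		return false;
	*cgidp = 42; /* illustrative cgroup id */
	return true;
}

int main(void)
{
	unsigned long long cgid = 0;
	bool picked_next = false;
	int i;

	/* Bounded retry: a miss on all CGROUP_MAX_RETRIES attempts is counted
	 * rather than retried indefinitely; per the patch comment, exhaustion
	 * would indicate an underlying bug. */
	for (i = 0; i < CGROUP_MAX_RETRIES; i++) {
		if (try_pick(&cgid)) {
			picked_next = true;
			break;
		}
	}
	if (!picked_next)
		nr_pnc_fail++;

	printf("picked=%d cgid=%llu pnc_fail=%llu\n",
	       picked_next, cgid, nr_pnc_fail);
	return 0;
}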