Commit c69dd4b

Update on "[Executorch] parallelize op_choose_qparams"
When doing prefill for a quantized KV cache with a large prefill length, parallelizing this op helps.

Differential Revision: [D84962234](https://our.internmc.facebook.com/intern/diff/D84962234/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments; please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D84962234/)!

[ghstack-poisoned]
2 parents: ac3c427 + 08ce624
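For intuition about what is being parallelized: per-token quantization parameters (a min/max reduction turned into a scale and zero_point) are independent across tokens, so the token dimension can be split across worker threads, which is why large prefills benefit. Below is a minimal standalone Python sketch of that idea, not the actual C++ kernel; the helper names, the chunking scheme, and the use of `ThreadPoolExecutor` are illustrative assumptions (the real op presumably dispatches through ExecuTorch's threadpool, which is why the test diff below shrinks it via `_unsafe_reset_threadpool`).

```python
# Sketch only: parallelize per-token asymmetric int8 qparams selection by
# splitting the token dimension into one contiguous chunk per worker thread.
import torch
from concurrent.futures import ThreadPoolExecutor


def _choose_qparams_chunk(x: torch.Tensor, qmin: int = -128, qmax: int = 127):
    # x: [chunk_tokens, dim]; returns per-token (scale, zero_point).
    mn = x.min(dim=-1).values.clamp(max=0.0)  # make sure 0 is representable
    mx = x.max(dim=-1).values.clamp(min=0.0)
    scale = (mx - mn) / (qmax - qmin)
    scale = torch.where(scale == 0, torch.ones_like(scale), scale)
    zero_point = torch.clamp(torch.round(qmin - mn / scale), qmin, qmax).to(torch.int32)
    return scale, zero_point


def choose_qparams_parallel(x: torch.Tensor, num_threads: int = 3):
    # x: [num_tokens, dim]; each chunk of tokens is processed independently,
    # so chunks can run on separate threads (torch ops release the GIL).
    chunks = torch.chunk(x, num_threads, dim=0)
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        results = list(pool.map(_choose_qparams_chunk, chunks))
    return torch.cat([r[0] for r in results]), torch.cat([r[1] for r in results])
```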

2 files changed: +7 −0 lines
extension/llm/custom_ops/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -60,5 +60,6 @@ runtime.python_test(
     ],
     deps = [
         "//caffe2:torch",
+        "//executorch/extension/pybindings:portable_lib",
     ],
 )

extension/llm/custom_ops/test_quantized_sdpa.py

Lines changed: 6 additions & 0 deletions
@@ -12,6 +12,7 @@
 import torch.nn.functional as F
 
 from executorch.extension.llm.custom_ops import custom_ops  # noqa
+from executorch.extension.pybindings.portable_lib import _unsafe_reset_threadpool
 
 
 def is_fbcode():
@@ -40,6 +41,11 @@ def setUp(self):
         self.q_shape = None
         self.kv_shape = None
         self.is_seq_at_dim_2 = True
+        # For some reason 4 threads doesn't work.
+        # This setting is needed to make this test not flaky due to the OMP
+        # error "OMP: Error #131: Thread identifier invalid".
+        # It is not clear why that happens, but a smaller threadpool resolves it.
+        _unsafe_reset_threadpool(3)
 
     def _scale_tensor(self, tensor, min_value, max_value, scale=True):
         normalized_tensor = (tensor - tensor.min()) / (tensor.max() - tensor.min())
