Fix multiprocessing issues in FilterByValueBlock

markmc · markmc · commit 7cfbaa9dc11a · 2024-07-11T23:40:05.000+01:00
This addresses issues with using num_proc>1 with Dataset.map() and Dataset.filter(). The first issue is: ``` File "/usr/lib64/python3.11/pickle.py", line 578, in save rv = reduce(self.proto) ^^^^^^^^^^^^^^^^^^ TypeError: cannot pickle 'SSLContext' object ``` What was happening here is that the entire FilterByValueBlock object was being serialized to send to the multiprocessing worker. And now that this includes PipelineContext, which includes the OpenAI client object, which includes SSLContext, we hit a known issue: uqfoundation/dill#308 The second issue is specific to map(): ``` ValueError: The features can't be aligned because the key score of features {'task_description': Value(dtype='string', id=None), 'seed_question': Value(dtype='string', id=None), 'seed_response': Value(dtype='string', id=None), 'num_samples': Value(dtype='int64', id=None), 'question': Value(dtype='string', id=None), '__index_level_0__': Value(dtype='int64', id=None), 'evaluation': Value(dtype='string', id=None), 'score': Value(dtype='string', id=None)} has unexpected type - Value(dtype='string', id=None) (expected either Value(dtype='float64', id=None) or Value("null"). ``` It appears the the datasets, only in the case of num_proc>1, when we hit the "error converting dtype" case and set the column to None, it ends up being still considered a string column rather than the new dtype. This second issue deserves further investigation and may require a fix to the datasets library. Signed-off-by: Mark McLoughlin <markmc@redhat.com>
diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
@@ -9,6 +9,39 @@
 logger = setup_logger(__name__)
 
 
+# Note - this is not a method on the class below in order to avoid
+# serializing the object itself when multi-processing is used.
+# In particular, SSLContext - embedded in the OpenAI client object -
+# cannot be pickled.
+def _filter_by_values(samples, column, op, values, num_proc=1):
+    return samples.filter(
+        lambda x: any(op(x[column], value) for value in values),
+        num_proc=num_proc,
+    )
+
+
+def _map_dtype(samples, column, dtype, num_proc=1):
+    def convert_column(sample):
+        try:
+            sample[column] = dtype(sample[column])
+        except ValueError as e:
+            logger.error(
+                "Error converting dtype: %s, filling with None to be filtered later", e
+            )
+            sample[column] = None
+        return sample
+
+    # FIXME: it appears multiprocessing map has issues with
+    # None columns. If we pass num_proc>1 here and the error
+    # case is triggered above, we get:
+    #   ValueError: The features can't be aligned ...
+    # because the column is still considered a string not
+    # the new dtype.
+    num_proc = 1
+
+    return samples.map(convert_column, num_proc=num_proc)
+
+
 class FilterByValueBlock(Block):
     def __init__(
         self,
@@ -40,26 +73,12 @@ def __init__(
         self.convert_dtype = convert_dtype
         self.num_procs = batch_kwargs.get("num_procs", 1)
 
-    def _convert_dtype(self, sample):
-        try:
-            sample[self.column_name] = self.convert_dtype(sample[self.column_name])
-        except ValueError as e:
-            logger.error(
-                "Error converting dtype: %s, filling with None to be filtered later", e
-            )
-            sample[self.column_name] = None
-        return sample
-
     def generate(self, samples) -> Dataset:
         if self.convert_dtype:
-            samples = samples.map(
-                self._convert_dtype,
-                num_proc=self.num_procs,
+            samples = _map_dtype(
+                samples, self.column_name, self.convert_dtype, self.num_procs
             )
 
-        return samples.filter(
-            lambda x: any(
-                self.operation(x[self.column_name], value) for value in self.value
-            ),
-            num_proc=self.num_procs,
+        return _filter_by_values(
+            samples, self.column_name, self.operation, self.value, self.num_procs
         )