
Commit 7cfbaa9

Committed Jul 11, 2024
Fix multiprocessing issues in FilterByValueBlock
This addresses issues with using num_proc>1 with Dataset.map() and Dataset.filter(). The first issue is:

```
  File "/usr/lib64/python3.11/pickle.py", line 578, in save
    rv = reduce(self.proto)
         ^^^^^^^^^^^^^^^^^^
TypeError: cannot pickle 'SSLContext' object
```

What was happening here is that the entire FilterByValueBlock object was being serialized to send to the multiprocessing worker. And now that this includes PipelineContext, which includes the OpenAI client object, which includes SSLContext, we hit a known issue: uqfoundation/dill#308

The second issue is specific to map():

```
ValueError: The features can't be aligned because the key score of features {'task_description': Value(dtype='string', id=None), 'seed_question': Value(dtype='string', id=None), 'seed_response': Value(dtype='string', id=None), 'num_samples': Value(dtype='int64', id=None), 'question': Value(dtype='string', id=None), '__index_level_0__': Value(dtype='int64', id=None), 'evaluation': Value(dtype='string', id=None), 'score': Value(dtype='string', id=None)} has unexpected type - Value(dtype='string', id=None) (expected either Value(dtype='float64', id=None) or Value("null").
```

It appears that in the datasets library, only in the case of num_proc>1, when we hit the "error converting dtype" case and set the column to None, the column ends up still being considered a string column rather than the new dtype. This second issue deserves further investigation and may require a fix to the datasets library.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
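To illustrate the first failure mode, here is a minimal sketch with hypothetical names (FakeBlock standing in for FilterByValueBlock); it uses stdlib pickle, while datasets goes through dill, which has the same limitation per the linked issue:

```python
import pickle
import ssl


class FakeBlock:
    """Hypothetical stand-in for FilterByValueBlock: it holds an
    SSLContext, much as PipelineContext does via the OpenAI client."""

    def __init__(self):
        self.ssl_context = ssl.create_default_context()

    def convert(self, sample):
        return sample


def convert(sample):
    return sample


block = FakeBlock()

# Serializing a bound method pulls in the whole instance, SSLContext and all:
try:
    pickle.dumps(block.convert)
except TypeError as err:
    print(err)  # cannot pickle 'SSLContext' object

# A module-level function is serialized by reference (its qualified name),
# so nothing held by the object needs to be pickled:
pickle.dumps(convert)
```

This is why the fix below moves the work into module-level `_filter_by_values` and `_map_dtype` helpers that take plain arguments, rather than methods on the block object.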
1 parent 49c87d5 · commit 7cfbaa9

File tree

1 file changed (+37 −18)

src/instructlab/sdg/filterblock.py

+37-18
Original file line numberDiff line numberDiff line change
```diff
@@ -9,6 +9,39 @@
 logger = setup_logger(__name__)
 
 
+# Note - this is not a method on the class below in order to avoid
+# serializing the object itself when multi-processing is used.
+# In particular, SSLContext - embedded in the OpenAI client object -
+# cannot be pickled.
+def _filter_by_values(samples, column, op, values, num_proc=1):
+    return samples.filter(
+        lambda x: any(op(x[column], value) for value in values),
+        num_proc=num_proc,
+    )
+
+
+def _map_dtype(samples, column, dtype, num_proc=1):
+    def convert_column(sample):
+        try:
+            sample[column] = dtype(sample[column])
+        except ValueError as e:
+            logger.error(
+                "Error converting dtype: %s, filling with None to be filtered later", e
+            )
+            sample[column] = None
+        return sample
+
+    # FIXME: it appears multiprocessing map has issues with
+    # None columns. If we pass num_proc>1 here and the error
+    # case is triggered above, we get:
+    # ValueError: The features can't be aligned ...
+    # because the column is still considered a string not
+    # the new dtype.
+    num_proc = 1
+
+    return samples.map(convert_column, num_proc=num_proc)
+
+
 class FilterByValueBlock(Block):
     def __init__(
         self,
@@ -40,26 +73,12 @@ def __init__(
         self.convert_dtype = convert_dtype
         self.num_procs = batch_kwargs.get("num_procs", 1)
 
-    def _convert_dtype(self, sample):
-        try:
-            sample[self.column_name] = self.convert_dtype(sample[self.column_name])
-        except ValueError as e:
-            logger.error(
-                "Error converting dtype: %s, filling with None to be filtered later", e
-            )
-            sample[self.column_name] = None
-        return sample
-
     def generate(self, samples) -> Dataset:
         if self.convert_dtype:
-            samples = samples.map(
-                self._convert_dtype,
-                num_proc=self.num_procs,
+            samples = _map_dtype(
+                samples, self.column_name, self.convert_dtype, self.num_procs
             )
 
-        return samples.filter(
-            lambda x: any(
-                self.operation(x[self.column_name], value) for value in self.value
-            ),
-            num_proc=self.num_procs,
+        return _filter_by_values(
+            samples, self.column_name, self.operation, self.value, self.num_procs
         )
```
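The FIXME above can be seen in isolation with a sketch along these lines; this is a hypothetical reproduction of the behavior described in the commit message, and the exact outcome may depend on the datasets version:

```python
from datasets import Dataset

ds = Dataset.from_dict({"score": ["1.5", "oops", "3.0"]})


def convert(sample):
    # Same pattern as convert_column above: fall back to None on failure.
    try:
        sample["score"] = float(sample["score"])
    except ValueError:
        sample["score"] = None
    return sample


# With num_proc=1 the column should be re-inferred as float64 (with nulls):
converted = ds.map(convert, num_proc=1)
print(converted.features["score"])

# Per the commit message, with num_proc>1 the None-filled rows can leave the
# column typed as string on some shards, so merging the shards fails with
# "ValueError: The features can't be aligned ...". Hence num_proc is forced
# to 1 in _map_dtype until this is understood.
converted = converted.filter(lambda x: x["score"] is not None)
```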
