
Commit cab4141
Fix for batched tokenization.
Davidyz committed Jun 9, 2024
1 parent 6045e0b commit cab4141
Showing 1 changed file with 11 additions and 10 deletions.
21 changes: 11 additions & 10 deletions llm_unlearn_ucl/utils.py
@@ -33,17 +33,18 @@ def create_symbolic_dataloader_from_dataset(
     def preprocess(examples):
         results = {"input_ids": [], "attention_mask": [], "start_locs": []}
 
-        prompt = examples["input"]
-        output = examples["output"]
-        text = f"### Question: {prompt} ### Answer: {output}"
+        for i in range(len(examples["input"])):
+            prompt = examples["input"][i]
+            output = examples["output"][i]
+            text = f"### Question: {prompt} ### Answer: {output}"
 
-        tokenized = tokenizer(text, truncation=True, padding="max_length")
-        results["input_ids"].append(tokenized["input_ids"])
-        results["attention_mask"].append(tokenized["attention_mask"])
-        # Calculate start idx for answer
-        test_text = f"### Question: {prompt} ### Answer: "
-        test_tokenized = tokenizer(test_text, truncation=True, padding="max_length")
-        results["start_locs"].append(len(test_tokenized["input_ids"]) - 1)
+            tokenized = tokenizer(text, truncation=True, padding="max_length")
+            results["input_ids"].append(tokenized["input_ids"])
+            results["attention_mask"].append(tokenized["attention_mask"])
+            # Calculate start idx for answer
+            test_text = f"### Question: {prompt} ### Answer: "
+            test_tokenized = tokenizer(test_text, truncation=True, padding="max_length")
+            results["start_locs"].append(len(test_tokenized["input_ids"]) - 1)
 
         return results

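For context, below is a minimal sketch of how a preprocess function like this is typically applied with Hugging Face datasets. When Dataset.map is called with batched=True, examples["input"] arrives as a list of strings rather than a single string, so each example must be handled in a loop, which is what this commit fixes. The tokenizer choice, toy dataset contents, and remove_columns argument are illustrative assumptions, not taken from the repository.

from datasets import Dataset
from transformers import AutoTokenizer

# Hypothetical tokenizer; the repository's actual model/tokenizer is not shown in this diff.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

def preprocess(examples):
    # Same structure as the fixed version above: iterate over the batch.
    results = {"input_ids": [], "attention_mask": [], "start_locs": []}
    for i in range(len(examples["input"])):
        prompt = examples["input"][i]
        output = examples["output"][i]
        text = f"### Question: {prompt} ### Answer: {output}"
        tokenized = tokenizer(text, truncation=True, padding="max_length")
        results["input_ids"].append(tokenized["input_ids"])
        results["attention_mask"].append(tokenized["attention_mask"])
        # Calculate start idx for answer
        test_text = f"### Question: {prompt} ### Answer: "
        test_tokenized = tokenizer(test_text, truncation=True, padding="max_length")
        results["start_locs"].append(len(test_tokenized["input_ids"]) - 1)
    return results

# Toy dataset with the same column names as the diff above.
dataset = Dataset.from_dict(
    {
        "input": ["What is 2 + 2?", "Name a primary colour."],
        "output": ["4", "Red"],
    }
)

# batched=True passes each column as a list, which is why the per-example loop is needed.
tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=["input", "output"])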
