ucbepic · shreyashankar · Sep 30, 2024 · Sep 30, 2024 · Sep 30, 2024
diff --git a/docetl/operations/base.py b/docetl/operations/base.py
@@ -33,6 +33,7 @@ def __init__(
         self.default_model = default_model
         self.max_threads = max_threads
         self.console = console or Console()
+        self.manually_fix_errors = self.config.get("manually_fix_errors", False)
         self.status = status
         self.num_retries_on_validate_failure = self.config.get(
             "num_retries_on_validate_failure", 0

diff --git a/docetl/operations/equijoin.py b/docetl/operations/equijoin.py
@@ -86,7 +86,7 @@ def compare_pair(
         timeout_seconds=timeout_seconds,
         max_retries_per_timeout=max_retries_per_timeout,
     )
-    output = parse_llm_response(response)[0]
+    output = parse_llm_response(response, {"is_match": "bool"})[0]
     return output["is_match"], completion_cost(response)
 
 
@@ -201,6 +201,9 @@ def get_hashable_key(item: Dict) -> str:
         if len(left_data) == 0 or len(right_data) == 0:
             return [], 0
 
+        if self.status:
+            self.status.stop()
+
         # Initial blocking using multiprocessing
         num_processes = min(cpu_count(), len(left_data))
 
@@ -441,4 +444,7 @@ def get_embeddings(
         )
         self.console.log(f"Equijoin selectivity: {join_selectivity:.4f}")
 
+        if self.status:
+            self.status.start()
+
         return results, total_cost
diff --git a/docetl/operations/filter.py b/docetl/operations/filter.py
@@ -114,12 +114,19 @@ def execute(
             )
         )
 
+        if self.status:
+            self.status.stop()  # paused while items run; restarted before execute() returns
+
         def _process_filter_item(item: Dict) -> Tuple[Optional[Dict], float]:
             prompt_template = Template(self.config["prompt"])
             prompt = prompt_template.render(input=item)
 
             def validation_fn(response: Dict[str, Any]):
-                output = parse_llm_response(response)[0]
+                output = parse_llm_response(
+                    response,
+                    self.config["output"]["schema"],
+                    manually_fix_errors=self.manually_fix_errors,
+                )[0]
                 for key, value in item.items():
                     if key not in self.config["output"]["schema"]:
                         output[key] = value
@@ -159,7 +166,7 @@ def validation_fn(response: Dict[str, Any]):
             total_cost = 0
             pbar = RichLoopBar(
                 range(len(futures)),
-                desc="Processing filter items",
+                desc=f"Processing {self.config['name']} (filter) on all documents",
                 console=self.console,
             )
             for i in pbar:
@@ -174,4 +181,7 @@ def validation_fn(response: Dict[str, Any]):
                             results.append(result)
                 pbar.update(1)
 
+        if self.status:
+            self.status.start()
+
         return results, total_cost
diff --git a/docetl/operations/map.py b/docetl/operations/map.py
@@ -128,13 +128,19 @@ def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
                 dropped_results.append(new_item)
             return dropped_results, 0.0  # Return the modified data with no cost
 
+        if self.status:
+            self.status.stop()
+
         def _process_map_item(item: Dict) -> Tuple[Optional[Dict], float]:
             prompt_template = Template(self.config["prompt"])
             prompt = prompt_template.render(input=item)
 
             def validation_fn(response: Dict[str, Any]):
                 output = parse_llm_response(
-                    response, tools=self.config.get("tools", None)
+                    response,
+                    schema=self.config["output"]["schema"],
+                    tools=self.config.get("tools", None),
+                    manually_fix_errors=self.manually_fix_errors,
                 )[0]
                 for key, value in item.items():
                     if key not in self.config["output"]["schema"]:
@@ -196,7 +202,7 @@ def validation_fn(response: Dict[str, Any]):
             total_cost = 0
             pbar = RichLoopBar(
                 range(len(futures)),
-                desc="Processing map items",
+                desc=f"Processing {self.config['name']} (map) on all documents",
                 console=self.console,
             )
             for i in pbar:
@@ -212,6 +218,9 @@ def validation_fn(response: Dict[str, Any]):
                 total_cost += item_cost
                 pbar.update(i)
 
+        if self.status:
+            self.status.start()
+
         return results, total_cost
 
     def validate_output(self, output: Dict) -> bool:
@@ -349,6 +358,9 @@ def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
                 dropped_results.append(new_item)
             return dropped_results, 0.0  # Return the modified data with no cost
 
+        if self.status:
+            self.status.stop()
+
         def process_prompt(item, prompt_config):
             prompt_template = Template(prompt_config["prompt"])
             prompt = prompt_template.render(input=item)
@@ -368,7 +380,10 @@ def process_prompt(item, prompt_config):
                 max_retries_per_timeout=self.config.get("max_retries_per_timeout", 2),
             )
             output = parse_llm_response(
-                response, tools=prompt_config.get("tools", None)
+                response,
+                schema=local_output_schema,
+                tools=prompt_config.get("tools", None),
+                manually_fix_errors=self.manually_fix_errors,
             )[0]
             return output, completion_cost(response)
 
@@ -384,7 +399,7 @@ def process_prompt(item, prompt_config):
                 # Process results in order
                 pbar = RichLoopBar(
                     range(len(all_futures)),
-                    desc="Processing parallel map items",
+                    desc=f"Processing {self.config['name']} (parallel map) on all documents",
                     console=self.console,
                 )
                 for i in pbar:
@@ -418,5 +433,8 @@ def process_prompt(item, prompt_config):
                 for key in drop_keys:
                     item.pop(key, None)
 
+        if self.status:
+            self.status.start()
+
         # Return the results in order
         return [results[i] for i in range(len(input_data)) if i in results], total_cost
diff --git a/docetl/operations/reduce.py b/docetl/operations/reduce.py
@@ -252,6 +252,9 @@ def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
             reduce_keys = [reduce_keys]
         input_schema = self.config.get("input", {}).get("schema", {})
 
+        if self.status:
+            self.status.stop()
+
         # Check if we need to group everything into one group
         if reduce_keys == ["_all"] or reduce_keys == "_all":
             grouped_data = [("_all", input_data)]
@@ -341,7 +344,7 @@ def process_group(
             for future in rich_as_completed(
                 futures,
                 total=len(futures),
-                desc="Processing reduce items",
+                desc=f"Processing {self.config['name']} (reduce) on all documents",
                 leave=True,
                 console=self.console,
             ):
@@ -358,6 +361,9 @@ def process_group(
                         self.intermediates[key]
                     )
 
+        if self.status:
+            self.status.start()
+
         return results, total_cost
 
     def _get_embeddings(
@@ -694,7 +700,11 @@ def _increment_fold(
             timeout_seconds=self.config.get("timeout", 120),
             max_retries_per_timeout=self.config.get("max_retries_per_timeout", 2),
         )
-        folded_output = parse_llm_response(response)[0]
+        folded_output = parse_llm_response(
+            response,
+            self.config["output"]["schema"],
+            manually_fix_errors=self.manually_fix_errors,
+        )[0]
 
         folded_output.update(dict(zip(self.config["reduce_key"], key)))
         fold_cost = completion_cost(response)
@@ -735,7 +745,11 @@ def _merge_results(
             timeout_seconds=self.config.get("timeout", 120),
             max_retries_per_timeout=self.config.get("max_retries_per_timeout", 2),
         )
-        merged_output = parse_llm_response(response)[0]
+        merged_output = parse_llm_response(
+            response,
+            self.config["output"]["schema"],
+            manually_fix_errors=self.manually_fix_errors,
+        )[0]
         merged_output.update(dict(zip(self.config["reduce_key"], key)))
         merge_cost = completion_cost(response)
         end_time = time.time()
@@ -844,7 +858,11 @@ def _batch_reduce(
 
         item_cost += completion_cost(response)
 
-        output = parse_llm_response(response)[0]
+        output = parse_llm_response(
+            response,
+            self.config["output"]["schema"],
+            manually_fix_errors=self.manually_fix_errors,
+        )[0]
         output.update(dict(zip(self.config["reduce_key"], key)))
 
         if validate_output(self.config, output, self.console):

diff --git a/docetl/operations/resolve.py b/docetl/operations/resolve.py
@@ -63,7 +63,10 @@ def compare_pair(
         timeout_seconds=timeout_seconds,
         max_retries_per_timeout=max_retries_per_timeout,
     )
-    output = parse_llm_response(response)[0]
+    output = parse_llm_response(
+        response,
+        {"is_match": "bool"},
+    )[0]
     return output["is_match"], completion_cost(response)
 
 
@@ -199,11 +202,11 @@ def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
         blocking_keys = self.config.get("blocking_keys", [])
         blocking_threshold = self.config.get("blocking_threshold")
         blocking_conditions = self.config.get("blocking_conditions", [])
+        if self.status:
+            self.status.stop()
 
         if not blocking_threshold and not blocking_conditions:
             # Prompt the user for confirmation
-            if self.status:
-                self.status.stop()
             if not Confirm.ask(
                 f"[yellow]Warning: No blocking keys or conditions specified. "
                 f"This may result in a large number of comparisons. "
@@ -212,9 +215,6 @@ def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
             ):
                 raise ValueError("Operation cancelled by user.")
 
-            if self.status:
-                self.status.start()
-
         input_schema = self.config.get("input", {}).get("schema", {})
         if not blocking_keys:
             # Set them to all keys in the input data
@@ -413,7 +413,11 @@ def process_cluster(cluster):
                         "max_retries_per_timeout", 2
                     ),
                 )
-                reduction_output = parse_llm_response(reduction_response)[0]
+                reduction_output = parse_llm_response(
+                    reduction_response,
+                    self.config["output"]["schema"],
+                    manually_fix_errors=self.manually_fix_errors,
+                )[0]
                 reduction_cost = completion_cost(reduction_response)
 
                 if validate_output(self.config, reduction_output, self.console):
@@ -467,4 +471,7 @@ def process_cluster(cluster):
         )
         self.console.log(f"Self-join selectivity: {true_match_selectivity:.4f}")
 
+        if self.status:
+            self.status.start()
+
         return results, total_cost