Add ability to save intermediates
shreyashankar committed Sep 14, 2024
1 parent 17e069e commit b31d6bf
Showing 12 changed files with 1,347 additions and 322 deletions.
52 changes: 52 additions & 0 deletions docetl/runner.py
@@ -45,6 +45,23 @@ def __init__(self, yaml_file: str, max_threads: int = None):
        self.console = Console()
        self.status = None
        self.datasets = {}

        self.intermediate_dir = self.config["pipeline"]["output"].get(
            "intermediate_dir"
        )

        # Check if output path is correctly formatted as JSON
        output_path = self.config.get("pipeline", {}).get("output", {}).get("path")
        if output_path:
            if not output_path.lower().endswith(".json"):
                raise ValueError(
                    f"Output path '{output_path}' is not a JSON file. Please provide a path ending with '.json'."
                )
        else:
            raise ValueError(
                "No output path specified in the configuration. Please provide an output path ending with '.json' in the configuration."
            )

        self.syntax_check()

def syntax_check(self):
@@ -57,6 +74,10 @@ def syntax_check(self):
        Raises:
            ValueError: If any operation fails the syntax check.
        """
        self.console.log(
            "[yellow]Performing syntax check on all operations...[/yellow]"
        )

        for operation_config in self.config["operations"]:
            operation = operation_config["name"]
            operation_type = operation_config["type"]
@@ -218,8 +239,39 @@ def execute_step(
description=f"Operation [cyan]{operation_name}[/cyan] completed. Cost: [green]${cost:.2f}[/green]",
)

# Checkpoint after each operation
if self.intermediate_dir:
self._save_checkpoint(step["name"], operation_name, input_data)

return input_data, total_cost

    def _save_checkpoint(self, step_name: str, operation_name: str, data: List[Dict]):
        """
        Save a checkpoint of the current data after an operation.

        This method creates a JSON file containing the current state of the data
        after an operation has been executed. The checkpoint is saved in a directory
        structure that reflects the step and operation names.

        Args:
            step_name (str): The name of the current step in the pipeline.
            operation_name (str): The name of the operation that was just executed.
            data (List[Dict]): The current state of the data to be checkpointed.

        Note:
            The checkpoint is saved only if a checkpoint directory has been specified
            when initializing the DSLRunner.
        """
        checkpoint_path = os.path.join(
            self.intermediate_dir, step_name, f"{operation_name}.json"
        )
        os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
        with open(checkpoint_path, "w") as f:
            json.dump(data, f)
        self.console.log(
            f"[green]Checkpoint saved for operation '{operation_name}' in step '{step_name}' at {checkpoint_path}[/green]"
        )


if __name__ == "__main__":
    runner = DSLRunner("workloads/medical/map_opt.yaml")
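For reference, a minimal `pipeline.output` block that satisfies the new validation in `__init__` — the path must end in `.json`, and `intermediate_dir` is optional (the file and directory names here are illustrative):

```yaml
pipeline:
  output:
    type: file
    path: medication_summaries.json # must end in .json, or __init__ raises ValueError
    intermediate_dir: intermediate_results # optional; enables per-operation checkpoints
```
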
26 changes: 26 additions & 0 deletions docs/api-reference/cli.md
@@ -0,0 +1,26 @@
::: docetl.cli.run
    options:
        show_root_heading: true
        heading_level: 3
        show_if_no_docstring: false
        docstring_options:
            ignore_init_summary: false
            trim_doctest_flags: true

::: docetl.cli.build
    options:
        show_root_heading: true
        heading_level: 3
        show_if_no_docstring: false
        docstring_options:
            ignore_init_summary: false
            trim_doctest_flags: true

::: docetl.cli.clear_cache
    options:
        show_root_heading: true
        heading_level: 3
        show_if_no_docstring: false
        docstring_options:
            ignore_init_summary: false
            trim_doctest_flags: true
2 changes: 1 addition & 1 deletion docs/concepts/schemas.md
@@ -10,7 +10,7 @@ In docetl, schemas play an important role in defining the structure of output fr

!!! tip "Schema Simplicity"

Empirically, we've observed that **the more complex the output schema is, the worse the quality of the output tends to be**. Keep your schemas as simple as possible for better results.
We've observed that **the more complex the output schema is, the worse the quality of the output tends to be**. Keep your schemas as simple as possible for better results.

## Defining Schemas

37 changes: 23 additions & 14 deletions docs/execution/running-pipelines.md
@@ -9,7 +9,7 @@ In this section, we will walk through an example of a complex medical informatio
3. **Unnesting**: The extracted medication list is unnested to process each medication individually.
4. **Medication Resolution**: Similar medication names are resolved to standardize the entries. _Note: If you are unsure about the optimal configuration for this operation, you can skip this step and move on to the optimizer section (covered in a later part of this documentation)._
5. **Summary Generation**: For each unique medication, the pipeline generates a summary of side effects and therapeutic uses based on information from all relevant transcripts.
6. **Output**: The final summaries are saved to a JSON file.
6. **Output**: The final summaries are saved to a JSON file. If you provide an intermediate directory in your configuration, the outputs of each operation will be saved to this directory. This allows you to inspect the results of individual steps in the pipeline and can be useful for debugging or analyzing the pipeline's progress.

Now, let's look at the detailed configuration for this pipeline:

@@ -94,6 +94,7 @@ Now, let's look at the detailed configuration for this pipeline:
output:
  path: medication_summaries.json
  type: file
  intermediate_dir: intermediate_results # This is optional, but if you want to inspect intermediate results, you can specify a checkpoint directory
steps:
  - input: transcripts
    name: medical_info_extraction
@@ -115,7 +116,7 @@ Ensure your pipeline configuration includes all the required components as descr
- Default model
- Datasets
- Operations
- Pipeline specification (steps and output)
- Pipeline specification (steps and output configuration)

Once you have your pipeline configuration ready, you can execute it using the `docetl run` command if you're confident that this pipeline is suitable for your task and data. This is typically the case when your documents are relatively small and your task is straightforward.

@@ -150,18 +151,26 @@ Here are some additional notes to help you get the most out of your pipeline:
# ... rest of the operation configuration
```

- **Caching**: Docetl caches the results of operations by default. This means that if you run the same operation on the same data multiple times, the results will be retrieved from the cache rather than being recomputed. You can clear the cache by running `docetl clear-cache`.
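
  For example (assuming a pipeline file named `pipeline.yaml`; the file name is illustrative):

  ```bash
  docetl run pipeline.yaml # first run: operations are computed and cached
  docetl run pipeline.yaml # second run: cached results are reused
  docetl clear-cache       # clear the cache to force recomputation
  ```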

- **The `run` Function**: The main entry point for running a pipeline is the `run` function in `docetl/cli.py`. Here's a description of its parameters and functionality:

  ::: docetl.cli.run
      handler: python
      options:
          members:
              - run
          show_root_full_path: true
          show_root_toc_entry: true
          show_root_heading: true
          show_source: false
          show_name: true

- **Raw Object Output**: We have not implemented this yet, but we are working on it. Once it lands, the `--write-raw-objects` flag will give you access to the raw objects produced by the pipeline for debugging or further processing.

- **Intermediate Output**: If you provide an intermediate directory in your configuration, the outputs of each operation will be saved to this directory. This allows you to inspect the results of individual steps in the pipeline and can be useful for debugging or analyzing the pipeline's progress. Set the `intermediate_dir` parameter in your pipeline's output configuration to specify the directory where intermediate results should be saved; e.g.,

```yaml
pipeline:
output:
type: file
path: ...
intermediate_dir: intermediate_results
```
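
As a sketch of the resulting on-disk layout: `_save_checkpoint` writes each operation's output to `<intermediate_dir>/<step_name>/<operation_name>.json`. Using the step name from the example above and hypothetical operation names:

```
intermediate_results/
└── medical_info_extraction/
    ├── extract_medications.json
    └── summarize_side_effects.json
```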
2 changes: 0 additions & 2 deletions docs/index.md
@@ -26,5 +26,3 @@ docetl is the ideal choice when you're looking to **maximize correctness and out
- You're unsure how to best express your task to maximize LLM accuracy
- You're working with long documents that don't fit into a single prompt or are too lengthy for effective LLM reasoning
- You have validation criteria and want tasks to automatically retry when the validation fails

docetl is particularly well-suited for complex document processing tasks. It excels in analyzing large corpora of legal documents, processing extensive medical records, conducting in-depth social science research, and investigating complex datasets in journalism. These scenarios benefit greatly from docetl's ability to handle intricate data processing workflows efficiently and accurately.
3 changes: 2 additions & 1 deletion mkdocs.yml
@@ -18,8 +18,8 @@ nav:
- Installation: installation.md
- Tutorial: tutorial.md
- Core Concepts:
- Operators: concepts/operators.md
- Schemas: concepts/schemas.md
- Pipelines: concepts/pipelines.md
- Optimization: concepts/optimization.md
- LLM-Powered Operators:
@@ -42,6 +42,7 @@ nav:
# - Performance Tuning: advanced/performance-tuning.md
- API Reference:
- docetl: api-reference/docetl.md
- docetl.cli: api-reference/cli.md
- docetl.operations: api-reference/operations.md
- docetl.optimizers: api-reference/optimizers.md
- Cookbook:
3 changes: 1 addition & 2 deletions todos.md
@@ -80,9 +80,8 @@ TODO:
- [ ] Filter optimizer
- [x] Extend map optimizer to support filter
- [ ] Train an embedding classifier for filter
- [ ] Support model pools
- [ ] Support passing expectations
- [ ] Write intermediates to disk
- [x] Write intermediates to disk
- [ ] Support order by
- [ ] Track traces
- [ ] Reduce operations: eagerly process merges to prevent against stragglers/tail latencies in folds?

3 files with large diffs are not rendered by default.
