Fix split syntax

ucbepic · Sep 19, 2024 · f6d40af · f6d40af
1 parent f98147c
commit f6d40af
Show file tree

Hide file tree

Showing 9 changed files with 20 additions and 19 deletions.
diff --git a/docetl/operations/split.py b/docetl/operations/split.py
@@ -39,23 +39,20 @@ def syntax_check(self) -> None:
 
         if self.config["method"] == "token_count":
             if (
-                not isinstance(self.config["method_kwargs"]["token_count"], int)
-                or self.config["method_kwargs"]["token_count"] <= 0
+                not isinstance(self.config["method_kwargs"]["num_tokens"], int)
+                or self.config["method_kwargs"]["num_tokens"] <= 0
             ):
-                raise ValueError("'token_count' must be a positive integer")
+                raise ValueError("'num_tokens' must be a positive integer")
         elif self.config["method"] == "delimiter":
             if not isinstance(self.config["method_kwargs"]["delimiter"], str):
                 raise ValueError("'delimiter' must be a string")
 
-        if "model" in self.config and not isinstance(self.config["model"], str):
-            raise TypeError("'model' in configuration must be a string")
-
     def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
         split_key = self.config["split_key"]
         method = self.config["method"]
         method_kwargs = self.config["method_kwargs"]
         encoder = tiktoken.encoding_for_model(
-            self.config.get("model", self.default_model)
+            self.config["method_kwargs"].get("model", self.default_model)
         )
         results = []
         cost = 0.0
@@ -68,7 +65,7 @@ def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
             doc_id = str(uuid.uuid4())
 
             if method == "token_count":
-                token_count = method_kwargs["token_count"]
+                token_count = method_kwargs["num_tokens"]
                 tokens = encoder.encode(content)
 
                 for chunk_num, i in enumerate(

diff --git a/docetl/optimizers/map_optimizer/operation_creators.py b/docetl/optimizers/map_optimizer/operation_creators.py
@@ -66,7 +66,7 @@ def create_split_map_gather_operations(
             "name": split_name,
             "split_key": split_key,
             "method": "token_count",
-            "method_kwargs": {"token_count": chunk_size},
+            "method_kwargs": {"num_tokens": chunk_size},
         }
         pipeline.append(split_config)
 

diff --git a/docs/operators/split.md b/docs/operators/split.md
@@ -21,8 +21,8 @@ Here's an example of using the Split operation to divide customer support transc
   split_key: transcript
   method: token_count
   method_kwargs:
-    token_count: 500
-  model: gpt-4o-mini
+    num_tokens: 500
+    model: gpt-4o-mini
 ```
 
 This Split operation processes long customer support transcripts:
@@ -44,7 +44,7 @@ Note that chunks will not overlap in content.
   - For "delimiter" method: `delimiter` (string) to use for splitting.
   - For "token_count" method: `token_count` (integer) specifying the maximum number of tokens per chunk.
 
-### Optional Parameters
+### Optional Parameters in `method_kwargs`
 
 | Parameter             | Description                                                                     | Default                       |
 | --------------------- | ------------------------------------------------------------------------------- | ----------------------------- |

diff --git a/mkdocs.yml b/mkdocs.yml
@@ -8,8 +8,8 @@ site_url: https://docetl.com/
 # extra:
 #   docs_url: /docs
 
-repo_url: https://github.com/shreyashankar/docetl
-repo_name: shreyashankar/docetl
+repo_url: https://github.com/ucbepic/docetl
+repo_name: ucbepic/docetl
 remote_branch: gh-pages
 nav:
   - Home:

diff --git a/tests/test_basic.py b/tests/test_basic.py
@@ -309,7 +309,7 @@ def split_config():
         "type": "split",
         "split_key": "content",
         "method": "token_count",
-        "method_kwargs": {"token_count": 4},
+        "method_kwargs": {"num_tokens": 4},
         "name": "split_doc",
     }
 

diff --git a/tests/test_eugene.py b/tests/test_eugene.py
@@ -81,7 +81,7 @@ def extract_themes_config():
 
 @pytest.fixture
 def unnest_themes_config():
-    return {"type": "unnest", "unnest_key": "theme"}
+    return {"type": "unnest", "unnest_key": "theme", "name": "unnest_themes"}
 
 
 @pytest.fixture

diff --git a/tests/test_split.py b/tests/test_split.py
@@ -21,7 +21,7 @@ def split_config():
         "type": "split",
         "split_key": "content",
         "method": "token_count",
-        "method_kwargs": {"token_count": 10},
+        "method_kwargs": {"num_tokens": 10},
         "name": "split_doc",
     }
 

diff --git a/tests/test_synth_gather.py b/tests/test_synth_gather.py
@@ -174,7 +174,7 @@ def test_split_map_gather(sample_data):
         "type": "split",
         "split_key": "content",
         "method": "token_count",
-        "method_kwargs": {"token_count": 100},
+        "method_kwargs": {"num_tokens": 100},
         "name": "split_doc",
     }
 

diff --git a/tests/test_synth_resolve.py b/tests/test_synth_resolve.py
@@ -87,7 +87,11 @@ def test_synth_resolve(config_yaml):
         for op in step["operations"]:
             if op.startswith("synthesized_resolve_"):
                 synthesized_resolve_found = True
-                synthesized_op = optimizer.optimized_config["operations"][op]
+                synthesized_op = [
+                    operation
+                    for operation in optimizer.optimized_config["operations"]
+                    if operation["name"] == op
+                ]
 
                 # Check if the synthesized operation has the correct properties
                 assert synthesized_op["type"] == "resolve"