Skip to content

Commit

Permalink
Fix split syntax
Browse files Browse the repository at this point in the history
  • Loading branch information
shreyashankar committed Sep 19, 2024
1 parent f98147c commit f6d40af
Show file tree
Hide file tree
Showing 9 changed files with 20 additions and 19 deletions.
13 changes: 5 additions & 8 deletions docetl/operations/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,20 @@ def syntax_check(self) -> None:

if self.config["method"] == "token_count":
if (
not isinstance(self.config["method_kwargs"]["token_count"], int)
or self.config["method_kwargs"]["token_count"] <= 0
not isinstance(self.config["method_kwargs"]["num_tokens"], int)
or self.config["method_kwargs"]["num_tokens"] <= 0
):
raise ValueError("'token_count' must be a positive integer")
raise ValueError("'num_tokens' must be a positive integer")
elif self.config["method"] == "delimiter":
if not isinstance(self.config["method_kwargs"]["delimiter"], str):
raise ValueError("'delimiter' must be a string")

if "model" in self.config and not isinstance(self.config["model"], str):
raise TypeError("'model' in configuration must be a string")

def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
split_key = self.config["split_key"]
method = self.config["method"]
method_kwargs = self.config["method_kwargs"]
encoder = tiktoken.encoding_for_model(
self.config.get("model", self.default_model)
self.config["method_kwargs"].get("model", self.default_model)
)
results = []
cost = 0.0
Expand All @@ -68,7 +65,7 @@ def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
doc_id = str(uuid.uuid4())

if method == "token_count":
token_count = method_kwargs["token_count"]
token_count = method_kwargs["num_tokens"]
tokens = encoder.encode(content)

for chunk_num, i in enumerate(
Expand Down
2 changes: 1 addition & 1 deletion docetl/optimizers/map_optimizer/operation_creators.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def create_split_map_gather_operations(
"name": split_name,
"split_key": split_key,
"method": "token_count",
"method_kwargs": {"token_count": chunk_size},
"method_kwargs": {"num_tokens": chunk_size},
}
pipeline.append(split_config)

Expand Down
6 changes: 3 additions & 3 deletions docs/operators/split.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ Here's an example of using the Split operation to divide customer support transc
split_key: transcript
method: token_count
method_kwargs:
token_count: 500
model: gpt-4o-mini
num_tokens: 500
model: gpt-4o-mini
```
This Split operation processes long customer support transcripts:
Expand All @@ -44,7 +44,7 @@ Note that chunks will not overlap in content.
- For "delimiter" method: `delimiter` (string) to use for splitting.
- For "token_count" method: `num_tokens` (integer) specifying the maximum number of tokens per chunk.

### Optional Parameters
### Optional Parameters in `method_kwargs`

| Parameter | Description | Default |
| --------------------- | ------------------------------------------------------------------------------- | ----------------------------- |
Expand Down
4 changes: 2 additions & 2 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ site_url: https://docetl.com/
# extra:
# docs_url: /docs

repo_url: https://github.com/shreyashankar/docetl
repo_name: shreyashankar/docetl
repo_url: https://github.com/ucbepic/docetl
repo_name: ucbepic/docetl
remote_branch: gh-pages
nav:
- Home:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ def split_config():
"type": "split",
"split_key": "content",
"method": "token_count",
"method_kwargs": {"token_count": 4},
"method_kwargs": {"num_tokens": 4},
"name": "split_doc",
}

Expand Down
2 changes: 1 addition & 1 deletion tests/test_eugene.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def extract_themes_config():

@pytest.fixture
def unnest_themes_config():
return {"type": "unnest", "unnest_key": "theme"}
return {"type": "unnest", "unnest_key": "theme", "name": "unnest_themes"}


@pytest.fixture
Expand Down
2 changes: 1 addition & 1 deletion tests/test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def split_config():
"type": "split",
"split_key": "content",
"method": "token_count",
"method_kwargs": {"token_count": 10},
"method_kwargs": {"num_tokens": 10},
"name": "split_doc",
}

Expand Down
2 changes: 1 addition & 1 deletion tests/test_synth_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def test_split_map_gather(sample_data):
"type": "split",
"split_key": "content",
"method": "token_count",
"method_kwargs": {"token_count": 100},
"method_kwargs": {"num_tokens": 100},
"name": "split_doc",
}

Expand Down
6 changes: 5 additions & 1 deletion tests/test_synth_resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,11 @@ def test_synth_resolve(config_yaml):
for op in step["operations"]:
if op.startswith("synthesized_resolve_"):
synthesized_resolve_found = True
synthesized_op = optimizer.optimized_config["operations"][op]
synthesized_op = [
operation
for operation in optimizer.optimized_config["operations"]
if operation["name"] == op
]

# Check if the synthesized operation has the correct properties
assert synthesized_op["type"] == "resolve"
Expand Down

0 comments on commit f6d40af

Please sign in to comment.