Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve

Nothing yet.

## [0.4.0] - 2025-01-10
### Added
- Quality tagging system for generated records
- `generate_quality_tags()` method in `JudgeResponse` to automatically tag samples based on judge scores
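- Tags include an overall quality level (`high_quality`, `medium_quality`, or `low_quality`) and dimension-specific tags: `excellent_tool_selection`/`poor_tool_selection`, `excellent_arguments`/`poor_arguments`, and `high_clarity`/`low_clarity`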
- Configurable thresholds for quality classification
- `quality_tags` field automatically populated in generated records

## [0.3.0] - 2025-01-10
### Added
- Hugging Face dataset integration utilities in `examples/nano_tool_calling_v1/`
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ toolsgen = ["prompts/*.txt"]

[project]
name = "toolsgen"
version = "0.3.0"
version = "0.4.0"
description = "Generate tool-calling datasets from OpenAI-compatible tool specs"
readme = "README.md"
requires-python = ">=3.9"
Expand Down
3 changes: 3 additions & 0 deletions src/toolsgen/core/record_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def _build_record(
"model": role_config.judge.model,
"temperature": role_config.judge.temperature,
}
quality_tags = []
try:
judge_result = judge_tool_calls(
client=judge_client,
Expand All @@ -65,6 +66,7 @@ def _build_record(
max_tokens=role_config.judge.max_tokens,
)
judge_dict.update(judge_result.to_dict())
quality_tags = judge_result.generate_quality_tags()
except Exception:
pass # Continue without judge data

Expand All @@ -76,6 +78,7 @@ def _build_record(
assistant_calls=tool_calls,
problem_metadata={"generated": True, "user_request": user_request},
judge=judge_dict,
quality_tags=quality_tags,
tools_metadata={"num_tools": len(tools)},
)

Expand Down
52 changes: 52 additions & 0 deletions src/toolsgen/judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,58 @@ def to_dict(self) -> Dict[str, Any]:
"rubric_version": "0.1.0",
}

def generate_quality_tags(
    self,
    high_quality_threshold: float = 0.9,
    medium_quality_threshold: float = 0.7,
    excellent_dimension_pct: float = 0.875,
    poor_dimension_pct: float = 0.5,
) -> List[str]:
    """Generate quality tags based on scores.

    Exactly one overall tag is always produced from ``self.score``; each of
    the three dimensions then contributes at most one tag (none when the
    dimension falls between its "poor" and "excellent" cut-offs).

    Args:
        high_quality_threshold: Overall score threshold for high_quality tag (default: 0.9).
        medium_quality_threshold: Overall score threshold for medium_quality tag (default: 0.7).
        excellent_dimension_pct: Fraction of a dimension's max score at or above
            which the excellent/high tag is added (default: 0.875 = 87.5%).
        poor_dimension_pct: Fraction of a dimension's max score below which the
            poor/low tag is added (default: 0.5 = 50%).

    Returns:
        List of quality tags describing the sample, overall tag first, then
        tool selection, arguments and clarity in that order.
    """
    # Thresholds are products of binary floats: 0.4 * 0.875 evaluates to
    # 0.35000000000000003, so a score of exactly 0.35 would fail a bare
    # ``>=`` check. A small absolute tolerance makes a score that sits on a
    # boundary count as having reached it.
    tolerance = 1e-9

    def _reaches(value: float, threshold: float) -> bool:
        """Return True when value is at or above threshold, within tolerance."""
        return value >= threshold - tolerance

    # Overall quality
    if _reaches(self.score, high_quality_threshold):
        tags = ["high_quality"]
    elif _reaches(self.score, medium_quality_threshold):
        tags = ["medium_quality"]
    else:
        tags = ["low_quality"]

    # Dimension-specific tags (based on percentage of max possible score).
    # Each row: (score, max possible score, excellent tag, poor tag).
    dimensions = (
        (self.tool_relevance, 0.4, "excellent_tool_selection", "poor_tool_selection"),
        (self.argument_quality, 0.4, "excellent_arguments", "poor_arguments"),
        (self.clarity, 0.2, "high_clarity", "low_clarity"),
    )
    for value, max_score, excellent_tag, poor_tag in dimensions:
        if _reaches(value, max_score * excellent_dimension_pct):
            tags.append(excellent_tag)
        elif not _reaches(value, max_score * poor_dimension_pct):
            tags.append(poor_tag)

    return tags


def judge_tool_calls(
client: OpenAI,
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.