2,186 changes: 1,080 additions & 1,106 deletions docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb

Large diffs are not rendered by default.

352 changes: 268 additions & 84 deletions docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb

Large diffs are not rendered by default.

24 changes: 9 additions & 15 deletions docs/source/references/evals_reference/index.md
@@ -114,23 +114,17 @@ pprint(response)
 simpleqa_dataset_id = "huggingface::simpleqa"

 _ = client.datasets.register(
-    dataset_id=simpleqa_dataset_id,
-    provider_id="huggingface",
-    url={"uri": "https://huggingface.co/datasets/llamastack/simpleqa"},
-    metadata={
-        "path": "llamastack/simpleqa",
-        "split": "train",
-    },
-    dataset_schema={
-        "input_query": {"type": "string"},
-        "expected_answer": {"type": "string"},
-        "chat_completion_input": {"type": "chat_completion_input"},
-    },
+    purpose="eval/messages-answer",
+    source={
+        "type": "uri",
+        "uri": "huggingface://datasets/llamastack/simpleqa?split=train",
+    },
+    dataset_id=simpleqa_dataset_id,
 )

-eval_rows = client.datasetio.get_rows_paginated(
+eval_rows = client.datasets.iterrows(
     dataset_id=simpleqa_dataset_id,
-    rows_in_page=5,
+    limit=5,
 )
 ```

@@ -143,7 +137,7 @@ client.benchmarks.register(

 response = client.eval.evaluate_rows(
     benchmark_id="meta-reference::simpleqa",
-    input_rows=eval_rows.rows,
+    input_rows=eval_rows.data,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     benchmark_config={
         "eval_candidate": {
@@ -191,7 +185,7 @@ agent_config = {

 response = client.eval.evaluate_rows(
     benchmark_id="meta-reference::simpleqa",
-    input_rows=eval_rows.rows,
+    input_rows=eval_rows.data,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     benchmark_config={
         "eval_candidate": {
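Taken together, the documentation diff above is a three-part migration: dataset registration now takes a `purpose` plus a `source` URI (replacing `provider_id`, `url`, `metadata`, and `dataset_schema`), row fetching moves from `client.datasetio.get_rows_paginated(rows_in_page=...)` to `client.datasets.iterrows(limit=...)`, and rows come back on `.data` instead of `.rows`. A minimal end-to-end sketch of the new flow; the base URL, model ID, and the candidate's sampling parameters are illustrative assumptions, not part of this diff:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local server

# New-style registration: purpose + source URI instead of provider_id/url/schema.
client.datasets.register(
    purpose="eval/messages-answer",
    source={
        "type": "uri",
        "uri": "huggingface://datasets/llamastack/simpleqa?split=train",
    },
    dataset_id="huggingface::simpleqa",
)

# Row access: datasets.iterrows(limit=...) replaces datasetio.get_rows_paginated.
eval_rows = client.datasets.iterrows(
    dataset_id="huggingface::simpleqa",
    limit=5,
)

# Rows now live on .data (previously .rows).
response = client.eval.evaluate_rows(
    benchmark_id="meta-reference::simpleqa",
    input_rows=eval_rows.data,
    scoring_functions=["llm-as-judge::405b-simpleqa"],
    benchmark_config={
        "eval_candidate": {
            "type": "model",  # usual model-candidate shape; contents not shown in the diff
            "model": "meta-llama/Llama-3.1-405B-Instruct",  # illustrative model ID
            "sampling_params": {"strategy": {"type": "greedy"}},  # illustrative
        },
    },
)
```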
7 changes: 3 additions & 4 deletions llama_stack/distribution/ui/page/evaluations/native_eval.py
@@ -166,11 +166,10 @@ def run_evaluation_3():
     eval_candidate = st.session_state["eval_candidate"]

     dataset_id = benchmarks[selected_benchmark].dataset_id
-    rows = llama_stack_api.client.datasetio.iterrows(
+    rows = llama_stack_api.client.datasets.iterrows(
         dataset_id=dataset_id,
-        rows_in_page=-1,
     )
-    total_rows = len(rows.rows)
+    total_rows = len(rows.data)
     # Add number of examples control
     num_rows = st.number_input(
         "Number of Examples to Evaluate",
@@ -195,7 +194,7 @@ def run_evaluation_3():
     if st.button("Run Evaluation"):
         progress_text = "Running evaluation..."
         progress_bar = st.progress(0, text=progress_text)
-        rows = rows.rows
+        rows = rows.data
         if num_rows < total_rows:
             rows = rows[:num_rows]
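The UI change also drops the explicit `rows_in_page=-1`: the new call omits `limit` altogether and slices `rows.data` client-side. A standalone sketch of that fetch-then-slice pattern, assuming that omitting `limit` returns all rows (which is what replacing `rows_in_page=-1` with no argument implies):

```python
def load_eval_rows(client, dataset_id: str, num_rows: int):
    """Fetch-then-slice, as in the Streamlit page above.

    Assumes omitting `limit` returns every row, matching the removed
    rows_in_page=-1 behavior.
    """
    rows = client.datasets.iterrows(dataset_id=dataset_id)
    total_rows = len(rows.data)
    # Cap at what the user asked for (num_rows comes from st.number_input).
    return rows.data[:num_rows] if num_rows < total_rows else rows.data
```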
4 changes: 2 additions & 2 deletions llama_stack/providers/inline/eval/meta_reference/eval.py
@@ -92,11 +92,11 @@ async def run_eval(
         validate_dataset_schema(dataset_def.dataset_schema, get_valid_schemas(Api.eval.value))
         all_rows = await self.datasetio_api.iterrows(
             dataset_id=dataset_id,
-            rows_in_page=(-1 if benchmark_config.num_examples is None else benchmark_config.num_examples),
+            limit=(-1 if benchmark_config.num_examples is None else benchmark_config.num_examples),
         )
         res = await self.evaluate_rows(
             benchmark_id=benchmark_id,
-            input_rows=all_rows.rows,
+            input_rows=all_rows.data,
             scoring_functions=scoring_functions,
             benchmark_config=benchmark_config,
         )
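Note that `limit` inherits the old `rows_in_page` sentinel: `-1` means fetch every row, while a non-None `benchmark_config.num_examples` caps the run. A tiny sketch of that mapping in isolation:

```python
from typing import Optional

def to_limit(num_examples: Optional[int]) -> int:
    """Translate an optional example cap into iterrows' limit argument,
    keeping -1 as the "fetch everything" sentinel."""
    return -1 if num_examples is None else num_examples

assert to_limit(None) == -1
assert to_limit(10) == 10
```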
@@ -330,11 +330,11 @@ async def _setup_data(
         async def fetch_rows(dataset_id: str):
             return await self.datasetio_api.iterrows(
                 dataset_id=dataset_id,
-                rows_in_page=-1,
+                limit=-1,
             )

         all_rows = await fetch_rows(dataset_id)
-        rows = all_rows.rows
+        rows = all_rows.data

         await validate_input_dataset_schema(
             datasets_api=self.datasets_api,
4 changes: 2 additions & 2 deletions llama_stack/providers/inline/scoring/basic/scoring.py
@@ -86,10 +86,10 @@ async def score_batch(

         all_rows = await self.datasetio_api.iterrows(
             dataset_id=dataset_id,
-            rows_in_page=-1,
+            limit=-1,
         )
         res = await self.score(
-            input_rows=all_rows.rows,
+            input_rows=all_rows.data,
             scoring_functions=scoring_functions,
         )
         if save_results_dataset:
4 changes: 2 additions & 2 deletions llama_stack/providers/inline/scoring/braintrust/braintrust.py
@@ -169,9 +169,9 @@ async def score_batch(

         all_rows = await self.datasetio_api.iterrows(
             dataset_id=dataset_id,
-            rows_in_page=-1,
+            limit=-1,
         )
-        res = await self.score(input_rows=all_rows.rows, scoring_functions=scoring_functions)
+        res = await self.score(input_rows=all_rows.data, scoring_functions=scoring_functions)
         if save_results_dataset:
             # TODO: persist and register dataset on to server for reading
             # self.datasets_api.register_dataset()
4 changes: 2 additions & 2 deletions llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
@@ -74,10 +74,10 @@ async def score_batch(

         all_rows = await self.datasetio_api.iterrows(
             dataset_id=dataset_id,
-            rows_in_page=-1,
+            limit=-1,
         )
         res = await self.score(
-            input_rows=all_rows.rows,
+            input_rows=all_rows.data,
             scoring_functions=scoring_functions,
         )
         if save_results_dataset:
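All three inline scoring providers (basic, braintrust, llm_as_judge) end up with the same `score_batch` skeleton after this change: pull every row with `limit=-1`, then hand `.data` to `score()`. A condensed sketch of that shared shape; the signature is abbreviated relative to the real classes:

```python
async def score_batch(self, dataset_id, scoring_functions, save_results_dataset=False):
    # Shared shape of the inline scoring providers after this PR.
    all_rows = await self.datasetio_api.iterrows(
        dataset_id=dataset_id,
        limit=-1,  # -1 keeps the old "fetch every row" behavior
    )
    res = await self.score(
        input_rows=all_rows.data,
        scoring_functions=scoring_functions,
    )
    if save_results_dataset:
        # Persisting results back as a dataset is still a TODO upstream.
        raise NotImplementedError("save_results_dataset is not supported yet")
    return res
```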
8 changes: 4 additions & 4 deletions tests/integration/eval/test_eval.py
@@ -20,11 +20,11 @@ def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
     response = llama_stack_client.datasets.list()
     assert any(x.identifier == "test_dataset_for_eval" for x in response)

-    rows = llama_stack_client.datasetio.get_rows_paginated(
+    rows = llama_stack_client.datasets.iterrows(
         dataset_id="test_dataset_for_eval",
-        rows_in_page=3,
+        limit=3,
     )
-    assert len(rows.rows) == 3
+    assert len(rows.data) == 3

     scoring_functions = [
         scoring_fn_id,
@@ -40,7 +40,7 @@ def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):

     response = llama_stack_client.eval.evaluate_rows(
         benchmark_id=benchmark_id,
-        input_rows=rows.rows,
+        input_rows=rows.data,
         scoring_functions=scoring_functions,
         benchmark_config={
             "eval_candidate": {
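For datasets too large to pull in one call, `iterrows` is named for iteration; a paging sketch follows, with the caveat that the `start_index` parameter and the `next_start_index` response field are assumptions inferred from the API's name and are not confirmed anywhere in this diff:

```python
def iter_all_rows(client, dataset_id: str, page_size: int = 100):
    """Yield every row of a dataset, page by page.

    Assumption: iterrows accepts start_index and its response carries
    next_start_index (None when exhausted); neither appears in the diff above.
    """
    start_index = 0
    while start_index is not None:
        page = client.datasets.iterrows(
            dataset_id=dataset_id,
            start_index=start_index,
            limit=page_size,
        )
        yield from page.data
        start_index = page.next_start_index
```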