feat: make Result more usefull (#39)
- added a few tests too
jjmachan authored Jun 9, 2023
1 parent 48ae599 commit 1e07768
Showing 5 changed files with 92 additions and 20 deletions.
3 changes: 3 additions & 0 deletions Makefile
@@ -28,3 +28,6 @@ run-ci: format lint type ## Running all CI checks
run-benchmarks: ## Run benchmarks
@echo "Running benchmarks..."
@cd $(GIT_ROOT)/tests/benchmarks && python benchmark.py
test: ## Run tests
@echo "Running tests..."
@pytest tests/unit
64 changes: 60 additions & 4 deletions src/ragas/evaluation.py
@@ -26,9 +26,46 @@ def get_evaluation_mode(ds: Dataset):

def evaluate(
dataset: Dataset,
metrics: list[Metric],
metrics: list[Metric] | None = None,
) -> Result:
""" """
"""
Run the evaluation on the dataset with different metrics

Parameters
----------
dataset : Dataset[question: list[str], contexts: list[list[str]], answer: list[str]]
    The dataset in the ragas format which the metrics will use to score the RAG
    pipeline with.
metrics : list[Metric], optional
    List of metrics to use for evaluation. If not provided, ragas will run the
    evaluation on the best set of metrics to give a complete view.

Returns
-------
result : Result
    Result object containing the scores of each metric. You can use this to do
    analysis later. If the top 3 metrics are provided, it also returns the
    `ragas_score` for the entire pipeline.

Examples
--------
The basic usage is as follows:
```
from ragas import evaluate

>>> dataset
Dataset({
    features: ['question', 'ground_truths', 'answer', 'contexts'],
    num_rows: 30
})
>>> result = evaluate(dataset)
>>> print(result)
{'ragas_score': 0.860, 'context_relevancy': 0.817, 'factuality': 0.892,
 'answer_relevancy': 0.874}
```
"""
if dataset is None:
raise ValueError("Provide dataset!")

@@ -37,6 +74,11 @@ def evaluate(

# TODO: check if all the metrics are compatible with the evaluation mode

if metrics is None:
from ragas.metrics import answer_relevancy, context_relevancy, factuality

metrics = [answer_relevancy, context_relevancy, factuality]

# run the evaluation on dataset with different metrics
# initialize all the models in the metrics
[m.init_model() for m in metrics]
@@ -45,12 +87,14 @@ def evaluate(
for metric in metrics:
scores.append(metric.score(dataset).select_columns(metric.name))

return Result(concatenate_datasets(scores, axis=1))
return Result(scores=concatenate_datasets(scores, axis=1), dataset=dataset)


@dataclass
class Result(dict):
scores: Dataset
dataset: Dataset | None = None
ragas_score: float | None = None

def __post_init__(self):
values = []
@@ -77,5 +121,17 @@ def describe(self):
}
return description

def to_pandas(self, batch_size: int | None = None, batched: bool = False):
if self.dataset is None:
raise ValueError("dataset is not provided for the results class")
assert self.scores.shape[0] == self.dataset.shape[0]
result_ds = concatenate_datasets([self.dataset, self.scores], axis=1)

return result_ds.to_pandas(batch_size=batch_size, batched=batched)

def __repr__(self) -> str:
return super().__repr__()
scores = self.copy()
ragas_score = scores.pop("ragas_score")
score_strs = [f"'ragas_score': {ragas_score:0.3f}"]
score_strs.extend([f"'{k}': {v:0.3f}" for k, v in scores.items()])
return "{" + ", ".join(score_strs) + "}"
32 changes: 17 additions & 15 deletions src/ragas/metrics/base.py
@@ -14,6 +14,21 @@
from datasets import Dataset


def make_batches(total_size: int, batch_size: int) -> list[range]:
"""
Take a total size and batch size and return a list of ranges for the batches
"""
tail = total_size % batch_size
num_batches = floor(total_size / batch_size)
batches = [
range(i, i + batch_size) for i in range(0, batch_size * num_batches, batch_size)
]
if tail != 0:
batches.append(range(batch_size * num_batches, batch_size * num_batches + tail))

return batches


@dataclass
class Metric(ABC):
@property
@@ -40,18 +55,5 @@ def init_model():
def score(self: t.Self, dataset: Dataset) -> Dataset:
...

def get_batches(self, dataset_size: int):
tail = dataset_size % self.batch_size
num_batches = floor(dataset_size / self.batch_size)
batches = [
range(i, i + self.batch_size)
for i in range(0, self.batch_size * num_batches, self.batch_size)
]
if tail != 0:
batches.append(
range(
self.batch_size * num_batches, self.batch_size * num_batches + tail
)
)

return batches
def get_batches(self, dataset_size: int) -> list[range]:
return make_batches(dataset_size, self.batch_size)
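For reference, a quick worked example of the extracted `make_batches` helper; the values follow directly from its definition above:

```
from ragas.metrics.base import make_batches

# 11 rows with batch_size 5 -> two full batches plus a tail batch of 1
print(make_batches(total_size=11, batch_size=5))
# [range(0, 5), range(5, 10), range(10, 11)]

# 10 rows divide evenly, so there is no tail batch
print(make_batches(total_size=10, batch_size=5))
# [range(0, 5), range(5, 10)]
```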
2 changes: 1 addition & 1 deletion src/ragas/metrics/factual.py
@@ -62,7 +62,7 @@ class Factuality(Metric):

@property
def name(self):
return "NLI_score"
return "factuality"

def init_model(self: t.Self):
pass
11 changes: 11 additions & 0 deletions tests/unit/test_metric.py
@@ -0,0 +1,11 @@
import pytest

from ragas.metrics.base import make_batches


@pytest.mark.parametrize(
"batch_size, total_size, len_expected", [(5, 10, 2), (5, 11, 3), (5, 9, 2)]
)
def test_make_batches(batch_size, total_size, len_expected):
batches = make_batches(total_size, batch_size)
assert len(batches) == len_expected
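
The new test only checks the number of batches; a possible follow-up (a sketch, not part of this commit) would also assert that the batches cover every index exactly once:

```
import pytest

from ragas.metrics.base import make_batches


@pytest.mark.parametrize("batch_size, total_size", [(5, 10), (5, 11), (5, 9)])
def test_make_batches_covers_all_indices(batch_size, total_size):
    # every index from 0 to total_size - 1 should appear exactly once, in order
    batches = make_batches(total_size, batch_size)
    covered = [i for batch in batches for i in batch]
    assert covered == list(range(total_size))
```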
