diff --git a/.changeset/good-falcons-applaud.md b/.changeset/good-falcons-applaud.md
new file mode 100644
index 00000000..7b431a5c
--- /dev/null
+++ b/.changeset/good-falcons-applaud.md
@@ -0,0 +1,5 @@
+---
+"@empiricalrun/scorer": patch
+---
+
+feat: py-script scorer has a simpler return type
diff --git a/docs/scoring/python.mdx b/docs/scoring/python.mdx
index b10e69ce..0e4ff4fd 100644
--- a/docs/scoring/python.mdx
+++ b/docs/scoring/python.mdx
@@ -10,7 +10,8 @@ the `scorers` section of the configuration. The `path` key should be the path to
   "scorers": [
     {
       "type": "py-script",
-      "path": "eval.py"
+      "path": "score.py",
+      "name": "my-custom-scorer"
     }
   ]
 ```
@@ -21,17 +22,32 @@ In the script, you need to define an `evaluate` method, with the following signa
   - output: dict with key `value` to get the output value (string) and key `metadata` to get metadata (dict)
   - inputs: dict of key-value pairs from the dataset sample
 - **Returns**
-  - List of results: each result is dict with score (0 or 1), message (string) and name (string)
+  - List of results: each result is a dict with score (number between 0 and 1), message (optional, string) and name (optional, string)
 
-```python
+```python score.py
 def evaluate(output, inputs):
-    # ...
+    model_response = output["value"]
+    metadata = output["metadata"]
+    # ... score the model response
+    return {
+        "score": 1,
+        "message": "Optional reasoning for this score"
+    }
+```
+
+## Multiple scores
+
+It is possible for the Python script to return an array of scores. Use `name` to distinguish
+between them.
+
+```python score.py
+def evaluate(output, inputs):
+    model_response = output["value"]
+    metadata = output["metadata"]
+    # ... score the model response
     return [
-        {
-            "score": 1,
-            "message": "Reason for this score",
-            "name": "name-for-this-scorer"
-        }
+        { "score": 1, "name": "syntax-score" },
+        { "score": 0, "name": "semantic-score", "message": "failure reason"}
     ]
 ```
diff --git a/examples/humaneval/score.py b/examples/humaneval/score.py
index 7891c303..1c1bb374 100644
--- a/examples/humaneval/score.py
+++ b/examples/humaneval/score.py
@@ -22,4 +22,4 @@ def evaluate(output, inputs):
     except Exception as e:
         passed, reason = 0, repr(e)
 
-    return [{"score": passed, "message": reason, "name": "unit-tests"}]
+    return {"score": passed, "message": reason}
diff --git a/packages/scorer/src/provider/deterministic/script.test.ts b/packages/scorer/src/provider/deterministic/script.test.ts
index ccc9b22c..4f84efc6 100644
--- a/packages/scorer/src/provider/deterministic/script.test.ts
+++ b/packages/scorer/src/provider/deterministic/script.test.ts
@@ -1,8 +1,9 @@
-import { DatasetSample } from "@empiricalrun/types";
+import { DatasetSample, RunOutput, Scorer } from "@empiricalrun/types";
 import { expect, test } from "vitest";
 
 import { scoreWithPythonScript } from "./script";
+import score from "../../index";
 
-const humanEval = {
+const humanEvalSample = {
   output: "def truncate_number(number):\n    integer_part = int(number)\n    decimal_part = number - integer_part\n    return decimal_part",
   test: "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate(3.5) == 0.5\n    assert abs(candidate(1.33) - 0.33) < 1e-6\n    assert abs(candidate(123.456) - 0.456) < 1e-6\n",
@@ -11,94 +12,88 @@
 // Using relative path to use the python script from HumanEval example
 // Tests run out of the $root/packages/evals directory
-const scriptPath = "../../examples/humaneval/score.py";
+const humanEvalScriptPath =
+  "../../examples/humaneval/score.py";
 
-test("script scorer works for a correct humaneval output", async () => {
+test("py-script scorer works for a correct humaneval output", async () => {
   const sample: DatasetSample = {
     id: "1",
     inputs: {
-      test: humanEval.test,
-      entry_point: humanEval.funcName,
+      test: humanEvalSample.test,
+      entry_point: humanEvalSample.funcName,
     },
   };
-
   expect(
     await scoreWithPythonScript({
       sample,
       output: {
-        value: humanEval.output,
+        value: humanEvalSample.output,
       },
       config: {
         type: "py-script",
-        path: scriptPath,
+        name: "unit-tests",
+        path: humanEvalScriptPath,
       },
     }),
-  ).toStrictEqual([
-    {
-      score: 1,
-      name: "unit-tests",
-      message: "Tests passed",
-    },
-  ]);
+  ).toStrictEqual({
+    score: 1,
+    name: "unit-tests",
+    message: "Tests passed",
+  });
 });
 
-test("script scorer works for a incorrect humaneval output", async () => {
+test("py-script scorer works for a incorrect humaneval output", async () => {
   const sample: DatasetSample = {
     id: "1",
     inputs: {
-      test: humanEval.test,
-      entry_point: humanEval.funcName + "123", // wrong function name
+      test: humanEvalSample.test,
+      entry_point: humanEvalSample.funcName + "123", // wrong function name
    },
   };
-
   expect(
     await scoreWithPythonScript({
       sample,
-      output: { value: humanEval.output },
+      output: { value: humanEvalSample.output },
      config: {
         type: "py-script",
-        path: scriptPath,
+        name: "unit-tests",
+        path: humanEvalScriptPath,
      },
    }),
-  ).toStrictEqual([
-    {
-      score: 0,
-      name: "unit-tests",
-      message: "NameError(\"name 'truncate_number123' is not defined\")",
-    },
-  ]);
+  ).toStrictEqual({
+    score: 0,
+    name: "unit-tests",
+    message: "NameError(\"name 'truncate_number123' is not defined\")",
+  });
 });
 
-test("script scorer works for a humaneval output that has backticks", async () => {
+test("py-script scorer works for a humaneval output that has backticks", async () => {
   const sample: DatasetSample = {
     id: "1",
     inputs: {
-      test: humanEval.test,
-      entry_point: humanEval.funcName,
+      test: humanEvalSample.test,
+      entry_point: humanEvalSample.funcName,
     },
   };
-
   expect(
     await scoreWithPythonScript({
       sample,
       output: {
-        value: "```python\n" + humanEval.output + "\n```",
+        value: "```python\n" + humanEvalSample.output + "\n```",
       },
       config: {
         type: "py-script",
-        path: scriptPath,
+        name: "unit-tests",
+        path: humanEvalScriptPath,
       },
     }),
-  ).toStrictEqual([
-    {
-      score: 1,
-      name: "unit-tests",
-      message: "Tests passed",
-    },
-  ]);
+  ).toStrictEqual({
+    score: 1,
+    name: "unit-tests",
+    message: "Tests passed",
+  });
 });
 
-test("script scorer times out a long running script", async () => {
+test("py-script scorer times out a long running script", async () => {
   const sample: DatasetSample = {
     id: "0",
     inputs: {},
@@ -122,7 +117,7 @@ test("script scorer times out a long running script", async () => {
   ]);
 }, 21000);
 
-test("script scorer works with a python script that throws", async () => {
+test("py-script scorer works with a python script that throws", async () => {
   const sample: DatasetSample = {
     id: "0",
     inputs: {},
@@ -136,7 +131,6 @@ test("script scorer works with a python script that throws", async () => {
       path: scriptWithError,
     },
   });
-
   expect(score).toEqual(
     expect.objectContaining({
       score: 0,
@@ -144,3 +138,46 @@ message: expect.stringContaining("SyntaxError"),
     }),
   );
 });
+
+test("py-script scorer works when returning array of scores", async () => {
+  const sample: DatasetSample = {
+    id: "0",
+    inputs: {},
+  };
+  const output: RunOutput = {
+    value: "output",
+  };
+  const scorer: Scorer = {
+    type: "py-script",
+    path: __dirname + "/test-assets/returns_array_of_scores.py",
+    name: "score-name",
+  };
+  const result = await score({ sample, output, scorers: [scorer] });
+  expect(result.length).toBe(2);
+  expect(result[0].score).toBe(1);
+  expect(result[0].name).toBe("score_1");
+  expect(result[0].message).toBe(undefined);
+  expect(result[1].score).toBe(0);
+  expect(result[1].name).toBe("score_2");
+  expect(result[1].message).toBe("why this failed");
+});
+
+test("py-script scorer works when returning single score without name", async () => {
+  const sample: DatasetSample = {
+    id: "0",
+    inputs: {},
+  };
+  const output: RunOutput = {
+    value: "output",
+  };
+  const scorer: Scorer = {
+    type: "py-script",
+    path: __dirname + "/test-assets/returns_single_score.py",
+    name: "single-score",
+  };
+  const result = await score({ sample, output, scorers: [scorer] });
+  expect(result.length).toBe(1);
+  expect(result[0].score).toBe(1);
+  expect(result[0].message).toBe(undefined);
+  expect(result[0].name).toBe("single-score");
+});
diff --git a/packages/scorer/src/provider/deterministic/script.ts b/packages/scorer/src/provider/deterministic/script.ts
index e3bdafaf..5b022c21 100644
--- a/packages/scorer/src/provider/deterministic/script.ts
+++ b/packages/scorer/src/provider/deterministic/script.ts
@@ -92,6 +92,13 @@ export const scoreWithPythonScript: ScoringFn = async ({
     });
   });
 
-  const result = runOutput[runOutput.length - 1];
-  return JSON.parse(result!);
+  const rawResult = runOutput[runOutput.length - 1];
+  let result = JSON.parse(rawResult!);
+  if (!Array.isArray(result)) {
+    result = {
+      name: config.name || name,
+      ...result,
+    };
+  }
+  return result;
 };
diff --git a/packages/scorer/src/provider/deterministic/test-assets/returns_array_of_scores.py b/packages/scorer/src/provider/deterministic/test-assets/returns_array_of_scores.py
new file mode 100644
index 00000000..0ef1dd89
--- /dev/null
+++ b/packages/scorer/src/provider/deterministic/test-assets/returns_array_of_scores.py
@@ -0,0 +1,5 @@
+def evaluate(output, inputs):
+    return [
+        {"score": 1, "name": "score_1"},
+        {"score": 0, "name": "score_2", "message": "why this failed"},
+    ]
diff --git a/packages/scorer/src/provider/deterministic/test-assets/returns_single_score.py b/packages/scorer/src/provider/deterministic/test-assets/returns_single_score.py
new file mode 100644
index 00000000..3331cea3
--- /dev/null
+++ b/packages/scorer/src/provider/deterministic/test-assets/returns_single_score.py
@@ -0,0 +1,2 @@
+def evaluate(output, inputs):
+    return { "score": 1 }
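
For reference, a minimal standalone `score.py` sketch that exercises the simplified return contract documented above (a single dict with `score` and an optional `message`) could look like the following; the `expected` input key and the substring check are illustrative assumptions, not part of this change:

```python
# Hypothetical scorer script following the documented evaluate(output, inputs) contract.
# "expected" is an assumed dataset input key, used here only for illustration.
def evaluate(output, inputs):
    model_response = output["value"]
    expected = inputs.get("expected", "")
    if expected and expected in model_response:
        # Single score as a plain dict; the runner fills in the scorer's configured name.
        return {"score": 1}
    return {"score": 0, "message": f"expected substring {expected!r} not found"}


if __name__ == "__main__":
    # Quick local check of the scoring logic.
    print(evaluate({"value": "hello world", "metadata": {}}, {"expected": "hello"}))
```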