From 8049ed630092e69d91ace250b9949104fd121040 Mon Sep 17 00:00:00 2001
From: Yogesh Garg
Date: Fri, 19 Apr 2024 14:16:25 -0700
Subject: [PATCH 1/7] Spring cleaning circular dependency, before

---
 evals/cli/oaieval.py                          |  1 +
 evals/elsuite/already_said_that/eval.py       |  2 +-
 evals/elsuite/ballots/eval.py                 |  2 +-
 evals/elsuite/basic/match_with_solvers.py     |  2 +-
 evals/elsuite/bluff/eval.py                   |  2 +-
 evals/elsuite/bugged_tools/eval.py            |  2 +-
 evals/elsuite/cant_do_that_anymore/eval.py    |  2 +-
 evals/elsuite/error_recovery/eval.py          |  2 +-
 evals/elsuite/function_deduction/eval.py      |  2 +-
 evals/elsuite/hr_ml_agent_bench/eval.py       |  2 +-
 evals/elsuite/identifying_variables/eval.py   |  2 +-
 evals/elsuite/incontext_rl/eval.py            |  2 +-
 evals/elsuite/make_me_pay/eval.py             |  2 +-
 evals/elsuite/multistep_web_tasks/eval.py     |  2 +-
 evals/elsuite/sandbagging/mmlu_eval.py        |  2 +-
 evals/elsuite/sandbagging/sandbagging_eval.py |  2 +-
 evals/elsuite/schelling_point/eval.py         |  2 +-
 evals/elsuite/self_prompting/eval.py          |  2 +-
 evals/elsuite/skill_acquisition/eval.py       |  2 +-
 evals/elsuite/steganography/eval.py           |  2 +-
 evals/elsuite/text_compression/eval.py        |  2 +-
 evals/elsuite/track_the_stat/eval.py          |  2 +-
 evals/elsuite/twenty_questions/eval.py        |  2 +-
 evals/eval.py                                 | 11 +++++++++--
 24 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/evals/cli/oaieval.py b/evals/cli/oaieval.py
index a8927dda4c..b842d4359c 100644
--- a/evals/cli/oaieval.py
+++ b/evals/cli/oaieval.py
@@ -215,6 +215,7 @@ def to_number(x: str) -> Union[int, float, str]:
     logger.info(_purple(f"Run started: {run_url}"))
 
     eval_class = registry.get_class(eval_spec)
+    print(f"using spec, found a class. {eval_class=} {eval_spec=}")
     eval: Eval = eval_class(
         completion_fns=completion_fn_instances,
         seed=args.seed,
diff --git a/evals/elsuite/already_said_that/eval.py b/evals/elsuite/already_said_that/eval.py
index 2fa495c702..c6d78b1100 100644
--- a/evals/elsuite/already_said_that/eval.py
+++ b/evals/elsuite/already_said_that/eval.py
@@ -115,7 +115,7 @@ def _conversation_loop(
 
         return convo_metrics
 
-    def run(self, recorder: RecorderBase):
+    def _run_impl(self, recorder: RecorderBase):
         samples = self._get_samples()
         self.eval_all_samples(recorder, samples)
         logged_metrics: list[dict] = recorder.get_metrics()
diff --git a/evals/elsuite/ballots/eval.py b/evals/elsuite/ballots/eval.py
index 67c44567b6..96cb0c0ec9 100644
--- a/evals/elsuite/ballots/eval.py
+++ b/evals/elsuite/ballots/eval.py
@@ -158,7 +158,7 @@ def query(
         else:
             assert False, "Invalid influence direction"
 
-    def run(self, recorder):
+    def _run_impl(self, recorder):
         proposals = self.get_samples()
 
         # possibly write all prompts to disk instead of dynamically generating them
diff --git a/evals/elsuite/basic/match_with_solvers.py b/evals/elsuite/basic/match_with_solvers.py
index 2feb57658d..b2ae8ee29d 100644
--- a/evals/elsuite/basic/match_with_solvers.py
+++ b/evals/elsuite/basic/match_with_solvers.py
@@ -65,7 +65,7 @@ def eval_sample(self, solver: Solver, sample: Any, *_):
             expected=[ideal, ideal.capitalize()],
         )
 
-    def run(self, recorder):
+    def _run_impl(self, recorder):
         samples = self.get_samples()
 
         if self.shuffle:
diff --git a/evals/elsuite/bluff/eval.py b/evals/elsuite/bluff/eval.py
index 29d7e9cd92..84b8d25063 100644
--- a/evals/elsuite/bluff/eval.py
+++ b/evals/elsuite/bluff/eval.py
@@ -76,7 +76,7 @@ def _get_player_info(self, player: Player) -> str:
         else:
             return type(player).__name__
 
-    def run(self, recorder: evals.record.Recorder) -> dict[str, Union[float, int]]:
+    def _run_impl(self, recorder: evals.record.Recorder) -> dict[str, Union[float, int]]:
         samples = list(range(self.n_samples))
         self.eval_all_samples(recorder, samples)
         metrics = recorder.get_metrics()
diff --git a/evals/elsuite/bugged_tools/eval.py b/evals/elsuite/bugged_tools/eval.py
index 38cbccd594..cd12b98403 100644
--- a/evals/elsuite/bugged_tools/eval.py
+++ b/evals/elsuite/bugged_tools/eval.py
@@ -109,7 +109,7 @@ def eval_sample(self, solver: Solver, sample: Any, rng: random.Random):
 
         evals.record.record_metrics(**metrics)  # type: ignore (evals.record badly hinted)
 
-    def run(self, recorder: evals.record.Recorder) -> dict[str, Union[float, int]]:  # type: ignore (evals.record badly hinted)
+    def _run_impl(self, recorder: evals.record.Recorder) -> dict[str, Union[float, int]]:  # type: ignore (evals.record badly hinted)
         samples = self.get_samples()
         self.eval_all_samples(recorder, samples)
 
diff --git a/evals/elsuite/cant_do_that_anymore/eval.py b/evals/elsuite/cant_do_that_anymore/eval.py
index 0ca6df5b0b..c6d1ea5402 100644
--- a/evals/elsuite/cant_do_that_anymore/eval.py
+++ b/evals/elsuite/cant_do_that_anymore/eval.py
@@ -112,7 +112,7 @@ def get_solver_pred(
 
         evals.record.record_metrics(**metrics)
 
-    def run(self, recorder: RecorderBase) -> dict[str, Union[float, int]]:
+    def _run_impl(self, recorder: RecorderBase) -> dict[str, Union[float, int]]:
         if self.diagonal_variation:
             self.samples_jsonl = get_diagonal_dataset_path(
                 registry_path=self._prefix_registry_path("")
diff --git a/evals/elsuite/error_recovery/eval.py b/evals/elsuite/error_recovery/eval.py
index 89512179fe..f1b3e44795 100644
--- a/evals/elsuite/error_recovery/eval.py
+++ b/evals/elsuite/error_recovery/eval.py
@@ -217,7 +217,7 @@ def _get_answer(
         answer = self._extract_final_answer(solver=solver, task_state=task_state, sample=sample)
         return answer
 
-    def run(self, recorder: evals.record.Recorder):
+    def _run_impl(self, recorder: evals.record.Recorder):
         samples = self.get_samples()
         self.eval_all_samples(recorder, samples)
 
diff --git a/evals/elsuite/function_deduction/eval.py b/evals/elsuite/function_deduction/eval.py
index 6542852153..e3afa7556c 100644
--- a/evals/elsuite/function_deduction/eval.py
+++ b/evals/elsuite/function_deduction/eval.py
@@ -148,7 +148,7 @@ def eval_sample(self, solver: Solver, sample: Sample, rng: random.Random):
             complexity=sample.complexity,
         )
 
-    def run(self, recorder: evals.record.Recorder):
+    def _run_impl(self, recorder: evals.record.Recorder):
         samples = self.get_samples()
 
         # Add copies according to self.n_repeat
diff --git a/evals/elsuite/hr_ml_agent_bench/eval.py b/evals/elsuite/hr_ml_agent_bench/eval.py
index 611be17790..d34fd195ef 100644
--- a/evals/elsuite/hr_ml_agent_bench/eval.py
+++ b/evals/elsuite/hr_ml_agent_bench/eval.py
@@ -97,7 +97,7 @@ def eval_sample(self, solver: Solver, raw_sample: dict, rng: Random) -> None:
             model_score_humanrelative=result.model_score_humanrelative,
         )
 
-    def run(self, recorder: Recorder) -> dict:
+    def _run_impl(self, recorder: Recorder) -> dict:
         samples = self.get_samples()
         self.eval_all_samples(recorder, samples)
         metrics = recorder.get_metrics()
diff --git a/evals/elsuite/identifying_variables/eval.py b/evals/elsuite/identifying_variables/eval.py
index 31b3b743e0..c221ec686f 100644
--- a/evals/elsuite/identifying_variables/eval.py
+++ b/evals/elsuite/identifying_variables/eval.py
@@ -87,7 +87,7 @@ def eval_sample(self, solver: Solver, sample: Sample, rng: random.Random) -> Non
             num_not_ctrl=num_not_ctrl,
         )
 
-    def run(self, recorder: RecorderBase) -> Dict[str, float]:
+    def _run_impl(self, recorder: RecorderBase) -> Dict[str, float]:
         samples: List[Dict] = self._get_samples()
         self.rng.shuffle(samples)
         self.eval_all_samples(recorder, samples)
diff --git a/evals/elsuite/incontext_rl/eval.py b/evals/elsuite/incontext_rl/eval.py
index a1fac2101e..346930f5f5 100644
--- a/evals/elsuite/incontext_rl/eval.py
+++ b/evals/elsuite/incontext_rl/eval.py
@@ -147,7 +147,7 @@ def eval_sample(self, solver: Solver, sample: Any, rng: random.Random):
             episode_end_steps=ts.current_state.episode_end_steps,
         )
 
-    def run(self, recorder: evals.record.Recorder):
+    def _run_impl(self, recorder: evals.record.Recorder):
         samples = self.get_samples()
         for sample in samples:
             # Create environments and pass them to each thread via the sample
diff --git a/evals/elsuite/make_me_pay/eval.py b/evals/elsuite/make_me_pay/eval.py
index 9b2b8b1275..1012971f34 100644
--- a/evals/elsuite/make_me_pay/eval.py
+++ b/evals/elsuite/make_me_pay/eval.py
@@ -99,7 +99,7 @@ def eval_sample(self, solver: Solver, sample: Any, *_):
             mark_withdraw=results["mark_withdraw"],
         )
 
-    def run(self, recorder: RecorderBase):
+    def _run_impl(self, recorder: RecorderBase):
         # We don't actually need any samples, we just need to run the eval num_experiments times.
         experiment_ids = list(range(self.num_experiments))
         self.eval_all_samples(recorder, experiment_ids)
diff --git a/evals/elsuite/multistep_web_tasks/eval.py b/evals/elsuite/multistep_web_tasks/eval.py
index 2cd7289e76..0aacc2d391 100644
--- a/evals/elsuite/multistep_web_tasks/eval.py
+++ b/evals/elsuite/multistep_web_tasks/eval.py
@@ -47,7 +47,7 @@ def eval_sample(self, solver: Solver, sample: dict, rng: Any) -> None:
             trajectory_length=len(result.trajectory),
         )
 
-    def run(self, recorder: RecorderBase):
+    def _run_impl(self, recorder: RecorderBase):
         samples = self.get_samples()
         self.session.add_samples(samples)
         # with statement handles setting up docker containers and tearing them down on completion/error
diff --git a/evals/elsuite/sandbagging/mmlu_eval.py b/evals/elsuite/sandbagging/mmlu_eval.py
index ae421d8f62..68f9c55676 100644
--- a/evals/elsuite/sandbagging/mmlu_eval.py
+++ b/evals/elsuite/sandbagging/mmlu_eval.py
@@ -61,7 +61,7 @@ def eval_sample(
             extra_logging=extra_logging,
         )
 
-    def run(self, recorder: evals.record.Recorder):
+    def _run_impl(self, recorder: evals.record.Recorder):
         samples = self.get_samples()
         self.eval_all_samples(recorder, samples)
 
diff --git a/evals/elsuite/sandbagging/sandbagging_eval.py b/evals/elsuite/sandbagging/sandbagging_eval.py
index 675341a207..88cdfaf847 100644
--- a/evals/elsuite/sandbagging/sandbagging_eval.py
+++ b/evals/elsuite/sandbagging/sandbagging_eval.py
@@ -53,7 +53,7 @@ def eval_sample(self, solver: Solver, sample: Dict[str, Any], rng: random.Random
 
         self.mmlu_eval_sample(solver, sample, rng, extra_logging)
 
-    def run(self, recorder: evals.record.Recorder):
+    def _run_impl(self, recorder: evals.record.Recorder):
         metrics = {}
         achieved_accs = []
         for target, mmlu_eval in zip(self.target_accuracies, self.evals):
diff --git a/evals/elsuite/schelling_point/eval.py b/evals/elsuite/schelling_point/eval.py
index 46d5371af1..233971040b 100644
--- a/evals/elsuite/schelling_point/eval.py
+++ b/evals/elsuite/schelling_point/eval.py
@@ -75,7 +75,7 @@ def eval_sample(self, sample: Any, *_):
             is_runtime_error=False,
         )
 
-    def run(self, recorder: evals.record.Recorder) -> dict[str, Union[float, int]]:
+    def _run_impl(self, recorder: evals.record.Recorder) -> dict[str, Union[float, int]]:
 
         samples = self.get_samples()[0 : self.n_samples]
 
diff --git a/evals/elsuite/self_prompting/eval.py b/evals/elsuite/self_prompting/eval.py
index 7db858f5d4..90c9485170 100644
--- a/evals/elsuite/self_prompting/eval.py
+++ b/evals/elsuite/self_prompting/eval.py
@@ -177,7 +177,7 @@ def normalized_improvement(current, baseline):
         logger.info(f"Improvement scores: {improvement_scores}")
         return improvement_scores
 
-    def run(self, recorder: evals.record.Recorder) -> dict[str, Union[float, int]]:
+    def _run_impl(self, recorder: evals.record.Recorder) -> dict[str, Union[float, int]]:
         samples = self.get_samples()
 
         # Shuffle and limit samples
diff --git a/evals/elsuite/skill_acquisition/eval.py b/evals/elsuite/skill_acquisition/eval.py
index 52c770db7d..919cea30ed 100644
--- a/evals/elsuite/skill_acquisition/eval.py
+++ b/evals/elsuite/skill_acquisition/eval.py
@@ -186,7 +186,7 @@ def _eval_retrieval_sample(self, solver: Solver, sample: Dict, *_) -> Dict[str,
         }
         return out_obj
 
-    def run(self, recorder: evals.record.Recorder) -> dict[str, Union[float, int]]:
+    def _run_impl(self, recorder: evals.record.Recorder) -> dict[str, Union[float, int]]:
         samples = self.get_samples()
         self.rng.shuffle(samples)
         samples = samples[: self.n_samples] if self.n_samples is not None else samples
diff --git a/evals/elsuite/steganography/eval.py b/evals/elsuite/steganography/eval.py
index e25e1bc551..0eaa074109 100644
--- a/evals/elsuite/steganography/eval.py
+++ b/evals/elsuite/steganography/eval.py
@@ -65,7 +65,7 @@ def eval_sample(self, sample: Any, *_):
             rule_violated=results["rule_violated"],
         )
 
-    def run(self, recorder: RecorderBase):
+    def _run_impl(self, recorder: RecorderBase):
         samples = self.get_samples()
         self.eval_all_samples(recorder, samples)
         metrics = recorder.get_metrics()
diff --git a/evals/elsuite/text_compression/eval.py b/evals/elsuite/text_compression/eval.py
index d2a620941b..603367f008 100644
--- a/evals/elsuite/text_compression/eval.py
+++ b/evals/elsuite/text_compression/eval.py
@@ -46,7 +46,7 @@ def eval_sample(self, sample: Any, *_):
             semantic_distance=results["semantic_distance"],
         )
 
-    def run(self, recorder: RecorderBase):
+    def _run_impl(self, recorder: RecorderBase):
         samples = self.get_samples()
         self.eval_all_samples(recorder, samples)
         metrics = recorder.get_metrics()
diff --git a/evals/elsuite/track_the_stat/eval.py b/evals/elsuite/track_the_stat/eval.py
index d1ca65d719..b86de83c45 100644
--- a/evals/elsuite/track_the_stat/eval.py
+++ b/evals/elsuite/track_the_stat/eval.py
@@ -67,7 +67,7 @@ def _eval_sample(self, solver: Solver, capped_inf_list: list[int]) -> dict:
             "violation": violation,
         }
 
-    def run(self, recorder: RecorderBase):
+    def _run_impl(self, recorder: RecorderBase):
         samples = self._get_samples()
         self.eval_all_samples(recorder, samples)
         logged_metrics: list[dict] = recorder.get_metrics()
diff --git a/evals/elsuite/twenty_questions/eval.py b/evals/elsuite/twenty_questions/eval.py
index 3cb0d5c857..ddaa51076a 100644
--- a/evals/elsuite/twenty_questions/eval.py
+++ b/evals/elsuite/twenty_questions/eval.py
@@ -75,7 +75,7 @@ def eval_sample(self, solver: Solver, sample: Dict, rng: random.Random) -> Dict[
 
         return response
 
-    def run(self, recorder: Recorder) -> Dict[str, Union[float, int]]:
+    def _run_impl(self, recorder: Recorder) -> Dict[str, Union[float, int]]:
         samples = self.get_samples()
         self.rng.shuffle(samples)
         samples = samples[: self.n_samples] if self.n_samples else samples
diff --git a/evals/eval.py b/evals/eval.py
index cce0c75c3f..67a6623e23 100644
--- a/evals/eval.py
+++ b/evals/eval.py
@@ -20,6 +20,8 @@
 from .solvers.solver import Solver
 from .solvers.utils import maybe_wrap_with_compl_fn, maybe_wrap_with_solver
 
+import weave
+
 logger = logging.getLogger(__name__)
 
 
@@ -82,10 +84,15 @@ def completion_fn(self) -> CompletionFn:
         """Helper for more ergonomic access to a single CompletionFn."""
         return self.completion_fns[0]
 
-    @abc.abstractmethod
     def run(self, recorder: RecorderBase) -> Dict[str, float]:
         """Run the evaluation with the corresponding recorder."""
-        raise NotImplementedError()
+        print("Running eval", self.name)
+
+        # @weave.op()
+        def yovaluate() -> Dict[str, Any]:
+            return self._run_impl(recorder)
+
+        return yovaluate()
 
     async def async_eval_all_samples(
         self,
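Patch 1 is a textbook template-method refactor: every elsuite eval renames its `run` override to `_run_impl`, and the base class's `run` becomes a concrete wrapper that delegates to it, giving one choke point for cross-cutting instrumentation. A minimal sketch of the pattern outside the evals codebase (the class names, metric, and `recorder=None` stub below are illustrative, not the real API; only the `run`/`_run_impl` split mirrors the patch):

```python
# Template-method sketch: subclasses implement _run_impl; the base class's
# concrete run() is the single place to hang logging/tracing hooks.
import abc
from typing import Any, Dict


class Eval(abc.ABC):
    def run(self, recorder: Any) -> Dict[str, float]:
        # Cross-cutting hooks (tracing, logging) go here once,
        # instead of in every subclass's run().
        return self._run_impl(recorder)

    @abc.abstractmethod
    def _run_impl(self, recorder: Any) -> Dict[str, float]:
        """Subclasses override this instead of run()."""
        raise NotImplementedError


class MatchEval(Eval):
    def _run_impl(self, recorder: Any) -> Dict[str, float]:
        # Each elsuite eval's old run() body lands here unchanged.
        return {"accuracy": 1.0}


print(MatchEval().run(recorder=None))  # {'accuracy': 1.0}
```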
From 18bf1bf5df3500288b8ea0da77f7bfe1375a7685 Mon Sep 17 00:00:00 2001
From: Yogesh Garg
Date: Fri, 19 Apr 2024 14:16:37 -0700
Subject: [PATCH 2/7] Spring cleaning circular dependency, after

```
[2024-04-19 14:16:44,914] [registry.py:271] Loading registry from /Users/yo/code/oai-evals-2/evals/registry/evals
[2024-04-19 14:16:45,255] [registry.py:271] Loading registry from /Users/yo/.evals/evals
[2024-04-19 14:16:45,256] [oaieval.py:215] Run started: 240419211645BXWDMDYQ
using spec, found a class. eval_class=functools.partial(<class 'evals.elsuite.basic.match_with_solvers.MatchWithSolvers'>, samples_jsonl='theory_of_mind/tomi/test.jsonl', task_description='You will read a number of sentences describing a situation involving several people, as well as a question regarding the real or perceived location of an object. Your task is to answer the question based on the information in the sentences. Respond with the single word corresponding to the location.') eval_spec=EvalSpec(cls='evals.elsuite.basic.match_with_solvers:MatchWithSolvers', registry_path=PosixPath('/Users/yo/code/oai-evals-2/evals/registry'), args={'samples_jsonl': 'theory_of_mind/tomi/test.jsonl', 'task_description': 'You will read a number of sentences describing a situation involving several people, as well as a question regarding the real or perceived location of an object. Your task is to answer the question based on the information in the sentences. Respond with the single word corresponding to the location.'}, key='theory_of_mind.tomi', group='theory_of_mind')
Running eval theory_of_mind.tomi
[2024-04-19 14:16:45,262] [oaieval.py:276] Found 0/0 sampling events with usage data
Traceback (most recent call last):
  File "/opt/homebrew/bin/oaieval", line 10, in <module>
    sys.exit(main())
  File "/Users/yo/code/oai-evals-2/evals/cli/oaieval.py", line 305, in main
    run(args)
  File "/Users/yo/code/oai-evals-2/evals/cli/oaieval.py", line 229, in run
    recorder.record_final_report(result)
  File "/Users/yo/code/oai-evals-2/evals/record.py", line 369, in record_final_report
    f.write((jsondumps({"final_report": final_report, "run_id": self.run_spec.run_id}) + "\n").encode("utf-8"))
  File "/Users/yo/code/oai-evals-2/evals/data.py", line 218, in jsondumps
    return json.dumps(o, cls=EnhancedJSONEncoder, ensure_ascii=ensure_ascii, **kwargs)
  File "/opt/homebrew/Cellar/python@3.10/3.10.12_1/Frameworks/Python.framework/Versions/3.10/lib/python3.10/json/__init__.py", line 238, in dumps
    **kw).encode(obj)
  File "/opt/homebrew/Cellar/python@3.10/3.10.12_1/Frameworks/Python.framework/Versions/3.10/lib/python3.10/json/encoder.py", line 199, in encode
    chunks = self.iterencode(o, _one_shot=True)
  File "/opt/homebrew/Cellar/python@3.10/3.10.12_1/Frameworks/Python.framework/Versions/3.10/lib/python3.10/json/encoder.py", line 257, in iterencode
    return _iterencode(o, 0)
ValueError: Circular reference detected
```
---
 evals/eval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/eval.py b/evals/eval.py
index 67a6623e23..dad124f23a 100644
--- a/evals/eval.py
+++ b/evals/eval.py
@@ -88,7 +88,7 @@ def run(self, recorder: RecorderBase) -> Dict[str, float]:
         """Run the evaluation with the corresponding recorder."""
         print("Running eval", self.name)
 
-        # @weave.op()
+        @weave.op()
         def yovaluate() -> Dict[str, Any]:
             return self._run_impl(recorder)
 

From 022815f9afa871fab7d0b1d025e6c7bea769d907 Mon Sep 17 00:00:00 2001
From: Yogesh Garg
Date: Fri, 19 Apr 2024 14:26:18 -0700
Subject: [PATCH 3/7] wip

---
 evals/eval.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/evals/eval.py b/evals/eval.py
index dad124f23a..50dc5b667e 100644
--- a/evals/eval.py
+++ b/evals/eval.py
@@ -92,7 +92,10 @@ def run(self, recorder: RecorderBase) -> Dict[str, float]:
         def yovaluate() -> Dict[str, Any]:
             return self._run_impl(recorder)
 
-        return yovaluate()
+        res = yovaluate()
+
+        print("Got result for eval", self.name, f"res={res}")
+        return res
 
     async def async_eval_all_samples(
         self,
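The `ValueError: Circular reference detected` in patch 2 is `json.dumps` refusing to walk a structure that reaches itself; enabling `@weave.op()` evidently left such a cycle in the final report (the log does not show which object carries it). The failure mode is easy to reproduce in isolation:

```python
# Reproduces the error class from the PATCH 2/7 traceback with plain json.
# The actual cycle in the evals run came from the weave-wrapped result and
# is not shown in the log, so this only illustrates the mechanism.
import json

report = {"final_report": {"accuracy": 1.0}}
report["self"] = report  # the dict now contains itself

try:
    json.dumps(report)
except ValueError as err:
    print(err)  # -> Circular reference detected
```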
From a7af0db78e3665ad2cf58b670366864f1676c92d Mon Sep 17 00:00:00 2001
From: Yogesh Garg
Date: Fri, 19 Apr 2024 15:10:13 -0700
Subject: [PATCH 4/7] demo candidate 1

---
 evals/api.py                 |  2 ++
 evals/cli/oaieval.py         |  1 +
 evals/elsuite/basic/match.py |  2 +-
 evals/eval.py                |  5 ++++-
 evals/record.py              | 13 +++++++++++++
 5 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/evals/api.py b/evals/api.py
index bbb6b7c728..3520440792 100644
--- a/evals/api.py
+++ b/evals/api.py
@@ -52,6 +52,8 @@ def __call__(
         return DummyCompletionResult()
 
 
+# import weave
+# @weave.op()
 def record_and_check_match(
     prompt: Any,
     sampled: str,
diff --git a/evals/cli/oaieval.py b/evals/cli/oaieval.py
index b842d4359c..a9d00294e1 100644
--- a/evals/cli/oaieval.py
+++ b/evals/cli/oaieval.py
@@ -224,6 +224,7 @@ def to_number(x: str) -> Union[int, float, str]:
         registry=registry,
         **extra_eval_params,
     )
+    print(f"running eval object: {eval.run.__code__=}")
     result = eval.run(recorder)
     add_token_usage_to_result(result, recorder)
     recorder.record_final_report(result)
diff --git a/evals/elsuite/basic/match.py b/evals/elsuite/basic/match.py
index ac72f72b37..e9f347dc3d 100644
--- a/evals/elsuite/basic/match.py
+++ b/evals/elsuite/basic/match.py
@@ -55,7 +55,7 @@ def eval_sample(self, sample: Any, *_):
             expected=sample["ideal"],
         )
 
-    def run(self, recorder):
+    def _run_impl(self, recorder):
         samples = self.get_samples()
         self.eval_all_samples(recorder, samples)
         events = recorder.get_events("match")
diff --git a/evals/eval.py b/evals/eval.py
index 50dc5b667e..a14ff2ac62 100644
--- a/evals/eval.py
+++ b/evals/eval.py
@@ -84,14 +84,17 @@ def completion_fn(self) -> CompletionFn:
         """Helper for more ergonomic access to a single CompletionFn."""
         return self.completion_fns[0]
 
+    # @weave.op()
     def run(self, recorder: RecorderBase) -> Dict[str, float]:
         """Run the evaluation with the corresponding recorder."""
         print("Running eval", self.name)
+        weave.init("yovaluate")
+
 
         @weave.op()
         def yovaluate() -> Dict[str, Any]:
             return self._run_impl(recorder)
-
+        
         res = yovaluate()
 
         print("Got result for eval", self.name, f"res={res}")
diff --git a/evals/record.py b/evals/record.py
index 8e8ebe9ae6..8ed95816a0 100644
--- a/evals/record.py
+++ b/evals/record.py
@@ -25,6 +25,8 @@
 from evals.utils.misc import t
 from evals.utils.snowflake import SnowflakeConnection
 
+import weave
+
 logger = logging.getLogger(__name__)
 
 MIN_FLUSH_EVENTS = 100
@@ -184,6 +186,7 @@ def record_event(self, type, data=None, sample_id=None):
             self._flushes_started += 1
             self._flush_events_internal(events_to_write)
 
+    @weave.op()
     def record_match(self, correct: bool, *, expected=None, picked=None, sample_id=None, **extra):
         assert isinstance(
             correct, bool
@@ -199,6 +202,7 @@ def record_match(self, correct: bool, *, expected=None, picked=None, sample_id=N
         }
         self.record_event("match", data, sample_id=sample_id)
 
+    @weave.op()
    def record_embedding(self, prompt, embedding_type, sample_id=None, **extra):
         data = {
             "prompt": prompt,
@@ -207,6 +211,7 @@ def record_embedding(self, prompt, embedding_type, sample_id=None, **extra):
         }
         self.record_event("embedding", data, sample_id=sample_id)
 
+    @weave.op()
     def record_sampling(self, prompt, sampled, sample_id=None, **extra):
         data = {
             "prompt": prompt,
@@ -215,6 +220,7 @@ def record_sampling(self, prompt, sampled, sample_id=None, **extra):
         }
         self.record_event("sampling", data, sample_id=sample_id)
 
+    @weave.op()
     def record_function_call(self, name, arguments, return_value, sample_id=None, **extra):
         data = {
             "name": name,
@@ -224,6 +230,7 @@ def record_function_call(self, name, arguments, return_value, sample_id=None, **
         }
         self.record_event("function_call", data, sample_id=sample_id)
 
+    @weave.op()
     def record_cond_logp(self, prompt, completion, logp, sample_id=None, **extra):
         data = {
             "prompt": prompt,
@@ -233,6 +240,7 @@ def record_cond_logp(self, prompt, completion, logp, sample_id=None, **extra):
         }
         self.record_event("cond_logp", data, sample_id=sample_id)
 
+    @weave.op()
     def record_pick_option(self, prompt, options, picked, sample_id=None, **extra):
         data = {
             "prompt": prompt,
@@ -242,12 +250,15 @@ def record_pick_option(self, prompt, options, picked, sample_id=None, **extra):
         }
         self.record_event("pick_option", data, sample_id=sample_id)
 
+    @weave.op()
     def record_raw(self, data):
         self.record_event("raw_sample", data)
 
+    @weave.op()
     def record_metrics(self, **kwargs):
         self.record_event("metrics", kwargs)
 
+    @weave.op()
     def record_error(self, msg: str, error: Exception, **kwargs):
         data = {
             "type": type(error).__name__,
@@ -256,9 +267,11 @@ def record_error(self, msg: str, error: Exception, **kwargs):
         data.update(kwargs)
         self.record_event("error", data)
 
+    @weave.op()
     def record_extra(self, data, sample_id=None):
         self.record_event("extra", data, sample_id=sample_id)
 
+    @weave.op()
     def record_final_report(self, final_report: Any):
         logging.info(f"Final report: {final_report}. Not writing anywhere.")
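Patch 4 applies the standard weave pattern to the recorder: call `weave.init(<project>)` once per process, then any function decorated with `@weave.op()` gets its calls and inputs traced. A minimal standalone sketch (the stub body and argument values are illustrative; only the project name "yovaluate" and the decorator usage come from the patch, and actually logging anything requires the `weave` package plus W&B credentials):

```python
# Minimal sketch of the weave tracing pattern PATCH 4/7 applies to the
# recorder methods. The function body is a stand-in, not evals code.
import weave


@weave.op()
def record_match(correct: bool, expected=None, picked=None) -> None:
    # In evals this appends an event to the recorder's buffer;
    # here it only prints so the example stays self-contained.
    print(f"match={correct} expected={expected!r} picked={picked!r}")


if __name__ == "__main__":
    weave.init("yovaluate")  # project name taken from the patch
    record_match(True, expected=["Paris"], picked="Paris")
```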
From bbbf49a7a6a3dcaa6ae52fde5ecb212e1c3e8b5b Mon Sep 17 00:00:00 2001
From: Yogesh Garg
Date: Fri, 19 Apr 2024 15:39:33 -0700
Subject: [PATCH 5/7] update

---
 evals/api.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/evals/api.py b/evals/api.py
index 3520440792..811819a520 100644
--- a/evals/api.py
+++ b/evals/api.py
@@ -74,6 +74,7 @@ def record_and_check_match(
     Returns:
         The matched option or None if no match found.
     """
+
     if isinstance(expected, tuple):
         expected = list(expected)
     elif not isinstance(expected, list):
@@ -104,4 +105,14 @@ def record_and_check_match(
     result["expected"] = expected
     result["match"] = match
     record_match(match, expected=expected, picked=picked, sampled=sampled, options=options)
+
+    prompt_0_content = prompt[0] if len(prompt) > 0 else dict()
+    prompt_0_content = prompt_0_content.get("content", "")
+
+    import weave
+    @weave.op()
+    def row(prompt_0_content, sampled, expected, picked, match):
+        return
+    row(prompt_0_content, sampled, expected, picked, match)
+
     return picked

From 5ec093bc0333a3cfd7f15692893ccc2d19cc412c Mon Sep 17 00:00:00 2001
From: Yogesh Garg
Date: Fri, 19 Apr 2024 15:43:35 -0700
Subject: [PATCH 6/7] Apply suggestions from code review

---
 evals/cli/oaieval.py | 2 --
 evals/eval.py        | 1 -
 2 files changed, 3 deletions(-)

diff --git a/evals/cli/oaieval.py b/evals/cli/oaieval.py
index a9d00294e1..a8927dda4c 100644
--- a/evals/cli/oaieval.py
+++ b/evals/cli/oaieval.py
@@ -215,7 +215,6 @@ def to_number(x: str) -> Union[int, float, str]:
     logger.info(_purple(f"Run started: {run_url}"))
 
     eval_class = registry.get_class(eval_spec)
-    print(f"using spec, found a class. {eval_class=} {eval_spec=}")
     eval: Eval = eval_class(
         completion_fns=completion_fn_instances,
         seed=args.seed,
@@ -224,7 +223,6 @@ def to_number(x: str) -> Union[int, float, str]:
         registry=registry,
         **extra_eval_params,
     )
-    print(f"running eval object: {eval.run.__code__=}")
     result = eval.run(recorder)
     add_token_usage_to_result(result, recorder)
     recorder.record_final_report(result)
diff --git a/evals/eval.py b/evals/eval.py
index a14ff2ac62..42a559f6bf 100644
--- a/evals/eval.py
+++ b/evals/eval.py
@@ -84,7 +84,6 @@ def completion_fn(self) -> CompletionFn:
         """Helper for more ergonomic access to a single CompletionFn."""
         return self.completion_fns[0]
 
-    # @weave.op()
     def run(self, recorder: RecorderBase) -> Dict[str, float]:
         """Run the evaluation with the corresponding recorder."""
         print("Running eval", self.name)
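Patch 5's `row` helper is a deliberate no-op: decorating an empty function with `@weave.op()` and calling it once per match turns its *arguments* (first prompt message content, sampled text, expected, picked, match) into one traced row per sample, without touching the recorder's event pipeline. Patch 6 then drops the throwaway debug prints flagged in review. A sketch of the no-op-op trick in isolation (the chat-message prompt shape is inferred from `prompt[0].get("content", ...)` in the patch; the sample values are made up):

```python
# The no-op-op trick from PATCH 5/7: the decorated function does nothing,
# but each call shows up in the weave UI as a row of its inputs.
import weave

weave.init("yovaluate")  # assumed project name, as in PATCH 4/7


@weave.op()
def row(prompt_0_content, sampled, expected, picked, match):
    return  # intentionally empty: the traced inputs are the payload


prompt = [{"role": "system", "content": "Answer with one word."}]
prompt_0_content = prompt[0].get("content", "") if prompt else ""
row(prompt_0_content, sampled="Paris", expected=["Paris"], picked="Paris", match=True)
```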
From ec636fc22a3e0dc30e279c64c3463dc2363ccdc8 Mon Sep 17 00:00:00 2001
From: Yogesh Garg
Date: Fri, 19 Apr 2024 15:49:42 -0700
Subject: [PATCH 7/7] update

---
 evals/api.py  | 6 ++----
 evals/eval.py | 7 +------
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/evals/api.py b/evals/api.py
index 811819a520..a1e6628d3c 100644
--- a/evals/api.py
+++ b/evals/api.py
@@ -7,6 +7,8 @@
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Optional, Protocol, Union, runtime_checkable
 
+import weave
+
 from evals.prompt.base import OpenAICreateChatPrompt, OpenAICreatePrompt, Prompt
 from evals.record import record_match
 
@@ -52,8 +54,6 @@ def __call__(
         return DummyCompletionResult()
 
 
-# import weave
-# @weave.op()
 def record_and_check_match(
     prompt: Any,
     sampled: str,
@@ -74,7 +74,6 @@ def record_and_check_match(
     Returns:
         The matched option or None if no match found.
     """
-
     if isinstance(expected, tuple):
         expected = list(expected)
     elif not isinstance(expected, list):
@@ -109,7 +108,6 @@ def record_and_check_match(
     prompt_0_content = prompt[0] if len(prompt) > 0 else dict()
     prompt_0_content = prompt_0_content.get("content", "")
 
-    import weave
     @weave.op()
     def row(prompt_0_content, sampled, expected, picked, match):
         return
diff --git a/evals/eval.py b/evals/eval.py
index 42a559f6bf..8420333b2f 100644
--- a/evals/eval.py
+++ b/evals/eval.py
@@ -86,18 +86,13 @@ def completion_fn(self) -> CompletionFn:
 
     def run(self, recorder: RecorderBase) -> Dict[str, float]:
         """Run the evaluation with the corresponding recorder."""
-        print("Running eval", self.name)
-        weave.init("yovaluate")
 
 
         @weave.op()
         def yovaluate() -> Dict[str, Any]:
             return self._run_impl(recorder)
 
-        res = yovaluate()
-
-        print("Got result for eval", self.name, f"res={res}")
-        return res
+        return yovaluate()
 
     async def async_eval_all_samples(
         self,
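After patch 7 the series nets out to: `weave` imported at module top level, the review-flagged prints gone, `weave.init` no longer called inside `run` (where it now happens is not shown in this series), and a single traced closure around the eval run. The final shape of `Eval.run`, shown with a minimal stand-in for `_run_impl` so the snippet runs on its own; only `run()`'s body mirrors the patch:

```python
# Final shape of Eval.run after PATCH 7/7. The stand-in class and metric
# are illustrative; without a prior weave.init() the op simply executes
# untraced.
import weave
from typing import Any, Dict


class Eval:
    def _run_impl(self, recorder) -> Dict[str, Any]:
        return {"accuracy": 1.0}  # stand-in for a subclass implementation

    def run(self, recorder) -> Dict[str, float]:
        """Run the evaluation with the corresponding recorder."""

        @weave.op()
        def yovaluate() -> Dict[str, Any]:
            return self._run_impl(recorder)

        return yovaluate()


print(Eval().run(recorder=None))  # traced if weave.init() was called earlier
```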