hackweek: Online Evals: Weave #2749

Draft · wants to merge 8 commits into base: master
40 changes: 40 additions & 0 deletions pr.md
@@ -0,0 +1,40 @@
# PR TODOs

- [ ] Decide on the feedback schema that is appropriate for actions. Some concepts seem to overlap (e.g. config ~ self, score ~ action, etc.)
- [ ] Firm up the Action Spec (seems pretty good IMO)
- [ ] The Python API for creating objects is pretty bad, especially when we want to reference other objects; this is not clean right now.
- [ ] Create the concept of a filter action (needs to have an "enabled" attribute; see the sketch after this list)
- [ ] UI Elements
- [ ] Configured Actions
- [ ] List
- [ ] Create
- [ ] Edit
- [ ] Delete (delete objects)
- [ ] View?
- [ ] See Mappings
- [ ] Mappings
- [ ] List
- [ ] Create
- [ ] Edit
- [ ] Delete (delete objects)
- [ ] View?
- [ ] Link to configured action
- [ ] See Actioned Calls (listing of feedback)
- [ ] Filter Action
- [ ] List
- [ ] Create
- [ ] Edit
- [ ] Disable / Pause
- [ ] Delete (delete objects)
- [ ] View?
- [ ] Link to mapping
- [ ] See "live feed" of applicable calls
- [ ] Call Table
- [ ] Action Result Column(s)
- [ ] "Fill" Button (or create filter action - basically a single or live version)
- [ ] Call View
- [ ] Action Results
- [ ] Single Execution Button (would be nice to have smart mapping)
- [ ] OpVersion View
- [ ] View associated mappings
- [ ] Create additional
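
Below is a rough sketch of what the filter-action concept from the TODO list might look like. The `FilterAction` name and all of its fields are assumptions for illustration only; nothing in this PR defines them yet.

```python
# Hypothetical sketch only -- FilterAction is not part of this PR.
# It illustrates the "filter action" TODO above: a configured action
# gated by a query/filter, with an `enabled` flag so it can be paused
# ("Disable / Pause") without being deleted.
from dataclasses import dataclass, field


@dataclass
class FilterAction:
    name: str
    action_ref: str                                   # ref URI of the configured action to run
    mapping_ref: str                                  # ref URI of the op -> action input mapping
    filter_query: dict = field(default_factory=dict)  # which calls qualify
    sample_rate: float = 1.0                          # fraction of matching calls to act on
    enabled: bool = True                              # toggled by "Disable / Pause"
```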
3 changes: 2 additions & 1 deletion tests/conftest.py
@@ -477,6 +477,7 @@ def create_client(request) -> weave_init.InitializedClient:
server: tsi.TraceServerInterface
entity = "shawn"
project = "test-project"
+weave_server_flag = "clickhouse"  # hard-coded override: always use the ClickHouse backend
if weave_server_flag == "sqlite":
sqlite_server = sqlite_trace_server.SqliteTraceServer(
"file::memory:?cache=shared"
Expand All @@ -500,7 +501,7 @@ def create_client(request) -> weave_init.InitializedClient:
)
server = remote_server
elif weave_server_flag == ("prod"):
-inited_client = weave_init.init_weave("dev_testing")
+inited_client = weave_init.init_weave("dev_testing_evals_3")

if inited_client is None:
client = TestOnlyFlushingWeaveClient(
182 changes: 182 additions & 0 deletions tests/trace/demo.ipynb

Large diffs are not rendered by default.

154 changes: 154 additions & 0 deletions tests/trace/test_actions.py
@@ -0,0 +1,154 @@
import os

from openai import OpenAI

import weave
from weave.collection_objects import action_objects
from weave.trace.weave_client import WeaveClient, get_ref
from weave.trace_server import trace_server_interface as tsi
from weave.trace_server.interface.collections import action_collection
from weave.trace_server.interface.feedback_types.action_feedback_type import (
ACTION_FEEDBACK_TYPE_NAME,
)


def test_action_create(client: WeaveClient):
api_key = os.environ.get("OPENAI_API_KEY", "DUMMY_API_KEY")

openai_client = OpenAI(api_key=api_key)

@weave.op
def extract_name(user_input: str) -> str:
response = openai_client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{
"role": "system",
"content": "Extract the name from the user input. If there is no name, return an empty string.",
},
{"role": "user", "content": user_input},
],
temperature=0.0,
max_tokens=64,
top_p=1,
)
return response.choices[0].message.content

res, call = extract_name.call("My name is Tim.")

action = action_objects.ActionWithConfigObject(
name="is_name_extracted",
action=action_collection._BuiltinAction(
name="openai_completion",
),
config={
"model": "gpt-4o-mini",
"system_prompt": "Given the following prompt and response, determine if the name was extracted correctly.",
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "is_name_extracted",
"schema": {
"type": "object",
"properties": {"is_extracted": {"type": "boolean"}},
"required": ["is_extracted"],
"additionalProperties": False,
},
"strict": True,
},
},
},
)

mapping = action_objects.ActionOpMappingObject(
name="extract_name-is_name_extracted",
action=action,
op_name=get_ref(extract_name).name,
op_digest=get_ref(extract_name).digest,
input_mapping={
"prompt": "inputs.user_input",
"response": "output",
},
)
req = tsi.ExecuteBatchActionReq(
project_id=client._project_id(), call_ids=[call.id], mapping=mapping
)

res = client.server.execute_batch_action(req=req)

# NOTE: the mapping is published only after the call and the batch action have run.
weave.publish(mapping)

gotten_call = client.server.calls_query(
req=tsi.CallsQueryReq(
project_id=client._project_id(), call_ids=[call.id], include_feedback=True
)
)
assert len(gotten_call.calls) == 2
target_call = gotten_call.calls[0]

assert target_call.op_name == get_ref(extract_name).uri()
feedbacks = target_call.summary["weave"]["feedback"]
assert len(feedbacks) == 1
feedback = feedbacks[0]
assert feedback["feedback_type"] == ACTION_FEEDBACK_TYPE_NAME
assert feedback["payload"]["name"] == "is_name_extracted"
assert feedback["payload"]["action_mapping_ref"] == get_ref(mapping).uri()
assert feedback["payload"]["results"] == {"is_extracted": True}


# def test_builtin_actions(client: WeaveClient):
# actions = client.server.actions_list()
# assert len(actions) > 0


# def test_action_flow(client: WeaveClient):
# # 1. Bootstrap builtin actions
# # 2. Query Available Actions
# # Run an op
# # 3. Create a 1-off batch action using mapping
# # 4. Create an online trigger
# # Run more ops
# # 5. Query back the feedback results.
# pass


"""
Framing:

1. We support a number of scorer functions drawn from a standard lib like https://docs.ragas.io/en/stable/concepts/metrics
2. Each scorer can have a config that sets its rules (think of this like a closure)
3. When executing a scorer, we need to define a mapping from an op's inputs and outputs to the scorer's specific fields


(Scorers - hard-coded, but versioned nonetheless)
Mapping (mapping from op fields to scorer fields)
Run (single / batch) - not saved, needs config
Online - query/filter, sample rate, scorer, config, mapping, op



Spec:

"""

# Shouldn't actually put these in the user space
# input_schema=actions.JSONSchema(
# schema={
# "type": "object",
# "properties": {"prompt": {"type": "string"}},
# "required": ["prompt"],
# "additionalProperties": False,
# }
# ),
# config_schema=actions.JSONSchema(
# schema={
# "type": "object",
# "properties": {
# "system_prompt": {"type": "string"},
# "response_format": {"type": "object"},
# },
# "required": ["system_prompt"],
# "additionalProperties": False,
# }
# ),
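
If schemas like the commented-out ones above stay server-side, enforcement could look roughly like this. `jsonschema` is a real third-party library; its use here is an assumption about the design, not code from this PR.

```python
# Sketch of server-side config validation (assumed design, not shipped code).
from jsonschema import ValidationError, validate  # pip install jsonschema

CONFIG_SCHEMA = {
    "type": "object",
    "properties": {
        "system_prompt": {"type": "string"},
        "response_format": {"type": "object"},
    },
    "required": ["system_prompt"],
    "additionalProperties": False,
}


def check_config(config: dict) -> None:
    try:
        validate(instance=config, schema=CONFIG_SCHEMA)
    except ValidationError as e:
        raise ValueError(f"invalid action config: {e.message}") from e
```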
13 changes: 13 additions & 0 deletions weave/collection_objects/action_objects.py
@@ -0,0 +1,13 @@
import weave
from weave.trace_server.interface.collections.action_collection import (
ActionOpMapping,
ActionWithConfig,
)


class ActionWithConfigObject(weave.Object, ActionWithConfig):
    """A base action plus its config, publishable as a versioned Weave object."""


class ActionOpMappingObject(weave.Object, ActionOpMapping):
    """Binds a configured action to a specific op version via an input mapping."""
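
For reference, a minimal usage sketch mirroring test_actions.py above; the class and field names come from this PR, while the concrete values are illustrative placeholders.

```python
# Mirrors the usage in tests/trace/test_actions.py; values are placeholders.
import weave
from weave.collection_objects.action_objects import (
    ActionOpMappingObject,
    ActionWithConfigObject,
)
from weave.trace_server.interface.collections import action_collection

action = ActionWithConfigObject(
    name="is_name_extracted",
    action=action_collection._BuiltinAction(name="openai_completion"),
    config={"model": "gpt-4o-mini", "system_prompt": "..."},
)
mapping = ActionOpMappingObject(
    name="extract_name-is_name_extracted",
    action=action,
    op_name="extract_name",  # op the mapping applies to
    op_digest="<digest>",    # specific op version
    input_mapping={"prompt": "inputs.user_input", "response": "output"},
)
weave.publish(mapping)  # versions the mapping (and the referenced action)
```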