feat: add support for grouping inputs #215

Merged · 6 commits · May 8, 2024
8 changes: 8 additions & 0 deletions .changeset/tidy-bulldogs-accept.md
@@ -0,0 +1,8 @@
---
"@empiricalrun/scorer": minor
"@empiricalrun/types": minor
"@empiricalrun/cli": minor
"web": minor
---

feat: add support for merging inputs and add multi-turn chat example
5 changes: 0 additions & 5 deletions apps/web/app/page.tsx
@@ -44,10 +44,6 @@ export default function Page(): JSX.Element {
() => (dataset?.samples || [])?.map((s) => s.id),
[dataset],
);
const datasetInputNames = useMemo(
() => Object.keys(dataset?.samples?.[0]?.inputs || {}),
[dataset],
);
const runColumnHeaders = useMemo(
() => tableHeaders.filter((h) => h.type == "completion"),
[tableHeaders],
@@ -202,7 +198,6 @@ export default function Page(): JSX.Element {
<div className="flex flex-1 min-w-[500px] overflow-hidden">
<SampleCard
sample={inputSample!}
inputTabs={datasetInputNames}
onSampleAdd={(sample) => addDatasetSample(sample)}
onSampleInputUpdate={updateDatasetSampleInput}
onSampleRemove={(sample) => {
18 changes: 5 additions & 13 deletions apps/web/components/json-as-tab.tsx
@@ -49,12 +49,6 @@ export function JsonAsTab({
const { activeTab: remoteActiveTab, onChangeTab: remoteOnChangeTab } =
useSyncedTabs(tabs, storeKey);
const [activeTab, setActiveTab] = useState<string | undefined>();
const activeTabValue = useMemo(() => {
if (activeTab && data) {
return data[activeTab];
}
return undefined;
}, [activeTab, data]);

useEffect(() => {
if (remoteActiveTab && data[remoteActiveTab]) {
@@ -67,6 +61,8 @@
const onChangeTab = useCallback(
(tab: string) => {
setActiveTab(tab);
console.log(JSON.stringify(data));
console.log("setting remote active tab", tab);
remoteOnChangeTab(tab);
},
[remoteOnChangeTab],
@@ -76,7 +72,7 @@
<>
<div className="flex flex-row space-x-2 justify-end">
<>
{activeTabValue && showExpandOption && (
{showExpandOption && (
<Sheet>
<SheetTrigger asChild>
<Button
@@ -90,15 +86,11 @@
</SheetTrigger>
<SheetContent className="w-[700px] sm:w-[540px]">
<SheetHeader>
<SheetTitle>{activeTab}</SheetTitle>
<SheetTitle>Inputs</SheetTitle>
</SheetHeader>
<div className="py-4 h-full">
<CodeViewer
value={
typeof activeTabValue === "string"
? activeTabValue
: JSON.stringify(activeTabValue, null, 2)
}
value={JSON.stringify(data, null, 2)}
readOnly // expand sheet is readonly
scrollable
language="json"
3 changes: 0 additions & 3 deletions apps/web/components/sample-card.tsx
@@ -5,15 +5,13 @@ import { JsonAsTab } from "./json-as-tab";

export default function SampleCard({
sample,
inputTabs,
onSampleRemove,
onSampleAdd,
onSampleInputUpdate,
onClickRunOnAllModels,
hasMissingCompletion,
}: {
sample: DatasetSample;
inputTabs?: string[];
hasMissingCompletion: boolean;
onSampleRemove?: (sample: DatasetSample) => void;
onSampleAdd?: (sample: DatasetSample) => void;
@@ -29,7 +27,6 @@ export default function SampleCard({
<JsonAsTab
storeKey="input"
data={sample?.inputs}
defaultTabs={inputTabs}
showRunButton={hasMissingCompletion}
onSampleAdd={() => onSampleAdd?.(sample)}
onSampleRemove={() => onSampleRemove?.(sample)}
17 changes: 1 addition & 16 deletions apps/web/hooks/useSyncedTab.ts
@@ -39,23 +39,8 @@ export function useSyncedTabs(tabList: string[], tabStoreKey: string) {
);

useEffect(() => {
if ((tabs || []).length < tabList.length) {
// find the missing tab
const missingTabs = tabList.filter(
// @ts-ignore
(tab) => !tabs.includes(tab),
);
setActiveTab(missingTabs[0]);
}
setTabs(tabList);
}, [tabs, setTabs, tabList, activeTab]);

useEffect(() => {
//@ts-ignore
if (!tabs.includes(activeTab || "")) {
setActiveTab(tabs[0]);
}
}, [activeTab, tabs]);
}, [setTabs, tabList]);

return {
tabs,
3 changes: 3 additions & 0 deletions examples/multi-turn-chat/.gitignore
@@ -0,0 +1,3 @@

# Ignore outputs from Empirical
.empiricalrun
29 changes: 29 additions & 0 deletions examples/multi-turn-chat/README.md
@@ -0,0 +1,29 @@
# Evaluating multi-turn chat
This example illustrates how to score outputs for multi-turn chat scenarios.

### Dataset
The dataset is configured in a [Google Sheet](https://docs.google.com/spreadsheets/d/1fZ_3FFj94SiucglQOTrCHQTZVhrWJwWqfSk-vEIp8_I/edit#gid=0).

### Run Configuration
The run is implemented as a Python script: the run configuration points to `chat.py`, which carries out the multi-turn conversation.

### Scorer Configuration
The scoring mechanism is implemented through a Python script named `score.py`.

## Steps to run
To execute the example:
1. Install dependencies:
```
poetry install
```

1. Evaluate multi-turn chat using Empirical:
```
npx @empiricalrun/cli run --python-path `poetry env info -e`
```
>Note: Ensure `OPENAI_API_KEY` is exported before running the above command.
1. Visualize the output:
```
npx @empiricalrun/cli ui
```
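The README references `score.py`, but the scorer script itself is not among the hunks rendered on this page. As a rough, non-authoritative sketch of what a `py-script` scorer for this example could look like: the `evaluate` entry-point name, its `output`/`inputs` arguments, and the returned list-of-scores shape below are assumptions made for illustration, not taken from this PR.

```python
# score.py — illustrative sketch only; the entry-point name, its arguments,
# and the returned score shape are assumptions, not taken from this PR.
from openai import OpenAI


def evaluate(output, inputs):
    # Assumes the scorer receives the dict produced by chat.py's execute():
    # the final reply in "value" and the whole thread in metadata.messages.
    messages = (output.get("metadata") or {}).get("messages", [])
    transcript = "\n".join(f'{m["role"]}: {m["content"]}' for m in messages)

    # The scorer is registered as "llm-evaluation" in empiricalrc.json, so
    # grade the transcript with an LLM rather than a hard-coded heuristic.
    client = OpenAI()
    grading = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": "Reply with PASS or FAIL: does the assistant answer "
                "every user turn in this conversation?\n\n" + transcript,
            }
        ],
    )
    verdict = grading.choices[0].message.content.strip().upper()
    return [{"score": 1 if verdict.startswith("PASS") else 0, "message": verdict}]
```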
27 changes: 27 additions & 0 deletions examples/multi-turn-chat/chat.py
@@ -0,0 +1,27 @@
from openai import AsyncOpenAI


async def execute(inputs, parameters):
    openai = AsyncOpenAI()
    messages = []
    for input in inputs:
        # each grouped input contributes one user turn to the thread
        messages.append({"role": "user", "content": input.get("user_query")})
        chat_completion = await openai.chat.completions.create(
            messages=messages,
            model="gpt-3.5-turbo",
        )
        messages.append(
            {
                "role": chat_completion.choices[0].message.role,
                "content": chat_completion.choices[0].message.content,
            }
        )
    return {
        # setting the last response as the final output of the conversation
        "value": messages[-1].get("content", ""),
        # saving the full thread in metadata for eyeballing and scoring
        "metadata": {"messages": messages},
    }
20 changes: 20 additions & 0 deletions examples/multi-turn-chat/empiricalrc.json
@@ -0,0 +1,20 @@
{
  "$schema": "https://assets.empirical.run/config/schema/latest.json",
  "runs": [
    {
      "type": "py-script",
      "path": "chat.py"
    }
  ],
  "dataset": {
    "path": "https://docs.google.com/spreadsheets/d/1fZ_3FFj94SiucglQOTrCHQTZVhrWJwWqfSk-vEIp8_I/edit#gid=0",
    "group_by": "conv_id"
  },
  "scorers": [
    {
      "name": "llm-evaluation",
      "type": "py-script",
      "path": "score.py"
    }
  ]
}
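The `"group_by": "conv_id"` setting is the behavior this PR adds: dataset rows that share a `conv_id` are merged into a single sample whose `inputs` is an ordered list, which is why `chat.py` loops over `inputs`. Below is a minimal conceptual sketch of that merge in Python; it is not the CLI's actual implementation, and the column names (`conv_id`, `user_query`) are simply the ones used in this example.

```python
from collections import defaultdict


def group_inputs(rows, group_by="conv_id"):
    """Merge flat dataset rows into one sample per conversation.

    Conceptual sketch of the grouping behaviour; the real logic lives in the
    @empiricalrun packages, not in this snippet.
    """
    grouped = defaultdict(list)
    for row in rows:
        grouped[row[group_by]].append(row)
    # each sample's inputs is the ordered list of rows sharing a conv_id
    return [{"id": conv_id, "inputs": turns} for conv_id, turns in grouped.items()]


# Example: two sheet rows with the same conv_id become one two-turn sample.
rows = [
    {"conv_id": "1", "user_query": "What is Empirical?"},
    {"conv_id": "1", "user_query": "How do I run it?"},
]
print(group_inputs(rows))
```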