Skip to content

Commit

Permalink
feat: add support for grouping inputs (#215)
Browse files Browse the repository at this point in the history
  • Loading branch information
saikatmitra91 authored May 8, 2024
1 parent 6f001be commit 740a844
Show file tree
Hide file tree
Showing 16 changed files with 554 additions and 46 deletions.
8 changes: 8 additions & 0 deletions .changeset/tidy-bulldogs-accept.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
"@empiricalrun/scorer": minor
"@empiricalrun/types": minor
"@empiricalrun/cli": minor
"web": minor
---

feat: add support for grouping inputs and add multi-turn chat example
5 changes: 0 additions & 5 deletions apps/web/app/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,6 @@ export default function Page(): JSX.Element {
() => (dataset?.samples || [])?.map((s) => s.id),
[dataset],
);
const datasetInputNames = useMemo(
() => Object.keys(dataset?.samples?.[0]?.inputs || {}),
[dataset],
);
const runColumnHeaders = useMemo(
() => tableHeaders.filter((h) => h.type == "completion"),
[tableHeaders],
Expand Down Expand Up @@ -202,7 +198,6 @@ export default function Page(): JSX.Element {
<div className="flex flex-1 min-w-[500px] overflow-hidden">
<SampleCard
sample={inputSample!}
inputTabs={datasetInputNames}
onSampleAdd={(sample) => addDatasetSample(sample)}
onSampleInputUpdate={updateDatasetSampleInput}
onSampleRemove={(sample) => {
Expand Down
18 changes: 5 additions & 13 deletions apps/web/components/json-as-tab.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,6 @@ export function JsonAsTab({
const { activeTab: remoteActiveTab, onChangeTab: remoteOnChangeTab } =
useSyncedTabs(tabs, storeKey);
const [activeTab, setActiveTab] = useState<string | undefined>();
const activeTabValue = useMemo(() => {
if (activeTab && data) {
return data[activeTab];
}
return undefined;
}, [activeTab, data]);

useEffect(() => {
if (remoteActiveTab && data[remoteActiveTab]) {
Expand All @@ -67,6 +61,8 @@ export function JsonAsTab({
const onChangeTab = useCallback(
(tab: string) => {
setActiveTab(tab);
console.log(JSON.stringify(data));
console.log("setting remote active tab", tab);
remoteOnChangeTab(tab);
},
[remoteOnChangeTab],
Expand All @@ -76,7 +72,7 @@ export function JsonAsTab({
<>
<div className="flex flex-row space-x-2 justify-end">
<>
{activeTabValue && showExpandOption && (
{showExpandOption && (
<Sheet>
<SheetTrigger asChild>
<Button
Expand All @@ -90,15 +86,11 @@ export function JsonAsTab({
</SheetTrigger>
<SheetContent className="w-[700px] sm:w-[540px]">
<SheetHeader>
<SheetTitle>{activeTab}</SheetTitle>
<SheetTitle>Inputs</SheetTitle>
</SheetHeader>
<div className="py-4 h-full">
<CodeViewer
value={
typeof activeTabValue === "string"
? activeTabValue
: JSON.stringify(activeTabValue, null, 2)
}
value={JSON.stringify(data, null, 2)}
readOnly // expand sheet is readonly
scrollable
language="json"
Expand Down
3 changes: 0 additions & 3 deletions apps/web/components/sample-card.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,13 @@ import { JsonAsTab } from "./json-as-tab";

export default function SampleCard({
sample,
inputTabs,
onSampleRemove,
onSampleAdd,
onSampleInputUpdate,
onClickRunOnAllModels,
hasMissingCompletion,
}: {
sample: DatasetSample;
inputTabs?: string[];
hasMissingCompletion: boolean;
onSampleRemove?: (sample: DatasetSample) => void;
onSampleAdd?: (sample: DatasetSample) => void;
Expand All @@ -29,7 +27,6 @@ export default function SampleCard({
<JsonAsTab
storeKey="input"
data={sample?.inputs}
defaultTabs={inputTabs}
showRunButton={hasMissingCompletion}
onSampleAdd={() => onSampleAdd?.(sample)}
onSampleRemove={() => onSampleRemove?.(sample)}
Expand Down
17 changes: 1 addition & 16 deletions apps/web/hooks/useSyncedTab.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,8 @@ export function useSyncedTabs(tabList: string[], tabStoreKey: string) {
);

useEffect(() => {
if ((tabs || []).length < tabList.length) {
// find the missing tab
const missingTabs = tabList.filter(
// @ts-ignore
(tab) => !tabs.includes(tab),
);
setActiveTab(missingTabs[0]);
}
setTabs(tabList);
}, [tabs, setTabs, tabList, activeTab]);

useEffect(() => {
//@ts-ignore
if (!tabs.includes(activeTab || "")) {
setActiveTab(tabs[0]);
}
}, [activeTab, tabs]);
}, [setTabs, tabList]);

return {
tabs,
Expand Down
3 changes: 3 additions & 0 deletions examples/multi-turn-chat/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

# Ignore outputs from Empirical
.empiricalrun
29 changes: 29 additions & 0 deletions examples/multi-turn-chat/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Evaluating multi-turn chat
This example illustrates how to score outputs for multi-turn chat scenarios.

### Dataset
The dataset is configured in a [Google Sheet](https://docs.google.com/spreadsheets/d/1fZ_3FFj94SiucglQOTrCHQTZVhrWJwWqfSk-vEIp8_I/edit#gid=0).

### Run Configuration
The run is implemented as a Python script. The run configuration references `chat.py`,
which implements the multi-turn conversation.

### Scorer Configuration
The scoring mechanism is implemented through a Python script named `score.py`.

## Steps to run
To execute the example:
1. Install dependencies:
```
poetry install
```
1. Evaluate multi-turn chat using Empirical:
```
npx @empiricalrun/cli run --python-path `poetry env info -e`
```
>Note: Ensure `OPENAI_API_KEY` is exported before running the above command.
1. Visualize the output:
```
npx @empiricalrun/cli ui
```
27 changes: 27 additions & 0 deletions examples/multi-turn-chat/chat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from openai import AsyncOpenAI


async def execute(inputs, parameters):
    """Run a multi-turn chat against OpenAI, one turn per input item.

    For each item in ``inputs``, the "user_query" value is appended to the
    running message thread as a user turn, the model is called with the full
    thread, and the assistant reply is appended — so later turns see the
    complete conversation so far.

    Args:
        inputs: iterable of dicts, each expected to carry a "user_query" key
            (one conversation turn per item) — assumes ordering matches the
            intended conversation order.
        parameters: unused; kept for the runner's script interface.

    Returns:
        dict with "value" set to the final assistant response (empty string
        when ``inputs`` is empty) and "metadata" carrying the whole message
        thread for eyeballing and scoring.
    """
    openai = AsyncOpenAI()
    messages = []
    for input in inputs:
        messages.append({"role": "user", "content": input.get("user_query")})
        chat_completion = await openai.chat.completions.create(
            messages=messages,
            model="gpt-3.5-turbo",
        )
        messages.append(
            {
                "role": chat_completion.choices[0].message.role,
                "content": chat_completion.choices[0].message.content,
            }
        )
    return {
        # the last assistant reply is the final output of the conversation;
        # guard against an empty dataset row producing no messages at all
        "value": messages[-1].get("content", "") if messages else "",
        # saving the thread in metadata for eyeball and scoring output
        "metadata": {"messages": messages},
    }
20 changes: 20 additions & 0 deletions examples/multi-turn-chat/empiricalrc.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"$schema": "https://assets.empirical.run/config/schema/latest.json",
"runs": [
{
"type": "py-script",
"path": "chat.py"
}
],
"dataset": {
"path": "https://docs.google.com/spreadsheets/d/1fZ_3FFj94SiucglQOTrCHQTZVhrWJwWqfSk-vEIp8_I/edit#gid=0",
"group_by": "conv_id"
},
"scorers": [
{
"name": "llm-evaluation",
"type": "py-script",
"path": "score.py"
}
]
}
Loading

0 comments on commit 740a844

Please sign in to comment.