feat: add support for grouping inputs #215

Merged · 6 commits · May 8, 2024
8 changes: 8 additions & 0 deletions .changeset/tidy-bulldogs-accept.md
@@ -0,0 +1,8 @@
---
"@empiricalrun/scorer": minor
"@empiricalrun/types": minor
"@empiricalrun/cli": minor
"web": minor
---

feat: add support for merging inputs and add multi-turn chat example
5 changes: 0 additions & 5 deletions apps/web/app/page.tsx
@@ -44,10 +44,6 @@ export default function Page(): JSX.Element {
() => (dataset?.samples || [])?.map((s) => s.id),
[dataset],
);
const datasetInputNames = useMemo(
() => Object.keys(dataset?.samples?.[0]?.inputs || {}),
[dataset],
);
const runColumnHeaders = useMemo(
() => tableHeaders.filter((h) => h.type == "completion"),
[tableHeaders],
@@ -202,7 +198,6 @@ export default function Page(): JSX.Element {
<div className="flex flex-1 min-w-[500px] overflow-hidden">
<SampleCard
sample={inputSample!}
inputTabs={datasetInputNames}
onSampleAdd={(sample) => addDatasetSample(sample)}
onSampleInputUpdate={updateDatasetSampleInput}
onSampleRemove={(sample) => {
18 changes: 5 additions & 13 deletions apps/web/components/json-as-tab.tsx
@@ -49,12 +49,6 @@ export function JsonAsTab({
const { activeTab: remoteActiveTab, onChangeTab: remoteOnChangeTab } =
useSyncedTabs(tabs, storeKey);
const [activeTab, setActiveTab] = useState<string | undefined>();
const activeTabValue = useMemo(() => {
if (activeTab && data) {
return data[activeTab];
}
return undefined;
}, [activeTab, data]);

useEffect(() => {
if (remoteActiveTab && data[remoteActiveTab]) {
@@ -67,6 +61,8 @@
const onChangeTab = useCallback(
(tab: string) => {
setActiveTab(tab);
console.log(JSON.stringify(data));
console.log("setting remote active tab", tab);
remoteOnChangeTab(tab);
},
[remoteOnChangeTab],
@@ -76,7 +72,7 @@
<>
<div className="flex flex-row space-x-2 justify-end">
<>
{activeTabValue && showExpandOption && (
{showExpandOption && (
<Sheet>
<SheetTrigger asChild>
<Button
@@ -90,15 +86,11 @@
</SheetTrigger>
<SheetContent className="w-[700px] sm:w-[540px]">
<SheetHeader>
<SheetTitle>{activeTab}</SheetTitle>
<SheetTitle>Inputs</SheetTitle>
</SheetHeader>
<div className="py-4 h-full">
<CodeViewer
value={
typeof activeTabValue === "string"
? activeTabValue
: JSON.stringify(activeTabValue, null, 2)
}
value={JSON.stringify(data, null, 2)}
readOnly // expand sheet is readonly
scrollable
language="json"
3 changes: 0 additions & 3 deletions apps/web/components/sample-card.tsx
@@ -5,15 +5,13 @@ import { JsonAsTab } from "./json-as-tab";

export default function SampleCard({
sample,
inputTabs,
onSampleRemove,
onSampleAdd,
onSampleInputUpdate,
onClickRunOnAllModels,
hasMissingCompletion,
}: {
sample: DatasetSample;
inputTabs?: string[];
hasMissingCompletion: boolean;
onSampleRemove?: (sample: DatasetSample) => void;
onSampleAdd?: (sample: DatasetSample) => void;
@@ -29,7 +27,6 @@ export default function SampleCard({
<JsonAsTab
storeKey="input"
data={sample?.inputs}
defaultTabs={inputTabs}
showRunButton={hasMissingCompletion}
onSampleAdd={() => onSampleAdd?.(sample)}
onSampleRemove={() => onSampleRemove?.(sample)}
17 changes: 1 addition & 16 deletions apps/web/hooks/useSyncedTab.ts
@@ -39,23 +39,8 @@ export function useSyncedTabs(tabList: string[], tabStoreKey: string) {
);

useEffect(() => {
if ((tabs || []).length < tabList.length) {
// find the missing tab
const missingTabs = tabList.filter(
// @ts-ignore
(tab) => !tabs.includes(tab),
);
setActiveTab(missingTabs[0]);
}
setTabs(tabList);
}, [tabs, setTabs, tabList, activeTab]);

useEffect(() => {
//@ts-ignore
if (!tabs.includes(activeTab || "")) {
setActiveTab(tabs[0]);
}
}, [activeTab, tabs]);
}, [setTabs, tabList]);

return {
tabs,
3 changes: 3 additions & 0 deletions examples/multi-turn-chat/.gitignore
@@ -0,0 +1,3 @@

# Ignore outputs from Empirical
.empiricalrun
29 changes: 29 additions & 0 deletions examples/multi-turn-chat/README.md
@@ -0,0 +1,29 @@
# Evaluating multi-turn chat
This example illustrates how to score outputs for multi-turn chat scenarios.

### Dataset
The dataset is configured in a [Google Sheet](https://docs.google.com/spreadsheets/d/1fZ_3FFj94SiucglQOTrCHQTZVhrWJwWqfSk-vEIp8_I/edit#gid=0).

### Run Configuration
The run is implemented as a Python script: the run configuration points to `chat.py`, which carries out the multi-turn conversation.

### Scorer Configuration
The scoring mechanism is implemented through a Python script named `score.py`.

## Steps to run
To execute the example:
1. Install dependencies:
```
poetry install
```

1. Evaluate multi-turn chat using Empirical:
```
npx @empiricalrun/cli run --python-path `poetry env info -e`
```
>Note: Ensure `OPENAI_API_KEY` is exported before running the above command.
1. Visualize the output:
```
npx @empiricalrun/cli ui
```
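The README references `score.py`, but the scorer script itself is not among the hunks rendered on this page. As a rough, non-authoritative sketch of what a `py-script` scorer for this example could look like: the `evaluate` entry-point name, its `output`/`inputs` arguments, and the returned list-of-scores shape below are assumptions made for illustration, not taken from this PR.

```python
# score.py — illustrative sketch only; the entry-point name, its arguments,
# and the returned score shape are assumptions, not taken from this PR.
from openai import OpenAI


def evaluate(output, inputs):
    # Assumes the scorer receives the dict produced by chat.py's execute():
    # the final reply in "value" and the whole thread in metadata.messages.
    messages = (output.get("metadata") or {}).get("messages", [])
    transcript = "\n".join(f'{m["role"]}: {m["content"]}' for m in messages)

    # The scorer is registered as "llm-evaluation" in empiricalrc.json, so
    # grade the transcript with an LLM rather than a hard-coded heuristic.
    client = OpenAI()
    grading = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": "Reply with PASS or FAIL: does the assistant answer "
                "every user turn in this conversation?\n\n" + transcript,
            }
        ],
    )
    verdict = grading.choices[0].message.content.strip().upper()
    return [{"score": 1 if verdict.startswith("PASS") else 0, "message": verdict}]
```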
27 changes: 27 additions & 0 deletions examples/multi-turn-chat/chat.py
@@ -0,0 +1,27 @@
from openai import AsyncOpenAI


async def execute(inputs, parameters):
    openai = AsyncOpenAI()
    messages = []
    for input in inputs:
        # each grouped input contributes one user turn to the thread
        messages.append({"role": "user", "content": input.get("user_query")})
        chat_completion = await openai.chat.completions.create(
            messages=messages,
            model="gpt-3.5-turbo",
        )
        messages.append(
            {
                "role": chat_completion.choices[0].message.role,
                "content": chat_completion.choices[0].message.content,
            }
        )
    return {
        # setting the last response as the final output of the conversation
        "value": messages[-1].get("content", ""),
        # saving the full thread in metadata for eyeballing and scoring
        "metadata": {"messages": messages},
    }
20 changes: 20 additions & 0 deletions examples/multi-turn-chat/empiricalrc.json
@@ -0,0 +1,20 @@
{
  "$schema": "https://assets.empirical.run/config/schema/latest.json",
  "runs": [
    {
      "type": "py-script",
      "path": "chat.py"
    }
  ],
  "dataset": {
    "path": "https://docs.google.com/spreadsheets/d/1fZ_3FFj94SiucglQOTrCHQTZVhrWJwWqfSk-vEIp8_I/edit#gid=0",
    "group_by": "conv_id"
  },
  "scorers": [
    {
      "name": "llm-evaluation",
      "type": "py-script",
      "path": "score.py"
    }
  ]
}
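The `"group_by": "conv_id"` setting is the behavior this PR adds: dataset rows that share a `conv_id` are merged into a single sample whose `inputs` is an ordered list, which is why `chat.py` loops over `inputs`. Below is a minimal conceptual sketch of that merge in Python; it is not the CLI's actual implementation, and the column names (`conv_id`, `user_query`) are simply the ones used in this example.

```python
from collections import defaultdict


def group_inputs(rows, group_by="conv_id"):
    """Merge flat dataset rows into one sample per conversation.

    Conceptual sketch of the grouping behaviour; the real logic lives in the
    @empiricalrun packages, not in this snippet.
    """
    grouped = defaultdict(list)
    for row in rows:
        grouped[row[group_by]].append(row)
    # each sample's inputs is the ordered list of rows sharing a conv_id
    return [{"id": conv_id, "inputs": turns} for conv_id, turns in grouped.items()]


# Example: two sheet rows with the same conv_id become one two-turn sample.
rows = [
    {"conv_id": "1", "user_query": "What is Empirical?"},
    {"conv_id": "1", "user_query": "How do I run it?"},
]
print(group_inputs(rows))
```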