From c08f51b8466396056d53c0c773fb06bf004a31f9 Mon Sep 17 00:00:00 2001
From: "Rossdan Craig rossdan@lastmileai.dev" <>
Date: Mon, 8 Jan 2024 12:11:51 -0500
Subject: [PATCH] Make all outputs in "streaming" format

Doing this to make it easier for Ryan to parse with a single, unified output
format, regardless of whether the model parser actually supports streaming or
not.

- moved `runPromptStream` -> `runPrompt`, overriding the old definition of
  `runPrompt` by now passing in an `enableStreaming` flag
- still relying on the `isStreamingSupported()` function to set the
  `enableStreaming` param to true or false
- defaulted the `stream` param to `True` on the backend server (however, this
  has no effect for non-streaming models like Dall-E)

## Test Plan

Test that both streaming and non-streaming settings work as expected, and that
a model which does not support streaming (ex: Dall-E) still works.

Now, when the user hasn't explicitly clicked the "stream" setting, it defaults
to streaming. However, if they explicitly toggle it off, streaming turns off.
As a follow-up, we should have the "stream" button auto-enabled to reflect this
(it doesn't have to actually be in the config; we should just have the UI show
it as on by default to match user expectations).

Update: the default value for `stream` is now `true` inside of
`OpenAIChatModelParserPromptSchema`,
`HuggingFaceTextGenerationParserPromptSchema`, and
`AnyscaleEndpointPromptSchema`. I couldn't see it defined in
`PaLMTextParserPromptSchema`.

https://github.com/lastmile-ai/aiconfig/assets/151060367/34214a66-0cea-4774-a917-9476359f172c
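
For reviewers: below is a minimal, self-contained sketch (not part of this
patch) of the contract the client now assumes. Every run, streaming or not, is
consumed through a single `onStream` callback, and the final config also comes
back on the resolved promise, matching the `Promise<{ aiconfig?: AIConfig }>`
return type in the diff. The `output_chunk` name matches the event key in the
diff; the final `aiconfig` event name, the payload shapes, and `fakeRunPrompt`
/ `demo` are illustrative assumptions, not the editor's actual implementation.

```typescript
// Stand-in types; the real client imports AIConfig and
// RunPromptStreamCallback from its own modules.
type AIConfig = { name: string };

type RunPromptStreamEvent =
  | { type: "output_chunk"; data: { content: string } }
  | { type: "aiconfig"; data: AIConfig };

type RunPromptStreamCallback = (event: RunPromptStreamEvent) => void;

// Simulates the unified behavior: even when enableStreaming is false, the
// result is delivered through the same event callback, ending with the
// consolidated aiconfig.
async function fakeRunPrompt(
  promptName: string,
  onStream: RunPromptStreamCallback,
  enableStreaming: boolean = true
): Promise<{ aiconfig?: AIConfig }> {
  if (enableStreaming) {
    onStream({ type: "output_chunk", data: { content: "partial " } });
    onStream({ type: "output_chunk", data: { content: "output" } });
  }
  const finalConfig: AIConfig = { name: `config after ${promptName}` };
  onStream({ type: "aiconfig", data: finalConfig });
  return { aiconfig: finalConfig };
}

// The caller handles streaming and non-streaming runs through one code path.
async function demo() {
  const { aiconfig } = await fakeRunPrompt("prompt_1", (event) => {
    if (event.type === "output_chunk") {
      console.log("chunk:", event.data.content);
    } else {
      console.log("stream finished with config:", event.data.name);
    }
  });
  if (aiconfig) {
    console.log("consolidated config:", aiconfig.name);
  }
}

demo().catch(console.error);
```
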
---
 .../src/aiconfig/editor/client/src/Editor.tsx | 23 +++++-----
 .../client/src/components/EditorContainer.tsx | 35 ++++++++-------
 .../AnyscaleEndpointPromptSchema.ts           |  1 +
 ...ingFaceTextGenerationParserPromptSchema.ts |  1 +
 .../OpenAIChatModelParserPromptSchema.ts      |  1 +
 .../client/src/utils/aiconfigStateUtils.ts    | 14 +++---
 python/src/aiconfig/editor/server/server.py   | 44 +++++--------
 7 files changed, 51 insertions(+), 68 deletions(-)

diff --git a/python/src/aiconfig/editor/client/src/Editor.tsx b/python/src/aiconfig/editor/client/src/Editor.tsx
index 3c552fae6..f4ad2c32f 100644
--- a/python/src/aiconfig/editor/client/src/Editor.tsx
+++ b/python/src/aiconfig/editor/client/src/Editor.tsx
@@ -68,21 +68,22 @@ export default function Editor() {
     });
   }, []);
 
-  const runPrompt = useCallback(async (promptName: string) => {
-    return await ufetch.post(ROUTE_TABLE.RUN_PROMPT, {
-      prompt_name: promptName,
-    });
-  }, []);
-
-  const runPromptStream = useCallback(
-    async (promptName: string, onStream: RunPromptStreamCallback) => {
-      await streamingApi(
+  const runPrompt = useCallback(
+    async (
+      promptName: string,
+      onStream: RunPromptStreamCallback,
+      enableStreaming: boolean = true
+    ) => {
+      // Note: We run the streaming API even for
+      // non-streaming runs so that we can unify
+      // the way we process data on the client
+      return await streamingApi<{ aiconfig?: AIConfig }>(
         {
           url: ROUTE_TABLE.RUN_PROMPT,
           method: "POST",
           body: {
             prompt_name: promptName,
-            stream: true,
+            stream: enableStreaming,
           },
         },
         "output_chunk",
@@ -156,7 +157,6 @@ export default function Editor() {
       getModels,
       getServerStatus,
       runPrompt,
-      runPromptStream,
       save,
       setConfigDescription,
       setConfigName,
@@ -170,7 +170,6 @@
       getModels,
       getServerStatus,
       runPrompt,
-      runPromptStream,
       save,
       setConfigDescription,
       setConfigName,
diff --git a/python/src/aiconfig/editor/client/src/components/EditorContainer.tsx b/python/src/aiconfig/editor/client/src/components/EditorContainer.tsx
index 6b89dd634..b4c671e56 100644
--- a/python/src/aiconfig/editor/client/src/components/EditorContainer.tsx
+++ b/python/src/aiconfig/editor/client/src/components/EditorContainer.tsx
@@ -36,10 +36,10 @@ import {
 import AddPromptButton from "./prompt/AddPromptButton";
 import {
   getDefaultNewPromptName,
+  getModelSettingsStream,
   getPrompt,
-  isStreamingSupported,
 } from "../utils/aiconfigStateUtils";
-import { debounce, uniqueId } from "lodash";
+import { FieldWithPossiblyUndefined, debounce, uniqueId } from "lodash";
 import PromptMenuButton from "./prompt/PromptMenuButton";
 import GlobalParametersContainer from "./GlobalParametersContainer";
 import AIConfigContext from "./AIConfigContext";
@@ -79,11 +79,11 @@ export type AIConfigCallbacks = {
   deletePrompt: (promptName: string) => Promise;
   getModels: (search: string) => Promise;
   getServerStatus?: () => Promise<{ status: "OK" | "ERROR" }>;
-  runPrompt: (promptName: string) => Promise<{ aiconfig: AIConfig }>;
-  runPromptStream: (
+  runPrompt: (
     promptName: string,
-    onStream: RunPromptStreamCallback
-  ) => Promise;
+    onStream: RunPromptStreamCallback,
+    enableStreaming?: boolean
+  ) => Promise<{ aiconfig?: AIConfig }>;
   save: (aiconfig: AIConfig) => Promise;
   setConfigDescription: (description: string) => Promise;
   setConfigName: (name: string) => Promise;
@@ -544,7 +544,6 @@ export default function EditorContainer({
   );
 
   const runPromptCallback = callbacks.runPrompt;
-  const runPromptStreamCallback = callbacks.runPromptStream;
 
   const onRunPrompt = useCallback(
     async (promptId: string) => {
@@ -562,10 +561,14 @@ export default function EditorContainer({
       }
 
       const promptName = statePrompt.name;
-      const isStream = isStreamingSupported(statePrompt, stateRef.current);
+      const enableStreaming: boolean | undefined = getModelSettingsStream(
+        statePrompt!,
+        stateRef.current
+      );
 
-      if (isStream) {
-        await runPromptStreamCallback(promptName, (event) => {
+      const serverConfigResponse = await runPromptCallback(
+        promptName,
+        (event) => {
           if (event.type === "output_chunk") {
             dispatch({
               type: "STREAM_OUTPUT_CHUNK",
@@ -579,15 +582,15 @@ export default function EditorContainer({
               config: event.data,
             });
           }
-        });
-        return;
-      } else {
-        const serverConfigRes = await runPromptCallback(promptName);
+        },
+        enableStreaming
+      );
 
+      if (serverConfigResponse.aiconfig) {
         dispatch({
           type: "CONSOLIDATE_AICONFIG",
           action,
-          config: serverConfigRes.aiconfig,
+          config: serverConfigResponse.aiconfig,
         });
       }
     } catch (err: unknown) {
@@ -609,7 +612,7 @@ export default function EditorContainer({
         });
       }
     },
-    [runPromptCallback, runPromptStreamCallback]
+    [runPromptCallback]
   );
 
   const setNameCallback = callbacks.setConfigName;
diff --git a/python/src/aiconfig/editor/client/src/shared/prompt_schemas/AnyscaleEndpointPromptSchema.ts b/python/src/aiconfig/editor/client/src/shared/prompt_schemas/AnyscaleEndpointPromptSchema.ts
index f3b7e4db8..43a69de30 100644
--- a/python/src/aiconfig/editor/client/src/shared/prompt_schemas/AnyscaleEndpointPromptSchema.ts
+++ b/python/src/aiconfig/editor/client/src/shared/prompt_schemas/AnyscaleEndpointPromptSchema.ts
@@ -40,6 +40,7 @@ export const AnyscaleEndpointPromptSchema: PromptSchema = {
     },
     stream: {
       type: "boolean",
+      default: true,
       description: `If true, send messages token by token. If false, messages send in bulk.`,
     },
     temperature: {
diff --git a/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceTextGenerationParserPromptSchema.ts b/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceTextGenerationParserPromptSchema.ts
index cc605cf9c..4d0f4de6d 100644
--- a/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceTextGenerationParserPromptSchema.ts
+++ b/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceTextGenerationParserPromptSchema.ts
@@ -32,6 +32,7 @@ export const HuggingFaceTextGenerationParserPromptSchema: PromptSchema = {
     },
     stream: {
       type: "boolean",
+      default: true,
     },
     do_sample: {
       type: "boolean",
diff --git a/python/src/aiconfig/editor/client/src/shared/prompt_schemas/OpenAIChatModelParserPromptSchema.ts b/python/src/aiconfig/editor/client/src/shared/prompt_schemas/OpenAIChatModelParserPromptSchema.ts
index 22c888350..20526b161 100644
--- a/python/src/aiconfig/editor/client/src/shared/prompt_schemas/OpenAIChatModelParserPromptSchema.ts
+++ b/python/src/aiconfig/editor/client/src/shared/prompt_schemas/OpenAIChatModelParserPromptSchema.ts
@@ -104,6 +104,7 @@ export const OpenAIChatModelParserPromptSchema: PromptSchema = {
     },
     stream: {
      type: "boolean",
+      default: true,
       description: `Whether to stream back partial progress. If set, tokens will be sent as data-only server-sent
       events as they become available, with the stream terminated by a data: [DONE] message. Example Python code.`,
     },
diff --git a/python/src/aiconfig/editor/client/src/utils/aiconfigStateUtils.ts b/python/src/aiconfig/editor/client/src/utils/aiconfigStateUtils.ts
index dd9a1e934..e90d2e3b1 100644
--- a/python/src/aiconfig/editor/client/src/utils/aiconfigStateUtils.ts
+++ b/python/src/aiconfig/editor/client/src/utils/aiconfigStateUtils.ts
@@ -18,12 +18,10 @@ export function getPrompt(
   return aiconfig.prompts.find((prompt) => prompt._ui.id === id);
 }
 
-// TODO: This is pretty hacky. Streaming is actually part of AIConfig runtime and not necessarily part of model settings,
-// let alone required to be defined by 'stream' boolean... Ideally we should treat everything as stream but this should work for now.
-export function isStreamingSupported(
+export function getModelSettingsStream(
   prompt: ClientPrompt,
   config: ClientAIConfig
-): boolean {
+): boolean | undefined {
   const promptModelSettings =
     prompt.metadata?.model && typeof prompt.metadata.model !== "string"
       ? prompt.metadata.model?.settings
@@ -40,8 +38,12 @@ export function getModelSettingsStream(
   if (promptModelName) {
     const globalModelSettings =
       config.metadata?.models?.[promptModelName]?.settings;
-    return globalModelSettings?.stream === true;
+    if (globalModelSettings?.stream === true) {
+      return true;
+    } else if (promptModelSettings?.stream === false) {
+      return false;
+    }
   }
 
-  return false;
+  return undefined;
 }
diff --git a/python/src/aiconfig/editor/server/server.py b/python/src/aiconfig/editor/server/server.py
index f9cd467f9..e650e9193 100644
--- a/python/src/aiconfig/editor/server/server.py
+++ b/python/src/aiconfig/editor/server/server.py
@@ -202,7 +202,7 @@ def run() -> FlaskResponse:
     # aiconfig.get_parameters(prompt_name) directly inside of run. See:
     # https://github.com/lastmile-ai/aiconfig/issues/671
     params = request_json.get("params", aiconfig.get_parameters(prompt_name))  # type: ignore
-    stream = request_json.get("stream", False)  # TODO: set this automatically to True after client supports stream output
+    stream = request_json.get("stream", True)
 
     # Define stream callback and queue object for streaming results
     output_text_queue = QueueIterator()
@@ -291,40 +291,16 @@ def run_async_config_in_thread():
         yield "]"
 
     try:
-        if stream:
-            LOGGER.info(f"Running `aiconfig.run()` command with request: {request_json}")
-            # Streaming based on
-            # https://stackoverflow.com/questions/73275517/flask-not-streaming-json-response
-            return Response(
-                stream_with_context(generate()),
-                status=200,
-                content_type="application/json",
-            )
-
-        # Run without streaming
-        inference_options = InferenceOptions(stream=stream)
-        def run_async_config_in_thread():
-            asyncio.run(
-                aiconfig.run(
-                    prompt_name=prompt_name,
-                    params=params,
-                    run_with_dependencies=False,
-                    options=inference_options,
-                )
-            )
-            output_text_queue.put(STOP_STREAMING_SIGNAL)
-
-        t = threading.Thread(target=run_async_config_in_thread)
-        t.start()
         LOGGER.info(f"Running `aiconfig.run()` command with request: {request_json}")
-        t.join()
-        return HttpResponseWithAIConfig(
-            #
-            message="Ran prompt",
-            code=200,
-            aiconfig=aiconfig,
-        ).to_flask_format()
-
+        # Note; We run the streaming API even for non-streaming runs so that
+        # we can unify the way we process data on the client
+        # Streaming based on
+        # https://stackoverflow.com/questions/73275517/flask-not-streaming-json-response
+        return Response(
+            stream_with_context(generate()),
+            status=200,
+            content_type="application/json",
+        )
     except Exception as e:
         return HttpResponseWithAIConfig(
             #