Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New: Helicone Realtime API - Phase 2 #3352

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
3 changes: 2 additions & 1 deletion bifrost/lib/clients/jawnTypes/private.ts
Original file line number Diff line number Diff line change
Expand Up @@ -686,6 +686,7 @@ Json: JsonObject;
contentArray?: components["schemas"]["Message"][];
/** Format: double */
idx?: number;
audio_data?: string;
image_url?: string;
timestamp?: string;
tool_call_id?: string;
Expand All @@ -695,7 +696,7 @@ Json: JsonObject;
role?: string;
id?: string;
/** @enum {string} */
_type: "function" | "functionCall" | "image" | "message" | "autoInput" | "contentArray";
_type: "functionCall" | "function" | "image" | "message" | "autoInput" | "contentArray" | "audio";
};
Tool: {
name: string;
Expand Down
3 changes: 2 additions & 1 deletion bifrost/lib/clients/jawnTypes/public.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1132,6 +1132,7 @@ Json: JsonObject;
contentArray?: components["schemas"]["Message"][];
/** Format: double */
idx?: number;
audio_data?: string;
image_url?: string;
timestamp?: string;
tool_call_id?: string;
Expand All @@ -1141,7 +1142,7 @@ Json: JsonObject;
role?: string;
id?: string;
/** @enum {string} */
_type: "function" | "functionCall" | "image" | "message" | "autoInput" | "contentArray";
_type: "functionCall" | "function" | "image" | "message" | "autoInput" | "contentArray" | "audio";
};
Tool: {
name: string;
Expand Down
8 changes: 6 additions & 2 deletions docs/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -2652,6 +2652,9 @@
"type": "number",
"format": "double"
},
"audio_data": {
"type": "string"
},
"image_url": {
"type": "string"
},
Expand Down Expand Up @@ -2682,12 +2685,13 @@
"_type": {
"type": "string",
"enum": [
"function",
"functionCall",
"function",
"image",
"message",
"autoInput",
"contentArray"
"contentArray",
"audio"
]
}
},
Expand Down
216 changes: 209 additions & 7 deletions examples/qawolf-ws/index.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,20 @@
import { config } from "dotenv";
import Microphone from "node-microphone";
import * as readline from "readline";
import { inspect } from "util";
import { resample } from "wave-resampler";
import WebSocket from "ws";

/**
* OpenAI Realtime API Audio Format Requirements:
* - Format: PCM with 16-bit integer samples (Int16Array)
* - Mono audio (single channel)
* - Sample rate: 24000 Hz (24 kHz)
*
* Note: This example uses node-microphone which only supports 8000, 16000, or 44100 Hz.
* We use wave-resampler to convert from 44.1kHz to 24kHz.
*/

config({ path: ".env" });

const url =
Expand All @@ -11,20 +24,31 @@ const ws = new WebSocket(url, {
Authorization: "Bearer " + process.env.OPENAI_API_KEY,
"OpenAI-Beta": "realtime=v1",
"Helicone-Auth": "Bearer " + process.env.HELICONE_API_KEY,
"Helicone-Session-Id": "session_123",
"Helicone-Session-Id": `session_${Date.now()}`,
"Helicone-User-Id": "qawolf",
},
});

const sessionUpdate = {
event_id: "event_123",
type: "session.update",
session: {
modalities: ["text"],
modalities: ["text", "audio"],
instructions:
"You are a highly capable AI assistant. Your responses should be:\n\n- **Helpful and Direct**: Provide clear, actionable information without unnecessary caveats or hedging\n\n- **Accurate and Thorough**: Break down complex topics step-by-step, cite sources when relevant, and acknowledge uncertainty when appropriate\n\n- **Adaptable**: Match your communication style to the user's needs - technical for technical questions, simple for basic queries\n\n- **Ethical**: Do not assist with harmful or illegal activities. If a request could be interpreted as either harmful or benign, assume the benign interpretation and seek clarification\n\n- **Creative and Analytical**: Use a systematic approach for technical problems while being imaginative for creative tasks\n\n- **Natural in Conversation**: Engage authentically without being overly formal or repetitive. Ask relevant follow-up questions when needed\n\nGuidelines for specific tasks:\n\n1. For coding: Provide complete, working solutions with comments explaining key concepts\n2. For analysis: Break down problems step-by-step, showing your reasoning\n3. For writing: Adapt tone and style to match the requested format\n4. For explanations: Use clear examples and analogies\n5. For factual queries: Cite sources when possible and indicate any uncertainty\n\nFormatting preferences:\n- Use markdown for code blocks and text formatting\n- Present lists and steps clearly with proper spacing\n- Structure long responses with appropriate headers and sections\n\nSafety approach:\n- If a request seems harmful, seek clarification\n- If a request could have both harmful and benign interpretations, assume the benign one\n- Provide factual information about sensitive topics while avoiding promotion of harmful activities\n\nKnowledge limits:\n- Acknowledge when information might be outdated\n- Be clear about uncertainty rather than making assumptions\n- Defer to authoritative sources on critical matters",
voice: "sage",
input_audio_format: "pcm16",
output_audio_format: "pcm16",
input_audio_transcription: {
model: "whisper-1",
},
turn_detection: {
type: "server_vad",
threshold: 0.5,
prefix_padding_ms: 300,
silence_duration_ms: 500,
create_response: true,
},
tools: [
{
type: "function",
Expand All @@ -51,6 +75,118 @@ const rl = readline.createInterface({
output: process.stdout,
});

// Microphone setup
let mic: any = null;
let micStream: any = null;
let isRecording = false;

// Initialize microphone
function initMicrophone() {
  // node-microphone supports only 8000/16000/44100 Hz capture rates; we grab
  // audio at the highest rate and downsample to the 24kHz PCM16 mono that the
  // Realtime API expects (the resampling happens in startRecording).
  const captureOptions = {
    rate: 44100,
    channels: 1,
    bitwidth: 16,
  };
  try {
    mic = new Microphone(captureOptions);
    console.log("Microphone initialized successfully with 44.1kHz sample rate");
    console.log("Audio will be resampled to 24kHz as required by OpenAI");
    return true;
  } catch (error) {
    console.error("Failed to initialize microphone:", error);
    return false;
  }
}

// Start recording from microphone and stream the captured audio to the
// Realtime API as base64-encoded 24kHz PCM16 chunks.
// Returns true when the capture stream was started, false on any failure.
function startRecording() {
  if (!mic) {
    if (!initMicrophone()) {
      console.log("Cannot start recording - microphone initialization failed");
      return false;
    }
  }

  try {
    console.log("Starting microphone recording...");
    micStream = mic.startRecording();

    micStream.on("data", (data: Buffer) => {
      if (ws.readyState === WebSocket.OPEN) {
        // Reinterpret the raw capture bytes as 16-bit signed samples.
        const int16Array = new Int16Array(
          data.buffer,
          data.byteOffset,
          data.byteLength / 2
        );

        if (process.env.DEBUG) {
          console.log(
            `Original audio: ${data.byteLength} bytes, ${int16Array.length} samples at 44.1kHz`
          );
        }

        // Resample from the 44.1kHz capture rate to the 24kHz the API requires.
        const resampledData = resample(int16Array, 44100, 24000);

        // The resampler yields floating-point samples that can slightly
        // overshoot the int16 range (filter ringing on loud input). Round and
        // clamp explicitly: a plain `new Int16Array(resampledData)` would
        // truncate toward zero and wrap on overflow, producing audible
        // distortion.
        const resampledInt16 = new Int16Array(resampledData.length);
        for (let i = 0; i < resampledData.length; i++) {
          const sample = Math.round(resampledData[i]);
          resampledInt16[i] = Math.max(-32768, Math.min(32767, sample));
        }
        const resampledBuffer = Buffer.from(
          resampledInt16.buffer,
          resampledInt16.byteOffset,
          resampledInt16.byteLength
        );

        if (process.env.DEBUG) {
          console.log(
            `Resampled audio: ${resampledBuffer.byteLength} bytes, ${resampledInt16.length} samples at 24kHz`
          );
        }

        // Stream this chunk to the server; the buffer is committed later by
        // stopRecording (or automatically by server-side VAD).
        ws.send(
          JSON.stringify({
            type: "input_audio_buffer.append",
            audio: resampledBuffer.toString("base64"),
          })
        );
      }
    });

    micStream.on("error", (error: Error) => {
      console.error("Error from microphone stream:", error);
    });

    isRecording = true;
    return true;
  } catch (error) {
    console.error("Failed to start recording:", error);
    return false;
  }
}

// Stop recording from microphone
function stopRecording() {
  // Nothing to do unless a capture session is actually active.
  if (!mic || !isRecording) {
    return false;
  }

  console.log("Stopping microphone recording...");
  mic.stopRecording();
  isRecording = false;

  // Signal the end of the utterance by committing the buffered audio —
  // the Realtime API uses a commit event rather than an "audio_end" message.
  if (ws.readyState === WebSocket.OPEN) {
    ws.send(JSON.stringify({ type: "input_audio_buffer.commit" }));
  }

  return true;
}

ws.on("open", function open() {
console.log("Connected to server.");

Expand All @@ -59,6 +195,9 @@ ws.on("open", function open() {
ws.send(JSON.stringify(sessionUpdate));

console.log("Enter your message (or 'quit' to exit):");
console.log(
"Commands: 'mic' to toggle microphone, 'update' to send session update"
);
startCliLoop();
});

Expand All @@ -70,6 +209,29 @@ ws.on("message", function incoming(message: WebSocket.RawData) {
inspect(response, { colors: true, depth: null })
);

// Handle specific event types
switch (response.type) {
case "input_audio_buffer.speech_started":
console.log("Speech detected! Speaking...");
break;

case "input_audio_buffer.speech_stopped":
console.log("Speech ended. Processing...");
break;

case "input_audio_buffer.committed":
console.log("Audio buffer committed. Item ID:", response.item_id);
break;

case "conversation.item.input_audio_transcription.completed":
console.log("Transcription completed:", response.transcript);
break;

case "error":
console.error("Error from server:", response.error.message);
break;
}

// Handle function calls
if (response.type === "response.done" && response.response.output) {
const functionCalls = response.response.output.filter(
Expand Down Expand Up @@ -130,6 +292,9 @@ function startCliLoop() {
rl.on("line", (input: string) => {
// If input is "quit"
if (input.toLowerCase() === "quit") {
if (isRecording) {
stopRecording();
}
console.log("Closing connection...");
ws.close();
rl.close();
Expand All @@ -143,17 +308,51 @@ function startCliLoop() {
return;
}

// Otherwise, send the message as a normal response
// If input is "mic"
if (input.toLowerCase() === "mic") {
if (isRecording) {
if (stopRecording()) {
console.log("Microphone recording stopped.");
} else {
console.log("Failed to stop microphone recording.");
}
} else {
if (startRecording()) {
console.log("Microphone recording started. Speak now...");
console.log("Type 'mic' again to stop recording.");
} else {
console.log("Failed to start microphone recording.");
}
}
return;
}

// Otherwise, send the message as text
try {
// Create a text message item
ws.send(
JSON.stringify({
type: "response.create",
response: {
modalities: ["text", "audio"],
instructions: input,
type: "conversation.item.create",
item: {
type: "message",
role: "user",
content: [
{
type: "input_text",
text: input,
},
],
},
})
);

// Then create a response
ws.send(
JSON.stringify({
type: "response.create",
response: {},
})
);
} catch (error) {
console.error("Error sending message:", error);
}
Expand All @@ -163,6 +362,9 @@ function startCliLoop() {
// Handle cleanup
process.on("SIGINT", () => {
console.log("\nClosing connection...");
if (isRecording) {
stopRecording();
}
ws.close();
rl.close();
process.exit(0);
Expand Down
Loading
Loading