llms: Add support for using the whisper model to transcribe audio #696

Open · wants to merge 12 commits into main
14 changes: 13 additions & 1 deletion llms/openai/internal/openaiclient/openaiclient.go
@@ -36,6 +36,7 @@ type Client struct {
// required when APIType is APITypeAzure or APITypeAzureAD
apiVersion string
embeddingsModel string
language string
}

// Option is an option for the OpenAI client.
@@ -48,7 +49,7 @@ type Doer interface {

// New returns a new OpenAI client.
func New(token string, model string, baseURL string, organization string,
apiType APIType, apiVersion string, httpClient Doer, embeddingsModel string,
apiType APIType, apiVersion string, httpClient Doer, embeddingsModel string, language string,
opts ...Option,
) (*Client, error) {
c := &Client{
@@ -60,6 +61,7 @@ func New(token string, model string, baseURL string, organization string,
apiType: apiType,
apiVersion: apiVersion,
httpClient: httpClient,
language: language,
}

for _, opt := range opts {
@@ -144,6 +146,16 @@ func (c *Client) CreateChat(ctx context.Context, r *ChatRequest) (*ChatResponse,
return resp, nil
}

// Transcription transcribes the audio file at the given path using the configured model.
func (c *Client) Transcription(ctx context.Context, audioFilePath string, temperature float64) ([]byte, error) {
res, err := c.uploadAudioAndGetTranscription(ctx, audioFilePath, c.language, temperature)
if err != nil {
return nil, err
}

return res, nil
}

func IsAzure(apiType APIType) bool {
return apiType == APITypeAzure || apiType == APITypeAzureAD
}
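A minimal sketch of how the extended constructor and the new Transcription method fit together, assuming direct access to the internal openaiclient package (normally only importable from within llms/openai); the argument order follows the New signature in this diff:

```go
package main

import (
	"context"
	"fmt"
	"log"
	"net/http"
	"os"

	"github.com/tmc/langchaingo/llms/openai/internal/openaiclient"
)

func main() {
	// New now takes a language argument after embeddingsModel (added in this PR).
	cli, err := openaiclient.New(
		os.Getenv("OPENAI_API_KEY"), // token
		"whisper-1",                 // model
		"",                          // baseURL (the transcription endpoint is hard-coded in wisper.go)
		"",                          // organization
		openaiclient.APITypeOpenAI,  // apiType
		"",                          // apiVersion (Azure only)
		http.DefaultClient,          // httpClient
		"",                          // embeddingsModel
		"en",                        // language used for transcription
	)
	if err != nil {
		log.Fatal(err)
	}

	// Transcription uploads the file as multipart/form-data and returns the transcribed text.
	text, err := cli.Transcription(context.Background(), "./sample.mp3", 0)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(text))
}
```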
72 changes: 72 additions & 0 deletions llms/openai/internal/openaiclient/wisper.go
@@ -0,0 +1,72 @@
package openaiclient

import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/http"
"os"
"path/filepath"
)

type TranscribeAudioResponse struct {
Text string `json:"text"`
}

func (c *Client) uploadAudioAndGetTranscription(ctx context.Context, audioFilePath, language string, temperature float64) ([]byte, error) {
payload := &bytes.Buffer{}
writer := multipart.NewWriter(payload)
file, errFile1 := os.Open(audioFilePath)

if errFile1 != nil {
return nil, errFile1
}

defer file.Close()

part1, errFile1 := writer.CreateFormFile("file", filepath.Base(audioFilePath))
if errFile1 != nil {
return nil, errFile1
}
_, errFile1 = io.Copy(part1, file)
if errFile1 != nil {
return nil, errFile1
}

_ = writer.WriteField("model", c.Model)
_ = writer.WriteField("response_format", "json")
_ = writer.WriteField("temperature", fmt.Sprintf("%f", temperature))
_ = writer.WriteField("language", language)
err := writer.Close()
if err != nil {
return nil, err
}

client := &http.Client{}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, "https://api.openai.com/v1/audio/transcriptions", payload)
if err != nil {
return nil, err
}
c.setHeaders(req)

req.Header.Set("Content-Type", writer.FormDataContentType())
res, err := client.Do(req)
if err != nil {
return nil, err
}
defer res.Body.Close()

body, err := io.ReadAll(res.Body)
if err != nil {
return nil, err
}
var transcriptionResponse TranscribeAudioResponse
err = json.Unmarshal(body, &transcriptionResponse)
if err != nil {
return nil, err
}
return []byte(transcriptionResponse.Text), nil
}
3 changes: 2 additions & 1 deletion llms/openai/llm.go
@@ -25,6 +25,7 @@ func newClient(opts ...Option) (*options, *openaiclient.Client, error) {
organization: os.Getenv(organizationEnvVarName),
apiType: APIType(openaiclient.APITypeOpenAI),
httpClient: http.DefaultClient,
language: "en",
}

for _, opt := range opts {
@@ -44,7 +45,7 @@ func newClient(opts ...Option) (*options, *openaiclient.Client, error) {
}

cli, err := openaiclient.New(options.token, options.model, options.baseURL, options.organization,
openaiclient.APIType(options.apiType), options.apiVersion, options.httpClient, options.embeddingModel)
openaiclient.APIType(options.apiType), options.apiVersion, options.httpClient, options.embeddingModel, options.language)
return options, cli, err
}

16 changes: 15 additions & 1 deletion llms/openai/multicontent_test.go
@@ -13,7 +13,7 @@ import (
"github.com/tmc/langchaingo/schema"
)

func newTestClient(t *testing.T, opts ...Option) llms.Model {
func newTestClient(t *testing.T, opts ...Option) *LLM {
t.Helper()
if openaiKey := os.Getenv("OPENAI_API_KEY"); openaiKey == "" {
t.Skip("OPENAI_API_KEY not set")
@@ -162,6 +162,20 @@ func TestFunctionCall(t *testing.T) {
assert.NotNil(t, c1.FuncCall)
}

func TestTranscription(t *testing.T) {
t.Parallel()
llm := newTestClient(t, WithModel("whisper-1"))

audioFilePath := "./sample.mp3"
_, err := os.Stat(audioFilePath)
require.NoError(t, err)

rsp, err := llm.TranscribeAudio(context.Background(), audioFilePath)
Collaborator:
Does it make sense to think of "transcribe audio" in the context of LLMs? AFAIU Whisper is a distinct model from LLMs like the GPT family.

Is this intended as a one-off method only for OpenAI, or as some general audio-transcription interface?

Contributor (author):
Only for OpenAI for now, but I think it is worth including in the general interface, because other models can do this too; at the moment, though, only OpenAI is implemented.

Contributor:
I haven't tried it yet, but this seems interesting as a locally running alternative: https://github.com/JigsawStack/insanely-fast-whisper-api

It would be cool if we could support something like that, so you can combine it with Ollama to build local-only tools.

(So far I've been using https://github.com/Purfview/whisper-standalone-win locally, which is a single-binary wrapper around https://github.com/SYSTRAN/faster-whisper.)

Contributor (author):
We could implement it for other LLMs; the problem is that so far I have only found the standalone version for Windows, but I am looking for other alternatives.

require.NoError(t, err)

assert.NotEmpty(t, rsp)
}

func showResponse(rsp any) string { //nolint:golint,unused
b, err := json.MarshalIndent(rsp, "", " ")
if err != nil {
9 changes: 9 additions & 0 deletions llms/openai/openaillm.go
@@ -150,3 +150,12 @@ func (o *LLM) CreateEmbedding(ctx context.Context, inputTexts []string) ([][]float32, error) {
}
return embeddings, nil
}

// TranscribeAudio transcribes the audio file at the given path using the Whisper model.
func (o *LLM) TranscribeAudio(ctx context.Context, audioFilePath string, options ...llms.CallOption) ([]byte, error) {
opts := llms.CallOptions{}
for _, opt := range options {
opt(&opts)
}
return o.client.Transcription(ctx, audioFilePath, opts.Temperature)
}
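At the public API level, a hedged end-to-end sketch combining the existing WithModel option with the new WithLanguage option and TranscribeAudio method added in this change; the ./sample.mp3 path mirrors the sample file added in this PR:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/tmc/langchaingo/llms"
	"github.com/tmc/langchaingo/llms/openai"
)

func main() {
	// WithLanguage is introduced by this PR; newClient defaults it to "en".
	llm, err := openai.New(
		openai.WithModel("whisper-1"),
		openai.WithLanguage("en"),
	)
	if err != nil {
		log.Fatal(err)
	}

	// TranscribeAudio accepts the usual call options; only Temperature is
	// forwarded to the transcription request in this implementation.
	text, err := llm.TranscribeAudio(context.Background(), "./sample.mp3",
		llms.WithTemperature(0))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(text))
}
```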
9 changes: 9 additions & 0 deletions llms/openai/openaillm_option.go
@@ -40,6 +40,8 @@ type options struct {
embeddingModel string

callbackHandler callbacks.Handler
// language is required when using the Whisper model
language string
}

// Option is a functional option for the OpenAI client.
@@ -128,3 +130,10 @@ func WithResponseFormat(responseFormat ResponseFormat) Option {
opts.responseFormat = responseFormat
}
}

// WithLanguage allows setting a custom language.
func WithLanguage(language string) Option {
return func(opts *options) {
opts.language = language
}
}
Binary file added llms/openai/sample.mp3
Binary file not shown.