llms: Add support for using the whisper model to transcribe audio #696

Open · wants to merge 12 commits into main
14 changes: 13 additions & 1 deletion llms/openai/internal/openaiclient/openaiclient.go
@@ -36,6 +36,7 @@ type Client struct {
// required when APIType is APITypeAzure or APITypeAzureAD
apiVersion string
embeddingsModel string
language string
}

// Option is an option for the OpenAI client.
@@ -48,7 +49,7 @@ type Doer interface {

// New returns a new OpenAI client.
func New(token string, model string, baseURL string, organization string,
apiType APIType, apiVersion string, httpClient Doer, embeddingsModel string,
apiType APIType, apiVersion string, httpClient Doer, embeddingsModel string, language string,
opts ...Option,
) (*Client, error) {
c := &Client{
@@ -60,6 +61,7 @@ func New(token string, model string, baseURL string, organization string,
apiType: apiType,
apiVersion: apiVersion,
httpClient: httpClient,
language: language,
}

for _, opt := range opts {
@@ -144,6 +146,16 @@ func (c *Client) CreateChat(ctx context.Context, r *ChatRequest) (*ChatResponse,
return resp, nil
}

// Transcription transcribes the audio file at the given path using the configured model.
func (c *Client) Transcription(ctx context.Context, audioFilePath string, temperature float64) ([]byte, error) {
res, err := c.uploadAudioAndGetTranscription(ctx, audioFilePath, c.language, temperature)
if err != nil {
return nil, err
}

return res, nil
}

func IsAzure(apiType APIType) bool {
return apiType == APITypeAzure || apiType == APITypeAzureAD
}
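A minimal sketch of how the extended constructor and the new Transcription method fit together, assuming direct access to the internal openaiclient package (normally only importable from within llms/openai); the argument order follows the New signature in this diff:

```go
package main

import (
	"context"
	"fmt"
	"log"
	"net/http"
	"os"

	"github.com/tmc/langchaingo/llms/openai/internal/openaiclient"
)

func main() {
	// New now takes a language argument after embeddingsModel (added in this PR).
	cli, err := openaiclient.New(
		os.Getenv("OPENAI_API_KEY"), // token
		"whisper-1",                 // model
		"",                          // baseURL (the transcription endpoint is hard-coded in wisper.go)
		"",                          // organization
		openaiclient.APITypeOpenAI,  // apiType
		"",                          // apiVersion (Azure only)
		http.DefaultClient,          // httpClient
		"",                          // embeddingsModel
		"en",                        // language used for transcription
	)
	if err != nil {
		log.Fatal(err)
	}

	// Transcription uploads the file as multipart/form-data and returns the transcribed text.
	text, err := cli.Transcription(context.Background(), "./sample.mp3", 0)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(text))
}
```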
72 changes: 72 additions & 0 deletions llms/openai/internal/openaiclient/wisper.go
@@ -0,0 +1,72 @@
package openaiclient

import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/http"
"os"
"path/filepath"
)

type TranscribeAudioResponse struct {
Text string `json:"text"`
}

func (c *Client) uploadAudioAndGetTranscription(ctx context.Context, audioFilePath, language string, temperature float64) ([]byte, error) {
payload := &bytes.Buffer{}
writer := multipart.NewWriter(payload)
file, errFile1 := os.Open(audioFilePath)

if errFile1 != nil {
return nil, errFile1
}

defer file.Close()

part1, errFile1 := writer.CreateFormFile("file", filepath.Base(audioFilePath))
if errFile1 != nil {
return nil, errFile1
}
_, errFile1 = io.Copy(part1, file)
if errFile1 != nil {
return nil, errFile1
}

_ = writer.WriteField("model", c.Model)
_ = writer.WriteField("response_format", "json")
_ = writer.WriteField("temperature", fmt.Sprintf("%f", temperature))
_ = writer.WriteField("language", language)
err := writer.Close()
if err != nil {
return nil, err
}

client := &http.Client{}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, "https://api.openai.com/v1/audio/transcriptions", payload)
if err != nil {
return nil, err
}
c.setHeaders(req)

req.Header.Set("Content-Type", writer.FormDataContentType())
res, err := client.Do(req)
if err != nil {
return nil, err
}
defer res.Body.Close()

body, err := io.ReadAll(res.Body)
if err != nil {
return nil, err
}
var transcriptionResponse TranscribeAudioResponse
err = json.Unmarshal(body, &transcriptionResponse)
if err != nil {
return nil, err
}
return []byte(transcriptionResponse.Text), nil
}
3 changes: 2 additions & 1 deletion llms/openai/llm.go
@@ -25,6 +25,7 @@ func newClient(opts ...Option) (*options, *openaiclient.Client, error) {
organization: os.Getenv(organizationEnvVarName),
apiType: APIType(openaiclient.APITypeOpenAI),
httpClient: http.DefaultClient,
language: "en",
}

for _, opt := range opts {
@@ -44,7 +45,7 @@ func newClient(opts ...Option) (*options, *openaiclient.Client, error) {
}

cli, err := openaiclient.New(options.token, options.model, options.baseURL, options.organization,
openaiclient.APIType(options.apiType), options.apiVersion, options.httpClient, options.embeddingModel)
openaiclient.APIType(options.apiType), options.apiVersion, options.httpClient, options.embeddingModel, options.language)
return options, cli, err
}

16 changes: 15 additions & 1 deletion llms/openai/multicontent_test.go
@@ -13,7 +13,7 @@ import (
"github.com/tmc/langchaingo/schema"
)

func newTestClient(t *testing.T, opts ...Option) llms.Model {
func newTestClient(t *testing.T, opts ...Option) *LLM {
t.Helper()
if openaiKey := os.Getenv("OPENAI_API_KEY"); openaiKey == "" {
t.Skip("OPENAI_API_KEY not set")
@@ -162,6 +162,20 @@ func TestFunctionCall(t *testing.T) {
assert.NotNil(t, c1.FuncCall)
}

func TestTranscription(t *testing.T) {
t.Parallel()
llm := newTestClient(t, WithModel("whisper-1"))

audioFilePath := "./sample.mp3"
_, err := os.Stat(audioFilePath)
require.NoError(t, err)

rsp, err := llm.TranscribeAudio(context.Background(), audioFilePath)
Collaborator:
Does it make sense to think of "transcribe audio" in the context of LLMs? AFAIU Whisper is a distinct model from LLMs like the GPT family.

Is this intended as a one-off method only for OpenAI, or as some general audio-transcription interface?

Contributor (author):
Only for OpenAI for now, but I think it is worth including in the general interface, because other models can do this too; at the moment, though, only OpenAI is implemented.

Contributor:
I haven't tried it yet, but this seems interesting as a locally running alternative: https://github.com/JigsawStack/insanely-fast-whisper-api

It would be cool if we could support something like that, so you can combine it with Ollama to build local-only tools.

(So far I've been using https://github.com/Purfview/whisper-standalone-win locally, which is a single-binary wrapper around https://github.com/SYSTRAN/faster-whisper.)

Contributor (author):
We could implement it for other LLMs; the problem is that so far I have only found the standalone version for Windows, but I am looking for other alternatives.

require.NoError(t, err)

assert.NotEmpty(t, rsp)
}

func showResponse(rsp any) string { //nolint:golint,unused
b, err := json.MarshalIndent(rsp, "", " ")
if err != nil {
9 changes: 9 additions & 0 deletions llms/openai/openaillm.go
@@ -150,3 +150,12 @@ func (o *LLM) CreateEmbedding(ctx context.Context, inputTexts []string) ([][]float32, error) {
}
return embeddings, nil
}

// TranscribeAudio transcribes the audio file at the given path using the Whisper model.
func (o *LLM) TranscribeAudio(ctx context.Context, audioFilePath string, options ...llms.CallOption) ([]byte, error) {
opts := llms.CallOptions{}
for _, opt := range options {
opt(&opts)
}
return o.client.Transcription(ctx, audioFilePath, opts.Temperature)
}
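At the public API level, a hedged end-to-end sketch combining the existing WithModel option with the new WithLanguage option and TranscribeAudio method added in this change; the ./sample.mp3 path mirrors the sample file added in this PR:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/tmc/langchaingo/llms"
	"github.com/tmc/langchaingo/llms/openai"
)

func main() {
	// WithLanguage is introduced by this PR; newClient defaults it to "en".
	llm, err := openai.New(
		openai.WithModel("whisper-1"),
		openai.WithLanguage("en"),
	)
	if err != nil {
		log.Fatal(err)
	}

	// TranscribeAudio accepts the usual call options; only Temperature is
	// forwarded to the transcription request in this implementation.
	text, err := llm.TranscribeAudio(context.Background(), "./sample.mp3",
		llms.WithTemperature(0))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(text))
}
```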
9 changes: 9 additions & 0 deletions llms/openai/openaillm_option.go
@@ -40,6 +40,8 @@ type options struct {
embeddingModel string

callbackHandler callbacks.Handler
// language is required when using the Whisper model
language string
}

// Option is a functional option for the OpenAI client.
@@ -128,3 +130,10 @@ func WithResponseFormat(responseFormat ResponseFormat) Option {
opts.responseFormat = responseFormat
}
}

// WithLanguage allows setting a custom language.
func WithLanguage(language string) Option {
return func(opts *options) {
opts.language = language
}
}
Binary file added llms/openai/sample.mp3
Binary file not shown.