TheoKanning · TheoKanning · Jun 19, 2023 · Jun 9, 2023 · Jun 9, 2023 · Jun 19, 2023
diff --git a/api/src/main/java/com/theokanning/openai/audio/CreateTranscriptionRequest.java b/api/src/main/java/com/theokanning/openai/audio/CreateTranscriptionRequest.java
@@ -0,0 +1,46 @@
+package com.theokanning.openai.audio;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+import lombok.*;
+
+/**
+ * A request for OpenAi to create transcription based on an audio file
+ * All fields except model are optional
+ *
+ * https://platform.openai.com/docs/api-reference/audio/create
+ */
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+@Data
+public class CreateTranscriptionRequest {
+
+    /**
+     * The name of the model to use.
+     */
+    @NonNull
+    String model;
+
+    /**
+     * An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.
+     */
+    String prompt;
+
+    /**
+     * The format of the transcript output, in one of these options: json or verbose_json
+     */
+    @JsonProperty("response_format")
+    String responseFormat;
+
+    /**
+     * The sampling temperature, between 0 and 1.
+     * Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+     * If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.
+     */
+    Double temperature;
+
+    /**
+     * The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.
+     */
+    String language;
+}
diff --git a/api/src/main/java/com/theokanning/openai/audio/CreateTranslationRequest.java b/api/src/main/java/com/theokanning/openai/audio/CreateTranslationRequest.java
@@ -0,0 +1,41 @@
+package com.theokanning.openai.audio;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+import lombok.*;
+
+/**
+ * A request for OpenAi to create English translation based on an audio file
+ * All fields except model are optional
+ *
+ * https://platform.openai.com/docs/api-reference/audio/create
+ */
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+@Data
+public class CreateTranslationRequest {
+
+    /**
+     * The name of the model to use.
+     */
+    @NonNull
+    String model;
+
+    /**
+     * An optional text to guide the model's style or continue a previous audio segment. The prompt should be in English.
+     */
+    String prompt;
+
+    /**
+     * The format of the translated output, in one of these options: json or verbose_json
+     */
+    @JsonProperty("response_format")
+    String responseFormat;
+
+    /**
+     * The sampling temperature, between 0 and 1.
+     * Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+     * If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.
+     */
+    Double temperature;
+}
diff --git a/api/src/main/java/com/theokanning/openai/audio/TranscriptionResult.java b/api/src/main/java/com/theokanning/openai/audio/TranscriptionResult.java
@@ -0,0 +1,44 @@
+package com.theokanning.openai.audio;
+
+import lombok.Data;
+
+import java.util.List;
+
+/**
+ * An object with the text transcription
+ *
+ * https://platform.openai.com/docs/api-reference/audio/create
+ */
+@Data
+public class TranscriptionResult {
+
+    /**
+     * The text transcription.
+     */
+    String text;
+
+    /**
+     * Task name
+     * @apiNote verbose_json response format only
+     */
+    String task;
+
+    /**
+     * Speech language
+     * @apiNote verbose_json response format only
+     */
+    String language;
+
+    /**
+     * Speech duration
+     * @apiNote verbose_json response format only
+     */
+    Double duration;
+
+    /**
+     * List of segments
+     * @apiNote verbose_json response format only
+     */
+    List<TranscriptionSegment> segments;
+
+}
diff --git a/api/src/main/java/com/theokanning/openai/audio/TranscriptionSegment.java b/api/src/main/java/com/theokanning/openai/audio/TranscriptionSegment.java
@@ -0,0 +1,32 @@
+package com.theokanning.openai.audio;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+import lombok.Data;
+
+import java.util.List;
+
+/**
+ * An object represents transcription segment
+ *
+ * https://platform.openai.com/docs/api-reference/audio/create
+ */
+@Data
+public class TranscriptionSegment {
+
+    Integer id;
+    Integer seek;
+    Double start;
+    Double end;
+    String text;
+    List<Integer> tokens;
+    Double temperature;
+    @JsonProperty("avg_logprob")
+    Double averageLogProb;
+    @JsonProperty("compression_ratio")
+    Double compressionRatio;
+    @JsonProperty("no_speech_prob")
+    Double noSpeechProb;
+    @JsonProperty("transient")
+    Boolean transientFlag;
+
+}
diff --git a/api/src/main/java/com/theokanning/openai/audio/TranslationResult.java b/api/src/main/java/com/theokanning/openai/audio/TranslationResult.java
@@ -0,0 +1,44 @@
+package com.theokanning.openai.audio;
+
+import lombok.Data;
+
+import java.util.List;
+
+/**
+ * An object with the English transcription
+ *
+ * https://platform.openai.com/docs/api-reference/audio/create
+ */
+@Data
+public class TranslationResult {
+
+    /**
+     * Translated text.
+     */
+    String text;
+
+    /**
+     * Task name
+     * @apiNote verbose_json response format only
+     */
+    String task;
+
+    /**
+     * Translated language
+     * @apiNote verbose_json response format only
+     */
+    String language;
+
+    /**
+     * Speech duration
+     * @apiNote verbose_json response format only
+     */
+    Double duration;
+
+    /**
+     * List of segments
+     * @apiNote verbose_json response format only
+     */
+    List<TranscriptionSegment> segments;
+
+}
diff --git a/api/src/test/java/com/theokanning/openai/JsonTest.java b/api/src/test/java/com/theokanning/openai/JsonTest.java
@@ -2,6 +2,8 @@
 
 import com.fasterxml.jackson.annotation.JsonInclude;
 import com.fasterxml.jackson.databind.*;
+import com.theokanning.openai.audio.TranscriptionResult;
+import com.theokanning.openai.audio.TranslationResult;
 import com.theokanning.openai.completion.chat.ChatCompletionRequest;
 import com.theokanning.openai.completion.chat.ChatCompletionResult;
 import com.theokanning.openai.edit.EditRequest;
@@ -42,6 +44,8 @@ public class JsonTest {
             FineTuneEvent.class,
             FineTuneResult.class,
             ImageResult.class,
+            TranscriptionResult.class,
+            TranslationResult.class,
             Model.class,
             ModerationRequest.class,
             ModerationResult.class

diff --git a/api/src/test/resources/fixtures/TranscriptionResult.json b/api/src/test/resources/fixtures/TranscriptionResult.json
@@ -0,0 +1,27 @@
+{
+  "task": "transcribe",
+  "language": "english",
+  "duration": 1.1,
+  "segments": [
+    {
+      "id": 0,
+      "seek": 0,
+      "start": 0.0,
+      "end": 0.96,
+      "text": " Hello World.",
+      "tokens": [
+        50364,
+        2425,
+        3937,
+        13,
+        50412
+      ],
+      "temperature": 0.0,
+      "avg_logprob": -0.7308251063028971,
+      "compression_ratio": 0.6,
+      "no_speech_prob": 0.015335720032453537,
+      "transient": false
+    }
+  ],
+  "text": "Hello World."
+}
diff --git a/api/src/test/resources/fixtures/TranslationResult.json b/api/src/test/resources/fixtures/TranslationResult.json
@@ -0,0 +1,37 @@
+{
+  "task": "translate",
+  "language": "english",
+  "duration": 4.38,
+  "segments": [
+    {
+      "id": 0,
+      "seek": 0,
+      "start": 0.0,
+      "end": 4.32,
+      "text": " Hello, my name is Yuna. I am Korean voice.",
+      "tokens": [
+        50364,
+        2425,
+        11,
+        452,
+        1315,
+        307,
+        398,
+        5051,
+        13,
+        286,
+        669,
+        6933,
+        3177,
+        13,
+        50580
+      ],
+      "temperature": 0.0,
+      "avg_logprob": -0.6644304394721985,
+      "compression_ratio": 0.84,
+      "no_speech_prob": 0.006824055220931768,
+      "transient": false
+    }
+  ],
+  "text": "Hello, my name is Yuna. I am Korean voice."
+}
diff --git a/service/src/main/java/com/theokanning/openai/service/OpenAiService.java b/service/src/main/java/com/theokanning/openai/service/OpenAiService.java
@@ -7,6 +7,10 @@
 import com.theokanning.openai.DeleteResult;
 import com.theokanning.openai.OpenAiError;
 import com.theokanning.openai.OpenAiHttpException;
+import com.theokanning.openai.audio.CreateTranscriptionRequest;
+import com.theokanning.openai.audio.CreateTranslationRequest;
+import com.theokanning.openai.audio.TranscriptionResult;
+import com.theokanning.openai.audio.TranslationResult;
 import com.theokanning.openai.client.OpenAiApi;
 import com.theokanning.openai.completion.CompletionChunk;
 import com.theokanning.openai.completion.CompletionRequest;
@@ -248,6 +252,61 @@ public ImageResult createImageVariation(CreateImageVariationRequest request, jav
         return execute(api.createImageVariation(builder.build()));
     }
 
+    public TranscriptionResult createTranscription(CreateTranscriptionRequest request, String audioPath) {
+        java.io.File audio = new java.io.File(audioPath);
+        return createTranscription(request, audio);
+    }
+
+    public TranscriptionResult createTranscription(CreateTranscriptionRequest request, java.io.File audio) {
+        RequestBody audioBody = RequestBody.create(MediaType.parse("audio"), audio);
+
+        MultipartBody.Builder builder = new MultipartBody.Builder()
+                .setType(MediaType.get("multipart/form-data"))
+                .addFormDataPart("model", request.getModel())
+                .addFormDataPart("file", audio.getName(), audioBody);
+
+        if (request.getPrompt() != null) {
+            builder.addFormDataPart("prompt", request.getPrompt());
+        }
+        if (request.getResponseFormat() != null) {
+            builder.addFormDataPart("response_format", request.getResponseFormat());
+        }
+        if (request.getTemperature() != null) {
+            builder.addFormDataPart("temperature", request.getTemperature().toString());
+        }
+        if (request.getLanguage() != null) {
+            builder.addFormDataPart("language", request.getLanguage());
+        }
+
+        return execute(api.createTranscription(builder.build()));
+    }
+
+    public TranslationResult createTranslation(CreateTranslationRequest request, String audioPath) {
+        java.io.File audio = new java.io.File(audioPath);
+        return createTranslation(request, audio);
+    }
+
+    public TranslationResult createTranslation(CreateTranslationRequest request, java.io.File audio) {
+        RequestBody audioBody = RequestBody.create(MediaType.parse("audio"), audio);
+
+        MultipartBody.Builder builder = new MultipartBody.Builder()
+                .setType(MediaType.get("multipart/form-data"))
+                .addFormDataPart("model", request.getModel())
+                .addFormDataPart("file", audio.getName(), audioBody);
+
+        if (request.getPrompt() != null) {
+            builder.addFormDataPart("prompt", request.getPrompt());
+        }
+        if (request.getResponseFormat() != null) {
+            builder.addFormDataPart("response_format", request.getResponseFormat());
+        }
+        if (request.getTemperature() != null) {
+            builder.addFormDataPart("temperature", request.getTemperature().toString());
+        }
+
+        return execute(api.createTranslation(builder.build()));
+    }
+
     public ModerationResult createModeration(ModerationRequest request) {
         return execute(api.createModeration(request));
     }