diff --git a/__tests__/jigsawstack.test.ts b/__tests__/jigsawstack.test.ts new file mode 100644 index 0000000..b3fe620 --- /dev/null +++ b/__tests__/jigsawstack.test.ts @@ -0,0 +1,30 @@ +import { writeFile } from 'node:fs/promises'; +import { describe, expect, it } from 'vitest'; +import { speak, transcribeWithUrl } from '../src'; +import { jigsawstack } from '../src/jigsawstack'; + +describe('JigsawStack Tests', () => { + it('should convert text to speech', async () => { + const speech = await speak({ + model: jigsawstack.tts("en-US-female-10"), + prompt: 'Hello from Orate, the AI toolkit for speech.', + }); + + await writeFile( + './__tests__/output/jigsawstack-speech.wav', + Buffer.from(await speech.arrayBuffer()) + ); + + expect(speech).toBeInstanceOf(File); + expect(speech.size).toBeGreaterThan(0); + }); + + it('should convert speech to text', async () => { + const text = await transcribeWithUrl({ + model: jigsawstack.stt(), + url:"https://uuvhpoxkzjnrvvajhnyb.supabase.co/storage/v1/object/sign/default/preview/stt-examples/stt_very_short_audio_sample_2.mp3?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJkZWZhdWx0L3ByZXZpZXcvc3R0LWV4YW1wbGVzL3N0dF92ZXJ5X3Nob3J0X2F1ZGlvX3NhbXBsZV8yLm1wMyIsImlhdCI6MTczMjIwMzIwNywiZXhwIjozMTU1MzAwNjY3MjA3fQ._R0cLbrIx_FUR3CMRYaUMj616diA_1fjWUcVq2vAONg&t=2024-11-21T15%3A33%3A27.154Z", + }); + expect(typeof text).toBe('string'); + expect(text.length).toBeGreaterThan(0); + }); +}); diff --git a/package.json b/package.json index ffd4b15..7bfb072 100644 --- a/package.json +++ b/package.json @@ -56,6 +56,7 @@ "fluent-ffmpeg": "^2.1.3", "groq-sdk": "^0.12.0", "ibm-watson": "^10.0.0", + "jigsawstack": "^0.0.25", "ky": "^1.7.4", "microsoft-cognitiveservices-speech-sdk": "^1.42.0", "openai": "^4.80.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 481123f..266ff5e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -38,6 +38,9 @@ importers: ibm-watson: specifier: ^10.0.0 version: 10.0.0 + jigsawstack: + specifier: ^0.0.25 + 
version: 0.0.25 ky: specifier: ^1.7.4 version: 1.7.4 @@ -1652,6 +1655,9 @@ packages: isexe@2.0.0: resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==} + isomorphic-fetch@3.0.0: + resolution: {integrity: sha512-qvUtwJ3j6qwsF3jLxkZ72qCgjMysPzDfeV240JHiGZsANBYd+EEuu35v7dfrJ9Up0Ak07D7GGSkGhCHTqg/5wA==} + isstream@0.1.2: resolution: {integrity: sha512-Yljz7ffyPbrLpLngrMtZ7NduUgVvi6wG9RJ9IUcyCd59YQ911PBJphODUcbOVbqYfxe1wuYf/LJ8PauMRwsM/g==} @@ -1662,6 +1668,10 @@ packages: resolution: {integrity: sha512-qjdpeo2yKlYTH7nFdK0vbZWuTCesk4o63v5iVOlhMQPfuIZQfW/HI35SjfhA+4qpg36rnFSvUK5b1m+ckIblQQ==} engines: {node: '>= 0.6.0'} + jigsawstack@0.0.25: + resolution: {integrity: sha512-0AQIkpt5gsiokSoEheyoUIhE45gwcAZjrVxBQXn20EDmRB2qO3IlOxBWDWzXm+d3j7sBxxdJV5eivq/Adfgqnw==} + engines: {node: '>=18'} + joycon@3.1.1: resolution: {integrity: sha512-34wB/Y7MW7bzjKRjUKTa46I2Z7eV62Rkhva+KkopW7Qvv/OSWBqvkSY7vusOPrNuZcUG3tApvdVgNB8POj3SPw==} engines: {node: '>=10'} @@ -2684,6 +2694,9 @@ packages: resolution: {integrity: sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==} engines: {node: '>=18'} + whatwg-fetch@3.6.20: + resolution: {integrity: sha512-EqhiFU6daOA8kpjOWTL0olhVOF3i7OrFzSYiGsEMB8GcXS+RrzauAERX65xMeNWVqxA6HXH2m69Z9LaKKdisfg==} + whatwg-mimetype@4.0.0: resolution: {integrity: sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==} engines: {node: '>=18'} @@ -4521,6 +4534,13 @@ snapshots: isexe@2.0.0: {} + isomorphic-fetch@3.0.0: + dependencies: + node-fetch: 2.7.0 + whatwg-fetch: 3.6.20 + transitivePeerDependencies: + - encoding + isstream@0.1.2: {} jackspeak@3.4.3: @@ -4531,6 +4551,12 @@ snapshots: java-properties@1.0.2: {} + jigsawstack@0.0.25: + dependencies: + isomorphic-fetch: 3.0.0 + transitivePeerDependencies: + - encoding + joycon@3.1.1: {} js-tokens@4.0.0: {} @@ -5609,6 +5635,8 @@ snapshots: dependencies: iconv-lite: 
0.6.3 + whatwg-fetch@3.6.20: {} + whatwg-mimetype@4.0.0: {} whatwg-url@14.1.0: diff --git a/src/index.ts b/src/index.ts index 369d6af..d4f1ca3 100644 --- a/src/index.ts +++ b/src/index.ts @@ -85,3 +85,29 @@ export const isolate = async ({ model, audio, }: IsolateOptions): Promise => model(audio); + + + +/** + * Options for the transcribeWithUrl function, which converts speech at a URL to text. + * @interface TranscribeWithUrlOptions + * @property {function} model - A function that takes an audio URL and returns a Promise resolving to the transcribed text + * @property {string} url - The URL of the audio to transcribe + */ +export type TranscribeWithUrlOptions = { + model: (url: string) => Promise; + url: string; +}; + + +/** + * Transcribes the audio at the given URL to text by calling the provided model with that URL. + * @param {TranscribeWithUrlOptions} options - The options for speech-to-text transcription from an audio URL + * @param {function} options.model - The model function to use for transcription + * @param {string} options.url - The URL of the audio to transcribe + * @returns {Promise} A Promise that resolves to the transcribed text + */ +export const transcribeWithUrl = async ({ + model, + url, +}: TranscribeWithUrlOptions): Promise => model(url); \ No newline at end of file diff --git a/src/jigsawstack.ts b/src/jigsawstack.ts new file mode 100644 index 0000000..bd2fe49 --- /dev/null +++ b/src/jigsawstack.ts @@ -0,0 +1,74 @@ +import { JigsawStack } from 'jigsawstack'; + +type JigsawStackType = ReturnType; +type STTParams = Parameters['0']; +type TTSParams = Parameters['0']; + +/** + * Creates a JigsawStack provider instance with the API key read from the JIGSAWSTACK_API_KEY environment variable + * @returns {JigsawStackType} Configured JigsawStack client instance + * @throws {Error} If the JIGSAWSTACK_API_KEY environment variable is not set or is empty + */ +const createProvider = () => { + const apiKey = process.env.JIGSAWSTACK_API_KEY; + + if (!apiKey) { + throw new Error('JIGSAWSTACK_API_KEY is not set'); + } + + return JigsawStack({ apiKey }); +}; + +/** + *
JigsawStack speech-to-text (`stt`) and text-to-speech (`tts`) model factories + */ +export const jigsawstack = { + /** + * Creates a speech-to-text transcription function using JigsawStack; the provider is created when `stt` is called + * @param {Omit} options - Additional request options, spread into the request after `url` + * @returns {Function} Async function that takes an audio URL and returns the transcribed text + */ + stt: (options?: Omit) => { + const provider = createProvider(); + /** + * Transcribes the audio at the given URL to text using JigsawStack + * @param {string} url - The audio URL to transcribe + * @returns {Promise} The `text` field of the speech_to_text response + * NOTE(review): no explicit error is thrown here for empty results; `response.text` is returned as-is + */ + return async (url: string) => { + const response = await provider.audio.speech_to_text({ + url, + ...options, + }); + return response.text; + }; + }, + + /** + * Creates a text-to-speech synthesis function using JigsawStack TTS; `properties` are spread into the request after `text` and `accent` + * @param {TTSParams["accent"]} accent - The voice to use for synthesis. Defaults to 'en-US-female-27' + * @returns {Function} Async function that takes text and returns synthesized audio + */ + tts: ( + accent: TTSParams['accent'] = 'en-US-female-27', + properties?: Omit + ) => { + const provider = createProvider(); + + /** + * Synthesizes text to speech using JigsawStack TTS + * @param {string} text - The text to convert to speech + * @returns {Promise} The result of `response.file('speech.mp3', { type: 'audio/mpeg' })` + */ + return async (text: string) => { + const response = await provider.audio.text_to_speech({ + text, + accent, + ...properties, + }); + const file = await response.file('speech.mp3', { type: 'audio/mpeg' }); + return file; + }; + }, +}; diff --git a/src/openai.ts b/src/openai.ts index f28a1d8..9ef9af0 100644 --- a/src/openai.ts +++ b/src/openai.ts @@ -77,6 +77,7 @@ export const openai = { const response = await provider.audio.transcriptions.create({ model, file: audio, + ...properties, }); diff --git a/website/app/(home)/components/providers.tsx b/website/app/(home)/components/providers.tsx index 52aefb2..6be5414 100644 --- a/website/app/(home)/components/providers.tsx +++
b/website/app/(home)/components/providers.tsx @@ -12,6 +12,7 @@ import Gladia from '../../../public/providers/gladia.svg'; import Google from '../../../public/providers/google.svg'; import Groq from '../../../public/providers/groq.svg'; import IBM from '../../../public/providers/ibm.svg'; +import JigsawStack from '../../../public/providers/jigsaw-stack.svg'; import Murf from '../../../public/providers/murf.svg'; import OpenAI from '../../../public/providers/openai.svg'; import Play from '../../../public/providers/play.svg'; @@ -34,6 +35,7 @@ const providers = [ { name: 'Replicate', image: Replicate, href: '/docs/replicate' }, { name: 'Groq', image: Groq, href: '/docs/groq' }, { name: 'Play', image: Play, href: '/docs/play' }, + { name: 'JigsawStack', image: JigsawStack, href: '/docs/jigsawstack' }, ]; export const Providers = () => ( diff --git a/website/content/docs/jigsaw-stack.mdx b/website/content/docs/jigsaw-stack.mdx new file mode 100644 index 0000000..9dd90cd --- /dev/null +++ b/website/content/docs/jigsaw-stack.mdx @@ -0,0 +1,87 @@ +--- +title: JigsawStack +description: Orate supports JigsawStack's speech and transcription services. +--- + +import { AutoTypeTable } from 'fumadocs-typescript/ui'; + +JigsawStack offers a suite of small, fast models that automate the boring and complex tasks in every tech stack with specialized fine-tuning. + +## Setup + +The JigsawStack provider is available by default in Orate. To import it, you can use the following code: + +```ts +import { jigsawstack } from 'orate/jigsawstack'; +``` + +## Configuration + +The JigsawStack provider looks for the `JIGSAWSTACK_API_KEY` environment variable. This variable is required for the provider to work. Simply add the following to your `.env` file: + +```bash +JIGSAWSTACK_API_KEY="your_api_key" +``` + +## Usage + +The JigsawStack provider provides a single interface for all of JigsawStack's speech and transcription services.
+ +### Text to Speech + +The JigsawStack provider provides a `tts` function that allows you to create a text-to-speech synthesis function using JigsawStack TTS. By default, the `tts` function uses the `en-US-female-27` voice (accent). + +```ts +import { speak } from 'orate'; +import { jigsawstack } from 'orate/jigsawstack'; + +const speech = await speak({ + model: jigsawstack.tts(), + prompt: 'Hello, world!', +}); +``` + +You can specify the voice (accent) to use by passing it as the first argument to the `tts` function. + +```ts +const speech = await speak({ + model: jigsawstack.tts('en-US-female-27'), + prompt: 'Hello, world!', +}); +``` + +You can also set additional JigsawStack properties by passing them as the second argument to the `tts` function. + +```ts +const speech = await speak({ + model: jigsawstack.tts('en-US-female-27', { + speaker_clone_file_store_key: '...', + }), + prompt: 'Hello, world!', +}); +``` + +### Speech to Text + +The JigsawStack provider provides a `stt` function that allows you to create a speech-to-text transcription function using JigsawStack. The function it returns takes the URL of an audio file, so use it with `transcribeWithUrl`. + +```ts +import { transcribeWithUrl } from 'orate'; +import { jigsawstack } from 'orate/jigsawstack'; + +const text = await transcribeWithUrl({ + model: jigsawstack.stt(), + url: 'https://example.com/audio.mp3', +}); +``` + +You can also set additional JigsawStack properties by passing them as an argument to the `stt` function. + +```ts +const text = await transcribeWithUrl({ + model: jigsawstack.stt({ + batch_size: 1, + }), + url: 'https://example.com/audio.mp3', +}); +``` diff --git a/website/public/providers/jigsaw-stack.svg b/website/public/providers/jigsaw-stack.svg new file mode 100644 index 0000000..1cef199 --- /dev/null +++ b/website/public/providers/jigsaw-stack.svg @@ -0,0 +1 @@ + \ No newline at end of file