diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2ffd505394f5..6bb9cdd4ab4f 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -262,6 +262,13 @@ # ServiceLabel: %Cognitive - Speech # ServiceOwners: @rhurey +# PRLabel: %Speech Transcription +/sdk/transcription/azure-ai-speech-transcription/ @amber-yujueWang @rhurey @xitzhang @Azure/azure-java-sdk + +# ServiceLabel: %Speech Transcription +# AzureSdkOwners: @amber-yujueWang @rhurey @xitzhang +# ServiceOwners: @rhurey @xitzhang @amber-yujueWang + # PRLabel: %Cognitive - Text Analytics /sdk/textanalytics/ @samvaity @quentinRobinson @Azure/azure-java-sdk diff --git a/eng/versioning/version_client.txt b/eng/versioning/version_client.txt index 842251986c0d..37c62f75daba 100644 --- a/eng/versioning/version_client.txt +++ b/eng/versioning/version_client.txt @@ -53,6 +53,7 @@ com.azure:azure-ai-openai-realtime;1.0.0-beta.1;1.0.0-beta.1 com.azure:azure-ai-openai-stainless;1.0.0-beta.1;1.0.0-beta.1 com.azure:azure-ai-personalizer;1.0.0-beta.1;1.0.0-beta.2 com.azure:azure-ai-projects;1.0.0-beta.3;1.0.0-beta.4 +com.azure:azure-ai-speech-transcription;1.0.0-beta.1;1.0.0-beta.1 com.azure:azure-ai-textanalytics;5.5.11;5.6.0-beta.1 com.azure:azure-ai-textanalytics-perf;1.0.0-beta.1;1.0.0-beta.1 com.azure:azure-ai-translation-text;1.1.7;2.0.0-beta.1 diff --git a/pom.xml b/pom.xml index 8c702d55c621..020e7eae65cd 100644 --- a/pom.xml +++ b/pom.xml @@ -265,6 +265,7 @@ sdk/timeseriesinsights sdk/tools sdk/trafficmanager + sdk/transcription sdk/translation sdk/trustedsigning sdk/vision diff --git a/sdk/transcription/azure-ai-speech-transcription/CHANGELOG.md b/sdk/transcription/azure-ai-speech-transcription/CHANGELOG.md new file mode 100644 index 000000000000..e62d01580eee --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/CHANGELOG.md @@ -0,0 +1,7 @@ +# Release History + +## 1.0.0-beta.1 (2025-12-19) + +### Features Added + +- Initial release of Azure AI Speech Transcription client library for Java. diff --git a/sdk/transcription/azure-ai-speech-transcription/README.md b/sdk/transcription/azure-ai-speech-transcription/README.md new file mode 100644 index 000000000000..7180668de8d3 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/README.md @@ -0,0 +1,305 @@ +# Azure AI Speech Transcription client library for Java + +The Azure AI Speech Transcription client library provides a simple and efficient way to convert audio to text using Azure Cognitive Services. This library enables you to transcribe audio with features like speaker diarization, profanity filtering, and phrase hints for improved accuracy. 
+
+## Documentation
+
+Various documentation is available to help you get started:
+
+- [API reference documentation][docs]
+- [Product documentation][product_documentation]
+- [Azure Speech Service documentation](https://learn.microsoft.com/azure/ai-services/speech-service/)
+
+## Getting started
+
+### Prerequisites
+
+- [Java Development Kit (JDK)][jdk] with version 8 or above
+- [Azure Subscription][azure_subscription]
+- An [Azure Speech resource](https://learn.microsoft.com/azure/ai-services/speech-service/overview#try-the-speech-service-for-free) or [Cognitive Services multi-service resource](https://learn.microsoft.com/azure/ai-services/multi-service-resource)
+
+### Adding the package to your product
+
+[//]: # ({x-version-update-start;com.azure:azure-ai-speech-transcription;current})
+```xml
+<dependency>
+    <groupId>com.azure</groupId>
+    <artifactId>azure-ai-speech-transcription</artifactId>
+    <version>1.0.0-beta.1</version>
+</dependency>
+```
+[//]: # ({x-version-update-end})
+
+#### Optional: For Entra ID Authentication
+
+If you plan to use Entra ID authentication (recommended for production), also add the `azure-identity` dependency:
+
+```xml
+<dependency>
+    <groupId>com.azure</groupId>
+    <artifactId>azure-identity</artifactId>
+    <version>1.18.1</version>
+</dependency>
+```
+
+### Authentication
+
+Azure Speech Transcription supports two authentication methods:
+
+#### Option 1: API Key Authentication (Subscription Key)
+
+You can find your Speech resource's API key in the [Azure Portal](https://portal.azure.com) or by using the Azure CLI:
+
+```bash
+az cognitiveservices account keys list --name <your-resource-name> --resource-group <your-resource-group>
+```
+
+Once you have an API key, you can authenticate using `KeyCredential`:
+
+```java
+import com.azure.core.credential.KeyCredential;
+
+TranscriptionClient client = new TranscriptionClientBuilder()
+    .endpoint("https://<your-resource-name>.cognitiveservices.azure.com/")
+    .credential(new KeyCredential("<your-api-key>"))
+    .buildClient();
+```
+
+#### Option 2: Entra ID OAuth2 Authentication (Recommended for Production)
+
+For production scenarios, it's recommended to use Entra ID authentication with managed identities or service principals. This provides better security and easier credential management.
+
+```java
+import com.azure.identity.DefaultAzureCredential;
+import com.azure.identity.DefaultAzureCredentialBuilder;
+
+// Use DefaultAzureCredential, which works with managed identities, service principals, Azure CLI, etc.
+DefaultAzureCredential credential = new DefaultAzureCredentialBuilder().build();
+
+TranscriptionClient client = new TranscriptionClientBuilder()
+    .endpoint("https://<your-resource-name>.cognitiveservices.azure.com/")
+    .credential(credential)
+    .buildClient();
+```
+
+**Note:** To use Entra ID authentication, you need to:
+1. Add the `azure-identity` dependency to your project
+2. Assign the appropriate role (e.g., "Cognitive Services User") to your managed identity or service principal
+3. Ensure your Cognitive Services resource has Entra ID authentication enabled
+
+For more information on Entra ID authentication, see:
+- [Authenticate with Azure Identity](https://learn.microsoft.com/azure/developer/java/sdk/identity)
+- [Azure Cognitive Services authentication](https://learn.microsoft.com/azure/ai-services/authentication)
+
+## Key concepts
+
+### TranscriptionClient
+
+The `TranscriptionClient` is the primary interface for interacting with the Speech Transcription service. It provides synchronous methods to transcribe audio to text.
+
+### TranscriptionAsyncClient
+
+The `TranscriptionAsyncClient` provides asynchronous methods for transcribing audio, allowing non-blocking operations that return reactive types.
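+
+As a minimal sketch of the asynchronous path (assuming the builder's generated `buildAsyncClient()` method, and using the same placeholder endpoint and key as above), you can subscribe to the returned `Mono`:
+
+```java
+import com.azure.ai.speech.transcription.models.AudioFileDetails;
+import com.azure.ai.speech.transcription.models.TranscriptionOptions;
+import com.azure.core.credential.KeyCredential;
+import com.azure.core.util.BinaryData;
+
+import java.nio.file.Files;
+import java.nio.file.Paths;
+
+TranscriptionAsyncClient asyncClient = new TranscriptionClientBuilder()
+    .endpoint("https://<your-resource-name>.cognitiveservices.azure.com/")
+    .credential(new KeyCredential("<your-api-key>"))
+    .buildAsyncClient();
+
+byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav"));
+TranscriptionOptions options = new TranscriptionOptions(new AudioFileDetails(BinaryData.fromBytes(audioData)));
+
+// Nothing is sent until the Mono is subscribed to.
+asyncClient.transcribe(options)
+    .subscribe(
+        result -> result.getCombinedPhrases().forEach(phrase -> System.out.println(phrase.getText())),
+        error -> System.err.println("Transcription failed: " + error.getMessage()));
+```
+
+The async client is built from the same builder, so both authentication options shown above apply unchanged.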
+
+### Audio Formats
+
+The service supports various audio formats including WAV, MP3, OGG, and more. Audio must be:
+
+- Shorter than 2 hours in duration
+- Smaller than 250 MB in size
+
+### Transcription Options
+
+You can customize transcription with options like:
+
+- **Profanity filtering**: Control how profanity is handled in transcriptions
+- **Speaker diarization**: Identify different speakers in multi-speaker audio
+- **Phrase lists**: Provide domain-specific phrases to improve accuracy
+- **Language detection**: Automatically detect the spoken language
+- **Enhanced mode**: Improve transcription quality with custom prompts, translation, and task-specific configurations
+
+## Examples
+
+### Transcribe an audio file
+
+```java com.azure.ai.speech.transcription.readme
+TranscriptionClient client = new TranscriptionClientBuilder()
+    .endpoint("https://<your-resource-name>.cognitiveservices.azure.com/")
+    .credential(new KeyCredential("<your-api-key>"))
+    .buildClient();
+
+try {
+    // Read audio file
+    byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav"));
+
+    // Create audio file details
+    AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData));
+
+    // Create transcription options
+    TranscriptionOptions options = new TranscriptionOptions(audioFileDetails);
+
+    // Transcribe audio
+    TranscriptionResult result = client.transcribe(options);
+
+    // Process results
+    System.out.println("Duration: " + result.getDuration().toMillis() + " ms");
+    result.getCombinedPhrases().forEach(phrase -> {
+        System.out.println("Channel " + phrase.getChannel() + ": " + phrase.getText());
+    });
+} catch (Exception e) {
+    System.err.println("Error during transcription: " + e.getMessage());
+}
+```
+
+### Transcribe using audio URL
+
+You can transcribe audio directly from a URL without downloading the file first:
+
+```java readme-sample-transcribeWithAudioUrl
+TranscriptionClient client = new TranscriptionClientBuilder()
+    .endpoint("https://<your-resource-name>.cognitiveservices.azure.com/")
+    .credential(new KeyCredential("<your-api-key>"))
+    .buildClient();
+
+// Create transcription options with audio URL
+TranscriptionOptions options = new TranscriptionOptions("https://example.com/audio.wav");
+
+// Transcribe audio
+TranscriptionResult result = client.transcribe(options);
+
+// Process results
+result.getCombinedPhrases().forEach(phrase -> {
+    System.out.println(phrase.getText());
+});
+```
+
+### Transcribe with multi-language support
+
+The service can automatically detect and transcribe multiple languages within the same audio file.
+
+```java com.azure.ai.speech.transcription.transcriptionoptions.multilanguage
+byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav"));
+
+AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData));
+
+// Configure transcription WITHOUT specifying locales
+// This allows the service to auto-detect and transcribe multiple languages
+TranscriptionOptions options = new TranscriptionOptions(audioFileDetails);
+
+TranscriptionResult result = client.transcribe(options);
+
+result.getPhrases().forEach(phrase -> {
+    System.out.println("Language: " + phrase.getLocale());
+    System.out.println("Text: " + phrase.getText());
+});
+```
+
+### Transcribe with enhanced mode
+
+Enhanced mode provides advanced features to improve transcription accuracy with custom prompts. Enhanced mode is automatically enabled when you create an `EnhancedModeOptions` instance.
+
+```java com.azure.ai.speech.transcription.transcriptionoptions.enhancedmode
+byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav"));
+
+AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData));
+
+// Enhanced mode is automatically enabled
+EnhancedModeOptions enhancedMode = new EnhancedModeOptions()
+    .setTask("transcribe")
+    .setPrompts(java.util.Arrays.asList("Output must be in lexical format."));
+
+TranscriptionOptions options = new TranscriptionOptions(audioFileDetails)
+    .setEnhancedModeOptions(enhancedMode);
+
+TranscriptionResult result = client.transcribe(options);
+
+System.out.println("Transcription: " + result.getCombinedPhrases().get(0).getText());
+```
+
+### Transcribe with phrase list
+
+You can use a phrase list to improve recognition accuracy for specific terms.
+
+```java com.azure.ai.speech.transcription.transcriptionoptions.phraselist
+byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav"));
+
+AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData));
+
+PhraseListOptions phraseListOptions = new PhraseListOptions()
+    .setPhrases(java.util.Arrays.asList("Azure", "Cognitive Services"))
+    .setBiasingWeight(5.0);
+
+TranscriptionOptions options = new TranscriptionOptions(audioFileDetails)
+    .setPhraseListOptions(phraseListOptions);
+
+TranscriptionResult result = client.transcribe(options);
+
+result.getCombinedPhrases().forEach(phrase -> {
+    System.out.println(phrase.getText());
+});
+```
+
+### Service API versions
+
+The client library targets the latest service API version by default.
+The service client builder accepts an optional service API version parameter to specify which API version to communicate with.
+
+#### Select a service API version
+
+You can explicitly select a supported service API version when initializing a service client via the service client builder.
+This ensures that the client can communicate with services using the specified API version.
+
+When selecting an API version, verify that there are no breaking changes compared to the latest API version; if there are significant differences, API calls may fail due to incompatibility.
+
+Always ensure that the chosen API version is fully supported and operational for your specific use case and that it aligns with the service's versioning policy.
+
+## Troubleshooting
+
+### Enable client logging
+
+You can enable logging to debug issues with the client library. The Azure client libraries for Java use the SLF4J logging facade. You can configure logging by adding a logging dependency and configuration file. For more information, see the [logging documentation](https://learn.microsoft.com/azure/developer/java/sdk/logging-overview).
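+
+As a minimal sketch: the `httpLogOptions` setter comes from `HttpTrait`, which this builder implements, and the `AZURE_LOG_LEVEL` environment variable (for example `verbose`) controls azure-core's fallback console logger when no SLF4J binding is on the classpath.
+
+```java
+import com.azure.core.credential.KeyCredential;
+import com.azure.core.http.policy.HttpLogDetailLevel;
+import com.azure.core.http.policy.HttpLogOptions;
+
+// Run with AZURE_LOG_LEVEL=verbose (or an SLF4J binding plus its own configuration)
+// so that the HTTP logs configured below are actually emitted.
+TranscriptionClient client = new TranscriptionClientBuilder()
+    .endpoint("https://<your-resource-name>.cognitiveservices.azure.com/")
+    .credential(new KeyCredential("<your-api-key>"))
+    .httpLogOptions(new HttpLogOptions().setLogLevel(HttpLogDetailLevel.BODY_AND_HEADERS))
+    .buildClient();
+```
+
+`BODY_AND_HEADERS` is the most verbose detail level; prefer `BASIC` outside of debugging sessions.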
+ +### Common issues + +#### Authentication errors + +- Verify that your API key is correct +- Ensure your endpoint URL matches your Azure resource region + +#### Audio format errors + +- Verify your audio file is in a supported format +- Ensure the audio file size is under 250 MB and duration is under 2 hours + +### Getting help + +If you encounter issues: + +- Check the [troubleshooting guide](https://learn.microsoft.com/azure/ai-services/speech-service/troubleshooting) +- Search for existing issues or create a new one on [GitHub](https://github.com/Azure/azure-sdk-for-java/issues) +- Ask questions on [Stack Overflow](https://stackoverflow.com/questions/tagged/azure-java-sdk) with the `azure-java-sdk` tag + +## Next steps + +- Explore the [samples](https://github.com/Azure/azure-sdk-for-java/tree/main/sdk/transcription/azure-ai-speech-transcription/src/samples) for more examples +- Learn more about [Azure Speech Service](https://learn.microsoft.com/azure/ai-services/speech-service/) +- Review the [API reference documentation][docs] for detailed information about classes and methods + +## Contributing + + +For details on contributing to this repository, see the [contributing guide](https://github.com/Azure/azure-sdk-for-java/blob/main/CONTRIBUTING.md). + +1. Fork it +1. Create your feature branch (`git checkout -b my-new-feature`) +1. Commit your changes (`git commit -am 'Add some feature'`) +1. Push to the branch (`git push origin my-new-feature`) +1. Create new Pull Request + + +[product_documentation]: https://learn.microsoft.com/azure/ai-services/speech-service/ +[docs]: https://azure.github.io/azure-sdk-for-java/ +[jdk]: https://learn.microsoft.com/azure/developer/java/fundamentals/ +[azure_subscription]: https://azure.microsoft.com/free/ + diff --git a/sdk/transcription/azure-ai-speech-transcription/assets.json b/sdk/transcription/azure-ai-speech-transcription/assets.json new file mode 100644 index 000000000000..5c6f05bf06f2 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/assets.json @@ -0,0 +1 @@ +{"AssetsRepo":"Azure/azure-sdk-assets","AssetsRepoPrefixPath":"java","TagPrefix":"java/transcription/azure-ai-speech-transcription","Tag": "java/transcription/azure-ai-speech-transcription_c82ca4aec0"} diff --git a/sdk/transcription/azure-ai-speech-transcription/cspell.json b/sdk/transcription/azure-ai-speech-transcription/cspell.json new file mode 100644 index 000000000000..4c33070b4f8d --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/cspell.json @@ -0,0 +1,16 @@ +{ + "version": "0.2", + "language": "en", + "words": [ + "azuread", + "BYOD", + "BYOS", + "dexec", + "diarization", + "doméstica", + "empleada", + "habitación", + "misrecognized", + "Mundo" + ] +} diff --git a/sdk/transcription/azure-ai-speech-transcription/customization/pom.xml b/sdk/transcription/azure-ai-speech-transcription/customization/pom.xml new file mode 100644 index 000000000000..3c3058bef116 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/customization/pom.xml @@ -0,0 +1,48 @@ + + + 4.0.0 + + + com.azure + azure-code-customization-parent + 1.0.0-beta.1 + ../../../parents/azure-code-customization-parent + + + azure-ai-speech-transcription-customization + 1.0.0-beta.1 + jar + + + + ../../../.. 
+ + + + + + + org.apache.maven.plugins + maven-antrun-plugin + + + copy + none + + + + + org.codehaus.mojo + build-helper-maven-plugin + + + attach-artifacts + none + + + + + + diff --git a/sdk/transcription/azure-ai-speech-transcription/customization/src/main/java/SpeechTranscriptionCustomization.java b/sdk/transcription/azure-ai-speech-transcription/customization/src/main/java/SpeechTranscriptionCustomization.java new file mode 100644 index 000000000000..d0bdf355bdb0 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/customization/src/main/java/SpeechTranscriptionCustomization.java @@ -0,0 +1,442 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import com.azure.autorest.customization.Customization; +import com.azure.autorest.customization.LibraryCustomization; +import com.azure.autorest.customization.PackageCustomization; +import com.azure.autorest.customization.ClassCustomization; +import com.github.javaparser.ast.Modifier; +import com.github.javaparser.ast.NodeList; +import com.github.javaparser.ast.body.BodyDeclaration; +import com.github.javaparser.ast.body.ConstructorDeclaration; +import com.github.javaparser.ast.stmt.BlockStmt; +import com.github.javaparser.ast.stmt.Statement; +import com.github.javaparser.javadoc.Javadoc; +import com.github.javaparser.javadoc.JavadocBlockTag; +import com.github.javaparser.javadoc.description.JavadocDescription; +import org.slf4j.Logger; + +import java.time.Duration; + +import static com.github.javaparser.StaticJavaParser.parseBlock; +import static com.github.javaparser.StaticJavaParser.parseStatement; +import static com.github.javaparser.javadoc.description.JavadocDescription.parseText; + +/** + * Code customization after code generation for Speech Transcription SDK. + */ +public class SpeechTranscriptionCustomization extends Customization { + + /** + * Creates an instance of SpeechTranscriptionCustomization. 
+ */ + public SpeechTranscriptionCustomization() { + } + + @Override + public void customize(LibraryCustomization customization, Logger logger) { + logger.info("Customizing Speech Transcription SDK"); + + // Check if models package exists before attempting customization + try { + PackageCustomization models = customization.getPackage("com.azure.ai.speech.transcription.models"); + + logger.info("Models package found - applying model customizations"); + + // Customize TranscriptionResult.getDuration() to return Duration instead of int + logger.info("Customizing TranscriptionResult.getDuration()"); + customizeDurationGetter(models, "TranscriptionResult"); + + // Customize TranscribedPhrase.getDuration() to return Duration instead of int + logger.info("Customizing TranscribedPhrase.getDuration()"); + customizeDurationGetter(models, "TranscribedPhrase"); + + // Customize TranscribedWord.getDuration() to return Duration instead of int + logger.info("Customizing TranscribedWord.getDuration()"); + customizeDurationGetter(models, "TranscribedWord"); + + // Customize TranscriptionDiarizationOptions to properly serialize enabled field + logger.info("Customizing TranscriptionDiarizationOptions.toJson()"); + customizeDiarizationOptionsToJson(models); + + // Customize EnhancedModeOptions constructor to auto-set enabled to true + logger.info("Customizing EnhancedModeOptions constructor to auto-set enabled to true"); + customizeEnhancedModeOptions(models); + + // Customize AudioFileDetails.getFilename() to auto-generate filename from contentType if not set + logger.info("Customizing AudioFileDetails.getFilename() to auto-generate filename"); + customizeAudioFileDetailsGetFilename(models); + + // Add AudioFileDetails field and constructors to TranscriptionOptions, make setAudioUrl private, remove no-arg constructor + logger + .info("Customizing TranscriptionOptions to add AudioFileDetails support and remove no-arg constructor"); + customizeTranscriptionOptions(models); + } catch (IllegalArgumentException e) { + logger.warn("Models package not found or empty - skipping model customizations: " + e.getMessage()); + } + + // Make transcribe(TranscriptionContent) package-private in clients + logger.info("Customizing TranscriptionClient to make transcribe(TranscriptionContent) package-private"); + customizeTranscriptionClient(customization.getPackage("com.azure.ai.speech.transcription")); + + logger.info("Customizing TranscriptionAsyncClient to make transcribe(TranscriptionContent) package-private"); + customizeTranscriptionAsyncClient(customization.getPackage("com.azure.ai.speech.transcription")); + } + + /** + * Customize the getDuration() method to return Duration type instead of int. + * The backing field remains as int (milliseconds), but the getter converts it to Duration. 
+ * + * @param packageCustomization the package customization + * @param className the name of the class to customize + */ + private void customizeDurationGetter(PackageCustomization packageCustomization, String className) { + packageCustomization.getClass(className).customizeAst(ast -> { + ast.addImport("java.time.Duration"); + ast.getClassByName(className).ifPresent(clazz -> clazz.getMethodsByName("getDuration").forEach(method -> { + method.setType("Duration") + .setBody(parseBlock("{ return Duration.ofMillis(this.duration); }")) + .setJavadocComment( + new Javadoc(parseText("Get the duration property: The duration in milliseconds.")) + .addBlockTag("return", "the duration value as Duration.")); + })); + }); + } + + /** + * Customize the TranscriptionDiarizationOptions.toJson() method to properly serialize the enabled field. + * When maxSpeakers is set, enabled should be automatically set to true and serialized. + * + * @param packageCustomization the package customization + */ + private void customizeDiarizationOptionsToJson(PackageCustomization packageCustomization) { + packageCustomization.getClass("TranscriptionDiarizationOptions").customizeAst(ast -> { + ast.getClassByName("TranscriptionDiarizationOptions") + .ifPresent(clazz -> clazz.getMethodsByName("toJson").forEach(method -> { + method.setBody(parseBlock( + "{ jsonWriter.writeStartObject(); if (this.maxSpeakers != null) { jsonWriter.writeBooleanField(\"enabled\", true); jsonWriter.writeNumberField(\"maxSpeakers\", this.maxSpeakers); } return jsonWriter.writeEndObject(); }")); + })); + }); + } + + /** + * Customize EnhancedModeOptions to hide the enabled property from the public API. + * The enabled property is automatically set to true in the constructor. + * Both isEnabled() getter and setEnabled() setter are removed from the public API. + * + * @param packageCustomization the package customization + */ + private void customizeEnhancedModeOptions(PackageCustomization packageCustomization) { + packageCustomization.getClass("EnhancedModeOptions").customizeAst(ast -> { + ast.getClassByName("EnhancedModeOptions").ifPresent(clazz -> { + // Remove the @Generated no-arg constructor and replace with one that sets enabled = true + clazz.getConstructors() + .stream() + .filter(c -> c.getParameters().isEmpty()) + .findFirst() + .ifPresent(constructor -> { + // Remove @Generated annotation to prevent overwriting + constructor.getAnnotationByName("Generated").ifPresent(com.github.javaparser.ast.Node::remove); + // Set the constructor body to initialize enabled = true + constructor.setBody(parseBlock("{ this.enabled = true; }")); + // Add JavaDoc + constructor.setJavadocComment( + new Javadoc(parseText( + "Creates an instance of EnhancedModeOptions class with enhanced mode automatically enabled."))); + }); + + // Remove isEnabled() getter to hide enabled from public API + clazz.getMethodsByName("isEnabled").forEach(method -> method.remove()); + + // Remove ALL setEnabled() methods to hide enabled from public API + clazz.getMethodsByName("setEnabled").forEach(method -> method.remove()); + }); + }); + } + + /** + * Customize AudioFileDetails.getFilename() to auto-generate a filename from contentType if not explicitly set. + * This allows developers to omit setFilename() and have the SDK automatically provide a sensible default. 
+ * + * @param packageCustomization the package customization + */ + private void customizeAudioFileDetailsGetFilename(PackageCustomization packageCustomization) { + packageCustomization.getClass("AudioFileDetails").customizeAst(ast -> { + ast.getClassByName("AudioFileDetails").ifPresent(clazz -> { + clazz.getMethodsByName("getFilename").forEach(method -> { + method.setBody(parseBlock( + "{ if (this.filename != null && !this.filename.isEmpty()) { return this.filename; } " + + "if (\"audio/wav\".equalsIgnoreCase(this.contentType)) { return \"audio.wav\"; } " + + "if (\"audio/mpeg\".equalsIgnoreCase(this.contentType) || \"audio/mp3\".equalsIgnoreCase(this.contentType)) { return \"audio.mp3\"; } " + + "if (\"audio/ogg\".equalsIgnoreCase(this.contentType)) { return \"audio.ogg\"; } " + + "if (\"audio/flac\".equalsIgnoreCase(this.contentType)) { return \"audio.flac\"; } " + + "if (\"audio/webm\".equalsIgnoreCase(this.contentType)) { return \"audio.webm\"; } " + + "if (\"audio/opus\".equalsIgnoreCase(this.contentType)) { return \"audio.opus\"; } " + + "return \"audio\"; }")); + method.setJavadocComment( + new Javadoc(parseText("Get the filename property: The filename of the file. " + + "If not explicitly set, a filename will be auto-generated from the contentType.")) + .addBlockTag("return", "the filename value, or an auto-generated filename if not set.")); + }); + }); + }); + } + + /** + * Customize TranscriptionOptions to: + * 1. Add AudioFileDetails field (final) + * 2. Remove default no-arg constructor + * 3. Add constructor with String audioUrl parameter + * 4. Add constructor with AudioFileDetails parameter + * 5. Add getFileDetails() method to access AudioFileDetails + * 6. Make setAudioUrl() private instead of public + * 7. Fix fromJson to use one of the parameterized constructors + * + * @param packageCustomization the package customization + */ + private void customizeTranscriptionOptions(PackageCustomization packageCustomization) { + packageCustomization.getClass("TranscriptionOptions").customizeAst(ast -> { + ast.getClassByName("TranscriptionOptions").ifPresent(clazz -> { + // Add the AudioFileDetails field as final + clazz.addFieldWithInitializer("AudioFileDetails", "audioFileDetails", null, + com.github.javaparser.ast.Modifier.Keyword.PRIVATE, + com.github.javaparser.ast.Modifier.Keyword.FINAL); + + // Remove default no-arg constructor + clazz.getConstructors() + .stream() + .filter(c -> c.getParameters().isEmpty()) + .findFirst() + .ifPresent(com.github.javaparser.ast.Node::remove); + + // Fix fromJson method to use parameterized constructor instead of no-arg + clazz.getMethodsByName("fromJson").forEach(method -> { + // Replace the entire method body to use the String constructor + method.setBody(parseBlock("{ return jsonReader.readObject(reader -> { " + + "TranscriptionOptions deserializedTranscriptionOptions = new TranscriptionOptions((String) null); " + + "while (reader.nextToken() != JsonToken.END_OBJECT) { " + + "String fieldName = reader.getFieldName(); " + "reader.nextToken(); " + + "if (\"audioUrl\".equals(fieldName)) { " + + "deserializedTranscriptionOptions.audioUrl = reader.getString(); " + + "} else if (\"locales\".equals(fieldName)) { " + + "List locales = reader.readArray(reader1 -> reader1.getString()); " + + "deserializedTranscriptionOptions.locales = locales; " + + "} else if (\"localeModelMapping\".equals(fieldName)) { " + + "Map localeModelMapping = reader.readMap(reader1 -> reader1.getString()); " + + "deserializedTranscriptionOptions.localeModelMapping = 
localeModelMapping; " + + "} else if (\"profanityFilterMode\".equals(fieldName)) { " + + "deserializedTranscriptionOptions.profanityFilterMode = ProfanityFilterMode.fromString(reader.getString()); " + + "} else if (\"diarization\".equals(fieldName)) { " + + "deserializedTranscriptionOptions.diarizationOptions = TranscriptionDiarizationOptions.fromJson(reader); " + + "} else if (\"channels\".equals(fieldName)) { " + + "List activeChannels = reader.readArray(reader1 -> reader1.getInt()); " + + "deserializedTranscriptionOptions.activeChannels = activeChannels; " + + "} else if (\"enhancedMode\".equals(fieldName)) { " + + "deserializedTranscriptionOptions.enhancedModeOptions = EnhancedModeOptions.fromJson(reader); " + + "} else if (\"phraseList\".equals(fieldName)) { " + + "deserializedTranscriptionOptions.phraseListOptions = PhraseListOptions.fromJson(reader); " + + "} else { " + "reader.skipChildren(); " + "} " + "} " + + "return deserializedTranscriptionOptions; " + "}); }")); + }); + + // Add constructor with String audioUrl parameter + ConstructorDeclaration audioUrlConstructor = clazz.addConstructor(Modifier.Keyword.PUBLIC); + audioUrlConstructor.addParameter("String", "audioUrl"); + audioUrlConstructor.setBody(parseBlock("{ this.audioUrl = audioUrl; this.audioFileDetails = null; }")); + audioUrlConstructor.setJavadocComment( + new Javadoc(parseText("Creates an instance of TranscriptionOptions class with audio URL.")) + .addBlockTag("param", "audioUrl the URL of the audio to be transcribed")); + + // Add constructor with AudioFileDetails parameter + ConstructorDeclaration fileDetailsConstructor = clazz.addConstructor(Modifier.Keyword.PUBLIC); + fileDetailsConstructor.addParameter("AudioFileDetails", "fileDetails"); + fileDetailsConstructor.setBody(parseBlock("{ this.audioFileDetails = fileDetails; }")); + fileDetailsConstructor.setJavadocComment( + new Javadoc(parseText("Creates an instance of TranscriptionOptions class with audio file details.")) + .addBlockTag("param", "fileDetails the audio file details")); + + // Add getFileDetails() method + com.github.javaparser.ast.body.MethodDeclaration getFileDetailsMethod + = clazz.addMethod("getFileDetails", Modifier.Keyword.PUBLIC); + getFileDetailsMethod.setType("AudioFileDetails"); + getFileDetailsMethod.setBody(parseBlock("{ return this.audioFileDetails; }")); + getFileDetailsMethod.setJavadocComment(new Javadoc( + parseText("Get the audioFileDetails property: The audio file details for transcription.")) + .addBlockTag("return", "the audioFileDetails value.")); + + // Make setAudioUrl() private + clazz.getMethodsByName("setAudioUrl").forEach(method -> { + method.getModifiers().clear(); + method.addModifier(Modifier.Keyword.PRIVATE); + }); + }); + }); + } + + /** + * Customize TranscriptionClient to add public transcribe(TranscriptionOptions) method + * that hides TranscriptionContent construction and add transcribeWithResponse method. 
+ * + * @param packageCustomization the package customization + */ + private void customizeTranscriptionClient(PackageCustomization packageCustomization) { + ClassCustomization classCustomization = packageCustomization.getClass("TranscriptionClient"); + classCustomization.customizeAst(ast -> { + ast.getClassByName("TranscriptionClient").ifPresent(clazz -> { + // Make the generated transcribe(TranscriptionContent) package-private (internal) + // Only modify methods that have @Generated annotation to avoid affecting manual customizations + clazz.getMethodsByName("transcribe").forEach(method -> { + if (method.getParameters().size() == 1 + && "TranscriptionContent".equals(method.getParameter(0).getType().asString()) + && method.getAnnotationByName("Generated").isPresent()) { + // Remove all modifiers (including final), making it package-private + method.getModifiers().clear(); + // Remove @Generated annotation to prevent overwriting + method.getAnnotationByName("Generated").ifPresent(com.github.javaparser.ast.Node::remove); + } + }); + + // Add public transcribe method that returns TranscriptionResult + com.github.javaparser.ast.body.MethodDeclaration transcribeMethod + = clazz.addMethod("transcribe", Modifier.Keyword.PUBLIC) + .addParameter("TranscriptionOptions", "options") + .setType("TranscriptionResult"); + transcribeMethod.setJavadocComment("/**\n" + + " * Transcribes the provided audio stream with the specified options.\n" + " *\n" + + " * @param options the transcription options including audio file details or audio URL\n" + + " * @throws IllegalArgumentException thrown if parameters fail the validation.\n" + + " * @throws HttpResponseException thrown if the request is rejected by server.\n" + + " * @throws ClientAuthenticationException thrown if the request is rejected by server on status code 401.\n" + + " * @throws ResourceNotFoundException thrown if the request is rejected by server on status code 404.\n" + + " * @throws ResourceModifiedException thrown if the request is rejected by server on status code 409.\n" + + " * @throws RuntimeException all other wrapped checked exceptions if the request fails to be sent.\n" + + " * @return the result of the transcribe operation.\n" + " */"); + com.github.javaparser.ast.expr.NormalAnnotationExpr transcribeServiceMethodAnnotation + = new com.github.javaparser.ast.expr.NormalAnnotationExpr(); + transcribeServiceMethodAnnotation.setName("ServiceMethod"); + transcribeServiceMethodAnnotation.addPair("returns", "ReturnType.SINGLE"); + transcribeMethod.addAnnotation(transcribeServiceMethodAnnotation); + transcribeMethod + .setBody(parseBlock("{ TranscriptionContent requestContent = new TranscriptionContent(options); " + + "if (options.getFileDetails() != null) { requestContent.setAudio(options.getFileDetails()); } " + + "return transcribe(requestContent); }")); + + // Add public transcribeWithResponse method that returns Response + com.github.javaparser.ast.body.MethodDeclaration transcribeWithResponseMethod + = clazz.addMethod("transcribeWithResponse", Modifier.Keyword.PUBLIC) + .addParameter("TranscriptionOptions", "options") + .setType("Response"); + transcribeWithResponseMethod.setJavadocComment("/**\n" + + " * Transcribes the provided audio stream with the specified options.\n" + " *\n" + + " * @param options the transcription options including audio file details or audio URL\n" + + " * @throws IllegalArgumentException thrown if parameters fail the validation.\n" + + " * @throws HttpResponseException thrown if the request is rejected by 
server.\n" + + " * @throws ClientAuthenticationException thrown if the request is rejected by server on status code 401.\n" + + " * @throws ResourceNotFoundException thrown if the request is rejected by server on status code 404.\n" + + " * @throws ResourceModifiedException thrown if the request is rejected by server on status code 409.\n" + + " * @throws RuntimeException all other wrapped checked exceptions if the request fails to be sent.\n" + + " * @return the response containing the result of the transcribe operation.\n" + " */"); + com.github.javaparser.ast.expr.NormalAnnotationExpr serviceMethodAnnotation + = new com.github.javaparser.ast.expr.NormalAnnotationExpr(); + serviceMethodAnnotation.setName("ServiceMethod"); + serviceMethodAnnotation.addPair("returns", "ReturnType.SINGLE"); + transcribeWithResponseMethod.addAnnotation(serviceMethodAnnotation); + transcribeWithResponseMethod + .setBody(parseBlock("{ TranscriptionContent requestContent = new TranscriptionContent(options); " + + "if (options.getFileDetails() != null) { requestContent.setAudio(options.getFileDetails()); } " + + "RequestOptions requestOptions = new RequestOptions(); " + + "Response response = transcribeWithResponse(" + + "new MultipartFormDataHelper(requestOptions).serializeJsonField(\"definition\", requestContent.getOptions())" + + ".serializeFileField(\"audio\", requestContent.getAudio() == null ? null : requestContent.getAudio().getContent(), " + + "requestContent.getAudio() == null ? null : requestContent.getAudio().getContentType(), " + + "requestContent.getAudio() == null ? null : requestContent.getAudio().getFilename())" + + ".end().getRequestBody(), requestOptions); " + + "return new SimpleResponse<>(response, response.getValue().toObject(TranscriptionResult.class)); }")); + }); + }); + } + + /** + * Customize TranscriptionAsyncClient to make transcribe(TranscriptionContent) package-private (internal) + * and add transcribeWithResponse method. 
+ * + * @param packageCustomization the package customization + */ + private void customizeTranscriptionAsyncClient(PackageCustomization packageCustomization) { + ClassCustomization classCustomization = packageCustomization.getClass("TranscriptionAsyncClient"); + classCustomization.customizeAst(ast -> { + ast.getClassByName("TranscriptionAsyncClient").ifPresent(clazz -> { + // Make the generated transcribe(TranscriptionContent) package-private (internal) + // Only modify methods that have @Generated annotation to avoid affecting manual customizations + clazz.getMethodsByName("transcribe").forEach(method -> { + if (method.getParameters().size() == 1 + && "TranscriptionContent".equals(method.getParameter(0).getType().asString()) + && method.getAnnotationByName("Generated").isPresent()) { + // Remove all modifiers (including final), making it package-private + method.getModifiers().clear(); + // Remove @Generated annotation to prevent overwriting + method.getAnnotationByName("Generated").ifPresent(com.github.javaparser.ast.Node::remove); + } + }); + + // Add public transcribe method that returns Mono + com.github.javaparser.ast.body.MethodDeclaration transcribeMethod + = clazz.addMethod("transcribe", Modifier.Keyword.PUBLIC) + .addParameter("TranscriptionOptions", "options") + .setType("Mono"); + transcribeMethod.setJavadocComment("/**\n" + + " * Transcribes the provided audio stream with the specified options.\n" + " *\n" + + " * @param options the transcription options including audio file details or audio URL\n" + + " * @throws IllegalArgumentException thrown if parameters fail the validation.\n" + + " * @throws HttpResponseException thrown if the request is rejected by server.\n" + + " * @throws ClientAuthenticationException thrown if the request is rejected by server on status code 401.\n" + + " * @throws ResourceNotFoundException thrown if the request is rejected by server on status code 404.\n" + + " * @throws ResourceModifiedException thrown if the request is rejected by server on status code 409.\n" + + " * @throws RuntimeException all other wrapped checked exceptions if the request fails to be sent.\n" + + " * @return the result of the transcribe operation on successful completion of {@link Mono}.\n" + + " */"); + com.github.javaparser.ast.expr.NormalAnnotationExpr transcribeServiceMethodAnnotation + = new com.github.javaparser.ast.expr.NormalAnnotationExpr(); + transcribeServiceMethodAnnotation.setName("ServiceMethod"); + transcribeServiceMethodAnnotation.addPair("returns", "ReturnType.SINGLE"); + transcribeMethod.addAnnotation(transcribeServiceMethodAnnotation); + transcribeMethod + .setBody(parseBlock("{ TranscriptionContent requestContent = new TranscriptionContent(options); " + + "if (options.getFileDetails() != null) { requestContent.setAudio(options.getFileDetails()); } " + + "return transcribe(requestContent); }")); + + // Add public transcribeWithResponse method that returns Mono> + com.github.javaparser.ast.body.MethodDeclaration transcribeWithResponseMethod + = clazz.addMethod("transcribeWithResponse", Modifier.Keyword.PUBLIC) + .addParameter("TranscriptionOptions", "options") + .setType("Mono>"); + transcribeWithResponseMethod.setJavadocComment("/**\n" + + " * Transcribes the provided audio stream with the specified options.\n" + " *\n" + + " * @param options the transcription options including audio file details or audio URL\n" + + " * @throws IllegalArgumentException thrown if parameters fail the validation.\n" + + " * @throws HttpResponseException thrown if the 
request is rejected by server.\n" + + " * @throws ClientAuthenticationException thrown if the request is rejected by server on status code 401.\n" + + " * @throws ResourceNotFoundException thrown if the request is rejected by server on status code 404.\n" + + " * @throws ResourceModifiedException thrown if the request is rejected by server on status code 409.\n" + + " * @throws RuntimeException all other wrapped checked exceptions if the request fails to be sent.\n" + + " * @return the response containing the result of the transcribe operation on successful completion of {@link Mono}.\n" + + " */"); + com.github.javaparser.ast.expr.NormalAnnotationExpr serviceMethodAnnotation + = new com.github.javaparser.ast.expr.NormalAnnotationExpr(); + serviceMethodAnnotation.setName("ServiceMethod"); + serviceMethodAnnotation.addPair("returns", "ReturnType.SINGLE"); + transcribeWithResponseMethod.addAnnotation(serviceMethodAnnotation); + transcribeWithResponseMethod + .setBody(parseBlock("{ TranscriptionContent requestContent = new TranscriptionContent(options); " + + "if (options.getFileDetails() != null) { requestContent.setAudio(options.getFileDetails()); } " + + "RequestOptions requestOptions = new RequestOptions(); " + "return transcribeWithResponse(" + + "new MultipartFormDataHelper(requestOptions).serializeJsonField(\"definition\", requestContent.getOptions())" + + ".serializeFileField(\"audio\", requestContent.getAudio() == null ? null : requestContent.getAudio().getContent(), " + + "requestContent.getAudio() == null ? null : requestContent.getAudio().getContentType(), " + + "requestContent.getAudio() == null ? null : requestContent.getAudio().getFilename())" + + ".end().getRequestBody(), requestOptions)" + + ".map(response -> new SimpleResponse<>(response, response.getValue().toObject(TranscriptionResult.class))); }")); + }); + }); + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/pom.xml b/sdk/transcription/azure-ai-speech-transcription/pom.xml new file mode 100644 index 000000000000..53fad6f4ded0 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/pom.xml @@ -0,0 +1,71 @@ + + + 4.0.0 + + com.azure + azure-client-sdk-parent + 1.7.0 + ../../parents/azure-client-sdk-parent + + + com.azure + azure-ai-speech-transcription + 1.0.0-beta.1 + jar + + Microsoft Azure SDK for Transcription + This package contains Microsoft Azure Transcription client library. + https://github.com/Azure/azure-sdk-for-java + + + + The MIT License (MIT) + http://opensource.org/licenses/MIT + repo + + + + + https://github.com/Azure/azure-sdk-for-java + scm:git:git@github.com:Azure/azure-sdk-for-java.git + scm:git:git@github.com:Azure/azure-sdk-for-java.git + HEAD + + + + microsoft + Microsoft + + + + UTF-8 + + + + com.azure + azure-core + 1.57.0 + + + com.azure + azure-core-http-netty + 1.16.2 + + + com.azure + azure-core-test + 1.27.0-beta.13 + test + + + com.azure + azure-identity + 1.18.1 + test + + + diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/TranscriptionAsyncClient.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/TranscriptionAsyncClient.java new file mode 100644 index 000000000000..69294063fad5 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/TranscriptionAsyncClient.java @@ -0,0 +1,178 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+// Code generated by Microsoft (R) TypeSpec Code Generator. +package com.azure.ai.speech.transcription; + +import com.azure.ai.speech.transcription.implementation.MultipartFormDataHelper; +import com.azure.ai.speech.transcription.implementation.TranscriptionClientImpl; +import com.azure.ai.speech.transcription.models.TranscriptionContent; +import com.azure.ai.speech.transcription.models.TranscriptionOptions; +import com.azure.ai.speech.transcription.models.TranscriptionResult; +import com.azure.core.annotation.Generated; +import com.azure.core.annotation.ReturnType; +import com.azure.core.annotation.ServiceClient; +import com.azure.core.annotation.ServiceMethod; +import com.azure.core.exception.ClientAuthenticationException; +import com.azure.core.exception.HttpResponseException; +import com.azure.core.exception.ResourceModifiedException; +import com.azure.core.exception.ResourceNotFoundException; +import com.azure.core.http.rest.RequestOptions; +import com.azure.core.http.rest.Response; +import com.azure.core.http.rest.SimpleResponse; +import com.azure.core.util.BinaryData; +import com.azure.core.util.FluxUtil; +import com.azure.core.util.logging.ClientLogger; +import reactor.core.publisher.Mono; + +/** + * Initializes a new instance of the asynchronous TranscriptionClient type. + */ +@ServiceClient(builder = TranscriptionClientBuilder.class, isAsync = true) +public final class TranscriptionAsyncClient { + + private static final ClientLogger LOGGER = new ClientLogger(TranscriptionAsyncClient.class); + + @Generated + private final TranscriptionClientImpl serviceClient; + + /** + * Initializes an instance of TranscriptionAsyncClient class. + * + * @param serviceClient the service client implementation. + */ + @Generated + TranscriptionAsyncClient(TranscriptionClientImpl serviceClient) { + this.serviceClient = serviceClient; + } + + /** + * Transcribes the provided audio stream. + *
<p><strong>Response Body Schema</strong></p>
+     *
+     * <pre>
+     * {@code
+     * {
+     *     durationMilliseconds: int (Required)
+     *     combinedPhrases (Required): [
+     *          (Required){
+     *             channel: Integer (Optional)
+     *             text: String (Required)
+     *         }
+     *     ]
+     *     phrases (Required): [
+     *          (Required){
+     *             channel: Integer (Optional)
+     *             speaker: Integer (Optional)
+     *             offsetMilliseconds: int (Required)
+     *             durationMilliseconds: int (Required)
+     *             text: String (Required)
+     *             words (Optional): [
+     *                  (Optional){
+     *                     text: String (Required)
+     *                     offsetMilliseconds: int (Required)
+     *                     durationMilliseconds: int (Required)
+     *                 }
+     *             ]
+     *             locale: String (Optional)
+     *             confidence: double (Required)
+     *         }
+     *     ]
+     * }
+     * }
+     * </pre>
+     *
+     * @param body The body of the multipart request.
+     * @param requestOptions The options to configure the HTTP request before HTTP client sends it.
+     * @throws HttpResponseException thrown if the request is rejected by server.
+     * @throws ClientAuthenticationException thrown if the request is rejected by server on status code 401.
+     * @throws ResourceNotFoundException thrown if the request is rejected by server on status code 404.
+     * @throws ResourceModifiedException thrown if the request is rejected by server on status code 409.
+     * @return the result of the transcribe operation along with {@link Response} on successful completion of
+     * {@link Mono}.
+     */
+    @Generated
+    @ServiceMethod(returns = ReturnType.SINGLE)
+    Mono<Response<BinaryData>> transcribeWithResponse(BinaryData body, RequestOptions requestOptions) {
+        // Operation 'transcribe' is of content-type 'multipart/form-data'. Protocol API is not usable and hence not
+        // generated.
+        return this.serviceClient.transcribeWithResponseAsync(body, requestOptions);
+    }
+
+    /**
+     * Transcribes the provided audio stream.
+     *
+     * @param body The body of the multipart request.
+     * @throws IllegalArgumentException thrown if parameters fail the validation.
+     * @throws HttpResponseException thrown if the request is rejected by server.
+     * @throws ClientAuthenticationException thrown if the request is rejected by server on status code 401.
+     * @throws ResourceNotFoundException thrown if the request is rejected by server on status code 404.
+     * @throws ResourceModifiedException thrown if the request is rejected by server on status code 409.
+     * @throws RuntimeException all other wrapped checked exceptions if the request fails to be sent.
+     * @return the result of the transcribe operation on successful completion of {@link Mono}.
+     */
+    @ServiceMethod(returns = ReturnType.SINGLE)
+    Mono<TranscriptionResult> transcribe(TranscriptionContent body) {
+        // Generated convenience method for transcribeWithResponse
+        RequestOptions requestOptions = new RequestOptions();
+        return transcribeWithResponse(
+            new MultipartFormDataHelper(requestOptions).serializeJsonField("definition", body.getOptions())
+                .serializeFileField("audio", body.getAudio() == null ? null : body.getAudio().getContent(),
+                    body.getAudio() == null ? null : body.getAudio().getContentType(),
+                    body.getAudio() == null ? null : body.getAudio().getFilename())
+                .end()
+                .getRequestBody(),
+            requestOptions).flatMap(FluxUtil::toMono)
+                .map(protocolMethodData -> protocolMethodData.toObject(TranscriptionResult.class));
+    }
+
+    /**
+     * Transcribes the provided audio stream with the specified options.
+     *
+     * @param options the transcription options including audio file details or audio URL
+     * @throws IllegalArgumentException thrown if parameters fail the validation.
+     * @throws HttpResponseException thrown if the request is rejected by server.
+     * @throws ClientAuthenticationException thrown if the request is rejected by server on status code 401.
+     * @throws ResourceNotFoundException thrown if the request is rejected by server on status code 404.
+     * @throws ResourceModifiedException thrown if the request is rejected by server on status code 409.
+     * @throws RuntimeException all other wrapped checked exceptions if the request fails to be sent.
+     * @return the result of the transcribe operation on successful completion of {@link Mono}.
+     */
+    @ServiceMethod(returns = ReturnType.SINGLE)
+    public Mono<TranscriptionResult> transcribe(TranscriptionOptions options) {
+        TranscriptionContent requestContent = new TranscriptionContent(options);
+        if (options.getFileDetails() != null) {
+            requestContent.setAudio(options.getFileDetails());
+        }
+        return transcribe(requestContent);
+    }
+
+    /**
+     * Transcribes the provided audio stream with the specified options.
+     *
+     * @param options the transcription options including audio file details or audio URL
+     * @throws IllegalArgumentException thrown if parameters fail the validation.
+     * @throws HttpResponseException thrown if the request is rejected by server.
+     * @throws ClientAuthenticationException thrown if the request is rejected by server on status code 401.
+     * @throws ResourceNotFoundException thrown if the request is rejected by server on status code 404.
+     * @throws ResourceModifiedException thrown if the request is rejected by server on status code 409.
+     * @throws RuntimeException all other wrapped checked exceptions if the request fails to be sent.
+     * @return the response containing the result of the transcribe operation on successful completion of {@link Mono}.
+     */
+    public Mono<Response<TranscriptionResult>> transcribeWithResponse(TranscriptionOptions options) {
+        TranscriptionContent requestContent = new TranscriptionContent(options);
+        if (options.getFileDetails() != null) {
+            requestContent.setAudio(options.getFileDetails());
+        }
+        RequestOptions requestOptions = new RequestOptions();
+        return transcribeWithResponse(
+            new MultipartFormDataHelper(requestOptions).serializeJsonField("definition", requestContent.getOptions())
+                .serializeFileField("audio",
+                    requestContent.getAudio() == null ? null : requestContent.getAudio().getContent(),
+                    requestContent.getAudio() == null ? null : requestContent.getAudio().getContentType(),
+                    requestContent.getAudio() == null ? null : requestContent.getAudio().getFilename())
+                .end()
+                .getRequestBody(),
+            requestOptions).map(
+                response -> new SimpleResponse<>(response, response.getValue().toObject(TranscriptionResult.class)));
+    }
+}
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/TranscriptionClient.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/TranscriptionClient.java
new file mode 100644
index 000000000000..42a6fa24db7c
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/TranscriptionClient.java
@@ -0,0 +1,174 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Code generated by Microsoft (R) TypeSpec Code Generator.
+package com.azure.ai.speech.transcription; + +import com.azure.ai.speech.transcription.implementation.MultipartFormDataHelper; +import com.azure.ai.speech.transcription.implementation.TranscriptionClientImpl; +import com.azure.ai.speech.transcription.models.TranscriptionContent; +import com.azure.ai.speech.transcription.models.TranscriptionOptions; +import com.azure.ai.speech.transcription.models.TranscriptionResult; +import com.azure.core.annotation.Generated; +import com.azure.core.annotation.ReturnType; +import com.azure.core.annotation.ServiceClient; +import com.azure.core.annotation.ServiceMethod; +import com.azure.core.exception.ClientAuthenticationException; +import com.azure.core.exception.HttpResponseException; +import com.azure.core.exception.ResourceModifiedException; +import com.azure.core.exception.ResourceNotFoundException; +import com.azure.core.http.rest.RequestOptions; +import com.azure.core.http.rest.Response; +import com.azure.core.http.rest.SimpleResponse; +import com.azure.core.util.BinaryData; +import com.azure.core.util.logging.ClientLogger; + +/** + * Initializes a new instance of the synchronous TranscriptionClient type. + */ +@ServiceClient(builder = TranscriptionClientBuilder.class) +public final class TranscriptionClient { + + private static final ClientLogger LOGGER = new ClientLogger(TranscriptionClient.class); + + @Generated + private final TranscriptionClientImpl serviceClient; + + /** + * Initializes an instance of TranscriptionClient class. + * + * @param serviceClient the service client implementation. + */ + @Generated + TranscriptionClient(TranscriptionClientImpl serviceClient) { + this.serviceClient = serviceClient; + } + + /** + * Transcribes the provided audio stream. + *
<p><strong>Response Body Schema</strong></p>
+     *
+     * <pre>
+     * {@code
+     * {
+     *     durationMilliseconds: int (Required)
+     *     combinedPhrases (Required): [
+     *          (Required){
+     *             channel: Integer (Optional)
+     *             text: String (Required)
+     *         }
+     *     ]
+     *     phrases (Required): [
+     *          (Required){
+     *             channel: Integer (Optional)
+     *             speaker: Integer (Optional)
+     *             offsetMilliseconds: int (Required)
+     *             durationMilliseconds: int (Required)
+     *             text: String (Required)
+     *             words (Optional): [
+     *                  (Optional){
+     *                     text: String (Required)
+     *                     offsetMilliseconds: int (Required)
+     *                     durationMilliseconds: int (Required)
+     *                 }
+     *             ]
+     *             locale: String (Optional)
+     *             confidence: double (Required)
+     *         }
+     *     ]
+     * }
+     * }
+     * </pre>
+     *
+     * @param body The body of the multipart request.
+     * @param requestOptions The options to configure the HTTP request before HTTP client sends it.
+     * @throws HttpResponseException thrown if the request is rejected by server.
+     * @throws ClientAuthenticationException thrown if the request is rejected by server on status code 401.
+     * @throws ResourceNotFoundException thrown if the request is rejected by server on status code 404.
+     * @throws ResourceModifiedException thrown if the request is rejected by server on status code 409.
+     * @return the result of the transcribe operation along with {@link Response}.
+     */
+    @Generated
+    @ServiceMethod(returns = ReturnType.SINGLE)
+    Response<BinaryData> transcribeWithResponse(BinaryData body, RequestOptions requestOptions) {
+        // Operation 'transcribe' is of content-type 'multipart/form-data'. Protocol API is not usable and hence not
+        // generated.
+        return this.serviceClient.transcribeWithResponse(body, requestOptions);
+    }
+
+    /**
+     * Transcribes the provided audio stream.
+     *
+     * @param body The body of the multipart request.
+     * @throws IllegalArgumentException thrown if parameters fail the validation.
+     * @throws HttpResponseException thrown if the request is rejected by server.
+     * @throws ClientAuthenticationException thrown if the request is rejected by server on status code 401.
+     * @throws ResourceNotFoundException thrown if the request is rejected by server on status code 404.
+     * @throws ResourceModifiedException thrown if the request is rejected by server on status code 409.
+     * @throws RuntimeException all other wrapped checked exceptions if the request fails to be sent.
+     * @return the result of the transcribe operation.
+     */
+    @ServiceMethod(returns = ReturnType.SINGLE)
+    TranscriptionResult transcribe(TranscriptionContent body) {
+        // Generated convenience method for transcribeWithResponse
+        RequestOptions requestOptions = new RequestOptions();
+        return transcribeWithResponse(
+            new MultipartFormDataHelper(requestOptions).serializeJsonField("definition", body.getOptions())
+                .serializeFileField("audio", body.getAudio() == null ? null : body.getAudio().getContent(),
+                    body.getAudio() == null ? null : body.getAudio().getContentType(),
+                    body.getAudio() == null ? null : body.getAudio().getFilename())
+                .end()
+                .getRequestBody(),
+            requestOptions).getValue().toObject(TranscriptionResult.class);
+    }
+
+    /**
+     * Transcribes the provided audio stream with the specified options.
+     *
+     * @param options the transcription options including audio file details or audio URL
+     * @throws IllegalArgumentException thrown if parameters fail the validation.
+     * @throws HttpResponseException thrown if the request is rejected by server.
+     * @throws ClientAuthenticationException thrown if the request is rejected by server on status code 401.
+     * @throws ResourceNotFoundException thrown if the request is rejected by server on status code 404.
+     * @throws ResourceModifiedException thrown if the request is rejected by server on status code 409.
+     * @throws RuntimeException all other wrapped checked exceptions if the request fails to be sent.
+     * @return the result of the transcribe operation.
+     */
+    @ServiceMethod(returns = ReturnType.SINGLE)
+    public TranscriptionResult transcribe(TranscriptionOptions options) {
+        TranscriptionContent requestContent = new TranscriptionContent(options);
+        if (options.getFileDetails() != null) {
+            requestContent.setAudio(options.getFileDetails());
+        }
+        return transcribe(requestContent);
+    }
+
+    /**
+     * Transcribes the provided audio stream with the specified options.
+     *
+     * @param options the transcription options including audio file details or audio URL
+     * @throws IllegalArgumentException thrown if parameters fail the validation.
+     * @throws HttpResponseException thrown if the request is rejected by server.
+     * @throws ClientAuthenticationException thrown if the request is rejected by server on status code 401.
+     * @throws ResourceNotFoundException thrown if the request is rejected by server on status code 404.
+     * @throws ResourceModifiedException thrown if the request is rejected by server on status code 409.
+     * @throws RuntimeException all other wrapped checked exceptions if the request fails to be sent.
+     * @return the response containing the result of the transcribe operation.
+     */
+    public Response<TranscriptionResult> transcribeWithResponse(TranscriptionOptions options) {
+        TranscriptionContent requestContent = new TranscriptionContent(options);
+        if (options.getFileDetails() != null) {
+            requestContent.setAudio(options.getFileDetails());
+        }
+        RequestOptions requestOptions = new RequestOptions();
+        Response<BinaryData> response = transcribeWithResponse(
+            new MultipartFormDataHelper(requestOptions).serializeJsonField("definition", requestContent.getOptions())
+                .serializeFileField("audio",
+                    requestContent.getAudio() == null ? null : requestContent.getAudio().getContent(),
+                    requestContent.getAudio() == null ? null : requestContent.getAudio().getContentType(),
+                    requestContent.getAudio() == null ? null : requestContent.getAudio().getFilename())
+                .end()
+                .getRequestBody(),
+            requestOptions);
+        return new SimpleResponse<>(response, response.getValue().toObject(TranscriptionResult.class));
+    }
+}
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/TranscriptionClientBuilder.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/TranscriptionClientBuilder.java
new file mode 100644
index 000000000000..649d0f6ca0d8
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/TranscriptionClientBuilder.java
@@ -0,0 +1,356 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Code generated by Microsoft (R) TypeSpec Code Generator.
+
+package com.azure.ai.speech.transcription;
+
+import com.azure.ai.speech.transcription.implementation.TranscriptionClientImpl;
+import com.azure.core.annotation.Generated;
+import com.azure.core.annotation.ServiceClientBuilder;
+import com.azure.core.client.traits.ConfigurationTrait;
+import com.azure.core.client.traits.EndpointTrait;
+import com.azure.core.client.traits.HttpTrait;
+import com.azure.core.client.traits.KeyCredentialTrait;
+import com.azure.core.client.traits.TokenCredentialTrait;
+import com.azure.core.credential.KeyCredential;
+import com.azure.core.credential.TokenCredential;
+import com.azure.core.http.HttpClient;
+import com.azure.core.http.HttpHeaders;
+import com.azure.core.http.HttpPipeline;
+import com.azure.core.http.HttpPipelineBuilder;
+import com.azure.core.http.HttpPipelinePosition;
+import com.azure.core.http.policy.AddDatePolicy;
+import com.azure.core.http.policy.AddHeadersFromContextPolicy;
+import com.azure.core.http.policy.AddHeadersPolicy;
+import com.azure.core.http.policy.BearerTokenAuthenticationPolicy;
+import com.azure.core.http.policy.HttpLogOptions;
+import com.azure.core.http.policy.HttpLoggingPolicy;
+import com.azure.core.http.policy.HttpPipelinePolicy;
+import com.azure.core.http.policy.HttpPolicyProviders;
+import com.azure.core.http.policy.KeyCredentialPolicy;
+import com.azure.core.http.policy.RequestIdPolicy;
+import com.azure.core.http.policy.RetryOptions;
+import com.azure.core.http.policy.RetryPolicy;
+import com.azure.core.http.policy.UserAgentPolicy;
+import com.azure.core.util.ClientOptions;
+import com.azure.core.util.Configuration;
+import com.azure.core.util.CoreUtils;
+import com.azure.core.util.builder.ClientBuilderUtil;
+import com.azure.core.util.logging.ClientLogger;
+import com.azure.core.util.serializer.JacksonAdapter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+/**
+ * A builder for creating a new instance of the TranscriptionClient type.
+ */
+@ServiceClientBuilder(serviceClients = { TranscriptionClient.class, TranscriptionAsyncClient.class })
+public final class TranscriptionClientBuilder
+    implements HttpTrait<TranscriptionClientBuilder>, ConfigurationTrait<TranscriptionClientBuilder>,
+    TokenCredentialTrait<TranscriptionClientBuilder>, KeyCredentialTrait<TranscriptionClientBuilder>,
+    EndpointTrait<TranscriptionClientBuilder> {
+
+    @Generated
+    private static final String SDK_NAME = "name";
+
+    @Generated
+    private static final String SDK_VERSION = "version";
+
+    @Generated
+    private static final Map<String, String> PROPERTIES
+        = CoreUtils.getProperties("azure-ai-speech-transcription.properties");
+
+    @Generated
+    private final List<HttpPipelinePolicy> pipelinePolicies;
+
+    /**
+     * Create an instance of the TranscriptionClientBuilder.
+     */
+    @Generated
+    public TranscriptionClientBuilder() {
+        this.pipelinePolicies = new ArrayList<>();
+    }
+
+    /*
+     * The HTTP client used to send the request.
+     */
+    @Generated
+    private HttpClient httpClient;
+
+    /**
+     * {@inheritDoc}.
+     */
+    @Generated
+    @Override
+    public TranscriptionClientBuilder httpClient(HttpClient httpClient) {
+        this.httpClient = httpClient;
+        return this;
+    }
+
+    /*
+     * The HTTP pipeline to send requests through.
+     */
+    @Generated
+    private HttpPipeline pipeline;
+
+    /**
+     * {@inheritDoc}.
+     */
+    @Generated
+    @Override
+    public TranscriptionClientBuilder pipeline(HttpPipeline pipeline) {
+        if (this.pipeline != null && pipeline == null) {
+            LOGGER.atInfo().log("HttpPipeline is being set to 'null' when it was previously configured.");
+        }
+        this.pipeline = pipeline;
+        return this;
+    }
+
+    /*
+     * The logging configuration for HTTP requests and responses.
+ */ + @Generated + private HttpLogOptions httpLogOptions; + + /** + * {@inheritDoc}. + */ + @Generated + @Override + public TranscriptionClientBuilder httpLogOptions(HttpLogOptions httpLogOptions) { + this.httpLogOptions = httpLogOptions; + return this; + } + + /* + * The client options such as application ID and custom headers to set on a request. + */ + @Generated + private ClientOptions clientOptions; + + /** + * {@inheritDoc}. + */ + @Generated + @Override + public TranscriptionClientBuilder clientOptions(ClientOptions clientOptions) { + this.clientOptions = clientOptions; + return this; + } + + /* + * The retry options to configure retry policy for failed requests. + */ + @Generated + private RetryOptions retryOptions; + + /** + * {@inheritDoc}. + */ + @Generated + @Override + public TranscriptionClientBuilder retryOptions(RetryOptions retryOptions) { + this.retryOptions = retryOptions; + return this; + } + + /** + * {@inheritDoc}. + */ + @Generated + @Override + public TranscriptionClientBuilder addPolicy(HttpPipelinePolicy customPolicy) { + Objects.requireNonNull(customPolicy, "'customPolicy' cannot be null."); + pipelinePolicies.add(customPolicy); + return this; + } + + /* + * The configuration store that is used during construction of the service client. + */ + @Generated + private Configuration configuration; + + /** + * {@inheritDoc}. + */ + @Generated + @Override + public TranscriptionClientBuilder configuration(Configuration configuration) { + this.configuration = configuration; + return this; + } + + /* + * The KeyCredential used for authentication. + */ + @Generated + private KeyCredential keyCredential; + + /** + * {@inheritDoc}. + */ + @Generated + @Override + public TranscriptionClientBuilder credential(KeyCredential keyCredential) { + this.keyCredential = keyCredential; + return this; + } + + /* + * The service endpoint + */ + @Generated + private String endpoint; + + /** + * {@inheritDoc}. + */ + @Generated + @Override + public TranscriptionClientBuilder endpoint(String endpoint) { + this.endpoint = endpoint; + return this; + } + + /* + * Service version + */ + @Generated + private TranscriptionServiceVersion serviceVersion; + + /** + * Sets Service version. + * + * @param serviceVersion the serviceVersion value. + * @return the TranscriptionClientBuilder. + */ + @Generated + public TranscriptionClientBuilder serviceVersion(TranscriptionServiceVersion serviceVersion) { + this.serviceVersion = serviceVersion; + return this; + } + + /* + * The retry policy that will attempt to retry failed requests, if applicable. + */ + @Generated + private RetryPolicy retryPolicy; + + /** + * Sets The retry policy that will attempt to retry failed requests, if applicable. + * + * @param retryPolicy the retryPolicy value. + * @return the TranscriptionClientBuilder. + */ + @Generated + public TranscriptionClientBuilder retryPolicy(RetryPolicy retryPolicy) { + this.retryPolicy = retryPolicy; + return this; + } + + /** + * Builds an instance of TranscriptionClientImpl with the provided parameters. + * + * @return an instance of TranscriptionClientImpl. + */ + @Generated + private TranscriptionClientImpl buildInnerClient() { + this.validateClient(); + HttpPipeline localPipeline = (pipeline != null) ? pipeline : createHttpPipeline(); + TranscriptionServiceVersion localServiceVersion + = (serviceVersion != null) ? 
+            serviceVersion : TranscriptionServiceVersion.getLatest();
+        TranscriptionClientImpl client = new TranscriptionClientImpl(localPipeline,
+            JacksonAdapter.createDefaultSerializerAdapter(), this.endpoint, localServiceVersion);
+        return client;
+    }
+
+    @Generated
+    private void validateClient() {
+        // This method is invoked from 'buildInnerClient'/'buildClient' method.
+        // Developer can customize this method, to validate that the necessary conditions are met for the new client.
+        Objects.requireNonNull(endpoint, "'endpoint' cannot be null.");
+    }
+
+    @Generated
+    private HttpPipeline createHttpPipeline() {
+        Configuration buildConfiguration
+            = (configuration == null) ? Configuration.getGlobalConfiguration() : configuration;
+        HttpLogOptions localHttpLogOptions = this.httpLogOptions == null ? new HttpLogOptions() : this.httpLogOptions;
+        ClientOptions localClientOptions = this.clientOptions == null ? new ClientOptions() : this.clientOptions;
+        List<HttpPipelinePolicy> policies = new ArrayList<>();
+        String clientName = PROPERTIES.getOrDefault(SDK_NAME, "UnknownName");
+        String clientVersion = PROPERTIES.getOrDefault(SDK_VERSION, "UnknownVersion");
+        String applicationId = CoreUtils.getApplicationId(localClientOptions, localHttpLogOptions);
+        policies.add(new UserAgentPolicy(applicationId, clientName, clientVersion, buildConfiguration));
+        policies.add(new RequestIdPolicy());
+        policies.add(new AddHeadersFromContextPolicy());
+        HttpHeaders headers = CoreUtils.createHttpHeadersFromClientOptions(localClientOptions);
+        if (headers != null) {
+            policies.add(new AddHeadersPolicy(headers));
+        }
+        this.pipelinePolicies.stream()
+            .filter(p -> p.getPipelinePosition() == HttpPipelinePosition.PER_CALL)
+            .forEach(p -> policies.add(p));
+        HttpPolicyProviders.addBeforeRetryPolicies(policies);
+        policies.add(ClientBuilderUtil.validateAndGetRetryPolicy(retryPolicy, retryOptions, new RetryPolicy()));
+        policies.add(new AddDatePolicy());
+        if (keyCredential != null) {
+            policies.add(new KeyCredentialPolicy("Ocp-Apim-Subscription-Key", keyCredential));
+        }
+        if (tokenCredential != null) {
+            policies.add(new BearerTokenAuthenticationPolicy(tokenCredential, DEFAULT_SCOPES));
+        }
+        this.pipelinePolicies.stream()
+            .filter(p -> p.getPipelinePosition() == HttpPipelinePosition.PER_RETRY)
+            .forEach(p -> policies.add(p));
+        HttpPolicyProviders.addAfterRetryPolicies(policies);
+        policies.add(new HttpLoggingPolicy(localHttpLogOptions));
+        HttpPipeline httpPipeline = new HttpPipelineBuilder().policies(policies.toArray(new HttpPipelinePolicy[0]))
+            .httpClient(httpClient)
+            .clientOptions(localClientOptions)
+            .build();
+        return httpPipeline;
+    }
+
+    /**
+     * Builds an instance of TranscriptionAsyncClient class.
+     *
+     * @return an instance of TranscriptionAsyncClient.
+     */
+    @Generated
+    public TranscriptionAsyncClient buildAsyncClient() {
+        return new TranscriptionAsyncClient(buildInnerClient());
+    }
+
+    /**
+     * Builds an instance of TranscriptionClient class.
+     *
+     * @return an instance of TranscriptionClient.
+     */
+    @Generated
+    public TranscriptionClient buildClient() {
+        return new TranscriptionClient(buildInnerClient());
+    }
+
+    private static final ClientLogger LOGGER = new ClientLogger(TranscriptionClientBuilder.class);
+
+    @Generated
+    private static final String[] DEFAULT_SCOPES = new String[] { "https://cognitiveservices.azure.com/.default" };
+
+    /*
+     * The TokenCredential used for authentication.
+     */
+    @Generated
+    private TokenCredential tokenCredential;
+
+    /**
+     * {@inheritDoc}.
+ */ + @Generated + @Override + public TranscriptionClientBuilder credential(TokenCredential tokenCredential) { + this.tokenCredential = tokenCredential; + return this; + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/TranscriptionServiceVersion.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/TranscriptionServiceVersion.java new file mode 100644 index 000000000000..07e1561ff2f4 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/TranscriptionServiceVersion.java @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// Code generated by Microsoft (R) TypeSpec Code Generator. + +package com.azure.ai.speech.transcription; + +import com.azure.core.util.ServiceVersion; + +/** + * Service version of TranscriptionClient. + */ +public enum TranscriptionServiceVersion implements ServiceVersion { + /** + * Enum value 2025-10-15. + */ + V2025_10_15("2025-10-15"); + + private final String version; + + TranscriptionServiceVersion(String version) { + this.version = version; + } + + /** + * {@inheritDoc} + */ + @Override + public String getVersion() { + return this.version; + } + + /** + * Gets the latest service version supported by this client library. + * + * @return The latest {@link TranscriptionServiceVersion}. + */ + public static TranscriptionServiceVersion getLatest() { + return V2025_10_15; + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/implementation/MultipartFormDataHelper.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/implementation/MultipartFormDataHelper.java new file mode 100644 index 000000000000..2a2962e1595d --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/implementation/MultipartFormDataHelper.java @@ -0,0 +1,209 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// Code generated by Microsoft (R) TypeSpec Code Generator. + +package com.azure.ai.speech.transcription.implementation; + +import com.azure.core.http.HttpHeaderName; +import com.azure.core.http.rest.RequestOptions; +import com.azure.core.util.BinaryData; +import com.azure.core.util.CoreUtils; +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.io.SequenceInputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.UUID; + +// DO NOT modify this helper class + +public final class MultipartFormDataHelper { + /** + * Line separator for the multipart HTTP request. + */ + private static final String CRLF = "\r\n"; + + private static final String APPLICATION_OCTET_STREAM = "application/octet-stream"; + + /** + * Value to be used as part of the divider for the multipart requests. + */ + private final String boundary; + + /** + * The actual part separator in the request. This is obtained by prepending "--" to the "boundary". + */ + private final String partSeparator; + + /** + * The marker for the ending of a multipart request. This is obtained by post-pending "--" to the "partSeparator". + */ + private final String endMarker; + + /** + * Charset used for encoding the multipart HTTP request. 
+ */ + private final Charset encoderCharset = StandardCharsets.UTF_8; + + private InputStream requestDataStream = new ByteArrayInputStream(new byte[0]); + private long requestLength = 0; + + private RequestOptions requestOptions; + private BinaryData requestBody; + + /** + * Default constructor used in the code. The boundary is a random value. + * + * @param requestOptions the RequestOptions to update + */ + public MultipartFormDataHelper(RequestOptions requestOptions) { + this(requestOptions, UUID.randomUUID().toString().substring(0, 16)); + } + + private MultipartFormDataHelper(RequestOptions requestOptions, String boundary) { + this.requestOptions = requestOptions; + this.boundary = boundary; + this.partSeparator = "--" + boundary; + this.endMarker = this.partSeparator + "--"; + } + + /** + * Gets the multipart HTTP request body. + * + * @return the BinaryData of the multipart HTTP request body + */ + public BinaryData getRequestBody() { + return requestBody; + } + + // text/plain + /** + * Formats a text/plain field for a multipart HTTP request. + * + * @param fieldName the field name + * @param value the value of the text/plain field + * @return the MultipartFormDataHelper instance + */ + public MultipartFormDataHelper serializeTextField(String fieldName, String value) { + if (value != null) { + String serialized = partSeparator + CRLF + "Content-Disposition: form-data; name=\"" + escapeName(fieldName) + + "\"" + CRLF + CRLF + value + CRLF; + byte[] data = serialized.getBytes(encoderCharset); + appendBytes(data); + } + return this; + } + + // application/json + /** + * Formats a application/json field for a multipart HTTP request. + * + * @param fieldName the field name + * @param jsonObject the object of the application/json field + * @return the MultipartFormDataHelper instance + */ + public MultipartFormDataHelper serializeJsonField(String fieldName, Object jsonObject) { + if (jsonObject != null) { + String serialized + = partSeparator + CRLF + "Content-Disposition: form-data; name=\"" + escapeName(fieldName) + "\"" + CRLF + + "Content-Type: application/json" + CRLF + CRLF + BinaryData.fromObject(jsonObject) + CRLF; + byte[] data = serialized.getBytes(encoderCharset); + appendBytes(data); + } + return this; + } + + /** + * Formats a file field for a multipart HTTP request. + * + * @param fieldName the field name + * @param file the BinaryData of the file + * @param contentType the content-type of the file + * @param filename the filename + * @return the MultipartFormDataHelper instance + */ + public MultipartFormDataHelper serializeFileField(String fieldName, BinaryData file, String contentType, + String filename) { + if (file != null) { + if (CoreUtils.isNullOrEmpty(contentType)) { + contentType = APPLICATION_OCTET_STREAM; + } + writeFileField(fieldName, file, contentType, filename); + } + return this; + } + + /** + * Formats a file field (potentially multiple files) for a multipart HTTP request. 
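+     * <p>For illustration, a single-file transcribe body could be assembled with this helper as follows (a sketch;
+     * the part names "definition" and "audio" mirror the generated client, while {@code definitionJson} and the
+     * file path are placeholders):</p>
+     *
+     * <pre>
+     * {@code
+     * RequestOptions requestOptions = new RequestOptions();
+     * BinaryData body = new MultipartFormDataHelper(requestOptions)
+     *     .serializeJsonField("definition", definitionJson)
+     *     .serializeFileField("audio", BinaryData.fromFile(Paths.get("clip.wav")), "audio/wav", "clip.wav")
+     *     .end() // writes the end marker and sets the Content-Type/Content-Length headers
+     *     .getRequestBody();
+     * }
+     * </pre>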
+     *
+     * @param fieldName the field name
+     * @param files the List of BinaryData of the files
+     * @param contentTypes the List of content-type of the files
+     * @param filenames the List of filenames
+     * @return the MultipartFormDataHelper instance
+     */
+    public MultipartFormDataHelper serializeFileFields(String fieldName, List<BinaryData> files,
+        List<String> contentTypes, List<String> filenames) {
+        if (files != null) {
+            for (int i = 0; i < files.size(); ++i) {
+                BinaryData file = files.get(i);
+                String contentType = contentTypes.get(i);
+                if (CoreUtils.isNullOrEmpty(contentType)) {
+                    contentType = APPLICATION_OCTET_STREAM;
+                }
+                String filename = filenames.get(i);
+                writeFileField(fieldName, file, contentType, filename);
+            }
+        }
+        return this;
+    }
+
+    /**
+     * Ends the serialization of the multipart HTTP request.
+     *
+     * @return the MultipartFormDataHelper instance
+     */
+    public MultipartFormDataHelper end() {
+        byte[] data = endMarker.getBytes(encoderCharset);
+        appendBytes(data);
+
+        requestBody = BinaryData.fromStream(requestDataStream, requestLength);
+
+        requestOptions.setHeader(HttpHeaderName.CONTENT_TYPE, "multipart/form-data; boundary=" + this.boundary)
+            .setHeader(HttpHeaderName.CONTENT_LENGTH, String.valueOf(requestLength));
+
+        return this;
+    }
+
+    private void writeFileField(String fieldName, BinaryData file, String contentType, String filename) {
+        String contentDispositionFilename = "";
+        if (!CoreUtils.isNullOrEmpty(filename)) {
+            contentDispositionFilename = "; filename=\"" + escapeName(filename) + "\"";
+        }
+
+        // Multipart preamble
+        String fileFieldPreamble
+            = partSeparator + CRLF + "Content-Disposition: form-data; name=\"" + escapeName(fieldName) + "\""
+                + contentDispositionFilename + CRLF + "Content-Type: " + contentType + CRLF + CRLF;
+        byte[] data = fileFieldPreamble.getBytes(encoderCharset);
+        appendBytes(data);
+
+        // Writing the file into the request as a byte stream
+        requestLength += file.getLength();
+        requestDataStream = new SequenceInputStream(requestDataStream, file.toStream());
+
+        // CRLF
+        data = CRLF.getBytes(encoderCharset);
+        appendBytes(data);
+    }
+
+    private void appendBytes(byte[] bytes) {
+        requestLength += bytes.length;
+        requestDataStream = new SequenceInputStream(requestDataStream, new ByteArrayInputStream(bytes));
+    }
+
+    private static String escapeName(String name) {
+        return name.replace("\n", "%0A").replace("\r", "%0D").replace("\"", "%22");
+    }
+}
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/implementation/TranscriptionClientImpl.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/implementation/TranscriptionClientImpl.java
new file mode 100644
index 000000000000..bb14d116f3e0
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/implementation/TranscriptionClientImpl.java
@@ -0,0 +1,284 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Code generated by Microsoft (R) TypeSpec Code Generator.
+ +package com.azure.ai.speech.transcription.implementation; + +import com.azure.ai.speech.transcription.TranscriptionServiceVersion; +import com.azure.core.annotation.BodyParam; +import com.azure.core.annotation.ExpectedResponses; +import com.azure.core.annotation.HeaderParam; +import com.azure.core.annotation.Host; +import com.azure.core.annotation.HostParam; +import com.azure.core.annotation.Post; +import com.azure.core.annotation.QueryParam; +import com.azure.core.annotation.ReturnType; +import com.azure.core.annotation.ServiceInterface; +import com.azure.core.annotation.ServiceMethod; +import com.azure.core.annotation.UnexpectedResponseExceptionType; +import com.azure.core.exception.ClientAuthenticationException; +import com.azure.core.exception.HttpResponseException; +import com.azure.core.exception.ResourceModifiedException; +import com.azure.core.exception.ResourceNotFoundException; +import com.azure.core.http.HttpPipeline; +import com.azure.core.http.HttpPipelineBuilder; +import com.azure.core.http.policy.RetryPolicy; +import com.azure.core.http.policy.UserAgentPolicy; +import com.azure.core.http.rest.RequestOptions; +import com.azure.core.http.rest.Response; +import com.azure.core.http.rest.RestProxy; +import com.azure.core.util.BinaryData; +import com.azure.core.util.Context; +import com.azure.core.util.FluxUtil; +import com.azure.core.util.serializer.JacksonAdapter; +import com.azure.core.util.serializer.SerializerAdapter; +import reactor.core.publisher.Mono; + +/** + * Initializes a new instance of the TranscriptionClient type. + */ +public final class TranscriptionClientImpl { + /** + * The proxy service used to perform REST calls. + */ + private final TranscriptionClientService service; + + /** + * Supported Cognitive Services endpoints (protocol and hostname, for example: + * https://westus.api.cognitive.microsoft.com. + */ + private final String endpoint; + + /** + * Gets Supported Cognitive Services endpoints (protocol and hostname, for example: + * https://westus.api.cognitive.microsoft.com. + * + * @return the endpoint value. + */ + public String getEndpoint() { + return this.endpoint; + } + + /** + * Service version. + */ + private final TranscriptionServiceVersion serviceVersion; + + /** + * Gets Service version. + * + * @return the serviceVersion value. + */ + public TranscriptionServiceVersion getServiceVersion() { + return this.serviceVersion; + } + + /** + * The HTTP pipeline to send requests through. + */ + private final HttpPipeline httpPipeline; + + /** + * Gets The HTTP pipeline to send requests through. + * + * @return the httpPipeline value. + */ + public HttpPipeline getHttpPipeline() { + return this.httpPipeline; + } + + /** + * The serializer to serialize an object into a string. + */ + private final SerializerAdapter serializerAdapter; + + /** + * Gets The serializer to serialize an object into a string. + * + * @return the serializerAdapter value. + */ + public SerializerAdapter getSerializerAdapter() { + return this.serializerAdapter; + } + + /** + * Initializes an instance of TranscriptionClient client. + * + * @param endpoint Supported Cognitive Services endpoints (protocol and hostname, for example: + * https://westus.api.cognitive.microsoft.com. + * @param serviceVersion Service version. 
+     */
+    public TranscriptionClientImpl(String endpoint, TranscriptionServiceVersion serviceVersion) {
+        this(new HttpPipelineBuilder().policies(new UserAgentPolicy(), new RetryPolicy()).build(),
+            JacksonAdapter.createDefaultSerializerAdapter(), endpoint, serviceVersion);
+    }
+
+    /**
+     * Initializes an instance of TranscriptionClient client.
+     *
+     * @param httpPipeline The HTTP pipeline to send requests through.
+     * @param endpoint Supported Cognitive Services endpoints (protocol and hostname, for example:
+     * https://westus.api.cognitive.microsoft.com).
+     * @param serviceVersion Service version.
+     */
+    public TranscriptionClientImpl(HttpPipeline httpPipeline, String endpoint,
+        TranscriptionServiceVersion serviceVersion) {
+        this(httpPipeline, JacksonAdapter.createDefaultSerializerAdapter(), endpoint, serviceVersion);
+    }
+
+    /**
+     * Initializes an instance of TranscriptionClient client.
+     *
+     * @param httpPipeline The HTTP pipeline to send requests through.
+     * @param serializerAdapter The serializer to serialize an object into a string.
+     * @param endpoint Supported Cognitive Services endpoints (protocol and hostname, for example:
+     * https://westus.api.cognitive.microsoft.com).
+     * @param serviceVersion Service version.
+     */
+    public TranscriptionClientImpl(HttpPipeline httpPipeline, SerializerAdapter serializerAdapter, String endpoint,
+        TranscriptionServiceVersion serviceVersion) {
+        this.httpPipeline = httpPipeline;
+        this.serializerAdapter = serializerAdapter;
+        this.endpoint = endpoint;
+        this.serviceVersion = serviceVersion;
+        this.service
+            = RestProxy.create(TranscriptionClientService.class, this.httpPipeline, this.getSerializerAdapter());
+    }
+
+    /**
+     * The interface defining all the services for TranscriptionClient to be used by the proxy service to perform REST
+     * calls.
+     */
+    @Host("{endpoint}/speechtotext")
+    @ServiceInterface(name = "TranscriptionClient")
+    public interface TranscriptionClientService {
+        // @Multipart not supported by RestProxy
+        @Post("/transcriptions:transcribe")
+        @ExpectedResponses({ 200 })
+        @UnexpectedResponseExceptionType(value = ClientAuthenticationException.class, code = { 401 })
+        @UnexpectedResponseExceptionType(value = ResourceNotFoundException.class, code = { 404 })
+        @UnexpectedResponseExceptionType(value = ResourceModifiedException.class, code = { 409 })
+        @UnexpectedResponseExceptionType(HttpResponseException.class)
+        Mono<Response<BinaryData>> transcribe(@HostParam("endpoint") String endpoint,
+            @QueryParam("api-version") String apiVersion, @HeaderParam("content-type") String contentType,
+            @HeaderParam("Accept") String accept, @BodyParam("multipart/form-data") BinaryData body,
+            RequestOptions requestOptions, Context context);
+
+        // @Multipart not supported by RestProxy
+        @Post("/transcriptions:transcribe")
+        @ExpectedResponses({ 200 })
+        @UnexpectedResponseExceptionType(value = ClientAuthenticationException.class, code = { 401 })
+        @UnexpectedResponseExceptionType(value = ResourceNotFoundException.class, code = { 404 })
+        @UnexpectedResponseExceptionType(value = ResourceModifiedException.class, code = { 409 })
+        @UnexpectedResponseExceptionType(HttpResponseException.class)
+        Response<BinaryData> transcribeSync(@HostParam("endpoint") String endpoint,
+            @QueryParam("api-version") String apiVersion, @HeaderParam("content-type") String contentType,
+            @HeaderParam("Accept") String accept, @BodyParam("multipart/form-data") BinaryData body,
+            RequestOptions requestOptions, Context context);
+    }
+
+    /**
+     * Transcribes the provided audio stream.
+     * <p><strong>Response Body Schema</strong></p>
+     *
+     * <pre>
+     * {@code
+     * {
+     *     durationMilliseconds: int (Required)
+     *     combinedPhrases (Required): [
+     *          (Required){
+     *             channel: Integer (Optional)
+     *             text: String (Required)
+     *         }
+     *     ]
+     *     phrases (Required): [
+     *          (Required){
+     *             channel: Integer (Optional)
+     *             speaker: Integer (Optional)
+     *             offsetMilliseconds: int (Required)
+     *             durationMilliseconds: int (Required)
+     *             text: String (Required)
+     *             words (Optional): [
+     *                  (Optional){
+     *                     text: String (Required)
+     *                     offsetMilliseconds: int (Required)
+     *                     durationMilliseconds: int (Required)
+     *                 }
+     *             ]
+     *             locale: String (Optional)
+     *             confidence: double (Required)
+     *         }
+     *     ]
+     * }
+     * }
+     * </pre>
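+     *
+     * <p>Illustrative async call, not generated code (a sketch; {@code impl} is an instance of this class and
+     * {@code body} a pre-assembled multipart payload):</p>
+     *
+     * <pre>
+     * {@code
+     * impl.transcribeWithResponseAsync(body, new RequestOptions())
+     *     .subscribe(response -> System.out.println(response.getValue().toString()));
+     * }
+     * </pre>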
+     *
+     * @param body The body of the multipart request.
+     * @param requestOptions The options to configure the HTTP request before HTTP client sends it.
+     * @throws HttpResponseException thrown if the request is rejected by server.
+     * @throws ClientAuthenticationException thrown if the request is rejected by server on status code 401.
+     * @throws ResourceNotFoundException thrown if the request is rejected by server on status code 404.
+     * @throws ResourceModifiedException thrown if the request is rejected by server on status code 409.
+     * @return the result of the transcribe operation along with {@link Response} on successful completion of
+     * {@link Mono}.
+     */
+    @ServiceMethod(returns = ReturnType.SINGLE)
+    public Mono<Response<BinaryData>> transcribeWithResponseAsync(BinaryData body, RequestOptions requestOptions) {
+        final String contentType = "multipart/form-data";
+        final String accept = "application/json";
+        return FluxUtil.withContext(context -> service.transcribe(this.getEndpoint(),
+            this.getServiceVersion().getVersion(), contentType, accept, body, requestOptions, context));
+    }
+
+    /**
+     * Transcribes the provided audio stream.
+     * <p><strong>Response Body Schema</strong></p>
+     *
+     * <pre>
+     * {@code
+     * {
+     *     durationMilliseconds: int (Required)
+     *     combinedPhrases (Required): [
+     *          (Required){
+     *             channel: Integer (Optional)
+     *             text: String (Required)
+     *         }
+     *     ]
+     *     phrases (Required): [
+     *          (Required){
+     *             channel: Integer (Optional)
+     *             speaker: Integer (Optional)
+     *             offsetMilliseconds: int (Required)
+     *             durationMilliseconds: int (Required)
+     *             text: String (Required)
+     *             words (Optional): [
+     *                  (Optional){
+     *                     text: String (Required)
+     *                     offsetMilliseconds: int (Required)
+     *                     durationMilliseconds: int (Required)
+     *                 }
+     *             ]
+     *             locale: String (Optional)
+     *             confidence: double (Required)
+     *         }
+     *     ]
+     * }
+     * }
+     * </pre>
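+     *
+     * <p>Once the returned payload is deserialized into {@code TranscriptionResult}, the schema above maps onto the
+     * model accessors defined in this library. A sketch ({@code response} is the value returned by this method; the
+     * {@code getPhrases()} accessor on the result is an assumption, while the phrase accessors are the generated
+     * ones):</p>
+     *
+     * <pre>
+     * {@code
+     * TranscriptionResult result = response.getValue().toObject(TranscriptionResult.class);
+     * for (TranscribedPhrase phrase : result.getPhrases()) {
+     *     System.out.printf("[%d ms] %s (confidence %.2f)%n",
+     *         phrase.getOffset(), phrase.getText(), phrase.getConfidence());
+     * }
+     * }
+     * </pre>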
+     *
+     * @param body The body of the multipart request.
+     * @param requestOptions The options to configure the HTTP request before HTTP client sends it.
+     * @throws HttpResponseException thrown if the request is rejected by server.
+     * @throws ClientAuthenticationException thrown if the request is rejected by server on status code 401.
+     * @throws ResourceNotFoundException thrown if the request is rejected by server on status code 404.
+     * @throws ResourceModifiedException thrown if the request is rejected by server on status code 409.
+     * @return the result of the transcribe operation along with {@link Response}.
+     */
+    @ServiceMethod(returns = ReturnType.SINGLE)
+    public Response<BinaryData> transcribeWithResponse(BinaryData body, RequestOptions requestOptions) {
+        final String contentType = "multipart/form-data";
+        final String accept = "application/json";
+        return service.transcribeSync(this.getEndpoint(), this.getServiceVersion().getVersion(), contentType, accept,
+            body, requestOptions, Context.NONE);
+    }
+}
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/implementation/package-info.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/implementation/package-info.java
new file mode 100644
index 000000000000..140636afd3d6
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/implementation/package-info.java
@@ -0,0 +1,9 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Code generated by Microsoft (R) TypeSpec Code Generator.
+/**
+ *
+ * Package containing the implementations for Transcription.
+ *
+ */
+package com.azure.ai.speech.transcription.implementation;
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/AudioFileDetails.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/AudioFileDetails.java
new file mode 100644
index 000000000000..7381bc25899d
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/AudioFileDetails.java
@@ -0,0 +1,119 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Code generated by Microsoft (R) TypeSpec Code Generator.
+package com.azure.ai.speech.transcription.models;
+
+import com.azure.core.annotation.Fluent;
+import com.azure.core.annotation.Generated;
+import com.azure.core.util.BinaryData;
+
+/**
+ * The file details for the "audio" field.
+ */
+@Fluent
+public final class AudioFileDetails {
+
+    /*
+     * The content of the file.
+     */
+    @Generated
+    private final BinaryData content;
+
+    /*
+     * The filename of the file.
+     */
+    @Generated
+    private String filename;
+
+    /*
+     * The content-type of the file.
+     */
+    @Generated
+    private String contentType = "application/octet-stream";
+
+    /**
+     * Creates an instance of AudioFileDetails class.
+     *
+     * @param content the content value to set.
+     */
+    @Generated
+    public AudioFileDetails(BinaryData content) {
+        this.content = content;
+    }
+
+    /**
+     * Get the content property: The content of the file.
+     *
+     * @return the content value.
+     */
+    @Generated
+    public BinaryData getContent() {
+        return this.content;
+    }
+
+    /**
+     * Get the filename property: The filename of the file. If not explicitly set, a filename will be auto-generated
+     * from the contentType.
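+     *
+     * <p>For example (a sketch; {@code audioBytes} is a placeholder byte array):</p>
+     *
+     * <pre>
+     * {@code
+     * AudioFileDetails details = new AudioFileDetails(BinaryData.fromBytes(audioBytes))
+     *     .setContentType("audio/wav");
+     * String name = details.getFilename(); // "audio.wav", derived from the content type
+     * }
+     * </pre>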
+ * + * @return the filename value, or an auto-generated filename if not set. + */ + @Generated + public String getFilename() { + if (this.filename != null && !this.filename.isEmpty()) { + return this.filename; + } + if ("audio/wav".equalsIgnoreCase(this.contentType)) { + return "audio.wav"; + } + if ("audio/mpeg".equalsIgnoreCase(this.contentType) || "audio/mp3".equalsIgnoreCase(this.contentType)) { + return "audio.mp3"; + } + if ("audio/ogg".equalsIgnoreCase(this.contentType)) { + return "audio.ogg"; + } + if ("audio/flac".equalsIgnoreCase(this.contentType)) { + return "audio.flac"; + } + if ("audio/webm".equalsIgnoreCase(this.contentType)) { + return "audio.webm"; + } + if ("audio/opus".equalsIgnoreCase(this.contentType)) { + return "audio.opus"; + } + return "audio"; + } + + /** + * Set the filename property: The filename of the file. + * + * @param filename the filename value to set. + * @return the AudioFileDetails object itself. + */ + @Generated + public AudioFileDetails setFilename(String filename) { + this.filename = filename; + return this; + } + + /** + * Get the contentType property: The content-type of the file. + * + * @return the contentType value. + */ + @Generated + public String getContentType() { + return this.contentType; + } + + /** + * Set the contentType property: The content-type of the file. + * + * @param contentType the contentType value to set. + * @return the AudioFileDetails object itself. + */ + @Generated + public AudioFileDetails setContentType(String contentType) { + this.contentType = contentType; + return this; + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/ChannelCombinedPhrases.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/ChannelCombinedPhrases.java new file mode 100644 index 000000000000..a315b684bc4c --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/ChannelCombinedPhrases.java @@ -0,0 +1,104 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// Code generated by Microsoft (R) TypeSpec Code Generator. +package com.azure.ai.speech.transcription.models; + +import com.azure.core.annotation.Generated; +import com.azure.core.annotation.Immutable; +import com.azure.json.JsonReader; +import com.azure.json.JsonSerializable; +import com.azure.json.JsonToken; +import com.azure.json.JsonWriter; +import java.io.IOException; + +/** + * The full transcript per channel. + */ +@Immutable +public final class ChannelCombinedPhrases implements JsonSerializable { + + /* + * The 0-based channel index. Only present if channel separation is enabled. + */ + @Generated + private Integer channel; + + /* + * The complete transcribed text for the channel. + */ + @Generated + private final String text; + + /** + * Creates an instance of ChannelCombinedPhrases class. + * + * @param text the text value to set. + */ + @Generated + private ChannelCombinedPhrases(String text) { + this.text = text; + } + + /** + * Get the channel property: The 0-based channel index. Only present if channel separation is enabled. + * + * @return the channel value. + */ + @Generated + public Integer getChannel() { + return this.channel; + } + + /** + * Get the text property: The complete transcribed text for the channel. + * + * @return the text value. 
+ */ + @Generated + public String getText() { + return this.text; + } + + /** + * {@inheritDoc} + */ + @Generated + @Override + public JsonWriter toJson(JsonWriter jsonWriter) throws IOException { + jsonWriter.writeStartObject(); + jsonWriter.writeStringField("text", this.text); + jsonWriter.writeNumberField("channel", this.channel); + return jsonWriter.writeEndObject(); + } + + /** + * Reads an instance of ChannelCombinedPhrases from the JsonReader. + * + * @param jsonReader The JsonReader being read. + * @return An instance of ChannelCombinedPhrases if the JsonReader was pointing to an instance of it, or null if it + * was pointing to JSON null. + * @throws IllegalStateException If the deserialized JSON object was missing any required properties. + * @throws IOException If an error occurs while reading the ChannelCombinedPhrases. + */ + @Generated + public static ChannelCombinedPhrases fromJson(JsonReader jsonReader) throws IOException { + return jsonReader.readObject(reader -> { + String text = null; + Integer channel = null; + while (reader.nextToken() != JsonToken.END_OBJECT) { + String fieldName = reader.getFieldName(); + reader.nextToken(); + if ("text".equals(fieldName)) { + text = reader.getString(); + } else if ("channel".equals(fieldName)) { + channel = reader.getNullable(JsonReader::getInt); + } else { + reader.skipChildren(); + } + } + ChannelCombinedPhrases deserializedChannelCombinedPhrases = new ChannelCombinedPhrases(text); + deserializedChannelCombinedPhrases.channel = channel; + return deserializedChannelCombinedPhrases; + }); + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/EnhancedModeOptions.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/EnhancedModeOptions.java new file mode 100644 index 000000000000..d013f508bb8e --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/EnhancedModeOptions.java @@ -0,0 +1,163 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// Code generated by Microsoft (R) TypeSpec Code Generator. +package com.azure.ai.speech.transcription.models; + +import com.azure.core.annotation.Fluent; +import com.azure.core.annotation.Generated; +import com.azure.json.JsonReader; +import com.azure.json.JsonSerializable; +import com.azure.json.JsonToken; +import com.azure.json.JsonWriter; +import java.io.IOException; +import java.util.List; + +/** + * Enhanced mode properties for transcription. + */ +@Fluent +public final class EnhancedModeOptions implements JsonSerializable { + + /* + * Enable enhanced mode for transcription. This is automatically set to true when task, targetLanguage, or prompt + * are specified. + */ + @Generated + private Boolean enabled; + + /* + * Task type for enhanced mode. + */ + @Generated + private String task; + + /* + * Target language for enhanced mode. + */ + @Generated + private String targetLanguage; + + /* + * A list of user prompts. + */ + @Generated + private List prompts; + + /** + * Creates an instance of EnhancedModeOptions class with enabled set to true. + */ + public EnhancedModeOptions() { + this.enabled = true; + } + + /** + * Get the task property: Task type for enhanced mode. + * + * @return the task value. + */ + @Generated + public String getTask() { + return this.task; + } + + /** + * Set the task property: Task type for enhanced mode. 
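+     * <p>Setting a task implies enhanced mode ({@code enabled} is already initialized to true by the constructor).
+     * A sketch with placeholder values, since valid task and language identifiers are service-defined:</p>
+     *
+     * <pre>
+     * {@code
+     * EnhancedModeOptions enhanced = new EnhancedModeOptions()
+     *     .setTask("<task>")
+     *     .setTargetLanguage("<target-language>");
+     * }
+     * </pre>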
+ * + * @param task the task value to set. + * @return the EnhancedModeOptions object itself. + */ + @Generated + public EnhancedModeOptions setTask(String task) { + this.task = task; + return this; + } + + /** + * Get the targetLanguage property: Target language for enhanced mode. + * + * @return the targetLanguage value. + */ + @Generated + public String getTargetLanguage() { + return this.targetLanguage; + } + + /** + * Set the targetLanguage property: Target language for enhanced mode. + * + * @param targetLanguage the targetLanguage value to set. + * @return the EnhancedModeOptions object itself. + */ + @Generated + public EnhancedModeOptions setTargetLanguage(String targetLanguage) { + this.targetLanguage = targetLanguage; + return this; + } + + /** + * Get the prompts property: A list of user prompts. + * + * @return the prompts value. + */ + @Generated + public List getPrompts() { + return this.prompts; + } + + /** + * Set the prompts property: A list of user prompts. + * + * @param prompts the prompts value to set. + * @return the EnhancedModeOptions object itself. + */ + @Generated + public EnhancedModeOptions setPrompts(List prompts) { + this.prompts = prompts; + return this; + } + + /** + * {@inheritDoc} + */ + @Generated + @Override + public JsonWriter toJson(JsonWriter jsonWriter) throws IOException { + jsonWriter.writeStartObject(); + jsonWriter.writeStringField("task", this.task); + jsonWriter.writeStringField("targetLanguage", this.targetLanguage); + jsonWriter.writeArrayField("prompt", this.prompts, (writer, element) -> writer.writeString(element)); + return jsonWriter.writeEndObject(); + } + + /** + * Reads an instance of EnhancedModeOptions from the JsonReader. + * + * @param jsonReader The JsonReader being read. + * @return An instance of EnhancedModeOptions if the JsonReader was pointing to an instance of it, or null if it was + * pointing to JSON null. + * @throws IOException If an error occurs while reading the EnhancedModeOptions. + */ + @Generated + public static EnhancedModeOptions fromJson(JsonReader jsonReader) throws IOException { + return jsonReader.readObject(reader -> { + EnhancedModeOptions deserializedEnhancedModeOptions = new EnhancedModeOptions(); + while (reader.nextToken() != JsonToken.END_OBJECT) { + String fieldName = reader.getFieldName(); + reader.nextToken(); + if ("enabled".equals(fieldName)) { + deserializedEnhancedModeOptions.enabled = reader.getNullable(JsonReader::getBoolean); + } else if ("task".equals(fieldName)) { + deserializedEnhancedModeOptions.task = reader.getString(); + } else if ("targetLanguage".equals(fieldName)) { + deserializedEnhancedModeOptions.targetLanguage = reader.getString(); + } else if ("prompt".equals(fieldName)) { + List prompts = reader.readArray(reader1 -> reader1.getString()); + deserializedEnhancedModeOptions.prompts = prompts; + } else { + reader.skipChildren(); + } + } + return deserializedEnhancedModeOptions; + }); + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/PhraseListOptions.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/PhraseListOptions.java new file mode 100644 index 000000000000..e269745eaf87 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/PhraseListOptions.java @@ -0,0 +1,123 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+// Code generated by Microsoft (R) TypeSpec Code Generator. +package com.azure.ai.speech.transcription.models; + +import com.azure.core.annotation.Fluent; +import com.azure.core.annotation.Generated; +import com.azure.json.JsonReader; +import com.azure.json.JsonSerializable; +import com.azure.json.JsonToken; +import com.azure.json.JsonWriter; +import java.io.IOException; +import java.util.List; + +/** + * Phrase list properties for transcription. + */ +@Fluent +public final class PhraseListOptions implements JsonSerializable { + + /* + * List of phrases for recognition. + */ + @Generated + private List phrases; + + /* + * Biasing weight for phrase list (1.0 to 20.0). + */ + @Generated + private Double biasingWeight; + + /** + * Creates an instance of PhraseListOptions class. + */ + @Generated + public PhraseListOptions() { + } + + /** + * Get the phrases property: List of phrases for recognition. + * + * @return the phrases value. + */ + @Generated + public List getPhrases() { + return this.phrases; + } + + /** + * Set the phrases property: List of phrases for recognition. + * + * @param phrases the phrases value to set. + * @return the PhraseListOptions object itself. + */ + @Generated + public PhraseListOptions setPhrases(List phrases) { + this.phrases = phrases; + return this; + } + + /** + * Get the biasingWeight property: Biasing weight for phrase list (1.0 to 20.0). + * + * @return the biasingWeight value. + */ + @Generated + public Double getBiasingWeight() { + return this.biasingWeight; + } + + /** + * Set the biasingWeight property: Biasing weight for phrase list (1.0 to 20.0). + * + * @param biasingWeight the biasingWeight value to set. + * @return the PhraseListOptions object itself. + */ + @Generated + public PhraseListOptions setBiasingWeight(Double biasingWeight) { + this.biasingWeight = biasingWeight; + return this; + } + + /** + * {@inheritDoc} + */ + @Generated + @Override + public JsonWriter toJson(JsonWriter jsonWriter) throws IOException { + jsonWriter.writeStartObject(); + jsonWriter.writeArrayField("phrases", this.phrases, (writer, element) -> writer.writeString(element)); + jsonWriter.writeNumberField("biasingWeight", this.biasingWeight); + return jsonWriter.writeEndObject(); + } + + /** + * Reads an instance of PhraseListOptions from the JsonReader. + * + * @param jsonReader The JsonReader being read. + * @return An instance of PhraseListOptions if the JsonReader was pointing to an instance of it, or null if it was + * pointing to JSON null. + * @throws IOException If an error occurs while reading the PhraseListOptions. 
+ */ + @Generated + public static PhraseListOptions fromJson(JsonReader jsonReader) throws IOException { + return jsonReader.readObject(reader -> { + PhraseListOptions deserializedPhraseListOptions = new PhraseListOptions(); + while (reader.nextToken() != JsonToken.END_OBJECT) { + String fieldName = reader.getFieldName(); + reader.nextToken(); + if ("phrases".equals(fieldName)) { + List phrases = reader.readArray(reader1 -> reader1.getString()); + deserializedPhraseListOptions.phrases = phrases; + } else if ("biasingWeight".equals(fieldName)) { + deserializedPhraseListOptions.biasingWeight = reader.getNullable(JsonReader::getDouble); + } else { + reader.skipChildren(); + } + } + return deserializedPhraseListOptions; + }); + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/ProfanityFilterMode.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/ProfanityFilterMode.java new file mode 100644 index 000000000000..8031bcadc6ae --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/ProfanityFilterMode.java @@ -0,0 +1,69 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// Code generated by Microsoft (R) TypeSpec Code Generator. +package com.azure.ai.speech.transcription.models; + +import com.azure.core.annotation.Generated; +import com.azure.core.util.ExpandableStringEnum; +import java.util.Collection; + +/** + * Mode of profanity filtering. + */ +public final class ProfanityFilterMode extends ExpandableStringEnum { + + /** + * Disable profanity filtering. + */ + @Generated + public static final ProfanityFilterMode NONE = fromString("None"); + + /** + * Remove profanity. + */ + @Generated + public static final ProfanityFilterMode REMOVED = fromString("Removed"); + + /** + * Add "profanity" XML tags</Profanity>. + */ + @Generated + public static final ProfanityFilterMode TAGS = fromString("Tags"); + + /** + * Mask the profanity with * except of the first letter, e.g., f***. + */ + @Generated + public static final ProfanityFilterMode MASKED = fromString("Masked"); + + /** + * Creates a new instance of ProfanityFilterMode value. + * + * @deprecated Use the {@link #fromString(String)} factory method. + */ + @Generated + @Deprecated + public ProfanityFilterMode() { + } + + /** + * Creates or finds a ProfanityFilterMode from its string representation. + * + * @param name a name to look for. + * @return the corresponding ProfanityFilterMode. + */ + @Generated + public static ProfanityFilterMode fromString(String name) { + return fromString(name, ProfanityFilterMode.class); + } + + /** + * Gets known ProfanityFilterMode values. + * + * @return known ProfanityFilterMode values. + */ + @Generated + public static Collection values() { + return values(ProfanityFilterMode.class); + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscribedPhrase.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscribedPhrase.java new file mode 100644 index 000000000000..6406e29ed530 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscribedPhrase.java @@ -0,0 +1,237 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+// Code generated by Microsoft (R) TypeSpec Code Generator. +package com.azure.ai.speech.transcription.models; + +import com.azure.core.annotation.Generated; +import com.azure.core.annotation.Immutable; +import com.azure.json.JsonReader; +import com.azure.json.JsonSerializable; +import com.azure.json.JsonToken; +import com.azure.json.JsonWriter; +import java.io.IOException; +import java.time.Duration; +import java.util.List; + +/** + * A transcribed phrase. + */ +@Immutable +public final class TranscribedPhrase implements JsonSerializable { + + /* + * The 0-based channel index. Only present if channel separation is enabled. + */ + @Generated + private Integer channel; + + /* + * A unique integer number that is assigned to each speaker detected in the audio without particular order. Only + * present if speaker diarization is enabled. + */ + @Generated + private Integer speaker; + + /* + * The start offset of the phrase in milliseconds. + */ + @Generated + private final int offset; + + /* + * The duration of the phrase in milliseconds. + */ + @Generated + private final int duration; + + /* + * The transcribed text of the phrase. + */ + @Generated + private final String text; + + /* + * The words that make up the phrase. Only present if word-level timestamps are enabled. + */ + @Generated + private List words; + + /* + * The locale of the phrase. + */ + @Generated + private String locale; + + /* + * The confidence value for the phrase. + */ + @Generated + private final double confidence; + + /** + * Creates an instance of TranscribedPhrase class. + * + * @param offset the offset value to set. + * @param duration the duration value to set. + * @param text the text value to set. + * @param confidence the confidence value to set. + */ + @Generated + private TranscribedPhrase(int offset, int duration, String text, double confidence) { + this.offset = offset; + this.duration = duration; + this.text = text; + this.confidence = confidence; + } + + /** + * Get the channel property: The 0-based channel index. Only present if channel separation is enabled. + * + * @return the channel value. + */ + @Generated + public Integer getChannel() { + return this.channel; + } + + /** + * Get the speaker property: A unique integer number that is assigned to each speaker detected in the audio without + * particular order. Only present if speaker diarization is enabled. + * + * @return the speaker value. + */ + @Generated + public Integer getSpeaker() { + return this.speaker; + } + + /** + * Get the offset property: The start offset of the phrase in milliseconds. + * + * @return the offset value. + */ + @Generated + public int getOffset() { + return this.offset; + } + + /** + * Get the duration property: The duration in milliseconds. + * + * @return the duration value as Duration. + */ + @Generated + public Duration getDuration() { + return Duration.ofMillis(this.duration); + } + + /** + * Get the text property: The transcribed text of the phrase. + * + * @return the text value. + */ + @Generated + public String getText() { + return this.text; + } + + /** + * Get the words property: The words that make up the phrase. Only present if word-level timestamps are enabled. + * + * @return the words value. + */ + @Generated + public List getWords() { + return this.words; + } + + /** + * Get the locale property: The locale of the phrase. + * + * @return the locale value. + */ + @Generated + public String getLocale() { + return this.locale; + } + + /** + * Get the confidence property: The confidence value for the phrase. 
+ * + * @return the confidence value. + */ + @Generated + public double getConfidence() { + return this.confidence; + } + + /** + * {@inheritDoc} + */ + @Generated + @Override + public JsonWriter toJson(JsonWriter jsonWriter) throws IOException { + jsonWriter.writeStartObject(); + jsonWriter.writeIntField("offsetMilliseconds", this.offset); + jsonWriter.writeIntField("durationMilliseconds", this.duration); + jsonWriter.writeStringField("text", this.text); + jsonWriter.writeDoubleField("confidence", this.confidence); + jsonWriter.writeNumberField("channel", this.channel); + jsonWriter.writeNumberField("speaker", this.speaker); + jsonWriter.writeArrayField("words", this.words, (writer, element) -> writer.writeJson(element)); + jsonWriter.writeStringField("locale", this.locale); + return jsonWriter.writeEndObject(); + } + + /** + * Reads an instance of TranscribedPhrase from the JsonReader. + * + * @param jsonReader The JsonReader being read. + * @return An instance of TranscribedPhrase if the JsonReader was pointing to an instance of it, or null if it was + * pointing to JSON null. + * @throws IllegalStateException If the deserialized JSON object was missing any required properties. + * @throws IOException If an error occurs while reading the TranscribedPhrase. + */ + @Generated + public static TranscribedPhrase fromJson(JsonReader jsonReader) throws IOException { + return jsonReader.readObject(reader -> { + int offset = 0; + int duration = 0; + String text = null; + double confidence = 0.0; + Integer channel = null; + Integer speaker = null; + List words = null; + String locale = null; + while (reader.nextToken() != JsonToken.END_OBJECT) { + String fieldName = reader.getFieldName(); + reader.nextToken(); + if ("offsetMilliseconds".equals(fieldName)) { + offset = reader.getInt(); + } else if ("durationMilliseconds".equals(fieldName)) { + duration = reader.getInt(); + } else if ("text".equals(fieldName)) { + text = reader.getString(); + } else if ("confidence".equals(fieldName)) { + confidence = reader.getDouble(); + } else if ("channel".equals(fieldName)) { + channel = reader.getNullable(JsonReader::getInt); + } else if ("speaker".equals(fieldName)) { + speaker = reader.getNullable(JsonReader::getInt); + } else if ("words".equals(fieldName)) { + words = reader.readArray(reader1 -> TranscribedWord.fromJson(reader1)); + } else if ("locale".equals(fieldName)) { + locale = reader.getString(); + } else { + reader.skipChildren(); + } + } + TranscribedPhrase deserializedTranscribedPhrase = new TranscribedPhrase(offset, duration, text, confidence); + deserializedTranscribedPhrase.channel = channel; + deserializedTranscribedPhrase.speaker = speaker; + deserializedTranscribedPhrase.words = words; + deserializedTranscribedPhrase.locale = locale; + return deserializedTranscribedPhrase; + }); + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscribedWord.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscribedWord.java new file mode 100644 index 000000000000..a0c046e15cbe --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscribedWord.java @@ -0,0 +1,127 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// Code generated by Microsoft (R) TypeSpec Code Generator. 
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscribedWord.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscribedWord.java
new file mode 100644
index 000000000000..a0c046e15cbe
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscribedWord.java
@@ -0,0 +1,127 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Code generated by Microsoft (R) TypeSpec Code Generator.
+package com.azure.ai.speech.transcription.models;
+
+import com.azure.core.annotation.Generated;
+import com.azure.core.annotation.Immutable;
+import com.azure.json.JsonReader;
+import com.azure.json.JsonSerializable;
+import com.azure.json.JsonToken;
+import com.azure.json.JsonWriter;
+import java.io.IOException;
+import java.time.Duration;
+
+/**
+ * Time-stamped word in the display form.
+ */
+@Immutable
+public final class TranscribedWord implements JsonSerializable<TranscribedWord> {
+
+    /*
+     * The recognized word, including punctuation.
+     */
+    @Generated
+    private final String text;
+
+    /*
+     * The start offset of the word in milliseconds.
+     */
+    @Generated
+    private final int offset;
+
+    /*
+     * The duration of the word in milliseconds.
+     */
+    @Generated
+    private final int duration;
+
+    /**
+     * Creates an instance of TranscribedWord class.
+     *
+     * @param text the text value to set.
+     * @param offset the offset value to set.
+     * @param duration the duration value to set.
+     */
+    @Generated
+    private TranscribedWord(String text, int offset, int duration) {
+        this.text = text;
+        this.offset = offset;
+        this.duration = duration;
+    }
+
+    /**
+     * Get the text property: The recognized word, including punctuation.
+     *
+     * @return the text value.
+     */
+    @Generated
+    public String getText() {
+        return this.text;
+    }
+
+    /**
+     * Get the offset property: The start offset of the word in milliseconds.
+     *
+     * @return the offset value.
+     */
+    @Generated
+    public int getOffset() {
+        return this.offset;
+    }
+
+    /**
+     * Get the duration property: The duration of the word in milliseconds.
+     *
+     * @return the duration value as a {@link Duration}.
+     */
+    @Generated
+    public Duration getDuration() {
+        return Duration.ofMillis(this.duration);
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Generated
+    @Override
+    public JsonWriter toJson(JsonWriter jsonWriter) throws IOException {
+        jsonWriter.writeStartObject();
+        jsonWriter.writeStringField("text", this.text);
+        jsonWriter.writeIntField("offsetMilliseconds", this.offset);
+        jsonWriter.writeIntField("durationMilliseconds", this.duration);
+        return jsonWriter.writeEndObject();
+    }
+
+    /**
+     * Reads an instance of TranscribedWord from the JsonReader.
+     *
+     * @param jsonReader The JsonReader being read.
+     * @return An instance of TranscribedWord if the JsonReader was pointing to an instance of it, or null if it was
+     * pointing to JSON null.
+     * @throws IllegalStateException If the deserialized JSON object was missing any required properties.
+     * @throws IOException If an error occurs while reading the TranscribedWord.
+ */ + @Generated + public static TranscribedWord fromJson(JsonReader jsonReader) throws IOException { + return jsonReader.readObject(reader -> { + String text = null; + int offset = 0; + int duration = 0; + while (reader.nextToken() != JsonToken.END_OBJECT) { + String fieldName = reader.getFieldName(); + reader.nextToken(); + if ("text".equals(fieldName)) { + text = reader.getString(); + } else if ("offsetMilliseconds".equals(fieldName)) { + offset = reader.getInt(); + } else if ("durationMilliseconds".equals(fieldName)) { + duration = reader.getInt(); + } else { + reader.skipChildren(); + } + } + return new TranscribedWord(text, offset, duration); + }); + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscriptionContent.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscriptionContent.java new file mode 100644 index 000000000000..5ba64e48e89e --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscriptionContent.java @@ -0,0 +1,73 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// Code generated by Microsoft (R) TypeSpec Code Generator. +package com.azure.ai.speech.transcription.models; + +import com.azure.core.annotation.Fluent; +import com.azure.core.annotation.Generated; + +/** + * Request model for transcription operation. + */ +@Fluent +public final class TranscriptionContent { + + /* + * The content of the audio file to be transcribed. The audio file must be shorter than 2 hours in audio duration + * and smaller than 250 MB in size. Optional if audioUrl is provided in the definition. + */ + @Generated + private AudioFileDetails audio; + + /** + * Get the audio property: The content of the audio file to be transcribed. The audio file must be shorter than 2 + * hours in audio duration and smaller than 250 MB in size. Optional if audioUrl is provided in the definition. + * + * @return the audio value. + */ + @Generated + public AudioFileDetails getAudio() { + return this.audio; + } + + /** + * Set the audio property: The content of the audio file to be transcribed. The audio file must be shorter than 2 + * hours in audio duration and smaller than 250 MB in size. Optional if audioUrl is provided in the definition. + * + * @param audio the audio value to set. + * @return the TranscriptionContent object itself. + */ + @Generated + public TranscriptionContent setAudio(AudioFileDetails audio) { + this.audio = audio; + return this; + } + + /** + * Creates an instance of TranscriptionContent class. + * + * @param options the options value to set. + */ + @Generated + public TranscriptionContent(TranscriptionOptions options) { + this.options = options; + } + + /* + * Metadata for a transcription request. This field contains a JSON-serialized object of type + * `TranscriptionOptions`. + */ + @Generated + private final TranscriptionOptions options; + + /** + * Get the options property: Metadata for a transcription request. This field contains a JSON-serialized object of + * type `TranscriptionOptions`. + * + * @return the options value. 
+     */
+    @Generated
+    public TranscriptionOptions getOptions() {
+        return this.options;
+    }
+}
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscriptionDiarizationOptions.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscriptionDiarizationOptions.java
new file mode 100644
index 000000000000..831c546c31ed
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscriptionDiarizationOptions.java
@@ -0,0 +1,115 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Code generated by Microsoft (R) TypeSpec Code Generator.
+package com.azure.ai.speech.transcription.models;
+
+import com.azure.core.annotation.Fluent;
+import com.azure.core.annotation.Generated;
+import com.azure.json.JsonReader;
+import com.azure.json.JsonSerializable;
+import com.azure.json.JsonToken;
+import com.azure.json.JsonWriter;
+import java.io.IOException;
+
+/**
+ * The Speaker Diarization settings. Diarization settings must be specified to enable speaker diarization.
+ */
+@Fluent
+public final class TranscriptionDiarizationOptions implements JsonSerializable<TranscriptionDiarizationOptions> {
+
+    /*
+     * Enable speaker diarization. This is automatically set to true when maxSpeakers is specified.
+     */
+    @Generated
+    private Boolean enabled;
+
+    /*
+     * Gets or sets a hint for the maximum number of speakers for diarization. Must be greater than 1 and less than 36.
+     */
+    @Generated
+    private Integer maxSpeakers;
+
+    /**
+     * Creates an instance of TranscriptionDiarizationOptions class.
+     */
+    @Generated
+    public TranscriptionDiarizationOptions() {
+    }
+
+    /**
+     * Get the enabled property: Enable speaker diarization. This is automatically set to true when maxSpeakers is
+     * specified.
+     *
+     * @return the enabled value.
+     */
+    @Generated
+    public Boolean isEnabled() {
+        return this.enabled;
+    }
+
+    /**
+     * Get the maxSpeakers property: Gets or sets a hint for the maximum number of speakers for diarization. Must be
+     * greater than 1 and less than 36.
+     *
+     * @return the maxSpeakers value.
+     */
+    @Generated
+    public Integer getMaxSpeakers() {
+        return this.maxSpeakers;
+    }
+
+    /**
+     * Set the maxSpeakers property: Gets or sets a hint for the maximum number of speakers for diarization. Must be
+     * greater than 1 and less than 36.
+     *
+     * @param maxSpeakers the maxSpeakers value to set.
+     * @return the TranscriptionDiarizationOptions object itself.
+     */
+    @Generated
+    public TranscriptionDiarizationOptions setMaxSpeakers(Integer maxSpeakers) {
+        this.maxSpeakers = maxSpeakers;
+        return this;
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Generated
+    @Override
+    public JsonWriter toJson(JsonWriter jsonWriter) throws IOException {
+        jsonWriter.writeStartObject();
+        if (this.maxSpeakers != null) {
+            jsonWriter.writeBooleanField("enabled", true);
+            jsonWriter.writeNumberField("maxSpeakers", this.maxSpeakers);
+        }
+        return jsonWriter.writeEndObject();
+    }
+
+    /**
+     * Reads an instance of TranscriptionDiarizationOptions from the JsonReader.
+     *
+     * @param jsonReader The JsonReader being read.
+     * @return An instance of TranscriptionDiarizationOptions if the JsonReader was pointing to an instance of it, or
+     * null if it was pointing to JSON null.
+     * @throws IOException If an error occurs while reading the TranscriptionDiarizationOptions.
+     */
+    @Generated
+    public static TranscriptionDiarizationOptions fromJson(JsonReader jsonReader) throws IOException {
+        return jsonReader.readObject(reader -> {
+            TranscriptionDiarizationOptions deserializedTranscriptionDiarizationOptions
+                = new TranscriptionDiarizationOptions();
+            while (reader.nextToken() != JsonToken.END_OBJECT) {
+                String fieldName = reader.getFieldName();
+                reader.nextToken();
+                if ("enabled".equals(fieldName)) {
+                    deserializedTranscriptionDiarizationOptions.enabled = reader.getNullable(JsonReader::getBoolean);
+                } else if ("maxSpeakers".equals(fieldName)) {
+                    deserializedTranscriptionDiarizationOptions.maxSpeakers = reader.getNullable(JsonReader::getInt);
+                } else {
+                    reader.skipChildren();
+                }
+            }
+            return deserializedTranscriptionDiarizationOptions;
+        });
+    }
+}
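Because `enabled` has no public setter and `toJson` only emits the settings when `maxSpeakers` is set, `setMaxSpeakers` is the intended entry point. A minimal usage sketch (editor's illustration, not part of this PR; `audioFileDetails` is an assumed pre-built `AudioFileDetails`, as in the samples later in this diff):

```java
// Setting maxSpeakers (must be greater than 1 and less than 36) implicitly enables diarization.
TranscriptionDiarizationOptions diarization = new TranscriptionDiarizationOptions()
    .setMaxSpeakers(4);

TranscriptionOptions options = new TranscriptionOptions(audioFileDetails)
    .setDiarizationOptions(diarization);
```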
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscriptionOptions.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscriptionOptions.java
new file mode 100644
index 000000000000..2b298d84d59a
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscriptionOptions.java
@@ -0,0 +1,350 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Code generated by Microsoft (R) TypeSpec Code Generator.
+package com.azure.ai.speech.transcription.models;
+
+import com.azure.core.annotation.Fluent;
+import com.azure.core.annotation.Generated;
+import com.azure.json.JsonReader;
+import com.azure.json.JsonSerializable;
+import com.azure.json.JsonToken;
+import com.azure.json.JsonWriter;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Metadata for a transcription request.
+ */
+@Fluent
+public final class TranscriptionOptions implements JsonSerializable<TranscriptionOptions> {
+
+    /*
+     * The URL of the audio to be transcribed. The audio must be shorter than 2 hours in audio duration and smaller
+     * than 250 MB in size. If both Audio and AudioUrl are provided, Audio is used.
+     */
+    @Generated
+    private String audioUrl;
+
+    /*
+     * A list of possible locales for the transcription. If not specified, the locale of the speech in the audio is
+     * detected automatically from all supported locales.
+     */
+    @Generated
+    private List<String> locales;
+
+    /*
+     * Maps some or all candidate locales to a model URI to be used for transcription. If no mapping is given, the
+     * default model for the locale is used.
+     */
+    @Generated
+    private Map<String, String> localeModelMapping;
+
+    /*
+     * Mode of profanity filtering.
+     */
+    @Generated
+    private ProfanityFilterMode profanityFilterMode;
+
+    /*
+     * Mode of diarization.
+     */
+    @Generated
+    private TranscriptionDiarizationOptions diarizationOptions;
+
+    /*
+     * The 0-based indices of the channels to be transcribed separately. If not specified, multiple channels are merged
+     * and transcribed jointly. Only up to two channels are supported.
+     */
+    @Generated
+    private List<Integer> activeChannels;
+
+    /*
+     * Enhanced mode properties.
+     */
+    @Generated
+    private EnhancedModeOptions enhancedModeOptions;
+
+    /*
+     * Phrase list properties.
+     */
+    @Generated
+    private PhraseListOptions phraseListOptions;
+
+    /**
+     * Get the audioUrl property: The URL of the audio to be transcribed. The audio must be shorter than 2 hours in
+     * audio duration and smaller than 250 MB in size. If both Audio and AudioUrl are provided, Audio is used.
+     *
+     * @return the audioUrl value.
+     */
+    @Generated
+    public String getAudioUrl() {
+        return this.audioUrl;
+    }
+
+    /**
+     * Set the audioUrl property: The URL of the audio to be transcribed. The audio must be shorter than 2 hours in
+     * audio duration and smaller than 250 MB in size. If both Audio and AudioUrl are provided, Audio is used.
+     *
+     * @param audioUrl the audioUrl value to set.
+     * @return the TranscriptionOptions object itself.
+     */
+    @Generated
+    private TranscriptionOptions setAudioUrl(String audioUrl) {
+        this.audioUrl = audioUrl;
+        return this;
+    }
+
+    /**
+     * Get the locales property: A list of possible locales for the transcription. If not specified, the locale of the
+     * speech in the audio is detected automatically from all supported locales.
+     *
+     * @return the locales value.
+     */
+    @Generated
+    public List<String> getLocales() {
+        return this.locales;
+    }
+
+    /**
+     * Set the locales property: A list of possible locales for the transcription. If not specified, the locale of the
+     * speech in the audio is detected automatically from all supported locales.
+     *
+     * @param locales the locales value to set.
+     * @return the TranscriptionOptions object itself.
+     */
+    @Generated
+    public TranscriptionOptions setLocales(List<String> locales) {
+        this.locales = locales;
+        return this;
+    }
+
+    /**
+     * Get the localeModelMapping property: Maps some or all candidate locales to a model URI to be used for
+     * transcription. If no mapping is given, the default model for the locale is used.
+     *
+     * @return the localeModelMapping value.
+     */
+    @Generated
+    public Map<String, String> getLocaleModelMapping() {
+        return this.localeModelMapping;
+    }
+
+    /**
+     * Set the localeModelMapping property: Maps some or all candidate locales to a model URI to be used for
+     * transcription. If no mapping is given, the default model for the locale is used.
+     *
+     * @param localeModelMapping the localeModelMapping value to set.
+     * @return the TranscriptionOptions object itself.
+     */
+    @Generated
+    public TranscriptionOptions setLocaleModelMapping(Map<String, String> localeModelMapping) {
+        this.localeModelMapping = localeModelMapping;
+        return this;
+    }
+
+    /**
+     * Get the profanityFilterMode property: Mode of profanity filtering.
+     *
+     * @return the profanityFilterMode value.
+     */
+    @Generated
+    public ProfanityFilterMode getProfanityFilterMode() {
+        return this.profanityFilterMode;
+    }
+
+    /**
+     * Set the profanityFilterMode property: Mode of profanity filtering.
+     *
+     * @param profanityFilterMode the profanityFilterMode value to set.
+     * @return the TranscriptionOptions object itself.
+     */
+    @Generated
+    public TranscriptionOptions setProfanityFilterMode(ProfanityFilterMode profanityFilterMode) {
+        this.profanityFilterMode = profanityFilterMode;
+        return this;
+    }
+
+    /**
+     * Get the diarizationOptions property: Mode of diarization.
+     *
+     * @return the diarizationOptions value.
+     */
+    @Generated
+    public TranscriptionDiarizationOptions getDiarizationOptions() {
+        return this.diarizationOptions;
+    }
+
+    /**
+     * Set the diarizationOptions property: Mode of diarization.
+     *
+     * @param diarizationOptions the diarizationOptions value to set.
+     * @return the TranscriptionOptions object itself.
+     */
+    @Generated
+    public TranscriptionOptions setDiarizationOptions(TranscriptionDiarizationOptions diarizationOptions) {
+        this.diarizationOptions = diarizationOptions;
+        return this;
+    }
+
+    /**
+     * Get the activeChannels property: The 0-based indices of the channels to be transcribed separately. If not
+     * specified, multiple channels are merged and transcribed jointly. Only up to two channels are supported.
+     *
+     * @return the activeChannels value.
+     */
+    @Generated
+    public List<Integer> getActiveChannels() {
+        return this.activeChannels;
+    }
+
+    /**
+     * Set the activeChannels property: The 0-based indices of the channels to be transcribed separately. If not
+     * specified, multiple channels are merged and transcribed jointly. Only up to two channels are supported.
+     *
+     * @param activeChannels the activeChannels value to set.
+     * @return the TranscriptionOptions object itself.
+     */
+    @Generated
+    public TranscriptionOptions setActiveChannels(List<Integer> activeChannels) {
+        this.activeChannels = activeChannels;
+        return this;
+    }
+
+    /**
+     * Get the enhancedModeOptions property: Enhanced mode properties.
+     *
+     * @return the enhancedModeOptions value.
+     */
+    @Generated
+    public EnhancedModeOptions getEnhancedModeOptions() {
+        return this.enhancedModeOptions;
+    }
+
+    /**
+     * Set the enhancedModeOptions property: Enhanced mode properties.
+     *
+     * @param enhancedModeOptions the enhancedModeOptions value to set.
+     * @return the TranscriptionOptions object itself.
+     */
+    @Generated
+    public TranscriptionOptions setEnhancedModeOptions(EnhancedModeOptions enhancedModeOptions) {
+        this.enhancedModeOptions = enhancedModeOptions;
+        return this;
+    }
+
+    /**
+     * Get the phraseListOptions property: Phrase list properties.
+     *
+     * @return the phraseListOptions value.
+     */
+    @Generated
+    public PhraseListOptions getPhraseListOptions() {
+        return this.phraseListOptions;
+    }
+
+    /**
+     * Set the phraseListOptions property: Phrase list properties.
+     *
+     * @param phraseListOptions the phraseListOptions value to set.
+     * @return the TranscriptionOptions object itself.
+     */
+    @Generated
+    public TranscriptionOptions setPhraseListOptions(PhraseListOptions phraseListOptions) {
+        this.phraseListOptions = phraseListOptions;
+        return this;
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Generated
+    @Override
+    public JsonWriter toJson(JsonWriter jsonWriter) throws IOException {
+        jsonWriter.writeStartObject();
+        jsonWriter.writeStringField("audioUrl", this.audioUrl);
+        jsonWriter.writeArrayField("locales", this.locales, (writer, element) -> writer.writeString(element));
+        jsonWriter.writeMapField("models", this.localeModelMapping, (writer, element) -> writer.writeString(element));
+        jsonWriter.writeStringField("profanityFilterMode",
+            this.profanityFilterMode == null ? null : this.profanityFilterMode.toString());
+        jsonWriter.writeJsonField("diarization", this.diarizationOptions);
+        jsonWriter.writeArrayField("channels", this.activeChannels, (writer, element) -> writer.writeInt(element));
+        jsonWriter.writeJsonField("enhancedMode", this.enhancedModeOptions);
+        jsonWriter.writeJsonField("phraseList", this.phraseListOptions);
+        return jsonWriter.writeEndObject();
+    }
+
+    /**
+     * Reads an instance of TranscriptionOptions from the JsonReader.
+     *
+     * @param jsonReader The JsonReader being read.
+     * @return An instance of TranscriptionOptions if the JsonReader was pointing to an instance of it, or null if it
+     * was pointing to JSON null.
+     * @throws IOException If an error occurs while reading the TranscriptionOptions.
+     */
+    @Generated
+    public static TranscriptionOptions fromJson(JsonReader jsonReader) throws IOException {
+        return jsonReader.readObject(reader -> {
+            TranscriptionOptions deserializedTranscriptionOptions = new TranscriptionOptions((String) null);
+            while (reader.nextToken() != JsonToken.END_OBJECT) {
+                String fieldName = reader.getFieldName();
+                reader.nextToken();
+                if ("audioUrl".equals(fieldName)) {
+                    deserializedTranscriptionOptions.audioUrl = reader.getString();
+                } else if ("locales".equals(fieldName)) {
+                    List<String> locales = reader.readArray(reader1 -> reader1.getString());
+                    deserializedTranscriptionOptions.locales = locales;
+                } else if ("models".equals(fieldName)) {
+                    Map<String, String> localeModelMapping = reader.readMap(reader1 -> reader1.getString());
+                    deserializedTranscriptionOptions.localeModelMapping = localeModelMapping;
+                } else if ("profanityFilterMode".equals(fieldName)) {
+                    deserializedTranscriptionOptions.profanityFilterMode
+                        = ProfanityFilterMode.fromString(reader.getString());
+                } else if ("diarization".equals(fieldName)) {
+                    deserializedTranscriptionOptions.diarizationOptions
+                        = TranscriptionDiarizationOptions.fromJson(reader);
+                } else if ("channels".equals(fieldName)) {
+                    List<Integer> activeChannels = reader.readArray(reader1 -> reader1.getInt());
+                    deserializedTranscriptionOptions.activeChannels = activeChannels;
+                } else if ("enhancedMode".equals(fieldName)) {
+                    deserializedTranscriptionOptions.enhancedModeOptions = EnhancedModeOptions.fromJson(reader);
+                } else if ("phraseList".equals(fieldName)) {
+                    deserializedTranscriptionOptions.phraseListOptions = PhraseListOptions.fromJson(reader);
+                } else {
+                    reader.skipChildren();
+                }
+            }
+            return deserializedTranscriptionOptions;
+        });
+    }
+
+    private final AudioFileDetails audioFileDetails;
+
+    /**
+     * Creates an instance of TranscriptionOptions class with audio URL.
+     *
+     * @param audioUrl the URL of the audio to be transcribed
+     */
+    public TranscriptionOptions(String audioUrl) {
+        this.audioUrl = audioUrl;
+        this.audioFileDetails = null;
+    }
+
+    /**
+     * Creates an instance of TranscriptionOptions class with audio file details.
+     *
+     * @param fileDetails the audio file details
+     */
+    public TranscriptionOptions(AudioFileDetails fileDetails) {
+        this.audioFileDetails = fileDetails;
+    }
+
+    /**
+     * Get the audioFileDetails property: The audio file details for transcription.
+     *
+     * @return the audioFileDetails value.
+     */
+    public AudioFileDetails getFileDetails() {
+        return this.audioFileDetails;
+    }
+}
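`TranscriptionOptions` deliberately exposes two public constructors, one per audio source; per the field documentation, inline audio wins when both are supplied. A short sketch of the choice (editor's illustration, not part of this PR; `audioBytes` is an assumed in-memory byte array, and `BinaryData` comes from `com.azure.core.util`):

```java
// Remote audio: the service fetches the file from the URL.
TranscriptionOptions fromUrl = new TranscriptionOptions("https://example.com/audio.wav");

// Inline audio: the file content is uploaded with the request.
TranscriptionOptions fromFile = new TranscriptionOptions(
    new AudioFileDetails(BinaryData.fromBytes(audioBytes)));
```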
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscriptionResult.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscriptionResult.java
new file mode 100644
index 000000000000..12d5a00d20a1
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/TranscriptionResult.java
@@ -0,0 +1,130 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Code generated by Microsoft (R) TypeSpec Code Generator.
+package com.azure.ai.speech.transcription.models;
+
+import com.azure.core.annotation.Generated;
+import com.azure.core.annotation.Immutable;
+import com.azure.json.JsonReader;
+import com.azure.json.JsonSerializable;
+import com.azure.json.JsonToken;
+import com.azure.json.JsonWriter;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.List;
+
+/**
+ * The result of the transcribe operation.
+ */
+@Immutable
+public final class TranscriptionResult implements JsonSerializable<TranscriptionResult> {
+
+    /*
+     * The duration of the audio in milliseconds.
+     */
+    @Generated
+    private final int duration;
+
+    /*
+     * The full transcript for each channel.
+     */
+    @Generated
+    private final List<ChannelCombinedPhrases> combinedPhrases;
+
+    /*
+     * The transcription results segmented into phrases.
+     */
+    @Generated
+    private final List<TranscribedPhrase> phrases;
+
+    /**
+     * Creates an instance of TranscriptionResult class.
+     *
+     * @param duration the duration value to set.
+     * @param combinedPhrases the combinedPhrases value to set.
+     * @param phrases the phrases value to set.
+     */
+    @Generated
+    private TranscriptionResult(int duration, List<ChannelCombinedPhrases> combinedPhrases,
+        List<TranscribedPhrase> phrases) {
+        this.duration = duration;
+        this.combinedPhrases = combinedPhrases;
+        this.phrases = phrases;
+    }
+
+    /**
+     * Get the duration property: The duration of the audio in milliseconds.
+     *
+     * @return the duration value as a {@link Duration}.
+     */
+    @Generated
+    public Duration getDuration() {
+        return Duration.ofMillis(this.duration);
+    }
+
+    /**
+     * Get the combinedPhrases property: The full transcript for each channel.
+     *
+     * @return the combinedPhrases value.
+     */
+    @Generated
+    public List<ChannelCombinedPhrases> getCombinedPhrases() {
+        return this.combinedPhrases;
+    }
+
+    /**
+     * Get the phrases property: The transcription results segmented into phrases.
+     *
+     * @return the phrases value.
+     */
+    @Generated
+    public List<TranscribedPhrase> getPhrases() {
+        return this.phrases;
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Generated
+    @Override
+    public JsonWriter toJson(JsonWriter jsonWriter) throws IOException {
+        jsonWriter.writeStartObject();
+        jsonWriter.writeIntField("durationMilliseconds", this.duration);
+        jsonWriter.writeArrayField("combinedPhrases", this.combinedPhrases,
+            (writer, element) -> writer.writeJson(element));
+        jsonWriter.writeArrayField("phrases", this.phrases, (writer, element) -> writer.writeJson(element));
+        return jsonWriter.writeEndObject();
+    }
+
+    /**
+     * Reads an instance of TranscriptionResult from the JsonReader.
+     *
+     * @param jsonReader The JsonReader being read.
+     * @return An instance of TranscriptionResult if the JsonReader was pointing to an instance of it, or null if it was
+     * pointing to JSON null.
+     * @throws IllegalStateException If the deserialized JSON object was missing any required properties.
+     * @throws IOException If an error occurs while reading the TranscriptionResult.
+     */
+    @Generated
+    public static TranscriptionResult fromJson(JsonReader jsonReader) throws IOException {
+        return jsonReader.readObject(reader -> {
+            int duration = 0;
+            List<ChannelCombinedPhrases> combinedPhrases = null;
+            List<TranscribedPhrase> phrases = null;
+            while (reader.nextToken() != JsonToken.END_OBJECT) {
+                String fieldName = reader.getFieldName();
+                reader.nextToken();
+                if ("durationMilliseconds".equals(fieldName)) {
+                    duration = reader.getInt();
+                } else if ("combinedPhrases".equals(fieldName)) {
+                    combinedPhrases = reader.readArray(reader1 -> ChannelCombinedPhrases.fromJson(reader1));
+                } else if ("phrases".equals(fieldName)) {
+                    phrases = reader.readArray(reader1 -> TranscribedPhrase.fromJson(reader1));
+                } else {
+                    reader.skipChildren();
+                }
+            }
+            return new TranscriptionResult(duration, combinedPhrases, phrases);
+        });
+    }
+}
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/package-info.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/package-info.java
new file mode 100644
index 000000000000..f1ff4db6a641
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/models/package-info.java
@@ -0,0 +1,9 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Code generated by Microsoft (R) TypeSpec Code Generator.
+/**
+ *
+ * Package containing the data models for Transcription.
+ *
+ */
+package com.azure.ai.speech.transcription.models;
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/package-info.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/package-info.java
new file mode 100644
index 000000000000..8e50f3224f2d
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/com/azure/ai/speech/transcription/package-info.java
@@ -0,0 +1,9 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Code generated by Microsoft (R) TypeSpec Code Generator.
+/**
+ *
+ * Package containing the classes for Transcription.
+ *
+ */
+package com.azure.ai.speech.transcription;
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/java/module-info.java b/sdk/transcription/azure-ai-speech-transcription/src/main/java/module-info.java
new file mode 100644
index 000000000000..434a8672d9f3
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/main/java/module-info.java
@@ -0,0 +1,12 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Code generated by Microsoft (R) TypeSpec Code Generator.
+ +module com.azure.ai.speech.transcription { + requires transitive com.azure.core; + + exports com.azure.ai.speech.transcription; + exports com.azure.ai.speech.transcription.models; + + opens com.azure.ai.speech.transcription.models to com.azure.core; +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/resources/META-INF/azure-ai-speech-transcription_apiview_properties.json b/sdk/transcription/azure-ai-speech-transcription/src/main/resources/META-INF/azure-ai-speech-transcription_apiview_properties.json new file mode 100644 index 000000000000..e70a697123f8 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/main/resources/META-INF/azure-ai-speech-transcription_apiview_properties.json @@ -0,0 +1,23 @@ +{ + "flavor": "azure", + "CrossLanguageDefinitionId": { + "com.azure.ai.speech.transcription.TranscriptionAsyncClient": "Azure.Ai.Speech.Transcription", + "com.azure.ai.speech.transcription.TranscriptionAsyncClient.transcribe": "Azure.Ai.Speech.Transcription.transcribe", + "com.azure.ai.speech.transcription.TranscriptionAsyncClient.transcribeWithResponse": "Azure.Ai.Speech.Transcription.transcribe", + "com.azure.ai.speech.transcription.TranscriptionClient": "Azure.Ai.Speech.Transcription", + "com.azure.ai.speech.transcription.TranscriptionClient.transcribe": "Azure.Ai.Speech.Transcription.transcribe", + "com.azure.ai.speech.transcription.TranscriptionClient.transcribeWithResponse": "Azure.Ai.Speech.Transcription.transcribe", + "com.azure.ai.speech.transcription.TranscriptionClientBuilder": "Azure.Ai.Speech.Transcription", + "com.azure.ai.speech.transcription.models.AudioFileDetails": null, + "com.azure.ai.speech.transcription.models.ChannelCombinedPhrases": "Azure.Ai.Speech.Transcription.ChannelCombinedPhrases", + "com.azure.ai.speech.transcription.models.EnhancedModeOptions": "Azure.Ai.Speech.Transcription.EnhancedModeProperties", + "com.azure.ai.speech.transcription.models.PhraseListOptions": "Azure.Ai.Speech.Transcription.PhraseListProperties", + "com.azure.ai.speech.transcription.models.ProfanityFilterMode": "Azure.Ai.Speech.Transcription.ProfanityFilterMode", + "com.azure.ai.speech.transcription.models.TranscribedPhrase": "Azure.Ai.Speech.Transcription.TranscribedPhrase", + "com.azure.ai.speech.transcription.models.TranscribedWord": "Azure.Ai.Speech.Transcription.TranscribedWord", + "com.azure.ai.speech.transcription.models.TranscriptionContent": "Azure.Ai.Speech.Transcription.TranscriptionContent", + "com.azure.ai.speech.transcription.models.TranscriptionDiarizationOptions": "Azure.Ai.Speech.Transcription.TranscriptionDiarizationOptions", + "com.azure.ai.speech.transcription.models.TranscriptionOptions": "Azure.Ai.Speech.Transcription.TranscriptionOptions", + "com.azure.ai.speech.transcription.models.TranscriptionResult": "Azure.Ai.Speech.Transcription.TranscriptionResult" + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/resources/META-INF/azure-ai-speech-transcription_metadata.json b/sdk/transcription/azure-ai-speech-transcription/src/main/resources/META-INF/azure-ai-speech-transcription_metadata.json new file mode 100644 index 000000000000..7ff97cdfa258 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/main/resources/META-INF/azure-ai-speech-transcription_metadata.json @@ -0,0 +1 @@ 
+{"flavor":"azure","apiVersion":"2025-10-15","crossLanguageDefinitions":{"com.azure.ai.speech.transcription.TranscriptionAsyncClient":"Azure.Ai.Speech.Transcription","com.azure.ai.speech.transcription.TranscriptionAsyncClient.transcribe":"Azure.Ai.Speech.Transcription.transcribe","com.azure.ai.speech.transcription.TranscriptionAsyncClient.transcribeWithResponse":"Azure.Ai.Speech.Transcription.transcribe","com.azure.ai.speech.transcription.TranscriptionClient":"Azure.Ai.Speech.Transcription","com.azure.ai.speech.transcription.TranscriptionClient.transcribe":"Azure.Ai.Speech.Transcription.transcribe","com.azure.ai.speech.transcription.TranscriptionClient.transcribeWithResponse":"Azure.Ai.Speech.Transcription.transcribe","com.azure.ai.speech.transcription.TranscriptionClientBuilder":"Azure.Ai.Speech.Transcription","com.azure.ai.speech.transcription.models.AudioFileDetails":null,"com.azure.ai.speech.transcription.models.ChannelCombinedPhrases":"Azure.Ai.Speech.Transcription.ChannelCombinedPhrases","com.azure.ai.speech.transcription.models.EnhancedModeOptions":"Azure.Ai.Speech.Transcription.EnhancedModeProperties","com.azure.ai.speech.transcription.models.PhraseListOptions":"Azure.Ai.Speech.Transcription.PhraseListProperties","com.azure.ai.speech.transcription.models.ProfanityFilterMode":"Azure.Ai.Speech.Transcription.ProfanityFilterMode","com.azure.ai.speech.transcription.models.TranscribedPhrase":"Azure.Ai.Speech.Transcription.TranscribedPhrase","com.azure.ai.speech.transcription.models.TranscribedWord":"Azure.Ai.Speech.Transcription.TranscribedWord","com.azure.ai.speech.transcription.models.TranscriptionContent":"Azure.Ai.Speech.Transcription.TranscriptionContent","com.azure.ai.speech.transcription.models.TranscriptionDiarizationOptions":"Azure.Ai.Speech.Transcription.TranscriptionDiarizationOptions","com.azure.ai.speech.transcription.models.TranscriptionOptions":"Azure.Ai.Speech.Transcription.TranscriptionOptions","com.azure.ai.speech.transcription.models.TranscriptionResult":"Azure.Ai.Speech.Transcription.TranscriptionResult"},"generatedFiles":["src/main/java/com/azure/ai/speech/transcription/TranscriptionAsyncClient.java","src/main/java/com/azure/ai/speech/transcription/TranscriptionClient.java","src/main/java/com/azure/ai/speech/transcription/TranscriptionClientBuilder.java","src/main/java/com/azure/ai/speech/transcription/TranscriptionServiceVersion.java","src/main/java/com/azure/ai/speech/transcription/implementation/MultipartFormDataHelper.java","src/main/java/com/azure/ai/speech/transcription/implementation/TranscriptionClientImpl.java","src/main/java/com/azure/ai/speech/transcription/implementation/package-info.java","src/main/java/com/azure/ai/speech/transcription/models/AudioFileDetails.java","src/main/java/com/azure/ai/speech/transcription/models/ChannelCombinedPhrases.java","src/main/java/com/azure/ai/speech/transcription/models/EnhancedModeOptions.java","src/main/java/com/azure/ai/speech/transcription/models/PhraseListOptions.java","src/main/java/com/azure/ai/speech/transcription/models/ProfanityFilterMode.java","src/main/java/com/azure/ai/speech/transcription/models/TranscribedPhrase.java","src/main/java/com/azure/ai/speech/transcription/models/TranscribedWord.java","src/main/java/com/azure/ai/speech/transcription/models/TranscriptionContent.java","src/main/java/com/azure/ai/speech/transcription/models/TranscriptionDiarizationOptions.java","src/main/java/com/azure/ai/speech/transcription/models/TranscriptionOptions.java","src/main/java/com/azure/ai/speech/transcription/models/T
ranscriptionResult.java","src/main/java/com/azure/ai/speech/transcription/models/package-info.java","src/main/java/com/azure/ai/speech/transcription/package-info.java","src/main/java/module-info.java"]} \ No newline at end of file diff --git a/sdk/transcription/azure-ai-speech-transcription/src/main/resources/azure-ai-speech-transcription.properties b/sdk/transcription/azure-ai-speech-transcription/src/main/resources/azure-ai-speech-transcription.properties new file mode 100644 index 000000000000..ca812989b4f2 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/main/resources/azure-ai-speech-transcription.properties @@ -0,0 +1,2 @@ +name=${project.artifactId} +version=${project.version} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/samples/assets/sample-audio.wav b/sdk/transcription/azure-ai-speech-transcription/src/samples/assets/sample-audio.wav new file mode 100644 index 000000000000..bf23d54b0c00 Binary files /dev/null and b/sdk/transcription/azure-ai-speech-transcription/src/samples/assets/sample-audio.wav differ diff --git a/sdk/transcription/azure-ai-speech-transcription/src/samples/assets/sample-profanity.wav b/sdk/transcription/azure-ai-speech-transcription/src/samples/assets/sample-profanity.wav new file mode 100644 index 000000000000..e1926b3f5dcf Binary files /dev/null and b/sdk/transcription/azure-ai-speech-transcription/src/samples/assets/sample-profanity.wav differ diff --git a/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/EnhancedModeSample.java b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/EnhancedModeSample.java new file mode 100644 index 000000000000..7abd8b0fc543 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/EnhancedModeSample.java @@ -0,0 +1,146 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +package com.azure.ai.speech.transcription; + +// BEGIN: com.azure.ai.speech.transcription.enhancedmode.imports +import com.azure.ai.speech.transcription.models.AudioFileDetails; +import com.azure.ai.speech.transcription.models.EnhancedModeOptions; +import com.azure.ai.speech.transcription.models.ProfanityFilterMode; +import com.azure.ai.speech.transcription.models.TranscriptionDiarizationOptions; +import com.azure.ai.speech.transcription.models.TranscriptionOptions; +import com.azure.ai.speech.transcription.models.TranscriptionResult; +import com.azure.core.credential.KeyCredential; +import com.azure.core.util.BinaryData; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Arrays; +// END: com.azure.ai.speech.transcription.enhancedmode.imports + +/** + * Sample demonstrates how to use EnhancedModeOptions with LLM-enhanced speech transcription + * combining multiple features for optimal transcription quality. + * + * This sample shows: + * - Using lexical format prompts to guide LLM output + * - Providing domain-specific context for technical terminology + * - Enabling diarization (speaker identification) with enhanced mode + * - Configuring profanity filtering + * + * Enhanced mode leverages large language models to improve transcription quality + * by understanding context and domain-specific terminology. + */ +public class EnhancedModeSample { + /** + * Main method to run the enhanced mode sample. 
+     *
+     * @param args command line arguments (not used)
+     */
+    public static void main(String[] args) {
+        String endpoint = System.getenv("SPEECH_ENDPOINT");
+        String apiKey = System.getenv("SPEECH_API_KEY");
+
+        if (endpoint == null || apiKey == null) {
+            System.err.println("Please set SPEECH_ENDPOINT and SPEECH_API_KEY environment variables");
+            System.err.println("Example:");
+            System.err.println("  set SPEECH_ENDPOINT=https://your-resource-name.cognitiveservices.azure.com/");
+            System.err.println("  set SPEECH_API_KEY=your-api-key");
+            return;
+        }
+
+        System.out.println("Azure AI Speech Transcription - Enhanced Mode Sample");
+        System.out.println("=====================================================\n");
+
+        // Demonstrate full enhanced mode with all features combined
+        demonstrateFullEnhancedMode(endpoint, apiKey);
+    }
+
+    /**
+     * Demonstrates using full enhanced mode with multiple features combined.
+     * This shows how to use lexical format prompts, domain context, diarization,
+     * and profanity filtering together for optimal transcription quality.
+     */
+    private static void demonstrateFullEnhancedMode(String endpoint, String apiKey) {
+        System.out.println("Enhanced Mode with Multiple Features Combined");
+        System.out.println("----------------------------------------------");
+
+        TranscriptionClient client = new TranscriptionClientBuilder()
+            .endpoint(endpoint)
+            .credential(new KeyCredential(apiKey))
+            .buildClient();
+
+        try {
+            String audioFilePath = "src/samples/assets/sample-audio.wav";
+            if (!Files.exists(Paths.get(audioFilePath))) {
+                System.out.println("Audio file not found: " + audioFilePath);
+                System.out.println("  Skipping this example.\n");
+                return;
+            }
+
+            byte[] audioData = Files.readAllBytes(Paths.get(audioFilePath));
+
+            // Use the helper method to demonstrate full configuration
+            TranscriptionResult result = transcribeWithFullEnhancedMode(client, audioData, audioFilePath);
+
+            System.out.println("Full enhanced mode configuration applied");
+            System.out.println("Features: LLM prompts, diarization, profanity filtering");
+            System.out.println("Duration: " + result.getDuration().toMillis() + " ms");
+            if (result.getCombinedPhrases() != null && !result.getCombinedPhrases().isEmpty()) {
+                System.out.println("\nTranscription: " + result.getCombinedPhrases().get(0).getText());
+            }
+            if (result.getPhrases() != null && !result.getPhrases().isEmpty()) {
+                System.out.println("\nPhrases with speakers:");
+                result.getPhrases().forEach(phrase ->
+                    System.out.println("  [Speaker " + phrase.getSpeaker() + ", "
+                        + phrase.getOffset() + " ms] " + phrase.getText())
+                );
+            }
+            System.out.println();
+
+        } catch (IOException e) {
+            System.err.println("Error reading audio file: " + e.getMessage() + "\n");
+        } catch (Exception e) {
+            System.err.println("Error during transcription: " + e.getMessage() + "\n");
+        }
+    }
+
+    /**
+     * Helper method demonstrating how to combine all enhanced mode features.
+     * This is a reusable pattern for high-quality LLM-enhanced transcription.
+ */ + // BEGIN: com.azure.ai.speech.transcription.enhancedmode.complete + private static TranscriptionResult transcribeWithFullEnhancedMode( + TranscriptionClient client, + byte[] audioData, + String filename + ) throws Exception { + // Create audio file details + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + + // Configure comprehensive LLM-enhanced mode settings + // Enhanced mode is automatically enabled when you create EnhancedModeOptions + // Always include lexical format prompt for best results + EnhancedModeOptions enhancedMode = new EnhancedModeOptions() + .setTask("transcribe") + .setPrompts(Arrays.asList( + "Output must be in lexical format." + )); + + // Enable diarization for speaker identification + TranscriptionDiarizationOptions diarizationOptions = new TranscriptionDiarizationOptions() + .setMaxSpeakers(5); + + // Create transcription options with all features enabled + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails) + .setLocales(Arrays.asList()) + .setEnhancedModeOptions(enhancedMode) + .setDiarizationOptions(diarizationOptions) + .setProfanityFilterMode(ProfanityFilterMode.MASKED); + + // Transcribe with full LLM-enhanced mode and diarization + return client.transcribe(options); + } + // END: com.azure.ai.speech.transcription.enhancedmode.complete +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/README.md b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/README.md new file mode 100644 index 000000000000..65eb1d9979d5 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/README.md @@ -0,0 +1,260 @@ +# Azure AI Speech Transcription Samples + +This directory contains runnable code samples that demonstrate how to use the Azure AI Speech Transcription client library for Java. + +## Prerequisites + +To run these samples, you need: + +1. **Azure Subscription**: An active Azure subscription +2. **Azure AI Speech Service Resource**: Create one in the [Azure Portal](https://portal.azure.com) +3. **Authentication**: Choose one of the following authentication methods: + +### Option 1: Entra ID Authentication (Recommended for Production) + + Set the endpoint and configure Entra ID credentials: + + ```bash + set SPEECH_ENDPOINT=https://your-resource-name.cognitiveservices.azure.com/ + ``` + + **And** configure one of the following credential sources: + - **Managed Identity**: For apps running in Azure (App Service, Azure Functions, VMs, etc.) 
+- **Azure CLI**: Run `az login` on your development machine
+- **Environment Variables**: Set `AZURE_TENANT_ID`, `AZURE_CLIENT_ID`, and `AZURE_CLIENT_SECRET`
+- **Visual Studio Code or IntelliJ**: Sign in through your IDE
+
+**Note**: You'll also need to assign the "Cognitive Services User" role to your identity:
+
+```bash
+az role assignment create --assignee <your-identity-id> \
+  --role "Cognitive Services User" \
+  --scope /subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.CognitiveServices/accounts/<your-speech-resource>
+```
+
+**Required dependency** for Entra ID authentication:
+
+```xml
+<dependency>
+    <groupId>com.azure</groupId>
+    <artifactId>azure-identity</artifactId>
+    <version>1.13.0</version>
+</dependency>
+```
+
+### Option 2: API Key Authentication (Easier for Getting Started)
+
+Set these environment variables:
+
+```bash
+set SPEECH_ENDPOINT=https://your-resource-name.cognitiveservices.azure.com/
+set SPEECH_API_KEY=your-api-key
+```
+
+No additional dependencies are required for API key authentication.
+
+4. **Audio File**: Some samples require an audio file named `sample-audio.wav` in the working directory
+
+## Authentication Methods
+
+All samples in this directory support **both authentication methods**:
+
+- **Entra ID (TokenCredential)**: Uses `DefaultAzureCredential` from azure-identity
+- **API Key (KeyCredential)**: Uses the `SPEECH_API_KEY` environment variable
+
+The samples will automatically detect which authentication method to use based on the environment variables you've set. If `SPEECH_API_KEY` is set, it will use API Key authentication; otherwise, it will attempt Entra ID authentication, as in the sketch below.
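A minimal sketch of that detection logic (editor's illustration, not the shipped sample code; the empty-string check is an assumption):

```java
import com.azure.core.credential.KeyCredential;
import com.azure.identity.DefaultAzureCredentialBuilder;

// Prefer SPEECH_API_KEY when present, otherwise fall back to Entra ID.
TranscriptionClientBuilder builder = new TranscriptionClientBuilder()
    .endpoint(System.getenv("SPEECH_ENDPOINT"));
String apiKey = System.getenv("SPEECH_API_KEY");
TranscriptionClient client = (apiKey != null && !apiKey.isEmpty())
    ? builder.credential(new KeyCredential(apiKey)).buildClient()
    : builder.credential(new DefaultAzureCredentialBuilder().build()).buildClient();
```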
+## Available Samples
+
+### TranscribeAudioFileSample.java
+
+**Champion scenario**: Basic audio transcription from a file
+
+Demonstrates the most common use case - transcribing a single audio file with minimal configuration.
+
+**Key features**:
+
+- Creating a TranscriptionClient
+- Reading an audio file
+- Transcribing with default options
+- Processing results
+
+**Run**:
+
+```bash
+cd sdk/transcription/azure-ai-speech-transcription
+mvn exec:java -Dexec.mainClass="com.azure.ai.speech.transcription.TranscribeAudioFileSample"
+```
+
+---
+
+### TranscribeFromUrlSample.java
+
+**Champion scenario**: Transcribe audio from a URL
+
+Demonstrates how to transcribe audio directly from a URL without downloading the file locally.
+
+**Key features**:
+
+- Creating TranscriptionOptions with a URL
+- Transcribing remote audio files
+
+**Run**:
+
+```bash
+mvn exec:java -Dexec.mainClass="com.azure.ai.speech.transcription.TranscribeFromUrlSample"
+```
+
+---
+
+### TranscribeMultiLanguageSample.java
+
+**Champion scenario**: Multi-language transcription
+
+Demonstrates how to transcribe audio containing multiple languages with automatic language detection.
+
+**Key features**:
+
+- Automatic language detection
+- Handling multi-language results
+
+**Run**:
+
+```bash
+mvn exec:java -Dexec.mainClass="com.azure.ai.speech.transcription.TranscribeMultiLanguageSample"
+```
+
+---
+
+### EnhancedModeSample.java
+
+**Champion scenario**: Enhanced transcription quality
+
+Demonstrates how to use enhanced mode with custom prompts and other advanced features.
+
+**Key features**:
+
+- Using EnhancedModeOptions
+- Providing custom prompts for better accuracy
+- Specifying task types
+
+**Run**:
+
+```bash
+mvn exec:java -Dexec.mainClass="com.azure.ai.speech.transcription.EnhancedModeSample"
+```
+
+---
+
+### TranscribeWithDiarizationSample.java
+
+**Champion scenario**: Speaker diarization
+
+Demonstrates how to identify different speakers in the audio.
+
+**Key features**:
+
+- Enabling speaker diarization
+- Configuring max speakers
+- Processing speaker-separated results
+
+**Run**:
+
+```bash
+mvn exec:java -Dexec.mainClass="com.azure.ai.speech.transcription.TranscribeWithDiarizationSample"
+```
+
+---
+
+### TranscribeWithPhraseListSample.java
+
+**Champion scenario**: Improving accuracy with phrase lists
+
+Demonstrates how to use a phrase list to improve recognition of specific terms.
+
+**Key features**:
+
+- Creating a PhraseListOptions
+- Adding custom phrases and boosting their probability
+- Improving accuracy for domain-specific terminology
+
+**Run**:
+
+```bash
+mvn exec:java -Dexec.mainClass="com.azure.ai.speech.transcription.TranscribeWithPhraseListSample"
+```
+
+---
+
+### TranscribeWithProfanityFilterSample.java
+
+**Champion scenario**: Profanity filtering
+
+Demonstrates how to configure profanity filtering options.
+
+**Key features**:
+
+- Setting ProfanityFilterMode (Masked, Removed, None)
+- Handling filtered results
+
+**Run**:
+
+```bash
+mvn exec:java -Dexec.mainClass="com.azure.ai.speech.transcription.TranscribeWithProfanityFilterSample"
+```
+
+---
+
+### ReadmeSamples.java
+
+Code snippets used in the main README.md and API documentation (JavaDoc).
+
+**Note**: This file is used by the `codesnippet-maven-plugin` to inject code into documentation. It's not meant to be run directly.
+
+## Supported Audio Formats
+
+The service supports various audio formats:
+
+- **WAV** (recommended: 16 kHz, 16-bit, mono PCM)
+- **MP3**
+- **OGG**
+- **FLAC**
+- And more
+
+**Constraints**:
+
+- Maximum file size: 250 MB
+- Maximum duration: 2 hours
+
+## Getting Help
+
+- [Azure AI Speech Documentation](https://learn.microsoft.com/azure/ai-services/speech-service/)
+- [SDK README](https://github.com/Azure/azure-sdk-for-java/tree/main/sdk/transcription/azure-ai-speech-transcription)
+- [GitHub Issues](https://github.com/Azure/azure-sdk-for-java/issues)
+
+## Additional Resources
+
+- [Azure SDK for Java Guidelines](https://azure.github.io/azure-sdk/java_introduction.html)
+- [Project Reactor Documentation](https://projectreactor.io/docs)
+- [Azure SDK Blog](https://devblogs.microsoft.com/azure-sdk/)
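One practical note on the constraints listed in the README above: the 250 MB size limit can be pre-checked cheaply on the client before uploading (the 2-hour duration limit generally requires decoding the audio, so it is not shown). A minimal sketch (editor's illustration, not part of the PR):

```java
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

Path audio = Paths.get("src/samples/assets/sample-audio.wav");
long maxBytes = 250L * 1024 * 1024; // documented 250 MB service limit
if (Files.size(audio) > maxBytes) { // Files.size throws IOException on read failure
    throw new IOException("Audio file exceeds the 250 MB service limit: " + audio);
}
```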
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/ReadmeSamples.java b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/ReadmeSamples.java
new file mode 100644
index 000000000000..6dfd93d716c4
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/ReadmeSamples.java
@@ -0,0 +1,423 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Code generated by Microsoft (R) TypeSpec Code Generator.
+
+package com.azure.ai.speech.transcription;
+
+import com.azure.ai.speech.transcription.models.AudioFileDetails;
+import com.azure.ai.speech.transcription.models.EnhancedModeOptions;
+import com.azure.ai.speech.transcription.models.PhraseListOptions;
+import com.azure.ai.speech.transcription.models.ProfanityFilterMode;
+import com.azure.ai.speech.transcription.models.TranscriptionDiarizationOptions;
+import com.azure.ai.speech.transcription.models.TranscriptionOptions;
+import com.azure.ai.speech.transcription.models.TranscriptionResult;
+import com.azure.core.credential.KeyCredential;
+import com.azure.core.http.policy.ExponentialBackoffOptions;
+import com.azure.core.http.policy.HttpLogDetailLevel;
+import com.azure.core.http.policy.HttpLogOptions;
+import com.azure.core.http.policy.RetryOptions;
+import com.azure.core.util.BinaryData;
+
+import java.nio.file.Files;
+import java.nio.file.Paths;
+
+public final class ReadmeSamples {
+    /**
+     * Sample for basic audio transcription.
+     */
+    public void readmeSamples() {
+        // BEGIN: com.azure.ai.speech.transcription.readme
+        TranscriptionClient client = new TranscriptionClientBuilder()
+            .endpoint("https://<your-resource-name>.cognitiveservices.azure.com/")
+            .credential(new KeyCredential("<your-api-key>"))
+            .buildClient();
+
+        try {
+            // Read audio file
+            byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav"));
+
+            // Create audio file details
+            AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData));
+
+            // Create transcription options
+            TranscriptionOptions options = new TranscriptionOptions(audioFileDetails);
+
+            // Transcribe audio
+            TranscriptionResult result = client.transcribe(options);
+
+            // Process results
+            System.out.println("Duration: " + result.getDuration().toMillis() + " ms");
+            result.getCombinedPhrases().forEach(phrase -> {
+                System.out.println("Channel " + phrase.getChannel() + ": " + phrase.getText());
+            });
+        } catch (Exception e) {
+            System.err.println("Error during transcription: " + e.getMessage());
+        }
+        // END: com.azure.ai.speech.transcription.readme
+    }
+
+    /**
+     * Sample for creating a synchronous TranscriptionClient.
+     */
+    public void createSyncClient() {
+        // BEGIN: com.azure.ai.speech.transcription.transcriptionclient.instantiation
+        TranscriptionClient client = new TranscriptionClientBuilder()
+            .endpoint("https://<your-resource-name>.cognitiveservices.azure.com/")
+            .credential(new KeyCredential("<your-api-key>"))
+            .buildClient();
+        // END: com.azure.ai.speech.transcription.transcriptionclient.instantiation
+    }
+
+    /**
+     * Sample for creating an asynchronous TranscriptionAsyncClient.
+     */
+    public void createAsyncClient() {
+        // BEGIN: com.azure.ai.speech.transcription.transcriptionasyncclient.instantiation
+        TranscriptionAsyncClient asyncClient = new TranscriptionClientBuilder()
+            .endpoint("https://<your-resource-name>.cognitiveservices.azure.com/")
+            .credential(new KeyCredential("<your-api-key>"))
+            .buildAsyncClient();
+        // END: com.azure.ai.speech.transcription.transcriptionasyncclient.instantiation
+    }
+
+    /**
+     * Sample for transcribing audio with the synchronous client.
+     */
+    public void transcribeAudioSync() throws Exception {
+        TranscriptionClient client = new TranscriptionClientBuilder()
+            .endpoint("https://<your-resource-name>.cognitiveservices.azure.com/")
+            .credential(new KeyCredential("<your-api-key>"))
+            .buildClient();
+
+        // BEGIN: com.azure.ai.speech.transcription.transcriptionclient.transcribe
+        byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav"));
+
+        AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData));
+
+        TranscriptionOptions options = new TranscriptionOptions(audioFileDetails);
+
+        TranscriptionResult result = client.transcribe(options);
+
+        System.out.println("Duration: " + result.getDuration().toMillis() + " ms");
+        result.getCombinedPhrases().forEach(phrase -> {
+            System.out.println("Transcription: " + phrase.getText());
+        });
+        // END: com.azure.ai.speech.transcription.transcriptionclient.transcribe
+    }
+
+    /**
+     * Sample for transcribing audio with the asynchronous client.
+     */
+    public void transcribeAudioAsync() throws Exception {
+        TranscriptionAsyncClient asyncClient = new TranscriptionClientBuilder()
+            .endpoint("https://<your-resource-name>.cognitiveservices.azure.com/")
+            .credential(new KeyCredential("<your-api-key>"))
+            .buildAsyncClient();
+
+        // BEGIN: com.azure.ai.speech.transcription.transcriptionasyncclient.transcribe
+        byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav"));
+
+        AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData));
+
+        TranscriptionOptions options = new TranscriptionOptions(audioFileDetails);
+
+        asyncClient.transcribe(options)
+            .subscribe(result -> {
+                System.out.println("Duration: " + result.getDuration().toMillis() + " ms");
+                result.getCombinedPhrases().forEach(phrase -> {
+                    System.out.println("Transcription: " + phrase.getText());
+                });
+            });
+        // END: com.azure.ai.speech.transcription.transcriptionasyncclient.transcribe
+    }
+
+    /**
+     * Sample for configuring advanced transcription options.
+     */
+    public void advancedTranscriptionOptions() throws Exception {
+        TranscriptionClient client = new TranscriptionClientBuilder()
+            .endpoint("https://<your-resource-name>.cognitiveservices.azure.com/")
+            .credential(new KeyCredential("<your-api-key>"))
+            .buildClient();
+
+        // BEGIN: com.azure.ai.speech.transcription.transcriptionoptions.advanced
+        byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav"));
+
+        AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData));
+
+        // Configure advanced options
+        TranscriptionOptions options = new TranscriptionOptions(audioFileDetails)
+            .setLocales(java.util.Arrays.asList("en-US", "es-ES")) // Specify candidate locales
+            .setProfanityFilterMode(ProfanityFilterMode.MASKED) // Mask profanity
+            .setDiarizationOptions(new TranscriptionDiarizationOptions() // Enable speaker diarization
+                .setMaxSpeakers(5));
+
+        TranscriptionResult result = client.transcribe(options);
+
+        // Access detailed results
+        result.getPhrases().forEach(phrase -> {
+            System.out.println("Speaker " + phrase.getSpeaker() + ": " + phrase.getText());
+            System.out.println("Confidence: " + phrase.getConfidence());
+            System.out.println("Offset: " + phrase.getOffset() + " ms");
+        });
+        // END: com.azure.ai.speech.transcription.transcriptionoptions.advanced
+    }
+
+    /**
+     * Sample for building client with custom configuration.
+ */ + public void clientWithCustomConfiguration() { + // BEGIN: com.azure.ai.speech.transcription.transcriptionclientbuilder.configuration + HttpLogOptions logOptions = new HttpLogOptions() + .setLogLevel(HttpLogDetailLevel.BODY_AND_HEADERS); + + RetryOptions retryOptions = new RetryOptions(new ExponentialBackoffOptions() + .setMaxRetries(5) + .setBaseDelay(java.time.Duration.ofSeconds(1)) + .setMaxDelay(java.time.Duration.ofSeconds(60))); + + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint("https://.cognitiveservices.azure.com/") + .credential(new KeyCredential("")) + .httpLogOptions(logOptions) + .retryOptions(retryOptions) + .buildClient(); + // END: com.azure.ai.speech.transcription.transcriptionclientbuilder.configuration + } + + /** + * Sample for processing detailed transcription results. + */ + public void processDetailedResults() throws Exception { + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint("https://.cognitiveservices.azure.com/") + .credential(new KeyCredential("")) + .buildClient(); + + byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav")); + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails); + + // BEGIN: com.azure.ai.speech.transcription.transcriptionresult.detailed + TranscriptionResult result = client.transcribe(options); + + // Get overall duration + System.out.println("Total duration: " + result.getDuration() + " ms"); + + // Process each phrase with detailed information + result.getPhrases().forEach(phrase -> { + System.out.println("\nPhrase: " + phrase.getText()); + System.out.println(" Channel: " + phrase.getChannel()); + System.out.println(" Speaker: " + phrase.getSpeaker()); + System.out.println(" Locale: " + phrase.getLocale()); + System.out.println(" Confidence: " + phrase.getConfidence()); + System.out.println(" Timing: " + phrase.getOffset() + " ms - " + + (phrase.getOffset() + phrase.getDuration().toMillis()) + " ms"); + + // Process individual words with timestamps + if (phrase.getWords() != null) { + phrase.getWords().forEach(word -> { + System.out.println(" Word: " + word.getText() + " @ " + + word.getOffset() + " ms"); + }); + } + }); + // END: com.azure.ai.speech.transcription.transcriptionresult.detailed + } + + /** + * Sample for using enhanced mode to improve transcription quality. + */ + public void enhancedModeBasic() throws Exception { + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint("https://.cognitiveservices.azure.com/") + .credential(new KeyCredential("")) + .buildClient(); + + // BEGIN: readme-sample-enhancedModeBasic + byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav")); + + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + + // Enhanced mode is automatically enabled when you create EnhancedModeOptions + EnhancedModeOptions enhancedMode = new EnhancedModeOptions() + .setPrompts(java.util.Arrays.asList( + "Output must be in lexical format." + )); + + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails) + .setLocales(java.util.Arrays.asList("en-US", "es-ES")) + .setEnhancedModeOptions(enhancedMode); + + TranscriptionResult result = client.transcribe(options); + // END: readme-sample-enhancedModeBasic + } + + /** + * Sample for using enhanced mode with custom prompts. 
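+     * Prompts bias recognition toward the supplied domain terminology.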
+ */ + public void enhancedModeWithPrompts() throws Exception { + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint("https://.cognitiveservices.azure.com/") + .credential(new KeyCredential("")) + .buildClient(); + + // BEGIN: readme-sample-enhancedModeWithPrompts + byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav")); + + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + + // Enhanced mode is automatically enabled + // Use prompts to guide transcription with domain-specific terminology + // Always include lexical format prompt for best results + EnhancedModeOptions enhancedMode = new EnhancedModeOptions() + .setPrompts(java.util.Arrays.asList( + "Output must be in lexical format.", + "Medical consultation discussing hypertension and diabetes", + "Common medications: metformin, lisinopril, atorvastatin", + "Patient symptoms and treatment plan" + )); + + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails) + .setLocales(java.util.Arrays.asList("en-US", "es-ES")) + .setEnhancedModeOptions(enhancedMode); + + TranscriptionResult result = client.transcribe(options); + // END: readme-sample-enhancedModeWithPrompts + } + + /** + * Sample for using enhanced mode with translation. + */ + public void enhancedModeWithTranslation() throws Exception { + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint("https://.cognitiveservices.azure.com/") + .credential(new KeyCredential("")) + .buildClient(); + + // BEGIN: readme-sample-enhancedModeWithTranslation + byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav")); + + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + + // Enhanced mode is automatically enabled + // Configure enhanced mode to transcribe Spanish audio and translate to English + EnhancedModeOptions enhancedMode = new EnhancedModeOptions() + .setTargetLanguage("en-US"); // Translate to English + + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails) + .setLocales(java.util.Arrays.asList("es-ES")) // Source language: Spanish + .setEnhancedModeOptions(enhancedMode); + + TranscriptionResult result = client.transcribe(options); + // END: readme-sample-enhancedModeWithTranslation + } + + /** + * Sample for transcribing audio using audio URL constructor. + */ + public void transcribeWithAudioUrl() { + // BEGIN: readme-sample-transcribeWithAudioUrl + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint("https://.cognitiveservices.azure.com/") + .credential(new KeyCredential("")) + .buildClient(); + + // Create transcription options with audio URL + TranscriptionOptions options = new TranscriptionOptions("https://example.com/audio.wav"); + + // Transcribe audio + TranscriptionResult result = client.transcribe(options); + + // Process results + result.getCombinedPhrases().forEach(phrase -> { + System.out.println(phrase.getText()); + }); + // END: readme-sample-transcribeWithAudioUrl + } + + /** + * Sample for multi-language transcription. 
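+     * Omitting explicit locales lets the service auto-detect the languages in the audio.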
+ */ + public void transcribeMultiLanguage() throws Exception { + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint("https://.cognitiveservices.azure.com/") + .credential(new KeyCredential("")) + .buildClient(); + + // BEGIN: com.azure.ai.speech.transcription.transcriptionoptions.multilanguage + byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav")); + + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + + // Configure transcription WITHOUT specifying locales + // This allows the service to auto-detect and transcribe multiple languages + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails); + + TranscriptionResult result = client.transcribe(options); + + result.getPhrases().forEach(phrase -> { + System.out.println("Language: " + phrase.getLocale()); + System.out.println("Text: " + phrase.getText()); + }); + // END: com.azure.ai.speech.transcription.transcriptionoptions.multilanguage + } + + /** + * Sample for enhanced mode transcription. + */ + public void transcribeEnhancedMode() throws Exception { + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint("https://.cognitiveservices.azure.com/") + .credential(new KeyCredential("")) + .buildClient(); + + // BEGIN: com.azure.ai.speech.transcription.transcriptionoptions.enhancedmode + byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav")); + + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + + // Enhanced mode is automatically enabled + EnhancedModeOptions enhancedMode = new EnhancedModeOptions() + .setTask("transcribe") + .setPrompts(java.util.Arrays.asList("Output must be in lexical format.")); + + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails) + .setEnhancedModeOptions(enhancedMode); + + TranscriptionResult result = client.transcribe(options); + + System.out.println("Transcription: " + result.getCombinedPhrases().get(0).getText()); + // END: com.azure.ai.speech.transcription.transcriptionoptions.enhancedmode + } + + /** + * Sample for transcription with phrase list. 
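+     * Phrase hints improve recognition of proper nouns and uncommon terms.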
+ */ + public void transcribeWithPhraseList() throws Exception { + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint("https://.cognitiveservices.azure.com/") + .credential(new KeyCredential("")) + .buildClient(); + + // BEGIN: com.azure.ai.speech.transcription.transcriptionoptions.phraselist + byte[] audioData = Files.readAllBytes(Paths.get("path/to/audio.wav")); + + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + + PhraseListOptions phraseListOptions = new PhraseListOptions() + .setPhrases(java.util.Arrays.asList("Azure", "Cognitive Services")) + .setBiasingWeight(5.0); + + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails) + .setPhraseListOptions(phraseListOptions); + + TranscriptionResult result = client.transcribe(options); + + result.getCombinedPhrases().forEach(phrase -> { + System.out.println(phrase.getText()); + }); + // END: com.azure.ai.speech.transcription.transcriptionoptions.phraselist + } +} + diff --git a/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeAudioFileSample.java b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeAudioFileSample.java new file mode 100644 index 000000000000..382d5c473bef --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeAudioFileSample.java @@ -0,0 +1,59 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +package com.azure.ai.speech.transcription; + +import com.azure.ai.speech.transcription.models.AudioFileDetails; +import com.azure.ai.speech.transcription.models.TranscriptionOptions; +import com.azure.ai.speech.transcription.models.TranscriptionResult; +import com.azure.core.credential.KeyCredential; +import com.azure.core.util.BinaryData; + +import java.nio.file.Files; +import java.nio.file.Paths; + +/** + * Simplest possible example of transcribing an audio file. + * + * This sample demonstrates the absolute minimum code needed to: + * 1. Create a client + * 2. Load an audio file + * 3. Transcribe it + * 4. 
Get the text result + */ +public class TranscribeAudioFileSample { + public static void main(String[] args) { + try { + // Get credentials from environment variables + String endpoint = System.getenv("SPEECH_ENDPOINT"); + String apiKey = System.getenv("SPEECH_API_KEY"); + + // Create client + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint(endpoint) + .credential(new KeyCredential(apiKey)) + .buildClient(); + + // Load audio file + String audioFilePath = "src/samples/assets/sample-audio.wav"; + byte[] audioData = Files.readAllBytes(Paths.get(audioFilePath)); + + // Create audio file details + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + + // Transcribe + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails); + TranscriptionResult result = client.transcribe(options); + + // Print result + System.out.println("Transcription:"); + result.getCombinedPhrases().forEach(phrase -> + System.out.println(phrase.getText()) + ); + + } catch (Exception e) { + System.err.println("Error: " + e.getMessage()); + e.printStackTrace(); + } + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeFromUrlSample.java b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeFromUrlSample.java new file mode 100644 index 000000000000..b1f77c0beba9 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeFromUrlSample.java @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +package com.azure.ai.speech.transcription; + +import com.azure.ai.speech.transcription.models.TranscriptionOptions; +import com.azure.ai.speech.transcription.models.TranscriptionResult; +import com.azure.core.credential.AzureKeyCredential; +import java.util.Arrays; + +/** + * Sample demonstrates how to transcribe audio from a URL. + */ +public class TranscribeFromUrlSample { + + /** + * Main method to invoke this demo. + * + * @param args Unused arguments to the program. 
+ */ + public static void main(String[] args) { + String endpoint = System.getenv("SPEECH_ENDPOINT"); + String apiKey = System.getenv("SPEECH_API_KEY"); + + if (endpoint == null || apiKey == null) { + System.err.println("Please set SPEECH_ENDPOINT and SPEECH_API_KEY environment variables."); + System.exit(1); + } + + // Create the transcription client + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint(endpoint) + .credential(new AzureKeyCredential(apiKey)) + .buildClient(); + + System.out.println("Azure AI Speech Transcription - Transcribe from URL Sample"); + System.out.println("============================================================\n"); + + // Audio file URL (must be publicly accessible) + // Using sample audio from Azure documentation + String audioUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-speech-sdk/master/sampledata/audiofiles/aboutSpeechSdk.wav"; + + System.out.println("Transcribing audio from URL: " + audioUrl); + System.out.println(); + + // Create transcription options with audio URL + TranscriptionOptions options = new TranscriptionOptions(audioUrl) + .setLocales(Arrays.asList("en-US")); + + // Transcribe the audio from URL + TranscriptionResult result = client.transcribe(options); + + // Display results + System.out.println("Transcription Results:"); + System.out.println("---------------------"); + System.out.println("Duration: " + result.getDuration() + "\n"); + + if (result.getCombinedPhrases() != null && !result.getCombinedPhrases().isEmpty()) { + System.out.println("Combined text: " + result.getCombinedPhrases().get(0).getText()); + } + + System.out.println(); + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeMultiLanguageSample.java b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeMultiLanguageSample.java new file mode 100644 index 000000000000..fedc4499806e --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeMultiLanguageSample.java @@ -0,0 +1,108 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +package com.azure.ai.speech.transcription; + +import com.azure.ai.speech.transcription.models.AudioFileDetails; +import com.azure.ai.speech.transcription.models.TranscribedPhrase; +import com.azure.ai.speech.transcription.models.TranscriptionOptions; +import com.azure.ai.speech.transcription.models.TranscriptionResult; +import com.azure.core.credential.AzureKeyCredential; +import com.azure.core.util.BinaryData; +import java.nio.file.Files; +import java.nio.file.Paths; + +/** + * Sample demonstrates how to transcribe audio containing multiple languages. + * + * When locales are NOT specified, the service automatically detects and transcribes + * multiple languages within the same audio file, switching between them as needed. + * This is useful for: + * - Multilingual conversations + * - Code-switched speech (e.g., Spanish-English) + * - International meetings or interviews + */ +public class TranscribeMultiLanguageSample { + + /** + * Main method to invoke this demo. + * + * @param args Unused arguments to the program. 
+ */ + public static void main(String[] args) { + String endpoint = System.getenv("SPEECH_ENDPOINT"); + String apiKey = System.getenv("SPEECH_API_KEY"); + + if (endpoint == null || apiKey == null) { + System.err.println("Please set SPEECH_ENDPOINT and SPEECH_API_KEY environment variables."); + System.exit(1); + } + + System.out.println("Azure AI Speech Transcription - Multi-Language Sample"); + System.out.println("=====================================================\n"); + + // Create the transcription client + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint(endpoint) + .credential(new AzureKeyCredential(apiKey)) + .buildClient(); + + try { + // Load audio file + String audioFilePath = "src/samples/assets/sample-audio.wav"; + byte[] audioData = Files.readAllBytes(Paths.get(audioFilePath)); + AudioFileDetails fileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + + // Configure transcription WITHOUT specifying locales + // This allows the service to auto-detect and transcribe multiple languages + // within the same audio file, switching between them as needed + TranscriptionOptions options = new TranscriptionOptions(fileDetails); + + System.out.println("Transcribing with automatic multi-language detection..."); + System.out.println("(No locale specified - service will detect all languages)\n"); + + // Transcribe the audio + TranscriptionResult result = client.transcribe(options); + + // Display results + System.out.println("Transcription Results:"); + System.out.println("---------------------"); + System.out.println("Duration: " + result.getDuration()); + System.out.println("Total phrases found: " + (result.getPhrases() != null ? result.getPhrases().size() : 0)); + System.out.println("Total combined phrases: " + (result.getCombinedPhrases() != null ? result.getCombinedPhrases().size() : 0)); + System.out.println(); + + // Show detailed phrases with timestamps + if (result.getPhrases() != null && !result.getPhrases().isEmpty()) { + System.out.println("Detailed Phrases:"); + System.out.println("-----------------"); + + for (int i = 0; i < result.getPhrases().size(); i++) { + TranscribedPhrase phrase = result.getPhrases().get(i); + long offsetMs = phrase.getOffset(); + long durationMs = phrase.getDuration().toMillis(); + + System.out.println("\n[Phrase " + (i + 1) + "] " + offsetMs + "ms - " + (offsetMs + durationMs) + "ms"); + System.out.println("Locale: " + phrase.getLocale()); + System.out.println("Text: " + phrase.getText()); + } + } + + // Also show combined phrases per channel + if (result.getCombinedPhrases() != null && !result.getCombinedPhrases().isEmpty()) { + System.out.println("\n\nCombined Transcription (All Languages):"); + System.out.println("========================================"); + result.getCombinedPhrases().forEach(phrase -> { + System.out.println(phrase.getText()); + }); + } + + System.out.println("\nNote: When no locales are specified, the service transcribes all languages"); + System.out.println("present in the audio. 
However, the locale field in each phrase may not always"); + System.out.println("accurately reflect the actual language of that specific phrase."); + } catch (Exception e) { + System.err.println("Error: " + e.getMessage()); + e.printStackTrace(); + } + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeWithDiarizationSample.java b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeWithDiarizationSample.java new file mode 100644 index 000000000000..bf57f4308f36 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeWithDiarizationSample.java @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +package com.azure.ai.speech.transcription; + +import com.azure.ai.speech.transcription.models.AudioFileDetails; +import com.azure.ai.speech.transcription.models.TranscribedPhrase; +import com.azure.ai.speech.transcription.models.TranscriptionDiarizationOptions; +import com.azure.ai.speech.transcription.models.TranscriptionOptions; +import com.azure.ai.speech.transcription.models.TranscriptionResult; +import com.azure.core.credential.KeyCredential; +import com.azure.core.util.BinaryData; + +import java.nio.file.Files; +import java.nio.file.Paths; + +/** + * Sample demonstrates using speaker diarization to identify different speakers in audio. + * + * Speaker diarization detects and separates different speakers in the audio, labeling + * each transcribed segment with a speaker ID. This is useful for: + * - Meeting transcriptions + * - Interview recordings + * - Multi-person conversations + * - Podcast transcriptions + */ +public class TranscribeWithDiarizationSample { + public static void main(String[] args) { + String endpoint = System.getenv("SPEECH_ENDPOINT"); + String apiKey = System.getenv("SPEECH_API_KEY"); + + if (endpoint == null || apiKey == null) { + System.err.println("Please set SPEECH_ENDPOINT and SPEECH_API_KEY environment variables"); + return; + } + + System.out.println("Azure AI Speech Transcription - Speaker Diarization Sample"); + System.out.println("===========================================================\n"); + + // Create client + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint(endpoint) + .credential(new KeyCredential(apiKey)) + .buildClient(); + + try { + // Load audio file + String audioFilePath = "src/samples/assets/sample-audio.wav"; + byte[] audioData = Files.readAllBytes(Paths.get(audioFilePath)); + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + + // Configure speaker diarization + TranscriptionDiarizationOptions diarizationOptions = new TranscriptionDiarizationOptions() + .setMaxSpeakers(5); // Maximum number of speakers to detect (2-36) + + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails) + .setDiarizationOptions(diarizationOptions); + + System.out.println("Transcribing with speaker diarization (max 5 speakers)...\n"); + + // Transcribe with diarization + TranscriptionResult result = client.transcribe(options); + + // Display results organized by speaker + System.out.println("Transcription with Speaker Information:"); + System.out.println("----------------------------------------"); + + if (result.getPhrases() != null && !result.getPhrases().isEmpty()) { + for (TranscribedPhrase phrase : 
result.getPhrases()) { + int speakerId = phrase.getSpeaker() != null ? phrase.getSpeaker() : 0; + double startTime = phrase.getOffset() / 1000.0; + double endTime = (phrase.getOffset() + phrase.getDuration().toMillis()) / 1000.0; + + System.out.println(String.format("\n[Speaker %d] (%.2fs - %.2fs)", + speakerId, startTime, endTime)); + System.out.println(phrase.getText()); + } + } + + + } catch (Exception e) { + System.err.println("Error: " + e.getMessage()); + e.printStackTrace(); + } + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeWithPhraseListSample.java b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeWithPhraseListSample.java new file mode 100644 index 000000000000..af07db34935e --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeWithPhraseListSample.java @@ -0,0 +1,97 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +package com.azure.ai.speech.transcription; + +import com.azure.ai.speech.transcription.models.AudioFileDetails; +import com.azure.ai.speech.transcription.models.PhraseListOptions; +import com.azure.ai.speech.transcription.models.TranscriptionOptions; +import com.azure.ai.speech.transcription.models.TranscriptionResult; +import com.azure.core.credential.KeyCredential; +import com.azure.core.util.BinaryData; + +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Arrays; + +/** + * Sample demonstrates using a phrase list to improve recognition accuracy for specific terms. + * + * Phrase lists help the speech service better recognize domain-specific terminology, + * proper nouns, and uncommon words that might otherwise be misrecognized. 
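+ * The biasing weight (range 1.0-20.0) controls how strongly the phrases influence recognition.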
+ */ +public class TranscribeWithPhraseListSample { + public static void main(String[] args) { + String endpoint = System.getenv("SPEECH_ENDPOINT"); + String apiKey = System.getenv("SPEECH_API_KEY"); + + if (endpoint == null || apiKey == null) { + System.err.println("Please set SPEECH_ENDPOINT and SPEECH_API_KEY environment variables"); + return; + } + + System.out.println("Azure AI Speech Transcription - Phrase List Sample"); + System.out.println("====================================================\n"); + + // Create client + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint(endpoint) + .credential(new KeyCredential(apiKey)) + .buildClient(); + + try { + // Load audio file + String audioFilePath = "src/samples/assets/sample-audio.wav"; + byte[] audioData = Files.readAllBytes(Paths.get(audioFilePath)); + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + + // Create phrase list with custom terms + // Add phrases that appear in your audio for better recognition + PhraseListOptions phraseListOptions = new PhraseListOptions() + .setPhrases(Arrays.asList( + "Mary", + "El Mundo", + "Secret Garden", + "empleada doméstica", + "habitación" + )) + .setBiasingWeight(5.0); // Weight range: 1.0-20.0 (higher = stronger bias) + + // Create transcription options with phrase list + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails) + .setPhraseListOptions(phraseListOptions); + + System.out.println("Custom phrase list:"); + phraseListOptions.getPhrases().forEach(phrase -> + System.out.println(" - " + phrase) + ); + System.out.println("\nBiasing weight: " + phraseListOptions.getBiasingWeight()); + System.out.println("\nTranscribing with phrase list...\n"); + + // Transcribe + TranscriptionResult result = client.transcribe(options); + + System.out.println("Transcription result:"); + System.out.println("---------------------"); + result.getCombinedPhrases().forEach(phrase -> + System.out.println(phrase.getText()) + ); + + // Print individual phrases with timing information + if (result.getPhrases() != null && !result.getPhrases().isEmpty()) { + System.out.println("\nDetailed phrases:"); + result.getPhrases().forEach(phrase -> + System.out.println(String.format(" [%dms]: %s", + phrase.getOffset(), + phrase.getText())) + ); + } + + System.out.println("\n Transcription completed successfully!"); + + } catch (Exception e) { + System.err.println("Error: " + e.getMessage()); + e.printStackTrace(); + } + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeWithProfanityFilterSample.java b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeWithProfanityFilterSample.java new file mode 100644 index 000000000000..40a67d1b0188 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/TranscribeWithProfanityFilterSample.java @@ -0,0 +1,80 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
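+// The sample expects an audio file containing profanity (src/samples/assets/sample-profanity.wav).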
+ +package com.azure.ai.speech.transcription; + +import com.azure.ai.speech.transcription.models.AudioFileDetails; +import com.azure.ai.speech.transcription.models.ProfanityFilterMode; +import com.azure.ai.speech.transcription.models.TranscriptionOptions; +import com.azure.ai.speech.transcription.models.TranscriptionResult; +import com.azure.core.credential.KeyCredential; +import com.azure.core.util.BinaryData; + +import java.nio.file.Files; +import java.nio.file.Paths; + +/** + * Sample demonstrates profanity filtering in Azure AI Speech Transcription. + * Shows the difference between NONE (raw), MASKED (f***), REMOVED (omitted), and TAGS (XML tagged). + */ +public class TranscribeWithProfanityFilterSample { + /** + * Main method to run the profanity filter sample. + * + * @param args command line arguments (not used) + */ + public static void main(String[] args) { + System.out.println("Azure AI Speech Transcription - Profanity Filter Sample"); + System.out.println("==========================================================\n"); + + String endpoint = System.getenv("SPEECH_ENDPOINT"); + String apiKey = System.getenv("SPEECH_API_KEY"); + + if (endpoint == null || apiKey == null) { + System.err.println("Please set SPEECH_ENDPOINT and SPEECH_API_KEY environment variables"); + return; + } + + try { + // Create the transcription client + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint(endpoint) + .credential(new KeyCredential(apiKey)) + .buildClient(); + + // Load audio file + String audioFilePath = "src/samples/assets/sample-profanity.wav"; + byte[] audioData = Files.readAllBytes(Paths.get(audioFilePath)); + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + + // Demonstrate different profanity filter modes + ProfanityFilterMode[] modes = { + ProfanityFilterMode.NONE, + ProfanityFilterMode.MASKED, + ProfanityFilterMode.REMOVED, + ProfanityFilterMode.TAGS + }; + + for (ProfanityFilterMode mode : modes) { + System.out.println("Transcribing with profanity filter mode: " + mode); + System.out.println("----------------------------------------------"); + + // Create transcription options with profanity filter + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails) + .setProfanityFilterMode(mode); + + // Perform transcription + TranscriptionResult result = client.transcribe(options); + + // Display results + System.out.println("Combined text: " + result.getCombinedPhrases().get(0).getText()); + System.out.println(); + } + + + } catch (Exception e) { + System.err.println("Error during transcription: " + e.getMessage()); + e.printStackTrace(); + } + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/javadoccodesnippets/TranscriptionAsyncClientJavaDocCodeSnippets.java b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/javadoccodesnippets/TranscriptionAsyncClientJavaDocCodeSnippets.java new file mode 100644 index 000000000000..5dc618dee834 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/javadoccodesnippets/TranscriptionAsyncClientJavaDocCodeSnippets.java @@ -0,0 +1,215 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// Source code snippets from this file are embedded in Transcription SDK JavaDoc (API documentation). 
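+// Snippets are delimited by BEGIN/END markers so the documentation tooling can extract them.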
+ +package com.azure.ai.speech.transcription.javadoccodesnippets; + +import com.azure.ai.speech.transcription.TranscriptionAsyncClient; +import com.azure.ai.speech.transcription.TranscriptionClientBuilder; +import com.azure.ai.speech.transcription.models.AudioFileDetails; +import com.azure.ai.speech.transcription.models.ProfanityFilterMode; +import com.azure.ai.speech.transcription.models.TranscriptionDiarizationOptions; +import com.azure.ai.speech.transcription.models.TranscriptionOptions; +import com.azure.ai.speech.transcription.models.TranscriptionResult; +import com.azure.core.credential.KeyCredential; +import com.azure.core.util.BinaryData; +import reactor.core.publisher.Mono; + +import java.nio.file.Files; +import java.nio.file.Paths; +import java.time.Duration; +import java.util.Arrays; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; + +/** + * Code snippets for {@link TranscriptionAsyncClient} JavaDoc documentation. + */ +public class TranscriptionAsyncClientJavaDocCodeSnippets { + + private static String endpoint = System.getenv("SPEECH_ENDPOINT"); + private static String key = System.getenv("SPEECH_API_KEY"); + + /** + * Sample for creating an asynchronous TranscriptionAsyncClient with API key authentication. + */ + public void createAsyncClientWithApiKey() { + // BEGIN: com.azure.ai.speech.transcription.transcriptionasyncclient.instantiation.apikey + TranscriptionAsyncClient asyncClient = new TranscriptionClientBuilder() + .endpoint(endpoint) + .credential(new KeyCredential(key)) + .buildAsyncClient(); + // END: com.azure.ai.speech.transcription.transcriptionasyncclient.instantiation.apikey + } + + /** + * Sample for transcribing audio asynchronously using subscribe pattern. + */ + public void transcribeAsyncWithSubscribe() throws Exception { + TranscriptionAsyncClient asyncClient = new TranscriptionClientBuilder() + .endpoint(endpoint) + .credential(new KeyCredential(key)) + .buildAsyncClient(); + + byte[] audioData = Files.readAllBytes(Paths.get("sample.wav")); + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails); + + // BEGIN: com.azure.ai.speech.transcription.transcriptionasyncclient.transcribe.subscribe + CountDownLatch latch = new CountDownLatch(1); + + asyncClient.transcribe(options) + .subscribe( + // onNext: Process result + result -> { + System.out.println("Duration: " + result.getDuration() + " ms"); + if (result.getCombinedPhrases() != null) { + result.getCombinedPhrases().forEach(phrase -> + System.out.println("Text: " + phrase.getText()) + ); + } + latch.countDown(); + }, + // onError: Handle error + error -> { + System.err.println("Error: " + error.getMessage()); + latch.countDown(); + }, + // onComplete: Completion handler + () -> System.out.println("Transcription completed") + ); + + latch.await(60, TimeUnit.SECONDS); + // END: com.azure.ai.speech.transcription.transcriptionasyncclient.transcribe.subscribe + } + + /** + * Sample for transcribing audio asynchronously using block pattern. 
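+     * block() waits for the Mono to complete; avoid calling it on reactive event-loop threads.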
+     */
+    public void transcribeAsyncWithBlock() throws Exception {
+        TranscriptionAsyncClient asyncClient = new TranscriptionClientBuilder()
+            .endpoint(endpoint)
+            .credential(new KeyCredential(key))
+            .buildAsyncClient();
+
+        byte[] audioData = Files.readAllBytes(Paths.get("sample.wav"));
+        AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData));
+        TranscriptionOptions options = new TranscriptionOptions(audioFileDetails);
+
+        // BEGIN: com.azure.ai.speech.transcription.transcriptionasyncclient.transcribe.block
+        // Use block() to convert the async call to a synchronous one
+        TranscriptionResult result = asyncClient.transcribe(options).block();
+
+        if (result != null) {
+            System.out.println("Duration: " + result.getDuration() + " ms");
+        }
+        // END: com.azure.ai.speech.transcription.transcriptionasyncclient.transcribe.block
+    }
+
+    /**
+     * Sample for transcribing audio asynchronously with advanced options.
+     */
+    public void transcribeAsyncWithOptions() throws Exception {
+        TranscriptionAsyncClient asyncClient = new TranscriptionClientBuilder()
+            .endpoint(endpoint)
+            .credential(new KeyCredential(key))
+            .buildAsyncClient();
+
+        // BEGIN: com.azure.ai.speech.transcription.transcriptionasyncclient.transcribe.options
+        byte[] audioData = Files.readAllBytes(Paths.get("sample.wav"));
+
+        AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData));
+
+        // Configure advanced transcription options
+        TranscriptionOptions options = new TranscriptionOptions(audioFileDetails)
+            .setLocales(Arrays.asList("en-US", "es-ES"))
+            .setProfanityFilterMode(ProfanityFilterMode.MASKED)
+            .setDiarizationOptions(new TranscriptionDiarizationOptions().setMaxSpeakers(5));
+
+        // Transcribe asynchronously
+        Mono<TranscriptionResult> resultMono = asyncClient.transcribe(options);
+
+        // Process result
+        resultMono.subscribe(result -> {
+            if (result.getPhrases() != null) {
+                result.getPhrases().forEach(phrase -> {
+                    System.out.printf("Speaker %d: %s%n",
+                        phrase.getSpeaker(), phrase.getText());
+                });
+            }
+        });
+        // END: com.azure.ai.speech.transcription.transcriptionasyncclient.transcribe.options
+    }
+
+    /**
+     * Sample for transcribing audio asynchronously with timeout and error handling.
+     */
+    public void transcribeAsyncWithTimeoutAndErrorHandling() throws Exception {
+        TranscriptionAsyncClient asyncClient = new TranscriptionClientBuilder()
+            .endpoint(endpoint)
+            .credential(new KeyCredential(key))
+            .buildAsyncClient();
+
+        byte[] audioData = Files.readAllBytes(Paths.get("sample.wav"));
+        AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData));
+        TranscriptionOptions options = new TranscriptionOptions(audioFileDetails);
+
+        // BEGIN: com.azure.ai.speech.transcription.transcriptionasyncclient.transcribe.timeout
+        Mono<TranscriptionResult> resultMono = asyncClient.transcribe(options)
+            .timeout(Duration.ofMinutes(2))
+            .doOnError(error -> System.err.println("Error: " + error.getMessage()))
+            .onErrorResume(error -> {
+                System.err.println("Fallback: Returning empty result");
+                return Mono.empty();
+            });
+
+        TranscriptionResult result = resultMono.block();
+        // END: com.azure.ai.speech.transcription.transcriptionasyncclient.transcribe.timeout
+    }
+
+    /**
+     * Sample for processing detailed async transcription results.
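+     * Walks combined phrases per channel, then word-level timings within each phrase.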
+ */ + public void processDetailedAsyncResults() throws Exception { + TranscriptionAsyncClient asyncClient = new TranscriptionClientBuilder() + .endpoint(endpoint) + .credential(new KeyCredential(key)) + .buildAsyncClient(); + + byte[] audioData = Files.readAllBytes(Paths.get("sample.wav")); + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails); + + // BEGIN: com.azure.ai.speech.transcription.transcriptionasyncclient.results.detailed + asyncClient.transcribe(options) + .subscribe(result -> { + // Access combined phrases + if (result.getCombinedPhrases() != null) { + result.getCombinedPhrases().forEach(channelPhrase -> + System.out.printf("[Channel %d] %s%n", + channelPhrase.getChannel(), channelPhrase.getText()) + ); + } + + // Access detailed phrases with word-level timing + if (result.getPhrases() != null) { + result.getPhrases().forEach(phrase -> { + System.out.printf("Phrase (%.2f-%.2fs): %s%n", + phrase.getOffset() / 1000.0, + (phrase.getOffset() + phrase.getDuration().toMillis()) / 1000.0, + phrase.getText()); + + if (phrase.getWords() != null) { + phrase.getWords().forEach(word -> + System.out.printf(" \"%s\" at %.2fs%n", + word.getText(), + word.getOffset() / 1000.0) + ); + } + }); + } + }); + // END: com.azure.ai.speech.transcription.transcriptionasyncclient.results.detailed + } +} + diff --git a/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/javadoccodesnippets/TranscriptionClientJavaDocCodeSnippets.java b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/javadoccodesnippets/TranscriptionClientJavaDocCodeSnippets.java new file mode 100644 index 000000000000..cc9ebd2c57fc --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/samples/java/com/azure/ai/speech/transcription/javadoccodesnippets/TranscriptionClientJavaDocCodeSnippets.java @@ -0,0 +1,164 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// Source code snippets from this file are embedded in Transcription SDK JavaDoc (API documentation). + +package com.azure.ai.speech.transcription.javadoccodesnippets; + +import com.azure.ai.speech.transcription.TranscriptionClient; +import com.azure.ai.speech.transcription.TranscriptionClientBuilder; +import com.azure.ai.speech.transcription.models.AudioFileDetails; +import com.azure.ai.speech.transcription.models.ProfanityFilterMode; +import com.azure.ai.speech.transcription.models.TranscriptionDiarizationOptions; +import com.azure.ai.speech.transcription.models.TranscriptionOptions; +import com.azure.ai.speech.transcription.models.TranscriptionResult; +import com.azure.core.credential.KeyCredential; +import com.azure.core.credential.TokenCredential; +import com.azure.core.util.BinaryData; +import com.azure.identity.DefaultAzureCredentialBuilder; + +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Arrays; + +/** + * Code snippets for {@link TranscriptionClient} JavaDoc documentation. + */ +public class TranscriptionClientJavaDocCodeSnippets { + + private static String endpoint = System.getenv("SPEECH_ENDPOINT"); + private static String key = System.getenv("SPEECH_API_KEY"); + + /** + * Sample for creating a synchronous TranscriptionClient with API key authentication. 
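+     * The endpoint and key are read from the SPEECH_ENDPOINT and SPEECH_API_KEY environment variables.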
+ */ + public void createClientWithApiKey() { + // BEGIN: com.azure.ai.speech.transcription.transcriptionclient.instantiation.apikey + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint(endpoint) + .credential(new KeyCredential(key)) + .buildClient(); + // END: com.azure.ai.speech.transcription.transcriptionclient.instantiation.apikey + } + + /** + * Sample for creating a synchronous TranscriptionClient with Entra ID authentication. + */ + public void createClientWithTokenCredential() { + // BEGIN: com.azure.ai.speech.transcription.transcriptionclient.instantiation.tokencredential + // Use DefaultAzureCredential for Entra ID authentication + TokenCredential credential = new DefaultAzureCredentialBuilder().build(); + + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint(endpoint) + .credential(credential) + .buildClient(); + // END: com.azure.ai.speech.transcription.transcriptionclient.instantiation.tokencredential + } + + /** + * Sample for transcribing audio from a file with default options. + */ + public void transcribeFromFile() throws Exception { + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint(endpoint) + .credential(new KeyCredential(key)) + .buildClient(); + + // BEGIN: com.azure.ai.speech.transcription.transcriptionclient.transcribe.file + // Read audio file + byte[] audioData = Files.readAllBytes(Paths.get("sample.wav")); + + // Create audio file details + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + + // Create transcription options using the AudioFileDetails constructor + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails); + + // Transcribe audio + TranscriptionResult result = client.transcribe(options); + + // Process results + System.out.println("Duration: " + result.getDuration() + " ms"); + result.getCombinedPhrases().forEach(phrase -> { + System.out.println("Channel " + phrase.getChannel() + ": " + phrase.getText()); + }); + // END: com.azure.ai.speech.transcription.transcriptionclient.transcribe.file + } + + /** + * Sample for transcribing audio with advanced options. + */ + public void transcribeWithOptions() throws Exception { + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint(endpoint) + .credential(new KeyCredential(key)) + .buildClient(); + + // BEGIN: com.azure.ai.speech.transcription.transcriptionclient.transcribe.options + byte[] audioData = Files.readAllBytes(Paths.get("sample.wav")); + + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + + // Configure advanced transcription options + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails) + .setLocales(Arrays.asList("en-US", "es-ES")) + .setProfanityFilterMode(ProfanityFilterMode.MASKED) + .setDiarizationOptions(new TranscriptionDiarizationOptions().setMaxSpeakers(5)); + + TranscriptionResult result = client.transcribe(options); + + // Access detailed results + if (result.getPhrases() != null) { + result.getPhrases().forEach(phrase -> { + System.out.printf("Speaker %d: %s%n", + phrase.getSpeaker(), phrase.getText()); + }); + } + // END: com.azure.ai.speech.transcription.transcriptionclient.transcribe.options + } + + /** + * Sample for processing detailed transcription results with word-level timing. 
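+     * Offsets and durations are reported in milliseconds and converted to seconds for display.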
+ */ + public void processDetailedResults() throws Exception { + TranscriptionClient client = new TranscriptionClientBuilder() + .endpoint(endpoint) + .credential(new KeyCredential(key)) + .buildClient(); + + byte[] audioData = Files.readAllBytes(Paths.get("sample.wav")); + AudioFileDetails audioFileDetails = new AudioFileDetails(BinaryData.fromBytes(audioData)); + TranscriptionOptions options = new TranscriptionOptions(audioFileDetails); + TranscriptionResult result = client.transcribe(options); + + // BEGIN: com.azure.ai.speech.transcription.transcriptionclient.results.detailed + // Access sentence-level combined phrases + if (result.getCombinedPhrases() != null) { + result.getCombinedPhrases().forEach(channelPhrase -> { + System.out.printf("[Channel %d] %s%n", + channelPhrase.getChannel(), channelPhrase.getText()); + }); + } + + // Access word-level details with timing + if (result.getPhrases() != null) { + result.getPhrases().forEach(phrase -> { + System.out.printf("Phrase (%.2f-%.2fs): %s%n", + phrase.getOffset() / 1000.0, + (phrase.getOffset() + phrase.getDuration().toMillis()) / 1000.0, + phrase.getText()); + + // Get word-level timing information + if (phrase.getWords() != null) { + phrase.getWords().forEach(word -> { + System.out.printf(" Word: \"%s\" at %.2fs%n", + word.getText(), + word.getOffset() / 1000.0); + }); + } + }); + } + // END: com.azure.ai.speech.transcription.transcriptionclient.results.detailed + } +} + diff --git a/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/README.md b/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/README.md new file mode 100644 index 000000000000..353f2605ba20 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/README.md @@ -0,0 +1,214 @@ +# Azure AI Speech Transcription client library tests for Java + +This directory contains tests for the Azure AI Speech Transcription client library for Java. + +## Test Structure + +The tests are organized as follows: + +- **TranscriptionClientTestBase.java**: Base class containing common test infrastructure, helper methods, and validation logic. Includes support for both file-based and URL-based transcription. +- **TranscriptionClientTest.java**: Tests for the synchronous `TranscriptionClient` (14 tests) +- **TranscriptionAsyncClientTest.java**: Tests for the asynchronous `TranscriptionAsyncClient` (16 tests) +- **generated/**: Auto-generated test templates (for reference only) + +## Prerequisites + +Before running the tests, you need: + +1. An Azure Cognitive Services Speech resource. Create one using the [Azure Portal](https://portal.azure.com/). +2. Java Development Kit (JDK) 8 or later +3. Maven 3.x or later +4. A sample audio file for testing (WAV, MP3, or OGG format, shorter than 2 hours, smaller than 250 MB) + +## Set Environment Variables + +Set the following environment variables to run live tests: + +### Windows (PowerShell) + +```powershell +$env:SPEECH_ENDPOINT = "https://.cognitiveservices.azure.com" +$env:SPEECH_API_KEY = "" +``` + +### Windows (Command Prompt) + +```cmd +set SPEECH_ENDPOINT=https://.cognitiveservices.azure.com +set SPEECH_API_KEY= +``` + +### Linux/macOS (Bash) + +```bash +export SPEECH_ENDPOINT="https://.cognitiveservices.azure.com" +export SPEECH_API_KEY="" +``` + +## Configure Test Proxy + +The Azure SDK for Java uses a test proxy for recording and playing back HTTP interactions. 
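+Recordings make the tests deterministic and allow them to run in playback mode without live credentials.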
This library has been migrated to use the test proxy following the [Test Proxy Migration Guide](https://github.com/Azure/azure-sdk-for-java/blob/main/sdk/core/azure-core-test/TestProxyMigrationGuide.md). + +Test recordings are stored in the [azure-sdk-assets](https://github.com/Azure/azure-sdk-assets) repository and referenced via the `assets.json` file. Configure the test mode by setting the `AZURE_TEST_MODE` environment variable: + +### Live Mode (against live service) + +```powershell +$env:AZURE_TEST_MODE = "LIVE" +``` + +This mode makes real HTTP calls to the Azure service. Use this when you want to test against the actual service. + +### Record Mode (record interactions) + +```powershell +$env:AZURE_TEST_MODE = "RECORD" +``` + +This mode makes real HTTP calls and records them for later playback. Recordings are managed by the test-proxy tool and can be pushed to the azure-sdk-assets repository using: + +```bash +test-proxy push -a assets.json +``` + +### Playback Mode (use recordings) + +```powershell +$env:AZURE_TEST_MODE = "PLAYBACK" +``` + +This mode uses previously recorded HTTP interactions instead of making real calls. This is the default mode and doesn't require credentials. + +## Running Tests + +### Run All Tests + +From the `sdk/transcription/azure-ai-speech-transcription` directory: + +```bash +mvn clean test +``` + +### Run Specific Test Class + +```bash +mvn test -Dtest=TranscriptionClientTest +``` + +or + +```bash +mvn test -Dtest=TranscriptionAsyncClientTest +``` + +### Run a Specific Test Method + +```bash +mvn test -Dtest=TranscriptionClientTest#testTranscribeSyncBasicFromFile +``` + +## Test Organization + +### Synchronous Tests (TranscriptionClientTest) + +Tests for the synchronous `TranscriptionClient` (14 tests), including: + +- Basic transcription from file +- Transcription from URL (using publicly accessible audio URL) +- Transcription with language specification +- Transcription with multiple languages +- Transcription with speaker diarization +- Transcription with profanity filtering +- Transcription with word-level timestamps +- Tests using `transcribeWithResponse()` method +- Tests with custom RequestOptions + +### Asynchronous Tests (TranscriptionAsyncClientTest) + +Tests for the asynchronous `TranscriptionAsyncClient` (16 tests), mirroring the synchronous tests but using reactive programming patterns with `Mono` and `Flux`. Includes additional tests for: + +- Transcription from URL (using publicly accessible audio URL) +- Error handling with invalid language codes +- Placeholder tests for empty audio data and cancellation scenarios + +## Authentication + +The tests support two authentication methods: + +1. **Key-based authentication** (default): Uses the API key from `SPEECH_API_KEY` environment variable +2. **Token-based authentication**: Uses Entra ID credentials via `DefaultAzureCredential` + +To test with token-based authentication, some tests use `createClient(false, true, sync)` where the first parameter is `false`. + +## Troubleshooting + +### Common Issues + +1. **Missing environment variables**: Ensure `SPEECH_ENDPOINT` and `SPEECH_API_KEY` are set correctly +2. **Missing sample audio file**: Make sure you have a `sample.wav` file in the test directory (WAV, MP3, or OGG format, shorter than 2 hours, smaller than 250 MB) +3. **URL transcription failures**: URL-based transcription requires a specific API key tier that supports this feature. If URL tests fail with 401 errors, verify your Speech resource supports URL transcription. +4. 
**Test proxy issues**: If playback tests fail, try running in LIVE or RECORD mode first to regenerate recordings +5. **Network issues**: Check your network connection and firewall settings + +### Enable Detailed Logging + +To enable detailed HTTP logging during tests, set the logging level in your `logback-test.xml` or via environment variables: + +```powershell +$env:AZURE_LOG_LEVEL = "verbose" +``` + +## Additional Resources + +- [Azure SDK for Java Test Documentation](https://github.com/Azure/azure-sdk-for-java/blob/main/sdk/core/azure-core-test/README.md) +- [TypeSpec Java QuickStart - Adding Tests](https://github.com/Azure/azure-sdk-for-java/wiki/TypeSpec-Java-QuickStart#adding-tests) +- [Azure Speech Service Documentation](https://learn.microsoft.com/azure/cognitive-services/speech-service/) +- [Azure SDK for Java Contributing Guide](https://github.com/Azure/azure-sdk-for-java/blob/main/CONTRIBUTING.md) + +## Test Coverage + +The current tests cover: + +- ✅ Client instantiation with different authentication methods +- ✅ Basic transcription functionality from files +- ✅ Transcription from publicly accessible URLs +- ✅ Transcription with various options (language, diarization, profanity filter, timestamps) +- ✅ Both synchronous and asynchronous clients +- ✅ Methods with and without `Response` wrappers +- ✅ Custom RequestOptions and headers +- ✅ Error handling (invalid language codes) + +Areas for future enhancement: + +- ⏳ Empty audio data handling (placeholder test exists) +- ⏳ Cancellation scenarios (placeholder test exists) +- ⬜ Performance tests +- ⬜ Concurrent request handling +- ⬜ Edge cases (very long audio, multiple channels, etc.) + +## Recording Sanitizers + +The tests use the test-proxy's built-in sanitizers to automatically redact sensitive information from recordings: + +- API keys and authentication tokens +- Connection strings and passwords +- Account names and identifiers +- Hostnames in URLs + +Some default sanitizers (AZSDK2003, AZSDK2030, AZSDK3430, AZSDK3493) are explicitly removed to preserve resource identifiers needed for proper request matching during playback. + +## Managing Test Recordings + +### Restore recordings from assets repo + +```bash +test-proxy restore -a assets.json +``` + +### Push new recordings to assets repo + +```bash +test-proxy push -a assets.json +``` + +This creates a new tag in the azure-sdk-assets repository and updates `assets.json` with the new tag reference. diff --git a/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/TranscriptionAsyncClientTest.java b/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/TranscriptionAsyncClientTest.java new file mode 100644 index 000000000000..2d46371ee25d --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/TranscriptionAsyncClientTest.java @@ -0,0 +1,273 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
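+// These tests reuse helpers from TranscriptionClientTestBase: createClient(...) builds the client
+// under test and doTranscription(...) performs the call and validates the result.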
+ +package com.azure.ai.speech.transcription; + +import com.azure.ai.speech.transcription.models.ProfanityFilterMode; +import com.azure.ai.speech.transcription.models.TranscriptionDiarizationOptions; +import com.azure.ai.speech.transcription.models.TranscriptionOptions; +import com.azure.core.exception.HttpResponseException; +import com.azure.core.http.HttpHeaderName; +import com.azure.core.http.rest.RequestOptions; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; + +/** + * Tests for TranscriptionAsyncClient (asynchronous client). + */ +class TranscriptionAsyncClientTest extends TranscriptionClientTestBase { + + private final Boolean sync = false; // All tests in this file use the async client + + /*********************************************************************************** + * + * HAPPY PATH TESTS + * + ***********************************************************************************/ + + @Test + public void testTranscribeAsyncBasicFromFile() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionOptions options = new TranscriptionOptions((String) null); + + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeAsyncWithLanguageFromFile() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionOptions options = new TranscriptionOptions((String) null).setLocales(Arrays.asList("en-US")); + + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeAsyncWithMultipleLanguagesFromFile() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionOptions options + = new TranscriptionOptions((String) null).setLocales(Arrays.asList("en-US", "es-ES", "fr-FR")); + + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeAsyncWithDiarizationFromFile() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionDiarizationOptions diarizationOptions = new TranscriptionDiarizationOptions().setMaxSpeakers(5); + + TranscriptionOptions options + = new TranscriptionOptions((String) null).setDiarizationOptions(diarizationOptions); + + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeAsyncWithProfanityFilterFromFile() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionOptions options + = new TranscriptionOptions((String) null).setProfanityFilterMode(ProfanityFilterMode.MASKED); + + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeAsyncWithChannelsFromFile() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionOptions options = new TranscriptionOptions((String) null).setActiveChannels(Arrays.asList(0)); + + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeAsyncAllOptionsFromFile() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionDiarizationOptions diarizationOptions 
= new TranscriptionDiarizationOptions().setMaxSpeakers(5); + + TranscriptionOptions options = new TranscriptionOptions((String) null).setLocales(Arrays.asList("en-US")) + .setDiarizationOptions(diarizationOptions) + .setProfanityFilterMode(ProfanityFilterMode.MASKED) + .setActiveChannels(Arrays.asList(0)); + + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeAsyncBasicFromFileWithResponse() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionOptions options = new TranscriptionOptions((String) null); + RequestOptions requestOptions + = new RequestOptions().addHeader(HttpHeaderName.fromString("x-custom-header"), "custom-value"); + + doTranscription(methodName, sync, true, audioFile, options, requestOptions); + } + + @Test + public void testTranscribeAsyncWithAllOptionsFromFileWithResponse() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionDiarizationOptions diarizationOptions = new TranscriptionDiarizationOptions().setMaxSpeakers(5); + + TranscriptionOptions options + = new TranscriptionOptions((String) null).setLocales(Arrays.asList("en-US", "es-ES")) + .setDiarizationOptions(diarizationOptions) + .setProfanityFilterMode(ProfanityFilterMode.REMOVED) + .setActiveChannels(Arrays.asList(0, 1)); + + RequestOptions requestOptions + = new RequestOptions().addHeader(HttpHeaderName.fromString("x-custom-header"), "custom-value") + .addQueryParam("test-param", "test-value"); + + doTranscription(methodName, sync, true, audioFile, options, requestOptions); + } + + @Test + public void testTranscribeAsyncWithAudioUrl() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + // Using a publicly accessible sample audio file from Azure samples + String audioUrl + = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-speech-sdk/master/sampledata/audiofiles/aboutSpeechSdk.wav"; + TranscriptionOptions options = new TranscriptionOptions(audioUrl).setLocales(Arrays.asList("en-US")); + + // For URL-based transcription, we don't pass the local audio file path + doTranscriptionWithUrl(methodName, sync, options); + } + + @Test + public void testTranscribeAsyncWithProfanityModeMasked() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionOptions options + = new TranscriptionOptions((String) null).setProfanityFilterMode(ProfanityFilterMode.MASKED); + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeAsyncWithProfanityModeRemoved() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionOptions options + = new TranscriptionOptions((String) null).setProfanityFilterMode(ProfanityFilterMode.REMOVED); + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeAsyncWithProfanityModeTags() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionOptions options + = new TranscriptionOptions((String) null).setProfanityFilterMode(ProfanityFilterMode.TAGS); + doTranscription(methodName, sync, false, audioFile, options, null); + } + + 
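    /**
     * Illustrative sketch, not part of the recorded suite: the blocking doTranscription helper
     * used above calls block() on the async client, but callers can also verify the Mono
     * pipeline without blocking. Assumes reactor-test is available on the test classpath; the
     * method name and assertion here are illustrative, not a recorded test case.
     */
    @SuppressWarnings("unused")
    private void verifyTranscribeNonBlocking(TranscriptionOptions options) {
        // StepVerifier subscribes to the Mono and asserts on the emitted TranscriptionResult.
        reactor.test.StepVerifier.create(getAsyncClient().transcribe(options))
            .assertNext(result -> org.junit.jupiter.api.Assertions.assertNotNull(result.getCombinedPhrases()))
            .verifyComplete();
    }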
    /***********************************************************************************
     *
     * ERROR HANDLING TESTS
     *
     ***********************************************************************************/

    @Test
    public void testTranscribeAsyncWithEmptyAudioData() {
        createClient(true, true, sync);

        String methodName = new Object() {
        }.getClass().getEnclosingMethod().getName();

        // Placeholder for empty-audio coverage: the recorded session uses the standard sample
        // file, so this currently verifies the default flow; replace it with a true empty-audio
        // recording once the service behavior is captured.
        TranscriptionOptions options = new TranscriptionOptions((String) null);

        doTranscription(methodName, sync, false, audioFile, options, null);
    }

    @Test
    public void testTranscribeAsyncWithInvalidLanguageCode() {
        createClient(true, true, sync);

        String methodName = new Object() {
        }.getClass().getEnclosingMethod().getName();

        // Use an invalid language code to trigger a service error
        TranscriptionOptions options
            = new TranscriptionOptions((String) null).setLocales(Arrays.asList("invalid-locale-code"));

        // The service should return a 400 error for the invalid locale.
        // doTranscription wraps exceptions in RuntimeException, so we catch that.
        try {
            doTranscription(methodName, sync, false, audioFile, options, null);
            // Should not reach here - the call above should throw
            throw new AssertionError("Expected RuntimeException with HttpResponseException cause but none was thrown");
        } catch (RuntimeException e) {
            // Expected behavior - verify the cause is HttpResponseException with a 400 status
            if (!(e.getCause() instanceof HttpResponseException)) {
                throw new AssertionError("Expected RuntimeException cause to be HttpResponseException but got: "
                    + (e.getCause() == null ? "null" : e.getCause().getClass().getName()));
            }
            HttpResponseException httpException = (HttpResponseException) e.getCause();
            if (httpException.getResponse().getStatusCode() != 400) {
                throw new AssertionError(
                    "Expected 400 status code but got: " + httpException.getResponse().getStatusCode());
            }
        }
    }

    @Test
    public void testTranscribeAsyncCancellation() {
        createClient(true, true, sync);

        String methodName = new Object() {
        }.getClass().getEnclosingMethod().getName();

        // Placeholder for cancellation coverage: recorded playback cannot interrupt an
        // in-flight request, so this verifies the request completes normally. True
        // cancellation (disposing the subscription before completion) requires a live test.
        TranscriptionOptions options = new TranscriptionOptions((String) null);

        doTranscription(methodName, sync, false, audioFile, options, null);
    }
}
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/TranscriptionClientTest.java b/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/TranscriptionClientTest.java
new file mode 100644
index 000000000000..0d9b261ccf44
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/TranscriptionClientTest.java
@@ -0,0 +1,226 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

package com.azure.ai.speech.transcription;

import com.azure.ai.speech.transcription.models.ProfanityFilterMode;
import com.azure.ai.speech.transcription.models.TranscriptionDiarizationOptions;
import com.azure.ai.speech.transcription.models.TranscriptionOptions;
import com.azure.core.http.HttpHeaderName;
import com.azure.core.http.rest.RequestOptions;
import org.junit.jupiter.api.Test;

import java.util.Arrays;

import static org.junit.jupiter.api.Assertions.assertThrows;

/**
 * Tests for TranscriptionClient (synchronous client).
+ */ +class TranscriptionClientTest extends TranscriptionClientTestBase { + + private final Boolean sync = true; // All tests in this file use the sync client + + /*********************************************************************************** + * + * HAPPY PATH TESTS + * + ***********************************************************************************/ + + @Test + public void testTranscribeSyncBasicFromFile() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionOptions options = new TranscriptionOptions((String) null); + + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeSyncWithLanguageFromFile() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionOptions options = new TranscriptionOptions((String) null).setLocales(Arrays.asList("en-US")); + + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeSyncWithMultipleLanguagesFromFile() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionOptions options + = new TranscriptionOptions((String) null).setLocales(Arrays.asList("en-US", "es-ES", "fr-FR")); + + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeSyncWithDiarizationFromFile() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionDiarizationOptions diarizationOptions = new TranscriptionDiarizationOptions().setMaxSpeakers(5); + + TranscriptionOptions options + = new TranscriptionOptions((String) null).setDiarizationOptions(diarizationOptions); + + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeSyncWithProfanityFilterFromFile() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionOptions options + = new TranscriptionOptions((String) null).setProfanityFilterMode(ProfanityFilterMode.MASKED); + + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeSyncWithChannelsFromFile() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionOptions options = new TranscriptionOptions((String) null).setActiveChannels(Arrays.asList(0)); + + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeSyncAllOptionsFromFile() { + createClient(true, true, sync); + + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + + TranscriptionDiarizationOptions diarizationOptions = new TranscriptionDiarizationOptions().setMaxSpeakers(5); + + TranscriptionOptions options = new TranscriptionOptions((String) null).setLocales(Arrays.asList("en-US")) + .setDiarizationOptions(diarizationOptions) + .setProfanityFilterMode(ProfanityFilterMode.MASKED) + .setActiveChannels(Arrays.asList(0)); + + doTranscription(methodName, sync, false, audioFile, options, null); + } + + @Test + public void testTranscribeSyncBasicFromFileWithResponse() { + createClient(true, true, sync); + + String methodName = new Object() { + 
        }.getClass().getEnclosingMethod().getName();

        TranscriptionOptions options = new TranscriptionOptions((String) null);
        RequestOptions requestOptions
            = new RequestOptions().addHeader(HttpHeaderName.fromString("x-custom-header"), "custom-value");

        doTranscription(methodName, sync, true, audioFile, options, requestOptions);
    }

    @Test
    public void testTranscribeSyncWithAllOptionsFromFileWithResponse() {
        createClient(true, true, sync);

        String methodName = new Object() {
        }.getClass().getEnclosingMethod().getName();

        TranscriptionDiarizationOptions diarizationOptions = new TranscriptionDiarizationOptions().setMaxSpeakers(5);

        TranscriptionOptions options
            = new TranscriptionOptions((String) null).setLocales(Arrays.asList("en-US", "es-ES"))
                .setDiarizationOptions(diarizationOptions)
                .setProfanityFilterMode(ProfanityFilterMode.REMOVED)
                .setActiveChannels(Arrays.asList(0, 1));

        RequestOptions requestOptions
            = new RequestOptions().addHeader(HttpHeaderName.fromString("x-custom-header"), "custom-value")
                .addQueryParam("test-param", "test-value");

        doTranscription(methodName, sync, true, audioFile, options, requestOptions);
    }

    @Test
    public void testTranscribeSyncWithMultipleChannels() {
        // Test with multiple channel indices
        createClient(true, true, sync);

        String methodName = new Object() {
        }.getClass().getEnclosingMethod().getName();

        TranscriptionOptions options = new TranscriptionOptions((String) null).setActiveChannels(Arrays.asList(0, 1));

        doTranscription(methodName, sync, false, audioFile, options, null);
    }

    @Test
    public void testTranscribeSyncWithAudioUrl() {
        createClient(true, true, sync);

        String methodName = new Object() {
        }.getClass().getEnclosingMethod().getName();

        // Using a publicly accessible sample audio file from Azure samples
        String audioUrl
            = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-speech-sdk/master/sampledata/audiofiles/aboutSpeechSdk.wav";
        TranscriptionOptions options = new TranscriptionOptions(audioUrl).setLocales(Arrays.asList("en-US"));

        // For URL-based transcription, we don't pass the local audio file path
        doTranscriptionWithUrl(methodName, sync, options);
    }

    /***********************************************************************************
     *
     * ERROR HANDLING TESTS
     *
     ***********************************************************************************/

    @Test
    public void testTranscribeSyncWithNullOptions() {
        createClient(true, true, sync);

        // Test that null options throws appropriate exception
        assertThrows(NullPointerException.class, () -> {
            getClient().transcribe((TranscriptionOptions) null);
        }, "Transcribe should throw NullPointerException when options is null");
    }

    @Test
    public void testTranscribeSyncWithEmptyAudioData() {
        createClient(true, true, sync);

        // Intentional placeholder: submitting empty audio data should produce a service error
        // (likely an HttpResponseException), but no session has been recorded for it yet.
        // Implement and record this test once the actual service response is confirmed.
    }

    @Test
    public void testTranscribeSyncWithInvalidLanguageCode() {
        createClient(true, true, sync);

        // Intentional placeholder: verifying this requires a recorded service call; in
        // PLAYBACK mode it would replay the recorded error response for the invalid locale
        // (the async counterpart asserts a 400 HttpResponseException). Example implementation:
        // TranscriptionOptions options = new TranscriptionOptions((String) null)
        //     .setLocales(Arrays.asList("invalid-locale"));
        // doTranscription(methodName, sync, false, audioFile, options, null);
    }
}
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/TranscriptionClientTestBase.java b/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/TranscriptionClientTestBase.java
new file mode 100644
index 000000000000..7a8b8c1ec0f8
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/TranscriptionClientTestBase.java
@@ -0,0 +1,337 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

package com.azure.ai.speech.transcription;

import com.azure.ai.speech.transcription.models.AudioFileDetails;
import com.azure.ai.speech.transcription.models.TranscriptionOptions;
import com.azure.ai.speech.transcription.models.TranscriptionResult;
import com.azure.core.credential.KeyCredential;
import com.azure.core.credential.TokenCredential;
import com.azure.core.http.HttpRequest;
import com.azure.core.http.policy.HttpLogDetailLevel;
import com.azure.core.http.policy.HttpLogOptions;
import com.azure.core.http.rest.RequestOptions;
import com.azure.core.http.rest.Response;
import com.azure.core.test.TestMode;
import com.azure.core.test.TestProxyTestBase;
import com.azure.core.util.BinaryData;
import com.azure.core.util.Configuration;
import com.azure.core.util.logging.ClientLogger;
import com.azure.identity.DefaultAzureCredentialBuilder;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;

import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * Base class for TranscriptionClient tests. Contains helper methods and common test infrastructure.
 * Supports both API Key (KeyCredential) and Entra ID (TokenCredential) authentication.
 */
class TranscriptionClientTestBase extends TestProxyTestBase {
    private static final ClientLogger LOGGER = new ClientLogger(TranscriptionClientTestBase.class);

    final Boolean printResults = false; // Set to true to print results to console window

    // Sample audio file for testing
    final String audioFile = "./src/test/java/com/azure/ai/speech/transcription/sample.wav";

    // The clients that will be used for tests
    private TranscriptionClient client = null;
    private TranscriptionAsyncClient asyncClient = null;

    /**
     * Sets up the test resources before each test.
     */
    @BeforeEach
    public void setupTest() {
        // Reset clients before each test to ensure clean state
        client = null;
        asyncClient = null;
    }

    /**
     * Cleans up test resources after each test.
     */
    @AfterEach
    public void cleanupTest() {
        // No explicit cleanup required; the clients are managed by the test framework.
    }

    /**
     * Creates a client for testing.
     *
     * @param useKeyAuth Whether to use key-based authentication (true) or token-based authentication (false)
     * @param useRealKey Whether to use a real key from environment variables (true) or a fake key (false).
     * Only applies when useKeyAuth is true.
     * @param sync Whether to create a synchronous client (true) or asynchronous client (false)
     */
    protected void createClient(Boolean useKeyAuth, Boolean useRealKey, Boolean sync) {
        TestMode testMode = getTestMode();

        // Define endpoint and auth credentials
        String endpoint = "https://fake-resource-name.cognitiveservices.azure.com";
        String key = "00000000000000000000000000000000";

        if (testMode == TestMode.LIVE || testMode == TestMode.RECORD) {
            endpoint = Configuration.getGlobalConfiguration().get("SPEECH_ENDPOINT");
            assertTrue(endpoint != null && !endpoint.isEmpty(), "Endpoint URL is required to run live tests.");

            if (useKeyAuth && useRealKey) {
                key = Configuration.getGlobalConfiguration().get("SPEECH_API_KEY");
                assertTrue(key != null && !key.isEmpty(), "API key is required to run live tests with KeyCredential.");
            }
        }

        // Create the client builder
        TranscriptionClientBuilder transcriptionClientBuilder = new TranscriptionClientBuilder().endpoint(endpoint)
            .httpLogOptions(new HttpLogOptions().setLogLevel(HttpLogDetailLevel.BODY_AND_HEADERS));

        // Update the client builder with credentials and recording/playback policies
        if (testMode == TestMode.LIVE) {
            if (useKeyAuth) {
                transcriptionClientBuilder.credential(new KeyCredential(key));
            } else {
                // Use Entra ID authentication (TokenCredential)
                TokenCredential credential = new DefaultAzureCredentialBuilder().build();
                transcriptionClientBuilder.credential(credential);
            }
        } else if (testMode == TestMode.RECORD) {
            transcriptionClientBuilder.addPolicy(interceptorManager.getRecordPolicy());
            if (useKeyAuth) {
                transcriptionClientBuilder.credential(new KeyCredential(key));
            } else {
                TokenCredential credential = new DefaultAzureCredentialBuilder().build();
                transcriptionClientBuilder.credential(credential);
            }
        } else if (testMode == TestMode.PLAYBACK) {
            transcriptionClientBuilder.httpClient(interceptorManager.getPlaybackClient());
            // In playback mode, use a fake key regardless of authentication method
            transcriptionClientBuilder.credential(new KeyCredential(key));
        }

        // Configure sanitizers - must be done after registering the record policy or playback client
        if (!interceptorManager.isLiveMode()) {
            // Remove default sanitizers that would interfere with Speech service recordings:
            // - AZSDK3430 (id sanitizer): Preserve resource identifiers needed for request matching
            // - AZSDK3493 (name sanitizer): Preserve resource names needed for request matching
            // - AZSDK2003, AZSDK2030: URI-related sanitizers that may affect Speech endpoints
            interceptorManager.removeSanitizers("AZSDK2003", "AZSDK2030", "AZSDK3430", "AZSDK3493");
        }

        if (sync) {
            client = transcriptionClientBuilder.buildClient();
        } else {
            asyncClient = transcriptionClientBuilder.buildAsyncClient();
        }
    }

    /**
     * Performs transcription with audio URL and validates the result.
     *
     * @param testName A label that uniquely defines the test. Used in console printout.
     * @param sync 'true' to use synchronous client, 'false' to use asynchronous client.
     * @param options TranscriptionOptions with audioUrl set
     */
    protected void doTranscriptionWithUrl(String testName, Boolean sync, TranscriptionOptions options) {
        try {
            // Verify that audioUrl is set
            assertNotNull(options.getAudioUrl(), "AudioUrl must be set for URL-based transcription");
            assertFalse(options.getAudioUrl().isEmpty(), "AudioUrl must not be empty");

            TranscriptionResult result = null;
            if (sync) {
                result = client.transcribe(options);
            } else {
                result = asyncClient.transcribe(options).block();
            }

            validateTranscriptionResult(testName, result);
        } catch (Exception e) {
            LOGGER.error("Error in test {}: {}", testName, e.getMessage());
            throw new RuntimeException(e);
        }
    }

    /**
     * Performs transcription and validates the result.
     *
     * @param testName A label that uniquely defines the test. Used in console printout.
     * @param sync 'true' to use synchronous client, 'false' to use asynchronous client.
     * @param transcribeWithResponse 'true' to use transcribeWithResponse(), 'false' to use transcribe().
     * @param audioFilePath Path to the audio file to transcribe
     * @param options TranscriptionOptions (must not be null; pass options built with a null audio URL
     * to have the local audio file attached here)
     * @param requestOptions RequestOptions (can be null)
     */
    protected void doTranscription(String testName, Boolean sync, Boolean transcribeWithResponse, String audioFilePath,
        TranscriptionOptions options, RequestOptions requestOptions) {

        try {
            // Load audio file
            byte[] audioData = Files.readAllBytes(Paths.get(audioFilePath));
            AudioFileDetails audioFileDetails
                = new AudioFileDetails(BinaryData.fromBytes(audioData)).setFilename(new File(audioFilePath).getName());

            // If the options carry no audio URL, rebuild them around the local audio file,
            // preserving every other configured setting.
            if (options.getAudioUrl() == null) {
                options = new TranscriptionOptions(audioFileDetails).setLocales(options.getLocales())
                    .setLocaleModelMapping(options.getLocaleModelMapping())
                    .setProfanityFilterMode(options.getProfanityFilterMode())
                    .setDiarizationOptions(options.getDiarizationOptions())
                    .setActiveChannels(options.getActiveChannels())
                    .setEnhancedModeOptions(options.getEnhancedModeOptions())
                    .setPhraseListOptions(options.getPhraseListOptions());
            }

            if (sync) {
                TranscriptionResult result = null;
                if (!transcribeWithResponse) {
                    result = client.transcribe(options);
                } else {
                    if (requestOptions == null) {
                        // Use the transcribeWithResponse(TranscriptionOptions) convenience method
                        Response<TranscriptionResult> response = client.transcribeWithResponse(options);
                        printHttpRequestAndResponse(response);
                        result = response.getValue();
                    } else {
                        // When custom RequestOptions are needed, use the lower-level API
                        BinaryData multipartBody
                            = new com.azure.ai.speech.transcription.implementation.MultipartFormDataHelper(
                                requestOptions)
                                .serializeJsonField("definition", options)
                                .serializeFileField("audio", audioFileDetails.getContent(),
                                    audioFileDetails.getContentType(), audioFileDetails.getFilename())
                                .end()
                                .getRequestBody();
                        Response<BinaryData> response = client.transcribeWithResponse(multipartBody, requestOptions);
                        printHttpRequestAndResponse(response);
                        result = response.getValue().toObject(TranscriptionResult.class);
                    }
                }
                validateTranscriptionResult(testName, result);
            } else {
                TranscriptionResult result = null;
                if (!transcribeWithResponse) {
                    result = asyncClient.transcribe(options).block();
                } else {
                    if (requestOptions == null) {
                        // Use the transcribeWithResponse(TranscriptionOptions) convenience method
                        Response<TranscriptionResult> response = asyncClient.transcribeWithResponse(options).block();
                        printHttpRequestAndResponse(response);
                        result = response.getValue();
                    } else {
                        // When custom RequestOptions are needed, use the lower-level API
                        BinaryData multipartBody
                            = new com.azure.ai.speech.transcription.implementation.MultipartFormDataHelper(
                                requestOptions)
                                .serializeJsonField("definition", options)
                                .serializeFileField("audio", audioFileDetails.getContent(),
                                    audioFileDetails.getContentType(), audioFileDetails.getFilename())
                                .end()
                                .getRequestBody();
                        Response<BinaryData> response
                            = asyncClient.transcribeWithResponse(multipartBody, requestOptions).block();
                        printHttpRequestAndResponse(response);
                        result = response.getValue().toObject(TranscriptionResult.class);
                    }
                }
                validateTranscriptionResult(testName, result);
            }
        } catch (Exception e) {
            LOGGER.error("Error in test {}: {}", testName, e.getMessage());
            throw new RuntimeException(e);
        }
    }

    /**
     * Validates the transcription result.
     *
     * @param testName The name of the test
     * @param result The transcription result to validate
     */
    protected void validateTranscriptionResult(String testName, TranscriptionResult result) {
        if (printResults) {
            System.out.println("\n===== Test: " + testName + " =====");
            System.out.println("Duration: " + result.getDuration().toMillis() + "ms");
            if (result.getCombinedPhrases() != null) {
                result.getCombinedPhrases().forEach(phrase -> {
                    System.out.println("Channel " + phrase.getChannel() + ": " + phrase.getText());
                });
            }
            if (result.getPhrases() != null) {
                result.getPhrases().forEach(phrase -> {
                    System.out.println("Phrase: " + phrase.getText() + " (confidence: " + phrase.getConfidence() + ")");
                });
            }
        }

        // Basic validation
        assertNotNull(result, "Transcription result should not be null");
        assertNotNull(result.getDuration(), "Duration should not be null");
        assertTrue(result.getDuration().toMillis() > 0, "Duration should be greater than 0");
        assertNotNull(result.getCombinedPhrases(), "Combined phrases should not be null");
        assertFalse(result.getCombinedPhrases().isEmpty(), "Combined phrases should not be empty");
        assertNotNull(result.getPhrases(), "Phrases should not be null");
        assertFalse(result.getPhrases().isEmpty(), "Phrases should not be empty");

        // Validate combined phrases
        result.getCombinedPhrases().forEach(phrase -> {
            assertNotNull(phrase.getText(), "Combined phrase text should not be null");
            assertFalse(phrase.getText().isEmpty(), "Combined phrase text should not be empty");
        });

        // Validate phrases
        result.getPhrases().forEach(phrase -> {
            assertNotNull(phrase.getText(), "Phrase text should not be null");
            assertFalse(phrase.getText().isEmpty(), "Phrase text should not be empty");
            assertTrue(phrase.getConfidence() >= 0 && phrase.getConfidence() <= 1,
                "Confidence should be between 0 and 1");
            assertTrue(phrase.getOffset() >= 0, "Offset should be non-negative");
            assertTrue(phrase.getDuration().toMillis() > 0, "Phrase duration should be positive");
        });
    }

    /**
     * Prints HTTP request and response details for debugging.
     *
     * @param response The HTTP response
     */
    protected void printHttpRequestAndResponse(Response<?> response) {
        if (printResults) {
            HttpRequest request = response.getRequest();
            System.out.println("\n===== HTTP Request =====");
            System.out.println(request.getHttpMethod() + " " + request.getUrl());
            request.getHeaders().forEach(header -> System.out.println(header.getName() + ": " + header.getValue()));

            System.out.println("\n===== HTTP Response =====");
            System.out.println("Status Code: " + response.getStatusCode());
            response.getHeaders().forEach(header -> System.out.println(header.getName() + ": " + header.getValue()));
        }
    }

    /**
     * Gets the synchronous client.
     *
     * @return The TranscriptionClient
     */
    protected TranscriptionClient getClient() {
        return client;
    }

    /**
     * Gets the asynchronous client.
     *
     * @return The TranscriptionAsyncClient
     */
    protected TranscriptionAsyncClient getAsyncClient() {
        return asyncClient;
    }
}
diff --git a/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/generated/TranscriptionClientTestBase.java b/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/generated/TranscriptionClientTestBase.java
new file mode 100644
index 000000000000..2e70366d4ec1
--- /dev/null
+++ b/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/generated/TranscriptionClientTestBase.java
@@ -0,0 +1,41 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
// Code generated by Microsoft (R) TypeSpec Code Generator.

package com.azure.ai.speech.transcription.generated;

// The Java test files under 'generated' package are generated for your reference.
// If you wish to modify these files, please copy them out of the 'generated' package, and modify there.
// See https://aka.ms/azsdk/dpg/java/tests for guide on adding a test.
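// For example, the handwritten TranscriptionClientTestBase in the parent
// com.azure.ai.speech.transcription package adapts this scaffold with KeyCredential support,
// test-proxy sanitizer overrides, and the shared doTranscription helpers.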
+ +import com.azure.ai.speech.transcription.TranscriptionClient; +import com.azure.ai.speech.transcription.TranscriptionClientBuilder; +import com.azure.core.http.policy.HttpLogDetailLevel; +import com.azure.core.http.policy.HttpLogOptions; +import com.azure.core.test.TestMode; +import com.azure.core.test.TestProxyTestBase; +import com.azure.core.test.utils.MockTokenCredential; +import com.azure.core.util.Configuration; +import com.azure.identity.DefaultAzureCredentialBuilder; + +class TranscriptionClientTestBase extends TestProxyTestBase { + protected TranscriptionClient transcriptionClient; + + @Override + protected void beforeTest() { + TranscriptionClientBuilder transcriptionClientbuilder = new TranscriptionClientBuilder() + .endpoint(Configuration.getGlobalConfiguration().get("ENDPOINT", "endpoint")) + .httpClient(getHttpClientOrUsePlayback(getHttpClients().findFirst().orElse(null))) + .httpLogOptions(new HttpLogOptions().setLogLevel(HttpLogDetailLevel.BASIC)); + if (getTestMode() == TestMode.PLAYBACK) { + transcriptionClientbuilder.credential(new MockTokenCredential()); + } else if (getTestMode() == TestMode.RECORD) { + transcriptionClientbuilder.addPolicy(interceptorManager.getRecordPolicy()) + .credential(new DefaultAzureCredentialBuilder().build()); + } else if (getTestMode() == TestMode.LIVE) { + transcriptionClientbuilder.credential(new DefaultAzureCredentialBuilder().build()); + } + transcriptionClient = transcriptionClientbuilder.buildClient(); + + } +} diff --git a/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/sample.wav b/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/sample.wav new file mode 100644 index 000000000000..bf23d54b0c00 Binary files /dev/null and b/sdk/transcription/azure-ai-speech-transcription/src/test/java/com/azure/ai/speech/transcription/sample.wav differ diff --git a/sdk/transcription/azure-ai-speech-transcription/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker b/sdk/transcription/azure-ai-speech-transcription/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker new file mode 100644 index 000000000000..1f0955d450f0 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker @@ -0,0 +1 @@ +mock-maker-inline diff --git a/sdk/transcription/azure-ai-speech-transcription/tsp-location.yaml b/sdk/transcription/azure-ai-speech-transcription/tsp-location.yaml new file mode 100644 index 000000000000..ed7b497e5fa9 --- /dev/null +++ b/sdk/transcription/azure-ai-speech-transcription/tsp-location.yaml @@ -0,0 +1,4 @@ +directory: specification/cognitiveservices/Speech.Transcription +commit: 67019b34b001ae6b8429ee983b9697465d721d0b +repo: Azure/azure-rest-api-specs +additionalDirectories: diff --git a/sdk/transcription/ci.yml b/sdk/transcription/ci.yml new file mode 100644 index 000000000000..bbf4e187a7f2 --- /dev/null +++ b/sdk/transcription/ci.yml @@ -0,0 +1,46 @@ +# NOTE: Please refer to https://aka.ms/azsdk/engsys/ci-yaml before editing this file. 
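+# The path filters below scope CI and PR runs to this service directory; the pom.xml files
+# are excluded so that routine repo-wide version increments, which touch every service pom,
+# do not retrigger this pipeline.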
+
+trigger:
+  branches:
+    include:
+      - main
+      - hotfix/*
+      - release/*
+  paths:
+    include:
+      - sdk/transcription/ci.yml
+      - sdk/transcription/azure-ai-speech-transcription/
+    exclude:
+      - sdk/transcription/pom.xml
+      - sdk/transcription/azure-ai-speech-transcription/pom.xml
+
+pr:
+  branches:
+    include:
+      - main
+      - feature/*
+      - hotfix/*
+      - release/*
+  paths:
+    include:
+      - sdk/transcription/ci.yml
+      - sdk/transcription/azure-ai-speech-transcription/
+    exclude:
+      - sdk/transcription/pom.xml
+      - sdk/transcription/azure-ai-speech-transcription/pom.xml
+
+parameters:
+  - name: release_azureaispeechtranscription
+    displayName: "azure-ai-speech-transcription"
+    type: boolean
+    default: true
+
+extends:
+  template: ../../eng/pipelines/templates/stages/archetype-sdk-client.yml
+  parameters:
+    ServiceDirectory: transcription
+    Artifacts:
+      - name: azure-ai-speech-transcription
+        groupId: com.azure
+        safeName: azureaispeechtranscription
+        releaseInBatch: ${{ parameters.release_azureaispeechtranscription }}
diff --git a/sdk/transcription/pom.xml b/sdk/transcription/pom.xml
new file mode 100644
index 000000000000..5736ddc377da
--- /dev/null
+++ b/sdk/transcription/pom.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>com.azure</groupId>
+  <artifactId>azure-transcription-service</artifactId>
+  <packaging>pom</packaging>
+  <version>1.0.0</version>
+
+  <modules>
+    <module>azure-ai-speech-transcription</module>
+  </modules>
+</project>
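With the aggregator POM above in place, the new module can be built and its tests run locally with `mvn install -pl sdk/transcription/azure-ai-speech-transcription -am` from the repository root (standard Maven flags; the recorded tests run in playback mode when `AZURE_TEST_MODE` is unset).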