From 92e9832d032a0cab76d3f553365a05b670a78e64 Mon Sep 17 00:00:00 2001 From: "Fei Zhao (SPEECH)" Date: Mon, 20 Oct 2025 14:17:17 +0800 Subject: [PATCH 01/14] initial runnable version --- .../src/Commands/SpeechJsonContext.cs | 4 + .../src/Commands/Tts/TtsSynthesizeCommand.cs | 172 ++++++++ .../src/Models/SynthesisResult.cs | 27 ++ .../src/Options/SpeechOptionDefinitions.cs | 22 + .../src/Options/Tts/TtsSynthesizeOptions.cs | 27 ++ .../src/Services/ISpeechService.cs | 10 + .../src/Services/SpeechService.cs | 416 ++++++++++++++++++ .../Azure.Mcp.Tools.Speech/src/SpeechSetup.cs | 12 + .../Tts/TtsSynthesizeCommandTests.cs | 283 ++++++++++++ 9 files changed, 973 insertions(+) create mode 100644 tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs create mode 100644 tools/Azure.Mcp.Tools.Speech/src/Models/SynthesisResult.cs create mode 100644 tools/Azure.Mcp.Tools.Speech/src/Options/Tts/TtsSynthesizeOptions.cs create mode 100644 tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs diff --git a/tools/Azure.Mcp.Tools.Speech/src/Commands/SpeechJsonContext.cs b/tools/Azure.Mcp.Tools.Speech/src/Commands/SpeechJsonContext.cs index dffbc8dff..4932d7830 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Commands/SpeechJsonContext.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Commands/SpeechJsonContext.cs @@ -3,6 +3,7 @@ using System.Text.Json.Serialization; using Azure.Mcp.Tools.Speech.Commands.Stt; +using Azure.Mcp.Tools.Speech.Commands.Tts; using Azure.Mcp.Tools.Speech.Models; using Azure.Mcp.Tools.Speech.Models.FastTranscription; using Azure.Mcp.Tools.Speech.Models.Realtime; @@ -24,6 +25,9 @@ [JsonSerializable(typeof(SpeechRecognitionResult))] [JsonSerializable(typeof(SttRecognizeCommand.SttRecognizeCommandResult))] +[JsonSerializable(typeof(SynthesisResult))] +[JsonSerializable(typeof(TtsSynthesizeCommand.TtsSynthesizeCommandResult))] +[JsonSerializable(typeof(WordResult))] [JsonSourceGenerationOptions( 
/// <summary>
/// Command that converts text to speech with Azure AI Services Speech and writes the audio to a local file.
/// </summary>
/// <param name="logger">Logger used to record synthesis success and failure.</param>
public sealed class TtsSynthesizeCommand(ILogger<TtsSynthesizeCommand> logger) : BaseSpeechCommand<TtsSynthesizeOptions>()
{
    internal record TtsSynthesizeCommandResult(SynthesisResult Result);

    private const string CommandTitle = "Synthesize Speech from Text";

    // Locale shape accepted for --language: a 2-3 letter language subtag followed by one or two
    // further subtags (region, script or dialect), e.g. en-US, fil-PH, sr-Latn-RS, zh-CN-shandong.
    // Built once instead of on every validation pass.
    private static readonly System.Text.RegularExpressions.Regex s_languagePattern = new(
        @"^[a-z]{2,3}(-[A-Za-z0-9]{2,8}){1,2}\z",
        System.Text.RegularExpressions.RegexOptions.CultureInvariant);

    private readonly ILogger<TtsSynthesizeCommand> _logger = logger;

    public override string Name => "synthesize";

    public override string Description =>
        """
        Convert text to speech using Azure AI Services Speech. This command takes text input and generates an audio file using advanced neural text-to-speech capabilities.
        You must provide an Azure AI Services endpoint (e.g., https://your-service.cognitiveservices.azure.com/), the text to convert, and an output file path.
        Optional parameters include language specification (default: en-US), voice selection, audio output format (default: Riff24Khz16BitMonoPcm), and custom voice endpoint ID.
        The command supports a wide variety of output formats and neural voices for natural-sounding speech synthesis.
        """;

    public override string Title => CommandTitle;

    public override ToolMetadata Metadata => new()
    {
        Destructive = false,
        Idempotent = true,
        OpenWorld = false,
        ReadOnly = false,
        LocalRequired = true, // Requires local file output
        Secret = false
    };

    /// <summary>
    /// Registers the synthesis options and a command-level validator for text, output file and language.
    /// </summary>
    /// <param name="command">The command definition to add options and validators to.</param>
    protected override void RegisterOptions(Command command)
    {
        base.RegisterOptions(command);

        command.Options.Add(SpeechOptionDefinitions.Text);
        command.Options.Add(SpeechOptionDefinitions.File);
        command.Options.Add(SpeechOptionDefinitions.Language);
        command.Options.Add(SpeechOptionDefinitions.Voice);
        command.Options.Add(SpeechOptionDefinitions.Format);
        command.Options.Add(SpeechOptionDefinitions.EndpointId);

        // Command-level validation
        command.Validators.Add(commandResult =>
        {
            var textValue = commandResult.GetValueOrDefault(SpeechOptionDefinitions.Text);
            if (string.IsNullOrWhiteSpace(textValue))
            {
                commandResult.AddError("Text cannot be empty or whitespace.");
            }

            var fileValue = commandResult.GetValueOrDefault(SpeechOptionDefinitions.File);
            if (string.IsNullOrWhiteSpace(fileValue))
            {
                commandResult.AddError("Output file path cannot be empty.");
            }
            else
            {
                // Only .wav output paths are accepted, regardless of the requested --format.
                var extension = Path.GetExtension(fileValue).ToLowerInvariant();
                if (extension != ".wav")
                {
                    commandResult.AddError($"Output file must have .wav extension. Got: {extension}");
                }
            }

            // Language is optional; when present it must look like a locale tag.
            var languageValue = commandResult.GetValueOrDefault(SpeechOptionDefinitions.Language);
            if (!string.IsNullOrEmpty(languageValue) && !s_languagePattern.IsMatch(languageValue))
            {
                commandResult.AddError($"Language must be in format 'xx-XX' (e.g., 'en-US', 'es-ES'). Got: {languageValue}");
            }
        });
    }

    /// <summary>
    /// Copies the parsed command-line values onto a <see cref="TtsSynthesizeOptions"/> instance.
    /// </summary>
    /// <param name="parseResult">The parse result for the current invocation.</param>
    /// <returns>The populated options.</returns>
    protected override TtsSynthesizeOptions BindOptions(ParseResult parseResult)
    {
        var options = base.BindOptions(parseResult);
        options.Text = parseResult.GetValueOrDefault<string>(SpeechOptionDefinitions.Text.Name);
        options.File = parseResult.GetValueOrDefault<string>(SpeechOptionDefinitions.File.Name);
        options.Language = parseResult.GetValueOrDefault<string>(SpeechOptionDefinitions.Language.Name);
        options.Voice = parseResult.GetValueOrDefault<string>(SpeechOptionDefinitions.Voice.Name);
        options.Format = parseResult.GetValueOrDefault<string>(SpeechOptionDefinitions.Format.Name);
        options.EndpointId = parseResult.GetValueOrDefault<string>(SpeechOptionDefinitions.EndpointId.Name);

        return options;
    }

    /// <summary>
    /// Validates the input, invokes <see cref="ISpeechService.SynthesizeSpeechToFile"/> and stores the
    /// outcome (or the mapped error) on the command response.
    /// </summary>
    /// <param name="context">Command context providing services and the response object.</param>
    /// <param name="parseResult">The parse result for the current invocation.</param>
    /// <returns>The command response held by <paramref name="context"/>.</returns>
    public override async Task<CommandResponse> ExecuteAsync(CommandContext context, ParseResult parseResult)
    {
        if (!Validate(parseResult.CommandResult, context.Response).IsValid)
        {
            return context.Response;
        }

        var options = BindOptions(parseResult);

        try
        {
            var speechService = context.GetService<ISpeechService>();
            var result = await speechService.SynthesizeSpeechToFile(
                options.Endpoint!,
                options.Text!,
                options.File!,
                options.Language,
                options.Voice,
                options.Format,
                options.EndpointId,
                options.RetryPolicy);

            _logger.LogInformation(
                "Successfully synthesized speech to file: {File}. Audio length: {Length} bytes, Voice: {Voice}",
                result.FilePath,
                result.AudioLength,
                result.Voice);

            context.Response.Status = HttpStatusCode.OK;
            context.Response.Message = "Speech synthesis completed successfully.";
            context.Response.Results = ResponseResult.Create(
                new TtsSynthesizeCommandResult(result),
                SpeechJsonContext.Default.TtsSynthesizeCommandResult);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error synthesizing speech to file: {File}", options.File);
            HandleException(context, ex);
        }

        return context.Response;
    }

    /// <summary>Maps known exception types to user-facing error messages.</summary>
    protected override string GetErrorMessage(Exception ex) => ex switch
    {
        ArgumentException argEx => $"Invalid parameter: {argEx.Message}",
        UnauthorizedAccessException => "Access denied. Check Azure AI Services credentials and permissions.",
        // DirectoryNotFoundException derives from IOException, so it must be matched first.
        DirectoryNotFoundException => "Output directory not found. Ensure the directory exists before synthesizing.",
        IOException ioEx => $"File operation failed: {ioEx.Message}",
        _ => base.GetErrorMessage(ex)
    };

    /// <summary>Maps known exception types to HTTP status codes for the response.</summary>
    protected override HttpStatusCode GetStatusCode(Exception ex) => ex switch
    {
        ArgumentException => HttpStatusCode.BadRequest,
        UnauthorizedAccessException => HttpStatusCode.Unauthorized,
        DirectoryNotFoundException => HttpStatusCode.NotFound,
        IOException => HttpStatusCode.InternalServerError,
        HttpRequestException => HttpStatusCode.ServiceUnavailable,
        TimeoutException => HttpStatusCode.GatewayTimeout,
        InvalidOperationException => HttpStatusCode.InternalServerError,
        _ => base.GetStatusCode(ex)
    };
}
/// <summary>
/// Describes the outcome of a text-to-speech request whose audio was written to a file.
/// </summary>
public record SynthesisResult
{
    /// <summary>Path of the audio file the synthesized speech was written to.</summary>
    [JsonPropertyName("filePath")]
    public string? FilePath { get; set; }

    /// <summary>
    /// Duration value in ticks. NOTE(review): the speech service in this change fills this with the
    /// elapsed time of the synthesis call, not the playback length of the audio - confirm intent.
    /// </summary>
    [JsonPropertyName("duration")]
    public long Duration { get; set; }

    /// <summary>Size of the synthesized audio data, in bytes.</summary>
    [JsonPropertyName("audioLength")]
    public int AudioLength { get; set; }

    /// <summary>Name of the audio output format (for example <c>Riff24Khz16BitMonoPcm</c>).</summary>
    [JsonPropertyName("format")]
    public string? Format { get; set; }

    /// <summary>Voice name used for synthesis (for example <c>en-US-JennyNeural</c>).</summary>
    [JsonPropertyName("voice")]
    public string? Voice { get; set; }

    /// <summary>Language used for synthesis (for example <c>en-US</c>).</summary>
    [JsonPropertyName("language")]
    public string? Language { get; set; }
}
/// <summary>
/// Options for the text-to-speech synthesize command, in addition to those inherited from
/// <see cref="BaseSpeechOptions"/>.
/// </summary>
public class TtsSynthesizeOptions : BaseSpeechOptions
{
    /// <summary>The text to convert to speech.</summary>
    [JsonPropertyName(SpeechOptionDefinitions.TextName)]
    public string? Text { get; set; }

    /// <summary>Path of the output audio file.</summary>
    [JsonPropertyName(SpeechOptionDefinitions.FileName)]
    public string? File { get; set; }

    /// <summary>Language for synthesis, e.g. <c>en-US</c>.</summary>
    [JsonPropertyName(SpeechOptionDefinitions.LanguageName)]
    public string? Language { get; set; }

    /// <summary>The voice to use for speech synthesis, e.g. <c>en-US-JennyNeural</c>.</summary>
    [JsonPropertyName(SpeechOptionDefinitions.VoiceName)]
    public string? Voice { get; set; }

    /// <summary>Audio output format name, e.g. <c>Riff24Khz16BitMonoPcm</c>.</summary>
    [JsonPropertyName(SpeechOptionDefinitions.FormatName)]
    public string? Format { get; set; }

    /// <summary>The endpoint ID of a custom voice model for speech synthesis.</summary>
    [JsonPropertyName(SpeechOptionDefinitions.EndpointIdName)]
    public string? EndpointId { get; set; }
}
retryPolicy = null); } diff --git a/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs b/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs index ff49d3142..a7ef89678 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs @@ -81,4 +81,420 @@ public async Task RecognizeSpeechFromFile( throw; } } + + /// + /// Determines if the cancellation details indicate an invalid endpoint error. + /// + /// The cancellation details from the speech recognition + /// True if the error indicates an invalid endpoint, false otherwise + private static bool IsInvalidEndpointError(CancellationDetails cancellationDetails) + { + // Check for common error codes that indicate endpoint issues + return cancellationDetails.Reason == CancellationReason.Error && + (cancellationDetails.ErrorCode == CancellationErrorCode.ConnectionFailure || + cancellationDetails.ErrorCode == CancellationErrorCode.AuthenticationFailure || + cancellationDetails.ErrorCode == CancellationErrorCode.Forbidden || + cancellationDetails.ErrorDetails?.Contains("endpoint", StringComparison.OrdinalIgnoreCase) == true || + cancellationDetails.ErrorDetails?.Contains("connection", StringComparison.OrdinalIgnoreCase) == true || + cancellationDetails.ErrorDetails?.Contains("network", StringComparison.OrdinalIgnoreCase) == true); + } + + /// + /// Creates an AudioConfig from a file, automatically detecting the format based on file extension. + /// Supports WAV, MP3, OPUS/OGG, FLAC, and other common audio formats using GStreamer when available. 
+ /// + /// Path to the audio file + /// AudioConfig configured for the specified audio file + /// Thrown when compressed audio format is used but GStreamer is not properly configured + private static AudioConfig CreateAudioConfigFromFile(string filePath) + { + var extension = Path.GetExtension(filePath).ToLowerInvariant(); + + // WAV files don't require GStreamer + if (extension == ".wav") + { + return AudioConfig.FromWavFileInput(filePath); + } + + // For compressed formats, check if GStreamer is available + var isCompressedFormat = extension is ".mp3" or ".ogg" or ".opus" or ".flac" or ".alaw" or ".mulaw" or ".mp4" or ".m4a" or ".aac"; + + if (isCompressedFormat) + { + return extension switch + { + ".mp3" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.MP3), + ".ogg" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.OGG_OPUS), + ".opus" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.OGG_OPUS), + ".flac" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.FLAC), + ".alaw" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.ALAW), + ".mulaw" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.MULAW), + ".mp4" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.ANY), + ".m4a" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.ANY), + ".aac" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.ANY), + _ => throw new NotSupportedException($"Audio format {extension} is not supported") + }; + } + + // Throw exception for unsupported formats + throw new NotSupportedException($"Audio format '{extension}' is not supported. Supported formats are: .wav, .mp3, .ogg, .opus, .flac, .alaw, .mulaw, .mp4, .m4a, .aac"); + } + + /// + /// Creates an AudioConfig for compressed audio formats using PullAudioInputStream. + /// Requires GStreamer to be installed and available in the system PATH. 
+ /// + /// Path to the compressed audio file + /// The audio container format + /// AudioConfig configured for the compressed audio file + private static AudioConfig CreateCompressedAudioConfig(string filePath, AudioStreamContainerFormat containerFormat) + { + // Create compressed audio stream format + var audioFormat = AudioStreamFormat.GetCompressedFormat(containerFormat); + + // Create a custom PullAudioInputStream using a callback + var callback = new BinaryFileReaderCallback(filePath); + var pullStream = AudioInputStream.CreatePullStream(callback, audioFormat); + + return AudioConfig.FromStreamInput(pullStream); + } + + /// + /// Determines if an exception indicates that GStreamer is missing or not properly configured. + /// + /// The exception to check + /// True if the exception indicates GStreamer is missing, false otherwise + private static bool IsGStreamerMissingError(Exception ex) + { + // Check for common GStreamer-related error patterns + var message = ex.Message?.ToLowerInvariant() ?? ""; + var innerMessage = ex.InnerException?.Message?.ToLowerInvariant() ?? ""; + + // Common GStreamer error indicators + var gstreamerErrorPatterns = new[] + { + "gstreamer", + "0x27", // SPXERR_GSTREAMER_INTERNAL_ERROR + "spxerr_gstreamer", + "compressed audio", + "codec", + "audio format not supported", + "audio stream format", + "pipeline", + "element", + "decoder" + }; + + return gstreamerErrorPatterns.Any(pattern => + message.Contains(pattern) || innerMessage.Contains(pattern)); + } + + /// + /// Binary file reader callback for PullAudioInputStream. + /// Reads binary audio data from file for compressed audio processing. 
+ /// + private sealed class BinaryFileReaderCallback : PullAudioInputStreamCallback + { + private readonly FileStream _fileStream; + + public BinaryFileReaderCallback(string filePath) + { + _fileStream = File.OpenRead(filePath); + } + + public override int Read(byte[] dataBuffer, uint size) + { + try + { + var bytesToRead = Math.Min((int)size, dataBuffer.Length); + return _fileStream.Read(dataBuffer, 0, bytesToRead); + } + catch + { + return 0; // End of stream or error + } + } + + public override void Close() + { + _fileStream?.Dispose(); + } + } + + private static Models.SpeechRecognitionResult CreateNoMatchResult() + { + return new Models.SpeechRecognitionResult + { + Text = string.Empty, + Reason = ResultReason.NoMatch.ToString() + }; + } + + private static ProfanityOption GetProfanityOption(string profanity) => + profanity.ToLowerInvariant() switch + { + "masked" => ProfanityOption.Masked, + "removed" => ProfanityOption.Removed, + "raw" => ProfanityOption.Raw, + _ => ProfanityOption.Masked + }; + + private static Models.SpeechRecognitionResult ConvertToSpeechRecognitionResult(SdkSpeechRecognitionResult speechResult, string? format) + { + // detailed format + if (format?.ToLowerInvariant() == "detailed") + { + return new Models.DetailedSpeechRecognitionResult + { + Text = speechResult.Text, + Reason = speechResult.Reason.ToString(), + Offset = (ulong)speechResult.OffsetInTicks, + Duration = (ulong)speechResult.Duration.Ticks, + NBest = ExtractNBestResults(speechResult) + }; + } + // simple format + else + { + return new Models.SpeechRecognitionResult + { + Text = speechResult.Text, + Reason = speechResult.Reason.ToString(), + Offset = (ulong)speechResult.OffsetInTicks, + Duration = (ulong)speechResult.Duration.Ticks + }; + } + } + + /// + /// Extracts NBest results from speech recognition result properties. + /// Parses the detailed JSON response to get confidence scores and alternative text candidates. 
+ /// + /// The speech recognition result + /// List of NBest results with actual confidence values + private static List ExtractNBestResults(SdkSpeechRecognitionResult speechResult) + { + var nbestResults = new List(); + try + { + // Try to get the detailed JSON result from Properties + var jsonProperty = speechResult.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult); + + if (!string.IsNullOrEmpty(jsonProperty)) + { + using var jsonDoc = JsonDocument.Parse(jsonProperty); + + if (jsonDoc.RootElement.TryGetProperty("NBest", out var nbestArray)) + { + foreach (var item in nbestArray.EnumerateArray()) + { + var confidence = item.TryGetProperty("Confidence", out var confidenceProp) ? confidenceProp.GetDouble() : 0.0; + var lexical = item.TryGetProperty("Lexical", out var lexicalProp) ? lexicalProp.GetString() : ""; + var itn = item.TryGetProperty("ITN", out var itnProp) ? itnProp.GetString() : ""; + var maskedITN = item.TryGetProperty("MaskedITN", out var maskedITNProp) ? maskedITNProp.GetString() : ""; + var display = item.TryGetProperty("Display", out var displayProp) ? displayProp.GetString() : ""; + + // Extract words if available + List? words = null; + if (item.TryGetProperty("Words", out var wordsArray)) + { + words = new List(); + foreach (var wordItem in wordsArray.EnumerateArray()) + { + var word = new WordResult + { + Word = wordItem.TryGetProperty("Word", out var wordProp) ? wordProp.GetString() : "", + Offset = wordItem.TryGetProperty("Offset", out var offsetProp) ? (ulong)offsetProp.GetInt64() : null, + Duration = wordItem.TryGetProperty("Duration", out var durationProp) ? 
(ulong)durationProp.GetInt64() : null + }; + words.Add(word); + } + } + + nbestResults.Add(new NBestResult + { + Confidence = confidence, + Lexical = lexical, + ITN = itn, + MaskedITN = maskedITN, + Display = display, + Words = words + }); + } + } + } + } + catch (JsonException) + { + // If JSON parsing fails, fall back to simple result + } + + return nbestResults; + } + + /// + /// Synthesizes speech from text and saves it to an audio file using Azure AI Services Speech. + /// + /// Azure AI Services endpoint (e.g., https://your-service.cognitiveservices.azure.com/) + /// The text to convert to speech + /// Path where the audio file will be saved + /// Language for synthesis (default: en-US) + /// Voice name to use (e.g., en-US-JennyNeural). If not specified, default voice for language is used + /// Output audio format (default: Riff24Khz16BitMonoPcm) + /// Optional endpoint ID for custom voice model + /// Optional retry policy for resilience + /// Synthesis result with file information + public async Task SynthesizeSpeechToFile( + string endpoint, + string text, + string outputFilePath, + string? language = null, + string? voice = null, + string? format = null, + string? endpointId = null, + RetryPolicyOptions? 
retryPolicy = null) + { + ValidateRequiredParameters((nameof(endpoint), endpoint), (nameof(text), text), (nameof(outputFilePath), outputFilePath)); + + if (string.IsNullOrWhiteSpace(text)) + { + throw new ArgumentException("Text cannot be empty or whitespace.", nameof(text)); + } + + try + { + // Get Azure AD credential and token + var credential = await GetCredential(); + + // Get access token for Cognitive Services with proper scope + var tokenRequestContext = new TokenRequestContext(["https://cognitiveservices.azure.com/.default"]); + var accessToken = await credential.GetTokenAsync(tokenRequestContext, CancellationToken.None); + + // Configure Speech SDK with endpoint + var config = SpeechConfig.FromEndpoint(new Uri(endpoint)); + + // Set the authorization token + config.AuthorizationToken = accessToken.Token; + + // Set language (default to en-US) + var synthesisLanguage = language ?? "en-US"; + config.SpeechSynthesisLanguage = synthesisLanguage; + + // Set voice if provided + string? actualVoice = voice; + if (!string.IsNullOrEmpty(voice)) + { + config.SpeechSynthesisVoiceName = voice; + } + + // Set output format (default to Riff24Khz16BitMonoPcm) + var outputFormat = ParseOutputFormat(format); + config.SetSpeechSynthesisOutputFormat(outputFormat); + + // Set custom endpoint ID if provided + if (!string.IsNullOrEmpty(endpointId)) + { + config.EndpointId = endpointId; + } + + // Create audio configuration for file output + using var audioConfig = AudioConfig.FromWavFileOutput(outputFilePath); + using var synthesizer = new SpeechSynthesizer(config, audioConfig); + + // Perform synthesis + var startTime = DateTime.UtcNow; + var result = await synthesizer.SpeakTextAsync(text); + var duration = (DateTime.UtcNow - startTime).Ticks; + + // Check result + if (result.Reason == ResultReason.SynthesizingAudioCompleted) + { + _logger.LogInformation( + "Speech synthesized successfully. 
Output file: {OutputFile}, Audio length: {AudioLength} bytes", + outputFilePath, + result.AudioData.Length); + + // Get actual voice used (either specified or default) + if (string.IsNullOrEmpty(actualVoice)) + { + // The voice name might not be easily retrievable from result properties + // Set to a default or leave as is + actualVoice = voice ?? "default"; + } + + return new SynthesisResult + { + FilePath = outputFilePath, + Duration = duration, + AudioLength = result.AudioData.Length, + Format = format ?? "Riff24Khz16BitMonoPcm", + Voice = actualVoice, + Language = synthesisLanguage + }; + } + else if (result.Reason == ResultReason.Canceled) + { + var cancellation = SpeechSynthesisCancellationDetails.FromResult(result); + _logger.LogError( + "Speech synthesis canceled: Reason={Reason}, ErrorCode={ErrorCode}, ErrorDetails={ErrorDetails}", + cancellation.Reason, + cancellation.ErrorCode, + cancellation.ErrorDetails); + + if (IsSynthesisInvalidEndpointError(cancellation)) + { + throw new InvalidOperationException( + $"Invalid endpoint or connectivity issue. Reason: {cancellation.Reason}, ErrorCode: {cancellation.ErrorCode}, Details: {cancellation.ErrorDetails}"); + } + + throw new InvalidOperationException( + $"Speech synthesis failed: {cancellation.Reason} - {cancellation.ErrorDetails}"); + } + + throw new InvalidOperationException($"Speech synthesis failed with reason: {result.Reason}"); + } + catch (Exception ex) + { + _logger.LogError(ex, "Error during speech synthesis."); + throw; + } + } + + /// + /// Determines if the cancellation details indicate an invalid endpoint error for synthesis. 
+ /// + /// The cancellation details from speech synthesis + /// True if the error indicates an invalid endpoint, false otherwise + private static bool IsSynthesisInvalidEndpointError(SpeechSynthesisCancellationDetails cancellationDetails) + { + return cancellationDetails.Reason == CancellationReason.Error && + (cancellationDetails.ErrorCode == CancellationErrorCode.ConnectionFailure || + cancellationDetails.ErrorCode == CancellationErrorCode.AuthenticationFailure || + cancellationDetails.ErrorCode == CancellationErrorCode.Forbidden || + cancellationDetails.ErrorDetails?.Contains("endpoint", StringComparison.OrdinalIgnoreCase) == true || + cancellationDetails.ErrorDetails?.Contains("connection", StringComparison.OrdinalIgnoreCase) == true || + cancellationDetails.ErrorDetails?.Contains("network", StringComparison.OrdinalIgnoreCase) == true); + } + + /// + /// Parses the output format string to SpeechSynthesisOutputFormat enum. + /// + /// Format string (e.g., "Riff24Khz16BitMonoPcm", "Audio16Khz32KBitRateMonoMp3") + /// SpeechSynthesisOutputFormat enum value + private static SpeechSynthesisOutputFormat ParseOutputFormat(string? 
format) + { + if (string.IsNullOrEmpty(format)) + { + return SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm; + } + + // Try to parse the format string directly to enum + if (Enum.TryParse(format, true, out var parsedFormat)) + { + return parsedFormat; + } + + // If parsing fails, default to Riff24Khz16BitMonoPcm + return SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm; + } } diff --git a/tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs b/tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs index c1f3737a5..e9ca77088 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs @@ -4,6 +4,7 @@ using Azure.Mcp.Core.Areas; using Azure.Mcp.Core.Commands; using Azure.Mcp.Tools.Speech.Commands.Stt; +using Azure.Mcp.Tools.Speech.Commands.Tts; using Azure.Mcp.Tools.Speech.Services; using Azure.Mcp.Tools.Speech.Services.Recognizers; using Microsoft.Extensions.DependencyInjection; @@ -23,6 +24,7 @@ public void ConfigureServices(IServiceCollection services) services.AddSingleton(); services.AddSingleton(); services.AddSingleton(); + services.AddSingleton(); } public CommandGroup RegisterCommands(IServiceProvider serviceProvider) @@ -48,6 +50,16 @@ Services Speech endpoints and will only access speech resources accessible to th stt.AddCommand(sttRecognize.Name, sttRecognize); speech.AddSubGroup(stt); + + var tts = new CommandGroup( + name: "tts", + description: "Text-to-speech operations - Commands for converting text to spoken audio using Azure AI Services Speech synthesis."); + + var ttsSynthesize = serviceProvider.GetRequiredService(); + tts.AddCommand(ttsSynthesize.Name, ttsSynthesize); + + speech.AddSubGroup(tts); + return speech; } } diff --git a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs new file mode 100644 index 000000000..36b286c15 --- /dev/null +++ 
b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs @@ -0,0 +1,283 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.CommandLine; +using System.Net; +using System.Text.Json; +using Azure.Mcp.Core.Models.Command; +using Azure.Mcp.Core.Options; +using Azure.Mcp.Tools.Speech.Commands.Tts; +using Azure.Mcp.Tools.Speech.Models; +using Azure.Mcp.Tools.Speech.Services; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using NSubstitute; +using NSubstitute.ExceptionExtensions; +using Xunit; + +namespace Azure.Mcp.Tools.Speech.UnitTests.Tts; + +public class TtsSynthesizeCommandTests +{ + private readonly IServiceProvider _serviceProvider; + private readonly ISpeechService _speechService; + private readonly ILogger _logger; + private readonly TtsSynthesizeCommand _command; + private readonly CommandContext _context; + private readonly Command _commandDefinition; + private readonly string _knownEndpoint = "https://eastus.cognitiveservices.azure.com/"; + private readonly string _knownSubscription = "sub123"; + + public TtsSynthesizeCommandTests() + { + _speechService = Substitute.For(); + _logger = Substitute.For>(); + + var collection = new ServiceCollection().AddSingleton(_speechService); + + _serviceProvider = collection.BuildServiceProvider(); + _command = new(_logger); + _context = new(_serviceProvider); + _commandDefinition = _command.GetCommand(); + } + + [Fact] + public void Constructor_WithValidLogger_ShouldCreateInstance() + { + var command = new TtsSynthesizeCommand(_logger); + Assert.NotNull(command); + Assert.Equal("synthesize", command.Name); + } + + [Fact] + public void Properties_ShouldHaveExpectedValues() + { + Assert.Equal("synthesize", _command.Name); + Assert.Equal("Synthesize Speech from Text", _command.Title); + Assert.NotEmpty(_command.Description); + Assert.False(_command.Metadata.Destructive); + 
Assert.True(_command.Metadata.Idempotent); + Assert.False(_command.Metadata.OpenWorld); + Assert.False(_command.Metadata.ReadOnly); + Assert.True(_command.Metadata.LocalRequired); + Assert.False(_command.Metadata.Secret); + } + + [Theory] + [InlineData("", false, "Missing Required options: --endpoint, --text, --file")] + [InlineData("--subscription sub123", false, "Missing Required options: --endpoint, --text, --file")] + [InlineData("--subscription sub123 --endpoint https://test.cognitiveservices.azure.com/", false, "Missing Required options: --text, --file")] + [InlineData("--subscription sub123 --endpoint https://test.cognitiveservices.azure.com/ --text Hello", false, "Missing Required options: --file")] + [InlineData("--subscription sub123 --endpoint https://test.cognitiveservices.azure.com/ --text Hello --file output.txt", false, "Output file must have .wav extension")] + [InlineData("--subscription sub123 --endpoint https://test.cognitiveservices.azure.com/ --text Hello --file output.wav --language invalid", false, "Language must be in format 'xx-XX'")] + public async Task ExecuteAsync_ValidatesInput(string args, bool shouldSucceed, string expectedError) + { + var parseResult = _commandDefinition.Parse(args.Split(' ', StringSplitOptions.RemoveEmptyEntries)); + var response = await _command.ExecuteAsync(_context, parseResult); + + if (shouldSucceed) + { + Assert.Equal(HttpStatusCode.OK, response.Status); + } + else + { + Assert.NotEqual(HttpStatusCode.OK, response.Status); + Assert.Contains(expectedError, response.Message, StringComparison.OrdinalIgnoreCase); + } + } + + [Fact] + public async Task ExecuteAsync_WithValidParameters_ShouldSucceed() + { + // Arrange + var text = "HelloWorld"; + var outputFile = "test-output.wav"; + + var expectedResult = new SynthesisResult + { + FilePath = outputFile, + Duration = 1000000, + AudioLength = 48000, + Format = "Riff24Khz16BitMonoPcm", + Voice = "en-US-JennyNeural", + Language = "en-US" + }; + + 
_speechService.SynthesizeSpeechToFile( + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any()) + .Returns(expectedResult); + + try + { + // Act + var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text {text} --file {outputFile}"; + var parseResult = _commandDefinition.Parse(args.Split(' ', StringSplitOptions.RemoveEmptyEntries)); + var response = await _command.ExecuteAsync(_context, parseResult); + + // Assert + Assert.Equal(HttpStatusCode.OK, response.Status); + Assert.NotNull(response.Results); + + var result = JsonSerializer.Deserialize( + JsonSerializer.Serialize(response.Results), SpeechJsonContext.Default.TtsSynthesizeCommandResult); + Assert.NotNull(result); + Assert.Equal(outputFile, result.Result.FilePath); + Assert.Equal(48000, result.Result.AudioLength); + } + finally + { + // Clean up + if (File.Exists(outputFile)) + { + File.Delete(outputFile); + } + } + } + + [Fact] + public async Task ExecuteAsync_WithAllOptionalParameters_ShouldPassThemCorrectly() + { + // Arrange + var text = "HolaMundo"; + var outputFile = "test-output-spanish.wav"; + var language = "es-ES"; + var voice = "es-ES-ElviraNeural"; + var format = "Audio16Khz32KBitRateMonoMp3"; + var endpointId = "custom-endpoint-id"; + + var expectedResult = new SynthesisResult + { + FilePath = outputFile, + Duration = 1000000, + AudioLength = 32000, + Format = format, + Voice = voice, + Language = language + }; + + _speechService.SynthesizeSpeechToFile( + Arg.Is(_knownEndpoint), + Arg.Is(text), + Arg.Is(outputFile), + Arg.Is(language), + Arg.Is(voice), + Arg.Is(format), + Arg.Is(endpointId), + Arg.Any()) + .Returns(expectedResult); + + try + { + // Act + var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text {text} --file {outputFile} --language {language} --voice {voice} --format {format} --endpointId {endpointId}"; + var parseResult = _commandDefinition.Parse(args.Split(' ', 
StringSplitOptions.RemoveEmptyEntries)); + var response = await _command.ExecuteAsync(_context, parseResult); + + // Assert + Assert.Equal(HttpStatusCode.OK, response.Status); + + await _speechService.Received(1).SynthesizeSpeechToFile( + _knownEndpoint, + text, + outputFile, + language, + voice, + format, + endpointId, + Arg.Any()); + } + finally + { + // Clean up + if (File.Exists(outputFile)) + { + File.Delete(outputFile); + } + } + } + + [Fact] + public async Task ExecuteAsync_ServiceThrowsException_ShouldHandleGracefully() + { + // Arrange + var text = "HelloWorld"; + var outputFile = "test-output-error.wav"; + + _speechService.SynthesizeSpeechToFile( + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any()) + .ThrowsAsync(new InvalidOperationException("Synthesis failed")); + + try + { + // Act + var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text {text} --file {outputFile}"; + var parseResult = _commandDefinition.Parse(args.Split(' ', StringSplitOptions.RemoveEmptyEntries)); + var response = await _command.ExecuteAsync(_context, parseResult); + + // Assert + Assert.Equal(HttpStatusCode.InternalServerError, response.Status); + Assert.Contains("synthesis failed", response.Message.ToLower()); + } + finally + { + // Clean up + if (File.Exists(outputFile)) + { + File.Delete(outputFile); + } + } + } + + [Fact] + public async Task ExecuteAsync_UnauthorizedException_ShouldReturnUnauthorizedStatus() + { + // Arrange + var text = "HelloWorld"; + var outputFile = "test-output-unauth.wav"; + + _speechService.SynthesizeSpeechToFile( + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any()) + .ThrowsAsync(new UnauthorizedAccessException("Access denied")); + + try + { + // Act + var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text {text} --file {outputFile}"; + var parseResult = _commandDefinition.Parse(args.Split(' ', 
StringSplitOptions.RemoveEmptyEntries)); + var response = await _command.ExecuteAsync(_context, parseResult); + + // Assert + Assert.Equal(HttpStatusCode.Unauthorized, response.Status); + } + finally + { + // Clean up + if (File.Exists(outputFile)) + { + File.Delete(outputFile); + } + } + } +} + From 2b2f1838a2fd446e4e2dbfe6f86ffc1e806b867d Mon Sep 17 00:00:00 2001 From: "Fei Zhao (SPEECH)" Date: Mon, 20 Oct 2025 17:41:24 +0800 Subject: [PATCH 02/14] fix and add live tests --- .../src/Commands/Tts/TtsSynthesizeCommand.cs | 9 +- .../src/Services/SpeechService.cs | 274 ++++++++++++------ .../SpeechCommandTests.cs | 274 ++++++++++++++++++ 3 files changed, 472 insertions(+), 85 deletions(-) diff --git a/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs b/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs index dfe0d528c..3d0da1078 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs @@ -74,9 +74,14 @@ protected override void RegisterOptions(Command command) { // Validate file extension var extension = Path.GetExtension(fileValue).ToLowerInvariant(); - if (extension != ".wav") + var supportedExtensions = new HashSet { - commandResult.AddError($"Output file must have .wav extension. Got: {extension}"); + ".wav", ".mp3", ".ogg", ".raw" + }; + + if (!supportedExtensions.Contains(extension)) + { + commandResult.AddError($"Unsupported output file format: {extension}. 
Only {string.Join(", ", supportedExtensions)} are supported."); } } diff --git a/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs b/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs index a7ef89678..3abe424cb 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs @@ -332,8 +332,169 @@ private static List ExtractNBestResults(SdkSpeechRecognitionResult return nbestResults; } + /// + /// Synthesizes speech from text and returns the audio data as a byte array. + /// This method uses push stream to collect audio data during synthesis for efficient memory management. + /// + /// Azure AI Services endpoint + /// The text to convert to speech + /// Language for synthesis (default: en-US) + /// Voice name to use (e.g., en-US-JennyNeural) + /// Output audio format (default: Riff24Khz16BitMonoPcm) + /// Optional endpoint ID for custom voice model + /// Tuple containing audio data, actual voice used, and duration in ticks + private async Task<(byte[] AudioData, string Voice, long Duration)> SynthesizeSpeechToStream( + string endpoint, + string text, + string? language = null, + string? voice = null, + string? format = null, + string? endpointId = null) + { + // Get Azure AD credential and token + var credential = await GetCredential(); + + // Get access token for Cognitive Services with proper scope + var tokenRequestContext = new TokenRequestContext(["https://cognitiveservices.azure.com/.default"]); + var accessToken = await credential.GetTokenAsync(tokenRequestContext, CancellationToken.None); + + // Configure Speech SDK with endpoint + var config = SpeechConfig.FromEndpoint(new Uri(endpoint)); + + // Set the authorization token + config.AuthorizationToken = accessToken.Token; + + // Set language (default to en-US) + var synthesisLanguage = language ?? "en-US"; + config.SpeechSynthesisLanguage = synthesisLanguage; + + // Set voice if provided + string? 
actualVoice = voice; + if (!string.IsNullOrEmpty(voice)) + { + config.SpeechSynthesisVoiceName = voice; + } + + // Set output format (default to Riff24Khz16BitMonoPcm) + var outputFormat = ParseOutputFormat(format); + config.SetSpeechSynthesisOutputFormat(outputFormat); + + // Set custom endpoint ID if provided + if (!string.IsNullOrEmpty(endpointId)) + { + config.EndpointId = endpointId; + } + + // Create a memory stream to collect audio data via push stream + var audioStream = new MemoryStream(); + using var pushStream = AudioOutputStream.CreatePushStream(new PushAudioStreamCallback(audioStream, _logger)); + using var audioConfig = AudioConfig.FromStreamOutput(pushStream); + using var synthesizer = new SpeechSynthesizer(config, audioConfig); + + // Track synthesis progress + var taskCompletionSource = new TaskCompletionSource(); + SpeechSynthesisCancellationDetails? cancellationDetails = null; + + // Subscribe to synthesis events + synthesizer.SynthesisStarted += (s, e) => + { + _logger.LogInformation("Speech synthesis started for text length: {Length} characters", text.Length); + }; + + synthesizer.Synthesizing += (s, e) => + { + if (e.Result.AudioData.Length > 0) + { + _logger.LogDebug("Received audio chunk: {ChunkSize} bytes", e.Result.AudioData.Length); + } + }; + + synthesizer.SynthesisCompleted += (s, e) => + { + _logger.LogInformation("Speech synthesis completed"); + taskCompletionSource.TrySetResult(true); + }; + + synthesizer.SynthesisCanceled += (s, e) => + { + var details = SpeechSynthesisCancellationDetails.FromResult(e.Result); + _logger.LogError("Speech synthesis canceled: Reason={Reason}, ErrorCode={ErrorCode}, ErrorDetails={ErrorDetails}", + details.Reason, details.ErrorCode, details.ErrorDetails); + cancellationDetails = details; + taskCompletionSource.TrySetResult(false); + }; + + // Start synthesis + var startTime = DateTime.UtcNow; + await synthesizer.SpeakTextAsync(text); + + // Wait for synthesis to complete + var success = await 
taskCompletionSource.Task; + var duration = (DateTime.UtcNow - startTime).Ticks; + + // Check if synthesis was successful + if (!success && cancellationDetails != null) + { + if (IsSynthesisInvalidEndpointError(cancellationDetails)) + { + throw new InvalidOperationException( + $"Invalid endpoint or connectivity issue. Reason: {cancellationDetails.Reason}, ErrorCode: {cancellationDetails.ErrorCode}, Details: {cancellationDetails.ErrorDetails}"); + } + + throw new InvalidOperationException( + $"Speech synthesis failed: {cancellationDetails.Reason} - {cancellationDetails.ErrorDetails}"); + } + + if (!success) + { + throw new InvalidOperationException("Speech synthesis failed for unknown reason"); + } + + // Get the collected audio data from the stream + var audioData = audioStream.ToArray(); + + _logger.LogInformation( + "Speech synthesized successfully. Total audio length: {AudioLength} bytes", + audioData.Length); + + // Get actual voice used (either specified or default) + if (string.IsNullOrEmpty(actualVoice)) + { + actualVoice = voice ?? "default"; + } + + return (audioData, actualVoice, duration); + } + + /// + /// Push stream callback that writes audio data to a memory stream as it arrives. + /// This allows for efficient collection of audio data during synthesis without blocking. 
+ /// + private sealed class PushAudioStreamCallback(MemoryStream targetStream, ILogger logger) : PushAudioOutputStreamCallback + { + private readonly MemoryStream _targetStream = targetStream; + private readonly ILogger _logger = logger; + + public override uint Write(byte[] dataBuffer) + { + if (dataBuffer != null && dataBuffer.Length > 0) + { + _targetStream.Write(dataBuffer, 0, dataBuffer.Length); + _logger.LogDebug("Wrote {BytesWritten} bytes to audio stream", dataBuffer.Length); + return (uint)dataBuffer.Length; + } + return 0; + } + + public override void Close() + { + _logger.LogDebug("Push stream closed, total bytes collected: {TotalBytes}", _targetStream.Length); + } + } + /// /// Synthesizes speech from text and saves it to an audio file using Azure AI Services Speech. + /// The audio is collected in memory through a push stream and written to the file once synthesis completes. /// /// Azure AI Services endpoint (e.g., https://your-service.cognitiveservices.azure.com/) /// The text to convert to speech @@ -363,99 +524,46 @@ public async Task SynthesizeSpeechToFile( try { - // Get Azure AD credential and token - var credential = await GetCredential(); - - // Get access token for Cognitive Services with proper scope - var tokenRequestContext = new TokenRequestContext(["https://cognitiveservices.azure.com/.default"]); - var accessToken = await credential.GetTokenAsync(tokenRequestContext, CancellationToken.None); - - // Configure Speech SDK with endpoint - var config = SpeechConfig.FromEndpoint(new Uri(endpoint)); - - // Set the authorization token - config.AuthorizationToken = accessToken.Token; + // Use the reusable streaming synthesis method + var (audioData, actualVoice, duration) = await SynthesizeSpeechToStream( + endpoint, text, language, voice, format, endpointId); - // Set language (default to en-US) - var synthesisLanguage = language ?? 
"en-US"; - config.SpeechSynthesisLanguage = synthesisLanguage; + // Write the complete audio data to file + await File.WriteAllBytesAsync(outputFilePath, audioData); - // Set voice if provided - string? actualVoice = voice; - if (!string.IsNullOrEmpty(voice)) - { - config.SpeechSynthesisVoiceName = voice; - } + _logger.LogInformation( + "Speech synthesized and saved to file: {OutputFile}, Audio length: {AudioLength} bytes", + outputFilePath, + audioData.Length); - // Set output format (default to Riff24Khz16BitMonoPcm) - var outputFormat = ParseOutputFormat(format); - config.SetSpeechSynthesisOutputFormat(outputFormat); - - // Set custom endpoint ID if provided - if (!string.IsNullOrEmpty(endpointId)) + return new SynthesisResult { - config.EndpointId = endpointId; - } - - // Create audio configuration for file output - using var audioConfig = AudioConfig.FromWavFileOutput(outputFilePath); - using var synthesizer = new SpeechSynthesizer(config, audioConfig); - - // Perform synthesis - var startTime = DateTime.UtcNow; - var result = await synthesizer.SpeakTextAsync(text); - var duration = (DateTime.UtcNow - startTime).Ticks; - - // Check result - if (result.Reason == ResultReason.SynthesizingAudioCompleted) + FilePath = outputFilePath, + Duration = duration, + AudioLength = audioData.Length, + Format = format ?? "Riff24Khz16BitMonoPcm", + Voice = actualVoice, + Language = language ?? "en-US" + }; + } + catch (Exception ex) + { + _logger.LogError(ex, "Error during speech synthesis."); + + // Clean up partial file on error + if (File.Exists(outputFilePath)) { - _logger.LogInformation( - "Speech synthesized successfully. Output file: {OutputFile}, Audio length: {AudioLength} bytes", - outputFilePath, - result.AudioData.Length); - - // Get actual voice used (either specified or default) - if (string.IsNullOrEmpty(actualVoice)) + try { - // The voice name might not be easily retrievable from result properties - // Set to a default or leave as is - actualVoice = voice ?? 
"default"; + File.Delete(outputFilePath); + _logger.LogInformation("Cleaned up partial output file after error: {OutputFile}", outputFilePath); } - - return new SynthesisResult - { - FilePath = outputFilePath, - Duration = duration, - AudioLength = result.AudioData.Length, - Format = format ?? "Riff24Khz16BitMonoPcm", - Voice = actualVoice, - Language = synthesisLanguage - }; - } - else if (result.Reason == ResultReason.Canceled) - { - var cancellation = SpeechSynthesisCancellationDetails.FromResult(result); - _logger.LogError( - "Speech synthesis canceled: Reason={Reason}, ErrorCode={ErrorCode}, ErrorDetails={ErrorDetails}", - cancellation.Reason, - cancellation.ErrorCode, - cancellation.ErrorDetails); - - if (IsSynthesisInvalidEndpointError(cancellation)) + catch (Exception cleanupEx) { - throw new InvalidOperationException( - $"Invalid endpoint or connectivity issue. Reason: {cancellation.Reason}, ErrorCode: {cancellation.ErrorCode}, Details: {cancellation.ErrorDetails}"); + _logger.LogWarning(cleanupEx, "Failed to clean up partial output file: {OutputFile}", outputFilePath); } - - throw new InvalidOperationException( - $"Speech synthesis failed: {cancellation.Reason} - {cancellation.ErrorDetails}"); } - - throw new InvalidOperationException($"Speech synthesis failed with reason: {result.Reason}"); - } - catch (Exception ex) - { - _logger.LogError(ex, "Error during speech synthesis."); + throw; } } diff --git a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs index faf4984b8..2c87f6435 100644 --- a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs +++ b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs @@ -519,6 +519,280 @@ public async Task SpeechToText_RecognizeCompressedAudioWithRealtimeTranscription #endregion + #region TTS Synthesize Tests + + 
[Fact] + public async Task Should_synthesize_speech_to_file_with_text() + { + // Test basic TTS synthesis with text input + var aiServicesEndpoint = $"https://{Settings.ResourceBaseName}.cognitiveservices.azure.com/"; + var outputFile = Path.Combine(Path.GetTempPath(), $"tts-test-{Guid.NewGuid()}.wav"); + + try + { + var result = await CallToolAsync( + "speech_tts_synthesize", + new() + { + { "subscription", Settings.SubscriptionId }, + { "endpoint", aiServicesEndpoint }, + { "text", "Hello, this is a test of text to speech synthesis." }, + { "file", outputFile }, + { "language", "en-US" } + }); + + // Verify successful response + Assert.NotNull(result); + var resultText = result.ToString(); + Assert.NotNull(resultText); + + // Parse and validate the JSON result + var jsonResult = JsonDocument.Parse(resultText); + var resultObject = jsonResult.RootElement; + Assert.True(resultObject.TryGetProperty("result", out var resultProperty)); + + // Verify file path + Assert.True(resultProperty.TryGetProperty("filePath", out var filePathProperty)); + Assert.Equal(outputFile, filePathProperty.GetString()); + + // Verify duration and audio length are present + Assert.True(resultProperty.TryGetProperty("duration", out var durationProperty)); + Assert.True(durationProperty.GetInt64() > 0); + + Assert.True(resultProperty.TryGetProperty("audioLength", out var audioLengthProperty)); + Assert.True(audioLengthProperty.GetInt64() > 0); + + // Verify the output file was created and has content + Assert.True(File.Exists(outputFile), $"Output file not created at: {outputFile}"); + var fileInfo = new FileInfo(outputFile); + Assert.True(fileInfo.Length > 0, "Output file should not be empty"); + } + finally + { + // Clean up + if (File.Exists(outputFile)) + { + File.Delete(outputFile); + } + } + } + + [Theory] + [InlineData("en-US", "en-US-JennyNeural")] + [InlineData("zh-CN", "zh-CN-XiaoxiaoNeural")] + [InlineData("ja-JP", "ja-JP-NanamiNeural")] + public async Task 
Should_synthesize_speech_with_different_voices(string language, string voice) + { + // Test TTS synthesis with different language/voice combinations + var aiServicesEndpoint = $"https://{Settings.ResourceBaseName}.cognitiveservices.azure.com/"; + var outputFile = Path.Combine(Path.GetTempPath(), $"tts-test-{language}-{Guid.NewGuid()}.wav"); + + try + { + var result = await CallToolAsync( + "speech_tts_synthesize", + new() + { + { "subscription", Settings.SubscriptionId }, + { "endpoint", aiServicesEndpoint }, + { "text", "Hello world" }, + { "file", outputFile }, + { "language", language }, + { "voice", voice } + }); + + Assert.NotNull(result); + var resultText = result.ToString(); + Assert.NotNull(resultText); + + var jsonResult = JsonDocument.Parse(resultText); + var resultObject = jsonResult.RootElement; + Assert.True(resultObject.TryGetProperty("result", out var resultProperty)); + + // Verify voice was used + Assert.True(resultProperty.TryGetProperty("voice", out var voiceProperty)); + Assert.Equal(voice, voiceProperty.GetString()); + + // Verify language + Assert.True(resultProperty.TryGetProperty("language", out var languageProperty)); + Assert.Equal(language, languageProperty.GetString()); + + // Verify file exists + Assert.True(File.Exists(outputFile)); + } + finally + { + if (File.Exists(outputFile)) + { + File.Delete(outputFile); + } + } + } + + [Theory] + [InlineData("Riff8Khz16BitMonoPcm")] + [InlineData("Riff24Khz16BitMonoPcm")] + [InlineData("Audio16Khz32KBitRateMonoMp3")] + public async Task Should_synthesize_speech_with_different_formats(string format) + { + // Test TTS synthesis with different audio formats + var aiServicesEndpoint = $"https://{Settings.ResourceBaseName}.cognitiveservices.azure.com/"; + var extension = format.Contains("Mp3") ? 
".mp3" : ".wav"; + var outputFile = Path.Combine(Path.GetTempPath(), $"tts-test-{format}-{Guid.NewGuid()}{extension}"); + + try + { + var result = await CallToolAsync( + "speech_tts_synthesize", + new() + { + { "subscription", Settings.SubscriptionId }, + { "endpoint", aiServicesEndpoint }, + { "text", "Testing different audio formats" }, + { "file", outputFile }, + { "language", "en-US" }, + { "format", format } + }); + + Assert.NotNull(result); + var resultText = result.ToString(); + Assert.NotNull(resultText); + + var jsonResult = JsonDocument.Parse(resultText); + var resultObject = jsonResult.RootElement; + Assert.True(resultObject.TryGetProperty("result", out var resultProperty)); + + // Verify format + Assert.True(resultProperty.TryGetProperty("format", out var formatProperty)); + Assert.Equal(format, formatProperty.GetString()); + + // Verify file exists and has content + Assert.True(File.Exists(outputFile)); + var fileInfo = new FileInfo(outputFile); + Assert.True(fileInfo.Length > 0); + } + finally + { + if (File.Exists(outputFile)) + { + File.Delete(outputFile); + } + } + } + + [Fact] + public async Task Should_handle_invalid_text_input() + { + // Test error handling for empty text + var aiServicesEndpoint = $"https://{Settings.ResourceBaseName}.cognitiveservices.azure.com/"; + var outputFile = Path.Combine(Path.GetTempPath(), $"tts-test-invalid-{Guid.NewGuid()}.wav"); + + try + { + var result = await CallToolAsync( + "speech_tts_synthesize", + new() + { + { "subscription", Settings.SubscriptionId }, + { "endpoint", aiServicesEndpoint }, + { "text", "" }, // Empty text should fail validation + { "file", outputFile }, + { "language", "en-US" } + }); + + // Should return error response + Assert.Null(result); + } + finally + { + if (File.Exists(outputFile)) + { + File.Delete(outputFile); + } + } + } + + [Fact] + public async Task Should_handle_invalid_language_format() + { + // Test error handling for invalid language format + var aiServicesEndpoint = 
$"https://{Settings.ResourceBaseName}.cognitiveservices.azure.com/"; + var outputFile = Path.Combine(Path.GetTempPath(), $"tts-test-invalid-lang-{Guid.NewGuid()}.wav"); + + try + { + var result = await CallToolAsync( + "speech_tts_synthesize", + new() + { + { "subscription", Settings.SubscriptionId }, + { "endpoint", aiServicesEndpoint }, + { "text", "Hello world" }, + { "file", outputFile }, + { "language", "invalid-format" } // Invalid language format + }); + + // Should return error response + Assert.Null(result); + } + finally + { + if (File.Exists(outputFile)) + { + File.Delete(outputFile); + } + } + } + + [Fact] + public async Task Should_handle_large_text_input() + { + // Test TTS with larger text to verify streaming works correctly + var aiServicesEndpoint = $"https://{Settings.ResourceBaseName}.cognitiveservices.azure.com/"; + var outputFile = Path.Combine(Path.GetTempPath(), $"tts-test-large-{Guid.NewGuid()}.wav"); + + // Create a longer text (around 500 words) + var largeText = string.Join(" ", Enumerable.Repeat( + "This is a test of text to speech synthesis with a longer input to verify that streaming works correctly.", + 50)); + + try + { + var result = await CallToolAsync( + "speech_tts_synthesize", + new() + { + { "subscription", Settings.SubscriptionId }, + { "endpoint", aiServicesEndpoint }, + { "text", largeText }, + { "file", outputFile }, + { "language", "en-US" } + }); + + Assert.NotNull(result); + var resultText = result.ToString(); + Assert.NotNull(resultText); + + var jsonResult = JsonDocument.Parse(resultText); + var resultObject = jsonResult.RootElement; + Assert.True(resultObject.TryGetProperty("result", out var resultProperty)); + + // Verify file exists and is significantly larger than a short phrase + Assert.True(File.Exists(outputFile)); + var fileInfo = new FileInfo(outputFile); + Assert.True(fileInfo.Length > 50000, "Large text should produce a substantial audio file"); + } + finally + { + if (File.Exists(outputFile)) + { + 
File.Delete(outputFile); + } + } + } + + #endregion + /// /// Create a WAV file with given duration (seconds). /// If durationSeconds = 0, generates an empty WAV file with header only. From de93a6589785d5d2a7bab255eabf1b17a875dab0 Mon Sep 17 00:00:00 2001 From: "Fei Zhao (SPEECH)" Date: Tue, 21 Oct 2025 10:26:30 +0800 Subject: [PATCH 03/14] update parameter name --- .../src/Commands/Tts/TtsSynthesizeCommand.cs | 10 +++++----- .../src/Options/SpeechOptionDefinitions.cs | 8 ++++++++ .../src/Options/Tts/TtsSynthesizeOptions.cs | 4 ++-- .../SpeechCommandTests.cs | 12 ++++++------ .../Tts/TtsSynthesizeCommandTests.cs | 8 ++++---- 5 files changed, 25 insertions(+), 17 deletions(-) diff --git a/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs b/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs index 3d0da1078..9b8606433 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs @@ -46,7 +46,7 @@ protected override void RegisterOptions(Command command) base.RegisterOptions(command); command.Options.Add(SpeechOptionDefinitions.Text); - command.Options.Add(SpeechOptionDefinitions.File); + command.Options.Add(SpeechOptionDefinitions.OutputAudio); command.Options.Add(SpeechOptionDefinitions.Language); command.Options.Add(SpeechOptionDefinitions.Voice); command.Options.Add(SpeechOptionDefinitions.Format); @@ -63,7 +63,7 @@ protected override void RegisterOptions(Command command) commandResult.AddError("Text cannot be empty or whitespace."); } - var fileValue = commandResult.GetValueOrDefault(SpeechOptionDefinitions.File); + var fileValue = commandResult.GetValueOrDefault(SpeechOptionDefinitions.OutputAudio); // Validate output file path if (string.IsNullOrWhiteSpace(fileValue)) @@ -102,7 +102,7 @@ protected override TtsSynthesizeOptions BindOptions(ParseResult parseResult) { var options = base.BindOptions(parseResult); options.Text = 
parseResult.GetValueOrDefault(SpeechOptionDefinitions.Text.Name); - options.File = parseResult.GetValueOrDefault(SpeechOptionDefinitions.File.Name); + options.OutputAudio = parseResult.GetValueOrDefault(SpeechOptionDefinitions.OutputAudio.Name); options.Language = parseResult.GetValueOrDefault(SpeechOptionDefinitions.Language.Name); options.Voice = parseResult.GetValueOrDefault(SpeechOptionDefinitions.Voice.Name); options.Format = parseResult.GetValueOrDefault(SpeechOptionDefinitions.Format.Name); @@ -126,7 +126,7 @@ public override async Task ExecuteAsync(CommandContext context, var result = await speechService.SynthesizeSpeechToFile( options.Endpoint!, options.Text!, - options.File!, + options.OutputAudio!, options.Language, options.Voice, options.Format, @@ -147,7 +147,7 @@ public override async Task ExecuteAsync(CommandContext context, } catch (Exception ex) { - _logger.LogError(ex, "Error synthesizing speech to file: {File}", options.File); + _logger.LogError(ex, "Error synthesizing speech to file: {File}", options.OutputAudio); HandleException(context, ex); } diff --git a/tools/Azure.Mcp.Tools.Speech/src/Options/SpeechOptionDefinitions.cs b/tools/Azure.Mcp.Tools.Speech/src/Options/SpeechOptionDefinitions.cs index ce9288519..e8031b698 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Options/SpeechOptionDefinitions.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Options/SpeechOptionDefinitions.cs @@ -7,6 +7,7 @@ public static class SpeechOptionDefinitions { public const string EndpointName = "endpoint"; public const string FileName = "file"; + public const string OutputAudioName = "outputAudio"; public const string LanguageName = "language"; public const string PhrasesName = "phrases"; public const string FormatName = "format"; @@ -61,6 +62,13 @@ public static class SpeechOptionDefinitions Required = true }; + public static readonly Option OutputAudio = new( + $"--{OutputAudioName}") + { + Description = "Path where the synthesized audio file will be saved.", + Required 
= true + }; + public static readonly Option Voice = new( $"--{VoiceName}") { diff --git a/tools/Azure.Mcp.Tools.Speech/src/Options/Tts/TtsSynthesizeOptions.cs b/tools/Azure.Mcp.Tools.Speech/src/Options/Tts/TtsSynthesizeOptions.cs index 7892e8adb..f0d843bdf 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Options/Tts/TtsSynthesizeOptions.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Options/Tts/TtsSynthesizeOptions.cs @@ -10,8 +10,8 @@ public class TtsSynthesizeOptions : BaseSpeechOptions [JsonPropertyName(SpeechOptionDefinitions.TextName)] public string? Text { get; set; } - [JsonPropertyName(SpeechOptionDefinitions.FileName)] - public string? File { get; set; } + [JsonPropertyName(SpeechOptionDefinitions.OutputAudioName)] + public string? OutputAudio { get; set; } [JsonPropertyName(SpeechOptionDefinitions.LanguageName)] public string? Language { get; set; } diff --git a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs index 2c87f6435..b43c86744 100644 --- a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs +++ b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs @@ -537,7 +537,7 @@ public async Task Should_synthesize_speech_to_file_with_text() { "subscription", Settings.SubscriptionId }, { "endpoint", aiServicesEndpoint }, { "text", "Hello, this is a test of text to speech synthesis." 
}, - { "file", outputFile }, + { "outputAudio", outputFile }, { "language", "en-US" } }); @@ -596,7 +596,7 @@ public async Task Should_synthesize_speech_with_different_voices(string language { "subscription", Settings.SubscriptionId }, { "endpoint", aiServicesEndpoint }, { "text", "Hello world" }, - { "file", outputFile }, + { "outputAudio", outputFile }, { "language", language }, { "voice", voice } }); @@ -649,7 +649,7 @@ public async Task Should_synthesize_speech_with_different_formats(string format) { "subscription", Settings.SubscriptionId }, { "endpoint", aiServicesEndpoint }, { "text", "Testing different audio formats" }, - { "file", outputFile }, + { "outputAudio", outputFile }, { "language", "en-US" }, { "format", format } }); @@ -696,7 +696,7 @@ public async Task Should_handle_invalid_text_input() { "subscription", Settings.SubscriptionId }, { "endpoint", aiServicesEndpoint }, { "text", "" }, // Empty text should fail validation - { "file", outputFile }, + { "outputAudio", outputFile }, { "language", "en-US" } }); @@ -728,7 +728,7 @@ public async Task Should_handle_invalid_language_format() { "subscription", Settings.SubscriptionId }, { "endpoint", aiServicesEndpoint }, { "text", "Hello world" }, - { "file", outputFile }, + { "outputAudio", outputFile }, { "language", "invalid-format" } // Invalid language format }); @@ -765,7 +765,7 @@ public async Task Should_handle_large_text_input() { "subscription", Settings.SubscriptionId }, { "endpoint", aiServicesEndpoint }, { "text", largeText }, - { "file", outputFile }, + { "outputAudio", outputFile }, { "language", "en-US" } }); diff --git a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs index 36b286c15..15cb16efc 100644 --- a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs +++ 
b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs @@ -117,7 +117,7 @@ public async Task ExecuteAsync_WithValidParameters_ShouldSucceed() try { // Act - var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text {text} --file {outputFile}"; + var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text {text} --outputAudio {outputFile}"; var parseResult = _commandDefinition.Parse(args.Split(' ', StringSplitOptions.RemoveEmptyEntries)); var response = await _command.ExecuteAsync(_context, parseResult); @@ -176,7 +176,7 @@ public async Task ExecuteAsync_WithAllOptionalParameters_ShouldPassThemCorrectly try { // Act - var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text {text} --file {outputFile} --language {language} --voice {voice} --format {format} --endpointId {endpointId}"; + var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text {text} --outputAudio {outputFile} --language {language} --voice {voice} --format {format} --endpointId {endpointId}"; var parseResult = _commandDefinition.Parse(args.Split(' ', StringSplitOptions.RemoveEmptyEntries)); var response = await _command.ExecuteAsync(_context, parseResult); @@ -224,7 +224,7 @@ public async Task ExecuteAsync_ServiceThrowsException_ShouldHandleGracefully() try { // Act - var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text {text} --file {outputFile}"; + var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text {text} --outputAudio {outputFile}"; var parseResult = _commandDefinition.Parse(args.Split(' ', StringSplitOptions.RemoveEmptyEntries)); var response = await _command.ExecuteAsync(_context, parseResult); @@ -263,7 +263,7 @@ public async Task ExecuteAsync_UnauthorizedException_ShouldReturnUnauthorizedSta try { // Act - var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} 
--text {text} --file {outputFile}"; + var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text {text} --outputAudio {outputFile}"; var parseResult = _commandDefinition.Parse(args.Split(' ', StringSplitOptions.RemoveEmptyEntries)); var response = await _command.ExecuteAsync(_context, parseResult); From 141f6926f1ee07e982af654ca02aec5c05c68820 Mon Sep 17 00:00:00 2001 From: "Fei Zhao (SPEECH)" Date: Tue, 21 Oct 2025 14:53:55 +0800 Subject: [PATCH 04/14] update response --- .../src/Commands/Tts/TtsSynthesizeCommand.cs | 4 ++-- .../src/Models/SynthesisResult.cs | 9 +++------ .../src/Services/SpeechService.cs | 15 ++++++-------- .../SpeechCommandTests.cs | 6 +----- .../Tts/TtsSynthesizeCommandTests.cs | 20 +++++++++---------- 5 files changed, 21 insertions(+), 33 deletions(-) diff --git a/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs b/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs index 9b8606433..76dbb47f0 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs @@ -134,9 +134,9 @@ public override async Task ExecuteAsync(CommandContext context, options.RetryPolicy); _logger.LogInformation( - "Successfully synthesized speech to file: {File}. Audio length: {Length} bytes, Voice: {Voice}", + "Successfully synthesized speech to file: {File}. 
Audio size: {Size} bytes, Voice: {Voice}", result.FilePath, - result.AudioLength, + result.AudioSize, result.Voice); context.Response.Status = HttpStatusCode.OK; diff --git a/tools/Azure.Mcp.Tools.Speech/src/Models/SynthesisResult.cs b/tools/Azure.Mcp.Tools.Speech/src/Models/SynthesisResult.cs index b93f20ece..fa3e68717 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Models/SynthesisResult.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Models/SynthesisResult.cs @@ -5,16 +5,13 @@ namespace Azure.Mcp.Tools.Speech.Models; -public record SynthesisResult +public class SynthesisResult { [JsonPropertyName("filePath")] public string? FilePath { get; set; } - [JsonPropertyName("duration")] - public long Duration { get; set; } - - [JsonPropertyName("audioLength")] - public int AudioLength { get; set; } + [JsonPropertyName("audioSize")] + public long AudioSize { get; set; } [JsonPropertyName("format")] public string? Format { get; set; } diff --git a/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs b/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs index 3abe424cb..e2de3ec8f 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs @@ -342,8 +342,8 @@ private static List ExtractNBestResults(SdkSpeechRecognitionResult /// Voice name to use (e.g., en-US-JennyNeural) /// Output audio format (default: Riff24Khz16BitMonoPcm) /// Optional endpoint ID for custom voice model - /// Tuple containing audio data, actual voice used, and duration in ticks - private async Task<(byte[] AudioData, string Voice, long Duration)> SynthesizeSpeechToStream( + /// Tuple containing audio data, actual voice used, and duration in seconds + private async Task<(byte[] AudioData, string Voice)> SynthesizeSpeechToStream( string endpoint, string text, string? 
language = null, @@ -425,12 +425,10 @@ private static List ExtractNBestResults(SdkSpeechRecognitionResult }; // Start synthesis - var startTime = DateTime.UtcNow; await synthesizer.SpeakTextAsync(text); // Wait for synthesis to complete var success = await taskCompletionSource.Task; - var duration = (DateTime.UtcNow - startTime).Ticks; // Check if synthesis was successful if (!success && cancellationDetails != null) @@ -463,7 +461,7 @@ private static List ExtractNBestResults(SdkSpeechRecognitionResult actualVoice = voice ?? "default"; } - return (audioData, actualVoice, duration); + return (audioData, actualVoice); } /// @@ -525,22 +523,21 @@ public async Task SynthesizeSpeechToFile( try { // Use the reusable streaming synthesis method - var (audioData, actualVoice, duration) = await SynthesizeSpeechToStream( + var (audioData, actualVoice) = await SynthesizeSpeechToStream( endpoint, text, language, voice, format, endpointId); // Write the complete audio data to file await File.WriteAllBytesAsync(outputFilePath, audioData); _logger.LogInformation( - "Speech synthesized and saved to file: {OutputFile}, Audio length: {AudioLength} bytes", + "Speech synthesized and saved to file: {OutputFile}, Audio size: {AudioSize} bytes", outputFilePath, audioData.Length); return new SynthesisResult { FilePath = outputFilePath, - Duration = duration, - AudioLength = audioData.Length, + AudioSize = audioData.Length, Format = format ?? "Riff24Khz16BitMonoPcm", Voice = actualVoice, Language = language ?? 
"en-US" diff --git a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs index b43c86744..69f07bab2 100644 --- a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs +++ b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs @@ -555,11 +555,7 @@ public async Task Should_synthesize_speech_to_file_with_text() Assert.True(resultProperty.TryGetProperty("filePath", out var filePathProperty)); Assert.Equal(outputFile, filePathProperty.GetString()); - // Verify duration and audio length are present - Assert.True(resultProperty.TryGetProperty("duration", out var durationProperty)); - Assert.True(durationProperty.GetInt64() > 0); - - Assert.True(resultProperty.TryGetProperty("audioLength", out var audioLengthProperty)); + Assert.True(resultProperty.TryGetProperty("audioSize", out var audioLengthProperty)); Assert.True(audioLengthProperty.GetInt64() > 0); // Verify the output file was created and has content diff --git a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs index 15cb16efc..e659d7df7 100644 --- a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs +++ b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs @@ -64,12 +64,12 @@ public void Properties_ShouldHaveExpectedValues() } [Theory] - [InlineData("", false, "Missing Required options: --endpoint, --text, --file")] - [InlineData("--subscription sub123", false, "Missing Required options: --endpoint, --text, --file")] - [InlineData("--subscription sub123 --endpoint https://test.cognitiveservices.azure.com/", false, "Missing Required options: --text, --file")] - 
[InlineData("--subscription sub123 --endpoint https://test.cognitiveservices.azure.com/ --text Hello", false, "Missing Required options: --file")] - [InlineData("--subscription sub123 --endpoint https://test.cognitiveservices.azure.com/ --text Hello --file output.txt", false, "Output file must have .wav extension")] - [InlineData("--subscription sub123 --endpoint https://test.cognitiveservices.azure.com/ --text Hello --file output.wav --language invalid", false, "Language must be in format 'xx-XX'")] + [InlineData("", false, "Missing Required options: --endpoint, --text, --outputAudio")] + [InlineData("--subscription sub123", false, "Missing Required options: --endpoint, --text, --outputAudio")] + [InlineData("--subscription sub123 --endpoint https://test.cognitiveservices.azure.com/", false, "Missing Required options: --text, --outputAudio")] + [InlineData("--subscription sub123 --endpoint https://test.cognitiveservices.azure.com/ --text Hello", false, "Missing Required options: --outputAudio")] + [InlineData("--subscription sub123 --endpoint https://test.cognitiveservices.azure.com/ --text Hello --outputAudio output.txt", false, "Unsupported output file format")] + [InlineData("--subscription sub123 --endpoint https://test.cognitiveservices.azure.com/ --text Hello --outputAudio output.wav --language invalid", false, "Language must be in format 'xx-XX'")] public async Task ExecuteAsync_ValidatesInput(string args, bool shouldSucceed, string expectedError) { var parseResult = _commandDefinition.Parse(args.Split(' ', StringSplitOptions.RemoveEmptyEntries)); @@ -96,8 +96,7 @@ public async Task ExecuteAsync_WithValidParameters_ShouldSucceed() var expectedResult = new SynthesisResult { FilePath = outputFile, - Duration = 1000000, - AudioLength = 48000, + AudioSize = 48000, Format = "Riff24Khz16BitMonoPcm", Voice = "en-US-JennyNeural", Language = "en-US" @@ -129,7 +128,7 @@ public async Task ExecuteAsync_WithValidParameters_ShouldSucceed() 
JsonSerializer.Serialize(response.Results), SpeechJsonContext.Default.TtsSynthesizeCommandResult); Assert.NotNull(result); Assert.Equal(outputFile, result.Result.FilePath); - Assert.Equal(48000, result.Result.AudioLength); + Assert.Equal(48000, result.Result.AudioSize); } finally { @@ -155,8 +154,7 @@ public async Task ExecuteAsync_WithAllOptionalParameters_ShouldPassThemCorrectly var expectedResult = new SynthesisResult { FilePath = outputFile, - Duration = 1000000, - AudioLength = 32000, + AudioSize = 32000, Format = format, Voice = voice, Language = language From 4baed235873d938ad4e3d364ee6de5cff933efa0 Mon Sep 17 00:00:00 2001 From: "Fei Zhao (SPEECH)" Date: Tue, 21 Oct 2025 17:03:00 +0800 Subject: [PATCH 05/14] update prompts and tool description evaluator --- .../ToolDescriptionEvaluator/prompts.json | 12 + eng/tools/ToolDescriptionEvaluator/results.md | 8155 ++++++++++++++++- eng/tools/ToolDescriptionEvaluator/tools.json | 105 + servers/Azure.Mcp.Server/README.md | 5 +- .../Azure.Mcp.Server/docs/azmcp-commands.md | 61 + .../Azure.Mcp.Server/docs/e2eTestPrompts.md | 10 + .../src/Commands/SpeechJsonContext.cs | 1 - 7 files changed, 8321 insertions(+), 28 deletions(-) diff --git a/eng/tools/ToolDescriptionEvaluator/prompts.json b/eng/tools/ToolDescriptionEvaluator/prompts.json index 638bac49c..938dae450 100644 --- a/eng/tools/ToolDescriptionEvaluator/prompts.json +++ b/eng/tools/ToolDescriptionEvaluator/prompts.json @@ -116,6 +116,18 @@ "Convert speech to text with comma-separated phrase hints: \"Azure, cognitive services, API\"", "Transcribe audio with raw profanity output from file " ], + "speech_tts_synthesize": [ + "Convert text to speech and save to output.wav", + "Synthesize speech from \"Hello, welcome to Azure\" and save to welcome.wav", + "Generate speech audio from text \"Hello world\" using Azure Speech Services", + "Convert text to speech with Spanish language and save to spanish-audio.wav", + "Synthesize speech with voice en-US-JennyNeural from 
text \"Azure AI Services\"", + "Create MP3 audio file from text \"Welcome to Azure\" with high quality format", + "Generate speech with custom voice model using endpoint ID ", + "Convert text to OGG/Opus format audio file", + "Synthesize long text content to audio file with streaming", + "Create audio file from text in French language with appropriate voice" + ], "appconfig_account_list": [ "List all App Configuration stores in my subscription", "Show me the App Configuration stores in my subscription", diff --git a/eng/tools/ToolDescriptionEvaluator/results.md b/eng/tools/ToolDescriptionEvaluator/results.md index d4c06c36d..ddccc083e 100644 --- a/eng/tools/ToolDescriptionEvaluator/results.md +++ b/eng/tools/ToolDescriptionEvaluator/results.md @@ -1,15 +1,37 @@ # Tool Selection Analysis Setup +<<<<<<< HEAD **Setup completed:** 2025-11-06 17:16:26 **Tool count:** 179 **Database setup time:** 32.4934401s +======= +<<<<<<< HEAD +**Setup completed:** 2025-11-03 14:57:47 +**Tool count:** 173 +**Database setup time:** 1.2016078s +======= +**Setup completed:** 2025-11-04 15:41:36 +**Tool count:** 174 +**Database setup time:** 1.4888934s +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- # Tool Selection Analysis Results +<<<<<<< HEAD **Analysis Date:** 2025-11-06 17:16:26 **Tool count:** 179 +======= +<<<<<<< HEAD +**Analysis Date:** 2025-11-03 14:57:47 +**Tool count:** 173 +======= +**Analysis Date:** 2025-11-04 15:41:36 +**Tool count:** 174 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) ## Table of Contents @@ -73,6 +95,7 @@ - [Test 58: speech_stt_recognize](#test-58) - [Test 59: speech_stt_recognize](#test-59) - [Test 60: speech_stt_recognize](#test-60) +<<<<<<< HEAD - [Test 61: speech_stt_recognize](#test-61) - [Test 62: speech_stt_recognize](#test-62) - [Test 63: speech_stt_recognize](#test-63) @@ 
-92,6 +115,28 @@ - [Test 77: applens_resource_diagnose](#test-77) - [Test 78: applens_resource_diagnose](#test-78) - [Test 79: applens_resource_diagnose](#test-79) +======= +<<<<<<< HEAD +- [Test 61: appconfig_account_list](#test-61) +- [Test 62: appconfig_account_list](#test-62) +- [Test 63: appconfig_account_list](#test-63) +- [Test 64: appconfig_kv_delete](#test-64) +- [Test 65: appconfig_kv_get](#test-65) +- [Test 66: appconfig_kv_get](#test-66) +- [Test 67: appconfig_kv_get](#test-67) +- [Test 68: appconfig_kv_get](#test-68) +- [Test 69: appconfig_kv_lock_set](#test-69) +- [Test 70: appconfig_kv_lock_set](#test-70) +- [Test 71: appconfig_kv_set](#test-71) +- [Test 72: applens_resource_diagnose](#test-72) +- [Test 73: applens_resource_diagnose](#test-73) +- [Test 74: applens_resource_diagnose](#test-74) +- [Test 75: appservice_database_add](#test-75) +- [Test 76: appservice_database_add](#test-76) +- [Test 77: appservice_database_add](#test-77) +- [Test 78: appservice_database_add](#test-78) +- [Test 79: appservice_database_add](#test-79) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) - [Test 80: appservice_database_add](#test-80) - [Test 81: appservice_database_add](#test-81) - [Test 82: appservice_database_add](#test-82) @@ -297,6 +342,7 @@ - [Test 282: aks_cluster_get](#test-282) - [Test 283: aks_cluster_get](#test-283) - [Test 284: aks_nodepool_get](#test-284) +<<<<<<< HEAD - [Test 285: aks_nodepool_get](#test-285) - [Test 286: aks_nodepool_get](#test-286) - [Test 287: aks_nodepool_get](#test-287) @@ -326,6 +372,432 @@ - [Test 311: azureaibestpractices_get](#test-311) - [Test 312: azureaibestpractices_get](#test-312) - [Test 313: azureaibestpractices_get](#test-313) +======= +- [Test 285: loadtesting_test_create](#test-285) +- [Test 286: loadtesting_test_get](#test-286) +- [Test 287: loadtesting_testresource_create](#test-287) +- [Test 288: loadtesting_testresource_list](#test-288) +- [Test 289: loadtesting_testrun_create](#test-289) +- 
[Test 290: loadtesting_testrun_get](#test-290) +- [Test 291: loadtesting_testrun_list](#test-291) +- [Test 292: loadtesting_testrun_update](#test-292) +- [Test 293: grafana_list](#test-293) +- [Test 294: managedlustre_fs_create](#test-294) +- [Test 295: managedlustre_fs_list](#test-295) +- [Test 296: managedlustre_fs_list](#test-296) +- [Test 297: managedlustre_fs_sku_get](#test-297) +- [Test 298: managedlustre_fs_subnetsize_ask](#test-298) +- [Test 299: managedlustre_fs_subnetsize_validate](#test-299) +- [Test 300: managedlustre_fs_update](#test-300) +- [Test 301: marketplace_product_get](#test-301) +- [Test 302: marketplace_product_list](#test-302) +- [Test 303: marketplace_product_list](#test-303) +- [Test 304: get_bestpractices_get](#test-304) +- [Test 305: get_bestpractices_get](#test-305) +- [Test 306: get_bestpractices_get](#test-306) +- [Test 307: get_bestpractices_get](#test-307) +- [Test 308: get_bestpractices_get](#test-308) +- [Test 309: get_bestpractices_get](#test-309) +- [Test 310: get_bestpractices_get](#test-310) +- [Test 311: get_bestpractices_get](#test-311) +- [Test 312: get_bestpractices_get](#test-312) +- [Test 313: monitor_activitylog_list](#test-313) +- [Test 314: monitor_healthmodels_entity_get](#test-314) +- [Test 315: monitor_metrics_definitions](#test-315) +- [Test 316: monitor_metrics_definitions](#test-316) +- [Test 317: monitor_metrics_definitions](#test-317) +- [Test 318: monitor_metrics_query](#test-318) +- [Test 319: monitor_metrics_query](#test-319) +- [Test 320: monitor_metrics_query](#test-320) +- [Test 321: monitor_metrics_query](#test-321) +- [Test 322: monitor_metrics_query](#test-322) +- [Test 323: monitor_metrics_query](#test-323) +- [Test 324: monitor_resource_log_query](#test-324) +- [Test 325: monitor_table_list](#test-325) +- [Test 326: monitor_table_list](#test-326) +- [Test 327: monitor_table_type_list](#test-327) +- [Test 328: monitor_table_type_list](#test-328) +- [Test 329: monitor_webtests_create](#test-329) +- 
[Test 330: monitor_webtests_get](#test-330) +- [Test 331: monitor_webtests_list](#test-331) +- [Test 332: monitor_webtests_list](#test-332) +- [Test 333: monitor_webtests_update](#test-333) +- [Test 334: monitor_workspace_list](#test-334) +- [Test 335: monitor_workspace_list](#test-335) +- [Test 336: monitor_workspace_list](#test-336) +- [Test 337: monitor_workspace_log_query](#test-337) +- [Test 338: datadog_monitoredresources_list](#test-338) +- [Test 339: datadog_monitoredresources_list](#test-339) +- [Test 340: extension_azqr](#test-340) +- [Test 341: extension_azqr](#test-341) +- [Test 342: extension_azqr](#test-342) +- [Test 343: quota_region_availability_list](#test-343) +- [Test 344: quota_usage_check](#test-344) +- [Test 345: role_assignment_list](#test-345) +- [Test 346: role_assignment_list](#test-346) +- [Test 347: redis_list](#test-347) +- [Test 348: redis_list](#test-348) +- [Test 349: redis_list](#test-349) +- [Test 350: redis_list](#test-350) +- [Test 351: redis_list](#test-351) +- [Test 352: group_list](#test-352) +- [Test 353: group_list](#test-353) +- [Test 354: group_list](#test-354) +- [Test 355: resourcehealth_availability-status_get](#test-355) +- [Test 356: resourcehealth_availability-status_get](#test-356) +- [Test 357: resourcehealth_availability-status_get](#test-357) +- [Test 358: resourcehealth_availability-status_list](#test-358) +- [Test 359: resourcehealth_availability-status_list](#test-359) +- [Test 360: resourcehealth_availability-status_list](#test-360) +- [Test 361: resourcehealth_health-events_list](#test-361) +- [Test 362: resourcehealth_health-events_list](#test-362) +- [Test 363: resourcehealth_health-events_list](#test-363) +- [Test 364: resourcehealth_health-events_list](#test-364) +- [Test 365: resourcehealth_health-events_list](#test-365) +- [Test 366: servicebus_queue_details](#test-366) +- [Test 367: servicebus_topic_details](#test-367) +- [Test 368: servicebus_topic_subscription_details](#test-368) +- [Test 369: 
signalr_runtime_get](#test-369) +- [Test 370: signalr_runtime_get](#test-370) +- [Test 371: signalr_runtime_get](#test-371) +- [Test 372: signalr_runtime_get](#test-372) +- [Test 373: signalr_runtime_get](#test-373) +- [Test 374: signalr_runtime_get](#test-374) +- [Test 375: sql_db_create](#test-375) +- [Test 376: sql_db_create](#test-376) +- [Test 377: sql_db_create](#test-377) +- [Test 378: sql_db_delete](#test-378) +- [Test 379: sql_db_delete](#test-379) +- [Test 380: sql_db_delete](#test-380) +- [Test 381: sql_db_list](#test-381) +- [Test 382: sql_db_list](#test-382) +- [Test 383: sql_db_rename](#test-383) +- [Test 384: sql_db_rename](#test-384) +- [Test 385: sql_db_show](#test-385) +- [Test 386: sql_db_show](#test-386) +- [Test 387: sql_db_update](#test-387) +- [Test 388: sql_db_update](#test-388) +- [Test 389: sql_elastic-pool_list](#test-389) +- [Test 390: sql_elastic-pool_list](#test-390) +- [Test 391: sql_elastic-pool_list](#test-391) +- [Test 392: sql_server_create](#test-392) +- [Test 393: sql_server_create](#test-393) +- [Test 394: sql_server_create](#test-394) +- [Test 395: sql_server_delete](#test-395) +- [Test 396: sql_server_delete](#test-396) +- [Test 397: sql_server_delete](#test-397) +- [Test 398: sql_server_entra-admin_list](#test-398) +- [Test 399: sql_server_entra-admin_list](#test-399) +- [Test 400: sql_server_entra-admin_list](#test-400) +- [Test 401: sql_server_firewall-rule_create](#test-401) +- [Test 402: sql_server_firewall-rule_create](#test-402) +- [Test 403: sql_server_firewall-rule_create](#test-403) +- [Test 404: sql_server_firewall-rule_delete](#test-404) +- [Test 405: sql_server_firewall-rule_delete](#test-405) +- [Test 406: sql_server_firewall-rule_delete](#test-406) +- [Test 407: sql_server_firewall-rule_list](#test-407) +- [Test 408: sql_server_firewall-rule_list](#test-408) +- [Test 409: sql_server_firewall-rule_list](#test-409) +- [Test 410: sql_server_list](#test-410) +- [Test 411: sql_server_list](#test-411) +- [Test 412: 
sql_server_show](#test-412) +- [Test 413: sql_server_show](#test-413) +- [Test 414: sql_server_show](#test-414) +- [Test 415: storage_account_create](#test-415) +- [Test 416: storage_account_create](#test-416) +- [Test 417: storage_account_create](#test-417) +- [Test 418: storage_account_get](#test-418) +- [Test 419: storage_account_get](#test-419) +- [Test 420: storage_account_get](#test-420) +- [Test 421: storage_account_get](#test-421) +- [Test 422: storage_account_get](#test-422) +- [Test 423: storage_blob_container_create](#test-423) +- [Test 424: storage_blob_container_create](#test-424) +- [Test 425: storage_blob_container_create](#test-425) +- [Test 426: storage_blob_container_get](#test-426) +- [Test 427: storage_blob_container_get](#test-427) +- [Test 428: storage_blob_container_get](#test-428) +- [Test 429: storage_blob_get](#test-429) +- [Test 430: storage_blob_get](#test-430) +- [Test 431: storage_blob_get](#test-431) +- [Test 432: storage_blob_get](#test-432) +- [Test 433: storage_blob_upload](#test-433) +- [Test 434: subscription_list](#test-434) +- [Test 435: subscription_list](#test-435) +- [Test 436: subscription_list](#test-436) +- [Test 437: subscription_list](#test-437) +- [Test 438: azureterraformbestpractices_get](#test-438) +- [Test 439: azureterraformbestpractices_get](#test-439) +- [Test 440: virtualdesktop_hostpool_list](#test-440) +- [Test 441: virtualdesktop_hostpool_host_list](#test-441) +- [Test 442: virtualdesktop_hostpool_host_user-list](#test-442) +- [Test 443: workbooks_create](#test-443) +- [Test 444: workbooks_delete](#test-444) +- [Test 445: workbooks_list](#test-445) +- [Test 446: workbooks_list](#test-446) +- [Test 447: workbooks_show](#test-447) +- [Test 448: workbooks_show](#test-448) +- [Test 449: workbooks_update](#test-449) +- [Test 450: bicepschema_get](#test-450) +- [Test 451: cloudarchitect_design](#test-451) +- [Test 452: cloudarchitect_design](#test-452) +- [Test 453: cloudarchitect_design](#test-453) +- [Test 454: 
cloudarchitect_design](#test-454) +======= +- [Test 61: speech_tts_synthesize](#test-61) +- [Test 62: speech_tts_synthesize](#test-62) +- [Test 63: speech_tts_synthesize](#test-63) +- [Test 64: speech_tts_synthesize](#test-64) +- [Test 65: speech_tts_synthesize](#test-65) +- [Test 66: speech_tts_synthesize](#test-66) +- [Test 67: speech_tts_synthesize](#test-67) +- [Test 68: speech_tts_synthesize](#test-68) +- [Test 69: speech_tts_synthesize](#test-69) +- [Test 70: speech_tts_synthesize](#test-70) +- [Test 71: appconfig_account_list](#test-71) +- [Test 72: appconfig_account_list](#test-72) +- [Test 73: appconfig_account_list](#test-73) +- [Test 74: appconfig_kv_delete](#test-74) +- [Test 75: appconfig_kv_get](#test-75) +- [Test 76: appconfig_kv_get](#test-76) +- [Test 77: appconfig_kv_get](#test-77) +- [Test 78: appconfig_kv_get](#test-78) +- [Test 79: appconfig_kv_lock_set](#test-79) +- [Test 80: appconfig_kv_lock_set](#test-80) +- [Test 81: appconfig_kv_set](#test-81) +- [Test 82: applens_resource_diagnose](#test-82) +- [Test 83: applens_resource_diagnose](#test-83) +- [Test 84: applens_resource_diagnose](#test-84) +- [Test 85: appservice_database_add](#test-85) +- [Test 86: appservice_database_add](#test-86) +- [Test 87: appservice_database_add](#test-87) +- [Test 88: appservice_database_add](#test-88) +- [Test 89: appservice_database_add](#test-89) +- [Test 90: appservice_database_add](#test-90) +- [Test 91: appservice_database_add](#test-91) +- [Test 92: appservice_database_add](#test-92) +- [Test 93: appservice_database_add](#test-93) +- [Test 94: appservice_database_add](#test-94) +- [Test 95: applicationinsights_recommendation_list](#test-95) +- [Test 96: applicationinsights_recommendation_list](#test-96) +- [Test 97: applicationinsights_recommendation_list](#test-97) +- [Test 98: applicationinsights_recommendation_list](#test-98) +- [Test 99: extension_cli_generate](#test-99) +- [Test 100: extension_cli_generate](#test-100) +- [Test 101: 
extension_cli_generate](#test-101) +- [Test 102: extension_cli_install](#test-102) +- [Test 103: extension_cli_install](#test-103) +- [Test 104: extension_cli_install](#test-104) +- [Test 105: acr_registry_list](#test-105) +- [Test 106: acr_registry_list](#test-106) +- [Test 107: acr_registry_list](#test-107) +- [Test 108: acr_registry_list](#test-108) +- [Test 109: acr_registry_list](#test-109) +- [Test 110: acr_registry_repository_list](#test-110) +- [Test 111: acr_registry_repository_list](#test-111) +- [Test 112: acr_registry_repository_list](#test-112) +- [Test 113: acr_registry_repository_list](#test-113) +- [Test 114: communication_email_send](#test-114) +- [Test 115: communication_email_send](#test-115) +- [Test 116: communication_email_send](#test-116) +- [Test 117: communication_email_send](#test-117) +- [Test 118: communication_email_send](#test-118) +- [Test 119: communication_email_send](#test-119) +- [Test 120: communication_email_send](#test-120) +- [Test 121: communication_email_send](#test-121) +- [Test 122: communication_sms_send](#test-122) +- [Test 123: communication_sms_send](#test-123) +- [Test 124: communication_sms_send](#test-124) +- [Test 125: communication_sms_send](#test-125) +- [Test 126: communication_sms_send](#test-126) +- [Test 127: communication_sms_send](#test-127) +- [Test 128: communication_sms_send](#test-128) +- [Test 129: communication_sms_send](#test-129) +- [Test 130: confidentialledger_entries_append](#test-130) +- [Test 131: confidentialledger_entries_append](#test-131) +- [Test 132: confidentialledger_entries_append](#test-132) +- [Test 133: confidentialledger_entries_append](#test-133) +- [Test 134: confidentialledger_entries_append](#test-134) +- [Test 135: confidentialledger_entries_get](#test-135) +- [Test 136: confidentialledger_entries_get](#test-136) +- [Test 137: cosmos_account_list](#test-137) +- [Test 138: cosmos_account_list](#test-138) +- [Test 139: cosmos_account_list](#test-139) +- [Test 140: 
cosmos_database_container_item_query](#test-140) +- [Test 141: cosmos_database_container_list](#test-141) +- [Test 142: cosmos_database_container_list](#test-142) +- [Test 143: cosmos_database_list](#test-143) +- [Test 144: cosmos_database_list](#test-144) +- [Test 145: kusto_cluster_get](#test-145) +- [Test 146: kusto_cluster_list](#test-146) +- [Test 147: kusto_cluster_list](#test-147) +- [Test 148: kusto_cluster_list](#test-148) +- [Test 149: kusto_database_list](#test-149) +- [Test 150: kusto_database_list](#test-150) +- [Test 151: kusto_query](#test-151) +- [Test 152: kusto_sample](#test-152) +- [Test 153: kusto_table_list](#test-153) +- [Test 154: kusto_table_list](#test-154) +- [Test 155: kusto_table_schema](#test-155) +- [Test 156: mysql_database_list](#test-156) +- [Test 157: mysql_database_list](#test-157) +- [Test 158: mysql_database_query](#test-158) +- [Test 159: mysql_server_config_get](#test-159) +- [Test 160: mysql_server_list](#test-160) +- [Test 161: mysql_server_list](#test-161) +- [Test 162: mysql_server_list](#test-162) +- [Test 163: mysql_server_param_get](#test-163) +- [Test 164: mysql_server_param_set](#test-164) +- [Test 165: mysql_table_list](#test-165) +- [Test 166: mysql_table_list](#test-166) +- [Test 167: mysql_table_schema_get](#test-167) +- [Test 168: postgres_database_list](#test-168) +- [Test 169: postgres_database_list](#test-169) +- [Test 170: postgres_database_query](#test-170) +- [Test 171: postgres_server_config_get](#test-171) +- [Test 172: postgres_server_list](#test-172) +- [Test 173: postgres_server_list](#test-173) +- [Test 174: postgres_server_list](#test-174) +- [Test 175: postgres_server_param_get](#test-175) +- [Test 176: postgres_server_param_set](#test-176) +- [Test 177: postgres_table_list](#test-177) +- [Test 178: postgres_table_list](#test-178) +- [Test 179: postgres_table_schema_get](#test-179) +- [Test 180: deploy_app_logs_get](#test-180) +- [Test 181: deploy_architecture_diagram_generate](#test-181) +- [Test 
182: deploy_iac_rules_get](#test-182) +- [Test 183: deploy_pipeline_guidance_get](#test-183) +- [Test 184: deploy_plan_get](#test-184) +- [Test 185: eventgrid_events_publish](#test-185) +- [Test 186: eventgrid_events_publish](#test-186) +- [Test 187: eventgrid_events_publish](#test-187) +- [Test 188: eventgrid_topic_list](#test-188) +- [Test 189: eventgrid_topic_list](#test-189) +- [Test 190: eventgrid_topic_list](#test-190) +- [Test 191: eventgrid_topic_list](#test-191) +- [Test 192: eventgrid_subscription_list](#test-192) +- [Test 193: eventgrid_subscription_list](#test-193) +- [Test 194: eventgrid_subscription_list](#test-194) +- [Test 195: eventgrid_subscription_list](#test-195) +- [Test 196: eventgrid_subscription_list](#test-196) +- [Test 197: eventgrid_subscription_list](#test-197) +- [Test 198: eventgrid_subscription_list](#test-198) +- [Test 199: eventhubs_eventhub_consumergroup_delete](#test-199) +- [Test 200: eventhubs_eventhub_consumergroup_get](#test-200) +- [Test 201: eventhubs_eventhub_consumergroup_get](#test-201) +- [Test 202: eventhubs_eventhub_consumergroup_update](#test-202) +- [Test 203: eventhubs_eventhub_consumergroup_update](#test-203) +- [Test 204: eventhubs_eventhub_delete](#test-204) +- [Test 205: eventhubs_eventhub_get](#test-205) +- [Test 206: eventhubs_eventhub_get](#test-206) +- [Test 207: eventhubs_eventhub_update](#test-207) +- [Test 208: eventhubs_eventhub_update](#test-208) +- [Test 209: eventhubs_namespace_delete](#test-209) +- [Test 210: eventhubs_namespace_get](#test-210) +- [Test 211: eventhubs_namespace_get](#test-211) +- [Test 212: eventhubs_namespace_update](#test-212) +- [Test 213: eventhubs_namespace_update](#test-213) +- [Test 214: functionapp_get](#test-214) +- [Test 215: functionapp_get](#test-215) +- [Test 216: functionapp_get](#test-216) +- [Test 217: functionapp_get](#test-217) +- [Test 218: functionapp_get](#test-218) +- [Test 219: functionapp_get](#test-219) +- [Test 220: functionapp_get](#test-220) +- [Test 221: 
functionapp_get](#test-221) +- [Test 222: functionapp_get](#test-222) +- [Test 223: functionapp_get](#test-223) +- [Test 224: functionapp_get](#test-224) +- [Test 225: functionapp_get](#test-225) +- [Test 226: keyvault_admin_settings_get](#test-226) +- [Test 227: keyvault_admin_settings_get](#test-227) +- [Test 228: keyvault_admin_settings_get](#test-228) +- [Test 229: keyvault_certificate_create](#test-229) +- [Test 230: keyvault_certificate_create](#test-230) +- [Test 231: keyvault_certificate_create](#test-231) +- [Test 232: keyvault_certificate_create](#test-232) +- [Test 233: keyvault_certificate_create](#test-233) +- [Test 234: keyvault_certificate_get](#test-234) +- [Test 235: keyvault_certificate_get](#test-235) +- [Test 236: keyvault_certificate_get](#test-236) +- [Test 237: keyvault_certificate_get](#test-237) +- [Test 238: keyvault_certificate_get](#test-238) +- [Test 239: keyvault_certificate_import](#test-239) +- [Test 240: keyvault_certificate_import](#test-240) +- [Test 241: keyvault_certificate_import](#test-241) +- [Test 242: keyvault_certificate_import](#test-242) +- [Test 243: keyvault_certificate_import](#test-243) +- [Test 244: keyvault_certificate_list](#test-244) +- [Test 245: keyvault_certificate_list](#test-245) +- [Test 246: keyvault_certificate_list](#test-246) +- [Test 247: keyvault_certificate_list](#test-247) +- [Test 248: keyvault_certificate_list](#test-248) +- [Test 249: keyvault_certificate_list](#test-249) +- [Test 250: keyvault_key_create](#test-250) +- [Test 251: keyvault_key_create](#test-251) +- [Test 252: keyvault_key_create](#test-252) +- [Test 253: keyvault_key_create](#test-253) +- [Test 254: keyvault_key_create](#test-254) +- [Test 255: keyvault_key_get](#test-255) +- [Test 256: keyvault_key_get](#test-256) +- [Test 257: keyvault_key_get](#test-257) +- [Test 258: keyvault_key_get](#test-258) +- [Test 259: keyvault_key_get](#test-259) +- [Test 260: keyvault_key_list](#test-260) +- [Test 261: keyvault_key_list](#test-261) 
+- [Test 262: keyvault_key_list](#test-262) +- [Test 263: keyvault_key_list](#test-263) +- [Test 264: keyvault_key_list](#test-264) +- [Test 265: keyvault_key_list](#test-265) +- [Test 266: keyvault_secret_create](#test-266) +- [Test 267: keyvault_secret_create](#test-267) +- [Test 268: keyvault_secret_create](#test-268) +- [Test 269: keyvault_secret_create](#test-269) +- [Test 270: keyvault_secret_create](#test-270) +- [Test 271: keyvault_secret_get](#test-271) +- [Test 272: keyvault_secret_get](#test-272) +- [Test 273: keyvault_secret_get](#test-273) +- [Test 274: keyvault_secret_get](#test-274) +- [Test 275: keyvault_secret_get](#test-275) +- [Test 276: keyvault_secret_list](#test-276) +- [Test 277: keyvault_secret_list](#test-277) +- [Test 278: keyvault_secret_list](#test-278) +- [Test 279: keyvault_secret_list](#test-279) +- [Test 280: keyvault_secret_list](#test-280) +- [Test 281: keyvault_secret_list](#test-281) +- [Test 282: aks_cluster_get](#test-282) +- [Test 283: aks_cluster_get](#test-283) +- [Test 284: aks_cluster_get](#test-284) +- [Test 285: aks_cluster_get](#test-285) +- [Test 286: aks_cluster_get](#test-286) +- [Test 287: aks_cluster_get](#test-287) +- [Test 288: aks_cluster_get](#test-288) +- [Test 289: aks_nodepool_get](#test-289) +- [Test 290: aks_nodepool_get](#test-290) +- [Test 291: aks_nodepool_get](#test-291) +- [Test 292: aks_nodepool_get](#test-292) +- [Test 293: aks_nodepool_get](#test-293) +- [Test 294: aks_nodepool_get](#test-294) +- [Test 295: loadtesting_test_create](#test-295) +- [Test 296: loadtesting_test_get](#test-296) +- [Test 297: loadtesting_testresource_create](#test-297) +- [Test 298: loadtesting_testresource_list](#test-298) +- [Test 299: loadtesting_testrun_create](#test-299) +- [Test 300: loadtesting_testrun_get](#test-300) +- [Test 301: loadtesting_testrun_list](#test-301) +- [Test 302: loadtesting_testrun_update](#test-302) +- [Test 303: grafana_list](#test-303) +- [Test 304: managedlustre_fs_create](#test-304) +- 
[Test 305: managedlustre_fs_list](#test-305) +- [Test 306: managedlustre_fs_list](#test-306) +- [Test 307: managedlustre_fs_sku_get](#test-307) +- [Test 308: managedlustre_fs_subnetsize_ask](#test-308) +- [Test 309: managedlustre_fs_subnetsize_validate](#test-309) +- [Test 310: managedlustre_fs_update](#test-310) +- [Test 311: marketplace_product_get](#test-311) +- [Test 312: marketplace_product_list](#test-312) +- [Test 313: marketplace_product_list](#test-313) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) - [Test 314: get_bestpractices_get](#test-314) - [Test 315: get_bestpractices_get](#test-315) - [Test 316: get_bestpractices_get](#test-316) @@ -477,6 +949,10 @@ - [Test 462: cloudarchitect_design](#test-462) - [Test 463: cloudarchitect_design](#test-463) - [Test 464: cloudarchitect_design](#test-464) +<<<<<<< HEAD +======= +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -490,10 +966,24 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.705410 | `foundry_agents_connect` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.663468 | `foundry_agents_list` | ❌ | | 3 | 0.617213 | `foundry_resource_get` | ❌ | | 4 | 0.548044 | `foundry_openai_models-list` | ❌ | | 5 | 0.547459 | `foundry_agents_get-sdk-sample` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.663568 | `foundry_agents_list` | ❌ | +| 3 | 0.617213 | `foundry_resource_get` | ❌ | +| 4 | 0.548044 | `foundry_openai_models-list` | ❌ | +| 5 | 0.537580 | `foundry_agents_query-and-evaluate` | ❌ | +======= +| 2 | 0.617213 | `foundry_resource_get` | ❌ | +| 3 | 0.592487 | `foundry_agents_list` | ❌ | +| 4 | 0.537591 | `foundry_agents_query-and-evaluate` | ❌ | +| 5 | 0.536533 | `search_index_query` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -506,11 +996,23 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.543045 | `foundry_agents_query-and-evaluate` | ❌ | | 2 | 0.469272 | `foundry_agents_evaluate` | ✅ **EXPECTED** | | 3 | 0.445585 | `foundry_agents_connect` | ❌ | | 4 | 0.298494 | `foundry_threads_list` | ❌ | | 5 | 0.279058 | `foundry_agents_list` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.544099 | `foundry_agents_query-and-evaluate` | ❌ | +======= +| 1 | 0.544237 | `foundry_agents_query-and-evaluate` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.469428 | `foundry_agents_evaluate` | ✅ **EXPECTED** | +| 3 | 0.445964 | `foundry_agents_connect` | ❌ | +| 4 | 0.278921 | `foundry_agents_list` | ❌ | +| 5 | 0.250023 | `monitor_workspace_log_query` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -523,11 +1025,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.797701 | `foundry_agents_list` | ✅ **EXPECTED** | | 2 | 0.666021 | `foundry_resource_get` | ❌ | | 3 | 0.654206 | `foundry_openai_models-list` | ❌ | | 4 | 0.647246 | `foundry_threads_list` | ❌ | | 5 | 0.575761 | `foundry_models_deployments_list` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.797877 | `foundry_agents_list` | ✅ **EXPECTED** | +| 2 | 0.666021 | `foundry_resource_get` | ❌ | +| 3 | 0.654206 | `foundry_openai_models-list` | ❌ | +| 4 | 0.575553 | `foundry_models_deployments_list` | ❌ | +| 5 | 0.561946 | `search_service_list` | ❌ | +======= +| 1 | 0.748474 | `foundry_agents_list` | ✅ **EXPECTED** | +| 2 | 0.666021 | `foundry_resource_get` | ❌ | +| 3 | 0.561946 | `search_service_list` | ❌ | +| 4 | 0.556912 | `foundry_agents_connect` | ❌ | +| 5 | 0.542125 | `foundry_knowledge_index_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -540,11 +1058,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.749704 | `foundry_agents_list` | 
✅ **EXPECTED** | | 2 | 0.630323 | `foundry_resource_get` | ❌ | | 3 | 0.611801 | `foundry_openai_models-list` | ❌ | | 4 | 0.603708 | `foundry_threads_list` | ❌ | | 5 | 0.556580 | `foundry_agents_get-sdk-sample` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.749829 | `foundry_agents_list` | ✅ **EXPECTED** | +| 2 | 0.630288 | `foundry_resource_get` | ❌ | +| 3 | 0.611722 | `foundry_openai_models-list` | ❌ | +| 4 | 0.548511 | `foundry_agents_connect` | ❌ | +| 5 | 0.535020 | `foundry_models_list` | ❌ | +======= +| 1 | 0.730759 | `foundry_agents_list` | ✅ **EXPECTED** | +| 2 | 0.630288 | `foundry_resource_get` | ❌ | +| 3 | 0.548511 | `foundry_agents_connect` | ❌ | +| 4 | 0.535020 | `foundry_models_list` | ❌ | +| 5 | 0.519892 | `foundry_knowledge_index_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -575,10 +1109,23 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.652200 | `foundry_agents_connect` | ❌ | +<<<<<<< HEAD | 2 | 0.570725 | `foundry_agents_list` | ❌ | | 3 | 0.553233 | `foundry_agents_query-and-evaluate` | ✅ **EXPECTED** | | 4 | 0.493778 | `foundry_agents_evaluate` | ❌ | | 5 | 0.469431 | `foundry_threads_list` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.570788 | `foundry_agents_list` | ❌ | +| 3 | 0.553190 | `foundry_agents_query-and-evaluate` | ✅ **EXPECTED** | +| 4 | 0.493779 | `foundry_agents_evaluate` | ❌ | +======= +| 2 | 0.553370 | `foundry_agents_query-and-evaluate` | ✅ **EXPECTED** | +| 3 | 0.493779 | `foundry_agents_evaluate` | ❌ | +| 4 | 0.469096 | `foundry_agents_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.460662 | `foundry_resource_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -694,7 +1241,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.739885 | `foundry_knowledge_index_schema` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.589536 | 
`foundry_knowledge_index_list` | ❌ | +======= +| 2 | 0.614851 | `foundry_knowledge_index_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 3 | 0.494004 | `foundry_resource_get` | ❌ | | 4 | 0.491510 | `search_index_get` | ❌ | | 5 | 0.490410 | `search_knowledge_base_get` | ❌ | @@ -728,10 +1279,21 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.562920 | `foundry_models_deploy` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.299986 | `foundry_openai_models-list` | ❌ | | 3 | 0.298490 | `loadtesting_testrun_create` | ❌ | | 4 | 0.293050 | `loadtesting_testresource_create` | ❌ | +<<<<<<< HEAD | 5 | 0.290387 | `foundry_openai_embeddings-create` | ❌ | +======= +| 5 | 0.290381 | `foundry_openai_embeddings-create` | ❌ | +======= +| 2 | 0.335116 | `foundry_openai_models-list` | ❌ | +| 3 | 0.298490 | `loadtesting_testrun_create` | ❌ | +| 4 | 0.293050 | `loadtesting_testresource_create` | ❌ | +| 5 | 0.282464 | `mysql_server_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -744,11 +1306,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.681081 | `foundry_models_deployments_list` | ✅ **EXPECTED** | | 2 | 0.674510 | `foundry_openai_models-list` | ❌ | | 3 | 0.572625 | `foundry_threads_list` | ❌ | | 4 | 0.568871 | `foundry_agents_list` | ❌ | | 5 | 0.566272 | `foundry_resource_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.681385 | `foundry_models_deployments_list` | ✅ **EXPECTED** | +| 2 | 0.674510 | `foundry_openai_models-list` | ❌ | +| 3 | 0.569059 | `foundry_agents_list` | ❌ | +| 4 | 0.566272 | `foundry_resource_get` | ❌ | +| 5 | 0.549636 | `foundry_models_list` | ❌ | +======= +| 1 | 0.663599 | `foundry_models_deployments_list` | ✅ **EXPECTED** | +| 2 | 0.583429 | `foundry_openai_models-list` | ❌ | +| 3 | 0.566272 | `foundry_resource_get` | ❌ | +| 4 | 0.549636 | `foundry_models_list` | ❌ | +| 
5 | 0.539695 | `foundry_agents_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -761,11 +1339,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.619840 | `foundry_models_deployments_list` | ✅ **EXPECTED** | | 2 | 0.619299 | `foundry_openai_models-list` | ❌ | | 3 | 0.543385 | `foundry_resource_get` | ❌ | | 4 | 0.540528 | `foundry_agents_list` | ❌ | | 5 | 0.527141 | `foundry_threads_list` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.620173 | `foundry_models_deployments_list` | ✅ **EXPECTED** | +| 2 | 0.619231 | `foundry_openai_models-list` | ❌ | +| 3 | 0.543352 | `foundry_resource_get` | ❌ | +| 4 | 0.540551 | `foundry_agents_list` | ❌ | +| 5 | 0.521475 | `foundry_models_deploy` | ❌ | +======= +| 1 | 0.606516 | `foundry_models_deployments_list` | ✅ **EXPECTED** | +| 2 | 0.543352 | `foundry_resource_get` | ❌ | +| 3 | 0.521475 | `foundry_models_deploy` | ❌ | +| 4 | 0.518221 | `foundry_models_list` | ❌ | +| 5 | 0.507301 | `foundry_openai_models-list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -778,11 +1372,25 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.603415 | `foundry_openai_models-list` | ❌ | | 2 | 0.560022 | `foundry_models_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 3 | 0.553634 | `foundry_threads_list` | ❌ | | 4 | 0.537958 | `foundry_models_deployments_list` | ❌ | | 5 | 0.519191 | `foundry_agents_list` | ❌ | +======= +| 3 | 0.537981 | `foundry_models_deployments_list` | ❌ | +| 4 | 0.519472 | `foundry_agents_list` | ❌ | +| 5 | 0.514253 | `foundry_resource_get` | ❌ | +======= +| 1 | 0.560022 | `foundry_models_list` | ✅ **EXPECTED** | +| 2 | 0.514253 | `foundry_resource_get` | ❌ | +| 3 | 0.506418 | `foundry_models_deployments_list` | ❌ | +| 4 | 0.491952 | `foundry_agents_list` | ❌ | +| 5 | 
0.475204 | `foundry_openai_models-list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -795,11 +1403,24 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.576904 | `foundry_openai_models-list` | ❌ | | 2 | 0.574818 | `foundry_models_list` | ✅ **EXPECTED** | | 3 | 0.525312 | `foundry_resource_get` | ❌ | +<<<<<<< HEAD | 4 | 0.522153 | `foundry_agents_get-sdk-sample` | ❌ | | 5 | 0.517825 | `foundry_models_deployments_list` | ❌ | +======= +| 4 | 0.517980 | `foundry_models_deployments_list` | ❌ | +| 5 | 0.504087 | `foundry_agents_list` | ❌ | +======= +| 1 | 0.574818 | `foundry_models_list` | ✅ **EXPECTED** | +| 2 | 0.525312 | `foundry_resource_get` | ❌ | +| 3 | 0.497061 | `foundry_models_deployments_list` | ❌ | +| 4 | 0.475139 | `foundry_agents_list` | ❌ | +| 5 | 0.467671 | `foundry_models_deploy` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -812,11 +1433,25 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.641293 | `foundry_openai_chat-completions-create` | ✅ **EXPECTED** | | 2 | 0.546736 | `foundry_openai_create-completion` | ❌ | +<<<<<<< HEAD | 3 | 0.420018 | `foundry_threads_create` | ❌ | | 4 | 0.415482 | `foundry_agents_connect` | ❌ | | 5 | 0.399382 | `foundry_openai_embeddings-create` | ❌ | +======= +| 3 | 0.415483 | `foundry_agents_connect` | ❌ | +| 4 | 0.399383 | `foundry_openai_embeddings-create` | ❌ | +| 5 | 0.364105 | `foundry_models_deploy` | ❌ | +======= +| 1 | 0.558888 | `foundry_openai_chat-completions-create` | ✅ **EXPECTED** | +| 2 | 0.533147 | `foundry_openai_create-completion` | ❌ | +| 3 | 0.415483 | `foundry_agents_connect` | ❌ | +| 4 | 0.364105 | `foundry_models_deploy` | ❌ | +| 5 | 0.361151 | `foundry_resource_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description 
evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -829,11 +1464,25 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.696936 | `foundry_openai_create-completion` | ✅ **EXPECTED** | | 2 | 0.579108 | `foundry_openai_chat-completions-create` | ❌ | +<<<<<<< HEAD | 3 | 0.465558 | `azureaibestpractices_get` | ❌ | | 4 | 0.463703 | `foundry_models_deploy` | ❌ | | 5 | 0.459126 | `foundry_resource_get` | ❌ | +======= +| 3 | 0.463703 | `foundry_models_deploy` | ❌ | +| 4 | 0.459126 | `foundry_resource_get` | ❌ | +| 5 | 0.458622 | `foundry_openai_embeddings-create` | ❌ | +======= +| 1 | 0.682250 | `foundry_openai_create-completion` | ✅ **EXPECTED** | +| 2 | 0.539297 | `foundry_openai_chat-completions-create` | ❌ | +| 3 | 0.463703 | `foundry_models_deploy` | ❌ | +| 4 | 0.459126 | `foundry_resource_get` | ❌ | +| 5 | 0.450993 | `deploy_pipeline_guidance_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -846,11 +1495,24 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.766496 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | | 2 | 0.543339 | `foundry_models_deploy` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.766338 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | +| 2 | 0.543338 | `foundry_models_deploy` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.542214 | `foundry_openai_create-completion` | ❌ | | 4 | 0.520746 | `foundry_openai_models-list` | ❌ | | 5 | 0.519335 | `foundry_resource_get` | ❌ | +======= +| 1 | 0.681346 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | +| 2 | 0.556419 | `foundry_openai_create-completion` | ❌ | +| 3 | 0.543338 | `foundry_models_deploy` | ❌ | +| 4 | 0.519335 | `foundry_resource_get` | ❌ | +| 5 | 0.463954 | `foundry_openai_models-list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description 
evaluator) --- @@ -863,11 +1525,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.724369 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | | 2 | 0.494544 | `foundry_resource_get` | ❌ | | 3 | 0.480389 | `foundry_models_deploy` | ❌ | | 4 | 0.480294 | `foundry_openai_create-completion` | ❌ | | 5 | 0.463885 | `foundry_openai_chat-completions-create` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.724120 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | +| 2 | 0.494485 | `foundry_resource_get` | ❌ | +| 3 | 0.480296 | `foundry_models_deploy` | ❌ | +| 4 | 0.480218 | `foundry_openai_create-completion` | ❌ | +| 5 | 0.463797 | `foundry_openai_chat-completions-create` | ❌ | +======= +| 1 | 0.638843 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | +| 2 | 0.494506 | `foundry_openai_create-completion` | ❌ | +| 3 | 0.494485 | `foundry_resource_get` | ❌ | +| 4 | 0.480296 | `foundry_models_deploy` | ❌ | +| 5 | 0.399908 | `foundry_openai_chat-completions-create` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -880,11 +1558,25 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.799059 | `foundry_openai_models-list` | ✅ **EXPECTED** | | 2 | 0.668887 | `foundry_resource_get` | ❌ | +<<<<<<< HEAD | 3 | 0.667041 | `foundry_models_list` | ❌ | | 4 | 0.666560 | `foundry_models_deployments_list` | ❌ | | 5 | 0.657393 | `foundry_agents_list` | ❌ | +======= +| 3 | 0.667040 | `foundry_models_list` | ❌ | +| 4 | 0.666207 | `foundry_models_deployments_list` | ❌ | +| 5 | 0.657546 | `foundry_agents_list` | ❌ | +======= +| 1 | 0.729075 | `foundry_openai_models-list` | ✅ **EXPECTED** | +| 2 | 0.668887 | `foundry_resource_get` | ❌ | +| 3 | 0.667040 | `foundry_models_list` | ❌ | +| 4 | 0.660489 | `foundry_agents_list` | ❌ | +| 5 | 0.604808 | `foundry_models_deployments_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and 
tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -897,11 +1589,23 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.741659 | `foundry_openai_models-list` | ✅ **EXPECTED** | | 2 | 0.660115 | `foundry_models_deployments_list` | ❌ | | 3 | 0.648218 | `foundry_resource_get` | ❌ | | 4 | 0.640650 | `foundry_models_deploy` | ❌ | +<<<<<<< HEAD | 5 | 0.619790 | `foundry_agents_list` | ❌ | +======= +| 5 | 0.619878 | `foundry_agents_list` | ❌ | +======= +| 1 | 0.654318 | `foundry_openai_models-list` | ✅ **EXPECTED** | +| 2 | 0.648219 | `foundry_resource_get` | ❌ | +| 3 | 0.640650 | `foundry_models_deploy` | ❌ | +| 4 | 0.637676 | `foundry_models_deployments_list` | ❌ | +| 5 | 0.576563 | `foundry_agents_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -932,10 +1636,17 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.665311 | `foundry_resource_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.585305 | `foundry_openai_models-list` | ❌ | | 3 | 0.553808 | `foundry_agents_list` | ❌ | | 4 | 0.518747 | `foundry_openai_embeddings-create` | ❌ | | 5 | 0.492911 | `foundry_models_deploy` | ❌ | +======= +| 2 | 0.492911 | `foundry_models_deploy` | ❌ | +| 3 | 0.474905 | `foundry_agents_list` | ❌ | +| 4 | 0.467211 | `loadtesting_testresource_list` | ❌ | +| 5 | 0.453632 | `foundry_openai_models-list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- @@ -949,10 +1660,23 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.735316 | `foundry_resource_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.571906 | `foundry_openai_models-list` | ❌ | +<<<<<<< HEAD | 3 | 0.509484 | `monitor_webtests_get` | ❌ | | 4 | 0.496980 | `foundry_openai_embeddings-create` | ❌ | | 5 | 0.475498 | `foundry_agents_list` | ❌ | +======= +| 3 | 0.510197 | 
`monitor_webtests_get` | ❌ | +| 4 | 0.497090 | `foundry_openai_embeddings-create` | ❌ | +| 5 | 0.475722 | `foundry_agents_list` | ❌ | +======= +| 2 | 0.509484 | `monitor_webtests_get` | ❌ | +| 3 | 0.455154 | `foundry_openai_models-list` | ❌ | +| 4 | 0.452340 | `foundry_models_deploy` | ❌ | +| 5 | 0.444390 | `loadtesting_testresource_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -965,11 +1689,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.785967 | `search_knowledge_base_get` | ✅ **EXPECTED** | | 2 | 0.700824 | `search_knowledge_source_get` | ❌ | | 3 | 0.692681 | `search_service_list` | ❌ | | 4 | 0.635863 | `search_knowledge_base_retrieve` | ❌ | | 5 | 0.586575 | `search_index_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.785556 | `search_knowledge_base_get` | ✅ **EXPECTED** | +| 2 | 0.700785 | `search_knowledge_source_get` | ❌ | +| 3 | 0.693600 | `search_service_list` | ❌ | +| 4 | 0.635477 | `search_knowledge_base_retrieve` | ❌ | +| 5 | 0.586578 | `search_index_get` | ❌ | +======= +| 1 | 0.785967 | `search_knowledge_base_get` | ✅ **EXPECTED** | +| 2 | 0.700968 | `search_knowledge_source_get` | ❌ | +| 3 | 0.693471 | `search_service_list` | ❌ | +| 4 | 0.635863 | `search_knowledge_base_retrieve` | ❌ | +| 5 | 0.603324 | `foundry_knowledge_index_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -983,7 +1723,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.748213 | `search_knowledge_base_get` | ✅ **EXPECTED** | -| 2 | 0.668487 | `search_knowledge_source_get` | ❌ | +| 2 | 0.668479 | `search_knowledge_source_get` | ❌ | | 3 | 0.628582 | `search_knowledge_base_retrieve` | ❌ | | 4 | 0.623715 | `search_service_list` | ❌ | | 5 | 0.566618 | `search_index_get` | ❌ | @@ -1000,7 +1740,7 @@ | Rank | Score | 
Tool | Status | |------|-------|------|--------| | 1 | 0.702942 | `search_knowledge_base_get` | ✅ **EXPECTED** | -| 2 | 0.605964 | `search_knowledge_source_get` | ❌ | +| 2 | 0.606164 | `search_knowledge_source_get` | ❌ | | 3 | 0.583234 | `search_knowledge_base_retrieve` | ❌ | | 4 | 0.512825 | `search_service_list` | ❌ | | 5 | 0.476815 | `foundry_knowledge_index_list` | ❌ | @@ -1016,11 +1756,19 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.688155 | `search_knowledge_base_get` | ✅ **EXPECTED** | | 2 | 0.599348 | `search_knowledge_source_get` | ❌ | | 3 | 0.578437 | `search_knowledge_base_retrieve` | ❌ | | 4 | 0.456512 | `search_service_list` | ❌ | | 5 | 0.439493 | `foundry_knowledge_index_list` | ❌ | +======= +| 1 | 0.688051 | `search_knowledge_base_get` | ✅ **EXPECTED** | +| 2 | 0.599305 | `search_knowledge_source_get` | ❌ | +| 3 | 0.578499 | `search_knowledge_base_retrieve` | ❌ | +| 4 | 0.457619 | `search_service_list` | ❌ | +| 5 | 0.439529 | `foundry_knowledge_index_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1033,11 +1781,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.769383 | `search_knowledge_base_get` | ✅ **EXPECTED** | | 2 | 0.685640 | `search_knowledge_source_get` | ❌ | | 3 | 0.636958 | `search_knowledge_base_retrieve` | ❌ | | 4 | 0.585949 | `search_index_get` | ❌ | | 5 | 0.533298 | `search_service_list` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.769443 | `search_knowledge_base_get` | ✅ **EXPECTED** | +| 2 | 0.685642 | `search_knowledge_source_get` | ❌ | +| 3 | 0.636767 | `search_knowledge_base_retrieve` | ❌ | +| 4 | 0.586085 | `search_index_get` | ❌ | +| 5 | 0.533859 | `search_service_list` | ❌ | +======= +| 1 | 0.769384 | `search_knowledge_base_get` | ✅ **EXPECTED** | +| 2 | 0.685412 | `search_knowledge_source_get` | ❌ | +| 3 | 0.636958 | `search_knowledge_base_retrieve` | ❌ | +| 4 | 0.585949 | `search_index_get` | ❌ | +| 5 | 
0.533700 | `search_service_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1052,9 +1816,15 @@ |------|-------|------|--------| | 1 | 0.595585 | `search_knowledge_base_get` | ✅ **EXPECTED** | | 2 | 0.551922 | `search_knowledge_base_retrieve` | ❌ | +<<<<<<< HEAD | 3 | 0.515480 | `search_knowledge_source_get` | ❌ | | 4 | 0.366170 | `search_service_list` | ❌ | | 5 | 0.365633 | `search_index_get` | ❌ | +======= +| 3 | 0.515607 | `search_knowledge_source_get` | ❌ | +| 4 | 0.376599 | `foundry_knowledge_index_list` | ❌ | +| 5 | 0.366893 | `search_service_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- @@ -1067,11 +1837,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.724869 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.650606 | `search_knowledge_base_get` | ❌ | | 3 | 0.575356 | `search_index_query` | ❌ | | 4 | 0.567386 | `search_knowledge_source_get` | ❌ | | 5 | 0.520336 | `foundry_agents_connect` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.724846 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | +| 2 | 0.650590 | `search_knowledge_base_get` | ❌ | +| 3 | 0.575307 | `search_index_query` | ❌ | +| 4 | 0.567361 | `search_knowledge_source_get` | ❌ | +| 5 | 0.520360 | `foundry_agents_connect` | ❌ | +======= +| 1 | 0.724733 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | +| 2 | 0.650523 | `search_knowledge_base_get` | ❌ | +| 3 | 0.575078 | `search_index_query` | ❌ | +| 4 | 0.566839 | `search_knowledge_source_get` | ❌ | +| 5 | 0.520277 | `foundry_agents_connect` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1084,11 +1870,24 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.633877 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 
0.589927 | `search_knowledge_base_get` | ❌ | | 3 | 0.502173 | `search_knowledge_source_get` | ❌ | | 4 | 0.422676 | `foundry_agents_query-and-evaluate` | ❌ | | 5 | 0.399110 | `search_index_query` | ❌ | +======= +| 1 | 0.633766 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | +| 2 | 0.589869 | `search_knowledge_base_get` | ❌ | +<<<<<<< HEAD +| 3 | 0.502085 | `search_knowledge_source_get` | ❌ | +| 4 | 0.422671 | `foundry_agents_query-and-evaluate` | ❌ | +======= +| 3 | 0.501973 | `search_knowledge_source_get` | ❌ | +| 4 | 0.422489 | `foundry_agents_query-and-evaluate` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.399595 | `search_index_query` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1101,11 +1900,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.657866 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.557206 | `search_knowledge_base_get` | ❌ | | 3 | 0.463605 | `search_knowledge_source_get` | ❌ | | 4 | 0.436719 | `foundry_agents_query-and-evaluate` | ❌ | | 5 | 0.422173 | `foundry_agents_connect` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.657844 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | +| 2 | 0.557115 | `search_knowledge_base_get` | ❌ | +| 3 | 0.463461 | `search_knowledge_source_get` | ❌ | +| 4 | 0.436952 | `foundry_agents_query-and-evaluate` | ❌ | +| 5 | 0.422469 | `foundry_agents_connect` | ❌ | +======= +| 1 | 0.657865 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | +| 2 | 0.557206 | `search_knowledge_base_get` | ❌ | +| 3 | 0.463023 | `search_knowledge_source_get` | ❌ | +| 4 | 0.436580 | `foundry_agents_query-and-evaluate` | ❌ | +| 5 | 0.422173 | `foundry_agents_connect` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1120,9 +1935,19 @@ |------|-------|------|--------| | 1 | 0.633766 | `search_knowledge_base_retrieve` | ✅ 
**EXPECTED** | | 2 | 0.589869 | `search_knowledge_base_get` | ❌ | +<<<<<<< HEAD | 3 | 0.502085 | `search_knowledge_source_get` | ❌ | +<<<<<<< HEAD | 4 | 0.422610 | `foundry_agents_query-and-evaluate` | ❌ | | 5 | 0.399521 | `search_index_query` | ❌ | +======= +| 4 | 0.422671 | `foundry_agents_query-and-evaluate` | ❌ | +======= +| 3 | 0.501973 | `search_knowledge_source_get` | ❌ | +| 4 | 0.422489 | `foundry_agents_query-and-evaluate` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.399595 | `search_index_query` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1137,8 +1962,17 @@ |------|-------|------|--------| | 1 | 0.598868 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.547862 | `search_knowledge_base_get` | ❌ | +<<<<<<< HEAD | 3 | 0.467868 | `foundry_agents_query-and-evaluate` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.467907 | `foundry_agents_query-and-evaluate` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.464904 | `search_knowledge_source_get` | ❌ | +======= +| 3 | 0.467711 | `foundry_agents_query-and-evaluate` | ❌ | +| 4 | 0.464987 | `search_knowledge_source_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.412481 | `foundry_agents_connect` | ❌ | --- @@ -1152,11 +1986,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.649767 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.631435 | `search_knowledge_base_get` | ❌ | | 3 | 0.581359 | `search_index_query` | ❌ | | 4 | 0.571156 | `search_knowledge_source_get` | ❌ | | 5 | 0.544545 | `search_service_list` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.649751 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | +| 2 | 0.631420 | `search_knowledge_base_get` | ❌ | +| 3 | 0.581412 | `search_index_query` | ❌ | +| 4 | 0.571126 | `search_knowledge_source_get` | ❌ | +| 5 | 0.544488 | `search_service_list` | ❌ | +======= +| 1 | 0.649767 | 
`search_knowledge_base_retrieve` | ✅ **EXPECTED** | +| 2 | 0.631435 | `search_knowledge_base_get` | ❌ | +| 3 | 0.581387 | `search_index_query` | ❌ | +| 4 | 0.571101 | `search_knowledge_source_get` | ❌ | +| 5 | 0.544501 | `search_service_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1171,9 +2021,20 @@ |------|-------|------|--------| | 1 | 0.579716 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.560688 | `search_knowledge_base_get` | ❌ | +<<<<<<< HEAD | 3 | 0.477941 | `search_knowledge_source_get` | ❌ | | 4 | 0.402530 | `foundry_agents_query-and-evaluate` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.477942 | `search_knowledge_source_get` | ❌ | +| 4 | 0.402582 | `foundry_agents_query-and-evaluate` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.361231 | `foundry_knowledge_index_list` | ❌ | +======= +| 3 | 0.478132 | `search_knowledge_source_get` | ❌ | +| 4 | 0.402474 | `foundry_agents_query-and-evaluate` | ❌ | +| 5 | 0.371055 | `foundry_knowledge_index_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- @@ -1186,11 +2047,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.582662 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.528610 | `search_knowledge_base_get` | ❌ | | 3 | 0.449336 | `search_knowledge_source_get` | ❌ | | 4 | 0.447690 | `foundry_agents_query-and-evaluate` | ❌ | | 5 | 0.397187 | `foundry_agents_connect` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.582660 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | +| 2 | 0.528583 | `search_knowledge_base_get` | ❌ | +| 3 | 0.449290 | `search_knowledge_source_get` | ❌ | +| 4 | 0.447915 | `foundry_agents_query-and-evaluate` | ❌ | +| 5 | 0.397238 | `foundry_agents_connect` | ❌ | +======= +| 1 | 0.582662 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | +| 2 | 0.528610 | 
`search_knowledge_base_get` | ❌ | +| 3 | 0.449340 | `search_knowledge_source_get` | ❌ | +| 4 | 0.447632 | `foundry_agents_query-and-evaluate` | ❌ | +| 5 | 0.397187 | `foundry_agents_connect` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1203,11 +2080,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.760406 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.690845 | `search_service_list` | ❌ | | 3 | 0.665905 | `search_knowledge_base_get` | ❌ | | 4 | 0.573014 | `search_index_get` | ❌ | | 5 | 0.560755 | `search_knowledge_base_retrieve` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.760416 | `search_knowledge_source_get` | ✅ **EXPECTED** | +| 2 | 0.691931 | `search_service_list` | ❌ | +| 3 | 0.665923 | `search_knowledge_base_get` | ❌ | +| 4 | 0.573012 | `search_index_get` | ❌ | +| 5 | 0.560779 | `search_knowledge_base_retrieve` | ❌ | +======= +| 1 | 0.760757 | `search_knowledge_source_get` | ✅ **EXPECTED** | +| 2 | 0.692251 | `search_service_list` | ❌ | +| 3 | 0.666204 | `search_knowledge_base_get` | ❌ | +| 4 | 0.579582 | `foundry_knowledge_index_list` | ❌ | +| 5 | 0.573177 | `search_index_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1220,8 +2113,13 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.737860 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.659236 | `search_service_list` | ❌ | +======= +| 1 | 0.737971 | `search_knowledge_source_get` | ✅ **EXPECTED** | +| 2 | 0.660170 | `search_service_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.652969 | `search_knowledge_base_get` | ❌ | | 4 | 0.578836 | `search_index_get` | ❌ | | 5 | 0.560519 | `search_index_query` | ❌ | @@ -1237,7 +2135,11 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.657936 | `search_knowledge_source_get` | ✅ **EXPECTED** | +======= +| 1 | 0.658365 | `search_knowledge_source_get` | ✅ **EXPECTED** | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.558516 | `search_knowledge_base_get` | ❌ | | 3 | 0.510338 | `search_service_list` | ❌ | | 4 | 0.470560 | `search_knowledge_base_retrieve` | ❌ | @@ -1254,7 +2156,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.652945 | `search_knowledge_source_get` | ✅ **EXPECTED** | +| 1 | 0.653143 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.563270 | `search_knowledge_base_get` | ❌ | | 3 | 0.485934 | `search_service_list` | ❌ | | 4 | 0.477636 | `search_knowledge_base_retrieve` | ❌ | @@ -1271,8 +2173,13 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.825604 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.693438 | `search_knowledge_base_get` | ❌ | +======= +| 1 | 0.825664 | `search_knowledge_source_get` | ✅ **EXPECTED** | +| 2 | 0.693437 | `search_knowledge_base_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.595643 | `search_index_get` | ❌ | | 4 | 0.540550 | `search_knowledge_base_retrieve` | ❌ | | 5 | 0.531085 | `search_service_list` | ❌ | @@ -1288,7 +2195,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.630840 | `search_knowledge_source_get` | ✅ **EXPECTED** | +| 1 | 0.631283 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.523643 | `search_knowledge_base_get` | ❌ | | 3 | 0.459923 | `search_knowledge_base_retrieve` | ❌ | | 4 | 0.371465 | `search_index_get` | ❌ | @@ -1308,8 +2215,13 @@ | 1 | 0.681052 | `search_index_get` | ✅ **EXPECTED** | | 2 | 0.544557 | `foundry_knowledge_index_schema` | ❌ | | 3 | 0.528153 | `search_knowledge_base_get` | ❌ | +<<<<<<< HEAD | 4 | 0.521765 | `search_knowledge_source_get` | ❌ | | 5 | 0.490553 | `search_service_list` 
| ❌ | +======= +| 4 | 0.522514 | `search_knowledge_source_get` | ❌ | +| 5 | 0.490624 | `search_service_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1326,7 +2238,7 @@ | 2 | 0.619949 | `search_service_list` | ❌ | | 3 | 0.538885 | `foundry_knowledge_index_list` | ❌ | | 4 | 0.511485 | `search_knowledge_base_get` | ❌ | -| 5 | 0.496094 | `search_knowledge_source_get` | ❌ | +| 5 | 0.496554 | `search_knowledge_source_get` | ❌ | --- @@ -1343,7 +2255,7 @@ | 2 | 0.562503 | `search_service_list` | ❌ | | 3 | 0.538471 | `foundry_knowledge_index_list` | ❌ | | 4 | 0.500365 | `search_knowledge_base_get` | ❌ | -| 5 | 0.490025 | `search_knowledge_source_get` | ❌ | +| 5 | 0.490330 | `search_knowledge_source_get` | ❌ | --- @@ -1356,11 +2268,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.522598 | `search_index_get` | ❌ | | 2 | 0.515911 | `search_index_query` | ✅ **EXPECTED** | | 3 | 0.498264 | `search_service_list` | ❌ | | 4 | 0.447868 | `search_knowledge_base_retrieve` | ❌ | | 5 | 0.437608 | `postgres_database_query` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.522953 | `search_index_get` | ❌ | +| 2 | 0.515871 | `search_index_query` | ✅ **EXPECTED** | +| 3 | 0.497392 | `search_service_list` | ❌ | +| 4 | 0.447993 | `search_knowledge_base_retrieve` | ❌ | +| 5 | 0.437640 | `postgres_database_query` | ❌ | +======= +| 1 | 0.522754 | `search_index_get` | ❌ | +| 2 | 0.515812 | `search_index_query` | ✅ **EXPECTED** | +| 3 | 0.497494 | `search_service_list` | ❌ | +| 4 | 0.447954 | `search_knowledge_base_retrieve` | ❌ | +| 5 | 0.437709 | `postgres_database_query` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1373,11 +2301,23 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.791803 | `search_service_list` | ✅ **EXPECTED** | | 2 | 0.553012 | `kusto_cluster_list` | ❌ | +======= +| 
1 | 0.793651 | `search_service_list` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.553011 | `kusto_cluster_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.509479 | `subscription_list` | ❌ | | 4 | 0.505971 | `search_index_get` | ❌ | | 5 | 0.504693 | `marketplace_product_list` | ❌ | +======= +| 2 | 0.553043 | `kusto_cluster_list` | ❌ | +| 3 | 0.520340 | `foundry_agents_list` | ❌ | +| 4 | 0.509461 | `subscription_list` | ❌ | +| 5 | 0.505971 | `search_index_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- @@ -1409,9 +2349,19 @@ |------|-------|------|--------| | 1 | 0.551241 | `search_service_list` | ✅ **EXPECTED** | | 2 | 0.436230 | `search_index_get` | ❌ | +<<<<<<< HEAD | 3 | 0.415277 | `search_knowledge_base_get` | ❌ | | 4 | 0.410461 | `search_knowledge_source_get` | ❌ | +<<<<<<< HEAD | 5 | 0.404707 | `search_index_query` | ❌ | +======= +| 5 | 0.404758 | `search_index_query` | ❌ | +======= +| 3 | 0.417096 | `foundry_agents_list` | ❌ | +| 4 | 0.415277 | `search_knowledge_base_get` | ❌ | +| 5 | 0.410568 | `search_knowledge_source_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1424,11 +2374,24 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.666038 | `speech_stt_recognize` | ✅ **EXPECTED** | | 2 | 0.377210 | `foundry_openai_embeddings-create` | ❌ | | 3 | 0.351127 | `deploy_plan_get` | ❌ | +<<<<<<< HEAD | 4 | 0.338137 | `extension_cli_generate` | ❌ | | 5 | 0.337763 | `deploy_pipeline_guidance_get` | ❌ | +======= +| 4 | 0.338047 | `extension_cli_generate` | ❌ | +| 5 | 0.337685 | `deploy_pipeline_guidance_get` | ❌ | +======= +| 1 | 0.677871 | `speech_tts_synthesize` | ❌ | +| 2 | 0.666038 | `speech_stt_recognize` | ✅ **EXPECTED** | +| 3 | 0.415224 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.365228 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 
0.351127 | `deploy_plan_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1442,10 +2405,24 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.511324 | `speech_stt_recognize` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.198123 | `foundry_agents_get-sdk-sample` | ❌ | | 3 | 0.192462 | `foundry_openai_embeddings-create` | ❌ | | 4 | 0.170157 | `foundry_openai_create-completion` | ❌ | | 5 | 0.167159 | `foundry_openai_chat-completions-create` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.192450 | `foundry_openai_embeddings-create` | ❌ | +| 3 | 0.170157 | `foundry_openai_create-completion` | ❌ | +| 4 | 0.167159 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.159108 | `foundry_agents_connect` | ❌ | +======= +| 2 | 0.353620 | `speech_tts_synthesize` | ❌ | +| 3 | 0.202056 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.190197 | `foundry_openai_embeddings-create` | ❌ | +| 5 | 0.184542 | `foundry_openai_create-completion` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1459,10 +2436,24 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.486489 | `speech_stt_recognize` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.162863 | `foundry_threads_create` | ❌ | | 3 | 0.160209 | `foundry_agents_connect` | ❌ | | 4 | 0.156936 | `deploy_pipeline_guidance_get` | ❌ | | 5 | 0.154737 | `foundry_openai_create-completion` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.160209 | `foundry_agents_connect` | ❌ | +| 3 | 0.156850 | `deploy_pipeline_guidance_get` | ❌ | +| 4 | 0.154737 | `foundry_openai_create-completion` | ❌ | +| 5 | 0.154098 | `foundry_openai_embeddings-create` | ❌ | +======= +| 2 | 0.354154 | `speech_tts_synthesize` | ❌ | +| 3 | 0.180941 | `foundry_openai_create-completion` | ❌ | +| 4 | 0.178944 | `foundry_openai_chat-completions-create` | ❌ | +| 
5 | 0.160209 | `foundry_agents_connect` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1475,11 +2466,26 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.612032 | `speech_stt_recognize` | ✅ **EXPECTED** | | 2 | 0.309860 | `foundry_openai_embeddings-create` | ❌ | | 3 | 0.244223 | `foundry_resource_get` | ❌ | | 4 | 0.243658 | `foundry_openai_create-completion` | ❌ | | 5 | 0.242816 | `foundry_openai_chat-completions-create` | ❌ | +======= +| 1 | 0.611992 | `speech_stt_recognize` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.309895 | `foundry_openai_embeddings-create` | ❌ | +| 3 | 0.244218 | `foundry_resource_get` | ❌ | +| 4 | 0.243626 | `foundry_openai_create-completion` | ❌ | +| 5 | 0.242771 | `foundry_openai_chat-completions-create` | ❌ | +======= +| 2 | 0.584104 | `speech_tts_synthesize` | ❌ | +| 3 | 0.322301 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.263196 | `foundry_openai_create-completion` | ❌ | +| 5 | 0.251200 | `foundry_openai_chat-completions-create` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1493,10 +2499,21 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.410533 | `speech_stt_recognize` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.152414 | `foundry_openai_embeddings-create` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.152391 | `foundry_openai_embeddings-create` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.152137 | `foundry_models_deploy` | ❌ | | 4 | 0.151799 | `deploy_pipeline_guidance_get` | ❌ | | 5 | 0.140373 | `deploy_plan_get` | ❌ | +======= +| 2 | 0.373433 | `speech_tts_synthesize` | ❌ | +| 3 | 0.159775 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.158032 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.152137 | `foundry_models_deploy` | ❌ | 
+>>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- @@ -1510,10 +2527,17 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.546259 | `speech_stt_recognize` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.218092 | `foundry_resource_get` | ❌ | | 3 | 0.202860 | `foundry_openai_embeddings-create` | ❌ | | 4 | 0.183420 | `extension_azqr` | ❌ | | 5 | 0.181020 | `search_index_get` | ❌ | +======= +| 2 | 0.499808 | `speech_tts_synthesize` | ❌ | +| 3 | 0.225372 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.218092 | `foundry_resource_get` | ❌ | +| 5 | 0.200865 | `foundry_openai_chat-completions-create` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- @@ -1526,11 +2550,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.539963 | `speech_stt_recognize` | ✅ **EXPECTED** | | 2 | 0.228587 | `foundry_openai_create-completion` | ❌ | | 3 | 0.203413 | `foundry_agents_connect` | ❌ | | 4 | 0.199517 | `foundry_openai_embeddings-create` | ❌ | | 5 | 0.197301 | `foundry_openai_chat-completions-create` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.540249 | `speech_stt_recognize` | ✅ **EXPECTED** | +| 2 | 0.227953 | `foundry_openai_create-completion` | ❌ | +| 3 | 0.203215 | `foundry_agents_connect` | ❌ | +| 4 | 0.199441 | `foundry_openai_embeddings-create` | ❌ | +| 5 | 0.197199 | `foundry_openai_chat-completions-create` | ❌ | +======= +| 1 | 0.539963 | `speech_stt_recognize` | ✅ **EXPECTED** | +| 2 | 0.382022 | `speech_tts_synthesize` | ❌ | +| 3 | 0.246979 | `foundry_openai_create-completion` | ❌ | +| 4 | 0.238192 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.203413 | `foundry_agents_connect` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1543,11 +2583,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.549151 | `speech_stt_recognize` | ✅ 
**EXPECTED** | | 2 | 0.393626 | `azureaibestpractices_get` | ❌ | | 3 | 0.342537 | `extension_cli_generate` | ❌ | | 4 | 0.337387 | `cloudarchitect_design` | ❌ | | 5 | 0.335741 | `foundry_openai_create-completion` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.548967 | `speech_stt_recognize` | ✅ **EXPECTED** | +| 2 | 0.342494 | `extension_cli_generate` | ❌ | +| 3 | 0.337434 | `cloudarchitect_design` | ❌ | +| 4 | 0.335792 | `foundry_openai_create-completion` | ❌ | +| 5 | 0.333130 | `get_bestpractices_get` | ❌ | +======= +| 1 | 0.549151 | `speech_stt_recognize` | ✅ **EXPECTED** | +| 2 | 0.460662 | `speech_tts_synthesize` | ❌ | +| 3 | 0.357816 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.345661 | `foundry_openai_create-completion` | ❌ | +| 5 | 0.342537 | `extension_cli_generate` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1561,10 +2617,23 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.532536 | `speech_stt_recognize` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.349892 | `foundry_openai_create-completion` | ❌ | +<<<<<<< HEAD | 3 | 0.348381 | `azureaibestpractices_get` | ❌ | | 4 | 0.340893 | `foundry_openai_chat-completions-create` | ❌ | | 5 | 0.332862 | `foundry_openai_embeddings-create` | ❌ | +======= +| 3 | 0.340893 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.332669 | `foundry_openai_embeddings-create` | ❌ | +| 5 | 0.326712 | `get_bestpractices_get` | ❌ | +======= +| 2 | 0.506045 | `speech_tts_synthesize` | ❌ | +| 3 | 0.385033 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.381487 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.378382 | `foundry_openai_create-completion` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -1578,15 +2647,199 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 
0.453396 | `speech_stt_recognize` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.173280 | `deploy_pipeline_guidance_get` | ❌ | | 3 | 0.164929 | `foundry_openai_embeddings-create` | ❌ | | 4 | 0.160483 | `foundry_agents_connect` | ❌ | | 5 | 0.160185 | `extension_azqr` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.173205 | `deploy_pipeline_guidance_get` | ❌ | +| 3 | 0.164990 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.160523 | `extension_azqr` | ❌ | +| 5 | 0.160483 | `foundry_agents_connect` | ❌ | +======= +| 2 | 0.342007 | `speech_tts_synthesize` | ❌ | +| 3 | 0.181994 | `foundry_openai_create-completion` | ❌ | +| 4 | 0.174375 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.173205 | `deploy_pipeline_guidance_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) + +--- + +## Test 66 + +**Expected Tool:** `speech_tts_synthesize` +**Prompt:** Convert text to speech and save to output.wav + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.547977 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 2 | 0.422457 | `speech_stt_recognize` | ❌ | +| 3 | 0.231058 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.200920 | `foundry_openai_create-completion` | ❌ | +| 5 | 0.192179 | `foundry_openai_chat-completions-create` | ❌ | + +--- + +## Test 62 + +**Expected Tool:** `speech_tts_synthesize` +**Prompt:** Synthesize speech from "Hello, welcome to Azure" and save to welcome.wav + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.531396 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 2 | 0.486019 | `speech_stt_recognize` | ❌ | +| 3 | 0.329765 | `deploy_pipeline_guidance_get` | ❌ | +| 4 | 0.323728 | `extension_cli_generate` | ❌ | +| 5 | 0.320006 | `foundry_openai_chat-completions-create` | ❌ | + +--- + +## Test 63 + +**Expected Tool:** `speech_tts_synthesize` +**Prompt:** Generate speech audio from text "Hello 
world" using Azure Speech Services + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.590514 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 2 | 0.534002 | `speech_stt_recognize` | ❌ | +| 3 | 0.362626 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.341003 | `foundry_openai_create-completion` | ❌ | +| 5 | 0.333557 | `foundry_openai_chat-completions-create` | ❌ | + +--- + +## Test 64 + +**Expected Tool:** `speech_tts_synthesize` +**Prompt:** Convert text to speech with Spanish language and save to spanish-audio.wav + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.520866 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 2 | 0.452648 | `speech_stt_recognize` | ❌ | +| 3 | 0.231393 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.204970 | `foundry_openai_create-completion` | ❌ | +| 5 | 0.202502 | `foundry_openai_chat-completions-create` | ❌ | + +--- + +## Test 65 + +**Expected Tool:** `speech_tts_synthesize` +**Prompt:** Synthesize speech with voice en-US-JennyNeural from text "Azure AI Services" + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.604553 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 2 | 0.496715 | `speech_stt_recognize` | ❌ | +| 3 | 0.423461 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.390312 | `foundry_agents_list` | ❌ | +| 5 | 0.381735 | `foundry_openai_chat-completions-create` | ❌ | --- ## Test 66 +**Expected Tool:** `speech_tts_synthesize` +**Prompt:** Create MP3 audio file from text "Welcome to Azure" with high quality format + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.564876 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 2 | 0.510908 | `speech_stt_recognize` | ❌ | +| 3 | 0.360542 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.347597 | `deploy_pipeline_guidance_get` | ❌ | +| 5 | 0.345073 | `deploy_iac_rules_get` | ❌ | + +--- + +## Test 67 + +**Expected 
Tool:** `speech_tts_synthesize` +**Prompt:** Generate speech with custom voice model using endpoint ID + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.547864 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 2 | 0.455734 | `speech_stt_recognize` | ❌ | +| 3 | 0.367601 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.358913 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.356105 | `foundry_models_deployments_list` | ❌ | + +--- + +## Test 68 + +**Expected Tool:** `speech_tts_synthesize` +**Prompt:** Convert text to OGG/Opus format audio file + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.446150 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 2 | 0.410086 | `speech_stt_recognize` | ❌ | +| 3 | 0.263503 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.199147 | `foundry_openai_create-completion` | ❌ | +| 5 | 0.196153 | `extension_cli_generate` | ❌ | + +--- + +## Test 69 + +**Expected Tool:** `speech_tts_synthesize` +**Prompt:** Synthesize long text content to audio file with streaming + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.449165 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 2 | 0.369045 | `speech_stt_recognize` | ❌ | +| 3 | 0.225665 | `foundry_openai_create-completion` | ❌ | +| 4 | 0.225088 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.218260 | `foundry_openai_embeddings-create` | ❌ | + +--- + +## Test 70 + +**Expected Tool:** `speech_tts_synthesize` +**Prompt:** Create audio file from text in French language with appropriate voice + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.467698 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 2 | 0.385267 | `speech_stt_recognize` | ❌ | +| 3 | 0.235591 | `foundry_openai_create-completion` | ❌ | +| 4 | 0.215304 | `foundry_openai_embeddings-create` | ❌ | +| 5 | 0.208978 | `foundry_openai_chat-completions-create` | ❌ 
| + +--- + +## Test 71 + **Expected Tool:** `appconfig_account_list` **Prompt:** List all App Configuration stores in my subscription @@ -1596,13 +2849,27 @@ |------|-------|------|--------| | 1 | 0.786298 | `appconfig_account_list` | ✅ **EXPECTED** | | 2 | 0.530613 | `appconfig_kv_get` | ❌ | +<<<<<<< HEAD | 3 | 0.491380 | `postgres_server_list` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.491358 | `postgres_server_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.481223 | `kusto_cluster_list` | ❌ | | 5 | 0.479997 | `subscription_list` | ❌ | +======= +| 3 | 0.491380 | `postgres_server_list` | ❌ | +| 4 | 0.481174 | `kusto_cluster_list` | ❌ | +| 5 | 0.479988 | `subscription_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- +<<<<<<< HEAD ## Test 67 +======= +## Test 72 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appconfig_account_list` **Prompt:** Show me the App Configuration stores in my subscription @@ -1619,7 +2886,11 @@ --- +<<<<<<< HEAD ## Test 68 +======= +## Test 73 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appconfig_account_list` **Prompt:** Show me my App Configuration stores @@ -1636,7 +2907,11 @@ --- +<<<<<<< HEAD ## Test 69 +======= +## Test 74 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appconfig_kv_delete` **Prompt:** Delete the key in App Configuration store @@ -1645,6 +2920,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.618276 | `appconfig_kv_delete` | ✅ **EXPECTED** | | 2 | 0.464358 | `appconfig_kv_get` | ❌ | | 3 | 0.424344 | `appconfig_kv_set` | ❌ | @@ -1654,6 +2930,25 @@ --- ## Test 70 +======= +<<<<<<< HEAD +| 1 | 0.618277 | `appconfig_kv_delete` | ✅ **EXPECTED** | +| 2 | 0.464358 | `appconfig_kv_get` | ❌ | +| 3 | 0.424344 | `appconfig_kv_set` | ❌ | +| 4 | 0.422700 | `appconfig_kv_lock_set` | ❌ | +| 5 | 0.392016 | 
`appconfig_account_list` | ❌ | +======= +| 1 | 0.618267 | `appconfig_kv_delete` | ✅ **EXPECTED** | +| 2 | 0.464368 | `appconfig_kv_get` | ❌ | +| 3 | 0.424296 | `appconfig_kv_set` | ❌ | +| 4 | 0.422722 | `appconfig_kv_lock_set` | ❌ | +| 5 | 0.392081 | `appconfig_account_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) + +--- + +## Test 75 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appconfig_kv_get` **Prompt:** List all key-value settings in App Configuration store @@ -1670,7 +2965,11 @@ --- +<<<<<<< HEAD ## Test 71 +======= +## Test 76 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appconfig_kv_get` **Prompt:** Show me the key-value settings in App Configuration store @@ -1687,7 +2986,11 @@ --- +<<<<<<< HEAD ## Test 72 +======= +## Test 77 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appconfig_kv_get` **Prompt:** List all key-value settings with key name starting with 'prod-' in App Configuration store @@ -1696,6 +2999,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.512883 | `appconfig_kv_get` | ✅ **EXPECTED** | | 2 | 0.450109 | `appconfig_account_list` | ❌ | | 3 | 0.398684 | `appconfig_kv_set` | ❌ | @@ -1705,6 +3009,25 @@ --- ## Test 73 +======= +<<<<<<< HEAD +| 1 | 0.512880 | `appconfig_kv_get` | ✅ **EXPECTED** | +| 2 | 0.449934 | `appconfig_account_list` | ❌ | +| 3 | 0.398698 | `appconfig_kv_set` | ❌ | +| 4 | 0.380636 | `appconfig_kv_delete` | ❌ | +| 5 | 0.346156 | `appconfig_kv_lock_set` | ❌ | +======= +| 1 | 0.512804 | `appconfig_kv_get` | ✅ **EXPECTED** | +| 2 | 0.449871 | `appconfig_account_list` | ❌ | +| 3 | 0.398608 | `appconfig_kv_set` | ❌ | +| 4 | 0.380599 | `appconfig_kv_delete` | ❌ | +| 5 | 0.346117 | `appconfig_kv_lock_set` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) + +--- + +## Test 78 +>>>>>>> 58ab8585 (update prompts and tool 
description evaluator) **Expected Tool:** `appconfig_kv_get` **Prompt:** Show the content for the key in App Configuration store @@ -1721,7 +3044,11 @@ --- +<<<<<<< HEAD ## Test 74 +======= +## Test 79 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appconfig_kv_lock_set` **Prompt:** Lock the key in App Configuration store @@ -1738,7 +3065,11 @@ --- +<<<<<<< HEAD ## Test 75 +======= +## Test 80 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appconfig_kv_lock_set` **Prompt:** Unlock the key in App Configuration store @@ -1747,15 +3078,31 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.555699 | `appconfig_kv_lock_set` | ✅ **EXPECTED** | | 2 | 0.505681 | `appconfig_kv_get` | ❌ | | 3 | 0.476497 | `appconfig_kv_delete` | ❌ | | 4 | 0.425488 | `appconfig_kv_set` | ❌ | +<<<<<<< HEAD | 5 | 0.409649 | `appconfig_account_list` | ❌ | --- ## Test 76 +======= +| 5 | 0.409406 | `appconfig_account_list` | ❌ | +======= +| 1 | 0.555732 | `appconfig_kv_lock_set` | ✅ **EXPECTED** | +| 2 | 0.505675 | `appconfig_kv_get` | ❌ | +| 3 | 0.476507 | `appconfig_kv_delete` | ❌ | +| 4 | 0.425479 | `appconfig_kv_set` | ❌ | +| 5 | 0.409370 | `appconfig_account_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) + +--- + +## Test 81 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appconfig_kv_set` **Prompt:** Set the key in App Configuration store to @@ -1772,7 +3119,11 @@ --- +<<<<<<< HEAD ## Test 77 +======= +## Test 82 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `applens_resource_diagnose` **Prompt:** Please help me diagnose issues with my app using app lens @@ -1789,7 +3140,11 @@ --- +<<<<<<< HEAD ## Test 78 +======= +## Test 83 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `applens_resource_diagnose` **Prompt:** Use app lens to check 
why my app is slow? @@ -1802,11 +3157,23 @@ | 2 | 0.316002 | `deploy_app_logs_get` | ❌ | | 3 | 0.255570 | `deploy_architecture_diagram_generate` | ❌ | | 4 | 0.249583 | `monitor_resource_log_query` | ❌ | +<<<<<<< HEAD | 5 | 0.226030 | `quota_usage_check` | ❌ | --- ## Test 79 +======= +<<<<<<< HEAD +| 5 | 0.226092 | `quota_usage_check` | ❌ | +======= +| 5 | 0.225972 | `quota_usage_check` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) + +--- + +## Test 84 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `applens_resource_diagnose` **Prompt:** What does app lens say is wrong with my service? @@ -1823,7 +3190,11 @@ --- +<<<<<<< HEAD ## Test 80 +======= +## Test 85 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appservice_database_add` **Prompt:** Add database connection to my app service for database in resource group @@ -1832,6 +3203,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.717878 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.401376 | `sql_db_rename` | ❌ | | 3 | 0.399941 | `sql_db_create` | ❌ | @@ -1841,6 +3213,25 @@ --- ## Test 81 +======= +<<<<<<< HEAD +| 1 | 0.717887 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.401337 | `sql_db_rename` | ❌ | +| 3 | 0.399997 | `sql_db_create` | ❌ | +| 4 | 0.362889 | `sql_db_show` | ❌ | +| 5 | 0.357708 | `sql_db_list` | ❌ | +======= +| 1 | 0.682502 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.401311 | `sql_db_rename` | ❌ | +| 3 | 0.400175 | `sql_db_create` | ❌ | +| 4 | 0.363123 | `sql_db_show` | ❌ | +| 5 | 0.357874 | `sql_db_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) + +--- + +## Test 86 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appservice_database_add` **Prompt:** Configure SQL Server database for app service with connection string in resource group @@ -1849,6 +3240,7 @@ | 
Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.688410 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.498122 | `sql_db_rename` | ❌ | | 3 | 0.497502 | `sql_db_create` | ❌ | @@ -1858,6 +3250,25 @@ --- ## Test 82 +======= +<<<<<<< HEAD +| 1 | 0.688364 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.498175 | `sql_db_rename` | ❌ | +| 3 | 0.497711 | `sql_db_create` | ❌ | +| 4 | 0.469526 | `sql_db_show` | ❌ | +| 5 | 0.453040 | `sql_db_list` | ❌ | +======= +| 1 | 0.654513 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.498175 | `sql_db_rename` | ❌ | +| 3 | 0.497522 | `sql_db_create` | ❌ | +| 4 | 0.469526 | `sql_db_show` | ❌ | +| 5 | 0.453088 | `sql_db_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) + +--- + +## Test 87 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appservice_database_add` **Prompt:** Add MySQL database to app service using connection in resource group @@ -1866,6 +3277,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.675970 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.464756 | `sql_db_create` | ❌ | | 3 | 0.452407 | `sql_db_rename` | ❌ | @@ -1875,6 +3287,25 @@ --- ## Test 83 +======= +<<<<<<< HEAD +| 1 | 0.675548 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.465376 | `sql_db_create` | ❌ | +| 3 | 0.452528 | `sql_db_rename` | ❌ | +| 4 | 0.433256 | `mysql_server_list` | ❌ | +| 5 | 0.410221 | `sql_db_show` | ❌ | +======= +| 1 | 0.655045 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.465281 | `sql_db_create` | ❌ | +| 3 | 0.452630 | `sql_db_rename` | ❌ | +| 4 | 0.433191 | `mysql_server_list` | ❌ | +| 5 | 0.410316 | `sql_db_show` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) + +--- + +## Test 88 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appservice_database_add` **Prompt:** Add PostgreSQL 
database to app service using connection in resource group @@ -1883,6 +3314,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.628119 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.444212 | `sql_db_create` | ❌ | | 3 | 0.405314 | `postgres_database_query` | ❌ | @@ -1892,6 +3324,25 @@ --- ## Test 84 +======= +<<<<<<< HEAD +| 1 | 0.627847 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.444822 | `sql_db_create` | ❌ | +| 3 | 0.404711 | `postgres_database_query` | ❌ | +| 4 | 0.401105 | `postgres_database_list` | ❌ | +| 5 | 0.400866 | `sql_db_rename` | ❌ | +======= +| 1 | 0.599525 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.444152 | `sql_db_create` | ❌ | +| 3 | 0.404912 | `postgres_database_query` | ❌ | +| 4 | 0.401137 | `postgres_database_list` | ❌ | +| 5 | 0.400754 | `sql_db_rename` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) + +--- + +## Test 89 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appservice_database_add` **Prompt:** Connect CosmosDB database using connection string to app service in resource group @@ -1900,6 +3351,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.663086 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.446465 | `cosmos_database_list` | ❌ | | 3 | 0.441966 | `cosmos_database_container_item_query` | ❌ | @@ -1909,6 +3361,25 @@ --- ## Test 85 +======= +<<<<<<< HEAD +| 1 | 0.663498 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.446339 | `cosmos_database_list` | ❌ | +| 3 | 0.441990 | `cosmos_database_container_item_query` | ❌ | +| 4 | 0.427167 | `cosmos_database_container_list` | ❌ | +| 5 | 0.420405 | `sql_db_rename` | ❌ | +======= +| 1 | 0.608259 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.445781 | `cosmos_database_list` | ❌ | +| 3 | 0.441836 | `cosmos_database_container_item_query` | ❌ | +| 4 | 0.426789 | `cosmos_database_container_list` | ❌ | 
+| 5 | 0.420630 | `sql_db_rename` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) + +--- + +## Test 90 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appservice_database_add` **Prompt:** Add database connection for database on server to app service in resource group @@ -1917,6 +3388,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.733852 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.454554 | `sql_db_create` | ❌ | | 3 | 0.415271 | `sql_db_rename` | ❌ | @@ -1926,6 +3398,25 @@ --- ## Test 86 +======= +<<<<<<< HEAD +| 1 | 0.733775 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.454433 | `sql_db_create` | ❌ | +| 3 | 0.415274 | `sql_db_rename` | ❌ | +| 4 | 0.414045 | `sql_server_create` | ❌ | +| 5 | 0.410100 | `sql_db_list` | ❌ | +======= +| 1 | 0.702259 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.454592 | `sql_db_create` | ❌ | +| 3 | 0.415290 | `sql_db_rename` | ❌ | +| 4 | 0.414069 | `sql_server_create` | ❌ | +| 5 | 0.410258 | `sql_db_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) + +--- + +## Test 91 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appservice_database_add` **Prompt:** Add database connection string for to app service using connection string in resource group @@ -1934,6 +3425,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.746766 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.441682 | `sql_db_rename` | ❌ | | 3 | 0.434020 | `sql_db_create` | ❌ | @@ -1943,6 +3435,25 @@ --- ## Test 87 +======= +<<<<<<< HEAD +| 1 | 0.746379 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.441584 | `sql_db_rename` | ❌ | +| 3 | 0.434079 | `sql_db_create` | ❌ | +| 4 | 0.391000 | `sql_db_list` | ❌ | +| 5 | 0.389995 | `sql_db_show` | ❌ | +======= +| 1 | 0.686506 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 
0.441542 | `sql_db_rename` | ❌ | +| 3 | 0.433865 | `sql_db_create` | ❌ | +| 4 | 0.391188 | `sql_db_list` | ❌ | +| 5 | 0.390129 | `sql_db_show` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) + +--- + +## Test 92 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appservice_database_add` **Prompt:** Connect database to my app service using connection string in resource group @@ -1951,6 +3462,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.680503 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.429273 | `sql_db_rename` | ❌ | | 3 | 0.406267 | `sql_db_create` | ❌ | @@ -1960,6 +3472,25 @@ --- ## Test 88 +======= +<<<<<<< HEAD +| 1 | 0.680525 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.429291 | `sql_db_rename` | ❌ | +| 3 | 0.406599 | `sql_db_create` | ❌ | +| 4 | 0.396524 | `sql_db_show` | ❌ | +| 5 | 0.391416 | `sql_db_list` | ❌ | +======= +| 1 | 0.643888 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.429317 | `sql_db_rename` | ❌ | +| 3 | 0.406322 | `sql_db_create` | ❌ | +| 4 | 0.396523 | `sql_db_show` | ❌ | +| 5 | 0.391430 | `sql_db_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) + +--- + +## Test 93 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appservice_database_add` **Prompt:** Set up database for app service with connection string under resource group @@ -1968,6 +3499,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.640738 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.456785 | `sql_db_create` | ❌ | | 3 | 0.402668 | `sql_db_rename` | ❌ | @@ -1977,6 +3509,42 @@ --- ## Test 89 +======= +<<<<<<< HEAD +| 1 | 0.640622 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.456508 | `sql_db_create` | ❌ | +| 3 | 0.402651 | `sql_db_rename` | ❌ | +| 4 | 0.402081 | `sql_db_show` | ❌ | +| 5 | 0.394177 | `sql_db_list` | ❌ | 
+======= +| 1 | 0.598494 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.456884 | `sql_db_create` | ❌ | +| 3 | 0.402743 | `sql_db_rename` | ❌ | +| 4 | 0.402138 | `sql_db_show` | ❌ | +| 5 | 0.394211 | `sql_db_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) + +--- + +## Test 94 + +**Expected Tool:** `appservice_database_add` +**Prompt:** Configure database for app service with the connection string in resource group + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.650888 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.449175 | `sql_db_rename` | ❌ | +| 3 | 0.448382 | `sql_db_create` | ❌ | +| 4 | 0.414323 | `sql_db_show` | ❌ | +| 5 | 0.411790 | `sql_db_list` | ❌ | + +--- + +## Test 95 +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `appservice_database_add` **Prompt:** Configure database for app service with the connection string in resource group @@ -2003,6 +3571,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.572473 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.454559 | `azureaibestpractices_get` | ❌ | | 3 | 0.445157 | `get_bestpractices_get` | ❌ | | 4 | 0.390478 | `azureterraformbestpractices_get` | ❌ | @@ -2011,6 +3580,25 @@ --- ## Test 91 +======= +<<<<<<< HEAD +| 2 | 0.445157 | `get_bestpractices_get` | ❌ | +| 3 | 0.390549 | `azureterraformbestpractices_get` | ❌ | +======= +| 2 | 0.449459 | `get_bestpractices_get` | ❌ | +| 3 | 0.390478 | `azureterraformbestpractices_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.383948 | `applens_resource_diagnose` | ❌ | +| 5 | 0.375286 | `deploy_iac_rules_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 86 +======= +## Test 96 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** 
`applicationinsights_recommendation_list` **Prompt:** Show me code optimization recommendations for all Application Insights resources in my subscription @@ -2019,6 +3607,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.696531 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | | 2 | 0.506351 | `azureaibestpractices_get` | ❌ | | 3 | 0.468384 | `get_bestpractices_get` | ❌ | @@ -2027,7 +3616,22 @@ --- +<<<<<<< HEAD ## Test 92 +======= +## Test 87 +======= +| 1 | 0.696565 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | +| 2 | 0.470670 | `get_bestpractices_get` | ❌ | +| 3 | 0.452233 | `applens_resource_diagnose` | ❌ | +| 4 | 0.435290 | `azureterraformbestpractices_get` | ❌ | +| 5 | 0.424629 | `search_service_list` | ❌ | + +--- + +## Test 97 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `applicationinsights_recommendation_list` **Prompt:** List profiler recommendations for Application Insights in resource group @@ -2038,13 +3642,29 @@ |------|-------|------|--------| | 1 | 0.626722 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | | 2 | 0.488002 | `loadtesting_testresource_list` | ❌ | +<<<<<<< HEAD +| 3 | 0.479392 | `mysql_server_list` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.479416 | `mysql_server_list` | ❌ | +======= | 3 | 0.479392 | `mysql_server_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.477396 | `applens_resource_diagnose` | ❌ | | 5 | 0.468847 | `resourcehealth_availability-status_list` | ❌ | --- +<<<<<<< HEAD ## Test 93 +======= +<<<<<<< HEAD +## Test 88 +======= +## Test 98 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `applicationinsights_recommendation_list` **Prompt:** 
Show me performance improvement recommendations from Application Insights @@ -2053,6 +3673,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.509615 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | | 2 | 0.433835 | `azureaibestpractices_get` | ❌ | | 3 | 0.419699 | `applens_resource_diagnose` | ❌ | @@ -2062,6 +3683,26 @@ --- ## Test 94 +======= +| 1 | 0.509502 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | +| 2 | 0.419670 | `applens_resource_diagnose` | ❌ | +<<<<<<< HEAD +| 3 | 0.383767 | `get_bestpractices_get` | ❌ | +| 4 | 0.367260 | `deploy_architecture_diagram_generate` | ❌ | +======= +| 3 | 0.385936 | `get_bestpractices_get` | ❌ | +| 4 | 0.367278 | `deploy_architecture_diagram_generate` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.343931 | `cloudarchitect_design` | ❌ | + +--- + +<<<<<<< HEAD +## Test 89 +======= +## Test 99 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `extension_cli_generate` **Prompt:** Create a Storage account with name using Azure CLI @@ -2072,6 +3713,7 @@ |------|-------|------|--------| | 1 | 0.593241 | `storage_account_create` | ❌ | | 2 | 0.564940 | `storage_blob_container_create` | ❌ | +<<<<<<< HEAD | 3 | 0.493684 | `storage_account_get` | ❌ | | 4 | 0.473547 | `storage_blob_container_get` | ❌ | | 5 | 0.456428 | `managedlustre_fs_create` | ❌ | @@ -2079,6 +3721,23 @@ --- ## Test 95 +======= +<<<<<<< HEAD +| 3 | 0.493609 | `storage_account_get` | ❌ | +======= +| 3 | 0.493641 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.474399 | `storage_blob_container_get` | ❌ | +| 5 | 0.454194 | `managedlustre_fs_create` | ❌ | + +--- + +<<<<<<< HEAD +## Test 90 +======= +## Test 100 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and 
tool description evaluator) **Expected Tool:** `extension_cli_generate` **Prompt:** List all virtual machines in my subscription using Azure CLI @@ -2087,7 +3746,12 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.592102 | `search_service_list` | ❌ | +======= +| 1 | 0.593467 | `search_service_list` | ❌ | +<<<<<<< HEAD +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.575274 | `kusto_cluster_list` | ❌ | | 3 | 0.549918 | `virtualdesktop_hostpool_list` | ❌ | | 4 | 0.544688 | `monitor_workspace_list` | ❌ | @@ -2095,24 +3759,55 @@ --- +<<<<<<< HEAD ## Test 96 +======= +## Test 91 +======= +| 2 | 0.575351 | `kusto_cluster_list` | ❌ | +| 3 | 0.549966 | `virtualdesktop_hostpool_list` | ❌ | +| 4 | 0.544412 | `monitor_workspace_list` | ❌ | +| 5 | 0.536252 | `subscription_list` | ❌ | -**Expected Tool:** `extension_cli_generate` +--- + +## Test 101 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) + +**Expected Tool:** `extension_cli_generate` **Prompt:** Show me the details of the storage account with Azure CLI commands ### Results | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.710307 | `storage_account_get` | ❌ | | 2 | 0.601571 | `storage_blob_container_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.710155 | `storage_account_get` | ❌ | +======= +| 1 | 0.710305 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.602173 | `storage_blob_container_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.543268 | `storage_blob_get` | ❌ | | 4 | 0.519788 | `storage_account_create` | ❌ | -| 5 | 0.493145 | `cosmos_account_list` | ❌ | +| 5 | 0.493100 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD ## Test 97 +======= +<<<<<<< HEAD +## Test 92 +======= +## Test 102 +>>>>>>> 84ad4f44 (update prompts and tool description 
evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `extension_cli_install` **Prompt:** @@ -2121,6 +3816,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.479652 | `extension_cli_install` | ✅ **EXPECTED** | | 2 | 0.473369 | `extension_cli_generate` | ❌ | | 3 | 0.389405 | `azureterraformbestpractices_get` | ❌ | @@ -2130,6 +3826,29 @@ --- ## Test 98 +======= +<<<<<<< HEAD +| 1 | 0.479590 | `extension_cli_install` | ✅ **EXPECTED** | +| 2 | 0.473266 | `extension_cli_generate` | ❌ | +| 3 | 0.389369 | `azureterraformbestpractices_get` | ❌ | +| 4 | 0.382389 | `deploy_plan_get` | ❌ | +| 5 | 0.366012 | `get_bestpractices_get` | ❌ | + +--- + +## Test 93 +======= +| 1 | 0.497777 | `extension_cli_generate` | ❌ | +| 2 | 0.497497 | `extension_cli_install` | ✅ **EXPECTED** | +| 3 | 0.401453 | `azureterraformbestpractices_get` | ❌ | +| 4 | 0.383619 | `deploy_plan_get` | ❌ | +| 5 | 0.382552 | `deploy_pipeline_guidance_get` | ❌ | + +--- + +## Test 103 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `extension_cli_install` **Prompt:** How to install azd @@ -2146,7 +3865,15 @@ --- +<<<<<<< HEAD ## Test 99 +======= +<<<<<<< HEAD +## Test 94 +======= +## Test 104 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `extension_cli_install` **Prompt:** What is Azure Functions Core tools and how to install it @@ -2155,6 +3882,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.622670 | `extension_cli_install` | ✅ **EXPECTED** | | 2 | 0.439414 | `get_bestpractices_get` | ❌ | | 3 | 0.432859 | `deploy_pipeline_guidance_get` | ❌ | @@ -2164,6 +3892,21 @@ --- ## Test 100 +======= +| 1 | 0.622705 | `extension_cli_install` | ✅ **EXPECTED** | +| 2 | 0.443050 | 
`get_bestpractices_get` | ❌ | +| 3 | 0.432913 | `deploy_pipeline_guidance_get` | ❌ | +| 4 | 0.430483 | `extension_cli_generate` | ❌ | +| 5 | 0.418161 | `deploy_plan_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 95 +======= +## Test 105 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `acr_registry_list` **Prompt:** List all Azure Container Registries in my subscription @@ -2174,13 +3917,30 @@ |------|-------|------|--------| | 1 | 0.743568 | `acr_registry_list` | ✅ **EXPECTED** | | 2 | 0.711580 | `acr_registry_repository_list` | ❌ | +<<<<<<< HEAD | 3 | 0.585675 | `kusto_cluster_list` | ❌ | +<<<<<<< HEAD | 4 | 0.540241 | `search_service_list` | ❌ | | 5 | 0.514293 | `cosmos_account_list` | ❌ | --- ## Test 101 +======= +======= +| 3 | 0.585618 | `kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.541506 | `search_service_list` | ❌ | +| 5 | 0.514326 | `cosmos_account_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 96 +======= +## Test 106 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `acr_registry_list` **Prompt:** Show me my Azure Container Registries @@ -2197,7 +3957,15 @@ --- +<<<<<<< HEAD ## Test 102 +======= +<<<<<<< HEAD +## Test 97 +======= +## Test 107 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `acr_registry_list` **Prompt:** Show me the container registries in my subscription @@ -2208,13 +3976,29 @@ |------|-------|------|--------| | 1 | 0.637130 | `acr_registry_list` | ✅ **EXPECTED** | | 2 | 0.563476 | `acr_registry_repository_list` | ❌ | +<<<<<<< HEAD | 3 | 0.516769 | `kusto_cluster_list` | ❌ | +<<<<<<< HEAD | 4 | 0.515365 | `storage_blob_container_get` | ❌ | +======= +======= +| 3 | 0.516826 | 
`kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.515378 | `storage_blob_container_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.480352 | `redis_list` | ❌ | --- +<<<<<<< HEAD ## Test 103 +======= +<<<<<<< HEAD +## Test 98 +======= +## Test 108 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `acr_registry_list` **Prompt:** List container registries in resource group @@ -2225,13 +4009,29 @@ |------|-------|------|--------| | 1 | 0.654318 | `acr_registry_repository_list` | ❌ | | 2 | 0.633938 | `acr_registry_list` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 3 | 0.476015 | `mysql_server_list` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.476043 | `mysql_server_list` | ❌ | +======= | 3 | 0.476015 | `mysql_server_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.454929 | `group_list` | ❌ | | 5 | 0.454003 | `datadog_monitoredresources_list` | ❌ | --- +<<<<<<< HEAD ## Test 104 +======= +<<<<<<< HEAD +## Test 99 +======= +## Test 109 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `acr_registry_list` **Prompt:** Show me the container registries in resource group @@ -2242,13 +4042,29 @@ |------|-------|------|--------| | 1 | 0.639391 | `acr_registry_list` | ✅ **EXPECTED** | | 2 | 0.637972 | `acr_registry_repository_list` | ❌ | +<<<<<<< HEAD | 3 | 0.468028 | `mysql_server_list` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.468078 | `mysql_server_list` | ❌ | +======= +| 3 | 0.468028 | `mysql_server_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.449649 | `datadog_monitoredresources_list` | ❌ | | 5 | 0.445741 
| `group_list` | ❌ | --- +<<<<<<< HEAD ## Test 105 +======= +<<<<<<< HEAD +## Test 100 +======= +## Test 110 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `acr_registry_repository_list` **Prompt:** List all container registry repositories in my subscription @@ -2259,13 +4075,30 @@ |------|-------|------|--------| | 1 | 0.626482 | `acr_registry_repository_list` | ✅ **EXPECTED** | | 2 | 0.617504 | `acr_registry_list` | ❌ | +<<<<<<< HEAD | 3 | 0.544172 | `kusto_cluster_list` | ❌ | +<<<<<<< HEAD | 4 | 0.508863 | `storage_blob_container_get` | ❌ | | 5 | 0.495567 | `postgres_server_list` | ❌ | --- ## Test 106 +======= +======= +| 3 | 0.544238 | `kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.508483 | `storage_blob_container_get` | ❌ | +| 5 | 0.495526 | `postgres_server_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 101 +======= +## Test 111 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `acr_registry_repository_list` **Prompt:** Show me my container registry repositories @@ -2282,7 +4115,15 @@ --- +<<<<<<< HEAD ## Test 107 +======= +<<<<<<< HEAD +## Test 102 +======= +## Test 112 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `acr_registry_repository_list` **Prompt:** List repositories in the container registry @@ -2295,11 +4136,27 @@ | 2 | 0.541779 | `acr_registry_list` | ❌ | | 3 | 0.437756 | `storage_blob_container_get` | ❌ | | 4 | 0.433927 | `cosmos_database_container_list` | ❌ | +<<<<<<< HEAD | 5 | 0.383001 | `kusto_database_list` | ❌ | --- ## Test 108 +======= +<<<<<<< HEAD +| 5 | 0.383201 | `kusto_database_list` | ❌ | + +--- + +## Test 103 +======= +| 5 | 0.383621 | `kusto_database_list` | ❌ | + +--- + +## 
Test 113 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `acr_registry_repository_list` **Prompt:** Show me the repositories in the container registry @@ -2316,7 +4173,15 @@ --- +<<<<<<< HEAD ## Test 109 +======= +<<<<<<< HEAD +## Test 104 +======= +## Test 114 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `communication_email_send` **Prompt:** Send an email to with subject @@ -2325,15 +4190,35 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.498396 | `communication_email_send` | ✅ **EXPECTED** | | 2 | 0.229071 | `communication_sms_send` | ❌ | +======= +| 1 | 0.498292 | `communication_email_send` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.226847 | `communication_sms_send` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.188975 | `eventgrid_events_publish` | ❌ | | 4 | 0.161257 | `foundry_agents_create` | ❌ | | 5 | 0.146045 | `servicebus_topic_details` | ❌ | --- +<<<<<<< HEAD ## Test 110 +======= +## Test 105 +======= +| 2 | 0.229081 | `communication_sms_send` | ❌ | +| 3 | 0.189000 | `eventgrid_events_publish` | ❌ | +| 4 | 0.155364 | `speech_tts_synthesize` | ❌ | +| 5 | 0.145951 | `servicebus_topic_details` | ❌ | + +--- + +## Test 115 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `communication_email_send` **Prompt:** Send an email from my communication service to @@ -2342,6 +4227,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.498459 | `communication_email_send` | ✅ **EXPECTED** | | 2 | 0.314408 | `communication_sms_send` | ❌ | | 3 | 0.235110 | `foundry_openai_chat-completions-create` | ❌ | @@ -2351,6 +4237,28 @@ --- ## Test 111 +======= +| 1 | 
0.498406 | `communication_email_send` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.313058 | `communication_sms_send` | ❌ | +| 3 | 0.235127 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.211154 | `search_knowledge_base_retrieve` | ❌ | +| 5 | 0.194094 | `speech_stt_recognize` | ❌ | + +--- + +## Test 106 +======= +| 2 | 0.314462 | `communication_sms_send` | ❌ | +| 3 | 0.228890 | `speech_tts_synthesize` | ❌ | +| 4 | 0.218524 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.211154 | `search_knowledge_base_retrieve` | ❌ | + +--- + +## Test 116 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `communication_email_send` **Prompt:** Send HTML-formatted email to with subject @@ -2359,15 +4267,35 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.521087 | `communication_email_send` | ✅ **EXPECTED** | | 2 | 0.207644 | `communication_sms_send` | ❌ | | 3 | 0.152418 | `eventgrid_events_publish` | ❌ | | 4 | 0.152056 | `servicebus_topic_details` | ❌ | +======= +| 1 | 0.520967 | `communication_email_send` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.205130 | `communication_sms_send` | ❌ | +| 3 | 0.152418 | `eventgrid_events_publish` | ❌ | +======= +| 2 | 0.207658 | `communication_sms_send` | ❌ | +| 3 | 0.152427 | `eventgrid_events_publish` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.152013 | `servicebus_topic_details` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.143660 | `foundry_agents_evaluate` | ❌ | --- +<<<<<<< HEAD ## Test 112 +======= +<<<<<<< HEAD +## Test 107 +======= +## Test 117 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `communication_email_send` **Prompt:** Send email with CC to and @@ -2376,15 +4304,36 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.533532 | `communication_email_send` | ✅ **EXPECTED** | | 2 | 0.219566 | `communication_sms_send` | ❌ | | 3 | 0.106042 | `foundry_agents_query-and-evaluate` | ❌ | +======= +| 1 | 0.533447 | `communication_email_send` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.217412 | `communication_sms_send` | ❌ | +| 3 | 0.106026 | `foundry_agents_query-and-evaluate` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.103723 | `foundry_openai_chat-completions-create` | ❌ | | 5 | 0.084905 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD ## Test 113 +======= +## Test 108 +======= +| 2 | 0.219584 | `communication_sms_send` | ❌ | +| 3 | 0.106044 | `foundry_agents_query-and-evaluate` | ❌ | +| 4 | 0.087784 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.084933 | `cosmos_account_list` | ❌ | + +--- + +## Test 118 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `communication_email_send` **Prompt:** Send email to multiple recipients: , @@ -2393,6 +4342,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.540910 | `communication_email_send` | ✅ **EXPECTED** | | 2 | 0.244525 | `communication_sms_send` | ❌ | | 3 | 0.134996 | `foundry_openai_chat-completions-create` | ❌ | @@ -2402,6 +4352,27 @@ --- ## Test 114 +======= +| 1 | 0.540792 | `communication_email_send` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.241620 | `communication_sms_send` | ❌ | +| 3 | 0.134975 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.114324 | `foundry_agents_query-and-evaluate` | ❌ | +======= +| 2 | 0.244521 | `communication_sms_send` | ❌ | +| 3 | 0.114380 | `foundry_agents_query-and-evaluate` | ❌ | +| 4 | 0.098798 | `foundry_openai_chat-completions-create` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.087063 | `postgres_server_param_set` | ❌ | + 
+--- + +<<<<<<< HEAD +## Test 109 +======= +## Test 119 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `communication_email_send` **Prompt:** Send email with reply-to address set to @@ -2410,15 +4381,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.512721 | `communication_email_send` | ✅ **EXPECTED** | | 2 | 0.200189 | `communication_sms_send` | ❌ | | 3 | 0.164422 | `mysql_server_param_set` | ❌ | +======= +| 1 | 0.512623 | `communication_email_send` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.198552 | `communication_sms_send` | ❌ | +======= +| 2 | 0.200177 | `communication_sms_send` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.164115 | `mysql_server_param_set` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.158759 | `postgres_server_param_set` | ❌ | | 5 | 0.143574 | `appconfig_kv_set` | ❌ | --- +<<<<<<< HEAD ## Test 115 +======= +<<<<<<< HEAD +## Test 110 +======= +## Test 120 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `communication_email_send` **Prompt:** Send email with custom sender name @@ -2427,15 +4416,35 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.473192 | `communication_email_send` | ✅ **EXPECTED** | | 2 | 0.255124 | `communication_sms_send` | ❌ | +======= +| 1 | 0.473175 | `communication_email_send` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.253449 | `communication_sms_send` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.164811 | `foundry_openai_chat-completions-create` | ❌ | | 4 | 0.160285 | `foundry_openai_embeddings-create` | ❌ | | 5 | 0.156869 | `cosmos_database_container_item_query` | ❌ | --- +<<<<<<< HEAD ## Test 116 +======= +## Test 111 +======= +| 2 | 
0.255169 | `communication_sms_send` | ❌ | +| 3 | 0.156869 | `cosmos_database_container_item_query` | ❌ | +| 4 | 0.143626 | `sql_db_rename` | ❌ | +| 5 | 0.139388 | `foundry_openai_chat-completions-create` | ❌ | + +--- + +## Test 121 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `communication_email_send` **Prompt:** Send an email with BCC recipients @@ -2444,15 +4453,36 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.528899 | `communication_email_send` | ✅ **EXPECTED** | | 2 | 0.241091 | `communication_sms_send` | ❌ | | 3 | 0.137538 | `confidentialledger_entries_append` | ❌ | | 4 | 0.108748 | `confidentialledger_entries_get` | ❌ | +======= +| 1 | 0.528789 | `communication_email_send` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.239846 | `communication_sms_send` | ❌ | +| 3 | 0.137565 | `confidentialledger_entries_append` | ❌ | +| 4 | 0.108725 | `confidentialledger_entries_get` | ❌ | +======= +| 2 | 0.241114 | `communication_sms_send` | ❌ | +| 3 | 0.137538 | `confidentialledger_entries_append` | ❌ | +| 4 | 0.108748 | `confidentialledger_entries_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.105033 | `storage_blob_upload` | ❌ | --- +<<<<<<< HEAD ## Test 117 +======= +<<<<<<< HEAD +## Test 112 +======= +## Test 122 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `communication_sms_send` **Prompt:** Send an SMS message to saying "Hello" @@ -2461,15 +4491,36 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.533822 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.251480 | `communication_email_send` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.533763 | `communication_sms_send` | ✅ 
**EXPECTED** | +| 2 | 0.251429 | `communication_email_send` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.218656 | `foundry_openai_chat-completions-create` | ❌ | | 4 | 0.175534 | `foundry_agents_create` | ❌ | | 5 | 0.156040 | `foundry_threads_create` | ❌ | --- +<<<<<<< HEAD ## Test 118 +======= +## Test 113 +======= +| 1 | 0.533868 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.251429 | `communication_email_send` | ❌ | +| 3 | 0.178085 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.170676 | `speech_tts_synthesize` | ❌ | +| 5 | 0.148584 | `foundry_agents_connect` | ❌ | + +--- + +## Test 123 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `communication_sms_send` **Prompt:** Send SMS to from with message "Test message" @@ -2478,6 +4529,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.546006 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.294912 | `communication_email_send` | ❌ | | 3 | 0.204585 | `loadtesting_testrun_create` | ❌ | @@ -2487,6 +4539,29 @@ --- ## Test 119 +======= +<<<<<<< HEAD +| 1 | 0.543875 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.294603 | `communication_email_send` | ❌ | +| 3 | 0.204487 | `loadtesting_testrun_create` | ❌ | +| 4 | 0.200633 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.136763 | `loadtesting_testrun_update` | ❌ | + +--- + +## Test 114 +======= +| 1 | 0.546019 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.294859 | `communication_email_send` | ❌ | +| 3 | 0.204588 | `loadtesting_testrun_create` | ❌ | +| 4 | 0.155927 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.139313 | `speech_tts_synthesize` | ❌ | + +--- + +## Test 124 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** 
`communication_sms_send` **Prompt:** Send SMS to multiple recipients: , @@ -2495,6 +4570,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.545744 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.422028 | `communication_email_send` | ❌ | | 3 | 0.186088 | `foundry_openai_chat-completions-create` | ❌ | @@ -2504,6 +4580,28 @@ --- ## Test 120 +======= +<<<<<<< HEAD +| 1 | 0.543753 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.421988 | `communication_email_send` | ❌ | +| 3 | 0.186088 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.142030 | `foundry_agents_query-and-evaluate` | ❌ | +======= +| 1 | 0.545755 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.421988 | `communication_email_send` | ❌ | +| 3 | 0.142602 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.141987 | `foundry_agents_query-and-evaluate` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.104124 | `search_knowledge_base_retrieve` | ❌ | + +--- + +<<<<<<< HEAD +## Test 115 +======= +## Test 125 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `communication_sms_send` **Prompt:** Send SMS with delivery reporting enabled @@ -2512,15 +4610,37 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.554917 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.269203 | `communication_email_send` | ❌ | | 3 | 0.191848 | `extension_azqr` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.548617 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.269080 | `communication_email_send` | ❌ | +| 3 | 0.192340 | `extension_azqr` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.185916 | `foundry_openai_chat-completions-create` | ❌ | | 5 | 0.170749 | `foundry_agents_query-and-evaluate` | ❌ | --- +<<<<<<< HEAD ## Test 121 +======= +## Test 116 +======= +| 1 | 
0.554908 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.269080 | `communication_email_send` | ❌ | +| 3 | 0.191848 | `extension_azqr` | ❌ | +| 4 | 0.170743 | `foundry_agents_query-and-evaluate` | ❌ | +| 5 | 0.166385 | `foundry_openai_chat-completions-create` | ❌ | + +--- + +## Test 126 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `communication_sms_send` **Prompt:** Send SMS message with custom tracking tag "campaign1" @@ -2529,15 +4649,36 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.538893 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.269915 | `communication_email_send` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.534739 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.269794 | `communication_email_send` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.188153 | `loadtesting_testrun_create` | ❌ | | 4 | 0.185403 | `foundry_openai_chat-completions-create` | ❌ | | 5 | 0.175135 | `foundry_agents_create` | ❌ | --- +<<<<<<< HEAD ## Test 122 +======= +## Test 117 +======= +| 1 | 0.538827 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.269794 | `communication_email_send` | ❌ | +| 3 | 0.188153 | `loadtesting_testrun_create` | ❌ | +| 4 | 0.159177 | `appconfig_kv_set` | ❌ | +| 5 | 0.158295 | `loadtesting_test_create` | ❌ | + +--- + +## Test 127 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `communication_sms_send` **Prompt:** Send broadcast SMS to and saying "Urgent notification" @@ -2546,6 +4687,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.474775 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.286381 | `communication_email_send` | ❌ | | 3 | 0.164341 | `foundry_agents_query-and-evaluate` | ❌ | @@ -2555,6 +4697,29 @@ --- 
## Test 123 +======= +<<<<<<< HEAD +| 1 | 0.471991 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.286936 | `communication_email_send` | ❌ | +| 3 | 0.164059 | `foundry_agents_query-and-evaluate` | ❌ | +| 4 | 0.146501 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.128592 | `cosmos_account_list` | ❌ | + +--- + +## Test 118 +======= +| 1 | 0.474786 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.286338 | `communication_email_send` | ❌ | +| 3 | 0.164288 | `foundry_agents_query-and-evaluate` | ❌ | +| 4 | 0.129965 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.128744 | `cosmos_account_list` | ❌ | + +--- + +## Test 128 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `communication_sms_send` **Prompt:** Send SMS from my communication service to @@ -2563,6 +4728,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.564058 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.302377 | `communication_email_send` | ❌ | | 3 | 0.238340 | `foundry_openai_chat-completions-create` | ❌ | @@ -2572,6 +4738,29 @@ --- ## Test 124 +======= +<<<<<<< HEAD +| 1 | 0.563359 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.302360 | `communication_email_send` | ❌ | +| 3 | 0.238341 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.183684 | `search_knowledge_base_retrieve` | ❌ | +| 5 | 0.174092 | `foundry_openai_create-completion` | ❌ | + +--- + +## Test 119 +======= +| 1 | 0.564114 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.302363 | `communication_email_send` | ❌ | +| 3 | 0.213669 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.183651 | `search_knowledge_base_retrieve` | ❌ | +| 5 | 0.177315 | `appservice_database_add` | ❌ | + +--- + +## Test 129 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected 
Tool:** `communication_sms_send` **Prompt:** Send an SMS with delivery receipt tracking @@ -2580,15 +4769,37 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.598236 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.314267 | `communication_email_send` | ❌ | | 3 | 0.206931 | `foundry_agents_query-and-evaluate` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.592519 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.314134 | `communication_email_send` | ❌ | +| 3 | 0.206916 | `foundry_agents_query-and-evaluate` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.201142 | `foundry_openai_chat-completions-create` | ❌ | | 5 | 0.187824 | `confidentialledger_entries_append` | ❌ | --- +<<<<<<< HEAD ## Test 125 +======= +## Test 120 +======= +| 1 | 0.598211 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.314134 | `communication_email_send` | ❌ | +| 3 | 0.206814 | `foundry_agents_query-and-evaluate` | ❌ | +| 4 | 0.187824 | `confidentialledger_entries_append` | ❌ | +| 5 | 0.181824 | `foundry_openai_chat-completions-create` | ❌ | + +--- + +## Test 130 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** Append an entry to my ledger with data {"key": "value"} @@ -2597,6 +4808,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.511241 | `confidentialledger_entries_append` | ✅ **EXPECTED** | | 2 | 0.295319 | `confidentialledger_entries_get` | ❌ | | 3 | 0.291757 | `appconfig_kv_set` | ❌ | @@ -2606,6 +4818,26 @@ --- ## Test 126 +======= +<<<<<<< HEAD +| 1 | 0.510689 | `confidentialledger_entries_append` | ✅ **EXPECTED** | +| 2 | 0.293736 | `confidentialledger_entries_get` | ❌ | +======= +| 1 | 0.510651 | `confidentialledger_entries_append` | ✅ **EXPECTED** | +| 2 | 0.294885 | `confidentialledger_entries_get` | ❌ | +>>>>>>> 
84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.292014 | `appconfig_kv_set` | ❌ | +| 4 | 0.258967 | `appconfig_kv_lock_set` | ❌ | +| 5 | 0.249704 | `keyvault_certificate_import` | ❌ | + +--- + +<<<<<<< HEAD +## Test 121 +======= +## Test 131 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** Write a tamper-proof entry to ledger containing {"transaction": "data"} @@ -2614,6 +4846,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.602321 | `confidentialledger_entries_append` | ✅ **EXPECTED** | | 2 | 0.357401 | `confidentialledger_entries_get` | ❌ | | 3 | 0.211998 | `appconfig_kv_lock_set` | ❌ | @@ -2623,6 +4856,21 @@ --- ## Test 127 +======= +| 1 | 0.602257 | `confidentialledger_entries_append` | ✅ **EXPECTED** | +| 2 | 0.356510 | `confidentialledger_entries_get` | ❌ | +| 3 | 0.211990 | `appconfig_kv_lock_set` | ❌ | +| 4 | 0.195471 | `keyvault_secret_create` | ❌ | +| 5 | 0.183820 | `keyvault_certificate_import` | ❌ | + +--- + +<<<<<<< HEAD +## Test 122 +======= +## Test 132 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** Append {"hello": "from mcp"} to my confidential ledger in collection @@ -2631,6 +4879,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.546786 | `confidentialledger_entries_append` | ✅ **EXPECTED** | | 2 | 0.452117 | `confidentialledger_entries_get` | ❌ | | 3 | 0.225013 | `appconfig_kv_lock_set` | ❌ | @@ -2640,6 +4889,29 @@ --- ## Test 128 +======= +<<<<<<< HEAD +| 1 | 0.546573 | `confidentialledger_entries_append` | ✅ **EXPECTED** | +| 2 | 0.451031 | `confidentialledger_entries_get` | ❌ | +| 3 | 0.224978 | `appconfig_kv_lock_set` | ❌ | +| 4 | 0.215862 | 
`appconfig_kv_set` | ❌ | +| 5 | 0.203109 | `keyvault_certificate_import` | ❌ | + +--- + +## Test 123 +======= +| 1 | 0.546675 | `confidentialledger_entries_append` | ✅ **EXPECTED** | +| 2 | 0.452058 | `confidentialledger_entries_get` | ❌ | +| 3 | 0.225145 | `appconfig_kv_lock_set` | ❌ | +| 4 | 0.215898 | `appconfig_kv_set` | ❌ | +| 5 | 0.211661 | `appservice_database_add` | ❌ | + +--- + +## Test 133 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** Create an immutable ledger entry in with content {"audit": "log"} @@ -2648,15 +4920,35 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD +| 1 | 0.496023 | `confidentialledger_entries_append` | ✅ **EXPECTED** | +| 2 | 0.340187 | `confidentialledger_entries_get` | ❌ | +| 3 | 0.218473 | `monitor_activitylog_list` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.496032 | `confidentialledger_entries_append` | ✅ **EXPECTED** | +| 2 | 0.338270 | `confidentialledger_entries_get` | ❌ | +| 3 | 0.218518 | `monitor_activitylog_list` | ❌ | +======= | 1 | 0.496023 | `confidentialledger_entries_append` | ✅ **EXPECTED** | | 2 | 0.340187 | `confidentialledger_entries_get` | ❌ | | 3 | 0.218473 | `monitor_activitylog_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.215229 | `storage_blob_container_create` | ❌ | | 5 | 0.204925 | `monitor_resource_log_query` | ❌ | --- +<<<<<<< HEAD ## Test 129 +======= +<<<<<<< HEAD +## Test 124 +======= +## Test 134 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** Write an entry to confidential ledger @@ -2673,7 +4965,15 @@ --- +<<<<<<< HEAD ## Test 130 +======= +<<<<<<< HEAD +## Test 125 
+======= +## Test 135 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `confidentialledger_entries_get` **Prompt:** Get entry from Confidential Ledger for transaction on ledger @@ -2682,6 +4982,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.707252 | `confidentialledger_entries_get` | ✅ **EXPECTED** | | 2 | 0.551953 | `confidentialledger_entries_append` | ❌ | | 3 | 0.245549 | `keyvault_secret_get` | ❌ | @@ -2691,6 +4992,29 @@ --- ## Test 131 +======= +<<<<<<< HEAD +| 1 | 0.706506 | `confidentialledger_entries_get` | ✅ **EXPECTED** | +| 2 | 0.551901 | `confidentialledger_entries_append` | ❌ | +| 3 | 0.245541 | `keyvault_secret_get` | ❌ | +| 4 | 0.229943 | `keyvault_key_get` | ❌ | +| 5 | 0.212658 | `loadtesting_testrun_get` | ❌ | + +--- + +## Test 126 +======= +| 1 | 0.707252 | `confidentialledger_entries_get` | ✅ **EXPECTED** | +| 2 | 0.551953 | `confidentialledger_entries_append` | ❌ | +| 3 | 0.245541 | `keyvault_secret_get` | ❌ | +| 4 | 0.229943 | `keyvault_key_get` | ❌ | +| 5 | 0.211925 | `loadtesting_testrun_get` | ❌ | + +--- + +## Test 136 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `confidentialledger_entries_get` **Prompt:** Get transaction from ledger @@ -2699,15 +5023,35 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.509714 | `confidentialledger_entries_get` | ✅ **EXPECTED** | | 2 | 0.416580 | `confidentialledger_entries_append` | ❌ | | 3 | 0.223959 | `loadtesting_testrun_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.510283 | `confidentialledger_entries_get` | ✅ **EXPECTED** | +| 2 | 0.416550 | `confidentialledger_entries_append` | ❌ | +| 3 | 0.224523 | `loadtesting_testrun_get` | ❌ | +======= +| 1 | 0.509714 | `confidentialledger_entries_get` | ✅ **EXPECTED** | +| 2 
| 0.416580 | `confidentialledger_entries_append` | ❌ | +| 3 | 0.224029 | `loadtesting_testrun_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.218412 | `monitor_resource_log_query` | ❌ | | 5 | 0.217671 | `loadtesting_testrun_list` | ❌ | --- +<<<<<<< HEAD ## Test 132 +======= +<<<<<<< HEAD +## Test 127 +======= +## Test 137 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `cosmos_account_list` **Prompt:** List all cosmosdb accounts in my subscription @@ -2716,15 +5060,27 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.818357 | `cosmos_account_list` | ✅ **EXPECTED** | +| 1 | 0.818340 | `cosmos_account_list` | ✅ **EXPECTED** | | 2 | 0.668480 | `cosmos_database_list` | ❌ | | 3 | 0.636009 | `subscription_list` | ❌ | | 4 | 0.615268 | `cosmos_database_container_list` | ❌ | +<<<<<<< HEAD | 5 | 0.601467 | `kusto_cluster_list` | ❌ | --- +<<<<<<< HEAD ## Test 133 +======= +## Test 128 +======= +| 5 | 0.601388 | `kusto_cluster_list` | ❌ | + +--- + +## Test 138 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `cosmos_account_list` **Prompt:** Show me my cosmosdb accounts @@ -2733,6 +5089,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.665422 | `cosmos_account_list` | ✅ **EXPECTED** | | 2 | 0.605325 | `cosmos_database_list` | ❌ | | 3 | 0.571573 | `cosmos_database_container_list` | ❌ | @@ -2742,6 +5099,26 @@ --- ## Test 134 +======= +| 1 | 0.665440 | `cosmos_account_list` | ✅ **EXPECTED** | +| 2 | 0.605357 | `cosmos_database_list` | ❌ | +| 3 | 0.571613 | `cosmos_database_container_list` | ❌ | +<<<<<<< HEAD +| 4 | 0.549476 | `cosmos_database_container_item_query` | ❌ | +| 5 | 0.504032 | 
`storage_account_get` | ❌ | + +--- + +## Test 129 +======= +| 4 | 0.549447 | `cosmos_database_container_item_query` | ❌ | +| 5 | 0.503850 | `storage_account_get` | ❌ | + +--- + +## Test 139 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `cosmos_account_list` **Prompt:** Show me the cosmosdb accounts in my subscription @@ -2750,6 +5127,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.752494 | `cosmos_account_list` | ✅ **EXPECTED** | | 2 | 0.607165 | `subscription_list` | ❌ | | 3 | 0.605125 | `cosmos_database_list` | ❌ | @@ -2758,7 +5136,22 @@ --- +<<<<<<< HEAD ## Test 135 +======= +## Test 130 +======= +| 1 | 0.752501 | `cosmos_account_list` | ✅ **EXPECTED** | +| 2 | 0.607201 | `subscription_list` | ❌ | +| 3 | 0.605125 | `cosmos_database_list` | ❌ | +| 4 | 0.566249 | `cosmos_database_container_list` | ❌ | +| 5 | 0.563921 | `cosmos_database_container_item_query` | ❌ | + +--- + +## Test 140 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `cosmos_database_container_item_query` **Prompt:** Show me the items that contain the word in the container in the database for the cosmosdb account @@ -2767,15 +5160,31 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD +| 1 | 0.658701 | `cosmos_database_container_item_query` | ✅ **EXPECTED** | +======= +<<<<<<< HEAD +| 1 | 0.658738 | `cosmos_database_container_item_query` | ✅ **EXPECTED** | +======= | 1 | 0.658701 | `cosmos_database_container_item_query` | ✅ **EXPECTED** | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.605253 | `cosmos_database_container_list` | ❌ | | 3 | 0.488353 | `storage_blob_container_get` | ❌ | | 4 | 0.477874 | `cosmos_database_list` | ❌ | 
-| 5 | 0.447757 | `cosmos_account_list` | ❌ | +| 5 | 0.447777 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD ## Test 136 +======= +<<<<<<< HEAD +## Test 131 +======= +## Test 141 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `cosmos_database_container_list` **Prompt:** List all the containers in the database for the cosmosdb account @@ -2784,6 +5193,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.852875 | `cosmos_database_container_list` | ✅ **EXPECTED** | | 2 | 0.680991 | `cosmos_database_list` | ❌ | | 3 | 0.680758 | `cosmos_database_container_item_query` | ❌ | @@ -2793,6 +5203,25 @@ --- ## Test 137 +======= +| 1 | 0.852832 | `cosmos_database_container_list` | ✅ **EXPECTED** | +| 2 | 0.681044 | `cosmos_database_list` | ❌ | +<<<<<<< HEAD +| 3 | 0.680794 | `cosmos_database_container_item_query` | ❌ | +======= +| 3 | 0.680762 | `cosmos_database_container_item_query` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.632335 | `storage_blob_container_get` | ❌ | +| 5 | 0.630597 | `cosmos_account_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 132 +======= +## Test 142 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `cosmos_database_container_list` **Prompt:** Show me the containers in the database for the cosmosdb account @@ -2801,6 +5230,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.789395 | `cosmos_database_container_list` | ✅ **EXPECTED** | | 2 | 0.648126 | `cosmos_database_container_item_query` | ❌ | | 3 | 0.614220 | `cosmos_database_list` | ❌ | @@ -2810,6 +5240,29 @@ --- ## Test 138 +======= +<<<<<<< HEAD +| 1 | 0.789413 | `cosmos_database_container_list` | ✅ **EXPECTED** | +| 2 | 0.648207 | `cosmos_database_container_item_query` | ❌ | +| 3 | 
0.614278 | `cosmos_database_list` | ❌ | +| 4 | 0.591387 | `storage_blob_container_get` | ❌ | +| 5 | 0.562096 | `cosmos_account_list` | ❌ | + +--- + +## Test 133 +======= +| 1 | 0.789395 | `cosmos_database_container_list` | ✅ **EXPECTED** | +| 2 | 0.648126 | `cosmos_database_container_item_query` | ❌ | +| 3 | 0.614220 | `cosmos_database_list` | ❌ | +| 4 | 0.591361 | `storage_blob_container_get` | ❌ | +| 5 | 0.562033 | `cosmos_account_list` | ❌ | + +--- + +## Test 143 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `cosmos_database_list` **Prompt:** List all the databases in the cosmosdb account @@ -2819,14 +5272,32 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.815683 | `cosmos_database_list` | ✅ **EXPECTED** | -| 2 | 0.668515 | `cosmos_account_list` | ❌ | +| 2 | 0.668468 | `cosmos_account_list` | ❌ | | 3 | 0.665298 | `cosmos_database_container_list` | ❌ | +<<<<<<< HEAD | 4 | 0.606433 | `cosmos_database_container_item_query` | ❌ | | 5 | 0.582804 | `kusto_database_list` | ❌ | --- ## Test 139 +======= +<<<<<<< HEAD +| 4 | 0.606414 | `cosmos_database_container_item_query` | ❌ | +| 5 | 0.583507 | `kusto_database_list` | ❌ | + +--- + +## Test 134 +======= +| 4 | 0.606433 | `cosmos_database_container_item_query` | ❌ | +| 5 | 0.583097 | `kusto_database_list` | ❌ | + +--- + +## Test 144 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `cosmos_database_list` **Prompt:** Show me the databases in the cosmosdb account @@ -2837,13 +5308,30 @@ |------|-------|------|--------| | 1 | 0.749370 | `cosmos_database_list` | ✅ **EXPECTED** | | 2 | 0.624759 | `cosmos_database_container_list` | ❌ | +<<<<<<< HEAD | 3 | 0.614572 | `cosmos_account_list` | ❌ | +<<<<<<< HEAD +| 4 | 0.579919 | `cosmos_database_container_item_query` | ❌ | +======= +| 4 | 0.579913 | 
`cosmos_database_container_item_query` | ❌ | +======= +| 3 | 0.614554 | `cosmos_account_list` | ❌ | | 4 | 0.579919 | `cosmos_database_container_item_query` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.538479 | `mysql_database_list` | ❌ | --- +<<<<<<< HEAD ## Test 140 +======= +<<<<<<< HEAD +## Test 135 +======= +## Test 145 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `kusto_cluster_get` **Prompt:** Show me the details of the Data Explorer cluster @@ -2853,14 +5341,32 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.590264 | `kusto_cluster_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.463832 | `kusto_cluster_list` | ❌ | | 3 | 0.428159 | `kusto_query` | ❌ | +<<<<<<< HEAD | 4 | 0.425909 | `kusto_database_list` | ❌ | +======= +| 4 | 0.425688 | `kusto_database_list` | ❌ | +======= +| 2 | 0.463623 | `kusto_cluster_list` | ❌ | +| 3 | 0.428159 | `kusto_query` | ❌ | +| 4 | 0.425469 | `kusto_database_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.422577 | `kusto_table_schema` | ❌ | --- +<<<<<<< HEAD ## Test 141 +======= +<<<<<<< HEAD +## Test 136 +======= +## Test 146 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `kusto_cluster_list` **Prompt:** List all Data Explorer clusters in my subscription @@ -2869,15 +5375,32 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.793744 | `kusto_cluster_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.630451 | `kusto_database_list` | ❌ | +======= +| 2 | 0.630504 | `kusto_database_list` | ❌ | +======= +| 1 | 0.793453 | `kusto_cluster_list` | ✅ **EXPECTED** | +| 2 | 
0.630261 | `kusto_database_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.573395 | `kusto_cluster_get` | ❌ | | 4 | 0.525025 | `aks_cluster_get` | ❌ | | 5 | 0.509397 | `grafana_list` | ❌ | --- +<<<<<<< HEAD ## Test 142 +======= +<<<<<<< HEAD +## Test 137 +======= +## Test 147 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `kusto_cluster_list` **Prompt:** Show me my Data Explorer clusters @@ -2886,15 +5409,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.531307 | `kusto_cluster_list` | ✅ **EXPECTED** | | 2 | 0.465277 | `kusto_cluster_get` | ❌ | +<<<<<<< HEAD | 3 | 0.432311 | `kusto_database_list` | ❌ | +======= +| 3 | 0.432320 | `kusto_database_list` | ❌ | +======= +| 1 | 0.530932 | `kusto_cluster_list` | ✅ **EXPECTED** | +| 2 | 0.465277 | `kusto_cluster_get` | ❌ | +| 3 | 0.432552 | `kusto_database_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.369596 | `aks_cluster_get` | ❌ | | 5 | 0.363119 | `kusto_table_schema` | ❌ | --- +<<<<<<< HEAD ## Test 143 +======= +<<<<<<< HEAD +## Test 138 +======= +## Test 148 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `kusto_cluster_list` **Prompt:** Show me the Data Explorer clusters in my subscription @@ -2903,15 +5444,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.701484 | `kusto_cluster_list` | ✅ **EXPECTED** | | 2 | 0.571191 | `kusto_cluster_get` | ❌ | +<<<<<<< HEAD | 3 | 0.548734 | `kusto_database_list` | ❌ | +======= +| 3 | 0.548690 | `kusto_database_list` | ❌ | +======= +| 1 | 0.701232 | `kusto_cluster_list` | ✅ **EXPECTED** 
| +| 2 | 0.571191 | `kusto_cluster_get` | ❌ | +| 3 | 0.548589 | `kusto_database_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.498909 | `aks_cluster_get` | ❌ | | 5 | 0.474201 | `redis_list` | ❌ | --- +<<<<<<< HEAD ## Test 144 +======= +<<<<<<< HEAD +## Test 139 +======= +## Test 149 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `kusto_database_list` **Prompt:** List all databases in the Data Explorer cluster @@ -2920,15 +5479,35 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.676656 | `kusto_database_list` | ✅ **EXPECTED** | | 2 | 0.560592 | `kusto_cluster_list` | ❌ | | 3 | 0.556795 | `kusto_table_list` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.677042 | `kusto_database_list` | ✅ **EXPECTED** | +| 2 | 0.560592 | `kusto_cluster_list` | ❌ | +| 3 | 0.556688 | `kusto_table_list` | ❌ | +======= +| 1 | 0.676699 | `kusto_database_list` | ✅ **EXPECTED** | +| 2 | 0.560388 | `kusto_cluster_list` | ❌ | +| 3 | 0.556795 | `kusto_table_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.553218 | `postgres_database_list` | ❌ | | 5 | 0.549673 | `cosmos_database_list` | ❌ | --- +<<<<<<< HEAD ## Test 145 +======= +<<<<<<< HEAD +## Test 140 +======= +## Test 150 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `kusto_database_list` **Prompt:** Show me the databases in the Data Explorer cluster @@ -2937,15 +5516,35 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.623242 | `kusto_database_list` | ✅ **EXPECTED** | | 2 | 0.509952 | `kusto_cluster_list` | ❌ | | 3 | 0.507073 | `kusto_table_list` | ❌ | 
+======= +<<<<<<< HEAD +| 1 | 0.623528 | `kusto_database_list` | ✅ **EXPECTED** | +| 2 | 0.509953 | `kusto_cluster_list` | ❌ | +| 3 | 0.506997 | `kusto_table_list` | ❌ | +======= +| 1 | 0.623401 | `kusto_database_list` | ✅ **EXPECTED** | +| 2 | 0.509763 | `kusto_cluster_list` | ❌ | +| 3 | 0.507073 | `kusto_table_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.497144 | `cosmos_database_list` | ❌ | | 5 | 0.491400 | `mysql_database_list` | ❌ | --- +<<<<<<< HEAD ## Test 146 +======= +<<<<<<< HEAD +## Test 141 +======= +## Test 151 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `kusto_query` **Prompt:** Show me all items that contain the word in the Data Explorer table in cluster @@ -2955,6 +5554,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.423660 | `kusto_query` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.409485 | `postgres_database_query` | ❌ | | 3 | 0.408178 | `kusto_table_schema` | ❌ | | 4 | 0.407740 | `kusto_sample` | ❌ | @@ -2963,23 +5563,58 @@ --- ## Test 147 +======= +| 2 | 0.409534 | `postgres_database_query` | ❌ | +| 3 | 0.408178 | `kusto_table_schema` | ❌ | +| 4 | 0.407741 | `kusto_sample` | ❌ | +<<<<<<< HEAD +| 5 | 0.403990 | `kusto_cluster_list` | ❌ | -**Expected Tool:** `kusto_sample` -**Prompt:** Show me a data sample from the Data Explorer table in cluster +--- -### Results +## Test 142 +======= +| 5 | 0.403800 | `kusto_cluster_list` | ❌ | -| Rank | Score | Tool | Status | -|------|-------|------|--------| +--- + +## Test 152 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) + +**Expected Tool:** `kusto_sample` +**Prompt:** Show me a data sample from the Data Explorer table in cluster + +### Results + +| Rank | Score | Tool | Status | 
+|------|-------|------|--------| | 1 | 0.595554 | `kusto_sample` | ✅ **EXPECTED** | | 2 | 0.510233 | `kusto_table_schema` | ❌ | +<<<<<<< HEAD | 3 | 0.424212 | `kusto_table_list` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.424221 | `kusto_table_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.400924 | `kusto_cluster_list` | ❌ | +======= +| 3 | 0.424212 | `kusto_table_list` | ❌ | +| 4 | 0.400719 | `kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.399525 | `kusto_cluster_get` | ❌ | --- +<<<<<<< HEAD ## Test 148 +======= +<<<<<<< HEAD +## Test 143 +======= +## Test 153 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `kusto_table_list` **Prompt:** List all tables in the Data Explorer database in cluster @@ -2988,7 +5623,12 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.679642 | `kusto_table_list` | ✅ **EXPECTED** | +======= +<<<<<<< HEAD +| 1 | 0.679655 | `kusto_table_list` | ✅ **EXPECTED** | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.585237 | `postgres_table_list` | ❌ | | 3 | 0.580964 | `kusto_database_list` | ❌ | | 4 | 0.556724 | `mysql_table_list` | ❌ | @@ -2996,7 +5636,22 @@ --- +<<<<<<< HEAD ## Test 149 +======= +## Test 144 +======= +| 1 | 0.679642 | `kusto_table_list` | ✅ **EXPECTED** | +| 2 | 0.585237 | `postgres_table_list` | ❌ | +| 3 | 0.581015 | `kusto_database_list` | ❌ | +| 4 | 0.556724 | `mysql_table_list` | ❌ | +| 5 | 0.549762 | `monitor_table_list` | ❌ | + +--- + +## Test 154 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `kusto_table_list` **Prompt:** Show me the tables in the Data Explorer database in cluster @@ -3005,15 +5660,35 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.619252 | `kusto_table_list` | ✅ **EXPECTED** | | 2 | 0.554332 | `kusto_table_schema` | ❌ | | 3 | 0.527431 | `kusto_database_list` | ❌ | | 4 | 0.524691 | `mysql_table_list` | ❌ | +======= +| 1 | 0.619269 | `kusto_table_list` | ✅ **EXPECTED** | +| 2 | 0.554333 | `kusto_table_schema` | ❌ | +<<<<<<< HEAD +| 3 | 0.527616 | `kusto_database_list` | ❌ | +| 4 | 0.524607 | `mysql_table_list` | ❌ | +======= +| 3 | 0.527570 | `kusto_database_list` | ❌ | +| 4 | 0.524691 | `mysql_table_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.523432 | `postgres_table_list` | ❌ | --- +<<<<<<< HEAD ## Test 150 +======= +<<<<<<< HEAD +## Test 145 +======= +## Test 155 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `kusto_table_schema` **Prompt:** Show me the schema for table in the Data Explorer database in cluster @@ -3022,6 +5697,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.666980 | `kusto_table_schema` | ✅ **EXPECTED** | | 2 | 0.564204 | `postgres_table_schema_get` | ❌ | | 3 | 0.528301 | `mysql_table_schema_get` | ❌ | @@ -3031,6 +5707,29 @@ --- ## Test 151 +======= +<<<<<<< HEAD +| 1 | 0.667033 | `kusto_table_schema` | ✅ **EXPECTED** | +| 2 | 0.564282 | `postgres_table_schema_get` | ❌ | +| 3 | 0.527921 | `mysql_table_schema_get` | ❌ | +| 4 | 0.490939 | `kusto_sample` | ❌ | +| 5 | 0.489722 | `kusto_table_list` | ❌ | + +--- + +## Test 146 +======= +| 1 | 0.667095 | `kusto_table_schema` | ✅ **EXPECTED** | +| 2 | 0.564717 | `postgres_table_schema_get` | ❌ | +| 3 | 0.528210 | `mysql_table_schema_get` | ❌ | +| 4 | 0.490775 | `kusto_sample` | ❌ | +| 5 | 0.489814 | `kusto_table_list` | ❌ | + +--- + +## Test 156 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 
58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `mysql_database_list` **Prompt:** List all MySQL databases in server @@ -3039,6 +5738,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.633991 | `postgres_database_list` | ❌ | | 2 | 0.623359 | `mysql_database_list` | ✅ **EXPECTED** | | 3 | 0.534434 | `mysql_table_list` | ❌ | @@ -3048,6 +5748,29 @@ --- ## Test 152 +======= +<<<<<<< HEAD +| 1 | 0.633973 | `postgres_database_list` | ❌ | +| 2 | 0.623333 | `mysql_database_list` | ✅ **EXPECTED** | +| 3 | 0.534537 | `mysql_table_list` | ❌ | +| 4 | 0.498854 | `mysql_server_list` | ❌ | +| 5 | 0.490179 | `sql_db_list` | ❌ | + +--- + +## Test 147 +======= +| 1 | 0.634056 | `postgres_database_list` | ❌ | +| 2 | 0.623421 | `mysql_database_list` | ✅ **EXPECTED** | +| 3 | 0.534457 | `mysql_table_list` | ❌ | +| 4 | 0.498918 | `mysql_server_list` | ❌ | +| 5 | 0.490148 | `sql_db_list` | ❌ | + +--- + +## Test 157 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `mysql_database_list` **Prompt:** Show me the MySQL databases in server @@ -3058,6 +5781,7 @@ |------|-------|------|--------| | 1 | 0.588121 | `mysql_database_list` | ✅ **EXPECTED** | | 2 | 0.574089 | `postgres_database_list` | ❌ | +<<<<<<< HEAD | 3 | 0.483855 | `mysql_table_list` | ❌ | | 4 | 0.463244 | `mysql_server_list` | ❌ | | 5 | 0.444547 | `sql_db_list` | ❌ | @@ -3065,6 +5789,25 @@ --- ## Test 153 +======= +<<<<<<< HEAD +| 3 | 0.483938 | `mysql_table_list` | ❌ | +| 4 | 0.463238 | `mysql_server_list` | ❌ | +| 5 | 0.444622 | `sql_db_list` | ❌ | + +--- + +## Test 148 +======= +| 3 | 0.483855 | `mysql_table_list` | ❌ | +| 4 | 0.463244 | `mysql_server_list` | ❌ | +| 5 | 0.444547 | `sql_db_list` | ❌ | + +--- + +## Test 158 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) 
**Expected Tool:** `mysql_database_query` **Prompt:** Show me all items that contain the word in the MySQL database in server @@ -3073,6 +5816,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.476423 | `mysql_table_list` | ❌ | | 2 | 0.455770 | `mysql_database_list` | ❌ | | 3 | 0.432703 | `mysql_database_query` | ✅ **EXPECTED** | @@ -3082,6 +5826,28 @@ --- ## Test 154 +======= +<<<<<<< HEAD +| 1 | 0.476539 | `mysql_table_list` | ❌ | +| 2 | 0.455770 | `mysql_database_list` | ❌ | +| 3 | 0.433392 | `mysql_database_query` | ✅ **EXPECTED** | +| 4 | 0.419938 | `mysql_server_list` | ❌ | +======= +| 1 | 0.476423 | `mysql_table_list` | ❌ | +| 2 | 0.455770 | `mysql_database_list` | ❌ | +| 3 | 0.433202 | `mysql_database_query` | ✅ **EXPECTED** | +| 4 | 0.419859 | `mysql_server_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.409445 | `mysql_table_schema_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 149 +======= +## Test 159 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `mysql_server_config_get` **Prompt:** Show me the configuration of MySQL server @@ -3098,7 +5864,15 @@ --- +<<<<<<< HEAD ## Test 155 +======= +<<<<<<< HEAD +## Test 150 +======= +## Test 160 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `mysql_server_list` **Prompt:** List all MySQL servers in my subscription @@ -3107,7 +5881,12 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.678473 | `postgres_server_list` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.678536 | `postgres_server_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.558177 | `mysql_database_list` | ❌ | | 3 | 0.554818 | `mysql_server_list` | ✅ **EXPECTED** | | 4 | 0.513706 | `kusto_cluster_list` | ❌ 
| @@ -3115,7 +5894,22 @@ --- +<<<<<<< HEAD ## Test 156 +======= +## Test 151 +======= +| 1 | 0.678472 | `postgres_server_list` | ❌ | +| 2 | 0.558177 | `mysql_database_list` | ❌ | +| 3 | 0.554817 | `mysql_server_list` | ✅ **EXPECTED** | +| 4 | 0.513750 | `kusto_cluster_list` | ❌ | +| 5 | 0.501199 | `mysql_table_list` | ❌ | + +--- + +## Test 161 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `mysql_server_list` **Prompt:** Show me my MySQL servers @@ -3125,14 +5919,34 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.478518 | `mysql_database_list` | ❌ | +<<<<<<< HEAD +| 2 | 0.474586 | `mysql_server_list` | ✅ **EXPECTED** | +| 3 | 0.435642 | `postgres_server_list` | ❌ | +| 4 | 0.412380 | `mysql_table_list` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.474630 | `mysql_server_list` | ✅ **EXPECTED** | +| 3 | 0.435692 | `postgres_server_list` | ❌ | +| 4 | 0.412417 | `mysql_table_list` | ❌ | +======= | 2 | 0.474586 | `mysql_server_list` | ✅ **EXPECTED** | | 3 | 0.435642 | `postgres_server_list` | ❌ | | 4 | 0.412380 | `mysql_table_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.389993 | `postgres_database_list` | ❌ | --- +<<<<<<< HEAD ## Test 157 +======= +<<<<<<< HEAD +## Test 152 +======= +## Test 162 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `mysql_server_list` **Prompt:** Show me the MySQL servers in my subscription @@ -3141,15 +5955,35 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.636435 | `postgres_server_list` | ❌ | | 2 | 0.534266 | `mysql_server_list` | ✅ **EXPECTED** | +======= +<<<<<<< HEAD +| 1 | 0.636471 | `postgres_server_list` | ❌ | +| 2 | 0.534277 | `mysql_server_list` | ✅ **EXPECTED** 
| +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.530210 | `mysql_database_list` | ❌ | | 4 | 0.475052 | `kusto_cluster_list` | ❌ | +======= +| 1 | 0.636435 | `postgres_server_list` | ❌ | +| 2 | 0.534266 | `mysql_server_list` | ✅ **EXPECTED** | +| 3 | 0.530210 | `mysql_database_list` | ❌ | +| 4 | 0.475138 | `kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.470468 | `redis_list` | ❌ | --- +<<<<<<< HEAD ## Test 158 +======= +<<<<<<< HEAD +## Test 153 +======= +## Test 163 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `mysql_server_param_get` **Prompt:** Show me the value of connection timeout in seconds in my MySQL server @@ -3159,6 +5993,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.495071 | `mysql_server_param_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.437857 | `mysql_server_param_set` | ❌ | | 3 | 0.333041 | `mysql_database_query` | ❌ | | 4 | 0.313364 | `mysql_table_schema_get` | ❌ | @@ -3167,6 +6002,20 @@ --- ## Test 159 +======= +| 2 | 0.438075 | `mysql_server_param_set` | ❌ | +| 3 | 0.333906 | `mysql_database_query` | ❌ | +| 4 | 0.313150 | `mysql_table_schema_get` | ❌ | +| 5 | 0.310834 | `postgres_server_param_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 154 +======= +## Test 164 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `mysql_server_param_set` **Prompt:** Set connection timeout to 20 seconds for my MySQL server @@ -3178,12 +6027,30 @@ | 1 | 0.449612 | `mysql_server_param_set` | ✅ **EXPECTED** | | 2 | 0.381144 | `mysql_server_param_get` | ❌ | | 3 | 0.303499 | `postgres_server_param_set` | ❌ | +<<<<<<< HEAD | 4 | 0.298661 | `mysql_database_query` | ❌ | | 5 | 0.254180 | `mysql_server_list` | ❌ | --- ## Test 160 +======= +<<<<<<< HEAD +| 4 | 
0.298911 | `mysql_database_query` | ❌ | +| 5 | 0.254206 | `mysql_server_list` | ❌ | + +--- + +## Test 155 +======= +| 4 | 0.299246 | `mysql_database_query` | ❌ | +| 5 | 0.277569 | `appservice_database_add` | ❌ | + +--- + +## Test 165 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `mysql_table_list` **Prompt:** List all tables in the MySQL database in server @@ -3192,6 +6059,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.633542 | `mysql_table_list` | ✅ **EXPECTED** | | 2 | 0.573851 | `postgres_table_list` | ❌ | | 3 | 0.550878 | `postgres_database_list` | ❌ | @@ -3201,6 +6069,29 @@ --- ## Test 161 +======= +<<<<<<< HEAD +| 1 | 0.633547 | `mysql_table_list` | ✅ **EXPECTED** | +| 2 | 0.573844 | `postgres_table_list` | ❌ | +| 3 | 0.550898 | `postgres_database_list` | ❌ | +| 4 | 0.546963 | `mysql_database_list` | ❌ | +| 5 | 0.511906 | `kusto_table_list` | ❌ | + +--- + +## Test 156 +======= +| 1 | 0.633542 | `mysql_table_list` | ✅ **EXPECTED** | +| 2 | 0.573851 | `postgres_table_list` | ❌ | +| 3 | 0.550878 | `postgres_database_list` | ❌ | +| 4 | 0.546987 | `mysql_database_list` | ❌ | +| 5 | 0.511879 | `kusto_table_list` | ❌ | + +--- + +## Test 166 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `mysql_table_list` **Prompt:** Show me the tables in the MySQL database in server @@ -3217,7 +6108,15 @@ --- +<<<<<<< HEAD ## Test 162 +======= +<<<<<<< HEAD +## Test 157 +======= +## Test 167 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `mysql_table_schema_get` **Prompt:** Show me the schema of table in the MySQL database in server @@ -3234,7 +6133,15 @@ --- +<<<<<<< HEAD ## Test 163 +======= +<<<<<<< HEAD +## Test 158 +======= 
+## Test 168 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `postgres_database_list` **Prompt:** List all PostgreSQL databases in server @@ -3251,7 +6158,15 @@ --- +<<<<<<< HEAD ## Test 164 +======= +<<<<<<< HEAD +## Test 159 +======= +## Test 169 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `postgres_database_list` **Prompt:** Show me the PostgreSQL databases in server @@ -3268,7 +6183,15 @@ --- +<<<<<<< HEAD ## Test 165 +======= +<<<<<<< HEAD +## Test 160 +======= +## Test 170 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `postgres_database_query` **Prompt:** Show me all items that contain the word in the PostgreSQL database in server @@ -3278,6 +6201,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.546211 | `postgres_database_list` | ❌ | +<<<<<<< HEAD | 2 | 0.523223 | `postgres_database_query` | ✅ **EXPECTED** | | 3 | 0.503267 | `postgres_table_list` | ❌ | | 4 | 0.466599 | `postgres_server_list` | ❌ | @@ -3286,6 +6210,26 @@ --- ## Test 166 +======= +<<<<<<< HEAD +| 2 | 0.523142 | `postgres_database_query` | ✅ **EXPECTED** | +| 3 | 0.503267 | `postgres_table_list` | ❌ | +| 4 | 0.466608 | `postgres_server_list` | ❌ | +======= +| 2 | 0.523122 | `postgres_database_query` | ✅ **EXPECTED** | +| 3 | 0.503267 | `postgres_table_list` | ❌ | +| 4 | 0.466599 | `postgres_server_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.403969 | `postgres_server_param_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 161 +======= +## Test 171 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** 
`postgres_server_config_get` **Prompt:** Show me the configuration of PostgreSQL server @@ -3302,7 +6246,15 @@ --- +<<<<<<< HEAD ## Test 167 +======= +<<<<<<< HEAD +## Test 162 +======= +## Test 172 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `postgres_server_list` **Prompt:** List all PostgreSQL servers in my subscription @@ -3311,7 +6263,12 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.900023 | `postgres_server_list` | ✅ **EXPECTED** | +======= +<<<<<<< HEAD +| 1 | 0.900052 | `postgres_server_list` | ✅ **EXPECTED** | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.640733 | `postgres_database_list` | ❌ | | 3 | 0.565914 | `postgres_table_list` | ❌ | | 4 | 0.538997 | `postgres_server_config_get` | ❌ | @@ -3319,7 +6276,22 @@ --- +<<<<<<< HEAD ## Test 168 +======= +## Test 163 +======= +| 1 | 0.900023 | `postgres_server_list` | ✅ **EXPECTED** | +| 2 | 0.640733 | `postgres_database_list` | ❌ | +| 3 | 0.565914 | `postgres_table_list` | ❌ | +| 4 | 0.538997 | `postgres_server_config_get` | ❌ | +| 5 | 0.534345 | `kusto_cluster_list` | ❌ | + +--- + +## Test 173 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `postgres_server_list` **Prompt:** Show me my PostgreSQL servers @@ -3336,7 +6308,15 @@ --- +<<<<<<< HEAD ## Test 169 +======= +<<<<<<< HEAD +## Test 164 +======= +## Test 174 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `postgres_server_list` **Prompt:** Show me the PostgreSQL servers in my subscription @@ -3353,7 +6333,15 @@ --- +<<<<<<< HEAD ## Test 170 +======= +<<<<<<< HEAD +## Test 165 +======= +## Test 175 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 
58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `postgres_server_param_get` **Prompt:** Show me if the parameter my PostgreSQL server has replication enabled @@ -3370,7 +6358,15 @@ --- +<<<<<<< HEAD ## Test 171 +======= +<<<<<<< HEAD +## Test 166 +======= +## Test 176 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `postgres_server_param_set` **Prompt:** Enable replication for my PostgreSQL server @@ -3387,7 +6383,15 @@ --- +<<<<<<< HEAD ## Test 172 +======= +<<<<<<< HEAD +## Test 167 +======= +## Test 177 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `postgres_table_list` **Prompt:** List all tables in the PostgreSQL database in server @@ -3404,7 +6408,15 @@ --- +<<<<<<< HEAD ## Test 173 +======= +<<<<<<< HEAD +## Test 168 +======= +## Test 178 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `postgres_table_list` **Prompt:** Show me the tables in the PostgreSQL database in server @@ -3421,7 +6433,15 @@ --- +<<<<<<< HEAD ## Test 174 +======= +<<<<<<< HEAD +## Test 169 +======= +## Test 179 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `postgres_table_schema_get` **Prompt:** Show me the schema of table
in the PostgreSQL database in server @@ -3438,7 +6458,15 @@ --- +<<<<<<< HEAD ## Test 175 +======= +<<<<<<< HEAD +## Test 170 +======= +## Test 180 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `deploy_app_logs_get` **Prompt:** Show me the log of the application deployed by azd @@ -3449,13 +6477,30 @@ |------|-------|------|--------| | 1 | 0.711844 | `deploy_app_logs_get` | ✅ **EXPECTED** | | 2 | 0.471692 | `deploy_plan_get` | ❌ | +<<<<<<< HEAD | 3 | 0.451639 | `monitor_activitylog_list` | ❌ | | 4 | 0.404892 | `deploy_pipeline_guidance_get` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.451653 | `monitor_activitylog_list` | ❌ | +======= +| 3 | 0.451638 | `monitor_activitylog_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.404890 | `deploy_pipeline_guidance_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.401388 | `monitor_resource_log_query` | ❌ | --- +<<<<<<< HEAD ## Test 176 +======= +<<<<<<< HEAD +## Test 171 +======= +## Test 181 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `deploy_architecture_diagram_generate` **Prompt:** Generate the azure architecture diagram for this application @@ -3472,7 +6517,15 @@ --- +<<<<<<< HEAD ## Test 177 +======= +<<<<<<< HEAD +## Test 172 +======= +## Test 182 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `deploy_iac_rules_get` **Prompt:** Show me the rules to generate bicep scripts @@ -3482,14 +6535,32 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.529092 | `deploy_iac_rules_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.480324 | `bicepschema_get` | ❌ | +======= +| 2 | 0.479903 | `bicepschema_get` | ❌ | +<<<<<<< HEAD 
+>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.391965 | `get_bestpractices_get` | ❌ | | 4 | 0.383210 | `azureterraformbestpractices_get` | ❌ | | 5 | 0.375561 | `extension_cli_generate` | ❌ | --- +<<<<<<< HEAD ## Test 178 +======= +## Test 173 +======= +| 3 | 0.394509 | `get_bestpractices_get` | ❌ | +| 4 | 0.383210 | `azureterraformbestpractices_get` | ❌ | +| 5 | 0.375561 | `extension_cli_generate` | ❌ | + +--- + +## Test 183 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `deploy_pipeline_guidance_get` **Prompt:** How can I create a CI/CD pipeline to deploy this app to Azure? @@ -3500,6 +6571,7 @@ |------|-------|------|--------| | 1 | 0.638588 | `deploy_pipeline_guidance_get` | ✅ **EXPECTED** | | 2 | 0.499242 | `deploy_plan_get` | ❌ | +<<<<<<< HEAD | 3 | 0.448917 | `deploy_iac_rules_get` | ❌ | | 4 | 0.385670 | `deploy_app_logs_get` | ❌ | | 5 | 0.382240 | `get_bestpractices_get` | ❌ | @@ -3507,6 +6579,19 @@ --- ## Test 179 +======= +| 3 | 0.448918 | `deploy_iac_rules_get` | ❌ | +| 4 | 0.385940 | `get_bestpractices_get` | ❌ | +| 5 | 0.385920 | `deploy_app_logs_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 174 +======= +## Test 184 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `deploy_plan_get` **Prompt:** Create a plan to deploy this application to azure @@ -3523,7 +6608,15 @@ --- +<<<<<<< HEAD ## Test 180 +======= +<<<<<<< HEAD +## Test 175 +======= +## Test 185 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventgrid_events_publish` **Prompt:** Publish an event to Event Grid topic using with the following data @@ -3532,6 +6625,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.755353 | 
`eventgrid_events_publish` | ✅ **EXPECTED** | | 2 | 0.482544 | `eventgrid_subscription_list` | ❌ | | 3 | 0.465759 | `eventgrid_topic_list` | ❌ | @@ -3541,6 +6635,29 @@ --- ## Test 181 +======= +<<<<<<< HEAD +| 1 | 0.755366 | `eventgrid_events_publish` | ✅ **EXPECTED** | +| 2 | 0.482575 | `eventgrid_subscription_list` | ❌ | +| 3 | 0.465432 | `eventgrid_topic_list` | ❌ | +| 4 | 0.360845 | `eventhubs_eventhub_update` | ❌ | +| 5 | 0.354313 | `servicebus_topic_details` | ❌ | + +--- + +## Test 176 +======= +| 1 | 0.755380 | `eventgrid_events_publish` | ✅ **EXPECTED** | +| 2 | 0.483021 | `eventgrid_subscription_list` | ❌ | +| 3 | 0.466031 | `eventgrid_topic_list` | ❌ | +| 4 | 0.360676 | `eventhubs_eventhub_update` | ❌ | +| 5 | 0.355599 | `servicebus_topic_details` | ❌ | + +--- + +## Test 186 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventgrid_events_publish` **Prompt:** Publish event to my Event Grid topic with the following events @@ -3549,15 +6666,34 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.654648 | `eventgrid_events_publish` | ✅ **EXPECTED** | | 2 | 0.524134 | `eventgrid_subscription_list` | ❌ | | 3 | 0.509777 | `eventgrid_topic_list` | ❌ | | 4 | 0.373438 | `servicebus_topic_details` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.654647 | `eventgrid_events_publish` | ✅ **EXPECTED** | +======= +| 1 | 0.654668 | `eventgrid_events_publish` | ✅ **EXPECTED** | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.524503 | `eventgrid_subscription_list` | ❌ | +| 3 | 0.510039 | `eventgrid_topic_list` | ❌ | +| 4 | 0.373718 | `servicebus_topic_details` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.359908 | `eventhubs_eventhub_update` | ❌ | --- +<<<<<<< HEAD ## Test 182 +======= +<<<<<<< HEAD +## Test 177 +======= +## Test 187 +>>>>>>> 84ad4f44 (update prompts and tool 
description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventgrid_events_publish` **Prompt:** Send an event to Event Grid topic in resource group with @@ -3566,15 +6702,36 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.600274 | `eventgrid_events_publish` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.521041 | `eventgrid_topic_list` | ❌ | | 3 | 0.504642 | `eventgrid_subscription_list` | ❌ | | 4 | 0.411129 | `eventhubs_eventhub_consumergroup_update` | ❌ | +======= +| 2 | 0.521240 | `eventgrid_topic_list` | ❌ | +| 3 | 0.504808 | `eventgrid_subscription_list` | ❌ | +| 4 | 0.411390 | `eventhubs_eventhub_consumergroup_update` | ❌ | +======= +| 1 | 0.600303 | `eventgrid_events_publish` | ✅ **EXPECTED** | +| 2 | 0.521240 | `eventgrid_topic_list` | ❌ | +| 3 | 0.504808 | `eventgrid_subscription_list` | ❌ | +| 4 | 0.411130 | `eventhubs_eventhub_consumergroup_update` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.389439 | `eventhubs_eventhub_consumergroup_get` | ❌ | --- +<<<<<<< HEAD ## Test 183 +======= +<<<<<<< HEAD +## Test 178 +======= +## Test 188 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventgrid_topic_list` **Prompt:** List all Event Grid topics in my subscription @@ -3583,15 +6740,34 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.769921 | `eventgrid_topic_list` | ✅ **EXPECTED** | | 2 | 0.745048 | `eventgrid_subscription_list` | ❌ | | 3 | 0.561862 | `kusto_cluster_list` | ❌ | | 4 | 0.543887 | `search_service_list` | ❌ | +======= +| 1 | 0.770140 | `eventgrid_topic_list` | ✅ **EXPECTED** | +| 2 | 0.745470 | `eventgrid_subscription_list` | ❌ | +<<<<<<< HEAD +| 3 | 0.561862 | `kusto_cluster_list` | ❌ | +======= +| 3 | 0.561858 | 
`kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.545540 | `search_service_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.526123 | `subscription_list` | ❌ | --- +<<<<<<< HEAD ## Test 184 +======= +<<<<<<< HEAD +## Test 179 +======= +## Test 189 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventgrid_topic_list` **Prompt:** Show me the Event Grid topics in my subscription @@ -3600,6 +6776,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.738040 | `eventgrid_topic_list` | ✅ **EXPECTED** | | 2 | 0.736919 | `eventgrid_subscription_list` | ❌ | | 3 | 0.492592 | `kusto_cluster_list` | ❌ | @@ -3609,6 +6786,26 @@ --- ## Test 185 +======= +| 1 | 0.738258 | `eventgrid_topic_list` | ✅ **EXPECTED** | +| 2 | 0.737486 | `eventgrid_subscription_list` | ❌ | +<<<<<<< HEAD +| 3 | 0.492592 | `kusto_cluster_list` | ❌ | +| 4 | 0.480252 | `subscription_list` | ❌ | +======= +| 3 | 0.492527 | `kusto_cluster_list` | ❌ | +| 4 | 0.480287 | `subscription_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.475119 | `search_service_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 180 +======= +## Test 190 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventgrid_topic_list` **Prompt:** List all Event Grid topics in subscription @@ -3617,6 +6814,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.769840 | `eventgrid_topic_list` | ✅ **EXPECTED** | | 2 | 0.720426 | `eventgrid_subscription_list` | ❌ | | 3 | 0.535369 | `kusto_cluster_list` | ❌ | @@ -3626,6 +6824,25 @@ --- ## Test 186 +======= +| 1 | 0.770140 | `eventgrid_topic_list` | ✅ **EXPECTED** | +| 2 | 0.721362 | 
`eventgrid_subscription_list` | ❌ | +<<<<<<< HEAD +| 3 | 0.535326 | `kusto_cluster_list` | ❌ | +======= +| 3 | 0.535427 | `kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.514248 | `search_service_list` | ❌ | +| 5 | 0.495952 | `subscription_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 181 +======= +## Test 191 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventgrid_topic_list` **Prompt:** List all Event Grid topics in resource group in subscription @@ -3634,15 +6851,36 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.758562 | `eventgrid_topic_list` | ✅ **EXPECTED** | | 2 | 0.704062 | `eventgrid_subscription_list` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.758816 | `eventgrid_topic_list` | ✅ **EXPECTED** | +| 2 | 0.704462 | `eventgrid_subscription_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.609175 | `group_list` | ❌ | | 4 | 0.544809 | `monitor_webtests_list` | ❌ | | 5 | 0.524209 | `eventhubs_namespace_get` | ❌ | --- +<<<<<<< HEAD ## Test 187 +======= +## Test 182 +======= +| 1 | 0.758786 | `eventgrid_topic_list` | ✅ **EXPECTED** | +| 2 | 0.704443 | `eventgrid_subscription_list` | ❌ | +| 3 | 0.609074 | `group_list` | ❌ | +| 4 | 0.536981 | `monitor_webtests_list` | ❌ | +| 5 | 0.524359 | `eventhubs_namespace_get` | ❌ | + +--- + +## Test 192 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** Show me all Event Grid subscriptions for topic @@ -3655,11 +6893,23 @@ | 2 | 0.720373 | `eventgrid_topic_list` | ❌ | | 3 | 0.498398 | `servicebus_topic_details` | ❌ | | 4 | 0.486216 | `servicebus_topic_subscription_details` | ❌ | +<<<<<<< HEAD | 5 | 0.486162 | `eventgrid_events_publish` | ❌ | --- +<<<<<<< HEAD 
## Test 188 +======= +## Test 183 +======= +| 5 | 0.486132 | `eventgrid_events_publish` | ❌ | + +--- + +## Test 193 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** List Event Grid subscriptions for topic in subscription @@ -3671,12 +6921,28 @@ | 1 | 0.717676 | `eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.709586 | `eventgrid_topic_list` | ❌ | | 3 | 0.539977 | `servicebus_topic_subscription_details` | ❌ | +<<<<<<< HEAD | 4 | 0.529084 | `servicebus_topic_details` | ❌ | +======= +| 4 | 0.529286 | `servicebus_topic_details` | ❌ | +<<<<<<< HEAD +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.477876 | `eventgrid_events_publish` | ❌ | --- +<<<<<<< HEAD ## Test 189 +======= +## Test 184 +======= +| 5 | 0.477848 | `eventgrid_events_publish` | ❌ | + +--- + +## Test 194 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** List Event Grid subscriptions for topic in resource group @@ -3685,6 +6951,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.746672 | `eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.745851 | `eventgrid_topic_list` | ❌ | | 3 | 0.535463 | `monitor_webtests_list` | ❌ | @@ -3694,6 +6961,29 @@ --- ## Test 190 +======= +<<<<<<< HEAD +| 1 | 0.746815 | `eventgrid_subscription_list` | ✅ **EXPECTED** | +| 2 | 0.746174 | `eventgrid_topic_list` | ❌ | +| 3 | 0.535731 | `monitor_webtests_list` | ❌ | +| 4 | 0.524919 | `group_list` | ❌ | +| 5 | 0.503158 | `servicebus_topic_details` | ❌ | + +--- + +## Test 185 +======= +| 1 | 0.746335 | `eventgrid_subscription_list` | ✅ **EXPECTED** | +| 2 | 0.745666 | `eventgrid_topic_list` | ❌ | +| 3 | 0.528105 | `monitor_webtests_list` | ❌ | +| 4 | 0.524883 | `group_list` 
| ❌ | +| 5 | 0.502820 | `servicebus_topic_details` | ❌ | + +--- + +## Test 195 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** Show all Event Grid subscriptions in my subscription @@ -3702,6 +6992,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.736844 | `eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.659612 | `eventgrid_topic_list` | ❌ | | 3 | 0.569255 | `subscription_list` | ❌ | @@ -3711,6 +7002,26 @@ --- ## Test 191 +======= +| 1 | 0.736436 | `eventgrid_subscription_list` | ✅ **EXPECTED** | +| 2 | 0.659727 | `eventgrid_topic_list` | ❌ | +<<<<<<< HEAD +| 3 | 0.569256 | `subscription_list` | ❌ | +| 4 | 0.537922 | `kusto_cluster_list` | ❌ | +======= +| 3 | 0.569254 | `subscription_list` | ❌ | +| 4 | 0.537909 | `kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.518857 | `search_service_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 186 +======= +## Test 196 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** List all Event Grid subscriptions in subscription @@ -3719,6 +7030,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.684586 | `eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.656227 | `eventgrid_topic_list` | ❌ | | 3 | 0.542362 | `subscription_list` | ❌ | @@ -3728,6 +7040,29 @@ --- ## Test 192 +======= +<<<<<<< HEAD +| 1 | 0.684444 | `eventgrid_subscription_list` | ✅ **EXPECTED** | +| 2 | 0.656183 | `eventgrid_topic_list` | ❌ | +| 3 | 0.542320 | `subscription_list` | ❌ | +| 4 | 0.521015 | `kusto_cluster_list` | ❌ | +| 5 | 0.510024 | `group_list` | ❌ | + +--- + +## Test 187 +======= +| 1 | 0.684543 | `eventgrid_subscription_list` | ✅ 
**EXPECTED** | +| 2 | 0.656277 | `eventgrid_topic_list` | ❌ | +| 3 | 0.542388 | `subscription_list` | ❌ | +| 4 | 0.521119 | `kusto_cluster_list` | ❌ | +| 5 | 0.510115 | `group_list` | ❌ | + +--- + +## Test 197 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** Show Event Grid subscriptions in resource group in subscription @@ -3739,12 +7074,30 @@ | 1 | 0.696332 | `eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.691623 | `eventgrid_topic_list` | ❌ | | 3 | 0.557573 | `group_list` | ❌ | +<<<<<<< HEAD | 4 | 0.510684 | `monitor_webtests_list` | ❌ | | 5 | 0.504984 | `resourcehealth_availability-status_list` | ❌ | --- ## Test 193 +======= +<<<<<<< HEAD +| 4 | 0.510814 | `monitor_webtests_list` | ❌ | +| 5 | 0.505497 | `resourcehealth_availability-status_list` | ❌ | + +--- + +## Test 188 +======= +| 4 | 0.504984 | `resourcehealth_availability-status_list` | ❌ | +| 5 | 0.503099 | `monitor_webtests_list` | ❌ | + +--- + +## Test 198 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** List Event Grid subscriptions for subscription in location @@ -3756,12 +7109,28 @@ | 1 | 0.710457 | `eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.642001 | `eventgrid_topic_list` | ❌ | | 3 | 0.506618 | `subscription_list` | ❌ | +<<<<<<< HEAD | 4 | 0.476396 | `search_service_list` | ❌ | +======= +| 4 | 0.476763 | `search_service_list` | ❌ | +<<<<<<< HEAD +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.475782 | `kusto_cluster_list` | ❌ | --- +<<<<<<< HEAD ## Test 194 +======= +## Test 189 +======= +| 5 | 0.475718 | `kusto_cluster_list` | ❌ | + +--- + +## Test 199 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool 
description evaluator) **Expected Tool:** `eventhubs_eventhub_consumergroup_delete` **Prompt:** Delete my consumer group in my event hub , namespace , and resource group @@ -3770,6 +7139,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.766928 | `eventhubs_eventhub_consumergroup_delete` | ✅ **EXPECTED** | | 2 | 0.675842 | `eventhubs_eventhub_consumergroup_update` | ❌ | | 3 | 0.641112 | `eventhubs_eventhub_consumergroup_get` | ❌ | @@ -3779,6 +7149,29 @@ --- ## Test 195 +======= +<<<<<<< HEAD +| 1 | 0.766896 | `eventhubs_eventhub_consumergroup_delete` | ✅ **EXPECTED** | +| 2 | 0.675127 | `eventhubs_eventhub_consumergroup_update` | ❌ | +| 3 | 0.641111 | `eventhubs_eventhub_consumergroup_get` | ❌ | +| 4 | 0.633848 | `eventhubs_namespace_delete` | ❌ | +| 5 | 0.605802 | `eventhubs_eventhub_delete` | ❌ | + +--- + +## Test 190 +======= +| 1 | 0.767014 | `eventhubs_eventhub_consumergroup_delete` | ✅ **EXPECTED** | +| 2 | 0.675937 | `eventhubs_eventhub_consumergroup_update` | ❌ | +| 3 | 0.641200 | `eventhubs_eventhub_consumergroup_get` | ❌ | +| 4 | 0.631867 | `eventhubs_namespace_delete` | ❌ | +| 5 | 0.605622 | `eventhubs_eventhub_delete` | ❌ | + +--- + +## Test 200 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventhubs_eventhub_consumergroup_get` **Prompt:** List all consumer groups in my event hub in namespace @@ -3788,14 +7181,31 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.738475 | `eventhubs_eventhub_consumergroup_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.634517 | `eventhubs_eventhub_consumergroup_update` | ❌ | | 3 | 0.626486 | `eventhubs_eventhub_consumergroup_delete` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.634345 | `eventhubs_eventhub_consumergroup_update` | ❌ | +======= +| 2 | 0.634517 | `eventhubs_eventhub_consumergroup_update` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool 
description evaluator) +| 3 | 0.626485 | `eventhubs_eventhub_consumergroup_delete` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.606619 | `eventhubs_namespace_get` | ❌ | | 5 | 0.593098 | `eventhubs_eventhub_get` | ❌ | --- +<<<<<<< HEAD ## Test 196 +======= +<<<<<<< HEAD +## Test 191 +======= +## Test 201 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventhubs_eventhub_consumergroup_get` **Prompt:** Get the details of my consumer group in my event hub , namespace , and resource group @@ -3805,14 +7215,30 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.712861 | `eventhubs_eventhub_consumergroup_get` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.637170 | `eventhubs_eventhub_consumergroup_update` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.637418 | `eventhubs_eventhub_consumergroup_update` | ❌ | +======= | 2 | 0.637170 | `eventhubs_eventhub_consumergroup_update` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.625913 | `eventhubs_eventhub_consumergroup_delete` | ❌ | | 4 | 0.576800 | `eventhubs_namespace_get` | ❌ | | 5 | 0.529940 | `eventhubs_eventhub_get` | ❌ | --- +<<<<<<< HEAD ## Test 197 +======= +<<<<<<< HEAD +## Test 192 +======= +## Test 202 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventhubs_eventhub_consumergroup_update` **Prompt:** Create a new consumer group in my event hub , namespace , and resource group @@ -3821,6 +7247,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.756873 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | | 2 | 0.688248 | `eventhubs_eventhub_consumergroup_get` | ❌ | | 3 | 0.669384 | 
`eventhubs_eventhub_consumergroup_delete` | ❌ | @@ -3830,6 +7257,25 @@ --- ## Test 198 +======= +<<<<<<< HEAD +| 1 | 0.757520 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | +======= +| 1 | 0.757614 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.688923 | `eventhubs_eventhub_consumergroup_get` | ❌ | +| 3 | 0.670026 | `eventhubs_eventhub_consumergroup_delete` | ❌ | +| 4 | 0.554314 | `eventhubs_eventhub_update` | ❌ | +| 5 | 0.545003 | `eventhubs_namespace_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 193 +======= +## Test 203 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventhubs_eventhub_consumergroup_update` **Prompt:** Update my consumer group in my event hub , namespace , and resource group @@ -3838,6 +7284,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.739158 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | | 2 | 0.655927 | `eventhubs_eventhub_consumergroup_delete` | ❌ | | 3 | 0.642524 | `eventhubs_eventhub_consumergroup_get` | ❌ | @@ -3847,6 +7294,29 @@ --- ## Test 199 +======= +<<<<<<< HEAD +| 1 | 0.739615 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | +| 2 | 0.655951 | `eventhubs_eventhub_consumergroup_delete` | ❌ | +| 3 | 0.642701 | `eventhubs_eventhub_consumergroup_get` | ❌ | +| 4 | 0.552830 | `eventhubs_eventhub_update` | ❌ | +| 5 | 0.524428 | `eventhubs_namespace_delete` | ❌ | + +--- + +## Test 194 +======= +| 1 | 0.738818 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | +| 2 | 0.655610 | `eventhubs_eventhub_consumergroup_delete` | ❌ | +| 3 | 0.642206 | `eventhubs_eventhub_consumergroup_get` | ❌ | +| 4 | 0.552216 | `eventhubs_eventhub_update` | ❌ | +| 5 | 0.523137 | `eventhubs_namespace_get` | ❌ | + +--- + +## Test 204 +>>>>>>> 84ad4f44 (update prompts and tool 
description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventhubs_eventhub_delete` **Prompt:** Delete my event hub in my namespace and resource group @@ -3855,6 +7325,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.699266 | `eventhubs_namespace_delete` | ❌ | | 2 | 0.688646 | `eventhubs_eventhub_delete` | ✅ **EXPECTED** | | 3 | 0.627721 | `eventhubs_eventhub_consumergroup_delete` | ❌ | @@ -3864,6 +7335,29 @@ --- ## Test 200 +======= +<<<<<<< HEAD +| 1 | 0.699621 | `eventhubs_namespace_delete` | ❌ | +| 2 | 0.689171 | `eventhubs_eventhub_delete` | ✅ **EXPECTED** | +| 3 | 0.627887 | `eventhubs_eventhub_consumergroup_delete` | ❌ | +| 4 | 0.579273 | `eventhubs_namespace_get` | ❌ | +| 5 | 0.553715 | `eventhubs_eventhub_get` | ❌ | + +--- + +## Test 195 +======= +| 1 | 0.697894 | `eventhubs_namespace_delete` | ❌ | +| 2 | 0.688471 | `eventhubs_eventhub_delete` | ✅ **EXPECTED** | +| 3 | 0.627661 | `eventhubs_eventhub_consumergroup_delete` | ❌ | +| 4 | 0.578662 | `eventhubs_namespace_get` | ❌ | +| 5 | 0.552931 | `eventhubs_eventhub_get` | ❌ | + +--- + +## Test 205 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventhubs_eventhub_get` **Prompt:** List all Event Hubs in my namespace @@ -3872,6 +7366,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.773277 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | | 2 | 0.687596 | `eventhubs_namespace_get` | ❌ | | 3 | 0.578709 | `eventhubs_eventhub_update` | ❌ | @@ -3881,6 +7376,21 @@ --- ## Test 201 +======= +| 1 | 0.773231 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | +| 2 | 0.687582 | `eventhubs_namespace_get` | ❌ | +| 3 | 0.578689 | `eventhubs_eventhub_update` | ❌ | +| 4 | 0.560155 | `eventhubs_namespace_delete` | ❌ | +| 5 | 0.545475 | `eventhubs_eventhub_consumergroup_get` | ❌ | + +--- + +<<<<<<< 
HEAD +## Test 196 +======= +## Test 206 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventhubs_eventhub_get` **Prompt:** Get the details of my event hub in my namespace and resource group @@ -3889,6 +7399,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.638112 | `eventhubs_namespace_get` | ❌ | | 2 | 0.627528 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | | 3 | 0.570964 | `eventhubs_eventhub_consumergroup_get` | ❌ | @@ -3898,6 +7409,29 @@ --- ## Test 202 +======= +<<<<<<< HEAD +| 1 | 0.638030 | `eventhubs_namespace_get` | ❌ | +| 2 | 0.627606 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | +| 3 | 0.570898 | `eventhubs_eventhub_consumergroup_get` | ❌ | +| 4 | 0.527564 | `eventhubs_eventhub_update` | ❌ | +| 5 | 0.521837 | `eventhubs_namespace_delete` | ❌ | + +--- + +## Test 197 +======= +| 1 | 0.638173 | `eventhubs_namespace_get` | ❌ | +| 2 | 0.627712 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | +| 3 | 0.571001 | `eventhubs_eventhub_consumergroup_get` | ❌ | +| 4 | 0.527639 | `eventhubs_eventhub_update` | ❌ | +| 5 | 0.521101 | `eventhubs_namespace_delete` | ❌ | + +--- + +## Test 207 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventhubs_eventhub_update` **Prompt:** Create a new event hub in my namespace and resource group @@ -3906,6 +7440,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.645976 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | | 2 | 0.605856 | `eventhubs_namespace_get` | ❌ | | 3 | 0.574389 | `eventhubs_eventhub_get` | ❌ | @@ -3915,6 +7450,29 @@ --- ## Test 203 +======= +<<<<<<< HEAD +| 1 | 0.645723 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | +| 2 | 0.605716 | `eventhubs_namespace_get` | ❌ | +| 3 | 0.574303 | `eventhubs_eventhub_get` | ❌ | +| 4 | 0.571748 | 
`eventhubs_eventhub_consumergroup_update` | ❌ | +| 5 | 0.557530 | `eventhubs_namespace_delete` | ❌ | + +--- + +## Test 198 +======= +| 1 | 0.645976 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | +| 2 | 0.605856 | `eventhubs_namespace_get` | ❌ | +| 3 | 0.574389 | `eventhubs_eventhub_get` | ❌ | +| 4 | 0.571676 | `eventhubs_eventhub_consumergroup_update` | ❌ | +| 5 | 0.557073 | `eventhubs_namespace_delete` | ❌ | + +--- + +## Test 208 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventhubs_eventhub_update` **Prompt:** Update my event hub in my namespace and resource group @@ -3923,6 +7481,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.655283 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | | 2 | 0.571661 | `eventhubs_eventhub_delete` | ❌ | | 3 | 0.568605 | `eventhubs_eventhub_consumergroup_update` | ❌ | @@ -3932,6 +7491,29 @@ --- ## Test 204 +======= +<<<<<<< HEAD +| 1 | 0.655261 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | +| 2 | 0.571762 | `eventhubs_eventhub_delete` | ❌ | +| 3 | 0.569417 | `eventhubs_eventhub_consumergroup_update` | ❌ | +| 4 | 0.568279 | `eventhubs_namespace_get` | ❌ | +| 5 | 0.565852 | `eventhubs_namespace_delete` | ❌ | + +--- + +## Test 199 +======= +| 1 | 0.655104 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | +| 2 | 0.571580 | `eventhubs_eventhub_delete` | ❌ | +| 3 | 0.568796 | `eventhubs_eventhub_consumergroup_update` | ❌ | +| 4 | 0.568526 | `eventhubs_namespace_get` | ❌ | +| 5 | 0.564849 | `eventhubs_namespace_delete` | ❌ | + +--- + +## Test 209 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventhubs_namespace_delete` **Prompt:** Delete my namespace in my resource group @@ -3940,15 +7522,28 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 
0.623995 | `eventhubs_namespace_delete` | ✅ **EXPECTED** | | 2 | 0.525810 | `eventhubs_namespace_update` | ❌ | +======= +| 1 | 0.626113 | `eventhubs_namespace_delete` | ✅ **EXPECTED** | +| 2 | 0.525446 | `eventhubs_namespace_update` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.505082 | `eventhubs_eventhub_consumergroup_delete` | ❌ | | 4 | 0.449841 | `eventhubs_namespace_get` | ❌ | | 5 | 0.435037 | `workbooks_delete` | ❌ | --- +<<<<<<< HEAD ## Test 205 +======= +<<<<<<< HEAD +## Test 200 +======= +## Test 210 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventhubs_namespace_get` **Prompt:** List all Event Hubs namespaces in my subscription @@ -3959,13 +7554,30 @@ |------|-------|------|--------| | 1 | 0.659838 | `eventhubs_eventhub_get` | ❌ | | 2 | 0.658827 | `eventhubs_namespace_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 3 | 0.607372 | `kusto_cluster_list` | ❌ | +<<<<<<< HEAD | 4 | 0.557150 | `eventgrid_topic_list` | ❌ | | 5 | 0.556016 | `eventgrid_subscription_list` | ❌ | --- ## Test 206 +======= +======= +| 3 | 0.607365 | `kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.557200 | `eventgrid_topic_list` | ❌ | +| 5 | 0.556126 | `eventgrid_subscription_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 201 +======= +## Test 211 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventhubs_namespace_get` **Prompt:** Get the details of my namespace in my resource group @@ -3974,6 +7586,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.509749 | `eventhubs_namespace_get` | ✅ **EXPECTED** | | 2 | 0.509432 | `monitor_webtests_get` | ❌ | | 3 | 0.497399 | `servicebus_queue_details` | ❌ | @@ -3983,6 +7596,29 @@ --- ## Test 207 +======= +<<<<<<< HEAD +| 1 
| 0.510078 | `monitor_webtests_get` | ❌ | +| 2 | 0.509993 | `eventhubs_namespace_get` | ✅ **EXPECTED** | +| 3 | 0.497527 | `servicebus_queue_details` | ❌ | +| 4 | 0.490095 | `eventhubs_namespace_update` | ❌ | +| 5 | 0.470636 | `functionapp_get` | ❌ | + +--- + +## Test 202 +======= +| 1 | 0.509749 | `eventhubs_namespace_get` | ✅ **EXPECTED** | +| 2 | 0.509431 | `monitor_webtests_get` | ❌ | +| 3 | 0.497399 | `servicebus_queue_details` | ❌ | +| 4 | 0.490055 | `eventhubs_namespace_update` | ❌ | +| 5 | 0.470455 | `functionapp_get` | ❌ | + +--- + +## Test 212 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventhubs_namespace_update` **Prompt:** Create an new namespace in my resource group @@ -3993,13 +7629,21 @@ |------|-------|------|--------| | 1 | 0.610313 | `eventhubs_namespace_update` | ✅ **EXPECTED** | | 2 | 0.466721 | `eventhubs_namespace_get` | ❌ | -| 3 | 0.458458 | `eventhubs_namespace_delete` | ❌ | +| 3 | 0.461181 | `eventhubs_namespace_delete` | ❌ | | 4 | 0.449724 | `workbooks_create` | ❌ | | 5 | 0.438492 | `eventhubs_eventhub_consumergroup_update` | ❌ | --- +<<<<<<< HEAD ## Test 208 +======= +<<<<<<< HEAD +## Test 203 +======= +## Test 213 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `eventhubs_namespace_update` **Prompt:** Update my namespace in my resource group @@ -4008,6 +7652,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.622219 | `eventhubs_namespace_update` | ✅ **EXPECTED** | | 2 | 0.474098 | `eventhubs_namespace_delete` | ❌ | | 3 | 0.448723 | `eventhubs_namespace_get` | ❌ | @@ -4017,6 +7662,21 @@ --- ## Test 209 +======= +| 1 | 0.622338 | `eventhubs_namespace_update` | ✅ **EXPECTED** | +| 2 | 0.476290 | `eventhubs_namespace_delete` | ❌ | +| 3 | 0.448723 | `eventhubs_namespace_get` | ❌ | +| 4 | 
0.436549 | `eventhubs_eventhub_consumergroup_update` | ❌ | +| 5 | 0.372632 | `sql_db_rename` | ❌ | + +--- + +<<<<<<< HEAD +## Test 204 +======= +## Test 214 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `functionapp_get` **Prompt:** Describe the function app in resource group @@ -4028,12 +7688,29 @@ | 1 | 0.660116 | `functionapp_get` | ✅ **EXPECTED** | | 2 | 0.451226 | `deploy_app_logs_get` | ❌ | | 3 | 0.450457 | `applens_resource_diagnose` | ❌ | +<<<<<<< HEAD | 4 | 0.390048 | `mysql_server_list` | ❌ | +======= +<<<<<<< HEAD +| 4 | 0.390107 | `mysql_server_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.380314 | `get_bestpractices_get` | ❌ | --- +<<<<<<< HEAD ## Test 210 +======= +## Test 205 +======= +| 4 | 0.390048 | `mysql_server_list` | ❌ | +| 5 | 0.380262 | `get_bestpractices_get` | ❌ | + +--- + +## Test 215 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `functionapp_get` **Prompt:** Get configuration for function app @@ -4050,7 +7727,15 @@ --- +<<<<<<< HEAD ## Test 211 +======= +<<<<<<< HEAD +## Test 206 +======= +## Test 216 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `functionapp_get` **Prompt:** Get function app status for @@ -4060,6 +7745,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.622384 | `functionapp_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.413523 | `resourcehealth_availability-status_get` | ❌ | | 3 | 0.390708 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.383293 | `deploy_app_logs_get` | ❌ | @@ -4068,6 +7754,27 @@ --- ## Test 212 +======= +<<<<<<< HEAD +| 2 | 0.413481 | `resourcehealth_availability-status_get` | ❌ | +| 3 | 0.390766 | 
`resourcehealth_availability-status_list` | ❌ | +| 4 | 0.383533 | `deploy_app_logs_get` | ❌ | +| 5 | 0.360677 | `storage_account_get` | ❌ | + +--- + +## Test 207 +======= +| 2 | 0.411718 | `resourcehealth_availability-status_get` | ❌ | +| 3 | 0.390708 | `resourcehealth_availability-status_list` | ❌ | +| 4 | 0.383533 | `deploy_app_logs_get` | ❌ | +| 5 | 0.360764 | `storage_account_get` | ❌ | + +--- + +## Test 217 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `functionapp_get` **Prompt:** Get information about my function app in @@ -4078,13 +7785,31 @@ |------|-------|------|--------| | 1 | 0.690933 | `functionapp_get` | ✅ **EXPECTED** | | 2 | 0.441937 | `foundry_resource_get` | ❌ | +<<<<<<< HEAD | 3 | 0.432317 | `resourcehealth_availability-status_list` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.432458 | `resourcehealth_availability-status_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.431821 | `applens_resource_diagnose` | ❌ | | 5 | 0.429077 | `storage_account_get` | ❌ | --- +<<<<<<< HEAD ## Test 213 +======= +## Test 208 +======= +| 3 | 0.432317 | `resourcehealth_availability-status_list` | ❌ | +| 4 | 0.431821 | `applens_resource_diagnose` | ❌ | +| 5 | 0.429120 | `storage_account_get` | ❌ | + +--- + +## Test 218 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `functionapp_get` **Prompt:** Retrieve host name and status of function app @@ -4094,14 +7819,34 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.592791 | `functionapp_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.417779 | `resourcehealth_availability-status_get` | ❌ | | 3 | 0.409487 | `deploy_app_logs_get` | ❌ | | 4 | 0.399953 | `storage_account_get` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.417817 | `resourcehealth_availability-status_get` | ❌ | 
+| 3 | 0.409712 | `deploy_app_logs_get` | ❌ | +| 4 | 0.399896 | `storage_account_get` | ❌ | +======= +| 2 | 0.417634 | `resourcehealth_availability-status_get` | ❌ | +| 3 | 0.409712 | `deploy_app_logs_get` | ❌ | +| 4 | 0.400049 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.392237 | `applens_resource_diagnose` | ❌ | --- +<<<<<<< HEAD ## Test 214 +======= +<<<<<<< HEAD +## Test 209 +======= +## Test 219 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `functionapp_get` **Prompt:** Show function app details for in @@ -4114,11 +7859,27 @@ | 2 | 0.449033 | `deploy_app_logs_get` | ❌ | | 3 | 0.428689 | `applens_resource_diagnose` | ❌ | | 4 | 0.424686 | `foundry_resource_get` | ❌ | +<<<<<<< HEAD | 5 | 0.391781 | `monitor_webtests_get` | ❌ | --- ## Test 215 +======= +<<<<<<< HEAD +| 5 | 0.392451 | `monitor_webtests_get` | ❌ | + +--- + +## Test 210 +======= +| 5 | 0.391781 | `monitor_webtests_get` | ❌ | + +--- + +## Test 220 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `functionapp_get` **Prompt:** Show me the details for the function app @@ -4128,14 +7889,32 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.644882 | `functionapp_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.429692 | `deploy_app_logs_get` | ❌ | | 3 | 0.421082 | `storage_account_get` | ❌ | | 4 | 0.403261 | `signalr_runtime_get` | ❌ | +======= +| 2 | 0.430189 | `deploy_app_logs_get` | ❌ | +<<<<<<< HEAD +| 3 | 0.421127 | `storage_account_get` | ❌ | +======= +| 3 | 0.421155 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.403311 | `signalr_runtime_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool 
description evaluator) | 5 | 0.391615 | `foundry_resource_get` | ❌ | --- +<<<<<<< HEAD ## Test 216 +======= +<<<<<<< HEAD +## Test 211 +======= +## Test 221 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `functionapp_get` **Prompt:** Show plan and region for function app @@ -4145,14 +7924,31 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.554980 | `functionapp_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.426921 | `quota_usage_check` | ❌ | | 3 | 0.424062 | `deploy_app_logs_get` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.426976 | `quota_usage_check` | ❌ | +======= +| 2 | 0.426703 | `quota_usage_check` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.424610 | `deploy_app_logs_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.408011 | `deploy_plan_get` | ❌ | | 5 | 0.381629 | `deploy_architecture_diagram_generate` | ❌ | --- +<<<<<<< HEAD ## Test 217 +======= +<<<<<<< HEAD +## Test 212 +======= +## Test 222 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `functionapp_get` **Prompt:** What is the status of function app ? 
@@ -4165,11 +7961,27 @@ | 2 | 0.403246 | `deploy_app_logs_get` | ❌ | | 3 | 0.384159 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.369868 | `applens_resource_diagnose` | ❌ | +<<<<<<< HEAD | 5 | 0.354912 | `resourcehealth_availability-status_get` | ❌ | --- ## Test 218 +======= +<<<<<<< HEAD +| 5 | 0.355044 | `resourcehealth_availability-status_get` | ❌ | + +--- + +## Test 213 +======= +| 5 | 0.352966 | `resourcehealth_availability-status_get` | ❌ | + +--- + +## Test 223 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `functionapp_get` **Prompt:** List all function apps in my subscription @@ -4179,14 +7991,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.646561 | `functionapp_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.557549 | `search_service_list` | ❌ | | 3 | 0.534936 | `subscription_list` | ❌ | +======= +| 2 | 0.559382 | `search_service_list` | ❌ | +<<<<<<< HEAD +| 3 | 0.534935 | `subscription_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.529031 | `kusto_cluster_list` | ❌ | | 5 | 0.516618 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD ## Test 219 +======= +## Test 214 +======= +| 3 | 0.534930 | `subscription_list` | ❌ | +| 4 | 0.528892 | `kusto_cluster_list` | ❌ | +| 5 | 0.516664 | `cosmos_account_list` | ❌ | + +--- + +## Test 224 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `functionapp_get` **Prompt:** Show me my Azure function apps @@ -4196,14 +8027,34 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.560249 | `functionapp_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.464637 | `deploy_app_logs_get` | ❌ | | 3 | 0.411323 | `get_bestpractices_get` | ❌ | | 4 | 0.410461 | `search_service_list` | ❌ | +======= +| 2 | 0.464985 | `deploy_app_logs_get` | ❌ 
| +<<<<<<< HEAD +| 3 | 0.412646 | `search_service_list` | ❌ | +| 4 | 0.411323 | `get_bestpractices_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.398503 | `extension_cli_install` | ❌ | --- +<<<<<<< HEAD ## Test 220 +======= +## Test 215 +======= +| 3 | 0.436167 | `foundry_agents_list` | ❌ | +| 4 | 0.413594 | `get_bestpractices_get` | ❌ | +| 5 | 0.412646 | `search_service_list` | ❌ | + +--- + +## Test 225 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `functionapp_get` **Prompt:** What function apps do I have? @@ -4216,11 +8067,19 @@ | 2 | 0.346031 | `deploy_app_logs_get` | ❌ | | 3 | 0.337966 | `applens_resource_diagnose` | ❌ | | 4 | 0.316594 | `extension_cli_install` | ❌ | -| 5 | 0.284362 | `get_bestpractices_get` | ❌ | +| 5 | 0.286490 | `get_bestpractices_get` | ❌ | --- +<<<<<<< HEAD ## Test 221 +======= +<<<<<<< HEAD +## Test 216 +======= +## Test 226 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_admin_settings_get` **Prompt:** Get the account settings for my key vault @@ -4229,6 +8088,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.604780 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | | 2 | 0.532196 | `storage_account_get` | ❌ | | 3 | 0.496042 | `keyvault_key_get` | ❌ | @@ -4238,6 +8098,26 @@ --- ## Test 222 +======= +<<<<<<< HEAD +| 1 | 0.604797 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | +| 2 | 0.532029 | `storage_account_get` | ❌ | +======= +| 1 | 0.604780 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | +| 2 | 0.532169 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.496629 | `keyvault_key_get` | ❌ | +| 4 | 0.452366 | `appconfig_kv_set` | ❌ | +| 5 | 0.448039 | `keyvault_secret_get` | ❌ | + +--- 
+ +<<<<<<< HEAD +## Test 217 +======= +## Test 227 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_admin_settings_get` **Prompt:** Show me the account settings for managed HSM keyvault @@ -4246,15 +8126,34 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.671370 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | | 2 | 0.455561 | `storage_account_get` | ❌ | | 3 | 0.440966 | `keyvault_key_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.671368 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | +| 2 | 0.455516 | `storage_account_get` | ❌ | +======= +| 1 | 0.671370 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | +| 2 | 0.455526 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.441225 | `keyvault_key_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.404666 | `appconfig_kv_set` | ❌ | | 5 | 0.395449 | `keyvault_secret_get` | ❌ | --- +<<<<<<< HEAD ## Test 223 +======= +<<<<<<< HEAD +## Test 218 +======= +## Test 228 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_admin_settings_get` **Prompt:** What's the value of the setting in my key vault with name @@ -4263,6 +8162,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.505709 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | | 2 | 0.496565 | `appconfig_kv_set` | ❌ | | 3 | 0.420067 | `appconfig_kv_lock_set` | ❌ | @@ -4272,6 +8172,25 @@ --- ## Test 224 +======= +<<<<<<< HEAD +| 1 | 0.505731 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | +======= +| 1 | 0.505750 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.496540 | `appconfig_kv_set` | ❌ | +| 3 | 
0.420145 | `appconfig_kv_lock_set` | ❌ | +| 4 | 0.419126 | `keyvault_key_get` | ❌ | +| 5 | 0.410215 | `keyvault_secret_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 219 +======= +## Test 229 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_create` **Prompt:** Create a new certificate called in the key vault @@ -4280,6 +8199,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.627727 | `keyvault_certificate_create` | ✅ **EXPECTED** | | 2 | 0.570319 | `keyvault_certificate_import` | ❌ | | 3 | 0.540199 | `keyvault_key_create` | ❌ | @@ -4289,6 +8209,29 @@ --- ## Test 225 +======= +<<<<<<< HEAD +| 1 | 0.627882 | `keyvault_certificate_create` | ✅ **EXPECTED** | +| 2 | 0.570708 | `keyvault_certificate_import` | ❌ | +| 3 | 0.540476 | `keyvault_key_create` | ❌ | +| 4 | 0.519268 | `keyvault_certificate_get` | ❌ | +| 5 | 0.500093 | `keyvault_certificate_list` | ❌ | + +--- + +## Test 220 +======= +| 1 | 0.627727 | `keyvault_certificate_create` | ✅ **EXPECTED** | +| 2 | 0.570398 | `keyvault_certificate_import` | ❌ | +| 3 | 0.540199 | `keyvault_key_create` | ❌ | +| 4 | 0.519218 | `keyvault_certificate_get` | ❌ | +| 5 | 0.500027 | `keyvault_certificate_list` | ❌ | + +--- + +## Test 230 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_create` **Prompt:** Generate a certificate named in key vault @@ -4297,6 +8240,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.599548 | `keyvault_certificate_create` | ✅ **EXPECTED** | | 2 | 0.561717 | `keyvault_certificate_import` | ❌ | | 3 | 0.521910 | `keyvault_certificate_get` | ❌ | @@ -4306,6 +8250,21 @@ --- ## Test 226 +======= +| 1 | 0.599990 | `keyvault_certificate_create` | ✅ **EXPECTED** | +| 2 | 0.561458 | 
`keyvault_certificate_import` | ❌ | +| 3 | 0.522706 | `keyvault_certificate_get` | ❌ | +| 4 | 0.502128 | `keyvault_key_create` | ❌ | +| 5 | 0.497145 | `keyvault_certificate_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 221 +======= +## Test 231 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_create` **Prompt:** Request creation of certificate in the key vault @@ -4314,6 +8273,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.573998 | `keyvault_certificate_create` | ✅ **EXPECTED** | | 2 | 0.527759 | `keyvault_certificate_import` | ❌ | | 3 | 0.498278 | `keyvault_certificate_get` | ❌ | @@ -4323,6 +8283,29 @@ --- ## Test 227 +======= +<<<<<<< HEAD +| 1 | 0.574040 | `keyvault_certificate_create` | ✅ **EXPECTED** | +| 2 | 0.527743 | `keyvault_certificate_import` | ❌ | +| 3 | 0.498226 | `keyvault_certificate_get` | ❌ | +| 4 | 0.481666 | `keyvault_key_create` | ❌ | +| 5 | 0.469651 | `keyvault_certificate_list` | ❌ | + +--- + +## Test 222 +======= +| 1 | 0.573998 | `keyvault_certificate_create` | ✅ **EXPECTED** | +| 2 | 0.527813 | `keyvault_certificate_import` | ❌ | +| 3 | 0.498278 | `keyvault_certificate_get` | ❌ | +| 4 | 0.481548 | `keyvault_key_create` | ❌ | +| 5 | 0.469601 | `keyvault_certificate_list` | ❌ | + +--- + +## Test 232 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_create` **Prompt:** Provision a new key vault certificate in vault @@ -4332,14 +8315,22 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.591697 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.562265 | `keyvault_certificate_import` | ❌ | +| 2 | 0.562234 | `keyvault_certificate_import` | ❌ | | 3 | 0.522147 | `keyvault_certificate_get` | ❌ | | 4 | 0.502529 | 
`keyvault_key_create` | ❌ | | 5 | 0.479992 | `keyvault_certificate_list` | ❌ | --- +<<<<<<< HEAD ## Test 228 +======= +<<<<<<< HEAD +## Test 223 +======= +## Test 233 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_create` **Prompt:** Issue a certificate in key vault @@ -4349,14 +8340,22 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.622788 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.558532 | `keyvault_certificate_import` | ❌ | +| 2 | 0.558533 | `keyvault_certificate_import` | ❌ | | 3 | 0.534503 | `keyvault_certificate_get` | ❌ | | 4 | 0.521316 | `keyvault_certificate_list` | ❌ | | 5 | 0.465056 | `keyvault_key_create` | ❌ | --- +<<<<<<< HEAD ## Test 229 +======= +<<<<<<< HEAD +## Test 224 +======= +## Test 234 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_get` **Prompt:** Show me the certificate in the key vault @@ -4367,13 +8366,25 @@ |------|-------|------|--------| | 1 | 0.600625 | `keyvault_certificate_get` | ✅ **EXPECTED** | | 2 | 0.528405 | `keyvault_certificate_list` | ❌ | +<<<<<<< HEAD | 3 | 0.519037 | `keyvault_certificate_import` | ❌ | +======= +| 3 | 0.518919 | `keyvault_certificate_import` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.499293 | `keyvault_certificate_create` | ❌ | | 5 | 0.487691 | `keyvault_key_get` | ❌ | --- +<<<<<<< HEAD ## Test 230 +======= +<<<<<<< HEAD +## Test 225 +======= +## Test 235 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_get` **Prompt:** Show me the details of the certificate in the key vault @@ -4386,11 +8397,23 @@ | 2 | 0.563263 | `keyvault_key_get` | ❌ | | 3 | 0.514499 
| `keyvault_secret_get` | ❌ | | 4 | 0.509446 | `keyvault_certificate_list` | ❌ | +<<<<<<< HEAD | 5 | 0.507738 | `keyvault_certificate_import` | ❌ | --- ## Test 231 +======= +| 5 | 0.507630 | `keyvault_certificate_import` | ❌ | + +--- + +<<<<<<< HEAD +## Test 226 +======= +## Test 236 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_get` **Prompt:** Get the certificate from vault @@ -4402,12 +8425,25 @@ | 1 | 0.609523 | `keyvault_certificate_get` | ✅ **EXPECTED** | | 2 | 0.515570 | `keyvault_certificate_list` | ❌ | | 3 | 0.511197 | `keyvault_certificate_create` | ❌ | +<<<<<<< HEAD | 4 | 0.507768 | `keyvault_certificate_import` | ❌ | | 5 | 0.475674 | `keyvault_key_get` | ❌ | --- ## Test 232 +======= +| 4 | 0.507693 | `keyvault_certificate_import` | ❌ | +| 5 | 0.474394 | `keyvault_key_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 227 +======= +## Test 237 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_get` **Prompt:** Display the certificate details for in vault @@ -4416,6 +8452,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.647669 | `keyvault_certificate_get` | ✅ **EXPECTED** | | 2 | 0.528243 | `keyvault_key_get` | ❌ | | 3 | 0.521556 | `keyvault_certificate_list` | ❌ | @@ -4424,7 +8461,22 @@ --- +<<<<<<< HEAD ## Test 233 +======= +## Test 228 +======= +| 1 | 0.647626 | `keyvault_certificate_get` | ✅ **EXPECTED** | +| 2 | 0.527284 | `keyvault_key_get` | ❌ | +| 3 | 0.521689 | `keyvault_certificate_list` | ❌ | +| 4 | 0.509907 | `keyvault_certificate_import` | ❌ | +| 5 | 0.501942 | `keyvault_secret_get` | ❌ | + +--- + +## Test 238 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** 
`keyvault_certificate_get` **Prompt:** Retrieve certificate metadata for in vault @@ -4433,6 +8485,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.595959 | `keyvault_certificate_get` | ✅ **EXPECTED** | | 2 | 0.527404 | `keyvault_certificate_list` | ❌ | | 3 | 0.519059 | `keyvault_certificate_import` | ❌ | @@ -4442,6 +8495,29 @@ --- ## Test 234 +======= +<<<<<<< HEAD +| 1 | 0.595902 | `keyvault_certificate_get` | ✅ **EXPECTED** | +| 2 | 0.527167 | `keyvault_certificate_list` | ❌ | +| 3 | 0.518836 | `keyvault_certificate_import` | ❌ | +| 4 | 0.500932 | `keyvault_certificate_create` | ❌ | +| 5 | 0.465265 | `keyvault_key_get` | ❌ | + +--- + +## Test 229 +======= +| 1 | 0.595959 | `keyvault_certificate_get` | ✅ **EXPECTED** | +| 2 | 0.527404 | `keyvault_certificate_list` | ❌ | +| 3 | 0.518970 | `keyvault_certificate_import` | ❌ | +| 4 | 0.501138 | `keyvault_certificate_create` | ❌ | +| 5 | 0.465174 | `keyvault_key_get` | ❌ | + +--- + +## Test 239 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_import` **Prompt:** Import the certificate in file into the key vault @@ -4450,6 +8526,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.585481 | `keyvault_certificate_import` | ✅ **EXPECTED** | | 2 | 0.420747 | `keyvault_certificate_get` | ❌ | | 3 | 0.402595 | `keyvault_certificate_create` | ❌ | @@ -4459,6 +8536,29 @@ --- ## Test 235 +======= +<<<<<<< HEAD +| 1 | 0.585549 | `keyvault_certificate_import` | ✅ **EXPECTED** | +| 2 | 0.420798 | `keyvault_certificate_get` | ❌ | +| 3 | 0.402853 | `keyvault_certificate_create` | ❌ | +| 4 | 0.399353 | `keyvault_certificate_list` | ❌ | +| 5 | 0.353196 | `keyvault_key_create` | ❌ | + +--- + +## Test 230 +======= +| 1 | 0.585374 | `keyvault_certificate_import` | ✅ **EXPECTED** | +| 2 | 0.420747 | `keyvault_certificate_get` | ❌ | +| 3 
| 0.402595 | `keyvault_certificate_create` | ❌ | +| 4 | 0.399342 | `keyvault_certificate_list` | ❌ | +| 5 | 0.352905 | `keyvault_key_create` | ❌ | + +--- + +## Test 240 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_import` **Prompt:** Import a certificate into the key vault using the name @@ -4467,6 +8567,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.622125 | `keyvault_certificate_import` | ✅ **EXPECTED** | | 2 | 0.504314 | `keyvault_certificate_get` | ❌ | | 3 | 0.498847 | `keyvault_certificate_create` | ❌ | @@ -4475,7 +8576,22 @@ --- +<<<<<<< HEAD ## Test 236 +======= +## Test 231 +======= +| 1 | 0.622168 | `keyvault_certificate_import` | ✅ **EXPECTED** | +| 2 | 0.504306 | `keyvault_certificate_get` | ❌ | +| 3 | 0.498841 | `keyvault_certificate_create` | ❌ | +| 4 | 0.448114 | `keyvault_certificate_list` | ❌ | +| 5 | 0.419794 | `keyvault_key_create` | ❌ | + +--- + +## Test 241 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_import` **Prompt:** Upload certificate file to key vault @@ -4484,6 +8600,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.595707 | `keyvault_certificate_import` | ✅ **EXPECTED** | | 2 | 0.453929 | `keyvault_certificate_create` | ❌ | | 3 | 0.452551 | `keyvault_certificate_get` | ❌ | @@ -4492,7 +8609,22 @@ --- +<<<<<<< HEAD ## Test 237 +======= +## Test 232 +======= +| 1 | 0.594990 | `keyvault_certificate_import` | ✅ **EXPECTED** | +| 2 | 0.453726 | `keyvault_certificate_create` | ❌ | +| 3 | 0.452165 | `keyvault_certificate_get` | ❌ | +| 4 | 0.418142 | `keyvault_certificate_list` | ❌ | +| 5 | 0.413240 | `keyvault_key_create` | ❌ | + +--- + +## Test 242 +>>>>>>> 84ad4f44 (update prompts and tool description 
evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_import` **Prompt:** Load certificate from file into vault @@ -4501,7 +8633,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.619480 | `keyvault_certificate_import` | ✅ **EXPECTED** | +| 1 | 0.619385 | `keyvault_certificate_import` | ✅ **EXPECTED** | | 2 | 0.517804 | `keyvault_certificate_get` | ❌ | | 3 | 0.480815 | `keyvault_certificate_create` | ❌ | | 4 | 0.444386 | `keyvault_certificate_list` | ❌ | @@ -4509,7 +8641,15 @@ --- +<<<<<<< HEAD ## Test 238 +======= +<<<<<<< HEAD +## Test 233 +======= +## Test 243 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_import` **Prompt:** Add existing certificate file to the key vault with name @@ -4518,15 +8658,36 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.595418 | `keyvault_certificate_import` | ✅ **EXPECTED** | | 2 | 0.452490 | `keyvault_certificate_create` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.595417 | `keyvault_certificate_import` | ✅ **EXPECTED** | +| 2 | 0.452489 | `keyvault_certificate_create` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.441616 | `keyvault_certificate_get` | ❌ | | 4 | 0.408018 | `keyvault_key_create` | ❌ | | 5 | 0.392244 | `keyvault_secret_create` | ❌ | --- +<<<<<<< HEAD ## Test 239 +======= +## Test 234 +======= +| 1 | 0.595426 | `keyvault_certificate_import` | ✅ **EXPECTED** | +| 2 | 0.452531 | `keyvault_certificate_create` | ❌ | +| 3 | 0.441676 | `keyvault_certificate_get` | ❌ | +| 4 | 0.408033 | `keyvault_key_create` | ❌ | +| 5 | 0.392316 | `keyvault_secret_create` | ❌ | + +--- + +## Test 244 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** 
`keyvault_certificate_list` **Prompt:** List all certificates in the key vault @@ -4536,14 +8697,30 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.726124 | `keyvault_certificate_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.583110 | `keyvault_key_list` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.583138 | `keyvault_key_list` | ❌ | +======= +| 2 | 0.583079 | `keyvault_key_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.531988 | `keyvault_secret_list` | ❌ | | 4 | 0.515236 | `keyvault_certificate_get` | ❌ | | 5 | 0.485792 | `keyvault_certificate_create` | ❌ | --- +<<<<<<< HEAD ## Test 240 +======= +<<<<<<< HEAD +## Test 235 +======= +## Test 245 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_list` **Prompt:** Show me the certificates in the key vault @@ -4554,13 +8731,29 @@ |------|-------|------|--------| | 1 | 0.615541 | `keyvault_certificate_list` | ✅ **EXPECTED** | | 2 | 0.522453 | `keyvault_certificate_get` | ❌ | +<<<<<<< HEAD | 3 | 0.475156 | `keyvault_key_list` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.475197 | `keyvault_key_list` | ❌ | +======= +| 3 | 0.475142 | `keyvault_key_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.460973 | `keyvault_certificate_create` | ❌ | | 5 | 0.449381 | `keyvault_key_get` | ❌ | --- +<<<<<<< HEAD ## Test 241 +======= +<<<<<<< HEAD +## Test 236 +======= +## Test 246 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_list` **Prompt:** What certificates are in the key vault ? 
@@ -4572,12 +8765,25 @@ | 1 | 0.624710 | `keyvault_certificate_list` | ✅ **EXPECTED** | | 2 | 0.519739 | `keyvault_certificate_get` | ❌ | | 3 | 0.510048 | `keyvault_certificate_create` | ❌ | +<<<<<<< HEAD | 4 | 0.505534 | `keyvault_certificate_import` | ❌ | | 5 | 0.497356 | `keyvault_key_list` | ❌ | --- +<<<<<<< HEAD ## Test 242 +======= +## Test 237 +======= +| 4 | 0.505367 | `keyvault_certificate_import` | ❌ | +| 5 | 0.497322 | `keyvault_key_list` | ❌ | + +--- + +## Test 247 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_list` **Prompt:** List certificate names in vault @@ -4587,14 +8793,30 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.672622 | `keyvault_certificate_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.553990 | `keyvault_key_list` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.554016 | `keyvault_key_list` | ❌ | +======= +| 2 | 0.553960 | `keyvault_key_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.511905 | `keyvault_secret_list` | ❌ | | 4 | 0.507062 | `keyvault_certificate_get` | ❌ | | 5 | 0.492357 | `keyvault_certificate_create` | ❌ | --- +<<<<<<< HEAD ## Test 243 +======= +<<<<<<< HEAD +## Test 238 +======= +## Test 248 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_certificate_list` **Prompt:** Enumerate certificates in key vault @@ -4603,16 +8825,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.747408 | `keyvault_certificate_list` | ✅ **EXPECTED** | | 2 | 0.594216 | `keyvault_key_list` | ❌ | +======= +| 1 | 0.747407 | `keyvault_certificate_list` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.594268 | `keyvault_key_list` | ❌ | +======= +| 2 | 0.594121 
| `keyvault_key_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.558771 | `keyvault_secret_list` | ❌ | | 4 | 0.515568 | `keyvault_certificate_get` | ❌ | | 5 | 0.490876 | `keyvault_certificate_create` | ❌ | --- +<<<<<<< HEAD ## Test 244 - +======= +<<<<<<< HEAD +## Test 239 +======= +## Test 249 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) + **Expected Tool:** `keyvault_certificate_list` **Prompt:** Show certificate names in the key vault @@ -4622,13 +8861,29 @@ |------|-------|------|--------| | 1 | 0.639711 | `keyvault_certificate_list` | ✅ **EXPECTED** | | 2 | 0.512475 | `keyvault_certificate_get` | ❌ | +<<<<<<< HEAD | 3 | 0.507572 | `keyvault_key_list` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.507603 | `keyvault_key_list` | ❌ | +======= +| 3 | 0.507562 | `keyvault_key_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.482583 | `keyvault_certificate_create` | ❌ | | 5 | 0.464725 | `keyvault_secret_list` | ❌ | --- +<<<<<<< HEAD ## Test 245 +======= +<<<<<<< HEAD +## Test 240 +======= +## Test 250 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_create` **Prompt:** Create a new key called with the RSA type in the key vault @@ -4637,6 +8892,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.661466 | `keyvault_key_create` | ✅ **EXPECTED** | | 2 | 0.456580 | `keyvault_secret_create` | ❌ | | 3 | 0.451790 | `keyvault_certificate_create` | ❌ | @@ -4645,7 +8901,22 @@ --- +<<<<<<< HEAD ## Test 246 +======= +## Test 241 +======= +| 1 | 0.661548 | `keyvault_key_create` | ✅ **EXPECTED** | +| 2 | 0.456628 | `keyvault_secret_create` | ❌ | +| 
3 | 0.451826 | `keyvault_certificate_create` | ❌ | +| 4 | 0.429537 | `keyvault_certificate_import` | ❌ | +| 5 | 0.399324 | `keyvault_key_get` | ❌ | + +--- + +## Test 251 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_create` **Prompt:** Generate a key with type in vault @@ -4654,6 +8925,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.641070 | `keyvault_key_create` | ✅ **EXPECTED** | | 2 | 0.428964 | `keyvault_key_get` | ❌ | | 3 | 0.422763 | `keyvault_certificate_create` | ❌ | @@ -4663,6 +8935,29 @@ --- ## Test 247 +======= +<<<<<<< HEAD +| 1 | 0.641022 | `keyvault_key_create` | ✅ **EXPECTED** | +| 2 | 0.428461 | `keyvault_key_get` | ❌ | +| 3 | 0.422686 | `keyvault_certificate_create` | ❌ | +| 4 | 0.419964 | `keyvault_secret_create` | ❌ | +| 5 | 0.405612 | `appconfig_kv_set` | ❌ | + +--- + +## Test 242 +======= +| 1 | 0.641070 | `keyvault_key_create` | ✅ **EXPECTED** | +| 2 | 0.428502 | `keyvault_key_get` | ❌ | +| 3 | 0.422763 | `keyvault_certificate_create` | ❌ | +| 4 | 0.420045 | `keyvault_secret_create` | ❌ | +| 5 | 0.405644 | `appconfig_kv_set` | ❌ | + +--- + +## Test 252 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_create` **Prompt:** Create an oct key in the vault @@ -4671,6 +8966,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.547493 | `keyvault_key_create` | ✅ **EXPECTED** | | 2 | 0.463557 | `keyvault_secret_create` | ❌ | | 3 | 0.447410 | `keyvault_certificate_create` | ❌ | @@ -4680,6 +8976,29 @@ --- ## Test 248 +======= +<<<<<<< HEAD +| 1 | 0.548424 | `keyvault_key_create` | ✅ **EXPECTED** | +| 2 | 0.464221 | `keyvault_secret_create` | ❌ | +| 3 | 0.448379 | `keyvault_certificate_create` | ❌ | +| 4 | 0.421467 | `keyvault_key_get` | ❌ | 
+| 5 | 0.405195 | `keyvault_certificate_import` | ❌ | + +--- + +## Test 243 +======= +| 1 | 0.547493 | `keyvault_key_create` | ✅ **EXPECTED** | +| 2 | 0.463557 | `keyvault_secret_create` | ❌ | +| 3 | 0.447410 | `keyvault_certificate_create` | ❌ | +| 4 | 0.420366 | `keyvault_key_get` | ❌ | +| 5 | 0.404180 | `keyvault_certificate_import` | ❌ | + +--- + +## Test 253 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_create` **Prompt:** Create an RSA key in the vault with name @@ -4688,6 +9007,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.641369 | `keyvault_key_create` | ✅ **EXPECTED** | | 2 | 0.501636 | `keyvault_secret_create` | ❌ | | 3 | 0.491735 | `keyvault_certificate_create` | ❌ | @@ -4696,7 +9016,22 @@ --- +<<<<<<< HEAD ## Test 249 +======= +## Test 244 +======= +| 1 | 0.640853 | `keyvault_key_create` | ✅ **EXPECTED** | +| 2 | 0.500742 | `keyvault_secret_create` | ❌ | +| 3 | 0.491071 | `keyvault_certificate_create` | ❌ | +| 4 | 0.463536 | `keyvault_certificate_import` | ❌ | +| 5 | 0.450448 | `keyvault_key_get` | ❌ | + +--- + +## Test 254 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_create` **Prompt:** Create an EC key with name in the vault @@ -4705,6 +9040,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.571793 | `keyvault_key_create` | ✅ **EXPECTED** | | 2 | 0.443085 | `keyvault_certificate_create` | ❌ | | 3 | 0.434697 | `keyvault_secret_create` | ❌ | @@ -4714,6 +9050,25 @@ --- ## Test 250 +======= +| 1 | 0.571718 | `keyvault_key_create` | ✅ **EXPECTED** | +| 2 | 0.443369 | `keyvault_certificate_create` | ❌ | +| 3 | 0.434675 | `keyvault_secret_create` | ❌ | +| 4 | 0.421721 | `keyvault_key_get` | ❌ | +<<<<<<< HEAD +| 5 | 0.400533 | 
`keyvault_certificate_import` | ❌ | + +--- + +## Test 245 +======= +| 5 | 0.400433 | `keyvault_certificate_import` | ❌ | + +--- + +## Test 255 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_get` **Prompt:** Show me the key in the key vault @@ -4725,12 +9080,28 @@ | 1 | 0.550225 | `keyvault_key_get` | ✅ **EXPECTED** | | 2 | 0.468243 | `keyvault_secret_get` | ❌ | | 3 | 0.452816 | `keyvault_key_create` | ❌ | +<<<<<<< HEAD | 4 | 0.439969 | `keyvault_key_list` | ❌ | +======= +<<<<<<< HEAD +| 4 | 0.440015 | `keyvault_key_list` | ❌ | +======= +| 4 | 0.439941 | `keyvault_key_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.426545 | `keyvault_certificate_get` | ❌ | --- +<<<<<<< HEAD ## Test 251 +======= +<<<<<<< HEAD +## Test 246 +======= +## Test 256 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_get` **Prompt:** Show me the details of the key in the key vault @@ -4739,15 +9110,37 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.629372 | `keyvault_key_get` | ✅ **EXPECTED** | | 2 | 0.532872 | `keyvault_secret_get` | ❌ | | 3 | 0.512278 | `storage_account_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.629552 | `keyvault_key_get` | ✅ **EXPECTED** | +| 2 | 0.532651 | `keyvault_secret_get` | ❌ | +| 3 | 0.512106 | `storage_account_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.495957 | `keyvault_certificate_get` | ❌ | | 5 | 0.456992 | `keyvault_key_create` | ❌ | --- +<<<<<<< HEAD ## Test 252 +======= +## Test 247 +======= +| 1 | 0.629579 | `keyvault_key_get` | ✅ **EXPECTED** | +| 2 | 0.532628 | `keyvault_secret_get` | ❌ | +| 3 | 0.512235 | `storage_account_get` | ❌ | +| 4 
| 0.496014 | `keyvault_certificate_get` | ❌ | +| 5 | 0.457056 | `keyvault_key_create` | ❌ | + +--- + +## Test 257 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_get` **Prompt:** Get the key from vault @@ -4764,7 +9157,15 @@ --- +<<<<<<< HEAD ## Test 253 +======= +<<<<<<< HEAD +## Test 248 +======= +## Test 258 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_get` **Prompt:** Display the key details for in vault @@ -4773,15 +9174,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.590297 | `keyvault_key_get` | ✅ **EXPECTED** | | 2 | 0.488574 | `keyvault_secret_get` | ❌ | | 3 | 0.476498 | `storage_account_get` | ❌ | +======= +| 1 | 0.590303 | `keyvault_key_get` | ✅ **EXPECTED** | +| 2 | 0.488213 | `keyvault_secret_get` | ❌ | +<<<<<<< HEAD +| 3 | 0.476278 | `storage_account_get` | ❌ | +======= +| 3 | 0.476529 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.460796 | `keyvault_certificate_get` | ❌ | | 5 | 0.436511 | `keyvault_admin_settings_get` | ❌ | --- +<<<<<<< HEAD ## Test 254 +======= +<<<<<<< HEAD +## Test 249 +======= +## Test 259 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_get` **Prompt:** Retrieve key metadata for in vault @@ -4790,15 +9209,34 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.518346 | `keyvault_key_get` | ✅ **EXPECTED** | | 2 | 0.432950 | `storage_account_get` | ❌ | | 3 | 0.432742 | `keyvault_admin_settings_get` | ❌ | +======= +| 1 | 0.518886 | `keyvault_key_get` | ✅ **EXPECTED** | +<<<<<<< HEAD 
+| 2 | 0.432731 | `keyvault_admin_settings_get` | ❌ | +| 3 | 0.432677 | `storage_account_get` | ❌ | +======= +| 2 | 0.432980 | `storage_account_get` | ❌ | +| 3 | 0.432742 | `keyvault_admin_settings_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.429131 | `keyvault_key_create` | ❌ | | 5 | 0.422731 | `keyvault_secret_get` | ❌ | --- +<<<<<<< HEAD ## Test 255 +======= +<<<<<<< HEAD +## Test 250 +======= +## Test 260 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_list` **Prompt:** List all keys in the key vault @@ -4807,7 +9245,12 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.701448 | `keyvault_key_list` | ✅ **EXPECTED** | +======= +<<<<<<< HEAD +| 1 | 0.701474 | `keyvault_key_list` | ✅ **EXPECTED** | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.601513 | `keyvault_certificate_list` | ❌ | | 3 | 0.587427 | `keyvault_secret_list` | ❌ | | 4 | 0.498767 | `cosmos_account_list` | ❌ | @@ -4815,7 +9258,22 @@ --- +<<<<<<< HEAD ## Test 256 +======= +## Test 251 +======= +| 1 | 0.701420 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.601513 | `keyvault_certificate_list` | ❌ | +| 3 | 0.587427 | `keyvault_secret_list` | ❌ | +| 4 | 0.498750 | `cosmos_account_list` | ❌ | +| 5 | 0.480129 | `keyvault_admin_settings_get` | ❌ | + +--- + +## Test 261 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_list` **Prompt:** Show me the keys in the key vault @@ -4824,6 +9282,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.549453 | `keyvault_key_list` | ✅ **EXPECTED** | | 2 | 0.507865 | `keyvault_key_get` | ❌ | | 3 | 0.475507 | 
`keyvault_certificate_list` | ❌ | @@ -4833,6 +9292,28 @@ --- ## Test 257 +======= +<<<<<<< HEAD +| 1 | 0.549498 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.506815 | `keyvault_key_get` | ❌ | +| 3 | 0.475507 | `keyvault_certificate_list` | ❌ | +| 4 | 0.472457 | `keyvault_admin_settings_get` | ❌ | +======= +| 1 | 0.549442 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.506815 | `keyvault_key_get` | ❌ | +| 3 | 0.475507 | `keyvault_certificate_list` | ❌ | +| 4 | 0.472465 | `keyvault_admin_settings_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.455683 | `keyvault_secret_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 252 +======= +## Test 262 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_list` **Prompt:** What keys are in the key vault ? @@ -4841,15 +9322,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.581970 | `keyvault_key_list` | ✅ **EXPECTED** | | 2 | 0.502245 | `keyvault_admin_settings_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.582010 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.502252 | `keyvault_admin_settings_get` | ❌ | +======= +| 1 | 0.581948 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.502245 | `keyvault_admin_settings_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.501481 | `keyvault_certificate_list` | ❌ | | 4 | 0.477451 | `keyvault_key_get` | ❌ | | 5 | 0.472414 | `keyvault_secret_list` | ❌ | --- +<<<<<<< HEAD ## Test 258 +======= +<<<<<<< HEAD +## Test 253 +======= +## Test 263 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_list` **Prompt:** List key names in vault @@ -4858,7 +9357,12 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.641314 | `keyvault_key_list` | ✅ **EXPECTED** | +======= +<<<<<<< HEAD +| 1 | 0.641339 | `keyvault_key_list` | ✅ **EXPECTED** | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.559550 | `keyvault_certificate_list` | ❌ | | 3 | 0.553553 | `keyvault_secret_list` | ❌ | | 4 | 0.486377 | `keyvault_admin_settings_get` | ❌ | @@ -4866,7 +9370,22 @@ --- +<<<<<<< HEAD ## Test 259 +======= +## Test 254 +======= +| 1 | 0.641210 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.559476 | `keyvault_certificate_list` | ❌ | +| 3 | 0.553501 | `keyvault_secret_list` | ❌ | +| 4 | 0.486377 | `keyvault_admin_settings_get` | ❌ | +| 5 | 0.475945 | `cosmos_account_list` | ❌ | + +--- + +## Test 264 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_key_list` **Prompt:** Enumerate keys in key vault @@ -4875,6 +9394,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.723266 | `keyvault_key_list` | ✅ **EXPECTED** | | 2 | 0.611366 | `keyvault_certificate_list` | ❌ | | 3 | 0.611185 | `keyvault_secret_list` | ❌ | @@ -4884,6 +9404,28 @@ --- ## Test 260 +======= +<<<<<<< HEAD +| 1 | 0.723318 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.611366 | `keyvault_certificate_list` | ❌ | +| 3 | 0.611185 | `keyvault_secret_list` | ❌ | +| 4 | 0.473874 | `keyvault_admin_settings_get` | ❌ | +======= +| 1 | 0.723171 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.611366 | `keyvault_certificate_list` | ❌ | +| 3 | 0.611185 | `keyvault_secret_list` | ❌ | +| 4 | 0.473886 | `keyvault_admin_settings_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.441881 | `keyvault_key_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 255 +======= +## Test 265 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool 
description evaluator) **Expected Tool:** `keyvault_key_list` **Prompt:** Show key names in the key vault @@ -4892,15 +9434,37 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.570444 | `keyvault_key_list` | ✅ **EXPECTED** | | 2 | 0.501953 | `keyvault_key_get` | ❌ | | 3 | 0.500103 | `keyvault_certificate_list` | ❌ | | 4 | 0.496817 | `storage_account_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.570489 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.501073 | `keyvault_key_get` | ❌ | +| 3 | 0.500103 | `keyvault_certificate_list` | ❌ | +| 4 | 0.496907 | `storage_account_get` | ❌ | +======= +| 1 | 0.570418 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.501073 | `keyvault_key_get` | ❌ | +| 3 | 0.500103 | `keyvault_certificate_list` | ❌ | +| 4 | 0.496837 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.490367 | `keyvault_secret_list` | ❌ | --- +<<<<<<< HEAD ## Test 261 +======= +<<<<<<< HEAD +## Test 256 +======= +## Test 266 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_secret_create` **Prompt:** Create a new secret called with value in the key vault @@ -4917,7 +9481,15 @@ --- +<<<<<<< HEAD ## Test 262 +======= +<<<<<<< HEAD +## Test 257 +======= +## Test 267 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_secret_create` **Prompt:** Set a secret named with value in key vault @@ -4934,7 +9506,15 @@ --- +<<<<<<< HEAD ## Test 263 +======= +<<<<<<< HEAD +## Test 258 +======= +## Test 268 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_secret_create` **Prompt:** Store secret 
value in the key vault @@ -4943,6 +9523,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.639897 | `keyvault_secret_create` | ✅ **EXPECTED** | | 2 | 0.509526 | `keyvault_secret_get` | ❌ | | 3 | 0.485203 | `appconfig_kv_set` | ❌ | @@ -4951,7 +9532,22 @@ --- +<<<<<<< HEAD ## Test 264 +======= +## Test 259 +======= +| 1 | 0.639804 | `keyvault_secret_create` | ✅ **EXPECTED** | +| 2 | 0.509509 | `keyvault_secret_get` | ❌ | +| 3 | 0.485174 | `appconfig_kv_set` | ❌ | +| 4 | 0.484391 | `keyvault_key_create` | ❌ | +| 5 | 0.449001 | `appconfig_kv_lock_set` | ❌ | + +--- + +## Test 269 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_secret_create` **Prompt:** Add a new version of secret with value in vault @@ -4960,6 +9556,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.675145 | `keyvault_secret_create` | ✅ **EXPECTED** | | 2 | 0.499276 | `keyvault_secret_get` | ❌ | | 3 | 0.498228 | `keyvault_key_create` | ❌ | @@ -4969,6 +9566,29 @@ --- ## Test 265 +======= +<<<<<<< HEAD +| 1 | 0.675147 | `keyvault_secret_create` | ✅ **EXPECTED** | +| 2 | 0.499602 | `keyvault_secret_get` | ❌ | +| 3 | 0.498196 | `keyvault_key_create` | ❌ | +| 4 | 0.479173 | `keyvault_certificate_import` | ❌ | +| 5 | 0.458587 | `appconfig_kv_set` | ❌ | + +--- + +## Test 260 +======= +| 1 | 0.675145 | `keyvault_secret_create` | ✅ **EXPECTED** | +| 2 | 0.499612 | `keyvault_secret_get` | ❌ | +| 3 | 0.498228 | `keyvault_key_create` | ❌ | +| 4 | 0.478700 | `keyvault_certificate_import` | ❌ | +| 5 | 0.458574 | `appconfig_kv_set` | ❌ | + +--- + +## Test 270 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_secret_create` **Prompt:** Update secret to value in the key vault @@ -4977,6 +9597,7 @@ | Rank | Score | 
Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.571597 | `keyvault_secret_create` | ✅ **EXPECTED** | | 2 | 0.513012 | `keyvault_secret_get` | ❌ | | 3 | 0.441198 | `appconfig_kv_set` | ❌ | @@ -4986,6 +9607,29 @@ --- ## Test 266 +======= +<<<<<<< HEAD +| 1 | 0.571716 | `keyvault_secret_create` | ✅ **EXPECTED** | +| 2 | 0.513963 | `keyvault_secret_get` | ❌ | +| 3 | 0.441281 | `appconfig_kv_set` | ❌ | +| 4 | 0.417998 | `appconfig_kv_lock_set` | ❌ | +| 5 | 0.408505 | `keyvault_key_get` | ❌ | + +--- + +## Test 261 +======= +| 1 | 0.571612 | `keyvault_secret_create` | ✅ **EXPECTED** | +| 2 | 0.513767 | `keyvault_secret_get` | ❌ | +| 3 | 0.441223 | `appconfig_kv_set` | ❌ | +| 4 | 0.417943 | `appconfig_kv_lock_set` | ❌ | +| 5 | 0.408242 | `keyvault_key_get` | ❌ | + +--- + +## Test 271 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_secret_get` **Prompt:** Show me the secret in the key vault @@ -4994,6 +9638,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.602686 | `keyvault_secret_get` | ✅ **EXPECTED** | | 2 | 0.505620 | `keyvault_key_get` | ❌ | | 3 | 0.501397 | `keyvault_secret_create` | ❌ | @@ -5003,6 +9648,29 @@ --- ## Test 267 +======= +<<<<<<< HEAD +| 1 | 0.605040 | `keyvault_secret_get` | ✅ **EXPECTED** | +| 2 | 0.504063 | `keyvault_key_get` | ❌ | +| 3 | 0.502826 | `keyvault_secret_create` | ❌ | +| 4 | 0.479767 | `keyvault_secret_list` | ❌ | +| 5 | 0.440063 | `keyvault_certificate_get` | ❌ | + +--- + +## Test 262 +======= +| 1 | 0.602769 | `keyvault_secret_get` | ✅ **EXPECTED** | +| 2 | 0.504212 | `keyvault_key_get` | ❌ | +| 3 | 0.501397 | `keyvault_secret_create` | ❌ | +| 4 | 0.478769 | `keyvault_secret_list` | ❌ | +| 5 | 0.439521 | `keyvault_certificate_get` | ❌ | + +--- + +## Test 272 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and 
tool description evaluator) **Expected Tool:** `keyvault_secret_get` **Prompt:** Show me the details of the secret in the key vault @@ -5011,15 +9679,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.653920 | `keyvault_secret_get` | ✅ **EXPECTED** | | 2 | 0.567036 | `keyvault_key_get` | ❌ | | 3 | 0.517547 | `storage_account_get` | ❌ | +======= +| 1 | 0.653871 | `keyvault_secret_get` | ✅ **EXPECTED** | +| 2 | 0.566786 | `keyvault_key_get` | ❌ | +<<<<<<< HEAD +| 3 | 0.517355 | `storage_account_get` | ❌ | +======= +| 3 | 0.517561 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.496050 | `keyvault_certificate_get` | ❌ | | 5 | 0.485249 | `keyvault_secret_list` | ❌ | --- +<<<<<<< HEAD ## Test 268 +======= +<<<<<<< HEAD +## Test 263 +======= +## Test 273 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_secret_get` **Prompt:** Get the secret from vault @@ -5036,7 +9722,15 @@ --- +<<<<<<< HEAD ## Test 269 +======= +<<<<<<< HEAD +## Test 264 +======= +## Test 274 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_secret_get` **Prompt:** Display the secret details for in vault @@ -5045,15 +9739,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.649423 | `keyvault_secret_get` | ✅ **EXPECTED** | | 2 | 0.548102 | `keyvault_key_get` | ❌ | | 3 | 0.497402 | `storage_account_get` | ❌ | +======= +| 1 | 0.649267 | `keyvault_secret_get` | ✅ **EXPECTED** | +| 2 | 0.546992 | `keyvault_key_get` | ❌ | +<<<<<<< HEAD +| 3 | 0.497258 | `storage_account_get` | ❌ | +======= +| 3 | 0.497410 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description 
evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.492583 | `keyvault_certificate_get` | ❌ | | 5 | 0.491597 | `keyvault_secret_list` | ❌ | --- +<<<<<<< HEAD ## Test 270 +======= +<<<<<<< HEAD +## Test 265 +======= +## Test 275 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_secret_get` **Prompt:** Retrieve secret metadata for in vault @@ -5066,11 +9778,27 @@ | 2 | 0.475492 | `keyvault_key_get` | ❌ | | 3 | 0.466890 | `keyvault_secret_create` | ❌ | | 4 | 0.447602 | `keyvault_secret_list` | ❌ | +<<<<<<< HEAD | 5 | 0.439583 | `storage_account_get` | ❌ | --- ## Test 271 +======= +<<<<<<< HEAD +| 5 | 0.439381 | `storage_account_get` | ❌ | + +--- + +## Test 266 +======= +| 5 | 0.439597 | `storage_account_get` | ❌ | + +--- + +## Test 276 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_secret_list` **Prompt:** List all secrets in the key vault @@ -5080,6 +9808,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.701227 | `keyvault_secret_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.563736 | `keyvault_key_list` | ❌ | | 3 | 0.538337 | `keyvault_certificate_list` | ❌ | | 4 | 0.499888 | `keyvault_secret_get` | ❌ | @@ -5088,6 +9817,24 @@ --- ## Test 272 +======= +<<<<<<< HEAD +| 2 | 0.563760 | `keyvault_key_list` | ❌ | +======= +| 2 | 0.563694 | `keyvault_key_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.538337 | `keyvault_certificate_list` | ❌ | +| 4 | 0.499642 | `keyvault_secret_get` | ❌ | +| 5 | 0.455469 | `cosmos_account_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 267 +======= +## Test 277 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** 
`keyvault_secret_list` **Prompt:** Show me the secrets in the key vault @@ -5097,6 +9844,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.555681 | `keyvault_secret_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.544015 | `keyvault_secret_get` | ❌ | | 3 | 0.498713 | `keyvault_key_get` | ❌ | | 4 | 0.464661 | `keyvault_key_list` | ❌ | @@ -5105,6 +9853,25 @@ --- ## Test 273 +======= +| 2 | 0.543861 | `keyvault_secret_get` | ❌ | +| 3 | 0.497525 | `keyvault_key_get` | ❌ | +<<<<<<< HEAD +| 4 | 0.464705 | `keyvault_key_list` | ❌ | +| 5 | 0.453107 | `keyvault_admin_settings_get` | ❌ | + +--- + +## Test 268 +======= +| 4 | 0.464652 | `keyvault_key_list` | ❌ | +| 5 | 0.453130 | `keyvault_admin_settings_get` | ❌ | + +--- + +## Test 278 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_secret_list` **Prompt:** What secrets are in the key vault ? @@ -5114,6 +9881,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.572540 | `keyvault_secret_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.529389 | `keyvault_secret_get` | ❌ | | 3 | 0.493761 | `keyvault_key_list` | ❌ | | 4 | 0.487620 | `keyvault_admin_settings_get` | ❌ | @@ -5122,6 +9890,25 @@ --- ## Test 274 +======= +| 2 | 0.529258 | `keyvault_secret_get` | ❌ | +<<<<<<< HEAD +| 3 | 0.493797 | `keyvault_key_list` | ❌ | +| 4 | 0.487611 | `keyvault_admin_settings_get` | ❌ | +======= +| 3 | 0.493728 | `keyvault_key_list` | ❌ | +| 4 | 0.487620 | `keyvault_admin_settings_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.475273 | `keyvault_key_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 269 +======= +## Test 279 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_secret_list` **Prompt:** List secrets names in vault @@ -5131,14 
+9918,30 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.624290 | `keyvault_secret_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.559681 | `keyvault_key_list` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.559700 | `keyvault_key_list` | ❌ | +======= +| 2 | 0.559622 | `keyvault_key_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.517516 | `keyvault_certificate_list` | ❌ | | 4 | 0.479771 | `keyvault_secret_get` | ❌ | | 5 | 0.453295 | `storage_blob_container_get` | ❌ | --- +<<<<<<< HEAD ## Test 275 +======= +<<<<<<< HEAD +## Test 270 +======= +## Test 280 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_secret_list` **Prompt:** Enumerate secrets in key vault @@ -5148,14 +9951,30 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.742358 | `keyvault_secret_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.601183 | `keyvault_key_list` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.601234 | `keyvault_key_list` | ❌ | +======= +| 2 | 0.601079 | `keyvault_key_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.567827 | `keyvault_certificate_list` | ❌ | | 4 | 0.496363 | `keyvault_secret_get` | ❌ | | 5 | 0.437560 | `keyvault_admin_settings_get` | ❌ | --- +<<<<<<< HEAD ## Test 276 +======= +<<<<<<< HEAD +## Test 271 +======= +## Test 281 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `keyvault_secret_list` **Prompt:** Show secrets names in the key vault @@ -5165,6 +9984,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.567110 | `keyvault_secret_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.522600 | 
`keyvault_secret_get` | ❌ | | 3 | 0.476309 | `keyvault_key_list` | ❌ | | 4 | 0.462711 | `keyvault_key_get` | ❌ | @@ -5173,6 +9993,24 @@ --- ## Test 277 +======= +| 2 | 0.522398 | `keyvault_secret_get` | ❌ | +<<<<<<< HEAD +| 3 | 0.476354 | `keyvault_key_list` | ❌ | +======= +| 3 | 0.476288 | `keyvault_key_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.462676 | `keyvault_secret_create` | ❌ | +| 5 | 0.461326 | `keyvault_key_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 272 +======= +## Test 282 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `aks_cluster_get` **Prompt:** Get the configuration of AKS cluster @@ -5189,7 +10027,15 @@ --- +<<<<<<< HEAD ## Test 278 +======= +<<<<<<< HEAD +## Test 273 +======= +## Test 283 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `aks_cluster_get` **Prompt:** Show me the details of AKS cluster in resource group @@ -5198,6 +10044,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.621759 | `aks_cluster_get` | ✅ **EXPECTED** | | 2 | 0.575626 | `aks_nodepool_get` | ❌ | | 3 | 0.567870 | `kusto_cluster_get` | ❌ | @@ -5207,6 +10054,29 @@ --- ## Test 279 +======= +<<<<<<< HEAD +| 1 | 0.621536 | `aks_cluster_get` | ✅ **EXPECTED** | +| 2 | 0.575434 | `aks_nodepool_get` | ❌ | +| 3 | 0.567416 | `kusto_cluster_get` | ❌ | +| 4 | 0.461358 | `sql_db_show` | ❌ | +| 5 | 0.445310 | `monitor_webtests_get` | ❌ | + +--- + +## Test 274 +======= +| 1 | 0.621759 | `aks_cluster_get` | ✅ **EXPECTED** | +| 2 | 0.575625 | `aks_nodepool_get` | ❌ | +| 3 | 0.567870 | `kusto_cluster_get` | ❌ | +| 4 | 0.461466 | `sql_db_show` | ❌ | +| 5 | 0.444327 | `monitor_webtests_get` | ❌ | + +--- + +## Test 284 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 
(update prompts and tool description evaluator) **Expected Tool:** `aks_cluster_get` **Prompt:** Show me the network configuration for AKS cluster @@ -5219,11 +10089,23 @@ | 2 | 0.483220 | `aks_nodepool_get` | ❌ | | 3 | 0.434684 | `kusto_cluster_get` | ❌ | | 4 | 0.380301 | `mysql_server_config_get` | ❌ | +<<<<<<< HEAD | 5 | 0.366689 | `kusto_cluster_list` | ❌ | --- +<<<<<<< HEAD ## Test 280 +======= +## Test 275 +======= +| 5 | 0.366594 | `kusto_cluster_list` | ❌ | + +--- + +## Test 285 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `aks_cluster_get` **Prompt:** What are the details of my AKS cluster in ? @@ -5235,12 +10117,28 @@ | 1 | 0.588634 | `aks_cluster_get` | ✅ **EXPECTED** | | 2 | 0.550555 | `aks_nodepool_get` | ❌ | | 3 | 0.527511 | `kusto_cluster_get` | ❌ | +<<<<<<< HEAD | 4 | 0.445722 | `storage_account_get` | ❌ | +======= +<<<<<<< HEAD +| 4 | 0.445813 | `storage_account_get` | ❌ | +======= +| 4 | 0.445833 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.435597 | `foundry_resource_get` | ❌ | --- +<<<<<<< HEAD ## Test 281 +======= +<<<<<<< HEAD +## Test 276 +======= +## Test 286 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `aks_cluster_get` **Prompt:** List all AKS clusters in my subscription @@ -5250,14 +10148,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.756471 | `aks_cluster_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.749416 | `kusto_cluster_list` | ❌ | | 3 | 0.590166 | `aks_nodepool_get` | ❌ | +<<<<<<< HEAD | 4 | 0.568635 | `kusto_database_list` | ❌ | | 5 | 0.560522 | `search_service_list` | ❌ | --- ## Test 282 +======= +| 4 | 0.568440 | `kusto_database_list` | ❌ | +======= +| 2 | 0.749293 | 
`kusto_cluster_list` | ❌ | +| 3 | 0.590166 | `aks_nodepool_get` | ❌ | +| 4 | 0.568301 | `kusto_database_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.562043 | `search_service_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 277 +======= +## Test 287 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `aks_cluster_get` **Prompt:** Show me my Azure Kubernetes Service clusters @@ -5267,6 +10184,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.612123 | `aks_cluster_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.586661 | `kusto_cluster_list` | ❌ | | 3 | 0.507757 | `aks_nodepool_get` | ❌ | | 4 | 0.489724 | `kusto_cluster_get` | ❌ | @@ -5274,7 +10192,21 @@ --- +<<<<<<< HEAD ## Test 283 +======= +## Test 278 +======= +| 2 | 0.586466 | `kusto_cluster_list` | ❌ | +| 3 | 0.507757 | `aks_nodepool_get` | ❌ | +| 4 | 0.489724 | `kusto_cluster_get` | ❌ | +| 5 | 0.462718 | `kusto_database_list` | ❌ | + +--- + +## Test 288 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `aks_cluster_get` **Prompt:** What AKS clusters do I have? 
@@ -5283,6 +10215,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.628470 | `aks_cluster_get` | ✅ **EXPECTED** | | 2 | 0.563211 | `aks_nodepool_get` | ❌ | | 3 | 0.526840 | `kusto_cluster_list` | ❌ | @@ -5292,6 +10225,27 @@ --- ## Test 284 +======= +| 1 | 0.628429 | `aks_cluster_get` | ✅ **EXPECTED** | +| 2 | 0.563189 | `aks_nodepool_get` | ❌ | +<<<<<<< HEAD +| 3 | 0.526756 | `kusto_cluster_list` | ❌ | +| 4 | 0.426157 | `kusto_cluster_get` | ❌ | +| 5 | 0.409163 | `kusto_database_list` | ❌ | + +--- + +## Test 279 +======= +| 3 | 0.526670 | `kusto_cluster_list` | ❌ | +| 4 | 0.426157 | `kusto_cluster_get` | ❌ | +| 5 | 0.409404 | `kusto_database_list` | ❌ | + +--- + +## Test 289 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `aks_nodepool_get` **Prompt:** Get details for nodepool in AKS cluster in @@ -5300,6 +10254,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.728569 | `aks_nodepool_get` | ✅ **EXPECTED** | | 2 | 0.516573 | `kusto_cluster_get` | ❌ | | 3 | 0.509314 | `aks_cluster_get` | ❌ | @@ -5309,6 +10264,29 @@ --- ## Test 285 +======= +<<<<<<< HEAD +| 1 | 0.729136 | `aks_nodepool_get` | ✅ **EXPECTED** | +| 2 | 0.517116 | `kusto_cluster_get` | ❌ | +| 3 | 0.510014 | `aks_cluster_get` | ❌ | +| 4 | 0.468597 | `virtualdesktop_hostpool_list` | ❌ | +| 5 | 0.463489 | `sql_elastic-pool_list` | ❌ | + +--- + +## Test 280 +======= +| 1 | 0.728937 | `aks_nodepool_get` | ✅ **EXPECTED** | +| 2 | 0.517021 | `kusto_cluster_get` | ❌ | +| 3 | 0.509820 | `aks_cluster_get` | ❌ | +| 4 | 0.468392 | `virtualdesktop_hostpool_list` | ❌ | +| 5 | 0.463192 | `sql_elastic-pool_list` | ❌ | + +--- + +## Test 290 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `aks_nodepool_get` **Prompt:** Show me the 
configuration for nodepool in AKS cluster in resource group @@ -5317,6 +10295,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.654106 | `aks_nodepool_get` | ✅ **EXPECTED** | | 2 | 0.458596 | `sql_elastic-pool_list` | ❌ | | 3 | 0.446035 | `aks_cluster_get` | ❌ | @@ -5325,7 +10304,22 @@ --- +<<<<<<< HEAD ## Test 286 +======= +## Test 281 +======= +| 1 | 0.654031 | `aks_nodepool_get` | ✅ **EXPECTED** | +| 2 | 0.458651 | `sql_elastic-pool_list` | ❌ | +| 3 | 0.445952 | `aks_cluster_get` | ❌ | +| 4 | 0.440187 | `virtualdesktop_hostpool_list` | ❌ | +| 5 | 0.413711 | `kusto_cluster_get` | ❌ | + +--- + +## Test 291 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `aks_nodepool_get` **Prompt:** What is the setup of nodepool for AKS cluster in ? @@ -5342,7 +10336,15 @@ --- +<<<<<<< HEAD ## Test 287 +======= +<<<<<<< HEAD +## Test 282 +======= +## Test 292 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `aks_nodepool_get` **Prompt:** List nodepools for AKS cluster in @@ -5351,6 +10353,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.692231 | `aks_nodepool_get` | ✅ **EXPECTED** | | 2 | 0.519037 | `aks_cluster_get` | ❌ | | 3 | 0.506720 | `virtualdesktop_hostpool_list` | ❌ | @@ -5360,6 +10363,29 @@ --- ## Test 288 +======= +<<<<<<< HEAD +| 1 | 0.692264 | `aks_nodepool_get` | ✅ **EXPECTED** | +| 2 | 0.519034 | `aks_cluster_get` | ❌ | +| 3 | 0.506649 | `virtualdesktop_hostpool_list` | ❌ | +| 4 | 0.500705 | `kusto_cluster_list` | ❌ | +| 5 | 0.487723 | `sql_elastic-pool_list` | ❌ | + +--- + +## Test 283 +======= +| 1 | 0.692231 | `aks_nodepool_get` | ✅ **EXPECTED** | +| 2 | 0.519037 | `aks_cluster_get` | ❌ | +| 3 | 0.506624 | `virtualdesktop_hostpool_list` | ❌ | +| 4 | 0.500514 | 
`kusto_cluster_list` | ❌ | +| 5 | 0.487707 | `sql_elastic-pool_list` | ❌ | + +--- + +## Test 293 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `aks_nodepool_get` **Prompt:** Show me the nodepool list for AKS cluster in @@ -5371,12 +10397,28 @@ | 1 | 0.732132 | `aks_nodepool_get` | ✅ **EXPECTED** | | 2 | 0.561829 | `aks_cluster_get` | ❌ | | 3 | 0.510269 | `sql_elastic-pool_list` | ❌ | +<<<<<<< HEAD | 4 | 0.509840 | `virtualdesktop_hostpool_list` | ❌ | +======= +| 4 | 0.509732 | `virtualdesktop_hostpool_list` | ❌ | +<<<<<<< HEAD +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.486700 | `kusto_cluster_list` | ❌ | --- +<<<<<<< HEAD ## Test 289 +======= +## Test 284 +======= +| 5 | 0.486544 | `kusto_cluster_list` | ❌ | + +--- + +## Test 294 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `aks_nodepool_get` **Prompt:** What nodepools do I have for AKS cluster in @@ -5387,14 +10429,30 @@ |------|-------|------|--------| | 1 | 0.629358 | `aks_nodepool_get` | ✅ **EXPECTED** | | 2 | 0.456911 | `aks_cluster_get` | ❌ | +<<<<<<< HEAD | 3 | 0.443940 | `virtualdesktop_hostpool_list` | ❌ | +======= +| 3 | 0.443902 | `virtualdesktop_hostpool_list` | ❌ | +<<<<<<< HEAD +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.433006 | `kusto_cluster_list` | ❌ | +======= +| 4 | 0.432757 | `kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.425448 | `sql_elastic-pool_list` | ❌ | --- +<<<<<<< HEAD ## Test 290 - +======= +<<<<<<< HEAD +## Test 285 +======= +## Test 295 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) + **Expected Tool:** `loadtesting_test_create` **Prompt:** Create a basic URL test using the 
following endpoint URL that runs for 30 minutes with 45 virtual users. The test name is with the test id and the load testing resource is in the resource group in my subscription @@ -5410,7 +10468,15 @@ --- +<<<<<<< HEAD ## Test 291 +======= +<<<<<<< HEAD +## Test 286 +======= +## Test 296 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `loadtesting_test_get` **Prompt:** Get the load test with id in the load test resource in resource group @@ -5419,6 +10485,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.626226 | `loadtesting_testresource_list` | ❌ | | 2 | 0.619944 | `loadtesting_test_get` | ✅ **EXPECTED** | | 3 | 0.594666 | `loadtesting_testresource_create` | ❌ | @@ -5428,6 +10495,29 @@ --- ## Test 292 +======= +<<<<<<< HEAD +| 1 | 0.626213 | `loadtesting_testresource_list` | ❌ | +| 2 | 0.620147 | `loadtesting_test_get` | ✅ **EXPECTED** | +| 3 | 0.594630 | `loadtesting_testresource_create` | ❌ | +| 4 | 0.591112 | `monitor_webtests_get` | ❌ | +| 5 | 0.535891 | `monitor_webtests_list` | ❌ | + +--- + +## Test 287 +======= +| 1 | 0.626271 | `loadtesting_testresource_list` | ❌ | +| 2 | 0.620094 | `loadtesting_test_get` | ✅ **EXPECTED** | +| 3 | 0.594881 | `loadtesting_testresource_create` | ❌ | +| 4 | 0.590679 | `monitor_webtests_get` | ❌ | +| 5 | 0.537187 | `monitor_webtests_list` | ❌ | + +--- + +## Test 297 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `loadtesting_testresource_create` **Prompt:** Create a load test resource in the resource group in my subscription @@ -5436,6 +10526,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.645537 | `loadtesting_testresource_create` | ✅ **EXPECTED** | | 2 | 0.618773 | `loadtesting_testresource_list` | ❌ | | 3 | 0.541696 | 
`loadtesting_test_create` | ❌ | @@ -5445,6 +10536,29 @@ --- ## Test 293 +======= +<<<<<<< HEAD +| 1 | 0.645750 | `loadtesting_testresource_create` | ✅ **EXPECTED** | +| 2 | 0.618984 | `loadtesting_testresource_list` | ❌ | +| 3 | 0.541950 | `loadtesting_test_create` | ❌ | +| 4 | 0.539866 | `loadtesting_testrun_create` | ❌ | +| 5 | 0.526644 | `monitor_webtests_list` | ❌ | + +--- + +## Test 288 +======= +| 1 | 0.645537 | `loadtesting_testresource_create` | ✅ **EXPECTED** | +| 2 | 0.618773 | `loadtesting_testresource_list` | ❌ | +| 3 | 0.541746 | `loadtesting_test_create` | ❌ | +| 4 | 0.539771 | `loadtesting_testrun_create` | ❌ | +| 5 | 0.525628 | `monitor_webtests_list` | ❌ | + +--- + +## Test 298 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `loadtesting_testresource_list` **Prompt:** List all load testing resources in the resource group in my subscription @@ -5454,14 +10568,30 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.794326 | `loadtesting_testresource_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.653165 | `monitor_webtests_list` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.653137 | `monitor_webtests_list` | ❌ | +======= +| 2 | 0.651533 | `monitor_webtests_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.577408 | `group_list` | ❌ | | 4 | 0.575172 | `loadtesting_testresource_create` | ❌ | | 5 | 0.565565 | `datadog_monitoredresources_list` | ❌ | --- +<<<<<<< HEAD ## Test 294 +======= +<<<<<<< HEAD +## Test 289 +======= +## Test 299 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `loadtesting_testrun_create` **Prompt:** Create a test run using the id for test in the load testing resource in resource group . 
Use the name of test run and description as @@ -5478,7 +10608,15 @@ --- +<<<<<<< HEAD ## Test 295 +======= +<<<<<<< HEAD +## Test 290 +======= +## Test 300 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `loadtesting_testrun_get` **Prompt:** Get the load test run with id in the load test resource in resource group @@ -5490,12 +10628,28 @@ | 1 | 0.619146 | `loadtesting_testresource_list` | ❌ | | 2 | 0.601927 | `loadtesting_test_get` | ❌ | | 3 | 0.597430 | `loadtesting_testresource_create` | ❌ | +<<<<<<< HEAD | 4 | 0.577532 | `monitor_webtests_get` | ❌ | +======= +<<<<<<< HEAD +| 4 | 0.577924 | `monitor_webtests_get` | ❌ | +======= +| 4 | 0.577532 | `monitor_webtests_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.565996 | `loadtesting_testrun_list` | ❌ | --- +<<<<<<< HEAD ## Test 296 +======= +<<<<<<< HEAD +## Test 291 +======= +## Test 301 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `loadtesting_testrun_list` **Prompt:** Get all the load test runs for the test with id in the load test resource in resource group @@ -5504,6 +10658,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.669307 | `loadtesting_testresource_list` | ❌ | | 2 | 0.640644 | `loadtesting_testrun_list` | ✅ **EXPECTED** | | 3 | 0.600977 | `loadtesting_test_get` | ❌ | @@ -5513,6 +10668,25 @@ --- ## Test 297 +======= +| 1 | 0.669180 | `loadtesting_testresource_list` | ❌ | +| 2 | 0.640360 | `loadtesting_testrun_list` | ✅ **EXPECTED** | +| 3 | 0.601075 | `loadtesting_test_get` | ❌ | +| 4 | 0.577460 | `loadtesting_testresource_create` | ❌ | +<<<<<<< HEAD +| 5 | 0.569963 | `monitor_webtests_get` | ❌ | + +--- + +## Test 292 +======= +| 5 | 0.569424 | 
`monitor_webtests_get` | ❌ | + +--- + +## Test 302 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `loadtesting_testrun_update` **Prompt:** Update a test run display name as for the id for test in the load testing resource in resource group . @@ -5523,13 +10697,31 @@ |------|-------|------|--------| | 1 | 0.706747 | `loadtesting_testrun_update` | ✅ **EXPECTED** | | 2 | 0.514428 | `loadtesting_testrun_create` | ❌ | +<<<<<<< HEAD | 3 | 0.486977 | `monitor_webtests_update` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.486980 | `monitor_webtests_update` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.470337 | `loadtesting_testresource_list` | ❌ | | 5 | 0.468374 | `monitor_webtests_get` | ❌ | --- +<<<<<<< HEAD ## Test 298 +======= +## Test 293 +======= +| 3 | 0.487022 | `monitor_webtests_update` | ❌ | +| 4 | 0.470337 | `loadtesting_testresource_list` | ❌ | +| 5 | 0.468374 | `monitor_webtests_get` | ❌ | + +--- + +## Test 303 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `grafana_list` **Prompt:** List all Azure Managed Grafana in one subscription @@ -5538,7 +10730,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.599427 | `kusto_cluster_list` | ❌ | +======= +| 1 | 0.599428 | `kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 2 | 0.578892 | `grafana_list` | ✅ **EXPECTED** | | 3 | 0.550372 | `subscription_list` | ❌ | | 4 | 0.549957 | `search_service_list` | ❌ | @@ -5546,7 +10742,15 @@ --- +<<<<<<< HEAD ## Test 299 +======= +<<<<<<< HEAD +## Test 294 +======= +## Test 304 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `managedlustre_fs_create` **Prompt:** 
Create an Azure Managed Lustre filesystem with name , size , SKU , and subnet for availability zone in location . Maintenance should occur on at @@ -5555,15 +10759,38 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.726553 | `managedlustre_fs_create` | ✅ **EXPECTED** | | 2 | 0.616164 | `managedlustre_fs_list` | ❌ | | 3 | 0.605701 | `managedlustre_fs_sku_get` | ❌ | | 4 | 0.598215 | `managedlustre_fs_update` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.728113 | `managedlustre_fs_create` | ✅ **EXPECTED** | +| 2 | 0.615874 | `managedlustre_fs_list` | ❌ | +| 3 | 0.605775 | `managedlustre_fs_sku_get` | ❌ | +| 4 | 0.598255 | `managedlustre_fs_update` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.557720 | `managedlustre_fs_subnetsize_validate` | ❌ | --- +<<<<<<< HEAD ## Test 300 +======= +## Test 295 +======= +| 1 | 0.728113 | `managedlustre_filesystem_create` | ❌ | +| 2 | 0.616164 | `managedlustre_filesystem_list` | ❌ | +| 3 | 0.605775 | `managedlustre_filesystem_sku_get` | ❌ | +| 4 | 0.598255 | `managedlustre_filesystem_update` | ❌ | +| 5 | 0.557720 | `managedlustre_filesystem_subnetsize_validate` | ❌ | + +--- + +## Test 305 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `managedlustre_fs_list` **Prompt:** List the Azure Managed Lustre filesystems in my subscription @@ -5572,6 +10799,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.750675 | `managedlustre_fs_list` | ✅ **EXPECTED** | | 2 | 0.631730 | `managedlustre_fs_sku_get` | ❌ | | 3 | 0.579855 | `managedlustre_fs_create` | ❌ | @@ -5581,6 +10809,28 @@ --- ## Test 301 +======= +<<<<<<< HEAD +| 1 | 0.750302 | `managedlustre_fs_list` | ✅ **EXPECTED** | +| 2 | 0.631770 | `managedlustre_fs_sku_get` | ❌ | +| 3 | 0.582660 | `managedlustre_fs_create` | ❌ | +| 4 | 0.562377 | `kusto_cluster_list` | ❌ | +======= +| 1 
| 0.750675 | `managedlustre_filesystem_list` | ❌ | +| 2 | 0.631770 | `managedlustre_filesystem_sku_get` | ❌ | +| 3 | 0.582660 | `managedlustre_filesystem_create` | ❌ | +| 4 | 0.562520 | `kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.513156 | `search_service_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 296 +======= +## Test 306 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `managedlustre_fs_list` **Prompt:** List the Azure Managed Lustre filesystems in my resource group @@ -5589,15 +10839,35 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.743903 | `managedlustre_fs_list` | ✅ **EXPECTED** | | 2 | 0.613164 | `managedlustre_fs_sku_get` | ❌ | | 3 | 0.563081 | `managedlustre_fs_create` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.743639 | `managedlustre_fs_list` | ✅ **EXPECTED** | +| 2 | 0.613217 | `managedlustre_fs_sku_get` | ❌ | +| 3 | 0.565856 | `managedlustre_fs_create` | ❌ | +======= +| 1 | 0.743903 | `managedlustre_filesystem_list` | ❌ | +| 2 | 0.613217 | `managedlustre_filesystem_sku_get` | ❌ | +| 3 | 0.565856 | `managedlustre_filesystem_create` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.519986 | `datadog_monitoredresources_list` | ❌ | | 5 | 0.515433 | `loadtesting_testresource_list` | ❌ | --- +<<<<<<< HEAD ## Test 302 +======= +<<<<<<< HEAD +## Test 297 +======= +## Test 307 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `managedlustre_fs_sku_get` **Prompt:** List the Azure Managed Lustre SKUs available in location @@ -5606,15 +10876,37 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.827360 | `managedlustre_fs_sku_get` | ✅ 
**EXPECTED** | | 2 | 0.613674 | `managedlustre_fs_list` | ❌ | | 3 | 0.511625 | `managedlustre_fs_create` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.827381 | `managedlustre_fs_sku_get` | ✅ **EXPECTED** | +| 2 | 0.613245 | `managedlustre_fs_list` | ❌ | +| 3 | 0.513242 | `managedlustre_fs_create` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.496242 | `managedlustre_fs_subnetsize_validate` | ❌ | | 5 | 0.470241 | `kusto_cluster_list` | ❌ | --- +<<<<<<< HEAD ## Test 303 +======= +## Test 298 +======= +| 1 | 0.827381 | `managedlustre_filesystem_sku_get` | ❌ | +| 2 | 0.613674 | `managedlustre_filesystem_list` | ❌ | +| 3 | 0.513242 | `managedlustre_filesystem_create` | ❌ | +| 4 | 0.496242 | `managedlustre_filesystem_subnetsize_validate` | ❌ | +| 5 | 0.470347 | `kusto_cluster_list` | ❌ | + +--- + +## Test 308 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `managedlustre_fs_subnetsize_ask` **Prompt:** Tell me how many IP addresses I need for an Azure Managed Lustre filesystem of size using the SKU @@ -5623,6 +10915,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.739766 | `managedlustre_fs_subnetsize_ask` | ✅ **EXPECTED** | | 2 | 0.651598 | `managedlustre_fs_subnetsize_validate` | ❌ | | 3 | 0.594536 | `managedlustre_fs_sku_get` | ❌ | @@ -5632,6 +10925,29 @@ --- ## Test 304 +======= +<<<<<<< HEAD +| 1 | 0.739679 | `managedlustre_fs_subnetsize_ask` | ✅ **EXPECTED** | +| 2 | 0.651615 | `managedlustre_fs_subnetsize_validate` | ❌ | +| 3 | 0.594695 | `managedlustre_fs_sku_get` | ❌ | +| 4 | 0.559034 | `managedlustre_fs_list` | ❌ | +| 5 | 0.533796 | `managedlustre_fs_create` | ❌ | + +--- + +## Test 299 +======= +| 1 | 0.739721 | `managedlustre_filesystem_subnetsize_ask` | ❌ | +| 2 | 0.651551 | `managedlustre_filesystem_subnetsize_validate` | ❌ | +| 3 | 0.594559 | `managedlustre_filesystem_sku_get` | ❌ | +| 4 | 
0.559415 | `managedlustre_filesystem_list` | ❌ | +| 5 | 0.533625 | `managedlustre_filesystem_create` | ❌ | + +--- + +## Test 309 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `managedlustre_fs_subnetsize_validate` **Prompt:** Validate if the network can host Azure Managed Lustre filesystem of size using the SKU @@ -5640,6 +10956,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.879240 | `managedlustre_fs_subnetsize_validate` | ✅ **EXPECTED** | | 2 | 0.622368 | `managedlustre_fs_subnetsize_ask` | ❌ | | 3 | 0.542555 | `managedlustre_fs_sku_get` | ❌ | @@ -5649,6 +10966,29 @@ --- ## Test 305 +======= +<<<<<<< HEAD +| 1 | 0.879541 | `managedlustre_fs_subnetsize_validate` | ✅ **EXPECTED** | +| 2 | 0.622603 | `managedlustre_fs_subnetsize_ask` | ❌ | +| 3 | 0.542788 | `managedlustre_fs_sku_get` | ❌ | +| 4 | 0.515947 | `managedlustre_fs_create` | ❌ | +| 5 | 0.480673 | `managedlustre_fs_list` | ❌ | + +--- + +## Test 300 +======= +| 1 | 0.879453 | `managedlustre_filesystem_subnetsize_validate` | ❌ | +| 2 | 0.622511 | `managedlustre_filesystem_subnetsize_ask` | ❌ | +| 3 | 0.542894 | `managedlustre_filesystem_sku_get` | ❌ | +| 4 | 0.516028 | `managedlustre_filesystem_create` | ❌ | +| 5 | 0.480920 | `managedlustre_filesystem_list` | ❌ | + +--- + +## Test 310 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `managedlustre_fs_update` **Prompt:** Update the maintenance window of the Azure Managed Lustre filesystem to at @@ -5657,15 +10997,38 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.738895 | `managedlustre_fs_update` | ✅ **EXPECTED** | | 2 | 0.525980 | `managedlustre_fs_create` | ❌ | | 3 | 0.487193 | `managedlustre_fs_list` | ❌ | | 4 | 0.385318 | `managedlustre_fs_sku_get` | ❌ | +======= 
+<<<<<<< HEAD +| 1 | 0.739000 | `managedlustre_fs_update` | ✅ **EXPECTED** | +| 2 | 0.527525 | `managedlustre_fs_create` | ❌ | +| 3 | 0.487003 | `managedlustre_fs_list` | ❌ | +| 4 | 0.385349 | `managedlustre_fs_sku_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.344891 | `managedlustre_fs_subnetsize_validate` | ❌ | --- +<<<<<<< HEAD ## Test 306 +======= +## Test 301 +======= +| 1 | 0.739000 | `managedlustre_filesystem_update` | ❌ | +| 2 | 0.527525 | `managedlustre_filesystem_create` | ❌ | +| 3 | 0.487193 | `managedlustre_filesystem_list` | ❌ | +| 4 | 0.385349 | `managedlustre_filesystem_sku_get` | ❌ | +| 5 | 0.344891 | `managedlustre_filesystem_subnetsize_validate` | ❌ | + +--- + +## Test 311 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `marketplace_product_get` **Prompt:** Get details about marketplace product @@ -5674,6 +11037,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.570164 | `marketplace_product_get` | ✅ **EXPECTED** | | 2 | 0.499208 | `marketplace_product_list` | ❌ | | 3 | 0.353280 | `servicebus_topic_subscription_details` | ❌ | @@ -5683,6 +11047,25 @@ --- ## Test 307 +======= +<<<<<<< HEAD +| 1 | 0.570028 | `marketplace_product_get` | ✅ **EXPECTED** | +======= +| 1 | 0.570109 | `marketplace_product_get` | ✅ **EXPECTED** | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.499184 | `marketplace_product_list` | ❌ | +| 3 | 0.353256 | `servicebus_topic_subscription_details` | ❌ | +| 4 | 0.333160 | `servicebus_topic_details` | ❌ | +| 5 | 0.330935 | `servicebus_queue_details` | ❌ | + +--- + +<<<<<<< HEAD +## Test 302 +======= +## Test 312 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `marketplace_product_list` **Prompt:** Search for Microsoft 
products in the marketplace @@ -5691,6 +11074,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.607950 | `marketplace_product_list` | ✅ **EXPECTED** | | 2 | 0.443177 | `marketplace_product_get` | ❌ | | 3 | 0.341360 | `search_service_list` | ❌ | @@ -5700,6 +11084,25 @@ --- ## Test 308 +======= +| 1 | 0.607916 | `marketplace_product_list` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.443178 | `marketplace_product_get` | ❌ | +======= +| 2 | 0.443109 | `marketplace_product_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.343549 | `search_service_list` | ❌ | +| 4 | 0.330500 | `foundry_models_list` | ❌ | +| 5 | 0.328676 | `managedlustre_fs_sku_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 303 +======= +## Test 313 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `marketplace_product_list` **Prompt:** Show me marketplace products from publisher @@ -5709,13 +11112,22 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.537726 | `marketplace_product_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.385167 | `marketplace_product_get` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.385198 | `marketplace_product_get` | ❌ | +======= +| 2 | 0.385111 | `marketplace_product_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.308769 | `foundry_models_list` | ❌ | | 4 | 0.288006 | `redis_list` | ❌ | | 5 | 0.260421 | `managedlustre_fs_sku_get` | ❌ | --- +<<<<<<< HEAD ## Test 309 **Expected Tool:** `azureaibestpractices_get` @@ -5802,6 +11214,13 @@ --- ## Test 314 +======= +<<<<<<< HEAD +## Test 304 +======= +## Test 314 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `get_bestpractices_get` **Prompt:** 
Get the latest Azure code generation best practices @@ -5810,6 +11229,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.656395 | `azureaibestpractices_get` | ❌ | | 2 | 0.646844 | `get_bestpractices_get` | ✅ **EXPECTED** | | 3 | 0.635406 | `azureterraformbestpractices_get` | ❌ | @@ -5819,6 +11239,27 @@ --- ## Test 315 +======= +<<<<<<< HEAD +| 1 | 0.646857 | `get_bestpractices_get` | ✅ **EXPECTED** | +| 2 | 0.635437 | `azureterraformbestpractices_get` | ❌ | +| 3 | 0.586894 | `deploy_iac_rules_get` | ❌ | +======= +| 1 | 0.651264 | `get_bestpractices_get` | ✅ **EXPECTED** | +| 2 | 0.635406 | `azureterraformbestpractices_get` | ❌ | +| 3 | 0.586907 | `deploy_iac_rules_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.531727 | `deploy_pipeline_guidance_get` | ❌ | +| 5 | 0.490235 | `deploy_plan_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 305 +======= +## Test 315 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure deployment best practices @@ -5827,15 +11268,32 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.600903 | `get_bestpractices_get` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.548542 | `azureterraformbestpractices_get` | ❌ | +======= +| 2 | 0.548655 | `azureterraformbestpractices_get` | ❌ | +======= +| 1 | 0.602216 | `get_bestpractices_get` | ✅ **EXPECTED** | | 2 | 0.548542 | `azureterraformbestpractices_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.541091 | `deploy_iac_rules_get` | ❌ | | 4 | 0.516852 | `deploy_plan_get` | ❌ | | 5 | 0.516203 | `deploy_pipeline_guidance_get` | ❌ | --- +<<<<<<< HEAD +## Test 316 +======= +<<<<<<< HEAD +## Test 306 +======= ## Test 316 +>>>>>>> 84ad4f44 (update 
prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure best practices @@ -5844,7 +11302,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.625259 | `get_bestpractices_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.594323 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.539715 | `azureaibestpractices_get` | ❌ | | 4 | 0.518643 | `deploy_iac_rules_get` | ❌ | @@ -5853,6 +11313,24 @@ --- ## Test 317 +======= +| 2 | 0.594455 | `azureterraformbestpractices_get` | ❌ | +======= +| 1 | 0.624689 | `get_bestpractices_get` | ✅ **EXPECTED** | +| 2 | 0.594323 | `azureterraformbestpractices_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.518643 | `deploy_iac_rules_get` | ❌ | +| 4 | 0.465572 | `deploy_pipeline_guidance_get` | ❌ | +| 5 | 0.450629 | `cloudarchitect_design` | ❌ | + +--- + +<<<<<<< HEAD +## Test 307 +======= +## Test 317 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure Functions code generation best practices @@ -5861,7 +11339,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.624273 | `get_bestpractices_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.587474 | `azureaibestpractices_get` | ❌ | | 3 | 0.570488 | `azureterraformbestpractices_get` | ❌ | | 4 | 0.522998 | `deploy_iac_rules_get` | ❌ | @@ -5870,6 +11350,24 @@ --- ## Test 318 +======= +| 2 | 0.570547 | `azureterraformbestpractices_get` | ❌ | +======= +| 1 | 0.629031 | `get_bestpractices_get` | ✅ **EXPECTED** | +| 2 | 0.570488 | `azureterraformbestpractices_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.522998 | `deploy_iac_rules_get` | ❌ | +| 4 | 0.493998 | 
`deploy_pipeline_guidance_get` | ❌ | +| 5 | 0.467377 | `extension_cli_install` | ❌ | + +--- + +<<<<<<< HEAD +## Test 308 +======= +## Test 318 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure Functions deployment best practices @@ -5878,15 +11376,28 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.581850 | `get_bestpractices_get` | ✅ **EXPECTED** | | 2 | 0.497056 | `deploy_pipeline_guidance_get` | ❌ | +======= +| 1 | 0.584392 | `get_bestpractices_get` | ✅ **EXPECTED** | +| 2 | 0.497350 | `deploy_pipeline_guidance_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.495659 | `deploy_iac_rules_get` | ❌ | | 4 | 0.486886 | `azureterraformbestpractices_get` | ❌ | | 5 | 0.474511 | `deploy_plan_get` | ❌ | --- +<<<<<<< HEAD +## Test 319 +======= +<<<<<<< HEAD +## Test 309 +======= ## Test 319 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure Functions best practices @@ -5895,7 +11406,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.610986 | `get_bestpractices_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.532790 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.518386 | `azureaibestpractices_get` | ❌ | | 4 | 0.487322 | `deploy_iac_rules_get` | ❌ | @@ -5904,6 +11417,24 @@ --- ## Test 320 +======= +| 2 | 0.532921 | `azureterraformbestpractices_get` | ❌ | +======= +| 1 | 0.612552 | `get_bestpractices_get` | ✅ **EXPECTED** | +| 2 | 0.532790 | `azureterraformbestpractices_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.487322 | `deploy_iac_rules_get` | ❌ | +| 4 | 0.458060 | `deploy_pipeline_guidance_get` | ❌ | +| 5 
| 0.448034 | `extension_cli_install` | ❌ | + +--- + +<<<<<<< HEAD +## Test 310 +======= +## Test 320 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure Static Web Apps best practices @@ -5912,7 +11443,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.557862 | `get_bestpractices_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.513262 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.510399 | `azureaibestpractices_get` | ❌ | | 4 | 0.505123 | `deploy_iac_rules_get` | ❌ | @@ -5921,6 +11454,24 @@ --- ## Test 321 +======= +| 2 | 0.513385 | `azureterraformbestpractices_get` | ❌ | +======= +| 1 | 0.559184 | `get_bestpractices_get` | ✅ **EXPECTED** | +| 2 | 0.513262 | `azureterraformbestpractices_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.505123 | `deploy_iac_rules_get` | ❌ | +| 4 | 0.483705 | `deploy_pipeline_guidance_get` | ❌ | +| 5 | 0.421581 | `cloudarchitect_design` | ❌ | + +--- + +<<<<<<< HEAD +## Test 311 +======= +## Test 321 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `get_bestpractices_get` **Prompt:** What are azure function best practices? 
@@ -5929,7 +11480,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.582541 | `get_bestpractices_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.500368 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.475018 | `azureaibestpractices_get` | ❌ | | 4 | 0.472112 | `deploy_iac_rules_get` | ❌ | @@ -5938,6 +11491,24 @@ --- ## Test 322 +======= +| 2 | 0.500479 | `azureterraformbestpractices_get` | ❌ | +======= +| 1 | 0.584536 | `get_bestpractices_get` | ✅ **EXPECTED** | +| 2 | 0.500368 | `azureterraformbestpractices_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.472112 | `deploy_iac_rules_get` | ❌ | +| 4 | 0.433134 | `deploy_pipeline_guidance_get` | ❌ | +| 5 | 0.432087 | `cloudarchitect_design` | ❌ | + +--- + +<<<<<<< HEAD +## Test 312 +======= +## Test 322 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `get_bestpractices_get` **Prompt:** configure azure mcp in coding agent for my repo @@ -5946,6 +11517,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.488855 | `deploy_plan_get` | ❌ | | 2 | 0.460745 | `deploy_pipeline_guidance_get` | ❌ | | 3 | 0.390270 | `deploy_iac_rules_get` | ❌ | @@ -5954,7 +11526,22 @@ --- +<<<<<<< HEAD +## Test 323 +======= +## Test 313 +======= +| 1 | 0.488915 | `deploy_plan_get` | ❌ | +| 2 | 0.460980 | `deploy_pipeline_guidance_get` | ❌ | +| 3 | 0.390340 | `deploy_iac_rules_get` | ❌ | +| 4 | 0.370368 | `azureterraformbestpractices_get` | ❌ | +| 5 | 0.369284 | `extension_cli_install` | ❌ | + +--- + ## Test 323 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_activitylog_list` **Prompt:** List the activity logs of the last month for @@ -5963,7 +11550,12 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| 
+<<<<<<< HEAD | 1 | 0.537893 | `monitor_activitylog_list` | ✅ **EXPECTED** | +======= +<<<<<<< HEAD +| 1 | 0.537916 | `monitor_activitylog_list` | ✅ **EXPECTED** | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.506212 | `monitor_resource_log_query` | ❌ | | 3 | 0.371728 | `monitor_workspace_log_query` | ❌ | | 4 | 0.363798 | `resourcehealth_health-events_list` | ❌ | @@ -5971,7 +11563,22 @@ --- +<<<<<<< HEAD +## Test 324 +======= +## Test 314 +======= +| 1 | 0.537780 | `monitor_activitylog_list` | ✅ **EXPECTED** | +| 2 | 0.506270 | `monitor_resource_log_query` | ❌ | +| 3 | 0.371737 | `monitor_workspace_log_query` | ❌ | +| 4 | 0.363731 | `resourcehealth_service-health-events_list` | ❌ | +| 5 | 0.344620 | `datadog_monitoredresources_list` | ❌ | + +--- + ## Test 324 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_healthmodels_entity_get` **Prompt:** Show me the health status of entity using the health model @@ -5980,6 +11587,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.660947 | `monitor_healthmodels_entity_get` | ✅ **EXPECTED** | | 2 | 0.608665 | `resourcehealth_availability-status_get` | ❌ | | 3 | 0.351697 | `resourcehealth_availability-status_list` | ❌ | @@ -5988,7 +11596,22 @@ --- +<<<<<<< HEAD +## Test 325 +======= +## Test 315 +======= +| 1 | 0.660947 | `monitor_healthmodels_entity_gethealth` | ❌ | +| 2 | 0.603153 | `resourcehealth_availability-status_get` | ❌ | +| 3 | 0.355116 | `foundry_openai_models-list` | ❌ | +| 4 | 0.351697 | `resourcehealth_availability-status_list` | ❌ | +| 5 | 0.328321 | `resourcehealth_service-health-events_list` | ❌ | + +--- + ## Test 325 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_metrics_definitions` **Prompt:** Get metric definitions for 
from the namespace @@ -5997,6 +11620,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.592640 | `monitor_metrics_definitions` | ✅ **EXPECTED** | | 2 | 0.424141 | `monitor_metrics_query` | ❌ | | 3 | 0.368006 | `bicepschema_get` | ❌ | @@ -6006,6 +11630,29 @@ --- ## Test 326 +======= +<<<<<<< HEAD +| 1 | 0.592676 | `monitor_metrics_definitions` | ✅ **EXPECTED** | +| 2 | 0.424006 | `monitor_metrics_query` | ❌ | +| 3 | 0.368319 | `bicepschema_get` | ❌ | +| 4 | 0.332356 | `monitor_table_type_list` | ❌ | +| 5 | 0.324986 | `resourcehealth_availability-status_get` | ❌ | + +--- + +## Test 316 +======= +| 1 | 0.592640 | `monitor_metrics_definitions` | ✅ **EXPECTED** | +| 2 | 0.424141 | `monitor_metrics_query` | ❌ | +| 3 | 0.368319 | `bicepschema_get` | ❌ | +| 4 | 0.332356 | `monitor_table_type_list` | ❌ | +| 5 | 0.323083 | `resourcehealth_availability-status_get` | ❌ | + +--- + +## Test 326 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_metrics_definitions` **Prompt:** Show me all available metrics and their definitions for storage account @@ -6014,15 +11661,37 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.607600 | `storage_account_get` | ❌ | | 2 | 0.587736 | `monitor_metrics_definitions` | ✅ **EXPECTED** | | 3 | 0.544043 | `storage_blob_container_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.607537 | `storage_account_get` | ❌ | +| 2 | 0.587640 | `monitor_metrics_definitions` | ✅ **EXPECTED** | +| 3 | 0.544781 | `storage_blob_container_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.495829 | `storage_blob_get` | ❌ | | 5 | 0.473421 | `managedlustre_fs_list` | ❌ | --- +<<<<<<< HEAD ## Test 327 +======= +## Test 317 +======= +| 1 | 0.607575 | `storage_account_get` | ❌ | +| 2 | 0.587736 | `monitor_metrics_definitions` | ✅ **EXPECTED** | +| 3 | 
0.544781 | `storage_blob_container_get` | ❌ | +| 4 | 0.495829 | `storage_blob_get` | ❌ | +| 5 | 0.473421 | `managedlustre_filesystem_list` | ❌ | + +--- + +## Test 327 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_metrics_definitions` **Prompt:** What metric definitions are available for the Application Insights resource @@ -6031,15 +11700,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD +| 1 | 0.633173 | `monitor_metrics_definitions` | ✅ **EXPECTED** | +| 2 | 0.495513 | `monitor_metrics_query` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.633132 | `monitor_metrics_definitions` | ✅ **EXPECTED** | +| 2 | 0.495439 | `monitor_metrics_query` | ❌ | +======= | 1 | 0.633173 | `monitor_metrics_definitions` | ✅ **EXPECTED** | | 2 | 0.495513 | `monitor_metrics_query` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.433945 | `monitor_resource_log_query` | ❌ | | 4 | 0.392960 | `loadtesting_testresource_list` | ❌ | | 5 | 0.388569 | `bicepschema_get` | ❌ | --- +<<<<<<< HEAD +## Test 328 +======= +<<<<<<< HEAD +## Test 318 +======= ## Test 328 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_metrics_query` **Prompt:** Analyze the performance trends and response times for Application Insights resource over the last @@ -6056,7 +11743,15 @@ --- +<<<<<<< HEAD +## Test 329 +======= +<<<<<<< HEAD +## Test 319 +======= ## Test 329 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_metrics_query` **Prompt:** Check the availability metrics for my Application Insights resource for the last @@ -6067,6 +11762,7 @@ 
|------|-------|------|--------| | 1 | 0.557830 | `monitor_metrics_query` | ✅ **EXPECTED** | | 2 | 0.476671 | `monitor_resource_log_query` | ❌ | +<<<<<<< HEAD | 3 | 0.460611 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.456360 | `quota_usage_check` | ❌ | | 5 | 0.438233 | `monitor_metrics_definitions` | ❌ | @@ -6074,6 +11770,25 @@ --- ## Test 330 +======= +<<<<<<< HEAD +| 3 | 0.460351 | `resourcehealth_availability-status_list` | ❌ | +| 4 | 0.456321 | `quota_usage_check` | ❌ | +| 5 | 0.438171 | `monitor_metrics_definitions` | ❌ | + +--- + +## Test 320 +======= +| 3 | 0.460611 | `resourcehealth_availability-status_list` | ❌ | +| 4 | 0.455904 | `quota_usage_check` | ❌ | +| 5 | 0.438233 | `monitor_metrics_definitions` | ❌ | + +--- + +## Test 330 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_metrics_query` **Prompt:** Get the metric for over the last with intervals @@ -6082,6 +11797,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.461249 | `monitor_metrics_query` | ✅ **EXPECTED** | | 2 | 0.390029 | `monitor_metrics_definitions` | ❌ | | 3 | 0.338557 | `monitor_resource_log_query` | ❌ | @@ -6091,6 +11807,29 @@ --- ## Test 331 +======= +<<<<<<< HEAD +| 1 | 0.461138 | `monitor_metrics_query` | ✅ **EXPECTED** | +| 2 | 0.389998 | `monitor_metrics_definitions` | ❌ | +| 3 | 0.338392 | `monitor_resource_log_query` | ❌ | +| 4 | 0.334417 | `resourcehealth_availability-status_get` | ❌ | +| 5 | 0.306224 | `resourcehealth_availability-status_list` | ❌ | + +--- + +## Test 321 +======= +| 1 | 0.461249 | `monitor_metrics_query` | ✅ **EXPECTED** | +| 2 | 0.390029 | `monitor_metrics_definitions` | ❌ | +| 3 | 0.338557 | `monitor_resource_log_query` | ❌ | +| 4 | 0.330533 | `resourcehealth_availability-status_get` | ❌ | +| 5 | 0.306338 | `resourcehealth_availability-status_list` | ❌ | + +--- + +## Test 331 +>>>>>>> 84ad4f44 
(update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_metrics_query` **Prompt:** Investigate error rates and failed requests for Application Insights resource for the last @@ -6100,14 +11839,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.496878 | `monitor_resource_log_query` | ❌ | +<<<<<<< HEAD | 2 | 0.492138 | `monitor_metrics_query` | ✅ **EXPECTED** | +======= +<<<<<<< HEAD +| 2 | 0.491782 | `monitor_metrics_query` | ✅ **EXPECTED** | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.448148 | `applens_resource_diagnose` | ❌ | | 4 | 0.412184 | `resourcehealth_health-events_list` | ❌ | | 5 | 0.397853 | `quota_usage_check` | ❌ | --- +<<<<<<< HEAD +## Test 332 +======= +## Test 322 +======= +| 2 | 0.492138 | `monitor_metrics_query` | ✅ **EXPECTED** | +| 3 | 0.448148 | `applens_resource_diagnose` | ❌ | +| 4 | 0.412184 | `resourcehealth_service-health-events_list` | ❌ | +| 5 | 0.397335 | `quota_usage_check` | ❌ | + +--- + ## Test 332 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_metrics_query` **Prompt:** Query the metric for for the last @@ -6116,6 +11874,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.525890 | `monitor_metrics_query` | ✅ **EXPECTED** | | 2 | 0.405838 | `monitor_resource_log_query` | ❌ | | 3 | 0.384811 | `monitor_metrics_definitions` | ❌ | @@ -6125,6 +11884,27 @@ --- ## Test 333 +======= +| 1 | 0.525326 | `monitor_metrics_query` | ✅ **EXPECTED** | +| 2 | 0.406185 | `monitor_resource_log_query` | ❌ | +<<<<<<< HEAD +| 3 | 0.384524 | `monitor_metrics_definitions` | ❌ | +| 4 | 0.347723 | `monitor_workspace_log_query` | ❌ | +| 5 | 0.330713 | `resourcehealth_availability-status_get` | ❌ | + +--- + +## Test 323 +======= +| 3 | 0.384482 | 
`monitor_metrics_definitions` | ❌ | +| 4 | 0.347723 | `monitor_workspace_log_query` | ❌ | +| 5 | 0.325967 | `resourcehealth_availability-status_get` | ❌ | + +--- + +## Test 333 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_metrics_query` **Prompt:** What's the request per second rate for my Application Insights resource over the last @@ -6136,12 +11916,29 @@ | 1 | 0.480140 | `monitor_metrics_query` | ✅ **EXPECTED** | | 2 | 0.444779 | `monitor_resource_log_query` | ❌ | | 3 | 0.388382 | `applens_resource_diagnose` | ❌ | +<<<<<<< HEAD | 4 | 0.363672 | `quota_usage_check` | ❌ | +======= +<<<<<<< HEAD +| 4 | 0.363640 | `quota_usage_check` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.350076 | `resourcehealth_health-events_list` | ❌ | --- +<<<<<<< HEAD +## Test 334 +======= +## Test 324 +======= +| 4 | 0.363412 | `quota_usage_check` | ❌ | +| 5 | 0.350076 | `resourcehealth_service-health-events_list` | ❌ | + +--- + ## Test 334 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_resource_log_query` **Prompt:** Show me the logs for the past hour for the resource in the Log Analytics workspace @@ -6152,6 +11949,7 @@ |------|-------|------|--------| | 1 | 0.687852 | `monitor_resource_log_query` | ✅ **EXPECTED** | | 2 | 0.621919 | `monitor_workspace_log_query` | ❌ | +<<<<<<< HEAD | 3 | 0.598393 | `monitor_activitylog_list` | ❌ | | 4 | 0.485528 | `deploy_app_logs_get` | ❌ | | 5 | 0.469703 | `monitor_metrics_query` | ❌ | @@ -6159,6 +11957,23 @@ --- ## Test 335 +======= +<<<<<<< HEAD +| 3 | 0.598436 | `monitor_activitylog_list` | ❌ | +======= +| 3 | 0.598393 | `monitor_activitylog_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.485633 | `deploy_app_logs_get` | ❌ | +| 5 | 0.470119 | 
`monitor_metrics_query` | ❌ | + +--- + +<<<<<<< HEAD +## Test 325 +======= +## Test 335 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_table_list` **Prompt:** List all tables in the Log Analytics workspace @@ -6167,15 +11982,36 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.851075 | `monitor_table_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.725693 | `monitor_table_type_list` | ❌ | | 3 | 0.620451 | `monitor_workspace_list` | ❌ | | 4 | 0.541928 | `kusto_table_list` | ❌ | +======= +| 2 | 0.725738 | `monitor_table_type_list` | ❌ | +| 3 | 0.620445 | `monitor_workspace_list` | ❌ | +| 4 | 0.541959 | `kusto_table_list` | ❌ | +======= +| 1 | 0.850522 | `monitor_table_list` | ✅ **EXPECTED** | +| 2 | 0.725738 | `monitor_table_type_list` | ❌ | +| 3 | 0.620445 | `monitor_workspace_list` | ❌ | +| 4 | 0.541928 | `kusto_table_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.539481 | `monitor_workspace_log_query` | ❌ | --- +<<<<<<< HEAD +## Test 336 +======= +<<<<<<< HEAD +## Test 326 +======= ## Test 336 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_table_list` **Prompt:** Show me the tables in the Log Analytics workspace @@ -6184,15 +12020,34 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.798459 | `monitor_table_list` | ✅ **EXPECTED** | | 2 | 0.701092 | `monitor_table_type_list` | ❌ | | 3 | 0.600003 | `monitor_workspace_list` | ❌ | | 4 | 0.542820 | `monitor_workspace_log_query` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.798460 | `monitor_table_list` | ✅ **EXPECTED** | +======= +| 1 | 0.798109 | `monitor_table_list` | ✅ **EXPECTED** | +>>>>>>> 84ad4f44 (update prompts and 
tool description evaluator) +| 2 | 0.701122 | `monitor_table_type_list` | ❌ | +| 3 | 0.599917 | `monitor_workspace_list` | ❌ | +| 4 | 0.542821 | `monitor_workspace_log_query` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.502882 | `monitor_resource_log_query` | ❌ | --- +<<<<<<< HEAD ## Test 337 +======= +<<<<<<< HEAD +## Test 327 +======= +## Test 337 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_table_type_list` **Prompt:** List all available table types in the Log Analytics workspace @@ -6201,15 +12056,36 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.881468 | `monitor_table_type_list` | ✅ **EXPECTED** | | 2 | 0.765694 | `monitor_table_list` | ❌ | | 3 | 0.570092 | `monitor_workspace_list` | ❌ | | 4 | 0.504683 | `mysql_table_list` | ❌ | +======= +| 1 | 0.881524 | `monitor_table_type_list` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.765702 | `monitor_table_list` | ❌ | +| 3 | 0.569921 | `monitor_workspace_list` | ❌ | +| 4 | 0.504789 | `mysql_table_list` | ❌ | +======= +| 2 | 0.765548 | `monitor_table_list` | ❌ | +| 3 | 0.569921 | `monitor_workspace_list` | ❌ | +| 4 | 0.504683 | `mysql_table_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.497622 | `monitor_workspace_log_query` | ❌ | --- +<<<<<<< HEAD +## Test 338 +======= +<<<<<<< HEAD +## Test 328 +======= ## Test 338 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_table_type_list` **Prompt:** Show me the available table types in the Log Analytics workspace @@ -6218,15 +12094,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.843110 | `monitor_table_type_list` | ✅ **EXPECTED** 
| | 2 | 0.736831 | `monitor_table_list` | ❌ | | 3 | 0.576934 | `monitor_workspace_list` | ❌ | +======= +| 1 | 0.843138 | `monitor_table_type_list` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.736837 | `monitor_table_list` | ❌ | +======= +| 2 | 0.736830 | `monitor_table_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.576731 | `monitor_workspace_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.509598 | `monitor_workspace_log_query` | ❌ | | 5 | 0.481189 | `mysql_table_list` | ❌ | --- +<<<<<<< HEAD +## Test 339 +======= +<<<<<<< HEAD +## Test 329 +======= ## Test 339 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_webtests_create` **Prompt:** Create a new Standard Web Test with name in my subscription in in a given @@ -6235,6 +12129,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.651084 | `monitor_webtests_create` | ✅ **EXPECTED** | | 2 | 0.570105 | `monitor_webtests_list` | ❌ | | 3 | 0.550426 | `monitor_webtests_update` | ❌ | @@ -6244,6 +12139,29 @@ --- ## Test 340 +======= +<<<<<<< HEAD +| 1 | 0.650749 | `monitor_webtests_create` | ✅ **EXPECTED** | +| 2 | 0.569999 | `monitor_webtests_list` | ❌ | +| 3 | 0.550088 | `monitor_webtests_update` | ❌ | +| 4 | 0.533466 | `monitor_webtests_get` | ❌ | +| 5 | 0.482122 | `loadtesting_testresource_create` | ❌ | + +--- + +## Test 330 +======= +| 1 | 0.650734 | `monitor_webtests_create` | ✅ **EXPECTED** | +| 2 | 0.572163 | `monitor_webtests_list` | ❌ | +| 3 | 0.550075 | `monitor_webtests_update` | ❌ | +| 4 | 0.533352 | `monitor_webtests_get` | ❌ | +| 5 | 0.482145 | `loadtesting_testresource_create` | ❌ | + +--- + +## Test 340 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_webtests_get` 
**Prompt:** Get Web Test details for in my subscription in @@ -6252,6 +12170,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.758910 | `monitor_webtests_get` | ✅ **EXPECTED** | | 2 | 0.725360 | `monitor_webtests_list` | ❌ | | 3 | 0.583663 | `loadtesting_testresource_list` | ❌ | @@ -6261,6 +12180,29 @@ --- ## Test 341 +======= +<<<<<<< HEAD +| 1 | 0.759380 | `monitor_webtests_get` | ✅ **EXPECTED** | +| 2 | 0.725337 | `monitor_webtests_list` | ❌ | +| 3 | 0.583816 | `loadtesting_testresource_list` | ❌ | +| 4 | 0.562797 | `monitor_webtests_update` | ❌ | +| 5 | 0.530557 | `monitor_webtests_create` | ❌ | + +--- + +## Test 331 +======= +| 1 | 0.759062 | `monitor_webtests_get` | ✅ **EXPECTED** | +| 2 | 0.726138 | `monitor_webtests_list` | ❌ | +| 3 | 0.583770 | `loadtesting_testresource_list` | ❌ | +| 4 | 0.562773 | `monitor_webtests_update` | ❌ | +| 5 | 0.530496 | `monitor_webtests_create` | ❌ | + +--- + +## Test 341 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_webtests_list` **Prompt:** List all Web Test resources in my subscription @@ -6269,7 +12211,12 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.730616 | `monitor_webtests_list` | ✅ **EXPECTED** | +======= +<<<<<<< HEAD +| 1 | 0.730568 | `monitor_webtests_list` | ✅ **EXPECTED** | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.610160 | `loadtesting_testresource_list` | ❌ | | 3 | 0.547708 | `grafana_list` | ❌ | | 4 | 0.520828 | `redis_list` | ❌ | @@ -6277,7 +12224,22 @@ --- +<<<<<<< HEAD +## Test 342 +======= +## Test 332 +======= +| 1 | 0.732801 | `monitor_webtests_list` | ✅ **EXPECTED** | +| 2 | 0.610160 | `loadtesting_testresource_list` | ❌ | +| 3 | 0.547708 | `grafana_list` | ❌ | +| 4 | 0.520829 | `redis_list` | ❌ | +| 5 | 0.496166 | `monitor_webtests_get` | ❌ | + +--- + ## Test 342 +>>>>>>> 
84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_webtests_list` **Prompt:** List all Web Test resources in my subscription in @@ -6286,15 +12248,35 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.793807 | `monitor_webtests_list` | ✅ **EXPECTED** | | 2 | 0.675965 | `loadtesting_testresource_list` | ❌ | | 3 | 0.584429 | `monitor_webtests_get` | ❌ | -| 4 | 0.573602 | `group_list` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.793702 | `monitor_webtests_list` | ✅ **EXPECTED** | +| 2 | 0.675965 | `loadtesting_testresource_list` | ❌ | +| 3 | 0.584942 | `monitor_webtests_get` | ❌ | +======= +| 1 | 0.793581 | `monitor_webtests_list` | ✅ **EXPECTED** | +| 2 | 0.675965 | `loadtesting_testresource_list` | ❌ | +| 3 | 0.584429 | `monitor_webtests_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) +| 4 | 0.573602 | `group_list` | ❌ | | 5 | 0.546088 | `resourcehealth_availability-status_list` | ❌ | --- +<<<<<<< HEAD ## Test 343 +======= +<<<<<<< HEAD +## Test 333 +======= +## Test 343 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_webtests_update` **Prompt:** Update an existing Standard Web Test with name in my subscription in in a given @@ -6303,6 +12285,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.686427 | `monitor_webtests_update` | ✅ **EXPECTED** | | 2 | 0.558816 | `monitor_webtests_get` | ❌ | | 3 | 0.557828 | `monitor_webtests_create` | ❌ | @@ -6312,6 +12295,29 @@ --- ## Test 344 +======= +<<<<<<< HEAD +| 1 | 0.686449 | `monitor_webtests_update` | ✅ **EXPECTED** | +| 2 | 0.559199 | `monitor_webtests_get` | ❌ | +| 3 | 0.558234 | `monitor_webtests_create` | ❌ | +| 4 | 0.553545 | 
`monitor_webtests_list` | ❌ | +| 5 | 0.508736 | `loadtesting_testrun_update` | ❌ | + +--- + +## Test 334 +======= +| 1 | 0.686466 | `monitor_webtests_update` | ✅ **EXPECTED** | +| 2 | 0.559612 | `monitor_webtests_get` | ❌ | +| 3 | 0.558102 | `monitor_webtests_create` | ❌ | +| 4 | 0.555899 | `monitor_webtests_list` | ❌ | +| 5 | 0.509033 | `loadtesting_testrun_update` | ❌ | + +--- + +## Test 344 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_workspace_list` **Prompt:** List all Log Analytics workspaces in my subscription @@ -6322,6 +12328,7 @@ |------|-------|------|--------| | 1 | 0.813871 | `monitor_workspace_list` | ✅ **EXPECTED** | | 2 | 0.680201 | `grafana_list` | ❌ | +<<<<<<< HEAD | 3 | 0.660127 | `monitor_table_list` | ❌ | | 4 | 0.610623 | `kusto_cluster_list` | ❌ | | 5 | 0.599636 | `search_service_list` | ❌ | @@ -6329,6 +12336,24 @@ --- ## Test 345 +======= +<<<<<<< HEAD +| 3 | 0.660135 | `monitor_table_list` | ❌ | +| 4 | 0.610623 | `kusto_cluster_list` | ❌ | +======= +| 3 | 0.659287 | `monitor_table_list` | ❌ | +| 4 | 0.610480 | `kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.600802 | `search_service_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 335 +======= +## Test 345 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_workspace_list` **Prompt:** Show me my Log Analytics workspaces @@ -6337,6 +12362,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.656159 | `monitor_workspace_list` | ✅ **EXPECTED** | | 2 | 0.585355 | `monitor_table_list` | ❌ | | 3 | 0.531036 | `monitor_table_type_list` | ❌ | @@ -6346,6 +12372,29 @@ --- ## Test 346 +======= +<<<<<<< HEAD +| 1 | 0.656194 | `monitor_workspace_list` | ✅ **EXPECTED** | +| 2 | 0.585436 | 
`monitor_table_list` | ❌ | +| 3 | 0.531083 | `monitor_table_type_list` | ❌ | +| 4 | 0.518254 | `grafana_list` | ❌ | +| 5 | 0.506772 | `monitor_workspace_log_query` | ❌ | + +--- + +## Test 336 +======= +| 1 | 0.656153 | `monitor_workspace_list` | ✅ **EXPECTED** | +| 2 | 0.584651 | `monitor_table_list` | ❌ | +| 3 | 0.531025 | `monitor_table_type_list` | ❌ | +| 4 | 0.518275 | `grafana_list` | ❌ | +| 5 | 0.506663 | `monitor_workspace_log_query` | ❌ | + +--- + +## Test 346 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_workspace_list` **Prompt:** Show me the Log Analytics workspaces in my subscription @@ -6356,13 +12405,31 @@ |------|-------|------|--------| | 1 | 0.732964 | `monitor_workspace_list` | ✅ **EXPECTED** | | 2 | 0.601481 | `grafana_list` | ❌ | +<<<<<<< HEAD | 3 | 0.580244 | `monitor_table_list` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.580261 | `monitor_table_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.523782 | `monitor_workspace_log_query` | ❌ | | 5 | 0.522749 | `kusto_cluster_list` | ❌ | --- +<<<<<<< HEAD ## Test 347 +======= +## Test 337 +======= +| 3 | 0.579582 | `monitor_table_list` | ❌ | +| 4 | 0.523782 | `monitor_workspace_log_query` | ❌ | +| 5 | 0.522605 | `kusto_cluster_list` | ❌ | + +--- + +## Test 347 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `monitor_workspace_log_query` **Prompt:** Show me the logs for the past hour in the Log Analytics workspace @@ -6371,6 +12438,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.610115 | `monitor_workspace_log_query` | ✅ **EXPECTED** | | 2 | 0.587614 | `monitor_resource_log_query` | ❌ | | 3 | 0.527733 | `monitor_activitylog_list` | ❌ | @@ -6379,7 +12447,22 @@ --- +<<<<<<< HEAD +## Test 348 +======= +## Test 338 
+======= +| 1 | 0.610116 | `monitor_workspace_log_query` | ✅ **EXPECTED** | +| 2 | 0.587644 | `monitor_resource_log_query` | ❌ | +| 3 | 0.527761 | `monitor_activitylog_list` | ❌ | +| 4 | 0.498255 | `deploy_app_logs_get` | ❌ | +| 5 | 0.485667 | `monitor_table_list` | ❌ | + +--- + ## Test 348 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `datadog_monitoredresources_list` **Prompt:** List all monitored resources in the Datadog resource @@ -6391,12 +12474,28 @@ | 1 | 0.668828 | `datadog_monitoredresources_list` | ✅ **EXPECTED** | | 2 | 0.454270 | `redis_list` | ❌ | | 3 | 0.413661 | `loadtesting_testresource_list` | ❌ | +<<<<<<< HEAD | 4 | 0.413173 | `monitor_metrics_query` | ❌ | +======= +<<<<<<< HEAD +| 4 | 0.413208 | `monitor_metrics_query` | ❌ | +======= +| 4 | 0.413173 | `monitor_metrics_query` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.401731 | `grafana_list` | ❌ | --- +<<<<<<< HEAD ## Test 349 +======= +<<<<<<< HEAD +## Test 339 +======= +## Test 349 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `datadog_monitoredresources_list` **Prompt:** Show me the monitored resources in the Datadog resource @@ -6413,7 +12512,15 @@ --- +<<<<<<< HEAD +## Test 350 +======= +<<<<<<< HEAD +## Test 340 +======= ## Test 350 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `extension_azqr` **Prompt:** Check my Azure subscription for any compliance issues or recommendations @@ -6422,15 +12529,36 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.533403 | `quota_usage_check` | ❌ | | 2 | 0.481143 | `azureterraformbestpractices_get` | ❌ | | 
3 | 0.476826 | `extension_azqr` | ✅ **EXPECTED** | +======= +<<<<<<< HEAD +| 1 | 0.533406 | `quota_usage_check` | ❌ | +| 2 | 0.481236 | `azureterraformbestpractices_get` | ❌ | +| 3 | 0.476761 | `extension_azqr` | ✅ **EXPECTED** | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.471547 | `subscription_list` | ❌ | +======= +| 1 | 0.533164 | `quota_usage_check` | ❌ | +| 2 | 0.481143 | `azureterraformbestpractices_get` | ❌ | +| 3 | 0.476826 | `extension_azqr` | ✅ **EXPECTED** | +| 4 | 0.471499 | `subscription_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.468404 | `applens_resource_diagnose` | ❌ | --- +<<<<<<< HEAD +## Test 351 +======= +<<<<<<< HEAD +## Test 341 +======= ## Test 351 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `extension_azqr` **Prompt:** Provide compliance recommendations for my current Azure subscription @@ -6439,15 +12567,32 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.532792 | `azureterraformbestpractices_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.532869 | `azureterraformbestpractices_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.492863 | `get_bestpractices_get` | ❌ | +======= +| 1 | 0.532792 | `azureterraformbestpractices_get` | ❌ | +| 2 | 0.492602 | `get_bestpractices_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 3 | 0.476164 | `applicationinsights_recommendation_list` | ❌ | | 4 | 0.473365 | `deploy_iac_rules_get` | ❌ | | 5 | 0.468491 | `azureaibestpractices_get` | ❌ | --- +<<<<<<< HEAD ## Test 352 +======= +<<<<<<< HEAD +## Test 342 +======= +## Test 352 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `extension_azqr` **Prompt:** Scan my Azure subscription for 
compliance recommendations @@ -6456,6 +12601,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.536917 | `azureterraformbestpractices_get` | ❌ | | 2 | 0.516910 | `extension_azqr` | ✅ **EXPECTED** | | 3 | 0.514947 | `applicationinsights_recommendation_list` | ❌ | @@ -6465,6 +12611,25 @@ --- ## Test 353 +======= +| 1 | 0.536984 | `azureterraformbestpractices_get` | ❌ | +| 2 | 0.516810 | `extension_azqr` | ✅ **EXPECTED** | +| 3 | 0.514978 | `applicationinsights_recommendation_list` | ❌ | +<<<<<<< HEAD +| 4 | 0.504929 | `quota_usage_check` | ❌ | +======= +| 4 | 0.504673 | `quota_usage_check` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.494872 | `deploy_plan_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 343 +======= +## Test 353 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `quota_region_availability_list` **Prompt:** Show me the available regions for these resource types @@ -6474,14 +12639,30 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.590878 | `quota_region_availability_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.413662 | `quota_usage_check` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.413577 | `quota_usage_check` | ❌ | +======= +| 2 | 0.413274 | `quota_usage_check` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.391332 | `redis_list` | ❌ | | 4 | 0.372940 | `resourcehealth_availability-status_list` | ❌ | | 5 | 0.369915 | `managedlustre_fs_sku_get` | ❌ | --- +<<<<<<< HEAD ## Test 354 +======= +<<<<<<< HEAD +## Test 344 +======= +## Test 354 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `quota_usage_check` **Prompt:** Check usage information for in region @@ 
-6490,15 +12671,37 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.609711 | `quota_usage_check` | ✅ **EXPECTED** | | 2 | 0.491058 | `quota_region_availability_list` | ❌ | | 3 | 0.384350 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.376819 | `resourcehealth_availability-status_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.609607 | `quota_usage_check` | ✅ **EXPECTED** | +| 2 | 0.491058 | `quota_region_availability_list` | ❌ | +| 3 | 0.384500 | `resourcehealth_availability-status_list` | ❌ | +| 4 | 0.376368 | `resourcehealth_availability-status_get` | ❌ | +======= +| 1 | 0.609244 | `quota_usage_check` | ✅ **EXPECTED** | +| 2 | 0.491058 | `quota_region_availability_list` | ❌ | +| 3 | 0.384350 | `resourcehealth_availability-status_list` | ❌ | +| 4 | 0.374248 | `resourcehealth_availability-status_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.371407 | `redis_list` | ❌ | --- +<<<<<<< HEAD +## Test 355 +======= +<<<<<<< HEAD +## Test 345 +======= ## Test 355 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `role_assignment_list` **Prompt:** List all available role assignments in my subscription @@ -6511,11 +12714,19 @@ | 2 | 0.539757 | `subscription_list` | ❌ | | 3 | 0.483988 | `group_list` | ❌ | | 4 | 0.478700 | `grafana_list` | ❌ | -| 5 | 0.471364 | `cosmos_account_list` | ❌ | +| 5 | 0.471431 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD ## Test 356 +======= +<<<<<<< HEAD +## Test 346 +======= +## Test 356 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `role_assignment_list` **Prompt:** Show me the available role assignments in my subscription @@ -6532,7 +12743,15 @@ --- +<<<<<<< HEAD +## Test 357 +======= 
+<<<<<<< HEAD +## Test 347 +======= ## Test 357 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `redis_list` **Prompt:** List all Redis resources in my subscription @@ -6541,6 +12760,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.810504 | `redis_list` | ✅ **EXPECTED** | | 2 | 0.587836 | `grafana_list` | ❌ | | 3 | 0.512954 | `kusto_cluster_list` | ❌ | @@ -6550,6 +12770,29 @@ --- ## Test 358 +======= +<<<<<<< HEAD +| 1 | 0.810487 | `redis_list` | ✅ **EXPECTED** | +| 2 | 0.587872 | `grafana_list` | ❌ | +| 3 | 0.512995 | `kusto_cluster_list` | ❌ | +| 4 | 0.508555 | `datadog_monitoredresources_list` | ❌ | +| 5 | 0.501183 | `postgres_server_list` | ❌ | + +--- + +## Test 348 +======= +| 1 | 0.810504 | `redis_list` | ✅ **EXPECTED** | +| 2 | 0.587836 | `grafana_list` | ❌ | +| 3 | 0.512970 | `kusto_cluster_list` | ❌ | +| 4 | 0.508531 | `datadog_monitoredresources_list` | ❌ | +| 5 | 0.501218 | `postgres_server_list` | ❌ | + +--- + +## Test 358 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `redis_list` **Prompt:** Show me my Redis resources @@ -6561,12 +12804,28 @@ | 1 | 0.685128 | `redis_list` | ✅ **EXPECTED** | | 2 | 0.374327 | `grafana_list` | ❌ | | 3 | 0.364197 | `datadog_monitoredresources_list` | ❌ | +<<<<<<< HEAD | 4 | 0.359659 | `mysql_server_list` | ❌ | +======= +<<<<<<< HEAD +| 4 | 0.359709 | `mysql_server_list` | ❌ | +======= +| 4 | 0.359659 | `mysql_server_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.331502 | `mysql_database_list` | ❌ | --- +<<<<<<< HEAD ## Test 359 +======= +<<<<<<< HEAD +## Test 349 +======= +## Test 359 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 
(update prompts and tool description evaluator) **Expected Tool:** `redis_list` **Prompt:** Show me the Redis resources in my subscription @@ -6578,12 +12837,29 @@ | 1 | 0.781228 | `redis_list` | ✅ **EXPECTED** | | 2 | 0.539177 | `grafana_list` | ❌ | | 3 | 0.449276 | `datadog_monitoredresources_list` | ❌ | +<<<<<<< HEAD | 4 | 0.449014 | `postgres_server_list` | ❌ | +======= +<<<<<<< HEAD +| 4 | 0.448989 | `postgres_server_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.442854 | `kusto_cluster_list` | ❌ | --- +<<<<<<< HEAD ## Test 360 +======= +## Test 350 +======= +| 4 | 0.449014 | `postgres_server_list` | ❌ | +| 5 | 0.442860 | `kusto_cluster_list` | ❌ | + +--- + +## Test 360 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `redis_list` **Prompt:** Show me my Redis caches @@ -6595,12 +12871,29 @@ | 1 | 0.572767 | `redis_list` | ✅ **EXPECTED** | | 2 | 0.316630 | `mysql_database_list` | ❌ | | 3 | 0.301786 | `postgres_database_list` | ❌ | +<<<<<<< HEAD | 4 | 0.286513 | `mysql_server_list` | ❌ | +======= +<<<<<<< HEAD +| 4 | 0.286570 | `mysql_server_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.273014 | `kusto_cluster_list` | ❌ | --- +<<<<<<< HEAD ## Test 361 +======= +## Test 351 +======= +| 4 | 0.286513 | `mysql_server_list` | ❌ | +| 5 | 0.272972 | `kusto_cluster_list` | ❌ | + +--- + +## Test 361 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `redis_list` **Prompt:** Get Redis clusters @@ -6609,6 +12902,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.478070 | `redis_list` | ✅ **EXPECTED** | | 2 | 0.456308 | `kusto_cluster_list` | ❌ | | 3 | 0.384630 | `kusto_cluster_get` | ❌ | @@ -6618,6 +12912,29 @@ --- ## Test 362 +======= +<<<<<<< HEAD +| 1 | 
0.478109 | `redis_list` | ✅ **EXPECTED** | +| 2 | 0.456382 | `kusto_cluster_list` | ❌ | +| 3 | 0.384637 | `kusto_cluster_get` | ❌ | +| 4 | 0.359466 | `kusto_database_list` | ❌ | +| 5 | 0.343367 | `aks_cluster_get` | ❌ | + +--- + +## Test 352 +======= +| 1 | 0.478070 | `redis_list` | ✅ **EXPECTED** | +| 2 | 0.456311 | `kusto_cluster_list` | ❌ | +| 3 | 0.384630 | `kusto_cluster_get` | ❌ | +| 4 | 0.359797 | `kusto_database_list` | ❌ | +| 5 | 0.343305 | `aks_cluster_get` | ❌ | + +--- + +## Test 362 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `group_list` **Prompt:** List all resource groups in my subscription @@ -6627,14 +12944,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.755935 | `group_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.566552 | `workbooks_list` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.566497 | `workbooks_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.564566 | `loadtesting_testresource_list` | ❌ | | 4 | 0.552633 | `datadog_monitoredresources_list` | ❌ | | 5 | 0.549477 | `monitor_webtests_list` | ❌ | --- +<<<<<<< HEAD ## Test 363 +======= +## Test 353 +======= +| 2 | 0.566552 | `workbooks_list` | ❌ | +| 3 | 0.564566 | `loadtesting_testresource_list` | ❌ | +| 4 | 0.552633 | `datadog_monitoredresources_list` | ❌ | +| 5 | 0.546156 | `resourcehealth_availability-status_list` | ❌ | + +--- + +## Test 363 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `group_list` **Prompt:** Show me my resource groups @@ -6646,12 +12982,28 @@ | 1 | 0.529504 | `group_list` | ✅ **EXPECTED** | | 2 | 0.464690 | `redis_list` | ❌ | | 3 | 0.463685 | `datadog_monitoredresources_list` | ❌ | +<<<<<<< HEAD +| 4 | 0.462391 | `mysql_server_list` | ❌ | +======= +<<<<<<< HEAD +| 4 | 0.462388 | `mysql_server_list` | ❌ 
| +======= | 4 | 0.462391 | `mysql_server_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.460280 | `loadtesting_testresource_list` | ❌ | --- +<<<<<<< HEAD +## Test 364 +======= +<<<<<<< HEAD +## Test 354 +======= ## Test 364 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `group_list` **Prompt:** Show me the resource groups in my subscription @@ -6668,7 +13020,15 @@ --- +<<<<<<< HEAD +## Test 365 +======= +<<<<<<< HEAD +## Test 355 +======= ## Test 365 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `resourcehealth_availability-status_get` **Prompt:** Get the availability status for resource @@ -6677,15 +13037,37 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.556926 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | | 2 | 0.538273 | `resourcehealth_availability-status_list` | ❌ | | 3 | 0.378030 | `quota_usage_check` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.556629 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | +| 2 | 0.538277 | `resourcehealth_availability-status_list` | ❌ | +| 3 | 0.377966 | `quota_usage_check` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.373112 | `monitor_healthmodels_entity_get` | ❌ | | 5 | 0.349981 | `datadog_monitoredresources_list` | ❌ | --- +<<<<<<< HEAD ## Test 366 +======= +## Test 356 +======= +| 1 | 0.555432 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | +| 2 | 0.538273 | `resourcehealth_availability-status_list` | ❌ | +| 3 | 0.404305 | `foundry_openai_models-list` | ❌ | +| 4 | 0.377586 | `quota_usage_check` | ❌ | +| 5 | 0.373112 | `monitor_healthmodels_entity_gethealth` | ❌ | + +--- + +## Test 366 +>>>>>>> 84ad4f44 
(update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `resourcehealth_availability-status_get` **Prompt:** Show me the health status of the storage account @@ -6694,15 +13076,37 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.576591 | `storage_account_get` | ❌ | | 2 | 0.564706 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | | 3 | 0.555636 | `storage_blob_container_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.576617 | `storage_account_get` | ❌ | +| 2 | 0.564128 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | +| 3 | 0.556167 | `storage_blob_container_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.487207 | `storage_blob_get` | ❌ | | 5 | 0.466885 | `resourcehealth_availability-status_list` | ❌ | --- +<<<<<<< HEAD ## Test 367 +======= +## Test 357 +======= +| 1 | 0.576591 | `storage_account_get` | ❌ | +| 2 | 0.566633 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | +| 3 | 0.556167 | `storage_blob_container_get` | ❌ | +| 4 | 0.487207 | `storage_blob_get` | ❌ | +| 5 | 0.466885 | `resourcehealth_availability-status_list` | ❌ | + +--- + +## Test 367 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `resourcehealth_availability-status_get` **Prompt:** What is the availability status of virtual machine in resource group ? 
@@ -6711,15 +13115,37 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.577398 | `resourcehealth_availability-status_list` | ❌ | | 2 | 0.502794 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | | 3 | 0.424939 | `mysql_server_list` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.577529 | `resourcehealth_availability-status_list` | ❌ | +| 2 | 0.501568 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | +| 3 | 0.424957 | `mysql_server_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.412025 | `loadtesting_testresource_list` | ❌ | | 5 | 0.393479 | `managedlustre_fs_list` | ❌ | --- +<<<<<<< HEAD ## Test 368 +======= +## Test 358 +======= +| 1 | 0.577398 | `resourcehealth_availability-status_list` | ❌ | +| 2 | 0.502457 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | +| 3 | 0.424939 | `mysql_server_list` | ❌ | +| 4 | 0.413484 | `foundry_openai_models-list` | ❌ | +| 5 | 0.412025 | `loadtesting_testresource_list` | ❌ | + +--- + +## Test 368 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `resourcehealth_availability-status_list` **Prompt:** List availability status for all resources in my subscription @@ -6736,7 +13162,15 @@ --- +<<<<<<< HEAD +## Test 369 +======= +<<<<<<< HEAD +## Test 359 +======= ## Test 369 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `resourcehealth_availability-status_list` **Prompt:** Show me the health status of all my Azure resources @@ -6745,15 +13179,37 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.644982 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | | 2 | 0.544917 | `resourcehealth_availability-status_get` | ❌ | | 3 | 0.509740 | `resourcehealth_health-events_list` | ❌ | | 4 | 
0.508766 | `quota_usage_check` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.644908 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | +| 2 | 0.545208 | `resourcehealth_availability-status_get` | ❌ | +| 3 | 0.509740 | `resourcehealth_health-events_list` | ❌ | +| 4 | 0.508703 | `quota_usage_check` | ❌ | +======= +| 1 | 0.644982 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | +| 2 | 0.546520 | `resourcehealth_availability-status_get` | ❌ | +| 3 | 0.509740 | `resourcehealth_service-health-events_list` | ❌ | +| 4 | 0.508252 | `quota_usage_check` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.505776 | `redis_list` | ❌ | --- +<<<<<<< HEAD +## Test 370 +======= +<<<<<<< HEAD +## Test 360 +======= ## Test 370 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `resourcehealth_availability-status_list` **Prompt:** What resources in resource group have health issues? 
@@ -6762,15 +13218,34 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.596890 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | | 2 | 0.550812 | `resourcehealth_availability-status_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.596817 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | +| 2 | 0.549900 | `resourcehealth_availability-status_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.496640 | `resourcehealth_health-events_list` | ❌ | +======= +| 1 | 0.596890 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | +| 2 | 0.551332 | `resourcehealth_availability-status_get` | ❌ | +| 3 | 0.496640 | `resourcehealth_service-health-events_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.441921 | `applens_resource_diagnose` | ❌ | | 5 | 0.433614 | `loadtesting_testresource_list` | ❌ | --- +<<<<<<< HEAD +## Test 371 +======= +<<<<<<< HEAD +## Test 361 +======= ## Test 371 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** List all service health events in my subscription @@ -6779,6 +13254,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.690720 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | | 2 | 0.553485 | `search_service_list` | ❌ | | 3 | 0.534169 | `eventgrid_topic_list` | ❌ | @@ -6788,6 +13264,25 @@ --- ## Test 372 +======= +<<<<<<< HEAD +| 1 | 0.690719 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | +======= +| 1 | 0.690719 | `resourcehealth_service-health-events_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.554895 | `search_service_list` | ❌ | +| 3 | 0.534250 | `eventgrid_topic_list` | ❌ | +| 4 | 0.529761 | `eventgrid_subscription_list` | ❌ | +| 5 | 0.518595 | 
`resourcehealth_availability-status_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 362 +======= +## Test 372 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** Show me Azure service health events for subscription @@ -6796,15 +13291,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.686448 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.534707 | `eventgrid_subscription_list` | ❌ | | 3 | 0.513302 | `search_service_list` | ❌ | | 4 | 0.513237 | `eventgrid_topic_list` | ❌ | +======= +======= +| 1 | 0.686448 | `resourcehealth_service-health-events_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.534556 | `eventgrid_subscription_list` | ❌ | +| 3 | 0.513815 | `search_service_list` | ❌ | +| 4 | 0.513259 | `eventgrid_topic_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.501121 | `subscription_list` | ❌ | --- +<<<<<<< HEAD ## Test 373 +======= +<<<<<<< HEAD +## Test 363 +======= +## Test 373 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** What service issues have occurred in the last 30 days? 
@@ -6813,6 +13326,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.450841 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | | 2 | 0.267663 | `applens_resource_diagnose` | ❌ | | 3 | 0.245720 | `cloudarchitect_design` | ❌ | @@ -6822,6 +13336,29 @@ --- ## Test 374 +======= +<<<<<<< HEAD +| 1 | 0.450909 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | +| 2 | 0.267752 | `applens_resource_diagnose` | ❌ | +| 3 | 0.245709 | `cloudarchitect_design` | ❌ | +| 4 | 0.217130 | `resourcehealth_availability-status_list` | ❌ | +| 5 | 0.211900 | `search_service_list` | ❌ | + +--- + +## Test 364 +======= +| 1 | 0.450841 | `resourcehealth_service-health-events_list` | ❌ | +| 2 | 0.267663 | `applens_resource_diagnose` | ❌ | +| 3 | 0.245720 | `cloudarchitect_design` | ❌ | +| 4 | 0.216847 | `resourcehealth_availability-status_list` | ❌ | +| 5 | 0.211842 | `search_service_list` | ❌ | + +--- + +## Test 374 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** List active service health events in my subscription @@ -6830,7 +13367,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.685391 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.527255 | `eventgrid_subscription_list` | ❌ | | 3 | 0.523975 | `eventgrid_topic_list` | ❌ | | 4 | 0.518668 | `search_service_list` | ❌ | @@ -6839,6 +13378,23 @@ --- ## Test 375 +======= +======= +| 1 | 0.685391 | `resourcehealth_service-health-events_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.527905 | `eventgrid_subscription_list` | ❌ | +| 3 | 0.524063 | `eventgrid_topic_list` | ❌ | +| 4 | 0.520197 | `search_service_list` | ❌ | +| 5 | 0.502345 | `resourcehealth_availability-status_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 365 +======= +## Test 
375 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** Show me planned maintenance events for my Azure services @@ -6847,7 +13403,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.565851 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.436322 | `search_service_list` | ❌ | | 3 | 0.404191 | `eventgrid_subscription_list` | ❌ | | 4 | 0.402493 | `resourcehealth_availability-status_list` | ❌ | @@ -6856,6 +13414,23 @@ --- ## Test 376 +======= +======= +| 1 | 0.565851 | `resourcehealth_service-health-events_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.437868 | `search_service_list` | ❌ | +| 3 | 0.403665 | `eventgrid_subscription_list` | ❌ | +| 4 | 0.402532 | `resourcehealth_availability-status_list` | ❌ | +| 5 | 0.398084 | `quota_usage_check` | ❌ | + +--- + +<<<<<<< HEAD +## Test 366 +======= +## Test 376 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `servicebus_queue_details` **Prompt:** Show me the details of service bus queue @@ -6868,11 +13443,27 @@ | 2 | 0.460932 | `servicebus_topic_subscription_details` | ❌ | | 3 | 0.437000 | `servicebus_topic_details` | ❌ | | 4 | 0.385812 | `search_knowledge_base_get` | ❌ | +<<<<<<< HEAD | 5 | 0.384139 | `storage_account_get` | ❌ | --- ## Test 377 +======= +<<<<<<< HEAD +| 5 | 0.384133 | `storage_account_get` | ❌ | + +--- + +## Test 367 +======= +| 5 | 0.384187 | `storage_account_get` | ❌ | + +--- + +## Test 377 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `servicebus_topic_details` **Prompt:** Show me the details of service bus topic @@ -6889,7 +13480,15 @@ --- 
+<<<<<<< HEAD +## Test 378 +======= +<<<<<<< HEAD +## Test 368 +======= ## Test 378 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `servicebus_topic_subscription_details` **Prompt:** Show me the details of service bus subscription @@ -6906,7 +13505,15 @@ --- +<<<<<<< HEAD ## Test 379 +======= +<<<<<<< HEAD +## Test 369 +======= +## Test 379 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `signalr_runtime_get` **Prompt:** Show me the details of SignalR @@ -6923,7 +13530,15 @@ --- +<<<<<<< HEAD +## Test 380 +======= +<<<<<<< HEAD +## Test 370 +======= ## Test 380 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `signalr_runtime_get` **Prompt:** Show me the network information of SignalR runtime @@ -6940,7 +13555,15 @@ --- +<<<<<<< HEAD +## Test 381 +======= +<<<<<<< HEAD +## Test 371 +======= ## Test 381 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `signalr_runtime_get` **Prompt:** Describe the SignalR runtime in resource group @@ -6952,12 +13575,30 @@ | 1 | 0.710281 | `signalr_runtime_get` | ✅ **EXPECTED** | | 2 | 0.411396 | `loadtesting_testresource_list` | ❌ | | 3 | 0.410606 | `foundry_resource_get` | ❌ | +<<<<<<< HEAD | 4 | 0.399412 | `resourcehealth_availability-status_list` | ❌ | | 5 | 0.382028 | `sql_server_list` | ❌ | --- ## Test 382 +======= +<<<<<<< HEAD +| 4 | 0.399745 | `resourcehealth_availability-status_list` | ❌ | +| 5 | 0.382472 | `sql_server_list` | ❌ | + +--- + +## Test 372 +======= +| 4 | 0.399412 | `resourcehealth_availability-status_list` | ❌ | +| 5 | 0.382152 | `sql_server_list` | ❌ | + +--- + +## Test 382 +>>>>>>> 84ad4f44 (update 
prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `signalr_runtime_get` **Prompt:** Get information about my SignalR runtime in @@ -6966,6 +13607,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.715701 | `signalr_runtime_get` | ✅ **EXPECTED** | | 2 | 0.458894 | `foundry_resource_get` | ❌ | | 3 | 0.431212 | `resourcehealth_availability-status_list` | ❌ | @@ -6975,6 +13617,29 @@ --- ## Test 383 +======= +<<<<<<< HEAD +| 1 | 0.715913 | `signalr_runtime_get` | ✅ **EXPECTED** | +| 2 | 0.459979 | `foundry_resource_get` | ❌ | +| 3 | 0.431800 | `resourcehealth_availability-status_list` | ❌ | +| 4 | 0.431393 | `loadtesting_testresource_list` | ❌ | +| 5 | 0.417497 | `functionapp_get` | ❌ | + +--- + +## Test 373 +======= +| 1 | 0.715937 | `signalr_runtime_get` | ✅ **EXPECTED** | +| 2 | 0.459543 | `foundry_resource_get` | ❌ | +| 3 | 0.431534 | `resourcehealth_availability-status_list` | ❌ | +| 4 | 0.430926 | `loadtesting_testresource_list` | ❌ | +| 5 | 0.417653 | `functionapp_get` | ❌ | + +--- + +## Test 383 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `signalr_runtime_get` **Prompt:** Show all the SignalRs information in @@ -6985,13 +13650,29 @@ |------|-------|------|--------| | 1 | 0.563883 | `signalr_runtime_get` | ✅ **EXPECTED** | | 2 | 0.501077 | `redis_list` | ❌ | +<<<<<<< HEAD +| 3 | 0.494478 | `resourcehealth_availability-status_list` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.494808 | `resourcehealth_availability-status_list` | ❌ | +======= | 3 | 0.494478 | `resourcehealth_availability-status_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.481428 | `loadtesting_testresource_list` | ❌ | | 5 | 0.462090 | `mysql_server_list` | ❌ | --- +<<<<<<< HEAD ## 
Test 384 +======= +<<<<<<< HEAD +## Test 374 +======= +## Test 384 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `signalr_runtime_get` **Prompt:** List all SignalRs in my subscription @@ -7003,12 +13684,25 @@ | 1 | 0.530514 | `signalr_runtime_get` | ✅ **EXPECTED** | | 2 | 0.507654 | `postgres_server_list` | ❌ | | 3 | 0.495157 | `redis_list` | ❌ | +<<<<<<< HEAD | 4 | 0.494498 | `kusto_cluster_list` | ❌ | | 5 | 0.487906 | `subscription_list` | ❌ | --- +<<<<<<< HEAD +## Test 385 +======= +## Test 375 +======= +| 4 | 0.494513 | `kusto_cluster_list` | ❌ | +| 5 | 0.487856 | `subscription_list` | ❌ | + +--- + ## Test 385 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_db_create` **Prompt:** Create a new SQL database named in server @@ -7025,7 +13719,15 @@ --- +<<<<<<< HEAD ## Test 386 +======= +<<<<<<< HEAD +## Test 376 +======= +## Test 386 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_db_create` **Prompt:** Create a SQL database with Basic tier in server @@ -7036,13 +13738,30 @@ |------|-------|------|--------| | 1 | 0.571760 | `sql_db_create` | ✅ **EXPECTED** | | 2 | 0.459672 | `sql_server_create` | ❌ | +<<<<<<< HEAD | 3 | 0.437525 | `sql_server_delete` | ❌ | +======= +| 3 | 0.437526 | `sql_server_delete` | ❌ | +<<<<<<< HEAD +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.420843 | `sql_db_show` | ❌ | | 5 | 0.417661 | `sql_db_delete` | ❌ | --- +<<<<<<< HEAD +## Test 387 +======= +## Test 377 +======= +| 4 | 0.424021 | `appservice_database_add` | ❌ | +| 5 | 0.420843 | `sql_db_show` | ❌ | + +--- + ## Test 387 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description 
evaluator) **Expected Tool:** `sql_db_create` **Prompt:** Create a new database called on SQL server in resource group @@ -7055,11 +13774,27 @@ | 2 | 0.545906 | `sql_server_create` | ❌ | | 3 | 0.503938 | `sql_db_rename` | ❌ | | 4 | 0.494377 | `sql_db_show` | ❌ | +<<<<<<< HEAD +| 5 | 0.473975 | `sql_db_list` | ❌ | + +--- + +## Test 388 +======= +<<<<<<< HEAD +| 5 | 0.473859 | `sql_db_list` | ❌ | + +--- + +## Test 378 +======= | 5 | 0.473975 | `sql_db_list` | ❌ | --- ## Test 388 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_db_delete` **Prompt:** Delete the SQL database from server @@ -7076,7 +13811,15 @@ --- +<<<<<<< HEAD ## Test 389 +======= +<<<<<<< HEAD +## Test 379 +======= +## Test 389 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_db_delete` **Prompt:** Remove database from SQL server in resource group @@ -7085,6 +13828,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.567513 | `sql_server_delete` | ❌ | | 2 | 0.543440 | `sql_db_delete` | ✅ **EXPECTED** | | 3 | 0.500756 | `sql_db_show` | ❌ | @@ -7094,6 +13838,29 @@ --- ## Test 390 +======= +<<<<<<< HEAD +| 1 | 0.567481 | `sql_server_delete` | ❌ | +| 2 | 0.543378 | `sql_db_delete` | ✅ **EXPECTED** | +| 3 | 0.500746 | `sql_db_show` | ❌ | +| 4 | 0.480981 | `sql_db_rename` | ❌ | +| 5 | 0.478583 | `sql_db_list` | ❌ | + +--- + +## Test 380 +======= +| 1 | 0.567513 | `sql_server_delete` | ❌ | +| 2 | 0.543440 | `sql_db_delete` | ✅ **EXPECTED** | +| 3 | 0.500756 | `sql_db_show` | ❌ | +| 4 | 0.481083 | `sql_db_rename` | ❌ | +| 5 | 0.478729 | `sql_db_list` | ❌ | + +--- + +## Test 390 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_db_delete` **Prompt:** 
Delete the database called on server @@ -7110,7 +13877,15 @@ --- +<<<<<<< HEAD ## Test 391 +======= +<<<<<<< HEAD +## Test 381 +======= +## Test 391 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_db_list` **Prompt:** List all databases in the Azure SQL server @@ -7119,6 +13894,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.643138 | `sql_db_list` | ✅ **EXPECTED** | | 2 | 0.639644 | `mysql_database_list` | ❌ | | 3 | 0.609116 | `postgres_database_list` | ❌ | @@ -7128,6 +13904,29 @@ --- ## Test 392 +======= +<<<<<<< HEAD +| 1 | 0.643202 | `sql_db_list` | ✅ **EXPECTED** | +| 2 | 0.639694 | `mysql_database_list` | ❌ | +| 3 | 0.609178 | `postgres_database_list` | ❌ | +| 4 | 0.602890 | `cosmos_database_list` | ❌ | +| 5 | 0.570103 | `kusto_database_list` | ❌ | + +--- + +## Test 382 +======= +| 1 | 0.643186 | `sql_db_list` | ✅ **EXPECTED** | +| 2 | 0.639694 | `mysql_database_list` | ❌ | +| 3 | 0.609178 | `postgres_database_list` | ❌ | +| 4 | 0.602890 | `cosmos_database_list` | ❌ | +| 5 | 0.569739 | `kusto_database_list` | ❌ | + +--- + +## Test 392 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_db_list` **Prompt:** Show me all the databases configuration details in the Azure SQL server @@ -7137,14 +13936,30 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.617746 | `sql_server_show` | ❌ | +<<<<<<< HEAD +| 2 | 0.609322 | `sql_db_list` | ✅ **EXPECTED** | +======= +<<<<<<< HEAD +| 2 | 0.609291 | `sql_db_list` | ✅ **EXPECTED** | +======= | 2 | 0.609322 | `sql_db_list` | ✅ **EXPECTED** | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.557353 | `mysql_database_list` | ❌ | | 4 | 0.553488 | 
`mysql_server_config_get` | ❌ | | 5 | 0.524274 | `sql_db_show` | ❌ | --- +<<<<<<< HEAD +## Test 393 +======= +<<<<<<< HEAD +## Test 383 +======= ## Test 393 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_db_rename` **Prompt:** Rename the SQL database on server to @@ -7153,6 +13968,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.593251 | `sql_db_rename` | ✅ **EXPECTED** | | 2 | 0.425282 | `sql_server_delete` | ❌ | | 3 | 0.416207 | `sql_db_delete` | ❌ | @@ -7162,6 +13978,29 @@ --- ## Test 394 +======= +<<<<<<< HEAD +| 1 | 0.593308 | `sql_db_rename` | ✅ **EXPECTED** | +| 2 | 0.425296 | `sql_server_delete` | ❌ | +| 3 | 0.416187 | `sql_db_delete` | ❌ | +| 4 | 0.396109 | `sql_db_create` | ❌ | +| 5 | 0.345991 | `sql_db_show` | ❌ | + +--- + +## Test 384 +======= +| 1 | 0.593348 | `sql_db_rename` | ✅ **EXPECTED** | +| 2 | 0.425282 | `sql_server_delete` | ❌ | +| 3 | 0.416207 | `sql_db_delete` | ❌ | +| 4 | 0.396947 | `sql_db_create` | ❌ | +| 5 | 0.346018 | `sql_db_show` | ❌ | + +--- + +## Test 394 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_db_rename` **Prompt:** Rename my Azure SQL database to on server @@ -7170,6 +14009,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.711257 | `sql_db_rename` | ✅ **EXPECTED** | | 2 | 0.516770 | `sql_server_delete` | ❌ | | 3 | 0.506834 | `sql_db_delete` | ❌ | @@ -7179,6 +14019,29 @@ --- ## Test 395 +======= +<<<<<<< HEAD +| 1 | 0.710788 | `sql_db_rename` | ✅ **EXPECTED** | +| 2 | 0.516432 | `sql_server_delete` | ❌ | +| 3 | 0.506388 | `sql_db_delete` | ❌ | +| 4 | 0.500926 | `sql_db_create` | ❌ | +| 5 | 0.434133 | `sql_server_show` | ❌ | + +--- + +## Test 385 +======= +| 1 | 0.710925 | `sql_db_rename` | ✅ **EXPECTED** | +| 2 | 0.516662 | 
`sql_server_delete` | ❌ | +| 3 | 0.506572 | `sql_db_delete` | ❌ | +| 4 | 0.501347 | `sql_db_create` | ❌ | +| 5 | 0.433966 | `sql_server_show` | ❌ | + +--- + +## Test 395 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_db_show` **Prompt:** Get the configuration details for the SQL database on server @@ -7187,6 +14050,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.610991 | `sql_server_show` | ❌ | | 2 | 0.593150 | `postgres_server_config_get` | ❌ | | 3 | 0.530422 | `mysql_server_config_get` | ❌ | @@ -7196,6 +14060,29 @@ --- ## Test 396 +======= +<<<<<<< HEAD +| 1 | 0.611215 | `sql_server_show` | ❌ | +| 2 | 0.593200 | `postgres_server_config_get` | ❌ | +| 3 | 0.530520 | `mysql_server_config_get` | ❌ | +| 4 | 0.528378 | `sql_db_show` | ✅ **EXPECTED** | +| 5 | 0.465779 | `sql_db_list` | ❌ | + +--- + +## Test 386 +======= +| 1 | 0.610991 | `sql_server_show` | ❌ | +| 2 | 0.593150 | `postgres_server_config_get` | ❌ | +| 3 | 0.530422 | `mysql_server_config_get` | ❌ | +| 4 | 0.528136 | `sql_db_show` | ✅ **EXPECTED** | +| 5 | 0.465693 | `sql_db_list` | ❌ | + +--- + +## Test 396 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_db_show` **Prompt:** Show me the details of SQL database in server @@ -7204,6 +14091,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.530095 | `sql_db_show` | ✅ **EXPECTED** | | 2 | 0.503681 | `sql_server_show` | ❌ | | 3 | 0.440073 | `sql_db_list` | ❌ | @@ -7213,6 +14101,29 @@ --- ## Test 397 +======= +<<<<<<< HEAD +| 1 | 0.530071 | `sql_db_show` | ✅ **EXPECTED** | +| 2 | 0.503602 | `sql_server_show` | ❌ | +| 3 | 0.439895 | `sql_db_list` | ❌ | +| 4 | 0.438615 | `mysql_table_schema_get` | ❌ | +| 5 | 0.432907 | `mysql_database_list` | ❌ | + +--- + +## Test 
387 +======= +| 1 | 0.530095 | `sql_db_show` | ✅ **EXPECTED** | +| 2 | 0.503681 | `sql_server_show` | ❌ | +| 3 | 0.440073 | `sql_db_list` | ❌ | +| 4 | 0.438622 | `mysql_table_schema_get` | ❌ | +| 5 | 0.432919 | `mysql_database_list` | ❌ | + +--- + +## Test 397 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_db_update` **Prompt:** Update the performance tier of SQL database on server @@ -7221,6 +14132,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.603271 | `sql_db_update` | ✅ **EXPECTED** | | 2 | 0.467571 | `sql_db_create` | ❌ | | 3 | 0.440442 | `sql_db_rename` | ❌ | @@ -7230,6 +14142,29 @@ --- ## Test 398 +======= +<<<<<<< HEAD +| 1 | 0.603537 | `sql_db_update` | ✅ **EXPECTED** | +| 2 | 0.467332 | `sql_db_create` | ❌ | +| 3 | 0.440688 | `sql_db_rename` | ❌ | +| 4 | 0.427542 | `sql_db_show` | ❌ | +| 5 | 0.414267 | `sql_server_delete` | ❌ | + +--- + +## Test 388 +======= +| 1 | 0.603360 | `sql_db_update` | ✅ **EXPECTED** | +| 2 | 0.467590 | `sql_db_create` | ❌ | +| 3 | 0.440550 | `sql_db_rename` | ❌ | +| 4 | 0.427654 | `sql_db_show` | ❌ | +| 5 | 0.414041 | `sql_server_delete` | ❌ | + +--- + +## Test 398 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_db_update` **Prompt:** Scale SQL database on server to use SKU @@ -7238,6 +14173,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.550449 | `sql_db_update` | ✅ **EXPECTED** | | 2 | 0.418358 | `sql_server_delete` | ❌ | | 3 | 0.401817 | `sql_db_list` | ❌ | @@ -7247,6 +14183,29 @@ --- ## Test 399 +======= +<<<<<<< HEAD +| 1 | 0.550501 | `sql_db_update` | ✅ **EXPECTED** | +| 2 | 0.418334 | `sql_server_delete` | ❌ | +| 3 | 0.401717 | `sql_db_list` | ❌ | +| 4 | 0.395462 | `sql_db_rename` | ❌ | +| 5 | 0.394705 | `sql_db_show` | 
❌ | + +--- + +## Test 389 +======= +| 1 | 0.550556 | `sql_db_update` | ✅ **EXPECTED** | +| 2 | 0.418358 | `sql_server_delete` | ❌ | +| 3 | 0.401817 | `sql_db_list` | ❌ | +| 4 | 0.395518 | `sql_db_rename` | ❌ | +| 5 | 0.394770 | `sql_db_show` | ❌ | + +--- + +## Test 399 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_elastic-pool_list` **Prompt:** List all elastic pools in SQL server @@ -7256,14 +14215,30 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.678124 | `sql_elastic-pool_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.502376 | `sql_db_list` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.502382 | `sql_db_list` | ❌ | +======= +| 2 | 0.502376 | `sql_db_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.498367 | `mysql_database_list` | ❌ | | 4 | 0.485249 | `aks_nodepool_get` | ❌ | | 5 | 0.479044 | `sql_server_show` | ❌ | --- +<<<<<<< HEAD +## Test 400 +======= +<<<<<<< HEAD +## Test 390 +======= ## Test 400 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_elastic-pool_list` **Prompt:** Show me the elastic pools configured for SQL server @@ -7272,6 +14247,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.606425 | `sql_elastic-pool_list` | ✅ **EXPECTED** | | 2 | 0.502877 | `sql_server_show` | ❌ | | 3 | 0.457164 | `sql_db_list` | ❌ | @@ -7281,6 +14257,29 @@ --- ## Test 401 +======= +<<<<<<< HEAD +| 1 | 0.606478 | `sql_elastic-pool_list` | ✅ **EXPECTED** | +| 2 | 0.502977 | `sql_server_show` | ❌ | +| 3 | 0.457262 | `sql_db_list` | ❌ | +| 4 | 0.450790 | `aks_nodepool_get` | ❌ | +| 5 | 0.432867 | `mysql_database_list` | ❌ | + +--- + +## Test 391 +======= +| 1 | 0.606425 | `sql_elastic-pool_list` | 
✅ **EXPECTED** | +| 2 | 0.502877 | `sql_server_show` | ❌ | +| 3 | 0.457163 | `sql_db_list` | ❌ | +| 4 | 0.450743 | `aks_nodepool_get` | ❌ | +| 5 | 0.432816 | `mysql_database_list` | ❌ | + +--- + +## Test 401 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_elastic-pool_list` **Prompt:** What elastic pools are available in my SQL server ? @@ -7292,12 +14291,30 @@ | 1 | 0.592709 | `sql_elastic-pool_list` | ✅ **EXPECTED** | | 2 | 0.420325 | `mysql_database_list` | ❌ | | 3 | 0.407169 | `aks_nodepool_get` | ❌ | +<<<<<<< HEAD | 4 | 0.402616 | `mysql_server_list` | ❌ | | 5 | 0.397670 | `sql_db_list` | ❌ | --- ## Test 402 +======= +<<<<<<< HEAD +| 4 | 0.402602 | `mysql_server_list` | ❌ | +| 5 | 0.397708 | `sql_db_list` | ❌ | + +--- + +## Test 392 +======= +| 4 | 0.402616 | `mysql_server_list` | ❌ | +| 5 | 0.397670 | `sql_db_list` | ❌ | + +--- + +## Test 402 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_create` **Prompt:** Create a new Azure SQL server named in resource group @@ -7306,6 +14323,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.682605 | `sql_server_create` | ✅ **EXPECTED** | | 2 | 0.563707 | `sql_db_create` | ❌ | | 3 | 0.529198 | `sql_server_list` | ❌ | @@ -7315,6 +14333,29 @@ --- ## Test 403 +======= +<<<<<<< HEAD +| 1 | 0.682198 | `sql_server_create` | ✅ **EXPECTED** | +| 2 | 0.563307 | `sql_db_create` | ❌ | +| 3 | 0.529314 | `sql_server_list` | ❌ | +| 4 | 0.481645 | `storage_account_create` | ❌ | +| 5 | 0.473844 | `sql_db_rename` | ❌ | + +--- + +## Test 393 +======= +| 1 | 0.682812 | `sql_server_create` | ✅ **EXPECTED** | +| 2 | 0.563994 | `sql_db_create` | ❌ | +| 3 | 0.529755 | `sql_server_list` | ❌ | +| 4 | 0.482437 | `storage_account_create` | ❌ | +| 5 | 0.474643 | `sql_db_rename` | ❌ | + 
+--- + +## Test 403 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_create` **Prompt:** Create an Azure SQL server with name in location with admin user @@ -7323,15 +14364,36 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.618354 | `sql_server_create` | ✅ **EXPECTED** | | 2 | 0.510222 | `sql_db_create` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.618244 | `sql_server_create` | ✅ **EXPECTED** | +| 2 | 0.510507 | `sql_db_create` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.472462 | `sql_server_show` | ❌ | | 4 | 0.441267 | `sql_server_delete` | ❌ | | 5 | 0.400941 | `sql_db_rename` | ❌ | --- +<<<<<<< HEAD +## Test 404 +======= +## Test 394 +======= +| 1 | 0.618309 | `sql_server_create` | ✅ **EXPECTED** | +| 2 | 0.510169 | `sql_db_create` | ❌ | +| 3 | 0.472463 | `sql_server_show` | ❌ | +| 4 | 0.441174 | `sql_server_delete` | ❌ | +| 5 | 0.400939 | `sql_db_rename` | ❌ | + +--- + ## Test 404 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_create` **Prompt:** Set up a new SQL server called in my resource group @@ -7341,6 +14403,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.589818 | `sql_server_create` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.501403 | `sql_db_create` | ❌ | | 3 | 0.497890 | `sql_server_list` | ❌ | | 4 | 0.461147 | `sql_db_rename` | ❌ | @@ -7349,6 +14412,27 @@ --- ## Test 405 +======= +<<<<<<< HEAD +| 2 | 0.500874 | `sql_db_create` | ❌ | +| 3 | 0.498255 | `sql_server_list` | ❌ | +| 4 | 0.461181 | `sql_db_rename` | ❌ | +| 5 | 0.442984 | `mysql_server_list` | ❌ | + +--- + +## Test 395 +======= +| 2 | 0.501403 | `sql_db_create` | ❌ | +| 3 | 0.498298 | `sql_server_list` | ❌ | +| 4 | 0.461181 | `sql_db_rename` | ❌ | +| 5 | 0.442934 | 
`mysql_server_list` | ❌ | + +--- + +## Test 405 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_delete` **Prompt:** Delete the Azure SQL server from resource group @@ -7359,13 +14443,29 @@ |------|-------|------|--------| | 1 | 0.656593 | `sql_server_delete` | ✅ **EXPECTED** | | 2 | 0.548064 | `sql_db_delete` | ❌ | +<<<<<<< HEAD | 3 | 0.518037 | `sql_server_list` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.518306 | `sql_server_list` | ❌ | +======= +| 3 | 0.518201 | `sql_server_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.495550 | `sql_server_create` | ❌ | | 5 | 0.483132 | `workbooks_delete` | ❌ | --- +<<<<<<< HEAD +## Test 406 +======= +<<<<<<< HEAD +## Test 396 +======= ## Test 406 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_delete` **Prompt:** Remove the SQL server from my subscription @@ -7378,11 +14478,27 @@ | 2 | 0.393885 | `postgres_server_list` | ❌ | | 3 | 0.379760 | `sql_db_delete` | ❌ | | 4 | 0.376660 | `sql_server_show` | ❌ | +<<<<<<< HEAD | 5 | 0.350103 | `sql_server_list` | ❌ | --- ## Test 407 +======= +<<<<<<< HEAD +| 5 | 0.350384 | `sql_server_list` | ❌ | + +--- + +## Test 397 +======= +| 5 | 0.350173 | `sql_server_list` | ❌ | + +--- + +## Test 407 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_delete` **Prompt:** Delete SQL server permanently @@ -7395,11 +14511,27 @@ | 2 | 0.454892 | `sql_db_delete` | ❌ | | 3 | 0.362561 | `sql_server_firewall-rule_delete` | ❌ | | 4 | 0.341503 | `sql_server_show` | ❌ | +<<<<<<< HEAD | 5 | 0.318758 | `eventhubs_eventhub_delete` | ❌ | --- ## Test 408 +======= +<<<<<<< HEAD +| 5 
| 0.319013 | `eventhubs_eventhub_delete` | ❌ | + +--- + +## Test 398 +======= +| 5 | 0.318758 | `eventhubs_eventhub_delete` | ❌ | + +--- + +## Test 408 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_entra-admin_list` **Prompt:** List Microsoft Entra ID administrators for SQL server @@ -7410,6 +14542,7 @@ |------|-------|------|--------| | 1 | 0.783479 | `sql_server_entra-admin_list` | ✅ **EXPECTED** | | 2 | 0.456051 | `sql_server_show` | ❌ | +<<<<<<< HEAD | 3 | 0.434868 | `sql_server_list` | ❌ | | 4 | 0.401854 | `sql_server_firewall-rule_list` | ❌ | | 5 | 0.376055 | `sql_db_list` | ❌ | @@ -7417,6 +14550,25 @@ --- ## Test 409 +======= +<<<<<<< HEAD +| 3 | 0.434565 | `sql_server_list` | ❌ | +| 4 | 0.401908 | `sql_server_firewall-rule_list` | ❌ | +| 5 | 0.375977 | `sql_db_list` | ❌ | + +--- + +## Test 399 +======= +| 3 | 0.434776 | `sql_server_list` | ❌ | +| 4 | 0.401880 | `sql_server_firewall-rule_list` | ❌ | +| 5 | 0.376055 | `sql_db_list` | ❌ | + +--- + +## Test 409 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_entra-admin_list` **Prompt:** Show me the Entra ID administrators configured for SQL server @@ -7427,6 +14579,7 @@ |------|-------|------|--------| | 1 | 0.713306 | `sql_server_entra-admin_list` | ✅ **EXPECTED** | | 2 | 0.413144 | `sql_server_show` | ❌ | +<<<<<<< HEAD | 3 | 0.368082 | `sql_server_list` | ❌ | | 4 | 0.315966 | `sql_db_list` | ❌ | | 5 | 0.311085 | `postgres_server_list` | ❌ | @@ -7434,6 +14587,25 @@ --- ## Test 410 +======= +<<<<<<< HEAD +| 3 | 0.367692 | `sql_server_list` | ❌ | +| 4 | 0.315939 | `sql_db_list` | ❌ | +| 5 | 0.311071 | `postgres_server_list` | ❌ | + +--- + +## Test 400 +======= +| 3 | 0.368018 | `sql_server_list` | ❌ | +| 4 | 0.315966 | `sql_db_list` | ❌ | +| 5 | 0.311085 | `postgres_server_list` | ❌ 
| + +--- + +## Test 410 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_entra-admin_list` **Prompt:** What Microsoft Entra ID administrators are set up for my SQL server ? @@ -7444,13 +14616,29 @@ |------|-------|------|--------| | 1 | 0.646419 | `sql_server_entra-admin_list` | ✅ **EXPECTED** | | 2 | 0.356025 | `sql_server_show` | ❌ | +<<<<<<< HEAD | 3 | 0.322155 | `sql_server_list` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.322084 | `sql_server_list` | ❌ | +======= +| 3 | 0.322362 | `sql_server_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.307823 | `sql_server_create` | ❌ | | 5 | 0.269788 | `sql_server_delete` | ❌ | --- +<<<<<<< HEAD +## Test 411 +======= +<<<<<<< HEAD +## Test 401 +======= ## Test 411 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_firewall-rule_create` **Prompt:** Create a firewall rule for my Azure SQL server @@ -7459,15 +14647,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.635467 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | | 2 | 0.532658 | `sql_server_firewall-rule_list` | ❌ | | 3 | 0.522133 | `sql_server_firewall-rule_delete` | ❌ | +======= +| 1 | 0.635466 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.532712 | `sql_server_firewall-rule_list` | ❌ | +======= +| 2 | 0.532682 | `sql_server_firewall-rule_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.522184 | `sql_server_firewall-rule_delete` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.448822 | `sql_server_create` | ❌ | | 5 | 0.440845 | `sql_server_delete` | ❌ | --- +<<<<<<< HEAD +## Test 412 +======= +<<<<<<< 
HEAD +## Test 402 +======= ## Test 412 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_firewall-rule_create` **Prompt:** Add a firewall rule to allow access from IP range to for SQL server @@ -7476,6 +14682,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.670392 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | | 2 | 0.533587 | `sql_server_firewall-rule_list` | ❌ | | 3 | 0.503740 | `sql_server_firewall-rule_delete` | ❌ | @@ -7485,6 +14692,29 @@ --- ## Test 413 +======= +<<<<<<< HEAD +| 1 | 0.670233 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | +| 2 | 0.533669 | `sql_server_firewall-rule_list` | ❌ | +| 3 | 0.503500 | `sql_server_firewall-rule_delete` | ❌ | +| 4 | 0.316954 | `sql_server_list` | ❌ | +| 5 | 0.302510 | `sql_server_delete` | ❌ | + +--- + +## Test 403 +======= +| 1 | 0.670189 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | +| 2 | 0.533532 | `sql_server_firewall-rule_list` | ❌ | +| 3 | 0.503648 | `sql_server_firewall-rule_delete` | ❌ | +| 4 | 0.316667 | `sql_server_list` | ❌ | +| 5 | 0.302362 | `sql_server_delete` | ❌ | + +--- + +## Test 413 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_firewall-rule_create` **Prompt:** Create a new firewall rule named for SQL server @@ -7493,6 +14723,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.685125 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | | 2 | 0.574393 | `sql_server_firewall-rule_list` | ❌ | | 3 | 0.539643 | `sql_server_firewall-rule_delete` | ❌ | @@ -7502,6 +14733,28 @@ --- ## Test 414 +======= +| 1 | 0.685107 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.574336 | `sql_server_firewall-rule_list` | ❌ | +| 3 | 0.539577 | 
`sql_server_firewall-rule_delete` | ❌ | +| 4 | 0.428919 | `sql_server_create` | ❌ | +| 5 | 0.394446 | `sql_db_create` | ❌ | + +--- + +## Test 404 +======= +| 2 | 0.574310 | `sql_server_firewall-rule_list` | ❌ | +| 3 | 0.539577 | `sql_server_firewall-rule_delete` | ❌ | +| 4 | 0.428919 | `sql_server_create` | ❌ | +| 5 | 0.395165 | `sql_db_create` | ❌ | + +--- + +## Test 414 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_firewall-rule_delete` **Prompt:** Delete a firewall rule from my Azure SQL server @@ -7512,13 +14765,25 @@ |------|-------|------|--------| | 1 | 0.691498 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | | 2 | 0.584379 | `sql_server_delete` | ❌ | +<<<<<<< HEAD | 3 | 0.543780 | `sql_server_firewall-rule_list` | ❌ | +======= +| 3 | 0.543839 | `sql_server_firewall-rule_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.540333 | `sql_server_firewall-rule_create` | ❌ | | 5 | 0.498444 | `sql_db_delete` | ❌ | --- +<<<<<<< HEAD +## Test 415 +======= +<<<<<<< HEAD +## Test 405 +======= ## Test 415 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_firewall-rule_delete` **Prompt:** Remove the firewall rule from SQL server @@ -7527,15 +14792,28 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.670233 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | | 2 | 0.574296 | `sql_server_firewall-rule_list` | ❌ | +======= +| 1 | 0.670179 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | +| 2 | 0.574321 | `sql_server_firewall-rule_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.530419 | `sql_server_firewall-rule_create` | ❌ | | 4 | 0.488418 | `sql_server_delete` | ❌ | | 5 | 0.360381 | `sql_db_delete` | ❌ | --- +<<<<<<< 
HEAD +## Test 416 +======= +<<<<<<< HEAD +## Test 406 +======= ## Test 416 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_firewall-rule_delete` **Prompt:** Delete firewall rule for SQL server @@ -7544,15 +14822,28 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.671298 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | | 2 | 0.601174 | `sql_server_firewall-rule_list` | ❌ | +======= +| 1 | 0.671212 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | +| 2 | 0.601217 | `sql_server_firewall-rule_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.577330 | `sql_server_firewall-rule_create` | ❌ | | 4 | 0.499272 | `sql_server_delete` | ❌ | | 5 | 0.378586 | `sql_db_delete` | ❌ | --- +<<<<<<< HEAD +## Test 417 +======= +<<<<<<< HEAD +## Test 407 +======= ## Test 417 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_firewall-rule_list` **Prompt:** List all firewall rules for SQL server @@ -7561,15 +14852,35 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.729336 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | +======= +| 1 | 0.729320 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.549667 | `sql_server_firewall-rule_create` | ❌ | | 3 | 0.513187 | `sql_server_firewall-rule_delete` | ❌ | | 4 | 0.468812 | `sql_server_show` | ❌ | +<<<<<<< HEAD | 5 | 0.418817 | `sql_server_list` | ❌ | --- ## Test 418 +======= +<<<<<<< HEAD +| 5 | 0.418869 | `sql_server_list` | ❌ | + +--- + +## Test 408 +======= +| 5 | 0.418738 | `sql_server_list` | ❌ | + +--- + +## Test 418 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update 
prompts and tool description evaluator) **Expected Tool:** `sql_server_firewall-rule_list` **Prompt:** Show me the firewall rules for SQL server @@ -7582,11 +14893,27 @@ | 2 | 0.524126 | `sql_server_firewall-rule_create` | ❌ | | 3 | 0.476792 | `sql_server_firewall-rule_delete` | ❌ | | 4 | 0.410680 | `sql_server_show` | ❌ | +<<<<<<< HEAD | 5 | 0.348100 | `sql_server_list` | ❌ | --- ## Test 419 +======= +<<<<<<< HEAD +| 5 | 0.348249 | `sql_server_list` | ❌ | + +--- + +## Test 409 +======= +| 5 | 0.348096 | `sql_server_list` | ❌ | + +--- + +## Test 419 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_firewall-rule_list` **Prompt:** What firewall rules are configured for my SQL server ? @@ -7595,15 +14922,35 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.630460 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | +======= +| 1 | 0.630494 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.532454 | `sql_server_firewall-rule_create` | ❌ | | 3 | 0.473596 | `sql_server_firewall-rule_delete` | ❌ | | 4 | 0.412957 | `sql_server_show` | ❌ | +<<<<<<< HEAD | 5 | 0.350513 | `sql_server_list` | ❌ | --- ## Test 420 +======= +<<<<<<< HEAD +| 5 | 0.350545 | `sql_server_list` | ❌ | + +--- + +## Test 410 +======= +| 5 | 0.350474 | `sql_server_list` | ❌ | + +--- + +## Test 420 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_list` **Prompt:** List all Azure SQL servers in resource group @@ -7612,15 +14959,35 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.694404 | `sql_server_list` | ✅ **EXPECTED** | | 2 | 0.596686 | `mysql_server_list` | ❌ | | 3 | 0.578238 | `sql_db_list` | ❌ | +======= +<<<<<<< HEAD +| 
1 | 0.694268 | `sql_server_list` | ✅ **EXPECTED** | +| 2 | 0.596720 | `mysql_server_list` | ❌ | +| 3 | 0.578135 | `sql_db_list` | ❌ | +======= +| 1 | 0.694306 | `sql_server_list` | ✅ **EXPECTED** | +| 2 | 0.596686 | `mysql_server_list` | ❌ | +| 3 | 0.578239 | `sql_db_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.515851 | `sql_elastic-pool_list` | ❌ | | 5 | 0.509789 | `sql_db_show` | ❌ | --- +<<<<<<< HEAD ## Test 421 +======= +<<<<<<< HEAD +## Test 411 +======= +## Test 421 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_list` **Prompt:** Show me every SQL server available in resource group @@ -7629,15 +14996,37 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.618218 | `sql_server_list` | ✅ **EXPECTED** | | 2 | 0.593837 | `mysql_server_list` | ❌ | | 3 | 0.542398 | `sql_db_list` | ❌ | | 4 | 0.507404 | `resourcehealth_availability-status_list` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.618206 | `sql_server_list` | ✅ **EXPECTED** | +| 2 | 0.593874 | `mysql_server_list` | ❌ | +| 3 | 0.542307 | `sql_db_list` | ❌ | +| 4 | 0.507683 | `resourcehealth_availability-status_list` | ❌ | +======= +| 1 | 0.618222 | `sql_server_list` | ✅ **EXPECTED** | +| 2 | 0.593837 | `mysql_server_list` | ❌ | +| 3 | 0.542398 | `sql_db_list` | ❌ | +| 4 | 0.507404 | `resourcehealth_availability-status_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.496200 | `group_list` | ❌ | --- +<<<<<<< HEAD +## Test 422 +======= +<<<<<<< HEAD +## Test 412 +======= ## Test 422 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_show` **Prompt:** 
Show me the details of Azure SQL server in resource group @@ -7648,6 +15037,7 @@ |------|-------|------|--------| | 1 | 0.629672 | `sql_db_show` | ❌ | | 2 | 0.595184 | `sql_server_show` | ✅ **EXPECTED** | +<<<<<<< HEAD | 3 | 0.587728 | `sql_server_list` | ❌ | | 4 | 0.559893 | `mysql_server_list` | ❌ | | 5 | 0.540218 | `sql_db_list` | ❌ | @@ -7655,6 +15045,25 @@ --- ## Test 423 +======= +<<<<<<< HEAD +| 3 | 0.587826 | `sql_server_list` | ❌ | +| 4 | 0.559936 | `mysql_server_list` | ❌ | +| 5 | 0.540037 | `sql_db_list` | ❌ | + +--- + +## Test 413 +======= +| 3 | 0.587806 | `sql_server_list` | ❌ | +| 4 | 0.559893 | `mysql_server_list` | ❌ | +| 5 | 0.540218 | `sql_db_list` | ❌ | + +--- + +## Test 423 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_show` **Prompt:** Get the configuration details for SQL server @@ -7671,7 +15080,15 @@ --- +<<<<<<< HEAD +## Test 424 +======= +<<<<<<< HEAD +## Test 414 +======= ## Test 424 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `sql_server_show` **Prompt:** Display the properties of SQL server @@ -7682,13 +15099,30 @@ |------|-------|------|--------| | 1 | 0.563143 | `sql_server_show` | ✅ **EXPECTED** | | 2 | 0.392532 | `postgres_server_config_get` | ❌ | +<<<<<<< HEAD | 3 | 0.380035 | `postgres_server_param_get` | ❌ | | 4 | 0.372102 | `sql_server_firewall-rule_list` | ❌ | +======= +| 3 | 0.380021 | `postgres_server_param_get` | ❌ | +<<<<<<< HEAD +| 4 | 0.372194 | `sql_server_firewall-rule_list` | ❌ | +======= +| 4 | 0.372172 | `sql_server_firewall-rule_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.370539 | `sql_db_show` | ❌ | --- +<<<<<<< HEAD +## Test 425 +======= +<<<<<<< HEAD +## Test 415 +======= ## Test 425 
+>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_account_create` **Prompt:** Create a new storage account called testaccount123 in East US region @@ -7699,6 +15133,7 @@ |------|-------|------|--------| | 1 | 0.533552 | `storage_account_create` | ✅ **EXPECTED** | | 2 | 0.438046 | `storage_blob_container_create` | ❌ | +<<<<<<< HEAD | 3 | 0.418191 | `storage_account_get` | ❌ | | 4 | 0.413950 | `storage_blob_container_get` | ❌ | | 5 | 0.373651 | `managedlustre_fs_create` | ❌ | @@ -7706,6 +15141,23 @@ --- ## Test 426 +======= +<<<<<<< HEAD +| 3 | 0.418002 | `storage_account_get` | ❌ | +======= +| 3 | 0.418134 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.414518 | `storage_blob_container_get` | ❌ | +| 5 | 0.370957 | `managedlustre_fs_create` | ❌ | + +--- + +<<<<<<< HEAD +## Test 416 +======= +## Test 426 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_account_create` **Prompt:** Create a storage account with premium performance and LRS replication @@ -7715,14 +15167,32 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.500638 | `storage_account_create` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.484584 | `managedlustre_fs_create` | ❌ | | 3 | 0.407222 | `storage_account_get` | ❌ | +======= +<<<<<<< HEAD +| 2 | 0.483202 | `managedlustre_fs_create` | ❌ | +| 3 | 0.407182 | `storage_account_get` | ❌ | +======= +| 2 | 0.483202 | `managedlustre_filesystem_create` | ❌ | +| 3 | 0.407200 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.406804 | `storage_blob_container_create` | ❌ | | 5 | 0.400134 | `managedlustre_fs_sku_get` | ❌ | --- +<<<<<<< HEAD +## Test 427 
+======= +<<<<<<< HEAD +## Test 417 +======= ## Test 427 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_account_create` **Prompt:** Create a new storage account with Data Lake Storage Gen2 enabled @@ -7734,12 +15204,30 @@ | 1 | 0.589002 | `storage_account_create` | ✅ **EXPECTED** | | 2 | 0.538023 | `managedlustre_fs_create` | ❌ | | 3 | 0.509731 | `storage_blob_container_create` | ❌ | +<<<<<<< HEAD | 4 | 0.462519 | `storage_account_get` | ❌ | | 5 | 0.447156 | `sql_db_create` | ❌ | --- ## Test 428 +======= +<<<<<<< HEAD +| 4 | 0.462494 | `storage_account_get` | ❌ | +| 5 | 0.447560 | `sql_db_create` | ❌ | + +--- + +## Test 418 +======= +| 4 | 0.462480 | `storage_account_get` | ❌ | +| 5 | 0.447156 | `sql_db_create` | ❌ | + +--- + +## Test 428 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_account_get` **Prompt:** Show me the details for my storage account @@ -7748,6 +15236,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.673750 | `storage_account_get` | ✅ **EXPECTED** | | 2 | 0.607762 | `storage_blob_container_get` | ❌ | | 3 | 0.556457 | `storage_blob_get` | ❌ | @@ -7757,6 +15246,29 @@ --- ## Test 429 +======= +<<<<<<< HEAD +| 1 | 0.673569 | `storage_account_get` | ✅ **EXPECTED** | +| 2 | 0.608073 | `storage_blob_container_get` | ❌ | +| 3 | 0.556407 | `storage_blob_get` | ❌ | +| 4 | 0.483573 | `storage_account_create` | ❌ | +| 5 | 0.439109 | `cosmos_account_list` | ❌ | + +--- + +## Test 419 +======= +| 1 | 0.673754 | `storage_account_get` | ✅ **EXPECTED** | +| 2 | 0.608256 | `storage_blob_container_get` | ❌ | +| 3 | 0.556457 | `storage_blob_get` | ❌ | +| 4 | 0.483435 | `storage_account_create` | ❌ | +| 5 | 0.439187 | `cosmos_account_list` | ❌ | + +--- + +## Test 429 +>>>>>>> 84ad4f44 (update prompts and 
tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_account_get` **Prompt:** Get details about the storage account @@ -7765,15 +15277,32 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.692687 | `storage_account_get` | ✅ **EXPECTED** | | 2 | 0.577173 | `storage_blob_container_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.692473 | `storage_account_get` | ✅ **EXPECTED** | +======= +| 1 | 0.692698 | `storage_account_get` | ✅ **EXPECTED** | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.577547 | `storage_blob_container_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.529205 | `storage_blob_get` | ❌ | | 4 | 0.518215 | `storage_account_create` | ❌ | | 5 | 0.448506 | `storage_blob_container_create` | ❌ | --- +<<<<<<< HEAD +## Test 430 +======= +<<<<<<< HEAD +## Test 420 +======= ## Test 430 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_account_get` **Prompt:** List all storage accounts in my subscription including their location and SKU @@ -7782,15 +15311,37 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.649215 | `storage_account_get` | ✅ **EXPECTED** | | 2 | 0.557093 | `managedlustre_fs_sku_get` | ❌ | | 3 | 0.549448 | `storage_blob_container_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.649393 | `storage_account_get` | ✅ **EXPECTED** | +| 2 | 0.557016 | `managedlustre_fs_sku_get` | ❌ | +| 3 | 0.550148 | `storage_blob_container_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.547577 | `subscription_list` | ❌ | | 5 | 0.536909 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD +## Test 431 +======= +## Test 421 +======= +| 1 | 0.649191 | `storage_account_get` | ✅ **EXPECTED** | +| 2 | 0.557016 | 
`managedlustre_filesystem_sku_get` | ❌ | +| 3 | 0.550148 | `storage_blob_container_get` | ❌ | +| 4 | 0.547647 | `subscription_list` | ❌ | +| 5 | 0.536912 | `cosmos_account_list` | ❌ | + +--- + ## Test 431 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_account_get` **Prompt:** Show me my storage accounts with whether hierarchical namespace (HNS) is enabled @@ -7799,15 +15350,36 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.556860 | `storage_account_get` | ✅ **EXPECTED** | | 2 | 0.481664 | `storage_blob_container_get` | ❌ | | 3 | 0.461284 | `managedlustre_fs_list` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.557064 | `storage_account_get` | ✅ **EXPECTED** | +| 2 | 0.482418 | `storage_blob_container_get` | ❌ | +| 3 | 0.461308 | `managedlustre_fs_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.421642 | `cosmos_account_list` | ❌ | +======= +| 1 | 0.556930 | `storage_account_get` | ✅ **EXPECTED** | +| 2 | 0.482418 | `storage_blob_container_get` | ❌ | +| 3 | 0.461284 | `managedlustre_filesystem_list` | ❌ | +| 4 | 0.421671 | `cosmos_account_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.410587 | `storage_blob_get` | ❌ | --- +<<<<<<< HEAD +## Test 432 +======= +<<<<<<< HEAD +## Test 422 +======= ## Test 432 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_account_get` **Prompt:** Show me the storage accounts in my subscription and include HTTPS-only and public blob access settings @@ -7816,15 +15388,36 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.619462 | `storage_account_get` | ✅ **EXPECTED** | | 2 | 0.555677 | `storage_blob_container_get` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.619639 | 
`storage_account_get` | ✅ **EXPECTED** | +| 2 | 0.556436 | `storage_blob_container_get` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.518229 | `storage_blob_get` | ❌ | | 4 | 0.473598 | `cosmos_account_list` | ❌ | | 5 | 0.465527 | `subscription_list` | ❌ | --- +<<<<<<< HEAD +## Test 433 +======= +## Test 423 +======= +| 1 | 0.619491 | `storage_account_get` | ✅ **EXPECTED** | +| 2 | 0.556436 | `storage_blob_container_get` | ❌ | +| 3 | 0.518229 | `storage_blob_get` | ❌ | +| 4 | 0.473662 | `cosmos_account_list` | ❌ | +| 5 | 0.465571 | `subscription_list` | ❌ | + +--- + ## Test 433 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_blob_container_create` **Prompt:** Create the storage container mycontainer in storage account @@ -7841,7 +15434,15 @@ --- +<<<<<<< HEAD +## Test 434 +======= +<<<<<<< HEAD +## Test 424 +======= ## Test 434 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_blob_container_create` **Prompt:** Create the container using blob public access in storage account @@ -7851,6 +15452,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.682161 | `storage_blob_container_create` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.590826 | `storage_blob_container_get` | ❌ | | 3 | 0.559264 | `storage_blob_get` | ❌ | | 4 | 0.500625 | `storage_account_create` | ❌ | @@ -7859,6 +15461,24 @@ --- ## Test 435 +======= +| 2 | 0.590160 | `storage_blob_container_get` | ❌ | +| 3 | 0.559263 | `storage_blob_get` | ❌ | +| 4 | 0.500624 | `storage_account_create` | ❌ | +<<<<<<< HEAD +| 5 | 0.420434 | `storage_account_get` | ❌ | + +--- + +## Test 425 +======= +| 5 | 0.420516 | `storage_account_get` | ❌ | + +--- + +## Test 435 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 
(update prompts and tool description evaluator) **Expected Tool:** `storage_blob_container_create` **Prompt:** Create a new blob container named documents with container public access in storage account @@ -7875,7 +15495,15 @@ --- +<<<<<<< HEAD ## Test 436 +======= +<<<<<<< HEAD +## Test 426 +======= +## Test 436 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_blob_container_get` **Prompt:** Show me the properties of the storage container in the storage account @@ -7886,6 +15514,7 @@ |------|-------|------|--------| | 1 | 0.703348 | `storage_blob_container_get` | ✅ **EXPECTED** | | 2 | 0.623681 | `storage_blob_get` | ❌ | +<<<<<<< HEAD | 3 | 0.577921 | `storage_account_get` | ❌ | | 4 | 0.549804 | `storage_blob_container_create` | ❌ | | 5 | 0.523289 | `cosmos_database_container_list` | ❌ | @@ -7893,6 +15522,23 @@ --- ## Test 437 +======= +<<<<<<< HEAD +| 3 | 0.577740 | `storage_account_get` | ❌ | +======= +| 3 | 0.577904 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.549803 | `storage_blob_container_create` | ❌ | +| 5 | 0.523288 | `cosmos_database_container_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 427 +======= +## Test 437 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_blob_container_get` **Prompt:** List all blob containers in the storage account @@ -7909,7 +15555,15 @@ --- +<<<<<<< HEAD +## Test 438 +======= +<<<<<<< HEAD +## Test 428 +======= ## Test 438 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_blob_container_get` **Prompt:** Show me the containers in the storage account @@ -7921,12 +15575,29 @@ | 1 | 0.713080 | `storage_blob_container_get` | ✅ **EXPECTED** | | 2 | 
0.592373 | `cosmos_database_container_list` | ❌ | | 3 | 0.586169 | `storage_blob_get` | ❌ | +<<<<<<< HEAD | 4 | 0.523322 | `storage_account_get` | ❌ | | 5 | 0.487520 | `storage_blob_container_create` | ❌ | --- ## Test 439 +======= +<<<<<<< HEAD +| 4 | 0.523353 | `storage_account_get` | ❌ | +======= +| 4 | 0.523293 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.487521 | `storage_blob_container_create` | ❌ | + +--- + +<<<<<<< HEAD +## Test 429 +======= +## Test 439 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_blob_get` **Prompt:** Show me the properties for blob in container in storage account @@ -7935,6 +15606,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.700963 | `storage_blob_get` | ✅ **EXPECTED** | | 2 | 0.648279 | `storage_blob_container_get` | ❌ | | 3 | 0.540987 | `storage_blob_container_create` | ❌ | @@ -7944,6 +15616,29 @@ --- ## Test 440 +======= +<<<<<<< HEAD +| 1 | 0.700969 | `storage_blob_get` | ✅ **EXPECTED** | +| 2 | 0.647029 | `storage_blob_container_get` | ❌ | +| 3 | 0.541060 | `storage_blob_container_create` | ❌ | +| 4 | 0.527327 | `storage_account_get` | ❌ | +| 5 | 0.477993 | `cosmos_database_container_list` | ❌ | + +--- + +## Test 430 +======= +| 1 | 0.700973 | `storage_blob_get` | ✅ **EXPECTED** | +| 2 | 0.646973 | `storage_blob_container_get` | ❌ | +| 3 | 0.541019 | `storage_blob_container_create` | ❌ | +| 4 | 0.527428 | `storage_account_get` | ❌ | +| 5 | 0.477946 | `cosmos_database_container_list` | ❌ | + +--- + +## Test 440 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_blob_get` **Prompt:** Get the details about blob in the container in storage account @@ -7953,14 +15648,32 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| | 1 | 0.694997 | `storage_blob_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.633397 | `storage_blob_container_get` | ❌ | | 3 | 0.589151 | `storage_blob_container_create` | ❌ | | 4 | 0.580226 | `storage_account_get` | ❌ | +======= +| 2 | 0.631161 | `storage_blob_container_get` | ❌ | +| 3 | 0.589152 | `storage_blob_container_create` | ❌ | +<<<<<<< HEAD +| 4 | 0.579989 | `storage_account_get` | ❌ | +======= +| 4 | 0.580235 | `storage_account_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.457038 | `storage_account_create` | ❌ | --- +<<<<<<< HEAD +## Test 441 +======= +<<<<<<< HEAD +## Test 431 +======= ## Test 441 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_blob_get` **Prompt:** List all blobs in the blob container in the storage account @@ -7973,11 +15686,27 @@ | 2 | 0.702342 | `storage_blob_container_get` | ❌ | | 3 | 0.605993 | `storage_blob_container_create` | ❌ | | 4 | 0.579070 | `cosmos_database_container_list` | ❌ | +<<<<<<< HEAD +| 5 | 0.506639 | `cosmos_database_container_item_query` | ❌ | + +--- + +## Test 442 +======= +<<<<<<< HEAD +| 5 | 0.506792 | `cosmos_database_container_item_query` | ❌ | + +--- + +## Test 432 +======= | 5 | 0.506639 | `cosmos_database_container_item_query` | ❌ | --- ## Test 442 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_blob_get` **Prompt:** Show me the blobs in the blob container in the storage account @@ -7986,6 +15715,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.704426 | `storage_blob_get` | ✅ **EXPECTED** | | 2 | 0.666342 | `storage_blob_container_get` | ❌ | | 3 | 0.561557 | `storage_blob_container_create` | ❌ | @@ -7994,7 
+15724,22 @@ --- +<<<<<<< HEAD +## Test 443 +======= +## Test 433 +======= +| 1 | 0.704413 | `storage_blob_get` | ✅ **EXPECTED** | +| 2 | 0.664877 | `storage_blob_container_get` | ❌ | +| 3 | 0.561546 | `storage_blob_container_create` | ❌ | +| 4 | 0.533442 | `cosmos_database_container_list` | ❌ | +| 5 | 0.483914 | `storage_account_get` | ❌ | + +--- + ## Test 443 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `storage_blob_upload` **Prompt:** Upload file to storage blob in container in storage account @@ -8003,6 +15748,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.566278 | `storage_blob_upload` | ✅ **EXPECTED** | | 2 | 0.525685 | `storage_blob_container_create` | ❌ | | 3 | 0.517524 | `storage_blob_get` | ❌ | @@ -8012,6 +15758,29 @@ --- ## Test 444 +======= +<<<<<<< HEAD +| 1 | 0.566280 | `storage_blob_upload` | ✅ **EXPECTED** | +| 2 | 0.525689 | `storage_blob_container_create` | ❌ | +| 3 | 0.517628 | `storage_blob_get` | ❌ | +| 4 | 0.473667 | `storage_blob_container_get` | ❌ | +| 5 | 0.382148 | `storage_account_create` | ❌ | + +--- + +## Test 434 +======= +| 1 | 0.566287 | `storage_blob_upload` | ✅ **EXPECTED** | +| 2 | 0.525674 | `storage_blob_container_create` | ❌ | +| 3 | 0.517616 | `storage_blob_get` | ❌ | +| 4 | 0.473645 | `storage_blob_container_get` | ❌ | +| 5 | 0.382123 | `storage_account_create` | ❌ | + +--- + +## Test 444 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `subscription_list` **Prompt:** List all subscriptions for my account @@ -8020,15 +15789,30 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.654048 | `subscription_list` | ✅ **EXPECTED** | | 2 | 0.512964 | `cosmos_account_list` | ❌ | | 3 | 0.471653 | `postgres_server_list` | ❌ | | 4 | 0.469023 | 
`kusto_cluster_list` | ❌ | +======= +| 1 | 0.654071 | `subscription_list` | ✅ **EXPECTED** | +| 2 | 0.512954 | `cosmos_account_list` | ❌ | +| 3 | 0.471653 | `postgres_server_list` | ❌ | +| 4 | 0.469085 | `kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.461078 | `redis_list` | ❌ | --- +<<<<<<< HEAD +## Test 445 +======= +<<<<<<< HEAD +## Test 435 +======= ## Test 445 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `subscription_list` **Prompt:** Show me my subscriptions @@ -8045,7 +15829,15 @@ --- +<<<<<<< HEAD +## Test 446 +======= +<<<<<<< HEAD +## Test 436 +======= ## Test 446 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `subscription_list` **Prompt:** What is my current subscription? @@ -8056,6 +15848,7 @@ |------|-------|------|--------| | 1 | 0.433242 | `subscription_list` | ✅ **EXPECTED** | | 2 | 0.319579 | `marketplace_product_list` | ❌ | +<<<<<<< HEAD | 3 | 0.315547 | `marketplace_product_get` | ❌ | | 4 | 0.293009 | `eventgrid_subscription_list` | ❌ | | 5 | 0.289280 | `eventgrid_topic_list` | ❌ | @@ -8063,6 +15856,23 @@ --- ## Test 447 +======= +<<<<<<< HEAD +| 3 | 0.315354 | `marketplace_product_get` | ❌ | +======= +| 3 | 0.315474 | `marketplace_product_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.293772 | `eventgrid_subscription_list` | ❌ | +| 5 | 0.289334 | `eventgrid_topic_list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 437 +======= +## Test 447 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `subscription_list` **Prompt:** What subscriptions do I have? 
@@ -8079,7 +15889,15 @@ --- +<<<<<<< HEAD ## Test 448 +======= +<<<<<<< HEAD +## Test 438 +======= +## Test 448 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `azureterraformbestpractices_get` **Prompt:** Fetch the Azure Terraform best practices @@ -8090,6 +15908,7 @@ |------|-------|------|--------| | 1 | 0.686886 | `azureterraformbestpractices_get` | ✅ **EXPECTED** | | 2 | 0.625270 | `deploy_iac_rules_get` | ❌ | +<<<<<<< HEAD | 3 | 0.605048 | `get_bestpractices_get` | ❌ | | 4 | 0.482745 | `deploy_pipeline_guidance_get` | ❌ | | 5 | 0.468390 | `azureaibestpractices_get` | ❌ | @@ -8097,6 +15916,19 @@ --- ## Test 449 +======= +| 3 | 0.605599 | `get_bestpractices_get` | ❌ | +| 4 | 0.482936 | `deploy_pipeline_guidance_get` | ❌ | +| 5 | 0.466199 | `deploy_plan_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 439 +======= +## Test 449 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `azureterraformbestpractices_get` **Prompt:** Show me the Azure Terraform best practices and generate code sample to get a secret from Azure Key Vault @@ -8105,6 +15937,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.581316 | `azureterraformbestpractices_get` | ✅ **EXPECTED** | | 2 | 0.512141 | `get_bestpractices_get` | ❌ | | 3 | 0.510005 | `deploy_iac_rules_get` | ❌ | @@ -8114,6 +15947,26 @@ --- ## Test 450 +======= +<<<<<<< HEAD +| 1 | 0.581332 | `azureterraformbestpractices_get` | ✅ **EXPECTED** | +| 2 | 0.512141 | `get_bestpractices_get` | ❌ | +======= +| 1 | 0.581316 | `azureterraformbestpractices_get` | ✅ **EXPECTED** | +| 2 | 0.515758 | `get_bestpractices_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.510004 | `deploy_iac_rules_get` | ❌ | +| 4 | 0.473596 | `keyvault_secret_get` | ❌ | +| 5 | 0.444297 | 
`deploy_pipeline_guidance_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 440 +======= +## Test 450 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `virtualdesktop_hostpool_list` **Prompt:** List all host pools in my subscription @@ -8122,6 +15975,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.711905 | `virtualdesktop_hostpool_list` | ✅ **EXPECTED** | | 2 | 0.659763 | `virtualdesktop_hostpool_host_list` | ❌ | | 3 | 0.620665 | `kusto_cluster_list` | ❌ | @@ -8131,6 +15985,26 @@ --- ## Test 451 +======= +| 1 | 0.711969 | `virtualdesktop_hostpool_list` | ✅ **EXPECTED** | +<<<<<<< HEAD +| 2 | 0.659763 | `virtualdesktop_hostpool_host_list` | ❌ | +| 3 | 0.620666 | `kusto_cluster_list` | ❌ | +======= +| 2 | 0.659732 | `virtualdesktop_hostpool_sessionhost_list` | ❌ | +| 3 | 0.620507 | `kusto_cluster_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.548888 | `search_service_list` | ❌ | +| 5 | 0.535777 | `virtualdesktop_hostpool_host_user-list` | ❌ | + +--- + +<<<<<<< HEAD +## Test 441 +======= +## Test 451 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `virtualdesktop_hostpool_host_list` **Prompt:** List all session hosts in host pool @@ -8139,15 +16013,33 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.727054 | `virtualdesktop_hostpool_host_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.715572 | `virtualdesktop_hostpool_host_user-list` | ❌ | | 3 | 0.573350 | `virtualdesktop_hostpool_list` | ❌ | +======= +| 2 | 0.714553 | `virtualdesktop_hostpool_host_user-list` | ❌ | +======= +| 1 | 0.726933 | `virtualdesktop_hostpool_sessionhost_list` | ❌ | +| 2 | 0.714469 | `virtualdesktop_hostpool_sessionhost_usersession-list` | ❌ | +>>>>>>> 84ad4f44 (update prompts 
and tool description evaluator) +| 3 | 0.573352 | `virtualdesktop_hostpool_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.438659 | `aks_nodepool_get` | ❌ | | 5 | 0.393721 | `sql_elastic-pool_list` | ❌ | --- +<<<<<<< HEAD ## Test 452 +======= +<<<<<<< HEAD +## Test 442 +======= +## Test 452 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `virtualdesktop_hostpool_host_user-list` **Prompt:** List all user sessions on session host in host pool @@ -8156,15 +16048,37 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.813311 | `virtualdesktop_hostpool_host_user-list` | ✅ **EXPECTED** | | 2 | 0.659213 | `virtualdesktop_hostpool_host_list` | ❌ | | 3 | 0.501113 | `virtualdesktop_hostpool_list` | ❌ | +======= +<<<<<<< HEAD +| 1 | 0.812787 | `virtualdesktop_hostpool_host_user-list` | ✅ **EXPECTED** | +| 2 | 0.659212 | `virtualdesktop_hostpool_host_list` | ❌ | +| 3 | 0.501167 | `virtualdesktop_hostpool_list` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.357561 | `aks_nodepool_get` | ❌ | | 5 | 0.336576 | `monitor_workspace_list` | ❌ | --- +<<<<<<< HEAD +## Test 453 +======= +## Test 443 +======= +| 1 | 0.812628 | `virtualdesktop_hostpool_sessionhost_usersession-list` | ❌ | +| 2 | 0.658986 | `virtualdesktop_hostpool_sessionhost_list` | ❌ | +| 3 | 0.501050 | `virtualdesktop_hostpool_list` | ❌ | +| 4 | 0.357450 | `aks_nodepool_get` | ❌ | +| 5 | 0.336389 | `monitor_workspace_list` | ❌ | + +--- + ## Test 453 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `workbooks_create` **Prompt:** Create a new workbook named @@ -8181,7 +16095,15 @@ --- +<<<<<<< HEAD +## Test 454 +======= +<<<<<<< HEAD +## Test 444 +======= ## Test 454 +>>>>>>> 84ad4f44 (update prompts and tool 
description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `workbooks_delete` **Prompt:** Delete the workbook with resource ID @@ -8193,12 +16115,28 @@ | 1 | 0.621310 | `workbooks_delete` | ✅ **EXPECTED** | | 2 | 0.498506 | `workbooks_show` | ❌ | | 3 | 0.432454 | `workbooks_create` | ❌ | +<<<<<<< HEAD +| 4 | 0.425569 | `workbooks_list` | ❌ | +======= +<<<<<<< HEAD +| 4 | 0.425484 | `workbooks_list` | ❌ | +======= | 4 | 0.425569 | `workbooks_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.421897 | `workbooks_update` | ❌ | --- +<<<<<<< HEAD +## Test 455 +======= +<<<<<<< HEAD +## Test 445 +======= ## Test 455 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `workbooks_list` **Prompt:** List all workbooks in my resource group @@ -8215,7 +16153,15 @@ --- +<<<<<<< HEAD ## Test 456 +======= +<<<<<<< HEAD +## Test 446 +======= +## Test 456 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `workbooks_list` **Prompt:** What workbooks do I have in resource group ? 
@@ -8232,7 +16178,15 @@ --- +<<<<<<< HEAD +## Test 457 +======= +<<<<<<< HEAD +## Test 447 +======= ## Test 457 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `workbooks_show` **Prompt:** Get information about the workbook with resource ID @@ -8243,13 +16197,29 @@ |------|-------|------|--------| | 1 | 0.686095 | `workbooks_show` | ✅ **EXPECTED** | | 2 | 0.498390 | `workbooks_create` | ❌ | +<<<<<<< HEAD | 3 | 0.494708 | `workbooks_list` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.494492 | `workbooks_list` | ❌ | +======= +| 3 | 0.494708 | `workbooks_list` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.463156 | `workbooks_update` | ❌ | | 5 | 0.452348 | `workbooks_delete` | ❌ | --- +<<<<<<< HEAD +## Test 458 +======= +<<<<<<< HEAD +## Test 448 +======= ## Test 458 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `workbooks_show` **Prompt:** Show me the workbook with resource ID @@ -8266,7 +16236,15 @@ --- +<<<<<<< HEAD +## Test 459 +======= +<<<<<<< HEAD +## Test 449 +======= ## Test 459 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `workbooks_update` **Prompt:** Update the workbook with a new text step @@ -8283,7 +16261,15 @@ --- +<<<<<<< HEAD +## Test 460 +======= +<<<<<<< HEAD +## Test 450 +======= ## Test 460 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `bicepschema_get` **Prompt:** How can I use Bicep to create an Azure OpenAI service? 
@@ -8295,12 +16281,30 @@ | 1 | 0.543803 | `bicepschema_get` | ✅ **EXPECTED** | | 2 | 0.485970 | `foundry_models_deploy` | ❌ | | 3 | 0.485889 | `deploy_iac_rules_get` | ❌ | +<<<<<<< HEAD | 4 | 0.468898 | `azureaibestpractices_get` | ❌ | | 5 | 0.453412 | `foundry_openai_embeddings-create` | ❌ | --- ## Test 461 +======= +<<<<<<< HEAD +| 4 | 0.453282 | `foundry_openai_embeddings-create` | ❌ | +| 5 | 0.448373 | `get_bestpractices_get` | ❌ | + +--- + +## Test 451 +======= +| 4 | 0.462146 | `foundry_openai_embeddings-create` | ❌ | +| 5 | 0.449694 | `get_bestpractices_get` | ❌ | + +--- + +## Test 461 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `cloudarchitect_design` **Prompt:** Please help me design an architecture for a large-scale file upload, storage, and retrieval service @@ -8311,13 +16315,32 @@ |------|-------|------|--------| | 1 | 0.502125 | `cloudarchitect_design` | ✅ **EXPECTED** | | 2 | 0.290902 | `storage_blob_upload` | ❌ | +<<<<<<< HEAD | 3 | 0.260101 | `managedlustre_fs_create` | ❌ | | 4 | 0.254991 | `deploy_architecture_diagram_generate` | ❌ | +======= +<<<<<<< HEAD +| 3 | 0.259162 | `managedlustre_fs_create` | ❌ | +| 4 | 0.254853 | `deploy_architecture_diagram_generate` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.245034 | `managedlustre_fs_subnetsize_validate` | ❌ | --- +<<<<<<< HEAD +## Test 462 +======= +## Test 452 +======= +| 3 | 0.259162 | `managedlustre_filesystem_create` | ❌ | +| 4 | 0.254991 | `deploy_architecture_diagram_generate` | ❌ | +| 5 | 0.245034 | `managedlustre_filesystem_subnetsize_validate` | ❌ | + +--- + ## Test 462 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `cloudarchitect_design` **Prompt:** Help me design an Azure cloud service that will serve as an ATM for users @@ -8327,6 +16350,7 @@ | 
Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.508153 | `cloudarchitect_design` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.377941 | `deploy_architecture_diagram_generate` | ❌ | | 3 | 0.341316 | `deploy_pipeline_guidance_get` | ❌ | | 4 | 0.336385 | `azureaibestpractices_get` | ❌ | @@ -8335,6 +16359,26 @@ --- ## Test 463 +======= +<<<<<<< HEAD +| 2 | 0.377584 | `deploy_architecture_diagram_generate` | ❌ | +| 3 | 0.341462 | `deploy_pipeline_guidance_get` | ❌ | +| 4 | 0.328747 | `get_bestpractices_get` | ❌ | +======= +| 2 | 0.377941 | `deploy_architecture_diagram_generate` | ❌ | +| 3 | 0.341462 | `deploy_pipeline_guidance_get` | ❌ | +| 4 | 0.331626 | `get_bestpractices_get` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.321855 | `deploy_plan_get` | ❌ | + +--- + +<<<<<<< HEAD +## Test 453 +======= +## Test 463 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `cloudarchitect_design` **Prompt:** I want to design a cloud app for ordering groceries @@ -8351,7 +16395,15 @@ --- +<<<<<<< HEAD +## Test 464 +======= +<<<<<<< HEAD +## Test 454 +======= ## Test 464 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) **Expected Tool:** `cloudarchitect_design` **Prompt:** How can I design a cloud service in Azure that will store and present videos for users? 
@@ -8361,17 +16413,35 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.534690 | `cloudarchitect_design` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.369872 | `deploy_pipeline_guidance_get` | ❌ | | 3 | 0.357808 | `managedlustre_fs_create` | ❌ | | 4 | 0.352797 | `deploy_architecture_diagram_generate` | ❌ | | 5 | 0.324217 | `azureaibestpractices_get` | ❌ | +======= +| 2 | 0.369969 | `deploy_pipeline_guidance_get` | ❌ | +<<<<<<< HEAD +| 3 | 0.356331 | `managedlustre_fs_create` | ❌ | +| 4 | 0.352914 | `deploy_architecture_diagram_generate` | ❌ | +======= +| 3 | 0.356331 | `managedlustre_filesystem_create` | ❌ | +| 4 | 0.352797 | `deploy_architecture_diagram_generate` | ❌ | +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.323920 | `storage_blob_upload` | ❌ | +>>>>>>> 58ab8585 (update prompts and tool description evaluator) --- ## Summary +<<<<<<< HEAD **Total Prompts Tested:** 464 **Analysis Execution Time:** 186.7791311s +======= +<<<<<<< HEAD +**Total Prompts Tested:** 454 +**Analysis Execution Time:** 61.2275421s +>>>>>>> 58ab8585 (update prompts and tool description evaluator) ### Success Rate Metrics @@ -8388,15 +16458,48 @@ #### Top Choice + Confidence Combinations +<<<<<<< HEAD **💪 Top Choice + Very High Confidence (≥0.8):** 3.2% (15/464 tests) **🎯 Top Choice + High Confidence (≥0.7):** 22.8% (106/464 tests) **✅ Top Choice + Good Confidence (≥0.6):** 60.3% (280/464 tests) **👍 Top Choice + Fair Confidence (≥0.5):** 86.9% (403/464 tests) **👌 Top Choice + Acceptable Confidence (≥0.4):** 92.2% (428/464 tests) +======= +**💪 Top Choice + Very High Confidence (≥0.8):** 3.3% (15/454 tests) +**🎯 Top Choice + High Confidence (≥0.7):** 23.3% (106/454 tests) +**✅ Top Choice + Good Confidence (≥0.6):** 60.6% (275/454 tests) +**👍 Top Choice + Fair Confidence (≥0.5):** 86.8% (394/454 tests) +**👌 Top Choice + Acceptable Confidence (≥0.4):** 92.1% (418/454 tests) +======= +**Total Prompts Tested:** 464 +**Analysis Execution Time:** 
123.7654249s + +### Success Rate Metrics + +**Top Choice Success:** 89.2% (414/464 tests) + +#### Confidence Level Distribution + +**💪 Very High Confidence (≥0.8):** 2.6% (12/464 tests) +**🎯 High Confidence (≥0.7):** 19.6% (91/464 tests) +**✅ Good Confidence (≥0.6):** 57.8% (268/464 tests) +**👍 Fair Confidence (≥0.5):** 88.8% (412/464 tests) +**👌 Acceptable Confidence (≥0.4):** 96.3% (447/464 tests) +**❌ Low Confidence (<0.4):** 3.7% (17/464 tests) + +#### Top Choice + Confidence Combinations + +**💪 Top Choice + Very High Confidence (≥0.8):** 2.6% (12/464 tests) +**🎯 Top Choice + High Confidence (≥0.7):** 19.6% (91/464 tests) +**✅ Top Choice + Good Confidence (≥0.6):** 55.8% (259/464 tests) +**👍 Top Choice + Fair Confidence (≥0.5):** 83.8% (389/464 tests) +**👌 Top Choice + Acceptable Confidence (≥0.4):** 89.2% (414/464 tests) +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) ### Success Rate Analysis -🟢 **Excellent** - The tool selection system is performing exceptionally well. +🟡 **Good** - The tool selection system is performing well. ⚠️ **Recommendation:** Tool descriptions need improvement to better match user intent (targets: ≥0.6 good, ≥0.7 high). diff --git a/eng/tools/ToolDescriptionEvaluator/tools.json b/eng/tools/ToolDescriptionEvaluator/tools.json index d83b47f83..4920f90f3 100644 --- a/eng/tools/ToolDescriptionEvaluator/tools.json +++ b/eng/tools/ToolDescriptionEvaluator/tools.json @@ -10985,6 +10985,103 @@ } ] }, + { + "name": "synthesize", + "description": "Convert text to speech using Azure AI Services Speech. 
This command takes text input and generates an audio file using advanced neural text-to-speech capabilities.\nYou must provide an Azure AI Services endpoint (e.g., https://your-service.cognitiveservices.azure.com/), the text to convert, and an output file path.\nOptional parameters include language specification (default: en-US), voice selection, audio output format (default: Riff24Khz16BitMonoPcm), and custom voice endpoint ID.\nThe command supports a wide variety of output formats and neural voices for natural-sounding speech synthesis.", + "command": "speech tts synthesize", + "option": [ + { + "name": "--tenant", + "description": "The Microsoft Entra ID tenant ID or name. This can be either the GUID identifier or the display name of your Entra ID tenant.", + "type": "string", + "required": null + }, + { + "name": "--auth-method", + "description": "Authentication method to use. Options: 'credential' (Azure CLI/managed identity), 'key' (access key), or 'connectionString'.", + "type": "string", + "required": null + }, + { + "name": "--retry-delay", + "description": "Initial delay in seconds between retry attempts. For exponential backoff, this value is used as the base.", + "type": "string", + "required": null + }, + { + "name": "--retry-max-delay", + "description": "Maximum delay in seconds between retries, regardless of the retry strategy.", + "type": "string", + "required": null + }, + { + "name": "--retry-max-retries", + "description": "Maximum number of retry attempts for failed operations before giving up.", + "type": "string", + "required": null + }, + { + "name": "--retry-mode", + "description": "Retry strategy to use. 'fixed' uses consistent delays, 'exponential' increases delay between attempts.", + "type": "string", + "required": null + }, + { + "name": "--retry-network-timeout", + "description": "Network operation timeout in seconds. 
Operations taking longer than this will be cancelled.", + "type": "string", + "required": null + }, + { + "name": "--subscription", + "description": "Specifies the Azure subscription to use. Accepts either a subscription ID (GUID) or display name. If not specified, the AZURE_SUBSCRIPTION_ID environment variable will be used instead.", + "type": "string", + "required": null + }, + { + "name": "--endpoint", + "description": "The Azure AI Services endpoint URL (e.g., https://your-service.cognitiveservices.azure.com/).", + "type": "string", + "required": true + }, + { + "name": "--text", + "description": "The text to convert to speech.", + "type": "string", + "required": true + }, + { + "name": "--outputAudio", + "description": "Path where the synthesized audio file will be saved.", + "type": "string", + "required": true + }, + { + "name": "--language", + "description": "The language for speech synthesis (e.g., en-US, es-ES). Default is en-US.", + "type": "string", + "required": null + }, + { + "name": "--voice", + "description": "The voice to use for speech synthesis (e.g., en-US-JennyNeural). If not specified, the default voice for the language will be used.", + "type": "string", + "required": null + }, + { + "name": "--format", + "description": "The audio output format for the synthesized speech (e.g., Riff24Khz16BitMonoPcm, Audio16Khz32KBitRateMonoMp3). Default is Riff24Khz16BitMonoPcm.", + "type": "string", + "required": null + }, + { + "name": "--endpointId", + "description": "The endpoint ID of a custom voice model for speech synthesis.", + "type": "string", + "required": null + } + ] + }, { "name": "create", "description": "Create a new Azure SQL Database on an existing SQL Server. This command creates a database with configurable\r\nperformance tiers, size limits, and other settings. 
Equivalent to 'az sql db create'.\r\nReturns the newly created database information including configuration details.", @@ -13171,5 +13268,13 @@ } ], "consolidated_tools": null, +<<<<<<< HEAD "duration": 53 +======= +<<<<<<< HEAD + "duration": 47 +======= + "duration": 49 +>>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> 58ab8585 (update prompts and tool description evaluator) } \ No newline at end of file diff --git a/servers/Azure.Mcp.Server/README.md b/servers/Azure.Mcp.Server/README.md index 7117db872..c6938d2b0 100644 --- a/servers/Azure.Mcp.Server/README.md +++ b/servers/Azure.Mcp.Server/README.md @@ -368,6 +368,9 @@ To use Azure Entra ID, review the [troubleshooting guide](https://github.com/mic * "Recognize speech from my audio file with language detection" * "Transcribe speech from audio with profanity filtering" * "Transcribe audio with phrase hints for better accuracy" +* "Convert text to speech and save to output.wav" +* "Synthesize speech from 'Hello, welcome to Azure' with Spanish voice" +* "Generate MP3 audio from text with high quality format" ### ⚙️ Azure App Configuration @@ -509,7 +512,7 @@ The Azure MCP Server provides tools for interacting with **40+ Azure service are - 🧮 **Azure AI Foundry** - AI model management, AI model deployment, and knowledge index management - 🔎 **Azure AI Search** - Search engine/vector database operations -- 🎤 **Azure AI Services Speech** - Speech-to-text recognition +- 🎤 **Azure AI Services Speech** - Speech-to-text recognition and text-to-speech synthesis - 🤖 **Azure AI Best Practices** - AI app development guidance for Azure AI Foundry and Microsoft Agent Framework - ⚙️ **Azure App Configuration** - Configuration management - 🕸️ **Azure App Service** - Web app hosting diff --git a/servers/Azure.Mcp.Server/docs/azmcp-commands.md b/servers/Azure.Mcp.Server/docs/azmcp-commands.md index a38694ba2..3aa1d8f5f 100644 --- a/servers/Azure.Mcp.Server/docs/azmcp-commands.md +++ 
b/servers/Azure.Mcp.Server/docs/azmcp-commands.md @@ -418,6 +418,67 @@ azmcp speech stt recognize --endpoint --file audio.wav \ Use phrase hints when you expect specific terminology, technical terms, or domain-specific vocabulary in your audio content. This significantly improves recognition accuracy for specialized content. +```bash +# Synthesize speech from text and save to an audio file using Azure AI Services Speech +# ❌ Destructive | ✅ Idempotent | ❌ OpenWorld | ❌ ReadOnly | ❌ Secret | ✅ LocalRequired +azmcp speech tts synthesize --endpoint \ + --text \ + --outputAudio \ + [--language ] \ + [--voice ] \ + [--format ] \ + [--endpointId ] +``` + +#### Text-to-Speech Parameters + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `--endpoint` | Yes | Azure AI Services endpoint URL (e.g., https://your-service.cognitiveservices.azure.com/) | +| `--text` | Yes | The text to convert to speech | +| `--outputAudio` | Yes | Path where the synthesized audio file will be saved (e.g., output.wav, speech.mp3) | +| `--language` | No | Speech synthesis language (default: en-US). Examples: es-ES, fr-FR, de-DE | +| `--voice` | No | Neural voice to use (e.g., en-US-JennyNeural, es-ES-ElviraNeural). If not specified, default voice for the language is used | +| `--format` | No | Output audio format (default: Riff24Khz16BitMonoPcm). 
Supported formats: Riff24Khz16BitMonoPcm, Audio16Khz32KBitRateMonoMp3, Audio24Khz96KBitRateMonoMp3, Ogg16Khz16BitMonoOpus, Raw16Khz16BitMonoPcm | +| `--endpointId` | No | Endpoint ID of a custom voice model for personalized speech synthesis | + +#### Supported Audio Formats + +The `--format` parameter accepts the following values: + +- **WAV formats**: `Riff24Khz16BitMonoPcm` (default), `Riff16Khz16BitMonoPcm`, `Raw16Khz16BitMonoPcm` +- **MP3 formats**: `Audio16Khz32KBitRateMonoMp3`, `Audio24Khz96KBitRateMonoMp3`, `Audio48Khz192KBitRateMonoMp3` +- **OGG/Opus formats**: `Ogg16Khz16BitMonoOpus`, `Ogg24Khz16BitMonoOpus` + +**Examples:** + +```bash +# Basic text-to-speech synthesis +azmcp speech tts synthesize --endpoint https://myservice.cognitiveservices.azure.com/ \ + --text "Hello, welcome to Azure AI Services Speech" \ + --outputAudio welcome.wav + +# Synthesize with specific language and voice +azmcp speech tts synthesize --endpoint https://myservice.cognitiveservices.azure.com/ \ + --text "Hola, bienvenido a los servicios de voz de Azure" \ + --outputAudio spanish-greeting.wav \ + --language es-ES \ + --voice es-ES-ElviraNeural + +# Generate MP3 output with high quality +azmcp speech tts synthesize --endpoint https://myservice.cognitiveservices.azure.com/ \ + --text "This is a high quality audio output" \ + --outputAudio output.mp3 \ + --format Audio48Khz192KBitRateMonoMp3 + +# Use custom voice model +azmcp speech tts synthesize --endpoint https://myservice.cognitiveservices.azure.com/ \ + --text "This uses my custom trained voice" \ + --outputAudio custom-voice.wav \ + --voice my-custom-voice-model \ + --endpointId my-custom-voice-endpoint-id +``` + ### Azure App Configuration Operations ```bash diff --git a/servers/Azure.Mcp.Server/docs/e2eTestPrompts.md b/servers/Azure.Mcp.Server/docs/e2eTestPrompts.md index 33746b90e..b5b8a104b 100644 --- a/servers/Azure.Mcp.Server/docs/e2eTestPrompts.md +++ b/servers/Azure.Mcp.Server/docs/e2eTestPrompts.md @@ -81,6 +81,16 @@
This file contains prompts used for end-to-end testing to ensure each tool is in | speech_stt_recognize | Transcribe audio using multiple phrase hints: "Azure", "cognitive services", "machine learning" | | speech_stt_recognize | Convert speech to text with comma-separated phrase hints: "Azure, cognitive services, API" | | speech_stt_recognize | Transcribe audio with raw profanity output from file | +| speech_tts_synthesize | Convert text to speech and save to output.wav | +| speech_tts_synthesize | Synthesize speech from "Hello, welcome to Azure" and save to welcome.wav | +| speech_tts_synthesize | Generate speech audio from text "Hello world" using Azure Speech Services | +| speech_tts_synthesize | Convert text to speech with Spanish language and save to spanish-audio.wav | +| speech_tts_synthesize | Synthesize speech with voice en-US-JennyNeural from text "Azure AI Services" | +| speech_tts_synthesize | Create MP3 audio file from text "Welcome to Azure" with high quality format | +| speech_tts_synthesize | Generate speech with custom voice model using endpoint ID | +| speech_tts_synthesize | Convert text to OGG/Opus format audio file | +| speech_tts_synthesize | Synthesize long text content to audio file with streaming | +| speech_tts_synthesize | Create audio file from text in French language with appropriate voice | ## Azure App Configuration diff --git a/tools/Azure.Mcp.Tools.Speech/src/Commands/SpeechJsonContext.cs b/tools/Azure.Mcp.Tools.Speech/src/Commands/SpeechJsonContext.cs index 4932d7830..9a36c66e0 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Commands/SpeechJsonContext.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Commands/SpeechJsonContext.cs @@ -27,7 +27,6 @@ [JsonSerializable(typeof(SttRecognizeCommand.SttRecognizeCommandResult))] [JsonSerializable(typeof(SynthesisResult))] [JsonSerializable(typeof(TtsSynthesizeCommand.TtsSynthesizeCommandResult))] -[JsonSerializable(typeof(WordResult))] [JsonSourceGenerationOptions( PropertyNamingPolicy = 
JsonKnownNamingPolicy.CamelCase, WriteIndented = true, From fc3a53760dd735dadc90b6d97003056fca940cc0 Mon Sep 17 00:00:00 2001 From: "Fei Zhao (SPEECH)" Date: Tue, 21 Oct 2025 18:30:57 +0800 Subject: [PATCH 06/14] fix comment in live tests --- .../Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs index 69f07bab2..84dc816ab 100644 --- a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs +++ b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs @@ -747,7 +747,7 @@ public async Task Should_handle_large_text_input() var aiServicesEndpoint = $"https://{Settings.ResourceBaseName}.cognitiveservices.azure.com/"; var outputFile = Path.Combine(Path.GetTempPath(), $"tts-test-large-{Guid.NewGuid()}.wav"); - // Create a longer text (around 500 words) + // Create a longer text (around 1000 words) var largeText = string.Join(" ", Enumerable.Repeat( "This is a test of text to speech synthesis with a longer input to verify that streaming works correctly.", 50)); From bb05212c33639979a5f38ae6fa2528c21303d93d Mon Sep 17 00:00:00 2001 From: "Fei Zhao (SPEECH)" Date: Tue, 21 Oct 2025 18:41:08 +0800 Subject: [PATCH 07/14] fix dotnet format errors --- tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs b/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs index e2de3ec8f..ec36c55c6 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs @@ -450,7 +450,7 @@ private static List ExtractNBestResults(SdkSpeechRecognitionResult // Get the collected 
audio data from the stream var audioData = audioStream.ToArray(); - + _logger.LogInformation( "Speech synthesized successfully. Total audio length: {AudioLength} bytes", audioData.Length); @@ -546,7 +546,7 @@ public async Task SynthesizeSpeechToFile( catch (Exception ex) { _logger.LogError(ex, "Error during speech synthesis."); - + // Clean up partial file on error if (File.Exists(outputFilePath)) { @@ -560,7 +560,7 @@ public async Task SynthesizeSpeechToFile( _logger.LogWarning(cleanupEx, "Failed to clean up partial output file: {OutputFile}", outputFilePath); } } - + throw; } } From 7e47125aec8ae1a4fcb2d133b46d081c4643c4af Mon Sep 17 00:00:00 2001 From: "Fei Zhao (SPEECH)" Date: Tue, 4 Nov 2025 16:12:32 +0800 Subject: [PATCH 08/14] fix command id --- .../src/Commands/Tts/TtsSynthesizeCommand.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs b/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs index 76dbb47f0..61ab99ff2 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs @@ -21,6 +21,8 @@ internal record TtsSynthesizeCommandResult(SynthesisResult Result); public override string Name => "synthesize"; + public override string Id => "d6f6687f-feee-4e15-9b98-71aea4076e04"; + public override string Description => """ Convert text to speech using Azure AI Services Speech. This command takes text input and generates an audio file using advanced neural text-to-speech capabilities. 
From 7b53c23287df605964c2839fa0a798a7e63210b3 Mon Sep 17 00:00:00 2001 From: "Fei Zhao (SPEECH)" Date: Tue, 4 Nov 2025 16:41:07 +0800 Subject: [PATCH 09/14] fix azmcp-commands.md --- servers/Azure.Mcp.Server/docs/azmcp-commands.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/servers/Azure.Mcp.Server/docs/azmcp-commands.md b/servers/Azure.Mcp.Server/docs/azmcp-commands.md index 3aa1d8f5f..cd0cd9d89 100644 --- a/servers/Azure.Mcp.Server/docs/azmcp-commands.md +++ b/servers/Azure.Mcp.Server/docs/azmcp-commands.md @@ -454,11 +454,13 @@ The `--format` parameter accepts the following values: ```bash # Basic text-to-speech synthesis +# ❌ Destructive | ✅ Idempotent | ❌ OpenWorld | ❌ ReadOnly | ❌ Secret | ✅ LocalRequired azmcp speech tts synthesize --endpoint https://myservice.cognitiveservices.azure.com/ \ --text "Hello, welcome to Azure AI Services Speech" \ --outputAudio welcome.wav # Synthesize with specific language and voice +# ❌ Destructive | ✅ Idempotent | ❌ OpenWorld | ❌ ReadOnly | ❌ Secret | ✅ LocalRequired azmcp speech tts synthesize --endpoint https://myservice.cognitiveservices.azure.com/ \ --text "Hola, bienvenido a los servicios de voz de Azure" \ --outputAudio spanish-greeting.wav \ @@ -466,12 +468,14 @@ azmcp speech tts synthesize --endpoint https://myservice.cognitiveservices.azure --voice es-ES-ElviraNeural # Generate MP3 output with high quality +# ❌ Destructive | ✅ Idempotent | ❌ OpenWorld | ❌ ReadOnly | ❌ Secret | ✅ LocalRequired azmcp speech tts synthesize --endpoint https://myservice.cognitiveservices.azure.com/ \ --text "This is a high quality audio output" \ --outputAudio output.mp3 \ --format Audio48Khz192KBitRateMonoMp3 # Use custom voice model +# ❌ Destructive | ✅ Idempotent | ❌ OpenWorld | ❌ ReadOnly | ❌ Secret | ✅ LocalRequired azmcp speech tts synthesize --endpoint https://myservice.cognitiveservices.azure.com/ \ --text "This uses my custom trained voice" \ --outputAudio custom-voice.wav \ From 
b0c006c94012df919323619acd3933bbd6f15b0d Mon Sep 17 00:00:00 2001 From: "Fei Zhao (SPEECH)" Date: Thu, 6 Nov 2025 16:30:41 +0800 Subject: [PATCH 10/14] refactor tts mcp tool --- eng/tools/ToolDescriptionEvaluator/results.md | 5078 ++++++++++++++--- eng/tools/ToolDescriptionEvaluator/tools.json | 8 +- .../src/Azure.Mcp.Tools.Speech.csproj | 2 +- .../src/Commands/Tts/TtsSynthesizeCommand.cs | 2 +- .../src/Services/ISpeechService.cs | 2 +- .../src/Services/SpeechService.cs | 517 +- .../Synthesizers/IRealtimeTtsSynthesizer.cs | 35 + .../Synthesizers/RealtimeTtsSynthesizer.cs | 274 + .../Azure.Mcp.Tools.Speech/src/SpeechSetup.cs | 10 +- .../Services/SpeechServiceTests.cs | 7 +- .../Stt/SttRecognizeCommandTests.cs | 5 +- .../Tts/TtsSynthesizeCommandTests.cs | 10 +- 12 files changed, 4579 insertions(+), 1371 deletions(-) create mode 100644 tools/Azure.Mcp.Tools.Speech/src/Services/Synthesizers/IRealtimeTtsSynthesizer.cs create mode 100644 tools/Azure.Mcp.Tools.Speech/src/Services/Synthesizers/RealtimeTtsSynthesizer.cs diff --git a/eng/tools/ToolDescriptionEvaluator/results.md b/eng/tools/ToolDescriptionEvaluator/results.md index ddccc083e..3ab78fbfe 100644 --- a/eng/tools/ToolDescriptionEvaluator/results.md +++ b/eng/tools/ToolDescriptionEvaluator/results.md @@ -1,5 +1,6 @@ # Tool Selection Analysis Setup +<<<<<<< HEAD <<<<<<< HEAD **Setup completed:** 2025-11-06 17:16:26 **Tool count:** 179 @@ -15,11 +16,17 @@ **Database setup time:** 1.4888934s >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +**Setup completed:** 2025-11-06 16:24:20 +**Tool count:** 179 +**Database setup time:** 1.5156559s +>>>>>>> e2fd2eac (refactor tts mcp tool) --- # Tool Selection Analysis Results +<<<<<<< HEAD <<<<<<< HEAD **Analysis Date:** 2025-11-06 17:16:26 **Tool count:** 179 @@ -32,6 +39,10 @@ **Tool count:** 174 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update 
prompts and tool description evaluator) +======= +**Analysis Date:** 2025-11-06 16:24:20 +**Tool count:** 179 +>>>>>>> e2fd2eac (refactor tts mcp tool) ## Table of Contents @@ -96,11 +107,15 @@ - [Test 59: speech_stt_recognize](#test-59) - [Test 60: speech_stt_recognize](#test-60) <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) - [Test 61: speech_stt_recognize](#test-61) - [Test 62: speech_stt_recognize](#test-62) - [Test 63: speech_stt_recognize](#test-63) - [Test 64: speech_stt_recognize](#test-64) - [Test 65: speech_stt_recognize](#test-65) +<<<<<<< HEAD - [Test 66: appconfig_account_list](#test-66) - [Test 67: appconfig_account_list](#test-67) - [Test 68: appconfig_account_list](#test-68) @@ -549,159 +564,161 @@ - [Test 63: speech_tts_synthesize](#test-63) - [Test 64: speech_tts_synthesize](#test-64) - [Test 65: speech_tts_synthesize](#test-65) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) - [Test 66: speech_tts_synthesize](#test-66) - [Test 67: speech_tts_synthesize](#test-67) - [Test 68: speech_tts_synthesize](#test-68) - [Test 69: speech_tts_synthesize](#test-69) - [Test 70: speech_tts_synthesize](#test-70) -- [Test 71: appconfig_account_list](#test-71) -- [Test 72: appconfig_account_list](#test-72) -- [Test 73: appconfig_account_list](#test-73) -- [Test 74: appconfig_kv_delete](#test-74) -- [Test 75: appconfig_kv_get](#test-75) -- [Test 76: appconfig_kv_get](#test-76) -- [Test 77: appconfig_kv_get](#test-77) -- [Test 78: appconfig_kv_get](#test-78) -- [Test 79: appconfig_kv_lock_set](#test-79) -- [Test 80: appconfig_kv_lock_set](#test-80) -- [Test 81: appconfig_kv_set](#test-81) -- [Test 82: applens_resource_diagnose](#test-82) -- [Test 83: applens_resource_diagnose](#test-83) -- [Test 84: applens_resource_diagnose](#test-84) -- [Test 85: appservice_database_add](#test-85) -- [Test 86: appservice_database_add](#test-86) -- [Test 87: appservice_database_add](#test-87) -- [Test 88: appservice_database_add](#test-88) -- [Test 
89: appservice_database_add](#test-89) +- [Test 71: speech_tts_synthesize](#test-71) +- [Test 72: speech_tts_synthesize](#test-72) +- [Test 73: speech_tts_synthesize](#test-73) +- [Test 74: speech_tts_synthesize](#test-74) +- [Test 75: speech_tts_synthesize](#test-75) +- [Test 76: appconfig_account_list](#test-76) +- [Test 77: appconfig_account_list](#test-77) +- [Test 78: appconfig_account_list](#test-78) +- [Test 79: appconfig_kv_delete](#test-79) +- [Test 80: appconfig_kv_get](#test-80) +- [Test 81: appconfig_kv_get](#test-81) +- [Test 82: appconfig_kv_get](#test-82) +- [Test 83: appconfig_kv_get](#test-83) +- [Test 84: appconfig_kv_lock_set](#test-84) +- [Test 85: appconfig_kv_lock_set](#test-85) +- [Test 86: appconfig_kv_set](#test-86) +- [Test 87: applens_resource_diagnose](#test-87) +- [Test 88: applens_resource_diagnose](#test-88) +- [Test 89: applens_resource_diagnose](#test-89) - [Test 90: appservice_database_add](#test-90) - [Test 91: appservice_database_add](#test-91) - [Test 92: appservice_database_add](#test-92) - [Test 93: appservice_database_add](#test-93) - [Test 94: appservice_database_add](#test-94) -- [Test 95: applicationinsights_recommendation_list](#test-95) -- [Test 96: applicationinsights_recommendation_list](#test-96) -- [Test 97: applicationinsights_recommendation_list](#test-97) -- [Test 98: applicationinsights_recommendation_list](#test-98) -- [Test 99: extension_cli_generate](#test-99) -- [Test 100: extension_cli_generate](#test-100) -- [Test 101: extension_cli_generate](#test-101) -- [Test 102: extension_cli_install](#test-102) -- [Test 103: extension_cli_install](#test-103) -- [Test 104: extension_cli_install](#test-104) -- [Test 105: acr_registry_list](#test-105) -- [Test 106: acr_registry_list](#test-106) -- [Test 107: acr_registry_list](#test-107) -- [Test 108: acr_registry_list](#test-108) -- [Test 109: acr_registry_list](#test-109) -- [Test 110: acr_registry_repository_list](#test-110) -- [Test 111: 
acr_registry_repository_list](#test-111) -- [Test 112: acr_registry_repository_list](#test-112) -- [Test 113: acr_registry_repository_list](#test-113) -- [Test 114: communication_email_send](#test-114) -- [Test 115: communication_email_send](#test-115) -- [Test 116: communication_email_send](#test-116) -- [Test 117: communication_email_send](#test-117) -- [Test 118: communication_email_send](#test-118) +- [Test 95: appservice_database_add](#test-95) +- [Test 96: appservice_database_add](#test-96) +- [Test 97: appservice_database_add](#test-97) +- [Test 98: appservice_database_add](#test-98) +- [Test 99: appservice_database_add](#test-99) +- [Test 100: applicationinsights_recommendation_list](#test-100) +- [Test 101: applicationinsights_recommendation_list](#test-101) +- [Test 102: applicationinsights_recommendation_list](#test-102) +- [Test 103: applicationinsights_recommendation_list](#test-103) +- [Test 104: extension_cli_generate](#test-104) +- [Test 105: extension_cli_generate](#test-105) +- [Test 106: extension_cli_generate](#test-106) +- [Test 107: extension_cli_install](#test-107) +- [Test 108: extension_cli_install](#test-108) +- [Test 109: extension_cli_install](#test-109) +- [Test 110: acr_registry_list](#test-110) +- [Test 111: acr_registry_list](#test-111) +- [Test 112: acr_registry_list](#test-112) +- [Test 113: acr_registry_list](#test-113) +- [Test 114: acr_registry_list](#test-114) +- [Test 115: acr_registry_repository_list](#test-115) +- [Test 116: acr_registry_repository_list](#test-116) +- [Test 117: acr_registry_repository_list](#test-117) +- [Test 118: acr_registry_repository_list](#test-118) - [Test 119: communication_email_send](#test-119) - [Test 120: communication_email_send](#test-120) - [Test 121: communication_email_send](#test-121) -- [Test 122: communication_sms_send](#test-122) -- [Test 123: communication_sms_send](#test-123) -- [Test 124: communication_sms_send](#test-124) -- [Test 125: communication_sms_send](#test-125) -- [Test 
126: communication_sms_send](#test-126) +- [Test 122: communication_email_send](#test-122) +- [Test 123: communication_email_send](#test-123) +- [Test 124: communication_email_send](#test-124) +- [Test 125: communication_email_send](#test-125) +- [Test 126: communication_email_send](#test-126) - [Test 127: communication_sms_send](#test-127) - [Test 128: communication_sms_send](#test-128) - [Test 129: communication_sms_send](#test-129) -- [Test 130: confidentialledger_entries_append](#test-130) -- [Test 131: confidentialledger_entries_append](#test-131) -- [Test 132: confidentialledger_entries_append](#test-132) -- [Test 133: confidentialledger_entries_append](#test-133) -- [Test 134: confidentialledger_entries_append](#test-134) -- [Test 135: confidentialledger_entries_get](#test-135) -- [Test 136: confidentialledger_entries_get](#test-136) -- [Test 137: cosmos_account_list](#test-137) -- [Test 138: cosmos_account_list](#test-138) -- [Test 139: cosmos_account_list](#test-139) -- [Test 140: cosmos_database_container_item_query](#test-140) -- [Test 141: cosmos_database_container_list](#test-141) -- [Test 142: cosmos_database_container_list](#test-142) -- [Test 143: cosmos_database_list](#test-143) -- [Test 144: cosmos_database_list](#test-144) -- [Test 145: kusto_cluster_get](#test-145) -- [Test 146: kusto_cluster_list](#test-146) -- [Test 147: kusto_cluster_list](#test-147) -- [Test 148: kusto_cluster_list](#test-148) -- [Test 149: kusto_database_list](#test-149) -- [Test 150: kusto_database_list](#test-150) -- [Test 151: kusto_query](#test-151) -- [Test 152: kusto_sample](#test-152) -- [Test 153: kusto_table_list](#test-153) -- [Test 154: kusto_table_list](#test-154) -- [Test 155: kusto_table_schema](#test-155) -- [Test 156: mysql_database_list](#test-156) -- [Test 157: mysql_database_list](#test-157) -- [Test 158: mysql_database_query](#test-158) -- [Test 159: mysql_server_config_get](#test-159) -- [Test 160: mysql_server_list](#test-160) -- [Test 161: 
mysql_server_list](#test-161) -- [Test 162: mysql_server_list](#test-162) -- [Test 163: mysql_server_param_get](#test-163) -- [Test 164: mysql_server_param_set](#test-164) -- [Test 165: mysql_table_list](#test-165) -- [Test 166: mysql_table_list](#test-166) -- [Test 167: mysql_table_schema_get](#test-167) -- [Test 168: postgres_database_list](#test-168) -- [Test 169: postgres_database_list](#test-169) -- [Test 170: postgres_database_query](#test-170) -- [Test 171: postgres_server_config_get](#test-171) -- [Test 172: postgres_server_list](#test-172) -- [Test 173: postgres_server_list](#test-173) -- [Test 174: postgres_server_list](#test-174) -- [Test 175: postgres_server_param_get](#test-175) -- [Test 176: postgres_server_param_set](#test-176) -- [Test 177: postgres_table_list](#test-177) -- [Test 178: postgres_table_list](#test-178) -- [Test 179: postgres_table_schema_get](#test-179) -- [Test 180: deploy_app_logs_get](#test-180) -- [Test 181: deploy_architecture_diagram_generate](#test-181) -- [Test 182: deploy_iac_rules_get](#test-182) -- [Test 183: deploy_pipeline_guidance_get](#test-183) -- [Test 184: deploy_plan_get](#test-184) -- [Test 185: eventgrid_events_publish](#test-185) -- [Test 186: eventgrid_events_publish](#test-186) -- [Test 187: eventgrid_events_publish](#test-187) -- [Test 188: eventgrid_topic_list](#test-188) -- [Test 189: eventgrid_topic_list](#test-189) -- [Test 190: eventgrid_topic_list](#test-190) -- [Test 191: eventgrid_topic_list](#test-191) -- [Test 192: eventgrid_subscription_list](#test-192) -- [Test 193: eventgrid_subscription_list](#test-193) -- [Test 194: eventgrid_subscription_list](#test-194) -- [Test 195: eventgrid_subscription_list](#test-195) -- [Test 196: eventgrid_subscription_list](#test-196) +- [Test 130: communication_sms_send](#test-130) +- [Test 131: communication_sms_send](#test-131) +- [Test 132: communication_sms_send](#test-132) +- [Test 133: communication_sms_send](#test-133) +- [Test 134: 
communication_sms_send](#test-134) +- [Test 135: confidentialledger_entries_append](#test-135) +- [Test 136: confidentialledger_entries_append](#test-136) +- [Test 137: confidentialledger_entries_append](#test-137) +- [Test 138: confidentialledger_entries_append](#test-138) +- [Test 139: confidentialledger_entries_append](#test-139) +- [Test 140: confidentialledger_entries_get](#test-140) +- [Test 141: confidentialledger_entries_get](#test-141) +- [Test 142: cosmos_account_list](#test-142) +- [Test 143: cosmos_account_list](#test-143) +- [Test 144: cosmos_account_list](#test-144) +- [Test 145: cosmos_database_container_item_query](#test-145) +- [Test 146: cosmos_database_container_list](#test-146) +- [Test 147: cosmos_database_container_list](#test-147) +- [Test 148: cosmos_database_list](#test-148) +- [Test 149: cosmos_database_list](#test-149) +- [Test 150: kusto_cluster_get](#test-150) +- [Test 151: kusto_cluster_list](#test-151) +- [Test 152: kusto_cluster_list](#test-152) +- [Test 153: kusto_cluster_list](#test-153) +- [Test 154: kusto_database_list](#test-154) +- [Test 155: kusto_database_list](#test-155) +- [Test 156: kusto_query](#test-156) +- [Test 157: kusto_sample](#test-157) +- [Test 158: kusto_table_list](#test-158) +- [Test 159: kusto_table_list](#test-159) +- [Test 160: kusto_table_schema](#test-160) +- [Test 161: mysql_database_list](#test-161) +- [Test 162: mysql_database_list](#test-162) +- [Test 163: mysql_database_query](#test-163) +- [Test 164: mysql_server_config_get](#test-164) +- [Test 165: mysql_server_list](#test-165) +- [Test 166: mysql_server_list](#test-166) +- [Test 167: mysql_server_list](#test-167) +- [Test 168: mysql_server_param_get](#test-168) +- [Test 169: mysql_server_param_set](#test-169) +- [Test 170: mysql_table_list](#test-170) +- [Test 171: mysql_table_list](#test-171) +- [Test 172: mysql_table_schema_get](#test-172) +- [Test 173: postgres_database_list](#test-173) +- [Test 174: postgres_database_list](#test-174) +- [Test 
175: postgres_database_query](#test-175) +- [Test 176: postgres_server_config_get](#test-176) +- [Test 177: postgres_server_list](#test-177) +- [Test 178: postgres_server_list](#test-178) +- [Test 179: postgres_server_list](#test-179) +- [Test 180: postgres_server_param_get](#test-180) +- [Test 181: postgres_server_param_set](#test-181) +- [Test 182: postgres_table_list](#test-182) +- [Test 183: postgres_table_list](#test-183) +- [Test 184: postgres_table_schema_get](#test-184) +- [Test 185: deploy_app_logs_get](#test-185) +- [Test 186: deploy_architecture_diagram_generate](#test-186) +- [Test 187: deploy_iac_rules_get](#test-187) +- [Test 188: deploy_pipeline_guidance_get](#test-188) +- [Test 189: deploy_plan_get](#test-189) +- [Test 190: eventgrid_events_publish](#test-190) +- [Test 191: eventgrid_events_publish](#test-191) +- [Test 192: eventgrid_events_publish](#test-192) +- [Test 193: eventgrid_topic_list](#test-193) +- [Test 194: eventgrid_topic_list](#test-194) +- [Test 195: eventgrid_topic_list](#test-195) +- [Test 196: eventgrid_topic_list](#test-196) - [Test 197: eventgrid_subscription_list](#test-197) - [Test 198: eventgrid_subscription_list](#test-198) -- [Test 199: eventhubs_eventhub_consumergroup_delete](#test-199) -- [Test 200: eventhubs_eventhub_consumergroup_get](#test-200) -- [Test 201: eventhubs_eventhub_consumergroup_get](#test-201) -- [Test 202: eventhubs_eventhub_consumergroup_update](#test-202) -- [Test 203: eventhubs_eventhub_consumergroup_update](#test-203) -- [Test 204: eventhubs_eventhub_delete](#test-204) -- [Test 205: eventhubs_eventhub_get](#test-205) -- [Test 206: eventhubs_eventhub_get](#test-206) -- [Test 207: eventhubs_eventhub_update](#test-207) -- [Test 208: eventhubs_eventhub_update](#test-208) -- [Test 209: eventhubs_namespace_delete](#test-209) -- [Test 210: eventhubs_namespace_get](#test-210) -- [Test 211: eventhubs_namespace_get](#test-211) -- [Test 212: eventhubs_namespace_update](#test-212) -- [Test 213: 
eventhubs_namespace_update](#test-213) -- [Test 214: functionapp_get](#test-214) -- [Test 215: functionapp_get](#test-215) -- [Test 216: functionapp_get](#test-216) -- [Test 217: functionapp_get](#test-217) -- [Test 218: functionapp_get](#test-218) +- [Test 199: eventgrid_subscription_list](#test-199) +- [Test 200: eventgrid_subscription_list](#test-200) +- [Test 201: eventgrid_subscription_list](#test-201) +- [Test 202: eventgrid_subscription_list](#test-202) +- [Test 203: eventgrid_subscription_list](#test-203) +- [Test 204: eventhubs_eventhub_consumergroup_delete](#test-204) +- [Test 205: eventhubs_eventhub_consumergroup_get](#test-205) +- [Test 206: eventhubs_eventhub_consumergroup_get](#test-206) +- [Test 207: eventhubs_eventhub_consumergroup_update](#test-207) +- [Test 208: eventhubs_eventhub_consumergroup_update](#test-208) +- [Test 209: eventhubs_eventhub_delete](#test-209) +- [Test 210: eventhubs_eventhub_get](#test-210) +- [Test 211: eventhubs_eventhub_get](#test-211) +- [Test 212: eventhubs_eventhub_update](#test-212) +- [Test 213: eventhubs_eventhub_update](#test-213) +- [Test 214: eventhubs_namespace_delete](#test-214) +- [Test 215: eventhubs_namespace_get](#test-215) +- [Test 216: eventhubs_namespace_get](#test-216) +- [Test 217: eventhubs_namespace_update](#test-217) +- [Test 218: eventhubs_namespace_update](#test-218) - [Test 219: functionapp_get](#test-219) - [Test 220: functionapp_get](#test-220) - [Test 221: functionapp_get](#test-221) @@ -709,75 +726,76 @@ - [Test 223: functionapp_get](#test-223) - [Test 224: functionapp_get](#test-224) - [Test 225: functionapp_get](#test-225) -- [Test 226: keyvault_admin_settings_get](#test-226) -- [Test 227: keyvault_admin_settings_get](#test-227) -- [Test 228: keyvault_admin_settings_get](#test-228) -- [Test 229: keyvault_certificate_create](#test-229) -- [Test 230: keyvault_certificate_create](#test-230) -- [Test 231: keyvault_certificate_create](#test-231) -- [Test 232: 
keyvault_certificate_create](#test-232) -- [Test 233: keyvault_certificate_create](#test-233) -- [Test 234: keyvault_certificate_get](#test-234) -- [Test 235: keyvault_certificate_get](#test-235) -- [Test 236: keyvault_certificate_get](#test-236) -- [Test 237: keyvault_certificate_get](#test-237) -- [Test 238: keyvault_certificate_get](#test-238) -- [Test 239: keyvault_certificate_import](#test-239) -- [Test 240: keyvault_certificate_import](#test-240) -- [Test 241: keyvault_certificate_import](#test-241) -- [Test 242: keyvault_certificate_import](#test-242) -- [Test 243: keyvault_certificate_import](#test-243) -- [Test 244: keyvault_certificate_list](#test-244) -- [Test 245: keyvault_certificate_list](#test-245) -- [Test 246: keyvault_certificate_list](#test-246) -- [Test 247: keyvault_certificate_list](#test-247) -- [Test 248: keyvault_certificate_list](#test-248) +- [Test 226: functionapp_get](#test-226) +- [Test 227: functionapp_get](#test-227) +- [Test 228: functionapp_get](#test-228) +- [Test 229: functionapp_get](#test-229) +- [Test 230: functionapp_get](#test-230) +- [Test 231: keyvault_admin_settings_get](#test-231) +- [Test 232: keyvault_admin_settings_get](#test-232) +- [Test 233: keyvault_admin_settings_get](#test-233) +- [Test 234: keyvault_certificate_create](#test-234) +- [Test 235: keyvault_certificate_create](#test-235) +- [Test 236: keyvault_certificate_create](#test-236) +- [Test 237: keyvault_certificate_create](#test-237) +- [Test 238: keyvault_certificate_create](#test-238) +- [Test 239: keyvault_certificate_get](#test-239) +- [Test 240: keyvault_certificate_get](#test-240) +- [Test 241: keyvault_certificate_get](#test-241) +- [Test 242: keyvault_certificate_get](#test-242) +- [Test 243: keyvault_certificate_get](#test-243) +- [Test 244: keyvault_certificate_import](#test-244) +- [Test 245: keyvault_certificate_import](#test-245) +- [Test 246: keyvault_certificate_import](#test-246) +- [Test 247: keyvault_certificate_import](#test-247) +- 
[Test 248: keyvault_certificate_import](#test-248) - [Test 249: keyvault_certificate_list](#test-249) -- [Test 250: keyvault_key_create](#test-250) -- [Test 251: keyvault_key_create](#test-251) -- [Test 252: keyvault_key_create](#test-252) -- [Test 253: keyvault_key_create](#test-253) -- [Test 254: keyvault_key_create](#test-254) -- [Test 255: keyvault_key_get](#test-255) -- [Test 256: keyvault_key_get](#test-256) -- [Test 257: keyvault_key_get](#test-257) -- [Test 258: keyvault_key_get](#test-258) -- [Test 259: keyvault_key_get](#test-259) -- [Test 260: keyvault_key_list](#test-260) -- [Test 261: keyvault_key_list](#test-261) -- [Test 262: keyvault_key_list](#test-262) -- [Test 263: keyvault_key_list](#test-263) -- [Test 264: keyvault_key_list](#test-264) +- [Test 250: keyvault_certificate_list](#test-250) +- [Test 251: keyvault_certificate_list](#test-251) +- [Test 252: keyvault_certificate_list](#test-252) +- [Test 253: keyvault_certificate_list](#test-253) +- [Test 254: keyvault_certificate_list](#test-254) +- [Test 255: keyvault_key_create](#test-255) +- [Test 256: keyvault_key_create](#test-256) +- [Test 257: keyvault_key_create](#test-257) +- [Test 258: keyvault_key_create](#test-258) +- [Test 259: keyvault_key_create](#test-259) +- [Test 260: keyvault_key_get](#test-260) +- [Test 261: keyvault_key_get](#test-261) +- [Test 262: keyvault_key_get](#test-262) +- [Test 263: keyvault_key_get](#test-263) +- [Test 264: keyvault_key_get](#test-264) - [Test 265: keyvault_key_list](#test-265) -- [Test 266: keyvault_secret_create](#test-266) -- [Test 267: keyvault_secret_create](#test-267) -- [Test 268: keyvault_secret_create](#test-268) -- [Test 269: keyvault_secret_create](#test-269) -- [Test 270: keyvault_secret_create](#test-270) -- [Test 271: keyvault_secret_get](#test-271) -- [Test 272: keyvault_secret_get](#test-272) -- [Test 273: keyvault_secret_get](#test-273) -- [Test 274: keyvault_secret_get](#test-274) -- [Test 275: keyvault_secret_get](#test-275) -- [Test 
276: keyvault_secret_list](#test-276) -- [Test 277: keyvault_secret_list](#test-277) -- [Test 278: keyvault_secret_list](#test-278) -- [Test 279: keyvault_secret_list](#test-279) -- [Test 280: keyvault_secret_list](#test-280) +- [Test 266: keyvault_key_list](#test-266) +- [Test 267: keyvault_key_list](#test-267) +- [Test 268: keyvault_key_list](#test-268) +- [Test 269: keyvault_key_list](#test-269) +- [Test 270: keyvault_key_list](#test-270) +- [Test 271: keyvault_secret_create](#test-271) +- [Test 272: keyvault_secret_create](#test-272) +- [Test 273: keyvault_secret_create](#test-273) +- [Test 274: keyvault_secret_create](#test-274) +- [Test 275: keyvault_secret_create](#test-275) +- [Test 276: keyvault_secret_get](#test-276) +- [Test 277: keyvault_secret_get](#test-277) +- [Test 278: keyvault_secret_get](#test-278) +- [Test 279: keyvault_secret_get](#test-279) +- [Test 280: keyvault_secret_get](#test-280) - [Test 281: keyvault_secret_list](#test-281) -- [Test 282: aks_cluster_get](#test-282) -- [Test 283: aks_cluster_get](#test-283) -- [Test 284: aks_cluster_get](#test-284) -- [Test 285: aks_cluster_get](#test-285) -- [Test 286: aks_cluster_get](#test-286) +- [Test 282: keyvault_secret_list](#test-282) +- [Test 283: keyvault_secret_list](#test-283) +- [Test 284: keyvault_secret_list](#test-284) +- [Test 285: keyvault_secret_list](#test-285) +- [Test 286: keyvault_secret_list](#test-286) - [Test 287: aks_cluster_get](#test-287) - [Test 288: aks_cluster_get](#test-288) -- [Test 289: aks_nodepool_get](#test-289) -- [Test 290: aks_nodepool_get](#test-290) -- [Test 291: aks_nodepool_get](#test-291) -- [Test 292: aks_nodepool_get](#test-292) -- [Test 293: aks_nodepool_get](#test-293) +- [Test 289: aks_cluster_get](#test-289) +- [Test 290: aks_cluster_get](#test-290) +- [Test 291: aks_cluster_get](#test-291) +- [Test 292: aks_cluster_get](#test-292) +- [Test 293: aks_cluster_get](#test-293) - [Test 294: aks_nodepool_get](#test-294) +<<<<<<< HEAD - [Test 295: 
loadtesting_test_create](#test-295) - [Test 296: loadtesting_test_get](#test-296) - [Test 297: loadtesting_testresource_create](#test-297) @@ -803,72 +821,99 @@ - [Test 316: get_bestpractices_get](#test-316) - [Test 317: get_bestpractices_get](#test-317) - [Test 318: get_bestpractices_get](#test-318) +======= +- [Test 295: aks_nodepool_get](#test-295) +- [Test 296: aks_nodepool_get](#test-296) +- [Test 297: aks_nodepool_get](#test-297) +- [Test 298: aks_nodepool_get](#test-298) +- [Test 299: aks_nodepool_get](#test-299) +- [Test 300: loadtesting_test_create](#test-300) +- [Test 301: loadtesting_test_get](#test-301) +- [Test 302: loadtesting_testresource_create](#test-302) +- [Test 303: loadtesting_testresource_list](#test-303) +- [Test 304: loadtesting_testrun_create](#test-304) +- [Test 305: loadtesting_testrun_get](#test-305) +- [Test 306: loadtesting_testrun_list](#test-306) +- [Test 307: loadtesting_testrun_update](#test-307) +- [Test 308: grafana_list](#test-308) +- [Test 309: managedlustre_fs_create](#test-309) +- [Test 310: managedlustre_fs_list](#test-310) +- [Test 311: managedlustre_fs_list](#test-311) +- [Test 312: managedlustre_fs_sku_get](#test-312) +- [Test 313: managedlustre_fs_subnetsize_ask](#test-313) +- [Test 314: managedlustre_fs_subnetsize_validate](#test-314) +- [Test 315: managedlustre_fs_update](#test-315) +- [Test 316: marketplace_product_get](#test-316) +- [Test 317: marketplace_product_list](#test-317) +- [Test 318: marketplace_product_list](#test-318) +>>>>>>> e2fd2eac (refactor tts mcp tool) - [Test 319: get_bestpractices_get](#test-319) - [Test 320: get_bestpractices_get](#test-320) - [Test 321: get_bestpractices_get](#test-321) - [Test 322: get_bestpractices_get](#test-322) -- [Test 323: monitor_activitylog_list](#test-323) -- [Test 324: monitor_healthmodels_entity_get](#test-324) -- [Test 325: monitor_metrics_definitions](#test-325) -- [Test 326: monitor_metrics_definitions](#test-326) -- [Test 327: 
monitor_metrics_definitions](#test-327) -- [Test 328: monitor_metrics_query](#test-328) -- [Test 329: monitor_metrics_query](#test-329) -- [Test 330: monitor_metrics_query](#test-330) -- [Test 331: monitor_metrics_query](#test-331) -- [Test 332: monitor_metrics_query](#test-332) +- [Test 323: get_bestpractices_get](#test-323) +- [Test 324: get_bestpractices_get](#test-324) +- [Test 325: get_bestpractices_get](#test-325) +- [Test 326: get_bestpractices_get](#test-326) +- [Test 327: get_bestpractices_get](#test-327) +- [Test 328: monitor_activitylog_list](#test-328) +- [Test 329: monitor_healthmodels_entity_get](#test-329) +- [Test 330: monitor_metrics_definitions](#test-330) +- [Test 331: monitor_metrics_definitions](#test-331) +- [Test 332: monitor_metrics_definitions](#test-332) - [Test 333: monitor_metrics_query](#test-333) -- [Test 334: monitor_resource_log_query](#test-334) -- [Test 335: monitor_table_list](#test-335) -- [Test 336: monitor_table_list](#test-336) -- [Test 337: monitor_table_type_list](#test-337) -- [Test 338: monitor_table_type_list](#test-338) -- [Test 339: monitor_webtests_create](#test-339) -- [Test 340: monitor_webtests_get](#test-340) -- [Test 341: monitor_webtests_list](#test-341) -- [Test 342: monitor_webtests_list](#test-342) -- [Test 343: monitor_webtests_update](#test-343) -- [Test 344: monitor_workspace_list](#test-344) -- [Test 345: monitor_workspace_list](#test-345) -- [Test 346: monitor_workspace_list](#test-346) -- [Test 347: monitor_workspace_log_query](#test-347) -- [Test 348: datadog_monitoredresources_list](#test-348) -- [Test 349: datadog_monitoredresources_list](#test-349) -- [Test 350: extension_azqr](#test-350) -- [Test 351: extension_azqr](#test-351) -- [Test 352: extension_azqr](#test-352) -- [Test 353: quota_region_availability_list](#test-353) -- [Test 354: quota_usage_check](#test-354) -- [Test 355: role_assignment_list](#test-355) -- [Test 356: role_assignment_list](#test-356) -- [Test 357: redis_list](#test-357) -- 
[Test 358: redis_list](#test-358) -- [Test 359: redis_list](#test-359) -- [Test 360: redis_list](#test-360) -- [Test 361: redis_list](#test-361) -- [Test 362: group_list](#test-362) -- [Test 363: group_list](#test-363) -- [Test 364: group_list](#test-364) -- [Test 365: resourcehealth_availability-status_get](#test-365) -- [Test 366: resourcehealth_availability-status_get](#test-366) -- [Test 367: resourcehealth_availability-status_get](#test-367) -- [Test 368: resourcehealth_availability-status_list](#test-368) -- [Test 369: resourcehealth_availability-status_list](#test-369) -- [Test 370: resourcehealth_availability-status_list](#test-370) -- [Test 371: resourcehealth_health-events_list](#test-371) -- [Test 372: resourcehealth_health-events_list](#test-372) -- [Test 373: resourcehealth_health-events_list](#test-373) -- [Test 374: resourcehealth_health-events_list](#test-374) -- [Test 375: resourcehealth_health-events_list](#test-375) -- [Test 376: servicebus_queue_details](#test-376) -- [Test 377: servicebus_topic_details](#test-377) -- [Test 378: servicebus_topic_subscription_details](#test-378) -- [Test 379: signalr_runtime_get](#test-379) -- [Test 380: signalr_runtime_get](#test-380) -- [Test 381: signalr_runtime_get](#test-381) -- [Test 382: signalr_runtime_get](#test-382) -- [Test 383: signalr_runtime_get](#test-383) +- [Test 334: monitor_metrics_query](#test-334) +- [Test 335: monitor_metrics_query](#test-335) +- [Test 336: monitor_metrics_query](#test-336) +- [Test 337: monitor_metrics_query](#test-337) +- [Test 338: monitor_metrics_query](#test-338) +- [Test 339: monitor_resource_log_query](#test-339) +- [Test 340: monitor_table_list](#test-340) +- [Test 341: monitor_table_list](#test-341) +- [Test 342: monitor_table_type_list](#test-342) +- [Test 343: monitor_table_type_list](#test-343) +- [Test 344: monitor_webtests_create](#test-344) +- [Test 345: monitor_webtests_get](#test-345) +- [Test 346: monitor_webtests_list](#test-346) +- [Test 347: 
monitor_webtests_list](#test-347) +- [Test 348: monitor_webtests_update](#test-348) +- [Test 349: monitor_workspace_list](#test-349) +- [Test 350: monitor_workspace_list](#test-350) +- [Test 351: monitor_workspace_list](#test-351) +- [Test 352: monitor_workspace_log_query](#test-352) +- [Test 353: datadog_monitoredresources_list](#test-353) +- [Test 354: datadog_monitoredresources_list](#test-354) +- [Test 355: extension_azqr](#test-355) +- [Test 356: extension_azqr](#test-356) +- [Test 357: extension_azqr](#test-357) +- [Test 358: quota_region_availability_list](#test-358) +- [Test 359: quota_usage_check](#test-359) +- [Test 360: role_assignment_list](#test-360) +- [Test 361: role_assignment_list](#test-361) +- [Test 362: redis_list](#test-362) +- [Test 363: redis_list](#test-363) +- [Test 364: redis_list](#test-364) +- [Test 365: redis_list](#test-365) +- [Test 366: redis_list](#test-366) +- [Test 367: group_list](#test-367) +- [Test 368: group_list](#test-368) +- [Test 369: group_list](#test-369) +- [Test 370: resourcehealth_availability-status_get](#test-370) +- [Test 371: resourcehealth_availability-status_get](#test-371) +- [Test 372: resourcehealth_availability-status_get](#test-372) +- [Test 373: resourcehealth_availability-status_list](#test-373) +- [Test 374: resourcehealth_availability-status_list](#test-374) +- [Test 375: resourcehealth_availability-status_list](#test-375) +- [Test 376: resourcehealth_health-events_list](#test-376) +- [Test 377: resourcehealth_health-events_list](#test-377) +- [Test 378: resourcehealth_health-events_list](#test-378) +- [Test 379: resourcehealth_health-events_list](#test-379) +- [Test 380: resourcehealth_health-events_list](#test-380) +- [Test 381: servicebus_queue_details](#test-381) +- [Test 382: servicebus_topic_details](#test-382) +- [Test 383: servicebus_topic_subscription_details](#test-383) - [Test 384: signalr_runtime_get](#test-384) +<<<<<<< HEAD - [Test 385: sql_db_create](#test-385) - [Test 386: 
sql_db_create](#test-386) - [Test 387: sql_db_create](#test-387) @@ -953,6 +998,93 @@ ======= >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +- [Test 385: signalr_runtime_get](#test-385) +- [Test 386: signalr_runtime_get](#test-386) +- [Test 387: signalr_runtime_get](#test-387) +- [Test 388: signalr_runtime_get](#test-388) +- [Test 389: signalr_runtime_get](#test-389) +- [Test 390: sql_db_create](#test-390) +- [Test 391: sql_db_create](#test-391) +- [Test 392: sql_db_create](#test-392) +- [Test 393: sql_db_delete](#test-393) +- [Test 394: sql_db_delete](#test-394) +- [Test 395: sql_db_delete](#test-395) +- [Test 396: sql_db_list](#test-396) +- [Test 397: sql_db_list](#test-397) +- [Test 398: sql_db_rename](#test-398) +- [Test 399: sql_db_rename](#test-399) +- [Test 400: sql_db_show](#test-400) +- [Test 401: sql_db_show](#test-401) +- [Test 402: sql_db_update](#test-402) +- [Test 403: sql_db_update](#test-403) +- [Test 404: sql_elastic-pool_list](#test-404) +- [Test 405: sql_elastic-pool_list](#test-405) +- [Test 406: sql_elastic-pool_list](#test-406) +- [Test 407: sql_server_create](#test-407) +- [Test 408: sql_server_create](#test-408) +- [Test 409: sql_server_create](#test-409) +- [Test 410: sql_server_delete](#test-410) +- [Test 411: sql_server_delete](#test-411) +- [Test 412: sql_server_delete](#test-412) +- [Test 413: sql_server_entra-admin_list](#test-413) +- [Test 414: sql_server_entra-admin_list](#test-414) +- [Test 415: sql_server_entra-admin_list](#test-415) +- [Test 416: sql_server_firewall-rule_create](#test-416) +- [Test 417: sql_server_firewall-rule_create](#test-417) +- [Test 418: sql_server_firewall-rule_create](#test-418) +- [Test 419: sql_server_firewall-rule_delete](#test-419) +- [Test 420: sql_server_firewall-rule_delete](#test-420) +- [Test 421: sql_server_firewall-rule_delete](#test-421) +- [Test 422: sql_server_firewall-rule_list](#test-422) +- [Test 
423: sql_server_firewall-rule_list](#test-423) +- [Test 424: sql_server_firewall-rule_list](#test-424) +- [Test 425: sql_server_list](#test-425) +- [Test 426: sql_server_list](#test-426) +- [Test 427: sql_server_show](#test-427) +- [Test 428: sql_server_show](#test-428) +- [Test 429: sql_server_show](#test-429) +- [Test 430: storage_account_create](#test-430) +- [Test 431: storage_account_create](#test-431) +- [Test 432: storage_account_create](#test-432) +- [Test 433: storage_account_get](#test-433) +- [Test 434: storage_account_get](#test-434) +- [Test 435: storage_account_get](#test-435) +- [Test 436: storage_account_get](#test-436) +- [Test 437: storage_account_get](#test-437) +- [Test 438: storage_blob_container_create](#test-438) +- [Test 439: storage_blob_container_create](#test-439) +- [Test 440: storage_blob_container_create](#test-440) +- [Test 441: storage_blob_container_get](#test-441) +- [Test 442: storage_blob_container_get](#test-442) +- [Test 443: storage_blob_container_get](#test-443) +- [Test 444: storage_blob_get](#test-444) +- [Test 445: storage_blob_get](#test-445) +- [Test 446: storage_blob_get](#test-446) +- [Test 447: storage_blob_get](#test-447) +- [Test 448: storage_blob_upload](#test-448) +- [Test 449: subscription_list](#test-449) +- [Test 450: subscription_list](#test-450) +- [Test 451: subscription_list](#test-451) +- [Test 452: subscription_list](#test-452) +- [Test 453: azureterraformbestpractices_get](#test-453) +- [Test 454: azureterraformbestpractices_get](#test-454) +- [Test 455: virtualdesktop_hostpool_list](#test-455) +- [Test 456: virtualdesktop_hostpool_host_list](#test-456) +- [Test 457: virtualdesktop_hostpool_host_user-list](#test-457) +- [Test 458: workbooks_create](#test-458) +- [Test 459: workbooks_delete](#test-459) +- [Test 460: workbooks_list](#test-460) +- [Test 461: workbooks_list](#test-461) +- [Test 462: workbooks_show](#test-462) +- [Test 463: workbooks_show](#test-463) +- [Test 464: workbooks_update](#test-464) 
+- [Test 465: bicepschema_get](#test-465) +- [Test 466: cloudarchitect_design](#test-466) +- [Test 467: cloudarchitect_design](#test-467) +- [Test 468: cloudarchitect_design](#test-468) +- [Test 469: cloudarchitect_design](#test-469) +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -967,6 +1099,7 @@ |------|-------|------|--------| | 1 | 0.705410 | `foundry_agents_connect` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.663468 | `foundry_agents_list` | ❌ | | 3 | 0.617213 | `foundry_resource_get` | ❌ | | 4 | 0.548044 | `foundry_openai_models-list` | ❌ | @@ -984,6 +1117,12 @@ | 5 | 0.536533 | `search_index_query` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.663568 | `foundry_agents_list` | ❌ | +| 3 | 0.617213 | `foundry_resource_get` | ❌ | +| 4 | 0.548108 | `foundry_agents_get-sdk-sample` | ❌ | +| 5 | 0.548044 | `foundry_openai_models-list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -997,6 +1136,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.543045 | `foundry_agents_query-and-evaluate` | ❌ | | 2 | 0.469272 | `foundry_agents_evaluate` | ✅ **EXPECTED** | | 3 | 0.445585 | `foundry_agents_connect` | ❌ | @@ -1004,15 +1144,19 @@ | 5 | 0.279058 | `foundry_agents_list` | ❌ | ======= <<<<<<< HEAD -| 1 | 0.544099 | `foundry_agents_query-and-evaluate` | ❌ | ======= -| 1 | 0.544237 | `foundry_agents_query-and-evaluate` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.544099 | `foundry_agents_query-and-evaluate` | ❌ | | 2 | 0.469428 | `foundry_agents_evaluate` | ✅ **EXPECTED** | | 3 | 0.445964 | `foundry_agents_connect` | ❌ | +<<<<<<< HEAD | 4 | 0.278921 | `foundry_agents_list` | ❌ | | 5 | 0.250023 | `monitor_workspace_log_query` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.297986 
| `foundry_threads_list` | ❌ | +| 5 | 0.278921 | `foundry_agents_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1026,6 +1170,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.797701 | `foundry_agents_list` | ✅ **EXPECTED** | | 2 | 0.666021 | `foundry_resource_get` | ❌ | | 3 | 0.654206 | `foundry_openai_models-list` | ❌ | @@ -1046,6 +1191,13 @@ | 5 | 0.542125 | `foundry_knowledge_index_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.797877 | `foundry_agents_list` | ✅ **EXPECTED** | +| 2 | 0.666021 | `foundry_resource_get` | ❌ | +| 3 | 0.654206 | `foundry_openai_models-list` | ❌ | +| 4 | 0.647246 | `foundry_threads_list` | ❌ | +| 5 | 0.575553 | `foundry_models_deployments_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1059,6 +1211,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.749704 | `foundry_agents_list` | ✅ **EXPECTED** | | 2 | 0.630323 | `foundry_resource_get` | ❌ | | 3 | 0.611801 | `foundry_openai_models-list` | ❌ | @@ -1079,6 +1232,13 @@ | 5 | 0.519892 | `foundry_knowledge_index_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.749829 | `foundry_agents_list` | ✅ **EXPECTED** | +| 2 | 0.630288 | `foundry_resource_get` | ❌ | +| 3 | 0.611722 | `foundry_openai_models-list` | ❌ | +| 4 | 0.603689 | `foundry_threads_list` | ❌ | +| 5 | 0.556990 | `foundry_agents_get-sdk-sample` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1092,10 +1252,17 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.587064 | `foundry_agents_create` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.561567 | `foundry_agents_get-sdk-sample` | ❌ | | 3 | 0.554070 | `foundry_threads_create` | ❌ | | 4 | 0.525727 | 
`foundry_models_deploy` | ❌ | | 5 | 0.525461 | `foundry_agents_list` | ❌ | +======= +| 2 | 0.562087 | `foundry_agents_get-sdk-sample` | ❌ | +| 3 | 0.554195 | `foundry_threads_create` | ❌ | +| 4 | 0.525727 | `foundry_models_deploy` | ❌ | +| 5 | 0.525615 | `foundry_agents_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1110,6 +1277,7 @@ |------|-------|------|--------| | 1 | 0.652200 | `foundry_agents_connect` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.570725 | `foundry_agents_list` | ❌ | | 3 | 0.553233 | `foundry_agents_query-and-evaluate` | ✅ **EXPECTED** | | 4 | 0.493778 | `foundry_agents_evaluate` | ❌ | @@ -1126,6 +1294,12 @@ >>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.460662 | `foundry_resource_get` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.570788 | `foundry_agents_list` | ❌ | +| 3 | 0.553190 | `foundry_agents_query-and-evaluate` | ✅ **EXPECTED** | +| 4 | 0.493779 | `foundry_agents_evaluate` | ❌ | +| 5 | 0.469431 | `foundry_threads_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1138,11 +1312,19 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.595581 | `foundry_agents_get-sdk-sample` | ✅ **EXPECTED** | | 2 | 0.552197 | `foundry_threads_create` | ❌ | | 3 | 0.521920 | `foundry_agents_connect` | ❌ | | 4 | 0.518552 | `foundry_agents_create` | ❌ | | 5 | 0.509581 | `foundry_agents_list` | ❌ | +======= +| 1 | 0.595766 | `foundry_agents_get-sdk-sample` | ✅ **EXPECTED** | +| 2 | 0.552022 | `foundry_threads_create` | ❌ | +| 3 | 0.521920 | `foundry_agents_connect` | ❌ | +| 4 | 0.518552 | `foundry_agents_create` | ❌ | +| 5 | 0.509764 | `foundry_agents_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1155,9 +1337,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.606811 | `foundry_threads_create` | ✅ **EXPECTED** | | 2 | 0.528310 | `foundry_openai_chat-completions-create` | ❌ | 
| 3 | 0.519709 | `foundry_threads_get-messages` | ❌ | +======= +| 1 | 0.606417 | `foundry_threads_create` | ✅ **EXPECTED** | +| 2 | 0.528310 | `foundry_openai_chat-completions-create` | ❌ | +| 3 | 0.519708 | `foundry_threads_get-messages` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.506089 | `foundry_threads_list` | ❌ | | 5 | 0.490796 | `foundry_models_deploy` | ❌ | @@ -1172,11 +1360,19 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.677249 | `foundry_threads_list` | ✅ **EXPECTED** | | 2 | 0.574068 | `foundry_threads_get-messages` | ❌ | | 3 | 0.566999 | `foundry_threads_create` | ❌ | | 4 | 0.471737 | `foundry_agents_get-sdk-sample` | ❌ | | 5 | 0.448682 | `foundry_agents_list` | ❌ | +======= +| 1 | 0.677248 | `foundry_threads_list` | ✅ **EXPECTED** | +| 2 | 0.574068 | `foundry_threads_get-messages` | ❌ | +| 3 | 0.566387 | `foundry_threads_create` | ❌ | +| 4 | 0.471544 | `foundry_agents_get-sdk-sample` | ❌ | +| 5 | 0.448963 | `foundry_agents_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1190,9 +1386,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.669937 | `foundry_threads_get-messages` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.584431 | `foundry_threads_create` | ❌ | | 3 | 0.529381 | `foundry_threads_list` | ❌ | | 4 | 0.437894 | `foundry_agents_get-sdk-sample` | ❌ | +======= +| 2 | 0.583991 | `foundry_threads_create` | ❌ | +| 3 | 0.529381 | `foundry_threads_list` | ❌ | +| 4 | 0.437480 | `foundry_agents_get-sdk-sample` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.427894 | `foundry_agents_create` | ❌ | --- @@ -1210,7 +1412,11 @@ | 2 | 0.537540 | `foundry_agents_list` | ❌ | | 3 | 0.526528 | `foundry_knowledge_index_schema` | ❌ | | 4 | 0.500786 | `foundry_threads_list` | ❌ | +<<<<<<< HEAD | 5 | 0.475746 | `foundry_models_deployments_list` | ❌ | +======= +| 5 | 0.475802 | `foundry_models_deployments_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ 
-1225,7 +1431,11 @@ |------|-------|------|--------| | 1 | 0.615458 | `foundry_knowledge_index_list` | ✅ **EXPECTED** | | 2 | 0.489311 | `foundry_knowledge_index_schema` | ❌ | +<<<<<<< HEAD | 3 | 0.484329 | `foundry_agents_list` | ❌ | +======= +| 3 | 0.484466 | `foundry_agents_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.454174 | `foundry_threads_list` | ❌ | | 5 | 0.441521 | `foundry_resource_get` | ❌ | @@ -1241,11 +1451,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.739885 | `foundry_knowledge_index_schema` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.589536 | `foundry_knowledge_index_list` | ❌ | -======= -| 2 | 0.614851 | `foundry_knowledge_index_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 3 | 0.494004 | `foundry_resource_get` | ❌ | | 4 | 0.491510 | `search_index_get` | ❌ | | 5 | 0.490410 | `search_knowledge_base_get` | ❌ | @@ -1279,7 +1485,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.562920 | `foundry_models_deploy` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.299986 | `foundry_openai_models-list` | ❌ | | 3 | 0.298490 | `loadtesting_testrun_create` | ❌ | | 4 | 0.293050 | `loadtesting_testresource_create` | ❌ | @@ -1287,6 +1492,7 @@ | 5 | 0.290387 | `foundry_openai_embeddings-create` | ❌ | ======= | 5 | 0.290381 | `foundry_openai_embeddings-create` | ❌ | +<<<<<<< HEAD ======= | 2 | 0.335116 | `foundry_openai_models-list` | ❌ | | 3 | 0.298490 | `loadtesting_testrun_create` | ❌ | @@ -1294,6 +1500,8 @@ | 5 | 0.282464 | `mysql_server_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1307,6 +1515,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.681081 | `foundry_models_deployments_list` | ✅ **EXPECTED** | | 2 | 0.674510 | `foundry_openai_models-list` | ❌ 
| | 3 | 0.572625 | `foundry_threads_list` | ❌ | @@ -1327,6 +1536,13 @@ | 5 | 0.539695 | `foundry_agents_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.681385 | `foundry_models_deployments_list` | ✅ **EXPECTED** | +| 2 | 0.674510 | `foundry_openai_models-list` | ❌ | +| 3 | 0.572625 | `foundry_threads_list` | ❌ | +| 4 | 0.569059 | `foundry_agents_list` | ❌ | +| 5 | 0.566272 | `foundry_resource_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1340,6 +1556,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.619840 | `foundry_models_deployments_list` | ✅ **EXPECTED** | | 2 | 0.619299 | `foundry_openai_models-list` | ❌ | | 3 | 0.543385 | `foundry_resource_get` | ❌ | @@ -1347,10 +1564,13 @@ | 5 | 0.527141 | `foundry_threads_list` | ❌ | ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.620173 | `foundry_models_deployments_list` | ✅ **EXPECTED** | | 2 | 0.619231 | `foundry_openai_models-list` | ❌ | | 3 | 0.543352 | `foundry_resource_get` | ❌ | | 4 | 0.540551 | `foundry_agents_list` | ❌ | +<<<<<<< HEAD | 5 | 0.521475 | `foundry_models_deploy` | ❌ | ======= | 1 | 0.606516 | `foundry_models_deployments_list` | ✅ **EXPECTED** | @@ -1360,6 +1580,9 @@ | 5 | 0.507301 | `foundry_openai_models-list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 5 | 0.527121 | `foundry_threads_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1372,10 +1595,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.603415 | `foundry_openai_models-list` | ❌ | | 2 | 0.560022 | `foundry_models_list` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.553634 | `foundry_threads_list` | ❌ | | 4 | 0.537958 | `foundry_models_deployments_list` | ❌ | | 5 | 
0.519191 | `foundry_agents_list` | ❌ | @@ -1391,6 +1614,11 @@ | 5 | 0.475204 | `foundry_openai_models-list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.553634 | `foundry_threads_list` | ❌ | +| 4 | 0.537981 | `foundry_models_deployments_list` | ❌ | +| 5 | 0.519472 | `foundry_agents_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1403,11 +1631,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.576904 | `foundry_openai_models-list` | ❌ | | 2 | 0.574818 | `foundry_models_list` | ✅ **EXPECTED** | | 3 | 0.525312 | `foundry_resource_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.522153 | `foundry_agents_get-sdk-sample` | ❌ | | 5 | 0.517825 | `foundry_models_deployments_list` | ❌ | ======= @@ -1421,6 +1649,10 @@ | 5 | 0.467671 | `foundry_models_deploy` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.521474 | `foundry_agents_get-sdk-sample` | ❌ | +| 5 | 0.517980 | `foundry_models_deployments_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1433,10 +1665,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.641293 | `foundry_openai_chat-completions-create` | ✅ **EXPECTED** | | 2 | 0.546736 | `foundry_openai_create-completion` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.420018 | `foundry_threads_create` | ❌ | | 4 | 0.415482 | `foundry_agents_connect` | ❌ | | 5 | 0.399382 | `foundry_openai_embeddings-create` | ❌ | @@ -1452,6 +1684,11 @@ | 5 | 0.361151 | `foundry_resource_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.421034 | `foundry_threads_create` | ❌ | +| 4 | 0.415483 | `foundry_agents_connect` | ❌ | +| 5 | 0.399383 | 
`foundry_openai_embeddings-create` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1464,7 +1701,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.696936 | `foundry_openai_create-completion` | ✅ **EXPECTED** | | 2 | 0.579108 | `foundry_openai_chat-completions-create` | ❌ | <<<<<<< HEAD @@ -1475,6 +1711,7 @@ | 3 | 0.463703 | `foundry_models_deploy` | ❌ | | 4 | 0.459126 | `foundry_resource_get` | ❌ | | 5 | 0.458622 | `foundry_openai_embeddings-create` | ❌ | +<<<<<<< HEAD ======= | 1 | 0.682250 | `foundry_openai_create-completion` | ✅ **EXPECTED** | | 2 | 0.539297 | `foundry_openai_chat-completions-create` | ❌ | @@ -1483,6 +1720,8 @@ | 5 | 0.450993 | `deploy_pipeline_guidance_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1496,23 +1735,19 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.766496 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | | 2 | 0.543339 | `foundry_models_deploy` | ❌ | ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.766338 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | | 2 | 0.543338 | `foundry_models_deploy` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.542214 | `foundry_openai_create-completion` | ❌ | | 4 | 0.520746 | `foundry_openai_models-list` | ❌ | | 5 | 0.519335 | `foundry_resource_get` | ❌ | -======= -| 1 | 0.681346 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | -| 2 | 0.556419 | `foundry_openai_create-completion` | ❌ | -| 3 | 0.543338 | `foundry_models_deploy` | ❌ | -| 4 | 0.519335 | `foundry_resource_get` | ❌ | -| 5 | 0.463954 | `foundry_openai_models-list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- @@ -1526,6 +1761,7 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.724369 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | | 2 | 0.494544 | `foundry_resource_get` | ❌ | | 3 | 0.480389 | `foundry_models_deploy` | ❌ | @@ -1533,11 +1769,14 @@ | 5 | 0.463885 | `foundry_openai_chat-completions-create` | ❌ | ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.724120 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | | 2 | 0.494485 | `foundry_resource_get` | ❌ | | 3 | 0.480296 | `foundry_models_deploy` | ❌ | | 4 | 0.480218 | `foundry_openai_create-completion` | ❌ | | 5 | 0.463797 | `foundry_openai_chat-completions-create` | ❌ | +<<<<<<< HEAD ======= | 1 | 0.638843 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | | 2 | 0.494506 | `foundry_openai_create-completion` | ❌ | @@ -1546,6 +1785,8 @@ | 5 | 0.399908 | `foundry_openai_chat-completions-create` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1558,7 +1799,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.799059 | `foundry_openai_models-list` | ✅ **EXPECTED** | | 2 | 0.668887 | `foundry_resource_get` | ❌ | <<<<<<< HEAD @@ -1569,6 +1809,7 @@ | 3 | 0.667040 | `foundry_models_list` | ❌ | | 4 | 0.666207 | `foundry_models_deployments_list` | ❌ | | 5 | 0.657546 | `foundry_agents_list` | ❌ | +<<<<<<< HEAD ======= | 1 | 0.729075 | `foundry_openai_models-list` | ✅ **EXPECTED** | | 2 | 0.668887 | `foundry_resource_get` | ❌ | @@ -1577,6 +1818,8 @@ | 5 | 0.604808 | `foundry_models_deployments_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1589,7 +1832,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 
0.741659 | `foundry_openai_models-list` | ✅ **EXPECTED** | | 2 | 0.660115 | `foundry_models_deployments_list` | ❌ | | 3 | 0.648218 | `foundry_resource_get` | ❌ | @@ -1598,6 +1840,7 @@ | 5 | 0.619790 | `foundry_agents_list` | ❌ | ======= | 5 | 0.619878 | `foundry_agents_list` | ❌ | +<<<<<<< HEAD ======= | 1 | 0.654318 | `foundry_openai_models-list` | ✅ **EXPECTED** | | 2 | 0.648219 | `foundry_resource_get` | ❌ | @@ -1606,6 +1849,8 @@ | 5 | 0.576563 | `foundry_agents_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1620,9 +1865,15 @@ |------|-------|------|--------| | 1 | 0.594096 | `foundry_resource_get` | ✅ **EXPECTED** | | 2 | 0.571916 | `foundry_openai_models-list` | ❌ | +<<<<<<< HEAD | 3 | 0.566762 | `foundry_agents_list` | ❌ | | 4 | 0.558075 | `foundry_threads_list` | ❌ | | 5 | 0.556154 | `search_service_list` | ❌ | +======= +| 3 | 0.567019 | `foundry_agents_list` | ❌ | +| 4 | 0.558290 | `search_service_list` | ❌ | +| 5 | 0.558076 | `foundry_threads_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1636,17 +1887,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.665311 | `foundry_resource_get` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.585305 | `foundry_openai_models-list` | ❌ | | 3 | 0.553808 | `foundry_agents_list` | ❌ | | 4 | 0.518747 | `foundry_openai_embeddings-create` | ❌ | | 5 | 0.492911 | `foundry_models_deploy` | ❌ | -======= -| 2 | 0.492911 | `foundry_models_deploy` | ❌ | -| 3 | 0.474905 | `foundry_agents_list` | ❌ | -| 4 | 0.467211 | `loadtesting_testresource_list` | ❌ | -| 5 | 0.453632 | `foundry_openai_models-list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- @@ -1660,9 +1904,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.735316 | `foundry_resource_get` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 
0.571906 | `foundry_openai_models-list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.509484 | `monitor_webtests_get` | ❌ | | 4 | 0.496980 | `foundry_openai_embeddings-create` | ❌ | | 5 | 0.475498 | `foundry_agents_list` | ❌ | @@ -1677,6 +1921,11 @@ | 5 | 0.444390 | `loadtesting_testresource_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.509484 | `monitor_webtests_get` | ❌ | +| 4 | 0.497090 | `foundry_openai_embeddings-create` | ❌ | +| 5 | 0.475722 | `foundry_agents_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1690,6 +1939,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.785967 | `search_knowledge_base_get` | ✅ **EXPECTED** | | 2 | 0.700824 | `search_knowledge_source_get` | ❌ | | 3 | 0.692681 | `search_service_list` | ❌ | @@ -1703,13 +1953,19 @@ | 4 | 0.635477 | `search_knowledge_base_retrieve` | ❌ | | 5 | 0.586578 | `search_index_get` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.785967 | `search_knowledge_base_get` | ✅ **EXPECTED** | -| 2 | 0.700968 | `search_knowledge_source_get` | ❌ | +| 2 | 0.700824 | `search_knowledge_source_get` | ❌ | | 3 | 0.693471 | `search_service_list` | ❌ | | 4 | 0.635863 | `search_knowledge_base_retrieve` | ❌ | +<<<<<<< HEAD | 5 | 0.603324 | `foundry_knowledge_index_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 5 | 0.586574 | `search_index_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1723,7 +1979,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.748213 | `search_knowledge_base_get` | ✅ **EXPECTED** | -| 2 | 0.668479 | `search_knowledge_source_get` | ❌ | +| 2 | 0.668487 | `search_knowledge_source_get` | ❌ | | 3 | 0.628582 | `search_knowledge_base_retrieve` | ❌ | | 4 | 0.623715 | 
`search_service_list` | ❌ | | 5 | 0.566618 | `search_index_get` | ❌ | @@ -1740,7 +1996,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.702942 | `search_knowledge_base_get` | ✅ **EXPECTED** | -| 2 | 0.606164 | `search_knowledge_source_get` | ❌ | +| 2 | 0.605964 | `search_knowledge_source_get` | ❌ | | 3 | 0.583234 | `search_knowledge_base_retrieve` | ❌ | | 4 | 0.512825 | `search_service_list` | ❌ | | 5 | 0.476815 | `foundry_knowledge_index_list` | ❌ | @@ -1757,6 +2013,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.688155 | `search_knowledge_base_get` | ✅ **EXPECTED** | | 2 | 0.599348 | `search_knowledge_source_get` | ❌ | | 3 | 0.578437 | `search_knowledge_base_retrieve` | ❌ | @@ -1769,6 +2026,13 @@ | 4 | 0.457619 | `search_service_list` | ❌ | | 5 | 0.439529 | `foundry_knowledge_index_list` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.688202 | `search_knowledge_base_get` | ✅ **EXPECTED** | +| 2 | 0.599369 | `search_knowledge_source_get` | ❌ | +| 3 | 0.578428 | `search_knowledge_base_retrieve` | ❌ | +| 4 | 0.457427 | `search_service_list` | ❌ | +| 5 | 0.439548 | `foundry_knowledge_index_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1782,6 +2046,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.769383 | `search_knowledge_base_get` | ✅ **EXPECTED** | | 2 | 0.685640 | `search_knowledge_source_get` | ❌ | | 3 | 0.636958 | `search_knowledge_base_retrieve` | ❌ | @@ -1795,13 +2060,18 @@ | 4 | 0.586085 | `search_index_get` | ❌ | | 5 | 0.533859 | `search_service_list` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.769384 | `search_knowledge_base_get` | ✅ **EXPECTED** | -| 2 | 0.685412 | `search_knowledge_source_get` | ❌ | +| 2 | 0.685640 | `search_knowledge_source_get` | ❌ | | 3 | 0.636958 | `search_knowledge_base_retrieve` | ❌ | | 4 | 0.585949 | 
`search_index_get` | ❌ | | 5 | 0.533700 | `search_service_list` | ❌ | +<<<<<<< HEAD >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1816,15 +2086,9 @@ |------|-------|------|--------| | 1 | 0.595585 | `search_knowledge_base_get` | ✅ **EXPECTED** | | 2 | 0.551922 | `search_knowledge_base_retrieve` | ❌ | -<<<<<<< HEAD | 3 | 0.515480 | `search_knowledge_source_get` | ❌ | | 4 | 0.366170 | `search_service_list` | ❌ | | 5 | 0.365633 | `search_index_get` | ❌ | -======= -| 3 | 0.515607 | `search_knowledge_source_get` | ❌ | -| 4 | 0.376599 | `foundry_knowledge_index_list` | ❌ | -| 5 | 0.366893 | `search_service_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- @@ -1838,6 +2102,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.724869 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.650606 | `search_knowledge_base_get` | ❌ | | 3 | 0.575356 | `search_index_query` | ❌ | @@ -1845,11 +2110,14 @@ | 5 | 0.520336 | `foundry_agents_connect` | ❌ | ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.724846 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.650590 | `search_knowledge_base_get` | ❌ | | 3 | 0.575307 | `search_index_query` | ❌ | | 4 | 0.567361 | `search_knowledge_source_get` | ❌ | | 5 | 0.520360 | `foundry_agents_connect` | ❌ | +<<<<<<< HEAD ======= | 1 | 0.724733 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.650523 | `search_knowledge_base_get` | ❌ | @@ -1858,6 +2126,8 @@ | 5 | 0.520277 | `foundry_agents_connect` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1879,13 +2149,8 @@ ======= | 1 | 0.633766 | `search_knowledge_base_retrieve` 
| ✅ **EXPECTED** | | 2 | 0.589869 | `search_knowledge_base_get` | ❌ | -<<<<<<< HEAD | 3 | 0.502085 | `search_knowledge_source_get` | ❌ | | 4 | 0.422671 | `foundry_agents_query-and-evaluate` | ❌ | -======= -| 3 | 0.501973 | `search_knowledge_source_get` | ❌ | -| 4 | 0.422489 | `foundry_agents_query-and-evaluate` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.399595 | `search_index_query` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) @@ -1901,6 +2166,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.657866 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.557206 | `search_knowledge_base_get` | ❌ | | 3 | 0.463605 | `search_knowledge_source_get` | ❌ | @@ -1914,13 +2180,18 @@ | 4 | 0.436952 | `foundry_agents_query-and-evaluate` | ❌ | | 5 | 0.422469 | `foundry_agents_connect` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.657865 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.557206 | `search_knowledge_base_get` | ❌ | -| 3 | 0.463023 | `search_knowledge_source_get` | ❌ | -| 4 | 0.436580 | `foundry_agents_query-and-evaluate` | ❌ | +| 3 | 0.463605 | `search_knowledge_source_get` | ❌ | +| 4 | 0.436739 | `foundry_agents_query-and-evaluate` | ❌ | | 5 | 0.422173 | `foundry_agents_connect` | ❌ | +<<<<<<< HEAD >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1935,17 +2206,12 @@ |------|-------|------|--------| | 1 | 0.633766 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.589869 | `search_knowledge_base_get` | ❌ | -<<<<<<< HEAD | 3 | 0.502085 | `search_knowledge_source_get` | ❌ | <<<<<<< HEAD | 4 | 0.422610 | `foundry_agents_query-and-evaluate` | ❌ | | 5 | 0.399521 | `search_index_query` | ❌ | ======= | 4 | 0.422671 | `foundry_agents_query-and-evaluate` | ❌ | 
-======= -| 3 | 0.501973 | `search_knowledge_source_get` | ❌ | -| 4 | 0.422489 | `foundry_agents_query-and-evaluate` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.399595 | `search_index_query` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) @@ -1963,16 +2229,15 @@ | 1 | 0.598868 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.547862 | `search_knowledge_base_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.467868 | `foundry_agents_query-and-evaluate` | ❌ | ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.467907 | `foundry_agents_query-and-evaluate` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.464904 | `search_knowledge_source_get` | ❌ | -======= -| 3 | 0.467711 | `foundry_agents_query-and-evaluate` | ❌ | -| 4 | 0.464987 | `search_knowledge_source_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.412481 | `foundry_agents_connect` | ❌ | --- @@ -1987,6 +2252,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.649767 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.631435 | `search_knowledge_base_get` | ❌ | | 3 | 0.581359 | `search_index_query` | ❌ | @@ -2000,13 +2266,18 @@ | 4 | 0.571126 | `search_knowledge_source_get` | ❌ | | 5 | 0.544488 | `search_service_list` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.649767 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.631435 | `search_knowledge_base_get` | ❌ | | 3 | 0.581387 | `search_index_query` | ❌ | -| 4 | 0.571101 | `search_knowledge_source_get` | ❌ | +| 4 | 0.571156 | `search_knowledge_source_get` | ❌ | | 5 | 0.544501 | `search_service_list` | ❌ | +<<<<<<< HEAD >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ 
-2022,19 +2293,17 @@ | 1 | 0.579716 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.560688 | `search_knowledge_base_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.477941 | `search_knowledge_source_get` | ❌ | | 4 | 0.402530 | `foundry_agents_query-and-evaluate` | ❌ | ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.477942 | `search_knowledge_source_get` | ❌ | | 4 | 0.402582 | `foundry_agents_query-and-evaluate` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.361231 | `foundry_knowledge_index_list` | ❌ | -======= -| 3 | 0.478132 | `search_knowledge_source_get` | ❌ | -| 4 | 0.402474 | `foundry_agents_query-and-evaluate` | ❌ | -| 5 | 0.371055 | `foundry_knowledge_index_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- @@ -2048,6 +2317,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.582662 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.528610 | `search_knowledge_base_get` | ❌ | | 3 | 0.449336 | `search_knowledge_source_get` | ❌ | @@ -2061,13 +2331,18 @@ | 4 | 0.447915 | `foundry_agents_query-and-evaluate` | ❌ | | 5 | 0.397238 | `foundry_agents_connect` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.582662 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.528610 | `search_knowledge_base_get` | ❌ | -| 3 | 0.449340 | `search_knowledge_source_get` | ❌ | -| 4 | 0.447632 | `foundry_agents_query-and-evaluate` | ❌ | +| 3 | 0.449336 | `search_knowledge_source_get` | ❌ | +| 4 | 0.447780 | `foundry_agents_query-and-evaluate` | ❌ | | 5 | 0.397187 | `foundry_agents_connect` | ❌ | +<<<<<<< HEAD >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2081,6 +2356,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< 
HEAD +<<<<<<< HEAD | 1 | 0.760406 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.690845 | `search_service_list` | ❌ | | 3 | 0.665905 | `search_knowledge_base_get` | ❌ | @@ -2088,11 +2364,14 @@ | 5 | 0.560755 | `search_knowledge_base_retrieve` | ❌ | ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.760416 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.691931 | `search_service_list` | ❌ | | 3 | 0.665923 | `search_knowledge_base_get` | ❌ | | 4 | 0.573012 | `search_index_get` | ❌ | | 5 | 0.560779 | `search_knowledge_base_retrieve` | ❌ | +<<<<<<< HEAD ======= | 1 | 0.760757 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.692251 | `search_service_list` | ❌ | @@ -2101,6 +2380,8 @@ | 5 | 0.573177 | `search_index_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2114,10 +2395,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.737860 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.659236 | `search_service_list` | ❌ | ======= | 1 | 0.737971 | `search_knowledge_source_get` | ✅ **EXPECTED** | +======= +| 1 | 0.737860 | `search_knowledge_source_get` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.660170 | `search_service_list` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.652969 | `search_knowledge_base_get` | ❌ | @@ -2136,10 +2421,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.657936 | `search_knowledge_source_get` | ✅ **EXPECTED** | ======= | 1 | 0.658365 | `search_knowledge_source_get` | ✅ **EXPECTED** | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.657935 | `search_knowledge_source_get` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 
| 0.558516 | `search_knowledge_base_get` | ❌ | | 3 | 0.510338 | `search_service_list` | ❌ | | 4 | 0.470560 | `search_knowledge_base_retrieve` | ❌ | @@ -2156,7 +2445,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.653143 | `search_knowledge_source_get` | ✅ **EXPECTED** | +| 1 | 0.652945 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.563270 | `search_knowledge_base_get` | ❌ | | 3 | 0.485934 | `search_service_list` | ❌ | | 4 | 0.477636 | `search_knowledge_base_retrieve` | ❌ | @@ -2174,6 +2463,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.825604 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.693438 | `search_knowledge_base_get` | ❌ | ======= @@ -2183,6 +2473,13 @@ | 3 | 0.595643 | `search_index_get` | ❌ | | 4 | 0.540550 | `search_knowledge_base_retrieve` | ❌ | | 5 | 0.531085 | `search_service_list` | ❌ | +======= +| 1 | 0.825552 | `search_knowledge_source_get` | ✅ **EXPECTED** | +| 2 | 0.693321 | `search_knowledge_base_get` | ❌ | +| 3 | 0.595371 | `search_index_get` | ❌ | +| 4 | 0.540647 | `search_knowledge_base_retrieve` | ❌ | +| 5 | 0.530887 | `search_service_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2195,7 +2492,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.631283 | `search_knowledge_source_get` | ✅ **EXPECTED** | +| 1 | 0.630840 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.523643 | `search_knowledge_base_get` | ❌ | | 3 | 0.459923 | `search_knowledge_base_retrieve` | ❌ | | 4 | 0.371465 | `search_index_get` | ❌ | @@ -2212,6 +2509,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.681052 | `search_index_get` | ✅ **EXPECTED** | | 2 | 0.544557 | `foundry_knowledge_index_schema` | ❌ | | 3 | 0.528153 | `search_knowledge_base_get` | ❌ | @@ -2222,6 +2520,13 @@ | 4 | 0.522514 | `search_knowledge_source_get` | ❌ | | 5 | 0.490624 | `search_service_list` | 
❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.680762 | `search_index_get` | ✅ **EXPECTED** | +| 2 | 0.544458 | `foundry_knowledge_index_schema` | ❌ | +| 3 | 0.527906 | `search_knowledge_base_get` | ❌ | +| 4 | 0.521626 | `search_knowledge_source_get` | ❌ | +| 5 | 0.490379 | `search_service_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2238,7 +2543,7 @@ | 2 | 0.619949 | `search_service_list` | ❌ | | 3 | 0.538885 | `foundry_knowledge_index_list` | ❌ | | 4 | 0.511485 | `search_knowledge_base_get` | ❌ | -| 5 | 0.496554 | `search_knowledge_source_get` | ❌ | +| 5 | 0.496094 | `search_knowledge_source_get` | ❌ | --- @@ -2255,7 +2560,7 @@ | 2 | 0.562503 | `search_service_list` | ❌ | | 3 | 0.538471 | `foundry_knowledge_index_list` | ❌ | | 4 | 0.500365 | `search_knowledge_base_get` | ❌ | -| 5 | 0.490330 | `search_knowledge_source_get` | ❌ | +| 5 | 0.490025 | `search_knowledge_source_get` | ❌ | --- @@ -2269,6 +2574,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.522598 | `search_index_get` | ❌ | | 2 | 0.515911 | `search_index_query` | ✅ **EXPECTED** | | 3 | 0.498264 | `search_service_list` | ❌ | @@ -2289,6 +2595,13 @@ | 5 | 0.437709 | `postgres_database_query` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.522826 | `search_index_get` | ❌ | +| 2 | 0.515870 | `search_index_query` | ✅ **EXPECTED** | +| 3 | 0.497467 | `search_service_list` | ❌ | +| 4 | 0.447977 | `search_knowledge_base_retrieve` | ❌ | +| 5 | 0.437715 | `postgres_database_query` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2306,18 +2619,15 @@ | 2 | 0.553012 | `kusto_cluster_list` | ❌ | ======= | 1 | 0.793651 | `search_service_list` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.553011 | `kusto_cluster_list` | ❌ | +<<<<<<< HEAD >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 
3 | 0.509479 | `subscription_list` | ❌ | +======= +| 3 | 0.509461 | `subscription_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.505971 | `search_index_get` | ❌ | | 5 | 0.504693 | `marketplace_product_list` | ❌ | -======= -| 2 | 0.553043 | `kusto_cluster_list` | ❌ | -| 3 | 0.520340 | `foundry_agents_list` | ❌ | -| 4 | 0.509461 | `subscription_list` | ❌ | -| 5 | 0.505971 | `search_index_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- @@ -2349,19 +2659,21 @@ |------|-------|------|--------| | 1 | 0.551241 | `search_service_list` | ✅ **EXPECTED** | | 2 | 0.436230 | `search_index_get` | ❌ | -<<<<<<< HEAD | 3 | 0.415277 | `search_knowledge_base_get` | ❌ | | 4 | 0.410461 | `search_knowledge_source_get` | ❌ | <<<<<<< HEAD | 5 | 0.404707 | `search_index_query` | ❌ | ======= | 5 | 0.404758 | `search_index_query` | ❌ | +<<<<<<< HEAD ======= | 3 | 0.417096 | `foundry_agents_list` | ❌ | | 4 | 0.415277 | `search_knowledge_base_get` | ❌ | | 5 | 0.410568 | `search_knowledge_source_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2375,6 +2687,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.666038 | `speech_stt_recognize` | ✅ **EXPECTED** | | 2 | 0.377210 | `foundry_openai_embeddings-create` | ❌ | | 3 | 0.351127 | `deploy_plan_get` | ❌ | @@ -2392,6 +2705,13 @@ | 5 | 0.351127 | `deploy_plan_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.682065 | `speech_tts_synthesize` | ❌ | +| 2 | 0.666038 | `speech_stt_recognize` | ✅ **EXPECTED** | +| 3 | 0.377022 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.351127 | `deploy_plan_get` | ❌ | +| 5 | 0.338137 | `extension_cli_generate` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp 
tool) --- @@ -2406,6 +2726,7 @@ |------|-------|------|--------| | 1 | 0.511324 | `speech_stt_recognize` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.198123 | `foundry_agents_get-sdk-sample` | ❌ | | 3 | 0.192462 | `foundry_openai_embeddings-create` | ❌ | | 4 | 0.170157 | `foundry_openai_create-completion` | ❌ | @@ -2423,6 +2744,12 @@ | 5 | 0.184542 | `foundry_openai_create-completion` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.344404 | `speech_tts_synthesize` | ❌ | +| 3 | 0.197854 | `foundry_agents_get-sdk-sample` | ❌ | +| 4 | 0.192450 | `foundry_openai_embeddings-create` | ❌ | +| 5 | 0.170157 | `foundry_openai_create-completion` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2437,6 +2764,7 @@ |------|-------|------|--------| | 1 | 0.486489 | `speech_stt_recognize` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.162863 | `foundry_threads_create` | ❌ | | 3 | 0.160209 | `foundry_agents_connect` | ❌ | | 4 | 0.156936 | `deploy_pipeline_guidance_get` | ❌ | @@ -2454,6 +2782,12 @@ | 5 | 0.160209 | `foundry_agents_connect` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.335115 | `speech_tts_synthesize` | ❌ | +| 3 | 0.163357 | `foundry_threads_create` | ❌ | +| 4 | 0.160209 | `foundry_agents_connect` | ❌ | +| 5 | 0.156850 | `deploy_pipeline_guidance_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2475,6 +2809,7 @@ ======= | 1 | 0.611992 | `speech_stt_recognize` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.309895 | `foundry_openai_embeddings-create` | ❌ | | 3 | 0.244218 | `foundry_resource_get` | ❌ | | 4 | 0.243626 | `foundry_openai_create-completion` | ❌ | @@ -2486,6 +2821,12 @@ | 5 | 0.251200 | `foundry_openai_chat-completions-create` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 
58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.573185 | `speech_tts_synthesize` | ❌ | +| 3 | 0.309895 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.244218 | `foundry_resource_get` | ❌ | +| 5 | 0.243626 | `foundry_openai_create-completion` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2500,6 +2841,7 @@ |------|-------|------|--------| | 1 | 0.410533 | `speech_stt_recognize` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.152414 | `foundry_openai_embeddings-create` | ❌ | ======= <<<<<<< HEAD @@ -2514,6 +2856,12 @@ | 4 | 0.158032 | `foundry_openai_chat-completions-create` | ❌ | | 5 | 0.152137 | `foundry_models_deploy` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) +======= +| 2 | 0.353783 | `speech_tts_synthesize` | ❌ | +| 3 | 0.152391 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.152137 | `foundry_models_deploy` | ❌ | +| 5 | 0.151632 | `deploy_pipeline_guidance_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2528,6 +2876,7 @@ |------|-------|------|--------| | 1 | 0.546259 | `speech_stt_recognize` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.218092 | `foundry_resource_get` | ❌ | | 3 | 0.202860 | `foundry_openai_embeddings-create` | ❌ | | 4 | 0.183420 | `extension_azqr` | ❌ | @@ -2538,6 +2887,12 @@ | 4 | 0.218092 | `foundry_resource_get` | ❌ | | 5 | 0.200865 | `foundry_openai_chat-completions-create` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) +======= +| 2 | 0.480203 | `speech_tts_synthesize` | ❌ | +| 3 | 0.218092 | `foundry_resource_get` | ❌ | +| 4 | 0.202935 | `foundry_openai_embeddings-create` | ❌ | +| 5 | 0.183420 | `extension_azqr` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2551,6 +2906,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.539963 | `speech_stt_recognize` | ✅ **EXPECTED** | | 2 | 0.228587 | `foundry_openai_create-completion` | ❌ | | 3 | 0.203413 | 
`foundry_agents_connect` | ❌ | @@ -2571,6 +2927,13 @@ | 5 | 0.203413 | `foundry_agents_connect` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.539963 | `speech_stt_recognize` | ✅ **EXPECTED** | +| 2 | 0.367401 | `speech_tts_synthesize` | ❌ | +| 3 | 0.228587 | `foundry_openai_create-completion` | ❌ | +| 4 | 0.203413 | `foundry_agents_connect` | ❌ | +| 5 | 0.199585 | `foundry_openai_embeddings-create` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2584,6 +2947,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.549151 | `speech_stt_recognize` | ✅ **EXPECTED** | | 2 | 0.393626 | `azureaibestpractices_get` | ❌ | | 3 | 0.342537 | `extension_cli_generate` | ❌ | @@ -2604,6 +2968,13 @@ | 5 | 0.342537 | `extension_cli_generate` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.549151 | `speech_stt_recognize` | ✅ **EXPECTED** | +| 2 | 0.468161 | `speech_tts_synthesize` | ❌ | +| 3 | 0.342537 | `extension_cli_generate` | ❌ | +| 4 | 0.337387 | `cloudarchitect_design` | ❌ | +| 5 | 0.335741 | `foundry_openai_create-completion` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2618,6 +2989,7 @@ |------|-------|------|--------| | 1 | 0.532536 | `speech_stt_recognize` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.349892 | `foundry_openai_create-completion` | ❌ | <<<<<<< HEAD | 3 | 0.348381 | `azureaibestpractices_get` | ❌ | @@ -2634,6 +3006,12 @@ | 5 | 0.378382 | `foundry_openai_create-completion` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.515532 | `speech_tts_synthesize` | ❌ | +| 3 | 0.349892 | `foundry_openai_create-completion` | ❌ | +| 4 | 0.340893 | 
`foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.332669 | `foundry_openai_embeddings-create` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2648,6 +3026,7 @@ |------|-------|------|--------| | 1 | 0.453396 | `speech_stt_recognize` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.173280 | `deploy_pipeline_guidance_get` | ❌ | | 3 | 0.164929 | `foundry_openai_embeddings-create` | ❌ | | 4 | 0.160483 | `foundry_agents_connect` | ❌ | @@ -2665,6 +3044,12 @@ | 5 | 0.173205 | `deploy_pipeline_guidance_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.322710 | `speech_tts_synthesize` | ❌ | +| 3 | 0.173205 | `deploy_pipeline_guidance_get` | ❌ | +| 4 | 0.164990 | `foundry_openai_embeddings-create` | ❌ | +| 5 | 0.160483 | `foundry_agents_connect` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2677,15 +3062,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.547977 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 1 | 0.521797 | `speech_tts_synthesize` | ✅ **EXPECTED** | | 2 | 0.422457 | `speech_stt_recognize` | ❌ | -| 3 | 0.231058 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.200920 | `foundry_openai_create-completion` | ❌ | -| 5 | 0.192179 | `foundry_openai_chat-completions-create` | ❌ | +| 3 | 0.196049 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.189438 | `foundry_openai_create-completion` | ❌ | +| 5 | 0.174955 | `foundry_openai_chat-completions-create` | ❌ | --- -## Test 62 +## Test 67 **Expected Tool:** `speech_tts_synthesize` **Prompt:** Synthesize speech from "Hello, welcome to Azure" and save to welcome.wav @@ -2694,15 +3079,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.531396 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 1 | 0.516973 | `speech_tts_synthesize` | ✅ **EXPECTED** | | 2 | 0.486019 | `speech_stt_recognize` | ❌ | | 3 | 0.329765 | 
`deploy_pipeline_guidance_get` | ❌ | | 4 | 0.323728 | `extension_cli_generate` | ❌ | -| 5 | 0.320006 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.317525 | `azureterraformbestpractices_get` | ❌ | --- -## Test 63 +## Test 68 **Expected Tool:** `speech_tts_synthesize` **Prompt:** Generate speech audio from text "Hello world" using Azure Speech Services @@ -2711,15 +3096,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.590514 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 1 | 0.592156 | `speech_tts_synthesize` | ✅ **EXPECTED** | | 2 | 0.534002 | `speech_stt_recognize` | ❌ | -| 3 | 0.362626 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.341003 | `foundry_openai_create-completion` | ❌ | -| 5 | 0.333557 | `foundry_openai_chat-completions-create` | ❌ | +| 3 | 0.339580 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.327397 | `foundry_openai_create-completion` | ❌ | +| 5 | 0.315764 | `foundry_openai_chat-completions-create` | ❌ | --- -## Test 64 +## Test 69 **Expected Tool:** `speech_tts_synthesize` **Prompt:** Convert text to speech with Spanish language and save to spanish-audio.wav @@ -2728,15 +3113,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.520866 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 1 | 0.501096 | `speech_tts_synthesize` | ✅ **EXPECTED** | | 2 | 0.452648 | `speech_stt_recognize` | ❌ | -| 3 | 0.231393 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.204970 | `foundry_openai_create-completion` | ❌ | -| 5 | 0.202502 | `foundry_openai_chat-completions-create` | ❌ | +| 3 | 0.210841 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.196766 | `foundry_models_deploy` | ❌ | +| 5 | 0.191812 | `foundry_openai_chat-completions-create` | ❌ | --- -## Test 65 +## Test 70 **Expected Tool:** `speech_tts_synthesize` **Prompt:** Synthesize speech with voice en-US-JennyNeural from text "Azure AI Services" @@ -2745,15 +3130,15 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| -| 1 | 0.604553 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 1 | 0.604878 | `speech_tts_synthesize` | ✅ **EXPECTED** | | 2 | 0.496715 | `speech_stt_recognize` | ❌ | -| 3 | 0.423461 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.390312 | `foundry_agents_list` | ❌ | -| 5 | 0.381735 | `foundry_openai_chat-completions-create` | ❌ | +| 3 | 0.417045 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.379840 | `foundry_openai_create-completion` | ❌ | +| 5 | 0.354130 | `foundry_openai_chat-completions-create` | ❌ | --- -## Test 66 +## Test 71 **Expected Tool:** `speech_tts_synthesize` **Prompt:** Create MP3 audio file from text "Welcome to Azure" with high quality format @@ -2762,15 +3147,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.564876 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 1 | 0.561285 | `speech_tts_synthesize` | ✅ **EXPECTED** | | 2 | 0.510908 | `speech_stt_recognize` | ❌ | -| 3 | 0.360542 | `foundry_openai_embeddings-create` | ❌ | +| 3 | 0.348757 | `foundry_openai_embeddings-create` | ❌ | | 4 | 0.347597 | `deploy_pipeline_guidance_get` | ❌ | | 5 | 0.345073 | `deploy_iac_rules_get` | ❌ | --- -## Test 67 +## Test 72 **Expected Tool:** `speech_tts_synthesize` **Prompt:** Generate speech with custom voice model using endpoint ID @@ -2779,15 +3164,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.547864 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 1 | 0.527294 | `speech_tts_synthesize` | ✅ **EXPECTED** | | 2 | 0.455734 | `speech_stt_recognize` | ❌ | -| 3 | 0.367601 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.358913 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.356105 | `foundry_models_deployments_list` | ❌ | +| 3 | 0.353108 | `foundry_resource_get` | ❌ | +| 4 | 0.343308 | `foundry_models_deploy` | ❌ | +| 5 | 0.337888 | `foundry_openai_embeddings-create` | ❌ | --- -## Test 68 +## Test 73 **Expected Tool:** `speech_tts_synthesize` 
**Prompt:** Convert text to OGG/Opus format audio file @@ -2796,15 +3181,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.446150 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 1 | 0.432836 | `speech_tts_synthesize` | ✅ **EXPECTED** | | 2 | 0.410086 | `speech_stt_recognize` | ❌ | -| 3 | 0.263503 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.199147 | `foundry_openai_create-completion` | ❌ | -| 5 | 0.196153 | `extension_cli_generate` | ❌ | +| 3 | 0.234237 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.196153 | `extension_cli_generate` | ❌ | +| 5 | 0.175963 | `foundry_openai_create-completion` | ❌ | --- -## Test 69 +## Test 74 **Expected Tool:** `speech_tts_synthesize` **Prompt:** Synthesize long text content to audio file with streaming @@ -2813,15 +3198,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.449165 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 1 | 0.428079 | `speech_tts_synthesize` | ✅ **EXPECTED** | | 2 | 0.369045 | `speech_stt_recognize` | ❌ | -| 3 | 0.225665 | `foundry_openai_create-completion` | ❌ | -| 4 | 0.225088 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.218260 | `foundry_openai_embeddings-create` | ❌ | +| 3 | 0.230725 | `foundry_openai_embeddings-create` | ❌ | +| 4 | 0.220793 | `foundry_openai_create-completion` | ❌ | +| 5 | 0.216475 | `foundry_openai_chat-completions-create` | ❌ | --- -## Test 70 +## Test 75 **Expected Tool:** `speech_tts_synthesize` **Prompt:** Create audio file from text in French language with appropriate voice @@ -2830,15 +3215,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.467698 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 1 | 0.444444 | `speech_tts_synthesize` | ✅ **EXPECTED** | | 2 | 0.385267 | `speech_stt_recognize` | ❌ | -| 3 | 0.235591 | `foundry_openai_create-completion` | ❌ | -| 4 | 0.215304 | `foundry_openai_embeddings-create` | ❌ | -| 5 | 0.208978 | `foundry_openai_chat-completions-create` 
| ❌ | +| 3 | 0.229890 | `foundry_openai_create-completion` | ❌ | +| 4 | 0.228317 | `foundry_openai_embeddings-create` | ❌ | +| 5 | 0.213222 | `foundry_openai_chat-completions-create` | ❌ | --- -## Test 71 +## Test 76 **Expected Tool:** `appconfig_account_list` **Prompt:** List all App Configuration stores in my subscription @@ -2850,6 +3235,7 @@ | 1 | 0.786298 | `appconfig_account_list` | ✅ **EXPECTED** | | 2 | 0.530613 | `appconfig_kv_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.491380 | `postgres_server_list` | ❌ | ======= <<<<<<< HEAD @@ -2858,18 +3244,23 @@ | 4 | 0.481223 | `kusto_cluster_list` | ❌ | | 5 | 0.479997 | `subscription_list` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.491380 | `postgres_server_list` | ❌ | -| 4 | 0.481174 | `kusto_cluster_list` | ❌ | +| 4 | 0.481223 | `kusto_cluster_list` | ❌ | | 5 | 0.479988 | `subscription_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 67 ======= ## Test 72 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 77 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_account_list` **Prompt:** Show me the App Configuration stores in my subscription @@ -2878,6 +3269,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.635056 | `appconfig_account_list` | ✅ **EXPECTED** | | 2 | 0.464826 | `appconfig_kv_get` | ❌ | | 3 | 0.398562 | `subscription_list` | ❌ | @@ -2891,6 +3283,17 @@ ======= ## Test 73 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.634978 | `appconfig_account_list` | ✅ **EXPECTED** | +| 2 | 0.464865 | `appconfig_kv_get` | ❌ | +| 3 | 0.398495 | `subscription_list` | ❌ | +| 4 | 0.391286 | `redis_list` | ❌ | +| 5 | 0.372456 | `postgres_server_list` | ❌ | + +--- + +## Test 78 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_account_list` **Prompt:** Show me my App 
Configuration stores @@ -2907,11 +3310,15 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 69 ======= ## Test 74 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 79 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_delete` **Prompt:** Delete the key in App Configuration store @@ -2921,6 +3328,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.618276 | `appconfig_kv_delete` | ✅ **EXPECTED** | | 2 | 0.464358 | `appconfig_kv_get` | ❌ | | 3 | 0.424344 | `appconfig_kv_set` | ❌ | @@ -2932,23 +3340,22 @@ ## Test 70 ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.618277 | `appconfig_kv_delete` | ✅ **EXPECTED** | | 2 | 0.464358 | `appconfig_kv_get` | ❌ | | 3 | 0.424344 | `appconfig_kv_set` | ❌ | | 4 | 0.422700 | `appconfig_kv_lock_set` | ❌ | | 5 | 0.392016 | `appconfig_account_list` | ❌ | -======= -| 1 | 0.618267 | `appconfig_kv_delete` | ✅ **EXPECTED** | -| 2 | 0.464368 | `appconfig_kv_get` | ❌ | -| 3 | 0.424296 | `appconfig_kv_set` | ❌ | -| 4 | 0.422722 | `appconfig_kv_lock_set` | ❌ | -| 5 | 0.392081 | `appconfig_account_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- +<<<<<<< HEAD ## Test 75 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 80 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_get` **Prompt:** List all key-value settings in App Configuration store @@ -2965,11 +3372,15 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 71 ======= ## Test 76 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 81 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_get` **Prompt:** Show me the key-value settings in App Configuration store @@ -2986,11 +3397,15 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 72 ======= ## Test 77 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 
82 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_get` **Prompt:** List all key-value settings with key name starting with 'prod-' in App Configuration store @@ -3000,6 +3415,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.512883 | `appconfig_kv_get` | ✅ **EXPECTED** | | 2 | 0.450109 | `appconfig_account_list` | ❌ | | 3 | 0.398684 | `appconfig_kv_set` | ❌ | @@ -3028,6 +3444,17 @@ ## Test 78 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.512883 | `appconfig_kv_get` | ✅ **EXPECTED** | +| 2 | 0.449905 | `appconfig_account_list` | ❌ | +| 3 | 0.398684 | `appconfig_kv_set` | ❌ | +| 4 | 0.380614 | `appconfig_kv_delete` | ❌ | +| 5 | 0.346166 | `appconfig_kv_lock_set` | ❌ | + +--- + +## Test 83 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_get` **Prompt:** Show the content for the key in App Configuration store @@ -3044,11 +3471,15 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 74 ======= ## Test 79 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 84 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_lock_set` **Prompt:** Lock the key in App Configuration store @@ -3065,11 +3496,15 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 75 ======= ## Test 80 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 85 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_lock_set` **Prompt:** Unlock the key in App Configuration store @@ -3078,7 +3513,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.555699 | `appconfig_kv_lock_set` | ✅ **EXPECTED** | | 2 | 0.505681 | `appconfig_kv_get` | ❌ | | 3 | 0.476497 | `appconfig_kv_delete` | ❌ | @@ -3091,18 +3525,15 @@ ## Test 76 ======= | 5 | 0.409406 | `appconfig_account_list` | ❌ | -======= -| 1 | 0.555732 | `appconfig_kv_lock_set` | ✅ **EXPECTED** | -| 2 | 
0.505675 | `appconfig_kv_get` | ❌ | -| 3 | 0.476507 | `appconfig_kv_delete` | ❌ | -| 4 | 0.425479 | `appconfig_kv_set` | ❌ | -| 5 | 0.409370 | `appconfig_account_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- +<<<<<<< HEAD ## Test 81 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 86 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_set` **Prompt:** Set the key in App Configuration store to @@ -3111,6 +3542,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.609635 | `appconfig_kv_set` | ✅ **EXPECTED** | | 2 | 0.536497 | `appconfig_kv_lock_set` | ❌ | | 3 | 0.512707 | `appconfig_kv_get` | ❌ | @@ -3124,6 +3556,17 @@ ======= ## Test 82 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.609760 | `appconfig_kv_set` | ✅ **EXPECTED** | +| 2 | 0.536630 | `appconfig_kv_lock_set` | ❌ | +| 3 | 0.512740 | `appconfig_kv_get` | ❌ | +| 4 | 0.505638 | `appconfig_kv_delete` | ❌ | +| 5 | 0.377900 | `appconfig_account_list` | ❌ | + +--- + +## Test 87 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `applens_resource_diagnose` **Prompt:** Please help me diagnose issues with my app using app lens @@ -3133,18 +3576,28 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.595632 | `applens_resource_diagnose` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.335768 | `deploy_app_logs_get` | ❌ | | 3 | 0.300786 | `deploy_architecture_diagram_generate` | ❌ | | 4 | 0.273083 | `cloudarchitect_design` | ❌ | +======= +| 2 | 0.336090 | `deploy_app_logs_get` | ❌ | +| 3 | 0.300786 | `deploy_architecture_diagram_generate` | ❌ | +| 4 | 0.273082 | `cloudarchitect_design` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.254473 | `monitor_resource_log_query` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 78 ======= ## Test 83 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 
88 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `applens_resource_diagnose` **Prompt:** Use app lens to check why my app is slow? @@ -3154,6 +3607,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.502361 | `applens_resource_diagnose` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.316002 | `deploy_app_logs_get` | ❌ | | 3 | 0.255570 | `deploy_architecture_diagram_generate` | ❌ | | 4 | 0.249583 | `monitor_resource_log_query` | ❌ | @@ -3167,13 +3621,21 @@ <<<<<<< HEAD | 5 | 0.226092 | `quota_usage_check` | ❌ | ======= +======= +| 2 | 0.316297 | `deploy_app_logs_get` | ❌ | +| 3 | 0.255570 | `deploy_architecture_diagram_generate` | ❌ | +| 4 | 0.249583 | `monitor_resource_log_query` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.225972 | `quota_usage_check` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- +<<<<<<< HEAD ## Test 84 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 89 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `applens_resource_diagnose` **Prompt:** What does app lens say is wrong with my service? 
@@ -3190,11 +3652,15 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 80 ======= ## Test 85 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 90 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Add database connection to my app service for database in resource group @@ -3204,6 +3670,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.717878 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.401376 | `sql_db_rename` | ❌ | | 3 | 0.399941 | `sql_db_create` | ❌ | @@ -3215,23 +3682,22 @@ ## Test 81 ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.717887 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.401337 | `sql_db_rename` | ❌ | -| 3 | 0.399997 | `sql_db_create` | ❌ | +| 3 | 0.399820 | `sql_db_create` | ❌ | | 4 | 0.362889 | `sql_db_show` | ❌ | -| 5 | 0.357708 | `sql_db_list` | ❌ | -======= -| 1 | 0.682502 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.401311 | `sql_db_rename` | ❌ | -| 3 | 0.400175 | `sql_db_create` | ❌ | -| 4 | 0.363123 | `sql_db_show` | ❌ | -| 5 | 0.357874 | `sql_db_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.357806 | `sql_db_list` | ❌ | --- +<<<<<<< HEAD ## Test 86 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 91 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Configure SQL Server database for app service with connection string in resource group @@ -3241,6 +3707,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.688410 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.498122 | `sql_db_rename` | ❌ | | 3 | 0.497502 | `sql_db_create` | ❌ | @@ -3269,6 +3736,17 @@ ## Test 87 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.688409 | 
`appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.498049 | `sql_db_rename` | ❌ | +| 3 | 0.497520 | `sql_db_create` | ❌ | +| 4 | 0.469335 | `sql_db_show` | ❌ | +| 5 | 0.452906 | `sql_db_list` | ❌ | + +--- + +## Test 92 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Add MySQL database to app service using connection in resource group @@ -3278,6 +3756,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.675970 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.464756 | `sql_db_create` | ❌ | | 3 | 0.452407 | `sql_db_rename` | ❌ | @@ -3306,6 +3785,17 @@ ## Test 88 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.675678 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.465147 | `sql_db_create` | ❌ | +| 3 | 0.452626 | `sql_db_rename` | ❌ | +| 4 | 0.433261 | `mysql_server_list` | ❌ | +| 5 | 0.410304 | `sql_db_show` | ❌ | + +--- + +## Test 93 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Add PostgreSQL database to app service using connection in resource group @@ -3315,6 +3805,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.628119 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.444212 | `sql_db_create` | ❌ | | 3 | 0.405314 | `postgres_database_query` | ❌ | @@ -3343,6 +3834,17 @@ ## Test 89 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.627767 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.444459 | `sql_db_create` | ❌ | +| 3 | 0.404249 | `postgres_database_query` | ❌ | +| 4 | 0.400435 | `postgres_database_list` | ❌ | +| 5 | 0.400352 | `sql_db_rename` | ❌ | + +--- + +## Test 94 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Connect CosmosDB database using connection string to app service in resource group @@ 
-3352,6 +3854,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.663086 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.446465 | `cosmos_database_list` | ❌ | | 3 | 0.441966 | `cosmos_database_container_item_query` | ❌ | @@ -3380,6 +3883,17 @@ ## Test 90 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.662987 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.446741 | `cosmos_database_list` | ❌ | +| 3 | 0.442115 | `cosmos_database_container_item_query` | ❌ | +| 4 | 0.427312 | `cosmos_database_container_list` | ❌ | +| 5 | 0.420799 | `sql_db_rename` | ❌ | + +--- + +## Test 95 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Add database connection for database on server to app service in resource group @@ -3389,6 +3903,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.733852 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.454554 | `sql_db_create` | ❌ | | 3 | 0.415271 | `sql_db_rename` | ❌ | @@ -3400,23 +3915,22 @@ ## Test 86 ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.733775 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.454433 | `sql_db_create` | ❌ | +| 2 | 0.454554 | `sql_db_create` | ❌ | | 3 | 0.415274 | `sql_db_rename` | ❌ | | 4 | 0.414045 | `sql_server_create` | ❌ | -| 5 | 0.410100 | `sql_db_list` | ❌ | -======= -| 1 | 0.702259 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.454592 | `sql_db_create` | ❌ | -| 3 | 0.415290 | `sql_db_rename` | ❌ | -| 4 | 0.414069 | `sql_server_create` | ❌ | -| 5 | 0.410258 | `sql_db_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 5 | 0.410260 | `sql_db_list` | ❌ | --- +<<<<<<< HEAD ## Test 91 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 96 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** 
`appservice_database_add` **Prompt:** Add database connection string for to app service using connection string in resource group @@ -3426,6 +3940,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.746766 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.441682 | `sql_db_rename` | ❌ | | 3 | 0.434020 | `sql_db_create` | ❌ | @@ -3454,6 +3969,17 @@ ## Test 92 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.746361 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.441645 | `sql_db_rename` | ❌ | +| 3 | 0.433902 | `sql_db_create` | ❌ | +| 4 | 0.391238 | `sql_db_list` | ❌ | +| 5 | 0.390155 | `sql_db_show` | ❌ | + +--- + +## Test 97 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Connect database to my app service using connection string in resource group @@ -3463,6 +3989,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.680503 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.429273 | `sql_db_rename` | ❌ | | 3 | 0.406267 | `sql_db_create` | ❌ | @@ -3481,16 +4008,22 @@ | 5 | 0.391416 | `sql_db_list` | ❌ | ======= | 1 | 0.643888 | `appservice_database_add` | ✅ **EXPECTED** | +======= +| 1 | 0.680400 | `appservice_database_add` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.429317 | `sql_db_rename` | ❌ | | 3 | 0.406322 | `sql_db_create` | ❌ | | 4 | 0.396523 | `sql_db_show` | ❌ | | 5 | 0.391430 | `sql_db_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- +<<<<<<< HEAD ## Test 93 >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 98 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Set up database for app service with connection string under resource group @@ -3500,6 +4033,7 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.640738 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.456785 | `sql_db_create` | ❌ | | 3 | 0.402668 | `sql_db_rename` | ❌ | @@ -3518,15 +4052,17 @@ | 5 | 0.394177 | `sql_db_list` | ❌ | ======= | 1 | 0.598494 | `appservice_database_add` | ✅ **EXPECTED** | +======= +| 1 | 0.640548 | `appservice_database_add` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.456884 | `sql_db_create` | ❌ | | 3 | 0.402743 | `sql_db_rename` | ❌ | | 4 | 0.402138 | `sql_db_show` | ❌ | | 5 | 0.394211 | `sql_db_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) --- -## Test 94 +## Test 99 **Expected Tool:** `appservice_database_add` **Prompt:** Configure database for app service with the connection string in resource group @@ -3535,14 +4071,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.650888 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.449175 | `sql_db_rename` | ❌ | -| 3 | 0.448382 | `sql_db_create` | ❌ | -| 4 | 0.414323 | `sql_db_show` | ❌ | -| 5 | 0.411790 | `sql_db_list` | ❌ | +| 1 | 0.688343 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.449174 | `sql_db_rename` | ❌ | +| 3 | 0.448432 | `sql_db_create` | ❌ | +| 4 | 0.414400 | `sql_db_show` | ❌ | +| 5 | 0.411810 | `sql_db_list` | ❌ | --- +<<<<<<< HEAD ## Test 95 >>>>>>> 58ab8585 (update prompts and tool description evaluator) @@ -3562,6 +4099,9 @@ --- ## Test 90 +======= +## Test 100 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `applicationinsights_recommendation_list` **Prompt:** List code optimization recommendations across my Application Insights components @@ -3572,6 +4112,7 @@ |------|-------|------|--------| | 1 | 0.572473 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.454559 | `azureaibestpractices_get` | ❌ | | 3 | 0.445157 | `get_bestpractices_get` | ❌ | | 4 | 0.390478 | 
`azureterraformbestpractices_get` | ❌ | @@ -3582,23 +4123,25 @@ ## Test 91 ======= <<<<<<< HEAD -| 2 | 0.445157 | `get_bestpractices_get` | ❌ | -| 3 | 0.390549 | `azureterraformbestpractices_get` | ❌ | ======= -| 2 | 0.449459 | `get_bestpractices_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 2 | 0.445157 | `get_bestpractices_get` | ❌ | | 3 | 0.390478 | `azureterraformbestpractices_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.383948 | `applens_resource_diagnose` | ❌ | | 5 | 0.375286 | `deploy_iac_rules_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 86 ======= ## Test 96 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 101 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `applicationinsights_recommendation_list` **Prompt:** Show me code optimization recommendations for all Application Insights resources in my subscription @@ -3607,8 +4150,8 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.696531 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.506351 | `azureaibestpractices_get` | ❌ | | 3 | 0.468384 | `get_bestpractices_get` | ❌ | | 4 | 0.452231 | `applens_resource_diagnose` | ❌ | @@ -3632,6 +4175,16 @@ ## Test 97 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.468384 | `get_bestpractices_get` | ❌ | +| 3 | 0.452231 | `applens_resource_diagnose` | ❌ | +| 4 | 0.435241 | `azureterraformbestpractices_get` | ❌ | +| 5 | 0.424622 | `search_service_list` | ❌ | + +--- + +## Test 102 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `applicationinsights_recommendation_list` **Prompt:** List profiler recommendations for Application Insights in resource group @@ -3643,6 +4196,7 @@ | 1 | 0.626722 | 
`applicationinsights_recommendation_list` | ✅ **EXPECTED** | | 2 | 0.488002 | `loadtesting_testresource_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.479392 | `mysql_server_list` | ❌ | ======= <<<<<<< HEAD @@ -3651,11 +4205,15 @@ | 3 | 0.479392 | `mysql_server_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.479392 | `mysql_server_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.477396 | `applens_resource_diagnose` | ❌ | | 5 | 0.468847 | `resourcehealth_availability-status_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 93 ======= @@ -3665,6 +4223,9 @@ ## Test 98 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 103 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `applicationinsights_recommendation_list` **Prompt:** Show me performance improvement recommendations from Application Insights @@ -3686,23 +4247,22 @@ ======= | 1 | 0.509502 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | | 2 | 0.419670 | `applens_resource_diagnose` | ❌ | -<<<<<<< HEAD | 3 | 0.383767 | `get_bestpractices_get` | ❌ | -| 4 | 0.367260 | `deploy_architecture_diagram_generate` | ❌ | -======= -| 3 | 0.385936 | `get_bestpractices_get` | ❌ | | 4 | 0.367278 | `deploy_architecture_diagram_generate` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.343931 | `cloudarchitect_design` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 89 ======= ## Test 99 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 104 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_cli_generate` **Prompt:** Create a Storage account with name using Azure CLI @@ -3714,6 +4274,7 @@ | 1 | 0.593241 | `storage_account_create` | ❌ | | 2 | 
0.564940 | `storage_blob_container_create` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.493684 | `storage_account_get` | ❌ | | 4 | 0.473547 | `storage_blob_container_get` | ❌ | | 5 | 0.456428 | `managedlustre_fs_create` | ❌ | @@ -3728,16 +4289,24 @@ | 3 | 0.493641 | `storage_account_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.474399 | `storage_blob_container_get` | ❌ | +======= +| 3 | 0.493684 | `storage_account_get` | ❌ | +| 4 | 0.474987 | `storage_blob_container_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.454194 | `managedlustre_fs_create` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 90 ======= ## Test 100 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 105 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_cli_generate` **Prompt:** List all virtual machines in my subscription using Azure CLI @@ -3751,6 +4320,7 @@ ======= | 1 | 0.593467 | `search_service_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.575274 | `kusto_cluster_list` | ❌ | | 3 | 0.549918 | `virtualdesktop_hostpool_list` | ❌ | @@ -3765,15 +4335,22 @@ ## Test 91 ======= | 2 | 0.575351 | `kusto_cluster_list` | ❌ | +======= +| 2 | 0.575274 | `kusto_cluster_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.549966 | `virtualdesktop_hostpool_list` | ❌ | | 4 | 0.544412 | `monitor_workspace_list` | ❌ | | 5 | 0.536252 | `subscription_list` | ❌ | --- +<<<<<<< HEAD ## Test 101 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 106 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_cli_generate` **Prompt:** Show me the details of the storage account with Azure CLI commands @@ -3783,6 +4360,7 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.710307 | `storage_account_get` | ❌ | | 2 | 0.601571 | `storage_blob_container_get` | ❌ | ======= @@ -3793,12 +4371,17 @@ >>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 2 | 0.602173 | `storage_blob_container_get` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.710307 | `storage_account_get` | ❌ | +| 2 | 0.602446 | `storage_blob_container_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.543268 | `storage_blob_get` | ❌ | | 4 | 0.519788 | `storage_account_create` | ❌ | -| 5 | 0.493100 | `cosmos_account_list` | ❌ | +| 5 | 0.493145 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 97 ======= @@ -3808,6 +4391,9 @@ ## Test 102 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 107 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_cli_install` **Prompt:** @@ -3817,6 +4403,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.479652 | `extension_cli_install` | ✅ **EXPECTED** | | 2 | 0.473369 | `extension_cli_generate` | ❌ | | 3 | 0.389405 | `azureterraformbestpractices_get` | ❌ | @@ -3828,14 +4415,17 @@ ## Test 98 ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.479590 | `extension_cli_install` | ✅ **EXPECTED** | -| 2 | 0.473266 | `extension_cli_generate` | ❌ | -| 3 | 0.389369 | `azureterraformbestpractices_get` | ❌ | +| 2 | 0.473250 | `extension_cli_generate` | ❌ | +| 3 | 0.389354 | `azureterraformbestpractices_get` | ❌ | | 4 | 0.382389 | `deploy_plan_get` | ❌ | | 5 | 0.366012 | `get_bestpractices_get` | ❌ | --- +<<<<<<< HEAD ## Test 93 ======= | 1 | 0.497777 | `extension_cli_generate` | ❌ | @@ -3849,6 +4439,9 @@ ## Test 103 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update 
prompts and tool description evaluator) +======= +## Test 108 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_cli_install` **Prompt:** How to install azd @@ -3865,6 +4458,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 99 ======= @@ -3874,6 +4468,9 @@ ## Test 104 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 109 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_cli_install` **Prompt:** What is Azure Functions Core tools and how to install it @@ -3894,19 +4491,23 @@ ## Test 100 ======= | 1 | 0.622705 | `extension_cli_install` | ✅ **EXPECTED** | -| 2 | 0.443050 | `get_bestpractices_get` | ❌ | +| 2 | 0.439474 | `get_bestpractices_get` | ❌ | | 3 | 0.432913 | `deploy_pipeline_guidance_get` | ❌ | -| 4 | 0.430483 | `extension_cli_generate` | ❌ | +| 4 | 0.430723 | `extension_cli_generate` | ❌ | | 5 | 0.418161 | `deploy_plan_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 95 ======= ## Test 105 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 110 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_list` **Prompt:** List all Azure Container Registries in my subscription @@ -3917,9 +4518,9 @@ |------|-------|------|--------| | 1 | 0.743568 | `acr_registry_list` | ✅ **EXPECTED** | | 2 | 0.711580 | `acr_registry_repository_list` | ❌ | -<<<<<<< HEAD | 3 | 0.585675 | `kusto_cluster_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.540241 | `search_service_list` | ❌ | | 5 | 0.514293 | `cosmos_account_list` | ❌ | @@ -3930,17 +4531,23 @@ ======= | 3 | 0.585618 | `kusto_cluster_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.541506 | `search_service_list` | ❌ | -| 5 | 0.514326 | `cosmos_account_list` | ❌ | +| 5 | 0.514293 | 
`cosmos_account_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 96 ======= ## Test 106 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 111 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_list` **Prompt:** Show me my Azure Container Registries @@ -3951,12 +4558,17 @@ |------|-------|------|--------| | 1 | 0.586014 | `acr_registry_list` | ✅ **EXPECTED** | | 2 | 0.563636 | `acr_registry_repository_list` | ❌ | +<<<<<<< HEAD | 3 | 0.460834 | `storage_blob_container_get` | ❌ | +======= +| 3 | 0.460570 | `storage_blob_container_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.415552 | `cosmos_database_container_list` | ❌ | -| 5 | 0.402247 | `redis_list` | ❌ | +| 5 | 0.402318 | `redis_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 102 ======= @@ -3966,6 +4578,9 @@ ## Test 107 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 112 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_list` **Prompt:** Show me the container registries in my subscription @@ -3976,9 +4591,9 @@ |------|-------|------|--------| | 1 | 0.637130 | `acr_registry_list` | ✅ **EXPECTED** | | 2 | 0.563476 | `acr_registry_repository_list` | ❌ | -<<<<<<< HEAD | 3 | 0.516769 | `kusto_cluster_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.515365 | `storage_blob_container_get` | ❌ | ======= ======= @@ -3999,6 +4614,14 @@ ## Test 108 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.515153 | `storage_blob_container_get` | ❌ | +| 5 | 0.480398 | `redis_list` | ❌ | + +--- + +## Test 113 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_list` **Prompt:** List container registries in resource group @@ -4010,6 +4633,7 @@ | 1 | 0.654318 
| `acr_registry_repository_list` | ❌ | | 2 | 0.633938 | `acr_registry_list` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.476015 | `mysql_server_list` | ❌ | ======= <<<<<<< HEAD @@ -4018,11 +4642,15 @@ | 3 | 0.476015 | `mysql_server_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.476015 | `mysql_server_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.454929 | `group_list` | ❌ | | 5 | 0.454003 | `datadog_monitoredresources_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 104 ======= @@ -4032,6 +4660,9 @@ ## Test 109 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 114 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_list` **Prompt:** Show me the container registries in resource group @@ -4043,6 +4674,7 @@ | 1 | 0.639391 | `acr_registry_list` | ✅ **EXPECTED** | | 2 | 0.637972 | `acr_registry_repository_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.468028 | `mysql_server_list` | ❌ | ======= <<<<<<< HEAD @@ -4051,11 +4683,15 @@ | 3 | 0.468028 | `mysql_server_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.468028 | `mysql_server_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.449649 | `datadog_monitoredresources_list` | ❌ | | 5 | 0.445741 | `group_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 105 ======= @@ -4065,6 +4701,9 @@ ## Test 110 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 115 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_repository_list` **Prompt:** List all container registry repositories in my subscription @@ -4075,9 +4714,9 @@ 
|------|-------|------|--------| | 1 | 0.626482 | `acr_registry_repository_list` | ✅ **EXPECTED** | | 2 | 0.617504 | `acr_registry_list` | ❌ | -<<<<<<< HEAD | 3 | 0.544172 | `kusto_cluster_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.508863 | `storage_blob_container_get` | ❌ | | 5 | 0.495567 | `postgres_server_list` | ❌ | @@ -4099,6 +4738,14 @@ ## Test 111 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.508318 | `storage_blob_container_get` | ❌ | +| 5 | 0.495567 | `postgres_server_list` | ❌ | + +--- + +## Test 116 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_repository_list` **Prompt:** Show me my container registry repositories @@ -4109,12 +4756,17 @@ |------|-------|------|--------| | 1 | 0.546334 | `acr_registry_repository_list` | ✅ **EXPECTED** | | 2 | 0.469295 | `acr_registry_list` | ❌ | +<<<<<<< HEAD | 3 | 0.451973 | `storage_blob_container_get` | ❌ | +======= +| 3 | 0.450946 | `storage_blob_container_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.407973 | `cosmos_database_container_list` | ❌ | | 5 | 0.373464 | `storage_blob_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 107 ======= @@ -4124,6 +4776,9 @@ ## Test 112 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 117 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_repository_list` **Prompt:** List repositories in the container registry @@ -4134,6 +4789,7 @@ |------|-------|------|--------| | 1 | 0.674296 | `acr_registry_repository_list` | ✅ **EXPECTED** | | 2 | 0.541779 | `acr_registry_list` | ❌ | +<<<<<<< HEAD | 3 | 0.437756 | `storage_blob_container_get` | ❌ | | 4 | 0.433927 | `cosmos_database_container_list` | ❌ | <<<<<<< HEAD @@ -4157,6 +4813,15 @@ ## Test 113 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 
58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.437348 | `storage_blob_container_get` | ❌ | +| 4 | 0.433927 | `cosmos_database_container_list` | ❌ | +| 5 | 0.383183 | `kusto_database_list` | ❌ | + +--- + +## Test 118 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_repository_list` **Prompt:** Show me the repositories in the container registry @@ -4167,12 +4832,17 @@ |------|-------|------|--------| | 1 | 0.600780 | `acr_registry_repository_list` | ✅ **EXPECTED** | | 2 | 0.501842 | `acr_registry_list` | ❌ | +<<<<<<< HEAD | 3 | 0.431148 | `storage_blob_container_get` | ❌ | +======= +| 3 | 0.430783 | `storage_blob_container_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.418623 | `cosmos_database_container_list` | ❌ | -| 5 | 0.378151 | `redis_list` | ❌ | +| 5 | 0.378216 | `redis_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 109 ======= @@ -4182,6 +4852,9 @@ ## Test 114 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 119 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send an email to with subject @@ -4196,6 +4869,7 @@ ======= | 1 | 0.498292 | `communication_email_send` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.226847 | `communication_sms_send` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.188975 | `eventgrid_events_publish` | ❌ | @@ -4209,16 +4883,22 @@ ======= ## Test 105 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.229081 | `communication_sms_send` | ❌ | -| 3 | 0.189000 | `eventgrid_events_publish` | ❌ | -| 4 | 0.155364 | `speech_tts_synthesize` | ❌ | +| 3 | 0.188975 | `eventgrid_events_publish` | ❌ | +| 4 | 0.161257 | `foundry_agents_create` | ❌ | | 5 | 0.145951 | `servicebus_topic_details` | ❌ | --- +<<<<<<< HEAD ## Test 115 >>>>>>> 84ad4f44 (update prompts and tool description 
evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 120 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send an email from my communication service to @@ -4239,26 +4919,20 @@ ## Test 111 ======= | 1 | 0.498406 | `communication_email_send` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.313058 | `communication_sms_send` | ❌ | -| 3 | 0.235127 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.211154 | `search_knowledge_base_retrieve` | ❌ | -| 5 | 0.194094 | `speech_stt_recognize` | ❌ | - ---- - -## Test 106 -======= | 2 | 0.314462 | `communication_sms_send` | ❌ | -| 3 | 0.228890 | `speech_tts_synthesize` | ❌ | -| 4 | 0.218524 | `foundry_openai_chat-completions-create` | ❌ | +| 3 | 0.235127 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.215392 | `speech_tts_synthesize` | ❌ | | 5 | 0.211154 | `search_knowledge_base_retrieve` | ❌ | --- +<<<<<<< HEAD ## Test 116 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 121 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send HTML-formatted email to with subject @@ -4274,19 +4948,15 @@ | 4 | 0.152056 | `servicebus_topic_details` | ❌ | ======= | 1 | 0.520967 | `communication_email_send` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.205130 | `communication_sms_send` | ❌ | -| 3 | 0.152418 | `eventgrid_events_publish` | ❌ | -======= | 2 | 0.207658 | `communication_sms_send` | ❌ | -| 3 | 0.152427 | `eventgrid_events_publish` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.152418 | `eventgrid_events_publish` | ❌ | | 4 | 0.152013 | `servicebus_topic_details` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.143660 | `foundry_agents_evaluate` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 112 ======= @@ -4296,6 +4966,9 @@ ## Test 
117 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 122 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send email with CC to and @@ -4310,8 +4983,7 @@ | 3 | 0.106042 | `foundry_agents_query-and-evaluate` | ❌ | ======= | 1 | 0.533447 | `communication_email_send` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.217412 | `communication_sms_send` | ❌ | +| 2 | 0.219584 | `communication_sms_send` | ❌ | | 3 | 0.106026 | `foundry_agents_query-and-evaluate` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.103723 | `foundry_openai_chat-completions-create` | ❌ | @@ -4319,6 +4991,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 113 ======= @@ -4334,6 +5007,9 @@ ## Test 118 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 123 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send email to multiple recipients: , @@ -4354,25 +5030,23 @@ ## Test 114 ======= | 1 | 0.540792 | `communication_email_send` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.241620 | `communication_sms_send` | ❌ | +| 2 | 0.244521 | `communication_sms_send` | ❌ | | 3 | 0.134975 | `foundry_openai_chat-completions-create` | ❌ | | 4 | 0.114324 | `foundry_agents_query-and-evaluate` | ❌ | -======= -| 2 | 0.244521 | `communication_sms_send` | ❌ | -| 3 | 0.114380 | `foundry_agents_query-and-evaluate` | ❌ | -| 4 | 0.098798 | `foundry_openai_chat-completions-create` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.087063 | `postgres_server_param_set` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 109 ======= ## Test 119 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 124 +>>>>>>> 
e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send email with reply-to address set to @@ -4387,11 +5061,7 @@ | 3 | 0.164422 | `mysql_server_param_set` | ❌ | ======= | 1 | 0.512623 | `communication_email_send` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.198552 | `communication_sms_send` | ❌ | -======= | 2 | 0.200177 | `communication_sms_send` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 3 | 0.164115 | `mysql_server_param_set` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.158759 | `postgres_server_param_set` | ❌ | @@ -4399,6 +5069,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 115 ======= @@ -4408,6 +5079,9 @@ ## Test 120 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 125 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send email with custom sender name @@ -4422,14 +5096,21 @@ ======= | 1 | 0.473175 | `communication_email_send` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.253449 | `communication_sms_send` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.164811 | `foundry_openai_chat-completions-create` | ❌ | | 4 | 0.160285 | `foundry_openai_embeddings-create` | ❌ | +======= +| 2 | 0.255169 | `communication_sms_send` | ❌ | +| 3 | 0.164811 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.160393 | `foundry_openai_embeddings-create` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.156869 | `cosmos_database_container_item_query` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 116 ======= @@ -4445,6 +5126,9 @@ ## Test 121 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 126 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** 
Send an email with BCC recipients @@ -4460,20 +5144,19 @@ | 4 | 0.108748 | `confidentialledger_entries_get` | ❌ | ======= | 1 | 0.528789 | `communication_email_send` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.239846 | `communication_sms_send` | ❌ | -| 3 | 0.137565 | `confidentialledger_entries_append` | ❌ | -| 4 | 0.108725 | `confidentialledger_entries_get` | ❌ | -======= | 2 | 0.241114 | `communication_sms_send` | ❌ | | 3 | 0.137538 | `confidentialledger_entries_append` | ❌ | | 4 | 0.108748 | `confidentialledger_entries_get` | ❌ | +<<<<<<< HEAD >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.105033 | `storage_blob_upload` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 117 ======= @@ -4483,6 +5166,9 @@ ## Test 122 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 127 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send an SMS message to saying "Hello" @@ -4492,6 +5178,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.533822 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.251480 | `communication_email_send` | ❌ | ======= @@ -4510,17 +5197,23 @@ ======= ## Test 113 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.533868 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.251429 | `communication_email_send` | ❌ | -| 3 | 0.178085 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.170676 | `speech_tts_synthesize` | ❌ | -| 5 | 0.148584 | `foundry_agents_connect` | ❌ | +| 3 | 0.218656 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.175534 | `foundry_agents_create` | ❌ | +| 5 | 0.166041 | `speech_tts_synthesize` | ❌ | --- +<<<<<<< HEAD ## Test 123 >>>>>>> 84ad4f44 (update prompts and tool 
description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 128 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send SMS to from with message "Test message" @@ -4530,6 +5223,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.546006 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.294912 | `communication_email_send` | ❌ | | 3 | 0.204585 | `loadtesting_testrun_create` | ❌ | @@ -4551,17 +5245,23 @@ ## Test 114 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.546019 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.294859 | `communication_email_send` | ❌ | | 3 | 0.204588 | `loadtesting_testrun_create` | ❌ | -| 4 | 0.155927 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.139313 | `speech_tts_synthesize` | ❌ | +| 4 | 0.200655 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.141113 | `foundry_agents_create` | ❌ | --- +<<<<<<< HEAD ## Test 124 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 129 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send SMS to multiple recipients: , @@ -4571,6 +5271,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.545744 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.422028 | `communication_email_send` | ❌ | | 3 | 0.186088 | `foundry_openai_chat-completions-create` | ❌ | @@ -4583,25 +5284,26 @@ ======= <<<<<<< HEAD | 1 | 0.543753 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.421988 | `communication_email_send` | ❌ | -| 3 | 0.186088 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.142030 | `foundry_agents_query-and-evaluate` | ❌ | ======= | 1 | 0.545755 | `communication_sms_send` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor 
tts mcp tool) | 2 | 0.421988 | `communication_email_send` | ❌ | -| 3 | 0.142602 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.141987 | `foundry_agents_query-and-evaluate` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -| 5 | 0.104124 | `search_knowledge_base_retrieve` | ❌ | +| 3 | 0.186088 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.142030 | `foundry_agents_query-and-evaluate` | ❌ | +| 5 | 0.113722 | `foundry_threads_get-messages` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 115 ======= ## Test 125 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 130 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send SMS with delivery reporting enabled @@ -4611,6 +5313,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.554917 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.269203 | `communication_email_send` | ❌ | | 3 | 0.191848 | `extension_azqr` | ❌ | @@ -4620,11 +5323,17 @@ | 2 | 0.269080 | `communication_email_send` | ❌ | | 3 | 0.192340 | `extension_azqr` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.554908 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.269080 | `communication_email_send` | ❌ | +| 3 | 0.191848 | `extension_azqr` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.185916 | `foundry_openai_chat-completions-create` | ❌ | | 5 | 0.170749 | `foundry_agents_query-and-evaluate` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 121 ======= @@ -4641,6 +5350,9 @@ ## Test 126 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 131 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send SMS message with custom tracking tag 
"campaign1" @@ -4650,6 +5362,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.538893 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.269915 | `communication_email_send` | ❌ | ======= @@ -4668,17 +5381,23 @@ ======= ## Test 117 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.538827 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.269794 | `communication_email_send` | ❌ | | 3 | 0.188153 | `loadtesting_testrun_create` | ❌ | -| 4 | 0.159177 | `appconfig_kv_set` | ❌ | -| 5 | 0.158295 | `loadtesting_test_create` | ❌ | +| 4 | 0.185403 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.175135 | `foundry_agents_create` | ❌ | --- +<<<<<<< HEAD ## Test 127 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 132 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send broadcast SMS to and saying "Urgent notification" @@ -4688,6 +5407,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.474775 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.286381 | `communication_email_send` | ❌ | | 3 | 0.164341 | `foundry_agents_query-and-evaluate` | ❌ | @@ -4720,6 +5440,17 @@ ## Test 128 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.474935 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.286337 | `communication_email_send` | ❌ | +| 3 | 0.164209 | `foundry_agents_query-and-evaluate` | ❌ | +| 4 | 0.147352 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.128661 | `cosmos_account_list` | ❌ | + +--- + +## Test 133 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send SMS from my communication service to @@ -4729,6 +5460,7 @@ | Rank | Score | 
Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.564058 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.302377 | `communication_email_send` | ❌ | | 3 | 0.238340 | `foundry_openai_chat-completions-create` | ❌ | @@ -4750,17 +5482,23 @@ ## Test 119 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.564114 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.302363 | `communication_email_send` | ❌ | -| 3 | 0.213669 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.183651 | `search_knowledge_base_retrieve` | ❌ | -| 5 | 0.177315 | `appservice_database_add` | ❌ | +| 3 | 0.238296 | `foundry_openai_chat-completions-create` | ❌ | +| 4 | 0.184264 | `foundry_agents_create` | ❌ | +| 5 | 0.183651 | `search_knowledge_base_retrieve` | ❌ | --- +<<<<<<< HEAD ## Test 129 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 134 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send an SMS with delivery receipt tracking @@ -4770,12 +5508,16 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.598236 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.314267 | `communication_email_send` | ❌ | | 3 | 0.206931 | `foundry_agents_query-and-evaluate` | ❌ | ======= <<<<<<< HEAD | 1 | 0.592519 | `communication_sms_send` | ✅ **EXPECTED** | +======= +| 1 | 0.598211 | `communication_sms_send` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.314134 | `communication_email_send` | ❌ | | 3 | 0.206916 | `foundry_agents_query-and-evaluate` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) @@ -4784,6 +5526,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 125 ======= @@ -4800,6 +5543,9 @@ ## Test 130 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool 
description evaluator) +======= +## Test 135 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** Append an entry to my ledger with data {"key": "value"} @@ -4809,6 +5555,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.511241 | `confidentialledger_entries_append` | ✅ **EXPECTED** | | 2 | 0.295319 | `confidentialledger_entries_get` | ❌ | | 3 | 0.291757 | `appconfig_kv_set` | ❌ | @@ -4823,21 +5570,26 @@ | 1 | 0.510689 | `confidentialledger_entries_append` | ✅ **EXPECTED** | | 2 | 0.293736 | `confidentialledger_entries_get` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.510651 | `confidentialledger_entries_append` | ✅ **EXPECTED** | | 2 | 0.294885 | `confidentialledger_entries_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 3 | 0.292014 | `appconfig_kv_set` | ❌ | | 4 | 0.258967 | `appconfig_kv_lock_set` | ❌ | -| 5 | 0.249704 | `keyvault_certificate_import` | ❌ | +| 5 | 0.249908 | `keyvault_certificate_import` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 121 ======= ## Test 131 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 136 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** Write a tamper-proof entry to ledger containing {"transaction": "data"} @@ -4847,6 +5599,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.602321 | `confidentialledger_entries_append` | ✅ **EXPECTED** | | 2 | 0.357401 | `confidentialledger_entries_get` | ❌ | | 3 | 0.211998 | `appconfig_kv_lock_set` | ❌ | @@ -4859,18 +5612,26 @@ ======= | 1 | 0.602257 | `confidentialledger_entries_append` | ✅ **EXPECTED** | | 2 | 0.356510 | `confidentialledger_entries_get` | ❌ | +======= +| 1 | 0.602247 | 
`confidentialledger_entries_append` | ✅ **EXPECTED** | +| 2 | 0.357646 | `confidentialledger_entries_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.211990 | `appconfig_kv_lock_set` | ❌ | | 4 | 0.195471 | `keyvault_secret_create` | ❌ | -| 5 | 0.183820 | `keyvault_certificate_import` | ❌ | +| 5 | 0.184077 | `keyvault_certificate_import` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 122 ======= ## Test 132 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 137 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** Append {"hello": "from mcp"} to my confidential ledger in collection @@ -4880,6 +5641,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.546786 | `confidentialledger_entries_append` | ✅ **EXPECTED** | | 2 | 0.452117 | `confidentialledger_entries_get` | ❌ | | 3 | 0.225013 | `appconfig_kv_lock_set` | ❌ | @@ -4912,6 +5674,17 @@ ## Test 133 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.546660 | `confidentialledger_entries_append` | ✅ **EXPECTED** | +| 2 | 0.451994 | `confidentialledger_entries_get` | ❌ | +| 3 | 0.225141 | `appconfig_kv_lock_set` | ❌ | +| 4 | 0.215932 | `appconfig_kv_set` | ❌ | +| 5 | 0.203262 | `keyvault_certificate_import` | ❌ | + +--- + +## Test 138 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** Create an immutable ledger entry in with content {"audit": "log"} @@ -4921,6 +5694,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.496023 | `confidentialledger_entries_append` | ✅ **EXPECTED** | | 2 | 0.340187 | `confidentialledger_entries_get` | ❌ | | 3 | 0.218473 | `monitor_activitylog_list` | ❌ | @@ -4949,6 +5723,17 
@@ ## Test 134 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.495719 | `confidentialledger_entries_append` | ✅ **EXPECTED** | +| 2 | 0.340160 | `confidentialledger_entries_get` | ❌ | +| 3 | 0.218437 | `monitor_activitylog_list` | ❌ | +| 4 | 0.215039 | `storage_blob_container_create` | ❌ | +| 5 | 0.204909 | `monitor_resource_log_query` | ❌ | + +--- + +## Test 139 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** Write an entry to confidential ledger @@ -4965,6 +5750,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 130 ======= @@ -4974,6 +5760,9 @@ ## Test 135 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 140 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `confidentialledger_entries_get` **Prompt:** Get entry from Confidential Ledger for transaction on ledger @@ -4983,6 +5772,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.707252 | `confidentialledger_entries_get` | ✅ **EXPECTED** | | 2 | 0.551953 | `confidentialledger_entries_append` | ❌ | | 3 | 0.245549 | `keyvault_secret_get` | ❌ | @@ -5004,17 +5794,23 @@ ## Test 126 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.707252 | `confidentialledger_entries_get` | ✅ **EXPECTED** | | 2 | 0.551953 | `confidentialledger_entries_append` | ❌ | | 3 | 0.245541 | `keyvault_secret_get` | ❌ | | 4 | 0.229943 | `keyvault_key_get` | ❌ | -| 5 | 0.211925 | `loadtesting_testrun_get` | ❌ | +| 5 | 0.211839 | `loadtesting_testrun_get` | ❌ | --- +<<<<<<< HEAD ## Test 136 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 141 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** 
`confidentialledger_entries_get` **Prompt:** Get transaction from ledger @@ -5024,6 +5820,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.509714 | `confidentialledger_entries_get` | ✅ **EXPECTED** | | 2 | 0.416580 | `confidentialledger_entries_append` | ❌ | | 3 | 0.223959 | `loadtesting_testrun_get` | ❌ | @@ -5038,11 +5835,17 @@ | 3 | 0.224029 | `loadtesting_testrun_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.509714 | `confidentialledger_entries_get` | ✅ **EXPECTED** | +| 2 | 0.416580 | `confidentialledger_entries_append` | ❌ | +| 3 | 0.223959 | `loadtesting_testrun_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.218412 | `monitor_resource_log_query` | ❌ | | 5 | 0.217671 | `loadtesting_testrun_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 132 ======= @@ -5052,6 +5855,9 @@ ## Test 137 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 142 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_account_list` **Prompt:** List all cosmosdb accounts in my subscription @@ -5060,15 +5866,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.818340 | `cosmos_account_list` | ✅ **EXPECTED** | +| 1 | 0.818357 | `cosmos_account_list` | ✅ **EXPECTED** | | 2 | 0.668480 | `cosmos_database_list` | ❌ | -| 3 | 0.636009 | `subscription_list` | ❌ | +| 3 | 0.636036 | `subscription_list` | ❌ | | 4 | 0.615268 | `cosmos_database_container_list` | ❌ | -<<<<<<< HEAD | 5 | 0.601467 | `kusto_cluster_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 133 ======= @@ -5081,6 +5887,9 @@ ## Test 138 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 143 +>>>>>>> e2fd2eac (refactor 
tts mcp tool) **Expected Tool:** `cosmos_account_list` **Prompt:** Show me my cosmosdb accounts @@ -5090,6 +5899,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.665422 | `cosmos_account_list` | ✅ **EXPECTED** | | 2 | 0.605325 | `cosmos_database_list` | ❌ | | 3 | 0.571573 | `cosmos_database_container_list` | ❌ | @@ -5101,24 +5911,23 @@ ## Test 134 ======= | 1 | 0.665440 | `cosmos_account_list` | ✅ **EXPECTED** | +======= +| 1 | 0.665447 | `cosmos_account_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.605357 | `cosmos_database_list` | ❌ | | 3 | 0.571613 | `cosmos_database_container_list` | ❌ | -<<<<<<< HEAD -| 4 | 0.549476 | `cosmos_database_container_item_query` | ❌ | -| 5 | 0.504032 | `storage_account_get` | ❌ | - ---- - -## Test 129 -======= | 4 | 0.549447 | `cosmos_database_container_item_query` | ❌ | -| 5 | 0.503850 | `storage_account_get` | ❌ | +| 5 | 0.503830 | `storage_account_get` | ❌ | --- +<<<<<<< HEAD ## Test 139 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 144 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_account_list` **Prompt:** Show me the cosmosdb accounts in my subscription @@ -5127,8 +5936,8 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.752494 | `cosmos_account_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.607165 | `subscription_list` | ❌ | | 3 | 0.605125 | `cosmos_database_list` | ❌ | | 4 | 0.566249 | `cosmos_database_container_list` | ❌ | @@ -5142,6 +5951,8 @@ ## Test 130 ======= | 1 | 0.752501 | `cosmos_account_list` | ✅ **EXPECTED** | +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.607201 | `subscription_list` | ❌ | | 3 | 0.605125 | `cosmos_database_list` | ❌ | | 4 | 0.566249 | `cosmos_database_container_list` | ❌ | @@ -5149,9 +5960,13 @@ --- +<<<<<<< HEAD ## Test 140 >>>>>>> 
84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 145 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_database_container_item_query` **Prompt:** Show me the items that contain the word in the container in the database for the cosmosdb account @@ -5161,6 +5976,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.658701 | `cosmos_database_container_item_query` | ✅ **EXPECTED** | ======= <<<<<<< HEAD @@ -5171,11 +5987,17 @@ >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.605253 | `cosmos_database_container_list` | ❌ | | 3 | 0.488353 | `storage_blob_container_get` | ❌ | +======= +| 1 | 0.658701 | `cosmos_database_container_item_query` | ✅ **EXPECTED** | +| 2 | 0.605253 | `cosmos_database_container_list` | ❌ | +| 3 | 0.487789 | `storage_blob_container_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.477874 | `cosmos_database_list` | ❌ | -| 5 | 0.447777 | `cosmos_account_list` | ❌ | +| 5 | 0.447757 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 136 ======= @@ -5185,6 +6007,9 @@ ## Test 141 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 146 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_database_container_list` **Prompt:** List all the containers in the database for the cosmosdb account @@ -5206,22 +6031,22 @@ ======= | 1 | 0.852832 | `cosmos_database_container_list` | ✅ **EXPECTED** | | 2 | 0.681044 | `cosmos_database_list` | ❌ | -<<<<<<< HEAD -| 3 | 0.680794 | `cosmos_database_container_item_query` | ❌ | -======= | 3 | 0.680762 | `cosmos_database_container_item_query` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -| 4 | 0.632335 | `storage_blob_container_get` | ❌ | -| 5 | 0.630597 | `cosmos_account_list` 
| ❌ | +| 4 | 0.632577 | `storage_blob_container_get` | ❌ | +| 5 | 0.630659 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 132 ======= ## Test 142 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 147 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_database_container_list` **Prompt:** Show me the containers in the database for the cosmosdb account @@ -5231,6 +6056,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.789395 | `cosmos_database_container_list` | ✅ **EXPECTED** | | 2 | 0.648126 | `cosmos_database_container_item_query` | ❌ | | 3 | 0.614220 | `cosmos_database_list` | ❌ | @@ -5252,17 +6078,23 @@ ## Test 133 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.789395 | `cosmos_database_container_list` | ✅ **EXPECTED** | | 2 | 0.648126 | `cosmos_database_container_item_query` | ❌ | | 3 | 0.614220 | `cosmos_database_list` | ❌ | -| 4 | 0.591361 | `storage_blob_container_get` | ❌ | -| 5 | 0.562033 | `cosmos_account_list` | ❌ | +| 4 | 0.591594 | `storage_blob_container_get` | ❌ | +| 5 | 0.562062 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD ## Test 143 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 148 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_database_list` **Prompt:** List all the databases in the cosmosdb account @@ -5272,9 +6104,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.815683 | `cosmos_database_list` | ✅ **EXPECTED** | -| 2 | 0.668468 | `cosmos_account_list` | ❌ | +| 2 | 0.668515 | `cosmos_account_list` | ❌ | | 3 | 0.665298 | `cosmos_database_container_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.606433 | `cosmos_database_container_item_query` | ❌ | | 5 | 0.582804 | 
`kusto_database_list` | ❌ | @@ -5290,14 +6123,20 @@ ## Test 134 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.606433 | `cosmos_database_container_item_query` | ❌ | -| 5 | 0.583097 | `kusto_database_list` | ❌ | +| 5 | 0.583535 | `kusto_database_list` | ❌ | --- +<<<<<<< HEAD ## Test 144 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 149 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_database_list` **Prompt:** Show me the databases in the cosmosdb account @@ -5308,9 +6147,9 @@ |------|-------|------|--------| | 1 | 0.749370 | `cosmos_database_list` | ✅ **EXPECTED** | | 2 | 0.624759 | `cosmos_database_container_list` | ❌ | -<<<<<<< HEAD | 3 | 0.614572 | `cosmos_account_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.579919 | `cosmos_database_container_item_query` | ❌ | ======= | 4 | 0.579913 | `cosmos_database_container_item_query` | ❌ | @@ -5319,10 +6158,14 @@ | 4 | 0.579919 | `cosmos_database_container_item_query` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.579919 | `cosmos_database_container_item_query` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.538479 | `mysql_database_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 140 ======= @@ -5332,6 +6175,9 @@ ## Test 145 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 150 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_cluster_get` **Prompt:** Show me the details of the Data Explorer cluster @@ -5341,10 +6187,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.590264 | `kusto_cluster_get` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.463832 | `kusto_cluster_list` | ❌ | | 3 | 0.428159 | `kusto_query` | ❌ | <<<<<<< HEAD 
+<<<<<<< HEAD | 4 | 0.425909 | `kusto_database_list` | ❌ | ======= | 4 | 0.425688 | `kusto_database_list` | ❌ | @@ -5354,10 +6200,14 @@ | 4 | 0.425469 | `kusto_database_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.425669 | `kusto_database_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.422577 | `kusto_table_schema` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 141 ======= @@ -5367,6 +6217,9 @@ ## Test 146 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 151 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_cluster_list` **Prompt:** List all Data Explorer clusters in my subscription @@ -5375,9 +6228,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.793744 | `kusto_cluster_list` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.630451 | `kusto_database_list` | ❌ | ======= | 2 | 0.630504 | `kusto_database_list` | ❌ | @@ -5386,12 +6239,16 @@ | 2 | 0.630261 | `kusto_database_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.630507 | `kusto_database_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.573395 | `kusto_cluster_get` | ❌ | | 4 | 0.525025 | `aks_cluster_get` | ❌ | | 5 | 0.509397 | `grafana_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 142 ======= @@ -5401,6 +6258,9 @@ ## Test 147 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 152 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_cluster_list` **Prompt:** Show me my Data Explorer clusters @@ -5409,10 +6269,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< 
HEAD | 1 | 0.531307 | `kusto_cluster_list` | ✅ **EXPECTED** | | 2 | 0.465277 | `kusto_cluster_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.432311 | `kusto_database_list` | ❌ | ======= | 3 | 0.432320 | `kusto_database_list` | ❌ | @@ -5422,11 +6282,15 @@ | 3 | 0.432552 | `kusto_database_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.432288 | `kusto_database_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.369596 | `aks_cluster_get` | ❌ | | 5 | 0.363119 | `kusto_table_schema` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 143 ======= @@ -5436,6 +6300,9 @@ ## Test 148 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 153 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_cluster_list` **Prompt:** Show me the Data Explorer clusters in my subscription @@ -5444,10 +6311,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.701484 | `kusto_cluster_list` | ✅ **EXPECTED** | | 2 | 0.571191 | `kusto_cluster_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.548734 | `kusto_database_list` | ❌ | ======= | 3 | 0.548690 | `kusto_database_list` | ❌ | @@ -5457,11 +6324,15 @@ | 3 | 0.548589 | `kusto_database_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.548685 | `kusto_database_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.498909 | `aks_cluster_get` | ❌ | -| 5 | 0.474201 | `redis_list` | ❌ | +| 5 | 0.474253 | `redis_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 144 ======= @@ -5471,6 +6342,9 @@ ## Test 149 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 154 +>>>>>>> e2fd2eac 
(refactor tts mcp tool) **Expected Tool:** `kusto_database_list` **Prompt:** List all databases in the Data Explorer cluster @@ -5480,25 +6354,29 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.676656 | `kusto_database_list` | ✅ **EXPECTED** | | 2 | 0.560592 | `kusto_cluster_list` | ❌ | | 3 | 0.556795 | `kusto_table_list` | ❌ | ======= <<<<<<< HEAD | 1 | 0.677042 | `kusto_database_list` | ✅ **EXPECTED** | -| 2 | 0.560592 | `kusto_cluster_list` | ❌ | -| 3 | 0.556688 | `kusto_table_list` | ❌ | ======= -| 1 | 0.676699 | `kusto_database_list` | ✅ **EXPECTED** | -| 2 | 0.560388 | `kusto_cluster_list` | ❌ | +| 1 | 0.677059 | `kusto_database_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 2 | 0.560592 | `kusto_cluster_list` | ❌ | | 3 | 0.556795 | `kusto_table_list` | ❌ | +<<<<<<< HEAD >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.553218 | `postgres_database_list` | ❌ | | 5 | 0.549673 | `cosmos_database_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 145 ======= @@ -5508,6 +6386,9 @@ ## Test 150 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 155 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_database_list` **Prompt:** Show me the databases in the Data Explorer cluster @@ -5517,25 +6398,29 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.623242 | `kusto_database_list` | ✅ **EXPECTED** | | 2 | 0.509952 | `kusto_cluster_list` | ❌ | | 3 | 0.507073 | `kusto_table_list` | ❌ | ======= <<<<<<< HEAD | 1 | 0.623528 | `kusto_database_list` | ✅ **EXPECTED** | -| 2 | 0.509953 | `kusto_cluster_list` | ❌ | -| 3 | 0.506997 | `kusto_table_list` | ❌ | ======= -| 1 | 0.623401 | 
`kusto_database_list` | ✅ **EXPECTED** | -| 2 | 0.509763 | `kusto_cluster_list` | ❌ | +| 1 | 0.623523 | `kusto_database_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 2 | 0.509953 | `kusto_cluster_list` | ❌ | | 3 | 0.507073 | `kusto_table_list` | ❌ | +<<<<<<< HEAD >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.497144 | `cosmos_database_list` | ❌ | | 5 | 0.491400 | `mysql_database_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 146 ======= @@ -5545,6 +6430,9 @@ ## Test 151 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 156 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_query` **Prompt:** Show me all items that contain the word in the Data Explorer table in cluster @@ -5553,6 +6441,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.423660 | `kusto_query` | ✅ **EXPECTED** | <<<<<<< HEAD | 2 | 0.409485 | `postgres_database_query` | ❌ | @@ -5581,6 +6470,17 @@ ## Test 152 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.423694 | `kusto_query` | ✅ **EXPECTED** | +| 2 | 0.409649 | `postgres_database_query` | ❌ | +| 3 | 0.408162 | `kusto_table_schema` | ❌ | +| 4 | 0.407690 | `kusto_sample` | ❌ | +| 5 | 0.403967 | `kusto_cluster_list` | ❌ | + +--- + +## Test 157 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_sample` **Prompt:** Show me a data sample from the Data Explorer table in cluster @@ -5592,6 +6492,7 @@ | 1 | 0.595554 | `kusto_sample` | ✅ **EXPECTED** | | 2 | 0.510233 | `kusto_table_schema` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.424212 | `kusto_table_list` | ❌ | ======= <<<<<<< HEAD @@ -5599,13 +6500,15 @@ >>>>>>> 58ab8585 
(update prompts and tool description evaluator) | 4 | 0.400924 | `kusto_cluster_list` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.424212 | `kusto_table_list` | ❌ | -| 4 | 0.400719 | `kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.400924 | `kusto_cluster_list` | ❌ | | 5 | 0.399525 | `kusto_cluster_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 148 ======= @@ -5615,6 +6518,9 @@ ## Test 153 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 158 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_table_list` **Prompt:** List all tables in the Data Explorer database in cluster @@ -5624,6 +6530,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.679642 | `kusto_table_list` | ✅ **EXPECTED** | ======= <<<<<<< HEAD @@ -5652,6 +6559,17 @@ ## Test 154 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.679642 | `kusto_table_list` | ✅ **EXPECTED** | +| 2 | 0.585237 | `postgres_table_list` | ❌ | +| 3 | 0.581207 | `kusto_database_list` | ❌ | +| 4 | 0.556724 | `mysql_table_list` | ❌ | +| 5 | 0.550007 | `monitor_table_list` | ❌ | + +--- + +## Test 159 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_table_list` **Prompt:** Show me the tables in the Data Explorer database in cluster @@ -5661,25 +6579,29 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.619252 | `kusto_table_list` | ✅ **EXPECTED** | | 2 | 0.554332 | `kusto_table_schema` | ❌ | | 3 | 0.527431 | `kusto_database_list` | ❌ | | 4 | 0.524691 | `mysql_table_list` | ❌ | ======= | 1 | 0.619269 | `kusto_table_list` | ✅ **EXPECTED** | -| 2 | 0.554333 | `kusto_table_schema` | ❌ | -<<<<<<< HEAD -| 3 | 0.527616 | 
`kusto_database_list` | ❌ | -| 4 | 0.524607 | `mysql_table_list` | ❌ | ======= -| 3 | 0.527570 | `kusto_database_list` | ❌ | +| 1 | 0.619252 | `kusto_table_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 2 | 0.554333 | `kusto_table_schema` | ❌ | +| 3 | 0.527626 | `kusto_database_list` | ❌ | | 4 | 0.524691 | `mysql_table_list` | ❌ | +<<<<<<< HEAD >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.523432 | `postgres_table_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 150 ======= @@ -5689,6 +6611,9 @@ ## Test 155 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 160 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_table_schema` **Prompt:** Show me the schema for table in the Data Explorer database in cluster @@ -5698,6 +6623,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.666980 | `kusto_table_schema` | ✅ **EXPECTED** | | 2 | 0.564204 | `postgres_table_schema_get` | ❌ | | 3 | 0.528301 | `mysql_table_schema_get` | ❌ | @@ -5730,6 +6656,17 @@ ## Test 156 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.666828 | `kusto_table_schema` | ✅ **EXPECTED** | +| 2 | 0.564124 | `postgres_table_schema_get` | ❌ | +| 3 | 0.527717 | `mysql_table_schema_get` | ❌ | +| 4 | 0.490739 | `kusto_sample` | ❌ | +| 5 | 0.489476 | `kusto_table_list` | ❌ | + +--- + +## Test 161 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_database_list` **Prompt:** List all MySQL databases in server @@ -5739,6 +6676,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.633991 | `postgres_database_list` | ❌ | | 
2 | 0.623359 | `mysql_database_list` | ✅ **EXPECTED** | | 3 | 0.534434 | `mysql_table_list` | ❌ | @@ -5760,6 +6698,8 @@ ## Test 147 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.634056 | `postgres_database_list` | ❌ | | 2 | 0.623421 | `mysql_database_list` | ✅ **EXPECTED** | | 3 | 0.534457 | `mysql_table_list` | ❌ | @@ -5768,9 +6708,13 @@ --- +<<<<<<< HEAD ## Test 157 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 162 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_database_list` **Prompt:** Show me the MySQL databases in server @@ -5782,6 +6726,7 @@ | 1 | 0.588121 | `mysql_database_list` | ✅ **EXPECTED** | | 2 | 0.574089 | `postgres_database_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.483855 | `mysql_table_list` | ❌ | | 4 | 0.463244 | `mysql_server_list` | ❌ | | 5 | 0.444547 | `sql_db_list` | ❌ | @@ -5799,15 +6744,21 @@ ## Test 148 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.483855 | `mysql_table_list` | ❌ | | 4 | 0.463244 | `mysql_server_list` | ❌ | | 5 | 0.444547 | `sql_db_list` | ❌ | --- +<<<<<<< HEAD ## Test 158 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 163 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_database_query` **Prompt:** Show me all items that contain the word in the MySQL database in server @@ -5817,6 +6768,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.476423 | `mysql_table_list` | ❌ | | 2 | 0.455770 | `mysql_database_list` | ❌ | | 3 | 0.432703 | `mysql_database_query` | ✅ **EXPECTED** | @@ -5836,18 +6788,26 @@ | 1 | 0.476423 | `mysql_table_list` | ❌ | | 2 | 0.455770 | `mysql_database_list` | ❌ | | 3 | 0.433202 | `mysql_database_query` | ✅ **EXPECTED** | +======= +| 1 | 0.476420 | `mysql_table_list` | 
❌ | +| 2 | 0.455766 | `mysql_database_list` | ❌ | +| 3 | 0.433385 | `mysql_database_query` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.419859 | `mysql_server_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -| 5 | 0.409445 | `mysql_table_schema_get` | ❌ | +| 5 | 0.409450 | `mysql_table_schema_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 149 ======= ## Test 159 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 164 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_server_config_get` **Prompt:** Show me the configuration of MySQL server @@ -5864,6 +6824,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 155 ======= @@ -5873,6 +6834,9 @@ ## Test 160 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 165 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_server_list` **Prompt:** List all MySQL servers in my subscription @@ -5882,6 +6846,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.678473 | `postgres_server_list` | ❌ | ======= <<<<<<< HEAD @@ -5899,17 +6864,23 @@ ======= ## Test 151 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.678472 | `postgres_server_list` | ❌ | | 2 | 0.558177 | `mysql_database_list` | ❌ | | 3 | 0.554817 | `mysql_server_list` | ✅ **EXPECTED** | -| 4 | 0.513750 | `kusto_cluster_list` | ❌ | +| 4 | 0.513706 | `kusto_cluster_list` | ❌ | | 5 | 0.501199 | `mysql_table_list` | ❌ | --- +<<<<<<< HEAD ## Test 161 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 166 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_server_list` **Prompt:** Show me my MySQL servers @@ -5920,6 +6891,7 @@ 
|------|-------|------|--------| | 1 | 0.478518 | `mysql_database_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.474586 | `mysql_server_list` | ✅ **EXPECTED** | | 3 | 0.435642 | `postgres_server_list` | ❌ | | 4 | 0.412380 | `mysql_table_list` | ❌ | @@ -5934,10 +6906,16 @@ | 4 | 0.412380 | `mysql_table_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.474586 | `mysql_server_list` | ✅ **EXPECTED** | +| 3 | 0.435642 | `postgres_server_list` | ❌ | +| 4 | 0.412380 | `mysql_table_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.389993 | `postgres_database_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 157 ======= @@ -5947,6 +6925,9 @@ ## Test 162 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 167 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_server_list` **Prompt:** Show me the MySQL servers in my subscription @@ -5956,6 +6937,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.636435 | `postgres_server_list` | ❌ | | 2 | 0.534266 | `mysql_server_list` | ✅ **EXPECTED** | ======= @@ -5966,15 +6948,17 @@ | 3 | 0.530210 | `mysql_database_list` | ❌ | | 4 | 0.475052 | `kusto_cluster_list` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.636435 | `postgres_server_list` | ❌ | | 2 | 0.534266 | `mysql_server_list` | ✅ **EXPECTED** | | 3 | 0.530210 | `mysql_database_list` | ❌ | -| 4 | 0.475138 | `kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -| 5 | 0.470468 | `redis_list` | ❌ | +| 4 | 0.475052 | `kusto_cluster_list` | ❌ | +| 5 | 0.470469 | `redis_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 158 ======= @@ -5984,6 +6968,9 @@ ## Test 163 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 
58ab8585 (update prompts and tool description evaluator) +======= +## Test 168 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_server_param_get` **Prompt:** Show me the value of connection timeout in seconds in my MySQL server @@ -6004,18 +6991,22 @@ ## Test 159 ======= | 2 | 0.438075 | `mysql_server_param_set` | ❌ | -| 3 | 0.333906 | `mysql_database_query` | ❌ | +| 3 | 0.333841 | `mysql_database_query` | ❌ | | 4 | 0.313150 | `mysql_table_schema_get` | ❌ | | 5 | 0.310834 | `postgres_server_param_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 154 ======= ## Test 164 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 169 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_server_param_set` **Prompt:** Set connection timeout to 20 seconds for my MySQL server @@ -6028,6 +7019,7 @@ | 2 | 0.381144 | `mysql_server_param_get` | ❌ | | 3 | 0.303499 | `postgres_server_param_set` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.298661 | `mysql_database_query` | ❌ | | 5 | 0.254180 | `mysql_server_list` | ❌ | @@ -6036,11 +7028,14 @@ ## Test 160 ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.298911 | `mysql_database_query` | ❌ | -| 5 | 0.254206 | `mysql_server_list` | ❌ | +| 5 | 0.254180 | `mysql_server_list` | ❌ | --- +<<<<<<< HEAD ## Test 155 ======= | 4 | 0.299246 | `mysql_database_query` | ❌ | @@ -6051,6 +7046,9 @@ ## Test 165 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 170 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_table_list` **Prompt:** List all tables in the MySQL database in server @@ -6060,6 +7058,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.633542 | `mysql_table_list` | ✅ **EXPECTED** | | 2 | 0.573851 | `postgres_table_list` 
| ❌ | | 3 | 0.550878 | `postgres_database_list` | ❌ | @@ -6072,13 +7071,17 @@ ======= <<<<<<< HEAD | 1 | 0.633547 | `mysql_table_list` | ✅ **EXPECTED** | +======= +| 1 | 0.633448 | `mysql_table_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.573844 | `postgres_table_list` | ❌ | | 3 | 0.550898 | `postgres_database_list` | ❌ | | 4 | 0.546963 | `mysql_database_list` | ❌ | -| 5 | 0.511906 | `kusto_table_list` | ❌ | +| 5 | 0.511847 | `kusto_table_list` | ❌ | --- +<<<<<<< HEAD ## Test 156 ======= | 1 | 0.633542 | `mysql_table_list` | ✅ **EXPECTED** | @@ -6092,6 +7095,9 @@ ## Test 166 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 171 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_table_list` **Prompt:** Show me the tables in the MySQL database in server @@ -6108,6 +7114,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 162 ======= @@ -6117,6 +7124,9 @@ ## Test 167 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 172 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_table_schema_get` **Prompt:** Show me the schema of table
in the MySQL database in server @@ -6133,6 +7143,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 163 ======= @@ -6142,6 +7153,9 @@ ## Test 168 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 173 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_database_list` **Prompt:** List all PostgreSQL databases in server @@ -6150,6 +7164,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.815470 | `postgres_database_list` | ✅ **EXPECTED** | | 2 | 0.643680 | `postgres_table_list` | ❌ | | 3 | 0.622824 | `postgres_server_list` | ❌ | @@ -6167,6 +7182,17 @@ ## Test 169 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.815617 | `postgres_database_list` | ✅ **EXPECTED** | +| 2 | 0.644014 | `postgres_table_list` | ❌ | +| 3 | 0.622790 | `postgres_server_list` | ❌ | +| 4 | 0.542685 | `postgres_server_config_get` | ❌ | +| 5 | 0.490904 | `postgres_server_param_get` | ❌ | + +--- + +## Test 174 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_database_list` **Prompt:** Show me the PostgreSQL databases in server @@ -6176,13 +7202,18 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.760033 | `postgres_database_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.589784 | `postgres_server_list` | ❌ | +======= +| 2 | 0.589783 | `postgres_server_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.585891 | `postgres_table_list` | ❌ | | 4 | 0.552660 | `postgres_server_config_get` | ❌ | | 5 | 0.495685 | `postgres_server_param_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 165 ======= @@ -6192,6 +7223,9 @@ ## Test 170 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 175 +>>>>>>> e2fd2eac 
(refactor tts mcp tool) **Expected Tool:** `postgres_database_query` **Prompt:** Show me all items that contain the word in the PostgreSQL database in server @@ -6200,6 +7234,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.546211 | `postgres_database_list` | ❌ | <<<<<<< HEAD | 2 | 0.523223 | `postgres_database_query` | ✅ **EXPECTED** | @@ -6230,6 +7265,17 @@ ## Test 171 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.546505 | `postgres_database_list` | ❌ | +| 2 | 0.523181 | `postgres_database_query` | ✅ **EXPECTED** | +| 3 | 0.503458 | `postgres_table_list` | ❌ | +| 4 | 0.466623 | `postgres_server_list` | ❌ | +| 5 | 0.404170 | `postgres_server_config_get` | ❌ | + +--- + +## Test 176 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_server_config_get` **Prompt:** Show me the configuration of PostgreSQL server @@ -6240,12 +7286,18 @@ |------|-------|------|--------| | 1 | 0.756593 | `postgres_server_config_get` | ✅ **EXPECTED** | | 2 | 0.615429 | `postgres_server_param_set` | ❌ | +<<<<<<< HEAD | 3 | 0.599487 | `postgres_server_param_get` | ❌ | | 4 | 0.535050 | `postgres_database_list` | ❌ | +======= +| 3 | 0.599471 | `postgres_server_param_get` | ❌ | +| 4 | 0.535049 | `postgres_database_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.518574 | `postgres_server_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 167 ======= @@ -6255,6 +7307,9 @@ ## Test 172 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 177 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_server_list` **Prompt:** List all PostgreSQL servers in my subscription @@ -6264,11 +7319,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.900023 | 
`postgres_server_list` | ✅ **EXPECTED** | ======= <<<<<<< HEAD | 1 | 0.900052 | `postgres_server_list` | ✅ **EXPECTED** | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.900023 | `postgres_server_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.640733 | `postgres_database_list` | ❌ | | 3 | 0.565914 | `postgres_table_list` | ❌ | | 4 | 0.538997 | `postgres_server_config_get` | ❌ | @@ -6276,6 +7335,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 168 ======= @@ -6292,6 +7352,9 @@ ## Test 173 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 178 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_server_list` **Prompt:** Show me my PostgreSQL servers @@ -6308,6 +7371,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 169 ======= @@ -6317,6 +7381,9 @@ ## Test 174 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 179 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_server_list` **Prompt:** Show me the PostgreSQL servers in my subscription @@ -6333,6 +7400,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 170 ======= @@ -6342,6 +7410,9 @@ ## Test 175 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 180 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_server_param_get` **Prompt:** Show me if the parameter my PostgreSQL server has replication enabled @@ -6358,6 +7429,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 171 ======= @@ -6367,6 +7439,9 @@ ## Test 176 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 181 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** 
`postgres_server_param_set` **Prompt:** Enable replication for my PostgreSQL server @@ -6375,6 +7450,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.579909 | `postgres_server_param_set` | ✅ **EXPECTED** | | 2 | 0.488496 | `postgres_server_config_get` | ❌ | | 3 | 0.469810 | `postgres_server_list` | ❌ | @@ -6392,6 +7468,17 @@ ## Test 177 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.579873 | `postgres_server_param_set` | ✅ **EXPECTED** | +| 2 | 0.488474 | `postgres_server_config_get` | ❌ | +| 3 | 0.469794 | `postgres_server_list` | ❌ | +| 4 | 0.447011 | `postgres_server_param_get` | ❌ | +| 5 | 0.440760 | `postgres_database_list` | ❌ | + +--- + +## Test 182 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_table_list` **Prompt:** List all tables in the PostgreSQL database in server @@ -6400,6 +7487,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.789934 | `postgres_table_list` | ✅ **EXPECTED** | | 2 | 0.750592 | `postgres_database_list` | ❌ | | 3 | 0.574975 | `postgres_server_list` | ❌ | @@ -6417,6 +7505,17 @@ ## Test 178 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.789883 | `postgres_table_list` | ✅ **EXPECTED** | +| 2 | 0.750580 | `postgres_database_list` | ❌ | +| 3 | 0.574930 | `postgres_server_list` | ❌ | +| 4 | 0.519820 | `postgres_table_schema_get` | ❌ | +| 5 | 0.501400 | `postgres_server_config_get` | ❌ | + +--- + +## Test 183 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_table_list` **Prompt:** Show me the tables in the PostgreSQL database in server @@ -6433,6 +7532,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 174 ======= @@ -6442,6 +7542,9 @@ ## Test 179 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 
58ab8585 (update prompts and tool description evaluator) +======= +## Test 184 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_table_schema_get` **Prompt:** Show me the schema of table
in the PostgreSQL database in server @@ -6450,6 +7553,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.714916 | `postgres_table_schema_get` | ✅ **EXPECTED** | | 2 | 0.597892 | `postgres_table_list` | ❌ | | 3 | 0.574251 | `postgres_database_list` | ❌ | @@ -6467,6 +7571,17 @@ ## Test 180 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.716024 | `postgres_table_schema_get` | ✅ **EXPECTED** | +| 2 | 0.599077 | `postgres_table_list` | ❌ | +| 3 | 0.574928 | `postgres_database_list` | ❌ | +| 4 | 0.508250 | `postgres_server_config_get` | ❌ | +| 5 | 0.502665 | `kusto_table_schema` | ❌ | + +--- + +## Test 185 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `deploy_app_logs_get` **Prompt:** Show me the log of the application deployed by azd @@ -6478,20 +7593,23 @@ | 1 | 0.711844 | `deploy_app_logs_get` | ✅ **EXPECTED** | | 2 | 0.471692 | `deploy_plan_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.451639 | `monitor_activitylog_list` | ❌ | | 4 | 0.404892 | `deploy_pipeline_guidance_get` | ❌ | ======= <<<<<<< HEAD | 3 | 0.451653 | `monitor_activitylog_list` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.451638 | `monitor_activitylog_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.404890 | `deploy_pipeline_guidance_get` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.401388 | `monitor_resource_log_query` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 176 ======= @@ -6501,6 +7619,9 @@ ## Test 181 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 186 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `deploy_architecture_diagram_generate` **Prompt:** Generate the azure architecture diagram for this application @@ -6509,6 +7630,7 @@ 
| Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.680599 | `deploy_architecture_diagram_generate` | ✅ **EXPECTED** | | 2 | 0.562485 | `deploy_plan_get` | ❌ | | 3 | 0.497326 | `deploy_pipeline_guidance_get` | ❌ | @@ -6526,6 +7648,17 @@ ## Test 182 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.680640 | `deploy_architecture_diagram_generate` | ✅ **EXPECTED** | +| 2 | 0.562521 | `deploy_plan_get` | ❌ | +| 3 | 0.497193 | `deploy_pipeline_guidance_get` | ❌ | +| 4 | 0.489344 | `cloudarchitect_design` | ❌ | +| 5 | 0.435921 | `deploy_iac_rules_get` | ❌ | + +--- + +## Test 187 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `deploy_iac_rules_get` **Prompt:** Show me the rules to generate bicep scripts @@ -6540,6 +7673,7 @@ ======= | 2 | 0.479903 | `bicepschema_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.391965 | `get_bestpractices_get` | ❌ | | 4 | 0.383210 | `azureterraformbestpractices_get` | ❌ | @@ -6553,14 +7687,21 @@ ## Test 173 ======= | 3 | 0.394509 | `get_bestpractices_get` | ❌ | +======= +| 3 | 0.391965 | `get_bestpractices_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.383210 | `azureterraformbestpractices_get` | ❌ | | 5 | 0.375561 | `extension_cli_generate` | ❌ | --- +<<<<<<< HEAD ## Test 183 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 188 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `deploy_pipeline_guidance_get` **Prompt:** How can I create a CI/CD pipeline to deploy this app to Azure? 
@@ -6581,17 +7722,21 @@ ## Test 179 ======= | 3 | 0.448918 | `deploy_iac_rules_get` | ❌ | -| 4 | 0.385940 | `get_bestpractices_get` | ❌ | -| 5 | 0.385920 | `deploy_app_logs_get` | ❌ | +| 4 | 0.385920 | `deploy_app_logs_get` | ❌ | +| 5 | 0.382240 | `get_bestpractices_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 174 ======= ## Test 184 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 189 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `deploy_plan_get` **Prompt:** Create a plan to deploy this application to azure @@ -6604,6 +7749,7 @@ | 2 | 0.587963 | `deploy_pipeline_guidance_get` | ❌ | | 3 | 0.499385 | `deploy_iac_rules_get` | ❌ | | 4 | 0.498575 | `deploy_architecture_diagram_generate` | ❌ | +<<<<<<< HEAD | 5 | 0.448912 | `loadtesting_test_create` | ❌ | --- @@ -6617,6 +7763,13 @@ ## Test 185 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 5 | 0.448692 | `loadtesting_test_create` | ❌ | + +--- + +## Test 190 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_events_publish` **Prompt:** Publish an event to Event Grid topic using with the following data @@ -6626,6 +7779,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.755353 | `eventgrid_events_publish` | ✅ **EXPECTED** | | 2 | 0.482544 | `eventgrid_subscription_list` | ❌ | | 3 | 0.465759 | `eventgrid_topic_list` | ❌ | @@ -6648,6 +7802,9 @@ ## Test 176 ======= | 1 | 0.755380 | `eventgrid_events_publish` | ✅ **EXPECTED** | +======= +| 1 | 0.755365 | `eventgrid_events_publish` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.483021 | `eventgrid_subscription_list` | ❌ | | 3 | 0.466031 | `eventgrid_topic_list` | ❌ | | 4 | 0.360676 | `eventhubs_eventhub_update` | ❌ | @@ -6655,9 +7812,13 @@ --- +<<<<<<< HEAD ## Test 186 
>>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 191 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_events_publish` **Prompt:** Publish event to my Event Grid topic with the following events @@ -6667,16 +7828,16 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.654648 | `eventgrid_events_publish` | ✅ **EXPECTED** | | 2 | 0.524134 | `eventgrid_subscription_list` | ❌ | | 3 | 0.509777 | `eventgrid_topic_list` | ❌ | | 4 | 0.373438 | `servicebus_topic_details` | ❌ | ======= <<<<<<< HEAD -| 1 | 0.654647 | `eventgrid_events_publish` | ✅ **EXPECTED** | ======= -| 1 | 0.654668 | `eventgrid_events_publish` | ✅ **EXPECTED** | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.654647 | `eventgrid_events_publish` | ✅ **EXPECTED** | | 2 | 0.524503 | `eventgrid_subscription_list` | ❌ | | 3 | 0.510039 | `eventgrid_topic_list` | ❌ | | 4 | 0.373718 | `servicebus_topic_details` | ❌ | @@ -6685,6 +7846,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 182 ======= @@ -6694,6 +7856,9 @@ ## Test 187 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 192 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_events_publish` **Prompt:** Send an event to Event Grid topic in resource group with @@ -6702,7 +7867,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.600274 | `eventgrid_events_publish` | ✅ **EXPECTED** | <<<<<<< HEAD | 2 | 0.521041 | `eventgrid_topic_list` | ❌ | @@ -6711,18 +7875,17 @@ ======= | 2 | 0.521240 | `eventgrid_topic_list` | ❌ | | 3 | 0.504808 | `eventgrid_subscription_list` | ❌ | -| 4 | 0.411390 | `eventhubs_eventhub_consumergroup_update` | ❌ | -======= -| 1 | 0.600303 | 
`eventgrid_events_publish` | ✅ **EXPECTED** | -| 2 | 0.521240 | `eventgrid_topic_list` | ❌ | -| 3 | 0.504808 | `eventgrid_subscription_list` | ❌ | | 4 | 0.411130 | `eventhubs_eventhub_consumergroup_update` | ❌ | +<<<<<<< HEAD >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.389439 | `eventhubs_eventhub_consumergroup_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 183 ======= @@ -6732,6 +7895,9 @@ ## Test 188 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 193 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_topic_list` **Prompt:** List all Event Grid topics in my subscription @@ -6748,12 +7914,9 @@ ======= | 1 | 0.770140 | `eventgrid_topic_list` | ✅ **EXPECTED** | | 2 | 0.745470 | `eventgrid_subscription_list` | ❌ | -<<<<<<< HEAD | 3 | 0.561862 | `kusto_cluster_list` | ❌ | -======= -| 3 | 0.561858 | `kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.545540 | `search_service_list` | ❌ | +<<<<<<< HEAD >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.526123 | `subscription_list` | ❌ | @@ -6768,6 +7931,13 @@ ## Test 189 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 5 | 0.526138 | `subscription_list` | ❌ | + +--- + +## Test 194 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_topic_list` **Prompt:** Show me the Event Grid topics in my subscription @@ -6789,23 +7959,22 @@ ======= | 1 | 0.738258 | `eventgrid_topic_list` | ✅ **EXPECTED** | | 2 | 0.737486 | `eventgrid_subscription_list` | ❌ | -<<<<<<< HEAD | 3 | 0.492592 | `kusto_cluster_list` | ❌ | -| 4 | 0.480252 | `subscription_list` | ❌ | -======= -| 3 | 0.492527 | 
`kusto_cluster_list` | ❌ | | 4 | 0.480287 | `subscription_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.475119 | `search_service_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 180 ======= ## Test 190 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 195 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_topic_list` **Prompt:** List all Event Grid topics in subscription @@ -6827,22 +7996,22 @@ ======= | 1 | 0.770140 | `eventgrid_topic_list` | ✅ **EXPECTED** | | 2 | 0.721362 | `eventgrid_subscription_list` | ❌ | -<<<<<<< HEAD | 3 | 0.535326 | `kusto_cluster_list` | ❌ | -======= -| 3 | 0.535427 | `kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.514248 | `search_service_list` | ❌ | -| 5 | 0.495952 | `subscription_list` | ❌ | +| 5 | 0.495987 | `subscription_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 181 ======= ## Test 191 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 196 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_topic_list` **Prompt:** List all Event Grid topics in resource group in subscription @@ -6852,10 +8021,13 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.758562 | `eventgrid_topic_list` | ✅ **EXPECTED** | | 2 | 0.704062 | `eventgrid_subscription_list` | ❌ | ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.758816 | `eventgrid_topic_list` | ✅ **EXPECTED** | | 2 | 0.704462 | `eventgrid_subscription_list` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) @@ -6865,6 +8037,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 187 ======= @@ -6881,6 +8054,9 @@ ## Test 192 >>>>>>> 84ad4f44 (update prompts and tool 
description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 197 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** Show me all Event Grid subscriptions for topic @@ -6893,11 +8069,11 @@ | 2 | 0.720373 | `eventgrid_topic_list` | ❌ | | 3 | 0.498398 | `servicebus_topic_details` | ❌ | | 4 | 0.486216 | `servicebus_topic_subscription_details` | ❌ | -<<<<<<< HEAD | 5 | 0.486162 | `eventgrid_events_publish` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 188 ======= @@ -6910,6 +8086,9 @@ ## Test 193 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 198 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** List Event Grid subscriptions for topic in subscription @@ -6926,11 +8105,15 @@ ======= | 4 | 0.529286 | `servicebus_topic_details` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.477876 | `eventgrid_events_publish` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 189 ======= @@ -6943,6 +8126,9 @@ ## Test 194 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 199 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** List Event Grid subscriptions for topic in resource group @@ -6952,6 +8138,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.746672 | `eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.745851 | `eventgrid_topic_list` | ❌ | | 3 | 0.535463 | `monitor_webtests_list` | ❌ | @@ -6963,14 +8150,17 @@ ## Test 190 ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.746815 | 
`eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.746174 | `eventgrid_topic_list` | ❌ | -| 3 | 0.535731 | `monitor_webtests_list` | ❌ | +| 3 | 0.535569 | `monitor_webtests_list` | ❌ | | 4 | 0.524919 | `group_list` | ❌ | | 5 | 0.503158 | `servicebus_topic_details` | ❌ | --- +<<<<<<< HEAD ## Test 185 ======= | 1 | 0.746335 | `eventgrid_subscription_list` | ✅ **EXPECTED** | @@ -6984,6 +8174,9 @@ ## Test 195 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 200 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** Show all Event Grid subscriptions in my subscription @@ -7005,23 +8198,22 @@ ======= | 1 | 0.736436 | `eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.659727 | `eventgrid_topic_list` | ❌ | -<<<<<<< HEAD -| 3 | 0.569256 | `subscription_list` | ❌ | -| 4 | 0.537922 | `kusto_cluster_list` | ❌ | -======= | 3 | 0.569254 | `subscription_list` | ❌ | -| 4 | 0.537909 | `kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.537922 | `kusto_cluster_list` | ❌ | | 5 | 0.518857 | `search_service_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 186 ======= ## Test 196 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 201 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** List all Event Grid subscriptions in subscription @@ -7031,6 +8223,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.684586 | `eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.656227 | `eventgrid_topic_list` | ❌ | | 3 | 0.542362 | `subscription_list` | ❌ | @@ -7052,17 +8245,23 @@ ## Test 187 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.684543 | 
`eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.656277 | `eventgrid_topic_list` | ❌ | | 3 | 0.542388 | `subscription_list` | ❌ | -| 4 | 0.521119 | `kusto_cluster_list` | ❌ | +| 4 | 0.521053 | `kusto_cluster_list` | ❌ | | 5 | 0.510115 | `group_list` | ❌ | --- +<<<<<<< HEAD ## Test 197 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 202 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** Show Event Grid subscriptions in resource group in subscription @@ -7075,6 +8274,7 @@ | 2 | 0.691623 | `eventgrid_topic_list` | ❌ | | 3 | 0.557573 | `group_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.510684 | `monitor_webtests_list` | ❌ | | 5 | 0.504984 | `resourcehealth_availability-status_list` | ❌ | @@ -7098,6 +8298,14 @@ ## Test 198 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.510684 | `monitor_webtests_list` | ❌ | +| 5 | 0.504984 | `resourcehealth_availability-status_list` | ❌ | + +--- + +## Test 203 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** List Event Grid subscriptions for subscription in location @@ -7106,6 +8314,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.710457 | `eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.642001 | `eventgrid_topic_list` | ❌ | | 3 | 0.506618 | `subscription_list` | ❌ | @@ -7115,10 +8324,17 @@ | 4 | 0.476763 | `search_service_list` | ❌ | <<<<<<< HEAD >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.709801 | `eventgrid_subscription_list` | ✅ **EXPECTED** | +| 2 | 0.642095 | `eventgrid_topic_list` | ❌ | +| 3 | 0.506697 | `subscription_list` | ❌ | +| 4 | 0.476763 | `search_service_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 
0.475782 | `kusto_cluster_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 194 ======= @@ -7131,6 +8347,9 @@ ## Test 199 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 204 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_consumergroup_delete` **Prompt:** Delete my consumer group in my event hub , namespace , and resource group @@ -7140,6 +8359,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.766928 | `eventhubs_eventhub_consumergroup_delete` | ✅ **EXPECTED** | | 2 | 0.675842 | `eventhubs_eventhub_consumergroup_update` | ❌ | | 3 | 0.641112 | `eventhubs_eventhub_consumergroup_get` | ❌ | @@ -7172,6 +8392,17 @@ ## Test 200 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.766871 | `eventhubs_eventhub_consumergroup_delete` | ✅ **EXPECTED** | +| 2 | 0.675824 | `eventhubs_eventhub_consumergroup_update` | ❌ | +| 3 | 0.641096 | `eventhubs_eventhub_consumergroup_get` | ❌ | +| 4 | 0.633729 | `eventhubs_namespace_delete` | ❌ | +| 5 | 0.605488 | `eventhubs_eventhub_delete` | ❌ | + +--- + +## Test 205 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_consumergroup_get` **Prompt:** List all consumer groups in my event hub in namespace @@ -7182,17 +8413,20 @@ |------|-------|------|--------| | 1 | 0.738475 | `eventhubs_eventhub_consumergroup_get` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.634517 | `eventhubs_eventhub_consumergroup_update` | ❌ | | 3 | 0.626486 | `eventhubs_eventhub_consumergroup_delete` | ❌ | ======= <<<<<<< HEAD | 2 | 0.634345 | `eventhubs_eventhub_consumergroup_update` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.634517 | `eventhubs_eventhub_consumergroup_update` | ❌ | ->>>>>>> 84ad4f44 (update prompts and 
tool description evaluator) | 3 | 0.626485 | `eventhubs_eventhub_consumergroup_delete` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.606619 | `eventhubs_namespace_get` | ❌ | +<<<<<<< HEAD | 5 | 0.593098 | `eventhubs_eventhub_get` | ❌ | --- @@ -7206,6 +8440,13 @@ ## Test 201 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 5 | 0.593085 | `eventhubs_eventhub_get` | ❌ | + +--- + +## Test 206 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_consumergroup_get` **Prompt:** Get the details of my consumer group in my event hub , namespace , and resource group @@ -7216,6 +8457,7 @@ |------|-------|------|--------| | 1 | 0.712861 | `eventhubs_eventhub_consumergroup_get` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.637170 | `eventhubs_eventhub_consumergroup_update` | ❌ | ======= <<<<<<< HEAD @@ -7239,6 +8481,16 @@ ## Test 202 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.637170 | `eventhubs_eventhub_consumergroup_update` | ❌ | +| 3 | 0.625913 | `eventhubs_eventhub_consumergroup_delete` | ❌ | +| 4 | 0.576800 | `eventhubs_namespace_get` | ❌ | +| 5 | 0.529926 | `eventhubs_eventhub_get` | ❌ | + +--- + +## Test 207 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_consumergroup_update` **Prompt:** Create a new consumer group in my event hub , namespace , and resource group @@ -7248,6 +8500,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.756873 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | | 2 | 0.688248 | `eventhubs_eventhub_consumergroup_get` | ❌ | | 3 | 0.669384 | `eventhubs_eventhub_consumergroup_delete` | ❌ | @@ -7261,8 +8514,9 @@ <<<<<<< HEAD | 1 | 0.757520 | `eventhubs_eventhub_consumergroup_update` | ✅ 
**EXPECTED** | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.757614 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 2 | 0.688923 | `eventhubs_eventhub_consumergroup_get` | ❌ | | 3 | 0.670026 | `eventhubs_eventhub_consumergroup_delete` | ❌ | | 4 | 0.554314 | `eventhubs_eventhub_update` | ❌ | @@ -7270,12 +8524,16 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 193 ======= ## Test 203 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 208 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_consumergroup_update` **Prompt:** Update my consumer group in my event hub , namespace , and resource group @@ -7285,6 +8543,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.739158 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | | 2 | 0.655927 | `eventhubs_eventhub_consumergroup_delete` | ❌ | | 3 | 0.642524 | `eventhubs_eventhub_consumergroup_get` | ❌ | @@ -7306,17 +8565,23 @@ ## Test 194 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.738818 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | -| 2 | 0.655610 | `eventhubs_eventhub_consumergroup_delete` | ❌ | -| 3 | 0.642206 | `eventhubs_eventhub_consumergroup_get` | ❌ | -| 4 | 0.552216 | `eventhubs_eventhub_update` | ❌ | -| 5 | 0.523137 | `eventhubs_namespace_get` | ❌ | +| 2 | 0.655614 | `eventhubs_eventhub_consumergroup_delete` | ❌ | +| 3 | 0.642219 | `eventhubs_eventhub_consumergroup_get` | ❌ | +| 4 | 0.552234 | `eventhubs_eventhub_update` | ❌ | +| 5 | 0.524019 | `eventhubs_namespace_delete` | ❌ | --- +<<<<<<< HEAD ## Test 204 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 209 +>>>>>>> e2fd2eac (refactor tts mcp 
tool) **Expected Tool:** `eventhubs_eventhub_delete` **Prompt:** Delete my event hub in my namespace and resource group @@ -7326,6 +8591,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.699266 | `eventhubs_namespace_delete` | ❌ | | 2 | 0.688646 | `eventhubs_eventhub_delete` | ✅ **EXPECTED** | | 3 | 0.627721 | `eventhubs_eventhub_consumergroup_delete` | ❌ | @@ -7358,6 +8624,17 @@ ## Test 205 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.699213 | `eventhubs_namespace_delete` | ❌ | +| 2 | 0.688502 | `eventhubs_eventhub_delete` | ✅ **EXPECTED** | +| 3 | 0.627718 | `eventhubs_eventhub_consumergroup_delete` | ❌ | +| 4 | 0.578687 | `eventhubs_namespace_get` | ❌ | +| 5 | 0.552908 | `eventhubs_eventhub_get` | ❌ | + +--- + +## Test 210 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_get` **Prompt:** List all Event Hubs in my namespace @@ -7367,6 +8644,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.773277 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | | 2 | 0.687596 | `eventhubs_namespace_get` | ❌ | | 3 | 0.578709 | `eventhubs_eventhub_update` | ❌ | @@ -7378,19 +8656,26 @@ ## Test 201 ======= | 1 | 0.773231 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | +======= +| 1 | 0.773218 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.687582 | `eventhubs_namespace_get` | ❌ | | 3 | 0.578689 | `eventhubs_eventhub_update` | ❌ | -| 4 | 0.560155 | `eventhubs_namespace_delete` | ❌ | +| 4 | 0.561545 | `eventhubs_namespace_delete` | ❌ | | 5 | 0.545475 | `eventhubs_eventhub_consumergroup_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 196 ======= ## Test 206 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## 
Test 211 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_get` **Prompt:** Get the details of my event hub in my namespace and resource group @@ -7400,6 +8685,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.638112 | `eventhubs_namespace_get` | ❌ | | 2 | 0.627528 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | | 3 | 0.570964 | `eventhubs_eventhub_consumergroup_get` | ❌ | @@ -7432,6 +8718,17 @@ ## Test 207 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.638083 | `eventhubs_namespace_get` | ❌ | +| 2 | 0.627619 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | +| 3 | 0.570904 | `eventhubs_eventhub_consumergroup_get` | ❌ | +| 4 | 0.527646 | `eventhubs_eventhub_update` | ❌ | +| 5 | 0.521920 | `eventhubs_namespace_delete` | ❌ | + +--- + +## Test 212 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_update` **Prompt:** Create a new event hub in my namespace and resource group @@ -7441,6 +8738,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.645976 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | | 2 | 0.605856 | `eventhubs_namespace_get` | ❌ | | 3 | 0.574389 | `eventhubs_eventhub_get` | ❌ | @@ -7465,14 +8763,23 @@ | 1 | 0.645976 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | | 2 | 0.605856 | `eventhubs_namespace_get` | ❌ | | 3 | 0.574389 | `eventhubs_eventhub_get` | ❌ | +======= +| 1 | 0.646114 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | +| 2 | 0.605940 | `eventhubs_namespace_get` | ❌ | +| 3 | 0.574547 | `eventhubs_eventhub_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.571676 | `eventhubs_eventhub_consumergroup_update` | ❌ | -| 5 | 0.557073 | `eventhubs_namespace_delete` | ❌ | +| 5 | 0.557693 | `eventhubs_namespace_delete` | ❌ | --- +<<<<<<< HEAD ## Test 208 >>>>>>> 84ad4f44 (update 
prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 213 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_update` **Prompt:** Update my event hub in my namespace and resource group @@ -7482,6 +8789,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.655283 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | | 2 | 0.571661 | `eventhubs_eventhub_delete` | ❌ | | 3 | 0.568605 | `eventhubs_eventhub_consumergroup_update` | ❌ | @@ -7514,6 +8822,17 @@ ## Test 209 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.655283 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | +| 2 | 0.571661 | `eventhubs_eventhub_delete` | ❌ | +| 3 | 0.568606 | `eventhubs_eventhub_consumergroup_update` | ❌ | +| 4 | 0.568396 | `eventhubs_namespace_get` | ❌ | +| 5 | 0.565977 | `eventhubs_namespace_delete` | ❌ | + +--- + +## Test 214 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_namespace_delete` **Prompt:** Delete my namespace in my resource group @@ -7523,10 +8842,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.623995 | `eventhubs_namespace_delete` | ✅ **EXPECTED** | | 2 | 0.525810 | `eventhubs_namespace_update` | ❌ | ======= | 1 | 0.626113 | `eventhubs_namespace_delete` | ✅ **EXPECTED** | +======= +| 1 | 0.623995 | `eventhubs_namespace_delete` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.525446 | `eventhubs_namespace_update` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.505082 | `eventhubs_eventhub_consumergroup_delete` | ❌ | @@ -7535,6 +8858,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 205 ======= @@ -7544,6 +8868,9 @@ ## Test 210 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 
(update prompts and tool description evaluator) +======= +## Test 215 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_namespace_get` **Prompt:** List all Event Hubs namespaces in my subscription @@ -7552,11 +8879,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.659838 | `eventhubs_eventhub_get` | ❌ | +| 1 | 0.659800 | `eventhubs_eventhub_get` | ❌ | | 2 | 0.658827 | `eventhubs_namespace_get` | ✅ **EXPECTED** | -<<<<<<< HEAD | 3 | 0.607372 | `kusto_cluster_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.557150 | `eventgrid_topic_list` | ❌ | | 5 | 0.556016 | `eventgrid_subscription_list` | ❌ | @@ -7567,17 +8894,23 @@ ======= | 3 | 0.607365 | `kusto_cluster_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.557200 | `eventgrid_topic_list` | ❌ | | 5 | 0.556126 | `eventgrid_subscription_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 201 ======= ## Test 211 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 216 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_namespace_get` **Prompt:** Get the details of my namespace in my resource group @@ -7587,6 +8920,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.509749 | `eventhubs_namespace_get` | ✅ **EXPECTED** | | 2 | 0.509432 | `monitor_webtests_get` | ❌ | | 3 | 0.497399 | `servicebus_queue_details` | ❌ | @@ -7608,6 +8942,8 @@ ## Test 202 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.509749 | `eventhubs_namespace_get` | ✅ **EXPECTED** | | 2 | 0.509431 | `monitor_webtests_get` | ❌ | | 3 | 0.497399 | `servicebus_queue_details` | ❌ | @@ -7616,9 +8952,13 @@ --- +<<<<<<< HEAD ## Test 212 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool 
description evaluator) +======= +## Test 217 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_namespace_update` **Prompt:** Create an new namespace in my resource group @@ -7629,12 +8969,13 @@ |------|-------|------|--------| | 1 | 0.610313 | `eventhubs_namespace_update` | ✅ **EXPECTED** | | 2 | 0.466721 | `eventhubs_namespace_get` | ❌ | -| 3 | 0.461181 | `eventhubs_namespace_delete` | ❌ | -| 4 | 0.449724 | `workbooks_create` | ❌ | +| 3 | 0.458458 | `eventhubs_namespace_delete` | ❌ | +| 4 | 0.449562 | `workbooks_create` | ❌ | | 5 | 0.438492 | `eventhubs_eventhub_consumergroup_update` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 208 ======= @@ -7644,6 +8985,9 @@ ## Test 213 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 218 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_namespace_update` **Prompt:** Update my namespace in my resource group @@ -7664,19 +9008,23 @@ ## Test 209 ======= | 1 | 0.622338 | `eventhubs_namespace_update` | ✅ **EXPECTED** | -| 2 | 0.476290 | `eventhubs_namespace_delete` | ❌ | +| 2 | 0.474099 | `eventhubs_namespace_delete` | ❌ | | 3 | 0.448723 | `eventhubs_namespace_get` | ❌ | | 4 | 0.436549 | `eventhubs_eventhub_consumergroup_update` | ❌ | | 5 | 0.372632 | `sql_db_rename` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 204 ======= ## Test 214 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 219 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Describe the function app in resource group @@ -7689,15 +9037,20 @@ | 2 | 0.451226 | `deploy_app_logs_get` | ❌ | | 3 | 0.450457 | `applens_resource_diagnose` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.390048 | `mysql_server_list` | ❌ | ======= <<<<<<< HEAD | 4 | 0.390107 | `mysql_server_list` | ❌ | >>>>>>> 58ab8585 (update 
prompts and tool description evaluator) +======= +| 4 | 0.390048 | `mysql_server_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.380314 | `get_bestpractices_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 210 ======= @@ -7711,6 +9064,9 @@ ## Test 215 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 220 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Get configuration for function app @@ -7727,6 +9083,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 211 ======= @@ -7736,6 +9093,9 @@ ## Test 216 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 221 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Get function app status for @@ -7746,6 +9106,7 @@ |------|-------|------|--------| | 1 | 0.622384 | `functionapp_get` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.413523 | `resourcehealth_availability-status_get` | ❌ | | 3 | 0.390708 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.383293 | `deploy_app_logs_get` | ❌ | @@ -7756,25 +9117,22 @@ ## Test 212 ======= <<<<<<< HEAD -| 2 | 0.413481 | `resourcehealth_availability-status_get` | ❌ | -| 3 | 0.390766 | `resourcehealth_availability-status_list` | ❌ | -| 4 | 0.383533 | `deploy_app_logs_get` | ❌ | -| 5 | 0.360677 | `storage_account_get` | ❌ | - ---- - -## Test 207 ======= -| 2 | 0.411718 | `resourcehealth_availability-status_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 2 | 0.413481 | `resourcehealth_availability-status_get` | ❌ | | 3 | 0.390708 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.383533 | `deploy_app_logs_get` | ❌ | -| 5 | 0.360764 | `storage_account_get` | ❌ | +| 5 | 0.360665 | `storage_account_get` | ❌ | --- +<<<<<<< HEAD ## Test 217 >>>>>>> 84ad4f44 (update prompts and tool 
description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 222 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Get information about my function app in @@ -7786,6 +9144,7 @@ | 1 | 0.690933 | `functionapp_get` | ✅ **EXPECTED** | | 2 | 0.441937 | `foundry_resource_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.432317 | `resourcehealth_availability-status_list` | ❌ | ======= <<<<<<< HEAD @@ -7801,15 +9160,21 @@ ======= ## Test 208 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.432317 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.431821 | `applens_resource_diagnose` | ❌ | -| 5 | 0.429120 | `storage_account_get` | ❌ | +| 5 | 0.429077 | `storage_account_get` | ❌ | --- +<<<<<<< HEAD ## Test 218 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 223 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Retrieve host name and status of function app @@ -7820,6 +9185,7 @@ |------|-------|------|--------| | 1 | 0.592791 | `functionapp_get` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.417779 | `resourcehealth_availability-status_get` | ❌ | | 3 | 0.409487 | `deploy_app_logs_get` | ❌ | | 4 | 0.399953 | `storage_account_get` | ❌ | @@ -7834,10 +9200,16 @@ | 4 | 0.400049 | `storage_account_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.417817 | `resourcehealth_availability-status_get` | ❌ | +| 3 | 0.409712 | `deploy_app_logs_get` | ❌ | +| 4 | 0.399953 | `storage_account_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.392237 | `applens_resource_diagnose` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 214 ======= @@ -7847,6 +9219,9 @@ ## Test 219 >>>>>>> 84ad4f44 (update prompts and tool description 
evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 224 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Show function app details for in @@ -7860,6 +9235,7 @@ | 3 | 0.428689 | `applens_resource_diagnose` | ❌ | | 4 | 0.424686 | `foundry_resource_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 5 | 0.391781 | `monitor_webtests_get` | ❌ | --- @@ -7873,13 +9249,19 @@ ## Test 210 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.391781 | `monitor_webtests_get` | ❌ | --- +<<<<<<< HEAD ## Test 220 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 225 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Show me the details for the function app @@ -7895,17 +9277,14 @@ | 4 | 0.403261 | `signalr_runtime_get` | ❌ | ======= | 2 | 0.430189 | `deploy_app_logs_get` | ❌ | -<<<<<<< HEAD -| 3 | 0.421127 | `storage_account_get` | ❌ | -======= -| 3 | 0.421155 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 3 | 0.421082 | `storage_account_get` | ❌ | | 4 | 0.403311 | `signalr_runtime_get` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.391615 | `foundry_resource_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 216 ======= @@ -7915,6 +9294,9 @@ ## Test 221 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 226 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Show plan and region for function app @@ -7925,14 +9307,16 @@ |------|-------|------|--------| | 1 | 0.554980 | `functionapp_get` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.426921 | `quota_usage_check` | ❌ | | 3 | 0.424062 | `deploy_app_logs_get` | ❌ | ======= <<<<<<< HEAD | 2 | 
0.426976 | `quota_usage_check` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.426703 | `quota_usage_check` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 3 | 0.424610 | `deploy_app_logs_get` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.408011 | `deploy_plan_get` | ❌ | @@ -7940,6 +9324,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 217 ======= @@ -7949,6 +9334,9 @@ ## Test 222 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 227 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** What is the status of function app ? @@ -7958,6 +9346,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.565797 | `functionapp_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.403246 | `deploy_app_logs_get` | ❌ | | 3 | 0.384159 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.369868 | `applens_resource_diagnose` | ❌ | @@ -7969,10 +9358,16 @@ ## Test 218 ======= <<<<<<< HEAD +======= +| 2 | 0.403665 | `deploy_app_logs_get` | ❌ | +| 3 | 0.384159 | `resourcehealth_availability-status_list` | ❌ | +| 4 | 0.369868 | `applens_resource_diagnose` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.355044 | `resourcehealth_availability-status_get` | ❌ | --- +<<<<<<< HEAD ## Test 213 ======= | 5 | 0.352966 | `resourcehealth_availability-status_get` | ❌ | @@ -7982,6 +9377,9 @@ ## Test 223 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 228 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** List all function apps in my subscription @@ -7997,13 +9395,18 @@ ======= | 2 | 0.559382 | `search_service_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.534935 | `subscription_list` | ❌ | >>>>>>> 58ab8585 (update prompts 
and tool description evaluator) +======= +| 3 | 0.534930 | `subscription_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.529031 | `kusto_cluster_list` | ❌ | | 5 | 0.516618 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 219 ======= @@ -8018,6 +9421,9 @@ ## Test 224 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 229 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Show me my Azure function apps @@ -8033,7 +9439,6 @@ | 4 | 0.410461 | `search_service_list` | ❌ | ======= | 2 | 0.464985 | `deploy_app_logs_get` | ❌ | -<<<<<<< HEAD | 3 | 0.412646 | `search_service_list` | ❌ | | 4 | 0.411323 | `get_bestpractices_get` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) @@ -8041,6 +9446,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 220 ======= @@ -8055,6 +9461,9 @@ ## Test 225 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 230 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** What function apps do I have? 
@@ -8067,10 +9476,11 @@ | 2 | 0.346031 | `deploy_app_logs_get` | ❌ | | 3 | 0.337966 | `applens_resource_diagnose` | ❌ | | 4 | 0.316594 | `extension_cli_install` | ❌ | -| 5 | 0.286490 | `get_bestpractices_get` | ❌ | +| 5 | 0.284362 | `get_bestpractices_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 221 ======= @@ -8080,6 +9490,9 @@ ## Test 226 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 231 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_admin_settings_get` **Prompt:** Get the account settings for my key vault @@ -8089,6 +9502,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.604780 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | | 2 | 0.532196 | `storage_account_get` | ❌ | | 3 | 0.496042 | `keyvault_key_get` | ❌ | @@ -8103,21 +9517,26 @@ | 1 | 0.604797 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | | 2 | 0.532029 | `storage_account_get` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.604780 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | -| 2 | 0.532169 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.532196 | `storage_account_get` | ❌ | | 3 | 0.496629 | `keyvault_key_get` | ❌ | | 4 | 0.452366 | `appconfig_kv_set` | ❌ | | 5 | 0.448039 | `keyvault_secret_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 217 ======= ## Test 227 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 232 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_admin_settings_get` **Prompt:** Show me the account settings for managed HSM keyvault @@ -8127,6 +9546,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.671370 | `keyvault_admin_settings_get` | ✅ 
**EXPECTED** | | 2 | 0.455561 | `storage_account_get` | ❌ | | 3 | 0.440966 | `keyvault_key_get` | ❌ | @@ -8135,9 +9555,10 @@ | 1 | 0.671368 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | | 2 | 0.455516 | `storage_account_get` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.671370 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | -| 2 | 0.455526 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.455561 | `storage_account_get` | ❌ | | 3 | 0.441225 | `keyvault_key_get` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.404666 | `appconfig_kv_set` | ❌ | @@ -8145,6 +9566,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 223 ======= @@ -8154,6 +9576,9 @@ ## Test 228 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 233 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_admin_settings_get` **Prompt:** What's the value of the setting in my key vault with name @@ -8163,6 +9588,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.505709 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | | 2 | 0.496565 | `appconfig_kv_set` | ❌ | | 3 | 0.420067 | `appconfig_kv_lock_set` | ❌ | @@ -8176,8 +9602,9 @@ <<<<<<< HEAD | 1 | 0.505731 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.505750 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 2 | 0.496540 | `appconfig_kv_set` | ❌ | | 3 | 0.420145 | `appconfig_kv_lock_set` | ❌ | | 4 | 0.419126 | `keyvault_key_get` | ❌ | @@ -8185,12 +9612,16 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 219 ======= ## Test 229 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) 
+======= +## Test 234 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_create` **Prompt:** Create a new certificate called in the key vault @@ -8200,6 +9631,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.627727 | `keyvault_certificate_create` | ✅ **EXPECTED** | | 2 | 0.570319 | `keyvault_certificate_import` | ❌ | | 3 | 0.540199 | `keyvault_key_create` | ❌ | @@ -8221,17 +9653,23 @@ ## Test 220 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.627727 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.570398 | `keyvault_certificate_import` | ❌ | +| 2 | 0.570318 | `keyvault_certificate_import` | ❌ | | 3 | 0.540199 | `keyvault_key_create` | ❌ | | 4 | 0.519218 | `keyvault_certificate_get` | ❌ | -| 5 | 0.500027 | `keyvault_certificate_list` | ❌ | +| 5 | 0.499900 | `keyvault_certificate_list` | ❌ | --- +<<<<<<< HEAD ## Test 230 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 235 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_create` **Prompt:** Generate a certificate named in key vault @@ -8241,6 +9679,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.599548 | `keyvault_certificate_create` | ✅ **EXPECTED** | | 2 | 0.561717 | `keyvault_certificate_import` | ❌ | | 3 | 0.521910 | `keyvault_certificate_get` | ❌ | @@ -8265,6 +9704,17 @@ ## Test 231 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.600003 | `keyvault_certificate_create` | ✅ **EXPECTED** | +| 2 | 0.561463 | `keyvault_certificate_import` | ❌ | +| 3 | 0.522705 | `keyvault_certificate_get` | ❌ | +| 4 | 0.502139 | `keyvault_key_create` | ❌ | +| 5 | 0.497143 | `keyvault_certificate_list` | ❌ | + +--- + +## Test 236 
+>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_create` **Prompt:** Request creation of certificate in the key vault @@ -8274,6 +9724,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.573998 | `keyvault_certificate_create` | ✅ **EXPECTED** | | 2 | 0.527759 | `keyvault_certificate_import` | ❌ | | 3 | 0.498278 | `keyvault_certificate_get` | ❌ | @@ -8295,17 +9746,23 @@ ## Test 222 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.573998 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.527813 | `keyvault_certificate_import` | ❌ | +| 2 | 0.527759 | `keyvault_certificate_import` | ❌ | | 3 | 0.498278 | `keyvault_certificate_get` | ❌ | | 4 | 0.481548 | `keyvault_key_create` | ❌ | -| 5 | 0.469601 | `keyvault_certificate_list` | ❌ | +| 5 | 0.469457 | `keyvault_certificate_list` | ❌ | --- +<<<<<<< HEAD ## Test 232 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 237 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_create` **Prompt:** Provision a new key vault certificate in vault @@ -8315,13 +9772,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.591697 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.562234 | `keyvault_certificate_import` | ❌ | +| 2 | 0.562265 | `keyvault_certificate_import` | ❌ | | 3 | 0.522147 | `keyvault_certificate_get` | ❌ | | 4 | 0.502529 | `keyvault_key_create` | ❌ | -| 5 | 0.479992 | `keyvault_certificate_list` | ❌ | +| 5 | 0.479936 | `keyvault_certificate_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 228 ======= @@ -8331,6 +9789,9 @@ ## Test 233 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 238 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** 
`keyvault_certificate_create` **Prompt:** Issue a certificate in key vault @@ -8340,13 +9801,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.622788 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.558533 | `keyvault_certificate_import` | ❌ | +| 2 | 0.558532 | `keyvault_certificate_import` | ❌ | | 3 | 0.534503 | `keyvault_certificate_get` | ❌ | -| 4 | 0.521316 | `keyvault_certificate_list` | ❌ | +| 4 | 0.521205 | `keyvault_certificate_list` | ❌ | | 5 | 0.465056 | `keyvault_key_create` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 229 ======= @@ -8356,6 +9818,9 @@ ## Test 234 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 239 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_get` **Prompt:** Show me the certificate in the key vault @@ -8365,17 +9830,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.600625 | `keyvault_certificate_get` | ✅ **EXPECTED** | -| 2 | 0.528405 | `keyvault_certificate_list` | ❌ | -<<<<<<< HEAD +| 2 | 0.528153 | `keyvault_certificate_list` | ❌ | | 3 | 0.519037 | `keyvault_certificate_import` | ❌ | -======= -| 3 | 0.518919 | `keyvault_certificate_import` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.499293 | `keyvault_certificate_create` | ❌ | | 5 | 0.487691 | `keyvault_key_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 230 ======= @@ -8385,6 +9847,9 @@ ## Test 235 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 240 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_get` **Prompt:** Show me the details of the certificate in the key vault @@ -8394,6 +9859,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.646098 | `keyvault_certificate_get` | ✅ 
**EXPECTED** | +<<<<<<< HEAD | 2 | 0.563263 | `keyvault_key_get` | ❌ | | 3 | 0.514499 | `keyvault_secret_get` | ❌ | | 4 | 0.509446 | `keyvault_certificate_list` | ❌ | @@ -8414,6 +9880,16 @@ ## Test 236 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.562988 | `keyvault_key_get` | ❌ | +| 3 | 0.514170 | `keyvault_secret_get` | ❌ | +| 4 | 0.509201 | `keyvault_certificate_list` | ❌ | +| 5 | 0.507737 | `keyvault_certificate_import` | ❌ | + +--- + +## Test 241 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_get` **Prompt:** Get the certificate from vault @@ -8423,9 +9899,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.609523 | `keyvault_certificate_get` | ✅ **EXPECTED** | -| 2 | 0.515570 | `keyvault_certificate_list` | ❌ | +| 2 | 0.515460 | `keyvault_certificate_list` | ❌ | | 3 | 0.511197 | `keyvault_certificate_create` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.507768 | `keyvault_certificate_import` | ❌ | | 5 | 0.475674 | `keyvault_key_get` | ❌ | @@ -8434,16 +9911,23 @@ ## Test 232 ======= | 4 | 0.507693 | `keyvault_certificate_import` | ❌ | +======= +| 4 | 0.507768 | `keyvault_certificate_import` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.474394 | `keyvault_key_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 227 ======= ## Test 237 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 242 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_get` **Prompt:** Display the certificate details for in vault @@ -8453,6 +9937,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.647669 | `keyvault_certificate_get` | ✅ **EXPECTED** | | 2 | 0.528243 | `keyvault_key_get` | ❌ | | 3 | 0.521556 | `keyvault_certificate_list` | ❌ | @@ 
-8477,6 +9962,17 @@ ## Test 238 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.647745 | `keyvault_certificate_get` | ✅ **EXPECTED** | +| 2 | 0.527487 | `keyvault_key_get` | ❌ | +| 3 | 0.521168 | `keyvault_certificate_list` | ❌ | +| 4 | 0.509776 | `keyvault_certificate_import` | ❌ | +| 5 | 0.502207 | `keyvault_secret_get` | ❌ | + +--- + +## Test 243 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_get` **Prompt:** Retrieve certificate metadata for in vault @@ -8486,6 +9982,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.595959 | `keyvault_certificate_get` | ✅ **EXPECTED** | | 2 | 0.527404 | `keyvault_certificate_list` | ❌ | | 3 | 0.519059 | `keyvault_certificate_import` | ❌ | @@ -8507,17 +10004,23 @@ ## Test 229 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.595959 | `keyvault_certificate_get` | ✅ **EXPECTED** | -| 2 | 0.527404 | `keyvault_certificate_list` | ❌ | -| 3 | 0.518970 | `keyvault_certificate_import` | ❌ | +| 2 | 0.527274 | `keyvault_certificate_list` | ❌ | +| 3 | 0.519059 | `keyvault_certificate_import` | ❌ | | 4 | 0.501138 | `keyvault_certificate_create` | ❌ | | 5 | 0.465174 | `keyvault_key_get` | ❌ | --- +<<<<<<< HEAD ## Test 239 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 244 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_import` **Prompt:** Import the certificate in file into the key vault @@ -8527,6 +10030,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.585481 | `keyvault_certificate_import` | ✅ **EXPECTED** | | 2 | 0.420747 | `keyvault_certificate_get` | ❌ | | 3 | 0.402595 | `keyvault_certificate_create` | ❌ | @@ -8549,16 +10053,23 @@ ## Test 230 
======= | 1 | 0.585374 | `keyvault_certificate_import` | ✅ **EXPECTED** | +======= +| 1 | 0.585481 | `keyvault_certificate_import` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.420747 | `keyvault_certificate_get` | ❌ | | 3 | 0.402595 | `keyvault_certificate_create` | ❌ | -| 4 | 0.399342 | `keyvault_certificate_list` | ❌ | +| 4 | 0.399228 | `keyvault_certificate_list` | ❌ | | 5 | 0.352905 | `keyvault_key_create` | ❌ | --- +<<<<<<< HEAD ## Test 240 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 245 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_import` **Prompt:** Import a certificate into the key vault using the name @@ -8567,15 +10078,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.622125 | `keyvault_certificate_import` | ✅ **EXPECTED** | -| 2 | 0.504314 | `keyvault_certificate_get` | ❌ | -| 3 | 0.498847 | `keyvault_certificate_create` | ❌ | -| 4 | 0.448105 | `keyvault_certificate_list` | ❌ | -| 5 | 0.419811 | `keyvault_key_create` | ❌ | +| 1 | 0.622172 | `keyvault_certificate_import` | ✅ **EXPECTED** | +| 2 | 0.504401 | `keyvault_certificate_get` | ❌ | +| 3 | 0.498608 | `keyvault_certificate_create` | ❌ | +| 4 | 0.448038 | `keyvault_certificate_list` | ❌ | +| 5 | 0.419465 | `keyvault_key_create` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 236 ======= @@ -8592,6 +10103,9 @@ ## Test 241 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 246 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_import` **Prompt:** Upload certificate file to key vault @@ -8600,15 +10114,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.595707 | `keyvault_certificate_import` | ✅ **EXPECTED** | | 2 | 0.453929 | 
`keyvault_certificate_create` | ❌ | | 3 | 0.452551 | `keyvault_certificate_get` | ❌ | -| 4 | 0.418203 | `keyvault_certificate_list` | ❌ | +| 4 | 0.418115 | `keyvault_certificate_list` | ❌ | | 5 | 0.413377 | `keyvault_key_create` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 237 ======= @@ -8625,6 +10139,9 @@ ## Test 242 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 247 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_import` **Prompt:** Load certificate from file into vault @@ -8633,14 +10150,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.619385 | `keyvault_certificate_import` | ✅ **EXPECTED** | +| 1 | 0.619480 | `keyvault_certificate_import` | ✅ **EXPECTED** | | 2 | 0.517804 | `keyvault_certificate_get` | ❌ | | 3 | 0.480815 | `keyvault_certificate_create` | ❌ | -| 4 | 0.444386 | `keyvault_certificate_list` | ❌ | +| 4 | 0.444264 | `keyvault_certificate_list` | ❌ | | 5 | 0.381873 | `keyvault_key_create` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 238 ======= @@ -8650,7 +10168,10 @@ ## Test 243 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) - +======= +## Test 248 +>>>>>>> e2fd2eac (refactor tts mcp tool) + **Expected Tool:** `keyvault_certificate_import` **Prompt:** Add existing certificate file to the key vault with name @@ -8659,6 +10180,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.595418 | `keyvault_certificate_import` | ✅ **EXPECTED** | | 2 | 0.452490 | `keyvault_certificate_create` | ❌ | ======= @@ -8688,6 +10210,17 @@ ## Test 244 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.595460 | `keyvault_certificate_import` | ✅ **EXPECTED** | +| 2 | 0.452480 | 
`keyvault_certificate_create` | ❌ | +| 3 | 0.441646 | `keyvault_certificate_get` | ❌ | +| 4 | 0.408002 | `keyvault_key_create` | ❌ | +| 5 | 0.392240 | `keyvault_secret_create` | ❌ | + +--- + +## Test 249 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_list` **Prompt:** List all certificates in the key vault @@ -8696,6 +10229,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.726124 | `keyvault_certificate_list` | ✅ **EXPECTED** | <<<<<<< HEAD | 2 | 0.583110 | `keyvault_key_list` | ❌ | @@ -8707,11 +10241,17 @@ >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.531988 | `keyvault_secret_list` | ❌ | +======= +| 1 | 0.726049 | `keyvault_certificate_list` | ✅ **EXPECTED** | +| 2 | 0.583110 | `keyvault_key_list` | ❌ | +| 3 | 0.532060 | `keyvault_secret_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.515236 | `keyvault_certificate_get` | ❌ | | 5 | 0.485792 | `keyvault_certificate_create` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 240 ======= @@ -8721,6 +10261,9 @@ ## Test 245 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 250 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_list` **Prompt:** Show me the certificates in the key vault @@ -8729,9 +10272,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.615541 | `keyvault_certificate_list` | ✅ **EXPECTED** | +| 1 | 0.615289 | `keyvault_certificate_list` | ✅ **EXPECTED** | | 2 | 0.522453 | `keyvault_certificate_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.475156 | `keyvault_key_list` | ❌ | ======= <<<<<<< HEAD @@ -8740,11 +10284,15 @@ | 3 | 0.475142 | `keyvault_key_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool 
description evaluator) +======= +| 3 | 0.475156 | `keyvault_key_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.460973 | `keyvault_certificate_create` | ❌ | | 5 | 0.449381 | `keyvault_key_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 241 ======= @@ -8754,6 +10302,9 @@ ## Test 246 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 251 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_list` **Prompt:** What certificates are in the key vault ? @@ -8762,15 +10313,19 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.624710 | `keyvault_certificate_list` | ✅ **EXPECTED** | +======= +| 1 | 0.624522 | `keyvault_certificate_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.519739 | `keyvault_certificate_get` | ❌ | | 3 | 0.510048 | `keyvault_certificate_create` | ❌ | -<<<<<<< HEAD | 4 | 0.505534 | `keyvault_certificate_import` | ❌ | | 5 | 0.497356 | `keyvault_key_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 242 ======= @@ -8784,6 +10339,9 @@ ## Test 247 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 252 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_list` **Prompt:** List certificate names in vault @@ -8792,6 +10350,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.672622 | `keyvault_certificate_list` | ✅ **EXPECTED** | <<<<<<< HEAD | 2 | 0.553990 | `keyvault_key_list` | ❌ | @@ -8803,11 +10362,17 @@ >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.511905 | `keyvault_secret_list` | ❌ | +======= +| 1 | 0.672392 | `keyvault_certificate_list` | ✅ **EXPECTED** | +| 2 | 0.553990 | `keyvault_key_list` | 
❌ | +| 3 | 0.511981 | `keyvault_secret_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.507062 | `keyvault_certificate_get` | ❌ | | 5 | 0.492357 | `keyvault_certificate_create` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 243 ======= @@ -8817,6 +10382,9 @@ ## Test 248 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 253 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_list` **Prompt:** Enumerate certificates in key vault @@ -8826,6 +10394,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.747408 | `keyvault_certificate_list` | ✅ **EXPECTED** | | 2 | 0.594216 | `keyvault_key_list` | ❌ | ======= @@ -8837,11 +10406,17 @@ >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.558771 | `keyvault_secret_list` | ❌ | +======= +| 1 | 0.747416 | `keyvault_certificate_list` | ✅ **EXPECTED** | +| 2 | 0.594216 | `keyvault_key_list` | ❌ | +| 3 | 0.558818 | `keyvault_secret_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.515568 | `keyvault_certificate_get` | ❌ | | 5 | 0.490876 | `keyvault_certificate_create` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 244 ======= @@ -8851,6 +10426,9 @@ ## Test 249 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 254 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_list` **Prompt:** Show certificate names in the key vault @@ -8859,9 +10437,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.639711 | `keyvault_certificate_list` | ✅ **EXPECTED** | +| 1 | 0.639473 | `keyvault_certificate_list` | ✅ **EXPECTED** | | 2 | 0.512475 | `keyvault_certificate_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 
0.507572 | `keyvault_key_list` | ❌ | ======= <<<<<<< HEAD @@ -8870,11 +10449,15 @@ | 3 | 0.507562 | `keyvault_key_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.507572 | `keyvault_key_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.482583 | `keyvault_certificate_create` | ❌ | -| 5 | 0.464725 | `keyvault_secret_list` | ❌ | +| 5 | 0.464824 | `keyvault_secret_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 245 ======= @@ -8884,6 +10467,9 @@ ## Test 250 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 255 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_create` **Prompt:** Create a new key called with the RSA type in the key vault @@ -8892,7 +10478,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.661466 | `keyvault_key_create` | ✅ **EXPECTED** | | 2 | 0.456580 | `keyvault_secret_create` | ❌ | | 3 | 0.451790 | `keyvault_certificate_create` | ❌ | @@ -8901,6 +10486,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 246 ======= @@ -8917,6 +10503,9 @@ ## Test 251 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 256 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_create` **Prompt:** Generate a key with type in vault @@ -8926,6 +10515,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.641070 | `keyvault_key_create` | ✅ **EXPECTED** | | 2 | 0.428964 | `keyvault_key_get` | ❌ | | 3 | 0.422763 | `keyvault_certificate_create` | ❌ | @@ -8958,6 +10548,17 @@ ## Test 252 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 
0.641639 | `keyvault_key_create` | ✅ **EXPECTED** | +| 2 | 0.428841 | `keyvault_key_get` | ❌ | +| 3 | 0.423116 | `keyvault_certificate_create` | ❌ | +| 4 | 0.420631 | `keyvault_secret_create` | ❌ | +| 5 | 0.406157 | `appconfig_kv_set` | ❌ | + +--- + +## Test 257 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_create` **Prompt:** Create an oct key in the vault @@ -8967,6 +10568,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.547493 | `keyvault_key_create` | ✅ **EXPECTED** | | 2 | 0.463557 | `keyvault_secret_create` | ❌ | | 3 | 0.447410 | `keyvault_certificate_create` | ❌ | @@ -8988,17 +10590,23 @@ ## Test 243 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.547493 | `keyvault_key_create` | ✅ **EXPECTED** | | 2 | 0.463557 | `keyvault_secret_create` | ❌ | | 3 | 0.447410 | `keyvault_certificate_create` | ❌ | | 4 | 0.420366 | `keyvault_key_get` | ❌ | -| 5 | 0.404180 | `keyvault_certificate_import` | ❌ | +| 5 | 0.404350 | `keyvault_certificate_import` | ❌ | --- +<<<<<<< HEAD ## Test 253 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 258 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_create` **Prompt:** Create an RSA key in the vault with name @@ -9007,7 +10615,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.641369 | `keyvault_key_create` | ✅ **EXPECTED** | | 2 | 0.501636 | `keyvault_secret_create` | ❌ | | 3 | 0.491735 | `keyvault_certificate_create` | ❌ | @@ -9016,6 +10623,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 249 ======= @@ -9032,6 +10640,9 @@ ## Test 254 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 259 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_create` 
**Prompt:** Create an EC key with name in the vault @@ -9055,11 +10666,11 @@ | 2 | 0.443369 | `keyvault_certificate_create` | ❌ | | 3 | 0.434675 | `keyvault_secret_create` | ❌ | | 4 | 0.421721 | `keyvault_key_get` | ❌ | -<<<<<<< HEAD | 5 | 0.400533 | `keyvault_certificate_import` | ❌ | --- +<<<<<<< HEAD ## Test 245 ======= | 5 | 0.400433 | `keyvault_certificate_import` | ❌ | @@ -9069,6 +10680,9 @@ ## Test 255 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 260 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_get` **Prompt:** Show me the key in the key vault @@ -9081,6 +10695,7 @@ | 2 | 0.468243 | `keyvault_secret_get` | ❌ | | 3 | 0.452816 | `keyvault_key_create` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.439969 | `keyvault_key_list` | ❌ | ======= <<<<<<< HEAD @@ -9089,10 +10704,14 @@ | 4 | 0.439941 | `keyvault_key_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.439969 | `keyvault_key_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.426545 | `keyvault_certificate_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 251 ======= @@ -9102,6 +10721,9 @@ ## Test 256 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 261 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_get` **Prompt:** Show me the details of the key in the key vault @@ -9111,6 +10733,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.629372 | `keyvault_key_get` | ✅ **EXPECTED** | | 2 | 0.532872 | `keyvault_secret_get` | ❌ | | 3 | 0.512278 | `storage_account_get` | ❌ | @@ -9120,11 +10743,17 @@ | 2 | 0.532651 | `keyvault_secret_get` | ❌ | | 3 | 0.512106 | `storage_account_get` | ❌ | >>>>>>> 
58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.629552 | `keyvault_key_get` | ✅ **EXPECTED** | +| 2 | 0.532651 | `keyvault_secret_get` | ❌ | +| 3 | 0.512278 | `storage_account_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.495957 | `keyvault_certificate_get` | ❌ | | 5 | 0.456992 | `keyvault_key_create` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 252 ======= @@ -9141,6 +10770,9 @@ ## Test 257 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 262 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_get` **Prompt:** Get the key from vault @@ -9151,12 +10783,17 @@ |------|-------|------|--------| | 1 | 0.485492 | `keyvault_key_get` | ✅ **EXPECTED** | | 2 | 0.443182 | `keyvault_key_create` | ❌ | +<<<<<<< HEAD | 3 | 0.409356 | `keyvault_secret_get` | ❌ | +======= +| 3 | 0.409388 | `keyvault_secret_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.395491 | `keyvault_admin_settings_get` | ❌ | | 5 | 0.383519 | `appconfig_kv_lock_set` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 253 ======= @@ -9166,6 +10803,9 @@ ## Test 258 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 263 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_get` **Prompt:** Display the key details for in vault @@ -9182,16 +10822,21 @@ | 1 | 0.590303 | `keyvault_key_get` | ✅ **EXPECTED** | | 2 | 0.488213 | `keyvault_secret_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.476278 | `storage_account_get` | ❌ | ======= | 3 | 0.476529 | `storage_account_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.476498 | `storage_account_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.460796 | 
`keyvault_certificate_get` | ❌ | | 5 | 0.436511 | `keyvault_admin_settings_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 254 ======= @@ -9201,6 +10846,9 @@ ## Test 259 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 264 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_get` **Prompt:** Retrieve key metadata for in vault @@ -9215,19 +10863,19 @@ | 3 | 0.432742 | `keyvault_admin_settings_get` | ❌ | ======= | 1 | 0.518886 | `keyvault_key_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.432731 | `keyvault_admin_settings_get` | ❌ | -| 3 | 0.432677 | `storage_account_get` | ❌ | -======= -| 2 | 0.432980 | `storage_account_get` | ❌ | +| 2 | 0.432950 | `storage_account_get` | ❌ | | 3 | 0.432742 | `keyvault_admin_settings_get` | ❌ | +<<<<<<< HEAD >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.429131 | `keyvault_key_create` | ❌ | | 5 | 0.422731 | `keyvault_secret_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 255 ======= @@ -9237,6 +10885,9 @@ ## Test 260 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 265 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_list` **Prompt:** List all keys in the key vault @@ -9246,6 +10897,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.701448 | `keyvault_key_list` | ✅ **EXPECTED** | ======= <<<<<<< HEAD @@ -9267,13 +10919,23 @@ | 2 | 0.601513 | `keyvault_certificate_list` | ❌ | | 3 | 0.587427 | `keyvault_secret_list` | ❌ | | 4 | 0.498750 | `cosmos_account_list` | ❌ | +======= +| 1 | 0.701448 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.601430 | `keyvault_certificate_list` | ❌ | +| 3 | 
0.587541 | `keyvault_secret_list` | ❌ | +| 4 | 0.498767 | `cosmos_account_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.480129 | `keyvault_admin_settings_get` | ❌ | --- +<<<<<<< HEAD ## Test 261 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 266 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_list` **Prompt:** Show me the keys in the key vault @@ -9283,6 +10945,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.549453 | `keyvault_key_list` | ✅ **EXPECTED** | | 2 | 0.507865 | `keyvault_key_get` | ❌ | | 3 | 0.475507 | `keyvault_certificate_list` | ❌ | @@ -9295,25 +10958,26 @@ ======= <<<<<<< HEAD | 1 | 0.549498 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.506815 | `keyvault_key_get` | ❌ | -| 3 | 0.475507 | `keyvault_certificate_list` | ❌ | -| 4 | 0.472457 | `keyvault_admin_settings_get` | ❌ | ======= -| 1 | 0.549442 | `keyvault_key_list` | ✅ **EXPECTED** | +| 1 | 0.549453 | `keyvault_key_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.506815 | `keyvault_key_get` | ❌ | -| 3 | 0.475507 | `keyvault_certificate_list` | ❌ | +| 3 | 0.475251 | `keyvault_certificate_list` | ❌ | | 4 | 0.472465 | `keyvault_admin_settings_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.455683 | `keyvault_secret_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 252 ======= ## Test 262 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 267 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_list` **Prompt:** What keys are in the key vault ? 
@@ -9323,6 +10987,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.581970 | `keyvault_key_list` | ✅ **EXPECTED** | | 2 | 0.502245 | `keyvault_admin_settings_get` | ❌ | ======= @@ -9349,6 +11014,17 @@ ## Test 263 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.581970 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.502245 | `keyvault_admin_settings_get` | ❌ | +| 3 | 0.501285 | `keyvault_certificate_list` | ❌ | +| 4 | 0.476470 | `keyvault_key_get` | ❌ | +| 5 | 0.472515 | `keyvault_secret_list` | ❌ | + +--- + +## Test 268 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_list` **Prompt:** List key names in vault @@ -9358,6 +11034,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.641314 | `keyvault_key_list` | ✅ **EXPECTED** | ======= <<<<<<< HEAD @@ -9365,11 +11042,17 @@ >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.559550 | `keyvault_certificate_list` | ❌ | | 3 | 0.553553 | `keyvault_secret_list` | ❌ | +======= +| 1 | 0.641314 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.559318 | `keyvault_certificate_list` | ❌ | +| 3 | 0.553669 | `keyvault_secret_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.486377 | `keyvault_admin_settings_get` | ❌ | | 5 | 0.475992 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 259 ======= @@ -9386,6 +11069,9 @@ ## Test 264 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 269 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_list` **Prompt:** Enumerate keys in key vault @@ -9395,6 +11081,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.723266 | `keyvault_key_list` | 
✅ **EXPECTED** | | 2 | 0.611366 | `keyvault_certificate_list` | ❌ | | 3 | 0.611185 | `keyvault_secret_list` | ❌ | @@ -9414,18 +11101,26 @@ | 1 | 0.723171 | `keyvault_key_list` | ✅ **EXPECTED** | | 2 | 0.611366 | `keyvault_certificate_list` | ❌ | | 3 | 0.611185 | `keyvault_secret_list` | ❌ | +======= +| 1 | 0.723266 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.611390 | `keyvault_certificate_list` | ❌ | +| 3 | 0.611279 | `keyvault_secret_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.473886 | `keyvault_admin_settings_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.441881 | `keyvault_key_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 255 ======= ## Test 265 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 270 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_list` **Prompt:** Show key names in the key vault @@ -9435,6 +11130,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.570444 | `keyvault_key_list` | ✅ **EXPECTED** | | 2 | 0.501953 | `keyvault_key_get` | ❌ | | 3 | 0.500103 | `keyvault_certificate_list` | ❌ | @@ -9465,6 +11161,17 @@ ## Test 266 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.570444 | `keyvault_key_list` | ✅ **EXPECTED** | +| 2 | 0.501073 | `keyvault_key_get` | ❌ | +| 3 | 0.499912 | `keyvault_certificate_list` | ❌ | +| 4 | 0.496817 | `storage_account_get` | ❌ | +| 5 | 0.490504 | `keyvault_secret_list` | ❌ | + +--- + +## Test 271 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_create` **Prompt:** Create a new secret called with value in the key vault @@ -9481,6 +11188,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 262 ======= @@ -9490,6 +11198,9 @@ ## Test 267 >>>>>>> 84ad4f44 (update 
prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 272 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_create` **Prompt:** Set a secret named with value in key vault @@ -9498,6 +11209,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.663094 | `keyvault_secret_create` | ✅ **EXPECTED** | | 2 | 0.519306 | `keyvault_secret_get` | ❌ | | 3 | 0.512233 | `appconfig_kv_set` | ❌ | @@ -9515,6 +11227,17 @@ ## Test 268 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.663051 | `keyvault_secret_create` | ✅ **EXPECTED** | +| 2 | 0.519554 | `keyvault_secret_get` | ❌ | +| 3 | 0.512173 | `appconfig_kv_set` | ❌ | +| 4 | 0.458563 | `keyvault_key_create` | ❌ | +| 5 | 0.429786 | `appconfig_kv_lock_set` | ❌ | + +--- + +## Test 273 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_create` **Prompt:** Store secret value in the key vault @@ -9524,6 +11247,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.639897 | `keyvault_secret_create` | ✅ **EXPECTED** | | 2 | 0.509526 | `keyvault_secret_get` | ❌ | | 3 | 0.485203 | `appconfig_kv_set` | ❌ | @@ -9548,6 +11272,17 @@ ## Test 269 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.639908 | `keyvault_secret_create` | ✅ **EXPECTED** | +| 2 | 0.509778 | `keyvault_secret_get` | ❌ | +| 3 | 0.485096 | `appconfig_kv_set` | ❌ | +| 4 | 0.484619 | `keyvault_key_create` | ❌ | +| 5 | 0.448908 | `appconfig_kv_lock_set` | ❌ | + +--- + +## Test 274 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_create` **Prompt:** Add a new version of secret with value in vault @@ -9557,6 +11292,7 @@ | Rank | Score | Tool | 
Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.675145 | `keyvault_secret_create` | ✅ **EXPECTED** | | 2 | 0.499276 | `keyvault_secret_get` | ❌ | | 3 | 0.498228 | `keyvault_key_create` | ❌ | @@ -9589,6 +11325,17 @@ ## Test 270 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.675151 | `keyvault_secret_create` | ✅ **EXPECTED** | +| 2 | 0.499630 | `keyvault_secret_get` | ❌ | +| 3 | 0.498091 | `keyvault_key_create` | ❌ | +| 4 | 0.479063 | `keyvault_certificate_import` | ❌ | +| 5 | 0.458559 | `appconfig_kv_set` | ❌ | + +--- + +## Test 275 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_create` **Prompt:** Update secret to value in the key vault @@ -9598,6 +11345,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.571597 | `keyvault_secret_create` | ✅ **EXPECTED** | | 2 | 0.513012 | `keyvault_secret_get` | ❌ | | 3 | 0.441198 | `appconfig_kv_set` | ❌ | @@ -9630,6 +11378,17 @@ ## Test 271 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.571590 | `keyvault_secret_create` | ✅ **EXPECTED** | +| 2 | 0.513749 | `keyvault_secret_get` | ❌ | +| 3 | 0.441094 | `appconfig_kv_set` | ❌ | +| 4 | 0.417832 | `appconfig_kv_lock_set` | ❌ | +| 5 | 0.408233 | `keyvault_key_get` | ❌ | + +--- + +## Test 276 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_get` **Prompt:** Show me the secret in the key vault @@ -9639,6 +11398,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.602686 | `keyvault_secret_get` | ✅ **EXPECTED** | | 2 | 0.505620 | `keyvault_key_get` | ❌ | | 3 | 0.501397 | `keyvault_secret_create` | ❌ | @@ -9660,17 +11420,23 @@ ## Test 262 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) 
| 1 | 0.602769 | `keyvault_secret_get` | ✅ **EXPECTED** | | 2 | 0.504212 | `keyvault_key_get` | ❌ | | 3 | 0.501397 | `keyvault_secret_create` | ❌ | -| 4 | 0.478769 | `keyvault_secret_list` | ❌ | +| 4 | 0.478828 | `keyvault_secret_list` | ❌ | | 5 | 0.439521 | `keyvault_certificate_get` | ❌ | --- +<<<<<<< HEAD ## Test 272 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 277 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_get` **Prompt:** Show me the details of the secret in the key vault @@ -9680,6 +11446,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.653920 | `keyvault_secret_get` | ✅ **EXPECTED** | | 2 | 0.567036 | `keyvault_key_get` | ❌ | | 3 | 0.517547 | `storage_account_get` | ❌ | @@ -9706,6 +11473,17 @@ ## Test 273 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.653702 | `keyvault_secret_get` | ✅ **EXPECTED** | +| 2 | 0.566721 | `keyvault_key_get` | ❌ | +| 3 | 0.517433 | `storage_account_get` | ❌ | +| 4 | 0.495959 | `keyvault_certificate_get` | ❌ | +| 5 | 0.485474 | `keyvault_secret_list` | ❌ | + +--- + +## Test 278 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_get` **Prompt:** Get the secret from vault @@ -9717,11 +11495,16 @@ | 1 | 0.578261 | `keyvault_secret_get` | ✅ **EXPECTED** | | 2 | 0.493543 | `keyvault_key_get` | ❌ | | 3 | 0.488705 | `keyvault_secret_create` | ❌ | +<<<<<<< HEAD | 4 | 0.443676 | `keyvault_secret_list` | ❌ | +======= +| 4 | 0.443696 | `keyvault_secret_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.424167 | `keyvault_admin_settings_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 269 ======= @@ -9731,6 +11514,9 @@ ## Test 274 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 
(update prompts and tool description evaluator) +======= +## Test 279 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_get` **Prompt:** Display the secret details for in vault @@ -9747,6 +11533,7 @@ | 1 | 0.649267 | `keyvault_secret_get` | ✅ **EXPECTED** | | 2 | 0.546992 | `keyvault_key_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.497258 | `storage_account_get` | ❌ | ======= | 3 | 0.497410 | `storage_account_get` | ❌ | @@ -9766,6 +11553,15 @@ ## Test 275 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.497402 | `storage_account_get` | ❌ | +| 4 | 0.492583 | `keyvault_certificate_get` | ❌ | +| 5 | 0.491655 | `keyvault_secret_list` | ❌ | + +--- + +## Test 280 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_get` **Prompt:** Retrieve secret metadata for in vault @@ -9774,6 +11570,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.577338 | `keyvault_secret_get` | ✅ **EXPECTED** | | 2 | 0.475492 | `keyvault_key_get` | ❌ | | 3 | 0.466890 | `keyvault_secret_create` | ❌ | @@ -9799,6 +11596,17 @@ ## Test 276 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.577471 | `keyvault_secret_get` | ✅ **EXPECTED** | +| 2 | 0.475432 | `keyvault_key_get` | ❌ | +| 3 | 0.466876 | `keyvault_secret_create` | ❌ | +| 4 | 0.447631 | `keyvault_secret_list` | ❌ | +| 5 | 0.439582 | `storage_account_get` | ❌ | + +--- + +## Test 281 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_list` **Prompt:** List all secrets in the key vault @@ -9807,6 +11615,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.701227 | `keyvault_secret_list` | ✅ **EXPECTED** | <<<<<<< HEAD | 2 | 0.563736 | `keyvault_key_list` | ❌ | @@ -9824,17 +11633,26 @@ | 2 | 
0.563694 | `keyvault_key_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 3 | 0.538337 | `keyvault_certificate_list` | ❌ | +======= +| 1 | 0.701255 | `keyvault_secret_list` | ✅ **EXPECTED** | +| 2 | 0.563736 | `keyvault_key_list` | ❌ | +| 3 | 0.538290 | `keyvault_certificate_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.499642 | `keyvault_secret_get` | ❌ | -| 5 | 0.455469 | `cosmos_account_list` | ❌ | +| 5 | 0.455500 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 267 ======= ## Test 277 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 282 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_list` **Prompt:** Show me the secrets in the key vault @@ -9843,6 +11661,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.555681 | `keyvault_secret_list` | ✅ **EXPECTED** | <<<<<<< HEAD | 2 | 0.544015 | `keyvault_secret_get` | ❌ | @@ -9854,24 +11673,23 @@ ## Test 273 ======= +======= +| 1 | 0.555768 | `keyvault_secret_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.543861 | `keyvault_secret_get` | ❌ | | 3 | 0.497525 | `keyvault_key_get` | ❌ | -<<<<<<< HEAD -| 4 | 0.464705 | `keyvault_key_list` | ❌ | -| 5 | 0.453107 | `keyvault_admin_settings_get` | ❌ | - ---- - -## Test 268 -======= -| 4 | 0.464652 | `keyvault_key_list` | ❌ | +| 4 | 0.464661 | `keyvault_key_list` | ❌ | | 5 | 0.453130 | `keyvault_admin_settings_get` | ❌ | --- +<<<<<<< HEAD ## Test 278 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 283 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_list` **Prompt:** What secrets are in the key vault ? 
@@ -9880,6 +11698,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.572540 | `keyvault_secret_list` | ✅ **EXPECTED** | <<<<<<< HEAD | 2 | 0.529389 | `keyvault_secret_get` | ❌ | @@ -9891,24 +11710,26 @@ ## Test 274 ======= -| 2 | 0.529258 | `keyvault_secret_get` | ❌ | -<<<<<<< HEAD -| 3 | 0.493797 | `keyvault_key_list` | ❌ | -| 4 | 0.487611 | `keyvault_admin_settings_get` | ❌ | ======= -| 3 | 0.493728 | `keyvault_key_list` | ❌ | +| 1 | 0.572620 | `keyvault_secret_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 2 | 0.529258 | `keyvault_secret_get` | ❌ | +| 3 | 0.493761 | `keyvault_key_list` | ❌ | | 4 | 0.487620 | `keyvault_admin_settings_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.475273 | `keyvault_key_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 269 ======= ## Test 279 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 284 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_list` **Prompt:** List secrets names in vault @@ -9917,6 +11738,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.624290 | `keyvault_secret_list` | ✅ **EXPECTED** | <<<<<<< HEAD | 2 | 0.559681 | `keyvault_key_list` | ❌ | @@ -9942,6 +11764,17 @@ ## Test 280 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.624379 | `keyvault_secret_list` | ✅ **EXPECTED** | +| 2 | 0.559681 | `keyvault_key_list` | ❌ | +| 3 | 0.517338 | `keyvault_certificate_list` | ❌ | +| 4 | 0.479547 | `keyvault_secret_get` | ❌ | +| 5 | 0.454596 | `storage_blob_container_get` | ❌ | + +--- + +## Test 285 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_list` **Prompt:** Enumerate secrets in key vault @@ -9950,6 +11783,7 @@ | Rank | 
Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.742358 | `keyvault_secret_list` | ✅ **EXPECTED** | <<<<<<< HEAD | 2 | 0.601183 | `keyvault_key_list` | ❌ | @@ -9962,10 +11796,17 @@ >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.567827 | `keyvault_certificate_list` | ❌ | | 4 | 0.496363 | `keyvault_secret_get` | ❌ | +======= +| 1 | 0.742378 | `keyvault_secret_list` | ✅ **EXPECTED** | +| 2 | 0.601183 | `keyvault_key_list` | ❌ | +| 3 | 0.567881 | `keyvault_certificate_list` | ❌ | +| 4 | 0.496127 | `keyvault_secret_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.437560 | `keyvault_admin_settings_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 276 ======= @@ -9975,6 +11816,9 @@ ## Test 281 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 286 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_list` **Prompt:** Show secrets names in the key vault @@ -9983,6 +11827,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.567110 | `keyvault_secret_list` | ✅ **EXPECTED** | <<<<<<< HEAD | 2 | 0.522600 | `keyvault_secret_get` | ❌ | @@ -9994,23 +11839,26 @@ ## Test 277 ======= -| 2 | 0.522398 | `keyvault_secret_get` | ❌ | -<<<<<<< HEAD -| 3 | 0.476354 | `keyvault_key_list` | ❌ | ======= -| 3 | 0.476288 | `keyvault_key_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 1 | 0.567204 | `keyvault_secret_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 2 | 0.522398 | `keyvault_secret_get` | ❌ | +| 3 | 0.476309 | `keyvault_key_list` | ❌ | | 4 | 0.462676 | `keyvault_secret_create` | ❌ | | 5 | 0.461326 | `keyvault_key_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 272 ======= ## Test 282 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description 
evaluator) +======= +## Test 287 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_cluster_get` **Prompt:** Get the configuration of AKS cluster @@ -10027,6 +11875,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 278 ======= @@ -10036,6 +11885,9 @@ ## Test 283 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 288 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_cluster_get` **Prompt:** Show me the details of AKS cluster in resource group @@ -10045,6 +11897,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.621759 | `aks_cluster_get` | ✅ **EXPECTED** | | 2 | 0.575626 | `aks_nodepool_get` | ❌ | | 3 | 0.567870 | `kusto_cluster_get` | ❌ | @@ -10066,6 +11919,8 @@ ## Test 274 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.621759 | `aks_cluster_get` | ✅ **EXPECTED** | | 2 | 0.575625 | `aks_nodepool_get` | ❌ | | 3 | 0.567870 | `kusto_cluster_get` | ❌ | @@ -10074,9 +11929,13 @@ --- +<<<<<<< HEAD ## Test 284 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 289 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_cluster_get` **Prompt:** Show me the network configuration for AKS cluster @@ -10089,11 +11948,11 @@ | 2 | 0.483220 | `aks_nodepool_get` | ❌ | | 3 | 0.434684 | `kusto_cluster_get` | ❌ | | 4 | 0.380301 | `mysql_server_config_get` | ❌ | -<<<<<<< HEAD | 5 | 0.366689 | `kusto_cluster_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 280 ======= @@ -10106,6 +11965,9 @@ ## Test 285 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 290 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_cluster_get` **Prompt:** What are the details of my AKS cluster in ? 
@@ -10118,6 +11980,7 @@ | 2 | 0.550555 | `aks_nodepool_get` | ❌ | | 3 | 0.527511 | `kusto_cluster_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.445722 | `storage_account_get` | ❌ | ======= <<<<<<< HEAD @@ -10126,10 +11989,14 @@ | 4 | 0.445833 | `storage_account_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.445722 | `storage_account_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.435597 | `foundry_resource_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 281 ======= @@ -10139,6 +12006,9 @@ ## Test 286 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 291 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_cluster_get` **Prompt:** List all AKS clusters in my subscription @@ -10148,10 +12018,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.756471 | `aks_cluster_get` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.749416 | `kusto_cluster_list` | ❌ | | 3 | 0.590166 | `aks_nodepool_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.568635 | `kusto_database_list` | ❌ | | 5 | 0.560522 | `search_service_list` | ❌ | @@ -10165,16 +12035,23 @@ | 3 | 0.590166 | `aks_nodepool_get` | ❌ | | 4 | 0.568301 | `kusto_database_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) +======= +| 4 | 0.568403 | `kusto_database_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.562043 | `search_service_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 277 ======= ## Test 287 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 292 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_cluster_get` **Prompt:** Show me my Azure Kubernetes Service clusters @@ -10183,6 +12060,7 @@ | Rank | Score | Tool | 
Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.612123 | `aks_cluster_get` | ✅ **EXPECTED** | <<<<<<< HEAD | 2 | 0.586661 | `kusto_cluster_list` | ❌ | @@ -10207,6 +12085,17 @@ ## Test 288 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.612043 | `aks_cluster_get` | ✅ **EXPECTED** | +| 2 | 0.586527 | `kusto_cluster_list` | ❌ | +| 3 | 0.507689 | `aks_nodepool_get` | ❌ | +| 4 | 0.489677 | `kusto_cluster_get` | ❌ | +| 5 | 0.462776 | `kusto_database_list` | ❌ | + +--- + +## Test 293 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_cluster_get` **Prompt:** What AKS clusters do I have? @@ -10228,13 +12117,13 @@ ======= | 1 | 0.628429 | `aks_cluster_get` | ✅ **EXPECTED** | | 2 | 0.563189 | `aks_nodepool_get` | ❌ | -<<<<<<< HEAD | 3 | 0.526756 | `kusto_cluster_list` | ❌ | | 4 | 0.426157 | `kusto_cluster_get` | ❌ | -| 5 | 0.409163 | `kusto_database_list` | ❌ | +| 5 | 0.409103 | `kusto_database_list` | ❌ | --- +<<<<<<< HEAD ## Test 279 ======= | 3 | 0.526670 | `kusto_cluster_list` | ❌ | @@ -10246,6 +12135,9 @@ ## Test 289 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 294 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_nodepool_get` **Prompt:** Get details for nodepool in AKS cluster in @@ -10255,6 +12147,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.728569 | `aks_nodepool_get` | ✅ **EXPECTED** | | 2 | 0.516573 | `kusto_cluster_get` | ❌ | | 3 | 0.509314 | `aks_cluster_get` | ❌ | @@ -10276,6 +12169,8 @@ ## Test 280 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.728937 | `aks_nodepool_get` | ✅ **EXPECTED** | | 2 | 0.517021 | `kusto_cluster_get` | ❌ | | 3 | 0.509820 | `aks_cluster_get` | ❌ | @@ -10284,9 +12179,13 @@ --- +<<<<<<< HEAD ## Test 290 >>>>>>> 
84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 295 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_nodepool_get` **Prompt:** Show me the configuration for nodepool in AKS cluster in resource group @@ -10295,7 +12194,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.654106 | `aks_nodepool_get` | ✅ **EXPECTED** | | 2 | 0.458596 | `sql_elastic-pool_list` | ❌ | | 3 | 0.446035 | `aks_cluster_get` | ❌ | @@ -10304,6 +12202,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 286 ======= @@ -10320,6 +12219,9 @@ ## Test 291 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 296 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_nodepool_get` **Prompt:** What is the setup of nodepool for AKS cluster in ? @@ -10336,6 +12238,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 287 ======= @@ -10345,6 +12248,9 @@ ## Test 292 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 297 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_nodepool_get` **Prompt:** List nodepools for AKS cluster in @@ -10354,6 +12260,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.692231 | `aks_nodepool_get` | ✅ **EXPECTED** | | 2 | 0.519037 | `aks_cluster_get` | ❌ | | 3 | 0.506720 | `virtualdesktop_hostpool_list` | ❌ | @@ -10375,17 +12282,23 @@ ## Test 283 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.692231 | `aks_nodepool_get` | ✅ **EXPECTED** | | 2 | 0.519037 | `aks_cluster_get` | ❌ | | 3 | 0.506624 | `virtualdesktop_hostpool_list` | ❌ | -| 4 | 0.500514 | `kusto_cluster_list` | ❌ | +| 4 | 0.500749 | `kusto_cluster_list` | ❌ | | 5 | 0.487707 | `sql_elastic-pool_list` | ❌ 
| --- +<<<<<<< HEAD ## Test 293 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 298 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_nodepool_get` **Prompt:** Show me the nodepool list for AKS cluster in @@ -10402,11 +12315,15 @@ ======= | 4 | 0.509732 | `virtualdesktop_hostpool_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.486700 | `kusto_cluster_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 289 ======= @@ -10419,6 +12336,9 @@ ## Test 294 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 299 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_nodepool_get` **Prompt:** What nodepools do I have for AKS cluster in @@ -10434,15 +12354,16 @@ ======= | 3 | 0.443902 | `virtualdesktop_hostpool_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.433006 | `kusto_cluster_list` | ❌ | ======= -| 4 | 0.432757 | `kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 4 | 0.433006 | `kusto_cluster_list` | ❌ | | 5 | 0.425448 | `sql_elastic-pool_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 290 ======= @@ -10452,6 +12373,9 @@ ## Test 295 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 300 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_test_create` **Prompt:** Create a basic URL test using the following endpoint URL that runs for 30 minutes with 45 virtual users. 
The test name is with the test id and the load testing resource is in the resource group in my subscription @@ -10468,6 +12392,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 291 ======= @@ -10477,6 +12402,9 @@ ## Test 296 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 301 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_test_get` **Prompt:** Get the load test with id in the load test resource in resource group @@ -10486,6 +12414,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.626226 | `loadtesting_testresource_list` | ❌ | | 2 | 0.619944 | `loadtesting_test_get` | ✅ **EXPECTED** | | 3 | 0.594666 | `loadtesting_testresource_create` | ❌ | @@ -10518,6 +12447,17 @@ ## Test 297 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.626226 | `loadtesting_testresource_list` | ❌ | +| 2 | 0.619944 | `loadtesting_test_get` | ✅ **EXPECTED** | +| 3 | 0.594666 | `loadtesting_testresource_create` | ❌ | +| 4 | 0.590697 | `monitor_webtests_get` | ❌ | +| 5 | 0.536024 | `monitor_webtests_list` | ❌ | + +--- + +## Test 302 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_testresource_create` **Prompt:** Create a load test resource in the resource group in my subscription @@ -10527,6 +12467,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.645537 | `loadtesting_testresource_create` | ✅ **EXPECTED** | | 2 | 0.618773 | `loadtesting_testresource_list` | ❌ | | 3 | 0.541696 | `loadtesting_test_create` | ❌ | @@ -10559,6 +12500,17 @@ ## Test 298 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.644693 | `loadtesting_testresource_create` | ✅ 
**EXPECTED** | +| 2 | 0.618375 | `loadtesting_testresource_list` | ❌ | +| 3 | 0.541221 | `loadtesting_test_create` | ❌ | +| 4 | 0.540031 | `loadtesting_testrun_create` | ❌ | +| 5 | 0.526768 | `monitor_webtests_list` | ❌ | + +--- + +## Test 303 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_testresource_list` **Prompt:** List all load testing resources in the resource group in my subscription @@ -10569,6 +12521,7 @@ |------|-------|------|--------| | 1 | 0.794326 | `loadtesting_testresource_list` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.653165 | `monitor_webtests_list` | ❌ | ======= <<<<<<< HEAD @@ -10577,12 +12530,16 @@ | 2 | 0.651533 | `monitor_webtests_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.653165 | `monitor_webtests_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.577408 | `group_list` | ❌ | | 4 | 0.575172 | `loadtesting_testresource_create` | ❌ | | 5 | 0.565565 | `datadog_monitoredresources_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 294 ======= @@ -10592,6 +12549,9 @@ ## Test 299 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 304 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_testrun_create` **Prompt:** Create a test run using the id for test in the load testing resource in resource group . 
Use the name of test run and description as @@ -10601,13 +12561,19 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.688976 | `loadtesting_testrun_create` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.594879 | `loadtesting_testrun_update` | ❌ | | 3 | 0.558566 | `loadtesting_test_create` | ❌ | +======= +| 2 | 0.594779 | `loadtesting_testrun_update` | ❌ | +| 3 | 0.558636 | `loadtesting_test_create` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.547102 | `loadtesting_testresource_create` | ❌ | | 5 | 0.496224 | `loadtesting_testresource_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 295 ======= @@ -10617,6 +12583,9 @@ ## Test 300 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 305 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_testrun_get` **Prompt:** Get the load test run with id in the load test resource in resource group @@ -10625,6 +12594,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.619146 | `loadtesting_testresource_list` | ❌ | | 2 | 0.601927 | `loadtesting_test_get` | ❌ | | 3 | 0.597430 | `loadtesting_testresource_create` | ❌ | @@ -10650,6 +12620,17 @@ ## Test 301 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.618926 | `loadtesting_testresource_list` | ❌ | +| 2 | 0.602281 | `loadtesting_test_get` | ❌ | +| 3 | 0.596851 | `loadtesting_testresource_create` | ❌ | +| 4 | 0.577610 | `monitor_webtests_get` | ❌ | +| 5 | 0.566147 | `loadtesting_testrun_list` | ❌ | + +--- + +## Test 306 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_testrun_list` **Prompt:** Get all the load test runs for the test with id in the load test resource in resource group @@ -10659,6 +12640,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| 
<<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.669307 | `loadtesting_testresource_list` | ❌ | | 2 | 0.640644 | `loadtesting_testrun_list` | ✅ **EXPECTED** | | 3 | 0.600977 | `loadtesting_test_get` | ❌ | @@ -10687,6 +12669,17 @@ ## Test 302 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.669160 | `loadtesting_testresource_list` | ❌ | +| 2 | 0.640500 | `loadtesting_testrun_list` | ✅ **EXPECTED** | +| 3 | 0.601136 | `loadtesting_test_get` | ❌ | +| 4 | 0.577398 | `loadtesting_testresource_create` | ❌ | +| 5 | 0.569408 | `monitor_webtests_get` | ❌ | + +--- + +## Test 307 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_testrun_update` **Prompt:** Update a test run display name as for the id for test in the load testing resource in resource group . @@ -10695,15 +12688,19 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.706747 | `loadtesting_testrun_update` | ✅ **EXPECTED** | +| 1 | 0.706390 | `loadtesting_testrun_update` | ✅ **EXPECTED** | | 2 | 0.514428 | `loadtesting_testrun_create` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.486977 | `monitor_webtests_update` | ❌ | ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.486980 | `monitor_webtests_update` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.470337 | `loadtesting_testresource_list` | ❌ | +<<<<<<< HEAD | 5 | 0.468374 | `monitor_webtests_get` | ❌ | --- @@ -10715,13 +12712,19 @@ ======= | 3 | 0.487022 | `monitor_webtests_update` | ❌ | | 4 | 0.470337 | `loadtesting_testresource_list` | ❌ | +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.468374 | `monitor_webtests_get` | ❌ | --- +<<<<<<< HEAD ## Test 303 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 308 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected 
Tool:** `grafana_list` **Prompt:** List all Azure Managed Grafana in one subscription @@ -10730,12 +12733,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.599427 | `kusto_cluster_list` | ❌ | -======= -| 1 | 0.599428 | `kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 2 | 0.578892 | `grafana_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 3 | 0.550372 | `subscription_list` | ❌ | | 4 | 0.549957 | `search_service_list` | ❌ | | 5 | 0.531259 | `redis_list` | ❌ | @@ -10751,6 +12751,15 @@ ## Test 304 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.551851 | `search_service_list` | ❌ | +| 4 | 0.550372 | `subscription_list` | ❌ | +| 5 | 0.531277 | `redis_list` | ❌ | + +--- + +## Test 309 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `managedlustre_fs_create` **Prompt:** Create an Azure Managed Lustre filesystem with name , size , SKU , and subnet for availability zone in location . 
Maintenance should occur on at @@ -10760,21 +12769,29 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.726553 | `managedlustre_fs_create` | ✅ **EXPECTED** | | 2 | 0.616164 | `managedlustre_fs_list` | ❌ | | 3 | 0.605701 | `managedlustre_fs_sku_get` | ❌ | | 4 | 0.598215 | `managedlustre_fs_update` | ❌ | ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.728113 | `managedlustre_fs_create` | ✅ **EXPECTED** | -| 2 | 0.615874 | `managedlustre_fs_list` | ❌ | +| 2 | 0.616164 | `managedlustre_fs_list` | ❌ | | 3 | 0.605775 | `managedlustre_fs_sku_get` | ❌ | +<<<<<<< HEAD | 4 | 0.598255 | `managedlustre_fs_update` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.598293 | `managedlustre_fs_update` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.557720 | `managedlustre_fs_subnetsize_validate` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 300 ======= @@ -10791,6 +12808,9 @@ ## Test 305 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 310 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `managedlustre_fs_list` **Prompt:** List the Azure Managed Lustre filesystems in my subscription @@ -10800,6 +12820,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.750675 | `managedlustre_fs_list` | ✅ **EXPECTED** | | 2 | 0.631730 | `managedlustre_fs_sku_get` | ❌ | | 3 | 0.579855 | `managedlustre_fs_create` | ❌ | @@ -10831,6 +12852,17 @@ ## Test 306 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.750667 | `managedlustre_fs_list` | ✅ **EXPECTED** | +| 2 | 0.631727 | `managedlustre_fs_sku_get` | ❌ | +| 3 | 0.582749 | `managedlustre_fs_create` | ❌ | +| 4 | 0.562295 | `kusto_cluster_list` | ❌ | +| 5 | 
0.513090 | `search_service_list` | ❌ | + +--- + +## Test 311 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `managedlustre_fs_list` **Prompt:** List the Azure Managed Lustre filesystems in my resource group @@ -10840,6 +12872,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.743903 | `managedlustre_fs_list` | ✅ **EXPECTED** | | 2 | 0.613164 | `managedlustre_fs_sku_get` | ❌ | | 3 | 0.563081 | `managedlustre_fs_create` | ❌ | @@ -10854,11 +12887,17 @@ | 3 | 0.565856 | `managedlustre_filesystem_create` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.743903 | `managedlustre_fs_list` | ✅ **EXPECTED** | +| 2 | 0.613217 | `managedlustre_fs_sku_get` | ❌ | +| 3 | 0.565856 | `managedlustre_fs_create` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.519986 | `datadog_monitoredresources_list` | ❌ | | 5 | 0.515433 | `loadtesting_testresource_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 302 ======= @@ -10868,6 +12907,9 @@ ## Test 307 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 312 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `managedlustre_fs_sku_get` **Prompt:** List the Azure Managed Lustre SKUs available in location @@ -10877,13 +12919,16 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.827360 | `managedlustre_fs_sku_get` | ✅ **EXPECTED** | | 2 | 0.613674 | `managedlustre_fs_list` | ❌ | | 3 | 0.511625 | `managedlustre_fs_create` | ❌ | ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.827381 | `managedlustre_fs_sku_get` | ✅ **EXPECTED** | -| 2 | 0.613245 | `managedlustre_fs_list` | ❌ | +| 2 | 0.613674 | `managedlustre_fs_list` | ❌ | | 3 | 0.513242 | `managedlustre_fs_create` | ❌ | >>>>>>> 
58ab8585 (update prompts and tool description evaluator) | 4 | 0.496242 | `managedlustre_fs_subnetsize_validate` | ❌ | @@ -10891,6 +12936,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 303 ======= @@ -10907,6 +12953,9 @@ ## Test 308 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 313 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `managedlustre_fs_subnetsize_ask` **Prompt:** Tell me how many IP addresses I need for an Azure Managed Lustre filesystem of size using the SKU @@ -10916,6 +12965,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.739766 | `managedlustre_fs_subnetsize_ask` | ✅ **EXPECTED** | | 2 | 0.651598 | `managedlustre_fs_subnetsize_validate` | ❌ | | 3 | 0.594536 | `managedlustre_fs_sku_get` | ❌ | @@ -10948,6 +12998,17 @@ ## Test 309 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.739766 | `managedlustre_fs_subnetsize_ask` | ✅ **EXPECTED** | +| 2 | 0.651598 | `managedlustre_fs_subnetsize_validate` | ❌ | +| 3 | 0.594585 | `managedlustre_fs_sku_get` | ❌ | +| 4 | 0.559498 | `managedlustre_fs_list` | ❌ | +| 5 | 0.533684 | `managedlustre_fs_create` | ❌ | + +--- + +## Test 314 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `managedlustre_fs_subnetsize_validate` **Prompt:** Validate if the network can host Azure Managed Lustre filesystem of size using the SKU @@ -10957,6 +13018,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.879240 | `managedlustre_fs_subnetsize_validate` | ✅ **EXPECTED** | | 2 | 0.622368 | `managedlustre_fs_subnetsize_ask` | ❌ | | 3 | 0.542555 | `managedlustre_fs_sku_get` | ❌ | @@ -10989,6 +13051,17 @@ ## Test 310 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts 
and tool description evaluator) +======= +| 1 | 0.879742 | `managedlustre_fs_subnetsize_validate` | ✅ **EXPECTED** | +| 2 | 0.623614 | `managedlustre_fs_subnetsize_ask` | ❌ | +| 3 | 0.543132 | `managedlustre_fs_sku_get` | ❌ | +| 4 | 0.516528 | `managedlustre_fs_create` | ❌ | +| 5 | 0.480633 | `managedlustre_fs_list` | ❌ | + +--- + +## Test 315 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `managedlustre_fs_update` **Prompt:** Update the maintenance window of the Azure Managed Lustre filesystem to at @@ -10998,6 +13071,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.738895 | `managedlustre_fs_update` | ✅ **EXPECTED** | | 2 | 0.525980 | `managedlustre_fs_create` | ❌ | | 3 | 0.487193 | `managedlustre_fs_list` | ❌ | @@ -11029,6 +13103,17 @@ ## Test 311 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.739170 | `managedlustre_fs_update` | ✅ **EXPECTED** | +| 2 | 0.527527 | `managedlustre_fs_create` | ❌ | +| 3 | 0.487191 | `managedlustre_fs_list` | ❌ | +| 4 | 0.385343 | `managedlustre_fs_sku_get` | ❌ | +| 5 | 0.344858 | `managedlustre_fs_subnetsize_validate` | ❌ | + +--- + +## Test 316 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `marketplace_product_get` **Prompt:** Get details about marketplace product @@ -11038,6 +13123,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.570164 | `marketplace_product_get` | ✅ **EXPECTED** | | 2 | 0.499208 | `marketplace_product_list` | ❌ | | 3 | 0.353280 | `servicebus_topic_subscription_details` | ❌ | @@ -11066,6 +13152,17 @@ ## Test 312 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.570164 | `marketplace_product_get` | ✅ **EXPECTED** | +| 2 | 0.499208 | `marketplace_product_list` | ❌ | +| 3 | 
0.353280 | `servicebus_topic_subscription_details` | ❌ | +| 4 | 0.333178 | `servicebus_topic_details` | ❌ | +| 5 | 0.330949 | `servicebus_queue_details` | ❌ | + +--- + +## Test 317 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `marketplace_product_list` **Prompt:** Search for Microsoft products in the marketplace @@ -11086,23 +13183,23 @@ ## Test 308 ======= | 1 | 0.607916 | `marketplace_product_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.443178 | `marketplace_product_get` | ❌ | -======= -| 2 | 0.443109 | `marketplace_product_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.443133 | `marketplace_product_get` | ❌ | | 3 | 0.343549 | `search_service_list` | ❌ | | 4 | 0.330500 | `foundry_models_list` | ❌ | | 5 | 0.328676 | `managedlustre_fs_sku_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 303 ======= ## Test 313 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 318 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `marketplace_product_list` **Prompt:** Show me marketplace products from publisher @@ -11113,6 +13210,7 @@ |------|-------|------|--------| | 1 | 0.537726 | `marketplace_product_list` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.385167 | `marketplace_product_get` | ❌ | ======= <<<<<<< HEAD @@ -11221,6 +13319,16 @@ ## Test 314 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.385167 | `marketplace_product_get` | ❌ | +| 3 | 0.308769 | `foundry_models_list` | ❌ | +| 4 | 0.288030 | `redis_list` | ❌ | +| 5 | 0.260387 | `managedlustre_fs_sku_get` | ❌ | + +--- + +## Test 319 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure code generation best practices @@ -11230,6 +13338,7 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.656395 | `azureaibestpractices_get` | ❌ | | 2 | 0.646844 | `get_bestpractices_get` | ✅ **EXPECTED** | | 3 | 0.635406 | `azureterraformbestpractices_get` | ❌ | @@ -11246,20 +13355,26 @@ | 3 | 0.586894 | `deploy_iac_rules_get` | ❌ | ======= | 1 | 0.651264 | `get_bestpractices_get` | ✅ **EXPECTED** | +======= +| 1 | 0.646844 | `get_bestpractices_get` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.635406 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.586907 | `deploy_iac_rules_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.531727 | `deploy_pipeline_guidance_get` | ❌ | | 5 | 0.490235 | `deploy_plan_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 305 ======= ## Test 315 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 320 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure deployment best practices @@ -11268,9 +13383,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.600903 | `get_bestpractices_get` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.548542 | `azureterraformbestpractices_get` | ❌ | ======= | 2 | 0.548655 | `azureterraformbestpractices_get` | ❌ | @@ -11279,12 +13394,16 @@ | 2 | 0.548542 | `azureterraformbestpractices_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.548542 | `azureterraformbestpractices_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.541091 | `deploy_iac_rules_get` | ❌ | | 4 | 0.516852 | `deploy_plan_get` | ❌ | | 5 | 0.516203 | `deploy_pipeline_guidance_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 316 ======= @@ -11294,6 +13413,9 @@ ## Test 316 >>>>>>> 84ad4f44 (update prompts 
and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 321 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure best practices @@ -11302,9 +13424,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.625259 | `get_bestpractices_get` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.594323 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.539715 | `azureaibestpractices_get` | ❌ | | 4 | 0.518643 | `deploy_iac_rules_get` | ❌ | @@ -11317,20 +13439,25 @@ | 2 | 0.594455 | `azureterraformbestpractices_get` | ❌ | ======= | 1 | 0.624689 | `get_bestpractices_get` | ✅ **EXPECTED** | +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.594323 | `azureterraformbestpractices_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 3 | 0.518643 | `deploy_iac_rules_get` | ❌ | | 4 | 0.465572 | `deploy_pipeline_guidance_get` | ❌ | | 5 | 0.450629 | `cloudarchitect_design` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 307 ======= ## Test 317 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 322 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure Functions code generation best practices @@ -11339,9 +13466,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.624273 | `get_bestpractices_get` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.587474 | `azureaibestpractices_get` | ❌ | | 3 | 0.570488 | `azureterraformbestpractices_get` | ❌ | | 4 | 0.522998 | `deploy_iac_rules_get` | ❌ | @@ -11354,20 +13481,25 @@ | 2 | 0.570547 | `azureterraformbestpractices_get` | ❌ | ======= | 1 | 0.629031 | `get_bestpractices_get` | ✅ **EXPECTED** | +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 
0.570488 | `azureterraformbestpractices_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 3 | 0.522998 | `deploy_iac_rules_get` | ❌ | | 4 | 0.493998 | `deploy_pipeline_guidance_get` | ❌ | | 5 | 0.467377 | `extension_cli_install` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 308 ======= ## Test 318 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 323 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure Functions deployment best practices @@ -11377,10 +13509,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.581850 | `get_bestpractices_get` | ✅ **EXPECTED** | | 2 | 0.497056 | `deploy_pipeline_guidance_get` | ❌ | ======= | 1 | 0.584392 | `get_bestpractices_get` | ✅ **EXPECTED** | +======= +| 1 | 0.581850 | `get_bestpractices_get` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.497350 | `deploy_pipeline_guidance_get` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.495659 | `deploy_iac_rules_get` | ❌ | @@ -11389,6 +13525,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 319 ======= @@ -11398,6 +13535,9 @@ ## Test 319 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 324 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure Functions best practices @@ -11406,9 +13546,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.610986 | `get_bestpractices_get` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.532790 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.518386 | `azureaibestpractices_get` | ❌ | | 4 | 0.487322 | `deploy_iac_rules_get` | ❌ | @@ -11421,20 +13561,25 @@ | 2 | 
0.532921 | `azureterraformbestpractices_get` | ❌ | ======= | 1 | 0.612552 | `get_bestpractices_get` | ✅ **EXPECTED** | +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.532790 | `azureterraformbestpractices_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 3 | 0.487322 | `deploy_iac_rules_get` | ❌ | | 4 | 0.458060 | `deploy_pipeline_guidance_get` | ❌ | | 5 | 0.448034 | `extension_cli_install` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 310 ======= ## Test 320 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 325 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure Static Web Apps best practices @@ -11443,9 +13588,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.557862 | `get_bestpractices_get` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.513262 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.510399 | `azureaibestpractices_get` | ❌ | | 4 | 0.505123 | `deploy_iac_rules_get` | ❌ | @@ -11458,20 +13603,25 @@ | 2 | 0.513385 | `azureterraformbestpractices_get` | ❌ | ======= | 1 | 0.559184 | `get_bestpractices_get` | ✅ **EXPECTED** | +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.513262 | `azureterraformbestpractices_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 3 | 0.505123 | `deploy_iac_rules_get` | ❌ | | 4 | 0.483705 | `deploy_pipeline_guidance_get` | ❌ | | 5 | 0.421581 | `cloudarchitect_design` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 311 ======= ## Test 321 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 326 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `get_bestpractices_get` **Prompt:** What are azure function best practices? 
@@ -11480,9 +13630,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.582541 | `get_bestpractices_get` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.500368 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.475018 | `azureaibestpractices_get` | ❌ | | 4 | 0.472112 | `deploy_iac_rules_get` | ❌ | @@ -11495,20 +13645,25 @@ | 2 | 0.500479 | `azureterraformbestpractices_get` | ❌ | ======= | 1 | 0.584536 | `get_bestpractices_get` | ✅ **EXPECTED** | +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.500368 | `azureterraformbestpractices_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 3 | 0.472112 | `deploy_iac_rules_get` | ❌ | | 4 | 0.433134 | `deploy_pipeline_guidance_get` | ❌ | | 5 | 0.432087 | `cloudarchitect_design` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 312 ======= ## Test 322 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 327 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `get_bestpractices_get` **Prompt:** configure azure mcp in coding agent for my repo @@ -11517,10 +13672,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.488855 | `deploy_plan_get` | ❌ | | 2 | 0.460745 | `deploy_pipeline_guidance_get` | ❌ | | 3 | 0.390270 | `deploy_iac_rules_get` | ❌ | +<<<<<<< HEAD | 4 | 0.370753 | `azureaibestpractices_get` | ❌ | | 5 | 0.370298 | `azureterraformbestpractices_get` | ❌ | @@ -11542,6 +13697,14 @@ ## Test 323 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.370298 | `azureterraformbestpractices_get` | ❌ | +| 5 | 0.369169 | `extension_cli_install` | ❌ | + +--- + +## Test 328 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_activitylog_list` **Prompt:** List the activity logs of the last month for @@ 
-11551,11 +13714,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.537893 | `monitor_activitylog_list` | ✅ **EXPECTED** | ======= <<<<<<< HEAD | 1 | 0.537916 | `monitor_activitylog_list` | ✅ **EXPECTED** | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.537893 | `monitor_activitylog_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.506212 | `monitor_resource_log_query` | ❌ | | 3 | 0.371728 | `monitor_workspace_log_query` | ❌ | | 4 | 0.363798 | `resourcehealth_health-events_list` | ❌ | @@ -11563,6 +13730,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 324 ======= @@ -11579,6 +13747,9 @@ ## Test 324 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 329 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_healthmodels_entity_get` **Prompt:** Show me the health status of entity using the health model @@ -11587,15 +13758,19 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.660947 | `monitor_healthmodels_entity_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.608665 | `resourcehealth_availability-status_get` | ❌ | +======= +| 2 | 0.609276 | `resourcehealth_availability-status_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.351697 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.328321 | `resourcehealth_health-events_list` | ❌ | | 5 | 0.288127 | `foundry_models_deployments_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 325 ======= @@ -11612,6 +13787,9 @@ ## Test 325 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 330 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_metrics_definitions` **Prompt:** Get metric definitions for from the namespace @@ -11621,6 +13799,7 @@ 
| Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.592640 | `monitor_metrics_definitions` | ✅ **EXPECTED** | | 2 | 0.424141 | `monitor_metrics_query` | ❌ | | 3 | 0.368006 | `bicepschema_get` | ❌ | @@ -11634,12 +13813,17 @@ <<<<<<< HEAD | 1 | 0.592676 | `monitor_metrics_definitions` | ✅ **EXPECTED** | | 2 | 0.424006 | `monitor_metrics_query` | ❌ | +======= +| 1 | 0.592640 | `monitor_metrics_definitions` | ✅ **EXPECTED** | +| 2 | 0.424256 | `monitor_metrics_query` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.368319 | `bicepschema_get` | ❌ | | 4 | 0.332356 | `monitor_table_type_list` | ❌ | | 5 | 0.324986 | `resourcehealth_availability-status_get` | ❌ | --- +<<<<<<< HEAD ## Test 316 ======= | 1 | 0.592640 | `monitor_metrics_definitions` | ✅ **EXPECTED** | @@ -11653,6 +13837,9 @@ ## Test 326 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 331 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_metrics_definitions` **Prompt:** Show me all available metrics and their definitions for storage account @@ -11662,6 +13849,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.607600 | `storage_account_get` | ❌ | | 2 | 0.587736 | `monitor_metrics_definitions` | ✅ **EXPECTED** | | 3 | 0.544043 | `storage_blob_container_get` | ❌ | @@ -11682,16 +13870,23 @@ ## Test 317 ======= | 1 | 0.607575 | `storage_account_get` | ❌ | +======= +| 1 | 0.607600 | `storage_account_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.587736 | `monitor_metrics_definitions` | ✅ **EXPECTED** | -| 3 | 0.544781 | `storage_blob_container_get` | ❌ | +| 3 | 0.545035 | `storage_blob_container_get` | ❌ | | 4 | 0.495829 | `storage_blob_get` | ❌ | -| 5 | 0.473421 | `managedlustre_filesystem_list` | ❌ | +| 5 | 0.473421 | `managedlustre_fs_list` | ❌ | --- +<<<<<<< HEAD ## Test 
327 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 332 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_metrics_definitions` **Prompt:** What metric definitions are available for the Application Insights resource @@ -11701,6 +13896,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.633173 | `monitor_metrics_definitions` | ✅ **EXPECTED** | | 2 | 0.495513 | `monitor_metrics_query` | ❌ | ======= @@ -11712,12 +13908,17 @@ | 2 | 0.495513 | `monitor_metrics_query` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.633173 | `monitor_metrics_definitions` | ✅ **EXPECTED** | +| 2 | 0.495587 | `monitor_metrics_query` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.433945 | `monitor_resource_log_query` | ❌ | | 4 | 0.392960 | `loadtesting_testresource_list` | ❌ | | 5 | 0.388569 | `bicepschema_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 328 ======= @@ -11727,6 +13928,9 @@ ## Test 328 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 333 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_metrics_query` **Prompt:** Analyze the performance trends and response times for Application Insights resource over the last @@ -11735,6 +13939,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.555377 | `monitor_metrics_query` | ✅ **EXPECTED** | | 2 | 0.527530 | `monitor_resource_log_query` | ❌ | | 3 | 0.464743 | `applens_resource_diagnose` | ❌ | @@ -11752,6 +13957,17 @@ ## Test 329 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.555259 | 
`monitor_metrics_query` | ✅ **EXPECTED** | +| 2 | 0.527465 | `monitor_resource_log_query` | ❌ | +| 3 | 0.464988 | `applens_resource_diagnose` | ❌ | +| 4 | 0.420447 | `resourcehealth_health-events_list` | ❌ | +| 5 | 0.413438 | `applicationinsights_recommendation_list` | ❌ | + +--- + +## Test 334 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_metrics_query` **Prompt:** Check the availability metrics for my Application Insights resource for the last @@ -11760,6 +13976,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.557830 | `monitor_metrics_query` | ✅ **EXPECTED** | | 2 | 0.476671 | `monitor_resource_log_query` | ❌ | <<<<<<< HEAD @@ -11780,15 +13997,23 @@ ## Test 320 ======= +======= +| 1 | 0.558015 | `monitor_metrics_query` | ✅ **EXPECTED** | +| 2 | 0.476671 | `monitor_resource_log_query` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.460611 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.455904 | `quota_usage_check` | ❌ | | 5 | 0.438233 | `monitor_metrics_definitions` | ❌ | --- +<<<<<<< HEAD ## Test 330 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 335 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_metrics_query` **Prompt:** Get the metric for over the last with intervals @@ -11798,6 +14023,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.461249 | `monitor_metrics_query` | ✅ **EXPECTED** | | 2 | 0.390029 | `monitor_metrics_definitions` | ❌ | | 3 | 0.338557 | `monitor_resource_log_query` | ❌ | @@ -11820,16 +14046,23 @@ ## Test 321 ======= | 1 | 0.461249 | `monitor_metrics_query` | ✅ **EXPECTED** | +======= +| 1 | 0.461420 | `monitor_metrics_query` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.390029 | `monitor_metrics_definitions` | ❌ | | 3 | 0.338557 | 
`monitor_resource_log_query` | ❌ | -| 4 | 0.330533 | `resourcehealth_availability-status_get` | ❌ | +| 4 | 0.334519 | `resourcehealth_availability-status_get` | ❌ | | 5 | 0.306338 | `resourcehealth_availability-status_list` | ❌ | --- +<<<<<<< HEAD ## Test 331 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 336 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_metrics_query` **Prompt:** Investigate error rates and failed requests for Application Insights resource for the last @@ -11838,6 +14071,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.496878 | `monitor_resource_log_query` | ❌ | <<<<<<< HEAD | 2 | 0.492138 | `monitor_metrics_query` | ✅ **EXPECTED** | @@ -11866,6 +14100,17 @@ ## Test 332 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.496911 | `monitor_resource_log_query` | ❌ | +| 2 | 0.492280 | `monitor_metrics_query` | ✅ **EXPECTED** | +| 3 | 0.448203 | `applens_resource_diagnose` | ❌ | +| 4 | 0.412199 | `resourcehealth_health-events_list` | ❌ | +| 5 | 0.397367 | `quota_usage_check` | ❌ | + +--- + +## Test 337 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_metrics_query` **Prompt:** Query the metric for for the last @@ -11875,6 +14120,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.525890 | `monitor_metrics_query` | ✅ **EXPECTED** | | 2 | 0.405838 | `monitor_resource_log_query` | ❌ | | 3 | 0.384811 | `monitor_metrics_definitions` | ❌ | @@ -11886,14 +14132,17 @@ ## Test 333 ======= | 1 | 0.525326 | `monitor_metrics_query` | ✅ **EXPECTED** | +======= +| 1 | 0.525816 | `monitor_metrics_query` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.406185 | `monitor_resource_log_query` | ❌ | -<<<<<<< HEAD -| 3 
| 0.384524 | `monitor_metrics_definitions` | ❌ | +| 3 | 0.384482 | `monitor_metrics_definitions` | ❌ | | 4 | 0.347723 | `monitor_workspace_log_query` | ❌ | | 5 | 0.330713 | `resourcehealth_availability-status_get` | ❌ | --- +<<<<<<< HEAD ## Test 323 ======= | 3 | 0.384482 | `monitor_metrics_definitions` | ❌ | @@ -11905,6 +14154,9 @@ ## Test 333 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 338 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_metrics_query` **Prompt:** What's the request per second rate for my Application Insights resource over the last @@ -11913,6 +14165,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.480140 | `monitor_metrics_query` | ✅ **EXPECTED** | | 2 | 0.444779 | `monitor_resource_log_query` | ❌ | | 3 | 0.388382 | `applens_resource_diagnose` | ❌ | @@ -11922,10 +14175,17 @@ <<<<<<< HEAD | 4 | 0.363640 | `quota_usage_check` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.480194 | `monitor_metrics_query` | ✅ **EXPECTED** | +| 2 | 0.444779 | `monitor_resource_log_query` | ❌ | +| 3 | 0.388382 | `applens_resource_diagnose` | ❌ | +| 4 | 0.363412 | `quota_usage_check` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.350076 | `resourcehealth_health-events_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 334 ======= @@ -11939,6 +14199,9 @@ ## Test 334 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 339 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_resource_log_query` **Prompt:** Show me the logs for the past hour for the resource in the Log Analytics workspace @@ -11947,6 +14210,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.687852 | `monitor_resource_log_query` | ✅ 
**EXPECTED** | | 2 | 0.621919 | `monitor_workspace_log_query` | ❌ | <<<<<<< HEAD @@ -11974,6 +14238,17 @@ ## Test 335 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.687702 | `monitor_resource_log_query` | ✅ **EXPECTED** | +| 2 | 0.621740 | `monitor_workspace_log_query` | ❌ | +| 3 | 0.598494 | `monitor_activitylog_list` | ❌ | +| 4 | 0.485733 | `deploy_app_logs_get` | ❌ | +| 5 | 0.469848 | `monitor_metrics_query` | ❌ | + +--- + +## Test 340 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_table_list` **Prompt:** List all tables in the Log Analytics workspace @@ -11982,7 +14257,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.851075 | `monitor_table_list` | ✅ **EXPECTED** | <<<<<<< HEAD | 2 | 0.725693 | `monitor_table_type_list` | ❌ | @@ -11991,18 +14265,17 @@ ======= | 2 | 0.725738 | `monitor_table_type_list` | ❌ | | 3 | 0.620445 | `monitor_workspace_list` | ❌ | -| 4 | 0.541959 | `kusto_table_list` | ❌ | -======= -| 1 | 0.850522 | `monitor_table_list` | ✅ **EXPECTED** | -| 2 | 0.725738 | `monitor_table_type_list` | ❌ | -| 3 | 0.620445 | `monitor_workspace_list` | ❌ | | 4 | 0.541928 | `kusto_table_list` | ❌ | +<<<<<<< HEAD >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.539481 | `monitor_workspace_log_query` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 336 ======= @@ -12012,6 +14285,9 @@ ## Test 336 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 341 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_table_list` **Prompt:** Show me the tables in the Log Analytics workspace @@ -12021,16 +14297,16 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.798459 | `monitor_table_list` | ✅ **EXPECTED** | | 2 | 0.701092 | `monitor_table_type_list` | ❌ | | 3 | 0.600003 | `monitor_workspace_list` | ❌ | | 4 | 0.542820 | `monitor_workspace_log_query` | ❌ | ======= <<<<<<< HEAD -| 1 | 0.798460 | `monitor_table_list` | ✅ **EXPECTED** | ======= -| 1 | 0.798109 | `monitor_table_list` | ✅ **EXPECTED** | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.798460 | `monitor_table_list` | ✅ **EXPECTED** | | 2 | 0.701122 | `monitor_table_type_list` | ❌ | | 3 | 0.599917 | `monitor_workspace_list` | ❌ | | 4 | 0.542821 | `monitor_workspace_log_query` | ❌ | @@ -12039,6 +14315,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 337 ======= @@ -12048,6 +14325,9 @@ ## Test 337 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 342 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_table_type_list` **Prompt:** List all available table types in the Log Analytics workspace @@ -12063,20 +14343,19 @@ | 4 | 0.504683 | `mysql_table_list` | ❌ | ======= | 1 | 0.881524 | `monitor_table_type_list` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.765702 | `monitor_table_list` | ❌ | | 3 | 0.569921 | `monitor_workspace_list` | ❌ | -| 4 | 0.504789 | `mysql_table_list` | ❌ | -======= -| 2 | 0.765548 | `monitor_table_list` | ❌ | -| 3 | 0.569921 | `monitor_workspace_list` | ❌ | | 4 | 0.504683 | `mysql_table_list` | ❌ | +<<<<<<< HEAD >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.497622 | `monitor_workspace_log_query` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 338 ======= @@ -12086,6 +14365,9 @@ ## Test 338 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 
58ab8585 (update prompts and tool description evaluator) +======= +## Test 343 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_table_type_list` **Prompt:** Show me the available table types in the Log Analytics workspace @@ -12100,11 +14382,7 @@ | 3 | 0.576934 | `monitor_workspace_list` | ❌ | ======= | 1 | 0.843138 | `monitor_table_type_list` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.736837 | `monitor_table_list` | ❌ | -======= -| 2 | 0.736830 | `monitor_table_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 3 | 0.576731 | `monitor_workspace_list` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.509598 | `monitor_workspace_log_query` | ❌ | @@ -12112,6 +14390,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 339 ======= @@ -12121,6 +14400,9 @@ ## Test 339 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 344 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_webtests_create` **Prompt:** Create a new Standard Web Test with name in my subscription in in a given @@ -12130,6 +14412,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.651084 | `monitor_webtests_create` | ✅ **EXPECTED** | | 2 | 0.570105 | `monitor_webtests_list` | ❌ | | 3 | 0.550426 | `monitor_webtests_update` | ❌ | @@ -12162,6 +14445,17 @@ ## Test 340 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.650804 | `monitor_webtests_create` | ✅ **EXPECTED** | +| 2 | 0.570334 | `monitor_webtests_list` | ❌ | +| 3 | 0.550263 | `monitor_webtests_update` | ❌ | +| 4 | 0.533405 | `monitor_webtests_get` | ❌ | +| 5 | 0.482023 | `loadtesting_testresource_create` | ❌ | + +--- + +## Test 345 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_webtests_get` 
**Prompt:** Get Web Test details for in my subscription in @@ -12171,6 +14465,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.758910 | `monitor_webtests_get` | ✅ **EXPECTED** | | 2 | 0.725360 | `monitor_webtests_list` | ❌ | | 3 | 0.583663 | `loadtesting_testresource_list` | ❌ | @@ -12184,12 +14479,17 @@ <<<<<<< HEAD | 1 | 0.759380 | `monitor_webtests_get` | ✅ **EXPECTED** | | 2 | 0.725337 | `monitor_webtests_list` | ❌ | +======= +| 1 | 0.759015 | `monitor_webtests_get` | ✅ **EXPECTED** | +| 2 | 0.725442 | `monitor_webtests_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.583816 | `loadtesting_testresource_list` | ❌ | | 4 | 0.562797 | `monitor_webtests_update` | ❌ | | 5 | 0.530557 | `monitor_webtests_create` | ❌ | --- +<<<<<<< HEAD ## Test 331 ======= | 1 | 0.759062 | `monitor_webtests_get` | ✅ **EXPECTED** | @@ -12203,6 +14503,9 @@ ## Test 341 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 346 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_webtests_list` **Prompt:** List all Web Test resources in my subscription @@ -12212,6 +14515,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.730616 | `monitor_webtests_list` | ✅ **EXPECTED** | ======= <<<<<<< HEAD @@ -12233,13 +14537,23 @@ | 2 | 0.610160 | `loadtesting_testresource_list` | ❌ | | 3 | 0.547708 | `grafana_list` | ❌ | | 4 | 0.520829 | `redis_list` | ❌ | +======= +| 1 | 0.730616 | `monitor_webtests_list` | ✅ **EXPECTED** | +| 2 | 0.610160 | `loadtesting_testresource_list` | ❌ | +| 3 | 0.547708 | `grafana_list` | ❌ | +| 4 | 0.520842 | `redis_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.496166 | `monitor_webtests_get` | ❌ | --- +<<<<<<< HEAD ## Test 342 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and 
tool description evaluator) +======= +## Test 347 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_webtests_list` **Prompt:** List all Web Test resources in my subscription in @@ -12249,6 +14563,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.793807 | `monitor_webtests_list` | ✅ **EXPECTED** | | 2 | 0.675965 | `loadtesting_testresource_list` | ❌ | | 3 | 0.584429 | `monitor_webtests_get` | ❌ | @@ -12263,11 +14578,17 @@ | 3 | 0.584429 | `monitor_webtests_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.793807 | `monitor_webtests_list` | ✅ **EXPECTED** | +| 2 | 0.675965 | `loadtesting_testresource_list` | ❌ | +| 3 | 0.584429 | `monitor_webtests_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.573602 | `group_list` | ❌ | | 5 | 0.546088 | `resourcehealth_availability-status_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 343 ======= @@ -12277,6 +14598,9 @@ ## Test 343 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 348 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_webtests_update` **Prompt:** Update an existing Standard Web Test with name in my subscription in in a given @@ -12286,6 +14610,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.686427 | `monitor_webtests_update` | ✅ **EXPECTED** | | 2 | 0.558816 | `monitor_webtests_get` | ❌ | | 3 | 0.557828 | `monitor_webtests_create` | ❌ | @@ -12318,6 +14643,17 @@ ## Test 344 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.686426 | `monitor_webtests_update` | ✅ **EXPECTED** | +| 2 | 0.559273 | `monitor_webtests_get` | ❌ | +| 3 | 0.558221 | 
`monitor_webtests_create` | ❌ | +| 4 | 0.553741 | `monitor_webtests_list` | ❌ | +| 5 | 0.508780 | `loadtesting_testrun_update` | ❌ | + +--- + +## Test 349 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_workspace_list` **Prompt:** List all Log Analytics workspaces in my subscription @@ -12326,6 +14662,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.813871 | `monitor_workspace_list` | ✅ **EXPECTED** | | 2 | 0.680201 | `grafana_list` | ❌ | <<<<<<< HEAD @@ -12354,6 +14691,17 @@ ## Test 345 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.813506 | `monitor_workspace_list` | ✅ **EXPECTED** | +| 2 | 0.679650 | `grafana_list` | ❌ | +| 3 | 0.659506 | `monitor_table_list` | ❌ | +| 4 | 0.610550 | `kusto_cluster_list` | ❌ | +| 5 | 0.601012 | `search_service_list` | ❌ | + +--- + +## Test 350 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_workspace_list` **Prompt:** Show me my Log Analytics workspaces @@ -12363,6 +14711,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.656159 | `monitor_workspace_list` | ✅ **EXPECTED** | | 2 | 0.585355 | `monitor_table_list` | ❌ | | 3 | 0.531036 | `monitor_table_type_list` | ❌ | @@ -12374,6 +14723,8 @@ ## Test 346 ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.656194 | `monitor_workspace_list` | ✅ **EXPECTED** | | 2 | 0.585436 | `monitor_table_list` | ❌ | | 3 | 0.531083 | `monitor_table_type_list` | ❌ | @@ -12382,6 +14733,7 @@ --- +<<<<<<< HEAD ## Test 336 ======= | 1 | 0.656153 | `monitor_workspace_list` | ✅ **EXPECTED** | @@ -12395,6 +14747,9 @@ ## Test 346 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 351 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** 
`monitor_workspace_list` **Prompt:** Show me the Log Analytics workspaces in my subscription @@ -12406,9 +14761,12 @@ | 1 | 0.732964 | `monitor_workspace_list` | ✅ **EXPECTED** | | 2 | 0.601481 | `grafana_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.580244 | `monitor_table_list` | ❌ | ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.580261 | `monitor_table_list` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.523782 | `monitor_workspace_log_query` | ❌ | @@ -12416,6 +14774,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 347 ======= @@ -12430,6 +14789,9 @@ ## Test 347 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 352 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `monitor_workspace_log_query` **Prompt:** Show me the logs for the past hour in the Log Analytics workspace @@ -12438,10 +14800,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.610115 | `monitor_workspace_log_query` | ✅ **EXPECTED** | | 2 | 0.587614 | `monitor_resource_log_query` | ❌ | | 3 | 0.527733 | `monitor_activitylog_list` | ❌ | +<<<<<<< HEAD | 4 | 0.498148 | `deploy_app_logs_get` | ❌ | | 5 | 0.485982 | `monitor_table_list` | ❌ | @@ -12463,6 +14825,14 @@ ## Test 348 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.498269 | `deploy_app_logs_get` | ❌ | +| 5 | 0.485984 | `monitor_table_list` | ❌ | + +--- + +## Test 353 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `datadog_monitoredresources_list` **Prompt:** List all monitored resources in the Datadog resource @@ -12471,6 +14841,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.668828 | `datadog_monitoredresources_list` | ✅ **EXPECTED** | | 2 | 0.454270 | `redis_list` | ❌ | | 3 | 
0.413661 | `loadtesting_testresource_list` | ❌ | @@ -12483,10 +14854,17 @@ | 4 | 0.413173 | `monitor_metrics_query` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.668827 | `datadog_monitoredresources_list` | ✅ **EXPECTED** | +| 2 | 0.454295 | `redis_list` | ❌ | +| 3 | 0.413661 | `loadtesting_testresource_list` | ❌ | +| 4 | 0.413407 | `monitor_metrics_query` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.401731 | `grafana_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 349 ======= @@ -12496,6 +14874,9 @@ ## Test 349 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 354 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `datadog_monitoredresources_list` **Prompt:** Show me the monitored resources in the Datadog resource @@ -12505,13 +14886,19 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.624066 | `datadog_monitoredresources_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.443481 | `monitor_metrics_query` | ❌ | | 3 | 0.440052 | `redis_list` | ❌ | +======= +| 2 | 0.443652 | `monitor_metrics_query` | ❌ | +| 3 | 0.440095 | `redis_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.424391 | `monitor_resource_log_query` | ❌ | | 5 | 0.385122 | `loadtesting_testresource_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 350 ======= @@ -12521,6 +14908,9 @@ ## Test 350 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 355 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_azqr` **Prompt:** Check my Azure subscription for any compliance issues or recommendations @@ -12530,6 +14920,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.533403 | 
`quota_usage_check` | ❌ | | 2 | 0.481143 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.476826 | `extension_azqr` | ✅ **EXPECTED** | @@ -12541,15 +14932,17 @@ >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.471547 | `subscription_list` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.533164 | `quota_usage_check` | ❌ | | 2 | 0.481143 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.476826 | `extension_azqr` | ✅ **EXPECTED** | | 4 | 0.471499 | `subscription_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.468404 | `applens_resource_diagnose` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 351 ======= @@ -12559,6 +14952,9 @@ ## Test 351 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 356 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_azqr` **Prompt:** Provide compliance recommendations for my current Azure subscription @@ -12568,6 +14964,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.532792 | `azureterraformbestpractices_get` | ❌ | ======= <<<<<<< HEAD @@ -12575,15 +14972,17 @@ >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.492863 | `get_bestpractices_get` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.532792 | `azureterraformbestpractices_get` | ❌ | -| 2 | 0.492602 | `get_bestpractices_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.492863 | `get_bestpractices_get` | ❌ | | 3 | 0.476164 | `applicationinsights_recommendation_list` | ❌ | | 4 | 0.473365 | `deploy_iac_rules_get` | ❌ | | 5 | 0.468491 | `azureaibestpractices_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 352 ======= @@ -12593,6 +14992,9 @@ ## Test 352 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts 
and tool description evaluator) +======= +## Test 357 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_azqr` **Prompt:** Scan my Azure subscription for compliance recommendations @@ -12602,6 +15004,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.536917 | `azureterraformbestpractices_get` | ❌ | | 2 | 0.516910 | `extension_azqr` | ✅ **EXPECTED** | | 3 | 0.514947 | `applicationinsights_recommendation_list` | ❌ | @@ -12614,22 +15017,26 @@ ======= | 1 | 0.536984 | `azureterraformbestpractices_get` | ❌ | | 2 | 0.516810 | `extension_azqr` | ✅ **EXPECTED** | -| 3 | 0.514978 | `applicationinsights_recommendation_list` | ❌ | -<<<<<<< HEAD -| 4 | 0.504929 | `quota_usage_check` | ❌ | ======= +| 1 | 0.536934 | `azureterraformbestpractices_get` | ❌ | +| 2 | 0.516925 | `extension_azqr` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 3 | 0.514978 | `applicationinsights_recommendation_list` | ❌ | | 4 | 0.504673 | `quota_usage_check` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.494872 | `deploy_plan_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 343 ======= ## Test 353 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 358 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `quota_region_availability_list` **Prompt:** Show me the available regions for these resource types @@ -12640,6 +15047,7 @@ |------|-------|------|--------| | 1 | 0.590878 | `quota_region_availability_list` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.413662 | `quota_usage_check` | ❌ | ======= <<<<<<< HEAD @@ -12663,6 +15071,16 @@ ## Test 354 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.413274 | `quota_usage_check` | ❌ | +| 3 | 0.391361 | `redis_list` | ❌ | +| 4 | 
0.372940 | `resourcehealth_availability-status_list` | ❌ | +| 5 | 0.369855 | `managedlustre_fs_sku_get` | ❌ | + +--- + +## Test 359 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `quota_usage_check` **Prompt:** Check usage information for in region @@ -12672,6 +15090,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.609711 | `quota_usage_check` | ✅ **EXPECTED** | | 2 | 0.491058 | `quota_region_availability_list` | ❌ | | 3 | 0.384350 | `resourcehealth_availability-status_list` | ❌ | @@ -12702,6 +15121,17 @@ ## Test 355 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.609244 | `quota_usage_check` | ✅ **EXPECTED** | +| 2 | 0.491058 | `quota_region_availability_list` | ❌ | +| 3 | 0.384350 | `resourcehealth_availability-status_list` | ❌ | +| 4 | 0.376368 | `resourcehealth_availability-status_get` | ❌ | +| 5 | 0.371447 | `redis_list` | ❌ | + +--- + +## Test 360 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `role_assignment_list` **Prompt:** List all available role assignments in my subscription @@ -12710,14 +15140,20 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.645258 | `role_assignment_list` | ✅ **EXPECTED** | | 2 | 0.539757 | `subscription_list` | ❌ | +======= +| 1 | 0.645259 | `role_assignment_list` | ✅ **EXPECTED** | +| 2 | 0.539761 | `subscription_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.483988 | `group_list` | ❌ | | 4 | 0.478700 | `grafana_list` | ❌ | -| 5 | 0.471431 | `cosmos_account_list` | ❌ | +| 5 | 0.471364 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 356 ======= @@ -12727,6 +15163,9 @@ ## Test 356 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 361 +>>>>>>> e2fd2eac (refactor tts mcp tool) 
**Expected Tool:** `role_assignment_list` **Prompt:** Show me the available role assignments in my subscription @@ -12735,6 +15174,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.609704 | `role_assignment_list` | ✅ **EXPECTED** | | 2 | 0.514697 | `subscription_list` | ❌ | | 3 | 0.456956 | `grafana_list` | ❌ | @@ -12752,6 +15192,17 @@ ## Test 357 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.609705 | `role_assignment_list` | ✅ **EXPECTED** | +| 2 | 0.514696 | `subscription_list` | ❌ | +| 3 | 0.456956 | `grafana_list` | ❌ | +| 4 | 0.449210 | `eventgrid_subscription_list` | ❌ | +| 5 | 0.445176 | `redis_list` | ❌ | + +--- + +## Test 362 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `redis_list` **Prompt:** List all Redis resources in my subscription @@ -12761,6 +15212,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.810504 | `redis_list` | ✅ **EXPECTED** | | 2 | 0.587836 | `grafana_list` | ❌ | | 3 | 0.512954 | `kusto_cluster_list` | ❌ | @@ -12783,16 +15235,23 @@ ## Test 348 ======= | 1 | 0.810504 | `redis_list` | ✅ **EXPECTED** | +======= +| 1 | 0.810532 | `redis_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.587836 | `grafana_list` | ❌ | -| 3 | 0.512970 | `kusto_cluster_list` | ❌ | +| 3 | 0.512954 | `kusto_cluster_list` | ❌ | | 4 | 0.508531 | `datadog_monitoredresources_list` | ❌ | | 5 | 0.501218 | `postgres_server_list` | ❌ | --- +<<<<<<< HEAD ## Test 358 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 363 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `redis_list` **Prompt:** Show me my Redis resources @@ -12801,6 +15260,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 
0.685128 | `redis_list` | ✅ **EXPECTED** | | 2 | 0.374327 | `grafana_list` | ❌ | | 3 | 0.364197 | `datadog_monitoredresources_list` | ❌ | @@ -12813,10 +15273,17 @@ | 4 | 0.359659 | `mysql_server_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.685197 | `redis_list` | ✅ **EXPECTED** | +| 2 | 0.374328 | `grafana_list` | ❌ | +| 3 | 0.364197 | `datadog_monitoredresources_list` | ❌ | +| 4 | 0.359659 | `mysql_server_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.331502 | `mysql_database_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 359 ======= @@ -12826,6 +15293,9 @@ ## Test 359 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 364 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `redis_list` **Prompt:** Show me the Redis resources in my subscription @@ -12834,19 +15304,24 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.781228 | `redis_list` | ✅ **EXPECTED** | +| 1 | 0.781276 | `redis_list` | ✅ **EXPECTED** | | 2 | 0.539177 | `grafana_list` | ❌ | | 3 | 0.449276 | `datadog_monitoredresources_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.449014 | `postgres_server_list` | ❌ | ======= <<<<<<< HEAD | 4 | 0.448989 | `postgres_server_list` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.449014 | `postgres_server_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.442854 | `kusto_cluster_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 360 ======= @@ -12860,6 +15335,9 @@ ## Test 360 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 365 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `redis_list` **Prompt:** Show me my Redis caches @@ -12868,19 
+15346,24 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.572767 | `redis_list` | ✅ **EXPECTED** | +| 1 | 0.572836 | `redis_list` | ✅ **EXPECTED** | | 2 | 0.316630 | `mysql_database_list` | ❌ | | 3 | 0.301786 | `postgres_database_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.286513 | `mysql_server_list` | ❌ | ======= <<<<<<< HEAD | 4 | 0.286570 | `mysql_server_list` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.286513 | `mysql_server_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.273014 | `kusto_cluster_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 361 ======= @@ -12894,6 +15377,9 @@ ## Test 361 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 366 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `redis_list` **Prompt:** Get Redis clusters @@ -12903,6 +15389,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.478070 | `redis_list` | ✅ **EXPECTED** | | 2 | 0.456308 | `kusto_cluster_list` | ❌ | | 3 | 0.384630 | `kusto_cluster_get` | ❌ | @@ -12926,15 +15413,23 @@ ======= | 1 | 0.478070 | `redis_list` | ✅ **EXPECTED** | | 2 | 0.456311 | `kusto_cluster_list` | ❌ | +======= +| 1 | 0.478136 | `redis_list` | ✅ **EXPECTED** | +| 2 | 0.456309 | `kusto_cluster_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.384630 | `kusto_cluster_get` | ❌ | -| 4 | 0.359797 | `kusto_database_list` | ❌ | +| 4 | 0.359434 | `kusto_database_list` | ❌ | | 5 | 0.343305 | `aks_cluster_get` | ❌ | --- +<<<<<<< HEAD ## Test 362 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 367 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `group_list` **Prompt:** List all resource groups in my subscription @@ -12945,6 +15440,7 @@ 
|------|-------|------|--------| | 1 | 0.755935 | `group_list` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.566552 | `workbooks_list` | ❌ | ======= <<<<<<< HEAD @@ -12961,16 +15457,22 @@ ======= ## Test 353 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.566552 | `workbooks_list` | ❌ | | 3 | 0.564566 | `loadtesting_testresource_list` | ❌ | | 4 | 0.552633 | `datadog_monitoredresources_list` | ❌ | -| 5 | 0.546156 | `resourcehealth_availability-status_list` | ❌ | +| 5 | 0.549477 | `monitor_webtests_list` | ❌ | --- +<<<<<<< HEAD ## Test 363 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 368 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `group_list` **Prompt:** Show me my resource groups @@ -12980,9 +15482,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.529504 | `group_list` | ✅ **EXPECTED** | -| 2 | 0.464690 | `redis_list` | ❌ | +| 2 | 0.464725 | `redis_list` | ❌ | | 3 | 0.463685 | `datadog_monitoredresources_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.462391 | `mysql_server_list` | ❌ | ======= <<<<<<< HEAD @@ -12991,10 +15494,14 @@ | 4 | 0.462391 | `mysql_server_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.462391 | `mysql_server_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.460280 | `loadtesting_testresource_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 364 ======= @@ -13004,6 +15511,9 @@ ## Test 364 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 369 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `group_list` **Prompt:** Show me the resource groups in my subscription @@ -13014,12 +15524,18 @@ |------|-------|------|--------| | 1 | 0.665772 | 
`group_list` | ✅ **EXPECTED** | | 2 | 0.532656 | `datadog_monitoredresources_list` | ❌ | +<<<<<<< HEAD | 3 | 0.532505 | `redis_list` | ❌ | | 4 | 0.532015 | `eventgrid_topic_list` | ❌ | +======= +| 3 | 0.532524 | `redis_list` | ❌ | +| 4 | 0.532054 | `eventgrid_topic_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.531920 | `resourcehealth_availability-status_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 365 ======= @@ -13029,6 +15545,9 @@ ## Test 365 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 370 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `resourcehealth_availability-status_get` **Prompt:** Get the availability status for resource @@ -13038,6 +15557,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.556926 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | | 2 | 0.538273 | `resourcehealth_availability-status_list` | ❌ | | 3 | 0.378030 | `quota_usage_check` | ❌ | @@ -13047,11 +15567,17 @@ | 2 | 0.538277 | `resourcehealth_availability-status_list` | ❌ | | 3 | 0.377966 | `quota_usage_check` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.556629 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | +| 2 | 0.538273 | `resourcehealth_availability-status_list` | ❌ | +| 3 | 0.377586 | `quota_usage_check` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.373112 | `monitor_healthmodels_entity_get` | ❌ | | 5 | 0.349981 | `datadog_monitoredresources_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 366 ======= @@ -13068,6 +15594,9 @@ ## Test 366 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 371 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `resourcehealth_availability-status_get` **Prompt:** Show me the 
health status of the storage account @@ -13077,6 +15606,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.576591 | `storage_account_get` | ❌ | | 2 | 0.564706 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | | 3 | 0.555636 | `storage_blob_container_get` | ❌ | @@ -13096,17 +15626,23 @@ ======= ## Test 357 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.576591 | `storage_account_get` | ❌ | -| 2 | 0.566633 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | -| 3 | 0.556167 | `storage_blob_container_get` | ❌ | +| 2 | 0.564128 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | +| 3 | 0.556369 | `storage_blob_container_get` | ❌ | | 4 | 0.487207 | `storage_blob_get` | ❌ | | 5 | 0.466885 | `resourcehealth_availability-status_list` | ❌ | --- +<<<<<<< HEAD ## Test 367 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 372 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `resourcehealth_availability-status_get` **Prompt:** What is the availability status of virtual machine in resource group ? 
@@ -13116,6 +15652,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.577398 | `resourcehealth_availability-status_list` | ❌ | | 2 | 0.502794 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | | 3 | 0.424939 | `mysql_server_list` | ❌ | @@ -13135,17 +15672,23 @@ ======= ## Test 358 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.577398 | `resourcehealth_availability-status_list` | ❌ | -| 2 | 0.502457 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | +| 2 | 0.501568 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | | 3 | 0.424939 | `mysql_server_list` | ❌ | -| 4 | 0.413484 | `foundry_openai_models-list` | ❌ | -| 5 | 0.412025 | `loadtesting_testresource_list` | ❌ | +| 4 | 0.412025 | `loadtesting_testresource_list` | ❌ | +| 5 | 0.393479 | `managedlustre_fs_list` | ❌ | --- +<<<<<<< HEAD ## Test 368 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 373 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `resourcehealth_availability-status_list` **Prompt:** List availability status for all resources in my subscription @@ -13155,13 +15698,18 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.737219 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.585501 | `redis_list` | ❌ | +======= +| 2 | 0.585487 | `redis_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.549914 | `loadtesting_testresource_list` | ❌ | | 4 | 0.548549 | `grafana_list` | ❌ | -| 5 | 0.544514 | `subscription_list` | ❌ | +| 5 | 0.544505 | `subscription_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 369 ======= @@ -13171,6 +15719,9 @@ ## Test 369 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 374 +>>>>>>> e2fd2eac (refactor 
tts mcp tool) **Expected Tool:** `resourcehealth_availability-status_list` **Prompt:** Show me the health status of all my Azure resources @@ -13180,6 +15731,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.644982 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | | 2 | 0.544917 | `resourcehealth_availability-status_get` | ❌ | | 3 | 0.509740 | `resourcehealth_health-events_list` | ❌ | @@ -13187,14 +15739,13 @@ ======= <<<<<<< HEAD | 1 | 0.644908 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | -| 2 | 0.545208 | `resourcehealth_availability-status_get` | ❌ | -| 3 | 0.509740 | `resourcehealth_health-events_list` | ❌ | -| 4 | 0.508703 | `quota_usage_check` | ❌ | ======= | 1 | 0.644982 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | -| 2 | 0.546520 | `resourcehealth_availability-status_get` | ❌ | -| 3 | 0.509740 | `resourcehealth_service-health-events_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 2 | 0.545208 | `resourcehealth_availability-status_get` | ❌ | +| 3 | 0.509740 | `resourcehealth_health-events_list` | ❌ | | 4 | 0.508252 | `quota_usage_check` | ❌ | +<<<<<<< HEAD >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.505776 | `redis_list` | ❌ | @@ -13210,6 +15761,13 @@ ## Test 370 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 5 | 0.505799 | `redis_list` | ❌ | + +--- + +## Test 375 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `resourcehealth_availability-status_list` **Prompt:** What resources in resource group have health issues? 
@@ -13219,24 +15777,24 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.596890 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | | 2 | 0.550812 | `resourcehealth_availability-status_get` | ❌ | ======= <<<<<<< HEAD | 1 | 0.596817 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | +======= +| 1 | 0.596890 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.549900 | `resourcehealth_availability-status_get` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.496640 | `resourcehealth_health-events_list` | ❌ | -======= -| 1 | 0.596890 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | -| 2 | 0.551332 | `resourcehealth_availability-status_get` | ❌ | -| 3 | 0.496640 | `resourcehealth_service-health-events_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.441921 | `applens_resource_diagnose` | ❌ | | 5 | 0.433614 | `loadtesting_testresource_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 371 ======= @@ -13246,6 +15804,9 @@ ## Test 371 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 376 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** List all service health events in my subscription @@ -13255,6 +15816,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.690720 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | | 2 | 0.553485 | `search_service_list` | ❌ | | 3 | 0.534169 | `eventgrid_topic_list` | ❌ | @@ -13266,23 +15828,26 @@ ## Test 372 ======= <<<<<<< HEAD -| 1 | 0.690719 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | ======= -| 1 | 0.690719 | `resourcehealth_service-health-events_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool 
description evaluator) +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.690719 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | | 2 | 0.554895 | `search_service_list` | ❌ | | 3 | 0.534250 | `eventgrid_topic_list` | ❌ | | 4 | 0.529761 | `eventgrid_subscription_list` | ❌ | -| 5 | 0.518595 | `resourcehealth_availability-status_list` | ❌ | +| 5 | 0.518372 | `resourcehealth_availability-status_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 362 ======= ## Test 372 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 377 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** Show me Azure service health events for subscription @@ -13291,9 +15856,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.686448 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.534707 | `eventgrid_subscription_list` | ❌ | | 3 | 0.513302 | `search_service_list` | ❌ | | 4 | 0.513237 | `eventgrid_topic_list` | ❌ | @@ -13318,6 +15883,16 @@ ## Test 373 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.534556 | `eventgrid_subscription_list` | ❌ | +| 3 | 0.513815 | `search_service_list` | ❌ | +| 4 | 0.513259 | `eventgrid_topic_list` | ❌ | +| 5 | 0.501135 | `subscription_list` | ❌ | + +--- + +## Test 378 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** What service issues have occurred in the last 30 days? 
@@ -13327,6 +15902,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.450841 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | | 2 | 0.267663 | `applens_resource_diagnose` | ❌ | | 3 | 0.245720 | `cloudarchitect_design` | ❌ | @@ -13349,6 +15925,9 @@ ## Test 364 ======= | 1 | 0.450841 | `resourcehealth_service-health-events_list` | ❌ | +======= +| 1 | 0.450841 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.267663 | `applens_resource_diagnose` | ❌ | | 3 | 0.245720 | `cloudarchitect_design` | ❌ | | 4 | 0.216847 | `resourcehealth_availability-status_list` | ❌ | @@ -13356,9 +15935,13 @@ --- +<<<<<<< HEAD ## Test 374 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 379 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** List active service health events in my subscription @@ -13367,9 +15950,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.685391 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.527255 | `eventgrid_subscription_list` | ❌ | | 3 | 0.523975 | `eventgrid_topic_list` | ❌ | | 4 | 0.518668 | `search_service_list` | ❌ | @@ -13382,19 +15965,25 @@ ======= | 1 | 0.685391 | `resourcehealth_service-health-events_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.527905 | `eventgrid_subscription_list` | ❌ | | 3 | 0.524063 | `eventgrid_topic_list` | ❌ | | 4 | 0.520197 | `search_service_list` | ❌ | -| 5 | 0.502345 | `resourcehealth_availability-status_list` | ❌ | +| 5 | 0.502064 | `resourcehealth_availability-status_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 365 ======= ## Test 375 >>>>>>> 84ad4f44 (update prompts and tool 
description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 380 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** Show me planned maintenance events for my Azure services @@ -13403,9 +15992,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.565851 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.436322 | `search_service_list` | ❌ | | 3 | 0.404191 | `eventgrid_subscription_list` | ❌ | | 4 | 0.402493 | `resourcehealth_availability-status_list` | ❌ | @@ -13418,19 +16007,25 @@ ======= | 1 | 0.565851 | `resourcehealth_service-health-events_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.437868 | `search_service_list` | ❌ | | 3 | 0.403665 | `eventgrid_subscription_list` | ❌ | -| 4 | 0.402532 | `resourcehealth_availability-status_list` | ❌ | -| 5 | 0.398084 | `quota_usage_check` | ❌ | +| 4 | 0.402493 | `resourcehealth_availability-status_list` | ❌ | +| 5 | 0.397735 | `quota_usage_check` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 366 ======= ## Test 376 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 381 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `servicebus_queue_details` **Prompt:** Show me the details of service bus queue @@ -13444,6 +16039,7 @@ | 3 | 0.437000 | `servicebus_topic_details` | ❌ | | 4 | 0.385812 | `search_knowledge_base_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 5 | 0.384139 | `storage_account_get` | ❌ | --- @@ -13464,6 +16060,13 @@ ## Test 377 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 5 | 0.384139 | `storage_account_get` | ❌ | + +--- + +## Test 382 +>>>>>>> e2fd2eac 
(refactor tts mcp tool) **Expected Tool:** `servicebus_topic_details` **Prompt:** Show me the details of service bus topic @@ -13480,6 +16083,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 378 ======= @@ -13489,6 +16093,9 @@ ## Test 378 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 383 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `servicebus_topic_subscription_details` **Prompt:** Show me the details of service bus subscription @@ -13505,6 +16112,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 379 ======= @@ -13514,6 +16122,9 @@ ## Test 379 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 384 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `signalr_runtime_get` **Prompt:** Show me the details of SignalR @@ -13522,14 +16133,20 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.532742 | `signalr_runtime_get` | ✅ **EXPECTED** | | 2 | 0.355028 | `redis_list` | ❌ | +======= +| 1 | 0.532544 | `signalr_runtime_get` | ✅ **EXPECTED** | +| 2 | 0.355082 | `redis_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.329804 | `foundry_resource_get` | ❌ | | 4 | 0.319981 | `sql_server_show` | ❌ | | 5 | 0.304420 | `servicebus_queue_details` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 380 ======= @@ -13539,6 +16156,9 @@ ## Test 380 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 385 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `signalr_runtime_get` **Prompt:** Show me the network information of SignalR runtime @@ -13550,6 +16170,7 @@ | 1 | 0.573540 | `signalr_runtime_get` | ✅ **EXPECTED** | | 2 | 0.337342 | `sql_server_show` | ❌ | | 3 | 0.306559 | `foundry_resource_get` | ❌ | +<<<<<<< HEAD | 4 | 
0.305021 | `redis_list` | ❌ | | 5 | 0.301114 | `servicebus_topic_details` | ❌ | @@ -13564,6 +16185,14 @@ ## Test 381 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.305083 | `redis_list` | ❌ | +| 5 | 0.300956 | `servicebus_topic_details` | ❌ | + +--- + +## Test 386 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `signalr_runtime_get` **Prompt:** Describe the SignalR runtime in resource group @@ -13576,6 +16205,7 @@ | 2 | 0.411396 | `loadtesting_testresource_list` | ❌ | | 3 | 0.410606 | `foundry_resource_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.399412 | `resourcehealth_availability-status_list` | ❌ | | 5 | 0.382028 | `sql_server_list` | ❌ | @@ -13591,14 +16221,20 @@ ## Test 372 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.399412 | `resourcehealth_availability-status_list` | ❌ | -| 5 | 0.382152 | `sql_server_list` | ❌ | +| 5 | 0.382028 | `sql_server_list` | ❌ | --- +<<<<<<< HEAD ## Test 382 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 387 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `signalr_runtime_get` **Prompt:** Get information about my SignalR runtime in @@ -13608,6 +16244,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.715701 | `signalr_runtime_get` | ✅ **EXPECTED** | | 2 | 0.458894 | `foundry_resource_get` | ❌ | | 3 | 0.431212 | `resourcehealth_availability-status_list` | ❌ | @@ -13640,6 +16277,17 @@ ## Test 383 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.715974 | `signalr_runtime_get` | ✅ **EXPECTED** | +| 2 | 0.459045 | `foundry_resource_get` | ❌ | +| 3 | 0.430829 | `resourcehealth_availability-status_list` | ❌ | +| 4 | 0.430765 | 
`loadtesting_testresource_list` | ❌ | +| 5 | 0.417032 | `functionapp_get` | ❌ | + +--- + +## Test 388 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `signalr_runtime_get` **Prompt:** Show all the SignalRs information in @@ -13648,6 +16296,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.563883 | `signalr_runtime_get` | ✅ **EXPECTED** | | 2 | 0.501077 | `redis_list` | ❌ | <<<<<<< HEAD @@ -13659,11 +16308,17 @@ | 3 | 0.494478 | `resourcehealth_availability-status_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.564072 | `signalr_runtime_get` | ✅ **EXPECTED** | +| 2 | 0.501156 | `redis_list` | ❌ | +| 3 | 0.494478 | `resourcehealth_availability-status_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.481428 | `loadtesting_testresource_list` | ❌ | | 5 | 0.462090 | `mysql_server_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 384 ======= @@ -13673,6 +16328,9 @@ ## Test 384 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 389 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `signalr_runtime_get` **Prompt:** List all SignalRs in my subscription @@ -13681,6 +16339,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.530514 | `signalr_runtime_get` | ✅ **EXPECTED** | | 2 | 0.507654 | `postgres_server_list` | ❌ | | 3 | 0.495157 | `redis_list` | ❌ | @@ -13696,13 +16355,23 @@ ## Test 375 ======= | 4 | 0.494513 | `kusto_cluster_list` | ❌ | +======= +| 1 | 0.530646 | `signalr_runtime_get` | ✅ **EXPECTED** | +| 2 | 0.507653 | `postgres_server_list` | ❌ | +| 3 | 0.495179 | `redis_list` | ❌ | +| 4 | 0.494498 | `kusto_cluster_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.487856 | `subscription_list` | ❌ | --- +<<<<<<< HEAD ## Test 385 
>>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 390 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_db_create` **Prompt:** Create a new SQL database named in server @@ -13713,12 +16382,18 @@ |------|-------|------|--------| | 1 | 0.516780 | `sql_db_create` | ✅ **EXPECTED** | | 2 | 0.470892 | `sql_server_create` | ❌ | +<<<<<<< HEAD | 3 | 0.420389 | `sql_db_rename` | ❌ | | 4 | 0.408515 | `sql_db_delete` | ❌ | +======= +| 3 | 0.420504 | `sql_db_rename` | ❌ | +| 4 | 0.408628 | `sql_db_delete` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.404860 | `sql_server_delete` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 386 ======= @@ -13728,6 +16403,9 @@ ## Test 386 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 391 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_db_create` **Prompt:** Create a SQL database with Basic tier in server @@ -13743,6 +16421,7 @@ ======= | 3 | 0.437526 | `sql_server_delete` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.420843 | `sql_db_show` | ❌ | | 5 | 0.417661 | `sql_db_delete` | ❌ | @@ -13762,6 +16441,14 @@ ## Test 387 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.420843 | `sql_db_show` | ❌ | +| 5 | 0.417795 | `sql_db_delete` | ❌ | + +--- + +## Test 392 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_db_create` **Prompt:** Create a new database called on SQL server in resource group @@ -13775,6 +16462,7 @@ | 3 | 0.503938 | `sql_db_rename` | ❌ | | 4 | 0.494377 | `sql_db_show` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 5 | 0.473975 | `sql_db_list` | ❌ | --- @@ -13788,13 +16476,19 @@ ## Test 378 ======= +======= +>>>>>>> e2fd2eac (refactor 
tts mcp tool) | 5 | 0.473975 | `sql_db_list` | ❌ | --- +<<<<<<< HEAD ## Test 388 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 393 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_db_delete` **Prompt:** Delete the SQL database from server @@ -13803,7 +16497,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.568196 | `sql_db_delete` | ✅ **EXPECTED** | +| 1 | 0.568205 | `sql_db_delete` | ✅ **EXPECTED** | | 2 | 0.567412 | `sql_server_delete` | ❌ | | 3 | 0.391436 | `sql_db_rename` | ❌ | | 4 | 0.386721 | `sql_server_firewall-rule_delete` | ❌ | @@ -13811,6 +16505,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 389 ======= @@ -13820,6 +16515,9 @@ ## Test 389 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 394 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_db_delete` **Prompt:** Remove database from SQL server in resource group @@ -13829,6 +16527,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.567513 | `sql_server_delete` | ❌ | | 2 | 0.543440 | `sql_db_delete` | ✅ **EXPECTED** | | 3 | 0.500756 | `sql_db_show` | ❌ | @@ -13850,17 +16549,23 @@ ## Test 380 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.567513 | `sql_server_delete` | ❌ | -| 2 | 0.543440 | `sql_db_delete` | ✅ **EXPECTED** | +| 2 | 0.543468 | `sql_db_delete` | ✅ **EXPECTED** | | 3 | 0.500756 | `sql_db_show` | ❌ | | 4 | 0.481083 | `sql_db_rename` | ❌ | | 5 | 0.478729 | `sql_db_list` | ❌ | --- +<<<<<<< HEAD ## Test 390 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 395 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_db_delete` **Prompt:** Delete the database 
called on server @@ -13869,14 +16574,20 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.509916 | `sql_db_delete` | ✅ **EXPECTED** | | 2 | 0.490893 | `sql_server_delete` | ❌ | +======= +| 1 | 0.509939 | `sql_db_delete` | ✅ **EXPECTED** | +| 2 | 0.490892 | `sql_server_delete` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.364494 | `postgres_database_list` | ❌ | | 4 | 0.355416 | `mysql_database_list` | ❌ | | 5 | 0.347703 | `sql_db_rename` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 391 ======= @@ -13886,6 +16597,9 @@ ## Test 391 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 396 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_db_list` **Prompt:** List all databases in the Azure SQL server @@ -13895,6 +16609,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.643138 | `sql_db_list` | ✅ **EXPECTED** | | 2 | 0.639644 | `mysql_database_list` | ❌ | | 3 | 0.609116 | `postgres_database_list` | ❌ | @@ -13916,17 +16631,23 @@ ## Test 382 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.643186 | `sql_db_list` | ✅ **EXPECTED** | | 2 | 0.639694 | `mysql_database_list` | ❌ | | 3 | 0.609178 | `postgres_database_list` | ❌ | | 4 | 0.602890 | `cosmos_database_list` | ❌ | -| 5 | 0.569739 | `kusto_database_list` | ❌ | +| 5 | 0.570140 | `kusto_database_list` | ❌ | --- +<<<<<<< HEAD ## Test 392 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 397 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_db_list` **Prompt:** Show me all the databases configuration details in the Azure SQL server @@ -13937,6 +16658,7 @@ |------|-------|------|--------| | 1 | 0.617746 | `sql_server_show` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.609322 | 
`sql_db_list` | ✅ **EXPECTED** | ======= <<<<<<< HEAD @@ -13945,12 +16667,16 @@ | 2 | 0.609322 | `sql_db_list` | ✅ **EXPECTED** | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.609322 | `sql_db_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.557353 | `mysql_database_list` | ❌ | | 4 | 0.553488 | `mysql_server_config_get` | ❌ | | 5 | 0.524274 | `sql_db_show` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 393 ======= @@ -13960,6 +16686,9 @@ ## Test 393 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 398 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_db_rename` **Prompt:** Rename the SQL database on server to @@ -13969,6 +16698,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.593251 | `sql_db_rename` | ✅ **EXPECTED** | | 2 | 0.425282 | `sql_server_delete` | ❌ | | 3 | 0.416207 | `sql_db_delete` | ❌ | @@ -13990,17 +16720,23 @@ ## Test 384 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.593348 | `sql_db_rename` | ✅ **EXPECTED** | | 2 | 0.425282 | `sql_server_delete` | ❌ | -| 3 | 0.416207 | `sql_db_delete` | ❌ | +| 3 | 0.416267 | `sql_db_delete` | ❌ | | 4 | 0.396947 | `sql_db_create` | ❌ | | 5 | 0.346018 | `sql_db_show` | ❌ | --- +<<<<<<< HEAD ## Test 394 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 399 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_db_rename` **Prompt:** Rename my Azure SQL database to on server @@ -14010,6 +16746,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.711257 | `sql_db_rename` | ✅ **EXPECTED** | | 2 | 0.516770 | `sql_server_delete` | ❌ | | 3 | 0.506834 | 
`sql_db_delete` | ❌ | @@ -14042,6 +16779,17 @@ ## Test 395 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.711063 | `sql_db_rename` | ✅ **EXPECTED** | +| 2 | 0.516485 | `sql_server_delete` | ❌ | +| 3 | 0.506579 | `sql_db_delete` | ❌ | +| 4 | 0.501476 | `sql_db_create` | ❌ | +| 5 | 0.433897 | `sql_server_show` | ❌ | + +--- + +## Test 400 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_db_show` **Prompt:** Get the configuration details for the SQL database on server @@ -14051,6 +16799,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.610991 | `sql_server_show` | ❌ | | 2 | 0.593150 | `postgres_server_config_get` | ❌ | | 3 | 0.530422 | `mysql_server_config_get` | ❌ | @@ -14072,6 +16821,8 @@ ## Test 386 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.610991 | `sql_server_show` | ❌ | | 2 | 0.593150 | `postgres_server_config_get` | ❌ | | 3 | 0.530422 | `mysql_server_config_get` | ❌ | @@ -14080,9 +16831,13 @@ --- +<<<<<<< HEAD ## Test 396 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 401 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_db_show` **Prompt:** Show me the details of SQL database in server @@ -14092,6 +16847,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.530095 | `sql_db_show` | ✅ **EXPECTED** | | 2 | 0.503681 | `sql_server_show` | ❌ | | 3 | 0.440073 | `sql_db_list` | ❌ | @@ -14124,6 +16880,17 @@ ## Test 397 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.530040 | `sql_db_show` | ✅ **EXPECTED** | +| 2 | 0.503614 | `sql_server_show` | ❌ | +| 3 | 0.440041 | `sql_db_list` | ❌ | +| 4 | 0.438628 | 
`mysql_table_schema_get` | ❌ | +| 5 | 0.432915 | `mysql_database_list` | ❌ | + +--- + +## Test 402 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_db_update` **Prompt:** Update the performance tier of SQL database on server @@ -14133,6 +16900,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.603271 | `sql_db_update` | ✅ **EXPECTED** | | 2 | 0.467571 | `sql_db_create` | ❌ | | 3 | 0.440442 | `sql_db_rename` | ❌ | @@ -14165,6 +16933,17 @@ ## Test 398 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.603376 | `sql_db_update` | ✅ **EXPECTED** | +| 2 | 0.467571 | `sql_db_create` | ❌ | +| 3 | 0.440493 | `sql_db_rename` | ❌ | +| 4 | 0.427621 | `sql_db_show` | ❌ | +| 5 | 0.413941 | `sql_server_delete` | ❌ | + +--- + +## Test 403 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_db_update` **Prompt:** Scale SQL database on server to use SKU @@ -14174,6 +16953,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.550449 | `sql_db_update` | ✅ **EXPECTED** | | 2 | 0.418358 | `sql_server_delete` | ❌ | | 3 | 0.401817 | `sql_db_list` | ❌ | @@ -14196,6 +16976,9 @@ ## Test 389 ======= | 1 | 0.550556 | `sql_db_update` | ✅ **EXPECTED** | +======= +| 1 | 0.550661 | `sql_db_update` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.418358 | `sql_server_delete` | ❌ | | 3 | 0.401817 | `sql_db_list` | ❌ | | 4 | 0.395518 | `sql_db_rename` | ❌ | @@ -14203,9 +16986,13 @@ --- +<<<<<<< HEAD ## Test 399 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 404 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_elastic-pool_list` **Prompt:** List all elastic pools in SQL server @@ -14216,6 +17003,7 @@ |------|-------|------|--------| | 
1 | 0.678124 | `sql_elastic-pool_list` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.502376 | `sql_db_list` | ❌ | ======= <<<<<<< HEAD @@ -14224,12 +17012,16 @@ | 2 | 0.502376 | `sql_db_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.502376 | `sql_db_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.498367 | `mysql_database_list` | ❌ | | 4 | 0.485249 | `aks_nodepool_get` | ❌ | | 5 | 0.479044 | `sql_server_show` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 400 ======= @@ -14239,6 +17031,9 @@ ## Test 400 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 405 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_elastic-pool_list` **Prompt:** Show me the elastic pools configured for SQL server @@ -14248,6 +17043,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.606425 | `sql_elastic-pool_list` | ✅ **EXPECTED** | | 2 | 0.502877 | `sql_server_show` | ❌ | | 3 | 0.457164 | `sql_db_list` | ❌ | @@ -14269,6 +17065,8 @@ ## Test 391 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.606425 | `sql_elastic-pool_list` | ✅ **EXPECTED** | | 2 | 0.502877 | `sql_server_show` | ❌ | | 3 | 0.457163 | `sql_db_list` | ❌ | @@ -14277,9 +17075,13 @@ --- +<<<<<<< HEAD ## Test 401 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 406 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_elastic-pool_list` **Prompt:** What elastic pools are available in my SQL server ? 
@@ -14292,6 +17094,7 @@ | 2 | 0.420325 | `mysql_database_list` | ❌ | | 3 | 0.407169 | `aks_nodepool_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.402616 | `mysql_server_list` | ❌ | | 5 | 0.397670 | `sql_db_list` | ❌ | @@ -14307,14 +17110,20 @@ ## Test 392 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.402616 | `mysql_server_list` | ❌ | | 5 | 0.397670 | `sql_db_list` | ❌ | --- +<<<<<<< HEAD ## Test 402 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 407 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_create` **Prompt:** Create a new Azure SQL server named in resource group @@ -14324,6 +17133,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.682605 | `sql_server_create` | ✅ **EXPECTED** | | 2 | 0.563707 | `sql_db_create` | ❌ | | 3 | 0.529198 | `sql_server_list` | ❌ | @@ -14356,6 +17166,17 @@ ## Test 403 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.682606 | `sql_server_create` | ✅ **EXPECTED** | +| 2 | 0.563708 | `sql_db_create` | ❌ | +| 3 | 0.529198 | `sql_server_list` | ❌ | +| 4 | 0.482102 | `storage_account_create` | ❌ | +| 5 | 0.474207 | `sql_db_rename` | ❌ | + +--- + +## Test 408 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_create` **Prompt:** Create an Azure SQL server with name in location with admin user @@ -14365,6 +17186,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.618354 | `sql_server_create` | ✅ **EXPECTED** | | 2 | 0.510222 | `sql_db_create` | ❌ | ======= @@ -14383,6 +17205,8 @@ ======= ## Test 394 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.618309 | `sql_server_create` | ✅ **EXPECTED** | | 2 | 0.510169 | `sql_db_create` | ❌ | | 3 | 0.472463 | 
`sql_server_show` | ❌ | @@ -14391,9 +17215,13 @@ --- +<<<<<<< HEAD ## Test 404 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 409 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_create` **Prompt:** Set up a new SQL server called in my resource group @@ -14404,6 +17232,7 @@ |------|-------|------|--------| | 1 | 0.589818 | `sql_server_create` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.501403 | `sql_db_create` | ❌ | | 3 | 0.497890 | `sql_server_list` | ❌ | | 4 | 0.461147 | `sql_db_rename` | ❌ | @@ -14423,16 +17252,22 @@ ## Test 395 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.501403 | `sql_db_create` | ❌ | -| 3 | 0.498298 | `sql_server_list` | ❌ | +| 3 | 0.497890 | `sql_server_list` | ❌ | | 4 | 0.461181 | `sql_db_rename` | ❌ | | 5 | 0.442934 | `mysql_server_list` | ❌ | --- +<<<<<<< HEAD ## Test 405 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 410 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_delete` **Prompt:** Delete the Azure SQL server from resource group @@ -14442,6 +17277,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.656593 | `sql_server_delete` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.548064 | `sql_db_delete` | ❌ | <<<<<<< HEAD | 3 | 0.518037 | `sql_server_list` | ❌ | @@ -14452,11 +17288,16 @@ | 3 | 0.518201 | `sql_server_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.548024 | `sql_db_delete` | ❌ | +| 3 | 0.518036 | `sql_server_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.495550 | `sql_server_create` | ❌ | | 5 | 0.483132 | `workbooks_delete` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 406 ======= @@ -14466,6 +17307,9 
@@ ## Test 406 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 411 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_delete` **Prompt:** Remove the SQL server from my subscription @@ -14476,6 +17320,7 @@ |------|-------|------|--------| | 1 | 0.615073 | `sql_server_delete` | ✅ **EXPECTED** | | 2 | 0.393885 | `postgres_server_list` | ❌ | +<<<<<<< HEAD | 3 | 0.379760 | `sql_db_delete` | ❌ | | 4 | 0.376660 | `sql_server_show` | ❌ | <<<<<<< HEAD @@ -14499,6 +17344,15 @@ ## Test 407 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.379763 | `sql_db_delete` | ❌ | +| 4 | 0.376660 | `sql_server_show` | ❌ | +| 5 | 0.350103 | `sql_server_list` | ❌ | + +--- + +## Test 412 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_delete` **Prompt:** Delete SQL server permanently @@ -14508,6 +17362,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.624310 | `sql_server_delete` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.454892 | `sql_db_delete` | ❌ | | 3 | 0.362561 | `sql_server_firewall-rule_delete` | ❌ | | 4 | 0.341503 | `sql_server_show` | ❌ | @@ -14525,13 +17380,22 @@ ## Test 398 ======= +======= +| 2 | 0.454907 | `sql_db_delete` | ❌ | +| 3 | 0.362389 | `sql_server_firewall-rule_delete` | ❌ | +| 4 | 0.341503 | `sql_server_show` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.318758 | `eventhubs_eventhub_delete` | ❌ | --- +<<<<<<< HEAD ## Test 408 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 413 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_entra-admin_list` **Prompt:** List Microsoft Entra ID administrators for SQL server @@ -14543,6 +17407,7 @@ | 1 | 0.783479 | 
`sql_server_entra-admin_list` | ✅ **EXPECTED** | | 2 | 0.456051 | `sql_server_show` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.434868 | `sql_server_list` | ❌ | | 4 | 0.401854 | `sql_server_firewall-rule_list` | ❌ | | 5 | 0.376055 | `sql_db_list` | ❌ | @@ -14562,13 +17427,21 @@ ======= | 3 | 0.434776 | `sql_server_list` | ❌ | | 4 | 0.401880 | `sql_server_firewall-rule_list` | ❌ | +======= +| 3 | 0.434868 | `sql_server_list` | ❌ | +| 4 | 0.401878 | `sql_server_firewall-rule_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.376055 | `sql_db_list` | ❌ | --- +<<<<<<< HEAD ## Test 409 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 414 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_entra-admin_list` **Prompt:** Show me the Entra ID administrators configured for SQL server @@ -14577,6 +17450,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.713306 | `sql_server_entra-admin_list` | ✅ **EXPECTED** | | 2 | 0.413144 | `sql_server_show` | ❌ | <<<<<<< HEAD @@ -14606,6 +17480,17 @@ ## Test 410 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.713093 | `sql_server_entra-admin_list` | ✅ **EXPECTED** | +| 2 | 0.412505 | `sql_server_show` | ❌ | +| 3 | 0.368257 | `sql_server_list` | ❌ | +| 4 | 0.315605 | `sql_db_list` | ❌ | +| 5 | 0.310940 | `postgres_server_list` | ❌ | + +--- + +## Test 415 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_entra-admin_list` **Prompt:** What Microsoft Entra ID administrators are set up for my SQL server ? 
@@ -14617,6 +17502,7 @@ | 1 | 0.646419 | `sql_server_entra-admin_list` | ✅ **EXPECTED** | | 2 | 0.356025 | `sql_server_show` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.322155 | `sql_server_list` | ❌ | ======= <<<<<<< HEAD @@ -14625,11 +17511,15 @@ | 3 | 0.322362 | `sql_server_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.322155 | `sql_server_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.307823 | `sql_server_create` | ❌ | | 5 | 0.269788 | `sql_server_delete` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 411 ======= @@ -14639,6 +17529,9 @@ ## Test 411 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 416 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_firewall-rule_create` **Prompt:** Create a firewall rule for my Azure SQL server @@ -14653,11 +17546,7 @@ | 3 | 0.522133 | `sql_server_firewall-rule_delete` | ❌ | ======= | 1 | 0.635466 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.532712 | `sql_server_firewall-rule_list` | ❌ | -======= -| 2 | 0.532682 | `sql_server_firewall-rule_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.532758 | `sql_server_firewall-rule_list` | ❌ | | 3 | 0.522184 | `sql_server_firewall-rule_delete` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.448822 | `sql_server_create` | ❌ | @@ -14665,6 +17554,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 412 ======= @@ -14674,6 +17564,9 @@ ## Test 412 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 417 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_firewall-rule_create` **Prompt:** Add a firewall rule to allow access from IP range to for 
SQL server @@ -14683,6 +17576,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.670392 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | | 2 | 0.533587 | `sql_server_firewall-rule_list` | ❌ | | 3 | 0.503740 | `sql_server_firewall-rule_delete` | ❌ | @@ -14715,6 +17609,17 @@ ## Test 413 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.670186 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | +| 2 | 0.533573 | `sql_server_firewall-rule_list` | ❌ | +| 3 | 0.503564 | `sql_server_firewall-rule_delete` | ❌ | +| 4 | 0.316641 | `sql_server_list` | ❌ | +| 5 | 0.302391 | `sql_server_delete` | ❌ | + +--- + +## Test 418 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_firewall-rule_create` **Prompt:** Create a new firewall rule named for SQL server @@ -14735,26 +17640,20 @@ ## Test 414 ======= | 1 | 0.685107 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.574336 | `sql_server_firewall-rule_list` | ❌ | -| 3 | 0.539577 | `sql_server_firewall-rule_delete` | ❌ | -| 4 | 0.428919 | `sql_server_create` | ❌ | -| 5 | 0.394446 | `sql_db_create` | ❌ | - ---- - -## Test 404 -======= -| 2 | 0.574310 | `sql_server_firewall-rule_list` | ❌ | +| 2 | 0.574431 | `sql_server_firewall-rule_list` | ❌ | | 3 | 0.539577 | `sql_server_firewall-rule_delete` | ❌ | | 4 | 0.428919 | `sql_server_create` | ❌ | | 5 | 0.395165 | `sql_db_create` | ❌ | --- +<<<<<<< HEAD ## Test 414 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 419 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_firewall-rule_delete` **Prompt:** Delete a firewall rule from my Azure SQL server @@ -14766,15 +17665,20 @@ | 1 | 0.691498 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | | 2 | 0.584379 | 
`sql_server_delete` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.543780 | `sql_server_firewall-rule_list` | ❌ | ======= | 3 | 0.543839 | `sql_server_firewall-rule_list` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.543913 | `sql_server_firewall-rule_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.540333 | `sql_server_firewall-rule_create` | ❌ | -| 5 | 0.498444 | `sql_db_delete` | ❌ | +| 5 | 0.498448 | `sql_db_delete` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 415 ======= @@ -14784,6 +17688,9 @@ ## Test 415 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 420 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_firewall-rule_delete` **Prompt:** Remove the firewall rule from SQL server @@ -14797,14 +17704,19 @@ | 2 | 0.574296 | `sql_server_firewall-rule_list` | ❌ | ======= | 1 | 0.670179 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.574321 | `sql_server_firewall-rule_list` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.574448 | `sql_server_firewall-rule_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.530419 | `sql_server_firewall-rule_create` | ❌ | | 4 | 0.488418 | `sql_server_delete` | ❌ | -| 5 | 0.360381 | `sql_db_delete` | ❌ | +| 5 | 0.360401 | `sql_db_delete` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 416 ======= @@ -14814,6 +17726,9 @@ ## Test 416 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 421 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_firewall-rule_delete` **Prompt:** Delete firewall rule for SQL server @@ -14827,6 +17742,7 @@ | 2 | 0.601174 | `sql_server_firewall-rule_list` | ❌ | ======= | 1 | 0.671212 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | +<<<<<<< HEAD 
| 2 | 0.601217 | `sql_server_firewall-rule_list` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.577330 | `sql_server_firewall-rule_create` | ❌ | @@ -14844,6 +17760,16 @@ ## Test 417 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.601324 | `sql_server_firewall-rule_list` | ❌ | +| 3 | 0.577330 | `sql_server_firewall-rule_create` | ❌ | +| 4 | 0.499272 | `sql_server_delete` | ❌ | +| 5 | 0.378589 | `sql_db_delete` | ❌ | + +--- + +## Test 422 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_firewall-rule_list` **Prompt:** List all firewall rules for SQL server @@ -14853,14 +17779,19 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.729336 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | ======= | 1 | 0.729320 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.729415 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.549667 | `sql_server_firewall-rule_create` | ❌ | | 3 | 0.513187 | `sql_server_firewall-rule_delete` | ❌ | | 4 | 0.468812 | `sql_server_show` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 5 | 0.418817 | `sql_server_list` | ❌ | --- @@ -14881,6 +17812,13 @@ ## Test 418 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 5 | 0.418817 | `sql_server_list` | ❌ | + +--- + +## Test 423 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_firewall-rule_list` **Prompt:** Show me the firewall rules for SQL server @@ -14889,11 +17827,12 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.630671 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | +| 1 | 0.630795 | `sql_server_firewall-rule_list` | ✅ 
**EXPECTED** | | 2 | 0.524126 | `sql_server_firewall-rule_create` | ❌ | | 3 | 0.476792 | `sql_server_firewall-rule_delete` | ❌ | | 4 | 0.410680 | `sql_server_show` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 5 | 0.348100 | `sql_server_list` | ❌ | --- @@ -14914,6 +17853,13 @@ ## Test 419 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 5 | 0.348100 | `sql_server_list` | ❌ | + +--- + +## Test 424 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_firewall-rule_list` **Prompt:** What firewall rules are configured for my SQL server ? @@ -14923,14 +17869,19 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.630460 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | ======= | 1 | 0.630494 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.630582 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.532454 | `sql_server_firewall-rule_create` | ❌ | | 3 | 0.473596 | `sql_server_firewall-rule_delete` | ❌ | | 4 | 0.412957 | `sql_server_show` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 5 | 0.350513 | `sql_server_list` | ❌ | --- @@ -14951,6 +17902,13 @@ ## Test 420 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 5 | 0.350513 | `sql_server_list` | ❌ | + +--- + +## Test 425 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_list` **Prompt:** List all Azure SQL servers in resource group @@ -14960,6 +17918,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.694404 | `sql_server_list` | ✅ **EXPECTED** | | 2 | 0.596686 | `mysql_server_list` | ❌ | | 3 | 0.578238 | `sql_db_list` | ❌ | @@ -14974,11 +17933,17 @@ | 3 | 0.578239 | 
`sql_db_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.694404 | `sql_server_list` | ✅ **EXPECTED** | +| 2 | 0.596686 | `mysql_server_list` | ❌ | +| 3 | 0.578239 | `sql_db_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.515851 | `sql_elastic-pool_list` | ❌ | | 5 | 0.509789 | `sql_db_show` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 421 ======= @@ -14988,6 +17953,9 @@ ## Test 421 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 426 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_list` **Prompt:** Show me every SQL server available in resource group @@ -14997,6 +17965,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.618218 | `sql_server_list` | ✅ **EXPECTED** | | 2 | 0.593837 | `mysql_server_list` | ❌ | | 3 | 0.542398 | `sql_db_list` | ❌ | @@ -15014,10 +17983,17 @@ | 4 | 0.507404 | `resourcehealth_availability-status_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.618218 | `sql_server_list` | ✅ **EXPECTED** | +| 2 | 0.593837 | `mysql_server_list` | ❌ | +| 3 | 0.542398 | `sql_db_list` | ❌ | +| 4 | 0.507404 | `resourcehealth_availability-status_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.496200 | `group_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 422 ======= @@ -15027,6 +18003,9 @@ ## Test 422 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 427 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_show` **Prompt:** Show me the details of Azure SQL server in resource group @@ -15038,6 +18017,7 @@ | 1 | 0.629672 | 
`sql_db_show` | ❌ | | 2 | 0.595184 | `sql_server_show` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.587728 | `sql_server_list` | ❌ | | 4 | 0.559893 | `mysql_server_list` | ❌ | | 5 | 0.540218 | `sql_db_list` | ❌ | @@ -15056,14 +18036,21 @@ ## Test 413 ======= | 3 | 0.587806 | `sql_server_list` | ❌ | +======= +| 3 | 0.587728 | `sql_server_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.559893 | `mysql_server_list` | ❌ | | 5 | 0.540218 | `sql_db_list` | ❌ | --- +<<<<<<< HEAD ## Test 423 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 428 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_show` **Prompt:** Get the configuration details for SQL server @@ -15080,6 +18067,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 424 ======= @@ -15089,6 +18077,9 @@ ## Test 424 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 429 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `sql_server_show` **Prompt:** Display the properties of SQL server @@ -15105,15 +18096,20 @@ ======= | 3 | 0.380021 | `postgres_server_param_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.372194 | `sql_server_firewall-rule_list` | ❌ | ======= | 4 | 0.372172 | `sql_server_firewall-rule_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.372179 | `sql_server_firewall-rule_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.370539 | `sql_db_show` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 425 ======= @@ -15123,6 +18119,9 @@ ## Test 425 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 430 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** 
`storage_account_create` **Prompt:** Create a new storage account called testaccount123 in East US region @@ -15134,6 +18133,7 @@ | 1 | 0.533552 | `storage_account_create` | ✅ **EXPECTED** | | 2 | 0.438046 | `storage_blob_container_create` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.418191 | `storage_account_get` | ❌ | | 4 | 0.413950 | `storage_blob_container_get` | ❌ | | 5 | 0.373651 | `managedlustre_fs_create` | ❌ | @@ -15148,16 +18148,24 @@ | 3 | 0.418134 | `storage_account_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.414518 | `storage_blob_container_get` | ❌ | +======= +| 3 | 0.418191 | `storage_account_get` | ❌ | +| 4 | 0.414964 | `storage_blob_container_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.370957 | `managedlustre_fs_create` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 416 ======= ## Test 426 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 431 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_account_create` **Prompt:** Create a storage account with premium performance and LRS replication @@ -15168,6 +18176,7 @@ |------|-------|------|--------| | 1 | 0.500638 | `storage_account_create` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.484584 | `managedlustre_fs_create` | ❌ | | 3 | 0.407222 | `storage_account_get` | ❌ | ======= @@ -15179,11 +18188,16 @@ | 3 | 0.407200 | `storage_account_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.483202 | `managedlustre_fs_create` | ❌ | +| 3 | 0.407222 | `storage_account_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.406804 | `storage_blob_container_create` | ❌ | | 5 | 0.400134 | `managedlustre_fs_sku_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 427 ======= @@ -15193,6 +18207,9 @@ ## Test 427 >>>>>>> 84ad4f44 
(update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 432 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_account_create` **Prompt:** Create a new storage account with Data Lake Storage Gen2 enabled @@ -15205,6 +18222,7 @@ | 2 | 0.538023 | `managedlustre_fs_create` | ❌ | | 3 | 0.509731 | `storage_blob_container_create` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.462519 | `storage_account_get` | ❌ | | 5 | 0.447156 | `sql_db_create` | ❌ | @@ -15221,13 +18239,20 @@ ## Test 418 ======= | 4 | 0.462480 | `storage_account_get` | ❌ | +======= +| 4 | 0.462519 | `storage_account_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.447156 | `sql_db_create` | ❌ | --- +<<<<<<< HEAD ## Test 428 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 433 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_account_get` **Prompt:** Show me the details for my storage account @@ -15237,6 +18262,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.673750 | `storage_account_get` | ✅ **EXPECTED** | | 2 | 0.607762 | `storage_blob_container_get` | ❌ | | 3 | 0.556457 | `storage_blob_get` | ❌ | @@ -15260,15 +18286,23 @@ ======= | 1 | 0.673754 | `storage_account_get` | ✅ **EXPECTED** | | 2 | 0.608256 | `storage_blob_container_get` | ❌ | +======= +| 1 | 0.673749 | `storage_account_get` | ✅ **EXPECTED** | +| 2 | 0.608245 | `storage_blob_container_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.556457 | `storage_blob_get` | ❌ | | 4 | 0.483435 | `storage_account_create` | ❌ | -| 5 | 0.439187 | `cosmos_account_list` | ❌ | +| 5 | 0.439236 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD ## Test 429 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description 
evaluator) +======= +## Test 434 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_account_get` **Prompt:** Get details about the storage account @@ -15278,6 +18312,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.692687 | `storage_account_get` | ✅ **EXPECTED** | | 2 | 0.577173 | `storage_blob_container_get` | ❌ | ======= @@ -15288,12 +18323,17 @@ >>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 2 | 0.577547 | `storage_blob_container_get` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.692687 | `storage_account_get` | ✅ **EXPECTED** | +| 2 | 0.577692 | `storage_blob_container_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.529205 | `storage_blob_get` | ❌ | | 4 | 0.518215 | `storage_account_create` | ❌ | | 5 | 0.448506 | `storage_blob_container_create` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 430 ======= @@ -15303,6 +18343,9 @@ ## Test 430 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 435 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_account_get` **Prompt:** List all storage accounts in my subscription including their location and SKU @@ -15312,6 +18355,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.649215 | `storage_account_get` | ✅ **EXPECTED** | | 2 | 0.557093 | `managedlustre_fs_sku_get` | ❌ | | 3 | 0.549448 | `storage_blob_container_get` | ❌ | @@ -15322,10 +18366,17 @@ | 3 | 0.550148 | `storage_blob_container_get` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.547577 | `subscription_list` | ❌ | +======= +| 1 | 0.649215 | `storage_account_get` | ✅ **EXPECTED** | +| 2 | 0.557016 | `managedlustre_fs_sku_get` | ❌ | +| 3 | 0.550292 | `storage_blob_container_get` | ❌ | +| 4 | 0.547647 | 
`subscription_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.536909 | `cosmos_account_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 431 ======= @@ -15342,6 +18393,9 @@ ## Test 431 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 436 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_account_get` **Prompt:** Show me my storage accounts with whether hierarchical namespace (HNS) is enabled @@ -15351,6 +18405,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.556860 | `storage_account_get` | ✅ **EXPECTED** | | 2 | 0.481664 | `storage_blob_container_get` | ❌ | | 3 | 0.461284 | `managedlustre_fs_list` | ❌ | @@ -15360,17 +18415,17 @@ | 2 | 0.482418 | `storage_blob_container_get` | ❌ | | 3 | 0.461308 | `managedlustre_fs_list` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.421642 | `cosmos_account_list` | ❌ | ======= -| 1 | 0.556930 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.482418 | `storage_blob_container_get` | ❌ | -| 3 | 0.461284 | `managedlustre_filesystem_list` | ❌ | -| 4 | 0.421671 | `cosmos_account_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 1 | 0.556860 | `storage_account_get` | ✅ **EXPECTED** | +| 2 | 0.482650 | `storage_blob_container_get` | ❌ | +| 3 | 0.461284 | `managedlustre_fs_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) +| 4 | 0.421642 | `cosmos_account_list` | ❌ | | 5 | 0.410587 | `storage_blob_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 432 ======= @@ -15380,6 +18435,9 @@ ## Test 432 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 437 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_account_get` **Prompt:** Show me the storage accounts in my subscription 
and include HTTPS-only and public blob access settings @@ -15389,6 +18447,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.619462 | `storage_account_get` | ✅ **EXPECTED** | | 2 | 0.555677 | `storage_blob_container_get` | ❌ | ======= @@ -15411,13 +18470,23 @@ | 2 | 0.556436 | `storage_blob_container_get` | ❌ | | 3 | 0.518229 | `storage_blob_get` | ❌ | | 4 | 0.473662 | `cosmos_account_list` | ❌ | +======= +| 1 | 0.619462 | `storage_account_get` | ✅ **EXPECTED** | +| 2 | 0.556525 | `storage_blob_container_get` | ❌ | +| 3 | 0.518229 | `storage_blob_get` | ❌ | +| 4 | 0.473598 | `cosmos_account_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.465571 | `subscription_list` | ❌ | --- +<<<<<<< HEAD ## Test 433 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 438 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_blob_container_create` **Prompt:** Create the storage container mycontainer in storage account @@ -15427,13 +18496,18 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.649793 | `storage_blob_container_create` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.585556 | `storage_blob_container_get` | ❌ | +======= +| 2 | 0.584263 | `storage_blob_container_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.524779 | `storage_account_create` | ❌ | | 4 | 0.496679 | `storage_blob_get` | ❌ | | 5 | 0.447784 | `cosmos_database_container_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 434 ======= @@ -15443,6 +18517,9 @@ ## Test 434 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 439 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_blob_container_create` **Prompt:** Create the container using blob public access in storage account @@ -15453,6 
+18530,7 @@ |------|-------|------|--------| | 1 | 0.682161 | `storage_blob_container_create` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.590826 | `storage_blob_container_get` | ❌ | | 3 | 0.559264 | `storage_blob_get` | ❌ | | 4 | 0.500625 | `storage_account_create` | ❌ | @@ -15463,13 +18541,16 @@ ## Test 435 ======= | 2 | 0.590160 | `storage_blob_container_get` | ❌ | +======= +| 2 | 0.590461 | `storage_blob_container_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.559263 | `storage_blob_get` | ❌ | | 4 | 0.500624 | `storage_account_create` | ❌ | -<<<<<<< HEAD -| 5 | 0.420434 | `storage_account_get` | ❌ | +| 5 | 0.420514 | `storage_account_get` | ❌ | --- +<<<<<<< HEAD ## Test 425 ======= | 5 | 0.420516 | `storage_account_get` | ❌ | @@ -15479,6 +18560,9 @@ ## Test 435 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 440 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_blob_container_create` **Prompt:** Create a new blob container named documents with container public access in storage account @@ -15487,6 +18571,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.625397 | `storage_blob_container_create` | ✅ **EXPECTED** | | 2 | 0.544024 | `storage_blob_container_get` | ❌ | | 3 | 0.497804 | `storage_blob_get` | ❌ | @@ -15504,6 +18589,17 @@ ## Test 436 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.625122 | `storage_blob_container_create` | ✅ **EXPECTED** | +| 2 | 0.543590 | `storage_blob_container_get` | ❌ | +| 3 | 0.497579 | `storage_blob_get` | ❌ | +| 4 | 0.463133 | `storage_account_create` | ❌ | +| 5 | 0.435075 | `cosmos_database_container_list` | ❌ | + +--- + +## Test 441 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_blob_container_get` **Prompt:** Show me the 
properties of the storage container in the storage account @@ -15512,6 +18608,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.703348 | `storage_blob_container_get` | ✅ **EXPECTED** | | 2 | 0.623681 | `storage_blob_get` | ❌ | <<<<<<< HEAD @@ -15528,17 +18625,26 @@ ======= | 3 | 0.577904 | `storage_account_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) +======= +| 1 | 0.701878 | `storage_blob_container_get` | ✅ **EXPECTED** | +| 2 | 0.623681 | `storage_blob_get` | ❌ | +| 3 | 0.577921 | `storage_account_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.549803 | `storage_blob_container_create` | ❌ | | 5 | 0.523288 | `cosmos_database_container_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 427 ======= ## Test 437 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 442 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_blob_container_get` **Prompt:** List all blob containers in the storage account @@ -15547,7 +18653,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.712012 | `storage_blob_container_get` | ✅ **EXPECTED** | +======= +| 1 | 0.712439 | `storage_blob_container_get` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.680802 | `storage_blob_get` | ❌ | | 3 | 0.613933 | `cosmos_database_container_list` | ❌ | | 4 | 0.556319 | `storage_blob_container_create` | ❌ | @@ -15555,6 +18665,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 438 ======= @@ -15564,6 +18675,9 @@ ## Test 438 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 443 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_blob_container_get` **Prompt:** Show me the containers in the storage account @@ -15572,6 +18686,7 @@ | Rank | Score 
| Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.713080 | `storage_blob_container_get` | ✅ **EXPECTED** | | 2 | 0.592373 | `cosmos_database_container_list` | ❌ | | 3 | 0.586169 | `storage_blob_get` | ❌ | @@ -15588,16 +18703,26 @@ ======= | 4 | 0.523293 | `storage_account_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) +======= +| 1 | 0.713761 | `storage_blob_container_get` | ✅ **EXPECTED** | +| 2 | 0.592373 | `cosmos_database_container_list` | ❌ | +| 3 | 0.586169 | `storage_blob_get` | ❌ | +| 4 | 0.523322 | `storage_account_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.487521 | `storage_blob_container_create` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 429 ======= ## Test 439 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 444 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_blob_get` **Prompt:** Show me the properties for blob in container in storage account @@ -15607,6 +18732,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.700963 | `storage_blob_get` | ✅ **EXPECTED** | | 2 | 0.648279 | `storage_blob_container_get` | ❌ | | 3 | 0.540987 | `storage_blob_container_create` | ❌ | @@ -15628,17 +18754,23 @@ ## Test 430 ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.700973 | `storage_blob_get` | ✅ **EXPECTED** | -| 2 | 0.646973 | `storage_blob_container_get` | ❌ | +| 2 | 0.647348 | `storage_blob_container_get` | ❌ | | 3 | 0.541019 | `storage_blob_container_create` | ❌ | -| 4 | 0.527428 | `storage_account_get` | ❌ | +| 4 | 0.527427 | `storage_account_get` | ❌ | | 5 | 0.477946 | `cosmos_database_container_list` | ❌ | --- +<<<<<<< HEAD ## Test 440 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 445 +>>>>>>> 
e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_blob_get` **Prompt:** Get the details about blob in the container in storage account @@ -15647,6 +18779,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.694997 | `storage_blob_get` | ✅ **EXPECTED** | <<<<<<< HEAD | 2 | 0.633397 | `storage_blob_container_get` | ❌ | @@ -15674,6 +18807,17 @@ ## Test 441 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.694812 | `storage_blob_get` | ✅ **EXPECTED** | +| 2 | 0.631318 | `storage_blob_container_get` | ❌ | +| 3 | 0.589010 | `storage_blob_container_create` | ❌ | +| 4 | 0.580074 | `storage_account_get` | ❌ | +| 5 | 0.457004 | `storage_account_create` | ❌ | + +--- + +## Test 446 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_blob_get` **Prompt:** List all blobs in the blob container in the storage account @@ -15683,6 +18827,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.733586 | `storage_blob_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.702342 | `storage_blob_container_get` | ❌ | | 3 | 0.605993 | `storage_blob_container_create` | ❌ | | 4 | 0.579070 | `cosmos_database_container_list` | ❌ | @@ -15700,13 +18845,22 @@ ## Test 432 ======= +======= +| 2 | 0.701375 | `storage_blob_container_get` | ❌ | +| 3 | 0.605993 | `storage_blob_container_create` | ❌ | +| 4 | 0.579070 | `cosmos_database_container_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.506639 | `cosmos_database_container_item_query` | ❌ | --- +<<<<<<< HEAD ## Test 442 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 447 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_blob_get` **Prompt:** Show me the blobs in the blob container in the storage account @@ -15715,15 +18869,19 @@ | Rank | 
Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.704426 | `storage_blob_get` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.666342 | `storage_blob_container_get` | ❌ | +======= +| 2 | 0.665280 | `storage_blob_container_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.561557 | `storage_blob_container_create` | ❌ | | 4 | 0.533515 | `cosmos_database_container_list` | ❌ | | 5 | 0.484018 | `storage_account_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 443 ======= @@ -15740,6 +18898,9 @@ ## Test 443 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 448 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `storage_blob_upload` **Prompt:** Upload file to storage blob in container in storage account @@ -15749,6 +18910,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.566278 | `storage_blob_upload` | ✅ **EXPECTED** | | 2 | 0.525685 | `storage_blob_container_create` | ❌ | | 3 | 0.517524 | `storage_blob_get` | ❌ | @@ -15781,6 +18943,17 @@ ## Test 444 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.566243 | `storage_blob_upload` | ✅ **EXPECTED** | +| 2 | 0.525629 | `storage_blob_container_create` | ❌ | +| 3 | 0.517631 | `storage_blob_get` | ❌ | +| 4 | 0.474198 | `storage_blob_container_get` | ❌ | +| 5 | 0.382137 | `storage_account_create` | ❌ | + +--- + +## Test 449 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `subscription_list` **Prompt:** List all subscriptions for my account @@ -15790,20 +18963,23 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.654048 | `subscription_list` | ✅ **EXPECTED** | | 2 | 0.512964 | `cosmos_account_list` | ❌ | | 3 | 0.471653 | `postgres_server_list` | ❌ | | 4 | 0.469023 | 
`kusto_cluster_list` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.654071 | `subscription_list` | ✅ **EXPECTED** | -| 2 | 0.512954 | `cosmos_account_list` | ❌ | +| 2 | 0.512964 | `cosmos_account_list` | ❌ | | 3 | 0.471653 | `postgres_server_list` | ❌ | -| 4 | 0.469085 | `kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -| 5 | 0.461078 | `redis_list` | ❌ | +| 4 | 0.469023 | `kusto_cluster_list` | ❌ | +| 5 | 0.461054 | `redis_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 445 ======= @@ -15813,6 +18989,9 @@ ## Test 445 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 450 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `subscription_list` **Prompt:** Show me my subscriptions @@ -15821,14 +19000,22 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.458834 | `subscription_list` | ✅ **EXPECTED** | | 2 | 0.407101 | `eventgrid_subscription_list` | ❌ | | 3 | 0.393662 | `eventgrid_topic_list` | ❌ | | 4 | 0.391555 | `redis_list` | ❌ | +======= +| 1 | 0.458821 | `subscription_list` | ✅ **EXPECTED** | +| 2 | 0.407471 | `eventgrid_subscription_list` | ❌ | +| 3 | 0.393695 | `eventgrid_topic_list` | ❌ | +| 4 | 0.391545 | `redis_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.381238 | `postgres_server_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 446 ======= @@ -15838,6 +19025,9 @@ ## Test 446 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 451 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `subscription_list` **Prompt:** What is my current subscription? 
@@ -15846,9 +19036,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.433242 | `subscription_list` | ✅ **EXPECTED** | +| 1 | 0.433196 | `subscription_list` | ✅ **EXPECTED** | | 2 | 0.319579 | `marketplace_product_list` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.315547 | `marketplace_product_get` | ❌ | | 4 | 0.293009 | `eventgrid_subscription_list` | ❌ | | 5 | 0.289280 | `eventgrid_topic_list` | ❌ | @@ -15862,17 +19053,24 @@ ======= | 3 | 0.315474 | `marketplace_product_get` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) +======= +| 3 | 0.315547 | `marketplace_product_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.293772 | `eventgrid_subscription_list` | ❌ | | 5 | 0.289334 | `eventgrid_topic_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 437 ======= ## Test 447 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 452 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `subscription_list` **Prompt:** What subscriptions do I have? 
@@ -15881,6 +19079,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.477657 | `subscription_list` | ✅ **EXPECTED** | | 2 | 0.356775 | `eventgrid_subscription_list` | ❌ | | 3 | 0.354286 | `marketplace_product_list` | ❌ | @@ -15898,6 +19097,17 @@ ## Test 448 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.477592 | `subscription_list` | ✅ **EXPECTED** | +| 2 | 0.357625 | `eventgrid_subscription_list` | ❌ | +| 3 | 0.354286 | `marketplace_product_list` | ❌ | +| 4 | 0.344527 | `redis_list` | ❌ | +| 5 | 0.340837 | `eventgrid_topic_list` | ❌ | + +--- + +## Test 453 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `azureterraformbestpractices_get` **Prompt:** Fetch the Azure Terraform best practices @@ -15909,6 +19119,7 @@ | 1 | 0.686886 | `azureterraformbestpractices_get` | ✅ **EXPECTED** | | 2 | 0.625270 | `deploy_iac_rules_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.605048 | `get_bestpractices_get` | ❌ | | 4 | 0.482745 | `deploy_pipeline_guidance_get` | ❌ | | 5 | 0.468390 | `azureaibestpractices_get` | ❌ | @@ -15918,17 +19129,24 @@ ## Test 449 ======= | 3 | 0.605599 | `get_bestpractices_get` | ❌ | +======= +| 3 | 0.605047 | `get_bestpractices_get` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.482936 | `deploy_pipeline_guidance_get` | ❌ | | 5 | 0.466199 | `deploy_plan_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 439 ======= ## Test 449 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 454 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `azureterraformbestpractices_get` **Prompt:** Show me the Azure Terraform best practices and generate code sample to get a secret from Azure Key Vault @@ -15938,6 +19156,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< 
HEAD | 1 | 0.581316 | `azureterraformbestpractices_get` | ✅ **EXPECTED** | | 2 | 0.512141 | `get_bestpractices_get` | ❌ | | 3 | 0.510005 | `deploy_iac_rules_get` | ❌ | @@ -15952,21 +19171,26 @@ | 1 | 0.581332 | `azureterraformbestpractices_get` | ✅ **EXPECTED** | | 2 | 0.512141 | `get_bestpractices_get` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.581316 | `azureterraformbestpractices_get` | ✅ **EXPECTED** | -| 2 | 0.515758 | `get_bestpractices_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 2 | 0.512141 | `get_bestpractices_get` | ❌ | | 3 | 0.510004 | `deploy_iac_rules_get` | ❌ | | 4 | 0.473596 | `keyvault_secret_get` | ❌ | | 5 | 0.444297 | `deploy_pipeline_guidance_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 440 ======= ## Test 450 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 455 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `virtualdesktop_hostpool_list` **Prompt:** List all host pools in my subscription @@ -15987,24 +19211,23 @@ ## Test 451 ======= | 1 | 0.711969 | `virtualdesktop_hostpool_list` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.659763 | `virtualdesktop_hostpool_host_list` | ❌ | | 3 | 0.620666 | `kusto_cluster_list` | ❌ | -======= -| 2 | 0.659732 | `virtualdesktop_hostpool_sessionhost_list` | ❌ | -| 3 | 0.620507 | `kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.548888 | `search_service_list` | ❌ | -| 5 | 0.535777 | `virtualdesktop_hostpool_host_user-list` | ❌ | +| 5 | 0.535739 | `virtualdesktop_hostpool_host_user-list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 441 ======= ## Test 451 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 456 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** 
`virtualdesktop_hostpool_host_list` **Prompt:** List all session hosts in host pool @@ -16013,9 +19236,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.727054 | `virtualdesktop_hostpool_host_list` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.715572 | `virtualdesktop_hostpool_host_user-list` | ❌ | | 3 | 0.573350 | `virtualdesktop_hostpool_list` | ❌ | ======= @@ -16024,6 +19247,9 @@ | 1 | 0.726933 | `virtualdesktop_hostpool_sessionhost_list` | ❌ | | 2 | 0.714469 | `virtualdesktop_hostpool_sessionhost_usersession-list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) +======= +| 2 | 0.714469 | `virtualdesktop_hostpool_host_user-list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.573352 | `virtualdesktop_hostpool_list` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.438659 | `aks_nodepool_get` | ❌ | @@ -16031,6 +19257,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 452 ======= @@ -16040,6 +19267,9 @@ ## Test 452 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 457 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `virtualdesktop_hostpool_host_user-list` **Prompt:** List all user sessions on session host in host pool @@ -16049,12 +19279,16 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| <<<<<<< HEAD +<<<<<<< HEAD | 1 | 0.813311 | `virtualdesktop_hostpool_host_user-list` | ✅ **EXPECTED** | | 2 | 0.659213 | `virtualdesktop_hostpool_host_list` | ❌ | | 3 | 0.501113 | `virtualdesktop_hostpool_list` | ❌ | ======= <<<<<<< HEAD | 1 | 0.812787 | `virtualdesktop_hostpool_host_user-list` | ✅ **EXPECTED** | +======= +| 1 | 0.812659 | `virtualdesktop_hostpool_host_user-list` | ✅ **EXPECTED** | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.659212 | `virtualdesktop_hostpool_host_list` | ❌ | | 3 | 0.501167 | `virtualdesktop_hostpool_list` | ❌ 
| >>>>>>> 58ab8585 (update prompts and tool description evaluator) @@ -16063,6 +19297,7 @@ --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 453 ======= @@ -16079,6 +19314,9 @@ ## Test 453 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 458 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `workbooks_create` **Prompt:** Create a new workbook named @@ -16087,14 +19325,19 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.552212 | `workbooks_create` | ✅ **EXPECTED** | +| 1 | 0.552307 | `workbooks_create` | ✅ **EXPECTED** | | 2 | 0.417950 | `workbooks_update` | ❌ | | 3 | 0.361364 | `workbooks_delete` | ❌ | +<<<<<<< HEAD | 4 | 0.329077 | `workbooks_show` | ❌ | +======= +| 4 | 0.329118 | `workbooks_show` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.328113 | `workbooks_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 454 ======= @@ -16104,6 +19347,9 @@ ## Test 454 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 459 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `workbooks_delete` **Prompt:** Delete the workbook with resource ID @@ -16113,6 +19359,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.621310 | `workbooks_delete` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.498506 | `workbooks_show` | ❌ | | 3 | 0.432454 | `workbooks_create` | ❌ | <<<<<<< HEAD @@ -16124,10 +19371,16 @@ | 4 | 0.425569 | `workbooks_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.498518 | `workbooks_show` | ❌ | +| 3 | 0.432643 | `workbooks_create` | ❌ | +| 4 | 0.425569 | `workbooks_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.421897 | `workbooks_update` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 
455 ======= @@ -16137,6 +19390,9 @@ ## Test 455 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 460 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `workbooks_list` **Prompt:** List all workbooks in my resource group @@ -16145,6 +19401,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.772404 | `workbooks_list` | ✅ **EXPECTED** | | 2 | 0.562476 | `workbooks_create` | ❌ | | 3 | 0.516733 | `grafana_list` | ❌ | @@ -16162,6 +19419,17 @@ ## Test 456 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.772431 | `workbooks_list` | ✅ **EXPECTED** | +| 2 | 0.562794 | `workbooks_create` | ❌ | +| 3 | 0.516739 | `grafana_list` | ❌ | +| 4 | 0.494073 | `workbooks_show` | ❌ | +| 5 | 0.488600 | `group_list` | ❌ | + +--- + +## Test 461 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `workbooks_list` **Prompt:** What workbooks do I have in resource group ? 
@@ -16171,13 +19439,19 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.708612 | `workbooks_list` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.570260 | `workbooks_create` | ❌ | | 3 | 0.499633 | `workbooks_show` | ❌ | +======= +| 2 | 0.570521 | `workbooks_create` | ❌ | +| 3 | 0.499716 | `workbooks_show` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.485504 | `workbooks_delete` | ❌ | | 5 | 0.472378 | `grafana_list` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 457 ======= @@ -16187,6 +19461,9 @@ ## Test 457 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 462 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `workbooks_show` **Prompt:** Get information about the workbook with resource ID @@ -16195,6 +19472,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 0.686095 | `workbooks_show` | ✅ **EXPECTED** | | 2 | 0.498390 | `workbooks_create` | ❌ | <<<<<<< HEAD @@ -16206,11 +19484,17 @@ | 3 | 0.494708 | `workbooks_list` | ❌ | >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 1 | 0.686087 | `workbooks_show` | ✅ **EXPECTED** | +| 2 | 0.498518 | `workbooks_create` | ❌ | +| 3 | 0.494708 | `workbooks_list` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.463156 | `workbooks_update` | ❌ | | 5 | 0.452348 | `workbooks_delete` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 458 ======= @@ -16220,6 +19504,9 @@ ## Test 458 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 463 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `workbooks_show` **Prompt:** Show me the workbook with resource ID @@ -16228,14 +19515,21 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| +<<<<<<< HEAD | 1 | 
0.581575 | `workbooks_show` | ✅ **EXPECTED** | | 2 | 0.500475 | `workbooks_list` | ❌ | | 3 | 0.468996 | `workbooks_create` | ❌ | +======= +| 1 | 0.581501 | `workbooks_show` | ✅ **EXPECTED** | +| 2 | 0.500475 | `workbooks_list` | ❌ | +| 3 | 0.469214 | `workbooks_create` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.466266 | `workbooks_update` | ❌ | | 5 | 0.455311 | `workbooks_delete` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 459 ======= @@ -16245,6 +19539,9 @@ ## Test 459 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 464 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `workbooks_update` **Prompt:** Update the workbook with a new text step @@ -16254,8 +19551,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.586347 | `workbooks_update` | ✅ **EXPECTED** | -| 2 | 0.382651 | `workbooks_create` | ❌ | +| 2 | 0.382724 | `workbooks_create` | ❌ | | 3 | 0.349689 | `workbooks_delete` | ❌ | +<<<<<<< HEAD | 4 | 0.347778 | `workbooks_show` | ❌ | | 5 | 0.292904 | `loadtesting_testrun_update` | ❌ | @@ -16270,6 +19568,14 @@ ## Test 460 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 4 | 0.347944 | `workbooks_show` | ❌ | +| 5 | 0.292993 | `loadtesting_testrun_update` | ❌ | + +--- + +## Test 465 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `bicepschema_get` **Prompt:** How can I use Bicep to create an Azure OpenAI service? 
@@ -16282,6 +19588,7 @@ | 2 | 0.485970 | `foundry_models_deploy` | ❌ | | 3 | 0.485889 | `deploy_iac_rules_get` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 4 | 0.468898 | `azureaibestpractices_get` | ❌ | | 5 | 0.453412 | `foundry_openai_embeddings-create` | ❌ | @@ -16290,11 +19597,14 @@ ## Test 461 ======= <<<<<<< HEAD +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.453282 | `foundry_openai_embeddings-create` | ❌ | | 5 | 0.448373 | `get_bestpractices_get` | ❌ | --- +<<<<<<< HEAD ## Test 451 ======= | 4 | 0.462146 | `foundry_openai_embeddings-create` | ❌ | @@ -16305,6 +19615,9 @@ ## Test 461 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 466 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cloudarchitect_design` **Prompt:** Please help me design an architecture for a large-scale file upload, storage, and retrieval service @@ -16316,6 +19629,7 @@ | 1 | 0.502125 | `cloudarchitect_design` | ✅ **EXPECTED** | | 2 | 0.290902 | `storage_blob_upload` | ❌ | <<<<<<< HEAD +<<<<<<< HEAD | 3 | 0.260101 | `managedlustre_fs_create` | ❌ | | 4 | 0.254991 | `deploy_architecture_diagram_generate` | ❌ | ======= @@ -16323,10 +19637,15 @@ | 3 | 0.259162 | `managedlustre_fs_create` | ❌ | | 4 | 0.254853 | `deploy_architecture_diagram_generate` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 3 | 0.259162 | `managedlustre_fs_create` | ❌ | +| 4 | 0.254991 | `deploy_architecture_diagram_generate` | ❌ | +>>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.245034 | `managedlustre_fs_subnetsize_validate` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 462 ======= @@ -16341,6 +19660,9 @@ ## Test 462 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 467 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cloudarchitect_design` **Prompt:** Help me 
design an Azure cloud service that will serve as an ATM for users @@ -16351,6 +19673,7 @@ |------|-------|------|--------| | 1 | 0.508153 | `cloudarchitect_design` | ✅ **EXPECTED** | <<<<<<< HEAD +<<<<<<< HEAD | 2 | 0.377941 | `deploy_architecture_diagram_generate` | ❌ | | 3 | 0.341316 | `deploy_pipeline_guidance_get` | ❌ | | 4 | 0.336385 | `azureaibestpractices_get` | ❌ | @@ -16365,20 +19688,25 @@ | 3 | 0.341462 | `deploy_pipeline_guidance_get` | ❌ | | 4 | 0.328747 | `get_bestpractices_get` | ❌ | ======= +======= +>>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.377941 | `deploy_architecture_diagram_generate` | ❌ | | 3 | 0.341462 | `deploy_pipeline_guidance_get` | ❌ | -| 4 | 0.331626 | `get_bestpractices_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 4 | 0.328747 | `get_bestpractices_get` | ❌ | | 5 | 0.321855 | `deploy_plan_get` | ❌ | --- +<<<<<<< HEAD <<<<<<< HEAD ## Test 453 ======= ## Test 463 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +## Test 468 +>>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cloudarchitect_design` **Prompt:** I want to design a cloud app for ordering groceries @@ -16388,6 +19716,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.423577 | `cloudarchitect_design` | ✅ **EXPECTED** | +<<<<<<< HEAD | 2 | 0.271869 | `deploy_pipeline_guidance_get` | ❌ | | 3 | 0.265972 | `deploy_architecture_diagram_generate` | ❌ | | 4 | 0.242581 | `deploy_plan_get` | ❌ | @@ -16404,6 +19733,16 @@ ## Test 464 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +| 2 | 0.271943 | `deploy_pipeline_guidance_get` | ❌ | +| 3 | 0.265972 | `deploy_architecture_diagram_generate` | ❌ | +| 4 | 0.242581 | `deploy_plan_get` | ❌ | +| 5 | 0.229074 | `extension_cli_generate` | ❌ | + +--- + +## Test 469 +>>>>>>> e2fd2eac 
(refactor tts mcp tool) **Expected Tool:** `cloudarchitect_design` **Prompt:** How can I design a cloud service in Azure that will store and present videos for users? @@ -16420,13 +19759,8 @@ | 5 | 0.324217 | `azureaibestpractices_get` | ❌ | ======= | 2 | 0.369969 | `deploy_pipeline_guidance_get` | ❌ | -<<<<<<< HEAD | 3 | 0.356331 | `managedlustre_fs_create` | ❌ | -| 4 | 0.352914 | `deploy_architecture_diagram_generate` | ❌ | -======= -| 3 | 0.356331 | `managedlustre_filesystem_create` | ❌ | | 4 | 0.352797 | `deploy_architecture_diagram_generate` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.323920 | `storage_blob_upload` | ❌ | >>>>>>> 58ab8585 (update prompts and tool description evaluator) @@ -16434,6 +19768,7 @@ ## Summary +<<<<<<< HEAD <<<<<<< HEAD **Total Prompts Tested:** 464 **Analysis Execution Time:** 186.7791311s @@ -16496,10 +19831,35 @@ **👌 Top Choice + Acceptable Confidence (≥0.4):** 89.2% (414/464 tests) >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= +**Total Prompts Tested:** 469 +**Analysis Execution Time:** 101.3684843s + +### Success Rate Metrics + +**Top Choice Success:** 92.3% (433/469 tests) + +#### Confidence Level Distribution + +**💪 Very High Confidence (≥0.8):** 3.2% (15/469 tests) +**🎯 High Confidence (≥0.7):** 22.6% (106/469 tests) +**✅ Good Confidence (≥0.6):** 61.2% (287/469 tests) +**👍 Fair Confidence (≥0.5):** 91.9% (431/469 tests) +**👌 Acceptable Confidence (≥0.4):** 99.6% (467/469 tests) +**❌ Low Confidence (<0.4):** 0.4% (2/469 tests) + +#### Top Choice + Confidence Combinations + +**💪 Top Choice + Very High Confidence (≥0.8):** 3.2% (15/469 tests) +**🎯 Top Choice + High Confidence (≥0.7):** 22.6% (106/469 tests) +**✅ Top Choice + Good Confidence (≥0.6):** 59.3% (278/469 tests) +**👍 Top Choice + Fair Confidence (≥0.5):** 86.6% (406/469 tests) +**👌 Top Choice + Acceptable Confidence (≥0.4):** 92.3% (433/469 
tests) +>>>>>>> e2fd2eac (refactor tts mcp tool) ### Success Rate Analysis -🟡 **Good** - The tool selection system is performing well. +🟢 **Excellent** - The tool selection system is performing exceptionally well. ⚠️ **Recommendation:** Tool descriptions need improvement to better match user intent (targets: ≥0.6 good, ≥0.7 high). diff --git a/eng/tools/ToolDescriptionEvaluator/tools.json b/eng/tools/ToolDescriptionEvaluator/tools.json index 4920f90f3..839af1bc3 100644 --- a/eng/tools/ToolDescriptionEvaluator/tools.json +++ b/eng/tools/ToolDescriptionEvaluator/tools.json @@ -10987,7 +10987,7 @@ }, { "name": "synthesize", - "description": "Convert text to speech using Azure AI Services Speech. This command takes text input and generates an audio file using advanced neural text-to-speech capabilities.\nYou must provide an Azure AI Services endpoint (e.g., https://your-service.cognitiveservices.azure.com/), the text to convert, and an output file path.\nOptional parameters include language specification (default: en-US), voice selection, audio output format (default: Riff24Khz16BitMonoPcm), and custom voice endpoint ID.\nThe command supports a wide variety of output formats and neural voices for natural-sounding speech synthesis.", + "description": "Convert text to speech using Azure AI Services Speech. 
This command takes text input and generates an audio file using advanced neural text-to-speech capabilities.\r\nYou must provide an Azure AI Services endpoint (e.g., https://your-service.cognitiveservices.azure.com/), the text to convert, and an output file path.\r\nOptional parameters include language specification (default: en-US), voice selection, audio output format (default: Riff24Khz16BitMonoPcm), and custom voice endpoint ID.\r\nThe command supports a wide variety of output formats and neural voices for natural-sounding speech synthesis.", "command": "speech tts synthesize", "option": [ { @@ -11070,7 +11070,7 @@ }, { "name": "--format", - "description": "Output format: simple or detailed. Default is simple.", + "description": "Output format: simple or detailed.", "type": "string", "required": null }, @@ -13268,6 +13268,7 @@ } ], "consolidated_tools": null, +<<<<<<< HEAD <<<<<<< HEAD "duration": 53 ======= @@ -13277,4 +13278,7 @@ "duration": 49 >>>>>>> 84ad4f44 (update prompts and tool description evaluator) >>>>>>> 58ab8585 (update prompts and tool description evaluator) +======= + "duration": 55 +>>>>>>> e2fd2eac (refactor tts mcp tool) } \ No newline at end of file diff --git a/tools/Azure.Mcp.Tools.Speech/src/Azure.Mcp.Tools.Speech.csproj b/tools/Azure.Mcp.Tools.Speech/src/Azure.Mcp.Tools.Speech.csproj index c320a34e2..7b1158487 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Azure.Mcp.Tools.Speech.csproj +++ b/tools/Azure.Mcp.Tools.Speech/src/Azure.Mcp.Tools.Speech.csproj @@ -1,4 +1,4 @@ - + true diff --git a/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs b/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs index 61ab99ff2..a812fd11c 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs @@ -113,7 +113,7 @@ protected override TtsSynthesizeOptions BindOptions(ParseResult parseResult) return options; } - public 
override async Task ExecuteAsync(CommandContext context, ParseResult parseResult) + public override async Task ExecuteAsync(CommandContext context, ParseResult parseResult, CancellationToken cancellationToken) { if (!Validate(parseResult.CommandResult, context.Response).IsValid) { diff --git a/tools/Azure.Mcp.Tools.Speech/src/Services/ISpeechService.cs b/tools/Azure.Mcp.Tools.Speech/src/Services/ISpeechService.cs index c5e60dc5a..213ba8c12 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Services/ISpeechService.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Services/ISpeechService.cs @@ -17,7 +17,7 @@ Task RecognizeSpeechFromFile( string? profanity = null, RetryPolicyOptions? retryPolicy = null); - Task SynthesizeSpeechToFile( + Task SynthesizeSpeechToFile( string endpoint, string text, string outputFilePath, diff --git a/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs b/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs index ec36c55c6..e25306c12 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Services/SpeechService.cs @@ -6,15 +6,23 @@ using Azure.Mcp.Core.Services.Azure.Tenant; using Azure.Mcp.Tools.Speech.Models; using Azure.Mcp.Tools.Speech.Services.Recognizers; +using Azure.Mcp.Tools.Speech.Services.Synthesizers; using Microsoft.Extensions.Logging; namespace Azure.Mcp.Tools.Speech.Services; -public class SpeechService(ITenantService tenantService, ILogger logger, IFastTranscriptionRecognizer fastTranscriptionRecognizer, IRealtimeTranscriptionRecognizer realtimeTranscriptionRecognizer) : BaseAzureService(tenantService), ISpeechService +public class SpeechService( + ITenantService tenantService, + ILogger logger, + IFastTranscriptionRecognizer fastTranscriptionRecognizer, + IRealtimeTranscriptionRecognizer realtimeTranscriptionRecognizer, + IRealtimeTtsSynthesizer speechSynthesizer) + : BaseAzureService(tenantService), ISpeechService { private readonly ILogger _logger = logger; private 
readonly IFastTranscriptionRecognizer _fastTranscriptionRecognizer = fastTranscriptionRecognizer; private readonly IRealtimeTranscriptionRecognizer _realtimeTranscriptionRecognizer = realtimeTranscriptionRecognizer; + private readonly IRealtimeTtsSynthesizer _speechSynthesizer = speechSynthesizer; /// /// Recognizes speech from an audio file using either Fast Transcription or Realtime Transcription. /// Fast Transcription is preferred when the language is supported. @@ -82,417 +90,9 @@ public async Task RecognizeSpeechFromFile( } } - /// - /// Determines if the cancellation details indicate an invalid endpoint error. - /// - /// The cancellation details from the speech recognition - /// True if the error indicates an invalid endpoint, false otherwise - private static bool IsInvalidEndpointError(CancellationDetails cancellationDetails) - { - // Check for common error codes that indicate endpoint issues - return cancellationDetails.Reason == CancellationReason.Error && - (cancellationDetails.ErrorCode == CancellationErrorCode.ConnectionFailure || - cancellationDetails.ErrorCode == CancellationErrorCode.AuthenticationFailure || - cancellationDetails.ErrorCode == CancellationErrorCode.Forbidden || - cancellationDetails.ErrorDetails?.Contains("endpoint", StringComparison.OrdinalIgnoreCase) == true || - cancellationDetails.ErrorDetails?.Contains("connection", StringComparison.OrdinalIgnoreCase) == true || - cancellationDetails.ErrorDetails?.Contains("network", StringComparison.OrdinalIgnoreCase) == true); - } - - /// - /// Creates an AudioConfig from a file, automatically detecting the format based on file extension. - /// Supports WAV, MP3, OPUS/OGG, FLAC, and other common audio formats using GStreamer when available. 
- /// - /// Path to the audio file - /// AudioConfig configured for the specified audio file - /// Thrown when compressed audio format is used but GStreamer is not properly configured - private static AudioConfig CreateAudioConfigFromFile(string filePath) - { - var extension = Path.GetExtension(filePath).ToLowerInvariant(); - - // WAV files don't require GStreamer - if (extension == ".wav") - { - return AudioConfig.FromWavFileInput(filePath); - } - - // For compressed formats, check if GStreamer is available - var isCompressedFormat = extension is ".mp3" or ".ogg" or ".opus" or ".flac" or ".alaw" or ".mulaw" or ".mp4" or ".m4a" or ".aac"; - - if (isCompressedFormat) - { - return extension switch - { - ".mp3" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.MP3), - ".ogg" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.OGG_OPUS), - ".opus" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.OGG_OPUS), - ".flac" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.FLAC), - ".alaw" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.ALAW), - ".mulaw" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.MULAW), - ".mp4" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.ANY), - ".m4a" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.ANY), - ".aac" => CreateCompressedAudioConfig(filePath, AudioStreamContainerFormat.ANY), - _ => throw new NotSupportedException($"Audio format {extension} is not supported") - }; - } - - // Throw exception for unsupported formats - throw new NotSupportedException($"Audio format '{extension}' is not supported. Supported formats are: .wav, .mp3, .ogg, .opus, .flac, .alaw, .mulaw, .mp4, .m4a, .aac"); - } - - /// - /// Creates an AudioConfig for compressed audio formats using PullAudioInputStream. - /// Requires GStreamer to be installed and available in the system PATH. 
- /// - /// Path to the compressed audio file - /// The audio container format - /// AudioConfig configured for the compressed audio file - private static AudioConfig CreateCompressedAudioConfig(string filePath, AudioStreamContainerFormat containerFormat) - { - // Create compressed audio stream format - var audioFormat = AudioStreamFormat.GetCompressedFormat(containerFormat); - - // Create a custom PullAudioInputStream using a callback - var callback = new BinaryFileReaderCallback(filePath); - var pullStream = AudioInputStream.CreatePullStream(callback, audioFormat); - - return AudioConfig.FromStreamInput(pullStream); - } - - /// - /// Determines if an exception indicates that GStreamer is missing or not properly configured. - /// - /// The exception to check - /// True if the exception indicates GStreamer is missing, false otherwise - private static bool IsGStreamerMissingError(Exception ex) - { - // Check for common GStreamer-related error patterns - var message = ex.Message?.ToLowerInvariant() ?? ""; - var innerMessage = ex.InnerException?.Message?.ToLowerInvariant() ?? ""; - - // Common GStreamer error indicators - var gstreamerErrorPatterns = new[] - { - "gstreamer", - "0x27", // SPXERR_GSTREAMER_INTERNAL_ERROR - "spxerr_gstreamer", - "compressed audio", - "codec", - "audio format not supported", - "audio stream format", - "pipeline", - "element", - "decoder" - }; - - return gstreamerErrorPatterns.Any(pattern => - message.Contains(pattern) || innerMessage.Contains(pattern)); - } - - /// - /// Binary file reader callback for PullAudioInputStream. - /// Reads binary audio data from file for compressed audio processing. 
- /// - private sealed class BinaryFileReaderCallback : PullAudioInputStreamCallback - { - private readonly FileStream _fileStream; - - public BinaryFileReaderCallback(string filePath) - { - _fileStream = File.OpenRead(filePath); - } - - public override int Read(byte[] dataBuffer, uint size) - { - try - { - var bytesToRead = Math.Min((int)size, dataBuffer.Length); - return _fileStream.Read(dataBuffer, 0, bytesToRead); - } - catch - { - return 0; // End of stream or error - } - } - - public override void Close() - { - _fileStream?.Dispose(); - } - } - - private static Models.SpeechRecognitionResult CreateNoMatchResult() - { - return new Models.SpeechRecognitionResult - { - Text = string.Empty, - Reason = ResultReason.NoMatch.ToString() - }; - } - - private static ProfanityOption GetProfanityOption(string profanity) => - profanity.ToLowerInvariant() switch - { - "masked" => ProfanityOption.Masked, - "removed" => ProfanityOption.Removed, - "raw" => ProfanityOption.Raw, - _ => ProfanityOption.Masked - }; - - private static Models.SpeechRecognitionResult ConvertToSpeechRecognitionResult(SdkSpeechRecognitionResult speechResult, string? format) - { - // detailed format - if (format?.ToLowerInvariant() == "detailed") - { - return new Models.DetailedSpeechRecognitionResult - { - Text = speechResult.Text, - Reason = speechResult.Reason.ToString(), - Offset = (ulong)speechResult.OffsetInTicks, - Duration = (ulong)speechResult.Duration.Ticks, - NBest = ExtractNBestResults(speechResult) - }; - } - // simple format - else - { - return new Models.SpeechRecognitionResult - { - Text = speechResult.Text, - Reason = speechResult.Reason.ToString(), - Offset = (ulong)speechResult.OffsetInTicks, - Duration = (ulong)speechResult.Duration.Ticks - }; - } - } - - /// - /// Extracts NBest results from speech recognition result properties. - /// Parses the detailed JSON response to get confidence scores and alternative text candidates. 
- /// - /// The speech recognition result - /// List of NBest results with actual confidence values - private static List ExtractNBestResults(SdkSpeechRecognitionResult speechResult) - { - var nbestResults = new List(); - try - { - // Try to get the detailed JSON result from Properties - var jsonProperty = speechResult.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult); - - if (!string.IsNullOrEmpty(jsonProperty)) - { - using var jsonDoc = JsonDocument.Parse(jsonProperty); - - if (jsonDoc.RootElement.TryGetProperty("NBest", out var nbestArray)) - { - foreach (var item in nbestArray.EnumerateArray()) - { - var confidence = item.TryGetProperty("Confidence", out var confidenceProp) ? confidenceProp.GetDouble() : 0.0; - var lexical = item.TryGetProperty("Lexical", out var lexicalProp) ? lexicalProp.GetString() : ""; - var itn = item.TryGetProperty("ITN", out var itnProp) ? itnProp.GetString() : ""; - var maskedITN = item.TryGetProperty("MaskedITN", out var maskedITNProp) ? maskedITNProp.GetString() : ""; - var display = item.TryGetProperty("Display", out var displayProp) ? displayProp.GetString() : ""; - - // Extract words if available - List? words = null; - if (item.TryGetProperty("Words", out var wordsArray)) - { - words = new List(); - foreach (var wordItem in wordsArray.EnumerateArray()) - { - var word = new WordResult - { - Word = wordItem.TryGetProperty("Word", out var wordProp) ? wordProp.GetString() : "", - Offset = wordItem.TryGetProperty("Offset", out var offsetProp) ? (ulong)offsetProp.GetInt64() : null, - Duration = wordItem.TryGetProperty("Duration", out var durationProp) ? 
(ulong)durationProp.GetInt64() : null - }; - words.Add(word); - } - } - - nbestResults.Add(new NBestResult - { - Confidence = confidence, - Lexical = lexical, - ITN = itn, - MaskedITN = maskedITN, - Display = display, - Words = words - }); - } - } - } - } - catch (JsonException) - { - // If JSON parsing fails, fall back to simple result - } - - return nbestResults; - } - - /// - /// Synthesizes speech from text and returns the audio data as a byte array. - /// This method uses push stream to collect audio data during synthesis for efficient memory management. - /// - /// Azure AI Services endpoint - /// The text to convert to speech - /// Language for synthesis (default: en-US) - /// Voice name to use (e.g., en-US-JennyNeural) - /// Output audio format (default: Riff24Khz16BitMonoPcm) - /// Optional endpoint ID for custom voice model - /// Tuple containing audio data, actual voice used, and duration in seconds - private async Task<(byte[] AudioData, string Voice)> SynthesizeSpeechToStream( - string endpoint, - string text, - string? language = null, - string? voice = null, - string? format = null, - string? endpointId = null) - { - // Get Azure AD credential and token - var credential = await GetCredential(); - - // Get access token for Cognitive Services with proper scope - var tokenRequestContext = new TokenRequestContext(["https://cognitiveservices.azure.com/.default"]); - var accessToken = await credential.GetTokenAsync(tokenRequestContext, CancellationToken.None); - - // Configure Speech SDK with endpoint - var config = SpeechConfig.FromEndpoint(new Uri(endpoint)); - - // Set the authorization token - config.AuthorizationToken = accessToken.Token; - - // Set language (default to en-US) - var synthesisLanguage = language ?? "en-US"; - config.SpeechSynthesisLanguage = synthesisLanguage; - - // Set voice if provided - string? 
actualVoice = voice; - if (!string.IsNullOrEmpty(voice)) - { - config.SpeechSynthesisVoiceName = voice; - } - - // Set output format (default to Riff24Khz16BitMonoPcm) - var outputFormat = ParseOutputFormat(format); - config.SetSpeechSynthesisOutputFormat(outputFormat); - - // Set custom endpoint ID if provided - if (!string.IsNullOrEmpty(endpointId)) - { - config.EndpointId = endpointId; - } - - // Create a memory stream to collect audio data via push stream - var audioStream = new MemoryStream(); - using var pushStream = AudioOutputStream.CreatePushStream(new PushAudioStreamCallback(audioStream, _logger)); - using var audioConfig = AudioConfig.FromStreamOutput(pushStream); - using var synthesizer = new SpeechSynthesizer(config, audioConfig); - - // Track synthesis progress - var taskCompletionSource = new TaskCompletionSource(); - SpeechSynthesisCancellationDetails? cancellationDetails = null; - - // Subscribe to synthesis events - synthesizer.SynthesisStarted += (s, e) => - { - _logger.LogInformation("Speech synthesis started for text length: {Length} characters", text.Length); - }; - - synthesizer.Synthesizing += (s, e) => - { - if (e.Result.AudioData.Length > 0) - { - _logger.LogDebug("Received audio chunk: {ChunkSize} bytes", e.Result.AudioData.Length); - } - }; - - synthesizer.SynthesisCompleted += (s, e) => - { - _logger.LogInformation("Speech synthesis completed"); - taskCompletionSource.TrySetResult(true); - }; - - synthesizer.SynthesisCanceled += (s, e) => - { - var details = SpeechSynthesisCancellationDetails.FromResult(e.Result); - _logger.LogError("Speech synthesis canceled: Reason={Reason}, ErrorCode={ErrorCode}, ErrorDetails={ErrorDetails}", - details.Reason, details.ErrorCode, details.ErrorDetails); - cancellationDetails = details; - taskCompletionSource.TrySetResult(false); - }; - - // Start synthesis - await synthesizer.SpeakTextAsync(text); - - // Wait for synthesis to complete - var success = await taskCompletionSource.Task; - - // Check if 
synthesis was successful - if (!success && cancellationDetails != null) - { - if (IsSynthesisInvalidEndpointError(cancellationDetails)) - { - throw new InvalidOperationException( - $"Invalid endpoint or connectivity issue. Reason: {cancellationDetails.Reason}, ErrorCode: {cancellationDetails.ErrorCode}, Details: {cancellationDetails.ErrorDetails}"); - } - - throw new InvalidOperationException( - $"Speech synthesis failed: {cancellationDetails.Reason} - {cancellationDetails.ErrorDetails}"); - } - - if (!success) - { - throw new InvalidOperationException("Speech synthesis failed for unknown reason"); - } - - // Get the collected audio data from the stream - var audioData = audioStream.ToArray(); - - _logger.LogInformation( - "Speech synthesized successfully. Total audio length: {AudioLength} bytes", - audioData.Length); - - // Get actual voice used (either specified or default) - if (string.IsNullOrEmpty(actualVoice)) - { - actualVoice = voice ?? "default"; - } - - return (audioData, actualVoice); - } - - /// - /// Push stream callback that writes audio data to a memory stream as it arrives. - /// This allows for efficient collection of audio data during synthesis without blocking. - /// - private sealed class PushAudioStreamCallback(MemoryStream targetStream, ILogger logger) : PushAudioOutputStreamCallback - { - private readonly MemoryStream _targetStream = targetStream; - private readonly ILogger _logger = logger; - - public override uint Write(byte[] dataBuffer) - { - if (dataBuffer != null && dataBuffer.Length > 0) - { - _targetStream.Write(dataBuffer, 0, dataBuffer.Length); - _logger.LogDebug("Wrote {BytesWritten} bytes to audio stream", dataBuffer.Length); - return (uint)dataBuffer.Length; - } - return 0; - } - - public override void Close() - { - _logger.LogDebug("Push stream closed, total bytes collected: {TotalBytes}", _targetStream.Length); - } - } - /// /// Synthesizes speech from text and saves it to an audio file using Azure AI Services Speech. 
- /// Uses streaming synthesis to handle large texts efficiently and avoid memory issues. + /// Delegates to the speech synthesizer for actual synthesis implementation. /// /// Azure AI Services endpoint (e.g., https://your-service.cognitiveservices.azure.com/) /// The text to convert to speech @@ -513,93 +113,14 @@ public async Task SynthesizeSpeechToFile( string? endpointId = null, RetryPolicyOptions? retryPolicy = null) { - ValidateRequiredParameters((nameof(endpoint), endpoint), (nameof(text), text), (nameof(outputFilePath), outputFilePath)); - - if (string.IsNullOrWhiteSpace(text)) - { - throw new ArgumentException("Text cannot be empty or whitespace.", nameof(text)); - } - - try - { - // Use the reusable streaming synthesis method - var (audioData, actualVoice) = await SynthesizeSpeechToStream( - endpoint, text, language, voice, format, endpointId); - - // Write the complete audio data to file - await File.WriteAllBytesAsync(outputFilePath, audioData); - - _logger.LogInformation( - "Speech synthesized and saved to file: {OutputFile}, Audio size: {AudioSize} bytes", - outputFilePath, - audioData.Length); - - return new SynthesisResult - { - FilePath = outputFilePath, - AudioSize = audioData.Length, - Format = format ?? "Riff24Khz16BitMonoPcm", - Voice = actualVoice, - Language = language ?? "en-US" - }; - } - catch (Exception ex) - { - _logger.LogError(ex, "Error during speech synthesis."); - - // Clean up partial file on error - if (File.Exists(outputFilePath)) - { - try - { - File.Delete(outputFilePath); - _logger.LogInformation("Cleaned up partial output file after error: {OutputFile}", outputFilePath); - } - catch (Exception cleanupEx) - { - _logger.LogWarning(cleanupEx, "Failed to clean up partial output file: {OutputFile}", outputFilePath); - } - } - - throw; - } - } - - /// - /// Determines if the cancellation details indicate an invalid endpoint error for synthesis. 
- /// - /// The cancellation details from speech synthesis - /// True if the error indicates an invalid endpoint, false otherwise - private static bool IsSynthesisInvalidEndpointError(SpeechSynthesisCancellationDetails cancellationDetails) - { - return cancellationDetails.Reason == CancellationReason.Error && - (cancellationDetails.ErrorCode == CancellationErrorCode.ConnectionFailure || - cancellationDetails.ErrorCode == CancellationErrorCode.AuthenticationFailure || - cancellationDetails.ErrorCode == CancellationErrorCode.Forbidden || - cancellationDetails.ErrorDetails?.Contains("endpoint", StringComparison.OrdinalIgnoreCase) == true || - cancellationDetails.ErrorDetails?.Contains("connection", StringComparison.OrdinalIgnoreCase) == true || - cancellationDetails.ErrorDetails?.Contains("network", StringComparison.OrdinalIgnoreCase) == true); - } - - /// - /// Parses the output format string to SpeechSynthesisOutputFormat enum. - /// - /// Format string (e.g., "Riff24Khz16BitMonoPcm", "Audio16Khz32KBitRateMonoMp3") - /// SpeechSynthesisOutputFormat enum value - private static SpeechSynthesisOutputFormat ParseOutputFormat(string? 
format) - { - if (string.IsNullOrEmpty(format)) - { - return SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm; - } - - // Try to parse the format string directly to enum - if (Enum.TryParse(format, true, out var parsedFormat)) - { - return parsedFormat; - } - - // If parsing fails, default to Riff24Khz16BitMonoPcm - return SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm; + return await _speechSynthesizer.SynthesizeToFileAsync( + endpoint, + text, + outputFilePath, + language, + voice, + format, + endpointId, + retryPolicy); } } diff --git a/tools/Azure.Mcp.Tools.Speech/src/Services/Synthesizers/IRealtimeTtsSynthesizer.cs b/tools/Azure.Mcp.Tools.Speech/src/Services/Synthesizers/IRealtimeTtsSynthesizer.cs new file mode 100644 index 000000000..ea04476b6 --- /dev/null +++ b/tools/Azure.Mcp.Tools.Speech/src/Services/Synthesizers/IRealtimeTtsSynthesizer.cs @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using Azure.Mcp.Core.Options; +using Azure.Mcp.Tools.Speech.Models; + +namespace Azure.Mcp.Tools.Speech.Services.Synthesizers; + +/// +/// Interface for speech synthesis services. +/// +public interface IRealtimeTtsSynthesizer +{ + /// + /// Synthesizes speech from text and saves it to an audio file. + /// + /// Azure AI Services endpoint + /// The text to convert to speech + /// Path where the audio file will be saved + /// Language for synthesis (default: en-US) + /// Voice name to use (e.g., en-US-JennyNeural) + /// Output audio format (default: Riff24Khz16BitMonoPcm) + /// Optional endpoint ID for custom voice model + /// Optional retry policy for resilience + /// Synthesis result with file information + Task SynthesizeToFileAsync( + string endpoint, + string text, + string outputFilePath, + string? language = null, + string? voice = null, + string? format = null, + string? endpointId = null, + RetryPolicyOptions? 
retryPolicy = null); +} diff --git a/tools/Azure.Mcp.Tools.Speech/src/Services/Synthesizers/RealtimeTtsSynthesizer.cs b/tools/Azure.Mcp.Tools.Speech/src/Services/Synthesizers/RealtimeTtsSynthesizer.cs new file mode 100644 index 000000000..7ead0cc24 --- /dev/null +++ b/tools/Azure.Mcp.Tools.Speech/src/Services/Synthesizers/RealtimeTtsSynthesizer.cs @@ -0,0 +1,274 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using Azure.Core; +using Azure.Mcp.Core.Options; +using Azure.Mcp.Core.Services.Azure; +using Azure.Mcp.Core.Services.Azure.Tenant; +using Azure.Mcp.Tools.Speech.Models; +using Microsoft.CognitiveServices.Speech; +using Microsoft.CognitiveServices.Speech.Audio; +using Microsoft.Extensions.Logging; + +namespace Azure.Mcp.Tools.Speech.Services.Synthesizers; + +/// +/// Neural speech synthesizer using Azure AI Services Speech SDK. +/// Implements streaming synthesis for efficient memory management with large texts. +/// +public class RealtimeTtsSynthesizer(ITenantService tenantService, ILogger logger) + : BaseAzureService(tenantService), IRealtimeTtsSynthesizer +{ + private readonly ILogger _logger = logger; + + /// + /// Synthesizes speech from text and saves it to an audio file using Azure AI Services Speech. + /// Uses streaming synthesis to handle large texts efficiently and avoid memory issues. + /// + public async Task SynthesizeToFileAsync( + string endpoint, + string text, + string outputFilePath, + string? language = null, + string? voice = null, + string? format = null, + string? endpointId = null, + RetryPolicyOptions? 
retryPolicy = null) + { + ValidateRequiredParameters((nameof(endpoint), endpoint), (nameof(text), text), (nameof(outputFilePath), outputFilePath)); + + if (string.IsNullOrWhiteSpace(text)) + { + throw new ArgumentException("Text cannot be empty or whitespace.", nameof(text)); + } + + try + { + // Use the reusable streaming synthesis method + var (audioData, actualVoice) = await SynthesizeSpeechToStreamAsync( + endpoint, text, language, voice, format, endpointId); + + // Write the complete audio data to file + await File.WriteAllBytesAsync(outputFilePath, audioData); + + _logger.LogInformation( + "Speech synthesized and saved to file: {OutputFile}, Audio size: {AudioSize} bytes", + outputFilePath, + audioData.Length); + + return new SynthesisResult + { + FilePath = outputFilePath, + AudioSize = audioData.Length, + Format = format ?? "Riff24Khz16BitMonoPcm", + Voice = actualVoice, + Language = language ?? "en-US" + }; + } + catch (Exception ex) + { + _logger.LogError(ex, "Error during speech synthesis."); + + // Clean up partial file on error + if (File.Exists(outputFilePath)) + { + try + { + File.Delete(outputFilePath); + _logger.LogInformation("Cleaned up partial output file after error: {OutputFile}", outputFilePath); + } + catch (Exception cleanupEx) + { + _logger.LogWarning(cleanupEx, "Failed to clean up partial output file: {OutputFile}", outputFilePath); + } + } + + throw; + } + } + + /// + /// Synthesizes speech from text and returns the audio data as a byte array. + /// This method uses push stream to collect audio data during synthesis for efficient memory management. + /// + private async Task<(byte[] AudioData, string Voice)> SynthesizeSpeechToStreamAsync( + string endpoint, + string text, + string? language = null, + string? voice = null, + string? format = null, + string? 
endpointId = null) + { + // Get Azure AD credential and token + var credential = await GetCredential(); + + // Get access token for Cognitive Services with proper scope + var tokenRequestContext = new TokenRequestContext(["https://cognitiveservices.azure.com/.default"]); + var accessToken = await credential.GetTokenAsync(tokenRequestContext, CancellationToken.None); + + // Configure Speech SDK with endpoint + var config = SpeechConfig.FromEndpoint(new Uri(endpoint)); + + // Set the authorization token + config.AuthorizationToken = accessToken.Token; + + // Set language (default to en-US) + var synthesisLanguage = language ?? "en-US"; + config.SpeechSynthesisLanguage = synthesisLanguage; + + // Set voice if provided + string? actualVoice = voice; + if (!string.IsNullOrEmpty(voice)) + { + config.SpeechSynthesisVoiceName = voice; + } + + // Set output format (default to Riff24Khz16BitMonoPcm) + var outputFormat = ParseOutputFormat(format); + config.SetSpeechSynthesisOutputFormat(outputFormat); + + // Set custom endpoint ID if provided + if (!string.IsNullOrEmpty(endpointId)) + { + config.EndpointId = endpointId; + } + + // Create a memory stream to collect audio data via push stream + var audioStream = new MemoryStream(); + using var pushStream = AudioOutputStream.CreatePushStream(new PushAudioStreamCallback(audioStream, _logger)); + using var audioConfig = AudioConfig.FromStreamOutput(pushStream); + using var synthesizer = new SpeechSynthesizer(config, audioConfig); + + // Track synthesis progress + var taskCompletionSource = new TaskCompletionSource(); + SpeechSynthesisCancellationDetails? 
cancellationDetails = null; + + // Subscribe to synthesis events + synthesizer.SynthesisStarted += (s, e) => + { + _logger.LogInformation("Speech synthesis started for text length: {Length} characters", text.Length); + }; + + synthesizer.Synthesizing += (s, e) => + { + if (e.Result.AudioData.Length > 0) + { + _logger.LogDebug("Received audio chunk: {ChunkSize} bytes", e.Result.AudioData.Length); + } + }; + + synthesizer.SynthesisCompleted += (s, e) => + { + _logger.LogInformation("Speech synthesis completed"); + taskCompletionSource.TrySetResult(true); + }; + + synthesizer.SynthesisCanceled += (s, e) => + { + var details = SpeechSynthesisCancellationDetails.FromResult(e.Result); + _logger.LogError("Speech synthesis canceled: Reason={Reason}, ErrorCode={ErrorCode}, ErrorDetails={ErrorDetails}", + details.Reason, details.ErrorCode, details.ErrorDetails); + cancellationDetails = details; + taskCompletionSource.TrySetResult(false); + }; + + // Start synthesis + await synthesizer.SpeakTextAsync(text); + + // Wait for synthesis to complete + var success = await taskCompletionSource.Task; + + // Check if synthesis was successful + if (!success && cancellationDetails != null) + { + if (IsSynthesisInvalidEndpointError(cancellationDetails)) + { + throw new InvalidOperationException( + $"Invalid endpoint or connectivity issue. Reason: {cancellationDetails.Reason}, ErrorCode: {cancellationDetails.ErrorCode}, Details: {cancellationDetails.ErrorDetails}"); + } + + throw new InvalidOperationException( + $"Speech synthesis failed: {cancellationDetails.Reason} - {cancellationDetails.ErrorDetails}"); + } + + if (!success) + { + throw new InvalidOperationException("Speech synthesis failed for unknown reason"); + } + + // Get the collected audio data from the stream + var audioData = audioStream.ToArray(); + + _logger.LogInformation( + "Speech synthesized successfully. 
Total audio length: {AudioLength} bytes", + audioData.Length); + + // Get actual voice used (either specified or default) + if (string.IsNullOrEmpty(actualVoice)) + { + actualVoice = voice ?? "default"; + } + + return (audioData, actualVoice); + } + + /// + /// Push stream callback that writes audio data to a memory stream as it arrives. + /// This allows for efficient collection of audio data during synthesis without blocking. + /// + private sealed class PushAudioStreamCallback(MemoryStream targetStream, ILogger logger) : PushAudioOutputStreamCallback + { + private readonly MemoryStream _targetStream = targetStream; + private readonly ILogger _logger = logger; + + public override uint Write(byte[] dataBuffer) + { + if (dataBuffer != null && dataBuffer.Length > 0) + { + _targetStream.Write(dataBuffer, 0, dataBuffer.Length); + _logger.LogDebug("Wrote {BytesWritten} bytes to audio stream", dataBuffer.Length); + return (uint)dataBuffer.Length; + } + return 0; + } + + public override void Close() + { + _logger.LogDebug("Push stream closed, total bytes collected: {TotalBytes}", _targetStream.Length); + } + } + + /// + /// Determines if the cancellation details indicate an invalid endpoint error for synthesis. 
+ /// + private static bool IsSynthesisInvalidEndpointError(SpeechSynthesisCancellationDetails cancellationDetails) + { + return cancellationDetails.Reason == CancellationReason.Error && + (cancellationDetails.ErrorCode == CancellationErrorCode.ConnectionFailure || + cancellationDetails.ErrorCode == CancellationErrorCode.AuthenticationFailure || + cancellationDetails.ErrorCode == CancellationErrorCode.Forbidden || + cancellationDetails.ErrorDetails?.Contains("endpoint", StringComparison.OrdinalIgnoreCase) == true || + cancellationDetails.ErrorDetails?.Contains("connection", StringComparison.OrdinalIgnoreCase) == true || + cancellationDetails.ErrorDetails?.Contains("network", StringComparison.OrdinalIgnoreCase) == true); + } + + /// + /// Parses the output format string to SpeechSynthesisOutputFormat enum. + /// + private static SpeechSynthesisOutputFormat ParseOutputFormat(string? format) + { + if (string.IsNullOrEmpty(format)) + { + return SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm; + } + + // Try to parse the format string directly to enum + if (Enum.TryParse(format, true, out var parsedFormat)) + { + return parsedFormat; + } + + // If parsing fails, default to Riff24Khz16BitMonoPcm + return SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm; + } +} diff --git a/tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs b/tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs index e9ca77088..7e0f77f5a 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs @@ -7,6 +7,7 @@ using Azure.Mcp.Tools.Speech.Commands.Tts; using Azure.Mcp.Tools.Speech.Services; using Azure.Mcp.Tools.Speech.Services.Recognizers; +using Azure.Mcp.Tools.Speech.Services.Synthesizers; using Microsoft.Extensions.DependencyInjection; namespace Azure.Mcp.Tools.Speech; @@ -19,10 +20,17 @@ public class SpeechSetup : IAreaSetup public void ConfigureServices(IServiceCollection services) { - // New recognizer-based architecture + // New recognizer-based 
architecture for STT services.AddSingleton(); services.AddSingleton(); + + // New synthesizer-based architecture for TTS + services.AddSingleton(); + + // Orchestration service services.AddSingleton(); + + // Commands services.AddSingleton(); services.AddSingleton(); } diff --git a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Services/SpeechServiceTests.cs b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Services/SpeechServiceTests.cs index 7386c90ba..1c9b41363 100644 --- a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Services/SpeechServiceTests.cs +++ b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Services/SpeechServiceTests.cs @@ -4,6 +4,7 @@ using Azure.Mcp.Core.Services.Azure.Tenant; using Azure.Mcp.Tools.Speech.Services; using Azure.Mcp.Tools.Speech.Services.Recognizers; +using Azure.Mcp.Tools.Speech.Services.Synthesizers; using Microsoft.Extensions.Logging; using NSubstitute; using Xunit; @@ -16,6 +17,7 @@ public class SpeechServiceTests private readonly ILogger _logger; private readonly IFastTranscriptionRecognizer _fastTranscriptionRecognizer; private readonly IRealtimeTranscriptionRecognizer _realtimeTranscriptionRecognizer; + private readonly IRealtimeTtsSynthesizer _realtimeTtsSynthesizer; private readonly SpeechService _speechService; public SpeechServiceTests() @@ -24,15 +26,16 @@ public SpeechServiceTests() _logger = Substitute.For>(); _fastTranscriptionRecognizer = Substitute.For(); _realtimeTranscriptionRecognizer = Substitute.For(); + _realtimeTtsSynthesizer = Substitute.For(); - _speechService = new SpeechService(_tenantService, _logger, _fastTranscriptionRecognizer, _realtimeTranscriptionRecognizer); + _speechService = new SpeechService(_tenantService, _logger, _fastTranscriptionRecognizer, _realtimeTranscriptionRecognizer, _realtimeTtsSynthesizer); } [Fact] public void Constructor_WithValidParameters_ShouldCreateInstance() { // Arrange & Act - var service = 
new SpeechService(_tenantService, _logger, _fastTranscriptionRecognizer, _realtimeTranscriptionRecognizer); + var service = new SpeechService(_tenantService, _logger, _fastTranscriptionRecognizer, _realtimeTranscriptionRecognizer, _realtimeTtsSynthesizer); // Assert Assert.NotNull(service); diff --git a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Stt/SttRecognizeCommandTests.cs b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Stt/SttRecognizeCommandTests.cs index 662beabc9..ff56773ce 100644 --- a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Stt/SttRecognizeCommandTests.cs +++ b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Stt/SttRecognizeCommandTests.cs @@ -13,6 +13,7 @@ using Azure.Mcp.Tools.Speech.Models.Realtime; using Azure.Mcp.Tools.Speech.Services; using Azure.Mcp.Tools.Speech.Services.Recognizers; +using Azure.Mcp.Tools.Speech.Services.Synthesizers; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using NSubstitute; @@ -27,6 +28,7 @@ public class SttRecognizeCommandTests : IDisposable private readonly ISpeechService _speechService; private readonly IFastTranscriptionRecognizer _fastTranscriptionRecognizer; private readonly IRealtimeTranscriptionRecognizer _realtimeTranscriptionRecognizer; + private readonly IRealtimeTtsSynthesizer _realtimeTtsSynthesizer; private readonly ITenantService _tenantService; private readonly ILogger _logger; private readonly ILogger _speechServiceLogger; @@ -42,12 +44,13 @@ public SttRecognizeCommandTests() // Mock the recognizers and their dependencies _fastTranscriptionRecognizer = Substitute.For(); _realtimeTranscriptionRecognizer = Substitute.For(); + _realtimeTtsSynthesizer = Substitute.For(); _tenantService = Substitute.For(); _logger = Substitute.For>(); _speechServiceLogger = Substitute.For>(); // Create real SpeechService with mocked dependencies - _speechService = new SpeechService(_tenantService, 
_speechServiceLogger, _fastTranscriptionRecognizer, _realtimeTranscriptionRecognizer); + _speechService = new SpeechService(_tenantService, _speechServiceLogger, _fastTranscriptionRecognizer, _realtimeTranscriptionRecognizer, _realtimeTtsSynthesizer); var collection = new ServiceCollection().AddSingleton(_speechService); diff --git a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs index e659d7df7..c13fac421 100644 --- a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs +++ b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs @@ -73,7 +73,7 @@ public void Properties_ShouldHaveExpectedValues() public async Task ExecuteAsync_ValidatesInput(string args, bool shouldSucceed, string expectedError) { var parseResult = _commandDefinition.Parse(args.Split(' ', StringSplitOptions.RemoveEmptyEntries)); - var response = await _command.ExecuteAsync(_context, parseResult); + var response = await _command.ExecuteAsync(_context, parseResult, TestContext.Current.CancellationToken); if (shouldSucceed) { @@ -118,7 +118,7 @@ public async Task ExecuteAsync_WithValidParameters_ShouldSucceed() // Act var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text {text} --outputAudio {outputFile}"; var parseResult = _commandDefinition.Parse(args.Split(' ', StringSplitOptions.RemoveEmptyEntries)); - var response = await _command.ExecuteAsync(_context, parseResult); + var response = await _command.ExecuteAsync(_context, parseResult, TestContext.Current.CancellationToken); // Assert Assert.Equal(HttpStatusCode.OK, response.Status); @@ -176,7 +176,7 @@ public async Task ExecuteAsync_WithAllOptionalParameters_ShouldPassThemCorrectly // Act var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text 
{text} --outputAudio {outputFile} --language {language} --voice {voice} --format {format} --endpointId {endpointId}"; var parseResult = _commandDefinition.Parse(args.Split(' ', StringSplitOptions.RemoveEmptyEntries)); - var response = await _command.ExecuteAsync(_context, parseResult); + var response = await _command.ExecuteAsync(_context, parseResult, TestContext.Current.CancellationToken); // Assert Assert.Equal(HttpStatusCode.OK, response.Status); @@ -224,7 +224,7 @@ public async Task ExecuteAsync_ServiceThrowsException_ShouldHandleGracefully() // Act var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text {text} --outputAudio {outputFile}"; var parseResult = _commandDefinition.Parse(args.Split(' ', StringSplitOptions.RemoveEmptyEntries)); - var response = await _command.ExecuteAsync(_context, parseResult); + var response = await _command.ExecuteAsync(_context, parseResult, TestContext.Current.CancellationToken); // Assert Assert.Equal(HttpStatusCode.InternalServerError, response.Status); @@ -263,7 +263,7 @@ public async Task ExecuteAsync_UnauthorizedException_ShouldReturnUnauthorizedSta // Act var args = $"--subscription {_knownSubscription} --endpoint {_knownEndpoint} --text {text} --outputAudio {outputFile}"; var parseResult = _commandDefinition.Parse(args.Split(' ', StringSplitOptions.RemoveEmptyEntries)); - var response = await _command.ExecuteAsync(_context, parseResult); + var response = await _command.ExecuteAsync(_context, parseResult, TestContext.Current.CancellationToken); // Assert Assert.Equal(HttpStatusCode.Unauthorized, response.Status); From 7c3da3a05adef93bd82ea45f68f8ab1a459813cf Mon Sep 17 00:00:00 2001 From: "Fei Zhao (SPEECH)" Date: Thu, 6 Nov 2025 16:48:36 +0800 Subject: [PATCH 11/14] fix format --- tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs b/tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs 
index 7e0f77f5a..754d49eb7 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs @@ -23,13 +23,13 @@ public void ConfigureServices(IServiceCollection services) // New recognizer-based architecture for STT services.AddSingleton(); services.AddSingleton(); - + // New synthesizer-based architecture for TTS services.AddSingleton(); - + // Orchestration service services.AddSingleton(); - + // Commands services.AddSingleton(); services.AddSingleton(); From daeb5d5e721c8a53999dda659ff7389eaf353cf9 Mon Sep 17 00:00:00 2001 From: "Fei Zhao (SPEECH)" Date: Mon, 10 Nov 2025 11:15:01 +0800 Subject: [PATCH 12/14] update according to comments --- .../src/Commands/Tts/TtsSynthesizeCommand.cs | 17 ++++++++------ .../SpeechCommandTests.cs | 7 +++--- .../Tts/TtsSynthesizeCommandTests.cs | 22 ------------------- 3 files changed, 14 insertions(+), 32 deletions(-) diff --git a/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs b/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs index a812fd11c..56fa08f26 100644 --- a/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs +++ b/tools/Azure.Mcp.Tools.Speech/src/Commands/Tts/TtsSynthesizeCommand.cs @@ -17,6 +17,7 @@ public sealed class TtsSynthesizeCommand(ILogger logger) : internal record TtsSynthesizeCommandResult(SynthesisResult Result); private const string CommandTitle = "Synthesize Speech from Text"; + private static readonly HashSet SupportedExtensions = [".wav", ".mp3", ".ogg", ".raw"]; private readonly ILogger _logger = logger; public override string Name => "synthesize"; @@ -74,16 +75,18 @@ protected override void RegisterOptions(Command command) } else { + // Check if file already exists (don't allow overwriting) + if (File.Exists(fileValue)) + { + commandResult.AddError($"Output file already exists: {fileValue}. 
Please specify a different file path or delete the existing file."); + } + // Validate file extension var extension = Path.GetExtension(fileValue).ToLowerInvariant(); - var supportedExtensions = new HashSet - { - ".wav", ".mp3", ".ogg", ".raw" - }; - if (!supportedExtensions.Contains(extension)) + if (!SupportedExtensions.Contains(extension)) { - commandResult.AddError($"Unsupported output file format: {extension}. Only {string.Join(", ", supportedExtensions)} are supported."); + commandResult.AddError($"Unsupported output file format: {extension}. Only {string.Join(", ", SupportedExtensions)} are supported."); } } @@ -144,7 +147,7 @@ public override async Task ExecuteAsync(CommandContext context, context.Response.Status = HttpStatusCode.OK; context.Response.Message = "Speech synthesis completed successfully."; context.Response.Results = ResponseResult.Create( - new TtsSynthesizeCommandResult(result), + new(result), SpeechJsonContext.Default.TtsSynthesizeCommandResult); } catch (Exception ex) diff --git a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs index 84dc816ab..27bea9922 100644 --- a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs +++ b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
using System.Text.Json; +using Azure.Mcp.Tests; using Azure.Mcp.Tests.Client; using Azure.Mcp.Tools.Speech.Models; using Azure.Mcp.Tools.Speech.Models.Realtime; @@ -549,13 +550,13 @@ public async Task Should_synthesize_speech_to_file_with_text() // Parse and validate the JSON result var jsonResult = JsonDocument.Parse(resultText); var resultObject = jsonResult.RootElement; - Assert.True(resultObject.TryGetProperty("result", out var resultProperty)); + var resultProperty = resultObject.AssertProperty("result"); // Verify file path - Assert.True(resultProperty.TryGetProperty("filePath", out var filePathProperty)); + var filePathProperty = resultProperty.AssertProperty("filePath"); Assert.Equal(outputFile, filePathProperty.GetString()); - Assert.True(resultProperty.TryGetProperty("audioSize", out var audioLengthProperty)); + var audioLengthProperty = resultProperty.AssertProperty("audioSize"); Assert.True(audioLengthProperty.GetInt64() > 0); // Verify the output file was created and has content diff --git a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs index c13fac421..a74810645 100644 --- a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs +++ b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.UnitTests/Tts/TtsSynthesizeCommandTests.cs @@ -41,28 +41,6 @@ public TtsSynthesizeCommandTests() _commandDefinition = _command.GetCommand(); } - [Fact] - public void Constructor_WithValidLogger_ShouldCreateInstance() - { - var command = new TtsSynthesizeCommand(_logger); - Assert.NotNull(command); - Assert.Equal("synthesize", command.Name); - } - - [Fact] - public void Properties_ShouldHaveExpectedValues() - { - Assert.Equal("synthesize", _command.Name); - Assert.Equal("Synthesize Speech from Text", _command.Title); - Assert.NotEmpty(_command.Description); - 
Assert.False(_command.Metadata.Destructive); - Assert.True(_command.Metadata.Idempotent); - Assert.False(_command.Metadata.OpenWorld); - Assert.False(_command.Metadata.ReadOnly); - Assert.True(_command.Metadata.LocalRequired); - Assert.False(_command.Metadata.Secret); - } - [Theory] [InlineData("", false, "Missing Required options: --endpoint, --text, --outputAudio")] [InlineData("--subscription sub123", false, "Missing Required options: --endpoint, --text, --outputAudio")] From c7d98b5075b2eb5906e6aa6f400a5588b8027f12 Mon Sep 17 00:00:00 2001 From: "Fei Zhao (SPEECH)" Date: Mon, 10 Nov 2025 11:29:28 +0800 Subject: [PATCH 13/14] update ToolDescriptionEvaluator results --- .../ToolDescriptionEvaluator/prompts.json | 30 +- eng/tools/ToolDescriptionEvaluator/results.md | 13697 ++-------------- eng/tools/ToolDescriptionEvaluator/tools.json | 20 +- 3 files changed, 1223 insertions(+), 12524 deletions(-) diff --git a/eng/tools/ToolDescriptionEvaluator/prompts.json b/eng/tools/ToolDescriptionEvaluator/prompts.json index 938dae450..79de5ee70 100644 --- a/eng/tools/ToolDescriptionEvaluator/prompts.json +++ b/eng/tools/ToolDescriptionEvaluator/prompts.json @@ -2,31 +2,22 @@ "foundry_agents_connect": [ "Query an agent in my Azure AI foundry resource" ], + "foundry_agents_create": [ + "Create a new Azure AI Foundry agent using instructions in the active editor" + ], "foundry_agents_evaluate": [ "Evaluate the full query and response I got from my agent for task_adherence" ], + "foundry_agents_get-sdk-sample": [ + "Create a CLI app that can talk to an Azure AI Foundry Agent using Python SDK" + ], "foundry_agents_list": [ "List all agents in my Azure AI Foundry resource", "Show me the available agents in my Azure AI Foundry resource" ], - "foundry_agents_create": [ - "Create a new Azure AI Foundry agent using instructions in the active editor" - ], "foundry_agents_query-and-evaluate": [ "Query and evaluate an agent in my Azure AI Foundry resource for task_adherence" ], - 
"foundry_agents_get-sdk-sample": [ - "Create a CLI app that can talk to an Azure AI Foundry Agent using Python SDK" - ], - "foundry_threads_create": [ - "Create an Azure AI Foundry thread to hold the conversation" - ], - "foundry_threads_list": [ - "List my AI Foundry threads" - ], - "foundry_threads_get-messages": [ - "Show me the messages in the AI Foundry thread with id " - ], "foundry_knowledge_index_list": [ "List all knowledge indexes in my AI Foundry project", "Show me the knowledge indexes in my AI Foundry project" @@ -65,6 +56,15 @@ "Show me the AI Foundry resources in resource group ", "Get details for AI Foundry resource in resource group " ], + "foundry_threads_create": [ + "Create an Azure AI Foundry thread to hold the conversation" + ], + "foundry_threads_get-messages": [ + "Show me the messages in the AI Foundry thread with id " + ], + "foundry_threads_list": [ + "List my AI Foundry threads" + ], "search_knowledge_base_get": [ "List all knowledge bases in the Azure AI Search service ", "Show me the knowledge bases in the Azure AI Search service ", diff --git a/eng/tools/ToolDescriptionEvaluator/results.md b/eng/tools/ToolDescriptionEvaluator/results.md index 3ab78fbfe..b41f6850c 100644 --- a/eng/tools/ToolDescriptionEvaluator/results.md +++ b/eng/tools/ToolDescriptionEvaluator/results.md @@ -1,79 +1,46 @@ # Tool Selection Analysis Setup -<<<<<<< HEAD -<<<<<<< HEAD -**Setup completed:** 2025-11-06 17:16:26 +**Setup completed:** 2025-11-10 11:23:50 **Tool count:** 179 -**Database setup time:** 32.4934401s -======= -<<<<<<< HEAD -**Setup completed:** 2025-11-03 14:57:47 -**Tool count:** 173 -**Database setup time:** 1.2016078s -======= -**Setup completed:** 2025-11-04 15:41:36 -**Tool count:** 174 -**Database setup time:** 1.4888934s ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -**Setup completed:** 2025-11-06 16:24:20 -**Tool count:** 179 -**Database setup 
time:** 1.5156559s ->>>>>>> e2fd2eac (refactor tts mcp tool) +**Database setup time:** 2.2959325s --- # Tool Selection Analysis Results -<<<<<<< HEAD -<<<<<<< HEAD -**Analysis Date:** 2025-11-06 17:16:26 -**Tool count:** 179 -======= -<<<<<<< HEAD -**Analysis Date:** 2025-11-03 14:57:47 -**Tool count:** 173 -======= -**Analysis Date:** 2025-11-04 15:41:36 -**Tool count:** 174 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -**Analysis Date:** 2025-11-06 16:24:20 +**Analysis Date:** 2025-11-10 11:23:50 **Tool count:** 179 ->>>>>>> e2fd2eac (refactor tts mcp tool) ## Table of Contents - [Test 1: foundry_agents_connect](#test-1) -- [Test 2: foundry_agents_evaluate](#test-2) -- [Test 3: foundry_agents_list](#test-3) -- [Test 4: foundry_agents_list](#test-4) -- [Test 5: foundry_agents_create](#test-5) -- [Test 6: foundry_agents_query-and-evaluate](#test-6) -- [Test 7: foundry_agents_get-sdk-sample](#test-7) -- [Test 8: foundry_threads_create](#test-8) -- [Test 9: foundry_threads_list](#test-9) -- [Test 10: foundry_threads_get-messages](#test-10) -- [Test 11: foundry_knowledge_index_list](#test-11) -- [Test 12: foundry_knowledge_index_list](#test-12) -- [Test 13: foundry_knowledge_index_schema](#test-13) -- [Test 14: foundry_knowledge_index_schema](#test-14) -- [Test 15: foundry_models_deploy](#test-15) -- [Test 16: foundry_models_deployments_list](#test-16) -- [Test 17: foundry_models_deployments_list](#test-17) -- [Test 18: foundry_models_list](#test-18) -- [Test 19: foundry_models_list](#test-19) -- [Test 20: foundry_openai_chat-completions-create](#test-20) -- [Test 21: foundry_openai_create-completion](#test-21) -- [Test 22: foundry_openai_embeddings-create](#test-22) -- [Test 23: foundry_openai_embeddings-create](#test-23) -- [Test 24: foundry_openai_models-list](#test-24) -- [Test 25: foundry_openai_models-list](#test-25) -- [Test 26: foundry_resource_get](#test-26) -- 
[Test 27: foundry_resource_get](#test-27) -- [Test 28: foundry_resource_get](#test-28) +- [Test 2: foundry_agents_create](#test-2) +- [Test 3: foundry_agents_evaluate](#test-3) +- [Test 4: foundry_agents_get-sdk-sample](#test-4) +- [Test 5: foundry_agents_list](#test-5) +- [Test 6: foundry_agents_list](#test-6) +- [Test 7: foundry_agents_query-and-evaluate](#test-7) +- [Test 8: foundry_knowledge_index_list](#test-8) +- [Test 9: foundry_knowledge_index_list](#test-9) +- [Test 10: foundry_knowledge_index_schema](#test-10) +- [Test 11: foundry_knowledge_index_schema](#test-11) +- [Test 12: foundry_models_deploy](#test-12) +- [Test 13: foundry_models_deployments_list](#test-13) +- [Test 14: foundry_models_deployments_list](#test-14) +- [Test 15: foundry_models_list](#test-15) +- [Test 16: foundry_models_list](#test-16) +- [Test 17: foundry_openai_chat-completions-create](#test-17) +- [Test 18: foundry_openai_create-completion](#test-18) +- [Test 19: foundry_openai_embeddings-create](#test-19) +- [Test 20: foundry_openai_embeddings-create](#test-20) +- [Test 21: foundry_openai_models-list](#test-21) +- [Test 22: foundry_openai_models-list](#test-22) +- [Test 23: foundry_resource_get](#test-23) +- [Test 24: foundry_resource_get](#test-24) +- [Test 25: foundry_resource_get](#test-25) +- [Test 26: foundry_threads_create](#test-26) +- [Test 27: foundry_threads_get-messages](#test-27) +- [Test 28: foundry_threads_list](#test-28) - [Test 29: search_knowledge_base_get](#test-29) - [Test 30: search_knowledge_base_get](#test-30) - [Test 31: search_knowledge_base_get](#test-31) @@ -106,466 +73,11 @@ - [Test 58: speech_stt_recognize](#test-58) - [Test 59: speech_stt_recognize](#test-59) - [Test 60: speech_stt_recognize](#test-60) -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) - [Test 61: speech_stt_recognize](#test-61) - [Test 62: speech_stt_recognize](#test-62) - [Test 63: speech_stt_recognize](#test-63) - [Test 64: speech_stt_recognize](#test-64) 
- [Test 65: speech_stt_recognize](#test-65) -<<<<<<< HEAD -- [Test 66: appconfig_account_list](#test-66) -- [Test 67: appconfig_account_list](#test-67) -- [Test 68: appconfig_account_list](#test-68) -- [Test 69: appconfig_kv_delete](#test-69) -- [Test 70: appconfig_kv_get](#test-70) -- [Test 71: appconfig_kv_get](#test-71) -- [Test 72: appconfig_kv_get](#test-72) -- [Test 73: appconfig_kv_get](#test-73) -- [Test 74: appconfig_kv_lock_set](#test-74) -- [Test 75: appconfig_kv_lock_set](#test-75) -- [Test 76: appconfig_kv_set](#test-76) -- [Test 77: applens_resource_diagnose](#test-77) -- [Test 78: applens_resource_diagnose](#test-78) -- [Test 79: applens_resource_diagnose](#test-79) -======= -<<<<<<< HEAD -- [Test 61: appconfig_account_list](#test-61) -- [Test 62: appconfig_account_list](#test-62) -- [Test 63: appconfig_account_list](#test-63) -- [Test 64: appconfig_kv_delete](#test-64) -- [Test 65: appconfig_kv_get](#test-65) -- [Test 66: appconfig_kv_get](#test-66) -- [Test 67: appconfig_kv_get](#test-67) -- [Test 68: appconfig_kv_get](#test-68) -- [Test 69: appconfig_kv_lock_set](#test-69) -- [Test 70: appconfig_kv_lock_set](#test-70) -- [Test 71: appconfig_kv_set](#test-71) -- [Test 72: applens_resource_diagnose](#test-72) -- [Test 73: applens_resource_diagnose](#test-73) -- [Test 74: applens_resource_diagnose](#test-74) -- [Test 75: appservice_database_add](#test-75) -- [Test 76: appservice_database_add](#test-76) -- [Test 77: appservice_database_add](#test-77) -- [Test 78: appservice_database_add](#test-78) -- [Test 79: appservice_database_add](#test-79) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -- [Test 80: appservice_database_add](#test-80) -- [Test 81: appservice_database_add](#test-81) -- [Test 82: appservice_database_add](#test-82) -- [Test 83: appservice_database_add](#test-83) -- [Test 84: appservice_database_add](#test-84) -- [Test 85: appservice_database_add](#test-85) -- [Test 86: appservice_database_add](#test-86) -- [Test 87: 
appservice_database_add](#test-87) -- [Test 88: appservice_database_add](#test-88) -- [Test 89: appservice_database_add](#test-89) -- [Test 90: applicationinsights_recommendation_list](#test-90) -- [Test 91: applicationinsights_recommendation_list](#test-91) -- [Test 92: applicationinsights_recommendation_list](#test-92) -- [Test 93: applicationinsights_recommendation_list](#test-93) -- [Test 94: extension_cli_generate](#test-94) -- [Test 95: extension_cli_generate](#test-95) -- [Test 96: extension_cli_generate](#test-96) -- [Test 97: extension_cli_install](#test-97) -- [Test 98: extension_cli_install](#test-98) -- [Test 99: extension_cli_install](#test-99) -- [Test 100: acr_registry_list](#test-100) -- [Test 101: acr_registry_list](#test-101) -- [Test 102: acr_registry_list](#test-102) -- [Test 103: acr_registry_list](#test-103) -- [Test 104: acr_registry_list](#test-104) -- [Test 105: acr_registry_repository_list](#test-105) -- [Test 106: acr_registry_repository_list](#test-106) -- [Test 107: acr_registry_repository_list](#test-107) -- [Test 108: acr_registry_repository_list](#test-108) -- [Test 109: communication_email_send](#test-109) -- [Test 110: communication_email_send](#test-110) -- [Test 111: communication_email_send](#test-111) -- [Test 112: communication_email_send](#test-112) -- [Test 113: communication_email_send](#test-113) -- [Test 114: communication_email_send](#test-114) -- [Test 115: communication_email_send](#test-115) -- [Test 116: communication_email_send](#test-116) -- [Test 117: communication_sms_send](#test-117) -- [Test 118: communication_sms_send](#test-118) -- [Test 119: communication_sms_send](#test-119) -- [Test 120: communication_sms_send](#test-120) -- [Test 121: communication_sms_send](#test-121) -- [Test 122: communication_sms_send](#test-122) -- [Test 123: communication_sms_send](#test-123) -- [Test 124: communication_sms_send](#test-124) -- [Test 125: confidentialledger_entries_append](#test-125) -- [Test 126: 
confidentialledger_entries_append](#test-126) -- [Test 127: confidentialledger_entries_append](#test-127) -- [Test 128: confidentialledger_entries_append](#test-128) -- [Test 129: confidentialledger_entries_append](#test-129) -- [Test 130: confidentialledger_entries_get](#test-130) -- [Test 131: confidentialledger_entries_get](#test-131) -- [Test 132: cosmos_account_list](#test-132) -- [Test 133: cosmos_account_list](#test-133) -- [Test 134: cosmos_account_list](#test-134) -- [Test 135: cosmos_database_container_item_query](#test-135) -- [Test 136: cosmos_database_container_list](#test-136) -- [Test 137: cosmos_database_container_list](#test-137) -- [Test 138: cosmos_database_list](#test-138) -- [Test 139: cosmos_database_list](#test-139) -- [Test 140: kusto_cluster_get](#test-140) -- [Test 141: kusto_cluster_list](#test-141) -- [Test 142: kusto_cluster_list](#test-142) -- [Test 143: kusto_cluster_list](#test-143) -- [Test 144: kusto_database_list](#test-144) -- [Test 145: kusto_database_list](#test-145) -- [Test 146: kusto_query](#test-146) -- [Test 147: kusto_sample](#test-147) -- [Test 148: kusto_table_list](#test-148) -- [Test 149: kusto_table_list](#test-149) -- [Test 150: kusto_table_schema](#test-150) -- [Test 151: mysql_database_list](#test-151) -- [Test 152: mysql_database_list](#test-152) -- [Test 153: mysql_database_query](#test-153) -- [Test 154: mysql_server_config_get](#test-154) -- [Test 155: mysql_server_list](#test-155) -- [Test 156: mysql_server_list](#test-156) -- [Test 157: mysql_server_list](#test-157) -- [Test 158: mysql_server_param_get](#test-158) -- [Test 159: mysql_server_param_set](#test-159) -- [Test 160: mysql_table_list](#test-160) -- [Test 161: mysql_table_list](#test-161) -- [Test 162: mysql_table_schema_get](#test-162) -- [Test 163: postgres_database_list](#test-163) -- [Test 164: postgres_database_list](#test-164) -- [Test 165: postgres_database_query](#test-165) -- [Test 166: postgres_server_config_get](#test-166) -- [Test 167: 
postgres_server_list](#test-167) -- [Test 168: postgres_server_list](#test-168) -- [Test 169: postgres_server_list](#test-169) -- [Test 170: postgres_server_param_get](#test-170) -- [Test 171: postgres_server_param_set](#test-171) -- [Test 172: postgres_table_list](#test-172) -- [Test 173: postgres_table_list](#test-173) -- [Test 174: postgres_table_schema_get](#test-174) -- [Test 175: deploy_app_logs_get](#test-175) -- [Test 176: deploy_architecture_diagram_generate](#test-176) -- [Test 177: deploy_iac_rules_get](#test-177) -- [Test 178: deploy_pipeline_guidance_get](#test-178) -- [Test 179: deploy_plan_get](#test-179) -- [Test 180: eventgrid_events_publish](#test-180) -- [Test 181: eventgrid_events_publish](#test-181) -- [Test 182: eventgrid_events_publish](#test-182) -- [Test 183: eventgrid_topic_list](#test-183) -- [Test 184: eventgrid_topic_list](#test-184) -- [Test 185: eventgrid_topic_list](#test-185) -- [Test 186: eventgrid_topic_list](#test-186) -- [Test 187: eventgrid_subscription_list](#test-187) -- [Test 188: eventgrid_subscription_list](#test-188) -- [Test 189: eventgrid_subscription_list](#test-189) -- [Test 190: eventgrid_subscription_list](#test-190) -- [Test 191: eventgrid_subscription_list](#test-191) -- [Test 192: eventgrid_subscription_list](#test-192) -- [Test 193: eventgrid_subscription_list](#test-193) -- [Test 194: eventhubs_eventhub_consumergroup_delete](#test-194) -- [Test 195: eventhubs_eventhub_consumergroup_get](#test-195) -- [Test 196: eventhubs_eventhub_consumergroup_get](#test-196) -- [Test 197: eventhubs_eventhub_consumergroup_update](#test-197) -- [Test 198: eventhubs_eventhub_consumergroup_update](#test-198) -- [Test 199: eventhubs_eventhub_delete](#test-199) -- [Test 200: eventhubs_eventhub_get](#test-200) -- [Test 201: eventhubs_eventhub_get](#test-201) -- [Test 202: eventhubs_eventhub_update](#test-202) -- [Test 203: eventhubs_eventhub_update](#test-203) -- [Test 204: eventhubs_namespace_delete](#test-204) -- [Test 205: 
eventhubs_namespace_get](#test-205) -- [Test 206: eventhubs_namespace_get](#test-206) -- [Test 207: eventhubs_namespace_update](#test-207) -- [Test 208: eventhubs_namespace_update](#test-208) -- [Test 209: functionapp_get](#test-209) -- [Test 210: functionapp_get](#test-210) -- [Test 211: functionapp_get](#test-211) -- [Test 212: functionapp_get](#test-212) -- [Test 213: functionapp_get](#test-213) -- [Test 214: functionapp_get](#test-214) -- [Test 215: functionapp_get](#test-215) -- [Test 216: functionapp_get](#test-216) -- [Test 217: functionapp_get](#test-217) -- [Test 218: functionapp_get](#test-218) -- [Test 219: functionapp_get](#test-219) -- [Test 220: functionapp_get](#test-220) -- [Test 221: keyvault_admin_settings_get](#test-221) -- [Test 222: keyvault_admin_settings_get](#test-222) -- [Test 223: keyvault_admin_settings_get](#test-223) -- [Test 224: keyvault_certificate_create](#test-224) -- [Test 225: keyvault_certificate_create](#test-225) -- [Test 226: keyvault_certificate_create](#test-226) -- [Test 227: keyvault_certificate_create](#test-227) -- [Test 228: keyvault_certificate_create](#test-228) -- [Test 229: keyvault_certificate_get](#test-229) -- [Test 230: keyvault_certificate_get](#test-230) -- [Test 231: keyvault_certificate_get](#test-231) -- [Test 232: keyvault_certificate_get](#test-232) -- [Test 233: keyvault_certificate_get](#test-233) -- [Test 234: keyvault_certificate_import](#test-234) -- [Test 235: keyvault_certificate_import](#test-235) -- [Test 236: keyvault_certificate_import](#test-236) -- [Test 237: keyvault_certificate_import](#test-237) -- [Test 238: keyvault_certificate_import](#test-238) -- [Test 239: keyvault_certificate_list](#test-239) -- [Test 240: keyvault_certificate_list](#test-240) -- [Test 241: keyvault_certificate_list](#test-241) -- [Test 242: keyvault_certificate_list](#test-242) -- [Test 243: keyvault_certificate_list](#test-243) -- [Test 244: keyvault_certificate_list](#test-244) -- [Test 245: 
keyvault_key_create](#test-245) -- [Test 246: keyvault_key_create](#test-246) -- [Test 247: keyvault_key_create](#test-247) -- [Test 248: keyvault_key_create](#test-248) -- [Test 249: keyvault_key_create](#test-249) -- [Test 250: keyvault_key_get](#test-250) -- [Test 251: keyvault_key_get](#test-251) -- [Test 252: keyvault_key_get](#test-252) -- [Test 253: keyvault_key_get](#test-253) -- [Test 254: keyvault_key_get](#test-254) -- [Test 255: keyvault_key_list](#test-255) -- [Test 256: keyvault_key_list](#test-256) -- [Test 257: keyvault_key_list](#test-257) -- [Test 258: keyvault_key_list](#test-258) -- [Test 259: keyvault_key_list](#test-259) -- [Test 260: keyvault_key_list](#test-260) -- [Test 261: keyvault_secret_create](#test-261) -- [Test 262: keyvault_secret_create](#test-262) -- [Test 263: keyvault_secret_create](#test-263) -- [Test 264: keyvault_secret_create](#test-264) -- [Test 265: keyvault_secret_create](#test-265) -- [Test 266: keyvault_secret_get](#test-266) -- [Test 267: keyvault_secret_get](#test-267) -- [Test 268: keyvault_secret_get](#test-268) -- [Test 269: keyvault_secret_get](#test-269) -- [Test 270: keyvault_secret_get](#test-270) -- [Test 271: keyvault_secret_list](#test-271) -- [Test 272: keyvault_secret_list](#test-272) -- [Test 273: keyvault_secret_list](#test-273) -- [Test 274: keyvault_secret_list](#test-274) -- [Test 275: keyvault_secret_list](#test-275) -- [Test 276: keyvault_secret_list](#test-276) -- [Test 277: aks_cluster_get](#test-277) -- [Test 278: aks_cluster_get](#test-278) -- [Test 279: aks_cluster_get](#test-279) -- [Test 280: aks_cluster_get](#test-280) -- [Test 281: aks_cluster_get](#test-281) -- [Test 282: aks_cluster_get](#test-282) -- [Test 283: aks_cluster_get](#test-283) -- [Test 284: aks_nodepool_get](#test-284) -<<<<<<< HEAD -- [Test 285: aks_nodepool_get](#test-285) -- [Test 286: aks_nodepool_get](#test-286) -- [Test 287: aks_nodepool_get](#test-287) -- [Test 288: aks_nodepool_get](#test-288) -- [Test 289: 
aks_nodepool_get](#test-289) -- [Test 290: loadtesting_test_create](#test-290) -- [Test 291: loadtesting_test_get](#test-291) -- [Test 292: loadtesting_testresource_create](#test-292) -- [Test 293: loadtesting_testresource_list](#test-293) -- [Test 294: loadtesting_testrun_create](#test-294) -- [Test 295: loadtesting_testrun_get](#test-295) -- [Test 296: loadtesting_testrun_list](#test-296) -- [Test 297: loadtesting_testrun_update](#test-297) -- [Test 298: grafana_list](#test-298) -- [Test 299: managedlustre_fs_create](#test-299) -- [Test 300: managedlustre_fs_list](#test-300) -- [Test 301: managedlustre_fs_list](#test-301) -- [Test 302: managedlustre_fs_sku_get](#test-302) -- [Test 303: managedlustre_fs_subnetsize_ask](#test-303) -- [Test 304: managedlustre_fs_subnetsize_validate](#test-304) -- [Test 305: managedlustre_fs_update](#test-305) -- [Test 306: marketplace_product_get](#test-306) -- [Test 307: marketplace_product_list](#test-307) -- [Test 308: marketplace_product_list](#test-308) -- [Test 309: azureaibestpractices_get](#test-309) -- [Test 310: azureaibestpractices_get](#test-310) -- [Test 311: azureaibestpractices_get](#test-311) -- [Test 312: azureaibestpractices_get](#test-312) -- [Test 313: azureaibestpractices_get](#test-313) -======= -- [Test 285: loadtesting_test_create](#test-285) -- [Test 286: loadtesting_test_get](#test-286) -- [Test 287: loadtesting_testresource_create](#test-287) -- [Test 288: loadtesting_testresource_list](#test-288) -- [Test 289: loadtesting_testrun_create](#test-289) -- [Test 290: loadtesting_testrun_get](#test-290) -- [Test 291: loadtesting_testrun_list](#test-291) -- [Test 292: loadtesting_testrun_update](#test-292) -- [Test 293: grafana_list](#test-293) -- [Test 294: managedlustre_fs_create](#test-294) -- [Test 295: managedlustre_fs_list](#test-295) -- [Test 296: managedlustre_fs_list](#test-296) -- [Test 297: managedlustre_fs_sku_get](#test-297) -- [Test 298: managedlustre_fs_subnetsize_ask](#test-298) -- [Test 299: 
managedlustre_fs_subnetsize_validate](#test-299) -- [Test 300: managedlustre_fs_update](#test-300) -- [Test 301: marketplace_product_get](#test-301) -- [Test 302: marketplace_product_list](#test-302) -- [Test 303: marketplace_product_list](#test-303) -- [Test 304: get_bestpractices_get](#test-304) -- [Test 305: get_bestpractices_get](#test-305) -- [Test 306: get_bestpractices_get](#test-306) -- [Test 307: get_bestpractices_get](#test-307) -- [Test 308: get_bestpractices_get](#test-308) -- [Test 309: get_bestpractices_get](#test-309) -- [Test 310: get_bestpractices_get](#test-310) -- [Test 311: get_bestpractices_get](#test-311) -- [Test 312: get_bestpractices_get](#test-312) -- [Test 313: monitor_activitylog_list](#test-313) -- [Test 314: monitor_healthmodels_entity_get](#test-314) -- [Test 315: monitor_metrics_definitions](#test-315) -- [Test 316: monitor_metrics_definitions](#test-316) -- [Test 317: monitor_metrics_definitions](#test-317) -- [Test 318: monitor_metrics_query](#test-318) -- [Test 319: monitor_metrics_query](#test-319) -- [Test 320: monitor_metrics_query](#test-320) -- [Test 321: monitor_metrics_query](#test-321) -- [Test 322: monitor_metrics_query](#test-322) -- [Test 323: monitor_metrics_query](#test-323) -- [Test 324: monitor_resource_log_query](#test-324) -- [Test 325: monitor_table_list](#test-325) -- [Test 326: monitor_table_list](#test-326) -- [Test 327: monitor_table_type_list](#test-327) -- [Test 328: monitor_table_type_list](#test-328) -- [Test 329: monitor_webtests_create](#test-329) -- [Test 330: monitor_webtests_get](#test-330) -- [Test 331: monitor_webtests_list](#test-331) -- [Test 332: monitor_webtests_list](#test-332) -- [Test 333: monitor_webtests_update](#test-333) -- [Test 334: monitor_workspace_list](#test-334) -- [Test 335: monitor_workspace_list](#test-335) -- [Test 336: monitor_workspace_list](#test-336) -- [Test 337: monitor_workspace_log_query](#test-337) -- [Test 338: datadog_monitoredresources_list](#test-338) -- [Test 
339: datadog_monitoredresources_list](#test-339) -- [Test 340: extension_azqr](#test-340) -- [Test 341: extension_azqr](#test-341) -- [Test 342: extension_azqr](#test-342) -- [Test 343: quota_region_availability_list](#test-343) -- [Test 344: quota_usage_check](#test-344) -- [Test 345: role_assignment_list](#test-345) -- [Test 346: role_assignment_list](#test-346) -- [Test 347: redis_list](#test-347) -- [Test 348: redis_list](#test-348) -- [Test 349: redis_list](#test-349) -- [Test 350: redis_list](#test-350) -- [Test 351: redis_list](#test-351) -- [Test 352: group_list](#test-352) -- [Test 353: group_list](#test-353) -- [Test 354: group_list](#test-354) -- [Test 355: resourcehealth_availability-status_get](#test-355) -- [Test 356: resourcehealth_availability-status_get](#test-356) -- [Test 357: resourcehealth_availability-status_get](#test-357) -- [Test 358: resourcehealth_availability-status_list](#test-358) -- [Test 359: resourcehealth_availability-status_list](#test-359) -- [Test 360: resourcehealth_availability-status_list](#test-360) -- [Test 361: resourcehealth_health-events_list](#test-361) -- [Test 362: resourcehealth_health-events_list](#test-362) -- [Test 363: resourcehealth_health-events_list](#test-363) -- [Test 364: resourcehealth_health-events_list](#test-364) -- [Test 365: resourcehealth_health-events_list](#test-365) -- [Test 366: servicebus_queue_details](#test-366) -- [Test 367: servicebus_topic_details](#test-367) -- [Test 368: servicebus_topic_subscription_details](#test-368) -- [Test 369: signalr_runtime_get](#test-369) -- [Test 370: signalr_runtime_get](#test-370) -- [Test 371: signalr_runtime_get](#test-371) -- [Test 372: signalr_runtime_get](#test-372) -- [Test 373: signalr_runtime_get](#test-373) -- [Test 374: signalr_runtime_get](#test-374) -- [Test 375: sql_db_create](#test-375) -- [Test 376: sql_db_create](#test-376) -- [Test 377: sql_db_create](#test-377) -- [Test 378: sql_db_delete](#test-378) -- [Test 379: sql_db_delete](#test-379) 
-- [Test 380: sql_db_delete](#test-380) -- [Test 381: sql_db_list](#test-381) -- [Test 382: sql_db_list](#test-382) -- [Test 383: sql_db_rename](#test-383) -- [Test 384: sql_db_rename](#test-384) -- [Test 385: sql_db_show](#test-385) -- [Test 386: sql_db_show](#test-386) -- [Test 387: sql_db_update](#test-387) -- [Test 388: sql_db_update](#test-388) -- [Test 389: sql_elastic-pool_list](#test-389) -- [Test 390: sql_elastic-pool_list](#test-390) -- [Test 391: sql_elastic-pool_list](#test-391) -- [Test 392: sql_server_create](#test-392) -- [Test 393: sql_server_create](#test-393) -- [Test 394: sql_server_create](#test-394) -- [Test 395: sql_server_delete](#test-395) -- [Test 396: sql_server_delete](#test-396) -- [Test 397: sql_server_delete](#test-397) -- [Test 398: sql_server_entra-admin_list](#test-398) -- [Test 399: sql_server_entra-admin_list](#test-399) -- [Test 400: sql_server_entra-admin_list](#test-400) -- [Test 401: sql_server_firewall-rule_create](#test-401) -- [Test 402: sql_server_firewall-rule_create](#test-402) -- [Test 403: sql_server_firewall-rule_create](#test-403) -- [Test 404: sql_server_firewall-rule_delete](#test-404) -- [Test 405: sql_server_firewall-rule_delete](#test-405) -- [Test 406: sql_server_firewall-rule_delete](#test-406) -- [Test 407: sql_server_firewall-rule_list](#test-407) -- [Test 408: sql_server_firewall-rule_list](#test-408) -- [Test 409: sql_server_firewall-rule_list](#test-409) -- [Test 410: sql_server_list](#test-410) -- [Test 411: sql_server_list](#test-411) -- [Test 412: sql_server_show](#test-412) -- [Test 413: sql_server_show](#test-413) -- [Test 414: sql_server_show](#test-414) -- [Test 415: storage_account_create](#test-415) -- [Test 416: storage_account_create](#test-416) -- [Test 417: storage_account_create](#test-417) -- [Test 418: storage_account_get](#test-418) -- [Test 419: storage_account_get](#test-419) -- [Test 420: storage_account_get](#test-420) -- [Test 421: storage_account_get](#test-421) -- [Test 422: 
storage_account_get](#test-422) -- [Test 423: storage_blob_container_create](#test-423) -- [Test 424: storage_blob_container_create](#test-424) -- [Test 425: storage_blob_container_create](#test-425) -- [Test 426: storage_blob_container_get](#test-426) -- [Test 427: storage_blob_container_get](#test-427) -- [Test 428: storage_blob_container_get](#test-428) -- [Test 429: storage_blob_get](#test-429) -- [Test 430: storage_blob_get](#test-430) -- [Test 431: storage_blob_get](#test-431) -- [Test 432: storage_blob_get](#test-432) -- [Test 433: storage_blob_upload](#test-433) -- [Test 434: subscription_list](#test-434) -- [Test 435: subscription_list](#test-435) -- [Test 436: subscription_list](#test-436) -- [Test 437: subscription_list](#test-437) -- [Test 438: azureterraformbestpractices_get](#test-438) -- [Test 439: azureterraformbestpractices_get](#test-439) -- [Test 440: virtualdesktop_hostpool_list](#test-440) -- [Test 441: virtualdesktop_hostpool_host_list](#test-441) -- [Test 442: virtualdesktop_hostpool_host_user-list](#test-442) -- [Test 443: workbooks_create](#test-443) -- [Test 444: workbooks_delete](#test-444) -- [Test 445: workbooks_list](#test-445) -- [Test 446: workbooks_list](#test-446) -- [Test 447: workbooks_show](#test-447) -- [Test 448: workbooks_show](#test-448) -- [Test 449: workbooks_update](#test-449) -- [Test 450: bicepschema_get](#test-450) -- [Test 451: cloudarchitect_design](#test-451) -- [Test 452: cloudarchitect_design](#test-452) -- [Test 453: cloudarchitect_design](#test-453) -- [Test 454: cloudarchitect_design](#test-454) -======= -- [Test 61: speech_tts_synthesize](#test-61) -- [Test 62: speech_tts_synthesize](#test-62) -- [Test 63: speech_tts_synthesize](#test-63) -- [Test 64: speech_tts_synthesize](#test-64) -- [Test 65: speech_tts_synthesize](#test-65) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) - [Test 66: speech_tts_synthesize](#test-66) - [Test 67: speech_tts_synthesize](#test-67) - [Test 68: 
speech_tts_synthesize](#test-68) @@ -795,33 +307,6 @@ - [Test 292: aks_cluster_get](#test-292) - [Test 293: aks_cluster_get](#test-293) - [Test 294: aks_nodepool_get](#test-294) -<<<<<<< HEAD -- [Test 295: loadtesting_test_create](#test-295) -- [Test 296: loadtesting_test_get](#test-296) -- [Test 297: loadtesting_testresource_create](#test-297) -- [Test 298: loadtesting_testresource_list](#test-298) -- [Test 299: loadtesting_testrun_create](#test-299) -- [Test 300: loadtesting_testrun_get](#test-300) -- [Test 301: loadtesting_testrun_list](#test-301) -- [Test 302: loadtesting_testrun_update](#test-302) -- [Test 303: grafana_list](#test-303) -- [Test 304: managedlustre_fs_create](#test-304) -- [Test 305: managedlustre_fs_list](#test-305) -- [Test 306: managedlustre_fs_list](#test-306) -- [Test 307: managedlustre_fs_sku_get](#test-307) -- [Test 308: managedlustre_fs_subnetsize_ask](#test-308) -- [Test 309: managedlustre_fs_subnetsize_validate](#test-309) -- [Test 310: managedlustre_fs_update](#test-310) -- [Test 311: marketplace_product_get](#test-311) -- [Test 312: marketplace_product_list](#test-312) -- [Test 313: marketplace_product_list](#test-313) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -- [Test 314: get_bestpractices_get](#test-314) -- [Test 315: get_bestpractices_get](#test-315) -- [Test 316: get_bestpractices_get](#test-316) -- [Test 317: get_bestpractices_get](#test-317) -- [Test 318: get_bestpractices_get](#test-318) -======= - [Test 295: aks_nodepool_get](#test-295) - [Test 296: aks_nodepool_get](#test-296) - [Test 297: aks_nodepool_get](#test-297) @@ -846,245 +331,162 @@ - [Test 316: marketplace_product_get](#test-316) - [Test 317: marketplace_product_list](#test-317) - [Test 318: marketplace_product_list](#test-318) ->>>>>>> e2fd2eac (refactor tts mcp tool) -- [Test 319: get_bestpractices_get](#test-319) -- [Test 320: get_bestpractices_get](#test-320) -- [Test 321: get_bestpractices_get](#test-321) -- [Test 322: 
get_bestpractices_get](#test-322) -- [Test 323: get_bestpractices_get](#test-323) +- [Test 319: azureaibestpractices_get](#test-319) +- [Test 320: azureaibestpractices_get](#test-320) +- [Test 321: azureaibestpractices_get](#test-321) +- [Test 322: azureaibestpractices_get](#test-322) +- [Test 323: azureaibestpractices_get](#test-323) - [Test 324: get_bestpractices_get](#test-324) - [Test 325: get_bestpractices_get](#test-325) - [Test 326: get_bestpractices_get](#test-326) - [Test 327: get_bestpractices_get](#test-327) -- [Test 328: monitor_activitylog_list](#test-328) -- [Test 329: monitor_healthmodels_entity_get](#test-329) -- [Test 330: monitor_metrics_definitions](#test-330) -- [Test 331: monitor_metrics_definitions](#test-331) -- [Test 332: monitor_metrics_definitions](#test-332) -- [Test 333: monitor_metrics_query](#test-333) -- [Test 334: monitor_metrics_query](#test-334) -- [Test 335: monitor_metrics_query](#test-335) -- [Test 336: monitor_metrics_query](#test-336) -- [Test 337: monitor_metrics_query](#test-337) +- [Test 328: get_bestpractices_get](#test-328) +- [Test 329: get_bestpractices_get](#test-329) +- [Test 330: get_bestpractices_get](#test-330) +- [Test 331: get_bestpractices_get](#test-331) +- [Test 332: get_bestpractices_get](#test-332) +- [Test 333: monitor_activitylog_list](#test-333) +- [Test 334: monitor_healthmodels_entity_get](#test-334) +- [Test 335: monitor_metrics_definitions](#test-335) +- [Test 336: monitor_metrics_definitions](#test-336) +- [Test 337: monitor_metrics_definitions](#test-337) - [Test 338: monitor_metrics_query](#test-338) -- [Test 339: monitor_resource_log_query](#test-339) -- [Test 340: monitor_table_list](#test-340) -- [Test 341: monitor_table_list](#test-341) -- [Test 342: monitor_table_type_list](#test-342) -- [Test 343: monitor_table_type_list](#test-343) -- [Test 344: monitor_webtests_create](#test-344) -- [Test 345: monitor_webtests_get](#test-345) -- [Test 346: monitor_webtests_list](#test-346) -- [Test 347: 
monitor_webtests_list](#test-347) -- [Test 348: monitor_webtests_update](#test-348) -- [Test 349: monitor_workspace_list](#test-349) -- [Test 350: monitor_workspace_list](#test-350) -- [Test 351: monitor_workspace_list](#test-351) -- [Test 352: monitor_workspace_log_query](#test-352) -- [Test 353: datadog_monitoredresources_list](#test-353) -- [Test 354: datadog_monitoredresources_list](#test-354) -- [Test 355: extension_azqr](#test-355) -- [Test 356: extension_azqr](#test-356) -- [Test 357: extension_azqr](#test-357) -- [Test 358: quota_region_availability_list](#test-358) -- [Test 359: quota_usage_check](#test-359) -- [Test 360: role_assignment_list](#test-360) -- [Test 361: role_assignment_list](#test-361) -- [Test 362: redis_list](#test-362) -- [Test 363: redis_list](#test-363) -- [Test 364: redis_list](#test-364) -- [Test 365: redis_list](#test-365) -- [Test 366: redis_list](#test-366) -- [Test 367: group_list](#test-367) -- [Test 368: group_list](#test-368) -- [Test 369: group_list](#test-369) -- [Test 370: resourcehealth_availability-status_get](#test-370) -- [Test 371: resourcehealth_availability-status_get](#test-371) -- [Test 372: resourcehealth_availability-status_get](#test-372) -- [Test 373: resourcehealth_availability-status_list](#test-373) -- [Test 374: resourcehealth_availability-status_list](#test-374) -- [Test 375: resourcehealth_availability-status_list](#test-375) -- [Test 376: resourcehealth_health-events_list](#test-376) -- [Test 377: resourcehealth_health-events_list](#test-377) -- [Test 378: resourcehealth_health-events_list](#test-378) -- [Test 379: resourcehealth_health-events_list](#test-379) -- [Test 380: resourcehealth_health-events_list](#test-380) -- [Test 381: servicebus_queue_details](#test-381) -- [Test 382: servicebus_topic_details](#test-382) -- [Test 383: servicebus_topic_subscription_details](#test-383) -- [Test 384: signalr_runtime_get](#test-384) -<<<<<<< HEAD -- [Test 385: sql_db_create](#test-385) -- [Test 386: 
sql_db_create](#test-386) -- [Test 387: sql_db_create](#test-387) -- [Test 388: sql_db_delete](#test-388) -- [Test 389: sql_db_delete](#test-389) -- [Test 390: sql_db_delete](#test-390) -- [Test 391: sql_db_list](#test-391) -- [Test 392: sql_db_list](#test-392) -- [Test 393: sql_db_rename](#test-393) -- [Test 394: sql_db_rename](#test-394) -- [Test 395: sql_db_show](#test-395) -- [Test 396: sql_db_show](#test-396) -- [Test 397: sql_db_update](#test-397) -- [Test 398: sql_db_update](#test-398) -- [Test 399: sql_elastic-pool_list](#test-399) -- [Test 400: sql_elastic-pool_list](#test-400) -- [Test 401: sql_elastic-pool_list](#test-401) -- [Test 402: sql_server_create](#test-402) -- [Test 403: sql_server_create](#test-403) -- [Test 404: sql_server_create](#test-404) -- [Test 405: sql_server_delete](#test-405) -- [Test 406: sql_server_delete](#test-406) -- [Test 407: sql_server_delete](#test-407) -- [Test 408: sql_server_entra-admin_list](#test-408) -- [Test 409: sql_server_entra-admin_list](#test-409) -- [Test 410: sql_server_entra-admin_list](#test-410) -- [Test 411: sql_server_firewall-rule_create](#test-411) -- [Test 412: sql_server_firewall-rule_create](#test-412) -- [Test 413: sql_server_firewall-rule_create](#test-413) -- [Test 414: sql_server_firewall-rule_delete](#test-414) -- [Test 415: sql_server_firewall-rule_delete](#test-415) -- [Test 416: sql_server_firewall-rule_delete](#test-416) -- [Test 417: sql_server_firewall-rule_list](#test-417) -- [Test 418: sql_server_firewall-rule_list](#test-418) -- [Test 419: sql_server_firewall-rule_list](#test-419) -- [Test 420: sql_server_list](#test-420) -- [Test 421: sql_server_list](#test-421) -- [Test 422: sql_server_show](#test-422) -- [Test 423: sql_server_show](#test-423) -- [Test 424: sql_server_show](#test-424) -- [Test 425: storage_account_create](#test-425) -- [Test 426: storage_account_create](#test-426) -- [Test 427: storage_account_create](#test-427) -- [Test 428: storage_account_get](#test-428) -- [Test 
429: storage_account_get](#test-429) -- [Test 430: storage_account_get](#test-430) -- [Test 431: storage_account_get](#test-431) -- [Test 432: storage_account_get](#test-432) -- [Test 433: storage_blob_container_create](#test-433) -- [Test 434: storage_blob_container_create](#test-434) -- [Test 435: storage_blob_container_create](#test-435) -- [Test 436: storage_blob_container_get](#test-436) -- [Test 437: storage_blob_container_get](#test-437) -- [Test 438: storage_blob_container_get](#test-438) -- [Test 439: storage_blob_get](#test-439) -- [Test 440: storage_blob_get](#test-440) -- [Test 441: storage_blob_get](#test-441) -- [Test 442: storage_blob_get](#test-442) -- [Test 443: storage_blob_upload](#test-443) -- [Test 444: subscription_list](#test-444) -- [Test 445: subscription_list](#test-445) -- [Test 446: subscription_list](#test-446) -- [Test 447: subscription_list](#test-447) -- [Test 448: azureterraformbestpractices_get](#test-448) -- [Test 449: azureterraformbestpractices_get](#test-449) -- [Test 450: virtualdesktop_hostpool_list](#test-450) -- [Test 451: virtualdesktop_hostpool_host_list](#test-451) -- [Test 452: virtualdesktop_hostpool_host_user-list](#test-452) -- [Test 453: workbooks_create](#test-453) -- [Test 454: workbooks_delete](#test-454) -- [Test 455: workbooks_list](#test-455) -- [Test 456: workbooks_list](#test-456) -- [Test 457: workbooks_show](#test-457) -- [Test 458: workbooks_show](#test-458) -- [Test 459: workbooks_update](#test-459) -- [Test 460: bicepschema_get](#test-460) -- [Test 461: cloudarchitect_design](#test-461) -- [Test 462: cloudarchitect_design](#test-462) -- [Test 463: cloudarchitect_design](#test-463) -- [Test 464: cloudarchitect_design](#test-464) -<<<<<<< HEAD -======= ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -- [Test 385: signalr_runtime_get](#test-385) -- [Test 386: signalr_runtime_get](#test-386) -- [Test 387: 
signalr_runtime_get](#test-387) -- [Test 388: signalr_runtime_get](#test-388) +- [Test 339: monitor_metrics_query](#test-339) +- [Test 340: monitor_metrics_query](#test-340) +- [Test 341: monitor_metrics_query](#test-341) +- [Test 342: monitor_metrics_query](#test-342) +- [Test 343: monitor_metrics_query](#test-343) +- [Test 344: monitor_resource_log_query](#test-344) +- [Test 345: monitor_table_list](#test-345) +- [Test 346: monitor_table_list](#test-346) +- [Test 347: monitor_table_type_list](#test-347) +- [Test 348: monitor_table_type_list](#test-348) +- [Test 349: monitor_webtests_create](#test-349) +- [Test 350: monitor_webtests_get](#test-350) +- [Test 351: monitor_webtests_list](#test-351) +- [Test 352: monitor_webtests_list](#test-352) +- [Test 353: monitor_webtests_update](#test-353) +- [Test 354: monitor_workspace_list](#test-354) +- [Test 355: monitor_workspace_list](#test-355) +- [Test 356: monitor_workspace_list](#test-356) +- [Test 357: monitor_workspace_log_query](#test-357) +- [Test 358: datadog_monitoredresources_list](#test-358) +- [Test 359: datadog_monitoredresources_list](#test-359) +- [Test 360: extension_azqr](#test-360) +- [Test 361: extension_azqr](#test-361) +- [Test 362: extension_azqr](#test-362) +- [Test 363: quota_region_availability_list](#test-363) +- [Test 364: quota_usage_check](#test-364) +- [Test 365: role_assignment_list](#test-365) +- [Test 366: role_assignment_list](#test-366) +- [Test 367: redis_list](#test-367) +- [Test 368: redis_list](#test-368) +- [Test 369: redis_list](#test-369) +- [Test 370: redis_list](#test-370) +- [Test 371: redis_list](#test-371) +- [Test 372: group_list](#test-372) +- [Test 373: group_list](#test-373) +- [Test 374: group_list](#test-374) +- [Test 375: resourcehealth_availability-status_get](#test-375) +- [Test 376: resourcehealth_availability-status_get](#test-376) +- [Test 377: resourcehealth_availability-status_get](#test-377) +- [Test 378: resourcehealth_availability-status_list](#test-378) +- 
[Test 379: resourcehealth_availability-status_list](#test-379) +- [Test 380: resourcehealth_availability-status_list](#test-380) +- [Test 381: resourcehealth_health-events_list](#test-381) +- [Test 382: resourcehealth_health-events_list](#test-382) +- [Test 383: resourcehealth_health-events_list](#test-383) +- [Test 384: resourcehealth_health-events_list](#test-384) +- [Test 385: resourcehealth_health-events_list](#test-385) +- [Test 386: servicebus_queue_details](#test-386) +- [Test 387: servicebus_topic_details](#test-387) +- [Test 388: servicebus_topic_subscription_details](#test-388) - [Test 389: signalr_runtime_get](#test-389) -- [Test 390: sql_db_create](#test-390) -- [Test 391: sql_db_create](#test-391) -- [Test 392: sql_db_create](#test-392) -- [Test 393: sql_db_delete](#test-393) -- [Test 394: sql_db_delete](#test-394) -- [Test 395: sql_db_delete](#test-395) -- [Test 396: sql_db_list](#test-396) -- [Test 397: sql_db_list](#test-397) -- [Test 398: sql_db_rename](#test-398) -- [Test 399: sql_db_rename](#test-399) -- [Test 400: sql_db_show](#test-400) -- [Test 401: sql_db_show](#test-401) -- [Test 402: sql_db_update](#test-402) -- [Test 403: sql_db_update](#test-403) -- [Test 404: sql_elastic-pool_list](#test-404) -- [Test 405: sql_elastic-pool_list](#test-405) -- [Test 406: sql_elastic-pool_list](#test-406) -- [Test 407: sql_server_create](#test-407) -- [Test 408: sql_server_create](#test-408) -- [Test 409: sql_server_create](#test-409) -- [Test 410: sql_server_delete](#test-410) -- [Test 411: sql_server_delete](#test-411) -- [Test 412: sql_server_delete](#test-412) -- [Test 413: sql_server_entra-admin_list](#test-413) -- [Test 414: sql_server_entra-admin_list](#test-414) -- [Test 415: sql_server_entra-admin_list](#test-415) -- [Test 416: sql_server_firewall-rule_create](#test-416) -- [Test 417: sql_server_firewall-rule_create](#test-417) -- [Test 418: sql_server_firewall-rule_create](#test-418) -- [Test 419: sql_server_firewall-rule_delete](#test-419) -- 
[Test 420: sql_server_firewall-rule_delete](#test-420) -- [Test 421: sql_server_firewall-rule_delete](#test-421) -- [Test 422: sql_server_firewall-rule_list](#test-422) -- [Test 423: sql_server_firewall-rule_list](#test-423) -- [Test 424: sql_server_firewall-rule_list](#test-424) -- [Test 425: sql_server_list](#test-425) -- [Test 426: sql_server_list](#test-426) -- [Test 427: sql_server_show](#test-427) -- [Test 428: sql_server_show](#test-428) -- [Test 429: sql_server_show](#test-429) -- [Test 430: storage_account_create](#test-430) -- [Test 431: storage_account_create](#test-431) -- [Test 432: storage_account_create](#test-432) -- [Test 433: storage_account_get](#test-433) -- [Test 434: storage_account_get](#test-434) -- [Test 435: storage_account_get](#test-435) -- [Test 436: storage_account_get](#test-436) -- [Test 437: storage_account_get](#test-437) -- [Test 438: storage_blob_container_create](#test-438) -- [Test 439: storage_blob_container_create](#test-439) -- [Test 440: storage_blob_container_create](#test-440) -- [Test 441: storage_blob_container_get](#test-441) -- [Test 442: storage_blob_container_get](#test-442) -- [Test 443: storage_blob_container_get](#test-443) -- [Test 444: storage_blob_get](#test-444) -- [Test 445: storage_blob_get](#test-445) -- [Test 446: storage_blob_get](#test-446) -- [Test 447: storage_blob_get](#test-447) -- [Test 448: storage_blob_upload](#test-448) -- [Test 449: subscription_list](#test-449) -- [Test 450: subscription_list](#test-450) -- [Test 451: subscription_list](#test-451) -- [Test 452: subscription_list](#test-452) -- [Test 453: azureterraformbestpractices_get](#test-453) -- [Test 454: azureterraformbestpractices_get](#test-454) -- [Test 455: virtualdesktop_hostpool_list](#test-455) -- [Test 456: virtualdesktop_hostpool_host_list](#test-456) -- [Test 457: virtualdesktop_hostpool_host_user-list](#test-457) -- [Test 458: workbooks_create](#test-458) -- [Test 459: workbooks_delete](#test-459) -- [Test 460: 
workbooks_list](#test-460) -- [Test 461: workbooks_list](#test-461) -- [Test 462: workbooks_show](#test-462) -- [Test 463: workbooks_show](#test-463) -- [Test 464: workbooks_update](#test-464) -- [Test 465: bicepschema_get](#test-465) -- [Test 466: cloudarchitect_design](#test-466) -- [Test 467: cloudarchitect_design](#test-467) -- [Test 468: cloudarchitect_design](#test-468) -- [Test 469: cloudarchitect_design](#test-469) ->>>>>>> e2fd2eac (refactor tts mcp tool) +- [Test 390: signalr_runtime_get](#test-390) +- [Test 391: signalr_runtime_get](#test-391) +- [Test 392: signalr_runtime_get](#test-392) +- [Test 393: signalr_runtime_get](#test-393) +- [Test 394: signalr_runtime_get](#test-394) +- [Test 395: sql_db_create](#test-395) +- [Test 396: sql_db_create](#test-396) +- [Test 397: sql_db_create](#test-397) +- [Test 398: sql_db_delete](#test-398) +- [Test 399: sql_db_delete](#test-399) +- [Test 400: sql_db_delete](#test-400) +- [Test 401: sql_db_list](#test-401) +- [Test 402: sql_db_list](#test-402) +- [Test 403: sql_db_rename](#test-403) +- [Test 404: sql_db_rename](#test-404) +- [Test 405: sql_db_show](#test-405) +- [Test 406: sql_db_show](#test-406) +- [Test 407: sql_db_update](#test-407) +- [Test 408: sql_db_update](#test-408) +- [Test 409: sql_elastic-pool_list](#test-409) +- [Test 410: sql_elastic-pool_list](#test-410) +- [Test 411: sql_elastic-pool_list](#test-411) +- [Test 412: sql_server_create](#test-412) +- [Test 413: sql_server_create](#test-413) +- [Test 414: sql_server_create](#test-414) +- [Test 415: sql_server_delete](#test-415) +- [Test 416: sql_server_delete](#test-416) +- [Test 417: sql_server_delete](#test-417) +- [Test 418: sql_server_entra-admin_list](#test-418) +- [Test 419: sql_server_entra-admin_list](#test-419) +- [Test 420: sql_server_entra-admin_list](#test-420) +- [Test 421: sql_server_firewall-rule_create](#test-421) +- [Test 422: sql_server_firewall-rule_create](#test-422) +- [Test 423: sql_server_firewall-rule_create](#test-423) +- 
[Test 424: sql_server_firewall-rule_delete](#test-424) +- [Test 425: sql_server_firewall-rule_delete](#test-425) +- [Test 426: sql_server_firewall-rule_delete](#test-426) +- [Test 427: sql_server_firewall-rule_list](#test-427) +- [Test 428: sql_server_firewall-rule_list](#test-428) +- [Test 429: sql_server_firewall-rule_list](#test-429) +- [Test 430: sql_server_list](#test-430) +- [Test 431: sql_server_list](#test-431) +- [Test 432: sql_server_show](#test-432) +- [Test 433: sql_server_show](#test-433) +- [Test 434: sql_server_show](#test-434) +- [Test 435: storage_account_create](#test-435) +- [Test 436: storage_account_create](#test-436) +- [Test 437: storage_account_create](#test-437) +- [Test 438: storage_account_get](#test-438) +- [Test 439: storage_account_get](#test-439) +- [Test 440: storage_account_get](#test-440) +- [Test 441: storage_account_get](#test-441) +- [Test 442: storage_account_get](#test-442) +- [Test 443: storage_blob_container_create](#test-443) +- [Test 444: storage_blob_container_create](#test-444) +- [Test 445: storage_blob_container_create](#test-445) +- [Test 446: storage_blob_container_get](#test-446) +- [Test 447: storage_blob_container_get](#test-447) +- [Test 448: storage_blob_container_get](#test-448) +- [Test 449: storage_blob_get](#test-449) +- [Test 450: storage_blob_get](#test-450) +- [Test 451: storage_blob_get](#test-451) +- [Test 452: storage_blob_get](#test-452) +- [Test 453: storage_blob_upload](#test-453) +- [Test 454: subscription_list](#test-454) +- [Test 455: subscription_list](#test-455) +- [Test 456: subscription_list](#test-456) +- [Test 457: subscription_list](#test-457) +- [Test 458: azureterraformbestpractices_get](#test-458) +- [Test 459: azureterraformbestpractices_get](#test-459) +- [Test 460: virtualdesktop_hostpool_list](#test-460) +- [Test 461: virtualdesktop_hostpool_host_list](#test-461) +- [Test 462: virtualdesktop_hostpool_host_user-list](#test-462) +- [Test 463: workbooks_create](#test-463) +- [Test 464: 
workbooks_delete](#test-464) +- [Test 465: workbooks_list](#test-465) +- [Test 466: workbooks_list](#test-466) +- [Test 467: workbooks_show](#test-467) +- [Test 468: workbooks_show](#test-468) +- [Test 469: workbooks_update](#test-469) +- [Test 470: bicepschema_get](#test-470) +- [Test 471: cloudarchitect_design](#test-471) +- [Test 472: cloudarchitect_design](#test-472) +- [Test 473: cloudarchitect_design](#test-473) +- [Test 474: cloudarchitect_design](#test-474) --- @@ -1098,36 +500,32 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.705410 | `foundry_agents_connect` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.663468 | `foundry_agents_list` | ❌ | -| 3 | 0.617213 | `foundry_resource_get` | ❌ | -| 4 | 0.548044 | `foundry_openai_models-list` | ❌ | -| 5 | 0.547459 | `foundry_agents_get-sdk-sample` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.663568 | `foundry_agents_list` | ❌ | -| 3 | 0.617213 | `foundry_resource_get` | ❌ | -| 4 | 0.548044 | `foundry_openai_models-list` | ❌ | -| 5 | 0.537580 | `foundry_agents_query-and-evaluate` | ❌ | -======= -| 2 | 0.617213 | `foundry_resource_get` | ❌ | -| 3 | 0.592487 | `foundry_agents_list` | ❌ | -| 4 | 0.537591 | `foundry_agents_query-and-evaluate` | ❌ | -| 5 | 0.536533 | `search_index_query` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 2 | 0.663568 | `foundry_agents_list` | ❌ | | 3 | 0.617213 | `foundry_resource_get` | ❌ | | 4 | 0.548108 | `foundry_agents_get-sdk-sample` | ❌ | | 5 | 0.548044 | `foundry_openai_models-list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- ## Test 2 +**Expected Tool:** `foundry_agents_create` +**Prompt:** Create a new Azure AI Foundry agent using instructions in the active editor + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.586996 | `foundry_agents_create` | ✅ **EXPECTED** | +| 2 | 0.562087 | 
`foundry_agents_get-sdk-sample` | ❌ | +| 3 | 0.554009 | `foundry_threads_create` | ❌ | +| 4 | 0.525727 | `foundry_models_deploy` | ❌ | +| 5 | 0.525615 | `foundry_agents_list` | ❌ | + +--- + +## Test 3 + **Expected Tool:** `foundry_agents_evaluate` **Prompt:** Evaluate the full query and response I got from my agent for task_adherence @@ -1135,32 +533,32 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.543045 | `foundry_agents_query-and-evaluate` | ❌ | -| 2 | 0.469272 | `foundry_agents_evaluate` | ✅ **EXPECTED** | -| 3 | 0.445585 | `foundry_agents_connect` | ❌ | -| 4 | 0.298494 | `foundry_threads_list` | ❌ | -| 5 | 0.279058 | `foundry_agents_list` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.544099 | `foundry_agents_query-and-evaluate` | ❌ | | 2 | 0.469428 | `foundry_agents_evaluate` | ✅ **EXPECTED** | | 3 | 0.445964 | `foundry_agents_connect` | ❌ | -<<<<<<< HEAD -| 4 | 0.278921 | `foundry_agents_list` | ❌ | -| 5 | 0.250023 | `monitor_workspace_log_query` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 4 | 0.297986 | `foundry_threads_list` | ❌ | | 5 | 0.278921 | `foundry_agents_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- -## Test 3 +## Test 4 + +**Expected Tool:** `foundry_agents_get-sdk-sample` +**Prompt:** Create a CLI app that can talk to an Azure AI Foundry Agent using Python SDK + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.595766 | `foundry_agents_get-sdk-sample` | ✅ **EXPECTED** | +| 2 | 0.552180 | `foundry_threads_create` | ❌ | +| 3 | 0.521920 | `foundry_agents_connect` | ❌ | +| 4 | 0.518652 | `foundry_agents_create` | ❌ | +| 5 | 0.509764 | `foundry_agents_list` | ❌ | + +--- + +## Test 5 **Expected Tool:** `foundry_agents_list` **Prompt:** List all agents in my Azure AI Foundry resource @@ -1169,39 +567,15 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.797701 | `foundry_agents_list` | ✅ **EXPECTED** | -| 2 | 0.666021 | `foundry_resource_get` | ❌ | -| 3 | 0.654206 | `foundry_openai_models-list` | ❌ | -| 4 | 0.647246 | `foundry_threads_list` | ❌ | -| 5 | 0.575761 | `foundry_models_deployments_list` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.797877 | `foundry_agents_list` | ✅ **EXPECTED** | -| 2 | 0.666021 | `foundry_resource_get` | ❌ | -| 3 | 0.654206 | `foundry_openai_models-list` | ❌ | -| 4 | 0.575553 | `foundry_models_deployments_list` | ❌ | -| 5 | 0.561946 | `search_service_list` | ❌ | -======= -| 1 | 0.748474 | `foundry_agents_list` | ✅ **EXPECTED** | -| 2 | 0.666021 | `foundry_resource_get` | ❌ | -| 3 | 0.561946 | `search_service_list` | ❌ | -| 4 | 0.556912 | `foundry_agents_connect` | ❌ | -| 5 | 0.542125 | `foundry_knowledge_index_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.797877 | `foundry_agents_list` | ✅ **EXPECTED** | | 2 | 0.666021 | `foundry_resource_get` | ❌ | | 3 | 0.654206 | `foundry_openai_models-list` | ❌ | | 4 | 0.647246 | `foundry_threads_list` | ❌ | | 5 | 0.575553 | `foundry_models_deployments_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- -## Test 4 +## Test 6 **Expected Tool:** `foundry_agents_list` **Prompt:** Show me the available agents in my Azure AI Foundry resource @@ -1210,63 +584,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.749704 | `foundry_agents_list` | ✅ **EXPECTED** | -| 2 | 0.630323 | `foundry_resource_get` | ❌ | -| 3 | 0.611801 | `foundry_openai_models-list` | ❌ | -| 4 | 0.603708 | `foundry_threads_list` | ❌ | -| 5 | 0.556580 | `foundry_agents_get-sdk-sample` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.749829 | `foundry_agents_list` | ✅ **EXPECTED** | -| 2 | 0.630288 | `foundry_resource_get` | ❌ | -| 3 | 0.611722 | 
`foundry_openai_models-list` | ❌ | -| 4 | 0.548511 | `foundry_agents_connect` | ❌ | -| 5 | 0.535020 | `foundry_models_list` | ❌ | -======= -| 1 | 0.730759 | `foundry_agents_list` | ✅ **EXPECTED** | -| 2 | 0.630288 | `foundry_resource_get` | ❌ | -| 3 | 0.548511 | `foundry_agents_connect` | ❌ | -| 4 | 0.535020 | `foundry_models_list` | ❌ | -| 5 | 0.519892 | `foundry_knowledge_index_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.749829 | `foundry_agents_list` | ✅ **EXPECTED** | | 2 | 0.630288 | `foundry_resource_get` | ❌ | | 3 | 0.611722 | `foundry_openai_models-list` | ❌ | | 4 | 0.603689 | `foundry_threads_list` | ❌ | | 5 | 0.556990 | `foundry_agents_get-sdk-sample` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) - ---- - -## Test 5 - -**Expected Tool:** `foundry_agents_create` -**Prompt:** Create a new Azure AI Foundry agent using instructions in the active editor - -### Results - -| Rank | Score | Tool | Status | -|------|-------|------|--------| -| 1 | 0.587064 | `foundry_agents_create` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.561567 | `foundry_agents_get-sdk-sample` | ❌ | -| 3 | 0.554070 | `foundry_threads_create` | ❌ | -| 4 | 0.525727 | `foundry_models_deploy` | ❌ | -| 5 | 0.525461 | `foundry_agents_list` | ❌ | -======= -| 2 | 0.562087 | `foundry_agents_get-sdk-sample` | ❌ | -| 3 | 0.554195 | `foundry_threads_create` | ❌ | -| 4 | 0.525727 | `foundry_models_deploy` | ❌ | -| 5 | 0.525615 | `foundry_agents_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- -## Test 6 +## Test 7 **Expected Tool:** `foundry_agents_query-and-evaluate` **Prompt:** Query and evaluate an agent in my Azure AI Foundry resource for task_adherence @@ -1276,131 +602,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.652200 | `foundry_agents_connect` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.570725 | `foundry_agents_list` | ❌ | -| 3 | 0.553233 
| `foundry_agents_query-and-evaluate` | ✅ **EXPECTED** | -| 4 | 0.493778 | `foundry_agents_evaluate` | ❌ | -| 5 | 0.469431 | `foundry_threads_list` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.570788 | `foundry_agents_list` | ❌ | -| 3 | 0.553190 | `foundry_agents_query-and-evaluate` | ✅ **EXPECTED** | -| 4 | 0.493779 | `foundry_agents_evaluate` | ❌ | -======= -| 2 | 0.553370 | `foundry_agents_query-and-evaluate` | ✅ **EXPECTED** | -| 3 | 0.493779 | `foundry_agents_evaluate` | ❌ | -| 4 | 0.469096 | `foundry_agents_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -| 5 | 0.460662 | `foundry_resource_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 2 | 0.570788 | `foundry_agents_list` | ❌ | | 3 | 0.553190 | `foundry_agents_query-and-evaluate` | ✅ **EXPECTED** | | 4 | 0.493779 | `foundry_agents_evaluate` | ❌ | | 5 | 0.469431 | `foundry_threads_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) - ---- - -## Test 7 - -**Expected Tool:** `foundry_agents_get-sdk-sample` -**Prompt:** Create a CLI app that can talk to an Azure AI Foundry Agent using Python SDK - -### Results - -| Rank | Score | Tool | Status | -|------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.595581 | `foundry_agents_get-sdk-sample` | ✅ **EXPECTED** | -| 2 | 0.552197 | `foundry_threads_create` | ❌ | -| 3 | 0.521920 | `foundry_agents_connect` | ❌ | -| 4 | 0.518552 | `foundry_agents_create` | ❌ | -| 5 | 0.509581 | `foundry_agents_list` | ❌ | -======= -| 1 | 0.595766 | `foundry_agents_get-sdk-sample` | ✅ **EXPECTED** | -| 2 | 0.552022 | `foundry_threads_create` | ❌ | -| 3 | 0.521920 | `foundry_agents_connect` | ❌ | -| 4 | 0.518552 | `foundry_agents_create` | ❌ | -| 5 | 0.509764 | `foundry_agents_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- ## Test 8 -**Expected Tool:** `foundry_threads_create` -**Prompt:** Create an Azure AI Foundry thread to hold the conversation - -### Results - -| Rank | Score | Tool | Status | 
-|------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.606811 | `foundry_threads_create` | ✅ **EXPECTED** | -| 2 | 0.528310 | `foundry_openai_chat-completions-create` | ❌ | -| 3 | 0.519709 | `foundry_threads_get-messages` | ❌ | -======= -| 1 | 0.606417 | `foundry_threads_create` | ✅ **EXPECTED** | -| 2 | 0.528310 | `foundry_openai_chat-completions-create` | ❌ | -| 3 | 0.519708 | `foundry_threads_get-messages` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.506089 | `foundry_threads_list` | ❌ | -| 5 | 0.490796 | `foundry_models_deploy` | ❌ | - ---- - -## Test 9 - -**Expected Tool:** `foundry_threads_list` -**Prompt:** List my AI Foundry threads - -### Results - -| Rank | Score | Tool | Status | -|------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.677249 | `foundry_threads_list` | ✅ **EXPECTED** | -| 2 | 0.574068 | `foundry_threads_get-messages` | ❌ | -| 3 | 0.566999 | `foundry_threads_create` | ❌ | -| 4 | 0.471737 | `foundry_agents_get-sdk-sample` | ❌ | -| 5 | 0.448682 | `foundry_agents_list` | ❌ | -======= -| 1 | 0.677248 | `foundry_threads_list` | ✅ **EXPECTED** | -| 2 | 0.574068 | `foundry_threads_get-messages` | ❌ | -| 3 | 0.566387 | `foundry_threads_create` | ❌ | -| 4 | 0.471544 | `foundry_agents_get-sdk-sample` | ❌ | -| 5 | 0.448963 | `foundry_agents_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) - ---- - -## Test 10 - -**Expected Tool:** `foundry_threads_get-messages` -**Prompt:** Show me the messages in the AI Foundry thread with id - -### Results - -| Rank | Score | Tool | Status | -|------|-------|------|--------| -| 1 | 0.669937 | `foundry_threads_get-messages` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.584431 | `foundry_threads_create` | ❌ | -| 3 | 0.529381 | `foundry_threads_list` | ❌ | -| 4 | 0.437894 | `foundry_agents_get-sdk-sample` | ❌ | -======= -| 2 | 0.583991 | `foundry_threads_create` | ❌ | -| 3 | 0.529381 | `foundry_threads_list` | ❌ | -| 4 | 0.437480 | `foundry_agents_get-sdk-sample` | ❌ | ->>>>>>> e2fd2eac (refactor tts 
mcp tool) -| 5 | 0.427894 | `foundry_agents_create` | ❌ | - ---- - -## Test 11 - **Expected Tool:** `foundry_knowledge_index_list` **Prompt:** List all knowledge indexes in my AI Foundry project @@ -1409,18 +619,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.703772 | `foundry_knowledge_index_list` | ✅ **EXPECTED** | -| 2 | 0.537540 | `foundry_agents_list` | ❌ | +| 2 | 0.537700 | `foundry_agents_list` | ❌ | | 3 | 0.526528 | `foundry_knowledge_index_schema` | ❌ | | 4 | 0.500786 | `foundry_threads_list` | ❌ | -<<<<<<< HEAD -| 5 | 0.475746 | `foundry_models_deployments_list` | ❌ | -======= | 5 | 0.475802 | `foundry_models_deployments_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- -## Test 12 +## Test 9 **Expected Tool:** `foundry_knowledge_index_list` **Prompt:** Show me the knowledge indexes in my AI Foundry project @@ -1431,17 +637,13 @@ |------|-------|------|--------| | 1 | 0.615458 | `foundry_knowledge_index_list` | ✅ **EXPECTED** | | 2 | 0.489311 | `foundry_knowledge_index_schema` | ❌ | -<<<<<<< HEAD -| 3 | 0.484329 | `foundry_agents_list` | ❌ | -======= | 3 | 0.484466 | `foundry_agents_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.454174 | `foundry_threads_list` | ❌ | | 5 | 0.441521 | `foundry_resource_get` | ❌ | --- -## Test 13 +## Test 10 **Expected Tool:** `foundry_knowledge_index_schema` **Prompt:** Show me the schema for knowledge index in my Azure AI Foundry resource @@ -1458,7 +660,7 @@ --- -## Test 14 +## Test 11 **Expected Tool:** `foundry_knowledge_index_schema` **Prompt:** Get the schema configuration for knowledge index @@ -1467,15 +669,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.650203 | `foundry_knowledge_index_schema` | ✅ **EXPECTED** | -| 2 | 0.432792 | `postgres_table_schema_get` | ❌ | -| 3 | 0.417496 | `kusto_table_schema` | ❌ | -| 4 | 0.398322 | `mysql_table_schema_get` | ❌ | -| 5 | 0.396119 | `foundry_knowledge_index_list` | ❌ | +| 1 | 0.650269 | 
`foundry_knowledge_index_schema` | ✅ **EXPECTED** | +| 2 | 0.432759 | `postgres_table_schema_get` | ❌ | +| 3 | 0.417421 | `kusto_table_schema` | ❌ | +| 4 | 0.398186 | `mysql_table_schema_get` | ❌ | +| 5 | 0.396194 | `foundry_knowledge_index_list` | ❌ | --- -## Test 15 +## Test 12 **Expected Tool:** `foundry_models_deploy` **Prompt:** Deploy a GPT4o instance on my resource @@ -1488,24 +690,11 @@ | 2 | 0.299986 | `foundry_openai_models-list` | ❌ | | 3 | 0.298490 | `loadtesting_testrun_create` | ❌ | | 4 | 0.293050 | `loadtesting_testresource_create` | ❌ | -<<<<<<< HEAD -| 5 | 0.290387 | `foundry_openai_embeddings-create` | ❌ | -======= | 5 | 0.290381 | `foundry_openai_embeddings-create` | ❌ | -<<<<<<< HEAD -======= -| 2 | 0.335116 | `foundry_openai_models-list` | ❌ | -| 3 | 0.298490 | `loadtesting_testrun_create` | ❌ | -| 4 | 0.293050 | `loadtesting_testresource_create` | ❌ | -| 5 | 0.282464 | `mysql_server_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) --- -## Test 16 +## Test 13 **Expected Tool:** `foundry_models_deployments_list` **Prompt:** List all AI Foundry model deployments @@ -1514,39 +703,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.681081 | `foundry_models_deployments_list` | ✅ **EXPECTED** | -| 2 | 0.674510 | `foundry_openai_models-list` | ❌ | -| 3 | 0.572625 | `foundry_threads_list` | ❌ | -| 4 | 0.568871 | `foundry_agents_list` | ❌ | -| 5 | 0.566272 | `foundry_resource_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.681385 | `foundry_models_deployments_list` | ✅ **EXPECTED** | -| 2 | 0.674510 | `foundry_openai_models-list` | ❌ | -| 3 | 0.569059 | `foundry_agents_list` | ❌ | -| 4 | 0.566272 | `foundry_resource_get` | ❌ | -| 5 | 0.549636 | `foundry_models_list` | ❌ | -======= -| 1 | 0.663599 | `foundry_models_deployments_list` | ✅ **EXPECTED** | -| 2 | 
0.583429 | `foundry_openai_models-list` | ❌ | -| 3 | 0.566272 | `foundry_resource_get` | ❌ | -| 4 | 0.549636 | `foundry_models_list` | ❌ | -| 5 | 0.539695 | `foundry_agents_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.681385 | `foundry_models_deployments_list` | ✅ **EXPECTED** | | 2 | 0.674510 | `foundry_openai_models-list` | ❌ | | 3 | 0.572625 | `foundry_threads_list` | ❌ | | 4 | 0.569059 | `foundry_agents_list` | ❌ | | 5 | 0.566272 | `foundry_resource_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- -## Test 17 +## Test 14 **Expected Tool:** `foundry_models_deployments_list` **Prompt:** Show me all AI Foundry model deployments @@ -1555,38 +720,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.619840 | `foundry_models_deployments_list` | ✅ **EXPECTED** | -| 2 | 0.619299 | `foundry_openai_models-list` | ❌ | -| 3 | 0.543385 | `foundry_resource_get` | ❌ | -| 4 | 0.540528 | `foundry_agents_list` | ❌ | -| 5 | 0.527141 | `foundry_threads_list` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.620173 | `foundry_models_deployments_list` | ✅ **EXPECTED** | | 2 | 0.619231 | `foundry_openai_models-list` | ❌ | | 3 | 0.543352 | `foundry_resource_get` | ❌ | | 4 | 0.540551 | `foundry_agents_list` | ❌ | -<<<<<<< HEAD -| 5 | 0.521475 | `foundry_models_deploy` | ❌ | -======= -| 1 | 0.606516 | `foundry_models_deployments_list` | ✅ **EXPECTED** | -| 2 | 0.543352 | `foundry_resource_get` | ❌ | -| 3 | 0.521475 | `foundry_models_deploy` | ❌ | -| 4 | 0.518221 | `foundry_models_list` | ❌ | -| 5 | 0.507301 | `foundry_openai_models-list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 5 | 0.527121 | `foundry_threads_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- 
-## Test 18 +## Test 15 **Expected Tool:** `foundry_models_list` **Prompt:** List all AI Foundry models @@ -1597,32 +739,13 @@ |------|-------|------|--------| | 1 | 0.603415 | `foundry_openai_models-list` | ❌ | | 2 | 0.560022 | `foundry_models_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.553634 | `foundry_threads_list` | ❌ | -| 4 | 0.537958 | `foundry_models_deployments_list` | ❌ | -| 5 | 0.519191 | `foundry_agents_list` | ❌ | -======= -| 3 | 0.537981 | `foundry_models_deployments_list` | ❌ | -| 4 | 0.519472 | `foundry_agents_list` | ❌ | -| 5 | 0.514253 | `foundry_resource_get` | ❌ | -======= -| 1 | 0.560022 | `foundry_models_list` | ✅ **EXPECTED** | -| 2 | 0.514253 | `foundry_resource_get` | ❌ | -| 3 | 0.506418 | `foundry_models_deployments_list` | ❌ | -| 4 | 0.491952 | `foundry_agents_list` | ❌ | -| 5 | 0.475204 | `foundry_openai_models-list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 3 | 0.553634 | `foundry_threads_list` | ❌ | | 4 | 0.537981 | `foundry_models_deployments_list` | ❌ | | 5 | 0.519472 | `foundry_agents_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- -## Test 19 +## Test 16 **Expected Tool:** `foundry_models_list` **Prompt:** Show me the available AI Foundry models @@ -1634,29 +757,12 @@ | 1 | 0.576904 | `foundry_openai_models-list` | ❌ | | 2 | 0.574818 | `foundry_models_list` | ✅ **EXPECTED** | | 3 | 0.525312 | `foundry_resource_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.522153 | `foundry_agents_get-sdk-sample` | ❌ | -| 5 | 0.517825 | `foundry_models_deployments_list` | ❌ | -======= -| 4 | 0.517980 | `foundry_models_deployments_list` | ❌ | -| 5 | 0.504087 | `foundry_agents_list` | ❌ | -======= -| 1 | 0.574818 | `foundry_models_list` | ✅ **EXPECTED** | -| 2 | 0.525312 | `foundry_resource_get` | ❌ | -| 3 | 0.497061 | `foundry_models_deployments_list` | ❌ | -| 4 | 0.475139 | `foundry_agents_list` | ❌ | -| 5 | 
0.467671 | `foundry_models_deploy` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 4 | 0.521474 | `foundry_agents_get-sdk-sample` | ❌ | | 5 | 0.517980 | `foundry_models_deployments_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- -## Test 20 +## Test 17 **Expected Tool:** `foundry_openai_chat-completions-create` **Prompt:** Create a chat completion with the message "Hello, how are you today?" using my Azure AI Foundry resource @@ -1667,32 +773,13 @@ |------|-------|------|--------| | 1 | 0.641293 | `foundry_openai_chat-completions-create` | ✅ **EXPECTED** | | 2 | 0.546736 | `foundry_openai_create-completion` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.420018 | `foundry_threads_create` | ❌ | -| 4 | 0.415482 | `foundry_agents_connect` | ❌ | -| 5 | 0.399382 | `foundry_openai_embeddings-create` | ❌ | -======= -| 3 | 0.415483 | `foundry_agents_connect` | ❌ | -| 4 | 0.399383 | `foundry_openai_embeddings-create` | ❌ | -| 5 | 0.364105 | `foundry_models_deploy` | ❌ | -======= -| 1 | 0.558888 | `foundry_openai_chat-completions-create` | ✅ **EXPECTED** | -| 2 | 0.533147 | `foundry_openai_create-completion` | ❌ | -| 3 | 0.415483 | `foundry_agents_connect` | ❌ | -| 4 | 0.364105 | `foundry_models_deploy` | ❌ | -| 5 | 0.361151 | `foundry_resource_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 3 | 0.421034 | `foundry_threads_create` | ❌ | +| 3 | 0.419991 | `foundry_threads_create` | ❌ | | 4 | 0.415483 | `foundry_agents_connect` | ❌ | | 5 | 0.399383 | `foundry_openai_embeddings-create` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- -## Test 21 +## Test 18 **Expected Tool:** `foundry_openai_create-completion` **Prompt:** Create a completion with the prompt "What is Azure?" 
using my Azure AI Foundry resource @@ -1703,29 +790,13 @@ |------|-------|------|--------| | 1 | 0.696936 | `foundry_openai_create-completion` | ✅ **EXPECTED** | | 2 | 0.579108 | `foundry_openai_chat-completions-create` | ❌ | -<<<<<<< HEAD -| 3 | 0.465558 | `azureaibestpractices_get` | ❌ | -| 4 | 0.463703 | `foundry_models_deploy` | ❌ | -| 5 | 0.459126 | `foundry_resource_get` | ❌ | -======= | 3 | 0.463703 | `foundry_models_deploy` | ❌ | | 4 | 0.459126 | `foundry_resource_get` | ❌ | | 5 | 0.458622 | `foundry_openai_embeddings-create` | ❌ | -<<<<<<< HEAD -======= -| 1 | 0.682250 | `foundry_openai_create-completion` | ✅ **EXPECTED** | -| 2 | 0.539297 | `foundry_openai_chat-completions-create` | ❌ | -| 3 | 0.463703 | `foundry_models_deploy` | ❌ | -| 4 | 0.459126 | `foundry_resource_get` | ❌ | -| 5 | 0.450993 | `deploy_pipeline_guidance_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) --- -## Test 22 +## Test 19 **Expected Tool:** `foundry_openai_embeddings-create` **Prompt:** Generate embeddings for the text "Azure OpenAI Service" using my Azure AI Foundry resource @@ -1734,24 +805,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.766496 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | -| 2 | 0.543339 | `foundry_models_deploy` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.766338 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | -| 2 | 0.543338 | `foundry_models_deploy` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.542214 | `foundry_openai_create-completion` | ❌ | -| 4 | 0.520746 | `foundry_openai_models-list` | ❌ | -| 5 | 0.519335 | `foundry_resource_get` | ❌ | +| 1 | 0.766123 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | +| 2 | 0.542818 | `foundry_models_deploy` | ❌ | 
+| 3 | 0.542113 | `foundry_openai_create-completion` | ❌ | +| 4 | 0.520274 | `foundry_openai_models-list` | ❌ | +| 5 | 0.518834 | `foundry_resource_get` | ❌ | --- -## Test 23 +## Test 20 **Expected Tool:** `foundry_openai_embeddings-create` **Prompt:** Create vector embeddings for my text using my Azure AI Foundry resource @@ -1760,37 +822,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.724369 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | -| 2 | 0.494544 | `foundry_resource_get` | ❌ | -| 3 | 0.480389 | `foundry_models_deploy` | ❌ | -| 4 | 0.480294 | `foundry_openai_create-completion` | ❌ | -| 5 | 0.463885 | `foundry_openai_chat-completions-create` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.724120 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | | 2 | 0.494485 | `foundry_resource_get` | ❌ | | 3 | 0.480296 | `foundry_models_deploy` | ❌ | | 4 | 0.480218 | `foundry_openai_create-completion` | ❌ | | 5 | 0.463797 | `foundry_openai_chat-completions-create` | ❌ | -<<<<<<< HEAD -======= -| 1 | 0.638843 | `foundry_openai_embeddings-create` | ✅ **EXPECTED** | -| 2 | 0.494506 | `foundry_openai_create-completion` | ❌ | -| 3 | 0.494485 | `foundry_resource_get` | ❌ | -| 4 | 0.480296 | `foundry_models_deploy` | ❌ | -| 5 | 0.399908 | `foundry_openai_chat-completions-create` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) --- -## Test 24 +## Test 21 **Expected Tool:** `foundry_openai_models-list` **Prompt:** List all available OpenAI models in my Azure AI Foundry resource @@ -1801,29 +841,13 @@ |------|-------|------|--------| | 1 | 0.799059 | `foundry_openai_models-list` | ✅ **EXPECTED** | | 2 | 0.668887 | `foundry_resource_get` | ❌ | -<<<<<<< HEAD -| 3 | 0.667041 | `foundry_models_list` | ❌ | -| 4 | 0.666560 | 
`foundry_models_deployments_list` | ❌ | -| 5 | 0.657393 | `foundry_agents_list` | ❌ | -======= | 3 | 0.667040 | `foundry_models_list` | ❌ | | 4 | 0.666207 | `foundry_models_deployments_list` | ❌ | | 5 | 0.657546 | `foundry_agents_list` | ❌ | -<<<<<<< HEAD -======= -| 1 | 0.729075 | `foundry_openai_models-list` | ✅ **EXPECTED** | -| 2 | 0.668887 | `foundry_resource_get` | ❌ | -| 3 | 0.667040 | `foundry_models_list` | ❌ | -| 4 | 0.660489 | `foundry_agents_list` | ❌ | -| 5 | 0.604808 | `foundry_models_deployments_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) --- -## Test 25 +## Test 22 **Expected Tool:** `foundry_openai_models-list` **Prompt:** Show me the OpenAI model deployments in my Azure AI Foundry resource @@ -1833,28 +857,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.741659 | `foundry_openai_models-list` | ✅ **EXPECTED** | -| 2 | 0.660115 | `foundry_models_deployments_list` | ❌ | -| 3 | 0.648218 | `foundry_resource_get` | ❌ | +| 2 | 0.660160 | `foundry_models_deployments_list` | ❌ | +| 3 | 0.648219 | `foundry_resource_get` | ❌ | | 4 | 0.640650 | `foundry_models_deploy` | ❌ | -<<<<<<< HEAD -| 5 | 0.619790 | `foundry_agents_list` | ❌ | -======= | 5 | 0.619878 | `foundry_agents_list` | ❌ | -<<<<<<< HEAD -======= -| 1 | 0.654318 | `foundry_openai_models-list` | ✅ **EXPECTED** | -| 2 | 0.648219 | `foundry_resource_get` | ❌ | -| 3 | 0.640650 | `foundry_models_deploy` | ❌ | -| 4 | 0.637676 | `foundry_models_deployments_list` | ❌ | -| 5 | 0.576563 | `foundry_agents_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) --- -## Test 26 +## Test 23 **Expected Tool:** `foundry_resource_get` **Prompt:** List all AI Foundry resources in my subscription @@ 
-1865,19 +875,13 @@ |------|-------|------|--------| | 1 | 0.594096 | `foundry_resource_get` | ✅ **EXPECTED** | | 2 | 0.571916 | `foundry_openai_models-list` | ❌ | -<<<<<<< HEAD -| 3 | 0.566762 | `foundry_agents_list` | ❌ | -| 4 | 0.558075 | `foundry_threads_list` | ❌ | -| 5 | 0.556154 | `search_service_list` | ❌ | -======= | 3 | 0.567019 | `foundry_agents_list` | ❌ | | 4 | 0.558290 | `search_service_list` | ❌ | | 5 | 0.558076 | `foundry_threads_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- -## Test 27 +## Test 24 **Expected Tool:** `foundry_resource_get` **Prompt:** Show me the AI Foundry resources in resource group @@ -1888,13 +892,13 @@ |------|-------|------|--------| | 1 | 0.665311 | `foundry_resource_get` | ✅ **EXPECTED** | | 2 | 0.585305 | `foundry_openai_models-list` | ❌ | -| 3 | 0.553808 | `foundry_agents_list` | ❌ | -| 4 | 0.518747 | `foundry_openai_embeddings-create` | ❌ | +| 3 | 0.553993 | `foundry_agents_list` | ❌ | +| 4 | 0.518767 | `foundry_openai_embeddings-create` | ❌ | | 5 | 0.492911 | `foundry_models_deploy` | ❌ | --- -## Test 28 +## Test 25 **Expected Tool:** `foundry_resource_get` **Prompt:** Get details for AI Foundry resource in resource group @@ -1905,27 +909,60 @@ |------|-------|------|--------| | 1 | 0.735316 | `foundry_resource_get` | ✅ **EXPECTED** | | 2 | 0.571906 | `foundry_openai_models-list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.509484 | `monitor_webtests_get` | ❌ | -| 4 | 0.496980 | `foundry_openai_embeddings-create` | ❌ | -| 5 | 0.475498 | `foundry_agents_list` | ❌ | -======= -| 3 | 0.510197 | `monitor_webtests_get` | ❌ | -| 4 | 0.497090 | `foundry_openai_embeddings-create` | ❌ | -| 5 | 0.475722 | `foundry_agents_list` | ❌ | -======= -| 2 | 0.509484 | `monitor_webtests_get` | ❌ | -| 3 | 0.455154 | `foundry_openai_models-list` | ❌ | -| 4 | 0.452340 | `foundry_models_deploy` | ❌ | -| 5 | 0.444390 | `loadtesting_testresource_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 
58ab8585 (update prompts and tool description evaluator) -======= | 3 | 0.509484 | `monitor_webtests_get` | ❌ | | 4 | 0.497090 | `foundry_openai_embeddings-create` | ❌ | | 5 | 0.475722 | `foundry_agents_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) + +--- + +## Test 26 + +**Expected Tool:** `foundry_threads_create` +**Prompt:** Create an Azure AI Foundry thread to hold the conversation + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.606760 | `foundry_threads_create` | ✅ **EXPECTED** | +| 2 | 0.528310 | `foundry_openai_chat-completions-create` | ❌ | +| 3 | 0.519794 | `foundry_threads_get-messages` | ❌ | +| 4 | 0.506089 | `foundry_threads_list` | ❌ | +| 5 | 0.490796 | `foundry_models_deploy` | ❌ | + +--- + +## Test 27 + +**Expected Tool:** `foundry_threads_get-messages` +**Prompt:** Show me the messages in the AI Foundry thread with id + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.669946 | `foundry_threads_get-messages` | ✅ **EXPECTED** | +| 2 | 0.584411 | `foundry_threads_create` | ❌ | +| 3 | 0.529381 | `foundry_threads_list` | ❌ | +| 4 | 0.437480 | `foundry_agents_get-sdk-sample` | ❌ | +| 5 | 0.427594 | `foundry_agents_create` | ❌ | + +--- + +## Test 28 + +**Expected Tool:** `foundry_threads_list` +**Prompt:** List my AI Foundry threads + +### Results + +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.677248 | `foundry_threads_list` | ✅ **EXPECTED** | +| 2 | 0.574065 | `foundry_threads_get-messages` | ❌ | +| 3 | 0.566910 | `foundry_threads_create` | ❌ | +| 4 | 0.471544 | `foundry_agents_get-sdk-sample` | ❌ | +| 5 | 0.448963 | `foundry_agents_list` | ❌ | --- @@ -1938,34 +975,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.785967 | `search_knowledge_base_get` | ✅ **EXPECTED** | -| 2 | 0.700824 | `search_knowledge_source_get` | ❌ | -| 3 | 0.692681 | `search_service_list` | ❌ | -| 
4 | 0.635863 | `search_knowledge_base_retrieve` | ❌ | -| 5 | 0.586575 | `search_index_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.785556 | `search_knowledge_base_get` | ✅ **EXPECTED** | -| 2 | 0.700785 | `search_knowledge_source_get` | ❌ | -| 3 | 0.693600 | `search_service_list` | ❌ | -| 4 | 0.635477 | `search_knowledge_base_retrieve` | ❌ | -| 5 | 0.586578 | `search_index_get` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.785967 | `search_knowledge_base_get` | ✅ **EXPECTED** | | 2 | 0.700824 | `search_knowledge_source_get` | ❌ | | 3 | 0.693471 | `search_service_list` | ❌ | | 4 | 0.635863 | `search_knowledge_base_retrieve` | ❌ | -<<<<<<< HEAD -| 5 | 0.603324 | `foundry_knowledge_index_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 5 | 0.586574 | `search_index_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -1981,7 +995,7 @@ | 1 | 0.748213 | `search_knowledge_base_get` | ✅ **EXPECTED** | | 2 | 0.668487 | `search_knowledge_source_get` | ❌ | | 3 | 0.628582 | `search_knowledge_base_retrieve` | ❌ | -| 4 | 0.623715 | `search_service_list` | ❌ | +| 4 | 0.624479 | `search_service_list` | ❌ | | 5 | 0.566618 | `search_index_get` | ❌ | --- @@ -1998,8 +1012,8 @@ | 1 | 0.702942 | `search_knowledge_base_get` | ✅ **EXPECTED** | | 2 | 0.605964 | `search_knowledge_source_get` | ❌ | | 3 | 0.583234 | `search_knowledge_base_retrieve` | ❌ | -| 4 | 0.512825 | `search_service_list` | ❌ | -| 5 | 0.476815 | `foundry_knowledge_index_list` | ❌ | +| 4 | 0.513638 | `search_service_list` | ❌ | +| 5 | 0.476816 | `foundry_knowledge_index_list` | ❌ | --- @@ -2012,27 +1026,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.688155 | `search_knowledge_base_get` | ✅ **EXPECTED** | -| 2 | 0.599348 | `search_knowledge_source_get` | ❌ | -| 3 | 0.578437 | `search_knowledge_base_retrieve` | ❌ | -| 4 | 
0.456512 | `search_service_list` | ❌ | -| 5 | 0.439493 | `foundry_knowledge_index_list` | ❌ | -======= | 1 | 0.688051 | `search_knowledge_base_get` | ✅ **EXPECTED** | -| 2 | 0.599305 | `search_knowledge_source_get` | ❌ | +| 2 | 0.599247 | `search_knowledge_source_get` | ❌ | | 3 | 0.578499 | `search_knowledge_base_retrieve` | ❌ | | 4 | 0.457619 | `search_service_list` | ❌ | | 5 | 0.439529 | `foundry_knowledge_index_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.688202 | `search_knowledge_base_get` | ✅ **EXPECTED** | -| 2 | 0.599369 | `search_knowledge_source_get` | ❌ | -| 3 | 0.578428 | `search_knowledge_base_retrieve` | ❌ | -| 4 | 0.457427 | `search_service_list` | ❌ | -| 5 | 0.439548 | `foundry_knowledge_index_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2045,33 +1043,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.769383 | `search_knowledge_base_get` | ✅ **EXPECTED** | -| 2 | 0.685640 | `search_knowledge_source_get` | ❌ | -| 3 | 0.636958 | `search_knowledge_base_retrieve` | ❌ | -| 4 | 0.585949 | `search_index_get` | ❌ | -| 5 | 0.533298 | `search_service_list` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.769443 | `search_knowledge_base_get` | ✅ **EXPECTED** | -| 2 | 0.685642 | `search_knowledge_source_get` | ❌ | -| 3 | 0.636767 | `search_knowledge_base_retrieve` | ❌ | -| 4 | 0.586085 | `search_index_get` | ❌ | -| 5 | 0.533859 | `search_service_list` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.769384 | `search_knowledge_base_get` | ✅ **EXPECTED** | | 2 | 0.685640 | `search_knowledge_source_get` | ❌ | | 3 | 0.636958 | `search_knowledge_base_retrieve` | ❌ | | 4 | 0.585949 | `search_index_get` | ❌ | | 5 | 0.533700 | `search_service_list` | ❌ | -<<<<<<< HEAD ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac 
(refactor tts mcp tool) --- @@ -2087,7 +1063,7 @@ | 1 | 0.595585 | `search_knowledge_base_get` | ✅ **EXPECTED** | | 2 | 0.551922 | `search_knowledge_base_retrieve` | ❌ | | 3 | 0.515480 | `search_knowledge_source_get` | ❌ | -| 4 | 0.366170 | `search_service_list` | ❌ | +| 4 | 0.366893 | `search_service_list` | ❌ | | 5 | 0.365633 | `search_index_get` | ❌ | --- @@ -2101,33 +1077,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.724869 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | -| 2 | 0.650606 | `search_knowledge_base_get` | ❌ | -| 3 | 0.575356 | `search_index_query` | ❌ | -| 4 | 0.567386 | `search_knowledge_source_get` | ❌ | -| 5 | 0.520336 | `foundry_agents_connect` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.724846 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.650590 | `search_knowledge_base_get` | ❌ | | 3 | 0.575307 | `search_index_query` | ❌ | | 4 | 0.567361 | `search_knowledge_source_get` | ❌ | | 5 | 0.520360 | `foundry_agents_connect` | ❌ | -<<<<<<< HEAD -======= -| 1 | 0.724733 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | -| 2 | 0.650523 | `search_knowledge_base_get` | ❌ | -| 3 | 0.575078 | `search_index_query` | ❌ | -| 4 | 0.566839 | `search_knowledge_source_get` | ❌ | -| 5 | 0.520277 | `foundry_agents_connect` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2140,19 +1094,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.633877 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | -| 2 | 0.589927 | `search_knowledge_base_get` | ❌ | -| 3 | 0.502173 | `search_knowledge_source_get` | ❌ | -| 4 | 0.422676 | `foundry_agents_query-and-evaluate` | ❌ | -| 5 | 0.399110 | `search_index_query` | ❌ | -======= | 1 | 0.633766 | 
`search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.589869 | `search_knowledge_base_get` | ❌ | | 3 | 0.502085 | `search_knowledge_source_get` | ❌ | | 4 | 0.422671 | `foundry_agents_query-and-evaluate` | ❌ | | 5 | 0.399595 | `search_index_query` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) --- @@ -2165,33 +1111,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.657866 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | -| 2 | 0.557206 | `search_knowledge_base_get` | ❌ | -| 3 | 0.463605 | `search_knowledge_source_get` | ❌ | -| 4 | 0.436719 | `foundry_agents_query-and-evaluate` | ❌ | -| 5 | 0.422173 | `foundry_agents_connect` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.657844 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | -| 2 | 0.557115 | `search_knowledge_base_get` | ❌ | -| 3 | 0.463461 | `search_knowledge_source_get` | ❌ | -| 4 | 0.436952 | `foundry_agents_query-and-evaluate` | ❌ | -| 5 | 0.422469 | `foundry_agents_connect` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.657865 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.557206 | `search_knowledge_base_get` | ❌ | | 3 | 0.463605 | `search_knowledge_source_get` | ❌ | | 4 | 0.436739 | `foundry_agents_query-and-evaluate` | ❌ | | 5 | 0.422173 | `foundry_agents_connect` | ❌ | -<<<<<<< HEAD ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2204,16 +1128,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.633766 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | -| 2 | 0.589869 | `search_knowledge_base_get` | ❌ | -| 3 | 0.502085 | `search_knowledge_source_get` | ❌ | -<<<<<<< HEAD -| 4 | 0.422610 | `foundry_agents_query-and-evaluate` | ❌ | -| 5 | 0.399521 | `search_index_query` | ❌ | -======= -| 4 | 0.422671 | 
`foundry_agents_query-and-evaluate` | ❌ | -| 5 | 0.399595 | `search_index_query` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) +| 1 | 0.633677 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | +| 2 | 0.589633 | `search_knowledge_base_get` | ❌ | +| 3 | 0.501914 | `search_knowledge_source_get` | ❌ | +| 4 | 0.422494 | `foundry_agents_query-and-evaluate` | ❌ | +| 5 | 0.399138 | `search_index_query` | ❌ | --- @@ -2228,15 +1147,7 @@ |------|-------|------|--------| | 1 | 0.598868 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.547862 | `search_knowledge_base_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.467868 | `foundry_agents_query-and-evaluate` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.467907 | `foundry_agents_query-and-evaluate` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.464904 | `search_knowledge_source_get` | ❌ | | 5 | 0.412481 | `foundry_agents_connect` | ❌ | @@ -2251,33 +1162,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.649767 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | -| 2 | 0.631435 | `search_knowledge_base_get` | ❌ | -| 3 | 0.581359 | `search_index_query` | ❌ | -| 4 | 0.571156 | `search_knowledge_source_get` | ❌ | -| 5 | 0.544545 | `search_service_list` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.649751 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | -| 2 | 0.631420 | `search_knowledge_base_get` | ❌ | -| 3 | 0.581412 | `search_index_query` | ❌ | -| 4 | 0.571126 | `search_knowledge_source_get` | ❌ | -| 5 | 0.544488 | `search_service_list` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.649767 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | -| 2 | 0.631435 | `search_knowledge_base_get` | ❌ | -| 3 | 0.581387 | `search_index_query` | ❌ | -| 4 | 0.571156 | `search_knowledge_source_get` | ❌ | -| 5 | 0.544501 | `search_service_list` | 
❌ | -<<<<<<< HEAD ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.649090 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | +| 2 | 0.630905 | `search_knowledge_base_get` | ❌ | +| 3 | 0.581015 | `search_index_query` | ❌ | +| 4 | 0.570684 | `search_knowledge_source_get` | ❌ | +| 5 | 0.544428 | `search_service_list` | ❌ | --- @@ -2292,17 +1181,8 @@ |------|-------|------|--------| | 1 | 0.579716 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.560688 | `search_knowledge_base_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.477941 | `search_knowledge_source_get` | ❌ | -| 4 | 0.402530 | `foundry_agents_query-and-evaluate` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.477942 | `search_knowledge_source_get` | ❌ | | 4 | 0.402582 | `foundry_agents_query-and-evaluate` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.361231 | `foundry_knowledge_index_list` | ❌ | --- @@ -2316,33 +1196,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.582662 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | -| 2 | 0.528610 | `search_knowledge_base_get` | ❌ | -| 3 | 0.449336 | `search_knowledge_source_get` | ❌ | -| 4 | 0.447690 | `foundry_agents_query-and-evaluate` | ❌ | -| 5 | 0.397187 | `foundry_agents_connect` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.582660 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | -| 2 | 0.528583 | `search_knowledge_base_get` | ❌ | -| 3 | 0.449290 | `search_knowledge_source_get` | ❌ | -| 4 | 0.447915 | `foundry_agents_query-and-evaluate` | ❌ | -| 5 | 0.397238 | `foundry_agents_connect` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.582662 | `search_knowledge_base_retrieve` | ✅ **EXPECTED** | | 2 | 0.528610 | `search_knowledge_base_get` | ❌ | | 3 | 
0.449336 | `search_knowledge_source_get` | ❌ | | 4 | 0.447780 | `foundry_agents_query-and-evaluate` | ❌ | | 5 | 0.397187 | `foundry_agents_connect` | ❌ | -<<<<<<< HEAD ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2355,33 +1213,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.760406 | `search_knowledge_source_get` | ✅ **EXPECTED** | -| 2 | 0.690845 | `search_service_list` | ❌ | -| 3 | 0.665905 | `search_knowledge_base_get` | ❌ | -| 4 | 0.573014 | `search_index_get` | ❌ | -| 5 | 0.560755 | `search_knowledge_base_retrieve` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.760416 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.691931 | `search_service_list` | ❌ | | 3 | 0.665923 | `search_knowledge_base_get` | ❌ | | 4 | 0.573012 | `search_index_get` | ❌ | | 5 | 0.560779 | `search_knowledge_base_retrieve` | ❌ | -<<<<<<< HEAD -======= -| 1 | 0.760757 | `search_knowledge_source_get` | ✅ **EXPECTED** | -| 2 | 0.692251 | `search_service_list` | ❌ | -| 3 | 0.666204 | `search_knowledge_base_get` | ❌ | -| 4 | 0.579582 | `foundry_knowledge_index_list` | ❌ | -| 5 | 0.573177 | `search_index_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2394,20 +1230,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.737860 | `search_knowledge_source_get` | ✅ **EXPECTED** | -| 2 | 0.659236 | `search_service_list` | ❌ | -======= -| 1 | 0.737971 | `search_knowledge_source_get` | ✅ **EXPECTED** | -======= | 1 | 0.737860 | `search_knowledge_source_get` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.660170 | 
`search_service_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.652969 | `search_knowledge_base_get` | ❌ | -| 4 | 0.578836 | `search_index_get` | ❌ | -| 5 | 0.560519 | `search_index_query` | ❌ | +| 4 | 0.578835 | `search_index_get` | ❌ | +| 5 | 0.560564 | `search_index_query` | ❌ | --- @@ -2420,17 +1247,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.657936 | `search_knowledge_source_get` | ✅ **EXPECTED** | -======= -| 1 | 0.658365 | `search_knowledge_source_get` | ✅ **EXPECTED** | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.657935 | `search_knowledge_source_get` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.558516 | `search_knowledge_base_get` | ❌ | -| 3 | 0.510338 | `search_service_list` | ❌ | +| 3 | 0.511469 | `search_service_list` | ❌ | | 4 | 0.470560 | `search_knowledge_base_retrieve` | ❌ | | 5 | 0.433657 | `foundry_knowledge_index_list` | ❌ | @@ -2447,7 +1266,7 @@ |------|-------|------|--------| | 1 | 0.652945 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.563270 | `search_knowledge_base_get` | ❌ | -| 3 | 0.485934 | `search_service_list` | ❌ | +| 3 | 0.487022 | `search_service_list` | ❌ | | 4 | 0.477636 | `search_knowledge_base_retrieve` | ❌ | | 5 | 0.430518 | `search_index_get` | ❌ | @@ -2462,24 +1281,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.825604 | `search_knowledge_source_get` | ✅ **EXPECTED** | -| 2 | 0.693438 | `search_knowledge_base_get` | ❌ | -======= -| 1 | 0.825664 | `search_knowledge_source_get` | ✅ **EXPECTED** | | 2 | 0.693437 | `search_knowledge_base_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.595643 | `search_index_get` | ❌ | | 4 | 0.540550 | `search_knowledge_base_retrieve` | ❌ | -| 5 | 0.531085 | `search_service_list` | ❌ | -======= -| 1 | 0.825552 | 
`search_knowledge_source_get` | ✅ **EXPECTED** | -| 2 | 0.693321 | `search_knowledge_base_get` | ❌ | -| 3 | 0.595371 | `search_index_get` | ❌ | -| 4 | 0.540647 | `search_knowledge_base_retrieve` | ❌ | -| 5 | 0.530887 | `search_service_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 5 | 0.531247 | `search_service_list` | ❌ | --- @@ -2496,7 +1302,7 @@ | 2 | 0.523643 | `search_knowledge_base_get` | ❌ | | 3 | 0.459923 | `search_knowledge_base_retrieve` | ❌ | | 4 | 0.371465 | `search_index_get` | ❌ | -| 5 | 0.370585 | `search_service_list` | ❌ | +| 5 | 0.370838 | `search_service_list` | ❌ | --- @@ -2509,24 +1315,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.681052 | `search_index_get` | ✅ **EXPECTED** | | 2 | 0.544557 | `foundry_knowledge_index_schema` | ❌ | | 3 | 0.528153 | `search_knowledge_base_get` | ❌ | -<<<<<<< HEAD | 4 | 0.521765 | `search_knowledge_source_get` | ❌ | -| 5 | 0.490553 | `search_service_list` | ❌ | -======= -| 4 | 0.522514 | `search_knowledge_source_get` | ❌ | | 5 | 0.490624 | `search_service_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.680762 | `search_index_get` | ✅ **EXPECTED** | -| 2 | 0.544458 | `foundry_knowledge_index_schema` | ❌ | -| 3 | 0.527906 | `search_knowledge_base_get` | ❌ | -| 4 | 0.521626 | `search_knowledge_source_get` | ❌ | -| 5 | 0.490379 | `search_service_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2540,7 +1333,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.640256 | `search_index_get` | ✅ **EXPECTED** | -| 2 | 0.619949 | `search_service_list` | ❌ | +| 2 | 0.620140 | `search_service_list` | ❌ | | 3 | 0.538885 | `foundry_knowledge_index_list` | ❌ | | 4 | 0.511485 | `search_knowledge_base_get` | ❌ | | 5 | 0.496094 | `search_knowledge_source_get` | ❌ | @@ -2557,7 +1350,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.620759 | `search_index_get` | ✅ 
**EXPECTED** | -| 2 | 0.562503 | `search_service_list` | ❌ | +| 2 | 0.562775 | `search_service_list` | ❌ | | 3 | 0.538471 | `foundry_knowledge_index_list` | ❌ | | 4 | 0.500365 | `search_knowledge_base_get` | ❌ | | 5 | 0.490025 | `search_knowledge_source_get` | ❌ | @@ -2573,35 +1366,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.522598 | `search_index_get` | ❌ | -| 2 | 0.515911 | `search_index_query` | ✅ **EXPECTED** | -| 3 | 0.498264 | `search_service_list` | ❌ | -| 4 | 0.447868 | `search_knowledge_base_retrieve` | ❌ | -| 5 | 0.437608 | `postgres_database_query` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.522953 | `search_index_get` | ❌ | -| 2 | 0.515871 | `search_index_query` | ✅ **EXPECTED** | -| 3 | 0.497392 | `search_service_list` | ❌ | -| 4 | 0.447993 | `search_knowledge_base_retrieve` | ❌ | -| 5 | 0.437640 | `postgres_database_query` | ❌ | -======= -| 1 | 0.522754 | `search_index_get` | ❌ | -| 2 | 0.515812 | `search_index_query` | ✅ **EXPECTED** | -| 3 | 0.497494 | `search_service_list` | ❌ | -| 4 | 0.447954 | `search_knowledge_base_retrieve` | ❌ | -| 5 | 0.437709 | `postgres_database_query` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.522826 | `search_index_get` | ❌ | | 2 | 0.515870 | `search_index_query` | ✅ **EXPECTED** | | 3 | 0.497467 | `search_service_list` | ❌ | | 4 | 0.447977 | `search_knowledge_base_retrieve` | ❌ | -| 5 | 0.437715 | `postgres_database_query` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 5 | 0.437665 | `postgres_database_query` | ❌ | --- @@ -2614,18 +1383,9 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.791803 | `search_service_list` | ✅ **EXPECTED** | -| 2 | 0.553012 | `kusto_cluster_list` | ❌ | -======= | 1 | 0.793651 | `search_service_list` | ✅ **EXPECTED** | | 2 | 0.553011 | `kusto_cluster_list` | ❌ | -<<<<<<< HEAD 
->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.509479 | `subscription_list` | ❌ | -======= | 3 | 0.509461 | `subscription_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.505971 | `search_index_get` | ❌ | | 5 | 0.504693 | `marketplace_product_list` | ❌ | @@ -2640,10 +1400,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.684837 | `search_service_list` | ✅ **EXPECTED** | +| 1 | 0.686140 | `search_service_list` | ✅ **EXPECTED** | | 2 | 0.484092 | `marketplace_product_list` | ❌ | | 3 | 0.479898 | `search_index_get` | ❌ | -| 4 | 0.462337 | `search_knowledge_base_get` | ❌ | +| 4 | 0.462336 | `search_knowledge_base_get` | ❌ | | 5 | 0.461786 | `kusto_cluster_list` | ❌ | --- @@ -2657,23 +1417,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.551241 | `search_service_list` | ✅ **EXPECTED** | +| 1 | 0.553025 | `search_service_list` | ✅ **EXPECTED** | | 2 | 0.436230 | `search_index_get` | ❌ | | 3 | 0.415277 | `search_knowledge_base_get` | ❌ | | 4 | 0.410461 | `search_knowledge_source_get` | ❌ | -<<<<<<< HEAD -| 5 | 0.404707 | `search_index_query` | ❌ | -======= | 5 | 0.404758 | `search_index_query` | ❌ | -<<<<<<< HEAD -======= -| 3 | 0.417096 | `foundry_agents_list` | ❌ | -| 4 | 0.415277 | `search_knowledge_base_get` | ❌ | -| 5 | 0.410568 | `search_knowledge_source_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2686,32 +1434,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.666038 | `speech_stt_recognize` | ✅ **EXPECTED** | -| 2 | 0.377210 | `foundry_openai_embeddings-create` | ❌ | -| 3 | 0.351127 | `deploy_plan_get` | ❌ | -<<<<<<< HEAD -| 4 | 0.338137 | `extension_cli_generate` | ❌ | -| 5 | 0.337763 | `deploy_pipeline_guidance_get` | ❌ | -======= -| 4 | 0.338047 | 
`extension_cli_generate` | ❌ | -| 5 | 0.337685 | `deploy_pipeline_guidance_get` | ❌ | -======= -| 1 | 0.677871 | `speech_tts_synthesize` | ❌ | -| 2 | 0.666038 | `speech_stt_recognize` | ✅ **EXPECTED** | -| 3 | 0.415224 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.365228 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.351127 | `deploy_plan_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.682065 | `speech_tts_synthesize` | ❌ | | 2 | 0.666038 | `speech_stt_recognize` | ✅ **EXPECTED** | | 3 | 0.377022 | `foundry_openai_embeddings-create` | ❌ | | 4 | 0.351127 | `deploy_plan_get` | ❌ | | 5 | 0.338137 | `extension_cli_generate` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2725,31 +1452,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.511324 | `speech_stt_recognize` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.198123 | `foundry_agents_get-sdk-sample` | ❌ | -| 3 | 0.192462 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.170157 | `foundry_openai_create-completion` | ❌ | -| 5 | 0.167159 | `foundry_openai_chat-completions-create` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.192450 | `foundry_openai_embeddings-create` | ❌ | -| 3 | 0.170157 | `foundry_openai_create-completion` | ❌ | -| 4 | 0.167159 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.159108 | `foundry_agents_connect` | ❌ | -======= -| 2 | 0.353620 | `speech_tts_synthesize` | ❌ | -| 3 | 0.202056 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.190197 | `foundry_openai_embeddings-create` | ❌ | -| 5 | 0.184542 | `foundry_openai_create-completion` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 2 | 0.344404 | `speech_tts_synthesize` | ❌ | | 3 | 0.197854 | `foundry_agents_get-sdk-sample` | ❌ | | 4 | 0.192450 | 
`foundry_openai_embeddings-create` | ❌ | | 5 | 0.170157 | `foundry_openai_create-completion` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2763,31 +1469,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.486489 | `speech_stt_recognize` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.162863 | `foundry_threads_create` | ❌ | -| 3 | 0.160209 | `foundry_agents_connect` | ❌ | -| 4 | 0.156936 | `deploy_pipeline_guidance_get` | ❌ | -| 5 | 0.154737 | `foundry_openai_create-completion` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.160209 | `foundry_agents_connect` | ❌ | -| 3 | 0.156850 | `deploy_pipeline_guidance_get` | ❌ | -| 4 | 0.154737 | `foundry_openai_create-completion` | ❌ | -| 5 | 0.154098 | `foundry_openai_embeddings-create` | ❌ | -======= -| 2 | 0.354154 | `speech_tts_synthesize` | ❌ | -| 3 | 0.180941 | `foundry_openai_create-completion` | ❌ | -| 4 | 0.178944 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.160209 | `foundry_agents_connect` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 2 | 0.335115 | `speech_tts_synthesize` | ❌ | -| 3 | 0.163357 | `foundry_threads_create` | ❌ | +| 3 | 0.162870 | `foundry_threads_create` | ❌ | | 4 | 0.160209 | `foundry_agents_connect` | ❌ | | 5 | 0.156850 | `deploy_pipeline_guidance_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2800,33 +1485,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.612032 | `speech_stt_recognize` | ✅ **EXPECTED** | -| 2 | 0.309860 | `foundry_openai_embeddings-create` | ❌ | -| 3 | 0.244223 | `foundry_resource_get` | ❌ | -| 4 | 0.243658 | `foundry_openai_create-completion` | ❌ | -| 5 | 0.242816 | `foundry_openai_chat-completions-create` | ❌ | -======= | 1 | 0.611992 | `speech_stt_recognize` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.309895 | `foundry_openai_embeddings-create` | ❌ | -| 3 | 
0.244218 | `foundry_resource_get` | ❌ | -| 4 | 0.243626 | `foundry_openai_create-completion` | ❌ | -| 5 | 0.242771 | `foundry_openai_chat-completions-create` | ❌ | -======= -| 2 | 0.584104 | `speech_tts_synthesize` | ❌ | -| 3 | 0.322301 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.263196 | `foundry_openai_create-completion` | ❌ | -| 5 | 0.251200 | `foundry_openai_chat-completions-create` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 2 | 0.573185 | `speech_tts_synthesize` | ❌ | | 3 | 0.309895 | `foundry_openai_embeddings-create` | ❌ | | 4 | 0.244218 | `foundry_resource_get` | ❌ | | 5 | 0.243626 | `foundry_openai_create-completion` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2840,28 +1503,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.410533 | `speech_stt_recognize` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.152414 | `foundry_openai_embeddings-create` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.152391 | `foundry_openai_embeddings-create` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.152137 | `foundry_models_deploy` | ❌ | -| 4 | 0.151799 | `deploy_pipeline_guidance_get` | ❌ | -| 5 | 0.140373 | `deploy_plan_get` | ❌ | -======= -| 2 | 0.373433 | `speech_tts_synthesize` | ❌ | -| 3 | 0.159775 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.158032 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.152137 | `foundry_models_deploy` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -======= | 2 | 0.353783 | `speech_tts_synthesize` | ❌ | | 3 | 0.152391 | `foundry_openai_embeddings-create` | ❌ | | 4 | 0.152137 | `foundry_models_deploy` | ❌ | | 5 | 0.151632 | `deploy_pipeline_guidance_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2875,24 +1520,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.546259 | 
`speech_stt_recognize` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.218092 | `foundry_resource_get` | ❌ | -| 3 | 0.202860 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.183420 | `extension_azqr` | ❌ | -| 5 | 0.181020 | `search_index_get` | ❌ | -======= -| 2 | 0.499808 | `speech_tts_synthesize` | ❌ | -| 3 | 0.225372 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.218092 | `foundry_resource_get` | ❌ | -| 5 | 0.200865 | `foundry_openai_chat-completions-create` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -======= | 2 | 0.480203 | `speech_tts_synthesize` | ❌ | | 3 | 0.218092 | `foundry_resource_get` | ❌ | | 4 | 0.202935 | `foundry_openai_embeddings-create` | ❌ | | 5 | 0.183420 | `extension_azqr` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2905,35 +1536,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.539963 | `speech_stt_recognize` | ✅ **EXPECTED** | -| 2 | 0.228587 | `foundry_openai_create-completion` | ❌ | -| 3 | 0.203413 | `foundry_agents_connect` | ❌ | -| 4 | 0.199517 | `foundry_openai_embeddings-create` | ❌ | -| 5 | 0.197301 | `foundry_openai_chat-completions-create` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.540249 | `speech_stt_recognize` | ✅ **EXPECTED** | -| 2 | 0.227953 | `foundry_openai_create-completion` | ❌ | -| 3 | 0.203215 | `foundry_agents_connect` | ❌ | -| 4 | 0.199441 | `foundry_openai_embeddings-create` | ❌ | -| 5 | 0.197199 | `foundry_openai_chat-completions-create` | ❌ | -======= -| 1 | 0.539963 | `speech_stt_recognize` | ✅ **EXPECTED** | -| 2 | 0.382022 | `speech_tts_synthesize` | ❌ | -| 3 | 0.246979 | `foundry_openai_create-completion` | ❌ | -| 4 | 0.238192 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.203413 | `foundry_agents_connect` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.539963 | `speech_stt_recognize` | ✅ 
**EXPECTED** | | 2 | 0.367401 | `speech_tts_synthesize` | ❌ | | 3 | 0.228587 | `foundry_openai_create-completion` | ❌ | | 4 | 0.203413 | `foundry_agents_connect` | ❌ | | 5 | 0.199585 | `foundry_openai_embeddings-create` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2946,35 +1553,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.549151 | `speech_stt_recognize` | ✅ **EXPECTED** | -| 2 | 0.393626 | `azureaibestpractices_get` | ❌ | -| 3 | 0.342537 | `extension_cli_generate` | ❌ | -| 4 | 0.337387 | `cloudarchitect_design` | ❌ | -| 5 | 0.335741 | `foundry_openai_create-completion` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.548967 | `speech_stt_recognize` | ✅ **EXPECTED** | -| 2 | 0.342494 | `extension_cli_generate` | ❌ | -| 3 | 0.337434 | `cloudarchitect_design` | ❌ | -| 4 | 0.335792 | `foundry_openai_create-completion` | ❌ | -| 5 | 0.333130 | `get_bestpractices_get` | ❌ | -======= -| 1 | 0.549151 | `speech_stt_recognize` | ✅ **EXPECTED** | -| 2 | 0.460662 | `speech_tts_synthesize` | ❌ | -| 3 | 0.357816 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.345661 | `foundry_openai_create-completion` | ❌ | -| 5 | 0.342537 | `extension_cli_generate` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.549151 | `speech_stt_recognize` | ✅ **EXPECTED** | | 2 | 0.468161 | `speech_tts_synthesize` | ❌ | | 3 | 0.342537 | `extension_cli_generate` | ❌ | -| 4 | 0.337387 | `cloudarchitect_design` | ❌ | +| 4 | 0.338302 | `cloudarchitect_design` | ❌ | | 5 | 0.335741 | `foundry_openai_create-completion` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -2988,30 +1571,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.532536 | `speech_stt_recognize` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.349892 | `foundry_openai_create-completion` | ❌ | -<<<<<<< HEAD -| 3 | 0.348381 | 
`azureaibestpractices_get` | ❌ | -| 4 | 0.340893 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.332862 | `foundry_openai_embeddings-create` | ❌ | -======= -| 3 | 0.340893 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.332669 | `foundry_openai_embeddings-create` | ❌ | -| 5 | 0.326712 | `get_bestpractices_get` | ❌ | -======= -| 2 | 0.506045 | `speech_tts_synthesize` | ❌ | -| 3 | 0.385033 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.381487 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.378382 | `foundry_openai_create-completion` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 2 | 0.515532 | `speech_tts_synthesize` | ❌ | | 3 | 0.349892 | `foundry_openai_create-completion` | ❌ | | 4 | 0.340893 | `foundry_openai_chat-completions-create` | ❌ | | 5 | 0.332669 | `foundry_openai_embeddings-create` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -3025,31 +1588,10 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.453396 | `speech_stt_recognize` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.173280 | `deploy_pipeline_guidance_get` | ❌ | -| 3 | 0.164929 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.160483 | `foundry_agents_connect` | ❌ | -| 5 | 0.160185 | `extension_azqr` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.173205 | `deploy_pipeline_guidance_get` | ❌ | -| 3 | 0.164990 | `foundry_openai_embeddings-create` | ❌ | -| 4 | 0.160523 | `extension_azqr` | ❌ | -| 5 | 0.160483 | `foundry_agents_connect` | ❌ | -======= -| 2 | 0.342007 | `speech_tts_synthesize` | ❌ | -| 3 | 0.181994 | `foundry_openai_create-completion` | ❌ | -| 4 | 0.174375 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.173205 | `deploy_pipeline_guidance_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 2 | 0.322710 | 
`speech_tts_synthesize` | ❌ | | 3 | 0.173205 | `deploy_pipeline_guidance_get` | ❌ | | 4 | 0.164990 | `foundry_openai_embeddings-create` | ❌ | | 5 | 0.160483 | `foundry_agents_connect` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) --- @@ -3164,11 +1706,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.527294 | `speech_tts_synthesize` | ✅ **EXPECTED** | -| 2 | 0.455734 | `speech_stt_recognize` | ❌ | -| 3 | 0.353108 | `foundry_resource_get` | ❌ | -| 4 | 0.343308 | `foundry_models_deploy` | ❌ | -| 5 | 0.337888 | `foundry_openai_embeddings-create` | ❌ | +| 1 | 0.527400 | `speech_tts_synthesize` | ✅ **EXPECTED** | +| 2 | 0.455811 | `speech_stt_recognize` | ❌ | +| 3 | 0.353132 | `foundry_resource_get` | ❌ | +| 4 | 0.343330 | `foundry_models_deploy` | ❌ | +| 5 | 0.337912 | `foundry_openai_embeddings-create` | ❌ | --- @@ -3232,35 +1774,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.786298 | `appconfig_account_list` | ✅ **EXPECTED** | +| 1 | 0.786360 | `appconfig_account_list` | ✅ **EXPECTED** | | 2 | 0.530613 | `appconfig_kv_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.491380 | `postgres_server_list` | ❌ | -======= -<<<<<<< HEAD -| 3 | 0.491358 | `postgres_server_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.481223 | `kusto_cluster_list` | ❌ | -| 5 | 0.479997 | `subscription_list` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.491380 | `postgres_server_list` | ❌ | | 4 | 0.481223 | `kusto_cluster_list` | ❌ | | 5 | 0.479988 | `subscription_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 67 -======= -## Test 72 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 77 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_account_list` **Prompt:** Show me the App Configuration stores in my subscription @@ -3269,31 +1791,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| 
-<<<<<<< HEAD -| 1 | 0.635056 | `appconfig_account_list` | ✅ **EXPECTED** | -| 2 | 0.464826 | `appconfig_kv_get` | ❌ | -| 3 | 0.398562 | `subscription_list` | ❌ | -| 4 | 0.391398 | `redis_list` | ❌ | -| 5 | 0.372579 | `postgres_server_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 68 -======= -## Test 73 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.634978 | `appconfig_account_list` | ✅ **EXPECTED** | | 2 | 0.464865 | `appconfig_kv_get` | ❌ | | 3 | 0.398495 | `subscription_list` | ❌ | -| 4 | 0.391286 | `redis_list` | ❌ | +| 4 | 0.391291 | `redis_list` | ❌ | | 5 | 0.372456 | `postgres_server_list` | ❌ | --- ## Test 78 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_account_list` **Prompt:** Show me my App Configuration stores @@ -3302,7 +1808,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.565365 | `appconfig_account_list` | ✅ **EXPECTED** | +| 1 | 0.565435 | `appconfig_account_list` | ✅ **EXPECTED** | | 2 | 0.465344 | `appconfig_kv_get` | ❌ | | 3 | 0.355916 | `postgres_server_config_get` | ❌ | | 4 | 0.348661 | `appconfig_kv_delete` | ❌ | @@ -3310,15 +1816,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 69 -======= -## Test 74 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 79 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_delete` **Prompt:** Delete the key in App Configuration store @@ -3327,21 +1825,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.618276 | `appconfig_kv_delete` | ✅ **EXPECTED** | -| 2 | 0.464358 | `appconfig_kv_get` | ❌ | -| 3 | 0.424344 | `appconfig_kv_set` | ❌ | -| 4 | 0.422700 | `appconfig_kv_lock_set` | ❌ | -| 5 | 0.392260 | `appconfig_account_list` | ❌ | - ---- - -## Test 70 -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.618277 | `appconfig_kv_delete` | ✅ **EXPECTED** | | 2 | 0.464358 | 
`appconfig_kv_get` | ❌ | | 3 | 0.424344 | `appconfig_kv_set` | ❌ | @@ -3350,12 +1833,7 @@ --- -<<<<<<< HEAD -## Test 75 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 80 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_get` **Prompt:** List all key-value settings in App Configuration store @@ -3364,23 +1842,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.632652 | `appconfig_kv_get` | ✅ **EXPECTED** | -| 2 | 0.558116 | `appconfig_account_list` | ❌ | -| 3 | 0.531033 | `appconfig_kv_set` | ❌ | -| 4 | 0.464568 | `appconfig_kv_delete` | ❌ | -| 5 | 0.438999 | `appconfig_kv_lock_set` | ❌ | +| 1 | 0.632687 | `appconfig_kv_get` | ✅ **EXPECTED** | +| 2 | 0.557810 | `appconfig_account_list` | ❌ | +| 3 | 0.530884 | `appconfig_kv_set` | ❌ | +| 4 | 0.464635 | `appconfig_kv_delete` | ❌ | +| 5 | 0.439089 | `appconfig_kv_lock_set` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 71 -======= -## Test 76 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 81 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_get` **Prompt:** Show me the key-value settings in App Configuration store @@ -3390,22 +1860,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.612555 | `appconfig_kv_get` | ✅ **EXPECTED** | -| 2 | 0.522671 | `appconfig_account_list` | ❌ | +| 2 | 0.522426 | `appconfig_account_list` | ❌ | | 3 | 0.512945 | `appconfig_kv_set` | ❌ | | 4 | 0.468503 | `appconfig_kv_delete` | ❌ | | 5 | 0.457866 | `appconfig_kv_lock_set` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 72 -======= -## Test 77 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 82 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_get` **Prompt:** List all key-value settings with key name starting with 'prod-' in App Configuration store @@ -3414,47 +1876,15 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.512883 | `appconfig_kv_get` | ✅ **EXPECTED** | -| 2 | 0.450109 | `appconfig_account_list` | ❌ | -| 3 | 0.398684 | `appconfig_kv_set` | ❌ | -| 4 | 0.380614 | `appconfig_kv_delete` | ❌ | -| 5 | 0.346166 | `appconfig_kv_lock_set` | ❌ | - ---- - -## Test 73 -======= -<<<<<<< HEAD -| 1 | 0.512880 | `appconfig_kv_get` | ✅ **EXPECTED** | -| 2 | 0.449934 | `appconfig_account_list` | ❌ | -| 3 | 0.398698 | `appconfig_kv_set` | ❌ | -| 4 | 0.380636 | `appconfig_kv_delete` | ❌ | -| 5 | 0.346156 | `appconfig_kv_lock_set` | ❌ | -======= -| 1 | 0.512804 | `appconfig_kv_get` | ✅ **EXPECTED** | -| 2 | 0.449871 | `appconfig_account_list` | ❌ | -| 3 | 0.398608 | `appconfig_kv_set` | ❌ | -| 4 | 0.380599 | `appconfig_kv_delete` | ❌ | -| 5 | 0.346117 | `appconfig_kv_lock_set` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) - ---- - -## Test 78 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.512883 | `appconfig_kv_get` | ✅ **EXPECTED** | -| 2 | 0.449905 | `appconfig_account_list` | ❌ | -| 3 | 0.398684 | `appconfig_kv_set` | ❌ | -| 4 | 0.380614 | `appconfig_kv_delete` | ❌ | -| 5 | 0.346166 | `appconfig_kv_lock_set` | ❌ | +| 1 | 0.513021 | `appconfig_kv_get` | ✅ **EXPECTED** | +| 2 | 0.450004 | `appconfig_account_list` | ❌ | +| 3 | 0.398800 | `appconfig_kv_set` | ❌ | +| 4 | 0.380746 | `appconfig_kv_delete` | ❌ | +| 5 | 0.346260 | `appconfig_kv_lock_set` | ❌ | --- ## Test 83 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_get` **Prompt:** Show the content for the key in App Configuration store @@ -3466,20 +1896,12 @@ | 1 | 0.552300 | `appconfig_kv_get` | ✅ **EXPECTED** | | 2 | 0.448912 | `appconfig_kv_set` | ❌ | | 3 | 0.441713 | `appconfig_kv_delete` | ❌ | -| 4 | 0.437745 | `appconfig_account_list` | ❌ | +| 4 | 0.437432 | `appconfig_account_list` | ❌ | | 5 | 0.416264 | `appconfig_kv_lock_set` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## 
Test 74 -======= -## Test 79 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 84 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_lock_set` **Prompt:** Lock the key in App Configuration store @@ -3488,23 +1910,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.591253 | `appconfig_kv_lock_set` | ✅ **EXPECTED** | -| 2 | 0.487221 | `appconfig_kv_get` | ❌ | -| 3 | 0.445541 | `appconfig_kv_set` | ❌ | -| 4 | 0.431462 | `appconfig_kv_delete` | ❌ | -| 5 | 0.373617 | `appconfig_account_list` | ❌ | +| 1 | 0.591237 | `appconfig_kv_lock_set` | ✅ **EXPECTED** | +| 2 | 0.487174 | `appconfig_kv_get` | ❌ | +| 3 | 0.445551 | `appconfig_kv_set` | ❌ | +| 4 | 0.431516 | `appconfig_kv_delete` | ❌ | +| 5 | 0.373656 | `appconfig_account_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 75 -======= -## Test 80 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 85 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_lock_set` **Prompt:** Unlock the key in App Configuration store @@ -3515,25 +1929,13 @@ |------|-------|------|--------| | 1 | 0.555699 | `appconfig_kv_lock_set` | ✅ **EXPECTED** | | 2 | 0.505681 | `appconfig_kv_get` | ❌ | -| 3 | 0.476497 | `appconfig_kv_delete` | ❌ | +| 3 | 0.476496 | `appconfig_kv_delete` | ❌ | | 4 | 0.425488 | `appconfig_kv_set` | ❌ | -<<<<<<< HEAD -| 5 | 0.409649 | `appconfig_account_list` | ❌ | - ---- - -## Test 76 -======= | 5 | 0.409406 | `appconfig_account_list` | ❌ | --- -<<<<<<< HEAD -## Test 81 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 86 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appconfig_kv_set` **Prompt:** Set the key in App Configuration store to @@ -3542,31 +1944,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.609635 | `appconfig_kv_set` | ✅ **EXPECTED** | | 2 | 0.536497 | `appconfig_kv_lock_set` 
| ❌ | | 3 | 0.512707 | `appconfig_kv_get` | ❌ | | 4 | 0.505571 | `appconfig_kv_delete` | ❌ | -| 5 | 0.378223 | `appconfig_account_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 77 -======= -## Test 82 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.609760 | `appconfig_kv_set` | ✅ **EXPECTED** | -| 2 | 0.536630 | `appconfig_kv_lock_set` | ❌ | -| 3 | 0.512740 | `appconfig_kv_get` | ❌ | -| 4 | 0.505638 | `appconfig_kv_delete` | ❌ | -| 5 | 0.377900 | `appconfig_account_list` | ❌ | +| 5 | 0.377919 | `appconfig_account_list` | ❌ | --- ## Test 87 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `applens_resource_diagnose` **Prompt:** Please help me diagnose issues with my app using app lens @@ -3575,29 +1961,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.595632 | `applens_resource_diagnose` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.335768 | `deploy_app_logs_get` | ❌ | -| 3 | 0.300786 | `deploy_architecture_diagram_generate` | ❌ | -| 4 | 0.273083 | `cloudarchitect_design` | ❌ | -======= -| 2 | 0.336090 | `deploy_app_logs_get` | ❌ | -| 3 | 0.300786 | `deploy_architecture_diagram_generate` | ❌ | -| 4 | 0.273082 | `cloudarchitect_design` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 5 | 0.254473 | `monitor_resource_log_query` | ❌ | +| 1 | 0.595737 | `applens_resource_diagnose` | ✅ **EXPECTED** | +| 2 | 0.336142 | `deploy_app_logs_get` | ❌ | +| 3 | 0.300835 | `deploy_architecture_diagram_generate` | ❌ | +| 4 | 0.272689 | `cloudarchitect_design` | ❌ | +| 5 | 0.254527 | `monitor_resource_log_query` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 78 -======= -## Test 83 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 88 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `applens_resource_diagnose` **Prompt:** Use app lens to check why my app is slow? 
@@ -3607,35 +1979,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.502361 | `applens_resource_diagnose` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.316002 | `deploy_app_logs_get` | ❌ | -| 3 | 0.255570 | `deploy_architecture_diagram_generate` | ❌ | -| 4 | 0.249583 | `monitor_resource_log_query` | ❌ | -<<<<<<< HEAD -| 5 | 0.226030 | `quota_usage_check` | ❌ | - ---- - -## Test 79 -======= -<<<<<<< HEAD -| 5 | 0.226092 | `quota_usage_check` | ❌ | -======= -======= | 2 | 0.316297 | `deploy_app_logs_get` | ❌ | | 3 | 0.255570 | `deploy_architecture_diagram_generate` | ❌ | | 4 | 0.249583 | `monitor_resource_log_query` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.225972 | `quota_usage_check` | ❌ | --- -<<<<<<< HEAD -## Test 84 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 89 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `applens_resource_diagnose` **Prompt:** What does app lens say is wrong with my service? 
@@ -3646,21 +1997,13 @@ |------|-------|------|--------| | 1 | 0.492820 | `applens_resource_diagnose` | ✅ **EXPECTED** | | 2 | 0.256325 | `deploy_architecture_diagram_generate` | ❌ | -| 3 | 0.242574 | `cloudarchitect_design` | ❌ | +| 3 | 0.242800 | `cloudarchitect_design` | ❌ | | 4 | 0.225608 | `resourcehealth_health-events_list` | ❌ | -| 5 | 0.211260 | `deploy_app_logs_get` | ❌ | +| 5 | 0.211564 | `deploy_app_logs_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 80 -======= -## Test 85 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 90 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Add database connection to my app service for database in resource group @@ -3669,35 +2012,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.717878 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.401376 | `sql_db_rename` | ❌ | -| 3 | 0.399941 | `sql_db_create` | ❌ | -| 4 | 0.362997 | `sql_db_show` | ❌ | -| 5 | 0.357919 | `sql_db_list` | ❌ | - ---- - -## Test 81 -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.717887 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.401337 | `sql_db_rename` | ❌ | -| 3 | 0.399820 | `sql_db_create` | ❌ | -| 4 | 0.362889 | `sql_db_show` | ❌ | -| 5 | 0.357806 | `sql_db_list` | ❌ | +| 1 | 0.718027 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.402135 | `sql_db_rename` | ❌ | +| 3 | 0.400714 | `sql_db_create` | ❌ | +| 4 | 0.363505 | `sql_db_show` | ❌ | +| 5 | 0.358956 | `sql_db_list` | ❌ | --- -<<<<<<< HEAD -## Test 86 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 91 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Configure SQL Server database for app service with connection string in resource group @@ -3706,47 +2029,15 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.688410 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.498122 | `sql_db_rename` | ❌ | -| 3 | 0.497502 | `sql_db_create` | ❌ | -| 4 | 0.469326 | `sql_db_show` | ❌ | -| 5 | 0.452937 | `sql_db_list` | ❌ | - ---- - -## Test 82 -======= -<<<<<<< HEAD | 1 | 0.688364 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.498175 | `sql_db_rename` | ❌ | -| 3 | 0.497711 | `sql_db_create` | ❌ | -| 4 | 0.469526 | `sql_db_show` | ❌ | -| 5 | 0.453040 | `sql_db_list` | ❌ | -======= -| 1 | 0.654513 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.498175 | `sql_db_rename` | ❌ | | 3 | 0.497522 | `sql_db_create` | ❌ | | 4 | 0.469526 | `sql_db_show` | ❌ | | 5 | 0.453088 | `sql_db_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) - ---- - -## Test 87 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.688409 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.498049 | `sql_db_rename` | ❌ | -| 3 | 0.497520 | `sql_db_create` | ❌ | -| 4 | 0.469335 | `sql_db_show` | ❌ | -| 5 | 0.452906 | `sql_db_list` | ❌ | --- ## Test 92 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Add MySQL database to app service using connection in resource group @@ -3755,47 +2046,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.675970 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.464756 | `sql_db_create` | ❌ | -| 3 | 0.452407 | `sql_db_rename` | ❌ | -| 4 | 0.432948 | `mysql_server_list` | ❌ | -| 5 | 0.410292 | `sql_db_show` | ❌ | - ---- - -## Test 83 -======= -<<<<<<< HEAD -| 1 | 0.675548 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.465376 | `sql_db_create` | ❌ | -| 3 | 0.452528 | `sql_db_rename` | ❌ | -| 4 | 0.433256 | `mysql_server_list` | ❌ | -| 5 | 0.410221 | `sql_db_show` | ❌ | -======= -| 1 | 0.655045 | `appservice_database_add` | ✅ 
**EXPECTED** | -| 2 | 0.465281 | `sql_db_create` | ❌ | -| 3 | 0.452630 | `sql_db_rename` | ❌ | -| 4 | 0.433191 | `mysql_server_list` | ❌ | -| 5 | 0.410316 | `sql_db_show` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) - ---- - -## Test 88 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.675678 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.465147 | `sql_db_create` | ❌ | -| 3 | 0.452626 | `sql_db_rename` | ❌ | -| 4 | 0.433261 | `mysql_server_list` | ❌ | -| 5 | 0.410304 | `sql_db_show` | ❌ | +| 1 | 0.675443 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.465187 | `sql_db_create` | ❌ | +| 3 | 0.452537 | `sql_db_rename` | ❌ | +| 4 | 0.432946 | `mysql_server_list` | ❌ | +| 5 | 0.410266 | `sql_db_show` | ❌ | --- ## Test 93 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Add PostgreSQL database to app service using connection in resource group @@ -3804,47 +2063,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.628119 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.444212 | `sql_db_create` | ❌ | -| 3 | 0.405314 | `postgres_database_query` | ❌ | -| 4 | 0.401117 | `postgres_database_list` | ❌ | -| 5 | 0.400767 | `sql_db_rename` | ❌ | - ---- - -## Test 84 -======= -<<<<<<< HEAD -| 1 | 0.627847 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.444822 | `sql_db_create` | ❌ | -| 3 | 0.404711 | `postgres_database_query` | ❌ | -| 4 | 0.401105 | `postgres_database_list` | ❌ | -| 5 | 0.400866 | `sql_db_rename` | ❌ | -======= -| 1 | 0.599525 | `appservice_database_add` | ✅ **EXPECTED** | +| 1 | 0.627784 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.444152 | `sql_db_create` | ❌ | -| 3 | 0.404912 | `postgres_database_query` | ❌ | +| 3 | 0.404874 | `postgres_database_query` | ❌ | | 4 | 0.401137 | `postgres_database_list` | ❌ | | 5 | 0.400754 | `sql_db_rename` | ❌ | ->>>>>>> 
84ad4f44 (update prompts and tool description evaluator) - ---- - -## Test 89 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.627767 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.444459 | `sql_db_create` | ❌ | -| 3 | 0.404249 | `postgres_database_query` | ❌ | -| 4 | 0.400435 | `postgres_database_list` | ❌ | -| 5 | 0.400352 | `sql_db_rename` | ❌ | --- ## Test 94 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Connect CosmosDB database using connection string to app service in resource group @@ -3853,47 +2080,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.663086 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.446465 | `cosmos_database_list` | ❌ | -| 3 | 0.441966 | `cosmos_database_container_item_query` | ❌ | -| 4 | 0.427284 | `cosmos_database_container_list` | ❌ | -| 5 | 0.420488 | `sql_db_rename` | ❌ | - ---- - -## Test 85 -======= -<<<<<<< HEAD -| 1 | 0.663498 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.446339 | `cosmos_database_list` | ❌ | -| 3 | 0.441990 | `cosmos_database_container_item_query` | ❌ | -| 4 | 0.427167 | `cosmos_database_container_list` | ❌ | -| 5 | 0.420405 | `sql_db_rename` | ❌ | -======= -| 1 | 0.608259 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.445781 | `cosmos_database_list` | ❌ | -| 3 | 0.441836 | `cosmos_database_container_item_query` | ❌ | -| 4 | 0.426789 | `cosmos_database_container_list` | ❌ | -| 5 | 0.420630 | `sql_db_rename` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) - ---- - -## Test 90 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.662987 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.446741 | `cosmos_database_list` | ❌ | -| 3 | 0.442115 | `cosmos_database_container_item_query` | ❌ | -| 4 | 0.427312 | `cosmos_database_container_list` | ❌ | -| 5 | 0.420799 | `sql_db_rename` 
| ❌ | +| 1 | 0.663057 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.446328 | `cosmos_database_list` | ❌ | +| 3 | 0.441849 | `cosmos_database_container_item_query` | ❌ | +| 4 | 0.427159 | `cosmos_database_container_list` | ❌ | +| 5 | 0.420379 | `sql_db_rename` | ❌ | --- ## Test 95 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Add database connection for database on server to app service in resource group @@ -3902,35 +2097,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.733852 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.454554 | `sql_db_create` | ❌ | -| 3 | 0.415271 | `sql_db_rename` | ❌ | -| 4 | 0.414045 | `sql_server_create` | ❌ | -| 5 | 0.410260 | `sql_db_list` | ❌ | - ---- - -## Test 86 -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.733775 | `appservice_database_add` | ✅ **EXPECTED** | | 2 | 0.454554 | `sql_db_create` | ❌ | | 3 | 0.415274 | `sql_db_rename` | ❌ | -| 4 | 0.414045 | `sql_server_create` | ❌ | +| 4 | 0.414101 | `sql_server_create` | ❌ | | 5 | 0.410260 | `sql_db_list` | ❌ | --- -<<<<<<< HEAD -## Test 91 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 96 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Add database connection string for to app service using connection string in resource group @@ -3939,47 +2114,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.746766 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.441682 | `sql_db_rename` | ❌ | -| 3 | 0.434020 | `sql_db_create` | ❌ | -| 4 | 0.391311 | `sql_db_list` | ❌ | -| 5 | 0.390014 | `sql_db_show` | ❌ | - ---- - -## Test 87 -======= -<<<<<<< HEAD -| 1 | 0.746379 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.441584 | `sql_db_rename` | ❌ | -| 3 | 0.434079 | `sql_db_create` | 
❌ | -| 4 | 0.391000 | `sql_db_list` | ❌ | -| 5 | 0.389995 | `sql_db_show` | ❌ | -======= -| 1 | 0.686506 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.441542 | `sql_db_rename` | ❌ | -| 3 | 0.433865 | `sql_db_create` | ❌ | -| 4 | 0.391188 | `sql_db_list` | ❌ | -| 5 | 0.390129 | `sql_db_show` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) - ---- - -## Test 92 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.746361 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.441645 | `sql_db_rename` | ❌ | -| 3 | 0.433902 | `sql_db_create` | ❌ | -| 4 | 0.391238 | `sql_db_list` | ❌ | -| 5 | 0.390155 | `sql_db_show` | ❌ | +| 1 | 0.746518 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.441688 | `sql_db_rename` | ❌ | +| 3 | 0.433979 | `sql_db_create` | ❌ | +| 4 | 0.391370 | `sql_db_list` | ❌ | +| 5 | 0.390219 | `sql_db_show` | ❌ | --- ## Test 97 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Connect database to my app service using connection string in resource group @@ -3988,42 +2131,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.680503 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.429273 | `sql_db_rename` | ❌ | -| 3 | 0.406267 | `sql_db_create` | ❌ | -| 4 | 0.396537 | `sql_db_show` | ❌ | -| 5 | 0.391409 | `sql_db_list` | ❌ | +| 1 | 0.680411 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.429294 | `sql_db_rename` | ❌ | +| 3 | 0.406245 | `sql_db_create` | ❌ | +| 4 | 0.396491 | `sql_db_show` | ❌ | +| 5 | 0.391479 | `sql_db_list` | ❌ | --- -## Test 88 -======= -<<<<<<< HEAD -| 1 | 0.680525 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.429291 | `sql_db_rename` | ❌ | -| 3 | 0.406599 | `sql_db_create` | ❌ | -| 4 | 0.396524 | `sql_db_show` | ❌ | -| 5 | 0.391416 | `sql_db_list` | ❌ | -======= -| 1 | 0.643888 | `appservice_database_add` | ✅ **EXPECTED** | -======= -| 1 
| 0.680400 | `appservice_database_add` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 2 | 0.429317 | `sql_db_rename` | ❌ | -| 3 | 0.406322 | `sql_db_create` | ❌ | -| 4 | 0.396523 | `sql_db_show` | ❌ | -| 5 | 0.391430 | `sql_db_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 93 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 98 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `appservice_database_add` **Prompt:** Set up database for app service with connection string under resource group @@ -4032,33 +2148,11 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.640738 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.456785 | `sql_db_create` | ❌ | -| 3 | 0.402668 | `sql_db_rename` | ❌ | -| 4 | 0.401985 | `sql_db_show` | ❌ | -| 5 | 0.394072 | `sql_db_list` | ❌ | - ---- - -## Test 89 -======= -<<<<<<< HEAD -| 1 | 0.640622 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.456508 | `sql_db_create` | ❌ | -| 3 | 0.402651 | `sql_db_rename` | ❌ | -| 4 | 0.402081 | `sql_db_show` | ❌ | -| 5 | 0.394177 | `sql_db_list` | ❌ | -======= -| 1 | 0.598494 | `appservice_database_add` | ✅ **EXPECTED** | -======= -| 1 | 0.640548 | `appservice_database_add` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 2 | 0.456884 | `sql_db_create` | ❌ | -| 3 | 0.402743 | `sql_db_rename` | ❌ | -| 4 | 0.402138 | `sql_db_show` | ❌ | -| 5 | 0.394211 | `sql_db_list` | ❌ | +| 1 | 0.640585 | `appservice_database_add` | ✅ **EXPECTED** | +| 2 | 0.456892 | `sql_db_create` | ❌ | +| 3 | 0.402757 | `sql_db_rename` | ❌ | +| 4 | 0.402144 | `sql_db_show` | ❌ | +| 5 | 0.394206 | `sql_db_list` | ❌ | --- @@ -4071,37 +2165,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.688343 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.449174 | `sql_db_rename` | ❌ | -| 3 | 0.448432 | `sql_db_create` | ❌ | +| 1 | 0.688394 | `appservice_database_add` | ✅ 
**EXPECTED** | +| 2 | 0.449133 | `sql_db_rename` | ❌ | +| 3 | 0.448418 | `sql_db_create` | ❌ | | 4 | 0.414400 | `sql_db_show` | ❌ | -| 5 | 0.411810 | `sql_db_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 95 ->>>>>>> 58ab8585 (update prompts and tool description evaluator) - -**Expected Tool:** `appservice_database_add` -**Prompt:** Configure database for app service with the connection string in resource group - -### Results - -| Rank | Score | Tool | Status | -|------|-------|------|--------| -| 1 | 0.688527 | `appservice_database_add` | ✅ **EXPECTED** | -| 2 | 0.449176 | `sql_db_rename` | ❌ | -| 3 | 0.448382 | `sql_db_create` | ❌ | -| 4 | 0.414329 | `sql_db_show` | ❌ | -| 5 | 0.411782 | `sql_db_list` | ❌ | +| 5 | 0.411818 | `sql_db_list` | ❌ | --- -## Test 90 -======= ## Test 100 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `applicationinsights_recommendation_list` **Prompt:** List code optimization recommendations across my Application Insights components @@ -4111,20 +2183,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.572473 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.454559 | `azureaibestpractices_get` | ❌ | -| 3 | 0.445157 | `get_bestpractices_get` | ❌ | -| 4 | 0.390478 | `azureterraformbestpractices_get` | ❌ | -| 5 | 0.383948 | `applens_resource_diagnose` | ❌ | - ---- - -## Test 91 -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.445157 | `get_bestpractices_get` | ❌ | | 3 | 0.390478 | `azureterraformbestpractices_get` | ❌ | | 4 | 0.383948 | `applens_resource_diagnose` | ❌ | @@ -4132,16 +2190,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 86 -======= -## Test 96 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 101 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `applicationinsights_recommendation_list` **Prompt:** 
Show me code optimization recommendations for all Application Insights resources in my subscription @@ -4151,31 +2200,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.696531 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.506351 | `azureaibestpractices_get` | ❌ | -| 3 | 0.468384 | `get_bestpractices_get` | ❌ | -| 4 | 0.452231 | `applens_resource_diagnose` | ❌ | -| 5 | 0.435241 | `azureterraformbestpractices_get` | ❌ | - ---- - -<<<<<<< HEAD -## Test 92 -======= -## Test 87 -======= -| 1 | 0.696565 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | -| 2 | 0.470670 | `get_bestpractices_get` | ❌ | -| 3 | 0.452233 | `applens_resource_diagnose` | ❌ | -| 4 | 0.435290 | `azureterraformbestpractices_get` | ❌ | -| 5 | 0.424629 | `search_service_list` | ❌ | - ---- - -## Test 97 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 2 | 0.468384 | `get_bestpractices_get` | ❌ | | 3 | 0.452231 | `applens_resource_diagnose` | ❌ | | 4 | 0.435241 | `azureterraformbestpractices_get` | ❌ | @@ -4184,7 +2208,6 @@ --- ## Test 102 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `applicationinsights_recommendation_list` **Prompt:** List profiler recommendations for Application Insights in resource group @@ -4195,37 +2218,13 @@ |------|-------|------|--------| | 1 | 0.626722 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | | 2 | 0.488002 | `loadtesting_testresource_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.479392 | `mysql_server_list` | ❌ | -======= -<<<<<<< HEAD -| 3 | 0.479416 | `mysql_server_list` | ❌ | -======= -| 3 | 0.479392 | `mysql_server_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 3 | 0.479392 | `mysql_server_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 3 | 
0.479425 | `mysql_server_list` | ❌ | | 4 | 0.477396 | `applens_resource_diagnose` | ❌ | | 5 | 0.468847 | `resourcehealth_availability-status_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 93 -======= -<<<<<<< HEAD -## Test 88 -======= -## Test 98 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 103 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `applicationinsights_recommendation_list` **Prompt:** Show me performance improvement recommendations from Application Insights @@ -4234,35 +2233,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.509615 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | -| 2 | 0.433835 | `azureaibestpractices_get` | ❌ | -| 3 | 0.419699 | `applens_resource_diagnose` | ❌ | -| 4 | 0.383861 | `get_bestpractices_get` | ❌ | -| 5 | 0.367317 | `deploy_architecture_diagram_generate` | ❌ | - ---- - -## Test 94 -======= | 1 | 0.509502 | `applicationinsights_recommendation_list` | ✅ **EXPECTED** | | 2 | 0.419670 | `applens_resource_diagnose` | ❌ | | 3 | 0.383767 | `get_bestpractices_get` | ❌ | | 4 | 0.367278 | `deploy_architecture_diagram_generate` | ❌ | -| 5 | 0.343931 | `cloudarchitect_design` | ❌ | +| 5 | 0.343879 | `cloudarchitect_design` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 89 -======= -## Test 99 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 104 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_cli_generate` **Prompt:** Create a Storage account with name using Azure CLI @@ -4271,42 +2250,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.593241 | `storage_account_create` | ❌ | +| 1 | 0.593242 | `storage_account_create` | ❌ | | 2 | 0.564940 | `storage_blob_container_create` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD | 3 | 
0.493684 | `storage_account_get` | ❌ | -| 4 | 0.473547 | `storage_blob_container_get` | ❌ | -| 5 | 0.456428 | `managedlustre_fs_create` | ❌ | - ---- - -## Test 95 -======= -<<<<<<< HEAD -| 3 | 0.493609 | `storage_account_get` | ❌ | -======= -| 3 | 0.493641 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.474399 | `storage_blob_container_get` | ❌ | -======= -| 3 | 0.493684 | `storage_account_get` | ❌ | -| 4 | 0.474987 | `storage_blob_container_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.454194 | `managedlustre_fs_create` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 90 -======= -## Test 100 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 105 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_cli_generate` **Prompt:** List all virtual machines in my subscription using Azure CLI @@ -4315,42 +2267,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.592102 | `search_service_list` | ❌ | -======= | 1 | 0.593467 | `search_service_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 2 | 0.575274 | `kusto_cluster_list` | ❌ | -| 3 | 0.549918 | `virtualdesktop_hostpool_list` | ❌ | -| 4 | 0.544688 | `monitor_workspace_list` | ❌ | -| 5 | 0.536238 | `subscription_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 96 -======= -## Test 91 -======= -| 2 | 0.575351 | `kusto_cluster_list` | ❌ | -======= | 2 | 0.575274 | `kusto_cluster_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.549966 | `virtualdesktop_hostpool_list` | ❌ | | 4 | 0.544412 | `monitor_workspace_list` | ❌ | | 5 | 0.536252 | `subscription_list` | ❌ | --- -<<<<<<< HEAD -## Test 101 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## 
Test 106 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_cli_generate` **Prompt:** Show me the details of the storage account with Azure CLI commands @@ -4359,41 +2284,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.710307 | `storage_account_get` | ❌ | -| 2 | 0.601571 | `storage_blob_container_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.710155 | `storage_account_get` | ❌ | -======= -| 1 | 0.710305 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 2 | 0.602173 | `storage_blob_container_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.710307 | `storage_account_get` | ❌ | -| 2 | 0.602446 | `storage_blob_container_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.543268 | `storage_blob_get` | ❌ | | 4 | 0.519788 | `storage_account_create` | ❌ | | 5 | 0.493145 | `cosmos_account_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 97 -======= -<<<<<<< HEAD -## Test 92 -======= -## Test 102 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 107 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_cli_install` **Prompt:** @@ -4402,21 +2301,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.479652 | `extension_cli_install` | ✅ **EXPECTED** | -| 2 | 0.473369 | `extension_cli_generate` | ❌ | -| 3 | 0.389405 | `azureterraformbestpractices_get` | ❌ | -| 4 | 0.382473 | `deploy_plan_get` | ❌ | -| 5 | 0.366067 | `get_bestpractices_get` | ❌ | - ---- - -## Test 98 -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.479590 | `extension_cli_install` | ✅ **EXPECTED** | | 2 | 0.473250 | `extension_cli_generate` | ❌ | | 3 | 0.389354 | `azureterraformbestpractices_get` | ❌ | @@ -4425,23 +2309,7 @@ --- 
-<<<<<<< HEAD -## Test 93 -======= -| 1 | 0.497777 | `extension_cli_generate` | ❌ | -| 2 | 0.497497 | `extension_cli_install` | ✅ **EXPECTED** | -| 3 | 0.401453 | `azureterraformbestpractices_get` | ❌ | -| 4 | 0.383619 | `deploy_plan_get` | ❌ | -| 5 | 0.382552 | `deploy_pipeline_guidance_get` | ❌ | - ---- - -## Test 103 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 108 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_cli_install` **Prompt:** How to install azd @@ -4451,26 +2319,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.460416 | `extension_cli_install` | ✅ **EXPECTED** | -| 2 | 0.429269 | `deploy_app_logs_get` | ❌ | +| 2 | 0.429599 | `deploy_app_logs_get` | ❌ | | 3 | 0.365212 | `deploy_iac_rules_get` | ❌ | | 4 | 0.335279 | `deploy_plan_get` | ❌ | -| 5 | 0.326165 | `deploy_pipeline_guidance_get` | ❌ | +| 5 | 0.326135 | `deploy_pipeline_guidance_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 99 -======= -<<<<<<< HEAD -## Test 94 -======= -## Test 104 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 109 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `extension_cli_install` **Prompt:** What is Azure Functions Core tools and how to install it @@ -4479,17 +2335,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.622670 | `extension_cli_install` | ✅ **EXPECTED** | -| 2 | 0.439414 | `get_bestpractices_get` | ❌ | -| 3 | 0.432859 | `deploy_pipeline_guidance_get` | ❌ | -| 4 | 0.430682 | `extension_cli_generate` | ❌ | -| 5 | 0.418085 | `deploy_plan_get` | ❌ | - ---- - -## Test 100 -======= | 1 | 0.622705 | `extension_cli_install` | ✅ **EXPECTED** | | 2 | 0.439474 | `get_bestpractices_get` | ❌ | | 3 | 0.432913 | `deploy_pipeline_guidance_get` | ❌ | @@ -4498,16 
+2343,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 95 -======= -## Test 105 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 110 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_list` **Prompt:** List all Azure Container Registries in my subscription @@ -4519,35 +2355,12 @@ | 1 | 0.743568 | `acr_registry_list` | ✅ **EXPECTED** | | 2 | 0.711580 | `acr_registry_repository_list` | ❌ | | 3 | 0.585675 | `kusto_cluster_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.540241 | `search_service_list` | ❌ | -| 5 | 0.514293 | `cosmos_account_list` | ❌ | - ---- - -## Test 101 -======= -======= -| 3 | 0.585618 | `kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.541506 | `search_service_list` | ❌ | | 5 | 0.514293 | `cosmos_account_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 96 -======= -## Test 106 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 111 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_list` **Prompt:** Show me my Azure Container Registries @@ -4558,29 +2371,13 @@ |------|-------|------|--------| | 1 | 0.586014 | `acr_registry_list` | ✅ **EXPECTED** | | 2 | 0.563636 | `acr_registry_repository_list` | ❌ | -<<<<<<< HEAD -| 3 | 0.460834 | `storage_blob_container_get` | ❌ | -======= -| 3 | 0.460570 | `storage_blob_container_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 3 | 0.460544 | `storage_blob_container_get` | ❌ | | 4 | 0.415552 | `cosmos_database_container_list` | ❌ | -| 5 | 0.402318 | `redis_list` | ❌ | +| 5 | 0.402247 | `redis_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 102 -======= -<<<<<<< HEAD -## Test 97 -======= -## Test 107 ->>>>>>> 84ad4f44 (update prompts and tool description 
evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 112 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_list` **Prompt:** Show me the container registries in my subscription @@ -4592,36 +2389,12 @@ | 1 | 0.637130 | `acr_registry_list` | ✅ **EXPECTED** | | 2 | 0.563476 | `acr_registry_repository_list` | ❌ | | 3 | 0.516769 | `kusto_cluster_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.515365 | `storage_blob_container_get` | ❌ | -======= -======= -| 3 | 0.516826 | `kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.515378 | `storage_blob_container_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.480352 | `redis_list` | ❌ | --- -<<<<<<< HEAD -## Test 103 -======= -<<<<<<< HEAD -## Test 98 -======= -## Test 108 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.515153 | `storage_blob_container_get` | ❌ | -| 5 | 0.480398 | `redis_list` | ❌ | - ---- - ## Test 113 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_list` **Prompt:** List container registries in resource group @@ -4632,37 +2405,13 @@ |------|-------|------|--------| | 1 | 0.654318 | `acr_registry_repository_list` | ❌ | | 2 | 0.633938 | `acr_registry_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.476015 | `mysql_server_list` | ❌ | -======= -<<<<<<< HEAD -| 3 | 0.476043 | `mysql_server_list` | ❌ | -======= -| 3 | 0.476015 | `mysql_server_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 3 | 0.476015 | `mysql_server_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.454929 | `group_list` | ❌ | +| 3 | 0.476294 | `mysql_server_list` | ❌ | +| 4 | 0.454887 | `group_list` | ❌ | | 5 | 0.454003 | 
`datadog_monitoredresources_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 104 -======= -<<<<<<< HEAD -## Test 99 -======= -## Test 109 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 114 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_list` **Prompt:** Show me the container registries in resource group @@ -4673,37 +2422,13 @@ |------|-------|------|--------| | 1 | 0.639391 | `acr_registry_list` | ✅ **EXPECTED** | | 2 | 0.637972 | `acr_registry_repository_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.468028 | `mysql_server_list` | ❌ | -======= -<<<<<<< HEAD -| 3 | 0.468078 | `mysql_server_list` | ❌ | -======= -| 3 | 0.468028 | `mysql_server_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 3 | 0.468028 | `mysql_server_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 3 | 0.468371 | `mysql_server_list` | ❌ | | 4 | 0.449649 | `datadog_monitoredresources_list` | ❌ | -| 5 | 0.445741 | `group_list` | ❌ | +| 5 | 0.445729 | `group_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 105 -======= -<<<<<<< HEAD -## Test 100 -======= -## Test 110 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 115 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_repository_list` **Prompt:** List all container registry repositories in my subscription @@ -4715,37 +2440,12 @@ | 1 | 0.626482 | `acr_registry_repository_list` | ✅ **EXPECTED** | | 2 | 0.617504 | `acr_registry_list` | ❌ | | 3 | 0.544172 | `kusto_cluster_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.508863 | `storage_blob_container_get` | ❌ | -| 5 | 0.495567 | `postgres_server_list` | ❌ | - ---- - -## Test 106 -======= -======= -| 3 | 0.544238 | 
`kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.508483 | `storage_blob_container_get` | ❌ | -| 5 | 0.495526 | `postgres_server_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 101 -======= -## Test 111 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.508318 | `storage_blob_container_get` | ❌ | | 5 | 0.495567 | `postgres_server_list` | ❌ | --- ## Test 116 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_repository_list` **Prompt:** Show me my container registry repositories @@ -4754,31 +2454,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.546334 | `acr_registry_repository_list` | ✅ **EXPECTED** | +| 1 | 0.546333 | `acr_registry_repository_list` | ✅ **EXPECTED** | | 2 | 0.469295 | `acr_registry_list` | ❌ | -<<<<<<< HEAD -| 3 | 0.451973 | `storage_blob_container_get` | ❌ | -======= -| 3 | 0.450946 | `storage_blob_container_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 3 | 0.451083 | `storage_blob_container_get` | ❌ | | 4 | 0.407973 | `cosmos_database_container_list` | ❌ | | 5 | 0.373464 | `storage_blob_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 107 -======= -<<<<<<< HEAD -## Test 102 -======= -## Test 112 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 117 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_repository_list` **Prompt:** List repositories in the container registry @@ -4789,39 +2473,13 @@ |------|-------|------|--------| | 1 | 0.674296 | `acr_registry_repository_list` | ✅ **EXPECTED** | | 2 | 0.541779 | `acr_registry_list` | ❌ | -<<<<<<< HEAD -| 3 | 0.437756 | `storage_blob_container_get` | ❌ | -| 4 | 0.433927 | `cosmos_database_container_list` | ❌ | -<<<<<<< HEAD -| 5 | 0.383001 | `kusto_database_list` | ❌ | 
- ---- - -## Test 108 -======= -<<<<<<< HEAD -| 5 | 0.383201 | `kusto_database_list` | ❌ | - ---- - -## Test 103 -======= -| 5 | 0.383621 | `kusto_database_list` | ❌ | - ---- - -## Test 113 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 3 | 0.437348 | `storage_blob_container_get` | ❌ | +| 3 | 0.437509 | `storage_blob_container_get` | ❌ | | 4 | 0.433927 | `cosmos_database_container_list` | ❌ | -| 5 | 0.383183 | `kusto_database_list` | ❌ | +| 5 | 0.383387 | `kusto_database_list` | ❌ | --- ## Test 118 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `acr_registry_repository_list` **Prompt:** Show me the repositories in the container registry @@ -4832,29 +2490,13 @@ |------|-------|------|--------| | 1 | 0.600780 | `acr_registry_repository_list` | ✅ **EXPECTED** | | 2 | 0.501842 | `acr_registry_list` | ❌ | -<<<<<<< HEAD -| 3 | 0.431148 | `storage_blob_container_get` | ❌ | -======= -| 3 | 0.430783 | `storage_blob_container_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 3 | 0.430880 | `storage_blob_container_get` | ❌ | | 4 | 0.418623 | `cosmos_database_container_list` | ❌ | -| 5 | 0.378216 | `redis_list` | ❌ | +| 5 | 0.378151 | `redis_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 109 -======= -<<<<<<< HEAD -## Test 104 -======= -## Test 114 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 119 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send an email to with subject @@ -4863,42 +2505,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.498396 | `communication_email_send` | ✅ **EXPECTED** | -| 2 | 0.229071 | `communication_sms_send` | ❌ | -======= | 1 | 0.498292 | `communication_email_send` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.226847 | 
`communication_sms_send` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.188975 | `eventgrid_events_publish` | ❌ | -| 4 | 0.161257 | `foundry_agents_create` | ❌ | -| 5 | 0.146045 | `servicebus_topic_details` | ❌ | - ---- - -<<<<<<< HEAD -## Test 110 -======= -## Test 105 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.229081 | `communication_sms_send` | ❌ | | 3 | 0.188975 | `eventgrid_events_publish` | ❌ | -| 4 | 0.161257 | `foundry_agents_create` | ❌ | +| 4 | 0.161150 | `foundry_agents_create` | ❌ | | 5 | 0.145951 | `servicebus_topic_details` | ❌ | --- -<<<<<<< HEAD -## Test 115 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 120 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send an email from my communication service to @@ -4907,17 +2522,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.498459 | `communication_email_send` | ✅ **EXPECTED** | -| 2 | 0.314408 | `communication_sms_send` | ❌ | -| 3 | 0.235110 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.211067 | `search_knowledge_base_retrieve` | ❌ | -| 5 | 0.210014 | `foundry_agents_create` | ❌ | - ---- - -## Test 111 -======= | 1 | 0.498406 | `communication_email_send` | ✅ **EXPECTED** | | 2 | 0.314462 | `communication_sms_send` | ❌ | | 3 | 0.235127 | `foundry_openai_chat-completions-create` | ❌ | @@ -4926,13 +2530,7 @@ --- -<<<<<<< HEAD -## Test 116 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 121 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send HTML-formatted email to with subject @@ -4941,34 +2539,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 
0.521087 | `communication_email_send` | ✅ **EXPECTED** | -| 2 | 0.207644 | `communication_sms_send` | ❌ | -| 3 | 0.152418 | `eventgrid_events_publish` | ❌ | -| 4 | 0.152056 | `servicebus_topic_details` | ❌ | -======= | 1 | 0.520967 | `communication_email_send` | ✅ **EXPECTED** | | 2 | 0.207658 | `communication_sms_send` | ❌ | | 3 | 0.152418 | `eventgrid_events_publish` | ❌ | | 4 | 0.152013 | `servicebus_topic_details` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.143660 | `foundry_agents_evaluate` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 112 -======= -<<<<<<< HEAD -## Test 107 -======= -## Test 117 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 122 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send email with CC to and @@ -4977,39 +2556,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.533532 | `communication_email_send` | ✅ **EXPECTED** | -| 2 | 0.219566 | `communication_sms_send` | ❌ | -| 3 | 0.106042 | `foundry_agents_query-and-evaluate` | ❌ | -======= | 1 | 0.533447 | `communication_email_send` | ✅ **EXPECTED** | | 2 | 0.219584 | `communication_sms_send` | ❌ | | 3 | 0.106026 | `foundry_agents_query-and-evaluate` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.103723 | `foundry_openai_chat-completions-create` | ❌ | | 5 | 0.084905 | `cosmos_account_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 113 -======= -## Test 108 -======= -| 2 | 0.219584 | `communication_sms_send` | ❌ | -| 3 | 0.106044 | `foundry_agents_query-and-evaluate` | ❌ | -| 4 | 0.087784 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.084933 | `cosmos_account_list` | ❌ | - ---- - -## Test 118 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description 
evaluator) -======= ## Test 123 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send email to multiple recipients: , @@ -5018,17 +2573,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.540910 | `communication_email_send` | ✅ **EXPECTED** | -| 2 | 0.244525 | `communication_sms_send` | ❌ | -| 3 | 0.134996 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.114359 | `foundry_agents_query-and-evaluate` | ❌ | -| 5 | 0.087005 | `postgres_server_param_set` | ❌ | - ---- - -## Test 114 -======= | 1 | 0.540792 | `communication_email_send` | ✅ **EXPECTED** | | 2 | 0.244521 | `communication_sms_send` | ❌ | | 3 | 0.134975 | `foundry_openai_chat-completions-create` | ❌ | @@ -5037,16 +2581,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 109 -======= -## Test 119 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 124 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send email with reply-to address set to @@ -5055,33 +2590,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.512721 | `communication_email_send` | ✅ **EXPECTED** | -| 2 | 0.200189 | `communication_sms_send` | ❌ | -| 3 | 0.164422 | `mysql_server_param_set` | ❌ | -======= | 1 | 0.512623 | `communication_email_send` | ✅ **EXPECTED** | | 2 | 0.200177 | `communication_sms_send` | ❌ | | 3 | 0.164115 | `mysql_server_param_set` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.158759 | `postgres_server_param_set` | ❌ | | 5 | 0.143574 | `appconfig_kv_set` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 115 -======= -<<<<<<< HEAD -## Test 110 -======= -## Test 120 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 125 
->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send email with custom sender name @@ -5090,45 +2607,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.473192 | `communication_email_send` | ✅ **EXPECTED** | -| 2 | 0.255124 | `communication_sms_send` | ❌ | -======= | 1 | 0.473175 | `communication_email_send` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.253449 | `communication_sms_send` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.164811 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.160285 | `foundry_openai_embeddings-create` | ❌ | -======= | 2 | 0.255169 | `communication_sms_send` | ❌ | | 3 | 0.164811 | `foundry_openai_chat-completions-create` | ❌ | | 4 | 0.160393 | `foundry_openai_embeddings-create` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.156869 | `cosmos_database_container_item_query` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 116 -======= -## Test 111 -======= -| 2 | 0.255169 | `communication_sms_send` | ❌ | -| 3 | 0.156869 | `cosmos_database_container_item_query` | ❌ | -| 4 | 0.143626 | `sql_db_rename` | ❌ | -| 5 | 0.139388 | `foundry_openai_chat-completions-create` | ❌ | - ---- - -## Test 121 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 126 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_email_send` **Prompt:** Send an email with BCC recipients @@ -5137,38 +2624,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.528899 | `communication_email_send` | ✅ **EXPECTED** | -| 2 | 0.241091 | `communication_sms_send` | ❌ | -| 3 | 0.137538 | `confidentialledger_entries_append` | ❌ | -| 4 | 0.108748 | `confidentialledger_entries_get` | ❌ | -======= -| 1 | 0.528789 | `communication_email_send` | ✅ **EXPECTED** | -| 2 | 0.241114 
| `communication_sms_send` | ❌ | -| 3 | 0.137538 | `confidentialledger_entries_append` | ❌ | -| 4 | 0.108748 | `confidentialledger_entries_get` | ❌ | -<<<<<<< HEAD ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 5 | 0.105033 | `storage_blob_upload` | ❌ | +| 1 | 0.528759 | `communication_email_send` | ✅ **EXPECTED** | +| 2 | 0.241047 | `communication_sms_send` | ❌ | +| 3 | 0.137545 | `confidentialledger_entries_append` | ❌ | +| 4 | 0.108728 | `confidentialledger_entries_get` | ❌ | +| 5 | 0.105048 | `storage_blob_upload` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 117 -======= -<<<<<<< HEAD -## Test 112 -======= -## Test 122 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 127 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send an SMS message to saying "Hello" @@ -5177,43 +2641,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.533822 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.251480 | `communication_email_send` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.533763 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.251429 | `communication_email_send` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.218656 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.175534 | `foundry_agents_create` | ❌ | -| 5 | 0.156040 | `foundry_threads_create` | ❌ | - ---- - -<<<<<<< HEAD -## Test 118 -======= -## Test 113 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.533868 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.251429 | `communication_email_send` | ❌ | | 3 | 0.218656 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.175534 | `foundry_agents_create` | ❌ | +| 
4 | 0.175341 | `foundry_agents_create` | ❌ | | 5 | 0.166041 | `speech_tts_synthesize` | ❌ | --- -<<<<<<< HEAD -## Test 123 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 128 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send SMS to from with message "Test message" @@ -5222,46 +2658,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.546006 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.294912 | `communication_email_send` | ❌ | -| 3 | 0.204585 | `loadtesting_testrun_create` | ❌ | -| 4 | 0.200656 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.141105 | `foundry_agents_create` | ❌ | - ---- - -## Test 119 -======= -<<<<<<< HEAD -| 1 | 0.543875 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.294603 | `communication_email_send` | ❌ | -| 3 | 0.204487 | `loadtesting_testrun_create` | ❌ | -| 4 | 0.200633 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.136763 | `loadtesting_testrun_update` | ❌ | +| 1 | 0.545976 | `communication_sms_send` | ✅ **EXPECTED** | +| 2 | 0.294793 | `communication_email_send` | ❌ | +| 3 | 0.204688 | `loadtesting_testrun_create` | ❌ | +| 4 | 0.200676 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.141118 | `foundry_agents_create` | ❌ | --- -## Test 114 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.546019 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.294859 | `communication_email_send` | ❌ | -| 3 | 0.204588 | `loadtesting_testrun_create` | ❌ | -| 4 | 0.200655 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.141113 | `foundry_agents_create` | ❌ | - ---- - -<<<<<<< HEAD -## Test 124 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 129 ->>>>>>> e2fd2eac (refactor 
tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send SMS to multiple recipients: , @@ -5270,40 +2675,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.545744 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.422028 | `communication_email_send` | ❌ | -| 3 | 0.186088 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.142054 | `foundry_agents_query-and-evaluate` | ❌ | -| 5 | 0.113722 | `foundry_threads_get-messages` | ❌ | - ---- - -## Test 120 -======= -<<<<<<< HEAD -| 1 | 0.543753 | `communication_sms_send` | ✅ **EXPECTED** | -======= | 1 | 0.545755 | `communication_sms_send` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.421988 | `communication_email_send` | ❌ | | 3 | 0.186088 | `foundry_openai_chat-completions-create` | ❌ | | 4 | 0.142030 | `foundry_agents_query-and-evaluate` | ❌ | -| 5 | 0.113722 | `foundry_threads_get-messages` | ❌ | +| 5 | 0.113656 | `foundry_threads_get-messages` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 115 -======= -## Test 125 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 130 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send SMS with delivery reporting enabled @@ -5312,47 +2692,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.554917 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.269203 | `communication_email_send` | ❌ | -| 3 | 0.191848 | `extension_azqr` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.548617 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.269080 | `communication_email_send` | ❌ | -| 3 | 0.192340 | `extension_azqr` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.554908 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.269080 | 
`communication_email_send` | ❌ | | 3 | 0.191848 | `extension_azqr` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.185916 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.170749 | `foundry_agents_query-and-evaluate` | ❌ | - ---- - -<<<<<<< HEAD -<<<<<<< HEAD -## Test 121 -======= -## Test 116 -======= -| 1 | 0.554908 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.269080 | `communication_email_send` | ❌ | -| 3 | 0.191848 | `extension_azqr` | ❌ | -| 4 | 0.170743 | `foundry_agents_query-and-evaluate` | ❌ | -| 5 | 0.166385 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.170726 | `foundry_agents_query-and-evaluate` | ❌ | --- -## Test 126 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 131 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send SMS message with custom tracking tag "campaign1" @@ -5361,43 +2709,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.538893 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.269915 | `communication_email_send` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.534739 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.269794 | `communication_email_send` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.188153 | `loadtesting_testrun_create` | ❌ | -| 4 | 0.185403 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.175135 | `foundry_agents_create` | ❌ | - ---- - -<<<<<<< HEAD -## Test 122 -======= -## Test 117 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.538827 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.269794 | `communication_email_send` | ❌ | | 3 | 0.188153 | `loadtesting_testrun_create` | ❌ | | 4 | 0.185403 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.175135 | `foundry_agents_create` | ❌ | +| 5 | 0.174747 | 
`foundry_agents_create` | ❌ | --- -<<<<<<< HEAD -## Test 127 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 132 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send broadcast SMS to and saying "Urgent notification" @@ -5406,51 +2726,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.474775 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.286381 | `communication_email_send` | ❌ | -| 3 | 0.164341 | `foundry_agents_query-and-evaluate` | ❌ | -| 4 | 0.147338 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.128704 | `cosmos_account_list` | ❌ | - ---- - -## Test 123 -======= -<<<<<<< HEAD -| 1 | 0.471991 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.286936 | `communication_email_send` | ❌ | -| 3 | 0.164059 | `foundry_agents_query-and-evaluate` | ❌ | -| 4 | 0.146501 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.128592 | `cosmos_account_list` | ❌ | - ---- - -## Test 118 -======= | 1 | 0.474786 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.286338 | `communication_email_send` | ❌ | -| 3 | 0.164288 | `foundry_agents_query-and-evaluate` | ❌ | -| 4 | 0.129965 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.128744 | `cosmos_account_list` | ❌ | - ---- - -## Test 128 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.474935 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.286337 | `communication_email_send` | ❌ | -| 3 | 0.164209 | `foundry_agents_query-and-evaluate` | ❌ | -| 4 | 0.147352 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.128661 | `cosmos_account_list` | ❌ | +| 3 | 0.164289 | `foundry_agents_query-and-evaluate` | ❌ | +| 4 | 0.147338 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.128704 
| `cosmos_account_list` | ❌ | --- ## Test 133 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send SMS from my communication service to @@ -5459,46 +2743,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.564058 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.302377 | `communication_email_send` | ❌ | -| 3 | 0.238340 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.184240 | `foundry_agents_create` | ❌ | -| 5 | 0.183684 | `search_knowledge_base_retrieve` | ❌ | - ---- - -## Test 124 -======= -<<<<<<< HEAD -| 1 | 0.563359 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.302360 | `communication_email_send` | ❌ | -| 3 | 0.238341 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.183684 | `search_knowledge_base_retrieve` | ❌ | -| 5 | 0.174092 | `foundry_openai_create-completion` | ❌ | - ---- - -## Test 119 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.564114 | `communication_sms_send` | ✅ **EXPECTED** | | 2 | 0.302363 | `communication_email_send` | ❌ | | 3 | 0.238296 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.184264 | `foundry_agents_create` | ❌ | +| 4 | 0.184482 | `foundry_agents_create` | ❌ | | 5 | 0.183651 | `search_knowledge_base_retrieve` | ❌ | --- -<<<<<<< HEAD -## Test 129 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 134 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `communication_sms_send` **Prompt:** Send an SMS with delivery receipt tracking @@ -5507,45 +2760,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.598236 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.314267 | `communication_email_send` | ❌ | -| 3 | 0.206931 | `foundry_agents_query-and-evaluate` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.592519 | 
`communication_sms_send` | ✅ **EXPECTED** | -======= | 1 | 0.598211 | `communication_sms_send` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.314134 | `communication_email_send` | ❌ | | 3 | 0.206916 | `foundry_agents_query-and-evaluate` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.201142 | `foundry_openai_chat-completions-create` | ❌ | -| 5 | 0.187824 | `confidentialledger_entries_append` | ❌ | - ---- - -<<<<<<< HEAD -<<<<<<< HEAD -## Test 125 -======= -## Test 120 -======= -| 1 | 0.598211 | `communication_sms_send` | ✅ **EXPECTED** | -| 2 | 0.314134 | `communication_email_send` | ❌ | -| 3 | 0.206814 | `foundry_agents_query-and-evaluate` | ❌ | -| 4 | 0.187824 | `confidentialledger_entries_append` | ❌ | -| 5 | 0.181824 | `foundry_openai_chat-completions-create` | ❌ | +| 5 | 0.187855 | `confidentialledger_entries_append` | ❌ | --- -## Test 130 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 135 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** Append an entry to my ledger with data {"key": "value"} @@ -5554,25 +2777,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.511241 | `confidentialledger_entries_append` | ✅ **EXPECTED** | -| 2 | 0.295319 | `confidentialledger_entries_get` | ❌ | -| 3 | 0.291757 | `appconfig_kv_set` | ❌ | -| 4 | 0.258741 | `appconfig_kv_lock_set` | ❌ | -| 5 | 0.250106 | `keyvault_certificate_import` | ❌ | - ---- - -## Test 126 -======= -<<<<<<< HEAD -| 1 | 0.510689 | `confidentialledger_entries_append` | ✅ **EXPECTED** | -| 2 | 0.293736 | `confidentialledger_entries_get` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.510651 | `confidentialledger_entries_append` | ✅ **EXPECTED** | +| 1 | 0.510554 | `confidentialledger_entries_append` | ✅ **EXPECTED** 
| | 2 | 0.294885 | `confidentialledger_entries_get` | ❌ | | 3 | 0.292014 | `appconfig_kv_set` | ❌ | | 4 | 0.258967 | `appconfig_kv_lock_set` | ❌ | @@ -5580,16 +2785,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 121 -======= -## Test 131 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 136 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** Write a tamper-proof entry to ledger containing {"transaction": "data"} @@ -5598,40 +2794,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.602321 | `confidentialledger_entries_append` | ✅ **EXPECTED** | -| 2 | 0.357401 | `confidentialledger_entries_get` | ❌ | -| 3 | 0.211998 | `appconfig_kv_lock_set` | ❌ | -| 4 | 0.195461 | `keyvault_secret_create` | ❌ | -| 5 | 0.184070 | `keyvault_certificate_import` | ❌ | - ---- - -## Test 127 -======= -| 1 | 0.602257 | `confidentialledger_entries_append` | ✅ **EXPECTED** | -| 2 | 0.356510 | `confidentialledger_entries_get` | ❌ | -======= -| 1 | 0.602247 | `confidentialledger_entries_append` | ✅ **EXPECTED** | -| 2 | 0.357646 | `confidentialledger_entries_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 3 | 0.211990 | `appconfig_kv_lock_set` | ❌ | -| 4 | 0.195471 | `keyvault_secret_create` | ❌ | -| 5 | 0.184077 | `keyvault_certificate_import` | ❌ | +| 1 | 0.602324 | `confidentialledger_entries_append` | ✅ **EXPECTED** | +| 2 | 0.357780 | `confidentialledger_entries_get` | ❌ | +| 3 | 0.211969 | `appconfig_kv_lock_set` | ❌ | +| 4 | 0.195485 | `keyvault_secret_create` | ❌ | +| 5 | 0.184066 | `keyvault_certificate_import` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 122 -======= -## Test 132 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 137 ->>>>>>> e2fd2eac (refactor tts mcp 
tool) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** Append {"hello": "from mcp"} to my confidential ledger in collection @@ -5640,51 +2811,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.546786 | `confidentialledger_entries_append` | ✅ **EXPECTED** | -| 2 | 0.452117 | `confidentialledger_entries_get` | ❌ | -| 3 | 0.225013 | `appconfig_kv_lock_set` | ❌ | -| 4 | 0.215828 | `appconfig_kv_set` | ❌ | -| 5 | 0.203162 | `keyvault_certificate_import` | ❌ | - ---- - -## Test 128 -======= -<<<<<<< HEAD -| 1 | 0.546573 | `confidentialledger_entries_append` | ✅ **EXPECTED** | -| 2 | 0.451031 | `confidentialledger_entries_get` | ❌ | -| 3 | 0.224978 | `appconfig_kv_lock_set` | ❌ | -| 4 | 0.215862 | `appconfig_kv_set` | ❌ | -| 5 | 0.203109 | `keyvault_certificate_import` | ❌ | - ---- - -## Test 123 -======= -| 1 | 0.546675 | `confidentialledger_entries_append` | ✅ **EXPECTED** | -| 2 | 0.452058 | `confidentialledger_entries_get` | ❌ | -| 3 | 0.225145 | `appconfig_kv_lock_set` | ❌ | -| 4 | 0.215898 | `appconfig_kv_set` | ❌ | -| 5 | 0.211661 | `appservice_database_add` | ❌ | - ---- - -## Test 133 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.546660 | `confidentialledger_entries_append` | ✅ **EXPECTED** | -| 2 | 0.451994 | `confidentialledger_entries_get` | ❌ | -| 3 | 0.225141 | `appconfig_kv_lock_set` | ❌ | -| 4 | 0.215932 | `appconfig_kv_set` | ❌ | -| 5 | 0.203262 | `keyvault_certificate_import` | ❌ | +| 1 | 0.546394 | `confidentialledger_entries_append` | ✅ **EXPECTED** | +| 2 | 0.451837 | `confidentialledger_entries_get` | ❌ | +| 3 | 0.225163 | `appconfig_kv_lock_set` | ❌ | +| 4 | 0.216036 | `appconfig_kv_set` | ❌ | +| 5 | 0.203220 | `keyvault_certificate_import` | ❌ | --- ## Test 138 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** 
Create an immutable ledger entry in with content {"audit": "log"} @@ -5693,47 +2828,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.496023 | `confidentialledger_entries_append` | ✅ **EXPECTED** | -| 2 | 0.340187 | `confidentialledger_entries_get` | ❌ | -| 3 | 0.218473 | `monitor_activitylog_list` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.496032 | `confidentialledger_entries_append` | ✅ **EXPECTED** | -| 2 | 0.338270 | `confidentialledger_entries_get` | ❌ | -| 3 | 0.218518 | `monitor_activitylog_list` | ❌ | -======= -| 1 | 0.496023 | `confidentialledger_entries_append` | ✅ **EXPECTED** | +| 1 | 0.496008 | `confidentialledger_entries_append` | ✅ **EXPECTED** | | 2 | 0.340187 | `confidentialledger_entries_get` | ❌ | | 3 | 0.218473 | `monitor_activitylog_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.215229 | `storage_blob_container_create` | ❌ | | 5 | 0.204925 | `monitor_resource_log_query` | ❌ | --- -<<<<<<< HEAD -## Test 129 -======= -<<<<<<< HEAD -## Test 124 -======= -## Test 134 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.495719 | `confidentialledger_entries_append` | ✅ **EXPECTED** | -| 2 | 0.340160 | `confidentialledger_entries_get` | ❌ | -| 3 | 0.218437 | `monitor_activitylog_list` | ❌ | -| 4 | 0.215039 | `storage_blob_container_create` | ❌ | -| 5 | 0.204909 | `monitor_resource_log_query` | ❌ | - ---- - ## Test 139 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `confidentialledger_entries_append` **Prompt:** Write an entry to confidential ledger @@ -5742,27 +2845,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.622138 | `confidentialledger_entries_append` | ✅ **EXPECTED** | +| 1 | 0.622014 | `confidentialledger_entries_append` | ✅ **EXPECTED** | | 2 | 
0.524777 | `confidentialledger_entries_get` | ❌ | | 3 | 0.252508 | `appconfig_kv_lock_set` | ❌ | -| 4 | 0.240252 | `keyvault_secret_create` | ❌ | +| 4 | 0.240315 | `keyvault_secret_create` | ❌ | | 5 | 0.186890 | `appconfig_kv_set` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 130 -======= -<<<<<<< HEAD -## Test 125 -======= -## Test 135 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 140 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `confidentialledger_entries_get` **Prompt:** Get entry from Confidential Ledger for transaction on ledger @@ -5771,46 +2862,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.707252 | `confidentialledger_entries_get` | ✅ **EXPECTED** | -| 2 | 0.551953 | `confidentialledger_entries_append` | ❌ | -| 3 | 0.245549 | `keyvault_secret_get` | ❌ | -| 4 | 0.231190 | `keyvault_key_get` | ❌ | -| 5 | 0.211839 | `loadtesting_testrun_get` | ❌ | - ---- - -## Test 131 -======= -<<<<<<< HEAD -| 1 | 0.706506 | `confidentialledger_entries_get` | ✅ **EXPECTED** | -| 2 | 0.551901 | `confidentialledger_entries_append` | ❌ | -| 3 | 0.245541 | `keyvault_secret_get` | ❌ | -| 4 | 0.229943 | `keyvault_key_get` | ❌ | -| 5 | 0.212658 | `loadtesting_testrun_get` | ❌ | - ---- - -## Test 126 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.707252 | `confidentialledger_entries_get` | ✅ **EXPECTED** | -| 2 | 0.551953 | `confidentialledger_entries_append` | ❌ | +| 2 | 0.551952 | `confidentialledger_entries_append` | ❌ | | 3 | 0.245541 | `keyvault_secret_get` | ❌ | | 4 | 0.229943 | `keyvault_key_get` | ❌ | | 5 | 0.211839 | `loadtesting_testrun_get` | ❌ | --- -<<<<<<< HEAD -## Test 136 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 141 ->>>>>>> e2fd2eac (refactor tts mcp tool) 
**Expected Tool:** `confidentialledger_entries_get` **Prompt:** Get transaction from ledger @@ -5819,45 +2879,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.509714 | `confidentialledger_entries_get` | ✅ **EXPECTED** | -| 2 | 0.416580 | `confidentialledger_entries_append` | ❌ | -| 3 | 0.223959 | `loadtesting_testrun_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.510283 | `confidentialledger_entries_get` | ✅ **EXPECTED** | -| 2 | 0.416550 | `confidentialledger_entries_append` | ❌ | -| 3 | 0.224523 | `loadtesting_testrun_get` | ❌ | -======= -| 1 | 0.509714 | `confidentialledger_entries_get` | ✅ **EXPECTED** | -| 2 | 0.416580 | `confidentialledger_entries_append` | ❌ | -| 3 | 0.224029 | `loadtesting_testrun_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.509714 | `confidentialledger_entries_get` | ✅ **EXPECTED** | -| 2 | 0.416580 | `confidentialledger_entries_append` | ❌ | +| 2 | 0.416730 | `confidentialledger_entries_append` | ❌ | | 3 | 0.223959 | `loadtesting_testrun_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.218412 | `monitor_resource_log_query` | ❌ | | 5 | 0.217671 | `loadtesting_testrun_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 132 -======= -<<<<<<< HEAD -## Test 127 -======= -## Test 137 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 142 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_account_list` **Prompt:** List all cosmosdb accounts in my subscription @@ -5874,22 +2904,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 133 -======= -## Test 128 -======= -| 5 | 0.601388 | `kusto_cluster_list` | ❌ | - ---- - -## Test 138 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) 
-======= ## Test 143 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_account_list` **Prompt:** Show me my cosmosdb accounts @@ -5898,36 +2913,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.665422 | `cosmos_account_list` | ✅ **EXPECTED** | -| 2 | 0.605325 | `cosmos_database_list` | ❌ | -| 3 | 0.571573 | `cosmos_database_container_list` | ❌ | -| 4 | 0.549420 | `cosmos_database_container_item_query` | ❌ | -| 5 | 0.503865 | `storage_account_get` | ❌ | - ---- - -## Test 134 -======= -| 1 | 0.665440 | `cosmos_account_list` | ✅ **EXPECTED** | -======= -| 1 | 0.665447 | `cosmos_account_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 2 | 0.605357 | `cosmos_database_list` | ❌ | -| 3 | 0.571613 | `cosmos_database_container_list` | ❌ | -| 4 | 0.549447 | `cosmos_database_container_item_query` | ❌ | -| 5 | 0.503830 | `storage_account_get` | ❌ | +| 1 | 0.665477 | `cosmos_account_list` | ✅ **EXPECTED** | +| 2 | 0.605386 | `cosmos_database_list` | ❌ | +| 3 | 0.571628 | `cosmos_database_container_list` | ❌ | +| 4 | 0.549485 | `cosmos_database_container_item_query` | ❌ | +| 5 | 0.503849 | `storage_account_get` | ❌ | --- -<<<<<<< HEAD -## Test 139 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 144 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_account_list` **Prompt:** Show me the cosmosdb accounts in my subscription @@ -5936,37 +2930,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.752494 | `cosmos_account_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.607165 | `subscription_list` | ❌ | -| 3 | 0.605125 | `cosmos_database_list` | ❌ | -| 4 | 0.566249 | `cosmos_database_container_list` | ❌ | -| 5 | 0.563922 | `cosmos_database_container_item_query` | ❌ | - ---- - -<<<<<<< HEAD -## Test 135 -======= -## Test 130 -======= -| 1 | 
0.752501 | `cosmos_account_list` | ✅ **EXPECTED** | -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 2 | 0.607201 | `subscription_list` | ❌ | -| 3 | 0.605125 | `cosmos_database_list` | ❌ | -| 4 | 0.566249 | `cosmos_database_container_list` | ❌ | -| 5 | 0.563921 | `cosmos_database_container_item_query` | ❌ | +| 1 | 0.752413 | `cosmos_account_list` | ✅ **EXPECTED** | +| 2 | 0.606937 | `subscription_list` | ❌ | +| 3 | 0.605196 | `cosmos_database_list` | ❌ | +| 4 | 0.566379 | `cosmos_database_container_list` | ❌ | +| 5 | 0.564082 | `cosmos_database_container_item_query` | ❌ | --- -<<<<<<< HEAD -## Test 140 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 145 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_database_container_item_query` **Prompt:** Show me the items that contain the word in the container in the database for the cosmosdb account @@ -5975,41 +2947,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.658701 | `cosmos_database_container_item_query` | ✅ **EXPECTED** | -======= -<<<<<<< HEAD -| 1 | 0.658738 | `cosmos_database_container_item_query` | ✅ **EXPECTED** | -======= -| 1 | 0.658701 | `cosmos_database_container_item_query` | ✅ **EXPECTED** | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 2 | 0.605253 | `cosmos_database_container_list` | ❌ | -| 3 | 0.488353 | `storage_blob_container_get` | ❌ | -======= | 1 | 0.658701 | `cosmos_database_container_item_query` | ✅ **EXPECTED** | | 2 | 0.605253 | `cosmos_database_container_list` | ❌ | -| 3 | 0.487789 | `storage_blob_container_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 3 | 0.487612 | `storage_blob_container_get` | ❌ | | 4 | 0.477874 | `cosmos_database_list` | ❌ | | 5 | 0.447757 | `cosmos_account_list` | ❌ | --- -<<<<<<< HEAD 
-<<<<<<< HEAD -## Test 136 -======= -<<<<<<< HEAD -## Test 131 -======= -## Test 141 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 146 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_database_container_list` **Prompt:** List all the containers in the database for the cosmosdb account @@ -6018,35 +2964,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.852875 | `cosmos_database_container_list` | ✅ **EXPECTED** | -| 2 | 0.680991 | `cosmos_database_list` | ❌ | -| 3 | 0.680758 | `cosmos_database_container_item_query` | ❌ | -| 4 | 0.632634 | `storage_blob_container_get` | ❌ | -| 5 | 0.630588 | `cosmos_account_list` | ❌ | - ---- - -## Test 137 -======= -| 1 | 0.852832 | `cosmos_database_container_list` | ✅ **EXPECTED** | -| 2 | 0.681044 | `cosmos_database_list` | ❌ | -| 3 | 0.680762 | `cosmos_database_container_item_query` | ❌ | -| 4 | 0.632577 | `storage_blob_container_get` | ❌ | -| 5 | 0.630659 | `cosmos_account_list` | ❌ | +| 1 | 0.852826 | `cosmos_database_container_list` | ✅ **EXPECTED** | +| 2 | 0.681006 | `cosmos_database_list` | ❌ | +| 3 | 0.680795 | `cosmos_database_container_item_query` | ❌ | +| 4 | 0.632368 | `storage_blob_container_get` | ❌ | +| 5 | 0.630666 | `cosmos_account_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 132 -======= -## Test 142 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 147 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_database_container_list` **Prompt:** Show me the containers in the database for the cosmosdb account @@ -6055,46 +2981,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.789395 | `cosmos_database_container_list` | ✅ **EXPECTED** | -| 2 | 0.648126 | 
`cosmos_database_container_item_query` | ❌ | -| 3 | 0.614220 | `cosmos_database_list` | ❌ | -| 4 | 0.591350 | `storage_blob_container_get` | ❌ | -| 5 | 0.562062 | `cosmos_account_list` | ❌ | - ---- - -## Test 138 -======= -<<<<<<< HEAD -| 1 | 0.789413 | `cosmos_database_container_list` | ✅ **EXPECTED** | -| 2 | 0.648207 | `cosmos_database_container_item_query` | ❌ | -| 3 | 0.614278 | `cosmos_database_list` | ❌ | -| 4 | 0.591387 | `storage_blob_container_get` | ❌ | -| 5 | 0.562096 | `cosmos_account_list` | ❌ | - ---- - -## Test 133 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.789395 | `cosmos_database_container_list` | ✅ **EXPECTED** | | 2 | 0.648126 | `cosmos_database_container_item_query` | ❌ | | 3 | 0.614220 | `cosmos_database_list` | ❌ | -| 4 | 0.591594 | `storage_blob_container_get` | ❌ | +| 4 | 0.591361 | `storage_blob_container_get` | ❌ | | 5 | 0.562062 | `cosmos_account_list` | ❌ | --- -<<<<<<< HEAD -## Test 143 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 148 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_database_list` **Prompt:** List all the databases in the cosmosdb account @@ -6106,37 +3001,12 @@ | 1 | 0.815683 | `cosmos_database_list` | ✅ **EXPECTED** | | 2 | 0.668515 | `cosmos_account_list` | ❌ | | 3 | 0.665298 | `cosmos_database_container_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.606433 | `cosmos_database_container_item_query` | ❌ | -| 5 | 0.582804 | `kusto_database_list` | ❌ | - ---- - -## Test 139 -======= -<<<<<<< HEAD -| 4 | 0.606414 | `cosmos_database_container_item_query` | ❌ | -| 5 | 0.583507 | `kusto_database_list` | ❌ | - ---- - -## Test 134 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.606433 | `cosmos_database_container_item_query` | ❌ | -| 5 | 0.583535 | `kusto_database_list` | ❌ | +| 5 | 0.583402 | `kusto_database_list` | ❌ | --- -<<<<<<< HEAD -## Test 144 
->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 149 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `cosmos_database_list` **Prompt:** Show me the databases in the cosmosdb account @@ -6148,36 +3018,12 @@ | 1 | 0.749370 | `cosmos_database_list` | ✅ **EXPECTED** | | 2 | 0.624759 | `cosmos_database_container_list` | ❌ | | 3 | 0.614572 | `cosmos_account_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.579919 | `cosmos_database_container_item_query` | ❌ | -======= -| 4 | 0.579913 | `cosmos_database_container_item_query` | ❌ | -======= -| 3 | 0.614554 | `cosmos_account_list` | ❌ | | 4 | 0.579919 | `cosmos_database_container_item_query` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.579919 | `cosmos_database_container_item_query` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 5 | 0.538479 | `mysql_database_list` | ❌ | +| 5 | 0.538045 | `mysql_database_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 140 -======= -<<<<<<< HEAD -## Test 135 -======= -## Test 145 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 150 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_cluster_get` **Prompt:** Show me the details of the Data Explorer cluster @@ -6189,37 +3035,12 @@ | 1 | 0.590264 | `kusto_cluster_get` | ✅ **EXPECTED** | | 2 | 0.463832 | `kusto_cluster_list` | ❌ | | 3 | 0.428159 | `kusto_query` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.425909 | `kusto_database_list` | ❌ | -======= -| 4 | 0.425688 | `kusto_database_list` | ❌ | -======= -| 2 | 0.463623 | `kusto_cluster_list` | ❌ | -| 3 | 0.428159 | `kusto_query` | ❌ | -| 4 | 0.425469 | `kusto_database_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 
58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.425669 | `kusto_database_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 4 | 0.425508 | `kusto_database_list` | ❌ | | 5 | 0.422577 | `kusto_table_schema` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 141 -======= -<<<<<<< HEAD -## Test 136 -======= -## Test 146 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 151 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_cluster_list` **Prompt:** List all Data Explorer clusters in my subscription @@ -6229,38 +3050,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.793744 | `kusto_cluster_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.630451 | `kusto_database_list` | ❌ | -======= -| 2 | 0.630504 | `kusto_database_list` | ❌ | -======= -| 1 | 0.793453 | `kusto_cluster_list` | ✅ **EXPECTED** | -| 2 | 0.630261 | `kusto_database_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 2 | 0.630507 | `kusto_database_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 2 | 0.630260 | `kusto_database_list` | ❌ | | 3 | 0.573395 | `kusto_cluster_get` | ❌ | | 4 | 0.525025 | `aks_cluster_get` | ❌ | -| 5 | 0.509397 | `grafana_list` | ❌ | +| 5 | 0.509396 | `grafana_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 142 -======= -<<<<<<< HEAD -## Test 137 -======= -## Test 147 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 152 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_cluster_list` **Prompt:** Show me my Data Explorer clusters @@ -6271,38 +3068,13 @@ |------|-------|------|--------| | 1 | 0.531307 | `kusto_cluster_list` | ✅ **EXPECTED** | | 2 | 0.465277 | 
`kusto_cluster_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.432311 | `kusto_database_list` | ❌ | -======= -| 3 | 0.432320 | `kusto_database_list` | ❌ | -======= -| 1 | 0.530932 | `kusto_cluster_list` | ✅ **EXPECTED** | -| 2 | 0.465277 | `kusto_cluster_get` | ❌ | -| 3 | 0.432552 | `kusto_database_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 3 | 0.432288 | `kusto_database_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 3 | 0.432028 | `kusto_database_list` | ❌ | | 4 | 0.369596 | `aks_cluster_get` | ❌ | | 5 | 0.363119 | `kusto_table_schema` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 143 -======= -<<<<<<< HEAD -## Test 138 -======= -## Test 148 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 153 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_cluster_list` **Prompt:** Show me the Data Explorer clusters in my subscription @@ -6313,38 +3085,13 @@ |------|-------|------|--------| | 1 | 0.701484 | `kusto_cluster_list` | ✅ **EXPECTED** | | 2 | 0.571191 | `kusto_cluster_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.548734 | `kusto_database_list` | ❌ | -======= -| 3 | 0.548690 | `kusto_database_list` | ❌ | -======= -| 1 | 0.701232 | `kusto_cluster_list` | ✅ **EXPECTED** | -| 2 | 0.571191 | `kusto_cluster_get` | ❌ | -| 3 | 0.548589 | `kusto_database_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 3 | 0.548685 | `kusto_database_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 3 | 0.548216 | `kusto_database_list` | ❌ | | 4 | 0.498909 | `aks_cluster_get` | ❌ | -| 5 | 0.474253 | `redis_list` | ❌ | +| 5 | 0.474201 | `redis_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 144 -======= -<<<<<<< HEAD -## Test 139 -======= 
-## Test 149 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 154 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_database_list` **Prompt:** List all databases in the Data Explorer cluster @@ -6353,42 +3100,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.676656 | `kusto_database_list` | ✅ **EXPECTED** | -| 2 | 0.560592 | `kusto_cluster_list` | ❌ | -| 3 | 0.556795 | `kusto_table_list` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.677042 | `kusto_database_list` | ✅ **EXPECTED** | -======= -| 1 | 0.677059 | `kusto_database_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 2 | 0.560592 | `kusto_cluster_list` | ❌ | -| 3 | 0.556795 | `kusto_table_list` | ❌ | -<<<<<<< HEAD ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.553218 | `postgres_database_list` | ❌ | -| 5 | 0.549673 | `cosmos_database_list` | ❌ | +| 1 | 0.677160 | `kusto_database_list` | ✅ **EXPECTED** | +| 2 | 0.560715 | `kusto_cluster_list` | ❌ | +| 3 | 0.556662 | `kusto_table_list` | ❌ | +| 4 | 0.553239 | `postgres_database_list` | ❌ | +| 5 | 0.549605 | `cosmos_database_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 145 -======= -<<<<<<< HEAD -## Test 140 -======= -## Test 150 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 155 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_database_list` **Prompt:** Show me the databases in the Data Explorer cluster @@ -6397,42 +3117,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.623242 | `kusto_database_list` | ✅ **EXPECTED** | -| 2 | 0.509952 | 
`kusto_cluster_list` | ❌ | -| 3 | 0.507073 | `kusto_table_list` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.623528 | `kusto_database_list` | ✅ **EXPECTED** | -======= -| 1 | 0.623523 | `kusto_database_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.623592 | `kusto_database_list` | ✅ **EXPECTED** | | 2 | 0.509953 | `kusto_cluster_list` | ❌ | | 3 | 0.507073 | `kusto_table_list` | ❌ | -<<<<<<< HEAD ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.497144 | `cosmos_database_list` | ❌ | -| 5 | 0.491400 | `mysql_database_list` | ❌ | +| 5 | 0.491166 | `mysql_database_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 146 -======= -<<<<<<< HEAD -## Test 141 -======= -## Test 151 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 156 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_query` **Prompt:** Show me all items that contain the word in the Data Explorer table in cluster @@ -6441,46 +3134,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.423660 | `kusto_query` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.409485 | `postgres_database_query` | ❌ | -| 3 | 0.408178 | `kusto_table_schema` | ❌ | -| 4 | 0.407740 | `kusto_sample` | ❌ | -| 5 | 0.403989 | `kusto_cluster_list` | ❌ | - ---- - -## Test 147 -======= -| 2 | 0.409534 | `postgres_database_query` | ❌ | +| 2 | 0.409526 | `postgres_database_query` | ❌ | | 3 | 0.408178 | `kusto_table_schema` | ❌ | | 4 | 0.407741 | `kusto_sample` | ❌ | -<<<<<<< HEAD | 5 | 0.403990 | `kusto_cluster_list` | ❌ | --- -## Test 142 -======= -| 5 | 0.403800 | `kusto_cluster_list` | ❌ | - ---- - -## Test 152 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description 
evaluator) -======= -| 1 | 0.423694 | `kusto_query` | ✅ **EXPECTED** | -| 2 | 0.409649 | `postgres_database_query` | ❌ | -| 3 | 0.408162 | `kusto_table_schema` | ❌ | -| 4 | 0.407690 | `kusto_sample` | ❌ | -| 5 | 0.403967 | `kusto_cluster_list` | ❌ | - ---- - ## Test 157 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_sample` **Prompt:** Show me a data sample from the Data Explorer table in cluster @@ -6491,36 +3153,13 @@ |------|-------|------|--------| | 1 | 0.595554 | `kusto_sample` | ✅ **EXPECTED** | | 2 | 0.510233 | `kusto_table_schema` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.424212 | `kusto_table_list` | ❌ | -======= -<<<<<<< HEAD -| 3 | 0.424221 | `kusto_table_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.400924 | `kusto_cluster_list` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.424212 | `kusto_table_list` | ❌ | | 4 | 0.400924 | `kusto_cluster_list` | ❌ | | 5 | 0.399525 | `kusto_cluster_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 148 -======= -<<<<<<< HEAD -## Test 143 -======= -## Test 153 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 158 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_table_list` **Prompt:** List all tables in the Data Explorer database in cluster @@ -6529,47 +3168,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.679642 | `kusto_table_list` | ✅ **EXPECTED** | -======= -<<<<<<< HEAD -| 1 | 0.679655 | `kusto_table_list` | ✅ **EXPECTED** | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.585237 | `postgres_table_list` | ❌ | -| 3 | 0.580964 | `kusto_database_list` | ❌ | +| 3 | 0.580885 | `kusto_database_list` | ❌ | | 4 | 0.556724 | `mysql_table_list` | ❌ | -| 5 | 0.550005 | `monitor_table_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 149 
-======= -## Test 144 -======= -| 1 | 0.679642 | `kusto_table_list` | ✅ **EXPECTED** | -| 2 | 0.585237 | `postgres_table_list` | ❌ | -| 3 | 0.581015 | `kusto_database_list` | ❌ | -| 4 | 0.556724 | `mysql_table_list` | ❌ | -| 5 | 0.549762 | `monitor_table_list` | ❌ | - ---- - -## Test 154 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.679642 | `kusto_table_list` | ✅ **EXPECTED** | -| 2 | 0.585237 | `postgres_table_list` | ❌ | -| 3 | 0.581207 | `kusto_database_list` | ❌ | -| 4 | 0.556724 | `mysql_table_list` | ❌ | -| 5 | 0.550007 | `monitor_table_list` | ❌ | +| 5 | 0.549940 | `monitor_table_list` | ❌ | --- ## Test 159 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_table_list` **Prompt:** Show me the tables in the Data Explorer database in cluster @@ -6578,42 +3185,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.619252 | `kusto_table_list` | ✅ **EXPECTED** | -| 2 | 0.554332 | `kusto_table_schema` | ❌ | -| 3 | 0.527431 | `kusto_database_list` | ❌ | -| 4 | 0.524691 | `mysql_table_list` | ❌ | -======= -| 1 | 0.619269 | `kusto_table_list` | ✅ **EXPECTED** | -======= | 1 | 0.619252 | `kusto_table_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.554333 | `kusto_table_schema` | ❌ | -| 3 | 0.527626 | `kusto_database_list` | ❌ | +| 3 | 0.527314 | `kusto_database_list` | ❌ | | 4 | 0.524691 | `mysql_table_list` | ❌ | -<<<<<<< HEAD ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.523432 | `postgres_table_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 150 -======= -<<<<<<< HEAD -## Test 145 -======= -## Test 155 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool 
description evaluator) -======= ## Test 160 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `kusto_table_schema` **Prompt:** Show me the schema for table in the Data Explorer database in cluster @@ -6622,51 +3202,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.666980 | `kusto_table_schema` | ✅ **EXPECTED** | -| 2 | 0.564204 | `postgres_table_schema_get` | ❌ | -| 3 | 0.528301 | `mysql_table_schema_get` | ❌ | -| 4 | 0.490892 | `kusto_sample` | ❌ | -| 5 | 0.489745 | `kusto_table_list` | ❌ | - ---- - -## Test 151 -======= -<<<<<<< HEAD -| 1 | 0.667033 | `kusto_table_schema` | ✅ **EXPECTED** | -| 2 | 0.564282 | `postgres_table_schema_get` | ❌ | -| 3 | 0.527921 | `mysql_table_schema_get` | ❌ | -| 4 | 0.490939 | `kusto_sample` | ❌ | -| 5 | 0.489722 | `kusto_table_list` | ❌ | - ---- - -## Test 146 -======= -| 1 | 0.667095 | `kusto_table_schema` | ✅ **EXPECTED** | -| 2 | 0.564717 | `postgres_table_schema_get` | ❌ | -| 3 | 0.528210 | `mysql_table_schema_get` | ❌ | -| 4 | 0.490775 | `kusto_sample` | ❌ | -| 5 | 0.489814 | `kusto_table_list` | ❌ | - ---- - -## Test 156 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.666828 | `kusto_table_schema` | ✅ **EXPECTED** | -| 2 | 0.564124 | `postgres_table_schema_get` | ❌ | -| 3 | 0.527717 | `mysql_table_schema_get` | ❌ | -| 4 | 0.490739 | `kusto_sample` | ❌ | -| 5 | 0.489476 | `kusto_table_list` | ❌ | +| 1 | 0.666757 | `kusto_table_schema` | ✅ **EXPECTED** | +| 2 | 0.564118 | `postgres_table_schema_get` | ❌ | +| 3 | 0.527778 | `mysql_table_schema_get` | ❌ | +| 4 | 0.490798 | `kusto_sample` | ❌ | +| 5 | 0.489446 | `kusto_table_list` | ❌ | --- ## Test 161 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_database_list` **Prompt:** List all MySQL databases in server @@ -6675,46 +3219,15 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.633991 | `postgres_database_list` | ❌ | -| 2 | 0.623359 | `mysql_database_list` | ✅ **EXPECTED** | -| 3 | 0.534434 | `mysql_table_list` | ❌ | -| 4 | 0.498902 | `mysql_server_list` | ❌ | -| 5 | 0.490102 | `sql_db_list` | ❌ | - ---- - -## Test 152 -======= -<<<<<<< HEAD -| 1 | 0.633973 | `postgres_database_list` | ❌ | -| 2 | 0.623333 | `mysql_database_list` | ✅ **EXPECTED** | -| 3 | 0.534537 | `mysql_table_list` | ❌ | -| 4 | 0.498854 | `mysql_server_list` | ❌ | -| 5 | 0.490179 | `sql_db_list` | ❌ | - ---- - -## Test 147 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.634056 | `postgres_database_list` | ❌ | -| 2 | 0.623421 | `mysql_database_list` | ✅ **EXPECTED** | +| 2 | 0.623043 | `mysql_database_list` | ✅ **EXPECTED** | | 3 | 0.534457 | `mysql_table_list` | ❌ | -| 4 | 0.498918 | `mysql_server_list` | ❌ | +| 4 | 0.499107 | `mysql_server_list` | ❌ | | 5 | 0.490148 | `sql_db_list` | ❌ | --- -<<<<<<< HEAD -## Test 157 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 162 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_database_list` **Prompt:** Show me the MySQL databases in server @@ -6723,42 +3236,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.588121 | `mysql_database_list` | ✅ **EXPECTED** | +| 1 | 0.587863 | `mysql_database_list` | ✅ **EXPECTED** | | 2 | 0.574089 | `postgres_database_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.483855 | `mysql_table_list` | ❌ | -| 4 | 0.463244 | `mysql_server_list` | ❌ | -| 5 | 0.444547 | `sql_db_list` | ❌ | - ---- - -## Test 153 -======= -<<<<<<< HEAD -| 3 | 0.483938 | `mysql_table_list` | ❌ | -| 4 | 0.463238 | `mysql_server_list` | ❌ | -| 5 | 0.444622 | `sql_db_list` | ❌ | - ---- - -## Test 148 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.483855 | 
`mysql_table_list` | ❌ | -| 4 | 0.463244 | `mysql_server_list` | ❌ | +| 4 | 0.463405 | `mysql_server_list` | ❌ | | 5 | 0.444547 | `sql_db_list` | ❌ | --- -<<<<<<< HEAD -## Test 158 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 163 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_database_query` **Prompt:** Show me all items that contain the word in the MySQL database in server @@ -6767,47 +3253,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.476423 | `mysql_table_list` | ❌ | -| 2 | 0.455770 | `mysql_database_list` | ❌ | -| 3 | 0.432703 | `mysql_database_query` | ✅ **EXPECTED** | -| 4 | 0.419859 | `mysql_server_list` | ❌ | -| 5 | 0.409655 | `mysql_table_schema_get` | ❌ | +| 1 | 0.476256 | `mysql_table_list` | ❌ | +| 2 | 0.456053 | `mysql_database_list` | ❌ | +| 3 | 0.433203 | `mysql_database_query` | ✅ **EXPECTED** | +| 4 | 0.419831 | `mysql_server_list` | ❌ | +| 5 | 0.409321 | `mysql_table_schema_get` | ❌ | --- -## Test 154 -======= -<<<<<<< HEAD -| 1 | 0.476539 | `mysql_table_list` | ❌ | -| 2 | 0.455770 | `mysql_database_list` | ❌ | -| 3 | 0.433392 | `mysql_database_query` | ✅ **EXPECTED** | -| 4 | 0.419938 | `mysql_server_list` | ❌ | -======= -| 1 | 0.476423 | `mysql_table_list` | ❌ | -| 2 | 0.455770 | `mysql_database_list` | ❌ | -| 3 | 0.433202 | `mysql_database_query` | ✅ **EXPECTED** | -======= -| 1 | 0.476420 | `mysql_table_list` | ❌ | -| 2 | 0.455766 | `mysql_database_list` | ❌ | -| 3 | 0.433385 | `mysql_database_query` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.419859 | `mysql_server_list` | ❌ | -| 5 | 0.409450 | `mysql_table_schema_get` | ❌ | - ---- - -<<<<<<< HEAD -<<<<<<< HEAD -## Test 149 -======= -## Test 159 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) 
-======= ## Test 164 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_server_config_get` **Prompt:** Show me the configuration of MySQL server @@ -6816,27 +3270,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.531964 | `postgres_server_config_get` | ❌ | -| 2 | 0.517385 | `mysql_server_param_set` | ❌ | -| 3 | 0.489870 | `mysql_server_config_get` | ✅ **EXPECTED** | -| 4 | 0.476944 | `mysql_server_param_get` | ❌ | -| 5 | 0.426840 | `mysql_table_schema_get` | ❌ | +| 1 | 0.531887 | `postgres_server_config_get` | ❌ | +| 2 | 0.516894 | `mysql_server_param_set` | ❌ | +| 3 | 0.489816 | `mysql_server_config_get` | ✅ **EXPECTED** | +| 4 | 0.476863 | `mysql_server_param_get` | ❌ | +| 5 | 0.426507 | `mysql_table_schema_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 155 -======= -<<<<<<< HEAD -## Test 150 -======= -## Test 160 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 165 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_server_list` **Prompt:** List all MySQL servers in my subscription @@ -6845,42 +3287,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.678473 | `postgres_server_list` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.678536 | `postgres_server_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 2 | 0.558177 | `mysql_database_list` | ❌ | -| 3 | 0.554818 | `mysql_server_list` | ✅ **EXPECTED** | -| 4 | 0.513706 | `kusto_cluster_list` | ❌ | -| 5 | 0.501199 | `mysql_table_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 156 -======= -## Test 151 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.678472 | `postgres_server_list` | ❌ | -| 2 | 0.558177 | `mysql_database_list` | ❌ | -| 3 | 0.554817 | `mysql_server_list` | ✅ **EXPECTED** | +| 2 | 0.558115 | `mysql_database_list` | ❌ | +| 3 | 0.554998 | 
`mysql_server_list` | ✅ **EXPECTED** | | 4 | 0.513706 | `kusto_cluster_list` | ❌ | | 5 | 0.501199 | `mysql_table_list` | ❌ | --- -<<<<<<< HEAD -## Test 161 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 166 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_server_list` **Prompt:** Show me my MySQL servers @@ -6889,45 +3304,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.478518 | `mysql_database_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.474586 | `mysql_server_list` | ✅ **EXPECTED** | -| 3 | 0.435642 | `postgres_server_list` | ❌ | -| 4 | 0.412380 | `mysql_table_list` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.474630 | `mysql_server_list` | ✅ **EXPECTED** | -| 3 | 0.435692 | `postgres_server_list` | ❌ | -| 4 | 0.412417 | `mysql_table_list` | ❌ | -======= -| 2 | 0.474586 | `mysql_server_list` | ✅ **EXPECTED** | +| 1 | 0.478503 | `mysql_database_list` | ❌ | +| 2 | 0.474891 | `mysql_server_list` | ✅ **EXPECTED** | | 3 | 0.435642 | `postgres_server_list` | ❌ | | 4 | 0.412380 | `mysql_table_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 2 | 0.474586 | `mysql_server_list` | ✅ **EXPECTED** | -| 3 | 0.435642 | `postgres_server_list` | ❌ | -| 4 | 0.412380 | `mysql_table_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.389993 | `postgres_database_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 157 -======= -<<<<<<< HEAD -## Test 152 -======= -## Test 162 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 167 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_server_list` **Prompt:** Show me the MySQL servers in my subscription @@ -6936,41 +3321,15 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.636435 | `postgres_server_list` | ❌ | -| 2 | 0.534266 | `mysql_server_list` | ✅ **EXPECTED** | -======= -<<<<<<< HEAD -| 1 | 0.636471 | `postgres_server_list` | ❌ | -| 2 | 0.534277 | `mysql_server_list` | ✅ **EXPECTED** | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.530210 | `mysql_database_list` | ❌ | -| 4 | 0.475052 | `kusto_cluster_list` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.636435 | `postgres_server_list` | ❌ | -| 2 | 0.534266 | `mysql_server_list` | ✅ **EXPECTED** | -| 3 | 0.530210 | `mysql_database_list` | ❌ | +| 2 | 0.534464 | `mysql_server_list` | ✅ **EXPECTED** | +| 3 | 0.530312 | `mysql_database_list` | ❌ | | 4 | 0.475052 | `kusto_cluster_list` | ❌ | -| 5 | 0.470469 | `redis_list` | ❌ | +| 5 | 0.470468 | `redis_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 158 -======= -<<<<<<< HEAD -## Test 153 -======= -## Test 163 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 168 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_server_param_get` **Prompt:** Show me the value of connection timeout in seconds in my MySQL server @@ -6980,16 +3339,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.495071 | `mysql_server_param_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.437857 | `mysql_server_param_set` | ❌ | -| 3 | 0.333041 | `mysql_database_query` | ❌ | -| 4 | 0.313364 | `mysql_table_schema_get` | ❌ | -| 5 | 0.310856 | `postgres_server_param_get` | ❌ | - ---- - -## Test 159 -======= | 2 | 0.438075 | `mysql_server_param_set` | ❌ | | 3 | 0.333841 | `mysql_database_query` | ❌ | | 4 | 0.313150 | `mysql_table_schema_get` | ❌ | @@ -6997,16 +3346,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 154 -======= -## Test 164 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) 
->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 169 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_server_param_set` **Prompt:** Set connection timeout to 20 seconds for my MySQL server @@ -7015,40 +3355,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.449612 | `mysql_server_param_set` | ✅ **EXPECTED** | +| 1 | 0.449419 | `mysql_server_param_set` | ✅ **EXPECTED** | | 2 | 0.381144 | `mysql_server_param_get` | ❌ | | 3 | 0.303499 | `postgres_server_param_set` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.298661 | `mysql_database_query` | ❌ | -| 5 | 0.254180 | `mysql_server_list` | ❌ | - ---- - -## Test 160 -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.298911 | `mysql_database_query` | ❌ | -| 5 | 0.254180 | `mysql_server_list` | ❌ | +| 5 | 0.254159 | `mysql_server_list` | ❌ | --- -<<<<<<< HEAD -## Test 155 -======= -| 4 | 0.299246 | `mysql_database_query` | ❌ | -| 5 | 0.277569 | `appservice_database_add` | ❌ | - ---- - -## Test 165 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 170 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_table_list` **Prompt:** List all tables in the MySQL database in server @@ -7057,47 +3372,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.633542 | `mysql_table_list` | ✅ **EXPECTED** | -| 2 | 0.573851 | `postgres_table_list` | ❌ | -| 3 | 0.550878 | `postgres_database_list` | ❌ | -| 4 | 0.546988 | `mysql_database_list` | ❌ | -| 5 | 0.511879 | `kusto_table_list` | ❌ | - ---- - -## Test 161 -======= -<<<<<<< HEAD -| 1 | 0.633547 | `mysql_table_list` | ✅ **EXPECTED** | -======= | 1 | 0.633448 | `mysql_table_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.573844 | `postgres_table_list` | ❌ | | 3 | 0.550898 | 
`postgres_database_list` | ❌ | -| 4 | 0.546963 | `mysql_database_list` | ❌ | +| 4 | 0.546779 | `mysql_database_list` | ❌ | | 5 | 0.511847 | `kusto_table_list` | ❌ | --- -<<<<<<< HEAD -## Test 156 -======= -| 1 | 0.633542 | `mysql_table_list` | ✅ **EXPECTED** | -| 2 | 0.573851 | `postgres_table_list` | ❌ | -| 3 | 0.550878 | `postgres_database_list` | ❌ | -| 4 | 0.546987 | `mysql_database_list` | ❌ | -| 5 | 0.511879 | `kusto_table_list` | ❌ | - ---- - -## Test 166 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 171 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_table_list` **Prompt:** Show me the tables in the MySQL database in server @@ -7108,25 +3391,13 @@ |------|-------|------|--------| | 1 | 0.609131 | `mysql_table_list` | ✅ **EXPECTED** | | 2 | 0.526236 | `postgres_table_list` | ❌ | -| 3 | 0.525709 | `mysql_database_list` | ❌ | -| 4 | 0.507532 | `mysql_table_schema_get` | ❌ | +| 3 | 0.525627 | `mysql_database_list` | ❌ | +| 4 | 0.507258 | `mysql_table_schema_get` | ❌ | | 5 | 0.498050 | `postgres_database_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 162 -======= -<<<<<<< HEAD -## Test 157 -======= -## Test 167 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 172 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `mysql_table_schema_get` **Prompt:** Show me the schema of table
in the MySQL database in server @@ -7135,27 +3406,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.630824 | `mysql_table_schema_get` | ✅ **EXPECTED** | +| 1 | 0.630623 | `mysql_table_schema_get` | ✅ **EXPECTED** | | 2 | 0.558306 | `postgres_table_schema_get` | ❌ | | 3 | 0.545025 | `mysql_table_list` | ❌ | | 4 | 0.517419 | `kusto_table_schema` | ❌ | -| 5 | 0.457739 | `mysql_database_list` | ❌ | +| 5 | 0.457648 | `mysql_database_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 163 -======= -<<<<<<< HEAD -## Test 158 -======= -## Test 168 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 173 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_database_list` **Prompt:** List all PostgreSQL databases in server @@ -7164,25 +3423,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.815470 | `postgres_database_list` | ✅ **EXPECTED** | -| 2 | 0.643680 | `postgres_table_list` | ❌ | -| 3 | 0.622824 | `postgres_server_list` | ❌ | -| 4 | 0.542912 | `postgres_server_config_get` | ❌ | -| 5 | 0.490950 | `postgres_server_param_get` | ❌ | - ---- - -<<<<<<< HEAD -## Test 164 -======= -<<<<<<< HEAD -## Test 159 -======= -## Test 169 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.815617 | `postgres_database_list` | ✅ **EXPECTED** | | 2 | 0.644014 | `postgres_table_list` | ❌ | | 3 | 0.622790 | `postgres_server_list` | ❌ | @@ -7192,7 +3432,6 @@ --- ## Test 174 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_database_list` **Prompt:** Show me the PostgreSQL databases in server @@ -7202,30 +3441,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.760033 | `postgres_database_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.589784 | 
`postgres_server_list` | ❌ | -======= | 2 | 0.589783 | `postgres_server_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.585891 | `postgres_table_list` | ❌ | | 4 | 0.552660 | `postgres_server_config_get` | ❌ | -| 5 | 0.495685 | `postgres_server_param_get` | ❌ | +| 5 | 0.495629 | `postgres_server_param_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 165 -======= -<<<<<<< HEAD -## Test 160 -======= -## Test 170 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 175 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_database_query` **Prompt:** Show me all items that contain the word in the PostgreSQL database in server @@ -7234,48 +3457,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.546211 | `postgres_database_list` | ❌ | -<<<<<<< HEAD -| 2 | 0.523223 | `postgres_database_query` | ✅ **EXPECTED** | -| 3 | 0.503267 | `postgres_table_list` | ❌ | -| 4 | 0.466599 | `postgres_server_list` | ❌ | -| 5 | 0.403963 | `postgres_server_param_get` | ❌ | - ---- - -## Test 166 -======= -<<<<<<< HEAD -| 2 | 0.523142 | `postgres_database_query` | ✅ **EXPECTED** | -| 3 | 0.503267 | `postgres_table_list` | ❌ | -| 4 | 0.466608 | `postgres_server_list` | ❌ | -======= -| 2 | 0.523122 | `postgres_database_query` | ✅ **EXPECTED** | +| 2 | 0.523127 | `postgres_database_query` | ✅ **EXPECTED** | | 3 | 0.503267 | `postgres_table_list` | ❌ | | 4 | 0.466599 | `postgres_server_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.403969 | `postgres_server_param_get` | ❌ | --- -<<<<<<< HEAD -## Test 161 -======= -## Test 171 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.546505 | `postgres_database_list` | ❌ | -| 2 | 0.523181 | `postgres_database_query` | ✅ **EXPECTED** | -| 3 | 0.503458 
| `postgres_table_list` | ❌ | -| 4 | 0.466623 | `postgres_server_list` | ❌ | -| 5 | 0.404170 | `postgres_server_config_get` | ❌ | - ---- - ## Test 176 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_server_config_get` **Prompt:** Show me the configuration of PostgreSQL server @@ -7286,30 +3476,13 @@ |------|-------|------|--------| | 1 | 0.756593 | `postgres_server_config_get` | ✅ **EXPECTED** | | 2 | 0.615429 | `postgres_server_param_set` | ❌ | -<<<<<<< HEAD -| 3 | 0.599487 | `postgres_server_param_get` | ❌ | -| 4 | 0.535050 | `postgres_database_list` | ❌ | -======= | 3 | 0.599471 | `postgres_server_param_get` | ❌ | | 4 | 0.535049 | `postgres_database_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.518574 | `postgres_server_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 167 -======= -<<<<<<< HEAD -## Test 162 -======= -## Test 172 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 177 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_server_list` **Prompt:** List all PostgreSQL servers in my subscription @@ -7318,16 +3491,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.900023 | `postgres_server_list` | ✅ **EXPECTED** | -======= -<<<<<<< HEAD -| 1 | 0.900052 | `postgres_server_list` | ✅ **EXPECTED** | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.900023 | `postgres_server_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.640733 | `postgres_database_list` | ❌ | | 3 | 0.565914 | `postgres_table_list` | ❌ | | 4 | 0.538997 | `postgres_server_config_get` | ❌ | @@ -7335,26 +3499,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 168 -======= -## Test 163 -======= -| 1 | 0.900023 | `postgres_server_list` | ✅ **EXPECTED** | -| 2 | 0.640733 | `postgres_database_list` | ❌ | -| 3 | 0.565914 | 
`postgres_table_list` | ❌ | -| 4 | 0.538997 | `postgres_server_config_get` | ❌ | -| 5 | 0.534345 | `kusto_cluster_list` | ❌ | - ---- - -## Test 173 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 178 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_server_list` **Prompt:** Show me my PostgreSQL servers @@ -7365,25 +3510,13 @@ |------|-------|------|--------| | 1 | 0.674327 | `postgres_server_list` | ✅ **EXPECTED** | | 2 | 0.607062 | `postgres_database_list` | ❌ | -| 3 | 0.576348 | `postgres_server_config_get` | ❌ | -| 4 | 0.522995 | `postgres_table_list` | ❌ | -| 5 | 0.506254 | `postgres_server_param_get` | ❌ | +| 3 | 0.576349 | `postgres_server_config_get` | ❌ | +| 4 | 0.522996 | `postgres_table_list` | ❌ | +| 5 | 0.506171 | `postgres_server_param_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 169 -======= -<<<<<<< HEAD -## Test 164 -======= -## Test 174 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 179 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_server_list` **Prompt:** Show me the PostgreSQL servers in my subscription @@ -7396,23 +3529,11 @@ | 2 | 0.579232 | `postgres_database_list` | ❌ | | 3 | 0.531804 | `postgres_server_config_get` | ❌ | | 4 | 0.514445 | `postgres_table_list` | ❌ | -| 5 | 0.505978 | `postgres_server_param_get` | ❌ | +| 5 | 0.505869 | `postgres_server_param_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 170 -======= -<<<<<<< HEAD -## Test 165 -======= -## Test 175 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 180 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_server_param_get` **Prompt:** Show me if the parameter my PostgreSQL server has replication enabled @@ 
-7421,7 +3542,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.594770 | `postgres_server_param_get` | ✅ **EXPECTED** | +| 1 | 0.594753 | `postgres_server_param_get` | ✅ **EXPECTED** | | 2 | 0.552678 | `postgres_server_param_set` | ❌ | | 3 | 0.539671 | `postgres_server_config_get` | ❌ | | 4 | 0.489693 | `postgres_server_list` | ❌ | @@ -7429,19 +3550,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 171 -======= -<<<<<<< HEAD -## Test 166 -======= -## Test 176 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 181 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_server_param_set` **Prompt:** Enable replication for my PostgreSQL server @@ -7450,25 +3559,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.579909 | `postgres_server_param_set` | ✅ **EXPECTED** | -| 2 | 0.488496 | `postgres_server_config_get` | ❌ | -| 3 | 0.469810 | `postgres_server_list` | ❌ | -| 4 | 0.447051 | `postgres_server_param_get` | ❌ | -| 5 | 0.440716 | `postgres_database_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 172 -======= -<<<<<<< HEAD -## Test 167 -======= -## Test 177 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.579873 | `postgres_server_param_set` | ✅ **EXPECTED** | | 2 | 0.488474 | `postgres_server_config_get` | ❌ | | 3 | 0.469794 | `postgres_server_list` | ❌ | @@ -7478,7 +3568,6 @@ --- ## Test 182 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_table_list` **Prompt:** List all tables in the PostgreSQL database in server @@ -7487,25 +3576,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.789934 | `postgres_table_list` | ✅ **EXPECTED** | -| 2 | 0.750592 | `postgres_database_list` | ❌ | -| 3 | 0.574975 | `postgres_server_list` | ❌ | 
-| 4 | 0.519816 | `postgres_table_schema_get` | ❌ | -| 5 | 0.501361 | `postgres_server_config_get` | ❌ | - ---- - -<<<<<<< HEAD -## Test 173 -======= -<<<<<<< HEAD -## Test 168 -======= -## Test 178 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.789883 | `postgres_table_list` | ✅ **EXPECTED** | | 2 | 0.750580 | `postgres_database_list` | ❌ | | 3 | 0.574930 | `postgres_server_list` | ❌ | @@ -7515,7 +3585,6 @@ --- ## Test 183 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_table_list` **Prompt:** Show me the tables in the PostgreSQL database in server @@ -7532,19 +3601,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 174 -======= -<<<<<<< HEAD -## Test 169 -======= -## Test 179 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 184 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `postgres_table_schema_get` **Prompt:** Show me the schema of table
in the PostgreSQL database in server @@ -7553,35 +3610,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.714916 | `postgres_table_schema_get` | ✅ **EXPECTED** | -| 2 | 0.597892 | `postgres_table_list` | ❌ | -| 3 | 0.574251 | `postgres_database_list` | ❌ | -| 4 | 0.508090 | `postgres_server_config_get` | ❌ | -| 5 | 0.502593 | `kusto_table_schema` | ❌ | - ---- - -<<<<<<< HEAD -## Test 175 -======= -<<<<<<< HEAD -## Test 170 -======= -## Test 180 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.716024 | `postgres_table_schema_get` | ✅ **EXPECTED** | -| 2 | 0.599077 | `postgres_table_list` | ❌ | -| 3 | 0.574928 | `postgres_database_list` | ❌ | -| 4 | 0.508250 | `postgres_server_config_get` | ❌ | -| 5 | 0.502665 | `kusto_table_schema` | ❌ | +| 1 | 0.714596 | `postgres_table_schema_get` | ✅ **EXPECTED** | +| 2 | 0.597838 | `postgres_table_list` | ❌ | +| 3 | 0.574339 | `postgres_database_list` | ❌ | +| 4 | 0.507791 | `postgres_server_config_get` | ❌ | +| 5 | 0.502705 | `kusto_table_schema` | ❌ | --- ## Test 185 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `deploy_app_logs_get` **Prompt:** Show me the log of the application deployed by azd @@ -7590,38 +3627,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.711844 | `deploy_app_logs_get` | ✅ **EXPECTED** | +| 1 | 0.711770 | `deploy_app_logs_get` | ✅ **EXPECTED** | | 2 | 0.471692 | `deploy_plan_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.451639 | `monitor_activitylog_list` | ❌ | -| 4 | 0.404892 | `deploy_pipeline_guidance_get` | ❌ | -======= -<<<<<<< HEAD -| 3 | 0.451653 | `monitor_activitylog_list` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.451638 | `monitor_activitylog_list` | ❌ | | 4 | 0.404890 | `deploy_pipeline_guidance_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) 
| 5 | 0.401388 | `monitor_resource_log_query` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 176 -======= -<<<<<<< HEAD -## Test 171 -======= -## Test 181 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 186 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `deploy_architecture_diagram_generate` **Prompt:** Generate the azure architecture diagram for this application @@ -7630,35 +3644,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.680599 | `deploy_architecture_diagram_generate` | ✅ **EXPECTED** | -| 2 | 0.562485 | `deploy_plan_get` | ❌ | -| 3 | 0.497326 | `deploy_pipeline_guidance_get` | ❌ | -| 4 | 0.489325 | `cloudarchitect_design` | ❌ | -| 5 | 0.435899 | `deploy_iac_rules_get` | ❌ | - ---- - -<<<<<<< HEAD -## Test 177 -======= -<<<<<<< HEAD -## Test 172 -======= -## Test 182 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.680640 | `deploy_architecture_diagram_generate` | ✅ **EXPECTED** | | 2 | 0.562521 | `deploy_plan_get` | ❌ | | 3 | 0.497193 | `deploy_pipeline_guidance_get` | ❌ | -| 4 | 0.489344 | `cloudarchitect_design` | ❌ | +| 4 | 0.490040 | `cloudarchitect_design` | ❌ | | 5 | 0.435921 | `deploy_iac_rules_get` | ❌ | --- ## Test 187 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `deploy_iac_rules_get` **Prompt:** Show me the rules to generate bicep scripts @@ -7668,40 +3662,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.529092 | `deploy_iac_rules_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.480324 | `bicepschema_get` | ❌ | -======= | 2 | 0.479903 | `bicepschema_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.391965 | `get_bestpractices_get` | ❌ | -| 4 | 0.383210 | 
`azureterraformbestpractices_get` | ❌ | -| 5 | 0.375561 | `extension_cli_generate` | ❌ | - ---- - -<<<<<<< HEAD -## Test 178 -======= -## Test 173 -======= -| 3 | 0.394509 | `get_bestpractices_get` | ❌ | -======= | 3 | 0.391965 | `get_bestpractices_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.383210 | `azureterraformbestpractices_get` | ❌ | | 5 | 0.375561 | `extension_cli_generate` | ❌ | --- -<<<<<<< HEAD -## Test 183 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 188 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `deploy_pipeline_guidance_get` **Prompt:** How can I create a CI/CD pipeline to deploy this app to Azure? @@ -7710,33 +3678,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.638588 | `deploy_pipeline_guidance_get` | ✅ **EXPECTED** | +| 1 | 0.638841 | `deploy_pipeline_guidance_get` | ✅ **EXPECTED** | | 2 | 0.499242 | `deploy_plan_get` | ❌ | -<<<<<<< HEAD -| 3 | 0.448917 | `deploy_iac_rules_get` | ❌ | -| 4 | 0.385670 | `deploy_app_logs_get` | ❌ | -| 5 | 0.382240 | `get_bestpractices_get` | ❌ | - ---- - -## Test 179 -======= | 3 | 0.448918 | `deploy_iac_rules_get` | ❌ | | 4 | 0.385920 | `deploy_app_logs_get` | ❌ | | 5 | 0.382240 | `get_bestpractices_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 174 -======= -## Test 184 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 189 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `deploy_plan_get` **Prompt:** Create a plan to deploy this application to azure @@ -7746,30 +3696,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.688055 | `deploy_plan_get` | ✅ **EXPECTED** | -| 2 | 0.587963 | `deploy_pipeline_guidance_get` | ❌ | +| 2 | 0.587903 | `deploy_pipeline_guidance_get` | ❌ | | 3 | 0.499385 | 
`deploy_iac_rules_get` | ❌ | | 4 | 0.498575 | `deploy_architecture_diagram_generate` | ❌ | -<<<<<<< HEAD -| 5 | 0.448912 | `loadtesting_test_create` | ❌ | - ---- - -<<<<<<< HEAD -## Test 180 -======= -<<<<<<< HEAD -## Test 175 -======= -## Test 185 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 5 | 0.448692 | `loadtesting_test_create` | ❌ | --- ## Test 190 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_events_publish` **Prompt:** Publish an event to Event Grid topic using with the following data @@ -7778,47 +3712,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.755353 | `eventgrid_events_publish` | ✅ **EXPECTED** | -| 2 | 0.482544 | `eventgrid_subscription_list` | ❌ | -| 3 | 0.465759 | `eventgrid_topic_list` | ❌ | -| 4 | 0.360686 | `eventhubs_eventhub_update` | ❌ | -| 5 | 0.355213 | `servicebus_topic_details` | ❌ | - ---- - -## Test 181 -======= -<<<<<<< HEAD -| 1 | 0.755366 | `eventgrid_events_publish` | ✅ **EXPECTED** | -| 2 | 0.482575 | `eventgrid_subscription_list` | ❌ | -| 3 | 0.465432 | `eventgrid_topic_list` | ❌ | -| 4 | 0.360845 | `eventhubs_eventhub_update` | ❌ | -| 5 | 0.354313 | `servicebus_topic_details` | ❌ | +| 1 | 0.755140 | `eventgrid_events_publish` | ✅ **EXPECTED** | +| 2 | 0.482731 | `eventgrid_subscription_list` | ❌ | +| 3 | 0.465891 | `eventgrid_topic_list` | ❌ | +| 4 | 0.360374 | `eventhubs_eventhub_update` | ❌ | +| 5 | 0.355481 | `servicebus_topic_details` | ❌ | --- -## Test 176 -======= -| 1 | 0.755380 | `eventgrid_events_publish` | ✅ **EXPECTED** | -======= -| 1 | 0.755365 | `eventgrid_events_publish` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 2 | 0.483021 | `eventgrid_subscription_list` | ❌ | -| 3 | 0.466031 | `eventgrid_topic_list` | ❌ | -| 4 | 0.360676 | `eventhubs_eventhub_update` | ❌ | -| 5 | 0.355599 | `servicebus_topic_details` | ❌ | - 
---- - -<<<<<<< HEAD -## Test 186 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 191 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_events_publish` **Prompt:** Publish event to my Event Grid topic with the following events @@ -7827,38 +3729,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.654648 | `eventgrid_events_publish` | ✅ **EXPECTED** | -| 2 | 0.524134 | `eventgrid_subscription_list` | ❌ | -| 3 | 0.509777 | `eventgrid_topic_list` | ❌ | -| 4 | 0.373438 | `servicebus_topic_details` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.654647 | `eventgrid_events_publish` | ✅ **EXPECTED** | | 2 | 0.524503 | `eventgrid_subscription_list` | ❌ | | 3 | 0.510039 | `eventgrid_topic_list` | ❌ | | 4 | 0.373718 | `servicebus_topic_details` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.359908 | `eventhubs_eventhub_update` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 182 -======= -<<<<<<< HEAD -## Test 177 -======= -## Test 187 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 192 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_events_publish` **Prompt:** Send an event to Event Grid topic in resource group with @@ -7867,37 +3746,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.600274 | `eventgrid_events_publish` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.521041 | `eventgrid_topic_list` | ❌ | -| 3 | 0.504642 | `eventgrid_subscription_list` | ❌ | -| 4 | 0.411129 | `eventhubs_eventhub_consumergroup_update` | ❌ | -======= -| 2 | 0.521240 | `eventgrid_topic_list` | ❌ | -| 3 | 0.504808 | `eventgrid_subscription_list` | ❌ | -| 4 | 0.411130 | 
`eventhubs_eventhub_consumergroup_update` | ❌ | -<<<<<<< HEAD ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 5 | 0.389439 | `eventhubs_eventhub_consumergroup_get` | ❌ | +| 1 | 0.600271 | `eventgrid_events_publish` | ✅ **EXPECTED** | +| 2 | 0.521247 | `eventgrid_topic_list` | ❌ | +| 3 | 0.504794 | `eventgrid_subscription_list` | ❌ | +| 4 | 0.411140 | `eventhubs_eventhub_consumergroup_update` | ❌ | +| 5 | 0.389466 | `eventhubs_eventhub_consumergroup_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 183 -======= -<<<<<<< HEAD -## Test 178 -======= -## Test 188 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 193 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_topic_list` **Prompt:** List all Event Grid topics in my subscription @@ -7906,38 +3763,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.769921 | `eventgrid_topic_list` | ✅ **EXPECTED** | -| 2 | 0.745048 | `eventgrid_subscription_list` | ❌ | -| 3 | 0.561862 | `kusto_cluster_list` | ❌ | -| 4 | 0.543887 | `search_service_list` | ❌ | -======= | 1 | 0.770140 | `eventgrid_topic_list` | ✅ **EXPECTED** | | 2 | 0.745470 | `eventgrid_subscription_list` | ❌ | | 3 | 0.561862 | `kusto_cluster_list` | ❌ | | 4 | 0.545540 | `search_service_list` | ❌ | -<<<<<<< HEAD ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 5 | 0.526123 | `subscription_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 184 -======= -<<<<<<< HEAD -## Test 179 -======= -## Test 189 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 5 | 0.526138 | `subscription_list` | ❌ | --- ## Test 194 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** 
`eventgrid_topic_list` **Prompt:** Show me the Event Grid topics in my subscription @@ -7946,17 +3780,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.738040 | `eventgrid_topic_list` | ✅ **EXPECTED** | -| 2 | 0.736919 | `eventgrid_subscription_list` | ❌ | -| 3 | 0.492592 | `kusto_cluster_list` | ❌ | -| 4 | 0.480252 | `subscription_list` | ❌ | -| 5 | 0.473459 | `search_service_list` | ❌ | - ---- - -## Test 185 -======= | 1 | 0.738258 | `eventgrid_topic_list` | ✅ **EXPECTED** | | 2 | 0.737486 | `eventgrid_subscription_list` | ❌ | | 3 | 0.492592 | `kusto_cluster_list` | ❌ | @@ -7965,16 +3788,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 180 -======= -## Test 190 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 195 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_topic_list` **Prompt:** List all Event Grid topics in subscription @@ -7983,17 +3797,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.769840 | `eventgrid_topic_list` | ✅ **EXPECTED** | -| 2 | 0.720426 | `eventgrid_subscription_list` | ❌ | -| 3 | 0.535369 | `kusto_cluster_list` | ❌ | -| 4 | 0.513921 | `search_service_list` | ❌ | -| 5 | 0.495939 | `subscription_list` | ❌ | - ---- - -## Test 186 -======= | 1 | 0.770140 | `eventgrid_topic_list` | ✅ **EXPECTED** | | 2 | 0.721362 | `eventgrid_subscription_list` | ❌ | | 3 | 0.535326 | `kusto_cluster_list` | ❌ | @@ -8002,16 +3805,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 181 -======= -## Test 191 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 196 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_topic_list` **Prompt:** List all Event Grid topics in resource group in subscription @@ -8020,43 +3814,15 @@ | Rank | Score | Tool | Status 
| |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.758562 | `eventgrid_topic_list` | ✅ **EXPECTED** | -| 2 | 0.704062 | `eventgrid_subscription_list` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.758816 | `eventgrid_topic_list` | ✅ **EXPECTED** | -| 2 | 0.704462 | `eventgrid_subscription_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.609175 | `group_list` | ❌ | -| 4 | 0.544809 | `monitor_webtests_list` | ❌ | -| 5 | 0.524209 | `eventhubs_namespace_get` | ❌ | - ---- - -<<<<<<< HEAD -<<<<<<< HEAD -## Test 187 -======= -## Test 182 -======= -| 1 | 0.758786 | `eventgrid_topic_list` | ✅ **EXPECTED** | -| 2 | 0.704443 | `eventgrid_subscription_list` | ❌ | -| 3 | 0.609074 | `group_list` | ❌ | -| 4 | 0.536981 | `monitor_webtests_list` | ❌ | -| 5 | 0.524359 | `eventhubs_namespace_get` | ❌ | +| 1 | 0.758595 | `eventgrid_topic_list` | ✅ **EXPECTED** | +| 2 | 0.704232 | `eventgrid_subscription_list` | ❌ | +| 3 | 0.609085 | `group_list` | ❌ | +| 4 | 0.544465 | `monitor_webtests_list` | ❌ | +| 5 | 0.524019 | `eventhubs_namespace_get` | ❌ | --- -## Test 192 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 197 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** Show me all Event Grid subscriptions for topic @@ -8065,30 +3831,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.768696 | `eventgrid_subscription_list` | ✅ **EXPECTED** | -| 2 | 0.720373 | `eventgrid_topic_list` | ❌ | -| 3 | 0.498398 | `servicebus_topic_details` | ❌ | +| 1 | 0.769097 | `eventgrid_subscription_list` | ✅ **EXPECTED** | +| 2 | 0.720606 | `eventgrid_topic_list` | ❌ | +| 3 | 0.498615 | `servicebus_topic_details` | ❌ | | 4 | 0.486216 | `servicebus_topic_subscription_details` | ❌ | | 5 | 0.486162 | `eventgrid_events_publish` | ❌ | 
--- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 188 -======= -## Test 183 -======= -| 5 | 0.486132 | `eventgrid_events_publish` | ❌ | - ---- - -## Test 193 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 198 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** List Event Grid subscriptions for topic in subscription @@ -8097,38 +3848,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.717676 | `eventgrid_subscription_list` | ✅ **EXPECTED** | -| 2 | 0.709586 | `eventgrid_topic_list` | ❌ | +| 1 | 0.718109 | `eventgrid_subscription_list` | ✅ **EXPECTED** | +| 2 | 0.709805 | `eventgrid_topic_list` | ❌ | | 3 | 0.539977 | `servicebus_topic_subscription_details` | ❌ | -<<<<<<< HEAD -| 4 | 0.529084 | `servicebus_topic_details` | ❌ | -======= | 4 | 0.529286 | `servicebus_topic_details` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.477876 | `eventgrid_events_publish` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 189 -======= -## Test 184 -======= -| 5 | 0.477848 | `eventgrid_events_publish` | ❌ | - ---- - -## Test 194 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 199 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 199 **Expected Tool:** `eventgrid_subscription_list` **Prompt:** List Event Grid subscriptions for topic in resource group @@ -8137,46 +3865,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.746672 | `eventgrid_subscription_list` | ✅ **EXPECTED** | -| 2 | 0.745851 | `eventgrid_topic_list` | ❌ | -| 3 | 0.535463 | `monitor_webtests_list` | ❌ | -| 4 | 0.524802 | `group_list` | ❌ | -| 5 | 0.502884 | `servicebus_topic_details` 
| ❌ | - ---- - -## Test 190 -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.746815 | `eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.746174 | `eventgrid_topic_list` | ❌ | -| 3 | 0.535569 | `monitor_webtests_list` | ❌ | -| 4 | 0.524919 | `group_list` | ❌ | +| 3 | 0.535484 | `monitor_webtests_list` | ❌ | +| 4 | 0.524923 | `group_list` | ❌ | | 5 | 0.503158 | `servicebus_topic_details` | ❌ | --- -<<<<<<< HEAD -## Test 185 -======= -| 1 | 0.746335 | `eventgrid_subscription_list` | ✅ **EXPECTED** | -| 2 | 0.745666 | `eventgrid_topic_list` | ❌ | -| 3 | 0.528105 | `monitor_webtests_list` | ❌ | -| 4 | 0.524883 | `group_list` | ❌ | -| 5 | 0.502820 | `servicebus_topic_details` | ❌ | - ---- - -## Test 195 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 200 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** Show all Event Grid subscriptions in my subscription @@ -8185,17 +3882,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.736844 | `eventgrid_subscription_list` | ✅ **EXPECTED** | -| 2 | 0.659612 | `eventgrid_topic_list` | ❌ | -| 3 | 0.569255 | `subscription_list` | ❌ | -| 4 | 0.537922 | `kusto_cluster_list` | ❌ | -| 5 | 0.517276 | `search_service_list` | ❌ | - ---- - -## Test 191 -======= | 1 | 0.736436 | `eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.659727 | `eventgrid_topic_list` | ❌ | | 3 | 0.569254 | `subscription_list` | ❌ | @@ -8204,16 +3890,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 186 -======= -## Test 196 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 201 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** List all Event Grid subscriptions in subscription @@ 
-8222,46 +3899,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.684586 | `eventgrid_subscription_list` | ✅ **EXPECTED** | -| 2 | 0.656227 | `eventgrid_topic_list` | ❌ | -| 3 | 0.542362 | `subscription_list` | ❌ | -| 4 | 0.521053 | `kusto_cluster_list` | ❌ | -| 5 | 0.510115 | `group_list` | ❌ | - ---- - -## Test 192 -======= -<<<<<<< HEAD -| 1 | 0.684444 | `eventgrid_subscription_list` | ✅ **EXPECTED** | -| 2 | 0.656183 | `eventgrid_topic_list` | ❌ | -| 3 | 0.542320 | `subscription_list` | ❌ | -| 4 | 0.521015 | `kusto_cluster_list` | ❌ | -| 5 | 0.510024 | `group_list` | ❌ | - ---- - -## Test 187 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.684543 | `eventgrid_subscription_list` | ✅ **EXPECTED** | -| 2 | 0.656277 | `eventgrid_topic_list` | ❌ | -| 3 | 0.542388 | `subscription_list` | ❌ | -| 4 | 0.521053 | `kusto_cluster_list` | ❌ | -| 5 | 0.510115 | `group_list` | ❌ | +| 1 | 0.684522 | `eventgrid_subscription_list` | ✅ **EXPECTED** | +| 2 | 0.656271 | `eventgrid_topic_list` | ❌ | +| 3 | 0.542366 | `subscription_list` | ❌ | +| 4 | 0.521031 | `kusto_cluster_list` | ❌ | +| 5 | 0.510078 | `group_list` | ❌ | --- -<<<<<<< HEAD -## Test 197 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 202 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** Show Event Grid subscriptions in resource group in subscription @@ -8270,42 +3916,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.696332 | `eventgrid_subscription_list` | ✅ **EXPECTED** | -| 2 | 0.691623 | `eventgrid_topic_list` | ❌ | -| 3 | 0.557573 | `group_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.510684 | `monitor_webtests_list` | ❌ | -| 5 | 0.504984 | `resourcehealth_availability-status_list` | ❌ | - ---- - -## Test 193 -======= -<<<<<<< HEAD -| 4 | 0.510814 | 
`monitor_webtests_list` | ❌ | -| 5 | 0.505497 | `resourcehealth_availability-status_list` | ❌ | - ---- - -## Test 188 -======= -| 4 | 0.504984 | `resourcehealth_availability-status_list` | ❌ | -| 5 | 0.503099 | `monitor_webtests_list` | ❌ | - ---- - -## Test 198 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.510684 | `monitor_webtests_list` | ❌ | +| 1 | 0.696101 | `eventgrid_subscription_list` | ✅ **EXPECTED** | +| 2 | 0.691739 | `eventgrid_topic_list` | ❌ | +| 3 | 0.557598 | `group_list` | ❌ | +| 4 | 0.510586 | `monitor_webtests_list` | ❌ | | 5 | 0.504984 | `resourcehealth_availability-status_list` | ❌ | --- ## Test 203 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventgrid_subscription_list` **Prompt:** List Event Grid subscriptions for subscription in location @@ -8314,42 +3933,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.710457 | `eventgrid_subscription_list` | ✅ **EXPECTED** | -| 2 | 0.642001 | `eventgrid_topic_list` | ❌ | -| 3 | 0.506618 | `subscription_list` | ❌ | -<<<<<<< HEAD -| 4 | 0.476396 | `search_service_list` | ❌ | -======= -| 4 | 0.476763 | `search_service_list` | ❌ | -<<<<<<< HEAD ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.709801 | `eventgrid_subscription_list` | ✅ **EXPECTED** | | 2 | 0.642095 | `eventgrid_topic_list` | ❌ | | 3 | 0.506697 | `subscription_list` | ❌ | | 4 | 0.476763 | `search_service_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.475782 | `kusto_cluster_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 194 -======= -## Test 189 -======= -| 5 | 0.475718 | `kusto_cluster_list` | ❌ | - ---- - -## Test 199 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 204 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected 
Tool:** `eventhubs_eventhub_consumergroup_delete` **Prompt:** Delete my consumer group in my event hub , namespace , and resource group @@ -8358,51 +3950,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.766928 | `eventhubs_eventhub_consumergroup_delete` | ✅ **EXPECTED** | -| 2 | 0.675842 | `eventhubs_eventhub_consumergroup_update` | ❌ | -| 3 | 0.641112 | `eventhubs_eventhub_consumergroup_get` | ❌ | -| 4 | 0.633788 | `eventhubs_namespace_delete` | ❌ | -| 5 | 0.605465 | `eventhubs_eventhub_delete` | ❌ | - ---- - -## Test 195 -======= -<<<<<<< HEAD -| 1 | 0.766896 | `eventhubs_eventhub_consumergroup_delete` | ✅ **EXPECTED** | -| 2 | 0.675127 | `eventhubs_eventhub_consumergroup_update` | ❌ | +| 1 | 0.766923 | `eventhubs_eventhub_consumergroup_delete` | ✅ **EXPECTED** | +| 2 | 0.675846 | `eventhubs_eventhub_consumergroup_update` | ❌ | | 3 | 0.641111 | `eventhubs_eventhub_consumergroup_get` | ❌ | -| 4 | 0.633848 | `eventhubs_namespace_delete` | ❌ | -| 5 | 0.605802 | `eventhubs_eventhub_delete` | ❌ | - ---- - -## Test 190 -======= -| 1 | 0.767014 | `eventhubs_eventhub_consumergroup_delete` | ✅ **EXPECTED** | -| 2 | 0.675937 | `eventhubs_eventhub_consumergroup_update` | ❌ | -| 3 | 0.641200 | `eventhubs_eventhub_consumergroup_get` | ❌ | -| 4 | 0.631867 | `eventhubs_namespace_delete` | ❌ | -| 5 | 0.605622 | `eventhubs_eventhub_delete` | ❌ | - ---- - -## Test 200 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.766871 | `eventhubs_eventhub_consumergroup_delete` | ✅ **EXPECTED** | -| 2 | 0.675824 | `eventhubs_eventhub_consumergroup_update` | ❌ | -| 3 | 0.641096 | `eventhubs_eventhub_consumergroup_get` | ❌ | -| 4 | 0.633729 | `eventhubs_namespace_delete` | ❌ | -| 5 | 0.605488 | `eventhubs_eventhub_delete` | ❌ | +| 4 | 0.633787 | `eventhubs_namespace_delete` | ❌ | +| 5 | 0.605477 | `eventhubs_eventhub_delete` | ❌ | --- 
## Test 205 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_consumergroup_get` **Prompt:** List all consumer groups in my event hub in namespace @@ -8412,41 +3968,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.738475 | `eventhubs_eventhub_consumergroup_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.634517 | `eventhubs_eventhub_consumergroup_update` | ❌ | -| 3 | 0.626486 | `eventhubs_eventhub_consumergroup_delete` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.634345 | `eventhubs_eventhub_consumergroup_update` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.634517 | `eventhubs_eventhub_consumergroup_update` | ❌ | | 3 | 0.626485 | `eventhubs_eventhub_consumergroup_delete` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.606619 | `eventhubs_namespace_get` | ❌ | -<<<<<<< HEAD | 5 | 0.593098 | `eventhubs_eventhub_get` | ❌ | --- -<<<<<<< HEAD -## Test 196 -======= -<<<<<<< HEAD -## Test 191 -======= -## Test 201 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 5 | 0.593085 | `eventhubs_eventhub_get` | ❌ | - ---- - ## Test 206 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_consumergroup_get` **Prompt:** Get the details of my consumer group in my event hub , namespace , and resource group @@ -8456,41 +3985,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.712861 | `eventhubs_eventhub_consumergroup_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.637170 | `eventhubs_eventhub_consumergroup_update` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.637418 | `eventhubs_eventhub_consumergroup_update` | ❌ | -======= | 2 | 0.637170 | `eventhubs_eventhub_consumergroup_update` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description 
evaluator) | 3 | 0.625913 | `eventhubs_eventhub_consumergroup_delete` | ❌ | | 4 | 0.576800 | `eventhubs_namespace_get` | ❌ | | 5 | 0.529940 | `eventhubs_eventhub_get` | ❌ | --- -<<<<<<< HEAD -## Test 197 -======= -<<<<<<< HEAD -## Test 192 -======= -## Test 202 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 2 | 0.637170 | `eventhubs_eventhub_consumergroup_update` | ❌ | -| 3 | 0.625913 | `eventhubs_eventhub_consumergroup_delete` | ❌ | -| 4 | 0.576800 | `eventhubs_namespace_get` | ❌ | -| 5 | 0.529926 | `eventhubs_eventhub_get` | ❌ | - ---- - ## Test 207 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_consumergroup_update` **Prompt:** Create a new consumer group in my event hub , namespace , and resource group @@ -8499,23 +4001,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.756873 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | -| 2 | 0.688248 | `eventhubs_eventhub_consumergroup_get` | ❌ | -| 3 | 0.669384 | `eventhubs_eventhub_consumergroup_delete` | ❌ | -| 4 | 0.553692 | `eventhubs_eventhub_update` | ❌ | -| 5 | 0.544512 | `eventhubs_namespace_get` | ❌ | - ---- - -## Test 198 -======= -<<<<<<< HEAD -| 1 | 0.757520 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.757614 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | | 2 | 0.688923 | `eventhubs_eventhub_consumergroup_get` | ❌ | | 3 | 0.670026 | `eventhubs_eventhub_consumergroup_delete` | ❌ | @@ -8524,16 +4009,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 193 -======= -## Test 203 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 208 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** 
`eventhubs_eventhub_consumergroup_update` **Prompt:** Update my consumer group in my event hub , namespace , and resource group @@ -8542,31 +4018,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.739158 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | -| 2 | 0.655927 | `eventhubs_eventhub_consumergroup_delete` | ❌ | -| 3 | 0.642524 | `eventhubs_eventhub_consumergroup_get` | ❌ | -| 4 | 0.552602 | `eventhubs_eventhub_update` | ❌ | -| 5 | 0.524106 | `eventhubs_namespace_delete` | ❌ | - ---- - -## Test 199 -======= -<<<<<<< HEAD -| 1 | 0.739615 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | -| 2 | 0.655951 | `eventhubs_eventhub_consumergroup_delete` | ❌ | -| 3 | 0.642701 | `eventhubs_eventhub_consumergroup_get` | ❌ | -| 4 | 0.552830 | `eventhubs_eventhub_update` | ❌ | -| 5 | 0.524428 | `eventhubs_namespace_delete` | ❌ | - ---- - -## Test 194 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.738818 | `eventhubs_eventhub_consumergroup_update` | ✅ **EXPECTED** | | 2 | 0.655614 | `eventhubs_eventhub_consumergroup_delete` | ❌ | | 3 | 0.642219 | `eventhubs_eventhub_consumergroup_get` | ❌ | @@ -8575,13 +4026,7 @@ --- -<<<<<<< HEAD -## Test 204 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 209 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_delete` **Prompt:** Delete my event hub in my namespace and resource group @@ -8590,51 +4035,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.699266 | `eventhubs_namespace_delete` | ❌ | -| 2 | 0.688646 | `eventhubs_eventhub_delete` | ✅ **EXPECTED** | -| 3 | 0.627721 | `eventhubs_eventhub_consumergroup_delete` | ❌ | -| 4 | 0.578653 | `eventhubs_namespace_get` | ❌ | -| 5 | 0.552963 | `eventhubs_eventhub_get` | ❌ | - ---- - -## Test 200 -======= 
-<<<<<<< HEAD -| 1 | 0.699621 | `eventhubs_namespace_delete` | ❌ | -| 2 | 0.689171 | `eventhubs_eventhub_delete` | ✅ **EXPECTED** | -| 3 | 0.627887 | `eventhubs_eventhub_consumergroup_delete` | ❌ | -| 4 | 0.579273 | `eventhubs_namespace_get` | ❌ | -| 5 | 0.553715 | `eventhubs_eventhub_get` | ❌ | - ---- - -## Test 195 -======= -| 1 | 0.697894 | `eventhubs_namespace_delete` | ❌ | -| 2 | 0.688471 | `eventhubs_eventhub_delete` | ✅ **EXPECTED** | -| 3 | 0.627661 | `eventhubs_eventhub_consumergroup_delete` | ❌ | -| 4 | 0.578662 | `eventhubs_namespace_get` | ❌ | -| 5 | 0.552931 | `eventhubs_eventhub_get` | ❌ | - ---- - -## Test 205 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.699213 | `eventhubs_namespace_delete` | ❌ | -| 2 | 0.688502 | `eventhubs_eventhub_delete` | ✅ **EXPECTED** | -| 3 | 0.627718 | `eventhubs_eventhub_consumergroup_delete` | ❌ | -| 4 | 0.578687 | `eventhubs_namespace_get` | ❌ | -| 5 | 0.552908 | `eventhubs_eventhub_get` | ❌ | +| 1 | 0.699271 | `eventhubs_namespace_delete` | ❌ | +| 2 | 0.688649 | `eventhubs_eventhub_delete` | ✅ **EXPECTED** | +| 3 | 0.627530 | `eventhubs_eventhub_consumergroup_delete` | ❌ | +| 4 | 0.578627 | `eventhubs_namespace_get` | ❌ | +| 5 | 0.553129 | `eventhubs_eventhub_get` | ❌ | --- ## Test 210 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_get` **Prompt:** List all Event Hubs in my namespace @@ -8643,22 +4052,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.773277 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | -| 2 | 0.687596 | `eventhubs_namespace_get` | ❌ | -| 3 | 0.578709 | `eventhubs_eventhub_update` | ❌ | -| 4 | 0.561587 | `eventhubs_namespace_delete` | ❌ | -| 5 | 0.545481 | `eventhubs_eventhub_consumergroup_get` | ❌ | - ---- - -## Test 201 -======= -| 1 | 0.773231 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | -======= -| 1 | 
0.773218 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.773242 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | | 2 | 0.687582 | `eventhubs_namespace_get` | ❌ | | 3 | 0.578689 | `eventhubs_eventhub_update` | ❌ | | 4 | 0.561545 | `eventhubs_namespace_delete` | ❌ | @@ -8666,16 +4060,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 196 -======= -## Test 206 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 211 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_get` **Prompt:** Get the details of my event hub in my namespace and resource group @@ -8684,43 +4069,8 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.638112 | `eventhubs_namespace_get` | ❌ | -| 2 | 0.627528 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | -| 3 | 0.570964 | `eventhubs_eventhub_consumergroup_get` | ❌ | -| 4 | 0.527503 | `eventhubs_eventhub_update` | ❌ | -| 5 | 0.521930 | `eventhubs_namespace_delete` | ❌ | - ---- - -## Test 202 -======= -<<<<<<< HEAD -| 1 | 0.638030 | `eventhubs_namespace_get` | ❌ | -| 2 | 0.627606 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | -| 3 | 0.570898 | `eventhubs_eventhub_consumergroup_get` | ❌ | -| 4 | 0.527564 | `eventhubs_eventhub_update` | ❌ | -| 5 | 0.521837 | `eventhubs_namespace_delete` | ❌ | - ---- - -## Test 197 -======= -| 1 | 0.638173 | `eventhubs_namespace_get` | ❌ | -| 2 | 0.627712 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | -| 3 | 0.571001 | `eventhubs_eventhub_consumergroup_get` | ❌ | -| 4 | 0.527639 | `eventhubs_eventhub_update` | ❌ | -| 5 | 0.521101 | `eventhubs_namespace_delete` | ❌ | - ---- - -## Test 207 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.638083 | `eventhubs_namespace_get` | ❌ | -| 2 | 0.627619 | 
`eventhubs_eventhub_get` | ✅ **EXPECTED** | +| 2 | 0.627638 | `eventhubs_eventhub_get` | ✅ **EXPECTED** | | 3 | 0.570904 | `eventhubs_eventhub_consumergroup_get` | ❌ | | 4 | 0.527646 | `eventhubs_eventhub_update` | ❌ | | 5 | 0.521920 | `eventhubs_namespace_delete` | ❌ | @@ -8728,7 +4078,6 @@ --- ## Test 212 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_update` **Prompt:** Create a new event hub in my namespace and resource group @@ -8737,49 +4086,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.645976 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | -| 2 | 0.605856 | `eventhubs_namespace_get` | ❌ | -| 3 | 0.574389 | `eventhubs_eventhub_get` | ❌ | -| 4 | 0.571676 | `eventhubs_eventhub_consumergroup_update` | ❌ | -| 5 | 0.557550 | `eventhubs_namespace_delete` | ❌ | - ---- - -## Test 203 -======= -<<<<<<< HEAD -| 1 | 0.645723 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | -| 2 | 0.605716 | `eventhubs_namespace_get` | ❌ | -| 3 | 0.574303 | `eventhubs_eventhub_get` | ❌ | -| 4 | 0.571748 | `eventhubs_eventhub_consumergroup_update` | ❌ | -| 5 | 0.557530 | `eventhubs_namespace_delete` | ❌ | - ---- - -## Test 198 -======= -| 1 | 0.645976 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | -| 2 | 0.605856 | `eventhubs_namespace_get` | ❌ | -| 3 | 0.574389 | `eventhubs_eventhub_get` | ❌ | -======= -| 1 | 0.646114 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | -| 2 | 0.605940 | `eventhubs_namespace_get` | ❌ | -| 3 | 0.574547 | `eventhubs_eventhub_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.571676 | `eventhubs_eventhub_consumergroup_update` | ❌ | -| 5 | 0.557693 | `eventhubs_namespace_delete` | ❌ | +| 1 | 0.646034 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | +| 2 | 0.605911 | `eventhubs_namespace_get` | ❌ | +| 3 | 0.574464 | `eventhubs_eventhub_get` | ❌ | +| 4 | 0.571638 | `eventhubs_eventhub_consumergroup_update` | ❌ | +| 5 | 0.557592 | 
`eventhubs_namespace_delete` | ❌ | --- -<<<<<<< HEAD -## Test 208 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 213 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_eventhub_update` **Prompt:** Update my event hub in my namespace and resource group @@ -8788,41 +4103,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.655283 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | -| 2 | 0.571661 | `eventhubs_eventhub_delete` | ❌ | -| 3 | 0.568605 | `eventhubs_eventhub_consumergroup_update` | ❌ | -| 4 | 0.568396 | `eventhubs_namespace_get` | ❌ | -| 5 | 0.565977 | `eventhubs_namespace_delete` | ❌ | - ---- - -## Test 204 -======= -<<<<<<< HEAD -| 1 | 0.655261 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | -| 2 | 0.571762 | `eventhubs_eventhub_delete` | ❌ | -| 3 | 0.569417 | `eventhubs_eventhub_consumergroup_update` | ❌ | -| 4 | 0.568279 | `eventhubs_namespace_get` | ❌ | -| 5 | 0.565852 | `eventhubs_namespace_delete` | ❌ | - ---- - -## Test 199 -======= -| 1 | 0.655104 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | -| 2 | 0.571580 | `eventhubs_eventhub_delete` | ❌ | -| 3 | 0.568796 | `eventhubs_eventhub_consumergroup_update` | ❌ | -| 4 | 0.568526 | `eventhubs_namespace_get` | ❌ | -| 5 | 0.564849 | `eventhubs_namespace_delete` | ❌ | - ---- - -## Test 209 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.655283 | `eventhubs_eventhub_update` | ✅ **EXPECTED** | | 2 | 0.571661 | `eventhubs_eventhub_delete` | ❌ | | 3 | 0.568606 | `eventhubs_eventhub_consumergroup_update` | ❌ | @@ -8832,7 +4112,6 @@ --- ## Test 214 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_namespace_delete` **Prompt:** Delete my namespace in my resource group @@ -8841,36 +4120,15 @@ | Rank | Score | Tool | 
Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.623995 | `eventhubs_namespace_delete` | ✅ **EXPECTED** | -| 2 | 0.525810 | `eventhubs_namespace_update` | ❌ | -======= -| 1 | 0.626113 | `eventhubs_namespace_delete` | ✅ **EXPECTED** | -======= | 1 | 0.623995 | `eventhubs_namespace_delete` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 2 | 0.525446 | `eventhubs_namespace_update` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) +| 2 | 0.525823 | `eventhubs_namespace_update` | ❌ | | 3 | 0.505082 | `eventhubs_eventhub_consumergroup_delete` | ❌ | | 4 | 0.449841 | `eventhubs_namespace_get` | ❌ | | 5 | 0.435037 | `workbooks_delete` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 205 -======= -<<<<<<< HEAD -## Test 200 -======= -## Test 210 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 215 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_namespace_get` **Prompt:** List all Event Hubs namespaces in my subscription @@ -8879,38 +4137,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.659800 | `eventhubs_eventhub_get` | ❌ | +| 1 | 0.659838 | `eventhubs_eventhub_get` | ❌ | | 2 | 0.658827 | `eventhubs_namespace_get` | ✅ **EXPECTED** | | 3 | 0.607372 | `kusto_cluster_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.557150 | `eventgrid_topic_list` | ❌ | -| 5 | 0.556016 | `eventgrid_subscription_list` | ❌ | - ---- - -## Test 206 -======= -======= -| 3 | 0.607365 | `kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.557200 | `eventgrid_topic_list` | ❌ | | 5 | 0.556126 | `eventgrid_subscription_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 201 -======= -## Test 211 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update 
prompts and tool description evaluator) -======= ## Test 216 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_namespace_get` **Prompt:** Get the details of my namespace in my resource group @@ -8919,46 +4154,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.509749 | `eventhubs_namespace_get` | ✅ **EXPECTED** | -| 2 | 0.509432 | `monitor_webtests_get` | ❌ | -| 3 | 0.497399 | `servicebus_queue_details` | ❌ | -| 4 | 0.490015 | `eventhubs_namespace_update` | ❌ | -| 5 | 0.470455 | `functionapp_get` | ❌ | - ---- - -## Test 207 -======= -<<<<<<< HEAD -| 1 | 0.510078 | `monitor_webtests_get` | ❌ | -| 2 | 0.509993 | `eventhubs_namespace_get` | ✅ **EXPECTED** | -| 3 | 0.497527 | `servicebus_queue_details` | ❌ | -| 4 | 0.490095 | `eventhubs_namespace_update` | ❌ | -| 5 | 0.470636 | `functionapp_get` | ❌ | - ---- - -## Test 202 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.509749 | `eventhubs_namespace_get` | ✅ **EXPECTED** | | 2 | 0.509431 | `monitor_webtests_get` | ❌ | | 3 | 0.497399 | `servicebus_queue_details` | ❌ | -| 4 | 0.490055 | `eventhubs_namespace_update` | ❌ | +| 4 | 0.489992 | `eventhubs_namespace_update` | ❌ | | 5 | 0.470455 | `functionapp_get` | ❌ | --- -<<<<<<< HEAD -## Test 212 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 217 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_namespace_update` **Prompt:** Create an new namespace in my resource group @@ -8967,27 +4171,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.610313 | `eventhubs_namespace_update` | ✅ **EXPECTED** | +| 1 | 0.610205 | `eventhubs_namespace_update` | ✅ **EXPECTED** | | 2 | 0.466721 | `eventhubs_namespace_get` | ❌ | | 3 | 0.458458 | `eventhubs_namespace_delete` | ❌ | -| 4 | 0.449562 | `workbooks_create` | ❌ | +| 4 | 0.449724 | 
`workbooks_create` | ❌ | | 5 | 0.438492 | `eventhubs_eventhub_consumergroup_update` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 208 -======= -<<<<<<< HEAD -## Test 203 -======= -## Test 213 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 218 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `eventhubs_namespace_update` **Prompt:** Update my namespace in my resource group @@ -8996,18 +4188,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.622219 | `eventhubs_namespace_update` | ✅ **EXPECTED** | -| 2 | 0.474098 | `eventhubs_namespace_delete` | ❌ | -| 3 | 0.448723 | `eventhubs_namespace_get` | ❌ | -| 4 | 0.436549 | `eventhubs_eventhub_consumergroup_update` | ❌ | -| 5 | 0.372490 | `sql_db_rename` | ❌ | - ---- - -## Test 209 -======= -| 1 | 0.622338 | `eventhubs_namespace_update` | ✅ **EXPECTED** | +| 1 | 0.622140 | `eventhubs_namespace_update` | ✅ **EXPECTED** | | 2 | 0.474099 | `eventhubs_namespace_delete` | ❌ | | 3 | 0.448723 | `eventhubs_namespace_get` | ❌ | | 4 | 0.436549 | `eventhubs_eventhub_consumergroup_update` | ❌ | @@ -9015,16 +4196,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 204 -======= -## Test 214 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 219 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Describe the function app in resource group @@ -9034,39 +4206,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.660116 | `functionapp_get` | ✅ **EXPECTED** | -| 2 | 0.451226 | `deploy_app_logs_get` | ❌ | +| 2 | 0.451613 | `deploy_app_logs_get` | ❌ | | 3 | 0.450457 | `applens_resource_diagnose` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.390048 | `mysql_server_list` | ❌ | -======= -<<<<<<< HEAD -| 4 | 0.390107 | `mysql_server_list` | ❌ | 
->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.390048 | `mysql_server_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 4 | 0.390229 | `mysql_server_list` | ❌ | | 5 | 0.380314 | `get_bestpractices_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 210 -======= -## Test 205 -======= -| 4 | 0.390048 | `mysql_server_list` | ❌ | -| 5 | 0.380262 | `get_bestpractices_get` | ❌ | - ---- - -## Test 215 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 220 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Get configuration for function app @@ -9077,25 +4224,13 @@ |------|-------|------|--------| | 1 | 0.607276 | `functionapp_get` | ✅ **EXPECTED** | | 2 | 0.447400 | `mysql_server_config_get` | ❌ | -| 3 | 0.424765 | `appconfig_account_list` | ❌ | +| 3 | 0.424693 | `appconfig_account_list` | ❌ | | 4 | 0.411267 | `appconfig_kv_get` | ❌ | -| 5 | 0.400002 | `deploy_app_logs_get` | ❌ | +| 5 | 0.400402 | `deploy_app_logs_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 211 -======= -<<<<<<< HEAD -## Test 206 -======= -## Test 216 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 221 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Get function app status for @@ -9105,20 +4240,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.622384 | `functionapp_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.413523 | `resourcehealth_availability-status_get` | ❌ | -| 3 | 0.390708 | `resourcehealth_availability-status_list` | ❌ | -| 4 | 0.383293 | `deploy_app_logs_get` | ❌ | -| 5 | 0.360665 | `storage_account_get` | ❌ | - ---- - -## Test 212 -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.413481 | 
`resourcehealth_availability-status_get` | ❌ | | 3 | 0.390708 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.383533 | `deploy_app_logs_get` | ❌ | @@ -9126,13 +4247,7 @@ --- -<<<<<<< HEAD -## Test 217 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 222 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Get information about my function app in @@ -9143,38 +4258,13 @@ |------|-------|------|--------| | 1 | 0.690933 | `functionapp_get` | ✅ **EXPECTED** | | 2 | 0.441937 | `foundry_resource_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.432317 | `resourcehealth_availability-status_list` | ❌ | -======= -<<<<<<< HEAD -| 3 | 0.432458 | `resourcehealth_availability-status_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.431821 | `applens_resource_diagnose` | ❌ | -| 5 | 0.429077 | `storage_account_get` | ❌ | - ---- - -<<<<<<< HEAD -## Test 213 -======= -## Test 208 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.432317 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.431821 | `applens_resource_diagnose` | ❌ | | 5 | 0.429077 | `storage_account_get` | ❌ | --- -<<<<<<< HEAD -## Test 218 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 223 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Retrieve host name and status of function app @@ -9184,44 +4274,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.592791 | `functionapp_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.417779 | `resourcehealth_availability-status_get` | ❌ | -| 3 | 0.409487 | `deploy_app_logs_get` | ❌ | -| 4 | 0.399953 | `storage_account_get` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.417817 | 
`resourcehealth_availability-status_get` | ❌ | -| 3 | 0.409712 | `deploy_app_logs_get` | ❌ | -| 4 | 0.399896 | `storage_account_get` | ❌ | -======= -| 2 | 0.417634 | `resourcehealth_availability-status_get` | ❌ | -| 3 | 0.409712 | `deploy_app_logs_get` | ❌ | -| 4 | 0.400049 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 2 | 0.417817 | `resourcehealth_availability-status_get` | ❌ | | 3 | 0.409712 | `deploy_app_logs_get` | ❌ | | 4 | 0.399953 | `storage_account_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.392237 | `applens_resource_diagnose` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 214 -======= -<<<<<<< HEAD -## Test 209 -======= -## Test 219 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 224 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Show function app details for in @@ -9231,37 +4291,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.687356 | `functionapp_get` | ✅ **EXPECTED** | -| 2 | 0.449033 | `deploy_app_logs_get` | ❌ | +| 2 | 0.449588 | `deploy_app_logs_get` | ❌ | | 3 | 0.428689 | `applens_resource_diagnose` | ❌ | | 4 | 0.424686 | `foundry_resource_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 5 | 0.391781 | `monitor_webtests_get` | ❌ | - ---- - -## Test 215 -======= -<<<<<<< HEAD -| 5 | 0.392451 | `monitor_webtests_get` | ❌ | - ---- - -## Test 210 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.391781 | `monitor_webtests_get` | ❌ | --- -<<<<<<< HEAD -## Test 220 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 225 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Show me the details for 
the function app @@ -9271,32 +4308,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.644882 | `functionapp_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.429692 | `deploy_app_logs_get` | ❌ | -| 3 | 0.421082 | `storage_account_get` | ❌ | -| 4 | 0.403261 | `signalr_runtime_get` | ❌ | -======= | 2 | 0.430189 | `deploy_app_logs_get` | ❌ | | 3 | 0.421082 | `storage_account_get` | ❌ | | 4 | 0.403311 | `signalr_runtime_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.391615 | `foundry_resource_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 216 -======= -<<<<<<< HEAD -## Test 211 -======= -## Test 221 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 226 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Show plan and region for function app @@ -9306,37 +4325,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.554980 | `functionapp_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.426921 | `quota_usage_check` | ❌ | -| 3 | 0.424062 | `deploy_app_logs_get` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.426976 | `quota_usage_check` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.426703 | `quota_usage_check` | ❌ | | 3 | 0.424610 | `deploy_app_logs_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.408011 | `deploy_plan_get` | ❌ | | 5 | 0.381629 | `deploy_architecture_diagram_generate` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 217 -======= -<<<<<<< HEAD -## Test 212 -======= -## Test 222 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 227 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** What is the status of function app ? 
@@ -9346,40 +4342,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.565797 | `functionapp_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.403246 | `deploy_app_logs_get` | ❌ | -| 3 | 0.384159 | `resourcehealth_availability-status_list` | ❌ | -| 4 | 0.369868 | `applens_resource_diagnose` | ❌ | -<<<<<<< HEAD -| 5 | 0.354912 | `resourcehealth_availability-status_get` | ❌ | - ---- - -## Test 218 -======= -<<<<<<< HEAD -======= | 2 | 0.403665 | `deploy_app_logs_get` | ❌ | | 3 | 0.384159 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.369868 | `applens_resource_diagnose` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.355044 | `resourcehealth_availability-status_get` | ❌ | --- -<<<<<<< HEAD -## Test 213 -======= -| 5 | 0.352966 | `resourcehealth_availability-status_get` | ❌ | - ---- - -## Test 223 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 228 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** List all function apps in my subscription @@ -9389,41 +4359,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.646561 | `functionapp_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.557549 | `search_service_list` | ❌ | -| 3 | 0.534936 | `subscription_list` | ❌ | -======= | 2 | 0.559382 | `search_service_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.534935 | `subscription_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 3 | 0.534930 | `subscription_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.529031 | `kusto_cluster_list` | ❌ | | 5 | 0.516618 | `cosmos_account_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 219 -======= -## Test 214 -======= -| 3 | 0.534930 | `subscription_list` | ❌ | -| 4 | 0.528892 | `kusto_cluster_list` | ❌ | -| 5 | 0.516664 | `cosmos_account_list` | ❌ | - ---- - -## Test 224 ->>>>>>> 
84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 229 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** Show me my Azure function apps @@ -9433,37 +4376,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.560249 | `functionapp_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.464637 | `deploy_app_logs_get` | ❌ | -| 3 | 0.411323 | `get_bestpractices_get` | ❌ | -| 4 | 0.410461 | `search_service_list` | ❌ | -======= | 2 | 0.464985 | `deploy_app_logs_get` | ❌ | | 3 | 0.412646 | `search_service_list` | ❌ | | 4 | 0.411323 | `get_bestpractices_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.398503 | `extension_cli_install` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 220 -======= -## Test 215 -======= -| 3 | 0.436167 | `foundry_agents_list` | ❌ | -| 4 | 0.413594 | `get_bestpractices_get` | ❌ | -| 5 | 0.412646 | `search_service_list` | ❌ | - ---- - -## Test 225 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 230 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `functionapp_get` **Prompt:** What function apps do I have? 
@@ -9472,27 +4392,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.433675 | `functionapp_get` | ✅ **EXPECTED** | -| 2 | 0.346031 | `deploy_app_logs_get` | ❌ | +| 1 | 0.433674 | `functionapp_get` | ✅ **EXPECTED** | +| 2 | 0.346619 | `deploy_app_logs_get` | ❌ | | 3 | 0.337966 | `applens_resource_diagnose` | ❌ | | 4 | 0.316594 | `extension_cli_install` | ❌ | | 5 | 0.284362 | `get_bestpractices_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 221 -======= -<<<<<<< HEAD -## Test 216 -======= -## Test 226 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 231 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_admin_settings_get` **Prompt:** Get the account settings for my key vault @@ -9501,24 +4409,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.604780 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | -| 2 | 0.532196 | `storage_account_get` | ❌ | -| 3 | 0.496042 | `keyvault_key_get` | ❌ | -| 4 | 0.452367 | `appconfig_kv_set` | ❌ | -| 5 | 0.448265 | `keyvault_secret_get` | ❌ | - ---- - -## Test 222 -======= -<<<<<<< HEAD -| 1 | 0.604797 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | -| 2 | 0.532029 | `storage_account_get` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.604780 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | | 2 | 0.532196 | `storage_account_get` | ❌ | | 3 | 0.496629 | `keyvault_key_get` | ❌ | @@ -9527,16 +4417,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 217 -======= -## Test 227 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 232 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_admin_settings_get` **Prompt:** Show me the account settings for managed HSM keyvault @@ -9545,40 +4426,15 @@ | 
Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.671370 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | -| 2 | 0.455561 | `storage_account_get` | ❌ | -| 3 | 0.440966 | `keyvault_key_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.671368 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | -| 2 | 0.455516 | `storage_account_get` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.671370 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | | 2 | 0.455561 | `storage_account_get` | ❌ | | 3 | 0.441225 | `keyvault_key_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.404666 | `appconfig_kv_set` | ❌ | -| 5 | 0.395449 | `keyvault_secret_get` | ❌ | +| 5 | 0.395274 | `keyvault_secret_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 223 -======= -<<<<<<< HEAD -## Test 218 -======= -## Test 228 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 233 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_admin_settings_get` **Prompt:** What's the value of the setting in my key vault with name @@ -9587,23 +4443,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.505709 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | -| 2 | 0.496565 | `appconfig_kv_set` | ❌ | -| 3 | 0.420067 | `appconfig_kv_lock_set` | ❌ | -| 4 | 0.419642 | `keyvault_key_get` | ❌ | -| 5 | 0.410219 | `keyvault_secret_get` | ❌ | - ---- - -## Test 224 -======= -<<<<<<< HEAD -| 1 | 0.505731 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.505750 | `keyvault_admin_settings_get` | ✅ **EXPECTED** | | 2 | 0.496540 | `appconfig_kv_set` | ❌ | | 3 | 0.420145 | `appconfig_kv_lock_set` | ❌ | @@ -9612,16 +4451,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 219 -======= -## Test 229 
->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 234 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_create` **Prompt:** Create a new certificate called in the key vault @@ -9630,46 +4460,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.627727 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.570319 | `keyvault_certificate_import` | ❌ | -| 3 | 0.540199 | `keyvault_key_create` | ❌ | -| 4 | 0.519218 | `keyvault_certificate_get` | ❌ | -| 5 | 0.500027 | `keyvault_certificate_list` | ❌ | - ---- - -## Test 225 -======= -<<<<<<< HEAD -| 1 | 0.627882 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.570708 | `keyvault_certificate_import` | ❌ | -| 3 | 0.540476 | `keyvault_key_create` | ❌ | -| 4 | 0.519268 | `keyvault_certificate_get` | ❌ | -| 5 | 0.500093 | `keyvault_certificate_list` | ❌ | - ---- - -## Test 220 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.627727 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.570318 | `keyvault_certificate_import` | ❌ | -| 3 | 0.540199 | `keyvault_key_create` | ❌ | -| 4 | 0.519218 | `keyvault_certificate_get` | ❌ | -| 5 | 0.499900 | `keyvault_certificate_list` | ❌ | +| 1 | 0.627713 | `keyvault_certificate_create` | ✅ **EXPECTED** | +| 2 | 0.570324 | `keyvault_certificate_import` | ❌ | +| 3 | 0.540181 | `keyvault_key_create` | ❌ | +| 4 | 0.515939 | `keyvault_certificate_get` | ❌ | +| 5 | 0.500018 | `keyvault_certificate_list` | ❌ | --- -<<<<<<< HEAD -## Test 230 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 235 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_create` **Prompt:** Generate a certificate named in key vault @@ -9678,43 +4477,15 @@ | Rank | Score | 
Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.599548 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.561717 | `keyvault_certificate_import` | ❌ | -| 3 | 0.521910 | `keyvault_certificate_get` | ❌ | -| 4 | 0.501291 | `keyvault_key_create` | ❌ | -| 5 | 0.496516 | `keyvault_certificate_list` | ❌ | - ---- - -## Test 226 -======= -| 1 | 0.599990 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.561458 | `keyvault_certificate_import` | ❌ | -| 3 | 0.522706 | `keyvault_certificate_get` | ❌ | -| 4 | 0.502128 | `keyvault_key_create` | ❌ | -| 5 | 0.497145 | `keyvault_certificate_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 221 -======= -## Test 231 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.600003 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.561463 | `keyvault_certificate_import` | ❌ | -| 3 | 0.522705 | `keyvault_certificate_get` | ❌ | -| 4 | 0.502139 | `keyvault_key_create` | ❌ | -| 5 | 0.497143 | `keyvault_certificate_list` | ❌ | +| 1 | 0.600005 | `keyvault_certificate_create` | ✅ **EXPECTED** | +| 2 | 0.561459 | `keyvault_certificate_import` | ❌ | +| 3 | 0.519600 | `keyvault_certificate_get` | ❌ | +| 4 | 0.502052 | `keyvault_key_create` | ❌ | +| 5 | 0.497159 | `keyvault_certificate_list` | ❌ | --- ## Test 236 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_create` **Prompt:** Request creation of certificate in the key vault @@ -9723,46 +4494,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.573998 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.527759 | `keyvault_certificate_import` | ❌ | -| 3 | 0.498278 | `keyvault_certificate_get` | ❌ | -| 4 | 0.481548 | `keyvault_key_create` | ❌ | -| 5 | 0.469601 | `keyvault_certificate_list` | ❌ | - ---- - -## Test 227 -======= -<<<<<<< HEAD -| 1 | 
0.574040 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.527743 | `keyvault_certificate_import` | ❌ | -| 3 | 0.498226 | `keyvault_certificate_get` | ❌ | -| 4 | 0.481666 | `keyvault_key_create` | ❌ | -| 5 | 0.469651 | `keyvault_certificate_list` | ❌ | - ---- - -## Test 222 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.573998 | `keyvault_certificate_create` | ✅ **EXPECTED** | -| 2 | 0.527759 | `keyvault_certificate_import` | ❌ | -| 3 | 0.498278 | `keyvault_certificate_get` | ❌ | -| 4 | 0.481548 | `keyvault_key_create` | ❌ | -| 5 | 0.469457 | `keyvault_certificate_list` | ❌ | +| 1 | 0.573920 | `keyvault_certificate_create` | ✅ **EXPECTED** | +| 2 | 0.527631 | `keyvault_certificate_import` | ❌ | +| 3 | 0.495169 | `keyvault_certificate_get` | ❌ | +| 4 | 0.481471 | `keyvault_key_create` | ❌ | +| 5 | 0.469500 | `keyvault_certificate_list` | ❌ | --- -<<<<<<< HEAD -## Test 232 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 237 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_create` **Prompt:** Provision a new key vault certificate in vault @@ -9773,25 +4513,13 @@ |------|-------|------|--------| | 1 | 0.591697 | `keyvault_certificate_create` | ✅ **EXPECTED** | | 2 | 0.562265 | `keyvault_certificate_import` | ❌ | -| 3 | 0.522147 | `keyvault_certificate_get` | ❌ | +| 3 | 0.518739 | `keyvault_certificate_get` | ❌ | | 4 | 0.502529 | `keyvault_key_create` | ❌ | -| 5 | 0.479936 | `keyvault_certificate_list` | ❌ | +| 5 | 0.479992 | `keyvault_certificate_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 228 -======= -<<<<<<< HEAD -## Test 223 -======= -## Test 233 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 238 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_create` 
**Prompt:** Issue a certificate in key vault @@ -9802,25 +4530,13 @@ |------|-------|------|--------| | 1 | 0.622788 | `keyvault_certificate_create` | ✅ **EXPECTED** | | 2 | 0.558532 | `keyvault_certificate_import` | ❌ | -| 3 | 0.534503 | `keyvault_certificate_get` | ❌ | -| 4 | 0.521205 | `keyvault_certificate_list` | ❌ | +| 3 | 0.531287 | `keyvault_certificate_get` | ❌ | +| 4 | 0.521316 | `keyvault_certificate_list` | ❌ | | 5 | 0.465056 | `keyvault_key_create` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 229 -======= -<<<<<<< HEAD -## Test 224 -======= -## Test 234 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 239 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_get` **Prompt:** Show me the certificate in the key vault @@ -9829,27 +4545,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.600625 | `keyvault_certificate_get` | ✅ **EXPECTED** | -| 2 | 0.528153 | `keyvault_certificate_list` | ❌ | +| 1 | 0.603235 | `keyvault_certificate_get` | ✅ **EXPECTED** | +| 2 | 0.528405 | `keyvault_certificate_list` | ❌ | | 3 | 0.519037 | `keyvault_certificate_import` | ❌ | | 4 | 0.499293 | `keyvault_certificate_create` | ❌ | -| 5 | 0.487691 | `keyvault_key_get` | ❌ | +| 5 | 0.486609 | `keyvault_key_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 230 -======= -<<<<<<< HEAD -## Test 225 -======= -## Test 235 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 240 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_get` **Prompt:** Show me the details of the certificate in the key vault @@ -9858,38 +4562,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.646098 | `keyvault_certificate_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.563263 | `keyvault_key_get` 
| ❌ | -| 3 | 0.514499 | `keyvault_secret_get` | ❌ | -| 4 | 0.509446 | `keyvault_certificate_list` | ❌ | -<<<<<<< HEAD -| 5 | 0.507738 | `keyvault_certificate_import` | ❌ | - ---- - -## Test 231 -======= -| 5 | 0.507630 | `keyvault_certificate_import` | ❌ | - ---- - -<<<<<<< HEAD -## Test 226 -======= -## Test 236 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 2 | 0.562988 | `keyvault_key_get` | ❌ | -| 3 | 0.514170 | `keyvault_secret_get` | ❌ | -| 4 | 0.509201 | `keyvault_certificate_list` | ❌ | -| 5 | 0.507737 | `keyvault_certificate_import` | ❌ | +| 1 | 0.649214 | `keyvault_certificate_get` | ✅ **EXPECTED** | +| 2 | 0.563103 | `keyvault_key_get` | ❌ | +| 3 | 0.514222 | `keyvault_secret_get` | ❌ | +| 4 | 0.509503 | `keyvault_certificate_list` | ❌ | +| 5 | 0.507757 | `keyvault_certificate_import` | ❌ | --- ## Test 241 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_get` **Prompt:** Get the certificate from vault @@ -9898,36 +4579,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.609523 | `keyvault_certificate_get` | ✅ **EXPECTED** | -| 2 | 0.515460 | `keyvault_certificate_list` | ❌ | +| 1 | 0.606958 | `keyvault_certificate_get` | ✅ **EXPECTED** | +| 2 | 0.515570 | `keyvault_certificate_list` | ❌ | | 3 | 0.511197 | `keyvault_certificate_create` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD | 4 | 0.507768 | `keyvault_certificate_import` | ❌ | -| 5 | 0.475674 | `keyvault_key_get` | ❌ | - ---- - -## Test 232 -======= -| 4 | 0.507693 | `keyvault_certificate_import` | ❌ | -======= -| 4 | 0.507768 | `keyvault_certificate_import` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.474394 | `keyvault_key_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 227 -======= -## Test 237 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## 
Test 242 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_get` **Prompt:** Display the certificate details for in vault @@ -9936,43 +4596,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.647669 | `keyvault_certificate_get` | ✅ **EXPECTED** | -| 2 | 0.528243 | `keyvault_key_get` | ❌ | +| 1 | 0.649758 | `keyvault_certificate_get` | ✅ **EXPECTED** | +| 2 | 0.527400 | `keyvault_key_get` | ❌ | | 3 | 0.521556 | `keyvault_certificate_list` | ❌ | | 4 | 0.509796 | `keyvault_certificate_import` | ❌ | -| 5 | 0.502403 | `keyvault_secret_get` | ❌ | - ---- - -<<<<<<< HEAD -## Test 233 -======= -## Test 228 -======= -| 1 | 0.647626 | `keyvault_certificate_get` | ✅ **EXPECTED** | -| 2 | 0.527284 | `keyvault_key_get` | ❌ | -| 3 | 0.521689 | `keyvault_certificate_list` | ❌ | -| 4 | 0.509907 | `keyvault_certificate_import` | ❌ | -| 5 | 0.501942 | `keyvault_secret_get` | ❌ | - ---- - -## Test 238 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.647745 | `keyvault_certificate_get` | ✅ **EXPECTED** | -| 2 | 0.527487 | `keyvault_key_get` | ❌ | -| 3 | 0.521168 | `keyvault_certificate_list` | ❌ | -| 4 | 0.509776 | `keyvault_certificate_import` | ❌ | -| 5 | 0.502207 | `keyvault_secret_get` | ❌ | +| 5 | 0.501988 | `keyvault_secret_get` | ❌ | --- ## Test 243 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_get` **Prompt:** Retrieve certificate metadata for in vault @@ -9981,46 +4613,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.595959 | `keyvault_certificate_get` | ✅ **EXPECTED** | +| 1 | 0.594012 | `keyvault_certificate_get` | ✅ **EXPECTED** | | 2 | 0.527404 | `keyvault_certificate_list` | ❌ | | 3 | 0.519059 | `keyvault_certificate_import` | ❌ | | 4 | 0.501138 | `keyvault_certificate_create` | ❌ | -| 5 | 
0.465429 | `keyvault_key_get` | ❌ | - ---- - -## Test 234 -======= -<<<<<<< HEAD -| 1 | 0.595902 | `keyvault_certificate_get` | ✅ **EXPECTED** | -| 2 | 0.527167 | `keyvault_certificate_list` | ❌ | -| 3 | 0.518836 | `keyvault_certificate_import` | ❌ | -| 4 | 0.500932 | `keyvault_certificate_create` | ❌ | -| 5 | 0.465265 | `keyvault_key_get` | ❌ | - ---- - -## Test 229 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.595959 | `keyvault_certificate_get` | ✅ **EXPECTED** | -| 2 | 0.527274 | `keyvault_certificate_list` | ❌ | -| 3 | 0.519059 | `keyvault_certificate_import` | ❌ | -| 4 | 0.501138 | `keyvault_certificate_create` | ❌ | | 5 | 0.465174 | `keyvault_key_get` | ❌ | --- -<<<<<<< HEAD -## Test 239 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 244 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_import` **Prompt:** Import the certificate in file into the key vault @@ -10029,47 +4630,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.585481 | `keyvault_certificate_import` | ✅ **EXPECTED** | -| 2 | 0.420747 | `keyvault_certificate_get` | ❌ | +| 2 | 0.420009 | `keyvault_certificate_get` | ❌ | | 3 | 0.402595 | `keyvault_certificate_create` | ❌ | | 4 | 0.399342 | `keyvault_certificate_list` | ❌ | | 5 | 0.352905 | `keyvault_key_create` | ❌ | --- -## Test 235 -======= -<<<<<<< HEAD -| 1 | 0.585549 | `keyvault_certificate_import` | ✅ **EXPECTED** | -| 2 | 0.420798 | `keyvault_certificate_get` | ❌ | -| 3 | 0.402853 | `keyvault_certificate_create` | ❌ | -| 4 | 0.399353 | `keyvault_certificate_list` | ❌ | -| 5 | 0.353196 | `keyvault_key_create` | ❌ | - ---- - -## Test 230 -======= -| 1 | 0.585374 | `keyvault_certificate_import` | ✅ **EXPECTED** | -======= -| 1 | 0.585481 | `keyvault_certificate_import` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 2 
| 0.420747 | `keyvault_certificate_get` | ❌ | -| 3 | 0.402595 | `keyvault_certificate_create` | ❌ | -| 4 | 0.399228 | `keyvault_certificate_list` | ❌ | -| 5 | 0.352905 | `keyvault_key_create` | ❌ | - ---- - -<<<<<<< HEAD -## Test 240 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 245 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_import` **Prompt:** Import a certificate into the key vault using the name @@ -10078,34 +4647,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.622172 | `keyvault_certificate_import` | ✅ **EXPECTED** | -| 2 | 0.504401 | `keyvault_certificate_get` | ❌ | -| 3 | 0.498608 | `keyvault_certificate_create` | ❌ | -| 4 | 0.448038 | `keyvault_certificate_list` | ❌ | -| 5 | 0.419465 | `keyvault_key_create` | ❌ | - ---- - -<<<<<<< HEAD -<<<<<<< HEAD -## Test 236 -======= -## Test 231 -======= -| 1 | 0.622168 | `keyvault_certificate_import` | ✅ **EXPECTED** | -| 2 | 0.504306 | `keyvault_certificate_get` | ❌ | -| 3 | 0.498841 | `keyvault_certificate_create` | ❌ | -| 4 | 0.448114 | `keyvault_certificate_list` | ❌ | -| 5 | 0.419794 | `keyvault_key_create` | ❌ | +| 1 | 0.622125 | `keyvault_certificate_import` | ✅ **EXPECTED** | +| 2 | 0.501864 | `keyvault_certificate_get` | ❌ | +| 3 | 0.498847 | `keyvault_certificate_create` | ❌ | +| 4 | 0.448105 | `keyvault_certificate_list` | ❌ | +| 5 | 0.419811 | `keyvault_key_create` | ❌ | --- -## Test 241 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 246 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_import` **Prompt:** Upload certificate file to key vault @@ -10116,32 +4666,13 @@ |------|-------|------|--------| | 1 | 0.595707 | `keyvault_certificate_import` | ✅ **EXPECTED** | | 2 | 0.453929 | 
`keyvault_certificate_create` | ❌ | -| 3 | 0.452551 | `keyvault_certificate_get` | ❌ | -| 4 | 0.418115 | `keyvault_certificate_list` | ❌ | +| 3 | 0.451713 | `keyvault_certificate_get` | ❌ | +| 4 | 0.418203 | `keyvault_certificate_list` | ❌ | | 5 | 0.413377 | `keyvault_key_create` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 237 -======= -## Test 232 -======= -| 1 | 0.594990 | `keyvault_certificate_import` | ✅ **EXPECTED** | -| 2 | 0.453726 | `keyvault_certificate_create` | ❌ | -| 3 | 0.452165 | `keyvault_certificate_get` | ❌ | -| 4 | 0.418142 | `keyvault_certificate_list` | ❌ | -| 5 | 0.413240 | `keyvault_key_create` | ❌ | - ---- - -## Test 242 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 247 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_import` **Prompt:** Load certificate from file into vault @@ -10151,26 +4682,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.619480 | `keyvault_certificate_import` | ✅ **EXPECTED** | -| 2 | 0.517804 | `keyvault_certificate_get` | ❌ | +| 2 | 0.515610 | `keyvault_certificate_get` | ❌ | | 3 | 0.480815 | `keyvault_certificate_create` | ❌ | -| 4 | 0.444264 | `keyvault_certificate_list` | ❌ | +| 4 | 0.444386 | `keyvault_certificate_list` | ❌ | | 5 | 0.381873 | `keyvault_key_create` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 238 -======= -<<<<<<< HEAD -## Test 233 -======= -## Test 243 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 248 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_import` **Prompt:** Add existing certificate file to the key vault with name @@ -10179,48 +4698,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.595418 | `keyvault_certificate_import` | ✅ 
**EXPECTED** | -| 2 | 0.452490 | `keyvault_certificate_create` | ❌ | -======= -<<<<<<< HEAD | 1 | 0.595417 | `keyvault_certificate_import` | ✅ **EXPECTED** | | 2 | 0.452489 | `keyvault_certificate_create` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.441616 | `keyvault_certificate_get` | ❌ | +| 3 | 0.440366 | `keyvault_certificate_get` | ❌ | | 4 | 0.408018 | `keyvault_key_create` | ❌ | -| 5 | 0.392244 | `keyvault_secret_create` | ❌ | - ---- - -<<<<<<< HEAD -## Test 239 -======= -## Test 234 -======= -| 1 | 0.595426 | `keyvault_certificate_import` | ✅ **EXPECTED** | -| 2 | 0.452531 | `keyvault_certificate_create` | ❌ | -| 3 | 0.441676 | `keyvault_certificate_get` | ❌ | -| 4 | 0.408033 | `keyvault_key_create` | ❌ | -| 5 | 0.392316 | `keyvault_secret_create` | ❌ | - ---- - -## Test 244 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.595460 | `keyvault_certificate_import` | ✅ **EXPECTED** | -| 2 | 0.452480 | `keyvault_certificate_create` | ❌ | -| 3 | 0.441646 | `keyvault_certificate_get` | ❌ | -| 4 | 0.408002 | `keyvault_key_create` | ❌ | -| 5 | 0.392240 | `keyvault_secret_create` | ❌ | +| 5 | 0.392284 | `keyvault_secret_create` | ❌ | --- ## Test 249 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_list` **Prompt:** List all certificates in the key vault @@ -10229,41 +4715,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.726124 | `keyvault_certificate_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.583110 | `keyvault_key_list` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.583138 | `keyvault_key_list` | ❌ | -======= -| 2 | 0.583079 | `keyvault_key_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.531988 | `keyvault_secret_list` | ❌ | -======= -| 1 | 0.726049 | 
`keyvault_certificate_list` | ✅ **EXPECTED** | | 2 | 0.583110 | `keyvault_key_list` | ❌ | -| 3 | 0.532060 | `keyvault_secret_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.515236 | `keyvault_certificate_get` | ❌ | +| 3 | 0.531838 | `keyvault_secret_list` | ❌ | +| 4 | 0.514152 | `keyvault_certificate_get` | ❌ | | 5 | 0.485792 | `keyvault_certificate_create` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 240 -======= -<<<<<<< HEAD -## Test 235 -======= -## Test 245 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 250 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_list` **Prompt:** Show me the certificates in the key vault @@ -10272,39 +4732,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.615289 | `keyvault_certificate_list` | ✅ **EXPECTED** | -| 2 | 0.522453 | `keyvault_certificate_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.475156 | `keyvault_key_list` | ❌ | -======= -<<<<<<< HEAD -| 3 | 0.475197 | `keyvault_key_list` | ❌ | -======= -| 3 | 0.475142 | `keyvault_key_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= +| 1 | 0.615541 | `keyvault_certificate_list` | ✅ **EXPECTED** | +| 2 | 0.525122 | `keyvault_certificate_get` | ❌ | | 3 | 0.475156 | `keyvault_key_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.460973 | `keyvault_certificate_create` | ❌ | -| 5 | 0.449381 | `keyvault_key_get` | ❌ | +| 5 | 0.448139 | `keyvault_key_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 241 -======= -<<<<<<< HEAD -## Test 236 -======= -## Test 246 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 251 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** 
`keyvault_certificate_list` **Prompt:** What certificates are in the key vault ? @@ -10313,35 +4749,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.624710 | `keyvault_certificate_list` | ✅ **EXPECTED** | -======= -| 1 | 0.624522 | `keyvault_certificate_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 2 | 0.519739 | `keyvault_certificate_get` | ❌ | +| 1 | 0.624711 | `keyvault_certificate_list` | ✅ **EXPECTED** | +| 2 | 0.518577 | `keyvault_certificate_get` | ❌ | | 3 | 0.510048 | `keyvault_certificate_create` | ❌ | | 4 | 0.505534 | `keyvault_certificate_import` | ❌ | | 5 | 0.497356 | `keyvault_key_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 242 -======= -## Test 237 -======= -| 4 | 0.505367 | `keyvault_certificate_import` | ❌ | -| 5 | 0.497322 | `keyvault_key_list` | ❌ | - ---- - -## Test 247 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 252 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_list` **Prompt:** List certificate names in vault @@ -10350,41 +4766,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.672622 | `keyvault_certificate_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.553990 | `keyvault_key_list` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.554016 | `keyvault_key_list` | ❌ | -======= -| 2 | 0.553960 | `keyvault_key_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.511905 | `keyvault_secret_list` | ❌ | -======= -| 1 | 0.672392 | `keyvault_certificate_list` | ✅ **EXPECTED** | | 2 | 0.553990 | `keyvault_key_list` | ❌ | -| 3 | 0.511981 | `keyvault_secret_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.507062 | `keyvault_certificate_get` | ❌ | +| 3 | 0.511668 | `keyvault_secret_list` | ❌ | +| 4 
| 0.505198 | `keyvault_certificate_get` | ❌ | | 5 | 0.492357 | `keyvault_certificate_create` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 243 -======= -<<<<<<< HEAD -## Test 238 -======= -## Test 248 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 253 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_list` **Prompt:** Enumerate certificates in key vault @@ -10393,42 +4783,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.747408 | `keyvault_certificate_list` | ✅ **EXPECTED** | -| 2 | 0.594216 | `keyvault_key_list` | ❌ | -======= | 1 | 0.747407 | `keyvault_certificate_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.594268 | `keyvault_key_list` | ❌ | -======= -| 2 | 0.594121 | `keyvault_key_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.558771 | `keyvault_secret_list` | ❌ | -======= -| 1 | 0.747416 | `keyvault_certificate_list` | ✅ **EXPECTED** | | 2 | 0.594216 | `keyvault_key_list` | ❌ | -| 3 | 0.558818 | `keyvault_secret_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.515568 | `keyvault_certificate_get` | ❌ | +| 3 | 0.558644 | `keyvault_secret_list` | ❌ | +| 4 | 0.513381 | `keyvault_certificate_get` | ❌ | | 5 | 0.490876 | `keyvault_certificate_create` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 244 -======= -<<<<<<< HEAD -## Test 239 -======= -## Test 249 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 254 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_certificate_list` **Prompt:** Show certificate names in the key vault @@ -10437,39 +4800,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.639473 | 
`keyvault_certificate_list` | ✅ **EXPECTED** | -| 2 | 0.512475 | `keyvault_certificate_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.507572 | `keyvault_key_list` | ❌ | -======= -<<<<<<< HEAD -| 3 | 0.507603 | `keyvault_key_list` | ❌ | -======= -| 3 | 0.507562 | `keyvault_key_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= +| 1 | 0.639711 | `keyvault_certificate_list` | ✅ **EXPECTED** | +| 2 | 0.512269 | `keyvault_certificate_get` | ❌ | | 3 | 0.507572 | `keyvault_key_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.482583 | `keyvault_certificate_create` | ❌ | -| 5 | 0.464824 | `keyvault_secret_list` | ❌ | +| 5 | 0.464535 | `keyvault_secret_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 245 -======= -<<<<<<< HEAD -## Test 240 -======= -## Test 250 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 255 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_create` **Prompt:** Create a new key called with the RSA type in the key vault @@ -10479,33 +4818,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.661466 | `keyvault_key_create` | ✅ **EXPECTED** | -| 2 | 0.456580 | `keyvault_secret_create` | ❌ | +| 2 | 0.456633 | `keyvault_secret_create` | ❌ | | 3 | 0.451790 | `keyvault_certificate_create` | ❌ | | 4 | 0.429614 | `keyvault_certificate_import` | ❌ | -| 5 | 0.399469 | `keyvault_key_get` | ❌ | - ---- - -<<<<<<< HEAD -<<<<<<< HEAD -## Test 246 -======= -## Test 241 -======= -| 1 | 0.661548 | `keyvault_key_create` | ✅ **EXPECTED** | -| 2 | 0.456628 | `keyvault_secret_create` | ❌ | -| 3 | 0.451826 | `keyvault_certificate_create` | ❌ | -| 4 | 0.429537 | `keyvault_certificate_import` | ❌ | -| 5 | 0.399324 | `keyvault_key_get` | ❌ | +| 5 | 0.399326 | `keyvault_key_get` | ❌ | --- -## Test 251 ->>>>>>> 
84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 256 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_create` **Prompt:** Generate a key with type in vault @@ -10514,51 +4834,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.641070 | `keyvault_key_create` | ✅ **EXPECTED** | -| 2 | 0.428964 | `keyvault_key_get` | ❌ | -| 3 | 0.422763 | `keyvault_certificate_create` | ❌ | -| 4 | 0.420045 | `keyvault_secret_create` | ❌ | -| 5 | 0.405644 | `appconfig_kv_set` | ❌ | - ---- - -## Test 247 -======= -<<<<<<< HEAD -| 1 | 0.641022 | `keyvault_key_create` | ✅ **EXPECTED** | -| 2 | 0.428461 | `keyvault_key_get` | ❌ | -| 3 | 0.422686 | `keyvault_certificate_create` | ❌ | -| 4 | 0.419964 | `keyvault_secret_create` | ❌ | -| 5 | 0.405612 | `appconfig_kv_set` | ❌ | - ---- - -## Test 242 -======= | 1 | 0.641070 | `keyvault_key_create` | ✅ **EXPECTED** | | 2 | 0.428502 | `keyvault_key_get` | ❌ | | 3 | 0.422763 | `keyvault_certificate_create` | ❌ | -| 4 | 0.420045 | `keyvault_secret_create` | ❌ | +| 4 | 0.420135 | `keyvault_secret_create` | ❌ | | 5 | 0.405644 | `appconfig_kv_set` | ❌ | --- -## Test 252 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.641639 | `keyvault_key_create` | ✅ **EXPECTED** | -| 2 | 0.428841 | `keyvault_key_get` | ❌ | -| 3 | 0.423116 | `keyvault_certificate_create` | ❌ | -| 4 | 0.420631 | `keyvault_secret_create` | ❌ | -| 5 | 0.406157 | `appconfig_kv_set` | ❌ | - ---- - ## Test 257 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_create` **Prompt:** Create an oct key in the vault @@ -10567,46 +4851,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.547493 | `keyvault_key_create` | ✅ **EXPECTED** | -| 2 | 
0.463557 | `keyvault_secret_create` | ❌ | -| 3 | 0.447410 | `keyvault_certificate_create` | ❌ | -| 4 | 0.420793 | `keyvault_key_get` | ❌ | -| 5 | 0.404350 | `keyvault_certificate_import` | ❌ | - ---- - -## Test 248 -======= -<<<<<<< HEAD -| 1 | 0.548424 | `keyvault_key_create` | ✅ **EXPECTED** | -| 2 | 0.464221 | `keyvault_secret_create` | ❌ | -| 3 | 0.448379 | `keyvault_certificate_create` | ❌ | -| 4 | 0.421467 | `keyvault_key_get` | ❌ | -| 5 | 0.405195 | `keyvault_certificate_import` | ❌ | - ---- - -## Test 243 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.547493 | `keyvault_key_create` | ✅ **EXPECTED** | -| 2 | 0.463557 | `keyvault_secret_create` | ❌ | +| 2 | 0.463630 | `keyvault_secret_create` | ❌ | | 3 | 0.447410 | `keyvault_certificate_create` | ❌ | | 4 | 0.420366 | `keyvault_key_get` | ❌ | | 5 | 0.404350 | `keyvault_certificate_import` | ❌ | --- -<<<<<<< HEAD -## Test 253 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 258 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_create` **Prompt:** Create an RSA key in the vault with name @@ -10616,33 +4869,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.641369 | `keyvault_key_create` | ✅ **EXPECTED** | -| 2 | 0.501636 | `keyvault_secret_create` | ❌ | +| 2 | 0.501664 | `keyvault_secret_create` | ❌ | | 3 | 0.491735 | `keyvault_certificate_create` | ❌ | | 4 | 0.464557 | `keyvault_certificate_import` | ❌ | -| 5 | 0.451505 | `keyvault_key_get` | ❌ | - ---- - -<<<<<<< HEAD -<<<<<<< HEAD -## Test 249 -======= -## Test 244 -======= -| 1 | 0.640853 | `keyvault_key_create` | ✅ **EXPECTED** | -| 2 | 0.500742 | `keyvault_secret_create` | ❌ | -| 3 | 0.491071 | `keyvault_certificate_create` | ❌ | -| 4 | 0.463536 | `keyvault_certificate_import` | ❌ | -| 5 | 0.450448 | `keyvault_key_get` | ❌ | +| 5 | 0.451016 | `keyvault_key_get` | ❌ | --- -## Test 254 
->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 259 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_create` **Prompt:** Create an EC key with name in the vault @@ -10651,38 +4885,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.571793 | `keyvault_key_create` | ✅ **EXPECTED** | -| 2 | 0.443085 | `keyvault_certificate_create` | ❌ | -| 3 | 0.434697 | `keyvault_secret_create` | ❌ | -| 4 | 0.421997 | `keyvault_key_get` | ❌ | -| 5 | 0.400514 | `keyvault_certificate_import` | ❌ | - ---- - -## Test 250 -======= | 1 | 0.571718 | `keyvault_key_create` | ✅ **EXPECTED** | | 2 | 0.443369 | `keyvault_certificate_create` | ❌ | -| 3 | 0.434675 | `keyvault_secret_create` | ❌ | +| 3 | 0.434701 | `keyvault_secret_create` | ❌ | | 4 | 0.421721 | `keyvault_key_get` | ❌ | | 5 | 0.400533 | `keyvault_certificate_import` | ❌ | --- -<<<<<<< HEAD -## Test 245 -======= -| 5 | 0.400433 | `keyvault_certificate_import` | ❌ | - ---- - -## Test 255 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 260 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_get` **Prompt:** Show me the key in the key vault @@ -10691,39 +4902,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.550225 | `keyvault_key_get` | ✅ **EXPECTED** | -| 2 | 0.468243 | `keyvault_secret_get` | ❌ | +| 1 | 0.549488 | `keyvault_key_get` | ✅ **EXPECTED** | +| 2 | 0.468165 | `keyvault_secret_get` | ❌ | | 3 | 0.452816 | `keyvault_key_create` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.439969 | `keyvault_key_list` | ❌ | -======= -<<<<<<< HEAD -| 4 | 0.440015 | `keyvault_key_list` | ❌ | -======= -| 4 | 0.439941 | `keyvault_key_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 
(update prompts and tool description evaluator) -======= | 4 | 0.439969 | `keyvault_key_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 5 | 0.426545 | `keyvault_certificate_get` | ❌ | +| 5 | 0.430038 | `keyvault_certificate_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 251 -======= -<<<<<<< HEAD -## Test 246 -======= -## Test 256 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 261 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_get` **Prompt:** Show me the details of the key in the key vault @@ -10732,47 +4919,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.629372 | `keyvault_key_get` | ✅ **EXPECTED** | -| 2 | 0.532872 | `keyvault_secret_get` | ❌ | -| 3 | 0.512278 | `storage_account_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.629552 | `keyvault_key_get` | ✅ **EXPECTED** | -| 2 | 0.532651 | `keyvault_secret_get` | ❌ | -| 3 | 0.512106 | `storage_account_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.629552 | `keyvault_key_get` | ✅ **EXPECTED** | | 2 | 0.532651 | `keyvault_secret_get` | ❌ | | 3 | 0.512278 | `storage_account_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.495957 | `keyvault_certificate_get` | ❌ | +| 4 | 0.499757 | `keyvault_certificate_get` | ❌ | | 5 | 0.456992 | `keyvault_key_create` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 252 -======= -## Test 247 -======= -| 1 | 0.629579 | `keyvault_key_get` | ✅ **EXPECTED** | -| 2 | 0.532628 | `keyvault_secret_get` | ❌ | -| 3 | 0.512235 | `storage_account_get` | ❌ | -| 4 | 0.496014 | `keyvault_certificate_get` | ❌ | -| 5 | 0.457056 | `keyvault_key_create` | ❌ | - ---- - -## Test 257 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 262 ->>>>>>> 
e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_get` **Prompt:** Get the key from vault @@ -10781,31 +4936,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.485492 | `keyvault_key_get` | ✅ **EXPECTED** | +| 1 | 0.484645 | `keyvault_key_get` | ✅ **EXPECTED** | | 2 | 0.443182 | `keyvault_key_create` | ❌ | -<<<<<<< HEAD -| 3 | 0.409356 | `keyvault_secret_get` | ❌ | -======= | 3 | 0.409388 | `keyvault_secret_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.395491 | `keyvault_admin_settings_get` | ❌ | | 5 | 0.383519 | `appconfig_kv_lock_set` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 253 -======= -<<<<<<< HEAD -## Test 248 -======= -## Test 258 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 263 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_get` **Prompt:** Display the key details for in vault @@ -10814,41 +4953,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.590297 | `keyvault_key_get` | ✅ **EXPECTED** | -| 2 | 0.488574 | `keyvault_secret_get` | ❌ | -| 3 | 0.476498 | `storage_account_get` | ❌ | -======= | 1 | 0.590303 | `keyvault_key_get` | ✅ **EXPECTED** | | 2 | 0.488213 | `keyvault_secret_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.476278 | `storage_account_get` | ❌ | -======= -| 3 | 0.476529 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 3 | 0.476498 | `storage_account_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.460796 | `keyvault_certificate_get` | ❌ | +| 4 | 0.464283 | `keyvault_certificate_get` | ❌ | | 5 | 0.436511 | `keyvault_admin_settings_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 254 -======= -<<<<<<< HEAD -## Test 249 -======= -## Test 259 ->>>>>>> 84ad4f44 (update 
prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 264 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_get` **Prompt:** Retrieve key metadata for in vault @@ -10857,37 +4970,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.518346 | `keyvault_key_get` | ✅ **EXPECTED** | -| 2 | 0.432950 | `storage_account_get` | ❌ | -| 3 | 0.432742 | `keyvault_admin_settings_get` | ❌ | -======= -| 1 | 0.518886 | `keyvault_key_get` | ✅ **EXPECTED** | -| 2 | 0.432950 | `storage_account_get` | ❌ | -| 3 | 0.432742 | `keyvault_admin_settings_get` | ❌ | -<<<<<<< HEAD ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.429131 | `keyvault_key_create` | ❌ | -| 5 | 0.422731 | `keyvault_secret_get` | ❌ | +| 1 | 0.518839 | `keyvault_key_get` | ✅ **EXPECTED** | +| 2 | 0.432982 | `storage_account_get` | ❌ | +| 3 | 0.432733 | `keyvault_admin_settings_get` | ❌ | +| 4 | 0.429089 | `keyvault_key_create` | ❌ | +| 5 | 0.422498 | `keyvault_secret_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 255 -======= -<<<<<<< HEAD -## Test 250 -======= -## Test 260 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 265 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_list` **Prompt:** List all keys in the key vault @@ -10896,46 +4987,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.701448 | `keyvault_key_list` | ✅ **EXPECTED** | -======= -<<<<<<< HEAD -| 1 | 0.701474 | `keyvault_key_list` | ✅ **EXPECTED** | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.601513 | `keyvault_certificate_list` | ❌ | -| 3 | 0.587427 | 
`keyvault_secret_list` | ❌ | +| 3 | 0.587218 | `keyvault_secret_list` | ❌ | | 4 | 0.498767 | `cosmos_account_list` | ❌ | | 5 | 0.480129 | `keyvault_admin_settings_get` | ❌ | --- -<<<<<<< HEAD -## Test 256 -======= -## Test 251 -======= -| 1 | 0.701420 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.601513 | `keyvault_certificate_list` | ❌ | -| 3 | 0.587427 | `keyvault_secret_list` | ❌ | -| 4 | 0.498750 | `cosmos_account_list` | ❌ | -======= -| 1 | 0.701448 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.601430 | `keyvault_certificate_list` | ❌ | -| 3 | 0.587541 | `keyvault_secret_list` | ❌ | -| 4 | 0.498767 | `cosmos_account_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 5 | 0.480129 | `keyvault_admin_settings_get` | ❌ | - ---- - -<<<<<<< HEAD -## Test 261 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 266 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_list` **Prompt:** Show me the keys in the key vault @@ -10944,40 +5004,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.549453 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.507865 | `keyvault_key_get` | ❌ | -| 3 | 0.475507 | `keyvault_certificate_list` | ❌ | -| 4 | 0.472465 | `keyvault_admin_settings_get` | ❌ | -| 5 | 0.455936 | `keyvault_secret_get` | ❌ | - ---- - -## Test 257 -======= -<<<<<<< HEAD -| 1 | 0.549498 | `keyvault_key_list` | ✅ **EXPECTED** | -======= | 1 | 0.549453 | `keyvault_key_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.506815 | `keyvault_key_get` | ❌ | -| 3 | 0.475251 | `keyvault_certificate_list` | ❌ | +| 3 | 0.475507 | `keyvault_certificate_list` | ❌ | | 4 | 0.472465 | `keyvault_admin_settings_get` | ❌ | | 5 | 0.455683 | `keyvault_secret_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 252 -======= -## Test 262 ->>>>>>> 84ad4f44 (update prompts and tool 
description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 267 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_list` **Prompt:** What keys are in the key vault ? @@ -10986,45 +5021,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.581970 | `keyvault_key_list` | ✅ **EXPECTED** | | 2 | 0.502245 | `keyvault_admin_settings_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.582010 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.502252 | `keyvault_admin_settings_get` | ❌ | -======= -| 1 | 0.581948 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.502245 | `keyvault_admin_settings_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.501481 | `keyvault_certificate_list` | ❌ | -| 4 | 0.477451 | `keyvault_key_get` | ❌ | -| 5 | 0.472414 | `keyvault_secret_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 258 -======= -<<<<<<< HEAD -## Test 253 -======= -## Test 263 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.581970 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.502245 | `keyvault_admin_settings_get` | ❌ | -| 3 | 0.501285 | `keyvault_certificate_list` | ❌ | | 4 | 0.476470 | `keyvault_key_get` | ❌ | -| 5 | 0.472515 | `keyvault_secret_list` | ❌ | +| 5 | 0.472124 | `keyvault_secret_list` | ❌ | --- ## Test 268 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_list` **Prompt:** List key names in vault @@ -11033,45 +5038,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.641314 | `keyvault_key_list` | ✅ **EXPECTED** | -======= -<<<<<<< HEAD -| 1 | 0.641339 | `keyvault_key_list` | ✅ **EXPECTED** | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 2 | 0.559550 | 
`keyvault_certificate_list` | ❌ | -| 3 | 0.553553 | `keyvault_secret_list` | ❌ | -======= -| 1 | 0.641314 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.559318 | `keyvault_certificate_list` | ❌ | -| 3 | 0.553669 | `keyvault_secret_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 3 | 0.553257 | `keyvault_secret_list` | ❌ | | 4 | 0.486377 | `keyvault_admin_settings_get` | ❌ | | 5 | 0.475992 | `cosmos_account_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 259 -======= -## Test 254 -======= -| 1 | 0.641210 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.559476 | `keyvault_certificate_list` | ❌ | -| 3 | 0.553501 | `keyvault_secret_list` | ❌ | -| 4 | 0.486377 | `keyvault_admin_settings_get` | ❌ | -| 5 | 0.475945 | `cosmos_account_list` | ❌ | - ---- - -## Test 264 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 269 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_list` **Prompt:** Enumerate keys in key vault @@ -11080,47 +5055,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.723266 | `keyvault_key_list` | ✅ **EXPECTED** | | 2 | 0.611366 | `keyvault_certificate_list` | ❌ | -| 3 | 0.611185 | `keyvault_secret_list` | ❌ | -| 4 | 0.473886 | `keyvault_admin_settings_get` | ❌ | -| 5 | 0.443322 | `keyvault_key_get` | ❌ | - ---- - -## Test 260 -======= -<<<<<<< HEAD -| 1 | 0.723318 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.611366 | `keyvault_certificate_list` | ❌ | -| 3 | 0.611185 | `keyvault_secret_list` | ❌ | -| 4 | 0.473874 | `keyvault_admin_settings_get` | ❌ | -======= -| 1 | 0.723171 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.611366 | `keyvault_certificate_list` | ❌ | -| 3 | 0.611185 | `keyvault_secret_list` | ❌ | -======= -| 1 | 0.723266 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.611390 | `keyvault_certificate_list` | ❌ | -| 3 | 0.611279 | 
`keyvault_secret_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 3 | 0.611042 | `keyvault_secret_list` | ❌ | | 4 | 0.473886 | `keyvault_admin_settings_get` | ❌ | | 5 | 0.441881 | `keyvault_key_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 255 -======= -## Test 265 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 270 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_key_list` **Prompt:** Show key names in the key vault @@ -11129,49 +5072,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.570444 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.501953 | `keyvault_key_get` | ❌ | -| 3 | 0.500103 | `keyvault_certificate_list` | ❌ | -| 4 | 0.496817 | `storage_account_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.570489 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.501073 | `keyvault_key_get` | ❌ | -| 3 | 0.500103 | `keyvault_certificate_list` | ❌ | -| 4 | 0.496907 | `storage_account_get` | ❌ | -======= -| 1 | 0.570418 | `keyvault_key_list` | ✅ **EXPECTED** | | 2 | 0.501073 | `keyvault_key_get` | ❌ | | 3 | 0.500103 | `keyvault_certificate_list` | ❌ | -| 4 | 0.496837 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 5 | 0.490367 | `keyvault_secret_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 261 -======= -<<<<<<< HEAD -## Test 256 -======= -## Test 266 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.570444 | `keyvault_key_list` | ✅ **EXPECTED** | -| 2 | 0.501073 | `keyvault_key_get` | ❌ | -| 3 | 0.499912 | `keyvault_certificate_list` | ❌ | | 4 | 0.496817 | `storage_account_get` | ❌ | -| 5 | 0.490504 | `keyvault_secret_list` | ❌ | +| 5 | 0.490100 | `keyvault_secret_list` 
| ❌ | --- ## Test 271 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_create` **Prompt:** Create a new secret called with value in the key vault @@ -11180,27 +5089,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.678482 | `keyvault_secret_create` | ✅ **EXPECTED** | -| 2 | 0.553018 | `keyvault_key_create` | ❌ | -| 3 | 0.512602 | `keyvault_secret_get` | ❌ | -| 4 | 0.475097 | `keyvault_certificate_create` | ❌ | -| 5 | 0.461437 | `appconfig_kv_set` | ❌ | +| 1 | 0.678470 | `keyvault_secret_create` | ✅ **EXPECTED** | +| 2 | 0.552708 | `keyvault_key_create` | ❌ | +| 3 | 0.512617 | `keyvault_secret_get` | ❌ | +| 4 | 0.474664 | `keyvault_certificate_create` | ❌ | +| 5 | 0.462012 | `appconfig_kv_set` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 262 -======= -<<<<<<< HEAD -## Test 257 -======= -## Test 267 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 272 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_create` **Prompt:** Set a secret named with value in key vault @@ -11209,35 +5106,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.663094 | `keyvault_secret_create` | ✅ **EXPECTED** | -| 2 | 0.519306 | `keyvault_secret_get` | ❌ | +| 1 | 0.663147 | `keyvault_secret_create` | ✅ **EXPECTED** | +| 2 | 0.519601 | `keyvault_secret_get` | ❌ | | 3 | 0.512233 | `appconfig_kv_set` | ❌ | | 4 | 0.458502 | `keyvault_key_create` | ❌ | | 5 | 0.429785 | `appconfig_kv_lock_set` | ❌ | --- -<<<<<<< HEAD -## Test 263 -======= -<<<<<<< HEAD -## Test 258 -======= -## Test 268 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.663051 | `keyvault_secret_create` | ✅ **EXPECTED** | -| 2 | 0.519554 | `keyvault_secret_get` | ❌ | -| 3 | 0.512173 | `appconfig_kv_set` | ❌ 
| -| 4 | 0.458563 | `keyvault_key_create` | ❌ | -| 5 | 0.429786 | `appconfig_kv_lock_set` | ❌ | - ---- - ## Test 273 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_create` **Prompt:** Store secret value in the key vault @@ -11246,43 +5123,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.639897 | `keyvault_secret_create` | ✅ **EXPECTED** | -| 2 | 0.509526 | `keyvault_secret_get` | ❌ | +| 1 | 0.639917 | `keyvault_secret_create` | ✅ **EXPECTED** | +| 2 | 0.509674 | `keyvault_secret_get` | ❌ | | 3 | 0.485203 | `appconfig_kv_set` | ❌ | | 4 | 0.484680 | `keyvault_key_create` | ❌ | | 5 | 0.448995 | `appconfig_kv_lock_set` | ❌ | --- -<<<<<<< HEAD -## Test 264 -======= -## Test 259 -======= -| 1 | 0.639804 | `keyvault_secret_create` | ✅ **EXPECTED** | -| 2 | 0.509509 | `keyvault_secret_get` | ❌ | -| 3 | 0.485174 | `appconfig_kv_set` | ❌ | -| 4 | 0.484391 | `keyvault_key_create` | ❌ | -| 5 | 0.449001 | `appconfig_kv_lock_set` | ❌ | - ---- - -## Test 269 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.639908 | `keyvault_secret_create` | ✅ **EXPECTED** | -| 2 | 0.509778 | `keyvault_secret_get` | ❌ | -| 3 | 0.485096 | `appconfig_kv_set` | ❌ | -| 4 | 0.484619 | `keyvault_key_create` | ❌ | -| 5 | 0.448908 | `appconfig_kv_lock_set` | ❌ | - ---- - ## Test 274 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_create` **Prompt:** Add a new version of secret with value in vault @@ -11291,51 +5140,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.675145 | `keyvault_secret_create` | ✅ **EXPECTED** | -| 2 | 0.499276 | `keyvault_secret_get` | ❌ | -| 3 | 0.498228 | `keyvault_key_create` | ❌ | -| 4 | 0.479174 | `keyvault_certificate_import` | ❌ | -| 5 | 0.458574 | `appconfig_kv_set` | ❌ | - ---- - -## Test 265 
-======= -<<<<<<< HEAD -| 1 | 0.675147 | `keyvault_secret_create` | ✅ **EXPECTED** | -| 2 | 0.499602 | `keyvault_secret_get` | ❌ | -| 3 | 0.498196 | `keyvault_key_create` | ❌ | -| 4 | 0.479173 | `keyvault_certificate_import` | ❌ | -| 5 | 0.458587 | `appconfig_kv_set` | ❌ | - ---- - -## Test 260 -======= -| 1 | 0.675145 | `keyvault_secret_create` | ✅ **EXPECTED** | +| 1 | 0.675185 | `keyvault_secret_create` | ✅ **EXPECTED** | | 2 | 0.499612 | `keyvault_secret_get` | ❌ | | 3 | 0.498228 | `keyvault_key_create` | ❌ | -| 4 | 0.478700 | `keyvault_certificate_import` | ❌ | +| 4 | 0.479174 | `keyvault_certificate_import` | ❌ | | 5 | 0.458574 | `appconfig_kv_set` | ❌ | --- -## Test 270 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.675151 | `keyvault_secret_create` | ✅ **EXPECTED** | -| 2 | 0.499630 | `keyvault_secret_get` | ❌ | -| 3 | 0.498091 | `keyvault_key_create` | ❌ | -| 4 | 0.479063 | `keyvault_certificate_import` | ❌ | -| 5 | 0.458559 | `appconfig_kv_set` | ❌ | - ---- - ## Test 275 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_create` **Prompt:** Update secret to value in the key vault @@ -11344,51 +5157,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.571597 | `keyvault_secret_create` | ✅ **EXPECTED** | -| 2 | 0.513012 | `keyvault_secret_get` | ❌ | -| 3 | 0.441198 | `appconfig_kv_set` | ❌ | -| 4 | 0.417911 | `appconfig_kv_lock_set` | ❌ | -| 5 | 0.408739 | `keyvault_key_get` | ❌ | - ---- - -## Test 266 -======= -<<<<<<< HEAD -| 1 | 0.571716 | `keyvault_secret_create` | ✅ **EXPECTED** | -| 2 | 0.513963 | `keyvault_secret_get` | ❌ | -| 3 | 0.441281 | `appconfig_kv_set` | ❌ | -| 4 | 0.417998 | `appconfig_kv_lock_set` | ❌ | -| 5 | 0.408505 | `keyvault_key_get` | ❌ | - ---- - -## Test 261 -======= -| 1 | 0.571612 | `keyvault_secret_create` | ✅ **EXPECTED** | -| 2 | 0.513767 | 
`keyvault_secret_get` | ❌ | -| 3 | 0.441223 | `appconfig_kv_set` | ❌ | -| 4 | 0.417943 | `appconfig_kv_lock_set` | ❌ | -| 5 | 0.408242 | `keyvault_key_get` | ❌ | - ---- - -## Test 271 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.571590 | `keyvault_secret_create` | ✅ **EXPECTED** | -| 2 | 0.513749 | `keyvault_secret_get` | ❌ | -| 3 | 0.441094 | `appconfig_kv_set` | ❌ | -| 4 | 0.417832 | `appconfig_kv_lock_set` | ❌ | -| 5 | 0.408233 | `keyvault_key_get` | ❌ | +| 1 | 0.571490 | `keyvault_secret_create` | ✅ **EXPECTED** | +| 2 | 0.513686 | `keyvault_secret_get` | ❌ | +| 3 | 0.440666 | `appconfig_kv_set` | ❌ | +| 4 | 0.417799 | `appconfig_kv_lock_set` | ❌ | +| 5 | 0.407937 | `keyvault_key_get` | ❌ | --- ## Test 276 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_get` **Prompt:** Show me the secret in the key vault @@ -11397,46 +5174,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.602686 | `keyvault_secret_get` | ✅ **EXPECTED** | -| 2 | 0.505620 | `keyvault_key_get` | ❌ | -| 3 | 0.501397 | `keyvault_secret_create` | ❌ | -| 4 | 0.478769 | `keyvault_secret_list` | ❌ | -| 5 | 0.439521 | `keyvault_certificate_get` | ❌ | - ---- - -## Test 267 -======= -<<<<<<< HEAD -| 1 | 0.605040 | `keyvault_secret_get` | ✅ **EXPECTED** | -| 2 | 0.504063 | `keyvault_key_get` | ❌ | -| 3 | 0.502826 | `keyvault_secret_create` | ❌ | -| 4 | 0.479767 | `keyvault_secret_list` | ❌ | -| 5 | 0.440063 | `keyvault_certificate_get` | ❌ | - ---- - -## Test 262 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.602769 | `keyvault_secret_get` | ✅ **EXPECTED** | | 2 | 0.504212 | `keyvault_key_get` | ❌ | -| 3 | 0.501397 | `keyvault_secret_create` | ❌ | -| 4 | 0.478828 | `keyvault_secret_list` | ❌ | -| 5 | 0.439521 | `keyvault_certificate_get` | ❌ | +| 3 | 0.501412 | `keyvault_secret_create` | ❌ | +| 
4 | 0.478600 | `keyvault_secret_list` | ❌ | +| 5 | 0.442183 | `keyvault_certificate_get` | ❌ | --- -<<<<<<< HEAD -## Test 272 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 277 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_get` **Prompt:** Show me the details of the secret in the key vault @@ -11445,45 +5191,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.653920 | `keyvault_secret_get` | ✅ **EXPECTED** | -| 2 | 0.567036 | `keyvault_key_get` | ❌ | -| 3 | 0.517547 | `storage_account_get` | ❌ | -======= | 1 | 0.653871 | `keyvault_secret_get` | ✅ **EXPECTED** | | 2 | 0.566786 | `keyvault_key_get` | ❌ | -<<<<<<< HEAD -| 3 | 0.517355 | `storage_account_get` | ❌ | -======= -| 3 | 0.517561 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.496050 | `keyvault_certificate_get` | ❌ | -| 5 | 0.485249 | `keyvault_secret_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 268 -======= -<<<<<<< HEAD -## Test 263 -======= -## Test 273 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.653702 | `keyvault_secret_get` | ✅ **EXPECTED** | -| 2 | 0.566721 | `keyvault_key_get` | ❌ | -| 3 | 0.517433 | `storage_account_get` | ❌ | -| 4 | 0.495959 | `keyvault_certificate_get` | ❌ | -| 5 | 0.485474 | `keyvault_secret_list` | ❌ | +| 3 | 0.517547 | `storage_account_get` | ❌ | +| 4 | 0.499014 | `keyvault_certificate_get` | ❌ | +| 5 | 0.485117 | `keyvault_secret_list` | ❌ | --- ## Test 278 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_get` **Prompt:** Get the secret from vault @@ -11492,31 +5208,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| 
-| 1 | 0.578261 | `keyvault_secret_get` | ✅ **EXPECTED** | -| 2 | 0.493543 | `keyvault_key_get` | ❌ | -| 3 | 0.488705 | `keyvault_secret_create` | ❌ | -<<<<<<< HEAD -| 4 | 0.443676 | `keyvault_secret_list` | ❌ | -======= -| 4 | 0.443696 | `keyvault_secret_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.578479 | `keyvault_secret_get` | ✅ **EXPECTED** | +| 2 | 0.492213 | `keyvault_key_get` | ❌ | +| 3 | 0.488680 | `keyvault_secret_create` | ❌ | +| 4 | 0.443595 | `keyvault_secret_list` | ❌ | | 5 | 0.424167 | `keyvault_admin_settings_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 269 -======= -<<<<<<< HEAD -## Test 264 -======= -## Test 274 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 279 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_get` **Prompt:** Display the secret details for in vault @@ -11525,43 +5225,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.649423 | `keyvault_secret_get` | ✅ **EXPECTED** | -| 2 | 0.548102 | `keyvault_key_get` | ❌ | -| 3 | 0.497402 | `storage_account_get` | ❌ | -======= | 1 | 0.649267 | `keyvault_secret_get` | ✅ **EXPECTED** | | 2 | 0.546992 | `keyvault_key_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.497258 | `storage_account_get` | ❌ | -======= -| 3 | 0.497410 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.492583 | `keyvault_certificate_get` | ❌ | -| 5 | 0.491597 | `keyvault_secret_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 270 -======= -<<<<<<< HEAD -## Test 265 -======= -## Test 275 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 3 | 0.497402 | `storage_account_get` | ❌ | -| 4 | 0.492583 | `keyvault_certificate_get` | 
❌ | -| 5 | 0.491655 | `keyvault_secret_list` | ❌ | +| 4 | 0.494759 | `keyvault_certificate_get` | ❌ | +| 5 | 0.491412 | `keyvault_secret_list` | ❌ | --- ## Test 280 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_get` **Prompt:** Retrieve secret metadata for in vault @@ -11570,43 +5242,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.577338 | `keyvault_secret_get` | ✅ **EXPECTED** | -| 2 | 0.475492 | `keyvault_key_get` | ❌ | -| 3 | 0.466890 | `keyvault_secret_create` | ❌ | -| 4 | 0.447602 | `keyvault_secret_list` | ❌ | -<<<<<<< HEAD +| 1 | 0.577477 | `keyvault_secret_get` | ✅ **EXPECTED** | +| 2 | 0.475443 | `keyvault_key_get` | ❌ | +| 3 | 0.466873 | `keyvault_secret_create` | ❌ | +| 4 | 0.447533 | `keyvault_secret_list` | ❌ | | 5 | 0.439583 | `storage_account_get` | ❌ | --- -## Test 271 -======= -<<<<<<< HEAD -| 5 | 0.439381 | `storage_account_get` | ❌ | - ---- - -## Test 266 -======= -| 5 | 0.439597 | `storage_account_get` | ❌ | - ---- - -## Test 276 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.577471 | `keyvault_secret_get` | ✅ **EXPECTED** | -| 2 | 0.475432 | `keyvault_key_get` | ❌ | -| 3 | 0.466876 | `keyvault_secret_create` | ❌ | -| 4 | 0.447631 | `keyvault_secret_list` | ❌ | -| 5 | 0.439582 | `storage_account_get` | ❌ | - ---- - ## Test 281 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_list` **Prompt:** List all secrets in the key vault @@ -11615,44 +5259,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.701227 | `keyvault_secret_list` | ✅ **EXPECTED** | -<<<<<<< HEAD +| 1 | 0.701015 | `keyvault_secret_list` | ✅ **EXPECTED** | | 2 | 0.563736 | `keyvault_key_list` | ❌ | | 3 | 0.538337 | `keyvault_certificate_list` | ❌ | -| 4 | 0.499888 | `keyvault_secret_get` | ❌ | -| 5 | 0.455500 | `cosmos_account_list` | 
❌ | - ---- - -## Test 272 -======= -<<<<<<< HEAD -| 2 | 0.563760 | `keyvault_key_list` | ❌ | -======= -| 2 | 0.563694 | `keyvault_key_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -| 3 | 0.538337 | `keyvault_certificate_list` | ❌ | -======= -| 1 | 0.701255 | `keyvault_secret_list` | ✅ **EXPECTED** | -| 2 | 0.563736 | `keyvault_key_list` | ❌ | -| 3 | 0.538290 | `keyvault_certificate_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.499642 | `keyvault_secret_get` | ❌ | | 5 | 0.455500 | `cosmos_account_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 267 -======= -## Test 277 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 282 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_list` **Prompt:** Show me the secrets in the key vault @@ -11661,21 +5276,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.555681 | `keyvault_secret_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.544015 | `keyvault_secret_get` | ❌ | -| 3 | 0.498713 | `keyvault_key_get` | ❌ | -| 4 | 0.464661 | `keyvault_key_list` | ❌ | -| 5 | 0.453130 | `keyvault_admin_settings_get` | ❌ | - ---- - -## Test 273 -======= -======= -| 1 | 0.555768 | `keyvault_secret_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.555367 | `keyvault_secret_list` | ✅ **EXPECTED** | | 2 | 0.543861 | `keyvault_secret_get` | ❌ | | 3 | 0.497525 | `keyvault_key_get` | ❌ | | 4 | 0.464661 | `keyvault_key_list` | ❌ | @@ -11683,13 +5284,7 @@ --- -<<<<<<< HEAD -## Test 278 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 283 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_list` **Prompt:** What secrets are in the key vault ? 
@@ -11698,21 +5293,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.572540 | `keyvault_secret_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.529389 | `keyvault_secret_get` | ❌ | -| 3 | 0.493761 | `keyvault_key_list` | ❌ | -| 4 | 0.487620 | `keyvault_admin_settings_get` | ❌ | -| 5 | 0.476109 | `keyvault_key_get` | ❌ | - ---- - -## Test 274 -======= -======= -| 1 | 0.572620 | `keyvault_secret_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.572149 | `keyvault_secret_list` | ✅ **EXPECTED** | | 2 | 0.529258 | `keyvault_secret_get` | ❌ | | 3 | 0.493761 | `keyvault_key_list` | ❌ | | 4 | 0.487620 | `keyvault_admin_settings_get` | ❌ | @@ -11720,16 +5301,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 269 -======= -## Test 279 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 284 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_list` **Prompt:** List secrets names in vault @@ -11738,43 +5310,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.624290 | `keyvault_secret_list` | ✅ **EXPECTED** | -<<<<<<< HEAD +| 1 | 0.624070 | `keyvault_secret_list` | ✅ **EXPECTED** | | 2 | 0.559681 | `keyvault_key_list` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.559700 | `keyvault_key_list` | ❌ | -======= -| 2 | 0.559622 | `keyvault_key_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.517516 | `keyvault_certificate_list` | ❌ | -| 4 | 0.479771 | `keyvault_secret_get` | ❌ | -| 5 | 0.453295 | `storage_blob_container_get` | ❌ | - ---- - -<<<<<<< HEAD -## Test 275 -======= -<<<<<<< HEAD -## Test 270 -======= -## Test 280 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 
| 0.624379 | `keyvault_secret_list` | ✅ **EXPECTED** | -| 2 | 0.559681 | `keyvault_key_list` | ❌ | -| 3 | 0.517338 | `keyvault_certificate_list` | ❌ | | 4 | 0.479547 | `keyvault_secret_get` | ❌ | -| 5 | 0.454596 | `storage_blob_container_get` | ❌ | +| 5 | 0.454288 | `storage_blob_container_get` | ❌ | --- ## Test 285 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_list` **Prompt:** Enumerate secrets in key vault @@ -11783,42 +5327,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.742358 | `keyvault_secret_list` | ✅ **EXPECTED** | -<<<<<<< HEAD +| 1 | 0.742121 | `keyvault_secret_list` | ✅ **EXPECTED** | | 2 | 0.601183 | `keyvault_key_list` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.601234 | `keyvault_key_list` | ❌ | -======= -| 2 | 0.601079 | `keyvault_key_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.567827 | `keyvault_certificate_list` | ❌ | -| 4 | 0.496363 | `keyvault_secret_get` | ❌ | -======= -| 1 | 0.742378 | `keyvault_secret_list` | ✅ **EXPECTED** | -| 2 | 0.601183 | `keyvault_key_list` | ❌ | -| 3 | 0.567881 | `keyvault_certificate_list` | ❌ | | 4 | 0.496127 | `keyvault_secret_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.437560 | `keyvault_admin_settings_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 276 -======= -<<<<<<< HEAD -## Test 271 -======= -## Test 281 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 286 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `keyvault_secret_list` **Prompt:** Show secrets names in the key vault @@ -11827,38 +5344,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.567110 | `keyvault_secret_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.522600 | `keyvault_secret_get` | ❌ | -| 3 
| 0.476309 | `keyvault_key_list` | ❌ | -| 4 | 0.462711 | `keyvault_key_get` | ❌ | -| 5 | 0.462677 | `keyvault_secret_create` | ❌ | - ---- - -## Test 277 -======= -======= -| 1 | 0.567204 | `keyvault_secret_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.566824 | `keyvault_secret_list` | ✅ **EXPECTED** | | 2 | 0.522398 | `keyvault_secret_get` | ❌ | | 3 | 0.476309 | `keyvault_key_list` | ❌ | -| 4 | 0.462676 | `keyvault_secret_create` | ❌ | +| 4 | 0.462720 | `keyvault_secret_create` | ❌ | | 5 | 0.461326 | `keyvault_key_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 272 -======= -## Test 282 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 287 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_cluster_get` **Prompt:** Get the configuration of AKS cluster @@ -11868,26 +5362,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.588300 | `aks_cluster_get` | ✅ **EXPECTED** | -| 2 | 0.544302 | `aks_nodepool_get` | ❌ | +| 2 | 0.544309 | `aks_nodepool_get` | ❌ | | 3 | 0.517279 | `kusto_cluster_get` | ❌ | | 4 | 0.481416 | `mysql_server_config_get` | ❌ | -| 5 | 0.430976 | `postgres_server_config_get` | ❌ | +| 5 | 0.430975 | `postgres_server_config_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 278 -======= -<<<<<<< HEAD -## Test 273 -======= -## Test 283 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 288 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_cluster_get` **Prompt:** Show me the details of AKS cluster in resource group @@ -11896,46 +5378,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.621759 | `aks_cluster_get` | ✅ **EXPECTED** | -| 2 | 0.575626 | `aks_nodepool_get` | ❌ | -| 3 | 0.567870 | `kusto_cluster_get` | ❌ | 
-| 4 | 0.461466 | `sql_db_show` | ❌ | -| 5 | 0.444327 | `monitor_webtests_get` | ❌ | - ---- - -## Test 279 -======= -<<<<<<< HEAD -| 1 | 0.621536 | `aks_cluster_get` | ✅ **EXPECTED** | -| 2 | 0.575434 | `aks_nodepool_get` | ❌ | -| 3 | 0.567416 | `kusto_cluster_get` | ❌ | -| 4 | 0.461358 | `sql_db_show` | ❌ | -| 5 | 0.445310 | `monitor_webtests_get` | ❌ | - ---- - -## Test 274 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.621759 | `aks_cluster_get` | ✅ **EXPECTED** | -| 2 | 0.575625 | `aks_nodepool_get` | ❌ | +| 2 | 0.575634 | `aks_nodepool_get` | ❌ | | 3 | 0.567870 | `kusto_cluster_get` | ❌ | | 4 | 0.461466 | `sql_db_show` | ❌ | | 5 | 0.444327 | `monitor_webtests_get` | ❌ | --- -<<<<<<< HEAD -## Test 284 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 289 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_cluster_get` **Prompt:** Show me the network configuration for AKS cluster @@ -11945,29 +5396,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.522525 | `aks_cluster_get` | ✅ **EXPECTED** | -| 2 | 0.483220 | `aks_nodepool_get` | ❌ | +| 2 | 0.483225 | `aks_nodepool_get` | ❌ | | 3 | 0.434684 | `kusto_cluster_get` | ❌ | | 4 | 0.380301 | `mysql_server_config_get` | ❌ | | 5 | 0.366689 | `kusto_cluster_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 280 -======= -## Test 275 -======= -| 5 | 0.366594 | `kusto_cluster_list` | ❌ | - ---- - -## Test 285 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 290 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_cluster_get` **Prompt:** What are the details of my AKS cluster in ? 
@@ -11977,38 +5413,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.588634 | `aks_cluster_get` | ✅ **EXPECTED** | -| 2 | 0.550555 | `aks_nodepool_get` | ❌ | +| 2 | 0.550582 | `aks_nodepool_get` | ❌ | | 3 | 0.527511 | `kusto_cluster_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD | 4 | 0.445722 | `storage_account_get` | ❌ | -======= -<<<<<<< HEAD -| 4 | 0.445813 | `storage_account_get` | ❌ | -======= -| 4 | 0.445833 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.445722 | `storage_account_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.435597 | `foundry_resource_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 281 -======= -<<<<<<< HEAD -## Test 276 -======= -## Test 286 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 291 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_cluster_get` **Prompt:** List all AKS clusters in my subscription @@ -12019,39 +5431,13 @@ |------|-------|------|--------| | 1 | 0.756471 | `aks_cluster_get` | ✅ **EXPECTED** | | 2 | 0.749416 | `kusto_cluster_list` | ❌ | -| 3 | 0.590166 | `aks_nodepool_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.568635 | `kusto_database_list` | ❌ | -| 5 | 0.560522 | `search_service_list` | ❌ | - ---- - -## Test 282 -======= -| 4 | 0.568440 | `kusto_database_list` | ❌ | -======= -| 2 | 0.749293 | `kusto_cluster_list` | ❌ | -| 3 | 0.590166 | `aks_nodepool_get` | ❌ | -| 4 | 0.568301 | `kusto_database_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -======= -| 4 | 0.568403 | `kusto_database_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 3 | 0.590161 | `aks_nodepool_get` | ❌ | +| 4 | 0.568502 | `kusto_database_list` | ❌ | | 5 | 0.562043 | `search_service_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< 
HEAD -## Test 277 -======= -## Test 287 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 292 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_cluster_get` **Prompt:** Show me my Azure Kubernetes Service clusters @@ -12060,42 +5446,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.612123 | `aks_cluster_get` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.586661 | `kusto_cluster_list` | ❌ | -| 3 | 0.507757 | `aks_nodepool_get` | ❌ | -| 4 | 0.489724 | `kusto_cluster_get` | ❌ | -| 5 | 0.462950 | `kusto_database_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 283 -======= -## Test 278 -======= -| 2 | 0.586466 | `kusto_cluster_list` | ❌ | -| 3 | 0.507757 | `aks_nodepool_get` | ❌ | +| 3 | 0.507701 | `aks_nodepool_get` | ❌ | | 4 | 0.489724 | `kusto_cluster_get` | ❌ | -| 5 | 0.462718 | `kusto_database_list` | ❌ | - ---- - -## Test 288 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.612043 | `aks_cluster_get` | ✅ **EXPECTED** | -| 2 | 0.586527 | `kusto_cluster_list` | ❌ | -| 3 | 0.507689 | `aks_nodepool_get` | ❌ | -| 4 | 0.489677 | `kusto_cluster_get` | ❌ | -| 5 | 0.462776 | `kusto_database_list` | ❌ | +| 5 | 0.462957 | `kusto_database_list` | ❌ | --- ## Test 293 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_cluster_get` **Prompt:** What AKS clusters do I have? 
@@ -12104,40 +5463,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.628470 | `aks_cluster_get` | ✅ **EXPECTED** | -| 2 | 0.563211 | `aks_nodepool_get` | ❌ | -| 3 | 0.526840 | `kusto_cluster_list` | ❌ | -| 4 | 0.426233 | `kusto_cluster_get` | ❌ | -| 5 | 0.409379 | `kusto_database_list` | ❌ | - ---- - -## Test 284 -======= | 1 | 0.628429 | `aks_cluster_get` | ✅ **EXPECTED** | -| 2 | 0.563189 | `aks_nodepool_get` | ❌ | +| 2 | 0.563208 | `aks_nodepool_get` | ❌ | | 3 | 0.526756 | `kusto_cluster_list` | ❌ | | 4 | 0.426157 | `kusto_cluster_get` | ❌ | -| 5 | 0.409103 | `kusto_database_list` | ❌ | +| 5 | 0.409308 | `kusto_database_list` | ❌ | --- -<<<<<<< HEAD -## Test 279 -======= -| 3 | 0.526670 | `kusto_cluster_list` | ❌ | -| 4 | 0.426157 | `kusto_cluster_get` | ❌ | -| 5 | 0.409404 | `kusto_database_list` | ❌ | - ---- - -## Test 289 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 294 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_nodepool_get` **Prompt:** Get details for nodepool in AKS cluster in @@ -12146,32 +5480,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.728569 | `aks_nodepool_get` | ✅ **EXPECTED** | -| 2 | 0.516573 | `kusto_cluster_get` | ❌ | -| 3 | 0.509314 | `aks_cluster_get` | ❌ | -| 4 | 0.468516 | `virtualdesktop_hostpool_list` | ❌ | -| 5 | 0.463185 | `sql_elastic-pool_list` | ❌ | - ---- - -## Test 285 -======= -<<<<<<< HEAD -| 1 | 0.729136 | `aks_nodepool_get` | ✅ **EXPECTED** | -| 2 | 0.517116 | `kusto_cluster_get` | ❌ | -| 3 | 0.510014 | `aks_cluster_get` | ❌ | -| 4 | 0.468597 | `virtualdesktop_hostpool_list` | ❌ | -| 5 | 0.463489 | `sql_elastic-pool_list` | ❌ | - ---- - -## Test 280 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.728937 | `aks_nodepool_get` | ✅ **EXPECTED** | +| 1 | 0.728958 | 
`aks_nodepool_get` | ✅ **EXPECTED** | | 2 | 0.517021 | `kusto_cluster_get` | ❌ | | 3 | 0.509820 | `aks_cluster_get` | ❌ | | 4 | 0.468392 | `virtualdesktop_hostpool_list` | ❌ | @@ -12179,13 +5488,7 @@ --- -<<<<<<< HEAD -## Test 290 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 295 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_nodepool_get` **Prompt:** Show me the configuration for nodepool in AKS cluster in resource group @@ -12194,34 +5497,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.654106 | `aks_nodepool_get` | ✅ **EXPECTED** | -| 2 | 0.458596 | `sql_elastic-pool_list` | ❌ | -| 3 | 0.446035 | `aks_cluster_get` | ❌ | -| 4 | 0.440273 | `virtualdesktop_hostpool_list` | ❌ | -| 5 | 0.413758 | `kusto_cluster_get` | ❌ | - ---- - -<<<<<<< HEAD -<<<<<<< HEAD -## Test 286 -======= -## Test 281 -======= -| 1 | 0.654031 | `aks_nodepool_get` | ✅ **EXPECTED** | -| 2 | 0.458651 | `sql_elastic-pool_list` | ❌ | -| 3 | 0.445952 | `aks_cluster_get` | ❌ | -| 4 | 0.440187 | `virtualdesktop_hostpool_list` | ❌ | -| 5 | 0.413711 | `kusto_cluster_get` | ❌ | +| 1 | 0.654192 | `aks_nodepool_get` | ✅ **EXPECTED** | +| 2 | 0.458497 | `sql_elastic-pool_list` | ❌ | +| 3 | 0.446296 | `aks_cluster_get` | ❌ | +| 4 | 0.440322 | `virtualdesktop_hostpool_list` | ❌ | +| 5 | 0.414154 | `kusto_cluster_get` | ❌ | --- -## Test 291 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 296 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_nodepool_get` **Prompt:** What is the setup of nodepool for AKS cluster in ? 
@@ -12230,27 +5514,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.592806 | `aks_nodepool_get` | ✅ **EXPECTED** | -| 2 | 0.402556 | `aks_cluster_get` | ❌ | -| 3 | 0.385218 | `virtualdesktop_hostpool_list` | ❌ | -| 4 | 0.383045 | `sql_elastic-pool_list` | ❌ | -| 5 | 0.355090 | `kusto_cluster_get` | ❌ | +| 1 | 0.592931 | `aks_nodepool_get` | ✅ **EXPECTED** | +| 2 | 0.402605 | `aks_cluster_get` | ❌ | +| 3 | 0.385275 | `virtualdesktop_hostpool_list` | ❌ | +| 4 | 0.383112 | `sql_elastic-pool_list` | ❌ | +| 5 | 0.355131 | `kusto_cluster_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 287 -======= -<<<<<<< HEAD -## Test 282 -======= -## Test 292 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 297 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_nodepool_get` **Prompt:** List nodepools for AKS cluster in @@ -12259,32 +5531,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.692231 | `aks_nodepool_get` | ✅ **EXPECTED** | -| 2 | 0.519037 | `aks_cluster_get` | ❌ | -| 3 | 0.506720 | `virtualdesktop_hostpool_list` | ❌ | -| 4 | 0.500749 | `kusto_cluster_list` | ❌ | -| 5 | 0.487707 | `sql_elastic-pool_list` | ❌ | - ---- - -## Test 288 -======= -<<<<<<< HEAD -| 1 | 0.692264 | `aks_nodepool_get` | ✅ **EXPECTED** | -| 2 | 0.519034 | `aks_cluster_get` | ❌ | -| 3 | 0.506649 | `virtualdesktop_hostpool_list` | ❌ | -| 4 | 0.500705 | `kusto_cluster_list` | ❌ | -| 5 | 0.487723 | `sql_elastic-pool_list` | ❌ | - ---- - -## Test 283 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.692231 | `aks_nodepool_get` | ✅ **EXPECTED** | +| 1 | 0.692235 | `aks_nodepool_get` | ✅ **EXPECTED** | | 2 | 0.519037 | `aks_cluster_get` | ❌ | | 3 | 0.506624 | `virtualdesktop_hostpool_list` | ❌ | | 4 | 0.500749 | `kusto_cluster_list` | ❌ | @@ -12292,13 +5539,7 @@ --- -<<<<<<< HEAD -## 
Test 293 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 298 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_nodepool_get` **Prompt:** Show me the nodepool list for AKS cluster in @@ -12307,38 +5548,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.732132 | `aks_nodepool_get` | ✅ **EXPECTED** | +| 1 | 0.732102 | `aks_nodepool_get` | ✅ **EXPECTED** | | 2 | 0.561829 | `aks_cluster_get` | ❌ | | 3 | 0.510269 | `sql_elastic-pool_list` | ❌ | -<<<<<<< HEAD -| 4 | 0.509840 | `virtualdesktop_hostpool_list` | ❌ | -======= | 4 | 0.509732 | `virtualdesktop_hostpool_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.486700 | `kusto_cluster_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 289 -======= -## Test 284 -======= -| 5 | 0.486544 | `kusto_cluster_list` | ❌ | - ---- - -## Test 294 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 299 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `aks_nodepool_get` **Prompt:** What nodepools do I have for AKS cluster in @@ -12347,35 +5565,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.629358 | `aks_nodepool_get` | ✅ **EXPECTED** | -| 2 | 0.456911 | `aks_cluster_get` | ❌ | -<<<<<<< HEAD -| 3 | 0.443940 | `virtualdesktop_hostpool_list` | ❌ | -======= -| 3 | 0.443902 | `virtualdesktop_hostpool_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.433006 | `kusto_cluster_list` | ❌ | -| 5 | 0.425448 | `sql_elastic-pool_list` | ❌ | +| 1 | 0.629316 | `aks_nodepool_get` | ✅ **EXPECTED** | +| 2 | 0.456894 | `aks_cluster_get` | ❌ 
| +| 3 | 0.443957 | `virtualdesktop_hostpool_list` | ❌ | +| 4 | 0.432997 | `kusto_cluster_list` | ❌ | +| 5 | 0.425570 | `sql_elastic-pool_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 290 -======= -<<<<<<< HEAD -## Test 285 -======= -## Test 295 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 300 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_test_create` **Prompt:** Create a basic URL test using the following endpoint URL that runs for 30 minutes with 45 virtual users. The test name is with the test id and the load testing resource is in the resource group in my subscription @@ -12384,27 +5582,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.579172 | `loadtesting_test_create` | ✅ **EXPECTED** | -| 2 | 0.520449 | `loadtesting_testresource_create` | ❌ | -| 3 | 0.513419 | `loadtesting_testrun_create` | ❌ | -| 4 | 0.473951 | `monitor_webtests_create` | ❌ | -| 5 | 0.461959 | `loadtesting_testresource_list` | ❌ | +| 1 | 0.577811 | `loadtesting_test_create` | ✅ **EXPECTED** | +| 2 | 0.519418 | `loadtesting_testresource_create` | ❌ | +| 3 | 0.512099 | `loadtesting_testrun_create` | ❌ | +| 4 | 0.472777 | `monitor_webtests_create` | ❌ | +| 5 | 0.460717 | `loadtesting_testresource_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 291 -======= -<<<<<<< HEAD -## Test 286 -======= -## Test 296 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 301 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_test_get` **Prompt:** Get the load test with id in the load test resource in resource group @@ -12413,104 +5599,32 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.626226 | `loadtesting_testresource_list` | ❌ | | 2 | 0.619944 | 
`loadtesting_test_get` | ✅ **EXPECTED** | | 3 | 0.594666 | `loadtesting_testresource_create` | ❌ | -| 4 | 0.590698 | `monitor_webtests_get` | ❌ | -| 5 | 0.536024 | `monitor_webtests_list` | ❌ | +| 4 | 0.590697 | `monitor_webtests_get` | ❌ | +| 5 | 0.535650 | `monitor_webtests_list` | ❌ | --- -## Test 292 -======= -<<<<<<< HEAD -| 1 | 0.626213 | `loadtesting_testresource_list` | ❌ | -| 2 | 0.620147 | `loadtesting_test_get` | ✅ **EXPECTED** | -| 3 | 0.594630 | `loadtesting_testresource_create` | ❌ | -| 4 | 0.591112 | `monitor_webtests_get` | ❌ | -| 5 | 0.535891 | `monitor_webtests_list` | ❌ | +## Test 302 ---- +**Expected Tool:** `loadtesting_testresource_create` +**Prompt:** Create a load test resource in the resource group in my subscription -## Test 287 -======= -| 1 | 0.626271 | `loadtesting_testresource_list` | ❌ | -| 2 | 0.620094 | `loadtesting_test_get` | ✅ **EXPECTED** | -| 3 | 0.594881 | `loadtesting_testresource_create` | ❌ | -| 4 | 0.590679 | `monitor_webtests_get` | ❌ | -| 5 | 0.537187 | `monitor_webtests_list` | ❌ | - ---- - -## Test 297 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.626226 | `loadtesting_testresource_list` | ❌ | -| 2 | 0.619944 | `loadtesting_test_get` | ✅ **EXPECTED** | -| 3 | 0.594666 | `loadtesting_testresource_create` | ❌ | -| 4 | 0.590697 | `monitor_webtests_get` | ❌ | -| 5 | 0.536024 | `monitor_webtests_list` | ❌ | - ---- - -## Test 302 ->>>>>>> e2fd2eac (refactor tts mcp tool) - -**Expected Tool:** `loadtesting_testresource_create` -**Prompt:** Create a load test resource in the resource group in my subscription - -### Results +### Results | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.645537 | `loadtesting_testresource_create` | ✅ **EXPECTED** | -| 2 | 0.618773 | `loadtesting_testresource_list` | ❌ | -| 3 | 0.541696 | `loadtesting_test_create` | ❌ | -| 4 | 0.539771 | 
`loadtesting_testrun_create` | ❌ | -| 5 | 0.526684 | `monitor_webtests_list` | ❌ | - ---- - -## Test 293 -======= -<<<<<<< HEAD -| 1 | 0.645750 | `loadtesting_testresource_create` | ✅ **EXPECTED** | -| 2 | 0.618984 | `loadtesting_testresource_list` | ❌ | -| 3 | 0.541950 | `loadtesting_test_create` | ❌ | -| 4 | 0.539866 | `loadtesting_testrun_create` | ❌ | -| 5 | 0.526644 | `monitor_webtests_list` | ❌ | - ---- - -## Test 288 -======= | 1 | 0.645537 | `loadtesting_testresource_create` | ✅ **EXPECTED** | | 2 | 0.618773 | `loadtesting_testresource_list` | ❌ | | 3 | 0.541746 | `loadtesting_test_create` | ❌ | | 4 | 0.539771 | `loadtesting_testrun_create` | ❌ | -| 5 | 0.525628 | `monitor_webtests_list` | ❌ | - ---- - -## Test 298 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.644693 | `loadtesting_testresource_create` | ✅ **EXPECTED** | -| 2 | 0.618375 | `loadtesting_testresource_list` | ❌ | -| 3 | 0.541221 | `loadtesting_test_create` | ❌ | -| 4 | 0.540031 | `loadtesting_testrun_create` | ❌ | -| 5 | 0.526768 | `monitor_webtests_list` | ❌ | +| 5 | 0.526226 | `monitor_webtests_list` | ❌ | --- ## Test 303 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_testresource_list` **Prompt:** List all load testing resources in the resource group in my subscription @@ -12520,38 +5634,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.794326 | `loadtesting_testresource_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.653165 | `monitor_webtests_list` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.653137 | `monitor_webtests_list` | ❌ | -======= -| 2 | 0.651533 | `monitor_webtests_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 2 | 0.653165 | `monitor_webtests_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 3 | 
0.577408 | `group_list` | ❌ | +| 2 | 0.652990 | `monitor_webtests_list` | ❌ | +| 3 | 0.577427 | `group_list` | ❌ | | 4 | 0.575172 | `loadtesting_testresource_create` | ❌ | | 5 | 0.565565 | `datadog_monitoredresources_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 294 -======= -<<<<<<< HEAD -## Test 289 -======= -## Test 299 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 304 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_testrun_create` **Prompt:** Create a test run using the id for test in the load testing resource in resource group . Use the name of test run and description as @@ -12561,31 +5651,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.688976 | `loadtesting_testrun_create` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.594879 | `loadtesting_testrun_update` | ❌ | -| 3 | 0.558566 | `loadtesting_test_create` | ❌ | -======= -| 2 | 0.594779 | `loadtesting_testrun_update` | ❌ | | 3 | 0.558636 | `loadtesting_test_create` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.547102 | `loadtesting_testresource_create` | ❌ | | 5 | 0.496224 | `loadtesting_testresource_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 295 -======= -<<<<<<< HEAD -## Test 290 -======= -## Test 300 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 305 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_testrun_get` **Prompt:** Get the load test run with id in the load test resource in resource group @@ -12594,43 +5667,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.619146 | `loadtesting_testresource_list` | ❌ | -| 2 | 0.601927 | `loadtesting_test_get` | ❌ | -| 3 | 0.597430 | `loadtesting_testresource_create` | ❌ | -<<<<<<< HEAD -| 4 | 0.577532 | 
`monitor_webtests_get` | ❌ | -======= -<<<<<<< HEAD -| 4 | 0.577924 | `monitor_webtests_get` | ❌ | -======= -| 4 | 0.577532 | `monitor_webtests_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 5 | 0.565996 | `loadtesting_testrun_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 296 -======= -<<<<<<< HEAD -## Test 291 -======= -## Test 301 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.618926 | `loadtesting_testresource_list` | ❌ | -| 2 | 0.602281 | `loadtesting_test_get` | ❌ | -| 3 | 0.596851 | `loadtesting_testresource_create` | ❌ | -| 4 | 0.577610 | `monitor_webtests_get` | ❌ | -| 5 | 0.566147 | `loadtesting_testrun_list` | ❌ | +| 1 | 0.618909 | `loadtesting_testresource_list` | ❌ | +| 2 | 0.601963 | `loadtesting_test_get` | ❌ | +| 3 | 0.597266 | `loadtesting_testresource_create` | ❌ | +| 4 | 0.577220 | `monitor_webtests_get` | ❌ | +| 5 | 0.566095 | `loadtesting_testrun_list` | ❌ | --- ## Test 306 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_testrun_list` **Prompt:** Get all the load test runs for the test with id in the load test resource in resource group @@ -12639,47 +5684,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.669307 | `loadtesting_testresource_list` | ❌ | -| 2 | 0.640644 | `loadtesting_testrun_list` | ✅ **EXPECTED** | -| 3 | 0.600977 | `loadtesting_test_get` | ❌ | -| 4 | 0.577403 | `loadtesting_testresource_create` | ❌ | -| 5 | 0.569287 | `monitor_webtests_list` | ❌ | - ---- - -## Test 297 -======= | 1 | 0.669180 | `loadtesting_testresource_list` | ❌ | | 2 | 0.640360 | `loadtesting_testrun_list` | ✅ **EXPECTED** | | 3 | 0.601075 | `loadtesting_test_get` | ❌ | | 4 | 0.577460 | `loadtesting_testresource_create` | ❌ | -<<<<<<< HEAD -| 5 | 0.569963 | `monitor_webtests_get` | 
❌ | - ---- - -## Test 292 -======= | 5 | 0.569424 | `monitor_webtests_get` | ❌ | --- -## Test 302 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.669160 | `loadtesting_testresource_list` | ❌ | -| 2 | 0.640500 | `loadtesting_testrun_list` | ✅ **EXPECTED** | -| 3 | 0.601136 | `loadtesting_test_get` | ❌ | -| 4 | 0.577398 | `loadtesting_testresource_create` | ❌ | -| 5 | 0.569408 | `monitor_webtests_get` | ❌ | - ---- - ## Test 307 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `loadtesting_testrun_update` **Prompt:** Update a test run display name as for the id for test in the load testing resource in resource group . @@ -12688,43 +5701,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.706390 | `loadtesting_testrun_update` | ✅ **EXPECTED** | +| 1 | 0.706747 | `loadtesting_testrun_update` | ✅ **EXPECTED** | | 2 | 0.514428 | `loadtesting_testrun_create` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.486977 | `monitor_webtests_update` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.486980 | `monitor_webtests_update` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.470337 | `loadtesting_testresource_list` | ❌ | -<<<<<<< HEAD | 5 | 0.468374 | `monitor_webtests_get` | ❌ | --- -<<<<<<< HEAD -## Test 298 -======= -## Test 293 -======= -| 3 | 0.487022 | `monitor_webtests_update` | ❌ | -| 4 | 0.470337 | `loadtesting_testresource_list` | ❌ | -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 5 | 0.468374 | `monitor_webtests_get` | ❌ | - ---- - -<<<<<<< HEAD -## Test 303 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 308 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `grafana_list` **Prompt:** List all Azure Managed Grafana in one subscription @@ 
-12735,31 +5720,13 @@ |------|-------|------|--------| | 1 | 0.599427 | `kusto_cluster_list` | ❌ | | 2 | 0.578892 | `grafana_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 3 | 0.550372 | `subscription_list` | ❌ | -| 4 | 0.549957 | `search_service_list` | ❌ | -| 5 | 0.531259 | `redis_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 299 -======= -<<<<<<< HEAD -## Test 294 -======= -## Test 304 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 3 | 0.551851 | `search_service_list` | ❌ | | 4 | 0.550372 | `subscription_list` | ❌ | -| 5 | 0.531277 | `redis_list` | ❌ | +| 5 | 0.531259 | `redis_list` | ❌ | --- ## Test 309 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `managedlustre_fs_create` **Prompt:** Create an Azure Managed Lustre filesystem with name , size , SKU , and subnet for availability zone in location . Maintenance should occur on at @@ -12768,49 +5735,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.726553 | `managedlustre_fs_create` | ✅ **EXPECTED** | -| 2 | 0.616164 | `managedlustre_fs_list` | ❌ | -| 3 | 0.605701 | `managedlustre_fs_sku_get` | ❌ | -| 4 | 0.598215 | `managedlustre_fs_update` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.728113 | `managedlustre_fs_create` | ✅ **EXPECTED** | | 2 | 0.616164 | `managedlustre_fs_list` | ❌ | | 3 | 0.605775 | `managedlustre_fs_sku_get` | ❌ | -<<<<<<< HEAD | 4 | 0.598255 | `managedlustre_fs_update` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.598293 | `managedlustre_fs_update` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.557720 | `managedlustre_fs_subnetsize_validate` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 300 -======= -## Test 295 -======= -| 1 | 0.728113 | `managedlustre_filesystem_create` | ❌ | -| 2 | 0.616164 | `managedlustre_filesystem_list` | ❌ | -| 3 
| 0.605775 | `managedlustre_filesystem_sku_get` | ❌ | -| 4 | 0.598255 | `managedlustre_filesystem_update` | ❌ | -| 5 | 0.557720 | `managedlustre_filesystem_subnetsize_validate` | ❌ | - ---- - -## Test 305 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 310 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `managedlustre_fs_list` **Prompt:** List the Azure Managed Lustre filesystems in my subscription @@ -12819,50 +5752,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.750675 | `managedlustre_fs_list` | ✅ **EXPECTED** | -| 2 | 0.631730 | `managedlustre_fs_sku_get` | ❌ | -| 3 | 0.579855 | `managedlustre_fs_create` | ❌ | -| 4 | 0.562377 | `kusto_cluster_list` | ❌ | -| 5 | 0.512086 | `search_service_list` | ❌ | - ---- - -## Test 301 -======= -<<<<<<< HEAD -| 1 | 0.750302 | `managedlustre_fs_list` | ✅ **EXPECTED** | | 2 | 0.631770 | `managedlustre_fs_sku_get` | ❌ | | 3 | 0.582660 | `managedlustre_fs_create` | ❌ | | 4 | 0.562377 | `kusto_cluster_list` | ❌ | -======= -| 1 | 0.750675 | `managedlustre_filesystem_list` | ❌ | -| 2 | 0.631770 | `managedlustre_filesystem_sku_get` | ❌ | -| 3 | 0.582660 | `managedlustre_filesystem_create` | ❌ | -| 4 | 0.562520 | `kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.513156 | `search_service_list` | ❌ | --- -<<<<<<< HEAD -## Test 296 -======= -## Test 306 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.750667 | `managedlustre_fs_list` | ✅ **EXPECTED** | -| 2 | 0.631727 | `managedlustre_fs_sku_get` | ❌ | -| 3 | 0.582749 | `managedlustre_fs_create` | ❌ | -| 4 | 0.562295 | `kusto_cluster_list` | ❌ | -| 5 | 0.513090 | `search_service_list` | ❌ | - ---- - ## Test 311 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected 
Tool:** `managedlustre_fs_list` **Prompt:** List the Azure Managed Lustre filesystems in my resource group @@ -12871,45 +5769,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.743903 | `managedlustre_fs_list` | ✅ **EXPECTED** | -| 2 | 0.613164 | `managedlustre_fs_sku_get` | ❌ | -| 3 | 0.563081 | `managedlustre_fs_create` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.743639 | `managedlustre_fs_list` | ✅ **EXPECTED** | -| 2 | 0.613217 | `managedlustre_fs_sku_get` | ❌ | -| 3 | 0.565856 | `managedlustre_fs_create` | ❌ | -======= -| 1 | 0.743903 | `managedlustre_filesystem_list` | ❌ | -| 2 | 0.613217 | `managedlustre_filesystem_sku_get` | ❌ | -| 3 | 0.565856 | `managedlustre_filesystem_create` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.743903 | `managedlustre_fs_list` | ✅ **EXPECTED** | -| 2 | 0.613217 | `managedlustre_fs_sku_get` | ❌ | -| 3 | 0.565856 | `managedlustre_fs_create` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.519986 | `datadog_monitoredresources_list` | ❌ | -| 5 | 0.515433 | `loadtesting_testresource_list` | ❌ | +| 1 | 0.743881 | `managedlustre_fs_list` | ✅ **EXPECTED** | +| 2 | 0.613165 | `managedlustre_fs_sku_get` | ❌ | +| 3 | 0.565820 | `managedlustre_fs_create` | ❌ | +| 4 | 0.520005 | `datadog_monitoredresources_list` | ❌ | +| 5 | 0.515449 | `loadtesting_testresource_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 302 -======= -<<<<<<< HEAD -## Test 297 -======= -## Test 307 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 312 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `managedlustre_fs_sku_get` **Prompt:** List the Azure Managed Lustre SKUs available in location @@ -12918,44 +5786,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| 
-<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.827360 | `managedlustre_fs_sku_get` | ✅ **EXPECTED** | -| 2 | 0.613674 | `managedlustre_fs_list` | ❌ | -| 3 | 0.511625 | `managedlustre_fs_create` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.827381 | `managedlustre_fs_sku_get` | ✅ **EXPECTED** | | 2 | 0.613674 | `managedlustre_fs_list` | ❌ | | 3 | 0.513242 | `managedlustre_fs_create` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.496242 | `managedlustre_fs_subnetsize_validate` | ❌ | | 5 | 0.470241 | `kusto_cluster_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 303 -======= -## Test 298 -======= -| 1 | 0.827381 | `managedlustre_filesystem_sku_get` | ❌ | -| 2 | 0.613674 | `managedlustre_filesystem_list` | ❌ | -| 3 | 0.513242 | `managedlustre_filesystem_create` | ❌ | -| 4 | 0.496242 | `managedlustre_filesystem_subnetsize_validate` | ❌ | -| 5 | 0.470347 | `kusto_cluster_list` | ❌ | - ---- - -## Test 308 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 313 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `managedlustre_fs_subnetsize_ask` **Prompt:** Tell me how many IP addresses I need for an Azure Managed Lustre filesystem of size using the SKU @@ -12964,41 +5803,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.739766 | `managedlustre_fs_subnetsize_ask` | ✅ **EXPECTED** | -| 2 | 0.651598 | `managedlustre_fs_subnetsize_validate` | ❌ | -| 3 | 0.594536 | `managedlustre_fs_sku_get` | ❌ | -| 4 | 0.559498 | `managedlustre_fs_list` | ❌ | -| 5 | 0.533351 | `managedlustre_fs_create` | ❌ | - ---- - -## Test 304 -======= -<<<<<<< HEAD -| 1 | 0.739679 | `managedlustre_fs_subnetsize_ask` | ✅ **EXPECTED** | -| 2 | 0.651615 | `managedlustre_fs_subnetsize_validate` | ❌ | -| 3 | 0.594695 | `managedlustre_fs_sku_get` | ❌ | -| 4 | 0.559034 | 
`managedlustre_fs_list` | ❌ | -| 5 | 0.533796 | `managedlustre_fs_create` | ❌ | - ---- - -## Test 299 -======= -| 1 | 0.739721 | `managedlustre_filesystem_subnetsize_ask` | ❌ | -| 2 | 0.651551 | `managedlustre_filesystem_subnetsize_validate` | ❌ | -| 3 | 0.594559 | `managedlustre_filesystem_sku_get` | ❌ | -| 4 | 0.559415 | `managedlustre_filesystem_list` | ❌ | -| 5 | 0.533625 | `managedlustre_filesystem_create` | ❌ | - ---- - -## Test 309 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.739766 | `managedlustre_fs_subnetsize_ask` | ✅ **EXPECTED** | | 2 | 0.651598 | `managedlustre_fs_subnetsize_validate` | ❌ | | 3 | 0.594585 | `managedlustre_fs_sku_get` | ❌ | @@ -13008,7 +5812,6 @@ --- ## Test 314 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `managedlustre_fs_subnetsize_validate` **Prompt:** Validate if the network can host Azure Managed Lustre filesystem of size using the SKU @@ -13017,51 +5820,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.879240 | `managedlustre_fs_subnetsize_validate` | ✅ **EXPECTED** | -| 2 | 0.622368 | `managedlustre_fs_subnetsize_ask` | ❌ | -| 3 | 0.542555 | `managedlustre_fs_sku_get` | ❌ | -| 4 | 0.516032 | `managedlustre_fs_create` | ❌ | -| 5 | 0.480796 | `managedlustre_fs_list` | ❌ | - ---- - -## Test 305 -======= -<<<<<<< HEAD -| 1 | 0.879541 | `managedlustre_fs_subnetsize_validate` | ✅ **EXPECTED** | -| 2 | 0.622603 | `managedlustre_fs_subnetsize_ask` | ❌ | -| 3 | 0.542788 | `managedlustre_fs_sku_get` | ❌ | -| 4 | 0.515947 | `managedlustre_fs_create` | ❌ | -| 5 | 0.480673 | `managedlustre_fs_list` | ❌ | - ---- - -## Test 300 -======= -| 1 | 0.879453 | `managedlustre_filesystem_subnetsize_validate` | ❌ | -| 2 | 0.622511 | `managedlustre_filesystem_subnetsize_ask` | ❌ | -| 3 | 0.542894 | `managedlustre_filesystem_sku_get` | ❌ | -| 4 | 0.516028 | 
`managedlustre_filesystem_create` | ❌ | -| 5 | 0.480920 | `managedlustre_filesystem_list` | ❌ | - ---- - -## Test 310 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.879742 | `managedlustre_fs_subnetsize_validate` | ✅ **EXPECTED** | -| 2 | 0.623614 | `managedlustre_fs_subnetsize_ask` | ❌ | -| 3 | 0.543132 | `managedlustre_fs_sku_get` | ❌ | -| 4 | 0.516528 | `managedlustre_fs_create` | ❌ | -| 5 | 0.480633 | `managedlustre_fs_list` | ❌ | +| 1 | 0.879389 | `managedlustre_fs_subnetsize_validate` | ✅ **EXPECTED** | +| 2 | 0.622463 | `managedlustre_fs_subnetsize_ask` | ❌ | +| 3 | 0.542808 | `managedlustre_fs_sku_get` | ❌ | +| 4 | 0.515936 | `managedlustre_fs_create` | ❌ | +| 5 | 0.480855 | `managedlustre_fs_list` | ❌ | --- ## Test 315 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `managedlustre_fs_update` **Prompt:** Update the maintenance window of the Azure Managed Lustre filesystem to at @@ -13070,50 +5837,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.738895 | `managedlustre_fs_update` | ✅ **EXPECTED** | -| 2 | 0.525980 | `managedlustre_fs_create` | ❌ | -| 3 | 0.487193 | `managedlustre_fs_list` | ❌ | -| 4 | 0.385318 | `managedlustre_fs_sku_get` | ❌ | -======= -<<<<<<< HEAD | 1 | 0.739000 | `managedlustre_fs_update` | ✅ **EXPECTED** | | 2 | 0.527525 | `managedlustre_fs_create` | ❌ | -| 3 | 0.487003 | `managedlustre_fs_list` | ❌ | +| 3 | 0.487193 | `managedlustre_fs_list` | ❌ | | 4 | 0.385349 | `managedlustre_fs_sku_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.344891 | `managedlustre_fs_subnetsize_validate` | ❌ | --- -<<<<<<< HEAD -## Test 306 -======= -## Test 301 -======= -| 1 | 0.739000 | `managedlustre_filesystem_update` | ❌ | -| 2 | 0.527525 | `managedlustre_filesystem_create` | ❌ | -| 3 | 0.487193 | `managedlustre_filesystem_list` | ❌ | -| 4 | 
0.385349 | `managedlustre_filesystem_sku_get` | ❌ | -| 5 | 0.344891 | `managedlustre_filesystem_subnetsize_validate` | ❌ | - ---- - -## Test 311 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.739170 | `managedlustre_fs_update` | ✅ **EXPECTED** | -| 2 | 0.527527 | `managedlustre_fs_create` | ❌ | -| 3 | 0.487191 | `managedlustre_fs_list` | ❌ | -| 4 | 0.385343 | `managedlustre_fs_sku_get` | ❌ | -| 5 | 0.344858 | `managedlustre_fs_subnetsize_validate` | ❌ | - ---- - ## Test 316 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `marketplace_product_get` **Prompt:** Get details about marketplace product @@ -13122,23 +5854,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.570164 | `marketplace_product_get` | ✅ **EXPECTED** | -| 2 | 0.499208 | `marketplace_product_list` | ❌ | -| 3 | 0.353280 | `servicebus_topic_subscription_details` | ❌ | -| 4 | 0.333304 | `servicebus_topic_details` | ❌ | -| 5 | 0.330949 | `servicebus_queue_details` | ❌ | - ---- - -## Test 307 -======= -<<<<<<< HEAD -| 1 | 0.570028 | `marketplace_product_get` | ✅ **EXPECTED** | -======= -| 1 | 0.570109 | `marketplace_product_get` | ✅ **EXPECTED** | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) +| 1 | 0.570189 | `marketplace_product_get` | ✅ **EXPECTED** | | 2 | 0.499184 | `marketplace_product_list` | ❌ | | 3 | 0.353256 | `servicebus_topic_subscription_details` | ❌ | | 4 | 0.333160 | `servicebus_topic_details` | ❌ | @@ -13146,23 +5862,7 @@ --- -<<<<<<< HEAD -## Test 302 -======= -## Test 312 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.570164 | `marketplace_product_get` | ✅ **EXPECTED** | -| 2 | 0.499208 | `marketplace_product_list` | ❌ | -| 3 | 0.353280 | `servicebus_topic_subscription_details` | ❌ | -| 4 | 
0.333178 | `servicebus_topic_details` | ❌ | -| 5 | 0.330949 | `servicebus_queue_details` | ❌ | - ---- - ## Test 317 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `marketplace_product_list` **Prompt:** Search for Microsoft products in the marketplace @@ -13171,35 +5871,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.607950 | `marketplace_product_list` | ✅ **EXPECTED** | -| 2 | 0.443177 | `marketplace_product_get` | ❌ | -| 3 | 0.341360 | `search_service_list` | ❌ | -| 4 | 0.330544 | `foundry_models_list` | ❌ | -| 5 | 0.328671 | `managedlustre_fs_sku_get` | ❌ | - ---- - -## Test 308 -======= | 1 | 0.607916 | `marketplace_product_list` | ✅ **EXPECTED** | -| 2 | 0.443133 | `marketplace_product_get` | ❌ | +| 2 | 0.443000 | `marketplace_product_get` | ❌ | | 3 | 0.343549 | `search_service_list` | ❌ | | 4 | 0.330500 | `foundry_models_list` | ❌ | | 5 | 0.328676 | `managedlustre_fs_sku_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 303 -======= -## Test 313 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ## Test 318 ->>>>>>> e2fd2eac (refactor tts mcp tool) **Expected Tool:** `marketplace_product_list` **Prompt:** Show me marketplace products from publisher @@ -13209,24 +5889,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.537726 | `marketplace_product_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.385167 | `marketplace_product_get` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.385198 | `marketplace_product_get` | ❌ | -======= -| 2 | 0.385111 | `marketplace_product_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) +| 2 | 0.385151 | `marketplace_product_get` | ❌ | | 3 | 0.308769 | `foundry_models_list` | ❌ | | 4 | 0.288006 | `redis_list` | ❌ | -| 5 | 0.260421 | `managedlustre_fs_sku_get` | 
❌ | +| 5 | 0.260387 | `managedlustre_fs_sku_get` | ❌ | --- -<<<<<<< HEAD -## Test 309 +## Test 319 **Expected Tool:** `azureaibestpractices_get` **Prompt:** Get best practices for building AI applications in Azure @@ -13235,15 +5905,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.675775 | `azureaibestpractices_get` | ✅ **EXPECTED** | -| 2 | 0.555579 | `get_bestpractices_get` | ❌ | -| 3 | 0.501210 | `azureterraformbestpractices_get` | ❌ | -| 4 | 0.480026 | `deploy_pipeline_guidance_get` | ❌ | -| 5 | 0.477592 | `cloudarchitect_design` | ❌ | +| 1 | 0.555579 | `get_bestpractices_get` | ❌ | +| 2 | 0.501211 | `azureterraformbestpractices_get` | ❌ | +| 3 | 0.480235 | `deploy_pipeline_guidance_get` | ❌ | +| 4 | 0.478355 | `cloudarchitect_design` | ❌ | +| 5 | 0.476579 | `deploy_iac_rules_get` | ❌ | --- -## Test 310 +## Test 320 **Expected Tool:** `azureaibestpractices_get` **Prompt:** Show me the best practices for Azure AI Foundry agents code generation @@ -13252,15 +5922,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.699440 | `azureaibestpractices_get` | ✅ **EXPECTED** | -| 2 | 0.603487 | `foundry_agents_get-sdk-sample` | ❌ | -| 3 | 0.534202 | `get_bestpractices_get` | ❌ | -| 4 | 0.520202 | `foundry_agents_list` | ❌ | -| 5 | 0.508727 | `azureterraformbestpractices_get` | ❌ | +| 1 | 0.603773 | `foundry_agents_get-sdk-sample` | ❌ | +| 2 | 0.534202 | `get_bestpractices_get` | ❌ | +| 3 | 0.520223 | `foundry_agents_list` | ❌ | +| 4 | 0.508727 | `azureterraformbestpractices_get` | ❌ | +| 5 | 0.480034 | `deploy_plan_get` | ❌ | --- -## Test 311 +## Test 321 **Expected Tool:** `azureaibestpractices_get` **Prompt:** Get guidance for building agents with Azure AI Foundry @@ -13269,15 +5939,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.635165 | `azureaibestpractices_get` | ✅ **EXPECTED** | -| 2 | 0.593029 | `foundry_agents_get-sdk-sample` | ❌ | -| 3 | 0.553580 | `foundry_agents_list` 
| ❌ | -| 4 | 0.534256 | `foundry_agents_create` | ❌ | -| 5 | 0.513217 | `foundry_agents_connect` | ❌ | +| 1 | 0.593216 | `foundry_agents_get-sdk-sample` | ❌ | +| 2 | 0.553662 | `foundry_agents_list` | ❌ | +| 3 | 0.534160 | `foundry_agents_create` | ❌ | +| 4 | 0.513217 | `foundry_agents_connect` | ❌ | +| 5 | 0.505706 | `deploy_pipeline_guidance_get` | ❌ | --- -## Test 312 +## Test 322 **Expected Tool:** `azureaibestpractices_get` **Prompt:** Create an AI app that helps me to manage travel queries. @@ -13286,15 +5956,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.417629 | `azureaibestpractices_get` | ✅ **EXPECTED** | -| 2 | 0.343844 | `foundry_threads_create` | ❌ | -| 3 | 0.327503 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.320532 | `foundry_openai_create-completion` | ❌ | -| 5 | 0.311958 | `foundry_agents_connect` | ❌ | +| 1 | 0.343793 | `foundry_threads_create` | ❌ | +| 2 | 0.327503 | `foundry_openai_chat-completions-create` | ❌ | +| 3 | 0.320532 | `foundry_openai_create-completion` | ❌ | +| 4 | 0.311958 | `foundry_agents_connect` | ❌ | +| 5 | 0.305073 | `foundry_agents_get-sdk-sample` | ❌ | --- -## Test 313 +## Test 323 **Expected Tool:** `azureaibestpractices_get` **Prompt:** Create an AI app that helps me to manage travel queries in Azure AI Foundry @@ -13303,32 +5973,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.517931 | `azureaibestpractices_get` | ✅ **EXPECTED** | -| 2 | 0.478747 | `foundry_openai_embeddings-create` | ❌ | -| 3 | 0.469654 | `foundry_openai_chat-completions-create` | ❌ | -| 4 | 0.466216 | `foundry_openai_create-completion` | ❌ | -| 5 | 0.456719 | `foundry_resource_get` | ❌ | - ---- - -## Test 314 -======= -<<<<<<< HEAD -## Test 304 -======= -## Test 314 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 2 | 0.385167 | `marketplace_product_get` | ❌ | -| 3 | 0.308769 | 
`foundry_models_list` | ❌ | -| 4 | 0.288030 | `redis_list` | ❌ | -| 5 | 0.260387 | `managedlustre_fs_sku_get` | ❌ | +| 1 | 0.478745 | `foundry_openai_embeddings-create` | ❌ | +| 2 | 0.469654 | `foundry_openai_chat-completions-create` | ❌ | +| 3 | 0.466216 | `foundry_openai_create-completion` | ❌ | +| 4 | 0.456719 | `foundry_resource_get` | ❌ | +| 5 | 0.448502 | `foundry_agents_list` | ❌ | --- -## Test 319 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 324 **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure code generation best practices @@ -13337,27 +5990,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.656395 | `azureaibestpractices_get` | ❌ | -| 2 | 0.646844 | `get_bestpractices_get` | ✅ **EXPECTED** | -| 3 | 0.635406 | `azureterraformbestpractices_get` | ❌ | -| 4 | 0.586907 | `deploy_iac_rules_get` | ❌ | -| 5 | 0.531457 | `deploy_pipeline_guidance_get` | ❌ | - ---- - -## Test 315 -======= -<<<<<<< HEAD -| 1 | 0.646857 | `get_bestpractices_get` | ✅ **EXPECTED** | -| 2 | 0.635437 | `azureterraformbestpractices_get` | ❌ | -| 3 | 0.586894 | `deploy_iac_rules_get` | ❌ | -======= -| 1 | 0.651264 | `get_bestpractices_get` | ✅ **EXPECTED** | -======= | 1 | 0.646844 | `get_bestpractices_get` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.635406 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.586907 | `deploy_iac_rules_get` | ❌ | | 4 | 0.531727 | `deploy_pipeline_guidance_get` | ❌ | @@ -13365,16 +5998,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 305 -======= -## Test 315 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 320 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 325 **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure deployment best practices @@ -13384,38 +6008,14 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| | 1 | 0.600903 | `get_bestpractices_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.548542 | `azureterraformbestpractices_get` | ❌ | -======= -| 2 | 0.548655 | `azureterraformbestpractices_get` | ❌ | -======= -| 1 | 0.602216 | `get_bestpractices_get` | ✅ **EXPECTED** | | 2 | 0.548542 | `azureterraformbestpractices_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 2 | 0.548542 | `azureterraformbestpractices_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.541091 | `deploy_iac_rules_get` | ❌ | | 4 | 0.516852 | `deploy_plan_get` | ❌ | -| 5 | 0.516203 | `deploy_pipeline_guidance_get` | ❌ | +| 5 | 0.516443 | `deploy_pipeline_guidance_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 316 -======= -<<<<<<< HEAD -## Test 306 -======= -## Test 316 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 321 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 326 **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure best practices @@ -13425,39 +6025,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.625259 | `get_bestpractices_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.594323 | `azureterraformbestpractices_get` | ❌ | -| 3 | 0.539715 | `azureaibestpractices_get` | ❌ | -| 4 | 0.518643 | `deploy_iac_rules_get` | ❌ | -| 5 | 0.465370 | `deploy_pipeline_guidance_get` | ❌ | - ---- - -## Test 317 -======= -| 2 | 0.594455 | `azureterraformbestpractices_get` | ❌ | -======= -| 1 | 0.624689 | `get_bestpractices_get` | ✅ **EXPECTED** | -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.594323 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.518643 | `deploy_iac_rules_get` | ❌ | | 4 | 0.465572 | `deploy_pipeline_guidance_get` | ❌ | -| 5 | 0.450629 | 
`cloudarchitect_design` | ❌ | +| 5 | 0.451502 | `cloudarchitect_design` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 307 -======= -## Test 317 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 322 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 327 **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure Functions code generation best practices @@ -13467,22 +6042,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.624273 | `get_bestpractices_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.587474 | `azureaibestpractices_get` | ❌ | -| 3 | 0.570488 | `azureterraformbestpractices_get` | ❌ | -| 4 | 0.522998 | `deploy_iac_rules_get` | ❌ | -| 5 | 0.493745 | `deploy_pipeline_guidance_get` | ❌ | - ---- - -## Test 318 -======= -| 2 | 0.570547 | `azureterraformbestpractices_get` | ❌ | -======= -| 1 | 0.629031 | `get_bestpractices_get` | ✅ **EXPECTED** | -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.570488 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.522998 | `deploy_iac_rules_get` | ❌ | | 4 | 0.493998 | `deploy_pipeline_guidance_get` | ❌ | @@ -13490,16 +6049,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 308 -======= -## Test 318 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 323 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 328 **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure Functions deployment best practices @@ -13508,36 +6058,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.581850 | `get_bestpractices_get` | ✅ **EXPECTED** | -| 2 | 0.497056 | `deploy_pipeline_guidance_get` | ❌ | -======= -| 1 | 0.584392 | `get_bestpractices_get` | ✅ **EXPECTED** | -======= -| 1 | 0.581850 | `get_bestpractices_get` | ✅ 
**EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 2 | 0.497350 | `deploy_pipeline_guidance_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.495659 | `deploy_iac_rules_get` | ❌ | -| 4 | 0.486886 | `azureterraformbestpractices_get` | ❌ | -| 5 | 0.474511 | `deploy_plan_get` | ❌ | +| 1 | 0.581868 | `get_bestpractices_get` | ✅ **EXPECTED** | +| 2 | 0.497378 | `deploy_pipeline_guidance_get` | ❌ | +| 3 | 0.495688 | `deploy_iac_rules_get` | ❌ | +| 4 | 0.486928 | `azureterraformbestpractices_get` | ❌ | +| 5 | 0.474572 | `deploy_plan_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 319 -======= -<<<<<<< HEAD -## Test 309 -======= -## Test 319 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 324 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 329 **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure Functions best practices @@ -13547,22 +6076,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.610986 | `get_bestpractices_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.532790 | `azureterraformbestpractices_get` | ❌ | -| 3 | 0.518386 | `azureaibestpractices_get` | ❌ | -| 4 | 0.487322 | `deploy_iac_rules_get` | ❌ | -| 5 | 0.457812 | `deploy_pipeline_guidance_get` | ❌ | - ---- - -## Test 320 -======= -| 2 | 0.532921 | `azureterraformbestpractices_get` | ❌ | -======= -| 1 | 0.612552 | `get_bestpractices_get` | ✅ **EXPECTED** | -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.532790 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.487322 | `deploy_iac_rules_get` | ❌ | | 4 | 0.458060 | `deploy_pipeline_guidance_get` | ❌ | @@ -13570,16 +6083,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 310 -======= -## Test 320 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 
325 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 330 **Expected Tool:** `get_bestpractices_get` **Prompt:** Get the latest Azure Static Web Apps best practices @@ -13589,39 +6093,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.557862 | `get_bestpractices_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.513262 | `azureterraformbestpractices_get` | ❌ | -| 3 | 0.510399 | `azureaibestpractices_get` | ❌ | -| 4 | 0.505123 | `deploy_iac_rules_get` | ❌ | -| 5 | 0.483482 | `deploy_pipeline_guidance_get` | ❌ | - ---- - -## Test 321 -======= -| 2 | 0.513385 | `azureterraformbestpractices_get` | ❌ | -======= -| 1 | 0.559184 | `get_bestpractices_get` | ✅ **EXPECTED** | -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.513262 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.505123 | `deploy_iac_rules_get` | ❌ | | 4 | 0.483705 | `deploy_pipeline_guidance_get` | ❌ | -| 5 | 0.421581 | `cloudarchitect_design` | ❌ | +| 5 | 0.422144 | `cloudarchitect_design` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 311 -======= -## Test 321 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 326 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 331 **Expected Tool:** `get_bestpractices_get` **Prompt:** What are azure function best practices? 
@@ -13631,39 +6110,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.582541 | `get_bestpractices_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.500368 | `azureterraformbestpractices_get` | ❌ | -| 3 | 0.475018 | `azureaibestpractices_get` | ❌ | -| 4 | 0.472112 | `deploy_iac_rules_get` | ❌ | -| 5 | 0.432959 | `deploy_pipeline_guidance_get` | ❌ | - ---- - -## Test 322 -======= -| 2 | 0.500479 | `azureterraformbestpractices_get` | ❌ | -======= -| 1 | 0.584536 | `get_bestpractices_get` | ✅ **EXPECTED** | -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.500368 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.472112 | `deploy_iac_rules_get` | ❌ | | 4 | 0.433134 | `deploy_pipeline_guidance_get` | ❌ | -| 5 | 0.432087 | `cloudarchitect_design` | ❌ | +| 5 | 0.432810 | `cloudarchitect_design` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 312 -======= -## Test 322 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 327 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 332 **Expected Tool:** `get_bestpractices_get` **Prompt:** configure azure mcp in coding agent for my repo @@ -13673,38 +6127,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.488855 | `deploy_plan_get` | ❌ | -| 2 | 0.460745 | `deploy_pipeline_guidance_get` | ❌ | +| 2 | 0.460956 | `deploy_pipeline_guidance_get` | ❌ | | 3 | 0.390270 | `deploy_iac_rules_get` | ❌ | -<<<<<<< HEAD -| 4 | 0.370753 | `azureaibestpractices_get` | ❌ | -| 5 | 0.370298 | `azureterraformbestpractices_get` | ❌ | - ---- - -<<<<<<< HEAD -## Test 323 -======= -## Test 313 -======= -| 1 | 0.488915 | `deploy_plan_get` | ❌ | -| 2 | 0.460980 | `deploy_pipeline_guidance_get` | ❌ | -| 3 | 0.390340 | `deploy_iac_rules_get` | ❌ | -| 4 | 0.370368 | `azureterraformbestpractices_get` | ❌ | -| 5 | 0.369284 | `extension_cli_install` | ❌ | - ---- - -## Test 323 ->>>>>>> 84ad4f44 
(update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 4 | 0.370298 | `azureterraformbestpractices_get` | ❌ | | 5 | 0.369169 | `extension_cli_install` | ❌ | --- -## Test 328 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 333 **Expected Tool:** `monitor_activitylog_list` **Prompt:** List the activity logs of the last month for @@ -13713,83 +6143,32 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.537893 | `monitor_activitylog_list` | ✅ **EXPECTED** | -======= -<<<<<<< HEAD -| 1 | 0.537916 | `monitor_activitylog_list` | ✅ **EXPECTED** | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.537893 | `monitor_activitylog_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.506212 | `monitor_resource_log_query` | ❌ | -| 3 | 0.371728 | `monitor_workspace_log_query` | ❌ | +| 3 | 0.371727 | `monitor_workspace_log_query` | ❌ | | 4 | 0.363798 | `resourcehealth_health-events_list` | ❌ | | 5 | 0.344629 | `datadog_monitoredresources_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 324 -======= -## Test 314 -======= -| 1 | 0.537780 | `monitor_activitylog_list` | ✅ **EXPECTED** | -| 2 | 0.506270 | `monitor_resource_log_query` | ❌ | -| 3 | 0.371737 | `monitor_workspace_log_query` | ❌ | -| 4 | 0.363731 | `resourcehealth_service-health-events_list` | ❌ | -| 5 | 0.344620 | `datadog_monitoredresources_list` | ❌ | +## Test 334 ---- - -## Test 324 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 329 ->>>>>>> e2fd2eac (refactor tts mcp tool) - -**Expected Tool:** `monitor_healthmodels_entity_get` -**Prompt:** Show me the health status of entity using the health model +**Expected Tool:** `monitor_healthmodels_entity_get` +**Prompt:** Show me the health status of entity using the health model ### Results | 
Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.660947 | `monitor_healthmodels_entity_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.608665 | `resourcehealth_availability-status_get` | ❌ | -======= | 2 | 0.609276 | `resourcehealth_availability-status_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.351697 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.328321 | `resourcehealth_health-events_list` | ❌ | -| 5 | 0.288127 | `foundry_models_deployments_list` | ❌ | - ---- - -<<<<<<< HEAD -<<<<<<< HEAD -## Test 325 -======= -## Test 315 -======= -| 1 | 0.660947 | `monitor_healthmodels_entity_gethealth` | ❌ | -| 2 | 0.603153 | `resourcehealth_availability-status_get` | ❌ | -| 3 | 0.355116 | `foundry_openai_models-list` | ❌ | -| 4 | 0.351697 | `resourcehealth_availability-status_list` | ❌ | -| 5 | 0.328321 | `resourcehealth_service-health-events_list` | ❌ | +| 5 | 0.288705 | `foundry_models_deployments_list` | ❌ | --- -## Test 325 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 330 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 335 **Expected Tool:** `monitor_metrics_definitions` **Prompt:** Get metric definitions for from the namespace @@ -13798,48 +6177,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.592640 | `monitor_metrics_definitions` | ✅ **EXPECTED** | | 2 | 0.424141 | `monitor_metrics_query` | ❌ | -| 3 | 0.368006 | `bicepschema_get` | ❌ | -| 4 | 0.332369 | `monitor_table_type_list` | ❌ | -| 5 | 0.325634 | `resourcehealth_availability-status_get` | ❌ | - ---- - -## Test 326 -======= -<<<<<<< HEAD -| 1 | 0.592676 | `monitor_metrics_definitions` | ✅ **EXPECTED** | -| 2 | 0.424006 | `monitor_metrics_query` | ❌ | -======= -| 1 | 0.592640 | `monitor_metrics_definitions` | ✅ **EXPECTED** | -| 2 | 0.424256 | `monitor_metrics_query` | ❌ | ->>>>>>> e2fd2eac (refactor 
tts mcp tool) | 3 | 0.368319 | `bicepschema_get` | ❌ | | 4 | 0.332356 | `monitor_table_type_list` | ❌ | | 5 | 0.324986 | `resourcehealth_availability-status_get` | ❌ | --- -<<<<<<< HEAD -## Test 316 -======= -| 1 | 0.592640 | `monitor_metrics_definitions` | ✅ **EXPECTED** | -| 2 | 0.424141 | `monitor_metrics_query` | ❌ | -| 3 | 0.368319 | `bicepschema_get` | ❌ | -| 4 | 0.332356 | `monitor_table_type_list` | ❌ | -| 5 | 0.323083 | `resourcehealth_availability-status_get` | ❌ | - ---- - -## Test 326 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 331 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 336 **Expected Tool:** `monitor_metrics_definitions` **Prompt:** Show me all available metrics and their definitions for storage account @@ -13848,45 +6194,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.607600 | `storage_account_get` | ❌ | | 2 | 0.587736 | `monitor_metrics_definitions` | ✅ **EXPECTED** | -| 3 | 0.544043 | `storage_blob_container_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.607537 | `storage_account_get` | ❌ | -| 2 | 0.587640 | `monitor_metrics_definitions` | ✅ **EXPECTED** | | 3 | 0.544781 | `storage_blob_container_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.495829 | `storage_blob_get` | ❌ | -| 5 | 0.473421 | `managedlustre_fs_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 327 -======= -## Test 317 -======= -| 1 | 0.607575 | `storage_account_get` | ❌ | -======= -| 1 | 0.607600 | `storage_account_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 2 | 0.587736 | `monitor_metrics_definitions` | ✅ **EXPECTED** | -| 3 | 0.545035 | `storage_blob_container_get` | ❌ | | 4 | 0.495829 | `storage_blob_get` | ❌ | | 5 | 0.473421 | `managedlustre_fs_list` | ❌ | --- -<<<<<<< HEAD -## Test 327 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 
58ab8585 (update prompts and tool description evaluator) -======= -## Test 332 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 337 **Expected Tool:** `monitor_metrics_definitions` **Prompt:** What metric definitions are available for the Application Insights resource @@ -13895,42 +6211,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.633173 | `monitor_metrics_definitions` | ✅ **EXPECTED** | -| 2 | 0.495513 | `monitor_metrics_query` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.633132 | `monitor_metrics_definitions` | ✅ **EXPECTED** | -| 2 | 0.495439 | `monitor_metrics_query` | ❌ | -======= -| 1 | 0.633173 | `monitor_metrics_definitions` | ✅ **EXPECTED** | -| 2 | 0.495513 | `monitor_metrics_query` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.633173 | `monitor_metrics_definitions` | ✅ **EXPECTED** | -| 2 | 0.495587 | `monitor_metrics_query` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 3 | 0.433945 | `monitor_resource_log_query` | ❌ | -| 4 | 0.392960 | `loadtesting_testresource_list` | ❌ | -| 5 | 0.388569 | `bicepschema_get` | ❌ | +| 1 | 0.633136 | `monitor_metrics_definitions` | ✅ **EXPECTED** | +| 2 | 0.495555 | `monitor_metrics_query` | ❌ | +| 3 | 0.434042 | `monitor_resource_log_query` | ❌ | +| 4 | 0.392971 | `loadtesting_testresource_list` | ❌ | +| 5 | 0.388754 | `bicepschema_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 328 -======= -<<<<<<< HEAD -## Test 318 -======= -## Test 328 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 333 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 338 **Expected Tool:** `monitor_metrics_query` **Prompt:** Analyze the performance trends and response times for Application Insights resource over the last @@ -13939,7 +6228,6 @@ | Rank | Score | Tool | 
Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.555377 | `monitor_metrics_query` | ✅ **EXPECTED** | | 2 | 0.527530 | `monitor_resource_log_query` | ❌ | | 3 | 0.464743 | `applens_resource_diagnose` | ❌ | @@ -13948,26 +6236,7 @@ --- -<<<<<<< HEAD -## Test 329 -======= -<<<<<<< HEAD -## Test 319 -======= -## Test 329 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.555259 | `monitor_metrics_query` | ✅ **EXPECTED** | -| 2 | 0.527465 | `monitor_resource_log_query` | ❌ | -| 3 | 0.464988 | `applens_resource_diagnose` | ❌ | -| 4 | 0.420447 | `resourcehealth_health-events_list` | ❌ | -| 5 | 0.413438 | `applicationinsights_recommendation_list` | ❌ | - ---- - -## Test 334 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 339 **Expected Tool:** `monitor_metrics_query` **Prompt:** Check the availability metrics for my Application Insights resource for the last @@ -13976,44 +6245,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.557830 | `monitor_metrics_query` | ✅ **EXPECTED** | | 2 | 0.476671 | `monitor_resource_log_query` | ❌ | -<<<<<<< HEAD -| 3 | 0.460611 | `resourcehealth_availability-status_list` | ❌ | -| 4 | 0.456360 | `quota_usage_check` | ❌ | -| 5 | 0.438233 | `monitor_metrics_definitions` | ❌ | - ---- - -## Test 330 -======= -<<<<<<< HEAD -| 3 | 0.460351 | `resourcehealth_availability-status_list` | ❌ | -| 4 | 0.456321 | `quota_usage_check` | ❌ | -| 5 | 0.438171 | `monitor_metrics_definitions` | ❌ | - ---- - -## Test 320 -======= -======= -| 1 | 0.558015 | `monitor_metrics_query` | ✅ **EXPECTED** | -| 2 | 0.476671 | `monitor_resource_log_query` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.460611 | `resourcehealth_availability-status_list` | ❌ | | 4 | 0.455904 | `quota_usage_check` | ❌ | | 5 | 0.438233 | `monitor_metrics_definitions` | ❌ | --- -<<<<<<< HEAD -## Test 330 ->>>>>>> 84ad4f44 (update 
prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 335 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 340 **Expected Tool:** `monitor_metrics_query` **Prompt:** Get the metric for over the last with intervals @@ -14022,33 +6262,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.461249 | `monitor_metrics_query` | ✅ **EXPECTED** | -| 2 | 0.390029 | `monitor_metrics_definitions` | ❌ | -| 3 | 0.338557 | `monitor_resource_log_query` | ❌ | -| 4 | 0.335118 | `resourcehealth_availability-status_get` | ❌ | -| 5 | 0.306338 | `resourcehealth_availability-status_list` | ❌ | - ---- - -## Test 331 -======= -<<<<<<< HEAD -| 1 | 0.461138 | `monitor_metrics_query` | ✅ **EXPECTED** | -| 2 | 0.389998 | `monitor_metrics_definitions` | ❌ | -| 3 | 0.338392 | `monitor_resource_log_query` | ❌ | -| 4 | 0.334417 | `resourcehealth_availability-status_get` | ❌ | -| 5 | 0.306224 | `resourcehealth_availability-status_list` | ❌ | - ---- - -## Test 321 -======= | 1 | 0.461249 | `monitor_metrics_query` | ✅ **EXPECTED** | -======= -| 1 | 0.461420 | `monitor_metrics_query` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.390029 | `monitor_metrics_definitions` | ❌ | | 3 | 0.338557 | `monitor_resource_log_query` | ❌ | | 4 | 0.334519 | `resourcehealth_availability-status_get` | ❌ | @@ -14056,13 +6270,7 @@ --- -<<<<<<< HEAD -## Test 331 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 336 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 341 **Expected Tool:** `monitor_metrics_query` **Prompt:** Investigate error rates and failed requests for Application Insights resource for the last @@ -14071,46 +6279,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.496878 | `monitor_resource_log_query` | ❌ | -<<<<<<< HEAD 
| 2 | 0.492138 | `monitor_metrics_query` | ✅ **EXPECTED** | -======= -<<<<<<< HEAD -| 2 | 0.491782 | `monitor_metrics_query` | ✅ **EXPECTED** | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.448148 | `applens_resource_diagnose` | ❌ | | 4 | 0.412184 | `resourcehealth_health-events_list` | ❌ | -| 5 | 0.397853 | `quota_usage_check` | ❌ | - ---- - -<<<<<<< HEAD -## Test 332 -======= -## Test 322 -======= -| 2 | 0.492138 | `monitor_metrics_query` | ✅ **EXPECTED** | -| 3 | 0.448148 | `applens_resource_diagnose` | ❌ | -| 4 | 0.412184 | `resourcehealth_service-health-events_list` | ❌ | | 5 | 0.397335 | `quota_usage_check` | ❌ | --- -## Test 332 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.496911 | `monitor_resource_log_query` | ❌ | -| 2 | 0.492280 | `monitor_metrics_query` | ✅ **EXPECTED** | -| 3 | 0.448203 | `applens_resource_diagnose` | ❌ | -| 4 | 0.412199 | `resourcehealth_health-events_list` | ❌ | -| 5 | 0.397367 | `quota_usage_check` | ❌ | - ---- - -## Test 337 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 342 **Expected Tool:** `monitor_metrics_query` **Prompt:** Query the metric for for the last @@ -14119,22 +6296,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.525890 | `monitor_metrics_query` | ✅ **EXPECTED** | -| 2 | 0.405838 | `monitor_resource_log_query` | ❌ | -| 3 | 0.384811 | `monitor_metrics_definitions` | ❌ | -| 4 | 0.347228 | `monitor_workspace_log_query` | ❌ | -| 5 | 0.330657 | `resourcehealth_availability-status_get` | ❌ | - ---- - -## Test 333 -======= -| 1 | 0.525326 | `monitor_metrics_query` | ✅ **EXPECTED** | -======= -| 1 | 0.525816 | `monitor_metrics_query` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.525585 | `monitor_metrics_query` | ✅ **EXPECTED** | | 2 | 0.406185 | `monitor_resource_log_query` | ❌ | | 3 | 0.384482 | 
`monitor_metrics_definitions` | ❌ | | 4 | 0.347723 | `monitor_workspace_log_query` | ❌ | @@ -14142,21 +6304,7 @@ --- -<<<<<<< HEAD -## Test 323 -======= -| 3 | 0.384482 | `monitor_metrics_definitions` | ❌ | -| 4 | 0.347723 | `monitor_workspace_log_query` | ❌ | -| 5 | 0.325967 | `resourcehealth_availability-status_get` | ❌ | - ---- - -## Test 333 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 338 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 343 **Expected Tool:** `monitor_metrics_query` **Prompt:** What's the request per second rate for my Application Insights resource over the last @@ -14165,43 +6313,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.480140 | `monitor_metrics_query` | ✅ **EXPECTED** | | 2 | 0.444779 | `monitor_resource_log_query` | ❌ | | 3 | 0.388382 | `applens_resource_diagnose` | ❌ | -<<<<<<< HEAD -| 4 | 0.363672 | `quota_usage_check` | ❌ | -======= -<<<<<<< HEAD -| 4 | 0.363640 | `quota_usage_check` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.480194 | `monitor_metrics_query` | ✅ **EXPECTED** | -| 2 | 0.444779 | `monitor_resource_log_query` | ❌ | -| 3 | 0.388382 | `applens_resource_diagnose` | ❌ | | 4 | 0.363412 | `quota_usage_check` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.350076 | `resourcehealth_health-events_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 334 -======= -## Test 324 -======= -| 4 | 0.363412 | `quota_usage_check` | ❌ | -| 5 | 0.350076 | `resourcehealth_service-health-events_list` | ❌ | - ---- - -## Test 334 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 339 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 344 **Expected Tool:** `monitor_resource_log_query` **Prompt:** Show me the logs for the past hour for the 
resource in the Log Analytics workspace @@ -14210,45 +6330,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.687852 | `monitor_resource_log_query` | ✅ **EXPECTED** | | 2 | 0.621919 | `monitor_workspace_log_query` | ❌ | -<<<<<<< HEAD | 3 | 0.598393 | `monitor_activitylog_list` | ❌ | -| 4 | 0.485528 | `deploy_app_logs_get` | ❌ | -| 5 | 0.469703 | `monitor_metrics_query` | ❌ | - ---- - -## Test 335 -======= -<<<<<<< HEAD -| 3 | 0.598436 | `monitor_activitylog_list` | ❌ | -======= -| 3 | 0.598393 | `monitor_activitylog_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.485633 | `deploy_app_logs_get` | ❌ | -| 5 | 0.470119 | `monitor_metrics_query` | ❌ | - ---- - -<<<<<<< HEAD -## Test 325 -======= -## Test 335 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.687702 | `monitor_resource_log_query` | ✅ **EXPECTED** | -| 2 | 0.621740 | `monitor_workspace_log_query` | ❌ | -| 3 | 0.598494 | `monitor_activitylog_list` | ❌ | -| 4 | 0.485733 | `deploy_app_logs_get` | ❌ | -| 5 | 0.469848 | `monitor_metrics_query` | ❌ | +| 5 | 0.469703 | `monitor_metrics_query` | ❌ | --- -## Test 340 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 345 **Expected Tool:** `monitor_table_list` **Prompt:** List all tables in the Log Analytics workspace @@ -14257,37 +6347,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.851075 | `monitor_table_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.725693 | `monitor_table_type_list` | ❌ | -| 3 | 0.620451 | `monitor_workspace_list` | ❌ | -| 4 | 0.541928 | `kusto_table_list` | ❌ | -======= +| 1 | 0.850711 | `monitor_table_list` | ✅ **EXPECTED** | | 2 | 0.725738 | `monitor_table_type_list` | ❌ | | 3 | 0.620445 | `monitor_workspace_list` | ❌ | | 4 | 0.541928 | `kusto_table_list` | ❌ | -<<<<<<< HEAD ->>>>>>> 84ad4f44 (update prompts and tool description 
evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.539481 | `monitor_workspace_log_query` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 336 -======= -<<<<<<< HEAD -## Test 326 -======= -## Test 336 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 341 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 346 **Expected Tool:** `monitor_table_list` **Prompt:** Show me the tables in the Log Analytics workspace @@ -14296,38 +6364,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.798459 | `monitor_table_list` | ✅ **EXPECTED** | -| 2 | 0.701092 | `monitor_table_type_list` | ❌ | -| 3 | 0.600003 | `monitor_workspace_list` | ❌ | -| 4 | 0.542820 | `monitor_workspace_log_query` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.798460 | `monitor_table_list` | ✅ **EXPECTED** | +| 1 | 0.798147 | `monitor_table_list` | ✅ **EXPECTED** | | 2 | 0.701122 | `monitor_table_type_list` | ❌ | | 3 | 0.599917 | `monitor_workspace_list` | ❌ | | 4 | 0.542821 | `monitor_workspace_log_query` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.502882 | `monitor_resource_log_query` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 337 -======= -<<<<<<< HEAD -## Test 327 -======= -## Test 337 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 342 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 347 **Expected Tool:** `monitor_table_type_list` **Prompt:** List all available table types in the Log Analytics workspace @@ -14336,38 +6381,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.881468 | `monitor_table_type_list` | ✅ **EXPECTED** | -| 2 | 0.765694 | 
`monitor_table_list` | ❌ | -| 3 | 0.570092 | `monitor_workspace_list` | ❌ | -| 4 | 0.504683 | `mysql_table_list` | ❌ | -======= | 1 | 0.881524 | `monitor_table_type_list` | ✅ **EXPECTED** | -| 2 | 0.765702 | `monitor_table_list` | ❌ | +| 2 | 0.765557 | `monitor_table_list` | ❌ | | 3 | 0.569921 | `monitor_workspace_list` | ❌ | | 4 | 0.504683 | `mysql_table_list` | ❌ | -<<<<<<< HEAD ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.497622 | `monitor_workspace_log_query` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 338 -======= -<<<<<<< HEAD -## Test 328 -======= -## Test 338 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 343 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 348 **Expected Tool:** `monitor_table_type_list` **Prompt:** Show me the available table types in the Log Analytics workspace @@ -14376,33 +6398,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.843110 | `monitor_table_type_list` | ✅ **EXPECTED** | -| 2 | 0.736831 | `monitor_table_list` | ❌ | -| 3 | 0.576934 | `monitor_workspace_list` | ❌ | -======= | 1 | 0.843138 | `monitor_table_type_list` | ✅ **EXPECTED** | -| 2 | 0.736837 | `monitor_table_list` | ❌ | +| 2 | 0.736728 | `monitor_table_list` | ❌ | | 3 | 0.576731 | `monitor_workspace_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 4 | 0.509598 | `monitor_workspace_log_query` | ❌ | | 5 | 0.481189 | `mysql_table_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 339 -======= -<<<<<<< HEAD -## Test 329 -======= -## Test 339 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 344 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 349 
**Expected Tool:** `monitor_webtests_create` **Prompt:** Create a new Standard Web Test with name in my subscription in in a given @@ -14411,51 +6415,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.651084 | `monitor_webtests_create` | ✅ **EXPECTED** | -| 2 | 0.570105 | `monitor_webtests_list` | ❌ | -| 3 | 0.550426 | `monitor_webtests_update` | ❌ | -| 4 | 0.533477 | `monitor_webtests_get` | ❌ | -| 5 | 0.482251 | `loadtesting_testresource_create` | ❌ | - ---- - -## Test 340 -======= -<<<<<<< HEAD -| 1 | 0.650749 | `monitor_webtests_create` | ✅ **EXPECTED** | -| 2 | 0.569999 | `monitor_webtests_list` | ❌ | -| 3 | 0.550088 | `monitor_webtests_update` | ❌ | -| 4 | 0.533466 | `monitor_webtests_get` | ❌ | -| 5 | 0.482122 | `loadtesting_testresource_create` | ❌ | - ---- - -## Test 330 -======= -| 1 | 0.650734 | `monitor_webtests_create` | ✅ **EXPECTED** | -| 2 | 0.572163 | `monitor_webtests_list` | ❌ | -| 3 | 0.550075 | `monitor_webtests_update` | ❌ | +| 1 | 0.650766 | `monitor_webtests_create` | ✅ **EXPECTED** | +| 2 | 0.569868 | `monitor_webtests_list` | ❌ | +| 3 | 0.550072 | `monitor_webtests_update` | ❌ | | 4 | 0.533352 | `monitor_webtests_get` | ❌ | | 5 | 0.482145 | `loadtesting_testresource_create` | ❌ | --- -## Test 340 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.650804 | `monitor_webtests_create` | ✅ **EXPECTED** | -| 2 | 0.570334 | `monitor_webtests_list` | ❌ | -| 3 | 0.550263 | `monitor_webtests_update` | ❌ | -| 4 | 0.533405 | `monitor_webtests_get` | ❌ | -| 5 | 0.482023 | `loadtesting_testresource_create` | ❌ | - ---- - -## Test 345 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 350 **Expected Tool:** `monitor_webtests_get` **Prompt:** Get Web Test details for in my subscription in @@ -14464,48 +6432,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< 
HEAD -| 1 | 0.758910 | `monitor_webtests_get` | ✅ **EXPECTED** | -| 2 | 0.725360 | `monitor_webtests_list` | ❌ | -| 3 | 0.583663 | `loadtesting_testresource_list` | ❌ | -| 4 | 0.562785 | `monitor_webtests_update` | ❌ | -| 5 | 0.530432 | `monitor_webtests_create` | ❌ | - ---- - -## Test 341 -======= -<<<<<<< HEAD -| 1 | 0.759380 | `monitor_webtests_get` | ✅ **EXPECTED** | -| 2 | 0.725337 | `monitor_webtests_list` | ❌ | -======= | 1 | 0.759015 | `monitor_webtests_get` | ✅ **EXPECTED** | -| 2 | 0.725442 | `monitor_webtests_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 2 | 0.725302 | `monitor_webtests_list` | ❌ | | 3 | 0.583816 | `loadtesting_testresource_list` | ❌ | | 4 | 0.562797 | `monitor_webtests_update` | ❌ | -| 5 | 0.530557 | `monitor_webtests_create` | ❌ | - ---- - -<<<<<<< HEAD -## Test 331 -======= -| 1 | 0.759062 | `monitor_webtests_get` | ✅ **EXPECTED** | -| 2 | 0.726138 | `monitor_webtests_list` | ❌ | -| 3 | 0.583770 | `loadtesting_testresource_list` | ❌ | -| 4 | 0.562773 | `monitor_webtests_update` | ❌ | -| 5 | 0.530496 | `monitor_webtests_create` | ❌ | +| 5 | 0.530581 | `monitor_webtests_create` | ❌ | --- -## Test 341 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 346 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 351 **Expected Tool:** `monitor_webtests_list` **Prompt:** List all Web Test resources in my subscription @@ -14514,46 +6449,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.730616 | `monitor_webtests_list` | ✅ **EXPECTED** | -======= -<<<<<<< HEAD -| 1 | 0.730568 | `monitor_webtests_list` | ✅ **EXPECTED** | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 2 | 0.610160 | `loadtesting_testresource_list` | ❌ | -| 3 | 0.547708 | `grafana_list` | ❌ | -| 4 | 0.520828 | `redis_list` | ❌ | -| 5 | 0.496166 | `monitor_webtests_get` | ❌ | - ---- - -<<<<<<< 
HEAD -## Test 342 -======= -## Test 332 -======= -| 1 | 0.732801 | `monitor_webtests_list` | ✅ **EXPECTED** | +| 1 | 0.730837 | `monitor_webtests_list` | ✅ **EXPECTED** | | 2 | 0.610160 | `loadtesting_testresource_list` | ❌ | | 3 | 0.547708 | `grafana_list` | ❌ | | 4 | 0.520829 | `redis_list` | ❌ | -======= -| 1 | 0.730616 | `monitor_webtests_list` | ✅ **EXPECTED** | -| 2 | 0.610160 | `loadtesting_testresource_list` | ❌ | -| 3 | 0.547708 | `grafana_list` | ❌ | -| 4 | 0.520842 | `redis_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.496166 | `monitor_webtests_get` | ❌ | --- -<<<<<<< HEAD -## Test 342 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 347 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 352 **Expected Tool:** `monitor_webtests_list` **Prompt:** List all Web Test resources in my subscription in @@ -14562,45 +6466,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.793807 | `monitor_webtests_list` | ✅ **EXPECTED** | -| 2 | 0.675965 | `loadtesting_testresource_list` | ❌ | -| 3 | 0.584429 | `monitor_webtests_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.793702 | `monitor_webtests_list` | ✅ **EXPECTED** | -| 2 | 0.675965 | `loadtesting_testresource_list` | ❌ | -| 3 | 0.584942 | `monitor_webtests_get` | ❌ | -======= -| 1 | 0.793581 | `monitor_webtests_list` | ✅ **EXPECTED** | -| 2 | 0.675965 | `loadtesting_testresource_list` | ❌ | -| 3 | 0.584429 | `monitor_webtests_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.793807 | `monitor_webtests_list` | ✅ **EXPECTED** | +| 1 | 0.793880 | `monitor_webtests_list` | ✅ **EXPECTED** | | 2 | 0.675965 | `loadtesting_testresource_list` | ❌ | | 3 | 0.584429 | `monitor_webtests_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.573602 
| `group_list` | ❌ | +| 4 | 0.573620 | `group_list` | ❌ | | 5 | 0.546088 | `resourcehealth_availability-status_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 343 -======= -<<<<<<< HEAD -## Test 333 -======= -## Test 343 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 348 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 353 **Expected Tool:** `monitor_webtests_update` **Prompt:** Update an existing Standard Web Test with name in my subscription in in a given @@ -14609,51 +6483,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.686427 | `monitor_webtests_update` | ✅ **EXPECTED** | -| 2 | 0.558816 | `monitor_webtests_get` | ❌ | -| 3 | 0.557828 | `monitor_webtests_create` | ❌ | -| 4 | 0.553372 | `monitor_webtests_list` | ❌ | -| 5 | 0.509192 | `loadtesting_testrun_update` | ❌ | - ---- - -## Test 344 -======= -<<<<<<< HEAD | 1 | 0.686449 | `monitor_webtests_update` | ✅ **EXPECTED** | -| 2 | 0.559199 | `monitor_webtests_get` | ❌ | -| 3 | 0.558234 | `monitor_webtests_create` | ❌ | -| 4 | 0.553545 | `monitor_webtests_list` | ❌ | +| 2 | 0.559296 | `monitor_webtests_get` | ❌ | +| 3 | 0.558239 | `monitor_webtests_create` | ❌ | +| 4 | 0.553466 | `monitor_webtests_list` | ❌ | | 5 | 0.508736 | `loadtesting_testrun_update` | ❌ | --- -## Test 334 -======= -| 1 | 0.686466 | `monitor_webtests_update` | ✅ **EXPECTED** | -| 2 | 0.559612 | `monitor_webtests_get` | ❌ | -| 3 | 0.558102 | `monitor_webtests_create` | ❌ | -| 4 | 0.555899 | `monitor_webtests_list` | ❌ | -| 5 | 0.509033 | `loadtesting_testrun_update` | ❌ | - ---- - -## Test 344 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.686426 | `monitor_webtests_update` | ✅ **EXPECTED** | -| 2 | 0.559273 | `monitor_webtests_get` | ❌ | -| 3 | 0.558221 | 
`monitor_webtests_create` | ❌ | -| 4 | 0.553741 | `monitor_webtests_list` | ❌ | -| 5 | 0.508780 | `loadtesting_testrun_update` | ❌ | - ---- - -## Test 349 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 354 **Expected Tool:** `monitor_workspace_list` **Prompt:** List all Log Analytics workspaces in my subscription @@ -14662,46 +6500,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.813871 | `monitor_workspace_list` | ✅ **EXPECTED** | +| 1 | 0.813902 | `monitor_workspace_list` | ✅ **EXPECTED** | | 2 | 0.680201 | `grafana_list` | ❌ | -<<<<<<< HEAD -| 3 | 0.660127 | `monitor_table_list` | ❌ | -| 4 | 0.610623 | `kusto_cluster_list` | ❌ | -| 5 | 0.599636 | `search_service_list` | ❌ | - ---- - -## Test 345 -======= -<<<<<<< HEAD -| 3 | 0.660135 | `monitor_table_list` | ❌ | +| 3 | 0.659497 | `monitor_table_list` | ❌ | | 4 | 0.610623 | `kusto_cluster_list` | ❌ | -======= -| 3 | 0.659287 | `monitor_table_list` | ❌ | -| 4 | 0.610480 | `kusto_cluster_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 5 | 0.600802 | `search_service_list` | ❌ | --- -<<<<<<< HEAD -## Test 335 -======= -## Test 345 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.813506 | `monitor_workspace_list` | ✅ **EXPECTED** | -| 2 | 0.679650 | `grafana_list` | ❌ | -| 3 | 0.659506 | `monitor_table_list` | ❌ | -| 4 | 0.610550 | `kusto_cluster_list` | ❌ | -| 5 | 0.601012 | `search_service_list` | ❌ | - ---- - -## Test 350 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 355 **Expected Tool:** `monitor_workspace_list` **Prompt:** Show me my Log Analytics workspaces @@ -14710,46 +6517,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.656159 | `monitor_workspace_list` | ✅ **EXPECTED** | -| 2 | 0.585355 | `monitor_table_list` | ❌ | -| 3 | 0.531036 | `monitor_table_type_list` | ❌ 
| -| 4 | 0.518275 | `grafana_list` | ❌ | -| 5 | 0.506663 | `monitor_workspace_log_query` | ❌ | - ---- - -## Test 346 -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.656194 | `monitor_workspace_list` | ✅ **EXPECTED** | -| 2 | 0.585436 | `monitor_table_list` | ❌ | +| 2 | 0.584758 | `monitor_table_list` | ❌ | | 3 | 0.531083 | `monitor_table_type_list` | ❌ | | 4 | 0.518254 | `grafana_list` | ❌ | | 5 | 0.506772 | `monitor_workspace_log_query` | ❌ | --- -<<<<<<< HEAD -## Test 336 -======= -| 1 | 0.656153 | `monitor_workspace_list` | ✅ **EXPECTED** | -| 2 | 0.584651 | `monitor_table_list` | ❌ | -| 3 | 0.531025 | `monitor_table_type_list` | ❌ | -| 4 | 0.518275 | `grafana_list` | ❌ | -| 5 | 0.506663 | `monitor_workspace_log_query` | ❌ | - ---- - -## Test 346 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 351 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 356 **Expected Tool:** `monitor_workspace_list` **Prompt:** Show me the Log Analytics workspaces in my subscription @@ -14758,40 +6534,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.732964 | `monitor_workspace_list` | ✅ **EXPECTED** | +| 1 | 0.732962 | `monitor_workspace_list` | ✅ **EXPECTED** | | 2 | 0.601481 | `grafana_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.580244 | `monitor_table_list` | ❌ | -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 3 | 0.580261 | `monitor_table_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) +| 3 | 0.579669 | `monitor_table_list` | ❌ | | 4 | 0.523782 | `monitor_workspace_log_query` | ❌ | | 5 | 0.522749 | `kusto_cluster_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 347 -======= -## Test 337 -======= -| 3 | 0.579582 | `monitor_table_list` | ❌ | -| 4 | 0.523782 | `monitor_workspace_log_query` | ❌ | -| 5 | 0.522605 | `kusto_cluster_list` | ❌ | - ---- - 
-## Test 347 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 352 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 357 **Expected Tool:** `monitor_workspace_log_query` **Prompt:** Show me the logs for the past hour in the Log Analytics workspace @@ -14803,36 +6554,12 @@ | 1 | 0.610115 | `monitor_workspace_log_query` | ✅ **EXPECTED** | | 2 | 0.587614 | `monitor_resource_log_query` | ❌ | | 3 | 0.527733 | `monitor_activitylog_list` | ❌ | -<<<<<<< HEAD -| 4 | 0.498148 | `deploy_app_logs_get` | ❌ | -| 5 | 0.485982 | `monitor_table_list` | ❌ | +| 4 | 0.498269 | `deploy_app_logs_get` | ❌ | +| 5 | 0.485470 | `monitor_table_list` | ❌ | --- -<<<<<<< HEAD -## Test 348 -======= -## Test 338 -======= -| 1 | 0.610116 | `monitor_workspace_log_query` | ✅ **EXPECTED** | -| 2 | 0.587644 | `monitor_resource_log_query` | ❌ | -| 3 | 0.527761 | `monitor_activitylog_list` | ❌ | -| 4 | 0.498255 | `deploy_app_logs_get` | ❌ | -| 5 | 0.485667 | `monitor_table_list` | ❌ | - ---- - -## Test 348 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.498269 | `deploy_app_logs_get` | ❌ | -| 5 | 0.485984 | `monitor_table_list` | ❌ | - ---- - -## Test 353 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 358 **Expected Tool:** `datadog_monitoredresources_list` **Prompt:** List all monitored resources in the Datadog resource @@ -14841,42 +6568,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.668828 | `datadog_monitoredresources_list` | ✅ **EXPECTED** | +| 1 | 0.668827 | `datadog_monitoredresources_list` | ✅ **EXPECTED** | | 2 | 0.454270 | `redis_list` | ❌ | | 3 | 0.413661 | `loadtesting_testresource_list` | ❌ | -<<<<<<< HEAD | 4 | 0.413173 | `monitor_metrics_query` | ❌ | -======= -<<<<<<< HEAD -| 4 | 0.413208 | `monitor_metrics_query` | ❌ | -======= -| 4 | 
0.413173 | `monitor_metrics_query` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.668827 | `datadog_monitoredresources_list` | ✅ **EXPECTED** | -| 2 | 0.454295 | `redis_list` | ❌ | -| 3 | 0.413661 | `loadtesting_testresource_list` | ❌ | -| 4 | 0.413407 | `monitor_metrics_query` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.401731 | `grafana_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 349 -======= -<<<<<<< HEAD -## Test 339 -======= -## Test 349 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 354 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 359 **Expected Tool:** `datadog_monitoredresources_list` **Prompt:** Show me the monitored resources in the Datadog resource @@ -14886,31 +6586,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.624066 | `datadog_monitoredresources_list` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.443481 | `monitor_metrics_query` | ❌ | | 3 | 0.440052 | `redis_list` | ❌ | -======= -| 2 | 0.443652 | `monitor_metrics_query` | ❌ | -| 3 | 0.440095 | `redis_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.424391 | `monitor_resource_log_query` | ❌ | | 5 | 0.385122 | `loadtesting_testresource_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 350 -======= -<<<<<<< HEAD -## Test 340 -======= -## Test 350 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 355 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 360 **Expected Tool:** `extension_azqr` **Prompt:** Check my Azure subscription for any compliance issues or recommendations @@ -14919,21 +6602,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.533403 | `quota_usage_check` | 
❌ | -| 2 | 0.481143 | `azureterraformbestpractices_get` | ❌ | -| 3 | 0.476826 | `extension_azqr` | ✅ **EXPECTED** | -======= -<<<<<<< HEAD -| 1 | 0.533406 | `quota_usage_check` | ❌ | -| 2 | 0.481236 | `azureterraformbestpractices_get` | ❌ | -| 3 | 0.476761 | `extension_azqr` | ✅ **EXPECTED** | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.471547 | `subscription_list` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.533164 | `quota_usage_check` | ❌ | | 2 | 0.481143 | `azureterraformbestpractices_get` | ❌ | | 3 | 0.476826 | `extension_azqr` | ✅ **EXPECTED** | @@ -14942,19 +6610,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 351 -======= -<<<<<<< HEAD -## Test 341 -======= -## Test 351 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 356 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 361 **Expected Tool:** `extension_azqr` **Prompt:** Provide compliance recommendations for my current Azure subscription @@ -14963,38 +6619,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.532792 | `azureterraformbestpractices_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.532869 | `azureterraformbestpractices_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 2 | 0.492863 | `get_bestpractices_get` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.532792 | `azureterraformbestpractices_get` | ❌ | | 2 | 0.492863 | `get_bestpractices_get` | ❌ | | 3 | 0.476164 | `applicationinsights_recommendation_list` | ❌ | | 4 | 0.473365 | `deploy_iac_rules_get` | ❌ | -| 5 | 0.468491 | `azureaibestpractices_get` | ❌ | +| 5 | 0.464954 | `cloudarchitect_design` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 352 -======= -<<<<<<< HEAD -## Test 342 -======= -## Test 352 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 
58ab8585 (update prompts and tool description evaluator) -======= -## Test 357 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 362 **Expected Tool:** `extension_azqr` **Prompt:** Scan my Azure subscription for compliance recommendations @@ -15003,40 +6636,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.536917 | `azureterraformbestpractices_get` | ❌ | -| 2 | 0.516910 | `extension_azqr` | ✅ **EXPECTED** | -| 3 | 0.514947 | `applicationinsights_recommendation_list` | ❌ | -| 4 | 0.504918 | `quota_usage_check` | ❌ | -| 5 | 0.494808 | `deploy_plan_get` | ❌ | - ---- - -## Test 353 -======= -| 1 | 0.536984 | `azureterraformbestpractices_get` | ❌ | -| 2 | 0.516810 | `extension_azqr` | ✅ **EXPECTED** | -======= | 1 | 0.536934 | `azureterraformbestpractices_get` | ❌ | | 2 | 0.516925 | `extension_azqr` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.514978 | `applicationinsights_recommendation_list` | ❌ | | 4 | 0.504673 | `quota_usage_check` | ❌ | | 5 | 0.494872 | `deploy_plan_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 343 -======= -## Test 353 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 358 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 363 **Expected Tool:** `quota_region_availability_list` **Prompt:** Show me the available regions for these resource types @@ -15045,42 +6653,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.590878 | `quota_region_availability_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.413662 | `quota_usage_check` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.413577 | `quota_usage_check` | ❌ | -======= +| 1 | 0.590950 | `quota_region_availability_list` | ✅ **EXPECTED** | | 2 | 0.413274 | `quota_usage_check` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and 
tool description evaluator) | 3 | 0.391332 | `redis_list` | ❌ | | 4 | 0.372940 | `resourcehealth_availability-status_list` | ❌ | -| 5 | 0.369915 | `managedlustre_fs_sku_get` | ❌ | - ---- - -<<<<<<< HEAD -## Test 354 -======= -<<<<<<< HEAD -## Test 344 -======= -## Test 354 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 2 | 0.413274 | `quota_usage_check` | ❌ | -| 3 | 0.391361 | `redis_list` | ❌ | -| 4 | 0.372940 | `resourcehealth_availability-status_list` | ❌ | | 5 | 0.369855 | `managedlustre_fs_sku_get` | ❌ | --- -## Test 359 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 364 **Expected Tool:** `quota_usage_check` **Prompt:** Check usage information for in region @@ -15089,49 +6670,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.609711 | `quota_usage_check` | ✅ **EXPECTED** | -| 2 | 0.491058 | `quota_region_availability_list` | ❌ | -| 3 | 0.384350 | `resourcehealth_availability-status_list` | ❌ | -| 4 | 0.376819 | `resourcehealth_availability-status_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.609607 | `quota_usage_check` | ✅ **EXPECTED** | -| 2 | 0.491058 | `quota_region_availability_list` | ❌ | -| 3 | 0.384500 | `resourcehealth_availability-status_list` | ❌ | -| 4 | 0.376368 | `resourcehealth_availability-status_get` | ❌ | -======= -| 1 | 0.609244 | `quota_usage_check` | ✅ **EXPECTED** | -| 2 | 0.491058 | `quota_region_availability_list` | ❌ | -| 3 | 0.384350 | `resourcehealth_availability-status_list` | ❌ | -| 4 | 0.374248 | `resourcehealth_availability-status_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 5 | 0.371407 | `redis_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 355 -======= -<<<<<<< HEAD -## Test 345 -======= -## Test 355 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 
58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.609244 | `quota_usage_check` | ✅ **EXPECTED** | -| 2 | 0.491058 | `quota_region_availability_list` | ❌ | -| 3 | 0.384350 | `resourcehealth_availability-status_list` | ❌ | -| 4 | 0.376368 | `resourcehealth_availability-status_get` | ❌ | -| 5 | 0.371447 | `redis_list` | ❌ | +| 1 | 0.609378 | `quota_usage_check` | ✅ **EXPECTED** | +| 2 | 0.491300 | `quota_region_availability_list` | ❌ | +| 3 | 0.384035 | `resourcehealth_availability-status_list` | ❌ | +| 4 | 0.376096 | `resourcehealth_availability-status_get` | ❌ | +| 5 | 0.371392 | `redis_list` | ❌ | --- -## Test 360 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 365 **Expected Tool:** `role_assignment_list` **Prompt:** List all available role assignments in my subscription @@ -15140,32 +6687,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.645258 | `role_assignment_list` | ✅ **EXPECTED** | -| 2 | 0.539757 | `subscription_list` | ❌ | -======= | 1 | 0.645259 | `role_assignment_list` | ✅ **EXPECTED** | | 2 | 0.539761 | `subscription_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 3 | 0.483988 | `group_list` | ❌ | +| 3 | 0.484047 | `group_list` | ❌ | | 4 | 0.478700 | `grafana_list` | ❌ | | 5 | 0.471364 | `cosmos_account_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 356 -======= -<<<<<<< HEAD -## Test 346 -======= -## Test 356 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 361 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 366 **Expected Tool:** `role_assignment_list` **Prompt:** Show me the available role assignments in my subscription @@ -15174,35 +6704,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.609704 | `role_assignment_list` | ✅ **EXPECTED** | -| 2 | 0.514697 | `subscription_list` | ❌ | -| 3 | 0.456956 | `grafana_list` | ❌ 
| -| 4 | 0.449753 | `eventgrid_subscription_list` | ❌ | -| 5 | 0.445149 | `redis_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 357 -======= -<<<<<<< HEAD -## Test 347 -======= -## Test 357 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.609705 | `role_assignment_list` | ✅ **EXPECTED** | | 2 | 0.514696 | `subscription_list` | ❌ | | 3 | 0.456956 | `grafana_list` | ❌ | | 4 | 0.449210 | `eventgrid_subscription_list` | ❌ | -| 5 | 0.445176 | `redis_list` | ❌ | +| 5 | 0.445149 | `redis_list` | ❌ | --- -## Test 362 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 367 **Expected Tool:** `redis_list` **Prompt:** List all Redis resources in my subscription @@ -15211,33 +6721,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.810504 | `redis_list` | ✅ **EXPECTED** | -| 2 | 0.587836 | `grafana_list` | ❌ | -| 3 | 0.512954 | `kusto_cluster_list` | ❌ | -| 4 | 0.508532 | `datadog_monitoredresources_list` | ❌ | -| 5 | 0.501218 | `postgres_server_list` | ❌ | - ---- - -## Test 358 -======= -<<<<<<< HEAD -| 1 | 0.810487 | `redis_list` | ✅ **EXPECTED** | -| 2 | 0.587872 | `grafana_list` | ❌ | -| 3 | 0.512995 | `kusto_cluster_list` | ❌ | -| 4 | 0.508555 | `datadog_monitoredresources_list` | ❌ | -| 5 | 0.501183 | `postgres_server_list` | ❌ | - ---- - -## Test 348 -======= | 1 | 0.810504 | `redis_list` | ✅ **EXPECTED** | -======= -| 1 | 0.810532 | `redis_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.587836 | `grafana_list` | ❌ | | 3 | 0.512954 | `kusto_cluster_list` | ❌ | | 4 | 0.508531 | `datadog_monitoredresources_list` | ❌ | @@ -15245,13 +6729,7 @@ --- -<<<<<<< HEAD -## Test 358 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 363 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 368 **Expected Tool:** 
`redis_list` **Prompt:** Show me my Redis resources @@ -15260,42 +6738,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.685128 | `redis_list` | ✅ **EXPECTED** | -| 2 | 0.374327 | `grafana_list` | ❌ | -| 3 | 0.364197 | `datadog_monitoredresources_list` | ❌ | -<<<<<<< HEAD -| 4 | 0.359659 | `mysql_server_list` | ❌ | -======= -<<<<<<< HEAD -| 4 | 0.359709 | `mysql_server_list` | ❌ | -======= -| 4 | 0.359659 | `mysql_server_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.685197 | `redis_list` | ✅ **EXPECTED** | | 2 | 0.374328 | `grafana_list` | ❌ | | 3 | 0.364197 | `datadog_monitoredresources_list` | ❌ | -| 4 | 0.359659 | `mysql_server_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 5 | 0.331502 | `mysql_database_list` | ❌ | +| 4 | 0.359774 | `mysql_server_list` | ❌ | +| 5 | 0.331841 | `mysql_database_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 359 -======= -<<<<<<< HEAD -## Test 349 -======= -## Test 359 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 364 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 369 **Expected Tool:** `redis_list` **Prompt:** Show me the Redis resources in my subscription @@ -15304,40 +6755,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.781276 | `redis_list` | ✅ **EXPECTED** | +| 1 | 0.781228 | `redis_list` | ✅ **EXPECTED** | | 2 | 0.539177 | `grafana_list` | ❌ | | 3 | 0.449276 | `datadog_monitoredresources_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD | 4 | 0.449014 | `postgres_server_list` | ❌ | -======= -<<<<<<< HEAD -| 4 | 0.448989 | `postgres_server_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.449014 | `postgres_server_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 
0.442854 | `kusto_cluster_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 360 -======= -## Test 350 -======= -| 4 | 0.449014 | `postgres_server_list` | ❌ | -| 5 | 0.442860 | `kusto_cluster_list` | ❌ | - ---- - -## Test 360 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 365 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 370 **Expected Tool:** `redis_list` **Prompt:** Show me my Redis caches @@ -15346,40 +6772,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.572836 | `redis_list` | ✅ **EXPECTED** | -| 2 | 0.316630 | `mysql_database_list` | ❌ | +| 1 | 0.572767 | `redis_list` | ✅ **EXPECTED** | +| 2 | 0.316869 | `mysql_database_list` | ❌ | | 3 | 0.301786 | `postgres_database_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.286513 | `mysql_server_list` | ❌ | -======= -<<<<<<< HEAD -| 4 | 0.286570 | `mysql_server_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.286513 | `mysql_server_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 4 | 0.286679 | `mysql_server_list` | ❌ | | 5 | 0.273014 | `kusto_cluster_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 361 -======= -## Test 351 -======= -| 4 | 0.286513 | `mysql_server_list` | ❌ | -| 5 | 0.272972 | `kusto_cluster_list` | ❌ | - ---- - -## Test 361 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 366 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 371 **Expected Tool:** `redis_list` **Prompt:** Get Redis clusters @@ -15388,48 +6789,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.478070 | `redis_list` | ✅ **EXPECTED** | -| 2 | 0.456308 | `kusto_cluster_list` | ❌ | -| 3 | 0.384630 | `kusto_cluster_get` | ❌ | -| 4 | 0.359935 | `kusto_database_list` | ❌ | -| 5 | 
0.343305 | `aks_cluster_get` | ❌ | - ---- - -## Test 362 -======= -<<<<<<< HEAD -| 1 | 0.478109 | `redis_list` | ✅ **EXPECTED** | -| 2 | 0.456382 | `kusto_cluster_list` | ❌ | -| 3 | 0.384637 | `kusto_cluster_get` | ❌ | -| 4 | 0.359466 | `kusto_database_list` | ❌ | -| 5 | 0.343367 | `aks_cluster_get` | ❌ | - ---- - -## Test 352 -======= | 1 | 0.478070 | `redis_list` | ✅ **EXPECTED** | -| 2 | 0.456311 | `kusto_cluster_list` | ❌ | -======= -| 1 | 0.478136 | `redis_list` | ✅ **EXPECTED** | | 2 | 0.456309 | `kusto_cluster_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.384630 | `kusto_cluster_get` | ❌ | -| 4 | 0.359434 | `kusto_database_list` | ❌ | +| 4 | 0.359573 | `kusto_database_list` | ❌ | | 5 | 0.343305 | `aks_cluster_get` | ❌ | --- -<<<<<<< HEAD -## Test 362 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 367 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 372 **Expected Tool:** `group_list` **Prompt:** List all resource groups in my subscription @@ -15438,41 +6806,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.755935 | `group_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.566552 | `workbooks_list` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.566497 | `workbooks_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.564566 | `loadtesting_testresource_list` | ❌ | -| 4 | 0.552633 | `datadog_monitoredresources_list` | ❌ | -| 5 | 0.549477 | `monitor_webtests_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 363 -======= -## Test 353 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.755933 | `group_list` | ✅ **EXPECTED** | | 2 | 0.566552 | `workbooks_list` | ❌ | | 3 | 0.564566 | `loadtesting_testresource_list` | ❌ | | 4 | 0.552633 | `datadog_monitoredresources_list` | ❌ | -| 5 | 0.549477 | `monitor_webtests_list` | ❌ | +| 5 | 0.549537 | `monitor_webtests_list` | ❌ | 
--- -<<<<<<< HEAD -## Test 363 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 368 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 373 **Expected Tool:** `group_list` **Prompt:** Show me my resource groups @@ -15481,39 +6823,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.529504 | `group_list` | ✅ **EXPECTED** | -| 2 | 0.464725 | `redis_list` | ❌ | +| 1 | 0.529503 | `group_list` | ✅ **EXPECTED** | +| 2 | 0.464690 | `redis_list` | ❌ | | 3 | 0.463685 | `datadog_monitoredresources_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.462391 | `mysql_server_list` | ❌ | -======= -<<<<<<< HEAD -| 4 | 0.462388 | `mysql_server_list` | ❌ | -======= -| 4 | 0.462391 | `mysql_server_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.462391 | `mysql_server_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 4 | 0.462699 | `mysql_server_list` | ❌ | | 5 | 0.460280 | `loadtesting_testresource_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 364 -======= -<<<<<<< HEAD -## Test 354 -======= -## Test 364 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 369 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 374 **Expected Tool:** `group_list` **Prompt:** Show me the resource groups in my subscription @@ -15522,32 +6840,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.665772 | `group_list` | ✅ **EXPECTED** | +| 1 | 0.665819 | `group_list` | ✅ **EXPECTED** | | 2 | 0.532656 | `datadog_monitoredresources_list` | ❌ | -<<<<<<< HEAD | 3 | 0.532505 | `redis_list` | ❌ | -| 4 | 0.532015 | `eventgrid_topic_list` | ❌ | -======= -| 3 | 0.532524 | `redis_list` | ❌ | | 4 | 0.532054 | `eventgrid_topic_list` | ❌ | ->>>>>>> 
e2fd2eac (refactor tts mcp tool) | 5 | 0.531920 | `resourcehealth_availability-status_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 365 -======= -<<<<<<< HEAD -## Test 355 -======= -## Test 365 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 370 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 375 **Expected Tool:** `resourcehealth_availability-status_get` **Prompt:** Get the availability status for resource @@ -15556,47 +6857,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.556926 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | -| 2 | 0.538273 | `resourcehealth_availability-status_list` | ❌ | -| 3 | 0.378030 | `quota_usage_check` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.556629 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | -| 2 | 0.538277 | `resourcehealth_availability-status_list` | ❌ | -| 3 | 0.377966 | `quota_usage_check` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.556629 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | | 2 | 0.538273 | `resourcehealth_availability-status_list` | ❌ | | 3 | 0.377586 | `quota_usage_check` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.373112 | `monitor_healthmodels_entity_get` | ❌ | -| 5 | 0.349981 | `datadog_monitoredresources_list` | ❌ | - ---- - -<<<<<<< HEAD -<<<<<<< HEAD -## Test 366 -======= -## Test 356 -======= -| 1 | 0.555432 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | -| 2 | 0.538273 | `resourcehealth_availability-status_list` | ❌ | -| 3 | 0.404305 | `foundry_openai_models-list` | ❌ | -| 4 | 0.377586 | `quota_usage_check` | ❌ | -| 5 | 0.373112 | `monitor_healthmodels_entity_gethealth` | ❌ | +| 5 | 0.349980 | `datadog_monitoredresources_list` | ❌ | --- -## Test 366 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 
58ab8585 (update prompts and tool description evaluator) -======= -## Test 371 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 376 **Expected Tool:** `resourcehealth_availability-status_get` **Prompt:** Show me the health status of the storage account @@ -15605,44 +6874,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.576591 | `storage_account_get` | ❌ | -| 2 | 0.564706 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | -| 3 | 0.555636 | `storage_blob_container_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.576617 | `storage_account_get` | ❌ | | 2 | 0.564128 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | | 3 | 0.556167 | `storage_blob_container_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.487207 | `storage_blob_get` | ❌ | -| 5 | 0.466885 | `resourcehealth_availability-status_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 367 -======= -## Test 357 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.576591 | `storage_account_get` | ❌ | -| 2 | 0.564128 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | -| 3 | 0.556369 | `storage_blob_container_get` | ❌ | | 4 | 0.487207 | `storage_blob_get` | ❌ | | 5 | 0.466885 | `resourcehealth_availability-status_list` | ❌ | --- -<<<<<<< HEAD -## Test 367 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 372 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 377 **Expected Tool:** `resourcehealth_availability-status_get` **Prompt:** What is the availability status of virtual machine in resource group ? 
@@ -15651,44 +6891,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.577398 | `resourcehealth_availability-status_list` | ❌ | -| 2 | 0.502794 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | -| 3 | 0.424939 | `mysql_server_list` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.577529 | `resourcehealth_availability-status_list` | ❌ | -| 2 | 0.501568 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | -| 3 | 0.424957 | `mysql_server_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.412025 | `loadtesting_testresource_list` | ❌ | -| 5 | 0.393479 | `managedlustre_fs_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 368 -======= -## Test 358 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.577398 | `resourcehealth_availability-status_list` | ❌ | | 2 | 0.501568 | `resourcehealth_availability-status_get` | ✅ **EXPECTED** | -| 3 | 0.424939 | `mysql_server_list` | ❌ | +| 3 | 0.425180 | `mysql_server_list` | ❌ | | 4 | 0.412025 | `loadtesting_testresource_list` | ❌ | | 5 | 0.393479 | `managedlustre_fs_list` | ❌ | --- -<<<<<<< HEAD -## Test 368 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 373 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 378 **Expected Tool:** `resourcehealth_availability-status_list` **Prompt:** List availability status for all resources in my subscription @@ -15698,30 +6909,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.737219 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.585501 | `redis_list` | ❌ | -======= -| 2 | 0.585487 | `redis_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.549914 | `loadtesting_testresource_list` | ❌ | | 4 | 0.548549 | `grafana_list` | ❌ | | 5 | 0.544505 | `subscription_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 
369 -======= -<<<<<<< HEAD -## Test 359 -======= -## Test 369 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 374 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 379 **Expected Tool:** `resourcehealth_availability-status_list` **Prompt:** Show me the health status of all my Azure resources @@ -15730,44 +6925,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.644982 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | -| 2 | 0.544917 | `resourcehealth_availability-status_get` | ❌ | -| 3 | 0.509740 | `resourcehealth_health-events_list` | ❌ | -| 4 | 0.508766 | `quota_usage_check` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.644908 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | -======= | 1 | 0.644982 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.545208 | `resourcehealth_availability-status_get` | ❌ | | 3 | 0.509740 | `resourcehealth_health-events_list` | ❌ | | 4 | 0.508252 | `quota_usage_check` | ❌ | -<<<<<<< HEAD ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 5 | 0.505776 | `redis_list` | ❌ | --- -<<<<<<< HEAD -## Test 370 -======= -<<<<<<< HEAD -## Test 360 -======= -## Test 370 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 5 | 0.505799 | `redis_list` | ❌ | - ---- - -## Test 375 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 380 **Expected Tool:** `resourcehealth_availability-status_list` **Prompt:** What resources in resource group have health issues? 
@@ -15776,37 +6942,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.596890 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | -| 2 | 0.550812 | `resourcehealth_availability-status_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.596817 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | -======= | 1 | 0.596890 | `resourcehealth_availability-status_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.549900 | `resourcehealth_availability-status_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) | 3 | 0.496640 | `resourcehealth_health-events_list` | ❌ | | 4 | 0.441921 | `applens_resource_diagnose` | ❌ | | 5 | 0.433614 | `loadtesting_testresource_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 371 -======= -<<<<<<< HEAD -## Test 361 -======= -## Test 371 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 376 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 381 **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** List all service health events in my subscription @@ -15815,21 +6959,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.690720 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | -| 2 | 0.553485 | `search_service_list` | ❌ | -| 3 | 0.534169 | `eventgrid_topic_list` | ❌ | -| 4 | 0.529200 | `eventgrid_subscription_list` | ❌ | -| 5 | 0.518372 | `resourcehealth_availability-status_list` | ❌ | - ---- - -## Test 372 -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.690719 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | | 2 | 0.554895 | `search_service_list` | ❌ | | 3 | 0.534250 | `eventgrid_topic_list` | ❌ | @@ -15838,16 +6967,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 362 -======= -## Test 372 ->>>>>>> 84ad4f44 (update 
prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 377 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 382 **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** Show me Azure service health events for subscription @@ -15857,33 +6977,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.686448 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.534707 | `eventgrid_subscription_list` | ❌ | -| 3 | 0.513302 | `search_service_list` | ❌ | -| 4 | 0.513237 | `eventgrid_topic_list` | ❌ | -======= -======= -| 1 | 0.686448 | `resourcehealth_service-health-events_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -| 2 | 0.534556 | `eventgrid_subscription_list` | ❌ | -| 3 | 0.513815 | `search_service_list` | ❌ | -| 4 | 0.513259 | `eventgrid_topic_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 5 | 0.501121 | `subscription_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 373 -======= -<<<<<<< HEAD -## Test 363 -======= -## Test 373 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 2 | 0.534556 | `eventgrid_subscription_list` | ❌ | | 3 | 0.513815 | `search_service_list` | ❌ | | 4 | 0.513259 | `eventgrid_topic_list` | ❌ | @@ -15891,8 +6984,7 @@ --- -## Test 378 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 383 **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** What service issues have occurred in the last 30 days? 
@@ -15901,47 +6993,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.450841 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | | 2 | 0.267663 | `applens_resource_diagnose` | ❌ | -| 3 | 0.245720 | `cloudarchitect_design` | ❌ | -| 4 | 0.216847 | `resourcehealth_availability-status_list` | ❌ | -| 5 | 0.211043 | `search_service_list` | ❌ | - ---- - -## Test 374 -======= -<<<<<<< HEAD -| 1 | 0.450909 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | -| 2 | 0.267752 | `applens_resource_diagnose` | ❌ | | 3 | 0.245709 | `cloudarchitect_design` | ❌ | -| 4 | 0.217130 | `resourcehealth_availability-status_list` | ❌ | -| 5 | 0.211900 | `search_service_list` | ❌ | - ---- - -## Test 364 -======= -| 1 | 0.450841 | `resourcehealth_service-health-events_list` | ❌ | -======= -| 1 | 0.450841 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 2 | 0.267663 | `applens_resource_diagnose` | ❌ | -| 3 | 0.245720 | `cloudarchitect_design` | ❌ | | 4 | 0.216847 | `resourcehealth_availability-status_list` | ❌ | | 5 | 0.211842 | `search_service_list` | ❌ | --- -<<<<<<< HEAD -## Test 374 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 379 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 384 **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** List active service health events in my subscription @@ -15951,22 +7011,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.685391 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.527255 | `eventgrid_subscription_list` | ❌ | -| 3 | 0.523975 | `eventgrid_topic_list` | ❌ | -| 4 | 0.518668 | `search_service_list` | ❌ | -| 5 | 0.502064 | `resourcehealth_availability-status_list` | ❌ | - ---- - -## Test 375 -======= -======= -| 1 | 0.685391 | 
`resourcehealth_service-health-events_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.527905 | `eventgrid_subscription_list` | ❌ | | 3 | 0.524063 | `eventgrid_topic_list` | ❌ | | 4 | 0.520197 | `search_service_list` | ❌ | @@ -15974,16 +7018,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 365 -======= -## Test 375 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 380 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 385 **Expected Tool:** `resourcehealth_health-events_list` **Prompt:** Show me planned maintenance events for my Azure services @@ -15993,22 +7028,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.565851 | `resourcehealth_health-events_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.436322 | `search_service_list` | ❌ | -| 3 | 0.404191 | `eventgrid_subscription_list` | ❌ | -| 4 | 0.402493 | `resourcehealth_availability-status_list` | ❌ | -| 5 | 0.398050 | `quota_usage_check` | ❌ | - ---- - -## Test 376 -======= -======= -| 1 | 0.565851 | `resourcehealth_service-health-events_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.437868 | `search_service_list` | ❌ | | 3 | 0.403665 | `eventgrid_subscription_list` | ❌ | | 4 | 0.402493 | `resourcehealth_availability-status_list` | ❌ | @@ -16016,16 +7035,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 366 -======= -## Test 376 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 381 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 386 **Expected Tool:** `servicebus_queue_details` **Prompt:** Show me the details of service bus queue @@ -16034,39 +7044,15 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| -| 1 | 0.642876 | `servicebus_queue_details` | ✅ **EXPECTED** | -| 2 | 0.460932 | `servicebus_topic_subscription_details` | ❌ | -| 3 | 0.437000 | `servicebus_topic_details` | ❌ | -| 4 | 0.385812 | `search_knowledge_base_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 5 | 0.384139 | `storage_account_get` | ❌ | - ---- - -## Test 377 -======= -<<<<<<< HEAD -| 5 | 0.384133 | `storage_account_get` | ❌ | - ---- - -## Test 367 -======= -| 5 | 0.384187 | `storage_account_get` | ❌ | - ---- - -## Test 377 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 5 | 0.384139 | `storage_account_get` | ❌ | +| 1 | 0.642896 | `servicebus_queue_details` | ✅ **EXPECTED** | +| 2 | 0.460952 | `servicebus_topic_subscription_details` | ❌ | +| 3 | 0.436934 | `servicebus_topic_details` | ❌ | +| 4 | 0.385791 | `search_knowledge_base_get` | ❌ | +| 5 | 0.384199 | `storage_account_get` | ❌ | --- -## Test 382 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 387 **Expected Tool:** `servicebus_topic_details` **Prompt:** Show me the details of service bus topic @@ -16075,27 +7061,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.642762 | `servicebus_topic_details` | ✅ **EXPECTED** | -| 2 | 0.571860 | `servicebus_topic_subscription_details` | ❌ | +| 1 | 0.642952 | `servicebus_topic_details` | ✅ **EXPECTED** | +| 2 | 0.571861 | `servicebus_topic_subscription_details` | ❌ | | 3 | 0.483976 | `servicebus_queue_details` | ❌ | -| 4 | 0.482735 | `eventgrid_topic_list` | ❌ | -| 5 | 0.457603 | `eventgrid_subscription_list` | ❌ | +| 4 | 0.482958 | `eventgrid_topic_list` | ❌ | +| 5 | 0.458711 | `eventgrid_subscription_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 378 -======= -<<<<<<< HEAD -## Test 368 -======= -## Test 378 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) 
-======= -## Test 383 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 388 **Expected Tool:** `servicebus_topic_subscription_details` **Prompt:** Show me the details of service bus subscription @@ -16105,26 +7079,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.633187 | `servicebus_topic_subscription_details` | ✅ **EXPECTED** | -| 2 | 0.517516 | `servicebus_topic_details` | ❌ | +| 2 | 0.517623 | `servicebus_topic_details` | ❌ | | 3 | 0.494515 | `servicebus_queue_details` | ❌ | -| 4 | 0.493776 | `eventgrid_topic_list` | ❌ | -| 5 | 0.471876 | `eventgrid_subscription_list` | ❌ | +| 4 | 0.493853 | `eventgrid_topic_list` | ❌ | +| 5 | 0.472128 | `eventgrid_subscription_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 379 -======= -<<<<<<< HEAD -## Test 369 -======= -## Test 379 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 384 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 389 **Expected Tool:** `signalr_runtime_get` **Prompt:** Show me the details of SignalR @@ -16133,32 +7095,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.532742 | `signalr_runtime_get` | ✅ **EXPECTED** | -| 2 | 0.355028 | `redis_list` | ❌ | -======= | 1 | 0.532544 | `signalr_runtime_get` | ✅ **EXPECTED** | -| 2 | 0.355082 | `redis_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 2 | 0.355028 | `redis_list` | ❌ | | 3 | 0.329804 | `foundry_resource_get` | ❌ | | 4 | 0.319981 | `sql_server_show` | ❌ | | 5 | 0.304420 | `servicebus_queue_details` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 380 -======= -<<<<<<< HEAD -## Test 370 -======= -## Test 380 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 385 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 390 **Expected Tool:** `signalr_runtime_get` **Prompt:** 
Show me the network information of SignalR runtime @@ -16167,32 +7112,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.573540 | `signalr_runtime_get` | ✅ **EXPECTED** | +| 1 | 0.573446 | `signalr_runtime_get` | ✅ **EXPECTED** | | 2 | 0.337342 | `sql_server_show` | ❌ | | 3 | 0.306559 | `foundry_resource_get` | ❌ | -<<<<<<< HEAD | 4 | 0.305021 | `redis_list` | ❌ | -| 5 | 0.301114 | `servicebus_topic_details` | ❌ | - ---- - -<<<<<<< HEAD -## Test 381 -======= -<<<<<<< HEAD -## Test 371 -======= -## Test 381 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.305083 | `redis_list` | ❌ | | 5 | 0.300956 | `servicebus_topic_details` | ❌ | --- -## Test 386 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 391 **Expected Tool:** `signalr_runtime_get` **Prompt:** Describe the SignalR runtime in resource group @@ -16201,40 +7129,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.710281 | `signalr_runtime_get` | ✅ **EXPECTED** | +| 1 | 0.710353 | `signalr_runtime_get` | ✅ **EXPECTED** | | 2 | 0.411396 | `loadtesting_testresource_list` | ❌ | | 3 | 0.410606 | `foundry_resource_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.399412 | `resourcehealth_availability-status_list` | ❌ | -| 5 | 0.382028 | `sql_server_list` | ❌ | - ---- - -## Test 382 -======= -<<<<<<< HEAD -| 4 | 0.399745 | `resourcehealth_availability-status_list` | ❌ | -| 5 | 0.382472 | `sql_server_list` | ❌ | - ---- - -## Test 372 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.399412 | `resourcehealth_availability-status_list` | ❌ | -| 5 | 0.382028 | `sql_server_list` | ❌ | +| 5 | 0.382099 | `sql_server_list` | ❌ | --- -<<<<<<< HEAD -## Test 382 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 387 ->>>>>>> e2fd2eac (refactor tts mcp tool) 
+## Test 392 **Expected Tool:** `signalr_runtime_get` **Prompt:** Get information about my SignalR runtime in @@ -16243,41 +7146,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.715701 | `signalr_runtime_get` | ✅ **EXPECTED** | -| 2 | 0.458894 | `foundry_resource_get` | ❌ | -| 3 | 0.431212 | `resourcehealth_availability-status_list` | ❌ | -| 4 | 0.430721 | `loadtesting_testresource_list` | ❌ | -| 5 | 0.417313 | `functionapp_get` | ❌ | - ---- - -## Test 383 -======= -<<<<<<< HEAD -| 1 | 0.715913 | `signalr_runtime_get` | ✅ **EXPECTED** | -| 2 | 0.459979 | `foundry_resource_get` | ❌ | -| 3 | 0.431800 | `resourcehealth_availability-status_list` | ❌ | -| 4 | 0.431393 | `loadtesting_testresource_list` | ❌ | -| 5 | 0.417497 | `functionapp_get` | ❌ | - ---- - -## Test 373 -======= -| 1 | 0.715937 | `signalr_runtime_get` | ✅ **EXPECTED** | -| 2 | 0.459543 | `foundry_resource_get` | ❌ | -| 3 | 0.431534 | `resourcehealth_availability-status_list` | ❌ | -| 4 | 0.430926 | `loadtesting_testresource_list` | ❌ | -| 5 | 0.417653 | `functionapp_get` | ❌ | - ---- - -## Test 383 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.715974 | `signalr_runtime_get` | ✅ **EXPECTED** | | 2 | 0.459045 | `foundry_resource_get` | ❌ | | 3 | 0.430829 | `resourcehealth_availability-status_list` | ❌ | @@ -16286,8 +7154,7 @@ --- -## Test 388 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 393 **Expected Tool:** `signalr_runtime_get` **Prompt:** Show all the SignalRs information in @@ -16296,41 +7163,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.563883 | `signalr_runtime_get` | ✅ **EXPECTED** | -| 2 | 0.501077 | `redis_list` | ❌ | -<<<<<<< HEAD -| 3 | 0.494478 | `resourcehealth_availability-status_list` | ❌ | -======= -<<<<<<< HEAD -| 3 | 0.494808 | `resourcehealth_availability-status_list` | ❌ 
| -======= -| 3 | 0.494478 | `resourcehealth_availability-status_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.564072 | `signalr_runtime_get` | ✅ **EXPECTED** | -| 2 | 0.501156 | `redis_list` | ❌ | +| 2 | 0.501077 | `redis_list` | ❌ | | 3 | 0.494478 | `resourcehealth_availability-status_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.481428 | `loadtesting_testresource_list` | ❌ | -| 5 | 0.462090 | `mysql_server_list` | ❌ | +| 5 | 0.462417 | `mysql_server_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 384 -======= -<<<<<<< HEAD -## Test 374 -======= -## Test 384 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 389 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 394 **Expected Tool:** `signalr_runtime_get` **Prompt:** List all SignalRs in my subscription @@ -16339,39 +7180,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.530514 | `signalr_runtime_get` | ✅ **EXPECTED** | -| 2 | 0.507654 | `postgres_server_list` | ❌ | -| 3 | 0.495157 | `redis_list` | ❌ | -<<<<<<< HEAD -| 4 | 0.494498 | `kusto_cluster_list` | ❌ | -| 5 | 0.487906 | `subscription_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 385 -======= -## Test 375 -======= -| 4 | 0.494513 | `kusto_cluster_list` | ❌ | -======= | 1 | 0.530646 | `signalr_runtime_get` | ✅ **EXPECTED** | | 2 | 0.507653 | `postgres_server_list` | ❌ | -| 3 | 0.495179 | `redis_list` | ❌ | +| 3 | 0.495157 | `redis_list` | ❌ | | 4 | 0.494498 | `kusto_cluster_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.487856 | `subscription_list` | ❌ | --- -<<<<<<< HEAD -## Test 385 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 390 ->>>>>>> e2fd2eac (refactor tts 
mcp tool) +## Test 395 **Expected Tool:** `sql_db_create` **Prompt:** Create a new SQL database named in server @@ -16381,31 +7198,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.516780 | `sql_db_create` | ✅ **EXPECTED** | -| 2 | 0.470892 | `sql_server_create` | ❌ | -<<<<<<< HEAD -| 3 | 0.420389 | `sql_db_rename` | ❌ | -| 4 | 0.408515 | `sql_db_delete` | ❌ | -======= +| 2 | 0.470913 | `sql_server_create` | ❌ | | 3 | 0.420504 | `sql_db_rename` | ❌ | -| 4 | 0.408628 | `sql_db_delete` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 4 | 0.408515 | `sql_db_delete` | ❌ | | 5 | 0.404860 | `sql_server_delete` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 386 -======= -<<<<<<< HEAD -## Test 376 -======= -## Test 386 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 391 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 396 **Expected Tool:** `sql_db_create` **Prompt:** Create a SQL database with Basic tier in server @@ -16415,40 +7215,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.571760 | `sql_db_create` | ✅ **EXPECTED** | -| 2 | 0.459672 | `sql_server_create` | ❌ | -<<<<<<< HEAD -| 3 | 0.437525 | `sql_server_delete` | ❌ | -======= +| 2 | 0.459683 | `sql_server_create` | ❌ | | 3 | 0.437526 | `sql_server_delete` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.420843 | `sql_db_show` | ❌ | -| 5 | 0.417661 | `sql_db_delete` | ❌ | - ---- - -<<<<<<< HEAD -## Test 387 -======= -## Test 377 -======= -| 4 | 0.424021 | `appservice_database_add` | ❌ | -| 5 | 0.420843 | `sql_db_show` | ❌ | - ---- - -## Test 387 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 4 | 0.420843 | `sql_db_show` | ❌ | -| 5 | 0.417795 | `sql_db_delete` | ❌ | +| 5 | 0.417662 | `sql_db_delete` 
| ❌ | --- -## Test 392 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 397 **Expected Tool:** `sql_db_create` **Prompt:** Create a new database called on SQL server in resource group @@ -16458,37 +7232,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.604472 | `sql_db_create` | ✅ **EXPECTED** | -| 2 | 0.545906 | `sql_server_create` | ❌ | -| 3 | 0.503938 | `sql_db_rename` | ❌ | +| 2 | 0.545986 | `sql_server_create` | ❌ | +| 3 | 0.504013 | `sql_db_rename` | ❌ | | 4 | 0.494377 | `sql_db_show` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 5 | 0.473975 | `sql_db_list` | ❌ | - ---- - -## Test 388 -======= -<<<<<<< HEAD -| 5 | 0.473859 | `sql_db_list` | ❌ | - ---- - -## Test 378 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.473975 | `sql_db_list` | ❌ | --- -<<<<<<< HEAD -## Test 388 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 393 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 398 **Expected Tool:** `sql_db_delete` **Prompt:** Delete the SQL database from server @@ -16497,27 +7248,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.568205 | `sql_db_delete` | ✅ **EXPECTED** | +| 1 | 0.568196 | `sql_db_delete` | ✅ **EXPECTED** | | 2 | 0.567412 | `sql_server_delete` | ❌ | -| 3 | 0.391436 | `sql_db_rename` | ❌ | -| 4 | 0.386721 | `sql_server_firewall-rule_delete` | ❌ | +| 3 | 0.391509 | `sql_db_rename` | ❌ | +| 4 | 0.386564 | `sql_server_firewall-rule_delete` | ❌ | | 5 | 0.364776 | `sql_db_show` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 389 -======= -<<<<<<< HEAD -## Test 379 -======= -## Test 389 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 394 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 399 **Expected Tool:** `sql_db_delete` **Prompt:** Remove database from SQL server in 
resource group @@ -16526,46 +7265,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.567513 | `sql_server_delete` | ❌ | | 2 | 0.543440 | `sql_db_delete` | ✅ **EXPECTED** | | 3 | 0.500756 | `sql_db_show` | ❌ | -| 4 | 0.481023 | `sql_db_rename` | ❌ | -| 5 | 0.478729 | `sql_db_list` | ❌ | - ---- - -## Test 390 -======= -<<<<<<< HEAD -| 1 | 0.567481 | `sql_server_delete` | ❌ | -| 2 | 0.543378 | `sql_db_delete` | ✅ **EXPECTED** | -| 3 | 0.500746 | `sql_db_show` | ❌ | -| 4 | 0.480981 | `sql_db_rename` | ❌ | -| 5 | 0.478583 | `sql_db_list` | ❌ | - ---- - -## Test 380 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.567513 | `sql_server_delete` | ❌ | -| 2 | 0.543468 | `sql_db_delete` | ✅ **EXPECTED** | -| 3 | 0.500756 | `sql_db_show` | ❌ | | 4 | 0.481083 | `sql_db_rename` | ❌ | | 5 | 0.478729 | `sql_db_list` | ❌ | --- -<<<<<<< HEAD -## Test 390 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 395 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 400 **Expected Tool:** `sql_db_delete` **Prompt:** Delete the database called on server @@ -16574,32 +7282,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.509916 | `sql_db_delete` | ✅ **EXPECTED** | -| 2 | 0.490893 | `sql_server_delete` | ❌ | -======= -| 1 | 0.509939 | `sql_db_delete` | ✅ **EXPECTED** | | 2 | 0.490892 | `sql_server_delete` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.364494 | `postgres_database_list` | ❌ | -| 4 | 0.355416 | `mysql_database_list` | ❌ | -| 5 | 0.347703 | `sql_db_rename` | ❌ | +| 4 | 0.354710 | `mysql_database_list` | ❌ | +| 5 | 0.347837 | `sql_db_rename` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 391 -======= -<<<<<<< HEAD -## Test 381 -======= -## Test 391 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool 
description evaluator) -======= -## Test 396 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 401 **Expected Tool:** `sql_db_list` **Prompt:** List all databases in the Azure SQL server @@ -16608,46 +7299,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.643138 | `sql_db_list` | ✅ **EXPECTED** | -| 2 | 0.639644 | `mysql_database_list` | ❌ | -| 3 | 0.609116 | `postgres_database_list` | ❌ | -| 4 | 0.602872 | `cosmos_database_list` | ❌ | -| 5 | 0.569464 | `kusto_database_list` | ❌ | - ---- - -## Test 392 -======= -<<<<<<< HEAD -| 1 | 0.643202 | `sql_db_list` | ✅ **EXPECTED** | -| 2 | 0.639694 | `mysql_database_list` | ❌ | -| 3 | 0.609178 | `postgres_database_list` | ❌ | -| 4 | 0.602890 | `cosmos_database_list` | ❌ | -| 5 | 0.570103 | `kusto_database_list` | ❌ | - ---- - -## Test 382 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.643186 | `sql_db_list` | ✅ **EXPECTED** | -| 2 | 0.639694 | `mysql_database_list` | ❌ | +| 2 | 0.639398 | `mysql_database_list` | ❌ | | 3 | 0.609178 | `postgres_database_list` | ❌ | | 4 | 0.602890 | `cosmos_database_list` | ❌ | -| 5 | 0.570140 | `kusto_database_list` | ❌ | +| 5 | 0.570278 | `kusto_database_list` | ❌ | --- -<<<<<<< HEAD -## Test 392 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 397 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 402 **Expected Tool:** `sql_db_list` **Prompt:** Show me all the databases configuration details in the Azure SQL server @@ -16657,38 +7317,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.617746 | `sql_server_show` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD | 2 | 0.609322 | `sql_db_list` | ✅ **EXPECTED** | -======= -<<<<<<< HEAD -| 2 | 0.609291 | `sql_db_list` | ✅ **EXPECTED** | -======= -| 2 | 0.609322 | `sql_db_list` | ✅ **EXPECTED** | ->>>>>>> 84ad4f44 (update prompts and tool description 
evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 2 | 0.609322 | `sql_db_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 3 | 0.557353 | `mysql_database_list` | ❌ | +| 3 | 0.557245 | `mysql_database_list` | ❌ | | 4 | 0.553488 | `mysql_server_config_get` | ❌ | | 5 | 0.524274 | `sql_db_show` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 393 -======= -<<<<<<< HEAD -## Test 383 -======= -## Test 393 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 398 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 403 **Expected Tool:** `sql_db_rename` **Prompt:** Rename the SQL database on server to @@ -16697,46 +7333,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.593251 | `sql_db_rename` | ✅ **EXPECTED** | -| 2 | 0.425282 | `sql_server_delete` | ❌ | -| 3 | 0.416207 | `sql_db_delete` | ❌ | -| 4 | 0.396947 | `sql_db_create` | ❌ | -| 5 | 0.346018 | `sql_db_show` | ❌ | - ---- - -## Test 394 -======= -<<<<<<< HEAD -| 1 | 0.593308 | `sql_db_rename` | ✅ **EXPECTED** | -| 2 | 0.425296 | `sql_server_delete` | ❌ | -| 3 | 0.416187 | `sql_db_delete` | ❌ | -| 4 | 0.396109 | `sql_db_create` | ❌ | -| 5 | 0.345991 | `sql_db_show` | ❌ | - ---- - -## Test 384 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.593348 | `sql_db_rename` | ✅ **EXPECTED** | -| 2 | 0.425282 | `sql_server_delete` | ❌ | -| 3 | 0.416267 | `sql_db_delete` | ❌ | -| 4 | 0.396947 | `sql_db_create` | ❌ | -| 5 | 0.346018 | `sql_db_show` | ❌ | +| 1 | 0.593278 | `sql_db_rename` | ✅ **EXPECTED** | +| 2 | 0.425161 | `sql_server_delete` | ❌ | +| 3 | 0.416057 | `sql_db_delete` | ❌ | +| 4 | 0.396824 | `sql_db_create` | ❌ | +| 5 | 0.345805 | `sql_db_show` | ❌ | --- -<<<<<<< HEAD -## Test 394 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool 
description evaluator) -======= -## Test 399 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 404 **Expected Tool:** `sql_db_rename` **Prompt:** Rename my Azure SQL database to on server @@ -16745,51 +7350,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.711257 | `sql_db_rename` | ✅ **EXPECTED** | -| 2 | 0.516770 | `sql_server_delete` | ❌ | -| 3 | 0.506834 | `sql_db_delete` | ❌ | -| 4 | 0.501963 | `sql_db_create` | ❌ | -| 5 | 0.434094 | `sql_server_show` | ❌ | - ---- - -## Test 395 -======= -<<<<<<< HEAD -| 1 | 0.710788 | `sql_db_rename` | ✅ **EXPECTED** | -| 2 | 0.516432 | `sql_server_delete` | ❌ | -| 3 | 0.506388 | `sql_db_delete` | ❌ | -| 4 | 0.500926 | `sql_db_create` | ❌ | -| 5 | 0.434133 | `sql_server_show` | ❌ | - ---- - -## Test 385 -======= -| 1 | 0.710925 | `sql_db_rename` | ✅ **EXPECTED** | -| 2 | 0.516662 | `sql_server_delete` | ❌ | -| 3 | 0.506572 | `sql_db_delete` | ❌ | -| 4 | 0.501347 | `sql_db_create` | ❌ | -| 5 | 0.433966 | `sql_server_show` | ❌ | - ---- - -## Test 395 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.711063 | `sql_db_rename` | ✅ **EXPECTED** | | 2 | 0.516485 | `sql_server_delete` | ❌ | -| 3 | 0.506579 | `sql_db_delete` | ❌ | +| 3 | 0.506499 | `sql_db_delete` | ❌ | | 4 | 0.501476 | `sql_db_create` | ❌ | | 5 | 0.433897 | `sql_server_show` | ❌ | --- -## Test 400 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 405 **Expected Tool:** `sql_db_show` **Prompt:** Get the configuration details for the SQL database on server @@ -16798,46 +7367,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.610991 | `sql_server_show` | ❌ | -| 2 | 0.593150 | `postgres_server_config_get` | ❌ | -| 3 | 0.530422 | `mysql_server_config_get` | ❌ | -| 4 | 0.528136 | `sql_db_show` | ✅ **EXPECTED** | -| 5 | 0.465693 | `sql_db_list` | ❌ | - ---- - 
-## Test 396 -======= -<<<<<<< HEAD -| 1 | 0.611215 | `sql_server_show` | ❌ | -| 2 | 0.593200 | `postgres_server_config_get` | ❌ | -| 3 | 0.530520 | `mysql_server_config_get` | ❌ | -| 4 | 0.528378 | `sql_db_show` | ✅ **EXPECTED** | -| 5 | 0.465779 | `sql_db_list` | ❌ | - ---- - -## Test 386 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.610991 | `sql_server_show` | ❌ | -| 2 | 0.593150 | `postgres_server_config_get` | ❌ | -| 3 | 0.530422 | `mysql_server_config_get` | ❌ | -| 4 | 0.528136 | `sql_db_show` | ✅ **EXPECTED** | -| 5 | 0.465693 | `sql_db_list` | ❌ | +| 1 | 0.610788 | `sql_server_show` | ❌ | +| 2 | 0.593239 | `postgres_server_config_get` | ❌ | +| 3 | 0.530655 | `mysql_server_config_get` | ❌ | +| 4 | 0.528543 | `sql_db_show` | ✅ **EXPECTED** | +| 5 | 0.465617 | `sql_db_list` | ❌ | --- -<<<<<<< HEAD -## Test 396 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 401 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 406 **Expected Tool:** `sql_db_show` **Prompt:** Show me the details of SQL database in server @@ -16846,51 +7384,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.530095 | `sql_db_show` | ✅ **EXPECTED** | -| 2 | 0.503681 | `sql_server_show` | ❌ | -| 3 | 0.440073 | `sql_db_list` | ❌ | -| 4 | 0.439076 | `mysql_table_schema_get` | ❌ | -| 5 | 0.432919 | `mysql_database_list` | ❌ | - ---- - -## Test 397 -======= -<<<<<<< HEAD -| 1 | 0.530071 | `sql_db_show` | ✅ **EXPECTED** | -| 2 | 0.503602 | `sql_server_show` | ❌ | -| 3 | 0.439895 | `sql_db_list` | ❌ | -| 4 | 0.438615 | `mysql_table_schema_get` | ❌ | -| 5 | 0.432907 | `mysql_database_list` | ❌ | - ---- - -## Test 387 -======= | 1 | 0.530095 | `sql_db_show` | ✅ **EXPECTED** | | 2 | 0.503681 | `sql_server_show` | ❌ | | 3 | 0.440073 | `sql_db_list` | ❌ | | 4 | 0.438622 | `mysql_table_schema_get` | ❌ | -| 5 | 0.432919 | 
`mysql_database_list` | ❌ | +| 5 | 0.432990 | `mysql_database_list` | ❌ | --- -## Test 397 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.530040 | `sql_db_show` | ✅ **EXPECTED** | -| 2 | 0.503614 | `sql_server_show` | ❌ | -| 3 | 0.440041 | `sql_db_list` | ❌ | -| 4 | 0.438628 | `mysql_table_schema_get` | ❌ | -| 5 | 0.432915 | `mysql_database_list` | ❌ | - ---- - -## Test 402 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 407 **Expected Tool:** `sql_db_update` **Prompt:** Update the performance tier of SQL database on server @@ -16899,42 +7401,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.603271 | `sql_db_update` | ✅ **EXPECTED** | -| 2 | 0.467571 | `sql_db_create` | ❌ | -| 3 | 0.440442 | `sql_db_rename` | ❌ | -| 4 | 0.427621 | `sql_db_show` | ❌ | -| 5 | 0.413941 | `sql_server_delete` | ❌ | - ---- - -## Test 398 -======= -<<<<<<< HEAD -| 1 | 0.603537 | `sql_db_update` | ✅ **EXPECTED** | -| 2 | 0.467332 | `sql_db_create` | ❌ | -| 3 | 0.440688 | `sql_db_rename` | ❌ | -| 4 | 0.427542 | `sql_db_show` | ❌ | -| 5 | 0.414267 | `sql_server_delete` | ❌ | - ---- - -## Test 388 -======= -| 1 | 0.603360 | `sql_db_update` | ✅ **EXPECTED** | -| 2 | 0.467590 | `sql_db_create` | ❌ | -| 3 | 0.440550 | `sql_db_rename` | ❌ | -| 4 | 0.427654 | `sql_db_show` | ❌ | -| 5 | 0.414041 | `sql_server_delete` | ❌ | - ---- - -## Test 398 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.603376 | `sql_db_update` | ✅ **EXPECTED** | +| 1 | 0.603366 | `sql_db_update` | ✅ **EXPECTED** | | 2 | 0.467571 | `sql_db_create` | ❌ | | 3 | 0.440493 | `sql_db_rename` | ❌ | | 4 | 0.427621 | `sql_db_show` | ❌ | @@ -16942,8 +7409,7 @@ --- -## Test 403 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 408 **Expected Tool:** `sql_db_update` 
**Prompt:** Scale SQL database on server to use SKU @@ -16952,33 +7418,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.550449 | `sql_db_update` | ✅ **EXPECTED** | -| 2 | 0.418358 | `sql_server_delete` | ❌ | -| 3 | 0.401817 | `sql_db_list` | ❌ | -| 4 | 0.395508 | `sql_db_rename` | ❌ | -| 5 | 0.394770 | `sql_db_show` | ❌ | - ---- - -## Test 399 -======= -<<<<<<< HEAD -| 1 | 0.550501 | `sql_db_update` | ✅ **EXPECTED** | -| 2 | 0.418334 | `sql_server_delete` | ❌ | -| 3 | 0.401717 | `sql_db_list` | ❌ | -| 4 | 0.395462 | `sql_db_rename` | ❌ | -| 5 | 0.394705 | `sql_db_show` | ❌ | - ---- - -## Test 389 -======= | 1 | 0.550556 | `sql_db_update` | ✅ **EXPECTED** | -======= -| 1 | 0.550661 | `sql_db_update` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.418358 | `sql_server_delete` | ❌ | | 3 | 0.401817 | `sql_db_list` | ❌ | | 4 | 0.395518 | `sql_db_rename` | ❌ | @@ -16986,13 +7426,7 @@ --- -<<<<<<< HEAD -## Test 399 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 404 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 409 **Expected Tool:** `sql_elastic-pool_list` **Prompt:** List all elastic pools in SQL server @@ -17002,38 +7436,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.678124 | `sql_elastic-pool_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.502376 | `sql_db_list` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.502382 | `sql_db_list` | ❌ | -======= -| 2 | 0.502376 | `sql_db_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 2 | 0.502376 | `sql_db_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 3 | 0.498367 | `mysql_database_list` | ❌ | -| 4 | 0.485249 | `aks_nodepool_get` | ❌ | +| 3 | 0.498208 | `mysql_database_list` | ❌ | +| 4 | 0.485167 
| `aks_nodepool_get` | ❌ | | 5 | 0.479044 | `sql_server_show` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 400 -======= -<<<<<<< HEAD -## Test 390 -======= -## Test 400 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 405 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 410 **Expected Tool:** `sql_elastic-pool_list` **Prompt:** Show me the elastic pools configured for SQL server @@ -17042,46 +7452,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.606425 | `sql_elastic-pool_list` | ✅ **EXPECTED** | -| 2 | 0.502877 | `sql_server_show` | ❌ | -| 3 | 0.457164 | `sql_db_list` | ❌ | -| 4 | 0.450743 | `aks_nodepool_get` | ❌ | -| 5 | 0.432816 | `mysql_database_list` | ❌ | - ---- - -## Test 401 -======= -<<<<<<< HEAD -| 1 | 0.606478 | `sql_elastic-pool_list` | ✅ **EXPECTED** | -| 2 | 0.502977 | `sql_server_show` | ❌ | -| 3 | 0.457262 | `sql_db_list` | ❌ | -| 4 | 0.450790 | `aks_nodepool_get` | ❌ | -| 5 | 0.432867 | `mysql_database_list` | ❌ | - ---- - -## Test 391 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.606425 | `sql_elastic-pool_list` | ✅ **EXPECTED** | | 2 | 0.502877 | `sql_server_show` | ❌ | | 3 | 0.457163 | `sql_db_list` | ❌ | -| 4 | 0.450743 | `aks_nodepool_get` | ❌ | -| 5 | 0.432816 | `mysql_database_list` | ❌ | +| 4 | 0.450655 | `aks_nodepool_get` | ❌ | +| 5 | 0.432815 | `mysql_database_list` | ❌ | --- -<<<<<<< HEAD -## Test 401 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 406 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 411 **Expected Tool:** `sql_elastic-pool_list` **Prompt:** What elastic pools are available in my SQL server ? 
@@ -17091,39 +7470,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.592709 | `sql_elastic-pool_list` | ✅ **EXPECTED** | -| 2 | 0.420325 | `mysql_database_list` | ❌ | -| 3 | 0.407169 | `aks_nodepool_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.402616 | `mysql_server_list` | ❌ | +| 2 | 0.420481 | `mysql_database_list` | ❌ | +| 3 | 0.407084 | `aks_nodepool_get` | ❌ | +| 4 | 0.402474 | `mysql_server_list` | ❌ | | 5 | 0.397670 | `sql_db_list` | ❌ | --- -## Test 402 -======= -<<<<<<< HEAD -| 4 | 0.402602 | `mysql_server_list` | ❌ | -| 5 | 0.397708 | `sql_db_list` | ❌ | - ---- - -## Test 392 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.402616 | `mysql_server_list` | ❌ | -| 5 | 0.397670 | `sql_db_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 402 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 407 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 412 **Expected Tool:** `sql_server_create` **Prompt:** Create a new Azure SQL server named in resource group @@ -17132,51 +7486,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.682605 | `sql_server_create` | ✅ **EXPECTED** | -| 2 | 0.563707 | `sql_db_create` | ❌ | -| 3 | 0.529198 | `sql_server_list` | ❌ | -| 4 | 0.482102 | `storage_account_create` | ❌ | -| 5 | 0.474180 | `sql_db_rename` | ❌ | - ---- - -## Test 403 -======= -<<<<<<< HEAD -| 1 | 0.682198 | `sql_server_create` | ✅ **EXPECTED** | -| 2 | 0.563307 | `sql_db_create` | ❌ | -| 3 | 0.529314 | `sql_server_list` | ❌ | -| 4 | 0.481645 | `storage_account_create` | ❌ | -| 5 | 0.473844 | `sql_db_rename` | ❌ | - ---- - -## Test 393 -======= -| 1 | 0.682812 | `sql_server_create` | ✅ **EXPECTED** | -| 2 | 0.563994 | `sql_db_create` | ❌ | -| 3 | 0.529755 | `sql_server_list` | ❌ | -| 4 | 0.482437 | `storage_account_create` | ❌ | -| 5 | 0.474643 | `sql_db_rename` | ❌ | - 
---- - -## Test 403 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.682606 | `sql_server_create` | ✅ **EXPECTED** | +| 1 | 0.682736 | `sql_server_create` | ✅ **EXPECTED** | | 2 | 0.563708 | `sql_db_create` | ❌ | -| 3 | 0.529198 | `sql_server_list` | ❌ | +| 3 | 0.529372 | `sql_server_list` | ❌ | | 4 | 0.482102 | `storage_account_create` | ❌ | | 5 | 0.474207 | `sql_db_rename` | ❌ | --- -## Test 408 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 413 **Expected Tool:** `sql_server_create` **Prompt:** Create an Azure SQL server with name in location with admin user @@ -17185,29 +7503,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.618354 | `sql_server_create` | ✅ **EXPECTED** | -| 2 | 0.510222 | `sql_db_create` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.618244 | `sql_server_create` | ✅ **EXPECTED** | -| 2 | 0.510507 | `sql_db_create` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.472462 | `sql_server_show` | ❌ | -| 4 | 0.441267 | `sql_server_delete` | ❌ | -| 5 | 0.400941 | `sql_db_rename` | ❌ | - ---- - -<<<<<<< HEAD -## Test 404 -======= -## Test 394 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 1 | 0.618309 | `sql_server_create` | ✅ **EXPECTED** | +| 1 | 0.618390 | `sql_server_create` | ✅ **EXPECTED** | | 2 | 0.510169 | `sql_db_create` | ❌ | | 3 | 0.472463 | `sql_server_show` | ❌ | | 4 | 0.441174 | `sql_server_delete` | ❌ | @@ -17215,13 +7511,7 @@ --- -<<<<<<< HEAD -## Test 404 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 409 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 414 **Expected Tool:** `sql_server_create` **Prompt:** Set up a new SQL server called in my resource group @@ -17230,44 +7520,15 @@ | Rank | Score | Tool | Status | 
|------|-------|------|--------| -| 1 | 0.589818 | `sql_server_create` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD +| 1 | 0.589934 | `sql_server_create` | ✅ **EXPECTED** | | 2 | 0.501403 | `sql_db_create` | ❌ | -| 3 | 0.497890 | `sql_server_list` | ❌ | -| 4 | 0.461147 | `sql_db_rename` | ❌ | -| 5 | 0.442934 | `mysql_server_list` | ❌ | - ---- - -## Test 405 -======= -<<<<<<< HEAD -| 2 | 0.500874 | `sql_db_create` | ❌ | -| 3 | 0.498255 | `sql_server_list` | ❌ | -| 4 | 0.461181 | `sql_db_rename` | ❌ | -| 5 | 0.442984 | `mysql_server_list` | ❌ | - ---- - -## Test 395 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 2 | 0.501403 | `sql_db_create` | ❌ | -| 3 | 0.497890 | `sql_server_list` | ❌ | +| 3 | 0.498302 | `sql_server_list` | ❌ | | 4 | 0.461181 | `sql_db_rename` | ❌ | -| 5 | 0.442934 | `mysql_server_list` | ❌ | +| 5 | 0.442943 | `mysql_server_list` | ❌ | --- -<<<<<<< HEAD -## Test 405 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 410 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 415 **Expected Tool:** `sql_server_delete` **Prompt:** Delete the Azure SQL server from resource group @@ -17277,39 +7538,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.656593 | `sql_server_delete` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.548064 | `sql_db_delete` | ❌ | -<<<<<<< HEAD -| 3 | 0.518037 | `sql_server_list` | ❌ | -======= -<<<<<<< HEAD -| 3 | 0.518306 | `sql_server_list` | ❌ | -======= -| 3 | 0.518201 | `sql_server_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 2 | 0.548024 | `sql_db_delete` | ❌ | -| 3 | 0.518036 | `sql_server_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.495550 | `sql_server_create` | ❌ | +| 3 | 0.518178 | `sql_server_list` | ❌ | +| 4 | 0.495640 | `sql_server_create` | ❌ | | 5 | 
0.483132 | `workbooks_delete` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 406 -======= -<<<<<<< HEAD -## Test 396 -======= -## Test 406 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 411 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 416 **Expected Tool:** `sql_server_delete` **Prompt:** Remove the SQL server from my subscription @@ -17320,39 +7556,13 @@ |------|-------|------|--------| | 1 | 0.615073 | `sql_server_delete` | ✅ **EXPECTED** | | 2 | 0.393885 | `postgres_server_list` | ❌ | -<<<<<<< HEAD | 3 | 0.379760 | `sql_db_delete` | ❌ | | 4 | 0.376660 | `sql_server_show` | ❌ | -<<<<<<< HEAD -| 5 | 0.350103 | `sql_server_list` | ❌ | - ---- - -## Test 407 -======= -<<<<<<< HEAD -| 5 | 0.350384 | `sql_server_list` | ❌ | - ---- - -## Test 397 -======= -| 5 | 0.350173 | `sql_server_list` | ❌ | +| 5 | 0.350228 | `sql_server_list` | ❌ | --- -## Test 407 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 3 | 0.379763 | `sql_db_delete` | ❌ | -| 4 | 0.376660 | `sql_server_show` | ❌ | -| 5 | 0.350103 | `sql_server_list` | ❌ | - ---- - -## Test 412 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 417 **Expected Tool:** `sql_server_delete` **Prompt:** Delete SQL server permanently @@ -17362,40 +7572,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.624310 | `sql_server_delete` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.454892 | `sql_db_delete` | ❌ | -| 3 | 0.362561 | `sql_server_firewall-rule_delete` | ❌ | -| 4 | 0.341503 | `sql_server_show` | ❌ | -<<<<<<< HEAD -| 5 | 0.318758 | `eventhubs_eventhub_delete` | ❌ | - ---- - -## Test 408 -======= -<<<<<<< HEAD -| 5 | 0.319013 | `eventhubs_eventhub_delete` | ❌ | - ---- - -## Test 398 -======= -======= -| 2 | 0.454907 | `sql_db_delete` | ❌ | | 3 | 0.362389 | `sql_server_firewall-rule_delete` | ❌ | | 4 | 
0.341503 | `sql_server_show` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.318758 | `eventhubs_eventhub_delete` | ❌ | --- -<<<<<<< HEAD -## Test 408 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 413 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 418 **Expected Tool:** `sql_server_entra-admin_list` **Prompt:** List Microsoft Entra ID administrators for SQL server @@ -17406,91 +7590,30 @@ |------|-------|------|--------| | 1 | 0.783479 | `sql_server_entra-admin_list` | ✅ **EXPECTED** | | 2 | 0.456051 | `sql_server_show` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.434868 | `sql_server_list` | ❌ | -| 4 | 0.401854 | `sql_server_firewall-rule_list` | ❌ | -| 5 | 0.376055 | `sql_db_list` | ❌ | - ---- - -## Test 409 -======= -<<<<<<< HEAD -| 3 | 0.434565 | `sql_server_list` | ❌ | +| 3 | 0.434815 | `sql_server_list` | ❌ | | 4 | 0.401908 | `sql_server_firewall-rule_list` | ❌ | -| 5 | 0.375977 | `sql_db_list` | ❌ | - ---- - -## Test 399 -======= -| 3 | 0.434776 | `sql_server_list` | ❌ | -| 4 | 0.401880 | `sql_server_firewall-rule_list` | ❌ | -======= -| 3 | 0.434868 | `sql_server_list` | ❌ | -| 4 | 0.401878 | `sql_server_firewall-rule_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.376055 | `sql_db_list` | ❌ | --- -<<<<<<< HEAD -## Test 409 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 414 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 419 **Expected Tool:** `sql_server_entra-admin_list` **Prompt:** Show me the Entra ID administrators configured for SQL server -### Results - -| Rank | Score | Tool | Status | -|------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.713306 | `sql_server_entra-admin_list` | ✅ **EXPECTED** | -| 2 | 0.413144 | `sql_server_show` | ❌ | -<<<<<<< HEAD -| 3 | 0.368082 | `sql_server_list` | ❌ | -| 4 | 0.315966 | 
`sql_db_list` | ❌ | -| 5 | 0.311085 | `postgres_server_list` | ❌ | - ---- - -## Test 410 -======= -<<<<<<< HEAD -| 3 | 0.367692 | `sql_server_list` | ❌ | -| 4 | 0.315939 | `sql_db_list` | ❌ | -| 5 | 0.311071 | `postgres_server_list` | ❌ | - ---- +### Results -## Test 400 -======= -| 3 | 0.368018 | `sql_server_list` | ❌ | +| Rank | Score | Tool | Status | +|------|-------|------|--------| +| 1 | 0.713306 | `sql_server_entra-admin_list` | ✅ **EXPECTED** | +| 2 | 0.413144 | `sql_server_show` | ❌ | +| 3 | 0.368036 | `sql_server_list` | ❌ | | 4 | 0.315966 | `sql_db_list` | ❌ | | 5 | 0.311085 | `postgres_server_list` | ❌ | --- -## Test 410 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.713093 | `sql_server_entra-admin_list` | ✅ **EXPECTED** | -| 2 | 0.412505 | `sql_server_show` | ❌ | -| 3 | 0.368257 | `sql_server_list` | ❌ | -| 4 | 0.315605 | `sql_db_list` | ❌ | -| 5 | 0.310940 | `postgres_server_list` | ❌ | - ---- - -## Test 415 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 420 **Expected Tool:** `sql_server_entra-admin_list` **Prompt:** What Microsoft Entra ID administrators are set up for my SQL server ? 
@@ -17501,37 +7624,13 @@ |------|-------|------|--------| | 1 | 0.646419 | `sql_server_entra-admin_list` | ✅ **EXPECTED** | | 2 | 0.356025 | `sql_server_show` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.322155 | `sql_server_list` | ❌ | -======= -<<<<<<< HEAD -| 3 | 0.322084 | `sql_server_list` | ❌ | -======= -| 3 | 0.322362 | `sql_server_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 3 | 0.322155 | `sql_server_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.307823 | `sql_server_create` | ❌ | +| 3 | 0.322358 | `sql_server_list` | ❌ | +| 4 | 0.307885 | `sql_server_create` | ❌ | | 5 | 0.269788 | `sql_server_delete` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 411 -======= -<<<<<<< HEAD -## Test 401 -======= -## Test 411 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 416 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 421 **Expected Tool:** `sql_server_firewall-rule_create` **Prompt:** Create a firewall rule for my Azure SQL server @@ -17540,33 +7639,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.635467 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | -| 2 | 0.532658 | `sql_server_firewall-rule_list` | ❌ | -| 3 | 0.522133 | `sql_server_firewall-rule_delete` | ❌ | -======= | 1 | 0.635466 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | -| 2 | 0.532758 | `sql_server_firewall-rule_list` | ❌ | +| 2 | 0.532712 | `sql_server_firewall-rule_list` | ❌ | | 3 | 0.522184 | `sql_server_firewall-rule_delete` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.448822 | `sql_server_create` | ❌ | +| 4 | 0.448938 | `sql_server_create` | ❌ | | 5 | 0.440845 | `sql_server_delete` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 412 -======= -<<<<<<< HEAD -## Test 402 -======= 
-## Test 412 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 417 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 422 **Expected Tool:** `sql_server_firewall-rule_create` **Prompt:** Add a firewall rule to allow access from IP range to for SQL server @@ -17575,51 +7656,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.670392 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | -| 2 | 0.533587 | `sql_server_firewall-rule_list` | ❌ | -| 3 | 0.503740 | `sql_server_firewall-rule_delete` | ❌ | -| 4 | 0.316700 | `sql_server_list` | ❌ | -| 5 | 0.302273 | `sql_server_delete` | ❌ | - ---- - -## Test 413 -======= -<<<<<<< HEAD -| 1 | 0.670233 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | -| 2 | 0.533669 | `sql_server_firewall-rule_list` | ❌ | -| 3 | 0.503500 | `sql_server_firewall-rule_delete` | ❌ | -| 4 | 0.316954 | `sql_server_list` | ❌ | -| 5 | 0.302510 | `sql_server_delete` | ❌ | - ---- - -## Test 403 -======= | 1 | 0.670189 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | -| 2 | 0.533532 | `sql_server_firewall-rule_list` | ❌ | +| 2 | 0.533562 | `sql_server_firewall-rule_list` | ❌ | | 3 | 0.503648 | `sql_server_firewall-rule_delete` | ❌ | -| 4 | 0.316667 | `sql_server_list` | ❌ | +| 4 | 0.316596 | `sql_server_list` | ❌ | | 5 | 0.302362 | `sql_server_delete` | ❌ | --- -## Test 413 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.670186 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | -| 2 | 0.533573 | `sql_server_firewall-rule_list` | ❌ | -| 3 | 0.503564 | `sql_server_firewall-rule_delete` | ❌ | -| 4 | 0.316641 | `sql_server_list` | ❌ | -| 5 | 0.302391 | `sql_server_delete` | ❌ | - ---- - -## Test 418 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 423 **Expected Tool:** 
`sql_server_firewall-rule_create` **Prompt:** Create a new firewall rule named for SQL server @@ -17628,32 +7673,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.685125 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | -| 2 | 0.574393 | `sql_server_firewall-rule_list` | ❌ | -| 3 | 0.539643 | `sql_server_firewall-rule_delete` | ❌ | -| 4 | 0.428987 | `sql_server_create` | ❌ | -| 5 | 0.395244 | `sql_db_create` | ❌ | - ---- - -## Test 414 -======= | 1 | 0.685107 | `sql_server_firewall-rule_create` | ✅ **EXPECTED** | -| 2 | 0.574431 | `sql_server_firewall-rule_list` | ❌ | +| 2 | 0.574336 | `sql_server_firewall-rule_list` | ❌ | | 3 | 0.539577 | `sql_server_firewall-rule_delete` | ❌ | -| 4 | 0.428919 | `sql_server_create` | ❌ | +| 4 | 0.428986 | `sql_server_create` | ❌ | | 5 | 0.395165 | `sql_db_create` | ❌ | --- -<<<<<<< HEAD -## Test 414 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 419 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 424 **Expected Tool:** `sql_server_firewall-rule_delete` **Prompt:** Delete a firewall rule from my Azure SQL server @@ -17662,35 +7690,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.691498 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | +| 1 | 0.691421 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | | 2 | 0.584379 | `sql_server_delete` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.543780 | `sql_server_firewall-rule_list` | ❌ | -======= -| 3 | 0.543839 | `sql_server_firewall-rule_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 3 | 0.543913 | `sql_server_firewall-rule_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 3 | 0.543857 | `sql_server_firewall-rule_list` | ❌ | | 4 | 0.540333 | `sql_server_firewall-rule_create` | ❌ | -| 5 | 0.498448 | `sql_db_delete` | ❌ | +| 5 | 0.498444 | 
`sql_db_delete` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 415 -======= -<<<<<<< HEAD -## Test 405 -======= -## Test 415 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 420 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 425 **Expected Tool:** `sql_server_firewall-rule_delete` **Prompt:** Remove the firewall rule from SQL server @@ -17699,36 +7707,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.670233 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | -| 2 | 0.574296 | `sql_server_firewall-rule_list` | ❌ | -======= -| 1 | 0.670179 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.574321 | `sql_server_firewall-rule_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 2 | 0.574448 | `sql_server_firewall-rule_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 3 | 0.530419 | `sql_server_firewall-rule_create` | ❌ | -| 4 | 0.488418 | `sql_server_delete` | ❌ | -| 5 | 0.360401 | `sql_db_delete` | ❌ | +| 1 | 0.670091 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | +| 2 | 0.574319 | `sql_server_firewall-rule_list` | ❌ | +| 3 | 0.530412 | `sql_server_firewall-rule_create` | ❌ | +| 4 | 0.488400 | `sql_server_delete` | ❌ | +| 5 | 0.360385 | `sql_db_delete` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 416 -======= -<<<<<<< HEAD -## Test 406 -======= -## Test 416 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 421 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 426 **Expected Tool:** `sql_server_firewall-rule_delete` **Prompt:** Delete firewall rule for SQL server @@ -17737,39 +7724,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.671298 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | -| 2 
| 0.601174 | `sql_server_firewall-rule_list` | ❌ | -======= | 1 | 0.671212 | `sql_server_firewall-rule_delete` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.601217 | `sql_server_firewall-rule_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.577330 | `sql_server_firewall-rule_create` | ❌ | -| 4 | 0.499272 | `sql_server_delete` | ❌ | -| 5 | 0.378586 | `sql_db_delete` | ❌ | - ---- - -<<<<<<< HEAD -## Test 417 -======= -<<<<<<< HEAD -## Test 407 -======= -## Test 417 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 2 | 0.601324 | `sql_server_firewall-rule_list` | ❌ | +| 2 | 0.601230 | `sql_server_firewall-rule_list` | ❌ | | 3 | 0.577330 | `sql_server_firewall-rule_create` | ❌ | | 4 | 0.499272 | `sql_server_delete` | ❌ | -| 5 | 0.378589 | `sql_db_delete` | ❌ | +| 5 | 0.378585 | `sql_db_delete` | ❌ | --- -## Test 422 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 427 **Expected Tool:** `sql_server_firewall-rule_list` **Prompt:** List all firewall rules for SQL server @@ -17778,47 +7741,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.729336 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | -======= -| 1 | 0.729320 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.729415 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.729372 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | | 2 | 0.549667 | `sql_server_firewall-rule_create` | ❌ | -| 3 | 0.513187 | `sql_server_firewall-rule_delete` | ❌ | +| 3 | 0.513114 | `sql_server_firewall-rule_delete` | ❌ | | 4 | 0.468812 | `sql_server_show` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 5 | 0.418817 | `sql_server_list` | ❌ | - ---- - -## Test 418 -======= -<<<<<<< HEAD -| 5 | 0.418869 | `sql_server_list` | ❌ 
| - ---- - -## Test 408 -======= -| 5 | 0.418738 | `sql_server_list` | ❌ | - ---- - -## Test 418 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 5 | 0.418817 | `sql_server_list` | ❌ | +| 5 | 0.418681 | `sql_server_list` | ❌ | --- -## Test 423 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 428 **Expected Tool:** `sql_server_firewall-rule_list` **Prompt:** Show me the firewall rules for SQL server @@ -17827,39 +7758,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.630795 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | +| 1 | 0.630731 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | | 2 | 0.524126 | `sql_server_firewall-rule_create` | ❌ | -| 3 | 0.476792 | `sql_server_firewall-rule_delete` | ❌ | +| 3 | 0.476757 | `sql_server_firewall-rule_delete` | ❌ | | 4 | 0.410680 | `sql_server_show` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 5 | 0.348100 | `sql_server_list` | ❌ | - ---- - -## Test 419 -======= -<<<<<<< HEAD -| 5 | 0.348249 | `sql_server_list` | ❌ | - ---- - -## Test 409 -======= -| 5 | 0.348096 | `sql_server_list` | ❌ | - ---- - -## Test 419 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 5 | 0.348100 | `sql_server_list` | ❌ | +| 5 | 0.348049 | `sql_server_list` | ❌ | --- -## Test 424 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 429 **Expected Tool:** `sql_server_firewall-rule_list` **Prompt:** What firewall rules are configured for my SQL server ? 
@@ -17868,47 +7775,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.630460 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | -======= -| 1 | 0.630494 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.630582 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 2 | 0.532454 | `sql_server_firewall-rule_create` | ❌ | -| 3 | 0.473596 | `sql_server_firewall-rule_delete` | ❌ | -| 4 | 0.412957 | `sql_server_show` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 5 | 0.350513 | `sql_server_list` | ❌ | - ---- - -## Test 420 -======= -<<<<<<< HEAD -| 5 | 0.350545 | `sql_server_list` | ❌ | - ---- - -## Test 410 -======= -| 5 | 0.350474 | `sql_server_list` | ❌ | - ---- - -## Test 420 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 5 | 0.350513 | `sql_server_list` | ❌ | +| 1 | 0.630555 | `sql_server_firewall-rule_list` | ✅ **EXPECTED** | +| 2 | 0.532484 | `sql_server_firewall-rule_create` | ❌ | +| 3 | 0.473499 | `sql_server_firewall-rule_delete` | ❌ | +| 4 | 0.412906 | `sql_server_show` | ❌ | +| 5 | 0.350385 | `sql_server_list` | ❌ | --- -## Test 425 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 430 **Expected Tool:** `sql_server_list` **Prompt:** List all Azure SQL servers in resource group @@ -17917,45 +7792,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.694404 | `sql_server_list` | ✅ **EXPECTED** | -| 2 | 0.596686 | `mysql_server_list` | ❌ | -| 3 | 0.578238 | `sql_db_list` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.694268 | `sql_server_list` | ✅ **EXPECTED** | -| 2 | 0.596720 | `mysql_server_list` | ❌ | -| 3 | 0.578135 | `sql_db_list` | ❌ | -======= -| 1 | 0.694306 | `sql_server_list` | ✅ **EXPECTED** | -| 2 | 0.596686 | `mysql_server_list` | 
❌ | -| 3 | 0.578239 | `sql_db_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.694404 | `sql_server_list` | ✅ **EXPECTED** | -| 2 | 0.596686 | `mysql_server_list` | ❌ | -| 3 | 0.578239 | `sql_db_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.515851 | `sql_elastic-pool_list` | ❌ | -| 5 | 0.509789 | `sql_db_show` | ❌ | +| 1 | 0.694271 | `sql_server_list` | ✅ **EXPECTED** | +| 2 | 0.596669 | `mysql_server_list` | ❌ | +| 3 | 0.578222 | `sql_db_list` | ❌ | +| 4 | 0.515894 | `sql_elastic-pool_list` | ❌ | +| 5 | 0.509835 | `sql_db_show` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 421 -======= -<<<<<<< HEAD -## Test 411 -======= -## Test 421 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 426 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 431 **Expected Tool:** `sql_server_list` **Prompt:** Show me every SQL server available in resource group @@ -17964,48 +7809,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.618218 | `sql_server_list` | ✅ **EXPECTED** | -| 2 | 0.593837 | `mysql_server_list` | ❌ | -| 3 | 0.542398 | `sql_db_list` | ❌ | -| 4 | 0.507404 | `resourcehealth_availability-status_list` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.618206 | `sql_server_list` | ✅ **EXPECTED** | -| 2 | 0.593874 | `mysql_server_list` | ❌ | -| 3 | 0.542307 | `sql_db_list` | ❌ | -| 4 | 0.507683 | `resourcehealth_availability-status_list` | ❌ | -======= -| 1 | 0.618222 | `sql_server_list` | ✅ **EXPECTED** | -| 2 | 0.593837 | `mysql_server_list` | ❌ | +| 1 | 0.618187 | `sql_server_list` | ✅ **EXPECTED** | +| 2 | 0.594043 | `mysql_server_list` | ❌ | | 3 | 0.542398 | `sql_db_list` | ❌ | | 4 | 0.507404 | `resourcehealth_availability-status_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description 
evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.618218 | `sql_server_list` | ✅ **EXPECTED** | -| 2 | 0.593837 | `mysql_server_list` | ❌ | -| 3 | 0.542398 | `sql_db_list` | ❌ | -| 4 | 0.507404 | `resourcehealth_availability-status_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 5 | 0.496200 | `group_list` | ❌ | +| 5 | 0.496257 | `group_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 422 -======= -<<<<<<< HEAD -## Test 412 -======= -## Test 422 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 427 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 432 **Expected Tool:** `sql_server_show` **Prompt:** Show me the details of Azure SQL server in resource group @@ -18016,41 +7828,13 @@ |------|-------|------|--------| | 1 | 0.629672 | `sql_db_show` | ❌ | | 2 | 0.595184 | `sql_server_show` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.587728 | `sql_server_list` | ❌ | -| 4 | 0.559893 | `mysql_server_list` | ❌ | -| 5 | 0.540218 | `sql_db_list` | ❌ | - ---- - -## Test 423 -======= -<<<<<<< HEAD -| 3 | 0.587826 | `sql_server_list` | ❌ | -| 4 | 0.559936 | `mysql_server_list` | ❌ | -| 5 | 0.540037 | `sql_db_list` | ❌ | - ---- - -## Test 413 -======= -| 3 | 0.587806 | `sql_server_list` | ❌ | -======= -| 3 | 0.587728 | `sql_server_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.559893 | `mysql_server_list` | ❌ | +| 3 | 0.587768 | `sql_server_list` | ❌ | +| 4 | 0.560004 | `mysql_server_list` | ❌ | | 5 | 0.540218 | `sql_db_list` | ❌ | --- -<<<<<<< HEAD -## Test 423 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 428 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 433 **Expected Tool:** `sql_server_show` **Prompt:** Get the configuration details for SQL server @@ -18063,23 +7847,11 @@ | 2 | 
0.610507 | `postgres_server_config_get` | ❌ | | 3 | 0.538034 | `mysql_server_config_get` | ❌ | | 4 | 0.471541 | `sql_db_show` | ❌ | -| 5 | 0.445432 | `postgres_server_param_get` | ❌ | +| 5 | 0.445430 | `postgres_server_param_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 424 -======= -<<<<<<< HEAD -## Test 414 -======= -## Test 424 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 429 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 434 **Expected Tool:** `sql_server_show` **Prompt:** Display the properties of SQL server @@ -18090,38 +7862,13 @@ |------|-------|------|--------| | 1 | 0.563143 | `sql_server_show` | ✅ **EXPECTED** | | 2 | 0.392532 | `postgres_server_config_get` | ❌ | -<<<<<<< HEAD -| 3 | 0.380035 | `postgres_server_param_get` | ❌ | -| 4 | 0.372102 | `sql_server_firewall-rule_list` | ❌ | -======= | 3 | 0.380021 | `postgres_server_param_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD | 4 | 0.372194 | `sql_server_firewall-rule_list` | ❌ | -======= -| 4 | 0.372172 | `sql_server_firewall-rule_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.372179 | `sql_server_firewall-rule_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.370539 | `sql_db_show` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 425 -======= -<<<<<<< HEAD -## Test 415 -======= -## Test 425 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 430 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 435 **Expected Tool:** `storage_account_create` **Prompt:** Create a new storage account called testaccount123 in East US region @@ -18132,40 +7879,13 @@ |------|-------|------|--------| | 1 | 0.533552 | `storage_account_create` | ✅ **EXPECTED** | | 2 | 0.438046 | 
`storage_blob_container_create` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD | 3 | 0.418191 | `storage_account_get` | ❌ | -| 4 | 0.413950 | `storage_blob_container_get` | ❌ | -| 5 | 0.373651 | `managedlustre_fs_create` | ❌ | - ---- - -## Test 426 -======= -<<<<<<< HEAD -| 3 | 0.418002 | `storage_account_get` | ❌ | -======= -| 3 | 0.418134 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 4 | 0.414518 | `storage_blob_container_get` | ❌ | -======= -| 3 | 0.418191 | `storage_account_get` | ❌ | -| 4 | 0.414964 | `storage_blob_container_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.370957 | `managedlustre_fs_create` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 416 -======= -## Test 426 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 431 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 436 **Expected Tool:** `storage_account_create` **Prompt:** Create a storage account with premium performance and LRS replication @@ -18175,41 +7895,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.500638 | `storage_account_create` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.484584 | `managedlustre_fs_create` | ❌ | -| 3 | 0.407222 | `storage_account_get` | ❌ | -======= -<<<<<<< HEAD -| 2 | 0.483202 | `managedlustre_fs_create` | ❌ | -| 3 | 0.407182 | `storage_account_get` | ❌ | -======= -| 2 | 0.483202 | `managedlustre_filesystem_create` | ❌ | -| 3 | 0.407200 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 2 | 0.483202 | `managedlustre_fs_create` | ❌ | | 3 | 0.407222 | `storage_account_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.406804 | `storage_blob_container_create` | ❌ | -| 5 | 0.400134 | `managedlustre_fs_sku_get` | ❌ | +| 5 | 0.400151 | 
`managedlustre_fs_sku_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 427 -======= -<<<<<<< HEAD -## Test 417 -======= -## Test 427 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 432 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 437 **Expected Tool:** `storage_account_create` **Prompt:** Create a new storage account with Data Lake Storage Gen2 enabled @@ -18218,41 +7911,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.589002 | `storage_account_create` | ✅ **EXPECTED** | -| 2 | 0.538023 | `managedlustre_fs_create` | ❌ | +| 1 | 0.589003 | `storage_account_create` | ✅ **EXPECTED** | +| 2 | 0.535501 | `managedlustre_fs_create` | ❌ | | 3 | 0.509731 | `storage_blob_container_create` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.462519 | `storage_account_get` | ❌ | -| 5 | 0.447156 | `sql_db_create` | ❌ | - ---- - -## Test 428 -======= -<<<<<<< HEAD -| 4 | 0.462494 | `storage_account_get` | ❌ | -| 5 | 0.447560 | `sql_db_create` | ❌ | - ---- - -## Test 418 -======= -| 4 | 0.462480 | `storage_account_get` | ❌ | -======= | 4 | 0.462519 | `storage_account_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.447156 | `sql_db_create` | ❌ | --- -<<<<<<< HEAD -## Test 428 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 433 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 438 **Expected Tool:** `storage_account_get` **Prompt:** Show me the details for my storage account @@ -18261,48 +7928,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.673750 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.607762 | `storage_blob_container_get` | ❌ | -| 3 | 0.556457 | `storage_blob_get` | ❌ | -| 4 | 0.483435 | `storage_account_create` | ❌ | -| 5 | 0.439236 | `cosmos_account_list` | ❌ | - 
---- - -## Test 429 -======= -<<<<<<< HEAD -| 1 | 0.673569 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.608073 | `storage_blob_container_get` | ❌ | -| 3 | 0.556407 | `storage_blob_get` | ❌ | -| 4 | 0.483573 | `storage_account_create` | ❌ | -| 5 | 0.439109 | `cosmos_account_list` | ❌ | - ---- - -## Test 419 -======= -| 1 | 0.673754 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.608256 | `storage_blob_container_get` | ❌ | -======= | 1 | 0.673749 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.608245 | `storage_blob_container_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 2 | 0.608256 | `storage_blob_container_get` | ❌ | | 3 | 0.556457 | `storage_blob_get` | ❌ | | 4 | 0.483435 | `storage_account_create` | ❌ | | 5 | 0.439236 | `cosmos_account_list` | ❌ | --- -<<<<<<< HEAD -## Test 429 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 434 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 439 **Expected Tool:** `storage_account_get` **Prompt:** Get details about the storage account @@ -18311,41 +7945,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.692687 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.577173 | `storage_blob_container_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.692473 | `storage_account_get` | ✅ **EXPECTED** | -======= -| 1 | 0.692698 | `storage_account_get` | ✅ **EXPECTED** | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) | 2 | 0.577547 | `storage_blob_container_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.692687 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.577692 | `storage_blob_container_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.529205 | `storage_blob_get` | ❌ | | 4 | 0.518215 | `storage_account_create` | ❌ | -| 5 | 0.448506 | `storage_blob_container_create` | ❌ | +| 5 | 
0.448507 | `storage_blob_container_create` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 430 -======= -<<<<<<< HEAD -## Test 420 -======= -## Test 430 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 435 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 440 **Expected Tool:** `storage_account_get` **Prompt:** List all storage accounts in my subscription including their location and SKU @@ -18354,48 +7962,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.649215 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.557093 | `managedlustre_fs_sku_get` | ❌ | -| 3 | 0.549448 | `storage_blob_container_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.649393 | `storage_account_get` | ✅ **EXPECTED** | | 2 | 0.557016 | `managedlustre_fs_sku_get` | ❌ | | 3 | 0.550148 | `storage_blob_container_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.547577 | `subscription_list` | ❌ | -======= -| 1 | 0.649215 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.557016 | `managedlustre_fs_sku_get` | ❌ | -| 3 | 0.550292 | `storage_blob_container_get` | ❌ | | 4 | 0.547647 | `subscription_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.536909 | `cosmos_account_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 431 -======= -## Test 421 -======= -| 1 | 0.649191 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.557016 | `managedlustre_filesystem_sku_get` | ❌ | -| 3 | 0.550148 | `storage_blob_container_get` | ❌ | -| 4 | 0.547647 | `subscription_list` | ❌ | -| 5 | 0.536912 | `cosmos_account_list` | ❌ | - ---- - -## Test 431 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 436 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 441 **Expected Tool:** `storage_account_get` **Prompt:** Show me 
my storage accounts with whether hierarchical namespace (HNS) is enabled @@ -18404,40 +7979,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.556860 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.481664 | `storage_blob_container_get` | ❌ | -| 3 | 0.461284 | `managedlustre_fs_list` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.557064 | `storage_account_get` | ✅ **EXPECTED** | | 2 | 0.482418 | `storage_blob_container_get` | ❌ | -| 3 | 0.461308 | `managedlustre_fs_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.556860 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.482650 | `storage_blob_container_get` | ❌ | | 3 | 0.461284 | `managedlustre_fs_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.421642 | `cosmos_account_list` | ❌ | | 5 | 0.410587 | `storage_blob_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 432 -======= -<<<<<<< HEAD -## Test 422 -======= -## Test 432 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 437 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 442 **Expected Tool:** `storage_account_get` **Prompt:** Show me the storage accounts in my subscription and include HTTPS-only and public blob access settings @@ -18446,47 +7996,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD | 1 | 0.619462 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.555677 | `storage_blob_container_get` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.619639 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.556436 | `storage_blob_container_get` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 3 | 0.518229 | `storage_blob_get` | ❌ | -| 4 | 0.473598 | `cosmos_account_list` | ❌ | -| 5 | 0.465527 | `subscription_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 433 -======= -## Test 423 -======= -| 1 
| 0.619491 | `storage_account_get` | ✅ **EXPECTED** | | 2 | 0.556436 | `storage_blob_container_get` | ❌ | | 3 | 0.518229 | `storage_blob_get` | ❌ | -| 4 | 0.473662 | `cosmos_account_list` | ❌ | -======= -| 1 | 0.619462 | `storage_account_get` | ✅ **EXPECTED** | -| 2 | 0.556525 | `storage_blob_container_get` | ❌ | -| 3 | 0.518229 | `storage_blob_get` | ❌ | | 4 | 0.473598 | `cosmos_account_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.465571 | `subscription_list` | ❌ | --- -<<<<<<< HEAD -## Test 433 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 438 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 443 **Expected Tool:** `storage_blob_container_create` **Prompt:** Create the storage container mycontainer in storage account @@ -18496,30 +8014,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.649793 | `storage_blob_container_create` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.585556 | `storage_blob_container_get` | ❌ | -======= -| 2 | 0.584263 | `storage_blob_container_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 2 | 0.583896 | `storage_blob_container_get` | ❌ | | 3 | 0.524779 | `storage_account_create` | ❌ | | 4 | 0.496679 | `storage_blob_get` | ❌ | | 5 | 0.447784 | `cosmos_database_container_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 434 -======= -<<<<<<< HEAD -## Test 424 -======= -## Test 434 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 439 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 444 **Expected Tool:** `storage_blob_container_create` **Prompt:** Create the container using blob public access in storage account @@ -18529,40 +8031,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.682161 | `storage_blob_container_create` | ✅ **EXPECTED** | -<<<<<<< HEAD 
-<<<<<<< HEAD -| 2 | 0.590826 | `storage_blob_container_get` | ❌ | -| 3 | 0.559264 | `storage_blob_get` | ❌ | -| 4 | 0.500625 | `storage_account_create` | ❌ | -| 5 | 0.420514 | `storage_account_get` | ❌ | - ---- - -## Test 435 -======= | 2 | 0.590160 | `storage_blob_container_get` | ❌ | -======= -| 2 | 0.590461 | `storage_blob_container_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.559263 | `storage_blob_get` | ❌ | | 4 | 0.500624 | `storage_account_create` | ❌ | | 5 | 0.420514 | `storage_account_get` | ❌ | --- -<<<<<<< HEAD -## Test 425 -======= -| 5 | 0.420516 | `storage_account_get` | ❌ | - ---- - -## Test 435 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 440 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 445 **Expected Tool:** `storage_blob_container_create` **Prompt:** Create a new blob container named documents with container public access in storage account @@ -18571,35 +8047,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.625397 | `storage_blob_container_create` | ✅ **EXPECTED** | -| 2 | 0.544024 | `storage_blob_container_get` | ❌ | -| 3 | 0.497804 | `storage_blob_get` | ❌ | +| 1 | 0.625490 | `storage_blob_container_create` | ✅ **EXPECTED** | +| 2 | 0.543562 | `storage_blob_container_get` | ❌ | +| 3 | 0.497792 | `storage_blob_get` | ❌ | | 4 | 0.463198 | `storage_account_create` | ❌ | -| 5 | 0.435099 | `cosmos_database_container_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 436 -======= -<<<<<<< HEAD -## Test 426 -======= -## Test 436 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.625122 | `storage_blob_container_create` | ✅ **EXPECTED** | -| 2 | 0.543590 | `storage_blob_container_get` | ❌ | -| 3 | 0.497579 | `storage_blob_get` | ❌ | -| 4 | 0.463133 | `storage_account_create` | ❌ | -| 5 | 
0.435075 | `cosmos_database_container_list` | ❌ | +| 5 | 0.435103 | `cosmos_database_container_list` | ❌ | --- -## Test 441 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 446 **Expected Tool:** `storage_blob_container_get` **Prompt:** Show me the properties of the storage container in the storage account @@ -18607,44 +8063,16 @@ ### Results | Rank | Score | Tool | Status | -|------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.703348 | `storage_blob_container_get` | ✅ **EXPECTED** | -| 2 | 0.623681 | `storage_blob_get` | ❌ | -<<<<<<< HEAD -| 3 | 0.577921 | `storage_account_get` | ❌ | -| 4 | 0.549804 | `storage_blob_container_create` | ❌ | -| 5 | 0.523289 | `cosmos_database_container_list` | ❌ | - ---- - -## Test 437 -======= -<<<<<<< HEAD -| 3 | 0.577740 | `storage_account_get` | ❌ | -======= -| 3 | 0.577904 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -======= -| 1 | 0.701878 | `storage_blob_container_get` | ✅ **EXPECTED** | +|------|-------|------|--------| +| 1 | 0.701642 | `storage_blob_container_get` | ✅ **EXPECTED** | | 2 | 0.623681 | `storage_blob_get` | ❌ | | 3 | 0.577921 | `storage_account_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.549803 | `storage_blob_container_create` | ❌ | | 5 | 0.523288 | `cosmos_database_container_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 427 -======= -## Test 437 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 442 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 447 **Expected Tool:** `storage_blob_container_get` **Prompt:** List all blob containers in the storage account @@ -18653,11 +8081,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.712012 | `storage_blob_container_get` | ✅ **EXPECTED** | -======= -| 1 | 0.712439 | `storage_blob_container_get` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts 
mcp tool) +| 1 | 0.712037 | `storage_blob_container_get` | ✅ **EXPECTED** | | 2 | 0.680802 | `storage_blob_get` | ❌ | | 3 | 0.613933 | `cosmos_database_container_list` | ❌ | | 4 | 0.556319 | `storage_blob_container_create` | ❌ | @@ -18665,19 +8089,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 438 -======= -<<<<<<< HEAD -## Test 428 -======= -## Test 438 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 443 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 448 **Expected Tool:** `storage_blob_container_get` **Prompt:** Show me the containers in the storage account @@ -18686,43 +8098,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.713080 | `storage_blob_container_get` | ✅ **EXPECTED** | -| 2 | 0.592373 | `cosmos_database_container_list` | ❌ | -| 3 | 0.586169 | `storage_blob_get` | ❌ | -<<<<<<< HEAD -| 4 | 0.523322 | `storage_account_get` | ❌ | -| 5 | 0.487520 | `storage_blob_container_create` | ❌ | - ---- - -## Test 439 -======= -<<<<<<< HEAD -| 4 | 0.523353 | `storage_account_get` | ❌ | -======= -| 4 | 0.523293 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -======= -| 1 | 0.713761 | `storage_blob_container_get` | ✅ **EXPECTED** | +| 1 | 0.713527 | `storage_blob_container_get` | ✅ **EXPECTED** | | 2 | 0.592373 | `cosmos_database_container_list` | ❌ | | 3 | 0.586169 | `storage_blob_get` | ❌ | | 4 | 0.523322 | `storage_account_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.487521 | `storage_blob_container_create` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 429 -======= -## Test 439 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 444 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 449 **Expected Tool:** `storage_blob_get` **Prompt:** Show me the properties 
for blob in container in storage account @@ -18731,46 +8115,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.700963 | `storage_blob_get` | ✅ **EXPECTED** | -| 2 | 0.648279 | `storage_blob_container_get` | ❌ | -| 3 | 0.540987 | `storage_blob_container_create` | ❌ | -| 4 | 0.527363 | `storage_account_get` | ❌ | -| 5 | 0.477959 | `cosmos_database_container_list` | ❌ | - ---- - -## Test 440 -======= -<<<<<<< HEAD -| 1 | 0.700969 | `storage_blob_get` | ✅ **EXPECTED** | -| 2 | 0.647029 | `storage_blob_container_get` | ❌ | -| 3 | 0.541060 | `storage_blob_container_create` | ❌ | -| 4 | 0.527327 | `storage_account_get` | ❌ | -| 5 | 0.477993 | `cosmos_database_container_list` | ❌ | - ---- - -## Test 430 -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.700973 | `storage_blob_get` | ✅ **EXPECTED** | -| 2 | 0.647348 | `storage_blob_container_get` | ❌ | +| 2 | 0.646973 | `storage_blob_container_get` | ❌ | | 3 | 0.541019 | `storage_blob_container_create` | ❌ | | 4 | 0.527427 | `storage_account_get` | ❌ | | 5 | 0.477946 | `cosmos_database_container_list` | ❌ | --- -<<<<<<< HEAD -## Test 440 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 445 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 450 **Expected Tool:** `storage_blob_get` **Prompt:** Get the details about blob in the container in storage account @@ -18779,45 +8132,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.694997 | `storage_blob_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.633397 | `storage_blob_container_get` | ❌ | -| 3 | 0.589151 | `storage_blob_container_create` | ❌ | -| 4 | 0.580226 | `storage_account_get` | ❌ | -======= | 2 | 0.631161 | `storage_blob_container_get` | ❌ | | 3 | 0.589152 | `storage_blob_container_create` | ❌ | -<<<<<<< HEAD -| 4 | 0.579989 | `storage_account_get` | ❌ | 
-======= -| 4 | 0.580235 | `storage_account_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) +| 4 | 0.580226 | `storage_account_get` | ❌ | | 5 | 0.457038 | `storage_account_create` | ❌ | --- -<<<<<<< HEAD -## Test 441 -======= -<<<<<<< HEAD -## Test 431 -======= -## Test 441 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.694812 | `storage_blob_get` | ✅ **EXPECTED** | -| 2 | 0.631318 | `storage_blob_container_get` | ❌ | -| 3 | 0.589010 | `storage_blob_container_create` | ❌ | -| 4 | 0.580074 | `storage_account_get` | ❌ | -| 5 | 0.457004 | `storage_account_create` | ❌ | - ---- - -## Test 446 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 451 **Expected Tool:** `storage_blob_get` **Prompt:** List all blobs in the blob container in the storage account @@ -18827,40 +8150,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.733586 | `storage_blob_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.702342 | `storage_blob_container_get` | ❌ | -| 3 | 0.605993 | `storage_blob_container_create` | ❌ | -| 4 | 0.579070 | `cosmos_database_container_list` | ❌ | -<<<<<<< HEAD -| 5 | 0.506639 | `cosmos_database_container_item_query` | ❌ | - ---- - -## Test 442 -======= -<<<<<<< HEAD -| 5 | 0.506792 | `cosmos_database_container_item_query` | ❌ | - ---- - -## Test 432 -======= -======= -| 2 | 0.701375 | `storage_blob_container_get` | ❌ | +| 2 | 0.700891 | `storage_blob_container_get` | ❌ | | 3 | 0.605993 | `storage_blob_container_create` | ❌ | | 4 | 0.579070 | `cosmos_database_container_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.506639 | `cosmos_database_container_item_query` | ❌ | --- -<<<<<<< HEAD -## Test 442 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) 
-======= -## Test 447 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 452 **Expected Tool:** `storage_blob_get` **Prompt:** Show me the blobs in the blob container in the storage account @@ -18870,37 +8167,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.704426 | `storage_blob_get` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.666342 | `storage_blob_container_get` | ❌ | -======= -| 2 | 0.665280 | `storage_blob_container_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 2 | 0.664940 | `storage_blob_container_get` | ❌ | | 3 | 0.561557 | `storage_blob_container_create` | ❌ | | 4 | 0.533515 | `cosmos_database_container_list` | ❌ | | 5 | 0.484018 | `storage_account_get` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 443 -======= -## Test 433 -======= -| 1 | 0.704413 | `storage_blob_get` | ✅ **EXPECTED** | -| 2 | 0.664877 | `storage_blob_container_get` | ❌ | -| 3 | 0.561546 | `storage_blob_container_create` | ❌ | -| 4 | 0.533442 | `cosmos_database_container_list` | ❌ | -| 5 | 0.483914 | `storage_account_get` | ❌ | - ---- - -## Test 443 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 448 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 453 **Expected Tool:** `storage_blob_upload` **Prompt:** Upload file to storage blob in container in storage account @@ -18909,51 +8183,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.566278 | `storage_blob_upload` | ✅ **EXPECTED** | -| 2 | 0.525685 | `storage_blob_container_create` | ❌ | -| 3 | 0.517524 | `storage_blob_get` | ❌ | -| 4 | 0.474395 | `storage_blob_container_get` | ❌ | -| 5 | 0.382007 | `storage_account_create` | ❌ | +| 1 | 0.566319 | `storage_blob_upload` | ✅ **EXPECTED** | +| 2 | 0.525579 | `storage_blob_container_create` | ❌ | +| 3 | 0.517548 | `storage_blob_get` | ❌ | +| 4 | 0.473592 | `storage_blob_container_get` | ❌ | +| 5 
| 0.381886 | `storage_account_create` | ❌ | --- -## Test 444 -======= -<<<<<<< HEAD -| 1 | 0.566280 | `storage_blob_upload` | ✅ **EXPECTED** | -| 2 | 0.525689 | `storage_blob_container_create` | ❌ | -| 3 | 0.517628 | `storage_blob_get` | ❌ | -| 4 | 0.473667 | `storage_blob_container_get` | ❌ | -| 5 | 0.382148 | `storage_account_create` | ❌ | - ---- - -## Test 434 -======= -| 1 | 0.566287 | `storage_blob_upload` | ✅ **EXPECTED** | -| 2 | 0.525674 | `storage_blob_container_create` | ❌ | -| 3 | 0.517616 | `storage_blob_get` | ❌ | -| 4 | 0.473645 | `storage_blob_container_get` | ❌ | -| 5 | 0.382123 | `storage_account_create` | ❌ | - ---- - -## Test 444 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.566243 | `storage_blob_upload` | ✅ **EXPECTED** | -| 2 | 0.525629 | `storage_blob_container_create` | ❌ | -| 3 | 0.517631 | `storage_blob_get` | ❌ | -| 4 | 0.474198 | `storage_blob_container_get` | ❌ | -| 5 | 0.382137 | `storage_account_create` | ❌ | - ---- - -## Test 449 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 454 **Expected Tool:** `subscription_list` **Prompt:** List all subscriptions for my account @@ -18962,36 +8200,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.654048 | `subscription_list` | ✅ **EXPECTED** | -| 2 | 0.512964 | `cosmos_account_list` | ❌ | -| 3 | 0.471653 | `postgres_server_list` | ❌ | -| 4 | 0.469023 | `kusto_cluster_list` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.654071 | `subscription_list` | ✅ **EXPECTED** | | 2 | 0.512964 | `cosmos_account_list` | ❌ | | 3 | 0.471653 | `postgres_server_list` | ❌ | | 4 | 0.469023 | `kusto_cluster_list` | ❌ | -| 5 | 0.461054 | `redis_list` | ❌ | +| 5 | 0.461078 | `redis_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 445 -======= -<<<<<<< HEAD -## Test 435 -======= -## Test 445 ->>>>>>> 84ad4f44 (update prompts 
and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 450 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 455 **Expected Tool:** `subscription_list` **Prompt:** Show me my subscriptions @@ -19000,34 +8217,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.458834 | `subscription_list` | ✅ **EXPECTED** | -| 2 | 0.407101 | `eventgrid_subscription_list` | ❌ | -| 3 | 0.393662 | `eventgrid_topic_list` | ❌ | -| 4 | 0.391555 | `redis_list` | ❌ | -======= | 1 | 0.458821 | `subscription_list` | ✅ **EXPECTED** | | 2 | 0.407471 | `eventgrid_subscription_list` | ❌ | | 3 | 0.393695 | `eventgrid_topic_list` | ❌ | -| 4 | 0.391545 | `redis_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 4 | 0.391555 | `redis_list` | ❌ | | 5 | 0.381238 | `postgres_server_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 446 -======= -<<<<<<< HEAD -## Test 436 -======= -## Test 446 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 451 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 456 **Expected Tool:** `subscription_list` **Prompt:** What is my current subscription? 
@@ -19038,39 +8236,13 @@ |------|-------|------|--------| | 1 | 0.433196 | `subscription_list` | ✅ **EXPECTED** | | 2 | 0.319579 | `marketplace_product_list` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.315547 | `marketplace_product_get` | ❌ | -| 4 | 0.293009 | `eventgrid_subscription_list` | ❌ | -| 5 | 0.289280 | `eventgrid_topic_list` | ❌ | - ---- - -## Test 447 -======= -<<<<<<< HEAD -| 3 | 0.315354 | `marketplace_product_get` | ❌ | -======= -| 3 | 0.315474 | `marketplace_product_get` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -======= -| 3 | 0.315547 | `marketplace_product_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 3 | 0.315615 | `marketplace_product_get` | ❌ | | 4 | 0.293772 | `eventgrid_subscription_list` | ❌ | | 5 | 0.289334 | `eventgrid_topic_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 437 -======= -## Test 447 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 452 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 457 **Expected Tool:** `subscription_list` **Prompt:** What subscriptions do I have? 
@@ -19079,35 +8251,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.477657 | `subscription_list` | ✅ **EXPECTED** | -| 2 | 0.356775 | `eventgrid_subscription_list` | ❌ | -| 3 | 0.354286 | `marketplace_product_list` | ❌ | -| 4 | 0.344549 | `redis_list` | ❌ | -| 5 | 0.340764 | `eventgrid_topic_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 448 -======= -<<<<<<< HEAD -## Test 438 -======= -## Test 448 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.477592 | `subscription_list` | ✅ **EXPECTED** | | 2 | 0.357625 | `eventgrid_subscription_list` | ❌ | | 3 | 0.354286 | `marketplace_product_list` | ❌ | -| 4 | 0.344527 | `redis_list` | ❌ | +| 4 | 0.344549 | `redis_list` | ❌ | | 5 | 0.340837 | `eventgrid_topic_list` | ❌ | --- -## Test 453 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 458 **Expected Tool:** `azureterraformbestpractices_get` **Prompt:** Fetch the Azure Terraform best practices @@ -19116,37 +8268,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.686886 | `azureterraformbestpractices_get` | ✅ **EXPECTED** | -| 2 | 0.625270 | `deploy_iac_rules_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 3 | 0.605048 | `get_bestpractices_get` | ❌ | -| 4 | 0.482745 | `deploy_pipeline_guidance_get` | ❌ | -| 5 | 0.468390 | `azureaibestpractices_get` | ❌ | +| 1 | 0.686971 | `azureterraformbestpractices_get` | ✅ **EXPECTED** | +| 2 | 0.625325 | `deploy_iac_rules_get` | ❌ | +| 3 | 0.605182 | `get_bestpractices_get` | ❌ | +| 4 | 0.483016 | `deploy_pipeline_guidance_get` | ❌ | +| 5 | 0.466241 | `deploy_plan_get` | ❌ | --- -## Test 449 -======= -| 3 | 0.605599 | `get_bestpractices_get` | ❌ | -======= -| 3 | 0.605047 | `get_bestpractices_get` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) -| 4 | 0.482936 | `deploy_pipeline_guidance_get` | ❌ | -| 5 | 0.466199 | `deploy_plan_get` | ❌ | - ---- - -<<<<<<< HEAD -<<<<<<< 
HEAD -## Test 439 -======= -## Test 449 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 454 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 459 **Expected Tool:** `azureterraformbestpractices_get` **Prompt:** Show me the Azure Terraform best practices and generate code sample to get a secret from Azure Key Vault @@ -19155,24 +8285,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.581316 | `azureterraformbestpractices_get` | ✅ **EXPECTED** | -| 2 | 0.512141 | `get_bestpractices_get` | ❌ | -| 3 | 0.510005 | `deploy_iac_rules_get` | ❌ | -| 4 | 0.473943 | `keyvault_secret_get` | ❌ | -| 5 | 0.451726 | `azureaibestpractices_get` | ❌ | - ---- - -## Test 450 -======= -<<<<<<< HEAD -| 1 | 0.581332 | `azureterraformbestpractices_get` | ✅ **EXPECTED** | -| 2 | 0.512141 | `get_bestpractices_get` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 1 | 0.581316 | `azureterraformbestpractices_get` | ✅ **EXPECTED** | | 2 | 0.512141 | `get_bestpractices_get` | ❌ | | 3 | 0.510004 | `deploy_iac_rules_get` | ❌ | @@ -19181,16 +8293,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 440 -======= -## Test 450 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 455 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 460 **Expected Tool:** `virtualdesktop_hostpool_list` **Prompt:** List all host pools in my subscription @@ -19199,17 +8302,6 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.711905 | `virtualdesktop_hostpool_list` | ✅ **EXPECTED** | -| 2 | 0.659763 | `virtualdesktop_hostpool_host_list` | ❌ | -| 3 | 0.620665 | `kusto_cluster_list` | ❌ | -| 4 | 0.546744 | `search_service_list` | ❌ | -| 5 | 0.536423 | `virtualdesktop_hostpool_host_user-list` | ❌ | - ---- - -## Test 451 
-======= | 1 | 0.711969 | `virtualdesktop_hostpool_list` | ✅ **EXPECTED** | | 2 | 0.659763 | `virtualdesktop_hostpool_host_list` | ❌ | | 3 | 0.620666 | `kusto_cluster_list` | ❌ | @@ -19218,16 +8310,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 441 -======= -## Test 451 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 456 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 461 **Expected Tool:** `virtualdesktop_hostpool_host_list` **Prompt:** List all session hosts in host pool @@ -19237,39 +8320,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.727054 | `virtualdesktop_hostpool_host_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.715572 | `virtualdesktop_hostpool_host_user-list` | ❌ | -| 3 | 0.573350 | `virtualdesktop_hostpool_list` | ❌ | -======= -| 2 | 0.714553 | `virtualdesktop_hostpool_host_user-list` | ❌ | -======= -| 1 | 0.726933 | `virtualdesktop_hostpool_sessionhost_list` | ❌ | -| 2 | 0.714469 | `virtualdesktop_hostpool_sessionhost_usersession-list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) -======= | 2 | 0.714469 | `virtualdesktop_hostpool_host_user-list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 3 | 0.573352 | `virtualdesktop_hostpool_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.438659 | `aks_nodepool_get` | ❌ | +| 4 | 0.438621 | `aks_nodepool_get` | ❌ | | 5 | 0.393721 | `sql_elastic-pool_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 452 -======= -<<<<<<< HEAD -## Test 442 -======= -## Test 452 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 457 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 462 **Expected Tool:** `virtualdesktop_hostpool_host_user-list` **Prompt:** List all user sessions on session host in host pool @@ 
-19278,45 +8336,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -<<<<<<< HEAD -| 1 | 0.813311 | `virtualdesktop_hostpool_host_user-list` | ✅ **EXPECTED** | -| 2 | 0.659213 | `virtualdesktop_hostpool_host_list` | ❌ | -| 3 | 0.501113 | `virtualdesktop_hostpool_list` | ❌ | -======= -<<<<<<< HEAD -| 1 | 0.812787 | `virtualdesktop_hostpool_host_user-list` | ✅ **EXPECTED** | -======= | 1 | 0.812659 | `virtualdesktop_hostpool_host_user-list` | ✅ **EXPECTED** | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 2 | 0.659212 | `virtualdesktop_hostpool_host_list` | ❌ | | 3 | 0.501167 | `virtualdesktop_hostpool_list` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -| 4 | 0.357561 | `aks_nodepool_get` | ❌ | -| 5 | 0.336576 | `monitor_workspace_list` | ❌ | - ---- - -<<<<<<< HEAD -<<<<<<< HEAD -## Test 453 -======= -## Test 443 -======= -| 1 | 0.812628 | `virtualdesktop_hostpool_sessionhost_usersession-list` | ❌ | -| 2 | 0.658986 | `virtualdesktop_hostpool_sessionhost_list` | ❌ | -| 3 | 0.501050 | `virtualdesktop_hostpool_list` | ❌ | -| 4 | 0.357450 | `aks_nodepool_get` | ❌ | -| 5 | 0.336389 | `monitor_workspace_list` | ❌ | +| 4 | 0.357540 | `aks_nodepool_get` | ❌ | +| 5 | 0.336385 | `monitor_workspace_list` | ❌ | --- -## Test 453 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 458 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 463 **Expected Tool:** `workbooks_create` **Prompt:** Create a new workbook named @@ -19325,31 +8353,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.552307 | `workbooks_create` | ✅ **EXPECTED** | +| 1 | 0.552212 | `workbooks_create` | ✅ **EXPECTED** | | 2 | 0.417950 | `workbooks_update` | ❌ | | 3 | 0.361364 | `workbooks_delete` | ❌ | -<<<<<<< HEAD | 4 | 0.329077 | `workbooks_show` | ❌ | -======= -| 4 | 0.329118 | `workbooks_show` | ❌ | ->>>>>>> e2fd2eac (refactor 
tts mcp tool) | 5 | 0.328113 | `workbooks_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 454 -======= -<<<<<<< HEAD -## Test 444 -======= -## Test 454 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 459 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 464 **Expected Tool:** `workbooks_delete` **Prompt:** Delete the workbook with resource ID @@ -19359,40 +8371,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.621310 | `workbooks_delete` | ✅ **EXPECTED** | -<<<<<<< HEAD | 2 | 0.498506 | `workbooks_show` | ❌ | | 3 | 0.432454 | `workbooks_create` | ❌ | -<<<<<<< HEAD -| 4 | 0.425569 | `workbooks_list` | ❌ | -======= -<<<<<<< HEAD -| 4 | 0.425484 | `workbooks_list` | ❌ | -======= -| 4 | 0.425569 | `workbooks_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 2 | 0.498518 | `workbooks_show` | ❌ | -| 3 | 0.432643 | `workbooks_create` | ❌ | | 4 | 0.425569 | `workbooks_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.421897 | `workbooks_update` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 455 -======= -<<<<<<< HEAD -## Test 445 -======= -## Test 455 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 460 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 465 **Expected Tool:** `workbooks_list` **Prompt:** List all workbooks in my resource group @@ -19401,35 +8387,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD -| 1 | 0.772404 | `workbooks_list` | ✅ **EXPECTED** | -| 2 | 0.562476 | `workbooks_create` | ❌ | -| 3 | 0.516733 | `grafana_list` | ❌ | -| 4 | 0.493962 | `workbooks_show` | ❌ | -| 5 | 0.488522 | `group_list` | ❌ | - ---- - -<<<<<<< HEAD -## Test 456 -======= -<<<<<<< HEAD -## Test 
446 -======= -## Test 456 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 1 | 0.772431 | `workbooks_list` | ✅ **EXPECTED** | -| 2 | 0.562794 | `workbooks_create` | ❌ | +| 2 | 0.562485 | `workbooks_create` | ❌ | | 3 | 0.516739 | `grafana_list` | ❌ | -| 4 | 0.494073 | `workbooks_show` | ❌ | -| 5 | 0.488600 | `group_list` | ❌ | +| 4 | 0.493975 | `workbooks_show` | ❌ | +| 5 | 0.488609 | `group_list` | ❌ | --- -## Test 461 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 466 **Expected Tool:** `workbooks_list` **Prompt:** What workbooks do I have in resource group ? @@ -19439,31 +8405,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.708612 | `workbooks_list` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.570260 | `workbooks_create` | ❌ | +| 2 | 0.570259 | `workbooks_create` | ❌ | | 3 | 0.499633 | `workbooks_show` | ❌ | -======= -| 2 | 0.570521 | `workbooks_create` | ❌ | -| 3 | 0.499716 | `workbooks_show` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.485504 | `workbooks_delete` | ❌ | | 5 | 0.472378 | `grafana_list` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 457 -======= -<<<<<<< HEAD -## Test 447 -======= -## Test 457 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 462 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 467 **Expected Tool:** `workbooks_show` **Prompt:** Get information about the workbook with resource ID @@ -19472,41 +8421,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.686095 | `workbooks_show` | ✅ **EXPECTED** | | 2 | 0.498390 | `workbooks_create` | ❌ | -<<<<<<< HEAD -| 3 | 0.494708 | `workbooks_list` | ❌ | -======= -<<<<<<< HEAD -| 3 | 0.494492 | `workbooks_list` | ❌ | -======= -| 3 | 0.494708 | `workbooks_list` | ❌ | ->>>>>>> 84ad4f44 (update prompts and tool description 
evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 1 | 0.686087 | `workbooks_show` | ✅ **EXPECTED** | -| 2 | 0.498518 | `workbooks_create` | ❌ | | 3 | 0.494708 | `workbooks_list` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.463156 | `workbooks_update` | ❌ | | 5 | 0.452348 | `workbooks_delete` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 458 -======= -<<<<<<< HEAD -## Test 448 -======= -## Test 458 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 463 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 468 **Expected Tool:** `workbooks_show` **Prompt:** Show me the workbook with resource ID @@ -19515,33 +8438,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -<<<<<<< HEAD | 1 | 0.581575 | `workbooks_show` | ✅ **EXPECTED** | | 2 | 0.500475 | `workbooks_list` | ❌ | | 3 | 0.468996 | `workbooks_create` | ❌ | -======= -| 1 | 0.581501 | `workbooks_show` | ✅ **EXPECTED** | -| 2 | 0.500475 | `workbooks_list` | ❌ | -| 3 | 0.469214 | `workbooks_create` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.466266 | `workbooks_update` | ❌ | | 5 | 0.455311 | `workbooks_delete` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 459 -======= -<<<<<<< HEAD -## Test 449 -======= -## Test 459 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 464 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 469 **Expected Tool:** `workbooks_update` **Prompt:** Update the workbook with a new text step @@ -19551,31 +8456,14 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| | 1 | 0.586347 | `workbooks_update` | ✅ **EXPECTED** | -| 2 | 0.382724 | `workbooks_create` | ❌ | +| 2 | 0.382651 | `workbooks_create` | ❌ | | 3 | 0.349689 | `workbooks_delete` | ❌ | -<<<<<<< HEAD | 4 | 0.347778 | `workbooks_show` | ❌ | | 5 | 
0.292904 | `loadtesting_testrun_update` | ❌ | --- -<<<<<<< HEAD -## Test 460 -======= -<<<<<<< HEAD -## Test 450 -======= -## Test 460 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -| 4 | 0.347944 | `workbooks_show` | ❌ | -| 5 | 0.292993 | `loadtesting_testrun_update` | ❌ | - ---- - -## Test 465 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 470 **Expected Tool:** `bicepschema_get` **Prompt:** How can I use Bicep to create an Azure OpenAI service? @@ -19584,40 +8472,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.543803 | `bicepschema_get` | ✅ **EXPECTED** | +| 1 | 0.543154 | `bicepschema_get` | ✅ **EXPECTED** | | 2 | 0.485970 | `foundry_models_deploy` | ❌ | | 3 | 0.485889 | `deploy_iac_rules_get` | ❌ | -<<<<<<< HEAD -<<<<<<< HEAD -| 4 | 0.468898 | `azureaibestpractices_get` | ❌ | -| 5 | 0.453412 | `foundry_openai_embeddings-create` | ❌ | - ---- - -## Test 461 -======= -<<<<<<< HEAD -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) | 4 | 0.453282 | `foundry_openai_embeddings-create` | ❌ | | 5 | 0.448373 | `get_bestpractices_get` | ❌ | --- -<<<<<<< HEAD -## Test 451 -======= -| 4 | 0.462146 | `foundry_openai_embeddings-create` | ❌ | -| 5 | 0.449694 | `get_bestpractices_get` | ❌ | - ---- - -## Test 461 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 466 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 471 **Expected Tool:** `cloudarchitect_design` **Prompt:** Please help me design an architecture for a large-scale file upload, storage, and retrieval service @@ -19626,43 +8489,15 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.502125 | `cloudarchitect_design` | ✅ **EXPECTED** | +| 1 | 0.502110 | `cloudarchitect_design` | ✅ **EXPECTED** | | 2 | 0.290902 | `storage_blob_upload` | ❌ | -<<<<<<< HEAD -<<<<<<< 
HEAD -| 3 | 0.260101 | `managedlustre_fs_create` | ❌ | -| 4 | 0.254991 | `deploy_architecture_diagram_generate` | ❌ | -======= -<<<<<<< HEAD -| 3 | 0.259162 | `managedlustre_fs_create` | ❌ | -| 4 | 0.254853 | `deploy_architecture_diagram_generate` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= | 3 | 0.259162 | `managedlustre_fs_create` | ❌ | | 4 | 0.254991 | `deploy_architecture_diagram_generate` | ❌ | ->>>>>>> e2fd2eac (refactor tts mcp tool) | 5 | 0.245034 | `managedlustre_fs_subnetsize_validate` | ❌ | --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 462 -======= -## Test 452 -======= -| 3 | 0.259162 | `managedlustre_filesystem_create` | ❌ | -| 4 | 0.254991 | `deploy_architecture_diagram_generate` | ❌ | -| 5 | 0.245034 | `managedlustre_filesystem_subnetsize_validate` | ❌ | - ---- - -## Test 462 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 467 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 472 **Expected Tool:** `cloudarchitect_design` **Prompt:** Help me design an Azure cloud service that will serve as an ATM for users @@ -19671,25 +8506,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.508153 | `cloudarchitect_design` | ✅ **EXPECTED** | -<<<<<<< HEAD -<<<<<<< HEAD -| 2 | 0.377941 | `deploy_architecture_diagram_generate` | ❌ | -| 3 | 0.341316 | `deploy_pipeline_guidance_get` | ❌ | -| 4 | 0.336385 | `azureaibestpractices_get` | ❌ | -| 5 | 0.328747 | `get_bestpractices_get` | ❌ | - ---- - -## Test 463 -======= -<<<<<<< HEAD -| 2 | 0.377584 | `deploy_architecture_diagram_generate` | ❌ | -| 3 | 0.341462 | `deploy_pipeline_guidance_get` | ❌ | -| 4 | 0.328747 | `get_bestpractices_get` | ❌ | -======= -======= ->>>>>>> e2fd2eac (refactor tts mcp tool) +| 1 | 0.508504 | `cloudarchitect_design` | ✅ **EXPECTED** | | 2 | 0.377941 | `deploy_architecture_diagram_generate` | ❌ | | 3 | 0.341462 | 
`deploy_pipeline_guidance_get` | ❌ | | 4 | 0.328747 | `get_bestpractices_get` | ❌ | @@ -19697,16 +8514,7 @@ --- -<<<<<<< HEAD -<<<<<<< HEAD -## Test 453 -======= -## Test 463 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -## Test 468 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 473 **Expected Tool:** `cloudarchitect_design` **Prompt:** I want to design a cloud app for ordering groceries @@ -19715,25 +8523,7 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.423577 | `cloudarchitect_design` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.271869 | `deploy_pipeline_guidance_get` | ❌ | -| 3 | 0.265972 | `deploy_architecture_diagram_generate` | ❌ | -| 4 | 0.242581 | `deploy_plan_get` | ❌ | -| 5 | 0.241197 | `azureaibestpractices_get` | ❌ | - ---- - -<<<<<<< HEAD -## Test 464 -======= -<<<<<<< HEAD -## Test 454 -======= -## Test 464 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= +| 1 | 0.423059 | `cloudarchitect_design` | ✅ **EXPECTED** | | 2 | 0.271943 | `deploy_pipeline_guidance_get` | ❌ | | 3 | 0.265972 | `deploy_architecture_diagram_generate` | ❌ | | 4 | 0.242581 | `deploy_plan_get` | ❌ | @@ -19741,8 +8531,7 @@ --- -## Test 469 ->>>>>>> e2fd2eac (refactor tts mcp tool) +## Test 474 **Expected Tool:** `cloudarchitect_design` **Prompt:** How can I design a cloud service in Azure that will store and present videos for users? 
@@ -19751,111 +8540,39 @@ | Rank | Score | Tool | Status | |------|-------|------|--------| -| 1 | 0.534690 | `cloudarchitect_design` | ✅ **EXPECTED** | -<<<<<<< HEAD -| 2 | 0.369872 | `deploy_pipeline_guidance_get` | ❌ | -| 3 | 0.357808 | `managedlustre_fs_create` | ❌ | -| 4 | 0.352797 | `deploy_architecture_diagram_generate` | ❌ | -| 5 | 0.324217 | `azureaibestpractices_get` | ❌ | -======= +| 1 | 0.535224 | `cloudarchitect_design` | ✅ **EXPECTED** | | 2 | 0.369969 | `deploy_pipeline_guidance_get` | ❌ | | 3 | 0.356331 | `managedlustre_fs_create` | ❌ | | 4 | 0.352797 | `deploy_architecture_diagram_generate` | ❌ | | 5 | 0.323920 | `storage_blob_upload` | ❌ | ->>>>>>> 58ab8585 (update prompts and tool description evaluator) --- ## Summary -<<<<<<< HEAD -<<<<<<< HEAD -**Total Prompts Tested:** 464 -**Analysis Execution Time:** 186.7791311s -======= -<<<<<<< HEAD -**Total Prompts Tested:** 454 -**Analysis Execution Time:** 61.2275421s ->>>>>>> 58ab8585 (update prompts and tool description evaluator) - -### Success Rate Metrics - -**Top Choice Success:** 92.2% (428/464 tests) - -#### Confidence Level Distribution - -**💪 Very High Confidence (≥0.8):** 3.2% (15/464 tests) -**🎯 High Confidence (≥0.7):** 22.8% (106/464 tests) -**✅ Good Confidence (≥0.6):** 62.3% (289/464 tests) -**👍 Fair Confidence (≥0.5):** 92.2% (428/464 tests) -**👌 Acceptable Confidence (≥0.4):** 99.6% (462/464 tests) -**❌ Low Confidence (<0.4):** 0.4% (2/464 tests) - -#### Top Choice + Confidence Combinations - -<<<<<<< HEAD -**💪 Top Choice + Very High Confidence (≥0.8):** 3.2% (15/464 tests) -**🎯 Top Choice + High Confidence (≥0.7):** 22.8% (106/464 tests) -**✅ Top Choice + Good Confidence (≥0.6):** 60.3% (280/464 tests) -**👍 Top Choice + Fair Confidence (≥0.5):** 86.9% (403/464 tests) -**👌 Top Choice + Acceptable Confidence (≥0.4):** 92.2% (428/464 tests) -======= -**💪 Top Choice + Very High Confidence (≥0.8):** 3.3% (15/454 tests) -**🎯 Top Choice + High Confidence (≥0.7):** 23.3% (106/454 tests) -**✅ 
Top Choice + Good Confidence (≥0.6):** 60.6% (275/454 tests) -**👍 Top Choice + Fair Confidence (≥0.5):** 86.8% (394/454 tests) -**👌 Top Choice + Acceptable Confidence (≥0.4):** 92.1% (418/454 tests) -======= -**Total Prompts Tested:** 464 -**Analysis Execution Time:** 123.7654249s - -### Success Rate Metrics - -**Top Choice Success:** 89.2% (414/464 tests) - -#### Confidence Level Distribution - -**💪 Very High Confidence (≥0.8):** 2.6% (12/464 tests) -**🎯 High Confidence (≥0.7):** 19.6% (91/464 tests) -**✅ Good Confidence (≥0.6):** 57.8% (268/464 tests) -**👍 Fair Confidence (≥0.5):** 88.8% (412/464 tests) -**👌 Acceptable Confidence (≥0.4):** 96.3% (447/464 tests) -**❌ Low Confidence (<0.4):** 3.7% (17/464 tests) - -#### Top Choice + Confidence Combinations - -**💪 Top Choice + Very High Confidence (≥0.8):** 2.6% (12/464 tests) -**🎯 Top Choice + High Confidence (≥0.7):** 19.6% (91/464 tests) -**✅ Top Choice + Good Confidence (≥0.6):** 55.8% (259/464 tests) -**👍 Top Choice + Fair Confidence (≥0.5):** 83.8% (389/464 tests) -**👌 Top Choice + Acceptable Confidence (≥0.4):** 89.2% (414/464 tests) ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= -**Total Prompts Tested:** 469 -**Analysis Execution Time:** 101.3684843s +**Total Prompts Tested:** 474 +**Analysis Execution Time:** 105.8478990s ### Success Rate Metrics -**Top Choice Success:** 92.3% (433/469 tests) +**Top Choice Success:** 91.4% (433/474 tests) #### Confidence Level Distribution -**💪 Very High Confidence (≥0.8):** 3.2% (15/469 tests) -**🎯 High Confidence (≥0.7):** 22.6% (106/469 tests) -**✅ Good Confidence (≥0.6):** 61.2% (287/469 tests) -**👍 Fair Confidence (≥0.5):** 91.9% (431/469 tests) -**👌 Acceptable Confidence (≥0.4):** 99.6% (467/469 tests) -**❌ Low Confidence (<0.4):** 0.4% (2/469 tests) +**💪 Very High Confidence (≥0.8):** 3.2% (15/474 tests) +**🎯 High Confidence (≥0.7):** 22.4% (106/474 tests) +**✅ Good 
Confidence (≥0.6):** 60.5% (287/474 tests) +**👍 Fair Confidence (≥0.5):** 90.9% (431/474 tests) +**👌 Acceptable Confidence (≥0.4):** 98.5% (467/474 tests) +**❌ Low Confidence (<0.4):** 1.5% (7/474 tests) #### Top Choice + Confidence Combinations -**💪 Top Choice + Very High Confidence (≥0.8):** 3.2% (15/469 tests) -**🎯 Top Choice + High Confidence (≥0.7):** 22.6% (106/469 tests) -**✅ Top Choice + Good Confidence (≥0.6):** 59.3% (278/469 tests) -**👍 Top Choice + Fair Confidence (≥0.5):** 86.6% (406/469 tests) -**👌 Top Choice + Acceptable Confidence (≥0.4):** 92.3% (433/469 tests) ->>>>>>> e2fd2eac (refactor tts mcp tool) +**💪 Top Choice + Very High Confidence (≥0.8):** 3.2% (15/474 tests) +**🎯 Top Choice + High Confidence (≥0.7):** 22.4% (106/474 tests) +**✅ Top Choice + Good Confidence (≥0.6):** 58.6% (278/474 tests) +**👍 Top Choice + Fair Confidence (≥0.5):** 85.7% (406/474 tests) +**👌 Top Choice + Acceptable Confidence (≥0.4):** 91.4% (433/474 tests) ### Success Rate Analysis diff --git a/eng/tools/ToolDescriptionEvaluator/tools.json b/eng/tools/ToolDescriptionEvaluator/tools.json index 839af1bc3..8cf5aa60d 100644 --- a/eng/tools/ToolDescriptionEvaluator/tools.json +++ b/eng/tools/ToolDescriptionEvaluator/tools.json @@ -896,12 +896,6 @@ } ] }, - { - "name": "get", - "description": "Returns best practices and code generation guidance for building AI applications in Azure. \r\n Use this tool when you need recommendations on how to write code for AI agents, chatbots, workflows, or other AI features.\r\n This tool also provides guidance for code generation using the Azure resources (e.g. Azure AI Foundry) for application development only. \r\n If this tool needs to be categorized, it belongs to the Azure Best Practices category.", - "command": "azureaibestpractices get", - "option": [] - }, { "name": "get", "description": "Returns Terraform best practices for Azure. 
Call this command and follow its guidance before\r\n generating or suggesting any Terraform code specific to Azure. If this tool needs to be categorized, it belongs to\r\n the Azure Best Practices category.", @@ -13268,17 +13262,5 @@ } ], "consolidated_tools": null, -<<<<<<< HEAD -<<<<<<< HEAD - "duration": 53 -======= -<<<<<<< HEAD - "duration": 47 -======= - "duration": 49 ->>>>>>> 84ad4f44 (update prompts and tool description evaluator) ->>>>>>> 58ab8585 (update prompts and tool description evaluator) -======= - "duration": 55 ->>>>>>> e2fd2eac (refactor tts mcp tool) + "duration": 56 } \ No newline at end of file From 55c12f6294ce4711f860e1c90c7043c8b3ff0ae6 Mon Sep 17 00:00:00 2001 From: "Fei Zhao (SPEECH)" Date: Mon, 10 Nov 2025 22:37:36 +0800 Subject: [PATCH 14/14] fix more assert property --- .../SpeechCommandTests.cs | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs index 27bea9922..51855aa3a 100644 --- a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs +++ b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs @@ -414,12 +414,12 @@ public async Task SpeechToText_WithBrokenFile_ShouldHandleGracefully() var resultObject = jsonResult.RootElement; // Validate Error message for corrupted file - Assert.True(resultObject.TryGetProperty("message", out var messageProperty)); + var messageProperty = resultObject.AssertProperty("message"); var message = messageProperty.GetString() ?? ""; Assert.True(message.Contains("The audio file appears to be empty or corrupted. 
Please provide a valid audio file.", StringComparison.OrdinalIgnoreCase)); // Validate exception type - Assert.True(resultObject.TryGetProperty("type", out var exceptionTypeProperty)); + var exceptionTypeProperty = resultObject.AssertProperty("type"); var exceptionType = exceptionTypeProperty.GetString() ?? ""; Assert.True(exceptionType.Contains("InvalidOperationException", StringComparison.OrdinalIgnoreCase)); } @@ -507,13 +507,13 @@ public async Task SpeechToText_RecognizeCompressedAudioWithRealtimeTranscription var resultObject = jsonResult.RootElement; // Validate Error message for corrupted file - Assert.True(resultObject.TryGetProperty("message", out var messageProperty)); + var messageProperty = resultObject.AssertProperty("message"); var message = messageProperty.GetString() ?? ""; Assert.True(message.Contains("Cannot process compressed audio file", StringComparison.OrdinalIgnoreCase)); Assert.True(message.Contains("because GStreamer is not properly installed or configured.", StringComparison.OrdinalIgnoreCase)); // Validate exception type - Assert.True(resultObject.TryGetProperty("type", out var exceptionTypeProperty)); + var exceptionTypeProperty = resultObject.AssertProperty("type"); var exceptionType = exceptionTypeProperty.GetString() ?? 
""; Assert.True(exceptionType.Contains("InvalidOperationException", StringComparison.OrdinalIgnoreCase)); } @@ -604,14 +604,14 @@ public async Task Should_synthesize_speech_with_different_voices(string language var jsonResult = JsonDocument.Parse(resultText); var resultObject = jsonResult.RootElement; - Assert.True(resultObject.TryGetProperty("result", out var resultProperty)); + var resultProperty = resultObject.AssertProperty("result"); // Verify voice was used - Assert.True(resultProperty.TryGetProperty("voice", out var voiceProperty)); + var voiceProperty = resultProperty.AssertProperty("voice"); Assert.Equal(voice, voiceProperty.GetString()); // Verify language - Assert.True(resultProperty.TryGetProperty("language", out var languageProperty)); + var languageProperty = resultProperty.AssertProperty("language"); Assert.Equal(language, languageProperty.GetString()); // Verify file exists @@ -657,10 +657,10 @@ public async Task Should_synthesize_speech_with_different_formats(string format) var jsonResult = JsonDocument.Parse(resultText); var resultObject = jsonResult.RootElement; - Assert.True(resultObject.TryGetProperty("result", out var resultProperty)); + var resultProperty = resultObject.AssertProperty("result"); // Verify format - Assert.True(resultProperty.TryGetProperty("format", out var formatProperty)); + var formatProperty = resultProperty.AssertProperty("format"); Assert.Equal(format, formatProperty.GetString()); // Verify file exists and has content @@ -772,7 +772,7 @@ public async Task Should_handle_large_text_input() var jsonResult = JsonDocument.Parse(resultText); var resultObject = jsonResult.RootElement; - Assert.True(resultObject.TryGetProperty("result", out var resultProperty)); + var resultProperty = resultObject.AssertProperty("result"); // Verify file exists and is significantly larger than a short phrase Assert.True(File.Exists(outputFile));