Add Web Audio priming/blessing #3974

Merged · 19 commits · Jul 7, 2021
4 changes: 3 additions & 1 deletion CHANGELOG.md
@@ -40,7 +40,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- `"any"` will show when there are any offscreen messages;
- `false` will always hide the button.
- Added new [`scrollToEndButtonMiddleware`](https://github.com/microsoft/BotFramework-WebChat/blob/main/packages/api/src/types/scrollToEndButtonMiddleware.ts) to customize the appearance of the scroll to end button.
- Resolves [#3752](https://github.com/microsoft/BotFramework-WebChat/issues/3752). Added typings (`*.d.ts`) for all public interfaces, by [@compulim](https://github.com), in PR [#3931](https://github.com/microsoft/BotFramework-WebChat/pull/3931) and [#3946](https://github.com/microsoft/BotFramework-WebChat/pull/3946)
- Resolves [#3752](https://github.com/microsoft/BotFramework-WebChat/issues/3752). Added typings (`*.d.ts`) for all public interfaces, by [@compulim](https://github.com/compulim), in PR [#3931](https://github.com/microsoft/BotFramework-WebChat/pull/3931) and [#3946](https://github.com/microsoft/BotFramework-WebChat/pull/3946)
- Resolves [#2316](https://github.com/microsoft/BotFramework-WebChat/issues/2316). Added blessing/priming of `AudioContext` when clicking on microphone button, by [@compulim](https://github.com/compulim), in PR [#3974](https://github.com/microsoft/BotFramework-WebChat/pull/3974)

### Fixed

@@ -57,6 +58,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Fixes [#3856](https://github.com/microsoft/BotFramework-WebChat/issues/3856). Fix missing typings, by [@compulim](https://github.com/compulim) and [@corinagum](https://github.com/corinagum), in PR [#3931](https://github.com/microsoft/BotFramework-WebChat/pull/3931)
- Fixes [#3943](https://github.com/microsoft/BotFramework-WebChat/issues/3943). Auto-scroll should skip invisible activities, such as post back or event activities, by [@compulim](https://github.com/compulim), in PR [#3945](https://github.com/microsoft/BotFramework-WebChat/pull/3945)
- Fixes [#3947](https://github.com/microsoft/BotFramework-WebChat/issues/3947). Adaptive Cards: all action sets (which have `role="menubar"`) must have at least one `role="menuitem"`, by [@compulim](https://github.com/compulim), in PR [#3950](https://github.com/microsoft/BotFramework-WebChat/pull/3950)
- Fixes [#3823](https://github.com/microsoft/BotFramework-WebChat/issues/3823) and [#3899](https://github.com/microsoft/BotFramework-WebChat/issues/3899). Fix speech recognition and synthesis on Safari, in PR [#3974](https://github.com/microsoft/BotFramework-WebChat/pull/3974)

### Changed

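For context, the "blessing/priming" mentioned above is the standard Web Audio autoplay workaround: an `AudioContext` created outside a user gesture starts out `suspended`, and calling `resume()` inside a gesture handler unlocks ("blesses") it for later playback and capture. A minimal standalone sketch of the pattern (the button id is an assumption, not Web Chat's actual markup):

```ts
// Create the AudioContext once and keep reusing it: a recreated context loses its blessing.
const audioContext = new AudioContext();

// Browsers only allow resume() to succeed inside a user gesture handler, such as a click.
document.getElementById('microphone-button')?.addEventListener('click', async () => {
  if (audioContext.state === 'suspended') {
    await audioContext.resume(); // The context is now "blessed" for later audio playback.
  }
});
```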
Binary file added __tests__/html/assets/hello-world.wav
74 changes: 74 additions & 0 deletions __tests__/html/speech.customAudioConfig.html
@@ -0,0 +1,74 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<link href="/assets/index.css" rel="stylesheet" type="text/css" />
<script crossorigin="anonymous" src="/test-harness.js"></script>
<script crossorigin="anonymous" src="/test-page-object.js"></script>
<script crossorigin="anonymous" src="/__dist__/webchat-es5.js"></script>
</head>
<body>
<div id="webchat"></div>
<script>
run(async function () {
const { authorizationToken, region } = await fetch(
'https://webchat-mockbot.azurewebsites.net/speechservices/token',
{
method: 'POST'
}
).then(res => res.json());

const riffWavArrayBuffer = await fetch('/assets/hello-world.wav').then(res => res.arrayBuffer());

const webSpeechPonyfillFactory = WebChat.createCognitiveServicesSpeechServicesPonyfillFactory({
audioConfig: testHelpers.createAudioInputStreamFromRiffWavArrayBuffer(riffWavArrayBuffer),
credentials: {
authorizationToken,
region
}
});

let numResumeAudioContextCalled = 0;

// GIVEN: Using custom AudioConfig with pre-recorded speech of "Hello, World!".
WebChat.renderWebChat(
{
directLine: WebChat.createDirectLine({ token: await testHelpers.token.fetchDirectLineToken() }),
store: testHelpers.createStore(),
webSpeechPonyfillFactory: () => {
return {
...webSpeechPonyfillFactory(),
resumeAudioContext: () => {
numResumeAudioContextCalled++;
}
};
}
},
document.getElementById('webchat')
);

await pageConditions.uiConnected();

// WHEN: Microphone button is clicked, it should send out the pre-recorded speech of "Hello, World!"
await host.click(pageElements.microphoneButton());

// THEN: resumeAudioContext() should be called once.
expect(numResumeAudioContextCalled).toBe(1);

// THEN: It should send out the voice.
await pageConditions.became(
'Recognize and send "Hello world."',
() =>
/hello\sworld/iu.test(
pageElements.activities()[0]?.querySelector('[aria-roledescription="message"]')?.innerText || ''
),
5000
);

// THEN: The bot should respond.
await pageConditions.numActivitiesShown(2);

await host.snapshot();
});
</script>
</body>
</html>
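The `testHelpers.createAudioInputStreamFromRiffWavArrayBuffer()` helper used above is not part of this excerpt. A plausible sketch of such a helper, built on the Speech SDK's push-stream API (the helper body, the 16 kHz/16-bit/mono PCM format, and the fixed 44-byte RIFF header are assumptions):

```ts
import {
  AudioConfig,
  AudioInputStream,
  AudioStreamFormat
} from 'microsoft-cognitiveservices-speech-sdk';

// Wraps a pre-recorded RIFF WAV buffer in a push stream, so the Speech SDK
// reads audio from the buffer instead of opening the microphone.
function createAudioInputStreamFromRiffWavArrayBuffer(riffWavArrayBuffer: ArrayBuffer): AudioConfig {
  // Assumes the payload is 16 kHz, 16-bit, mono PCM.
  const format = AudioStreamFormat.getWaveFormatPCM(16000, 16, 1);
  const pushStream = AudioInputStream.createPushStream(format);

  // Skip the 44-byte RIFF header, push the raw PCM samples, then signal end-of-stream.
  pushStream.write(riffWavArrayBuffer.slice(44));
  pushStream.close();

  return AudioConfig.fromStreamInput(pushStream);
}
```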
14 changes: 14 additions & 0 deletions packages/api/src/types/WebSpeechPonyfill.ts
@@ -1,9 +1,23 @@
/* globals SpeechGrammarList, SpeechRecognition, SpeechSynthesis */

type WebSpeechPonyfill = {
/**
* Function to resume the `AudioContext` object.
*
* Web Chat calls this function on user gestures to resume a suspended `AudioContext`.
*/
resumeAudioContext?: () => Promise<void>;

/** Polyfill for Web Speech API `SpeechGrammarList` class. */
SpeechGrammarList?: typeof SpeechGrammarList;

/** Polyfill for Web Speech API `SpeechRecognition` class. */
SpeechRecognition?: typeof SpeechRecognition;

/** Polyfill for Web Speech API `speechSynthesis` instance. */
speechSynthesis?: SpeechSynthesis;

/** Polyfill for Web Speech API `SpeechSynthesisUtterance` class. */
SpeechSynthesisUtterance?: typeof SpeechSynthesisUtterance;
};

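For consumers hand-rolling a ponyfill rather than using the Cognitive Services factory, the new optional hook slots in alongside the existing fields. A minimal sketch, assuming the browser's native Web Speech API as the backing implementation (availability varies by browser):

```ts
// A shared AudioContext created up front; it stays "suspended" until blessed.
const sharedAudioContext = new AudioContext();

const webSpeechPonyfillFactory = () => ({
  // Web Chat invokes this on user gestures (e.g. the microphone button click),
  // which is the only time browsers allow a suspended AudioContext to resume.
  resumeAudioContext: async () => {
    if (sharedAudioContext.state === 'suspended') {
      await sharedAudioContext.resume();
    }
  },

  // Back the remaining fields with the browser's native implementations.
  SpeechRecognition: window.SpeechRecognition,
  speechSynthesis: window.speechSynthesis,
  SpeechSynthesisUtterance: window.SpeechSynthesisUtterance
});
```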
20 changes: 17 additions & 3 deletions packages/bundle/package-lock.json


1 change: 1 addition & 0 deletions packages/bundle/package.json
@@ -51,6 +51,7 @@
"prop-types": "15.7.2",
"sanitize-html": "1.27.5",
"url-search-params-polyfill": "8.1.1",
"uuid": "8.3.2",
"web-speech-cognitive-services": "7.1.0",
"whatwg-fetch": "3.6.2"
},
@@ -21,7 +21,25 @@ beforeEach(() => {
createCognitiveServicesSpeechServicesPonyfillFactory = require('./createCognitiveServicesSpeechServicesPonyfillFactory')
.default;

window.navigator.mediaDevices = {};
window.AudioContext = class MockAudioContext {
// eslint-disable-next-line class-methods-use-this
createMediaStreamSource() {
// eslint-disable-next-line @typescript-eslint/no-empty-function
return { connect: () => {} };
}

// eslint-disable-next-line class-methods-use-this
createScriptProcessor() {
// eslint-disable-next-line @typescript-eslint/no-empty-function
return { connect: () => {} };
}
};

window.navigator.mediaDevices = {
getUserMedia: jest.fn(() => ({
getAudioTracks: () => ['mock-media-stream-track']
}))
};
});

afterEach(() => {
@@ -60,7 +78,8 @@ test('not providing reference grammar ID', () => {
expect(referenceGrammars).toEqual([]);
});

test('supplying audioInputDeviceId', () => {
test('supplying audioInputDeviceId', async () => {
// GIVEN: Set up Web Speech with "audioInputDeviceId" of "audio-input-device-1".
const ponyfillFactory = createCognitiveServicesSpeechServicesPonyfillFactory({
audioInputDeviceId: 'audio-input-device-1',
credentials: {
@@ -69,9 +88,20 @@
}
});

// WHEN: Polyfill is created.
ponyfillFactory({});

expect(createPonyfill.mock.calls[0][0]).toHaveProperty('audioConfig.privSource.deviceId', 'audio-input-device-1');
// WHEN: Audio source is attached and audio device is opened.
await createPonyfill.mock.calls[0][0].audioConfig.privSource.attach();

// THEN: It should call getUserMedia() with "audio" constraints of { deviceId: 'audio-input-device-1' }.
expect(window.navigator.mediaDevices.getUserMedia.mock.calls[0][0]).toHaveProperty(
'audio.deviceId',
'audio-input-device-1'
);

// THEN: It should call getUserMedia() with "video" constraint of false.
expect(window.navigator.mediaDevices.getUserMedia.mock.calls[0][0]).toHaveProperty('video', false);
});

test('supplying both audioConfig and audioInputDeviceId', () => {
@@ -1,35 +1,11 @@
import { AudioConfig } from 'microsoft-cognitiveservices-speech-sdk/distrib/lib/src/sdk/Audio/AudioConfig';
import { AudioConfig } from 'microsoft-cognitiveservices-speech-sdk';
import { WebSpeechPonyfillFactory } from 'botframework-webchat-api';
import createPonyfill from 'web-speech-cognitive-services/lib/SpeechServices';

import CognitiveServicesAudioOutputFormat from './types/CognitiveServicesAudioOutputFormat';
import CognitiveServicesCredentials from './types/CognitiveServicesCredentials';

type CognitiveServicesAudioOutputFormat =
| 'audio-16khz-128kbitrate-mono-mp3'
| 'audio-16khz-32kbitrate-mono-mp3'
| 'audio-16khz-64kbitrate-mono-mp3'
| 'audio-24khz-160kbitrate-mono-mp3'
| 'audio-24khz-48kbitrate-mono-mp3'
| 'audio-24khz-96kbitrate-mono-mp3'
| 'audio-48khz-192kbitrate-mono-mp3'
| 'audio-48khz-96kbitrate-mono-mp3'
| 'ogg-16khz-16bit-mono-opus'
| 'ogg-24khz-16bit-mono-opus'
| 'ogg-48khz-16bit-mono-opus'
| 'raw-16khz-16bit-mono-pcm'
| 'raw-16khz-16bit-mono-truesilk'
| 'raw-24khz-16bit-mono-pcm'
| 'raw-24khz-16bit-mono-truesilk'
| 'raw-48khz-16bit-mono-pcm'
| 'raw-8khz-8bit-mono-alaw'
| 'raw-8khz-8bit-mono-mulaw'
| 'riff-16khz-16bit-mono-pcm'
| 'riff-24khz-16bit-mono-pcm'
| 'riff-48khz-16bit-mono-pcm'
| 'riff-8khz-8bit-mono-alaw'
| 'riff-8khz-8bit-mono-mulaw'
| 'webm-16khz-16bit-mono-opus'
| 'webm-24khz-16bit-mono-opus';
import CognitiveServicesTextNormalization from './types/CognitiveServicesTextNormalization';
import createMicrophoneAudioConfigAndAudioContext from './speech/createMicrophoneAudioConfigAndAudioContext';

export default function createCognitiveServicesSpeechServicesPonyfillFactory({
audioConfig,
@@ -46,11 +22,11 @@ export default function createCognitiveServicesSpeechServicesPonyfillFactory({
audioContext?: AudioContext;
audioInputDeviceId?: string;
credentials: CognitiveServicesCredentials;
enableTelemetry?: boolean;
enableTelemetry?: true;
speechRecognitionEndpointId?: string;
speechSynthesisDeploymentId?: string;
speechSynthesisOutputFormat?: CognitiveServicesAudioOutputFormat;
textNormalization?: 'display' | 'itn' | 'lexical' | 'maskeditn';
textNormalization?: CognitiveServicesTextNormalization;
}): WebSpeechPonyfillFactory {
if (!window.navigator.mediaDevices && !audioConfig) {
console.warn(
@@ -60,19 +36,22 @@ export default function createCognitiveServicesSpeechServicesPonyfillFactory({
return () => ({});
}

if (audioConfig && audioInputDeviceId) {
console.warn(
'botframework-webchat: "audioConfig" and "audioInputDeviceId" cannot be set at the same time; ignoring "audioInputDeviceId".'
);
}
if (audioConfig) {
audioInputDeviceId &&
console.warn(
'botframework-webchat: "audioConfig" and "audioInputDeviceId" cannot be set at the same time; ignoring "audioInputDeviceId".'
);

// WORKAROUND: We should prevent the AudioContext object from being recreated because it may already be blessed and is UX-wise expensive to recreate.
// In the Cognitive Services SDK, if the "end" function is detected to be falsy, "suspend" is called instead of "end".
// On the next recognition, the SDK will then re-use the AudioContext object.
if (!audioConfig) {
audioConfig = audioInputDeviceId
? AudioConfig.fromMicrophoneInput(audioInputDeviceId)
: AudioConfig.fromDefaultMicrophoneInput();
audioContext &&
console.warn(
'botframework-webchat: "audioConfig" and "audioContext" cannot be set at the same time; ignoring "audioContext" for speech recognition.'
);
} else {
({ audioConfig, audioContext } = createMicrophoneAudioConfigAndAudioContext({
audioContext,
audioInputDeviceId,
enableTelemetry
}));
}

return ({ referenceGrammarID } = {}) => {
@@ -89,6 +68,7 @@ export default function createCognitiveServicesSpeechServicesPonyfillFactory({
});

return {
resumeAudioContext: () => audioContext && audioContext.state === 'suspended' && audioContext.resume(),
SpeechGrammarList,
SpeechRecognition,
speechSynthesis,
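`createMicrophoneAudioConfigAndAudioContext` is added elsewhere in this PR and its body is not shown in this excerpt. Its contract is the important part: hand back a single `AudioContext`, created or reused next to the microphone `AudioConfig`, so that the very same context can later be resumed by `resumeAudioContext()` instead of being recreated. A simplified sketch of that contract (not the PR's actual implementation, which also wires the microphone stream through the shared context and honors `enableTelemetry`):

```ts
import { AudioConfig } from 'microsoft-cognitiveservices-speech-sdk';

// Simplified sketch: return one reusable AudioContext alongside the AudioConfig.
function createMicrophoneAudioConfigAndAudioContext({
  audioContext,
  audioInputDeviceId
}: {
  audioContext?: AudioContext;
  audioInputDeviceId?: string;
}): { audioConfig: AudioConfig; audioContext: AudioContext } {
  // Reuse the caller-supplied context when present; a fresh context would lose any blessing.
  const sharedAudioContext = audioContext || new AudioContext();

  return {
    audioConfig: audioInputDeviceId
      ? AudioConfig.fromMicrophoneInput(audioInputDeviceId)
      : AudioConfig.fromDefaultMicrophoneInput(),
    audioContext: sharedAudioContext
  };
}
```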
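End to end, an application using the bundle can supply its own `AudioContext` so that the first microphone click both blesses the context and starts recognition. A sketch (fetching `token`, `authorizationToken`, and `region` is assumed to happen elsewhere):

```ts
const audioContext = new AudioContext();

const webSpeechPonyfillFactory = WebChat.createCognitiveServicesSpeechServicesPonyfillFactory({
  audioContext, // Shared context; Web Chat resumes it when the microphone button is clicked.
  credentials: { authorizationToken, region }
});

WebChat.renderWebChat(
  {
    directLine: WebChat.createDirectLine({ token }),
    webSpeechPonyfillFactory
  },
  document.getElementById('webchat')
);
```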