I am trying to get the Bing Speech API to work in C# via WebSockets. I looked at the JavaScript implementation here and followed the protocol instructions here, but I have hit a complete brick wall. I cannot use the existing C# service because I am running in a Linux container, so I need an implementation that works on .NET Core. Annoyingly, the existing service is closed-source!
I can successfully connect to the WebSocket, but I cannot get the server to respond to my connection. I expect to receive a turn.start text message from the server, but I get booted from the server as soon as I have sent a few bytes of the audio file. I know that the audio file is in the correct format because I took it directly from the C# service example here.
I feel like I've run out of options here. The only thing I can think of right now is that I am not sending the audio bytes correctly. Currently I'm just sending the audio file in sequential 4096-byte chunks. I know that the first audio message contains the RIFF header, which is only 36 bytes, and then I just send it along with the following (4096-36) bytes.
Here is my full code. You just need to run it as a .NET Core or .NET console application, and you need an audio file and an API key.
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Net.WebSockets;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace ConsoleApp3
{
    class Program
    {
        /// <summary>
        /// Console entry point: creates a <see cref="BingSpeechToTextService"/> and runs
        /// <c>RegisterJob</c> with the placeholder file path and key to completion.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // GetAwaiter().GetResult() rethrows the original exception, whereas Wait()
            // would hide it inside an AggregateException.
            Task.Run(async () =>
            {
                var bingService = new BingSpeechToTextService();
                var audioFilePath = @"FILEPATH GOES HERE";
                var authenticationKey = @"BING AUTHENTICATION KEY GOES HERE";
                await bingService.RegisterJob(audioFilePath, authenticationKey);
            }).GetAwaiter().GetResult();
        }
    }

    public class BingSpeechToTextService
    {
        /// <summary>
        /// Reads from the web socket until a close frame arrives. Each complete text
        /// message is written to the console; any other complete message is reported
        /// as "Other result".
        /// </summary>
        /// <param name="client">A connected web socket to read from.</param>
        private static async Task Receiving(ClientWebSocket client)
        {
            var buffer = new byte[4096];

            // A single WebSocket message may arrive in several fragments. Collect the
            // fragments and decode only once the message is complete, so multi-byte
            // UTF-8 sequences are never cut in half and a message is printed as a whole.
            using (var message = new MemoryStream())
            {
                while (true)
                {
                    var result = await client.ReceiveAsync(new ArraySegment<byte>(buffer), CancellationToken.None);

                    if (result.MessageType == WebSocketMessageType.Close)
                    {
                        Console.WriteLine($"Closing ... reason {client.CloseStatusDescription}");
                        //await client.CloseOutputAsync(WebSocketCloseStatus.NormalClosure, "", CancellationToken.None);
                        break;
                    }

                    message.Write(buffer, 0, result.Count);
                    if (!result.EndOfMessage)
                    {
                        continue;
                    }

                    if (result.MessageType == WebSocketMessageType.Text)
                    {
                        Console.WriteLine(Encoding.UTF8.GetString(message.ToArray()));
                    }
                    else
                    {
                        Console.WriteLine("Other result");
                    }

                    // Reuse the stream for the next message.
                    message.SetLength(0);
                }
            }
        }
        /* #endregion Private Static Methods */

        /* #region Public Static Methods */
        /// <summary>
        /// Swaps the low and high bytes of a 16-bit value (e.g. 0x1234 becomes 0x3412).
        /// </summary>
        /// <param name="value">The value whose two bytes are exchanged.</param>
        /// <returns>The byte-swapped value.</returns>
        public static UInt16 ReverseBytes(UInt16 value)
        {
            return (UInt16)((value & 0xFFU) << 8 | (value & 0xFF00U) >> 8);
        }
        /* #endregion Public Static Methods */

        /* #region Interface: 'Unscrypt.Bing.SpeechToText.Client.Api.IBingSpeechToTextJobService' Methods */
        /// <summary>
        /// Obtains an access token for the given key, opens a web socket to the Bing
        /// speech recognition endpoint, starts the receive loop and then starts a task
        /// that sends the speech configuration message.
        /// </summary>
        /// <param name="audioFilePath">Path of the audio file to submit.</param>
        /// <param name="authenticationKeyStr">Bing Speech API subscription key.</param>
        public async Task<int?> RegisterJob(string audioFilePath, string authenticationKeyStr)
        {
            var authenticationKey = new BingSocketAuthentication(authenticationKeyStr);
            var token = authenticationKey.GetAccessToken();

            /* #region Connect web socket */
            var cws = new ClientWebSocket();
            var connectionId = Guid.NewGuid().ToString("N");
            var lang = "en-US";
            cws.Options.SetRequestHeader("X-ConnectionId", connectionId);
            cws.Options.SetRequestHeader("Authorization", "Bearer " + token);
            Console.WriteLine("Connecting to web socket.");
            var url = $"wss://speech.platform.bing.com/speech/recognition/interactive/cognitiveservices/v1?format=simple&language={lang}";
            await cws.ConnectAsync(new Uri(url), new CancellationToken());
            Console.WriteLine("Connected.");
            /* #endregion*/

            /* #region Receiving */
            // Deliberately not awaited here: the receive loop runs while the sender below works.
            var receiving = Receiving(cws);
            /* #endregion*/

            /* #region Sending */
            var sending = Task.Run(async () =>
            {
                dynamic speechConfig = new
                {
                    context = new
                    {
                        system = new { version = "1.0.00000" },
                        os = new
                        {
                            platform = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                            name = "Browser",
                            version = ""
                        },
                        device = new
                        {
                            manufacturer = "SpeechSample",
                            model = "SpeechSample",
                            version = "1.0.00000"
                        }
                    }
                };

                var requestId = Guid.NewGuid().ToString("N");
                var speechConfigJson = JsonConvert.SerializeObject(speechConfig, Formatting.None);
                StringBuilder outputBuilder = new StringBuilder();
                outputBuilder.Append("path:speech.config\r\n");