Unity对接科大讯飞实时语音转写WebAPI(Windows平台)

devtools/2024/9/23 4:31:24/

科大讯飞官方文档:实时语音转写 API 文档 | 讯飞开放平台文档中心 (xfyun.cn)

参考文章:unity通过WebAPI连接Websocket实现讯飞语音识别与合成。_unity websocket audio-CSDN博客

        要实现语音转文字。首先我们需要从麦克风获取到语音数据,这里用到了Microphone类,Unity自带;其次,需要将语音数据发送给讯飞,这里用到的是WebSocketSharp.WebSocket,用习惯了。然后就是按照文档一步步踩坑了。

        直接贴代码了。代码主要实现握手阶段参数签名,实时通信阶段的数据传输以及结果解析。

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using System;
using WebSocketSharp;
using System.Text;
using System.Security.Cryptography;
using LitJson;
using Newtonsoft.Json;

/// <summary>
/// Streams microphone audio to the iFLYTEK real-time speech transcription WebSocket API
/// and reports recognized text back on the Unity main thread.
/// </summary>
/// <remarks>
/// Typical usage: call <see cref="InitSpeechHelper"/> once, then <see cref="StartSpeech"/> to begin
/// recording/streaming and <see cref="StopSpeech"/> to stop recording. WebSocket messages arrive on a
/// background thread; they are queued and handled in <see cref="Update"/>.
/// </remarks>
public class SpeechHelper : MonoBehaviour
{
    /// <summary>Raised on the main thread with the text of every final recognition result.</summary>
    public event Action<string> 语音识别完成事件;

    /// <summary>Clip the microphone is recording into; audio is read from it while streaming.</summary>
    public AudioClip RecordedClip;

    // Credentials used to sign the handshake URL. The defaults are the placeholders from the
    // original sample; set real values in the inspector.
    [SerializeField] private string appId = "appid";
    [SerializeField] private string apiKey = "appkey";

    private const int SampleRate = 16000;            // recording sample rate passed to Microphone.Start
    private const int MaxRecordSeconds = 60;         // length of the (non-looping) recording clip
    private const float SendInterval = 0.04f;        // send one audio frame every 40 ms
    private const int MaxSamplesPerFrame = 640;      // 640 samples * 2 bytes = 1280 bytes per frame
    private const float ConnectTimeoutSeconds = 10f; // give up if the socket never opens

    // Messages are enqueued from the WebSocket callback thread and dequeued in Update,
    // so every access goes through messageQueueLock.
    private readonly Queue<string> messageQueue = new Queue<string>();
    private readonly object messageQueueLock = new object();

    private string micphoneName = string.Empty;
    private WebSocket speechWebSocket;
    private Action<string> resultCallback;
    private Coroutine sendCoroutine;

    /// <summary>Registers the callback that receives each final recognition text.</summary>
    /// <param name="textCallback">Invoked on the main thread; may be null.</param>
    public void InitSpeechHelper(System.Action<string> textCallback)
    {
        resultCallback = textCallback;
    }

    /// <summary>
    /// Starts recording from the first microphone and opens a new recognition connection.
    /// Does nothing (with a warning) when a previous connection is still active or no microphone exists.
    /// </summary>
    public void StartSpeech()
    {
        if (speechWebSocket != null &&
            (speechWebSocket.ReadyState == WebSocketState.Open ||
             speechWebSocket.ReadyState == WebSocketState.Connecting))
        {
            Debug.LogWarning("开始语音识别失败!,等待上次识别连接结束");
            return;
        }

        if (Microphone.devices.Length <= 0)
        {
            Debug.LogWarning("找不到麦克风");
            return;
        }

        // A sender left over from a previous session must not keep reading the new clip.
        if (sendCoroutine != null)
        {
            StopCoroutine(sendCoroutine);
            sendCoroutine = null;
        }

        lock (messageQueueLock)
        {
            messageQueue.Clear();
        }

        micphoneName = Microphone.devices[0];
        Debug.Log("micphoneName:" + micphoneName);
        try
        {
            RecordedClip = Microphone.Start(micphoneName, false, MaxRecordSeconds, SampleRate);
            ConnectSpeechWebSocket();
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
        }
    }

    /// <summary>
    /// Stops recording. The sender coroutine then drains the audio captured so far and
    /// sends the end-of-stream message.
    /// </summary>
    public void StopSpeech()
    {
        Microphone.End(micphoneName);
        Debug.Log("识别结束,停止录音");
    }

    /// <summary>Creates the WebSocket, wires its events, connects and starts the sender coroutine.</summary>
    void ConnectSpeechWebSocket()
    {
        try
        {
            speechWebSocket = new WebSocket(GetWebSocketUrl());
        }
        catch (Exception ex)
        {
            UnityEngine.Debug.LogError(ex.Message);
            // Without a socket nothing will consume the recording, so stop it.
            Microphone.End(micphoneName);
            return;
        }

        speechWebSocket.OnOpen += (sender, e) => Debug.Log("OnOpen");
        // Subscribed up front so a close during a failed handshake is reported as well.
        speechWebSocket.OnClose += OnWebSocketClose;
        speechWebSocket.OnMessage += OnInitMessage;
        speechWebSocket.OnError += OnError;
        speechWebSocket.ConnectAsync();
        sendCoroutine = StartCoroutine(SendVoiceData());
    }

    void OnWebSocketClose(object sender, CloseEventArgs e)
    {
        Debug.Log("OnWebSocketClose");
    }

    /// <summary>WebSocket message handler; only queues the payload for the main thread.</summary>
    void OnInitMessage(object sender, MessageEventArgs e)
    {
        UnityEngine.Debug.Log("qqqqqqqqqqqqqWebSocket数据返回:" + e.Data);
        lock (messageQueueLock)
        {
            messageQueue.Enqueue(e.Data);
        }
    }

    /// <summary>Handles one server message: parses it and dispatches final results to listeners.</summary>
    /// <param name="message">Raw JSON text received from the server.</param>
    private void MainThreadOnMessage(string message)
    {
        if (string.IsNullOrEmpty(message))
        {
            return;
        }

        try
        {
            XFResponse response = JsonConvert.DeserializeObject<XFResponse>(message);
            if (response.code != 0)
            {
                Debug.LogWarning("讯飞返回错误,code:" + response.code + ",desc:" + response.desc);
                return;
            }

            // Comparing this way is safe when the message carries no "action" at all.
            if (response.action != "result")
            {
                return;
            }

            SpeechRecognitionResult result = ParseXunfeiRecognitionResult(response.data);
            if (result.IsFinal)
            {
                Debug.Log("Text最终:" + result.Text);
                resultCallback?.Invoke(result.Text);
                语音识别完成事件?.Invoke(result.Text);
            }
            else
            {
                Debug.Log("Text中间:" + result.Text);
            }
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
        }
    }

    void OnError(object sender, ErrorEventArgs e)
    {
        UnityEngine.Debug.Log("WebSoclet:发生错误:" + e.Message);
    }

    /// <summary>
    /// Extracts the recognized text from the <c>data</c> payload of a result message by
    /// concatenating every <c>cn.st.rt[].ws[].cw[].w</c> value.
    /// </summary>
    /// <param name="dataJson">JSON text of the payload.</param>
    /// <returns>
    /// The concatenated text and whether it is final (<c>cn.st.ed</c> differs from "0").
    /// On a parse error the error is logged and whatever text was collected so far is returned.
    /// </returns>
    public SpeechRecognitionResult ParseXunfeiRecognitionResult(string dataJson)
    {
        StringBuilder text = new StringBuilder();
        SpeechRecognitionResult parsed = new SpeechRecognitionResult();
        try
        {
            JsonData sentence = JsonMapper.ToObject(dataJson)["cn"]["st"];
            parsed.IsFinal = !sentence["ed"].ToString().Equals("0");

            foreach (JsonData rtObject in sentence["rt"])
            {
                foreach (JsonData wsObject in rtObject["ws"])
                {
                    foreach (JsonData cwObject in wsObject["cw"])
                    {
                        text.Append(cwObject["w"].ToString());
                    }
                }
            }
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
        }

        parsed.Text = text.ToString();
        return parsed;
    }

    /// <summary>Sends one frame of PCM audio if the connection is open.</summary>
    /// <param name="voiceData">16-bit PCM bytes; null or empty input is ignored.</param>
    void SendData(byte[] voiceData)
    {
        if (voiceData == null || voiceData.Length == 0)
        {
            return;
        }

        Debug.Log("SendData:" + voiceData.Length + ",time:" + Time.realtimeSinceStartup);

        // ReadyState is used instead of IsAlive: IsAlive performs a synchronous ping/pong
        // round trip, which stalls the main thread on every 40 ms frame.
        if (speechWebSocket == null || speechWebSocket.ReadyState != WebSocketState.Open)
        {
            return;
        }

        try
        {
            speechWebSocket.SendAsync(voiceData, success =>
            {
                if (success)
                {
                    UnityEngine.Debug.Log("WebSoclet:发送成功:" + voiceData.Length);
                }
                else
                {
                    UnityEngine.Debug.Log("WebSoclet:发送失败:");
                }
            });
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
        }
    }

    /// <summary>Sends the end-of-stream marker <c>{"end": true}</c> as a binary frame.</summary>
    /// <param name="callback">Invoked after the send attempt completes; may be null.
    /// Not invoked when the socket is not open.</param>
    void SendEndMsg(System.Action callback)
    {
        string endMsg = "{\"end\": true}";
        byte[] data = Encoding.UTF8.GetBytes(endMsg);

        if (speechWebSocket == null || speechWebSocket.ReadyState != WebSocketState.Open)
        {
            return;
        }

        try
        {
            speechWebSocket.SendAsync(data, success =>
            {
                if (success)
                {
                    UnityEngine.Debug.Log("WebSoclet:发送END成功:" + data.Length);
                }
                else
                {
                    UnityEngine.Debug.Log("WebSoclet:发送END失败:");
                }
                callback?.Invoke();
            });
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
        }
    }

    /// <summary>
    /// Coroutine that waits for the connection, then sends the recorded audio in frames of at
    /// most <see cref="MaxSamplesPerFrame"/> samples every <see cref="SendInterval"/> seconds,
    /// and finally sends the end-of-stream message and stops the microphone.
    /// </summary>
    IEnumerator SendVoiceData()
    {
        // Wait for the handshake, but not forever: a failed connection never reaches Open.
        float deadline = Time.realtimeSinceStartup + ConnectTimeoutSeconds;
        while (speechWebSocket.ReadyState != WebSocketState.Open)
        {
            if (Time.realtimeSinceStartup > deadline)
            {
                Debug.LogWarning("WebSocket连接超时,停止录音");
                Microphone.End(micphoneName);
                speechWebSocket.CloseAsync();
                sendCoroutine = null;
                yield break;
            }
            yield return null;
        }

        // Wait for the first samples; stop waiting if recording was ended in the meantime.
        while (Microphone.IsRecording(micphoneName) && Microphone.GetPosition(micphoneName) <= 0)
        {
            yield return null;
        }

        int position = Microphone.GetPosition(micphoneName);
        int lastPosition = 0;

        while (position < RecordedClip.samples && speechWebSocket.ReadyState == WebSocketState.Open)
        {
            yield return new WaitForSecondsRealtime(SendInterval);

            // After recording ends the last known position is kept so the tail is still sent.
            if (Microphone.IsRecording(micphoneName))
            {
                position = Microphone.GetPosition(micphoneName);
            }

            if (position <= lastPosition)
            {
                Debug.LogWarning("字节流发送完毕!强制结束!");
                break;
            }

            int length = Mathf.Min(position - lastPosition, MaxSamplesPerFrame);
            SendData(GetClipData(lastPosition, length, RecordedClip));
            lastPosition += length;
        }

        // The end marker must also be spaced from the last audio frame.
        yield return new WaitForSecondsRealtime(SendInterval);
        SendEndMsg(null);
        Microphone.End(micphoneName);
        sendCoroutine = null;
    }

    /// <summary>Converts a range of clip samples to 16-bit little-endian PCM.</summary>
    /// <param name="star">Index of the first sample to read.</param>
    /// <param name="length">Number of samples to read.</param>
    /// <param name="recordedClip">Clip to read from.</param>
    /// <returns>Two bytes per sample; empty when the clip is null or <paramref name="length"/> is not positive.</returns>
    public byte[] GetClipData(int star, int length, AudioClip recordedClip)
    {
        if (recordedClip == null || length <= 0)
        {
            return new byte[0];
        }

        float[] samples = new float[length];
        recordedClip.GetData(samples, star);

        byte[] pcm = new byte[samples.Length * 2];
        for (int i = 0; i < samples.Length; i++)
        {
            // Clamp first so an out-of-range sample cannot overflow the short conversion.
            short value = (short)(Mathf.Clamp(samples[i], -1f, 1f) * short.MaxValue);
            // Explicit little-endian byte order, independent of the platform.
            pcm[i * 2] = (byte)(value & 0xFF);
            pcm[i * 2 + 1] = (byte)((value >> 8) & 0xFF);
        }
        return pcm;
    }

    /// <summary>
    /// Builds the signed handshake URL: signa = Base64(HmacSHA1(MD5(appid + ts), apiKey)),
    /// URL-encoded because Base64 may contain '+', '/' and '='.
    /// </summary>
    private string GetWebSocketUrl()
    {
        string ts = GetCurrentUnixTimestampMillis().ToString();
        string baseString = appId + ts;
        string md5 = GetMD5Hash(baseString);
        UnityEngine.Debug.Log("baseString:" + baseString + ",md5:" + md5);

        string signa = CalculateHmacSha1(md5, apiKey);
        string url = string.Format(
            "ws://rtasr.xfyun.cn/v1/ws?appid={0}&ts={1}&signa={2}",
            appId, ts, Uri.EscapeDataString(signa));
        UnityEngine.Debug.Log(url);
        return url;
    }

    /// <summary>
    /// Returns the current Unix time in SECONDS (despite the method name, which is kept
    /// unchanged); the handshake expects seconds.
    /// </summary>
    private long GetCurrentUnixTimestampMillis()
    {
        // Computed in UTC so the result does not depend on the local time zone or DST.
        DateTime epoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);
        return (long)(DateTime.UtcNow - epoch).TotalSeconds;
    }

    /// <summary>Returns the lowercase hexadecimal MD5 digest of the UTF-8 bytes of <paramref name="input"/>.</summary>
    public string GetMD5Hash(string input)
    {
        using (MD5 md5Hasher = MD5.Create())
        {
            byte[] digest = md5Hasher.ComputeHash(Encoding.UTF8.GetBytes(input));
            StringBuilder hex = new StringBuilder(digest.Length * 2);
            foreach (byte b in digest)
            {
                hex.Append(b.ToString("x2"));
            }
            return hex.ToString();
        }
    }

    /// <summary>Returns the Base64-encoded HMAC-SHA1 of <paramref name="data"/> keyed with <paramref name="key"/> (both as UTF-8).</summary>
    public string CalculateHmacSha1(string data, string key)
    {
        using (HMACSHA1 hmac = new HMACSHA1(Encoding.UTF8.GetBytes(key)))
        {
            byte[] hashBytes = hmac.ComputeHash(Encoding.UTF8.GetBytes(data));
            return Convert.ToBase64String(hashBytes);
        }
    }

    /// <summary>Processes, on the main thread, every message queued since the last frame.</summary>
    private void Update()
    {
        while (true)
        {
            string message;
            lock (messageQueueLock)
            {
                if (messageQueue.Count == 0)
                {
                    return;
                }
                message = messageQueue.Dequeue();
            }
            MainThreadOnMessage(message);
        }
    }

    /// <summary>Stops recording and closes the connection when the component is destroyed.</summary>
    private void OnDestroy()
    {
        if (!string.IsNullOrEmpty(micphoneName))
        {
            Microphone.End(micphoneName);
        }

        if (speechWebSocket != null)
        {
            speechWebSocket.OnMessage -= OnInitMessage;
            speechWebSocket.OnError -= OnError;
            speechWebSocket.OnClose -= OnWebSocketClose;
            speechWebSocket.CloseAsync();
            speechWebSocket = null;
        }
    }
}

Json解析类。

/// <summary>
/// Envelope of a server message, filled by JSON deserialization.
/// Field names must match the JSON keys exactly.
/// </summary>
[Serializable]
public struct XFResponse
{
    /// <summary>Message kind; SpeechHelper only processes the value "result".</summary>
    public string action;

    /// <summary>Status code; SpeechHelper treats 0 as success and ignores other messages.</summary>
    public int code;

    /// <summary>Payload; for "result" messages it is JSON text that is parsed separately.</summary>
    public string data;

    /// <summary>Presumably a human-readable status description (not read by the visible code).</summary>
    public string desc;

    /// <summary>Presumably the server-side session id (not read by the visible code).</summary>
    public string sid;
}
/// <summary>
/// Outcome of parsing one recognition payload: the recognized text and whether it is final.
/// </summary>
[Serializable]
public struct SpeechRecognitionResult
{
    /// <summary>Recognized text.</summary>
    public string Text;

    /// <summary>True for a final result, false for an intermediate one.</summary>
    public bool IsFinal;
}

值得注意的问题。

1、Microphone使用时传默认设备名比传null好使

2、握手阶段时间戳用的是秒(不是毫秒)

3、上传结束标志时,也要间隔40ms,否则讯飞像是没收到一样

遗留问题:

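yield return new WaitForSecondsRealtime(0.04f) 的实际间隔约为 0.1s,导致消息发送得很慢。可能的原因(待确认):协程只会在每帧恢复一次,等待精度受帧率限制;另外 WebSocketSharp 的 IsAlive 属性会同步发送 Ping 并等待 Pong,若每次发送前都访问它会阻塞主线程,可改用 ReadyState 判断连接状态。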


http://www.ppmy.cn/devtools/35273.html

相关文章

矩池云jupyter运行opengait代码 未完成版

文章目录 前言——矩池云的使用技巧1.切换源 一、下载数据集二、下载模型三、环境配置1.查看python、torch、torchvision版本2.查看一些包版本是否过高3.下载包 四、开始训练1.设置环境变量2.遇到的问题(1)torch.cuda.is_available()返回false…

BEV下统一的多传感器融合框架 - FUTR3D

BEV下统一的多传感器融合框架 - FUTR3D 引言 在自动驾驶汽车或者移动机器人上,通常会配备许多种传感器,比如:光学相机、激光雷达、毫米波雷达等。由于不同传感器的数据形式不同,如RGB图像,点云等,不同模态…

【stm32-3】对射式红外传感器计次旋转编码器计数

1.对射式红外传感器计次 void EXTI_DeInit(void); //把EXTI配置都清除,恢复到上电默认状态 void EXTI_Init(EXTI_InitTypeDef* EXTI_InitStruct//指向EXTI_InitTypeDef结构体的指针,其中包含了EXTI外设的基本信息); //初始化 void EXTI_S…

毕业设计:《基于 Prometheus 和 ELK 的基础平台监控系统设计与实现》

前言 《基于 Prometheus 和 ELK 的基础平台监控系统设计与实现》,这是我在本科阶段的毕业设计,通过引入 Prometheus 和 ELK 架构实现企业对指标与日志的全方位监控。并且基于云原生,使用容器化持续集成部署的开发方式,通过 Sprin…

基于springboot+vue+Mysql的在线动漫信息平台

开发语言:Java框架:springbootJDK版本:JDK1.8服务器:tomcat7数据库:mysql 5.7(一定要5.7版本)数据库工具:Navicat11开发软件:eclipse/myeclipse/ideaMaven包:…

数组操作push、pop、shift、unshift操作

这四个都是用来操作数组的,用于插入和删除元素 成对来说 push、pop都是从数组尾部操作(你看p开头的,就记住屁股就行了),分别是插入和弹出 let array [1,2,3,4,5,6] array.push(7) // 7 array // [1,2,3,4,5,6,7]push操作,数组从尾部插入,返回插入后数组的个数,改变了原数组…

数据库的乐观锁和悲观锁是什么?怎么实现的?

数据库管理系统(DBMS)中的并发控制的任务是确保在多个事务同时存取数据库中同一数据时不破坏事务的隔离性和统一性以及数据库的统一性。乐观并发控制(乐观锁)和悲观并发控制(悲观锁)是并发控制主要采用的技…

conda换源和pip换源

最近在anaconda虚拟环境里时,pip下载实在过慢,忍不住怀疑换源到底成功了没,最后发现conda和pip的换源不是一个东西 conda的换源网上教程很多了,我在这里就不提了,随便找个赞数高的教程参考即可。但是,cond…