Real-Time Speech Recognition with Alibaba Tongyi Tingwu

Official documentation

This article uses Python on the backend and React on the frontend to integrate with Alibaba's Tongyi Tingwu real-time transcription service; let's go straight to the code.

Backend API

The backend here is implemented in Python and is only a minimal example; for other languages or the full details, please refer to the official documentation.

#!/usr/bin/env python
# coding=utf-8
import json
import datetime

from aliyunsdkcore.client import AcsClient
from aliyunsdkcore.request import CommonRequest
from aliyunsdkcore.auth.credentials import AccessKeyCredential
from flask import Flask, abort
from flask_cors import CORS

app = Flask(__name__)
CORS(app)

APP_KEY = 'APP_KEY'
ACCESS_ID = 'ACCESS_ID'
ACCESS_SECRET = 'ACCESS_SECRET'


def create_common_request(domain, version, protocolType, method, uri):
    curr_request = CommonRequest()
    curr_request.set_accept_format('json')
    curr_request.set_domain(domain)
    curr_request.set_version(version)
    curr_request.set_protocol_type(protocolType)
    curr_request.set_method(method)
    curr_request.set_uri_pattern(uri)
    curr_request.add_header('Content-Type', 'application/json')
    return curr_request

def init_parameters():
    body = dict()
    body['AppKey'] = APP_KEY

    # Basic request parameters
    input = dict()
    # The input audio stream must match the format and sample rate set here
    input['Format'] = 'pcm'
    input['SampleRate'] = 16000
    input['SourceLanguage'] = 'cn'
    input['TaskKey'] = 'task' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    input['ProgressiveCallbacksEnabled'] = False
    body['Input'] = input

    # AI-related parameters; enable them as needed
    parameters = dict()

    # Speech transcription settings
    transcription = dict()
    # Speaker diarization: optional
    transcription['DiarizationEnabled'] = True
    diarization = dict()
    diarization['SpeakerCount'] = 2
    transcription['Diarization'] = diarization
    parameters['Transcription'] = transcription

    # Text translation: optional
    parameters['TranslationEnabled'] = True
    translation = dict()
    translation['TargetLanguages'] = ['en']  # translate into English, for example
    parameters['Translation'] = translation

    # Auto chapters: optional; includes chapter titles and agenda summaries
    parameters['AutoChaptersEnabled'] = True

    # Meeting assistance: optional; includes action items and key information
    # (keywords, highlights, scene detection)
    parameters['MeetingAssistanceEnabled'] = True
    meetingAssistance = dict()
    meetingAssistance['Types'] = ['Actions', 'KeyInformation']
    parameters['MeetingAssistance'] = meetingAssistance

    # Summarization: optional; includes full-text summary, per-speaker summary
    # and Q&A summary (Q&A review)
    parameters['SummarizationEnabled'] = True
    summarization = dict()
    summarization['Types'] = ['Paragraph', 'Conversational', 'QuestionsAnswering', 'MindMap']
    parameters['Summarization'] = summarization

    # PPT extraction and PPT summarization: optional
    parameters['PptExtractionEnabled'] = True

    # Spoken-to-written text polishing: optional
    parameters['TextPolishEnabled'] = True

    body['Parameters'] = parameters
    return body

@app.route('/createVoiceWsInfo', methods=['POST'])
def create_voice_ws_info():
    body = init_parameters()
    print(body)
    credentials = AccessKeyCredential(ACCESS_ID, ACCESS_SECRET)
    client = AcsClient(region_id='cn-beijing', credential=credentials)
    curr_request = create_common_request('tingwu.cn-beijing.aliyuncs.com', '2023-09-30', 'https', 'PUT',
                                         '/openapi/tingwu/v2/tasks')
    curr_request.add_query_param('type', 'realtime')
    curr_request.set_content(json.dumps(body).encode('utf-8'))
    response = client.do_action_with_exception(curr_request)
    res_json = json.dumps(json.loads(response), indent=4, ensure_ascii=False)
    return res_json

@app.route('/getVoiceTask/<task_id>', methods=['GET'])
def get_voice_task_info(task_id):
    # Query the task and its results by task_id
    if not task_id:
        abort(400)  # return 400 if task_id is empty
    credentials = AccessKeyCredential(ACCESS_ID, ACCESS_SECRET)
    client = AcsClient(region_id='cn-beijing', credential=credentials)
    uri = '/openapi/tingwu/v2/tasks' + '/' + task_id
    request = create_common_request('tingwu.cn-beijing.aliyuncs.com', '2023-09-30', 'https', 'GET', uri)
    response = client.do_action_with_exception(request)
    res_json = json.dumps(json.loads(response), indent=4, ensure_ascii=False)
    return res_json

@app.route('/closeVoiceTask/<task_id>', methods=['PUT'])
def close_voice_task_info(task_id):
    # Stop the realtime task identified by task_id
    if not task_id:
        abort(400)  # return 400 if task_id is empty
    credentials = AccessKeyCredential(ACCESS_ID, ACCESS_SECRET)
    client = AcsClient(region_id='cn-beijing', credential=credentials)
    request = create_common_request('tingwu.cn-beijing.aliyuncs.com', '2023-09-30', 'https', 'PUT',
                                    '/openapi/tingwu/v2/tasks')
    request.add_query_param('type', 'realtime')
    request.add_query_param('operation', 'stop')
    body = dict()
    body['AppKey'] = APP_KEY
    input = dict()
    input['TaskId'] = task_id
    body['Input'] = input
    request.set_content(json.dumps(body).encode('utf-8'))
    response = client.do_action_with_exception(request)
    res_json = json.dumps(json.loads(response), indent=4, ensure_ascii=False)
    return res_json

if __name__ == "__main__":
    app.run(debug=True, host='0.0.0.0')
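The frontend below only relies on a handful of fields from the JSON that /createVoiceWsInfo returns. The shape sketched here is inferred from how VoiceWebSocket.init() consumes the response and is not the complete Tingwu response body:

// Sketch of the create-task response fields the frontend uses (inferred, not the full response)
interface CreateTaskResponse {
  Code: string;       // "0" on success
  Message?: string;   // error message when Code is not "0"
  Data?: {
    TaskId: string;         // passed later to /getVoiceTask/<task_id> and /closeVoiceTask/<task_id>
    MeetingJoinUrl: string; // WebSocket URL the browser pushes PCM audio to
  };
}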

Frontend code
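The VoiceWebSocket class below imports createVoiceWsInfo and closeTask from @/app/client/voiceApi, which is not included in the original listing. Here is a minimal sketch, assuming the Flask service above is reachable at http://localhost:5000 (the base URL and the getVoiceTask helper are assumptions):

// voiceApi.ts (hypothetical sketch; point BASE_URL at wherever the Flask app runs)
const BASE_URL = "http://localhost:5000";

// POST /createVoiceWsInfo: create a realtime task; VoiceWebSocket.init() checks
// response.ok and calls response.json() itself, so the raw Response is returned
export function createVoiceWsInfo(): Promise<Response> {
  return fetch(`${BASE_URL}/createVoiceWsInfo`, { method: "POST" });
}

// GET /getVoiceTask/<task_id>: query the task and its results
export function getVoiceTask(taskId: string): Promise<Response> {
  return fetch(`${BASE_URL}/getVoiceTask/${taskId}`);
}

// PUT /closeVoiceTask/<task_id>: stop the realtime task
export function closeTask(taskId?: string): Promise<Response> {
  return fetch(`${BASE_URL}/closeVoiceTask/${taskId}`, { method: "PUT" });
}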

VoiceWebSocket.ts

import { createVoiceWsInfo, closeTask } from "@/app/client/voiceApi";
import { v4 as uuidv4 } from "uuid";
import VoiceRecorder from "@/app/utils/VoiceRecorder";

class VoiceWebSocket {
  public wsUrl: string | undefined;
  public taskId: string | undefined;
  public socket: WebSocket | undefined;
  public socketStatus: string;
  private voice: VoiceRecorder | undefined;
  private setFlag: ((flag: boolean) => void) | undefined;
  private callBack: ((result: string) => void) | undefined;
  private currentResult: string[] = [];

  constructor() {
    this.socketStatus = "init";
  }

  // Call the backend to create a realtime task and obtain the WebSocket URL and task id
  async init(): Promise<void> {
    return new Promise((resolve, reject) => {
      createVoiceWsInfo()
        .then((response) => {
          if (response.ok) {
            return response.json();
          }
          throw new Error("Failed to fetch WebSocket info");
        })
        .then((data) => {
          if ("0" === data.Code) {
            const { Data = {} } = data;
            const { MeetingJoinUrl = "", TaskId = "" } = Data;
            this.wsUrl = MeetingJoinUrl;
            this.taskId = TaskId;
            this.socketStatus = "ing";
            resolve();
          } else {
            reject(data.Message);
          }
        });
    });
  }

  // Open the WebSocket and send the StartTranscription handshake
  async initSocket(): Promise<void> {
    return new Promise((resolve, reject) => {
      if (this.wsUrl && this.taskId) {
        this.socket = new WebSocket(this.wsUrl);
        this.socket.binaryType = "blob";
        this.socket.onopen = (event) => {
          console.log("WebSocket connection open ", event);
          const message = {
            header: {
              message_id: uuidv4(),
              name: "StartTranscription",
              namespace: "SpeechTranscriber",
              task_id: this.taskId,
            },
            payload: {},
            context: {},
          };
          // Serialize the message to JSON and send it
          this.socket?.send(JSON.stringify(message));
        };
        this.socket.onmessage = (event) => {
          this.onReceiveSocketData(event);
        };
        this.socket.onclose = (event) => {
          this.voice && this.voice.stopRecording();
          if (this.socketStatus === "ing") {
            closeTask(this.taskId);
          }
          this.closeSocket(event);
          this.setFlag && this.setFlag(false);
        };
        resolve();
      } else {
        reject("WebSocket initialization failed: missing wsUrl or taskId");
      }
    });
  }

  public setVoice(voice: VoiceRecorder) {
    this.voice = voice;
  }

  public setStopFlagFun(setFlag: (flag: boolean) => void) {
    this.setFlag = setFlag;
  }

  sendSocketData(data: string | ArrayBuffer | Blob | ArrayBufferView) {
    this.socket?.send(data);
  }

  onReceiveSocketData(event: MessageEvent) {
    console.log("receive socket data: ", event);
    if ("string" == typeof event.data) {
      let parsed = null;
      try {
        parsed = JSON.parse(event.data);
      } catch (e) {
        parsed = {};
      }
      this.handleMessage(parsed);
    }
  }

  handleMessage(data: any = {}) {
    const { header = {}, payload = {} } = data;
    if ("SentenceEnd" == header.name) {
      // A sentence is final: append it to the accumulated result
      const result = payload.result;
      this.currentResult.push(result);
      this.callBack && this.callBack(this.currentResult.join(""));
    } else if ("TranscriptionResultChanged" == header.name) {
      // Intermediate result of the current sentence: replace the buffered text
      const result = payload.result;
      this.currentResult = [];
      this.currentResult.push(result);
      this.callBack && this.callBack(this.currentResult.join(""));
    }
  }

  closeSocket(event: CloseEvent) {
    console.log("WebSocket connection closed: ", event);
  }

  stopSocket() {
    this.socket?.close();
  }

  setCallback(callback: ((result: string) => void) | undefined): void {
    this.callBack = callback;
  }
}

export default VoiceWebSocket;

VoiceRecorder.ts

class AudioRecorder {
  socket: WebSocket;
  sampleRate: number;
  stream!: MediaStream;
  source: MediaStreamAudioSourceNode | null = null;
  processor: ScriptProcessorNode | null | undefined;
  isRecording: boolean = false;
  audioChunks: Blob[] = [];

  constructor(socket: WebSocket, sampleRate: number = 16000) {
    this.socket = socket;
    this.sampleRate = sampleRate; // sample rate; must match the backend Input.SampleRate
  }

  // Request microphone access and stream 16-bit PCM frames to the WebSocket
  async startMicrophone(): Promise<void> {
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      this.stream = stream;
      const audioContext = new window.AudioContext({
        sampleRate: this.sampleRate,
      });
      // Create the source node from the microphone stream
      const source = audioContext.createMediaStreamSource(stream);
      this.source = source;
      // Create a ScriptProcessorNode to process the audio data (mono in, mono out)
      const processor = audioContext.createScriptProcessor(4096, 1, 1);
      this.processor = processor;
      source.connect(processor);
      processor.connect(audioContext.destination); // connect to the destination (optional)
      processor.onaudioprocess = (event) => {
        // Convert the Float32 samples of channel 0 to 16-bit PCM and send them
        const arrayBuffer = this.floatTo16BitPCM(
          event.inputBuffer.getChannelData(0),
        );
        this.socket.send(arrayBuffer);
      };
    } catch (err) {
      console.error("Error accessing media devices:", err);
    }
  }

  // Convert Float32 samples in [-1, 1] to little-endian 16-bit PCM
  floatTo16BitPCM = function (e: Float32Array) {
    const t = new DataView(new ArrayBuffer(2 * e.length));
    for (let n = 0; n < e.length; n++) {
      const r = e[n] < 0 ? 32768 : 32767;
      t.setInt16(2 * n, (e[n] * r) | 0, true);
    }
    return t.buffer;
  };

  startRecording(): void {
    this.isRecording = true;
    this.audioChunks = []; // reset chunks for a new recording
  }

  stopRecording(): void {
    if (this.isRecording) {
      this.isRecording = false;
    }
    if (this.source) {
      this.source.disconnect();
    }
    if (this.processor) {
      this.processor.disconnect();
    }
    if (this.stream) {
      this.stream.getTracks().forEach((item) => {
        item.stop();
      });
    }
  }
}

export default AudioRecorder;
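As a quick check of the Float32-to-PCM mapping used by floatTo16BitPCM above (positive samples are scaled by 32767, negative ones by 32768, and written as little-endian 16-bit integers), here is the same conversion applied to a few sample values; this snippet is illustrative only and does not depend on the class:

// Standalone illustration of the conversion performed in floatTo16BitPCM
const samples = new Float32Array([0, 0.5, 1, -1]);
const view = new DataView(new ArrayBuffer(samples.length * 2));
samples.forEach((s, i) => {
  const scale = s < 0 ? 32768 : 32767;          // asymmetric scaling, as in the class above
  view.setInt16(2 * i, (s * scale) | 0, true);  // little-endian 16-bit integer
});
// view now holds the Int16 values 0, 16383, 32767, -32768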

VoiceComponent.tsx

"use client";

import VoiceWebSocket from "../utils/VoiceWebSocket";

import AudioRecorder from "../utils/VoiceRecorder";

import React, {useState} from "react";

const VoiceComponent = () => {

const [isRecording, setIsRecording] = useState(false);

const [voiceHandle, setVoiceHandle] = useState(null);

const [socket, setSocket] = useState(null);

const [inputVal, setInputVal] = useState('')

const startListening = async () => {

if (!isRecording) {

const ws = new VoiceWebSocket();

ws.init()

.then((res) => {

console.log("socket init", res);

ws.initSocket()

.then((res) => {

console.log("web socket init", res);

setSocket(ws);

setIsRecording(true);

})

.then((next) => {

console.log(next)

if (ws.socket) {

const voice = new AudioRecorder(ws.socket);

setVoiceHandle(voice);

voice.startMicrophone().then(() => {

voice.startRecording();

});

// voice.startRecording();

ws.setVoice(voice);

ws.setCallback(setInputVal);

ws.setStopFlagFun(setIsRecording);

}

})

.catch((err) => {

alert(err);

});

})

.catch((err) => {

alert(err);

});

} else {

voiceHandle?.stopRecording();

setIsRecording(false);

}

};

return (