总目录地址:AI 系列 总目录
需要最新源码,或技术提问,请加QQ群:538327407
我的各种github 开源项目和代码:https://github.com/linbin524
目标需求
使用录音形式,模拟微信语音聊天。按住录音,松开发送语音,并完成语音识别。
ps:百度的语音识别有 60 秒的时长限制,需要自己做好控制。
实现方案
采用 C# WinForm 程序实现桌面版,采用 Accord 实现语音的录制、停止等基础操作;点击停止按钮后,
自动调用百度语音识别接口,并将识别内容显示在文本框中。
备注:语音识别需要配套阵列麦克风(请先注册百度开发者)。百度语音识别接口请参考:http://ai.baidu.com/docs#/ASR-Online-Csharp-SDK/top
实现效果展示
实现过程
1、下载Accord 完成语音操作引用
accord 官方 地址:http://accord-framework.net/intro.html
官网中有示例demo,笔者的就是在示例demo上做改造的。
建立自己的项目,引用包中的dll
界面代码:
using System;
using System.Drawing;
using System.IO;
using System.Windows.Forms;
using Accord.Audio;
using Accord.Audio.Formats;
using Accord.DirectSound;
using Accord.Audio.Filters;
using Baidu.Aip.API;

namespace SampleApp
{
    /// <summary>
    /// Main window of the recorder sample: records audio from a capture device
    /// into an in-memory WAV stream, plays it back, and hands the recording to
    /// the speech recognition API when the Stop button is pressed.
    /// Note: the author states that recognition requires an array microphone.
    /// </summary>
    public partial class MainForm : Form
    {
        // In-memory WAV container holding the most recent recording.
        private MemoryStream stream;

        private IAudioSource source;
        private IAudioOutput output;

        private WaveEncoder encoder;
        private WaveDecoder decoder;

        // Scratch buffer with the samples currently shown in the wave chart.
        private float[] current;

        // Running totals for the current recording, updated on every captured frame.
        private int frames;
        private int samples;
        private TimeSpan duration;

        /// <summary>
        /// Builds the form, configures the wave chart and sets the initial button state.
        /// </summary>
        public MainForm()
        {
            InitializeComponent();

            // Configure the wavechart
            chart.SimpleMode = true;
            chart.AddWaveform("wave", Color.Green, 1, false);

            updateButtons();

            // Application.Idle += ProcessFrame;
        }

        /// <summary>
        /// Placeholder idle handler; intentionally empty and currently not subscribed.
        /// </summary>
        void ProcessFrame(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Starts recording audio from the capture device.
        /// </summary>
        private void btnRecord_Click(object sender, EventArgs e)
        {
            // Create capture device (this is the core of the recorder):
            // 16000 Hz, mono, 16-bit PCM.
            source = new AudioCaptureDevice()
            {
                DesiredFrameSize = 4096,
                SampleRate = 16000, // sampling rate
                //SampleRate = 22050,
                Channels = 1,

                // We will be reading 16-bit PCM
                Format = SampleFormat.Format16Bit
            };

            // Wire up some events
            source.NewFrame += source_NewFrame;
            source.AudioSourceError += source_AudioSourceError;

            // Create buffer for wavechart control
            current = new float[source.DesiredFrameSize];

            // Create stream to store file
            stream = new MemoryStream();
            encoder = new WaveEncoder(stream);

            // A new recording starts from zero; without this reset the totals
            // (and the length label) would keep accumulating across recordings.
            duration = TimeSpan.Zero;
            samples = 0;
            frames = 0;

            // Start
            source.Start();
            updateButtons();
        }

        /// <summary>
        /// Plays back the recorded audio stream.
        /// </summary>
        private void btnPlay_Click(object sender, EventArgs e)
        {
            // First, we rewind the stream
            stream.Seek(0, SeekOrigin.Begin);

            // Then we create a decoder for it
            decoder = new WaveDecoder(stream);

            // Configure the track bar so the cursor
            // can show the proper current position
            if (trackBar1.Value < decoder.Frames)
            {
                decoder.Seek(trackBar1.Value);
            }
            trackBar1.Maximum = decoder.Samples;

            // Here we can create the output audio device that will be playing the recording
            output = new AudioOutputDevice(this.Handle, decoder.SampleRate, decoder.Channels);

            // Wire up some events
            output.FramePlayingStarted += output_FramePlayingStarted;
            output.NewFrameRequested += output_NewFrameRequested;
            output.Stopped += output_PlayingFinished;

            // Start playing!
            output.Play();

            updateButtons();
        }

        /// <summary>
        /// Stops recording or playback, clears the chart, then sends the recorded
        /// stream to the speech recognition API and shows the result in the text box.
        /// </summary>
        private void btnStop_Click(object sender, EventArgs e)
        {
            // Stops both cases
            if (source != null)
            {
                // If we were recording
                source.SignalToStop();
                source.WaitForStop();
            }
            if (output != null)
            {
                // If we were playing
                output.SignalToStop();
                output.WaitForStop();
            }

            updateButtons();

            // Also zero out the buffers and screen
            Array.Clear(current, 0, current.Length);
            updateWaveform(current, current.Length);

            // NOTE(review): recognition also runs when Stop ends a playback, and the
            // call is made synchronously on the UI thread - confirm both are intended.
            SpeechAPI speechApi = new SpeechAPI();
            string result = speechApi.AsrData(stream, "wav");
            tb_result.Text = "语音识别结果:" + result;
        }

        /// <summary>
        /// Called when the audio source reports an error; rethrows it as an exception.
        /// </summary>
        private void source_AudioSourceError(object sender, AudioSourceErrorEventArgs e)
        {
            // NOTE(review): presumably raised on the capture thread, where an unhandled
            // exception would terminate the process - confirm this is the desired policy.
            throw new Exception(e.Description);
        }

        /// <summary>
        /// Called for every new input audio frame: shows it in the chart, appends
        /// it to the WAV stream and updates the running totals.
        /// </summary>
        private void source_NewFrame(object sender, NewFrameEventArgs eventArgs)
        {
            eventArgs.Signal.CopyTo(current);

            updateWaveform(current, eventArgs.Signal.Length);

            encoder.Encode(eventArgs.Signal);

            duration += eventArgs.Signal.Duration;
            samples += eventArgs.Signal.Samples;
            frames += eventArgs.Signal.Length;
        }

        /// <summary>
        /// Called when the output device starts playing a frame: moves the track
        /// bar and draws the corresponding part of the recording in the chart.
        /// </summary>
        private void output_FramePlayingStarted(object sender, PlayFrameEventArgs e)
        {
            updateTrackbar(e.FrameIndex);

            if (e.FrameIndex + e.Count < decoder.Frames)
            {
                // Decode the frame being played for display, then restore the
                // decoder position so the playback read cursor is not disturbed.
                int previous = decoder.Position;
                decoder.Seek(e.FrameIndex);

                Signal s = decoder.Decode(e.Count);
                decoder.Seek(previous);

                updateWaveform(s.ToFloat(), s.Length);
            }
        }

        /// <summary>
        /// Called when playback has stopped: refreshes the buttons and clears the chart.
        /// </summary>
        private void output_PlayingFinished(object sender, EventArgs e)
        {
            updateButtons();

            Array.Clear(current, 0, current.Length);
            updateWaveform(current, current.Length);
        }

        /// <summary>
        /// Called when the output device needs more samples: decodes the next
        /// frames from the stream, or signals the device to stop when none are left.
        /// </summary>
        private void output_NewFrameRequested(object sender, NewFrameRequestedEventArgs e)
        {
            e.FrameIndex = decoder.Position;

            Signal signal = decoder.Decode(e.Frames);

            if (signal == null)
            {
                e.Stop = true;
                return;
            }

            e.Frames = signal.Length;
            signal.CopyTo(e.Buffer);
        }

        /// <summary>
        /// Draws the given samples in the wave chart, marshalling to the UI thread if needed.
        /// </summary>
        /// <param name="samples">Samples to display.</param>
        /// <param name="length">Number of samples to display.</param>
        private void updateWaveform(float[] samples, int length)
        {
            if (InvokeRequired)
            {
                BeginInvoke(new Action(() =>
                {
                    chart.UpdateWaveform("wave", samples, length);
                }));
            }
            else
            {
                // Draw the buffer that was passed in; the previous code drew the
                // 'current' field here and ignored the argument on this path.
                chart.UpdateWaveform("wave", samples, length);
            }
        }

        /// <summary>
        /// Moves the track bar to the given position, clamped to its range,
        /// marshalling to the UI thread if needed.
        /// </summary>
        /// <param name="value">Requested track bar position.</param>
        private void updateTrackbar(int value)
        {
            if (InvokeRequired)
            {
                BeginInvoke(new Action(() =>
                {
                    trackBar1.Value = Math.Max(trackBar1.Minimum, Math.Min(trackBar1.Maximum, value));
                }));
            }
            else
            {
                trackBar1.Value = Math.Max(trackBar1.Minimum, Math.Min(trackBar1.Maximum, value));
            }
        }

        /// <summary>
        /// Enables or disables the controls according to whether the form is
        /// recording, playing or idle.
        /// </summary>
        private void updateButtons()
        {
            if (InvokeRequired)
            {
                BeginInvoke(new Action(updateButtons));
                return;
            }

            if (source != null && source.IsRunning)
            {
                // Recording
                btnBwd.Enabled = false;
                btnFwd.Enabled = false;
                btnPlay.Enabled = false;
                btnStop.Enabled = true;
                btnRecord.Enabled = false;
                trackBar1.Enabled = false;
            }
            else if (output != null && output.IsRunning)
            {
                // Playing
                btnBwd.Enabled = false;
                btnFwd.Enabled = false;
                btnPlay.Enabled = false;
                btnStop.Enabled = true;
                btnRecord.Enabled = false;
                trackBar1.Enabled = true;
            }
            else
            {
                // Idle
                btnBwd.Enabled = false;
                btnFwd.Enabled = false;
                btnPlay.Enabled = stream != null;
                btnStop.Enabled = false;
                btnRecord.Enabled = true;
                trackBar1.Enabled = decoder != null;

                trackBar1.Value = 0;
            }
        }

        /// <summary>
        /// Signals the audio source and output to stop when the form is closed.
        /// </summary>
        private void MainFormFormClosed(object sender, FormClosedEventArgs e)
        {
            if (source != null)
            {
                source.SignalToStop();
            }
            if (output != null)
            {
                output.SignalToStop();
            }
        }

        /// <summary>
        /// Writes the recorded stream to the file chosen in the save dialog.
        /// </summary>
        private void saveFileDialog1_FileOk(object sender, System.ComponentModel.CancelEventArgs e)
        {
            // 'using' closes the file even if the write throws.
            using (Stream fileStream = saveFileDialog1.OpenFile())
            {
                stream.WriteTo(fileStream);
            }
        }

        /// <summary>
        /// Opens the save dialog.
        /// </summary>
        private void saveToolStripMenuItem_Click(object sender, EventArgs e)
        {
            saveFileDialog1.ShowDialog(this);
        }

        /// <summary>
        /// Refreshes the label showing the length of the recording.
        /// </summary>
        private void updateTimer_Tick(object sender, EventArgs e)
        {
            // TotalSeconds is the whole elapsed time including fractions;
            // Seconds is only the 0-59 integer component and wraps every minute.
            lbLength.Text = String.Format("Length: {0:00.00} sec.", duration.TotalSeconds);
        }

        /// <summary>
        /// Shows the About dialog.
        /// </summary>
        private void aboutToolStripMenuItem_Click(object sender, EventArgs e)
        {
            new AboutBox().ShowDialog(this);
        }

        /// <summary>
        /// Closes the form.
        /// </summary>
        private void closeToolStripMenuItem_Click(object sender, EventArgs e)
        {
            Close();
        }

        /// <summary>
        /// Scales the volume of the recording by 1.25.
        /// </summary>
        private void btnIncreaseVolume_Click(object sender, EventArgs e)
        {
            adjustVolume(1.25f);
        }

        /// <summary>
        /// Scales the volume of the recording by 0.75.
        /// </summary>
        private void btnDecreaseVolume_Click(object sender, EventArgs e)
        {
            adjustVolume(0.75f);
        }

        /// <summary>
        /// Decodes the whole recording, applies a volume filter in place and
        /// re-encodes the result from the start of the same stream.
        /// </summary>
        /// <param name="value">Value passed to the volume filter.</param>
        private void adjustVolume(float value)
        {
            // First, we rewind the stream
            stream.Seek(0, SeekOrigin.Begin);

            // Then we create a decoder for it
            decoder = new WaveDecoder(stream);

            var signal = decoder.Decode();

            // We apply the volume filter
            var volume = new VolumeFilter(value);
            volume.ApplyInPlace(signal);

            // Then we store it again
            stream.Seek(0, SeekOrigin.Begin);
            encoder = new WaveEncoder(stream);
            encoder.Encode(signal);
        }
    }
}
百度语音识别接口
百度已经提供 SDK,其支持的语音格式如下。
支持的语音格式
原始 PCM 的录音参数必须符合 8k/16k 采样率、16bit 位深、单声道,支持的格式有:pcm(不压缩)、wav(不压缩,pcm编码)、amr(压缩格式)。
/// <summary>
/// Reads an audio file from disk and submits its bytes to the speech recognizer.
/// </summary>
/// <param name="filePath">Path of the audio file to recognize.</param>
/// <param name="format">Audio format passed to the recognizer; defaults to "pcm".</param>
/// <param name="rate">Sampling rate passed to the recognizer; defaults to 16000.</param>
/// <returns>The recognizer's result converted to a string.</returns>
public string AsrData(string filePath, string format = "pcm", int rate = 16000)
{
    var data = File.ReadAllBytes(filePath);

    // Forward the caller's rate; it was previously ignored in favour of a
    // hard-coded 16000, so audio recorded at another rate was mislabelled.
    var result = _asrClient.Recognize(data, format, rate);

    return result.ToString();
}
结果评测:
使用普通麦克风时语音识别效果不好,需要阵列麦克风才可以。