zoukankan      html  css  js  c++  java
  • 调用百度语音AI实现语音的识别和合成

    #coding:utf-8
    
    ## First download ffmpeg (https://ffmpeg.zeranoe.com/builds/), unpack it,
    ## and add its bin directory to the PATH environment variable.
    
    ## Then open cmd and install the following packages:
    ## pip install baidu-aip
    ## pip install pydub
    ## pip install PyAudio
    ## pip install Wave
    
    """ Baidu speech API client setup (speech recognition and synthesis). """
    from aip import AipSpeech
    # NOTE: fill in the credentials from your Baidu AI console before running.
    APP_ID = " "
    API_KEY = " "
    SECRET_KEY = " "
    client = AipSpeech(APP_ID,API_KEY,SECRET_KEY)
     
     
     
    def speech_synthesis(text, filepath):
        """Text-to-speech: synthesize `text` and write the audio to `filepath`.

        On success the Baidu API returns raw audio bytes, which are written
        to `filepath`. On failure it returns a dict describing the error,
        in which case nothing is written and the error is printed.
        """
        result = client.synthesis(text, 'zh', 1, {
            'vol': 5,   # volume
            'spd': 5,   # speed
            'pit': 5,   # pitch
            'per': 0,   # speaker voice id
        })
        if isinstance(result, dict):
            # The API reported an error; surface it instead of failing silently.
            print("speech synthesis failed:", result)
        else:
            with open(filepath, 'wb') as file:
                file.write(result)
    
    
    
    def play_speech(filepath):
        """Play the audio file at `filepath` with ffplay (must be on PATH).

        Uses an argv list instead of a shell-interpolated string, so paths
        containing spaces or shell metacharacters are handled safely;
        `-autoexit` makes ffplay return once playback finishes instead of
        blocking forever.
        """
        import subprocess
        subprocess.run(["ffplay", "-autoexit", filepath])
    
    # def play_speech(filepath):
        # """ 播放语音 """
        # import pyaudio
        # import wave 
        # wf = wave.open(filepath, 'rb') #二进制只读方式打开wav文件
        # p = pyaudio.PyAudio()
        # stream=p.open(format=p.get_format_from_width(wf.getsampwidth()),channels=wf.getnchannels(),rate=wf.getframerate(),output=True)
        # stream = p.open(format=pyaudio.paInt16,
                        # channels=1,
                        # rate=16000,
                        # output=True) #打开数据流
        # data = wf.readframes(1024) #读取数据
        # while data != '': #播放  
            # stream.write(data)
            # data = wf.readframes(1024)
        # stream.stop_stream()
        # stream.close()
        # p.terminate()
     
     
    
    
    # def Conversion_sampling_rate(filepath, newfilepath):
        # """ 转换采样率 """
        # from pydub import AudioSegment
        # setframefp = AudioSegment.from_file(filepath)
        # setframefp.set_frame_rate(16000)
        # setframefp.export(newfilepath, format='wav')
    
    
    
    def wav_to_pcm(wav_file):
        """Convert a wav file to 16 kHz mono 16-bit little-endian PCM.

        Returns the path of the generated .pcm file (same base name as
        `wav_file`). Requires ffmpeg on the PATH.
        """
        import os
        # os.path.splitext strips only the final extension, unlike the
        # fragile split(".")[0] which mangles names with extra dots.
        pcm_file = "%s.pcm" % os.path.splitext(wav_file)[0]
        # Quote both paths so file names containing spaces survive the shell.
        os.system('ffmpeg -y -i "%s" -acodec pcm_s16le -f s16le -ac 1 -ar 16000 "%s"'
                  % (wav_file, pcm_file))
        return pcm_file
    
    
    
    
    def sound_record(file_name):
        """Record RECORD_SECONDS of 16 kHz mono audio from the default
        microphone and save it to `file_name` as a wav file.

        The format (16-bit, mono, 16 kHz) matches what Baidu ASR expects.
        """
        import pyaudio
        import wave
        CHUNK = 1024                # frames per buffer read
        FORMAT = pyaudio.paInt16    # 16-bit samples
        CHANNELS = 1
        RATE = 16000                # sampling rate in Hz
        RECORD_SECONDS = 3
        
        p = pyaudio.PyAudio()
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        # Query the sample width while the PyAudio instance is alive,
        # rather than after terminate() as the original did.
        sample_width = p.get_sample_size(FORMAT)
        print("开始录音,请说话......")
        frames = []
        try:
            for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
                frames.append(stream.read(CHUNK))
            print("录音结束!")
        finally:
            # Always release the audio device, even if stream.read raises.
            stream.stop_stream()
            stream.close()
            p.terminate()
    
        # wave.open objects support the context-manager protocol,
        # guaranteeing the file is closed even on a write error.
        with wave.open(file_name, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(sample_width)
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))
    
    
    def speech_recognition(filepath):
        """Speech-to-text: recognize the 16 kHz pcm file at `filepath`.

        Returns the recognized text, or the string "error" when the API
        response contains no usable result.
        """
        with open(filepath, 'rb') as fp:
            speechfile = fp.read()
        result = client.asr(speechfile, 'pcm', 16000, {
            'dev_pid': 1536,   # Mandarin model
        })
        # Narrow exceptions instead of a bare except: `result.get("result")`
        # yields None on an error response (subscripting raises TypeError),
        # and an empty result list raises IndexError.
        try:
            res_str = result.get("result")[0]
            print(res_str)
        except (TypeError, IndexError, AttributeError):
            res_str = "error"
            print("识别没有成功")
        return res_str
    
     
    # 测试
    # text = "世界很复杂百度更懂你"
    # synthesisfilepath = "synthesisspeech.pcm"
    # synthesisfilepath = "16k.pcm"
    # speech_synthesis(text, synthesisfilepath)
    # wav_file = pcm_to_wav(synthesisfilepath)
    # play_speech(wav_file)
    
    # recordfilepath = "recordspeech.wav"
    # sound_record(recordfilepath)
    # pcm_file = wav_to_pcm(recordfilepath)
    # speech_recognition(pcm_file)
    
    
    
    
    """ 控制面板 """
    from tkinter import *
    from tkinter import ttk
    from tkinter import messagebox
    import os
    class App:
        """Tk control panel: a record button captures 3 s of speech and the
        Baidu ASR transcription is shown in a text box."""

        def __init__(self, master):
            self.master = master
            self.master.title("调用百度AI识别语音")
            self.master.geometry("500x400")
            # Keep the image referenced on self so Tk does not garbage-collect it.
            self.buttonimg = PhotoImage(file= os.path.join(os.path.dirname(os.path.abspath(__file__)), 'luyin - small.gif')) 
            self.initWidgets()
            
        def initWidgets(self):
            """Build the record button, the result label and the text box.

            Only pack() is used: the original also called place(), but a
            later pack() call on the same widget overrides place(), so those
            calls were dead code and have been removed.
            """
            self.button = Button(self.master, text='开始录音', image=self.buttonimg,
                                 command=self.open_sound_record, height=100, width=100)
            self.button.pack(ipadx=5, ipady=5, pady=20)
            
            self.label = Label(self.master, text="语音识别结果:")
            self.label.pack()
            
            self.text = Text(self.master, height=3, width=200)
            self.text.pack()
        
        def open_sound_record(self):
            """Record audio, convert to pcm, run ASR and display the result."""
            recordfilepath = "recordspeech.wav"
            sound_record(recordfilepath)
            pcm_file = wav_to_pcm(recordfilepath)
            res_str = speech_recognition(pcm_file)
            if res_str == "error":
                # showinfo's return value ("ok") carried no information;
                # the original print() around it was noise.
                messagebox.showinfo("出错", "没有成功识别语音!")
            else:
                self.text.insert("insert", res_str)
            
    # Standard entry-point guard: the GUI only starts when the file is run
    # as a script, not when it is imported for its helper functions.
    if __name__ == "__main__":
        root = Tk()
        App(root)
        root.mainloop()
    
  • 相关阅读:
    hbuilder中如何使用egit上传项目
    网络攻防第二周学习笔记
    sqlserver两表关联的更新
    ISAPI_rewrite中文手册
    Unity中C#单例模式使用总结
    Window Live Writer Test
    Spring Cloud 服务注册与发现(Eureka 找到了!找到了! 嘻嘻)
    Spring Cloud 服务消费与负载均衡(feign)
    Spring Cloud 服务消费与负载均衡(Rest + Ribbon )
    列表的响应式排版
  • 原文地址:https://www.cnblogs.com/yejifeng/p/11428936.html
Copyright © 2011-2022 走看看