zoukankan      html  css  js  c++  java
  • 使用机器学习方法识别音频文件的音乐和演讲

    背景

    最近下载了一批类似百家讲坛的音频文件。这些文件前面部分是演讲类的音频,主要讲历史的,后面一部分是音乐。
    但是我只想听演讲类部分,不想听音乐。所以希望把文件切割,把音乐部分切走,只留下演讲部分。
    观察文件,发现每个文件的音乐都不一样,演讲和音乐的长度也不一样。
    这里一个技术难点就是怎么识别哪些音频是演讲,哪些音频是音乐。
    通过KNN算法,1s的音频文件的预测正确率是92%。
    同时3s都判断为音乐才进行分割,整个文件的分割正确率是98%。

    音频文件和源码

    音频文件和源码可以在这里下载

    一、把音频文件转换为数字

    # encoding=gbk
    import random
    import wave
    import matplotlib.pyplot as plt
    import numpy as np
    import os
    
    # nchannels 声道
    # sampwidth 样本宽度
    # framerate 帧率,也就是一秒有多少帧
    # nframes 文件一共有多少帧
    
    def pre_deal(file_path):
        """音频解析,返回音频数据"""
        f = wave.open(file_path, 'rb')
        params = f.getparams()
        nchannels, sampwidth, framerate, nframes = params[:4]
        strData = f.readframes(nframes)  # 读取音频,字符串格式
        waveData = np.fromstring(strData, dtype=np.int16)  # 将字符串转化为int
    
        waveData = waveData[::nchannels]  # 根据声道数,转换为单声道
        rate = 20.00
        framerate = framerate / rate  # 降低帧率
        nframes = nframes / rate  # 降低帧率
        waveData = waveData[::int(rate)]
    
        # wave幅值归一化
        max_ = float(max(abs(waveData)))
        waveData = waveData / max_
    
        return waveData, framerate, nframes
    
    
    def plpot(waveData):
        """画图"""
        time = [i for i, v in enumerate(waveData)]
        plt.plot(time, waveData)
        plt.xlabel("Time")
        plt.ylabel("Amplitude")
        plt.title("Single channel wavedata")
        plt.grid('on')  # 标尺,on:有,off:无。
        plt.show()
    
    
    def mp3towav(file_path, to_file_path):
        """mp3文件转wav文件"""
        if os.path.exists(to_file_path):
            return to_file_path
        from pydub import AudioSegment
        print file_path
        song1 = AudioSegment.from_mp3(file_path)
        song1.export(to_file_path, 'wav')
        return to_file_path
    
    
    if __name__ == '__main__':
        file_path = 'D:BaiduNetdiskDownload\a.mp3'
        file_path = mp3towav('D:BaiduNetdiskDownload\a.mp3', file_path.replace('mp3', 'wav'))
        data, _, _ = pre_deal(file_path)
        plpot(data)
    
    
    • 通过wave库,可以识别音频文件,声道,样本宽度,帧率,帧数等
    • 由于文件的左右声道的值都一样,所以简单处理只要其中一个声道
    • 为了提升机器学习速度,修改采样率为20分之一,来降低数据量
    • wave库只支持wav文件,所以需要把mp3转换为wav,这里用到了pydub库
    • 解析音频数据后,通过matplotlib库来画图,显示出波纹图

    二、人工标记数据

    使用音频处理软件goldwave,采用人工听的方法来把音频文件的音乐部分剪掉,保存的文件放在chg目录里面,剪之前的文件放在raw目录下面。一共剪了18个文件。

    三、获取训练数据

    class LeaningTest():
        chg_path = r'D:BaiduNetdiskDownload	estchg'
        raw_path = r'D:BaiduNetdiskDownload	est
    aw'
        model = None
    
        @classmethod
        def load_model(cls):
            cls.model = pickle_utils.load('knn.model.pkl')
    
        @classmethod
        def chg(cls):
            chg_path = r'D:BaiduNetdiskDownload	estchg'
            raw_path = r'D:BaiduNetdiskDownload	est
    aw'
            for i, f in enumerate(os.listdir(chg_path)):
                shutil.copy(chg_path + '\' + f, chg_path + '\' + '%s.mp3' % i)
                shutil.copy(raw_path + '\' + f, raw_path + '\' + '%s.mp3' % i)
    
        @classmethod
        def get_path(cls, i, t):
            p = cls.chg_path if t == 'chg' else cls.raw_path
            return p + '\' + '%s.mp3' % i
    
        @classmethod
        def sample_cnt(cls, sample):
            """
            转换样本数据,返回每个区间的计数。
            例如从[0.1,0.1,0.8]转换为[2,1]
            2是[0,0.5)区间的计数
            1是[0.5,1)区间的计数
            """
            step = 0.025
            qujians = []
            start = 0
            while start < 1:
                qujians.append((start, start + step))
                start += step
            new_sample = [0 for i in range(len(qujians))]
            for s in sample:
                for i, qujian in enumerate(qujians):
                    if qujian[0] <= s < qujian[1]:
                        new_sample[i] += 1
            return new_sample
    
        @classmethod
        def get_sample(cls, i):
            """
            获取用于机器学习的数据
            return [([100,200],0)]
            """
            chg = cls.to_wav(cls.get_path(i, 'chg'))
            raw = cls.to_wav(cls.get_path(i, 'raw'))
    
            data_chg, framerate_chg, n_frames_chg = pre_deal(chg)
            total_sec_chg = int(n_frames_chg / framerate_chg)
    
            data_raw, framerate_raw, n_frames_raw = pre_deal(raw)
            total_sec_raw = int(n_frames_raw / framerate_raw)
    
            length = 1
            samples = []
            for i in range(60, total_sec_raw, length):
                if total_sec_chg + 5 < i < total_sec_chg + 5:
                    continue  # 不要这部分
    
                flag = 0 if i < total_sec_chg else 1
                # print get_index(framerate, 0, i),get_index(framerate, 0, i + length),total_sec
                sample = data_raw[get_index(framerate_raw, 0, i):get_index(framerate_raw, 0, i + length)]
    
                sample = cls.sample_cnt(sample)
    
                samples.append((sample, flag))
            return samples
    
        @classmethod
        def to_wav(cls, file_path):
            """转换mp3为wav"""
            if 'mp3' in file_path:
                to_file_path = file_path.replace('mp3', 'wav')
                mp3towav(file_path, to_file_path)
                file_path = to_file_path
            return file_path
    
        @classmethod
        def get_all_sample(cls, ):
            """获取所有样本"""
            file_name = 'sample4.json'
            if os.path.exists(file_name):
                with open(file_name, 'r') as f:
                    return json.loads(f.read())
            else:
                samples = []
                for i in range(1):
                    print 'get sample', i
                    samples.extend(cls.get_sample(i))
                with open(file_name, 'w') as f:
                    f.write(json.dumps(samples))
                return samples
    
        @classmethod
        def train_wrapper(cls):
            """训练"""
            samples = cls.get_all_sample()
            label0 = [s for s in samples if s[1] == 0]
            label1 = [s for s in samples if s[1] == 1]
            random.shuffle(label0)
            random.shuffle(label1)
            train_datas_sets = [i[0] for i in label0[:int(len(label0) * 0.7)]] + [i[0] for i in
                                                                                  label1[:int(len(label1) * 0.7)]]
            train_labels_set = [i[1] for i in label0[:int(len(label0) * 0.7)]] + [i[1] for i in
                                                                                  label1[:int(len(label1) * 0.7)]]
            test_datas_set = [i[0] for i in label0[int(len(label0) * 0.7):]] + [i[0] for i in
                                                                                label1[int(len(label1) * 0.7):]]
            test_labels_set = [i[1] for i in label0[int(len(label0) * 0.7):]] + [i[1] for i in
                                                                                 label1[int(len(label1) * 0.7):]]
            print len(train_datas_sets)
            # cls.train_knn(train_datas_sets, train_labels_set, test_datas_set, test_labels_set)
    
     
    
    if __name__ == '__main__':
        LeaningTest.train_wrapper()
    
    • 以1秒钟为一个样本,然后对数据进行计数,返回每个区间的计数,区间间隔是0.025,所以一个样布的向量长度是40
    • 由于前60s都是前奏,所以不作为训练数据
    • 由于是人工分割,所以可能有误差,所以把分割点前后5s的都不作为训练数据

    四、训练

    @classmethod
    def train(cls, train_datas_sets, train_labels_set, test_datas_set, test_labels_set):
        """
        """
        from sklearn.naive_bayes import GaussianNB
        from sklearn.linear_model import LogisticRegression
        from sklearn.linear_model import LinearRegression
        from sklearn import tree
        from sklearn import svm
        from sklearn.neural_network import MLPClassifier
        from sklearn import neighbors
        for mechine in [svm.SVC, LogisticRegression, LinearRegression, tree.DecisionTreeClassifier,
                        neighbors.KNeighborsClassifier, MLPClassifier, GaussianNB]:
            clf = mechine()
            clf.fit(train_datas_sets, train_labels_set)  # 训练
            score = clf.score(test_datas_set, test_labels_set)  # 预测测试集,并计算正确率
            print 'score', mechine, score
    

    训练结果:

    score <class 'sklearn.svm.classes.SVC'> 0.7203252032520325
    score <class 'sklearn.linear_model.logistic.LogisticRegression'> 0.8886178861788618
    score <class 'sklearn.linear_model.base.LinearRegression'> 0.40864632529611417
    score <class 'sklearn.tree.tree.DecisionTreeClassifier'> 0.8888888888888888
    score <class 'sklearn.neighbors.classification.KNeighborsClassifier'> 0.9224932249322493
    score <class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'> 0.835230352303523
    score <class 'sklearn.naive_bayes.GaussianNB'> 0.8035230352303523
    
    • 使用多种模型进行训练,得到的结果为knn的准确率最高,达到了0.92

    所以训练knn模型,并保存为pickle

    @classmethod
    def train_knn(cls, train_datas_sets, train_labels_set, test_datas_set, test_labels_set):
        from sklearn import neighbors
        mechine = neighbors.KNeighborsClassifier
        clf = mechine()
        clf.fit(train_datas_sets, train_labels_set)  # 训练
        score = clf.score(test_datas_set, test_labels_set)  # 预测测试集,并计算正确率
        print 'score', mechine, score
        pickle_utils.dump(clf, 'knn.model.pkl')
    

    五、分割文件

    @classmethod
    def get_cut_sce(cls, file_path, model):
        """获取分割的秒数,找不到返回None"""
        file_path = cls.to_wav(file_path)
        data_raw, framerate, n_frames = pre_deal(file_path)
        total_sec = int(n_frames / framerate)
    
        length = 1
        rets = []
        for i in range(60, total_sec, length):
            # print file_path, i
            sample = data_raw[get_index(framerate, 0, i):get_index(framerate, 0, i + length)]
    
            sample = cls.sample_cnt(sample)
            ret = model.predict([sample])
            rets.append(ret)
            if ret == 1 and len(rets) >= 3 and rets[-2] == 1 and rets[-3] == 1:
                return i
    
        return None
    
    @classmethod
    def get_min(cls, sec):
        """转换秒数为 分秒格式"""
        print '%s:%s' % (int(sec / 60), int(sec % 60))
    
    @classmethod
    def predict(cls, ):
        """预测"""
        file_path = r'D:BaiduNetdiskDownloadc.mp3'
        model = pickle_utils.load('knn.model.pkl')
        sec = cls.get_cut_sce(file_path, model)
        print 'sec', sec, cls.get_min(sec)
    
    @classmethod
    def cut_song(cls, file_path, to_file_path, file_name):
        """分割歌曲"""
        print 'cut_song', file_name.decode('gbk'), file_path
        sec = cls.get_cut_sce(file_path, cls.model)
        if sec is None:
            print 'error can not find sec', file_path, file_name.decode('gbk')
            return 0
        song = AudioSegment.from_mp3(file_path)
        # to_file_path=file_path.replace('mp3','wav')
        song = song[:sec * 1000]
        song.export(to_file_path, 'mp3', bitrate='64k')
        return 1
    
    
    @classmethod
    def cut_songs(cls, ):
        """分割某个文件夹下面的所有歌曲"""
        root_path = r'D:BaiduNetdiskDownload听世界-战国5(156集)64kbps'
        del_path = r'D:BaiduNetdiskDownload	o_del'
        for f in os.listdir(root_path):
            if 'mp3' in f and 'cut' not in f:
                file_path = root_path + '\' + f
                if os.path.exists(file_path + '.cut.mp3'):
                    print 'exist', file_path.decode('gbk') + '.cut.mp3'
                    continue
                # 由于pydub不支持windows的中文路径,所以只能把源文件已到一个临时的英文目录,然后执行分割 然后把临时文件移走
                tmp_file_path = 'D:BaiduNetdiskDownload\test.mp3'  # pydub不支持中文地址,只能这样
                tmp_wav_path = tmp_file_path.replace('mp3', 'wav')
                tmp_to_file_path = tmp_file_path + '.cut.mp3'
                shutil.copy(file_path, tmp_file_path)
                ret = cls.cut_song(tmp_file_path, tmp_to_file_path,f)
                shutil.move(tmp_file_path, del_path + '\del1_' + f)
                shutil.move(tmp_wav_path, del_path + '\del3_' + f)
                try:
                    # 有可能找不到分割点,导致没有分割,所以加上try
                    shutil.copy(tmp_to_file_path, file_path + '.cut.mp3')
                    shutil.move(tmp_to_file_path, del_path + '\del2_' + f)
    
                except:
                    import traceback
                    print traceback.format_exc()
    
    
    
    
    @classmethod
    def test(cls):
        song = AudioSegment.from_mp3(u'D:BaiduNetdiskDownload测试\a.mp3'.encode('gbk'))
    
    
    if __name__ == '__main__':
        LeaningTest.load_model()
        LeaningTest.cut_songs()
    
    • 即使准确率达到0.92,但是还没有到100%,所以连续3s都判断为音乐,才分割,这样理论的准确率可以去到1-0.08^3。
    • 由于pydub不支持windows的中文路径,所以只能把源文件已到一个临时的英文目录,然后执行分割 然后把临时文件移走

    未经同意,请不要转载

  • 相关阅读:
    Sharepoint COMException 0x81020037
    [SharePoint]如何去掉欢迎导航的信息?
    配置两台Azure服务器,一台加入另一台的ad域加入不进去的问题
    在Azure上部署Sqlserver网络访问不了的问题
    Page 由于代码已经过优化或者本机框架位于调用堆栈之上
    ProjectOnline同步数据到线下
    关于ProjectServer定制化项目中心页面
    oracle数据库逐步学习总结【基础二】
    Java学习个人总结
    oracle数据库逐步学习总结【基础一】
  • 原文地址:https://www.cnblogs.com/Xjng/p/12560707.html
Copyright © 2011-2022 走看看