
    Machine Learning - Text Classification Example with Naive Bayes

    1. Prepare the training samples

    The Fudan University text classification corpus is used (the assumed directory layout is sketched after this list).

    2. Train the model

    3. Prepare the test data

    4. Classify
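
    Both segment() and bunchObj() below assume that the corpus directory contains one subdirectory per category, with the raw text files inside. A minimal sketch of the assumed layout (the category names are illustrative, not the exact Fudan corpus labels):

    train/
        C3-Art/
            0001.txt
            0002.txt
        C7-History/
            0001.txt
        ...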

    Training the model

    The script below segments the training corpus with jieba, stores the segmented texts as a Bunch object, and builds a TF-IDF vector space from them.

    import os
    import jieba
    # Bunch class
    from sklearn.datasets.base import Bunch
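    # Note (assumption about newer library versions): sklearn.datasets.base has been
    # removed in recent scikit-learn releases; if this import fails, use
    # "from sklearn.utils import Bunch" instead.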
    import pickle
    from sklearn import feature_extraction
    from sklearn.feature_extraction.text import TfidfTransformer  # TF-IDF transformer class
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    
    # Two helper functions for reading and saving files
    # Save content to a file
    def savefile(savepath, content):
        with open(savepath, "w", encoding="GBK") as fp:
            fp.write(content)
    
    
    # Read a file; returns None if the file cannot be decoded
    def readfile(path, encode):
        content = None
        try:
            with open(path, "r", encoding=encode) as fp:
                content = fp.read()
        except UnicodeDecodeError:
            print("Error: failed to read file " + path)
        return content
    
    
    # Read a persisted Bunch object
    def readbunchobj(path):
        with open(path, "rb") as file_obj:
            return pickle.load(file_obj)
    
    
    # Write a Bunch object to disk
    def writebunchobj(path, bunchobj):
        with open(path, "wb") as file_obj:
            pickle.dump(bunchobj, file_obj)
    
    
    # Main routine for segmenting the whole corpus
    # Segment the training texts and store the results
    def segment(corpus_path, seg_path):
        # Get all subdirectories (one per category) under corpus_path
        cateList = os.listdir(corpus_path)
        for myDir in cateList:
            if not myDir.startswith("."):
                # Build the path of the category subdirectory
                class_path = corpus_path + myDir + "/"
                # Build the output directory for the segmented texts of this category
                seg_dir = seg_path + myDir + "/"
                # Create the output directory if it does not exist
                if not os.path.exists(seg_dir):
                    os.makedirs(seg_dir)
                # List all files in the category directory
                file_list = os.listdir(class_path)
                # Iterate over the files in the category directory
                for file_path in file_list:
                    # Build the full path of the file
                    fullname = class_path + file_path
                    print("path:" + fullname)
                    # Read the file content
                    content = readfile(fullname, "GBK")
                    if content is not None:
                        content = content.strip()
                        # Remove line breaks and redundant whitespace
                        content = content.replace("\r\n", "").strip()
                        # Segment the content with jieba
                        content_seg = jieba.cut(content)
                        # Join the tokens with spaces (so TfidfVectorizer can split them later)
                        # and save the result to the segmented-corpus directory
                        savefile(seg_dir + file_path, " ".join(content_seg))
        print("Chinese corpus segmentation finished!")
    
    # Convert the segmented text files into a Bunch object and persist it
    def bunchObj(wordbag_path, seg_path):
        bunch = Bunch(target_name=[], label=[], filename=[], contents=[])
        # One subdirectory per category
        catelist = os.listdir(seg_path)
        # Store the category names in the Bunch object
        bunch.target_name.extend(catelist)
        for myDir in catelist:
            if not myDir.startswith("."):
                class_path = seg_path + myDir + "/"
                file_list = os.listdir(class_path)
                for file_path in file_list:
                    fullname = class_path + file_path
                    print(fullname)
                    # Store the category label of the current file
                    bunch.label.append(myDir)
                    # Store the path of the current file
                    bunch.filename.append(fullname)
                    # Store the segmented text of the file
                    bunch.contents.append(readfile(fullname, "GBK").strip())
        # Persist the Bunch object
        writebunchobj(wordbag_path, bunch)
        print("Finished building the text Bunch object!")
    
    # Train the model: build the TF-IDF vector space from the training set
    def startTrain(stopword_path, wordbag_path, space_path):
        # 1. Load the stop-word list
        stpwrdlst = readfile(stopword_path, "UTF-8").splitlines()
        # 2. Load the Bunch object holding the segmented training texts
        bunch = readbunchobj(wordbag_path)

        # 3. Build the TF-IDF vector space model
        tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filename=bunch.filename, tdm=[], vocabulary={})
        # Initialize the vector space model with TfidfVectorizer;
        # sublinear_tf=True uses 1 + log(tf), max_df=0.5 drops terms that appear in more than half of the documents
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)

        # 4. Convert the texts into a TF-IDF term-document matrix and keep the fitted vocabulary
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_

        # 5. Persist the TF-IDF word bag
        writebunchobj(space_path, tfidfspace)
        print("Text classification model training finished")
    
    # Path of the raw (unsegmented) classified corpus
    corpus_path = "/Users/FengZhen/Desktop/accumulate/机器学习/文本集/train/"
    # Path of the segmented classified corpus
    segment_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_segment/"
    # Path of the persisted Bunch object of the segmented corpus
    wordbag_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/train_set.dat"
    # Path of the stop-word list
    stop_words_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/中文停用词表.txt"
    # Path of the persisted TF-IDF word bag
    space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"


    # Segment the training texts and store the results
    # segment(corpus_path, segment_path)

    # Convert the segmented text files into a Bunch object and persist it
    # bunchObj(wordbag_path, segment_path)

    # Start training
    startTrain(stop_words_path, wordbag_path, space_path)
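
    After startTrain() runs, tfidfspace.dat holds the sparse term-document matrix and the fitted vocabulary. A quick sanity check (a minimal sketch; it reuses the readbunchobj helper and the space_path defined above):

    tfidfspace = readbunchobj(space_path)
    print(tfidfspace.tdm.shape)        # (number of documents, vocabulary size)
    print(len(tfidfspace.vocabulary))  # number of distinct terms kept
    print(tfidfspace.target_name)      # category names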

    Preparing the test data

    The script below repeats the segmentation and Bunch-building steps for the test corpus, then vectorizes the test documents with the vocabulary learned from the training set so that the feature spaces match.

    import os
    import jieba
    # Bunch class
    from sklearn.datasets.base import Bunch
    import pickle
    from sklearn import feature_extraction
    from sklearn.feature_extraction.text import TfidfTransformer  # TF-IDF transformer class
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    
    # Two helper functions for reading and saving files
    # Save content to a file
    def savefile(savepath, content):
        with open(savepath, "w", encoding="GBK") as fp:
            fp.write(content)
    
    
    # Read a file; returns None if the file cannot be decoded
    def readfile(path, encode):
        content = None
        try:
            with open(path, "r", encoding=encode) as fp:
                content = fp.read()
        except UnicodeDecodeError:
            print("Error: failed to read file " + path)
        return content
    
    
    # Read a persisted Bunch object
    def readbunchobj(path):
        with open(path, "rb") as file_obj:
            return pickle.load(file_obj)
    
    
    # Write a Bunch object to disk
    def writebunchobj(path, bunchobj):
        with open(path, "wb") as file_obj:
            pickle.dump(bunchobj, file_obj)
    
    
    # Main routine for segmenting the whole corpus
    # Segment the test texts and store the results
    def segment(corpus_path, seg_path):
        # Get all subdirectories (one per category) under corpus_path
        cateList = os.listdir(corpus_path)
        for myDir in cateList:
            if not myDir.startswith("."):
                # Build the path of the category subdirectory
                class_path = corpus_path + myDir + "/"
                # Build the output directory for the segmented texts of this category
                seg_dir = seg_path + myDir + "/"
                # Create the output directory if it does not exist
                if not os.path.exists(seg_dir):
                    os.makedirs(seg_dir)
                # List all files in the category directory
                file_list = os.listdir(class_path)
                # Iterate over the files in the category directory
                for file_path in file_list:
                    # Build the full path of the file
                    fullname = class_path + file_path
                    print("path:" + fullname)
                    # Read the file content
                    content = readfile(fullname, "GBK")
                    if content is not None:
                        content = content.strip()
                        # Remove line breaks and redundant whitespace
                        content = content.replace("\r\n", "").strip()
                        # Segment the content with jieba
                        content_seg = jieba.cut(content)
                        # Join the tokens with spaces (so TfidfVectorizer can split them later)
                        # and save the result to the segmented-corpus directory
                        savefile(seg_dir + file_path, " ".join(content_seg))
        print("Chinese corpus segmentation finished!")
    
    # Convert the segmented text files into a Bunch object and persist it
    def bunchObj(wordbag_path, seg_path):
        bunch = Bunch(target_name=[], label=[], filename=[], contents=[])
        # One subdirectory per category
        catelist = os.listdir(seg_path)
        # Store the category names in the Bunch object
        bunch.target_name.extend(catelist)
        for myDir in catelist:
            if not myDir.startswith("."):
                class_path = seg_path + myDir + "/"
                file_list = os.listdir(class_path)
                for file_path in file_list:
                    fullname = class_path + file_path
                    print(fullname)
                    # Store the category label of the current file
                    bunch.label.append(myDir)
                    # Store the path of the current file
                    bunch.filename.append(fullname)
                    # Store the segmented text of the file
                    bunch.contents.append(readfile(fullname, "GBK").strip())
        # Persist the Bunch object
        writebunchobj(wordbag_path, bunch)
        print("Finished building the text Bunch object!")
    
    # Build the test-set TF-IDF vector space (the vocabulary comes from the training set)
    def startTrain(stopword_path, wordbag_path, space_path, train_space_path):
        # 1. Load the stop-word list
        stpwrdlst = readfile(stopword_path, "UTF-8").splitlines()
        # 2. Load the Bunch object holding the segmented test texts
        bunch = readbunchobj(wordbag_path)

        # 3. Build the test-set TF-IDF vector space
        testspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filename, tdm=[], vocabulary={})

        # 4. Load the training-set TF-IDF word bag
        trainbunch = readbunchobj(train_space_path)

        # 5. Initialize the vectorizer with the training vocabulary so that the
        #    test features line up with the training features
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, vocabulary=trainbunch.vocabulary)
        testspace.tdm = vectorizer.fit_transform(bunch.contents)
        testspace.vocabulary = trainbunch.vocabulary
        writebunchobj(space_path, testspace)
        print("Test-set TF-IDF space built")
    
    
    # Path of the raw (unsegmented) test corpus
    corpus_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/测试文本集/"
    # Path of the segmented test corpus
    segment_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_segment/"
    # Path of the persisted Bunch object of the segmented test corpus
    wordbag_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/test_set.dat"
    # Path of the stop-word list
    stop_words_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/中文停用词表.txt"
    # Path of the persisted test TF-IDF word bag
    space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/testspace.dat"

    # Path of the persisted training TF-IDF word bag
    train_space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"

    # Segment the test texts and store the results
    # segment(corpus_path, segment_path)
    #
    # # Convert the segmented text files into a Bunch object and persist it
    # bunchObj(wordbag_path, segment_path)

    # Build the test-set vector space
    startTrain(stop_words_path, wordbag_path, space_path, train_space_path)
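
    Because the vectorizer is created with vocabulary=trainbunch.vocabulary, the test matrix has exactly the same columns (features) as the training matrix; the classifier in the next step relies on this. A quick check (a minimal sketch reusing the helpers and paths defined above):

    trainspace = readbunchobj(train_space_path)
    testspace = readbunchobj(space_path)
    assert trainspace.tdm.shape[1] == testspace.tdm.shape[1], "feature dimensions differ"
    print("train:", trainspace.tdm.shape, "test:", testspace.tdm.shape)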

    Testing

    The multinomial naive Bayes classifier is trained on the training TF-IDF matrix and then used to predict the class of every test document; the error rate is printed at the end.

    import pickle
    from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes classifier
    
    # Read a persisted Bunch object
    def readbunchobj(path):
        with open(path, "rb") as file_obj:
            return pickle.load(file_obj)
    
    # Load the training TF-IDF vector space
    trainpath = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"
    train_set = readbunchobj(trainpath)

    # Load the test TF-IDF vector space
    testpath = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/testspace.dat"
    test_set = readbunchobj(testpath)

    # Apply multinomial naive Bayes
    # alpha is the additive (Laplace/Lidstone) smoothing parameter; a small value
    # such as 0.001 applies very little smoothing
    clf = MultinomialNB(alpha=0.001).fit(train_set.tdm, train_set.label)

    # Predict the class of each test document
    predicted = clf.predict(test_set.tdm)
    total = len(predicted)
    rate = 0  # counts the misclassified documents
    for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
        print(file_name, "actual class:", flabel, "--> predicted class:", expct_cate)
        if flabel != expct_cate:
            rate += 1
    # Error rate
    print("error rate:", float(rate) * 100 / float(total), "%")