  • Implementing classification with Naive Bayes

    # -*- coding: utf-8 -*-
    
    import sys
    import os
    import numpy as np
    import pickle
    from sklearn import metrics
    
    # Load the sample data set
    def loadDataSet():
        postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                       ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                       ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him', 'my'],
                       ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                       ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                       ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
        classVec = [0, 1, 0, 1, 0, 1]  # class labels: 1 = abusive, 0 = not abusive
        return postingList, classVec
    
    
    # Read a file's raw bytes
    def readfile(path):
        with open(path, "rb") as fp:
            content = fp.read()
        return content
    
    """
    
    #计算分类精度:
    def metrics_result(actual,predict):
        print('精度:{0:.3f}'.format(metrics.precision_score(actual,predict)))
        print ('召回:{0:0.3f}'.format(metrics.recall_score(actual,predict)))
        print ('f1-score:{0:.3f}'.format(metrics.f1_score(actual,predict)))
    
    """
    
    # Read a pickled Bunch object
    def readbunchobj(path):
        with open(path, "rb") as file_obj:
            bunch = pickle.load(file_obj)
        return bunch
    
    
    # Write a Bunch object to disk with pickle
    def writebunchobj(path, bunchobj):
        with open(path, "wb") as file_obj:
            pickle.dump(bunchobj, file_obj)
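    # A minimal round-trip sketch for the two helpers above
    # ("train_set.dat" is an assumed path, not from the original post):
    #   writebunchobj("train_set.dat", bunch)
    #   bunch = readbunchobj("train_set.dat")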
    
    
    class NBayes(object):
        def __init__(self):
            self.vocabulary = []  # vocabulary (list of distinct words)
            self.idf = 0  # idf weight vector over the vocabulary
            self.tf = 0  # weight matrix of the training set
            self.tdm = 0  # P(x|yi)
            self.Pcates = {}  # P(yi) -- dict mapping each class label to its prior probability
            self.labels = []  # class label of each document, supplied externally
            self.doclength = 0  # number of training documents
            self.vocablen = 0  # vocabulary size
            self.testset = 0  # test set
    
        # Load the training set and build the vocabulary and the tf/idf weights
        def train_set(self, trainset, classVec):
            self.cate_prob(classVec)  # compute each class's prior probability P(yi)
            self.doclength = len(trainset)
            tempset = {word for doc in trainset for word in doc}  # build the vocabulary
            self.vocabulary = list(tempset)
            self.vocablen = len(self.vocabulary)
            self.calc_wordfreq(trainset)
            # self.calc_tfidf(trainset)  # alternatively, generate tf-idf weights
            self.build_tdm()  # accumulate each vocabulary dimension per class: P(x|yi)
    
        # Generate tf-idf weights
        def calc_tfidf(self, trainset):
            self.idf = np.zeros([1, self.vocablen])
            self.tf = np.zeros([self.doclength, self.vocablen])
            for indx in range(self.doclength):
                for word in trainset[indx]:
                    self.tf[indx, self.vocabulary.index(word)] += 1
                # normalize by document length to remove length bias
                self.tf[indx] = self.tf[indx] / float(len(trainset[indx]))
                for singleword in set(trainset[indx]):
                    self.idf[0, self.vocabulary.index(singleword)] += 1
            self.idf = np.log(float(self.doclength) / self.idf)
            self.tf = np.multiply(self.tf, self.idf)  # element-wise product, broadcasting idf over each row
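            # Worked check (assumed numbers, not from the original post):
            # with doclength = 6 documents, a word appearing in 3 of them gets
            # idf = log(6 / 3) ≈ 0.693, while a word present in all 6 gets
            # idf = log(6 / 6) = 0, so ubiquitous words are weighted down to zero.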
    
        # Generate plain word-frequency vectors
        def calc_wordfreq(self, trainset):
            self.idf = np.zeros([1, self.vocablen])  # 1 x vocabulary size
            self.tf = np.zeros([self.doclength, self.vocablen])  # number of documents x vocabulary size
            for indx in range(self.doclength):  # iterate over all documents
                for word in trainset[indx]:  # iterate over every word in the document
                    self.tf[indx, self.vocabulary.index(word)] += 1  # increment the word's slot in the vocabulary
                for singleword in set(trainset[indx]):
                    self.idf[0, self.vocabulary.index(singleword)] += 1  # document frequency of each distinct word
    
        # Compute the prior probability of each class in the data set: P(yi)
        def cate_prob(self, classVec):
            self.labels = classVec  # class label of each document
            labeltemps = set(self.labels)  # the distinct class labels, here {0, 1}
            # print('distinct classes:', labeltemps)
            for labeltemp in labeltemps:
                # self.labels.count(labeltemp) counts how often this label occurs
                self.Pcates[labeltemp] = float(self.labels.count(labeltemp)) / float(len(self.labels))
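            # Worked example with the data above: classVec = [0, 1, 0, 1, 0, 1]
            # yields self.Pcates = {0: 0.5, 1: 0.5}.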
    
        # Accumulate each vocabulary dimension per class: P(x|yi)
        def build_tdm(self):
            self.tdm = np.zeros([len(self.Pcates), self.vocablen])  # number of classes x vocabulary size
            sumlist = np.zeros([len(self.Pcates), 1])  # total weight of each class
            for indx in range(self.doclength):
                self.tdm[self.labels[indx]] += self.tf[indx]  # sum the word vectors of documents sharing a class
                sumlist[self.labels[indx]] = np.sum(self.tdm[self.labels[indx]])  # running total weight of the class (a scalar)
            self.tdm = self.tdm / sumlist  # P(x|yi)
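            # After the division each row of tdm sums to 1, so row yi holds
            # the per-word conditional probabilities P(x|yi) for class yi.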
    
        # Map a test document onto the current vocabulary
        def map2vocab(self, testdata):
            self.testset = np.zeros([1, self.vocablen])
            for word in testdata:
                if word in self.vocabulary:  # skip out-of-vocabulary words instead of raising ValueError
                    self.testset[0, self.vocabulary.index(word)] += 1
    
        # Output the predicted class
        def predict(self, testset):
            if np.shape(testset)[1] != self.vocablen:
                print("input dimension does not match the vocabulary")
                sys.exit(0)
            predvalue = 0
            predclass = ""
            # rows of tdm are indexed by class label, matching the keys of Pcates
            for tdm_vect, keyclass in zip(self.tdm, self.Pcates):
                # score each class by P(x|yi)P(yi)
                temp = np.sum(testset * tdm_vect * self.Pcates[keyclass])
                if temp > predvalue:
                    predvalue = temp
                    predclass = keyclass
            return predclass
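
    A minimal usage sketch of the class above (the call sequence follows the
    methods defined in the post; the test sentence is assumed for illustration):

    if __name__ == '__main__':
        dataSet, listClasses = loadDataSet()  # load the sample documents and labels
        nb = NBayes()  # instantiate the classifier
        nb.train_set(dataSet, listClasses)  # build the vocabulary, weights and P(x|yi)
        nb.map2vocab(['love', 'my', 'dalmation'])  # map a test document onto the vocabulary
        print(nb.predict(nb.testset))  # expected output: 0 (not abusive)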
  • Original article: https://www.cnblogs.com/caicaihong/p/5970840.html