zoukankan      html  css  js  c++  java
  • 重写轮子之 GaussianNB

    我仿照 scikit-learn 中 GaussianNB 的结构, 重写了该算法的轮子, 命名为 MyGaussianNB, 如下:

    # !/usr/bin/python
    # -*- coding:utf-8 -*-
    
    """
    Reimplement the Gaussian naive Bayes algorithm as a practice exercise.
    """
    
    # Author: 相忠良(Zhong-Liang Xiang) <ugoood@163.com>
    # Finished at June 3, 2017
    
    import numpy as np
    from sklearn import datasets, cross_validation
    import math
    import matplotlib.pyplot as plt
    from sklearn import naive_bayes
    
    
    def load_data():
        """
        Load the iris dataset and split it into training and test halves.

        Returns
        -------
        The value of ``cross_validation.train_test_split`` called with the iris
        features and targets, ``test_size=0.50`` and ``random_state=0``; the
        caller in this file unpacks it as (X_train, X_test, y_train, y_test).
        """
        iris = datasets.load_iris()
        features, labels = iris.data, iris.target
        # A fixed random_state keeps the split reproducible between runs.
        return cross_validation.train_test_split(
            features, labels, test_size=0.50, random_state=0)
    
    
    class MyGaussianNB(object):
        """
        Gaussian naive Bayes classifier modelled on scikit-learn's GaussianNB.

        Every feature is assumed to be normally distributed and independent of
        the other features within each class.  ``fit`` estimates the class
        priors and the per-class mean / standard deviation of every feature;
        ``predict`` returns, for each sample, the class with the largest log
        joint likelihood.

        Labels may be any sortable values (ints, strings, ...); they do not
        have to be pre-encoded as 0, 1, 2, ...

        Attributes (populated by ``fit``)
        ---------------------------------
        classes_ : ndarray, shape (n_classes,)
            Sorted unique labels.  All per-class attributes follow this order.
        class_count_dic : dict {label: number of training samples}
        class_count_arr : list of int, in ``classes_`` order
        class_prior_dic : dict {label: prior probability}
        class_prior_arr : list of float, in ``classes_`` order
        theta_ : list of ndarray, each of shape (n_features,)
            Per-class mean of every feature.
        sigma_ : list of ndarray, each of shape (n_features,)
            Per-class standard deviation of every feature (after smoothing).
        predict_label : ndarray
            Result of the most recent ``predict`` call.
        """

        # Fraction of the largest feature variance that is added to every
        # variance so that a constant feature cannot yield sigma == 0.
        VAR_SMOOTHING = 1e-9

        def __init__(self):
            # State lives on the instance so that separate classifiers can
            # never share (and accidentally mutate) the same containers.
            self.classes_ = np.array([])

            self.class_prior_dic = {}
            self.class_prior_arr = []

            self.class_count_dic = {}
            self.class_count_arr = []

            self.theta_ = []
            self.sigma_ = []

            self.predict_label = []  # final predictions of the last predict()

        def fit(self, X, y):
            """
            Fit Gaussian naive Bayes according to X, y

            Parameters
            ----------
            X : array-like, shape(n-samples, m-features)
                X part of training data
            y : array-like, shape(n-samples,)
                labels of training data

            Returns:
            --------
            self : object
                Return self.

            Raises
            ------
            ValueError
                If X is not 2-dimensional, is empty, or X and y disagree on
                the number of samples.
            """
            X = np.asarray(X, dtype=float)
            y = np.asarray(y).ravel()

            if X.ndim != 2:
                raise ValueError(
                    "X must be 2-dimensional, got %d dimension(s)" % X.ndim)
            if X.shape[0] != y.shape[0]:
                raise ValueError(
                    "X and y have inconsistent numbers of samples: %d != %d"
                    % (X.shape[0], y.shape[0]))
            if X.shape[0] == 0 or X.shape[1] == 0:
                raise ValueError("cannot fit on an empty training set")

            # np.unique returns the labels sorted, which fixes one canonical
            # class order shared by the priors, the counts, theta_ and sigma_.
            classes, counts = np.unique(y, return_counts=True)
            labels = classes.tolist()
            n_samples = float(y.shape[0])

            self.classes_ = classes
            self.class_count_arr = [int(n) for n in counts]
            self.class_count_dic = dict(zip(labels, self.class_count_arr))
            self.class_prior_arr = [n / n_samples for n in self.class_count_arr]
            self.class_prior_dic = dict(zip(labels, self.class_prior_arr))

            # Use the arguments of this call, never module-level globals.
            self.__cal_theta_sigma_arr(X, y)
            return self

        def predict(self, X):
            """
            Predict class labels of X

            Parameters
            ----------
            X : array-like, shape(n-samples, m-features)
                X a set of test data

            Returns:
            --------
            ndarray, shape(n-samples,)
                The predicted label of every sample (also stored in
                ``self.predict_label``).

            Raises
            ------
            ValueError
                If the classifier has not been fitted, or X is not a 2-D array
                with the same number of features as the training data.
            """
            if len(self.theta_) == 0:
                raise ValueError(
                    "this MyGaussianNB instance is not fitted yet; "
                    "call fit() first")

            X = np.asarray(X, dtype=float)
            theta = np.asarray(self.theta_)
            sigma = np.asarray(self.sigma_)
            if X.ndim != 2 or X.shape[1] != theta.shape[1]:
                raise ValueError(
                    "X must have shape (n_samples, %d)" % theta.shape[1])

            # Work with sums of logs: the product of many tiny densities
            # underflows to 0, while the log joint likelihood stays finite.
            # The larger the sum, the more probable the class.
            joint_log_likelihood = []
            for k in range(len(self.classes_)):
                log_density = self.__log_gaussian(X, theta[k], sigma[k])
                joint_log_likelihood.append(
                    log_density.sum(axis=1) + np.log(self.class_prior_arr[k]))

            # Rows are classes, columns are samples: take the best row per
            # column, then translate that row position back to the real label.
            best = np.argmax(np.array(joint_log_likelihood), axis=0)
            self.predict_label = self.classes_[best]

            return self.predict_label

        def score(self, X, y):
            """
            Return the accuracy of a set of predictions.

            Note that, unlike scikit-learn's ``score``, the first argument is
            NOT a feature matrix.

            Parameters
            ----------
            X : array-like, shape(n-samples,)
                Predicted labels (e.g. the result of ``predict``).
            y : array-like, shape(n-samples,)
                True labels.

            Returns:
            --------
            float
                Fraction of positions where X and y agree.

            Raises
            ------
            ValueError
                If X and y have different shapes.
            ZeroDivisionError
                If X and y are empty.
            """
            predicted = np.asarray(X)
            actual = np.asarray(y)
            if predicted.shape != actual.shape:
                raise ValueError(
                    "predictions and labels have different shapes: %s != %s"
                    % (predicted.shape, actual.shape))

            n_correct = int(np.count_nonzero(predicted == actual))
            return 1.0 * n_correct / predicted.size

        # Private: per-class mean (theta) and standard deviation (sigma) of
        # every feature column.
        def __cal_theta_sigma_arr(self, X, y):
            """
            Compute and store ``theta_`` and ``sigma_`` for the classes listed
            in ``self.classes_``.

            Returns the pair (theta_arr, sigma_arr), each a list with one
            ndarray of shape (n_features,) per class.
            """
            X = np.asarray(X, dtype=float)
            y = np.asarray(y).ravel()

            # Variance smoothing in the style of scikit-learn's GaussianNB:
            # a feature that is constant inside a class (very likely with a
            # tiny training set) would otherwise give sigma == 0 and turn the
            # density into a division by zero.
            epsilon = self.VAR_SMOOTHING * np.var(X, axis=0).max()
            if not epsilon > 0:
                # Every feature is constant over the whole training set.
                epsilon = self.VAR_SMOOTHING

            theta_arr = []
            sigma_arr = []
            for label in self.classes_:
                rows = X[y == label]  # boolean row mask for this class
                theta_arr.append(np.mean(rows, axis=0))
                sigma_arr.append(np.sqrt(np.var(rows, axis=0) + epsilon))

            self.theta_ = theta_arr
            self.sigma_ = sigma_arr
            return theta_arr, sigma_arr

        # Gaussian function
        def __Gaussion_function(self, x, theta, sigma):  # private method
            """Return the normal density N(x; theta, sigma**2)."""
            return (np.exp(-(x - theta) ** 2 / (2.0 * sigma ** 2))
                    / (np.sqrt(2 * np.pi) * sigma))

        def __log_gaussian(self, x, theta, sigma):  # private method
            """
            Return log N(x; theta, sigma**2), computed analytically so that it
            stays finite where exp() in the plain density would underflow.
            """
            return (-0.5 * np.log(2.0 * np.pi) - np.log(sigma)
                    - (x - theta) ** 2 / (2.0 * sigma ** 2))
    
    
    X_train, X_test, y_train, y_test = load_data()
    MGN = MyGaussianNB()
    MGN.fit(X_train, y_train)
    
    a = MGN.predict(X_test)
    b = np.array(a)
    # print b
    # print X_test
    
    print '预测值: ', a
    print '实际值: ', y_test
    print a == y_test
    print 'MyGaussionNB 预测正确率: ', MGN.score(MGN.predict_label, y_test)
    
    # sk-learn 中的 GaussionNB 的性能, 且和我的实现 比较一下, 验证我的 implementation 的正确性.
    cls = naive_bayes.GaussianNB()
    cls.fit(X_train, y_train)
    result = cls.predict(X_test)
    print 'sklearn 的 GaussionNB 预测正确率: ', MGN.score(result, y_test)
    # 结果几乎完全一致,  但在 test_size=0.95 及 训练集更小时, 我的程序会出现问题 !
    
    
    '''
    下面是编程过程中留下的经验
    '''
    
    # 重要1:  判断column value真假,用mask,取想要rows的方法
    # row_mask = np.array(a[:, -1] == 0, dtype=bool)
    # print a[row_mask, :]
    # print np.mean(a[mask, :], axis=0)
    
    # 重要2:  提取字典的keys集合和values集合
    # print MGN.class_count_dic.keys()
    # print MGN.class_count_dic.values()
    
    # 重要3: 用 np.log(x)相加的形式, 因为有的概率值特别特别小, 导致后验概率为0
    #       log sum 越大, post 概率 越大!
    # 注意: 我们并未采用 -np.log(x)的形式
    
    # 重要4: Numpy中找出array中最大值所对应的行和列
    # a = np.array([[.5, 2, 0],
    #               [5, 3, 6],
    #               [.5, 1, 0]])
    #
    # re = np.where(a == np.max(a[:,1]))   a 中索引为 1 的列(即第二列)的最大元素 在 a 中的坐标
    # print re
    
    # 重要5: 找出 列 or 行 的最大值索引 np.argmax(a,0), a 是矩阵, 0:列, 1:行
    
    # 重要6: 必须要有用于测试的小数据, 来探测每一个func的正确性.
    #        下面是我用来测试的小矩阵. 不仅仅测试自己编写的函数,
    #        还得对numpy, python 中的函数探测其功能和使用方法.
    # a = np.array([[.5, 2, 0],
    #               [.25, 3, 6],
    #               [.51, 1, 0]])
    #
    # b = np.array([11, 22, 33])
    # aa = np.array(zip(a, b))
    #
    # cc = [True, False, True, False, True, True]
    # print cc.__len__()
    # print cc.count(True)
    
    ################################################
    # 以下内容是我编程过程中用于测试和探查的各种乱七八糟的代码
    #
    # 我抛弃了这种做法----->: Gaussion这种东西, 算出的值 极有可能非常小, 得用 -log 相加 处理.
    # log sum 越小, post 概率 越大!
    # 取而代之的是------->: 直接 log后 相加, 取和的最大值的 为 那个样例 应得的标签.
    
    # def Gaussion_function(x, u, sig):
    #     return np.exp(-(x - u) ** 2 / (2 * sig ** 2)) / (math.sqrt(2 * math.pi) * sig)
    
    # x1 = Gaussion_function(6., 5.006, 0.34894699)
    # x2 = Gaussion_function(2.2, 3.418, 0.37719491)
    # x3 = Gaussion_function(4., 1.464, 0.17176728)
    # x4 = Gaussion_function(1., 0.244, 0.10613199)
    
    # x1 = Gaussion_function(5., 4.99574468, 0.35247299)
    # x2 = Gaussion_function(2.2, 3.418, 0.37719491)
    # x3 = Gaussion_function(4., 1.464, 0.17176728)
    # x4 = Gaussion_function(1., 0.244, 0.10613199)
    
    # print 'x1 ', x1
    # print 'x2 ', x2
    # print 'x3 ', x3
    # print 'x4 ', x4
    #
    # x1 = -np.log(x1)
    # x2 = -np.log(x2)
    # x3 = -np.log(x3)
    # x4 = -np.log(x4)
    #
    # pc = -np.log(0.33098591549295775)
    
    # print 'x1 ', x1
    # print 'x2 ', x2
    # print 'x3 ', x3
    # print 'x4 ', x4
    # print 'pc ', pc
    # print "x1-x4 log sum:", x1 + x2 + x3 + x4 + pc
    #
    # print MGN.class_prior_dic
    # print MGN.theta_
    # [array([ 5.006,  3.418,  1.464,  0.244,  0.   ]),
    # array([ 5.93469388,  2.78163265,  4.26530612,  1.33265306,  1.        ]),
    # array([ 6.60408163,  2.97755102,  5.56122449,  2.01836735,  2.        ])]
    #
    
    # print MGN.sigma_
    # [array([ 0.34894699,  0.37719491,  0.17176728,  0.10613199,  0.        ]),
    # array([ 0.51608851,  0.30282579,  0.4684107 ,  0.19207541,  0.        ]),
    # array([ 0.62562921,  0.32151764,  0.54802664,  0.26929497,  0.        ])]
    
    # print 'x=0, 均值为0, 方差为1', Gaussion_function(0, 0, 1)
    
    # x1 = Gaussion_function(5.8, 5.006, 0.34894699)
    # x2 = Gaussion_function(2.8, 3.418, 0.37719491)
    # x3 = Gaussion_function(5.1, 1.464, 0.17176728)
    # x4 = Gaussion_function(2.4, 0.244, 0.10613199)
    # print 'x1 ', x1
    # print 'x2 ', x2
    # print 'x3 ', x3
    # print 'x4 ', x4
    # print "x1-x4乘积:", x1 * x2 * x3 * x4 * 1.0  # x1-x4乘积: 2.53455055621e-188
    
    # print X_test
    # print MGN.class_prior_dic.keys()
    
    # print "标准差", MGN.sigma_
    # print "均值", MGN.theta_
    
    # print MGN.class_count_dic
    # print MGN.class_prior_dic
    # print 'theta: ', MGN.theta_[0][0]
    # print 'sigma: ', MGN.sigma_
    # print a.__len__()
    # print len(X_test)*4
    # print MGN.theta_
    # print X_train
    # print np.mean(X_train, axis=0)
    
    # print MGN.class_count_dic.keys()
    
    # print 'x_train: ', X_train
    # print 'y_train: ', y_train
    #
    # print 'MGN.class_prior_arr: ', MGN.class_prior_arr
    # print 'MGN.class_prior_dic: ', MGN.class_prior_dic
    # print 'MGN.class_count_arr: ', MGN.class_count_arr
    # print 'MGN.class_count_dic: ', MGN.class_count_dic
    
    # print np.argmax(a, 0)
    #
    # re = np.where(a == np.max(a[:, 0]))
    # print re
    # print int(re[0])
    
    # print a[[True,False,True],:]
    
  • 相关阅读:
    linux查找某个命令属于哪个rpm包
    dashboard安装
    yum下载的rpm包离线安装
    jQuery的选择器
    jQuery介绍
    client、offset、scroll系列
    BOM
    js 中的定时器
    JS中的面相对象
    关于DOM操作的相关案例
  • 原文地址:https://www.cnblogs.com/ZhongliangXiang/p/7357167.html
Copyright © 2011-2022 走看看