zoukankan      html  css  js  c++  java
  • 使用pandas,7行代码实现朴素贝叶斯

    作者:hhh5460

    大抵分成两类

    一、离散的、标签化的数据

    原文没有使用pandas,我使用pandas重新实现了朴素贝叶斯算法,看起来非常简洁、清爽。

    import pandas as pd
    
    # Training set: each row is [a1, a2, C]. Ten samples have C = 0 and ten
    # have C = 1:
    #   C = 0: 3 x (a1=0, a2=0), 3 x (a1=1, a2=0), 4 x (a1=1, a2=1)
    #   C = 1: 5 x (a1=0, a2=0), 2 x (a1=1, a2=0), 3 x (a1=1, a2=1)
    data = [[0, 0, 0],
            [0, 0, 0],
            [0, 0, 0],
            [1, 0, 0],
            [1, 0, 0],
            [1, 0, 0],
            [1, 1, 0],
            [1, 1, 0],
            [1, 1, 0],
            [1, 1, 0],
            [0, 0, 1],
            [0, 0, 1],
            [0, 0, 1],
            [0, 0, 1],
            [0, 0, 1],
            [1, 0, 1],
            [1, 0, 1],
            [1, 1, 1],
            [1, 1, 1],
            [1, 1, 1]]

    df = pd.DataFrame(data, columns=['a1', 'a2', 'c'])

    # Class priors, estimated as relative frequencies:
    #   P(C = 0) = 0.5
    #   P(C = 1) = 0.5
    pc = df['c'].value_counts() / df['c'].size

    # Conditional probability of every feature value given the class.
    # For the data above:
    #   P(a1 = 0 | C = 0) = 0.3    P(a1 = 1 | C = 0) = 0.7
    #   P(a2 = 0 | C = 0) = 0.6    P(a2 = 1 | C = 0) = 0.4
    #   P(a1 = 0 | C = 1) = 0.5    P(a1 = 1 | C = 1) = 0.5
    #   P(a2 = 0 | C = 1) = 0.7    P(a2 = 1 | C = 1) = 0.3
    # margins=True appends an 'All' total as the last row and column; each
    # row is divided by its own total (its last element, read by position).
    pa1 = pd.crosstab(df['c'], df['a1'], margins=True).apply(lambda row: row / row.iloc[-1], axis=1)
    pa2 = pd.crosstab(df['c'], df['a2'], margins=True).apply(lambda row: row / row.iloc[-1], axis=1)

    # Test sample x = {a1 = 1, a2 = 1}:
    #   p(x | C = 0) = p(a1 = 1 | C = 0) * p(a2 = 1 | C = 0) = 0.7 * 0.4 = 0.28
    #   p(x | C = 1) = p(a1 = 1 | C = 1) * p(a2 = 1 | C = 1) = 0.5 * 0.3 = 0.15
    x = pd.Series([1, 1], index=['a1', 'a2'])
    # Select the column for the observed value by label (.ix no longer exists
    # in pandas) and drop the trailing 'All' row by position.
    px = pa1.loc[:, x['a1']].mul(pa2.loc[:, x['a2']]).iloc[:-1]

    # Unnormalised posterior P(C) * p(x | C):
    #   P(C = 0) * p(x | C = 0) = 0.5 * 0.28 = 0.14
    #   P(C = 1) * p(x | C = 1) = 0.5 * 0.15 = 0.075
    # so the sample is assigned to class 0.
    # idxmax returns the class label; argmax would return a position.
    res = pc.mul(px).idxmax()
    print(res)
    
    
    

    同样的方法,7行代码解决这里的问题:

    import pandas as pd
    
    # Training set: each row is [symptom, occupation, disease].
    data = [['打喷嚏', '护士', '感冒'],
            ['打喷嚏', '农夫', '过敏'],
            ['头痛', '建筑工人', '脑震荡'],
            ['头痛', '建筑工人', '感冒'],
            ['打喷嚏', '教师', '感冒'],
            ['头痛', '教师', '脑震荡']]

    df = pd.DataFrame(data, columns=['症状', '职业', '疾病'])

    # Class priors: relative frequency of each disease.
    pr = df['疾病'].value_counts() / df['疾病'].size

    # Conditional probability of each symptom / occupation given the disease.
    # margins=True appends an 'All' total as the last row and column; each
    # row is divided by its own total (its last element, read by position).
    pzz = pd.crosstab(df['疾病'], df['症状'], margins=True).apply(lambda row: row / row.iloc[-1], axis=1)
    pzy = pd.crosstab(df['疾病'], df['职业'], margins=True).apply(lambda row: row / row.iloc[-1], axis=1)

    # Test sample: a construction worker who is sneezing.
    x = pd.Series(['打喷嚏', '建筑工人'], index=['症状', '职业'])
    # Look the observed values up by label (.ix no longer exists in pandas)
    # and drop the trailing 'All' row by position.
    px = pzz.loc[:, x['症状']].mul(pzy.loc[:, x['职业']]).iloc[:-1]

    # Unnormalised posterior P(C) * p(x | C); idxmax returns the disease
    # label, whereas argmax would return an integer position.
    res = pr.mul(px).idxmax()
    print(res)
    
    

    二、连续的、非标签化的数据

    1.连续变量,样本足够大。使用区间,标签化

    这里的第二个例子:
    **

    # 检测SNS社区中不真实账号
    
    # 运维人员人工检测过的1万个账号作为训练样本
    
    # 原始数据格式:
    # ['日志数量','好友数量','注册天数','是否使用真实头像','账号类别']
    
    '''可惜,没有真实数据!!!!'''
    # Placeholder training rows in the column order given to the DataFrame
    # below; the "#..." marks where further rows would go.
    data = [
        [3, 0, 120, 1, 1],
        [3, 0, 120, 1, 1],
        [3, 0, 120, 1, 1],
        [3, 0, 120, 1, 1],
        [3, 0, 120, 1, 1],
        [3, 0, 120, 1, 1],
        [3, 0, 120, 1, 1],
        #...
        [3, 0, 120, 1, 1],
    ]

    df = pd.DataFrame(
        columns=['日志数量', '好友数量', '注册天数', '是否使用真实头像', '账号类别'],
        data=data,
    )

    # Class priors: the frequency of every account class in the training set.
    # The figures quoted with the original example (not reproduced by the
    # placeholder rows above) are:
    #   P(C=0) = 8900/10000 = 0.89
    #   P(C=1) = 1100/10000 = 0.11
    pr = df['账号类别'].value_counts().div(len(df))

    # Derive the two ratio features
    # ['日志数量/注册天数', '好友数量/注册天数']:
    # posts per registered day and friends per registered day.
    df['日志数量/注册天数'] = df['日志数量'] / df['注册天数']
    df['好友数量/注册天数'] = df['好友数量'] / df['注册天数']
    
    
    # 把'日志数量/注册天数'分解成[0, 0.05]、(0.05, 0.2)、[0.2, +∞)三个区间
    # 把'好友数量/注册天数'分解成[0, 0.1]、(0.1, 0.8)、[0.8, +∞)三个区间
    
    # 打标签函数(根据 x 值所在的区间)
    def depart(x, low, high):
        """Map a continuous value to one of three interval labels.

        Returns 0 when x <= low, 2 when x >= high, and 1 otherwise. The
        lower-bound test is made first, so it wins if both conditions hold.
        """
        if x <= low:
            return 0
        return 2 if x >= high else 1
            
    # 打标签
    # Turn the two ratio columns into interval labels (0, 1 or 2) with
    # depart(); the avatar flag is used directly as the third feature.
    df['特征1'] = df['日志数量/注册天数'].apply(depart, args=(0.05, 0.2))
    df['特征2'] = df['好友数量/注册天数'].apply(depart, args=(0.1, 0.8))
    df['特征3'] = df['是否使用真实头像']

    # Conditional probability of every feature label given the account class.
    # margins=True appends an 'All' total as the last row and column; each
    # row is divided by its own total (its last element, read by position).
    ptz1 = pd.crosstab(df['账号类别'], df['特征1'], margins=True).apply(lambda row: row / row.iloc[-1], axis=1)
    ptz2 = pd.crosstab(df['账号类别'], df['特征2'], margins=True).apply(lambda row: row / row.iloc[-1], axis=1)
    ptz3 = pd.crosstab(df['账号类别'], df['特征3'], margins=True).apply(lambda row: row / row.iloc[-1], axis=1)

    # Test sample, given as raw (unlabelled) feature values.
    x_ = pd.Series([0.1, 0.2, 0], index=['日志数量/注册天数', '好友数量/注册天数', '是否使用真实头像'])

    # Label the test sample the same way as the training rows. Values are
    # read by index label rather than by position, and the avatar flag is
    # cast back to int because x_ stores every value as a float.
    x = pd.Series(
        [depart(x_['日志数量/注册天数'], 0.05, 0.2),
         depart(x_['好友数量/注册天数'], 0.1, 0.8),
         int(x_['是否使用真实头像'])],
        index=['特征1', '特征2', '特征3'])

    # p(x | C): product of the per-feature conditional probabilities, without
    # the trailing 'All' row. Columns are selected by label (.ix no longer
    # exists in pandas).
    # NOTE: a label that never occurs in the training data has no column in
    # the crosstab, so the lookup raises KeyError for such a sample.
    px = ptz1.loc[:, x['特征1']].mul(ptz2.loc[:, x['特征2']]).mul(ptz3.loc[:, x['特征3']]).iloc[:-1]

    # Unnormalised posterior P(C) * p(x | C); idxmax returns the class label,
    # whereas argmax would return an integer position.
    res = pr.mul(px).idxmax()
    print(res)
    
    
    

    2.连续变量,样本太小,无法划分区间。

    假设符合正态分布,先求出按类的均值,方差,再代入密度函数
    这里的第三个例子:
    **

    import pandas as pd
    
    # 关于处理连续变量的另一种方法
    
    
    # 下面是一组人类身体特征的统计资料
    from math import exp, pi, sqrt

    # Training set: [sex, height (feet), weight (pounds), foot size (inches)].
    data = [['男', 6, 180, 12],
            ['男', 5.92, 190, 11],
            ['男', 5.58, 170, 12],
            ['男', 5.92, 165, 10],
            ['女', 5, 100, 6],
            ['女', 5.5, 150, 8],
            ['女', 5.42, 130, 7],
            ['女', 5.75, 150, 9]]

    df = pd.DataFrame(data, columns=['性别', '身高(英尺)', '体重(磅)', '脚掌(英寸)'])

    # Question: a person is 6 feet tall, weighs 130 pounds and has 8-inch
    # feet. Is this person male or female?
    x = pd.Series([6, 130, 8], index=['身高(英尺)', '体重(磅)', '脚掌(英寸)'])

    # The difficulty:
    # 1. the features are continuous;
    # 2. the sample is too small to be split into intervals.
    #
    # Approach: assume height, weight and foot size are normally distributed
    # within each sex, estimate the mean and variance per class from the
    # sample, and evaluate the resulting density at the observed value.

    # Only the numeric feature columns are aggregated; including the string
    # column '性别' makes mean()/var() fail on current pandas.
    mean_male = df.loc[df['性别'] == '男', x.index].mean()
    var_male = df.loc[df['性别'] == '男', x.index].var()

    mean_formale = df.loc[df['性别'] == '女', x.index].mean()
    var_formale = df.loc[df['性别'] == '女', x.index].var()

    # One row per feature, one column per statistic.
    df2 = pd.concat((x, mean_male, var_male, mean_formale, var_formale), axis=1, keys=['x', 'mean_male', 'var_male', 'mean_formale', 'var_formale'])


    def f(x, mean, var):
        """Return the normal density with the given mean and variance at x.

        f(x) = exp(-(x - mean)**2 / (2 * var)) / sqrt(2 * pi * var)
        """
        return exp(-(x - mean) ** 2 / (2 * var)) / sqrt(2 * pi * var)


    # Density of each observed feature value under each class. The columns
    # are selected with a list and the function is applied row by row
    # (axis=1), so each call receives one feature's x, mean and variance.
    df2['px_male'] = df2[['x', 'mean_male', 'var_male']].apply(lambda row: f(row['x'], row['mean_male'], row['var_male']), axis=1)
    df2['px_formale'] = df2[['x', 'mean_formale', 'var_formale']].apply(lambda row: f(row['x'], row['mean_formale'], row['var_formale']), axis=1)

    # Class priors.
    pr = df['性别'].value_counts() / df['性别'].size

    # Prediction: prior times the product of the per-feature densities for
    # each class; idxmax returns the label of the larger value.
    res = pd.Series([df2['px_male'].prod() * pr['男'], df2['px_formale'].prod() * pr['女']], index=['男', '女']).idxmax()
    print(res)
    
    
    
    
  • 相关阅读:
    10.cocos2dx C++为Sprite添加触摸事件监听器
    9.多彩的幕布layer
    8.ZOrder
    7.cocos精灵创建和绘制
    6.cocos2d设置定时器
    5.cocos2d锚点
    4.cocos场景和层的调用
    文件导入导出
    两个整数相乘是否超限
    倒置字符串函数reverse
  • 原文地址:https://www.cnblogs.com/hhh5460/p/6417880.html
Copyright © 2011-2022 走看看