zoukankan      html  css  js  c++  java
  • 2018携程大数据方向在线笔试题

    1.求信息增益:

    import sys
    from math import log2
    
    def read_samples(stream):
        """Read the sample table from *stream*.

        The first line holds the sample count ``n``; each of the following
        ``n`` lines holds a feature value and a class label separated by
        whitespace.

        Returns:
            A list with one list of string fields per sample line.

        Raises:
            ValueError: if the first line is not an integer.
        """
        n = int(stream.readline().strip())
        # split() without an argument tolerates tabs and repeated spaces,
        # which split(" ") would turn into spurious empty fields.
        return [stream.readline().split() for _ in range(n)]


    def _binary_entropy(zeros, total):
        """Return the entropy (in bits) of a two-class split.

        *zeros* is the number of samples labelled '0' out of *total* samples;
        every other sample counts as the second class.
        """
        p_zero = zeros / total
        p_one = (total - zeros) / total
        # A pure group has zero entropy; this also avoids log2(0).
        if p_zero == 0 or p_one == 0:
            return 0.0
        return -(p_zero * log2(p_zero) + p_one * log2(p_one))


    def information_gain(samples):
        """Return the information gain of the feature over the binary label.

        Args:
            samples: sequence of records where index 0 is the feature value
                and index 1 is the label. The label '0' is one class, any
                other label is treated as the second class.

        Returns:
            H(label) - H(label | feature) in bits, or 0.0 when *samples* is
            empty (nothing to split, and no division by zero).

        Raises:
            IndexError: if a record has fewer than two fields.
        """
        total = len(samples)
        if total == 0:
            return 0.0

        zeros_overall = 0
        groups = {}  # feature value -> [count of label '0', group size]
        for sample in samples:
            feature, label = sample[0], sample[1]
            group = groups.setdefault(feature, [0, 0])
            if label == '0':
                zeros_overall += 1
                group[0] += 1
            group[1] += 1

        # Conditional entropy: per-group entropy weighted by group share.
        conditional = sum(
            (size / total) * _binary_entropy(zeros, size)
            for zeros, size in groups.values()
        )
        return _binary_entropy(zeros_overall, total) - conditional


    def main():
        """Read samples from stdin and print the gain with two decimals."""
        print('%.2f' % information_gain(read_samples(sys.stdin)))


    if __name__ == "__main__":
        main()
    
    输入输出如下:

    5
    1 0
    2 0
    2 1
    1 1
    1 1
    0.02
    
    Process finished with exit code 0

     2.求KL距离

    # -*- coding:utf-8 -*-
    """
        @author:Tanshoudong
        @file: test.py.py
        @time: 2018/09/05
        Ml items - the current project name

        Description :
            Reads two whitespace-separated token lines from stdin, turns each
            line into relative token frequencies, and prints the sum of
            p * log2(p / q) over the tokens of the first line, formatted to
            two decimal places.
    """
    import sys
    import math
    
    def _relative_frequencies(tokens):
        """Map each distinct token to its share of *tokens* (empty -> {})."""
        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
        total = len(tokens)
        return {token: count / total for token, count in counts.items()}


    def kl_divergence(p_tokens, q_tokens):
        """Return the KL divergence D(P || Q) in bits.

        P and Q are the empirical token distributions of *p_tokens* and
        *q_tokens*.

        Returns:
            The sum of p * log2(p / q) over the tokens of P. If P contains a
            token that never occurs in Q the divergence is unbounded, so
            ``math.inf`` is returned instead of failing with a KeyError.
            An empty *p_tokens* yields 0.0.
        """
        p_dist = _relative_frequencies(p_tokens)
        q_dist = _relative_frequencies(q_tokens)
        divergence = 0.0
        for token, p_prob in p_dist.items():
            q_prob = q_dist.get(token)
            if q_prob is None:
                # Q assigns zero probability to something P can produce.
                return math.inf
            divergence += p_prob * math.log2(p_prob / q_prob)
        return divergence


    def main():
        """Read two token lines from stdin and print their KL divergence."""
        p_tokens = sys.stdin.readline().strip().split()
        q_tokens = sys.stdin.readline().strip().split()
        print("{:.2f}".format(kl_divergence(p_tokens, q_tokens)))


    if __name__ == "__main__":
        main()

     3.朴素贝叶斯,情感分析

    import math
    import sys
    # Labelled training sentences: positive reviews.
    good_senten = ["high cost performance","Great place","Have a good time","quite special","good place","easy of access",
                   "Very worth a visit", "tickets are cheap", "convenient traffic", "overall feels good", "worth to see",
                   "very convenient traffic", "very fun", "a good place", "The ticket is cost-effective",
                   "the overall feeling is good", "the place worth going", "the view is good", "the overall is not bad",
                   "feel good", "the scenery is very good", "the scenery is not bad", "I like it very much", "Really good",
                   "It's worth seeing", "The ticket is not expensive", "The ticket is very cheap", "The scenery is not bad",
                   "The price is very good", "It's still very good", "The scenery is very good", "The environment is great",
                   "The scenery is OK", "The air is very fresh", "Very worth seeing", "Very worth going", "Good view",
                   "Good traffic"," Value this price", "good value for money"]

    # Labelled training sentences: negative reviews.
    bad_senten = ["low cost performance", "I don't prefer it", "Nothing fun", "Nothing special", "Nothing good-looking",
                  "Traffic is not very convenient", "Nothing special", "Tickets are too expensive", "Traffic inconvenient",
                  "The overall feeling is bad", "Nothing to play", "Never come again", "Nothing to see", "Feeling bad",
                  "Tickets are a bit expensive", "very bad", "The scenery is bad", "There is nothing to watch",
                  "I don't like it", "The price is a bit expensive", "cost performance is too low", "I feel not good",
                  "not high cost performance", "Not very fun", "Tickets are not cheap ", "traffic is not convenient",
                  "the scenery is not good", "nothing to play", "too commercial", "tickets are expensive",
                  "fare is a bit expensive", "nothing to see", "price not cheap", "price is very high", "ticket too expensive",
                  "The traffic is not good", "The ticket is too expensive", "The scenery is very poor", "Not worth the price",
                  "The traffic is bad"]


    def _count_words(sentences):
        """Return (word -> occurrences, total word count) for *sentences*.

        Words are case-sensitive, whitespace-delimited tokens.
        """
        counts = {}
        total = 0
        for sentence in sentences:
            for word in sentence.split():
                total += 1
                counts[word] = counts.get(word, 0) + 1
        return counts, total


    good_words, ngood = _count_words(good_senten)
    bad_words, nbad = _count_words(bad_senten)


    def _log_likelihood(tokens, word_counts, total):
        """Return the summed log-probability of *tokens* under one class.

        A word never seen in the class is scored as if it occurred once
        (probability 1 / total), so it never produces log(0).
        """
        return sum(math.log(word_counts.get(token, 1) / total) for token in tokens)


    def classify(sentence):
        """Classify *sentence* as positive (1) or negative (0).

        Each class is scored by the per-word likelihood of the sentence's
        tokens; no class prior is applied. Scores are compared in log space
        because the raw probability product underflows to 0.0 on long
        inputs, which would make every long sentence tie and come out
        negative.

        Returns:
            1 if the positive class scores strictly higher, otherwise 0.
            Ties, including an empty sentence, are classified as 0.
        """
        # split() drops empty tokens, so blank input or repeated spaces are
        # not scored as extra unseen words.
        tokens = sentence.split()
        good_score = _log_likelihood(tokens, good_words, ngood)
        bad_score = _log_likelihood(tokens, bad_words, nbad)
        return 1 if good_score > bad_score else 0


    def main():
        """Read one sentence from stdin and print its predicted label."""
        print(classify(sys.stdin.readline()))


    if __name__ == "__main__":
        main()
  • 相关阅读:
    Eclipse使用jre的原理与配置
    [笔记]Android开发环境配置及HelloWorld程序
    【足迹C++primer】38、关联容器操作(2)
    Linux ls 命令实现(简化版)
    POJ1149_PIGS(网络流/EK)
    调用存储过程取到数据通过NPOI存到Excel中
    python get post模拟请求
    常见软件应用
    Docker 镜像的导入和导出
    Linux 编译安装R语言
  • 原文地址:https://www.cnblogs.com/tsdblogs/p/9593098.html
Copyright © 2011-2022 走看看