zoukankan      html  css  js  c++  java
  • 机器学习之python: kNN

      1 ##################################################
      2 # kNN : k Nearest Neighbour
      3 # Author : Monne
      4 # Date : 2015-01-24
      5 # Email : 416606639@qq.com
      6 ##################################################
      7 import numpy as np
      8 import time
      9 starttime = time.time()
     10 
     11 """ too long , equal to classify()
     12 def distance(xVec, yVec):
     13     # 1. attain distance from xVec and yVec
     14     x = np.array(xVec); y = np.array(yVec) # x = array([1,2,3]), y = array([2,3,4])
     15     diff = x - y # x - y = array([-1, -1, -1])
     16     diff2 = diff ** 2 # diff2 = diff**2 = array([1, 1, 1])
     17     sumdiff2 = sum(diff2) # sumdiff2 = sum(diff2) = 3
     18     sqrtsumdiff2 = sumdiff2 ** 0.5 # 3 ** 0.5 = 1.732...
     19     return sqrtsumdiff2
     20 
     21 def disttest(testx, trainx):
     22     # attain all the distance between testx and trainx[i]
     23     # from distx {ID: distance}
     24     distx = {}
     25     numsample = len(trainx)
     26     for i in range(numsample):
     27         distx[i] = distance(testx, trainx[i])
     28     return distx
     29 
     30 def sort(testx, trainx):
     31     # sort distx {ID: distance}
     32     # return IDk
     33     distx = disttest(testx, trainx)
     34     sortitems = sorted(distx.iteritems(), key = lambda d:d[1]) # list
     35     IDk = []; distances = []
     36     l = len(trainx)
     37     for i in range(l):
     38         IDk.append(sortitems[i][0]) # ID
     39         distances.append(sortitems[i][1]) # distance
     40     #print "distances = ", distances[:5]
     41     return IDk
     42 
     43 def majorcount(testx, trainx, trainy, k):
     44     IDk = sort(testx, trainx)
     45     sorty = {} # dist(y, count)
     46     #l = len(trainx)
     47     for i in range(k):
     48         sorty[trainy[IDk[i]]] = sorty.get(trainy[IDk[i]], 0) + 1
     49     sorty = sorted(sorty.iteritems(), key = lambda d:d[1], reverse = True)  # list
     50     #print "sorty = ",sorty
     51     return sorty[0][0]
     52 
     53 def kNN(testx, trainx, trainy, k):
     54     # given testx, trainx, trainy, k
     55     # return predict y
     56     c = classify(testx, trainx, trainy, k)
     57     print "the classifier came back: % r" % c
     58     return c
     59 """
     60 
     61 
     62 # step 1. data input
     63 def testsample():
     64     trainx = [[1.0, 1.1],
     65                     [1.0, 1.0],
     66                     [0, 0],
     67                     [0, 0.1]]
     68     trainy = ['A', 'A', 'B', 'B']
     69     return trainx, trainy
     70 
     71 def txt2trainxy(filename):
     72     # 1.read from file
     73     # 2.attain dataset: trainx and trainy
     74     fr = open( filename +'.txt')
     75     trainx = []; trainy = []
     76     for line in fr.readlines():
     77         l = line.split()
     78         trainx.append(map(float,l[: -1]))
     79         trainy.append(int(l[-1]))
     80     return trainx,trainy
     81 
     82 def img2trainxy(filename):
     83     trainx = []; trainy = []
     84     from os import listdir
     85     fl = listdir(filename) # fr = ['0_2.txt','0_1.txt']
     86     for name in fl: # name = '0_2.txt'
     87         trainy.append(int(name[0])) # name[0] = '0', int(name[0]) = int('0') = 0
     88         fr = open(filename + '/' + name) # open('0_2.txt')
     89         tx = []
     90         for line in fr.readlines(): # line = '001100
    '
     91             tx.extend(line.strip()) # line.strip() = '001100', tx = ['0','0,'1','1',...]
     92         trainx.append(map(int, tx)) # map(int, tx) = [0,0,1,1,...]
     93     return trainx, trainy
     94 
     95 # step 2. data transform
     96 def norm(trainx):
     97     max = np.array(trainx).max(0) # max(0) = max(axis = 0)
     98     min = np.array(trainx).min(0) 
     99     diff = max - min
    100     ntrainx = (np.array(trainx) - min) / map(float, diff)
    101     return ntrainx.tolist(), min, map(float, diff)
    102 
    103 
    104 # step 3. classify function
    105 def classify(testx, trainx, trainy, k):
    106     diff = np.array(trainx) - np.array(testx)
    107     diff2 = diff ** 2
    108     sumdiff2 = diff2.sum(axis = 1)
    109     sqrt = sumdiff2 ** 0.5
    110     IDs = sqrt.argsort() # sorted index 
    111     sorty = {} # (y, count)
    112     for i in range(k):
    113         key = trainy[IDs[i]]
    114         sorty[key] = sorty.get(key, 0) + 1
    115     return sorted(sorty.iteritems(), key = 
    116         lambda d:d[1], reverse = True)[0][0]
    117 
    118 
    119 # step 4. test for error rate
    120 def testkNN(testratio, trainx, trainy, k):
    121     l = int(len(trainx) * testratio)
    122     errorcount = 0
    123     for i in range(l):
    124         c =  classify(trainx[i], trainx[l:], trainy[l:], k)
    125         #print "the classifier came back: % r, the real answer is: %r" % (c, trainy[i])
    126         if c != trainy[i]:
    127             errorcount += 1
    128     print "the total error rate is: %f." % (errorcount / float(l))
    129     #return (errorcount / float(l))
    130 
    131 def randomtestkNN(testratio, trainx, trainy, k):
    132     import random
    133     m = len(trainx); l = int(m * 0.1)
    134     testx = []; testy = []; s = []
    135 
    136     # random choose k number in [0,l)
    137     s = random.sample(range(m), l); b = list(set(range(m)) - set(s))
    138     testx = [trainx[i] for i in s]
    139     testy = [trainy[i] for i in s]
    140     trainx = [trainx[i] for i in b]
    141     trainy = [trainy[i] for i in b]
    142     """
    143     for i in range(l):
    144         s = random.randint(0, m - 1) #[0,m] include m and maybe repeat
    145         dels.append(s)
    146         testx.append(trainx[s])
    147         testy.append(trainy[s])
    148     trainx = [trainx[i] for i in range(m) if i not in dels]
    149     trainy = [trainy[i] for i in range(m) if i not in dels]
    150     """
    151 
    152     errorcount = 0
    153     for i in range(l):
    154         c =  classify(testx[i], trainx, trainy, k)
    155         #print "the classifier came back: % r, the real answer is: %r" % (c, trainy[i])
    156         if c != testy[i]:
    157             errorcount += 1
    158     print "the total error rate is: %f." % (errorcount / float(l))
    159     return (errorcount / float(l))
    160 
    161 def avg():
    162     a = []
    163     for i in range(1,10):
    164         #print i
    165         a.append(handwriting('trainingDigits', 'testDigits', i))
    166     a = np.array(a)
    167     print a
    168     print a.argsort()
    169     # k = 4, errormin = 0.03
    170 
    171 
    172 # step 5_1 small sample
    173 def sample(k):
    174     trainx, trainy = testsample()
    175     testkNN(trainx, trainy, k)
    176 
    177 
    178 # step 5_2. use for dating web site
    179 def datingwebsite(filename, k):
    180     ## step 1: load data
    181     print "step 1: load data..."
    182     trainx, trainy = txt2trainxy(filename) # must str like 'datingTestSet2', not datingTestSet2
    183     trainx, min, diff = norm(trainx)
    184 
    185 
    186     ## step 2: training...
    187     print "step 2: training..."
    188     pass
    189 
    190 
    191     ## step 3: testing...
    192     print "step 3: testing..."
    193     randomtestkNN(0.10, trainx, trainy, k)
    194     #testkNN(0.10, trainx, trainy, k)
    195     print "time cost: ", (time.time() - starttime)
    196     
    197 
    198     ## step 4: show the result...
    199     print "step 4: show the result..."
    200     resultList = ['not at all', 'in small doses', 'in large doses']
    201     percentTats = float(raw_input(
    202                     "percentage of time spent playing video games?> "))
    203     ffMiles = float(raw_input("frequent flier miles earned per year?> "))
    204     iceCream = float(raw_input("liters of ice cream consumed per year?> "))
    205     classx = (np.array([ffMiles, percentTats, iceCream]) - min) / diff
    206     classy = classify(classx, trainx, trainy, k)
    207     print "You will probably like this person: ", resultList[classy - 1]
    208 
    209     return (errorcount / float(l))
    210 
    211 
    212 # step 5_3. use for hand writing
    213 def handwriting(trainfile, testfile, k):
    214     ## step 1: load data... 
    215     print "step 1: load data..."
    216     print "---Getting training set..."
    217     trainx, trainy = img2trainxy(trainfile)
    218     print "---Geting testing set..."
    219     testx, testy = img2trainxy(testfile)
    220     m = len(trainx)
    221     print m, len(trainx[0])
    222     print len(testx), len(testx[0])
    223 
    224     # random choose trainx
    225     print "---Random choosing the training data..."
    226     import random
    227     n = random.randint(0, m - 1) # random numbers
    228     s = random.sample(range(m), n) # random samples
    229     trainx = [trainx[i] for i in s]
    230     trainy = [trainy[i] for i in s]
    231     print "---the numbers of training data is: ", n
    232 
    233 
    234     ## step 2: training...
    235     print "step 2: training..."
    236     pass
    237 
    238 
    239     ## step 3: testing...
    240     print "step 3: testing..."
    241     l = len(testx)
    242     errorcount = 0
    243     for i in range(l):
    244         c =  classify(testx[i], trainx, trainy, k)
    245         #print "the classifier came back: % r, the real answer is: %r" % (c, trainy[i])
    246         if c != testy[i]:
    247             errorcount += 1
    248     print "the total error rate is: %f." % (errorcount / float(l))
    249     print "time cost: ", (time.time() - starttime)
    250     
    251 
    252     ## step 4: show the result...
    253     print "step 4: show the result..."
    254     pass
    255 
    256     return (errorcount / float(l))
    257 
    258 
    259 
    260 
    261 #datingwebsite('datingTestSet2', 4)
    262 
    263 handwriting('trainingDigits', 'testDigits', 3)    
    264     
    265 #avg()
    View Code
  • 相关阅读:
    (1)、Bash的基本功能
    (3)、shell运算符与正则表达式
    中小规模集群搭建之backup服务(rsync守护进程)
    中小规模集群搭建(拓扑)
    asp.net 后台弹出JS提示框或执行JS方法
    MYSQL外键(Foreign Key)的使用
    直接双击页面元素进行修改的HTML代码
    [原创]Centos7 从零编译配置Memcached
    在XHTML中使用Media Player播放媒体文件
    JQuery插件右键菜单
  • 原文地址:https://www.cnblogs.com/monne/p/4246684.html
Copyright © 2011-2022 走看看