##################################################
# kNN    : k Nearest Neighbour
# Author : Monne
# Date   : 2015-01-24
# Email  : 416606639@qq.com
##################################################
"""k-nearest-neighbour (kNN) classifier with two demo applications.

The module loads training data from text files, optionally rescales the
features, classifies samples by majority vote among the k nearest training
points, and reports hold-out error rates.
"""
import numpy as np
import time

# Reference point for the "time cost" figures printed by the demo functions;
# taken once, when the module is loaded.
starttime = time.time()

# The string below is never executed: it keeps the author's first, longer
# implementation for reference.  It is functionally equal to classify().
""" too long , equal to classify()
def distance(xVec, yVec):
    # 1. attain distance from xVec and yVec
    x = np.array(xVec); y = np.array(yVec) # x = array([1,2,3]), y = array([2,3,4])
    diff = x - y # x - y = array([-1, -1, -1])
    diff2 = diff ** 2 # diff2 = diff**2 = array([1, 1, 1])
    sumdiff2 = sum(diff2) # sumdiff2 = sum(diff2) = 3
    sqrtsumdiff2 = sumdiff2 ** 0.5 # 9 ** 0.5 = 3.0
    return sqrtsumdiff2

def disttest(testx, trainx):
    # attain all the distance between testx and trainx[i]
    # from distx {ID: distance}
    distx = {}
    numsample = len(trainx)
    for i in range(numsample):
        distx[i] = distance(testx, trainx[i])
    return distx

def sort(testx, trainx):
    # sort distx {ID: distance}
    # return IDk
    distx = disttest(testx, trainx)
    sortitems = sorted(distx.iteritems(), key = lambda d:d[1]) # list
    IDk = []; distances = []
    l = len(trainx)
    for i in range(l):
        IDk.append(sortitems[i][0]) # ID
        distances.append(sortitems[i][1]) # distance
    #print "distances = ", distances[:5]
    return IDk

def majorcount(testx, trainx, trainy, k):
    IDk = sort(testx, trainx)
    sorty = {} # dist(y, count)
    #l = len(trainx)
    for i in range(k):
        sorty[trainy[IDk[i]]] = sorty.get(trainy[IDk[i]], 0) + 1
    sorty = sorted(sorty.iteritems(), key = lambda d:d[1], reverse = True) # list
    #print "sorty = ",sorty
    return sorty[0][0]

def kNN(testx, trainx, trainy, k):
    # given testx, trainx, trainy, k
    # return predict y
    c = classify(testx, trainx, trainy, k)
    print "the classifier came back: % r" % c
    return c
"""


# step 1.
# data input
def testsample():
    """Return a tiny hard-coded 2-D data set for quick experiments.

    Returns:
        (trainx, trainy): four 2-D points and their labels.  The two points
        near (1, 1) are labelled 'A'; the two near (0, 0) are labelled 'B'.
    """
    trainx = [[1.0, 1.1],
              [1.0, 1.0],
              [0, 0],
              [0, 0.1]]
    trainy = ['A', 'A', 'B', 'B']
    return trainx, trainy


def txt2trainxy(filename):
    """Load a whitespace-separated data set from ``filename + '.txt'``.

    Every non-blank line holds the feature values followed by an integer
    label in the last column.

    Args:
        filename: path of the data file WITHOUT its '.txt' extension.

    Returns:
        (trainx, trainy): list of float feature lists and list of int labels.

    Raises:
        IOError/OSError: the file cannot be opened.
        ValueError: a field cannot be parsed as float (features) or int
            (label).
    """
    trainx = []
    trainy = []
    # The context manager closes the file even when a line fails to parse.
    with open(filename + '.txt') as fr:
        for line in fr:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            # A list comprehension, unlike map(), is a real list on Python 3.
            trainx.append([float(value) for value in fields[:-1]])
            trainy.append(int(fields[-1]))
    return trainx, trainy


def img2trainxy(filename):
    """Load every text "image" found in the directory ``filename``.

    The first character of each file name is taken as the integer label
    (e.g. '0_2.txt' -> 0).  The characters of every line of the file, after
    stripping surrounding whitespace, are converted to ints and concatenated
    into one flat feature vector.

    Args:
        filename: path of the directory that contains the image files.

    Returns:
        (trainx, trainy): list of flat int vectors and list of int labels,
        ordered by file name.

    Raises:
        OSError: the directory or one of its files cannot be read.
        ValueError: a file name does not start with a digit, or a file
            contains a non-digit character.
    """
    import os

    trainx = []
    trainy = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # result reproducible from run to run.
    for name in sorted(os.listdir(filename)):
        trainy.append(int(name[0]))  # '0_2.txt' -> 0
        pixels = []
        with open(os.path.join(filename, name)) as fr:
            for line in fr:
                # '001100\n' -> [0, 0, 1, 1, 0, 0]
                pixels.extend(int(ch) for ch in line.strip())
        trainx.append(pixels)
    return trainx, trainy


# step 2. data transform
def norm(trainx):
    """Rescale every feature column linearly to the range [0, 1].

    Args:
        trainx: non-empty 2-D collection of numeric samples (rows).

    Returns:
        (ntrainx, lows, spans) where
        ntrainx: the rescaled samples as a list of lists,
        lows:    numpy array with the per-column minimum,
        spans:   list of floats with the per-column (max - min).
        A new sample ``x`` is rescaled the same way with
        ``(x - lows) / spans``.

    Raises:
        ValueError: ``trainx`` is empty (numpy cannot reduce an empty array).
    """
    data = np.asarray(trainx, dtype=float)
    lows = data.min(axis=0)
    spans = data.max(axis=0) - lows
    # A constant column has span 0; dividing by it would produce nan/inf.
    # Using 1.0 instead maps every value of such a column to 0.
    spans[spans == 0] = 1.0
    scaled = (data - lows) / spans
    return scaled.tolist(), lows, spans.tolist()


# step 3. classify function
def classify(testx, trainx, trainy, k):
    """Predict the label of ``testx`` by majority vote of its k nearest
    training samples (Euclidean distance).

    Args:
        testx:  one sample, a 1-D sequence of numbers.
        trainx: non-empty 2-D collection of training samples (rows).
        trainy: labels, ``trainy[i]`` belonging to ``trainx[i]``; labels must
            be hashable.
        k: number of neighbours that vote; values larger than the number of
            training samples are clamped to it.

    Returns:
        The label with the most votes.  When several labels tie, the one
        whose closest voter is nearest to ``testx`` wins.

    Raises:
        ValueError: ``k`` is smaller than 1 or ``trainx`` is not a non-empty
            2-D collection.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    samples = np.asarray(trainx, dtype=float)
    if samples.ndim != 2 or samples.shape[0] == 0:
        raise ValueError("trainx must be a non-empty 2-D collection of samples")

    offsets = samples - np.asarray(testx, dtype=float)
    # Squared distances order the samples exactly like the distances do, so
    # the square root is not needed for ranking.
    sqdist = (offsets ** 2).sum(axis=1)
    # mergesort is stable: equally distant samples keep their input order.
    nearest = sqdist.argsort(kind='mergesort')[:k]

    votes = {}    # label -> number of votes
    ranking = []  # labels in order of first appearance (closest voter first)
    for idx in nearest:
        label = trainy[int(idx)]
        if label not in votes:
            votes[label] = 0
            ranking.append(label)
        votes[label] += 1

    winner = ranking[0]
    for label in ranking[1:]:
        # Strict '>' keeps the earlier (closer) label on a tie.
        if votes[label] > votes[winner]:
            winner = label
    return winner


# step 4.
# test for error rate
try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3: input() already returns the raw string


def _error_rate(testx, testy, trainx, trainy, k):
    """Classify every test sample and return the fraction misclassified.

    Raises:
        ValueError: there are no test samples (the rate would be 0/0).
    """
    total = len(testx)
    if total == 0:
        raise ValueError("no test samples to evaluate")
    errorcount = 0
    for features, expected in zip(testx, testy):
        if classify(features, trainx, trainy, k) != expected:
            errorcount += 1
    return errorcount / float(total)


def testkNN(testratio, trainx, trainy, k):
    """Hold-out test that uses the FIRST ``testratio`` share of the data.

    The first ``int(len(trainx) * testratio)`` samples are classified
    against the remaining ones; the error rate is printed.

    Args:
        testratio: fraction of the data used as test samples.
        trainx, trainy: samples and their labels (sliceable sequences).
        k: number of neighbours used by classify().

    Returns:
        The error rate as a float in [0, 1].

    Raises:
        ValueError: the test share is empty.
    """
    l = int(len(trainx) * testratio)
    rate = _error_rate(trainx[:l], trainy[:l], trainx[l:], trainy[l:], k)
    print("the total error rate is: %f." % rate)
    return rate


def randomtestkNN(testratio, trainx, trainy, k):
    """Hold-out test that uses a RANDOM ``testratio`` share of the data.

    Args:
        testratio: fraction of the data drawn as test samples.
        trainx, trainy: samples and their labels.
        k: number of neighbours used by classify().

    Returns:
        The error rate as a float in [0, 1]; it is also printed.

    Raises:
        ValueError: the test share is empty, or ``testratio`` asks for more
            samples than exist.
    """
    import random

    m = len(trainx)
    l = int(m * testratio)

    # random.sample draws l distinct indices from [0, m), so no sample is
    # tested twice (an earlier randint-based draw could repeat indices).
    held_out = random.sample(range(m), l)
    held_out_set = set(held_out)
    kept = [i for i in range(m) if i not in held_out_set]

    testx = [trainx[i] for i in held_out]
    testy = [trainy[i] for i in held_out]
    restx = [trainx[i] for i in kept]
    resty = [trainy[i] for i in kept]

    rate = _error_rate(testx, testy, restx, resty, k)
    print("the total error rate is: %f." % rate)
    return rate


def avg():
    """Run the handwriting demo for k = 1..9 and print the error rates.

    Prints the array of error rates followed by the k-indices sorted from
    best to worst, and returns the array of error rates.
    """
    rates = np.array([handwriting('trainingDigits', 'testDigits', k)
                      for k in range(1, 10)])
    print(rates)
    print(rates.argsort())
    # author's note from one run: k = 4, errormin = 0.03
    return rates


# step 5_1 small sample
def sample(k, testratio=0.25):
    """Run testkNN() on the built-in toy data set.

    Args:
        k: number of neighbours used by classify().
        testratio: share of the four toy samples used for testing; the
            default holds out exactly one sample.  The set is far too small
            for the resulting rate to be meaningful.

    Returns:
        The error rate reported by testkNN().
    """
    trainx, trainy = testsample()
    return testkNN(testratio, trainx, trainy, k)


# step 5_2. use for dating web site
def datingwebsite(filename, k):
    """Evaluate kNN on the dating data set, then classify one person
    described interactively on stdin.

    Args:
        filename: data file name WITHOUT the '.txt' extension, given as a
            string such as 'datingTestSet2'.
        k: number of neighbours used by classify().

    Returns:
        The error rate of the random 10% hold-out test.
    """
    ## step 1: load data
    print("step 1: load data...")
    trainx, trainy = txt2trainxy(filename)
    trainx, mins, spans = norm(trainx)

    ## step 2: training...
    print("step 2: training...")
    # kNN has no training phase.

    ## step 3: testing...
    print("step 3: testing...")
    rate = randomtestkNN(0.10, trainx, trainy, k)
    print("time cost:  %s" % (time.time() - starttime))

    ## step 4: show the result...
    print("step 4: show the result...")
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(_read_line(
        "percentage of time spent playing video games?> "))
    ffMiles = float(_read_line("frequent flier miles earned per year?> "))
    iceCream = float(_read_line("liters of ice cream consumed per year?> "))
    # NOTE(review): assumes the file's columns are ordered
    # (flier miles, game time, ice cream) -- confirm against the data file.
    # The answer is rescaled with the training set's own min and span.
    classx = (np.array([ffMiles, percentTats, iceCream]) - mins) / spans
    classy = classify(classx, trainx, trainy, k)
    # NOTE(review): assumes the labels in the file are 1, 2, 3 -- confirm.
    print("You will probably like this person:  %s" % resultList[classy - 1])

    return rate


# step 5_3. use for hand writing
def handwriting(trainfile, testfile, k):
    """Evaluate kNN on the handwritten-digit data set.

    A random subset of the training images is used for classification.

    Args:
        trainfile: directory that holds the training images.
        testfile: directory that holds the test images.
        k: number of neighbours used by classify().

    Returns:
        The error rate on the test images as a float in [0, 1].

    Raises:
        ValueError: there are no test images.
    """
    ## step 1: load data...
    print("step 1: load data...")
    print("---Getting training set...")
    trainx, trainy = img2trainxy(trainfile)
    print("---Geting testing set...")
    testx, testy = img2trainxy(testfile)
    m = len(trainx)
    print("%s %s" % (m, len(trainx[0])))
    print("%s %s" % (len(testx), len(testx[0])))

    # random choose trainx
    print("---Random choosing the training data...")
    import random
    # Keep at least k samples (capped at m): a subset of size 0, or smaller
    # than k, cannot supply k voting neighbours.
    smallest = min(max(k, 1), m)
    n = random.randint(smallest, max(smallest, m - 1))  # subset size
    s = random.sample(range(m), n)                      # subset indices
    # Convert once here so classify() does not rebuild the array per sample.
    trainx = np.asarray([trainx[i] for i in s], dtype=float)
    trainy = [trainy[i] for i in s]
    print("---the numbers of training data is:  %s" % n)

    ## step 2: training...
    print("step 2: training...")
    # kNN has no training phase.

    ## step 3: testing...
    print("step 3: testing...")
    rate = _error_rate(testx, testy, trainx, trainy, k)
    print("the total error rate is: %f." % rate)
    print("time cost:  %s" % (time.time() - starttime))

    ## step 4: show the result...
    print("step 4: show the result...")

    return rate


if __name__ == "__main__":
    # Guarded so that importing this module does not start a full run.
    # Alternative entry points:
    #   datingwebsite('datingTestSet2', 4)
    #   avg()
    handwriting('trainingDigits', 'testDigits', 3)