"""Train one k-nearest-neighbours model per user-feature batch.

For each batch file the script merges the user rows with the training labels
and the ad features, encodes every feature cell as an integer, one-hot encodes
the result, fits a ``KNeighborsClassifier``, saves it as ``<batch>.model`` and
appends the hold-out accuracy to ``result.txt``.
"""
import math

import numpy
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder

try:
    # Newer scikit-learn releases no longer ship the vendored copy.
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Columns of the merged frame that are fed to the encoder. "uid" is left out
# on purpose: it is the unique user identifier, not a feature.
userfeature = [
    "age", "carrier", "consumptionAbility", "ct", "education", "gender",
    "house", "interest1", "interest2", "interest3", "interest4", "interest5",
    "kw1", "kw2", "kw3", "marriageStatus", "os", "topic1", "topic2", "topic3",
    "LBS", "appIdAction", "appIdInstall", "campaignId", "creativeId",
    "creativeSize", "adCategoryId", "advertiserId", "productId", "productType",
]

TRAIN_ROWS = 99000  # upper bound on the rows used to fit each model
TEST_ROWS = 900     # trailing rows held out for the accuracy check


def _encode_value(value):
    """Collapse one cell of the merged frame into a single integer.

    * integers are returned unchanged;
    * floats are truncated to ``int``, except NaN/inf which become 0;
    * anything else is treated as a whitespace-separated string of integers
      and replaced by the sum of its members.
    """
    if isinstance(value, (int, numpy.integer)):
        return int(value)
    if isinstance(value, (float, numpy.floating)):
        # Missing values are filled with 0. This is a crude imputation and
        # should be improved. Real float values are kept rather than zeroed.
        return int(value) if math.isfinite(value) else 0
    return sum(int(token) for token in value.split())


def _encode_frame(frame, features):
    """Return an (n_rows, n_features) integer array of encoded cells."""
    columns = [
        [_encode_value(cell) for cell in frame[name]] for name in features
    ]
    return numpy.ascontiguousarray(numpy.array(columns).T)


onehot = OneHotEncoder()

# These two tables do not depend on the batch, so they are read only once.
addata = pd.read_csv("adFeature.csv")
traindata = pd.read_csv("train.csv")

for b in range(1, 115):
    user_frame = pd.read_csv("userFeature_%d_part_bat.csv" % (b * 100000))
    data = pd.merge(pd.merge(user_frame, traindata), addata)

    userdata = _encode_frame(data, userfeature)
    # The encoder is refitted on every batch.
    onehot.fit(userdata)
    print("--------------------------------------------------------------------")
    Y = numpy.array(data["label"])
    print("--------------------------------------------------------------------")
    X = onehot.transform(userdata)
    print(X)
    print(numpy.shape(X))

    # Keep the training and test rows disjoint even when the batch has fewer
    # than TRAIN_ROWS + TEST_ROWS rows; otherwise the model would be scored
    # on rows it was fitted on.
    test_start = max(X.shape[0] - TEST_ROWS, 0)
    train_end = min(TRAIN_ROWS, test_start)
    if train_end == 0:
        raise ValueError(
            "batch %d has %d merged rows; more than %d are needed to hold "
            "out a test set" % (b, X.shape[0], TEST_ROWS)
        )

    knn = KNeighborsClassifier()
    model = knn.fit(X[:train_end], Y[:train_end])
    joblib.dump(model, "%d.model" % (b))
    result = model.predict(X[test_start:])
    print("-----------------------------------------------------------------------")
    print("--------------------------------------------------------------------------")

    # Fraction of held-out rows whose predicted label matches the true one.
    a = int(numpy.count_nonzero(result == Y[test_start:])) / len(result)
    # One score per line so successive batches stay distinguishable.
    with open("result.txt", "a", encoding="utf-8") as f:
        f.write(str(a) + "\n")
    print(a)