zoukankan      html  css  js  c++  java
  • 对每一块训练集:用前 99000 条数据训练模型,用后 1000 条数据进行测试

     1 from sklearn.neighbors import KNeighborsClassifier
     2 from sklearn.externals import joblib
     3 onehot = OneHotEncoder()
     4 for b in range(1,115):
     5     addata = pd.read_csv("adFeature.csv")
     6     userdata = pd.read_csv("userFeature_%d_part_bat.csv"%(b*100000))
     7     traindata = pd.read_csv("train.csv")
     8     data = pd.merge(userdata,traindata)
     9     data = pd.merge(data,addata)
    10     #data.to_csv("111.csv")
    11     #列出所有特征值,遍历,uid是用户唯一标识,不应该算作特征
    12 
    13     userfeature = ["age","carrier","consumptionAbility","ct","education","gender","house","interest1","interest2","interest3","interest4","interest5","kw1","kw2","kw3","marriageStatus","os","topic1","topic2","topic3","LBS","appIdAction","appIdInstall","campaignId","creativeId","creativeSize","adCategoryId","advertiserId","productId","productType"]
    14     #for index in data[feature] :
    15     userdata = []
    16     for index in range(len(data["uid"])):
    17         feature_li = []
    18         for feature in userfeature:
    19             # a = data[feature]
    20             # print(a[0],type(a[index]),isinstance(a[0],(numpy.int64)))
    21 
    22             if isinstance(data[feature][index],numpy.int64):
    23                 feature_li.append(int(data[feature][index]))
    24             elif isinstance(data[feature][index],numpy.float64):
    25                 feature_li.append(0)#缺失值用0填充,这是不合理的,有待改进
    26             elif isinstance(data[feature][index], numpy.float):
    27                 feature_li.append(0)
    28             else :
    29                 trans = data[feature][index].strip().split(" ")
    30                 trans = map(int,trans)
    31                 trans = sorted(trans)
    32                 #print(trans)
    33                 s = 0
    34                 for num in trans :
    35                     s += num
    36                 feature_li.append(s)
    37         # print(feature_li)
    38         userdata.append(feature_li)
    39     userdata = numpy.array(userdata)
    40     onehot.fit(userdata)
    41     print("--------------------------------------------------------------------")
    42     Y = numpy.array(data["label"])
    43     print("--------------------------------------------------------------------")
    44     X = onehot.transform(userdata)
    45     print(X)
    46     print(numpy.shape(X))
    47     knn = KNeighborsClassifier()
    48     model = knn.fit(X[:99000],Y[:99000])
    49     joblib.dump(model,"%d.model"%(b))
    50     result = model.predict(X[-900:])
    51     print("-----------------------------------------------------------------------")
    52     #print(Y[-900:])
    53     print("--------------------------------------------------------------------------")
    54     s = 0
    55     for i in range(len(result)):
    56         if result[i] == Y[-900:][i] :
    57             s +=1
    58     a = s/len(result)
    59     f = open("result.txt","a",encoding="utf-8")
    60     f.write(str(a))
    61     print(a)
  • 相关阅读:
    CentOS 7 搭建 LAMP
    CentOS 7 安装 nginx
    ms-sql关联表操作
    在CentOS下自动备份mysql
    Redhat 7 或者 CentOS 7 密码破解
    java环境变量的设置
    CentOS6.6安装及配置vsftpd文件服务器
    Virtualbox虚拟机配置CentOS7.0静态网络
    CentOS6.6安装vmware workstation报错
    CentOS6.6安装virtualbox4.1.44
  • 原文地址:https://www.cnblogs.com/wbt1995/p/8941586.html
Copyright © 2011-2022 走看看