  • kddcup2015

    KDD Cup 2015: binary classification, predicting course dropout. I wrote this a while back; here is a quick write-up for future reference.

    Step 1: preprocessing. With numpy and pandas, numericalizing the features is simple and elegant.
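
    To get a feel for the core trick first (a minimal sketch on made-up toy values, not the contest files): pd.factorize assigns each distinct string an integer code, and zipping its uniques with a range yields a reusable string-to-integer dictionary, which is exactly how the course and object dictionaries below are built.

    import pandas as pd

    s = pd.Series(['browser','server','browser'])       # toy values
    codes, uniques = pd.factorize(s)                    # codes: array([0, 1, 0])
    mapping = dict(zip(uniques,range(len(uniques))))    # {'browser': 0, 'server': 1}
    print codes, mapping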

    #!/usr/bin/env python
    # coding=utf-8
    
    import pickle
    import pandas as pd
    import numpy as np
    
    
    source_dict={'server':0,'browser':1}
    event_dict = {"problem":5,"video":3,"access":1,"wiki":4,"discussion":6,"navigate":2,"page_close":0}
    
    
    def gen_time_dict():
        rng = pd.date_range('2013-10-27','2014-08-01')
        time_dict = pd.Series(np.arange(len(rng)),index=rng)
        fw = open('data/time_dict.csv','wb')  # a pickle file, despite the .csv name
        pickle.dump(time_dict,fw)
        fw.close()
        return time_dict
    
    
    def gen_courseid_dict():
        df = pd.read_csv('data/date.csv',usecols=[0])
        course_map = pd.factorize(df.course_id)[1]
        course_dict = dict(zip(course_map,range(len(course_map))))
        fw = open('data/course_idTrain2.csv','wb')  # a pickle file, despite the .csv name
        pickle.dump(course_dict,fw)
        fw.close()
        print "course_dict done"
        return course_dict
    
    
    def gen_object_dict():
        df = pd.read_csv('data/log_train.csv',usecols=[4])
        obj_map = pd.factorize(df.object)[1]
        obj_dict = dict(zip(obj_map,range(len(obj_map))))
        
        df2 = pd.read_csv('data/test/log_test.csv',usecols=[4])
        obj_map2 = pd.factorize(df2.object)[1]
        diff = [w for w in obj_map2 if w not in obj_map]
        obj_dict2 =dict(zip(diff,np.arange(len(obj_map),len(obj_map)+len(diff))))
        
        obj_dict.update(obj_dict2)
        fw = open('data/object_pkl.csv','wb')  # a pickle file, despite the .csv name
        pickle.dump(obj_dict,fw)
        fw.close()
        print "obj_dict done.."
        return obj_dict
    
    
    def time_map(x):
        x = x[:10]  # keep only the yyyy-mm-dd part of the timestamp
        return time_dict[x]
    
    
    def obj_map(x):
        return obj_dict[x]
    
    
    def course_map(x):
        return course_dict[x]
    
    time_dict = gen_time_dict()
    course_dict= gen_courseid_dict()
    obj_dict= gen_object_dict()
    
    
    
    def log_trainData():
        print "read log_train.csv "
        df1 = pd.read_csv('data/log_train.csv',converters={1:time_map,4:obj_map})
        print df1.head()
        
        df1.source = df1.source.map(lambda x:source_dict[x])
        df1.event = df1.event.map(lambda x:event_dict[x])
        print df1.head()
        print df1.tail()
        df1.to_csv('data/log_trainData.csv',index=False)
        
    
    def course_Data():
        df2 = pd.read_csv('data/enrollment_train.csv',usecols=[0,2],converters={2:course_map})
        df3 = pd.read_csv('data/date.csv',converters={0:course_map,1:time_map,2:time_map})
        df4 = pd.merge(df2,df3,on='course_id',how='outer')
        df4 = df4.sort_index(by='enrollment_id')
        print df4.tail(10)
        df4.to_csv("data/course_Trainpkl.csv",index=False)
    
        df1 = pd.read_csv('data/test/enrollment_test.csv',usecols=[0,2],converters={2:course_map})
        df4 = pd.merge(df1,df3)
        df4 = df4.sort_index(by='enrollment_id')
        print df4.tail(10)
        df4.to_csv("data/test/course_Testpkl.csv",index=False)
    
    
    
    def log_testData():
        print "read log_test.csv "
        df1 = pd.read_csv('data/test/log_test.csv',converters={1:time_map,4:obj_map})
        print df1.tail(10)
        df1.source = df1.source.map(lambda x:source_dict[x])
        df1.event = df1.event.map(lambda x:event_dict[x])
        print df1.tail(10)
        df1.to_csv('data/test/log_testData.csv',index=False)
    
    log_trainData()
    log_testData()
    course_Data()
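
    Running this leaves numericalized tables for the later steps to read back: data/log_trainData.csv, data/test/log_testData.csv, and the per-course date tables data/course_Trainpkl.csv and data/test/course_Testpkl.csv.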

    Step 2: build models with several machine-learning methods and predict.
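
    The backbone of the feature matrix in this step is a pivot_table count: one row per enrollment_id, one column per event type, and each cell counts that enrollment's log records of that event. A minimal sketch on made-up toy data (the script below uses the old pandas keywords rows=/cols=, which newer pandas spells index=/columns=):

    import pandas as pd

    log = pd.DataFrame({'enrollment_id':[1,1,2],
                        'event':[0,3,0],       # numericalized event codes
                        'source':[0,1,0]})
    counts = log.pivot_table('source',index='enrollment_id',
                             columns='event',aggfunc='count',fill_value=0)
    print counts   # enrollment 1: one event-0 and one event-3; enrollment 2: one event-0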

    #!/usr/bin/env python
    # coding=utf-8
    
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn import preprocessing
    from sklearn.metrics import roc_auc_score
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC 
    import pickle
    
    
    debug=True
    if debug:
        N=5000
    else:
        N = 20000
    
    
    class DropOutPredict(object):
    
        course_dict={}
        def __init__(self):
            print "welcom kdd2015 contest, jkmiao@526588996"
    
            fr = open("data/coursePkl.pkl")
            self.course_dict = pickle.load(fr)
    
        def datestr2num(self,s):
            return pd.to_datetime(s)
    
    
        def norm_res(self,x):
            if x<0.0001:
                x=0
            elif x>0.98:
                x=1.0
            return x
        
        def norm_course(self,c):
            return self.course_dict[c]
    
        def loadTrainData(self):
            df1 = pd.read_csv('./data/log_train.csv',usecols=[0,2,3,4])
            df1.source=pd.factorize(df1.source)[0]
            df1.event=pd.factorize(df1.event)[0]
            df1.object=pd.factorize(df1.object)[0]
            
            gp = df1.groupby("enrollment_id")
            gp2 = df1.groupby(["enrollment_id","source"])
    
            df2 = pd.read_csv('data/enrollment_train.csv',usecols=[2])
            df2.course_id=pd.factorize(df2.course_id)[0]
    
            df3 = pd.read_csv('data/truth_train.csv',usecols=[1],names=["drop"])
            data = df1.pivot_table("source",rows="enrollment_id",cols="event",aggfunc="count",fill_value=0)
            
            data["browser"] = gp2.event.count().unstack()[0]
            data["server"] = gp2.event.count().unstack()[1]
            data["course_id"]=df2.course_id
            data["cnt"] = gp.event.count()
            data["std"] = gp.object.std()
            data["var"] =gp.event.var()
            data["mean"] =gp.event.mean()
            data = data.fillna(0)
            print data.head()
            X = data.values
            y = np.ravel(df3["drop"])
            return X,y
    
    
    
        def loadTestData(self):
            df1 = pd.read_csv('data/test/log_test.csv',usecols=[0,2,3,4])
            df1.source = pd.factorize(df1.source)[0]
            df1.event = pd.factorize(df1.event)[0]
            df1.object = pd.factorize(df1.object)[0]
               
            gp = df1.groupby("enrollment_id")
            gp2 = df1.groupby(["enrollment_id","source"])
    
            df2 = pd.read_csv("data/test/enrollment_test.csv",usecols=[2])
            df2.course_id = pd.factorize(df2.course_id)[0]
    
            data = df1.pivot_table("source",rows="enrollment_id",cols="event",aggfunc="count",fill_value=0)
            
            data["browser"] = gp2.event.count().unstack()[0]
            data["server"] = gp2.event.count().unstack()[1]
            data["course_id"] = df2.course_id
            data["cnt"] = gp.event.count()
            data["std"] = gp.object.std()
            data["var"] = gp.event.var()
            data["mean"] = gp.event.mean()
            data = data.fillna(0)
            print "test data head():...
    ",data.head()
            test = data.values
            return test
    
        
        def gbdt_clf(self,x_train,x_test,y_train,y_test,test):
            clf = GradientBoostingClassifier(n_estimators=450,learning_rate=0.1,random_state=20)
            clf.fit(x_train,y_train)
            y_pred = clf.predict_proba(x_test)[:,1]
            
            scores = roc_auc_score(y_test,y_pred)
            print "gbdt_clf  scores ... ",scores
            pred = clf.predict_proba(test)[:,1]
            print pred[:5]
            self.saveResult(pred,"data/test/gbdt_clf.csv")
    
    
        def svc_clf(self,x_train,x_test,y_train,y_test,test):
            tuned_parameters = [{'kernel':['poly'],'C':[10,500,1200]},
                                {'kernel':['linear'],'C':[200,500,800]}]
            clf = GridSearchCV(SVC(probability=True),tuned_parameters,cv=5,scoring="roc_auc")
            
            # clf = svm.SVC(C=2.0,kernel="rbf",probability=True,random_state=42)
            clf.fit(x_train,y_train)
            print "Best parameters set found : "
            print clf.best_params_
    
            y_pred = clf.predict_proba(x_test)[:,1]
            scores = roc_auc_score(y_test,y_pred)
            print "svm clf scores...",scores
            pred = clf.predict_proba(test)[:,1]
            self.saveResult(pred,"data/test/svc_res"+str(scores)+".csv")
            return pred[:5]
    
    
        def saveResult(self,pred,fileName):
            enrollment_test = pd.read_csv('./data/test/enrollment_test.csv',usecols=[0])
            enrollment_test['drop'] = pred 
            res = enrollment_test[['enrollment_id','drop']]
            print "***"*30
            print res.head()
            res.to_csv(fileName,index=False,header=False)
            
    
    
        def drop_predict(self):
            print "loading train data..."
            X,y = self.loadTrainData()
           
            x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.23,random_state=20)
            print "loading test data..."
            test = self.loadTestData()
            
            print "moding gbdt_clf..."
            self.gbdt_clf(x_train,x_test,y_train,y_test,test)
            
            x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=28)
            print "moding svm_clf ..."
            self.svc_clf(x_train,x_test,y_train,y_test,test)
       
        def em_result(self):
           print "ensemable results..."
           df_gbdt = pd.read_csv("data/test/gbdt_res.csv",header=None,names=["id","drop1"])
           df_svm = pd.read_csv("data/test/svc_res.csv",header=None,usecols=[1],names=["id","drop2"])
           # df_knn = pd.read_csv("data/test/knn_res.csv",header=None,usecols=[1],names=["id","drop3"])
           
           # df_ex1 = pd.read_csv("data/test/gbdt_clf0.861831055542.csv",header=None,usecols=[1],names=["drop4"])
           # df_ex2 = pd.read_csv("data/test/gbdt_clf0.863249041131.csv",header=None,usecols=[1],names=["drop5"])
    
           # final result
           # df = pd.concat([df_gbdt,df_svm,df_knn,df_ex1,df_ex2],axis=1)
           # df["drop"] = df["drop1"]*0.4+df["drop2"]*0.2+df["drop3"]*0.2+df["drop4"]*0.1+df["drop5"]*0.1
           
           df = pd.concat([df_gbdt,df_svm],axis=1)
           df["drop"] = df.drop1*0.7+df.drop2*0.3
           df["drop"] = map(lambda x:self.norm_res(x),df["drop"])
           print df.head()
           # df.drop(["drop1","drop2","drop3","drop4","drop5"],axis=1,inplace=True)
           df.drop(["drop1","drop2"],axis=1,inplace=True)
           print df.head()
           df.to_csv("data/test/em_res.csv",header=False,index=False)
           
    
    
    if __name__ == '__main__':
        drop = DropOutPredict()
        drop.drop_predict()
        drop.em_result()
        print "done."  # 准确率召回率AOC值可达84%左右

    Step 3: more feature engineering. With richer extracted features the AUC approaches 0.89.
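
    The extra features in this step come from groupby(...).describe(): for each enrollment it summarizes a column into eight statistics (count, mean, std, min, 25%, 50%, 75%, max), each becoming a feature column. A minimal sketch on made-up data (with the pandas of that era, describe() on a grouped Series returned a stacked Series, hence the unstack(); recent pandas returns the wide table directly):

    import pandas as pd

    df = pd.DataFrame({'enrollment_id':[1,1,1,2,2],
                       'event':[0,3,5,1,1]})
    stats = df.groupby('enrollment_id').event.describe().unstack()
    print stats.head()   # columns: count, mean, std, min, 25%, 50%, 75%, max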

    #!/usr/bin/env python
    # coding=utf-8
    
    import numpy as np
    import pandas as pd
    import cPickle as pickle
    
    from sklearn import svm
    from sklearn import linear_model
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.preprocessing import scale
    
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import roc_auc_score
    
    
    
    def norm(x):
        if x<0.000001:
            x=0
        elif x>0.96:
            x=1
        return x
    
    def last_time(x):
        return x.max()-x.min()
    
    def loadTrainData():
        df1 = pd.read_csv('data/log_trainData.csv')
        print df1.head()
        print df1.tail()
        df2 = pd.read_csv('data/truth_train.csv',header=None,usecols=[1],names=["drop"])
        df3 = pd.read_csv('data/course_Trainpkl.csv',usecols=[1,2,3])
        
        
        gp = df1.groupby("enrollment_id")
    
        data = df1.pivot_table("source",rows='enrollment_id',cols="event",aggfunc='count',fill_value=0)
        
        eventdf = gp.event.describe().unstack()
    
        timedf = gp.time.describe().unstack()
        timedf = timedf.drop('count',axis=1)  # drop returns a new frame; assign it back
    
        sourcedf = gp.source.describe().unstack()
        sourcedf = sourcedf.drop(['count','min','max'],axis=1)
    
        objectdf = gp.object.describe().unstack()
        objectdf = objectdf.drop(['count'],axis=1)
        
        # concatenate the feature tables side by side
        data = pd.concat([data,eventdf],axis=1)
        data = pd.concat([data,timedf],axis=1)
        data = pd.concat([data,sourcedf],axis=1)
        data = pd.concat([data,objectdf],axis=1)
        
        # course features: duration, course id, and the start/end day of the course
        data['dtime'] = gp.time.apply(last_time)
        data["course_id"] = df3["course_id"].values
        data["from"] = df3["from"].values
        data["to"] = df3["to"].values
        
        # min-max scaling gave no noticeable improvement
        # X = MinMaxScaler().fit_transform(X)
        print "origin data: "
        print data.tail() 
        data = data.fillna(0)
        data.to_csv('data/trainData.csv',index=False)
        X = data.values 
        # standardize: remove the mean and scale to unit variance
        X = scale(X)
        # fw = open("data/train/trainData.pkl",'w')
        # pickle.dump(X,fw)
        y = np.ravel(df2['drop'])   
        print "y: ",y[:5]
        return X,y
    
    def loadTestData():
        df1 = pd.read_csv('data/test/log_testData.csv')
        print df1.head()
        df3 = pd.read_csv('data/test/course_Testpkl.csv',usecols=[1,2,3])
    
        gp = df1.groupby("enrollment_id")
    
        data = df1.pivot_table("source",rows='enrollment_id',cols="event",aggfunc='count',fill_value=0)
        
        eventdf = gp.event.describe().unstack()
    
        timedf = gp.time.describe().unstack()
        timedf = timedf.drop('count',axis=1)  # drop returns a new frame; assign it back
    
        sourcedf = gp.source.describe().unstack()
        sourcedf = sourcedf.drop(['count','min','max'],axis=1)
    
        objectdf = gp.object.describe().unstack()
        objectdf = objectdf.drop(['count'],axis=1)
    
        data = pd.concat([data,eventdf],axis=1)
        data = pd.concat([data,timedf],axis=1)
        data = pd.concat([data,sourcedf],axis=1)
        data = pd.concat([data,objectdf],axis=1)
    
        data['dtime'] = gp.time.apply(last_time)
        data["course_id"] = df3["course_id"].values
        data["from"] = df3["from"].values
        data["to"] = df3["to"].values
        
        # data["cnt"]=gp.size()
        # data["eventstd"] = gp.event.std()
        # data['eventmean'] = gp.event.mean()
        # data['eventmdeian'] = gp.event.median()
        # data['equantile0.25'] = gp.event.quantile(0.25)
        # data['equantile0.75'] = gp.event.quantile(0.75)
        # data['equantilemad'] = gp.event.mad()
        
        print "test data: "
        print data.tail(10)
        data = data.fillna(0)
        # write to disk so later runs can load it directly
        data.to_csv('data/test/testData.csv',index=False)
        
        # alternatively, dump a pickle directly
        # fw = open("data/test/testData.pkl",'w')
        # pickle.dump(data,fw)
    
        test = data.values
        # test = MinMaxScaler().fit_transform(test)
        test = scale(test)
        return test
    
    
    def svc_clf(x_train,x_test,y_train,y_test,test):
        clf = svm.SVC(kernel='linear',probability=True,random_state=42)
        clf.fit(x_train,y_train)
        y_pred= clf.predict_proba(x_test)[:,1]
        scores = roc_auc_score(y_test,y_pred)  # the true labels must come first, otherwise it errors
        print "svm scores:...",scores
        pred = clf.predict_proba(test)[:,1]
        saveResult(pred,'data/test/svc_res.csv')
    
    
    def lr_clf(x_train,x_test,y_train,y_test,test):
        clf = linear_model.LogisticRegression()
        clf.fit(x_train,y_train)
        y_pred = clf.predict_proba(x_test)[:,1]
        scores= roc_auc_score(y_test,y_pred)
        print "lr_clf scores: ",scores
        
        y_pred = map(norm,y_pred)
        score2 = roc_auc_score(y_test,y_pred)
        print "after nomailzied score ... ",score2
        
        pred = clf.predict_proba(test)[:,1]
        saveResult(pred,'data/test/lr_res.csv')
    
    def rf_clf(x_train,x_test,y_train,y_test,test):
        clf = RandomForestClassifier(n_estimators=100)
        clf.fit(x_train,y_train)
        y_pred = clf.predict_proba(x_test)[:,1]
        scores = roc_auc_score(y_test,y_pred)
        pred = clf.predict_proba(test)[:,1]
        print "rf_scores: ",scores
        saveResult(pred,'./data/test/rf_res.csv')
    
    
    def gbdt_clf(x_train,x_test,y_train,y_test,test):
        clf = GradientBoostingClassifier(n_estimators=500)
        clf.fit(x_train,y_train)
        y_pred=clf.predict_proba(x_test)[:,1]
        scores = roc_auc_score(y_test,y_pred)
        pred = clf.predict_proba(test)[:,1]
        print "gbdt_clf scores: ",scores
        saveResult(pred,'data/test/gbdt_clf'+str(scores)+'.csv')
    
    
    def saveResult(pred,fileName):
        # read the enrollment ids
        df = pd.read_csv('data/test/enrollment_test.csv',usecols=[0])
        # attach the predictions
        df['drop'] = pred
        print df.head()
        # write the submission file
        df.to_csv(fileName,index=False,header=False)
    
    # ensemble the best few result files
    def em_res():
        df = pd.read_csv("data/test/gbdt_res.csv",header=None,names=["id","drop"])
        df1 = pd.read_csv("data/test/gbdt_clf0.875919444048.csv",header=None,usecols=[1],names=["drop1"])
        df2 = pd.read_csv("data/test/final_res.csv",header=None,usecols=[1],names=["drop2"])
        df["drop"]  =df["drop"]*0.5+ df1["drop1"]*0.2+df2["drop2"]*0.3
        df["drop"] = df["drop"]
        df.to_csv("data/test/final_res.csv",index=None,header=None)
    
    # on later runs, load the cached files directly to save time
    def loadPickleTrainData():
        df1 = pd.read_csv('data/trainData.csv')
        print df1.head()
        X = df1.values
        # X = scale(X)
        fr2 = open("data/train/trainLabel.txt")
        y = pickle.load(fr2)
        return X,y
    
    
    def loadPickleTestData():
        df1 = pd.read_csv('data/test/testData.csv')
        test = df1.values
        # test = scale(test)
        return test
    
    
    def dropPredict():
        em_res()
        print "loading train data..."
        X,y = loadPickleTrainData()
        # X,y = loadTrainData()
    
        print "loading test data... "
        test = loadPickleTestData()
        # test = loadTestData()
    
        print "
    modeling lr..."
        x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.31,random_state=148)
        lr_clf(x_train,x_test,y_train,y_test,test)
        
        x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.28,random_state=151)
        print "
    modeling rf..."
        # rf_clf(x_train,x_test,y_train,y_test,test)
    
        print "
    modeling gbdt..."
        gbdt_clf(x_train,x_test,y_train,y_test,test)
    
        print "
    modeling svm..."
        svc_clf(x_train,x_test,y_train,y_test,test)
    
    
    if __name__ =="__main__":
        print "start>>>"
        dropPredict()
        print "done"
    One small step every day, one big step in life! Good luck~
  • Original post: https://www.cnblogs.com/jkmiao/p/4806369.html