zoukankan      html  css  js  c++  java
  • 特征工程与集成学习参考代码

    1.引入包

    import pandas as pd
    import numpy as np
    import re

    2.读取数据

    # Load the training data; the CSV file is decoded as GBK.
    train=pd.read_csv("训练数据.csv",encoding="gbk")

    3.设置最大显示列数目

    # Show up to 100 columns when a DataFrame is printed.
    pd.set_option("display.max_columns",100)
    # Treat user ids as strings so the .str accessor can be used on them below.
    q3_2['userid']=q3_2['userid'].astype(str)
    # NOTE(review): the sorted frame is not assigned to anything, so q3_5 itself
    # stays unsorted; assign the result if it is needed later.
    q3_5.sort_values(by='userid',ascending=False).reset_index(drop=True)

    # Keep only the rows whose user id is 12 characters long. The explicit copy
    # makes q3_3 an independent frame, so adding a column below does not trigger
    # SettingWithCopyWarning or depend on whether the selection is a view of q3_2.
    q3_3=q3_2[q3_2['userid'].str.len()==12].copy()
    # First 7 characters of each user id.
    q3_3['user_id7']=q3_3['userid'].str[:7]

    # Drop the rows whose 'scenic_area_name' is missing, then look at the
    # resulting (rows, columns) shape.
    user_scenic_df=user_scenic_df.dropna(subset=['scenic_area_name'])
    user_scenic_df.shape

    行、列选取

    service_df.iloc[[0,2],[2,3]]

    # Select rows 0 and 2 with columns 2-4 by position, then keep the rows whose
    # volume_M exceeds 0.3. The mask is taken from the same two rows so that it
    # is already aligned with the selection, instead of being a full-length
    # Series that pandas has to reindex (which emits a UserWarning).
    service_df.iloc[[0,2],[2,3,4]][service_df.volume_M.iloc[[0,2]]>0.3]

    4.数据类型转换

    # Parse the test timestamp column into datetime values.
    train["VideoTestTime"]=pd.to_datetime(train["VideoTestTime"])
    # Columns converted to numeric dtypes. The list is written once so the
    # selections on both sides of the assignment cannot drift apart.
    numeric_cols=['RamUsage', 'CpuUsage', 'Longitude', 'Latitude', 'Source', 'LAC', 'CI',
           'LteCi', 'LtePci', 'LteTac', 'RX', 'L_SINR', 'LteRsrq', 'VideoAvgSpeed',
           'VideoPeakSpeed', 'TCLASS', 'VideoSize', 'VideoTotleTraffic']
    train[numeric_cols]=train[numeric_cols].apply(pd.to_numeric)

    5.空值处理

    # Fill missing LAC values with the column mean. The result is assigned back
    # instead of calling fillna(..., inplace=True) on train["LAC"]: the chained
    # in-place form acts on an intermediate object and does not reliably update
    # train when pandas Copy-on-Write is active.
    train["LAC"]=train["LAC"].fillna(train["LAC"].mean())

    6.one-hot编码

    # One-hot encode "APN/SSID": build the indicator columns, append them to
    # train, and remove the original categorical column.
    APN=pd.get_dummies(train["APN/SSID"])
    train=pd.concat([train,APN],axis=1).drop(columns=["APN/SSID"])

    7.处理Label列

    def dealLabel(x):
        """Map a BufferCounter value to a binary label.

        Returns 0 when x equals the number 0 or the string "0", otherwise 1.
        """
        if x==0 or x=="0":
            return 0
        return 1

    # dealLabel has to exist before this line runs: apply() calls it immediately.
    train["label"]=train["BufferCounter"].apply(dealLabel)

    8.时间相关处理

    # Make sure the column holds datetimes, then split it into calendar parts.
    train["VideoTestTime"]=pd.to_datetime(train["VideoTestTime"])
    # The .dt accessor extracts each component for the whole column at once
    # instead of calling a Python lambda per row. The values are the same;
    # NOTE(review): the integer width of the result may differ (e.g. int32).
    train["year"]=train["VideoTestTime"].dt.year
    train["month"]=train["VideoTestTime"].dt.month
    train["Day"]=train["VideoTestTime"].dt.day
    train["hour"]=train["VideoTestTime"].dt.hour
    train["minute"]=train["VideoTestTime"].dt.minute
    # The raw timestamp is dropped once its parts have been extracted.
    train=train.drop(["VideoTestTime"],axis=1)

     排序

    # Sort avg_traffic in place by 'Downlink traffic at the PDCP Layer',
    # largest values first.
    avg_traffic.sort_values('Downlink traffic at the PDCP Layer',ascending=False, inplace=True)

    删除重复项

    # Remove fully duplicated rows, keeping the first occurrence of each
    # (these are drop_duplicates' defaults).
    data=data.drop_duplicates()

    9.引入sklearn包(随机森林)

    from sklearn.ensemble import RandomForestClassifier
    # Random forest classifier with the library's default hyper-parameters.
    RF=RandomForestClassifier()
    # Fit on X_train / y_train, then predict labels for X_test.
    RF.fit(X_train,y_train)
    y_pre=RF.predict(X_test)

    10.参数调节

    from sklearn.model_selection import GridSearchCV
    # Candidate hyper-parameter values; the grid search tries every
    # combination (3 * 2 * 3 = 18 settings).
    parameter_space = {
        "n_estimators": [10, 15, 20],
        "criterion": ["gini", "entropy"],
        "min_samples_leaf": [2, 4, 6],
    }
    # 5-fold cross-validated search over RF, scored by F1.
    grid = GridSearchCV(RF, parameter_space, cv=5,scoring="f1")
    grid.fit(X_train, y_train)
    # Best parameter combination found and its cross-validated score.
    print(grid.best_params_)
    grid.best_score_

    11.标准归一化

    from sklearn.preprocessing import StandardScaler
    std=StandardScaler()
    # Fit the scaler on X_train and return the standardized data.
    # NOTE(review): other data (e.g. a test set) should go through
    # std.transform, not fit_transform, so it is scaled with the statistics
    # learned from X_train.
    X_std=std.fit_transform(X_train)

    12.损失函数权重(1:10)、上采样(1:100)、下采样(1:1000)、异常检测(1:10000)

    # Over-sample with the SMOTE implementation from the imbalanced-learn
    # (imblearn) library.
    from imblearn.over_sampling import SMOTE
    # random_state seeds the random number generator so the resampling is
    # reproducible.
    smo = SMOTE(random_state=42)
    # fit_resample is the current API; the older fit_sample alias was removed
    # from imbalanced-learn.
    X_smo, y_smo = smo.fit_resample(X, y)

    13.集成学习

    # 下面针对多个模型进行集成操作
    from sklearn.svm import SVC, LinearSVC
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.kernel_approximation import Nystroem
    from sklearn.kernel_approximation import RBFSampler
    from sklearn.pipeline import make_pipeline
    SEED=666
    def get_models():
        """Build the base learners and return them keyed by a display name.

        Each call creates fresh, unfitted estimators. The insertion order of
        the dictionary is the order in which callers receive the models.
        """
        return {
            'knn': KNeighborsClassifier(n_neighbors=3),
            'naive bayes': GaussianNB(),
            'mlp-nn': MLPClassifier((80, 10), early_stopping=False, random_state=SEED),
            'random forest': RandomForestClassifier(
                n_estimators=10, max_features=3, random_state=SEED
            ),
            'gbm': GradientBoostingClassifier(n_estimators=100, random_state=SEED),
            'logistic': LogisticRegression(C=100, random_state=SEED),
        }
    # Meta learner: a gradient boosting classifier that is stacked on top of
    # the base learners returned by get_models().
    meta_learner = GradientBoostingClassifier(n_estimators=1000,    loss="exponential",    max_features=4,    max_depth=3,    subsample=0.5,    learning_rate=0.005,     random_state=SEED)
    from mlens.ensemble import SuperLearner
    # Instantiate the ensemble with 2 folds
    sl = SuperLearner(
        folds=2,
        random_state=SEED,
        verbose=2,
        backend="multiprocessing"
    )
    
    # Add the base learners and the meta learner
    sl.add(list(get_models().values()), proba=True) 
    sl.add_meta(meta_learner, proba=True)
    # Train the ensemble
    sl.fit(X_train,y_train)
    # Predict the test set
    from sklearn.metrics import f1_score
    p_sl = sl.predict_proba(X_test)
    # Use the column index with the highest predicted probability as the label.
    Y_prelast=np.argmax(p_sl,axis=1)
    # F1 of the ensemble predictions against y_test. The value is not printed
    # or stored, so it is only visible in an interactive session.
    f1_score(y_test,Y_prelast)
  • 相关阅读:
    slf4j的简单用法以及与log4j的区别
    [转]Git 代码撤销、回滚到任意版本(当误提代码到本地或master分支时)
    【转】IDEA 中配置文件properties文件中文乱码解决
    Python+Selenium练习篇之3-浏览器滚动条操作
    selenium操作下拉滚动条的几种方法
    python利用unittest进行测试用例执行的几种方式
    安装和使用 Python
    PM2实用入门指南
    linux清除缓存
    【centos6.6环境搭建】Github unable to access SSL connect error出错处理
  • 原文地址:https://www.cnblogs.com/wangzhenghua/p/13703701.html
Copyright © 2011-2022 走看看