zoukankan      html  css  js  c++  java
  • 电信用户流失率预测

     

     二、代码实现

    # /usr/bin/python
    # -*- encoding:utf-8 -*-
    
    # data analysis
    import pandas as pd
    import numpy as np
    import random as rnd
    
    # visualization
    import seaborn as sns
    import matplotlib.pyplot as plt
    
    # machine learning
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC, LinearSVC
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestRegressor
    
    # Load the training and test sets.
    # Raw strings keep the Windows backslashes literal: in a normal string the
    # "\t" in "\tele_cust_..." is parsed as a TAB (and "\p", "\C", "\d" are
    # invalid escapes), so the un-prefixed paths did not point at the files.
    train_df = pd.read_csv(r"E:\pyworkpace\CDA\data\tele_cust_train.csv")
    test_df = pd.read_csv(r"E:\pyworkpace\CDA\data\tele_cust_test.csv")
    # Both frames are kept in one list so cleaning steps can be applied to each.
    combine = [train_df, test_df]
    # Check data completeness (column dtypes and non-null counts)
    train_df.info()
    test_df.info()
    # Inspect feature distributions.
    # NOTE: the expressions below only display their result in an interactive
    # session (notebook / REPL); run as a script their return value is discarded.
    train_df.describe()
    train_df.describe(include=['O'])
    # Mean churn per area, highest first
    train_df[['area', 'churn']].groupby(['area'], as_index=False).mean().sort_values(by='churn', ascending=False)
    
    # Plot discrete vs. continuous variables:
    # churn by dwelling size and dwelling type, one facet row per marital status.
    # Fixing the category order keeps the x positions and hue colours aligned
    # across facets; without it each facet may order its own categories.
    dwllsize_order = sorted(train_df['dwllsize'].dropna().unique())
    dwlltype_order = sorted(train_df['dwlltype'].dropna().unique())
    # `height` replaces the former `size` keyword (renamed in seaborn 0.9).
    grid = sns.FacetGrid(train_df, row='marital', height=2.2, aspect=1.6)
    grid.map(sns.pointplot, 'dwllsize', 'churn', 'dwlltype', palette='deep',
             order=dwllsize_order, hue_order=dwlltype_order)
    grid.add_legend()
    plt.show()
    
    # Imputation of discrete variables
    # (1) eqpdays: negative values are data errors. Blank out every negative
    #     value (not just a hard-coded -5..-1 list), then fill the gaps with the
    #     mode of the remaining valid training values.
    for dataset in combine:
        dataset['eqpdays'] = dataset['eqpdays'].mask(dataset['eqpdays'] < 0)
    freq_port = train_df['eqpdays'].dropna().mode()[0]  # mode of the valid values
    for dataset in combine:
        dataset['eqpdays'] = dataset['eqpdays'].fillna(freq_port)
    # (2) dualband: fill missing values with the training-set mode
    freq_port = train_df['dualband'].dropna().mode()[0]  # mode
    for dataset in combine:
        dataset['dualband'] = dataset['dualband'].fillna(freq_port)
    
    # (4) creditcd, truck, ethnic, marital and kid0_2..kid16_17 are missing
    #     together. Fill them with placeholder categories and record the rows in
    #     a new indicator column `new`: 1 = value was imputed, 0 = value present.
    kid_columns = ['kid0_2', 'kid3_5', 'kid6_10', 'kid11_15', 'kid16_17']
    for dataset in combine:
        # Boolean mask instead of np.where positions: .loc is label-based, so
        # positional indices are only right when the index is 0..n-1.
        dataset['new'] = dataset['truck'].isna().astype(int)
        dataset['truck'] = dataset['truck'].fillna(3)
        dataset['ethnic'] = dataset['ethnic'].fillna('Z')
        dataset['marital'] = dataset['marital'].fillna('Z')
        dataset['creditcd'] = dataset['creditcd'].fillna('Z')
        # Missing kid flags are filled with 'Y' (encoded as 1 below)
        dataset[kid_columns] = dataset[kid_columns].fillna('Y')
        # Encode U -> 0, Y -> 1; any other value becomes NaN and makes
        # astype(int) raise, exactly as a per-column mapping would.
        for kid_column in kid_columns:
            dataset[kid_column] = dataset[kid_column].map({'U': 0, 'Y': 1}).astype(int)
    # Build `kids` (number of children) to replace the kid0_2..kid16_17 flags.
    # A column-wise sum gives the same totals as a per-row lambda without a
    # Python-level call for every row.
    for dataset in combine:
        dataset['kids'] = dataset[['kid0_2', 'kid3_5', 'kid6_10', 'kid11_15', 'kid16_17']].sum(axis=1)
    # Imputation of continuous variables
    # Stack train and test into one frame with a fresh 0..n-1 index.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    new_df = pd.concat([train_df, test_df], sort=True, ignore_index=True)
    # Correlations between the continuous variables; restrict to numeric columns
    # explicitly because corr() no longer drops object columns silently.
    corrmat = new_df.select_dtypes(include='number').corr()
    f, ax = plt.subplots(figsize=(20, 9))
    sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax)   # draw the correlation matrix
    # Impute da_Mean (the other continuous variables can be handled the same way).
    # Variables strongly correlated with it: 'adjmou', 'adjrev', 'avgrev',
    # 'avgmou', 'totcalls', 'avg3mou', 'avg3rev', 'totmou', peak_vce_Mean,
    # mou_Mean, rev_Mean. Only the first eight are used as predictors here.
    da_features = ['adjmou', 'adjrev', 'avgrev', 'avgmou', 'totcalls', 'avg3mou', 'avg3rev', 'totmou']
    da_missing = new_df['da_Mean'].isna()
    # Explicit copies so later assignment never writes into a view of new_df.
    train_df_da_Mean = new_df[~da_missing].copy()
    test_df_da_Mean = new_df[da_missing].copy()
    X_train = train_df_da_Mean[da_features]
    Y_train = train_df_da_Mean["da_Mean"]
    X_test = test_df_da_Mean[da_features]
    # The default split criterion is squared error (formerly spelled 'mse', a
    # name removed in scikit-learn 1.2), so it is left at its default.
    model = RandomForestRegressor(n_estimators=100, oob_score=True)
    model.fit(X_train, Y_train)
    # In-sample R^2 (optimistic); model.oob_score_ holds the out-of-bag estimate.
    r2 = model.score(X_train, Y_train)
    Y_test = np.empty(0)
    if da_missing.any():  # predict() rejects an empty feature matrix
        Y_test = model.predict(X_test)
        test_df_da_Mean['da_Mean'] = Y_test
        # Write predictions straight back into the rows that were missing;
        # mask order matches X_test row order, so values line up.
        new_df.loc[da_missing, 'da_Mean'] = Y_test
    
    
    # Integer-encode the categorical columns, one lookup table per column.
    category_codes = {
        'dualband': {'Y': 0, 'N': 1, 'T': 2, 'U': 3},
        'creditcd': {'Y': 0, 'N': 1, 'Z': 3},
        'ethnic': {'U': 0, 'N': 1, 'H': 2, 'Z': 3, 'F': 4, 'S': 5, 'R': 6, 'O': 7, 'G': 8, 'J': 9,
                   'P': 10, 'I': 11, 'B': 12, 'D': 13, 'X': 14, 'C': 15, 'M': 16},
        'marital': {'B': 0, 'M': 1, 'U': 2, 'A': 3, 'S': 4, 'Z': 5},
    }
    for column_name, codes in category_codes.items():
        encoded = new_df[column_name].map(codes)
        # Values absent from the table map to NaN, which makes astype(int) raise.
        new_df[column_name] = encoded.astype(int)

    # Drop identifiers and the columns superseded by derived features.
    dropped_columns = ['Customer_ID', 'HHstatin', 'area',
                       'kid0_2', 'kid3_5', 'kid6_10', 'kid11_15', 'kid16_17']
    new_df = new_df.drop(columns=dropped_columns)
    
    
    # 'numbcars', 'income', 'dwlltype', 'dwllsize', 'adult', 'ownrent' and
    # 'hnd_price' have not been cleaned, so they are left out of the model.
    unprocessed_columns = ['numbcars', 'income', 'dwlltype', 'dwllsize', 'adult', 'ownrent', 'hnd_price']
    yy = new_df.drop(columns=unprocessed_columns)
    # Rows with a churn label form the training set; unlabeled rows the test set.
    churn_missing = np.isnan(new_df['churn'])
    train_df = yy[~churn_missing].copy()
    test_df = yy[churn_missing]
    Y_train = train_df['churn']
    X_train = train_df.drop(columns=['churn'])
    X_test = test_df.drop(columns=['churn'])
    
    
    # Fit a decision tree and predict churn for the unlabeled rows.
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(X_train, Y_train)
    Y_pred = decision_tree.predict(X_test)
    # NOTE: this is accuracy on the training data itself (as a percentage),
    # not a held-out estimate.
    train_accuracy = decision_tree.score(X_train, Y_train)
    acc_decision_tree = round(train_accuracy * 100, 2)
    

      

  • 相关阅读:
    计算机中的那些缩写词
    Linux 下dns的搭建
    html中的定位
    编程基础之流程控制
    linux中kvm的安装及快照管理
    zabbix 中监控windows 的typepref中的值
    详解Linux交互式shell脚本中创建对话框实例教程_linux服务器
    rhel7 单用户修改root密码
    Linux vim编辑器使用详解
    Linux下用ftp更新web内容!
  • 原文地址:https://www.cnblogs.com/bethansy/p/10254228.html
Copyright © 2011-2022 走看看