二、代码实现 (Part 2: Code implementation)
#!/usr/bin/python
# -*- encoding:utf-8 -*-
"""Telecom customer churn analysis.

Loads train/test data, explores feature distributions, imputes missing
values (categorical by mode / sentinel, continuous by random-forest
regression), encodes categoricals, and fits a decision-tree classifier.
"""

# data analysis
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor

# Raw strings: in a plain string literal "\t" in "\tele_cust_..." would be
# interpreted as a TAB character and corrupt the path.
train_df = pd.read_csv(r"E:\pyworkpace\CDA\data\tele_cust_train.csv")
test_df = pd.read_csv(r"E:\pyworkpace\CDA\data\tele_cust_test.csv")
combine = [train_df, test_df]

# Inspect data completeness (column dtypes / non-null counts).
train_df.info()
test_df.info()

# Feature value distributions: numeric, then object (categorical) columns.
train_df.describe()
train_df.describe(include=['O'])

# Churn rate per area, highest first.
train_df[['area', 'churn']].groupby(['area'], as_index=False).mean() \
    .sort_values(by='churn', ascending=False)

# Relationship between dwelling size, dwelling type and marital status.
# NOTE(review): `size=` was renamed `height=` in seaborn >= 0.9 — kept for
# compatibility with the seaborn version this script was written against.
grid = sns.FacetGrid(train_df, row='marital', size=2.2, aspect=1.6)
grid.map(sns.pointplot, 'dwllsize', 'churn', 'dwlltype', palette='deep')
grid.add_legend()
plt.show()

# --- Categorical / discrete imputation -------------------------------------

# (1) eqpdays: negative values (112 rows) are data errors — blank them out,
#     then fill all gaps with the mode.
eqpdays_mode = train_df.eqpdays.dropna().mode()[0]
for dataset in combine:
    dataset['eqpdays'] = dataset['eqpdays'].replace([-5, -4, -3, -2, -1], np.nan)
    dataset['eqpdays'] = dataset['eqpdays'].fillna(eqpdays_mode)

# (2) dualband: fill missing values with the mode.
dualband_mode = train_df.dualband.dropna().mode()[0]
for dataset in combine:
    dataset['dualband'] = dataset['dualband'].fillna(dualband_mode)

# (3) creditcd, truck, ethnic, marital and kid0_2..kid16_17 (9 columns) are
#     missing together. Fill each with an "unknown" category and record the
#     imputation in a new indicator column `new` (1 = imputed, 0 = observed).
kid_cols = ['kid0_2', 'kid3_5', 'kid6_10', 'kid11_15', 'kid16_17']
for dataset in combine:
    dataset['new'] = 0  # 0 = value was present
    # Boolean mask instead of np.where(...)[0]: np.where yields *positions*,
    # which .loc would misread as labels on any non-default index.
    dataset.loc[dataset['truck'].isna(), 'new'] = 1
    dataset['truck'] = dataset['truck'].replace(np.nan, 3)
    dataset['ethnic'] = dataset['ethnic'].replace(np.nan, 'Z')
    dataset['marital'] = dataset['marital'].fillna('Z')
    dataset['creditcd'] = dataset['creditcd'].fillna('Z')
    # Missing kid flags are treated as 'Y' (1), then U/Y -> 0/1.
    dataset[kid_cols] = dataset[kid_cols].fillna('Y')
    for col in kid_cols:
        dataset[col] = dataset[col].map({'U': 0, 'Y': 1}).astype(int)

# Derive `kids` (number of children) to replace kid0_2..kid16_17.
# Vectorized row sum instead of a per-row apply(lambda ...).
for dataset in combine:
    dataset['kids'] = dataset[kid_cols].sum(axis=1)

# --- Continuous imputation ---------------------------------------------------

# pd.concat replaces the deprecated DataFrame.append (removed in pandas 2.0).
new_df = pd.concat([train_df, test_df], sort=True)
new_df = new_df.reset_index(drop=True)  # rebuild a clean 0..n-1 index

corrmat = new_df.corr()  # pairwise correlations between continuous features
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=0.8, square=True)  # correlation matrix plot

'''
Impute da_Mean (other continuous variables are handled analogously).
Strongly correlated features: 'adjmou', 'adjrev', 'avgrev', 'avgmou',
'totcalls', 'avg3mou', 'avg3rev', 'totmou', peak_vce_Mean, mou_Mean, rev_Mean
'''
da_features = ['adjmou', 'adjrev', 'avgrev', 'avgmou',
               'totcalls', 'avg3mou', 'avg3rev', 'totmou']
train_df_da_Mean = new_df.dropna(subset=['da_Mean']).copy()
# .copy() so the predicted column below is written to a real frame, not a
# view of new_df (avoids SettingWithCopyWarning / silently lost writes).
test_df_da_Mean = new_df[new_df['da_Mean'].isna()].copy()
X_train = train_df_da_Mean[da_features]
Y_train = train_df_da_Mean['da_Mean']
X_test = test_df_da_Mean[da_features]
# 'squared_error' is the current name for the old criterion 'mse'
# (renamed in scikit-learn 0.24, 'mse' removed in 1.2).
model = RandomForestRegressor(n_estimators=100, oob_score=True,
                              criterion='squared_error')
model.fit(X_train, Y_train)  # fit() accepts a Series; .ravel() is unnecessary
Y_test = model.predict(X_test)
r2 = model.score(X_train, Y_train)
test_df_da_Mean['da_Mean'] = Y_test
# Recombine predicted + observed rows; indices were preserved, so sorting
# by index restores the original row order.
neww = pd.concat([test_df_da_Mean, train_df_da_Mean])
neww = neww.sort_index()
new_df['da_Mean'] = neww['da_Mean']

# --- Encoding ----------------------------------------------------------------

new_df['dualband'] = new_df['dualband'].map(
    {'Y': 0, 'N': 1, 'T': 2, 'U': 3}).astype(int)
# NOTE(review): the creditcd codes skip 2 ({'Y':0,'N':1,'Z':3}) — kept as-is
# to preserve the original encoding.
new_df['creditcd'] = new_df['creditcd'].map({'Y': 0, 'N': 1, 'Z': 3}).astype(int)
new_df['ethnic'] = new_df['ethnic'].map(
    {'U': 0, 'N': 1, 'H': 2, 'Z': 3, 'F': 4, 'S': 5, 'R': 6, 'O': 7, 'G': 8,
     'J': 9, 'P': 10, 'I': 11, 'B': 12, 'D': 13, 'X': 14, 'C': 15,
     'M': 16}).astype(int)
new_df['marital'] = new_df['marital'].map(
    {'B': 0, 'M': 1, 'U': 2, 'A': 3, 'S': 4, 'Z': 5}).astype(int)
new_df = new_df.drop(['Customer_ID', 'HHstatin', 'area',
                      'kid0_2', 'kid3_5', 'kid6_10', 'kid11_15', 'kid16_17'],
                     axis=1)

# Not yet handled: 'numbcars', 'income', 'dwlltype', 'dwllsize', 'adult',
# 'ownrent', 'hnd_price' — dropped for now.
yy = new_df.copy()
yy = yy.drop(['numbcars', 'income', 'dwlltype', 'dwllsize', 'adult',
              'ownrent', 'hnd_price'], axis=1)

# Split back into train (churn known) and test (churn missing).
train_df = yy.dropna(subset=['churn']).copy()
test_df = yy[yy['churn'].isna()]
X_train = train_df.drop(['churn'], axis=1)
Y_train = train_df['churn']
X_test = test_df.drop(['churn'], axis=1)

# Decision tree classifier; accuracy on the training set (percent).
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)