逻辑回归算法案例分析
良/恶性乳腺癌肿瘤预测
原始数据的下载地址为:https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/
数据预处理
import pandas as pd
import numpy as np

# Column names for the Wisconsin breast-cancer data file, which ships without
# a header row: one sample id, nine cytological features, and the class label.
column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Point at the data file itself; the bare directory URL returns an HTML index
# page, not CSV.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    names=column_names,
)

# Replace the '?' placeholder with the standard missing-value marker.
data = data.replace(to_replace='?', value=np.nan)

# Drop every row that has a missing value in at least one column.
data = data.dropna(how='any')

# Show the (rows, columns) shape of the cleaned data set.
print(data.shape)
处理缺失值后的样本共有683条,特征包括细胞厚度、细胞大小、细胞形状等九个维度。
准备训练测试数据
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# now lives in `sklearn.model_selection`. Fall back for very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Columns 1..9 are the features and column 10 is the class label; column 0
# (the sample id) is deliberately left out. 25% of the rows are held out for
# testing, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[column_names[1:10]],
    data[column_names[10]],
    test_size=0.25,
    random_state=42,
)

# Show the class distribution of the training and test labels. Printing
# explicitly makes both visible, not only the last expression of the cell.
print(y_train.value_counts())
print(y_test.value_counts())
使用逻辑回归进行良/恶性肿瘤预测任务
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardize every feature to zero mean and unit variance so that no single
# feature with a large numeric range dominates the prediction. The scaler is
# fitted on the training data only and then applied unchanged to the test data.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Initialise the classifier. The solver is set explicitly because the L1
# penalty is not supported by the default solver of recent scikit-learn
# releases (lbfgs); liblinear supports it and was the default in older
# releases, so the trained model is unchanged there.
lr = LogisticRegression(C=1.0, penalty='l1', tol=0.01, solver='liblinear')

# Fit the model parameters on the training data, then predict the test labels.
lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)
性能分析
from sklearn.metrics import classification_report

# `score` on a classifier returns the mean accuracy on the given test data,
# so the label says accuracy (准确率) rather than precision (精确率).
print('准确率为:', lr.score(X_test, y_test))

# Per-class precision / recall / F1. The target names follow the sorted class
# labels of the data set (2 = benign, 4 = malignant).
print(classification_report(y_test, lr_y_predict,
                            target_names=['Benign', 'Malignant']))