  • Python Machine Learning and Practice, Code Part One: training, predicting and evaluating data with classic classification and regression models. PS: please point out any mistakes!

    ^(* ̄(oo) ̄)^: 1. In some of the code I standardize the data (also called feature scaling); at evaluation time the inverse_transform function restores it to its original scale.

    2. The code follows the book's order: the data is split first (train_test_split) and standardized afterwards (StandardScaler).

    Splitting first changes the proportions of the data, so standardizing afterwards in this way is not appropriate;

    normally the data should be standardized first and only then split.
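    A minimal sketch of point 1 (the numbers below are made up for illustration, they are not from the book): StandardScaler learns the mean and variance with fit_transform, and inverse_transform maps the standardized values back to the original scale, which is what the regression examples later use to report errors in original units.

    import numpy as np
    from sklearn.preprocessing import StandardScaler
    
    y = np.array([10.0, 20.0, 30.0, 40.0]).reshape(-1, 1)  # StandardScaler expects 2-D input
    
    ss = StandardScaler()
    y_scaled = ss.fit_transform(y)               # standardized: zero mean, unit variance
    y_restored = ss.inverse_transform(y_scaled)  # back to the original scale
    
    print(y_scaled.ravel())
    print(y_restored.ravel())  # [10. 20. 30. 40.]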

    Part One: classic classification models (multiple-choice-style problems, e.g. deciding whether a sample is class A or class B)

    Using linear classifiers for the benign/malignant breast tumor prediction task (LogisticRegression and SGDClassifier)

    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import SGDClassifier
    from sklearn.metrics import classification_report
    column_names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']
    data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',names=column_names)
    
    print(data.isnull().sum())  # inspect missing values (the '?' placeholders are not yet NaN at this point)
    
    data = data.replace(to_replace='?', value=np.nan)
    data = data.dropna(how='any')
    
    print(data.shape)
    
    x_train, x_test, y_train, y_test = train_test_split(data[column_names[1:10]], data[column_names[10]], test_size=0.25, random_state=33)
    print(y_train.value_counts())
    print(y_test.value_counts())
    
    ss = StandardScaler()
    x_train = ss.fit_transform(x_train)
    x_test = ss.transform(x_test)  # reuse the scaler fitted on the training set
    
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    lr_y_predict = lr.predict(x_test)
    print('Accuracy of LR Classifier:', lr.score(x_test, y_test))
    print(classification_report(y_test, lr_y_predict,target_names=['Benign', 'Malignant']))
    
    sgdc = SGDClassifier()
    sgdc.fit(x_train, y_train)
    sgdc_y_predict = sgdc.predict(x_test)
    
    print('Accuracy of SGD Classifier:', sgdc.score(x_test, y_test))
    print(classification_report(y_test, sgdc_y_predict,target_names=['Benign', 'Malignant']))

    Handwritten digit image recognition (classification) with a support vector machine (LinearSVC)

    from sklearn.datasets import load_digits
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import LinearSVC
    from sklearn.metrics import classification_report
    
    digits = load_digits()
    print( digits.data.shape)
    x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.25, random_state=33)
    print(y_train.shape)
    print(y_test.shape)
    
    ss = StandardScaler()
    x_train = ss.fit_transform(x_train)
    x_test = ss.transform(x_test)  # reuse the scaler fitted on the training set
    
    lsvc=LinearSVC()
    lsvc.fit(x_train, y_train)
    y_predict=lsvc.predict(x_test)
    
    print('Accuracy of Linear SVC is:', lsvc.score(x_test, y_test))
    print(classification_report(y_test, y_predict,target_names=digits.target_names.astype(str)))

    News text classification (naive Bayes)

    from sklearn.datasets import fetch_20newsgroups
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.metrics import classification_report
    
    news=fetch_20newsgroups(subset='all')
    print(len(news.data))
    print(news.data[0])
    
    x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state=33)
    
    vec=CountVectorizer()
    x_train=vec.fit_transform(x_train)
    x_test=vec.transform(x_test)
    
    mnb=MultinomialNB()
    mnb.fit(x_train,y_train)
    y_predict=mnb.predict(x_test)
    
    print('Accuracy of Naive Bayes Classifier is:', mnb.score(x_test, y_test))
    print(classification_report(y_test, y_predict,target_names=news.target_names))
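    As a side note, here is a minimal sketch of what CountVectorizer does (the sentences are made up): it learns a vocabulary from the training texts and turns each document into a vector of word counts, and the test texts are only transformed with that already-fitted vocabulary, exactly as in the code above.

    from sklearn.feature_extraction.text import CountVectorizer
    
    train_docs = ['the cat sat on the mat', 'the dog sat']  # made-up training corpus
    test_docs = ['the cat and the dog']
    
    vec = CountVectorizer()
    x_train = vec.fit_transform(train_docs)  # learn the vocabulary and count word occurrences
    x_test = vec.transform(test_docs)        # reuse the same vocabulary for the test texts
    
    print(vec.vocabulary_)    # term -> column index mapping
    print(x_train.toarray())  # one row of word counts per document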

    Class prediction on the Iris dataset (K-nearest neighbors classification)

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.neighbors import  KNeighborsClassifier
    from sklearn.metrics import classification_report
    
    iris=load_iris()
    print(iris.data.shape)
    
    print(iris.DESCR)
    
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=33)
    
    ss = StandardScaler()
    x_train = ss.fit_transform(x_train)
    x_test = ss.transform(x_test)  # reuse the scaler fitted on the training set
    
    knc=KNeighborsClassifier()
    knc.fit(x_train, y_train)
    y_predict=knc.predict(x_test)
    
    
    print('Accuracy of K-nearest Neighbour Classifier is:', knc.score(x_test, y_test))
    print(classification_report(y_test, y_predict,target_names=iris.target_names))

    Predicting the survival of Titanic passengers (decision tree)

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import classification_report
    
    titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
    titanic.head()
    
    titanic.info()
    
    x = titanic[['pclass', 'age', 'sex']].copy()  # take a copy so the age column can be filled without pandas warnings
    y=titanic['survived']
    
    x.info()
    
    x['age'] = x['age'].fillna(x['age'].mean())  # fill missing ages with the mean age
    
    x.info()
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)
    
    vec=DictVectorizer(sparse=False)
    
    x_train = vec.fit_transform(x_train.to_dict(orient='records'))
    print(vec.feature_names_)
    
    x_test = vec.transform(x_test.to_dict(orient='records'))  # only transform the test set with the fitted vectorizer
    
    dtc=DecisionTreeClassifier()
    dtc.fit(x_train, y_train)
    y_predict=dtc.predict(x_test)
    
    print(dtc.score(x_test,y_test))
    print(classification_report(y_test, y_predict,target_names = ['died', 'survived']))
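    A minimal sketch of how DictVectorizer treats the mixed features (the two passengers below are hypothetical): categorical values such as pclass and sex are one-hot encoded, while numeric values such as age pass through unchanged, which is why the decision tree above can work directly on the resulting matrix.

    from sklearn.feature_extraction import DictVectorizer
    
    rows = [{'pclass': '1st', 'age': 29.0, 'sex': 'female'},  # hypothetical passengers
            {'pclass': '3rd', 'age': 30.0, 'sex': 'male'}]
    
    vec = DictVectorizer(sparse=False)
    encoded = vec.fit_transform(rows)
    
    print(vec.feature_names_)  # ['age', 'pclass=1st', 'pclass=3rd', 'sex=female', 'sex=male']
    print(encoded)             # numeric matrix ready for the tree model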

    Predicting the survival of Titanic passengers (ensemble classifiers: random forest and gradient boosting decision trees)

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.metrics import classification_report
    
    titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
    
    x = titanic[['pclass', 'age', 'sex']].copy()
    y=titanic['survived']
    
    x['age'] = x['age'].fillna(x['age'].mean())
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)
    
    vec=DictVectorizer(sparse=False)
    x_train = vec.fit_transform(x_train.to_dict(orient='records'))
    x_test = vec.transform(x_test.to_dict(orient='records'))
    
    dtc=DecisionTreeClassifier()
    dtc.fit(x_train, y_train)
    dtc_y_predict=dtc.predict(x_test)
    
    rfc=RandomForestClassifier()
    rfc.fit(x_train,y_train)
    rfc_y_predict=rfc.predict(x_test)
    
    gbc=GradientBoostingClassifier()
    gbc.fit(x_train,y_train)
    gbc_y_predict=gbc.predict(x_test)
    
    
    print('Accuracy of decision tree is:', dtc.score(x_test, y_test))
    print(classification_report(y_test, dtc_y_predict))  # y_true first, then y_pred
    
    
    print('Accuracy of random forest classifier is:', rfc.score(x_test, y_test))
    print(classification_report(y_test, rfc_y_predict))
    
    
    print('Accuracy of gradient boosting classifier is:', gbc.score(x_test, y_test))
    print(classification_report(y_test, gbc_y_predict))

    Part Two: classic regression models (calculation-style problems, e.g. predicting a numeric value)

    Predicting house prices with linear regressors (LinearRegression and SGDRegressor)

    The inverse_transform calls toward the end of the code restore the standardized data to its original scale.

    In the middle, the target arrays have to be reshaped (see the short sketch below).
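    A minimal sketch of why the reshape is needed (with made-up values): StandardScaler only accepts 2-D arrays, so the 1-D target vector has to be turned into a column with reshape(-1, 1) before it can be standardized.

    import numpy as np
    from sklearn.preprocessing import StandardScaler
    
    y = np.array([21.6, 34.7, 24.0])  # made-up target values, shape (3,)
    print(y.shape)                    # (3,)
    
    y_col = y.reshape(-1, 1)          # column vector, shape (3, 1)
    print(y_col.shape)                # (3, 1)
    
    ss_y = StandardScaler()
    y_scaled = ss_y.fit_transform(y_col)  # works; passing the 1-D y directly would raise an error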

    from sklearn.datasets import load_boston
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import SGDRegressor
    from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
    import numpy as np
    
    boston =load_boston()
    print(boston.DESCR)  # view the dataset description
    
    x=boston.data
    y=boston.target
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)  # split the data
    # Note: standardization should come before splitting; following the book, this code does it the other way round.
    print("The max target value is",np.max(boston.target))
    print("The min target value is",np.min(boston.target))
    print("The average target value is ",np.mean(boston.target))#输出标签集的最大值最小值 平均值
    
    
    # The target values span a fairly wide range, so standardize both the features and the targets.
    ss_X = StandardScaler()
    ss_y = StandardScaler()
    
    x_train = ss_X.fit_transform(x_train)
    x_test = ss_X.transform(x_test)
    
    y_train = ss_y.fit_transform(y_train.reshape(-1, 1))  # StandardScaler expects 2-D input
    y_test = ss_y.transform(y_test.reshape(-1, 1))
    
    # Predict house prices with the two linear regressors.
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    lr_y_predict = lr.predict(x_test)
    
    sgdr = SGDRegressor()
    sgdr.fit(x_train, y_train.ravel())  # SGDRegressor expects a 1-D target
    sgdr_y_predict = sgdr.predict(x_test)
    
    print('The value of default measurement of LinearRegression is', lr.score(x_test, y_test))  # the regressor's built-in R-squared score
    print('The value of R-squared of LinearRegression is', r2_score(y_test, lr_y_predict))  # regression evaluation metric
    print('The mean squared error of LinearRegression is', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict)))  # mean squared error on the original scale
    print('The mean absolute error of LinearRegression is', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict)))  # mean absolute error on the original scale
    
    print('The value of default measurement of SGDRegressor is', sgdr.score(x_test, y_test))
    print('The value of R-squared of SGDRegressor is', r2_score(y_test, sgdr_y_predict))
    print('The mean squared error of SGDRegressor is', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict.reshape(-1, 1))))
    print('The mean absolute error of SGDRegressor is', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict.reshape(-1, 1))))
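    For reference, a tiny made-up example (not from the book) of what the three regression metrics measure:

    import numpy as np
    from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
    
    y_true = np.array([3.0, 5.0, 7.0])  # made-up true values
    y_pred = np.array([2.5, 5.0, 8.0])  # made-up predictions
    
    print(mean_squared_error(y_true, y_pred))   # mean of squared errors: (0.5**2 + 0**2 + 1**2) / 3
    print(mean_absolute_error(y_true, y_pred))  # mean of absolute errors: (0.5 + 0 + 1) / 3
    print(r2_score(y_true, y_pred))             # 1 - (sum of squared errors / total variance of y_true)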

    Predicting house prices with support vector machine regressors using three different kernel functions (linear, poly, rbf)

    ^(* ̄(oo) ̄)^: The first time I typed this code I did not standardize the data.

    The linear kernel was not affected by this,

    but training the poly kernel on unstandardized data gets stuck,

    and the prediction accuracy of the rbf kernel drops sharply.
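    The rbf observation can be checked with a small comparison sketch (this is my own check, not code from the book, and it assumes the same Boston housing data used throughout this post): the same RBF SVR is trained once on the raw features and once on the standardized ones, and the test R-squared should be clearly lower in the unstandardized case.

    from sklearn.datasets import load_boston
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVR
    
    boston = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(
        boston.data, boston.target, test_size=0.25, random_state=33)
    
    # RBF SVR on the raw, unstandardized features
    raw_svr = SVR(kernel='rbf')
    raw_svr.fit(x_train, y_train)
    print('rbf on raw features:', raw_svr.score(x_test, y_test))
    
    # RBF SVR on standardized features
    ss = StandardScaler()
    x_train_s = ss.fit_transform(x_train)
    x_test_s = ss.transform(x_test)
    scaled_svr = SVR(kernel='rbf')
    scaled_svr.fit(x_train_s, y_train)
    print('rbf on standardized features:', scaled_svr.score(x_test_s, y_test))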

    from sklearn.datasets import load_boston
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVR
    from sklearn.metrics import r2_score
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error
    
    boston = load_boston()
    x = boston.data
    y = boston.target
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)
    
    ss_x = StandardScaler()
    ss_y = StandardScaler()
    
    x_train = ss_x.fit_transform(x_train)
    x_test = ss_x.transform(x_test)
    y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
    y_test = ss_y.transform(y_test.reshape(-1, 1))
    
    linear_svr = SVR(kernel='linear')
    linear_svr.fit(x_train, y_train.ravel())  # SVR expects a 1-D target
    linear_svr_y_predict = linear_svr.predict(x_test)
    
    
    poly_svr = SVR(kernel='poly')
    poly_svr.fit(x_train, y_train.ravel())
    poly_svr_y_predict = poly_svr.predict(x_test)
    
    rbf_svr = SVR(kernel='rbf')
    rbf_svr.fit(x_train, y_train.ravel())
    rbf_svr_y_predict = rbf_svr.predict(x_test)
    
    
    print('I AM Linear_SVR')
    print('score',linear_svr.score(x_test,y_test))
    print('R-squared',r2_score(y_test,linear_svr_y_predict))
    print('mean squared',mean_squared_error(y_test,linear_svr_y_predict))
    print('mean absolute',mean_absolute_error(y_test,linear_svr_y_predict))
    
    print('I AM POLY_SVR')
    print('score',poly_svr.score(x_test,y_test))
    print('R-squared',r2_score(y_test,poly_svr_y_predict))
    print('mean squared',mean_squared_error(y_test,poly_svr_y_predict))
    print('mean absolute',mean_absolute_error(y_test,poly_svr_y_predict))
    
    print('I AM RBF_SVR')
    print('score',rbf_svr.score(x_test,y_test))
    print('R-squared',r2_score(y_test,rbf_svr_y_predict))
    print('mean squared',mean_squared_error(y_test,rbf_svr_y_predict))
    print('mean absolute',mean_absolute_error(y_test,rbf_svr_y_predict))

    Predicting house prices with two differently configured K-nearest neighbors regressors (plain arithmetic averaging vs. distance-weighted averaging; a small sketch of the difference follows after the code)

    from sklearn.datasets import load_boston
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    #________________________________________________________________________________________
    from sklearn.neighbors import KNeighborsRegressor  # K-nearest neighbors regressor
    
    from sklearn.metrics import r2_score
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error
    
    boston = load_boston()
    x = boston.data
    y = boston.target
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)
    
    ss_x = StandardScaler()
    ss_y = StandardScaler()
    
    x_train = ss_x.fit_transform(x_train)
    x_test = ss_x.transform(x_test)
    y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
    y_test = ss_y.transform(y_test.reshape(-1, 1))
    
    #________________________________________________________________________________________
    uni_knr = KNeighborsRegressor(weights='uniform')  # KNN regressor that predicts with the plain average of the neighbors
    uni_knr.fit(x_train,y_train)
    uni_knr_y_predict=uni_knr.predict(x_test)
    
    dis_knr = KNeighborsRegressor(weights='distance')  # KNN regressor that weights neighbors by inverse distance
    dis_knr.fit(x_train, y_train)
    dis_knr_y_predict = dis_knr.predict(x_test)
    
    print('I AM uni_knr')
    print('score',uni_knr.score(x_test,y_test))
    print('R-squared',r2_score(y_test,uni_knr_y_predict))
    print('mean squared',mean_squared_error(y_test,uni_knr_y_predict))
    print('mean absolute',mean_absolute_error(y_test,uni_knr_y_predict))
    
    print('I AM dis_knr')
    print('score',dis_knr.score(x_test,y_test))
    print('R-squared', r2_score(y_test, dis_knr_y_predict))
    print('mean squared', mean_squared_error(y_test, dis_knr_y_predict))
    print('mean absolute', mean_absolute_error(y_test, dis_knr_y_predict))
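    As a rough sketch of the difference between the two configurations (the numbers are made up): with weights='uniform' the prediction is the plain mean of the neighbors' targets, while with weights='distance' each neighbor is weighted by the inverse of its distance, so closer neighbors dominate the prediction.

    import numpy as np
    
    neighbor_targets = np.array([20.0, 24.0, 40.0])  # made-up targets of the 3 nearest neighbors
    neighbor_distances = np.array([0.5, 1.0, 4.0])   # their made-up distances to the query point
    
    uniform_pred = neighbor_targets.mean()                    # plain arithmetic mean -> 28.0
    w = 1.0 / neighbor_distances                              # inverse-distance weights
    distance_pred = np.sum(w * neighbor_targets) / np.sum(w)  # weighted average, pulled toward the closest neighbors
    
    print(uniform_pred)
    print(distance_pred)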

    Predicting house prices with a single regression tree

    from sklearn.datasets import load_boston
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    
    #_________________________________________________________
    
    from sklearn.tree import DecisionTreeRegressor  # import the regression tree model
    
    from sklearn.metrics import r2_score
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error
    
    boston = load_boston()
    x = boston.data
    y = boston.target
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)
    
    ss_x = StandardScaler()
    ss_y = StandardScaler()
    
    x_train = ss_x.fit_transform(x_train)
    x_test = ss_x.transform(x_test)
    y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
    y_test = ss_y.transform(y_test.reshape(-1, 1))
    
    #____________________________________________________________
    dtr=DecisionTreeRegressor()
    dtr.fit(x_train,y_train)
    dtr_y_predict=dtr.predict(x_test)
    
    print('I AM DecisionTreeRegressor')
    print('score',dtr.score(x_test,y_test))
    print('R-squared',r2_score(y_test,dtr_y_predict))
    print('mean squared',mean_squared_error(y_test,dtr_y_predict))
    print('mean absolute',mean_absolute_error(y_test,dtr_y_predict))

    Predicting house prices with three ensemble regressors (RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor)

    from sklearn.datasets import load_boston
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    
    #_______________________________________________________________________________________
    
    from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor
    
    from sklearn.metrics import r2_score
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error
    
    boston = load_boston()
    x = boston.data
    y = boston.target
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)
    
    ss_x = StandardScaler()
    ss_y = StandardScaler()
    
    x_train = ss_x.fit_transform(x_train)
    x_test = ss_x.transform(x_test)
    y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
    y_test = ss_y.transform(y_test.reshape(-1, 1))
    
    rfr = RandomForestRegressor()
    rfr.fit(x_train, y_train.ravel())  # ravel to a 1-D target to avoid conversion warnings
    rfr_y_predict = rfr.predict(x_test)
    
    etr = ExtraTreesRegressor()
    etr.fit(x_train, y_train.ravel())
    etr_y_predict = etr.predict(x_test)
    
    gbr = GradientBoostingRegressor()
    gbr.fit(x_train, y_train.ravel())
    gbr_y_predict = gbr.predict(x_test)
    
    print('I AM RandomForestRegressor')
    print('score',rfr.score(x_test,y_test))
    print('R-squared',r2_score(y_test,rfr_y_predict))
    print('mean squared',mean_squared_error(y_test,rfr_y_predict))
    print('mean absolute',mean_absolute_error(y_test,rfr_y_predict))
    
    print('I AM ExtraTreesRegressor')
    print('score',etr.score(x_test,y_test))
    print('R-squared',r2_score(y_test,etr_y_predict))
    print('mean squared',mean_squared_error(y_test,etr_y_predict))
    print('mean absolute',mean_absolute_error(y_test,etr_y_predict))
    
    print('I AM GradientBoostingRegressor')
    print('score',gbr.score(x_test,y_test))
    print('R-squared',r2_score(y_test,gbr_y_predict))
    print('mean squared',mean_squared_error(y_test,gbr_y_predict))
    print('mean absolute',mean_absolute_error(y_test,gbr_y_predict))