zoukankan      html  css  js  c++  java
  • 线性回归和Ridge回归

    网址:https://www.cnblogs.com/pinard/p/6023000.html

    线性回归和交叉验证

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from sklearn import datasets,linear_model

    读取Excel文件(.xlsx)里面的数据

    # Load the spreadsheet, then select the four feature columns and the target column.
    # NOTE(review): the path contains no separators; the backslashes were presumably lost
    # when the page was extracted (perhaps r"F:\data\CCPP\Folds5x2_pp.xlsx") - confirm before running.
    data = pd.read_excel("F:dataCCPPFolds5x2_pp.xlsx");
    x = data[['AT', 'V', 'AP', 'RH']]
    y = data[['PE']]

    划分训练集和测试集

    # sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
    # lives in sklearn.model_selection, which the later snippets in this file already use.
    from sklearn.model_selection import train_test_split

    x为待划分的样本特征集合,y为待划分的样本标签;x,y经train_test_split划分后,x_train为训练集的样本特征集合,x_test为测试集的样本特征集合,y_train为训练集的样本标签,y_test为测试集的样本标签

    x_train,x_test,y_train,y_test = train_test_split(x, y, random_state=1)#可以通过test_size来设置划分比列

    导入线性模型

    from sklearn.linear_model import LinearRegression
    linreg = LinearRegression()#线性回归函数

    拟合

    linreg.fit(x_train,y_train)
    print("linreg.intercept_",linreg.intercept_,"linreg.coef_",linreg.coef_)

    模型拟合测试集

    y_pred = linreg.predict(x_test)
    from sklearn import metrics

    用scikit-learn计算MSE

    print("MSE:",metrics.mean_squared_error(y_test, y_pred))

    用scikit-learn计算RMSE

    print("RMSE:",np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

    利用交叉验证来优化模型

    from sklearn.model_selection import cross_val_predict

    cv为s折交叉验证的折数,此处s=100

    predicted = cross_val_predict(linreg, x, y, cv=100)
    print("predicted:",predicted.shape)

    用scikit-learn计算MSE

    print("MSE:",metrics.mean_squared_error(y, predicted))

    用scikit-learn计算RMSE

    print ("RMSE:",np.sqrt(metrics.mean_squared_error(y, predicted)))

    画出图像

    fig, ax = plt.subplots()
    ax.scatter(y, predicted)
    ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()

    Ridge回归用scikit-learn选择Ridge回归超参数α

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from sklearn import datasets,linear_model
    from sklearn import metrics

    读取Excel文件(.xlsx)里面的数据

    # Load the spreadsheet, then select the four feature columns and the target column.
    # NOTE(review): the path contains no separators; the backslashes were presumably lost
    # when the page was extracted (perhaps r"F:\data\CCPP\Folds5x2_pp.xlsx") - confirm before running.
    data = pd.read_excel("F:dataCCPPFolds5x2_pp.xlsx");
    x = data[['AT', 'V', 'AP', 'RH']]
    y = data[['PE']]

    划分训练集和测试集

    from sklearn.model_selection import train_test_split

    x为待划分的样本特征集合,y为待划分的样本标签;x,y经train_test_split划分后,x_train为训练集的样本特征集合,x_test为测试集的样本特征集合,y_train为训练集的样本标签,y_test为测试集的样本标签

    x_train,x_test,y_train,y_test = train_test_split(x, y, random_state=1)#可以通过test_size来设置划分比列
    # Sweep 200 regularization strengths, log-spaced between 1e-10 and 1e-2, and
    # record the held-out error and the fitted coefficients for each one.
    n_alphas = 200
    alphas = np.logspace(-10, -2, n_alphas)
    print("alphas:", alphas)
    clf = linear_model.Ridge(fit_intercept=False)
    coefs = []
    for a in alphas:
        # Set the regularization strength used for this iteration.
        clf.set_params(alpha=a)
        # Fit a ridge regression for this alpha and score it on the test split.
        clf.fit(x_train, y_train)
        y_predict = clf.predict(x_test)
        # Mean squared error (not a variance); the documented argument order is (y_true, y_pred).
        error = metrics.mean_squared_error(y_test, y_predict)
        print("error:", error)
        # Keep the coefficient vector for this alpha. The target is selected as a
        # one-column DataFrame, so coef_ is expected to be (1, n_features); ravel()
        # flattens it so the collected list forms a 2-D (n_alphas, n_features)
        # input that ax.plot(alphas, coefs) can draw as one line per feature.
        coefs.append(clf.coef_.ravel())
    print("coefs:", coefs)
    from sklearn import metrics

    ax = plt.gca()

    ax.plot(alphas, coefs)

    #将alpha的值取对数便于画图

    ax.set_xscale('log')

    #翻转x轴的大小方向,让alpha从大到小显示

    ax.set_xlim(ax.get_xlim()[::-1])

    plt.xlabel('alpha')

    plt.ylabel('weights')

    plt.title('Ridge coefficients as a function of the regularization')

    plt.axis('tight')

    plt.show()

    自己编写的逻辑回归代码

    定义LR回归模型

    class LogisticReression:
        """Binary logistic-regression classifier trained with per-sample gradient updates.

        The class name (including its spelling) is kept unchanged because the
        script instantiates the model under this name.
        """

        def __init__(self, max_iter=200, learning_rate=0.01):
            """Store the training hyper-parameters.

            Args:
                max_iter: Number of full passes over the training samples.
                learning_rate: Step size applied to every per-sample update.
            """
            # The constructor has to be spelled __init__; a method named plain
            # "init" is never invoked on construction, which would leave both
            # attributes unset when train() reads them.
            self.max_iter = max_iter
            self.learning_rate = learning_rate

        def sigmoid(self, x):
            """Return the logistic function 1 / (1 + e^-x) of ``x``."""
            # np.exp accepts the one-element arrays produced by np.dot in train()
            # and saturates to inf for very negative x instead of raising
            # OverflowError; the overflow warning is silenced because the
            # resulting value (0.0) is the correct limit.
            with np.errstate(over="ignore"):
                return 1 / (1 + np.exp(-x))

        def data_matrix(self, X):
            """Return the rows of ``X`` as lists with a leading 1.0 bias entry."""
            # Each sample is unpacked so a row becomes [1.0, f1, f2, ...] rather
            # than [1.0, <whole sample>].
            return [[1.0, *sample] for sample in X]

        def train(self, X, y):
            """Fit ``self.weights`` on samples ``X`` and 0/1 labels ``y``.

            Args:
                X: Iterable of feature rows; it must support ``len()``.
                y: Labels indexable by sample position.
            """
            data_mat = self.data_matrix(X)  # m rows, n columns (bias + features)
            # One weight per column, stored as an (n, 1) column vector.
            self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32)
            for _ in range(self.max_iter):
                for i, row in enumerate(data_mat):
                    result = self.sigmoid(np.dot(row, self.weights))
                    error = y[i] - result
                    # Move the weights along this sample's features, scaled by
                    # the prediction error and the learning rate.
                    self.weights += self.learning_rate * error * np.transpose([row])
            print('LR模型学习率={},最大迭代次数={}'.format(self.learning_rate, self.max_iter))

        def accuracy(self, X_test, y_test):
            """Return the fraction of ``X_test`` rows classified as in ``y_test``.

            A positive linear score counts as class 1 and a negative score as
            class 0; a score of exactly zero is counted as a miss.
            """
            right = 0
            X_test = self.data_matrix(X_test)
            for x, y in zip(X_test, y_test):
                result = np.dot(x, self.weights)
                if (result > 0 and y == 1) or (result < 0 and y == 0):
                    right += 1
            return right / len(X_test)

    构建数据

    def create_data():
        """Build a two-feature, two-class dataset from the iris data.

        Returns:
            A ``(features, labels)`` pair taken from the first 100 rows: the
            first two columns (sepal length, sepal width) and the label column.
        """
        iris = load_iris()
        frame = pd.DataFrame(iris.data, columns=iris.feature_names)
        frame['label'] = iris.target
        frame.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
        # Keep the first 100 rows and only columns 0, 1 and the last (label) column.
        subset = np.array(frame.iloc[:100, [0, 1, -1]])
        features = subset[:, :2]
        labels = subset[:, -1]
        return features, labels
    X, y = create_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    训练数据

    LR = LogisticReression()
    LR.train(X_train, y_train)

    计算测试精度

    score = LR.accuracy(X_test, y_test)
    print("score:",score)
    x_ponits = np.arange(3, 9)
    y_ = -(LR.weights[1]*x_ponits + LR.weights[0])/LR.weights[2]
    plt.plot(x_ponits, y_)

    绘制图

    plt.scatter(X[:50,0],X[:50,1], label='0')
    plt.scatter(X[50:,0],X[50:,1], label='1')
    plt.legend()
    plt.show()

    scikit-learn

    # from sklearn import datasets

    # from sklearn.linear_model import LogisticRegression

    # from sklearn.model_selection import train_test_split

    # from sklearn.preprocessing import StandardScaler

    # from sklearn.metrics import accuracy_score

    # import matplotlib.pyplot as plt

    # from matplotlib.colors import ListedColormap

    # from mlxtend.plotting import plot_decision_regions

    # from sklearn.metrics import accuracy_score

    # import numpy as np

    # iris = datasets.load_iris()

    # x = iris.data[:,[2,3]]

    # y = iris.target

    # print("y:",y)

    # X_train,X_test,y_train,y_test = train_test_split(x , y, test_size=0.3, random_state = 1)

    # print("x_train:",X_test.shape)

    # print("y_train;",y_test.shape)

    # #sigmoid 函数

    # def sigmoid(z):

    # return 1.0/(1.0+np.exp(-z))

    # sc = StandardScaler()

    # sc.fit(X_train)

    # X_train_std = sc.transform(X_train)

    # X_test_std = sc.transform(X_test)

    # print("X_test:",X_test)

    # Ir = LogisticRegression(C=1000.0,random_state=1)

    # Ir.fit(X_train_std,y_train)

    # y_pred = Ir.predict(X_test)

    # print("y_test:",y_test)

    # print("y_pred:",y_pred)

    # print("score:",accuracy_score(y_test, y_pred))

    # print("Ir.coef:",Ir.coef_)

    # X_combined_std = np.vstack((X_train_std,X_test_std))

    # y_combined = np.hstack((y_train,y_test))

    # plot_decision_regions(X=X_combined_std,y=y_combined,clf=Ir,legend = 2)

    # plt.xlabel('petal length [standardized]')

    # plt.ylabel('petal width [standardized]')

    # plt.legend(loc='upper right')

    # plt.savefig('Iris.png')

    # plt.show()

    from math import exp

    import numpy as np

    import pandas as pd

    import matplotlib.pyplot as plt

    from sklearn.datasets import load_iris

    from sklearn.model_selection import train_test_split

    #定义LR回归模型

    class LogisticReression:
        """Binary logistic-regression classifier trained with per-sample gradient updates.

        The class name (including its spelling) is kept unchanged because the
        script instantiates the model under this name.
        """

        def __init__(self, max_iter=200, learning_rate=0.01):
            """Store the training hyper-parameters.

            Args:
                max_iter: Number of full passes over the training samples.
                learning_rate: Step size applied to every per-sample update.
            """
            # The constructor has to be spelled __init__; a method named plain
            # "init" is never invoked on construction, which would leave both
            # attributes unset when train() reads them.
            self.max_iter = max_iter
            self.learning_rate = learning_rate

        def sigmoid(self, x):
            """Return the logistic function 1 / (1 + e^-x) of ``x``."""
            # np.exp handles the one-element arrays that np.dot yields in train()
            # and saturates to inf for very negative x instead of raising
            # OverflowError; the overflow warning is silenced because the
            # resulting value (0.0) is the correct limit.
            with np.errstate(over="ignore"):
                return 1 / (1 + np.exp(-x))

        def data_matrix(self, X):
            """Return the rows of ``X`` as lists with a leading 1.0 bias entry."""
            return [[1.0, *sample] for sample in X]

        def train(self, X, y):
            """Fit ``self.weights`` on samples ``X`` and 0/1 labels ``y``.

            Args:
                X: Iterable of feature rows; it must support ``len()``.
                y: Labels indexable by sample position.
            """
            data_mat = self.data_matrix(X)  # m*n: m rows, n columns (bias + features)
            # One weight per column, stored as an (n, 1) column vector.
            self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32)
            for _ in range(self.max_iter):
                for i, row in enumerate(data_mat):
                    result = self.sigmoid(np.dot(row, self.weights))
                    error = y[i] - result
                    # Move the weights along this sample's features, scaled by
                    # the prediction error and the learning rate.
                    self.weights += self.learning_rate * error * np.transpose([row])
            print('LR模型学习率={},最大迭代次数={}'.format(self.learning_rate, self.max_iter))

        def accuracy(self, X_test, y_test):
            """Return the fraction of ``X_test`` rows classified as in ``y_test``.

            A positive linear score counts as class 1 and a negative score as
            class 0; a score of exactly zero is counted as a miss.
            """
            right = 0
            X_test = self.data_matrix(X_test)
            for x, y in zip(X_test, y_test):
                result = np.dot(x, self.weights)
                if (result > 0 and y == 1) or (result < 0 and y == 0):
                    right += 1
            return right / len(X_test)

    #构建数据

    def create_data():
        """Build a two-feature, two-class dataset from the iris data.

        Returns:
            A ``(features, labels)`` pair taken from the first 100 rows: the
            first two columns (sepal length, sepal width) and the label column.
        """
        iris = load_iris()
        table = pd.DataFrame(iris.data, columns=iris.feature_names)
        table['label'] = iris.target
        table.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
        # Keep the first 100 rows and only columns 0, 1 and the last (label) column.
        selected = np.array(table.iloc[:100, [0, 1, -1]])
        return selected[:, :2], selected[:, -1]

    X, y = create_data()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    #训练数据

    LR = LogisticReression()

    LR.train(X_train, y_train)

    #计算测试精度

    score = LR.accuracy(X_test, y_test)

    print("score:",score)

    x_ponits = np.arange(3, 9)

    y_ = -(LR.weights[1]*x_ponits + LR.weights[0])/LR.weights[2]

    plt.plot(x_ponits, y_)

    # 绘制图

    plt.scatter(X[:50,0],X[:50,1], label='0')

    plt.scatter(X[50:,0],X[50:,1], label='1')

    plt.legend()

    plt.show()

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn import metrics
    import seaborn as sn
    import matplotlib.pyplot as plt

    第一步构建数据

    candidates = {'gmat': [780,750,690,710,680,730,690,720,740,690,610,690,710,680,770,610,580,650,540,590,620,600,550,550,570,670,660,580,650,660,640,620,660,660,680,650,670,580,590,690],
    'gpa': [4,3.9,3.3,3.7,3.9,3.7,2.3,3.3,3.3,1.7,2.7,3.7,3.7,3.3,3.3,3,2.7,3.7,2.7,2.3,3.3,2,2.3,2.7,3,3.3,3.7,2.3,3.7,3.3,3,2.7,4,3.3,3.3,2.3,2.7,3.3,1.7,3.7],
    'work_experience': [3,4,3,5,4,6,1,4,5,1,3,5,6,4,3,1,4,6,2,3,2,1,4,1,2,6,4,2,6,5,1,2,4,6,5,1,2,1,4,5],
    'admitted': [1,1,1,1,1,1,0,1,1,0,0,1,1,1,1,0,0,1,0,0,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0,0,0,1] }
    df = pd.DataFrame(candidates,columns= ['gmat', 'gpa','work_experience','admitted'])
    X = df[['gmat', 'gpa','work_experience']]
    y = df['admitted']

    75%的数据用来做训练集,25%的数据用作测试集

    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
    logistic_regression= LogisticRegression()

    训练

    logistic_regression.fit(X_train,y_train)

    预测

    y_pred=logistic_regression.predict(X_test)

    绘制热力图

    confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    sn.heatmap(confusion_matrix, annot=True)
    plt.show()
    print('精度: ',metrics.accuracy_score(y_test, y_pred))

  • 相关阅读:
    红黑树——以无厚入有间
    红黑树——依天理以神遇
    B-树 分合之道
    B-树 动机与结构
    云心出岫——Splay Tree
    双散列和再散列暨散列表总结
    开放定址法——平方探测(Quadratic Probing)
    [LeetCode 109]
    [LeetCode 110]
    [LeetCode 111]
  • 原文地址:https://www.cnblogs.com/131415-520/p/11741303.html
Copyright © 2011-2022 走看看