Logistic regression code demo

Data files used by the programs: https://files.cnblogs.com/files/henuliulei/%E5%9B%9E%E5%BD%92%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE.zip

Concepts

Logistic regression models the probability of the positive class with the sigmoid hypothesis

    $h_\theta(x) = g(\theta^T x) = \dfrac{1}{1 + e^{-\theta^T x}}$

and fits the parameters by minimizing the cross-entropy cost

    $J(\theta) = -\dfrac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log h_\theta(x^{(i)}) + (1 - y^{(i)}) \log\big(1 - h_\theta(x^{(i)})\big) \right]$
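A quick plot makes the shape of the sigmoid concrete; this snippet is my own illustration, not part of the original demo:

    import numpy as np
    import matplotlib.pyplot as plt

    # g(z) = 1 / (1 + e^(-z)) rises from 0 to 1 and crosses 0.5 at z = 0
    z = np.linspace(-10, 10, 200)
    plt.plot(z, 1.0 / (1 + np.exp(-z)))
    plt.title('sigmoid')
    plt.show()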

Partial derivative of the cost function with respect to the parameters:

    $\dfrac{\partial J(\theta)}{\partial \theta_j} = \dfrac{1}{m} \sum_{i=1}^{m} \big( h_\theta(x^{(i)}) - y^{(i)} \big)\, x_j^{(i)}$

The update formula that gradient descent arrives at is

    $\theta_j := \theta_j - \alpha \, \dfrac{1}{m} \sum_{i=1}^{m} \big( h_\theta(x^{(i)}) - y^{(i)} \big)\, x_j^{(i)}$
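The derivative formula can be sanity-checked numerically. The sketch below compares the analytic gradient with centered finite differences on random made-up data; all names (X, y, theta) are illustrative, not from the original post:

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1 + np.exp(-z))

    def cost(X, y, theta):
        h = sigmoid(X @ theta)
        return -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))

    def grad(X, y, theta):
        # Analytic gradient: (1/m) * X^T (h - y)
        return X.T @ (sigmoid(X @ theta) - y) / len(X)

    rng = np.random.default_rng(0)
    X = rng.normal(size=(20, 3))
    y = (rng.random(20) < 0.5).astype(float)
    theta = rng.normal(size=3)

    eps = 1e-6
    numeric = np.array([(cost(X, y, theta + eps * e) - cost(X, y, theta - eps * e)) / (2 * eps)
                        for e in np.eye(3)])
    print(np.allclose(numeric, grad(X, y, theta)))  # expected: True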

A multi-class problem can be reduced to binary classification problems: in the one-vs-rest scheme, one binary classifier is trained per class and the class with the most confident classifier wins, as sketched below.
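A minimal one-vs-rest sketch of that reduction (my own illustration on synthetic data, not from the original demo):

    import numpy as np
    from sklearn.datasets import make_blobs
    from sklearn.linear_model import LogisticRegression

    X, y = make_blobs(n_samples=300, centers=3, random_state=0)  # 3 classes
    # One binary classifier per class: class k vs the rest
    models = [LogisticRegression().fit(X, (y == k).astype(int)) for k in range(3)]
    # Probability of "class k" from each classifier, then argmax across classes
    probs = np.column_stack([m.predict_proba(X)[:, 1] for m in models])
    pred = probs.argmax(axis=1)
    print((pred == y).mean())  # training accuracy of the hand-rolled scheme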

Regularization helps prevent overfitting. With an L2 penalty, the cost function and its derivative become

    $J(\theta) = -\dfrac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log h_\theta(x^{(i)}) + (1 - y^{(i)}) \log\big(1 - h_\theta(x^{(i)})\big) \right] + \dfrac{\lambda}{2m} \sum_{j=1}^{n} \theta_j^2$

    $\dfrac{\partial J(\theta)}{\partial \theta_j} = \dfrac{1}{m} \sum_{i=1}^{m} \big( h_\theta(x^{(i)}) - y^{(i)} \big)\, x_j^{(i)} + \dfrac{\lambda}{m} \theta_j$  (for $j \ge 1$; the bias $\theta_0$ is not penalized)
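Written out in NumPy, the regularized cost and gradient look like this (a sketch under the convention above that the bias theta[0] is not penalized; lam is the regularization strength):

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1 + np.exp(-z))

    def reg_cost(X, y, theta, lam):
        h = sigmoid(X @ theta)
        penalty = lam / (2 * len(X)) * np.sum(theta[1:] ** 2)  # skip the bias
        return -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h)) + penalty

    def reg_grad(X, y, theta, lam):
        g = X.T @ (sigmoid(X @ theta) - y) / len(X)
        g[1:] += lam / len(X) * theta[1:]  # penalty gradient, bias excluded
        return g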

Precision, recall, and the F1 score

    $P = \dfrac{TP}{TP + FP}, \qquad R = \dfrac{TP}{TP + FN}, \qquad F_1 = \dfrac{2PR}{P + R}$

If we mainly want the predictions we make to be correct, precision is what matters; if we want to recover as many of the true positives as possible, recall matters more. The F1 score balances the two.
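Computed from raw counts on a small made-up example, and cross-checked against sklearn:

    import numpy as np
    from sklearn.metrics import f1_score

    y_true = np.array([1, 0, 1, 1, 0, 1, 0, 0])
    y_pred = np.array([1, 0, 1, 0, 0, 1, 1, 0])
    tp = np.sum((y_pred == 1) & (y_true == 1))  # true positives
    fp = np.sum((y_pred == 1) & (y_true == 0))  # false positives
    fn = np.sum((y_pred == 0) & (y_true == 1))  # false negatives
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    print(precision, recall, f1)       # 0.75 0.75 0.75
    print(f1_score(y_true, y_pred))    # should match f1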

Why regularization can prevent overfitting: the penalty term pushes the weights toward zero, so the model cannot rely on extreme coefficients to chase noise in the training set, and the resulting smoother decision boundary generalizes better.

Linear logistic regression via gradient descent

    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.metrics import classification_report  # classification metrics
    from sklearn import preprocessing  # feature standardization

    # Whether to standardize the data. Standardization usually improves the fit;
    # with scale = False the raw features can be plotted and inspected directly.
    scale = False

    # Load the data
    data = np.genfromtxt("LR-testSet.csv", delimiter=",")
    x_data = data[:, :-1]
    y_data = data[:, -1]

    def plot():
        x0 = []
        x1 = []
        y0 = []
        y1 = []
        # Split the data by class
        for i in range(len(x_data)):
            if y_data[i] == 0:
                x0.append(x_data[i, 0])
                y0.append(x_data[i, 1])
            else:
                x1.append(x_data[i, 0])
                y1.append(x_data[i, 1])

        # Scatter plot
        scatter0 = plt.scatter(x0, y0, c='b', marker='o')
        scatter1 = plt.scatter(x1, y1, c='r', marker='x')
        # Legend
        plt.legend(handles=[scatter0, scatter1], labels=['label0', 'label1'], loc='best')


    plot()
    plt.show()

    # Reshape the labels into a column vector
    x_data = data[:, :-1]
    y_data = data[:, -1, np.newaxis]

    print(np.mat(x_data).shape)
    print(np.mat(y_data).shape)
    # Add a bias column of ones to the samples
    X_data = np.concatenate((np.ones((x_data.shape[0], 1)), x_data), axis=1)
    print(X_data.shape)

    def sigmoid(x):
        return 1.0 / (1 + np.exp(-x))

    def cost(xMat, yMat, ws):
        left = np.multiply(yMat, np.log(sigmoid(xMat * ws)))
        right = np.multiply(1 - yMat, np.log(1 - sigmoid(xMat * ws)))
        return np.sum(left + right) / -len(xMat)

    def gradAscent(xArr, yArr):
        if scale == True:
            xArr = preprocessing.scale(xArr)
        xMat = np.mat(xArr)
        yMat = np.mat(yArr)

        lr = 0.001
        epochs = 10000
        costList = []
        # Rows are samples, columns are weights
        m, n = np.shape(xMat)
        # Initialize the weights
        ws = np.mat(np.ones((n, 1)))

        for i in range(epochs + 1):
            # Multiply xMat by the weights, squash through the sigmoid
            h = sigmoid(xMat * ws)
            # Gradient of the cost
            ws_grad = xMat.T * (h - yMat) / m
            ws = ws - lr * ws_grad

            if i % 50 == 0:
                costList.append(cost(xMat, yMat, ws))
        return ws, costList

    # Train the model; get the weights and the history of cost values
    ws, costList = gradAscent(X_data, y_data)
    print(ws)
    if scale == False:
        # Plot the decision boundary: it is the line w0 + w1*x1 + w2*x2 = 0,
        # i.e. x2 = (-w0 - w1*x1) / w2
        plot()
        x_test = np.array([[-4], [3]])
        y_test = (-ws[0] - x_test * ws[1]) / ws[2]
        plt.plot(x_test, y_test, 'k')
        plt.show()

    # Plot the loss history (201 points: one every 50 of the 10000 epochs)
    x = np.linspace(0, 10000, 201)
    plt.plot(x, costList, c='r')
    plt.title('Train')
    plt.xlabel('Epochs')
    plt.ylabel('Cost')
    plt.show()

    # Prediction
    def predict(x_data, ws):
        if scale == True:
            x_data = preprocessing.scale(x_data)
        xMat = np.mat(x_data)
        ws = np.mat(ws)
        return [1 if x >= 0.5 else 0 for x in sigmoid(xMat * ws)]

    predictions = predict(X_data, ws)

    print(classification_report(y_data, predictions))  # precision/recall/F1 per class
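As a quick usage check, the trained weights can be applied to new points, as long as the bias column is added by hand the same way X_data was built. The two sample points below are made up for illustration:

    # Hypothetical new samples: a one for the bias, then the two features
    new_points = np.array([[1, -1.0, 10.0],
                           [1,  1.0,  2.0]])
    print(predict(new_points, ws))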

Linear logistic regression with sklearn

    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.metrics import classification_report
    from sklearn import preprocessing
    from sklearn import linear_model

    # Whether to standardize the data
    scale = False

    # Load the data
    data = np.genfromtxt("LR-testSet.csv", delimiter=",")
    x_data = data[:, :-1]
    y_data = data[:, -1]

    def plot():
        x0 = []
        x1 = []
        y0 = []
        y1 = []
        # Split the data by class
        for i in range(len(x_data)):
            if y_data[i] == 0:
                x0.append(x_data[i, 0])
                y0.append(x_data[i, 1])
            else:
                x1.append(x_data[i, 0])
                y1.append(x_data[i, 1])

        # Scatter plot
        scatter0 = plt.scatter(x0, y0, c='b', marker='o')
        scatter1 = plt.scatter(x1, y1, c='r', marker='x')
        # Legend
        plt.legend(handles=[scatter0, scatter1], labels=['label0', 'label1'], loc='best')

    plot()
    plt.show()

    logistic = linear_model.LogisticRegression()
    logistic.fit(x_data, y_data)
    if scale == False:
        # Plot the decision boundary: intercept_ and coef_ play the roles of
        # w0 and (w1, w2) in the hand-rolled version above
        plot()
        x_test = np.array([[-4], [3]])
        y_test = (-logistic.intercept_ - x_test * logistic.coef_[0][0]) / logistic.coef_[0][1]
        plt.plot(x_test, y_test, 'k')
        plt.show()

    predictions = logistic.predict(x_data)

    print(classification_report(y_data, predictions))

Nonlinear logistic regression via gradient descent

    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.metrics import classification_report
    from sklearn import preprocessing
    from sklearn.preprocessing import PolynomialFeatures

    # Whether to standardize the data
    scale = False

    # Load the data
    data = np.genfromtxt("LR-testSet2.txt", delimiter=",")
    x_data = data[:, :-1]
    y_data = data[:, -1, np.newaxis]

    def plot():
        x0 = []
        x1 = []
        y0 = []
        y1 = []
        # Split the data by class
        for i in range(len(x_data)):
            if y_data[i] == 0:
                x0.append(x_data[i, 0])
                y0.append(x_data[i, 1])
            else:
                x1.append(x_data[i, 0])
                y1.append(x_data[i, 1])
        # Scatter plot
        scatter0 = plt.scatter(x0, y0, c='b', marker='o')
        scatter1 = plt.scatter(x1, y1, c='r', marker='x')
        # Legend
        plt.legend(handles=[scatter0, scatter1], labels=['label0', 'label1'], loc='best')

    plot()
    plt.show()

    # Define the polynomial feature map; degree controls which polynomial terms are generated
    poly_reg = PolynomialFeatures(degree=3)
    # Transform the features
    x_poly = poly_reg.fit_transform(x_data)

    def sigmoid(x):
        return 1.0 / (1 + np.exp(-x))


    def cost(xMat, yMat, ws):
        left = np.multiply(yMat, np.log(sigmoid(xMat * ws)))
        right = np.multiply(1 - yMat, np.log(1 - sigmoid(xMat * ws)))
        return np.sum(left + right) / -len(xMat)

    def gradAscent(xArr, yArr):
        if scale == True:
            xArr = preprocessing.scale(xArr)
        xMat = np.mat(xArr)
        yMat = np.mat(yArr)

        lr = 0.03
        epochs = 50000
        costList = []
        # Count the columns: one weight per column
        m, n = np.shape(xMat)
        # Initialize the weights
        ws = np.mat(np.ones((n, 1)))

        for i in range(epochs + 1):
            # Multiply xMat by the weights, squash through the sigmoid
            h = sigmoid(xMat * ws)
            # Gradient of the cost
            ws_grad = xMat.T * (h - yMat) / m
            ws = ws - lr * ws_grad

            if i % 50 == 0:
                costList.append(cost(xMat, yMat, ws))
        return ws, costList

    # Train the model; get the weights and the history of cost values
    ws, costList = gradAscent(x_poly, y_data)
    print(ws)

    # Get the range the data lives in
    x_min, x_max = x_data[:, 0].min() - 1, x_data[:, 0].max() + 1
    y_min, y_max = x_data[:, 1].min() - 1, x_data[:, 1].max() + 1

    # Build the mesh grid. xx and yy share the same shape: every row of xx
    # repeats the x range, every column of yy repeats the y range.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))
    # np.r_ stacks arrays by rows, np.c_ by columns:
    # >>> a = np.array([1,2,3])
    # >>> b = np.array([5,2,5])
    # >>> np.r_[a,b]
    # array([1, 2, 3, 5, 2, 5])
    # >>> np.c_[a,b]
    # array([[1, 5],
    #        [2, 2],
    #        [3, 5]])
    # >>> np.c_[a,[0,0,0],b]
    # array([[1, 0, 5],
    #        [2, 0, 2],
    #        [3, 0, 5]])
    # ravel, like flatten, turns a multi-dimensional array into 1-D; flatten
    # returns a copy, while ravel returns a view of the original data where possible.
    z = sigmoid(poly_reg.fit_transform(np.c_[xx.ravel(), yy.ravel()]).dot(np.array(ws)))
    # Threshold the probabilities at 0.5 to get class labels
    for i in range(len(z)):
        if z[i] > 0.5:
            z[i] = 1
        else:
            z[i] = 0
    print(z)
    z = z.reshape(xx.shape)

    # Filled contour plot of the decision regions
    cs = plt.contourf(xx, yy, z)
    plot()
    plt.show()

    # Prediction
    def predict(x_data, ws):
        xMat = np.mat(x_data)
        ws = np.mat(ws)
        return [1 if x >= 0.5 else 0 for x in sigmoid(xMat * ws)]

    predictions = predict(x_poly, ws)

    print(classification_report(y_data, predictions))
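For two input features and degree=3, PolynomialFeatures produces 10 columns: 1, x1, x2, x1^2, x1*x2, x2^2, x1^3, x1^2*x2, x1*x2^2, x2^3. A quick way to confirm this (get_feature_names_out assumes sklearn 1.0 or newer; older releases spell it get_feature_names):

    import numpy as np
    from sklearn.preprocessing import PolynomialFeatures

    demo = PolynomialFeatures(degree=3)
    print(demo.fit_transform(np.array([[2.0, 3.0]])))  # the 10 expanded values
    print(demo.get_feature_names_out())                # the matching term names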

Nonlinear logistic regression with sklearn

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import linear_model
    from sklearn.datasets import make_gaussian_quantiles
    from sklearn.preprocessing import PolynomialFeatures

    # Generate 2-D Gaussian data split into classes by quantile:
    # 500 samples, 2 features, 2 classes (more classes are possible)
    x_data, y_data = make_gaussian_quantiles(n_samples=500, n_features=2, n_classes=2)
    plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
    plt.show()

    logistic = linear_model.LogisticRegression()
    logistic.fit(x_data, y_data)
    # Get the range the data lives in
    x_min, x_max = x_data[:, 0].min() - 1, x_data[:, 0].max() + 1
    y_min, y_max = x_data[:, 1].min() - 1, x_data[:, 1].max() + 1
    # Build the mesh grid
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))
    z = logistic.predict(np.c_[xx.ravel(), yy.ravel()])
    z = z.reshape(xx.shape)
    # Filled contour plot of the decision regions
    cs = plt.contourf(xx, yy, z)
    # Scatter plot of the samples
    plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
    plt.show()
    # With a plain linear logistic regression the score is low
    print('score:', logistic.score(x_data, y_data))

    # Define the polynomial feature map; degree controls which polynomial terms are generated
    poly_reg = PolynomialFeatures(degree=5)
    # Transform the features
    x_poly = poly_reg.fit_transform(x_data)
    # Define the logistic regression model
    logistic = linear_model.LogisticRegression()
    # Train the model
    logistic.fit(x_poly, y_data)

    # Get the range the data lives in
    x_min, x_max = x_data[:, 0].min() - 1, x_data[:, 0].max() + 1
    y_min, y_max = x_data[:, 1].min() - 1, x_data[:, 1].max() + 1

    # Build the mesh grid
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))
    z = logistic.predict(poly_reg.fit_transform(np.c_[xx.ravel(), yy.ravel()]))
    z = z.reshape(xx.shape)
    # Filled contour plot of the decision regions
    cs = plt.contourf(xx, yy, z)
    # Scatter plot of the samples
    plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
    plt.show()
    # With polynomial features the score is much higher
    print('score:', logistic.score(x_poly, y_data))
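The score above is measured on the training data, so it says nothing about generalization; with degree=5 some overfitting is plausible. A quick check on held-out data (my addition, continuing the script above):

    from sklearn.model_selection import train_test_split

    x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.3, random_state=0)
    clf = linear_model.LogisticRegression()
    clf.fit(poly_reg.fit_transform(x_train), y_train)
    print('train score:', clf.score(poly_reg.transform(x_train), y_train))
    print('val score:', clf.score(poly_reg.transform(x_val), y_val))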
Original post: https://www.cnblogs.com/henuliulei/p/11783428.html