zoukankan      html  css  js  c++  java
  • 机器学习之逻辑回归(Logistic Regression)

    1.  
      """逻辑回归中的Sigmoid函数"""
    2.  
      import numpy as np
    3.  
      import matplotlib.pyplot as plt
    4.  
       
    5.  
      def sigmoid(t):
          """Return the logistic sigmoid ``1 / (1 + exp(-t))`` of ``t``.

          ``t`` may be a scalar or a NumPy array; arrays are transformed
          element-wise because ``np.exp`` works element-wise.
          """
          return 1 / (1 + np.exp(-t))
    7.  
       
    8.  
      # Evaluate the sigmoid at 500 evenly spaced points on [-10, 10] and plot it.
      x = np.linspace(-10, 10, num=500)
      y = sigmoid(x)

      plt.plot(x, y)
      plt.show()

     结果:

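     逻辑回归损失函数的梯度:

     记 $\hat{p}^{(i)} = \sigma\left(X_b^{(i)}\theta\right)$,其中 $\sigma(t) = \dfrac{1}{1+e^{-t}}$,则损失函数为

     $$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log\hat{p}^{(i)} + \left(1-y^{(i)}\right)\log\left(1-\hat{p}^{(i)}\right)\right]$$

     其梯度的向量化形式为

     $$\nabla J(\theta) = \frac{1}{m}\,X_b^{T}\left(\sigma(X_b\theta) - y\right)$$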

     

     

     

     

     

     

     

     

     

    逻辑回归算法: 

    1.  
      import numpy as np
    2.  
      from metrics import accuracy_score
    3.  
       
    4.  
      class LogisticRegression:
          """Binary logistic-regression classifier trained with batch gradient descent.

          After ``fit`` the learned parameters are stored in ``intercept_`` (the
          bias term), ``coef_`` (one weight per feature) and ``_theta`` (both
          concatenated, intercept first).
          """

          def __init__(self):
              """Create an unfitted model; every parameter stays None until ``fit``."""
              self.coef_ = None
              self.intercept_ = None
              self._theta = None

          def _sigmoid(self, t):
              """Return ``1 / (1 + exp(-t))``, applied element-wise to arrays."""
              return 1. / (1. + np.exp(-t))

          def fit(self, X_train, y_train, eta=0.01, n_iters=1e4):
              """Train the logistic-regression model with gradient descent.

              Args:
                  X_train: 2-D array with one row per sample.
                  y_train: 1-D array with one label per row of ``X_train``; the
                      cross-entropy loss used here is written for 0/1 labels.
                  eta: learning rate (step size of each gradient step).
                  n_iters: maximum number of gradient steps.

              Returns:
                  self, so calls can be chained.

              Raises:
                  AssertionError: if ``X_train`` and ``y_train`` disagree on the
                      number of samples.
              """
              assert X_train.shape[0] == y_train.shape[0], (
                  "the size of X_train must be equal to the size of y_train"
              )

              def J(theta, X_b, y):
                  """Mean cross-entropy loss of ``theta`` on ``(X_b, y)``."""
                  y_hat = self._sigmoid(X_b.dot(theta))
                  # With NumPy's default error settings log(0) does not raise;
                  # it yields -inf (and 0 * -inf = nan) with a warning, so a
                  # try/except cannot guard it. Keep the probabilities strictly
                  # inside (0, 1) so the loss is always finite.
                  tiny = np.finfo(float).eps
                  y_hat = np.clip(y_hat, tiny, 1 - tiny)
                  return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)

              def dJ(theta, X_b, y):
                  """Gradient of the loss with respect to ``theta`` (vectorised)."""
                  return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(X_b)

              def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
                  """Run batch gradient descent starting from ``initial_theta``.

                  Stops after ``n_iters`` steps, or earlier once a step changes
                  the loss by less than ``epsilon``.
                  """
                  theta = initial_theta
                  loss = J(theta, X_b, y)
                  cur_iter = 0

                  while cur_iter < n_iters:
                      theta = theta - eta * dJ(theta, X_b, y)
                      new_loss = J(theta, X_b, y)
                      if abs(new_loss - loss) < epsilon:
                          break
                      # Reuse this value as the "previous" loss of the next step
                      # instead of recomputing it.
                      loss = new_loss
                      cur_iter += 1

                  return theta

              # Prepend a column of ones so theta[0] acts as the intercept.
              X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
              initial_theta = np.zeros(X_b.shape[1])
              self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)

              self.intercept_ = self._theta[0]
              self.coef_ = self._theta[1:]

              return self

          def predict_proba(self, X_predict):
              """Return the sigmoid output (probability of class 1) for each row of ``X_predict``.

              Raises:
                  AssertionError: if the model is not fitted or the number of
                      columns differs from the number of fitted coefficients.
              """
              assert self.intercept_ is not None and self.coef_ is not None, (
                  "must fit before predict!"
              )
              assert X_predict.shape[1] == len(self.coef_), (
                  "the feature number of X_predict must be equal to X_train"
              )

              X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
              return self._sigmoid(X_b.dot(self._theta))

          def predict(self, X_predict):
              """Return the predicted 0/1 label for each row of ``X_predict``.

              A row is labelled 1 when its predicted probability is >= 0.5.

              Raises:
                  AssertionError: if the model is not fitted or the number of
                      columns differs from the number of fitted coefficients.
              """
              assert self.intercept_ is not None and self.coef_ is not None, (
                  "must fit before predict!"
              )
              assert X_predict.shape[1] == len(self.coef_), (
                  "the feature number of X_predict must be equal to X_train"
              )

              proba = self.predict_proba(X_predict)
              return np.array(proba >= 0.5, dtype='int')

          def score(self, X_test, y_test):
              """Return ``accuracy_score(y_test, predictions)`` for the test set."""
              y_predict = self.predict(X_test)
              return accuracy_score(y_test, y_predict)

          def __repr__(self):
              return "LogisticRegression()"
    1.  
      """实现逻辑回归"""
    2.  
      import numpy as np
    3.  
      import matplotlib.pyplot as plt
    4.  
      from sklearn import datasets
    5.  
       
    6.  
      # Load iris and keep only classes 0 and 1 with their first two feature
      # columns, giving a binary problem that can be drawn in the plane.
      iris = datasets.load_iris()
      binary_mask = iris.target < 2
      X = iris.data[binary_mask, :2]
      y = iris.target[binary_mask]

      for label, color in ((0, 'red'), (1, 'blue')):
          plt.scatter(X[y == label, 0], X[y == label, 1], color=color)
      plt.show()
    16.  
       
    17.  
      """使用逻辑回归"""
    18.  
      from model_selection import train_test_split
    19.  
      from LogisticRegression import LogisticRegression
    20.  
       
    21.  
      # Split the data (seed=666 is handed to the project's train_test_split),
      # fit the model, then print the test accuracy and the predicted
      # probabilities for the test rows.
      X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)

      log_reg = LogisticRegression()
      log_reg.fit(X_train, y_train)

      test_accuracy = log_reg.score(X_test, y_test)
      print(test_accuracy)
      test_proba = log_reg.predict_proba(X_test)
      print(test_proba)

     结果:

    1.  
      E:\pythonspace\KNN_function\venv\Scripts\python.exe E:/pythonspace/KNN_function/try.py
    2.  
      1.0
    3.  
      [0.92972035 0.98664939 0.14852024 0.17601199 0.0369836 0.0186637
    4.  
      0.04936918 0.99669244 0.97993941 0.74524655 0.04473194 0.00339285
    5.  
      0.26131273 0.0369836 0.84192923 0.79892262 0.82890209 0.32358166
    6.  
      0.06535323 0.20735334]
    7.  
       
    8.  
      Process finished with exit code 0

    逻辑回归中的决策边界和添加多项式特征:

    1.  
      """在逻辑回归中添加多项式特征"""
    2.  
      import numpy as np
    3.  
      import matplotlib.pyplot as plt
    4.  
       
    5.  
      # Draw 100 standard-normal points in the plane; a point gets label 1 when
      # it lies inside the circle x0^2 + x1^2 < 1.5 and label 0 otherwise.
      np.random.seed(666)
      X = np.random.normal(0, 1, size=(100, 2))
      squared_radius = X[:, 0] ** 2 + X[:, 1] ** 2
      y = (squared_radius < 1.5).astype('int')
    8.  
       
    9.  
       
    10.  
      """使用逻辑回归"""
    11.  
      from LogisticRegression import LogisticRegression
    12.  
       
    13.  
      # Fit the plain logistic regression (no extra features) on all samples.
      log_reg = LogisticRegression()
      log_reg.fit(X, y)
    15.  
       
    16.  
      """绘制思路"""
    17.  
      def plot_decision_boundary(model, axis):
          """Shade the plotting window by the class ``model`` predicts at each point.

          Args:
              model: fitted classifier; its ``predict`` is called with an
                  ``(n_points, 2)`` array of grid coordinates.
              axis: ``[x_min, x_max, y_min, y_max]`` bounds of the window.
          """
          from matplotlib.colors import ListedColormap

          # Sample the window on a regular grid, 100 points per unit length.
          x0, x1 = np.meshgrid(
              np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
              np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
          )
          X_new = np.c_[x0.ravel(), x1.ravel()]
          zz = model.predict(X_new).reshape(x0.shape)

          custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
          # No ``linewidth`` argument here: contourf ignores it and matplotlib
          # only emits a "kwargs were not used by contour" warning.
          plt.contourf(x0, x1, zz, cmap=custom_cmap)
    28.  
       
    29.  
      # Shade the model's decision regions, then overlay the samples of each class.
      plot_decision_boundary(log_reg, axis=[-4, 4, -4, 4])
      for label in (0, 1):
          plt.scatter(X[y == label, 0], X[y == label, 1])
      plt.show()
    33.  
       
    34.  
       
    35.  
      """添加特征值,即升维"""
    36.  
      from sklearn.preprocessing import PolynomialFeatures
    37.  
      from sklearn.preprocessing import StandardScaler
    38.  
      from sklearn.pipeline import Pipeline
    39.  
      def PolynomialLogisticRegression(degree):
          """Build a pipeline: polynomial features -> standardisation -> logistic regression.

          Args:
              degree: forwarded to ``PolynomialFeatures(degree=...)``.

          Returns:
              An unfitted ``Pipeline`` with the steps 'Poly', 'std_scaler' and
              'Logistic'.
          """
          return Pipeline([
              ('Poly', PolynomialFeatures(degree=degree)),
              ('std_scaler', StandardScaler()),
              ('Logistic', LogisticRegression()),
          ])
    45.  
      # Fit the degree-2 polynomial pipeline and draw its decision regions with
      # the samples of both classes on top.
      poly_log_reg = PolynomialLogisticRegression(degree=2)
      poly_log_reg.fit(X, y)

      plot_decision_boundary(poly_log_reg, axis=[-4, 4, -4, 4])
      for label in (0, 1):
          plt.scatter(X[y == label, 0], X[y == label, 1])
      plt.show()

     结果:

     

    1.  
      """逻辑回归中使用正则化"""
    2.  
      import numpy as np
    3.  
      import matplotlib.pyplot as plt
    4.  
      from sklearn.model_selection import train_test_split
    5.  
      from sklearn.linear_model import LogisticRegression
    6.  
      from sklearn.preprocessing import StandardScaler
    7.  
      from sklearn.pipeline import Pipeline
    8.  
      from sklearn.preprocessing import PolynomialFeatures
    9.  
       
    10.  
      # 200 standard-normal points in the plane; a point is class 1 when
      # x0^2 + x1 < 1.5 (i.e. it lies below a parabola), otherwise class 0.
      np.random.seed(666)
      X = np.random.normal(0, 1, size=(200, 2))
      y = np.array(X[:, 0] ** 2 + X[:, 1] < 1.5, dtype='int')
      # Label noise: set 20 randomly drawn positions (repeats possible) to 1.
      for _ in range(20):
          y[np.random.randint(200)] = 1
      plt.scatter(X[y == 0, 0], X[y == 0, 1])
      plt.scatter(X[y == 1, 0], X[y == 1, 1])
      plt.show()

      X_train, X_test, y_train, y_test = train_test_split(X, y)
      log_reg = LogisticRegression()
      # Fit on the training split only; fitting on all of X, y would make the
      # split above pointless and leak the test rows into the model.
      log_reg.fit(X_train, y_train)
    22.  
      def plot_decision_boundary(model, axis):
          """Shade the plotting window by the class ``model`` predicts at each point.

          Args:
              model: fitted classifier; its ``predict`` is called with an
                  ``(n_points, 2)`` array of grid coordinates.
              axis: ``[x_min, x_max, y_min, y_max]`` bounds of the window.
          """
          from matplotlib.colors import ListedColormap

          # Sample the window on a regular grid, 100 points per unit length.
          x0, x1 = np.meshgrid(
              np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
              np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
          )
          X_new = np.c_[x0.ravel(), x1.ravel()]
          zz = model.predict(X_new).reshape(x0.shape)

          custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
          # No ``linewidth`` argument here: contourf ignores it and matplotlib
          # only emits a "kwargs were not used by contour" warning.
          plt.contourf(x0, x1, zz, cmap=custom_cmap)
    33.  
       
    34.  
      def PolynomialLogisticRegression(degree, C=1.0, penalty='l2', solver='liblinear'):
          """Build a pipeline: polynomial features -> standardisation -> logistic regression.

          Args:
              degree: forwarded to ``PolynomialFeatures(degree=...)``.
              C: forwarded to ``LogisticRegression(C=...)``.
              penalty: forwarded to ``LogisticRegression(penalty=...)``.
              solver: forwarded to ``LogisticRegression(solver=...)``. Defaults to
                  'liblinear' because it handles both 'l1' and 'l2' penalties;
                  this script uses penalty='l1', which solvers limited to
                  'l2' reject.

          Returns:
              An unfitted ``Pipeline`` with the steps 'Poly', 'std_scaler' and
              'Logistic'.
          """
          return Pipeline([
              ('Poly', PolynomialFeatures(degree=degree)),
              ('std_scaler', StandardScaler()),
              ('Logistic', LogisticRegression(C=C, penalty=penalty, solver=solver)),
          ])
    40.  
      # Degree-20 polynomial features with an L1 penalty and C=0.1, fitted on the
      # training split; then draw the decision regions over all samples.
      poly_log_reg = PolynomialLogisticRegression(degree=20, C=0.1, penalty='l1')
      poly_log_reg.fit(X_train, y_train)

      plot_decision_boundary(poly_log_reg, axis=[-4, 4, -4, 4])
      for label in (0, 1):
          plt.scatter(X[y == label, 0], X[y == label, 1])
      plt.show()

     结果:

    应用OVR和OVO使逻辑回归处理多分类问题
     

    1.  
      """OVR和OVO"""
    2.  
      #为了数据可视化方便,我们只使用鸢尾花数据集的前两列特征
    3.  
      from sklearn import datasets
    4.  
      from sklearn.linear_model import LogisticRegression
    5.  
      from sklearn.model_selection import train_test_split
    6.  
      import matplotlib.pyplot as plt
    7.  
      import numpy as np
    8.  
       
    9.  
      # Use only the first two iris feature columns; the original author notes
      # that this keeps plotting simple but lowers the classification accuracy.
      iris = datasets.load_iris()
      X = iris.data[:, :2]
      y = iris.target
      X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

      # multi_class='ovr' requests the one-vs-rest scheme for the three classes.
      log_reg = LogisticRegression(multi_class='ovr', solver='newton-cg')
      log_reg.fit(X_train, y_train)
      log_reg.score(X_test, y_test)
    19.  
      def plot_decision_boundary(model, axis):
          """Shade the plotting window by the class ``model`` predicts at each point.

          Args:
              model: fitted classifier; its ``predict`` is called with an
                  ``(n_points, 2)`` array of grid coordinates.
              axis: ``[x_min, x_max, y_min, y_max]`` bounds of the window.
          """
          from matplotlib.colors import ListedColormap

          # Sample the window on a regular grid, 100 points per unit length.
          x0, x1 = np.meshgrid(
              np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
              np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
          )
          X_new = np.c_[x0.ravel(), x1.ravel()]
          zz = model.predict(X_new).reshape(x0.shape)

          custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
          # No ``linewidth`` argument here: contourf ignores it and matplotlib
          # only emits a "kwargs were not used by contour" warning.
          plt.contourf(x0, x1, zz, cmap=custom_cmap)
    30.  
       
    31.  
      # Decision regions of the three-class model with every class drawn on top.
      plot_decision_boundary(log_reg, axis=[4, 8.5, 1.5, 4.5])
      for label in (0, 1, 2):
          plt.scatter(X[y == label, 0], X[y == label, 1])
      plt.show()
    36.  
       
    37.  
       
    38.  
       
    39.  
      """使用全部数据 OVR and OVO"""
    40.  
      from sklearn.multiclass import OneVsOneClassifier
    41.  
      from sklearn.multiclass import OneVsRestClassifier
    42.  
       
    43.  
      from sklearn import datasets
    44.  
      from sklearn.linear_model import LogisticRegression
    45.  
      from sklearn.model_selection import train_test_split
    46.  
      # This time keep every iris feature column.
      iris = datasets.load_iris()
      X, y = iris.data, iris.target
      X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

      # Both wrappers take a binary classifier as their argument.
      ovr = OneVsRestClassifier(log_reg)
      ovr.fit(X_train, y_train)
      ovr_accuracy = ovr.score(X_test, y_test)
      print(ovr_accuracy)

      ovo = OneVsOneClassifier(log_reg)
      ovo.fit(X_train, y_train)
      ovo_accuracy = ovo.score(X_test, y_test)
      print(ovo_accuracy)

    结果:

    1.  
      E:\pythonspace\KNN_function\venv\Scripts\python.exe E:/pythonspace/KNN_function/try.py
    2.  
      E:\pythonspace\KNN_function\venv\lib\site-packages\matplotlib\contour.py:960: UserWarning: The following kwargs were not used by contour: 'linewidth'
    3.  
      s)
    4.  
      0.9736842105263158
    5.  
      1.0
    6.  
       
    7.  
      Process finished with exit code 0

     

  • 相关阅读:
    2021牛客暑期多校训练营5
    二分图知识点温习
    Codeforces Round #735 (Div. 2)
    牛客比赛订正(3,4)
    Harbour.Space Scholarship Contest 2021-2022 (Div. 1 + Div. 2) Editorial题解
    关于球的相关知识
    AtCoder Beginner Contest 210题解
    P7077 [CSP-S2020] 函数调用
    偏序问题学习笔记
    P1606 [USACO07FEB]Lilypad Pond G
  • 原文地址:https://www.cnblogs.com/kekexuanxaun/p/9459365.html
Copyright © 2011-2022 走看看