  • Calling the logistic regression algorithm from sklearn

    # Logistic regression is a binary classification algorithm, but it can be
    # extended to multi-class tasks (see the OVR/OVO section below)
    # It maps a sample's features to a probability of the positive class and
    # compares that probability against a threshold to decide between 0 and 1,
    # which is why it is still called a "regression" algorithm
    import numpy as np
    import matplotlib.pyplot as plt

    # Define the probability mapping: the sigmoid function
    def sigmoid(t):
        return 1/(1+np.exp(-t))

    x=np.linspace(-10,10,100)
    y=sigmoid(x)
    plt.figure()
    plt.plot(x,y,"r",label="Sigmoid")
    plt.legend(loc=2)
    plt.show()
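    # Quick numeric check (my addition, not in the original post): sigmoid(0) is
    # exactly 0.5, so thresholding the probability at 0.5 is equivalent to
    # thresholding the linear score theta.T @ x at 0.
    print(sigmoid(np.array([-2.0, 0.0, 2.0]))) # ≈ [0.1192 0.5 0.8808]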
    # Define the loss function of logistic regression (cross-entropy); training
    # then proceeds much like linear regression
    # Low-level implementation of the maths behind logistic regression
    def J1(theta,x_b,y): # loss: negative mean log-likelihood
        y_hat=sigmoid(x_b.dot(theta))
        return -np.sum(y*np.log(y_hat)+(1-y)*np.log(1-y_hat))/len(x_b)

    def DJ2(theta,x_b,y): # loop-based gradient, kept for comparison with DJ1
        res=np.empty(len(theta))
        res[0]=np.sum(sigmoid(x_b.dot(theta))-y)
        for i in range(1,len(theta)):
            res[i]=(sigmoid(x_b.dot(theta))-y).dot(x_b[:,i])
        return res/len(x_b)

    def DJ1(theta,x_b,y): # vectorized gradient of J1
        return x_b.T.dot(sigmoid(x_b.dot(theta))-y)/len(y)

    # Batch gradient descent to locate the minimum of the loss function
    def gradient_descent1(x_b,y,eta,theta_initial,erro=1e-8,n=1e5):
        theta=theta_initial
        i=0
        while i<n:
            gradient=DJ1(theta,x_b,y)
            last_theta=theta
            theta=theta-gradient*eta
            if abs(J1(theta,x_b,y)-J1(last_theta,x_b,y))<erro:
                break
            i+=1
        return theta
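    # My addition (not in the original post): before trusting DJ1, it can be checked
    # against a central finite-difference approximation of J1; the two should agree
    # to several decimal places. debug_gradient is a hypothetical helper name.
    def debug_gradient(theta,x_b,y,epsilon=1e-4):
        res=np.empty(len(theta))
        for i in range(len(theta)):
            theta_p=theta.copy(); theta_p[i]+=epsilon
            theta_m=theta.copy(); theta_m[i]-=epsilon
            res[i]=(J1(theta_p,x_b,y)-J1(theta_m,x_b,y))/(2*epsilon)
        return res
    # Usage: compare debug_gradient(theta,x_b,y) with DJ1(theta,x_b,y) element-wise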

    # Validate the from-scratch implementation on the iris dataset
    from sklearn import datasets
    d=datasets.load_iris()
    x=d.data
    y=d.target
    x=x[y<2,:2] # keep only the first two features
    y=y[y<2]    # logistic regression handles binary labels, so keep classes 0 and 1
    print(x)
    print(y)
    plt.figure()
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.show()
    from sklearn.model_selection import train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1)
    x_b=np.hstack([np.ones((len(x_train),1)),x_train]) # prepend the intercept column
    print(x_b)
    theta0=np.zeros(x_b.shape[1])
    eta=0.1
    theta1=gradient_descent1(x_b,y_train,eta,theta0)
    print(theta1)
    from sklearn.metrics import accuracy_score
    x_b=np.hstack([np.ones((len(x_test),1)),x_test])
    y_hat=sigmoid(x_b.dot(theta1))
    print(y_hat)
    p=np.array(y_hat>0.5,dtype="int") # output 1 if the probability exceeds 0.5, else 0
    print(p)      # predicted labels
    print(y_test) # actual labels
    print(accuracy_score(y_test,p)) # prediction accuracy (y_true first by convention)
    # Decision boundary for two features: the set of points where theta.T @ x = 0,
    # i.e. where the predicted probability equals the 0.5 threshold. Solving
    # theta0 + theta1*x1 + theta2*x2 = 0 for x2 gives the line below.
    def f(x):
        return (-theta1[1]*x-theta1[0])/theta1[2]
    x1=np.linspace(4,7.5,100)
    plt.plot(x1,f(x1))
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.show()
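    # My addition: a quick check that points on the plotted line really sit at the
    # 0.5 probability threshold.
    pt=np.array([1.0,5.0,f(5.0)]) # [intercept, x1, x2] for a point on the line
    print(sigmoid(pt.dot(theta1))) # should print ~0.5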
    # Helper that draws the decision boundary of any fitted classifier
    def plot_decision_boundary(model,axis):
        x0,x1=np.meshgrid(
            np.linspace(axis[0],axis[1],int((axis[1]-axis[0])*100)).reshape(-1,1),
            np.linspace(axis[2],axis[3],int((axis[3]-axis[2])*100)).reshape(-1,1)
        )
        x_new=np.c_[x0.ravel(),x1.ravel()] # grid points as an (n,2) feature matrix
        y_pre=model.predict(x_new)
        zz=y_pre.reshape(x0.shape)
        from matplotlib.colors import ListedColormap
        cus=ListedColormap(["#EF9A9A","#FFF59D","#90CAF9"])
        plt.contourf(x0,x1,zz,cmap=cus)
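    # My addition: plot_decision_boundary only needs an object with a .predict
    # method, so a thin wrapper (ManualLogReg is a hypothetical name) lets us draw
    # the boundary of the hand-rolled model as well.
    class ManualLogReg:
        def __init__(self,theta):
            self.theta=theta
        def predict(self,x):
            x_b=np.hstack([np.ones((len(x),1)),x]) # add the intercept column
            return np.array(sigmoid(x_b.dot(self.theta))>0.5,dtype="int")
    # Usage: plot_decision_boundary(ManualLogReg(theta1),axis=[4,8,1,5])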
    # Use KNN to visualize a classifier's decision boundary (two features, so it
    # can be drawn in the plane)
    from sklearn.model_selection import train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=666)
    from sklearn.neighbors import KNeighborsClassifier
    knn1=KNeighborsClassifier() # the default k=5 is fairly small, so the model is fairly complex
    knn1.fit(x_train,y_train)
    plot_decision_boundary(knn1,axis=[4,8,1,5])
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.show()
    knn2=KNeighborsClassifier(n_neighbors=3) # a smaller k gives a more complex model and a more irregular boundary, more prone to overfitting
    knn2.fit(d.data[:,:2],d.target)
    x=d.data
    y=d.target
    plot_decision_boundary(knn2,axis=[4,8,1,5])
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.scatter(x[y==2,0],x[y==2,1],color="b")
    plt.show()
    knn2=KNeighborsClassifier(n_neighbors=50) # a larger k gives a simpler model and a smoother boundary, with less overfitting
    knn2.fit(d.data[:,:2],d.target)
    x=d.data
    y=d.target
    plot_decision_boundary(knn2,axis=[4,8,1,5])
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.scatter(x[y==2,0],x[y==2,1],color="b")
    plt.show()
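    # My addition: a rough sketch of picking k by held-out accuracy (x2_*, y2_* are
    # hypothetical names; a proper treatment would use cross-validation instead).
    x2_train,x2_test,y2_train,y2_test=train_test_split(d.data[:,:2],d.target,random_state=666)
    for k in [1,3,5,10,25,50]:
        knn=KNeighborsClassifier(n_neighbors=k).fit(x2_train,y2_train)
        print(k,knn.score(x2_test,y2_test))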

    # Calling logistic regression from sklearn
    # 1-1 Plain logistic regression
    x=np.random.normal(0,1,size=(200,2)) # synthetic data
    y=np.array(x[:,0]**2+x[:,1]**2<1.5,dtype="int") # circular class boundary
    from sklearn.model_selection import train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=666)
    from sklearn.linear_model import LogisticRegression
    log=LogisticRegression()
    log.fit(x_train,y_train)
    print(log.score(x_test,y_test))
    knn3=KNeighborsClassifier()
    knn3.fit(x_train,y_train)
    print(knn3.score(x_test,y_test))
    plot_decision_boundary(log,axis=[-4,4,-4,4])
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.show()
    plot_decision_boundary(knn3,axis=[-4,4,-4,4])
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.show()
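    # My addition: besides hard labels, sklearn's LogisticRegression also exposes
    # the underlying probabilities and the raw linear score theta.T @ x.
    print(log.predict_proba(x_test[:5]))     # one row per sample: [P(class 0), P(class 1)]
    print(log.decision_function(x_test[:5])) # positive means the class-1 side of the boundary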

    # 1-2 Logistic regression in sklearn with polynomial features (no explicit
    # regularization tuning yet); a Pipeline chains the steps into one model
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    def polynomiallogisticregression(degree):
        return Pipeline([
            ("poly",PolynomialFeatures(degree=degree)),
            ("std_reg",StandardScaler()),
            ("log_reg",LogisticRegression())
        ])
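    # My addition: what PolynomialFeatures actually feeds the classifier. For two
    # features and degree=2 the expansion is [1, x1, x2, x1^2, x1*x2, x2^2]:
    print(PolynomialFeatures(degree=2).fit_transform(np.array([[2.0,3.0]]))) # [[1. 2. 3. 4. 6. 9.]]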
    x=np.random.normal(0,1,size=(200,2))
    y=np.array(x[:,0]**2+x[:,1]<1.5,dtype="int") # parabolic class boundary
    from sklearn.model_selection import train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=666)
    p1=polynomiallogisticregression(degree=2)
    p1.fit(x_train,y_train)
    print(p1.score(x_train,y_train))
    print(p1.score(x_test,y_test))
    plot_decision_boundary(p1,axis=[-4,4,-4,4])
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.show()
    p1=polynomiallogisticregression(degree=20) # with a high degree the model overfits and the decision boundary becomes convoluted
    p1.fit(x_train,y_train)
    print(p1.score(x_train,y_train))
    print(p1.score(x_test,y_test))
    plot_decision_boundary(p1,axis=[-4,4,-4,4])
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.show()
    # 1-3 Regularized logistic regression (besides lowering the degree, add the
    # regularization strength C and the penalty type to improve generalization)
    x=np.random.normal(0,1,size=(200,2)) # synthetic data
    y=np.array(x[:,0]**2+x[:,1]<1.5,dtype="int")
    # Randomly flip 20 labels to 1 so the data contains some noise
    for _ in range(20):
        y[np.random.randint(200)]=1
    # Re-split after regenerating the noisy data, so training uses this dataset
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=666)
    # The three main hyperparameters of logistic regression: polynomial degree,
    # regularization strength C (smaller C = stronger regularization), penalty l1/l2
    def Polynomiallogisticregression(degree,C,penalty):
        return Pipeline([
            ("poly",PolynomialFeatures(degree=degree)),
            ("std_reg",StandardScaler()),
            # solver="liblinear" because the default lbfgs solver (sklearn >= 0.22)
            # does not support the l1 penalty
            ("log_reg",LogisticRegression(C=C,penalty=penalty,solver="liblinear"))
        ])
    p1=Polynomiallogisticregression(degree=20,C=1,penalty="l1") # the high degree alone would overfit; l1 zeroes out useless polynomial terms
    p1.fit(x_train,y_train)
    print(p1.score(x_train,y_train))
    print(p1.score(x_test,y_test))
    plot_decision_boundary(p1,axis=[-4,4,-4,4])
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.show()
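    # My addition: a rough sketch of how C trades training fit against
    # generalization (smaller C = stronger regularization).
    for C in [0.01,0.1,1,10]:
        p=Polynomiallogisticregression(degree=20,C=C,penalty="l2")
        p.fit(x_train,y_train)
        print(C,p.score(x_train,y_train),p.score(x_test,y_test))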
    # Multi-class strategies: OVR (n binary classifiers) and OVO (n*(n-1)/2 of them)
    # sklearn's logistic regression supports multi-class tasks out of the box
    from sklearn import datasets
    d=datasets.load_iris()
    x=d.data
    y=d.target
    from sklearn.model_selection import train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1)
    # With the classic defaults this trains one-vs-rest (OVR), which is fast;
    # newer sklearn versions (multi_class="auto") may pick multinomial automatically
    log1=LogisticRegression()
    log1.fit(x_train,y_train)
    print(log1.score(x_test,y_test))
    # Note: multi_class="multinomial" is not OVO; it fits a single softmax model
    # over all classes, which requires a solver such as newton-cg. It is often a
    # bit more accurate and a bit slower than OVR.
    log2=LogisticRegression(multi_class="multinomial",solver="newton-cg")
    log2.fit(x_train,y_train)
    print(log2.score(x_test,y_test))
    # OVO and OVR wrappers in sklearn
    # sklearn provides generic OVR and OVO meta-classifiers that turn any binary
    # classifier into a multi-class one
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.model_selection import train_test_split
    log_reg=LogisticRegression() # 1-1 define a binary classifier
    ovr=OneVsRestClassifier(log_reg) # 1-2 wrap it as OVR
    ovo=OneVsOneClassifier(log_reg) # 1-2 wrap it as OVO
    ovr.fit(x_train,y_train) # 1-3 train and evaluate
    print(ovr.score(x_test,y_test))
    ovo.fit(x_train,y_train)
    print(ovo.score(x_test,y_test))
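    # My addition: the fitted wrappers keep their underlying binary models in
    # estimators_. For 3 iris classes, OVR trains n = 3 models and OVO trains
    # n*(n-1)/2 = 3, so the counts happen to coincide here.
    print(len(ovr.estimators_)) # 3
    print(len(ovo.estimators_)) # 3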