zoukankan      html  css  js  c++  java
  • 『科学计算』通过代码理解线性回归&Logistic回归模型

    sklearn线性回归模型

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import linear_model
    
    def get_data():
        #506行,14列,最后一列为label,前面13列为参数
        data_original = np.loadtxt('housing.data')  
    
        scale_data = scale_n(data_original)
        np.random.shuffle(scale_data)
        #在位置0,插入一列1,axis=1代表列,代表b
        data = np.insert(scale_data, 0, 1, axis=1) 
    
        train_X = data[:400, :-1] #前400行为训练数据
        train_y = data[:400, -1]
        train_y.shape = (train_y.shape[0],1)
    
        test_X = data[400:, :-1]
        test_y = data[400:, -1]
        test_y.shape = (test_y.shape[0],1)
    
    
    
        # test测试数据没有返回
        return train_X,train_y,test_X,test_y
    
    def scale_n(x):
        return (x-x.mean(axis=0))/x.std(axis=0)
    
    
    if __name__=="__main__":
        train_X,train_y,test_X,test_y = get_data()
        
        l_model = linear_model.Ridge(alpha = 1000)     # 参数是正则化系数
            
        l_model.fit(train_X,train_y)
        
        predict_train_y = l_model.predict(train_X)
    
        predict_train_y.shape = (predict_train_y.shape[0],1)
        error = (predict_train_y-train_y)
        rms_train = np.sqrt(np.mean(error**2, axis=0))        
        
        predict_test_y = l_model.predict(test_X)
        predict_test_y.shape = (predict_test_y.shape[0],1)
        error = (predict_test_y-test_y)
        rms_test = np.sqrt(np.mean(error**2, axis=0))
        
        print (rms_train, rms_test)
        
        plt.figure(figsize=(10, 8))
        plt.scatter(np.arange(test_y.size), sorted(test_y), c='b', edgecolor='None', alpha=0.5, label='actual')
        plt.scatter(np.arange(test_y.size), sorted(predict_test_y), c='g', edgecolor='None', alpha=0.5, label='predicted')
        plt.legend(loc='upper left')
        plt.ylabel('House price ($1000s)')
        plt.xlabel('House #')
        plt.show()
    

     sklearn模型调用民工三连:

    l_model = linear_model.Ridge(alpha = 1000)     # 模型装载
            
    l_model.fit(train_X,train_y)                   # 模型训练
        
    predict_train_y = l_model.predict(train_X)     # 模型预测

    手动线性回归模型

    数据获取

    房价数据,506行,14列,最后一列为label,前面13列为参数

    假如我们需要平方特征,只要修改get_data()中的data_original即可,在13列后添加平方项或者立方项等,由于我们不知道具体添加多少特征的组合更好,神经网络自动提取组合特征的功能就被很好的凸显出来了

    import numpy as np
    import matplotlib.pyplot as plt
    
    def get_data():
        #506行,14列,最后一列为label,前面13列为参数
        data_original = np.loadtxt('housing.data')      # 读取数据
    
        scale_data = scale_n(data_original)             # 归一化处理
        np.random.shuffle(scale_data)                   # 打乱顺序
        #在位置0,插入一列1,axis=1代表列,代表b
        data = np.insert(scale_data, 0, 1, axis=1)      # 数组插入函数
    
    
        train_X = data[:400, :-1]                       #前400行为训练数据
        train_y = data[:400, -1]
        train_y.shape = (train_y.shape[0],1)
    
        test_X = data[400:, :-1]
        test_y = data[400:, -1]
        test_y.shape = (test_y.shape[0],1)
    
        # test测试数据没有返回
        return train_X,train_y,test_X,test_y
    

    其中:

    np.loadtxt('housing.data')     # 读取数据
    # 本函数读取数据后自动转化为ndarray数组,可以自行设定分隔符delimiter=","
    
    np.insert(scale_data, 0, 1, axis=1)      # 数组插入函数
    
    # 在数组中插入指定的行列,numpy.insert(arr, obj, values, axis=None)
    # 和其他数组一样,axis不设定的话会把数组定为一维后插入,axis=0的话行扩展,axis=1的话列扩展
    

    预处理

    中心归零,标准差归一

    def scale_n(x):
        """
        减去平均值,除以标准差
        """
        x = (x - np.mean(x,axis=0))/np.std(x,axis=0)
        return x
    

    线性回归类

    class LinearModel():
        def __init__(self,learn_rate=0.06,lamda=0.01,threhold=0.0000005):
            """
            初始化一些参数
            """
            self.learn_rate = learn_rate  # 学习率
            self.lamda = lamda            # 正则化系数
            self.threhold = threhold      # 迭代阈值
        
        def get_cost_grad(self,theta,X,y):
            """
            计算代价cost和梯度grad
            """
            y_pre = X.dot(theta)
            cost = (y_pre-y).T.dot(y_pre-y) + self.lamda*theta.T.dot(theta)
            grad = (2.0*X.T.dot(y_pre-y) + 2.0*self.lamda*theta)/X.shape[0]               # 实际上是1个batch的梯度的累加值
            return cost, grad
            
        def grad_check(self,X,y):
            """
            梯度检查: 函数计算梯度 == L(theta+delta)-L(theta-delta) / 2delta
            """
            m,n = X.shape
            delta = 10**(-4)
            sum_error = 0
    
            for i in range(100):
                theta = np.random.random((n,1))
                j = np.random.randint(1,n)
    
                theta1,theta2 = theta.copy(),theta.copy()
                theta1[j] += delta
                theta2[j] -= delta
    
                cost1, grad1 = self.get_cost_grad(theta1, X, y)
                cost2, grad2 = self.get_cost_grad(theta2, X, y)
                cost,  grad  = self.get_cost_grad(theta , X, y)
    
                sum_error += np.abs(grad[j] - (cost1-cost2)/(delta*2))
            print(sum_error/300.0)
    
    
        def train(self,X,y):
            """
            初始化theta
            训练后,将theta保存到实例变量里
            """
            m,n = X.shape
            theta = np.random.random((n,1))
    
            prev_cost = None
            for loop in range(1000):
                cost,grad = self.get_cost_grad(theta,X,y)
                theta -= self.learn_rate*grad
                if prev_cost:
                    if prev_cost - cost < self.threhold:
                        break
                prev_cost = cost
            self.theta = theta
            # print(theta,loop,cost)
    
    
        def predict(self,X):
            """
            预测程序
            """
            return X.dot(self.theta)
    

     主函数

    if __name__ == "__main__":
        train_X,train_y,test_X,test_y = get_data()
    
        linear_model = LinearModel()
    
        linear_model.grad_check(train_X,train_y)
    
        linear_model.train(train_X,train_y)
    
        predict_train_y = linear_model.predict(train_X)
        error = (predict_train_y - train_y)
        rms_train = np.sqrt(np.mean(error ** 2,axis=0))
    
        predict_test_y = linear_model.predict(test_X)
        error = (predict_test_y - test_y)
        rms_test = np.sqrt(np.mean(error ** 2,axis=0))
        #
        print(rms_train,rms_test)
        # [ 0.54031084] [ 0.60065021]
    
        plt.figure(figsize=(10,8))
        plt.scatter(np.arange(test_y.size),sorted(test_y),c='b',edgecolor='None',alpha=0.5,label='actual')
        plt.scatter(np.arange(test_y.size),sorted(predict_test_y),c='g',edgecolor='None',alpha=0.5,label='predicted')
        plt.legend(loc='upper left')
        plt.ylabel('House price ($1000s)')
        plt.xlabel('House #')
        plt.show()
    

    Logistic回归

    数据获取&预处理

    logistic回归输出值在0~1之间,所以数据预处理分两部分,前13列仍然是均值归零标准差归一,label列采取(x-x.min(axis=0))/(x.max(axis=0)-x.min(axis=0))的方式

    import numpy as np
    import matplotlib.pyplot as plt
    import math
    
    def get_data(N=400):
        #506行,14列,最后一列为label,前面13列为参数
        data_original = np.loadtxt('housing.data')  
    
        scale_data = np.zeros(data_original.shape)
    
        scale_data[:,:13] = scale_n(data_original[:,:13])
        scale_data[:,-1] = scale_max(data_original[:,-1])
    
        np.random.shuffle(scale_data)
        #在位置0,插入一列1,axis=1代表列,代表b
        data = np.insert(scale_data, 0, 1, axis=1) 
    
        train_X = data[:N, :-1] #前400行为训练数据
        train_y = data[:N, -1]
        train_y.shape = (train_y.shape[0],1)
    
        test_X = data[N:, :-1]
        test_y = data[N:, -1]
        test_y.shape = (test_y.shape[0],1)
    
    
        # test测试数据没有返回
        return train_X,train_y,test_X,test_y
    
    def scale_n(x):
        return (x-x.mean(axis=0))/x.std(axis=0)
    def scale_max(x):
        print (x.min(axis=0))
        print (x.max(axis=0))
        print (x.mean(axis=0))
        print (x.std(axis=0))
    
        return (x-x.min(axis=0))/(x.max(axis=0)-x.min(axis=0))
    

     Logistic回归类

     公式参考,

    实际代码,

    class LogisticModel(object):
        def __init__(self,lamda=0.01, alpha=0.6,threhold=0.0000005):
            self.alpha = alpha
            self.threhold = threhold
            self.lamda = lamda
            
        def sigmoid(self,x):
            return 1.0/(1+np.exp(-x))
            
        def get_cost_grad(self,theta,X,y):
            m, n = X.shape
            y_dash = self.sigmoid(X.dot(theta))
            error = np.sum((y * np.log(y_dash) + (1-y) * np.log(1-y_dash)),axis=1)
            cost = -np.sum(error, axis=0)+self.lamda*theta.T.dot(theta)
            grad = X.T.dot(y_dash-y)+2.0*self.lamda*theta
                        
            return cost,grad/m
            
            
        def grad_check(self,X,y):
            epsilon = 10**-4
            m, n = X.shape   
            
            sum_error=0
            
            for i in range(300):
                theta = np.random.random((n, 1))
                j = np.random.randint(1,n)
                theta1=theta.copy()
                theta2=theta.copy()
                theta1[j]+=epsilon
                theta2[j]-=epsilon
    
                cost1,grad1 = self.get_cost_grad(theta1,X,y)
                cost2,grad2 = self.get_cost_grad(theta2,X,y)
                cost3,grad3 = self.get_cost_grad(theta,X,y)
    
                sum_error += np.abs(grad3[j]-(cost1-cost2)/float(2*epsilon))
            
        def train(self,X,y):
            m, n = X.shape  # 400,15   
            theta = np.random.random((n, 1))  #[15,1]
            #our intial prediction
            prev_cost = None
            loop_num = 0
            while(True):                      
    
                #intial cost
                cost,grad = self.get_cost_grad(theta,X,y)
                
                theta = theta- self.alpha * grad
                
                loop_num+=1
                if loop_num%100==0:
                    print (cost,loop_num)
                if prev_cost:
                    if prev_cost - cost <= self.threhold:
                        break
                if loop_num>1000:
                    break
    
                prev_cost = cost
                           
                
            self.theta = theta
            print (theta,loop_num)
            
        def predict(self,X):
            return self.sigmoid(X.dot(self.theta))
    

  • 相关阅读:
    Spark SQL 编程API入门系列之SparkSQL数据源
    [转]Window2008站点安全设置,IIS7/IIS7.5中目录执行权限的设置方法
    [转]c# 对密码执行散列和 salt 运算方法
    [转]C#使用 Salt + Hash 来为密码加密
    [转]webservice 采用SSL实现加密传输
    [转]如何借助 TLS/SSL 确保套接字连接的安全(使用 C#/VB/C++ 和 XAML 的 Windows 应用商店应用)
    [转]网银在线支付接口和应用
    [转]支付宝接口程序、文档及解读(ASP.NET)
    [转]加盐hash保存密码的正确方式
    [转]如何安全的存储密码
  • 原文地址:https://www.cnblogs.com/hellcat/p/7217555.html
Copyright © 2011-2022 走看看