zoukankan      html  css  js  c++  java
  • 李宏毅机器学习HW2(逻辑回归)

    问题引入

    很简单,就是二分类问题。

    数据处理

    首先读入数据

    def read_file():
        """Load the training features, training labels and test features.

        Reads ``X_train.csv``, ``Y_train.csv`` and ``X_test.csv`` (in that
        order) from the current working directory, drops the first column of
        each table and converts the remaining columns to NumPy arrays.

        Returns:
            Tuple ``(x_train, y_train, x_test)`` of 2-D arrays.
        """
        def load(csv_name):
            # The first column is discarded -- presumably a row id; confirm
            # against the data files.
            return pd.read_csv(csv_name).iloc[:, 1:].to_numpy()

        x_train = load('X_train.csv')
        y_train = load('Y_train.csv')
        x_test = load('X_test.csv')
        return x_train, y_train, x_test
    

      

    总的可用于训练的数据为(54256,510),测试数据为(27622,510)。

    接下来标准化

    def normalize(X, train=True, X_mean=None, X_std=None):
        """Standardize each column of ``X`` to zero mean and unit variance.

        Args:
            X: Array whose columns are standardized (statistics are taken
                along axis 0).
            train: When True, the column means and standard deviations are
                computed from ``X`` itself. When False, the supplied
                ``X_mean`` and ``X_std`` are reused so that held-out data is
                scaled exactly like the training data.
            X_mean: Column means to reuse; required when ``train`` is False.
            X_std: Column standard deviations to reuse; required when
                ``train`` is False.

        Returns:
            Tuple ``(X_normalized, X_mean, X_std)``.

        Raises:
            ValueError: If ``train`` is False and ``X_mean`` or ``X_std`` is
                missing.
        """
        if train:
            X_mean = np.mean(X, axis=0).reshape(1, -1)  # per-column mean
            X_std = np.std(X, axis=0).reshape(1, -1)    # per-column std
        elif X_mean is None or X_std is None:
            # Fail early with a clear message instead of an arithmetic
            # TypeError on None further down.
            raise ValueError('X_mean and X_std are required when train=False')

        # The small epsilon keeps constant columns from dividing by zero.
        X = (X - X_mean) / (X_std + 1e-8)
        return X, X_mean, X_std

    接下来将一部分数据分出来当作验证集

    def split_data(x_train, y_train, ratio):
        """Split features and labels into a training part and a validation part.

        The first ``floor(ratio * len)`` rows form the training part and the
        remaining rows form the validation part. No shuffling is done here.

        Args:
            x_train: 2-D feature array.
            y_train: 2-D label array.
            ratio: Fraction of rows kept for training.

        Returns:
            Tuple ``(x_train, y_train, x_validation, y_validation)``.
        """
        # Each array is cut at an index derived from its own length.
        x_cut = math.floor(ratio * len(x_train))
        y_cut = math.floor(ratio * len(y_train))

        head_x, tail_x = x_train[:x_cut, :], x_train[x_cut:, :]
        head_y, tail_y = y_train[:y_cut, :], y_train[y_cut:, :]
        return head_x, head_y, tail_x, tail_y

    按照9:1的比例拆分(ratio=0.9),最后训练集为(48830,510),验证集为(5426,510)。

    逻辑回归

    逻辑回归的公式为$f_{w,b}(x) = \frac{1}{1+e^{-z}}$,其中$z=w_1x_1+w_2x_2+\cdots+w_nx_n+b$。将样本的特征值代入该式计算后,会得到一个(0,1)之间的概率值,以0.5为界限可以将其二分类。

    现在给出一组训练数据:

     那么$f_{w,b} $这个函数可以正确分类的概率为:

    $L(w,b)=f_{w,b}(x^1)f_{w,b}(x^2)(1-f_{w,b}(x^3))\cdots f_{w,b}(x^n)$

    所以我们的目的是找到使得$L(w,b)$最大的$w$和$b$。

    然后我们把问题进行转换,寻找$w,b$,使得$-\ln L(w,b)$最小。

     $-\ln L(w,b)=-(\ln f_{w,b}(x^1)+\ln f_{w,b}(x^2)+\ln(1-f_{w,b}(x^3))+\cdots)$

                      $=\sum_{i=1}^{n}\left[-\hat{y}^i\ln f_{w,b}(x^i)-(1-\hat{y}^i)\ln(1-f_{w,b}(x^i))\right]$

     接下来梯度下降求解,懒得打公式了,直接上照片:

    这梯度和线性回归是相同的。

    def sigmod(z):
        """Return the logistic sigmoid of ``z``, clipped to [1e-8, 1 - 1e-8].

        The value is computed from ``exp(-|z|)``, which never exceeds 1, so
        large-magnitude inputs do not overflow inside ``np.exp``. The result
        is then clipped away from exactly 0 and 1 so that taking its
        logarithm (or that of its complement) stays finite.

        Args:
            z: Scalar or array of logits.

        Returns:
            Sigmoid of ``z`` limited to the range [1e-8, 1 - 1e-8].
        """
        decay = np.exp(-np.abs(z))  # always in (0, 1], cannot overflow
        # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- same function.
        prob = np.where(z >= 0, 1 / (1.0 + decay), decay / (1.0 + decay))
        return np.clip(prob, 1e-8, 1-1e-8)
    
    
    def f(x, w, b):
        """Return the model's predicted probability for the inputs ``x``.

        Computes the linear score ``np.dot(x, w) + b`` and passes it through
        ``sigmod``.
        """
        logits = np.dot(x, w) + b
        return sigmod(logits)
    
    
    def cross_entropy_loss(y_pred, y):
        """Return the summed binary cross-entropy between ``y_pred`` and ``y``.

        Args:
            y_pred: Predicted probabilities; the final ``[0][0]`` indexing
                relies on this being a column, i.e. shape (n, 1).
            y: Array of 0/1 labels with the same number of elements.

        Returns:
            The total (not averaged) loss as a scalar.
        """
        targets = y.reshape(1, -1)
        positive_term = np.dot(targets, np.log(y_pred))
        negative_term = np.dot(1 - targets, np.log(1 - y_pred))
        total = -positive_term - negative_term
        # Both dot products are 1x1 matrices; unwrap the single entry.
        return total[0][0]
    
    
    def gradient(x, y, w, b):
        """Return the gradients of the summed cross-entropy loss.

        Args:
            x: Batch of input rows.
            y: Labels for the batch.
            w: Current weights.
            b: Current bias.

        Returns:
            Tuple ``(w_grad, b_grad)``: ``x.T`` times the prediction error,
            and the sum of the prediction error.
        """
        residual = f(x, w, b) - y  # prediction minus label, per sample
        return np.dot(x.T, residual), np.sum(residual)
    
    
    def shuffle(x, y):
        """Return ``x`` and ``y`` reordered by one shared random permutation.

        The same row order is applied to both arrays so that every sample
        stays paired with its label. Randomness comes from NumPy's global
        random state.
        """
        order = np.random.permutation(len(x))
        shuffled_x = x[order]
        shuffled_y = y[order]
        return shuffled_x, shuffled_y
    
    
    def accuracy(y_pred, y):
        """Return the fraction of predictions that match the labels.

        Computed as one minus the mean absolute difference, which equals the
        hit rate when both inputs contain only 0s and 1s.
        """
        mismatch_rate = np.mean(np.abs(y_pred - y))
        return 1 - mismatch_rate

    模型训练

    这里我们使用小批次(mini-batch)梯度下降:每次用一小批样本计算梯度并更新参数。

    def train(x_train, y_train, x_validation, y_validation,
              iter_time=30, batch_size=12, learning_rate=0.05):
        """Fit logistic-regression parameters with mini-batch gradient descent.

        After every epoch the accuracy and the mean cross-entropy loss are
        recorded on both the training and the validation data. When training
        finishes, the last recorded values are printed and the curves are
        saved to ``loss.png`` and ``acc.png`` (and shown).

        Args:
            x_train: 2-D training feature array.
            y_train: Training labels, one row per sample.
            x_validation: Validation feature array.
            y_validation: Validation labels.
            iter_time: Number of passes (epochs) over the training data.
            batch_size: Number of samples per gradient step.
            learning_rate: Base step size; the effective step is
                ``learning_rate / sqrt(step)``.

        Returns:
            Tuple ``(w, b)``: weights of shape ``(n_features, 1)`` and bias.

        Raises:
            ValueError: If ``iter_time`` or ``batch_size`` is smaller than 1.
        """
        if iter_time < 1 or batch_size < 1:
            raise ValueError('iter_time and batch_size must be at least 1')

        w = np.ones((x_train.shape[1], 1))
        b = 1

        train_loss = []  # mean training loss per epoch
        dev_loss = []    # mean validation loss per epoch
        train_acc = []   # training accuracy per epoch
        dev_acc = []     # validation accuracy per epoch

        # Floor division: a trailing partial batch is skipped.
        batches_per_epoch = len(x_train) // batch_size
        step = 1  # counts updates across all epochs; drives the decay

        for _ in range(iter_time):
            # Reshuffle every epoch so the batches differ between epochs.
            x_train, y_train = shuffle(x_train, y_train)
            for i in range(batches_per_epoch):
                start = i * batch_size
                x = x_train[start:start + batch_size]
                y = y_train[start:start + batch_size]
                w_grad, b_grad = gradient(x, y, w, b)
                rate = learning_rate / np.sqrt(step)
                w = w - rate * w_grad
                b = b - rate * b_grad

                step += 1

            y_train_pred = f(x_train, w, b)
            train_acc.append(accuracy(np.round(y_train_pred), y_train))
            train_loss.append(cross_entropy_loss(y_train_pred, y_train) / len(x_train))

            y_dev_pred = f(x_validation, w, b)
            dev_acc.append(accuracy(np.round(y_dev_pred), y_validation))
            dev_loss.append(cross_entropy_loss(y_dev_pred, y_validation) / len(x_validation))

        print('训练集正确率:' + str(train_acc[-1]))
        print('训练集误差:' + str(train_loss[-1]))
        print('验证集正确率:' + str(dev_acc[-1]))
        print('验证集误差:' + str(dev_loss[-1]))

        curves = (
            ('Loss', train_loss, dev_loss, 'loss.png'),
            ('Accuracy', train_acc, dev_acc, 'acc.png'),
        )
        for title, train_curve, dev_curve, filename in curves:
            # A fresh figure per chart: on non-interactive backends show()
            # does not clear the canvas, so without this the second image
            # would also contain the curves of the first.
            plt.figure()
            plt.plot(train_curve)
            plt.plot(dev_curve)
            plt.title(title)
            plt.legend(['train', 'dev'])
            plt.savefig(filename)
            plt.show()
            plt.close()

        return w, b

    测试预测

    def predict(x, w, b):
        """Write the rounded 0/1 predictions for ``x`` to ``result.csv``.

        The file starts with the header ``id,label`` followed by one line per
        row of ``x``, where ``id`` is the zero-based row index.

        Args:
            x: Feature array to classify.
            w: Trained weights.
            b: Trained bias.
        """
        labels = np.round(f(x, w, b))
        # The context manager closes the file even if a write fails.
        with open('result.csv', 'w') as out:
            out.write('id,label\n')
            for i, row in enumerate(labels):
                out.write(str(i) + ',' + str(int(row[0])) + '\n')

     

     最后去kaggle提交了一下,分数不高。

    完整代码

    import pandas as pd
    import numpy as np
    import math
    import matplotlib.pyplot as plt
    
    X_train_file_path = './X_train.csv'
    Y_train_file_path = './Y_train.csv'
    X_test_file_path = './X_test.csv'
    
    
    def read_file():
        """Load the training features, training labels and test features.

        Reads ``X_train.csv``, ``Y_train.csv`` and ``X_test.csv`` (in that
        order) from the current working directory, drops the first column of
        each table and converts the remaining columns to NumPy arrays.

        Returns:
            Tuple ``(x_train, y_train, x_test)`` of 2-D arrays.
        """
        def load(csv_name):
            # The first column is discarded -- presumably a row id; confirm
            # against the data files.
            return pd.read_csv(csv_name).iloc[:, 1:].to_numpy()

        x_train = load('X_train.csv')
        y_train = load('Y_train.csv')
        x_test = load('X_test.csv')
        return x_train, y_train, x_test
    
    
    def normalize(X, train=True, X_mean=None, X_std=None):
        """Standardize each column of ``X`` to zero mean and unit variance.

        Args:
            X: Array whose columns are standardized (statistics are taken
                along axis 0).
            train: When True, the column means and standard deviations are
                computed from ``X`` itself. When False, the supplied
                ``X_mean`` and ``X_std`` are reused so that held-out data is
                scaled exactly like the training data.
            X_mean: Column means to reuse; required when ``train`` is False.
            X_std: Column standard deviations to reuse; required when
                ``train`` is False.

        Returns:
            Tuple ``(X_normalized, X_mean, X_std)``.

        Raises:
            ValueError: If ``train`` is False and ``X_mean`` or ``X_std`` is
                missing.
        """
        if train:
            X_mean = np.mean(X, axis=0).reshape(1, -1)  # per-column mean
            X_std = np.std(X, axis=0).reshape(1, -1)    # per-column std
        elif X_mean is None or X_std is None:
            # Fail early with a clear message instead of an arithmetic
            # TypeError on None further down.
            raise ValueError('X_mean and X_std are required when train=False')

        # The small epsilon keeps constant columns from dividing by zero.
        X = (X - X_mean) / (X_std + 1e-8)
        return X, X_mean, X_std
    
    
    def split_data(x_train, y_train, ratio):
        """Split features and labels into a training part and a validation part.

        The first ``floor(ratio * len)`` rows form the training part and the
        remaining rows form the validation part. No shuffling is done here.

        Args:
            x_train: 2-D feature array.
            y_train: 2-D label array.
            ratio: Fraction of rows kept for training.

        Returns:
            Tuple ``(x_train, y_train, x_validation, y_validation)``.
        """
        # Each array is cut at an index derived from its own length.
        x_cut = math.floor(ratio * len(x_train))
        y_cut = math.floor(ratio * len(y_train))

        head_x, tail_x = x_train[:x_cut, :], x_train[x_cut:, :]
        head_y, tail_y = y_train[:y_cut, :], y_train[y_cut:, :]
        return head_x, head_y, tail_x, tail_y
    
    
    def sigmod(z):
        """Return the logistic sigmoid of ``z``, clipped to [1e-8, 1 - 1e-8].

        The value is computed from ``exp(-|z|)``, which never exceeds 1, so
        large-magnitude inputs do not overflow inside ``np.exp``. The result
        is then clipped away from exactly 0 and 1 so that taking its
        logarithm (or that of its complement) stays finite.

        Args:
            z: Scalar or array of logits.

        Returns:
            Sigmoid of ``z`` limited to the range [1e-8, 1 - 1e-8].
        """
        decay = np.exp(-np.abs(z))  # always in (0, 1], cannot overflow
        # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- same function.
        prob = np.where(z >= 0, 1 / (1.0 + decay), decay / (1.0 + decay))
        return np.clip(prob, 1e-8, 1-1e-8)
    
    
    def f(x, w, b):
        """Return the model's predicted probability for the inputs ``x``.

        Computes the linear score ``np.dot(x, w) + b`` and passes it through
        ``sigmod``.
        """
        logits = np.dot(x, w) + b
        return sigmod(logits)
    
    
    def cross_entropy_loss(y_pred, y):
        """Return the summed binary cross-entropy between ``y_pred`` and ``y``.

        Args:
            y_pred: Predicted probabilities; the final ``[0][0]`` indexing
                relies on this being a column, i.e. shape (n, 1).
            y: Array of 0/1 labels with the same number of elements.

        Returns:
            The total (not averaged) loss as a scalar.
        """
        targets = y.reshape(1, -1)
        positive_term = np.dot(targets, np.log(y_pred))
        negative_term = np.dot(1 - targets, np.log(1 - y_pred))
        total = -positive_term - negative_term
        # Both dot products are 1x1 matrices; unwrap the single entry.
        return total[0][0]
    
    
    def gradient(x, y, w, b):
        """Return the gradients of the summed cross-entropy loss.

        Args:
            x: Batch of input rows.
            y: Labels for the batch.
            w: Current weights.
            b: Current bias.

        Returns:
            Tuple ``(w_grad, b_grad)``: ``x.T`` times the prediction error,
            and the sum of the prediction error.
        """
        residual = f(x, w, b) - y  # prediction minus label, per sample
        return np.dot(x.T, residual), np.sum(residual)
    
    
    def shuffle(x, y):
        """Return ``x`` and ``y`` reordered by one shared random permutation.

        The same row order is applied to both arrays so that every sample
        stays paired with its label. Randomness comes from NumPy's global
        random state.
        """
        order = np.random.permutation(len(x))
        shuffled_x = x[order]
        shuffled_y = y[order]
        return shuffled_x, shuffled_y
    
    
    def accuracy(y_pred, y):
        """Return the fraction of predictions that match the labels.

        Computed as one minus the mean absolute difference, which equals the
        hit rate when both inputs contain only 0s and 1s.
        """
        mismatch_rate = np.mean(np.abs(y_pred - y))
        return 1 - mismatch_rate
    
    
    def train(x_train, y_train, x_validation, y_validation,
              iter_time=30, batch_size=12, learning_rate=0.05):
        """Fit logistic-regression parameters with mini-batch gradient descent.

        After every epoch the accuracy and the mean cross-entropy loss are
        recorded on both the training and the validation data. When training
        finishes, the last recorded values are printed and the curves are
        saved to ``loss.png`` and ``acc.png`` (and shown).

        Args:
            x_train: 2-D training feature array.
            y_train: Training labels, one row per sample.
            x_validation: Validation feature array.
            y_validation: Validation labels.
            iter_time: Number of passes (epochs) over the training data.
            batch_size: Number of samples per gradient step.
            learning_rate: Base step size; the effective step is
                ``learning_rate / sqrt(step)``.

        Returns:
            Tuple ``(w, b)``: weights of shape ``(n_features, 1)`` and bias.

        Raises:
            ValueError: If ``iter_time`` or ``batch_size`` is smaller than 1.
        """
        if iter_time < 1 or batch_size < 1:
            raise ValueError('iter_time and batch_size must be at least 1')

        w = np.ones((x_train.shape[1], 1))
        b = 1

        train_loss = []  # mean training loss per epoch
        dev_loss = []    # mean validation loss per epoch
        train_acc = []   # training accuracy per epoch
        dev_acc = []     # validation accuracy per epoch

        # Floor division: a trailing partial batch is skipped.
        batches_per_epoch = len(x_train) // batch_size
        step = 1  # counts updates across all epochs; drives the decay

        for _ in range(iter_time):
            # Reshuffle every epoch so the batches differ between epochs.
            x_train, y_train = shuffle(x_train, y_train)
            for i in range(batches_per_epoch):
                start = i * batch_size
                x = x_train[start:start + batch_size]
                y = y_train[start:start + batch_size]
                w_grad, b_grad = gradient(x, y, w, b)
                rate = learning_rate / np.sqrt(step)
                w = w - rate * w_grad
                b = b - rate * b_grad

                step += 1

            y_train_pred = f(x_train, w, b)
            train_acc.append(accuracy(np.round(y_train_pred), y_train))
            train_loss.append(cross_entropy_loss(y_train_pred, y_train) / len(x_train))

            y_dev_pred = f(x_validation, w, b)
            dev_acc.append(accuracy(np.round(y_dev_pred), y_validation))
            dev_loss.append(cross_entropy_loss(y_dev_pred, y_validation) / len(x_validation))

        print('训练集正确率:' + str(train_acc[-1]))
        print('训练集误差:' + str(train_loss[-1]))
        print('验证集正确率:' + str(dev_acc[-1]))
        print('验证集误差:' + str(dev_loss[-1]))

        curves = (
            ('Loss', train_loss, dev_loss, 'loss.png'),
            ('Accuracy', train_acc, dev_acc, 'acc.png'),
        )
        for title, train_curve, dev_curve, filename in curves:
            # A fresh figure per chart: on non-interactive backends show()
            # does not clear the canvas, so without this the second image
            # would also contain the curves of the first.
            plt.figure()
            plt.plot(train_curve)
            plt.plot(dev_curve)
            plt.title(title)
            plt.legend(['train', 'dev'])
            plt.savefig(filename)
            plt.show()
            plt.close()

        return w, b
    
    
    def predict(x, w, b):
        """Write the rounded 0/1 predictions for ``x`` to ``result.csv``.

        The file starts with the header ``id,label`` followed by one line per
        row of ``x``, where ``id`` is the zero-based row index.

        Args:
            x: Feature array to classify.
            w: Trained weights.
            b: Trained bias.
        """
        labels = np.round(f(x, w, b))
        # The context manager closes the file even if a write fails.
        with open('result.csv', 'w') as out:
            out.write('id,label\n')
            for i, row in enumerate(labels):
                out.write(str(i) + ',' + str(int(row[0])) + '\n')
    
    
    if __name__ == '__main__':
        # Pipeline: load -> standardize -> split -> train -> predict.
        x_train, y_train, x_test = read_file()
        x_train, x_mean, x_std = normalize(x_train)
        x_train, y_train, x_validation, y_validation = split_data(x_train, y_train, 0.9)
        w, b = train(x_train, y_train, x_validation, y_validation)
        # Standardize the test data with the statistics computed on the
        # training data, so the model sees features on the scale it was
        # trained on.
        x_test, x_mean, x_std = normalize(x_test, train=False, X_mean=x_mean, X_std=x_std)
        predict(x_test, w, b)
  • 相关阅读:
    hao947 : Mybatis resultMap配置插入和主键自增返回 : 好947
    VelocityTracker简单介绍
    Java中StringBuilder的清空方法比較
    jquery中的动画
    数据库索引的作用和长处缺点
    很具体GC学习笔记
    深入理解 JBoss 7/WildFly Standalone 模式启动过程
    curl命令具体解释
    【免费】iPhone上最好用的短信群发软件: 高速短信4.1
    Project interpreter not specified(eclipse+pydev)
  • 原文地址:https://www.cnblogs.com/zyb993963526/p/13702141.html
Copyright © 2011-2022 走看看