zoukankan      html  css  js  c++  java
  • ML——手写logistic-regression 实现线性二元分类器

    参考李宏毅老师ML作业二:https://colab.research.google.com/drive/1JaMKJU7hvnDoUfZjvUKzm9u-JLeX6B2C#scrollTo=RJuoQ_R2jUmX

               https://github.com/PoRuiHuang/ML2020SPRING

      1 """
      2 2020/4/11
      3 cfancy
      4 logistic-regression
      5 实做一个线性二元分类器,来根据人们的个人资料,判断其年收入
      6 是否高于50000$
      7 所用数据是已经做了稍微的处理
      8 X_train :训练集
      9 Y_trian :目标集(训练标签集)
     10 X_trian :测试集
     11 手写  gradient descent
     12 """
     13 # 导入库
     14 import sys
     15 import numpy as np
     16 import pandas as pd
     17 import math
     18 import matplotlib.pyplot as plt
     19 import csv
     20 
     21 #常用的基础函数
     22  #平均值
     23 def mean(x):
     24   return sum(x) / len(x)
     25 
     26  #值与平均值之差
     27 def de_mean(x):
     28   x_bar = mean(x)
     29   return [x_i - x_bar for x_i in x]
     30 
     31  #矩阵点乘
     32 def dot(v, w):
     33     return sum(v_i * w_i for v_i, w_i in zip(v, w))
     34  #python 中 zip()函数:打包为元组的列表
     35 
     36  #
     37 def sum_of_squares(v):
     38     """
     39     :param v:
     40     :return:
     41     """
     42     return dot(v, v)
     43 
     44  # 方差
     45 def variance(x):
     46     """
     47     方差
     48     :param x:
     49     :return:
     50     """
     51     n = len(x)
     52     deviations = de_mean(x)
     53     return sum_of_squares(deviations) / (n - 1)
     54  #标准差
     55 def standard_deviation(x):
     56     """
     57     标准差
     58     :param x:
     59     :return:
     60     """
     61     return math.sqrt(variance(x))
     62 
     63  #协方差
     64 def covariance(x,y):
     65     """
     66     协方差
     67     :param x:
     68     :param y:
     69     :return:
     70     """
     71     n = len(x)
     72     return dot(de_mean(x), de_mean(y)) / (n - 1)
     73 
     74  #相关系数
     75 def correlation(x, y):
     76     """
     77     相关系数
     78     :return:
     79     """
     80     stdev_x = standard_deviation(x)
     81     stdev_y = standard_deviation(y)
     82     if stdev_x > 0 and stdev_y > 0:
     83         return covariance(x, y) / stdev_x / stdev_y
     84     else:
     85         return 0
     86 
     87 # 一些有用的函数
     88  # 打乱索引
     89 def _shuffle(X, Y):
     90     """
     91     将两个长度相等的列表或者数组打乱其索引
     92     打乱训练集(即打乱索引)
     93     :param X:
     94     :param Y:
     95     :return:
     96     """
     97     randomize = np.arange(len(X))
     98     np.random.shuffle(randomize)
     99     return (X[randomize], Y[randomize])
    100 # A = np.arange(10)
    101 # B = np.arange(10)
    102 #
    103 # A ,B = _shuffle(A,B)
    104 # print(A)  #[3 5 2 9 6 7 0 8 4 1]
    105 #
    106 # print(B)  #[3 5 2 9 6 7 0 8 4 1]
    107 
    108  #
    109 def _sigmoid(z):
    110     """
    111     Sigmoid 函数用来计算概率
    112     :param z:
    113     :return:
    114     """
    115     return np.clip(1 / (1.0 + np.exp(-z)), 1e-8, 1 - (1e-8))  #clip 限制数组中的值
    116 
    117 # activate function
    118 def _f(X, w, b):
    119     """
    120     logistic_regression 函数
    121     :param X: 输入数据  shape = [batch_size, data_dimension
    122     :param w: 参数 权重 w 向量  shape = [data_dimension, ]
    123     :param b: 参数 偏置 b 数值
    124     :return: predicted probability of each row of X being positively labeled, shape = [batch_size, ]
    125     """
    126     return _sigmoid(np.matmul(X, w) + b)
    127 
    128 def _predict(X, w, b):
    129     """
    130     结果通过四舍五入的
    131     :param X:
    132     :param w:
    133     :param b:
    134     :return:返回预测的值
    135     """
    136     return np.round(_f(X, w,b)).astype(np.int)
    137 
    138 def _accuracy(Y_pred, Y_label):
    139     # This function calculates prediction accuracy
    140     acc = 1 - np.mean(np.abs(Y_pred - Y_label))
    141     return acc
    142 
    143 
    144 """--------------------准备数据--------------------"""
    145 np.random.seed(0)   ##随机数生成器
    146 # 读取数据 X_train Y_train X_test  并对数据进行正规化(归一化)normalization
    147 X_train_fpath = './data/X_train'  #训练集
    148 Y_train_fpath = './data/Y_train'  #目标集(标签集)
    149 X_test_fpath = './data/X_test'  # 测试集
    150 output_fpath = './output_{}.csv'  # 预测保存
    151 
    152 #解析csv文件转为numpy数组
    153 drop_list = []
    154 with open(Y_train_fpath) as f:
    155     # next(f)
    156     # Y_train = np.array([line.strip('
    ').split(',')[1] for line in f], dtype = float)
    157     # print(Y_train.shape)
    158     # print(type(Y_train))
    159     # print(Y_train)
    160 
    161     data2 = pd.read_csv(f)
    162     # print(data2.shape)
    163     data2 = data2.iloc[:, 1:]
    164     # data2.drop(columns=['migration code-change in msa','migration code-change in reg','migration code-move within reg','migration prev res in sunbelt','country of birth father','country of birth mother','country of birth self'],inplace=True)
    165 
    166     # next(f)
    167     Y_train = data2.to_numpy(dtype=float)
    168     Y_train = Y_train.reshape((len(Y_train),))
    169 # Parse csv files to numpy array
    170 with open(X_train_fpath) as f:
    171     # next(f)
    172     # X_train = np.array([line.strip('
    ').split(',')[1:] for line in f], dtype = float)
    173     # print(X_train.shape)
    174     # print(X_train)
    175 
    176     data1 = pd.read_csv(f)
    177     # print(data1.shape)
    178 
    179     data1 = data1.iloc[:, 1:]
    180     # data1.drop(columns=['migration code-change in msa','migration code-change in reg','migration code-move within reg','migration prev res in sunbelt','country of birth father','country of birth mother','country of birth self'],inplace=True)
    181     # data.drop(columns=['id', 'migration code-change in msa','migration code-change in reg','migration code-move within reg','migration prev res in sunbelt','country of birth father','country of birth mother','country of birth self'],inplace=True)
    182     X_train = data1.to_numpy(dtype=float)
    183 
    184     for i in range(len(X_train[0])):
    185         corr = correlation(X_train[:, i], Y_train)
    186         # print(i,corr,sep=": ")
    187         if (abs(corr) < 0.01):
    188             drop_list.append(i)
    189 
    190     print(drop_list)
    191     # print(len(drop_list))
    192     print(X_train.shape)
    193     X_train = np.delete(X_train, drop_list, 1)
    194 
    195 
    196 with open(X_test_fpath) as f:
    197     # next(f)
    198     # X_test = np.array([line.strip('
    ').split(',')[1:] for line in f], dtype = float)
    199     # print(X_test.shape)
    200     # print(X_test)
    201 
    202     data3 = pd.read_csv(f)
    203     # print(data3.shape)
    204     data3 = data3.iloc[:, 1:]
    205     # data3.drop(columns=['migration code-change in msa','migration code-change in reg','migration code-move within reg','migration prev res in sunbelt','country of birth father','country of birth mother','country of birth self'],inplace=True)
    206     X_test = data3.to_numpy(dtype=float)
    207     X_test = np.delete(X_test, drop_list, 1)
    208 print("------------------------")
    209 print(X_train)
    210 print("------------------------")
    211 print(Y_train)
    212 print("------------------------")
    213 print(X_test)
    214 
    215 #正规化(归一化)
    216 def _normalize(X, train = True , specified_column = None, X_mean = None, X_std = None):
    217     """
    218     此函数归一化X的特定列
    219     :param X: 数据
    220     :param train:  train = true 表示 处理training data,False 表示 testing data
    221     :param specified_column: 将被标准化的列的索引  None 表示所有列都被归一化
    222     :param X_mean: 当train = False 时training data 的平均值
    223     :param X_std:当train = False 时training data 的标准差
    224     :return: 返回归一化后的数据,training data的平均值 ,training data 的标准差
    225     处理测试数据时,将重用训练数据的均值和标准方差
    226     """
    227     if specified_column == None:
    228         specified_column = np.arange(X.shape[1])
    229     if train:
    230         X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)
    231         X_std   = np.std(X[:, specified_column], 0).reshape(1, -1)
    232 
    233     X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)
    234 
    235     return X, X_mean, X_std
    236 
    237 #将处理后的数据切分为训练集和发展集
    238 def _train_dev_split(X, Y, dev_ratio = 0.25):
    239     """
    240     归一化后的数据切分为训练集与发展集
    241     :param X:
    242     :param Y:
    243     :param dev_ration:
    244     :return:
    245     """
    246     train_size = int(len(X) * (1 - dev_ratio))
    247     return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]
    248 
    249 # 归一化训练集和测试集
    250 X_train, X_mean, X_std = _normalize(X_train, train=True)
    251 X_test, _, _ = _normalize(X_test, train=False, specified_column=None, X_mean=X_mean, X_std=X_std)
    252 
    253 #将数据分为训练集和发展集
    254 dev_ratio = 0.1
    255 X_train, Y_train, X_dev, Y_dev = _train_dev_split(X_train, Y_train, dev_ratio = dev_ratio)
    256 
    257 train_size = X_train.shape[0]
    258 dev_size = X_dev.shape[0]
    259 test_size = X_test.shape[0]
    260 data_dim = X_train.shape[1]
    261 print('Size of training set: {}'.format(train_size))
    262 print('Size of development set: {}'.format(dev_size))
    263 print('Size of testing set: {}'.format(test_size))
    264 print('Dimension of data: {}'.format(data_dim))
    265 
    266 """-------------------梯度函数和损失函数-----------------------"""
    267 def _cross_entropy_loss(y_pred, Y_label,):
    268     """
    269     交叉熵损失函数  根据推导的公式
    270     :param y_pred:  概率预测  浮点型向量
    271     :param Y_label: 真实标签值  布尔值向量
    272     :return: 返回的是数值
    273     """
    274     cross_entropy = -np.dot(Y_label, np.log(y_pred)) - np.dot((1 - Y_label), np.log(1 - y_pred))
    275     return cross_entropy
    276 
    277 def _gradient(X, Y_label, w, b):
    278     """
    279     计算交叉熵的梯度  根据笔记推到的公式 w_grad
    280     :param X:
    281     :param Y_label:
    282     :param w:
    283     :param b:
    284     :return:
    285     """
    286     y_pred = _f(X, w, b)
    287     pred_error = Y_label - y_pred
    288     w_grad = -np.sum(pred_error * X.T, 1)
    289     b_grad = -np.sum(pred_error)
    290     return w_grad, b_grad
    291 
    292 """-------------------Training--------------------------"""
    293 """
    294 使用小批次梯度下降法来训练(MBGD——Mini-batch gradient descent),训练集被分为许多小批次,
    295 我们分别计算其梯度以及损失,并根据该批次来更新模型的参数。当一次回圈(epoch)完成,也就是整个训练集的所有
    296 小批次都被使用过一次后,我们将所有训练资料打散并且重新分成新的小的批次,进行下一个回圈,知道事先设定好的回圈数量为止。
    297 """
    298 #初始化参数 w,b
    299 w = np.zeros((data_dim,))
    300 b = np.zeros((1,))
    301 
    302 #训练时需要的参数 最大迭代次数、每个batch的大小以及学习率
    303 max_iter = 40
    304 batch_size = 10
    305 lr = 0.1
    306 
    307 #保存每次迭代时的损失和精度,以进行绘制
    308 train_loss = []
    309 dev_loss = []
    310 train_acc = []
    311 dev_acc = []
    312 
    313 #计算参数更新的次数
    314 count = 1
    315 
    316 # 迭代训练
    317 for epoch in range(max_iter):
    318     #在每个回圈开始时的随机变动
    319     X_train, Y_train = _shuffle(X_train, Y_train)
    320 
    321     #小批次训练
    322     for idx in range(int(np.floor(train_size / batch_size))):
    323         X = X_train[idx * batch_size:(idx+1) * batch_size]
    324         Y = Y_train[idx * batch_size:(idx+1) * batch_size]
    325 
    326         #计算梯度
    327         w_grad, b_grad = _gradient(X, Y, w, b)
    328 
    329         #梯度下降参数更新
    330         #学习率随时间衰减
    331         w = w - lr/np.sqrt(count) * w_grad
    332         b = b - lr/np.sqrt(count) * b_grad
    333 
    334         count = count + 1
    335 
    336     #计算训练集和发展集的损失和精度
    337     y_train_pred = _f(X_train, w, b)
    338     Y_train_pred = np.round(y_train_pred) #round ()四舍五入
    339     train_acc.append(_accuracy(Y_train_pred, Y_train))
    340     train_loss.append(_cross_entropy_loss(y_train_pred, Y_train,) / train_size)
    341 
    342     y_dev_pred = _f(X_dev, w, b)
    343     Y_dev_pred = np.round(y_dev_pred)
    344     dev_acc.append(_accuracy(Y_dev_pred, Y_dev))
    345     dev_loss.append(_cross_entropy_loss(y_dev_pred, Y_dev) / dev_size)
    346     print('Training loss: {}'.format(train_loss[-1]))
    347     print('Development loss: {}'.format(dev_loss[-1]))
    348     print('Training accuracy: {}'.format(train_acc[-1]))
    349     print('Development accuracy: {}'.format(dev_acc[-1]))
    350 
    351 """------------------------画图----------------------------"""
    352 # 损失曲线(Loss curve)
    353 plt.plot(train_loss)
    354 plt.plot(dev_loss)
    355 plt.title('Loss')
    356 plt.legend(['train', 'dev'])
    357 plt.savefig('loss.png')
    358 plt.show()
    359 
    360 # 精度曲线(Accuracy curve)
    361 plt.plot(train_acc)
    362 plt.plot(dev_acc)
    363 plt.title('Accuracy')
    364 plt.legend(['train', 'dev'])
    365 plt.savefig('acc.png')
    366 plt.show()
    367 
    368 
    369 """------------------------------保存-------------------------------"""
    370 # 保存预测测试集标签 (Predict testing labels)
    371 predictions = _predict(X_test, w, b)
    372 with open(output_fpath.format('logistic'), 'w') as f:
    373     f.write('id,label
    ')
    374     for i, label in  enumerate(predictions):
    375         f.write('{},{}
    '.format(i, label))
    376 
    377 # 对测试集归一化处理  打印出最重要的权重w
    378 ind = np.argsort(np.abs(w))[::-1]  #从后往前
    379 with open(X_test_fpath) as f:
    380     content = f.readline().strip('
    ').split(',')
    381 features = np.array(content)
    382 for i in ind[0:10]:
    383     print('*************')
    384     print(features[i], w[i])
  • 相关阅读:
    软件开发基本原则(四)—— 风险管理 (转)
    软件开发基本原则(三)—— 基本原则 (转)
    人心散了,项目必然要败! (转)
    项目失败的若干征兆
    项目管理思考——角色转变
    软件开发基本原则(二)—— 典型错误 (转)
    项目管理思考——我适合做项目经理吗
    flex 学习篇 导航类容器
    mfc 中在 dc中 显示 位图
    第一次作业 黎娜 2013551605
  • 原文地址:https://www.cnblogs.com/cfancy/p/12685890.html
Copyright © 2011-2022 走看看