zoukankan      html  css  js  c++  java
  • PyTorch基础——预测共享单车的使用量

    预处理实验数据

    读取数据

    下载数据 网盘链接:https://pan.baidu.com/s/1n_FtZjAswWR9rfuI6GtDhA 提取码:y4fb

    #导入需要使用的库
    import numpy as np
    import pandas as pd #读取csv文件的库
    import matplotlib.pyplot as plt
    import torch
    from torch.autograd import Variable
    import torch.optim as optim
    
    # 让输出的图形直接在Notebook中显示
    %matplotlib inline
    
    # Load the hourly bike-sharing records from the CSV file into memory;
    # `rides` is a pandas DataFrame.
    data_path = 'hour.csv'
    rides = pd.read_csv(filepath_or_buffer=data_path)
    # Preview the first rows (shown as cell output when run in a notebook).
    rides.head()
    

    对于类型变量的处理

    # One-hot encode the categorical variables.
    # Value ranges as noted by the original author: season=1..4, weathersit=1..3,
    # mnth=1..12, hr=0..23, weekday=0..6.
    # Each categorical column gains one indicator column per distinct value,
    # e.g. season yields season_1, season_2, season_3 and season_4.
    dummy_fields = ['season', 'weathersit', 'mnth', 'hr', 'weekday']
    for each in dummy_fields:
        # dtype=np.uint8 keeps the indicators numeric. pandas >= 2.0 defaults
        # to bool, which turns DataFrame.values into an object array once mixed
        # with the numeric columns, and torch.FloatTensor cannot convert that.
        dummies = pd.get_dummies(rides[each], prefix=each, drop_first=False,
                                 dtype=np.uint8)
        rides = pd.concat([rides, dummies], axis=1)

    # Drop the original categorical columns (now replaced by their indicator
    # columns) together with the columns the tutorial does not use as features.
    fields_to_drop = ['instant', 'dteday', 'season', 'weathersit',
                      'weekday', 'atemp', 'mnth', 'workingday', 'hr']
    data = rides.drop(fields_to_drop, axis=1)
    data.head()
    

    对于数值类型变量进行标准化

    # Standardize the continuous columns to zero mean and unit standard deviation.
    quant_features = ['cnt', 'temp', 'hum', 'windspeed']

    # Keep each column's [mean, std] in scaled_features so that values can be
    # mapped back to the original scale later (e.g. when plotting predictions).
    scaled_features = {}
    for each in quant_features:
        mean, std = data[each].mean(), data[each].std()
        scaled_features[each] = [mean, std]
        # Replace the whole column instead of writing through .loc[:, each]:
        # an in-place write of floats into the integer 'cnt' column is a
        # deprecated incompatible-dtype assignment in recent pandas.
        data[each] = (data[each] - mean) / std
    

    将数据集进行分割

    # Split into training and test sets: the last 21 days (21 * 24 hourly
    # records) form the test set, everything before them the training set.
    n_test = 21 * 24
    test_data = data[-n_test:]
    train_data = data[:-n_test]
    print('训练数据:',len(train_data),'测试数据:',len(test_data))

    # Separate the feature columns from the target columns.
    target_fields = ['cnt', 'casual', 'registered']
    features, targets = train_data.drop(target_fields, axis=1), train_data[target_fields]
    test_features, test_targets = test_data.drop(target_fields, axis=1), test_data[target_fields]

    # Convert from pandas to numpy. Both arrays are cast to float explicitly so
    # that a mixed-dtype frame cannot leak an object array into the tensor
    # conversion later on.
    X = features.values.astype(float)
    # Only 'cnt' is used as the training target; make it a column vector.
    Y = targets['cnt'].values.astype(float).reshape(-1, 1)
    losses = []

    features.head()
    

    构建神经网络模型

    手动编写用 Tensor 运算的人工神经网络

    # Network architecture: features.shape[1] input units, a hidden layer of
    # 10 units, and a single output unit.
    input_size = features.shape[1]  # number of input units (one per feature column)
    hidden_size = 10                # number of hidden units
    output_size = 1                 # number of output units
    batch_size = 128                # number of records per mini-batch
    # Trainable parameters, initialised from a standard normal distribution.
    # There is a bias for the hidden layer only; the output layer has none.
    weights1 = Variable(torch.randn(input_size, hidden_size), requires_grad=True)   # input -> hidden weights
    biases1 = Variable(torch.randn(hidden_size), requires_grad=True)                # hidden-layer bias
    weights2 = Variable(torch.randn(hidden_size, output_size), requires_grad=True)  # hidden -> output weights
    def neu(x):
        """Run the hand-written two-layer network forward on a batch.

        Reads the module-level parameters weights1, biases1 and weights2.

        Args:
            x: 2-D tensor with one row per sample and input_size columns
               (it is matrix-multiplied with weights1).

        Returns:
            Tensor with one row per sample and output_size columns.
        """
        # Hidden layer: affine map followed by a sigmoid. The bias vector is
        # expanded to one copy per row of the batch.
        pre_activation = torch.mm(x, weights1) + biases1.expand(x.size(0), hidden_size)
        activated = torch.sigmoid(pre_activation)

        # Output layer: plain linear map, no bias and no activation.
        return torch.mm(activated, weights2)
    def cost(x, y):
        """Return the mean squared error between tensors x and y.

        Args:
            x: Predicted values.
            y: Reference values, broadcast-compatible with x.

        Returns:
            Zero-dimensional tensor holding the mean of the squared differences.
        """
        squared_diff = (x - y) ** 2
        return squared_diff.mean()
    def zero_grad():
        """Reset the accumulated gradients of the hand-written network's parameters.

        Each of weights1, biases1 and weights2 is checked on its own, so a
        parameter that already holds a gradient is cleared even when another
        one has not received a gradient yet. Parameters whose .grad is still
        None are skipped.
        """
        for param in (weights1, biases1, weights2):
            if param.grad is not None:
                param.grad.data.zero_()
    def optimizer_step(learning_rate):
        """Apply one plain gradient-descent update to the hand-written network.

        Every parameter is moved in place by -learning_rate times its current
        gradient. Gradients must already be populated (the .grad attribute of
        each parameter is read unconditionally).

        Args:
            learning_rate: Step size that scales each gradient.
        """
        for param in (weights1, weights2, biases1):
            param.data.add_(-learning_rate * param.grad.data)
    

    调用PyTorch现成的模块,用torch.nn.Sequential构建顺序(Sequential)神经网络

    # Same architecture built from PyTorch's ready-made modules:
    # features.shape[1] input units, 10 hidden units, 1 output unit.
    # Note that this rebinds the names `neu` and `cost`.
    input_size = features.shape[1]
    hidden_size = 10
    output_size = 1
    batch_size = 128
    neu = torch.nn.Sequential(
        torch.nn.Linear(in_features=input_size, out_features=hidden_size),
        torch.nn.Sigmoid(),
        torch.nn.Linear(in_features=hidden_size, out_features=output_size),
    )
    # Mean-squared-error loss and stochastic gradient descent over all of the
    # model's parameters.
    cost = torch.nn.MSELoss()
    optimizer = optim.SGD(params=neu.parameters(), lr=0.01)
    

    数据的分批次处理

    # Training loop: 1000 passes over the training set with mini-batch updates.
    # Convert the training arrays to float32 tensors once up front instead of
    # re-wrapping every batch in every epoch.
    X_tensor = torch.from_numpy(np.asarray(X, dtype=np.float32))
    Y_tensor = torch.from_numpy(np.asarray(Y, dtype=np.float32))
    losses = []
    for i in range(1000):
        # Losses of the individual mini-batches of this epoch.
        batch_loss = []
        for start in range(0, len(X_tensor), batch_size):
            # Slicing clamps at the end of the tensor, so the final batch is
            # simply shorter when the data size is not a multiple of batch_size.
            end = start + batch_size
            xx = X_tensor[start:end]
            yy = Y_tensor[start:end]
            predict = neu(xx)
            loss = cost(predict, yy)
            # Clear the previous gradients, back-propagate, then update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # .item() extracts the scalar loss as a plain Python float.
            batch_loss.append(loss.item())

        # Every 100 epochs, record and print the mean batch loss of the epoch.
        if i % 100 == 0:
            epoch_loss = np.mean(batch_loss)
            losses.append(epoch_loss)
            print(i, epoch_loss)
    
    # Plot the recorded losses; the i-th recorded value is placed at x = i * 100
    # because one value was stored every 100 epochs.
    fig = plt.figure(figsize=(10, 7))
    plt.plot(100 * np.arange(len(losses)), losses, marker='o', linestyle='-')
    plt.xlabel('epoch')
    plt.ylabel('MSE')
    

    测试网络

    使用测试数据集测试网络

    # Run the trained network on the test set.
    # Test-set 'cnt' values as a float column vector.
    targets = test_targets['cnt'].values.astype(float).reshape(-1, 1)

    # Build float32 tensors for the test features and targets. The explicit
    # dtype conversion also copes with a mixed-dtype (object) feature array.
    x = torch.from_numpy(np.asarray(test_features.values, dtype=np.float32))
    y = torch.from_numpy(np.asarray(targets, dtype=np.float32))

    # Inference only: disable gradient tracking so no autograd graph is built,
    # then hand the predictions back as a numpy array.
    with torch.no_grad():
        predict = neu(x).numpy()
    
    
    # Plot predictions against the real data for the test period.
    # x axis: position within the test set (labelled with dates below);
    # y axis: predicted or actual count.
    fig, ax = plt.subplots(figsize=(10, 7))

    # Undo the standardization of 'cnt' so both curves are in original units.
    mean, std = scaled_features['cnt']
    ax.plot(predict * std + mean, label='Prediction', linestyle='--')
    ax.plot(targets * std + mean, label='Data', linestyle='-')
    ax.legend()
    ax.set_xlabel('Date-time')
    ax.set_ylabel('Counts')

    # Label the x axis with dates: one tick every 24 records, starting at the
    # 13th record. The dates come from `rides`, since 'dteday' was dropped
    # from `data`.
    dates = pd.to_datetime(rides.loc[test_data.index, 'dteday'])
    dates = dates.dt.strftime('%b %d')
    ax.set_xticks(np.arange(len(dates))[12::24])
    # .iloc makes the slice explicitly positional; the Series keeps the
    # integer row labels of the test rows.
    _ = ax.set_xticklabels(dates.iloc[12::24], rotation=45)
    
  • 相关阅读:
    Linux Vim编辑器
    Linux sed 流编辑器
    Shell 编程 (变量和条件测试)
    Linux 下 Bash配置文件读取
    Linux 用户、权限
    Linux 指令(一)文件/目录操作
    Ubuntu 下安装 Swoole
    Mysql IN语句查询
    Mysql 查询优化
    Mysql 获取表属性
  • 原文地址:https://www.cnblogs.com/wwj99/p/12179229.html
Copyright © 2011-2022 走看看