zoukankan      html  css  js  c++  java
  • otto-group-product-classification-challenge(Pytorch处理多分类问题)

    参考一:《Pytorch深度学习实践》(第九集)

    参考二:Otto-Neural-Net

    注意:使用的数据来自 Kaggle 的 Otto Group Product Classification Challenge 竞赛(原文此处附有数据下载链接,抓取时已丢失,可在 Kaggle 竞赛页面获取)

    由于上面给出的两个参考链接,对代码的讲解都已经很详细,所以这里不再赘述,下面按自己的理解整理了代码如下:

    Imports

    import numpy as np
    import pandas as pd
    import os
    import time
    from xgboost import XGBClassifier
    import matplotlib.pyplot as plt
    import torch
    from torch.utils.data import Dataset, DataLoader
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim

    Prepare Data

    # Load the Kaggle training set, drop the id column, and shuffle the rows
    # (frac=1 draws every row in random order).
    csv_path = os.path.join('train.csv')
    datas = pd.read_csv(csv_path).copy()
    datas = datas.drop(columns='id').sample(frac=1)

    # Encode the string class labels ("Class_1".."Class_9") as integer codes 0..8.
    datas.target = datas.target.astype('category').cat.codes

    # 70/30 train/validation split; safe because rows were shuffled above.
    rows = datas.shape[0]  # datas is a pandas DataFrame
    train_rows = int(rows * 0.7)
    train_datas = datas.iloc[:train_rows, :]
    val_datas = datas.iloc[train_rows:, :]

    # Split each part into a feature matrix and a target vector
    # (.values turns the DataFrame/Series into a numpy array).
    train_features = train_datas.loc[:, train_datas.columns != 'target'].values
    train_targets = train_datas.target.values

    val_features = val_datas.loc[:, val_datas.columns != 'target'].values
    val_targets = val_datas.target.values
    # 封装 继承dataset
    # Minimal Dataset wrapper pairing a feature matrix with its target vector.
    class CustomDataset(Dataset):
        """Indexable dataset yielding (features[idx], targets[idx]) pairs."""

        def __init__(self, features, targets):
            super().__init__()
            self.features = features  # per-sample feature rows
            self.targets = targets    # per-sample labels, same length

        def __len__(self):
            # One sample per feature row.
            return len(self.features)

        def __getitem__(self, idx):
            # DataLoader collates these tuples into (inputs, targets) batches.
            return self.features[idx], self.targets[idx]
    
    
    # Wrap each split in a Dataset object.
    train_ds = CustomDataset(train_features, train_targets)
    val_ds = CustomDataset(val_features, val_targets)

    # Mini-batch loaders; no shuffle here since the rows were already
    # shuffled during preparation.
    batch_size = 64
    train_loader = DataLoader(train_ds, batch_size=batch_size)
    val_loader = DataLoader(val_ds, batch_size=batch_size)

    Device

    # Prefer a CUDA GPU when one is available; otherwise fall back to the CPU.
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')

    Design Model

    # Fully connected classifier for the 93 Otto features -> 9 classes.
    class Net(torch.nn.Module):
        """MLP: 93 -> 64 (BatchNorm) -> 32 (Dropout) -> 16 -> 9 logits."""

        def __init__(self):
            super(Net, self).__init__()
            self.l1 = torch.nn.Linear(93, 64)
            self.bn = torch.nn.BatchNorm1d(num_features=64)
            self.l2 = torch.nn.Linear(64, 32)
            self.dropout = torch.nn.Dropout(p=0.1)
            self.l3 = torch.nn.Linear(32, 16)
            self.l4 = torch.nn.Linear(16, 9)

        def forward(self, x):
            # Layer 1: linear -> batch-norm -> ReLU.
            x = torch.relu(self.bn(self.l1(x)))
            # Layer 2: linear -> dropout -> ReLU (dropout applied before the
            # activation, matching the training setup).
            x = torch.relu(self.dropout(self.l2(x)))
            # Layer 3: plain linear -> ReLU.
            x = torch.relu(self.l3(x))
            # Raw logits: CrossEntropyLoss applies log-softmax itself,
            # so no activation on the output layer.
            return self.l4(x)
    
    # Build the network and move its parameters onto the selected device.
    model = Net()
    model = model.to(device)

    Construct Loss and Optimizer

    # Cross-entropy over raw logits (combines log-softmax and NLL loss).
    criterion = torch.nn.CrossEntropyLoss().to(device)
    # Plain SGD with momentum on all model parameters.
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

    Train and Validate

    def train():
        """Run one training epoch.

        Returns:
            (mean batch loss, accuracy in percent) over the training set.
        """
        # Ensure dropout/batch-norm are in training mode — val() may have
        # switched the model to eval mode; without this the epoch would
        # silently train without dropout or BatchNorm statistics updates.
        model.train()
        losses = 0.0
        correct = 0
        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()

            # forward + backward + update
            outputs = model(inputs.float())
            loss = criterion(outputs, targets.long())
            loss.backward()
            optimizer.step()

            losses += loss.item()
            # Predicted class = index of the max logit per row.
            _, predicted = torch.max(outputs.data, dim=1)
            correct += (predicted == targets).sum().item()

        losses /= len(train_loader)          # mean loss per batch
        accs = correct * 100 / len(train_ds)  # accuracy over all samples
        print('Training:  loss: %.2f    accuracy: %.2f %%' %(losses, accs))

        return losses, accs
    def val():
        """Evaluate the model on the validation set.

        Returns:
            (mean batch loss, accuracy in percent) over the validation set.
        """
        # Switch dropout/batch-norm to inference behaviour; the original
        # omitted this, so dropout stayed active and BatchNorm kept updating
        # its running statistics during validation, biasing the metrics.
        model.eval()
        losses = 0.0
        correct = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs.float())
                losses += criterion(outputs, labels.long()).item()
                _, predicted = torch.max(outputs.data, dim=1)
                correct += (predicted == labels).sum().item()
        # Restore training mode so the next train() epoch behaves correctly.
        model.train()
        losses /= len(val_loader)
        accs = correct * 100 / len(val_ds)
        print('Validating:   loss: %.2f    accuracy: %.2f %%' %(losses, accs))

        return losses, accs
    # Per-epoch metric histories for plotting.
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    maxAcc = 0  # best validation accuracy seen so far

    for epoch in range(100):
        print('[epoch: %d]' % (epoch+1))

        losses, accs = train()
        train_losses.append(losses)
        train_accs.append(accs)

        losses, accs = val()
        val_losses.append(losses)
        val_accs.append(accs)

        if maxAcc < accs:
            maxAcc = accs

        # Early stopping: after epoch 20, stop once none of the last five
        # validation accuracies matched the best one (no recent improvement).
        check = np.greater(maxAcc, val_accs[-5:])
        if check.all() and (epoch > 20):
            # The scraped original had a literal line break inside this
            # string (a SyntaxError); '\n' restores the intended newline.
            print('Convergence met!\n')
            break

    print('Maximum validation accuracy: %.2f %%' % (maxAcc))

    Plot Losses

    # Exponentiate the cross-entropy losses (e**loss, i.e. per-epoch
    # perplexity) before plotting, which spreads out small differences.
    # NOTE(review): the original comment called this "exponential smoothing",
    # but no smoothing is applied — each point is transformed independently.
    import math
    import matplotlib.pyplot as plt

    tmp_train_losses = [math.e ** loss for loss in train_losses]
    tmp_val_losses = [math.e ** loss for loss in val_losses]
    plt.figure(figsize=(10, 5))
    plt.plot(tmp_train_losses, 'r', label='Training')
    plt.plot(tmp_val_losses, 'b', label='Validating')
    plt.xlabel('Epoch')
    plt.ylabel('Loss per epoch')
    plt.legend()
    plt.show()
  • 相关阅读:
    算法-经典趣题-寻找假银币
    一天一个 Linux 命令(3):cat 命令
    算法-经典趣题-青蛙过河
    常用数据库有哪些?
    SpringBoot2.0入门教程(一) 快速入门,项目构建HelloWorld示例
    一天一个 Linux 命令(2):ls 命令
    算法-经典趣题-爱因斯坦阶梯问题
    一天一个 Linux 命令(1):vim 命令
    什么是开发环境、测试环境、UAT环境、仿真环境、生产环境?
    算法-经典趣题-渔夫捕鱼
  • 原文地址:https://www.cnblogs.com/heyour/p/13466077.html
Copyright © 2011-2022 走看看