zoukankan      html  css  js  c++  java
  • 房价预测-California House Prices

    K折交叉验证, 最后成绩Private Score为0.23218,在该种模型下算是不错了。

    读取训练和测试数据,并将其特征合并,统一用于数据清洗。

    %matplotlib inline
    import pandas as pd
    import torch
    import time
    from torch import nn
    from d2l import torch as d2l
    import numpy as np
    def read_csv_and_integration(train_csv_name, test_csv_name, labels=None):
        """Load the train and test CSVs and stack their features for joint cleaning.

        The training row whose ``Lot`` value is the largest is removed first.
        The label column, when one is requested, is split off the training
        frame.  The first column of each file is skipped (``iloc[:, 1:]``) and
        the remaining training and test columns are concatenated row-wise.

        Args:
            train_csv_name: Path of the training CSV file.
            test_csv_name: Path of the test CSV file.
            labels: Name of the training column to predict.  When ``None`` no
                column is split off and the returned labels are ``None``.

        Returns:
            A tuple ``(all_features, labels_predict)``: the concatenated
            feature frame (training rows first) and the label column of the
            remaining training rows, or ``None`` when ``labels`` is ``None``.

        Raises:
            FileNotFoundError: If either path does not exist.
            KeyError: If ``'Lot'`` or ``labels`` is not a training column.
        """
        train_file = pd.read_csv(train_csv_name)
        test_file = pd.read_csv(test_csv_name)

        # Remove the single training row with the largest lot size.
        train_file = train_file.drop(train_file['Lot'].idxmax())

        # Only touch the label column when one was actually requested;
        # indexing with None would raise KeyError.
        labels_predict = None
        if labels is not None:
            labels_predict = train_file[labels]
            train_file = train_file.drop(labels, axis=1)

        all_features = pd.concat((train_file.iloc[:, 1:], test_file.iloc[:, 1:]))
        return all_features, labels_predict
        
    # Locations of the two CSV files and the name of the column to predict.
    train_csv_name = '../../ycy_data/train.csv'
    test_csv_name = '../../ycy_data/test.csv'
    labels_name = 'Sold Price'
    
    def try_gpu(i=0):  #@save
        """Return device ``cuda:i`` if at least ``i + 1`` CUDA devices exist, else the CPU device."""
        enough_gpus = i + 1 <= torch.cuda.device_count()
        return torch.device(f'cuda:{i}') if enough_gpus else torch.device('cpu')
    
    def try_all_gpus():  #@save
        """Return a list with every CUDA device, or ``[cpu]`` when there is none."""
        gpu_count = torch.cuda.device_count()
        if gpu_count == 0:
            return [torch.device('cpu')]
        return [torch.device(f'cuda:{index}') for index in range(gpu_count)]
    
    # Evaluate the device helpers: the default device, the lookup for index 10,
    # and the list of all devices.  As the last expression of a notebook cell
    # the resulting tuple is shown as the cell output.
    try_gpu(), try_gpu(10), try_all_gpus()
    
    (device(type='cuda', index=0),
     device(type='cpu'),
     [device(type='cuda', index=0),
      device(type='cuda', index=1),
      device(type='cuda', index=2),
      device(type='cuda', index=3),
      device(type='cuda', index=4),
      device(type='cuda', index=5),
      device(type='cuda', index=6),
      device(type='cuda', index=7)])
    
    # Raw, untouched copies of both CSV files.
    train_file = pd.read_csv(train_csv_name)
    test_file = pd.read_csv(test_csv_name)

    # Merged feature frame (train rows first) plus the training labels.
    all_features, labels_predict = read_csv_and_integration(
        train_csv_name, test_csv_name, labels_name)
    # A separate copy of the test CSV, used later to build the submission.
    test_data = pd.read_csv(test_csv_name)
    

    找出无法处理或者价值不高的列,将其drop。

    # Columns that are removed from the feature frame before cleaning.
    drop_features = [
        'Address',
        'Summary',
        'Region',
        'Elementary School',
        'Middle School',
        'High School',
        'State',
        'City',
        'Parking features',
        'Appliances included',
    ]

    # `columns=` already selects the axis, so no separate `axis` is needed.
    all_features.drop(columns=drop_features, inplace=True)
    

    将可用数据清洗。

    # Clamp implausible construction years: anything before 1800 becomes 1800
    # and anything from 2022 onwards becomes 2020.  Comparisons against NaN are
    # False, so missing years pass through both masks unchanged and are filled
    # with the (truncated) mean of the clamped column below.
    year_built = all_features['Year built']
    year_built = year_built.mask(year_built < 1800, 1800)
    year_built = year_built.mask(year_built >= 2022, 2020)
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection, which may operate on a temporary copy.
    all_features['Year built'] = year_built.fillna(int(year_built.mean()))
    
    # Text column name -> number of leading characters kept from each value.
    features_list_dict = {
        'Heating': 5,
        'Cooling': 5,
        'Parking': 7,
        'Flooring': 8,
        'Heating features': 9,
        'Cooling features': 7,
        'Laundry features': 7,
        'Type': 7,
    }
    def cut_features_str_normal(features_list_dict, all_features):
        """Shorten and lower-case the text columns listed in ``features_list_dict``.

        For each ``column -> width`` pair, missing values are replaced by
        ``'-'`` and every value is cut to its first ``width`` characters and
        lower-cased.  The frame is modified in place and also returned.

        Args:
            features_list_dict: Mapping of column name to prefix length.
            all_features: DataFrame holding those columns.

        Returns:
            The same ``all_features`` object.
        """
        for column, width in features_list_dict.items():
            filled = all_features[column].fillna('-')
            # Bind the width as a default so the helper does not depend on the loop variable.
            all_features[column] = filled.apply(
                lambda text, width=width: text[:width].lower())
        return all_features
    # Apply the prefix/lower-case cleaning to every column listed in features_list_dict.
    all_features = cut_features_str_normal(features_list_dict,all_features)
    
    
    # Raise lot sizes below 435 to 435; NaN compares False and is left untouched.
    all_features['Lot'] = all_features['Lot'].apply(
        lambda lot: 435 if lot < 435 else lot)

    # A string entry is turned into the number of its comma-separated items.
    # NOTE(review): a purely numeric string such as "3" is therefore counted
    # as 1 — confirm this is intended for the dataset in use.
    all_features['Bedrooms'] = all_features['Bedrooms'].apply(
        lambda beds: beds.count(',') + 1 if isinstance(beds, str) else beds)

    # Dates written as 'YYYY-MM-DD' are turned into the float YYYYMMDD.
    all_features['Listed On'] = all_features['Listed On'].apply(
        lambda day: float(day.replace('-', "")))

    # Assign the filled column back: fillna(inplace=True) on a column selection
    # may only change a temporary copy, which would leave NaN behind and make
    # the string conversion below fail.
    all_features['Last Sold On'] = all_features['Last Sold On'].fillna('1970-01-01')
    all_features['Last Sold On'] = all_features['Last Sold On'].apply(
        lambda day: float(day.replace('-', "")))
    
    
    # Numeric columns whose missing and zero entries are replaced by the column mean.
    to_avg_features = [
        'Lot',
        'Bedrooms',
        'Bathrooms',
        'Full bathrooms',
        'Total interior livable area',
        'Total spaces',
        'Garage spaces',
        'Elementary School Score',
        'Elementary School Distance',
        'Middle School Score',
        'Middle School Distance',
        'High School Score',
        'High School Distance',
        'Tax assessed value',
        'Annual tax amount',
        'Listed Price',
        'Last Sold Price',
        'Zip',
    ]
    def avg_data_na(features_list, all_features):
        """Replace missing and zero entries of numeric columns with the column mean.

        For every column in ``features_list`` the NaN entries are first filled
        with the rounded column mean; afterwards every entry equal to 0 is
        replaced by the (unrounded) mean of the filled column.  The frame is
        modified in place and also returned.

        Args:
            features_list: Names of the numeric columns to process.
            all_features: DataFrame holding those columns.

        Returns:
            The same ``all_features`` object.
        """
        for column in features_list:
            filled = all_features[column].fillna(round(all_features[column].mean()))
            # Compute the mean once per column rather than once per zero entry;
            # the column does not change while the replacement runs.
            filled_mean = filled.mean()
            all_features[column] = filled.apply(
                lambda value: filled_mean if value == 0 else value)
        return all_features
    # Fill the NaN and zero entries of every column in to_avg_features with that column's mean.
    all_features = avg_data_na(to_avg_features,all_features)
    
    
    

    数据清洗完毕之后,对数值特征进行标准化(normalization):$$x \leftarrow \frac{x - \mu}{\sigma}.$$
    `dummy_na=True` 将“na”(缺失值)视为有效的特征值,并为其创建指示符特征。
    然后对离散特征进行独热(one-hot)编码。

    # Columns rescaled to zero mean and unit standard deviation.
    normalization_features = to_avg_features + ['Last Sold On', 'Listed On', 'Year built']
    all_features[normalization_features] = all_features[normalization_features].apply(
        lambda column: (column - column.mean()) / column.std())

    # One-hot encode the remaining text columns, with an extra indicator column
    # for missing values.  dtype=float keeps the whole frame numeric: recent
    # pandas versions emit boolean dummies by default, which turns `.values`
    # into an object array that torch.tensor cannot convert.
    all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
    

    以上 已经将所有特征清洗完毕,并对部分进行独热向量处理。


    将数据 tensor化,进行下面的学习。

    # Number of training rows.  Use the length of the label column rather than
    # count(), which skips NaN labels and would shift the train/test boundary.
    n_train = labels_predict.shape[0]
    # Resolve the target device once and reuse it for all three tensors.
    device = try_gpu()
    train_features = torch.tensor(all_features[:n_train].values,
                                  dtype=torch.float32, device=device)
    test_features = torch.tensor(all_features[n_train:].values,
                                 dtype=torch.float32, device=device)
    # Labels are reshaped to a column vector so they match the network output.
    train_labels = torch.tensor(labels_predict.values.reshape(-1, 1),
                                dtype=torch.float32, device=device)
    

    损失函数使用均方误差损失:$$\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
    l_n = \left( x_n - y_n \right)^2$$

    # Mean-squared-error criterion (nn.MSELoss with its default arguments).
    loss = nn.MSELoss()
    # Input width of the network: the number of columns of the training feature tensor.
    in_features = train_features.shape[1]
    def get_net():
        """Build a fresh MLP (in_features -> 128 -> 64 -> 8 -> 1, ReLU in between).

        The input width comes from the module-level ``in_features`` and the
        network is moved to the device returned by ``try_gpu()``.

        Returns:
            The newly created ``nn.Sequential`` model.
        """
        layers = [
            nn.Linear(in_features, 128), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Linear(64, 8), nn.ReLU(),
            nn.Linear(8, 1),
        ]
        net = nn.Sequential(*layers)
        net.to(device=try_gpu())
        return net
    

    对于房价,就像股票价格一样,我们关心的是相对数量,而不是绝对数量。因此,我们更关心相对误差 \(\frac{y - \hat{y}}{y}\),而不是绝对误差 \(y - \hat{y}\)。例如,如果我们在俄亥俄州农村地区估计一栋房子的价格时,我们的预测偏差了10万美元,在那里一栋典型的房子的价值是12.5万美元,那么我们可能做得很糟糕。另一方面,如果我们在加州豪宅区的预测出现了这个数字的偏差,这可能是一个惊人的准确预测(在那里,房价均值超过400万美元)。

    解决这个问题的一种方法是用价格预测的对数来衡量差异。事实上,这也是比赛中官方用来评价提交质量的误差指标。即把满足 \(|\log y - \log \hat{y}| \leq \delta\) 的较小误差 \(\delta\) 转换为 \(e^{-\delta} \leq \frac{\hat{y}}{y} \leq e^\delta\)。这使得预测价格的对数与真实标签价格的对数之间出现以下均方根误差:

    \[\sqrt{\frac{1}{n}\sum_{i=1}^n\left(\log y_i -\log \hat{y}_i\right)^2}.\]

    def log_rmse(net, features, labels):
        """Return the root-mean-squared error between log-predictions and log-labels.

        Predictions are clamped to at least 1 before the logarithm is taken so
        that the logarithm is always defined.  The forward pass runs under
        ``torch.no_grad()`` because this is an evaluation metric only, so no
        autograd graph is built for the whole data set.  The squared error is
        computed directly and does not depend on the module-level training
        criterion.

        Args:
            net: Callable mapping ``features`` to a tensor of predictions.
            features: Input tensor passed to ``net``.
            labels: Tensor of target values, shaped like the network output.

        Returns:
            The log-RMSE as a Python float.
        """
        with torch.no_grad():
            clipped_preds = torch.clamp(net(features), 1, float('inf'))
            squared_error = nn.functional.mse_loss(torch.log(clipped_preds),
                                                   torch.log(labels))
            return torch.sqrt(squared_error).item()
    
    def train(net, train_features, train_labels, test_features, test_labels,
              num_epochs, learning_rate, weight_decay, batch_size):
        """Fit ``net`` with Adam and record the log-RMSE after every epoch.

        Args:
            net: Model to optimise (updated in place).
            train_features: Training inputs.
            train_labels: Training targets.
            test_features: Held-out inputs; only used when ``test_labels`` is given.
            test_labels: Held-out targets, or ``None`` to skip held-out evaluation.
            num_epochs: Number of passes over the training iterator.
            learning_rate: Adam learning rate.
            weight_decay: Adam weight decay.
            batch_size: Batch size handed to ``d2l.load_array``.

        Returns:
            ``(train_ls, test_ls)``: per-epoch log-RMSE on the training data and
            on the held-out data (the latter stays empty without ``test_labels``).
        """
        optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate,
                                     weight_decay=weight_decay)
        # NOTE(review): the original author suspected a parameter might be
        # missing in this call — confirm against the d2l.load_array signature.
        train_iter = d2l.load_array((train_features, train_labels), batch_size,
                                    is_train=True)
        evaluate_held_out = test_labels is not None

        train_ls, test_ls = [], []
        for _ in range(num_epochs):
            for X, y in train_iter:
                optimizer.zero_grad()
                batch_loss = loss(net(X), y)
                batch_loss.backward()
                optimizer.step()
            # Metrics are taken on the full sets once the epoch is finished.
            train_ls.append(log_rmse(net, train_features, train_labels))
            if evaluate_held_out:
                test_ls.append(log_rmse(net, test_features, test_labels))
        return train_ls, test_ls
    
    def get_k_fold_data(k, i, X, y):
        """Split ``X``/``y`` into the training and validation parts of fold ``i``.

        The rows are cut into ``k`` consecutive folds of ``X.shape[0] // k``
        rows.  The last fold additionally takes the ``X.shape[0] % k`` trailing
        rows so that every sample belongs to exactly one fold.

        Args:
            k: Number of folds; must be greater than 1.
            i: Index of the fold used for validation, ``0 <= i < k``.
            X: 2-D feature tensor with one sample per row.
            y: Label tensor with the same number of rows as ``X``.

        Returns:
            ``(X_train, y_train, X_valid, y_valid)``.

        Raises:
            ValueError: If ``k <= 1`` or ``i`` is outside ``[0, k)``.
        """
        if k <= 1:
            raise ValueError(f'k must be greater than 1, got {k}')
        if not 0 <= i < k:
            raise ValueError(f'fold index i must satisfy 0 <= i < {k}, got {i}')

        n_rows = X.shape[0]
        fold_size = n_rows // k
        start = i * fold_size
        # The final fold runs to the end so the remainder rows are not lost.
        stop = n_rows if i == k - 1 else start + fold_size

        X_valid, y_valid = X[start:stop, :], y[start:stop]
        X_train = torch.cat([X[:start, :], X[stop:, :]], 0)
        y_train = torch.cat([y[:start], y[stop:]], 0)
        return X_train, y_train, X_valid, y_valid
    
    # Trained networks keyed by their rounded final validation loss (filled by k_fold).
    net_dict = {}
    # Module-level list placeholder.
    dict_kets_to_list = []
    def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
               batch_size):
        """Run k-fold cross-validation, saving and registering every fold's model.

        For each fold a fresh network from ``get_net()`` is trained, written to
        ``./tem_pth/`` with a timestamped file name, and stored in the
        module-level ``net_dict`` under its final validation loss rounded to
        four decimals.  The learning curves of the first fold are plotted.

        NOTE(review): two folds whose rounded validation loss is equal share a
        key, so the later one replaces the earlier one in ``net_dict``.

        Args:
            k: Number of folds.
            X_train: Full training feature tensor.
            y_train: Full training label tensor.
            num_epochs: Epochs per fold.
            learning_rate: Learning rate passed to ``train``.
            weight_decay: Weight decay passed to ``train``.
            batch_size: Batch size passed to ``train``.

        Returns:
            ``(mean_train_loss, mean_valid_loss)``: the final-epoch log-RMSE
            averaged over the ``k`` folds.
        """
        import os  # local import: only needed to create the checkpoint directory

        # torch.save does not create missing directories, so make sure it exists.
        os.makedirs('./tem_pth', exist_ok=True)

        train_l_sum, valid_l_sum = 0, 0
        for i in range(k):
            data = get_k_fold_data(k, i, X_train, y_train)
            net = get_net()
            train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                       weight_decay, batch_size)

            final_train = round(train_ls[-1], 4)
            final_valid = round(valid_ls[-1], 4)
            time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            torch.save(net, f'./tem_pth/{time_str}----K-Flod:{i + 1}'
                            f'----train_ls:{final_train}'
                            f'----valid_ls:{final_valid}.pth')

            # Keep the model so the best fold can be picked up for prediction.
            net_dict[final_valid] = net

            train_l_sum += train_ls[-1]
            valid_l_sum += valid_ls[-1]
            if i == 0:
                d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],
                         xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                         legend=['train', 'valid'], yscale='log')
            print(f'fold {i + 1}, train log rmse {float(train_ls[-1]):f}, '
                  f'valid log rmse {float(valid_ls[-1]):f}')
        return train_l_sum / k, valid_l_sum / k
    

    进行K折交叉验证。

    # Hyper-parameters for the cross-validation run.
    k = 10
    num_epochs = 500
    lr = 0.01
    weight_decay = 0.001
    batch_size = 64

    train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs,
                              lr, weight_decay, batch_size)
    print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
          f'平均验证log rmse: {float(valid_l):f}')
    

    # Hyper-parameters handed to train_and_pred (the learning rate is now 0.1).
    k = 10
    num_epochs = 500
    lr = 0.1
    weight_decay = 0.001
    batch_size = 64

    def train_and_pred(train_features, test_feature, train_labels, test_data,
                       num_epochs, lr, weight_decay, batch_size):
        """Predict the test set with the best cross-validation model and export it.

        No training takes place here.  The network stored in the module-level
        ``net_dict`` under the smallest key (the lowest rounded validation
        loss) is applied to ``test_feature``; the predictions are written into
        ``test_data['Sold Price']`` and the ``Id``/``Sold Price`` columns are
        saved to ``submission.csv``.

        Args:
            train_features: Unused; kept for interface compatibility.
            test_feature: Feature tensor of the test set.
            train_labels: Unused; kept for interface compatibility.
            test_data: Test DataFrame with an ``Id`` column; gains a
                ``Sold Price`` column (modified in place).
            num_epochs: Unused; kept for interface compatibility.
            lr: Unused; kept for interface compatibility.
            weight_decay: Unused; kept for interface compatibility.
            batch_size: Unused; kept for interface compatibility.

        Raises:
            ValueError: If ``net_dict`` is empty (no fold has been trained).
        """
        best_score = min(net_dict)
        net = net_dict[best_score]
        print(best_score)

        # Apply the network to the test features that were passed in (not a
        # global); gradients are not needed for inference.
        with torch.no_grad():
            preds = net(test_feature).cpu().numpy()

        # Flatten to one value per row and reformat for the Kaggle submission.
        test_data['Sold Price'] = preds.reshape(-1)
        submission = test_data[['Id', 'Sold Price']]
        submission.to_csv('submission.csv', index=False)
        
    # Produce submission.csv from the test features using the networks collected in net_dict.
    train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size)
    
  • 相关阅读:
    在 jQuery Repeater 中为多个字段排序
    使用 AjaxManager 生成调用服务器端方法的 javascript 函数
    使用 JQueryElement ResponseProgress 显示页面执行进度
    (原创)反ARP攻击的绝招,让你在ARP的炮雨攻击下永不掉线
    (原创)最详细可靠的Cadence16.01破解crack和安装方法步骤
    (原创)PCI总线特性及信号说明
    (原创)Modelsim的“The system date appears to have been set back.Cannot continue”问题的解决办法
    爱你哦
    为ASP.NET封装的SQL数据库访问类
    JavaScript中的高级特性及特别对象、属性和方法
  • 原文地址:https://www.cnblogs.com/A-FM/p/15201845.html
Copyright © 2011-2022 走看看