  • California Housing Price Prediction: Data Preprocessing

    This post is the data-preprocessing part of Chapter 2 in this series of reading notes.

    • Import the usual data-analysis libraries
    import pandas as pd
    import numpy as np
    
    import os 
    import tarfile
    from six.moves import urllib
    

    Fetching the data

    download_root="https://raw.githubusercontent.com/ageron/handson-ml/master/"
    house_path="datasets/housing"
    housing_url=download_root+house_path+"/housing.tgz"
    
    def fetch_housing_data(housing_url=housing_url,house_path=house_path):
        if not os.path.exists(house_path):
            os.makedirs(house_path)
        tgz_path=os.path.join(house_path,'housing.tgz')
        urllib.request.urlretrieve(housing_url,tgz_path)
        housing_tgz=tarfile.open(tgz_path)
        housing_tgz.extractall(path=house_path)
        housing_tgz.close()
    
    def load_housing_data(house_path=house_path):
        csv_path=os.path.join(house_path,"housing.csv")
        return pd.read_csv(csv_path)
    

    Initial analysis and exploration of the data

    # fetch_housing_data()  # download the data and extract the csv file
    housing=load_housing_data()
    housing.head()
    
    longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
    0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
    1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
    2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
    3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
    4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
    housing.info()
    # total_bedrooms has missing values
    # the first 9 columns are float64: longitude, latitude, housing median age, total rooms, total bedrooms, population, households, median income, median house value
    # the last column, ocean_proximity, is of object type
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 20640 entries, 0 to 20639
    Data columns (total 10 columns):
    longitude             20640 non-null float64
    latitude              20640 non-null float64
    housing_median_age    20640 non-null float64
    total_rooms           20640 non-null float64
    total_bedrooms        20433 non-null float64
    population            20640 non-null float64
    households            20640 non-null float64
    median_income         20640 non-null float64
    median_house_value    20640 non-null float64
    ocean_proximity       20640 non-null object
    dtypes: float64(9), object(1)
    memory usage: 1.6+ MB
    
    # check which categories ocean_proximity contains
    housing['ocean_proximity'].value_counts()
    
    <1H OCEAN     9136
    INLAND        6551
    NEAR OCEAN    2658
    NEAR BAY      2290
    ISLAND           5
    Name: ocean_proximity, dtype: int64
    
    # basic statistics for the numeric features
    housing.describe()
    
    longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
    count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000
    mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909
    std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874
    min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
    25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000
    50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000
    75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000
    max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000
    %matplotlib inline
    import matplotlib.pyplot as plt
    # look at the distribution of each numeric feature
    housing.hist(bins=50,figsize=(20,15))
    # plt.show()
    
    

    Geographic distribution

    housing.plot(kind="scatter", x="longitude", y="latitude")
    
    

    housing.plot(kind="scatter", x="longitude", y="latitude",alpha=0.4)
    # alpha: scalar, optional, default None; the alpha blending value, between 0 (transparent) and 1 (opaque)
    # the scatter plot now highlights the high-density areas: the darker the points, the denser the population (even though I am not that familiar with California geography)
    
    

    housing.plot(kind='scatter',x='longitude',y='latitude',alpha=0.4,
                s=housing['population']/50,label='population',
                c='median_house_value',cmap=plt.get_cmap("jet"),colorbar=True,
                figsize=(9,6))
    # import matplotlib
    # plt.figure(figsize=(15,9)) 
    # sc=plt.scatter(housing['longitude'],housing['latitude'],alpha=0.4,
    #             s=housing['population']/100,label='population',
    #             c=housing['median_house_value'],cmap=plt.get_cmap("jet"))
    # plt.legend()
    # matplotlib.rcParams["font.sans-serif"]=["SimHei"]
    # matplotlib.rcParams['axes.unicode_minus'] = False
    # matplotlib.rcParams['font.size'] =15
    # plt.xlabel('经度')
    # plt.ylabel('纬度')
    # color_bar=plt.colorbar(sc)
    # color_bar.set_label('meidan_house_value')
    # plt.show()
    # the commented-out code above is the full pure-matplotlib version: it draws the same scatter plot, adds the colorbar, and sets Chinese axis labels (hence the SimHei font settings)
    
    

    #  House prices are closely tied to location and population density, but how do we describe the relationship between variables mathematically? With the standard correlation coefficient.
    # The most commonly used one is the Pearson correlation coefficient.
    corr_matrix = housing.corr()
    corr_matrix
    
    longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
    longitude 1.000000 -0.924664 -0.108197 0.044568 0.069608 0.099773 0.055310 -0.015176 -0.045967
    latitude -0.924664 1.000000 0.011173 -0.036100 -0.066983 -0.108785 -0.071035 -0.079809 -0.144160
    housing_median_age -0.108197 0.011173 1.000000 -0.361262 -0.320451 -0.296244 -0.302916 -0.119034 0.105623
    total_rooms 0.044568 -0.036100 -0.361262 1.000000 0.930380 0.857126 0.918484 0.198050 0.134153
    total_bedrooms 0.069608 -0.066983 -0.320451 0.930380 1.000000 0.877747 0.979728 -0.007723 0.049686
    population 0.099773 -0.108785 -0.296244 0.857126 0.877747 1.000000 0.907222 0.004834 -0.024650
    households 0.055310 -0.071035 -0.302916 0.918484 0.979728 0.907222 1.000000 0.013033 0.065843
    median_income -0.015176 -0.079809 -0.119034 0.198050 -0.007723 0.004834 0.013033 1.000000 0.688075
    median_house_value -0.045967 -0.144160 0.105623 0.134153 0.049686 -0.024650 0.065843 0.688075 1.000000

    Correlations between the features

    import seaborn as sns
    plt.figure(figsize=(25,20))
    hm=sns.heatmap(corr_matrix,cbar=True,annot=True,square=True,fmt='.2f',annot_kws={'size':9}, cmap="YlGnBu")
    plt.show()
    

    corr_matrix['median_house_value'].sort_values(ascending=False)
    """
    The correlation coefficient ranges from -1 to 1. Values close to 1 mean a strong positive correlation;
    for example, the median house value tends to rise when the median income rises.
    Values close to -1 mean a strong negative correlation;
    latitude and the median house value show a slight negative correlation (i.e., prices tend to drop as you go north).
    Finally, values close to 0 mean there is no linear correlation.
    """
    
    # pandas' scatter_matrix gives another view of the correlations between several variables
    from pandas.plotting import scatter_matrix
    attributes=['median_house_value',"median_income","total_bedrooms","housing_median_age"]
    scatter_matrix(housing[attributes],figsize=(12,9))
    # sns.pairplot(housing[['median_house_value',"median_income",]],height=5)
    # seaborn's pairplot produces the same kind of result
    housing.plot(kind="scatter",x='median_income',y='median_house_value',alpha=0.2)
    
    

    Creating new features

    • The key relationship is between the median income and the median house value: both the plot above and the correlation coefficient show a clear positive correlation.
    • The upward trend is clearly visible and the points are not too dispersed.
    • The price cap we saw in the statistics earlier shows up as a horizontal line at $500,000.
    • The histograms show that housing_median_age and median_house_value have long-tailed distributions; a log or square-root transform is worth trying (a small sketch follows this list).
    • Of course, every project calls for its own treatment, but the overall approach is similar.
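
    As noted in the last bullet, a log1p transform can tame the long tail; a minimal sketch that plots the raw and transformed median_house_value side by side, without modifying the housing DataFrame:

    # compare the raw long-tailed distribution with its log1p transform
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    housing['median_house_value'].hist(bins=50, ax=axes[0])
    axes[0].set_title('median_house_value')
    np.log1p(housing['median_house_value']).hist(bins=50, ax=axes[1])
    axes[1].set_title('log1p(median_house_value)')
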
    housing['rooms_per_household']=housing['total_rooms']/housing['households']
    housing['bedrooms_per_room']= housing['total_bedrooms']/housing['total_rooms']
    housing['population_per_household']=housing['population']/housing['households']
    
    corr_matrix = housing.corr()
    corr_matrix['median_house_value'].sort_values(ascending=False)
    # """
    # the new bedrooms_per_room feature has a clearer (negative) correlation with the median house value than total_rooms or total_bedrooms: the lower the ratio, the higher the price;
    # rooms_per_household is also more informative than a district's total_rooms: obviously, the larger the home, the higher its value
    # """
    
    median_house_value          1.000000
    median_income               0.688075
    rooms_per_household         0.151948
    total_rooms                 0.134153
    housing_median_age          0.105623
    households                  0.065843
    total_bedrooms              0.049686
    population_per_household   -0.023737
    population                 -0.024650
    longitude                  -0.045967
    latitude                   -0.144160
    bedrooms_per_room          -0.255880
    Name: median_house_value, dtype: float64
    

    Data cleaning and building a processing pipeline

    • Handle missing values
    • Handle the text (object) feature
    • Feature scaling
    • Build the pipeline
    • We have already used essentially all of these steps in earlier posts, so as reading notes I will not explain them in great detail again here.
    # handling the missing values in the total_bedrooms feature
    """
    - drop the rows that contain missing values: dropna()
    - drop the whole feature: dropna(axis=1)
    - fill them in (median, mean, 0, interpolation): fillna(housing['total_bedrooms'].median()), which is most convenient with pandas
    """
    from sklearn.preprocessing import Imputer
    imputer=Imputer(strategy='mean')
    housing_num=housing.drop('ocean_proximity',axis=1)
    imputer.fit(housing_num)
    
    Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
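
    Note: sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in 0.22; on newer versions the equivalent class is SimpleImputer. A minimal sketch of the replacement:

    # on scikit-learn >= 0.20, use SimpleImputer instead of the old Imputer
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy='mean')
    imputer.fit(housing_num)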
    
    housing_num_trans=pd.DataFrame(imputer.transform(housing_num),columns=housing_num.columns)
    housing_num_trans.info()
    # missing values filled in; honestly, for plain imputation pandas' fillna on the original data saves a little time, and then there is nothing to worry about later (a pandas sketch follows the output below)
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 20640 entries, 0 to 20639
    Data columns (total 12 columns):
    longitude                   20640 non-null float64
    latitude                    20640 non-null float64
    housing_median_age          20640 non-null float64
    total_rooms                 20640 non-null float64
    total_bedrooms              20640 non-null float64
    population                  20640 non-null float64
    households                  20640 non-null float64
    median_income               20640 non-null float64
    median_house_value          20640 non-null float64
    rooms_per_household         20640 non-null float64
    bedrooms_per_room           20640 non-null float64
    population_per_household    20640 non-null float64
    dtypes: float64(12)
    memory usage: 1.9 MB
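
    As the comment above says, the same imputation can be done with pandas alone; a minimal sketch:

    # pandas-only alternative: fill the missing bedrooms with the column median
    median = housing['total_bedrooms'].median()
    housing['total_bedrooms'].fillna(median)   # add inplace=True to apply it to housing directly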
    
    # handling the text (object-typed) feature
    from sklearn.preprocessing import LabelEncoder
    encoder= LabelEncoder()
    house_cat=housing['ocean_proximity']
    house_cat_encode=encoder.fit_transform(house_cat)
    house_cat_encode
    
    array([3, 3, 3, ..., 1, 1, 1], dtype=int64)
    
    encoder.classes_
    
    array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
          dtype=object)
    
    • A similar operation came up in an earlier post; its drawback is that the model may assume two nearby encoded values are more similar than two distant ones.
    • That is why LabelEncoder is normally used only for the labels, not for transforming features.
    • The more common approach is one-hot encoding: create one binary attribute per category, e.g. the INLAND attribute is 1 when the category is INLAND and 0 otherwise.
    • sklearn provides the OneHotEncoder class for this, similar to pd.get_dummies() in pandas.
    from sklearn.preprocessing import OneHotEncoder
    # this OneHotEncoder only handles numeric input and expects a 2D array, hence the LabelEncoder step above and the reshape below
    encoder=OneHotEncoder()
    housing_cat_1hot=encoder.fit_transform(house_cat_encode.reshape((-1,1)))
    housing_cat_1hot
    
    <20640x5 sparse matrix of type '<class 'numpy.float64'>'
    	with 20640 stored elements in Compressed Sparse Row format>
    
    housing_cat_1hot.toarray()
    
    array([[0., 0., 0., 1., 0.],
           [0., 0., 0., 1., 0.],
           [0., 0., 0., 1., 0.],
           ...,
           [0., 1., 0., 0., 0.],
           [0., 1., 0., 0., 0.],
           [0., 1., 0., 0., 0.]])
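
    On scikit-learn >= 0.20 the LabelEncoder detour is unnecessary, because OneHotEncoder accepts string categories directly; a minimal sketch:

    # newer OneHotEncoder: encode the string column directly (note the double brackets for 2D input)
    from sklearn.preprocessing import OneHotEncoder
    cat_encoder = OneHotEncoder()
    housing_cat_1hot = cat_encoder.fit_transform(housing[['ocean_proximity']])
    cat_encoder.categories_   # the learned category list, analogous to encoder.classes_ above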
    
    # LabelBinarizer achieves the same result in a single step
    from sklearn.preprocessing import LabelBinarizer
    encoder=LabelBinarizer()
    housing_cat_1hot=encoder.fit_transform(house_cat)
    housing_cat_1hot
    
    array([[0, 0, 0, 1, 0],
           [0, 0, 0, 1, 0],
           [0, 0, 0, 1, 0],
           ...,
           [0, 1, 0, 0, 0],
           [0, 1, 0, 0, 0],
           [0, 1, 0, 0, 0]])
    
    # applying pandas.get_dummies() directly to the original data is the simplest approach
    pd.get_dummies(housing[['ocean_proximity']]).head()
    
    ocean_proximity_<1H OCEAN ocean_proximity_INLAND ocean_proximity_ISLAND ocean_proximity_NEAR BAY ocean_proximity_NEAR OCEAN
    0 0 0 0 1 0
    1 0 0 0 1 0
    2 0 0 0 1 0
    3 0 0 0 1 0
    4 0 0 0 1 0
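
    One caveat with get_dummies: if the training and test sets are encoded separately and a category is missing from one of them, the dummy columns no longer match. Reindexing against the training columns is one way to guard against that; a minimal sketch with hypothetical train_df / test_df frames:

    # hypothetical train_df / test_df: keep the test dummies aligned with the training dummy columns
    train_dummies = pd.get_dummies(train_df[['ocean_proximity']])
    test_dummies = pd.get_dummies(test_df[['ocean_proximity']])
    test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0)
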
    # Feature scaling: the two scalers we use most often are MinMaxScaler and StandardScaler
    # features with very different ranges are usually rescaled, which helps optimization algorithms converge faster (especially gradient-descent-based ones)
    # min-max normalization: subtract the minimum, then divide by the range (max minus min)
    # standardization: subtract the mean, then divide by the standard deviation, giving zero mean and unit variance; it is much less affected by outliers; decision trees and random forests do not need feature scaling
    # scalers are fit on the training set with fit_transform and applied to the test set with transform only (a sketch follows)
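
    A minimal sketch of that fit-on-train / transform-on-test pattern; X_train and X_test are placeholders for the split numeric data:

    from sklearn.preprocessing import MinMaxScaler, StandardScaler

    # X_train / X_test are placeholders here, standing in for the split numeric data
    scaler = StandardScaler()                        # or MinMaxScaler() for [0, 1] scaling
    X_train_scaled = scaler.fit_transform(X_train)   # learn mean/std from the training set only
    X_test_scaled = scaler.transform(X_test)         # reuse those statistics, never refit on the test set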
    
    # from splitting the dataset → building the pipeline
    from sklearn.model_selection import  train_test_split
    housing=load_housing_data()
    # train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)  #  random sampling
    from sklearn.model_selection import StratifiedShuffleSplit  #  stratified sampling
    
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
    housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
    
    for train_index, test_index in split.split(housing, housing["income_cat"]): # stratified sampling on the income category
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]
    housing = strat_train_set.copy()  # work on a copy so the training set itself is not damaged
    
    housing.info()
    
    <class 'pandas.core.frame.DataFrame'>
    Int64Index: 16512 entries, 17606 to 15775
    Data columns (total 11 columns):
    longitude             16512 non-null float64
    latitude              16512 non-null float64
    housing_median_age    16512 non-null float64
    total_rooms           16512 non-null float64
    total_bedrooms        16354 non-null float64
    population            16512 non-null float64
    households            16512 non-null float64
    median_income         16512 non-null float64
    median_house_value    16512 non-null float64
    ocean_proximity       16512 non-null object
    income_cat            16512 non-null float64
    dtypes: float64(10), object(1)
    memory usage: 1.5+ MB
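
    To see what the stratified split buys us, the income_cat proportions of the test set can be compared with those of the training set; a minimal sketch:

    # the proportions should be almost identical, which is the point of stratified sampling
    print(strat_test_set['income_cat'].value_counts() / len(strat_test_set))
    print(housing['income_cat'].value_counts() / len(housing))   # housing is the strat_train_set copy here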
    
    # the transformation pipeline
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    num_pipeline=Pipeline([('imputer',Imputer(strategy='median')),('std_scaler',StandardScaler())])
    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()
    housing_num=housing.drop('ocean_proximity',axis=1)
    housing_num_tr = num_pipeline.fit_transform(housing_num)
    housing_cat=housing['ocean_proximity']
    housing_cat_tr= LabelBinarizer().fit_transform(housing_cat)
    housing_train=np.c_[housing_num_tr,housing_cat_tr]
    housing_train.shape
    #  numeric and categorical features cannot go through a single transformer, so they have to be combined with a FeatureUnion (a sketch follows the shape output below) or the ColumnTransformer used in the next cell
    # you give it a list of transformers (it can contain all of them); when its transform() method is called, each transformer's transform() is run in parallel,
    # it waits for the outputs, concatenates them, and returns the result
    # alternatively, transform the two parts separately and concatenate them with numpy, which is essentially the same; the test set then also has to be transformed and concatenated in the same way
    
    (16512, 14)
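
    For reference, the FeatureUnion variant described in the comments could look roughly like the sketch below. DataFrameSelector is a small helper that is not defined anywhere in this post, and the OneHotEncoder is assumed to be a string-capable one (the future_encoders version imported in the next cell, or scikit-learn >= 0.20):

    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.pipeline import FeatureUnion

    class DataFrameSelector(BaseEstimator, TransformerMixin):
        """Small helper (not in the original post): select DataFrame columns as a numpy array."""
        def __init__(self, attribute_names):
            self.attribute_names = attribute_names
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            return X[self.attribute_names].values

    num_branch = Pipeline([('selector', DataFrameSelector(list(housing_num))),
                           ('imputer', Imputer(strategy='median')),
                           ('std_scaler', StandardScaler())])
    cat_branch = Pipeline([('selector', DataFrameSelector(['ocean_proximity'])),
                           ('one_hot', OneHotEncoder())])   # assumes a string-capable OneHotEncoder

    full_union = FeatureUnion(transformer_list=[('num', num_branch), ('cat', cat_branch)])
    housing_prepared_union = full_union.fit_transform(housing)   # same (16512, 14) layout as housing_train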
    
    import os
    import sys
    sys.path.append(os.getcwd())
    # future_encoders.py ships with the book's notebook repository (ageron/handson-ml) and backports ColumnTransformer and the string-capable OneHotEncoder for older scikit-learn versions
    from future_encoders import ColumnTransformer
    from future_encoders import OneHotEncoder
    
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    
    full_pipeline = ColumnTransformer([
            ("num", num_pipeline, num_attribs),
            ("cat", OneHotEncoder(), cat_attribs),
        ])
    
    housing_prepared = full_pipeline.fit_transform(housing)
    housing_prepared
    
    array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
             1.        ,  0.        ],
           [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
             1.        ,  0.        ],
           [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
             1.        ,  1.        ],
           ...,
           [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
             1.        ,  0.        ],
           [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
             1.        ,  0.        ],
           [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
             1.        ,  0.        ]])
    
    np.allclose(housing_prepared, housing_train)
    
    True
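
    The held-out test set goes through the exact same pipeline, but with transform() only, never fit, exactly as noted earlier; a minimal sketch:

    # prepare the test set with the already-fitted full_pipeline
    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()
    X_test_prepared = full_pipeline.transform(X_test)   # reuse the medians, scales and categories learned on the training set
    X_test_prepared.shape                               # (4128, 14)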
    

    The rest of the content is on GitHub; this post was getting too long, so only the data-preprocessing part is collected here, and the subsequent model-implementation part is organized in the GitHub repo.
