zoukankan      html  css  js  c++  java
  • Pandas数据的去重,替换和离散化,异常值的检测

    数据转换

    移除重复数据
    import pandas as pd
    import numpy as np
    from pandas import Series
    
    data = pd.DataFrame(
        {'k1':['one']*3+['two']*4,
         'k2':[1,1,2,3,3,4,4]})
    data
    
         k1	 k2
    0	one	 1
    1	one	 1
    2	one	 2
    3	two	 3
    4	two	 3
    5	two	 4
    6	two	 4
    
    duplicated方法返回一个布尔型Series,表示各行是否是重复行
    data.duplicated()
    
    0    False
    1     True
    2    False
    3    False
    4     True
    5    False
    6     True
    dtype: bool
    
    drop_duplicates方法,返回去重后的DataFrame
    data.drop_duplicates()
    
         k1	k2
    0	one	 1
    2	one	 2
    3	two	 3
    5	two	 4
    
    可以指定列进行重复项判断
    data['v1'] = range(7)
    data.drop_duplicates(['k1'])
    
        k1	k2	v1
    0	one	1	0
    3	two	3	3
    
    duplicated和drop_duplicates保留的都是第一个出现的值,传入keep='last'则保留最后一个
    
    data
    
        k1	k2	v1
    0	one	1	0
    1	one	1	1
    2	one	2	2
    3	two	3	3
    4	two	3	4
    5	two	4	5
    6	two	4	6
    
    # 默认保留相同选项的是第一个
    data.drop_duplicates(['k1','k2'])
    
        k1	k2	v1
    0	one	1	0
    2	one	2	2
    3	two	3	3
    5	two	4	5
    
    # 保留相同选项里的最后一个
    data.drop_duplicates(['k1','k2'],keep='last')
    
        k1	k2	v1
    1	one	1	1
    2	one	2	2
    4	two	3	4
    6	two	4	6
    

    利用函数或映射进行数据转换

    Series的map方法可以接收一个函数或含有映射关系的字典型对象
    data1 = pd.DataFrame({
        'food':['bacon','pulled pork','bacon','Pastrami',
               'corned beef','Bacon','pastrami','honey ham',
                'nova lox'],
        'ounces':[4,3,12,6,7.5,8,3,5,6]
    })
    
    data1
    
         food	    ounces
    0	bacon	    4.0
    1	pulled pork	3.0
    2	bacon	    12.0
    3	Pastrami	6.0
    4	corned beef	7.5
    5	Bacon	    8.0
    6	pastrami	3.0
    7	honey ham	5.0
    8	nova lox	6.0
    
    # 添加肉类来源这一列
    meat_to_animal = {
        'bacon':'pig',
        'pulled pork':'pig',
        'pastrami':'cow',
        'corned beef':'cow',
        'honey ham':'pig',
        'nova lox':'salmon'
    }
    
    # 根据键映射对应的来源,str.lower的原因是肉类里面的键全是小写,但是food里的键有的是大写,想要映射需要一一对应
    data1['animal'] = data1['food'].map(str.lower).map(meat_to_animal)
    data1
    
        food	   ounces  animal
    0	bacon	    4.0	    pig
    1	pulled pork	3.0	    pig
    2	bacon	    12.0	pig
    3	Pastrami	6.0	    cow
    4	corned beef	7.5	    cow
    5	Bacon	    8.0	    pig
    6	pastrami	3.0	    cow
    7	honey ham	5.0	    pig
    8	nova lox	6.0	    salmon
    

    lambda函数关系映射

    
    # 映射出字典的值是谁
    data1['food'].map(lambda x:meat_to_animal[x.lower()])
    
    0       pig
    1       pig
    2       pig
    3       cow
    4       cow
    5       pig
    6       cow
    7       pig
    8    salmon
    Name: food, dtype: object
    
    替换值
    data2 = Series([1.,-999.,2.,-999.,-1000.,3.])
    data2
    
    0       1.0
    1    -999.0
    2       2.0
    3    -999.0
    4   -1000.0
    5       3.0
    dtype: float64
    
    data2.replace(-999,np.nan)
    
    0       1.0
    1       NaN
    2       2.0
    3       NaN
    4   -1000.0
    5       3.0
    dtype: float64
    
    替换多个值
    
    data2.replace([-999,-1000],np.nan)
    
    0    1.0
    1    NaN
    2    2.0
    3    NaN
    4    NaN
    5    3.0
    dtype: float64
    
    对不同的值进行不同的替换,传入一个由替换值组成的列表(与待替换值的列表一一对应)
    data2.replace([-999,-1000],[np.nan,0])
    
    0    1.0
    1    NaN
    2    2.0
    3    NaN
    4    0.0
    5    3.0
    dtype: float64
    
    传入的参数是字典映射
    data2.replace({-999:np.nan,-1000:0})
    
    0    1.0
    1    NaN
    2    2.0
    3    NaN
    4    0.0
    5    3.0
    dtype: float64
    
    

    重命名轴索引

    data3 = pd.DataFrame(np.arange(12).reshape(3,4),index=['Ohio','Colorado','New York'],columns=['one','two','three','four'])
    data3
    
               one	two	 three	four
    Ohio	    0	  1	   2	3
    Colorado	4	  5	   6	7
    New York	8	  9	   10	11
    # 这样会直接修改原始数据
    data3.index = data3.index.map(str.upper)
    data3
    
    # 如果想要创建数据集的转换版,也就是副本
    data3.rename(index=str.title,columns=str.upper)
    
               ONE	TWO	THREE FOUR
    Ohio	    0	 1	 2	   3
    Colorado	4	 5	 6	   7
    New York	8	 9	 10	   11
    
    # rename可以结合字典型对象一对一进行轴标签的更新(注意:此时索引已被改为大写,所以键要写'OHIO')
    data3.rename(index={'OHIO':'New_Ohio'},columns={'three':'peekaboo'})
    
               one	 two	 peekaboo	four
    New_Ohio	0	  1	       2	    3
    COLORADO	4	  5	       6	    7
    NEW YORK	8	  9	       10	    11
    
    # rename自带复制功能,如果希望就地修改,传入inplace=True即可
    data3.rename(index={'OHIO':'inplace'},inplace=True)
    data3
    
               one	   two	   three	four
    inplace	    0	    1	    2	     3
    COLORADO	4	    5	    6	     7
    NEW YORK	8	    9	    10	     11
    

    离散化和面元划分

    cut和qcut对分位数和分组分析非常重要
    # 将下面这些数据划分为18-25,26-35,36-60,60以上
    args = [20,22,25,27,21,23,37,31,61,45,41,32]
    bins = [18,25,35,60,100]
    cats = pd.cut(args,bins)
    # 左开右闭
    cats
    
    [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
    Length: 12
    Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
    
    # 统计落在各区间内的数据个数;区间的记号跟数学上'区间'的符号一样,圆括号表示开端,方括号表示闭端,哪一端闭合可以通过right=False进行修改
    pd.value_counts(cats)
    
    (18, 25]     5
    (35, 60]     3
    (25, 35]     3
    (60, 100]    1
    dtype: int64
    
    # 左闭右开
    cut1 = pd.cut(args,bins,right=False)
    cut1
    
    [[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
    Length: 12
    Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]
    
    pd.value_counts(cut1)
    
    [25, 35)     4
    [18, 25)     4
    [35, 60)     3
    [60, 100)    1
    dtype: int64
    
    # 不想干巴巴用那么丑的区间名做索引,指定区间的名称
    group_names=['Youth','YoungAdult','MiddleAged','Senior']
    cut2 = pd.cut(args,bins,labels=group_names)
    cut2
    
    [Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
    Length: 12
    Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
    
    pd.value_counts(cut2)
    
    Youth         5
    MiddleAged    3
    YoungAdult    3
    Senior        1
    dtype: int64
    
    自动划分区间
    data = np.random.rand(20)
    # precision保留的是小数点的位数
    pd.cut(data,4,precision=2)
    
    [(0.27, 0.5], (0.031, 0.27], (0.74, 0.97], (0.031, 0.27], (0.27, 0.5], ..., (0.74, 0.97], (0.031, 0.27], (0.27, 0.5], (0.74, 0.97], (0.5, 0.74]]
    Length: 20
    Categories (4, interval[float64]): [(0.031, 0.27] < (0.27, 0.5] < (0.5, 0.74] < (0.74, 0.97]]
    
    # qcut是一个类似于cut的函数,它可以根据样本分位数对数据进行划分。
    # 和cut不同的是,cut无法保证各个面元中含有相同数量的数据点,而qcut可以
    data4 = np.random.randn(1000)
    cat4 = pd.qcut(data4, 4)
    cat4
    
    [(-3.114, -0.713], (-0.713, -0.0478], (-0.0478, 0.618], (-0.713, -0.0478], (0.618, 2.917], ..., (-0.0478, 0.618], (-0.0478, 0.618], (-3.114, -0.713], (-0.0478, 0.618], (0.618, 2.917]]
    Length: 1000
    Categories (4, interval[float64]): [(-3.114, -0.713] < (-0.713, -0.0478] < (-0.0478, 0.618] < (0.618, 2.917]]
    
    # 各个区间内的数据数量是相等的
    pd.value_counts(cat4)
    
    (0.618, 2.917]       250
    (-0.0478, 0.618]     250
    (-0.713, -0.0478]    250
    (-3.114, -0.713]     250
    dtype: int64
    
    # 跟cut一样,也可以设置自定义的分位数
    pd.qcut(data4,[0,0.1,0.5,0.9,1.])
    
    [(-3.114, -1.263], (-1.263, -0.0478], (-0.0478, 1.247], (-1.263, -0.0478], (1.247, 2.917], ..., (-0.0478, 1.247], (-0.0478, 1.247], (-1.263, -0.0478], (-0.0478, 1.247], (-0.0478, 1.247]]
    Length: 1000
    Categories (4, interval[float64]): [(-3.114, -1.263] < (-1.263, -0.0478] < (-0.0478, 1.247] < (1.247, 2.917]]
    

    检测和过滤异常值

    # seed用来设定随机数生成器的起始状态,设定之后,每次运行生成的随机数序列都相同,否则每次运行结果都会变化
    # seed的取值可以任意选择,但不同的seed会得到不同的随机数序列,相同的seed才能复现同样的结果
    np.random.seed(0)
    data5 = pd.DataFrame(np.random.randn(1000,4))
    data5.describe()
    
                0	        1	         2	       3
    count	1000.000000	1000.000000	1000.000000	1000.000000
    mean	-0.062966	-0.002087	-0.025777	-0.010981
    std	     0.983517	 0.967146	 0.983671	 0.993891
    min	    -3.740101	-3.046143	-3.116857	-3.392300
    25%	    -0.755720	-0.683680	-0.684833	-0.686776
    50%	    -0.029995	-0.023210	-0.025068	-0.038192
    75%	     0.604792	 0.652095	 0.624139	 0.648778
    max	     2.929096	 2.662727	 3.801660	 3.427539
    
    # 假设你想要找出某列中绝对值大小超过3的值
    col = data5[3]
    col[np.abs(col)>3]
    
    861    3.427539
    919   -3.392300
    Name: 3, dtype: float64
    
    # 选出全部含有超过3或-3的值的行,你可以利用布尔型DataFrame以及any方法
    data5[(np.abs(data5)>3)] # 找出绝对值大于3的值,不满足的为NaN
    data5[(np.abs(data5)>3).any(axis=1)] # 找出绝对值大于3的行
    
    0	1	2	3
    147	 0.823681	-2.929552	 1.721550	 1.039882
    263	-1.326474	 0.873638	-1.556238	-1.072714
    504	 0.991843	-1.198124	-0.060144	-1.802440
    770	 1.251980	 0.801589	 0.644481	 1.106683
    779	 1.014776	 0.088887	 0.785261	 0.849345
    861	-0.736245	-1.258751	 1.385519	 0.509164
    865	-0.034140	-0.825346	-0.921655	-0.023471
    919	-0.580442	-0.347443	 0.309293	-1.018100
    938	-0.327996	 0.703850	 0.227200	 2.131917
    
    # 将值限制在区间-3到3以内,np.sign代表的是数组的正负,正值为1,负值为-1,0为0
    data5[np.abs(data5)>3] = np.sign(data5)*3
    # 注意:截断之后,各列的min不会小于-3,max不会大于3;下面列出的表格与截断前的输出完全相同,并未反映截断后的结果
    data5.describe()
    
                0	         1	         2	       3
    count	1000.000000	1000.000000	1000.000000	1000.000000
    mean	-0.062966	-0.002087	-0.025777	-0.010981
    std	     0.983517	 0.967146	 0.983671	 0.993891
    min	    -3.740101	-3.046143	-3.116857	-3.392300
    25%	    -0.755720	-0.683680	-0.684833	-0.686776
    50%	    -0.029995	-0.023210	-0.025068	-0.038192
    75%	     0.604792	 0.652095	 0.624139	 0.648778
    max	     2.929096	 2.662727	 3.801660	 3.427539
    
  • 相关阅读:
    win10系统下office 2019激活
    如何根据【抖音分享链接】去掉抖音水印
    Java多线程学习之ThreadLocal源码分析
    Java多线程学习之synchronized总结
    Java多线程学习之线程的取消与中断机制
    Java多线程学习之Lock与ReentranLock详解
    Java多线程学习之线程池源码详解
    MyBatis 一、二级缓存和自定义缓存
    Spring 高级依赖注入方式
    Spring 依赖注入的方式
  • 原文地址:https://www.cnblogs.com/lishi-jie/p/10059234.html
Copyright © 2011-2022 走看看