  • pandas: A Compendium of Common Operations


    Preface

    In pandas, get a grip on the Series (an ordered, dict-like object) and the DataFrame (rows plus multiple Series), just as in numpy you get a grip on the ndarray multidimensional array.
    But our attention is limited and few of us have a photographic memory, so memorize the APIs and their common parameters and look the rest up when needed.
    

    Two helper functions that the examples below may use

    import numpy as np
    from pandas import DataFrame

    def pretty_print(text='',star_cnt=20):
        stars = '*'*star_cnt
        print('{stars} {text} {stars}'.format(text=text,stars=stars))

    def generate_df(rows, cols):
        data = np.arange(rows*cols).reshape((rows,cols))
        columns = ['col_'+str(i) for i in range(cols)]
        indices = ['row_'+str(j) for j in range(rows)]
        return DataFrame(data,columns=columns,index=indices)
    
    generate_df(3,4)
    
    

    Modifying data in a DataFrame

    from pandas import DataFrame
    import numpy as np
    import pandas as pd
    import logging
    logging.basicConfig(level=logging.DEBUG)
    
    def pretty_print(text='',star_cnt=20):
        stars = '*'*star_cnt
        print('{stars}{text}{stars}'.format(text=text,stars=stars))
    
    data = {'open':[8.08, 7.93, 7.97, 8.00],
            'close':[7.93,8.05,7.97,8.05],
            'high':[8.10,8.12,8.00,8.09],
            'low':[7.88,7.92,7.91,8.00]}
    
    df = DataFrame(data,index=pd.date_range('02/01/2016','02/04/2016'))
    print(df.head(10))
    pretty_print('华丽的分隔符')
    df[~df.isin([7.93])] = 0 # set every value in df that is not 7.93 to 0
    print(df)
    """
                open  close  high   low
    2016-02-01  8.08   7.93  8.10  7.88
    2016-02-02  7.93   8.05  8.12  7.92
    2016-02-03  7.97   7.97  8.00  7.91
    2016-02-04  8.00   8.05  8.09  8.00
    ********************华丽的分隔符********************
                open  close  high  low
    2016-02-01  0.00   7.93   0.0  0.0
    2016-02-02  7.93   0.00   0.0  0.0
    2016-02-03  0.00   0.00   0.0  0.0
    2016-02-04  0.00   0.00   0.0  0.0
    """
    
    """
    df[]  这样返回的都是 DataFrame , df.ix ,df.loc , df.iloc 这类返回的都是 Series
    """
    
    

    apply + map == applymap

    from pandas import DataFrame
    import numpy as np
    import pandas as pd
    import logging
    logging.basicConfig(level=logging.DEBUG)
    from functools import reduce
    
    def pretty_print(text='',star_cnt=20):
        stars = '*'*star_cnt
        print('{stars}{text}{stars}'.format(text=text,stars=stars))
    
    data = {'open':[8.08, 7.93, 7.97, 8.00],
            'close':[7.93,8.05,7.97,8.05],
            'high':[8.10,8.12,8.00,8.09],
            'low':[7.88,7.92,7.91,8.00]}
    
    df = DataFrame(data,index=pd.date_range('02/01/2016','02/04/2016'))
    print(df.head(10))
    pretty_print('华丽的分隔线')
    # Add a new column; the two approaches below do the same thing
    df['new'] = df.apply(lambda cols:reduce(lambda x,y:x+y,cols), axis=1)
    # note: 'new_2' is computed after 'new' was added, so it also sums the 'new' column (hence it is twice 'new' in the output below)
    df['new_2'] = df.apply(sum,axis=1)
    # Add a new row holding the column sums
    df.loc['new_row_idx'] = df.apply(lambda s:sum(s), axis=0)
    print(df.head(10))
    """
                open  close  high   low
    2016-02-01  8.08   7.93  8.10  7.88
    2016-02-02  7.93   8.05  8.12  7.92
    2016-02-03  7.97   7.97  8.00  7.91
    2016-02-04  8.00   8.05  8.09  8.00
    ********************华丽的分隔线********************
                open  close  high   low    new  new_2
    2016-02-01  8.08   7.93  8.10  7.88  31.99  63.98
    2016-02-02  7.93   8.05  8.12  7.92  32.02  64.04
    2016-02-03  7.97   7.97  8.00  7.91  31.85  63.70
    2016-02-04  8.00   8.05  8.09  8.00  32.14  64.28
    """
    

    After a groupby, the grouping columns become the index keys

    from pandas import Series,DataFrame
    a=[['Li','男','PE',98.],['Li','男','MATH',60.],['liu','男','MATH',60.],['yu','男','PE',100.]]
    
    af=DataFrame(a,columns=['name','sex','course','score'])
    print(af.head(10))
    print('*'*50)
    print(af.groupby(['name','course'])['score'].sum())
    print('*'*50)
    print(af.groupby(['name','course'])['score'].sum()['Li'])
    """
    name sex course  score
    0   Li   男     PE   98.0
    1   Li   男   MATH   60.0
    2  liu   男   MATH   60.0
    3   yu   男     PE  100.0
    **************************************************
    name  course
    Li    MATH       60.0
          PE         98.0
    liu   MATH       60.0
    yu    PE        100.0
    Name: score, dtype: float64
    **************************************************
    course
    MATH    60.0
    PE      98.0
    Name: score, dtype: float64
    """
    

    Neat tricks

    # -*- coding: utf-8 -*-
    __author__ = 'Frank Li'
    from pandas import Series,DataFrame
    import pandas as pd
    import pdb
    addr = pd.Series([
     'Washington, D.C. 20003',
     'Brooklyn, NY 11211-1755',
     'Omaha, NE 68154',
     'Pittsburgh, PA 15211' ])
    # Series._accessors exposes three accessor objects: str, cat and dt
    addr.str.upper() # upper-case the strings
    print(addr.str.count(r'\d')) # count how many digits each cell contains

    regex = (r'(?P<city>[A-Za-z ]+), '      # one or more letters
        r'(?P<state>[A-Z]{2}) '        # two capital letters
       r'(?P<zip>\d{5}(?:-\d{4})?)')  # an optional 4-digit extension
    print(addr.str.replace('.','',regex=False).str.extract(regex))  # regex=False so the literal dots are removed
    
    print([i for i in dir(pd.Series.str) if not i.startswith('_')])
    """
    0    5
    1    9
    2    5
    3    5
    dtype: int64
             city state         zip
    0  Washington    DC       20003
    1    Brooklyn    NY  11211-1755
    2       Omaha    NE       68154
    3  Pittsburgh    PA       15211
    ['capitalize', 'cat', 'center', 'contains', 'count', 'decode', 'encode', 'endswith', 'extract', 'extractall', 'find', 'findall', 'get', 'get_dummies', 'index', 'isalnum', 'isalpha', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isspace', 'istitle', 'isupper', 'join', 'len', 'ljust', 'lower', 'lstrip', 'match', 'normalize', 'pad', 'partition', 'repeat', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'slice', 'slice_replace', 'split', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'wrap', 'zfill']
    """
    
    daterng = pd.Series(pd.date_range('2017', periods=9, freq='Q'))
    print(daterng)
    print(daterng.dt.day_name())
    # look at dates in the second half of the year (quarters 3 and 4)
    print(daterng[daterng.dt.quarter > 2])
    print(daterng[daterng.dt.is_year_end])
    """
    Series.dt.day_name(): the weekday name of each date

    Series.dt.quarter: the quarter each date falls in

    Series.dt.is_year_end: whether each date is the last day of its year
    """
    
    colors = pd.Series([
    'periwinkle',
    'mint green',
    'burnt orange',
    'periwinkle',
    'burnt orange',
    'rose',
    'rose',
    'mint green',
    'rose',
    'navy'])
    import sys
    print(colors.apply(sys.getsizeof))
    mapper = {v: k for k, v in enumerate(colors.unique())}
    as_int = colors.map(mapper)
    print(as_int)
    print(as_int.apply(sys.getsizeof))
    # the category dtype saves memory
    primary_usage = colors.memory_usage(index=False, deep=True)
    category_usage = colors.astype('category').memory_usage(index=False, deep=True)
    print('primary: {}\ncategory_usage: {}'.format(primary_usage,category_usage))
    """
    primary: 370
    category_usage: 291
    That does not look like a dramatic difference yet,
    but repeat the data many times and compare again
    """
    manycolors = colors.repeat(10)
    print(len(manycolors) / manycolors.nunique())
    
    print(manycolors.memory_usage(index=False, deep=True))
    # pdb.set_trace()
    print(manycolors.astype('category').memory_usage(index=False, deep=True))
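
    The str and dt accessors are exercised above; for completeness, here is a hedged sketch of the third accessor, cat, on the colors Series (its integer codes are conceptually the same mapping that was built by hand with mapper):

    cat_colors = colors.astype('category')
    print(cat_colors.cat.categories)  # the distinct labels
    print(cat_colors.cat.codes)       # integer code per row, like the manual mapper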
    
    
    

    Data Cleaning and Preparation

    # Handling missing data
    API 
    dropna
    fillna
    isnull
    notnull
    
    # Filtering out missing values
    data.dropna()   is equivalent to data[data.notnull()]
    
    For a DataFrame there are more options: axis=0 or 1, how='all' (drop a row/column only when every value in it is NaN), and thresh=2 (keep only rows/columns that have at least 2 non-NA values)
    df.dropna(axis=1,how='all')
    df.fillna({1: 0.5, 2: 0}) fills columns 1 and 2 with different defaults; a value dict and method cannot be combined in one call, so forward filling is a separate call such as df.fillna(method='ffill', limit=2, inplace=True)
    fillna's parameters are:
    value    scalar or dict-like object used to fill missing values
    method   interpolation method such as 'ffill' or 'bfill' (either value or method must be given)
    axis     the axis to fill on, default axis=0
    inplace  modify the calling object in place instead of returning a new one
    limit    for forward or backward filling, the maximum number of consecutive values to fill
    (a short runnable sketch of these options follows this list)
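
    A minimal, hedged sketch of the options above; the demo frame and its NaN placement are made up for illustration:

    from pandas import DataFrame
    import numpy as np

    demo = DataFrame({0: [1.0, np.nan, 7.0],
                      1: [np.nan, np.nan, 3.0],
                      2: [np.nan, np.nan, np.nan]})
    print(demo.dropna(axis=1, how='all'))        # drops column 2, which is all NaN
    print(demo.dropna(thresh=2))                 # keeps only rows with at least 2 non-NA values
    print(demo.fillna({0: 0.5, 1: 0}))           # a different default per column
    print(demo.fillna(method='ffill', limit=1))  # forward fill at most 1 consecutive NA (newer pandas prefers demo.ffill(limit=1))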
    
    ### Finding and removing duplicates (a small sketch follows)
    data.duplicated()
    data.drop_duplicates(['col1','col2'], keep='first' or 'last')
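
    A hedged sketch of both calls on a tiny made-up frame:

    from pandas import DataFrame
    dup = DataFrame({'col1': ['a', 'a', 'b'], 'col2': [1, 1, 2], 'val': [10, 20, 30]})
    print(dup.duplicated(['col1', 'col2']))                    # False, True, False
    print(dup.drop_duplicates(['col1', 'col2'], keep='last'))  # keeps the second 'a' row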
    
    ### Transforming data with a function or mapping
    data = {'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham','nova lox'],
            'ounces':[4,5,12,6,7.5,8,3,5,6]}
    df = DataFrame(data)
    print(df.head(10))
    print('*'*50)
    meat_to_animal = {'bacon':'pig','pulled pork':'pig','pastrami':'cow','corned beef':'cow','honey ham':'pig','nova lox':'salmon'}
    
    df['animal'] = df['food'].map(lambda x:meat_to_animal.get(x.lower(),'unknown'))
    print(df.head(10))
    
    ### Replacing values
    data.replace(-999, np.nan)
    
    ### Renaming axis indexes
    data.index.map(lambda x: x[:4].upper())
    data.rename(index=str.title, columns=str.upper)
    data.rename(index={'old_idx':'new_idx'}, columns={'old_col':'new_col'}, inplace=True)
    
    
    ### Discretization and binning: cut , qcut
    pd.cut(ages, bins)
    
    from pandas import Series,DataFrame
    import pandas as pd
    
    ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
    bins = [18,35,50,70,90,120]
    cats = pd.cut(ages, bins)
    print(cats.codes)
    
    df = DataFrame({'ages':ages})
    df['ages_dicretes'] = pd.cut(ages, bins,right=False).codes
    print(df.head(10))
    
    
    from pandas import Series,DataFrame
    import pandas as pd
    import numpy as np
    data = DataFrame(np.random.randn(1000,4))
    data[np.abs(data)>3] = np.sign(data) * 3 # cap the values to the range -3 to +3
    print(data.head(10))
    
    # Permutation and random sampling
    numpy.random.permutation
    df.sample , series.sample  (a sample() sketch follows the take() example below)
    
    df = DataFrame(np.arange(20).reshape((5,4)))
    print(df)
    print('*'*50)
    sampler = np.random.permutation(5)
    print(df.take(sampler))
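
    df.sample is not exercised above, so here is a hedged sketch on the same df; the values of n and replace are arbitrary:

    print(df.sample(n=3))                 # 3 random rows without replacement
    print(df.sample(n=10, replace=True))  # with replacement, rows may repeat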
    
    
    # Using cut and get_dummies together
    from pandas import Series,DataFrame
    import pandas as pd
    import numpy as np
    
    def pretty_print(text='',star_cnt=20):
        stars = '*'*star_cnt
        print('{stars} {text} {stars}'.format(text=text,stars=stars))
    
    np.random.seed(12345)
    values = np.random.rand(10)
    print(values)
    pretty_print('离散分箱')
    bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
    df = pd.get_dummies(pd.cut(values,bins))
    print(df.head(10))
    
    """
    [0.92961609 0.31637555 0.18391881 0.20456028 0.56772503 0.5955447
     0.96451452 0.6531771  0.74890664 0.65356987]
    ******************** 离散分箱 ********************
       (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
    0           0           0           0           0           1
    1           0           1           0           0           0
    2           1           0           0           0           0
    3           0           1           0           0           0
    4           0           0           1           0           0
    5           0           0           1           0           0
    6           0           0           0           0           1
    7           0           0           0           1           0
    8           0           0           0           1           0
    9           0           0           0           1           0
    """
    
    ### Vectorized string functions
    from pandas import Series,DataFrame
    import pandas as pd
    import numpy as np
    import re
    
    def pretty_print(text='',star_cnt=20):
        stars = '*'*star_cnt
        print('{stars} {text} {stars}'.format(text=text,stars=stars))
    
    data = {'col1':['001100110111'],
            'col2': ['001100110111'],
            'col3': ['001100110111']}
    df = DataFrame(data)
    print(df.head(10))
    pretty_print('华丽的分割线')
    df2 = df.apply(lambda s:s.str.extract(r'(?P<nums_1>\d{3})(?P<nums_2>\d{3})(?P<nums_3>\d{3})(?P<nums_4>\d{3})') ,axis=1)
    print(df2.values)
    """
               col1          col2          col3
    0  001100110111  001100110111  001100110111
    ******************** 华丽的分割线 ********************
    [     nums_1 nums_2 nums_3 nums_4
    col1    001    100    110    111
    col2    001    100    110    111
    col3    001    100    110    111]
    """
    
    # Partial list of vectorized string methods (a short usage sketch follows this list)
    cat          concatenate strings element-wise with an optional delimiter
    contains     return a boolean array: does each string contain the pattern / regex
    count        count occurrences of the pattern in each string
    extract      use a regex with groups to extract one or more strings from a string Series; the result is a DataFrame with one column per group
    endswith     equivalent to x.endswith(pattern) for each element
    startswith   equivalent to x.startswith(pattern) for each element
    findall      compute the list of all occurrences of the pattern / regex for each string
    get          index into each element (retrieve the i-th element)
    isalnum
    isalpha
    isdecimal
    isdigit
    islower
    isnumeric
    isupper
    join
    len
    lower, upper
    match
    pad
    center
    repeat
    replace
    slice

    split
    strip
    rstrip
    lstrip
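
    A hedged sketch exercising a few of the methods above on a small made-up Series:

    import pandas as pd
    s = pd.Series(['alpha,beta', 'gamma', 'beta,delta'])
    print(s.str.contains('beta'))       # True, False, True
    print(s.str.split(',').str.get(0))  # first token of each element
    print(s.str.get_dummies(sep=','))   # one indicator column per distinct token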
    

    Chapter 8: Data Wrangling: Join, Combine, and Reshape

    # Hierarchical (partial) indexing
    Hierarchical indexing lets you have multiple (two or more) index levels on a single axis; broadly speaking, it gives you a way to work with higher-dimensional data in a lower-dimensional form.
    
    # -*- coding: utf-8 -*-
    __author__ = 'Frank Li'
    from pandas import Series,DataFrame
    import pandas as pd
    import numpy as np
    import re
    
    def pretty_print(text='',star_cnt=20):
        stars = '*'*star_cnt
        print('{stars} {text} {stars}'.format(text=text,stars=stars))
    
    
    data = pd.Series(np.random.randn(9),
                     index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                            [1, 2, 3, 1, 3, 1, 2, 2, 3]])
    print(data.head(10))
    pretty_print('华丽的分割线')
    print(data['b'])
    pretty_print('华丽的分割线')
    print(data.unstack())
    pretty_print('华丽的分割线')
    print(data.unstack().stack())  
    
    # Reordering and sorting levels
    swaplevel   sort_index
    # Summary statistics by level

    # Indexing with a DataFrame's columns

    set_index()   promotes columns to a hierarchical index
    reset_index()  is the inverse: the index levels are moved back into columns
    (a short sketch of these calls follows)
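
    A hedged sketch of these level operations, continuing from the two-level data Series defined in the example above:

    frame = data.reset_index()                     # index levels become ordinary columns
    frame.columns = ['outer', 'inner', 'value']
    indexed = frame.set_index(['outer', 'inner'])  # columns -> hierarchical index
    print(indexed.swaplevel('outer', 'inner').sort_index())  # reorder the levels, then sort
    print(indexed.groupby(level='outer').sum())    # summary statistics by level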
    
    # Combining and merging datasets
    merge works much like a SQL table join
    from pandas import Series,DataFrame
    import pandas as pd
    import numpy as np
    import re
    
    def pretty_print(text='',star_cnt=20):
        stars = '*'*star_cnt
        print('{stars} {text} {stars}'.format(text=text,stars=stars))
    
    
    df1 = pd.DataFrame({'key1':['b','b','a','c','a','a','b'],
                        'data1':range(7)})
    df2 = pd.DataFrame({'key1':['a','b','d'],
                       'data2':range(3)})
    
    df3 = pd.merge(df1,df2,left_on='key1',right_on='key1',how='inner', suffixes=('_left','_right')) 
    # how can be 'inner', 'left', 'right' or 'outer'; to join on several columns pass on=['key1','key2']
    # when both sides share column names, suffixes renames the overlapping columns; to merge on the index pass left_index=True, right_index=True
    
    print(df3.head(10))
    
    # join merges on the index
    df4 = df1.join(df2,how='inner', lsuffix='_left', rsuffix='_right')
    pretty_print('华丽的分割线')
    print(df4.head(10))
    
    left.join([right1, right2,right3], how='outer')  # the default how is 'left'
    
    concat  is similar to SQL UNION ALL, though it can do more than that
    
    arr = np.arange(12).reshape((3,4))
    print(arr)
    pretty_print('华丽的分割线')
    result = np.concatenate([arr,arr],axis=1)
    result_2 = np.concatenate([arr,arr],axis=0)
    print(result)
    pretty_print('华丽的分割线')
    print(result_2)
    
    from pandas import Series,DataFrame
    import pandas as pd
    import numpy as np
    import re
    
    def pretty_print(text='',star_cnt=20):
        stars = '*'*star_cnt
        print('{stars} {text} {stars}'.format(text=text,stars=stars))
    
    def generate_df(rows, cols):
        data = np.arange(rows*cols).reshape((rows,cols))
        columns = ['col_'+str(i) for i in range(cols)]
        indices = ['row_'+str(j) for j in range(rows)]
        return DataFrame(data,columns=columns,index=indices)
    
    arr = generate_df(3,4)
    s1 = pd.Series([0, 1], index=['a', 'b'])
    s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
    s3 = pd.Series([5, 6], index=['f', 'g'])
    
    print(pd.concat([s1,s2,s3],axis=0,sort=False))
    pretty_print('华丽的分割线')
    print(pd.concat([s1,s2,s3],axis=1,sort=False))
    """
    a    0
    b    1
    c    2
    d    3
    e    4
    f    5
    g    6
    dtype: int64
    ******************** 华丽的分割线 ********************
         0    1    2
    a  0.0  NaN  NaN
    b  1.0  NaN  NaN
    c  NaN  2.0  NaN
    d  NaN  3.0  NaN
    e  NaN  4.0  NaN
    f  NaN  NaN  5.0
    g  NaN  NaN  6.0
    """
    
    s4 = pd.concat([s1,s3])
    print(pd.concat([s1,s4],axis=1,join='inner',sort=False))  # join = outer or inner
    
    
    from pandas import Series,DataFrame
    import pandas as pd
    import numpy as np
    import re
    
    def pretty_print(text='',star_cnt=20):
        stars = '*'*star_cnt
        print('{stars} {text} {stars}'.format(text=text,stars=stars))
    
    def generate_df(rows, cols):
        data = np.arange(rows*cols).reshape((rows,cols))
        columns = ['col_'+str(i) for i in range(cols)]
        indices = ['row_'+str(j) for j in range(rows)]
        return DataFrame(data,columns=columns,index=indices)
    
    arr = generate_df(3,4)
    s1 = pd.Series([0, 1], index=['a', 'b'])
    s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
    s3 = pd.Series([5, 6], index=['f', 'g'])
    
    s5 = pd.concat([s1,s2,s3],axis=0,keys=['one', 'two', 'three'],sort=False) # keys builds a hierarchical index so each input Series can be told apart
    print(s5)
    pretty_print('华丽的分割线')
    print(s5.unstack())  # unstack the outer level into a wide frame (this produces the second output shown below)
    """
    one    a    0
           b    1
    two    c    2
           d    3
           e    4
    three  f    5
           g    6
    dtype: int64
    ******************** 华丽的分割线 ********************
             a    b    c    d    e    f    g
    one    0.0  1.0  NaN  NaN  NaN  NaN  NaN
    two    NaN  NaN  2.0  3.0  4.0  NaN  NaN
    three  NaN  NaN  NaN  NaN  NaN  5.0  6.0
    
    """
    
    from pandas import Series,DataFrame
    import pandas as pd
    import numpy as np
    import re
    
    def pretty_print(text='',star_cnt=20):
        stars = '*'*star_cnt
        print('{stars} {text} {stars}'.format(text=text,stars=stars))
    
    def generate_df(rows, cols):
        data = np.arange(rows*cols).reshape((rows,cols))
        columns = ['col_'+str(i) for i in range(cols)]
        indices = ['row_'+str(j) for j in range(rows)]
        return DataFrame(data,columns=columns,index=indices)
    
    df1 = generate_df(3,2)
    df2 = generate_df(2,2).applymap(lambda x:x+5)
    print(df1)
    pretty_print('华丽的分割线')
    print(df2)
    pretty_print('华丽的分割线')
    print(pd.concat([df1,df2],axis=1,sort=False,keys=('lvl1','lvl2')))
    """
           col_0  col_1
    row_0      0      1
    row_1      2      3
    row_2      4      5
    ******************** 华丽的分割线 ********************
           col_0  col_1
    row_0      5      6
    row_1      7      8
    ******************** 华丽的分割线 ********************
           lvl1        lvl2      
          col_0 col_1 col_0 col_1
    row_0     0     1   5.0   6.0
    row_1     2     3   7.0   8.0
    row_2     4     5   NaN   NaN
    """
    
    

    Combining data with overlap

    from pandas import Series,DataFrame
    import pandas as pd
    import numpy as np
    import re
    
    def pretty_print(text='',star_cnt=20):
        stars = '*'*star_cnt
        print('{stars} {text} {stars}'.format(text=text,stars=stars))
    
    def generate_df(rows, cols):
        data = np.arange(rows*cols).reshape((rows,cols))
        columns = ['col_'+str(i) for i in range(cols)]
        indices = ['row_'+str(j) for j in range(rows)]
        return DataFrame(data,columns=columns,index=indices)
    
    a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
                  index=['f', 'e', 'd', 'c', 'b', 'a'])
    b = pd.Series(np.arange(len(a), dtype=np.float64),
                  index=['f', 'e', 'd', 'c', 'b', 'a'])
    print(a)
    pretty_print('华丽的分割线')
    print(b)
    pretty_print('华丽的分割线')
    print(np.where(pd.isnull(a),b,a))
    """
    f    NaN
    e    2.5
    d    NaN
    c    3.5
    b    4.5
    a    NaN
    dtype: float64
    ******************** 华丽的分割线 ********************
    f    0.0
    e    1.0
    d    2.0
    c    3.0
    b    4.0
    a    5.0
    dtype: float64
    ******************** 华丽的分割线 ********************
    [0.  2.5 2.  3.5 4.5 5. ]
    """
    print(b.combine_first(a))
    

    8.3 The crucial part: reshaping and pivoting

    stack
    unstack
    These are usually used together with hierarchical indexes.

    pivot  is equivalent to creating a hierarchical index with set_index and then calling unstack
    the inverse of pivot is pd.melt  (a hedged sketch of both follows)
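
    A hedged sketch of that equivalence and of melt as its inverse; the long-format frame here is made up for illustration:

    import pandas as pd
    long_df = pd.DataFrame({'date': ['d1', 'd1', 'd2', 'd2'],
                            'item': ['open', 'close', 'open', 'close'],
                            'value': [8.08, 7.93, 7.93, 8.05]})
    wide_a = long_df.pivot(index='date', columns='item', values='value')
    wide_b = long_df.set_index(['date', 'item'])['value'].unstack()
    print(wide_a.equals(wide_b))                          # True
    print(pd.melt(wide_a.reset_index(), id_vars='date'))  # back to long format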
    
    import pandas as pd
    import numpy as np
    
    """
    Find the rows where the absolute difference between the 2nd and 3rd columns equals the 1st-column value of the preceding row (implemented with shift(1) below)
    """
    
    data = np.zeros((20,3))
    df = pd.DataFrame(data, columns=['col_'+str(i) for i in range(3)], index=['row_'+str(i) for i in range(20)])
    df.iloc[:, 1:] = 1
    
    print(df.head(10))
    
    print('{stars} {text} {stars}'.format(stars='*'*20,text='华丽的分割线'))
    def func(row):
        # absolute difference between the row's 2nd and 3rd columns
        return abs(row['col_1'] - row['col_2'])

    df['col_3'] = df.apply(func, axis=1)
    # flag rows whose difference equals col_0 of the previous row
    df['col_4'] = np.where(df['col_3'] == df['col_0'].shift(1), True, False)
    
    df = df[df['col_4']]
    print(df.head(20))
    
    

    Turning a delimited flag string such as gene_1|gene_2 into indicator columns: gene_1 = 1, gene_2 = 1, gene_3 = 0, ...

    from pandas import Series,DataFrame
    import pandas as pd
    import numpy as np
    import re
    
    def pretty_print(text='',star_cnt=20):
        stars = '*'*star_cnt
        print('{stars} {text} {stars}'.format(text=text,stars=stars))
    
    def generate_df(rows, cols):
        data = np.arange(rows*cols).reshape((rows,cols))
        columns = ['col_'+str(i) for i in range(cols)]
        indices = ['row_'+str(j) for j in range(rows)]
        return DataFrame(data,columns=columns,index=indices)
    
    df = generate_df(3,6)
    print(df.head(10))
    df['col_1'] = 'col_2|col_4'
    df.loc[df.index[1:2],'col_1'] = 'col_3|col_5'  # .ix is removed in modern pandas; select the row labels explicitly instead
    
    cols = [col for col in df.columns if col!='col_0']
    
    def do_apply(row):
        # for every candidate column, set it to 1 on the row whose col_1 string lists it
        for col in cols:
            if col in row['col_1'].split('|'):
                tmp = row['col_0']
                df.loc[df['col_0'] == tmp,col] = 1
    
    df.apply(do_apply, axis=1)
    # df.apply(lambda row:do_apply(row) ,axis=1)
    print('df shape: {}'.format(df.shape))
    print('first 10 rows of df: ',df.head(10))
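
    The same result can be reached without the row-wise loop: Series.str.get_dummies splits on the delimiter and builds the indicator columns directly. A hedged sketch on the col_1 strings used above:

    flags = df['col_1'].str.get_dummies(sep='|')  # one 0/1 column per listed column name
    print(flags.head(10))
    # flags could then be joined back, e.g. pd.concat([df[['col_0']], flags], axis=1)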
    

    SQLpd

    Original post: https://www.cnblogs.com/Frank99/p/10972238.html