zoukankan      html  css  js  c++  java
  • Pandas VS SQL

    环境说明

    • MySQL版本:5.7.28
    • Pandas版本:1.0.1
    • Python版本:3.8.2

    数据准备

    MySQL:

    create table tab_name(
        idx int primary key,
        dt  datetime,
        msg varchar(50)
    );
    
    insert into tab_name(idx,dt,msg) values(1,str_to_date('2020-01-01 11:23:45','%Y-%m-%d %H:%i:%s'),'A1一');
    insert into tab_name(idx,dt,msg) values(2,str_to_date('2020-01-02 12:23:45','%Y-%m-%d %H:%i:%s'),'B2二');
    insert into tab_name(idx,dt,msg) values(3,str_to_date('2020-01-03 13:23:45','%Y-%m-%d %H:%i:%s'),'C3三');
    insert into tab_name(idx,dt,msg) values(4,str_to_date('2020-01-04 14:23:45','%Y-%m-%d %H:%i:%s'),'D4四');
    insert into tab_name(idx,dt,msg) values(5,str_to_date('2020-01-05 15:23:45','%Y-%m-%d %H:%i:%s'),'E5五');
    insert into tab_name(idx,dt,msg) values(6,str_to_date('2020-01-06 16:23:45','%Y-%m-%d %H:%i:%s'),'F6六');
    insert into tab_name(idx,dt,msg) values(7,str_to_date('2020-01-07 17:23:45','%Y-%m-%d %H:%i:%s'),'G7七');
    insert into tab_name(idx,dt,msg) values(8,str_to_date('2020-01-08 18:23:45','%Y-%m-%d %H:%i:%s'),'H8八');
    insert into tab_name(idx,dt,msg) values(9,str_to_date('2020-01-09 19:23:45','%Y-%m-%d %H:%i:%s'),'I9九');
    
    mysql> select * from tab_name;
    +-----+---------------------+------+
    | idx | dt                  | msg  |
    +-----+---------------------+------+
    |   1 | 2020-01-01 11:23:45 | A1一  |
    |   2 | 2020-01-02 12:23:45 | B2二  |
    |   3 | 2020-01-03 13:23:45 | C3三  |
    |   4 | 2020-01-04 14:23:45 | D4四  |
    |   5 | 2020-01-05 15:23:45 | E5五  |
    |   6 | 2020-01-06 16:23:45 | F6六  |
    |   7 | 2020-01-07 17:23:45 | G7七  |
    |   8 | 2020-01-08 18:23:45 | H8八  |
    |   9 | 2020-01-09 19:23:45 | I9九  |
    +-----+---------------------+------+
    9 rows in set (0.00 sec)
    

    Pandas:

    import pandas as pd
    from datetime import datetime
    
    df = pd.DataFrame({
        'idx': [1,2,3,4,5,6,7,8,9],
        'dt': [
            datetime(2020,1,1,11,23,45),
            datetime(2020,1,2,12,23,45),
            datetime(2020,1,3,13,23,45),
            datetime(2020,1,4,14,23,45),
            datetime(2020,1,5,15,23,45),
            datetime(2020,1,6,16,23,45),
            datetime(2020,1,7,17,23,45),
            datetime(2020,1,8,18,23,45),
            datetime(2020,1,9,19,23,45),
        ],
        'msg': ['A1一','B2二','C3三','D4四','E5五','F6六','G7七','H8八','I9九']
    })
    
    # df数据
       idx                  dt  msg
    0    1 2020-01-01 11:23:45  A1一
    1    2 2020-01-02 12:23:45  B2二
    2    3 2020-01-03 13:23:45  C3三
    3    4 2020-01-04 14:23:45  D4四
    4    5 2020-01-05 15:23:45  E5五
    5    6 2020-01-06 16:23:45  F6六
    6    7 2020-01-07 17:23:45  G7七
    7    8 2020-01-08 18:23:45  H8八
    8    9 2020-01-09 19:23:45  I9九
    


    数据过滤

    单值过滤

    mysql> select
        ->     idx,
        ->     dt,
        ->     msg
        -> from tab_name
        -> where idx>5;
    +-----+---------------------+-------+
    | idx | dt                  | msg   |
    +-----+---------------------+-------+
    |   6 | 2020-01-06 16:23:45 | F6六  |
    |   7 | 2020-01-07 17:23:45 | G7七  |
    |   8 | 2020-01-08 18:23:45 | H8八  |
    |   9 | 2020-01-09 19:23:45 | I9九  |
    +-----+---------------------+-------+
    4 rows in set (0.00 sec)
    
    print(df.loc[lambda x: x['idx'] > 5])
    
       idx                  dt  msg
    5    6 2020-01-06 16:23:45  F6六
    6    7 2020-01-07 17:23:45  G7七
    7    8 2020-01-08 18:23:45  H8八
    8    9 2020-01-09 19:23:45  I9九
    

    多值过滤

    mysql> select
        ->     idx,
        ->     dt,
        ->     msg
        -> from tab_name
        -> where idx in (2,4,6);
    +-----+---------------------+-------+
    | idx | dt                  | msg   |
    +-----+---------------------+-------+
    |   2 | 2020-01-02 12:23:45 | B2二  |
    |   4 | 2020-01-04 14:23:45 | D4四  |
    |   6 | 2020-01-06 16:23:45 | F6六  |
    +-----+---------------------+-------+
    3 rows in set (0.00 sec)
    
    print(df.loc[lambda x: x['idx'].isin([2, 4, 6])])
    
       idx                  dt  msg
    5    6 2020-01-06 16:23:45  F6六
    6    7 2020-01-07 17:23:45  G7七
    7    8 2020-01-08 18:23:45  H8八
    8    9 2020-01-09 19:23:45  I9九
    


    类型操作

    数值与字符串

    数值转字符串

    mysql> select
        ->     idx,
        ->     cast(idx as char) as idx_char
        -> from tab_name;
    +-----+----------+
    | idx | idx_char |
    +-----+----------+
    |   1 | 1        |
    |   2 | 2        |
    |   3 | 3        |
    |   4 | 4        |
    |   5 | 5        |
    |   6 | 6        |
    |   7 | 7        |
    |   8 | 8        |
    |   9 | 9        |
    +-----+----------+
    9 rows in set (0.00 sec)
    
    df['idx_char'] = df['idx'].apply(lambda x: str(x))
    print(df[['idx', 'idx_char']])
    
       idx idx_char
    0    1        1
    1    2        2
    2    3        3
    3    4        4
    4    5        5
    5    6        6
    6    7        7
    7    8        8
    8    9        9
    

    字符串转数值

    mysql> select
        ->     idx,
        ->     cast('1' as signed) as string_to_int
        -> from tab_name;
    +-----+---------------+
    | idx | string_to_int |
    +-----+---------------+
    |   1 |             1 |
    |   2 |             1 |
    |   3 |             1 |
    |   4 |             1 |
    |   5 |             1 |
    |   6 |             1 |
    |   7 |             1 |
    |   8 |             1 |
    |   9 |             1 |
    +-----+---------------+
    9 rows in set (0.00 sec)
    
    df['string_to_int'] = int('1')
    print(df[['string_to_int']])
    
       string_to_int
    0              1
    1              1
    2              1
    3              1
    4              1
    5              1
    6              1
    7              1
    8              1
    

    日期与字符串

    日期转字符串

    mysql> select
        ->     idx,
        ->     dt,
        ->     date_format(dt,'%Y-%m-%d %H:%i:%s') as datetime_to_string
        -> from tab_name;
    +-----+---------------------+---------------------+
    | idx | dt                  | datetime_to_string  |
    +-----+---------------------+---------------------+
    |   1 | 2020-01-01 11:23:45 | 2020-01-01 11:23:45 |
    |   2 | 2020-01-02 12:23:45 | 2020-01-02 12:23:45 |
    |   3 | 2020-01-03 13:23:45 | 2020-01-03 13:23:45 |
    |   4 | 2020-01-04 14:23:45 | 2020-01-04 14:23:45 |
    |   5 | 2020-01-05 15:23:45 | 2020-01-05 15:23:45 |
    |   6 | 2020-01-06 16:23:45 | 2020-01-06 16:23:45 |
    |   7 | 2020-01-07 17:23:45 | 2020-01-07 17:23:45 |
    |   8 | 2020-01-08 18:23:45 | 2020-01-08 18:23:45 |
    |   9 | 2020-01-09 19:23:45 | 2020-01-09 19:23:45 |
    +-----+---------------------+---------------------+
    9 rows in set (0.00 sec)
    
    df['datetime_to_string'] = df['dt'].apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))
    print(df[['idx','dt','datetime_to_string']])
    
       idx                  dt   datetime_to_string
    0    1 2020-01-01 11:23:45  2020-01-01 11:23:45
    1    2 2020-01-02 12:23:45  2020-01-02 12:23:45
    2    3 2020-01-03 13:23:45  2020-01-03 13:23:45
    3    4 2020-01-04 14:23:45  2020-01-04 14:23:45
    4    5 2020-01-05 15:23:45  2020-01-05 15:23:45
    5    6 2020-01-06 16:23:45  2020-01-06 16:23:45
    6    7 2020-01-07 17:23:45  2020-01-07 17:23:45
    7    8 2020-01-08 18:23:45  2020-01-08 18:23:45
    8    9 2020-01-09 19:23:45  2020-01-09 19:23:45
    

    字符串转日期

    mysql> select
        ->     idx,
        ->     str_to_date('2020-01-01 11:23:45','%Y-%m-%d %H:%i:%s') as string_to_datetime
        -> from tab_name;
    +-----+---------------------+
    | idx | string_to_datetime  |
    +-----+---------------------+
    |   1 | 2020-01-01 11:23:45 |
    |   2 | 2020-01-01 11:23:45 |
    |   3 | 2020-01-01 11:23:45 |
    |   4 | 2020-01-01 11:23:45 |
    |   5 | 2020-01-01 11:23:45 |
    |   6 | 2020-01-01 11:23:45 |
    |   7 | 2020-01-01 11:23:45 |
    |   8 | 2020-01-01 11:23:45 |
    |   9 | 2020-01-01 11:23:45 |
    +-----+---------------------+
    9 rows in set (0.00 sec)
    
    df['string_to_datetime'] = datetime.strptime('2020-01-01 11:23:45','%Y-%m-%d %H:%M:%S')
    print(df[['idx','string_to_datetime']])
    
       idx  string_to_datetime
    0    1 2020-01-01 11:23:45
    1    2 2020-01-01 11:23:45
    2    3 2020-01-01 11:23:45
    3    4 2020-01-01 11:23:45
    4    5 2020-01-01 11:23:45
    5    6 2020-01-01 11:23:45
    6    7 2020-01-01 11:23:45
    7    8 2020-01-01 11:23:45
    8    9 2020-01-01 11:23:45
    

    字符串截取

    mysql> select
        ->     idx,
        ->     msg,
        ->     substr(msg,3,1) as substring
        -> from tab_name;
    +-----+-------+-----------+
    | idx | msg   | substring |
    +-----+-------+-----------+
    |   1 | A1一  | 一        |
    |   2 | B2二  | 二        |
    |   3 | C3三  | 三        |
    |   4 | D4四  | 四        |
    |   5 | E5五  | 五        |
    |   6 | F6六  | 六        |
    |   7 | G7七  | 七        |
    |   8 | H8八  | 八        |
    |   9 | I9九  | 九        |
    +-----+-------+-----------+
    9 rows in set (0.00 sec)
    
    df['substring'] = df['msg'].apply(lambda x: x[2:3])
    print(df[['idx','msg','substring']])
    
       idx  msg substring
    0    1  A1一         一
    1    2  B2二         二
    2    3  C3三         三
    3    4  D4四         四
    4    5  E5五         五
    5    6  F6六         六
    6    7  G7七         七
    7    8  H8八         八
    8    9  I9九         九
    

    字符串拼接

    mysql> select
        ->     idx,
        ->     msg,
        ->     concat(msg,' + AaC') as concat_string
        -> from tab_name;
    +-----+-------+---------------+
    | idx | msg   | concat_string |
    +-----+-------+---------------+
    |   1 | A1一  | A1一 + AaC    |
    |   2 | B2二  | B2二 + AaC    |
    |   3 | C3三  | C3三 + AaC    |
    |   4 | D4四  | D4四 + AaC    |
    |   5 | E5五  | E5五 + AaC    |
    |   6 | F6六  | F6六 + AaC    |
    |   7 | G7七  | G7七 + AaC    |
    |   8 | H8八  | H8八 + AaC    |
    |   9 | I9九  | I9九 + AaC    |
    +-----+-------+---------------+
    9 rows in set (0.00 sec)
    
    df['concat_string'] = df['msg'].apply(lambda x: x + ' + AaC')
    print(df[['idx','msg','concat_string']])
    
       idx  msg concat_string
    0    1  A1一     A1一 + AaC
    1    2  B2二     B2二 + AaC
    2    3  C3三     C3三 + AaC
    3    4  D4四     D4四 + AaC
    4    5  E5五     E5五 + AaC
    5    6  F6六     F6六 + AaC
    6    7  G7七     G7七 + AaC
    7    8  H8八     H8八 + AaC
    8    9  I9九     I9九 + AaC
    

    字符串替换

    mysql> select
        ->     idx,
        ->     msg,
        ->     replace(msg,idx,'@') as replace_string
        -> from tab_name;
    +-----+-------+----------------+
    | idx | msg   | replace_string |
    +-----+-------+----------------+
    |   1 | A1一  | A@一           |
    |   2 | B2二  | B@二           |
    |   3 | C3三  | C@三           |
    |   4 | D4四  | D@四           |
    |   5 | E5五  | E@五           |
    |   6 | F6六  | F@六           |
    |   7 | G7七  | G@七           |
    |   8 | H8八  | H@八           |
    |   9 | I9九  | I@九           |
    +-----+-------+----------------+
    9 rows in set (0.00 sec)
    
    # 方法一
    def func(source, old, new):
        return source.replace(old, new)
    
    df['replace_string'] = df.apply(lambda x: func(x['msg'], str(x['idx']), '@'), axis=1)
    print(df[['idx', 'msg', 'replace_string']])
    
       idx  msg replace_string
    0    1  A1一            A@一
    1    2  B2二            B@二
    2    3  C3三            C@三
    3    4  D4四            D@四
    4    5  E5五            E@五
    5    6  F6六            F@六
    6    7  G7七            G@七
    7    8  H8八            H@八
    8    9  I9九            I@九
    
    
    # 方法二
    import re
    
    df['replace_string'] = df.apply(lambda x: re.sub(str(x['idx']), '@', x['msg']), axis=1)
    print(df[['idx', 'msg', 'replace_string']])
    
       idx  msg replace_string
    0    1  A1一            A@一
    1    2  B2二            B@二
    2    3  C3三            C@三
    3    4  D4四            D@四
    4    5  E5五            E@五
    5    6  F6六            F@六
    6    7  G7七            G@七
    7    8  H8八            H@八
    8    9  I9九            I@九
    


    列操作

    增加列

    mysql> select
        ->     idx,
        ->     msg,
        ->     concat(msg, '_new') as new_msg
        -> from tab_name;
    +-----+-------+-----------+
    | idx | msg   | new_msg   |
    +-----+-------+-----------+
    |   1 | A1一  | A1一_new  |
    |   2 | B2二  | B2二_new  |
    |   3 | C3三  | C3三_new  |
    |   4 | D4四  | D4四_new  |
    |   5 | E5五  | E5五_new  |
    |   6 | F6六  | F6六_new  |
    |   7 | G7七  | G7七_new  |
    |   8 | H8八  | H8八_new  |
    |   9 | I9九  | I9九_new  |
    +-----+-------+-----------+
    9 rows in set (0.00 sec)
    
    df['new_msg'] = df['msg'].apply(lambda x: x + '_new')
    print(df[['idx','msg','new_msg']])
    
       idx  msg  new_msg
    0    1  A1一  A1一_new
    1    2  B2二  B2二_new
    2    3  C3三  C3三_new
    3    4  D4四  D4四_new
    4    5  E5五  E5五_new
    5    6  F6六  F6六_new
    6    7  G7七  G7七_new
    7    8  H8八  H8八_new
    8    9  I9九  I9九_new
    

    Case语句

    mysql> select
        ->     idx,
        ->     msg,
        ->     case substr(msg,1,1)
        ->         when 'A' then 'AAA'
        ->         when 'B' then 'BBB'
        ->         else 'ZZZ'
        ->     end as case_msg
        -> from tab_name;
    +-----+-------+----------+
    | idx | msg   | case_msg |
    +-----+-------+----------+
    |   1 | A1一  | AAA      |
    |   2 | B2二  | BBB      |
    |   3 | C3三  | ZZZ      |
    |   4 | D4四  | ZZZ      |
    |   5 | E5五  | ZZZ      |
    |   6 | F6六  | ZZZ      |
    |   7 | G7七  | ZZZ      |
    |   8 | H8八  | ZZZ      |
    |   9 | I9九  | ZZZ      |
    +-----+-------+----------+
    9 rows in set (0.01 sec)
    
    def func(msg):
        s = msg[0:1]
        if s == 'A':
            return 'AAA'
        elif s == 'B':
            return 'BBB'
        else:
            return 'ZZZ'
    
    df['case_msg'] = df['msg'].apply(func)
    print(df[['idx','msg','case_msg']])
    
       idx  msg case_msg
    0    1  A1一      AAA
    1    2  B2二      BBB
    2    3  C3三      ZZZ
    3    4  D4四      ZZZ
    4    5  E5五      ZZZ
    5    6  F6六      ZZZ
    6    7  G7七      ZZZ
    7    8  H8八      ZZZ
    8    9  I9九      ZZZ
    


    Group By

    mysql> select
        ->     idx%3 as idx_mod,
        ->     count(idx) as idx_count,
        ->     sum(idx) as idx_sum
        -> from tab_name
        -> group by idx%3
        -> order by idx_mod;
    +---------+-----------+---------+
    | idx_mod | idx_count | idx_sum |
    +---------+-----------+---------+
    |       0 |         3 |      18 |
    |       1 |         3 |      12 |
    |       2 |         3 |      15 |
    +---------+-----------+---------+
    3 rows in set (0.00 sec)
    
    import numpy as np
    
    df['idx_mod'] = df['idx'].apply(lambda x: x%3)
    grouped = df.groupby('idx_mod')
    print(grouped.agg(idx_count=('idx','count'), idx_sum=('idx','sum')).sort_values(by='idx_mod', ascending=True))
    # 上面的等价写法如下:
    # print(grouped['idx'].agg(['count', 'sum']).rename(columns={'count': 'idx_count', 'sum': 'idx_sum'}).sort_values(by='idx_mod', ascending=True))
    
             idx_count  idx_sum
    idx_mod
    0                3       18
    1                3       12
    2                3       15
    
  • 相关阅读:
    SpringBoot自动装配原理解析
    面试官:你知道Spring中有哪些可以让我们扩展的地方么
    Spring事件监听机制
    Spring扩展点之Aware接口族
    程序员的进阶课-架构师之路(13)-B-树
    程序员的进阶课-架构师之路(12)-2-3-4树
    程序员的进阶课-架构师之路(11)-最容易理解的红黑树
    程序员的进阶课-架构师之路(10)-霍夫曼树
    程序员的进阶课-架构师之路(9)-平衡二叉树(AVL树)
    程序员的进阶课-架构师之路(8)-二叉树
  • 原文地址:https://www.cnblogs.com/shenyuelan/p/14080653.html
Copyright © 2011-2022 走看看