环境说明
- MySQL版本:5.7.28
- Pandas版本:1.0.1
- Python版本:3.8.2
数据准备
MySQL:
create table tab_name(
idx int primary key,
dt datetime,
msg varchar(50)
);
insert into tab_name(idx,dt,msg) values(1,str_to_date('2020-01-01 11:23:45','%Y-%m-%d %H:%i:%s'),'A1一');
insert into tab_name(idx,dt,msg) values(2,str_to_date('2020-01-02 12:23:45','%Y-%m-%d %H:%i:%s'),'B2二');
insert into tab_name(idx,dt,msg) values(3,str_to_date('2020-01-03 13:23:45','%Y-%m-%d %H:%i:%s'),'C3三');
insert into tab_name(idx,dt,msg) values(4,str_to_date('2020-01-04 14:23:45','%Y-%m-%d %H:%i:%s'),'D4四');
insert into tab_name(idx,dt,msg) values(5,str_to_date('2020-01-05 15:23:45','%Y-%m-%d %H:%i:%s'),'E5五');
insert into tab_name(idx,dt,msg) values(6,str_to_date('2020-01-06 16:23:45','%Y-%m-%d %H:%i:%s'),'F6六');
insert into tab_name(idx,dt,msg) values(7,str_to_date('2020-01-07 17:23:45','%Y-%m-%d %H:%i:%s'),'G7七');
insert into tab_name(idx,dt,msg) values(8,str_to_date('2020-01-08 18:23:45','%Y-%m-%d %H:%i:%s'),'H8八');
insert into tab_name(idx,dt,msg) values(9,str_to_date('2020-01-09 19:23:45','%Y-%m-%d %H:%i:%s'),'I9九');
mysql> select * from tab_name;
+-----+---------------------+------+
| idx | dt | msg |
+-----+---------------------+------+
| 1 | 2020-01-01 11:23:45 | A1一 |
| 2 | 2020-01-02 12:23:45 | B2二 |
| 3 | 2020-01-03 13:23:45 | C3三 |
| 4 | 2020-01-04 14:23:45 | D4四 |
| 5 | 2020-01-05 15:23:45 | E5五 |
| 6 | 2020-01-06 16:23:45 | F6六 |
| 7 | 2020-01-07 17:23:45 | G7七 |
| 8 | 2020-01-08 18:23:45 | H8八 |
| 9 | 2020-01-09 19:23:45 | I9九 |
+-----+---------------------+------+
9 rows in set (0.00 sec)
Pandas:
import pandas as pd
from datetime import datetime
df = pd.DataFrame({
'idx': [1,2,3,4,5,6,7,8,9],
'dt': [
datetime(2020,1,1,11,23,45),
datetime(2020,1,2,12,23,45),
datetime(2020,1,3,13,23,45),
datetime(2020,1,4,14,23,45),
datetime(2020,1,5,15,23,45),
datetime(2020,1,6,16,23,45),
datetime(2020,1,7,17,23,45),
datetime(2020,1,8,18,23,45),
datetime(2020,1,9,19,23,45),
],
'msg': ['A1一','B2二','C3三','D4四','E5五','F6六','G7七','H8八','I9九']
})
# df数据
idx dt msg
0 1 2020-01-01 11:23:45 A1一
1 2 2020-01-02 12:23:45 B2二
2 3 2020-01-03 13:23:45 C3三
3 4 2020-01-04 14:23:45 D4四
4 5 2020-01-05 15:23:45 E5五
5 6 2020-01-06 16:23:45 F6六
6 7 2020-01-07 17:23:45 G7七
7 8 2020-01-08 18:23:45 H8八
8 9 2020-01-09 19:23:45 I9九
数据过滤
单值过滤
mysql> select
-> idx,
-> dt,
-> msg
-> from tab_name
-> where idx>5;
+-----+---------------------+-------+
| idx | dt | msg |
+-----+---------------------+-------+
| 6 | 2020-01-06 16:23:45 | F6六 |
| 7 | 2020-01-07 17:23:45 | G7七 |
| 8 | 2020-01-08 18:23:45 | H8八 |
| 9 | 2020-01-09 19:23:45 | I9九 |
+-----+---------------------+-------+
4 rows in set (0.00 sec)
print(df.loc[lambda x: x['idx'] > 5])
idx dt msg
5 6 2020-01-06 16:23:45 F6六
6 7 2020-01-07 17:23:45 G7七
7 8 2020-01-08 18:23:45 H8八
8 9 2020-01-09 19:23:45 I9九
多值过滤
mysql> select
-> idx,
-> dt,
-> msg
-> from tab_name
-> where idx in (2,4,6);
+-----+---------------------+-------+
| idx | dt | msg |
+-----+---------------------+-------+
| 2 | 2020-01-02 12:23:45 | B2二 |
| 4 | 2020-01-04 14:23:45 | D4四 |
| 6 | 2020-01-06 16:23:45 | F6六 |
+-----+---------------------+-------+
3 rows in set (0.00 sec)
print(df.loc[lambda x: x['idx'].isin([2, 4, 6])])
idx dt msg
5 6 2020-01-06 16:23:45 F6六
6 7 2020-01-07 17:23:45 G7七
7 8 2020-01-08 18:23:45 H8八
8 9 2020-01-09 19:23:45 I9九
类型操作
数值与字符串
数值转字符串
mysql> select
-> idx,
-> cast(idx as char) as idx_char
-> from tab_name;
+-----+----------+
| idx | idx_char |
+-----+----------+
| 1 | 1 |
| 2 | 2 |
| 3 | 3 |
| 4 | 4 |
| 5 | 5 |
| 6 | 6 |
| 7 | 7 |
| 8 | 8 |
| 9 | 9 |
+-----+----------+
9 rows in set (0.00 sec)
df['idx_char'] = df['idx'].apply(lambda x: str(x))
print(df[['idx', 'idx_char']])
idx idx_char
0 1 1
1 2 2
2 3 3
3 4 4
4 5 5
5 6 6
6 7 7
7 8 8
8 9 9
字符串转数值
mysql> select
-> idx,
-> cast('1' as signed) as string_to_int
-> from tab_name;
+-----+---------------+
| idx | string_to_int |
+-----+---------------+
| 1 | 1 |
| 2 | 1 |
| 3 | 1 |
| 4 | 1 |
| 5 | 1 |
| 6 | 1 |
| 7 | 1 |
| 8 | 1 |
| 9 | 1 |
+-----+---------------+
9 rows in set (0.00 sec)
df['string_to_int'] = int('1')
print(df[['string_to_int']])
string_to_int
0 1
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
日期与字符串
日期转字符串
mysql> select
-> idx,
-> dt,
-> date_format(dt,'%Y-%m-%d %H:%i:%s') as datetime_to_string
-> from tab_name;
+-----+---------------------+---------------------+
| idx | dt | datetime_to_string |
+-----+---------------------+---------------------+
| 1 | 2020-01-01 11:23:45 | 2020-01-01 11:23:45 |
| 2 | 2020-01-02 12:23:45 | 2020-01-02 12:23:45 |
| 3 | 2020-01-03 13:23:45 | 2020-01-03 13:23:45 |
| 4 | 2020-01-04 14:23:45 | 2020-01-04 14:23:45 |
| 5 | 2020-01-05 15:23:45 | 2020-01-05 15:23:45 |
| 6 | 2020-01-06 16:23:45 | 2020-01-06 16:23:45 |
| 7 | 2020-01-07 17:23:45 | 2020-01-07 17:23:45 |
| 8 | 2020-01-08 18:23:45 | 2020-01-08 18:23:45 |
| 9 | 2020-01-09 19:23:45 | 2020-01-09 19:23:45 |
+-----+---------------------+---------------------+
9 rows in set (0.00 sec)
df['datetime_to_string'] = df['dt'].apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))
print(df[['idx','dt','datetime_to_string']])
idx dt datetime_to_string
0 1 2020-01-01 11:23:45 2020-01-01 11:23:45
1 2 2020-01-02 12:23:45 2020-01-02 12:23:45
2 3 2020-01-03 13:23:45 2020-01-03 13:23:45
3 4 2020-01-04 14:23:45 2020-01-04 14:23:45
4 5 2020-01-05 15:23:45 2020-01-05 15:23:45
5 6 2020-01-06 16:23:45 2020-01-06 16:23:45
6 7 2020-01-07 17:23:45 2020-01-07 17:23:45
7 8 2020-01-08 18:23:45 2020-01-08 18:23:45
8 9 2020-01-09 19:23:45 2020-01-09 19:23:45
字符串转日期
mysql> select
-> idx,
-> str_to_date('2020-01-01 11:23:45','%Y-%m-%d %H:%i:%s') as string_to_datetime
-> from tab_name;
+-----+---------------------+
| idx | string_to_datetime |
+-----+---------------------+
| 1 | 2020-01-01 11:23:45 |
| 2 | 2020-01-01 11:23:45 |
| 3 | 2020-01-01 11:23:45 |
| 4 | 2020-01-01 11:23:45 |
| 5 | 2020-01-01 11:23:45 |
| 6 | 2020-01-01 11:23:45 |
| 7 | 2020-01-01 11:23:45 |
| 8 | 2020-01-01 11:23:45 |
| 9 | 2020-01-01 11:23:45 |
+-----+---------------------+
9 rows in set (0.00 sec)
df['string_to_datetime'] = datetime.strptime('2020-01-01 11:23:45','%Y-%m-%d %H:%M:%S')
print(df[['idx','string_to_datetime']])
idx string_to_datetime
0 1 2020-01-01 11:23:45
1 2 2020-01-01 11:23:45
2 3 2020-01-01 11:23:45
3 4 2020-01-01 11:23:45
4 5 2020-01-01 11:23:45
5 6 2020-01-01 11:23:45
6 7 2020-01-01 11:23:45
7 8 2020-01-01 11:23:45
8 9 2020-01-01 11:23:45
字符串截取
mysql> select
-> idx,
-> msg,
-> substr(msg,3,1) as substring
-> from tab_name;
+-----+-------+-----------+
| idx | msg | substring |
+-----+-------+-----------+
| 1 | A1一 | 一 |
| 2 | B2二 | 二 |
| 3 | C3三 | 三 |
| 4 | D4四 | 四 |
| 5 | E5五 | 五 |
| 6 | F6六 | 六 |
| 7 | G7七 | 七 |
| 8 | H8八 | 八 |
| 9 | I9九 | 九 |
+-----+-------+-----------+
9 rows in set (0.00 sec)
df['substring'] = df['msg'].apply(lambda x: x[2:3])
print(df[['idx','msg','substring']])
idx msg substring
0 1 A1一 一
1 2 B2二 二
2 3 C3三 三
3 4 D4四 四
4 5 E5五 五
5 6 F6六 六
6 7 G7七 七
7 8 H8八 八
8 9 I9九 九
字符串拼接
mysql> select
-> idx,
-> msg,
-> concat(msg,' + AaC') as concat_string
-> from tab_name;
+-----+-------+---------------+
| idx | msg | concat_string |
+-----+-------+---------------+
| 1 | A1一 | A1一 + AaC |
| 2 | B2二 | B2二 + AaC |
| 3 | C3三 | C3三 + AaC |
| 4 | D4四 | D4四 + AaC |
| 5 | E5五 | E5五 + AaC |
| 6 | F6六 | F6六 + AaC |
| 7 | G7七 | G7七 + AaC |
| 8 | H8八 | H8八 + AaC |
| 9 | I9九 | I9九 + AaC |
+-----+-------+---------------+
9 rows in set (0.00 sec)
df['concat_string'] = df['msg'].apply(lambda x: x + ' + AaC')
print(df[['idx','msg','concat_string']])
idx msg concat_string
0 1 A1一 A1一 + AaC
1 2 B2二 B2二 + AaC
2 3 C3三 C3三 + AaC
3 4 D4四 D4四 + AaC
4 5 E5五 E5五 + AaC
5 6 F6六 F6六 + AaC
6 7 G7七 G7七 + AaC
7 8 H8八 H8八 + AaC
8 9 I9九 I9九 + AaC
字符串替换
mysql> select
-> idx,
-> msg,
-> replace(msg,idx,'@') as replace_string
-> from tab_name;
+-----+-------+----------------+
| idx | msg | replace_string |
+-----+-------+----------------+
| 1 | A1一 | A@一 |
| 2 | B2二 | B@二 |
| 3 | C3三 | C@三 |
| 4 | D4四 | D@四 |
| 5 | E5五 | E@五 |
| 6 | F6六 | F@六 |
| 7 | G7七 | G@七 |
| 8 | H8八 | H@八 |
| 9 | I9九 | I@九 |
+-----+-------+----------------+
9 rows in set (0.00 sec)
# 方法一
def func(source, old, new):
return source.replace(old, new)
df['replace_string'] = df.apply(lambda x: func(x['msg'], str(x['idx']), '@'), axis=1)
print(df[['idx', 'msg', 'replace_string']])
idx msg replace_string
0 1 A1一 A@一
1 2 B2二 B@二
2 3 C3三 C@三
3 4 D4四 D@四
4 5 E5五 E@五
5 6 F6六 F@六
6 7 G7七 G@七
7 8 H8八 H@八
8 9 I9九 I@九
# 方法二
import re
df['replace_string'] = df.apply(lambda x: re.sub(str(x['idx']), '@', x['msg']), axis=1)
print(df[['idx', 'msg', 'replace_string']])
idx msg replace_string
0 1 A1一 A@一
1 2 B2二 B@二
2 3 C3三 C@三
3 4 D4四 D@四
4 5 E5五 E@五
5 6 F6六 F@六
6 7 G7七 G@七
7 8 H8八 H@八
8 9 I9九 I@九
列操作
增加列
mysql> select
-> idx,
-> msg,
-> concat(msg, '_new') as new_msg
-> from tab_name;
+-----+-------+-----------+
| idx | msg | new_msg |
+-----+-------+-----------+
| 1 | A1一 | A1一_new |
| 2 | B2二 | B2二_new |
| 3 | C3三 | C3三_new |
| 4 | D4四 | D4四_new |
| 5 | E5五 | E5五_new |
| 6 | F6六 | F6六_new |
| 7 | G7七 | G7七_new |
| 8 | H8八 | H8八_new |
| 9 | I9九 | I9九_new |
+-----+-------+-----------+
9 rows in set (0.00 sec)
df['new_msg'] = df['msg'].apply(lambda x: x + '_new')
print(df[['idx','msg','new_msg']])
idx msg new_msg
0 1 A1一 A1一_new
1 2 B2二 B2二_new
2 3 C3三 C3三_new
3 4 D4四 D4四_new
4 5 E5五 E5五_new
5 6 F6六 F6六_new
6 7 G7七 G7七_new
7 8 H8八 H8八_new
8 9 I9九 I9九_new
Case语句
mysql> select
-> idx,
-> msg,
-> case substr(msg,1,1)
-> when 'A' then 'AAA'
-> when 'B' then 'BBB'
-> else 'ZZZ'
-> end as case_msg
-> from tab_name;
+-----+-------+----------+
| idx | msg | case_msg |
+-----+-------+----------+
| 1 | A1一 | AAA |
| 2 | B2二 | BBB |
| 3 | C3三 | ZZZ |
| 4 | D4四 | ZZZ |
| 5 | E5五 | ZZZ |
| 6 | F6六 | ZZZ |
| 7 | G7七 | ZZZ |
| 8 | H8八 | ZZZ |
| 9 | I9九 | ZZZ |
+-----+-------+----------+
9 rows in set (0.01 sec)
def func(msg):
s = msg[0:1]
if s == 'A':
return 'AAA'
elif s == 'B':
return 'BBB'
else:
return 'ZZZ'
df['case_msg'] = df['msg'].apply(func)
print(df[['idx','msg','case_msg']])
idx msg case_msg
0 1 A1一 AAA
1 2 B2二 BBB
2 3 C3三 ZZZ
3 4 D4四 ZZZ
4 5 E5五 ZZZ
5 6 F6六 ZZZ
6 7 G7七 ZZZ
7 8 H8八 ZZZ
8 9 I9九 ZZZ
Group By
mysql> select
-> idx%3 as idx_mod,
-> count(idx) as idx_count,
-> sum(idx) as idx_sum
-> from tab_name
-> group by idx%3
-> order by idx_mod;
+---------+-----------+---------+
| idx_mod | idx_count | idx_sum |
+---------+-----------+---------+
| 0 | 3 | 18 |
| 1 | 3 | 12 |
| 2 | 3 | 15 |
+---------+-----------+---------+
3 rows in set (0.00 sec)
import numpy as np
df['idx_mod'] = df['idx'].apply(lambda x: x%3)
grouped = df.groupby('idx_mod')
print(grouped.agg(idx_count=('idx','count'), idx_sum=('idx','sum')).sort_values(by='idx_mod', ascending=True))
# 上面的等价写法如下:
# print(grouped['idx'].agg(['count', 'sum']).rename(columns={'count': 'idx_count', 'sum': 'idx_sum'}).sort_values(by='idx_mod', ascending=True))
idx_count idx_sum
idx_mod
0 3 18
1 3 12
2 3 15