zoukankan      html  css  js  c++  java
  • pandas 处理文本数据

    import pandas as pd
    import numpy as np
    

    常规的字符串操作

    s = pd.Series(['A',"B","C","AaBa","Baca",np.nan,'dog','cat'])
    
    s
    
    0       A
    1       B
    2       C
    3    AaBa
    4    Baca
    5     NaN
    6     dog
    7     cat
    dtype: object
    
    s.str.lower()
    
    0       a
    1       b
    2       c
    3    aaba
    4    baca
    5     NaN
    6     dog
    7     cat
    dtype: object
    
    s.str.upper()
    
    0       A
    1       B
    2       C
    3    AABA
    4    BACA
    5     NaN
    6     DOG
    7     CAT
    dtype: object
    
    s.str.len()
    
    0    1.0
    1    1.0
    2    1.0
    3    4.0
    4    4.0
    5    NaN
    6    3.0
    7    3.0
    dtype: float64
    
    idx = pd.Index([' jack','jill ',' jesse','frank'])
    
    idx.str.strip() # 去掉左右两边的空白符
    
    Index(['jack', 'jill', 'jesse', 'frank'], dtype='object')
    
    idx.str.lstrip()  # 去掉左边的空白符
    
    Index(['jack', 'jill ', 'jesse', 'frank'], dtype='object')
    
    idx.str.rstrip()  # 去掉右边的空白符
    
    Index([' jack', 'jill', ' jesse', 'frank'], dtype='object')
    
    df = pd.DataFrame(np.random.randn(3,2),columns=[' Column A ',' Column B '],index=range(3))
    
    df
    
    Column A Column B
    0 0.048811 -1.097950
    1 -1.099516 -0.514286
    2 0.984136 -1.027790
    df.columns.str.strip()
    
    Index(['Column A', 'Column B'], dtype='object')
    
    df.columns.str.lower()
    
    Index([' column a ', ' column b '], dtype='object')
    
    df.columns = df.columns.str.strip().str.lower().str.replace(' ',"_")
    
    df
    
    column_a column_b
    0 0.048811 -1.097950
    1 -1.099516 -0.514286
    2 0.984136 -1.027790

    分割与替换字符

    str.split 操作
    s2 = pd.Series(['a_b_c',"c_D_e",np.nan,'f_g_H'])
    
    s2.str.split("_")
    
    0    [a, b, c]
    1    [c, D, e]
    2          NaN
    3    [f, g, H]
    dtype: object
    
    s2.str.split('_')[1]
    
    ['c', 'D', 'e']
    
    s2.str.split('_').str[1] # 切割之后的Series,通过str方法可以得到新的数据
    
    0      b
    1      D
    2    NaN
    3      g
    dtype: object
    
    s2.str.split('_').str.get(1)
    
    0      b
    1      D
    2    NaN
    3      g
    dtype: object
    
    s2.str.split('_',expand=True,n=1) # expand=True 把切分结果展开成 DataFrame 的多列,n 用来限制切分的次数
    
    0 1
    0 a b_c
    1 c D_e
    2 NaN NaN
    3 f g_H
    s2.str.rsplit('_',expand=True,n=1) # rsplit 方法
    
    0 1
    0 a_b c
    1 c_D e
    2 NaN NaN
    3 f_g H
    str.replace操作
    s3 = pd.Series(['A',"B","C","AaBa","Baca",np.nan,"CABA","dog","cat"])
    s3
    
    0       A
    1       B
    2       C
    3    AaBa
    4    Baca
    5     NaN
    6    CABA
    7     dog
    8     cat
    dtype: object
    
    s3.str.replace('^.a|dog','XX_XX',case=False)  # 替换第二个字符是a或者dog的字符串,忽略大小写,关于正则表达式的内容篇幅很大
    
    0          A
    1          B
    2          C
    3    XX_XXBa
    4    XX_XXca
    5        NaN
    6    XX_XXBA
    7      XX_XX
    8     XX_XXt
    dtype: object
    
    dollars = pd.Series(['12', '-$10', '$10,000'])
    dollars.str.replace('$', '') # replace $ to ''
    
    0        12
    1       -10
    2    10,000
    dtype: object
    
    dollars.str.replace("-$",'-')  #  doesn't work:按正则解析时 $ 表示字符串结尾,而不是字面字符 '$'
    
    0         12
    1       -$10
    2    $10,000
    dtype: object
    
    dollars.str.replace(r'-\$','-')
    # 转义 原字符-$  替换成'-'
    
    0         12
    1        -10
    2    $10,000
    dtype: object
    
    dollars.str.replace('-\\$', '-')
    
    0         12
    1        -10
    2    $10,000
    dtype: object
    
    str.cat操作
    s = pd.Series(['A',"B","C","D"])
    s.str.cat(sep=',')
    
    'A,B,C,D'
    
    s.str.cat()
    
    'ABCD'
    
    t = pd.Series(['a', 'b', np.nan, 'd'])
    t.str.cat(sep=',',na_rep='_')
    
    'a,b,_,d'
    
    s.str.cat(['a',"b","c","d"])
    
    0    Aa
    1    Bb
    2    Cc
    3    Dd
    dtype: object
    
    pd.Series(['a1', 'b2', 'c3']).str.extract(r'(?P<letter>[ab])(?P<digit>\d)', expand=False)#  组命名?P 
    
    letter digit
    0 a 1
    1 b 2
    2 NaN NaN
    match or contain操作
    pattern = r'[0-9][a-z]'
    pd.Series(['1','2','3a','3b','03c']).str.contains(pattern)# 包含数字字母的文本
    
    0    False
    1    False
    2     True
    3     True
    4     True
    dtype: bool
    
    pd.Series(['1','2','3a','3b','03c']).str.match(pattern)# match 要求从字符串开头就匹配该模式,所以 '03c' 为 False
    
    0    False
    1    False
    2     True
    3     True
    4    False
    dtype: bool
    
    其他的方法,可以参考官方文档中的方法函数

  • 相关阅读:
    程序员这生必须掌握的两种图形
    用一张组织架构图说清楚类和对象
    简单工厂、工厂方法、抽象工厂的比较与分析
    rabbitmq系列(一)初识rabbitmq
    【最新】经典面试100问,附答案
    使用wordPress搭建个人博客
    调试接口你还在用postman吗
    Token ,Cookie、Session傻傻分不清楚?
    你不可不知的自定义注解
    使用aop加解密http接口
  • 原文地址:https://www.cnblogs.com/onemorepoint/p/10106072.html
Copyright © 2011-2022 走看看