zoukankan      html  css  js  c++  java
  • python模块之re模块

    1. 正则

    正则就是用一些具有特殊意义的符号组合到一起(正则表达式)来描述字符或者字符串的方法,在python中正则匹配是通过re模块来实现的

     2. re模块

     单个字符匹配

    # \w与\W
    # s2 = "df当你 的_眼  睛眯|着/笑?sh29 sedn"
    # print(re.findall(r"\w", s2))   # ['d', 'f', '当', '你', '的', '_', '眼', '睛', '眯', '着', '笑', 's', 'h', '2', '9', 's', 'e', 'd', 'n']
    # print(re.findall(r"\W", s2))   # [' ', ' ', ' ', '|', '/', '?', ' ']
    
    # \s与\S
    # s3 = "df当\b  2\t眯|着/笑\n?sh\n29 d\nn"
    # print(re.findall(r"\s", s3))    # [' ', ' ', '\t', '\n', '\n', ' ', '\n']
    # print(re.findall(r"\S", s3))    # ['d', 'f', '当', '\x08', '2', '眯', '|', '着', '/', '笑', '?', 's', 'h', '2', '9', 'd', 'n']
    
    # \d与\D
    # print(re.findall(r"\d", s3))   # ['2', '2', '9']
    # print(re.findall(r"\D", s3))   # ['d', 'f', '当', '\x08', ' ', ' ', '\t', '眯', '|', '着', '/', '笑', '\n', '?', 's', 'h', '\n', ' ', 'd', '\n', 'n']
    
    # \A与^
    # print(re.findall(r"\Adf", s3))  # ['df']
    # print(re.findall(r"\Ad", s3))   # ['d']
    # print(re.findall(r"\A当", s3))  # []
    # print(re.findall("^df", s3))    # ['df']
    # print(re.findall("^当", s3))    # []
    
    # s4 = "df当 你的眼 睛眯|着/?sh\n29 dn笑"
    # \Z与$   \z不能用
    # print(re.findall(r"笑\Z", s4))   # ['笑']
    # print(re.findall("笑$", s4))    # ['笑']
    
    # s5 = "s_\t\t\nhe\t哈哈\n\r 爱好 \nu\n"
    #
    # print(re.findall("\n", s5))   # ['\n', '\n', '\n', '\n']
    # print(re.findall("\t", s5))   # ['\t', '\t', '\t']

     重复匹配

    # .  ?  *  +  {m,n}  .*  .*?
    
    # . 匹配任意字符,除了换行符(加上re.DOTALL这个参数可以匹配\n)
    # s1 = "aa bbb aabb acb agb bba babbcb"
    # print(re.findall("a.b", s1))    # ['a b', 'aab', 'acb', 'agb', 'a b', 'abb']
    # print(re.findall("aa.b", s1))
    """
    匹配逻辑
    1. 读取三个字符
    2. 进行匹配
    3. 成功则返回这三个字符,并从最后一个字符下一个字符开始匹配
    4. 失败则从第一个字符的下一个字符开始匹配
    """
    
    # s2 = "aa babb aabb aaab aaaab bab ba"
    
    # ? 匹配0个或1个左边(单个)字符表达式,满足贪婪规则
    # print(re.findall("a?b", s2))   # ['b', 'ab', 'b', 'ab', 'b', 'ab', 'ab', 'b', 'ab', 'b']
    
    # * 匹配0个或多个左边(单个)字符的表达式  满足贪婪规则
    # s3 = "aa babb aabb aacb aaab bba ba"
    # print(re.findall("aa*b", s3))   # ['ab', 'aab', 'aaab']
    # print(re.findall("a*b", s3))   # ['b', 'ab', 'b', 'aab', 'b', 'b', 'aaab', 'b', 'b', 'b']
    
    # + 匹配一个或多个左边字符的表达式,满足贪婪规则
    # print(re.findall("a+b", s3))   # ['ab', 'aab', 'aaab']
    # print(re.findall("ab+b", s3))   # ['abb', 'abb']
    
    # {m,n} 匹配m个至n个左边表达式,满足贪婪规则
    # s4 = 'ab aab aaab aaaaabb'
    # print(re.findall("a{2,4}b", s4))  # ['aab', 'aaab', 'aaaab']
    
    # .* 贪婪匹配(尽可能地多),从头到尾
    s5 = "ab aa_b a*()b"
    # print(re.findall("a.*b", s5))  # ['ab aa_b a*()b'] 匹配以a开头以b结尾的任意长度的字符串
    # 上式匹配逻辑:从a开始,找到最后一个b,停止
    # print(re.findall("a.*_", s5))  # ['ab aa_']
    
    # .*?从头到尾匹配,非贪婪
    # print(re.findall("a.*?b", s5))  # ['ab', 'aa_b', 'a*()b']
    # 上式匹配逻辑:从a开始,找到第一个b,停止,继续下一轮匹配
    # []
    #
    # [] 括号中可以放任意一个字符
    # - 在括号中表示范围,如果你要匹配上-,那么这个不能放在中间
    # s1 = 'a1b a3b abb a*b acb a_b'
    # print(re.findall("a[abc]b", s1))  # ['abb', 'acb']  # [abc]表示abc中的任意一个字符
    # print(re.findall("a[1-9]b", s1))  # ['a1b', 'a3b']
    # s2 = 'aAb aWb aeb a*b arb a_b'
    # print(re.findall("a[A-Z]b", s2))  # ['aAb', 'aWb']
    # print(re.findall("a[a-z]b", s2))  # ['aeb', 'arb']
    # print(re.findall("a[A-Za-z]b", s2))  # ['aAb', 'aWb', 'aeb', 'arb']

     分组

    # 分组
    # ()制定一个规则,将满足规则的结果匹配出来
    # 练习1:找到s4里面的hang juan min
    # s4 = "hang_1 hang_gr juan_1 min_1"
    # print(re.findall("(.*?)_1", s4))  # ['hang', ' hang_gr juan', ' min']
    # print(re.findall("([a-z]+)_1", s4))  # ['hang', 'juan', 'min']
    # 分析:都是以字母开头,以_1结尾,字母可以有多个
    
    # 练习2:找到一个标签里的网址
    # s5 = '<a href="http://www.baidu.com">点击</a>'
    # print(re.findall('href="([a-z].*?)"', s5))  # ['http://www.baidu.com']
    
    # | 匹配左边或右边
    # s6 = "hanser:149 yousa:148 mandy:160"
    # print(re.findall("hanser|yousa|mandy", s6))  # ['hanser', 'yousa', 'mandy']
    
    # s7 = 'Too many companies have gone bankrupt, and the next one is my company'
    # print(re.findall("compan(?:y|ies)", s7))   # ['companies', 'company']
    # ?:表示将整体匹配出来而不只是()里面的内容

    3. 常用方法

    findall 

    全部找到并返回一个列表

    # 找到下面标签里面的网址
    import re

    s1 = '<img src="https://pic3.zhimg.com/80/v2-1d1a5e4f422a77372514a57f38503f3e_hd.jpg" data-rawwidth="564" data-rawheight="699" data-size="normal" data-default-watermark-src="https://pic1.zhimg.com/v2-22b99e59d8efc7e7dec3faba8fbf2a24_b.jpg" class="origin_image zh-lightbox-thumb lazy" width="564" data-original="https://pic3.zhimg.com/v2-1d1a5e4f422a77372514a57f38503f3e_r.jpg" data-actualsrc="https://pic3.zhimg.com/v2-1d1a5e4f422a77372514a57f38503f3e_b.jpg">'
    print(re.findall('src="([a-z].*?)"', s1))

    # 结果['https://pic3.zhimg.com/80/v2-1d1a5e4f422a77372514a57f38503f3e_hd.jpg', 'https://pic1.zhimg.com/v2-22b99e59d8efc7e7dec3faba8fbf2a24_b.jpg', 'https://pic3.zhimg.com/v2-1d1a5e4f422a77372514a57f38503f3e_b.jpg']

    search

    找到第一个并返回包含匹配信息的对象,该对象可以通过group()方法得到匹配的字符串,没找到返回None

    s = "Hanser is a little girl in kindergarten"
    ret = re.search("[A-Z][a-z]*", "Hanser is a little girl in kindergarten")
    print(ret)   # <_sre.SRE_Match object; span=(0, 6), match='Hanser'>
    print(ret.group())   # Hanser

    match

    同search,区别在于从字符串开始处进行匹配,可以用search+^代替

    s = "Hanser is a little girl in kindergarten"
    print(re.match("Hanser", s).group())   # Hanser

    split

    按照指定的分割符分割

    s = "Hanser is a little girl in kindergarten"
    print(re.split(" ", s))  # ['Hanser', 'is', 'a', 'little', 'girl', 'in', 'kindergarten']
    s1 = "花褪残红青杏小,燕子飞时,绿水人家绕。枝上柳绵吹又少,天涯何处无芳草。"
    # 方法一:
    lst = re.split("[,。]", s1)  # ['花褪残红青杏小', '燕子飞时', '绿水人家绕', '枝上柳绵吹又少', '天涯何处无芳草', '']
    for i in lst:
        if i:  # 过滤空字符
            print(i[0])  # 花 燕 绿 枝 天
    
    # 方法二:
    lst1 = re.findall(r"[,。]([^,。])", s1)
    print(lst1)  # ['燕', '绿', '枝', '天'] 只能找到除去开头的短句首字

    sub

    替换

    s2 = "大家好,我是常山赵子龙"
    print(re.sub("常山", "石家庄", s2))   # 大家好,我是石家庄赵子龙

    compile

    # compile 制定一个匹配规则
    obj = re.compile(r"\d{2}")
    print(obj.search("sdfs14523sdf").group())  # 14
    print(obj.findall("sdfs14523sdf"))    # ['14', '52']

    finditer

    返回一个存放匹配结果的迭代器

    ret = re.finditer(r"\d", "sd283sef8w3o7sh")
    print(ret)   # 迭代器 <callable_iterator object at 0x000001C37F3A9C50>
    print(next(ret))  # match对象
    print(next(ret).group())  # 8
    print(next(ret).group())  # 3
    print(next(ret).group())  # 8
    print([i.group() for i in ret])  # 查看剩余结果

    命名分组

    # ret = re.search(r"<(?P<tag_name>\w+)>\w+</(?P=tag_name)>", "<h1>hello</h1>")
    # 在分组中利用?P<name>给分组起名字
    # 获取的匹配结果可以直接用group("名字")拿到对应的值
    # print(ret.group())   # <h1>hello</h1>
    # print(ret.group("tag_name"))  # h1
    
    # 如果不给组起名字,也可以用序号来找到对应的组,获取的结果可以直接用group(序号)拿到对应的值
    # ret = re.search(r"<(\w+)>\w+</\1>", "<h1>hello</h1>")
    # print(ret.group())  # <h1>hello</h1>
    # print(ret.group(1))  # h1
    
    # ret = re.findall(r"<(?P<tag_name>\w+)>\w+</(?P=tag_name)>", "<h1>hello</h1>")
    # print(ret)   # ['h1']
  • 相关阅读:
    Java实现 LeetCode 833 字符串中的查找与替换(暴力模拟)
    Java实现 LeetCode 833 字符串中的查找与替换(暴力模拟)
    Java实现 LeetCode 833 字符串中的查找与替换(暴力模拟)
    Java实现 LeetCode 832 翻转图像(位运算)
    Java实现 LeetCode 832 翻转图像(位运算)
    Java实现 LeetCode 832 翻转图像(位运算)
    Java实现 LeetCode 831 隐藏个人信息(暴力)
    Java实现 LeetCode 831 隐藏个人信息(暴力)
    Java实现 LeetCode 831 隐藏个人信息(暴力)
    how to use automapper in c#, from cf~
  • 原文地址:https://www.cnblogs.com/zzliu/p/10257371.html
Copyright © 2011-2022 走看看