zoukankan      html  css  js  c++  java
  • Python中re模块的使用

    Python的re模块


    #预备知识点
    #正则表达式regex
    #特殊符号和字符 ---> 元字符



    正则表达式基础知识
    通配符 含义 正则示例 匹配结果
    reg1 | reg2 匹配正则表达式reg1或reg2 foo | bar foo
    . 匹配任何字符(\n 除外) a.a abc
    ^ 匹配字符串起始部分 ^a ab....
    $ 匹配字符串终止部分 .txt$ a.txt
    * 匹配0次或者多次前面出现的正则表达式 a* aaaaa
    + 匹配1次或者多次前面出现的正则表达式 [a-z]+ aasx
    ? 匹配0次或者1次前面出现的正则表达式 first? first
    {N} 匹配N次前面出现的正则表达式 *.c{2} first.c abc.c
    {M,N} 匹配M~N次前面出现的正则表达式 *.c{0,1} one.c
    [...] 匹配来自字符集的任意单个字符 [abc] b
    [...x-y...] 匹配x~y范围中的任意单个字符 [0-9] 9
    [^...] 不匹配此字符集中任意单个字符 [^0-9] a
    (*|+|?|{})? 匹配上面频繁出现符号的非贪婪版 (*|+|?|{})? ({})
    (...) 匹配封闭的正则表达式,然后另存为子组 ([0-1][0-9])? 12
    \d 匹配任何十进制数字 \d.txt 1.txt
    \w 匹配任何字母数字字符 \w{2}txt 1.txt
    \s 匹配任何空格字符 a\sb a b
    \b 匹配任何单词边界 \bThe\b The dog
    \N 匹配已保存的子组 ([0-9])\1 11
    \. 匹配"."这个字符 a\.txt a.txt
    常用正则表达式
    正则表达式 描述 匹配结果
    \d+(\.\d*)? 任意整数和浮点数 0.004 2 75.
    [^\Wa-z0-9_][^\WA-Z0-9_]* 首字母只能大写 Boo Foo
    ^http://([\w-]+(\.[\w-]+)+(/[\w-./?%&=\u4e00-\u9fa5]*)?)?$ 验证网址 http://www.baidu.com/?id=1
    ^[\u4e00-\u9fa5]{0,}$ 验证汉字 汉字汉字
    \w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)* 验证电子邮件 example@163.com
    ^[1-9]([0-9]{16}|[0-9]{13})[xX0-9]$ 验证身份证 14525419951215445X
    ^13[0-9]{1}[0-9]{8}|^15[9]{1}[0-9]{8} 验证手机号 138459572***
    ^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$ 验证IP 192.168.1.1
    ^[a-zA-Z0-9]+([a-zA-Z0-9-.]+)?.s|)$ 验证域名 baidu.com
    ^([a-zA-Z]:|\)\([^\]+\)*[^/:*?"<>|]+.txt(l)?$ 验证文件路径 C:userwo
    <(.*)>(.*)</(.*)>|<(.*)/> HTML标签匹配 xxxx

     


    #re模块
    #常用的方法
    compile(pattern, flags = 0)        匹配任何可选的标记来编译正则表达式的模式,然后返回一个正则表达式对象
    match(pattern, string, flags = 0)     使用带有可选标记的正则表达式的模式来匹配字符串。如果匹配成功,返回匹配对象,否则返回None
    search(pattern, string ,flags = 0)     使用可选标记搜索字符串中第一次出现的正则表达式模式。如果匹配成功,则返回匹配对象,否则返回None
    findall(pattern, string[,flags] )          查找字符串中所有(非重叠)出现的正则表达式模式,并返回一个匹配列表
    finditer(pattern, string[,flags] )        与findall()相同,但返回的是一个迭代器。对于每一次匹配,迭代器都能返回一个匹配对象
    split(pattern, string, maxsplit = 0)    根据正则表达式的模式分隔符,split函数将字符串分割为列表,返回匹配列表,分割最多操作maxsplit次(0表示不限次数)
    group(num = 0)               返回整个匹配对象,或者编号为num的特定子组

    import re
    # search() scans the whole string. match() would find nothing here because
    # it only tries to match at the very start of the string (the first 'a').
    m = re.search('foo', 'asdasdfooasd')
    if m is not None:
        print(m.group())

    # Likewise, only search() can locate the tag, since it is not at the start.
    regex = r'<(.*)>(.*)</(.*)>|<(.*)/>'
    m = re.search(regex, "aa<a>aaaa</a>")
    if m is not None:
        print(m.group())

    # Two capture groups: 'foo' plus one word character, then one more
    # word character.
    regex = r'(foo\w)(\w)'
    m = re.match(regex, 'fooasdfooasd')
    if m is not None:
        print(m.group(1))
        print(m.groups())
    # Output:
    # fooa
    # ('fooa', 's')

    # findall() returns every non-overlapping match as a list of strings.
    regex = 'apple'
    m = re.findall(regex, 'apple1 apple2 apple3')
    print(m)
    # Output:
    # ['apple', 'apple', 'apple']

    # finditer() yields match objects; collect the matched text from each.
    regex = 'apple'
    m = [g.group() for g in re.finditer(regex, 'apple1 apple2 apple3')]
    print(m)
    # Output:
    # ['apple', 'apple', 'apple']

    # Split on ", " or on a single space that is followed by a space and a
    # three-letter word (checked with a lookahead).
    samples = [
        'aaa, bbb ccc',
        'ddd, eee fff',
    ]
    for line in samples:
        print(re.split(', |(?= (?:[a-z]{3})) ', line))
    # Output:
    # ['aaa', 'bbb', 'ccc']
    # ['ddd', 'eee', 'fff']

    re模块小实例:

    __author__ = 'cq'
    
    import  re
    from random import randrange,choice,randint
    from string import ascii_lowercase as lc
    from time import ctime
    
    
    #生成数据文件
    def generate_data():
        """Write 20-30 random records to ./data.txt and print each one.

        Every record is one line of the form
        ``<ctime string>::<login>@<domain>.<tld>::<timestamp>-<login len>-<domain len>``
        terminated by a newline. The file is overwritten on each call.
        """
        tlds = ('com', 'edu', 'net', 'org', 'gov')  # loop-invariant, build once
        with open('./data.txt', 'w') as f:
            for _ in range(randint(20, 30)):
                dtint = randint(100000000, 1200000000)  # random timestamp
                dtstr = ctime(dtint)  # timestamp rendered as a ctime() string
                llen = randrange(4, 8)  # login length: 4..7
                login = ''.join(choice(lc) for _ in range(llen))
                dlen = randrange(llen, 13)  # domain length: llen..12
                dom = ''.join(choice(lc) for _ in range(dlen))

                # The trailing "\n" ends the record; print() adds another
                # newline, so the console shows a blank line between records.
                data_line = "%s::%s@%s.%s::%d-%d-%d\n" % (
                    dtstr, login, dom, choice(tlds), dtint, llen, dlen)
                f.write(data_line)
                print(data_line)
    
    
    
    #匹配指定日期的行
    def match_date():
        """Print a (weekday, remainder-of-line) tuple for each weekday hit in ./data.txt.

        A hit is any of the abbreviations Mon..Sun; the second tuple element is
        everything after it up to the end of that line.
        """
        weekday_pattern = re.compile('(Mon|Tue|Wed|Thu|Fri|Sat|Sun)(.*)')
        with open('./data.txt', 'r') as source:
            contents = source.read()
        for hit in weekday_pattern.findall(contents):
            print(hit)
    
    
    
    
    #匹配在某时间段内的记录
    def match_time_slot():
        """Print records from ./data.txt dated on day 20-31 in the years 2000-2020.

        Each printed tuple is (day, year, text after the first "year::").
        """
        slot_pattern = ' ([0-9]{1,2}) .*([0-9]{4})::(.*)'
        with open('./data.txt', 'r') as source:
            records = re.findall(slot_pattern, source.read())
        for record in records:
            year = int(record[1])
            if not 2000 <= year <= 2020:
                continue
            day = int(record[0])
            if 20 <= day <= 31:
                print(record)
    
    
    #匹配某名单中人员的记录
    def match_name():
        """Print a (login, domain, tld) tuple for every address found in ./data.txt.

        An address is ``::<login>@<domain>.<tld>`` where login and domain are
        2-13 lowercase letters and tld is one of com/edu/net/org/gov.
        """
        # The dot before the TLD is escaped so that only a literal "." is
        # accepted; an unescaped "." would match any character there.
        regex = r'::([a-z]{2,13})@([a-z]{2,13})\.(com|edu|net|org|gov)'
        with open('./data.txt', 'r') as f:
            for found in re.findall(regex, f.read()):
                print(found)
    
    
    
    def main():
        """Generate the sample data file, then run each matching demo under a banner."""
        generate_data()
        # Each banner is wrapped in "\n" so it is surrounded by blank lines.
        print("\n---------------match_date--------------------\n")
        match_date()
        print("\n---------------match_time_slot--------------------\n")
        match_time_slot()
        print("\n---------------match_name--------------------\n")
        match_name()
    
    
    # Run the demo only when this file is executed as a script.
    if __name__ == '__main__':
        main()
    输出结果
    Sun Mar  5 00:55:55 1989::qvnc@ygeowwaf.com::605033755-4-8
    
    Mon Oct 17 17:16:31 2005::yene@rtewqvvyfe.edu::1129540591-4-10
    
    Tue Oct  7 06:33:30 2003::wlyi@coagmnososzy.edu::1065479610-4-12
    
    Mon Oct 16 00:01:06 2006::zsgok@jkpiplcm.edu::1160928066-5-8
    
    Wed Mar 15 06:37:35 2000::paok@anpekysphicu.com::953073455-4-12
    
    Wed Mar 26 12:27:25 1980::bodqoe@iydohek.org::322892845-6-7
    
    Mon Jun  5 13:54:28 1989::fgiy@oppcjnafx.gov::613029268-4-9
    
    Sun Jul 25 05:27:23 2004::agmljfx@qvxgjqtkiwnl.org::1090704443-7-12
    
    Mon Nov 14 16:15:36 2005::tctz@bcikib.gov::1131956136-4-6
    
    Sun Jan 14 23:20:42 2007::qqlfkf@isslbh.com::1168788042-6-6
    
    Sun Jul 27 02:00:13 1980::cpiqwau@drbpfsfglip.edu::333482413-7-11
    
    Sun Feb 20 16:10:34 2005::aguqfd@hnrcaged.com::1108887034-6-8
    
    Wed Jun 27 06:13:05 1979::kowyk@ruoackjavkpq.net::299283185-5-12
    
    Wed Oct 12 19:52:54 1994::kqaol@mzewoas.edu::781962774-5-7
    
    Thu Aug 23 01:46:59 1973::uofpdq@zdeidbobin.org::114889619-6-10
    
    Sat Dec 21 11:36:20 1991::hodw@wfbw.org::693286580-4-4
    
    Tue Jun 22 14:42:19 1993::azgagm@nfmguh.org::740731339-6-6
    
    Sun Feb 23 04:50:57 2003::cysfu@fnzdo.com::1045947057-5-5
    
    Fri Jun 10 13:38:02 1983::qdhqw@fcdsvlmnhx.net::424071482-5-10
    
    Sat Jan 24 21:56:37 1998::dfyicjw@fklbymd.org::885650197-7-7
    
    Sun Jun  3 07:48:45 2007::wptuyjk@tsngnle.edu::1180828125-7-7
    
    Mon Nov 19 00:34:41 2001::ocjlb@nusyk.net::1006101281-5-5
    
    Sat Dec  1 21:01:23 1973::bvhx@lmir.net::123598883-4-4
    
    Sun Dec 16 17:42:51 1979::rpgs@hppau.org::314185371-4-5
    
    Mon Jul 21 23:46:13 1986::fnsro@nmbcwdmie.org::522344773-5-9
    
    
    ---------------match_date--------------------
    
    ('Sun', ' Mar  5 00:55:55 1989::qvnc@ygeowwaf.com::605033755-4-8')
    ('Mon', ' Oct 17 17:16:31 2005::yene@rtewqvvyfe.edu::1129540591-4-10')
    ('Tue', ' Oct  7 06:33:30 2003::wlyi@coagmnososzy.edu::1065479610-4-12')
    ('Mon', ' Oct 16 00:01:06 2006::zsgok@jkpiplcm.edu::1160928066-5-8')
    ('Wed', ' Mar 15 06:37:35 2000::paok@anpekysphicu.com::953073455-4-12')
    ('Wed', ' Mar 26 12:27:25 1980::bodqoe@iydohek.org::322892845-6-7')
    ('Mon', ' Jun  5 13:54:28 1989::fgiy@oppcjnafx.gov::613029268-4-9')
    ('Sun', ' Jul 25 05:27:23 2004::agmljfx@qvxgjqtkiwnl.org::1090704443-7-12')
    ('Mon', ' Nov 14 16:15:36 2005::tctz@bcikib.gov::1131956136-4-6')
    ('Sun', ' Jan 14 23:20:42 2007::qqlfkf@isslbh.com::1168788042-6-6')
    ('Sun', ' Jul 27 02:00:13 1980::cpiqwau@drbpfsfglip.edu::333482413-7-11')
    ('Sun', ' Feb 20 16:10:34 2005::aguqfd@hnrcaged.com::1108887034-6-8')
    ('Wed', ' Jun 27 06:13:05 1979::kowyk@ruoackjavkpq.net::299283185-5-12')
    ('Wed', ' Oct 12 19:52:54 1994::kqaol@mzewoas.edu::781962774-5-7')
    ('Thu', ' Aug 23 01:46:59 1973::uofpdq@zdeidbobin.org::114889619-6-10')
    ('Sat', ' Dec 21 11:36:20 1991::hodw@wfbw.org::693286580-4-4')
    ('Tue', ' Jun 22 14:42:19 1993::azgagm@nfmguh.org::740731339-6-6')
    ('Sun', ' Feb 23 04:50:57 2003::cysfu@fnzdo.com::1045947057-5-5')
    ('Fri', ' Jun 10 13:38:02 1983::qdhqw@fcdsvlmnhx.net::424071482-5-10')
    ('Sat', ' Jan 24 21:56:37 1998::dfyicjw@fklbymd.org::885650197-7-7')
    ('Sun', ' Jun  3 07:48:45 2007::wptuyjk@tsngnle.edu::1180828125-7-7')
    ('Mon', ' Nov 19 00:34:41 2001::ocjlb@nusyk.net::1006101281-5-5')
    ('Sat', ' Dec  1 21:01:23 1973::bvhx@lmir.net::123598883-4-4')
    ('Sun', ' Dec 16 17:42:51 1979::rpgs@hppau.org::314185371-4-5')
    ('Mon', ' Jul 21 23:46:13 1986::fnsro@nmbcwdmie.org::522344773-5-9')
    
    ---------------match_time_slot--------------------
    
    ('25', '2004', 'agmljfx@qvxgjqtkiwnl.org::1090704443-7-12')
    ('20', '2005', 'aguqfd@hnrcaged.com::1108887034-6-8')
    ('23', '2003', 'cysfu@fnzdo.com::1045947057-5-5')
    
    ---------------match_name--------------------
    
    ('qvnc', 'ygeowwaf', 'com')
    ('yene', 'rtewqvvyfe', 'edu')
    ('wlyi', 'coagmnososzy', 'edu')
    ('zsgok', 'jkpiplcm', 'edu')
    ('paok', 'anpekysphicu', 'com')
    ('bodqoe', 'iydohek', 'org')
    ('fgiy', 'oppcjnafx', 'gov')
    ('agmljfx', 'qvxgjqtkiwnl', 'org')
    ('tctz', 'bcikib', 'gov')
    ('qqlfkf', 'isslbh', 'com')
    ('cpiqwau', 'drbpfsfglip', 'edu')
    ('aguqfd', 'hnrcaged', 'com')
    ('kowyk', 'ruoackjavkpq', 'net')
    ('kqaol', 'mzewoas', 'edu')
    ('uofpdq', 'zdeidbobin', 'org')
    ('hodw', 'wfbw', 'org')
    ('azgagm', 'nfmguh', 'org')
    ('cysfu', 'fnzdo', 'com')
    ('qdhqw', 'fcdsvlmnhx', 'net')
    ('dfyicjw', 'fklbymd', 'org')
    ('wptuyjk', 'tsngnle', 'edu')
    ('ocjlb', 'nusyk', 'net')
    ('bvhx', 'lmir', 'net')
    ('rpgs', 'hppau', 'org')
    ('fnsro', 'nmbcwdmie', 'org')
    
    Process finished with exit code 0
    View Code

     

     

  • 相关阅读:
    position中的四种属性
    CSS中link和@import的区别
    隐藏对应元素的办法
    word20161217
    word20161216
    word20161215
    word20161214
    word20161213
    word201612012
    word20161211
  • 原文地址:https://www.cnblogs.com/cq146637/p/8072540.html
Copyright © 2011-2022 走看看