zoukankan      html  css  js  c++  java
  • re模块详解

      1 #!/usr/bin/env python
      2 #-*- coding:UTF-8 -*-
      3 #####################################################
      4 # Author: sunfx   xingrhce@163.com
      5 # Last modified:  2014/11/18
      6 # Filename:  re.py
      7 # Q  Q  群:  236147801
      8 #####################################################
      9  
     10 import re
     11  
     12 #1.查找文本中的字符
     13  
     14 pattern = 'this'
     15 text = 'Does this text match the pattern?'
     16  
     17 match = re.search(pattern,text)
     18  
     19 s = match.start()
     20 e = match.end()
     21  
     22 print 'Found "%s"
    in "%s"
    from %d to %d ("%s")' %
     23       (match.re.pattern,match.string,s,e,text[s:e])
     24  
     25 '''
     26 match.re.pattern 要匹配的内容
     27 match.string 匹配的字符
     28 s  匹配到内容开始索引
     29 d  匹配到内容结束索引
     30 text[s:e] 匹配字符
     31 '''
     32  
     33 #2.编译表达式
     34  
     35 regexes = [ re.compile(p)
     36             for p in ['this','that']              
     37 ] #把字符转换Regexobject格式
     38  
     39  
     40  
     41 print 'Text: %r
    ' % text #输出text内容
     42  
     43 for regex in regexes:
     44  
     45     print 'Seeking "%s"->' % regex.pattern,  #regex.pattern 要匹配的字符
     46  
     47     if regex.search(text): #在text中搜索this or that
     48  
     49         print 'match!'
     50  
     51     else:
     52  
     53         print 'no match'
     54  
     55 #3.多重匹配
     56  
     57 text = 'abbaaabbbbaaaaa'
     58  
     59 pattern = 'ab'
     60  
     61 for match in re.findall(pattern,text):
     62  
     63     print 'Found: "%s"' % match
     64  
     65 #findall 直接返回字符串
     66  
     67  
     68 for match in re.finditer(pattern,text):
     69     s = match.start()
     70     e = match.end()
     71     print 'Found "%s" at %d:%d' % (text[s:e],s,e)
     72  
     73 #finditer 返回原输入文字在字符串的位置
     74  
     75 #4.模式语法
     76  
     77 def test_patterns(text,patterns=[]):
     78  
     79     for pattern,desc in patterns: 
     80         print 'Pattern %r (%s) 
    ' %(pattern,desc) 
     81         print '   %r' % text
     82         for match in re.finditer(pattern,text):
     83             s = match.start()
     84             e = match.end()
     85             substr = text[s:e] #匹配到的字符
     86             n_backslashes = text[:s].count('\') #查找文本:s坐标之前的包含多少\
     87             prefix = '.' * ( s + n_backslashes ) 
     88             print '    %s%r' % (prefix,substr) 
     89         print
     90     return
     91  
     92 test_patterns('abbaaabbbbaaaaa',
     93             [('ab',"'a' followed by 'b'")]
     94     )
     95  
     96 #贪婪模式 这种模式会减少单个匹配减少
     97 '''
     98      *                '匹配一次到多次'
     99      +                '至少匹配一次到多次'
    100      ?                '只匹配一次'
    101      ab*,             'a followerd by zero or more b'),  #匹配0次或者更多次
    102      ab+,             'a followerd by one or mrore b'),  #最少匹配一次或者更多次
    103      ab?,             'a followerd by zero or one b'),   #匹配0最多一次
    104      ab{3},           'a followerd by three b'),         #最少匹配三次
    105      ab{2,3},           'a followerd by two to three b')   #匹配两至三次
    106  
    107  
    108      ab*?,             'a followerd by zero or more b'),  #匹配0次或者更多次
    109      ab+?,             'a followerd by one or mrore b'),  #最少匹配一次或者更多次
    110      ab??,             'a followerd by zero or one b'),   #匹配0最多一次
    111      ab{3}?,           'a followerd by three b'),         #最少匹配三次
    112      ab{2,3}?,           'a followerd by two to three b')   #匹配两至三次
    113 '''
    114  
    115 #用法如下:
    116  
    117 str = 'absdsdsdsdsd'
    118  
    119 print re.findall('ab*',str)
    120 #['ab']
    121  
    122 print re.findall('ab*?',str)
    123 #['a']
    124  
    125 #5.字符集
    126  
    127 '''
    128 [ab]     'either a or b 匹配a或者b'
    129 a[ab]+   'a followerd by 1 more a or b 匹配一次a、b或者多次 '
    130 a[ab]+?  'a followerd by 1 or more a or b,not greedy 匹配1一次可以匹配多次'
    131 [^]      '不包含内容'
    132 [a-z]    '所有小写ASCII字母' 
    133 [A-Z]    '所有大写写ASCII字母' 
    134 [a-zA-Z] '一个小写和大写的序列'
    135 [A-Za-z] '一个大写小写的序列'
    136 '''
    137 str ='aaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbasbsbab,a_baba'
    138  
    139 print re.findall('[ab]',str)
    140 print re.findall('a[ab]+',str)
    141 print re.findall('a[ab]+?',str)
    142 print re.findall('[^_]',str)
    143  
    144 str = 'China,lovE'
    145  
    146 print re.findall('[a-z][A-Z]',str)  #['vE'] 
    147 print re.findall('[A-Z][a-z]',str)  #['Ch']
    148  
    149 print re.findall('[A-Z][a-z]+',str) #['China']
    150 print re.findall('[a-z][A-Z]+',str) #['vE']
    151  
    152 print re.findall('[A-Z][a-z]*',str) #['China', 'E']
    153 print re.findall('[a-z][A-Z]*',str) #['h', 'i', 'n', 'a', 'l', 'o', 'vE']
    154  
    155 print re.findall('[A-Z][a-z]?',str) #['Ch', 'E']
    156 print re.findall('[a-z][A-Z]?',str) #['h', 'i', 'n', 'a', 'l', 'o', 'vE']
    157  
    158 '''
    159 .      元字符匹配一个字符
    160 a.
    161 b.
    162 a.*b
    163 a.*?b
    164 '''
    165  
    166 c = 'woaizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbbsd'
    167  
    168 print re.findall('a.',c)  #['ai', 'aw', 'as', 'aa', 'ab']
    169 print re.findall('b.',c)  #['b,', 'bs', 'ba', 'bb', 'bb', 'bb', 'bs']
    170 print re.findall('a.*b',c)  #['aizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbb'] #贪婪模式匹配a到b之间的任意字符长度字符
    171 print re.findall('a.*?b',c)  #['aizhongguoawsb', 'asssssssssssssdsdsdsdb', 'aaab'] # ?结束了* 的贪婪模式,
    172                              #它不会到最后一个b再去匹配而且见好就收,匹配可能最短的字符
    173  
    174  
    175 #6.转义码
    176  
    177 '''
    178 转义码                                   含义
    179  \d                                    一个数字
    180  \D                                    一个非数字
    181  \s                                    空白符(制表符、空格、换行符)
    182  \S                                    非空白符(符号、字母、数字)
    183  \w                                    字母数字
    184  \W                                    非字母数字(符号、制表符、空格、换行符)
    185 '''
    186  
    187 #7.锚定
    188  
    189 '''
    190 锚定码                               含义
    191   ^                              字符串或行的开始
    192   $                              字符串或行结束
    193   \A                             字符串开始
    194   \Z                             字符串结束
    195   \b                             一个单词开头或者末尾的空串
    196   \B                             不在一个单词的开头或末尾的空串
    197 '''
    198 #8.限制搜索 match、search
    199  
    200 text = 'This is some text --with punctuation.'
    201  
    202 pattern = 'is'
    203  
    204 print 'Text    :',text
    205 print 'pattern:',pattern
    206  
    207 m = re.match(pattern,text)   #因为match是从字符开头开始匹配 is没有在开头所以没有匹配到.
    208 print 'Match :',m   
    209  
    210 s = re.search(pattern,text) #is在文本中出现了两次所以匹配到内容
    211 print 'Search :',s
    212  
    213 pattern = re.compile(r'w*isw*') #编译规则
    214  
    215 print 'Text:',text
    216  
    217  
    218 pos = 0
    219 while  True:
    220     match = pattern.search(text,pos) #搜索规则
    221     if not match:
    222         break
    223     s = match.start()
    224     e = match.end() 
    225     print '  %d : %d = "%s"' % (s,e-1,text[s:e]) 
    226     pos = e
    227  
    228 #9 用户组解析匹配(任何一个正则都可以为组并嵌套在一个更大的表达式中)
    229 regex = re.compile(r'(tw+)W+(w+)')
    230  
    231 print 'Input  text      :',text
    232  
    233 print 'Pattern          :',regex.pattern
    234  
    235 match = regex.search(text)
    236 print 'Entire match     :',match.group(0) #表示整个表达式的字符串,子组从1开始排序
    237 print 'World start with "t":',match.group(1) #匹配到的第一组
    238 print 'World after "t" word :',match.group(2) #匹配到的第二组
    239  
    240 #python对基本分组进行了扩展 (?P<name>pattern)
    241  
    242 print text
    243 print
    244 for pattern in [ r'^(?P<first_word>w+)',  #组名和正则表达式组成
    245                  r'(?P<last_word>w+)S*$',
    246                  r'(?P<t_word>tw+)W+(?P<other_word>w+)',
    247                  r'(?P<ends_with_t>w+t)',
    248                  ]:
    249     regex = re.compile(pattern)
    250     match = regex.search(text)
    251     print 'Matching "%s"' % pattern
    252     print ' ',match.groups()  #匹配到所有的组的值
    253     print ' ',match.groupdict() #把组名和字串生成字典 
    254     print
    255  
    256 def test_patterns(text,patterns=[]):
    257     '''Given source text and a list of patterns,look for 
    258     matches for each pattern within the text and print
    259     them to stdout.
    260     '''
    261     #look for each pattern in the text and print the resuls
    262  
    263     for pattern,desc in patterns:
    264         print 'Pattern %r (%s)
    ' % (pattern,desc)
    265         print '   %r' % text
    266     for match in re.finditer(pattern,text):
    267         s = match.start()
    268         e = match.end()
    269         prefix = ' ' * (s) #'空格 X 次数'
    270         print '   %s%r%s' % (prefix,text[s:e],' '*(len(text)-e)),
    271         print match.groups()
    272         if match.groupdict():
    273             print '%s%s' % (' ' * (len(text) -s),match,groupdict())
    274             print
    275     return
    276  
    277 print test_patterns(text,[(r'(a(a*)(b*))','a followerd by 0-n a and 0-n b')])
    278  
    279 '''
    280 |       代表左右表达式任意匹配一个,他总是先尝试匹配左边的表达式,一旦成功匹配则
    281 跳过匹配右边的表达式。如果|没有被包括()中,则它的范围是整个正则表达式
    282 ?:pattern
    283 '''
    284  
    285  
    286 #10.搜索选项 - 不区分大小写的匹配
    287 '''
    288 re.IGNORECASE 忽略大小写
    289 '''
    290  
    291 text  = 'This is some text  -- with punctuation.'
    292 pattern = r'Tw+'
    293 with_case = re.compile(pattern)
    294 whitout_case = re.compile(pattern,re.IGNORECASE) #re.IGNORECASE 忽略大小写
    295  
    296 print 'Text: 
      %r' % text
    297 print 'Pattern:
     %s' % pattern
    298 print 'Case-sensitive:'
    299 for match in with_case.findall(text):
    300     print '  %r' % match
    301 print 'Case-insensitive:'
    302 for match in whitout_case.findall(text):
    303     print ' %r' % match
    304  
    305 #11.多行输入
    306 '''
    307 MULTILINE  多行匹配
    308 '''
    309  
    310 text = 'This is some text  -- with punctuation.
    A secone lines.'
    311 pattern = r'(^w+)|(w+S*$)'
    312 single_line = re.compile(pattern)
    313 multiline = re.compile(pattern,re.MULTILINE) 
    314 print 'Text:
     %r' % text
    315 print 'Pattern:
      %s' % pattern
    316 print 'Single Line :'
    317 for match in single_line.findall(text):
    318     print '  %r' % (match,)
    319 print 'MULTILINE  :'
    320 for match in multiline.findall(text):
    321     print '  %r'  % (match,)
    322  
    323 '''
    324 DOTALL 让点字符也可以匹配换行符
    325 '''
    326  
    327 pattern = r'.+'
    328 no_newlines = re.compile(pattern)
    329 dotall = re.compile(pattern,re.DOTALL)
    330  
    331 print 'Text :
       %r' % text
    332 print 'Pattern:
     %s' % pattern
    333 print 'No newlines :'
    334 for match in no_newlines.findall(text):
    335     print '  %r' % match
    336 print 'Dotall    :'
    337 for  match in dotall.findall(text):
    338     print '  %r' % match
    339  
    340 #12 Unicode匹配
    341 '''
    342 re.UNICODE 匹配Unicode
    343 '''
    344  
    345  
    346 import codecs
    347 import sys
    348  
    349 #set standard output encoding to UTF-8
    350  
    351 sys.output = codecs.getwriter('UTF-8')(sys.stdout)
    352  
    353 pattern = ur'w+'
    354 ascii_pattern = re.compile(pattern)
    355 unicde_pattern = re.compile(pattern,re.UNICODE)
    356  
    357 print 'Text    :',text
    358 print 'Pattern :',pattern
    359 print 'ASCII   :',u', '.join(ascii_pattern.findall(text))
    360 print 'Unicode :',u', '.join(unicde_pattern.findall(text))
    361  
    362 '''
    363 re.VERBOSE 让正则更容易读
    364 '''
    365  
    366 address = re.compile(
    367         '''
    368         [wd.+-]+    #username
    369         @ 
    370         ([wd.]+.)+ #domain name prefix
    371         (com|org|edu) #TODO:support more top-level domains
    372         ''',
    373         re.UNICODE | re.VERBOSE)
    374  
    375 candidates = [
    376         u'first.last@example.com',
    377         u'first.last+category@gmail.com',
    378         u'valid-address@mail.example.com',
    379         u'not-valid@example.foo'
    380 ]
    381  
    382 for candidate in candidates:
    383     match = address.search(candidate)
    384     print '%-30s %s' % (candidate,'Matche' if match else 'no match')
    385  
    386  
    387 address = re.compile (
    388     '''
    389     #A name is made up of letters,and may include "."
    390     #for title abbreviations and middle initials.
    391     ((?P<name>
    392         ([w.,]+S+)*[w.,]+)
    393         s*
    394         # Email addresses are wrapped in angle
    395         # brackets: <> but only if a name is 
    396         # found, so keep the start bracket in this
    397         # group.
    398         <
    399     )?  # the entire name is optional
    400      
    401     # the address itself:username@domain.tld
    402     (?P<email>
    403         [wd.+-]+    #username
    404         @ 
    405         ([wd.]+.)+ #domain name prefix
    406         (com|org|edu) #TODO:support more top-level domains
    407     )
    408     >? # optional closeing angle break
    409     ''',
    410     re.UNICODE | re.VERBOSE)
    411  
    412 candidates = [
    413         u'first.last@example.com',
    414         u'first.last+category@gmail.com',
    415         u'valid-address@mail.example.com',
    416         u'not-valid@example.foo'
    417         u'Fist Last <first.last@example.com>'
    418         u'NO Brackets first.last@example',
    419         u'First Last',
    420         u'First Middle Last <first.last@example.com>',
    421         u'First M. Last <first.last@example.com>',
    422         u'<first.last@example.com>',
    423 ]
    424  
    425 for candidate in candidates:
    426     print 'candidate:',candidate
    427     match = address.search(candidate)
    428     if match:
    429         print ' Name:',match.groupdict()['name']
    430         print ' Email:',match.groupdict()['email']
    431     else:
    432         print '   No match'
    433  
    434 '''
    435                     正则表达式标志缩写表
    436  
    437     标志                  缩写               描述
    438  
    439   IGNORECASE              i           忽略大小写
    440   MULTILINE                 m           多行匹配
    441   DOTALL                    s          让点字符也可以匹配换行符
    442   UNICODE                  u          匹配Unicode
    443   VERBOSE                 x          让正则更容易读
    444 在模式中嵌入标签(?imu)会打开相应的选项
    445 '''
    446 text = 'This is  some text -- with punctuation.'
    447 pattern = r'(?i)Tw+'
    448 regex = re.compile(pattern)
    449  
    450 print 'Text   :',text
    451 print 'Pattern    :',pattern
    452 print 'Matches   :',regex.findall(text)
    453  
    454 #13 前向或后向
    455  
    456 address = re.compile(
    457     '''
    458     # A name is made up of letters, and may include "."
    459     # for title abbreviations and middle initials
    460     ((?P<name>
    461         ([w.,]+s+)*[w.,]+
    462         )
    463     s+
    464     )  # name is no longer optional
    465     # LOOKAHEAD
    466     # Email address are wrapped in angle brackets, but only
    467     # if they are both present or neither is .
    468     (?= (<.*>$)
    469         |
    470         ([^<].*[^>]$)
    471     )
    472     <? # optional opening angle bracket
    473  
    474     # The address itself: username@domain.tld
    475     (?P<email>
    476         [wd.+-]+
    477         @
    478         ([wd.]+.)+
    479         (com|org|edu)
    480     )
    481     >?
    482     ''',
    483     re.UNICODE | re.VERBOSE)
    484  
    485 candidates = [
    486     u'First Last <first.last@example.com>',
    487     u'No Brackets first.last@example.com',
    488     u'Open Brackets <first.last@example.com>',
    489     u'Close Brackets first.last@example.com',
    490     ]
    491 for candidate in candidates:
    492     print 'Candidate:',candidate
    493     match = address.search(candidate)
    494     if match:
    495         print ' Name :',match.groupdict()['name']
    496         print ' Email :',match.groupdict()['email']
    497     else:
    498         print '  No match'
    499  
    500 #自动忽略系统常用的noreply邮件地址
    501 '''
    502 (?!noreply@.*$) 忽略这个邮件地址
    503 (?<!noreply>)  两种模式 写在username之前不会向后断言 
    504 (?<=pattern)   用肯定向后断言查找符合某个模式的文本 
    505 '''
    506 address = re.compile(
    507     '''
    508     ^
    509     # An address: username@domain.tld
    510  
    511     # Ignore noreply address
    512     (?!noreply@.*$)
    513  
    514     [wd.+-]+     # username
    515     @
    516     ([wd.]+.)+  # domain name prefix
    517     (com|org|edu)  # limit the allowed top-level domains
    518  
    519     $
    520     ''',
    521     re.UNICODE | re.VERBOSE)
    522  
    523 candidates = [
    524  
    525     u'first.last@example.com',
    526     u'noreply@example.com',
    527 ]
    528  
    529 for candidate in candidates:
    530     print 'Candidate:',candidate
    531     match = address.search(candidate)
    532     if match:
    533         print '  Match:',candidate[match.start():match.end()]
    534     else:
    535         print '  No match'
    536  
    537 twitter = re.compile(
    538     '''
    539     # A twitter handle: @username
    540     (?<=@)
    541     ([wd_]+)   # username
    542     ''',
    543     re.UNICODE | re.VERBOSE)
    544  
    545 text = ''' This text includes two Twitter handles.
    546 One for @TheSF,and one for the author,@doughellmann.
    547 '''
    548 print text
    549 for match in twitter.findall(text):
    550     print 'handle:',match
    551  
    552 #14 自引用表达式 #可以把表达式编号后面来引用
    553  
    554 address = re.compile(
    555     '''
    556     (w+)          # first name
    557     s+
    558     (([w.]+)s+)?  # optional middle name or initial
    559     (w+)           # last name
    560  
    561     s+
    562     <
    563  
    564     # The address: first_name.last_name@domain.tld
    565     (?P<email>
    566         1         #first name
    567         .
    568         4         #last name
    569         @
    570         ([wd.]+.)+
    571         (com|org|edu)
    572         )            
    573     >
    574     ''',
    575     re.UNICODE | re.VERBOSE | re.IGNORECASE)
    576  
    577 candidates = [
    578     u'First Last <first.last@example.com>',
    579     u'Different Name <first.last.example.com>',
    580     u'First Middle Last <first.last@example.com>', 
    581 ]
    582 for candidate in candidates:
    583     print 'Candidate:',candidate
    584     match = address.search(candidate)
    585 if match:
    586     print '  Match name:',match.group(1),match.group(4)
    587 else:
    588     print ' No match'
    589  
    590 #正则表达式解析包括一个扩展,可以使用(?P=name)指示表达式先前匹配的一个命名组的值.
    591  
    592 address = re.compile(
    593     '''
    594  
    595     # The regular name
    596     (?P<first_name>w+)
    597     s+
    598     (([w.]+)s+)?
    599     (?P<last_name>w+)
    600     s+
    601     <
    602  
    603     # The address: first_name.last_name@domain.tld
    604     (?P<email>
    605         (?P=first_name)
    606         .
    607         (?P=last_name)
    608         @
    609         ([wd.]+.)+
    610         (com|org|edu)
    611         )
    612     >
    613     ''',
    614     re.UNICODE | re.VERBOSE | re.IGNORECASE)
    615  
    616 candidates = [
    617     u'First last <first.last@example.com>',
    618     u'Different Name <first.last@example.com>',
    619     u'First Middle last <first.last@example.com>',
    620     u'First M. Last<first.last@example.com>',
    621 ]
    622  
    623 for candidate in candidates:
    624     print 'Candidate:',candidate
    625     match = address.search(candidate)
    626     if match:
    627         print '  Match name:',match.groupdict()['first_name']
    628         print match.groupdict()['last_name']
    629         print '  Match email:',match.groupdict()['email']
    630  
    631     else:
    632         print 'No match'
    633  
    634 #15 用模式修改字符串
    635 '''
    636 re支持使用正则表达式作为搜索机制来修改文本,而且可以替换可以引用正则表达式中的匹配组作为替换文本的一部分。
    637 '''
    638 bold = re.compile(r'*{2}(.*?)*{2}')
    639 text = 'Make this **bold**. This **too**.'
    640 print 'Text:',text
    641 print 'Bold:',bold.sub(r'<b>1</b>',text)
    642  
    643 '''
    644 使用命名组来替换
    645 count 来限制替换次数
    646 sbun 工作原理和sub相似 subn同时返回修改后的字符串和完成的替换次数
    647 '''
    648  
    649 bold = re.compile(r'*{2}(?P<bold_text>.*?)*{2}',re.UNICODE,)
    650  
    651 print 'Text:',text
    652 print 'Bold:',bold.sub(r'<b>g<bold_text></b>',text,count=1)
    653  
    654 #16 利用模式拆分
    655  
    656 '''
    657 str.split() 是分解字符串来完成解析的最常用方法之一,它只是支持字面值得作为分隔符
    658 '''
    659  
    660 text = '''Paragraph one
    661 one tuo lines.
    662  
    663 Paragraph two.
    664  
    665 Paragraph three.'''
    666  
    667 print 'With findall:'
    668 for num,para in enumerate(re.findall(r'.+?
    {2,}|$',
    669                                     text,
    670                                     flags = re.DOTALL)
    671                             ):
    672     print num,repr(para)
    673     print
    674  
    675 print 
    676 print 'With split:'
    677 for num,para in enumerate(re.split(r'
    {2,}',text)):
    678     print num,repr(para)
    679     print
  • 相关阅读:
    基础学习总结(四)---内存获取、XML之PULL解析
    基础学习总结(三)--文本、SD卡数据读写
    基础学习总结(二)---认识布局与配置测试环境
    基础学习总结(一)--工程结构与打包过程
    StreamReader和StreamWrite与FileStream区别
    redis笔记
    linux 下文件显示行数
    php判断页面访问是移动端还是pc端
    redis
    判断链接是否为图片
  • 原文地址:https://www.cnblogs.com/carl-angela/p/5499421.html
Copyright © 2011-2022 走看看