1 #!/usr/bin/env python 2 #-*- coding:UTF-8 -*- 3 ##################################################### 4 # Author: sunfx xingrhce@163.com 5 # Last modified: 2014/11/18 6 # Filename: re.py 7 # Q Q 群: 236147801 8 ##################################################### 9 10 import re 11 12 #1.查找文本中的字符 13 14 pattern = 'this' 15 text = 'Does this text match the pattern?' 16 17 match = re.search(pattern,text) 18 19 s = match.start() 20 e = match.end() 21 22 print 'Found "%s" in "%s" from %d to %d ("%s")' % 23 (match.re.pattern,match.string,s,e,text[s:e]) 24 25 ''' 26 match.re.pattern 要匹配的内容 27 match.string 匹配的字符 28 s 匹配到内容开始索引 29 d 匹配到内容结束索引 30 text[s:e] 匹配字符 31 ''' 32 33 #2.编译表达式 34 35 regexes = [ re.compile(p) 36 for p in ['this','that'] 37 ] #把字符转换Regexobject格式 38 39 40 41 print 'Text: %r ' % text #输出text内容 42 43 for regex in regexes: 44 45 print 'Seeking "%s"->' % regex.pattern, #regex.pattern 要匹配的字符 46 47 if regex.search(text): #在text中搜索this or that 48 49 print 'match!' 50 51 else: 52 53 print 'no match' 54 55 #3.多重匹配 56 57 text = 'abbaaabbbbaaaaa' 58 59 pattern = 'ab' 60 61 for match in re.findall(pattern,text): 62 63 print 'Found: "%s"' % match 64 65 #findall 直接返回字符串 66 67 68 for match in re.finditer(pattern,text): 69 s = match.start() 70 e = match.end() 71 print 'Found "%s" at %d:%d' % (text[s:e],s,e) 72 73 #finditer 返回原输入文字在字符串的位置 74 75 #4.模式语法 76 77 def test_patterns(text,patterns=[]): 78 79 for pattern,desc in patterns: 80 print 'Pattern %r (%s) ' %(pattern,desc) 81 print ' %r' % text 82 for match in re.finditer(pattern,text): 83 s = match.start() 84 e = match.end() 85 substr = text[s:e] #匹配到的字符 86 n_backslashes = text[:s].count('\') #查找文本:s坐标之前的包含多少\ 87 prefix = '.' * ( s + n_backslashes ) 88 print ' %s%r' % (prefix,substr) 89 print 90 return 91 92 test_patterns('abbaaabbbbaaaaa', 93 [('ab',"'a' followed by 'b'")] 94 ) 95 96 #贪婪模式 这种模式会减少单个匹配减少 97 ''' 98 * '匹配一次到多次' 99 + '至少匹配一次到多次' 100 ? '只匹配一次' 101 ab*, 'a followerd by zero or more b'), #匹配0次或者更多次 102 ab+, 'a followerd by one or mrore b'), #最少匹配一次或者更多次 103 ab?, 'a followerd by zero or one b'), #匹配0最多一次 104 ab{3}, 'a followerd by three b'), #最少匹配三次 105 ab{2,3}, 'a followerd by two to three b') #匹配两至三次 106 107 108 ab*?, 'a followerd by zero or more b'), #匹配0次或者更多次 109 ab+?, 'a followerd by one or mrore b'), #最少匹配一次或者更多次 110 ab??, 'a followerd by zero or one b'), #匹配0最多一次 111 ab{3}?, 'a followerd by three b'), #最少匹配三次 112 ab{2,3}?, 'a followerd by two to three b') #匹配两至三次 113 ''' 114 115 #用法如下: 116 117 str = 'absdsdsdsdsd' 118 119 print re.findall('ab*',str) 120 #['ab'] 121 122 print re.findall('ab*?',str) 123 #['a'] 124 125 #5.字符集 126 127 ''' 128 [ab] 'either a or b 匹配a或者b' 129 a[ab]+ 'a followerd by 1 more a or b 匹配一次a、b或者多次 ' 130 a[ab]+? 'a followerd by 1 or more a or b,not greedy 匹配1一次可以匹配多次' 131 [^] '不包含内容' 132 [a-z] '所有小写ASCII字母' 133 [A-Z] '所有大写写ASCII字母' 134 [a-zA-Z] '一个小写和大写的序列' 135 [A-Za-z] '一个大写小写的序列' 136 ''' 137 str ='aaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbasbsbab,a_baba' 138 139 print re.findall('[ab]',str) 140 print re.findall('a[ab]+',str) 141 print re.findall('a[ab]+?',str) 142 print re.findall('[^_]',str) 143 144 str = 'China,lovE' 145 146 print re.findall('[a-z][A-Z]',str) #['vE'] 147 print re.findall('[A-Z][a-z]',str) #['Ch'] 148 149 print re.findall('[A-Z][a-z]+',str) #['China'] 150 print re.findall('[a-z][A-Z]+',str) #['vE'] 151 152 print re.findall('[A-Z][a-z]*',str) #['China', 'E'] 153 print re.findall('[a-z][A-Z]*',str) #['h', 'i', 'n', 'a', 'l', 'o', 'vE'] 154 155 print re.findall('[A-Z][a-z]?',str) #['Ch', 'E'] 156 print re.findall('[a-z][A-Z]?',str) #['h', 'i', 'n', 'a', 'l', 'o', 'vE'] 157 158 ''' 159 . 元字符匹配一个字符 160 a. 161 b. 162 a.*b 163 a.*?b 164 ''' 165 166 c = 'woaizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbbsd' 167 168 print re.findall('a.',c) #['ai', 'aw', 'as', 'aa', 'ab'] 169 print re.findall('b.',c) #['b,', 'bs', 'ba', 'bb', 'bb', 'bb', 'bs'] 170 print re.findall('a.*b',c) #['aizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbb'] #贪婪模式匹配a到b之间的任意字符长度字符 171 print re.findall('a.*?b',c) #['aizhongguoawsb', 'asssssssssssssdsdsdsdb', 'aaab'] # ?结束了* 的贪婪模式, 172 #它不会到最后一个b再去匹配而且见好就收,匹配可能最短的字符 173 174 175 #6.转义码 176 177 ''' 178 转义码 含义 179 d 一个数字 180 D 一个非字符 181 s 空白符(制表符、空格、换行符) 182 S 非空白符(符号、字母、数字) 183 w 字母数字 184 W 非字母数字(符号、制表符、空格、换行符) 185 ''' 186 187 #7.锚定 188 189 ''' 190 锚定码 含义 191 ^ 字符串或行的开始 192 $ 字符串或行结束 193 A 字符串开始 194 字符串结束 195 一个单词开头或者末尾的空串 196 B 不在一个单词的开头活末尾的空串 197 ''' 198 #8.限制搜索 match、search 199 200 text = 'This is some text --with punctuation.' 201 202 pattern = 'is' 203 204 print 'Text :',text 205 print 'pattern:',pattern 206 207 m = re.match(pattern,text) #因为match是从字符开头开始匹配 is没有在开头所以没有匹配到. 208 print 'Match :',m 209 210 s = re.search(pattern,text) #is在文本中出现了两次所以匹配到内容 211 print 'Search :',s 212 213 pattern = re.compile(r'w*isw*') #编译规则 214 215 print 'Text:',text 216 217 218 pos = 0 219 while True: 220 match = pattern.search(text,pos) #搜索规则 221 if not match: 222 break 223 s = match.start() 224 e = match.end() 225 print ' %d : %d = "%s"' % (s,e-1,text[s:e]) 226 pos = e 227 228 #9 用户组解析匹配(任何一个正则都可以为组并嵌套在一个更大的表达式中) 229 regex = re.compile(r'(tw+)W+(w+)') 230 231 print 'Input text :',text 232 233 print 'Pattern :',regex.pattern 234 235 match = regex.search(text) 236 print 'Entire match :',match.group(0) #表示整个表达式的字符串,子组从1开始排序 237 print 'World start with "t":',match.group(1) #匹配到的第一组 238 print 'World after "t" word :',match.group(2) #匹配到的第二组 239 240 #python对基本分组进行了扩展 (?P<name>pattern) 241 242 print text 243 print 244 for pattern in [ r'^(?P<first_word>w+)', #组名和正则表达式组成 245 r'(?P<last_word>w+)S*$', 246 r'(?P<t_word>tw+)W+(?P<other_word>w+)', 247 r'(?P<ends_with_t>w+t)', 248 ]: 249 regex = re.compile(pattern) 250 match = regex.search(text) 251 print 'Matching "%s"' % pattern 252 print ' ',match.groups() #匹配到所有的组的值 253 print ' ',match.groupdict() #把组名和字串生成字典 254 print 255 256 def test_patterns(text,patterns=[]): 257 '''Given source text and a list of patterns,look for 258 matches for each pattern within the text and print 259 them to stdout. 260 ''' 261 #look for each pattern in the text and print the resuls 262 263 for pattern,desc in patterns: 264 print 'Pattern %r (%s) ' % (pattern,desc) 265 print ' %r' % text 266 for match in re.finditer(pattern,text): 267 s = match.start() 268 e = match.end() 269 prefix = ' ' * (s) #'空格 X 次数' 270 print ' %s%r%s' % (prefix,text[s:e],' '*(len(text)-e)), 271 print match.groups() 272 if match.groupdict(): 273 print '%s%s' % (' ' * (len(text) -s),match,groupdict()) 274 print 275 return 276 277 print test_patterns(text,[(r'(a(a*)(b*))','a followerd by 0-n a and 0-n b')]) 278 279 ''' 280 | 代表左右表达式任意匹配一个,他总是先尝试匹配左边的表达式,一旦成功匹配则 281 跳过匹配右边的表达式。如果|没有被包括()中,则它的范围是整个正则表达式 282 ?:pattern 283 ''' 284 285 286 #10.搜索选项 - 不区分大小写的匹配 287 ''' 288 re.IGNORECASE 忽略大小写 289 ''' 290 291 text = 'This is some text -- with punctuation.' 292 pattern = r'Tw+' 293 with_case = re.compile(pattern) 294 whitout_case = re.compile(pattern,re.IGNORECASE) #re.IGNORECASE 忽略大小写 295 296 print 'Text: %r' % text 297 print 'Pattern: %s' % pattern 298 print 'Case-sensitive:' 299 for match in with_case.findall(text): 300 print ' %r' % match 301 print 'Case-insensitive:' 302 for match in whitout_case.findall(text): 303 print ' %r' % match 304 305 #11.多行输入 306 ''' 307 MULTILINE 多行匹配 308 ''' 309 310 text = 'This is some text -- with punctuation. A secone lines.' 311 pattern = r'(^w+)|(w+S*$)' 312 single_line = re.compile(pattern) 313 multiline = re.compile(pattern,re.MULTILINE) 314 print 'Text: %r' % text 315 print 'Pattern: %s' % pattern 316 print 'Single Line :' 317 for match in single_line.findall(text): 318 print ' %r' % (match,) 319 print 'MULTILINE :' 320 for match in multiline.findall(text): 321 print ' %r' % (match,) 322 323 ''' 324 DOTALL 让点字符也可以匹配换行符 325 ''' 326 327 pattern = r'.+' 328 no_newlines = re.compile(pattern) 329 dotall = re.compile(pattern,re.DOTALL) 330 331 print 'Text : %r' % text 332 print 'Pattern: %s' % pattern 333 print 'No newlines :' 334 for match in no_newlines.findall(text): 335 print ' %r' % match 336 print 'Dotall :' 337 for match in dotall.findall(text): 338 print ' %r' % match 339 340 #12 Unicode匹配 341 ''' 342 re.UNICODE 匹配Unicode 343 ''' 344 345 346 import codecs 347 import sys 348 349 #set standard output encoding to UTF-8 350 351 sys.output = codecs.getwriter('UTF-8')(sys.stdout) 352 353 pattern = ur'w+' 354 ascii_pattern = re.compile(pattern) 355 unicde_pattern = re.compile(pattern,re.UNICODE) 356 357 print 'Text :',text 358 print 'Pattern :',pattern 359 print 'ASCII :',u', '.join(ascii_pattern.findall(text)) 360 print 'Unicode :',u', '.join(unicde_pattern.findall(text)) 361 362 ''' 363 re.VERBOSE 让正则更容易读 364 ''' 365 366 address = re.compile( 367 ''' 368 [wd.+-]+ #username 369 @ 370 ([wd.]+.)+ #domain name prefix 371 (com|org|edu) #TODO:support more top-level domains 372 ''', 373 re.UNICODE | re.VERBOSE) 374 375 candidates = [ 376 u'first.last@example.com', 377 u'first.last+category@gmail.com', 378 u'valid-address@mail.example.com', 379 u'not-valid@example.foo' 380 ] 381 382 for candidate in candidates: 383 match = address.search(candidate) 384 print '%-30s %s' % (candidate,'Matche' if match else 'no match') 385 386 387 address = re.compile ( 388 ''' 389 #A name is made up of letters,and may include "." 390 #for title abbreviations and middle initials. 391 ((?P<name> 392 ([w.,]+S+)*[w.,]+) 393 s* 394 # Email addresses are wrapped in angle 395 # brackets: <> but only if a name is 396 # found, so keep the start bracket in this 397 # group. 398 < 399 )? # the entire name is optional 400 401 # the address itself:username@domain.tld 402 (?P<email> 403 [wd.+-]+ #username 404 @ 405 ([wd.]+.)+ #domain name prefix 406 (com|org|edu) #TODO:support more top-level domains 407 ) 408 >? # optional closeing angle break 409 ''', 410 re.UNICODE | re.VERBOSE) 411 412 candidates = [ 413 u'first.last@example.com', 414 u'first.last+category@gmail.com', 415 u'valid-address@mail.example.com', 416 u'not-valid@example.foo' 417 u'Fist Last <first.last@example.com>' 418 u'NO Brackets first.last@example', 419 u'First Last', 420 u'First Middle Last <first.last@example.com>', 421 u'First M. Last <first.last@example.com>', 422 u'<first.last@example.com>', 423 ] 424 425 for candidate in candidates: 426 print 'candidate:',candidate 427 match = address.search(candidate) 428 if match: 429 print ' Name:',match.groupdict()['name'] 430 print ' Email:',match.groupdict()['email'] 431 else: 432 print ' No match' 433 434 ''' 435 正则表达式标志缩写表 436 437 标志 缩写 描述 438 439 IGNORECASE i 忽略大小写 440 MULTILINE m 多行匹配 441 DOTALL s 让点字符也可以匹配换行符 442 UNICODE u 匹配Unicode 443 VERBOSE x 让正则更容易读 444 在模式中嵌入标签(?imu)会打开相应的选项 445 ''' 446 text = 'This is some text -- with punctuation.' 447 pattern = r'(?i)Tw+' 448 regex = re.compile(pattern) 449 450 print 'Text :',text 451 print 'Pattern :',pattern 452 print 'Matches :',regex.findall(text) 453 454 #13 前向或后向 455 456 address = re.compile( 457 ''' 458 # A name is made up of letters, and may include "." 459 # for title abbreviations and middle initials 460 ((?P<name> 461 ([w.,]+s+)*[w.,]+ 462 ) 463 s+ 464 ) # name is no longer optional 465 # LOOKAHEAD 466 # Email address are wrapped in angle brackets, but only 467 # if they are both present or neither is . 468 (?= (<.*>$) 469 | 470 ([^<].*[^>]$) 471 ) 472 <? # optional opening angle bracket 473 474 # The address itself: username@domain.tld 475 (?P<email> 476 [wd.+-]+ 477 @ 478 ([wd.]+.)+ 479 (com|org|edu) 480 ) 481 >? 482 ''', 483 re.UNICODE | re.VERBOSE) 484 485 candidates = [ 486 u'First Last <first.last@example.com>', 487 u'No Brackets first.last@example.com', 488 u'Open Brackets <first.last@example.com>', 489 u'Close Brackets first.last@example.com', 490 ] 491 for candidate in candidates: 492 print 'Candidate:',candidate 493 match = address.search(candidate) 494 if match: 495 print ' Name :',match.groupdict()['name'] 496 print ' Email :',match.groupdict()['email'] 497 else: 498 print ' No match' 499 500 #自动忽略系统常用的noreply邮件地址 501 ''' 502 (?!noreply@.*$) 忽略这个邮件地址 503 (?<!noreply>) 两种模式 写在username之前不会向后断言 504 (?<=pattern) 用肯定向后断言查找符合某个模式的文本 505 ''' 506 address = re.compile( 507 ''' 508 ^ 509 # An address: username@domain.tld 510 511 # Ignore noreply address 512 (?!noreply@.*$) 513 514 [wd.+-]+ # username 515 @ 516 ([wd.]+.)+ # domain name prefix 517 (com|org|edu) # limit the allowed top-level domains 518 519 $ 520 ''', 521 re.UNICODE | re.VERBOSE) 522 523 candidates = [ 524 525 u'first.last@example.com', 526 u'noreply@example.com', 527 ] 528 529 for candidate in candidates: 530 print 'Candidate:',candidate 531 match = address.search(candidate) 532 if match: 533 print ' Match:',candidate[match.start():match.end()] 534 else: 535 print ' No match' 536 537 twitter = re.compile( 538 ''' 539 # A twitter handle: @username 540 (?<=@) 541 ([wd_]+) # username 542 ''', 543 re.UNICODE | re.VERBOSE) 544 545 text = ''' This text includes two Twitter handles. 546 One for @TheSF,and one for the author,@doughellmann. 547 ''' 548 print text 549 for match in twitter.findall(text): 550 print 'handle:',match 551 552 #14 自引用表达式 #可以把表达式编号后面来引用 553 554 address = re.compile( 555 ''' 556 (w+) # first name 557 s+ 558 (([w.]+)s+)? # optional middle name or initial 559 (w+) # last name 560 561 s+ 562 < 563 564 # The address: first_name.last_name@domain.tld 565 (?P<email> 566 1 #first name 567 . 568 4 #last name 569 @ 570 ([wd.]+.)+ 571 (com|org|edu) 572 ) 573 > 574 ''', 575 re.UNICODE | re.VERBOSE | re.IGNORECASE) 576 577 candidates = [ 578 u'First Last <first.last@example.com>', 579 u'Different Name <first.last.example.com>', 580 u'First Middle Last <first.last@example.com>', 581 ] 582 for candidate in candidates: 583 print 'Candidate:',candidate 584 match = address.search(candidate) 585 if match: 586 print ' Match name:',match.group(1),match.group(4) 587 else: 588 print ' No match' 589 590 #正则表达式解析包括一个扩展,可以使用(?P=name)指示表达式先前匹配的一个命名组的值. 591 592 address = re.compile( 593 ''' 594 595 # The regular name 596 (?P<first_name>w+) 597 s+ 598 (([w.]+)s+)? 599 (?P<last_name>w+) 600 s+ 601 < 602 603 # The address: first_name.last_name@domain.tld 604 (?P<email> 605 (?P=first_name) 606 . 607 (?P=last_name) 608 @ 609 ([wd.]+.)+ 610 (com|org|edu) 611 ) 612 > 613 ''', 614 re.UNICODE | re.VERBOSE | re.IGNORECASE) 615 616 candidates = [ 617 u'First last <first.last@example.com>', 618 u'Different Name <first.last@example.com>', 619 u'First Middle last <first.last@example.com>', 620 u'First M. Last<first.last@example.com>', 621 ] 622 623 for candidate in candidates: 624 print 'Candidate:',candidate 625 match = address.search(candidate) 626 if match: 627 print ' Match name:',match.groupdict()['first_name'] 628 print match.groupdict()['last_name'] 629 print ' Match email:',match.groupdict()['email'] 630 631 else: 632 print 'No match' 633 634 #15 用模式修改字符串 635 ''' 636 re支持使用正则表达式作为搜索机制来修改文本,而且可以替换可以引用正则表达式中的匹配组作为替换文本的一部分。 637 ''' 638 bold = re.compile(r'*{2}(.*?)*{2}') 639 text = 'Make this **bold**. This **too**.' 640 print 'Text:',text 641 print 'Bold:',bold.sub(r'<b>1</b>',text) 642 643 ''' 644 使用命名组来替换 645 count 来限制替换次数 646 sbun 工作原理和sub相似 subn同时返回修改后的字符串和完成的替换次数 647 ''' 648 649 bold = re.compile(r'*{2}(?P<bold_text>.*?)*{2}',re.UNICODE,) 650 651 print 'Text:',text 652 print 'Bold:',bold.sub(r'<b>g<bold_text></b>',text,count=1) 653 654 #16 利用模式拆分 655 656 ''' 657 str.split() 是分解字符串来完成解析的最常用方法之一,它只是支持字面值得作为分隔符 658 ''' 659 660 text = '''Paragraph one 661 one tuo lines. 662 663 Paragraph two. 664 665 Paragraph three.''' 666 667 print 'With findall:' 668 for num,para in enumerate(re.findall(r'.+? {2,}|$', 669 text, 670 flags = re.DOTALL) 671 ): 672 print num,repr(para) 673 print 674 675 print 676 print 'With split:' 677 for num,para in enumerate(re.split(r' {2,}',text)): 678 print num,repr(para) 679 print