------------恢复内容开始------------
import re
text = "Tom is 8 years old. Mike is 25 years old."
pattern = re.compile('\d+') #模式字符串编译
pattern.findall(text)
Out[5]: ['8', '25']
text
Out[6]: 'Tom is 8 years old. Mike is 25 years old.'
re.findall('\d+',text)
Out[7]: ['8', '25']
import re
s = "\\author:Tom"
pattern = re.compile('\\author')
pattern.findall(s)
Out[5]: []
pattern = re.compile('\\\\author')
pattern.findall(s)
import re
text = 'Tom is 8 years old. Mike is 35 years old. Peter is 68 years old.'
pattern = re.compile(r'\d+')
pattern.findall(text)
Out[5]: ['8', '35', '68']
p_name = re.compile(r'[A-Z]\w+')#匹配模式
p_name.findall(text)
Out[7]: ['Tom', 'Mike', 'Peter']
Python正则模块之MatchObject
In[2]: import re
In[3]: text = 'Tom is 8 years old. Jarry is 23 years old.'
In[4]: pattern = re.compile(r'\d+') #编译一个模式
In[5]: pattern.findall(text)#找到所有匹配项
Out[5]: ['8', '23']
In[6]: pattern = re.compile(r'(\d+).*?()')#编译模式把第一个\d+放入一个分组里
In[7]: pattern = re.compile(r'(\d+).*?(\d+)')#编译模式把第一个\d+放入一个分组里,?避免贪婪模式匹配
In[8]: m = pattern.search(text) #搜索
In[9]: m
Out[9]: <re.Match object; span=(7, 31), match='8 years old. Jarry is 23'>
In[10]: m.group()
Out[10]: '8 years old. Jarry is 23'
In[11]: m.group(0)#写0表示整体,用括号进行编组,不管写不写0默认返回整体
Out[11]: '8 years old. Jarry is 23'
In[12]: m.group(1)#模式匹配第一个匹配的值
Out[12]: '8'
In[13]: m.group(2)#第二个匹配的值
Out[13]: '23'
In[14]: m.start(1) #第一个分组数字8的文本对应下标
Out[14]: 7
In[15]: m.end(1)#第一个文本8在哪个位置终止
Out[15]: 8
In[16]: m.start(2)#看第二个分组所对应的值23
Out[16]: 29
In[17]: m.end(2)
Out[17]: 31
In[18]: m.group()
Out[18]: '8 years old. Jarry is 23'
In[19]: m.groups()#返回匹配单个结果
Out[19]: ('8', '23')
In[20]: m.groupdict()
Out[20]: {}
In[2]: import re
In[3]: pattern = re.compile(r'(\w+) (\w+)')
In[4]: text = "Beautiful is better than ugly."
In[5]: pattern.findall(text)#两两匹配
Out[5]: [('Beautiful', 'is'), ('better', 'than')]
Group编组
创建子正则以应用量词
In[2]: import re
In[3]: re.search(r'ab+c','ababc')#把b当成一次或多次
Out[3]: <re.Match object; span=(2, 5), match='abc'>
In[4]: re.search(r'(ab)+c','ababc')
Out[4]: <re.Match object; span=(0, 5), match='ababc'>
限制备选项范围
In[5]: re.search(r'Center|re','Center')#匹配一个叫Center的单词
Out[5]: <re.Match object; span=(0, 6), match='Center'>
In[6]: re.search(r'Center|re','Centre')
Out[6]: <re.Match object; span=(4, 6), match='re'>
In[7]: re.search(r'Cent(er|re)','Centre')
Out[7]: <re.Match object; span=(0, 6), match='Centre'>
重用正则模式中提取的内容
In[8]: re.search(r'(\w+) \1','hello world')#当前位置重现第一个编组
In[9]: re.search(r'(\w+) \1','hello hello world')
Out[9]: <_sre.SRE_Match object; span=(0, 11), match='hello hello'>
引用
In[2]: import re
In[3]: text = "Tom:98"
In[4]: pattern = re.compile(r'(\w+):(\d+)')#\w若干个字母字符,\d若干个数字
In[5]: m = pattern.search(text)
In[6]: m.group()
Out[6]: 'Tom:98'
In[7]: m.groups()
Out[7]: ('Tom', '98')
In[8]: m.group(1)#第一个分组匹配啥
Out[8]: 'Tom'
In[11]: pattern = re.compile(r'(?P<name>\w+):(?P<score>\d+)')
In[9]: m = pattern.search(text)
In[10]: m.group()
Out[10]: 'Tom:98'
In[11]: m.group(1)
Out[11]: 'Tom'
In[12]: m.group('name')
Out[12]: 'Tom'
In[13]: m.group('score')
Out[13]: '98'
综合应用
切割
>>> import re#导入模块
>>> text = 'Beautiful is better than ugly.\nExplicit is better than implicit.\nSimple is better than complex'#声明文本
>>> re.compile(r'\n')#想按\n切割内容
re.compile('\\n')
>>> p = re.compile(r'\n')#想按\n切割内容
>>> p.split(text)
['Beautiful is better than ugly.', 'Explicit is better than implicit.', 'Simple is better than complex']
>>> re.split(r'\n',text)#第二种方法
['Beautiful is better than ugly.', 'Explicit is better than implicit.', 'Simple is better than complex']
>>> re.split(r'\W','Good morning')#小写\w是字符A-Z大小写以及数字0-9包括下划线,大写\W是反过来除了这些字符以外的,结果是以空格拆分
['Good', 'morning']
>>> re.split(r'-','Good-morning')
['Good', 'morning']
>>> re.split(r'(-)','Good-morning')#把-也放入切割里面
['Good', '-', 'morning']
>>> text #以\n切割最大切割两个
'Beautiful is better than ugly.\nExplicit is better than implicit.\nSimple is better than complex'
>>> re.split(r'\n', text, 2)#前面切割两个,后面保留切割整体
['Beautiful is better than ugly.', 'Explicit is better than implicit.', 'Simple is better than complex']
>>> re.split(r'\n', text, 1)
['Beautiful is better than ugly.', 'Explicit is better than implicit.\nSimple is better than complex']
替换
>>> ords = 'ORD000 ORD001 ORD003'
>>> re.sub(r'\d+', '-', ords)#\d是找数字,\d+一次或多次的数字,想替换成-,ords变量里替换
'ORD- ORD- ORD-'
>>> text = 'Beautiful is *better* than ugly'
>>> re.sub(r'\*(.*?)\*','<strong></strong>', text)#把*号替换
'Beautiful is <strong></strong> than ugly'
>>> re.sub(r'\*(.*?)\*','<strong>\g<1></strong>', text)#定义分组引用保留原有的
'Beautiful is <strong>better</strong> than ugly'
>>> re.sub(r'\*(?P<html>.*?)\*','<strong>\g<1></strong>', text)
'Beautiful is <strong>better</strong> than ugly'
>>> ords
'ORD000 ORD001 ORD003'
>>> re.sub(r'([A-Z]+)(\d+)','\g<2>-\g<1>',ords) #A-Z重复若干次,数字\d+出现若干次,想换成先是数字-再加原来字母
'000-ORD 001-ORD 003-ORD'
>>> re.subn(r'([A-Z]+)(\d+)','\g<2>-\g<1>',ords) #告诉结果后面总共替换几次
('000-ORD 001-ORD 003-ORD', 3)
In[2]: import re
In[3]: text = 'Python python PYTHON'
In[5]: re.search(r'python',text)
Out[5]: <re.Match object; span=(7, 13), match='python'>
In[6]: re.findall(r'python',text)#找内容
Out[6]: ['python']
In[7]: re.findall(r'python',text,re.I)#找所有
Out[7]: ['Python', 'python', 'PYTHON']
In[2]: import re
In[3]: re.findall(r'^<html>','\n<html>')
Out[3]: []
In[4]: re.findall(r'^<html>','\n<html>',re.M)
Out[4]: ['<html>']
In[5]: re.findall(r'\d(.)','1\ne')
Out[5]: []
In[6]: re.findall(r'\d(.)','1\ne',re.S)
Out[6]: ['\n']
------------恢复内容结束------------