正则表达式
一、普通字符
. 通配符,一个.只匹配一个字符
匹配任意除换行符"\n"外的字符(在DOTALL模式中也能匹配换行符)
>>> import re >>> re.findall("abcd","abcdrbnmjfsdsaeedsss") ['abcd'] >>> re.findall("a..d","abcdrbnmjfsdsaeedsss") ['abcd', 'aeed']
^ 以什么开头
>>> re.findall("^a..d","abcdrbnmjfsdsaeedsss") ['abcd']
$ 以什么结尾
>>> re.findall("a..d$","abcdajhd") ['ajhd']
* 代表匹配*前面的字符 重复0到无穷次
>>> re.findall("^g*","ggggggggsdfsdf") ['gggggggg'] >>> re.findall("^g*","gggdfsdf") ['ggg']
>>> re.findall("gou*","gossee") ['go'] >>> re.findall("gou*","gosseegouuusssdd") ['go', 'gouuu']
+ 代表匹配+前面的字符 重复1到无穷次
>>> re.findall("gou+","gossee") [] >>> re.findall("gou+","gosseegouuusssdd") ['gouuu']
*和+属于贪婪匹配
? 代表匹配?前面的字符的0次或者是1次
>>> re.findall("gou?","gossee") ['go'] >>> re.findall("gou?","gosseegouuusssdd") ['go', 'gou']
{}可以指定重复几次
{0,} 代表重复{}前面字符的0到无穷次,相当于*
{1,} 代表重复{}前面字符的1到无穷次,相当于+
{0,1}代表重复{}前面字符的0到1次,相当于?
{6}代表重复{}前面字符的6次
{1,3}代表重复{}前面字符的1次到3次
>>> re.findall("gou{3}","gosseegouuusssdd") ['gouuu'] >>> re.findall("gou{0,3}","gosseegouuusssdd") ['go', 'gouuu'] >>> re.findall("gou{0,4}","gosseegouuusssdd") ['go', 'gouuu'] >>> re.findall("gou{4}","gosseegouuusssdd") [] >>> re.findall("gou{0}","gosseegouuusssdd") ['go', 'go'] >>> re.findall("gou{1}","gosseegouuusssdd") ['gou']
惰性匹配*? 只要匹配出*前面的字符的0次就不匹配了
>>> re.findall("gou*","gosseegouuusssdd") ['go', 'gouuu'] >>> re.findall("gou*?","gosseegouuusssdd") ['go', 'go']
惰性匹配+? 只要匹配出+前面的字符的1次就不匹配了
>>> re.findall("gou+","gosseegouuusssdd") ['gouuu'] >>> re.findall("gou+?","gosseegouuusssdd") ['gou']
二、字符集[]
1、或的功能
>>> re.findall("x[yz]","xyzzzxyzxssdzx") ['xy', 'xy'] >>> re.findall("x[yz]","xyzzzxzsdxzzy") ['xy', 'xz', 'xz'] >>> re.findall("x[yz]p","xypzzzxzsdxzpzy") ['xyp', 'xzp']
2、[]中的特殊符号- 匹配字母
取小写字母
>>> re.findall("[a-z]","sd67lMNVv17jB5") ['s', 'd', 'l', 'v', 'j']
取大写字母
>>> re.findall("[A-Z]","sd67lMNVv17jB5") ['M', 'N', 'V', 'B']
取大小写字母
>>> re.findall("[A-Za-z]","sd67lMNVv17jB5") ['s', 'd', 'l', 'M', 'N', 'V', 'v', 'j', 'B']
取ab后面跟一个字母 取ab后面跟0个字母或者1个字母
>>> re.findall("ab[a-z]","abclkmnbab") ['abc'] >>> re.findall("ab[a-z]?","abclkmnbab") ['abc', 'ab']
取出字符串中的小写字母连着的字母为一个整体
>>> re.findall("[a-z]+","abc56Pabm902") ['abc', 'abm']
取出字符串中的大写字母,连着为一个整体
>>> re.findall("[A-Z]+","abc56PMabRm902") ['PM', 'R']
取出字符串中的大小写字母,连着的为一体
>>> re.findall("[A-Za-z]+","abc56Pabm902") ['abc', 'Pabm'] >>> re.findall("q[a-z]*","abc56q") ['q'] >>> re.findall("q[a-z]*","abc56qr") ['qr'] >>> re.findall("q[a-z]*","abc56qrg") ['qrg'] >>> re.findall("q[a-z]+","abc56qrg") ['qrg'] >>> re.findall("q[a-z]?","abc56qrg") ['qr']
3、[]里面的特殊符号^ 是取反的意思
第一个字符是q 第二个字符只要不是a-z的就可以匹配出来
>>> re.findall("q[^a-z]","abc56qrg") [] >>> re.findall("q[^a-z]","abc56qrq6g") ['q6'] >>> re.findall("q[^a-z]","abc56qrq6gq677") ['q6', 'q6']
4、[]里面的特殊符号\ 叫转义符 最牛的一个斜杠
\d 匹配任意十进制数,相当于[0-9]
\D 匹配任意非数字字符,相当于[^0-9]
\s 匹配任何空白字符,相当于[ \t\n\r\f\v]
\S 匹配任何非空白字符,相当于[^ \t\n\r\f\v]
\w 匹配任何字母数字字符,相当于[a-zA-Z0-9_]
\W 匹配任何非字母数字字符,相当于[^a-zA-Z0-9_]
\b 匹配一个特殊字符边界,比如空格 & # 等
匹配下数字
>>> re.findall(r"\d","12+(34*6+2-5*(2-1+6))") ['1', '2', '3', '4', '6', '2', '5', '2', '1', '6'] >>> re.findall(r"\d+","12+(34*6+2-5*(2-1+6))") ['12', '34', '6', '2', '5', '2', '1', '6']
匹配下除了数字之外的
>>> re.findall(r"\D+","12+(34*6+2-5*(2-1+6))") ['+(', '*', '+', '-', '*(', '-', '+', '))']
匹配任何非空字符
>>> re.findall(r"\S+","hello gouguoqi") ['hello', 'gouguoqi']
匹配空白字符
>>> re.findall(r"\s+","hello gouguoqi") [' ']
匹配任意字符和数字字符也包括_
>>> re.findall(r"\w","hel()*&%34dcsdg_") ['h', 'e', 'l', '3', '4', 'd', 'c', 's', 'd', 'g', '_'] >>> re.findall(r"\w+","hel()*&%34dcsdg_") ['hel', '34dcsdg_']
转义功能
>>> re.findall("www*baidu","www*baidu.com") [] >>> re.findall(r"www\*baidu","www*baidu.com") ['www*baidu'] >>> re.findall("www.baidu","wwwkbaidu.com") 这里代表是通配符 ['wwwkbaidu'] >>> re.findall(r"www\.baidu","wwwkbaidu.com") [] >>> re.findall(r"www\.baidu","www.baidu.com") ['www.baidu']
匹配特殊边界
>>> re.findall("I","I am LIST") ['I', 'I'] >>> re.findall("^I","I am LIST") ['I'] >>> re.findall("^I","hello I am LIST") []
取出中间的大写的I
>>> re.findall(r"I\b","hello I am LIST") ['I'] >>> re.findall("I\\b","hello I am LIST") ['I']
5、取出最里面这个括号里面的元素
>>> re.findall(r"\([^()]*\)","12+(34*6+2-5*(2-1))") ['(2-1)']
\( 以(开头
\) 以)结尾
[^()] 中间不是括号就行
* 重复前面的0次到无穷次,就是只要里面不是括号可以是其他的无数次
>>> re.findall(r"\([^()]*\)","12+(34*6+2-5*(2-1+6mk))") ['(2-1+6mk)']
6、| 管道符 或的概念,是匹配2个整体
>>> re.findall("ka|a","abcka|kb") ['a', 'ka'] >>> re.findall("ka|kb","abcka|kbc") ['ka', 'kb'] >>> re.findall("ka|kc","abcka|kbc") ['ka']
7、() 分组
重复c这个字母来匹配
>>> re.findall("abc+","abcccrtfabcasdcabcc") ['abccc', 'abc', 'abcc']
把abc作为一个整体来匹配
>>> re.findall("(abc)+","abccccc") ['abc']
把abc作为一个整体来匹配,重复一次或者多次;?:就是去掉括号中分组的优先级(变成非捕获分组),让findall返回整个匹配
>>> re.findall("(?:abc)+","abcccrtfabcabcabcc") ['abc', 'abcabcabc']
8、re的search方法
匹配到第一个之后就不继续往下匹配了
>>> re.search("(abc)","abccccc") <_sre.SRE_Match object; span=(0, 3), match='abc'> >>> re.search(r"\d{2}","abccccc")#匹配不到则返回None >>> re.search(r"\d{2}","abcccc9879")#取出来是一个对象,想要值,用group方法 <_sre.SRE_Match object; span=(6, 8), match='98'> >>> re.search(r"\d{2}","abcccc9879").group() '98' >>> re.search(r"\d{2}","abcccc9879").group() '98'
通过?P<name> 进行分组<>内为组名,在用group方法打印对应的组名
>>> re.search(r"(?P<name>[a-z]+)\d+","gouguoqi28miaoye29beiye60").group() 'gouguoqi28' >>> re.search(r"(?P<name>[a-z]+)\d+","gouguoqi28miaoye29beiye60").group("name") 'gouguoqi' >>> re.search(r"(?P<name>[a-z]+)(?P<age>\d+)","gouguoqi28miaoye29beiye60").group( "age") '28'
三、re模块中的常用方法
1、re.findall("a","a bb") 返回所有满足条件的结果,放在列表里面
>>> re.findall("abc","abccccc") ['abc']
2、re.search("a","a bb c") 返回的是一个匹配对象,用.group()取出匹配到的值
匹配不到则返回空, 匹配到第一个之后就不继续往下匹配了
>>> re.search(r"\d{2}","abcccc9879").group() '98'
3、re.match("a","abc").group()同search相同,只不过仅在字符串开始处进行匹配
>>> re.match("abc","aaabccccc") >>> re.match("abc","abccccc") <_sre.SRE_Match object; span=(0, 3), match='abc'> >>> re.match("abc","abccccc").group() 'abc'
4、re.split()
>>> re.split(" ","abc cc hello") ['abc', 'cc', 'hello'] >>> re.split("[ |]","abc|cc hello") ['abc', 'cc', 'hello'] >>> re.split("[ab]","asdabcd") ['', 'sd', '', 'cd'] >>> re.split("[ab]","abc") ['', '', 'c']
5、re.sub替换
>>> re.sub(r"\d+","A","sdfdsfgc56712MMns980") 'sdfdsfgcAMMnsA' >>> re.sub(r"\d","A","sdfdsfgc56712MMns980") 'sdfdsfgcAAAAAMMnsAAA'
只替换前4次
>>> re.sub(r"\d","A","sdfdsfgc56712MMns980",4) 'sdfdsfgcAAAA2MMns980'
显示出匹配出来的次数
>>> re.subn(r"\d","A","sdfdsfgc56712MMns980") ('sdfdsfgcAAAAAMMnsAAA', 8)
6、re.compile 编译,提前把匹配规则定义好,直接调用就行了,好处就是可以用多次
>>> com=re.compile(r"\d+") >>> com.findall("sdcvf456dfg67") ['456', '67']
7、re.finditer 把数据存到迭代器里面,用一条拿一条,不浪费内存
>>> re.findall(r"\d","sdcvf456dfg67") ['4', '5', '6', '6', '7'] >>> re.finditer(r"\d","sdcvf456dfg67") <callable_iterator object at 0x0000000000D92160> >>> ret=re.finditer(r"\d","sdcvf456dfg67") >>> next(ret).group() '4' >>> next(ret).group() '5' >>> next(ret).group() '6' >>> next(ret).group() '6'
特例:当有分组的时候findall优先匹配组里面的内容
>>> ret=re.findall("www.(baidu|163).com","www.baidu.com") >>> re.findall("www.(baidu|163).com","www.baidu.com") ['baidu']
也可以加个?:去掉优先级
>>> re.findall("www.(?:baidu|163).com","www.baidu.comaawww.163.combv") ['www.baidu.com', 'www.163.com']