一, 什么是正则?
import re
# findall: 将所有匹配到的数据都找出来,放进列表(list)中,逐一匹配
print(re.findall('alex', 'haha alex is alex is dsb'))
# >>>: ['alex', 'alex']

# \w 匹配字母、数字及下划线(一个 \w 每次匹配一个字符)
# \W 匹配非字母、数字及下划线的字符
print(re.findall(r'\w', 'Aah123 +-_'))
# >>>: ['A', 'a', 'h', '1', '2', '3', '_']
print(re.findall(r'\w\w', 'Aah123 +-_'))
# >>>: ['Aa', 'h1', '23']
print(re.findall(r'\w9\w', 'Aa9h123 aaa9c+-_'))
# >>>: ['a9h', 'a9c']

# \s 匹配任意空白字符,等价于 [ \t\n\r\f\v]
# \S 匹配任意非空白字符
# \d 匹配任意数字,等价于 [0-9]
# \D 匹配任意非数字

# ^ : 仅从开头开始匹配
# $ : 仅从尾部开始匹配
print(re.findall('^alex', 'alex is alex is alex'))
# >>>: ['alex']
print(re.findall('^alex', '1alex is alex is alex'))
# >>>: []
重复匹配: | . | * | ? | .* | .*? | + | {n,m}
# . :代表一个字符,该字符可以是任意字符(除换行符)
print(re.findall('a.c', 'a alc aaac a c asfdsaf'))
# >>>: ['alc', 'aac', 'a c']
print(re.findall('a.c', 'a alc aaac a c asfd', re.DOTALL))  # DOTALL使得.匹配包括换行符在内的所有字符
# >>>: ['alc', 'aac', 'a c']
# ? :代表左边那一个字符出现0次或者1次
print(re.findall('ab?', 'a ab abb abbbb a123b a123bbbb'))
# >>>: ['a', 'ab', 'ab', 'ab', 'a', 'a']
# * :代表左边那一个字符出现0次到无穷次
print(re.findall('ab*', 'a ab abb abbbb a123b a123bbbb'))
# >>>: ['a', 'ab', 'abb', 'abbbb', 'a', 'a']
# + :代表左边那一个字符出现1次到无穷次
print(re.findall('ab+', 'a ab abb abbbb a123b a123bbbb'))
# >>>: ['ab', 'abb', 'abbbb']
# {n,m} :代表左边那一个字符出现n次到m次
print(re.findall('ab{1,3}', 'a ab abb abbbb a123b a123bbbb'))
# >>>: ['ab', 'abb', 'abbb']
# .* :匹配任意0个到无穷个字符,贪婪匹配
print(re.findall('a.*c', 'a132142qwdcavcccc(((()))))c2333'))
# >>>: ['a132142qwdcavcccc(((()))))c']
# .*? :匹配任意0个到无穷个字符,非贪婪匹配
print(re.findall('a.*?c', 'a132142qwdcavcccc(((()))))c2333'))
# >>>: ['a132142qwdc', 'avc']
# | :或者
print(re.findall('companies|company', 'Too many companies have gone bankrupt,c and the next one is my company'))
# >>>: ['companies', 'company']
# () :分组((?:...) 表示只分组、不单独捕获,findall 返回整个匹配内容)
print(re.findall('compan(?:ies|y)', 'Too many companies have gone bankrupt,c and the next one is my company'))
# >>>: ['companies', 'company']
# \ :转义(要匹配一个字面的反斜杠,正则里需要写成 \\;不用原始字符串时,Python 字符串里还要再转义一次)
print(re.findall('a\\\\c', r'a\c aac'))
print(re.findall(r'a\\c', r'a\c aac'))
# >>>: ['a\\c']
# 忽略大小写
# print(re.findall('alex','my name is alex ALex is dSB',re.I))
# # >>>: ['alex', 'ALex']
# re.M: 多行模式,$ 匹配每一行的行尾
# msg = '''my name is egon
# asdfassg egon
# 122324324egon'''
# print(re.findall('egon$',msg,re.M))
# >>>: ['egon', 'egon', 'egon']
# []: 代表匹配一个字符,这个字符是来自于自定义的范围
print(re.findall('a[1]c', 'a a1c aaac a c asfdsaf', re.DOTALL))
# >>>: ['a1c']
print(re.findall('a[0-9]c', 'a a1c aaac a7c asfdsaf', re.DOTALL))  # [0-9]的数字
# >>>: ['a1c', 'a7c']
print(re.findall('a[a-zA-Z]c', 'a a1c aaac a7c asfdsaf', re.DOTALL))  # 所有字母
# >>>: ['aac']
print(re.findall('a[+*/-]c', 'a a1c aaac a7c asfdsaf', re.DOTALL))  # - 在 [] 中间表示范围(连字符),放在首尾时才表示 - 字符本身
# >>>: []
# re模块其他方法
res=re.findall('(href)="(.*?)"','<p>动感视频</p><a href="https://www.douniwan.com/1.mp4">逗你玩呢</a><a href="https://www.xxx.com/2.mp4">葫芦娃</a>')
print(res)
res=re.search('(href)="(.*?)"','<p>动感视频</p><a href="https://www.douniwan.com/1.mp4">逗你玩呢</a><a href="https://www.xxx.com/2.mp4">葫芦娃</a>')
print(res)
print(res.group(0))
print(res.group(1))
print(res.group(2))
# 运行结果
# [('href', 'https://www.douniwan.com/1.mp4'), ('href', 'https://www.xxx.com/2.mp4')]
# <_sre.SRE_Match object; span=(14, 51), match='href="https://www.douniwan.com/1.mp4"'>
# href="https://www.douniwan.com/1.mp4"
# href
# https://www.douniwan.com/1.mp4