Python -- (crawler: the re module)
The four core functions of the re module:
1. findall -- finds all matches, returns a list
import re

lst = re.findall("m", "mai le fo len, mai ni mei!")
print(lst)  # ['m', 'm', 'm']

lst = re.findall(r"\d+", "before 5 o'clock, give me 5000")
print(lst)  # ['5', '5000']
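One detail worth remembering: when nothing matches, findall returns an empty list rather than None, so the result is always safe to iterate over. A quick check (my own example):

import re

print(re.findall(r"\d+", "no digits at all"))  # [] - an empty list, never None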
2. search -- scans the string but stops at the first match and returns it;
if nothing matches, search returns None
import re

ret = re.search(r"\d", "before 5 o'clock, give me 50 million").group()
print(ret)  # 5
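Because search returns None on failure, chaining .group() directly raises AttributeError when there is no match. A minimal sketch of the safe pattern:

import re

m = re.search(r"\d+", "no digits here")
if m:  # a match object is truthy
    print(m.group())
else:  # search returned None; calling .group() on it would raise AttributeError
    print("no match")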
3. match -- only matches from the beginning of the string
import re

ret = re.match('a', 'abc').group()
print(ret)  # a
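To see the difference from search, a minimal comparison (my own example):

import re

print(re.search(r"\d+", "abc123"))  # <re.Match object ...> - found anywhere in the string
print(re.match(r"\d+", "abc123"))   # None - the string does not start with a digit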
4. finditer -- like findall, except it returns an iterator
import re

it = re.finditer("m", "mai le fo len, mai ni mei!")
for el in it:
    print(el.group())  # you still need .group() to extract the matched text
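The iterator form is handy on large inputs because matches are produced lazily, and each match object also carries position information. A small sketch:

import re

# finditer yields match objects one at a time instead of building a whole list
for m in re.finditer(r"\d+", "order 66, price 3000, qty 12"):
    print(m.group(), m.span())  # the matched text and its (start, end) positions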
5. Other re module operations
import re

# split: split a string wherever the regex matches
lst = re.split(r"[ab]", "abcdefghahahehedebade")
print(lst)

# sub: replace every match
result = re.sub("250", "__sb__", "alex250taibai250taihei250ritian250liuwei")
print(result)

# subn: same as sub, but also returns the number of replacements made
result = re.subn("250", "__sb__", "alex250taibai250taihei250ritian250liuwei")
print(result)

# compile: precompile a pattern so it can be reused
obj = re.compile(r"\d+")
lst = obj.findall("Big Yang made 5000 yuan yesterday")
lst2 = obj.findall("bank statement 5000, spent 6000")
print(lst)
print(lst2)

# named groups
obj = re.compile(r"(?P<id>\d+)(?P<zimu>e{3})")
ret = obj.search("abcdefg123456eeeee")  # matches ((123456)(eee))
print(ret.group())          # 123456eee
print(ret.group("id"))      # 123456
print(ret.group("zimu"))    # eee

ret = re.findall(r'www\.(baidu|oldboy)\.com', 'www.oldboy.com')
print(ret)  # ['oldboy'] - findall returns the contents of the capturing group, not the whole match
# if you want the whole match, make the group non-capturing with ?:
ret = re.findall(r'www\.(?:baidu|oldboy)\.com', 'www.oldboy.com')  # ?: means this () does not capture
print(ret)  # ['www.oldboy.com']

ret = re.split("sb", "alexsbwusirsbtaibaisbliuwei")
print(ret)
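The crawler code below passes re.S to findall and compile. That flag (DOTALL) makes . also match newlines, which matters when a pattern has to span several lines of HTML. A quick illustration (my own example):

import re

html = "<span>\nhello\n</span>"
print(re.findall("<span>(.*?)</span>", html))        # [] - by default '.' does not match '\n'
print(re.findall("<span>(.*?)</span>", html, re.S))  # ['\nhello\n'] - re.S lets '.' cross lines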
Crawler focus: scraping movie info from the Douban site ===>
import re
import ssl
from urllib.request import urlopen  # open a URL and read its source

# skip verification of the site's SSL certificate
ssl._create_default_https_context = ssl._create_unverified_context

def getPage(url):
    response = urlopen(url)  # connect to the page
    return response.read().decode('utf-8')  # return the page source: a big pile of HTML

def parsePage(s):  # s is the page source
    # "评价" is the literal "people rated" text on the Douban page
    ret = re.findall(r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?'
                     r'<span class="title">(?P<title>.*?)</span>'
                     r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>'
                     r'(?P<comment_num>.*?)评价</span>', s, re.S)
    return ret  # a list of (id, title, rating_num, comment_num) tuples

def main(num):
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)  # the page source
    ret = parsePage(response_html)
    print(ret)  # id, title, rating_num, comment_num

count = 0
for i in range(10):  # 10 pages, 25 movies each
    main(count)
    count += 25
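For clarity, the driver loop at the bottom walks the ten pages of the Top 250, 25 movies per page; the generated URLs look like this (a minimal illustration):

for start in range(0, 250, 25):
    print('https://movie.douban.com/top250?start=%s&filter=' % start)
# https://movie.douban.com/top250?start=0&filter=
# https://movie.douban.com/top250?start=25&filter=
# ...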
A refined version: compile the pattern once, parse with finditer, and yield each movie as a dict that gets written to a file:

import re
import ssl
from urllib.request import urlopen  # open a URL and read its source

# skip verification of the site's SSL certificate
ssl._create_default_https_context = ssl._create_unverified_context

def getPage(url):
    response = urlopen(url)  # connect to the page
    return response.read().decode('utf-8')  # return the page source: a big pile of HTML

def parsePage(s):
    com = re.compile(r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?'
                     r'<span class="title">(?P<title>.*?)</span>'
                     r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>'
                     r'(?P<comment_num>.*?)评价</span>', re.S)
    ret = com.finditer(s)
    for i in ret:
        yield {
            "id": i.group("id"),
            "title": i.group("title"),
            "rating_num": i.group("rating_num"),
            "comment_num": i.group("comment_num"),
        }

def main(num):
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)
    ret = parsePage(response_html)
    f = open("move_info7", "a", encoding="utf8")
    for obj in ret:
        print(obj)
        f.write(str(obj) + "\n")  # one movie dict per line
    f.close()

count = 0
for i in range(10):  # 10 pages
    main(count)
    count += 25
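One practical caveat, as an assumption rather than part of the original script: Douban may reject requests carrying urllib's default User-Agent. A hedged sketch of a workaround is to send a browser-like header via Request:

from urllib.request import Request, urlopen

def getPage(url):
    # some sites block urllib's default User-Agent; a browser-like header may help
    # (the header value below is illustrative, not required to be exactly this)
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    return urlopen(req).read().decode("utf-8")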