# Example source: https://www.bilibili.com/video/BV1i54y1h75W?p=1
# Scrape the movies listed in the "2021新片精品" section of the dy2018.com
# home page and print, for each one, its translated title, its original
# title and its download link.
import re
import sys
from urllib.parse import urljoin

import requests

DOMAIN = 'https://www.dy2018.com/'
# Character set used to decode every page fetched from the site.
PAGE_ENCODING = 'gbk'
# Seconds to wait for the server before giving up on a request, so that a
# stalled connection cannot hang the script forever.
REQUEST_TIMEOUT = 10

# Captures the <ul> body that follows the "2021新片精品" heading on the home page.
SECTION_RE = re.compile(r"2021新片精品.*?<ul>(?P<name_link>.*?)</ul>", re.S)
# Captures the href of each list entry inside that section.
CHILD_HREF_RE = re.compile(r"<li><a href='(?P<href>.*?)'", re.S)
# Captures the translated title, original title and download link on a
# movie's detail page.
DETAIL_RE = re.compile(
    r'◎译 名(?P<trans_name>.*?)<br />◎片 名(?P<movie_name>.*?)<br />.*?'
    r'<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<download>.*?)">',
    re.S,
)


def _fetch_html(url):
    """Download *url* and return its body decoded with PAGE_ENCODING.

    Certificate verification is turned off for every request (home page and
    detail pages alike), so all requests are issued with the same settings.

    Raises:
        requests.RequestException: on a network failure, a timeout or an
            HTTP error status.
    """
    # verify=False skips TLS certificate verification.
    response = requests.get(url, verify=False, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = PAGE_ENCODING
    return response.text


# Step 1: fetch the home page and isolate the section that lists the movies.
section = SECTION_RE.search(_fetch_html(DOMAIN))
if section is None:
    # Without this section there is nothing to scrape; stop with a clear
    # message instead of an AttributeError on None.
    sys.exit("Could not find the '2021新片精品' section on the home page.")

# Step 2: turn every href in the section into a full detail-page URL.
# urljoin copes with both site-relative ("/i/1.html") and absolute hrefs.
child_urls = [
    urljoin(DOMAIN, match.group("href"))
    for match in CHILD_HREF_RE.finditer(section.group("name_link"))
]

# Step 3: visit each detail page and print the extracted fields.
for child_url in child_urls:
    try:
        child_html = _fetch_html(child_url)
    except requests.RequestException as exc:
        # One unreachable page should not abort the remaining ones.
        print(f"Skipping {child_url}: {exc}", file=sys.stderr)
        continue

    details = DETAIL_RE.search(child_html)
    if details is None:
        # The page does not have the expected layout; report it and move on.
        print(f"Skipping {child_url}: movie details not found", file=sys.stderr)
        continue

    print(details.group("trans_name"))
    print(details.group("movie_name"))
    print(details.group("download"))