zoukankan html css js c++ java

爬虫_糗事百科（正则表达式）

 1 import requests
 2 import re
 3 from lxml import etree
 4 
 5 headers = 'Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/60.0'
 6 
 7 
 8 def get_html(url):
 9     try:
10         response = requests.get(url, headers)
11         response.raise_for_status()
12         response.encoding = 'utf-8'
13         return response.text
14     except:
15         print('get_html() faild')
16 
17 
def parse_html(html):
    """Extract the text of every post found in a listing page.

    Each post body is taken from the first ``<span>`` that follows a
    ``<div class="content"`` opening; nested tags are removed and surrounding
    whitespace is stripped.

    The earlier link extraction was dropped because its result was never
    returned.  Its regex ended in a lazy ``(.*?)`` with nothing after it, so
    the group could only ever capture an empty string.

    Args:
        html: Page markup as text; ``None`` or an empty string is accepted.

    Returns:
        A list of cleaned post texts, empty when nothing matches.
    """
    if not html:
        return []

    contents = re.findall(r'<div class="content".*?<span>(.*?)</span>', html, re.S)
    # Strip inline markup such as <br/> that is left inside the captured span.
    return [re.sub(r'<.*?>', '', content).strip() for content in contents]
33 
34 
def main(page_num=13):
    """Fetch listing pages 1..``page_num`` and print the posts of each one.

    Args:
        page_num: Number of the last page to fetch (inclusive); defaults to 13.
    """
    for i in range(1, page_num + 1):
        url = 'https://www.qiushibaike.com/text/page/{}/'.format(i)
        html = get_html(url)
        if html is None:
            # get_html() returns None on failure and has already reported it;
            # skip this page instead of handing None to the parser.
            continue
        contents = parse_html(html)
        print(contents)


if __name__ == '__main__':
    main()

hrefs = re.findall(r'<a class="contentHerf" href=(.*?)', html, re.S)

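这个链接的正则表达式感觉写得没问题，可是匹配不到东西，奇了怪了，先放着吧，xpath 可以匹配到。（原因：末尾的非贪婪分组 (.*?) 后面没有任何限定字符，非贪婪匹配会取最短结果，也就是空字符串；需要在分组后面加上结束定界符，例如 href="(.*?)"。另外，这个正则要求 class 属性紧跟在 <a 之后、href 之前，如果页面中属性的顺序不同，同样匹配不到。）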

运行结果

查看全文

相关阅读:
dalvik浅析二：jni、so
android apk壳
 prism4 StockTrader RI 项目分析一些体会
 PRISM ‘VS100COMNTOOLS’ not set. Cannot set the build environment
安卓移动应用代码安全加固系统设计及实现
 手机游戏外挂技术研究与运用
 移动终端安全威胁与应对措施
 游戏软件反外挂技术方案的设计和安全解决方案
 网络安全等级保护 2.0 下的安全体系建设
 APP加密,ios代码混淆工具,虚拟化技术适用于移动应用程序的虚拟化加密软件

原文地址：https://www.cnblogs.com/MC-Curry/p/9431365.html