5、第一个爬虫：爬取网页并使用正则表达式解析数据

from urllib import request
import re
# Fetch the product index page and print every product title found in it.

# Target URL
url = "http://www.dfenqi.cn/Product/Index"
# Request headers: send a browser-style User-Agent with the request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
}
# Build the request object
req = request.Request(url, headers=headers)
# Create the HTTP handler
httpHandler = request.HTTPHandler()
# Build an opener that uses this handler
opener = request.build_opener(httpHandler)
# Send the request. The with-block closes the response as soon as the body
# has been read, instead of leaving the connection open until garbage
# collection.
with opener.open(req) as response:
    # Read the page source and decode it as UTF-8
    html = response.read().decode('utf-8')
# Regex for the product titles: lazily capture everything between
# <p class="p"> and the next </p>; [\s\S] lets the capture span newlines.
pattern = re.compile(r'<p\s*class="p">([\s\S]*?)</p>')
# Collect all captured titles
goodsList = pattern.findall(html)
# Print the total number of matches (the label text is Chinese for
# "total number of products")
print("产品总个数：%s" % len(goodsList))
# Print each product title on its own line
for goods in goodsList:
    print(goods)

查看全文

相关阅读:
DTree中致命的递归
 通过CLR同步SQL Server和Sharepoint List数据（三）
通过CLR同步SQL Server和Sharepoint List数据（四）
C#中操作IIS 7.0
可爱又可恨的梅花雪
 通过CLR同步SQL Server和Sharepoint List数据（一）
在Javascript中调用WSS
VBA在Excel中的应用（二）
自己编写存储过程对Oracle中的序列进行重置
 在Javascript中实现伪哈希表

原文地址：https://www.cnblogs.com/toloy/p/8617305.html