from urllib import request, parse
import sys


def loadPage(url, filename):
    """
    Purpose: send a request to the given url and fetch the response
    from the server.
    url: the url address to crawl
    filename: the name shown in progress messages
    """
    print("Downloading " + filename)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    }
    req = request.Request(url, headers=headers)
    # Get the local system encoding (renamed from `type`, which shadows the built-in)
    encoding = sys.getfilesystemencoding()
    # This encoding is used to decode the crawled content
    print(encoding)
    # ************************************************
    html = request.urlopen(req).read().decode(encoding)
    # ************************************************
    print(html)
    return html
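
# Note: decoding with the filesystem encoding only works when it happens to
# match the page's actual charset. A minimal alternative sketch (the name
# loadPageByCharset is hypothetical; nothing in this script calls it) uses
# the charset the server declares in its Content-Type header instead,
# falling back to UTF-8:
def loadPageByCharset(url):
    req = request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    resp = request.urlopen(req)
    # get_content_charset() parses e.g. "text/html; charset=UTF-8"
    charset = resp.headers.get_content_charset() or "utf-8"
    return resp.read().decode(charset)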


def writePage(html, filename):
    """
    Purpose: write the html content to a local file.
    html: the response content returned by the server
    filename: the local file to write
    """
    print("Saving " + filename)
    # ***********************************************
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html)
    # ***********************************************
    print("*" * 30)
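
# Since loadPage decodes the bytes and writePage immediately re-encodes them
# as UTF-8, a byte-level variant could skip the round-trip. A minimal sketch
# (writePageBytes is a hypothetical helper, not used by this script),
# assuming `content` holds the raw bytes from resp.read():
def writePageBytes(content, filename):
    with open(filename, "wb") as f:
        f.write(content)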


def tiebaSpider(url, beginPage, endPage):
    """
    Purpose: spider scheduler; builds and processes the url for each page.
    url: the fixed prefix of the Tieba url
    beginPage: first page to crawl
    endPage: last page to crawl
    """
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        filename = "page " + str(page) + ".html"
        fullurl = url + "&pn=" + str(pn)
        print(fullurl)
        html = loadPage(fullurl, filename)
        writePage(html, filename)
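
# Worked example of the pn offset (the script assumes Tieba paginates in
# steps of 50 posts): page 1 -> pn=0, page 2 -> pn=50, page 3 -> pn=100,
# yielding urls such as http://tieba.baidu.com/f?kw=...&pn=50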


if __name__ == "__main__":
    kw = input("Enter the name of the Tieba forum to crawl: ")
    beginPage = int(input("Enter the start page number: "))
    endPage = int(input("Enter the end page number: "))
    url = "http://tieba.baidu.com/f?"
    key = parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)
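
# Example session (hypothetical input): entering kw="python", beginPage=1,
# endPage=2 produces key = "kw=python" and requests
#   http://tieba.baidu.com/f?kw=python&pn=0
#   http://tieba.baidu.com/f?kw=python&pn=50
# saving the pages as "page 1.html" and "page 2.html".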