用到了requests、BeautifulSoup、urllib等,具体代码如下。
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 21 09:13:07 2018

@author: brave_man
email: 1979887709@qq.com

Scraper for complaint posts on wz.sun0769.com.

Each "index page" lists 30 complaint posts. getA() collects the link of
every post on one index page and hands each link to getDetails(), which
extracts the title, code, content, attached picture and handling status,
and appends the record to saveComplaints.txt.

Pitfall (from the original author): some posts have been removed and
return 404, which makes the CSS selectors come back empty and raise
IndexError — this used to crash the scraper around page 10. The whole
detail extraction is therefore wrapped in a try block.
"""

import os
import urllib.request  # `import urllib` alone does not guarantee the submodule
from time import sleep

import requests
from bs4 import BeautifulSoup

# One shared UA header instead of re-declaring it in every function.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) "
                         "Gecko/20100101 Firefox/6.0"}

# Local directory where post pictures are saved.
PHOTO_DIR = "D:\\downloadPhotos"


def getDetails(url):
    """Fetch one complaint post and append its details to saveComplaints.txt.

    Prints "页面不存在" instead of raising when the post page is gone
    (404 pages make the selectors fail). Sleeps 5s after every call to
    throttle requests.
    """
    try:
        res = requests.get(url, headers=HEADERS)
        res.encoding = "GBK"
        soup = BeautifulSoup(res.text, "html.parser")

        # Long posts keep the body in .contentext; short posts keep it
        # inside the .greyframe table.
        try:
            content = soup.select(".contentext")[0].text.strip()
        except IndexError:
            content = soup.select(".greyframe")[0].text.split(" ")[7].strip()

        # Download the attached picture, if any.
        try:
            img_src = soup.select(".textpic")[0].img["src"]
            img_url = "http://wz.sun0769.com/" + img_src
            # NOTE: original code built this path with an unescaped
            # backslash ("D:\downloadPhotos" + "\"), which is broken;
            # os.path.join does it safely.
            imgSaveUrl = os.path.join(PHOTO_DIR, img_src[-10:])
            urllib.request.urlretrieve(img_url, imgSaveUrl)
        except (IndexError, TypeError, OSError):
            imgSaveUrl = "无图片"

        # Handling status is colour-coded: green (.qgrn), blue (.qblue)
        # or red (.qred) — try each in turn.
        try:
            status = soup.select(".qgrn")[0].text
        except IndexError:
            try:
                status = soup.select(".qblue")[0].text
            except IndexError:
                status = soup.select(".qred")[0].text

        title_text = soup.select(".tgray14")[0].text
        meta_text = soup.select(".te12h")[0].text
        # Fixed slice offsets assume the site's exact page layout —
        # TODO(review): confirm they still hold if the markup changes.
        details = {"Title": title_text[4:-12].strip(),
                   "Code": title_text[-8:-2],
                   "Picture": imgSaveUrl,
                   "Content": content,
                   "Status": status,
                   "NetFriend": meta_text.lstrip(" 网友:")[0:-27],
                   "Time": meta_text[-21:-2]}

        try:
            with open("saveComplaints.txt", "a") as f:
                f.write(str(details))
        except OSError:
            print("存入失败")
    except Exception:
        # Removed posts (404) leave the selectors empty; skip the post.
        print("页面不存在")
    sleep(5)


def getA(url):
    """Collect every post link on one index page and scrape each post."""
    res = requests.get(url, headers=HEADERS)
    res.encoding = "GBK"
    soup = BeautifulSoup(res.text, "html.parser")
    for anchor in soup.select(".news14"):
        getDetails(anchor["href"])


def getPages():
    """Scrape the first 30 index pages (page offsets 0, 30, 60, ...).

    Bug fix: the original used (i - 1) * 30, which made the first request
    use offset -30 — a bogus page — and skipped the last real page.
    """
    rUrl = "http://wz.sun0769.com/index.php/question/questionType?type=4&page="
    for i in range(30):
        getA(rUrl + str(i * 30))


if __name__ == "__main__":
    getPages()
在编写代码的时候，有一些小细节处理得不够熟练，比如文件的读写。下面再练习一下。
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 21 13:51:40 2018

@author: brave_man
email: 1979887709@qq.com

Reads the first record back out of saveComplaints.txt, un-escapes it and
wraps it into a JSON string.
"""

import json

s = None
try:
    with open("saveComplaints.txt", "r") as f:
        print("开始读取")
        s = f.readline()
except OSError:
    # Bug fix: the original printed "存入失败" ("save failed") here, a
    # copy-paste from the writer script — this path is a *read* failure.
    # It also left `s` undefined, crashing the next line with NameError.
    print("读取失败")

if s is not None:
    # Attempt to undo \uXXXX escapes in the stored record.
    # NOTE(review): str(dict) in Python 3 writes non-ASCII characters
    # directly, so this round-trip may mangle UTF-8 Chinese text — verify
    # against an actual saved file.
    s1 = s.encode("utf8").decode("unicode-escape")
    print(s1)
    # Re-encode as a JSON string literal.
    jd = json.dumps(s1)
    print(jd)
爬虫爬取了前30个页面保存到本地文件中,其实可以考虑用多线程,线程池的方法去分别爬取每一个主页面,这样可能效率会更高一些。至于多线程的部分,还是不太熟练,需要多注意。