Scrapes every news article from Huaian College of Information Technology (HCIT, hcit.edu.cn), across all of its news sections. The script prints each article page URL and attachment download URL it finds, and it also ships a (commented-out) file-download routine that you can study and adapt yourself (for now only the DOC/XLS/PDF attachments discovered during the crawl are supported).
The script is a skeleton: it takes care of URL discovery during the crawl, and you can adapt the code to your actual needs. To make it easy to write the results into a database, the news_*[] lists are reserved for exactly that purpose (a minimal sketch follows). Everyone is welcome to extend the script and leave a comment with a link to your GitHub repository.
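As a minimal sketch of what that persistence step could look like (assuming SQLite via the standard-library sqlite3 module; the database file name hcit_news.db and the save_news() helper are hypothetical additions, not part of the original script), you could flush the four parallel lists in one batch:

import sqlite3

def save_news(news_title, news_time, news_link, news_text):
    # Hypothetical helper: persist the parallel news_*[] lists that
    # reptiles() fills into a local SQLite database.
    conn = sqlite3.connect("hcit_news.db")
    conn.execute("CREATE TABLE IF NOT EXISTS news ("
                 "title TEXT, pub_time TEXT, link TEXT, body TEXT)")
    rows = zip(news_title, news_time, news_link,
               (str(t) for t in news_text))  # body is a bs4 Tag; store its HTML
    conn.executemany("INSERT INTO news VALUES (?, ?, ?, ?)", rows)
    conn.commit()
    conn.close()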
When iterating over the list-page URLs, the author used a somewhat contrived workaround for hcit's reverse-numbered N.htm news pages; if you have a cleaner approach, I'd be happy to discuss it (one alternative is sketched below).
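For comparison, one less ad-hoc way to handle the reverse numbering is to locate the highest existing N.htm with a binary search over cheap HEAD requests, then walk the pages in whatever order you need. A sketch (the upper bound of 1024 pages is an assumption, and it presumes the server answers HEAD the same way it answers GET):

import requests

def last_page(addr, hi=1024):
    # Binary-search for the largest N such that addr/N.htm exists.
    # Pages 1..N exist and N+1.. do not, so the predicate is monotone.
    exists = lambda n: requests.head("%s/%d.htm" % (addr, n)).status_code == 200
    lo = 0  # page 0 stands for "no numbered page at all"
    while lo < hi:
        mid = (lo + hi + 1) // 2
        if exists(mid):
            lo = mid
        else:
            hi = mid - 1
    return lo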
#! python3
import requests
from bs4 import BeautifulSoup
def reptiles(responses, addr):
    soup = BeautifulSoup(responses.text, "html.parser")
    k = 0
    # The news_*[] lists are reserved for writing the results to a database.
    news_link = []
    news_title = []
    news_time = []
    news_text = []
    while True:
        key_text = "line_u8_%d" % k
        soup_li = soup.find(id=key_text)
        if soup_li is None:  # no more list items on this page
            break
        # Title: strip the fixed-length prefix from the anchor text
        title = soup_li.a.get_text()
        title = title[10:]
        news_title.append(title)
        # Publication time
        time = soup_li.span.get_text()
        news_time.append(time)
        # Article page
        link = soup_li.a['href']
        news_link.append(link)
        newsText_url = addr + link
        res = requests.get(newsText_url)
        res.encoding = "utf-8"
        print("[article] %s [%d]" % (newsText_url, res.status_code))
        soup_newsText = BeautifulSoup(res.text, "html.parser")
        newsText = soup_newsText.find(id="vsb_content")
        news_text.append(newsText)
        # Attachment inside the article body, if any; uncomment to download
        try:
            newsText_a = newsText.a['href']
            newsText_name = newsText.a.string
            newsText_link = "http://www.hcit.edu.cn" + newsText_a
            print("[attachment] " + newsText_link)
            # res_link = requests.get(newsText_link)
            # with open(newsText_name, 'wb') as code:
            #     code.write(res_link.content)
        except Exception:
            print("[no attachment]")
        k += 1
    return
def main():
    url_addr = [
        # "http://www.hcit.edu.cn/sdxw/xyyw",
        # "http://www.hcit.edu.cn/sdxw/ybdt",
        # "http://www.hcit.edu.cn/sdxw/mtjj",
        "http://www.hcit.edu.cn/sdxw/ggtz"
    ]
    for addr in url_addr:
        page = 1
        while True:
            addr_url = addr + "/" + str(page) + ".htm"
            responses = requests.get(addr_url)
            if responses.status_code != 200:
                # The numbered pages are exhausted; the newest entries sit on
                # the unnumbered index page addr.htm, whose relative links
                # resolve against the parent directory, hence addr + "/../".
                index = requests.get(addr + ".htm")
                index.encoding = "utf-8"
                print("[page] %s [%s]" % (addr + ".htm", index.status_code))
                reptiles(index, addr + "/../")
                print("+++++++++++++++++++++++++[None]++++++++++++++++++++++++++++")
                break
            responses.encoding = "utf-8"
            # Confirm the page is reachable before parsing it
            print("[page] %s [%s]" % (addr_url, responses.status_code))
            reptiles(responses, addr + "/")
            page += 1
        print("=============================================")

if __name__ == "__main__":
    main()
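Finally, to flesh out the commented-out download step inside reptiles(), here is one streamed variant as a sketch (download_attachment() and the filename sanitization are additions, not part of the original script; anchor text on the site can contain characters that are illegal in filenames):

import re
import requests

def download_attachment(url, name):
    # Stream the attachment to disk in chunks rather than buffering the
    # whole DOC/XLS/PDF in memory, as the commented-out code would.
    safe_name = re.sub(r'[\\/:*?"<>|]', "_", name or url.rsplit("/", 1)[-1])
    with requests.get(url, stream=True) as res:
        res.raise_for_status()
        with open(safe_name, "wb") as fh:
            for chunk in res.iter_content(chunk_size=8192):
                fh.write(chunk)

Inside the try block you would then call download_attachment(newsText_link, newsText_name) right after the [attachment] print.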