Starting to scrape the data today:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import pymysql


def getnewsdetail(newsurl):
    """Fetch one article page and extract title, time, source, editor and body."""
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    # Title; '异常爬取' marks a field that could not be scraped
    if soup.select('.main-title'):
        title = soup.select('.main-title')[0].text
    else:
        title = '异常爬取'

    # Publication time, normalized from '2021年05月10日 20:02'-style strings
    if soup.select('.date-source span'):
        timesource = soup.select('.date-source span')[0].text
        dt = datetime.strptime(timesource, '%Y年%m月%d日 %H:%M')
        timesource = dt.strftime('%Y-%m-%d %H:%M')
    else:
        timesource = '异常爬取'

    # News source; fall back to the top-bar layout some pages use
    if soup.select('.date-source a'):
        place = soup.select('.date-source a')[0].text
    elif soup.select('#top_bar > div > div.date-source > span.source'):
        place = soup.select('#top_bar > div > div.date-source > span.source')[0].text
    else:
        place = '异常爬取'

    # Article body: store the text, not the BeautifulSoup tag itself
    if soup.select('#artibody'):
        articleall = soup.select('#artibody')[0].text.strip()
    else:
        articleall = soup.select('#article')[0].text.strip()

    # Editor: the last paragraph reads '责任编辑:xxx'; keep only the name
    if soup.select('#article p'):
        editor = soup.select('#article p')[-1].text.split('责任编辑:')[-1].strip()
    else:
        editor = '异常爬取'

    return [title, timesource, place, editor, articleall]


def parseListLinks(url):
    """Fetch one page of the rolling-news list API and scrape every article on it."""
    newsdetail = []
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/74.0.3729.131 Safari/537.36'}
    res = requests.get(url, headers=headers)
    # The API returns JSONP; slice off the 'jQuery...(' prefix and ');' suffix.
    # The offsets are tied to this exact callback name (see the note below).
    jd = json.loads(res.text[47:-14])
    for ent in jd['result']['data']:
        newsdetail.append(getnewsdetail(ent['url']))
    return newsdetail


url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2509&k=&num=50&page={}&r=0.7778780795677847&callback=jQuery1112046350965709357705_1620651288029&_=1620651288032'
news_total = []
for i in range(2):
    newsurl = url.format(i)
    newsary = parseListLinks(newsurl)
    news_total.extend(newsary)
    print(i)

tuplist = tuple(news_total)
db = pymysql.connect(host='localhost', user='root', password='1229',
                     database='lianxi', charset='utf8')
cursor = db.cursor()
sql_xinwen2 = 'INSERT INTO xinwen2 VALUES (%s,%s,%s,%s,%s)'
try:
    cursor.executemany(sql_xinwen2, tuplist)
    db.commit()
except Exception:
    print('Insert failed, rolling back')
    db.rollback()
db.close()
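One fragile spot: json.loads(res.text[47:-14]) depends on the exact length of the callback=jQuery... name baked into the URL, so changing that parameter silently corrupts the slice. A more defensive approach is to peel off the JSONP wrapper with a regular expression. This is a minimal sketch, assuming the response always has the standard callbackName({...}); shape; strip_jsonp is a hypothetical helper, not part of the original script:

import json
import re

def strip_jsonp(text):
    """Extract and parse the JSON payload from a JSONP response.

    Assumes the standard shape: callbackName({...});
    """
    match = re.search(r'^[^(]*\((.*)\)\s*;?\s*$', text, re.S)
    if match is None:
        raise ValueError('response does not look like JSONP')
    return json.loads(match.group(1))

# Inside parseListLinks, the hard-coded slice could then become:
#     jd = strip_jsonp(res.text)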
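For reference, INSERT INTO xinwen2 VALUES (%s,%s,%s,%s,%s) assumes a five-column table that already exists; the post never shows its schema. A hypothetical setup script might look like the following, where the column names and types are assumptions and only the five-column order (title, time, source, editor, body) is implied by the code:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='1229',
                     database='lianxi', charset='utf8')
with db.cursor() as cursor:
    # Column names/types are assumed; TEXT for the body, since article
    # text easily exceeds VARCHAR limits.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS xinwen2 (
            title    VARCHAR(255),
            pub_time VARCHAR(64),
            source   VARCHAR(128),
            editor   VARCHAR(64),
            body     TEXT
        ) CHARACTER SET utf8
    """)
db.close()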
This is a crawler for Sina news data (the rolling-news API above belongs to sina.com.cn); other news sites will be scraped later.