今日爬取了剩下的论文:ICCV 和 ECVA。ICCV 与 CVPR 在同一个网页上,所以爬取和解析的方式与之前一样,今天主要是 ECVA 的爬取以及解析。
def _ecva_div_text(page, div_id):
    """Return the text of the first ``<div id=div_id>`` in a parsed page."""
    return page.find_all("div", {"id": div_id})[0].text


def get_tencent_data_ECVA(start_index=400, *, timeout=30):
    """Crawl paper pages linked from the ECVA listing and store them in ``paper_data``.

    The listing page ``https://www.ecva.net/papers.php`` is fetched, every
    ``<dt class="ptitle">`` entry is turned into a paper URL, and each paper
    page is parsed for its title, authors and abstract.  One row per paper is
    inserted and committed immediately, so progress survives an interruption.

    Args:
        start_index: Number of leading paper URLs to skip because they were
            already stored by an earlier run.  The counter printed on success
            or failure is the value to pass here when resuming.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        ValueError: If ``start_index`` is negative.

    Errors while fetching the listing page or opening the database connection
    propagate to the caller.  Errors raised while processing individual papers
    are reported (message plus traceback) and stop the crawl without raising.
    """
    # Local import: the file-level import block is not visible from this block.
    from urllib.parse import urljoin

    if start_index < 0:
        raise ValueError("start_index must be non-negative")

    url_ECCV = 'https://www.ecva.net/papers.php'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    }
    sql = "insert into paper_data (title,authors,abstract,keyworld,url,yeardata,meet) values(%s,%s,%s,%s,%s,%s,%s)"
    meet = "ECVA"

    # NOTE(review): no parser is named, so BeautifulSoup picks whichever is
    # installed; kept as-is so parsing results do not change.
    res = requests.get(url_ECCV, headers=headers, timeout=timeout)
    soup = BeautifulSoup(res.text)

    # Hrefs on the listing page are resolved against the listing URL.
    url_paper = [
        urljoin(url_ECCV, entry.select("a")[0]['href'])
        for entry in soup.find_all("dt", {"class": "ptitle"})
    ]

    # The network is unreliable and keeps dropping, so the number of rows
    # already inserted is tracked; after a disconnect the crawl resumes from
    # that position instead of re-parsing earlier papers.
    num = start_index

    conn, cursor = get_conn()
    try:
        print("开始插入数据")
        for url in url_paper[start_index:]:
            res = requests.get(url, headers=headers, timeout=timeout)
            soup = BeautifulSoup(res.text)

            title = _ecva_div_text(soup, "papertitle")
            authors = _ecva_div_text(soup, "authors").replace(" ", "")
            abstract = _ecva_div_text(soup, "abstract")
            # `replace` is a helper defined elsewhere in the project;
            # presumably it derives keywords from the title - TODO confirm.
            keyworld = replace(title)

            # The year is taken from the URL path: an "eccv_2020" segment
            # means 2020, anything else is recorded as 2018.
            yeardata = 2020 if "eccv_2020" in url.split("/") else 2018

            print(url)
            cursor.execute(sql, [title, authors, abstract, keyworld, url, yeardata, meet])
            num += 1
            # Commit per paper so already-inserted rows are kept on failure.
            conn.commit()
            print("数据插入成功", num)
    except Exception:
        # Best-effort crawl: report where it stopped so it can be resumed.
        print("插入失败!", num)
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)