Top-conference hot-word crawler:
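Both scripts below write into a MySQL database named paper: the CVPR script inserts into a table pypaper1 and the WACV script into pypaper, with identical column lists. The source never shows the schema itself, so here is a minimal one-off setup sketch; the column names come from the scripts' INSERT statement (including the 'herf' spelling), but the types and widths are assumptions, not the author's actual schema. The WACV script needs the same shape under the name pypaper.

# One-off setup sketch: creates the target table via pymysql.
# Column names match the INSERT in the scripts; types/widths are assumed.
import pymysql

conn = pymysql.connect(host='localhost', user="root", passwd="123456", database="paper")
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS pypaper1 (
            name     VARCHAR(512),  -- paper title
            herf     VARCHAR(512),  -- relative link to the paper page ('herf' as spelled in the scripts)
            writer   TEXT,          -- '+'-joined author list
            Abstract TEXT,          -- full abstract text
            time     VARCHAR(16),   -- publication year, stored as a string
            keywords VARCHAR(512)   -- '+'-joined keywords from jieba
        )
    """)
conn.commit()
conn.close()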
CVPR 2021 version (writes to table pypaper1):

# -*- coding: utf-8 -*-
# @Time     : 2021/6/12 23:34
# @Author   : 伏珊瑞
# @File     : test2021.py
# @Software : PyCharm
import re
import urllib.request

import jieba.analyse
import pymysql
from bs4 import BeautifulSoup

conn = pymysql.connect(host='localhost', user="root", passwd="123456", database="paper")
cursor = conn.cursor()  # get a cursor
Sql = "insert into pypaper1(name,herf,writer,Abstract,time,keywords) values(%s,%s,%s,%s,%s,%s)"

def main():  # entry point, called at the bottom of the file
    html = askURL("https://openaccess.thecvf.com/CVPR2021?day=all")
    getherf(html)
    conn.commit()
    cursor.close()
    conn.close()

def getAbstract(url):  # scrape one paper's abstract
    html = askURL(url)  # fetch the paper page's HTML as text
    data = ""
    bs = BeautifulSoup(html, "html.parser")
    findlink_herf = re.compile(r' (.*?) ')  # regex matching the abstract text
    a = bs.find_all(id="abstract")
    for item in a:
        item = str(item)
        data = re.findall(findlink_herf, item)[0]
    return data

def getherf(html):  # scrape every paper record and write it to the database
    bs = BeautifulSoup(html, "html.parser")
    a = bs.find_all(class_="ptitle")
    b = bs.find_all("dd")  # each <dd> block holds one paper's metadata
    findlink_herf = re.compile(r'<a href="(.*?)">')  # regex for the paper link
    findlink_name = re.compile(r'<a href="(.*?)">(.*?)</a></dt>')  # regex for the paper title
    findlink_writer = re.compile(r'">(.*?)</a>')  # regex for the authors
    TEMP = 1  # the <dd> list interleaves author rows with pdf rows, so only every second entry is author data
    inta = 1  # progress counter
    for item in a:
        item = str(item)
        name = str(b[TEMP])
        link_href = re.findall(findlink_herf, item)[0]
        link_name = re.findall(findlink_name, item)[0]
        writer = re.findall(findlink_writer, name)
        link_writer = ""
        for s in writer:
            link_writer += s + "+"
        link_Abstract = getAbstract("https://openaccess.thecvf.com/" + link_href)  # the scraped link is relative, so prepend the host
        keywords = ""
        p = 0
        for word in jieba.analyse.extract_tags(link_Abstract):  # join the keyword list into a '+'-separated string
            keywords += word + "+"
            p = p + 1
            if p == 5:  # keep at most five keywords per paper
                break
        cursor.execute(Sql, (link_name[1], link_href, link_writer, link_Abstract, "2021", keywords))  # write to the database
        TEMP += 2
        print(inta)
        inta += 1

def askURL(url):  # fetch a page's HTML text
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=header)
    response = urllib.request.urlopen(request)
    html = response.read().decode("UTF-8")
    return html

main()  # run the crawler
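The manual counter that stops after five keywords can be expressed directly with extract_tags' topK parameter, which jieba supports; a minimal sketch (the abstract text is a placeholder):

# Equivalent keyword extraction using jieba's built-in topK limit
# instead of the manual counter; the abstract string is a placeholder.
import jieba.analyse

abstract = "We propose a novel transformer architecture for image segmentation ..."
keywords = "+".join(jieba.analyse.extract_tags(abstract, topK=5)) + "+"
print(keywords)

Note that jieba is primarily a Chinese tokenizer; on English abstracts like these, extract_tags amounts to TF-IDF ranking over simple token splits, which still yields usable hot-word candidates.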
WACV 2021 version (writes to table pypaper; same structure, but with a try/except around each record and no cap on the keyword count):

# -*- coding: utf-8 -*-
# @Time     : 2021/6/12 23:34
# @Author   : 伏珊瑞
# @File     : test2021.py
# @Software : PyCharm
import re
import urllib.request

import jieba.analyse
import pymysql
from bs4 import BeautifulSoup

conn = pymysql.connect(host='localhost', user="root", passwd="123456", database="paper")
cursor = conn.cursor()  # get a cursor
Sql = "insert into pypaper(name,herf,writer,Abstract,time,keywords) values(%s,%s,%s,%s,%s,%s)"

def main():
    html = askURL("https://openaccess.thecvf.com/WACV2021")
    getherf(html)
    conn.commit()
    cursor.close()
    conn.close()

def getAbstract(url):  # scrape one paper's abstract
    html = askURL(url)
    data = ""
    bs = BeautifulSoup(html, "html.parser")
    findlink_herf = re.compile(r' (.*?) ')  # regex matching the abstract text
    a = bs.find_all(id="abstract")
    for item in a:
        item = str(item)
        data = re.findall(findlink_herf, item)[0]
    return data

def getherf(html):  # scrape every paper record and write it to the database
    bs = BeautifulSoup(html, "html.parser")
    a = bs.find_all(class_="ptitle")
    b = bs.find_all("dd")  # each <dd> block holds one paper's metadata
    findlink_herf = re.compile(r'<a href="(.*?)">')  # regex for the paper link
    findlink_name = re.compile(r'<a href="(.*?)">(.*?)</a></dt>')  # regex for the paper title
    findlink_writer = re.compile(r'">(.*?)</a>')  # regex for the authors
    TEMP = 0  # author rows alternate with pdf rows in the <dd> list, so step by two
    inta = 1  # progress counter
    for item in a:
        try:
            item = str(item)
            name = str(b[TEMP])
            link_href = re.findall(findlink_herf, item)[0]
            link_name = re.findall(findlink_name, item)[0]
            writer = re.findall(findlink_writer, name)
            link_writer = ""
            for s in writer:
                link_writer += s + "+"
            link_Abstract = getAbstract("https://openaccess.thecvf.com/" + link_href)  # relative link, prepend the host
            keywords = ""
            for word in jieba.analyse.extract_tags(link_Abstract):  # join the keyword list into a '+'-separated string
                keywords += word + "+"
            cursor.execute(Sql, (link_name[1], link_href, link_writer, link_Abstract, "2021", keywords))  # write to the database
            TEMP += 2
            print(inta)
            inta += 1
        except Exception as err:  # log the failure and move on to the next paper
            print(err)

def askURL(url):  # fetch a page's HTML text
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=header)
    response = urllib.request.urlopen(request)
    html = response.read().decode("UTF-8")
    return html

main()  # run the crawler
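Once the rows are stored, the hot words themselves come from counting how often each keyword recurs across papers. The source only covers the crawling side, so the query below is an assumption; a minimal sketch that reads the '+'-joined keywords column written by the scripts above and tallies term frequency:

# Hot-word tally sketch: reads the '+'-joined keywords column written by
# the crawlers and counts keyword frequency across all stored papers.
# Table/column names match the scripts; the aggregation step itself is
# not part of the source.
from collections import Counter

import pymysql

conn = pymysql.connect(host='localhost', user="root", passwd="123456", database="paper")
with conn.cursor() as cursor:
    cursor.execute("SELECT keywords FROM pypaper1")
    rows = cursor.fetchall()
conn.close()

counts = Counter()
for (kw,) in rows:
    counts.update(w for w in kw.split("+") if w)  # drop the empty tail after the last '+'

for word, n in counts.most_common(20):
    print(word, n)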