# 论文爬取爬虫 (paper crawler for CVF Open Access)
# -*- coding:utf-8 -*-
import requests
import re
import json
import mysql
# HTTP request headers carrying a desktop Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36(KHTML, like Gecko) "
        "Chrome/83.0.4103.106 Safari/537.36"
    ),
}
# Site root; hrefs captured from the pages are appended to it.
_BASE_URL = 'https://openaccess.thecvf.com'

# NOTE: the bracket, \w and \s escapes below are written explicitly. Without
# them "[...]" is a character class and "w+" / "s+" match literal letters, so
# the patterns cannot match the markup they describe.

# Top-level menu row -> (conference title, main-conference href,
# workshops href with the trailing "/menu..." removed).
_CONFERENCE_RE = re.compile(
    r'<dd>(.*?)\s*\[<a href="(.*?)">Main Conference</a>\]'
    r'\s*\[<a href="(.*?)/menu.*?">Workshops</a>\]\s*</dd>'
)
# First <dl>...</dl> section of a workshops menu page.
_WORKSHOP_LIST_RE = re.compile(r'<dl>(.*?)</dl>', re.DOTALL)
# Workshop entry inside that section -> (last path segment of href, link text).
_WORKSHOP_LINK_RE = re.compile(
    r'<a href="/?(?:\w+/)?(\w+)">(.*?)</a><br><br>.*?</dd>', re.DOTALL
)
# Paper entry on a workshop listing page -> href of the paper page.
_PAPER_LINK_RE = re.compile(r'<dt class="ptitle"><br><a href="(.*?)">')
# Paper page -> (pdf url, abstract, author, title, booktitle, month, year).
_PAPER_DETAIL_RE = re.compile(
    r'<meta name="citation_pdf_url" content="(.*?)">'
    r'.*?<div id="abstract">(.*?)</div>'
    r'.*?author\s*=\s*\{(.*?)\}'
    r'.*?title\s*=\s*\{(.*?)\}'
    r'.*?booktitle\s*=\s*\{(.*?)\}'
    r'.*?month\s*=\s*\{(.*?)\}'
    r'.*?year\s*=\s*\{(.*?)\}',
    re.DOTALL,
)


def _fetch(url, timeout):
    """Return the body of *url* as text, sending the module-level ``headers``."""
    return requests.get(url, headers=headers, timeout=timeout).text


def _strip_nbsp(html):
    """Return *html* without non-breaking spaces (``&nbsp;`` entity or U+00A0).

    Ordinary spaces are kept on purpose: removing them would make patterns
    that contain ``<a href=`` or ``<meta name=`` impossible to match.
    """
    return html.replace('&nbsp;', '').replace('\xa0', '')


def _store_paper(paper_url, timeout):
    """Fetch one paper page and pass its metadata row to ``mysql.insert_item``.

    Pages on which the metadata pattern does not match are reported and
    skipped instead of raising IndexError.
    """
    page = _strip_nbsp(_fetch(paper_url, timeout))
    keyC = _PAPER_DETAIL_RE.findall(page)
    print(keyC)
    if not keyC:
        print("paper metadata not found, skipping: " + paper_url)
        return
    pdf_url, abstract, author, title, booktitle, month, year = keyC[0]
    # Row layout: author, title, book title, "month,year", abstract, pdf link.
    row = [author, title, booktitle, month + ',' + year, abstract, pdf_url]
    # NOTE(review): the original nesting of this call is not recoverable from
    # the file, so rows are inserted one paper at a time. Assumes insert_item
    # accepts a list of rows -- TODO confirm against the local mysql module.
    mysql.insert_item([row])


def getData(timeout=30):
    """Crawl workshop papers from openaccess.thecvf.com and store them.

    Steps, each driven by a regular expression over the fetched HTML:
      1. read the site menu and collect every conference's workshops href;
      2. read each workshops menu and collect the individual workshop pages;
      3. read each workshop page and collect the paper page hrefs;
      4. read each paper page, extract author, title, book title, date,
         abstract and PDF link, and hand the row to ``mysql.insert_item``.

    Pages that do not match the expected markup are skipped with a message.

    Args:
        timeout: seconds passed to every ``requests.get`` call.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        are not caught here and propagate to the caller.
    """
    menu_html = _strip_nbsp(_fetch(_BASE_URL + '/menu', timeout)).replace('<br>', '')
    keyA = _CONFERENCE_RE.findall(menu_html)
    print("会议有"+str(len(keyA)))
    print(keyA)

    # Only the workshops href of each conference row is followed.
    for _title, _main_href, workshops_href in keyA:
        workshops_url = _BASE_URL + workshops_href
        menu_url = workshops_url + '/menu'
        # Dropping ".py" lets the \w+ path segments in the link pattern match.
        listing_html = _fetch(menu_url, timeout).replace('.py', '')
        print(menu_url)

        section = _WORKSHOP_LIST_RE.search(listing_html)
        if section is None:
            print("no <dl> section found, skipping: " + menu_url)
            continue
        keyB = _WORKSHOP_LINK_RE.findall(section.group(1))
        print(keyB)

        for slug, _link_text in keyB:
            workshop_url = workshops_url + '/' + slug
            print(workshop_url)
            workshop_html = _fetch(workshop_url, timeout)
            for paper_href in _PAPER_LINK_RE.findall(workshop_html):
                paper_url = _BASE_URL + paper_href  # link to the paper page
                print(paper_url)
                _store_paper(paper_url, timeout)
# Script entry point: run the crawl only when executed directly, not on import.
if "__main__" == __name__:
    getData()