zoukankan      html  css  js  c++  java
  • 论文爬取(一)

    论文爬取爬虫

    # -*- coding:utf-8 -*-
    import requests
    import re
    import json
    import mysql

    # Header mapping with a single User-Agent entry that identifies the client
    # as desktop Chrome 83 on 64-bit Windows 10.
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"
    }


    # Site root that every scraped relative link is resolved against.
    _CVF_ROOT = "https://openaccess.thecvf.com"
    # Seconds to wait for a single response before giving up on that page.
    _REQUEST_TIMEOUT = 30

    # One conference entry of the top-level menu:
    # (title, main-conference href, workshops href without the "/menu" tail).
    # The square brackets are literal text on the page, so they are escaped;
    # whitespace between the pieces is optional.
    _CONFERENCE_RE = re.compile(
        r'<dd>\s*([^<]*?)\s*'
        r'\[<a href="([^"]*)">Main Conference</a>\]\s*'
        r'\[<a href="([^"]*?)/menu[^"]*">Workshops</a>\]\s*</dd>'
    )
    # The first <dl> of a workshops menu holds the list of workshops.
    _WORKSHOP_LIST_RE = re.compile(r'<dl>(.*?)</dl>', re.DOTALL)
    # One workshop entry: (last path segment of the href, link text).
    _WORKSHOP_RE = re.compile(
        r'<a href="/?(?:\w+/)?(\w+)">(.*?)</a><br><br>.*?</dd>', re.DOTALL
    )
    # Link to a paper's detail page on a workshop listing.
    _PAPER_LINK_RE = re.compile(r'<dt class="ptitle"><br><a href="(.*?)">')
    # Paper detail page: PDF url, abstract, then the BibTeX fields
    # author / title / booktitle / month / year, in that order.
    _PAPER_RE = re.compile(
        r'<meta name="citation_pdf_url" content="(.*?)">'
        r'.*?<div id="abstract"\s*>(.*?)</div>'
        r'.*?author\s*=\s*\{(.*?)\}'
        r'.*?title\s*=\s*\{(.*?)\}'
        r'.*?booktitle\s*=\s*\{(.*?)\}'
        r'.*?month\s*=\s*\{(.*?)\}'
        r'.*?year\s*=\s*\{(.*?)\}',
        re.DOTALL,
    )


    def _fetch_text(url):
        """Return the body of *url* as text, or None when the request fails.

        The module-level ``headers`` are sent with every request and a timeout
        is applied so that one unresponsive page cannot hang the whole crawl.
        Failures are printed and reported as None so the caller can skip the
        page and carry on.
        """
        try:
            response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(url, exc)
            return None
        return response.text


    def _absolute_url(href):
        """Resolve a site-relative *href* against the site root.

        Works whether or not *href* starts with a slash.
        """
        return _CVF_ROOT + '/' + href.lstrip('/')


    def getData():
        """Crawl workshop papers from openaccess.thecvf.com and store them.

        Walks menu -> per-conference workshops menu -> workshop listing ->
        paper page. For every paper page that matches the expected layout, one
        record ``[author, title, booktitle, "month,year", abstract, pdf_url]``
        is handed to ``mysql.insert_item`` wrapped in a one-element list.

        Pages that cannot be fetched or that do not match the expected layout
        are skipped instead of aborting the crawl. Returns None.
        """
        menu_html = _fetch_text(_CVF_ROOT + "/menu")
        if menu_html is None:
            return
        # Normalise the entities/tags that sit between the title and the links.
        # (The text must keep its ordinary spaces: the patterns rely on them.)
        menu_html = menu_html.replace('&nbsp;', ' ').replace('<br>', '')
        conferences = _CONFERENCE_RE.findall(menu_html)
        print("会议有"+str(len(conferences)))
        print(conferences)

        for _title, _main_href, workshops_href in conferences:
            workshops_root = _absolute_url(workshops_href)
            workshops_menu_url = workshops_root + '/menu'
            workshops_html = _fetch_text(workshops_menu_url)
            if workshops_html is None:
                continue
            # Older menus link to "*.py" pages; the suffix is dropped so the
            # captured names can be used directly as path segments.
            workshops_html = workshops_html.replace('.py', '')
            print(workshops_menu_url)

            workshop_lists = _WORKSHOP_LIST_RE.findall(workshops_html)
            if not workshop_lists:
                # No <dl> on the page: nothing to crawl for this conference.
                continue
            workshops = _WORKSHOP_RE.findall(workshop_lists[0])
            print(workshops)

            for slug, _name in workshops:
                workshop_url = workshops_root + '/' + slug
                print(workshop_url)
                listing_html = _fetch_text(workshop_url)
                if listing_html is None:
                    continue

                for paper_href in _PAPER_LINK_RE.findall(listing_html):
                    paper_url = _absolute_url(paper_href)  # paper detail page
                    print(paper_url)
                    paper_html = _fetch_text(paper_url)
                    if paper_html is None:
                        continue
                    paper_html = paper_html.replace('&nbsp;', ' ')
                    matches = _PAPER_RE.findall(paper_html)
                    print(matches)
                    if not matches:
                        # Unexpected page layout: skip rather than crash.
                        continue

                    pdf_url, abstract, author, title, booktitle, month, year = matches[0]
                    record = [
                        author,              # authors
                        title,               # paper title
                        booktitle,           # proceedings name
                        month + ',' + year,  # publication date
                        abstract,            # abstract
                        pdf_url,             # PDF link
                    ]
                    mysql.insert_item([record])


    if __name__ == '__main__':
        # Start the crawl only when the file is executed as a script,
        # not when it is imported.
        getData()
  • 相关阅读:
    屏幕截图 从安卓模拟器中识别出屏幕文字
    srcset
    Bitwise and Bit Shift Operators 位运算 取反 补码
    text recognizer (OCR) Engine 光学字符识别
    删除目录下 某类名字的文件
    appmaptile
    登录框
    将代码设置的剪切板内容通过输入法软件粘贴入app搜索框
    面向问题的高级语言
    使用心理视觉来进行图像处理
  • 原文地址:https://www.cnblogs.com/mumulailai/p/14912235.html
Copyright © 2011-2022 走看看