zoukankan      html  css  js  c++  java
  • 顶会热词爬取代码

    import pymysql
    import requests
    from lxml import etree
    
    
    class Spider:
        """Crawl the CVPR 2019 open-access listing and store each paper in MySQL.

        The listing page is fetched once to collect the per-paper page URLs;
        every paper page is then fetched and its title, abstract and first
        content link are inserted into the ``paper`` table.
        """

        # Seconds to wait for the server before giving up on a request.
        # Without a timeout a single stalled connection blocks the crawl forever.
        REQUEST_TIMEOUT = 30

        def __init__(self):
            """Set up the listing URL, request headers and the MySQL connection."""
            self.url = "http://openaccess.thecvf.com/CVPR2019?day=2019-06-18"
            self.header = {
                "user-agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Mobile Safari/537.36"}
            # NOTE(review): credentials are hard-coded; consider loading them
            # from configuration or the environment instead.
            self.db = pymysql.connect(host='localhost', port=3306, user='root', passwd='yangjiang', db='words',
                                      charset='utf8')
            self.cursor = self.db.cursor()
            # Absolute URLs of the individual paper pages, filled by getHtmlList().
            self.html_list = []

        def getHtmlList(self):
            """Fetch the listing page and append every paper page URL to ``html_list``.

            Raises:
                requests.RequestException: if the listing page cannot be fetched
                    (including when the request times out).
            """
            response = requests.get(self.url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
            html_body = etree.HTML(response.text)
            # Each paper title on the listing is a <dt class="ptitle"> wrapping a
            # link whose href is relative to the site root.
            hrefs = html_body.xpath("//dt[@class='ptitle']/a/@href")
            self.html_list.extend("http://openaccess.thecvf.com/" + href for href in hrefs)

        def getContent(self, url):
            """Scrape one paper page and insert it into the ``paper`` table.

            This is best-effort: any failure (network error, unexpected page
            layout, database error) is printed and the paper is skipped so that
            the rest of the crawl can continue.

            Args:
                url: Absolute URL of a single paper page.
            """
            try:
                response = requests.get(url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
                body = etree.HTML(response.text)
                title = body.xpath("//div[@id='papertitle']/text()")[0]
                abstract = body.xpath("//div[@id='abstract']/text()")[0]
                # First link inside the content div, rewritten from a relative
                # "../../" path to an absolute URL (presumably the PDF link).
                down_url = body.xpath("//div[@id='content']//a/@href")[0].replace("../../", "http://openaccess.thecvf.com/")
                year = 2019
                # Use driver-side parameter binding instead of string formatting:
                # titles and abstracts routinely contain quotes, which would
                # otherwise break the statement (and allow SQL injection).
                sql = "insert into paper(title,year,link,abstract,keywords) values(%s,%s,%s,%s,%s)"
                # The title is also stored in the keywords column, as in the
                # original script.
                self.cursor.execute(sql, (title, year, down_url, abstract, title))
                self.db.commit()
                # Report success only once the row has actually been committed.
                print(title + "插入成功!")
            except Exception as e:
                print(e)

        def run(self):
            """Collect all paper page URLs, then scrape and store each paper."""
            self.getHtmlList()
            for url in self.html_list:
                self.getContent(url)
    
    
    # Script entry point: build the crawler and run the full crawl.
    if __name__ == "__main__":
        crawler = Spider()
        crawler.run()
  • 相关阅读:
    LeetCode OJ--Sort Colors
    LeetCode OJ--Single Number II **
    LeetCode OJ--Single Number
    LeetCode OJ--Subsets II
    LeetCode OJ--ZigZag Conversion
    3ds Max学习日记(三)
    3ds Max学习日记(二)
    3ds Max学习日记(一)
    PokeCats开发者日志(十三)
    PokeCats开发者日志(十二)
  • 原文地址:https://www.cnblogs.com/yongyuandishen/p/14908651.html
Copyright © 2011-2022 走看看