zoukankan      html  css  js  c++  java
  • 顶会热词爬取代码

    import pymysql
    import requests
    from lxml import etree
    
    
    class Spider:
        """Crawl the CVPR 2019 open-access listing and store each paper in MySQL.

        The listing page is fetched once to collect the per-paper page URLs;
        each paper page is then fetched and its title, abstract and first
        content link are inserted as one row into the ``paper`` table
        (columns ``title, year, link, abstract, keywords``).
        """

        # Prefix prepended to the relative hrefs found on the listing page and
        # substituted for the "../../" prefix of download links.
        BASE_URL = "http://openaccess.thecvf.com/"
        # Seconds to wait on any single HTTP request; without a timeout one
        # stalled connection would block the whole crawl forever.
        TIMEOUT = 30
        # Value written to the ``year`` column for every row.
        YEAR = 2019

        def __init__(self):
            """Set the listing URL and request headers and open the MySQL connection."""
            self.url = "http://openaccess.thecvf.com/CVPR2019?day=2019-06-18"
            self.header = {
                "user-agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Mobile Safari/537.36"}
            # NOTE(review): credentials are hard-coded; consider loading them
            # from the environment or a config file.
            self.db = pymysql.connect(host='localhost', port=3306, user='root', passwd='yangjiang', db='words',
                                      charset='utf8')
            self.cursor = self.db.cursor()
            # Absolute URLs of the individual paper pages, filled by getHtmlList().
            self.html_list = []

        @staticmethod
        def _build_insert(title, year, link, abstract):
            """Return ``(sql, params)`` for inserting one paper row.

            The values are passed as driver parameters instead of being
            formatted into the statement, so quotes inside a title or abstract
            can neither break the SQL nor inject into it. The ``keywords``
            column receives the title, as in the original script.
            """
            sql = "insert into paper(title,year,link,abstract,keywords) values(%s,%s,%s,%s,%s)"
            return sql, (title, year, link, abstract, title)

        def getHtmlList(self):
            """Fetch the listing page and append every paper page URL to ``self.html_list``."""
            response = requests.get(self.url, headers=self.header, timeout=self.TIMEOUT)
            html_body = etree.HTML(response.text)
            if html_body is None:
                # Nothing parseable came back (e.g. an empty body): no links to add.
                return
            hrefs = html_body.xpath("//dt[@class='ptitle']/a/@href")
            self.html_list.extend(self.BASE_URL + href for href in hrefs)

        def getContent(self, url):
            """Fetch one paper page and insert its data into the ``paper`` table.

            Best effort: any failure (network, HTTP status, parsing, database)
            is reported on stdout together with the URL, and the method returns
            without raising so the crawl can continue with the next paper.
            """
            try:
                response = requests.get(url, headers=self.header, timeout=self.TIMEOUT)
                # Do not try to parse an error page as if it were a paper page.
                response.raise_for_status()
                body = etree.HTML(response.text)
                if body is None:
                    print(url + " skipped: empty or unparseable page")
                    return

                titles = body.xpath("//div[@id='papertitle']/text()")
                abstracts = body.xpath("//div[@id='abstract']/text()")
                links = body.xpath("//div[@id='content']//a/@href")
                if not (titles and abstracts and links):
                    # Explicit check instead of an opaque IndexError from [0].
                    print(url + " skipped: title, abstract or link not found")
                    return

                title = titles[0]
                abstract = abstracts[0]
                down_url = links[0].replace("../../", self.BASE_URL)

                sql, params = self._build_insert(title, self.YEAR, down_url, abstract)
                self.cursor.execute(sql, params)
                self.db.commit()
                # Report success only once the row is actually committed.
                print(title + "插入成功!")
            except Exception as e:
                # Per-paper boundary: report and move on rather than abort the run.
                print("{}: {}".format(url, e))

        def run(self):
            """Collect the paper page URLs, then process each of them in order."""
            self.getHtmlList()
            for url in self.html_list:
                self.getContent(url)
    
    
    if __name__ == '__main__':
        # Script entry point: build the crawler and run the full crawl once.
        crawler = Spider()
        crawler.run()
  • 相关阅读:
    setjmp和longjmp函数使用详解
    一文搞懂HMM(隐马尔可夫模型)
    Qt多工程多目录的编译案例
    HTML中Id和Name的区别
    字符识别中的图像归一化算法
    QT工程pro设置实践(with QtCreator)----非弄的像VS一样才顺手?
    暗通道优先的图像去雾算法
    callback用法简介
    ansible 批量部署准备工作
    MySQL高级管理
  • 原文地址:https://www.cnblogs.com/yongyuandishen/p/14908651.html
Copyright © 2011-2022 走看看