zoukankan      html  css  js  c++  java
  • 顶会热词爬取代码

    import pymysql
    import requests
    from lxml import etree
    
    
    class Spider:
        """Scrape paper metadata from the CVF open-access CVPR 2019 listing into MySQL.

        The listing page is fetched once to collect the per-paper page links;
        each paper page is then fetched and its title, abstract and first
        content link are inserted as one row of the ``paper`` table.
        """

        # Site root used to turn the relative links found in the pages into
        # absolute URLs.
        BASE_URL = "http://openaccess.thecvf.com/"
        # Seconds to wait for the server before giving up on a request, so a
        # single stalled connection cannot hang the whole crawl.
        REQUEST_TIMEOUT = 30

        def __init__(self):
            """Set the listing URL and request headers, and open the database connection."""
            self.url = "http://openaccess.thecvf.com/CVPR2019?day=2019-06-18"
            self.header = {
                "user-agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Mobile Safari/537.36"}
            # NOTE(review): credentials are hard-coded here; consider loading
            # them from the environment or a config file instead.
            self.db = pymysql.connect(host='localhost', port=3306, user='root', passwd='yangjiang', db='words',
                                      charset='utf8')
            self.cursor = self.db.cursor()
            # Absolute URLs of the individual paper pages, filled by getHtmlList().
            self.html_list = []

        def getHtmlList(self):
            """Fetch the listing page and append every paper page URL to ``self.html_list``."""
            response = requests.get(self.url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
            html_body = etree.HTML(response.text)
            hrefs = html_body.xpath("//dt[@class='ptitle']/a/@href")
            self.html_list.extend(self.BASE_URL + href for href in hrefs)

        def getContent(self, url):
            """Fetch one paper page and insert its metadata into the ``paper`` table.

            Failures (network errors, HTTP error statuses, pages missing an
            expected element, database errors) are printed and swallowed so
            that one bad page does not abort the crawl.

            Args:
                url: Absolute URL of a paper page.
            """
            try:
                response = requests.get(url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
                # Do not parse an HTTP error page as if it were a paper.
                response.raise_for_status()
                body = etree.HTML(response.text)
                titles = body.xpath("//div[@id='papertitle']/text()")
                abstracts = body.xpath("//div[@id='abstract']/text()")
                links = body.xpath("//div[@id='content']//a/@href")
                if not (titles and abstracts and links):
                    # Report the missing element explicitly instead of a bare
                    # "list index out of range".
                    print("{}: missing title, abstract or link, skipped".format(url))
                    return
                title = titles[0]
                abstract = abstracts[0]
                down_url = links[0].replace("../../", self.BASE_URL)
                year = 2019
                # Let the driver quote the values: scraped text routinely
                # contains quotes, which would break (or inject into) a
                # statement assembled with str.format.
                sql = "insert into paper(title,year,link,abstract,keywords) values(%s,%s,%s,%s,%s)"
                # The title doubles as the keywords column value.
                self.cursor.execute(sql, (title, year, down_url, abstract, title))
                self.db.commit()
                # Announce success only once the row is actually committed.
                print(title + "插入成功!")
            except Exception as e:
                print("{}: {}".format(url, e))

        def run(self):
            """Collect the paper page URLs, then scrape and store each one in turn."""
            self.getHtmlList()
            for url in self.html_list:
                self.getContent(url)
    
    
    # Script entry point: run the crawl, then release the database resources
    # even if the crawl raises part-way through.
    if __name__ == '__main__':
        spider = Spider()
        try:
            spider.run()
        finally:
            spider.cursor.close()
            spider.db.close()
  • 相关阅读:
    每周进度条(第九周)
    团队项目最后更改版
    项目需求分析与建议 NABCD模型
    课堂练习找水王
    问题账户需求分析
    2016年秋季个人阅读计划
    学习进度条
    用户体验
    程序员修炼之道——从小工到专家阅读笔记03
    程序员修炼之道——从小工到专家阅读笔记02
  • 原文地址:https://www.cnblogs.com/yongyuandishen/p/14908651.html
Copyright © 2011-2022 走看看