  • Hot-Word Statistics for Top Conferences (1)

    After finishing the Python environment setup (the script below needs the third-party packages requests, pymysql, jieba, and lxml), the first step is to crawl the data:

    import requests
    import pymysql
    from jieba.analyse import extract_tags  # TF-IDF keyword extraction
    from lxml import etree  # HTML parsing
    import time
    
    # Connect to the local MySQL database that will store the crawled papers
    db = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        password='liutianwen0613',
        db='cloud',
        charset='utf8'
    )
    cursor = db.cursor()
    
    
    # Spider: fetches one day's paper list from the CVF open-access site
    
    class Spider():
        def __init__(self):
            # Listing page to crawl; swap in one of the commented-out URLs
            # below to crawl a different day or conference
            self.url = 'https://openaccess.thecvf.com/CVPR2018?day=2018-06-21'
            #self.url='https://openaccess.thecvf.com/CVPR2018?day=2018-06-20'
            #self.url='https://openaccess.thecvf.com/CVPR2018?day=2018-06-19'
            #self.url = 'https://openaccess.thecvf.com/CVPR2019?day=2019-06-18'
            #self.url = 'https://openaccess.thecvf.com/CVPR2019?day=2019-06-19'
            #self.url = 'https://openaccess.thecvf.com/CVPR2019?day=2019-06-20'
            #self.url = 'https://openaccess.thecvf.com/ICCV2019?day=2019-10-29'
            #self.url = 'https://openaccess.thecvf.com/ICCV2019?day=2019-10-31'
            #self.url = 'https://openaccess.thecvf.com/CVPR2021?day=2021-06-22'
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 '
                              'Safari/537.36 '
            }
            # Fetch the listing page; apparent_encoding guards against a
            # mislabeled charset in the response headers
            r = requests.get(self.url, headers=self.headers)
            r.encoding = r.apparent_encoding
            self.html = r.text
    
        def lxml_find(self):
            '''Parse the listing with lxml and crawl every paper's detail page.'''
            number = 1
            start = time.time()  # start timing
            selector = etree.HTML(self.html)  # parse the listing HTML
            # hrefs of all paper detail pages (a list)
            titles = selector.xpath('//dt[@class="ptitle"]/a/@href')
            # start from entry 201 (earlier entries were presumably crawled in a previous run)
            for each in titles[200:]:
                title0 = each.strip()  # strip surrounding whitespace
                # rebuild the absolute URL of the paper's detail page
                chaolianjie = "https://openaccess.thecvf.com/content_cvpr_2018" + title0[17:]
                req = requests.get(chaolianjie, headers=self.headers)
                req.encoding = req.apparent_encoding
                selector1 = etree.HTML(req.text)
                title = selector1.xpath('//div[@id="papertitle"]/text()')
                abst = selector1.xpath('//div[@id="abstract"]/text()')
                hre0 = selector1.xpath('//a/@href')
                # the sixth <a> on the detail page points at the PDF; drop its
                # relative prefix and rebuild the absolute link
                hre = "https://openaccess.thecvf.com" + hre0[5][5:]
                # author names, taken from <dd><div id="authors"><b><i>...</i></b>
                zuozhe = selector1.xpath('//dd/div[@id="authors"]/b/i/text()')
                va = []
                # keep only the highest-weight TF-IDF keyword of the abstract;
                # initialize first in case extract_tags returns nothing
                keywords = ''
                for keywords, weight in extract_tags(abst[0].strip(), topK=1, withWeight=True):
                    print('%s %s' % (keywords, weight))
                va.append(title[0].strip())  # paper title
                va.append(hre)               # PDF link
                va.append(abst[0].strip())   # abstract
                va.append(zuozhe[0])         # authors
                va.append("2018-06-21")      # date of the crawled listing
                va.append(keywords)          # top keyword
                sql = "insert into cvpr (title,link,abstract,zuozhe,time,keywords) values (%s,%s,%s,%s,%s,%s)"
                cursor.execute(sql, va)
                db.commit()
    
                print("已爬取" + str(number) + "条数据")
                number = number + 1
    
            end = time.time()
            print('共耗时:', end - start)
    
    
    if __name__ == '__main__':
        spider = Spider()
        spider.lxml_find()
        cursor.close()
        db.close()
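
    One thing the script takes for granted is that a table named cvpr already exists in the cloud database. Below is a minimal sketch of a matching schema, run once before the spider; the column names come straight from the INSERT statement, but the types and lengths are my own assumptions:

    # Minimal sketch of the `cvpr` table the INSERT assumes.
    # Column names match the INSERT; types and lengths are assumptions.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS cvpr (
            id       INT AUTO_INCREMENT PRIMARY KEY,
            title    VARCHAR(500),
            link     VARCHAR(500),
            abstract TEXT,
            zuozhe   VARCHAR(500),
            time     VARCHAR(20),
            keywords VARCHAR(100)
        ) DEFAULT CHARSET = utf8
    """)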
    
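    The extract_tags call inside the loop does the actual "hot word" work: jieba ranks the words of the abstract by TF-IDF weight, and topK=1 keeps only the top one. Here is the same call in isolation (the sample abstract is made up for illustration):

    from jieba.analyse import extract_tags

    # Made-up sample abstract, just to show what extract_tags returns
    abstract = ("We propose a convolutional network for semantic segmentation "
                "and evaluate it on a standard detection benchmark.")
    # topK=1, withWeight=True -> at most one (keyword, weight) pair
    for keyword, weight in extract_tags(abstract, topK=1, withWeight=True):
        print(keyword, weight)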

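    The commented-out URLs in __init__ cover the other crawl targets, and the date "2018-06-21" is hard-coded into the inserted row, so every new day means editing the source. A hypothetical driver loop (not from the original script; it assumes __init__ is changed to accept url and day parameters, and that lxml_find uses self.day instead of the literal date) would avoid that:

    # Hypothetical driver: assumes Spider.__init__(self, url, day) stores both
    # values and lxml_find() inserts self.day instead of "2018-06-21"
    targets = [
        ('https://openaccess.thecvf.com/CVPR2018?day=2018-06-21', '2018-06-21'),
        ('https://openaccess.thecvf.com/CVPR2018?day=2018-06-20', '2018-06-20'),
        ('https://openaccess.thecvf.com/CVPR2018?day=2018-06-19', '2018-06-19'),
    ]
    for url, day in targets:
        Spider(url, day).lxml_find()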
      

    A complete data-crawling tutorial for Python beginners

  • Original article: https://www.cnblogs.com/ltw222/p/14892105.html