  • Daily Summary 6.12

    Today I managed to scrape the paper data.

    Python code:

    import requests
    import pymysql
    from bs4 import BeautifulSoup
    
    from jieba.analyse import extract_tags
    
    # Connect to the local MySQL database
    db = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456', database='z1', charset='utf8')
    
    cursor = db.cursor()
    
    # Define the request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 "
                      "Safari/537.36 "
    }
    # Fetch the paper listing page with a GET request
    # url = "http://openaccess.thecvf.com/CVPR2019.py"
    url = "https://openaccess.thecvf.com/CVPR2019?day=2019-06-18"
    html = requests.get(url, headers=headers)
    # Parse the listing page with Beautiful Soup
    soup = BeautifulSoup(html.content, 'html.parser')
    
    pdfs = soup.find_all("a", string="pdf")  # every "pdf" link on the listing page
    print(len(pdfs))
    
    lis = []       # collected paper records
    jianjie = ""   # abstract text of the current paper
    for i, pdf in enumerate(pdfs):
        pdf_name = pdf["href"].split('/')[-1]
        name = pdf_name.split('.')[0].replace("_CVPR_2019_paper", "")
        link = "https://openaccess.thecvf.com/content_CVPR_2019/html/" + name + "_CVPR_2019_paper.html"
        url1 = link
        print(url1)
        print(i)
        html1 = requests.get(url1, headers=headers)
        if html1:  # Response is truthy when the status code is < 400
            soup1 = BeautifulSoup(html1.content, 'html.parser')
            weizhi = soup1.find('div', attrs={'id': 'abstract'})
            if weizhi:
                jianjie = weizhi.get_text()
            authors = soup1.find_all(id="authors")
            # Paper number, parsed from the citation text in the authors block
            a = authors[0].contents[3]
            a_split = a.split('.')  # split on '.'
            code = a_split[1].strip()  # strip surrounding whitespace
            # Author list (the first <i> tag on the page)
            author = soup1.find("i")
            myauthor = author.string
            keywordlist = []
            # Top-5 keywords of the abstract, extracted with jieba
            for keyword, weight in extract_tags(jianjie.strip(), topK=5, withWeight=True):
                keywordlist.append(keyword)
            keywordliststr = ','.join(keywordlist)
            info = {
                'title': name,
                'author': myauthor,
                'abstract': jianjie,
                'link': link,
                'code': code,
                'keywords': keywordliststr}
            print(info.values())
            lis.append(info)
    print(lis)
    
    # Insert each collected record into the `lunwen` table
    for i in range(len(lis)):
        cols = ", ".join('`{}`'.format(k) for k in lis[i].keys())
        print(cols)  # '`name`, `age`'
    
        val_cols = ', '.join('%({})s'.format(k) for k in lis[i].keys())
        print(val_cols)  # '%(name)s, %(age)s'
    
        sql = "insert into lunwen(%s) values(%s)"
        res_sql = sql % (cols, val_cols)
        print(res_sql)
    
        cursor.execute(res_sql, lis[i])  # pass the dict so the %(key)s placeholders are filled
        db.commit()
        print("ok")
  • Original post: https://www.cnblogs.com/a8047/p/14902228.html