zoukankan      html  css  js  c++  java
  • python的数据爬取

    import requests
    import jieba
    import pymysql
    from bs4 import BeautifulSoup
    
    #链接到本地数据库
    from jieba.analyse import extract_tags
    
    # Connection settings for the local MySQL server; the target schema is `cvpr`.
    db_settings = {
        'host': '127.0.0.1',
        'port': 3306,
        'user': 'root',
        'password': '1224',
        'database': 'cvpr',
        'charset': 'utf8',
    }
    db = pymysql.connect(**db_settings)

    # Cursor used to issue statements on this connection.
    cursor = db.cursor()
    
    # HTTP request headers: present a desktop-browser User-Agent to the server.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    }
    # Listing page of the CVPR 2019 open-access papers for 2019-06-18.
    url = "https://openaccess.thecvf.com/CVPR2019?day=2019-06-18"
    # The headers must be passed explicitly or requests will not send them;
    # the timeout keeps an unresponsive server from blocking the script forever.
    html = requests.get(url, headers=headers, timeout=30)
    # Parse the downloaded page with Beautiful Soup.
    soup = BeautifulSoup(html.content, 'html.parser')

    # Collect every <a> element whose text is exactly "pdf" (one per paper).
    pdfs = soup.find_all("a", string="pdf")
    print(len(pdfs))
    
    # Visit the HTML page of every paper found on the listing page and collect
    # one dict per paper (title, author, abstract, link, code, keywords) in `lis`.
    lis = []
    for i, pdf in enumerate(pdfs):
        # Derive the paper's base name from the PDF file name in the link.
        pdf_name = pdf["href"].split('/')[-1]
        name = pdf_name.split('.')[0].replace("_CVPR_2019_paper", "")
        link = "http://openaccess.thecvf.com/content_CVPR_2019/html/" + name + "_CVPR_2019_paper.html"
        url1 = link
        print(url1)
        print(i)

        # A single unreachable page must not abort the whole crawl, because
        # nothing is written to the database until every page has been visited.
        try:
            html1 = requests.get(url1, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print("skipped, request failed:", url1, exc)
            continue
        # A Response is falsy for 4xx/5xx status codes; such pages are skipped.
        if not html1:
            continue

        soup1 = BeautifulSoup(html1.content, 'html.parser')

        # Abstract text. It is reset for every paper so that a page without an
        # abstract does not inherit the abstract of the previous paper.
        weizhi = soup1.find('div', attrs={'id': 'abstract'})
        jianjie = weizhi.get_text() if weizhi else ""

        authers = soup1.find_all(id="authors")
        # Author names: string content of the first <i> element on the page.
        auther = soup1.find("i")
        if not authers or auther is None:
            print("skipped, unexpected page layout:", url1)
            continue

        # Paper code: the text after the first '.' in the fourth child node of
        # the authors element, with surrounding whitespace removed.
        try:
            code = authers[0].contents[3].split('.')[1].strip()
        except (IndexError, AttributeError, TypeError):
            print("skipped, unexpected page layout:", url1)
            continue
        myauther = auther.string

        # Top five keywords extracted from the abstract, comma-separated.
        keywordliststr = ','.join(extract_tags(jianjie.strip(), topK=5))

        info = {
            'title': name,
            'author': myauther,
            'abstract': jianjie,
            'link': link,
            'code': code,
            'keywords': keywordliststr,
        }
        print(info.values())
        lis.append(info)
    print(lis)
    
    # Store every collected record in the `lunwen` table, then release the
    # cursor and the connection even if an insert fails part-way through.
    try:
        with db.cursor() as cursor:
            for info in lis:
                # Column list built from the record's keys, e.g. '`title`, `author`'.
                # The keys are fixed by this script, never taken from scraped data.
                cols = ", ".join('`{}`'.format(k) for k in info)
                print(cols)

                # Matching named placeholders, e.g. '%(title)s, %(author)s'.
                val_cols = ', '.join('%({})s'.format(k) for k in info)
                print(val_cols)

                sql = "insert into lunwen(%s) values(%s)"
                res_sql = sql % (cols, val_cols)
                print(res_sql)

                # The values are bound by the driver from the record dict.
                cursor.execute(res_sql, info)
                # Commit per row so rows already inserted survive a later failure.
                db.commit()
                print("ok")
    finally:
        db.close()

  • 相关阅读:
    高通电池曲线(转)
    随笔分类
    海思uboot启动流程详细分析(转)
    GPRS以TCP上传数据到服务器OK,但收不到服务器下发的数据
    WIS800C TCP Client 非透传模式连接远程服务器使用方法(转)
    MySQL 不允许从远程访问的解决方法
    Linux下将Mysql和Apache加入到系统服务里的方法
    linux服务器出现严重故障后的原因以及解决方法
    Linux防火墙该如何设置
    Linux下的两个聊天命令的使用方法
  • 原文地址:https://www.cnblogs.com/da48/p/14910820.html
Copyright © 2011-2022 走看看