zoukankan      html  css  js  c++  java
  • 个人作业——CVPR顶会论文爬取

    main.py

    #保存单个界面数据
    def _parse_bibtex(entry):
        """Parse the brace-delimited fields of one BibTeX entry.

        Scans ``entry`` for ``name = {value}`` pairs and returns a dict that
        maps the lower-cased field name to its value with runs of whitespace
        collapsed to single spaces.  Nested braces inside a value are kept.
        Only brace-delimited values are recognised.
        """
        import re  # local import: the file's import block is not visible here

        field_start = re.compile(r'(\w+)\s*=\s*\{')
        fields = {}
        pos = 0
        while True:
            match = field_start.search(entry, pos)
            if match is None:
                break
            # Walk forward to the brace that closes this value.
            depth = 1
            end = match.end()
            while end < len(entry) and depth:
                if entry[end] == '{':
                    depth += 1
                elif entry[end] == '}':
                    depth -= 1
                end += 1
            # Drop the closing brace; an unterminated value runs to the end.
            raw = entry[match.end():end - 1] if depth == 0 else entry[match.end():end]
            fields[match.group(1).lower()] = ' '.join(raw.split())
            # Resume after the value so '=' or '{' inside it is never
            # mistaken for the start of another field.
            pos = end
        return fields

    def getInfo(url):
        """Scrape one proceedings page and store its papers in MySQL.

        Downloads ``url``, looks inside the first ``<dl>`` element for every
        element with class ``bibref pre-white-space``, parses its text as a
        BibTeX entry and inserts author, title, booktitle, month, year and
        pages into the ``paper`` table, skipping titles that are already
        stored.

        Args:
            url: Address of the page to scrape.

        Returns:
            None.  Pages without a ``<dl>`` element or without any BibTeX
            entries are skipped.
        """
        header = {
            'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Mobile Safari/537.36'
        }
        # A timeout keeps one unresponsive page from hanging the whole crawl.
        html = requests.get(url=url, headers=header, timeout=30).text
        soup = BeautifulSoup(html, 'lxml')
        dl = soup.find('dl')
        if dl is None:
            print('no <dl> paper list found, skipping:', url)
            return
        entries = dl.find_all(class_='bibref pre-white-space')
        if not entries:
            return

        columns = ('author', 'title', 'booktitle', 'month', 'year', 'pages')
        # NOTE(review): credentials are hard-coded in source; consider moving
        # them to configuration or environment variables.
        conn = pymysql.connect(host='39.106.103.180',port=3306,user='root',password='E6B3628525e4',database='user')
        try:
            with conn.cursor() as cursor:
                for entry in entries:
                    print('----------------------------------------------------------------------------------------------------------')
                    print(entry.text)
                    fields = _parse_bibtex(entry.text)
                    row = tuple(fields.get(name, '') for name in columns)
                    print(row)
                    title = fields.get('title', '')
                    if not title:
                        # Without a title the duplicate check is meaningless.
                        continue
                    # Placeholders let the driver do the quoting, so titles
                    # with quotes or other SQL metacharacters are stored
                    # verbatim and cannot alter the statement.
                    already_stored = cursor.execute(
                        "select 1 from paper where title=%s limit 1", (title,)
                    )
                    if already_stored == 0:
                        cursor.execute(
                            "insert into paper(author,title,booktitle,month,year,pages) "
                            "values(%s,%s,%s,%s,%s,%s)",
                            row,
                        )
                        # Commit per paper so a later failure keeps earlier rows.
                        conn.commit()
        finally:
            conn.close()
    
    # Crawl driver: read the conference menu and scrape every linked page.
    from urllib.parse import urljoin

    url='https://openaccess.thecvf.com/menu'
    headers={
        'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Mobile Safari/537.36'
    }
    # A timeout keeps the script from hanging forever on an unresponsive server.
    html=requests.get(url=url,headers=headers,timeout=30).text
    soup=BeautifulSoup(html,'lxml')
    # Every <dd> is walked for its <a> links; each link is passed to getInfo().
    dds=soup.find_all('dd')
    for dd in dds:
        print('----------------------------------------------------------------------------------------------------------------------')
        # href=True skips anchors without a target instead of raising KeyError.
        for link in dd.find_all('a', href=True):
            # urljoin resolves the href against the menu URL; the previous
            # str.strip('menu') / strip('.py') calls removed *character sets*
            # from both ends and only produced the right URL by coincidence.
            url_MainConference=urljoin(url, link['href'].strip())
            # Pages linked with a '.py' suffix are requested without it.
            if url_MainConference.endswith('.py'):
                url_MainConference=url_MainConference[:-len('.py')]
            print(url_MainConference)
            getInfo(url_MainConference)

    注意:数据库各字段(尤其是 author、title、booktitle)需要设置足够的长度,否则较长的内容在插入时可能被截断或报错。

  • 相关阅读:
    获取<input type="checkbox" >控件的checked值
    网站IIS部署及调试
    winform窗体全屏实现
    ComBox控件的使用
    在vs2005项目中,将.sln文件和.suo文件放在一个独立的文件夹内
    .NET面试题整理
    《url重写——更友好的网站url设计》
    char、varchar、nvarchar三者间的区别
    操作符"??"的用法
    Maven 学习笔记(三)
  • 原文地址:https://www.cnblogs.com/1305536110-dym/p/14900322.html
Copyright © 2011-2022 走看看