  • 2021-06-01

    Top-conference hot-word crawler: scrapes paper metadata (title, link, authors, abstract) from openaccess.thecvf.com, pulls keywords out of each abstract with jieba, and writes the rows to MySQL. The first script handles CVPR 2021:

    # -*- coding: utf-8 -*-
    # @Time : 2021/6/12 23:34
    # @Author : 伏珊瑞
    # @File : test2021.py
    # @Software : PyCharm
    import urllib.request
    import pymysql
    import jieba.analyse
    from bs4 import BeautifulSoup
    import re
    conn = pymysql.connect(host='localhost', user="root", passwd="123456", database="paper")
    # get a cursor
    cursor = conn.cursor()
    Sql="insert into pypaper1(name,herf,writer,Abstract,time,keywords) values(%s,%s,%s,%s,%s,%s)"  # NB: the column is spelled "herf" (sic) in the target table
    def main():  # entry point, called at the bottom of the script
        html=askURL("https://openaccess.thecvf.com/CVPR2021?day=all")
        getherf(html)
        conn.commit()
        cursor.close()
        conn.close()
    def getAbstract(url):   # scrape one paper's abstract
        html=askURL(url)       # fetch the page's HTML as text
        data=""
        bs = BeautifulSoup(html, "html.parser")
        findlink_herf = re.compile(r'<div id="abstract">\n(.*?)\n</div>')  # regex for the abstract text; the mirror stripped the HTML tags from this pattern, reconstructed here against the CVF page markup
        a=bs.find_all(id="abstract")
        for item in a:
            item=str(item)
            data=re.findall(findlink_herf,item)[0]
        return data
    def getherf(html):  # scrape every paper's fields and write them to the database
        bs=BeautifulSoup(html,"html.parser")
        a=bs.find_all(class_="ptitle")
        b = bs.find_all("dd")  # each paper's entry on the page is a <dd> block
        findlink_herf=re.compile(r'<a href="(.*?)">')  # regex for the paper's hyperlink
        findlink_name = re.compile(r'<a href="(.*?)">(.*?)</a></dt>')  # regex for the paper's title
        findlink_writer = re.compile(r'">(.*?)</a>')  # regex for the paper's authors
        TEMP=1  # the scrape picks up the "pdf" links along with the authors, so only every second <dd> holds author data
        inta=1  # counter
        for item in a:
            try:
                item=str(item)
                name=str(b[TEMP])
                link_href=re.findall(findlink_herf,item)[0]
                link_name = re.findall(findlink_name, item)[0]
                writer=re.findall(findlink_writer, name)
                link_writer=""
                for s in writer:
                    link_writer+=s+"+"
                link_Abstract=getAbstract("https://openaccess.thecvf.com/"+link_href)  # the paper link is relative, so prepend the site root
                keywords=""
                p=0
                for word in jieba.analyse.extract_tags(link_Abstract):  # join the keyword list into a "+"-separated string
                    keywords+=word+"+"
                    p=p+1
                    if(p==5):  # keep only the top five; extract_tags(link_Abstract, topK=5) would do this directly
                        break
                insert = cursor.execute(Sql, (link_name[1], link_href, link_writer, link_Abstract, "2021",keywords))  # write the row to the database
                TEMP+=2
                print(inta)
                inta+=1
            except Exception:
                # on a parse failure, dump whatever was extracted for this entry (same pattern as the WACV script below)
                print(link_name[1])
                print(link_href)
                print(link_writer)
                print(link_Abstract)
                print(keywords)
    def askURL(url):  # fetch a page's HTML text
        header = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'
        }
        request = urllib.request.Request(url, headers=header)
        response = urllib.request.urlopen(request)
        html = response.read().decode("UTF-8")
        return html
    main()  # run the scraper
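
    The INSERT above assumes a pypaper1 table already exists in the paper database. The post never shows the real DDL, so here is a minimal sketch of a schema the INSERT would fit; the column names come from the INSERT statement, while the types and lengths are assumptions:

    import pymysql

    conn = pymysql.connect(host='localhost', user="root", passwd="123456", database="paper")
    cursor = conn.cursor()
    # hypothetical schema: the original post does not include the CREATE TABLE
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS pypaper1 (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(512),      -- paper title
            herf VARCHAR(512),      -- relative paper link ("herf" [sic] as in the INSERT)
            writer VARCHAR(1024),   -- "+"-joined author list
            Abstract TEXT,          -- abstracts run long, so TEXT rather than VARCHAR
            time VARCHAR(16),       -- the scraper stores the year as a string ("2021")
            keywords VARCHAR(256)   -- "+"-joined jieba keywords
        ) CHARACTER SET utf8mb4
    """)
    conn.commit()
    conn.close()

    The second script below is the WACV 2021 variant of the same scraper: it targets the pypaper table, starts the <dd> offset at 0 instead of 1, keeps all jieba keywords rather than the top five, and has the try/except active.
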
    # -*- coding: utf-8 -*-
    # @Time : 2021/6/12 23:34
    # @Author : 伏珊瑞
    # @File : test2021.py
    # @Software : PyCharm
    import urllib.request
    import pymysql
    import jieba.analyse
    from bs4 import BeautifulSoup
    import re
    conn = pymysql.connect(host='localhost', user="root", passwd="123456", database="paper")
    # get a cursor
    cursor = conn.cursor()
    Sql="insert into pypaper(name,herf,writer,Abstract,time,keywords) values(%s,%s,%s,%s,%s,%s)"
    def main():
        html=askURL("https://openaccess.thecvf.com/WACV2021")
        getherf(html)
        conn.commit()
        cursor.close()
        conn.close()
        #getAbstract("https://openaccess.thecvf.com/content_WACV_2020/html/Sang_Inferring_Super-Resolution_Depth_from_a_Moving_Light-Source_Enhanced_RGB-D_Sensor_WACV_2020_paper.html")
    def getAbstract(url):
        html=askURL(url)
        data=""
        bs = BeautifulSoup(html, "html.parser")
        findlink_herf = re.compile(r'<div id="abstract">\n(.*?)\n</div>')  # reconstructed; the mirror stripped the HTML tags from this pattern
        a=bs.find_all(id="abstract")
        for item in a:
            item=str(item)
            data=re.findall(findlink_herf,item)[0]
        return data
    def getherf(html):
        bs=BeautifulSoup(html,"html.parser")
        a=bs.find_all(class_="ptitle")
        b = bs.find_all("dd")
        findlink_herf=re.compile(r'<a href="(.*?)">')
        findlink_name = re.compile(r'<a href="(.*?)">(.*?)</a></dt>')
        findlink_writer = re.compile(r'">(.*?)</a>')
        TEMP=0
        inta=1
        for item in a:
            try:
                item=str(item)
                name=str(b[TEMP])
                link_href=re.findall(findlink_herf,item)[0]
                link_name = re.findall(findlink_name, item)[0]
                writer=re.findall(findlink_writer, name)
                link_writer=""
                for s in writer:
                    link_writer+=s+"+"
                link_Abstract=getAbstract("https://openaccess.thecvf.com/"+link_href)
                keywords=""
                for word in jieba.analyse.extract_tags(link_Abstract):
                    keywords+=word+"+"
                # print(link_name[1])
                # print(link_href)
                # print(link_writer)
                # print(link_Abstract)
                # print(keywords)
                insert = cursor.execute(Sql, (link_name[1], link_href, link_writer, link_Abstract, "2021",keywords))
                #print("insert into pypaper values("+link_name[1]+","+link_href+","+link_writer+","+link_Abstract+",2021)")
                TEMP+=2
                print(inta)
                inta+=1
            except Exception:
                print(link_name[1])
                print(link_href)
                print(link_writer)
                print(link_Abstract)
                print(keywords)
    def askURL(url):
        header = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'
        }
        request = urllib.request.Request(url, headers=header)
        html = ""
        response = urllib.request.urlopen(request)
        html = response.read().decode("UTF-8")
        return html
    main()
    # def getwriter(html):
    #     bs = BeautifulSoup(html, "html.parser")
    #     a = bs.find_all("dd")
    #     findlink = re.compile(r'">(.*?)</a>')
    #     TEMP=2;
    #     for item in a:
    #         if(TEMP%2==0):
    #             item = str(item)
    #             link = re.findall(findlink, item)
    #             print(link)
    #         TEMP+=1
    # def getname(html):
    #     bs = BeautifulSoup(html, "html.parser")
    #     a = bs.find_all(class_="ptitle", )
    #     findlink = re.compile(r'<a href="(.*?)">(.*?)</a></dt>')
    #     list = bs.find_all(re.compile("dt"))
    #     for item in a:
    #         item = str(item)
    #         link = re.findall(findlink, item)[0]
    #         print(link[1])
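
    A note on the parsing approach: both the live code and the commented drafts above run regexes over str()-ed BeautifulSoup tags, which is brittle against small markup changes. As a sketch, the same title/link/author extraction can be done with bs4's own accessors; the function below is illustrative rather than from the original post, and assumes the CVF listing keeps each title in a <dt class="ptitle"> followed by an author <dd>:

    from bs4 import BeautifulSoup

    def parse_papers(html):  # hypothetical replacement for getherf's regex parsing
        bs = BeautifulSoup(html, "html.parser")
        papers = []
        for title_dt in bs.find_all("dt", class_="ptitle"):
            a_tag = title_dt.find("a")
            if a_tag is None:
                continue
            title = a_tag.get_text(strip=True)
            href = a_tag.get("href", "")
            # the author links sit in the <dd> element right after the title's <dt>
            author_dd = title_dt.find_next_sibling("dd")
            authors = [a.get_text(strip=True) for a in author_dd.find_all("a")] if author_dd else []
            papers.append({"name": title, "herf": href, "writer": "+".join(authors)})
        return papers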
  • Original post: https://www.cnblogs.com/fuxw4971/p/14913378.html