zoukankan      html  css  js  c++  java
  • 自己写的几个爬虫案例

    原创,如转载请注明来源https://www.cnblogs.com/sogeisetsu/

    爬取中国大学排名

    #这个只用到了requests 和 bs4
    #爬取大学排名
    import requests
    from bs4 import BeautifulSoup as bs
    def grthtml(url):
        """Fetch *url* and return the response body decoded with the detected encoding."""
        response = requests.get(url)
        # Let requests guess the real encoding from the page content itself,
        # instead of trusting the (often wrong) HTTP header.
        response.encoding = response.apparent_encoding
        return response.text
    def listhtml(ulist, html):
        """Parse the ranking table in *html*, appending [rank, name, region] rows to *ulist*."""
        tbody = bs(html, "html.parser").tbody
        for row in tbody("tr"):
            cells = row("td")
            # The first three cells hold rank, university name and province/region.
            ulist.append([cells[0].string, cells[1].string, cells[2].string])
    def pmhtml(ulist, num):
        """Print the first *num* rows of *ulist* as an aligned rank/name/region table."""
        # chr(12288) is the full-width (CJK) space; used as the fill character so
        # columns containing Chinese text still line up.
        fill = chr(12288)
        print('{0:^10}	{1:{3}^7}	{2:^10}'.format("排名", "校名", "地址", fill))
        for rank, name, region in ulist[:num]:
            print("{0:^10}	{1:{3}^10}	{2:^10}".format(rank, name, region, fill))
    if __name__ == "__main__":
        # Fetch the 2016 ranking page, parse it, then show as many rows as requested.
        target = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html"
        page = grthtml(target)
        uinfo = []
        listhtml(uinfo, page)
        top_n = int(input())
        pmhtml(uinfo, top_n)
    
    

    爬取今日头条热点

    # 这个爬取的是动态页面
    # 爬取动态页面,今日头条
    # 源文件里是没有内容的只有js
    import requests
    from bs4 import BeautifulSoup as bs
    import json
    def gethtml(url):
        """Fetch the Toutiao feed API at *url* and return the response body (JSON text).

        Prints "oneerror" and returns None when the request fails.
        """
        try:
            # The feed endpoint was found via the browser's F12 network panel.
            head = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
                # NOTE: Toutiao needs a login cookie; add your own here, e.g.
                # head['Cookie'] = '...'
                # (The original line put a bare comment inside the dict literal,
                # which was a SyntaxError.)
            }
            r = requests.get(url, headers=head)
            return r.text  # JSON text; decoded later by getulist()
        except Exception:
            # Keep the original best-effort behavior: report and return None,
            # but don't swallow KeyboardInterrupt/SystemExit like a bare except.
            print("oneerror")
    def getulist(html, list):
        """Decode the feed JSON in *html* and append [title, abstract, url] rows to *list*.

        html: JSON text as returned by gethtml().
        list: output list, mutated in place.  (The name shadows the builtin;
              kept unchanged for backward compatibility with existing callers.)

        Prints "twoerror" on malformed input instead of raising.
        """
        try:
            feed = json.loads(html)
            for one in feed['data']:
                link = 'https://www.toutiao.com/a' + one['group_id']
                list.append([one['title'], one['abstract'], link])
        except (ValueError, KeyError, TypeError):
            # ValueError covers json.JSONDecodeError (bad JSON, or html=None via
            # TypeError); KeyError covers an unexpected feed shape.  Narrowed from
            # the original bare except so real bugs are not silently swallowed.
            print("twoerror")
    def printulist(list):
        """Print each [title, abstract, link] row of *list* in a readable layout.

        (The newlines inside the original string literals were mangled by the
        page scrape into raw line breaks, a SyntaxError; restored as \n escapes.)
        """
        for i in list:
            # Title and abstract each continue on their own line after the label.
            print("title:\n{}".format(i[0]))
            print("简介:\n{}".format(i[1]))
            print("链接:{:^30}".format(i[2]))
    if __name__ == "__main__":
        # Hot-news feed endpoint; the as/cp/_signature query parameters are
        # session-specific and may need refreshing.
        url = "https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A1D5FDAAB5194AE&cp=5DA599948ACE7E1&_signature=TBeQ-wAAEbOkzbKGAd3hQUwXkO"
        feed_json = gethtml(url)
        relist = []
        getulist(feed_json, relist)
        printulist(relist)
    

    爬取知乎热点

    #爬取知乎
    import requests
    from bs4 import BeautifulSoup as bs
    def gethtml(url):
        """Fetch *url* and return the page text decoded with the detected encoding.

        Raises requests.HTTPError (via raise_for_status) on a non-2xx response.
        """
        headers = {
            'user-agent': 'Mozila/5.0',
            # NOTE: Zhihu requires a login cookie; add your own here, e.g.
            # headers['Cookie'] = '...'
            # (The original line put a bare comment inside the dict literal,
            # which was a SyntaxError.)
        }
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # Guess the encoding from the content rather than the HTTP header.
        r.encoding = r.apparent_encoding
        return r.text
    def getlist(ulist, html):
        """Extract [title, summary, link] rows from the Zhihu hot-list page into *ulist*."""
        document = bs(html, "html.parser")
        hot_list = document("div", "HotList-list")[0]
        for item in hot_list.contents:
            anchor = item.a
            # Some entries carry no <p> summary; substitute a placeholder.
            summary = anchor.p.string if anchor.p else "该题目下没有相应简介"
            ulist.append([anchor.h2.string, summary, anchor.attrs['href']])
    def printlist(ulist, num):
        """Print the first *num* [title, summary, link] rows, each under a topN header.

        (The newlines inside the original string literals were mangled by the
        page scrape into raw line breaks, a SyntaxError; restored as \n escapes.
        The manual `top` counter became enumerate(); a commented-out, equally
        broken print was dropped.)
        """
        for top, i in enumerate(ulist[:num], start=1):
            print("top{}".format(top))
            print("题目:{}".format(i[0]))
            print("优秀回答简介:\n{}".format(i[1]))
            print("链接\n{}".format(i[2]))
    if __name__ == "__main__":
        target = "https://www.zhihu.com/hot"
        rows = []
        # Ask first, then fetch — same order as the original flow.
        print("您想看知乎热榜top几?")
        wanted = int(input())
        page = gethtml(target)
        getlist(rows, page)
        printlist(rows, wanted)
    
  • 相关阅读:
    HTTP协议中GET、POST和HEAD的介绍
    Django model 字段类型清单
    MySQL的菜鸟级操作
    windows7下将Cygwin加入右键菜单,并从当前目录打开
    数组指针和指针数组的区别
    const引用与非const引用
    printf("33[1;33m ***** 33[0m ");
    C语言可变参数函数详解示例
    机顶盒demux的工作原理
    机顶盒的工作原理
  • 原文地址:https://www.cnblogs.com/sogeisetsu/p/11679761.html
Copyright © 2011-2022 走看看