zoukankan      html  css  js  c++  java
  • Python实现抓取CSDN热门文章列表

    1、使用工具:
    Python3.5
    BeautifulSoup
    2、抓取网站:
    csdn热门文章列表 http://blog.csdn.net/hot.html
    3、分析网站代码:
    这里写图片描述
    4、实现代码:

    __author__ = 'Administrator'
    import urllib.request
    import re
    from bs4 import BeautifulSoup
    
    
    ########################################################
    #
    # Scrape the CSDN home-page article list: http://blog.csdn.net/?&page=1
    #
    #
    #
    ########################################################
    class CsdnUtils(object):
        """Fetch a CSDN blog listing page and print one summary line per article.

        Attributes:
            headers: HTTP request headers (including a desktop-browser
                User-Agent string) sent with every request made by ``getPage``.
        """

        def __init__(self):
            user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
            self.headers = {'Cache-Control': 'max-age=0',
                            'Connection': 'keep-alive',
                            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                            'User-Agent': user_agent,
            }

        def getPage(self, url=None):
            """Download ``url`` and return it parsed with ``html.parser``.

            Args:
                url: Address of the page to request.

            Returns:
                A ``BeautifulSoup`` tree built from the response body.

            Errors raised while opening or reading the URL are not caught here
            and propagate to the caller.
            """
            request = urllib.request.Request(url, headers=self.headers)
            # Use the response as a context manager so the connection is
            # closed even if reading the body fails.
            with urllib.request.urlopen(request) as response:
                html = response.read()
            return BeautifulSoup(html, "html.parser")

        @staticmethod
        def _text(tag):
            """Return ``tag.string``, or ``None`` when ``tag`` itself is ``None``."""
            return tag.string if tag is not None else None

        def parsePage(self, url=None, page=None):
            """Fetch ``url`` and print a tab-separated line for each article found.

            Every ``div`` with class ``blog_list`` is treated as one article.
            For each one the author, post time, view count, category, title and
            link are printed. Elements that are missing from the markup are
            printed as ``None`` instead of aborting the whole page.

            Args:
                url: Address of the listing page to fetch.
                page: Label printed in the page header; it is not used to
                    build the request.
            """
            soup = self.getPage(url)
            items = soup.find_all('div', 'blog_list')
            print("========================第", page, "页======================================")
            for index, item in enumerate(items):
                # Keep the scraped fields in locals: storing them on the class
                # object would leak state into every CsdnUtils instance.
                author = self._text(item.find('a', 'user_name'))
                post_time = self._text(item.find('span', 'time'))
                views = self._text(item.find('a', 'view'))

                category = "None"
                title = None
                link = None
                heading = item.find('h1')
                if heading is not None:
                    # A category is only looked up when the first link of the
                    # heading carries a class attribute.
                    first_link = heading.find('a')
                    if first_link is not None and first_link.has_attr('class'):
                        category = self._text(heading.find('a', 'category'))
                    # The title link is the anchor that has a ``name``
                    # attribute; look it up once for both text and href.
                    title_link = heading.find('a', attrs={'name': True})
                    if title_link is not None:
                        title = title_link.string
                        link = title_link.get("href")

                print("数据:", index + 1, '	', author, '	', post_time, '	',
                      views, '	', category, '	', title, '	', link)
    
    
    #######     Entry point    ########
    if __name__ == "__main__":
        # Address of the page to scrape. A paginated alternative would be
        # 'http://blog.csdn.net/?&page={}'.format(page_no).
        hot_url = "http://blog.csdn.net/hot.html"
        scraper = CsdnUtils()

        # The same URL is requested on each pass; the number passed along
        # (1 through 5) is only the page label shown in the output header.
        for page_no in range(1, 6):
            scraper.parsePage(hot_url, page_no)
    
    

    5、执行结果:
    这里写图片描述

  • 相关阅读:
    微信开发:MySQL utf8mb4 字符集
    Spring 事务
    Exception
    mysql系列之多实例介绍
    python连接MySQL
    1_archlinux_安装篇
    apache中如何调用CGI脚本
    1.1_Django简介及安装
    git分支合并脚本
    用python收集系统信息
  • 原文地址:https://www.cnblogs.com/luweiwei/p/5968460.html
Copyright © 2011-2022 走看看