zoukankan      html  css  js  c++  java
  • python3: 博客园列表爬取;

    import  requests
    from bs4 import BeautifulSoup as bs
    import  html5lib
    
    
    # Request headers carrying a desktop Chrome User-Agent string.
    header = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/58.0.3029.96 Safari/537.36'
        ),
    }
    
    def download(pages=range(1, 5)):
        """
        Fetch cnblogs site-home listing pages and print the links they contain.

        Each page is requested with the module-level ``header`` dict so the
        request carries a browser User-Agent, parsed with the ``html5lib``
        parser, and for every ``<h3>`` element that has an ``<a href=...>``
        child the link target and the link text are printed.

        :param pages: iterable of page numbers to fetch; defaults to pages 1-4.
        :return: None. Stops at the first page that cannot be fetched, after
            reporting the failure on stderr.
        """
        import sys  # local import: only needed for error reporting

        for page_idx in pages:
            url = "https://www.cnblogs.com/sitehome/p/%s" % str(page_idx)
            try:
                # Send the browser-like headers; without them the request
                # identifies itself as python-requests.
                r = requests.get(url, headers=header, timeout=30)
                r.raise_for_status()
            except requests.RequestException as exc:
                # Report the failure instead of silently swallowing it.
                print("failed to fetch %s: %s" % (url, exc), file=sys.stderr)
                return
            # Let requests guess the encoding from the body before decoding.
            r.encoding = r.apparent_encoding
            content = bs(r.text, 'html5lib')
            for ctx in content.find_all('h3'):
                link = ctx.a
                # Skip headings without a usable anchor rather than crashing.
                if link is None or not link.has_attr('href'):
                    continue
                # get_text() also handles anchors with nested markup, where
                # .string would be None.
                print(link['href'], link.get_text())
    
    
    # Run the crawl only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == "__main__":
        download()
    

      

  • 相关阅读:
    补码原理
    清风徐来,水波不兴
    月尾和周尾
    又一春又一季
    9.11
    晨光无限
    9.18
    心悠
    小事一桩
    一周岁啦
  • 原文地址:https://www.cnblogs.com/yinwei-space/p/9320784.html
Copyright © 2011-2022 走看看