zoukankan      html  css  js  c++  java
  • python3: 博客园列表爬取;

    import  requests
    from bs4 import BeautifulSoup as bs
    import  html5lib
    
    
    # Browser-like request headers (desktop Chrome 58 on Windows 7).
    header = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/58.0.3029.96 Safari/537.36'
        ),
    }
    
    def download():
        """Print the link and title of every post on cnblogs home pages 1-4.

        Each page is fetched with the module-level ``header`` dict so the
        request carries a browser User-Agent, parsed with html5lib, and every
        ``<h3>`` element that wraps a link is printed as ``href title``.

        Crawling stops at the first page that cannot be fetched (network
        error, timeout, or non-2xx status); the reason is written to stderr.

        :return: None
        """
        import sys  # local import: only needed for error reporting

        for page_idx in range(1, 5):
            url = "https://www.cnblogs.com/sitehome/p/%d" % page_idx
            try:
                # Send the browser-like headers; without them the default
                # python-requests User-Agent is used.
                r = requests.get(url, headers=header, timeout=30)
                r.raise_for_status()
            except requests.RequestException as exc:
                print("failed to fetch %s: %s" % (url, exc), file=sys.stderr)
                return
            # Let requests guess the charset from the body rather than
            # trusting the HTTP header alone.
            r.encoding = r.apparent_encoding
            content = bs(r.text, 'html5lib')
            for ctx in content.find_all('h3'):
                link = ctx.a
                # Skip headings that do not wrap a usable hyperlink.
                if link is None or not link.has_attr('href'):
                    continue
                print(link['href'], link.string)
    
    
    # Run the crawler only when executed as a script, not when imported.
    if __name__ == "__main__":
        download()
    

      

  • 相关阅读:
    九连环
    Fruit Ninja(取随机数)
    Distinct Values(贪心)
    Harvest of Apples
    Don't Be a Subsequence
    ConvexScore
    守卫
    Trie树
    NOIP 2005 过河
    HDU 4597 Play Game 记忆化DP
  • 原文地址:https://www.cnblogs.com/yinwei-space/p/9320784.html
Copyright © 2011-2022 走看看