zoukankan      html  css  js  c++  java
  • Python爬虫【实战篇】百度贴吧爬取页面存到本地

    先上代码:下面的脚本会依次请求指定贴吧(示例中为 "lol")的前 50 个列表页,并把每一页的 HTML 原样保存为本地文件。

    import requests
    
    
    class TiebaSpider:
        """Fetch listing pages of one Baidu Tieba forum and save each as a local HTML file.

        Usage: ``TiebaSpider("lol").run()`` requests the first ``page_count``
        listing pages of the forum and writes one ``.html`` file per page
        into the current working directory.
        """

        # Step of the ``pn`` query parameter between consecutive pages
        # (presumably the number of posts per listing page -- confirm against the site).
        PAGE_SIZE = 50

        # Seconds to wait for the server before giving up; without a timeout
        # ``requests`` may block indefinitely on a stalled connection.
        REQUEST_TIMEOUT = 10

        def __init__(self, tieba_name, page_count=50):
            """Store the forum name and build the request template.

            Args:
                tieba_name: Forum name placed in the ``kw`` query parameter and
                    used as the prefix of the saved file names.
                page_count: How many listing pages to fetch (defaults to 50,
                    the previously hard-coded value).
            """
            self.tieba_name = tieba_name
            self.page_count = page_count
            # No leading whitespace: the template must be a well-formed URL.
            self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
            }

        def get_url_list(self):
            """Build the list of page URLs, one per listing page.

            Returns:
                A list of ``page_count`` URLs whose ``pn`` offsets are
                0, PAGE_SIZE, 2 * PAGE_SIZE, ...
            """
            return [
                self.url_temp.format(page_index * self.PAGE_SIZE)
                for page_index in range(self.page_count)
            ]

        def parse_url(self, url):
            """Send a GET request and return the raw response body.

            Args:
                url: Address to request.

            Returns:
                The response body as ``bytes`` (``response.content``).
            """
            print(url)
            response = requests.get(
                url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            return response.content

        def save_html(self, html_str, page_num):
            """Write one downloaded page to ``<tieba_name>第<page_num>页.html``.

            Args:
                html_str: Page body; despite the name it is written in binary
                    mode, so it must be ``bytes`` as returned by ``parse_url``.
                page_num: 1-based page number used in the file name.
            """
            # Build the file name.
            file_path = "{}第{}页.html".format(self.tieba_name, page_num)
            with open(file_path, "wb") as f:
                f.write(html_str)

        def run(self):
            """Main logic: download every page and save it to disk."""
            url_list = self.get_url_list()

            # enumerate yields the 1-based page number directly, avoiding a
            # list.index() scan per page that would also return the wrong
            # number if two URLs were equal.
            for page_num, url in enumerate(url_list, start=1):
                html_str = self.parse_url(url)

                # Create the HTML file for this page.
                self.save_html(html_str, page_num)
    
    
    # Script entry point: crawl the "lol" forum when this file is executed directly.
    if __name__ == "__main__":
        TiebaSpider("lol").run()

  • 相关阅读:
    小组项目进度汇报
    小组项目进程展示
    结队项目
    小组计划
    个人项目:数独
    问题
    自我介绍
    结对项目
    软件工程基础大项目——数独问题
    关于软件工程的几个问题
  • 原文地址:https://www.cnblogs.com/tangkaishou/p/10247471.html
Copyright © 2011-2022 走看看