zoukankan      html  css  js  c++  java
  • Python requests 爬取百度贴吧

     1 import requests
     2 import os
     3 import shutil
     4 import time
     5 
     6 
     7 class PostBarSpider(object):
     8     def __init__(self, post_bar, page_number, file_dir):
     9         # 爬取某个贴吧前多少页内容
    10         self.post_bar = post_bar
    11         self.page = page_number
    12         # 保存到哪个目录
    13         self.file_dir = file_dir
    14         self.url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
    15         self.headers = {
    16             "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
    17         }
    18 
    19     # 构造url
    20     def get_url_list(self):
    21         # url_list = []
    22         # for i in range(0, self.page):
    23         #     result = self.url.format(self.post_bar, i * 50)
    24         #     print(result)
    25         #     url_list.append(result)
    26         # return url_list
    27         return [self.url.format(self.post_bar, i * 50) for i in range(self.page)]
    28 
    29     # 创建保存文件的目录
    30     def set_dir(self):
    31         self.file_dir += "/{}/".format(self.post_bar)
    32         print("保存路径-----{}".format(self.file_dir))
    33         try:
    34             if not os.path.exists(self.file_dir):
    35                 os.makedirs(self.file_dir)
    36             else:
    37                 shutil.rmtree(self.file_dir)
    38                 os.makedirs(self.file_dir)
    39         except Exception as info:
    40             print("创建或删除文件夹出现问题")
    41 
    42     def run(self):
    43         # 设置保存目录
    44         self.set_dir()
    45         # 获得url_list
    46         url_list = self.get_url_list()
    47 
    48         # 遍历ulr_list
    49         for i in range(0, len(url_list)):
    50             response = requests.get(url_list[i], headers=self.headers)
    51             if response.status_code == 200:
    52                 # 写出数据
    53                 file_name = self.file_dir + "{}".format("第{}页.txt".format(i + 1))
    54                 file = open(file_name, "w", encoding="utf-8")
    55                 file.write(response.content.decode())
    56                 print("已写入第{}页".format(i + 1))
    57                 file.close()
    58 
    59 
    60 def main():
    61     start = time.time()
    62     my_spider = PostBarSpider("李毅", 20, "f:/post_bar_test")
    63     my_spider.run()
    64     end = time.time()
    65     print("耗时-----{}s".format(end - start))
    66 
    67 
    68 if __name__ == '__main__':
    69     main()

     

  • 相关阅读:
    ECSHOP给分类添加图
    windows2008一键安装环境的配置说明
    在css中定义滚动条样式
    登录不到phpmyadmin
    dedecms程序给栏目增加缩略图的方法
    httpd.conf
    关于 equals() 与 hashCode() 个人理解总结
    postman 安装失败 Failed to install the .NET Framework, try installingthe latest version manully
    docker 私有仓库The push refers to repository [x:5000/test] Get https://x:5000/v2/: dial tcp x:5000: conn
    Redis window 和 Linux 环境下的搭建
  • 原文地址:https://www.cnblogs.com/tele-share/p/10533521.html
Copyright © 2011-2022 走看看