zoukankan      html  css  js  c++  java
  • Python爬虫_百度贴吧

    # Crawl the list pages of a Baidu Tieba forum and save each page's HTML.

    import requests


    class TiebaSpider:
        """Download the paginated list pages of one Baidu Tieba forum.

        Each page is fetched over HTTP and written, unmodified, to an HTML
        file in the current working directory.

        Args:
            tieba_name: Forum name, inserted as the ``kw`` query parameter.
            max_pages: Number of list-page URLs to generate (default 1000,
                the value the original script hard-coded).
            timeout: Seconds to wait on each HTTP request before
                ``requests`` raises a timeout error. Without a timeout a
                stalled connection would block the crawl indefinitely.
        """

        def __init__(self, tieba_name, max_pages=1000, timeout=10):
            self.tieba_name = tieba_name
            self.max_pages = max_pages
            self.timeout = timeout
            # The trailing "{}" is filled in with the ``pn`` offset later.
            self.url = "https://tieba.baidu.com/f?kw="+tieba_name+"&ie=utf-8&pn={}"
            self.headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"}

        def get_url_list(self):
            """Return the list of page URLs to crawl.

            The ``pn`` offset advances in steps of 50 (0, 50, 100, ...);
            presumably one list page holds 50 entries -- confirm against
            the site.
            """
            return [self.url.format(i * 50) for i in range(self.max_pages)]

        def parse_url(self, url):
            """Send a GET request to *url* and return the body as text.

            The body is decoded with ``bytes.decode()``'s default (UTF-8).
            """
            res = requests.get(url, headers=self.headers, timeout=self.timeout)
            return res.content.decode()

        def save_html(self, html_str, page_num):
            """Write *html_str* to a UTF-8 file named after the forum and page."""
            file_path = "{}-第{}页.html".format(self.tieba_name, page_num)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(html_str)

        def run(self):
            """Fetch every generated URL in order and save each response."""
            # enumerate() yields the 1-based page number directly, avoiding
            # a linear ``list.index`` search on every iteration.
            for page_num, url in enumerate(self.get_url_list(), start=1):
                html_str = self.parse_url(url)
                self.save_html(html_str, page_num)
                print(url)


    if __name__ == "__main__":
        tieba_spider = TiebaSpider("lol")
        tieba_spider.run()
  • 相关阅读:
    Java学习8.17
    Java学习8.16
    Java学习8.15
    Java学习8.14
    Java学习8.13
    Java学习8.12
    Java学习8.11
    131. Palindrome Partitioning 回文串分割
    40. Combination Sum II 不允许使用重复元素
    39. Combination Sum 凑出一个和,可以重复用元素(含duplicates)
  • 原文地址:https://www.cnblogs.com/waterr/p/13893578.html
Copyright © 2011-2022 走看看