zoukankan      html  css  js  c++  java
  • 爬取豆瓣

    import json
    
    import requests
    
    
    class DoubanSpider:
        """Crawl Douban's TV listings tag by tag and append them to ``douban.txt``.

        The spider first downloads the list of TV tags, then pages through the
        subject search endpoint for every tag, writing each subject as one JSON
        object per line.
        """

        # Number of items requested per page; also the step by which the
        # ``page_start`` offset must advance so that pages do not overlap.
        PAGE_LIMIT = 20
        # Seconds to wait for the server before a request is abandoned.
        REQUEST_TIMEOUT = 10

        def __init__(self):
            # Two ``{}`` placeholders remain after the %-substitution:
            # the tag name and the ``page_start`` offset.
            self.start_url = (
                "https://movie.douban.com/j/search_subjects?type=tv&tag={}"
                "&sort=recommend&page_limit=%d&page_start={}" % self.PAGE_LIMIT
            )
            self.tv_type_url = "https://movie.douban.com/j/search_tags?type=tv&source="
            self.headers = {
                "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
            }

        def parse_url(self, url):
            """Send a GET request to ``url`` and return the decoded response body.

            Raises:
                requests.RequestException: on connection problems, timeouts, or
                    a non-2xx HTTP status.
            """
            print(url)
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            # Fail here with a clear HTTP error rather than later while trying
            # to parse an error page as JSON.
            response.raise_for_status()
            return response.content.decode()

        def get_content_list(self, json_str, json_data):
            """Parse ``json_str`` and return the value stored under key ``json_data``.

            Raises:
                json.JSONDecodeError: if ``json_str`` is not valid JSON.
                KeyError: if the decoded object has no ``json_data`` key.
            """
            return json.loads(json_str)[json_data]

        def save_content_list(self, content_list, data_type):
            """Append every item of ``content_list`` to ``douban.txt`` as JSON lines.

            Each item (a dict) is tagged in place with ``data_type`` under the
            ``"data_type"`` key before being written.
            """
            with open("douban.txt", "a", encoding="utf-8") as f:
                for content in content_list:
                    content["data_type"] = data_type
                    # One JSON document per line.
                    f.write(json.dumps(content, ensure_ascii=False) + "\n")
            print("保存成功")

        def run(self):
            """Fetch all TV tags, then crawl and save every page of every tag."""
            tv_type_json_str = self.parse_url(self.tv_type_url)
            tv_type_list = self.get_content_list(tv_type_json_str, "tags")
            for tv_type in tv_type_list:
                page_start = 0
                while True:
                    # 1. Build the URL for the current page of this tag.
                    url = self.start_url.format(tv_type, page_start)
                    # 2. Send the request and read the response.
                    json_str = self.parse_url(url)
                    # 3. Extract the TV subjects.
                    content_list = self.get_content_list(json_str, "subjects")
                    # 4. Persist them.
                    self.save_content_list(content_list, tv_type)
                    # A short page means the tag has been exhausted.
                    if len(content_list) < self.PAGE_LIMIT:
                        break
                    # 5. ``page_start`` is treated as an item offset, so advance
                    #    by a whole page; stepping by 1 would re-fetch 19 of
                    #    the 20 items every time.
                    page_start += self.PAGE_LIMIT
    
    
    # Script entry point: build the spider and start the full crawl.
    if __name__ == "__main__":
        spider = DoubanSpider()
        spider.run()
  • 相关阅读:
    循环
    rugarch包与R语言中的garch族模型
    Logistic回归
    机器学习缺失值处理方法汇总
    pandas库介绍之DataFrame基本操作
    python中常用的九种预处理方法
    谁动了我的特征?——sklearn特征转换行为全记录
    使用sklearn优雅地进行数据挖掘
    使用sklearn做单机特征工程
    彻底解决matplotlib中文乱码问题
  • 原文地址:https://www.cnblogs.com/King-boy/p/13173862.html
Copyright © 2011-2022 走看看