zoukankan      html  css  js  c++  java
  • requests爬取豆瓣

    豆瓣电视剧爬虫

    # coding=utf-8
    import requests
    import json
    
    class DoubanSpider:
        """Crawl TV-series collections from Douban's mobile API into a text file.

        Each entry of ``url_temp_list`` pairs a paginated URL template (the
        ``{}`` placeholder receives the ``start`` offset) with a country label.
        Every downloaded item is tagged with that label and appended to
        ``douban.txt`` as one JSON object per line.
        """

        # Offset step between pages; must match ``count=18`` in the URL templates.
        PAGE_SIZE = 18
        # Seconds to wait for the server before giving up on a request.
        REQUEST_TIMEOUT = 10

        def __init__(self):
            self.url_temp_list = [
                {
                    "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/tv_american/items?start={}&count=18&loc_id=108288",
                    "country": "US",
                },
                {
                    "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/tv_domestic/items?start={}&count=18&loc_id=108288",
                    "country": "CN",
                },
                {
                    "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/tv_korean/items?start={}&count=18&loc_id=108288",
                    "country": "KR",
                },
                {
                    "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/tv_japanese/items?start={}&count=18&loc_id=108288",
                    "country": "JP",
                },
            ]
            self.headers = {
                "Referer": "https://m.douban.com/tv/american",
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
            }

        def parse_url(self, url):
            """Send a GET request to ``url`` and return the decoded response body.

            Raises:
                requests.RequestException: on timeout, connection failure or a
                    non-2xx HTTP status.
            """
            print(url)
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            # Fail here with a clear HTTP error instead of later with a
            # confusing JSON/KeyError on an error page.
            response.raise_for_status()
            return response.content.decode()

        def get_content_list(self, json_str):
            """Parse one API response.

            Returns:
                A ``(content_list, total)`` tuple taken from the response's
                ``subject_collection_items`` and ``total`` keys.
            """
            dict_ret = json.loads(json_str)
            return dict_ret["subject_collection_items"], dict_ret["total"]

        def save_content_list(self, content_list, country):
            """Tag each item with ``country`` and append it to ``douban.txt``.

            The items are modified in place (a ``"country"`` key is added) and
            written as one JSON document per line.
            """
            with open("douban.txt", "a", encoding="utf-8") as f:
                for content in content_list:
                    content["country"] = country
                    f.write(json.dumps(content, ensure_ascii=False) + "\n")
            print("保存成功")

        def run(self):
            """Download and save every page of every configured collection."""
            for url_temp in self.url_temp_list:
                num = 0
                # Placeholder so the first page is always requested; replaced
                # by the real total as soon as the first response is parsed.
                total = 1
                while num < total:
                    # 1. Build the URL for the current offset.
                    url = url_temp["url_temp"].format(num)
                    # 2. Send the request and read the response.
                    json_str = self.parse_url(url)
                    # 3. Extract the items and the collection size.
                    content_list, total = self.get_content_list(json_str)
                    if not content_list:
                        # The server returned nothing for this offset, so
                        # stop rather than keep requesting empty pages.
                        break
                    # 4. Persist this page.
                    self.save_content_list(content_list, url_temp["country"])
                    # 5. Advance to the next page.
                    num += self.PAGE_SIZE
    
    # Script entry point: crawl every configured TV-series collection.
    if __name__ == "__main__":
        DoubanSpider().run()

    豆瓣书籍爬取

    import requests
    import json
    
    class DoubanBook_Spider:
        """Crawl book collections from Douban's mobile API into a text file.

        Each entry of ``url_temp_list`` pairs a paginated URL template (the
        ``{}`` placeholder receives the ``start`` offset) with a category
        label. Every downloaded item is tagged with that label and appended to
        ``book_list.txt`` as one JSON object per line.
        """

        # Offset step between pages; must match ``count=18`` in the URL templates.
        PAGE_SIZE = 18
        # Seconds to wait for the server before giving up on a request.
        REQUEST_TIMEOUT = 10

        def __init__(self):
            # The templates need a ``{}`` placeholder for ``start``; with a
            # literal ``start=0`` every request would return the first page.
            self.url_temp_list = [
                {
                    "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/book_fiction/items?start={}&count=18&loc_id=0",
                    "book": "fiction",
                },
                {
                    "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/book_nonfiction/items?start={}&count=18&loc_id=0",
                    # NOTE(review): "nofiction" looks like a typo for
                    # "nonfiction"; kept as-is because the label is written to
                    # the output file and existing data may rely on it.
                    "book": "nofiction",
                },
                {
                    "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/book_classic/items?start={}&count=18&loc_id=0",
                    "book": "classic",
                },
            ]
            self.headers = {
                "Referer": "https://m.douban.com/book/classic",
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
            }

        def parse_url(self, url):
            """Send a GET request to ``url`` and return the decoded response body.

            Raises:
                requests.RequestException: on timeout, connection failure or a
                    non-2xx HTTP status.
            """
            print(url)
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            # Fail here with a clear HTTP error instead of later with a
            # confusing JSON/KeyError on an error page.
            response.raise_for_status()
            return response.content.decode()

        def get_content_list(self, json_str):
            """Parse one API response.

            Returns:
                A ``(content_list, total)`` tuple taken from the response's
                ``subject_collection_items`` and ``total`` keys.
            """
            dict_ret = json.loads(json_str)
            return dict_ret["subject_collection_items"], dict_ret["total"]

        def save_content_list(self, content_list, book):
            """Tag each item with ``book`` and append it to ``book_list.txt``.

            The items are modified in place (a ``"book"`` key is added) and
            written as one JSON document per line.
            """
            with open("book_list.txt", "a", encoding="utf-8") as f:
                for content in content_list:
                    content["book"] = book
                    f.write(json.dumps(content, ensure_ascii=False) + "\n")
            print("保存成功")

        def run(self):
            """Download and save every page of every configured collection."""
            for url_temp in self.url_temp_list:
                num = 0
                # Placeholder so the first page is always requested; replaced
                # by the real total as soon as the first response is parsed.
                total = 1
                while num < total:
                    # 1. Build the URL for the current offset.
                    url = url_temp["url_temp"].format(num)
                    # 2. Send the request and read the response.
                    json_str = self.parse_url(url)
                    # 3. Extract the items and the collection size.
                    content_list, total = self.get_content_list(json_str)
                    if not content_list:
                        # The server returned nothing for this offset, so
                        # stop rather than keep requesting empty pages.
                        break
                    # 4. Persist this page.
                    self.save_content_list(content_list, url_temp["book"])
                    # 5. Advance to the next page.
                    num += self.PAGE_SIZE
    
    # Script entry point: crawl every configured book collection.
    if __name__ == "__main__":
        DoubanBook_Spider().run()
  • 相关阅读:
    form表单ajaxSubmit提交并验证
    jQuery幸运大转盘_jQuery+PHP抽奖程序
    thinkphp3.2 + soap
    chart.js图表 传值问题
    window和document的区别理解,bom和dom的区别理解
    JS弹出层制作,以及移动端禁止弹出层下内容滚动,overflow:hidden移动端失效问题
    富文本编辑器summernote的基本使用
    input文件类型上传,或者作为参数拼接的时候注意的问题!
    使用input选择本地图片,并且实现预览功能
    整体页面加载和某一模块加载监听
  • 原文地址:https://www.cnblogs.com/groundcontrol/p/12851696.html
Copyright © 2011-2022 走看看