zoukankan      html  css  js  c++  java
  • 项目练习:电影列表爬虫

     1 # -*- coding:utf-8 -*-
     2 # Author:Sure Feng
     3 
     4 import requests
     5 import json
     6 
     7 
     8 class DoubanFileSpider(object):
     9     def __init__(self):
    10         self.start_tempt_url = [
    11             {
    12             "url_tempt": "https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%8D%8E%E8%AF%AD&sort=recommend&page_limit=20&page_start={}",
    13             "country": "cn"
    14             }, {
    15             "url_tempt": "https://movie.douban.com/j/search_subjects?type=movie&tag=%E6%97%A5%E6%9C%AC&sort=recommend&page_limit=20&page_start={}",
    16             "country": "janpan"
    17             }, {
    18             "url_tempt": "https://movie.douban.com/j/search_subjects?type=movie&tag=%E6%AC%A7%E7%BE%8E&sort=recommend&page_limit=20&page_start={}",
    19             "country": "usa"
    20             }
    21         ]
    22         self.headers = {
    23             "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Mobile Safari/537.36"}
    24 
    25     def parse_url(self, url):
    26         """发送请求,获取响应"""
    27         respond = requests.get(url, headers=self.headers)
    28         return respond.content.decode()
    29 
    30     def save_data(self, list_str, country):
    31         """保存数据"""
    32         with open("douban.txt", "a", encoding="utf-8") as f:
    33             for info_str in list_str:
    34                 # print(info_str)
    35                 info_str["country"] = country
    36                 f.write(json.dumps(info_str, ensure_ascii=False))
    37                 f.write("
    ")
    38 
    39     def get_content(self, json_str):
    40         """提取数据"""
    41         dict_ret = json.loads(json_str)
    42         content_list = dict_ret["subjects"]
    43         return content_list
    44 
    45     def run(self):
    46         """实现主要逻辑"""
    47         for url_tempt in self.start_tempt_url:
    48             num = 0
    49             country = url_tempt["country"]
    50             while True:
    51                 # start_url
    52                 start_url = url_tempt["url_tempt"].format(num)
    53                 print(start_url)
    54                 # 发送请求,获取响应
    55                 json_str = self.parse_url(start_url)
    56                 # 提取数据
    57                 content_list = self.get_content(json_str)
    58                 # 保存
    59                 self.save_data(content_list, country)
    60                 # 构造下一页的URL地址,重复步骤
    61                 if len(content_list) < 20:
    62                     break
    63                 num += 20
    64 
    65 
    66 if __name__ == "__main__":
    67     douban_spider = DoubanFileSpider()
    68     douban_spider.run()
  • 相关阅读:
    C# 托管内存与非托管内存之间的转换
    A*算法详解链接
    【转】使用minizip解压缩多个文件(基于zlib)
    lua中table的遍历,以及删除
    clientHeight scrollHeight offsetHeight
    消息中间件(转)
    js 原型链和继承(转)
    session 和 cookie (转)
    java servlet
    redis 命令
  • 原文地址:https://www.cnblogs.com/sure-feng/p/10052871.html
Copyright © 2011-2022 走看看