zoukankan      html  css  js  c++  java
  • 爬虫---爬取豆瓣-科幻片-排行

    scrapy

    movie.py

    # -*- coding: utf-8 -*-
    import scrapy
    import json
    import re

    from douban.items import DoubanItem


    class MovieSpider(scrapy.Spider):
        """Crawl a Douban movie chart through its paginated JSON endpoint.

        The start URL requests chart ``type=17`` (described by the surrounding
        article as the sci-fi ranking), 20 records per page. Each response is a
        JSON array; one ``DoubanItem`` is emitted per array element and the next
        page is requested by bumping the ``start`` query parameter.
        """

        name = 'movie'
        allowed_domains = ['movie.douban.com']
        start_urls = ['https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=0&limit=20']

        def parse(self, response):
            """Yield one item per movie on this page, then a request for the next page.

            Args:
                response: the Scrapy response whose body is a JSON array of
                    movie records carrying ``rank``, ``title``, ``score`` and
                    ``vote_count`` keys.

            Yields:
                ``DoubanItem`` objects for every record, followed by at most one
                ``scrapy.Request`` for the following page.
            """
            datas = json.loads(response.body)
            if not datas:
                # An empty page means there is nothing left to rank; do not
                # schedule further requests.
                return

            for data in datas:
                # Build a fresh item per record: re-yielding one shared, mutated
                # instance lets later records overwrite earlier ones for any
                # consumer that keeps a reference to the item.
                item = DoubanItem()
                item['movie_rank'] = data['rank']
                item['movie_name'] = data['title']
                item['movie_score'] = data['score']
                item['movie_people'] = data['vote_count']
                yield item

            # Recover the current offset from the URL. The pattern needs the
            # escaped digit class (\d); a bare "d+" only matches literal letters
            # and therefore never finds the offset.
            match = re.search(r'start=(\d+)', response.url)
            if match is None:
                return
            a = int(match.group(1))
            if a < 201:
                url = 'https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=' + str(a + 20) + '&limit=20'
                # Parenthesised form prints the same text on Python 2 and 3.
                print(url)
                yield scrapy.Request(url, callback=self.parse)


    items.py

    # -*- coding: utf-8 -*-

    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html

    import scrapy


    class DoubanItem(scrapy.Item):
        """Container for one movie record scraped from the Douban chart.

        The spider in movie.py fills these fields from the ``rank``, ``title``,
        ``score`` and ``vote_count`` keys of each JSON record.
        """

        # Position of the movie in the chart (JSON key "rank").
        movie_rank = scrapy.Field()
        # Movie title (JSON key "title").
        movie_name = scrapy.Field()
        # Rating value (JSON key "score").
        movie_score = scrapy.Field()
        # Number of votes behind the rating (JSON key "vote_count").
        movie_people = scrapy.Field()

    pipelines.py

    # -*- coding: utf-8 -*-

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


    class DoubanPipeline(object):
        """Append every scraped movie as one text line to ./douban_movie.txt."""

        def process_item(self, item, spider):
            """Write the item's rank, name and score as a single line and pass it on.

            Args:
                item: mapping with ``movie_rank``, ``movie_name`` and
                    ``movie_score`` entries.
                spider: the spider that produced the item (unused).

            Returns:
                The same ``item``, so that later pipelines and feed exporters
                still receive it.
            """
            # Local import keeps this block self-contained; io.open accepts an
            # explicit encoding on both Python 2 and Python 3.
            import io

            # Rank is left-justified to three characters so the names line up.
            # Build the line as text and let the file object encode it once,
            # instead of mixing encoded bytes with str.
            cont = u'%s %s %s\n' % (
                str(item['movie_rank']).ljust(3, ' '),
                item['movie_name'],
                item['movie_score'],
            )
            with io.open('./douban_movie.txt', 'a', encoding='utf-8') as f:
                f.write(cont)
            return item

    mian.py

    # -*- coding:utf-8 -*-
    from scrapy import cmdline
    # Truncate (or create) the result file so every run starts from an empty
    # list; the pipeline opens the same path in append mode.
    with open('./douban_movie.txt', 'w'):
        pass
    # Launch the 'movie' spider with the same arguments as the shell command
    # `scrapy crawl movie`.
    cmdline.execute('scrapy crawl movie'.split())

    保存结果   txt文件

    1       盗梦空间             9.3
    2 机器人总动员 9.3
    3 星际穿越 9.2
    4 楚门的世界 9.2
    5 超感猎杀:完结特别篇 9.2
    6 蝙蝠侠:黑暗骑士 9.1
    7 攻壳机动队2:无罪 9.1
    
    
  • 相关阅读:
    C#基础
    C#基础
    Sqlserver数据库备份和还原
    C#基础
    Python3学习笔记4
    Python3学习笔记3
    调用接口Post xml和json数据的通用方法
    Python3学习笔记2
    Python3学习笔记1
    常见的PHP函数代码性能对比
  • 原文地址:https://www.cnblogs.com/wozuilang-mdzz/p/9740418.html
Copyright © 2011-2022 走看看