zoukankan      html  css  js  c++  java
  • scrapy爬取豆瓣电影top250

    # -*- coding: utf-8 -*-
    # scrapy爬取豆瓣电影top250
    
    import scrapy
    from douban.items import DoubanItem
    
    
    class DoubanspiderSpider(scrapy.Spider):
        """Spider that walks the Douban Top 250 movie list page by page.

        For every list entry it yields one ``DoubanItem`` populated with the
        keys ``title``, ``director``, ``actor``, ``year``, ``country``,
        ``type``, ``rating_num``, ``quote`` and ``image``.
        """

        name = "doubanspider"
        # allowed_domains is deliberately left unset: a value such as
        # "movie.douban.com/top250" restricts the crawl to the first page, and
        # the pagination requests could then fall outside the allowed range.
        start_urls = ['http://movie.douban.com/top250']

        # Director and cast share one text line, separated by three
        # non-breaking spaces (U+00A0). Written with escapes so it cannot be
        # mistaken for the literal text "xa0xa0xa0".
        _CREDIT_SEPARATOR = '\xa0\xa0\xa0'

        @staticmethod
        def _field(parts, index):
            """Return ``parts[index]`` stripped, or ``''`` if it is missing."""
            return parts[index].strip() if index < len(parts) else ''

        def parse(self, response):
            """Extract every movie on the page, then follow the next-page link.

            Args:
                response: the Scrapy response for one page of the list.

            Yields:
                One freshly created ``DoubanItem`` per list entry, followed by
                a ``scrapy.Request`` for the next page when a "next" link is
                present. Text fields that cannot be found on the page are
                yielded as ``''`` (``None`` for values read with
                ``extract_first``) instead of aborting the whole page.
            """
            for each in response.css('.article .grid_view li'):
                # Movie title
                title = each.css(
                    '.item .hd .title:nth-child(1)::text').extract_first()

                # The text nodes of the description paragraph: the first one
                # holds director/cast, the second one year/country/genre.
                # Query once and reuse the result for both lines.
                texts = each.css('.item .bd p::text').extract()

                # Director and cast; partition() tolerates a line without
                # the separator (cast is then reported as '').
                credits = self._field(texts, 0)
                director, _, actor = credits.partition(self._CREDIT_SEPARATOR)

                # Year / country / genre, '/'-separated
                info_parts = self._field(texts, 1).split('/')

                # Rating
                rating_num = each.css(
                    '.item .bd .star .rating_num::text').extract_first()
                # Classic quote
                quote = each.css(
                    '.item .bd .quote span::text').extract_first()
                # Poster image URL
                image = each.css(
                    '.item .pic a img::attr(src)').extract_first()

                # A new item per entry: reusing one instance would make every
                # yielded item alias the same, repeatedly overwritten, object.
                item = DoubanItem()
                item['title'] = title
                item['director'] = director.strip()
                item['actor'] = actor.strip()
                item['year'] = self._field(info_parts, 0)
                item['country'] = self._field(info_parts, 1)
                item['type'] = self._field(info_parts, 2)
                item['rating_num'] = rating_num
                item['quote'] = quote
                item['image'] = image

                yield item

            # Build the request for the next page
            next_href = response.css(
                '.paginator .next a::attr(href)').extract_first()
            if next_href:
                # Resolve against the URL actually served, so the link works
                # whether it is relative, absolute, or the page was redirected.
                url = response.urljoin(next_href)
                self.logger.debug('Following next page: %s', url)
                yield scrapy.Request(url=url, callback=self.parse)
  • 相关阅读:
    BUUCTF-[GYCTF2020]Blacklist 1 思路
    [强网杯 2019]随便注 WriteUp(three way)思路
    [GXYCTF2019]Ping Ping Ping 1思路
    [ACTF2020 新生赛]Include 思路
    [极客大挑战 2019]Secret File 思路
    [SUCTF 2019]EasySQL 思路
    java环境变量配置 详细
    [极客大挑战 2019]Upload 思路
    [极客大挑战 2019]Http 思路
    [ACTF2020 新生赛]Exec 思路
  • 原文地址:https://www.cnblogs.com/themost/p/7090247.html
Copyright © 2011-2022 走看看