zoukankan      html  css  js  c++  java
  • 爬取豆瓣电影信息

     爬取豆瓣电影 Top250(movie.douban.com/top250)的电影数据,并保存在MongoDB中。

    创建项目
    scrapy startproject douban

    items.py

    import scrapy
    
    
    class DoubanItem(scrapy.Item):
        """Item holding the four fields collected for one movie entry."""

        # define the fields for your item here like:
        # Title
        title = scrapy.Field()
        # Info
        bd = scrapy.Field()
        # Rating
        star = scrapy.Field()
        # Short quote / synopsis
        quote = scrapy.Field()

    创建CrawlSpider,使用模版crawl

    scrapy genspider -t crawl doubanmovie douban.com

    import scrapy
    from douban.items import DoubanItem
    
    class DoubamovieSpider(scrapy.Spider):
        """Spider that walks the Douban Top 250 listing page by page.

        Listing pages are addressed by a ``start`` offset appended to ``url``.
        The offset advances in steps of 25 and the last page requested is the
        one at offset 225. One ``DoubanItem`` is yielded for every
        ``div.info`` block found on a page.
        """

        name = "doubanmovie"
        allowed_domains = ["movie.douban.com"]
        # Offset of the listing page most recently scheduled.
        offset = 0
        url = "https://movie.douban.com/top250?start="
        start_urls = (
                url+str(offset),
        )

        def parse(self, response):
            """Yield one item per movie on the page, then request the next page.

            Args:
                response: the downloaded listing page.

            Yields:
                A ``DoubanItem`` for each movie block, followed by a
                ``scrapy.Request`` for the next listing page while the offset
                is still below 225.

            Raises:
                IndexError: if a movie block lacks a title, info paragraph or
                    rating (these are treated as mandatory).
            """
            movies = response.xpath("//div[@class='info']")

            for each in movies:
                # Build a fresh item per movie. Reusing one instance hands the
                # same mutable object to the pipeline for every movie and lets
                # the previous movie's ``quote`` leak into an entry without one.
                item = DoubanItem()
                # Title
                item['title'] = each.xpath(".//span[@class='title'][1]/text()").extract()[0]
                # Info
                item['bd'] = each.xpath(".//div[@class='bd']/p/text()").extract()[0]
                # Rating
                item['star'] = each.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract()[0]
                # Quote is optional: the field is left unset when the page has none.
                quote = each.xpath(".//p[@class='quote']/span/text()").extract()
                if quote:
                    item['quote'] = quote[0]
                yield item

            # Schedule the next listing page until offset 225 has been requested.
            if self.offset < 225:
                self.offset += 25
                yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

     pipelines.py

    import pymongo
    from scrapy.conf import settings
    
    class DoubanPipeline(object):
        """Item pipeline that stores every scraped item in a MongoDB collection.

        Connection parameters are read from the project settings keys
        ``MONGODB_HOST``, ``MONGODB_PORT``, ``MONGODB_DBNAME`` and
        ``MONGODB_SHEETNAME``.

        NOTE(review): ``settings`` comes from the module-level ``scrapy.conf``
        import, which is deprecated in newer Scrapy releases; consider a
        ``from_crawler`` hook when upgrading.
        """

        def __init__(self):
            """Open the MongoDB connection and select the target collection."""
            host = settings["MONGODB_HOST"]
            port = settings["MONGODB_PORT"]
            dbname = settings["MONGODB_DBNAME"]
            sheetname = settings["MONGODB_SHEETNAME"]

            # Keep the client on the instance so close_spider() can release it.
            self.client = pymongo.MongoClient(host=host, port=port)
            # Database, then the collection that receives the items.
            mydb = self.client[dbname]
            self.sheet = mydb[sheetname]

        def process_item(self, item, spider):
            """Insert ``item`` into the collection and pass it on unchanged.

            Args:
                item: the scraped item; it is copied into a plain ``dict``
                    before being stored.
                spider: the spider that produced the item (unused).

            Returns:
                The same ``item``, so later pipelines can keep processing it.
            """
            data = dict(item)
            # insert_one() replaces Collection.insert(), which is deprecated in
            # PyMongo 3 and removed in PyMongo 4.
            self.sheet.insert_one(data)
            return item

        def close_spider(self, spider):
            """Close the MongoDB connection when the spider finishes."""
            self.client.close()
    settings.py
    BOT_NAME = 'douban'

    SPIDER_MODULES = ['douban.spiders']
    NEWSPIDER_MODULE = 'douban.spiders'


    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"

    DOWNLOAD_DELAY = 2.5

    # Disable cookies (enabled by default)
    COOKIES_ENABLED = False

    # Project downloader middlewares and their order values.
    # NOTE(review): douban/middlewares.py is not shown here; presumably these
    # classes pick entries from USER_AGENTS and PROXIES below -- confirm.
    DOWNLOADER_MIDDLEWARES = {
        'douban.middlewares.RandomUserAgent': 100,
        'douban.middlewares.RandomProxy': 200,
    }

    # Pool of User-Agent strings.
    USER_AGENTS = [
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
    ]

    # Pool of proxies. "user_passwd" holds a "username:password" placeholder
    # that must be replaced with real credentials (or left empty if the proxy
    # needs none). Add more entries following the commented template.
    PROXIES = [
            {"ip_port": "121.42.140.113:16816", "user_passwd": "用户名:密码"},
            # {"ip_port": "121.42.140.113:16816", "user_passwd": ""},
    ]

    ITEM_PIPELINES = {
        'douban.pipelines.DoubanPipeline': 300,
    }

    # MongoDB host name
    MONGODB_HOST = "127.0.0.1"

    # MongoDB port
    MONGODB_PORT = 27017

    # Database name
    MONGODB_DBNAME = "Douban"

    # Name of the collection that stores the scraped data
    MONGODB_SHEETNAME = "doubanmovies"
  • 相关阅读:
    HDU1255覆盖的面积
    B. An express train to reveries
    Long Long Message(后缀数组)
    Longest Common Substring(最长公共子序列)
    最长上升子序列(NlogN)总结
    bzoj 1500 维修数列
    HDU 6357 Hills And Valleys
    牛客暑假多校第六场 I Team Rocket
    HDU 6346 整数规划 二分图匹配最优解
    牛客暑假多校第五场 I vcd
  • 原文地址:https://www.cnblogs.com/wanglinjie/p/9216096.html
Copyright © 2011-2022 走看看