  • Scraping Douban movies into MongoDB with Scrapy, plus anti-anti-crawler middleware

    1. The spider and item code:

    doubanmovie.py

    # -*- coding: utf-8 -*-
    import scrapy
    from douban.items import DoubanItem
    
    class DoubamovieSpider(scrapy.Spider):
        name = "doubanmovie"
        allowed_domains = ["movie.douban.com"]
        offset = 0
        url = "https://movie.douban.com/top250?start="
        start_urls = (
            url + str(offset),
        )
    
        def parse(self, response):
            movies = response.xpath("//div[@class='info']")
    
            for each in movies:
                # Create a fresh item per movie so optional fields
                # (like quote) cannot leak over from the previous entry
                item = DoubanItem()
                # Title
                item['title'] = each.xpath(".//span[@class='title'][1]/text()").extract()[0]
                # Details (director, cast, year, genre)
                item['bd'] = each.xpath(".//div[@class='bd']/p/text()").extract()[0]
                # Rating
                item['star'] = each.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract()[0]
                # Tagline (some entries have none)
                quote = each.xpath(".//p[@class='quote']/span/text()").extract()
                if len(quote) != 0:
                    item['quote'] = quote[0]
                yield item
    
            # The Top 250 list is paginated 25 entries per page: start=0, 25, ..., 225
            if self.offset < 225:
                self.offset += 25
                yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
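
    The XPath expressions above can be sanity-checked interactively before running a full crawl. A quick session in the Scrapy shell (assuming Scrapy is installed and movie.douban.com is reachable) might look like this:

    $ scrapy shell "https://movie.douban.com/top250"
    >>> response.xpath("//div[@class='info']//span[@class='title'][1]/text()").extract_first()
    # should print the title of the first movie on the page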

    items.py

    import scrapy
    
    
    class DoubanItem(scrapy.Item):
        # define the fields for your item here like:
        # Title
        title = scrapy.Field()
        # Details
        bd = scrapy.Field()
        # Rating
        star = scrapy.Field()
        # Tagline
        quote = scrapy.Field()
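
    For reference, the project layout these files belong to can be generated with Scrapy's own CLI; the project and spider names below match the code above:

    $ scrapy startproject douban
    $ cd douban
    $ scrapy genspider doubanmovie movie.douban.com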

    2. Change the storage destination in the pipeline file

    import pymongo
    # scrapy.conf was removed from recent Scrapy releases;
    # get_project_settings is the supported way to read settings here
    from scrapy.utils.project import get_project_settings
    
    settings = get_project_settings()
    
    class DoubanPipeline(object):
        def __init__(self):
            host = settings["MONGODB_HOST"]
            port = settings["MONGODB_PORT"]
            dbname = settings["MONGODB_DBNAME"]
            sheetname = settings["MONGODB_SHEETNAME"]
    
            # Create the MongoDB client
            client = pymongo.MongoClient(host=host, port=port)
            # Select the database
            mydb = client[dbname]
            # Collection ("sheet") that stores the scraped data
            self.sheet = mydb[sheetname]
    
        def process_item(self, item, spider):
            data = dict(item)
            # insert() is deprecated in pymongo 3; use insert_one()
            self.sheet.insert_one(data)
            return item
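
    A more idiomatic variant reads the settings through Scrapy's from_crawler hook instead of a module-level settings object, and closes the client when the spider finishes. A sketch, using the same setting names as above:

    import pymongo
    
    class DoubanPipeline(object):
        def __init__(self, host, port, dbname, sheetname):
            self.client = pymongo.MongoClient(host=host, port=port)
            self.sheet = self.client[dbname][sheetname]
    
        @classmethod
        def from_crawler(cls, crawler):
            # Scrapy calls this with the running crawler; crawler.settings
            # holds the values defined in settings.py
            s = crawler.settings
            return cls(s["MONGODB_HOST"], s["MONGODB_PORT"],
                       s["MONGODB_DBNAME"], s["MONGODB_SHEETNAME"])
    
        def process_item(self, item, spider):
            self.sheet.insert_one(dict(item))
            return item
    
        def close_spider(self, spider):
            self.client.close()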

    3. Create middlewares.py for the anti-anti-crawler middleware

    # -*- coding:utf-8 -*-
    
    import random
    import base64
    
    from douban.settings import USER_AGENTS
    from douban.settings import PROXIES
    
    # Attach a random User-Agent to every request
    class RandomUserAgent(object):
        def process_request(self, request, spider):
            useragent = random.choice(USER_AGENTS)
            request.headers.setdefault("User-Agent", useragent)
    
    # Route every request through a randomly chosen proxy
    class RandomProxy(object):
        def process_request(self, request, spider):
            proxy = random.choice(PROXIES)
    
            if not proxy['user_passwd']:
                # Proxy without credentials: only the proxy URL is needed
                request.meta['proxy'] = "http://" + proxy['ip_port']
            else:
                # Base64-encode the credentials (bytes in, str out on Python 3)
                base64_userpasswd = base64.b64encode(
                    proxy['user_passwd'].encode()).decode()
                # HTTP Basic auth header expected by the proxy server
                request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
                request.meta['proxy'] = "http://" + proxy['ip_port']
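
    The Proxy-Authorization header carries standard HTTP Basic credentials, i.e. the base64 encoding of "user:password". A quick check of what the middleware produces (the credentials here are placeholders):

    >>> import base64
    >>> base64.b64encode("user:pass".encode()).decode()
    'dXNlcjpwYXNz'
    # header sent on the wire: Proxy-Authorization: Basic dXNlcjpwYXNz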

    4. Configure settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for douban project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     http://doc.scrapy.org/en/latest/topics/settings.html
    #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'douban'
    
    SPIDER_MODULES = ['douban.spiders']
    NEWSPIDER_MODULE = 'douban.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    
    # Obey robots.txt rules
    #ROBOTSTXT_OBEY = True
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    DOWNLOAD_DELAY = 2.5
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'douban.middlewares.MyCustomSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    DOWNLOADER_MIDDLEWARES = {
        'douban.middlewares.RandomUserAgent': 100,
        'douban.middlewares.RandomProxy': 200,
    }
    
    USER_AGENTS = [
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13'
    ]
    
    PROXIES = [
        {"ip_port": "121.42.140.113:16816", "user_passwd": "mr_mao_hacker:sffqry9r"},
        #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
        #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
        #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
    ]
    
    #LOG_FILE = "douban.log"
    #LOG_LEVEL = "DEBUG"
    # Enable or disable extensions
    # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
        'douban.pipelines.DoubanPipeline': 300,
    }
    
    # MongoDB host
    MONGODB_HOST = "127.0.0.1"
    
    # MongoDB port
    MONGODB_PORT = 27017
    
    # Database name
    MONGODB_DBNAME = "Douban"
    
    # Collection that stores the scraped data
    MONGODB_SHEETNAME = "doubanmovies"
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
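
    With all four pieces in place, the crawl is started from the project root, and the results can be checked directly in MongoDB. A quick verification sketch, assuming a local mongod and the database/collection names from the settings above:

    $ scrapy crawl doubanmovie
    
    >>> import pymongo
    >>> db = pymongo.MongoClient("127.0.0.1", 27017)["Douban"]
    >>> db["doubanmovies"].count_documents({})   # approaches 250 as the crawl finishes
    >>> db["doubanmovies"].find_one({}, {"title": 1, "star": 1})
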
  • Original post: https://www.cnblogs.com/cuzz/p/7632033.html