zoukankan      html  css  js  c++  java
  • python之scrapy篇(一)

    一、首先创建工程(cmd中进行)

    scrapy startproject xxx

    二、编写Item文件

    添加需要的字段

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class DoubanItem(scrapy.Item):
        """Fields collected for one movie on the Douban Top-250 list."""
        # Movie title
        title = scrapy.Field()
        # Movie information line (director / cast / year text)
        info = scrapy.Field()
        # Rating score
        score = scrapy.Field()
        # Number of people who rated the movie
        number = scrapy.Field()
        # Short synopsis / one-line quote
        content = scrapy.Field()


    三、进入spider文件(cmd中进行)

    scrapy genspider demo movie.douban.com

    创建完成后进入

    编写代码

    # -*- coding: utf-8 -*-
    import scrapy
    from douban.items import DoubanItem
    
    
    class DoubanmovieSpider(scrapy.Spider):
        """Crawl the Douban Top-250 pages and yield one DoubanItem per movie."""
        name = 'doubanmovie'
        allowed_domains = ['movie.douban.com']
        offset = 0
        url = "https://movie.douban.com/top250?start="
        start_urls = (
            url + str(offset),
        )

        def parse(self, response):
            """Parse one listing page; paginate until offset reaches 225.

            Yields DoubanItem instances followed by a Request for the next page.
            """
            # Each movie card lives inside a <div class="info"> node.
            movies = response.xpath("//div[@class='info']")

            for movie in movies:
                # Create a fresh item per movie: reusing a single instance
                # across yields can leak field values between movies.
                item = DoubanItem()

                name = movie.xpath('div[@class="hd"]/a/span/text()').extract()
                message = movie.xpath('div[@class="bd"]/p/text()').extract()
                star = movie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
                number = movie.xpath('div[@class="bd"]/div[@class="star"]/span/text()').extract()
                quote = movie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()

                item['title'] = ''.join(name)
                # Some movies have no one-line quote; fall back to ''.
                item['info'] = quote[0] if quote else ''
                # Guard against a missing rating node instead of IndexError.
                item['score'] = star[0] if star else ''
                # Collapse spaces and newlines.  The original code had a raw
                # line break inside the string literal (a SyntaxError); '\n'
                # is the intended separator being removed.
                item['content'] = ';'.join(message).replace(' ', '').replace('\n', '')
                # The second span text is the rating-count string (e.g.
                # "123456人评价" — TODO confirm against the live page); keep
                # digits only.  The original `split('')` always raises
                # ValueError: empty separator is not allowed in Python.
                raw_count = number[1] if len(number) > 1 else ''
                item['number'] = ''.join(ch for ch in raw_count if ch.isdigit())

                yield item

            # Pages start at offsets 0, 25, ..., 225 (10 pages of 25 movies).
            if self.offset < 225:
                self.offset += 25
                yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

    四、配置setting文件

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for douban project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'douban'

    SPIDER_MODULES = ['douban.spiders']
    NEWSPIDER_MODULE = 'douban.spiders'


    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # NOTE(review): USER_AGENT is rebound to a *list* further down this file;
    # only the last assignment wins, so this string value is never used.
    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"

    # Obey robots.txt rules
    # Left commented out on purpose, so robots.txt is not enforced for this crawl.
    # ROBOTSTXT_OBEY = True

    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32

    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16

    # Disable cookies (enabled by default)
    # Disabled so requests do not carry a trackable session.
    COOKIES_ENABLED = False

    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False

    # Override the default request headers:
    DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
      # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      # 'Accept-Language': 'en',
    }

    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'douban.middlewares.DoubanSpiderMiddleware': 543,
    #}

    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    # RandomUserAgent is a custom middleware (defined in douban/middlewares.py,
    # not shown here) — presumably it picks a random entry from the
    # USER_AGENT list below; verify against the middleware source.
    DOWNLOADER_MIDDLEWARES = {
       # 'douban.middlewares.DoubanDownloaderMiddleware': 100,
       'douban.middlewares.RandomUserAgent': 100,
    }
    
    # Pool of user-agent strings covering most common browsers.
    # NOTE(review): this rebinds USER_AGENT (a string above) to a list;
    # consider a distinct name such as USER_AGENT_LIST, and confirm which
    # setting key the RandomUserAgent middleware actually reads.
    USER_AGENT = [
       # Opera
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
       "Opera/8.0 (Windows NT 5.1; U; en)",
       "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
       # Firefox
       "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
       "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
       # Safari
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
       # Chrome
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
       "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
       "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
       # 360 Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
       "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
       # Taobao Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
       # Liebao (Cheetah) Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
       "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
       # QQ Browser
       "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
       # Sogou Browser
       "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
       "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
       # Maxthon Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
       # UC Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}

    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    # Lower number runs first: items pass through DoubanWritePipeline (250)
    # before DoubanSqlPipeline (300).
    ITEM_PIPELINES = {
       # 'douban.pipelines.DoubanPipeline': 300,
       'douban.pipelines.DoubanSqlPipeline': 300,
       'douban.pipelines.DoubanWritePipeline': 250,
    }
    # MySQL host name ("IP" is a placeholder — fill in the real address)
    MYSQL_HOST = "IP"
    # MySQL port
    MYSQL_PORT = 3306
    # Database user
    MYSQL_USER = "root"
    # Database password ("Password" is a placeholder)
    MYSQL_PASSWORD = "Password"
    # Database name
    MYSQL_DBNAME = "mydouban"
    # Table that stores the scraped rows
    MYSQL_TABLENAME = "doubanmovies"
    # Connection character set
    MYSQL_CHARSET = "utf8"


    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False

    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

    配置管道文件

    # Pipeline registration (lower number = runs earlier in the chain).
    ITEM_PIPELINES = {
    # 'douban.pipelines.DoubanPipeline': 300,
    'douban.pipelines.DoubanSqlPipeline': 300,
    'douban.pipelines.DoubanWritePipeline': 250,
    }

    User_Agent配置(包含了大多数浏览器)

    # Custom downloader middleware (douban/middlewares.py, not shown here)
    # that presumably rotates the user-agent per request — verify there.
    DOWNLOADER_MIDDLEWARES = {
       # 'douban.middlewares.DoubanDownloaderMiddleware': 100,
       'douban.middlewares.RandomUserAgent': 100,
    }
    # NOTE(review): this list shadows any earlier string USER_AGENT setting.
    USER_AGENT = [
       # Opera
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
       "Opera/8.0 (Windows NT 5.1; U; en)",
       "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
       # Firefox
       "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
       "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
       # Safari
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
       # Chrome
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
       "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
       "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
       # 360 Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
       "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
       # Taobao Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
       # Liebao (Cheetah) Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
       "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
       # QQ Browser
       "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
       # Sogou Browser
       "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
       "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
       # Maxthon Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
       # UC Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]

    # MySQL host name ("IP" is a placeholder — fill in the real address)
    MYSQL_HOST = "IP"
    # MySQL port
    MYSQL_PORT = 3306
    # Database user
    MYSQL_USER = "root"
    # Database password ("password" is a placeholder)
    MYSQL_PASSWORD = "password"
    # Database name
    MYSQL_DBNAME = "mydouban"
    # Table that stores the scraped rows
    MYSQL_TABLENAME = "doubanmovies"
    # Connection character set
    MYSQL_CHARSET = "utf8"

    五、管道文件pipelines

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    import pymongo
    import pymysql
    from scrapy import settings
    import json
    import logging
    from pymysql import cursors
    from twisted.enterprise import adbapi
    import time
    import copy
    
    # Pipeline that appends each item to a local JSON-lines file.
    class DoubanWritePipeline(object):
        def __init__(self):
            # Binary mode: we encode explicitly in process_item.
            self.filename = open("douban.json", "wb")

        def process_item(self, item, spider):
            """Serialize one item as a JSON line and pass it on unchanged."""
            # ensure_ascii=False keeps Chinese text human-readable; the
            # original had a raw line break inside the string literal (a
            # SyntaxError) — '\n' is the intended record separator.
            text = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.filename.write(text.encode("utf-8"))
            return item

        def close_spider(self, spider):
            # Fixed typo: the original `colse_spider` was never called by
            # Scrapy, so the output file was never closed/flushed reliably.
            self.filename.close()
    
    # Pipeline that inserts each scraped movie into MySQL (create the
    # database/table beforehand).
    class DoubanSqlPipeline(object):
        def __init__(self):
            # NOTE(review): 'IP' and 'password' are placeholders and the
            # credentials are hard-coded — prefer reading the MYSQL_*
            # values from the Scrapy settings instead.
            self.conn = pymysql.connect(host='IP', user='root',
                                   passwd='password', db='mydouban', charset='utf8')
            self.cur = self.conn.cursor()

        def process_item(self, item, spider):
            """Insert one item into the doubanmovies table."""
            # Parameterized query: the driver quotes/escapes every value,
            # which removes the SQL-injection risk of the original
            # format()-built statement and the dependency on
            # pymysql.escape_string() (removed in modern PyMySQL).
            sql = ("INSERT INTO doubanmovies "
                   "(title,info,score,number,content,createtime) "
                   "VALUES (%s,%s,%s,%s,%s,%s)")
            params = (
                item["title"],
                item["info"],
                item["score"],
                item["number"],
                item["content"],
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            )
            # Reconnect transparently if the connection timed out.
            self.conn.ping(reconnect=True)
            self.cur.execute(sql, params)
            self.conn.commit()
            # Return the item so any later pipeline still receives it
            # (the original returned None here).
            return item

        def close_spider(self, spider):
            self.cur.close()
            self.conn.close()

    运行

    scrapy crawl xxx

    OVER!

    下面展示一下效果...

  • 相关阅读:
    Django Rest Swagger生成api文档
    django 完整日志配置
    django解决跨域请求的问题
    Django REST framework 自定义字段
    Django model 定义属性
    mysql server has gone away的原因
    也谈时间管理和GTD
    MySQL之thread cache
    MySQL之aborted connections和aborted clients
    TokuDB的特点验证
  • 原文地址:https://www.cnblogs.com/jake-jin/p/11359798.html
Copyright © 2011-2022 走看看