1. Creating the project
The project here is named scrapyuniversal and is created in the root of the D drive. The steps are as follows.
Open cmd, switch to the root of the D drive, and run the following command:
scrapy startproject scrapyuniversal
If the command succeeds, a folder named scrapyuniversal will be generated in the root of the D drive.
2. Creating a spider from the crawl template
Open a command-line window, change into the scrapyuniversal folder just created on the D drive, and run the following command:
scrapy genspider -t crawl china tech.china.com
If this succeeds, a new spider file appears in the spiders directory under scrapyuniversal. We will walk through this spider file below (section 4); the code there is commented.
3. Directory structure
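For reference, before any edits the china.py that genspider -t crawl generates usually looks roughly like the sketch below (the exact skeleton can differ slightly between Scrapy versions); the placeholder rule and the empty parse_item body are what we replace in section 4:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ChinaSpider(CrawlSpider):
    name = 'china'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/']

    rules = (
        # placeholder rule generated by the template; rewritten in section 4
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        i = {}
        # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # i['name'] = response.xpath('//div[@id="name"]').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        return i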
scrapyuniversal
│  scrapy.cfg
│  spider.sql
│  start.py
│
└─scrapyuniversal
   │  items.py
   │  loaders.py
   │  middlewares.py
   │  pipelines.py
   │  settings.py
   │  __init__.py
   │
   ├─spiders
   │  │  china.py
   │  │  __init__.py
   │  │
4. china.py
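The tree also shows a start.py at the project root whose contents are not listed in this post. A minimal sketch of what such a launcher script usually contains, assuming it simply runs the china spider so the crawl can be started from an IDE, is:

from scrapy.cmdline import execute

# Run the china spider as if "scrapy crawl china" had been typed on the command line.
execute(['scrapy', 'crawl', 'china'])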
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import NewsItem
from ..loaders import ChinaLoader


class ChinaSpider(CrawlSpider):
    name = 'china'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/articles/']

    # When a link is followed, the response it returns is fed back through the
    # rules for another round of extraction.
    # The first rule extracts article links and hands them to parse_item;
    # the second rule follows pagination links, limited to the first two pages
    # (spans whose text is less than 3).
    rules = (
        Rule(LinkExtractor(allow='article/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        Rule(LinkExtractor(restrict_xpaths="//div[@id='pageStyle']//span[text()<3]")),
    )

    def parse_item(self, response):
        # Filling the Item directly produced messy output, so an ItemLoader is used instead.
        # item = NewsItem()
        # item['title'] = response.xpath("//h1[@id='chan_newsTitle']/text()").extract_first()
        # item['url'] = response.url
        # item['text'] = ''.join(response.xpath("//div[@id='chan_newsDetail']//text()").extract()).strip()
        # # re_first extracts the publish time with a regular expression
        # item['datetime'] = response.xpath("//div[@id='chan_newsInfo']/text()").re_first('(\d+-\d+-\d+\s\d+:\d+:\d+)')
        # item['source'] = response.xpath('//div[@id="chan_newsInfo"]/text()').re_first("来源: (.*)").strip()
        # item['website'] = "中华网"
        # yield item
        loader = ChinaLoader(item=NewsItem(), response=response)
        loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
        loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()', re='(\d+-\d+-\d+\s\d+:\d+:\d+)')
        loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re='来源:(.*)')
        loader.add_value('website', '中华网')
        yield loader.load_item()
5. loaders.py
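To sanity-check the first Rule before running a full crawl, the LinkExtractor can be tried out interactively in scrapy shell. The session below is only an illustration and assumes the listing page still uses the id="left_side" / class="con_item" markup:

scrapy shell http://tech.china.com/articles/
>>> from scrapy.linkextractors import LinkExtractor
>>> le = LinkExtractor(allow=r'article/.*\.html',
...                    restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]')
>>> le.extract_links(response)[:5]   # should print Link objects pointing at article pages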
#!/usr/bin/env python
# encoding: utf-8
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Join, Compose


class NewsLoader(ItemLoader):
    """
    Define a common output processor, TakeFirst.
    TakeFirst: takes the first non-empty element of an iterable,
    equivalent to the extract_first() used in the plain-Item version above.
    """
    default_output_processor = TakeFirst()


class ChinaLoader(NewsLoader):
    """
    Compose's first argument, Join, concatenates the extracted list into a string;
    the second argument is a lambda that processes that string further (strips it).
    """
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
6. items.py
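The processors can also be exercised on their own, which makes it easier to see what ChinaLoader does to the list of text nodes extracted from the page. A small standalone check (not part of the project code) might look like this:

from scrapy.loader.processors import TakeFirst, Join, Compose

take_first = TakeFirst()
print(take_first(['', None, 'first value', 'second value']))  # -> 'first value'

# Join() concatenates the extracted strings with spaces,
# then the lambda strips leading/trailing whitespace from the result.
text_out = Compose(Join(), lambda s: s.strip())
print(text_out(['  paragraph one ', ' paragraph two  ']))  # joined with spaces, then stripped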
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Field, Item


class NewsItem(Item):
    # headline
    title = Field()
    # article URL
    url = Field()
    # body text
    text = Field()
    # publish time
    datetime = Field()
    # source
    source = Field()
    # site name, assigned the constant value 中华网
    website = Field()
7. Middleware changes: logic for picking a random User-Agent
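NewsItem behaves like a dict, except that only the declared fields may be assigned. A quick illustration (not part of the project code):

from scrapyuniversal.items import NewsItem

item = NewsItem()
item['title'] = 'Example headline'
item['website'] = '中华网'
print(dict(item))        # {'title': 'Example headline', 'website': '中华网'}
# item['author'] = 'x'   # would raise KeyError: NewsItem does not declare an 'author' field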
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import random

from scrapy import signals


class ProcessHeaderMidware():
    """Attach a randomly chosen User-Agent to every outgoing request."""

    def __init__(self):
        self.USER_AGENT_LIST = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]

    def process_request(self, request, spider):
        """Pick a header from the list at random and use it as the request's User-Agent."""
        ua = random.choice(self.USER_AGENT_LIST)
        spider.logger.info(msg='now entering download midware')
        if ua:
            request.headers['User-Agent'] = ua
            # Add desired logging message here.
            spider.logger.info(u'User-Agent is : {} {}'.format(request.headers.get('User-Agent'), request))
8. settings.py
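Once the middleware is enabled (see the DOWNLOADER_MIDDLEWARES entry in section 8), every request logs the User-Agent it was given. One way to confirm the header actually reaches the server, assuming scrapy shell is started inside the project so the project middlewares apply, is to fetch httpbin and inspect the echoed headers:

scrapy shell http://httpbin.org/headers
>>> print(response.text)   # the echoed "User-Agent" should be one of the entries in USER_AGENT_LIST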
# -*- coding: utf-8 -*-

# Scrapy settings for scrapyuniversal project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapyuniversal'

SPIDER_MODULES = ['scrapyuniversal.spiders']
NEWSPIDER_MODULE = 'scrapyuniversal.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapyuniversal (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapyuniversal.middlewares.ScrapyuniversalSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapyuniversal.middlewares.ScrapyuniversalDownloaderMiddleware': 543,
#}
DOWNLOADER_MIDDLEWARES = {
    'scrapyuniversal.middlewares.ProcessHeaderMidware': 543,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapyuniversal.pipelines.ScrapyuniversalPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

HTTP_PROXY = "127.0.0.1:5000"  # replace with the proxy you need
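Note that HTTP_PROXY is a custom setting: nothing in the code shown above reads it. If a proxy is actually needed, one way to consume the setting is a small downloader middleware along the lines of the hypothetical sketch below; ProxyMiddleware is not part of this project and would also have to be registered in DOWNLOADER_MIDDLEWARES.

# hypothetical addition to middlewares.py; not included in the project above
class ProxyMiddleware(object):

    def __init__(self, http_proxy):
        self.http_proxy = http_proxy

    @classmethod
    def from_crawler(cls, crawler):
        # read the custom HTTP_PROXY value from settings.py
        return cls(http_proxy=crawler.settings.get('HTTP_PROXY'))

    def process_request(self, request, spider):
        if self.http_proxy:
            # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy']
            request.meta['proxy'] = 'http://{}'.format(self.http_proxy)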