1. Creating the project
The project here is named scrapyuniversal and is created in the root of the D drive. The steps are as follows.
Open cmd, switch to the root of the D drive, and run the following command:
scrapy startproject scrapyuniversal
If the command succeeds, a folder named scrapyuniversal will be generated in the root of the D drive.
2. Creating a spider from the crawl template
Open a command-line window, change into the scrapyuniversal folder just created on the D drive, and run the following command:
scrapy genspider -t crawl china tech.china.com
If this succeeds, a new spider file appears in the spiders directory under scrapyuniversal. We will walk through this spider file below (section 4); the code there is commented.
3. Directory structure
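For reference, before any edits the china.py that genspider -t crawl generates usually looks roughly like the sketch below (the exact skeleton can differ slightly between Scrapy versions); the placeholder rule and the empty parse_item body are what we replace in section 4:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ChinaSpider(CrawlSpider):
    name = 'china'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/']

    rules = (
        # placeholder rule generated by the template; rewritten in section 4
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        i = {}
        # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # i['name'] = response.xpath('//div[@id="name"]').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        return i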
scrapyuniversal
│  scrapy.cfg
│  spider.sql
│  start.py
│
└─scrapyuniversal
   │  items.py
   │  loaders.py
   │  middlewares.py
   │  pipelines.py
   │  settings.py
   │  __init__.py
   │
   ├─spiders
   │  │  china.py
   │  │  __init__.py
   │  │
4. china.py
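The tree also shows a start.py at the project root whose contents are not listed in this post. A minimal sketch of what such a launcher script usually contains, assuming it simply runs the china spider so the crawl can be started from an IDE, is:

from scrapy.cmdline import execute

# Run the china spider as if "scrapy crawl china" had been typed on the command line.
execute(['scrapy', 'crawl', 'china'])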
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import NewsItem
from ..loaders import ChinaLoader


class ChinaSpider(CrawlSpider):
    name = 'china'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/articles/']

    # When a link is followed, the response it returns is fed back through the
    # rules for another round of extraction.
    # The first rule extracts article links and hands them to parse_item;
    # the second rule follows pagination links, limited to the first two pages
    # (spans whose text is less than 3).
    rules = (
        Rule(LinkExtractor(allow='article/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        Rule(LinkExtractor(restrict_xpaths="//div[@id='pageStyle']//span[text()<3]")),
    )

    def parse_item(self, response):
        # Filling the Item directly produced messy output, so an ItemLoader is used instead.
        # item = NewsItem()
        # item['title'] = response.xpath("//h1[@id='chan_newsTitle']/text()").extract_first()
        # item['url'] = response.url
        # item['text'] = ''.join(response.xpath("//div[@id='chan_newsDetail']//text()").extract()).strip()
        # # re_first extracts the publish time with a regular expression
        # item['datetime'] = response.xpath("//div[@id='chan_newsInfo']/text()").re_first('(\d+-\d+-\d+\s\d+:\d+:\d+)')
        # item['source'] = response.xpath('//div[@id="chan_newsInfo"]/text()').re_first("来源: (.*)").strip()
        # item['website'] = "中华网"
        # yield item
        loader = ChinaLoader(item=NewsItem(), response=response)
        loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
        loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()', re='(\d+-\d+-\d+\s\d+:\d+:\d+)')
        loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re='来源:(.*)')
        loader.add_value('website', '中华网')
        yield loader.load_item()
5. loaders.py
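To sanity-check the first Rule before running a full crawl, the LinkExtractor can be tried out interactively in scrapy shell. The session below is only an illustration and assumes the listing page still uses the id="left_side" / class="con_item" markup:

scrapy shell http://tech.china.com/articles/
>>> from scrapy.linkextractors import LinkExtractor
>>> le = LinkExtractor(allow=r'article/.*\.html',
...                    restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]')
>>> le.extract_links(response)[:5]   # should print Link objects pointing at article pages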
#!/usr/bin/env python
# encoding: utf-8
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Join, Compose


class NewsLoader(ItemLoader):
    """
    Define a common output processor, TakeFirst.
    TakeFirst: takes the first non-empty element of an iterable,
    equivalent to the extract_first() used in the plain-Item version above.
    """
    default_output_processor = TakeFirst()


class ChinaLoader(NewsLoader):
    """
    Compose's first argument, Join, concatenates the extracted list into a string;
    the second argument is a lambda that processes that string further (strips it).
    """
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
6. items.py
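The processors can also be exercised on their own, which makes it easier to see what ChinaLoader does to the list of text nodes extracted from the page. A small standalone check (not part of the project code) might look like this:

from scrapy.loader.processors import TakeFirst, Join, Compose

take_first = TakeFirst()
print(take_first(['', None, 'first value', 'second value']))  # -> 'first value'

# Join() concatenates the extracted strings with spaces,
# then the lambda strips leading/trailing whitespace from the result.
text_out = Compose(Join(), lambda s: s.strip())
print(text_out(['  paragraph one ', ' paragraph two  ']))  # joined with spaces, then stripped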
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Field, Item


class NewsItem(Item):
    # headline
    title = Field()
    # article URL
    url = Field()
    # body text
    text = Field()
    # publish time
    datetime = Field()
    # source
    source = Field()
    # site name, assigned the constant value 中华网
    website = Field()
7. Middleware changes: logic for picking a random User-Agent
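NewsItem behaves like a dict, except that only the declared fields may be assigned. A quick illustration (not part of the project code):

from scrapyuniversal.items import NewsItem

item = NewsItem()
item['title'] = 'Example headline'
item['website'] = '中华网'
print(dict(item))        # {'title': 'Example headline', 'website': '中华网'}
# item['author'] = 'x'   # would raise KeyError: NewsItem does not declare an 'author' field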
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import random

from scrapy import signals


class ProcessHeaderMidware():
    """Attach a randomly chosen User-Agent to every outgoing request."""

    def __init__(self):
        self.USER_AGENT_LIST = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]

    def process_request(self, request, spider):
        """Pick a header from the list at random and use it as the request's User-Agent."""
        ua = random.choice(self.USER_AGENT_LIST)
        spider.logger.info(msg='now entering download midware')
        if ua:
            request.headers['User-Agent'] = ua
            # Add desired logging message here.
            spider.logger.info(u'User-Agent is : {} {}'.format(request.headers.get('User-Agent'), request))
8. settings.py
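Once the middleware is enabled (see the DOWNLOADER_MIDDLEWARES entry in section 8), every request logs the User-Agent it was given. One way to confirm the header actually reaches the server, assuming scrapy shell is started inside the project so the project middlewares apply, is to fetch httpbin and inspect the echoed headers:

scrapy shell http://httpbin.org/headers
>>> print(response.text)   # the echoed "User-Agent" should be one of the entries in USER_AGENT_LIST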
# -*- coding: utf-8 -*-

# Scrapy settings for scrapyuniversal project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapyuniversal'

SPIDER_MODULES = ['scrapyuniversal.spiders']
NEWSPIDER_MODULE = 'scrapyuniversal.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapyuniversal (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapyuniversal.middlewares.ScrapyuniversalSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapyuniversal.middlewares.ScrapyuniversalDownloaderMiddleware': 543,
#}
DOWNLOADER_MIDDLEWARES = {
    'scrapyuniversal.middlewares.ProcessHeaderMidware': 543,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapyuniversal.pipelines.ScrapyuniversalPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

HTTP_PROXY = "127.0.0.1:5000"  # replace with the proxy you need
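Note that HTTP_PROXY is a custom setting: nothing in the code shown above reads it. If a proxy is actually needed, one way to consume the setting is a small downloader middleware along the lines of the hypothetical sketch below; ProxyMiddleware is not part of this project and would also have to be registered in DOWNLOADER_MIDDLEWARES.

# hypothetical addition to middlewares.py; not included in the project above
class ProxyMiddleware(object):

    def __init__(self, http_proxy):
        self.http_proxy = http_proxy

    @classmethod
    def from_crawler(cls, crawler):
        # read the custom HTTP_PROXY value from settings.py
        return cls(http_proxy=crawler.settings.get('HTTP_PROXY'))

    def process_request(self, request, spider):
        if self.http_proxy:
            # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy']
            request.meta['proxy'] = 'http://{}'.format(self.http_proxy)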