1. Create a new Scrapy project
scrapy startproject <project_name>
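For reference, startproject generates the standard Scrapy skeleton. Assuming the project name used in this article (aticleSpider), the layout looks roughly like this; the until/common.py helper that holds get_md5 is added by hand later and is not generated:

aticleSpider/
    scrapy.cfg                # deploy configuration
    aticleSpider/
        __init__.py
        items.py              # item definitions (section 3)
        middlewares.py        # spider/downloader middlewares (unused here)
        pipelines.py          # item pipelines (section 4)
        settings.py           # project settings (section 5)
        spiders/
            __init__.py       # spider modules live here (section 2)
        until/
            common.py         # hand-written helpers such as get_md5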
2. Writing the spider (crawling 163.com Beijing news as an example)
This example uses Scrapy's ItemLoader mechanism. An ItemLoader has three particularly important methods: add_xpath(), add_value(), and add_css(). Each takes two main arguments: the first is the item field name, the second is either a value or an extraction rule (an XPath or CSS expression).
Usage is illustrated in the examples below.
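Before the full spider, here is a minimal, self-contained sketch of the three methods. The item, spider, and selectors are made up for illustration and are not part of this project:

import scrapy
from scrapy.loader import ItemLoader


class DemoItem(scrapy.Item):
    # hypothetical item, for illustration only
    title = scrapy.Field()
    editor = scrapy.Field()
    docurl = scrapy.Field()


class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['http://bendi.news.163.com/']   # placeholder URL

    def parse(self, response):
        loader = ItemLoader(item=DemoItem(), response=response)
        loader.add_xpath('title', '//h1/text()')           # field name + XPath extraction rule
        loader.add_css('editor', 'span.ep-editor::text')   # field name + CSS extraction rule
        loader.add_value('docurl', response.url)           # field name + a literal value
        yield loader.load_item()

The full spider for the 163.com Beijing news feed follows.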
import json
import time

import scrapy
from scrapy.http import Request

from aticleSpider.items import ArticleItem
from aticleSpider.items import ArticleItemLoader
from aticleSpider.until.common import get_md5
# from scrapy.loader import ItemLoader


class ArticleSpider(scrapy.Spider):
    # spider name; `scrapy crawl` refers to this
    name = 'bj163news'
    # domains the spider is allowed to crawl
    allowed_domains = ['163.com']
    # entry URL
    start_urls = ["http://bendi.news.163.com/beijing/special/04388GGG/bjxinxiliu.js?callback=data_callback&_="]
    num = 1
    times = time.time()
    add_time = str(times).replace('.', '')[0:13]   # millisecond-style timestamp appended to list-page URLs

    def parse(self, response):
        """
        1. Extract the article URLs from the list page and parse the detail fields.
        2. Build the URL of the next list page and schedule it for download.
        :param response:
        :return:
        """
        # strip the JSONP wrapper ("data_callback(...)") and parse the article list
        response_str = str(response.body.decode("gbk", "ignore")).replace('data_callback(', '').replace(')', '')
        js = json.loads(response_str)
        for line in js:
            keys = []
            title = line['title']
            commenturl = line['commenturl']
            docurl = line['docurl']
            newstype = line['newstype']
            title_img = line['imgurl']
            for keywords in line['keywords']:
                keys.append(keywords['keyname'])
            key_words = ','.join(keys)
            # pass the list-page fields to the detail parser through meta
            metas = {'title': title, 'commenturl': commenturl, 'newstype': newstype,
                     'title_img': title_img, 'key_words': key_words}
            yield Request(url=docurl, meta=metas, callback=self.parse_detail)

        # pagination: build the URL of the next list page
        self.num = self.num + 1
        str_num = str(self.num).zfill(2)   # zero-pad the page number: '02', ..., '10'
        next_url = ("http://bendi.news.163.com/beijing/special/04388GGG/bjxinxiliu_"
                    + str_num + ".js?callback=data_callback&_=" + self.add_time)
        # uncomment to follow the remaining list pages
        # yield Request(url=next_url, callback=self.parse)

    # detail-page parsing, ItemLoader style
    def parse_detail(self, response):
        # item = ArticleItem()
        main_data = response.meta
        # direct extraction without ItemLoader, kept for reference:
        # docurl = response.url
        # project_doc_url = get_md5(docurl)
        # news_editor = response.xpath('//span[@class="ep-editor"]/text()').extract_first("").replace('责任编辑:', '')
        # news_source = response.xpath('//div[@class="ep-source cDGray"]/span/text()').extract_first("").replace('本文来源:', '')
        # news_time = response.xpath('//div[@class="post_time_source"]//text()').extract_first("").replace('来源:', '').strip()
        # content_org = response.xpath('//div[@id="endText"]').extract_first("")

        source_title = response.xpath('//div[@id="endText"]/p[@class="otitle"]/text()') \
            .extract_first(" ").replace('(原标题:', '').replace(')', '').strip()
        news_conts = response.xpath('//div[@id="endText"]/p')
        news_cont = []
        for one_p in news_conts:
            img = one_p.xpath('.//img')
            if img != []:
                # the paragraph contains an image: store its URL
                img_url = one_p.xpath('.//@src').extract_first("")
                news_cont.append({'content': img_url, 'type': 'pic'})
            else:
                try:
                    text = one_p.xpath('.//text()').extract_first("")
                    if text.find('(原标题:') > 0 or text == '':
                        continue
                    news_cont.append({'content': text.strip(), 'type': 'text'})
                except:
                    pass

        # the commented block below shows how to fill the item without ItemLoader
        # item['content_org'] = ''
        # item['project_doc_url'] = project_doc_url
        # item['title'] = main_data.get('title', '')   # .get() returns a default if the key is missing
        # item['source_title'] = source_title
        # item['commenturl'] = main_data.get('commenturl', '')
        # item['newstype'] = main_data.get('newstype', '')
        # item['docurl'] = docurl
        # item['title_img'] = [main_data.get('title_img', '')]
        # item['key_words'] = main_data.get('key', '')
        # item['news_editor'] = news_editor
        # item['news_source'] = news_source
        # item['news_time'] = news_time
        # item['news_cont'] = news_cont
        # yield item

        item_loader = ArticleItemLoader(item=ArticleItem(), response=response)
        item_loader.add_value('title', main_data.get('title', ''))
        item_loader.add_value('project_doc_url', get_md5(response.url))
        item_loader.add_value('commenturl', main_data.get('commenturl', ''))
        item_loader.add_value('newstype', main_data.get('newstype', ''))
        item_loader.add_value('docurl', response.url)
        item_loader.add_value('source_title', source_title)
        item_loader.add_value('title_img', [main_data.get('title_img', '')])
        item_loader.add_value('key_words', main_data.get('key_words', ''))
        item_loader.add_xpath('news_editor', '//span[@class="ep-editor"]/text()')
        item_loader.add_xpath('news_source', '//div[@class="ep-source cDGray"]/span/text()')
        item_loader.add_xpath('content_org', '//div[@id="endText"]')
        item_loader.add_value('news_cont', news_cont)
        item_loader.add_xpath('news_time', '//div[@class="post_time_source"]//text()')
        article_item = item_loader.load_item()
        yield article_item
3. Item design for the ItemLoader
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose


class ArticleSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


# custom article loader: the default output processor takes the first element of the list
class ArticleItemLoader(ItemLoader):
    default_output_processor = TakeFirst()


# return the value unchanged (used to override the default output processor)
def return_value(value):
    return value


# 163: normalize the publish time
def return_time(value):
    return value.replace('来源:', '').strip()


# 163: extract the editor name
def get_editor(value):
    return value.replace('责任编辑:', '')


# 163: extract the article source
def get_source(value):
    return value.replace('本文来源:', '')


# 163 item
class ArticleItem(scrapy.Item):
    """
    field = scrapy.Field()
    Field() accepts two processor arguments:
        input_processor  - runs as each value is collected
        output_processor - runs when ItemLoader.load_item() is called and produces the final value
    In other words, input processors run while data is being gathered; once collection is done,
    calling ItemLoader.load_item() runs the output processors and returns the final result.
    """
    # article title (source_title is the "original title" shown on the page)
    source_title = scrapy.Field()
    title = scrapy.Field()
    # article URL
    docurl = scrapy.Field()
    # MD5 of the article URL
    project_doc_url = scrapy.Field()
    # headline image
    title_img = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    # local path of the downloaded image
    img_path = scrapy.Field()
    # keywords
    key_words = scrapy.Field()
    # news type, e.g. 'article'
    newstype = scrapy.Field()
    # comment URL
    commenturl = scrapy.Field()
    # article editor
    news_editor = scrapy.Field(
        input_processor=MapCompose(get_editor)
    )
    # per-paragraph content: image URL or text
    news_cont = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    # raw HTML of the article body
    content_org = scrapy.Field()
    # image URL
    # news_pic_url = scrapy.Field()
    # publish time
    news_time = scrapy.Field(
        input_processor=MapCompose(return_time)
    )
    # article source
    news_source = scrapy.Field(
        input_processor=MapCompose(get_source)
    )
    down_img = scrapy.Field()
Declaring input/output processors in the Field definition

Priority, from highest to lowest:
- field_in / field_out attributes declared on the Item Loader (written as <field>_in / <field>_out)
- Field metadata (the input_processor and output_processor keyword arguments)
- the Item Loader's defaults (default_input_processor / default_output_processor)

Tips: as a rule of thumb, declare input processors on the Item Loader (field_in) and output processors in the Field metadata, as the sketch below shows.
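A minimal sketch of the three declaration levels, using a hypothetical editor field (the strip_editor helper is made up for illustration):

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Join


def strip_editor(value):
    # illustrative input processor
    return value.replace('责任编辑:', '')


class DemoItem(scrapy.Item):
    # 2) Field metadata: input_processor / output_processor keyword arguments
    editor = scrapy.Field(
        input_processor=MapCompose(strip_editor),
        output_processor=Join(','),
    )


class DemoItemLoader(ItemLoader):
    # 3) loader-wide defaults (lowest priority)
    default_output_processor = TakeFirst()

    # 1) per-field declarations on the loader (highest priority),
    #    written as <field>_in / <field>_out
    editor_in = MapCompose(str.strip)
    editor_out = TakeFirst()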
内置的处理器
Identity
啥也不做TakeFirst
返回第一个非空值,通常用作输出处理器Join
将结果连起来,默认使用空格’ ‘Compose
将函数链接起来形成管道流,产生最后的输出MapCompose
跟上面的Compose
类似,区别在于内部结果在函数中的传递方式.它的输入值是可迭代的,首先将第一个函数依次作用于所有值,产生新的可迭代输入,作为第二个函数的输入,最后生成的结果连起来返回最终值,一般用在输入处理器中。SelectJmes
使用json路径来查询值并返回结果
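Called directly, outside a loader, they behave like this (the input lists are made up; SelectJmes additionally requires the jmespath package, so it is omitted here):

from scrapy.loader.processors import Identity, TakeFirst, Join, Compose, MapCompose

print(Identity()(['one', 'two', 'three']))      # ['one', 'two', 'three']
print(TakeFirst()([None, '', 'one', 'two']))    # 'one'  -> first non-null, non-empty value
print(Join()(['one', 'two', 'three']))          # 'one two three'  -> default separator is ' '
print(Join(',')(['one', 'two', 'three']))       # 'one,two,three'
print(Compose(lambda v: v[0], str.upper)(['one', 'two']))    # 'ONE'   -> the whole list goes through each function
print(MapCompose(str.strip, str.upper)([' one ', ' two ']))  # ['ONE', 'TWO'] -> each value goes through each function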
4. Pipeline design
Pipelines are where items get processed. This example uses them for image downloading, JSON file export, and writing data into a MySQL database. The concrete SQL column lists are replaced with * below.
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import codecs
import json

import pymysql
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter
from scrapy.http import Request

from aticleSpider.until.common import get_md5


# default pipeline generated by Scrapy
class ArticleSpiderPipeline(object):
    def process_item(self, item, spider):
        return item


# export items to a JSON file with Scrapy's built-in JsonItemExporter
class JsonExporterPipeline(object):
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


# write 163 Beijing news items into MySQL
class Bj163MySqlPipeline(object):
    def __init__(self, db_parms):
        self.conn = pymysql.connect(**db_parms)
        self.cursor = self.conn.cursor()

    @classmethod
    def from_settings(cls, settings):
        db_parms = dict(
            host=settings['MYSQL_HOST'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            database=settings['MYSQL_DBNAME'],
            charset='utf8',
        )
        return cls(db_parms)

    def process_item(self, item, spider):
        # skip articles that are already stored
        select_sql_find = """
            select id from toon_news_163 WHERE pageurl = %s;
        """
        self.cursor.execute(select_sql_find, (item['docurl'],))
        odis = self.cursor.fetchall()
        if odis == ():
            # the column list is replaced with * here, as noted above
            insert_sql = """
                insert into toon_news_163(*) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);
            """
            try:
                source_title = item['source_title']
            except:
                source_title = item['title']
            self.cursor.execute(insert_sql, ('163网易北京', item['news_time'], item['docurl'], item['news_editor'],
                                             source_title, item['key_words'], item['news_source'], item['title'],
                                             item['title_img'], item['content_org'], item['img_path'], item['docurl']))
            self.conn.commit()

            # fetch the id of the row just inserted, then store each paragraph
            select_sql = """
                select max(id) FROM toon_news_163;
            """
            self.cursor.execute(select_sql)
            oidss = self.cursor.fetchall()
            max_id = oidss[0][0]
            content = item['news_cont']
            for i in range(0, len(content)):
                if content[i]['type'] == 'pic':
                    pic_url = content[i]['content']
                else:
                    pic_url = ''
                insert_con_sql = """
                    insert into toon_news_content_163(*) VALUES (%s,%s,%s,%s,%s,%s);
                """
                self.cursor.execute(insert_con_sql, (str(max_id), content[i]['content'], content[i]['type'],
                                                     str(i + 1), 0, item['docurl']))
                self.conn.commit()
        return item


# custom JSON file export
class JsonWithEncodingPipeline(object):
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def spider_closed(self, spider):
        self.file.close()


# download the headline image of each article
class ArticleImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        # name the file after the MD5 of the image URL
        image_guid = get_md5(url)
        return '%s.jpg' % image_guid

    def item_completed(self, results, item, info):
        if "title_img" in item:
            img_path = ''
            for ok, value in results:
                img_path = value['path']
            item['img_path'] = img_path
        return item
5. Settings
The important parts are commented in the file below. The numbers in ITEM_PIPELINES are execution priorities: the lower the number, the earlier that pipeline runs.
# -*- coding: utf-8 -*-

# Scrapy settings for aticleSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
import os

BOT_NAME = 'aticleSpider'

SPIDER_MODULES = ['aticleSpider.spiders']
NEWSPIDER_MODULE = 'aticleSpider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'aticleSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'aticleSpider.middlewares.AticlespiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'aticleSpider.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'aticleSpider.pipelines.ArticleSpiderPipeline': 100,
    'aticleSpider.pipelines.Bj163MySqlPipeline': 3,
    'aticleSpider.pipelines.ArticleImagePipeline': 1,
}

IMAGES_URLS_FIELD = 'title_img'   # item field holding the image URLs; its value must be a list
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, '163_news')   # directory where downloaded images are stored

MYSQL_HOST = ''       # database host
MYSQL_USER = ''       # database user
MYSQL_DBNAME = ''     # database name
MYSQL_PASSWORD = ''   # database password

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
6. Running the Scrapy project
scrapy crawl <spider_name>  (the spider's name attribute, bj163news in this example, not the project name)
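If you prefer to start the crawl from an IDE or a script instead of the command line, a small entry script is a common pattern. A minimal sketch; the file name main.py and its location next to scrapy.cfg are assumptions, not part of the original project:

# main.py -- assumed to live in the project root, next to scrapy.cfg
import os
import sys

from scrapy.cmdline import execute

# make sure the project package is importable when run from an IDE
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# equivalent to running `scrapy crawl bj163news` in a terminal
execute(['scrapy', 'crawl', 'bj163news'])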