import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from wxapp.items import WxappItem


class WxSpider(CrawlSpider):
    """Crawl wxapp-union.com list pages and scrape each article's detail page."""

    name = 'wx'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        # BUG FIX: the original pattern was r'.*mod=list&catid=2&page=d+',
        # which matches a literal 'd' (not digits), so pagination links were
        # never followed. '\d+' matches the page number.
        Rule(LinkExtractor(allow=r'.*mod=list&catid=2&page=\d+'), follow=True),
        # BUG FIX: '\.' escapes the dot so '.html' is matched literally
        # instead of "any character followed by 'html'".
        Rule(LinkExtractor(allow=r'.*article-.+\.html'),
             callback='parse_detail', follow=False),
    )

    def parse_detail(self, response):
        """Extract one article's fields from a detail page and yield a WxappItem.

        :param response: scrapy Response for an article-*.html page
        :yields: WxappItem with title, content, detail_href, pub_time, author
        """
        detail_href = response.request.url
        title = response.xpath('//h1[@class="ph"]/text()').get()
        # The article body is spread across many text nodes; strip each
        # fragment, then concatenate into a single string.
        fragments = response.xpath('//td[@id="article_content"]//text()').getall()
        content = ''.join(c.strip() for c in fragments).strip()
        pub_time = response.xpath('//p[@class="authors"]/span/text()').get()
        author = response.xpath('//p[@class="authors"]/a/text()').get()
        yield WxappItem(
            title=title,
            content=content,
            detail_href=detail_href,
            pub_time=pub_time,
            author=author,
        )
from scrapy.exporters import JsonLinesItemExporter, JsonItemExporter


class WxappPipeline(object):
    """Pipeline that writes all scraped items to data.json as one JSON array."""

    def __init__(self):
        """Open the output file and prepare the exporter (runs at pipeline creation)."""
        # Binary mode: Scrapy exporters write already-encoded bytes.
        self.fp = open("data.json", 'wb')
        self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        # BUG FIX: JsonItemExporter requires start_exporting() /
        # finish_exporting() to emit the enclosing '[' and ']'; without
        # them data.json is not valid JSON.
        self.exporter.start_exporting()

    def open_spider(self, spider):
        """Called when the spider opens; nothing to do — the file is opened in __init__.

        :param spider: the spider instance being opened
        """
        pass

    def process_item(self, item, spider):
        """Serialize one item into the JSON file and pass it downstream.

        :param item: the scraped item
        :param spider: the spider that produced it
        :return: the item unchanged, so later pipelines still receive it
        """
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Called when the spider closes: finish the JSON array, then close the file.

        :param spider: the spider instance being closed
        """
        self.exporter.finish_exporting()
        self.fp.close()
import scrapy


class WxappItem(scrapy.Item):
    """Item holding one scraped article from wxapp-union.com.

    Populated by WxSpider.parse_detail; each attribute below is a
    scrapy.Field container for the corresponding extracted value.
    """

    title = scrapy.Field()        # article headline (h1.ph text)
    content = scrapy.Field()      # concatenated article body text
    pub_time = scrapy.Field()     # publication time string from the authors line
    author = scrapy.Field()       # author name from the authors line
    detail_href = scrapy.Field()  # URL of the article detail page