start
from scrapy.cmdline import execute

# Launch the crawl from Python exactly as `scrapy crawl jokespider` would
# from a shell, which makes the project runnable/debuggable from an IDE.
execute(
    [
        'scrapy',
        'crawl',
        'jokespider',
    ]
)
items.py
import scrapy


class JokejiItem(scrapy.Item):
    """Scraped record exposing a ``title`` and a ``url`` field."""

    title = scrapy.Field()
    url = scrapy.Field()


class ListItem(scrapy.Item):
    """Scraped record with the same ``title``/``url`` fields as a separate class.

    It is kept as its own class (rather than reusing ``JokejiItem``) so that
    code receiving items can tell the two kinds apart by their class.
    """

    title = scrapy.Field()
    url = scrapy.Field()
spider.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from jokeji.items import JokejiItem, ListItem


class JokespiderSpider(CrawlSpider):
    """Crawl spider that emits a different item class per page type.

    Links matching the list pattern are handled by ``parse_list`` (yielding
    ``ListItem``); links matching the joke-page pattern are handled by
    ``parse_item`` (yielding ``JokejiItem``). Both rules keep following links.
    """

    name = 'jokespider'
    allowed_domains = ['zizi.cn']
    start_urls = ['http://www.zizi.cn']

    # The regex metacharacters need their backslashes: without them ``w+`` and
    # ``d+`` match literal letters "w"/"d" and the rules never hit real URLs.
    # The dot before ``htm`` is escaped so it only matches a literal ".".
    rules = [
        Rule(
            LinkExtractor(allow=r'/list\w+\.htm'),
            callback='parse_list',
            follow=True,
        ),
        Rule(
            # ``deny`` keeps list pages out of the joke-page rule.
            LinkExtractor(allow=r'/jokehtml/\w+/\d+\.htm', deny=r'/list'),
            callback='parse_item',
            follow=True,
        ),
    ]

    def parse_item(self, response):
        """Build a ``JokejiItem`` for a joke page.

        Args:
            response: The downloaded page (not inspected here).

        Returns:
            A ``JokejiItem`` whose ``title`` is the placeholder
            ``'from content'``.
        """
        item = JokejiItem()
        item['title'] = 'from content'
        return item

    def parse_list(self, response):
        """Build a ``ListItem`` for a list page.

        Args:
            response: The downloaded page; only ``response.url`` is used.

        Returns:
            A ``ListItem`` whose ``url`` is a marker prefix followed by the
            page URL.
        """
        item = ListItem()
        item['url'] = "from list........" + response.url
        return item
pipelines.py
class JokejiPipeline(object):
    """Item pipeline that prints each item together with its class."""

    def process_item(self, item, spider):
        """Print the item, its class and the spider, then pass the item on.

        Args:
            item: The scraped item handed to the pipeline.
            spider: The spider that produced the item.

        Returns:
            The same ``item`` object, unchanged, so that any later pipeline
            stage still receives it.
        """
        # ``item.__class__`` tells which item class this is, which is what a
        # pipeline can branch on to handle each kind of item differently.
        print(item, item.__class__, spider)
        return item
通过判断 item.__class__ 是哪个类来决定如何处理数据
当然 ItemClass() 类里可以加
def __str__(self):
    """Return the fixed label ``'ItemClass'`` as this object's string form."""
    return 'ItemClass'
更直观.