1. Spider file
In the spider's callback, build an HtmlXPathSelector from the response, extract every image URL, and store the absolute URLs on the item:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

# inside the spider class:
def parse_start_url(self, response):
    hxs = HtmlXPathSelector(response)
    item = DomzItem()
    # the src attributes are protocol-relative ("//..."), so prefix "http:"
    image_urls = hxs.select('//img/@src').extract()
    item['image_urls'] = ["http:" + x for x in image_urls]
    return item
class MySpider(CrawlSpider):
    name = 'myspider'
    # throttle the crawl: wait 2 seconds between requests
    download_delay = 2
$ scrapy crawl somespider -s JOBDIR=crawls/somespider-1
# After starting the crawl this way you can stop it with Ctrl + C; to resume, run the same command again:
$ scrapy crawl somespider -s JOBDIR=crawls/somespider-1
name = "wikipedia" allowed_domains = ["wikipedia.org"] start_urls = [ "http://en.wikipedia.org/wiki/Pune" ]
2. Settings file
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
IMAGES_STORE = '...'
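A fuller settings.py sketch; the storage path is hypothetical, and IMAGES_EXPIRES / IMAGES_THUMBS are optional settings of the images pipeline:

# settings.py sketch; '/data/images' is a hypothetical path
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
IMAGES_STORE = '/data/images'   # directory where downloaded images are written
IMAGES_EXPIRES = 90             # don't re-download images fetched within 90 days
IMAGES_THUMBS = {               # optionally generate thumbnails next to the originals
    'small': (50, 50),
    'big': (270, 270),
}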
3. Item file
from scrapy.item import Item, Field

class DomzItem(Item):
    image_urls = Field()  # filled by the spider with the URLs to fetch
    images = Field()      # filled by ImagesPipeline with the download results
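Once the pipeline has run, each entry of item['images'] is a dict describing one stored file; a small sketch of reading the results:

# each result dict holds the pipeline's standard keys:
for image in item['images']:
    print(image['url'])       # URL the image was downloaded from
    print(image['path'])      # file path relative to IMAGES_STORE
    print(image['checksum'])  # MD5 checksum of the image content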