crawlspider提取url
创建一个crawlspider爬虫
scrapy genspider -t crawl baidu baidu.com
# Rules drive a CrawlSpider: links whose URL matches the regex ``Items/`` are
# handed to ``parse_item``; follow=True keeps extracting links from those pages.
rules = (
    Rule(
        LinkExtractor(allow=r'Items/'),
        callback='parse_item',
        follow=True,
    ),
)
allow 正则匹配
restrict_css css匹配, 自动提取url
restrict_xpaths xpath匹配, 自动提取url
# restrict_xpaths=("//div[@class='a']/li") restricts link extraction to the
# regions selected by the XPath: the URLs of all <a> tags under the matched
# <li> nodes are extracted and requested.
# Note the predicate must be closed before the child step: [@class='a']/li.
Rule(LinkExtractor(restrict_xpaths=("//div[@class='a']/li")), callback='parse_item', follow=True),
创建的爬虫
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CfSpider(CrawlSpider):
    """CrawlSpider skeleton targeting circ.gov.cn."""

    name = 'cf'
    allowed_domains = ['circ.gov.cn']
    start_urls = ['http://circ.gov.cn/']

    # Extraction rules. follow=True means link extraction continues on the
    # pages reached through this rule (needed to walk "next page" links).
    rules = (
        Rule(
            LinkExtractor(allow=r'Items/'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Return the item for a page matched by ``rules`` (currently an empty dict).

        Example fields that could be populated from ``response``:
            domain_id:   response.xpath('//input[@id="sid"]/@value').get()
            name:        response.xpath('//div[@id="name"]').get()
            description: response.xpath('//div[@id="description"]').get()
        """
        return {}
爬取腾讯招聘职位
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class HrSpider(CrawlSpider):
    """Crawl the Tencent recruitment listing pages and yield one item per job row."""

    name = 'hr'
    allowed_domains = ['tencent.com']
    start_urls = ['https://hr.tencent.com/index.php']

    rules = (
        # Listing pages (reached e.g. from https://hr.tencent.com/social.php).
        # Regex metacharacters are escaped so '.' only matches a literal dot.
        Rule(
            LinkExtractor(allow=r'https://hr\.tencent\.com/position\.php'),
            callback='parse_item',
            follow=True,
        ),
        # Next page. '?' and '.' are escaped and the page offset is matched
        # with \d+ so the pattern can actually match pagination URLs.
        # NOTE(review): the rule above also matches these URLs, and Scrapy uses
        # the first matching rule, so this rule is likely redundant -- confirm.
        Rule(
            LinkExtractor(
                allow=r'https://hr\.tencent\.com/position\.php\?keywords=&tid=0&start=\d+#a'
            ),
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield a dict with ``title``, ``position`` and ``pub_date`` for each job row.

        Rows are taken from the ``tablelist`` table; the first and last ``tr``
        are skipped by the ``[1:-1]`` slice (presumably header and pager rows --
        confirm against the page markup).
        """
        rows = response.xpath("//table[@class='tablelist']/tr")[1:-1]
        for tr in rows:
            # Build a fresh dict per row: yielding one shared dict would make
            # every yielded item alias the same object, so later rows would
            # overwrite the data of earlier ones.
            yield {
                'title': tr.xpath("./td[1]/a/text()").extract_first(),
                'position': tr.xpath("./td[2]/text()").extract_first(),
                'pub_date': tr.xpath("./td[5]/text()").extract_first(),
            }