Scrapy 练习:
spiders:
from copy import deepcopy

import scrapy

from yg.items import YgItem


class YangguangSpider(scrapy.Spider):
    """Crawl the listing pages of wz.sun0769.com and every linked detail page.

    ``parse`` walks the listing table, builds a partial ``YgItem`` per row and
    schedules the detail page; ``parse_details`` completes the item with the
    post body and image URLs and yields it.
    """

    name = 'yangguang'
    allowed_domains = ['sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/huiyin']

    def parse(self, response):
        """Extract one item per listing row and follow pagination.

        Args:
            response: The listing page response.

        Yields:
            A ``scrapy.Request`` for each row's detail page (carrying the
            partial item in ``meta["item"]``), then a request for the next
            listing page when a ">" link is present.
        """
        # Group the page into rows: one <tr> per post.
        tr_list = response.xpath("//div[@class='newsHead clearfix']/table[2]/tr")
        for tr in tr_list:
            item = YgItem()
            item["title"] = tr.xpath("./td[3]/a/text()").extract_first()
            item["href"] = tr.xpath("./td[3]/a/@href").extract_first()
            item["publish_date"] = tr.xpath("./td[6]/text()").extract_first()

            if item["href"] is None:
                # A row without a link cannot be followed; building a Request
                # from None would raise and abort the whole callback.
                continue

            print(item)
            yield scrapy.Request(
                # urljoin leaves absolute URLs untouched and resolves
                # relative ones against the current page.
                response.urljoin(item["href"]),
                callback=self.parse_details,
                # Copy so the detail callback owns its item independently.
                meta={"item": deepcopy(item)},
            )

        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            # No meta here: parse() never reads it, and referencing the loop
            # variable would fail on a page that has no rows.
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse,
            )

    def parse_details(self, response):
        """Complete the item passed in ``meta`` with the detail-page data.

        Args:
            response: The detail page response; ``response.meta["item"]``
                holds the partial item built by ``parse``.

        Yields:
            The item with ``content`` (list of text nodes) and ``img`` (list
            of absolute image URLs) filled in.
        """
        item = response.meta["item"]
        item["content"] = response.xpath("//td[@class='txt16_3']//text()").extract()
        img_srcs = response.xpath("//td[@class='txt16_3']//img/@src").extract()
        # Resolve against the page URL so already-absolute src values are not
        # corrupted by blindly prefixing the host.
        item["img"] = [response.urljoin(src) for src in img_srcs]
        print(item)
        # Hand the item to the engine so pipelines / feed exports receive it.
        yield item
items.py:
import scrapy


class YgItem(scrapy.Item):
    """Fields collected for a single post."""

    # Post title text.
    title = scrapy.Field()
    # Link to the post's detail page.
    href = scrapy.Field()
    # Publication date text.
    publish_date = scrapy.Field()
    # Image URLs.
    img = scrapy.Field()
    # Body text.
    content = scrapy.Field()
settings.py:
# Scrapy project settings for the "yg" project.

BOT_NAME = "yg"

SPIDER_MODULES = ["yg.spiders"]
NEWSPIDER_MODULE = "yg.spiders"

# Only log messages at WARNING level or above.
LOG_LEVEL = "WARNING"

# User-Agent string used for requests (a desktop Chrome identifier).
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/72.0.3626.121 Safari/537.36"
)

# robots.txt rules are NOT obeyed by this project.
ROBOTSTXT_OBEY = False