2017-03-28
入职接到的第一个小任务,scrapy多级页面爬虫,从来没写过爬虫,也没学过scrapy,甚至连xpath都没用过,最后用了将近一周才搞定。肯定有很多low爆的地方,希望大家可以给我一些建议。
spider文件:
# -*- coding: utf-8 -*-
"""Three-level spider for the 2017 national S&T award project list.

Level 1 (parse)      : index page  -> one request per project group.
Level 2 (parse_url)  : group page  -> one request per project.
Level 3 (parse_item) : detail page -> one populated NostaItem.
"""
import scrapy
from nosta.items import NostaItem
import time
import hashlib

# Site root used to absolutize the relative media links ("../xxx") found
# on the detail pages.
MEDIA_BASE = 'http://www.nosta.gov.cn/upload/2017slgb'


def _media_path(url, ext):
    """Return the local path a media pipeline will store *url* under.

    Scrapy's FilesPipeline/ImagesPipeline name files sha1(url) + ext, so a
    FRESH sha1 object is created per URL.  The original code reused one
    digest object across the loop (`sha_1.update(i)` repeatedly), so every
    path after the first was the hash of the concatenation of all previous
    URLs and never matched the files actually written to disk.
    """
    return hashlib.sha1(url.encode('utf-8')).hexdigest() + ext


class NostaSpider(scrapy.Spider):
    name = "nosta"
    allowed_domains = ["nosta.gov.cn"]
    start_urls = [
        "http://www.nosta.gov.cn/upload/2017slgb/showProject.html",
    ]

    def parse(self, response):
        """Level 1: follow every group link on the index page."""
        for link in response.xpath('//a[@href]'):
            # Query relative to the current <a> node instead of rebuilding
            # an interpolated //a[@href="..."] xpath for every link: the
            # original re-scanned the whole page per link (O(n^2)) and
            # broke on hrefs containing a quote character.
            href = link.xpath('@href').extract_first()
            group_name = link.xpath('text()').extract_first()
            # Order number sits in the sibling cell before the link's cell.
            group_number = link.xpath(
                '../preceding-sibling::*/text()').extract_first()
            # Directory name sits before the table four levels up.
            directory_name = link.xpath(
                '../../../../preceding-sibling::*/text()').extract_first()
            group_url = response.urljoin(href)
            yield scrapy.Request(
                url=group_url,
                meta={
                    "group_name": group_name,
                    "group_number": group_number,
                    "directory_name": directory_name,
                    "group_url": group_url,
                },
                callback=self.parse_url,
                dont_filter=True,
            )

    def parse_url(self, response):
        """Level 2: follow every project link on a group page."""
        inherited = {key: response.meta[key] for key in
                     ("group_name", "group_number",
                      "directory_name", "group_url")}
        for link in response.xpath('//a[@href]'):
            href = link.xpath('@href').extract_first()
            project_url = response.urljoin(href)
            meta = dict(inherited)
            meta.update({
                "project_number": link.xpath(
                    '../preceding-sibling::*/text()').extract_first(),
                "project_url": project_url,
                "project_name": link.xpath('text()').extract_first(),
            })
            yield scrapy.Request(
                url=project_url,
                meta=meta,
                callback=self.parse_item,
                dont_filter=True,
            )

    def parse_item(self, response):
        """Level 3: build the NostaItem from a project detail page."""
        item = NostaItem()
        item["time"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        item["year"] = ["2017"]
        for key in ("group_name", "group_number", "directory_name",
                    "group_url", "project_number", "project_url",
                    "project_name"):
            item[key] = response.meta[key]
        # Raw HTML of the detail page, kept for later re-parsing.
        item["project_html"] = response.body

        # PDF links for the co-author relationship statement.  The label
        # string must match the page text byte-for-byte (full-width colon).
        label = u'完成人合作关系说明:'
        item["file_urls"] = [
            MEDIA_BASE + rel.replace('..', '')
            for rel in response.xpath(
                "//td[text() = '%s']/following-sibling::*/a/@href" % (label,)
            ).extract()
        ]
        item["files"] = [{"url": u, "path": _media_path(u, ".pdf")}
                         for u in item["file_urls"]]

        # Every 840px-wide image on the page.
        item["image_urls"] = [
            MEDIA_BASE + rel.replace('..', '')
            for rel in response.xpath('//img[@width="840px"]/@src').extract()
        ]
        item["images"] = [{"url": u, "path": _media_path(u, ".jpg")}
                          for u in item["image_urls"]]

        # Label -> value for every row of the detail table; when a cell has
        # no text content (images only), fall back to its image links.
        content = {}
        for row_label in response.xpath('//td[@class="label"]/text()').extract():
            cell = response.xpath(
                "//td[text() = '%s']/following-sibling::*" % (row_label,))
            value = cell.xpath('string(.)').extract_first()
            if not value:
                value = [MEDIA_BASE + rel.replace('..', '')
                         for rel in cell.xpath('img/@src').extract()]
            content[row_label] = value
        item["project_content"] = content
        yield item
items文件:
import scrapy


class NostaItem(scrapy.Item):
    """One award project plus the bookkeeping the pipelines need.

    The level numbers below say where a value is first obtained:
    (1) index page, (2) group page, (3) project detail page.
    """

    # Provenance / timing.
    time = scrapy.Field()             # crawl timestamp string
    crawl_date = scrapy.Field()       # crawl date
    year = scrapy.Field()             # award year, e.g. ["2017"]

    # Level 1: index page.
    directory_name = scrapy.Field()   # directory the group belongs to
    group_number = scrapy.Field()     # order of the group on the index page
    group_name = scrapy.Field()       # group name (carried through 1-3)
    group_url = scrapy.Field()        # group index page link (1-2)

    # Level 2: group page.
    project_number = scrapy.Field()   # order of the project in its group
    project_name = scrapy.Field()     # project name (2-3)
    project_url = scrapy.Field()      # project detail page link (2-3)

    # Level 3: detail page.
    project_content = scrapy.Field()  # dict: row label -> cell content
    project_html = scrapy.Field()     # raw HTML of the detail page
    file_urls = scrapy.Field()        # co-author relationship PDF links
    files = scrapy.Field()            # [{url, path}] downloaded PDFs
    image_urls = scrapy.Field()       # image links
    images = scrapy.Field()           # [{url, path}] downloaded images

    # Progress counter consumed by the crawl_info collection.
    current_count = scrapy.Field()
pipelines文件
from pymongo import MongoClient
from nosta.items import NostaItem


class NostaPipeline(object):
    """Persist NostaItems to MongoDB and record crawl progress.

    Writes each item to nosta.nosta_2017 and, when the item carries a
    `current_count`, updates the shared platform_info.crawl_info record
    for this job.
    """

    # Item fields copied verbatim into the stored document.
    FIELDS = (
        "time", "files", "project_name", "group_url", "project_number",
        "project_content", "group_number", "project_url", "group_name",
        "image_urls", "file_urls", "year", "images", "directory_name",
    )

    def __init__(self):
        # NOTE(review): host and credentials are placeholders — they should
        # come from settings, not be hard-coded here.
        self.client = MongoClient('IP', 27017)
        # Authenticate once at start-up; the original re-authenticated both
        # databases on every single item.
        self.db_nosta = self.client.nosta
        self.db_nosta.authenticate('', '')
        self.db_platform = self.client.platform_info
        self.db_platform.authenticate('', '')

    def process_item(self, item, spider):
        """Store the item; return it unchanged for downstream pipelines."""
        if isinstance(item, NostaItem):
            document = {field: item[field] for field in self.FIELDS}
            self.db_nosta.nosta_2017.insert(document)

            # `current_count` is optional: the spider's parse_item never
            # sets it, so the original item["current_count"] lookup raised
            # KeyError on every item.  Only update progress when present.
            current_count = item.get("current_count")
            if current_count is not None:
                progress = {"current_count": current_count}
                if current_count == 1:
                    progress["start_time"] = item["time"]
                self.db_platform.crawl_info.update(
                    {'job': '2017年国家科技奖励'},
                    {'$set': progress})
        return item
settings文件(部分修改)
# Run Scrapy's built-in media pipelines first (priority 1) so files and
# images are downloaded before the item reaches the MongoDB pipeline (300).
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
    'scrapy.pipelines.files.FilesPipeline': 1,
    'nosta.pipelines.NostaPipeline': 300,
}

# Both media pipelines store under the current working directory.
IMAGES_STORE = '.'
FILES_STORE = '.'