2017-03-28
入职接到的第一个小任务,scrapy多级页面爬虫,从来没写过爬虫,也没学过scrapy,甚至连xpath都没用过,最后用了将近一周才搞定。肯定有很多low爆的地方,希望大家可以给我一些建议。
spider文件:
# -*- coding: utf-8 -*-
"""Three-level spider for the 2017 national S&T award project list.

Level 1 (parse)      : index page  -> one request per project group.
Level 2 (parse_url)  : group page  -> one request per project.
Level 3 (parse_item) : detail page -> one populated NostaItem.
"""
import scrapy
from nosta.items import NostaItem
import time
import hashlib

# Site root used to absolutize the relative media links ("../xxx") found
# on the detail pages.
MEDIA_BASE = 'http://www.nosta.gov.cn/upload/2017slgb'


def _media_path(url, ext):
    """Return the local path a media pipeline will store *url* under.

    Scrapy's FilesPipeline/ImagesPipeline name files sha1(url) + ext, so a
    FRESH sha1 object is created per URL.  The original code reused one
    digest object across the loop (`sha_1.update(i)` repeatedly), so every
    path after the first was the hash of the concatenation of all previous
    URLs and never matched the files actually written to disk.
    """
    return hashlib.sha1(url.encode('utf-8')).hexdigest() + ext


class NostaSpider(scrapy.Spider):
    name = "nosta"
    allowed_domains = ["nosta.gov.cn"]
    start_urls = [
        "http://www.nosta.gov.cn/upload/2017slgb/showProject.html",
    ]

    def parse(self, response):
        """Level 1: follow every group link on the index page."""
        for link in response.xpath('//a[@href]'):
            # Query relative to the current <a> node instead of rebuilding
            # an interpolated //a[@href="..."] xpath for every link: the
            # original re-scanned the whole page per link (O(n^2)) and
            # broke on hrefs containing a quote character.
            href = link.xpath('@href').extract_first()
            group_name = link.xpath('text()').extract_first()
            # Order number sits in the sibling cell before the link's cell.
            group_number = link.xpath(
                '../preceding-sibling::*/text()').extract_first()
            # Directory name sits before the table four levels up.
            directory_name = link.xpath(
                '../../../../preceding-sibling::*/text()').extract_first()
            group_url = response.urljoin(href)
            yield scrapy.Request(
                url=group_url,
                meta={
                    "group_name": group_name,
                    "group_number": group_number,
                    "directory_name": directory_name,
                    "group_url": group_url,
                },
                callback=self.parse_url,
                dont_filter=True,
            )

    def parse_url(self, response):
        """Level 2: follow every project link on a group page."""
        inherited = {key: response.meta[key] for key in
                     ("group_name", "group_number",
                      "directory_name", "group_url")}
        for link in response.xpath('//a[@href]'):
            href = link.xpath('@href').extract_first()
            project_url = response.urljoin(href)
            meta = dict(inherited)
            meta.update({
                "project_number": link.xpath(
                    '../preceding-sibling::*/text()').extract_first(),
                "project_url": project_url,
                "project_name": link.xpath('text()').extract_first(),
            })
            yield scrapy.Request(
                url=project_url,
                meta=meta,
                callback=self.parse_item,
                dont_filter=True,
            )

    def parse_item(self, response):
        """Level 3: build the NostaItem from a project detail page."""
        item = NostaItem()
        item["time"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        item["year"] = ["2017"]
        for key in ("group_name", "group_number", "directory_name",
                    "group_url", "project_number", "project_url",
                    "project_name"):
            item[key] = response.meta[key]
        # Raw HTML of the detail page, kept for later re-parsing.
        item["project_html"] = response.body

        # PDF links for the co-author relationship statement.  The label
        # string must match the page text byte-for-byte (full-width colon).
        label = u'完成人合作关系说明:'
        item["file_urls"] = [
            MEDIA_BASE + rel.replace('..', '')
            for rel in response.xpath(
                "//td[text() = '%s']/following-sibling::*/a/@href" % (label,)
            ).extract()
        ]
        item["files"] = [{"url": u, "path": _media_path(u, ".pdf")}
                         for u in item["file_urls"]]

        # Every 840px-wide image on the page.
        item["image_urls"] = [
            MEDIA_BASE + rel.replace('..', '')
            for rel in response.xpath('//img[@width="840px"]/@src').extract()
        ]
        item["images"] = [{"url": u, "path": _media_path(u, ".jpg")}
                          for u in item["image_urls"]]

        # Label -> value for every row of the detail table; when a cell has
        # no text content (images only), fall back to its image links.
        content = {}
        for row_label in response.xpath('//td[@class="label"]/text()').extract():
            cell = response.xpath(
                "//td[text() = '%s']/following-sibling::*" % (row_label,))
            value = cell.xpath('string(.)').extract_first()
            if not value:
                value = [MEDIA_BASE + rel.replace('..', '')
                         for rel in cell.xpath('img/@src').extract()]
            content[row_label] = value
        item["project_content"] = content
        yield item
items文件:
import scrapy


class NostaItem(scrapy.Item):
    """One award project plus the bookkeeping the pipelines need.

    The level numbers below say where a value is first obtained:
    (1) index page, (2) group page, (3) project detail page.
    """

    # Provenance / timing.
    time = scrapy.Field()             # crawl timestamp string
    crawl_date = scrapy.Field()       # crawl date
    year = scrapy.Field()             # award year, e.g. ["2017"]

    # Level 1: index page.
    directory_name = scrapy.Field()   # directory the group belongs to
    group_number = scrapy.Field()     # order of the group on the index page
    group_name = scrapy.Field()       # group name (carried through 1-3)
    group_url = scrapy.Field()        # group index page link (1-2)

    # Level 2: group page.
    project_number = scrapy.Field()   # order of the project in its group
    project_name = scrapy.Field()     # project name (2-3)
    project_url = scrapy.Field()      # project detail page link (2-3)

    # Level 3: detail page.
    project_content = scrapy.Field()  # dict: row label -> cell content
    project_html = scrapy.Field()     # raw HTML of the detail page
    file_urls = scrapy.Field()        # co-author relationship PDF links
    files = scrapy.Field()            # [{url, path}] downloaded PDFs
    image_urls = scrapy.Field()       # image links
    images = scrapy.Field()           # [{url, path}] downloaded images

    # Progress counter consumed by the crawl_info collection.
    current_count = scrapy.Field()
pipelines文件
from pymongo import MongoClient
from nosta.items import NostaItem


class NostaPipeline(object):
    """Persist NostaItems to MongoDB and record crawl progress.

    Writes each item to nosta.nosta_2017 and, when the item carries a
    `current_count`, updates the shared platform_info.crawl_info record
    for this job.
    """

    # Item fields copied verbatim into the stored document.
    FIELDS = (
        "time", "files", "project_name", "group_url", "project_number",
        "project_content", "group_number", "project_url", "group_name",
        "image_urls", "file_urls", "year", "images", "directory_name",
    )

    def __init__(self):
        # NOTE(review): host and credentials are placeholders — they should
        # come from settings, not be hard-coded here.
        self.client = MongoClient('IP', 27017)
        # Authenticate once at start-up; the original re-authenticated both
        # databases on every single item.
        self.db_nosta = self.client.nosta
        self.db_nosta.authenticate('', '')
        self.db_platform = self.client.platform_info
        self.db_platform.authenticate('', '')

    def process_item(self, item, spider):
        """Store the item; return it unchanged for downstream pipelines."""
        if isinstance(item, NostaItem):
            document = {field: item[field] for field in self.FIELDS}
            self.db_nosta.nosta_2017.insert(document)

            # `current_count` is optional: the spider's parse_item never
            # sets it, so the original item["current_count"] lookup raised
            # KeyError on every item.  Only update progress when present.
            current_count = item.get("current_count")
            if current_count is not None:
                progress = {"current_count": current_count}
                if current_count == 1:
                    progress["start_time"] = item["time"]
                self.db_platform.crawl_info.update(
                    {'job': '2017年国家科技奖励'},
                    {'$set': progress})
        return item
settings文件(部分修改)
# Run Scrapy's built-in media pipelines first (priority 1) so files and
# images are downloaded before the item reaches the MongoDB pipeline (300).
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
    'scrapy.pipelines.files.FilesPipeline': 1,
    'nosta.pipelines.NostaPipeline': 300,
}

# Both media pipelines store under the current working directory.
IMAGES_STORE = '.'
FILES_STORE = '.'