  • Python code for personal use (a Scrapy multi-level crawler, three page levels deep)

    2017-03-28

    This was the first small task I was handed after starting my job: a Scrapy crawler for multi-level pages. I had never written a crawler, had never studied Scrapy, and had not even used XPath before, so it took me nearly a week to get it working. There are surely plenty of clumsy spots, and I would welcome any suggestions.

    The spider file:

    # -*- coding: utf-8 -*-
    import scrapy
    from nosta.items import NostaItem
    import time
    import hashlib
    
    class NostaSpider(scrapy.Spider):
        name = "nosta"
        allowed_domains = ["nosta.gov.cn"]
        start_urls = [
            "http://www.nosta.gov.cn/upload/2017slgb/showProject.html",
        ]
    
        def parse(self, response):  
            for sel1 in response.xpath('//a/@href').extract():
                # the group name taken from the link's own text (group_name)
                group_name = response.xpath('//a[@href="%s"]/text()'%(sel1)).extract()[0]
                # the sequence number shown in front of the link (group_number)
                group_number = response.xpath('//a[@href="%s"]/parent::*/preceding-sibling::*/text()'%(sel1)).extract()[0]
                # the directory name this group belongs to (directory_name)
                directory_name = response.xpath('//a[@href="%s"]/parent::*/parent::*/parent::*/parent::*/preceding-sibling::*/text()'%(sel1)).extract()[0]
                # the absolute URL of the group page itself (group_url)
                group_url = response.urljoin(sel1)
                yield scrapy.Request(url=group_url, meta={"group_name": group_name, "group_number": group_number, "directory_name": directory_name, "group_url": group_url}, callback=self.parse_url, dont_filter=True)
    
        def parse_url(self, response): 
            group_name = response.meta["group_name"]
            group_number = response.meta["group_number"]
            directory_name = response.meta["directory_name"]
            group_url = response.meta["group_url"]
            for sel2 in response.xpath('//a/@href').extract():
                # the sequence number shown in front of the link (project_number)
                project_number = response.xpath('//a[@href="%s"]/parent::*/preceding-sibling::*/text()'%(sel2)).extract()[0]
                # the absolute URL of the project detail page (project_url)
                project_url = response.urljoin(sel2)
                # the project name taken from the link's own text (project_name)
                project_name = response.xpath('//a[@href="%s"]/text()'%(sel2)).extract()[0]
                yield scrapy.Request(url=project_url, meta={"group_name": group_name, "group_number": group_number, "directory_name": directory_name, "group_url": group_url, "project_number": project_number, "project_url": project_url, "project_name": project_name}, callback=self.parse_item, dont_filter=True)
    
        def parse_item(self, response):
            item = NostaItem()
            item["time"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            item["year"] = ["2017"]      
            item["group_name"] = response.meta["group_name"]
            item["group_number"] = response.meta["group_number"]
            item["directory_name"] = response.meta["directory_name"]
            item["group_url"] = response.meta["group_url"]
            item["project_number"] = response.meta["project_number"]
            item["project_url"] = response.meta["project_url"]
            item["project_name"] = response.meta["project_name"]
            # raw HTML source of the detail page (project_html)
            item["project_html"] = response.body
            # links to the "completers' cooperation relationship" PDFs (file_urls)
            s1 = u'完成人合作关系说明:'
            item["file_urls"] = ['http://www.nosta.gov.cn/upload/2017slgb'+i.replace('..', '') for i in response.xpath("//td[text() = '%s']/following-sibling::*/a/@href"%(s1)).extract()]
            # remote URL and local path for each file (files, a list of dicts);
            # use a fresh sha1 per URL so every file gets its own digest
            item["files"] = []
            for i in item["file_urls"]:
                dict1 = {}
                dict1["url"] = i
                dict1["path"] = hashlib.sha1(i).hexdigest() + ".pdf"
                item["files"].append(dict1)
            # all image links on the detail page (image_urls)
            item["image_urls"] = ['http://www.nosta.gov.cn/upload/2017slgb'+i.replace('..', '') for i in response.xpath('//img[@width="840px"]/@src').extract()]
            # remote URL and local path for each image (images, a list of dicts)
            item["images"] = []
            for i in item["image_urls"]:
                dict2 = {}
                dict2["url"] = i
                dict2["path"] = hashlib.sha1(i).hexdigest() + ".jpg"
                item["images"].append(dict2)
            # the structured content of the detail page (project_content): label -> text, falling back to image links when the cell has no text
            dict3 = {}
            project_detail = response.xpath('//td[@class="label"]/text()').extract()
            for j in project_detail:
                dict3[j] = response.xpath("//td[text() = '%s']/following-sibling::*"%(j)).xpath('string(.)').extract()[0]
                if not dict3[j]:
                    dict3[j] = ['http://www.nosta.gov.cn/upload/2017slgb'+i.replace('..', '') for i in response.xpath("//td[text() = '%s']/following-sibling::*/img/@src"%(j)).extract()]
            item["project_content"] = dict3
            yield item
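
    A side note on those path values: as far as I know, Scrapy's built-in FilesPipeline and ImagesPipeline name each downloaded file after the SHA-1 of its URL and save it under a full/ subdirectory of the store path, which is what the per-URL hashes above are meant to line up with. A minimal sketch of that naming scheme (my understanding, not copied from Scrapy's source):

    # Sketch: what the stored path for a downloaded URL should look like.
    # guess_stored_path is a made-up helper name, for illustration only.
    import hashlib

    def guess_stored_path(url, ext):
        # e.g. guess_stored_path('http://www.nosta.gov.cn/upload/2017slgb/xxx.pdf', '.pdf')
        # should correspond to 'full/<sha1 of url>.pdf' under FILES_STORE / IMAGES_STORE
        return 'full/' + hashlib.sha1(url).hexdigest() + ext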

    The items file:

    import scrapy
    
    
    class NostaItem(scrapy.Item):
        time = scrapy.Field()
        files = scrapy.Field()           # cooperation-relationship files: list of dicts, url = remote link, path = local path (level 3)
        crawl_date = scrapy.Field()      # crawl date
        project_name = scrapy.Field()    # project name (levels 2 and 3)
        group_url = scrapy.Field()       # URL of the group index page (levels 1 and 2)
        project_number = scrapy.Field()  # order within the group (level 2)
        project_content = scrapy.Field() # structured content of the project detail page (level 3)
        group_number = scrapy.Field()    # order of the group on the top-level page (level 1)
        project_url = scrapy.Field()     # project detail page URL (levels 2 and 3)
        group_name = scrapy.Field()      # group name (levels 1, 2 and 3)
        image_urls = scrapy.Field()      # list of image links (level 3)
        file_urls = scrapy.Field()       # list of cooperation-relationship file links (level 3)
        year = scrapy.Field()            # which year (2017)
        images = scrapy.Field()          # list of dicts, url = remote link, path = local path (level 3)
        directory_name = scrapy.Field()  # which directory the group belongs to (level 1)
        project_html = scrapy.Field()    # raw HTML of the project detail page (level 3)
        current_count = scrapy.Field()
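
    One thing worth noting: file_urls/files and image_urls/images are, as far as I know, exactly the field names Scrapy's FilesPipeline and ImagesPipeline look for by default, so nothing extra has to be configured for them here. If you ever rename the fields, the settings below (shown with what I believe are their default values) tell the pipelines where to look:

    # Assumed defaults, listed only for completeness; not needed in this project.
    FILES_URLS_FIELD = 'file_urls'
    FILES_RESULT_FIELD = 'files'
    IMAGES_URLS_FIELD = 'image_urls'
    IMAGES_RESULT_FIELD = 'images'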

    The pipelines file:

    from pymongo import MongoClient
    from nosta.items import NostaItem
    
    class NostaPipeline(object):
        def __init__(self):
            self.client = MongoClient('IP', 27017)
            
        def process_item(self, item, spider):
            if isinstance(item, NostaItem):
                dict1 = {}
                dict1["time"] = item["time"]
                dict1["files"] = item["files"]
                dict1["project_name"] = item["project_name"]
                dict1["group_url"] = item["group_url"]
                dict1["project_number"] = item["project_number"]
                dict1["project_content"] = item["project_content"]
                dict1["group_number"] = item["group_number"]
                dict1["project_url"] = item["project_url"]
                dict1["group_name"] = item["group_name"]
                dict1["image_urls"] = item["image_urls"]
                dict1["file_urls"] = item["file_urls"]
                dict1["year"] = item["year"]
                dict1["images"] = item["images"]
                dict1["directory_name"] = item["directory_name"]
                
                # write the full record into the nosta_2017 collection
                self.db = self.client.nosta
                self.db.authenticate('', '')   # credentials left blank in this post
                collection = self.db.nosta_2017
                collection.insert(dict1)

                # update the crawl-progress record for this job
                self.db = self.client.platform_info
                self.db.authenticate('', '')
                collection = self.db.crawl_info
                dict2 = {}
                # current_count is not set by the spider shown above, so it presumably gets filled in elsewhere
                dict2["current_count"] = item["current_count"]
                if dict2["current_count"] == 1:
                    dict2["start_time"] = item["time"]
                collection.update({'job': '2017年国家科技奖励'}, {'$set': dict2})
    
            return item
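
    The MongoDB host and credentials are hard-coded (and blanked out) above. A cleaner variant would read them from settings.py and close the client when the crawl ends; below is an untested sketch of that idea, where MONGO_HOST, MONGO_USER and MONGO_PASS are made-up setting names, not something the project above defines:

    # Sketch only: pull connection details from settings instead of hard-coding them.
    from pymongo import MongoClient

    class NostaPipeline(object):
        def __init__(self, host, user, password):
            self.host = host
            self.user = user
            self.password = password

        @classmethod
        def from_crawler(cls, crawler):
            # Scrapy calls this with the running crawler, giving access to settings
            s = crawler.settings
            return cls(s.get('MONGO_HOST'), s.get('MONGO_USER'), s.get('MONGO_PASS'))

        def open_spider(self, spider):
            self.client = MongoClient(self.host, 27017)

        def close_spider(self, spider):
            self.client.close()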

    The settings file (partially modified):

    ITEM_PIPELINES = {
       'nosta.pipelines.NostaPipeline': 300,
       'scrapy.pipelines.images.ImagesPipeline': 1,
       'scrapy.pipelines.files.FilesPipeline': 1
    }
    
    IMAGES_STORE = r'.'
    FILES_STORE = r'.'
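
    With both store paths set to the current directory, the downloaded images and PDFs should end up under a full/ folder in whatever directory the crawl is started from (normally the project root), named by the SHA-1 of their URLs as sketched earlier. The whole thing is then run with scrapy crawl nosta.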
  • Original post: https://www.cnblogs.com/zhangtianyuan/p/6829781.html