zoukankan      html  css  js  c++  java
  • scrapy项目3

    # -*- coding: utf-8 -*-
    import scrapy
    
    #导入items
    from tencent.items import TencentItem
    
    class HrSpider(scrapy.Spider):
        """Crawl Tencent HR job listings and follow pagination."""
        name = 'hr'
        allowed_domains = ['tencent.com']
        start_urls = ['https://hr.tencent.com/position.php']

        def parse(self, response):
            """Yield one TencentItem per job row, then queue the next page.

            :param response: a listing page; job rows live in table.tablelist.
            """
            # Slice off the first row (table header) and the last row (pagination).
            tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]

            for tr in tr_list:
                # Field names must match those declared on TencentItem.
                item = TencentItem()
                item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
                item["position"] = tr.xpath("./td[4]/text()").extract_first()
                item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
                yield item

            # On the last page the link is <a href="javascript:;" class="noactive" id="next">.
            # BUG FIX: the original compared against "javascript", but the actual href
            # is "javascript:;", so the comparison never matched and the spider would
            # request a bogus URL after the final page. Also guard against the link
            # being absent (extract_first() returning None).
            next_url = response.xpath("//a[@id='next']/@href").extract_first()
            if next_url and not next_url.startswith("javascript"):
                yield scrapy.Request(
                    # urljoin resolves the relative href against the current page URL,
                    # which is safer than hard-coded string concatenation.
                    response.urljoin(next_url),
                    callback=self.parse
                )

    pipelines.py

    # MongoDB database (the pymongo class is MongoClient, not "MogoClient")
    # from pymongo import MongoClient
    # client = MongoClient()
    # collection = client["tencent"]["hr"]
    #导入items
    from tencent.items import TencentItem
    class TencentPipeline(object):
        """Item pipeline for the Tencent project.

        Dispatches on the item's concrete type so several spiders can share
        one pipeline; currently it only echoes TencentItem instances.
        """

        def process_item(self, item, spider):
            # print(spider.name)
            # Items of other types pass through untouched.
            is_tencent_item = isinstance(item, TencentItem)
            if is_tencent_item:
                print(item)
            # collection.insert(dict(item))
            return item

    items.py

    import scrapy
    
    # Multiple Item classes can be defined, one per crawl target (e.g. JD,
    # Chouti, Autohome); the pipeline then dispatches on the item's type.
    class TencentItem(scrapy.Item): # scrapy.Item behaves like a dict
        # define the fields for your item like:
        # name = scrapy.Field()
        num = scrapy.Field()
        title = scrapy.Field()   # scrapy.Field() is a plain dict subclass
        position = scrapy.Field()
        publish_date = scrapy.Field()
    
    
    class ChoutiItem(scrapy.Item):
        # Item schema for a Chouti spider; fields mirror TencentItem so a
        # shared pipeline can process either type.
        title = scrapy.Field()
        position = scrapy.Field()
        publish_date = scrapy.Field()
    
    
    class JdItem(scrapy.Item):
        # Item schema for a JD (jd.com) spider; fields mirror TencentItem so a
        # shared pipeline can process either type.
        title = scrapy.Field()
        position = scrapy.Field()
        publish_date = scrapy.Field()

    settings.py

    # Suppress Scrapy's default INFO logging so only warnings and errors show.
    LOG_LEVEL = "WARNING"
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'

     项目地址:https://github.com/CH-chen/tencent

  • 相关阅读:
    kubernetes集群系列资料20-metric介绍
    kubernetes集群系列资料19-dashboard介绍
    kubernetes集群系列资料18--K8S证书
    kubernetes集群系列资料16--helm介绍
    云安全产品使用---文件存储
    kubernetes集群系列资料15--安全机制介绍
    kubernetes集群系列资料14--scheduler介绍
    kubernetes集群系列资料17--prometheus介绍
    云安全产品使用---云安全中心
    kubernetes集群系列资料13--存储机制介绍
  • 原文地址:https://www.cnblogs.com/chvv/p/10332460.html
Copyright © 2011-2022 走看看