zoukankan      html  css  js  c++  java
  • scrapy项目3

    # -*- coding: utf-8 -*-
    import scrapy
    
    #导入items
    from tencent.items import TencentItem
    
    class HrSpider(scrapy.Spider):
        """Spider that scrapes the Tencent HR position list, following pagination.

        Starts at ``https://hr.tencent.com/position.php`` and yields one
        ``TencentItem`` per table row, then schedules the next page until the
        "next" link no longer points at a real URL.
        """

        name = 'hr'
        allowed_domains = ['tencent.com']
        start_urls = ['https://hr.tencent.com/position.php']

        def parse(self, response):
            """Extract position items from one listing page and follow the next page.

            Args:
                response: The Scrapy response for a position listing page.

            Yields:
                ``TencentItem`` objects with ``title``, ``position`` and
                ``publish_date`` set (each may be ``None`` when the cell is
                missing), followed by at most one ``scrapy.Request`` for the
                next listing page.
            """
            # Visible marker on stdout for each parsed page.
            print("=========")

            # Drop the first and the last row of the table (presumably the header
            # and the pager row -- confirm against the page markup).
            rows = response.xpath("//table[@class='tablelist']/tr")[1:-1]

            for tr in rows:
                # The keys used here must match the fields declared on TencentItem.
                item = TencentItem()
                item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
                item["position"] = tr.xpath("./td[4]/text()").extract_first()
                item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
                yield item

            # On the last page the link looks like:
            #   <a href="javascript:;" class="noactive" id="next">下一页</a>
            # so the href is "javascript:;" (not the bare word "javascript"), and
            # the anchor may be absent altogether, in which case the result is None.
            next_url = response.xpath("//a[@id='next']/@href").extract_first()
            if next_url and not next_url.startswith("javascript"):
                # urljoin resolves relative hrefs against the current page URL.
                yield scrapy.Request(
                    response.urljoin(next_url),
                    callback=self.parse,
                )

    pipelines.py

    # mongodb数据库
    # from pymongo import MongoClient
    # client = MongoClient()
    # collection = client["tencent"]["hr"]
    #导入items
    from tencent.items import TencentItem
    class TencentPipeline(object):
        """Item pipeline that echoes Tencent items to stdout and passes items on."""

        def process_item(self, item, spider):
            """Print ``item`` if it is a ``TencentItem`` and return it unchanged.

            Args:
                item: The scraped item handed to the pipeline.
                spider: The spider that produced the item (unused here).

            Returns:
                The same ``item`` object, whatever its type.
            """
            # Items of any other type are passed through untouched.
            if not isinstance(item, TencentItem):
                return item

            # Persisting to MongoDB is left disabled, as in the commented-out
            # collection setup at the top of this file.
            print(item)
            return item

    items.py

    import scrapy
    
    # 可以定义多个item对应不同的爬虫项目字段,比如爬京东,抽屉,汽车之家
    # 然后在pipelines中做判断
    class TencentItem(scrapy.Item):  # scrapy.Item offers a dict-like interface
        """Item holding one row scraped from the Tencent HR position list.

        The field names must match the keys assigned in the spider's ``parse``.
        """
        # Not assigned by the HrSpider.parse shown in this file.
        num = scrapy.Field()
        title = scrapy.Field()   # scrapy.Field() is itself a dict
        position = scrapy.Field()
        publish_date = scrapy.Field()
    
    
    class ChoutiItem(scrapy.Item):
        """Item with ``title``, ``position`` and ``publish_date`` fields.

        NOTE(review): presumably meant for a separate Chouti spider so that
        pipelines can tell item types apart; no such spider is shown here.
        """
        title = scrapy.Field()
        position = scrapy.Field()
        publish_date = scrapy.Field()
    
    
    class JdItem(scrapy.Item):
        """Item with ``title``, ``position`` and ``publish_date`` fields.

        NOTE(review): presumably meant for a separate JD spider so that
        pipelines can tell item types apart; no such spider is shown here.
        """
        title = scrapy.Field()
        position = scrapy.Field()
        publish_date = scrapy.Field()

    settings.py

    # Scrapy logging threshold: messages below WARNING are not logged.
    LOG_LEVEL = "WARNING"
    
    
    # User-Agent header sent with requests. Note that this value presents the
    # crawler as desktop Chrome 67 on Windows 10 rather than identifying it.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'

     项目地址:https://github.com/CH-chen/tencent

  • 相关阅读:
    《数据通信与网络》笔记--数据链路层的成帧
    设计模式10---设计模式之原型模式(Prototype)
    Yii 控制dropdownlist / select 控件的宽度和 option 的宽度
    [置顶] 如何vs在cocos2dx项目中打印中文
    mongodb实现简单的增删改查
    北京和硅谷在创新方面的区别
    Android 解决Gallery下ScrollView滑动事件冲突
    Java 授权内幕--转载
    JAVA 上加密算法的实现用例---转载
    基于事件的 NIO 多线程服务器--转载
  • 原文地址:https://www.cnblogs.com/chvv/p/10332460.html
Copyright © 2011-2022 走看看