  • Scraping Tencent job postings with Scrapy

    Create the project:
    scrapy startproject tencent
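
    startproject lays down Scrapy's standard project skeleton; the tree below is the usual layout (newer Scrapy versions may also add a middlewares.py):

    tencent/
        scrapy.cfg            # deploy configuration
        tencent/
            __init__.py
            items.py          # item definitions
            pipelines.py      # item pipelines
            settings.py       # project settings
            spiders/          # spider modules go here
                __init__.py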

    Write items.py
    Define the TencentItem class with one field per attribute to scrape:

    import scrapy

    class TencentItem(scrapy.Item):
        # define the fields for your item here like:
        # position name
        positionname = scrapy.Field()
        # detail link
        positionlink = scrapy.Field()
        # position category
        positionType = scrapy.Field()
        # number of openings
        peopleNum = scrapy.Field()
        # work location
        workLocation = scrapy.Field()
        # publish date
        publishTime = scrapy.Field()
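
    Items expose a dict-like interface, so fields can be set and read by key. A quick sketch (the sample value is hypothetical):

    from tencent.items import TencentItem

    item = TencentItem()
    item['positionname'] = 'Backend Engineer'  # hypothetical sample value
    print(dict(item))  # {'positionname': 'Backend Engineer'}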

    Create a basic spider

    scrapy genspider tencentPosition "tencent.com"
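
    genspider creates tencentPosition.py under tencent/spiders/ from Scrapy's basic template; the generated skeleton looks roughly like this before editing:

    # -*- coding: utf-8 -*-
    import scrapy

    class TencentpositionSpider(scrapy.Spider):
        name = 'tencentPosition'
        allowed_domains = ['tencent.com']
        start_urls = ['http://tencent.com/']

        def parse(self, response):
            pass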

    The finished tencentPosition.py:

    # -*- coding: utf-8 -*-
    import scrapy
    from tencent.items import TencentItem

    class TencentpositionSpider(scrapy.Spider):
        name = "tencent"
        allowed_domains = ["tencent.com"]

        url = "http://hr.tencent.com/position.php?&start="
        offset = 0

        start_urls = [url + str(offset)]

        def parse(self, response):
            for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
                # initialize an item for this row
                item = TencentItem()

                # position name
                item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0]
                # detail link
                item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0]
                # position category
                item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
                # number of openings
                item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
                # work location
                item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
                # publish date
                item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

                yield item

            # After finishing a page, bump the offset by 10, build the next
            # page's URL, and send a new request handled by self.parse.
            if self.offset < 1680:
                self.offset += 10
                yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
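
    One caveat: extract()[0] raises IndexError whenever a cell has no text node. A minimal defensive sketch (not from the original post) using extract_first() with a default, on a hypothetical table row:

    from scrapy.selector import Selector

    html = "<table><tr class='even'><td><a href='/p?id=1'>Engineer</a></td><td></td></tr></table>"
    row = Selector(text=html).xpath("//tr[@class='even']")[0]

    # extract()[0] would raise IndexError here because td[2] is empty;
    # extract_first() returns the given default instead.
    position_type = row.xpath("./td[2]/text()").extract_first(default="")
    print(repr(position_type))  # ''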

    The pipeline file
    pipelines.py

    import json

    class TencentPipeline(object):
        def __init__(self):
            # open the output file once when the pipeline is created
            self.filename = open("tencent.json", "w", encoding="utf-8")

        def process_item(self, item, spider):
            # one JSON object per line; ensure_ascii=False keeps Chinese text readable
            text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
            self.filename.write(text)
            return item

        def close_spider(self, spider):
            self.filename.close()
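
    As an aside, Scrapy's built-in feed exports can write JSON without a custom pipeline; the hand-written pipeline above mainly illustrates the process_item hook:

    scrapy crawl tencent -o tencent.json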

    Register the pipeline in settings.py; the number (0-1000) is the pipeline's order, and lower values run first:

    ITEM_PIPELINES = {
        'tencent.pipelines.TencentPipeline': 300,
    }


    Add request headers via DEFAULT_REQUEST_HEADERS


    settings.py
    BOT_NAME = 'tencent'
    
    SPIDER_MODULES = ['tencent.spiders']
    NEWSPIDER_MODULE = 'tencent.spiders'
    
    ROBOTSTXT_OBEY = True
    
    DOWNLOAD_DELAY = 2
    
    DEFAULT_REQUEST_HEADERS = {
        "User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    }
    
    ITEM_PIPELINES = {
        'tencent.pipelines.TencentPipeline': 300,
    }
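
    With the item, spider, pipeline, and settings in place, run the crawl from the project root (the directory containing scrapy.cfg):

    scrapy crawl tencent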
  • Original post: https://www.cnblogs.com/wanglinjie/p/9210850.html