zoukankan      html  css  js  c++  java
  • Scrapy实现腾讯招聘网信息爬取【Python】

    一.腾讯招聘网

    二.代码实现

      1.spider爬虫

     1 # -*- coding: utf-8 -*-
     2 import scrapy
     3 from Tencent.items import TencentItem
     4 
     class TencentSpider(scrapy.Spider):
         """Crawl the paginated job listing at hr.tencent.com.

         Every table row whose class is ``even`` or ``odd`` becomes one
         ``TencentItem``. After a page is processed, the spider requests the
         next page by increasing the ``start`` query value by ``page_size``
         until ``max_offset`` has been requested or a page contains no rows.
         """

         name = 'tencent'
         allowed_domains = ['tencent.com']
         base_url = 'https://hr.tencent.com/position.php?&start='
         # ``start`` value of the most recently scheduled page.
         offset = 0
         # Amount added to ``start`` for each following page.
         page_size = 10
         # Largest ``start`` value that will be requested.
         # NOTE(review): 2770 looks like a snapshot of the site's size when the
         # spider was written -- confirm or override before reuse.
         max_offset = 2770
         start_urls = [base_url + str(offset)]

         # (item field, XPath relative to a listing row), in the order the
         # fields are stored on the item.
         field_xpaths = (
             ('positionName', "./td[1]/a/text()"),
             ('positionType', "./td[2]/text()"),
             ('positionNumber', "./td[3]/text()"),
             ('location', "./td[4]/text()"),
             ('publishTime', "./td[5]/text()"),
         )

         def parse(self, response):
             """Yield one item per listing row, then the request for the next page.

             Args:
                 response: the downloaded listing page.

             Yields:
                 ``TencentItem`` objects, followed by at most one
                 ``scrapy.Request`` for the next page.
             """
             rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
             if not rows:
                 # Nothing to extract here, so do not keep requesting pages.
                 self.logger.info(
                     "No listing rows found at %s; stopping pagination",
                     response.url,
                 )
                 return

             for row in rows:
                 item = TencentItem()
                 for field, xpath in self.field_xpaths:
                     # An empty cell yields "" instead of raising IndexError,
                     # which would abort this page and drop the next-page
                     # request below.
                     item[field] = row.xpath(xpath).extract_first(default="")
                 yield item

             # Schedule the next page while the configured limit is not reached.
             if self.offset < self.max_offset:
                 self.offset += self.page_size
                 next_url = self.base_url + str(self.offset)
                 # The URL stays on an allowed domain and is different for every
                 # page, so the default filtering is fine (no dont_filter needed).
                 yield scrapy.Request(next_url, callback=self.parse)

      2.管道

     1 # -*- coding: utf-8 -*-
     2 
     3 # Define your item pipelines here
     4 #
     5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
     6 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
     7 import json
     8 
     class TencentPipeline(object):
         """Append every scraped item to ``Tencent.json`` as one JSON object per line.

         Each line is a JSON object followed by a comma, so the file as a
         whole is not a single valid JSON document.
         """

         def __init__(self):
             # UTF-8 is set explicitly because process_item writes non-ASCII
             # text unescaped (ensure_ascii=False); relying on the platform
             # default encoding can fail or produce a mis-encoded file.
             self.f = open("Tencent.json", "w", encoding="utf-8")

         def process_item(self, item, spider):
             """Serialize ``item`` and write it as one line of the output file.

             Args:
                 item: the scraped item; anything ``dict()`` accepts.
                 spider: the spider that produced the item (unused).

             Returns:
                 The same ``item``, unchanged.
             """
             text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
             self.f.write(text)
             return item

         def close_spider(self, spider):
             """Close the output file."""
             self.f.close()

      3.实体

     1 # -*- coding: utf-8 -*-
     2 
     3 # Define here the models for your scraped items
     4 #
     5 # See documentation in:
     6 # https://doc.scrapy.org/en/latest/topics/items.html
     7 
     8 import scrapy
     9 
    10 
    11 class TencentItem(scrapy.Item):
    12     # define the fields for your item here like:
    13     # name = scrapy.Field()
    14     # 职位名称
    15     positionName = scrapy.Field()
    16     # 职位类别
    17     positionType = scrapy.Field()
    18     # 人数
    19     positionNumber = scrapy.Field()
    20     # 地点
    21     location = scrapy.Field()
    22     # 发布时间
    23     publishTime = scrapy.Field()

    三.结果【部分展示】

    {"positionName": "15605-动作RPG手游游戏活动策划", "positionType": "产品/项目类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "21645-高级法律顾问", "positionType": "职能类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "HGJ-senior legal counsel(MA)", "positionType": "职能类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "19837-电影新媒体媒介经理", "positionType": "市场类", "positionNumber": "1", "location": "北京", "publishTime": "2019-02-14"},
    {"positionName": "HGJ-Legal Counsel (Technology Transactions)", "positionType": "职能类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "22989-云服务平台部--服务运营中心总监", "positionType": "技术类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "22989-经营平台产品中心web前端开发", "positionType": "技术类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "22989-视频云高级Web前端开发", "positionType": "技术类", "positionNumber": "2", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "27554-腾讯音乐商业制片人(上海)", "positionType": "市场类", "positionNumber": "1", "location": "上海", "publishTime": "2019-02-14"},
    {"positionName": "30361-天天P图图像处理后台开发(上海)", "positionType": "技术类", "positionNumber": "1", "location": "上海", "publishTime": "2019-02-14"},
    {"positionName": "15573-MMORPG UE4手游资深美术3D设计(上海)", "positionType": "设计类", "positionNumber": "1", "location": "上海", "publishTime": "2019-02-14"},
    {"positionName": "15573-MMORPG UE4手游3D动画设计师(上海)", "positionType": "设计类", "positionNumber": "1", "location": "上海", "publishTime": "2019-02-14"},
    {"positionName": "15573-MMORPG UE4手游3D特效美术师(上海)", "positionType": "设计类", "positionNumber": "1", "location": "上海", "publishTime": "2019-02-14"},
    {"positionName": "15573-MMORPG UE4手游交互设计师(上海)", "positionType": "设计类", "positionNumber": "1", "location": "上海", "publishTime": "2019-02-14"},
    {"positionName": "AQ-产品安全经理(深圳)", "positionType": "职能类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "AQ-产品安全经理(广州)", "positionType": "职能类", "positionNumber": "1", "location": "广州", "publishTime": "2019-02-14"},
    {"positionName": "29050-数据安全经理/专家(深圳)", "positionType": "职能类", "positionNumber": "2", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "29050-数据安全经理/专家(北京)", "positionType": "职能类", "positionNumber": "2", "location": "北京", "publishTime": "2019-02-14"},
    {"positionName": "AQ-行业合作经理(北京)", "positionType": "职能类", "positionNumber": "1", "location": "北京", "publishTime": "2019-02-14"},
    {"positionName": "AQ-行业合作经理(深圳)", "positionType": "职能类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "27553-腾讯音乐人曲库运营", "positionType": "内容编辑类", "positionNumber": "2", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "22086-体育号创作平台产品经理", "positionType": "产品/项目类", "positionNumber": "1", "location": "北京", "publishTime": "2019-02-14"},
    {"positionName": "22086-体育号CP管理产品经理 ", "positionType": "产品/项目类", "positionNumber": "1", "location": "北京", "publishTime": "2019-02-14"},
    {"positionName": "22086-体育号内容质量产品经理", "positionType": "产品/项目类", "positionNumber": "1", "location": "北京", "publishTime": "2019-02-14"},
    {"positionName": "28297-二次元手游本地化策划(深圳)", "positionType": "产品/项目类", "positionNumber": "2", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "GY0-腾讯云海外商务拓展", "positionType": "市场类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "PCG10-高级产品经理(工具产品方向)", "positionType": "产品/项目类", "positionNumber": "1", "location": "成都", "publishTime": "2019-02-14"},
    {"positionName": "18432-策略分析师", "positionType": "产品/项目类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "18432-基金高级分析师", "positionType": "产品/项目类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "21309-在线教育运营专家/增长黑客", "positionType": "产品/项目类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "28481-高级医疗商务拓展经理(北京)", "positionType": "市场类", "positionNumber": "1", "location": "北京", "publishTime": "2019-02-14"},
    {"positionName": "21882-高级医学编辑(深圳/北京)", "positionType": "内容编辑类", "positionNumber": "2", "location": "北京", "publishTime": "2019-02-14"},
    {"positionName": "18402-MMO手游-平台渠道运营", "positionType": "产品/项目类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "28170-腾讯游戏直播业务管理经理(深圳)", "positionType": "产品/项目类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "28170-腾讯游戏直播内容品牌经理(深圳)", "positionType": "市场类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "PCG10-浏览器阅读中心后台开发工程师(深圳)", "positionType": "技术类", "positionNumber": "1", "location": "深圳", "publishTime": "2019-02-14"},
    {"positionName": "PCG10-浏览器阅读中心前端开发工程师(深圳)", "positionType": "技术类", "positionNumber": "2", "location": "深圳", "publishTime": "2019-02-14"},

     四.详情

      

  • 相关阅读:
    Sql获取表中随机1到n条数据
    SQLite相关
    Eclipse自动生成api时报错“警告: 编码 GBK 的不可映射字符”
    Git基础命令
    Oracle查询结果行转列,列转行
    字符串编码方式转换
    Java字符串匹配正则表达式
    python爬虫--编码问题y
    GET与POST方法
    python爬虫(1)--Urllib库的基本使用
  • 原文地址:https://www.cnblogs.com/yszd/p/10380648.html
Copyright © 2011-2022 走看看