  • Scraping Tencent job postings with CrawlSpider

    With CrawlSpider there is no need to handle URLs manually: it automatically extracts every link in the response that matches its link-extraction rules and follows them.
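Conceptually, the `allow` pattern of a `LinkExtractor` is just a regular expression tested against candidate URLs. A minimal standard-library sketch of that filtering step (no Scrapy required; the URLs below are made up for illustration):

```python
import re

# Hypothetical links as they might appear on the listing page.
urls = [
    "http://hr.tencent.com/position.php?&start=0#a",
    "http://hr.tencent.com/position.php?&start=10#a",
    "http://hr.tencent.com/about.php",
]

# Same idea as LinkExtractor(allow=(r"start=\d+",)): keep only
# links whose URL contains "start=" followed by digits.
pattern = re.compile(r"start=\d+")
page_links = [u for u in urls if pattern.search(u)]

print(page_links)  # only the two pagination links survive
```

A real `LinkExtractor` additionally deduplicates links and resolves them against the response URL; the regex test is just the `allow` part.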

    Create the project
    scrapy startproject TencentSpider

    items.py

    import scrapy
    
    class TencentItem(scrapy.Item):
        # define the fields for your item here like:
        # position name
        positionname = scrapy.Field()
        # detail page link
        positionlink = scrapy.Field()
        # position category
        positionType = scrapy.Field()
        # number of openings
        peopleNum = scrapy.Field()
        # work location
        workLocation = scrapy.Field()
        # publish date
        publishTime = scrapy.Field()

    Create the CrawlSpider from the crawl template

    scrapy genspider -t crawl tencent hr.tencent.com

    tencent.py

    import scrapy
    # import the CrawlSpider class and Rule
    from scrapy.spiders import CrawlSpider, Rule
    # import LinkExtractor, used to pull links matching a rule out of responses
    from scrapy.linkextractors import LinkExtractor
    from TencentSpider.items import TencentItem
    
    class TencentSpider(CrawlSpider):
        name = "tencent"
        allowed_domains = ["hr.tencent.com"]
        start_urls = ["http://hr.tencent.com/position.php?&start=0#a"]
    
        # link-extraction rule applied to each response; yields the
        # list of links whose URL matches the regex
        pagelink = LinkExtractor(allow=(r"start=\d+",))
    
        rules = [
            # request each extracted link, keep following new pages,
            # and handle every response with the named callback
            Rule(pagelink, callback="parseTencent", follow=True)
        ]
    
        # callback invoked for every matched response
        def parseTencent(self, response):
            # listing rows alternate between class "even" and "odd"
            for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
                item = TencentItem()
                # position name
                item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0]
                # detail page link
                item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0]
                # position category
                item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
                # number of openings
                item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
                # work location
                item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
                # publish date
                item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
    
                yield item
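The row-level XPaths above address the five `<td>` cells of each table row with paths relative to the row. The same relative addressing can be sketched with the standard library's `xml.etree.ElementTree`, which supports a small XPath subset (the HTML fragment below is made up; Scrapy's real selectors are lxml-based and far more capable):

```python
import xml.etree.ElementTree as ET

# Hypothetical fragment shaped like one row of the listing table.
html = """
<table>
  <tr class="even">
    <td><a href="position_detail.php?id=1">Backend Engineer</a></td>
    <td>Technology</td><td>2</td><td>Shenzhen</td><td>2018-06-21</td>
  </tr>
</table>
"""

root = ET.fromstring(html)
rows = root.findall(".//tr[@class='even']")
for row in rows:
    # relative paths, as in the spider: td[1]/a, td[4], ...
    name = row.find("./td[1]/a").text
    link = row.find("./td[1]/a").get("href")
    city = row.find("./td[4]").text
    print(name, link, city)
```

As in XPath generally, `td[1]` is 1-based, so it selects the first cell of the row, not the second.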

    pipelines.py

    import json
    
    class TencentPipeline(object):
        def __init__(self):
            self.filename = open("tencent.json", "w", encoding="utf-8")
    
        def process_item(self, item, spider):
            text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
            self.filename.write(text)
            return item
    
        def close_spider(self, spider):
            self.filename.close()
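In Python 3, `json.dumps(..., ensure_ascii=False)` returns a `str` that can be written directly to a file opened with `encoding="utf-8"`; calling `.encode()` first, as the old Python 2 version did, would raise a `TypeError` on a text-mode file. A small sketch of the same serialization step (the item dict is made up):

```python
import json

# Hypothetical item, shaped like TencentItem after dict() conversion.
item = {"positionname": "测试开发工程师", "workLocation": "深圳"}

# ensure_ascii=False keeps the Chinese characters readable
# instead of escaping them to \uXXXX sequences.
line = json.dumps(item, ensure_ascii=False) + ",\n"

print(line, end="")
```

Stripping the trailing `",\n"` gives back valid JSON, so each written line round-trips through `json.loads`.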

    settings.py

    BOT_NAME = 'TencentSpider'
    
    SPIDER_MODULES = ['TencentSpider.spiders']
    NEWSPIDER_MODULE = 'TencentSpider.spiders'
    
    # file the log output is written to
    LOG_FILE = "tencentlog.log"
    # minimum severity to record; DEBUG keeps everything
    LOG_LEVEL = "DEBUG"
    
    ITEM_PIPELINES = {
        'TencentSpider.pipelines.TencentPipeline': 300,
    }

     Run the spider

    scrapy crawl tencent

  • Original post: https://www.cnblogs.com/wanglinjie/p/9211097.html