zoukankan html css js c++ java

scrapy 第一个案例（爬取腾讯招聘职位信息）

import scrapy
import json

class TzcSpider(scrapy.Spider):
    """Spider that walks a paginated job listing and appends each row to a file.

    Every row of the ``tablelist`` table is written as one JSON object per
    line (JSON Lines) to ``info.json`` in the current working directory.
    """

    # Spider name; must be unique within the Scrapy project.
    name = 'tzc'
    # First page to crawl.
    start_urls = ['https://hr.tencent.com/position.php?keywords=python&tid=0&lid=2268']

    def parse(self, response):
        """Extract the job rows of one listing page and follow the next page.

        Called by Scrapy for every downloaded page.

        Args:
            response: the Scrapy response for a listing page.

        Returns:
            A ``scrapy.Request`` for the next page, or ``None`` when the page
            has no usable "next" link (which ends the crawl).
        """
        rows = response.xpath(
            '//table[@class="tablelist"]/tr[@class = "even"]'
            '|//table[@class="tablelist"]/tr[@class = "odd"]'
        )
        # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII characters,
        # which would fail under a non-UTF-8 platform default encoding.
        with open('info.json', 'a', encoding='utf-8') as f:
            for row in rows:
                data = {
                    "jobName": row.xpath('./td[1]/a/text()').extract_first(),
                    "jobType": row.xpath('./td[2]/text()').extract_first(),
                    "Num": row.xpath('./td[3]/text()').extract_first(),
                    "Place": row.xpath('./td[4]/text()').extract_first(),
                    "Time": row.xpath('./td[5]/text()').extract_first(),
                }
                f.write(json.dumps(data, ensure_ascii=False))
                f.write('\n')

        # Look for the "next page" link.
        next_href = response.xpath('//a[@id = "next"]/@href').extract_first()
        # Stop when there is no link. A "javascript:" pseudo-link is presumably
        # how a disabled next button is rendered on the last page - treat it
        # as the end of pagination too instead of requesting a bogus URL.
        if not next_href or next_href.strip().lower().startswith('javascript'):
            return None
        # The href is relative; resolve it against the current page URL.
        # With no explicit callback Scrapy routes the response back to parse().
        return scrapy.Request(response.urljoin(next_href))

查看全文

相关阅读:
SQLalchemy 字段类型
 爬虫学习
 Linux了解一下
 django-rest-framework
vue相关理论知识
 Django认证系统
 Form组件
 JS之AJAX
Django之中间件
 Django之ORM

原文地址：https://www.cnblogs.com/cxhzy/p/10299696.html