zoukankan      html  css  js  c++  java
  • scrapy 爬取前程无忧

    spider

    # -*- coding: utf-8 -*-
    import scrapy
    from Jobs.items import JobsItem
    
    class Job51spiderSpider(scrapy.Spider):
        """Crawl 51job search-result pages and yield one ``JobsItem`` per row.

        The spider starts at page 1 of a fixed search URL and walks the result
        pages sequentially, using ``offset`` as the current page number.
        """

        name = 'Job51Spider'
        allowed_domains = ['www.51job.com', 'search.51job.com']
        # Page number of the most recently scheduled result page (starts at 1).
        offset = 1
        # Search URL prefix; the page number and ".html" are appended to it.
        url = "https://search.51job.com/list/090200,000000,0000,00,9,99,php,2,"
        start_urls = [url + str(offset) + ".html"]

        def parse(self, response):
            """Extract every job row from a result page, then follow the next page.

            Args:
                response: The downloaded search-result page.

            Yields:
                A ``JobsItem`` for each row under ``#resultList`` (the header
                row with class ``title`` is skipped), followed by a
                ``scrapy.Request`` for the next page while ``offset`` is below
                the total page count found in the pager.
            """
            # Imported locally so this block does not depend on a file-level import.
            import re

            self.logger.info("Parsing %s", response.url)

            for each in response.css('#resultList .el:not(.title)'):
                item = JobsItem()
                # Job title
                item['zwname'] = each.css('.t1 a').xpath('./@title').extract_first()
                # Company name
                item['gsname'] = each.css('.t2 a').xpath('./@title').extract_first()
                # Work location
                item['gzdd'] = each.css('.t3::text').extract_first()
                # Salary
                item['gz'] = each.css('.t4::text').extract_first()
                # Posting time
                item['fbtime'] = each.css('.t5::text').extract_first()
                yield item

            # The pager text is assumed to contain the total page count as its
            # first run of digits -- TODO confirm against the live markup.
            # The previous ``.split('')`` raised ValueError (empty separator)
            # on every call, so pagination never happened.
            pager_text = response.xpath(
                '//div[@class="dw_page"]/div/div/div/span/text()'
            ).extract_first()
            match = re.search(r'\d+', pager_text or '')
            if match is None:
                self.logger.warning(
                    "No total page count found on %s; stopping pagination",
                    response.url,
                )
                return

            total_pages = int(match.group())
            # Only schedule a request when there really is a next page, rather
            # than re-requesting the last page and relying on the dupe filter.
            if self.offset < total_pages:
                self.offset += 1
                next_url = self.url + str(self.offset) + ".html"
                yield scrapy.Request(url=next_url, callback=self.parse)

    items

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class JobsItem(scrapy.Item):
        """Scrapy item holding the fields collected for a single job listing."""

        # Job title
        zwname = scrapy.Field()
        # Company name
        gsname = scrapy.Field()
        # Work location
        gzdd = scrapy.Field()
        # Salary
        gz = scrapy.Field()
        # Posting time
        fbtime = scrapy.Field()

  • 相关阅读:
    7.Flask-上传文件和访问上传的文件
    Python 数字模块
    Django之模板语法
    decimal模块
    python中的计时器:timeit模块
    6.Flask-WTForms
    Django之ORM跨表操作
    公司 邮件 翻译 培训 长难句 16
    公司 邮件 翻译 培训 长难句 15
    公司 邮件 翻译 培训 长难句 14
  • 原文地址:https://www.cnblogs.com/sxqfuture/p/10256462.html
Copyright © 2011-2022 走看看