  • (Supplement) Scraping Zhilian Zhaopin job listings with the Scrapy framework -- Python jobs in Shanghai

    1. job.py:   

      From observation, the pageSize parameter controls how much of the listing comes back, and one page holds 90 jobs. A small interactive prompt was added: type a number and the spider downloads the job information for that many pages.
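
      Before writing the spider it helps to confirm the shape of the JSON the endpoint returns. A minimal check with requests (assuming the endpoint still answers a plain GET; the field names are the ones parse() relies on below):

    import requests

    url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId=538&kw=python&kt=3'
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    data = resp.json()

    results = data['data']['results']        # the list the spider iterates over
    print(len(results))                      # up to 90 jobs per page
    print(results[0]['jobName'], results[0]['salary'], results[0]['positionURL'])

      The spider itself: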

    # -*- coding: utf-8 -*-
    import scrapy
    import json
    from zhilianJob.items import ZhilianjobItem
    
    
    class JobSpider(scrapy.Spider):
        name = 'job'
        # allowed_domains = ['www.sou.zhaopin.com']
        # start_urls can be shortened to: https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId=538&kw=python&kt=3
        # start_urls = [
        #     'https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId=538&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=python&kt=3&=0&_v=0.02964699&x-zp-page-request-id=3e524df5d2b541dcb5ddb82028a5c1b6-1565749700925-710042&x-zp-client-id=2724abb6-fb33-43a0-af2e-f177d8a3e169']
        start_urls=['https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId=538&kw=python&kt=3']
    
        end = int(input('How many pages of jobs do you want? Enter a page count >>>: ').strip())
    
        def parse(self, response):
            # print(response.text)
            # print(111)
            data = json.loads(response.text)
            job = data['data']['results']
            # print(job)
            try:
                for j in job:
                    item = ZhilianjobItem()
                    item['job_name'] = j['jobName']
                    item['job_firm'] = j['company']['name']
                    item['job_firmPeople'] = j['company']['size']['name']
                    job_url = item['job_url'] = j['positionURL']
                    item['job_salary'] = j['salary']
                    item['job_type'] = j['jobType']['items'][0]['name']
                    item['job_yaoqiu'] = j['eduLevel']['name'] + ',' + j['workingExp']['name']
                    item['job_welfare'] = ','.join(j['welfare'])
                    yield scrapy.Request(url=job_url,callback=self.detail_parse,meta={'item':item})
    
                # once all jobs on this page have been queued, request the
                # remaining pages the user asked for
                if self.end > 1:
                    for i in range(2, self.end + 1):
                        url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=%s&cityId=538&kw=python&kt=3' % str(90 * i)
                        yield scrapy.Request(url=url, callback=self.parse)
            except Exception as e:
                print(e)
    
        def detail_parse(self,response):
            # print(response)
            # retrieve the item passed along from parse() via meta
            # print(2222)
            item = response.meta['item']
            job_address = response.xpath('//div[@class="job-address"]/div/span/text()').extract_first()
            item['job_address'] = job_address
            yield item
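
      To run the spider, the usual command applies (assuming the project was generated under the name zhilianJob, as the import at the top suggests):

    scrapy crawl job

      Because the follow-up pages only enlarge pageSize (180, 270, ...), they return the earlier jobs again; Scrapy's default duplicate filter drops the repeated positionURL requests, so each job still reaches detail_parse() only once.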
    

    2. items.py

      Two more fields than before: job_url and job_address.

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class ZhilianjobItem(scrapy.Item):
        # define the fields for your item here like:
        job_name = scrapy.Field()           # job title
        job_firm = scrapy.Field()           # company name
        job_firmPeople = scrapy.Field()     # company size
        job_url = scrapy.Field()            # job posting URL
        job_type = scrapy.Field()           # job type
        job_salary = scrapy.Field()         # salary
        job_yaoqiu = scrapy.Field()         # requirements (education + experience)
        job_welfare = scrapy.Field()        # benefits
        job_address = scrapy.Field()        # work address
        pass
    

    3. pipelines.py

     

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    import pymysql
    class ZhilianjobPipeline(object):
        conn = None
        mycursor = None
    
        def open_spider(self,spider):
            # utf8mb4 so the Chinese job strings survive the insert
            # (assumes the sh_python table itself uses a utf8 character set)
            self.conn = pymysql.connect(host='172.16.25.37', port=3306, user='root',
                                        password='root', db='scrapy', charset='utf8mb4')
            # get a cursor
            self.mycursor = self.conn.cursor()
    
            # print('Clearing out the previous data...')
            # # I only planned to keep the first page of data, so every crawl is
            # # fresh and the old rows in the database would be cleared first
            # sql1 = "truncate table sh_python"
            # self.mycursor.execute(sql1)
            print('Shanghai--python--download starting...\nPlease wait...')
        def process_item(self, item, spider):
            # print(0000)
            job_name = item['job_name']
            job_firm = item['job_firm']
            job_firmPeople = item['job_firmPeople']
            job_url = item['job_url']
            job_salary = item['job_salary']
            job_type = item['job_type']
            job_yaoqiu = item['job_yaoqiu']
            job_welfare = item['job_welfare']
            job_address = item['job_address']
    
            try:
                # parameterised query: pymysql escapes the values, so quotes in the
                # scraped text cannot break the SQL statement
                sql2 = "insert into sh_python VALUES (NULL,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                # execute the sql
                self.mycursor.execute(sql2, (job_name, job_firm, job_firmPeople, job_salary,
                                             job_type, job_yaoqiu, job_welfare, job_url, job_address))
                # commit
                self.conn.commit()
            except Exception as e:
                print(e)
                self.conn.rollback()
    
            return item
    
        def close_spider(self,spider):
            self.mycursor.close()
            self.conn.close()
            print('Shanghai--python--download finished...')
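
      For the pipeline to take effect it must be enabled in the project's settings.py, and the sh_python table must already exist with an auto-increment id column followed by nine text columns in the same order as the INSERT above. A minimal settings excerpt, assuming the standard project layout (the value 300 is just a conventional priority):

    # settings.py (excerpt)
    ITEM_PIPELINES = {
        'zhilianJob.pipelines.ZhilianjobPipeline': 300,
    }
    # ROBOTSTXT_OBEY = False   # may be needed if robots.txt blocks the API host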
    

    Results:

       

  • Original post: https://www.cnblogs.com/wshr210/p/11362742.html