zoukankan      html  css  js  c++  java
  • 拉勾爬虫

    # -*-coding:utf-8-*-
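    '''
        FileName: LaG job-posting scraper
        CreateTime: 2018-4-10
        Author: ___dx___
        FileDescription: Fetches job postings from Lagou and saves the matching ones to an Excel (.xls) file.
    '''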
    import requests
    import xlwt
    import ssl
    
    # Replace the ssl module's default HTTPS context factory with the unverified
    # one, so code that builds its context through this hook skips certificate
    # verification.
    # NOTE(review): presumably requests/urllib3 do not go through this hook, in
    # which case this line has no effect on the requests call below -- confirm.
    ssl._create_default_https_context = ssl._create_unverified_context    # HTTPS certificate verification (turned off)
    
    class Lagou_job(object):
        """Fetch job postings from Lagou's position search endpoint and export them to .xls.

        The search keyword ('测试'), the row filter (full-time postings in
        Shenzhen) and the output file name are hard-coded.
        """

        # (header text, key in the job dict) for every exported column, in sheet order.
        _COLUMNS = (
            (u'公司全名', 'companyFullName'),
            (u'公司简称', 'companyShortName'),
            (u'城市', 'city'),
            (u'区域', 'district'),
            (u'工作性质', 'jobNature'),
            (u'职位名称', 'positionName'),
            (u'薪资范围', 'salary'),
            (u'职位', 'secondType'),
            (u'工作年限', 'workYear'),
            (u'公司规模', 'companySize'),
            (u'学历要求', 'education'),
        )

        def __init__(self):
            """Store the endpoint URL and the request headers sent with every search."""
            self.url = 'https://www.lagou.com/jobs/positionAjax.json?px=new&needAddtionalResult=false'
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
                'Referer': 'https://www.lagou.com/jobs/list_%E6%B5%8B%E8%AF%95?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
                # Was 'keep - alive'; the embedded spaces made it an invalid header token.
                'Connection': 'keep-alive',
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Origin': 'https://www.lagou.com',
                'X-Anit-Forge-Code': '0',
                'X-Anit-Forge-Token': 'None',
                'X-Requested-With': 'XMLHttpRequest',
            }

        @staticmethod
        def _is_wanted(job):
            """Return True when the posting is a full-time job located in Shenzhen."""
            # ``or u''`` makes a null field count as "no match" instead of raising TypeError.
            return (u'全职' in (job['jobNature'] or u'')
                    and u'深圳' in (job['city'] or u''))

        # Fetch one page from the search endpoint.
        def getJobList(self, page):
            """Request one page of search results and return its list of job dicts.

            Args:
                page: Page number, sent as the ``pn`` form field.

            Returns:
                The value stored at ``content -> positionResult -> result`` in the
                JSON response.

            Raises:
                KeyError: If a key on that path is missing from the response.
            """
            self.data = {
                'first': 'true',
                'pn': page,
                'kd': '测试'
            }
            # A fresh session is used per call; the context manager releases its
            # connections once the response body has been parsed.
            with requests.Session() as session:
                res = session.post(self.url, data=self.data, headers=self.headers)
                result = res.json()
            print(result)  # debug
            print(res.status_code)
            return result['content']['positionResult']['result']

        # Save the fetched results to Excel.
        def saveExcel(self):
            """Write the matching postings of result page 1 to "深圳测试_By_dx.xls".

            Row 0 holds the column headers; every posting accepted by
            ``_is_wanted`` is appended as one row below it.
            """
            excelTabel = xlwt.Workbook()  # the workbook object
            # Writing the same cell twice makes xlwt raise
            #   Exception: Attempt to overwrite cell: sheetname=u'sheet 1' rowx=0 colx=0
            # so the sheet is created with cell_overwrite_ok=True.
            sheet_1 = excelTabel.add_sheet('daixiang', cell_overwrite_ok=True)
            for col, (title, _key) in enumerate(self._COLUMNS):
                sheet_1.write(0, col, title)

            n = 1  # next free data row
            for page in range(1, 2):  # page 1 only; widen the range to fetch more pages
                job = None
                for job in self.getJobList(page=page):
                    if not self._is_wanted(job):
                        continue
                    for col, (_title, key) in enumerate(self._COLUMNS):
                        sheet_1.write(n, col, job[key])
                    n += 1
                # Progress output: the last posting seen on this page. Skipped for an
                # empty page, where ``job`` was never bound by the loop.
                if job is not None:
                    print(job['companyShortName'], job['salary'])

            # Save the workbook to disk.
            excelTabel.save("深圳测试_By_dx.xls")
    
    if __name__ == '__main__':
        # Script entry point: build the scraper and run the export to Excel.
        scraper = Lagou_job()
        scraper.saveExcel()
  • 相关阅读:
    java笔记之IO详解——序列流
    java笔记之IO详解——输出字符流
    java笔记之IO详解——输入字符流
    java笔记之IO详解——输出字节流
    Nginx同一个域名部署多个项目
    服务器安装mongo数据库
    服务器安装node
    服务器Nginx配置及文件目录
    笔记 [待整理]
    vue-cli3打包app物理按键失效的问题[已解决]
  • 原文地址:https://www.cnblogs.com/jsondai/p/11393056.html
Copyright © 2011-2022 走看看