zoukankan      html  css  js  c++  java
  • python3爬虫-通过requests获取拉勾职位信息

    import requests, json, time, tablib
    
    
    def send_ajax_request(data: dict):
        """POST the search payload to Lagou's position-AJAX endpoint.

        Args:
            data: form payload (page number ``pn``, keyword ``kd``, ...).

        Returns:
            The parsed JSON dict on HTTP 200, otherwise an empty dict.
        """
        try:
            ajax_response = session.post(url=ajax_url,
                                         params={"needAddtionalResult": "false", "city": city},
                                         data=data,
                                         headers=ajax_headers,
                                         timeout=timeout)
            # Only trust a 200 response; anything else degrades to {}.
            if ajax_response.status_code == 200:
                return ajax_response.json()
            return {}
        except (requests.RequestException, ValueError):
            # Network/timeout failures and non-JSON bodies degrade to {};
            # unlike the former bare `except Exception`, genuine programming
            # errors are no longer silently swallowed.
            return {}
    
    
    def get_job_info(info_dic: dict):
        """Yield one record per job position in an AJAX response.

        Args:
            info_dic: JSON dict from ``send_ajax_request``; may be ``{}`` when
                the request failed.

        Yields:
            A JSON string per job when ``is_save_txtfile`` is true, otherwise
            the record's values in header order for the tablib dataset.
        """
        # Fields copied verbatim from each position entry; order matches
        # the dataset headers declared in the main block.
        fields = ("companyId", "companyFullName", "positionName", "workYear",
                  "education", "salary", "jobNature", "companySize", "city",
                  "district", "createTime")
        # Guard every level: a failed request returns {} and the original
        # chained .get() calls raised AttributeError on the resulting None.
        content = info_dic.get("content") or {}
        position_result = content.get("positionResult") or {}
        for job in position_result.get("result") or []:
            dic = {field: job.get(field) for field in fields}
            if is_save_txtfile:
                yield json.dumps(dic, ensure_ascii=False)
            else:
                yield dic.values()
    
    
    def save_to_file(json_data):
        """Append each JSON line from ``get_job_info`` to the open text
        file ``f``, one record per line (newline-delimited JSON).
        """
        for data in json_data:
            # The original had a raw line break inside the string literal
            # (a SyntaxError); the intended separator is "\n".
            f.write(data + "\n")
    
    
    def save_to_excel(list_data):
        """Append every row yielded by ``get_job_info`` to the module-level
        tablib ``dataset``, which is exported to .xls at shutdown.
        """
        for row in list_data:
            dataset.append(row)
    
    
    def run():
        """Crawl result pages 1-30 and persist each page's records.

        Sleeps ``sleeptime`` seconds between pages to stay under the
        site's anti-crawler throttling.
        """
        for page in range(1, 31):
            payload = {
                "first": "false",
                "pn": page,
                # Reuse the job_name global so the AJAX keyword stays
                # consistent with doc_url instead of hard-coding "python".
                "kd": job_name,
            }
            records = get_job_info(send_ajax_request(payload))
            if is_save_txtfile:
                save_to_file(records)
            else:
                save_to_excel(records)
            print("正在保存数据")
            time.sleep(sleeptime)
    
    
    if __name__ == '__main__':
        # --- crawl configuration ---
        session = requests.Session()
        job_name = "python"  # search keyword, also used by run()
        city = "成都"
        timeout = 5          # per-request timeout in seconds
        sleeptime = 10       # delay between pages (politeness / throttling)
        doc_url = "https://www.lagou.com/jobs/list_{job_name}".format(job_name=job_name)
        session.headers[
            "User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
        session.headers["Host"] = "www.lagou.com"

        # Hit the HTML listing page first so the session collects the cookies
        # the AJAX endpoint expects, and so Referer matches a real page.
        doc_response = session.get(url=doc_url, params={"city": city})

        ajax_headers = {
            "Origin": "https://www.lagou.com",
            "Referer": doc_response.url
        }

        # NOTE(review): the dangling "?=false" query looks vestigial — the real
        # flag is sent via params in send_ajax_request; confirm before removing.
        ajax_url = "https://www.lagou.com/jobs/positionAjax.json?=false"

        is_save_txtfile = False  # False -> collect rows and export an .xls

        if not is_save_txtfile:
            dataset = tablib.Dataset()
            dataset.headers = ["companyId", "companyFullName", "positionName", "workYear",
                               "education", "salary", "jobNature", "companySize", "city",
                               "district", "createTime"]
        else:
            # Open the text sink only when it is actually used; the original
            # opened it unconditionally and leaked the handle in excel mode
            # (the name was even rebound by the xls `with` block below).
            f = open("jobinfo.txt", "a", encoding="utf-8")

        try:
            run()
        except Exception:
            # Top-level boundary: keep a partial crawl's data writable below.
            print('出错了')
        finally:
            if is_save_txtfile:
                f.close()
            else:
                with open("jobInfo.xls", "wb") as xls_file:
                    xls_file.write(dataset.xls)
  • 相关阅读:
    airprobe 安装 part2
    USRP Daugherboard: DBSRX
    电赛又见电赛!2011电赛之我见
    USRP Experiment 1: Data transmission
    How to Switch Between GDM and KDM on Ubuntu
    USRP Daugherboard: BasicRX
    Oracle Analyze 命令 详解
    Oracle SQL优化 总结
    Oracle SQL优化 总结
    Oracle 用拼接字符串更新表 测试
  • 原文地址:https://www.cnblogs.com/zhuchunyu/p/10765945.html
Copyright © 2011-2022 走看看