  • Weekly summary: scraping Lagou job data

    1. Advanced scraping: collecting job data from Lagou

    The .py file:

    import requests
    import math
    import time
    import pandas as pd
    
    
    def get_json(url, num):
        """
        Request the given url with the required headers and form data, and
        return the parsed JSON for one page of job listings.
        :return: dict parsed from the JSON response
        """
        # search results page, visited first to obtain session cookies
        url1 = 'https://www.lagou.com/jobs/list_python%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88?labelWords=&fromSearch=true&suginput='
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
            'Host': 'www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': 'None',
            'X-Requested-With': 'XMLHttpRequest'
        }
        data = {
            'first': 'true' if num == 1 else 'false',  # 'first' is true only on page 1
            'pn': num,  # page number
            'kd': 'python工程师'}  # search keyword ("Python engineer")
        # visit the search page with a session first to pick up the cookies
        # that the Ajax endpoint expects
        s = requests.Session()
        print('Session created:', s, '\n')
        s.get(url=url1, headers=headers, timeout=3)
        cookie = s.cookies
        print('Cookies obtained:', cookie, '\n')
        res = requests.post(url, headers=headers, data=data, cookies=cookie, timeout=3)
        res.raise_for_status()
        res.encoding = 'utf-8'
        page_data = res.json()
        print('Response JSON:', page_data, '\n')
        return page_data
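
    # Note: as consumed below, the response JSON has roughly this shape:
    #   {'content': {'positionResult': {'totalCount': <int>,
    #                                   'result': [<position dict>, ...]}}}
    # where each position dict carries the keys read in get_page_info.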
    
    
    def get_page_num(count):
        """
        Work out how many pages to fetch. Lagou shows at most 30 result pages
        for a search, with at most 15 positions per page.
        :return: number of pages to request, capped at 30
        """
        return min(math.ceil(count / 15), 30)
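
    # Example: a search reporting 450 matches needs math.ceil(450 / 15) == 30
    # pages; one reporting 500 matches is still capped at 30, since Lagou never
    # serves more than 30 result pages for a query.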
    
    
    def get_page_info(jobs_list):
        """
        Extract the fields of interest from each position record.
        :param jobs_list: list of position dicts from one page of the response
        :return: list of per-position field lists
        """
        page_info_list = []
        for i in jobs_list:  # iterate over every position on the page
            job_info = []
            job_info.append(i['companyFullName'])
            job_info.append(i['companyShortName'])
            job_info.append(i['companySize'])
            job_info.append(i['financeStage'])
            job_info.append(i['district'])
            job_info.append(i['positionName'])
            job_info.append(i['workYear'])
            job_info.append(i['education'])
            job_info.append(i['salary'])
            job_info.append(i['positionAdvantage'])
            job_info.append(i['industryField'])
            job_info.append(i['firstType'])
            job_info.append(i['companyLabelList'])
            job_info.append(i['secondType'])
            job_info.append(i['city'])
            page_info_list.append(job_info)
        return page_info_list
    
    
    def main():
        url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
        first_page = get_json(url, 1)
        total_page_count = first_page['content']['positionResult']['totalCount']
        num = get_page_num(total_page_count)
        total_info = []
        time.sleep(10)  # pause before paging through the results
        print('Total Python positions: {}, total pages: {}'.format(total_page_count, num))
        for page in range(1, num + 1):
            # fetch the JSON response for this page
            page_data = get_json(url, page)
            # all Python-related positions on this page
            jobs_list = page_data['content']['positionResult']['result']
            page_info = get_page_info(jobs_list)
            print('Positions on this page: %s' % page_info, '\n')
            total_info += page_info
            print('Scraped page {}; {} positions collected so far'.format(page, len(total_info)))
            time.sleep(20)  # throttle requests to avoid being blocked
            # rewrite the accumulated data to CSV after every page, so progress
            # is kept even if a later request fails
            df = pd.DataFrame(data=total_info,
                              columns=['Company full name', 'Company short name', 'Company size',
                                       'Financing stage', 'District', 'Position name',
                                       'Work experience', 'Education', 'Salary', 'Position perks',
                                       'Industry field', 'Position type', 'Company benefits',
                                       'Secondary position type', 'City'])
            df.to_csv('Python_development_engineer2.csv', index=False)
            print('Position data saved')
    
    
    if __name__ == '__main__':
        main()
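
    As an aside, the field-by-field append in get_page_info can be written more
    compactly as a list comprehension over a tuple of field names. A minimal
    sketch (get_page_info_compact and FIELDS are names introduced here for
    illustration; like the original, it assumes every record contains all of
    these keys):

    # Sketch: a compact equivalent of get_page_info. FIELDS mirrors the keys
    # the original function reads, in the same order.
    FIELDS = ('companyFullName', 'companyShortName', 'companySize', 'financeStage',
              'district', 'positionName', 'workYear', 'education', 'salary',
              'positionAdvantage', 'industryField', 'firstType', 'companyLabelList',
              'secondType', 'city')

    def get_page_info_compact(jobs_list):
        """Build one row of selected fields for each position record."""
        return [[job[field] for field in FIELDS] for job in jobs_list]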

    Run the script in JupyterLab with %load lagou-Copy1.py and %run lagou-Copy1.py. (Note: the file written by df.to_csv('Python_development_engineer2.csv', index=False) is created automatically; there is no need to create it beforehand.)
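
    To sanity-check the result, the CSV can be read back in the same notebook; a
    minimal sketch (the file name is the one used by the script above):

    import pandas as pd

    # Load the CSV the scraper wrote and inspect its shape and first rows to
    # confirm the columns came through as expected.
    df = pd.read_csv('Python_development_engineer2.csv')
    print(df.shape)
    print(df.head())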

  • Original post: https://www.cnblogs.com/chenaiiu/p/13946540.html