zoukankan      html  css  js  c++  java
  • python3-对拉勾网数据爬取及简单的数据分析

    #encoding:utf-8
    import bisect
    import json
    import os
    import time

    import matplotlib.pyplot as plt
    import pandas as pd
    import requests

    class LaGouAnsialy():
        """Scrape job postings from lagou.com and chart their salary distribution.

        ``ansaly_data`` pages through the search endpoint, keeps the education,
        salary and work-experience fields of every posting and writes them to
        ``DATA_FILE``. ``create_report`` reads that file back and draws a pie
        chart of the postings grouped by the upper bound of their salary range.
        ``main`` runs both steps in order.
        """

        # File the scraped records are written to and the report is read from.
        DATA_FILE = "lagou.json"
        # Exclusive upper limit of the page numbers requested (pages 1..49).
        MAX_PAGES = 50
        # Seconds to pause after each page of results.
        REQUEST_INTERVAL = 20
        # Seconds to wait for the server before giving up on a request.
        REQUEST_TIMEOUT = 10
        # Inclusive upper edges (in thousands) of the salary buckets; anything
        # above the last edge lands in a final open-ended bucket.
        SALARY_BOUNDS = (8, 12, 15, 18, 22, 30)

        def __init__(self):
            """Set up the request headers, endpoint, search term, city and result list."""
            # NOTE(review): the Cookie value looks like a captured browser
            # session and presumably expires -- confirm/refresh before running.
            self.headers = {
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                "Connection": "keep-alive",
                "Content-Length": "25",
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "Cookie": "JSESSIONID=ABAAABAABEEAAJA3893CB27253239CD99CA00B5B714A93D; WEBTJ-ID=20200102153439-16f652d85a5430-089bd0ca7e1d01-3a65420e-2073600-16f652d85a69d0; _ga=GA1.2.1705075091.1577950480; _gid=GA1.2.613899177.1577950480; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1577950480; user_trace_token=20200102153440-55dd897c-2d32-11ea-b0f6-525400f775ce; LGUID=20200102153440-55dd8ebd-2d32-11ea-b0f6-525400f775ce; TG-TRACK-CODE=index_search; X_MIDDLE_TOKEN=7064071e9d874446822efc2a3b85cc31; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216f65612f75770-0a06d44f49f747-3a65420e-2073600-16f65612f769d9%22%2C%22%24device_id%22%3A%2216f65612f75770-0a06d44f49f747-3a65420e-2073600-16f65612f769d9%22%7D; index_location_city=%E6%B7%B1%E5%9C%B3; X_HTTP_TOKEN=eba94a1ed2839078190020875166bed98c35c75552; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1578020091; LGSID=20200103105451-69586484-2dd4-11ea-a70a-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Futrack%2FtrackMid.html%3Ff%3Dhttps%253A%252F%252Fwww.lagou.com%252Fjobs%252Flist%255Fpython%252Fp-city%255F215%253Fpx%253Ddefault%26t%3D1578020088%26_ti%3D2; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%2Fp-city_215%3Fpx%3Ddefault; LGRID=20200103105451-695865ff-2dd4-11ea-a70a-5254005c3644; SEARCH_ID=51bf9f11cda9465093fe6165ce281bb2",
                "Host": "www.lagou.com",
                "Origin": "https://www.lagou.com",
                "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
                "X-Anit-Forge-Code": "0",
                "X-Anit-Forge-Token": "None",
                "X-Requested-With": "XMLHttpRequest",
            }
            self.request_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
            self.search = "python"
            self.city = "深圳"
            # Accumulates one dict per posting across all fetched pages.
            self.datas = []

        def send_request(self, page):
            """POST the search request for result page ``page`` and return the response.

            The request is sent with a timeout so a stalled connection raises
            instead of blocking the whole crawl forever.
            """
            param = {"needAddtionalResult": False, "city": self.city, "px": "default"}
            data = {"first": True, "pn": page, "kd": self.search}
            return requests.post(
                url=self.request_url,
                params=param,
                data=data,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )

        def ansaly_data(self):
            """Fetch result pages until one is empty, then dump the records to ``DATA_FILE``.

            Each posting is reduced to its education, salary and work-experience
            fields and appended to ``self.datas``.
            """
            for page in range(1, self.MAX_PAGES):
                resp = self.send_request(page)
                # Decode the body once; the original parsed it twice per page.
                results = resp.json()["content"]["positionResult"]["result"]
                if not results:
                    break
                self.datas.extend(
                    {
                        '学历': position['education'],
                        '薪水': position['salary'],
                        '工作经验': position['workYear'],
                    }
                    for position in results
                )
                # Pause between pages before issuing the next request.
                time.sleep(self.REQUEST_INTERVAL)
            # The records contain non-ASCII text, so pin the encoding rather than
            # relying on the platform default; the report reads the file as UTF-8.
            with open(self.DATA_FILE, "w", encoding="utf-8") as f:
                json.dump(self.datas, f, ensure_ascii=False)
            print("解析完成")

        @staticmethod
        def _salary_upper_k(salary):
            """Return the upper bound, in thousands, of a salary string.

            Accepts ranges such as ``"10k-15k"`` (either letter case) and
            single-value strings such as ``"30k以上"``, for which the only number
            present is used. Raises ``ValueError`` if no leading number is found.
            """
            upper_part = salary.lower().split("-")[-1]
            return int(upper_part.split("k")[0])

        @classmethod
        def _bucket_counts(cls, salary_counts):
            """Sum ``(salary, count)`` pairs into one total per salary bucket.

            Returns a list with ``len(SALARY_BOUNDS) + 1`` entries; bucket ``i``
            holds salaries whose upper bound is ``<= SALARY_BOUNDS[i]`` and above
            the previous edge, and the last bucket holds everything larger.
            """
            totals = [0] * (len(cls.SALARY_BOUNDS) + 1)
            for salary, count in salary_counts:
                upper = cls._salary_upper_k(salary)
                # bisect_left keeps a value equal to an edge in the lower bucket,
                # matching the inclusive "<=" comparisons of the original chain.
                totals[bisect.bisect_left(cls.SALARY_BOUNDS, upper)] += int(count)
            return totals

        def create_report(self):
            """Read ``DATA_FILE`` and show a pie chart of postings per salary bucket."""
            # Use a font with CJK glyphs so Chinese text renders correctly.
            plt.rcParams['font.sans-serif'] = ['SimHei']
            result = pd.read_json(self.DATA_FILE, encoding="utf-8")
            if result.empty:
                # Nothing was scraped, so there is no salary column to chart.
                print("没有可分析的数据")
                return
            salary_counts = result["薪水"].value_counts()
            data = self._bucket_counts(salary_counts.items())
            plt.figure(1, dpi=100)
            plt.pie(
                data,  # raw wedge sizes; matplotlib normalises them to percentages
                explode=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1],  # offset of each wedge from the centre
                colors=['y', 'r', 'g', '#89e8e1', '#69e8a1', "#98e8e1", "#46e8e2"],  # wedge colours
                labels=['<=8K', '8-12k', '12-15k', '15-18k', '18-22k', "22-30k", ">30k"],  # wedge labels
                labeldistance=1.1,  # distance of the labels from the centre
                autopct='%1.1f%%',  # percentage display format
                pctdistance=0.6,  # distance of the percentages from the centre
                shadow=False,  # draw wedges without a shadow
                startangle=90,  # rotate the start of the first wedge by 90 degrees
                radius=1.2,  # pie radius
            )
            plt.show()

        def main(self):
            """Scrape the postings, then build the report if the data file exists."""
            self.ansaly_data()
            if os.path.exists(os.path.join(os.getcwd(), self.DATA_FILE)):
                self.create_report()
            else:
                print("json文件未生成")
  • 相关阅读:
    java实现趣味拼算式
    windows下安装docker
    Docker_入门?只要这篇就够了!(纯干货适合0基础小白)
    网关支付、银联代扣通道、快捷支付、银行卡支付分别是怎么样进行支付的?
    【深度解析】第三方支付的分类、接口与支付流程
    去外包公司的伙伴们小心了!——亲身经历,数数外包公司的坑
    一个tomcat下部署多个项目或一个服务器部署多个tomcat
    tomcat部署web应用的4种方法以及部署多个应用
    datatables增删改查的实现
    基于SpringMVC+Bootstrap+DataTables实现表格服务端分页、模糊查询
  • 原文地址:https://www.cnblogs.com/zhouzetian/p/12161609.html
Copyright © 2011-2022 走看看