zoukankan      html  css  js  c++  java
  • python3-对拉勾网数据爬取及简单的数据分析

    #encoding:utf-8
    import json
    import os
    import re
    import time

    import matplotlib.pyplot as plt
    import requests

    class LaGouAnsialy:
        """Collect job postings from lagou.com and chart their salary distribution.

        The workflow is: ``ansaly_data`` pages through the search API and dumps
        the education / salary / work-experience fields of every posting into
        ``OUTPUT_FILE``; ``create_report`` reads that file back and draws a pie
        chart of the salary buckets; ``main`` runs both steps in order.
        """

        # File the scraped records are written to and the report is read from.
        OUTPUT_FILE = "lagou.json"
        # Pages 1..MAX_PAGES are requested unless an empty page is hit first.
        MAX_PAGES = 49
        # Pause between two page requests, in seconds.
        PAGE_DELAY_SECONDS = 20
        # Give up on a request that has not answered within this many seconds.
        REQUEST_TIMEOUT_SECONDS = 10
        # Inclusive upper edges (in "k") of the salary buckets; anything above
        # the last edge lands in one extra, final bucket.
        SALARY_BOUNDS = (8, 12, 15, 18, 22, 30)
        # Matches a number immediately followed by "k"/"K", e.g. "25k".
        _SALARY_NUMBER = re.compile(r"(\d+)\s*[kK]")

        def __init__(self):
            """Set up the request headers, endpoint, search terms and result list."""
            # NOTE(review): the headers (including the session cookie) were
            # captured from a browser session and are sent verbatim.
            # "Content-Length" is hard-coded to the body size of a one-digit
            # page number — presumably requests recomputes it; confirm.
            self.headers = {
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                "Connection": "keep-alive",
                "Content-Length": "25",
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "Cookie": "JSESSIONID=ABAAABAABEEAAJA3893CB27253239CD99CA00B5B714A93D; WEBTJ-ID=20200102153439-16f652d85a5430-089bd0ca7e1d01-3a65420e-2073600-16f652d85a69d0; _ga=GA1.2.1705075091.1577950480; _gid=GA1.2.613899177.1577950480; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1577950480; user_trace_token=20200102153440-55dd897c-2d32-11ea-b0f6-525400f775ce; LGUID=20200102153440-55dd8ebd-2d32-11ea-b0f6-525400f775ce; TG-TRACK-CODE=index_search; X_MIDDLE_TOKEN=7064071e9d874446822efc2a3b85cc31; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216f65612f75770-0a06d44f49f747-3a65420e-2073600-16f65612f769d9%22%2C%22%24device_id%22%3A%2216f65612f75770-0a06d44f49f747-3a65420e-2073600-16f65612f769d9%22%7D; index_location_city=%E6%B7%B1%E5%9C%B3; X_HTTP_TOKEN=eba94a1ed2839078190020875166bed98c35c75552; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1578020091; LGSID=20200103105451-69586484-2dd4-11ea-a70a-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Futrack%2FtrackMid.html%3Ff%3Dhttps%253A%252F%252Fwww.lagou.com%252Fjobs%252Flist%255Fpython%252Fp-city%255F215%253Fpx%253Ddefault%26t%3D1578020088%26_ti%3D2; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%2Fp-city_215%3Fpx%3Ddefault; LGRID=20200103105451-695865ff-2dd4-11ea-a70a-5254005c3644; SEARCH_ID=51bf9f11cda9465093fe6165ce281bb2",
                "Host": "www.lagou.com",
                "Origin": "https://www.lagou.com",
                "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
                "X-Anit-Forge-Code": "0",
                "X-Anit-Forge-Token": "None",
                "X-Requested-With": "XMLHttpRequest",
            }
            self.request_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
            self.search = "python"  # keyword sent as the "kd" form field
            self.city = "深圳"  # city sent as the "city" query parameter
            self.datas = []  # one dict per scraped posting

        def send_request(self, page):
            """POST the search form for result page *page* and return the response.

            :param page: page number, sent as the ``pn`` form field.
            :return: the ``requests.Response`` of the call.
            """
            param = {"needAddtionalResult": False, "city": self.city, "px": "default"}
            data = {"first": True, "pn": page, "kd": self.search}
            # A timeout keeps a stalled connection from blocking the run forever.
            return requests.post(
                url=self.request_url,
                params=param,
                data=data,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )

        def ansaly_data(self):
            """Fetch every result page and dump the collected fields to OUTPUT_FILE.

            Paging stops at the first page without results or after MAX_PAGES
            pages. Each posting contributes its ``education``, ``salary`` and
            ``workYear`` values to ``self.datas``.

            :raises KeyError: when a response lacks the
                ``content -> positionResult -> result`` path or a posting lacks
                one of the three fields.
            """
            for page in range(1, self.MAX_PAGES + 1):
                resp = self.send_request(page)
                # Decode the body once and reuse it for the check and the loop.
                positions = resp.json()["content"]["positionResult"]["result"]
                if not positions:
                    break
                for position in positions:
                    self.datas.append({
                        '学历': position['education'],
                        '薪水': position['salary'],
                        '工作经验': position['workYear'],
                    })
                # Pause once per page before the next request.
                # NOTE(review): the original indentation was lost; the sleep is
                # assumed to belong to the page loop, not the per-posting loop.
                time.sleep(self.PAGE_DELAY_SECONDS)
            # The dump keeps non-ASCII text as-is, so pin the encoding instead
            # of relying on the platform default.
            with open(self.OUTPUT_FILE, "w", encoding="utf-8") as f:
                json.dump(self.datas, f, ensure_ascii=False)
            print("解析完成")

        @classmethod
        def _salary_upper_bound(cls, salary):
            """Return the last "<number>k" value in *salary* as int, or None."""
            numbers = cls._SALARY_NUMBER.findall(str(salary))
            return int(numbers[-1]) if numbers else None

        @classmethod
        def _salary_distribution(cls, salaries):
            """Count *salaries* per bucket of SALARY_BOUNDS (plus one overflow bucket)."""
            counts = [0] * (len(cls.SALARY_BOUNDS) + 1)
            for salary in salaries:
                upper = cls._salary_upper_bound(salary)
                if upper is None:
                    # Value without a recognizable "<number>k" part: skip it.
                    continue
                # The bucket index equals the number of edges strictly below
                # the value, which keeps every edge inclusive on its upper side.
                counts[sum(upper > bound for bound in cls.SALARY_BOUNDS)] += 1
            return counts

        def create_report(self):
            """Read OUTPUT_FILE and show a pie chart of the salary buckets.

            Each posting is assigned to a bucket by the upper end of its salary
            range. Prints a message and returns without plotting when no salary
            could be parsed.
            """
            # Set the font to avoid garbled Chinese characters in the figure.
            plt.rcParams['font.sans-serif'] = ['SimHei']
            with open(self.OUTPUT_FILE, encoding="utf-8") as f:
                records = json.load(f)
            data = self._salary_distribution(record.get('薪水') for record in records)
            if not any(data):
                print("没有可用的薪资数据")
                return
            plt.figure(1, dpi=100)
            plt.pie(
                data,  # value of each wedge; normalized into percentages
                explode=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1],  # offset of each wedge from the center
                colors=['y', 'r', 'g', '#89e8e1', '#69e8a1', "#98e8e1", "#46e8e2"],  # wedge colors
                labels=['<=8K', '8-12k', '12-15k', '15-18k', '18-22k', "22-30k", ">=30k"],  # wedge labels
                labeldistance=1.1,  # distance of the labels from the center
                autopct='%1.1f%%',  # format of the percentage text
                pctdistance=0.6,  # distance of the percentage text from the center
                shadow=False,  # whether wedges cast a shadow
                startangle=90,  # rotate the start of the pie by 90 degrees
                radius=1.2,  # pie radius
            )
            plt.show()

        def main(self):
            """Scrape the data, then draw the report if the output file exists."""
            self.ansaly_data()
            if os.path.exists(os.path.join(os.getcwd(), self.OUTPUT_FILE)):
                self.create_report()
            else:
                print("json文件未生成")
  • 相关阅读:
    HYSBZ 3813 奇数国
    HYSBZ 4419 发微博
    HYSBZ 1079 着色方案
    HYSBZ 3506 排序机械臂
    HYSBZ 3224 Tyvj 1728 普通平衡树
    Unity 3D,地形属性
    nginx 的naginx 种包含include关键字
    Redis 出现NOAUTH Authentication required解决方案
    mysql 8.0出现 Public Key Retrieval is not allowed
    修改jar包里的源码时候需要注意的问题
  • 原文地址:https://www.cnblogs.com/zhouzetian/p/12161609.html
Copyright © 2011-2022 走看看