zoukankan      html  css  js  c++  java
  • 拉勾网的起薪

    import requests
    import re
    import pyecharts
    
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
    res = requests.get("https://www.lagou.com/",headers = {"User-Agent":user_agent})
    # print(res.status_code)
    # print(res.text)
    
    data = re.search("<span>后端开发</span>([sS]*?)后端开发其它</a>",res.text).group(1)
    data = data.replace(" ","").replace("
    ","")
    urls = re.findall('<ahref="(.*?)"data-lg-tj-id="4O00"data-lg-tj-no="01dd"data-lg-tj-cid="idnull"class=".*?">(.*?)</a>',data)
    print(urls)
    
    
    def parser_job(i):
        # 工作的链接
        job_url = re.search('href="(.*?)"', i).group(1)
        name = re.search("<h3>(.*?)</h3>", i).group(1)
        add = re.search("<em>(.*?)</em>", i).group(1)
        money = re.search('"money">(.*?)</span>', i).group(1)
        jy, xl = re.search('-->([sS]*?)
    ', i).group(1).split(" / ")
    
        return {"name":name,"add":add,"job_url":job_url,"money":money,"jy":jy,"xl":xl}
    
    # 指定需要查看的技术方向
    jobs = ["Java","Python"]
    
    # 创建一个页面
    page = pyecharts.Page()
    for url in urls:
        if url[1] not in jobs:
            continue
    
        # 存储解析完成的字典
        datas = []
    
        for j in range(1,11):
            res = requests.get(url[0] + str(j))
            data = re.findall("""<a class="position_link"([sS]*?)<div class="company">""",res.text)
            for i in data:
                dic = parser_job(i)
                if "{" in dic["xl"]:
                    continue
                datas.append(dic)
        print(len(datas))
        # 统计每个起薪有多少个岗位
        count_dic = { }
        for d in datas:
            key = d["money"].split("-")[0]
            if key in count_dic:
                count_dic[key] += 1
            else:
                count_dic[key] =1
        print(url[1],count_dic)
    
        # 创建一个饼图
        pie = pyecharts.Pie()
        # 为饼图添加数据
        """
        标题
        keys
        values
        """
        pie.add(url[1],count_dic.keys(), count_dic.values())
        # 将图加到页面上
        page.add(pie)
    
    
    # 生成一个页面文件
    page.render("test.html")
    import requests
    import re
    import pyecharts
    
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
    res = requests.get("https://www.lagou.com/",headers = {"User-Agent":user_agent})
    # print(res.status_code)
    # print(res.text)
    
    data = re.search("<span>后端开发</span>([sS]*?)后端开发其它</a>",res.text).group(1)
    data = data.replace(" ","").replace("
    ","")
    urls = re.findall('<ahref="(.*?)"data-lg-tj-id="4O00"data-lg-tj-no="01dd"data-lg-tj-cid="idnull"class=".*?">(.*?)</a>',data)
    print(urls)
    
    
    def parser_job(i):
        # 工作的链接
        job_url = re.search('href="(.*?)"', i).group(1)
        name = re.search("<h3>(.*?)</h3>", i).group(1)
        add = re.search("<em>(.*?)</em>", i).group(1)
        money = re.search('"money">(.*?)</span>', i).group(1)
        jy, xl = re.search('-->([sS]*?)
    ', i).group(1).split(" / ")
    
        return {"name":name,"add":add,"job_url":job_url,"money":money,"jy":jy,"xl":xl}
    
    # 指定需要查看的技术方向
    jobs = ["Java","Python"]
    
    # 创建一个页面
    page = pyecharts.Page()
    for url in urls:
        if url[1] not in jobs:
            continue
    
        # 存储解析完成的字典
        datas = []
    
        for j in range(1,11):
            res = requests.get(url[0] + str(j))
            data = re.findall("""<a class="position_link"([sS]*?)<div class="company">""",res.text)
            for i in data:
                dic = parser_job(i)
                if "{" in dic["xl"]:
                    continue
                datas.append(dic)
        print(len(datas))
        # 统计每个起薪有多少个岗位
        count_dic = { }
        for d in datas:
            key = d["money"].split("-")[0]
            if key in count_dic:
                count_dic[key] += 1
            else:
                count_dic[key] =1
        print(url[1],count_dic)
    
        # 创建一个饼图
        pie = pyecharts.Pie()
        # 为饼图添加数据
        """
        标题
        keys
        values
        """
        pie.add(url[1],count_dic.keys(), count_dic.values())
        # 将图加到页面上
        page.add(pie)
    
    
    # 生成一个页面文件
    page.render("test.html")
  • 相关阅读:
    MongoDB+Lucence.net
    hubble+sqlserver
    C# 设计模式 1 接口模式 1.1 适配器模式 IT
    SQLServer2005 中 XML类型方法中 XQuery中变量的参数化匆忙整理 IT
    DoNET 类库设计准则01 名称规则 IT
    GMRES在matlab中的描述
    矩阵良态与病态
    调试vc++的一点感悟
    基于GramSchmidt正交法的广义极小残量法(GMRES)
    VC6 vs2003 vs2005 使用技巧(转)
  • 原文地址:https://www.cnblogs.com/xuecaichang/p/10487888.html
Copyright © 2011-2022 走看看