仅仅从网上爬下数据当然是不够的,更重要的是对数据进行分析与展示。大部分人都看重薪资,但是薪资数据的单位并不统一:有的是*k/月,有的是*万/月,还有*万/年等等,所以需要先对数据进行清洗。
先将所有薪资换算成统一单位,然后按薪资范围分类,再计算各个范围内的职位数量,最后绘图展示。
import re
from bisect import bisect_left
from collections import Counter

import numpy as np
import pymysql
from pyecharts import Bar
from pyecharts import Pie


class Mysqlhelper(object):
    """Small helper that runs a query against the configured MySQL database."""

    # Keyword arguments passed unchanged to pymysql.connect().
    config = {
        "host": "localhost",
        "user": "root",
        "password": "123456",
        "db": "test",
        "charset": "utf8",
    }

    def __init__(self):
        self.connection = None
        self.cursor = None

    def getlist(self, sql, *args):
        """Run *sql* with the positional *args* and return every row.

        A new connection is opened for the call and closed afterwards.
        Returns the result of ``cursor.fetchall()``; if anything raises, the
        error is printed and ``None`` is returned.
        """
        try:
            self.connection = pymysql.connect(**Mysqlhelper.config)
            self.cursor = self.connection.cursor()
            self.cursor.execute(sql, args)
            return self.cursor.fetchall()
        except Exception as ex:
            print(ex)
        finally:
            self.close()

    def close(self):
        """Close the cursor and connection, if any, and forget them."""
        # Drop the references first so that a later failed connect does not
        # try to close a stale, already-closed handle from an earlier call.
        cursor, connection = self.cursor, self.connection
        self.cursor = self.connection = None
        try:
            if cursor:
                cursor.close()
        finally:
            if connection:
                connection.close()


# Upper bound (inclusive) of each salary bucket; also used as chart labels.
SALARY_BUCKET_LIMITS = [5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000]

# Multiplier for each unit character that may follow the salary number.
_UNIT_FACTORS = {"万": 10000, "千": 1000}

# Matches "<low>-<high><unit>/月" and also a single "<value><unit>/月".
_MONTHLY_SALARY = re.compile(
    r"^(\d+(?:\.\d+)?)(?:-(\d+(?:\.\d+)?))?([万千])/月$"
)

# Labels counted by the optional district analysis.
DISTRICT_NAMES = [
    '北京', '异地招聘', '海淀区', '朝阳区', '丰台区', '昌平区', '东城区', '延庆区',
    '房山区', '通州区', '顺义区', '大兴区', '怀柔区', '西城区', '平谷区', '门头沟区',
]


def parse_monthly_salary(text):
    """Return the midpoint of a monthly salary range, or None.

    Only monthly salaries expressed in 万 or 千 are converted, e.g.
    ``"1-1.5万/月"`` gives ``12500.0``.  Yearly and daily salaries, open-ended
    ranges, empty strings and ``None`` all yield ``None`` and are ignored by
    the caller.
    """
    if not isinstance(text, str):
        return None
    match = _MONTHLY_SALARY.match(text.strip())
    if match is None:
        return None
    low_text, high_text, unit = match.groups()
    factor = _UNIT_FACTORS[unit]
    low = float(low_text) * factor
    # A single value such as "1万/月" is treated as a zero-width range.
    high = low if high_text is None else float(high_text) * factor
    return (low + high) / 2


def count_by_bucket(salaries, limits=SALARY_BUCKET_LIMITS):
    """Count salaries per bucket.

    Bucket ``k`` holds values in ``(limits[k-1], limits[k]]``, with the first
    bucket starting just above zero.  Returns ``(counts, skipped)`` where
    *skipped* is the number of values that fit no bucket.
    """
    counts = [0] * len(limits)
    skipped = 0
    for value in salaries:
        index = bisect_left(limits, value)
        if value <= 0 or index == len(limits):
            skipped += 1
        else:
            counts[index] += 1
    return counts, skipped


def count_by_district(rows):
    """Return job counts aligned with DISTRICT_NAMES.

    ``row[3]`` is treated as the location text: the part after the first
    ``-`` is used when present, otherwise the whole value.
    """
    tally = Counter()
    for row in rows:
        if not row[3]:
            continue
        parts = row[3].split('-')
        tally[parts[0] if len(parts) == 1 else parts[1]] += 1
    return [tally[name] for name in DISTRICT_NAMES]


def render_district_charts(rows):
    """Render the per-district pie and bar charts.

    Not called by default; call it from ``main()`` with the fetched rows to
    produce the district charts as well.
    """
    counts = count_by_district(rows)
    print(DISTRICT_NAMES)
    print(counts)

    pie = Pie()
    pie.add("", DISTRICT_NAMES, counts, is_label_show=True)
    pie.render('北京各区Python职位占比饼状图.html')

    bar = Bar('北京各区职位数量')
    bar.add("数量", DISTRICT_NAMES, counts)
    bar.render('北京各区Python职位占比柱状图.html')


def main():
    """Load jobs from ``t_job`` and chart the monthly salary distribution."""
    helper = Mysqlhelper()
    # getlist() returns None when the query fails; treat that as "no rows".
    rows = helper.getlist("select * from t_job") or ()

    # row[4] is treated as the salary text.
    parsed = (parse_monthly_salary(row[4]) for row in rows)
    salaries = [value for value in parsed if value is not None]
    if not salaries:
        print("no monthly salary data found")
        return

    counts, skipped = count_by_bucket(salaries)
    print(min(salaries))
    print(max(salaries))
    print(SALARY_BUCKET_LIMITS)  # x axis
    print(counts)  # number of jobs per bucket
    if skipped:
        print("salaries outside the charted range:", skipped)

    bar = Bar('Python平均工资')
    bar.add("月薪", SALARY_BUCKET_LIMITS, counts)
    bar.render('Python工资柱状图.html')

    pie = Pie()
    pie.add("", SALARY_BUCKET_LIMITS, counts, is_label_show=True)
    pie.render('Python工资饼状图.html')


if __name__ == "__main__":
    main()
前面写的是数据库操作类,其实可以把它单独封装成一个py文件,以后使用时直接导入调用即可。
结果如下:
我也分析了boss直聘网站的一些数据,例如经验要求和学历要求等等,大家也可以按同样的方法分析自己想要的数据。
from collections import Counter
from os import path

import jieba
import numpy as np
import pymysql
from pyecharts import Bar
from pyecharts import Pie


class Mysqlhelper(object):
    """Small helper that runs a query against the configured MySQL database."""

    # Keyword arguments passed unchanged to pymysql.connect().
    config = {
        "host": "localhost",
        "user": "root",
        "password": "123456",
        "db": "test",
        "charset": "utf8",
    }

    def __init__(self):
        self.connection = None
        self.cursor = None

    def getlist(self, sql, *args):
        """Run *sql* with the positional *args* and return every row.

        A new connection is opened for the call and closed afterwards.
        Returns the result of ``cursor.fetchall()``; if anything raises, the
        error is printed and ``None`` is returned.
        """
        try:
            self.connection = pymysql.connect(**Mysqlhelper.config)
            self.cursor = self.connection.cursor()
            self.cursor.execute(sql, args)
            return self.cursor.fetchall()
        except Exception as ex:
            print(ex)
        finally:
            self.close()

    def close(self):
        """Close the cursor and connection, if any, and forget them."""
        # Drop the references first so that a later failed connect does not
        # try to close a stale, already-closed handle from an earlier call.
        cursor, connection = self.cursor, self.connection
        self.cursor = self.connection = None
        try:
            if cursor:
                cursor.close()
        finally:
            if connection:
                connection.close()


# Chart labels; each count below is the number of rows whose value equals
# the label at the same position.
EXPERIENCE_LABELS = ['应届生', '1年以内', '1-3年', '3-5年', '5-10年', '经验不限']
EDUCATION_LABELS = ['大专', '本科', '硕士', '博士', '学历不限']


def count_labels(values, labels):
    """Count how often each label occurs in *values*.

    Returns ``(counts, unmatched)``: *counts* is a list aligned with
    *labels*, and *unmatched* is a ``Counter`` of the values that are not in
    *labels*, so nothing is silently filed under the wrong label.
    """
    tally = Counter(values)
    counts = [tally[label] for label in labels]
    unmatched = Counter(
        {value: n for value, n in tally.items() if value not in labels}
    )
    return counts, unmatched


def render_experience_charts(expcount):
    """Render the bar and pie charts for required work experience.

    Not called by default; call it from ``main()`` with the experience counts
    to produce these charts as well.
    """
    bar = Bar('工作年限')
    bar.add("要求", EXPERIENCE_LABELS, expcount)
    bar.render('工作年限柱状图.html')

    pie = Pie()
    pie.add("工作", EXPERIENCE_LABELS, expcount, is_label_show=True)
    pie.render('工作年限饼状图.html')


def top_experience_words(text_file="data//jingyan.txt",
                         user_dict="data//jingyan_dict.txt", limit=20):
    """Return the *limit* most common words of two or more characters.

    Optional jieba-based experiment, not called by default.  *text_file* is
    resolved relative to this script's directory; *user_dict* is passed to
    ``jieba.load_userdict`` as given.
    """
    base_dir = path.dirname(__file__)
    with open(path.join(base_dir, text_file), encoding='utf-8') as fp:
        text = fp.read()
    jieba.load_userdict(user_dict)
    words = [w for w in jieba.cut(text) if w != ',' and len(w) >= 2]
    return Counter(words).most_common(limit)


def main():
    """Load jobs from ``boss_job`` and chart the education requirements."""
    helper = Mysqlhelper()
    # getlist() returns None when the query fails; treat that as "no rows".
    rows = helper.getlist("select * from boss_job") or ()

    # row[5] is treated as the experience requirement, row[6] as education.
    exp = [row[5] for row in rows]
    edu = [row[6] for row in rows]
    print(len(exp))
    print(len(edu))

    expcount, exp_unmatched = count_labels(exp, EXPERIENCE_LABELS)
    educount, edu_unmatched = count_labels(edu, EDUCATION_LABELS)

    print(EXPERIENCE_LABELS)
    print(expcount)
    if exp_unmatched:
        print("experience values not charted:", dict(exp_unmatched))

    print(EDUCATION_LABELS)
    print(educount)
    if edu_unmatched:
        print("education values not charted:", dict(edu_unmatched))

    bar = Bar('学历要求')
    bar.add("学历", EDUCATION_LABELS, educount)
    bar.render('学历要求柱状图.html')

    pie = Pie()
    pie.add("学历", EDUCATION_LABELS, educount, is_label_show=True)
    pie.render('学历要求饼状图.html')


if __name__ == "__main__":
    main()
我使用的是最基本的数组(列表)统计方法,不知道有没有更简单的方法,例如借助jieba分词模块等等。
可以看出本科生的需求还是很大的……