zoukankan      html  css  js  c++  java
  • process data

    # version 1.0
    def connect_mysql(sql, oper_type="select", data_l=None):
        """Run one SQL statement against the local ``work`` MySQL database.

        A new connection is opened for every call and is always closed again,
        even when the statement raises.

        NOTE(review): the connection parameters (including the password) are
        hard-coded here; consider moving them to configuration.

        :param sql: SQL text to execute.
        :param oper_type: ``"insert"`` runs ``executemany(sql, data_l)`` and
            commits; any other value runs ``execute(sql)`` and fetches all rows.
        :param data_l: sequence of parameter tuples, used only for ``"insert"``.
        :return: the rows from ``fetchall()`` for queries; an empty tuple for
            ``"insert"`` (the original code raised ``UnboundLocalError`` there
            because no result was ever assigned).
        """
        conn = pymysql.connect(host='localhost', user="root", password="123",
                               database="work", port=3306)
        try:
            cur = conn.cursor()
            if oper_type == "insert":
                cur.executemany(sql, data_l)
                conn.commit()
                # Nothing to fetch after a bulk insert.
                return ()
            cur.execute(sql)
            return cur.fetchall()
        finally:
            # Release the connection on success and on failure alike.
            conn.close()


    def process_jobs(field_name):
        """Count how often each distinct value of one column occurs.

        :param field_name: column of ``personal_jobs`` to aggregate. It is
            concatenated into the SQL text, so it must come from trusted code.
        :return: ``(row_total, category_dict)`` where ``row_total`` is the
            number of fetched rows and ``category_dict`` maps each distinct row
            (as returned by ``connect_mysql``) to its number of occurrences.
        """
        from collections import Counter

        sql = "select j." + field_name + " FROM personal_jobs j"
        column_name = connect_mysql(sql, oper_type="select")
        row_total = len(column_name)

        # Single pass instead of rescanning every row once per category.
        category_dict = dict(Counter(column_name))

        print(type(category_dict.items()), category_dict.items())
        print(row_total, len(category_dict.items()))
        return row_total, category_dict


    process_jobs("job_salary")
    # version 1.1
    def count_times(all_list):
        """Return ``[[item, occurrences], ...]`` for *all_list*, sorted ascending.

        :param all_list: iterable of hashable, mutually comparable items.
        :return: a sorted list of two-element lists ``[item, count]``.
        """
        from collections import Counter

        # One counting pass instead of calling list.count() per distinct item.
        return sorted([item, times] for item, times in Counter(all_list).items())


    def _parse_salary_range(row):
        """Return ``(low, high)`` in K for a value like ``"10K-15K"``, else None.

        *row* may be a result row (tuple/list whose first element is the text)
        or the text itself.
        """
        import re

        value = row[0] if isinstance(row, (tuple, list)) and row else row
        found = re.match(r"\s*(\d+)\s*[Kk]\s*-\s*(\d+)", str(value))
        if found is None:
            return None
        return int(found.group(1)), int(found.group(2))


    def process_salary(field_name):
        """Summarise salary ranges of entry-level jobs.

        Rows are read from ``work.personal_jobs`` where ``job_exp`` is
        '1年以内' or '经验不限'.

        :param field_name: column to select; it is concatenated into the SQL
            text, so it must come from trusted code. Lower/upper bounds are
            only collected when this is ``"job_salary"``.
        :return: ``(min_sal, avg_sal, max_sal)``; each is a list of
            ``[label, count]`` pairs sorted by numeric value, where the label
            is the amount followed by ``"K"``. ``min_sal``/``max_sal`` count
            over the *distinct* salary ranges, ``avg_sal`` counts over all
            rows. Values that do not look like ``"<low>K-<high>K"`` are
            skipped (the original code raised ``ValueError`` on them).
        """
        # sql = "select " + field_name + " from work.personal_jobs where job_exp = '1-3年';"
        sql = "select " + field_name + " from work.personal_jobs where job_exp = '1年以内' or job_exp = '经验不限';"
        original_sal = connect_mysql(sql)

        if field_name == "job_exp":
            # Debug aid: dump the raw rows once rather than once per category.
            print(original_sal)

        general_min, general_max = [], []
        if field_name == "job_salary":
            for sal in set(original_sal):
                bounds = _parse_salary_range(sal)
                if bounds is None:
                    continue
                general_min.append(bounds[0])
                general_max.append(bounds[1])

        # Sort numerically first, then attach the "K" suffix.
        min_sal = [[str(low) + "K", times] for low, times in count_times(general_min)]
        max_sal = [[str(high) + "K", times] for high, times in count_times(general_max)]

        # Average of each range, weighted by how many rows carry that range.
        per_range = count_times(original_sal)
        print("original: ", per_range)
        averages = []
        for row, times in per_range:
            bounds = _parse_salary_range(row)
            if bounds is None:
                continue
            averages.append([(bounds[0] + bounds[1]) / 2.0, times])
        avg_sal = [[str(avg) + "K", times] for avg, times in sorted(averages)]

        # debug
        print(len(min_sal), min_sal)
        print(len(avg_sal), avg_sal)
        print(len(max_sal), max_sal)
        return min_sal, avg_sal, max_sal


    # process_salary("job_salary")
    import jieba
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    from collections import Counter
    from scipy.misc import imread
    
    def process_reqirement(field_name):
        """Count the words used in the requirement text of entry-level jobs.

        Rows are read from ``work.personal_jobs`` where ``job_exp`` is
        '1年以内' or '经验不限', segmented with jieba, and counted. Counts are
        merged case-insensitively, so "Python" and "python" share one total.

        :param field_name: column(s) to select; concatenated into the SQL
            text, so it must come from trusted code.
        :return: a list of ``[word, count]`` pairs: first one entry per
            distinct token found in the text, then one entry per term of the
            built-in technology dictionary that occurs in the text. Every
            count is the case-insensitive total for that word.
        """
        sql = "select " + field_name + " from work.personal_jobs where job_exp = '1年以内' or job_exp = '经验不限';"
        original_req = connect_mysql(sql)
        userdict = ["C", "C#", "C++", "Go", "Linux", "MongoDB", "Mysql", "PostgreSQL", "Ajax", "Bootstrap", "CSS", "Django", "Docker", "Flask", "Git", "http", "tcp", "Java", "JavaScript", "Jquery", "Oracle", "Python", "Redis", "Ruby", "Scrapy", "shell", "Tornado", "Web", "Zabbix", "RESTful", "云计算", "分布式", "前端", "后端", "大数据", "高并发", "数据分析", "数据挖掘", "机器学习", "爬虫", "算法", "自动化", "运维", "集群"]

        # Register the terms one by one; load_userdict() is meant for a
        # dictionary file, not for an in-memory list.
        for term in userdict:
            jieba.add_word(term)

        # Segment the column values themselves, not the repr() of the result
        # tuple, so quotes, parentheses and commas are not counted as words.
        text = "\n".join(
            str(col) for row in original_req for col in row if col is not None
        )
        text0 = Counter(jieba.cut(text))

        # Case-insensitive totals. The original nested loop matched every
        # token against itself first and therefore doubled its count.
        folded = Counter()
        for token, times in text0.items():
            folded[str(token).lower()] += times

        # find requirement item what we really need
        req_list = [[token, folded[str(token).lower()]] for token in text0]
        print(len(req_list), req_list)

        for t in userdict:
            total = folded.get(t.lower(), 0)
            if total:
                req_list.append([t, total])
        return req_list
    process_reqirement("job_requirement")
    def user_defined(file_name):
        """Read a UTF-8 text file and return its lines, whitespace-stripped.

        :param file_name: path of the file to read.
        :return: one list entry per line, in file order; blank lines are kept
            as empty strings.
        """
        with open(file_name, "r", encoding="utf8") as handle:
            return [line.strip() for line in handle]
    
    def process_company(field_name):
        """Score how well each job's requirements match a personal skill list.

        For every row, the requirement text (second selected column) is
        segmented with jieba and reduced to the known technology keywords it
        mentions. The score is the percentage of those keywords that also
        appear in the personal skill list. Matching is case-insensitive: the
        original compared the lowercase skills against mixed-case keywords,
        so "python", "django", "linux", "mysql" and "oracle" never matched.

        :param field_name: comma-separated columns to select; the first is
            used as the label and the second as the requirement text. It is
            concatenated into the SQL text, so it must come from trusted code.
        :return: a list of ``[label, percent]`` pairs sorted by ascending
            percent; rows with no known keyword or no matching skill are
            omitted.
        """
        sql = "select " + field_name + " from work.personal_jobs"
        company = [list(i) for i in connect_mysql(sql)]
        # The keyword list is built in; the former read of "t.txt" was
        # overwritten immediately and only added a way to fail.
        user_list = ['C','C#','C++','Go','Linux','MongoDB','Mysql','PostgreSQL','Ajax','Bootstrap','CSS','Django','Docker','Flask','Git','http','tcp','Java','JavaScript','Jquery','Oracle','Python','Redis','Ruby','Scrapy','shell','Tornado','Web','RESTful','云计算','分布式','前端','后端','大数据','高并发','数据分析','数据挖掘','机器学习','爬虫','算法','自动化','测试','运维','集群']
        # Register the terms one by one; load_userdict() is meant for a
        # dictionary file, not for an in-memory list.
        for word in user_list:
            jieba.add_word(word)
        me_list = ['python', 'django', 'linux', '运维', '自动化', '爬虫', '数据分析', 'shell', 'mysql', 'oracle']

        known = {word.lower() for word in user_list}
        mine = {skill.lower() for skill in me_list}

        suit_list = []
        for req in company:
            if not req[1]:
                # No requirement text (NULL or empty): nothing to score.
                continue
            wanted = {token.lower() for token in jieba.cut(req[1])} & known
            if not wanted:
                continue
            own = wanted & mine
            if own:
                suit_list.append([req[0], int(len(own) * 100 / len(wanted))])
        return sorted(suit_list, key=lambda x: x[1])
        # print(sorted(suit_list, key=lambda x: x[1]))
    process_company("company_name, job_requirement")
  • 相关阅读:
    java.lang.Object中的方法
    lyt经典版MySQL基础——进阶3:排序查询
    lyt经典版MySQL基础——进阶5:分组查询
    lyt经典版MySQL基础——DML语言-数据的插入、修改、删除操作
    lyt经典版MySQL基础——进阶8:联合查询
    lyt经典版MySQL基础——进阶7:子查询
    lyt经典版MySQL基础——进阶6:连接查询-sql99语法-内连接、外连接、交叉连接
    lyt经典版MySQL基础——进阶6:连接查询-sql92语法-内连接
    lyt经典版MySQL基础——进阶4:常见函数-分组函数
    lyt经典版MySQL基础——进阶2:条件查询
  • 原文地址:https://www.cnblogs.com/vickey-wu/p/8480227.html
Copyright © 2011-2022 走看看