zoukankan      html  css  js  c++  java
  • jieba分词及词频统计小项目

    import pandas as pd
    import jieba
    import jieba.analyse
    from collections import Counter,OrderedDict
    jieba.load_userdict('./userdict.txt')  # load the external user dictionary
    
    
    def stopwordslist(filepath):
        """Read a stop-word file and return its entries as a list.

        Args:
            filepath: Path to a UTF-8 encoded text file, one stop word per line.

        Returns:
            A list holding one whitespace-stripped string per line of the
            file, in file order (blank lines yield empty strings).
        """
        # The context manager guarantees the handle is closed, even when
        # reading or decoding fails part-way through the file.
        with open(filepath, 'r', encoding='utf-8') as stopword_file:
            return [line.strip() for line in stopword_file]
    
    
    # Stop-word sets keyed by file path, so the file is parsed only once even
    # though text_cut is invoked for every row of a DataFrame column.
    _STOPWORDS_CACHE = {}


    def _cached_stopwords(filepath):
        """Return the stop words in ``filepath`` as a frozenset, loading once."""
        if filepath not in _STOPWORDS_CACHE:
            _STOPWORDS_CACHE[filepath] = frozenset(stopwordslist(filepath))
        return _STOPWORDS_CACHE[filepath]


    def text_cut(text1):
        """Extract up to six keywords from a text and join them with commas.

        Keywords come from ``jieba.analyse.extract_tags`` restricted to the
        part-of-speech tags listed below; single-character keywords and stop
        words are dropped afterwards.

        Args:
            text1: The text to analyse. Non-string values (for example the NaN
                that pandas produces for an empty Excel cell) and blank
                strings are treated as "no text".

        Returns:
            The surviving keywords joined by ``','``, or ``''`` when there is
            no text or no keyword survives the filtering.
        """
        # Guard first: jieba cannot decode non-string input such as float NaN.
        if not isinstance(text1, str) or not text1.strip():
            return ''
        stopwords = _cached_stopwords('./stop_words.txt')  # path of the stop-word file
        words = jieba.analyse.extract_tags(
            text1,
            topK=6,
            withWeight=False,
            allowPOS=('ns', 'n', 'vn', 'v', 'm', 'q'),
        )
        santi_words = [x for x in words if len(x) > 1 and x not in stopwords]
        return ','.join(santi_words)
    
    
    def cut_term():
        """Add a keyword column to the source sheet and save it as a new workbook.

        Reads sheet 'Sheet3' of './xxx.xlsx', runs ``text_cut`` over every value
        of the '合并' column, stores the results in a new 'term' column, prints
        the first rows, and writes the frame to './Q2-xxxx_new2.xlsx' without
        the index.
        """
        frame = pd.read_excel('./xxx.xlsx', sheet_name='Sheet3')

        keywords = frame['合并'].apply(text_cut)
        frame['term'] = keywords

        preview = frame.head()
        print(preview)

        frame.to_excel('./Q2-xxxx_new2.xlsx', index=False)
    
    
    def make_count(data):
        """Write one worksheet of term frequencies per industry to an Excel file.

        For each industry in the hard-coded list, the rows of ``data`` whose
        '一xxx' column equals that industry are selected, their comma-separated
        'term' strings are split into individual terms, and the terms are
        counted. Each industry gets its own sheet (named after the industry)
        with the columns '<industry>-词' and '频率', ordered by descending
        frequency. The number of terms found for each industry is printed.

        Args:
            data: DataFrame containing the columns '一xxx' and 'term'.

        Returns:
            None. The result is written to './Q2分行业分词结果11.xlsx'.
        """
        all_industry = ['xxxx']
        # The context manager saves and closes the workbook even if one of the
        # industries fails while being processed.
        with pd.ExcelWriter('./Q2分行业分词结果11.xlsx', engine='xlsxwriter') as writer:
            for industry in all_industry:
                cells = data[data['一xxx'] == industry]['term'].tolist()
                terms = []
                for cell in cells:
                    # Empty 'term' cells come back from Excel as NaN (a float),
                    # which has no split(); skip anything that is not text.
                    if not isinstance(cell, str):
                        continue
                    # Drop empty pieces so '' is never counted as a word.
                    terms.extend(term for term in cell.split(',') if term)
                print(len(terms))
                # most_common() yields (term, count) pairs, highest count first.
                ranked = Counter(terms).most_common()
                df = pd.DataFrame(ranked, columns=[f'{industry}-词', '频率'])
                df.to_excel(writer, sheet_name=industry, index=False)
    
    
    # Script entry: load 'Sheet1' of the input workbook and pass it to make_count.
    data = pd.read_excel('./xxxxxx.xlsx', sheet_name='Sheet1')

    make_count(data)
  • 相关阅读:
    二分法查找
    全排列 递归实现 c 语言实现
    南阳oj 题目290 动物统计加强版 字典树
    蛇形填数
    南阳理工oj 题目289 苹果 01背包
    南阳理工 oj 题目38 布线问题
    南阳理工oj 题目85 有趣的数 Cantor数表
    CSU-1110 RMQ with Shifts (单点更新+区间最小值 zkw线段树)
    POJ-2387 Til the Cows Come Home
    HDU-2680 Choose the best route
  • 原文地址:https://www.cnblogs.com/Erick-L/p/11177107.html
Copyright © 2011-2022 走看看