zoukankan      html  css  js  c++  java
  • python jieba分词小说与词频统计

    1、知识点

    """
    1)cut()
        a) codecs.open() 解决编码问题
        b) f.readline() 每次读取一行;也可以使用 f.readlines() 一次性读取所有行(返回list列表)
        c) words =" ".join(jieba.cut(line))分词,每个词用空格分隔
    2)lcut()
        返回一个list列表
    """

    2、标点符号处理,并分词,存储到文件中

    def fenCi(src="深渊主宰系统.txt", dst="seg.txt",
              punctuation=",。!?、;:“”‘’()《》…"):
        """
        Segment a UTF-8 text file with jieba and store the result in a file.

        Every input line is cut into words joined by single spaces, ASCII
        ellipses ("...") and the characters in ``punctuation`` are removed,
        and the cleaned line is written to ``dst`` without its trailing
        newline.  Lines that start with '-' or '.', or that are shorter than
        10 characters after cleaning, are skipped.

        Expects ``jieba`` to be importable in the enclosing module scope.

        :param src: path of the text file to segment.
        :param dst: path of the output file; it is overwritten.
        :param punctuation: single characters deleted from each segmented line.
        :return: None
        """
        # NOTE(review): apart from "...", the literals of the original
        # replace() chain are not recoverable from this copy of the code.
        # The default ``punctuation`` is a best-effort set of common
        # full-width marks -- confirm it against the intended list.
        strip_table = str.maketrans("", "", punctuation)

        # Built-in open() normalises "\r\n" to "\n", so the strip("\n") below
        # also works for files with Windows line endings; the context manager
        # guarantees the output is flushed and both handles are closed.
        with open(src, "r", encoding="utf-8") as reader, \
                open(dst, "w", encoding="utf-8") as writer:
            for line in reader:
                words = " ".join(jieba.cut(line.strip(" ")))
                words = words.replace("...", "").translate(strip_table)
                words = words.strip(" ")
                # Drop separator lines ("----", "....") and fragments too
                # short to be real text (this also covers blank lines).
                if words.startswith(("-", ".")) or len(words) < 10:
                    continue
                writer.write(words.strip("\n"))

    3、中文分词统计

    def zhongwen(path="深渊主宰系统.txt", top_n=15,
                 punctuation=",。!?、;:“”‘’()《》…"):
        """
        Print the most frequent Chinese words (two characters or longer).

        The whole file is segmented with ``jieba.lcut``; each token has ASCII
        ellipses ("...") and the characters in ``punctuation`` removed and is
        stripped of surrounding spaces and newlines.  Tokens that end up empty
        or one character long are ignored.  The ``top_n`` most frequent words
        are printed in descending order of count; if there are fewer distinct
        words, all of them are printed.

        Expects ``jieba`` to be importable in the enclosing module scope.

        :param path: path of the UTF-8 text file to analyse.
        :param top_n: how many of the most frequent words to print.
        :param punctuation: single characters deleted from every token.
        :return: None
        """
        # NOTE(review): apart from "...", the literals of the original
        # replace() chain are not recoverable from this copy of the code.
        # The default ``punctuation`` is a best-effort set of common
        # full-width marks -- confirm it against the intended list.
        strip_table = str.maketrans("", "", punctuation)

        with open(path, "r", encoding="utf-8") as reader:
            text = reader.read()

        counts = {}
        for token in jieba.lcut(text):
            word = token.replace("...", "").translate(strip_table)
            word = word.strip(" ").strip("\n")
            if len(word) <= 1:
                continue
            counts[word] = counts.get(word, 0) + 1  # tally the word

        # Stable sort: words with equal counts keep first-seen order.
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        # Slicing instead of indexing avoids an IndexError on short texts.
        for word, counter in ranked[:top_n]:
            print("单词:{},次数:{}".format(word, counter))

    4、英文分词统计

    def get_txt(path="1.txt"):
        """
        Read a UTF-8 text file and return it lower-cased with punctuation blanked.

        Each character of the separator set below is replaced by one space so
        that a later ``str.split()`` yields bare words.

        :param path: file to read; defaults to "1.txt" in the working directory.
        :return: the normalised text as a single string.
        """
        # '\\' is one literal backslash; the remaining characters are taken
        # verbatim (including the typographic quote U+2018).
        separators = '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~'
        to_space = str.maketrans(dict.fromkeys(separators, " "))
        with open(path, "r", encoding="UTF-8") as reader:
            # One translate() pass replaces the per-character replace() loop.
            return reader.read().lower().translate(to_space)
    
    def yingwen(top_n=5):
        """
        Print the most frequent English words of the text from ``get_txt()``.

        The normalised text is split on whitespace, one-character words are
        ignored, and the ``top_n`` most frequent words are printed with their
        counts.  If the text has fewer distinct words, all of them are
        printed.

        :param top_n: how many of the most frequent words to print.
        :return: None
        """
        from collections import Counter

        words = get_txt().split()  # split the text into a list of words
        counts = Counter(word for word in words if len(word) != 1)

        # most_common() lists equal counts in first-seen order and simply
        # returns fewer entries when there are not enough distinct words.
        for word, count in counts.most_common(top_n):
            print("{0:<5}->{1:>5}".format(word, count))
  • 相关阅读:
    绘制八卦阵
    绘制奥运五环
    绘制渐变的圆
    实验报告
    大学排名
    第一条爬虫
    自己的第一个网页
    科学计算与可视化
    类和正则表达(自动更正 代数运算)
    预测比赛
  • 原文地址:https://www.cnblogs.com/ywjfx/p/11003872.html
Copyright © 2011-2022 走看看