zoukankan      html  css  js  c++  java
  • 团队项目冲刺第三天

    数据已经取出,然后是对数据的一个清洗

    其中中文文本的分类

    需要分词

    还需要进行停用词的去除

    以防对特征值的抽取造成过大影响

    分词是把文本切分成一个个词语,以便随后进行特征抽取

    # encoding=utf-8
    # Walk a directory tree and process every file with ProsessofWords.
    # `imp` was removed in Python 3.12; importlib.reload is its replacement.
    from importlib import reload
    import jieba
    import os
    import numpy as np
    import sys

    # NOTE(review): reloading sys looks like a Python 2 leftover (it used to
    # precede sys.setdefaultencoding); presumably it can be dropped -- confirm.
    reload(sys)


    def EnumPathFiles(path, callback, stop_words_list):
        """Invoke ``callback`` once for every file found under ``path``.

        Args:
            path: Root directory to traverse.
            callback: Callable invoked as
                ``callback(directory, filename, stop_words_list)`` for each file.
            stop_words_list: Passed through unchanged to ``callback``.

        Returns:
            None. If ``path`` is not an existing directory, an error message is
            printed and nothing is traversed.
        """
        if not os.path.isdir(path):
            print('Error:"', path, '" is not a directory or does not exist.')
            return

        # os.walk already descends into every subdirectory, so recursing into
        # each entry of `dirs` as well would hand nested files to the callback
        # more than once.
        for root, dirs, files in os.walk(path):
            for d in dirs:
                print(d)
            for f in files:
                callback(root, f, stop_words_list)


    def ProsessofWords(textpath, stop_words_list):
        """Segment a UTF-8 text file with jieba and rewrite it in place.

        The file is read in full, tokenized with ``jieba.cut(text,
        cut_all=False)``, and overwritten with the surviving tokens, each
        followed by a single space. Tokens that appear in ``stop_words_list``
        and tokens equal to a single space are dropped.

        Args:
            textpath: Path of the file to read and then overwrite.
            stop_words_list: Iterable of stop-word strings to filter out.
        """
        with open(textpath, 'r', encoding='utf-8') as f:
            text = f.read()

        # Build the lookup set once so each token test is O(1), not a list scan.
        stop_words = frozenset(stop_words_list)
        kept = [
            word
            for word in jieba.cut(text, cut_all=False)
            if word not in stop_words and word != ' '
        ]
        outstr = ''.join(word + ' ' for word in kept)

        # The context manager closes the handle even if the write fails.
        with open(textpath, 'w', encoding='utf-8') as f:
            f.write(outstr)


    def callback1(path, filename, stop_words_list):
        """Process one file found during the directory walk.

        Joins ``path`` and ``filename``, prints the resulting path, and passes
        it to ``ProsessofWords``.

        Args:
            path: Directory containing the file.
            filename: Name of the file inside ``path``.
            stop_words_list: Stop words forwarded to ``ProsessofWords``.
        """
        # os.path.join replaces the hard-coded backslash: the original literal
        # '\' was unterminated, and a backslash separator is Windows-only.
        textpath = os.path.join(path, filename)
        print(textpath)
        ProsessofWords(textpath, stop_words_list)


    if __name__ == '__main__':
        # Load the stop-word list: one word per line, blank lines ignored.
        stopwords_file = "../stopword/stopword.txt"
        # The context manager closes the file even if reading or decoding fails.
        with open(stopwords_file, "r", encoding='utf-8') as stop_f:
            stripped_lines = (line.strip() for line in stop_f)
            stop_words = [word for word in stripped_lines if word]
        print(len(stop_words))

        # Segment and filter every file under ../article, in place.
        EnumPathFiles(r'../article', callback1, stop_words)
  • 相关阅读:
    HDU 2089 不要62
    HDU 5038 Grade(分级)
    FZU 2105 Digits Count(位数计算)
    FZU 2218 Simple String Problem(简单字符串问题)
    FZU 2221 RunningMan(跑男)
    FZU 2216 The Longest Straight(最长直道)
    FZU 2212 Super Mobile Charger(超级充电宝)
    FZU 2219 StarCraft(星际争霸)
    FZU 2213 Common Tangents(公切线)
    FZU 2215 Simple Polynomial Problem(简单多项式问题)
  • 原文地址:https://www.cnblogs.com/wang2232985989/p/14908610.html
Copyright © 2011-2022 走看看