结巴分词 Python 脚本

# -*- coding: utf-8 -*-  2018-04-17 想要用绝佳的编程技术实现一些事情
import jieba
import os
def savefile(savepath,content):
    """Write ``content`` to the file at ``savepath`` as UTF-8 text.

    The file is opened in ``'w'`` mode, so it is created if missing and
    any previous contents are replaced.

    Args:
        savepath: Path of the file to write.
        content: Text to store in the file.
    """
    with open(savepath, mode='w', encoding="utf-8") as out_file:
        out_file.write(content)

def readfile(path):
    """Return the full contents of the file at ``path`` decoded as UTF-8.

    Args:
        path: Path of the text file to read.

    Returns:
        The file's entire text as a single string.
    """
    with open(path, mode='r', encoding="utf-8") as src:
        return src.read()
# Batch word segmentation: every file under before_path/<category>/ is read,
# segmented with jieba, and written to after_path/<category>/ under the same
# file name, with the tokens joined by single spaces.

# Root of the raw corpus; each entry below it is expected to be a category directory.
before_path = "D:/fenglongyu_SoftWare/data/train/"
# Root of the segmented corpus; category directories are created below it as needed.
after_path = "D:/fenglongyu_SoftWare/data/textC/"
catelist = os.listdir(before_path)  # names of the entries directly under before_path
print(catelist)
for mydir in catelist:
    class_path = before_path+mydir+r"/"  # path of the source category directory
    after_dir = after_path+mydir+r"/"  # path of the matching output directory
    # os.listdir() also returns plain files; listing one as a directory would
    # raise, so anything that is not a directory is skipped.
    if not os.path.isdir(class_path):
        continue
    # exist_ok=True creates the directory only when missing, without the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(after_dir, exist_ok=True)
    file_list = os.listdir(class_path)
    for file_path in file_list:
        fullname = class_path+file_path  # full path of the source file
        # Nested directories cannot be read as text; skip them instead of crashing.
        if not os.path.isfile(fullname):
            continue
        # Read the file and trim leading/trailing whitespace only; interior
        # newlines and spaces are left untouched.
        content = readfile(fullname).strip()
        # jieba.cut produces the segmented tokens, joined below with spaces.
        content_seg = jieba.cut(content)
        savefile(after_dir+file_path," ".join(content_seg))
print("分词结束！！")

查看全文

相关阅读:
【邀请函】小投入大产出—微软智能云(Azure)之CDN 专题
 Azure镜像市场再下一城，中标软件入驻开启Azure国产操作系统时代
 15分钟完成基于Azure公有云搭建远程测试环境
 独家秘笈！教你解锁移动应用新技能
 “剁手节”来了，红包你抢到了吗？
Azure 11 月新公布
 面对故宫万千珍宝，升哲科技如何做到“朕知道了”
高斯-克吕格投影
 cad定制快捷键
 matlab之scatter3()与plot3()函数

原文地址：https://www.cnblogs.com/fenglongyu/p/8876387.html