zoukankan      html  css  js  c++  java
  • python 部分数据处理代码

    # -*- coding:utf8 -*-
    import os
    import jieba.posseg as pseg
    # -*- coding:utf8 -*-
    import os
     
    def splitSentence(inputFile,name):
        """Segment and POS-tag every line of a UTF-8 text file with jieba.

        Each input line is stripped, decoded as UTF-8 (undecodable bytes are
        ignored) and passed to ``pseg.cut``.  The tokens of one line are
        written back-to-back as ``word/flag`` with no separator between them,
        and a single space is written after each processed line.

        Args:
            inputFile: path of the text file to read.
            name: file name of the result, created under '/home/xdj/target/'.
        """
        print(name)
        # Binary mode keeps the explicit decode/encode steps valid on both
        # Python 2 and Python 3; ``with`` closes both files even if the
        # segmentation raises part-way through.
        with open(inputFile, 'rb') as fin, open('/home/xdj/target/'+name, 'wb') as fout:
            for eachLine in fin:
                # Strip surrounding whitespace, then work on unicode text.
                line = eachLine.strip().decode('utf-8', 'ignore')
                # Dropping undecodable bytes can expose spaces at the ends.
                line = line.strip(' ')
                # One "word/flag" item per token produced by jieba.
                outStr = ''.join(word.word+'/'+word.flag for word in pseg.cut(line))
                fout.write(outStr.encode('utf-8'))
                fout.write(b' ')
     
    path='/media/软件/zhuomian/VARandLDA/xuejiesourse'

    # Collect the full path of every file found anywhere below `path`.
    fns = []
    for root, dirs, files in os.walk(path):
        for fn in files:
            fns.append(os.path.join(root, fn))

    # Tag each file; the output file is named after the file's running index.
    i = -1
    num = 0
    for i, f in enumerate(fns):
        print(f)
        strm = str(i)
        splitSentence(f, strm)
    print(num)

    # -*- coding:utf8 -*-
    import os
    import jieba.posseg as pseg
    # -*- coding:utf8 -*-
    import os
    
    def splitSentence(inputFile,name):
        """Segment and POS-tag every line of a UTF-8 text file with jieba.

        Each input line is stripped, decoded as UTF-8 (undecodable bytes are
        ignored) and passed to ``pseg.cut``.  The tokens of one line are
        written back-to-back as ``word/flag`` with no separator between them,
        and a newline is written after each processed line.

        Args:
            inputFile: path of the text file to read.
            name: file name of the result, created under '/home/xdj/target/'.
        """
        print(name)
        # Binary mode keeps the explicit decode/encode steps valid on both
        # Python 2 and Python 3; ``with`` closes both files even if the
        # segmentation raises part-way through.
        with open(inputFile, 'rb') as fin, open('/home/xdj/target/'+name, 'wb') as fout:
            for eachLine in fin:
                # Strip surrounding whitespace, then work on unicode text.
                line = eachLine.strip().decode('utf-8', 'ignore')
                # Remove any line breaks still left at either end.
                line = line.strip('\n')
                # One "word/flag" item per token produced by jieba.
                outStr = ''.join(word.word+'/'+word.flag for word in pseg.cut(line))
                fout.write(outStr.encode('utf-8'))
                fout.write(b'\n')
    
    path='/media/软件/zhuomian/VARandLDA/xuejiesourse'

    # Collect the full path of every file found anywhere below `path`.
    fns = []
    for root, dirs, files in os.walk(path):
        for fn in files:
            fns.append(os.path.join(root, fn))

    # Tag each file; the output file is named after the file's running index.
    i = -1
    num = 0
    for i, f in enumerate(fns):
        print(f)
        strm = str(i)
        splitSentence(f, strm)
    print(num)
  • 相关阅读:
    linux查看CPU性能及工作状态的指令mpstat,vmstat,iostat,sar,top
    Linux vmstat命令实战详解
    dstat 性能监测工具
    sysstat 工具
    Linux命令详解----iostat
    Linux CPU实时监控mpstat命令详解
    Linux Top 命令解析 比较详细
    Linux统计/监控工具SAR详细介绍
    ubuntu 添加用户到已存在的组
    Ubuntu 14.04 使用速度极快的Genymotion 取代蜗牛速度的原生AVD模拟器
  • 原文地址:https://www.cnblogs.com/XDJjy/p/5273276.html
Copyright © 2011-2022 走看看