zoukankan      html  css  js  c++  java
  • python实现指定目录下批量文件的单词计数:串行版本


           直接上代码。

     

           练习目标:

                 1.  使用 Python 面向对象的方法封装逻辑和表达 ;

                 2.  使用异常处理和日志API ;

                 3.  使用文件目录读写API ; 

                 4.  使用 list, map, tuple 三种数据结构 ;

                 5.  lambda 、正则使用及其它。


           下一篇将实现并发版本。  

           

    #-------------------------------------------------------------------------------
    # Name:        wordstat_serial.py
    # Purpose:     count word occurrences in the .java files under a given directory (serial version)
    #
    # Author:      qin.shuq
    #
    # Created:     08/10/2014
    # Copyright:   (c) qin.shuq 2014
    # Licence:     <your licence>
    #-------------------------------------------------------------------------------
    
    import re
    import os
    import time
    import logging
    
    # Maps the level names used by this script to the ``logging`` module's
    # numeric level constants.
    LOG_LEVELS = {
        'DEBUG': logging.DEBUG, 'INFO': logging.INFO,
        'WARN': logging.WARNING, 'ERROR': logging.ERROR,
        'CRITICAL': logging.CRITICAL
    }

    def initlog(filename) :
        """Return a logger that writes INFO-and-above records to *filename*.

        A distinct named logger is used per file.  ``logging.getLogger()``
        with no argument always returns the single root logger, so calling
        it for two different files would attach both file handlers to the
        same object and every record would be written to both files.

        Calling this function again with the same *filename* returns the
        same logger without attaching a second handler.

        :param filename: path of the log file to append to.
        :returns: the configured ``logging.Logger``.
        """
        logger = logging.getLogger(filename)
        logger.setLevel(LOG_LEVELS['INFO'])

        # Only attach a handler the first time; otherwise repeated calls
        # would make every record appear several times in the file.
        if not logger.handlers:
            hdlr = logging.FileHandler(filename)
            hdlr.setFormatter(
                logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
            logger.addHandler(hdlr)

        return logger
    
    
    # Module-level loggers created at import time through initlog().
    # WordReading reports failed reads via errlog and successful reads via
    # infolog.
    errlog = initlog("error.log")
    infolog = initlog("info.log")
    
    class WordReading(object):
        """Reads the lines of every file in a list of file paths."""

        def __init__(self, fileList):
            """
            :param fileList: iterable of file paths to read.
            """
            self.fileList = fileList

        def readFileInternal(self, filename):
            """Return the lines of *filename* (line endings kept).

            A file that cannot be opened or read is reported through
            ``errlog`` and yields an empty list instead of raising, so one
            bad file does not abort the whole run.
            """
            try:
                # ``with`` closes the handle even if readlines() fails.
                with open(filename, 'r') as f:
                    lines = f.readlines()
            except IOError:
                errlog.error('file %s Not found \n' % filename)
                return []
            infolog.info('[successful read file %s]\n' % filename)
            return lines

        def readFile(self):
            """Return the lines of all files in ``fileList``, concatenated in order."""
            allLines = []
            for filename in self.fileList:
                allLines.extend(self.readFileInternal(filename))
            return allLines
    
    class WordAnalyzing(object):
        '''
        Counts how many times each word occurs in a list of text lines.

        A word is a maximal run of ``\\w`` characters (letters, digits and
        underscore).
        '''
        # Raw string so the backslash reaches the regex engine; the pattern
        # "[w]+" would only match runs of the letter "w".
        wordRegex = re.compile(r"\w+")
        def __init__(self, allLines):
            """
            :param allLines: iterable of line strings to analyse.
            """
            self.allLines = allLines

        def analyze(self):
            """Return a dict mapping each word to its number of occurrences.

            Lines are scanned one at a time rather than concatenated first,
            so a line without a trailing newline (e.g. the last line of a
            file) is never fused with the first word of the following line.
            """
            result = {}
            findWords = WordAnalyzing.wordRegex.findall
            for line in self.allLines:
                for word in findWords(line):
                    result[word] = result.get(word, 0) + 1
            return result
    
    class FileObtainer(object):
        """Collects the paths of all files below a directory, optionally filtered."""

        def __init__(self, dirpath, fileFilterFunc=None):
            """
            :param dirpath: root directory to walk recursively.
            :param fileFilterFunc: optional predicate taking a file path; only
                paths for which it returns a true value are kept.  ``None``
                keeps every file.
            """
            self.dirpath = dirpath
            self.fileFilterFunc = fileFilterFunc

        def findAllFilesInDir(self):
            """Return a list of paths of the files found under ``dirpath``.

            The result is always a real list (``filter`` would return a lazy
            iterator on Python 3), and paths are built with ``os.path.join``
            so the platform's separator is used.
            """
            files = []
            for path, dirs, filenames in os.walk(self.dirpath):
                for filename in filenames:
                    files.append(os.path.join(path, filename))

            if self.fileFilterFunc is None:
                return files
            return [f for f in files if self.fileFilterFunc(f)]
    
    class PostProcessing(object):
        """Ranks a word-count mapping and prints the most frequent words."""

        def __init__(self, resultMap):
            """
            :param resultMap: dict mapping word -> occurrence count.
            """
            self.resultMap = resultMap

        def sortByValue(self):
            """Return ``(word, count)`` pairs sorted by count, highest first."""
            return sorted(self.resultMap.items(),key=lambda e:e[1], reverse=True)

        def obtainTopN(self, topN):
            """Print the *topN* most frequent words, one per line.

            If fewer than *topN* words exist, all of them are printed; a
            non-positive *topN* prints nothing.
            """
            sortedResult = self.sortByValue()
            # max(..., 0) keeps a negative topN from being read as a
            # "count from the end" slice bound.
            for word, count in sortedResult[:max(topN, 0)]:
                # Single-argument print() yields the same text on Python 2
                # and Python 3.
                print('%s  counts:  %s' % (word, count))
    
    if __name__ == "__main__":

        def _timed(label, stage):
            """Run *stage*, print its wall-clock duration in ms, return its result."""
            begin = time.time()
            outcome = stage()
            # Single-argument print() works on both Python 2 and Python 3.
            print('%s cost:  %s ms' % (label, (time.time() - begin) * 1000))
            return outcome

        # Raw string: in a normal literal "\r" (in "\region_master") is a
        # carriage return and "\U" starts a unicode escape on Python 3.
        dirpath = r"c:\Users\qin.shuq\Desktop\region_master\src"

        # 1. collect the .java files below dirpath
        fileObtainer = FileObtainer(dirpath, lambda f: f.endswith('.java'))
        fileList = _timed('ObtainFile', fileObtainer.findAllFilesInDir)

        # 2. read every line of every file
        allLines = _timed('WordReading', WordReading(fileList).readFile)

        # 3. count word occurrences
        resultMap = _timed('WordAnalyzing', WordAnalyzing(allLines).analyze)

        # 4. print the 30 most frequent words
        _timed('PostProcessing', lambda: PostProcessing(resultMap).obtainTopN(30))
    
    
  • 相关阅读:
    Oracle中有大量的sniped会话
    Error 1130: Host '127.0.0.1' is not allowed to connect to this MySQL server
    汉字转换为拼音以及缩写(javascript)
    高效率随机删除数据(不重复)
    vs2010 舒服背景 优雅字体 配置
    mvc中的ViewData用到webfrom中去
    jquery ajax return值 没有返回 的解决方法
    zShowBox (图片放大展示jquery版 兼容性好)
    动感效果的TAB选项卡 jquery 插件
    loading 加载提示······
  • 原文地址:https://www.cnblogs.com/lovesqcc/p/4037664.html
Copyright © 2011-2022 走看看