zoukankan      html  css  js  c++  java
  • python实现指定目录下批量文件的单词计数:串行版本


           直接上代码。

     

           练习目标:

                 1.  使用 Python 面向对象的方法封装逻辑和表达 ;

                 2.  使用异常处理和日志API ;

                 3.  使用文件目录读写API ; 

                 4.  使用 list, dict(映射), tuple 三种数据结构 ;

                 5.  lambda 、正则使用及其它。


           下一篇将实现并发版本。  

           

    #-------------------------------------------------------------------------------
    # Name:        wordstat_serial.py
    # Purpose:     count the words in the Java files under a given directory (serial version)
    #
    # Author:      qin.shuq
    #
    # Created:     08/10/2014
    # Copyright:   (c) qin.shuq 2014
    # Licence:     <your licence>
    #-------------------------------------------------------------------------------
    
    import re
    import os
    import time
    import logging
    
    # Maps the level names used by this script onto the numeric constants
    # defined by the standard logging module.
    LOG_LEVELS = dict(
        DEBUG=logging.DEBUG,
        INFO=logging.INFO,
        WARN=logging.WARNING,
        ERROR=logging.ERROR,
        CRITICAL=logging.CRITICAL
    )
    
    def initlog(filename, level='INFO'):
        '''
        Return a logger that writes formatted records to the file `filename`.

        Each distinct filename gets its own named logger, so records sent to
        one log file never show up in another. Calling this again with the
        same filename returns the same logger without adding a second handler.

        filename: path of the log file; it is opened as soon as the handler
                  is created.
        level:    key of LOG_LEVELS selecting the minimum level to record.
                  Raises KeyError if the name is not in LOG_LEVELS.
        '''
        # logging.getLogger() without a name always returns the shared root
        # logger; naming the logger after its file keeps the handlers apart.
        logger = logging.getLogger(filename)

        # Only attach a handler the first time, otherwise every repeated call
        # would duplicate each record in the file.
        if not logger.handlers:
            hdlr = logging.FileHandler(filename)
            formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
            hdlr.setFormatter(formatter)
            logger.addHandler(hdlr)

        # Do not forward records to ancestor loggers: their handlers would
        # write the same record a second time somewhere else.
        logger.propagate = False
        logger.setLevel(LOG_LEVELS[level])

        return logger
    
    
    # Module-level loggers used by WordReading: read failures are reported
    # through errlog and successful reads through infolog.
    errlog, infolog = initlog("error.log"), initlog("info.log")
    
    class WordReading(object):
        '''
        Read the lines of every file named in a list of paths.
        '''

        def __init__(self, fileList):
            # fileList: iterable of file paths, read in iteration order.
            self.fileList = fileList

        def readFileInternal(self, filename):
            '''
            Return the lines of `filename` as a list, or [] if it cannot be read.

            A successful read is recorded through infolog. An IOError raised
            while opening or reading is recorded through errlog and swallowed,
            so one unreadable file does not abort the whole run.
            '''
            try:
                # `with` closes the handle even if reading fails part-way.
                with open(filename, 'r') as f:
                    lines = f.readlines()
            except IOError:
                errlog.error('file %s Not found \n' % filename)
                return []

            infolog.info('[successful read file %s]\n' % filename)
            return lines

        def readFile(self):
            '''
            Return the lines of all files in fileList, concatenated in order.

            Files that cannot be read contribute no lines.
            '''
            allLines = []
            for filename in self.fileList:
                allLines.extend(self.readFileInternal(filename))
            return allLines
    
    class WordAnalyzing(object):
        '''
        Count how often each word occurs in a list of text lines.
        '''

        # A word is a maximal run of regex "word" characters (\w). The pattern
        # must be a raw string with the backslash: a bare "[w]+" would only
        # match runs of the letter "w".
        wordRegex = re.compile(r"\w+")

        def __init__(self, allLines):
            # allLines: list of strings to analyse; they are concatenated
            # without a separator before matching.
            self.allLines = allLines

        def analyze(self):
            '''
            Return a dict mapping each word to its number of occurrences.

            An empty input, or input without any word character, gives {}.
            '''
            result = {}
            lineContent = ''.join(self.allLines)
            for word in WordAnalyzing.wordRegex.findall(lineContent):
                result[word] = result.get(word, 0) + 1
            return result
    
    class FileObtainer(object):
        '''
        Collect the paths of all files found under a directory tree.
        '''

        def __init__(self, dirpath, fileFilterFunc=None):
            # dirpath: root directory handed to os.walk.
            # fileFilterFunc: optional predicate called with each file path;
            # only paths for which it returns a true value are kept. None
            # keeps every file.
            self.dirpath = dirpath
            self.fileFilterFunc = fileFilterFunc

        def findAllFilesInDir(self):
            '''
            Return a list with the path of every file below dirpath.

            Paths are built with os.path.join from the directory reported by
            os.walk, and appear in the order os.walk yields them. When a
            filter function was given, only the paths it accepts are returned.
            The result is always a list, with or without a filter.
            '''
            files = []
            for path, _dirs, filenames in os.walk(self.dirpath):
                files.extend(os.path.join(path, name) for name in filenames)

            if self.fileFilterFunc is None:
                return files
            # A comprehension (not filter()) so the result is a list on
            # Python 3 as well.
            return [f for f in files if self.fileFilterFunc(f)]
    
    class PostProcessing(object):
        '''
        Rank the entries of a word-count mapping and print the top ones.
        '''

        def __init__(self, resultMap):
            # resultMap: dict mapping each word to its count.
            self.resultMap = resultMap

        def sortByValue(self):
            '''
            Return the (word, count) pairs as a list, highest count first.
            '''
            return sorted(self.resultMap.items(), key=lambda e: e[1], reverse=True)

        def obtainTopN(self, topN):
            '''
            Print the topN most frequent words, one "<word>  counts:  <count>"
            line each.

            If topN exceeds the number of words, all words are printed; a
            topN of zero or less prints nothing. Returns None.
            '''
            # max(..., 0) keeps a negative topN from being read as a
            # "count from the end" slice bound.
            for word, count in self.sortByValue()[:max(topN, 0)]:
                # A single parenthesised argument is valid on Python 2 and 3
                # alike, and gives the same text as the statement form
                # `print word, ' counts: ', count`.
                print('%s  counts:  %s' % (word, count))
    
    if __name__ == "__main__":
        # Script entry point: find the .java files under a directory, read
        # them, count the words, print the 30 most frequent ones, and report
        # how long each stage took in milliseconds.
        import sys

        # The directory can be given as the first command-line argument;
        # otherwise the original hard-coded location is used. The default is
        # a raw string: in a normal literal the "\r" of "\region_master" is a
        # carriage return, and "\U" is rejected outright by Python 3.
        defaultDir = r"c:\Users\qin.shuq\Desktop\region_master\src"
        dirpath = sys.argv[1] if len(sys.argv) > 1 else defaultDir

        starttime = time.time()
        fileObtainer = FileObtainer(dirpath, lambda f: f.endswith('.java'))
        fileList = fileObtainer.findAllFilesInDir()
        endtime = time.time()
        print('ObtainFile cost:  %s ms' % ((endtime - starttime) * 1000))

        starttime = time.time()
        wr = WordReading(fileList)
        allLines = wr.readFile()
        endtime = time.time()
        print('WordReading cost:  %s ms' % ((endtime - starttime) * 1000))

        starttime = time.time()
        wa = WordAnalyzing(allLines)
        resultMap = wa.analyze()
        endtime = time.time()
        print('WordAnalyzing cost:  %s ms' % ((endtime - starttime) * 1000))

        # This stage's timing includes printing the top-30 list.
        starttime = time.time()
        postproc = PostProcessing(resultMap)
        postproc.obtainTopN(30)
        endtime = time.time()
        print('PostProcessing cost:  %s ms' % ((endtime - starttime) * 1000))
    
    
  • 相关阅读:
    HTML5 重力感应效果,实现摇一摇效果
    WEB 移动端 CSS3动画性能 优化
    jquery 插件封装模板
    textarea 提交到数据库的内容,输出到 html 中显示正常的格式
    js根据银行卡号判断属于哪个银行,并返回银行缩写及银行卡类型
    微信小程序如何引用iconfont图标
    nodejs: express basic
    javascript设计模式:适配器模式
    javascript设计模式:装饰者模式
    javascript设计模式:代理模式
  • 原文地址:https://www.cnblogs.com/lovesqcc/p/4037664.html
Copyright © 2011-2022 走看看