zoukankan      html  css  js  c++  java
  • sortquery(转)

    以下代码用python编写。

    import os
    import os.path
    import operator
    import heapq
    
    """
    Sort users' queries by frequency.
    1. Hash each query and distribute the queries into 10 files (hash(query) % 10).
    2. Count the occurrences of each query in every file with a hash table, then sort each file by count.
    3. Merge the sorted files using the heap queue algorithm.
    """
    
    datadir  = "d:/querysort/data/"
    tempdir  = "d:/querysort/temp/"
    destfile = "d:/querysort/sorted.txt"
    
    def hashfiles():
        """Distribute every query line found under ``datadir`` into 10 bucket files.

        ``datadir`` is walked recursively and each file is read line by line.
        A line is appended to the bucket ``tempdir + str(hash(line) % 10)``, so
        within one run identical queries always land in the same bucket.
        ``tempdir`` is created when it does not exist, and the bucket files are
        opened in ``'w'`` mode, which discards any previous content.
        """
        if not os.path.exists(tempdir):
            os.makedirs(tempdir)

        fs = []
        try:
            for i in range(0, 10):
                fs.append(open(tempdir + str(i), 'w'))

            for parent, dirnames, filenames in os.walk(datadir):
                for filename in filenames:
                    with open(os.path.join(parent, filename), 'r') as f:
                        for query in f:
                            # The last line of a file may lack its newline.
                            # Without normalising it, the query would hash
                            # differently from its terminated twin and would be
                            # glued to the next query written to the bucket.
                            if not query.endswith("\n"):
                                query += "\n"
                            fs[hash(query) % 10].write(query)
        finally:
            # Close the buckets even when reading an input file fails.
            for f in fs:
                f.close()
         
                    
    def sortqueryinfile():
        """Rewrite each bucket file as ``count<TAB>query`` lines, most frequent first.

        For every one of the 10 files ``tempdir + str(i)`` the occurrences of
        each distinct line are counted, then the file is truncated and rewritten
        with one ``count<TAB>query`` line per distinct query.  Lines are ordered
        by descending count; equal counts are ordered by query text so that the
        result is deterministic.

        Returns ``None``; does nothing when ``tempdir`` does not exist.
        """
        if not os.path.exists(tempdir):
            return

        fs = []
        try:
            # Open every bucket up front so a missing file is reported before
            # any bucket has been rewritten.
            for i in range(0, 10):
                fs.append(open(tempdir + str(i), 'r+'))

            for f in fs:
                D = {}
                for query in f:
                    # Count by the query text without its line terminator, so
                    # that an unterminated final line is not treated as a
                    # separate query.
                    query = query.rstrip("\n")
                    D[query] = D.get(query, 0) + 1
                # items() exists on both Python 2 and 3 (iteritems() does not).
                sorted_D = sorted(D.items(),
                                  key=lambda item: (-item[1], item[0]))
                f.seek(0, 0)
                f.truncate()
                for query, count in sorted_D:
                    f.write(str(count) + "\t" + query + "\n")
        finally:
            for f in fs:
                f.close()
    
    def decorated_file(f):
        """Yield a ``(-count, query)`` tuple for each ``count<TAB>query`` line of *f*.

        The count is negated so that ascending tuple order puts the most
        frequent query first.  ``query`` is everything after the first tab,
        including the line terminator if the line has one.

        Raises ``ValueError`` when a line has no tab or the count is not an
        integer.
        """
        for line in f:
            # Split on the first tab only: the query text may itself contain
            # tabs, and a larger maxsplit would break the two-name unpacking.
            count, query = line.split('\t', 1)
            yield (-int(count), query)
    
    def mergefiles():
        """Merge the 10 sorted bucket files into ``destfile``.

        Each bucket ``tempdir + str(i)`` is read through ``decorated_file`` and
        the resulting streams are combined with ``heapq.merge``.  Only the query
        text of each merged entry is written to ``destfile``; the counts are
        dropped.

        Returns the number of lines written, or ``None`` when ``tempdir`` does
        not exist (in which case ``destfile`` is not touched).
        """
        if not os.path.exists(tempdir):
            return

        fs = []
        lines_written = 0
        try:
            # The buckets are only read here, so plain read mode is enough.
            for i in range(0, 10):
                fs.append(open(tempdir + str(i), 'r'))
            with open(destfile, "w") as f_dest:
                for line in heapq.merge(*[decorated_file(f) for f in fs]):
                    f_dest.write(line[1])
                    lines_written += 1
        finally:
            # Release every bucket handle, including on a parse or I/O error.
            for f in fs:
                f.close()
        return lines_written
    
         
    if __name__ == '__main__':
        # Run the three stages in order: bucket, count/sort per bucket, merge.
        hashfiles()
        sortqueryinfile()
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        # The two spaces before %s reproduce the output of the original
        # ``print "...: ", value`` statement.  The value is the number of lines
        # mergefiles() wrote, or None when it returned early.
        print("sorting completed, total queries:  %s" % (mergefiles(),))
  • 相关阅读:
    过河问题 贪心
    喷水装置2 贪心
    喷水装置 贪心算法
    大红数星星 图论 XD网络赛
    Bi-shoe and Phi-shoe 欧拉函数 素数
    c++ 打飞机游戏开发日志
    POJ 1129 Channel Allocation DFS 回溯
    POJ 2676 Sudoku
    LibreOJ #100. 矩阵乘法
    BZOJ 1009: [HNOI2008]GT考试
  • 原文地址:https://www.cnblogs.com/qq78292959/p/2771762.html
Copyright © 2011-2022 走看看