zoukankan      html  css  js  c++  java
  • Python 遍历 Hadoop 数据,与指定列表对比,取出包含列表中值的记录。

    import sys
    import tstree
    
    fname = 'high_freq_site.list'
    tree = tstree.TernarySearchTrie()
    tree.loadData(fname)


    def _emit(token, counter, post):
        """Print the aggregated row for one url group, if it matches the list.

        Output is tab-separated: matched list entry, url, summed count,
        smallest posttime seen for that url. Nothing is printed when the url
        matches no list entry or when no posttime was collected.
        """
        ret = tree.maxMatch(token)
        if ret and post:
            print('%s\t%s\t%s\t%s' % (ret, token, counter, min(post)))


    token = ''      # url of the group currently being accumulated
    counter = 0     # running sum of the count column for `token`
    post = []       # every posttime seen for `token`

    # Each input line holds three whitespace-separated fields:
    #   url, count, posttime
    # Rows are grouped by consecutive equal urls, so the input is presumably
    # sorted by url (e.g. by the streaming shuffle) -- confirm with the job setup.
    for line in sys.stdin:
        arr = line.strip().split()
        if len(arr) != 3:
            continue  # skip malformed records

        url = arr[0]
        num = int(arr[1])
        posttime = int(arr[2])

        if token == url:
            counter += num
            post.append(posttime)
            continue

        # The url changed: flush the finished group (a no-op before the first
        # record, because `post` is still empty) and start the new one.
        _emit(token, counter, post)
        token = url
        counter = num
        # Seed the new group with this record's posttime. Resetting to an
        # empty list here would drop the first posttime of every group and
        # silently suppress groups that consist of a single record.
        post = [posttime]

    # Flush the last group.
    _emit(token, counter, post)
    
    
    
    class TSTNode(object):
        """One node of a ternary search trie.

        Stores the character this node splits on, an optional payload, and
        links to the three child nodes (lower, equal, higher).
        """

        def __init__(self, splitchar):
            # A fresh node has no children and carries no payload yet.
            self.loNode = self.eqNode = self.hiNode = None
            self.data = None
            self.splitchar = splitchar
    
    
    class TernarySearchTrie(object):
        """Ternary search trie used for longest-prefix lookups.

        Words are inserted character by character; the node reached by a
        word's last character carries the word itself in ``data``.
        ``maxMatch`` then returns the longest stored word that is a prefix of
        the queried string.
        """

        def __init__(self):
            # Empty trie: the root is created lazily by the first addWord().
            self.rootNode = None

        def loadData(self, fname):
            """Insert every non-blank line of the file `fname` as a word.

            Each line is stripped of surrounding whitespace before insertion;
            lines that are empty after stripping are ignored.
            """
            # `with` guarantees the file is closed even if insertion raises.
            with open(fname) as f:
                for line in f:
                    line = line.strip()
                    node = self.addWord(line)
                    if node is not None:
                        node.data = line

        def addWord(self, word):
            """Insert `word` and return the node of its last character.

            Returns None when `word` is empty. The caller is responsible for
            setting ``data`` on the returned node to mark the word's end.
            """
            if not word:
                return None

            charIndex = 0
            if not self.rootNode:
                self.rootNode = TSTNode(word[0])

            currentNode = self.rootNode

            while True:
                charComp = ord(word[charIndex]) - ord(currentNode.splitchar)
                if charComp == 0:
                    # Character matched: advance in the word and follow eqNode.
                    charIndex += 1
                    if charIndex == len(word):
                        return currentNode
                    if not currentNode.eqNode:
                        currentNode.eqNode = TSTNode(word[charIndex])
                    currentNode = currentNode.eqNode
                elif charComp < 0:
                    # Smaller character: stay on the same word position, go low.
                    if not currentNode.loNode:
                        currentNode.loNode = TSTNode(word[charIndex])
                    currentNode = currentNode.loNode
                else:
                    # Larger character: stay on the same word position, go high.
                    if not currentNode.hiNode:
                        currentNode.hiNode = TSTNode(word[charIndex])
                    currentNode = currentNode.hiNode

        def maxMatch(self, url):
            """Return the longest stored word that is a prefix of `url`.

            Returns None when no stored word is a prefix of `url` (including
            when the trie or `url` is empty).
            """
            ret = None
            currentNode = self.rootNode
            charIndex = 0
            while currentNode:
                if charIndex >= len(url):
                    break
                charComp = ord(url[charIndex]) - ord(currentNode.splitchar)
                if charComp == 0:
                    charIndex += 1
                    # A node with data ends a stored word; remember the
                    # deepest one seen so far.
                    if currentNode.data:
                        ret = currentNode.data
                    if charIndex == len(url):
                        return ret
                    currentNode = currentNode.eqNode
                elif charComp < 0:
                    currentNode = currentNode.loNode
                else:
                    currentNode = currentNode.hiNode
            return ret
    
    
    if __name__ == '__main__':
        # Manual check: load the site list, then print the longest matching
        # list entry (or None) for every line read from standard input.
        import sys

        tree = TernarySearchTrie()
        tree.loadData('high_freq_site.list')

        for raw in sys.stdin:
            print(tree.maxMatch(raw.strip()))
  • 相关阅读:
    Linux关闭防火墙和selinux
    Linux内存VSS,RSS,PSS,USS解析
    JS 将有父子关系的数组转换成树形结构数据
    npm install报错类似于npm WARN tar ENOENT: no such file or directory, open '***\node_modules\.staging\***'
    react-native之文件上传下载
    Markdown语法简记
    MySQL运维开发
    股票投资
    数据仓库原理与实战
    python基础
  • 原文地址:https://www.cnblogs.com/i80386/p/5058584.html
Copyright © 2011-2022 走看看