  • Another classic piece of Python code:

    #!/usr/bin/env python
    #encoding=utf-8
    import redis,codecs,sys,time,datetime,doctest
    reload(sys)
    sys.setdefaultencoding('utf-8')
    class Unbuffered:
        """Wrap a stream so that every write is flushed immediately."""
        def __init__(self, stream):
            self.stream = stream

        def write(self, data):
            self.stream.write(data)
            self.stream.flush()

        def __getattr__(self, attr):
            return getattr(self.stream, attr)

    sys.stdout = Unbuffered(sys.stdout)

    def read_keys():
        # dump every key in redis db 6 to query_keys.txt
        r=redis.Redis(host='localhost',db=6)
        keys=r.keys()
        print len(keys)
        f=codecs.open("query_keys.txt","w","utf-8")
        #print r.info()
        for key in keys:
            print key
            #print type(key)
            f.write("%s\n"%(key,))
        f.close()

    def read_relevent_words():
        # dump the value (relevant words) of every key to query_relevent_words.txt
        r=redis.Redis(host='localhost',db=6)
        keys=r.keys()
        print len(keys)
        f=codecs.open("query_relevent_words.txt","w","utf-8")
        for key in keys:
            #print r.get(key)
            f.write("%s\n"%(r.get(key),))
        f.close()

    def parser_one_line_one_words():
        ff=codecs.open("parser_one_line_one_words.txt","w","utf-8")
        f=codecs.open("query_relevent_words.txt","r","utf-8")
        for line in f.readlines():
            li=line.strip().split("*")
            for elem in li:
                ff.write("%s\n"%(elem,))
        ff.close()


    def parser_one_line_one_words2():
        s=set()
        ff=codecs.open("parser_one_line_one_words.txt","w","utf-8")
        f=codecs.open("query_relevent_words.txt","r","utf-8")
        for line in f.readlines():
            li=line.strip().split("*")
            for elem in li:
                s.add(elem.strip())
                ff.write("%s\n"%(elem,))
        ff.close()
        print len(s)

    def compare_pareser_one_line_one_words_result_lost_line_for_tmp():
        f1=codecs.open("parser_one_line_one_words_uniq.txt","r","utf-8")
        f2=codecs.open("parser_one_line_one_words_uniq_result.txt","r","utf-8")
        count=0
        for a,b in zip(f1.readlines(),f2.readlines()):
            count+=1
            if a.strip()!=b.replace(" ","").strip():
                print count,a,b
                time.sleep(5)   

    def build_invert_index():
        """
        对wname建倒排索引
        以set结构存放倒排数据
        """
        r=redis.Redis(db=1)
        p=r.pipeline()
        count=0
        #for line in codecs.open("../result_text.txt","r","utf-8").readlines():
        for line in codecs.open("../output_result_process","r","utf-8").readlines():
            count+=1
            #if count<2553148:
            #    continue
            #print count
            #print line,
            #print line.strip().split(" ").__len__()
            for elem in line.strip().split(" "):
                p.sadd(elem.strip(),count)
            if count%10000==0:
                print count
                print "batch insert to redis ..."
                s=datetime.datetime.now()
                p.execute()
                e=datetime.datetime.now()
                print "done:%s"%((e-s).seconds)
        p.execute()
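
    # --- Added sketch (not in the original script): querying the index ---
    # Each token key in db=1 holds the set of line numbers containing that
    # token, so intersecting the sets of several tokens yields the lines that
    # contain all of them. The function name is hypothetical.
    def query_invert_index(tokens):
        r=redis.Redis(db=1)
        keys=[t.strip() for t in tokens if t.strip()!=""]
        return r.sinter(*keys) if keys else set()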



    def is_chinese(uchar):
        """判断一个unicode是否是汉字"""
        if uchar >= u'\u4e00' and uchar<=u'\u9fa5':
            return True
        else:
            return False
       
    def is_number(uchar):
        """判断一个unicode是否是数字"""
        if uchar >= u'\u0030' and uchar<=u'\u0039':
            return True
        else:
            return False
       
    def is_alphabet(uchar):
        """判断一个unicode是否是英文字母"""
        if (uchar >= u'\u0041' and uchar<=u'\u005a') or (uchar >= u'\u0061' and uchar<=u'\u007a'):
            return True
        else:
            return False

    def is_other(uchar):
        """Return True if the char is neither Chinese, a digit, nor a letter."""
        if not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar)):
            return True
        else:
            return False
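
    # Added demo (not in the original script): classify a few sample
    # characters with the helpers above; the expected output has True
    # running down the diagonal.
    def _demo_char_classes():
        for ch in [u'\u4e2d',u'7',u'x',u'-']:
            print repr(ch),is_chinese(ch),is_number(ch),is_alphabet(ch),is_other(ch)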

    def _filter(line):
        """
        Keep only the tokens of a segmented wname line that are Chinese
        characters, letters or digits; drop everything else.
        """
        r=[]
        for elem in line.strip().split(" "):
            element=elem.strip()
            if not isinstance(element,unicode):
                element=element.decode("utf-8","ignore")
            # is_other() compares the whole token lexicographically, so it
            # effectively classifies the token by its leading character.
            if not is_other(element):
                r.append(element)
        return " ".join(r)

    def post_process_wname_segments_illegal_characters():
        f=codecs.open("../output_result_process","w","utf-8")
        for line in codecs.open("../output_result","r","utf-8").readlines():
            s=_filter(line)
            print s
            f.write(_filter(line)+"\n")
        f.close()

    def build_word_segments_hash_map():
        """
        Build a hash map (db=2) from each original query / related word
        to its word-segmentation result.
        """
        r2=redis.Redis(db=2)
        p=r2.pipeline()

        def load_pairs(words_file,segments_file):
            f1=codecs.open(words_file,"r","utf-8")
            f2=codecs.open(segments_file,"r","utf-8")
            count=0
            for a,b in zip(f1.readlines(),f2.readlines()):
                count+=1
                p.set(a.strip(),b.strip())
                if count%10000==0:
                    print count
                    print "batch insert to redis ..."
                    s=datetime.datetime.now()
                    p.execute()
                    e=datetime.datetime.now()
                    print "done:%s"%((e-s).seconds)
            p.execute()

        #load_pairs("parser_one_line_one_words_uniq.txt","parser_one_line_one_words_uniq_result.txt")
        load_pairs("parser_one_line_one_words_uniq.txt","parser_one_line_one_words_uniq_result_pku.txt")
        #load_pairs("query_keys.txt","query_keys_result.txt")
        load_pairs("query_keys.txt","query_keys_result_pku.txt")
        r2.bgsave()

    def _build_list_for_inter_args(s1,s2):
        """
        Combine two segmented strings into one token list to pass to
        r.sinter, dropping empty tokens.
        """
        r=[]
        r.extend(s1.split(" "))
        r.extend(s2.split(" "))
        return [elem.strip() for elem in r if elem.strip()!=""]
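
    # Worked example (added, with illustrative values):
    #   _build_list_for_inter_args(u"浓缩 咖啡",u"咖啡")
    #   -> [u'浓缩', u'咖啡', u'咖啡']
    # Duplicate tokens are harmless: SINTER just intersects the same set twice.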

    def final_find_synomns_out():
        """
        For every (query, related word) pair, treat the pair as synonyms
        if the inverted-index sets of their combined segmented tokens have
        a non-empty intersection, and write it to synomns_pku.txt.
        """
        #f=codecs.open("synomns.txt","w","utf-8")
        f=codecs.open("synomns_pku.txt","w","utf-8")
        r1=redis.Redis(db=1)
        r2=redis.Redis(db=2)
        f1=codecs.open("query_keys.txt","r","utf-8")
        f2=codecs.open("query_relevent_words.txt","r","utf-8")
        count=0
        validateCount=0
        for a,b in zip(f1.readlines(),f2.readlines()):
            count+=1
            #print count
            query_segments=r2.get(a.strip())
            for elem in b.split("*"):
                if elem.strip()=="":
                    continue
                if r1.sinter(*_build_list_for_inter_args(query_segments,r2.get(elem.strip()))).__len__()>0:
                    validateCount+=1
                    if validateCount%1000==0:
                        print "validateCount:%s\n"%validateCount
                    f.write("%s|||%s\n"%(a.strip(),elem.strip()))
                    f.flush()
        f.close()
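
    # Worked example of the criterion above (illustrative data): if the query
    # u"浓缩咖啡" segments to u"浓缩 咖啡" and the related word u"咖啡" segments
    # to u"咖啡", the pair is kept when SINTER over the db=1 sets of all three
    # tokens is non-empty, i.e. some wname line contains both 浓缩 and 咖啡.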

    def interactive_mode():
        r1=redis.Redis(db=1)
        r2=redis.Redis(db=2)
        while True:
            line=raw_input("input query|||relevent_word:\n")
            a,b=line.strip().split("|||")
            query_segments=r2.get(a.strip())
            print a.strip(),"==>",query_segments
            print b.strip(),"==>",r2.get(b.strip())
            print r1.sinter(*_build_list_for_inter_args(query_segments,r2.get(b.strip())))
            print "========="

    def c1(line):
        """
        Either side of the pair still contains a space (multiple tokens).
        >>> c1("执手|||把手")
        False
        """
        a,b=line.strip().split("|||")
        return len(a.split(" "))>1 or len(b.split(" "))>1

    def c2(line):
        """
        One side of the pair is a substring of the other.
        >>> c2("执手|||把手")
        False

        >>> c2("浓缩咖啡|||咖啡")
        True
        """
        a,b=line.strip().split("|||")
        return (a in b) or (b in a)

    def filter_synonym_result():
        """
        Filter synomns_pku.txt (pairs whose pku-segmented query and related
        word share inverted-index postings), dropping pairs that:
        - contain a space (c1)
        - are substrings of each other (c2)
        """
       
        f=codecs.open("synomns_pku_filter.txt","w","utf-8")
        for line in codecs.open("synomns_pku.txt","r","utf-8").readlines():
            if c1(line)==False and c2(line)==False:
                f.write(line)
        f.close()
       
               
           
    if __name__=="__main__":
    #    doctest.testmod()
    #    read_relevent_words()
    #    parser_one_line_one_words2()
    #    compare_pareser_one_line_one_words_result_lost_line_for_tmp()
    #    build_invert_index()
    #    build_word_segments_hash_map()
    #    final_find_synomns_out()   
    #    interactive_mode()
    #    print _filter("龟 鹿 补 肾丸 4.5 g*12 袋 水 蜜丸 / 盒 [ 补 肾 失眠 体弱 疲乏 壮 阳 ]")
    #    print _filter("龟 牌 ( turtle ) 硬壳 防水 全效 蜡 g-223r")
    #    post_process_wname_segments_illegal_characters()
        filter_synonym_result()   

  • Original post: https://www.cnblogs.com/lexus/p/2762808.html