zoukankan      html  css  js  c++  java
  • export.py

    #!/usr/bin/env python
    #encoding=utf-8
    import redis,codecs,sys,time,datetime,doctest,re
    # Python 2-only workaround: reload(sys) makes sys.setdefaultencoding callable
    # again so the process-wide default codec can be switched to UTF-8.
    # NOTE(review): the functions below mix byte strings with unicode text and
    # presumably rely on this implicit UTF-8 conversion - confirm before removing.
    reload(sys)
    sys.setdefaultencoding('utf8')
    class Unbuffered:
        """Stream proxy that flushes the wrapped stream after every write.

        Any attribute that the proxy does not define itself is looked up on the
        wrapped stream.
        """

        def __init__(self, stream):
            self.stream = stream

        def write(self, data):
            """Write *data* to the wrapped stream and flush it right away."""
            target = self.stream
            target.write(data)
            target.flush()

        def __getattr__(self, attr):
            # Only consulted for names that normal attribute lookup did not find.
            wrapped = self.stream
            return getattr(wrapped, attr)

    # From here on every print goes through the flushing proxy, so each write to
    # stdout is flushed immediately.
    sys.stdout = Unbuffered(sys.stdout)

    def read_keys():
        """Dump every key of redis db 6 on localhost into ``query_keys.txt``.

        Prints the number of keys, then echoes each key to stdout while writing
        it as one line of the UTF-8 encoded output file.
        """
        # The client must exist before it is queried: the previous statement
        # order read ``r`` before assigning it, which raises UnboundLocalError.
        r = redis.Redis(host='localhost', db=6)
        keys = r.keys()
        print(len(keys))
        # ``with`` closes the file even if a write fails part-way through.
        with codecs.open("query_keys.txt", "w", "utf-8") as f:
            for key in keys:
                print(key)
                f.write("%s\n" % (key,))

    def read_relevent_words():
        """Dump the value of every key of redis db 6 into ``query_relevent_words.txt``.

        Keys are listed once, then each value is fetched with its own GET and
        written as one line, in the order in which the keys were returned.

        NOTE(review): other functions in this file read this output line-by-line
        in parallel with ``query_keys.txt``; that pairing presumably requires
        both dumps to see the same key order - confirm.
        """
        # Create the client first: the previous statement order used ``r``
        # before assigning it, which raises UnboundLocalError.
        r = redis.Redis(host='localhost', db=6)
        keys = r.keys()
        print(len(keys))
        with codecs.open("query_relevent_words.txt", "w", "utf-8") as f:
            for key in keys:
                f.write("%s\n" % (r.get(key),))

    def parser_one_line_one_words():
        """Explode ``query_relevent_words.txt`` into one word per line.

        Every input line is stripped and split on ``*``; each piece (empty
        pieces included) is written on its own line to
        ``parser_one_line_one_words.txt``.
        """
        # The input is opened first so a missing input file no longer truncates
        # an existing output file; both handles are closed on every exit path.
        with codecs.open("query_relevent_words.txt", "r", "utf-8") as src:
            with codecs.open("parser_one_line_one_words.txt", "w", "utf-8") as dst:
                for line in src:
                    for elem in line.strip().split("*"):
                        dst.write("%s\n" % (elem,))


    def parser_one_line_one_words2():
        """Explode ``query_relevent_words.txt`` into one word per line and count distinct words.

        Every input line is stripped and split on ``*``. Each piece is written
        unchanged on its own line to ``parser_one_line_one_words.txt``; the
        whitespace-stripped form of each piece is collected in a set whose size
        is printed at the end.
        """
        unique_words = set()
        # Both handles are closed on every exit path (the input one used to leak).
        with codecs.open("query_relevent_words.txt", "r", "utf-8") as src:
            with codecs.open("parser_one_line_one_words.txt", "w", "utf-8") as dst:
                for line in src:
                    for elem in line.strip().split("*"):
                        unique_words.add(elem.strip())
                        dst.write("%s\n" % (elem,))
        print(len(unique_words))

    def compare_pareser_one_line_one_words_result_lost_line_for_tmp():
        """Report where the word list and its segmented counterpart drift apart.

        Reads ``parser_one_line_one_words_uniq.txt`` and
        ``parser_one_line_one_words_uniq_result.txt`` in lockstep. A pair of
        lines matches when the stripped word equals the result line with all
        spaces removed. For every mismatch the 1-based line number and both raw
        lines are printed, followed by a 5 second pause.
        """
        with codecs.open("parser_one_line_one_words_uniq.txt", "r", "utf-8") as f1:
            with codecs.open("parser_one_line_one_words_uniq_result.txt", "r", "utf-8") as f2:
                count = 0
                for a, b in zip(f1.readlines(), f2.readlines()):
                    count += 1
                    if a.strip() != b.replace(" ", "").strip():
                        # ``a`` normally ends with a newline, so ``b`` starts on
                        # its own output line.
                        print("%s %s%s" % (count, a, b))
                        time.sleep(5)

    def build_invert_index():
        """
        Build an inverted index over the segmented wname text in redis db 1.

        Line N (1-based) of ``../output_result_process`` is stripped and split on
        single spaces; N is SADD-ed to the redis set stored under each token.
        Commands are queued on a pipeline that is executed every 10000 lines
        (with the elapsed whole seconds printed) and once more at the end.
        """
        r = redis.Redis(db=1)
        p = r.pipeline()
        count = 0
        # Iterate the file instead of calling readlines() so the whole input is
        # never held in memory, and close the handle when done.
        with codecs.open("../output_result_process", "r", "utf-8") as source:
            for line in source:
                count += 1
                for elem in line.strip().split(" "):
                    p.sadd(elem.strip(), count)
                if count % 10000 == 0:
                    print(count)
                    print("batch insert to redis ...")
                    s = datetime.datetime.now()
                    p.execute()
                    e = datetime.datetime.now()
                    print("done:%s" % ((e - s).seconds))
        # Flush whatever is left from the last, partial batch.
        p.execute()

    def is_chinese(uchar):
        """
        Report whether *uchar* contains at least one non-ASCII character.

        Despite the name this is not a CJK range test: the value is converted
        between its byte and text form (UTF-8, undecodable/unencodable parts
        ignored) and the result is True exactly when the two forms differ in
        length, which happens for any character that needs more than one UTF-8
        byte. Multi-character input is accepted.

        >>> is_chinese(u"人")
        True

        >>> is_chinese("人")
        True

        >>> is_chinese("1")
        False

        >>> is_chinese(" ")
        False
        """
        # isinstance() replaces the exact type comparison; ``bytes`` is the
        # byte-string type (an alias of ``str`` on Python 2).
        if isinstance(uchar, bytes):
            converted = uchar.decode("utf-8", "ignore")
        else:
            converted = uchar.encode("utf-8", "ignore")
        return len(converted) != len(uchar)

    def is_number(uchar):
        """Return True when *uchar* compares between u'0' and u'9' inclusive.

        The test is a plain string comparison, so a longer string is judged by
        its lexicographic position, not character by character.
        """
        return uchar >= u'\u0030' and uchar <= u'\u0039'
       
    def is_alphabet(uchar):
        """
            Return True when *uchar* compares inside u'A'..u'Z' or u'a'..u'z'.

            Both bounds are inclusive and the test is a plain string comparison.

            #>>> is_alphabet(u"t")
            #True

            #>>> is_alphabet("t")
            #True
            """
        letter_ranges = ((u'\u0041', u'\u005a'), (u'\u0061', u'\u007a'))
        for low, high in letter_ranges:
            if uchar >= low and uchar <= high:
                return True
        return False

    def is_other(uchar):
        """Return True when none of is_chinese, is_number, is_alphabet accepts *uchar*."""
        # Checks run in the same order as before and stop at the first hit.
        if is_chinese(uchar):
            return False
        if is_number(uchar):
            return False
        if is_alphabet(uchar):
            return False
        return True

    def _filter(line):
        """
        Drop the tokens of a segmented wname line that is_other() rejects.

        The line is stripped and split on single spaces. Each token is stripped,
        decoded from UTF-8 when it is not already unicode text, and kept only
        if is_other() returns False for it. The kept tokens are returned joined
        by single spaces.
        """
        kept = []
        for elem in line.strip().split(" "):
            element = elem.strip()
            # isinstance() and ``not`` replace the obsolescent ``<>`` operator,
            # the exact type comparison and the ``== False`` test.
            if not isinstance(element, type(u"")):
                element = element.decode("utf-8", "ignore")
            if not is_other(element):
                kept.append(element)
        return " ".join(kept)

    def post_process_wname_segments_illegal_characters():
        """Run every line of ``../output_result`` through _filter().

        Each filtered line is echoed to stdout and written, newline-terminated,
        to ``../output_result_process``.
        """
        with codecs.open("../output_result", "r", "utf-8") as src:
            with codecs.open("../output_result_process", "w", "utf-8") as dst:
                for line in src:
                    # Filter once and reuse the result for both outputs (the
                    # line used to be filtered twice).
                    s = _filter(line)
                    print(s)
                    dst.write(s + "\n")

    def build_word_segments_hash_map():
        """
        Store a word -> segmentation mapping for queries and relevant words in redis db 2.

        Two file pairs are loaded, each pairing line N of a word file with line
        N of its segmentation-result file:

        * ``parser_one_line_one_words_uniq.txt`` / ``parser_one_line_one_words_uniq_result_pku.txt``
        * ``query_keys.txt`` / ``query_keys_result_pku.txt``

        Both sides are stripped and written with SET through a pipeline that is
        executed every 10000 pairs and at the end of each file pair. A BGSAVE is
        requested once everything is loaded.
        """
        r2 = redis.Redis(db=2)
        p = r2.pipeline()

        def load_pairs(words_path, segments_path):
            # Queue one SET per pair of same-numbered lines, flushing in batches.
            with codecs.open(words_path, "r", "utf-8") as f1:
                with codecs.open(segments_path, "r", "utf-8") as f2:
                    count = 0
                    for a, b in zip(f1.readlines(), f2.readlines()):
                        count += 1
                        p.set(a.strip(), b.strip())
                        if count % 10000 == 0:
                            print(count)
                            print("batch insert to redis ...")
                            s = datetime.datetime.now()
                            p.execute()
                            e = datetime.datetime.now()
                            print("done:%s" % ((e - s).seconds))
            # Flush the last, partial batch of this file pair.
            p.execute()

        load_pairs("parser_one_line_one_words_uniq.txt",
                   "parser_one_line_one_words_uniq_result_pku.txt")
        load_pairs("query_keys.txt", "query_keys_result_pku.txt")
        r2.bgsave()

    def _build_list_for_inter_args(s1,s2):
        """
        将分词后的字符串组合成一个list形式反加给r.sinter使用,去除无用的东西
        """
        r=[]
        r.extend(s1.split(" "))
        r.extend(s2.split(" "))
        return [elem.strip() for elem in r if elem.strip()<>""]

    def final_find_synomns_out():
        """
        Write the query / relevant-word pairs whose segments share an index entry.

        Line N of ``query_keys.txt`` (the query) is paired with line N of
        ``query_relevent_words.txt`` (its ``*``-separated relevant words). The
        segmentations of the query and of each non-empty relevant word are read
        from redis db 2, and the sets stored in redis db 1 under all of those
        segments are intersected with SINTER. When the intersection is not
        empty, ``query|||word`` is appended to ``synomns_pku.txt``. A progress
        line is printed every 1000 written pairs.
        """
        r1 = redis.Redis(db=1)
        r2 = redis.Redis(db=2)
        # Read both inputs up front and close them straight away.
        with codecs.open("query_keys.txt", "r", "utf-8") as f1:
            queries = f1.readlines()
        with codecs.open("query_relevent_words.txt", "r", "utf-8") as f2:
            relevant = f2.readlines()

        validateCount = 0
        with codecs.open("synomns_pku.txt", "w", "utf-8") as f:
            for a, b in zip(queries, relevant):
                query = a.strip()
                query_segments = r2.get(query)
                for elem in b.split("*"):
                    word = elem.strip()
                    if word == "":
                        continue
                    keys = _build_list_for_inter_args(query_segments, r2.get(word))
                    if len(r1.sinter(*keys)) > 0:
                        validateCount += 1
                        if validateCount % 1000 == 0:
                            print("validateCount:%s\n" % validateCount)
                        f.write("%s|||%s\n" % (query, word))
                        # Flushed per hit so partial results are on disk while
                        # the job is still running.
                        f.flush()

    def interactive_mode():
        """Interactively inspect one ``query|||relevent_word`` pair at a time.

        Loops forever: reads a line from stdin, splits it on ``|||``, prints the
        segmentation stored in redis db 2 for each side, then prints the SINTER
        of the redis db 1 sets stored under all of those segments.
        """
        # The clients are created once instead of on every loop iteration.
        r1 = redis.Redis(db=1)
        r2 = redis.Redis(db=2)
        while True:
            # ``entered`` avoids shadowing the builtin ``input``.
            entered = raw_input("input query|||relevent_word:\n")
            a, b = entered.strip().split("|||")
            query_segments = r2.get(a.strip())
            # Looked up once and reused for both the echo and the intersection.
            word_segments = r2.get(b.strip())
            print("%s ==> %s" % (a.strip(), query_segments))
            print("%s ==> %s" % (b.strip(), word_segments))
            print(r1.sinter(*_build_list_for_inter_args(query_segments, word_segments)))
            print("=========")

    def c1(line):
        """
        Space check: True when either side of ``a|||b`` contains a space.

        >>> c1("执手|||把手")
        False
        """
        left, right = line.strip().split("|||")
        for side in (left, right):
            if " " in side:
                return True
        return False

    # Module-level client for redis db 2, created at import time; c4() reads it
    # as a global.
    r2=redis.Redis(db=2)
    def c4(s1,s2):
        """
        Return True when the segment set of one word contains that of the other.

        Identical arguments are True outright. Otherwise each argument is
        replaced by the value stored for it in the module-level redis client
        ``r2`` (when one exists), stripped, and turned into a set of its
        space-separated tokens (a value without spaces becomes a one-element
        set). The result is True only when the two sets overlap and the overlap
        is as large as the smaller set.

        #>>> c4("尤利西斯","追忆逝水年华")
        False

        #>>> c4("A B C","A B")
        True

        >>> c4("无线鼠套装","无线键鼠套装")
        False

        #>>> c4("A B","A C")
        False

        #>>> c4("A B","A C")
        False

        #>>> c4("A","A")
        True

        >>> c4("行政职业能力测验真题","行测真题")
        False

        #>>> c4("B","C")
        False
        """
        if s1 == s2:
            return True

        def segment_set(text):
            # A single GET replaces the earlier EXISTS-then-GET pair: one round
            # trip fewer, and no crash on None if the key vanishes in between.
            stored = r2.get(text)
            if stored is not None:
                text = stored.strip()
            if " " in text:
                return set(tok.strip() for tok in text.split(" ") if tok.strip() != "")
            return set([text.strip()])

        set1 = segment_set(s1)
        set2 = segment_set(s2)
        common = set1 & set2
        # The overlap can never exceed the smaller set, so ">=" means "equals".
        return len(common) > 0 and len(common) >= min(len(set1), len(set2))
       
       
    def c3(line):
        """
        Flag ``a|||b`` pairs whose space-separated sides are not purely ASCII letters.

        A side only counts when it has more than one space-separated token and
        at least one of those tokens is not made up solely of ASCII letters.
        When both sides have several tokens, both must count; when only one side
        has several tokens, that side decides; two single-token sides are never
        flagged.

        >>> c3("执手|||把手")
        False

        >>> c3("the north face|||tnf")
        False

        >>> c3("the 大north face|||tnf")
        True

        >>> c3("wd1tb|||i5 2320")
        True
        """
        def is_ascii_word(token):
            # Letters only: digits are deliberately not accepted (see the
            # "i5 2320" example above).
            return re.match(r"[a-zA-Z]{1,}\Z", token.strip()) is not None

        def has_non_word(tokens):
            # Always False for a single token, which is what the earlier
            # pairwise reduce() over the token list produced as well.
            return len(tokens) > 1 and not all(is_ascii_word(tok) for tok in tokens)

        a, b = line.strip().split("|||")
        a_tokens = a.split(" ")
        b_tokens = b.split(" ")
        if len(a_tokens) > 1 and len(b_tokens) > 1:
            return has_non_word(a_tokens) and has_non_word(b_tokens)
        # At most one side has several tokens here and the single-token side
        # contributes False, so "or" picks the side that decides.
        return has_non_word(a_tokens) or has_non_word(b_tokens)

           
    def c2(line):
        """
        Substring check: True when one side of ``a|||b`` contains the other.

        >>> c2("执手|||把手")
        False

        >>> c2("浓缩咖啡|||咖啡")
        True
        """
        left, right = line.strip().split("|||")
        if left in right:
            return True
        return right in left

    def filter_synonym_result():
        """
        Filter ``synomns_pku.txt`` into ``synomns_pku_filter.txt``.

        The input holds the query / relevant-word pairs whose pku segmentations
        intersect. A line is copied to the output only when neither check
        fires:

        * c1 - one side contains a space
        * c2 - one side is a substring of the other
        """
        with codecs.open("synomns_pku.txt", "r", "utf-8") as src:
            with codecs.open("synomns_pku_filter.txt", "w", "utf-8") as dst:
                for line in src:
                    if not c1(line) and not c2(line):
                        dst.write(line)
       
    def test_redis_is_ready():
        """
        Check that redis is up by printing the INFO reply of a default connection.
        """
        client = redis.Redis()
        report = client.info()
        print(report)
       
    def pivot_query_relvent_word_order_and_intersation_size():
        """
        Pivot the filtered synonym pairs into one redis hash per query (db 3).

        Layout: redis key = query, hash field = relevant word, hash value = a
        list that step2 sets to [relevant-word order] and step3 extends to
        [relevant-word order, intersection size].

        The work is split into nested steps; only step4() is invoked here, the
        calls to step1/step2/step3 are commented out.

        NOTE(review): the values are written as Python lists and read back with
        eval(), so the client presumably stores their text form - confirm.
        """
        debug=False
        r1=redis.Redis(db=1)
        r2=redis.Redis(db=2)
        r3=redis.Redis(db=3)
        #r3.flushdb()
        p=r3.pipeline()
        def step1():
            """
            Initialise the hashes from synomns_pku_filter.txt: one field per
            (query, relevant word) line, with an empty list as value.
            """
            count=0
            # NOTE(review): the file object is never closed explicitly.
            for line in codecs.open("synomns_pku_filter.txt","r","utf-8").readlines():
                count+=1
                a,b=line.split("|||")
                a=a.strip()
                b=b.strip()
                #print type(a),type(b)
                #print a,b
                p.hset(a,b,[])
                # Execute the queued commands every 10000 lines.
                if count%10000==0:
                    p.execute()
                    print "执行一次批量提交redis操作"
                # Debug mode stops after the first line.
                if count==1 and debug==True:
                    break
            p.execute()
        # Disabled: step1 is not run.
        #step1()

        def step2():
            """
            Store the position of each relevant word in the hash value.

            The 1-based position of a word inside its "*"-separated line
            replaces the value of fields that already exist; other words are
            only counted.
            """
            count=0
            exists_count=0
            not_exists_count=0
            f1=codecs.open("query_keys.txt","r","utf-8")
            f2=codecs.open("query_relevent_words.txt","r","utf-8")
            for a,b in zip(f1.readlines(),f2.readlines()):
                count+=1
                a=a.strip()
                b=b.strip()
                for idx,elem in enumerate(b.split("*")):
                    element=elem.strip()
                    if element=="":
                        continue
                    #print type(a),type(element)
                    #print a,b,element
                    if r3.hexists(a,element):
                        exists_count+=1
                        r3.hset(a,element,[idx+1])
                    else:
                        not_exists_count+=1
                        #print "%s,%s not exists in redis"%(a,element)
                    # Sits inside the word loop, so on every 10000th line the
                    # counters are printed once per relevant word.
                    if count%10000==0:
                        print "exists_count:%s"%exists_count
                        print "not_exists_count:%s"%not_exists_count

                # Debug mode stops after the first line.
                if count==1 and debug==True:
                    break
            print "exists_count:%s"%exists_count
            print "not_exists_count:%s"%not_exists_count
            print "step2 finished"

        # Disabled: step2 is not run.
        #step2()

        def test_step1_and_step2_is_ok():
            """
            Spot check: the value stored for one hard-coded (query, word) pair
            must evaluate to a list.
            """
            result=r3.hget("透明茶杯","茶具")
            # NOTE(review): eval() on data read back from redis.
            if type([])==type(eval(result)):
                print "正确"
            else:
                print "不正确"
        # Disabled: the spot check is not run.
        #test_step1_and_step2_is_ok()

        def step3():
            """
            Recompute the intersection for every filtered pair and append its
            size to the hash value.
            """
            count=0
            validateCount=0
            for line in codecs.open("synomns_pku_filter.txt","r","utf-8").readlines():
                a,b=line.strip().split("|||")
                a=a.strip()
                b=b.strip()
                count+=1
                #print count
                query_segments=r2.get(a)
                # Size of the SINTER over the db 1 sets of all segments of both words.
                intersation_len=r1.sinter(*_build_list_for_inter_args(query_segments,r2.get(b))).__len__()
                if intersation_len>0:
                        # NOTE(review): eval() on data read back from redis.
                        list_result=eval(r3.hget(a,b))
                        # Exactly one element (the order written by step2) is
                        # expected; anything else aborts the whole process.
                        if len(list_result)<>1:
                            print a,b
                            print type(a),type(b)
                            print "ERROR"
                            exit(-1)
                        #print type(list_result)==type([])
                        list_result.append(intersation_len)
                        r3.hset(a,b,list_result)
                        validateCount+=1
                        if validateCount%1000==0:
                            print "validateCount:%s\n"%validateCount
            print "final validateCount %s"%validateCount

        # Disabled: step3 is not run.
        #step3()

        def step4():
            """
            Dump the hashes of redis db 3 to synomns_pku_filter_process.txt.

            One line per key: the key followed by "|||word,list" for every
            field, ordered by the second list element descending and then by
            the first list element descending.
            """
            # Comparison function for sorted(): larger [1][1] sorts first, ties
            # are broken by larger [1][0]. Shadows the builtin cmp locally.
            def cmp(x,y):
                if x[1][1]<y[1][1]:
                    return 1
                elif x[1][1]>y[1][1]:
                    return -1
                else:
                    if x[1][0]<y[1][0]:
                        return 1
                    elif x[1][0]>y[1][0]:
                        return -1
                    else:
                        return 0
            f=codecs.open("synomns_pku_filter_process.txt","w","utf-8")
            # Shape of the sorted items, e.g. [('b', [2, 4]), ('a', [1, 3])]
            count=0
            for key in r3.keys():
                count+=1
                print count
                f.write("%s"%key)
                z=r3.hgetall(key)
                # Turn every stored value back into a list.
                # NOTE(review): eval() on data read back from redis.
                for k,v in z.iteritems():
                    z[k]=eval(v)
                for elem in sorted(z.items(),cmp):
                    word,orders=elem
                    f.write("|||%s,%s"%(word,str(orders)))
                f.write("\n")
                f.flush()
            f.close()

        step4()

    def _find_short_name(s1,s2):
        """
        >>> _find_short_name("行测","行政能力测试")
        True

        >>> _find_short_name("AB","ABC")
        False

        >>> _find_short_name("A","D")
        False
        """
        if len(s1)>=len(s2):
            return False
        if s1 in s2:
            return False
        return set(s1).issubset(set(s2))   

    def find_short_name():
        """
        Print the abbreviation candidates found in ``synomns_pku_filter.txt``.

        Every line is split on ``|||``; the pair is printed as ``a|||b`` when
        _find_short_name() accepts it in either direction.
        """
        # ``with`` closes the input, which used to be left open.
        with codecs.open("synomns_pku_filter.txt", "r", "utf-8") as src:
            for line in src:
                a, b = line.strip().split("|||")
                a = a.strip()
                b = b.strip()
                if _find_short_name(a, b) or _find_short_name(b, a):
                    print("%s|||%s" % (a, b))

    def find_short_name2():
        """
        Collect abbreviation candidates from the raw query / relevant-word data.

        Line N of ``query_keys.txt`` is paired with line N of
        ``query_relevent_words.txt``, whose ``*``-separated, non-empty words are
        each combined with the query into a ``query|||word`` line. A line is
        written to ``short_name_global.txt`` (and echoed with a running total)
        when c3, c2 and c4 all return False for it. The number of processed
        lines is printed every 10000 lines, and both counters once more at the
        end.
        """
        # Read both inputs up front and close them straight away.
        with codecs.open("query_keys.txt", "r", "utf-8") as f1:
            queries = f1.readlines()
        with codecs.open("query_relevent_words.txt", "r", "utf-8") as f2:
            relevant = f2.readlines()

        count = 0
        validateCount = 0
        with codecs.open("short_name_global.txt", "w", "utf-8") as f:
            for a, b in zip(queries, relevant):
                count += 1
                a = a.strip()
                b = b.strip()
                for elem in b.split("*"):
                    element = elem.strip()
                    if element == "":
                        continue
                    line = "%s|||%s\n" % (a, element)
                    if not c3(line) and not c2(line) and not c4(a, element):
                        validateCount += 1
                        print("validateCount:%s" % validateCount)
                        print(line)
                        f.write(line)
                        # Flushed per hit so partial results are on disk while
                        # the job is still running.
                        f.flush()

                if count % 10000 == 0:
                    print("cout===========>%s" % count)
        print("validateCount:%s" % validateCount)
        print("cout===========>%s" % count)

    def test_sorted():
        """Print a sample list of (name, [first, second]) pairs in ranked order.

        Ordering: larger ``second`` first; ties are broken by larger ``first``.
        """
        a = [('a', [1, 2]), ("b", [0, 2]), ("c", [-1, 3])]
        # A key function with reverse=True gives the same order as the earlier
        # hand-written comparison function (both sorts are stable), without
        # shadowing the builtin ``cmp`` or relying on the positional cmp
        # argument of sorted().
        ranked = sorted(a, key=lambda item: (item[1][1], item[1][0]), reverse=True)
        print(ranked)

    def _find_only_one_word_difference(line):
        """
        Placeholder predicate: ignores *line* and always returns True.

        NOTE(review): presumably intended as the per-line form of the check done
        in find_only_one_word_difference(); no such logic is implemented here.

        >>> _find_only_one_word_difference("毛领毛衣|||毛领衣服")
        True
        """
        return True

    def find_only_one_word_difference():
        """Keep the pairs of ``./short_name_global.txt`` that differ in one non-digit character.

        A ``a|||b`` line is copied to ``./short_name_global_filter.txt`` when

        * both sides have the same length (at least 2) and are not equal,
        * their character sets share exactly ``len(a) - 1`` characters, and
        * on at least one side the character missing from the other side is not
          a digit.
        """
        def first_is_non_digit(chars):
            # ``chars`` holds at most one element when the overlap test passed.
            return len(chars) > 0 and not str(list(chars)[0]).isdigit()

        # Both handles are closed on every exit path (the input one used to leak).
        with codecs.open("./short_name_global.txt", "r", "utf-8") as src:
            with codecs.open("./short_name_global_filter.txt", "w", "utf-8") as dst:
                for line in src:
                    a, b = line.strip().split("|||")
                    if len(a) != len(b) or a == b:
                        continue
                    set1 = set(a)
                    set2 = set(b)
                    m = len(a) - 1
                    if m > 0 and len(set1 & set2) == m:
                        if first_is_non_digit(set1 - set2) or first_is_non_digit(set2 - set1):
                            dst.write(line)
               
    def find_human_names():
        """
        Pick likely person-name pairs out of ``./short_name_global.txt``.

        Example of the kind of pair looked for: 苏轼 / 苏东坡.

        A ``a|||b`` line is copied to ``./short_name_global_xin.txt`` when both
        sides start with the same surname from the built-in list (compared on
        the first two characters, or on the first character) and both sides are
        shorter than 5 characters. The number of distinct list entries is
        printed first.
        """
        xins=['白','毕','卞','蔡','曹','岑','常','车','陈','成','程','池','邓','丁','范','方','樊','费','冯','符','傅','甘','高','葛','龚','古','关','郭','韩','何','贺','洪','侯','胡','华','黄','霍','姬','简','江','姜','蒋','金','康','柯','孔','赖','郎','乐','雷','黎','李','连','','梁','廖','林','凌','刘','柳','龙','卢','鲁','陆','路','吕','罗','骆','马','梅','孟','莫','母','穆','倪','宁','欧','区','潘','彭','','皮','齐','戚','钱','强','秦','丘','邱','饶','任','沈','盛','施','石','时','史','司徒','苏','孙','谭','汤','唐','陶','田','童','涂','王','危','韦','卫','魏','温','文','翁','巫','邬','吴','伍','武','席','夏','萧','谢','辛','邢','徐','许','薛','严','颜','杨','叶','易','殷','尤','于','余','俞','虞','元','袁','岳','云','曾','詹','张','章','赵','郑','钟','周','邹','朱','褚','庄','卓']
        xins+=['李','王','张','刘','陈','黄','周','吴','徐','孙','胡','朱','高','林','何','郭','马','罗','梁','宋','郑','谢','韩','唐','冯','于','董','萧','程','曹','袁','邓','许','傅','沈','曾','彭','吕','苏','卢','蒋','蔡','贾','丁','魏','薛','叶','阎','余','潘','杜','戴','夏','','汪','田','任','姜','范','方','石','姚','谭','廖','邹','熊','金','陆','郝','孔','白','崔','康','毛','邱','秦','江','史','顾','侯','','孟','龙','万','段','章','钱','汤','尹','黎','易','常','武','乔','贺','赖','龚','文']
        xins+=['鲍俎','百里','碧鲁','伯赏','北堂','陈林','淳于','第五','东方','东郭','东门','段干','独孤','端木','范姜','哥舒','公良','公孙','公西','公冶','公羊','缑亢','谷梁','归海','赫连','胡母','呼延','黄方','皇甫','即墨','夹谷','晋楚','况后','梁丘','令狐','陆费','闾丘','闾邱','明哲','墨哈','慕容','万俟','南宫','南郭','南门','年爱','欧阳','濮阳','漆雕','亓官','屈突','壤驷','汝鄢','司马','司空','司寇','司徒','官','商牟','申屠','侍其','疏束','叔孙','太史','太叔','澹台','涂钦','拓拔','完完','完颜','王子','闻人','微生','巫马','乌雅','铁笔','夏','许世','轩辕','闫法','羊舌','阳佟','耶律','有琴','尉迟','余佴','宇文','岳帅','乐正','宰父','子车','子阳','宗政','左丘','张简','章佳','长孙','郑余','仲孙','钟离','诸葛','颛孙']
        xins+=['付']
        xins+=['李','王','张','刘','陈','杨','黄','孙','周','吴','徐','赵','朱','马','胡','郭','林','何','高','梁','郑','罗','宋','谢','唐','韩','曹','许','邓','萧','冯','曾','程','蔡','彭','潘','袁','于','董','余','苏','叶','吕','魏','蒋','田','杜','丁','沈','姜','范','江','傅','','卢','汪','戴','崔','任','陆','廖','姚','方','金','邱','夏','谭','韦','贾','邹','石','熊','孟','秦','阎','薛','侯','雷','白','龙','','郝','孔','邵','史','毛','常','万','顾','赖','武','康','贺','严','尹','钱','施','牛','洪','龚','汤','陶','黎','温','莫','易','樊','','文','安','殷','颜','庄','章','鲁','倪','庞','邢','俞','翟','蓝','聂','齐','向','申','葛','柴','伍','覃','骆','关','焦','柳','欧','','纪','尚','毕','耿','芦','左','季','管','符','辛','苗','詹','曲','欧阳','靳','祁','路','涂','兰','甘','裴','梅','童','翁','霍','游','阮','尤','岳','柯','牟','滕','谷','舒','卜','成','饶','宁','凌','盛','查','单','冉','鲍','华','包','屈','房','喻','解','蒲','卫','简','时','连','车','项','闵','邬','吉','党','阳','司','费','蒙','席','晏','隋','古','强','穆','姬','宫','景','米','麦','谈','柏','瞿','艾','沙','鄢','桂','窦','郁','缪','畅','巩','卓','褚','栾','戚','全','娄','甄','郎','池','丛','边','岑','农','苟','迟','保','商','臧','','卞','虞','刁','冷','应','匡','栗','仇','练','楚','揭','师','官','佟','封','燕','桑','巫','敖','原','植','邝','仲','荆','储','宗','','干','苑','寇','盖','南','屠','鞠','荣','井','乐','银','奚','明','麻','雍','花','闻','冼','木','郜','廉','衣','蔺','和','冀','占','','门','帅','利','满','陈生']
        # De-duplicate; the printed count still includes the empty entry.
        xins = set(xins)
        print(len(xins))
        # Kept as a list on purpose: the entries are plain string literals while
        # the file content is unicode text, and on Python 2 a hash-based lookup
        # would presumably not match the two - the list's equality-based ``in``
        # does (NOTE(review): confirm before switching to a set).
        xins = [elem.strip() for elem in xins if elem.strip() != ""]
        with codecs.open("./short_name_global.txt", "r", "utf-8") as src:
            with codecs.open("./short_name_global_xin.txt", "w", "utf-8") as dst:
                for line in src:
                    a, b = line.strip().split("|||")
                    a = a.strip()
                    b = b.strip()
                    same_surname = (a[:2] == b[:2] and a[:2] in xins) or \
                                   (a[:1] == b[:1] and a[:1] in xins)
                    # The length limits now apply to both surname branches;
                    # operator precedence used to bind them to the
                    # one-character branch only.
                    if same_surname and len(a) < 5 and len(b) < 5:
                        dst.write(line)

    def extrace_names():
        subject="""
        <TD width=90><a href=/zaobao/chinese/surname/pages/story_bai2.html target=_blank>白</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_bi4.html target=_blank>毕</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/bian040600.html target=_blank>卞</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_cai4.html target=_blank>蔡</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_cao2.html target=_blank>曹</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_cen2.html target=_blank>岑</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/chang040600.html target=_blank>常</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_che.html target=_blank>车</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_chen2.html target=_blank>陈</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/cheng030100.html target=_blank>成</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_cheng2.html target=_blank>程</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_chi2.html target=_blank>池</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_deng4.html target=_blank></a>邓</TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_ding.html target=_blank>丁</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_fan4.html target=_blank>范</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_fang.html target=_blank></a>方</TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/fan140600.html target=_blank>樊</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/fei140600.html target=_blank>费</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_feng2.html target=_blank>冯</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_fu2.html target=_blank>符</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_fu4.html target=_blank>傅</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_gan.html target=_blank>甘</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_gao.html target=_blank>高</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/ge170100.html target=_blank>葛</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_gong.html target=_blank>龚</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_gu3.html target=_blank>古</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_guan.html target=_blank>关</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_guo.html target=_blank>郭</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_han2.html target=_blank>韩</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_he2.html target=_blank>何</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/he140600.html target=_blank>贺</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_hong2.html target=_blank>洪</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_hou2.html target=_blank>侯</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_hu2.html target=_blank>胡</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_hua4.html target=_blank>华</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_huang2.html target=_blank>黄</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/huo010600.html target=_blank>霍</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/ji030100.html target=_blank>姬</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_jian3.html target=_blank>简</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_jiang1.html target=_blank>江</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_jiang.html target=_blank>姜</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_jiang3.html target=_blank>蒋</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_jin.html target=_blank>金</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_kang.html target=_blank>康</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_ke.html target=_blank>柯</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_kong3.html target=_blank>孔</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_lai4.html target=_blank>赖</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/lang170100.html target=_blank>郎</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/le140600.html target=_blank>乐</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_lei2.html target=_blank>雷</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_li2.html target=_blank>黎</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_li3.html target=_blank>李</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_lian2.html target=_blank>连</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/lian140600.html target=_blank>廉</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/liang030100.html target=_blank>梁</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_liao4.html target=_blank>廖</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_lin2.html target=_blank>林</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_ling2.html target=_blank>凌</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_liu2.html target=_blank>刘</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_liu3.html target=_blank>柳</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_long2.html target=_blank>龙</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_lu2.html target=_blank>卢</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/lu170100.html target=_blank>鲁</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_lu4.html target=_blank>陆</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/lu140600.html target=_blank>路</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_lv3.html target=_blank>吕</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_luo2.html target=_blank>罗</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_luo4.html target=_blank>骆</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_ma3.html target=_blank>马</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_mei2.html target=_blank>梅</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/meng140600.html target=_blank>孟</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_mo4.html target=_blank>莫</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/mu030100.html target=_blank>母</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/mu130700.html target=_blank>穆</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_ni2.html target=_blank>倪</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_ning2.html target=_blank>宁</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_ou.html target=_blank>欧</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/ou030100.html target=_blank>区</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_pan.html target=_blank>潘</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_peng2.html target=_blank>彭</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_pu2.html target=_blank>蒲</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/pi130700.html target=_blank>皮</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/qi130700.html target=_blank>齐</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/qi030100.html target=_blank>戚</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_qian2.html target=_blank>钱</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/qiang310500.html target=_blank>强</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_qing2.html target=_blank>秦</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/qiu030100.html target=_blank>丘</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_qiu.html target=_blank>邱</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_rao2.html target=_blank>饶</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_ren2.html target=_blank>任</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_shen3.html target=_blank>沈</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/sheng010600.html target=_blank>盛</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_shi.html target=_blank>施</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_shi2.html target=_blank>石</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/shi300500.html target=_blank>时</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_shi3.html target=_blank>史</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/situ030100.html target=_blank>司徒</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_su.html target=_blank>苏</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_sun.html target=_blank>孙</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_tan2.html target=_blank>谭</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_tang.html target=_blank>汤</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_tang2.html target=_blank>唐</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_tao2.html target=_blank>陶</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_tian2.html target=_blank>田</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/tong040600.html target=_blank>童</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_tu2.html target=_blank>涂</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_wang2.html target=_blank>王</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/wei010600.html target=_blank>危</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_wei3.html target=_blank>韦</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/wei180100a.html target=_blank>卫</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_wei4.html target=_blank>魏</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_wen.html target=_blank>温</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_wen2.html target=_blank>文</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_weng.html target=_blank>翁</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_wu.html target=_blank>巫</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_wu1.html target=_blank>邬</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_wu2.html target=_blank>吴</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_wu3.html target=_blank>伍</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_wu3a.html target=_blank>武</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/xi040600.html target=_blank>席</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_xia4.html target=_blank>夏</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_xiao.html target=_blank>萧</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_xie4.html target=_blank>谢</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_xin.html target=_blank>辛</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_xing2.html target=_blank>邢</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_xu2.html target=_blank>徐</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_xu3.html target=_blank>许</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_xue.html target=_blank>薛</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_yan2.html target=_blank>严</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_yan2a.html target=_blank>颜</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_yang2.html target=_blank>杨</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_ye4.html target=_blank>叶</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_yi4.html target=_blank>易</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/yin020600.html target=_blank>殷</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_you2.html target=_blank>尤</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/yu010600.html target=_blank>于</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_yu2.html target=_blank>余</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/yu020600a.html target=_blank>俞</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/yu020600.html target=_blank>虞</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/yuan310500.html target=_blank>元</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_yuan2.html target=_blank>袁</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/yue030100.html target=_blank>岳</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_yun2.html target=_blank>云</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_zeng.html target=_blank>曾</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_zhan.html target=_blank>詹</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_zhang.html target=_blank>张</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_zhang1.html target=_blank>章</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_zhao4.html target=_blank>赵</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_zheng4.html target=_blank>郑</a></TD>
    </TR>

    <TR ALIGN=CENTER>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_zhong.html target=_blank>钟</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_zhou.html target=_blank>周</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_zhou1.html target=_blank>邹</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_zhu.html target=_blank>朱</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/zhu180100.html target=_blank>褚</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_zhuang.html target=_blank>庄</a></TD>
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_zhuo.html target=_blank>卓</a></TD>
    """
        result = re.findall(r"target=_blank>(?P<name>[\s\S]*?)</TD>", subject)
        print ("['"+"','".join(result)+"']").replace("</a>","")
        html=u"""
    1李
    2王
    3张
    4刘
    5陈
    6杨
    7赵
    8黄
    9周
    10吴
    11徐
    12孙
    13胡
    14朱
    15高
    16林
    17何
    18郭
    19马
    20罗
    21梁
    22宋
    23郑
    24谢
    25韩
    26唐
    27冯
    28于
    29董
    30萧
    31程
    32曹
    33袁
    34邓
    35许
    36傅
    37沈
    38曾
    39彭
    40吕
    41苏
    42卢
    43蒋
    44蔡
    45贾
    46丁
    47魏
    48薛
    49叶
    50阎
    51余
    52潘
    53杜
    54戴
    55夏
    56钟
    57汪
    58田
    59任
    60姜
    61范
    62方
    63石
    64姚
    65谭
    66廖
    67邹
    68熊
    69金
    70陆
    71郝
    72孔
    73白
    74崔
    75康
    76毛
    77邱
    78秦
    79江
    80史
    81顾
    82侯
    83邵
    84孟
    85龙
    86万
    87段
    88章
    89钱
    90汤
    91尹
    92黎
    93易
    94常
    95武
    96乔
    97贺
    98赖
    99龚
    100文
    """
        list2=[]
        for line in html.strip().split(" \n"):
            list2.append("'"+line[-1]+"'")
        print "[" +  ",".join(list2) + "]"
        html=u"""
    鲍俎(bao zu)、百里(bai li)、碧鲁(bi lu)、伯赏(bo shang)、北堂(bei tang)
    单于(chan yu)、陈林(chen lin)、淳于(chun yu)、
    第五(di wu)、 东方(dong fang)、东郭(dong guo)、东门(dong men)、段干(duan gan)、独孤(du gu)、端木(duan mu)、
    范姜(fan jiang)、
    哥舒(ge shu)、公良(gong liang)、公孙(gong sun)、公西(gong xi)、公冶(gong yan)、公羊(gong yang)、缑亢(gou kang)、谷梁(gu liang)、归海(gui hai)、
    赫连(he lian)、胡母(hu mu)、呼延(hu yan)、黄方(huang fang)、皇甫(huang fu)、
    即墨(ji mo)、夹谷(jia gu)、晋楚(jin chu)、
    况后(kuang hou)、
    梁丘(liang qiu)、令狐(ling hu)、陆费(lu fei)、闾丘(lv qiu)、闾邱(lv qiu)、
    明哲(ming zhe)、墨哈(mo ha)、慕容(mu rong)、万俟(mò qí)
    钠兰(na lan)、南宫(nan gong)、南郭(nan guo)、南门(nan men)、年爱(nian ai)、
    欧阳(ou yang)、
    濮阳(pu yang)、
    漆雕(qi diao)、亓官(qi guan)、屈突(qu tu)、
    壤驷(rang si)、汝鄢(ru yan)、
    司马(si ma)、司空(si kong)、司寇(si kou)、司徒(si tu)、上官(shang guan)、商牟(shang mou)、申屠(shen tu)、侍其(shi qi)、疏束(shu su)、叔孙(shu sun)、
    太史(tai shi)、太叔(tai shu)、澹台(tan tai)、涂钦(tu qin)、拓拔(tuo ba)、
    完完(wan wan)、完颜(wan yan)、王子(wang zi)、闻人(wen ren)、微生(wei sheng)、巫马(wu ma)、乌雅(wu ya)、铁笔(tie bi)
    西门(xi men)、夏侯(xia hou)、许世(xu shi)、轩辕(xuan yuan)、
    闫法(yan fa)、羊舌(yang she)、阳佟(yang tong)、耶律(ye lv)、有琴(you qin)、尉迟(yu chi)、余佴(yu er)、宇文(yu wen)、岳帅(yue shuai)、乐正(yue zheng)、
    宰父(zai fu)、子车(zi che)、子阳(zi yang)、宗政(zong zheng)、左丘(zuo qiu)、张简(zhang jian)、章佳(zhang jia)、长孙(zhang sun)、郑余(zheng yu)、仲孙(zhong sun)、钟离(zhong li)、诸葛(zhu ge)、颛孙(zhuan sun)、
    """
        list3=[]
        for line in html.strip().split("\r\n"):
            list3.extend(["'" + elem.strip()[:2] + "'" for elem in line.split("、") if elem.strip()<>""])
        for elem in list3:
            print elem
        print "[" +  ",".join(list3) + "]"
       
        html=u"""
    李 王 张 刘 陈 杨 黄 孙 周 吴
    徐 赵 朱 马 胡 郭 林 何 高 梁
    郑 罗 宋 谢 唐 韩 曹 许 邓 萧
    冯 曾 程 蔡 彭 潘 袁 于 董 余
    苏 叶 吕 魏 蒋 田 杜 丁 沈 姜
    范 江 傅 钟 卢 汪 戴 崔 任 陆
    廖 姚 方 金 邱 夏 谭 韦 贾 邹
    石 熊 孟 秦 阎 薛 侯 雷 白 龙
    段 郝 孔 邵 史 毛 常 万 顾 赖
    武 康 贺 严 尹 钱 施 牛 洪 龚
    汤 陶 黎 温 莫 易 樊 乔 文 安
    殷 颜 庄 章 鲁 倪 庞 邢 俞 翟
    蓝 聂 齐 向 申 葛 柴 伍 覃 骆
    关 焦 柳 欧 祝 纪 尚 毕 耿 芦
    左 季 管 符 辛 苗 詹 曲 欧阳 靳
    祁 路 涂 兰 甘 裴 梅 童 翁 霍
    游 阮 尤 岳 柯 牟 滕 谷 舒 卜
    成 饶 宁 凌 盛 查 单 冉 鲍 华
    包 屈 房 喻 解 蒲 卫 简 时 连
    车 项 闵 邬 吉 党 阳 司 费 蒙
    席 晏 隋 古 强 穆 姬 宫 景 米
    麦 谈 柏 瞿 艾 沙 鄢 桂 窦 郁
    缪 畅 巩 卓 褚 栾 戚 全 娄 甄
    郎 池 丛 边 岑 农 苟 迟 保 商
    臧 佘 卞 虞 刁 冷 应 匡 栗 仇
    练 楚 揭 师 官 佟 封 燕 桑 巫
    敖 原 植 邝 仲 荆 储 宗 楼 干
    苑 寇 盖 南 屠 鞠 荣 井 乐 银
    奚 明 麻 雍 花 闻 冼 木 郜 廉
    衣 蔺 和 冀 占 公 门 帅 利 满
    陈生
    """
        list4=[]
        for line in html.split(" "):
            if line.strip()<>"" and line.strip().isdigit()==False:
                list4.append("'" + line.strip()+"'")
        print list4.__len__()
        print "[" +  ",".join(list4) + "]"

    def is_chinese_or_space(str):
        """
        Return True when every character of ``str`` is Chinese or a space.

        A byte string is first decoded as UTF-8 (undecodable bytes are
        dropped) so that the check runs on whole characters rather than on
        the individual bytes of a multi-byte sequence.  Each character is
        then classified by ``_is_chinese_or_space``.  An empty string yields
        True because no character fails the check.

        >>> is_chinese_or_space(u"中国 人")
        True

        >>> is_chinese_or_space(u"中国 人".encode("utf-8"))
        True

        >>> is_chinese_or_space(u"中国 人1")
        False

        >>> is_chinese_or_space(u"华为huawei")
        False

        >>> is_chinese_or_space(u"游泳裤xxxl")
        False
        """
        # NOTE: the parameter name shadows the builtin ``str``; it is kept
        # unchanged so that existing callers are not affected.
        if isinstance(str, bytes):
            # Bytes must be *decoded* to text; the previous ``encode`` call
            # left the value as bytes and the loop then saw single bytes.
            str = str.decode("utf-8", "ignore")
        return all(_is_chinese_or_space(char) for char in str)

    def is_english_or_space(str):
        """
        Return True when every character of ``str`` passes
        ``_is_english_or_space`` (a non-Chinese letter or a space).

        A byte string is first decoded as UTF-8 (undecodable bytes are
        dropped) so that the check runs on whole characters rather than on
        the individual bytes of a multi-byte sequence.  An empty string
        yields True because no character fails the check.

        >>> is_english_or_space(u"abc def1")
        False

        >>> is_english_or_space(u"abc def")
        True

        >>> is_english_or_space(u"abc def".encode("utf-8"))
        True

        >>> is_english_or_space(u"游泳裤xxxl")
        False

        >>> is_english_or_space(u"茶具")
        False
        """
        # NOTE: the parameter name shadows the builtin ``str``; it is kept
        # unchanged so that existing callers are not affected.
        if isinstance(str, bytes):
            # Bytes must be *decoded* to text; the previous ``encode`` call
            # left the value as bytes and the loop then saw single bytes.
            str = str.decode("utf-8", "ignore")
        return all(_is_english_or_space(char) for char in str)

    def _is_chinese_or_space(uchar):
        """
        Return True when the single character ``uchar`` is Chinese (as
        decided by ``is_chinese``) or equals the space character u" ";
        return False otherwise.

        The examples exercise this helper directly rather than the public
        ``is_chinese_or_space`` wrapper.

        >>> _is_chinese_or_space(u"人")
        True

        >>> _is_chinese_or_space(u"1")
        False

        >>> _is_chinese_or_space(u" ")
        True
        """
        # bool() keeps the result a strict True/False even if is_chinese
        # returns some other truthy or falsy value.
        return bool(is_chinese(uchar) or uchar == u" ")

    def _is_english_or_space(uchar):
        """
        Classify one character: True for a space or for an alphabetic
        character that ``is_chinese`` does not claim, False for anything else.

        >>> _is_english_or_space(u"1")
        False

        >>> _is_english_or_space(u"a")
        True

        >>> _is_english_or_space(u" ")
        True

        >>> _is_english_or_space(u"中")
        False
        """
        # is_chinese is consulted before isalpha(); presumably because
        # isalpha() on its own would also accept Chinese characters.
        if not is_chinese(uchar):
            return uchar.isalpha() or uchar == u" "
        return False
       
    def find_one_side_chinese_and_another_side_is_english():
        """
        Filter ./short_name_global.txt into
        ./short_name_global_chinese_english.txt.

        Each input line is expected to have the form ``a|||b``.  A line is
        copied to the output unchanged when one side passes
        ``is_chinese_or_space`` and the other side passes
        ``is_english_or_space``, in either order.  Lines that do not split
        into exactly two fields (for example blank lines) are skipped instead
        of aborting the run.  Both files are handled as UTF-8 and are closed
        even when an error occurs.
        """
        # The input is opened first so that a missing input file does not
        # leave behind a freshly truncated output file.
        with codecs.open("./short_name_global.txt", "r", "utf-8") as src:
            with codecs.open("./short_name_global_chinese_english.txt", "w", "utf-8") as dst:
                for line in src:
                    fields = line.strip().split("|||")
                    if len(fields) != 2:
                        # Not an ``a|||b`` pair; nothing to classify.
                        continue
                    a, b = [field.strip() for field in fields]
                    chinese_then_english = is_chinese_or_space(a) and is_english_or_space(b)
                    if chinese_then_english or \
                            (is_chinese_or_space(b) and is_english_or_space(a)):
                        dst.write(line)

                   
    # Script entry point: runs only when the file is executed directly.
    if __name__=="__main__":
        # Execute the doctests embedded in this module's docstrings.
        doctest.testmod()
        # The calls below are left commented out; only the uncommented call
        # near the end of this block is executed.
    #    read_relevent_words()
    #    parser_one_line_one_words2()
    #    compare_pareser_one_line_one_words_result_lost_line_for_tmp()
    #    build_invert_index()
    #    build_word_segments_hash_map()
    #    final_find_synomns_out()   
    #    interactive_mode()
    #    print _filter("龟 鹿 补 肾丸 4.5 g*12 袋 水 蜜丸 / 盒 [ 补 肾 失眠 体弱 疲乏 壮 阳 ]")
    #    print _filter("龟 牌 ( turtle ) 硬壳 防水 全效 蜡 g-223r")
    #    post_process_wname_segments_illegal_characters()
    #    filter_synonym_result()   
    #    test_redis_is_ready()
    #    pivot_query_relvent_word_order_and_intersation_size()
    #    find_short_name()
    #    find_short_name2()
    #    test_sorted()
    #    find_only_one_word_difference()
    #    extrace_names()
    #    find_human_names()
        # Active step: write the Chinese/English pairs file.
        find_one_side_chinese_and_another_side_is_english()
    #    print is_english_or_space(u"茶具")

  • 相关阅读:
    写时复制集合 —— CopyOnWriteArrayList 源码原理阅读笔记
    初步整合vue-element-admin和GitDataV两个Vue开源框架方案实现大数据可视化
    IOS苹果登录sign in with apple后端校验
    IOS审核被拒:IOS苹果授权登录(Sign in with Apple)/Apple登录/苹果登录集成教程
    ios安装自定义基座失败问题
    IOS APP上架App Store及提交审核详细教程
    IOS APP报错:SyntaxError: Invalid regular expression: invalid group specifier name __ERROR
    Apple Pay苹果支付IOS in-App Purchase内购项目服务端校验
    浅析浏览器是如何工作的(一):V8引擎、JIT机制、JS代码解释执行与编译执行
    ApplePay苹果支付内购项目配置及代码实现及沙箱测试
  • 原文地址:https://www.cnblogs.com/lexus/p/2772315.html
Copyright © 2011-2022 走看看