zoukankan      html  css  js  c++  java
  • 用python做含有中文的正则表达式模式匹配

    #!/usr/bin/python
    # -*- coding: gbk -*-
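    '''
    spec: split usrdict into two parts according to whether each entry hits
          one of the 1.26 million person names
    parms:
    [IN]  keywords.txt    person-name keys, one per line
    [IN]  file.txt        usrdict records: 4 tab-separated fields per line
    [OUT] matched.txt     records whose first field starts or ends with a key
                          (the key is appended as an extra field)
    [OUT] notmatched.txt  records that hit no key ("_" is appended)
    [OUT] overflow file   records whose 3rd or 4th field is <= 0
    author: liuyusi0121@sogou-inc.com date 20120808
    '''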
    import re;
    import sys;
    def LoadKeys(filename):
        '''
        Load the keyword list from a text file into memory.

        Each line of the file holds one key. Leading and trailing whitespace
        (including the line terminator) is stripped. Lines that are empty
        after stripping are skipped, because an empty key would later be
        expanded into the pattern "(^|$)", which matches every record.

        filename: path of the keyword file, one key per line.
        returns:  list of the non-empty, stripped keys in file order.
        raises:   IOError (OSError on Python 3) if the file cannot be read.
        '''
        keys = []
        # open() replaces the Python-2-only file() builtin, and the with-block
        # closes the handle even when reading fails part-way through.
        with open(filename, "r") as fid:
            # Iterate lazily instead of readlines() so the raw lines are not
            # held in memory alongside the stripped copies.
            for line in fid:
                key = line.strip()
                if key:
                    keys.append(key)
        return keys
    def PrintUsage():
        '''
        Print the command-line synopsis to standard output and terminate.

        raises: SystemExit with status 1, always.
        '''
        # A parenthesised single argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('program [IN] keywords.txt [IN]file.txt [OUT] matched.txt [OUT] notmatched.txt [OUT] ufuwfoverflow')
        # sys.exit() is always available; the bare exit() helper is injected by
        # the site module and is absent under "python -S" or in frozen builds.
        sys.exit(1)

    def _DecodeKeys(keys, encoding="gbk"):
        '''
        Decode every key once, up front, instead of once per input record.

        keys:     list of raw (byte-string) keys as returned by LoadKeys.
        encoding: codec the keys are stored in.
        returns:  list of (raw_key, decoded_key) pairs in the original order.
                  Empty keys are dropped because they would match every
                  record; keys that are not valid in `encoding` are dropped
                  because they can never match decoded text.
        '''
        decoded = []
        for key in keys:
            if not key:
                continue
            try:
                decoded.append((key, key.decode(encoding)))
            except UnicodeDecodeError:
                # Same outcome as before: an undecodable key is ignored.
                pass
        return decoded

    def _FindKey(useg, ukeys):
        '''
        Return the first key that is a prefix or a suffix of useg.

        Keys are compared as literal text. They used to be spliced unescaped
        into a regular expression, so a key containing a metacharacter could
        raise re.error and silently end the scan for that record.

        useg:    decoded text of the first field of a record.
        ukeys:   list of (raw_key, decoded_key) pairs in priority order.
        returns: raw_key of the first matching pair, or None if none matches.
        '''
        count = 0
        for key, ukey in ukeys:
            if 0 == count % 100000:
                print('模式已经扫描%d个' % count)
            count += 1
            if useg.startswith(ukey) or useg.endswith(ukey):
                return key
        return None

    def _SplitRecords(fid, ukeys, fout1, fout2, fout3, delim="\t"):
        '''
        Route every record read from fid to one of the three outputs.

        fid:   readable file of records, one per line, 4 tab-separated fields.
        ukeys: (raw_key, decoded_key) pairs as produced by _DecodeKeys.
        fout1: receives records whose first field starts or ends with a key;
               the matching key is appended as a fifth field.
        fout2: receives records that hit no key (or whose first field is not
               valid GBK); "_" is appended as a fifth field.
        fout3: receives records whose third or fourth field is <= 0, unchanged.
        delim: separator used when re-joining the output fields.

        Blank lines and lines without exactly 4 fields are dropped.
        raises: ValueError if the third or fourth field is not an integer
                (unchanged from the original behaviour).
        '''
        linecount = 0
        for line in fid:
            line = line.strip()
            if not line:
                continue
            if 0 == linecount % 100000:
                print('语料已经处理%d行' % linecount)
            # The increment used to sit inside the progress check above, so the
            # counter stuck at 1 and progress was reported only once.
            linecount += 1
            linesegs = line.split("\t")
            if 4 != len(linesegs):
                continue
            if int(linesegs[2]) <= 0 or int(linesegs[3]) <= 0:
                fout3.write(line)
                fout3.write("\n")
                continue
            # Only a decoding failure is tolerated here. The former bare
            # "except: pass" also hid I/O errors and KeyboardInterrupt.
            try:
                useg = linesegs[0].decode('gbk')
            except UnicodeDecodeError:
                key = None
            else:
                key = _FindKey(useg, ukeys)
            if key is None:
                linesegs.append("_")
                target = fout2
            else:
                print(line)
                linesegs.append(key)
                target = fout1
            target.write(delim.join(linesegs))
            target.write("\n")

    if(__name__=="__main__"):
        # Usage: program keywords.txt file.txt matched.txt notmatched.txt overflow
        delim = "\t"
        if len(sys.argv) != 6:
            PrintUsage()
        keyfile = sys.argv[1]
        keys = LoadKeys(keyfile)
        print(len(keys))
        inputfile = sys.argv[2]
        outputfile1 = sys.argv[3]
        outputfile2 = sys.argv[4]
        outputfile3 = sys.argv[5]
        # Hoisted out of the record loop: previously every key was decoded and
        # compiled into a regex again for every single input line.
        ukeys = _DecodeKeys(keys)
        # The with-statement closes all four files even if processing raises.
        # The outputs are still opened before the input, as before.
        with open(outputfile1, 'w') as fout1, \
             open(outputfile2, 'w') as fout2, \
             open(outputfile3, 'w') as fout3, \
             open(inputfile, "r") as fid:
            _SplitRecords(fid, ukeys, fout1, fout2, fout3, delim)
  • 相关阅读:
    670. Maximum Swap
    653. Two Sum IV
    639. Decode Ways II
    636. Exclusive Time of Functions
    621. Task Scheduler
    572. Subtree of Another Tree
    554. Brick Wall
    543. Diameter of Binary Tree
    535. Encode and Decode TinyURL
    博客园自定义背景图片
  • 原文地址:https://www.cnblogs.com/finallyliuyu/p/2629020.html
Copyright © 2011-2022 走看看