zoukankan      html  css  js  c++  java
  • python 正则表达式匹配中文

    #!/usr/bin/python
    # -*- coding: cp936 -*-

    #思路,将str转换成unicode,方可用正则表达式,前提是,要知道文件的编码,本例中是gbk
    import cPickle as mypickle
    import re
    import sys
    # Python 2 script: counts how often each family-name character and each
    # given-name character occurs among the two-character Chinese names listed
    # in 'above50purenames.txt' (GBK encoded), then writes both frequency
    # tables, most frequent first, to 'familyname.txt' and 'firstname.txt'.


    def _write_counts(path, items):
        """Write (name, count) pairs to path as 'name<TAB>count' lines.

        name is expected to be a byte string (the caller passes GBK-encoded
        characters); count is converted with str().
        """
        # The with-block closes the file even if a write fails part-way.
        with open(path, 'w') as out:
            for name, count in items:
                out.write(name + '\t' + str(count) + '\n')


    if __name__ == '__main__':
        # Strips leading and trailing whitespace, including the line ending.
        edge_space = re.compile(r'(^\s+|\s+$)')
        # The pattern must be a unicode literal (note the u prefix) so the
        # range is interpreted as code points rather than as raw bytes.
        phanzi = re.compile(u'[\u4e00-\u9fa5]')

        # Read everything up front; the with-block guarantees the handle is
        # released even if reading raises.
        with open('above50purenames.txt', 'r') as fid1:
            commlines = fid1.readlines()

        dictfamilyname = {}
        dictfirstname = {}
        for line in commlines:
            line = edge_space.sub('', line)
            print(type(line))
            print(line)
            # The regex only works on unicode text, so decode the GBK bytes
            # first. A line that is not valid GBK raises UnicodeDecodeError.
            uline = unicode(line, 'gbk')
            print(type(uline))
            candidates = phanzi.findall(uline)

            print(len(candidates))
            # Only lines with exactly two Han characters are counted: the
            # first is taken as the family name, the second as the given name.
            if len(candidates) == 2:
                print(candidates[0])
                # Encode back to GBK byte strings so the keys can be written
                # to the output files unchanged.
                familynamegbk = candidates[0].encode('gbk')
                firstnamegbk = candidates[1].encode('gbk')
                dictfamilyname[familynamegbk] = dictfamilyname.get(familynamegbk, 0) + 1
                dictfirstname[firstnamegbk] = dictfirstname.get(firstnamegbk, 0) + 1

        familynameitems = list(dictfamilyname.items())
        print(familynameitems)
        firstnameitems = list(dictfirstname.items())
        # Stable sort, highest count first; ties keep the dict's item order.
        familynameitems.sort(key=lambda d: d[1], reverse=True)
        firstnameitems.sort(key=lambda d: d[1], reverse=True)

        _write_counts('familyname.txt', familynameitems)
        _write_counts('firstname.txt', firstnameitems)
        print('finish')
       

  • 相关阅读:
    redis学习
    Ubuntu命令大全
    关于jquery中attr和prop的用法
    Ubuntu下修改为永久DNS的方法
    Yii2 behaviors中verbs access的一些理解
    vue_ form表单 v-model
    vue-one_demo_music
    ES6
    VUE 入门 01
    Django model.py表单设置默认值允许为空
  • 原文地址:https://www.cnblogs.com/finallyliuyu/p/2340685.html
Copyright © 2011-2022 走看看