zoukankan      html  css  js  c++  java
  • 汉字正则表达式解决方案

    原理: 将模式串与匹配串都转成unicode编码,再用正则。 可以用python完成,或者是用c++boost

    方案一,解析程序为C版本,中间调用python函数,在python函数中调用正则表达式进行解析。 本地可以运行,但是在hadoop集群上运行不了。

    方案二,采用boost wregex C++源码编译boost库。

    备注: cpp文件都UTF-8编码

    方案一代码:

    #-*-coding:UTF-8-*-
    import re;
    import sys;
    import time;
    def add(a, b):
        """Collect the first capture group of every match of pattern ``a`` in ``b``.

        Both arguments may be UTF-8 encoded byte strings or already-decoded
        text.  Matching is done on decoded text so that one Chinese character
        counts as one regex character.

        Returns the captured groups joined by a tab character as a native
        ``str`` (UTF-8 bytes on Python 2, text on Python 3).  An empty string
        is returned when nothing matches, when ``b`` is not valid UTF-8, or
        when the pattern has no capture group.

        Raises ``UnicodeDecodeError`` if ``a`` is not valid UTF-8 and
        ``re.error`` if it is not a valid regular expression.
        """
        def _decode(value):
            # ``bytes`` is an alias of ``str`` on Python 2, so this covers the
            # byte strings passed in by the embedding C program as well.
            if isinstance(value, bytes):
                return value.decode("UTF-8")
            return value

        pattern = re.compile(_decode(a))
        if pattern.groups < 1:
            # Without a capture group there is nothing to collect.
            return ""
        try:
            text = _decode(b)
        except UnicodeDecodeError:
            return ""

        # finditer steps past zero-width matches; restarting a manual search
        # at m.end() would never advance on such a match.
        captures = [m.group(1) or u"" for m in pattern.finditer(text)]
        joined = u"\t".join(captures)
        if str is bytes:
            # Python 2: hand back UTF-8 bytes, as the C caller expects.
            return joined.encode("UTF-8")
        return joined
    
    if __name__ == "__main__":
        # Manual check: run the pattern over a local sample file and print the
        # tab-separated captures.
        t = "<li><span>字义:</span>(.*?)</li>"
        # The context manager closes the file even if reading fails.
        with open("qiming2.txt", "r") as fid:
            s = fid.read()
        sys.stdout.write(add(t, s) + "\n")
    

      

    char line[102400]={0};
        char text[102400]={0};
        char pattern[200]={0};
        strcpy(pattern,t.c_str());
        while(fgets(line,102400,stdin))
        {
    
                //text.assign(line);
                //wstring wtext = String2Wstringx(t);
                //wstring::const_iterator  it=wtext.begin();
               // wstring::const_iterator  end=wtext.end();
                //while(boost::regex_search(it,end,wm,wreg))
               // {
               //     wstring wtemp=wm[1];
               //     string temp=Wstring2String(wtemp);
               //     results.push_back(temp);
               //     it=wm[1].second;
                //}
              strcat(text,line);
              strcat(text,"
    ");
        }
        //string t="刘[^刘]*?,";
        //wstring ws=String2Wstring(s);
        //cout<<p.size()<<endl;
        //cout<<ws.size()<<endl;
        //fprintf(stdout,"输出正则匹配结果
    ");
        //for(vector<string>::iterator it=results.begin();it!=results.end();it++)
        //{
        //    printf("%s
    ",(*it).c_str());
        //}
    
         
        Py_Initialize(); 
    
        // 检查初始化是否成功 
        if ( !Py_IsInitialized() ) 
        { 
            return -1; 
        } 
        // 添加当前路径 
        //把输入的字符串作为Python代码直接运行,返回0 
        //表示成功,-1表示有错。大多时候错误都是因为字符串 
        //中有语法错误。 
        PyRun_SimpleString("import sys"); 
        PyRun_SimpleString("sys.path.append('./')"); 
        PyObject *pName,*pModule,*pDict,*pFunc,*pArgs, *ret; 
    
        // 载入名为pytest的脚本 
        pName = PyString_FromString("pytest"); 
        pModule = PyImport_Import(pName); 
        if ( !pModule ) 
        { 
            printf("can't find pytest.py"); 
            return -1; 
        } 
        pDict = PyModule_GetDict(pModule); 
        if ( !pDict ) 
        { 
            return -1; 
        } 
    
        // 找出函数名为add的函数 
        pFunc = PyDict_GetItemString(pDict, "add"); 
        if ( !pFunc || !PyCallable_Check(pFunc) ) 
        { 
            printf("can't find function [add]"); 
            return -1; 
        } 
    
        // 参数进栈 
        *pArgs; 
        pArgs = PyTuple_New(2); 
    
        //  PyObject* Py_BuildValue(char *format, ...) 
        //  把C++的变量转换成一个Python对象。当需要从 
        //  C++传递变量到Python时,就会使用这个函数。此函数 
        //  有点类似C的printf,但格式不同。常用的格式有 
        //  s 表示字符串, 
        //  i 表示整型变量, 
        //  f 表示浮点数, 
        //  O 表示一个Python对象。 
    
        PyTuple_SetItem(pArgs, 0, Py_BuildValue("s",pattern)); 
        PyTuple_SetItem(pArgs, 1, Py_BuildValue("s",text)); 
        // 调用Python函数 
        ret=PyObject_CallObject(pFunc, pArgs); 
        char * str_ret = PyString_AsString(ret);
         printf("result:%s
    ", str_ret);
        Py_DECREF(pName); 
        Py_DECREF(pArgs); 
        Py_DECREF(pModule); 
        // 关闭Python 
        Py_Finalize(); 
        gettimeofday(&tv2, NULL);
        fprintf(stderr,"%s has finished congratulations!
    ",argv[0]);
        fprintf( stderr,"time elapsed: %.2f ms
    ", (float)((tv2.tv_sec - tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec))/1000);
        return 0;
    

      方案二代码:

    // please add your code here!
    #include <errno.h>
    #include <iconv.h>
    #include <locale.h>
    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/time.h>
    #include <time.h>
    #include <wchar.h>

    #include <iostream>
    #include <set>
    #include <string>
    #include <vector>

    #include <boost/regex.hpp>
    using namespace std;
    
    /*
       funcname: PrintUsage
       spec: writes a one-line usage summary to stderr
       parms: none
       returnValue: none
       author liuyu, 20120528
    */
    void PrintUsage()
    {
        // The program takes no command-line arguments: it reads stdin and
        // writes its results to stdout.
        fprintf( stderr, "usage: prog < input_file > output_file\n" );
    }
    /*
       Converts srclen bytes of srcstr from encoding fromCode to encoding
       toCode with iconv.

       parms:[IN]  toCode, fromCode  iconv encoding names
             [IN]  srcstr, srclen    input bytes and their count
             [OUT] deststr           receives the converted bytes plus a
                                     terminating 0; the caller must provide
                                     room for up to srclen*5+1 bytes
             [OUT] destlen           number of converted bytes written
       returnValue: 0 on success, 1 if the converter could not be opened or
                    the conversion failed. On a failed conversion the bytes
                    converted so far are still copied to deststr.
    */
    int toAnotherCode(const char *toCode,const char *fromCode,char *srcstr, char *deststr, size_t srclen,size_t &destlen)
    {
        iconv_t convertor=iconv_open(toCode,fromCode);
        if(convertor==iconv_t(-1))
        {
            fprintf(stderr,"convertor device initailization failed!\n");
            return 1;
        }

        // One spare byte keeps the buffer non-empty when srclen is 0.
        const size_t capacity=srclen*5;
        std::vector<char> buffer(capacity+1,'\0');

        // iconv advances the pointers and decrements the counters it is
        // given, so it works on local cursors.
        char *in=srcstr;
        size_t inleft=srclen;
        char *out=&buffer[0];
        size_t outleft=capacity;
        size_t rc=iconv(convertor,&in,&inleft,&out,&outleft);
        if(rc==size_t(-1))
        {
            // Diagnostics go to stderr, like the open failure above.
            fprintf(stderr,"errno=%d\n",errno);
        }
        iconv_close(convertor);

        destlen=capacity-outleft;
        memcpy(deststr,&buffer[0],destlen);
        deststr[destlen]=0;
        return rc==size_t(-1) ? 1 : 0;
    }
    /*
       Converts the multibyte string pszSrc to a newly allocated wide string.
       The process locale is set to zh_CN.UTF8 on every call.

       returnValue: a 0-terminated buffer allocated with new[]; the caller
                    releases it with delete[]. An empty string is returned
                    when pszSrc is NULL or cannot be converted.
    */
    wchar_t * MBs2WCs(const char* pszSrc){
        setlocale(LC_ALL, "zh_CN.UTF8");
        // mbstowcs reports an invalid sequence as (size_t)-1; keep the result
        // unsigned so the failure is not mistaken for a length.
        size_t size = (pszSrc != NULL) ? mbstowcs(NULL,pszSrc,0) : (size_t)-1;
        if (size == (size_t)-1)
        {
            wchar_t* empty = new wchar_t[1];
            empty[0] = L'\0';
            return empty;
        }
        wchar_t* pwcs = new wchar_t[size+1];
        size_t converted = mbstowcs(pwcs, pszSrc, size+1);
        pwcs[converted == (size_t)-1 ? 0 : converted] = L'\0';
        return pwcs;
     }
     
     /*
        Converts the wide string wcharStr to a newly allocated multibyte
        string. The process locale is set to zh_CN.UTF8 on every call.

        returnValue: a 0-terminated buffer allocated with new[]; the caller
                     releases it with delete[]. An empty string is returned
                     when wcharStr is NULL or cannot be converted.
     */
     char* WCs2MBs(const wchar_t * wcharStr){
        setlocale(LC_ALL, "zh_CN.UTF8");
        // wcstombs reports an unconvertible character as (size_t)-1; keep the
        // result unsigned so the failure is not mistaken for a length.
        size_t size = (wcharStr != NULL) ? wcstombs(NULL, wcharStr, 0) : (size_t)-1;
        if (size == (size_t)-1)
        {
            char* empty = new char[1];
            empty[0] = '\0';
            return empty;
        }
        char* str = new char[size + 1];
        size_t written = wcstombs(str, wcharStr, size + 1);
        str[written == (size_t)-1 ? 0 : written] = '\0';
        return str;
     }
    
    int main( int argc, char *argv[] )
    {
        timeval tv1, tv2;
        gettimeofday(&tv1, NULL); 
        if ( 1 != argc )
    	{
    		PrintUsage();
    
    		return 1;
    	}
        /*
        char *s="刘禹,刘德华,刘佳佳。。。王大虎。。。刘长春,xixi";
        char *t="(刘[^刘]*?),";
        wchar_t *ws =MBs2WCs(s);
        wchar_t *wt =MBs2WCs(t);
        wstring wstr1=ws;
        wstring wstr2=wt;
        boost::wregex wreg(wt,boost::regbase::icase|boost::regex::perl);
        boost::wsmatch wm;
        wstring::const_iterator  it=wstr1.begin();
        wstring::const_iterator  end=wstr1.end();
        while(boost::regex_search(it,end,wm,wreg))
            {
                    wstring wtemp=wm[1];
                    char* temp=WCs2MBs(wtemp.c_str());
                    printf("%s
    ",temp);
                    it=wm[0].second;
            }
          */
        char line[102400]={0};
        char text[102400]={0};
        char* t="<li><span>字义:</span>(.*?)</li>";
        wchar_t *wt =MBs2WCs(t);
        boost::wsmatch wm;
        boost::wregex wreg(wt,boost::regbase::icase|boost::regex::perl);
        while(fgets(line,102400,stdin))
        {
            strcat(text,line);
        }
        wchar_t * ws = MBs2WCs(text);
        wstring wtext=ws;
        wstring::const_iterator  it=wtext.begin();
        wstring::const_iterator  end=wtext.end();
        vector<string> results;
        while(boost::regex_search(it,end,wm,wreg))
        {
                    wstring wtemp=wm[1];
                    char* temp=WCs2MBs(wtemp.c_str());
                    results.push_back(temp);
                    it=wm[1].second;
        }
        for (vector<string>::iterator it = results.begin(); it!=results.end(); it++)
        {
            fprintf(stdout,"%s
    ",(*it).c_str());
        }
        gettimeofday(&tv2, NULL);
        fprintf(stderr,"%s has finished congratulations!
    ",argv[0]);
        fprintf( stderr,"time elapsed: %.2f ms
    ", (float)((tv2.tv_sec - tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec))/1000);
    	return 0;
    }
    

      方案一的编译方法:

     g++ Python.cpp -o Python -I/usr/include/python2.5 -L/usr/lib/python2.5 -lpython2.5

  • 相关阅读:
    springmvc+mybatis多数据源切换
    Tomcat 8.5 配置自动从http跳转https
    Tomcat 8.5 配置 域名绑定
    本地测试Tomcat配置Https访问
    Spring boot
    解决IDEA16闪退的问题
    cef
    spring-boot学习资料
    oracle 表空间不足解决办法
    oracle导出表的办法
  • 原文地址:https://www.cnblogs.com/finallyliuyu/p/4724404.html
Copyright © 2011-2022 走看看