zoukankan      html  css  js  c++  java
  • 字符串UTF8和GBK之间的转换以及判定

    一、判定字符串是否是UTF-8的编码

    // Checks whether a NUL-terminated byte string is structurally valid UTF-8.
    //
    // The check follows the original 1-6 byte UTF-8 layout: a lead byte announces
    // the length of the sequence and must be followed by exactly (length - 1)
    // continuation bytes of the form 10xxxxxx. Pure ASCII (including the empty
    // string) is accepted. Overlong encodings, surrogates and code points above
    // U+10FFFF are NOT rejected; this is a shape check only.
    //
    // Parameters:
    //   str - NUL-terminated input; a null pointer is reported as not UTF-8.
    // Returns:
    //   true if every byte belongs to a complete, well-formed sequence.
    bool is_str_utf8(const char* str)
    {
        if (!str)
        {
            return false;
        }

        // Number of continuation bytes still owed by the current sequence.
        unsigned int pending = 0;

        for (const unsigned char* p = reinterpret_cast<const unsigned char*>(str); *p != 0; ++p)
        {
            const unsigned char chr = *p;

            if (pending != 0)
            {
                // Inside a multi-byte sequence: only 10xxxxxx is allowed.
                if ((chr & 0xC0) != 0x80)
                {
                    return false;
                }
                --pending;
                continue;
            }

            if (chr < 0x80)
            {
                continue; // plain ASCII, 0xxxxxxx
            }

            // Lead byte: derive how many continuation bytes must follow.
            if (chr >= 0xFE)
            {
                // 0xFE and 0xFF never appear in UTF-8; previously they were
                // mistaken for 5-byte lead bytes.
                return false;
            }
            else if (chr >= 0xFC)
            {
                pending = 5; // 1111110x
            }
            else if (chr >= 0xF8)
            {
                pending = 4; // 111110xx
            }
            else if (chr >= 0xF0)
            {
                pending = 3; // 11110xxx
            }
            else if (chr >= 0xE0)
            {
                pending = 2; // 1110xxxx
            }
            else if (chr >= 0xC0)
            {
                pending = 1; // 110xxxxx
            }
            else
            {
                // 10xxxxxx without a preceding lead byte.
                return false;
            }
        }

        // A sequence cut off by the terminator violates the encoding rules.
        return pending == 0;
    }

    二、判定字符串是否是GBK的编码

    // Checks whether a NUL-terminated byte string fits the GBK byte layout.
    //
    // GBK uses one byte for ASCII and two bytes for everything else:
    //   lead byte  0x81-0xFE
    //   trail byte 0x40-0xFE, excluding 0x7F
    // Pure ASCII (including the empty string) is accepted. This is a byte-range
    // check only: a true result does not prove the text is GBK, because byte
    // strings in other encodings can fall inside the same ranges.
    //
    // Parameters:
    //   str - NUL-terminated input; a null pointer is reported as not GBK.
    // Returns:
    //   true if every byte is ASCII or part of a complete lead/trail pair.
    bool is_str_gbk(const char* str)
    {
        if (!str)
        {
            return false;
        }

        // True while the previous byte was a lead byte awaiting its trail byte.
        bool expectTrail = false;

        for (const unsigned char* p = reinterpret_cast<const unsigned char*>(str); *p != 0; ++p)
        {
            const unsigned char chr = *p;

            if (expectTrail)
            {
                // 0x7F is not a valid GBK trail byte; it used to be accepted.
                if (chr < 0x40 || chr == 0x7F || chr > 0xFE)
                {
                    return false;
                }
                expectTrail = false;
            }
            else if (chr >= 0x80)
            {
                // Non-ASCII byte must open a two-byte character.
                if (chr < 0x81 || chr > 0xFE)
                {
                    return false;
                }
                expectTrail = true;
            }
        }

        // A lead byte cut off by the terminator violates the encoding rules.
        return !expectTrail;
    }

    三、字符串由GBK编码转换成UTF-8编码

    // Re-encodes a CString in place from the system ANSI code page to UTF-8.
    //
    // The source encoding is CP_ACP, so the input is treated as GBK only on
    // systems whose ANSI code page is 936. The text goes ANSI -> UTF-16 -> UTF-8.
    // If either conversion step fails, strGBK is left unchanged.
    //
    // NOTE(review): the (LPCTSTR) cast is passed where the API expects a narrow
    // string, so this presumably targets a non-UNICODE (MBCS) build - confirm.
    //
    // Parameters:
    //   strGBK - in/out; ANSI text on entry, UTF-8 bytes on successful return.
    void ConvertGBKToUtf8(CString &strGBK)
    {
        // Stage 1: ANSI -> UTF-16. The size query includes the terminator.
        const int wideLen = MultiByteToWideChar(CP_ACP, 0, (LPCTSTR)strGBK, -1, NULL, 0);
        if (wideLen <= 0)
        {
            return;
        }
        std::wstring wide(static_cast<std::wstring::size_type>(wideLen), L'\0');
        if (MultiByteToWideChar(CP_ACP, 0, (LPCTSTR)strGBK, -1, &wide[0], wideLen) <= 0)
        {
            return;
        }

        // Stage 2: UTF-16 -> UTF-8. Again the size includes the terminator.
        const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
        if (utf8Len <= 0)
        {
            return;
        }
        std::string utf8(static_cast<std::string::size_type>(utf8Len), '\0');
        if (WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, &utf8[0], utf8Len, NULL, NULL) <= 0)
        {
            return;
        }

        strGBK = utf8.c_str();
    }
    
    
    // Converts a NUL-terminated string from the system ANSI code page to UTF-8.
    //
    // The source encoding is CP_ACP, so the input is treated as GBK only on
    // systems whose ANSI code page is 936. The text goes ANSI -> UTF-16 -> UTF-8.
    //
    // Parameters:
    //   strGBK - NUL-terminated ANSI text; may be null.
    // Returns:
    //   The UTF-8 encoding of the input, or an empty string if the input is
    //   null or either conversion step fails.
    string GBKToUTF8(const char* strGBK)
    {
        if (strGBK == NULL)
        {
            return string();
        }

        // Stage 1: ANSI -> UTF-16. The size query includes the terminator.
        const int wideLen = MultiByteToWideChar(CP_ACP, 0, strGBK, -1, NULL, 0);
        if (wideLen <= 0)
        {
            return string();
        }
        std::wstring wide(static_cast<std::wstring::size_type>(wideLen), L'\0');
        if (MultiByteToWideChar(CP_ACP, 0, strGBK, -1, &wide[0], wideLen) <= 0)
        {
            return string();
        }

        // Stage 2: UTF-16 -> UTF-8. Again the size includes the terminator.
        const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
        if (utf8Len <= 0)
        {
            return string();
        }
        std::string utf8(static_cast<std::string::size_type>(utf8Len), '\0');
        if (WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, &utf8[0], utf8Len, NULL, NULL) <= 0)
        {
            return string();
        }

        // Build from the C string so the trailing terminator is not included.
        return string(utf8.c_str());
    }

    四、字符串由UTF-8编码转换成GBK编码

    // Converts a NUL-terminated UTF-8 string to the system ANSI code page.
    //
    // The target encoding is CP_ACP, so the result is GBK only on systems whose
    // ANSI code page is 936. The text goes UTF-8 -> UTF-16 -> ANSI. Buffers are
    // owned by string objects, so nothing is leaked (the previous version never
    // released its output buffer).
    //
    // Parameters:
    //   utf8 - NUL-terminated UTF-8 text; may be null.
    // Returns:
    //   The ANSI encoding of the input, or an empty string if the input is
    //   null or either conversion step fails.
    string UtfToGbk(const char* utf8)
    {
        if (utf8 == NULL)
        {
            return string();
        }

        // Stage 1: UTF-8 -> UTF-16. The size query includes the terminator.
        const int wideLen = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
        if (wideLen <= 0)
        {
            return string();
        }
        std::wstring wide(static_cast<std::wstring::size_type>(wideLen), L'\0');
        if (MultiByteToWideChar(CP_UTF8, 0, utf8, -1, &wide[0], wideLen) <= 0)
        {
            return string();
        }

        // Stage 2: UTF-16 -> ANSI. Again the size includes the terminator.
        const int ansiLen = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
        if (ansiLen <= 0)
        {
            return string();
        }
        std::string ansi(static_cast<std::string::size_type>(ansiLen), '\0');
        if (WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, &ansi[0], ansiLen, NULL, NULL) <= 0)
        {
            return string();
        }

        // Build from the C string so the trailing terminator is not included.
        return string(ansi.c_str());
    }
    
    // Converts UTF-8 text to GBK using the C locale conversion functions.
    //
    // The text goes UTF-8 -> wide characters (under locale "zh_CN.utf8") ->
    // GBK (under locale "zh_CN.gbk"), so both locales must be installed.
    //
    // Side effect: the process-wide LC_ALL locale is changed and NOT restored;
    // after a successful call it is "zh_CN.gbk".
    //
    // Parameters:
    //   gbkStr - receives the GBK bytes on success; untouched on failure.
    //   srcStr - UTF-8 input; only the part before the first NUL is converted.
    // Returns:
    //   true on success. false if a locale cannot be set, if the input is
    //   empty, or if either conversion fails or yields an empty result.
    bool Utf82gbk(std::string &gbkStr, std::string &srcStr)
    {
        const std::size_t kConvError = static_cast<std::size_t>(-1);

        // Stage 1: UTF-8 -> wide characters; the locale names the source encoding.
        if (NULL == setlocale(LC_ALL, "zh_CN.utf8"))
        {
            printf("Bad Parameter\n");
            return false;
        }

        // Size query: number of wide characters, excluding the terminator.
        const std::size_t wideLen = mbstowcs(NULL, srcStr.c_str(), 0);
        if (wideLen == kConvError || wideLen == 0)
        {
            printf("Can not Transfer!!!\n");
            return false;
        }
        // Owned buffer with room for the terminator; freed on every return path.
        std::wstring wide(wideLen + 1, L'\0');
        mbstowcs(&wide[0], srcStr.c_str(), wideLen + 1);

        // Stage 2: wide characters -> GBK; the locale names the target encoding.
        if (NULL == setlocale(LC_ALL, "zh_CN.gbk"))
        {
            printf("Bad Parameter\n");
            return false;
        }

        // Size query: number of GBK bytes, excluding the terminator.
        const std::size_t gbkLen = wcstombs(NULL, wide.c_str(), 0);
        if (gbkLen == kConvError || gbkLen == 0)
        {
            printf("Can not Transfer!!!\n");
            return false;
        }
        // Sized from the query, so long inputs can no longer overrun a fixed buffer.
        std::string gbk(gbkLen + 1, '\0');
        wcstombs(&gbk[0], wide.c_str(), gbkLen + 1);
        gbk.resize(gbkLen); // drop the terminator slot

        gbkStr = gbk;
        return true;
    }
    
    
    // Converts a UTF-8 std::string to the system ANSI code page.
    //
    // The target encoding is CP_ACP, so the result is GBK only on systems whose
    // ANSI code page is 936. The text goes UTF-8 -> UTF-16 -> ANSI. Buffers are
    // owned by string objects, so an allocation failure part-way through cannot
    // leak the first buffer.
    //
    // Parameters:
    //   strUTF8 - UTF-8 input; only the part before the first NUL is converted.
    // Returns:
    //   The ANSI encoding of the input, or an empty string if either
    //   conversion step fails.
    string UTF8ToGBK(const std::string& strUTF8)
    {
        const char* const src = strUTF8.c_str();

        // Stage 1: UTF-8 -> UTF-16. The size query includes the terminator.
        const int wideLen = MultiByteToWideChar(CP_UTF8, 0, src, -1, NULL, 0);
        if (wideLen <= 0)
        {
            return string();
        }
        std::wstring wide(static_cast<std::wstring::size_type>(wideLen), L'\0');
        if (MultiByteToWideChar(CP_UTF8, 0, src, -1, &wide[0], wideLen) <= 0)
        {
            return string();
        }

        // Stage 2: UTF-16 -> ANSI. Again the size includes the terminator.
        const int ansiLen = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
        if (ansiLen <= 0)
        {
            return string();
        }
        std::string ansi(static_cast<std::string::size_type>(ansiLen), '\0');
        if (WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, &ansi[0], ansiLen, NULL, NULL) <= 0)
        {
            return string();
        }

        // Build from the C string so the trailing terminator is not included.
        return string(ansi.c_str());
    }
  • 相关阅读:
    数据字典的应用实例
    数据字典动态性能表(视图)
    MySQL exists的用法介绍
    Don’t Assume – Per Session Buffers
    MySQL 5.5: InnoDB Change Buffering
    Fixing Poor MySQL Default Configuration Values
    A quest for the full InnoDB status
    MySQL : interactive_timeout v/s wait_timeout
    Mydumper & Myloader Documentation
    InnoDB Plugin文件格式(概述)
  • 原文地址:https://www.cnblogs.com/Toney-01-22/p/9935297.html
Copyright © 2011-2022 走看看