zoukankan      html  css  js  c++  java
  • 跨平台Unicode与UTF8互转代码

    参考来源:http://blog.csdn.net/flying8127/article/details/1598521

    在原来的基础上, 对代码进行了整理, 并加强了安全性; 同时参照 Windows API 的设计, 添加了输出缓冲区长度探测功能.

    当OutUTFString为NULL时, 可以进行输出的UTF8字符串长度探测

       1:  uint32 UniCharToUTF8(wchar_t UniChar, char *OutUTFString)
       2:      {
       3:   
       4:          uint32 UTF8CharLength = 0;
       5:   
       6:          if (UniChar < 0x80)
       7:          {  
       8:              if ( OutUTFString )
       9:                  OutUTFString[UTF8CharLength++] = (char)UniChar;
      10:              else
      11:                  UTF8CharLength++;
      12:          }
      13:          else if(UniChar < 0x800)
      14:          {
      15:              if ( OutUTFString )
      16:              {
      17:                  OutUTFString[UTF8CharLength++] = 0xc0 | ( UniChar >> 6 );
      18:                  OutUTFString[UTF8CharLength++] = 0x80 | ( UniChar & 0x3f );
      19:              }
      20:              else
      21:              {
      22:                  UTF8CharLength += 2;
      23:              }
      24:          }
      25:          else if(UniChar < 0x10000 )
      26:          {
      27:              if ( OutUTFString )
      28:              {
      29:                  OutUTFString[UTF8CharLength++] = 0xe0 | ( UniChar >> 12 );
      30:                  OutUTFString[UTF8CharLength++] = 0x80 | ( (UniChar >> 6) & 0x3f );
      31:                  OutUTFString[UTF8CharLength++] = 0x80 | ( UniChar & 0x3f );
      32:              }
      33:              else
      34:              {
      35:                  UTF8CharLength += 3;
      36:              }
      37:          }
      38:          else if( UniChar < 0x200000 ) 
      39:          {
      40:              if ( OutUTFString )
      41:              {
      42:                  OutUTFString[UTF8CharLength++] = 0xf0 | ( (int)UniChar >> 18 );
      43:                  OutUTFString[UTF8CharLength++] = 0x80 | ( (UniChar >> 12) & 0x3f );
      44:                  OutUTFString[UTF8CharLength++] = 0x80 | ( (UniChar >> 6) & 0x3f );
      45:                  OutUTFString[UTF8CharLength++] = 0x80 | ( UniChar & 0x3f );
      46:              }
      47:              else
      48:              {
      49:                  UTF8CharLength += 4;
      50:              }
      51:   
      52:          }
      53:   
      54:          return UTF8CharLength;
      55:      }

    当OutUnicodeString为NULL时, 可以进行输出的Unicode字符串长度探测

       1:  uint32 UTF8StrToUnicode( const char* UTF8String, uint32 UTF8StringLength, wchar_t* OutUnicodeString, uint32 UnicodeStringBufferSize )
       2:      {
       3:          uint32 UTF8Index = 0;
       4:          uint32 UniIndex = 0;
       5:   
       6:          while ( UTF8Index < UTF8StringLength )
       7:          {
       8:              unsigned char UTF8Char = UTF8String[UTF8Index];
       9:   
      10:              if ( UnicodeStringBufferSize != 0 && UniIndex >= UnicodeStringBufferSize )
      11:                  break;
      12:   
      13:              if ((UTF8Char & 0x80) == 0) 
      14:              {
      15:                  const uint32 cUTF8CharRequire = 1;
      16:   
      17:                  // UTF8字码不足
      18:                  if ( UTF8Index + cUTF8CharRequire > UTF8StringLength )
      19:                      break;
      20:   
      21:                  if ( OutUnicodeString )
      22:                  {
      23:                      wchar_t& WideChar = OutUnicodeString[UniIndex]; 
      24:   
      25:                      WideChar = UTF8Char;
      26:                  }
      27:   
      28:                  UTF8Index++;
      29:                  
      30:              } 
      31:              else if((UTF8Char & 0xE0) == 0xC0)  ///< 110x-xxxx 10xx-xxxx
      32:              {
      33:                  const uint32 cUTF8CharRequire = 2;
      34:   
      35:                  // UTF8字码不足
      36:                  if ( UTF8Index + cUTF8CharRequire > UTF8StringLength )
      37:                      break;
      38:   
      39:                  if ( OutUnicodeString )
      40:                  {
      41:                      wchar_t& WideChar = OutUnicodeString[UniIndex]; 
      42:                      WideChar  = (UTF8String[UTF8Index + 0] & 0x3F) << 6;
      43:                      WideChar |= (UTF8String[UTF8Index + 1] & 0x3F);
      44:                  }
      45:                  
      46:                  UTF8Index += cUTF8CharRequire;
      47:              }
      48:              else if((UTF8Char & 0xF0) == 0xE0)  ///< 1110-xxxx 10xx-xxxx 10xx-xxxx
      49:              {
      50:                  const uint32 cUTF8CharRequire = 3;
      51:   
      52:                  // UTF8字码不足
      53:                  if ( UTF8Index + cUTF8CharRequire > UTF8StringLength )
      54:                      break;
      55:   
      56:                  if ( OutUnicodeString )
      57:                  {
      58:                      wchar_t& WideChar = OutUnicodeString[UniIndex]; 
      59:   
      60:                      WideChar  = (UTF8String[UTF8Index + 0] & 0x1F) << 12;
      61:                      WideChar |= (UTF8String[UTF8Index + 1] & 0x3F) << 6;
      62:                      WideChar |= (UTF8String[UTF8Index + 2] & 0x3F);
      63:                  }
      64:                  
      65:   
      66:                  UTF8Index += cUTF8CharRequire;
      67:              } 
      68:              else if((UTF8Char & 0xF8) == 0xF0)  ///< 1111-0xxx 10xx-xxxx 10xx-xxxx 10xx-xxxx 
      69:              {
      70:                  const uint32 cUTF8CharRequire = 4;
      71:   
      72:                  // UTF8字码不足
      73:                  if ( UTF8Index + cUTF8CharRequire > UTF8StringLength )
      74:                      break;
      75:   
      76:                  if ( OutUnicodeString )
      77:                  {
      78:                      wchar_t& WideChar = OutUnicodeString[UniIndex]; 
      79:   
      80:                      WideChar  = (UTF8String[UTF8Index + 0] & 0x0F) << 18;
      81:                      WideChar  = (UTF8String[UTF8Index + 1] & 0x3F) << 12;
      82:                      WideChar |= (UTF8String[UTF8Index + 2] & 0x3F) << 6;
      83:                      WideChar |= (UTF8String[UTF8Index + 3] & 0x3F);
      84:                  }
      85:   
      86:                  UTF8Index += cUTF8CharRequire;
      87:              } 
      88:              else ///< 1111-10xx 10xx-xxxx 10xx-xxxx 10xx-xxxx 10xx-xxxx 
      89:              {
      90:                  const uint32 cUTF8CharRequire = 5;
      91:   
      92:                  // UTF8字码不足
      93:                  if ( UTF8Index + cUTF8CharRequire > UTF8StringLength )
      94:                      break;
      95:   
      96:                  if ( OutUnicodeString )
      97:                  {
      98:                      wchar_t& WideChar = OutUnicodeString[UniIndex]; 
      99:   
     100:                      WideChar  = (UTF8String[UTF8Index + 0] & 0x07) << 24;
     101:                      WideChar  = (UTF8String[UTF8Index + 1] & 0x3F) << 18;
     102:                      WideChar  = (UTF8String[UTF8Index + 2] & 0x3F) << 12;
     103:                      WideChar |= (UTF8String[UTF8Index + 3] & 0x3F) << 6;
     104:                      WideChar |= (UTF8String[UTF8Index + 4] & 0x3F);
     105:                  }
     106:   
     107:                  UTF8Index += cUTF8CharRequire;
     108:              }
     109:   
     110:   
     111:              UniIndex++;
     112:          }
     113:   
     114:          return UniIndex;
     115:      }

    疗效: 用了此代码啊, 再也不用被iconv折磨了

  • 相关阅读:
    在 Mac OS X 上安装 TensorFlow
    用序列到序列和注意模型实现的:Translation with a Sequence to Sequence Network and Attention
    PyTorch 实战-张量
    PyTorch 实战-用 Numpy 热身
    tf.nn.embedding_lookup TensorFlow embedding_lookup 函数最简单实例
    叩响秋雨梧桐的大门——2018中考之后
    完全背包——01背包方法数
    dp——01背包
    图论最短路——dijkstra
    图论最短路——spfa
  • 原文地址:https://www.cnblogs.com/lidabo/p/3903629.html
Copyright © 2011-2022 走看看