zoukankan      html  css  js  c++  java
  • 一个高效过滤非UTF8字符的C函数(也可用来判断是否utf8)

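    /*
    UTF-8 byte-sequence formats accepted by filter_none_utf8_chars:
    0xxxxxxx
    110xxxxx 10xxxxxx
    1110xxxx 10xxxxxx 10xxxxxx
    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

    Note: the 5- and 6-byte forms come from the original UTF-8 definition
    (RFC 2279). RFC 3629 restricts UTF-8 to at most 4 bytes, so strict
    validators reject them. Only the bit patterns above are checked;
    overlong encodings and surrogate code points are not detected.
    */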
    /*
     * Return the total length in bytes (2..6) of the multi-byte sequence that
     * starts at p, or 0 if p does not start a structurally well-formed one.
     *
     * p   - points at a byte >= 0x80 inside the buffer (p < end).
     * end - one past the last byte of the buffer.
     *
     * A sequence is accepted when its lead byte matches one of the patterns
     * 110xxxxx .. 1111110x, the whole sequence fits before end, and every
     * trailing byte has the form 10xxxxxx.
     */
    static int utf8_multibyte_len(const unsigned char *p, const unsigned char *end)
    {
            int trailing;
            int i;

            if ((*p & 0xE0) == 0xC0)        /* 110xxxxx */
            {
                    trailing = 1;
            }
            else if ((*p & 0xF0) == 0xE0)   /* 1110xxxx */
            {
                    trailing = 2;
            }
            else if ((*p & 0xF8) == 0xF0)   /* 11110xxx */
            {
                    trailing = 3;
            }
            else if ((*p & 0xFC) == 0xF8)   /* 111110xx */
            {
                    trailing = 4;
            }
            else if ((*p & 0xFE) == 0xFC)   /* 1111110x */
            {
                    trailing = 5;
            }
            else                            /* stray continuation byte, 0xFE or 0xFF */
            {
                    return 0;
            }

            /*
             * Compare distances instead of forming p + 1 + trailing, which could
             * point past one-past-the-end of the buffer (undefined behaviour).
             */
            if (end - p <= trailing)
            {
                    return 0;
            }

            for (i = 1; i <= trailing; i++)
            {
                    if ((p[i] & 0xC0) != 0x80)
                    {
                            return 0;
                    }
            }

            return trailing + 1;
    }

    /*
     * Remove every byte that is not part of a structurally well-formed UTF-8
     * sequence from the buffer src, in place.
     *
     * src - buffer to filter; modified in place. May be NULL (returned as is).
     * len - in: number of bytes in src to examine; out: number of bytes kept.
     *       May be NULL or point to a value <= 0, in which case nothing is done.
     *
     * Returns src.
     *
     * When nothing is removed, neither the buffer nor *len is touched, so the
     * function doubles as a validity check: the input was fully well-formed
     * if and only if *len is unchanged on return. When at least one byte is
     * removed, *len is updated and a terminating '\0' is stored at src[*len];
     * that position is always inside the original *len bytes.
     *
     * An invalid lead byte is dropped on its own and scanning resumes at the
     * next byte, so orphaned continuation bytes are dropped one by one too.
     * Accepted sequences are ASCII plus the 2- to 6-byte lead/continuation bit
     * patterns; overlong encodings and surrogates are not detected.
     *
     * The filtered text is never longer than the input, so compaction happens
     * directly in src with no temporary allocation and no failure path.
     */
    char *filter_none_utf8_chars(char *src, int *len)
    {
            unsigned char *read_pos;
            unsigned char *write_pos;
            unsigned char *end;
            int seq_len;

            if (src == NULL || len == NULL || *len <= 0)
            {
                    return src;
            }

            read_pos = (unsigned char *)src;
            write_pos = read_pos;
            end = read_pos + *len;

            while (read_pos < end)
            {
                    if (*read_pos < 0x80)
                    {
                            seq_len = 1;
                    }
                    else
                    {
                            seq_len = utf8_multibyte_len(read_pos, end);
                    }

                    if (seq_len == 0)
                    {
                            /* Drop just this byte; the next one is re-examined. */
                            read_pos++;
                            continue;
                    }

                    /* Until the first invalid byte is seen both cursors coincide. */
                    if (write_pos != read_pos)
                    {
                            memmove(write_pos, read_pos, (size_t)seq_len);
                    }
                    write_pos += seq_len;
                    read_pos += seq_len;
            }

            if (write_pos == end)   /* all chars are valid: leave src and *len alone */
            {
                    return src;
            }

            *len = (int)(write_pos - (unsigned char *)src);
            *write_pos = '\0';

            return src;
    }

    http://bbs.chinaunix.net/forum.php?mod=viewthread&tid=1230313

  • 相关阅读:
    MYSQL性能优化的最佳20+条经验
    MySQL性能分析工具之PROFILE
    理解事务的4种隔离级别
    二进制中1的个数
    滑动窗口最大值
    字符流中第一个不重复字符
    字符串转化为整数
    java字符,字符串,数字之间的转换
    java中数组输出的方式
    java基础知识(1)
  • 原文地址:https://www.cnblogs.com/findumars/p/5068059.html
Copyright © 2011-2022 走看看