zoukankan      html  css  js  c++  java
  • C语言 检测一个文本文件的编码是否为utf-8

    /*
        filename: isutf8.c
        Time:     2016-12-9 20:27
        Author:   Albert Wang
        email:    albertofwb@gmail.com
        Function: detect whether a text file's encoding is utf-8 format
    */
    
    #include <stdio.h>
    #include <stdlib.h>  // exit()
    #include <io.h>  // _access() detect a file's existence
    
    /* Truth values returned by functions declared with the Bool type below. */
    #define True  1
    #define False 0
    
    /* Bool: one-byte truth value holding True or False. */
    typedef char Bool;
    /* Uchar: unsigned byte type, used when inspecting the bits of raw file bytes. */
    typedef unsigned char Uchar;
    
    /*
     * Read exactly FileSize bytes from the start of FileName into buf.
     *
     * FileName: path of the file to read (opened in binary mode).
     * buf:      destination buffer, at least FileSize bytes large.
     * FileSize: number of bytes the caller expects to receive.
     *
     * Returns 0 when all FileSize bytes were read, -1 when an argument is
     * NULL, the file cannot be opened, or fewer than FileSize bytes could be
     * read (in which case the contents of buf are not meaningful).
     */
    int DumpFromFile(const char *FileName, char *buf, size_t FileSize)
    {
        FILE   *fp;
        size_t  bytesRead;
    
        if (FileName == NULL || (buf == NULL && FileSize > 0))
        {
            return -1;
        }
    
        if ((fp = fopen(FileName, "rb")) == NULL)
        {
            return -1;
        }
    
        /* Nothing to read for a zero-length request; skip fread entirely. */
        bytesRead = (FileSize > 0) ? fread(buf, 1, FileSize, fp) : 0;
        fclose(fp);
    
        /* A short read would leave the tail of buf uninitialised. */
        return (bytesRead == FileSize) ? 0 : -1;
    }
    
    
    /*
     * Determine the size in bytes of FileName.
     *
     * FileName: path of the file to measure (opened in binary mode).
     * FileSize: out-parameter receiving the size; written only on success.
     *
     * Returns 0 on success, -1 when an argument is NULL, the file cannot be
     * opened, or the end-of-file position cannot be determined.
     */
    int GetFileSize(const char *FileName, size_t *FileSize)
    {
        FILE *fp;
        long  endPos = -1L;
    
        if (FileName == NULL || FileSize == NULL)
        {
            return -1;
        }
    
        if ((fp = fopen(FileName, "rb")) == NULL)
        {
            return -1;
        }
    
        if (fseek(fp, 0, SEEK_END) == 0)
        {
            endPos = ftell(fp);
        }
        fclose(fp);
    
        /* ftell reports failure as -1L; never let that wrap into a huge size_t. */
        if (endPos < 0)
        {
            return -1;
        }
    
        *FileSize = (size_t)endPos;
        return 0;
    }
    
    Bool IsUtf8(const char* FileName)
    {
        FILE *fp = NULL;
        size_t FileSize = 0;
        char *fileBuf = NULL;
    
    
        GetFileSize(FileName, &FileSize);
        fileBuf = (char *)malloc(FileSize);
        DumpFromFile(FileName, fileBuf, FileSize);
    
        size_t i = 0;
        Bool ret = True;
    
        for ( ; ret && (i < FileSize); i++)
        {
            Uchar hexchar = fileBuf[i];
            // ignore ascii code
            if (!(hexchar & 0x80))
            {
                continue;
            }
    
            // calculate how many serial "1"
            int   BitOneCount = 0;
            Uchar num = hexchar;
            while (num & 0x80)
            {
                if (num & 0x80)
                {
                    BitOneCount += 1;
                }
                num <<= 1;
            }
    
            BitOneCount -= 1;
            while (BitOneCount > 0)
            {
                i += 1;
                num = fileBuf[i];   // num suppose to be 10xx xxxx
                num >>= 6;            // num = 0000 0010
                if (2 != num)
                {
                    ret = False;
                    //printf("i = %d num = %d hexchar = 0x%x BitOneCount= %d
    ", i, num, hexchar, BitOneCount);
                    break;
                }
                BitOneCount -= 1;
            }
    
        //end for
        }
    
    
        free(fileBuf);
        return ret;
    }
    
    /*
     * Command-line entry point: isutf8 <FileName>
     *
     * Prints "[<FileName>] True" or "[<FileName>] False" depending on the
     * result of IsUtf8(). Exits with status 1 when the argument count is wrong
     * or the file does not exist, 0 otherwise.
     */
    int main(int argc, char *argv[])
    {
        /* Indexed by the truth value returned from IsUtf8(). */
        static const char *const result[] = {
            "False", "True"
        };
        const char *FileName;
    
        if (argc != 2)
        {
            printf("Usage: %s <FileName>\n", argv[0]);
            return 1;
        }
    
        FileName = argv[1];
    
        /* Mode 0 asks _access() only whether the path exists. */
        if (-1 == _access(FileName, 0))
        {
            printf("%s not exists!\n", FileName);
            return 1;
        }
    
        /* Normalise to 0/1 so the table lookup can never go out of range. */
        printf("[%s] %s\n", FileName, result[IsUtf8(FileName) ? 1 : 0]);
    
        return 0;
    }
    
    /*
        Reference: http://www.ruanyifeng.com/blog/2007/10/ascii_unicode_and_utf-8.html
    */

    运行结果

    使用 winhex 以utf8 的编码查看样本文件:

    文件

  • 相关阅读:
    精彩的漫画小说
    《Java语言精粹》译者序
    群啊群
    围观透明咆哮体
    读《Cassandra权威指南》
    好书什么样?
    一个关于360和腾讯的调查
    Xcode 3.x class ations 以及outlets 去哪里了 ?
    「译」JavaScript 的 MVC 模式
    MAC OS 虚拟机里的control键设置
  • 原文地址:https://www.cnblogs.com/albertofwb/p/6151484.html
Copyright © 2011-2022 走看看