zoukankan      html  css  js  c++  java
  • C语言 检测一个文本文件的编码是否为utf-8

    /*
        filename: isutf8.c
        Time:     2016-12-9 20:27
        Author:   Albert Wang
        email:    albertofwb@gmail.com
        Function: detect whether a text file is encoded as UTF-8
    */
    
    #include <stdio.h>
    #include <stdlib.h>  // exit()
    #include <io.h>  // _access() detect a file's existence
    
    /* Integer truth values used together with the Bool typedef below. */
    #define True  1
    #define False 0
    
    /* Boolean type for this file: a plain char holding True or False. */
    typedef char Bool;
    /* Shorthand for unsigned char; used to hold raw file bytes for bit tests. */
    typedef unsigned char Uchar;
    
    /*
     * Read the first FileSize bytes of a file into a caller-supplied buffer.
     *
     * FileName: path of the file to read (opened in binary mode).
     * buf:      destination buffer with room for at least FileSize bytes;
     *           may be NULL only when FileSize is 0.
     * FileSize: number of bytes to read.
     *
     * Returns 0 when exactly FileSize bytes were read, -1 when an argument is
     * invalid, the file cannot be opened, or fewer than FileSize bytes could
     * be read (in which case the contents of buf are not fully defined).
     */
    int DumpFromFile(const char *FileName, char *buf, size_t FileSize)
    {
        FILE   *fp;
        size_t  bytesRead = 0;

        if (FileName == NULL || (buf == NULL && FileSize != 0))
        {
            return -1;
        }

        if ((fp = fopen(FileName, "rb")) == NULL)
        {
            return -1;
        }

        /* Skip fread entirely for an empty request so buf may be NULL. */
        if (FileSize != 0)
        {
            bytesRead = fread(buf, 1, FileSize, fp);
        }
        fclose(fp);

        /* A short read means part of buf was never written: report it. */
        return (bytesRead == FileSize) ? 0 : -1;
    }
    
    
    /*
     * Determine the size in bytes of a file by seeking to its end.
     *
     * FileName: path of the file to measure (opened in binary mode).
     * FileSize: out-parameter that receives the size; it is written only on
     *           success and left untouched on failure.
     *
     * Returns 0 on success, -1 when an argument is NULL, the file cannot be
     * opened, or the end position cannot be determined.
     */
    int GetFileSize(const char *FileName, size_t *FileSize)
    {
        FILE *fp;
        long  endPos = -1L;

        if (FileName == NULL || FileSize == NULL)
        {
            return -1;
        }

        if ((fp = fopen(FileName, "rb")) == NULL)
        {
            return -1;
        }

        if (fseek(fp, 0L, SEEK_END) == 0)
        {
            endPos = ftell(fp);
        }
        fclose(fp);

        /* ftell reports failure as -1L; never let that wrap into a huge size_t. */
        if (endPos < 0)
        {
            return -1;
        }

        *FileSize = (size_t)endPos;
        return 0;
    }
    
    Bool IsUtf8(const char* FileName)
    {
        FILE *fp = NULL;
        size_t FileSize = 0;
        char *fileBuf = NULL;
    
    
        GetFileSize(FileName, &FileSize);
        fileBuf = (char *)malloc(FileSize);
        DumpFromFile(FileName, fileBuf, FileSize);
    
        size_t i = 0;
        Bool ret = True;
    
        for ( ; ret && (i < FileSize); i++)
        {
            Uchar hexchar = fileBuf[i];
            // ignore ascii code
            if (!(hexchar & 0x80))
            {
                continue;
            }
    
            // calculate how many serial "1"
            int   BitOneCount = 0;
            Uchar num = hexchar;
            while (num & 0x80)
            {
                if (num & 0x80)
                {
                    BitOneCount += 1;
                }
                num <<= 1;
            }
    
            BitOneCount -= 1;
            while (BitOneCount > 0)
            {
                i += 1;
                num = fileBuf[i];   // num suppose to be 10xx xxxx
                num >>= 6;            // num = 0000 0010
                if (2 != num)
                {
                    ret = False;
                    //printf("i = %d num = %d hexchar = 0x%x BitOneCount= %d
    ", i, num, hexchar, BitOneCount);
                    break;
                }
                BitOneCount -= 1;
            }
    
        //end for
        }
    
    
        free(fileBuf);
        return ret;
    }
    
    /*
     * Program entry point.
     *
     * Expects exactly one command-line argument, the path of the file to test.
     * Prints a usage message and exits with status 1 when the argument count
     * is wrong, prints an error and exits with status 1 when the file does not
     * exist, and otherwise prints "[<FileName>] True" or "[<FileName>] False"
     * according to IsUtf8() and returns 0.
     */
    int main(int argc, char *argv[])
    {
        if (argc != 2)
        {
            printf("Usage: %s <FileName>\n", argv[0]);
            exit(1);
        }

        const char* FileName = argv[1];
        const char *result[] = {
            "False", "True"
        };

        // mode 0 asks _access to test for existence only
        if (-1 == _access(FileName, 0))
        {
            printf("%s not exists!\n", FileName);
            exit(1);
        }

        // normalise the Bool (a char) to 0/1 before using it as an index
        printf("[%s] %s\n", FileName, result[IsUtf8(FileName) ? 1 : 0]);

        return 0;
    }
    
    /*
        参考链接: http://www.ruanyifeng.com/blog/2007/10/ascii_unicode_and_utf-8.html
    */

    运行结果

    使用 winhex 以utf8 的编码查看样本文件:

    文件

  • 相关阅读:
    Oracle 18c新特性一览
    iOS xcode缓存问题
    预编译头文件
    iOS 限制UITextField输入字符
    网络通信之 字节序转换原理与网络字节序、大端和小端模式
    iOS 库文件制作
    iOS 全屏布局
    内存问题 动态加载地址和运行时地址
    申请工作居住证政策解答
    phpsession配置
  • 原文地址:https://www.cnblogs.com/albertofwb/p/6151484.html
Copyright © 2011-2022 走看看