zoukankan      html  css  js  c++  java
  • C语言 检测一个文本文件的编码是否为utf-8

    /*
        filename: isutf8.c
        Time:     2016-12-9 20:27
        Author:   Albert Wang
        email:    albertofwb@gmail.com
        Function: detect whether a text file is encoded as UTF-8
    */
    
    #include <stdio.h>
    #include <stdlib.h>  // exit()
    #include <io.h>  // _access() detect a file's existence
    
    /* Boolean constants used as values of the Bool type below. */
    #define True  1
    #define False 0
    
    /* Bool: truth value stored in a single char (holds True or False). */
    typedef char Bool;
    /* Uchar: shorthand for unsigned char, used for raw file bytes. */
    typedef unsigned char Uchar;
    
    /*
     * Read the first FileSize bytes of the file FileName into buf.
     *
     * FileName: path of the file; it is opened in binary mode.
     * buf:      destination buffer with room for at least FileSize bytes.
     *           May be NULL only when FileSize is 0.
     * FileSize: number of bytes to read.
     *
     * Returns 0 when exactly FileSize bytes were read, and -1 when the file
     * cannot be opened, buf is NULL with a non-zero FileSize, or fewer than
     * FileSize bytes could be read.
     */
    int DumpFromFile(const char *FileName, char *buf, size_t FileSize)
    {
        FILE   *fp;
        size_t  nread = 0;

        if (buf == NULL && FileSize > 0)
        {
            return -1;
        }

        if ((fp = fopen(FileName, "rb")) == NULL)
        {
            return -1;
        }

        /* Skip fread entirely for an empty request so a NULL buf is never used. */
        if (FileSize > 0)
        {
            nread = fread(buf, 1, FileSize, fp);
        }
        fclose(fp);

        /* A short read would leave part of buf uninitialized: report it. */
        return (nread == FileSize) ? 0 : -1;
    }
    
    
    /*
     * Determine the size in bytes of the file FileName.
     *
     * FileName: path of the file; it is opened in binary mode.
     * FileSize: out-parameter receiving the size. It is written only on
     *           success and left untouched on failure.
     *
     * Returns 0 on success, and -1 when FileSize is NULL, the file cannot be
     * opened, or seeking to / reading the end position fails.
     */
    int GetFileSize(const char *FileName, size_t *FileSize)
    {
        FILE *fp;
        int   rc = -1;

        if (FileSize == NULL)
        {
            return -1;
        }

        if ((fp = fopen(FileName, "rb")) == NULL)
        {
            return -1;
        }

        if (fseek(fp, 0, SEEK_END) == 0)
        {
            /* ftell reports failure as -1L; never store that into a size_t. */
            long endPos = ftell(fp);
            if (endPos >= 0)
            {
                *FileSize = (size_t)endPos;
                rc = 0;
            }
        }

        fclose(fp);

        return rc;
    }
    
    Bool IsUtf8(const char* FileName)
    {
        FILE *fp = NULL;
        size_t FileSize = 0;
        char *fileBuf = NULL;
    
    
        GetFileSize(FileName, &FileSize);
        fileBuf = (char *)malloc(FileSize);
        DumpFromFile(FileName, fileBuf, FileSize);
    
        size_t i = 0;
        Bool ret = True;
    
        for ( ; ret && (i < FileSize); i++)
        {
            Uchar hexchar = fileBuf[i];
            // ignore ascii code
            if (!(hexchar & 0x80))
            {
                continue;
            }
    
            // calculate how many serial "1"
            int   BitOneCount = 0;
            Uchar num = hexchar;
            while (num & 0x80)
            {
                if (num & 0x80)
                {
                    BitOneCount += 1;
                }
                num <<= 1;
            }
    
            BitOneCount -= 1;
            while (BitOneCount > 0)
            {
                i += 1;
                num = fileBuf[i];   // num suppose to be 10xx xxxx
                num >>= 6;            // num = 0000 0010
                if (2 != num)
                {
                    ret = False;
                    //printf("i = %d num = %d hexchar = 0x%x BitOneCount= %d
    ", i, num, hexchar, BitOneCount);
                    break;
                }
                BitOneCount -= 1;
            }
    
        //end for
        }
    
    
        free(fileBuf);
        return ret;
    }
    
    /*
     * Command-line entry point: isutf8 <FileName>
     *
     * Prints "[<FileName>] True" or "[<FileName>] False" depending on the
     * result of IsUtf8(). Exits with status 1 when the argument count is
     * wrong or the file does not exist, otherwise returns 0.
     */
    int main(int argc, char *argv[])
    {
        if (argc != 2)
        {
            printf("Usage: %s <FileName>\n", argv[0]);
            exit(1);
        }

        const char *FileName = argv[1];
        /* Indexed by the boolean result: 0 -> "False", 1 -> "True". */
        const char *result[] = {
            "False", "True"
        };

        /* _access(path, 0) only tests for existence. */
        if (-1 == _access(FileName, 0))
        {
            printf("%s not exists!\n", FileName);
            exit(1);
        }

        /* Clamp to 0/1 so the table lookup can never go out of range. */
        printf("[%s] %s\n", FileName, result[IsUtf8(FileName) ? 1 : 0]);

        return 0;
    }
    
    /*
        Reference: http://www.ruanyifeng.com/blog/2007/10/ascii_unicode_and_utf-8.html
    */

    运行结果

    使用 winhex 以utf8 的编码查看样本文件:

    文件

  • 相关阅读:
    bzoj 4012: [HNOI2015]开店
    POJ 1054 The Troublesome Frog
    POJ 3171 Cleaning Shifts
    POJ 3411 Paid Roads
    POJ 3045 Cow Acrobats
    POJ 1742 Coins
    POJ 3181 Dollar Dayz
    POJ 3040 Allowance
    POJ 3666 Making the Grade
    洛谷 P3657 [USACO17FEB]Why Did the Cow Cross the Road II P
  • 原文地址:https://www.cnblogs.com/albertofwb/p/6151484.html
Copyright © 2011-2022 走看看