判断文件是否为UTF8编码(以前收集的)

zoukankan html css js c++ java

判断文件是否为UTF8编码(以前收集的)

  /// <summary>
  /// Reports whether the file at <paramref name="strFileName"/> looks like UTF-8 text.
  /// </summary>
  /// <remarks>
  /// The whole file is loaded into memory and scored by <c>utf8_probability</c>;
  /// any positive score is treated as "UTF-8". A file made up solely of 7-bit
  /// ASCII bytes scores zero and is therefore reported as <c>false</c>.
  /// </remarks>
  /// <param name="strFileName">Path of the file to inspect.</param>
  /// <returns><c>true</c> when the UTF-8 score of the file content is greater than zero.</returns>
  private bool CheckEncoding(string strFileName)
  {
      // File.ReadAllBytes opens the file for reading only and keeps reading until
      // the entire content has been loaded. The previous single Stream.Read call
      // could legally return fewer bytes than requested, and FileMode.Open without
      // an explicit FileAccess asked for write access as well.
      byte[] bs = File.ReadAllBytes(strFileName);

      return utf8_probability(bs) > 0;
  }
55
56
/// <summary>
/// Estimates how likely it is that <paramref name="rawtext"/> is UTF-8 encoded.
/// </summary>
/// <remarks>
/// ASCII bytes are ignored because they are valid in almost every encoding and
/// would dilute the result. Among the remaining bytes, the method counts those
/// that belong to a well-formed two-, three- or four-byte sequence (a lead byte
/// followed by the required number of 0x80-0xBF continuation bytes) and
/// expresses that count as a percentage of all non-ASCII bytes.
/// </remarks>
/// <param name="rawtext">Raw bytes to analyse.</param>
/// <returns>
/// The percentage (0-100) when it is above 98, or above 95 with more than 30
/// well-formed bytes; otherwise 0. Pure ASCII and empty input return 0.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="rawtext"/> is null.</exception>
private int utf8_probability(byte[] rawtext)
{
    if (rawtext == null)
    {
        throw new ArgumentNullException("rawtext");
    }

    int rawtextlen = rawtext.Length;
    int goodbytes = 0;
    int asciibytes = 0;

    for (int i = 0; i < rawtextlen; i++)
    {
        byte lead = rawtext[i];

        if (lead < 0x80)
        {
            // Ignore ASCII, it can throw off the count.
            asciibytes++;
            continue;
        }

        // Number of continuation bytes the lead byte announces.
        int trailing;
        if (lead >= 0xC0 && lead <= 0xDF)
        {
            trailing = 1; // two-byte sequence
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            trailing = 2; // three-byte sequence
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            trailing = 3; // four-byte sequence
        }
        else
        {
            // Stray continuation byte or a value that never starts a sequence:
            // it counts as a bad non-ASCII byte.
            continue;
        }

        // Check the bounds BEFORE touching the following bytes; a sequence cut
        // off by the end of the buffer is simply counted as bad.
        if (i + trailing >= rawtextlen)
        {
            continue;
        }

        bool wellFormed = true;
        for (int k = 1; k <= trailing; k++)
        {
            byte next = rawtext[i + k];
            if (next < 0x80 || next > 0xBF)
            {
                wellFormed = false;
                break;
            }
        }

        if (wellFormed)
        {
            goodbytes += trailing + 1;
            i += trailing; // skip the continuation bytes just validated
        }
    }

    int nonAsciiBytes = rawtextlen - asciibytes;
    if (nonAsciiBytes == 0)
    {
        return 0;
    }

    // Exact integer arithmetic: float division could round a borderline ratio
    // to the wrong side of the thresholds below.
    int score = (int)(100L * goodbytes / nonAsciiBytes);

    // If not above 98, reduce to zero to prevent coincidental matches.
    // A slightly lower score is accepted when there is enough evidence,
    // which allows for a few badly formed sequences.
    if (score > 98)
    {
        return score;
    }

    if (score > 95 && goodbytes > 30)
    {
        return score;
    }

    return 0;
}

查看全文

相关阅读:
POJ 1436 Horizontally Visible Segments（线段树）
POJ 1436 Horizontally Visible Segments（线段树）
精益项目管理的可行性分析
 精益项目管理的可行性分析
 精益项目管理的可行性分析
 精益项目管理的可行性分析
 单点登录cas常见问题(二)
单点登录cas常见问题(二)
蓝氏兄弟依靠板栗东山再起，意外赚回八九万元
 元旦快乐，感谢一路相伴！

原文地址：https://www.cnblogs.com/sxlfybb/p/803100.html