/// <summary>
/// Decides whether a file's content looks like UTF-8 text by scoring its raw
/// bytes with <c>utf8_probability</c>.
/// </summary>
/// <param name="strFileName">Path of the file to inspect.</param>
/// <returns>
/// <c>true</c> when <c>utf8_probability</c> returns a score above zero;
/// otherwise <c>false</c>.
/// </returns>
private bool CheckEncoding(string strFileName)
{
    // File.ReadAllBytes opens the file for reading only (so read-only files
    // work) and keeps reading until the whole content is in the buffer.
    // A single Stream.Read call is allowed to return fewer bytes than
    // requested, which would leave the tail of the buffer zero-filled.
    byte[] bs = File.ReadAllBytes(strFileName);

    return utf8_probability(bs) > 0;
}
55
56
/// <summary>
/// Estimates how likely a byte buffer is UTF-8 encoded text.
/// </summary>
/// <remarks>
/// ASCII bytes (0x00-0x7F) are ignored. Every other byte is expected to start
/// a well-formed two-byte (lead 0xC0-0xDF) or three-byte (lead 0xE0-0xEF)
/// sequence whose remaining bytes are continuation bytes (0x80-0xBF).
/// Bytes that do not fit either pattern, including a sequence cut off by the
/// end of the buffer, count against the score.
/// NOTE(review): four-byte sequences (lead 0xF0 and above) are not
/// recognised and therefore lower the score; confirm this is intended.
/// </remarks>
/// <param name="rawtext">Raw bytes to examine.</param>
/// <returns>
/// The percentage (0-100) of non-ASCII bytes that belong to well-formed
/// sequences when that percentage is above 98, or above 95 with more than
/// 30 matched bytes. Returns 0 otherwise, and also when the buffer is empty
/// or contains only ASCII.
/// </returns>
private int utf8_probability(byte[] rawtext)
{
    int rawtextlen = rawtext.Length;
    int goodbytes = 0;
    int asciibytes = 0;

    // Maybe also use UTF8 Byte Order Mark: EF BB BF

    // Check to see if characters fit into acceptable ranges
    for (int i = 0; i < rawtextlen; i++)
    {
        int lead = rawtext[i];

        if (lead <= 0x7F)
        {
            // One byte. Ignore ASCII, can throw off count.
            asciibytes++;
            continue;
        }

        // The length checks come BEFORE the following bytes are indexed, so
        // a non-ASCII byte in the last two positions no longer reads past
        // the end of the array.
        if (lead >= 0xC0 && lead <= 0xDF &&                 // Two bytes
            i + 1 < rawtextlen &&
            (rawtext[i + 1] & 0xC0) == 0x80)
        {
            goodbytes += 2;
            i++;
        }
        else if (lead >= 0xE0 && lead <= 0xEF &&            // Three bytes
                 i + 2 < rawtextlen &&
                 (rawtext[i + 1] & 0xC0) == 0x80 &&
                 (rawtext[i + 2] & 0xC0) == 0x80)
        {
            goodbytes += 3;
            i += 2;
        }
    }

    // Nothing but ASCII (or nothing at all): no evidence for UTF-8, and this
    // also keeps the division below away from a zero denominator.
    if (asciibytes == rawtextlen)
    {
        return 0;
    }

    int score = (int)(100 * ((float)goodbytes / (float)(rawtextlen - asciibytes)));

    // If not above 98, reduce to zero to prevent coincidental matches.
    // Allows for some (few) badly formed sequences in larger inputs.
    if (score > 98)
    {
        return score;
    }

    if (score > 95 && goodbytes > 30)
    {
        return score;
    }

    return 0;
}
// NOTE(review): this brace block repeats the body of CheckEncoding without a
// method header; it looks like a duplicated paste. Confirm and remove if so.
2 {
// Load the file's bytes into memory and score them with utf8_probability.
3 using (FileStream stream = new FileStream(strFileName, FileMode.Open))
4 {
5 byte[] bs = new byte[stream.Length];
// NOTE(review): the return value of Read is ignored; Stream.Read may return
// fewer bytes than requested, so confirm the buffer is always filled.
6 stream.Read(bs, 0, bs.Length);
// Any positive score is treated as "this is UTF-8".
7 if (utf8_probability(bs) > 0) return true;
8 else return false;
9
10 /*
Disabled BOM-based detection. Both branches above return, so this code would
be unreachable even if it were uncommented.
11 if (stream != null && stream.Length >= 2)
12 {
13 // Holds the first 4 bytes of the file stream
14 byte byte1 = 0;
15 byte byte2 = 0;
16 byte byte3 = 0;
17 byte byte4 = 0;
18 // Save the current seek position
19 long origPos = stream.Seek(0, SeekOrigin.Begin);
20 stream.Seek(0, SeekOrigin.Begin);
21 int nByte = stream.ReadByte();
22 byte1 = Convert.ToByte(nByte);
23 byte2 = Convert.ToByte(stream.ReadByte());
24 if (stream.Length >= 3)
25 {
26 byte3 = Convert.ToByte(stream.ReadByte());
27 }
28 if (stream.Length >= 4)
29 {
30 byte4 = Convert.ToByte(stream.ReadByte());
31 }
32
33 // Determine the encoding from the first 4 bytes of the file stream
34 //Unicode {0xFF, 0xFE};
35 //BE-Unicode {0xFE, 0xFF};
36 //UTF8 = {0xEF, 0xBB, 0xBF};
37 if (byte1 == 0xFE && byte2 == 0xFF)//UnicodeBe
38 {
39 targetEncoding = Encoding.BigEndianUnicode;
40 }
41 if (byte1 == 0xFF && byte2 == 0xFE && byte3 != 0xFF)//Unicode
42 {
43 targetEncoding = Encoding.Unicode;
44 }
45 if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF)//UTF8
46 {
47 targetEncoding = Encoding.UTF8;
48 }
49 // Restore the seek position
50 stream.Seek(origPos, SeekOrigin.Begin);
51
52 }*/
53 }
54 }
55
56
/// <summary>
/// Estimates how likely a byte buffer is UTF-8 encoded text.
/// </summary>
/// <remarks>
/// ASCII bytes (0x00-0x7F) are ignored. Every other byte is expected to start
/// a well-formed two-byte (lead 0xC0-0xDF) or three-byte (lead 0xE0-0xEF)
/// sequence whose remaining bytes are continuation bytes (0x80-0xBF).
/// Bytes that do not fit either pattern, including a sequence cut off by the
/// end of the buffer, count against the score.
/// NOTE(review): four-byte sequences (lead 0xF0 and above) are not
/// recognised and therefore lower the score; confirm this is intended.
/// </remarks>
/// <param name="rawtext">Raw bytes to examine.</param>
/// <returns>
/// The percentage (0-100) of non-ASCII bytes that belong to well-formed
/// sequences when that percentage is above 98, or above 95 with more than
/// 30 matched bytes. Returns 0 otherwise, and also when the buffer is empty
/// or contains only ASCII.
/// </returns>
private int utf8_probability(byte[] rawtext)
{
    int rawtextlen = rawtext.Length;
    int goodbytes = 0;
    int asciibytes = 0;

    // Maybe also use UTF8 Byte Order Mark: EF BB BF

    // Check to see if characters fit into acceptable ranges
    for (int i = 0; i < rawtextlen; i++)
    {
        int lead = rawtext[i];

        if (lead <= 0x7F)
        {
            // One byte. Ignore ASCII, can throw off count.
            asciibytes++;
            continue;
        }

        // The length checks come BEFORE the following bytes are indexed, so
        // a non-ASCII byte in the last two positions no longer reads past
        // the end of the array.
        if (lead >= 0xC0 && lead <= 0xDF &&                 // Two bytes
            i + 1 < rawtextlen &&
            (rawtext[i + 1] & 0xC0) == 0x80)
        {
            goodbytes += 2;
            i++;
        }
        else if (lead >= 0xE0 && lead <= 0xEF &&            // Three bytes
                 i + 2 < rawtextlen &&
                 (rawtext[i + 1] & 0xC0) == 0x80 &&
                 (rawtext[i + 2] & 0xC0) == 0x80)
        {
            goodbytes += 3;
            i += 2;
        }
    }

    // Nothing but ASCII (or nothing at all): no evidence for UTF-8, and this
    // also keeps the division below away from a zero denominator.
    if (asciibytes == rawtextlen)
    {
        return 0;
    }

    int score = (int)(100 * ((float)goodbytes / (float)(rawtextlen - asciibytes)));

    // If not above 98, reduce to zero to prevent coincidental matches.
    // Allows for some (few) badly formed sequences in larger inputs.
    if (score > 98)
    {
        return score;
    }

    if (score > 95 && goodbytes > 30)
    {
        return score;
    }

    return 0;
}