1
/// <summary>
/// Heuristically decides whether a file looks like UTF-8 text by scoring its
/// bytes with <c>utf8_probability</c>.
/// </summary>
/// <param name="strFileName">Path of the file to inspect.</param>
/// <returns>
/// <c>true</c> when <c>utf8_probability</c> returns a score above zero;
/// otherwise <c>false</c>. Because that scorer returns 0 for empty and
/// pure-ASCII input, such files yield <c>false</c>.
/// </returns>
private bool CheckEncoding(string strFileName)
{
    // File.ReadAllBytes opens the file for reading only (so read-only files work)
    // and reads the full content, unlike a single Stream.Read call, which may
    // return fewer bytes than requested.
    byte[] bs = File.ReadAllBytes(strFileName);
    return utf8_probability(bs) > 0;
}
55
56
57
/// <summary>
/// Scores how likely a byte buffer is to be UTF-8 encoded text.
/// </summary>
/// <remarks>
/// ASCII bytes (below 0x80) are ignored. Every other byte is expected to start a
/// well-formed two-byte (lead 0xC0-0xDF) or three-byte (lead 0xE0-0xEF) sequence
/// whose trailing bytes are in 0x80-0xBF. The score is the percentage of
/// non-ASCII bytes that belong to such sequences. Four-byte sequences are not
/// recognized and therefore count against the score.
/// </remarks>
/// <param name="rawtext">The bytes to examine.</param>
/// <returns>
/// The score (0-100) when it is above 98, or when it is above 95 and more than
/// 30 bytes belong to well-formed sequences; otherwise 0. Buffers that are empty
/// or contain only ASCII return 0.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="rawtext"/> is null.</exception>
private int utf8_probability(byte[] rawtext)
{
    if (rawtext == null)
    {
        throw new ArgumentNullException("rawtext");
    }

    int rawtextlen = rawtext.Length;
    int goodbytes = 0;
    int asciibytes = 0;

    for (int i = 0; i < rawtextlen; i++)
    {
        int lead = rawtext[i];

        if (lead < 0x80)
        {
            // One byte: plain ASCII is left out of the ratio so it cannot skew it.
            asciibytes++;
            continue;
        }

        // Bounds are checked BEFORE the trailing bytes are read, so a lead byte
        // near the end of the buffer is counted as malformed instead of throwing
        // IndexOutOfRangeException.
        if (lead >= 0xC0 && lead <= 0xDF &&                 // Two bytes
            i + 1 < rawtextlen &&
            rawtext[i + 1] >= 0x80 && rawtext[i + 1] <= 0xBF)
        {
            goodbytes += 2;
            i++;
        }
        else if (lead >= 0xE0 && lead <= 0xEF &&            // Three bytes
                 i + 2 < rawtextlen &&
                 rawtext[i + 1] >= 0x80 && rawtext[i + 1] <= 0xBF &&
                 rawtext[i + 2] >= 0x80 && rawtext[i + 2] <= 0xBF)
        {
            goodbytes += 3;
            i += 2;
        }
        // Anything else is a malformed byte: neither ASCII nor good.
    }

    // Nothing but ASCII (or nothing at all): no evidence either way, and this
    // also keeps the division below away from a zero denominator.
    if (asciibytes == rawtextlen)
    {
        return 0;
    }

    int score = (int)(100 * ((float)goodbytes / (float)(rawtextlen - asciibytes)));

    // If not above 98, reduce to zero to prevent coincidental matches.
    // A slightly lower score is accepted only when backed by enough good bytes,
    // which allows for a few badly formed sequences in longer texts.
    if (score > 98)
    {
        return score;
    }

    if (score > 95 && goodbytes > 30)
    {
        return score;
    }

    return 0;
}

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117
