zoukankan      html  css  js  c++  java
  • 判断文件是否为UTF8编码(以前收集的)

      1        private bool CheckEncoding(string strFileName)
      2        {
      3            using (FileStream stream = new FileStream(strFileName, FileMode.Open))
      4            {
      5                byte[] bs = new byte[stream.Length];
      6                stream.Read(bs, 0, bs.Length);
      7                if (utf8_probability(bs) > 0return true;
      8                else return false;
      9
     10                /*
     11                if (stream != null && stream.Length >= 2)
     12                {     
     13                    //保存文件流的前4个字节
     14                    byte byte1 = 0;
     15                    byte byte2 = 0;
     16                    byte byte3 = 0;
     17                    byte byte4 = 0;
     18                    //保存当前Seek位置
     19                    long origPos = stream.Seek(0, SeekOrigin.Begin);
     20                    stream.Seek(0, SeekOrigin.Begin);
     21                    int nByte = stream.ReadByte();
     22                    byte1 = Convert.ToByte(nByte);
     23                    byte2 = Convert.ToByte(stream.ReadByte());
     24                    if (stream.Length >= 3)
     25                    {
     26                        byte3 = Convert.ToByte(stream.ReadByte());
     27                    }
     28                    if (stream.Length >= 4)
     29                    {
     30                        byte4 = Convert.ToByte(stream.ReadByte());
     31                    }
     32
     33                    //根据文件流的前4个字节判断Encoding
     34                    //Unicode {0xFF, 0xFE};
     35                    //BE-Unicode {0xFE, 0xFF};
     36                    //UTF8 = {0xEF, 0xBB, 0xBF};
     37                    if (byte1 == 0xFE && byte2 == 0xFF)//UnicodeBe
     38                    {
     39                        targetEncoding = Encoding.BigEndianUnicode;
     40                    }
     41                    if (byte1 == 0xFF && byte2 == 0xFE && byte3 != 0xFF)//Unicode
     42                    {
     43                        targetEncoding = Encoding.Unicode;
     44                    }
     45                    if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF)//UTF8
     46                    {
     47                        targetEncoding = Encoding.UTF8;
     48                    }
     49                    //恢复Seek位置       
     50                    stream.Seek(origPos, SeekOrigin.Begin);
     51                  
     52                }*/

     53            }

     54        }

     55        
     56        
     57        private int utf8_probability(byte[] rawtext)
     58        {
     59            int score = 0;
     60            int i, rawtextlen = 0;
     61            int goodbytes = 0, asciibytes = 0;
     62
     63            // Maybe also use UTF8 Byte Order Mark:  EF BB BF
     64
     65            // Check to see if characters fit into acceptable ranges
     66            rawtextlen = rawtext.Length;
     67            for (i = 0; i < rawtextlen; i++)
     68            {
     69                if ((rawtext[i] & (byte)0x7F== rawtext[i])
     70                {  // One byte
     71                    asciibytes++;
     72                    // Ignore ASCII, can throw off count
     73                }

     74                else
     75                {
     76                    int m_rawInt0 = Convert.ToInt16(rawtext[i]);
     77                    int m_rawInt1 = Convert.ToInt16(rawtext[i + 1]);
     78                    int m_rawInt2 = Convert.ToInt16(rawtext[i + 2]);
     79
     80                    if (256 - 64 <= m_rawInt0 && m_rawInt0 <= 256 - 33 && // Two bytes
     81                     i + 1 < rawtextlen &&
     82                     256 - 128 <= m_rawInt1 && m_rawInt1 <= 256 - 65)
     83                    {
     84                        goodbytes += 2;
     85                        i++;
     86                    }

     87                    else if (256 - 32 <= m_rawInt0 && m_rawInt0 <= 256 - 17 && // Three bytes
     88                     i + 2 < rawtextlen &&
     89                     256 - 128 <= m_rawInt1 && m_rawInt1 <= 256 - 65 &&
     90                     256 - 128 <= m_rawInt2 && m_rawInt2 <= 256 - 65)
     91                    {
     92                        goodbytes += 3;
     93                        i += 2;
     94                    }

     95                }

     96            }

     97
     98            if (asciibytes == rawtextlen) return 0; }
     99
    100            score = (int)(100 * ((float)goodbytes / (float)(rawtextlen - asciibytes)));
    101
    102            // If not above 98, reduce to zero to prevent coincidental matches
    103            // Allows for some (few) bad formed sequences
    104            if (score > 98)
    105            {
    106                return score;
    107            }

    108            else if (score > 95 && goodbytes > 30)
    109            {
    110                return score;
    111            }

    112            else
    113            {
    114                return 0;
    115            }

    116
    117        }
  • 相关阅读:
    Codeforces 994B. Knights of a Polygonal Table
    Codeforces 994A. Fingerprints
    Codeforces 988F. Rain and Umbrellas
    51nod 1158 全是1的最大子矩阵(单调栈 ,o(n*m))
    51nod 1102 面积最大的矩形 && 新疆大学OJ 1387: B.HUAWEI's billboard 【单调栈】+【拼凑段】(o(n) 或 o(nlog(n))
    Codeforces 988E. Divisibility by 25
    【复习资料】单片机与嵌入式系统原理及应用
    Codeforces 723D. Lakes in Berland
    Codeforces 986A. Fair(对物品bfs暴力求解)
    Codeforces 986B. Petr and Permutations(没想到这道2250分的题这么简单,早知道就先做了)
  • 原文地址:https://www.cnblogs.com/sxlfybb/p/803100.html
Copyright © 2011-2022 走看看