zoukankan      html  css  js  c++  java
  • 判断文件是否为UTF8编码(以前收集的)

      1        private bool CheckEncoding(string strFileName)
      2        {
      3            using (FileStream stream = new FileStream(strFileName, FileMode.Open))
      4            {
      5                byte[] bs = new byte[stream.Length];
      6                stream.Read(bs, 0, bs.Length);
      7                if (utf8_probability(bs) > 0return true;
      8                else return false;
      9
     10                /*
     11                if (stream != null && stream.Length >= 2)
     12                {     
     13                    //保存文件流的前4个字节
     14                    byte byte1 = 0;
     15                    byte byte2 = 0;
     16                    byte byte3 = 0;
     17                    byte byte4 = 0;
     18                    //保存当前Seek位置
     19                    long origPos = stream.Seek(0, SeekOrigin.Begin);
     20                    stream.Seek(0, SeekOrigin.Begin);
     21                    int nByte = stream.ReadByte();
     22                    byte1 = Convert.ToByte(nByte);
     23                    byte2 = Convert.ToByte(stream.ReadByte());
     24                    if (stream.Length >= 3)
     25                    {
     26                        byte3 = Convert.ToByte(stream.ReadByte());
     27                    }
     28                    if (stream.Length >= 4)
     29                    {
     30                        byte4 = Convert.ToByte(stream.ReadByte());
     31                    }
     32
     33                    //根据文件流的前4个字节判断Encoding
     34                    //Unicode {0xFF, 0xFE};
     35                    //BE-Unicode {0xFE, 0xFF};
     36                    //UTF8 = {0xEF, 0xBB, 0xBF};
     37                    if (byte1 == 0xFE && byte2 == 0xFF)//UnicodeBe
     38                    {
     39                        targetEncoding = Encoding.BigEndianUnicode;
     40                    }
     41                    if (byte1 == 0xFF && byte2 == 0xFE && byte3 != 0xFF)//Unicode
     42                    {
     43                        targetEncoding = Encoding.Unicode;
     44                    }
     45                    if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF)//UTF8
     46                    {
     47                        targetEncoding = Encoding.UTF8;
     48                    }
     49                    //恢复Seek位置       
     50                    stream.Seek(origPos, SeekOrigin.Begin);
     51                  
     52                }*/

     53            }

     54        }

     55        
     56        
     57        private int utf8_probability(byte[] rawtext)
     58        {
     59            int score = 0;
     60            int i, rawtextlen = 0;
     61            int goodbytes = 0, asciibytes = 0;
     62
     63            // Maybe also use UTF8 Byte Order Mark:  EF BB BF
     64
     65            // Check to see if characters fit into acceptable ranges
     66            rawtextlen = rawtext.Length;
     67            for (i = 0; i < rawtextlen; i++)
     68            {
     69                if ((rawtext[i] & (byte)0x7F== rawtext[i])
     70                {  // One byte
     71                    asciibytes++;
     72                    // Ignore ASCII, can throw off count
     73                }

     74                else
     75                {
     76                    int m_rawInt0 = Convert.ToInt16(rawtext[i]);
     77                    int m_rawInt1 = Convert.ToInt16(rawtext[i + 1]);
     78                    int m_rawInt2 = Convert.ToInt16(rawtext[i + 2]);
     79
     80                    if (256 - 64 <= m_rawInt0 && m_rawInt0 <= 256 - 33 && // Two bytes
     81                     i + 1 < rawtextlen &&
     82                     256 - 128 <= m_rawInt1 && m_rawInt1 <= 256 - 65)
     83                    {
     84                        goodbytes += 2;
     85                        i++;
     86                    }

     87                    else if (256 - 32 <= m_rawInt0 && m_rawInt0 <= 256 - 17 && // Three bytes
     88                     i + 2 < rawtextlen &&
     89                     256 - 128 <= m_rawInt1 && m_rawInt1 <= 256 - 65 &&
     90                     256 - 128 <= m_rawInt2 && m_rawInt2 <= 256 - 65)
     91                    {
     92                        goodbytes += 3;
     93                        i += 2;
     94                    }

     95                }

     96            }

     97
     98            if (asciibytes == rawtextlen) return 0; }
     99
    100            score = (int)(100 * ((float)goodbytes / (float)(rawtextlen - asciibytes)));
    101
    102            // If not above 98, reduce to zero to prevent coincidental matches
    103            // Allows for some (few) bad formed sequences
    104            if (score > 98)
    105            {
    106                return score;
    107            }

    108            else if (score > 95 && goodbytes > 30)
    109            {
    110                return score;
    111            }

    112            else
    113            {
    114                return 0;
    115            }

    116
    117        }
  • 相关阅读:
    Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.
    DHCP "No subnet declaration for xxx (no IPv4 addresses)" 报错
    Centos安装前端开发常用软件
    kubernetes学习笔记之十:RBAC(二)
    k8s学习笔记之StorageClass+NFS
    k8s学习笔记之ConfigMap和Secret
    k8s笔记之chartmuseum搭建
    K8S集群集成harbor(1.9.3)服务并配置HTTPS
    Docker镜像仓库Harbor1.7.0搭建及配置
    Nginx自建SSL证书部署HTTPS网站
  • 原文地址:https://www.cnblogs.com/sxlfybb/p/803100.html
Copyright © 2011-2022 走看看