zoukankan      html  css  js  c++  java
  • 自动判断文本文件编码来读取文本文件内容(.net版本和java版本)

    .net版本

    using System;
    using System.IO;
    using System.Text;
    
    namespace G2.Common
    {
        /// <summary>
        /// Detects the text encoding (<see cref="Encoding"/>) of a file.
        /// </summary>
        public static class TextEncodingHelper
        {
            /// <summary>
            /// Number of bytes read from the stream per chunk while scanning for UTF-8 validity.
            /// </summary>
            private const int ChunkSize = 4096;

            /// <summary>
            /// Detects the encoding of a text file.
            /// A byte-order mark at the start of the file decides the result:
            /// EF BB BF is UTF-8, FF FE is UTF-16 little endian (<see cref="Encoding.Unicode"/>),
            /// FE FF is UTF-16 big endian (<see cref="Encoding.BigEndianUnicode"/>).
            /// Without a BOM the whole content is checked for being well-formed UTF-8;
            /// if it is, UTF-8 is returned, otherwise <see cref="Encoding.Default"/> (ANSI).
            /// An empty file yields <see cref="Encoding.Default"/>.
            /// </summary>
            /// <param name="filename">Path of the file to inspect.</param>
            /// <returns>The detected encoding.</returns>
            /// <exception cref="FileNotFoundException">The file does not exist.</exception>
            public static System.Text.Encoding GetFileEncoding(this string filename)
            {
                if (!File.Exists(filename))
                {
                    throw new FileNotFoundException("文件\"" + filename + "\"不存在!", filename);
                }

                using (var fs = new FileStream(filename, FileMode.Open, FileAccess.Read))
                {
                    return GetEncodingWithBomUtf8(fs, Encoding.Default);
                }
            }

            /// <summary>
            /// Determines the encoding of the given stream, including UTF-8 without a BOM.
            /// The stream is scanned in fixed-size chunks, so the file is never loaded
            /// into memory as a whole. The stream is not disposed here; the caller owns it.
            /// </summary>
            /// <param name="fs">Stream to inspect. It is rewound to the start when seekable.</param>
            /// <param name="defaultEncoding">Encoding returned when nothing else matches.</param>
            /// <returns>The detected encoding, or <paramref name="defaultEncoding"/>.</returns>
            private static System.Text.Encoding GetEncodingWithBomUtf8(Stream fs, Encoding defaultEncoding)
            {
                // Always judge the content from its first byte, whatever the caller read before.
                if (fs.CanSeek)
                {
                    fs.Seek(0, SeekOrigin.Begin);
                }

                var chunk = new byte[ChunkSize];
                int count = FillBuffer(fs, chunk);

                if (count == 0)
                {
                    // Nothing to inspect.
                    return defaultEncoding;
                }

                if (count >= 3 && chunk[0] == 0xEF && chunk[1] == 0xBB && chunk[2] == 0xBF)
                {
                    return Encoding.UTF8;
                }

                if (count >= 2 && chunk[0] == 0xFE && chunk[1] == 0xFF)
                {
                    return Encoding.BigEndianUnicode;
                }

                if (count >= 2 && chunk[0] == 0xFF && chunk[1] == 0xFE)
                {
                    return Encoding.Unicode;
                }

                // No BOM: validate the entire content as UTF-8, carrying the state of a
                // multi-byte sequence across chunk boundaries.
                int pending = 0;
                while (count > 0)
                {
                    if (!IsUtf8Bytes(chunk, count, ref pending))
                    {
                        return defaultEncoding;
                    }

                    count = FillBuffer(fs, chunk);
                }

                // A sequence cut off by the end of the file is not well-formed UTF-8.
                return pending == 0 ? Encoding.UTF8 : defaultEncoding;
            }

            /// <summary>
            /// Reads from the stream until the buffer is full or the stream ends.
            /// </summary>
            /// <param name="fs">Stream to read from.</param>
            /// <param name="buffer">Destination buffer.</param>
            /// <returns>Number of bytes placed in <paramref name="buffer"/>; 0 at end of stream.</returns>
            private static int FillBuffer(Stream fs, byte[] buffer)
            {
                int total = 0;
                while (total < buffer.Length)
                {
                    int got = fs.Read(buffer, total, buffer.Length - total);
                    if (got <= 0)
                    {
                        break;
                    }

                    total += got;
                }

                return total;
            }

            /// <summary>
            /// Checks that the first <paramref name="count"/> bytes of <paramref name="data"/>
            /// continue a well-formed UTF-8 byte stream.
            /// </summary>
            /// <param name="data">Bytes to check.</param>
            /// <param name="count">Number of valid bytes in <paramref name="data"/>.</param>
            /// <param name="pending">
            /// Continuation bytes still expected for the character in progress.
            /// Pass 0 for the first chunk; the value is updated for the next chunk.
            /// </param>
            /// <returns><c>false</c> as soon as a byte violates UTF-8 structure; otherwise <c>true</c>.</returns>
            private static bool IsUtf8Bytes(byte[] data, int count, ref int pending)
            {
                for (int i = 0; i < count; i++)
                {
                    byte current = data[i];

                    if (pending > 0)
                    {
                        // Inside a multi-byte character: only 10xxxxxx is allowed.
                        if ((current & 0xC0) != 0x80)
                        {
                            return false;
                        }

                        pending--;
                    }
                    else if (current >= 0x80)
                    {
                        // Lead byte. C0, C1 and F5..FF never occur in UTF-8 (RFC 3629),
                        // and a bare continuation byte (80..BF) cannot start a character.
                        if (current >= 0xC2 && current <= 0xDF)
                        {
                            pending = 1;
                        }
                        else if (current >= 0xE0 && current <= 0xEF)
                        {
                            pending = 2;
                        }
                        else if (current >= 0xF0 && current <= 0xF4)
                        {
                            pending = 3;
                        }
                        else
                        {
                            return false;
                        }
                    }
                }

                return true;
            }
        }
    }

    java版本

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    
    import java.io.BufferedInputStream;
    import java.io.FileInputStream;
    import java.util.BitSet;
    
    /**
     * Guesses the character encoding of a text file from its byte-order mark, or,
     * when there is no BOM, from whether the content is well-formed UTF-8.
     */
    public class EncodeUtils {
        private static final Logger logger = LoggerFactory.getLogger(EncodeUtils.class);
        private static final String CODE_UTF8 = "UTF-8";
        // Returned for the FF FE (little-endian) BOM; Java's "UTF-16" decoder reads the BOM itself.
        private static final String CODE_UTF16 = "UTF-16";
        // Returned for the FE FF (big-endian) BOM.
        private static final String CODE_UTF16BE = "UTF-16BE";
        // Default fallback for content that is neither BOM-marked nor valid UTF-8 (ANSI on Chinese systems).
        private static final String CODE_GBK = "GBK";

        /**
         * Detects the encoding of the named file, falling back to GBK.
         *
         * @param fullFileName path of the file to inspect
         * @return name of the detected charset
         * @throws Exception if the file cannot be opened or read
         */
        public static String getEncode(String fullFileName) throws Exception {
            return getEncode(fullFileName, CODE_GBK);
        }


        /**
         * Detects the encoding of the named file. The file is closed before returning.
         *
         * @param fullFileName    path of the file to inspect
         * @param defaultEncoding charset name returned when no BOM is present and the
         *                        content is not valid UTF-8
         * @return name of the detected charset
         * @throws Exception if the file cannot be opened or read
         */
        public static String getEncode(String fullFileName, String defaultEncoding) throws Exception {
            try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fullFileName))) {
                return getEncode(bis, defaultEncoding);
            }
        }

        /**
         * Detects the encoding of the content of a stream positioned at its first byte.
         * The stream is consumed during detection (up to its end when no BOM is found)
         * and is neither reset nor closed here.
         *
         * @param bis             stream to inspect; must not have been read from yet
         * @param defaultEncoding charset name returned when no BOM is present and the
         *                        content is empty or not valid UTF-8
         * @return name of the detected charset
         * @throws Exception if reading the stream fails
         */
        public static String getEncode(BufferedInputStream bis, String defaultEncoding) throws Exception {
            byte[] head = new byte[3];
            int headLength = readHead(bis, head);

            if (headLength >= 2 && head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
                return CODE_UTF16;
            }
            if (headLength >= 2 && head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
                return CODE_UTF16BE;
            }
            if (headLength == 3 && head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
                // UTF-8 with BOM
                return CODE_UTF8;
            }
            if (headLength == 0) {
                // Empty content: nothing to judge by.
                return defaultEncoding;
            }
            return isUTF8(bis, head, headLength) ? CODE_UTF8 : defaultEncoding;
        }

        /**
         * Reads up to {@code head.length} bytes, looping because a single read may return fewer.
         *
         * @return number of bytes actually stored in {@code head}
         */
        private static int readHead(BufferedInputStream bis, byte[] head) throws Exception {
            int total = 0;
            while (total < head.length) {
                int got = bis.read(head, total, head.length - total);
                if (got < 0) {
                    break;
                }
                total += got;
            }
            return total;
        }

        /**
         * Checks whether the already-read head bytes followed by the rest of the stream
         * form well-formed UTF-8 (no BOM). Only the byte structure is checked; this is
         * meant to tell BOM-less UTF-8 apart from GBK, not to be a full validator.
         */
        private static boolean isUTF8(BufferedInputStream bis, byte[] head, int headLength) throws Exception {
            int pending = 0;
            for (int i = 0; i < headLength; i++) {
                pending = nextPending(pending, head[i] & 0xFF);
                if (pending < 0) {
                    return false;
                }
            }

            int code;
            while ((code = bis.read()) != -1) {
                pending = nextPending(pending, code);
                if (pending < 0) {
                    return false;
                }
            }
            // A character cut off by the end of the stream is not valid UTF-8.
            return pending == 0;
        }

        /**
         * Advances the UTF-8 structure check by one byte.
         *
         * @param pending continuation bytes still expected for the current character
         * @param b       next byte as an unsigned value (0..255)
         * @return the new number of expected continuation bytes, or -1 if {@code b} is invalid here
         */
        private static int nextPending(int pending, int b) {
            if (pending > 0) {
                // Inside a multi-byte character only 10xxxxxx is allowed.
                return (b & 0xC0) == 0x80 ? pending - 1 : -1;
            }
            if (b < 0x80) {
                return 0;
            }
            // Lead bytes per RFC 3629; 80..C1 and F5..FF cannot start a character.
            if (b >= 0xC2 && b <= 0xDF) {
                return 1;
            }
            if (b >= 0xE0 && b <= 0xEF) {
                return 2;
            }
            if (b >= 0xF0 && b <= 0xF4) {
                return 3;
            }
            return -1;
        }

        public static void main(String[] args) {
            // The backslash must be escaped: "\110" would be read as an octal escape ('H').
            String filePath = "C:\\110025.txt";
            try {
                String encoding = getEncode(filePath);
                System.out.println(encoding);
            } catch (Exception ex) {
                logger.warn("文件检测编码出错!", ex);
            }
        }
    }
  • 相关阅读:
    Oracle常见授权与回收权限——grant和revoke
    数据库之笛卡尔积
    hdu 2032 一维数组实现杨辉三角
    poj3071之概率DP
    冒泡排序及两种优化方式
    Non-ASCII character '\xe8' in file xxx.py on line 8, but no encoding declared
    编写shell脚本获取本机的网络地址。 比方:本机的ip地址是:192.168.100.2/255.255.255.0,那么它的网络地址是 192.168.100.1/255.255.255.
    移动站点性能优化
    Math类概述及其成员方法
    java中StringBuilder、StringBuffer、String类之间的关系
  • 原文地址:https://www.cnblogs.com/zhshlimi/p/8715012.html
Copyright © 2011-2022 走看看