zoukankan      html  css  js  c++  java
  • 检测编码

    /**
     * Guesses the text encoding of {@code file}.
     *
     * <p>The first three bytes are checked for a UTF-16LE, UTF-16BE or UTF-8
     * byte order mark. If none is present, up to the first 4096 bytes are
     * scanned for the byte sequences that encode the Chinese character U+7684
     * in GBK, UTF-8, UTF-16LE, UTF-16BE and Big5, and the encoding whose
     * sequence occurs strictly more often than every other one is chosen.
     *
     * <p>The file pointer is moved by this method and is not restored.
     *
     * @param file the file to inspect
     * @return the detected encoding; {@link Encoding#GBK} when the file is
     *         shorter than three bytes, when no candidate wins outright, or
     *         when reading fails
     */
    public static Encoding determineEncoding(RandomAccessFile file) {
        Encoding enc = Encoding.GBK;
        try {
            file.seek(0);
            if (file.length() < 3) {
                return enc;
            }

            byte[] bom = new byte[3]; // byte order mark
            // readFully guarantees all three bytes are present (or throws).
            file.readFully(bom);
            int b0 = bom[0] & 0xFF;
            int b1 = bom[1] & 0xFF;
            int b2 = bom[2] & 0xFF;

            if (b0 == 0xFF && b1 == 0xFE) {
                enc = Encoding.UTF16LE;
            } else if (b0 == 0xFE && b1 == 0xFF) {
                enc = Encoding.UTF16BE;
            } else if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
                enc = Encoding.UTF8;
            } else {
                // No BOM: fall back to a frequency heuristic on the file head.
                file.seek(0);
                byte[] bs = new byte[4096];
                int filled = 0;
                // A single read() may return fewer bytes than requested, so
                // keep reading until the buffer is full or EOF is reached.
                while (filled < bs.length) {
                    int n = file.read(bs, filled, bs.length - filled);
                    if (n < 0) {
                        break;
                    }
                    filled += n;
                }

                int gbkCount = 0;
                int big5Count = 0;
                int utf16leCount = 0;
                int utf16beCount = 0;
                int utf8Count = 0;

                // Look for the encodings of the Chinese character U+7684.
                // Only bytes that were actually read are examined.
                for (int i = 0; i + 1 < filled; ++i) {
                    int cur = bs[i] & 0xFF;
                    int next = bs[i + 1] & 0xFF;
                    if (cur == 0xB5 && next == 0xC4) {
                        ++gbkCount;
                        ++i;
                    } else if (cur == 0xE7 && next == 0x9A
                            && i + 2 < filled && (bs[i + 2] & 0xFF) == 0x84) {
                        ++utf8Count;
                        i += 2;
                    } else if (cur == 0x84 && next == 0x76) {
                        ++utf16leCount;
                        ++i;
                    } else if (cur == 0x76 && next == 0x84) {
                        ++utf16beCount;
                        ++i;
                    } else if (cur == 0xAA && next == 0xBA) {
                        ++big5Count;
                        ++i;
                    }
                }

                // Pick the candidate whose count is strictly greater than all
                // others; on a tie the GBK default is kept.
                Encoding[] candidates = {
                    Encoding.GBK, Encoding.UTF8, Encoding.UTF16LE, Encoding.UTF16BE, Encoding.BIG5
                };
                int[] counts = {gbkCount, utf8Count, utf16leCount, utf16beCount, big5Count};
                int best = 0;
                boolean unique = true;
                for (int k = 1; k < counts.length; ++k) {
                    if (counts[k] > counts[best]) {
                        best = k;
                        unique = true;
                    } else if (counts[k] == counts[best]) {
                        unique = false;
                    }
                }
                if (unique) {
                    enc = candidates[best];
                }
            }
        } catch (Exception ex) {
            // Best effort: detection problems must not propagate to the caller.
            // NOTE(review): assumes Log is android.util.Log, which has the
            // (tag, message, throwable) overload - confirm.
            Log.e("File ERROR", "encoding detection failed.", ex);
        }
        return enc;
    }
    	


    /**
     * Text encodings known to the encoding detection, each carrying its
     * charset name and the number of bytes it needs for one sample CJK
     * character.
     */
    public enum Encoding {
        GBK("GBK"),
        BIG5("BIG5"),
        UTF8("UTF-8"),
        UTF16BE("UTF-16BE"),
        UTF16LE("UTF-16LE"),
        UNKNOWN("UNKNOWN");

        private final String name;
        private final int maxCharLength;

        /**
         * @param name charset name passed to {@link String#getBytes(String)}
         */
        Encoding(String name) {
            this.name = name;
            int sampleLength = 0;
            try {
                // The sample character U+4E2D is written as a unicode escape so
                // the result does not depend on the compiler's source encoding.
                sampleLength = "\u4e2d".getBytes(name).length;
            } catch (java.io.UnsupportedEncodingException unsupported) {
                // Expected for names the runtime has no charset for (e.g.
                // "UNKNOWN"); the length deliberately stays 0 in that case.
            }
            this.maxCharLength = sampleLength;
        }

        /**
         * @return the charset name this constant was created with
         */
        public String getName() {
            return name;
        }

        /**
         * Returns the number of bytes the character U+4E2D occupies in this
         * encoding, or 0 if the runtime does not support the charset.
         *
         * <p>NOTE(review): despite the name this is a sample length, not a
         * true upper bound (UTF-8 and UTF-16 can use 4 bytes per character).
         *
         * @return byte length of the sample character, or 0
         */
        public int getMaxCharLength() {
            return maxCharLength;
        }
    }
    


  • 相关阅读:
    基于FPGA的频率检测与LCD显示
    基于labview和fpga的信号发生器
    基于FPGA的直流电机
    基于FPGA的LDPC编译码器说明文档
    基于FPGA的dds发生器与lcd显示
    12th.Linux驱动程序开发
    11th.U-boot——代码结构分析(二)
    C语言中的函数指针
    10th.U-boot——代码结构分析(一)
    9th.U-boot——初识Bootloader
  • 原文地址:https://www.cnblogs.com/javawebsoa/p/3006013.html
Copyright © 2011-2022 走看看