public static Encoding determineEncoding(RandomAccessFile file) { Encoding enc = Encoding.GBK; try { file.seek(0); if(file.length() < 3) return enc; byte[] bom = new byte[3]; //byte order mark file.read(bom); if((bom[0] & 0XFF) == 0xFF && (bom[1] & 0XFF) == 0xFE) enc = Encoding.UTF16LE; else if((bom[0] & 0XFF) == 0xFE && (bom[1] & 0XFF) == 0xFF) enc = Encoding.UTF16BE; else if((bom[0] & 0XFF) == 0xEF && (bom[1] & 0XFF) == 0xBB && (bom[2] & 0XFF) == 0xBF) enc = Encoding.UTF8; else {//test if the file is encoded using GBK or BIG5 character set int gbkCount = 0; int big5Count = 0; int utf16leCount = 0; int utf16beCount = 0; int utf8Count = 0; file.seek(0); byte[] bs = new byte[4096]; file.read(bs); int len = bs.length - 2; //look up the Chinese characters "�? for(int i = 0; i < len; ++i) { if((bs[i] & 0xFF) == 0xB5 && (bs[i + 1] & 0xFF) == 0xC4) { ++gbkCount; ++i; } else if ((bs[i] & 0xFF) == 0xE7 && (bs[i + 1] & 0xFF) == 0x9A && (bs[i + 2] & 0xFF) == 0x84) { ++utf8Count; i += 2; } else if ((bs[i] & 0xFF) == 0x84 && (bs[i + 1] & 0xFF) == 0x76) { ++utf16leCount; ++i; } else if ((bs[i] & 0xFF) == 0x76 && (bs[i + 1] & 0xFF) == 0x84) { ++utf16beCount; ++i; } else if ((bs[i] & 0xFF) == 0xAA && (bs[i + 1] & 0xFF) == 0xBA) { ++big5Count; ++i; } } if(gbkCount > utf8Count && gbkCount > big5Count && gbkCount > utf16leCount && gbkCount > utf16beCount) enc = Encoding.GBK; else if(utf8Count > gbkCount && utf8Count > big5Count && utf8Count > utf16leCount && utf8Count > utf16beCount) enc = Encoding.UTF8; else if(utf16leCount > gbkCount && utf16leCount > big5Count && utf16leCount > utf8Count && utf16leCount > utf16beCount) enc = Encoding.UTF16LE; else if(utf16beCount > gbkCount && utf16beCount > big5Count && utf16beCount > utf16leCount && utf16beCount > utf16leCount) enc = Encoding.UTF16BE; else if(big5Count > gbkCount && big5Count > utf8Count && big5Count > utf16leCount && big5Count > utf16beCount) enc = Encoding.BIG5; } } catch (Exception ex) { Log.e("File ERROR", "encoding 
detection failed."); } return enc; }
/**
 * Text encodings the detector can report, each paired with its charset name.
 */
public enum Encoding {
    GBK("GBK"),
    BIG5("BIG5"),
    UTF8("UTF-8"),
    UTF16BE("UTF-16BE"),
    UTF16LE("UTF-16LE"),
    UNKNOWN("UNKNOWN");

    /** Charset name handed to the constructor. */
    private final String name;

    /**
     * Number of bytes the character "中" occupies in this charset, or 0 when
     * encoding it with {@link #name} failed.
     */
    private final int maxCharLength;

    Encoding(String name) {
        this.name = name;
        int measured = 0;
        try {
            measured = "中".getBytes(name).length;
        } catch (Exception e) {
            // Encoding with this charset name failed; the length stays 0.
        }
        this.maxCharLength = measured;
    }

    /**
     * @return the charset name of this encoding
     */
    public String getName() {
        return name;
    }

    /**
     * @return the byte length of "中" in this encoding, or 0 if it could not
     *         be encoded with this charset name
     */
    public int getMaxCharLength() {
        return maxCharLength;
    }
}