zoukankan      html  css  js  c++  java
  • Java检测文件是否UTF8编码

    介绍UTF-8编码规则

    1. UTF-8 编码字符理论上可以最多到 6 个字节长 (这是早期 ISO 10646 / RFC 2279 的定义; 现行标准 RFC 3629 将码点上限定为 U+10FFFF, 合法的编码最多只有 4 个字节), 然而 16 位 BMP 字符最多只用到 3 字节长. Bigendian UCS-4 字节串的排列顺序是预定的.
    2. 字节 0xFE 和 0xFF 在 UTF-8 编码中从未用到.
    3. 下列字节串用来表示一个字符. 用到哪个串取决于该字符在 Unicode 中的序号.
    4. U-00000000 - U-0000007F: 0xxxxxxx
    5. U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
    6. U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
    7. U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    8. U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    9. U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    10. xxx 的位置由字符编码数的二进制表示的位填入. 越靠右的 x 具有越少的特殊意义. 只用最短的那个足够表达一个字符编码数的多字节串. 注意在多字节串中, 第一个字节的开头"1"的数目就是整个串中字节的数目.
    11. 例如: Unicode 字符 U+00A9 = 1010 1001 (版权符号) 在 UTF-8 里的编码为:
    12. 11000010 10101001 = 0xC2 0xA9
    13. 而字符 U+2260 = 0010 0010 0110 0000 (不等于) 编码为:
    14. 11100010 10001001 10100000 = 0xE2 0x89 0xA0
    15. 特殊规则: 文件头三个字节用16进制表示是 EF BB BF (即 UTF-8 BOM, 也就是字符 U+FEFF 的 UTF-8 编码), 此规则不通用, 由编辑工具定义.
    16. 这种编码的官方名字拼写为 UTF-8, 其中 UTF 代表 UCS Transformation Format. 请勿在任何文档中用其他名字 (比如 utf8 或 UTF_8) 来表示 UTF-8, 当然除非你指的是一个变量名而不是这种编码本身.

    复制代码

    源码实现:

    1. package com.yy.game.test;
    2. import java.io.BufferedInputStream;
    3. import java.io.File;
    4. import java.io.FileInputStream;
    5. import java.io.IOException;
    6. import java.io.InputStream;
    7. import java.nio.CharBuffer;
    8. import java.nio.MappedByteBuffer;
    9. import java.nio.channels.FileChannel;
    10. import java.nio.channels.FileChannel.MapMode;
    11. import java.nio.charset.Charset;
    12. import java.nio.charset.CharsetDecoder;
    13. import java.nio.charset.CoderResult;
    /**
     * Checks whether files contain well-formed UTF-8, using two independent
     * implementations: a hand-written byte scanner ({@link #check(File)}) and the
     * JDK's own {@link CharsetDecoder} ({@link #check2(File)}).
     *
     * <p>Both methods print the elapsed time in nanoseconds to standard output.
     */
    public class UTF8Checker {

        /** Directory scanned by {@link #main(String[])} when no argument is given. */
        private static final String DEFAULT_DIR = "F:\\test";

        /** Size (in chars) of the scratch buffer the JDK decoder writes into. */
        private static final int DECODE_CHUNK = 8192;

        /**
         * Runs both checkers on every regular file of a directory and prints the results.
         *
         * @param args optional; {@code args[0]} is the directory to scan, otherwise
         *             {@link #DEFAULT_DIR} is used
         * @throws IOException if a file cannot be read
         */
        public static void main(String[] args) throws IOException {
            File dir = new File(args != null && args.length > 0 ? args[0] : DEFAULT_DIR);
            File[] files = dir.listFiles();
            if (files == null) {
                // listFiles() returns null when the path is not a directory or cannot be read.
                System.err.println("Not a readable directory: " + dir);
                return;
            }
            for (File file : files) {
                if (!file.isFile()) {
                    continue; // sub-directories cannot be opened as streams
                }
                System.out.format("%s: %s, %s%n", file, check(file), check2(file));
            }
        }

        /**
         * Implementation based on the JDK's built-in UTF-8 decoder.
         *
         * <p>The file is memory-mapped and decoded into a small, reused scratch buffer;
         * the decoded characters are discarded. Because the whole file is mapped at once,
         * files larger than {@code Integer.MAX_VALUE} bytes are rejected by
         * {@link FileChannel#map} with an {@link IllegalArgumentException}.
         *
         * @param file the file to examine
         * @return {@code true} if the decoder reports no malformed or unmappable input
         * @throws IOException if the file cannot be opened, mapped or closed
         */
        public static boolean check2(File file) throws IOException {
            long start = System.nanoTime();
            FileInputStream in = null;
            try {
                in = new FileInputStream(file);
                FileChannel fc = in.getChannel();
                MappedByteBuffer buf = fc.map(MapMode.READ_ONLY, 0, fc.size());
                CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();
                CharBuffer cbuf = CharBuffer.allocate(DECODE_CHUNK);
                CoderResult result;
                do {
                    // OVERFLOW only means the scratch buffer is full: empty it and go on,
                    // otherwise the tail of the file would never be validated.
                    cbuf.clear();
                    result = decoder.decode(buf, cbuf, true);
                } while (result.isOverflow());
                return !result.isError();
            } finally {
                if (in != null) {
                    in.close(); // closing the stream also closes its channel
                }
                long end = System.nanoTime();
                System.out.println("used(ns):" + (end - start));
            }
        }

        /**
         * Hand-written implementation: scans the file byte by byte and verifies that it
         * consists only of well-formed UTF-8 sequences as defined by RFC 3629.
         *
         * <p>A leading BOM ({@code EF BB BF}) needs no special handling because it is
         * itself a valid three-byte sequence; the rest of the file is still validated.
         *
         * @param file the file to examine
         * @return {@code true} if every byte belongs to a well-formed sequence
         *         (an empty file is considered valid)
         * @throws IOException if the file cannot be opened, read or closed
         */
        public static boolean check(File file) throws IOException {
            long start = System.nanoTime();
            InputStream in = null;
            try {
                in = new BufferedInputStream(new FileInputStream(file));
                return isWellFormed(new StreamBuffer(in, 1024));
            } finally {
                if (in != null) {
                    in.close();
                }
                long end = System.nanoTime();
                System.out.println("used(ns):" + (end - start));
            }
        }

        /**
         * Validates the remaining bytes of {@code sbuf} against the RFC 3629 table.
         *
         * <pre>
         * U+0000   - U+007F  : 00..7F
         * U+0080   - U+07FF  : C2..DF 80..BF
         * U+0800   - U+FFFF  : E0..EF 80..BF 80..BF   (E0: 2nd byte A0..BF, ED: 2nd byte 80..9F)
         * U+10000  - U+10FFFF: F0..F4 80..BF 80..BF 80..BF (F0: 2nd byte 90..BF, F4: 2nd byte 80..8F)
         * </pre>
         */
        private static boolean isWellFormed(StreamBuffer sbuf) throws IOException {
            int lead;
            while ((lead = sbuf.next()) != -1) {
                if (lead <= 0x7F) {
                    continue; // single-byte (ASCII) character
                }
                int extra;      // number of continuation bytes that must follow
                int lo = 0x80;  // allowed range of the first continuation byte
                int hi = 0xBF;
                if (lead < 0xC2) {
                    // 80..BF: continuation byte without a lead; C0/C1: overlong encoding
                    return false;
                } else if (lead <= 0xDF) {
                    extra = 1;
                } else if (lead <= 0xEF) {
                    extra = 2;
                    if (lead == 0xE0) {
                        lo = 0xA0; // exclude overlong forms of U+0000..U+07FF
                    } else if (lead == 0xED) {
                        hi = 0x9F; // exclude surrogates U+D800..U+DFFF
                    }
                } else if (lead <= 0xF4) {
                    extra = 3;
                    if (lead == 0xF0) {
                        lo = 0x90; // exclude overlong forms of U+0000..U+FFFF
                    } else if (lead == 0xF4) {
                        hi = 0x8F; // exclude code points above U+10FFFF
                    }
                } else {
                    // F5..FF never appear in UTF-8 (includes the obsolete 5/6-byte forms)
                    return false;
                }
                for (int i = 0; i < extra; i++) {
                    int b = sbuf.next();
                    // An EOF marker (-1) is below lo, so a truncated sequence is rejected too.
                    if (b < lo || b > hi) {
                        return false;
                    }
                    lo = 0x80; // only the first continuation byte has a restricted range
                    hi = 0xBF;
                }
            }
            return true;
        }

        /**
         * Minimal pull-style reader that hands out the bytes of a stream one at a time
         * through an internal array buffer.
         */
        static class StreamBuffer {
            final InputStream in;
            final byte[] buf;
            int pos = -1; // index of the byte returned last; -1 means nothing consumed from buf yet
            int len;      // number of valid bytes currently held in buf

            /**
             * @param in   stream to read from (not closed by this class)
             * @param size buffer capacity; values below 3 are raised to 3
             */
            public StreamBuffer(InputStream in, int size) {
                this.in = in;
                if (size < 3) {
                    size = 3;
                }
                this.buf = new byte[size];
            }

            /**
             * Rewinds to the start of the bytes currently held in the buffer, so that the
             * following {@link #next()} returns the first of them again. Bytes from an
             * earlier, already replaced buffer fill cannot be recovered.
             */
            public void redo() {
                this.pos = -1;
            }

            /**
             * Returns the next byte as an unsigned value.
             *
             * @return a value in {@code 0..255}, or {@code -1} at end of stream
             * @throws IOException if the underlying stream fails
             */
            public int next() throws IOException {
                if (pos + 1 >= len) {
                    int read = in.read(buf);
                    if (read <= 0) {
                        return -1; // read() signals end of stream with -1, not 0
                    }
                    len = read;
                    pos = -1;
                }
                return this.buf[++this.pos] & 0xFF;
            }
        }
    }

    复制代码

    在本机测试, JDK原生API需要创建CharBuffer,性能明显慢了25%以上.

    1. used(ns):472420
    2. used(ns):4490075
    3. F:\test\334d5fd-b8a7-48f4-9099-f6011c7e5a48.sql: true, true
    4. used(ns):122515
    5. used(ns):343490
    6. F:\test\334d5fd-b8a7-48f4-9099-f6011c7e5a482.sql: false, false
    7. used(ns):55164
    8. used(ns):82425
    9. F:\test\test.sql: false, false

    复制代码

  • 相关阅读:
    bzoj4321
    bzoj1800
    codeforces396C
    codeforces400C
    codeforces271D
    关于jsp中jstl-core标签循环遍历的使用
    hibernate坑边闲话2
    hibernate坑边闲话
    hibernate中实体与数据库中属性对应的类型
    MySQL的常用命令:添加外键,修改字段名称,增加字段 设置主键自增长等
  • 原文地址:https://www.cnblogs.com/zolo/p/5849180.html
Copyright © 2011-2022 走看看