zoukankan      html  css  js  c++  java
  • java 清除 bom

    参考工具  http://koti.mbnet.fi/akini/java/unicodereader/


    Utf8BomRemover 清除bom的方法
    1. package cn.com.do1.component.common.util;
    2. import java.io.*;
    3. import java.nio.charset.Charset;
    4. public class Utf8BomRemover {
    5. /**
    6. * 读取流中前面的字符,看是否有bom,如果有bom,将bom头先读掉丢弃
    7. *
    8. * @param in
    9. * @return
    10. * @throws java.io.IOException
    11. */
    12. public static InputStream getInputStream(InputStream in) throws IOException {
    13. PushbackInputStream testin = new PushbackInputStream(in);
    14. int ch = testin.read();
    15. if (ch != 0xEF) {
    16. testin.unread(ch);
    17. } else if ((ch = testin.read()) != 0xBB) {
    18. testin.unread(ch);
    19. testin.unread(0xef);
    20. } else if ((ch = testin.read()) != 0xBF) {
    21. throw new IOException("错误的UTF-8格式文件");
    22. } else {
    23. // 不需要做,这里是bom头被读完了
    24. // // System.out.println("still exist bom");
    25. }
    26. return testin;
    27. }
    28. /**
    29. * 根据一个文件名,读取完文件,干掉bom头。
    30. *
    31. * @param fileName
    32. * @throws java.io.IOException
    33. */
    34. public static void trimBom(String fileName) throws IOException {
    35. FileInputStream fin = new FileInputStream(fileName);
    36. // 开始写临时文件
    37. InputStream in = getInputStream(fin);
    38. ByteArrayOutputStream bos = new ByteArrayOutputStream();
    39. byte b[] = new byte[4096];
    40. int len = 0;
    41. while (in.available() > 0) {
    42. len = in.read(b, 0, 4096);
    43. // out.write(b, 0, len);
    44. bos.write(b, 0, len);
    45. }
    46. in.close();
    47. fin.close();
    48. bos.close();
    49. // 临时文件写完,开始将临时文件写回本文件。
    50. System.out.println("[" + fileName + "]");
    51. FileOutputStream out = new FileOutputStream(fileName);
    52. out.write(bos.toByteArray());
    53. out.close();
    54. System.out.println("处理文件" + fileName);
    55. }
    56. public static void main(String[] args) throws IOException {
    57. //刪除指定文件夾下(含子文件夾)所有java文件的BOM,若構造器中參數為null則刪除所有文件頭部BOM
    58. new Utf8BomRemover( "java" ).start( ""F:\\flwork\\gmmsDGYH\\src\\com");
    59. }
    60. /**
    61. * 根据一个文件名,读取完文件,干掉bom头2 这里使用了第三方类UnicodeReader
    62. *
    63. * @throws java.io.IOException
    64. */
    65. public void saveFile(String file) throws IOException {
    66. InputStream in= new FileInputStream( file);
    67. BufferedReader bre = null;
    68. OutputStreamWriter pw = null;//定义一个流
    69. CharArrayWriter writer = new CharArrayWriter();
    70. //这一句会读取BOM头
    71. //bre = new BufferedReader(new InputStreamReader(in, "UTF-8"));
    72. //这一句会干掉BOM头
    73. bre = new BufferedReader(new UnicodeReader(in, Charset.defaultCharset().name()));//
    74. String line = bre.readLine();
    75. while(line != null)
    76. {
    77. writer.write(line);
    78. /*
    79. 加上这段代码可以查看更详细的16进制
    80. byte[] allbytes = line.getBytes("UTF-8");
    81. for (int i=0; i < allbytes.length; i++)
    82. {
    83. int tmp = allbytes[i];
    84. String hexString = Integer.toHexString(tmp);
    85. // 1个byte变成16进制的,只需要2位就可以表示了,取后面两位,去掉前面的符号填充
    86. hexString = hexString.substring(hexString.length() -2);
    87. System.out.print(hexString.toUpperCase());
    88. System.out.print(" ");
    89. }*/
    90. line = bre.readLine();
    91. }
    92. writer.flush();
    93. bre.close();
    94. FileWriter f2 = new FileWriter(file);
    95. writer.writeTo(f2);
    96. f2.close();
    97. writer.close();
    98. }
    99. private String extension = null ;
    100. public Utf8BomRemover(String extension) {
    101. super ();
    102. this .extension = extension;
    103. }
    104. /** 啟動對某個文件夾的篩選 */
    105. @SuppressWarnings ( "unchecked" )
    106. public void start(String rootDir) throws IOException {
    107. traverseFolder2(rootDir);
    108. }
    109. public void traverseFolder2(String path) throws IOException {
    110. File file = new File(path);
    111. if (file.exists()) {
    112. File[] files = file.listFiles();
    113. if (files.length == 0) {
    114. System.out.println("文件夹是空的!");
    115. return;
    116. } else {
    117. for (File file2 : files) {
    118. if (file2.isDirectory()) {
    119. System.out.println("文件夹:" + file2.getAbsolutePath());
    120. traverseFolder2(file2.getAbsolutePath());
    121. } else {
    122. remove(file2.getAbsolutePath());
    123. }
    124. }
    125. }
    126. } else {
    127. System.out.println("文件不存在!");
    128. }
    129. }
    130. /** 移除UTF-8的BOM */
    131. private void remove(String path) throws IOException {
    132. saveFile(path);
    133. trimBom(path);
    134. }
    135. }

    第二种方法的UnicodeReader


    1. package cn.com.do1.component.common.util;
    2. import java.io.IOException;
    3. import java.io.InputStream;
    4. import java.io.InputStreamReader;
    5. import java.io.PushbackInputStream;
    6. import java.io.Reader;
    7. /**
    8. version: 1.1 / 2007-01-25
    9. - changed BOM recognition ordering (longer boms first)
    10. 网络地址:http://koti.mbnet.fi/akini/java/unicodereader/UnicodeReader.java.txt
    11. Original pseudocode : Thomas Weidenfeller
    12. Implementation tweaked: Aki Nieminen
    13. http://www.unicode.org/unicode/faq/utf_bom.html
    14. BOMs:
    15. 00 00 FE FF = UTF-32, big-endian
    16. FF FE 00 00 = UTF-32, little-endian
    17. EF BB BF = UTF-8,
    18. FE FF = UTF-16, big-endian
    19. FF FE = UTF-16, little-endian
    20. Win2k Notepad:
    21. Unicode format = UTF-16LE
    22. ***/
    /**
     * Generic Unicode text reader that uses a leading byte order mark (BOM) to
     * pick the encoding. When no BOM is found, the given default encoding is used,
     * or the platform default if that is {@code null}.
     *
     * <p>The BOM bytes themselves are consumed and never returned to the caller.
     */
    public class UnicodeReader extends Reader {
        PushbackInputStream internalIn;
        InputStreamReader internalIn2 = null;
        String defaultEnc;

        /** Length of the longest BOM recognised (UTF-32). */
        private static final int BOM_SIZE = 4;

        /**
         * @param in         input stream to be read
         * @param defaultEnc default encoding if the stream has no BOM;
         *                   {@code null} selects the platform default
         */
        UnicodeReader(InputStream in, String defaultEnc) {
            internalIn = new PushbackInputStream(in, BOM_SIZE);
            this.defaultEnc = defaultEnc;
        }

        /** Returns the encoding used when the stream carries no BOM (may be {@code null}). */
        public String getDefaultEncoding() {
            return defaultEnc;
        }

        /**
         * Returns the stream encoding, or {@code null} if the reader is uninitialized.
         * Call {@link #init()} or a read method to initialize it.
         */
        public String getEncoding() {
            if (internalIn2 == null) {
                return null;
            }
            return internalIn2.getEncoding();
        }

        /**
         * Reads ahead up to four bytes and checks them for a BOM. Only the BOM
         * bytes are skipped; any other bytes read are pushed back onto the stream.
         * Calling this method again after a successful call has no effect.
         *
         * @throws IOException if reading the leading bytes fails or the detected
         *                     encoding is not supported
         */
        protected void init() throws IOException {
            if (internalIn2 != null) {
                return;
            }

            // A single read may return fewer bytes than requested, so fill in a loop.
            byte[] bom = new byte[BOM_SIZE];
            int n = 0;
            while (n < bom.length) {
                int got = internalIn.read(bom, n, bom.length - n);
                if (got < 0) {
                    break;
                }
                n += got;
            }

            // Longer BOMs are tested first because FF FE is a prefix of the
            // UTF-32LE mark. Each test also checks that enough bytes were read,
            // so the zero-filled tail of the buffer can never complete a match.
            String encoding;
            int bomLength;
            if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
                    && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
                encoding = "UTF-32BE";
                bomLength = 4;
            } else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
                    && bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
                encoding = "UTF-32LE";
                bomLength = 4;
            } else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
                    && bom[2] == (byte) 0xBF) {
                encoding = "UTF-8";
                bomLength = 3;
            } else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
                encoding = "UTF-16BE";
                bomLength = 2;
            } else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
                encoding = "UTF-16LE";
                bomLength = 2;
            } else {
                // No BOM found: every byte read goes back onto the stream.
                encoding = defaultEnc;
                bomLength = 0;
            }

            int unread = n - bomLength;
            if (unread > 0) {
                internalIn.unread(bom, bomLength, unread);
            }

            if (encoding == null) {
                internalIn2 = new InputStreamReader(internalIn);
            } else {
                internalIn2 = new InputStreamReader(internalIn, encoding);
            }
        }

        /**
         * Closes the reader and the underlying stream. If nothing has been read
         * yet, the stream is closed directly without sniffing for a BOM first.
         */
        @Override
        public void close() throws IOException {
            if (internalIn2 != null) {
                internalIn2.close();
            } else {
                internalIn.close();
            }
        }

        /** Reads decoded characters, detecting the encoding on first use. */
        @Override
        public int read(char[] cbuf, int off, int len) throws IOException {
            init();
            return internalIn2.read(cbuf, off, len);
        }
    }




  • 相关阅读:
    ACM成长之路
    洛谷P1047 校门外的树
    洛谷P1046 陶陶摘苹果
    2017 ACM-ICPC 亚洲区(南宁赛区)网络赛 F题
    图论:POJ2186-Popular Cows (求强连通分量)
    DFS:POJ1562-Oil Deposits(求连通块个数)
    DFS:POJ3620-Avoid The Lakes(求最基本的联通块)
    map函数的应用:UVa156-Ananagrams
    set的应用:UVa10815-Andy's First Dictionary
    水题:UVa253-Cube painting
  • 原文地址:https://www.cnblogs.com/signheart/p/0925dae8a5a29153fe5276f67701176d.html
Copyright © 2011-2022 走看看