zoukankan      html  css  js  c++  java
  • Dom4j解析XML中遇到的一些问题

    最近在用Dom4j解析XML文件,遇到了一些问题,记录如下:

    1. BOM头问题,得到的异常是:

     Nested exception: org.xml.sax.SAXParseException: Content is not allowed in prolog.

    (1)http://koti.mbnet.fi/akini/java/unicodereader/,里面提供了两个删掉BOM头的方法(我用了第一个):

    UnicodeInputStream
      1 /**
      2  version: 1.1 / 2007-01-25
      3  - changed BOM recognition ordering (longer boms first)
      4 
      5  Original pseudocode   : Thomas Weidenfeller
      6  Implementation tweaked: Aki Nieminen
      7 
      8  http://www.unicode.org/unicode/faq/utf_bom.html
      9  BOMs in byte length ordering:
     10    00 00 FE FF    = UTF-32, big-endian
     11    FF FE 00 00    = UTF-32, little-endian
     12    EF BB BF       = UTF-8,
     13    FE FF          = UTF-16, big-endian
     14    FF FE          = UTF-16, little-endian
     15 
     16  Win2k Notepad:
     17    Unicode format = UTF-16LE
     18 
     19 This class will help you to autorecognize and skip BOMs. This will support UTF-8 as well.
     20 ***/
     21 
     22 import java.io.*;
     23 
     /**
      * Input stream that recognizes a leading Unicode byte order mark (BOM)
      * and skips it when {@link #getEncoding()} is called before any of the
      * read methods. If a read method is called first, no detection happens:
      * the BOM bytes (if any) are returned to the caller and
      * {@link #getEncoding()} returns {@code null}.
      *
      * <p>Usage pattern:
      * <pre>
      *   String enc = "ISO-8859-1"; // or null to use the system default
      *   FileInputStream fis = new FileInputStream(file);
      *   UnicodeInputStream uin = new UnicodeInputStream(fis, enc);
      *   enc = uin.getEncoding(); // check and skip possible BOM bytes
      *   InputStreamReader in;
      *   if (enc == null) in = new InputStreamReader(uin);
      *   else in = new InputStreamReader(uin, enc);
      * </pre>
      */
     public class UnicodeInputStream extends InputStream {
         PushbackInputStream internalIn;
         boolean             isInited = false;
         String              defaultEnc;
         String              encoding;

         /** Length in bytes of the longest BOM that is recognized (UTF-32). */
         private static final int BOM_SIZE = 4;

         /**
          * @param in         stream to read from
          * @param defaultEnc encoding reported when the stream carries no BOM;
          *                   may be {@code null}
          */
         public UnicodeInputStream(InputStream in, String defaultEnc) {
             // The pushback buffer must hold every byte read ahead by init().
             internalIn = new PushbackInputStream(in, BOM_SIZE);
             this.defaultEnc = defaultEnc;
         }

         /** Returns the default encoding given to the constructor. */
         public String getDefaultEncoding() {
             return defaultEnc;
         }

         /**
          * Detects the encoding from the BOM (skipping the BOM bytes) on first
          * use and returns it.
          *
          * @return the encoding named by the BOM, the default encoding when no
          *         BOM was found, or {@code null} when a read method was called
          *         before this method
          * @throws IllegalStateException if reading the first bytes fails; the
          *         underlying {@link IOException} is attached as the cause
          */
         public String getEncoding() {
             if (!isInited) {
                 try {
                     init();
                 } catch (IOException ex) {
                     // Attach the real failure. Passing the new exception to its own
                     // initCause() is rejected as self-causation and hides the cause.
                     throw new IllegalStateException("Init method failed.", ex);
                 }
             }
             return encoding;
         }

         /**
          * Reads ahead up to four bytes and checks them for a BOM. Only the BOM
          * bytes are consumed; every other byte is pushed back onto the stream.
          *
          * @throws IOException if the underlying stream cannot be read
          */
         protected void init() throws IOException {
             if (isInited) return;

             byte[] bom = new byte[BOM_SIZE];

             // A single read may legally return fewer bytes than requested, so
             // keep reading until the buffer is full or the stream ends.
             int n = 0;
             while (n < bom.length) {
                 int r = internalIn.read(bom, n, bom.length - n);
                 if (r < 0) break;
                 n += r;
             }

             // Each pattern is only tested when that many bytes were really read.
             // Otherwise the zero-filled tail of the buffer lets a short stream
             // such as FF FE be mistaken for the UTF-32LE mark FF FE 00 00.
             int bomLen;
             if (n >= 4 && (bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00)
                     && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
                 encoding = "UTF-32BE";
                 bomLen = 4;
             } else if (n >= 4 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)
                     && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
                 encoding = "UTF-32LE";
                 bomLen = 4;
             } else if (n >= 3 && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)
                     && (bom[2] == (byte) 0xBF)) {
                 encoding = "UTF-8";
                 bomLen = 3;
             } else if (n >= 2 && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
                 encoding = "UTF-16BE";
                 bomLen = 2;
             } else if (n >= 2 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
                 encoding = "UTF-16LE";
                 bomLen = 2;
             } else {
                 // Unicode BOM mark not found, unread all bytes
                 encoding = defaultEnc;
                 bomLen = 0;
             }

             if (n > bomLen) internalIn.unread(bom, bomLen, n - bomLen);

             isInited = true;
         }

         @Override
         public void close() throws IOException {
             // Closing counts as "used": no BOM detection afterwards.
             isInited = true;
             internalIn.close();
         }

         @Override
         public int read() throws IOException {
             // Reading before getEncoding() deliberately disables BOM detection.
             isInited = true;
             return internalIn.read();
         }

         /**
          * Bulk read delegated to the wrapped stream, so callers such as
          * {@link InputStreamReader} do not fall back to byte-at-a-time reads.
          */
         @Override
         public int read(byte[] b, int off, int len) throws IOException {
             isInited = true;
             return internalIn.read(b, off, len);
         }
     }
    UnicodeReader
      1 /**
      2  version: 1.1 / 2007-01-25
      3  - changed BOM recognition ordering (longer boms first)
      4 
      5  Original pseudocode   : Thomas Weidenfeller
      6  Implementation tweaked: Aki Nieminen
      7 
      8  http://www.unicode.org/unicode/faq/utf_bom.html
      9  BOMs:
     10    00 00 FE FF    = UTF-32, big-endian
     11    FF FE 00 00    = UTF-32, little-endian
     12    EF BB BF       = UTF-8,
     13    FE FF          = UTF-16, big-endian
     14    FF FE          = UTF-16, little-endian
     15 
     16  Win2k Notepad:
     17    Unicode format = UTF-16LE
     18 
     19 This class will do everything even more transparently. Just instantiate it and read text.
     20 ***/
     21 
     22 import java.io.*;
     23 
     /**
      * Generic Unicode text reader that uses a leading byte order mark (BOM)
      * to pick the character encoding. When the stream has no BOM, the given
      * default encoding is used, or the platform default if that is
      * {@code null}.
      */
     public class UnicodeReader extends Reader {
         PushbackInputStream internalIn;
         InputStreamReader   internalIn2 = null;
         String              defaultEnc;

         /** Length in bytes of the longest BOM that is recognized (UTF-32). */
         private static final int BOM_SIZE = 4;

         /**
          * @param in         input stream to be read
          * @param defaultEnc default encoding if the stream does not have a
          *                   BOM marker; give {@code null} to use the
          *                   system-level default
          */
         public UnicodeReader(InputStream in, String defaultEnc) {
             // The pushback buffer must hold every byte read ahead by init().
             internalIn = new PushbackInputStream(in, BOM_SIZE);
             this.defaultEnc = defaultEnc;
         }

         /** Returns the default encoding given to the constructor. */
         public String getDefaultEncoding() {
             return defaultEnc;
         }

         /**
          * Returns the encoding of the underlying reader, or {@code null} if
          * the stream is uninitialized. Call {@link #init()} or a read method
          * to initialize it.
          */
         public String getEncoding() {
             if (internalIn2 == null) return null;
             return internalIn2.getEncoding();
         }

         /**
          * Reads ahead up to four bytes and checks them for a BOM. Only the BOM
          * bytes are consumed; every other byte is pushed back onto the stream.
          * Afterwards the decoding reader is created for the chosen encoding.
          *
          * @throws IOException if the stream cannot be read or the encoding is
          *         not supported
          */
         protected void init() throws IOException {
             if (internalIn2 != null) return;

             byte[] bom = new byte[BOM_SIZE];

             // A single read may legally return fewer bytes than requested, so
             // keep reading until the buffer is full or the stream ends.
             int n = 0;
             while (n < bom.length) {
                 int r = internalIn.read(bom, n, bom.length - n);
                 if (r < 0) break;
                 n += r;
             }

             // Each pattern is only tested when that many bytes were really read.
             // Otherwise the zero-filled tail of the buffer lets a short stream
             // such as FF FE be mistaken for the UTF-32LE mark FF FE 00 00.
             String encoding;
             int bomLen;
             if (n >= 4 && (bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00)
                     && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
                 encoding = "UTF-32BE";
                 bomLen = 4;
             } else if (n >= 4 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)
                     && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
                 encoding = "UTF-32LE";
                 bomLen = 4;
             } else if (n >= 3 && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)
                     && (bom[2] == (byte) 0xBF)) {
                 encoding = "UTF-8";
                 bomLen = 3;
             } else if (n >= 2 && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
                 encoding = "UTF-16BE";
                 bomLen = 2;
             } else if (n >= 2 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
                 encoding = "UTF-16LE";
                 bomLen = 2;
             } else {
                 // Unicode BOM mark not found, unread all bytes
                 encoding = defaultEnc;
                 bomLen = 0;
             }

             if (n > bomLen) internalIn.unread(bom, bomLen, n - bomLen);

             // Use given encoding
             if (encoding == null) {
                 internalIn2 = new InputStreamReader(internalIn);
             } else {
                 internalIn2 = new InputStreamReader(internalIn, encoding);
             }
         }

         @Override
         public void close() throws IOException {
             // Do not run BOM detection just to close: a failing read would
             // leave the underlying stream open.
             if (internalIn2 != null) {
                 internalIn2.close();
             } else {
                 internalIn.close();
             }
         }

         @Override
         public int read(char[] cbuf, int off, int len) throws IOException {
             init();
             return internalIn2.read(cbuf, off, len);
         }

     }

    (2)下面的代码演示了产生该异常的另一类原因:XML声明中的encoding与数据的实际编码不一致(其中按UTF-16编码时,Java会在开头写入BOM头):

    ContentNotAllowedInProlog
    import java.io.*;
    import java.nio.charset.Charset;
    import javax.xml.parsers.*;
    import org.xml.sax.SAXException;
    import org.xml.sax.helpers.DefaultHandler;
    
    /**
     * Demonstrates how a mismatch between the encoding declared in the XML
     * declaration and the encoding actually used for the bytes shows up when
     * parsing: every pair of different encodings is tried and the parser's
     * reaction is printed.
     */
    public class ContentNotAllowedInProlog {
      /**
       * Parses the stream with a default SAX parser and a no-op handler.
       *
       * @throws SAXException if the document cannot be parsed
       * @throws ParserConfigurationException if no parser can be created
       * @throws IOException if the stream cannot be read
       */
      private static void parse(InputStream stream) throws SAXException,
          ParserConfigurationException, IOException {
        SAXParserFactory.newInstance().newSAXParser().parse(stream,
            new DefaultHandler());
      }

      /**
       * Encodes a tiny document in one encoding while declaring another, for
       * every pair of distinct encodings, and prints one line per pair: either
       * the parser's error message or a "HIDDEN ERROR!" marker when the
       * mismatch went unnoticed.
       */
      public static void main(String[] args) {
        String[] encodings = { "UTF-8", "UTF-16", "ISO-8859-1" };
        for (String actual : encodings) {
          for (String declared : encodings) {
            // Compare by value; reference comparison only works by accident
            // because both loops walk the same array of literals.
            if (!actual.equals(declared)) {
              String xml = "<?xml version='1.0' encoding='" + declared
                  + "'?><x/>";
              byte[] encoded = xml.getBytes(Charset.forName(actual));
              try {
                parse(new ByteArrayInputStream(encoded));
                System.out.println("HIDDEN ERROR! actual:" + actual + " " + xml);
              } catch (Exception e) {
                // Demo code: any failure is the result being demonstrated.
                System.out.println(e.getMessage() + " actual:" + actual + " xml:"
                    + xml);
              }
            }
          }
        }
      }
    }

    2. <!-- -->注释问题,得到的异常是:

    Nested exception: org.xml.sax.SAXParseException: The string "--" is not permitted within comments.

    产生这个异常的原因有很多。其中一个是<!--  -->中间出现了“--”字符;另外一个是“-->”前面不是空格:例如“abc-->”会抛出异常,而“abc -->”则不会。(按照XML规范,注释内容中不允许出现“--”,因此以“--->”结尾的注释同样是非法的。)

    我的解决方式就是:删掉所有注释!

    针对这两个问题,写了个简单工具类:

    XmlUtil
      1 import java.io.BufferedReader;
      2 import java.io.FileInputStream;
      3 import java.io.FileNotFoundException;
      4 import java.io.FileOutputStream;
      5 import java.io.IOException;
      6 import java.io.InputStream;
      7 import java.io.InputStreamReader;
      8 import java.io.OutputStreamWriter;
      9 import java.io.PushbackInputStream;
     10 import java.io.UnsupportedEncodingException;
     11 import java.util.HashMap;
     12 import java.util.List;
     13 import org.dom4j.DocumentHelper;
     14 import org.dom4j.Element;
     15 import org.dom4j.XPath;
     16 import org.slf4j.Logger;
     17 import org.slf4j.LoggerFactory;
     18 
     19 public class XmlUtil extends InputStream {
     20     private static final Logger log = (Logger) LoggerFactory
     21             .getLogger(XmlUtil.class);
     22     private static final int BOM_SIZE = 4;
     23     PushbackInputStream internalIn;
     24     boolean isInited = false;
     25     String defaultEnc;
     26     String encoding;
     27 
     28     public XmlUtil(InputStream in, String defaultEnc) {
     29         internalIn = new PushbackInputStream(in, BOM_SIZE);
     30         this.defaultEnc = defaultEnc;
     31     }
     32 
     33     public String getDefaultEncoding() {
     34         return defaultEnc;
     35     }
     36 
     37     /**
     38      * Read-ahead four bytes and check for BOM marks. Extra bytes are unread
     39      * back to the stream, only BOM bytes are skipped.
     40      */
     41     protected void initXmlBOM() throws IOException {
     42         if (isInited)
     43             return;
     44 
     45         byte bom[] = new byte[BOM_SIZE];
     46         int n, unread;
     47         n = internalIn.read(bom, 0, bom.length);
     48 
     49         if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00)
     50                 && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
     51             encoding = "UTF-32BE";
     52             unread = n - 4;
     53         } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)
     54                 && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
     55             encoding = "UTF-32LE";
     56             unread = n - 4;
     57         } else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)
     58                 && (bom[2] == (byte) 0xBF)) {
     59             encoding = "UTF-8";
     60             unread = n - 3;
     61         } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
     62             encoding = "UTF-16BE";
     63             unread = n - 2;
     64         } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
     65             encoding = "UTF-16LE";
     66             unread = n - 2;
     67         } else {
     68             // Unicode BOM mark not found, unread all bytes
     69             encoding = defaultEnc;
     70             unread = n;
     71         }
     72         // log.info("read=" + n + ", unread=" + unread);
     73 
     74         if (unread > 0)
     75             internalIn.unread(bom, (n - unread), unread);
     76 
     77         isInited = true;
     78     }
     79 
     80     public String getEncoding() {
     81         if (!isInited) {
     82             try {
     83                 initXmlBOM();
     84             } catch (IOException ex) {
     85                 IllegalStateException ise = new IllegalStateException(
     86                         "Init method failed.");
     87                 ise.initCause(ise);
     88                 throw ise;
     89             }
     90         }
     91         return encoding;
     92     }
     93 
     94     public static void removeXmlBomAndComment(String filePath) {
     95         XmlUtil uins = null;
     96         BufferedReader bufr = null;
     97         OutputStreamWriter osw = null;
     98         String enc = "ISO-8859-1";
     99 
    100         String fileContent = "";
    101         String leftBracket = "<!--";
    102         String rightBracket = "-->";
    103         int leftBracketIndex = 0;
    104         int rightBracketIndex = 0;
    105 
    106         String line = "";
    107         StringBuffer fileContentBuffer = new StringBuffer();
    108         try {
    109             // 根据BOM Mark编码方式,对文件进行重新编码
    110             uins = new XmlUtil(new FileInputStream(filePath), enc);
    111             enc = uins.getEncoding();
    112 
    113             if (enc == null) {
    114                 bufr = new BufferedReader(new InputStreamReader(uins));
    115             } else {
    116                 bufr = new BufferedReader(new InputStreamReader(uins, enc));
    117             }
    118 
    119             while ((line = bufr.readLine()) != null) {
    120                 fileContentBuffer.append(line);
    121             }
    122             uins.close();
    123             bufr.close();
    124 
    125             // 删除"<!-- -->"格式的注释
    126             fileContent = fileContentBuffer.toString();
    127             leftBracketIndex = fileContent.indexOf(leftBracket);
    128             rightBracketIndex = fileContent.indexOf(rightBracket);
    129             while (leftBracketIndex < rightBracketIndex
    130                     && rightBracketIndex != 0) {
    131                 fileContent = fileContent.substring(0, leftBracketIndex)
    132                         + fileContent.substring(rightBracketIndex + 3,
    133                                 fileContent.length());
    134                 leftBracketIndex = fileContent.indexOf(leftBracket);
    135                 rightBracketIndex = fileContent.indexOf(rightBracket);
    136             }
    137 
    138             // 将处理过的内容,写入文件
    139             osw = new OutputStreamWriter(new FileOutputStream(filePath));
    140             osw.write(fileContent);
    141             osw.flush();
    142             osw.close();
    143 
    144         } catch (FileNotFoundException e) {
    145             e.printStackTrace();
    146         } catch (UnsupportedEncodingException e) {
    147             e.printStackTrace();
    148         } catch (IOException e) {
    149             e.printStackTrace();
    150         } finally {
    151             if (uins != null) {
    152                 try {
    153                     uins.close();
    154                 } catch (IOException e) {
    155                     e.printStackTrace();
    156                 }
    157             }
    158 
    159             if (bufr != null) {
    160                 try {
    161                     bufr.close();
    162                 } catch (IOException e) {
    163                     e.printStackTrace();
    164                 }
    165             }
    166 
    167             if (osw != null) {
    168                 try {
    169                     osw.close();
    170                 } catch (IOException e) {
    171                     e.printStackTrace();
    172                 }
    173             }
    174         }
    175     }
    176 
    177     /**
    178      * 如果根元素有声明命名空间,通过xpath匹配子元素时,需要特殊处理。
    179      * */
    180     public static List<Element> getNameSpaceElement(Element root, String node) {
    181         // 获得节点的命名空间
    182         HashMap<String, String> map = new HashMap<String, String>();
    183         map.put("mvn", root.getNamespaceURI());
    184         XPath xpath = DocumentHelper.createXPath("//mvn:" + node);
    185         xpath.setNamespaceURIs(map);
    186 
    187         @SuppressWarnings("unchecked")
    188         List<Element> selectedNodes = (List<Element>) xpath.selectNodes(root
    189                 .getDocument());
    190         return selectedNodes;
    191     }
    192 
    193     @Override
    194     public void close() throws IOException {
    195         // init();
    196         isInited = true;
    197         internalIn.close();
    198     }
    199 
    200     @Override
    201     public int read() throws IOException {
    202         // init();
    203         isInited = true;
    204         return internalIn.read();
    205     }
    206 }
  • 相关阅读:
    Oracle创建自增字段方法-ORACLE SEQUENCE的简介
    iOS项目开发实战——使用Xcode6设计自己定义控件与图形
    准备开源用javascript写Tomcat下的WebApp的项目
    Codeforces Round #256 (Div. 2) B. Suffix Structures
    静默方式安装10g数据库软件+升级patch+手工建库
    oracle 数据库开发面试题
    待机异常篇
    HTTP状态码(HTTP Status Code)
    POJ3126——Prime Path
    RHEL7 -- 通过gerp使用正则表达式
  • 原文地址:https://www.cnblogs.com/zhangqingsh/p/2954263.html
Copyright © 2011-2022 走看看