zoukankan      html  css  js  c++  java
  • java解析超大xml(1G),一般数据挖掘dblp.xml文件的解析

    在网上找了很多关于解析超大xml的例子,都说java自带的jar包中有相关的SAX解析类可以用来解析xml,但是试了好多次还是不行;dom4j.jar等工具也都不能解析数据量太大的xml,文件大概超过30M,解析就会报错。

    不过后来偶然看到可以借助 xercesImpl.jar、sax2.jar、jaxen-1.1.1.jar 这几个jar包,示例代码如下:

    import java.io.IOException;
    
    import javax.xml.parsers.ParserConfigurationException;
    import javax.xml.parsers.SAXParser;
    import javax.xml.parsers.SAXParserFactory;
    import java.io.File;
    import org.xml.sax.SAXException;
    
    /**
     * Command-line entry point that feeds an XML file to a JAXP SAX parser.
     *
     * <p>The events are delivered to a {@code SAXparse} handler, which is not
     * defined in this file.
     * NOTE(review): presumably {@code SAXparse} extends
     * {@code org.xml.sax.helpers.DefaultHandler} - confirm against its source.
     */
    public class SAX {
    
    	/** File parsed when no path is given on the command line. */
    	private static final String DEFAULT_XML_PATH = "D:\\dblp.xml";
    
    	/**
    	 * Parses one XML file with a namespace-aware, validating SAX parser.
    	 *
    	 * @param args optional; {@code args[0]}, when present, is the path of the
    	 *             XML file to parse, otherwise {@link #DEFAULT_XML_PATH} is used
    	 */
    	public static void main(String[] args) {
    		// Fall back to the historical hard-coded location when no path is supplied.
    		String xmlPath = (args != null && args.length > 0) ? args[0] : DEFAULT_XML_PATH;
    		try {
    			SAXParserFactory factory = SAXParserFactory.newInstance();
    			factory.setNamespaceAware(true);
    			factory.setValidating(true);
    			SAXParser parser = factory.newSAXParser();
    			SAXparse p1 = new SAXparse();
    			parser.parse(new File(xmlPath), p1);
    		} catch (ParserConfigurationException | SAXException | IOException e) {
    			// All three failures are reported the same way: print the trace and end.
    			e.printStackTrace();
    		}
    	}
    
    }
    

      或者

    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.Iterator;
    import java.util.List;
    
    import org.dom4j.Document;
    import org.dom4j.DocumentException;
    import org.dom4j.Element;
    import org.dom4j.io.SAXReader;
    
    /**
     * Reads an XML file with dom4j and exports the text of selected child
     * elements to a text file.
     *
     * <p>The whole document is read into a {@link Document} by the constructor
     * before any element is processed.
     */
    public class XMLParse {
    	private String configName = "dblp_little.xml";
    	private SAXReader saxReader;
    	private Document doc;
    	private Element root;
    
    	/**
    	 * Reads the file named by {@code configName} and keeps its root element.
    	 *
    	 * @throws IllegalStateException if the file cannot be read or parsed; the
    	 *         underlying {@link DocumentException} is attached as the cause
    	 */
    	public XMLParse() {
    		saxReader = new SAXReader();
    		try {
    			doc = saxReader.read(configName);
    		} catch (DocumentException e) {
    			// Fail here with the real cause instead of a NullPointerException below.
    			throw new IllegalStateException("Cannot parse XML file: " + configName, e);
    		}
    		root = doc.getRootElement();
    	}
    
    	/**
    	 * Writes one line per child of the root element to {@code <attribute>.txt}.
    	 *
    	 * <p>Each line holds the comma-separated text of that child's direct
    	 * sub-elements whose name equals {@code attribute}. Children that yield no
    	 * text are skipped. Lines end with {@code "\r\n"}. Progress and a final
    	 * summary are printed to standard output.
    	 *
    	 * <p>If the output file cannot be opened or written, the stack trace is
    	 * printed, the writer is closed, and the method returns without printing
    	 * the success message.
    	 *
    	 * @param attribute name of the sub-elements to export; also used as the
    	 *                  base name of the output file
    	 * @throws IllegalArgumentException if {@code attribute} is {@code null}
    	 */
    	public void getModelElement(String attribute) {
    		if (attribute == null) {
    			throw new IllegalArgumentException("attribute must not be null");
    		}
    		// List<?> accepts both the raw List and the List<Element> that
    		// different dom4j versions return from elements().
    		List<?> records = root.elements();
    		StringBuilder sb = new StringBuilder();
    		int temp = 0;
    		// try-with-resources closes (and thereby flushes) the writer on every path.
    		try (FileWriter fileWriter = new FileWriter(attribute + ".txt")) {
    			for (Object item : records) {
    				Element model = (Element) item;
    				temp++;
    				System.out.println("temp:"+temp);
    				sb.setLength(0);
    				for (Object child : model.elements()) {
    					Element modelEle = (Element) child;
    					if (attribute.equals(modelEle.getName())) {
    						// "> 0" (not "> 1") so a one-character first value
    						// is still followed by a separator.
    						if (sb.length() > 0) {
    							sb.append(",");
    						}
    						sb.append(modelEle.getText());
    					}
    				}
    				// Nothing collected for this record: do not write a line.
    				if (sb.length() > 0) {
    					fileWriter.write(sb.toString());
    					fileWriter.write("\r\n");
    				}
    			}
    		} catch (IOException e) {
    			e.printStackTrace();
    			return;
    		}
    		System.out.println("xml解析成功,并成功写入到"+attribute+".txt 文件中!");
    		System.out.println("list.size:"+records.size());
    	}
    
    	/**
    	 * Exports the text of every {@code author} element to {@code author.txt}.
    	 *
    	 * @param args unused
    	 */
    	public static void main(String[] args) {
    		XMLParse parse = new XMLParse();
    		String attribute = "author";
    		parse.getModelElement(attribute);
    	}
    }
    

      

  • 相关阅读:
    欧拉函数、欧拉定理、费马小定理、拓展欧拉定理
    $CF 635 (Div 2)$
    $CF 634 (Div 3)$
    $CF 633 (Div 2)$
    $ACM$ 课第三次作业-搜索
    《信息安全数学基础一》第一章笔记
    接口测试工具与接口测试框架
    【python】基础知识小结
    【mongo】多个字段进行分组查询
    【mongo】去重操作
  • 原文地址:https://www.cnblogs.com/sinoJay/p/Jay.html
Copyright © 2011-2022 走看看