zoukankan      html  css  js  c++  java
  • Java解析excel文件大数据量时,报OOM异常问题。

    在解析EXCEL,JAVA 通常使用poi包下的 XSSFWorkbook 对象,但是,遇到海量数据(比如十几万条 或者 JVM堆设置了内存比较小时),就会抛出OOM异常,下面就放出解决方法。

    直接上代码:

    package com.xxx.xxx.xxx.common.util;
    
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
    import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
    import org.apache.poi.openxml4j.opc.OPCPackage;
    import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
    import org.apache.poi.xssf.eventusermodel.XSSFReader;
    import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
    import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
    import org.apache.poi.xssf.model.StylesTable;
    import org.apache.poi.xssf.usermodel.XSSFComment;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    import org.springframework.stereotype.Component;
    import org.xml.sax.InputSource;
    import org.xml.sax.SAXException;
    import org.xml.sax.XMLReader;
    import org.xml.sax.helpers.XMLReaderFactory;
    
    /**
     * @author Jimmy Shan
     * @date 2021-04-09
     * @desc 解析大数据量excel文件,避免OOM发生
     */
    @Component
    public class ExcelParserBigData {
        private static final Logger logger = LoggerFactory.getLogger(ExcelParserBigData.class);
        private ISheetContentHandler contentHandler = new DefaultSheetHandler(); //表格默认处理器
        private List<String[]> datas = new ArrayList<String[]>(); //读取数据
    
        /**
         * @desc 转换表格,默认为转换第一个表格
         */
        public ExcelParserBigData parse(InputStream stream)
                throws InvalidFormatException, IOException, ParseException {
            return parse(stream, 1);
        }
    
        /**
         * @desc 解析方法
         */
        public synchronized ExcelParserBigData parse(InputStream stream, int sheetId)
                throws InvalidFormatException, IOException, ParseException {
            // 每次转换前都清空数据
            datas.clear();
            // 打开表格文件输入流
            OPCPackage pkg = OPCPackage.open(stream);
            try {
                // 创建表阅读器
                XSSFReader reader;
                try {
                    reader = new XSSFReader(pkg);
                } catch (OpenXML4JException e) {
                    logger.error("读取表格出错");
                    throw new ParseException(e.fillInStackTrace());
                }
    
                // 转换指定单元表
                InputStream shellStream = reader.getSheet("rId" + sheetId);
                try {
                    InputSource sheetSource = new InputSource(shellStream);
                    StylesTable styles = reader.getStylesTable();
                    ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(pkg);
                    getContentHandler().init(datas);// 设置读取出的数据
                    // 获取转换器
                    XMLReader parser = getSheetParser(styles, strings);
                    parser.parse(sheetSource);
                } catch (SAXException e) {
                    logger.error("读取表格出错");
                    throw new ParseException(e.fillInStackTrace());
                } finally {
                    shellStream.close();
                }
            } finally {
                pkg.close();
            }
    
            return this;
        }
    
        /**
         * @desc 获取表格读取数据,获取数据前,需要先转换数据,此方法不会获取第一行数据,表格读取数据
         */
        public List<String[]> getDatas() {
            return getDatas(true);
        }
    
        /**
         * @desc 获取表格读取数据,获取数据前,需要先转换数据
         */
        public List<String[]> getDatas(boolean dropFirstRow) {
            if (dropFirstRow && datas.size() > 0) {
                datas.remove(0);// 删除表头
            }
    
            return datas;
        }
    
        /**
         * @desc 获取读取表格的转换器
         */
        protected XMLReader getSheetParser(StylesTable styles, ReadOnlySharedStringsTable strings) throws SAXException {
            XMLReader parser = XMLReaderFactory.createXMLReader();
            parser.setContentHandler(new XSSFSheetXMLHandler(styles, strings, getContentHandler(), false));
    
            return parser;
        }
    
        public ISheetContentHandler getContentHandler() {
            return contentHandler;
        }
    
        public void setContentHandler(ISheetContentHandler contentHandler) {
            this.contentHandler = contentHandler;
        }
    
        /**
         * @desc 表格转换错误
         */
        public class ParseException extends Exception {
            private static final long serialVersionUID = -2451526411018517607L;
    
            public ParseException(Throwable t) {
                super("表格转换错误", t);
            }
        }
    
        public interface ISheetContentHandler extends SheetContentsHandler {
            /**
             * @desc 设置转换后的数据集,用于存放转换结果
             */
            void init(List<String[]> datas);
        }
    
        /**
         * @desc 默认表格解析handder
         */
        class DefaultSheetHandler implements ISheetContentHandler {
            /**
             * @desc 读取数据
             */
            private List<String[]> datas;
            private int columsLength;
            private String[] readRow;
            private ArrayList<String> fristRow = new ArrayList<String>();
    
            @Override
            public void init(List<String[]> datas) {
                this.datas = datas;
                //this.columsLength = columsLength;
            }
    
            @Override
            public void startRow(int rowNum) {
                if (rowNum != 0) {
                    readRow = new String[columsLength];
                }
            }
    
            @Override
            public void endRow(int rowNum) {
                //将Excel第一行表头的列数当做数组的长度,要保证后续的行的列数不能超过这个长度,这是个约定。
                if (rowNum == 0) {
                    columsLength = fristRow.size();
                    readRow = fristRow.toArray(new String[fristRow.size()]);
                }else {
                    readRow = fristRow.toArray(new String[columsLength]);
                }
                datas.add(readRow.clone());
                readRow = null;
                fristRow.clear();
            }
    
            @Override
            public void cell(String cellReference, String formattedValue, XSSFComment comment) {
                int index = getCellIndex(cellReference);//转换A1,B1,C1等表格位置为真实索引位置
                try {
                    fristRow.set(index, formattedValue);
                } catch (IndexOutOfBoundsException e) {
                    int size = fristRow.size();
                    for (int i = index - size+1;i>0;i--){
                        fristRow.add(null);
                    }
                    fristRow.set(index,formattedValue);
                }
            }
    
            @Override
            public void headerFooter(String text, boolean isHeader, String tagName) {
            }
    
            /**
             * @desc 转换表格引用为列编号
             */
            public int getCellIndex(String cellReference) {
                String ref = cellReference.replaceAll("\d+", "");
                int num = 0;
                int result = 0;
                for (int i = 0; i < ref.length(); i++) {
                    char ch = cellReference.charAt(ref.length() - i - 1);
                    num = (int) (ch - 'A' + 1);
                    num *= Math.pow(26, i);
                    result += num;
                }
                return result - 1;
            }
        }
    }
    

    调用方:

    package com.xxx.xxx.xxx.service.impl;
    
    import com.xxx.xxx.xxx.common.util.ExcelParserBigData;
    import com.xxx.xxx.xxx.service.DemoBaseService;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.stereotype.Service;
    import java.io.FileInputStream;
    import java.util.List;
    
    /**
     * @author Jimmy Shan
     * @date 2021-04-07
     * @desc 示例 服务实现
     */
    @Service("demoBaseService")
    public class DemoBaseServiceImpl implements DemoBaseService {
        private static final Logger LOGGER = LoggerFactory.getLogger(DemoBaseServiceImpl.class);
        @Autowired
        private ExcelParserBigData excelParserBigData;
    
        /**
         * @desc 解析大数据量excel文件
         */
        @Override
        public void parseBigXlsx(String upPathName) {
            long start = System.currentTimeMillis();
            FileInputStream inf = null;
            try {
                inf = new FileInputStream(upPathName);
                ExcelParserBigData parseBigData = excelParserBigData.parse(inf);
                List<String[]> dataList = parseBigData.getDatas(); //这里的List泛型是String数组,解析后的数据是以 数组形式存放的。
                for (int i = 0; i < dataList.size(); i++) {
                    String[] str = dataList.get(i);
                    LOGGER.info("第 {} 行", (i +1));
                    LOGGER.info("oppoId = {}, oppoName = {}", str[0], str[1]);
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                try {
                    if(inf != null) {
                        inf.close();
                    }
                    LOGGER.info("解析总耗时:{} 毫秒", (System.currentTimeMillis() - start));
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
    

    接口定义:

    package com.xxx.xxx.xxx.service;
    
    /**
     * @author Jimmy Shan
     * @date 2021-04-07
     * @desc 示例 服务
     */
    public interface DemoBaseService {
        /**
         * @desc 解析大数据量excel文件
         */
        void parseBigXlsx(String upPathName);
    }
    

    OK,记录到此。

  • 相关阅读:
    django 定时任务 django-crontab 的使用
    Django中更新多个对象数据与删除对象的方法
    Django复制记录的方法
    Python中关于日期的计算总结
    django中添加日志功能
    Python 日期时间datetime 加一天,减一天,加减一小时一分钟,加减一年
    根据后端传的时间前端js进行倒计时
    输入pip命令报错:from pip import main ImportError: cannot import name 'main'
    操作uwsgi命令
    关于linux下安装mysqlclient报 Failed building wheel for mysqlclient问题
  • 原文地址:https://www.cnblogs.com/jimmyshan-study/p/14638700.html
Copyright © 2011-2022 走看看