zoukankan      html  css  js  c++  java
  • java 解析office文件 大全

    原文地址:http://ansjsun.iteye.com/blog/791142

    读取OFFICE文件纯文本

    package org.css.resource.businesssoft.searchengine.quwenjiansuo;
    
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.logging.Level;
    import java.util.logging.Logger;
    
    import org.apache.poi.POITextExtractor;
    import org.apache.poi.POIXMLDocument;
    import org.apache.poi.POIXMLTextExtractor;
    import org.apache.poi.extractor.ExtractorFactory;
    import org.apache.poi.hssf.usermodel.HSSFCell;
    import org.apache.poi.hssf.usermodel.HSSFRow;
    import org.apache.poi.hssf.usermodel.HSSFSheet;
    import org.apache.poi.hssf.usermodel.HSSFWorkbook;
    import org.apache.poi.hwpf.extractor.WordExtractor;
    import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
    import org.apache.poi.openxml4j.opc.OPCPackage;
    import org.apache.poi.ss.usermodel.Cell;
    import org.apache.poi.ss.usermodel.Row;
    import org.apache.poi.ss.usermodel.Sheet;
    import org.apache.poi.ss.usermodel.Workbook;
    import org.apache.poi.xssf.usermodel.XSSFCell;
    import org.apache.poi.xssf.usermodel.XSSFRow;
    import org.apache.poi.xssf.usermodel.XSSFSheet;
    import org.apache.poi.xssf.usermodel.XSSFWorkbook;
    import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
    import org.apache.xmlbeans.XmlException;
    /**
     * 
     * @author lizh
     *
     */
    public class CovertFile {
    
        /**
         * 从word 2003文档中提取纯文本
         * @param is
         * @return
         * @throws IOException
         */
        public static String extractTextFromDOC(InputStream is) throws IOException {
            WordExtractor ex = new WordExtractor(is); // is是WORD文件的InputStream
            return ex.getText();
        }
    
        /**
         * 从word 2007文档中提取纯文本
         * @param fileName
         * @return
         */
        public static String extractTextFromDOC2007(String fileName) {
            try {
                OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);
                POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);
                return ex.getText();
            } catch (Exception e) {
                return "";
            }
        }
    
        /**
         * 从excel 2003文档中提取纯文本
         * @param is
         * @return
         * @throws IOException
         */
        private static String extractTextFromXLS(InputStream is) throws IOException {
            StringBuffer content = new StringBuffer();
            HSSFWorkbook workbook = new HSSFWorkbook(is); // 创建对Excel工作簿文件的引用
    
            for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
                if (null != workbook.getSheetAt(numSheets)) {
                    HSSFSheet aSheet = workbook.getSheetAt(numSheets); // 获得一个sheet
    
                    for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet
                            .getLastRowNum(); rowNumOfSheet++) {
                        if (null != aSheet.getRow(rowNumOfSheet)) {
                            HSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一行
    
                            for (short cellNumOfRow = 0; cellNumOfRow <= aRow
                                    .getLastCellNum(); cellNumOfRow++) {
                                if (null != aRow.getCell(cellNumOfRow)) {
                                    HSSFCell aCell = aRow.getCell(cellNumOfRow); // 获得列值
    
                                    if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                                        content.append(aCell.getNumericCellValue());
                                    } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_BOOLEAN) {
                                        content.append(aCell.getBooleanCellValue());
                                    } else {
                                        content.append(aCell.getStringCellValue());
                                    }
                                }
                            }
                        }
                    }
                }
            }
            return content.toString();
        }
    
        /**
         * 从excel 2007文档中提取纯文本
         * @param fileName
         * @return
         * @throws Exception
         */
        private static String extractTextFromXLS2007(String fileName)
                throws Exception {
            StringBuffer content = new StringBuffer();
    
            // 构造 XSSFWorkbook 对象,strPath 传入文件路径
            XSSFWorkbook xwb = new XSSFWorkbook(fileName);
    
            // 循环工作表Sheet
            for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {
                XSSFSheet xSheet = xwb.getSheetAt(numSheet);
                if (xSheet == null) {
                    continue;
                }
    
                // 循环行Row
                for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {
                    XSSFRow xRow = xSheet.getRow(rowNum);
                    if (xRow == null) {
                        continue;
                    }
    
                    // 循环列Cell
                    for (int cellNum = 0; cellNum <= xRow.getLastCellNum(); cellNum++) {
                        XSSFCell xCell = xRow.getCell(cellNum);
                        if (xCell == null) {
                            continue;
                        }
    
                        if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {
                            content.append(xCell.getBooleanCellValue());
                        } else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {
                            content.append(xCell.getNumericCellValue());
                        } else {
                            content.append(xCell.getStringCellValue());
                        }
                    }
                }
            }
    
            return content.toString();
        }
        
        /**
         * 从excel 2007文档中提取纯文本
         * @param fileName
         * @return
         */
        public static String getXLS2007(String fileName){
            String doc = "";
            try{
                doc = extractTextFromXLS2007(fileName);
                return doc;
            }catch(Exception e){
                return "";
            }
        }
        
        /**
         * 从ppt 2003、2007文档中提取纯文本
         * @param fileName
         * @return
         */
        public static String getPPTX(String fileName){
            String doc = "";
            try{
                File inputFile = new File(fileName);   
                POITextExtractor extractor = ExtractorFactory.createExtractor(inputFile);
                doc = extractor.getText();
                return doc;
            }catch(Exception e){
                return "";
            }
        }
        
        
        public static void main(String[] args) {
            try {
    //            String wordFile = "D:/松山血战.docx";
    //            String wordText2007 = CovertFile.extractTextFromDOC2007(wordFile);
    //            System.out.println("wordText2007=======" + wordText2007);
    //
    //            InputStream is = new FileInputStream("D:/XXX研发中心技术岗位职位需求.xls");
    //            String excelText = CovertFile.extractTextFromXLS(is);
    //            System.out.println("text2003==========" + excelText);
    
    //            String excelFile = "D:/zh.xlsx";
    //            String excelText2007 = CovertFile.extractTextFromXLS2007(excelFile);
    //            System.out.println("excelText2007==========" + excelText2007);
                
                String pptFile = "D:/zz3.ppt";
                String pptx = CovertFile.getPPTX(pptFile);
                System.out.println("pptx==========" + pptx);
    
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    
    }

    最后突然发现,其实只用下面两行代码就能搞定
    Office 2003 到 Office 2007 的各种文档:

    // Fragment, not a complete method: f is presumably the java.io.File to read
    // (getPPTX in CovertFile passes a File to this same factory call).
    POITextExtractor extractor = ExtractorFactory.createExtractor(f);
                return extractor.getText();

    于是我泪流满面....白忙乎了..顺路奉上解析pdf的吧

    package com.lingjoin.extractors;
    
    import java.io.BufferedReader;
    import java.io.FileNotFoundException;
    import java.io.IOException;
    import java.io.StringReader;
    import java.util.Date;
    
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.pdmodel.PDDocumentInformation;
    import org.apache.pdfbox.util.PDFTextStripper;
    import com.lingjoin.paser.LingJoinFile;
    
    /**
     * PDF parser: reads the text of a PDF with PDFBox and stores it on a {@link LingJoinFile}.
     *
     * @author Ansj
     */
    public class PDFExtractor extends AbstractExtractor {

        /** Type flag supplied at construction time and returned by {@link #getTypeFlag()}. */
        private Integer typeFlag = null;

        /**
         * @param typeFlag value later returned by {@link #getTypeFlag()}
         */
        public PDFExtractor(Integer typeFlag) {
            this.typeFlag = typeFlag;
        }

        /**
         * @return the type flag given to the constructor
         */
        public Integer getTypeFlag() {
            return typeFlag;
        }

        /**
         * Extracts the text of the PDF and stores it on the file as a string.
         *
         * @param f the PDF to parse; receives the extracted content
         */
        public void paserFileToString(LingJoinFile f) throws Exception {
            String text = this.getContent(f);
            f.setlContent(text);
        }

        /**
         * Extracts the text of the PDF and stores it on the file as a reader.
         *
         * @param f the PDF to parse; receives a reader over the extracted content
         */
        public void paserFileToReader(LingJoinFile f) throws Exception {
            BufferedReader reader = this.getContentReader(f);
            f.setlContentReader(reader);
        }

        /** Wraps the extracted text of {@code f} in a {@link BufferedReader}. */
        private BufferedReader getContentReader(LingJoinFile f) {
            String text = this.getContent(f);
            return new BufferedReader(new StringReader(text));
        }

        /**
         * Loads the PDF, copies its document information onto {@code f} and returns its text.
         *
         * @return the stripped text, or {@code ""} when an {@link IOException} occurs
         *         (the stack trace is printed and the error is not propagated)
         */
        private String getContent(LingJoinFile f) {
            String text = "";
            PDDocument document = null;
            try {
                document = PDDocument.load(f);
                PDFTextStripper textStripper = new PDFTextStripper();
                // Copy the document metadata onto the file before stripping the text.
                this.setLingJoinFileInfo(f, document.getDocumentInformation());
                text = textStripper.getText(document);
            } catch (IOException e) {
                // Also covers FileNotFoundException, which is an IOException.
                e.printStackTrace();
            } finally {
                if (document != null) {
                    try {
                        document.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
            return text;
        }

        /**
         * Copies document information onto the file. Only the author is copied, and only
         * when it is present; modification date and title are not populated.
         */
        private void setLingJoinFileInfo(LingJoinFile f, PDDocumentInformation info) {
            String author = info.getAuthor();
            if (author != null) {
                f.setlAuthor(author);
            }
        }
    }
  • 相关阅读:
    Android学习小Demo一个显示行线的自定义EditText
    Android中自定义checkbox样式
    android圆角View实现及不同版本这间的兼容
    android下大文件分割上传
    drwtsn32.exe 遇到问题须要关闭。我们对此引起的不便表示抱歉
    【分享】深入浅出WPF全系列教程及源码
    iOS国际化时遇到的错误:read failed: the data couldn't be read because it isn't in the correct format.
    void及void指针含义的深刻解析
    堆和栈的差别(转过无数次的文章)
    sizeof,终极无惑(上)
  • 原文地址:https://www.cnblogs.com/zouhao/p/3237805.html
Copyright © 2011-2022 走看看