zoukankan      html  css  js  c++  java
  • [置顶] java处理office文档与pdf文件(二)

    该部分主要内容:文件上传,以及office文件和pdf的html处理,以及提取text

    // Upload handling: store the uploaded file (if any) under the configured save directory,
    // derive its HTML and plain-text representations, fill in the Knowledge entity and persist it.
    // When no file was uploaded, the rich-text editor content (htmlArea) is used instead.
    		File file = this.getFile();
    		String url = "";
    		String tempFile = "";
    		String fileFolder = "";	// directory the uploaded file is stored in
    		String hz = "";	// file extension including the leading dot, e.g. ".doc"
    		String oldOrgFileId = null;
    		Long oldId = knowledge.getZsk_zskID();
    		// A missing or zero id means the entry is being created rather than edited.
    		boolean isNew = (null == oldId || 0 == oldId);
    		if(!isNew){
    			oldOrgFileId = knowledge.getOrgFileId();
    		}
    		
    		if(null != file){
    			// Extract the extension; a name without a dot yields "" instead of making substring throw.
    			int dotIndex = fileFileName.lastIndexOf(".");
    			hz = dotIndex >= 0 ? fileFileName.substring(dotIndex) : "";
    			String zskCode = knowledge.getZsk_Code();
    			fileFolder = createNewFile(this.savePath,zskCode);
    			// Full path of the uploaded file on the server.
    			url = fileFolder + File.separator + fileFileName;
    			// 1. Store the uploaded file.
    			FileUtils.copyFile(file, new File(url));
    			
    			// 2. Build the HTML representation.
    			tempFile = createNewFile(this.tempPath,zskCode);
    			String htmlStr = "";
    			// Case-insensitive, like changeDocToHtml/findDocContent, so "X.PDF" is not sent to the Office converter.
    			if(".pdf".equalsIgnoreCase(hz)){
    				// The file name comes from the client; escape it before placing it in an HTML attribute.
    				String safeName = fileFileName.replace("&", "&amp;").replace("'", "&#39;")
    						.replace("<", "&lt;").replace(">", "&gt;");
    				htmlStr = "<html><body>" +
    				"<embed src='"+safeName+"' width='100%' height='100%'></embed>" +
    				"</body></html>";
    			}else{
    				String dstHtml = tempFile + File.separator + zskCode + ".html";
    				// Remove every file and sub-folder left in the temp folder by an earlier conversion.
    				FileUtil.deleteChildFile(new File(tempFile));
    				
    				changeDocToHtml(hz, url, dstHtml);
    				htmlStr = FileUtil.htmlToStr(dstHtml);
    			}
    			knowledge.setContentHtml(htmlStr);
    			Clob htmlColb=Hibernate.createClob(htmlStr);
    			knowledge.setZsk_Description(htmlColb);
    			
    			// 3. Extract the plain text of the uploaded file.
    			String docContent = findDocContent(hz, url);
    			if(null == docContent){
    				// findDocContent returns null for unsupported types; a CLOB cannot be built from null.
    				docContent = "";
    			}
    			knowledge.setContentText(docContent);
    			Clob docContentClob=Hibernate.createClob(docContent);
    			knowledge.setZsk_Text(docContentClob);
    			
    			String orgFileId = new GUID().toString();	// identifier of the original file of this knowledge entry
    			knowledge.setOrgFileId(orgFileId);
    			knowledge.setZsk_ContentType(1);
    		}else{
    			String html = (null == htmlArea) ? "" : htmlArea;
    			// Strip tags once and use the result for both the CLOB and the transient text field,
    			// so that an update (which writes contentText into the text CLOB) stores the same value as an insert.
    			String plainText = html.replaceAll("</?[^>]+>", "");
    			knowledge.setZsk_Description(Hibernate.createClob(html));
    			knowledge.setContentHtml(html);
    			knowledge.setZsk_Text(Hibernate.createClob(plainText));
    			knowledge.setContentText(plainText);
    			knowledge.setZsk_ContentType(2);
    		}
    		
    		// Fields that are only set when the entry is created.
    		if(isNew){
    			// TODO: revisit how the current user is resolved.
    			if(null == knowledge.getZsk_Author() || "".equals(knowledge.getZsk_Author())){
    				knowledge.setZsk_Author(SessionUtil.getTSysAgent().getCagentname());
    			}
    			knowledge.setZsk_RegisterTime(new Date());
    		}
    		// TODO: the last-modifier id is still hard-coded.
    		knowledge.setZsk_LastMender(1L);
    		knowledge.setZsk_ModifyTime(new Date());
    		
    		// Upload details handed to the service alongside the entity.
    		KnowLedgeOtherContion ko = new KnowLedgeOtherContion();
    		ko.setFileContentType(fileContentType);
    		ko.setFileFileName(fileFileName);
    		ko.setOldId(oldId);
    		ko.setTempFile(tempFile);
    		ko.setUrl(url);
    		ko.setOldOrgFileId(oldOrgFileId);
    		
    		knowUploadServiceImp.saveOrUpdateKnowledge(knowledge,ko);

     将office转化为html

    /**
    	 * Converts an uploaded Word, Excel or PowerPoint file to HTML through {@code DocToHtml}.
    	 * PDF files and unrecognised extensions are left untouched. The boolean results of the
    	 * converters are not inspected here.
    	 *
    	 * @param hz      file extension including the leading dot (for example ".doc"); matched case-insensitively
    	 * @param url     full path of the source document
    	 * @param dstHtml full path of the HTML file to produce
    	 */
    	private void changeDocToHtml(String hz, String url, String dstHtml) {
    		if(".pdf".equalsIgnoreCase(hz)){
    			// Nothing to convert for PDF. The extension carries a leading dot,
    			// so the comparison must include it to ever match.
    			return;
    		}
    		if(".xls".equalsIgnoreCase(hz) || ".xlsx".equalsIgnoreCase(hz)){
    			DocToHtml.getInstance().ExceltoHtml(url,dstHtml);
    		}else if(".doc".equalsIgnoreCase(hz) || ".docx".equalsIgnoreCase(hz)){
    			DocToHtml.getInstance().WordtoHtml(url,dstHtml);
    		}else if(".ppt".equalsIgnoreCase(hz) || ".pptx".equalsIgnoreCase(hz)){
    			DocToHtml.getInstance().PPTtoHtml(url, dstHtml);
    		}
    	}

     将word,excel,ppt另存为html的方法

    /**
    	 * Saves a Word document as HTML by automating Word through JACOB.
    	 *
    	 * @param srcFile full path of the Word document to open
    	 * @param dstFile full path of the HTML file to write
    	 * @return {@code true} when the document was saved and closed without error, {@code false} otherwise
    	 */
    	public boolean WordtoHtml(String srcFile, String dstFile) {
    		ComThread.InitSTA();
    		ActiveXComponent wordApp = null;
    		boolean converted = false;
    		try {
    			// Created inside the try so that a failure to start Word still releases the COM thread.
    			wordApp = new ActiveXComponent("Word.Application");
    			wordApp.setProperty("Visible", new Variant(false));
    			Dispatch documents = wordApp.getProperty("Documents").toDispatch();
    			Dispatch document = Dispatch.invoke(documents, "Open", 1,
    					new Object[] { srcFile, new Variant(false), new Variant(true) },
    					new int[1]).toDispatch();
    			try {
    				// 8 is the save-format code for HTML (wdFormatHTML in Word's WdSaveFormat enumeration).
    				Dispatch.invoke(document, "SaveAs", 1, new Object[] { dstFile, new Variant(8) }, new int[1]);
    			} finally {
    				// Close the document even when SaveAs failed, so it is not left open in Word.
    				Dispatch.call(document, "Close", new Variant(false));
    			}
    			converted = true;
    		} catch (Exception exception) {
    			// Pass the exception itself so the stack trace is kept in the log.
    			log.error("word转化为html出错-->"+exception.getMessage(), exception);
    		} finally {
    			try {
    				if (wordApp != null) {
    					wordApp.invoke("Quit", new Variant[0]);
    				}
    			} catch (Exception quitFailure) {
    				log.error("failed to quit Word.Application", quitFailure);
    			} finally {
    				// Always release the COM thread, even if Quit threw.
    				ComThread.Release();
    				ComThread.quitMainSTA();
    			}
    		}
    		return converted;
    	}
    
    	/**
    	 * Saves a PowerPoint presentation as HTML by automating PowerPoint through JACOB.
    	 *
    	 * @param srcFile full path of the presentation to open
    	 * @param dstFile full path of the HTML file to write
    	 * @return {@code true} when the presentation was saved and closed without error, {@code false} otherwise
    	 */
    	public boolean PPTtoHtml(String srcFile, String dstFile) {
    		ComThread.InitSTA();
    		ActiveXComponent powerPointApp = null;
    		boolean converted = false;
    		try {
    			// Created inside the try so that a failure to start PowerPoint still releases the COM thread.
    			powerPointApp = new ActiveXComponent("PowerPoint.Application");
    			Dispatch presentations = powerPointApp.getProperty("Presentations").toDispatch();
    			Dispatch presentation = Dispatch.call(presentations, "Open", srcFile,
    					new Variant(-1), new Variant(-1), new Variant(0))
    					.toDispatch();
    			try {
    				// 12 is the save-format code for HTML (ppSaveAsHTML in PowerPoint's PpSaveAsFileType enumeration).
    				Dispatch.call(presentation, "SaveAs", dstFile, new Variant(12));
    			} finally {
    				// Close the presentation even when SaveAs failed.
    				Dispatch.call(presentation, "Close");
    			}
    			converted = true;
    		} catch (Exception exception) {
    			// Pass the exception itself so the stack trace is kept in the log.
    			log.error("ppt转化为html出错-->"+exception.getMessage(), exception);
    		} finally {
    			try {
    				if (powerPointApp != null) {
    					powerPointApp.invoke("Quit", new Variant[0]);
    				}
    			} catch (Exception quitFailure) {
    				log.error("failed to quit PowerPoint.Application", quitFailure);
    			} finally {
    				// Always release the COM thread, even if Quit threw.
    				ComThread.Release();
    				ComThread.quitMainSTA();
    			}
    		}
    		return converted;
    	}
    
    	/**
    	 * Saves an Excel workbook as HTML by automating Excel through JACOB.
    	 *
    	 * @param s  full path of the workbook to open
    	 * @param s1 full path of the HTML file to write
    	 * @return {@code true} when the workbook was saved and closed without error, {@code false} otherwise
    	 */
    	public boolean ExceltoHtml(String s, String s1) {
    		ComThread.InitSTA();
    		ActiveXComponent excelApp = null;
    		boolean converted = false;
    		try {
    			// Created inside the try so that a failure to start Excel still releases the COM thread.
    			excelApp = new ActiveXComponent("Excel.Application");
    			excelApp.setProperty("Visible", new Variant(false));
    			Dispatch workbooks = excelApp.getProperty("Workbooks").toDispatch();
    			Dispatch workbook = Dispatch.invoke(workbooks, "Open", 1, new Object[] {
    					s, new Variant(false), new Variant(true)}, new int[1]).toDispatch();
    			try {
    				// 44 is the save-format code for HTML (xlHtml in Excel's XlFileFormat enumeration).
    				Dispatch.call(workbook, "SaveAs", s1, new Variant(44));
    			} finally {
    				// Close the workbook even when SaveAs failed.
    				Dispatch.call(workbook, "Close", new Variant(false));
    			}
    			converted = true;
    		} catch (Exception exception) {
    			// Pass the exception itself so the stack trace is kept in the log.
    			log.error("excel转化为html出错-->"+exception.getMessage(), exception);
    		} finally {
    			try {
    				if (excelApp != null) {
    					excelApp.invoke("Quit", new Variant[0]);
    				}
    			} catch (Exception quitFailure) {
    				log.error("failed to quit Excel.Application", quitFailure);
    			} finally {
    				// Always release the COM thread, even if Quit threw.
    				ComThread.Release();
    				ComThread.quitMainSTA();
    			}
    		}
    		return converted;
    	}

     获取office文件以及pdf的文本内容

    /**
    	 * Extracts the plain text of an uploaded document by dispatching on its extension.
    	 *
    	 * @param hz  file extension including the leading dot; matched case-insensitively
    	 * @param url full path of the document
    	 * @return whatever the matching {@code GetDocText} extractor returns, or {@code null}
    	 *         when the extension is not pdf/xls/xlsx/doc/docx/ppt/pptx
    	 */
    	private String findDocContent(String hz, String url) {
    		File source = new File(url);
    		if (".pdf".equalsIgnoreCase(hz)) {
    			return GetDocText.getDocTextInta().getTextFromPdf(source);
    		}
    		if (".xls".equalsIgnoreCase(hz) || ".xlsx".equalsIgnoreCase(hz)) {
    			return GetDocText.getDocTextInta().getTextFromExcel(source);
    		}
    		if (".doc".equalsIgnoreCase(hz) || ".docx".equalsIgnoreCase(hz)) {
    			return GetDocText.getDocTextInta().getTextFromWord(source);
    		}
    		if (".ppt".equalsIgnoreCase(hz) || ".pptx".equalsIgnoreCase(hz)) {
    			return GetDocText.getDocTextInta().getTextFromPPT(source);
    		}
    		// Unsupported extension: no extractor applies.
    		return null;
    	}

     具体的实现方法

    /**
    	 * Extracts the text of a Word file. A ".doc" extension (any case) is read with
    	 * {@code WordExtractor}; every other extension is opened as an OOXML package and read
    	 * with {@code XWPFWordExtractor}.
    	 * 
    	 * @param wordFile the Word file to read
    	 * @return the text of the document, or an empty string when extraction fails
    	 */
    	public String getTextFromWord(File wordFile) {
    		String wordText = "";
    		InputStream is = null;
    		OPCPackage opcPackage = null;
    		try {
    			String fileName = wordFile.getName();
    			String hz = fileName.substring(fileName.lastIndexOf("."));
    			// Case-insensitive so that "REPORT.DOC" is not mistaken for an OOXML file.
    			if(".doc".equalsIgnoreCase(hz)){
    				// Word 2003 format: pictures are not read.
    				is = new FileInputStream(wordFile);
    				wordText = new WordExtractor(is).getText();
    			}else{
    				opcPackage = POIXMLDocument.openPackage(wordFile.getAbsolutePath());
    				wordText = new XWPFWordExtractor(opcPackage).getText();
    			}
    		} catch (Exception e) {
    			e.printStackTrace();
    		} finally {
    			if(opcPackage != null){
    				// revert() releases the package without writing anything back to the file.
    				opcPackage.revert();
    			}
    			if(is != null){
    				try {
    					is.close();
    				} catch (IOException e) {
    					e.printStackTrace();
    				}
    			}
    		}
    		return wordText;
    	}
    
    	/**
    	 * Extracts the text of an Excel file. A ".xls" extension (any case) is read as an
    	 * {@code HSSFWorkbook}; every other extension is read as an {@code XSSFWorkbook}.
    	 * Sheet names and cell comments are included, and formula cells contribute their
    	 * results rather than the formula text.
    	 * 
    	 * @param excelFile the Excel file to read
    	 * @return the text of the workbook, or an empty string when the file cannot be read
    	 */
    	public String getTextFromExcel(File excelFile) {
    		String text = "";
    		InputStream in = null;
    		try {
    			in = new FileInputStream(excelFile);
    			String fileName = excelFile.getName();
    			String hz = fileName.substring(fileName.lastIndexOf("."));
    			
    			ExcelExtractor extractor;
    			// Case-insensitive so that "DATA.XLS" is not handed to the OOXML parser.
    			if(".xls".equalsIgnoreCase(hz)){
    				// Excel 2003 (binary) format.
    				extractor = new org.apache.poi.hssf.extractor.ExcelExtractor(
    						new HSSFWorkbook(new POIFSFileSystem(in)));
    			}else{
    				// Excel 2007 (OOXML) format.
    				extractor = new XSSFExcelExtractor(new XSSFWorkbook(in));
    			}
    			
    			extractor.setFormulasNotResults(false);
    			extractor.setIncludeSheetNames(true);
    			extractor.setIncludeCellComments(true);
    			text = extractor.getText();
    		} catch (FileNotFoundException e) {
    			e.printStackTrace();
    		} catch (IOException e) {
    			e.printStackTrace();
    		}finally{
    			if(in != null){
    				try {
    					in.close();
    				} catch (IOException e) {
    					e.printStackTrace();
    				}
    			}
    		}
    		
    		return text;
    	}
    	/**
    	 * Extracts the text of a PowerPoint file. A ".ppt" extension (any case) is read with
    	 * {@code QuickButCruddyTextExtractor}; every other extension is opened as an OOXML
    	 * package and read with {@code XSLFPowerPointExtractor}.
    	 * 
    	 * @param pptFile the PowerPoint file to read
    	 * @return the text of the presentation, or {@code null} when extraction fails
    	 */
    	public String getTextFromPPT(File pptFile){
    		String pptText = null;
    		FileInputStream fin = null;
    		OPCPackage opcPackage = null;
    		try {
    			// Opened up front for both formats so that a missing file is reported as FileNotFoundException.
    			fin = new FileInputStream(pptFile);
    			String fileName = pptFile.getName();
    			String hz = fileName.substring(fileName.lastIndexOf("."));
    			// Case-insensitive so that "SLIDES.PPT" is not mistaken for an OOXML file.
    			if(".ppt".equalsIgnoreCase(hz)){
    				pptText = new QuickButCruddyTextExtractor(fin).getTextAsString();
    			}else{
    				opcPackage = POIXMLDocument.openPackage(pptFile.getAbsolutePath());
    				pptText = new XSLFPowerPointExtractor(opcPackage).getText();
    			}
    		} catch (FileNotFoundException e) {
    			e.printStackTrace();
    		} catch (IOException e) {
    			e.printStackTrace();
    		} catch (XmlException e) {
    			e.printStackTrace();
    		} catch (OpenXML4JException e) {
    			e.printStackTrace();
    		}finally{
    			if(null != opcPackage){
    				// revert() releases the package without writing anything back to the file.
    				opcPackage.revert();
    			}
    			if(null != fin){
    				try {
    					fin.close();
    				} catch (IOException e) {
    					e.printStackTrace();
    				}
    			}
    		}
    		return pptText;
    	}
    	/**
    	 * Reads a PDF file with {@code PDFParser} and returns the text produced by
    	 * {@code PDFTextStripper}.
    	 * 
    	 * @param pdfFile the PDF file to read
    	 * @return the extracted text, or {@code null} when the file cannot be opened or parsed
    	 */
    	public String getTextFromPdf(File pdfFile){
    		String extracted = null;
    		FileInputStream source = null;
    		PDDocument pdf = null;
    		try {
    			source = new FileInputStream(pdfFile);
    			PDFParser pdfParser = new PDFParser(source);
    			pdfParser.parse();
    			pdf = pdfParser.getPDDocument();
    			extracted = new PDFTextStripper().getText(pdf);
    		} catch (IOException e) {
    			// Also covers FileNotFoundException, which is an IOException.
    			e.printStackTrace();
    		} finally {
    			// Close the stream first, then the document; a failure in one does not skip the other.
    			if (source != null) {
    				try {
    					source.close();
    				} catch (IOException e) {
    					e.printStackTrace();
    				}
    			}
    			if (pdf != null) {
    				try {
    					pdf.close();
    				} catch (IOException ex) {
    					ex.printStackTrace();
    				}
    			}
    		}
    		return extracted;
    	}
    	/**
    	 * 
    	 * @param txtFile
    	 * @return  返回txt的内容
    	 */
    	public String getTextFromTxt(File txtFile){
    		FileReader fr;
    		StringBuffer buff = new StringBuffer();
    		try {
    			fr = new FileReader(txtFile);
    			BufferedReader br = new BufferedReader(fr);
    			String temp = null;
    			while((temp = br.readLine()) != null){
    				buff.append(temp + "
    "); 
    			}
    			br.close();
    		} catch (FileNotFoundException e) {
    			e.printStackTrace();
    		} catch (IOException e) {
    			e.printStackTrace();
    		}
    		
    		return buff.toString();
    	}

     对带有clob字段的实体执行save时,直接调用hibernate的save即可(使用的Oracle驱动为ojdbc14.jar)。

    更新时的处理  如下:

    /**
     * Updates a Knowledge entity whose description and text are stored in CLOB columns.
     * Both CLOBs are first overwritten with a single-space placeholder and flushed, the entity
     * is then refreshed with LockMode.UPGRADE, and the real content (getContentHtml() and
     * getContentText()) is written through the wrapped CLOBs' character output streams.
     *
     * NOTE(review): SQLException and IOException are only printed, so a failed write leaves
     * the placeholder in place without notifying the caller - confirm this is intended.
     * NOTE(review): the writers are not closed when write() throws.
     *
     * @param knowledge the entity to update; its contentHtml and contentText are written to the CLOBs
     */
    public void updateKnowledge(Knowledge knowledge) {
    		 try {
    		    // Replace both CLOBs with a one-character placeholder and push that to the database.
    		    knowledge.setZsk_Description(Hibernate.createClob(" "));
    		    knowledge.setZsk_Text(Hibernate.createClob(" "));
    		    update(knowledge);
    		    flush();
    		    
    		    // Reload the entity with an upgrade lock; presumably required so the CLOB locators are writable - TODO confirm.
    		    getSession().refresh(knowledge, LockMode.UPGRADE);
    		    
    		    // Unwrap Hibernate's SerializableClob to reach the underlying driver CLOB
    		    // (CLOB is presumably oracle.sql.CLOB - verify against the imports).
    		    SerializableClob htmlSc=(SerializableClob)knowledge.getZsk_Description();
    		    SerializableClob textSc=(SerializableClob)knowledge.getZsk_Text();
    		    Clob htmlWrapclob=htmlSc.getWrappedClob();
    		    Clob textWrapclob=textSc.getWrappedClob();
    		    CLOB htmlClob2=(CLOB)htmlWrapclob;
    		    CLOB textClob2=(CLOB)textWrapclob;
    		    // Write the HTML content into the description CLOB.
    		    Writer htmlWriter=htmlClob2.getCharacterOutputStream();
    		    htmlWriter.write(knowledge.getContentHtml());
    		    htmlWriter.close();
    		    
    		    // Write the plain-text content into the text CLOB.
    		    Writer textWriter=textClob2.getCharacterOutputStream();
    		    textWriter.write(knowledge.getContentText());
    		    textWriter.close();
    		    
    		    update(knowledge);
    		  } catch (RuntimeException re) {
    		    // Runtime failures are passed on to the caller unchanged.
    		    throw re;
    		  } catch (SQLException e) {
    		    e.printStackTrace();
    		  } catch (IOException e) {
    		    e.printStackTrace();
    		  }
    	}

     上面几步做完,基本可以完成上传以及存入数据库,以及对带有clob文件的更新。

    需要的环境:windows、jacob-1.17-M2-x64(jacob的下载和配置请参照网络资料)、poi-3.9。

  • 相关阅读:
    Java实现 蓝桥杯VIP 算法提高 交换Easy
    Java实现 蓝桥杯VIP 算法提高 多项式输出
    Java实现 蓝桥杯VIP 算法提高 多项式输出
    Java实现 蓝桥杯VIP 算法提高 多项式输出
    Java实现 蓝桥杯VIP 算法提高 多项式输出
    Java实现 蓝桥杯VIP 算法提高 多项式输出
    Java实现 蓝桥杯VIP 算法训练 矩阵乘方
    QT中给各控件增加背景图片(可缩放可旋转)的几种方法
    回调函数实现类似QT中信号机制
    std::string的Copy-on-Write:不如想象中美好(VC不使用这种方式,而使用对小字符串更友好的SSO实现)
  • 原文地址:https://www.cnblogs.com/pigga/p/10098227.html
Copyright © 2011-2022 走看看