zoukankan      html  css  js  c++  java
  • 代码片段,使用TIKA来解析PDF,WORD和EMAIL

    /**
     * com.jiaoyiping.pdstest.TestTika.java
     * Copyright (c) 2009 Hewlett-Packard Development Company, L.P.
     * All rights reserved.
     */
    package com.jiaoyiping.pdstest;
    
    import java.io.BufferedInputStream;
    import java.io.BufferedOutputStream;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.InputStream;
    import java.io.OutputStream;
    
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.Parser;
    import org.apache.tika.parser.mail.RFC822Parser;
    import org.apache.tika.parser.microsoft.OfficeParser;
    import org.apache.tika.parser.pdf.PDFParser;
    import org.apache.tika.sax.BodyContentHandler;
    import org.junit.Test;
    import org.xml.sax.ContentHandler;
    
    /**
     * <pre>
     * Desc: 
     * @author 焦一平
     * @refactor 焦一平
     * @date   2014年12月4日 下午1:31:09
     * @version 1.0
     * @see  
     * REVISIONS: 
     * Version 	   Date 		    Author 			  Description
     * ------------------------------------------------------------------- 
     * 1.0 		  2014年12月4日 	                              焦一平  	         1. Created this class. 
     * </pre>  
     */
    public class TestTika {
    	
    	//解析PDF
    	@Test
    	public void testPdf() throws Exception{
    		Long start = System.currentTimeMillis();
    		Parser parser = new PDFParser();
    		InputStream is = new BufferedInputStream(new FileInputStream(new File("D:\我的微盘\文档\参考文档\Linux Shell脚本攻略.pdf")));
    		OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\Users\Administrator\Desktop\result.txt")));
    	    Metadata meta = new Metadata();  
    	    meta.add(Metadata.CONTENT_ENCODING, "utf-8");  
            ContentHandler iHandler = new BodyContentHandler(os);  
    	    parser.parse(is, iHandler, meta, new ParseContext());
    	    Long end = System.currentTimeMillis();
    	    Long used = (end-start)/1000;
    	    System.out.println("耗时: "+used+"秒");
    	}
    	//解析Word
    	@Test
    	public void testWrod() throws Exception{
    		Long start = System.currentTimeMillis();
    		Parser parser = new OfficeParser();
    		InputStream is = new BufferedInputStream(new FileInputStream(new File("D:\我的微盘\文档\参考文档\jBPM5_用户指南中文版.doc")));
    		OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\Users\Administrator\Desktop\result.txt")));
    		Metadata meta = new Metadata();  
    	    meta.add(Metadata.CONTENT_ENCODING, "utf-8");  
            ContentHandler iHandler = new BodyContentHandler(os);  
    	    parser.parse(is, iHandler, meta, new ParseContext());
    		
    		Long end = System.currentTimeMillis();
    		Long used = (end-start)/1000;
    		System.out.println("耗时:"+used+"秒");
    	}
    	//解析EMAIL(只能解析标准的eml格式的,不能解析微软的msg格式) 
    	//使用commons-email来进行解析的可以得到收件人、发件人、主题、内容等元数据,TIkA是否支持未尝试
    	@Test
    	public void testEmail() throws Exception{
    		Long start = System.currentTimeMillis();
    		Parser parser = new RFC822Parser();
    		InputStream is = new BufferedInputStream(new FileInputStream(new File("C:\Users\Administrator\Downloads\回复_ RE_ 数据导入工作 - 外部系统枚举与U-Cloud枚举映射.eml")));
    		OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\Users\Administrator\Desktop\result.txt")));
    		Metadata meta = new Metadata();  
    		meta.add(Metadata.CONTENT_ENCODING, "utf-8"); 
    		ContentHandler iHandler = new BodyContentHandler(os);  
    		parser.parse(is, iHandler, meta, new ParseContext());
    		
    		Long end = System.currentTimeMillis();
    		Long used = (end-start)/1000;
    		System.out.println("耗时:"+used+"秒");
    	}
    }
    

      

  • 相关阅读:
    centos7下磁盘空间调整
    centos7下 查看CPU、内存、磁盘的使用情况
    centos7中Spark集群的安装与配置(Hadoop2.6.5+spark2.3.3)
    linux下mysql ---- Host '' is not allowed to connect to this MySQL server
    11-1、多线程
    10-2、对象的序列化和反序列化
    9-1、注解
    8-1、泛型
    7-1、集合
    6-1、异常处理
  • 原文地址:https://www.cnblogs.com/jiaoyiping/p/4150238.html
Copyright © 2011-2022 走看看