zoukankan      html  css  js  c++  java
  • 用Tika读取文件(不需要考虑文件格式)

    使用 Apache Tika 库读取文件内容时,无需关心文件的具体格式;下面是一个简单的封装类。

    package com.geni_sage.gdme.core.dataReader;
    
    import java.io.*;
    import java.util.Arrays;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import org.apache.tika.Tika;
    import org.apache.tika.io.TikaInputStream;
    import org.apache.tika.metadata.Metadata;
    
    /**
     * Reads a file through Apache Tika and exposes the extracted text together
     * with the metadata collected during parsing.
     *
     * <p>The file is parsed once, in the constructor; the getters only return
     * (or post-process) the values captured at that time.
     */
    public class TikaManager {

        /**
         * Matches a single line-feed character. Compiled once and shared, since
         * a compiled {@link Pattern} is immutable and can be reused by every call.
         */
        private static final Pattern LINE_FEED = Pattern.compile("\n");

        /** Metadata object handed to Tika during parsing and filled in by it. */
        private final Metadata metadata;

        /** Text returned by {@code Tika.parseToString} for the file. */
        private final String content;

        /** When {@code true}, {@link #getContent()} strips line feeds from the text. */
        private final boolean isReplaceBlank;

        /**
         * Parses {@code file} with Tika and stores the resulting text and metadata.
         *
         * @param file           the file to read
         * @param isReplaceBlank whether {@link #getContent()} should remove line
         *                       feeds from the extracted text
         * @throws Exception if opening or parsing the file fails
         */
        public TikaManager(File file, boolean isReplaceBlank) throws Exception {
            metadata = new Metadata();

            TikaInputStream stream = TikaInputStream.get(file, metadata);
            try {
                Tika tika = new Tika();
                // Raise the facade's string-length limit to the largest int value;
                // presumably so long documents are not cut short -- see Tika docs.
                tika.setMaxStringLength(Integer.MAX_VALUE);
                content = tika.parseToString(stream, metadata);
            } finally {
                stream.close();
            }
            this.isReplaceBlank = isReplaceBlank;
        }

        /**
         * Returns the extracted text.
         *
         * @return the text as extracted, or, when line-feed removal was requested
         *         at construction time, the text with every {@code "\n"} removed
         *         (an empty string if the extracted text is {@code null})
         */
        public String getContent() {
            if (isReplaceBlank) {
                return replaceBlank(content);
            } else {
                return content;
            }
        }

        /**
         * Returns the metadata object that was passed to Tika during parsing.
         *
         * @return the internal {@code Metadata} instance (not a copy)
         */
        public Metadata getMetadata() {
            return metadata;
        }

        /**
         * Returns the metadata formatted as text, one {@code name: value} line
         * per metadata name, sorted by name.
         *
         * @return the formatted metadata; empty if there are no metadata names
         * @throws Exception declared for compatibility with existing callers
         */
        public String getMetadataString() throws Exception {
            return metadataToString();
        }

        /**
         * Builds the {@code name: value} listing used by
         * {@link #getMetadataString()}.
         */
        private String metadataToString() {
            StringBuilder metadataBuffer = new StringBuilder();

            String[] names = metadata.names();
            Arrays.sort(names);
            for (String name : names) {
                // Metadata.get(name) yields a single value per name; if a name
                // carries several values only that one is printed (TODO confirm
                // whether all values are wanted here).
                metadataBuffer.append(name)
                        .append(": ")
                        .append(metadata.get(name))
                        .append("\n");
            }

            return metadataBuffer.toString();
        }

        /**
         * Removes every line-feed character from {@code str}.
         *
         * <p>Despite the name, only {@code "\n"} is removed; spaces, tabs and
         * carriage returns are left in place.
         *
         * @param str the text to process; may be {@code null}
         * @return {@code str} without line feeds, or an empty string when
         *         {@code str} is {@code null}
         */
        private String replaceBlank(String str) {
            if (str == null) {
                return "";
            }
            return LINE_FEED.matcher(str).replaceAll("");
        }
    }


     

  • 相关阅读:
    [Oracle维护工程师手记]一次升级后运行变慢的分析
    [Oracle运维工程师手记] 如何从trace 文件,判断是否执行了并行
    Top Page
    Python 输出文件内容到网络端口
    [Spark][Streaming]Spark读取网络输入的例子
    [Spark]如何设置使得spark程序不输出 INFO级别的内容
    linux 利器
    C++ 资源大全
    Who is using Asio?
    服务器开发知识要点
  • 原文地址:https://www.cnblogs.com/yuwenfeng/p/3080075.html
Copyright © 2011-2022 走看看