zoukankan      html  css  js  c++  java
  • Atitit pdf转文本 pdf2txt v4 t83.docx Atitit pdf转文本 pdfutil 目录 1.1. Pdfbox cmd 模式 TextToPDF 1 1.2. Pdf

    Atitit pdf转文本 pdf2txt v4 t83.docx

    Atitit pdf转文本 pdfutil

     

     

    目录

    1.1. Pdfbox cmd 模式 TextToPDF 1

    1.2. Pdf util code 2

    1.3. Pdf api模式 5

    2. ref 6

     

     

    import org.apache.poi.hslf.extractor.PowerPointExtractor;

    poi工具

    public static String readPPT(String f) {

    PowerPointExtractor extractor;

    try {

    extractor = new PowerPointExtractor(new FileInputStream(new File(f)));

    return extractor.getText();

    } catch (IOException e) {

    ExUtilV2t33.throwExV2(e);

    }

    return null;

     

    }

     

     

     java -jar C:\Users\attilax\Pictures\pdfbox-app-2.0.9.jar  ExtractText   "C:\atibeks517\l4 doc v3 r7a ori exted\_0index\一种简单的基于字符形状的验证码识别技术.pdf"   c:\logs\识别技术.pdf.txt

     

     

    转html

     

    -console

    false

    Send text to console instead of file.

    -html

    false

    Output in HTML format instead of raw text.

     

     

      1.1. Pdfbox cmd 模式 TextToPDF

    This application will create a PDF document from a text file.

    Usage: java -jar pdfbox-app-2.y.z.jar TextToPDF [OPTIONS] <outputfile> <textfile>

    Command-Line Parameter

    Default

    Description

    -standardFont

    Helvetica

    The font to use for the text. Either this or -ttf should be specified but not both.

    -ttf

     

    The TTF font to use for the text. Either this or -standardFont should be specified but not both.

    java——PDF转换txt - 乞彦 - 博客园.html

     

     

     java -jar C:\Users\attilax\Pictures\pdfbox-app-2.0.9.jar  ExtractText -console  "C:\atibeks517\l4 doc v3 r7a ori exted\_0index\一种简单的基于字符形状的验证码识别技术.pdf"   c:\logs\识别技术.pdf.txt

     

     

      1.2. Pdf util code

    /FulltxtLucenePrj/src/com/attilax/archive/pdfutilV3t88.java

     

    pdfutilV3t88.java

     

    import java.io.ByteArrayOutputStream;

    import java.io.File;

    import java.io.IOException;

    import java.nio.file.FileVisitResult;

    import java.nio.file.Files;

    import java.nio.file.Path;

    import java.nio.file.Paths;

    import java.nio.file.SimpleFileVisitor;

    import java.nio.file.attribute.BasicFileAttributes;

     

    import org.apache.commons.exec.CommandLine;

    import org.apache.commons.exec.DefaultExecuteResultHandler;

    import org.apache.commons.exec.DefaultExecutor;

    import org.apache.commons.exec.ExecuteException;

    import org.apache.commons.exec.ExecuteWatchdog;

    import org.apache.commons.exec.PumpStreamHandler;

    import org.apache.commons.io.FileUtils;

    import org.apache.commons.io.FilenameUtils;

     

    import com.attilax.util.ExUtil;

     

    public class pdfutilV2 {

     

    public static void main(String[] args) throws ExecuteException, IOException {

     

    Files.walkFileTree(

    Paths.get(

    "C:\\Users\\Administrator\\Documents\\WeChat Files\\attilax\\FileStorage\\File\\2019-08\\CityLink接入文档"),

    new SimpleFileVisitor<Path>() {

     

    // 处理文件

    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {

     

    // return super.visitFile(file, attrs);

    String fpath = file.toString();

    String ext=FilenameUtils.getExtension(fpath);

    if(!ext.equals("pdf"))

    return FileVisitResult.CONTINUE; // 没找到继续找

    String rzt = pdfutilV2.convert2txt_consoleBlockmodeV2(fpath);

    FileUtils.writeStringToFile(new File("d:\\0pdfout\\" + file.toFile().getName() + ".txt"), rzt,

    true);

    return FileVisitResult.CONTINUE; // 没找到继续找

    }

     

    });

     

    String sou = "C:\\atibeks517\\l4 doc v3 r7a ori exted\\_0index\\一种简单的基于字符形状的验证码识别技术.pdf";

    String dest = " c:\\logs\\v2识别技术.pdf.txt";

    System.out.println();

    ;

    }

     

    public static String convert2txt_consoleBlockmodeV2(String sou) {

    // String s = " java -jar D:\\0gif sexy\\pdfbox-app-2.0.9.jar  ExtractText  @sou@ @dest@";

    // s.replaceAll("@sou@", sou);

    // s.replaceAll("@dest@", dest);

    // final CommandLine cmdLine = CommandLine.parse(s);

     

    final CommandLine cmdLine = new CommandLine("D:\\jdk1.8.0_31\\bin\\java.exe");

    cmdLine.addArgument("-jar");

    cmdLine.addArgument("D:\\0gif sexy\\pdfbox-app-2.0.9.jar");

    cmdLine.addArgument("ExtractText");

    cmdLine.addArgument("-console");

     

    cmdLine.addArgument(sou);

    // cmdLine.addArgument(dest);

     

    // DefaultExecuteResultHandler resultHandler = new

    // DefaultExecuteResultHandler();

    DefaultExecutor executor = new DefaultExecutor();

    try {

    ByteArrayOutputStream baos = new ByteArrayOutputStream();

    executor.setStreamHandler(new PumpStreamHandler(baos, baos));// iytstren

    System.out.println( cmdLine);

    executor.execute(cmdLine);

     

    String result = baos.toString("utf8").trim();

    return result;

     

    } catch (Exception e) {

    ExUtil.throwExV2(e);

    }

    return "";

     

    }

     

     

      1.3. Pdf api模式

     

    /bookmarksHtmlEverythingIndexPrj/src/emailPKg/ExtractTextFromPDF.java

     

     

    package emailPKg;

     

    import java.io.File;

    import java.io.FileInputStream;

    import java.io.FileNotFoundException;

    import java.io.IOException;

     

    import org.apache.commons.io.FileUtils;

    import org.apache.commons.io.FilenameUtils;

    import org.apache.pdfbox.pdfparser.PDFParser;

    import org.apache.pdfbox.pdmodel.PDDocument;

    import org.apache.pdfbox.util.PDFTextStripper;

     

     

     

     /**

      * jar 1.8.16

      * @author zhoufeiyue

      *

      */

    public class ExtractTextFromPDF {

    public static  String readPDFV2WithCache(String filename,String cacheDir) throws  Exception{

     

    String basename=FilenameUtils.getName(filename);

    File file2 = new File(cacheDir+"\\"+basename+".txt");

    if(file2.exists())

    {

    return FileUtils.readFileToString(file2);

    }

     

    File file = new File(filename);

    FileInputStream in = null;

     

    in = new FileInputStream(filename);

    PDFParser parser = new PDFParser(in);

    parser.parse();

    PDDocument pdDocument = parser.getPDDocument();

    PDFTextStripper stripper = new PDFTextStripper();

    String result = stripper.getText(pdDocument);

     

    System.out.println("PDF文件" + file.getAbsolutePath()+"内容如下:");

     

    FileUtils.write(file2, result);

    return (result);

     

    }

     

     

    2. ref

    Apache PDFBox _ Command-Line Tools.html

    Atitit 读写文件慢的解决方案cache法  pdf转txt

     

  • 相关阅读:
    2018 10-708 (CMU) Probabilistic Graphical Models {Lecture 25} [Spectral Methods]
    2018 10-708 (CMU) Probabilistic Graphical Models {Lecture 23} [Applications in Computer Vision (cont’d) + Gaussian Process] (unfinished)
    2018 10-708 (CMU) Probabilistic Graphical Models {Lecture 22} [Applications in Computer Vision (cont’d) + Gaussian Process]
    2018 10-708 (CMU) Probabilistic Graphical Models {Lecture 21} [A Hybrid: Deep Learning and Graphical Models]
    2018 10-708 (CMU) Probabilistic Graphical Models {Lecture 15} [Mean field Approximation]
    Font Awesome
    Vue中img的src属性绑定
    匿名内部类的使用
    局部内部类的特点与使用
    linux systemctl命令
  • 原文地址:https://www.cnblogs.com/attilax/p/15197155.html
Copyright © 2011-2022 走看看