Atitit pdf转文本 pdf2txt v4 t83.docx
Atitit pdf转文本 pdfutil
目录
1.1. Pdfbox cmd 模式 TextToPDF 1
import org.apache.poi.hslf.extractor.PowerPointExtractor;
poi工具
/**
 * Extracts the text of a PowerPoint file using Apache POI's
 * {@code PowerPointExtractor}.
 *
 * @param f path of the PowerPoint file to read
 * @return the text reported by {@code extractor.getText()}; {@code null} is
 *         only reached if {@code ExUtilV2t33.throwExV2} returns normally
 *         after an {@link IOException} (presumably it rethrows - confirm
 *         against that helper)
 */
public static String readPPT(String f) {
    // try-with-resources guarantees the file handle is released; the
    // previous version opened the stream and never closed it.
    try (FileInputStream in = new FileInputStream(new File(f))) {
        PowerPointExtractor extractor = new PowerPointExtractor(in);
        return extractor.getText();
    } catch (IOException e) {
        ExUtilV2t33.throwExV2(e);
    }
    return null;
}
java -jar C:\Users\attilax\Pictures\pdfbox-app-2.0.9.jar ExtractText "C:\atibeks517\l4 doc v3 r7a ori exted\_0index\一种简单的基于字符形状的验证码识别技术.pdf" c:\logs\识别技术.pdf.txt
转html
-console | false | Send text to console instead of file. |
-html | false | Output in HTML format instead of raw text. |
This application will create a PDF document from a text file.
Usage: java -jar pdfbox-app-2.y.z.jar TextToPDF [OPTIONS] <outputfile> <textfile>
Command-Line Parameter | Default | Description |
-standardFont | Helvetica | The font to use for the text. Either this or -ttf should be specified but not both. |
-ttf | | The TTF font to use for the text. Either this or -standardFont should be specified but not both. |
java——PDF转换txt - 乞彦 - 博客园.html
java -jar C:\Users\attilax\Pictures\pdfbox-app-2.0.9.jar ExtractText -console "C:\atibeks517\l4 doc v3 r7a ori exted\_0index\一种简单的基于字符形状的验证码识别技术.pdf" c:\logs\识别技术.pdf.txt
/FulltxtLucenePrj/src/com/attilax/archive/pdfutilV3t88.java
pdfutilV3t88.java
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import org.apache.commons.exec.CommandLine;
import org.apache.commons.exec.DefaultExecuteResultHandler;
import org.apache.commons.exec.DefaultExecutor;
import org.apache.commons.exec.ExecuteException;
import org.apache.commons.exec.ExecuteWatchdog;
import org.apache.commons.exec.PumpStreamHandler;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import com.attilax.util.ExUtil;
public class pdfutilV2 {
/**
 * Walks a hard-coded directory tree, converts every PDF file found to text
 * via {@link #convert2txt_consoleBlockmodeV2(String)}, and appends the text
 * to {@code d:\0pdfout\<original file name>.txt}.
 *
 * @param args ignored
 * @throws ExecuteException declared for callers; not thrown directly here
 * @throws IOException if walking the tree or writing an output file fails
 */
public static void main(String[] args) throws ExecuteException, IOException {
    Files.walkFileTree(
        Paths.get(
            "C:\\Users\\Administrator\\Documents\\WeChat Files\\attilax\\FileStorage\\File\\2019-08\\CityLink接入文档"),
        new SimpleFileVisitor<Path>() {
            // Handle each visited file.
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                String fpath = file.toString();
                String ext = FilenameUtils.getExtension(fpath);
                // Compare case-insensitively so files named "*.PDF" are not
                // silently skipped; the constant-first form is also null-safe.
                if (!"pdf".equalsIgnoreCase(ext)) {
                    return FileVisitResult.CONTINUE; // not a PDF, keep walking
                }
                String rzt = pdfutilV2.convert2txt_consoleBlockmodeV2(fpath);
                // Third argument 'true' appends to an existing output file.
                FileUtils.writeStringToFile(
                    new File("d:\\0pdfout\\" + file.toFile().getName() + ".txt"), rzt, true);
                return FileVisitResult.CONTINUE; // keep walking
            }
        });
    System.out.println();
}
/**
 * Runs the PDFBox command-line tool ({@code ExtractText -console}) in a
 * child JVM and returns everything the child wrote, blocking until it exits.
 *
 * <p>Standard output and standard error of the child are captured into the
 * same buffer, so error text from the tool ends up in the returned string.
 *
 * @param sou path of the PDF file passed to {@code ExtractText}
 * @return the captured output decoded as UTF-8 and trimmed; {@code ""} is
 *         only reached if {@code ExUtil.throwExV2} returns normally after a
 *         failure
 */
public static String convert2txt_consoleBlockmodeV2(String sou) {
    // Build the command argument by argument rather than parsing one command
    // string (the approach an earlier revision used).
    final CommandLine cmdLine = new CommandLine("D:\\jdk1.8.0_31\\bin\\java.exe")
        .addArgument("-jar")
        .addArgument("D:\\0gif sexy\\pdfbox-app-2.0.9.jar")
        .addArgument("ExtractText")
        .addArgument("-console")
        .addArgument(sou);

    DefaultExecutor pdfboxRunner = new DefaultExecutor();
    try {
        ByteArrayOutputStream captured = new ByteArrayOutputStream();
        // Same sink for stdout and stderr of the child process.
        PumpStreamHandler pump = new PumpStreamHandler(captured, captured);
        pdfboxRunner.setStreamHandler(pump);

        System.out.println(cmdLine);
        pdfboxRunner.execute(cmdLine); // synchronous: returns after the child exits

        return captured.toString("utf8").trim();
    } catch (Exception e) {
        ExUtil.throwExV2(e);
    }
    return "";
}
/bookmarksHtmlEverythingIndexPrj/src/emailPKg/ExtractTextFromPDF.java
package emailPKg;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
/**
* jar 1.8.16
* @author zhoufeiyue
*
*/
public class ExtractTextFromPDF {
/**
 * Returns the text of a PDF file, using a per-file text cache.
 *
 * <p>If {@code <cacheDir>\<file name>.txt} already exists its content is
 * returned as-is. Otherwise the PDF is parsed with PDFBox, the extracted
 * text is written to that cache file, and the text is returned.
 *
 * @param filename path of the PDF file to read
 * @param cacheDir directory holding the cached {@code .txt} files
 * @return the cached or freshly extracted text
 * @throws Exception if reading the cache, parsing the PDF, or writing the
 *         cache file fails
 */
public static String readPDFV2WithCache(String filename,String cacheDir) throws Exception{
    String basename = FilenameUtils.getName(filename);
    File file2 = new File(cacheDir+"\\"+basename+".txt");
    if (file2.exists()) {
        // Cache hit: skip PDF parsing entirely.
        return FileUtils.readFileToString(file2);
    }

    File file = new File(filename);
    String result;
    // The input stream and the parsed document both hold resources; the
    // previous version never closed either of them.
    try (FileInputStream in = new FileInputStream(filename)) {
        PDFParser parser = new PDFParser(in);
        parser.parse();
        PDDocument pdDocument = parser.getPDDocument();
        try {
            PDFTextStripper stripper = new PDFTextStripper();
            result = stripper.getText(pdDocument);
        } finally {
            pdDocument.close();
        }
    }

    System.out.println("PDF文件" + file.getAbsolutePath()+"内容如下:");
    // Populate the cache so the next call for this file name is a cache hit.
    FileUtils.write(file2, result);
    return result;
}
Apache PDFBox _ Command-Line Tools.html
Atitit 读写文件慢的解决方案cache法 pdf转txt