zoukankan html css js c++ java

使用 PDFBox 将 PDF 中的表格识别并导出为 Excel

1、继承 PageDrawer 和 PDFRenderer，在渲染页面时记录每个填充路径（表格线/单元格）的坐标

2、通过坐标获取文字

3、通过easyExcel生成表格

/**
 * Page drawer that records where every filled path sits instead of painting it.
 *
 * <p>Each call to {@link #fillPath(int)} appends one {@code Coordinate} to the
 * shared {@link #COORDINATE_LIST}; the current path is then discarded, so
 * filled shapes are not drawn onto the rendered image.
 */
public class MyPageDrawer extends PageDrawer {

    /** Coordinates collected so far. Shared by all instances and never cleared here. */
    static final List<Coordinate> COORDINATE_LIST = new ArrayList<>();

    /** Height of the page's bounding box, used to flip the y axis. */
    double pageHeight;

    /** The page most recently handed to {@link #processPage(PDPage)}. */
    PDPage pdPage;

    /**
     * Creates the drawer and caches the height of the page being drawn.
     *
     * @param parameters drawer parameters supplied by the renderer
     * @throws IOException if the parent constructor fails
     */
    MyPageDrawer(PageDrawerParameters parameters) throws IOException {
        super(parameters);
        pageHeight = parameters.getPage().getBBox().getHeight();
    }

    /**
     * Remembers the page, then delegates to the default processing.
     *
     * @param aPage page about to be processed
     * @throws IOException if the parent processing fails
     */
    @Override
    public void processPage(PDPage aPage) throws IOException {
        pdPage = aPage;
        super.processPage(aPage);
    }

    /**
     * Records the integer location of the current path's bounding box and
     * resets the path without filling it.
     *
     * @param windingRule ignored; nothing is painted
     */
    @Override
    public void fillPath(int windingRule) {
        Shape pathBounds = getLinePath().getBounds2D();
        int left = pathBounds.getBounds().x;
        int top = pathBounds.getBounds().y;
        // The y value is mirrored against the (truncated) page height.
        COORDINATE_LIST.add(new Coordinate(left, (int) pageHeight - top));
        getLinePath().reset();
    }
}

/**
 * Renderer that draws every page with a {@link MyPageDrawer}, so that the
 * drawer can record path coordinates while the page is rendered.
 */
public class MyPDFRenderer extends PDFRenderer {

    /**
     * @param document document whose pages will be rendered
     */
    MyPDFRenderer(PDDocument document) {
        super(document);
    }

    /**
     * Supplies the coordinate-collecting drawer in place of the default one.
     *
     * @param parameters drawer parameters supplied by the renderer
     * @return a new {@link MyPageDrawer}
     * @throws IOException if the drawer cannot be created
     */
    @Override
    protected PageDrawer createPageDrawer(PageDrawerParameters parameters) throws IOException {
        PageDrawer drawer = new MyPageDrawer(parameters);
        return drawer;
    }
}

public class App {
    public static void main(String[] args) throws Exception {
        String fileName = "E:\download\test\2020年12月北京工程造价信息.pdf";  //这里先手动把绝对路径的文件夹给补上。
       readPDF(fileName);
    }
    /**
     * 读PDF文件，使用了pdfbox开源项目
     * @param fileName
     */
    public static  void readPDF(String fileName) {
        File file = new File(fileName);
        FileInputStream in = null;
        try {
            in = new FileInputStream(fileName);
            // 新建一个PDF解析器对象
            PDFParser parser = new PDFParser(new RandomAccessFile(file,"rw"));
            // 对PDF文件进行解析
            parser.parse();
            // 获取解析后得到的PDF文档对象
            PDDocument pdfdocument = parser.getPDDocument();
            System.out.println("NumberOfPages:"+ pdfdocument.getNumberOfPages());

            PDFRenderer renderer = new MyPDFRenderer(pdfdocument);

            int pageNum=12;
            BufferedImage image =  renderer.renderImage(pageNum);
            ImageIO.write(image, "PNG", new File("test.png"));

//            System.out.println("SEG_LINETO_LIST...");
//            MyPageDrawer.SEG_LINETO_LIST.stream().forEach(System.out::println);

            String resultFileName = "simpleWrite" + System.currentTimeMillis() + ".xlsx";
            EasyExcel.write(resultFileName).sheet().doWrite(judgeCoordinate(MyPageDrawer.COORDINATE_LIST, pdfdocument, pageNum));
        } catch (Exception e) {
            System.out.println("读取PDF文件" + file.getAbsolutePath() + "生失败！" + e);
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e1) {
                }
            }
        }
    }
    /**
     * 去重排序
     *
     * @param coordinateList
     * @param document
     * @return
     */
    private static List<List<String>> judgeCoordinate(List<Coordinate> coordinateList, PDDocument document,int pageNum)  {
        //去除pdf边界
        coordinateList=coordinateList.stream().filter(coordinate -> !(coordinate.getX()<38||coordinate.getY()<70||coordinate.getY()>780||coordinate.getX()>558)).collect(Collectors.toList());
        // 去重 按y,x排序 从左上角开始计算
        coordinateList = coordinateList.stream().sorted(Comparator.comparing(Coordinate::getY).thenComparing(Coordinate::getX)).collect(Collectors.toList());
        System.out.println("去重，排序后，分组前...");
        coordinateList.stream().forEach(System.out::println);
        // 去除相近元素
        for(int a=0;a<coordinateList.size();a++){
            Coordinate coordinateStart = coordinateList.get(a);
            for (int j = a+1; j < coordinateList.size(); j++) {
                Coordinate coordinateC = coordinateList.get(j);
                if (Math.abs(coordinateStart.getY()-coordinateC.getY()) <=2) {
                    if(Math.abs(coordinateC.getX()-coordinateStart.getX())<=2){
                        coordinateList.remove(j);
                        j--;
                    }else {
                        int y=coordinateStart.getY()>coordinateC.getY()?coordinateC.getY():coordinateStart.getY();
                        coordinateC.setY(y);
                    }
                }else {
                    break;
                }
            }
        }
        //需要重新排序
        coordinateList=coordinateList.stream().sorted(Comparator.comparing(Coordinate::getY).thenComparing(Coordinate::getX))
                .collect(Collectors.toList());

        Map<Integer, List<Coordinate>> groupList = coordinateList.stream()
                .collect(Collectors.groupingBy(Coordinate::getY));
        Map<Integer, List<Coordinate>> result =new LinkedHashMap<>();
        groupList.entrySet().stream().sorted(Map.Entry.<Integer, List<Coordinate>>comparingByKey())
                .forEachOrdered(e -> result.put(e.getKey(), e.getValue()));
        System.out.println("总行数:"+result.size());

        List<List<Coordinate>> resultRow = result.values().stream()
                .collect(Collectors.toList());
        resultRow=resultRow.stream().filter(item-> (item.size()>1)).collect(Collectors.toList());
        System.out.println("去重，排序，分组后...");

        resultRow.stream().forEach(System.out::println);
        List<List<String>> mapList = new ArrayList<>();
        for (int k = 0; k < resultRow.size()-1; k++) {
            Map<String,String> map = new HashMap<>();
            List<String> listRow=new ArrayList<>();
            boolean nullData=false;
            for (int i = 0; i < resultRow.get(k).size()-1; i++) {
                Coordinate coordinateStart=resultRow.get(k).get(i);
                List<Coordinate> nextRow=resultRow.get(k+1);
                if(nextRow.size()>i+1){
                    Coordinate coordinateEnd=nextRow.get(i+1);
                    int width=coordinateEnd.getX() - coordinateStart.getX();
                    int  height=coordinateEnd.getY() - coordinateStart.getY();
                    //左上角 为原始点 向右 加宽向下加高
                    try {
                        String info = readRectangleInfo(coordinateStart.getX(), coordinateStart.getY(),
                                width,height, document,pageNum);
                        info = info.replaceAll("
|
", "");
                        map.put("column"+i,info);
                        if(info==null||info.length()==0){
                            nullData=true;
                        }else {
                            nullData=false;
                            listRow.add(info);
                        }
                    }catch (Exception e){
                        e.printStackTrace();
                    }
                }
            }
            if(!nullData){
                mapList.add(listRow);
            }
        }
        Gson gson = new Gson();
        String mapListString = gson.toJson(mapList);
        System.out.println(mapListString);

        return mapList;
    }
    private static String readRectangleInfo(int x, int y, int width, int height, PDDocument document
            , int pageNum) throws Exception {
        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        stripper.setSortByPosition(true);
        Rectangle rect = new Rectangle(x, y, width, height);
        stripper.addRegion("rect", rect);
        PDPage firstPage = document.getPage(pageNum);
        stripper.extractRegions(firstPage);
        return stripper.getTextForRegion("rect");
    }
}


<!-- Maven dependencies used by the example above: PDFBox for rendering and
     text extraction, Gson for the debug JSON dump, EasyExcel for the .xlsx output. -->
<dependencies>

    <!-- PDF parsing, rendering (PDFRenderer/PageDrawer) and text extraction. -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>2.0.22</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
    <!-- NOTE(review): presumably already pulled in transitively by pdfbox; confirm
         whether the explicit declaration is needed. -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>fontbox</artifactId>
        <version>2.0.22</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/jempbox -->
    <!-- NOTE(review): version line differs from pdfbox (1.8.x vs 2.0.x) and none of
         the Java shown here appears to use it; confirm it is required. -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>jempbox</artifactId>
        <version>1.8.16</version>
    </dependency>
    <!-- Used to print the extracted rows as JSON. -->
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.0</version>
    </dependency>

    <!-- Writes the extracted rows to an .xlsx file. -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>easyexcel</artifactId>
        <version>2.2.7</version>
    </dependency>

</dependencies>

查看全文

相关阅读:
NYoj-119-士兵杀敌（3）-RMQ算法
 springMVC3学习(九)--redirect和forward跳转
 STL
在 Ubuntu 12.04 上通过源码安装 Open vSwitch (OVS)
SSO 基于CAS实现单点登录实例解析（二）
Linux
linux的子进程调用exec( )系列函数
 以Settings.APPLICATION_DEVELOPMENT_SETTINGS打开开发人员面板出错总结
 python学习记录
 CentOS6.X下安装配置独立SVN服务器Subversion server

原文地址：https://www.cnblogs.com/CaptainLin/p/14298026.html