zoukankan      html  css  js  c++  java
  • pdfBox 读取pdf文件

    1、引入maven依赖

            <dependency>
              <groupId>org.apache.pdfbox</groupId>
              <artifactId>pdfbox</artifactId>
              <version>2.0.4</version>
            </dependency>

    2、相关工具类:PdfParser.java

    package com.insurance.tool;
    
    import java.io.File;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
    import org.apache.pdfbox.text.PDFTextStripper;
    
    import com.insurance.pojo.Insurance;
    import com.insurance.pojo.InsuranceOrder;
    import com.insurance.pojo.InsuranceProgram;
    
    
    public class PdfParser {
        
        public static void main(String[] args) {
            readPDF("C:\Users\yinz\Desktop\场景1\场景1_样例_电子保单识别.pdf");
        }
    
        public static List<InsuranceOrder> readPDF(InputStream stream) throws Exception{
            List<InsuranceOrder> orderList = new ArrayList<InsuranceOrder>();
            PDDocument document = null;
            document=PDDocument.load(stream);
    
            // 获取页码
            int pages = document.getNumberOfPages();
    
            // 读文本内容
            PDFTextStripper stripper=new PDFTextStripper();
            // 设置按顺序输出
            stripper.setSortByPosition(true);
            /*stripper.setStartPage(1);
            stripper.setEndPage(pages);
            String content = stripper.getText(document);
            System.out.println(content);*/     
            
            for(int page = 1; page <= pages; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                String content = stripper.getText(document);
                //System.out.println(content);
                parseContent(content, orderList);
            }
            
            System.out.println(orderList);
            return orderList;
        }
        
        public static void readPDF(String filePath) {
            List<InsuranceOrder> orderList = new ArrayList<InsuranceOrder>();
            File pdfFile = new File(filePath);
            PDDocument document = null;
            try
            {
                document=PDDocument.load(pdfFile);
    
                // 获取页码
                int pages = document.getNumberOfPages();
    
                // 读文本内容
                PDFTextStripper stripper=new PDFTextStripper();
                // 设置按顺序输出
                stripper.setSortByPosition(true);
                /*stripper.setStartPage(1);
                stripper.setEndPage(pages);
                String content = stripper.getText(document);
                System.out.println(content);*/     
                
                for(int page = 1; page <= pages; page++) {
                    stripper.setStartPage(page);
                    stripper.setEndPage(page);
                    String content = stripper.getText(document);
                    //System.out.println(content);
                    parseContent(content, orderList);
                }
                System.out.println(orderList);
            }
            catch(Exception e)
            {
                System.out.println(e);
            }
    
            }
        
        private static Pattern insurancePoliceNoP = Pattern.compile("保险单号\s(.*?)\s");
        private static Pattern insuranceApplicationNoP = Pattern.compile("投保单号\s(.*?)\s");
        private static Pattern policeHolderP = Pattern.compile("投 保 人.*
    ");
        private static Pattern insuredP = Pattern.compile("被保险人.*
    ");
        private static Pattern insuredAgeP = Pattern.compile("被保险人投保年龄\s(.*?)(
    |\s)");
        private static Pattern beneficiaryP = Pattern.compile("身故受益人及分配方式\s(.*?)(
    |\s)");
        private static Pattern insuranceNameP = Pattern.compile("险种名称及款式\s(.*?)(
    |\s)");
        private static Pattern validPeriodP = Pattern.compile("保险期间\s(.*?)\s合同生效日", Pattern.DOTALL);
        private static Pattern effectiveDateP = Pattern.compile("合同生效日\s(.*?)(
    |\s)");
        private static Pattern chargeWayP = Pattern.compile("交费方式\s(.*?)\s");
        private static Pattern feeP = Pattern.compile("保 险 费\s(.*?)(
    |\s)");
        private static Pattern policeHolderCount = Pattern.compile("投保份数\s(.*?)(
    |\s)");
        private static Pattern programListP = Pattern.compile("保险金额(.*?)保险责任与责任免除详见条款", Pattern.DOTALL);
        /*private static Pattern validPeriodP = Pattern.compile("保险期间\s(.*?)\s");
        private static Pattern effectiveDateP = Pattern.compile("合同生效日\s(.*?)\s");*/
        private static void parseContent(String content, List<InsuranceOrder> list) {
            if(content == null || content.trim().length() == 0) {
                return;
            }
            if(content.startsWith("个 人 人 身 保 险 保 险 单")) {
                //个人信息
                InsuranceOrder order = new InsuranceOrder();
                String insurancePoliceNo = retriveText(content, insurancePoliceNoP, 1);
                if(insurancePoliceNo == null || insurancePoliceNo.length() <= 0) {
                    return;
                }
                list.add(order);
                order.setInsurancePoliceNo(insurancePoliceNo);
                order.setInsuranceApplicationNo(retriveText(content, insuranceApplicationNoP, 1));
                
                String policeHolderInfo = retriveTextWithInnnerBlank(content, policeHolderP, 0);
                if(policeHolderInfo != null) {
                    Pattern policeHolderNameP = Pattern.compile("投 保 人(.*?)性别");
                    Pattern policeHolderGenderP = Pattern.compile("性别(.*?)出生日期");
                    Pattern policeHolderBirthdayP = Pattern.compile("出生日期(.*?)证件号码");
                    Pattern policeHolderIDP = Pattern.compile("证件号码(.*)$");
                    
                    order.setPoliceHolderName(retriveText(policeHolderInfo, policeHolderNameP, 1));
                    order.setPoliceHolderGender(retriveText(policeHolderInfo, policeHolderGenderP, 1));
                    order.setPoliceHolderBirthday(retriveText(policeHolderInfo, policeHolderBirthdayP, 1));
                    order.setPoliceHolderID(retriveText(policeHolderInfo, policeHolderIDP, 1));
                }
                String insuredInfo = retriveTextWithInnnerBlank(content, insuredP, 0);
                if(insuredInfo != null) {
                    Pattern insuredNameP = Pattern.compile("被保险人(.*?)性别");
                    Pattern insuredGenderP = Pattern.compile("性别(.*?)出生日期");
                    Pattern insuredBirthdayP = Pattern.compile("出生日期(.*?)证件号码");
                    Pattern insuredIDP = Pattern.compile("证件号码(.*)$");
                    
                    order.setInsuredName(retriveText(insuredInfo, insuredNameP, 1));
                    order.setInsuredGender(retriveText(insuredInfo, insuredGenderP, 1));
                    order.setInsuredBirthday(retriveText(insuredInfo, insuredBirthdayP, 1));
                    order.setInsuredID(retriveText(insuredInfo, insuredIDP, 1));
                }
                order.setInsuredAge(retriveText(content, insuredAgeP, 1));
                order.setBeneficiary(retriveText(content, beneficiaryP, 1));
                
                //保险信息
                Insurance insurance = new Insurance();
                order.setInsurance(insurance);
                insurance.setName(retriveText(content, insuranceNameP, 1));
                insurance.setValidPeriod(retriveText(content, validPeriodP, 1).replaceAll("
    ", ""));
                insurance.setEffectiveDate(retriveText(content, effectiveDateP, 1));
                insurance.setChargeWay(retriveText(content, chargeWayP, 1));
                insurance.setFee(retriveText(content, feeP, 1));
                insurance.setPoliceHolderCount(retriveText(content, policeHolderCount, 1));
                
                //保险项目信息
                String programList = retriveTextWithInnnerBlank(content, programListP, 1);
                if(programList != null) {
                    String[] pArr = programList.split("
    ");
                    for(String str : pArr) {
                        if(str != null && str.trim().length() > 0) {
                            String[] subArr = str.split(" ");
                            InsuranceProgram program = new InsuranceProgram();
                            order.getProgramList().add(program);
                            program.setName(subArr[0]);
                            program.setFee(subArr[1]);
                        }
                    }
                }
            }
        }
        
        private static String retriveText(String content, Pattern p, int position) {
            Matcher m = p.matcher(content);
            if(m.find()) {
                return m.group(position).trim().replace(" ", "");
            }
            return "";
        }
        
        private static String retriveTextWithInnnerBlank(String content, Pattern p, int position) {
            Matcher m = p.matcher(content);
            if(m.find()) {
                return m.group(position).trim();
            }
            return "";
        }
    }

    相关实体类:InsuranceOrder.java

    package com.insurance.pojo;
    
    import java.util.ArrayList;
    import java.util.List;
    
    /**
     * Plain data holder for one parsed insurance policy: policy numbers, the
     * policy holder, the insured person, the insurance product and its coverage items.
     * All scalar values are kept as strings.
     */
    public class InsuranceOrder {

        // --- policy identifiers ---
        private String insurancePoliceNo;       // insurance policy number
        private String insuranceApplicationNo;  // application form number

        // --- policy holder ---
        private String policeHolderName;        // policy holder name
        private String policeHolderBirthday;    // policy holder date of birth
        private String policeHolderGender;      // policy holder gender
        private String policeHolderID;          // policy holder ID document number

        // --- insured person ---
        private String insuredName;             // insured person name
        private String insuredGender;           // insured person gender
        private String insuredBirthday;         // insured person date of birth
        private String insuredID;               // insured person ID document number
        private String insuredAge;              // insured person's age at application
        private String beneficiary;             // death beneficiary and allocation method

        // --- product ---
        private Insurance insurance;            // insurance product
        private List<InsuranceProgram> programList = new ArrayList<InsuranceProgram>();  // coverage items

        public String getInsurancePoliceNo() {
            return this.insurancePoliceNo;
        }

        public void setInsurancePoliceNo(String insurancePoliceNo) {
            this.insurancePoliceNo = insurancePoliceNo;
        }

        public String getInsuranceApplicationNo() {
            return this.insuranceApplicationNo;
        }

        public void setInsuranceApplicationNo(String insuranceApplicationNo) {
            this.insuranceApplicationNo = insuranceApplicationNo;
        }

        public String getPoliceHolderName() {
            return this.policeHolderName;
        }

        public void setPoliceHolderName(String policeHolderName) {
            this.policeHolderName = policeHolderName;
        }

        public String getPoliceHolderBirthday() {
            return this.policeHolderBirthday;
        }

        public void setPoliceHolderBirthday(String policeHolderBirthday) {
            this.policeHolderBirthday = policeHolderBirthday;
        }

        public String getPoliceHolderGender() {
            return this.policeHolderGender;
        }

        public void setPoliceHolderGender(String policeHolderGender) {
            this.policeHolderGender = policeHolderGender;
        }

        public String getPoliceHolderID() {
            return this.policeHolderID;
        }

        public void setPoliceHolderID(String policeHolderID) {
            this.policeHolderID = policeHolderID;
        }

        public String getInsuredName() {
            return this.insuredName;
        }

        public void setInsuredName(String insuredName) {
            this.insuredName = insuredName;
        }

        public String getInsuredGender() {
            return this.insuredGender;
        }

        public void setInsuredGender(String insuredGender) {
            this.insuredGender = insuredGender;
        }

        public String getInsuredBirthday() {
            return this.insuredBirthday;
        }

        public void setInsuredBirthday(String insuredBirthday) {
            this.insuredBirthday = insuredBirthday;
        }

        public String getInsuredID() {
            return this.insuredID;
        }

        public void setInsuredID(String insuredID) {
            this.insuredID = insuredID;
        }

        public String getInsuredAge() {
            return this.insuredAge;
        }

        public void setInsuredAge(String insuredAge) {
            this.insuredAge = insuredAge;
        }

        public String getBeneficiary() {
            return this.beneficiary;
        }

        public void setBeneficiary(String beneficiary) {
            this.beneficiary = beneficiary;
        }

        public Insurance getInsurance() {
            return this.insurance;
        }

        public void setInsurance(Insurance insurance) {
            this.insurance = insurance;
        }

        public List<InsuranceProgram> getProgramList() {
            return this.programList;
        }

        public void setProgramList(List<InsuranceProgram> programList) {
            this.programList = programList;
        }

        /**
         * Renders every field as {@code name=value} pairs inside
         * {@code InsuranceOrder [...]}; unset fields appear as {@code null}.
         */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("InsuranceOrder [insurancePoliceNo=");
            text.append(insurancePoliceNo);
            text.append(", insuranceApplicationNo=").append(insuranceApplicationNo);
            text.append(", policeHolderName=").append(policeHolderName);
            text.append(", policeHolderBirthday=").append(policeHolderBirthday);
            text.append(", policeHolderGender=").append(policeHolderGender);
            text.append(", policeHolderID=").append(policeHolderID);
            text.append(", insuredName=").append(insuredName);
            text.append(", insuredGender=").append(insuredGender);
            text.append(", insuredBirthday=").append(insuredBirthday);
            text.append(", insuredID=").append(insuredID);
            text.append(", insuredAge=").append(insuredAge);
            text.append(", beneficiary=").append(beneficiary);
            text.append(", insurance=").append(insurance);
            text.append(", programList=").append(programList);
            text.append("]");
            return text.toString();
        }
    }

    InsuranceProgram.java

    package com.insurance.pojo;
    
    /**
     * 保险项目
     * @author yinz
     *
     */
    /**
     * One coverage item of an insurance policy: the item name and its amount,
     * both stored as strings.
     */
    public class InsuranceProgram {

        /** Item name. */
        private String name;

        /** Amount. */
        private String fee;

        /** @return the item name, or {@code null} if it has not been set */
        public String getName() {
            return this.name;
        }

        /** @param name the item name */
        public void setName(String name) {
            this.name = name;
        }

        /** @return the amount, or {@code null} if it has not been set */
        public String getFee() {
            return this.fee;
        }

        /** @param fee the amount */
        public void setFee(String fee) {
            this.fee = fee;
        }

        /** Renders the item as {@code InsuranceProgram [name=..., fee=...]}. */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("InsuranceProgram [name=");
            text.append(name).append(", fee=").append(fee).append("]");
            return text.toString();
        }
    }

    此处用于读取的pdf文件:http://files.cnblogs.com/files/yinz/场景1_样例_电子保单识别.rar

  • 相关阅读:
    scala之伴生对象的继承
    scala之伴生对象说明
    “Failed to install the following Android SDK packages as some licences have not been accepted” 错误
    PATH 环境变量重复问题解决
    Ubuntu 18.04 配置java环境
    JDBC的基本使用2
    DCL的基本语法(授权)
    ZJNU 1374
    ZJNU 2184
    ZJNU 1334
  • 原文地址:https://www.cnblogs.com/yinz/p/7109986.html
Copyright © 2011-2022 走看看