zoukankan      html  css  js  c++  java
  • 使用 PDFBox 读取 PDF 文件

    1、引入maven依赖

            <dependency>
              <groupId>org.apache.pdfbox</groupId>
              <artifactId>pdfbox</artifactId>
              <version>2.0.4</version>
            </dependency>

    2、相关工具类:PdfParser.java

    package com.insurance.tool;
    
    import java.io.File;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
    import org.apache.pdfbox.text.PDFTextStripper;
    
    import com.insurance.pojo.Insurance;
    import com.insurance.pojo.InsuranceOrder;
    import com.insurance.pojo.InsuranceProgram;
    
    
    public class PdfParser {
        
        public static void main(String[] args) {
            readPDF("C:\Users\yinz\Desktop\场景1\场景1_样例_电子保单识别.pdf");
        }
    
        public static List<InsuranceOrder> readPDF(InputStream stream) throws Exception{
            List<InsuranceOrder> orderList = new ArrayList<InsuranceOrder>();
            PDDocument document = null;
            document=PDDocument.load(stream);
    
            // 获取页码
            int pages = document.getNumberOfPages();
    
            // 读文本内容
            PDFTextStripper stripper=new PDFTextStripper();
            // 设置按顺序输出
            stripper.setSortByPosition(true);
            /*stripper.setStartPage(1);
            stripper.setEndPage(pages);
            String content = stripper.getText(document);
            System.out.println(content);*/     
            
            for(int page = 1; page <= pages; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                String content = stripper.getText(document);
                //System.out.println(content);
                parseContent(content, orderList);
            }
            
            System.out.println(orderList);
            return orderList;
        }
        
        public static void readPDF(String filePath) {
            List<InsuranceOrder> orderList = new ArrayList<InsuranceOrder>();
            File pdfFile = new File(filePath);
            PDDocument document = null;
            try
            {
                document=PDDocument.load(pdfFile);
    
                // 获取页码
                int pages = document.getNumberOfPages();
    
                // 读文本内容
                PDFTextStripper stripper=new PDFTextStripper();
                // 设置按顺序输出
                stripper.setSortByPosition(true);
                /*stripper.setStartPage(1);
                stripper.setEndPage(pages);
                String content = stripper.getText(document);
                System.out.println(content);*/     
                
                for(int page = 1; page <= pages; page++) {
                    stripper.setStartPage(page);
                    stripper.setEndPage(page);
                    String content = stripper.getText(document);
                    //System.out.println(content);
                    parseContent(content, orderList);
                }
                System.out.println(orderList);
            }
            catch(Exception e)
            {
                System.out.println(e);
            }
    
            }
        
        private static Pattern insurancePoliceNoP = Pattern.compile("保险单号\s(.*?)\s");
        private static Pattern insuranceApplicationNoP = Pattern.compile("投保单号\s(.*?)\s");
        private static Pattern policeHolderP = Pattern.compile("投 保 人.*
    ");
        private static Pattern insuredP = Pattern.compile("被保险人.*
    ");
        private static Pattern insuredAgeP = Pattern.compile("被保险人投保年龄\s(.*?)(
    |\s)");
        private static Pattern beneficiaryP = Pattern.compile("身故受益人及分配方式\s(.*?)(
    |\s)");
        private static Pattern insuranceNameP = Pattern.compile("险种名称及款式\s(.*?)(
    |\s)");
        private static Pattern validPeriodP = Pattern.compile("保险期间\s(.*?)\s合同生效日", Pattern.DOTALL);
        private static Pattern effectiveDateP = Pattern.compile("合同生效日\s(.*?)(
    |\s)");
        private static Pattern chargeWayP = Pattern.compile("交费方式\s(.*?)\s");
        private static Pattern feeP = Pattern.compile("保 险 费\s(.*?)(
    |\s)");
        private static Pattern policeHolderCount = Pattern.compile("投保份数\s(.*?)(
    |\s)");
        private static Pattern programListP = Pattern.compile("保险金额(.*?)保险责任与责任免除详见条款", Pattern.DOTALL);
        /*private static Pattern validPeriodP = Pattern.compile("保险期间\s(.*?)\s");
        private static Pattern effectiveDateP = Pattern.compile("合同生效日\s(.*?)\s");*/
        private static void parseContent(String content, List<InsuranceOrder> list) {
            if(content == null || content.trim().length() == 0) {
                return;
            }
            if(content.startsWith("个 人 人 身 保 险 保 险 单")) {
                //个人信息
                InsuranceOrder order = new InsuranceOrder();
                String insurancePoliceNo = retriveText(content, insurancePoliceNoP, 1);
                if(insurancePoliceNo == null || insurancePoliceNo.length() <= 0) {
                    return;
                }
                list.add(order);
                order.setInsurancePoliceNo(insurancePoliceNo);
                order.setInsuranceApplicationNo(retriveText(content, insuranceApplicationNoP, 1));
                
                String policeHolderInfo = retriveTextWithInnnerBlank(content, policeHolderP, 0);
                if(policeHolderInfo != null) {
                    Pattern policeHolderNameP = Pattern.compile("投 保 人(.*?)性别");
                    Pattern policeHolderGenderP = Pattern.compile("性别(.*?)出生日期");
                    Pattern policeHolderBirthdayP = Pattern.compile("出生日期(.*?)证件号码");
                    Pattern policeHolderIDP = Pattern.compile("证件号码(.*)$");
                    
                    order.setPoliceHolderName(retriveText(policeHolderInfo, policeHolderNameP, 1));
                    order.setPoliceHolderGender(retriveText(policeHolderInfo, policeHolderGenderP, 1));
                    order.setPoliceHolderBirthday(retriveText(policeHolderInfo, policeHolderBirthdayP, 1));
                    order.setPoliceHolderID(retriveText(policeHolderInfo, policeHolderIDP, 1));
                }
                String insuredInfo = retriveTextWithInnnerBlank(content, insuredP, 0);
                if(insuredInfo != null) {
                    Pattern insuredNameP = Pattern.compile("被保险人(.*?)性别");
                    Pattern insuredGenderP = Pattern.compile("性别(.*?)出生日期");
                    Pattern insuredBirthdayP = Pattern.compile("出生日期(.*?)证件号码");
                    Pattern insuredIDP = Pattern.compile("证件号码(.*)$");
                    
                    order.setInsuredName(retriveText(insuredInfo, insuredNameP, 1));
                    order.setInsuredGender(retriveText(insuredInfo, insuredGenderP, 1));
                    order.setInsuredBirthday(retriveText(insuredInfo, insuredBirthdayP, 1));
                    order.setInsuredID(retriveText(insuredInfo, insuredIDP, 1));
                }
                order.setInsuredAge(retriveText(content, insuredAgeP, 1));
                order.setBeneficiary(retriveText(content, beneficiaryP, 1));
                
                //保险信息
                Insurance insurance = new Insurance();
                order.setInsurance(insurance);
                insurance.setName(retriveText(content, insuranceNameP, 1));
                insurance.setValidPeriod(retriveText(content, validPeriodP, 1).replaceAll("
    ", ""));
                insurance.setEffectiveDate(retriveText(content, effectiveDateP, 1));
                insurance.setChargeWay(retriveText(content, chargeWayP, 1));
                insurance.setFee(retriveText(content, feeP, 1));
                insurance.setPoliceHolderCount(retriveText(content, policeHolderCount, 1));
                
                //保险项目信息
                String programList = retriveTextWithInnnerBlank(content, programListP, 1);
                if(programList != null) {
                    String[] pArr = programList.split("
    ");
                    for(String str : pArr) {
                        if(str != null && str.trim().length() > 0) {
                            String[] subArr = str.split(" ");
                            InsuranceProgram program = new InsuranceProgram();
                            order.getProgramList().add(program);
                            program.setName(subArr[0]);
                            program.setFee(subArr[1]);
                        }
                    }
                }
            }
        }
        
        private static String retriveText(String content, Pattern p, int position) {
            Matcher m = p.matcher(content);
            if(m.find()) {
                return m.group(position).trim().replace(" ", "");
            }
            return "";
        }
        
        private static String retriveTextWithInnnerBlank(String content, Pattern p, int position) {
            Matcher m = p.matcher(content);
            if(m.find()) {
                return m.group(position).trim();
            }
            return "";
        }
    }

    相关实体类:InsuranceOrder.java

    package com.insurance.pojo;
    
    import java.util.ArrayList;
    import java.util.List;
    
    /**
     * One parsed insurance policy: identifiers, policy-holder and insured details,
     * the insurance product and its list of programs. All scalar values are kept
     * as strings.
     */
    public class InsuranceOrder {

        // Policy identifiers
        private String insurancePoliceNo;       // policy number
        private String insuranceApplicationNo;  // application number

        // Policy holder
        private String policeHolderName;        // name
        private String policeHolderBirthday;    // date of birth
        private String policeHolderGender;      // gender
        private String policeHolderID;          // ID document number

        // Insured person
        private String insuredName;             // name
        private String insuredGender;           // gender
        private String insuredBirthday;         // date of birth
        private String insuredID;               // ID document number
        private String insuredAge;              // age at application

        private String beneficiary;             // death beneficiary and allocation method

        private Insurance insurance;            // insurance product
        private List<InsuranceProgram> programList = new ArrayList<InsuranceProgram>();  // insurance programs

        // ---- policy identifiers ----

        public String getInsurancePoliceNo() {
            return this.insurancePoliceNo;
        }

        public void setInsurancePoliceNo(String insurancePoliceNo) {
            this.insurancePoliceNo = insurancePoliceNo;
        }

        public String getInsuranceApplicationNo() {
            return this.insuranceApplicationNo;
        }

        public void setInsuranceApplicationNo(String insuranceApplicationNo) {
            this.insuranceApplicationNo = insuranceApplicationNo;
        }

        // ---- policy holder ----

        public String getPoliceHolderName() {
            return this.policeHolderName;
        }

        public void setPoliceHolderName(String policeHolderName) {
            this.policeHolderName = policeHolderName;
        }

        public String getPoliceHolderBirthday() {
            return this.policeHolderBirthday;
        }

        public void setPoliceHolderBirthday(String policeHolderBirthday) {
            this.policeHolderBirthday = policeHolderBirthday;
        }

        public String getPoliceHolderGender() {
            return this.policeHolderGender;
        }

        public void setPoliceHolderGender(String policeHolderGender) {
            this.policeHolderGender = policeHolderGender;
        }

        public String getPoliceHolderID() {
            return this.policeHolderID;
        }

        public void setPoliceHolderID(String policeHolderID) {
            this.policeHolderID = policeHolderID;
        }

        // ---- insured person ----

        public String getInsuredName() {
            return this.insuredName;
        }

        public void setInsuredName(String insuredName) {
            this.insuredName = insuredName;
        }

        public String getInsuredGender() {
            return this.insuredGender;
        }

        public void setInsuredGender(String insuredGender) {
            this.insuredGender = insuredGender;
        }

        public String getInsuredBirthday() {
            return this.insuredBirthday;
        }

        public void setInsuredBirthday(String insuredBirthday) {
            this.insuredBirthday = insuredBirthday;
        }

        public String getInsuredID() {
            return this.insuredID;
        }

        public void setInsuredID(String insuredID) {
            this.insuredID = insuredID;
        }

        public String getInsuredAge() {
            return this.insuredAge;
        }

        public void setInsuredAge(String insuredAge) {
            this.insuredAge = insuredAge;
        }

        // ---- beneficiary, product and programs ----

        public String getBeneficiary() {
            return this.beneficiary;
        }

        public void setBeneficiary(String beneficiary) {
            this.beneficiary = beneficiary;
        }

        public Insurance getInsurance() {
            return this.insurance;
        }

        public void setInsurance(Insurance insurance) {
            this.insurance = insurance;
        }

        public List<InsuranceProgram> getProgramList() {
            return this.programList;
        }

        public void setProgramList(List<InsuranceProgram> programList) {
            this.programList = programList;
        }

        /** Renders every field as {@code name=value} pairs inside {@code InsuranceOrder [...]}. */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("InsuranceOrder [insurancePoliceNo=");
            text.append(insurancePoliceNo);
            text.append(", insuranceApplicationNo=").append(insuranceApplicationNo);
            text.append(", policeHolderName=").append(policeHolderName);
            text.append(", policeHolderBirthday=").append(policeHolderBirthday);
            text.append(", policeHolderGender=").append(policeHolderGender);
            text.append(", policeHolderID=").append(policeHolderID);
            text.append(", insuredName=").append(insuredName);
            text.append(", insuredGender=").append(insuredGender);
            text.append(", insuredBirthday=").append(insuredBirthday);
            text.append(", insuredID=").append(insuredID);
            text.append(", insuredAge=").append(insuredAge);
            text.append(", beneficiary=").append(beneficiary);
            text.append(", insurance=").append(insurance);
            text.append(", programList=").append(programList);
            text.append("]");
            return text.toString();
        }
    }

    InsuranceProgram.java

    package com.insurance.pojo;
    
    /**
     * 保险项目
     * @author yinz
     *
     */
    public class InsuranceProgram {

        /** Program name. */
        private String name;

        /** Amount, kept as text. */
        private String fee;

        /** @return the program name */
        public String getName() {
            return this.name;
        }

        /** @param name the program name */
        public void setName(String name) {
            this.name = name;
        }

        /** @return the amount */
        public String getFee() {
            return this.fee;
        }

        /** @param fee the amount */
        public void setFee(String fee) {
            this.fee = fee;
        }

        /** Renders both fields inside {@code InsuranceProgram [...]}. */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("InsuranceProgram [name=");
            text.append(name);
            text.append(", fee=").append(fee);
            text.append("]");
            return text.toString();
        }
    }

    此处用于读取的pdf文件:http://files.cnblogs.com/files/yinz/场景1_样例_电子保单识别.rar

  • 相关阅读:
    Codeforces1101G (Zero XOR Subset)-less 【线性基】【贪心】
    Codeforces1101F Trucks and Cities 【滑动窗口】【区间DP】
    HDU4651 Partition 【多项式求逆】
    BZOJ2554 color 【概率DP】【期望DP】
    codeforces1101D GCD Counting 【树形DP】
    codechef EBAIT Election Bait【欧几里得算法】
    BZOJ2434 [NOI2011] 阿狸的打字机 【树链剖分】【线段树】【fail树】【AC自动机】
    codeforces1093G Multidimensional Queries 【线段树】
    BZOJ3277 串 【后缀数组】【二分答案】【主席树】
    AHOI2013 差异 【后缀数组】
  • 原文地址:https://www.cnblogs.com/yinz/p/7109986.html
Copyright © 2011-2022 走看看