zoukankan      html  css  js  c++  java
  • 基于java对doc文档的分词,导入数据库

    这篇word文档都是正规的文本文字,有一定的格式,其中没有图片等难以处理的内容

    我也是刚开始学习对word文档的处理,其中也有很多不懂的地方

    Apache POI是Apache软件基金会的开放源码函式库,POI提供API给Java程序对Microsoft Office格式档案读和写的功能。

    1、首先从 http://poi.apache.org/download.html 下载poi的包

    2、然后编写函数对文档进行处理

    读取doc文档

      

    /**
     * Reads the complete text of a binary Word ({@code .doc}) file.
     *
     * <p>Any failure (missing file, unreadable document, ...) is printed to
     * standard error and an empty string is returned instead of propagating
     * the exception.
     *
     * @param file the {@code .doc} file to read
     * @return the document text as returned by {@code HWPFDocument.getDocumentText()},
     *         or {@code ""} if the file could not be read
     */
    public static String contextOfDoc(File file) {
        String str = "";
        // try-with-resources guarantees the stream is closed even when opening
        // or parsing the document throws.
        try (FileInputStream fis = new FileInputStream(file)) {
            HWPFDocument doc = new HWPFDocument(fis);
            try {
                str = doc.getDocumentText();
            } finally {
                // Release the document even if text extraction fails.
                doc.close();
            }
        } catch (Exception e) {
            // Best-effort read: report the problem and fall through to return "".
            e.printStackTrace();
        }
        return str;
    }

    测试

    public static void main(String[] args) {
            File file = new File("src/1.doc");
            String str = contextOfDoc(file);
            String[] arr = str.split("
    ");
            for (int i = 9; i < 284; i++) {
                System.out.println(arr[i]);
            }
        }

    先切分文档,分为目录和内容

    /**
     * Loads the text of {@code src/1.doc} via {@code docIo.contextOfDoc} and
     * splits it at every occurrence of the marker "第五篇".
     *
     * <p>According to the accompanying article the split is meant to separate
     * the table of contents from the body text.
     *
     * @return the pieces of the document text around the marker
     */
    public static String[] cataAndContext() {
        String fullText = docIo.contextOfDoc(new File("src/1.doc"));
        return fullText.split("第五篇");
    }

    对目录和内容分别切分

    /**
     * Cuts the given text into four sections using fixed chapter titles as
     * delimiters, applied in this order: "新技术篇", "网络安全篇", "基础篇",
     * "国家信息化政策规划篇", "附录".
     *
     * <p>The text before the first title and after the last one is discarded.
     * An {@link ArrayIndexOutOfBoundsException} is thrown when a required
     * delimiter is missing.
     *
     * @param str the text to cut
     * @return a list of exactly four strings: the text between each pair of
     *         consecutive delimiters, in document order
     */
    public static List<String> typePart(String str) {
        // Everything between the first title and its next occurrence (if any).
        String afterFirstTitle = str.split("新技术篇")[1];

        // Section 1: up to the second title.
        String[] cut = afterFirstTitle.split("网络安全篇");
        String firstSection = cut[0];

        // Section 2: up to the third title.
        cut = cut[1].split("基础篇");
        String secondSection = cut[0];

        // Section 3: up to the fourth title.
        cut = cut[1].split("国家信息化政策规划篇");
        String thirdSection = cut[0];

        // Section 4: up to the appendix marker.
        String fourthSection = cut[1].split("附录")[0];

        List<String> sections = new ArrayList<>();
        sections.add(firstSection);
        sections.add(secondSection);
        sections.add(thirdSection);
        sections.add(fourthSection);
        return sections;
    }

    对内容的处理

    /**
     * Import driver: extracts the paragraphs of {@code src/3.doc}, then for
     * every pair of consecutive catalog titles stored in the database inserts
     * the text found between them.
     *
     * @param args unused
     * @throws Exception if the document cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        String[] paragraphs;
        // Close the file stream and the extractor as soon as the paragraphs
        // have been read; the original never closed either of them.
        try (FileInputStream fis = new FileInputStream("src/3.doc");
             WordExtractor wordExtractor = new WordExtractor(fis)) {
            paragraphs = wordExtractor.getParagraphText();
        }

        List<String> lists = getParas(paragraphs);
        CRUD c = new CRUD();
        List<String> catas = c.getCatalogs();
        // Each title is delimited by the title that follows it, so the loop
        // stops one short of the end.
        // NOTE(review): the last catalog entry therefore never gets any
        // content inserted - confirm this is intended.
        for (int i = 0; i + 1 < catas.size(); i++) {
            String context = getContext(catas.get(i), catas.get(i + 1), lists);
            c.insertContext(catas.get(i), context);
        }
    }
        /**
         * Concatenates the paragraphs that lie strictly between the first
         * paragraph equal to {@code start} and the next paragraph equal to
         * {@code end}.
         *
         * <p>If {@code start} does not occur, the result is empty. If
         * {@code end} does not occur after {@code start}, everything after
         * {@code start} is returned. Only the first occurrence of
         * {@code start} is considered, so a later repeat of it no longer
         * causes the tail to be appended a second time.
         *
         * @param start paragraph text that marks the beginning (exclusive)
         * @param end   paragraph text that marks the end (exclusive)
         * @param paras paragraphs to search; elements must not be {@code null}
         * @return the concatenated paragraphs, without any separator
         */
        public static String getContext(String start,String end,List<String> paras) {
            // StringBuilder avoids re-copying the accumulated text on every append.
            StringBuilder context = new StringBuilder();
            int size = paras.size();
            for (int i = 0; i < size; i++) {
                if (!paras.get(i).equals(start)) {
                    continue;
                }
                for (int j = i + 1; j < size; j++) {
                    String para = paras.get(j);
                    if (para.equals(end)) {
                        break;
                    }
                    context.append(para);
                }
                // Stop after the first start marker, whether or not end was found.
                break;
            }
            return context.toString();
        }
        
        /**
         * Returns the trimmed paragraphs from index 289 onwards.
         *
         * <p>NOTE(review): 289 is specific to the sample document; presumably it
         * skips the front matter / table of contents - confirm. Use
         * {@link #getParas(String[], int)} for other documents.
         *
         * @param paras all paragraphs of the document
         * @return the trimmed paragraphs starting at index 289; empty if there
         *         are 289 paragraphs or fewer
         */
        public static List<String> getParas(String[] paras) {
            return getParas(paras, 289);
        }

        /**
         * Returns the trimmed paragraphs starting at the given index.
         *
         * @param paras all paragraphs of the document
         * @param skip  number of leading paragraphs to drop; negative values are
         *              treated as 0
         * @return the trimmed remaining paragraphs, in order
         */
        public static List<String> getParas(String[] paras, int skip) {
            List<String> paraList = new ArrayList<>();
            for (int i = Math.max(0, skip); i < paras.length; i++) {
                paraList.add(paras[i].trim());
            }
            return paraList;
        }

    数据库的crud

    /**
     * Loads the {@code catalog} column of every row in the {@code catalogs}
     * table.
     *
     * <p>A {@link SQLException} is printed to standard error and the rows
     * collected so far (possibly none) are returned.
     *
     * @return the catalog values in the order the query returns them
     */
    public List<String> getCatalogs(){
        List<String> lists = new ArrayList<>();
        Connection connection = Dbuitl.getConnection();
        String sql = "select catalog from catalogs";
        // The statement and result set are closed automatically; the original
        // closed an always-null PreparedStatement and leaked both of these.
        try (Statement statement = connection.createStatement();
             ResultSet resultSet = statement.executeQuery(sql)) {
            while (resultSet.next()) {
                lists.add(resultSet.getString("catalog"));
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            Dbuitl.close(connection);
        }
        return lists;
    }
        
        /**
         * Inserts one row into the {@code catalogs} table.
         *
         * <p>A {@link SQLException} is printed to standard error and otherwise
         * ignored; the statement and connection are always handed to
         * {@code Dbuitl.close}.
         *
         * <p>NOTE(review): the SQL uses the singular keyword {@code value};
         * confirm the target database accepts it.
         *
         * @param type value for the {@code type} column
         * @param cata value for the {@code catalog} column
         */
        public void insert(String type, String cata) {
            final String insertSql = "insert into catalogs(type,catalog) value(?,?)";
            Connection conn = Dbuitl.getConnection();
            PreparedStatement stmt = null;
            try {
                stmt = conn.prepareStatement(insertSql);
                stmt.setString(1, type);
                stmt.setString(2, cata);
                stmt.executeUpdate();
            } catch (SQLException ex) {
                ex.printStackTrace();
            } finally {
                Dbuitl.close(stmt);
                Dbuitl.close(conn);
            }
        }
        
        /**
         * Inserts one row into the {@code context} table.
         *
         * <p>A {@link SQLException} is printed to standard error and otherwise
         * ignored; the statement and connection are always handed to
         * {@code Dbuitl.close}.
         *
         * <p>NOTE(review): the SQL uses the singular keyword {@code value};
         * confirm the target database accepts it.
         *
         * @param catalog value for the {@code catalog} column
         * @param context value for the {@code context} column
         */
        public void insertContext(String catalog, String context) {
            final String insertSql = "insert into context(catalog,context) value(?,?)";
            Connection conn = Dbuitl.getConnection();
            PreparedStatement stmt = null;
            try {
                stmt = conn.prepareStatement(insertSql);
                stmt.setString(1, catalog);
                stmt.setString(2, context);
                stmt.executeUpdate();
            } catch (SQLException ex) {
                ex.printStackTrace();
            } finally {
                Dbuitl.close(stmt);
                Dbuitl.close(conn);
            }
        }
        
        /**
         * Inserts one row into the {@code sheet} table.
         *
         * <p>A {@link SQLException} is printed to standard error and otherwise
         * ignored; the statement and connection are always handed to
         * {@code Dbuitl.close}.
         *
         * <p>NOTE(review): the SQL uses the singular keyword {@code value};
         * confirm the target database accepts it.
         *
         * @param sheet value for the {@code sheet} column
         * @param type  value for the {@code type} column
         */
        public void insertSheet(String sheet, String type) {
            final String insertSql = "insert into sheet(sheet,type) value(?,?)";
            Connection conn = Dbuitl.getConnection();
            PreparedStatement stmt = null;
            try {
                stmt = conn.prepareStatement(insertSql);
                stmt.setString(1, sheet);
                stmt.setString(2, type);
                stmt.executeUpdate();
            } catch (SQLException ex) {
                ex.printStackTrace();
            } finally {
                Dbuitl.close(stmt);
                Dbuitl.close(conn);
            }
        }

  • 相关阅读:
    HTTPS协议详解
    HTTP协议详解
    网络传输协议 UDP & TCP 详解
    Socket(套接字)基础概念
    网络基础
    OSI 七层协议
    经典SQL题 1/25/50/100美分,多少种可能拼凑成2美元
    5.1一阶谓词逻辑
    4.4符号视角下的科学
    4.3领域语言与自然语言的比较
  • 原文地址:https://www.cnblogs.com/0710whh/p/10567657.html
Copyright © 2011-2022 走看看