zoukankan      html  css  js  c++  java
  • Html批量转csv

    不知道大家又没有遇到这样的问题,某些业务系统有导出数据功能,导出的数据都是存放在excel表格里面,需要批量转csv,

    但是这样的文件不是标准的excel文档,本质是html文档

    比如说,系统导出的文档是这样的

     从这里我们可以看出来,感觉就是一个普通的excel文档,通过office也能正常打开,但是你通过编写代码批量转csv的时候,就出问题

    我也是在无意中发现这不是标准的excle文档,我们通过文档编辑器打开试试

    这明显就是html文件,只能怪这个业务系统的开发人员不够严谨了,现在需要我们来解决这样的问题

    我们先在idea里面创建一个maven项目

    package com.gong;
    
    
    import java.io.*;
    import java.util.ArrayList;
    import java.util.LinkedList;
    import java.util.List;
    import java.util.Scanner;
    
    import org.apache.commons.lang.StringUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    /**
     * Jsoup解析html标签时类似于JQuery的一些符号
     *
     * @author chixh
     *
     */
    public class HtmlParser {
        protected List<List<String>> data = new LinkedList<List<String>>();
    
        /**
         * 获取value值
         *
         * @param e
         * @return
         */
        public static String getValue(Element e) {
            return e.attr("value");
        }
    
        /**
         * 获取
         * <tr>
         * 和
         * </tr>
         * 之间的文本
         *
         * @param e
         * @return
         */
        public static String getText(Element e) {
            return e.text();
        }
    
        /**
         * 识别属性id的标签,一般一个html页面id唯一
         *
         * @param body
         * @param id
         * @return
         */
        public static Element getID(String body, String id) {
            Document doc = Jsoup.parse(body);
            // 所有#id的标签
            Elements elements = doc.select("#" + id);
            // 返回第一个
            return elements.first();
        }
    
        /**
         * 识别属性class的标签
         *
         * @param body
         * @param class
         * @return
         */
        public static Elements getClassTag(String body, String classTag) {
            Document doc = Jsoup.parse(body);
            // 所有#id的标签
            return doc.select("." + classTag);
        }
    
        /**
         * 获取tr标签元素组
         *
         * @param e
         * @return
         */
        public static Elements getTR(Element e) {
            return e.getElementsByTag("tr");
        }
    
        /**
         * 获取td标签元素组
         *
         * @param e
         * @return
         */
        public static Elements getTD(Element e) {
            return e.getElementsByTag("td");
        }
        /**
         * 获取表元组
         * @param table
         * @return
         */
        public static List<List<String>> getTables(Element table){
            List<List<String>> data = new ArrayList<>();
    
            for (Element etr : table.select("tr")) {
                List<String> listh=new ArrayList<>();
                //获取表头
                for(Element eth : etr.select("th")){
                    String th=eth.text();
                    listh.add(th);
                }
                if(!listh.isEmpty()) {
                    data.add(listh);
                }
                List<String> list = new ArrayList<>();
                for (Element etd : etr.select("td")) {
                    String temp = etd.text();
                    //增加一行中的一列
                    list.add(temp);
                }
                //增加一行
                if(!list.isEmpty()) {
                    data.add(list);
                }
            }
            return data;
        }
        /**
         * 读html文件
         * @param fileName
         * @return
         */
        public static String readHtml(String fileName){
            FileInputStream fis = null;
            StringBuffer sb = new StringBuffer();
            try {
                fis = new FileInputStream(fileName);
                byte[] bytes = new byte[1024];
                while (-1 != fis.read(bytes)) {
                    sb.append(new String(bytes));
                }
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    fis.close();
                } catch (IOException e1) {
                    e1.printStackTrace();
                }
            }
            return sb.toString();
        }
    
        public static void getFileName(String inputexecl,String outputcsv){
           // Document doc2 = Jsoup.parse(readHtml("E:\datas\小组成员学习情况统计(11).xls"));
            String path = inputexecl;
            File f = new File(path);
            if (!f.exists()) {
                System.out.println(path + " not exists");
                return;
            }
    
            File fa[] = f.listFiles();//获取该目录下所有文件和目录的绝对路径
            for (int i = 0; i < fa.length; i++) {
                File fs = fa[i];
                if (fs.isDirectory()) {
                    System.out.println(fs.getName() + " [目录]");
                } else{
                    String filepath= String.valueOf(fs);
                    Document doc2 = Jsoup.parse(readHtml(filepath));
                    Element table = doc2.select("table").first();
                    //获取table表的内容,存放到List集合里面
                    List<List<String>> list = getTables(table);
                    for (List<String> list2 : list) {
                        for (String string : list2) {
                            System.out.print(string+",");
                        }
                        System.out.println();
                    }
                    String name= StringUtils.substringBeforeLast(fs.getName(),".");//获取文件名字部分
                    //String newFilePath="E:\datas\csv\小组成员学习.csv";
                    String newFilePath=outputcsv+name+".csv";
                    String savePath = newFilePath;
                    File saveCSV = new File(savePath);
                    String buffer="";
                    try {
                        if(!saveCSV.exists())
                            saveCSV.createNewFile();
                        OutputStreamWriter write = new OutputStreamWriter(new FileOutputStream(saveCSV ),"UTF-8");
                        BufferedWriter writer = new BufferedWriter(write);
                        for(int j=0;j<list.size();j++){
                            List<String> list1=new ArrayList<String>();
                            buffer=list.get(j).toString();
                            System.out.println(buffer);
                            buffer = buffer.substring(1, buffer.lastIndexOf("]")).toString();
                            list1.add(buffer);
                            writer.write(buffer);
                            writer.newLine();
                        }
                        writer.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
    
        }
        public static void main(String[] args) {
            System.out.println("请输入Execl数据所在路径");
            Scanner execl=new Scanner(System.in);
            String input=execl.nextLine(); //获取execl输入路径
            System.out.println("请输入csv文件数据的输出路径");
            Scanner csv=new Scanner(System.in);
            String  output = csv.nextLine();
            getFileName(input,output);
    
        }
    
    }

    pom.xml

    <?xml version="1.0" encoding="UTF-8"?>
    
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
      xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
      <modelVersion>4.0.0</modelVersion>
    
      <groupId>com.gong</groupId>
      <artifactId>csv</artifactId>
      <version>1.0-SNAPSHOT</version>
    
      <name>csv</name>
      <!-- FIXME change it to the project's website -->
      <url>http://www.example.com</url>
    
      <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
      </properties>
    
      <dependencies>
        <dependency>
          <groupId>junit</groupId>
          <artifactId>junit</artifactId>
          <version>4.11</version>
          <scope>test</scope>
        </dependency>
        <dependency>
                     <groupId>net.sf.opencsv</groupId>
                     <artifactId>opencsv</artifactId>
                     <version>2.1</version>
                 </dependency>
               <dependency>
                    <groupId>org.apache.poi</groupId>
                   <artifactId>ooxml-schemas</artifactId>
                    <version>1.1</version>
                   <type>pom</type>
                </dependency>
               <dependency>
                    <groupId>org.apache.poi</groupId>
                    <artifactId>poi</artifactId>
                    <version>3.7</version>
               </dependency>
               <dependency>
                    <groupId>org.apache.poi</groupId>
                    <artifactId>ooxml-schemas</artifactId>
                    <version>1.1</version>
                </dependency>
                <dependency>
                    <groupId>org.apache.poi</groupId>
                    <artifactId>poi-ooxml</artifactId>
                    <version>3.7</version>
                </dependency>
                <dependency>
                    <groupId>dom4j</groupId>
                    <artifactId>dom4j</artifactId>
                    <version>1.6.1</version>
                </dependency>
          <!-- https://mvnrepository.com/artifact/net.sourceforge.jexcelapi/jxl -->
          <dependency>
              <groupId>net.sourceforge.jexcelapi</groupId>
              <artifactId>jxl</artifactId>
              <version>2.6.12</version>
          </dependency>
          <dependency>
              <groupId>commons-lang</groupId>
              <artifactId>commons-lang</artifactId>
              <version>2.6</version>
          </dependency>
          <dependency>
              <groupId>org.jsoup</groupId>
              <artifactId>jsoup</artifactId>
              <version>1.11.3</version>
          </dependency>
    
      </dependencies>
    
      <build>
        <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
          <plugins>
            <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
            <plugin>
              <artifactId>maven-clean-plugin</artifactId>
              <version>3.1.0</version>
            </plugin>
            <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
            <plugin>
              <artifactId>maven-resources-plugin</artifactId>
              <version>3.0.2</version>
            </plugin>
            <plugin>
              <artifactId>maven-compiler-plugin</artifactId>
              <version>3.8.0</version>
            </plugin>
            <plugin>
              <artifactId>maven-surefire-plugin</artifactId>
              <version>2.22.1</version>
            </plugin>
            <plugin>
              <artifactId>maven-jar-plugin</artifactId>
              <version>3.0.2</version>
            </plugin>
            <plugin>
              <artifactId>maven-install-plugin</artifactId>
              <version>2.5.2</version>
            </plugin>
            <plugin>
              <artifactId>maven-deploy-plugin</artifactId>
              <version>2.8.2</version>
            </plugin>
            <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
            <plugin>
              <artifactId>maven-site-plugin</artifactId>
              <version>3.7.1</version>
            </plugin>
            <plugin>
              <artifactId>maven-project-info-reports-plugin</artifactId>
              <version>3.0.0</version>
            </plugin>
          </plugins>
    
        </pluginManagement>
      </build>
    </project>

    运行分别输入excel文档的目录和csv的输出目录就可以了,在这里提醒一下大家,如果使用我这段代码的话,excel文档的数据文件不能带有其他类型的文件。

  • 相关阅读:
    回车执行函数
    ajax短信验证码-mvc
    css3背景及字体渐变
    MVC3-表单
    Layout布局
    【leetcode】两数之和
    C语言如何开发简单的插件
    Google Supersonic列存储查询库的介绍、安装、测试
    vm网络设置
    centos升级支持到C++11, gcc4.8.2
  • 原文地址:https://www.cnblogs.com/braveym/p/13672867.html
Copyright © 2011-2022 走看看