zoukankan      html  css  js  c++  java
  • java 两个csv文件数据去重

    1.pom.xml配置

    <dependency>
          <groupId>commons-io</groupId>
           <artifactId>commons-io</artifactId>
           <version>2.4</version>
    </dependency>

    2.实现

    package com.tangxin.kafka.service;
    
    import org.apache.commons.io.FileUtils;
    import org.apache.commons.io.LineIterator;
    import org.springframework.util.StringUtils;
    
    import java.io.*;
    import java.math.BigDecimal;
    import java.util.*;
    
    /**
     * 两个csv文件数据去重
     */
    /**
     * Deduplicates user ids across two large CSV files using in-memory Sets,
     * then writes the merged unique ids back out as paged CSV files.
     *
     * <p>Uses plain JDK I/O only (no commons-io / Spring needed): ids are
     * streamed line-by-line so memory stays proportional to the id sets,
     * not the file sizes.
     */
    public class CSVDeduplication {

        // Base directory for all input/output CSV files.
        // NOTE: was "I:\" in the scraped original, which is not a valid Java
        // string literal (\" escapes the closing quote) — must be "I:\\".
        private static final String CSV_PATH = "I:\\";

        /**
         * Reads the first column (the id) of every data row in the CSV at
         * {@code path}. Lines containing "id" are treated as headers and
         * skipped; surrounding double quotes are stripped from each id.
         *
         * @param path absolute path of the CSV file to read
         * @return ids in file order; empty list if the file cannot be read
         */
        public static List<String> ids(String path) {
            List<String> result = new ArrayList<>();
            // try-with-resources replaces the old LineIterator/closeQuietly pair.
            try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.trim().contains("id")) {
                        continue; // header row
                    }
                    String[] arr = line.split(",");
                    // replace(), not replaceAll(): we want a literal quote, not a regex.
                    String id = arr[0].replace("\"", "").trim();
                    result.add(id);
                }
            } catch (IOException e) {
                // Keep the original best-effort semantics (return what was read),
                // but don't swallow the error silently as the old empty catch did.
                e.printStackTrace();
            }
            return result;
        }

        public static void main(String[] args) throws Exception {
            String path1 = CSV_PATH + "100w.csv";
            String path2 = CSV_PATH + "300w.csv";

            Set<String> idSet1 = toNonEmptySet(ids(path1));
            Set<String> idSet2 = toNonEmptySet(ids(path2));

            System.out.println("用户100万=" + idSet1.size());
            System.out.println("用户300万=" + idSet2.size());
            BigDecimal b1 = new BigDecimal(idSet1.size());
            BigDecimal b2 = new BigDecimal(idSet2.size());
            BigDecimal b3 = b1.add(b2);
            System.out.println("用户100万和用户300万=" + b3.toString());

            // Union of both sets = the deduplicated id universe.
            Set<String> ids3 = new HashSet<>(idSet1);
            ids3.addAll(idSet2);
            System.out.println("用户100万和用户300万去重=" + ids3.size());

            // Duplicates = ids present in BOTH files, i.e. the intersection.
            // (The old code removed the union from each source list, which by
            // definition always left an empty result, so the printed duplicate
            // count was always 0.)
            Set<String> duplicates = new HashSet<>(idSet1);
            duplicates.retainAll(idSet2);
            System.out.println("用户100万和用户300万重复=" + duplicates.size());

            // Carve off the first 50k ids for the initial push batch.
            Set<String> fiveMillion = splitHeadData(ids3, 50000);
            System.out.println("5W用户推送数据:" + fiveMillion.size());
            createCSV(new ArrayList<>(fiveMillion), "5w.csv");

            System.out.println("剩余推送总数:" + ids3.size());
            System.out.println("============剩余总数每50w分页显示=================");

            // Remaining ids go out in 500k-per-file pages.
            List<List<String>> pageListTotal = pageList(ids3, 500000);
            for (int i = 0; i < pageListTotal.size(); i++) {
                createCSV(pageListTotal.get(i), "50w" + i + ".csv");
            }
        }

        /**
         * Copies the non-empty ids into a Set, dropping blanks and duplicates.
         * Replaces the Spring {@code StringUtils.isEmpty} loops so the class
         * only depends on the JDK.
         */
        private static Set<String> toNonEmptySet(List<String> ids) {
            Set<String> set = new HashSet<>();
            for (String id : ids) {
                if (id != null && !id.isEmpty()) {
                    set.add(id);
                }
            }
            return set;
        }

        /**
         * Removes and returns up to {@code size} elements from {@code mySet}.
         *
         * @param mySet source set; MUTATED — the returned elements are removed
         * @param size  maximum number of elements to take
         * @return the removed head elements
         */
        public static Set<String> splitHeadData(Set<String> mySet, int size) {
            Set<String> result = new HashSet<>();
            Iterator<String> iterator = mySet.iterator();
            while (iterator.hasNext() && result.size() < size) {
                result.add(iterator.next());
                iterator.remove(); // safe in-iteration removal; replaces removeAll pass
            }
            return result;
        }

        /**
         * Splits the ids into consecutive pages of at most {@code pageSize}
         * elements each (iteration order of {@code totalSet}).
         *
         * <p>Fixes an off-by-one in the original: {@code count > pageSize}
         * only rolled over after {@code pageSize + 1} elements, so every full
         * page was one element too large.
         *
         * @param totalSet ids to page; not modified
         * @param pageSize maximum elements per page
         * @return list of pages; last page may be smaller
         */
        public static List<List<String>> pageList(Set<String> totalSet, int pageSize) {
            List<List<String>> allIdList = new ArrayList<>();
            List<String> idList = new ArrayList<>();
            for (String id : totalSet) {
                if (idList.size() == pageSize) {
                    allIdList.add(idList);
                    idList = new ArrayList<>();
                }
                idList.add(id);
            }
            if (!idList.isEmpty()) {
                allIdList.add(idList);
            }
            return allIdList;
        }

        /**
         * Writes the ids as a one-column CSV (header "id") to
         * {@code CSV_PATH + fileName}. I/O errors are logged, not thrown.
         */
        public static void createCSV(List<String> list, String fileName) {
            File csvFile = new File(CSV_PATH + fileName);
            File parent = csvFile.getParentFile();
            if (parent != null && !parent.exists()) {
                parent.mkdirs();
            }
            // GB2312 so spreadsheet tools on zh-CN Windows read "," correctly.
            // try-with-resources also fixes the old finally block, which could
            // NPE on csvWriter.close() if the constructor had thrown.
            try (BufferedWriter csvWriter = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(csvFile), "GB2312"), 1024)) {
                // Header row, then one row per id.
                writeRow(Collections.<Object>singletonList("id"), csvWriter);
                for (String id : list) {
                    writeRow(Collections.<Object>singletonList(id), csvWriter);
                }
                csvWriter.flush();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        /**
         * Writes one CSV row: each cell double-quoted and comma-terminated,
         * then a newline. (The scraped original's quote literals were garbled
         * by HTML unescaping; these are the intended "\"" and "\",".)
         */
        private static void writeRow(List<Object> row, BufferedWriter csvWriter) throws IOException {
            for (Object data : row) {
                csvWriter.write("\"" + data + "\",");
            }
            csvWriter.newLine();
        }

    }

    3.开始的实现思路和后面的实现思路

    3.1 开始的实现思路

         读取文件1.csv(数据大概有100多万行),再读取文件2.csv(数据大概有300多万行),然后用100万和300万的数据逐条比较看哪些已经存在:两个for循环需要100万*300万=3万亿次比较,程序卡着不动,只好放弃了。

         然后想着用多线程把300万数据分页成每50万一组来跑,结果也还是跑得很慢。

    3.2 后面的实现思路

          代码就在上面,整体思路就是通过java的Set集合来去重复,因为java单个循环处理还是很快的,注意需要配置jvm参数来跑不然会内存溢出:

    VM options:

    -Xms1g -Xmx1g -XX:SurvivorRatio=2 -XX:+UseParallelGC
  • 相关阅读:
    TestLink学习六:TestLink1.9.13工作使用小结
    TestLink学习五:TestLink1.9.13和JIRA6.3.6的集成
    TestLink学习四:TestLink1.9.13使用说明
    TestLink学习三:发送邮件的两种配置方法
    TestLink学习二:Windows搭建TestLink环境
    TestLink学习一:Windows搭建Apache+MySQL+PHP环境
    Python:Ubuntu上使用pip安装opencv-python出现错误
    Python:Ubuntu上出现错误 Could not load dynamic library 'libnvinfer.so.6' / 'libnvinfer_plugin.so.6'
    mybatis-generator二次开发总结
    动态代理
  • 原文地址:https://www.cnblogs.com/fofawubian/p/7911164.html
Copyright © 2011-2022 走看看