zoukankan      html  css  js  c++  java
  • 大文件去重

    小文件可以加载到内存一块去重, 大文件不行.

    大文件去重步骤:

    1. 定义小文件大小 计算小文件总数
    2. 大文件按规则拆分到小文件(按去重键的哈希值对小文件总数取模, 保证相同的行一定落入同一个小文件)
    3. 小文件去重 
    4. 小文件合并为大文件

    代码:

    package com.util.file;
    
    import com.util.FileUtils;
    import lombok.Builder;
    import lombok.SneakyThrows;
    import org.apache.commons.lang3.StringUtils;
    
    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;
    import java.util.TreeSet;
    import java.util.concurrent.CopyOnWriteArrayList;
    import java.util.function.Function;
    import java.util.stream.Collectors;
    
    /**
     * De-duplicates the lines of a file that is too large to load into memory.
     *
     * <p>The file is split into bucket files by the hash of each line's key, every bucket is
     * de-duplicated in memory, and the buckets are then merged back over the original file.
     * Lines with equal keys always land in the same bucket, so per-bucket de-duplication is
     * sufficient. The original line order is NOT preserved: the output is grouped by bucket and
     * sorted by key inside each bucket.
     *
     * <p>NOTE(review): one writer is kept open per bucket while splitting, so a very large
     * file combined with a small {@code size} may hit the process's open-file limit.
     *
     * @author AlbertXe
     */
    @Builder(toBuilder = true)
    public class BigFileDistinctUtil {
        /** Target size of one bucket file in bytes; used only to derive the number of buckets. */
        @Builder.Default
        private int size = 1 << 20;
        /** Charset name used to read the input and to write bucket files. */
        @Builder.Default
        private String charset = "gbk";
        /** Bucket files created by the run in progress; emptied again when the run finishes. */
        @Builder.Default
        private List<File> tempFiles = new CopyOnWriteArrayList<>();
        /** Maps a line to its de-duplication key; when {@code null} the whole line is the key. */
        private Function<String, String> function;


        /**
         * De-duplicates {@code file} in place.
         *
         * @param file the file to de-duplicate; it is overwritten with the result
         * @throws RuntimeException if {@code file} does not exist
         * @throws IllegalArgumentException if the configured {@code size} is not positive
         */
        public void uniq(File file) {
            if (!file.exists()) {
                throw new RuntimeException("文件不存在");
            }
            try {
                // split into buckets -> de-duplicate each bucket -> merge back
                division(file);
                tempFiles.parallelStream().forEach(this::uniqLittleFile);
                FileUtils.combineFiles(tempFiles, file);
            } finally {
                // Always remove the bucket files, and forget them so that this instance can be
                // reused without re-processing files that no longer exist.
                tempFiles.forEach(File::delete);
                tempFiles.clear();
            }
        }

        /**
         * Returns the de-duplication key of {@code line}.
         */
        private String keyOf(String line) {
            return function == null ? line : function.apply(line);
        }

        /**
         * De-duplicates one bucket file in memory and rewrites it in place.
         *
         * <p>For lines sharing a key, the first one read is kept. Every kept line is written
         * with a trailing line separator, so that merged buckets never run two lines together.
         */
        private void uniqLittleFile(File file) {
            List<String> lines = FileUtils.lines(file, charset);
            Comparator<String> byKey = Comparator.comparing(this::keyOf);
            TreeSet<String> unique = new TreeSet<>(byKey);
            unique.addAll(lines);

            String separator = System.lineSeparator();
            StringBuilder content = new StringBuilder();
            for (String line : unique) {
                content.append(line).append(separator);
            }
            // All lines are already in memory, so the bucket can be overwritten directly.
            FileUtils.write(file, content.toString(), charset);
        }

        /**
         * Splits {@code file} into bucket files next to it, routing each line by the hash of
         * its key, and records the bucket files in {@code tempFiles}.
         */
        @SneakyThrows
        private void division(File file) {
            if (size <= 0) {
                throw new IllegalArgumentException("size must be positive: " + size);
            }
            // toIntExact fails loudly instead of silently overflowing the bucket count.
            int fileSum = Math.toIntExact(file.length() / size + 1);
            // getParent() is null for a bare relative name such as "a.txt"; resolve it first.
            File dir = file.getAbsoluteFile().getParentFile();
            List<BufferedWriter> writers = new ArrayList<>(fileSum);
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset))) {
                for (int i = 0; i < fileSum; i++) {
                    String tempPath = new File(dir, file.getName() + "_" + i + ".temp").getPath();
                    File tempFile = FileUtils.getFile(tempPath);
                    tempFiles.add(tempFile);
                    writers.add(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tempFile), charset)));
                }
                String separator = System.lineSeparator();
                String line;
                while ((line = reader.readLine()) != null) {
                    // Mask the sign bit so the bucket index is never negative.
                    int hash = keyOf(line).hashCode() & Integer.MAX_VALUE;
                    BufferedWriter writer = writers.get(hash % fileSum);
                    writer.write(line);
                    writer.write(separator);
                }
            } finally {
                // close() flushes buffered lines; a failure here means data was lost, so it
                // must abort the run instead of being printed and ignored.
                IOException closeFailure = null;
                for (BufferedWriter writer : writers) {
                    try {
                        writer.close();
                    } catch (IOException e) {
                        if (closeFailure == null) {
                            closeFailure = e;
                        } else {
                            closeFailure.addSuppressed(e);
                        }
                    }
                }
                if (closeFailure != null) {
                    throw closeFailure;
                }
            }
        }
    }
    package com.util;
    
    import lombok.SneakyThrows;
    import lombok.extern.slf4j.Slf4j;
    
    import javax.swing.filechooser.FileSystemView;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.nio.ByteBuffer;
    import java.nio.channels.FileChannel;
    import java.nio.file.FileSystems;
    import java.nio.file.FileVisitResult;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.PathMatcher;
    import java.nio.file.Paths;
    import java.nio.file.SimpleFileVisitor;
    import java.nio.file.attribute.BasicFileAttributes;
    import java.text.DecimalFormat;
    import java.util.ArrayList;
    import java.util.List;
    
    /**
     * Static helpers for copying, creating, listing, reading, writing and merging files.
     *
     * @author AlbertXe
     * @date 2019-11-21 21:20
     */
    @Slf4j
    public class FileUtils {

        private static final long KB = 1024L;
        private static final long MB = KB * 1024;
        private static final long GB = MB * 1024;

        public static void main(String[] args) {
            getCD();
        }

        /**
         * Copies a file using NIO channels.
         *
         * @param source path of the file to read
         * @param target path of the file to write; created or truncated
         * @throws IOException if either file cannot be opened, read or written
         */
        public static void copyNIO(String source, String target) throws IOException {
            try (FileInputStream is = new FileInputStream(source);
                 FileOutputStream os = new FileOutputStream(target);
                 FileChannel inChannel = is.getChannel();
                 FileChannel outChannel = os.getChannel()) {
                ByteBuffer buffer = ByteBuffer.allocate(4096);
                while (inChannel.read(buffer) != -1) {
                    buffer.flip();
                    // write() may consume only part of the buffer; drain it before clearing.
                    while (buffer.hasRemaining()) {
                        outChannel.write(buffer);
                    }
                    buffer.clear();
                }
            }
        }


        /**
         * Returns the file at {@code path}, creating it (and any missing parent directories)
         * as an empty file when it does not exist yet.
         *
         * @param path path of the file
         * @return the existing or newly created file
         * @throws IOException (sneaky-thrown) if the file cannot be created
         */
        @SneakyThrows
        public static File getFile(String path) {
            File file = new File(path);
            if (file.exists()) {
                return file;
            }
            // A bare file name has no parent component; there is nothing to create then.
            File parent = file.getParentFile();
            if (parent != null) {
                parent.mkdirs();
            }
            file.createNewFile();
            return file;
        }

        /**
         * Prints the display name, total size and free size of every file-system root.
         */
        public static void getCD() {
            FileSystemView view = FileSystemView.getFileSystemView();

            File[] roots = File.listRoots();
            if (roots == null) {
                // listRoots() returns null when the set of roots cannot be determined.
                return;
            }
            for (File root : roots) {
                System.out.println("盘符:" + view.getSystemDisplayName(root));
                System.out.println("总大小:" + formatSize(root.getTotalSpace()));
                System.out.println("剩余大小:" + formatSize(root.getFreeSpace()));
            }
        }

        /**
         * Formats a byte count with two decimals and a unit suffix (B, KB, M or G).
         *
         * @param size number of bytes
         * @return the formatted size, e.g. {@code "1.50KB"}
         */
        public static String formatSize(long size) {
            // "0.00" keeps the leading zero; "#.00" would render 0 as ".00".
            DecimalFormat df = new DecimalFormat("0.00");
            if (size < KB) {
                return df.format(size) + "B";
            } else if (size < MB) {
                return df.format((double) size / KB) + "KB";
            } else if (size < GB) {
                return df.format((double) size / MB) + "M";
            } else {
                return df.format((double) size / GB) + "G";
            }
        }
    //    Glob rules ("glob:" syntax)
    //    Matches path strings with a pattern that resembles a regular expression but has a
    //    simpler syntax.
    //
    //    glob:*.java          matches files ending in java
    //    glob:*.*             matches files containing a '.'
    //    glob:*.{java,class}  matches files ending in java or class
    //    glob:foo.?           matches files starting with foo and a one-character extension
    //    glob:/home/*/*       on unix platforms matches e.g. /home/gus/data
    //    glob:/home/**        on unix platforms matches e.g. /home/gus, /home/gus/data
    //    glob:c:\\*           on windows platforms matches e.g. c:\foo, c:\bar (note the string escaping)
    //    Rule details
    //    *      matches zero or more characters of a name component, without crossing directories
    //    **     matches zero or more characters of name components, crossing directories (sub-directories included)
    //    ?      matches exactly one character of a name component
    //    \      escape character, e.g. \{ matches a literal left brace
    //    []     matches a range inside a bracket expression; a hyphen (-) specifies a range. E.g. [ABC] matches "A", "B" and "C"; [a-z] matches "a" through "z"; [abce-g] matches "a", "b", "c", "e", "f", "g"
    //    [!...] matches any character outside the range, e.g. [!a-c] matches any character except "a", "b", "c"
    //    {}     matches any sub-pattern in the group; sub-patterns are separated by "," and cannot be nested
    //    Regex rules ("regex:" syntax)
    //    Uses the regular expressions supported by java.util.regex.Pattern.

        /**
         * Walks a directory tree and collects the files accepted by a path matcher.
         *
         * @param dir  root directory of the walk
         * @param glob full {@code syntax:pattern} string, e.g. {@code "glob:**.java"}; the
         *             matcher is applied to each visited file's path as produced by the walk
         * @return the matching files' paths as strings
         * @throws IOException if the walk fails
         */
        public static List<String> listFiles(String dir, String glob) throws IOException {
            List<String> files = new ArrayList<>();
            PathMatcher pathMatcher = FileSystems.getDefault().getPathMatcher(glob);

            Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor<Path>() {

                // Path.endsWith only matches whole path elements:
                //     Path path = Paths.get("/usr/web/bbf.jar");
                //     path.endsWith("bbf.jar");  // true
                //     path.endsWith(".jar");     // false
                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                    if (pathMatcher.matches(file)) {
                        files.add(file.toString());
                        log.debug("{}", file);
                    }
                    return FileVisitResult.CONTINUE;
                }
            });
            return files;
        }

        /**
         * Reads all lines of {@code file} using the given charset name (delegates to commons-io).
         */
        @SneakyThrows
        public static List<String> lines(File file, String charset) {
            return org.apache.commons.io.FileUtils.readLines(file, charset);
        }

        /**
         * Writes {@code content} to {@code file} using the given charset name (delegates to commons-io).
         */
        @SneakyThrows
        public static void write(File file, String content, String charset) {
            org.apache.commons.io.FileUtils.write(file, content, charset);
        }

        /**
         * Concatenates the non-empty {@code files}, in list order, into {@code outFile}.
         *
         * @param files   source files to append
         * @param outFile destination; created or truncated before merging
         * @throws IOException (sneaky-thrown) if a file cannot be opened or transferred
         */
        @SneakyThrows
        public static void combineFiles(List<File> files, File outFile) {

            try (FileOutputStream fos = new FileOutputStream(outFile);
                 FileChannel fosChannel = fos.getChannel()) {
                for (File file : files) {
                    if (file.length() > 0) {
                        try (FileInputStream fis = new FileInputStream(file);
                             FileChannel fisChannel = fis.getChannel()) {
                            long size = fisChannel.size();
                            long sizeBefore = fosChannel.size();
                            // A single transferTo call may move fewer bytes than requested
                            // (e.g. it can stop around 2G), so repeat until all are copied.
                            long position = 0;
                            while (position < size) {
                                long transferred = fisChannel.transferTo(position, size - position, fosChannel);
                                if (transferred <= 0) {
                                    throw new IOException("transfer stalled at byte " + position + " of " + file);
                                }
                                position += transferred;
                            }
                            log.info("合并源:{}文件到:{}最终文件,最终文件合并前大小:{}.合并后大小:{},源文件大小:{}", file, outFile, sizeBefore, fosChannel.size(), file.length()
                            );
                        }
                    }
                }
            }

        }

        /**
         * Copies {@code tempFile} over {@code file} (delegates to commons-io).
         */
        @SneakyThrows
        public static void copyFile(File tempFile, File file) {
            org.apache.commons.io.FileUtils.copyFile(tempFile, file);
        }
    }

    测试:

    package com.util.file;
    
    import org.junit.Test;
    
    import java.io.File;
    
    /**
     * Smoke test for {@link BigFileDistinctUtil}: de-duplicates a local file using the whole
     * line as the key.
     *
     * @author AlbertXe
     */
    public class BigFileDistinctUtilTest {

        @Test
        public void test() {
            BigFileDistinctUtil util = BigFileDistinctUtil.builder()
                    .function(line -> line)
                    .build();
            util.uniq(new File("d:/a.txt"));
        }

    }
  • 相关阅读:
    WYT的刷子
    小烈送菜
    猴腮雷
    基于Docker的Mysql主从复制搭建
    C#集合类型大揭秘
    ASP.NET三剑客 HttpApplication HttpModule HttpHandler 解析
    使用缓存的正确姿势
    【模块化那些事】 拆散的模块化
    分享一个开源的网盘下载工具BaiduPCS-Go
    【抽象那些事】不必要的抽象
  • 原文地址:https://www.cnblogs.com/albertXe/p/13859763.html
Copyright © 2011-2022 走看看