小文件可以一次性全部加载到内存中去重,大文件则不行(内存放不下)。
大文件去重步骤:
- 定义单个小文件的大小,并据此计算需要拆分出的小文件总数
- 按 key 的哈希值对小文件总数取模,把大文件的每一行分配到对应的小文件(相同 key 的行一定落在同一个小文件)
- 小文件去重
- 小文件合并为大文件
代码:
package com.util.file;

import com.util.FileUtils;
import lombok.Builder;
import lombok.SneakyThrows;
import org.apache.commons.lang3.StringUtils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * De-duplicates the lines of a file that is too large to be loaded into memory at once.
 *
 * <p>The file is split into bucket files by the hash of each line's key, so lines with equal
 * keys always land in the same bucket. Every bucket is then de-duplicated in memory and the
 * buckets are concatenated back over the original file. Within a bucket the surviving lines
 * are ordered by key, so the original line order is NOT preserved.
 *
 * <p>An instance keeps its bucket list as state while {@link #uniq(File)} runs, so a single
 * instance must not run {@code uniq} from several threads at the same time.
 *
 * @author AlbertXe
 * @create 2020-10-21 10:23
 */
@Builder(toBuilder = true)
public class BigFileDistinctUtil {
    /** Approximate target size of one bucket file in bytes (default 1 MiB); must be positive. */
    @Builder.Default
    private int size = 1 << 20;

    /** Charset name used to read the source file and to read/write the bucket files. */
    @Builder.Default
    private String charset = "gbk";

    /** Bucket files of the run in progress; emptied again when {@link #uniq(File)} returns. */
    @Builder.Default
    private List<File> tempFiles = new CopyOnWriteArrayList<>();

    /**
     * Extracts the de-duplication key from a line. Defaults to the whole line.
     * The function must be deterministic and must not return {@code null}.
     */
    @Builder.Default
    private Function<String, String> function = Function.identity();

    /**
     * De-duplicates {@code file} in place: split into buckets, de-duplicate each bucket,
     * merge the buckets back over {@code file}, delete the buckets.
     *
     * <p>If the final merge fails, the bucket files are deliberately left on disk, because at
     * that point the original file has already been truncated and the buckets are the only
     * remaining copy of the data.
     *
     * @param file the file to de-duplicate; it is overwritten with the result
     * @throws RuntimeException      if {@code file} does not exist
     * @throws IllegalStateException if the key function was explicitly set to {@code null}
     */
    public void uniq(File file) {
        if (!file.exists()) {
            throw new RuntimeException("文件不存在");
        }
        if (function == null) {
            throw new IllegalStateException("function must not be null");
        }
        boolean mergeFailed = false;
        try {
            division(file);
            tempFiles.parallelStream().forEach(this::uniqLittleFile);
            try {
                FileUtils.combineFiles(tempFiles, file);
            } catch (Throwable t) {
                mergeFailed = true;
                throw t;
            }
        } finally {
            if (!mergeFailed) {
                for (File bucket : tempFiles) {
                    bucket.delete();
                }
            }
            // Forget this run's buckets so the instance can be reused for another file.
            tempFiles.clear();
        }
    }

    /**
     * De-duplicates one bucket file in place, keeping the first line seen for every key.
     *
     * @param file a bucket file small enough to be held in memory
     */
    private void uniqLittleFile(File file) {
        List<String> lines = FileUtils.lines(file, charset);
        // TreeMap keeps the key ordering of the original TreeSet-based implementation,
        // while evaluating the key function only once per line.
        Map<String, String> firstLineByKey = new TreeMap<>();
        for (String line : lines) {
            firstLineByKey.putIfAbsent(function.apply(line), line);
        }
        // Terminate every line, including blank ones, so that concatenating the buckets
        // can never glue the last line of one bucket to the first line of the next.
        StringBuilder content = new StringBuilder();
        for (String line : firstLineByKey.values()) {
            content.append(line).append(System.lineSeparator());
        }
        FileUtils.write(file, content.toString(), charset);
    }

    /**
     * Splits {@code file} into {@code length / size + 1} bucket files located next to it and
     * registers them in {@link #tempFiles}. A line goes to the bucket selected by the
     * non-negative hash of its key modulo the bucket count.
     *
     * <p>NOTE: all bucket writers are open at the same time, so a very small {@code size}
     * relative to the file length can exceed the process's open-file limit.
     *
     * @param file the source file to split
     */
    @SneakyThrows
    private void division(File file) {
        if (size <= 0) {
            throw new IllegalArgumentException("size must be positive: " + size);
        }
        long length = file.length();
        // toIntExact fails loudly instead of silently wrapping to a bogus bucket count.
        int fileSum = Math.toIntExact(length / size + 1);
        // Resolve against the absolute path: getParent() is null for a bare relative name
        // and would otherwise produce paths such as "null/a.txt_0.temp".
        File dir = file.getAbsoluteFile().getParentFile();

        List<BufferedWriter> writers = new ArrayList<>(fileSum);
        Throwable primary = null;
        IOException closeFailure = null;
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), charset))) {
            for (int i = 0; i < fileSum; i++) {
                File bucket = new File(dir, file.getName() + "_" + i + ".temp");
                // Register first so the bucket is cleaned up even if opening it fails.
                tempFiles.add(bucket);
                writers.add(new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(bucket), charset)));
            }
            String line;
            while ((line = reader.readLine()) != null) {
                String key = function.apply(line);
                // Mask the sign bit so the modulo result is a valid index.
                int hash = key.hashCode() & Integer.MAX_VALUE;
                BufferedWriter writer = writers.get(hash % fileSum);
                writer.write(line);
                writer.newLine();
            }
        } catch (Throwable t) {
            primary = t;
            throw t;
        } finally {
            // close() flushes buffered lines; a failure here means data was lost and must
            // not be swallowed, otherwise the source would be overwritten with less data.
            for (BufferedWriter writer : writers) {
                try {
                    writer.close();
                } catch (IOException e) {
                    if (primary != null) {
                        primary.addSuppressed(e);
                    } else if (closeFailure == null) {
                        closeFailure = e;
                    } else {
                        closeFailure.addSuppressed(e);
                    }
                }
            }
        }
        if (closeFailure != null) {
            throw closeFailure;
        }
    }
}
package com.util;

import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;

import javax.swing.filechooser.FileSystemView;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.FileSystems;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.List;

/**
 * Assorted file helpers: copying, creating, searching, reading, writing and merging files.
 *
 * @author AlbertXe
 * @date 2019-11-21 21:20
 */
@Slf4j
public class FileUtils {

    private static final long KB = 1024L;
    private static final long MB = KB * 1024;
    private static final long GB = MB * 1024;

    /** Demo entry point: prints the file-system roots and their sizes. */
    public static void main(String[] args) {
        getCD();
    }

    /**
     * Copies a file using NIO channels and an intermediate heap buffer.
     *
     * @param source path of the file to read
     * @param target path of the file to create or overwrite
     * @throws IOException if reading or writing fails
     */
    public static void copyNIO(String source, String target) throws IOException {
        try (FileInputStream is = new FileInputStream(source);
             FileOutputStream os = new FileOutputStream(target)) {
            FileChannel inChannel = is.getChannel();
            FileChannel outChannel = os.getChannel();
            ByteBuffer buffer = ByteBuffer.allocate(4096);
            while (inChannel.read(buffer) != -1) {
                buffer.flip();
                // A single write() may consume only part of the buffer; drain it fully
                // before clear() discards whatever is left.
                while (buffer.hasRemaining()) {
                    outChannel.write(buffer);
                }
                buffer.clear();
            }
        }
    }

    /**
     * Returns the file at {@code path}, creating it (and any missing parent directories)
     * when it does not exist yet.
     *
     * @param path path of the file
     * @return the existing or newly created file
     */
    @SneakyThrows
    public static File getFile(String path) {
        File file = new File(path);
        if (file.exists()) {
            return file;
        }
        // getParentFile() is null for a bare relative name; resolve it against the
        // absolute path so such names do not fail with a NullPointerException.
        File parent = file.getAbsoluteFile().getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }
        file.createNewFile();
        return file;
    }

    /**
     * Prints the display name, total space and free space of every file-system root.
     */
    public static void getCD() {
        FileSystemView view = FileSystemView.getFileSystemView();
        for (File root : File.listRoots()) {
            System.out.println("盘符:" + view.getSystemDisplayName(root));
            System.out.println("总大小:" + formatSize(root.getTotalSpace()));
            System.out.println("剩余大小:" + formatSize(root.getFreeSpace()));
        }
    }

    /**
     * Formats a byte count with two decimals and a unit suffix ({@code B}, {@code KB},
     * {@code M} or {@code G}), using binary (1024-based) units.
     *
     * @param size size in bytes
     * @return the formatted size, e.g. {@code "1.50KB"}
     */
    public static String formatSize(long size) {
        // "0.00" (rather than "#.00") keeps the leading zero, so 0 renders as "0.00B".
        DecimalFormat df = new DecimalFormat("0.00");
        if (size < KB) {
            return df.format(size) + "B";
        }
        if (size < MB) {
            return df.format((double) size / KB) + "KB";
        }
        if (size < GB) {
            return df.format((double) size / MB) + "M";
        }
        return df.format((double) size / GB) + "G";
    }

    // Glob syntax ("glob:" prefix)
    // A pattern language similar to, but simpler than, regular expressions; it is matched
    // against the string form of a path.
    //
    // glob:*.java          matches file names ending in .java
    // glob:*.*             matches file names containing a '.'
    // glob:*.{java,class}  matches file names ending in .java or .class
    // glob:foo.?           matches file names starting with "foo." and a one-character extension
    // glob:/home/*/*       on UNIX matches e.g. /home/gus/data
    // glob:/home/**        on UNIX matches e.g. /home/gus and /home/gus/data
    // glob:c:\\*           on Windows matches e.g. c:\foo and c:\bar (note the string escaping)
    //
    // Rules
    // *      matches zero or more characters of a name component, without crossing directories
    // **     matches zero or more characters, crossing directory boundaries
    // ?      matches exactly one character of a name component
    // \      escapes the next character, e.g. \{ matches a literal left brace
    // []     bracket expression; '-' gives a range: [ABC] matches A, B or C; [a-z] matches
    //        a through z; [abce-g] matches a, b, c, e, f or g
    // [!...] matches any character not listed, e.g. [!a-c] matches anything except a, b, c
    // {}     matches any of the comma-separated sub-patterns; groups cannot be nested
    //
    // Regex syntax ("regex:" prefix)
    // Uses the regular expressions supported by java.util.regex.Pattern.

    /**
     * Walks {@code dir} recursively and collects every file whose full path matches the
     * given pattern.
     *
     * @param dir  directory to walk
     * @param glob pattern in {@code syntax:pattern} form ({@code glob:...} or
     *             {@code regex:...}); a pattern without one of these prefixes is treated
     *             as a glob
     * @return the matching paths as strings, in visiting order
     * @throws IOException if walking the tree fails
     */
    public static List<String> listFiles(String dir, String glob) throws IOException {
        boolean hasSyntax = glob.regionMatches(true, 0, "glob:", 0, 5)
                || glob.regionMatches(true, 0, "regex:", 0, 6);
        String syntaxAndPattern = hasSyntax ? glob : "glob:" + glob;
        PathMatcher pathMatcher = FileSystems.getDefault().getPathMatcher(syntaxAndPattern);

        List<String> files = new ArrayList<>();
        Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor<Path>() {
            // The matcher sees the whole path, so "glob:*.jar" does not match
            // "/usr/web/bbf.jar" (a single '*' never crosses a directory); use "glob:**.jar".
            // Likewise Path.endsWith only matches whole name elements:
            // path.endsWith("bbf.jar") is true, path.endsWith(".jar") is false.
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                if (pathMatcher.matches(file)) {
                    files.add(file.toString());
                    log.debug("matched file: {}", file);
                }
                return FileVisitResult.CONTINUE;
            }
        });
        return files;
    }

    /**
     * Reads all lines of a file into memory (delegates to commons-io).
     *
     * @param file    file to read
     * @param charset name of the charset to decode with
     * @return the lines of the file
     */
    @SneakyThrows
    public static List<String> lines(File file, String charset) {
        return org.apache.commons.io.FileUtils.readLines(file, charset);
    }

    /**
     * Writes {@code content} to {@code file} (delegates to commons-io).
     *
     * @param file    file to write
     * @param content text to write
     * @param charset name of the charset to encode with
     */
    @SneakyThrows
    public static void write(File file, String content, String charset) {
        org.apache.commons.io.FileUtils.write(file, content, charset);
    }

    /**
     * Concatenates the non-empty {@code files}, in list order, into {@code outFile}.
     * {@code outFile} is truncated first, so it must not be one of the inputs.
     *
     * @param files   files to append
     * @param outFile file that receives the concatenation
     */
    @SneakyThrows
    public static void combineFiles(List<File> files, File outFile) {
        try (FileOutputStream fos = new FileOutputStream(outFile);
             FileChannel fosChannel = fos.getChannel()) {
            for (File file : files) {
                if (file.length() == 0) {
                    continue;
                }
                try (FileInputStream fis = new FileInputStream(file);
                     FileChannel fisChannel = fis.getChannel()) {
                    long sizeBefore = fosChannel.size();
                    long size = fisChannel.size();
                    // transferTo may move fewer bytes than requested (notably for sources
                    // above 2 GB), so keep going until the whole source has been copied.
                    long position = 0;
                    while (position < size) {
                        long transferred = fisChannel.transferTo(position, size - position, fosChannel);
                        if (transferred <= 0) {
                            throw new IOException("Could not transfer the remaining "
                                    + (size - position) + " bytes of " + file);
                        }
                        position += transferred;
                    }
                    log.info("合并源:{}文件到:{}最终文件,最终文件合并前大小:{}.合并后大小:{},源文件大小:{}",
                            file, outFile, sizeBefore, fosChannel.size(), size);
                }
            }
        }
    }

    /**
     * Copies {@code tempFile} over {@code file} (delegates to commons-io).
     *
     * @param tempFile source file
     * @param file     destination file
     */
    @SneakyThrows
    public static void copyFile(File tempFile, File file) {
        org.apache.commons.io.FileUtils.copyFile(tempFile, file);
    }
}
测试:
package com.util.file;

import org.junit.Assert;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * Tests for {@link BigFileDistinctUtil}.
 *
 * @author AlbertXe
 * @create 2020-10-21 14:06
 */
public class BigFileDistinctUtilTest {

    /**
     * Writes a small file containing duplicate lines, de-duplicates it in place and checks
     * that exactly one copy of every distinct line is left.
     */
    @Test
    public void test() throws IOException {
        // Same charset as the utility's default, so the lines round-trip unchanged.
        Charset gbk = Charset.forName("gbk");
        // A self-created temp file keeps the test independent of any machine-specific path.
        Path path = Files.createTempFile("big-file-distinct", ".txt");
        try {
            Files.write(path, Arrays.asList("b", "a", "c", "a", "b"), gbk);
            File file = path.toFile();

            BigFileDistinctUtil build = BigFileDistinctUtil.builder().function(t -> t).build();
            build.uniq(file);

            // The utility does not promise an output order, so compare sorted content.
            List<String> actual = new ArrayList<>(Files.readAllLines(path, gbk));
            Collections.sort(actual);
            Assert.assertEquals(Arrays.asList("a", "b", "c"), actual);
        } finally {
            Files.deleteIfExists(path);
        }
    }
}