zoukankan      html  css  js  c++  java
  • hadoop 将HDFS上多个小文件合并到SequenceFile里

    背景:HDFS上的文件大小最好是HDFS块大小的N倍。如果文件太小,会浪费NameNode的元数据存储空间以及内存;如果文件分块不合理,也会影响MapReduce中map的效率。

    本例中将小文件的文件名作为key,其内容作为value生成SequenceFile

    1、生成文件

     //将目标目录的所有文件以文件名为key,内容为value放入SequenceFile中
        //第一个参数是需要打包的目录,第二个参数是生成的文件路径和名称
        private static void combineToSequenceFile(String[] args) throws IOException {
            String sourceDir = args[0];
            String destFile = args[1];
    
            List<String> files = getFiles(sourceDir);
    
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(conf);
            Path destPath = new Path(destFile);
            if (fs.exists(destPath)) {
                fs.delete(destPath, true);
            }
    
            FSDataInputStream in = null;
    
            Text key = new Text();
            BytesWritable value = new BytesWritable();
    
            byte[] buff = new byte[4096];
            SequenceFile.Writer writer = null;
    
            SequenceFile.Writer.Option option1 = SequenceFile.Writer.file(new Path(destFile));
            SequenceFile.Writer.Option option2 = SequenceFile.Writer.keyClass(key.getClass());
            SequenceFile.Writer.Option option3 = SequenceFile.Writer.valueClass(value.getClass());
            SequenceFile.Writer.Option option4 = SequenceFile.Writer.compression(SequenceFile.CompressionType.RECORD);
            try {
                writer = SequenceFile.createWriter(conf, option1, option2, option3, option4);
                for (int i = 0; i < files.size(); i++) {
                    Path path = new Path(files.get(i).toString());
                    System.out.println("读取文件:" + path.toString());
                    key = new Text(files.get(i).toString());
                    in = fs.open(path);
    //                只能处理小文件,int最大只能表示到1个G的大小,实际上大文件放入SequenceFile也没有意义
                    int length = (int) fs.getFileStatus(path).getLen();
                    byte[] bytes = new byte[length];
    //                read最多只能读取65536的大小
                    int readLength = in.read(buff);
                    int offset = 0;
                    while (readLength > 0) {
                        System.arraycopy(buff, 0, bytes, offset, readLength);
                        offset += readLength;
                        readLength = in.read(buff);
                    }
                    System.out.println("file length:" + length + ",read length:" + offset);
                    value = new BytesWritable(bytes);
                    System.out.printf("[%s]	%s	%s
    ", writer.getLength(), key, value.getLength());
                    writer.append(key, value);
                }
            } finally {
                IOUtils.closeStream(in);
                IOUtils.closeStream(writer);
                IOUtils.closeStream(fs);
            }
    
        }

    查找文件:

        /**
         * Lists every regular file below {@code dir}, descending into subdirectories.
         *
         * @param dir path of a file or directory to scan
         * @return the path strings of all files found, as reported by
         *         {@code FileStatus.getPath().toString()}
         * @throws IOException if the file system cannot be obtained or a listing fails
         */
        private static List<String> getFiles(String dir) throws IOException {
            Configuration conf = new Configuration();
            FileSystem fs = null;
            List<String> filelist = new ArrayList<>();
            try {
                fs = FileSystem.get(conf);
                // One FileSystem handle is shared by the whole traversal; the original
                // created and closed a handle at every recursion level.
                collectFiles(fs, new Path(dir), filelist);
                return filelist;
            } finally {
                // NOTE(review): FileSystem.get may hand out a cached, shared instance;
                // closing it here keeps the original behavior — confirm no other code
                // in the same JVM relies on that instance staying open.
                IOUtils.closeStream(fs);
            }
        }

        /** Recursively appends the path strings of all files under {@code path} to {@code filelist}. */
        private static void collectFiles(FileSystem fs, Path path, List<String> filelist) throws IOException {
            // listStatus returns the entry itself for a file, or the children for a directory.
            for (FileStatus fileStatus : fs.listStatus(path)) {
                if (fileStatus.isDirectory()) {
                    collectFiles(fs, fileStatus.getPath(), filelist);
                } else {
                    filelist.add(fileStatus.getPath().toString());
                }
            }
        }

    2、还原压缩的SequenceFile文件

        //将combineToSequenceFile生成的文件分解成原文件。
        private static void extractCombineSequenceFile(String[] args) throws IOException {
            String sourceFile = args[0];
    //        String destdir = args[1];
            Configuration conf = new Configuration();
            Path sourcePath = new Path(sourceFile);
    
            SequenceFile.Reader reader = null;
            SequenceFile.Reader.Option option1 = SequenceFile.Reader.file(sourcePath);
    
            Writable key = null;
            Writable value = null;
    //        Text key = null;
    //        BytesWritable value = null;
    
            FileSystem fs = FileSystem.get(conf);
            try {
                reader = new SequenceFile.Reader(conf, option1);
                key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
                value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    
                //在知道key和value的明确类型的情况下,可以直接用其类型
    //            key = ReflectionUtils.newInstance(Text.class, conf);
    //            value =  ReflectionUtils.newInstance(BytesWritable.class, conf);
                long position = reader.getPosition();
                while (reader.next(key, value)) {
                    FSDataOutputStream out = fs.create(new Path(key.toString()), true);
                    //文件头会多出4个字节,用来标识长度,而本例中原文件头是没有长度的,所以不能用这个方式写入流
    //                value.write(out);
                    out.write(((BytesWritable)value).getBytes(),0,((BytesWritable)value).getLength());
    
                    //                out.write(value.getBytes(),0,value.getLength());
                    System.out.printf("[%s]	%s	%s
    ", position, key, out.getPos());
                    out.close();
                    position = reader.getPosition();
                }
            } finally {
                IOUtils.closeStream(reader);
                IOUtils.closeStream(fs);
            }
        }
  • 相关阅读:
    浏览器嗅探
    (转)javascript中为何在匿名function函数后面还外加一个括号
    js十进制转换二进制
    css_毛玻璃
    css桌布样式
    ocr api 识别表格 图片
    Linux查看日志常用命令
    linux find 命令查找文件和文件夹
    docker上安装airflow
    文件权限中 chmod、u+x、u、r、w、x分别代表什么
  • 原文地址:https://www.cnblogs.com/asker009/p/10383600.html
Copyright © 2011-2022 走看看