zoukankan      html  css  js  c++  java
  • Hadoop HDFS编程 API入门系列之合并小文件到HDFS(三)

      不多说,直接上代码。

     代码版本1

      1 package zhouls.bigdata.myWholeHadoop.HDFS.hdfs7;
      2 
      3 import java.io.IOException;
      4 import java.net.URI;
      5 import java.net.URISyntaxException;
      6 import org.apache.hadoop.conf.Configuration;
      7 import org.apache.hadoop.fs.FSDataInputStream;
      8 import org.apache.hadoop.fs.FSDataOutputStream;
      9 import org.apache.hadoop.fs.FileStatus;
     10 import org.apache.hadoop.fs.FileSystem;
     11 import org.apache.hadoop.fs.FileUtil;
     12 import org.apache.hadoop.fs.Path;
     13 import org.apache.hadoop.fs.PathFilter;
     14 import org.apache.hadoop.io.IOUtils;
     15 /**
     16  * function 合并小文件至 HDFS 
     17  * 
     18  *
     19  */
     20 public class MergeSmallFilesToHDFS 
     21 {
     22     private static FileSystem fs = null;  //定义文件系统对象,是HDFS上的
     23     private static FileSystem local = null; //定义文件系统对象,是本地上的
     24     
     25     /**
     26      * @function main 
     27      * @param args
     28      * @throws IOException
     29      * @throws URISyntaxException
     30      */
     31     
     32     public static void main(String[] args) throws IOException,URISyntaxException{
     33     
     34         list();
     35     }
     36 
     37     /**
     38      * 
     39      * @throws IOException
     40      * @throws URISyntaxException
     41      */
     42     public static void list() throws IOException, URISyntaxException{
     43         // 读取hadoop配置文件
     44         Configuration conf = new Configuration();
     45         // 文件系统访问接口和创建FileSystem对象,在本地上运行模式
     46         URI uri = new URI("hdfs://HadoopMaster:9000");
     47         fs = FileSystem.get(uri, conf);
     48         // 获得本地文件系统
     49         local = FileSystem.getLocal(conf);
     50         // 过滤目录下的 svn 文件
     51         FileStatus[] dirstatus = local.globStatus(new Path("./data/mergeSmallFilesToHDFS/73/*"),new RegexExcludePathFilter("^.*svn$"));
     52 //    FileStatus[] dirstatus = local.globStatus(new Path("D://data/73/*"),new RegexExcludePathFilter("^.*svn$"));
     53         //获取D:Data	vdata目录下的所有文件路径
     54         Path[] dirs = FileUtil.stat2Paths(dirstatus);
     55         FSDataOutputStream out = null;
     56         FSDataInputStream in = null;
     57         for (Path dir : dirs) 
     58         {//比如拿2012-09-17为例
     59             //将文件夹名称2012-09-17的-去掉,直接,得到20120901文件夹名称
     60             String fileName = dir.getName().replace("-", "");//文件名称
     61             //只接受20120917日期目录下的.txt文件
     62             FileStatus[] localStatus = local.globStatus(new Path(dir+"/*"),new RegexAcceptPathFilter("^.*txt$"));
     63             // 获得20120917日期目录下的所有文件
     64             Path[] listedPaths = FileUtil.stat2Paths(localStatus);
     65             // 输出路径
     66             Path block = new Path("hdfs://HadoopMaster:9000/middle/tv/"+ fileName + ".txt");
     67             System.out.println("合并后的文件名称:"+fileName+".txt");
     68             // 打开输出流
     69             out = fs.create(block);    
     70             //循环20120917日期目录下的所有文件
     71             for (Path p : listedPaths){//这是星型for循环,即listedPaths的值传给Path p
     72                 in = local.open(p);// 打开输入流
     73                 IOUtils.copyBytes(in, out, 4096, false); // 复制数据
     74                 // 关闭输入流
     75                 in.close();
     76             }
     77             if (out != null){
     78                 // 关闭输出流
     79                 out.close();
     80             }
     81             //当循环完20120917日期目录下的所有文件之后,接着依次20120918,20120919,,,
     82         }
     83     }
     84 
     85     /**
     86      * 
     87      * @function 过滤 regex 格式的文件
     88      *
     89      */
     90     public static class RegexExcludePathFilter implements PathFilter{
     91         private final String regex;
     92 
     93         public RegexExcludePathFilter(String regex){
     94             this.regex = regex;
     95         }
     96 
     97         
     98         public boolean accept(Path path){
     99             // TODO Auto-generated method stub
    100             boolean flag = path.toString().matches(regex);
    101             return !flag;
    102         }
    103 
    104     }
    105 
    106     /**
    107      * 
    108      * @function 接受 regex 格式的文件
    109      *
    110      */
    111     public static class RegexAcceptPathFilter implements PathFilter{
    112         private final String regex;
    113 
    114         public RegexAcceptPathFilter(String regex){
    115             this.regex = regex;
    116         }
    117 
    118     
    119         public boolean accept(Path path){
    120             // TODO Auto-generated method stub
    121             boolean flag = path.toString().matches(regex);
    122             return flag;
    123         }
    124 
    125     }
    126 }

    代码版本2

      1 package com.dajiangtai.Hadoop.HDFS;
      2 
      3 import java.io.IOException;
      4 import java.net.URI;
      5 import java.net.URISyntaxException;
      6 import org.apache.hadoop.conf.Configuration;
      7 import org.apache.hadoop.fs.FSDataInputStream;
      8 import org.apache.hadoop.fs.FSDataOutputStream;
      9 import org.apache.hadoop.fs.FileStatus;
     10 import org.apache.hadoop.fs.FileSystem;
     11 import org.apache.hadoop.fs.FileUtil;
     12 import org.apache.hadoop.fs.Path;
     13 import org.apache.hadoop.fs.PathFilter;
     14 import org.apache.hadoop.hdfs.DistributedFileSystem;
     15 import org.apache.hadoop.io.IOUtils;
     16 /**
     17  * function 合并小文件至 HDFS     ,  文件与块大小(比如128M)来比,小的话,称为小文件。是一个相对概念!相对于数据块而言的!
     18  * @author 小讲
     19  *  我们利用通配符和PathFilter 对象,将本地多种格式的文件上传至 HDFS文件系统,并过滤掉 txt文本格式以外的文件。
     20  */
     21 public class MergeSmallFilesToHDFS {
     22     private static FileSystem fs = null;
     23     private static FileSystem local = null;
     24     /**
     25      * @function main 
     26      * @param args
     27      * @throws IOException
     28      * @throws URISyntaxException
     29      */
     30     public static void main(String[] args) throws IOException,
     31             URISyntaxException {
     32         list();
     33     }
     34 
     35     /**
     36      * 
     37      * @throws IOException
     38      * @throws URISyntaxException
     39      */
     40     public static void list() throws IOException, URISyntaxException {
     41         // 读取hadoop文件系统的配置
     42         Configuration conf = new Configuration();
     43 //        conf=Configuration
     44 //        conf是Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml
     45         
     46         //文件系统访问接口
     47         URI uri = new URI("hdfs://djt002:9000");
     48 //        uri=URI
     49 //        uri是hdfs://djt002:9000
     50         
     51 //        URL、URI与Path三者的区别
     52 //        Hadoop文件系统中通过Hadoop Path对象来代表一个文件    
     53 //        URL(相当于绝对路径)    ->   (文件) ->    URI(相当于相对路径,即代表URL前面的那一部分)
     54 //        URI:如hdfs://dajiangtai:9000
     55 //        如,URL.openStream
     56         
     57         
     58         
     59         //获得FileSystem实例,即HDFS
     60         fs = FileSystem.get(uri, conf);
     61 //        fs=DistributedFileSystem
     62 //        fs是DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_1814566850_1, ugi=Administrator (auth:SIMPLE)]]
     63         
     64         //获得FileSystem实例,即Local
     65         local = FileSystem.getLocal(conf);
     66 //        local=LocalFileSystem
     67 //        local是org.apache.hadoop.fs.LocalFileSystem@3ce1b8c5
     68 //            为什么要获取到Local呢,因为,我们要把本地D盘下data/73目录下的文件要合并后,上传到HDFS里,所以,我们需先获取到Local,再来做合并工作啦!
     69         
     70         
     71 //        18、列出文件或目录内容(主要是存放文件或目录的元数据,即大小,权限,副本,,,)
     72 //        public FileStatus[] listStatus(Path f) throws IOException
     73 //        public FileStatus[] listStatus(Path f,PathFilter filter) throws IOException
     74 //                PathFilter是路径过滤器
     75 //        public FileStatus[] listStatus(Path[] files) throws IOException
     76 //        public FileStatus[] listStatus(Path[] files,PathFilter filter)
     77 //                传送Path数组和路径过滤器
     78 //                
     79 //                
     80 //        19、FileUtil中的stat2Paths(),将一个FileStatus元数据对象数组转换为一个Path对象数组
     81 //
     82 //        20、(1)使用通配符来匹配多个目录下的多个文件,也是列出文件或目录内容(主要是存放文件或目录的元数据,即大小,权限,副本,,,)
     83 //        public FileStatus[] globStatus(Path pathPattern) throws IOException
     84 //        public FileStatus[] globStatus(Path pathPattern,PathFilter filter) throws IOException
     85 //                    
     86 //          (2)PathFilter对象
     87 //        public interface PathFilter{
     88 //            boolean accpet(Path path);
     89 //        }        
     90         
     91         
     92         
     93         //过滤目录下的 svn 文件,globStatus从第一个参数通配符合到文件,剔除满足第二个参数到结果,因为PathFilter中accept是return!  
     94         FileStatus[] dirstatus = local.globStatus(new Path("D://data/73/*"),new RegexExcludePathFilter("^.*svn$"));//一般这是隐藏文件,所以得排除
     95         //dirstatus=FileStatus[7]
     96 //        dirstatus是[DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17; isDirectory=true; modification_time=1427791478002; access_time=0; owner=; group=; permission=rwxrwxrwx; isSymlink=false}
     97 //        , DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18; isDirectory=true; modification_time=1427791505373; access_time=0; owner=; group=; permission=rwxrwxrwx; isSymlink=false}
     98 //        , DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-19; isDirectory=true; modification_time=1427791532277; access_time=0; owner=; group=; permission=rwxrwxrwx; isSymlink=false}
     99 //        , DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-20; isDirectory=true; modification_time=1427791553035; access_time=0; owner=; group=; permission=rwxrwxrwx; isSymlink=false}
    100 //        , DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-21; isDirectory=true; modification_time=1427791577709; access_time=0; owner=; group=; permission=rwxrwxrwx; isSymlink=false}
    101 //        , DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-22; isDirectory=true; modification_time=1427791602770; access_time=0; owner=; group=; permission=rwxrwxrwx; isSymlink=false}
    102 //        , DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-23; isDirectory=true; modification_time=1427791647177; access_time=0; owner=; group=; permission=rwxrwxrwx; isSymlink=false}]
    103         
    104                         
    105                         //        ^表示匹配我们字符串开始的位置               *代表0到多个字符                        $代表字符串结束的位置
    106 //        RegexExcludePathFilter来只排除我们不需要的,即svn格式
    107 //        RegexExcludePathFilter这个方法我们自己写
    108         
    109 //        但是我们,最终是要处理文件里的东西,最终是要转成Path类型,因为Path对象f,它对应着一个文件。
    110         
    111         //获取73目录下的所有文件路径,注意FIleUtil中stat2Paths()的使用,它将一个FileStatus对象数组转换为Path对象数组。
    112         Path[] dirs = FileUtil.stat2Paths(dirstatus);//dirstatus是FileStatus数组类型
    113 //        dirs=Path[7]
    114 //        dirs是    [file:/D:/data/73/2012-09-17
    115 //                 , file:/D:/data/73/2012-09-18
    116 //                 , file:/D:/data/73/2012-09-19
    117 //                 , file:/D:/data/73/2012-09-20
    118 //                 , file:/D:/data/73/2012-09-21
    119 //                 , file:/D:/data/73/2012-09-22
    120 //                 , file:/D:/data/73/2012-09-23]        
    121                 
    122         
    123         FSDataOutputStream out = null;//输出流
    124 //        out=HdfsDaDataOutputStream
    125 //        out是org.apache.hadoop.hdfs.client.HdfsDataOutputStream@2b11624e
    126         
    127         FSDataInputStream in = null;//输入流
    128 //        in=ChecksumFileSystem&FSDataBoundedInputStream
    129 //        in是org.apache.hadoop.fs.ChecksumFileSystem$FSDataBoundedInputStream@526d542f
    130         
    131 //        很多人搞不清输入流和输出流,!!!!
    132 //        其实啊,输入流、输出流都是针对内存的
    133 //        往内存里写,是输入流。
    134 //        内存往文件里写,是输出Luis。
    135 //        
    136 //        比如一个文件A复制到另一文件B,那么,先写到内存里,再写到文件B。
    137 //           =>   则文件A写到内存里,叫输入流。
    138 //           =>    则内存里写到文件B,叫输出流    
    139         
    140         
    141         for (Path dir : dirs) {//for星型循环,即将dirs是Path对象数组,一一传给Path dir
    142 //            dirs=Path[7]
    143 //            dirs是[file:/D:/data/73/2012-09-17
    144 //                  , file:/D:/data/73/2012-09-18
    145 //                  , file:/D:/data/73/2012-09-19
    146 //                  , file:/D:/data/73/2012-09-20
    147 //                  , file:/D:/data/73/2012-09-21
    148 //                  , file:/D:/data/73/2012-09-22
    149 //                  , file:/D:/data/73/2012-09-23]    
    150             
    151 //        dir= Path    
    152 //        先传,dir是file:/D:/data/73/2012-09-17
    153 //        再传,file:/D:/data/73/2012-09-18           
    154 //        再传,file:/D:/data/73/2012-09-19     
    155 //        再传,file:/D:/data/73/2012-09-20       
    156 //        再传,file:/D:/data/73/2012-09-21       
    157 //        再传,file:/D:/data/73/2012-09-22       
    158 //        再传,file:/D:/data/73/2012-09-23       
    159             
    160             String fileName = dir.getName().replace("-", "");//文件名称
    161 //                        先获取到如2012-09-17,然后经过replace("-", ""),得到20120917
    162 //                                                                再获取,20120918
    163 //                                                                再获取,20120919
    164 //                                                                再获取,20120920
    165 //                                                                再获取,20120921
    166 //                                                                再获取,20120922
    167 //                                                                再获取,20120923            
    168             
    169             //只接受日期目录下的.txt文件,^匹配输入字符串的开始位置,$匹配输入字符串的结束位置,*匹配0个或多个字符。
    170             FileStatus[] localStatus = local.globStatus(new Path(dir+"/*"),new RegexAcceptPathFilter("^.*txt$"));
    171 //            先获取到,localStatus=FileStatus[23]
    172 //                   localStatus是[DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917000000.txt; isDirectory=false; length=1111961; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917001500.txt; isDirectory=false; length=782533; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917003000.txt; isDirectory=false; length=593507; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917004500.txt; isDirectory=false; length=839019; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917010000.txt; isDirectory=false; length=866393; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917011500.txt; isDirectory=false; length=678491; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917013000.txt; isDirectory=false; length=593292; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917014500.txt; isDirectory=false; length=688620; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917020000.txt; isDirectory=false; length=674864; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917021500.txt; isDirectory=false; length=635052; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917023000.txt; isDirectory=false; length=547324; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917024500.txt; isDirectory=false; length=598814; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917030000.txt; isDirectory=false; length=542600; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917031500.txt; isDirectory=false; length=535446; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917033000.txt; isDirectory=false; length=592780; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917034500.txt; isDirectory=false; length=619410; replication=1; blocksize=33554432; modification_time=1398669216000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917040000.txt; isDirectory=false; length=590326; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917041500.txt; isDirectory=false; length=428487; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917043000.txt; isDirectory=false; length=598048; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917044500.txt; isDirectory=false; length=598792; replication=1; blocksize=33554432; modification_time=1398669216000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917050000.txt; isDirectory=false; length=575613; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917051500.txt; isDirectory=false; length=619080; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917053000.txt; isDirectory=false; length=587763; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}]
    173 //                   再获取到,localStatus=FileStatus[23]
    174 //            localStatus是[DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918131500.txt; isDirectory=false; length=1722797; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918133000.txt; isDirectory=false; length=1922955; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918134500.txt; isDirectory=false; length=1388036; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918140000.txt; isDirectory=false; length=1888871; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918141500.txt; isDirectory=false; length=1685719; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918143000.txt; isDirectory=false; length=1541381; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918144500.txt; isDirectory=false; length=1723638; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918150000.txt; isDirectory=false; length=1629322; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918151500.txt; isDirectory=false; length=1658684; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918153000.txt; isDirectory=false; length=1548216; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918154500.txt; isDirectory=false; length=1510965; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918160000.txt; isDirectory=false; length=1559078; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918161500.txt; isDirectory=false; length=1752005; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918163000.txt; isDirectory=false; length=1901994; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918164500.txt; isDirectory=false; length=2234304; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918170000.txt; isDirectory=false; length=1912051; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918171500.txt; isDirectory=false; length=1711317; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918173000.txt; isDirectory=false; length=1799747; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918174500.txt; isDirectory=false; length=2038653; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918180000.txt; isDirectory=false; length=2341515; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918181500.txt; isDirectory=false; length=2396977; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918183000.txt; isDirectory=false; length=2382769; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918184500.txt; isDirectory=false; length=2709048; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}]
    175 //            再获取到,,,,不多赘述。
    176             
    177             
    178 //            FileStatus[] localStatus = local.listStatus(new Path(dir+"/*"),new RegexAcceptPathFilter("^.*txt$"));//试试,看有什么区别?
    179 
    180 //            如果不设置过滤器,FileInputFormat 会使用一个默认的过滤器来排除隐藏文件。 
    181 //            如果通过调用 setInputPathFilter()设置了过滤器,它会在默认过滤器的基础上进行过滤。换句话说,自定义的过滤器只能看到非隐藏文件。
    182             
    183             
    184                     //RegexAcceptPathFilter这个方法,我们自己写
    185 //            RegexAcceptPathFilter来只接收我们需要,即txt格式
    186 //            这里,我们还可以只接收别的格式,自己去改,一定要锻炼学会改别人的代码
    187             
    188             
    189             // 获得如2012-09-17日期目录下的所有文件
    190             Path[] listedPaths = FileUtil.stat2Paths(localStatus);
    191 //            同样,但是我们,最终是要处理文件里的东西,最终是要转成Path类型,因为Path对象f,它对应着一个文件。
    192             
    193 //            先获取,listedPaths=Path[23]
    194 //            先获取2012-09-17下的所有,这个不多赘述啦!
    195             
    196 //            再获取,listedPaths=Path[23]
    197 //            listedPaths是[file:/D:/data/73/2012-09-18/ars10767@20120918131500.txt
    198 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918133000.txt
    199 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918134500.txt
    200 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918140000.txt
    201 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918141500.txt
    202 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918143000.txt
    203 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918144500.txt
    204 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918150000.txt
    205 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918151500.txt
    206 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918153000.txt
    207 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918154500.txt
    208 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918160000.txt
    209 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918161500.txt
    210 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918163000.txt
    211 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918164500.txt
    212 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918170000.txt
    213 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918171500.txt
    214 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918173000.txt
    215 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918174500.txt
    216 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918180000.txt
    217 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918181500.txt
    218 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918183000.txt
    219 //                         , file:/D:/data/73/2012-09-18/ars10767@20120918184500.txt]
    220             
    221             //输出路径
    222             Path block = new Path("hdfs://djt002:9000/outData/MergeSmallFilesToHDFS/"+ fileName + ".txt");
    223             //fileName是"fileName"
    224 //            block=Path
    225 //            block是hdfs://djt002:9000/outData/MergeSmallFilesToHDFS/20120918.txt
    226             
    227             // 打开输出流
    228             out = fs.create(block);//因为,合并小文件之后,比如这是,合并2012-09-17日期目录下的所有小文件,之后,要上传到HDFS里。
    229 //                类似于,文件A写到内存里,再内存里写到文件B。而这行代码out = fs.create(block);是相当于是,内存里写到文件B。所以是输出流,即是从内存里输出的,所以叫输出流。
    230 //                            这里,文件A是Local                文件B是HDFS
    231 
    232 //                                        文件与块大小(比如128M)来比,小的话,称为小文件。是一个相对概念!相对于数据块而言的!
    233             
    234 //            很多人搞不清输入流和输出流,!!!!
    235 //            其实啊,输入流、输出流都是针对内存的
    236 //            往内存里写,是输入流。
    237 //            内存往文件里写,是输出Luis。
    238 //            
    239 //            比如一个文件A复制到另一文件B,那么,先写到内存里,再写到文件B。
    240 //               =>   则文件A写到内存里,叫输入流。
    241 //               =>    则内存里写到文件B,叫输出流    
    242             
    243             
    244             for (Path p : listedPaths) {//for星型循环,即将listedPaths的值一一传给Path p
    245             //先获取2012-09-17下的所有,这个不多赘述啦!
    246             //现在,获取到2012-09-18下了
    247 //            p=Path
    248 //            p是file:/D:/data/73/2012-09-18/ars10767@20120918134500.txt
    249 //            得一个一个来,这才叫做一一传给Path p
    250                 
    251                 in = local.open(p);// 打开输入流in
    252 //                类似于,文件A写到内存里,再内存里写到文件B。而这行代码in = local.open(p);是相当于是,文件A写到内存里。所以是输如流,即是写到内存里的,所以叫输入流。
    253 //                    这里,文件A是Local                文件B是HDFS
    254                 
    255                 IOUtils.copyBytes(in, out, 4096, false); // 复制数据,IOUtils.copyBytes可以方便地将数据写入到文件,不需要自己去控制缓冲区,也不用自己去循环读取输入源。false表示不自动关闭数据流,那么就手动关闭。
    256 //                IOUtils.copyBytes这个方法很重要
    257                                 //是否自动关闭输入流和输出流,若是false,就要单独去关闭。则不在这个方法体里关闭输入和输出流了。
    258 //                                                     若是true,则在这个方法里关闭输入和输出流。不需单独去关闭了
    259                 
    260                 
    261 //                明白,IOUtils类的copyBytes将hdfs数据流拷贝到标准输出流System.out中,
    262 //                copyBytes前两个参数好理解,一个输入,一个输出,第三个是缓存大小,第四个指定拷贝完毕后是否关闭流。
    263 //                要设置为false,标准输出流不关闭,我们要手动关闭输入流。即,设置为false表示关闭输入流
    264                 
    265 //                主要是把最后的这个参数定义好, 就可以了。 定义为true还是false,则决定着是否在这个方法体里关闭
    266 //                若定义为true,则在这个方法体里直接关闭输入流、输出流。不需单独去关闭了
    267 //                若定义为false,则不在这个方法体里直接关闭输入流、输出流。需单独去关闭了
    268                 
    269                 
    270                 // 关闭输入流
    271                 in.close();//若定义为false,则不在这个方法体里直接关闭输入流、输出流。需单独去关闭了。这就是单独在关闭输入流!!!懂了吗
    272             }
    273             if (out != null) {//这里为什么不为空,空指针,则说明里面还有资源。
    274                 // 关闭输出流
    275                 out.close();//若定义为false,则不在这个方法体里直接关闭输入流、输出流。需单独去关闭了。这就是单独在关闭输出流!!!懂了吗
    276             }
    277         }
    278         
    279     }
    280 
    281     /**
    282      * 
    283      * @function 过滤 regex 格式的文件
    284      *
    285      */
    286     public static class RegexExcludePathFilter implements PathFilter {
    287         private final String regex;//变量
    288 
    289         public RegexExcludePathFilter(String regex) {//这个是上面的那个,正在表达式
    290             this.regex = regex;//将String regex的值,赋给RegexExcludePathFilter类里的private final String regex的值
    291         }
    292 
    293         public boolean accept(Path path) {//主要是实现accept方法
    294             // TODO Auto-generated method stub
    295             boolean flag = path.toString().matches(regex);//匹配正则表达式,这里是^.*svn$
    296             return !flag;
    297         }
    298 
    299     }
    300 
    301     /**
    302      * 
    303      * @function 接受 regex 格式的文件
    304      *
    305      */
    306     public static class RegexAcceptPathFilter implements PathFilter {
    307         private final String regex;//变量
    308 
    309         public RegexAcceptPathFilter(String regex) {//这个是上面的那个,正在表达式
    310             this.regex = regex;//将String regex的值,赋给RegexAcceptPathFilter类里的private final String regex的值
    311         }
    312 
    313         public boolean accept(Path path) {//主要是实现accept方法
    314             // TODO Auto-generated method stub
    315             boolean flag = path.toString().matches(regex);//匹配正则表达式,这里是^.*txt$
    316             return flag;
    317         }
    318 
    319     }
    320 }
  • 相关阅读:
    debian 登录CUPS 管理界面报错
    Shell脚本调试技术
    贴个ALSA例程
    产业生态圈和生态圈
    开个帖,开始学习shell编程
    Lua源码阅读建议
    Flash, EEPROM, SPI Flash diff
    makefile中的notdir,wildcard和patsubst
    quartus II使用零星记录
    Hello ZED
  • 原文地址:https://www.cnblogs.com/zlslch/p/6174553.html
Copyright © 2011-2022 走看看