  • Operating Hadoop from Java

    Operating Hadoop from Java is much more cumbersome than from Python. I tried for a long time without success; today the experiment finally worked, so here are my notes.
    1. As a beginner, import all the jars under the common and HDFS directories, otherwise you will hit many missing-dependency problems. The jars come from the unpacked Hadoop download; for example, I installed Hadoop 2.8.4 and unpacked it to G:\project\hadoop\hadoop-2.8.4, so I import every jar under G:\project\hadoop\hadoop-2.8.4\share\hadoop\common\lib, G:\project\hadoop\hadoop-2.8.4\share\hadoop\common, G:\project\hadoop\hadoop-2.8.4\share\hadoop\hdfs and G:\project\hadoop\hadoop-2.8.4\share\hadoop\hdfs\lib into the project (a Maven alternative is sketched right below).
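    (Not part of the original setup: if the project is built with Maven instead of copying jars by hand, a single hadoop-client dependency should pull in the common and HDFS client jars transitively. A minimal sketch, assuming a Maven project and Hadoop 2.8.4:)

    <!-- sketch of a Maven alternative to importing the jars manually -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.8.4</version>
    </dependency>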
    2. Create a file named log4j.properties in the project root with the following content (otherwise log4j complains that no configuration can be found; also add BasicConfigurator.configure(); to the code):
    # Configure logging for testing: optionally with log file

    #log4j.rootLogger=debug,appender
    log4j.rootLogger=info,appender
    #log4j.rootLogger=error,appender

    # Output to the console
    log4j.appender.appender=org.apache.log4j.ConsoleAppender
    # Use the TTCCLayout layout
    log4j.appender.appender.layout=org.apache.log4j.TTCCLayout

    3. The test code is below. Create a test.java file in the project and replace hadoop2.com with the fs.defaultFS value from your core-site.xml configuration file:

    package WordCount;
    import java.io.IOException;
    import java.net.URI;
    import java.net.URISyntaxException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.BlockLocation;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.FileUtil;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hdfs.DistributedFileSystem;
    import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
    import org.apache.log4j.BasicConfigurator;

    public class test {

    public static void main(String[] args) {
        // TODO Auto-generated method stub
        System.out.println("Hello World ");
        BasicConfigurator.configure();
        try {
            listAllFile();
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        System.out.println("Hello World ");
    }

    /**
     * Get the HDFS file system
     * @return
     * @throws IOException
     * @throws URISyntaxException
     */
    public static FileSystem getFileSystem() throws IOException, URISyntaxException {
        // Read the configuration
        Configuration conf = new Configuration();

        // Return the default file system.
        // When running inside the Hadoop cluster, this call gets the default file system directly.
        // FileSystem fs = FileSystem.get(conf);

        // Address of the target file system
        URI uri = new URI("hdfs://hadoop2.com:9000");

        // Return the specified file system.
        // When testing locally, use this variant to get the file system.
        FileSystem fs = FileSystem.get(uri, conf);

        return fs;
    }
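    // Note (not in the original post): if the local user name differs from the HDFS user
    // and permission errors appear, FileSystem.get also has an overload that takes the
    // user name explicitly, e.g. (the user "hadoop" below is only a placeholder):
    // FileSystem fs = FileSystem.get(uri, conf, "hadoop");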

    /**
     * Create a directory
     * @throws Exception
     */
    public static void mkdir() throws Exception {
        // Get the file system
        FileSystem fs = getFileSystem();

        // Create the directory
        fs.mkdirs(new Path("hdfs://hadoop2.com:9000/test/weibo"));

        // Release resources
        fs.close();
    }

    /**
     * Delete a file or directory
     * @throws Exception
     */
    public static void rmdir() throws Exception {
        // Get the file system
        FileSystem fs = getFileSystem();

        // Delete the file or directory (recursively)
        fs.delete(new Path("hdfs://hadoop2.com:9000/test/weibo"), true);

        // Release resources
        fs.close();
    }


    /**
     * List all files under a directory
     * @throws Exception
     */
    public static void listAllFile() throws Exception {
        // Get the file system
        FileSystem fs = getFileSystem();

        // List the directory contents
        FileStatus[] status = fs.listStatus(new Path("hdfs://hadoop2.com:9000/test/"));

        // Get the paths of all entries in the directory
        Path[] listedPaths = FileUtil.stat2Paths(status);

        // Print each path
        for (Path path : listedPaths) {
            System.out.println(path);
        }

        // Release resources
        fs.close();
    }

    /**
     * Upload a file to HDFS
     * @throws Exception
     */
    public static void copyToHDFS() throws Exception {
        // Get the file system
        FileSystem fs = getFileSystem();

        // The source path is a Linux path, e.g. Path srcPath = new Path("/home/hadoop/temp.jar");
        // When testing on Windows, use a Windows path instead, e.g. E://temp.jar
        Path srcPath = new Path("E://temp.jar");

        // Destination path
        Path dstPath = new Path("hdfs://hadoop2.com:9000/test/weibo");

        // Upload the file
        fs.copyFromLocalFile(srcPath, dstPath);

        // Release resources
        fs.close();
    }

    /**
     * Download a file from HDFS
     * @throws Exception
     */
    public static void getFile() throws Exception {
        // Get the file system
        FileSystem fs = getFileSystem();

        // Source path on HDFS
        Path srcPath = new Path("hdfs://hadoop2.com:9000/test/weibo/temp.jar");

        // Destination path, a Linux path by default.
        // When testing on Windows, use a Windows path instead, e.g. C://User/andy/Desktop/
        Path dstPath = new Path("D://");

        // Download the file from HDFS
        fs.copyToLocalFile(srcPath, dstPath);

        // Release resources
        fs.close();
    }

    /**
     * Get information about the HDFS cluster nodes
     * @throws Exception
     */
    public static void getHDFSNodes() throws Exception {
        // Get the file system
        FileSystem fs = getFileSystem();

        // Cast to the distributed file system
        DistributedFileSystem hdfs = (DistributedFileSystem) fs;

        // Get all DataNodes
        DatanodeInfo[] dataNodeStats = hdfs.getDataNodeStats();

        // Loop over the nodes
        for (int i = 0; i < dataNodeStats.length; i++) {
            System.out.println("DataNode_" + i + "_Name:" + dataNodeStats[i].getHostName());
        }

        // Release resources
        fs.close();
    }

    /**
     * Find where a file is stored in the HDFS cluster
     * @throws Exception
     */
    public static void getFileLocal() throws Exception {
        // Get the file system
        FileSystem fs = getFileSystem();

        // File path
        Path path = new Path("hdfs://hadoop2.com:9000/test/weibo/temp.jar");

        // Get the file status
        FileStatus fileStatus = fs.getFileStatus(path);

        // Get the list of block locations
        BlockLocation[] blockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());

        // Print information for each block
        for (int i = 0; i < blockLocations.length; i++) {
            String[] hosts = blockLocations[i].getHosts();
            System.out.println("block_" + i + "_location:" + hosts[0]);
        }

        // Release resources
        fs.close();
    }

    }
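    The main method above only exercises listAllFile(). A minimal sketch of driving the other helpers from inside main's try block (the HDFS and local paths are the ones hard-coded above; adjust them to your environment):

    // hypothetical call sequence inside main's try { ... } block
    mkdir();          // create /test/weibo on HDFS
    copyToHDFS();     // upload E://temp.jar into /test/weibo
    listAllFile();    // list everything under /test
    getFileLocal();   // print the block locations of temp.jar
    getHDFSNodes();   // print the DataNode host names
    getFile();        // download temp.jar to D://
    rmdir();          // remove /test/weibo again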

    4. Result:
    Hello World
    0 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field org.apache.hadoop.metrics2.lib.MutableRate org.apache.hadoop.security.UserGroupInformation$UgiMetrics.loginSuccess with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[Rate of successful kerberos logins and latency (milliseconds)], valueName=Time)
    24 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field org.apache.hadoop.metrics2.lib.MutableRate org.apache.hadoop.security.UserGroupInformation$UgiMetrics.loginFailure with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[Rate of failed kerberos logins and latency (milliseconds)], valueName=Time)
    25 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field org.apache.hadoop.metrics2.lib.MutableRate org.apache.hadoop.security.UserGroupInformation$UgiMetrics.getGroups with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[GetGroups], valueName=Time)
    25 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field private org.apache.hadoop.metrics2.lib.MutableGaugeLong org.apache.hadoop.security.UserGroupInformation$UgiMetrics.renewalFailuresTotal with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[Renewal failures since startup], valueName=Time)
    25 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field private org.apache.hadoop.metrics2.lib.MutableGaugeInt org.apache.hadoop.security.UserGroupInformation$UgiMetrics.renewalFailures with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[Renewal failures since last successful login], valueName=Time)
    27 [main] DEBUG org.apache.hadoop.metrics2.impl.MetricsSystemImpl - UgiMetrics, User and group related metrics
    117 [main] DEBUG org.apache.hadoop.security.authentication.util.KerberosName - Kerberos krb5 configuration not found, setting default realm to empty
    123 [main] DEBUG org.apache.hadoop.security.Groups - Creating new Groups object
    129 [main] DEBUG org.apache.hadoop.util.NativeCodeLoader - Trying to load the custom-built native-hadoop library...
    152 [main] DEBUG org.apache.hadoop.util.NativeCodeLoader - Loaded the native-hadoop library
    153 [main] DEBUG org.apache.hadoop.security.JniBasedUnixGroupsMapping - Using JniBasedUnixGroupsMapping for Group resolution
    153 [main] DEBUG org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback - Group mapping impl=org.apache.hadoop.security.JniBasedUnixGroupsMapping
    202 [main] DEBUG org.apache.hadoop.security.Groups - Group mapping impl=org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback; cacheTimeout=300000; warningDeltaMs=5000
    211 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - hadoop login
    212 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - hadoop login commit
    213 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - Using user: "server" with name server
    213 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - User entry: "server"
    213 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - Assuming keytab is managed externally since logged in from subject.
    214 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - UGI loginUser:server (auth:SIMPLE)
    250 [main] DEBUG org.apache.htrace.core.Tracer - sampler.classes = ; loaded no samplers
    413 [main] DEBUG org.apache.htrace.core.Tracer - span.receiver.classes = ; loaded no span receivers
    841 [main] DEBUG org.apache.hadoop.hdfs.client.impl.DfsClientConf - dfs.client.use.legacy.blockreader.local = false
    841 [main] DEBUG org.apache.hadoop.hdfs.client.impl.DfsClientConf - dfs.client.read.shortcircuit = false
    841 [main] DEBUG org.apache.hadoop.hdfs.client.impl.DfsClientConf - dfs.client.domain.socket.data.traffic = false
    841 [main] DEBUG org.apache.hadoop.hdfs.client.impl.DfsClientConf - dfs.domain.socket.path =
    854 [main] DEBUG org.apache.hadoop.hdfs.DFSClient - Sets dfs.client.block.write.replace-datanode-on-failure.min-replication to 0
    890 [main] DEBUG org.apache.hadoop.io.retry.RetryUtils - multipleLinearRandomRetry = null
    929 [main] DEBUG org.apache.hadoop.ipc.Server - rpcKind=RPC_PROTOCOL_BUFFER, rpcRequestWrapperClass=class org.apache.hadoop.ipc.ProtobufRpcEngine$RpcProtobufRequest, rpcInvoker=org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker@489115ef
    1166 [main] DEBUG org.apache.hadoop.ipc.Client - getting client out of cache: org.apache.hadoop.ipc.Client@b2c9a9c
    1784 [main] DEBUG org.apache.hadoop.util.PerformanceAdvisory - Both short-circuit local reads and UNIX domain socket are disabled.
    1790 [main] DEBUG org.apache.hadoop.hdfs.protocol.datatransfer.sasl.DataTransferSaslUtil - DataTransferProtocol not using SaslPropertiesResolver, no QOP found in configuration for dfs.data.transfer.protection
    1849 [main] DEBUG org.apache.hadoop.ipc.Client - The ping interval is 60000 ms.
    1860 [main] DEBUG org.apache.hadoop.ipc.Client - Connecting to hadoop2.com/192.168.129.130:9000
    1969 [IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server: starting, having connections 1
    1973 [IPC Parameter Sending Thread #0] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server sending #0 org.apache.hadoop.hdfs.protocol.ClientProtocol.getListing
    1984 [IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server got value #0
    1984 [main] DEBUG org.apache.hadoop.ipc.ProtobufRpcEngine - Call: getListing took 170ms
    hdfs://hadoop2.com:9000/test/output
    hdfs://hadoop2.com:9000/test/start.txt
    hdfs://hadoop2.com:9000/test/test.txt
    2027 [main] DEBUG org.apache.hadoop.ipc.Client - stopping client from cache: org.apache.hadoop.ipc.Client@b2c9a9c
    2028 [main] DEBUG org.apache.hadoop.ipc.Client - removing client from cache: org.apache.hadoop.ipc.Client@b2c9a9c
    2028 [main] DEBUG org.apache.hadoop.ipc.Client - stopping actual client because no more references remain: org.apache.hadoop.ipc.Client@b2c9a9c
    2028 [main] DEBUG org.apache.hadoop.ipc.Client - Stopping client
    2028 [IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server: closed
    2028 [IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server: stopped, remaining connections 0
    Hello World
    2131 [Thread-2] DEBUG org.apache.hadoop.util.ShutdownHookManager - ShutdownHookManger complete shutdown.

  • Original post: https://www.cnblogs.com/kuainiao/p/9417767.html