用 Java 操作 Hadoop 真的比 Python 麻烦太多,试了好久一直不成功,今天终于实验成功,做一下笔记。
1 作为初学者,一定要导入 common 和 hdfs 目录下的所有库,否则会出现很多依赖库缺失的问题。这些库位于下载的 Hadoop 包解压后的目录中,比如我安装的是 hadoop 2.8.4,解压后的路径是 G:\project\hadoop\hadoop-2.8.4,把 G:\project\hadoop\hadoop-2.8.4\share\hadoop\common\lib、G:\project\hadoop\hadoop-2.8.4\share\hadoop\common、G:\project\hadoop\hadoop-2.8.4\share\hadoop\hdfs 和 G:\project\hadoop\hadoop-2.8.4\share\hadoop\hdfs\lib 下面的 jar 包全部导入项目。
2 在项目根路径创建文件 log4j.properties,内容如下(否则运行时会提示找不到日志配置;另外还要在代码里加入 BasicConfigurator.configure();):
# Configure logging for testing: optionally with log file
#log4j.rootLogger=debug,appender
log4j.rootLogger=info,appender
#log4j.rootLogger=error,appender
# 输出到控制台
log4j.appender.appender=org.apache.log4j.ConsoleAppender
# 样式为 TTCCLayout
log4j.appender.appender.layout=org.apache.log4j.TTCCLayout
3 测试代码如下:在项目中新建 test.java 文件,并把代码里的 hdfs://hadoop2.com:9000 换成 core-site.xml 配置文件中 fs.defaultFS 的值:
package WordCount;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.log4j.BasicConfigurator;
/**
 * Demo of basic HDFS client operations: create/delete a directory, list a
 * directory, upload/download a file, list DataNodes and locate a file's blocks.
 *
 * <p>Every helper obtains its own {@link FileSystem} handle and closes it when
 * done, including when the operation throws.
 */
public class test {

    /**
     * URI of the HDFS file system used by every operation. Replace it with the
     * value of {@code fs.defaultFS} from the cluster's core-site.xml.
     */
    private static final String HDFS_URI = "hdfs://hadoop2.com:9000";

    /**
     * Entry point: configures log4j with its basic console setup and lists the
     * contents of the {@code /test/} directory on HDFS.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        System.out.println("Hello World ");
        BasicConfigurator.configure();
        try {
            listAllFile();
        } catch (Exception e) {
            // Top-level boundary of a demo program: report the failure and fall through.
            e.printStackTrace();
        }
        System.out.println("Hello World ");
    }

    /**
     * Obtains a handle to the HDFS file system at {@link #HDFS_URI}.
     *
     * <p>The URI is passed explicitly so the code also works when run outside
     * the cluster (e.g. from a local IDE). When running on a cluster node,
     * {@code FileSystem.get(conf)} would return the default file system instead.
     *
     * @return the file system handle; the caller is responsible for closing it
     * @throws IOException if the file system cannot be obtained
     * @throws URISyntaxException if {@link #HDFS_URI} is not a valid URI
     */
    public static FileSystem getFileSystem() throws IOException, URISyntaxException {
        Configuration conf = new Configuration();
        URI uri = new URI(HDFS_URI);
        return FileSystem.get(uri, conf);
    }

    /**
     * Creates the directory {@code /test/weibo} on HDFS.
     *
     * @throws Exception if the file system cannot be reached or the directory
     *         cannot be created
     */
    public static void mkdir() throws Exception {
        // try-with-resources guarantees the handle is released even on failure.
        try (FileSystem fs = getFileSystem()) {
            fs.mkdirs(new Path(HDFS_URI + "/test/weibo"));
        }
    }

    /**
     * Recursively deletes the file or directory {@code /test/weibo} on HDFS.
     *
     * @throws Exception if the file system cannot be reached or the delete fails
     */
    public static void rmdir() throws Exception {
        try (FileSystem fs = getFileSystem()) {
            // 'true' requests a recursive delete.
            fs.delete(new Path(HDFS_URI + "/test/weibo"), true);
        }
    }

    /**
     * Prints the path of every entry directly under {@code /test/} on HDFS,
     * one per line, to standard output.
     *
     * @throws Exception if the file system cannot be reached or the directory
     *         cannot be listed
     */
    public static void listAllFile() throws Exception {
        try (FileSystem fs = getFileSystem()) {
            FileStatus[] status = fs.listStatus(new Path(HDFS_URI + "/test/"));
            Path[] listedPaths = FileUtil.stat2Paths(status);
            for (Path path : listedPaths) {
                System.out.println(path);
            }
        }
    }

    /**
     * Uploads the local file {@code E://temp.jar} to {@code /test/weibo} on HDFS.
     *
     * <p>The source path is a Windows path; when running on Linux change it to
     * something like {@code /home/hadoop/temp.jar}.
     *
     * @throws Exception if the file system cannot be reached or the copy fails
     */
    public static void copyToHDFS() throws Exception {
        try (FileSystem fs = getFileSystem()) {
            Path srcPath = new Path("E://temp.jar");
            Path dstPath = new Path(HDFS_URI + "/test/weibo");
            fs.copyFromLocalFile(srcPath, dstPath);
        }
    }

    /**
     * Downloads {@code /test/weibo/temp.jar} from HDFS to the local path
     * {@code D://}.
     *
     * <p>The destination is a Windows path; change it when running on another
     * operating system.
     *
     * @throws Exception if the file system cannot be reached or the copy fails
     */
    public static void getFile() throws Exception {
        try (FileSystem fs = getFileSystem()) {
            Path srcPath = new Path(HDFS_URI + "/test/weibo/temp.jar");
            Path dstPath = new Path("D://");
            fs.copyToLocalFile(srcPath, dstPath);
        }
    }

    /**
     * Prints the host name of every DataNode reported by the cluster.
     *
     * @throws IllegalStateException if the file system behind {@link #HDFS_URI}
     *         is not a {@link DistributedFileSystem}
     * @throws Exception if the file system cannot be reached or the DataNode
     *         report cannot be fetched
     */
    public static void getHDFSNodes() throws Exception {
        try (FileSystem fs = getFileSystem()) {
            // DataNode statistics are only exposed by the HDFS implementation,
            // so verify the type instead of failing with a bare ClassCastException.
            if (!(fs instanceof DistributedFileSystem)) {
                throw new IllegalStateException(
                        "Expected a DistributedFileSystem for " + HDFS_URI
                                + " but got " + fs.getClass().getName());
            }
            DistributedFileSystem hdfs = (DistributedFileSystem) fs;
            DatanodeInfo[] dataNodeStats = hdfs.getDataNodeStats();
            for (int i = 0; i < dataNodeStats.length; i++) {
                System.out.println("DataNote_" + i + "_Name:" + dataNodeStats[i].getHostName());
            }
        }
    }

    /**
     * Prints, for each block of {@code /test/weibo/temp.jar}, the first host
     * that stores a replica of that block.
     *
     * @throws Exception if the file system cannot be reached or the file's
     *         status or block locations cannot be fetched
     */
    public static void getFileLocal() throws Exception {
        try (FileSystem fs = getFileSystem()) {
            Path path = new Path(HDFS_URI + "/test/weibo/temp.jar");
            FileStatus fileStatus = fs.getFileStatus(path);
            BlockLocation[] blockLocations =
                    fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
            for (int i = 0; i < blockLocations.length; i++) {
                String[] hosts = blockLocations[i].getHosts();
                // A block may report no hosts; avoid indexing an empty array.
                String firstHost = hosts.length > 0 ? hosts[0] : "<no host>";
                System.out.println("block_" + i + "_location:" + firstHost);
            }
        }
    }
}
4 运行结果:
Hello World
0 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field org.apache.hadoop.metrics2.lib.MutableRate org.apache.hadoop.security.UserGroupInformation$UgiMetrics.loginSuccess with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[Rate of successful kerberos logins and latency (milliseconds)], valueName=Time)
24 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field org.apache.hadoop.metrics2.lib.MutableRate org.apache.hadoop.security.UserGroupInformation$UgiMetrics.loginFailure with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[Rate of failed kerberos logins and latency (milliseconds)], valueName=Time)
25 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field org.apache.hadoop.metrics2.lib.MutableRate org.apache.hadoop.security.UserGroupInformation$UgiMetrics.getGroups with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[GetGroups], valueName=Time)
25 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field private org.apache.hadoop.metrics2.lib.MutableGaugeLong org.apache.hadoop.security.UserGroupInformation$UgiMetrics.renewalFailuresTotal with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[Renewal failures since startup], valueName=Time)
25 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field private org.apache.hadoop.metrics2.lib.MutableGaugeInt org.apache.hadoop.security.UserGroupInformation$UgiMetrics.renewalFailures with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[Renewal failures since last successful login], valueName=Time)
27 [main] DEBUG org.apache.hadoop.metrics2.impl.MetricsSystemImpl - UgiMetrics, User and group related metrics
117 [main] DEBUG org.apache.hadoop.security.authentication.util.KerberosName - Kerberos krb5 configuration not found, setting default realm to empty
123 [main] DEBUG org.apache.hadoop.security.Groups - Creating new Groups object
129 [main] DEBUG org.apache.hadoop.util.NativeCodeLoader - Trying to load the custom-built native-hadoop library...
152 [main] DEBUG org.apache.hadoop.util.NativeCodeLoader - Loaded the native-hadoop library
153 [main] DEBUG org.apache.hadoop.security.JniBasedUnixGroupsMapping - Using JniBasedUnixGroupsMapping for Group resolution
153 [main] DEBUG org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback - Group mapping impl=org.apache.hadoop.security.JniBasedUnixGroupsMapping
202 [main] DEBUG org.apache.hadoop.security.Groups - Group mapping impl=org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback; cacheTimeout=300000; warningDeltaMs=5000
211 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - hadoop login
212 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - hadoop login commit
213 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - Using user: "server" with name server
213 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - User entry: "server"
213 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - Assuming keytab is managed externally since logged in from subject.
214 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - UGI loginUser:server (auth:SIMPLE)
250 [main] DEBUG org.apache.htrace.core.Tracer - sampler.classes = ; loaded no samplers
413 [main] DEBUG org.apache.htrace.core.Tracer - span.receiver.classes = ; loaded no span receivers
841 [main] DEBUG org.apache.hadoop.hdfs.client.impl.DfsClientConf - dfs.client.use.legacy.blockreader.local = false
841 [main] DEBUG org.apache.hadoop.hdfs.client.impl.DfsClientConf - dfs.client.read.shortcircuit = false
841 [main] DEBUG org.apache.hadoop.hdfs.client.impl.DfsClientConf - dfs.client.domain.socket.data.traffic = false
841 [main] DEBUG org.apache.hadoop.hdfs.client.impl.DfsClientConf - dfs.domain.socket.path =
854 [main] DEBUG org.apache.hadoop.hdfs.DFSClient - Sets dfs.client.block.write.replace-datanode-on-failure.min-replication to 0
890 [main] DEBUG org.apache.hadoop.io.retry.RetryUtils - multipleLinearRandomRetry = null
929 [main] DEBUG org.apache.hadoop.ipc.Server - rpcKind=RPC_PROTOCOL_BUFFER, rpcRequestWrapperClass=class org.apache.hadoop.ipc.ProtobufRpcEngine$RpcProtobufRequest, rpcInvoker=org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker@489115ef
1166 [main] DEBUG org.apache.hadoop.ipc.Client - getting client out of cache: org.apache.hadoop.ipc.Client@b2c9a9c
1784 [main] DEBUG org.apache.hadoop.util.PerformanceAdvisory - Both short-circuit local reads and UNIX domain socket are disabled.
1790 [main] DEBUG org.apache.hadoop.hdfs.protocol.datatransfer.sasl.DataTransferSaslUtil - DataTransferProtocol not using SaslPropertiesResolver, no QOP found in configuration for dfs.data.transfer.protection
1849 [main] DEBUG org.apache.hadoop.ipc.Client - The ping interval is 60000 ms.
1860 [main] DEBUG org.apache.hadoop.ipc.Client - Connecting to hadoop2.com/192.168.129.130:9000
1969 [IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server: starting, having connections 1
1973 [IPC Parameter Sending Thread #0] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server sending #0 org.apache.hadoop.hdfs.protocol.ClientProtocol.getListing
1984 [IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server got value #0
1984 [main] DEBUG org.apache.hadoop.ipc.ProtobufRpcEngine - Call: getListing took 170ms
hdfs://hadoop2.com:9000/test/output
hdfs://hadoop2.com:9000/test/start.txt
hdfs://hadoop2.com:9000/test/test.txt
2027 [main] DEBUG org.apache.hadoop.ipc.Client - stopping client from cache: org.apache.hadoop.ipc.Client@b2c9a9c
2028 [main] DEBUG org.apache.hadoop.ipc.Client - removing client from cache: org.apache.hadoop.ipc.Client@b2c9a9c
2028 [main] DEBUG org.apache.hadoop.ipc.Client - stopping actual client because no more references remain: org.apache.hadoop.ipc.Client@b2c9a9c
2028 [main] DEBUG org.apache.hadoop.ipc.Client - Stopping client
2028 [IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server: closed
2028 [IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server: stopped, remaining connections 0
Hello World
2131 [Thread-2] DEBUG org.apache.hadoop.util.ShutdownHookManager - ShutdownHookManger complete shutdown.