zoukankan      html  css  js  c++  java
  • mapreduce之单词计数WordCount

    在本地新建一个目录:

    然后在里面写入内容

    vim wordcount.txt

    内容如下:

    hello,world,hadoop
    hive,sqoop,flume,hello
    kitty,tom,jerry,world
    hadoop

    上传到HDFS

    hdfs dfs -mkdir /wordcount
    hdfs dfs -put wordcount.txt /wordcount/

    出现文件

    下面是在IDEA上运行的代码:

    import java.io.IOException;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    public class WordCount {

        /**
         * Driver for the word-count MapReduce job: reads comma-separated words
         * from {@code /wordcount/wordcount.txt} on HDFS and writes per-word
         * counts to {@code /wordcount_out}.
         *
         * @param args unused; input and output paths are hard-coded below
         * @throws IOException            if job setup or submission fails
         * @throws ClassNotFoundException if a job class cannot be resolved
         * @throws InterruptedException   if the thread is interrupted while waiting
         */
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            Job job = Job.getInstance();
            job.setJobName("WordCount");
            job.setJarByClass(WordCount.class);
            job.setMapperClass(doMapper.class);
            // The reducer doubles as a combiner: integer summation is associative
            // and commutative, so local pre-aggregation on each mapper is safe
            // and reduces shuffle traffic.
            job.setCombinerClass(doReducer.class);
            job.setReducerClass(doReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            Path in = new Path("hdfs://192.168.58.128:9000/wordcount/wordcount.txt");
            Path out = new Path("hdfs://192.168.58.128:9000/wordcount_out");
            FileInputFormat.addInputPath(job, in);    // input lines become (byte offset, line text) pairs
            FileOutputFormat.setOutputPath(job, out); // NOTE: the output directory must not already exist on HDFS
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }

        /** Mapper: splits each input line on ',' and emits (word, 1) per token. */
        public static class doMapper extends Mapper<Object, Text, Text, IntWritable> {
            // Constant count emitted with every word.
            private static final IntWritable ONE = new IntWritable(1);
            // Reusable output key: context.write serializes the value immediately,
            // so reusing one Text instance per mapper avoids per-token allocation.
            // (Instance field, not static — each map task owns its own copy.)
            private final Text word = new Text();

            @Override
            protected void map(Object key, Text value, Context context)
                    throws IOException, InterruptedException {
                for (String token : value.toString().split(",")) {
                    word.set(token);
                    context.write(word, ONE);
                }
            }
        }

        /** Reducer: sums the counts for each word and emits (word, total). */
        public static class doReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
            // Reusable output value, same allocation-avoidance idiom as the mapper.
            private final IntWritable result = new IntWritable();

            @Override
            protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable value : values) {
                    sum += value.get();
                }
                result.set(sum);
                context.write(key, result);
            }
        }
    }

    成功之后可以在HDFS上看到新的文件夹wordcount_out

    里面的part-r-00000文件会显示单词统计的结果

    下载下来后可以看到

  • 相关阅读:
    Linux下date命令,格式化输出,时间设置
    Linux scp复制文件,不需要输入密码的技巧
    Linux中cp和scp命令的使用方法
    Linux定时任务系统 Cron
    Eclipse启动Tomcat后无法访问项目
    eclipse下tomcat插件配置说明
    RPM方式安装MySQL5.6和windows下安装mysql解压版
    shell script练习
    Eclipse Java注释模板设置详解
    mysql备份还原数据库
  • 原文地址:https://www.cnblogs.com/a155-/p/13908424.html
Copyright © 2011-2022 走看看