zoukankan      html  css  js  c++  java
  • mapreduce之单词计数WordCount

    在本地新建一个目录:

    然后在里面写入内容

    vim wordcount.txt

    内容如下:

    hello,world,hadoop
    hive,sqoop,flume,hello
    kitty,tom,jerry,world
    hadoop

    上传到HDFS

    hdfs dfs -mkdir /wordcount
    hdfs dfs -put wordcount.txt /wordcount/

    出现文件

    下面是在IDEA上运行的代码:

    import java.io.IOException;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    public class WordCount {

        /**
         * Driver for the word-count MapReduce job: reads comma-separated words
         * from {@code /wordcount/wordcount.txt} on HDFS and writes per-word
         * counts to {@code /wordcount_out}.
         *
         * @param args unused; input and output paths are hard-coded below
         * @throws IOException            if job setup or submission fails
         * @throws ClassNotFoundException if a job class cannot be resolved
         * @throws InterruptedException   if the thread is interrupted while waiting
         */
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            Job job = Job.getInstance();
            job.setJobName("WordCount");
            job.setJarByClass(WordCount.class);
            job.setMapperClass(doMapper.class);
            // The reducer doubles as a combiner: integer summation is associative
            // and commutative, so local pre-aggregation on each mapper is safe
            // and reduces shuffle traffic.
            job.setCombinerClass(doReducer.class);
            job.setReducerClass(doReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            Path in = new Path("hdfs://192.168.58.128:9000/wordcount/wordcount.txt");
            Path out = new Path("hdfs://192.168.58.128:9000/wordcount_out");
            FileInputFormat.addInputPath(job, in);    // input lines become (byte offset, line text) pairs
            FileOutputFormat.setOutputPath(job, out); // NOTE: the output directory must not already exist on HDFS
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }

        /** Mapper: splits each input line on ',' and emits (word, 1) per token. */
        public static class doMapper extends Mapper<Object, Text, Text, IntWritable> {
            // Constant count emitted with every word.
            private static final IntWritable ONE = new IntWritable(1);
            // Reusable output key: context.write serializes the value immediately,
            // so reusing one Text instance per mapper avoids per-token allocation.
            // (Instance field, not static — each map task owns its own copy.)
            private final Text word = new Text();

            @Override
            protected void map(Object key, Text value, Context context)
                    throws IOException, InterruptedException {
                for (String token : value.toString().split(",")) {
                    word.set(token);
                    context.write(word, ONE);
                }
            }
        }

        /** Reducer: sums the counts for each word and emits (word, total). */
        public static class doReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
            // Reusable output value, same allocation-avoidance idiom as the mapper.
            private final IntWritable result = new IntWritable();

            @Override
            protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable value : values) {
                    sum += value.get();
                }
                result.set(sum);
                context.write(key, result);
            }
        }
    }

    成功之后可以在HDFS上看到新的文件夹wordcount_out

    里面的part-r-00000文件会显示单词统计的结果

    下载下来后可以看到

  • 相关阅读:
    Linux下date命令,格式化输出,时间设置
    Linux scp复制文件,不需要输入密码的技巧
    Linux中cp和scp命令的使用方法
    Linux定时任务系统 Cron
    Eclipse启动Tomcat后无法访问项目
    eclipse下tomcat插件配置说明
    RPM方式安装MySQL5.6和windows下安装mysql解压版
    shell script练习
    Eclipse Java注释模板设置详解
    mysql备份还原数据库
  • 原文地址:https://www.cnblogs.com/a155-/p/13908424.html
Copyright © 2011-2022 走看看