zoukankan      html  css  js  c++  java
  • hadoop worldcount小程序

    首先在 HDFS 上建立 input 文件夹(对应代码里的 /user/hadoop/input),在里面放几个文本文件,随便写点内容。比如我放了三个,内容分别是:

    第一个

    hello hadoop

    bye hadoop

    第二个

    hello world

    bye world

    第三个

    hello bigdata

    然后就有下边这段代码做单词统计:

    import java.io.File;
    import java.io.IOException;
    import java.net.URI;
    import java.net.URISyntaxException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    17 public class WorldCount {    
    18     
    19     static final String INPUT_PATH = "hdfs://masters:9000/user/hadoop/input";
    20     static final String OUTPUT_PATH = "hdfs://masters:9000/user/hadoop/output";
    21     public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
    22         
    23         //添加以下的代码,就可以联通,不知道咋回事
    24         String path = new File(".").getCanonicalPath();
    25         System.getProperties().put("hadoop.home.dir", path);
    26         new File("./bin").mkdirs();
    27         new File("./bin/winutils.exe").createNewFile();
    28 
    29         Configuration conf = new Configuration();
    30         Path outpath = new Path(OUTPUT_PATH);
    31         
    32         Job job = new Job(conf, "WorldCount");
    33         
    34         FileInputFormat.setInputPaths(job, INPUT_PATH);
    35         FileOutputFormat.setOutputPath(job, outpath);
    36         
    37         //检测输出路径是否存在,如果存在就删除,否则会报错
    38         FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), conf);
    39         if(fileSystem.exists(outpath)){
    40             fileSystem.delete(outpath, true);
    41         }
    42         
    43         job.setMapperClass(MyMapper.class);
    44         job.setReducerClass(MyReducer.class);
    45         job.setOutputKeyClass(Text.class);
    46         job.setOutputValueClass(LongWritable.class);
    47         job.waitForCompletion(true);
    48     }
    49     
    50     //输入,map,即拆分过程
    51     static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
    52         
    53         /*
    54          * 输入为(key,value)输出为(value,count数量)
    55          * 所以LongWritable, Text, Text, LongWritable分别代表 key(行号) value value count
    56          * 其中LongWritable和Text是hadoop定义的类型,分别代表long和string两种类型
    57          * */
    58         protected void map(LongWritable k1, Text v1, Context context)throws IOException, InterruptedException{
    59             String[] splits = v1.toString().split(" ");//按照空格拆分
    60             for(String str: splits){
    61                 System.out.println("---" + str);
    62                 context.write(new Text(str), new LongWritable(1));//拆分出来的形式为(“单词”,出现次数(这里默认为1))
    63             }
    64         }
    65     }
    66     
    67     //输出,reduce,汇总过程
    68     static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable>{
    69         protected void reduce(
    70                 Text k2, //输出的内容,即value
    71                 Iterable<LongWritable> v2s, //是一个longwritable类型的数组,所以用了Iterable这个迭代器,且元素为v2s
    72                 org.apache.hadoop.mapreduce.Reducer<Text, LongWritable, Text, LongWritable>.Context context)
    73                 //这里一定设置好,不然输出会变成单个单词,从而没有统计数量
    74                 throws IOException, InterruptedException {
    75             //列表求和 初始为0
    76             long times = 0L;
    77             for(LongWritable count:v2s){
    78                 times += count.get();
    79             }
    80             context.write(k2, new LongWritable(times));
    81         }
    82     }
    83 }

    然后就成了,看下结果。按上面三个文件的内容,输出目录(/user/hadoop/output)下的结果文件内容应为:

    bigdata	1
    bye	2
    hadoop	2
    hello	3
    world	2

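    main 方法开头设置 hadoop.home.dir 并创建 bin/winutils.exe 的那几行(原文代码第23行到第27行)不写就会报错。原因应该是:在 Windows 上运行时,Hadoop 客户端启动时会到 hadoop.home.dir(或环境变量 HADOOP_HOME)指向的目录下查找 bin\winutils.exe,找不到就会报错;这几行只是把该属性指向当前目录,并创建一个空的 winutils.exe 占位文件,让这个检查通过。如果哪位大牛知道更规范的做法,非常期待留言解答。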

  • 相关阅读:
    css世界六
    电子书
    es 浏览器支持情况地址
    mac 下载文件的一些地址
    NODE_ENV production / development
    css世界五
    css世界四
    css世界三
    css世界二
    关于递归算法
  • 原文地址:https://www.cnblogs.com/K-artorias/p/7065661.html
Copyright © 2011-2022 走看看