  • MapReduce: reading a file with word tokenization

    MapReduce

    Example 1: reading a file with word tokenization

    1.1 First, add the required jar dependencies

    <dependency>
      <groupId>org.apache.logging.log4j</groupId>
      <artifactId>log4j-core</artifactId>
      <version>2.8.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.7.3</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.7.3</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.7.3</version>
    </dependency>
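
    These `<dependency>` elements belong inside the `<dependencies>` element of a Maven `pom.xml`. As a sketch of where they fit (the `groupId` below is a placeholder; the `artifactId`/`version` match the jar name used in step 1.5):

    ```xml
    <project xmlns="http://maven.apache.org/POM/4.0.0">
      <modelVersion>4.0.0</modelVersion>
      <groupId>com.example</groupId>        <!-- placeholder -->
      <artifactId>mapreduce</artifactId>    <!-- produces mapreduce-1.0-SNAPSHOT.jar -->
      <version>1.0-SNAPSHOT</version>
      <dependencies>
        <!-- the four <dependency> blocks above go here -->
      </dependencies>
    </project>
    ```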
    

    1.2 Write the Mapper

    private final static LongWritable one = new LongWritable(1);
    private Text word = new Text();
    
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Build a StringTokenizer over the line. Java's default delimiters are
        // the space, tab ('\t'), newline ('\n') and carriage-return ('\r') characters.
        StringTokenizer st = new StringTokenizer(value.toString());
        while (st.hasMoreTokens()) {     // true while more tokens remain
            word.set(st.nextToken());    // the substring up to the next delimiter
            context.write(word, one);    // emit (word, 1)
        }
    }
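
    The tokenization step can be tried outside Hadoop. This is a plain-JDK sketch (class and method names are illustrative, not part of the example project) showing how `StringTokenizer`'s default delimiters split a line:

    ```java
    import java.util.ArrayList;
    import java.util.List;
    import java.util.StringTokenizer;

    // Demonstrates the default StringTokenizer delimiters (space, tab,
    // newline, carriage return) that the Mapper above relies on.
    public class TokenizerDemo {
        public static List<String> tokenize(String line) {
            List<String> tokens = new ArrayList<>();
            StringTokenizer st = new StringTokenizer(line);
            while (st.hasMoreTokens()) {
                tokens.add(st.nextToken());
            }
            return tokens;
        }

        public static void main(String[] args) {
            System.out.println(tokenize("hello world\thadoop\nmapreduce"));
            // prints [hello, world, hadoop, mapreduce]
        }
    }
    ```

    Note that consecutive delimiters are collapsed, so empty tokens are never produced.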
    

    1.3 Write the Reducer

    private final static LongWritable result = new LongWritable();
    
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long sum = 0;                    // sum the 1s emitted by the Mapper for this word
        for (LongWritable value : values) {
            sum += value.get();
        }
        result.set(sum);
        context.write(key, result);      // emit (word, total count)
    }
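
    What the shuffle and reduce phases compute for this job can be simulated in plain Java. This sketch (a hypothetical helper, not Hadoop API) groups tokens by word and sums a 1 per occurrence, just as the Reducer above sums the `LongWritable(1)` values grouped under each key:

    ```java
    import java.util.HashMap;
    import java.util.Map;
    import java.util.StringTokenizer;

    // Single-process simulation of map (tokenize, emit 1) followed by
    // reduce (sum the 1s per word).
    public class WordCountDemo {
        public static Map<String, Long> countWords(String text) {
            Map<String, Long> counts = new HashMap<>();
            StringTokenizer st = new StringTokenizer(text);
            while (st.hasMoreTokens()) {
                // merge() plays the role of reduce: add this occurrence's 1
                // to the running total for the word.
                counts.merge(st.nextToken(), 1L, Long::sum);
            }
            return counts;
        }

        public static void main(String[] args) {
            System.out.println(countWords("to be or not to be"));
        }
    }
    ```

    In the real job the grouping is done by the framework between the map and reduce phases; the Reducer only ever sees one key with all of its values.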
    

    1.4 Write the job driver

    Job job = Job.getInstance(new Configuration());
    job.setJarByClass(TextJob.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setMapperClass(TextMapper.class);
    // job.setCombinerClass(TextReduce.class);
    job.setReducerClass(TextReduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.waitForCompletion(true);
    

    1.5 Running the job against HDFS:

    [root@head42 ~]# hadoop jar mapreduce-1.0-SNAPSHOT.jar com.njbd.normal.text1.TextJob /text /output14

    Notes: mapreduce-1.0-SNAPSHOT.jar: the packaged jar

               com.njbd.normal.text1.TextJob: the fully qualified name of the job class

               /text: the input directory; /output14: the output directory

  • Original post: https://www.cnblogs.com/tudousiya/p/11241441.html