Getting started with Hadoop: running the first WordCount

    1. Environment setup

    Download: http://mirror.bit.edu.cn/apache/hadoop/common/hadoop-2.7.2/hadoop-2.7.2.tar.gz

    Extract: after unpacking the tarball, set JAVA_HOME in etc/hadoop/hadoop-env.sh. My JAVA_HOME (you can check yours with cat /etc/profile) is /usr/java/latest.
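
    A minimal sketch of these two steps as shell commands (the install location and the JAVA_HOME value are only examples; adjust them to your machine):

    $ wget http://mirror.bit.edu.cn/apache/hadoop/common/hadoop-2.7.2/hadoop-2.7.2.tar.gz
    $ tar -xzf hadoop-2.7.2.tar.gz
    $ cd hadoop-2.7.2
    # append the JDK location so the Hadoop scripts can find Java (use your own JAVA_HOME)
    $ echo 'export JAVA_HOME=/usr/java/latest' >> etc/hadoop/hadoop-env.sh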

    2. Hadoop Single-Node Cluster

    Follow the official guide: http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SingleCluster.html
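
    In short, the pseudo-distributed setup described there comes down to pointing fs.defaultFS at hdfs://localhost:9000 in etc/hadoop/core-site.xml, setting dfs.replication to 1 in etc/hadoop/hdfs-site.xml, and then formatting and starting HDFS. A rough sketch of the command side (the HDFS input path matches the one used in the run command further down; the local file name is only an example):

    $ bin/hdfs namenode -format        # format the HDFS filesystem
    $ sbin/start-dfs.sh                # start the NameNode and DataNode daemons
    $ bin/hdfs dfs -mkdir -p /user/wangke/wordcount/input
    $ bin/hdfs dfs -put sometext.txt /user/wangke/wordcount/input    # upload an example input file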

    3. The WordCount example

    a. Maven configuration (pom.xml)

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
      <modelVersion>4.0.0</modelVersion>
      <groupId>my.hadoopstudy</groupId>
      <artifactId>hadoopstudy</artifactId>
      <packaging>jar</packaging>
      <version>1.0-SNAPSHOT</version>
      <name>hadoopstudy</name>
      <url>http://maven.apache.org</url>
      <dependencies>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-common</artifactId>
          <version>2.5.1</version>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-hdfs</artifactId>
          <version>2.5.1</version>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client</artifactId>
          <version>2.5.1</version>
        </dependency>
        <dependency>
          <groupId>jdk.tools</groupId>
          <artifactId>jdk.tools</artifactId>
          <version>1.8.0_65</version>
          <scope>system</scope>
          <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
        </dependency>
        <dependency>
          <groupId>junit</groupId>
          <artifactId>junit</artifactId>
          <version>3.8.1</version>
          <scope>test</scope>
        </dependency>
      </dependencies>
    </project>

    b. Mapper code:

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

        // reusable Writable objects: the constant count 1 and the current word
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // split each input line into tokens and emit (word, 1) for every token
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    c. Reducer code:

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        // sum all the counts emitted for the same word and write (word, total)
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    d. The complete Java source:

    package my.hadoopstudy.mapreduce;
    
    import java.io.IOException;
    import java.util.StringTokenizer;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    public class WordCount {
        public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
    
            private final static IntWritable one = new IntWritable(1);
            private Text word = new Text();
    
            public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
                StringTokenizer itr = new StringTokenizer(value.toString());
                while (itr.hasMoreTokens()) {
                    word.set(itr.nextToken());
                    context.write(word, one);
                }
            }
        }
    
        public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
            private IntWritable result = new IntWritable();
    
            public void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                result.set(sum);
                context.write(key, result);
            }
        }
    
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf, "word count");
            job.setJarByClass(WordCount.class);
            job.setMapperClass(TokenizerMapper.class);
            // the reducer doubles as a combiner, pre-aggregating counts on the map side
            job.setCombinerClass(IntSumReducer.class);
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // args[0] = HDFS input directory, args[1] = HDFS output directory (must not exist yet)
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }

    e. Package: run mvn package in the project root, then copy the jar from target/ into a study directory created under the Hadoop installation directory.
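
    For example (a sketch; /path/to/hadoop-2.7.2 stands for wherever Hadoop was unpacked):

    $ cd hadoopstudy                   # the Maven project root
    $ mvn package                      # produces target/hadoopstudy-1.0-SNAPSHOT.jar
    $ mkdir -p /path/to/hadoop-2.7.2/study
    $ cp target/hadoopstudy-1.0-SNAPSHOT.jar /path/to/hadoop-2.7.2/study/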

    f. Run: bin/hadoop jar study/hadoopstudy-1.0-SNAPSHOT.jar my.hadoopstudy.mapreduce.WordCount /user/wangke/wordcount/input /user/wangke/wordcount/output

    4. Problems encountered and their solutions:

    a. Remember to change JAVA_HOME (in etc/hadoop/hadoop-env.sh).

    b. Edit the relevant XML configuration files (core-site.xml, hdfs-site.xml) as described in the official guide from step 2.

    c. The second time I went through the official pseudo-distributed setup, I hit this error: there are 0 datanodes running and no node(s) are excluded in this operation.

    Solution: run sbin/stop-all.sh, delete the DataNode's current directory (rm -r /tmp/hadoop-admin/dfs/data/current), and then go through the pseudo-distributed setup again; after that the error is gone.
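
    A sketch of that recovery sequence, assuming the default data directory under /tmp shown above (the exact path depends on hadoop.tmp.dir and the user running Hadoop):

    $ sbin/stop-all.sh                               # stop all HDFS/YARN daemons
    $ rm -r /tmp/hadoop-admin/dfs/data/current       # discard the stale DataNode state
    $ bin/hdfs namenode -format                      # redo the pseudo-distributed setup...
    $ sbin/start-dfs.sh                              # ...and bring HDFS back up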

    d. When running the jar in pseudo-distributed mode, the client kept printing "Connecting to ResourceManager" and retrying without ever getting through. The cause was that YARN had not been started (I had assumed YARN was not needed at all for this local run, but the ResourceManager port 8032 only opens once YARN is up).

    Solution: edit the yarn-site.xml configuration:

    <configuration>
    
    <!-- Site specific YARN configuration properties -->
        <property>
            <name>yarn.nodemanager.aux-services</name>
            <value>mapreduce_shuffle</value>
        </property>
        <property>
            <name>yarn.resourcemanager.address</name>
            <value>127.0.0.1:8032</value>
        </property>
        <property>
            <name>yarn.resourcemanager.scheduler.address</name>
            <value>127.0.0.1:8030</value>
        </property>
        <property>
            <name>yarn.resourcemanager.resource-tracker.address</name>
            <value>127.0.0.1:8031</value>
        </property>
    </configuration>

    After running sbin/start-yarn.sh, the job executes successfully, and bin/hdfs dfs -cat /user/wangke/wordcount/output/part-r-00000 shows the results are correct.
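
    As a quick sanity check after starting YARN, jps (shipped with the JDK) should list all of the expected daemons before the job is re-run:

    $ sbin/start-yarn.sh               # starts the ResourceManager and NodeManager
    $ jps                              # NameNode, DataNode, ResourceManager and NodeManager should all appear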
