zoukankan      html  css  js  c++  java
  • hadoop wordcount

    使用java写出wordcount

    1.创建项目

    加入依赖

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      Maven build for the Hadoop word-count example.
      All org.apache.hadoop artifacts share ${hadoop.version} so the client
      libraries stay mutually compatible; change the version in <properties> only.
    -->
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
    
        <groupId>com.xiaodao</groupId>
        <artifactId>hadoop001</artifactId>
        <version>1.0</version>
    
        <properties>
            <!-- Single source of truth for the Hadoop version; should match the cluster. -->
            <hadoop.version>2.6.0</hadoop.version>
        </properties>
        <dependencies>
            <dependency>
                <groupId>commons-cli</groupId>
                <artifactId>commons-cli</artifactId>
                <version>1.2</version>
            </dependency>
            <dependency>
                <groupId>commons-logging</groupId>
                <artifactId>commons-logging</artifactId>
                <version>1.1.3</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
                <version>${hadoop.version}</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-common</artifactId>
                <version>${hadoop.version}</version>
            </dependency>
    
            <!-- Disabled: the original note tags this with "3.1.2", presumably for
                 newer Hadoop releases that ship a separate hadoop-hdfs-client
                 artifact. Not used with ${hadoop.version} = 2.6.0. -->
            <!--        <dependency>-->
            <!--            <groupId>org.apache.hadoop</groupId>-->
            <!--            <artifactId>hadoop-hdfs-client</artifactId>-->
            <!--            <version>2.8.0</version>-->
            <!--        </dependency>-->
    
            <!-- hadoop-hdfs is declared exactly once. A second declaration pinned to
                 2.7.3 was removed: duplicate declarations trigger a Maven warning and
                 the version conflicted with the other 2.6.0 Hadoop artifacts. -->
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-hdfs</artifactId>
                <version>${hadoop.version}</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-mapreduce-client-app</artifactId>
                <version>${hadoop.version}</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-mapreduce-client-hs</artifactId>
                <version>${hadoop.version}</version>
            </dependency>
    
    
            <!-- Disabled: explicit logging dependencies, left here for reference. -->
            <!--        <dependency>-->
            <!--            <groupId>org.slf4j</groupId>-->
            <!--            <artifactId>slf4j-api</artifactId>-->
            <!--            <version>1.7.25</version>-->
            <!--        </dependency>-->
            <!--        <dependency>-->
            <!--            <groupId>log4j</groupId>-->
            <!--            <artifactId>log4j</artifactId>-->
            <!--            <version>1.2.17</version>-->
            <!--        </dependency>-->
        </dependencies>
    
    
    </project>
    View Code

    在resources文件夹下:

     启动log文件的配置:

    # Root logger: level DEBUG, attached to the "console" and "FILE" appenders.
    # Each appender below sets its own threshold to INFO.
    log4j.rootLogger=DEBUG,console,FILE
    
    # ---- console appender ----
    log4j.appender.console=org.apache.log4j.ConsoleAppender
    log4j.appender.console.layout=org.apache.log4j.PatternLayout
    log4j.appender.console.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} [%5p] - %c -%F(%L) -%m%n
    log4j.appender.console.threshold=INFO
    
    # ---- rolling file appender: appends to logs/log4jtest.log, max size 10MB ----
    log4j.appender.FILE=org.apache.log4j.RollingFileAppender
    log4j.appender.FILE.File=logs/log4jtest.log
    log4j.appender.FILE.MaxFileSize=10MB
    log4j.appender.FILE.Append=true
    log4j.appender.FILE.layout=org.apache.log4j.PatternLayout
    log4j.appender.FILE.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} [%5p] - %c -%F(%L) -%m%n
    log4j.appender.FILE.Threshold=INFO

    mapred-site.xml

    <?xml version="1.0"?>
    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
    <!--
      Licensed under the Apache License, Version 2.0 (the "License");
      you may not use this file except in compliance with the License.
      You may obtain a copy of the License at
    
        http://www.apache.org/licenses/LICENSE-2.0
    
      Unless required by applicable law or agreed to in writing, software
      distributed under the License is distributed on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      See the License for the specific language governing permissions and
      limitations under the License. See accompanying LICENSE file.
    -->
    
    <!-- Put site-specific property overrides in this file. -->
    
    <configuration>
        <!-- mapreduce.framework.name selects the runtime framework used to execute
             MapReduce jobs. "local" is set here so the example can be run without
             submitting to a cluster scheduler.
             NOTE(review): to run on the cluster via YARN this would presumably need
             to be "yarn"; confirm against the cluster's own mapred-site.xml. -->
        <property>
            <name>mapreduce.framework.name</name>
            <value>local</value>
        </property>
    </configuration>

    剩下的配置文件,直接把你集群中的配置文件 copy 进来即可。

    mapper代码

    /**
     * Mapper of the word-count job.
     *
     * <p>For every input line it emits one {@code (token, 1)} pair per
     * whitespace-separated token. Blank lines and runs of consecutive
     * whitespace produce no output, so empty strings are never counted as words.
     */
    public class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> {

        /** Count emitted for every token; shared so no object is allocated per record. */
        private static final IntWritable ONE = new IntWritable(1);

        /** Output key instance that is refilled for each token before being written. */
        private final Text word = new Text();

        /**
         * Tokenizes one input line and writes {@code (token, 1)} for each token.
         *
         * @param key     input key supplied by the input format (not used here)
         * @param value   one line of input text
         * @param context sink for the emitted pairs
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // StringTokenizer's default delimiters are whitespace (space, tab, newline, ...),
            // and it never returns empty tokens, unlike String.split(" ").
            java.util.StringTokenizer tokens = new java.util.StringTokenizer(value.toString());
            while (tokens.hasMoreTokens()) {
                word.set(tokens.nextToken());
                context.write(word, ONE);
            }
        }
    }

    reducer代码

    /**
     * Reducer of the word-count job.
     *
     * <p>Receives every count emitted for one word and writes a single
     * {@code (word, total)} pair. This is the class the driver registers via
     * {@code job.setReducerClass(WordCountReduce.class)}.
     */
    public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        /** Output value instance that is refilled for each key before being written. */
        private final IntWritable total = new IntWritable();

        /**
         * Sums all counts for {@code key} and emits the total.
         *
         * @param key     the word being counted
         * @param values  the individual counts emitted for this word
         * @param context sink for the emitted pair
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable count : values) {
                sum += count.get();
            }
            total.set(sum);
            context.write(key, total);
        }
    }

    main 方法:

    /**
     * Driver for the word-count job: wires the mapper, reducer, input/output
     * formats and paths into a {@code Job}, submits it and waits for completion.
     */
    public class WordCountMain {

        /**
         * Entry point.
         *
         * @param args exactly two elements: {@code args[0]} is the input path,
         *             {@code args[1]} is the output path
         * @throws IOException            if the job cannot be configured or submitted
         * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
         * @throws InterruptedException   if interrupted while waiting for the job
         */
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            // Expect exactly: 1. an input path, 2. an output path.
            // The null check must come first so args.length is never read on null.
            if (args == null || args.length != 2) {
                System.out.println("路径为空");
                // Non-zero status so calling scripts can detect the usage error.
                System.exit(1);
            }
            Configuration configuration = new Configuration();

            // Create the job via the getInstance factory, named after this class.
            Job job = Job.getInstance(configuration, WordCountMain.class.getSimpleName());
            // Identify the jar to ship by the class it contains.
            job.setJarByClass(WordCountMain.class);

            // 1. Input/output formats (set explicitly here; other InputFormat /
            //    OutputFormat subclasses can be plugged in instead).
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            // 2. Input and output paths.
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            // 3. Mapper and reducer classes.
            job.setMapperClass(WordCountMap.class);
            job.setReducerClass(WordCountReduce.class);

            // Optional: the reducer could also be registered as a combiner:
            //     job.setCombinerClass(WordCountReduce.class);
            // If the map output key/value types equal the reduce output types,
            // setMapOutputKeyClass / setMapOutputValueClass need not be called.

            // Key/value types written by the reduce tasks.
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            // Submit the job, wait for it, and report failure through the exit status.
            boolean succeeded = job.waitForCompletion(true);
            System.exit(succeeded ? 0 : 1);
        }
    }

    参数

    输入文件

    nancy 22 8000
    ketty 22 9000
    stone 19 10000
    green 19 10000
    white 30 29000
    socrates 29 40000 

    输入和输出都在hdfs上.所以我们的参数为

    hdfs://xiaodao:9000/salary.txt hdfs://xiaodao:9000/0905wordcount

    hadoop jar /Users/xuyuanfang/IdeaProjects/hadoop001/target/hadoop001-1.0.jar com.xiaodao.wordcount.WordCountMain hdfs://xiaodao:9000/salary.txt hdfs://xiaodao:9000/0905wordcount2

    运行之后就可以了.

  • 相关阅读:
    【转】SpringCloud学习
    Springboot中配置druid
    阿里云sql监控配置-druid
    Linux中Java开发常用的软件总结:
    java 搞笑注释
    Python之路-pandas包的详解与使用
    Python之路-numpy模块
    Python之路-Python中的线程与进程
    Python之路-Python常用模块-time模块
    Python之路-Python中文件和异常
  • 原文地址:https://www.cnblogs.com/bj-xiaodao/p/11466966.html
Copyright © 2011-2022 走看看