  • Creating a MapReduce project in IntelliJ IDEA

    1. Create a Maven project
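    In IntelliJ IDEA this is File → New → Project…, pick Maven, and fill in the GroupId and ArtifactId (com.sogou / teemo-dc-etl below); the IDE generates the standard src/main/java layout and an empty pom.xml, which the next step fills in.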

    2. The POM file

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>

        <groupId>com.sogou</groupId>
        <artifactId>teemo-dc-etl</artifactId>
        <version>1.0.0</version>
        <packaging>jar</packaging>

        <name>teemo-dc-etl</name>
        <url>http://maven.apache.org</url>

        <properties>
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
            <mahout.version>0.5</mahout.version>
            <mahout.groupid>org.apache.mahout</mahout.groupid>
            <spring.version>3.0.6.RELEASE</spring.version>
        </properties>

        <!-- Twitter's repository, needed for the hadoop-lzo artifact below -->
        <repositories>
            <repository>
                <id>maven-twttr</id>
                <url>http://maven.twttr.com/</url>
                <releases>
                    <enabled>true</enabled>
                </releases>
                <snapshots>
                    <enabled>true</enabled>
                    <updatePolicy>always</updatePolicy>
                    <checksumPolicy>fail</checksumPolicy>
                </snapshots>
            </repository>
        </repositories>

        <dependencies>
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>3.8.1</version>
                <scope>test</scope>
            </dependency>

            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-hdfs</artifactId>
                <version>2.5.0</version>
            </dependency>

            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>2.5.1</version>
            </dependency>

            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-common</artifactId>
                <version>2.5.0</version>
            </dependency>

            <dependency>
                <groupId>com.hadoop.gplcompression</groupId>
                <artifactId>hadoop-lzo</artifactId>
                <version>0.4.19</version>
            </dependency>

            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-yarn-common</artifactId>
                <version>2.5.2</version>
            </dependency>

            <dependency>
                <groupId>com.alibaba</groupId>
                <artifactId>fastjson</artifactId>
                <version>1.2.4</version>
            </dependency>
        </dependencies>

        <build>
            <plugins>
                <!--
                Bind the maven-assembly-plugin to the package phase;
                this builds a single jar with all dependencies bundled,
                suitable for submitting to a cluster.
                -->
                <plugin>
                    <artifactId>maven-assembly-plugin</artifactId>
                    <configuration>
                        <archive>
                            <manifest>
                                <!-- set the driver class here, e.g. com.sogou.teemo.test.WordCount -->
                                <mainClass></mainClass>
                            </manifest>
                        </archive>
                        <descriptorRefs>
                            <descriptorRef>jar-with-dependencies</descriptorRef>
                        </descriptorRefs>
                    </configuration>
                    <executions>
                        <execution>
                            <id>make-assembly</id>
                            <phase>package</phase> <!-- packaging phase -->
                            <goals>
                                <goal>single</goal>
                            </goals>
                        </execution>
                    </executions>
                </plugin>

                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <configuration>
                        <source>1.6</source>
                        <target>1.6</target>
                        <encoding>UTF-8</encoding>
                    </configuration>
                </plugin>

                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <version>2.14.1</version>
                    <configuration>
                        <argLine>-Xmx2048m</argLine>
                    </configuration>
                </plugin>
            </plugins>
        </build>
    </project>
    Note the hadoop-lzo dependency: it is not in Maven Central, which is why Twitter's repository (http://maven.twttr.com/, declared in the <repositories> section above) has to be added.
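    The POM only pulls in the codec classes; the example below never uses them. As a minimal sketch (not part of the original post, and assuming the LZO native libraries are installed where the job runs), the codec could be applied to a job's output from the driver; the helper class name here is purely illustrative:

    import com.hadoop.compression.lzo.LzopCodec;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class LzoOutputHelper {
        /* Switch a job's file output to LZO compression. */
        static void enableLzoOutput(Job job) {
            FileOutputFormat.setCompressOutput(job, true);                   // turn output compression on
            FileOutputFormat.setOutputCompressorClass(job, LzopCodec.class); // write .lzo files
        }
    }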
    3. Writing the MapReduce class
    package com.sogou.teemo.test;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    import java.io.IOException;
    import java.util.StringTokenizer;

    public class WordCount {

        /* Mapper: emits (word, 1) for every token in the input */
        public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
            private final static IntWritable one = new IntWritable(1);
            private Text word = new Text();

            @Override
            public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
                StringTokenizer itr = new StringTokenizer(value.toString());
                while (itr.hasMoreTokens()) {
                    word.set(itr.nextToken());
                    context.write(word, one);
                }
            }
        }

        /* Reducer: sums the counts for each word */
        public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
            private IntWritable result = new IntWritable();

            @Override
            public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                result.set(sum);
                context.write(key, result);
            }
        }

        /* Launch the MapReduce job */
        public static void main(String[] args) throws Exception {
            // Points Hadoop at a local distribution (winutils.exe) when running on Windows
            System.setProperty("hadoop.home.dir", "D:/hadoop-2.6.5");
            Configuration conf = new Configuration();
            /*if (args.length != 2) {
                System.err.println("Usage: wordcount <in> <out>");
                System.exit(2);
            }*/
            String arg1 = "input";
            String arg2 = "output";
            Job job = Job.getInstance(conf, "word count"); // new Job(conf, ...) is deprecated
            job.setJarByClass(WordCount.class);
            job.setMapperClass(TokenizerMapper.class);
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, new Path(arg1));
            FileOutputFormat.setOutputPath(job, new Path(arg2));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
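    Two practical notes. First, with the assembly plugin above, mvn package produces target/teemo-dc-etl-1.0.0-jar-with-dependencies.jar (alongside the plain jar), which can be submitted with hadoop jar and the class name com.sogou.teemo.test.WordCount. Second, FileOutputFormat refuses to start if the output directory already exists, so re-running the example throws FileAlreadyExistsException. A minimal sketch (not in the original code) of a helper that could be added to WordCount, with the extra imports at the top of the file, and called in main() before FileOutputFormat.setOutputPath(...):

    import java.io.IOException;
    import org.apache.hadoop.fs.FileSystem;

    /* Illustrative helper: call deleteIfExists(conf, arg2) before setting
       the output path so the job can be re-run. */
    static void deleteIfExists(Configuration conf, String dir) throws IOException {
        FileSystem fs = FileSystem.get(conf); // local FS or HDFS, per the Configuration
        Path out = new Path(dir);
        if (fs.exists(out)) {
            fs.delete(out, true);             // true = delete recursively
        }
    }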


  • Original article: https://www.cnblogs.com/shenguo/p/10483161.html