zoukankan      html  css  js  c++  java
  • 数据清洗使用本机的java代码

    驱动类

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    public class ETLDriver implements Tool {

    private Configuration configuration;

    public int run(String[] strings) throws Exception {
    //创建Job
    Job job = Job.getInstance(configuration);

    //设置运行环境
    job.setJarByClass(ETLDriver.class);

    //设置对应的MapperReduce类
    job.setMapperClass(ETLMapper.class);

    //设置Mapper输出的
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);

    //设置全局的输出
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    //设置输出输入路径
    FileInputFormat.setInputPaths(job,new Path(strings[0]));
    FileOutputFormat.setOutputPath(job,new Path(strings[1]));

    //不需要reduce
    job.setNumReduceTasks(0);

    //提交
    job.submit();
    return 1;
    }

    public void setConf(Configuration configuration) {
    this.configuration=configuration;
    }

    public Configuration getConf() {
    return configuration;
    }

    //主函数
    public static void main(String[] args) throws Exception{
    ToolRunner.run(new ETLDriver(),args);
    }
    }

    2. Maven 依赖

    <dependencies>
        <!-- Pinned to a fixed version: the RELEASE meta-version is deprecated
             in Maven 3 and makes builds non-reproducible. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13.2</version>
        </dependency>
        <!-- Upgraded from 2.8.2, which is affected by Log4Shell (CVE-2021-44228).
             2.17.1 requires Java 8; use 2.12.4 if the build must stay on Java 7. -->
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.17.1</version>
        </dependency>
        <!-- Hadoop artifacts: keep all three on the same version. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.2</version>
        </dependency>

    </dependencies>

  • 相关阅读:
    苹果信息推送服务(Apple Push Notification Service)使用总结
    Xcode 相关路径总结
    微信红包随机算法 OC
    Xcode真机测试could not find developer disk image解决方法
    字典转模型 重写初始化方法
    Xcode 写代码没有补全提示解决:删缓存及显示隐藏文件命令
    按位与、或、异或等运算方法
    OC语言@property @synthesize和id
    iOS开发—Quartz2D简单介绍
    iOS开发—CoreLocation定位服务
  • 原文地址:https://www.cnblogs.com/enough/p/15703580.html
Copyright © 2011-2022 走看看