zoukankan html css js c++ java

mapreduce方式操作hbase

一、导入数据到hbase

1、配置hbase-site.xml指向hdfs

<configuration>
  <!-- Root directory of HBase data; here it lives on HDFS. -->
  <property>
    <name>hbase.rootdir</name>
    <value>hdfs://bigdata-senior01.home.com:9000/hbase</value>
  </property>
  <!-- NOTE(review): this property is handed to ZooKeeper as its dataDir, which
       is normally a directory on the local filesystem; an hdfs:// URI here
       looks unintended. Confirm against the HBase reference guide. -->
  <property>
    <name>hbase.zookeeper.property.dataDir</name>
    <value>hdfs://bigdata-senior01.home.com:9000/hbase/zookeeper</value>
  </property>
  <!-- Disables the hflush/hsync capability check; see the warning in the description. -->
  <property>
    <name>hbase.unsafe.stream.capability.enforce</name>
    <value>false</value>
    <description>
      Controls whether HBase will check for stream capabilities (hflush/hsync).

      Disable this if you intend to run on LocalFileSystem, denoted by a rootdir
      with the 'file://' scheme, but be mindful of the NOTE below.

      WARNING: Setting this to false blinds you to potential data loss and
      inconsistent system state in the event of process and/or node failures. If
      HBase is complaining of an inability to use hsync or hflush it's most
      likely not a false positive.
    </description>
  </property>
</configuration>

2、依赖

        <!-- Hadoop client libraries (MapReduce Job, Mapper, FileInputFormat, ...). -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.2.0</version>
        </dependency>

        <!-- HBase client API (Put, Scan, Result, Bytes, ...). -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>2.0.4</version>
        </dependency>

        <!-- HBase MapReduce integration (org.apache.hadoop.hbase.mapreduce.*). -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>2.0.4</version>
        </dependency>

3、mapper

/**
 * Map-only importer: reads text lines and emits one HBase {@link Put} per line.
 *
 * <p>Input is plain text (byte offset / line). The output key is the row key
 * bytes and the output value is the mutation to apply to the target table.
 * The target column is read from the job configuration key {@code conf.column}
 * in {@code family:qualifier} form.
 */
public class ImportMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Mutation> {
    /** Job counters maintained by this mapper. */
    public enum Counters {
        /** Number of input lines emitted as puts. */
        LINES
    }

    private byte[] family = null;
    private byte[] qualifier = null;

    /**
     * Called once at the beginning of the task; resolves the target column.
     *
     * @param context task context carrying the job configuration
     * @throws IOException if {@code conf.column} is missing or not in
     *                     {@code family:qualifier} form
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // The driver stores the column given on the command line under this key.
        String column = context.getConfiguration().get("conf.column");
        ColParser parser = new ColParser();
        parser.parse(column);
        if (!parser.isValid()) {
            throw new IOException("family or qualifier error");
        }
        family = parser.getFamily();
        qualifier = parser.getQualifier();
    }

    /**
     * Converts one input line into a {@link Put} and writes it to the output.
     *
     * <p>Failures while writing are propagated so that the task fails (and can
     * be retried) instead of silently dropping the record.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   the input line
     * @param context task context used to emit the put and update counters
     * @throws IOException          if the put cannot be written
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // The MD5 digest of the line is used as the row key; adjust to your needs.
        byte[] rowKey = DigestUtils.md5(line);
        Put put = new Put(rowKey);
        put.addColumn(this.family, this.qualifier, Bytes.toBytes(line));
        context.write(new ImmutableBytesWritable(rowKey), put);
        context.getCounter(Counters.LINES).increment(1);
    }

    /**
     * Parses a {@code family:qualifier} column specification.
     *
     * <p>Declared static because it does not use any state of the enclosing mapper.
     */
    static class ColParser {
        private byte[] family;
        private byte[] qualifier;
        private boolean valid;

        /** @return the family bytes of the last successful parse, otherwise {@code null} */
        public byte[] getFamily() {
            return family;
        }

        /** @return the qualifier bytes of the last successful parse, otherwise {@code null} */
        public byte[] getQualifier() {
            return qualifier;
        }

        /** @return {@code true} if the last call to {@link #parse(String)} succeeded */
        public boolean isValid() {
            return valid;
        }

        /**
         * Splits {@code value} at the first {@code ':'} into family and qualifier.
         *
         * <p>The specification is invalid when it is {@code null}, contains no
         * colon, or has an empty family or qualifier. Everything after the first
         * colon belongs to the qualifier, so a qualifier containing colons is
         * kept intact rather than truncated.
         *
         * @param value column specification, may be {@code null}
         */
        public void parse(String value) {
            // Reset first so a failed parse never exposes results of an earlier one.
            family = null;
            qualifier = null;
            valid = false;

            if (value == null) {
                return;
            }
            int separator = value.indexOf(':');
            if (separator <= 0 || separator == value.length() - 1) {
                return;
            }

            family = Bytes.toBytes(value.substring(0, separator));
            qualifier = Bytes.toBytes(value.substring(separator + 1));
            valid = true;
        }
    }
}

4、main

public class ImportFromFile {
//    private static String HDFSUri = "hdfs://bigdata-senior01.home.com:9000";
    public static final String NAME = "ImportFromFile";

    private static CommandLine parseArgs(String[] args) throws ParseException{
        Options options = new Options();

        Option option = new Option("t","table",true,"表不能为空");
        option.setArgName("table-name");
        option.setRequired(true);
        options.addOption(option);

        option = new Option("c","column",true,"列族和列名不能为空");
        option.setArgName("family:qualifier");
        option.setRequired(true);
        options.addOption(option);

        option = new Option("i","input",true,"输入文件或者目录");
        option.setArgName("path-in-HDFS");
        option.setRequired(true);
        options.addOption(option);

        options.addOption("d","debug",false,"switch on DEBUG log level");
        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options,args);
        }catch (Exception e){
            System.err.println("ERROR: " + e.getMessage() + "
");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(NAME + " ", options, true);
            System.exit(-1);
        }
        if (cmd.hasOption("d")) {
            Logger log = Logger.getLogger("mapreduce");
            log.setLevel(Level.DEBUG);
        }

        return cmd;
    }

    public static void main(String[] args) throws Exception{
        Configuration conf = HBaseConfiguration.create();

        String[] runArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
        CommandLine cmd = parseArgs(runArgs);
        if (cmd.hasOption("d")) conf.set("conf.debug", "true");

        String table = cmd.getOptionValue("t");
        String input = cmd.getOptionValue("i");
        String column = cmd.getOptionValue("c");
        //写入配置后，在mapper阶段取出
        conf.set("conf.column", column);

        Job job = Job.getInstance(conf,"Import from file " + input +" into table " + table);
        job.setJarByClass(ImportFromFile.class);
        job.setMapperClass(ImportMapper.class);
        job.setOutputFormatClass(TableOutputFormat.class);
        job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE,table);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(Writable.class);
        job.setNumReduceTasks(0); //不需要reduce

        FileInputFormat.addInputPath(job,new Path(input));

        System.exit(job.waitForCompletion(true) ? 0 : 1);

    }
}

5、执行

先在HBASE里建表
create 'importTable','data'

把jar包上传到集群节点的本地文件系统后执行（hadoop jar 读取的是本地的jar文件，不需要放到HDFS上）
hadoop jar ImportFromFile.jar -t importTable -i /input/test-data.txt -c data:json

二、从hbase获取数据进行计算

从上例中把hbase数据抽取出来计算作者出现数量

多加一个依赖

      <!-- JSON parser used by the mappers below (org.json.simple). -->
      <dependency>
            <groupId>com.googlecode.json-simple</groupId>
            <artifactId>json-simple</artifactId>
            <version>1.1.1</version>
        </dependency>

1、mapper

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

import java.io.IOException;


/**
 * Table mapper that parses each scanned cell value as a JSON object and emits
 * {@code (author, 1)} for every record that carries an {@code author} field.
 */
public class AnalyzeMapper extends TableMapper<Text,IntWritable> {
    private JSONParser parser = new JSONParser();
    /** Counters: rows seen, cells seen, records that failed, records emitted. */
    public enum Counters { ROWS, COLS, ERROR, VALID }
    /** Shared constant; the emitted count per record is always one. */
    private static final IntWritable ONE = new IntWritable(1);

    /**
     * Processes one table row.
     *
     * <p>Records that cannot be parsed or lack an author are counted under
     * {@code ERROR} and logged. Failures while emitting output are propagated
     * so that the task fails instead of silently losing data.
     *
     * @param key     the row key
     * @param value   the cells returned by the scan for this row
     * @param context task context used to emit output and update counters
     * @throws IOException          if the output cannot be written
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
        context.getCounter(Counters.ROWS).increment(1);
        String val = null;
        try {
            for(Cell cell:value.listCells()){
                context.getCounter(Counters.COLS).increment(1);
                // Decode as UTF-8 text. toStringBinary would hex-escape every
                // non-ASCII byte and corrupt the JSON before it is parsed.
                val = Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength());
                JSONObject json = (JSONObject)parser.parse(val);
                String author = (String)json.get("author");
                if (author == null) {
                    // Text cannot hold null; treat a missing author as a bad record.
                    System.err.println("Row: " + Bytes.toStringBinary(key.get()) +
                            ", JSON: " + val + " (no author)");
                    context.getCounter(Counters.ERROR).increment(1);
                    continue;
                }
                if (context.getConfiguration().get("conf.debug") != null)
                    System.out.println("Author: " + author);
                context.write(new Text(author),ONE);
                context.getCounter(Counters.VALID).increment(1);
            }

        }catch (IOException | InterruptedException e){
            // Output failures are not bad input; let the framework fail the task.
            throw e;
        }catch (Exception e){
            e.printStackTrace();
            // The row key is arbitrary binary, hence the escaped representation.
            System.err.println("Row: " + Bytes.toStringBinary(key.get()) +
                    ", JSON: " + val);
            context.getCounter(Counters.ERROR).increment(1);
        }

    }
}

2、reducer

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;


/**
 * Sums the partial counts emitted for each author and writes the total.
 */
public class AnalyzeReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
    /**
     * Called once per author.
     *
     * <p>The values are summed rather than merely counted, so the result
     * stays correct if the values are pre-aggregated partial sums (for
     * example when this class is also used as a combiner).
     *
     * @param key     the author
     * @param values  partial counts for this author
     * @param context task context used to emit the total
     * @throws IOException          if the output cannot be written
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable partial : values) {
            count += partial.get();
        }

        if (context.getConfiguration().get("conf.debug") != null)
            System.out.println("Author: " + key.toString() + ", Count: " + count);

        context.write(key,new IntWritable(count));
    }
}

3、main

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.commons.cli.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;


public class AnalyzeData {
    private static final Log LOG = LogFactory.getLog(AnalyzeData.class);

    public static final String NAME = "AnalyzeData";


    /**
     * Parse the command line parameters.
     *
     * @param args The parameters to parse.
     * @return The parsed command line.
     * @throws org.apache.commons.cli.ParseException When the parsing of the parameters fails.
     */
    private static CommandLine parseArgs(String[] args) throws ParseException {
        Options options = new Options();
        Option o = new Option("t", "table", true,
                "table to read from (must exist)");
        o.setArgName("table-name");
        o.setRequired(true);
        options.addOption(o);
        o = new Option("c", "column", true,
                "column to read data from (must exist)");
        o.setArgName("family:qualifier");
        options.addOption(o);
        o = new Option("o", "output", true,
                "the directory to write to");
        o.setArgName("path-in-HDFS");
        o.setRequired(true);
        options.addOption(o);
        options.addOption("d", "debug", false, "switch on DEBUG log level");
        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, args);
        } catch (Exception e) {
            System.err.println("ERROR: " + e.getMessage() + "
");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(NAME + " ", options, true);
            System.exit(-1);
        }
        if (cmd.hasOption("d")) {
            Logger log = Logger.getLogger("mapreduce");
            log.setLevel(Level.DEBUG);
            System.out.println("DEBUG ON");
        }
        return cmd;
    }

    public static void main(String[] args) throws Exception{
        Configuration conf = HBaseConfiguration.create();
        String[] runArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
        CommandLine cmd = parseArgs(runArgs);
        if(cmd.hasOption("d"))
            conf.set("conf.debug","true");

        String table = cmd.getOptionValue("t");
        String column = cmd.getOptionValue("c");
        String output = cmd.getOptionValue("o");

        ColumnParser columnParser = new ColumnParser();
        columnParser.parse(column);
        if(!columnParser.isValid()) throw new IOException("family or qualifier error");
        byte[] family = columnParser.getFamily();
        byte[] qualifier = columnParser.getQualifier();

        Scan scan = new Scan();
        scan.addColumn(family,qualifier);

        Job job = Job.getInstance(conf,"Analyze data in " + table);
        job.setJarByClass(AnalyzeData.class);
        TableMapReduceUtil.initTableMapperJob(table,scan,AnalyzeMapper.class, Text.class, IntWritable.class,job);
        job.setMapperClass(AnalyzeMapper.class);
        job.setReducerClass(AnalyzeReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(1);
        FileOutputFormat.setOutputPath(job,new Path(output));

        System.exit(job.waitForCompletion(true) ? 0:1);

    }


}

###
/**
 * Parses a {@code family:qualifier} column specification into its byte parts.
 */
public class ColumnParser {
    private byte[] family;
    private byte[] qualifier;
    private boolean valid;

    /** @return the family bytes of the last successful parse, otherwise {@code null} */
    public byte[] getFamily() {
        return family;
    }

    /** @return the qualifier bytes of the last successful parse, otherwise {@code null} */
    public byte[] getQualifier() {
        return qualifier;
    }

    /** @return {@code true} if the last call to {@link #parse(String)} succeeded */
    public boolean isValid() {
        return valid;
    }

    /**
     * Splits {@code value} at the first {@code ':'} into family and qualifier.
     *
     * <p>The specification is invalid when it is {@code null}, contains no
     * colon, or has an empty family or qualifier. Everything after the first
     * colon belongs to the qualifier, so a qualifier containing colons is kept
     * intact rather than truncated.
     *
     * @param value column specification, may be {@code null}
     */
    public void parse(String value) {
        // Reset first so a failed parse never exposes results of an earlier one.
        family = null;
        qualifier = null;
        valid = false;

        if (value == null) {
            return;
        }
        int separator = value.indexOf(':');
        if (separator <= 0 || separator == value.length() - 1) {
            return;
        }

        family = Bytes.toBytes(value.substring(0, separator));
        qualifier = Bytes.toBytes(value.substring(separator + 1));
        valid = true;
    }
}

4、执行

hadoop jar AnalyzeData.jar -t importTable -c data:json -o /output9

结果：
... ...
    AnalyzeMapper$Counters
        COLS=993
        ERROR=6
        ROWS=993
        VALID=987

三、从hbase中读取数据，计算后存回hbase

把上例中存入的json串读出，按key-value的方式分解，把key作为列名，value作为列值存入hbase

public class ParseJson {
    private static final String HDFSUri = "hdfs://bigdata-senior01.home.com:9000";
    private static final Log LOG = LogFactory.getLog(ParseJson.class);
    public static final String NAME = "ParseJson";
    public enum Counters {ROWS,COLS,VALID,ERROR};

    static class ParseMapper extends TableMapper<ImmutableBytesWritable, Mutation>{
        private JSONParser parser = new JSONParser();
        private byte[] columnFamily = null;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            columnFamily = Bytes.toBytes(context.getConfiguration().get("conf.columnFamily"));
        }

        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
            context.getCounter(Counters.ROWS).increment(1);
            String val = null;
            try {
                Put put = new Put(key.get());
                for(Cell cell : value.listCells()){
                    context.getCounter(Counters.COLS).increment(1);
                    val = Bytes.toStringBinary(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength());
                    JSONObject json = (JSONObject) parser.parse(val);

                    for (Object jsonKey : json.keySet()){
                        Object jsonValue = json.get(jsonKey);
                        put.addColumn(columnFamily,Bytes.toBytes(jsonKey.toString()),Bytes.toBytes(jsonValue.toString()));
                    }
                }
                context.write(key,put);
                context.getCounter(Counters.VALID).increment(1);
            }catch (Exception e){
                e.printStackTrace();
                System.err.println("Error: " + e.getMessage() + ", Row: " +
                        Bytes.toStringBinary(key.get()) + ", JSON: " + value);
                context.getCounter(Counters.ERROR).increment(1);
            }
        }
    }

    private static CommandLine parseArgs(String[] args) throws ParseException{
        Options options = new Options();
        Option o = new Option("i", "input", true,
                "table to read from (must exist)");
        o.setArgName("input-table-name");
        o.setRequired(true);
        options.addOption(o);
        o = new Option("o", "output", true,
                "table to write to (must exist)");
        o.setArgName("output-table-name");
        o.setRequired(true);
        options.addOption(o);
        o = new Option("c", "column", true,
                "column to read data from (must exist)");
        o.setArgName("family:qualifier");
        options.addOption(o);
        options.addOption("d", "debug", false, "switch on DEBUG log level");

        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, args);
        } catch (Exception e) {
            System.err.println("ERROR: " + e.getMessage() + "
");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(NAME + " ", options, true);
            System.exit(-1);
        }
        if (cmd.hasOption("d")) {
            Logger log = Logger.getLogger("mapreduce");
            log.setLevel(Level.DEBUG);
            System.out.println("DEBUG ON");
        }
        return cmd;
    }

    public static void main(String[] args) throws Exception{
        Configuration conf = HBaseConfiguration.create();

//        conf.set("hbase.master","192.168.31.10");
//        conf.set("hbase.zookeeper.quorum", "192.168.31.10");
//        conf.set("hbase.rootdir","hdfs://bigdata-senior01.home.com:9000/hbase");
//        conf.set("hbase.zookeeper.property.dataDir","hdfs://bigdata-senior01.home.com:9000/hbase/zookeeper");

        String[] runArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
        CommandLine cmd = parseArgs(runArgs);
        if(cmd.hasOption("d")) conf.set("conf.debug","true");
        String input = cmd.getOptionValue("i");
        String output = cmd.getOptionValue("o");
        String column = cmd.getOptionValue("c");

        ColumnParser columnParser = new ColumnParser();
        columnParser.parse(column);
        if(!columnParser.isValid()) throw new IOException("family or qualifier error");
        byte[] family = columnParser.getFamily();
        byte[] qualifier = columnParser.getQualifier();

        Scan scan = new Scan();
        scan.addColumn(family,qualifier);
        conf.set("conf.columnFamily", Bytes.toStringBinary(family));

        Job job = Job.getInstance(conf, "Parse data in " + input +
                ", write to " + output);
        job.setJarByClass(ParseJson.class);
        TableMapReduceUtil.initTableMapperJob(input,scan,ParseMapper.class,ImmutableBytesWritable.class,Put.class,job);
        TableMapReduceUtil.initTableReducerJob(output, IdentityTableReducer.class,job);

        System.exit(job.waitForCompletion(true)?0:1);

    }

}

执行：

hadoop jar ParseJson.jar -i importTable -c data:json -o importTable

查看全文

相关阅读:
机器学习之决策树与随机森林模型
 深度学习入门篇--手把手教你用 TensorFlow 训练模型
 Android 性能测试之方向与框架篇
 机器学习：从入门到第一个模型
 5分钟教你玩转 sklearn 机器学习（上）
Hbase 技术细节笔记（上）
五年 Web 开发者 star 的 github 整理说明
 腾讯云发布第三代云服务器矩阵，开放更强计算力赋能产业智能化
 为什么要用深度学习来做个性化推荐 CTR 预估
 云 MongoDB 优化让 LBS 服务性能提升十倍

原文地址：https://www.cnblogs.com/asker009/p/10771928.html