zoukankan      html  css  js  c++  java
  • mapreduce方式操作hbase

    一、导入数据到hbase

    1、配置hbase-site.xml指向hdfs

    <!-- hbase-site.xml: HBase settings used by the examples in this article. -->
    <configuration>
      <!-- Root directory for HBase data; here it lives on HDFS (NameNode bigdata-senior01.home.com:9000). -->
      <property>
        <name>hbase.rootdir</name>
        <value>hdfs://bigdata-senior01.home.com:9000/hbase</value>
      </property>
      <!-- NOTE(review): this is passed through to ZooKeeper as its dataDir, which is normally a
           local filesystem path; confirm that an hdfs:// URI is really intended here. -->
      <property>
        <name>hbase.zookeeper.property.dataDir</name>
        <value>hdfs://bigdata-senior01.home.com:9000/hbase/zookeeper</value>
      </property>
      <!-- Stream capability check (hflush/hsync) is switched off; see the WARNING in the description below. -->
      <property>
        <name>hbase.unsafe.stream.capability.enforce</name>
        <value>false</value>
        <description>
          Controls whether HBase will check for stream capabilities (hflush/hsync).
    
          Disable this if you intend to run on LocalFileSystem, denoted by a rootdir
          with the 'file://' scheme, but be mindful of the NOTE below.
    
          WARNING: Setting this to false blinds you to potential data loss and
          inconsistent system state in the event of process and/or node failures. If
          HBase is complaining of an inability to use hsync or hflush it's most
          likely not a false positive.
        </description>
      </property>
    </configuration>

    2、依赖

            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>3.2.0</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-client</artifactId>
                <version>2.0.4</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-mapreduce</artifactId>
                <version>2.0.4</version>
            </dependency>

    3、mapper

    /**
     * Mapper of the map-only import job.
     *
     * <p>Input: text lines (byte offset as key, line as value). Output: the row key
     * wrapped in an {@link ImmutableBytesWritable} and an HBase {@link Mutation}
     * (a {@link Put}) that stores the whole line in a single column.
     *
     * <p>The target column is read from the job configuration entry
     * {@code conf.column}, formatted as {@code family:qualifier}.
     */
    public class ImportMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Mutation> {
        /** Job counters published by this mapper. */
        public enum Counters {
            /** Number of input lines written as puts. */
            LINES
        }

        private byte[] family = null;
        private byte[] qualifier = null;

        /**
         * Called once at the beginning of the task; resolves the target column.
         *
         * @param context task context whose configuration carries {@code conf.column}
         * @throws IOException if {@code conf.column} is missing or not {@code family:qualifier}
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // The driver stores the value of its column command line option under this key.
            String column = context.getConfiguration().get("conf.column");
            ColParser parser = new ColParser();
            parser.parse(column);
            if (!parser.isValid()) {
                throw new IOException("family or qualifier error");
            }
            family = parser.getFamily();
            qualifier = parser.getQualifier();
        }

        /**
         * Turns one input line into one {@link Put}.
         *
         * <p>Failures are no longer caught and printed: an {@link IOException} or
         * {@link InterruptedException} raised by {@code context.write} now propagates,
         * so the framework can fail or retry the task instead of silently losing the
         * record.
         *
         * @param key     byte offset of the line in the input split (unused)
         * @param value   the input line
         * @param context task context used to emit the put and update counters
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            // The MD5 digest of the line is used as the row key; adjust to your requirements.
            byte[] rowKey = DigestUtils.md5(line);
            Put put = new Put(rowKey);
            put.addColumn(family, qualifier, Bytes.toBytes(line));
            context.write(new ImmutableBytesWritable(rowKey), put);
            context.getCounter(Counters.LINES).increment(1);
        }

        /**
         * Splits a {@code family:qualifier} string into its two byte-array parts.
         * Declared static because it never touches the enclosing mapper instance.
         */
        static class ColParser {
            private byte[] family;
            private byte[] qualifier;
            private boolean valid;

            public byte[] getFamily() {
                return family;
            }

            public byte[] getQualifier() {
                return qualifier;
            }

            /** @return {@code true} if the last call to {@link #parse(String)} succeeded */
            public boolean isValid() {
                return valid;
            }

            /**
             * Parses {@code value}; the outcome is reported through {@link #isValid()}.
             *
             * @param value column specification, may be {@code null} (treated as invalid)
             */
            public void parse(String value) {
                // Explicit null check instead of catching the NullPointerException.
                if (value == null) {
                    valid = false;
                    return;
                }
                String[] parts = value.split(":");
                if (parts.length < 2 || parts[0].isEmpty() || parts[1].isEmpty()) {
                    valid = false;
                    return;
                }
                family = Bytes.toBytes(parts[0]);
                qualifier = Bytes.toBytes(parts[1]);
                valid = true;
            }
        }
    }

    4、main

    public class ImportFromFile {
    //    private static String HDFSUri = "hdfs://bigdata-senior01.home.com:9000";
        public static final String NAME = "ImportFromFile";
    
        private static CommandLine parseArgs(String[] args) throws ParseException{
            Options options = new Options();
    
            Option option = new Option("t","table",true,"表不能为空");
            option.setArgName("table-name");
            option.setRequired(true);
            options.addOption(option);
    
            option = new Option("c","column",true,"列族和列名不能为空");
            option.setArgName("family:qualifier");
            option.setRequired(true);
            options.addOption(option);
    
            option = new Option("i","input",true,"输入文件或者目录");
            option.setArgName("path-in-HDFS");
            option.setRequired(true);
            options.addOption(option);
    
            options.addOption("d","debug",false,"switch on DEBUG log level");
            CommandLineParser parser = new PosixParser();
            CommandLine cmd = null;
            try {
                cmd = parser.parse(options,args);
            }catch (Exception e){
                System.err.println("ERROR: " + e.getMessage() + "
    ");
                HelpFormatter formatter = new HelpFormatter();
                formatter.printHelp(NAME + " ", options, true);
                System.exit(-1);
            }
            if (cmd.hasOption("d")) {
                Logger log = Logger.getLogger("mapreduce");
                log.setLevel(Level.DEBUG);
            }
    
            return cmd;
        }
    
        public static void main(String[] args) throws Exception{
            Configuration conf = HBaseConfiguration.create();
    
            String[] runArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
            CommandLine cmd = parseArgs(runArgs);
            if (cmd.hasOption("d")) conf.set("conf.debug", "true");
    
            String table = cmd.getOptionValue("t");
            String input = cmd.getOptionValue("i");
            String column = cmd.getOptionValue("c");
            //写入配置后,在mapper阶段取出
            conf.set("conf.column", column);
    
            Job job = Job.getInstance(conf,"Import from file " + input +" into table " + table);
            job.setJarByClass(ImportFromFile.class);
            job.setMapperClass(ImportMapper.class);
            job.setOutputFormatClass(TableOutputFormat.class);
            job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE,table);
            job.setOutputKeyClass(ImmutableBytesWritable.class);
            job.setOutputValueClass(Writable.class);
            job.setNumReduceTasks(0); //不需要reduce
    
            FileInputFormat.addInputPath(job,new Path(input));
    
            System.exit(job.waitForCompletion(true) ? 0 : 1);
    
        }
    }

    5、执行

    先在HBASE里建表
    create 'importTable','data'
    
    把jar包上传到集群节点(本地文件系统,无需放入HDFS)后执行
    hadoop jar ImportFromFile.jar -t importTable -i /input/test-data.txt -c data:json 

    二、从hbase获取数据进行计算

    从上例中把hbase数据抽取出来计算作者出现数量

    多加一个依赖

          <dependency>
                <groupId>com.googlecode.json-simple</groupId>
                <artifactId>json-simple</artifactId>
                <version>1.1.1</version>
            </dependency>

    1、mapper

    import org.apache.hadoop.hbase.Cell;
    import org.apache.hadoop.hbase.client.Result;
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
    import org.apache.hadoop.hbase.mapreduce.TableMapper;
    import org.apache.hadoop.hbase.util.Bytes;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.json.simple.JSONObject;
    import org.json.simple.parser.JSONParser;
    
    import java.io.IOException;
    
    
    public class AnalyzeMapper extends TableMapper<Text,IntWritable> {
        private JSONParser parser = new JSONParser();
        public enum Counters { ROWS, COLS, ERROR, VALID }
        private IntWritable ONE = new IntWritable(1);
        /**
         * Called once for each key/value pair in the input split. Most applications
         * should override this, but the default is the identity function.
         *
         * @param key
         * @param value
         * @param context
         */
        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
            context.getCounter(Counters.ROWS).increment(1);
            String val = null;
            try {
                for(Cell cell:value.listCells()){
                    context.getCounter(Counters.COLS).increment(1);
                    val = Bytes.toStringBinary(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength());
                    JSONObject json = (JSONObject)parser.parse(val);
                    String author = (String)json.get("author");
                    if (context.getConfiguration().get("conf.debug") != null)
                        System.out.println("Author: " + author);
                    context.write(new Text(author),ONE);
                    context.getCounter(Counters.VALID).increment(1);
                }
    
            }catch (Exception e){
                e.printStackTrace();
                System.err.println("Row: " + Bytes.toStringBinary(key.get()) +
                        ", JSON: " + value);
                context.getCounter(Counters.ERROR).increment(1);
            }
    
        }
    }

    2、reducer

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    
    /**
     * Reducer that adds up the partial counts emitted for each author and writes
     * {@code (author, total)}.
     */
    public class AnalyzeReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
        /**
         * Called once for each author.
         *
         * <p>The values are summed rather than merely counted, so the total stays
         * correct if the values are ever pre-aggregated (for example by a combiner)
         * and no longer all equal to 1.
         *
         * @param key     the author
         * @param values  partial counts for this author
         * @param context task context used to emit the total
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable partial : values) {
                count += partial.get();
            }

            if (context.getConfiguration().get("conf.debug") != null) {
                System.out.println("Author: " + key.toString() + ", Count: " + count);
            }

            context.write(key, new IntWritable(count));
        }
    }

    3、main

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.commons.cli.*;
    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.client.Scan;
    import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.log4j.Level;
    import org.apache.log4j.Logger;
    
    import java.io.IOException;
    
    
    public class AnalyzeData {
        private static final Log LOG = LogFactory.getLog(AnalyzeData.class);
    
        public static final String NAME = "AnalyzeData";
    
    
        /**
         * Parse the command line parameters.
         *
         * @param args The parameters to parse.
         * @return The parsed command line.
         * @throws org.apache.commons.cli.ParseException When the parsing of the parameters fails.
         */
        private static CommandLine parseArgs(String[] args) throws ParseException {
            Options options = new Options();
            Option o = new Option("t", "table", true,
                    "table to read from (must exist)");
            o.setArgName("table-name");
            o.setRequired(true);
            options.addOption(o);
            o = new Option("c", "column", true,
                    "column to read data from (must exist)");
            o.setArgName("family:qualifier");
            options.addOption(o);
            o = new Option("o", "output", true,
                    "the directory to write to");
            o.setArgName("path-in-HDFS");
            o.setRequired(true);
            options.addOption(o);
            options.addOption("d", "debug", false, "switch on DEBUG log level");
            CommandLineParser parser = new PosixParser();
            CommandLine cmd = null;
            try {
                cmd = parser.parse(options, args);
            } catch (Exception e) {
                System.err.println("ERROR: " + e.getMessage() + "
    ");
                HelpFormatter formatter = new HelpFormatter();
                formatter.printHelp(NAME + " ", options, true);
                System.exit(-1);
            }
            if (cmd.hasOption("d")) {
                Logger log = Logger.getLogger("mapreduce");
                log.setLevel(Level.DEBUG);
                System.out.println("DEBUG ON");
            }
            return cmd;
        }
    
        public static void main(String[] args) throws Exception{
            Configuration conf = HBaseConfiguration.create();
            String[] runArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
            CommandLine cmd = parseArgs(runArgs);
            if(cmd.hasOption("d"))
                conf.set("conf.debug","true");
    
            String table = cmd.getOptionValue("t");
            String column = cmd.getOptionValue("c");
            String output = cmd.getOptionValue("o");
    
            ColumnParser columnParser = new ColumnParser();
            columnParser.parse(column);
            if(!columnParser.isValid()) throw new IOException("family or qualifier error");
            byte[] family = columnParser.getFamily();
            byte[] qualifier = columnParser.getQualifier();
    
            Scan scan = new Scan();
            scan.addColumn(family,qualifier);
    
            Job job = Job.getInstance(conf,"Analyze data in " + table);
            job.setJarByClass(AnalyzeData.class);
            TableMapReduceUtil.initTableMapperJob(table,scan,AnalyzeMapper.class, Text.class, IntWritable.class,job);
            job.setMapperClass(AnalyzeMapper.class);
            job.setReducerClass(AnalyzeReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setNumReduceTasks(1);
            FileOutputFormat.setOutputPath(job,new Path(output));
    
            System.exit(job.waitForCompletion(true) ? 0:1);
    
        }
    
    
    }
    ###
    /**
     * Splits a column specification of the form {@code family:qualifier} into its
     * two parts, each converted to bytes. The outcome of the last
     * {@link #parse(String)} call is reported by {@link #isValid()}.
     */
    public class ColumnParser {
        private byte[] family;
        private byte[] qualifier;
        private boolean valid;

        /** @return the family bytes stored by the last successful parse */
        public byte[] getFamily() {
            return family;
        }

        /** @return the qualifier bytes stored by the last successful parse */
        public byte[] getQualifier() {
            return qualifier;
        }

        /** @return whether the last call to {@link #parse(String)} succeeded */
        public boolean isValid() {
            return valid;
        }

        /**
         * Parses {@code value}. It is accepted when splitting on {@code ":"} yields at
         * least two parts and the first two are non-empty; anything else, including
         * {@code null}, marks the parser invalid and leaves the stored bytes untouched.
         *
         * @param value the column specification, may be {@code null}
         */
        public void parse(String value) {
            if (value == null) {
                valid = false;
                return;
            }
            String[] parts = value.split(":");
            boolean wellFormed = parts.length >= 2 && !parts[0].isEmpty() && !parts[1].isEmpty();
            if (wellFormed) {
                family = Bytes.toBytes(parts[0]);
                qualifier = Bytes.toBytes(parts[1]);
            }
            valid = wellFormed;
        }
    }

    4、执行

    hadoop jar AnalyzeData.jar -t importTable -c data:json -o /output9
    
    结果:
    ... ...
        AnalyzeMapper$Counters
            COLS=993
            ERROR=6
            ROWS=993
            VALID=987

     三、从hbase中读取数据,计算后存回hbase

    把上例中存入的json串读出,按key-value的方式分解,把key作为列名,value作为列值存入hbase

    public class ParseJson {
        private static final String HDFSUri = "hdfs://bigdata-senior01.home.com:9000";
        private static final Log LOG = LogFactory.getLog(ParseJson.class);
        public static final String NAME = "ParseJson";
        public enum Counters {ROWS,COLS,VALID,ERROR};
    
        static class ParseMapper extends TableMapper<ImmutableBytesWritable, Mutation>{
            private JSONParser parser = new JSONParser();
            private byte[] columnFamily = null;
    
            @Override
            protected void setup(Context context) throws IOException, InterruptedException {
                columnFamily = Bytes.toBytes(context.getConfiguration().get("conf.columnFamily"));
            }
    
            @Override
            protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
                context.getCounter(Counters.ROWS).increment(1);
                String val = null;
                try {
                    Put put = new Put(key.get());
                    for(Cell cell : value.listCells()){
                        context.getCounter(Counters.COLS).increment(1);
                        val = Bytes.toStringBinary(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength());
                        JSONObject json = (JSONObject) parser.parse(val);
    
                        for (Object jsonKey : json.keySet()){
                            Object jsonValue = json.get(jsonKey);
                            put.addColumn(columnFamily,Bytes.toBytes(jsonKey.toString()),Bytes.toBytes(jsonValue.toString()));
                        }
                    }
                    context.write(key,put);
                    context.getCounter(Counters.VALID).increment(1);
                }catch (Exception e){
                    e.printStackTrace();
                    System.err.println("Error: " + e.getMessage() + ", Row: " +
                            Bytes.toStringBinary(key.get()) + ", JSON: " + value);
                    context.getCounter(Counters.ERROR).increment(1);
                }
            }
        }
    
        private static CommandLine parseArgs(String[] args) throws ParseException{
            Options options = new Options();
            Option o = new Option("i", "input", true,
                    "table to read from (must exist)");
            o.setArgName("input-table-name");
            o.setRequired(true);
            options.addOption(o);
            o = new Option("o", "output", true,
                    "table to write to (must exist)");
            o.setArgName("output-table-name");
            o.setRequired(true);
            options.addOption(o);
            o = new Option("c", "column", true,
                    "column to read data from (must exist)");
            o.setArgName("family:qualifier");
            options.addOption(o);
            options.addOption("d", "debug", false, "switch on DEBUG log level");
    
            CommandLineParser parser = new PosixParser();
            CommandLine cmd = null;
            try {
                cmd = parser.parse(options, args);
            } catch (Exception e) {
                System.err.println("ERROR: " + e.getMessage() + "
    ");
                HelpFormatter formatter = new HelpFormatter();
                formatter.printHelp(NAME + " ", options, true);
                System.exit(-1);
            }
            if (cmd.hasOption("d")) {
                Logger log = Logger.getLogger("mapreduce");
                log.setLevel(Level.DEBUG);
                System.out.println("DEBUG ON");
            }
            return cmd;
        }
    
        public static void main(String[] args) throws Exception{
            Configuration conf = HBaseConfiguration.create();
    
    //        conf.set("hbase.master","192.168.31.10");
    //        conf.set("hbase.zookeeper.quorum", "192.168.31.10");
    //        conf.set("hbase.rootdir","hdfs://bigdata-senior01.home.com:9000/hbase");
    //        conf.set("hbase.zookeeper.property.dataDir","hdfs://bigdata-senior01.home.com:9000/hbase/zookeeper");
    
            String[] runArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
            CommandLine cmd = parseArgs(runArgs);
            if(cmd.hasOption("d")) conf.set("conf.debug","true");
            String input = cmd.getOptionValue("i");
            String output = cmd.getOptionValue("o");
            String column = cmd.getOptionValue("c");
    
            ColumnParser columnParser = new ColumnParser();
            columnParser.parse(column);
            if(!columnParser.isValid()) throw new IOException("family or qualifier error");
            byte[] family = columnParser.getFamily();
            byte[] qualifier = columnParser.getQualifier();
    
            Scan scan = new Scan();
            scan.addColumn(family,qualifier);
            conf.set("conf.columnFamily", Bytes.toStringBinary(family));
    
            Job job = Job.getInstance(conf, "Parse data in " + input +
                    ", write to " + output);
            job.setJarByClass(ParseJson.class);
            TableMapReduceUtil.initTableMapperJob(input,scan,ParseMapper.class,ImmutableBytesWritable.class,Put.class,job);
            TableMapReduceUtil.initTableReducerJob(output, IdentityTableReducer.class,job);
    
            System.exit(job.waitForCompletion(true)?0:1);
    
        }
    
    }

    执行:

    hadoop jar ParseJson.jar -i importTable -c data:json -o importTable
  • 相关阅读:
    C# 枚举、字符串、值的相互转换
    What's New in v2010 vol 2.5
    Using Oracle's Parallel Execution Features
    [zhuan]asp.net程序性能优化的七个方面 (c#(或vb.net)程序改进)
    ORACLE常用网址
    html中的块元素(block element)和内联元素(inline element)
    软件构架师的特点
    窗体信息处理函数讲解
    [xue]软件项目经理所必需具备的素质
    Gulp系列文章入门Gulp
  • 原文地址:https://www.cnblogs.com/asker009/p/10771928.html
Copyright © 2011-2022 走看看