19. Hadoop Study Notes: HBase and MapReduce

    Overview: this post walks through running the MapReduce tools bundled with HBase, importing data with importtsv and Sqoop, and writing MapReduce jobs that copy data between HBase tables, load HDFS files into HBase, and bulk-load data via HFiles.

    Set up the Hadoop and HBase environment variables:
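
    For example, a minimal sketch of the exports; the install paths are assumptions and must match your own cluster:

    # paths are assumptions -- point them at your own Hadoop/HBase installs
    export HADOOP_HOME=/opt/modules/hadoop-2.5.0
    export HBASE_HOME=/opt/modules/hbase-0.98.6-hadoop2
    export PATH=$PATH:$HADOOP_HOME/bin:$HBASE_HOME/bin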

    To run the MapReduce programs that ship with HBase, use the following command, which can be found in the official documentation:
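
    For example (the hbase-server jar version is an assumption; use the jar shipped with your HBase). Running it with no arguments lists the bundled programs such as importtsv and rowcounter, once the classpath issue described below is resolved:

    ${HADOOP_HOME}/bin/hadoop jar ${HBASE_HOME}/lib/hbase-server-0.98.6-hadoop2.jar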

    If you run into the following problem, Hadoop's MapReduce cannot access the HBase jars (they are not on its classpath):

    The official documentation describes the fix:
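
    One fix described in the HBase reference guide is to put the HBase jars onto Hadoop's classpath before launching the job, roughly like this:

    export HADOOP_CLASSPATH=$(${HBASE_HOME}/bin/hbase mapredcp)
    # or, more coarsely, the full HBase classpath:
    # export HADOOP_CLASSPATH=$(${HBASE_HOME}/bin/hbase classpath)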

    After applying the fix, the command runs successfully:

    The command for importing data uses the bundled importtsv tool:

    A TSV file is a file whose fields are separated by tab characters.
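
    The general form of importtsv looks roughly like this; VERSION, the column list, and the table/input placeholders must be filled in:

    ${HADOOP_HOME}/bin/hadoop jar ${HBASE_HOME}/lib/hbase-server-VERSION.jar importtsv \
        -Dimporttsv.columns=HBASE_ROW_KEY,colfam:qualifier[,...] <tablename> <hdfs-inputdir>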

    First create some test data by writing a user file:
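
    A possible user file; the first row matches the record referenced later in the code comments, the other rows are made-up samples:

    printf 'rk0001\tzhangsan\t33\n'  > user.tsv
    printf 'rk0002\tlisi\t28\n'     >> user.tsv
    printf 'rk0003\twangwu\t27\n'   >> user.tsv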

    Upload it to HDFS and start the HBase shell:
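
    For example (the HDFS directory is an assumption):

    hdfs dfs -mkdir -p /user/input
    hdfs dfs -put user.tsv /user/input/
    ${HBASE_HOME}/bin/hbase shell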

    Create the table:
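
    In the HBase shell, create a user table with an info column family, matching the columns imported below:

    create 'user', 'info'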

    Then import the data:
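
    An example invocation; the jar version and HDFS path are assumptions, and the column mapping matches the user file above:

    export HADOOP_CLASSPATH=$(${HBASE_HOME}/bin/hbase mapredcp)
    ${HADOOP_HOME}/bin/hadoop jar ${HBASE_HOME}/lib/hbase-server-0.98.6-hadoop2.jar importtsv \
        -Dimporttsv.columns=HBASE_ROW_KEY,info:name,info:age \
        user /user/input/user.tsv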

    There are some other tools as well, for example rowcounter, which counts the rows of a table:
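
    For example (the jar version is again an assumption):

    ${HADOOP_HOME}/bin/hadoop jar ${HBASE_HOME}/lib/hbase-server-0.98.6-hadoop2.jar rowcounter user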

    Next, Sqoop is used to import MySQL data into HBase. Build some test data in MySQL first:

    To use sqoop import, the HBase environment variables must be configured first:
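
    A rough sketch of the Sqoop invocation; the MySQL connection string, database, and table names are all hypothetical, and HBASE_HOME must already be exported so Sqoop can find the HBase jars:

    export HBASE_HOME=/opt/modules/hbase-0.98.6-hadoop2

    sqoop import \
        --connect jdbc:mysql://localhost:3306/testdb \
        --username root -P \
        --table my_user \
        --hbase-table user_from_mysql \
        --column-family info \
        --hbase-row-key id \
        --hbase-create-table \
        -m 1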

    Migrating data from one HBase table to another:

    Next, write the MapReduce program; the code is as follows:

    package com.tyx.hbase.mr;
    
    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.hbase.Cell;
    import org.apache.hadoop.hbase.CellUtil;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.client.Put;
    import org.apache.hadoop.hbase.client.Result;
    import org.apache.hadoop.hbase.client.Scan;
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
    import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
    import org.apache.hadoop.hbase.mapreduce.TableMapper;
    import org.apache.hadoop.hbase.mapreduce.TableReducer;
    import org.apache.hadoop.hbase.util.Bytes;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    
    public class Tab2TabMapReduce extends Configured implements Tool {
    
        // mapper class
        public static class TabMapper extends TableMapper<Text, Put> {
            private Text rowkey = new Text();
            
            @Override
            protected void map(ImmutableBytesWritable key, Result value,Context context)
                    throws IOException, InterruptedException {
                byte[] bytes = key.get();
                rowkey.set(Bytes.toString(bytes));
                
                Put put = new Put(bytes);
                
                for (Cell cell : value.rawCells()) {
                    // keep only the info:name cells from the source row
                    if("info".equals(Bytes.toString(CellUtil.cloneFamily(cell)))) {
                        if("name".equals(Bytes.toString(CellUtil.cloneQualifier(cell)))) {
                            put.add(cell);
                        }
                    }
                }
                
                context.write(rowkey, put);
            }
        }
        
        // reduce class
        public static class TabReduce extends TableReducer<Text,Put, ImmutableBytesWritable> {
            @Override
            protected void reduce(Text key, Iterable<Put> values,Context context)
                    throws IOException, InterruptedException {
                for (Put put : values) {
                    context.write(null, put);
                }
                
            }
        }
        
        @Override
        public int run(String[] args) throws Exception {
            //create job
            Job job = Job.getInstance(this.getConf(), this.getClass().getSimpleName());
            
            // set run class
            job.setJarByClass(this.getClass());
            
            Scan scan = new Scan();
            scan.setCaching(500);
            scan.setCacheBlocks(false);
            
            // set mapper
            TableMapReduceUtil.initTableMapperJob(
                    "tab1", // input table
                    scan , // scan instance   
                    TabMapper.class,  // set mapper class
                    Text.class, // mapper output key 
                    Put.class, //mapper output value 
                    job // set job
            );
            
            TableMapReduceUtil.initTableReducerJob(
                    "tab2" , // output table 
                    TabReduce.class, // set reduce class 
                    job // set job
            );
            
            job.setNumReduceTasks(1);
            
            boolean b = job.waitForCompletion(true);
            
            if(!b) {
                System.err.println("error with job!!!");
            }
            
            // report the job status back to ToolRunner
            return b ? 0 : 1;
        }
        
        public static void main(String[] args) throws Exception {
            
            //create config
            Configuration config = HBaseConfiguration.create();
            
            //submit job
            int status = ToolRunner.run(config, new Tab2TabMapReduce(), args);
            
            //exit
            System.exit(status);
        }
    
    }

    Run command:
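
    An example run; the jar name is an assumption for the packaged class. The source table tab1 must already contain data in its info family, and the target table tab2 (with an info family) must exist before the job runs:

    echo "create 'tab2', 'info'" | ${HBASE_HOME}/bin/hbase shell

    export HADOOP_CLASSPATH=$(${HBASE_HOME}/bin/hbase mapredcp)
    ${HADOOP_HOME}/bin/yarn jar hbase-mr.jar com.tyx.hbase.mr.Tab2TabMapReduce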

    Next, import a file from HDFS into HBase:

    Construct the data:
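
    For example, reusing the same tab-separated layout (rowkey, name, age) that the mapper below expects; the file name and HDFS path are assumptions:

    printf 'rk0001\tzhangsan\t33\n' > student.tsv
    hdfs dfs -put student.tsv /user/input/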

    Then write the MapReduce program:

    package com.jkxy.hbase.mr;
    
    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.client.Put;
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
    import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
    import org.apache.hadoop.hbase.util.Bytes;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    public class HDFS2TabMapReduce extends Configured implements Tool{
        
        public static class HDFS2TabMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
            
            ImmutableBytesWritable rowkey = new ImmutableBytesWritable();
            
            @Override
            protected void map(LongWritable key, Text value,Context context)
                    throws IOException, InterruptedException {
                
                // each input line looks like: rk0001<TAB>zhangsan<TAB>33
                String[] words = value.toString().split("\t");
                
                Put put = new Put(Bytes.toBytes(words[0]));
                put.add(Bytes.toBytes("info"),Bytes.toBytes("name"),Bytes.toBytes(words[1]));
                put.add(Bytes.toBytes("info"),Bytes.toBytes("age"),Bytes.toBytes(words[2]));
                
                rowkey.set(Bytes.toBytes(words[0]));
                
                context.write(rowkey, put);
            }
        }
        
        @Override
        public int run(String[] args) throws Exception {
            
            // create job
            Job job = Job.getInstance(this.getConf(), this.getClass().getSimpleName());
            
            // set class
            job.setJarByClass(this.getClass());
            
            // set path
            FileInputFormat.addInputPath(job, new Path(args[0]));
            
            //set mapper
            job.setMapperClass(HDFS2TabMapper.class);
            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job.setMapOutputValueClass(Put.class);
            
            // set reduce
            TableMapReduceUtil.initTableReducerJob(
                    "user", // set table
                    null, 
                    job);
            job.setNumReduceTasks(0);
            
            boolean b = job.waitForCompletion(true);
            
            if(!b) {
                throw new IOException("error with job!!!");
            }
            
            return 0;
        }
        
        public static void main(String[] args) throws Exception {
            //get configuration
            Configuration conf = HBaseConfiguration.create();
            
            //submit job
            int status = ToolRunner.run(conf, new HDFS2TabMapReduce(), args);
            
            //exit
            System.exit(status);
        }
    
    }

    Run command:
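
    An example run; the jar name is an assumption, and the single argument is the HDFS input path that run() hands to FileInputFormat. The target table user is hard-coded in the job and must already exist:

    export HADOOP_CLASSPATH=$(${HBASE_HOME}/bin/hbase mapredcp)
    ${HADOOP_HOME}/bin/yarn jar hbase-mr.jar com.jkxy.hbase.mr.HDFS2TabMapReduce /user/input/student.tsv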

    Next, BulkLoad is used to import data from HDFS into HBase. This approach bypasses the WAL and MemStore, which makes loading huge data sets much faster. The code is as follows:

    package com.jkxy.hbase.mr;
    
    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.client.HTable;
    import org.apache.hadoop.hbase.client.Put;
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
    import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
    import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
    import org.apache.hadoop.hbase.mapreduce.PutSortReducer;
    import org.apache.hadoop.hbase.util.Bytes;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    public class HFile2TabMapReduce extends Configured implements Tool {
    
        public static class HFile2TabMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
            
            ImmutableBytesWritable rowkey = new ImmutableBytesWritable();
            
            @Override
            protected void map(LongWritable key, Text value,Context context)
                    throws IOException, InterruptedException {
                
                String[] words = value.toString().split("\t");
                
                Put put = new Put(Bytes.toBytes(words[0]));
                put.add(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(words[1]));
                put.add(Bytes.toBytes("info"), Bytes.toBytes("age"), Bytes.toBytes(words[2]));
                rowkey.set(Bytes.toBytes(words[0]));
                
                context.write(rowkey, put);
            }
        }
        
        
        @Override
        public int run(String[] args) throws Exception {
            
            //create job
            Job job = Job.getInstance(getConf(), this.getClass().getSimpleName());
            
            // set run jar class
            job.setJarByClass(this.getClass());
            
            // set input and output paths
            FileInputFormat.addInputPath(job, new Path(args[1]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));
            
            // set map
            job.setMapperClass(HFile2TabMapper.class);
            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job.setMapOutputValueClass(Put.class);
            
            // set reduce
            job.setReducerClass(PutSortReducer.class);
            
            HTable table = new HTable(getConf(), args[0]);
            // set hfile output
            HFileOutputFormat2.configureIncrementalLoad(job, table );
            
            // submit job
            boolean b = job.waitForCompletion(true);
            if(!b) {
                throw new IOException(" error with job !!!");
            }
            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(getConf());
            // load hfile
            loader.doBulkLoad(new Path(args[2]), table);
            
            return 0;
        }
        
        public static void main(String[] args) throws Exception {
            // get configuration
            Configuration conf = HBaseConfiguration.create();
            
            //run job
            int status = ToolRunner.run(conf, new HFile2TabMapReduce(), args);
            
            // exit
            System.exit(status);
            
        }
    
    }

    Use the following command:
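
    An example run; the jar name and paths are assumptions. The three arguments are the target table, the HDFS input file, and the HFile output directory, in the order the run() method reads them. Since doBulkLoad is called inside the driver, no separate completebulkload step is needed:

    export HADOOP_CLASSPATH=$(${HBASE_HOME}/bin/hbase mapredcp)
    ${HADOOP_HOME}/bin/yarn jar hbase-mr.jar com.jkxy.hbase.mr.HFile2TabMapReduce \
        user /user/input/student.tsv /user/hfile-out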
