  • Spark operations on HBase

    1. Spark operations on HBase - Read
     import org.apache.hadoop.conf.Configuration
     import org.apache.hadoop.hbase.HBaseConfiguration

     /**
       * Build the HBase configuration
       *
       * @param quorum ZooKeeper quorum (comma-separated host list)
       * @param port   ZooKeeper client port
       * @return
       */
     def getHbaseConf(quorum: String, port: String): Configuration = {
       // Create the HBase configuration
       val conf = HBaseConfiguration.create()
       // Set the ZooKeeper quorum address
       conf.set("hbase.zookeeper.quorum", quorum)
       // Set the ZooKeeper client port
       conf.set("hbase.zookeeper.property.clientPort", port)
       conf
     }

      

     import org.apache.hadoop.hbase.client.Result
     import org.apache.hadoop.hbase.io.ImmutableBytesWritable
     import org.apache.hadoop.hbase.mapreduce.TableInputFormat
     import org.apache.hadoop.hbase.util.Bytes
     import org.apache.spark.{SparkConf, SparkContext}

     def main(args: Array[String]): Unit = {

       // Create the Spark context. setMaster("local[2]") runs the job locally without a Spark cluster;
       // remove it (or override it via spark-submit --master yarn) to run on YARN.
       val sparkConf = new SparkConf().setAppName("Spark-Hbase-Read").setMaster("local[2]")
       val sc = new SparkContext(sparkConf)
       // Build the HBase configuration
       val hbaseConf = HbaseUtils.getHbaseConf("centos01,centos02,centos03", "2181")
       // Set the table to read from
       hbaseConf.set(TableInputFormat.INPUT_TABLE, "movie_wordcount")
       // Load the HBase table as an RDD of (row key, Result) pairs
       val hbaseRdd = sc.newAPIHadoopRDD(hbaseConf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result]).cache()

       hbaseRdd.map(x => {
         val result = x._2
         val row = Bytes.toString(result.getRow)
         val word = Bytes.toString(result.getValue("word".getBytes(), "word".getBytes()))
         // The count was stored as an int, so read it back with Bytes.toInt; Bytes.toString would produce garbage
         val count = Bytes.toInt(result.getValue("word".getBytes(), "count".getBytes()))

         println(row, word, count)
         (row, word, count)
       }).saveAsTextFile("/wordcount/output3")
     }

    Package the job and submit it to the cluster with: ./bin/spark-submit --class com.xxx.xx.scala.hbase.SparkHbaseR ./localjar/sc-1.0-SNAPSHOT-jar-with-dependencies.jar
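    The write job shown in section 2 below reads the HDFS output path from its first program argument (args(0)), so its submission follows the same pattern; a hedged example, where the class prefix, jar path and output path are placeholders: ./bin/spark-submit --class com.xxx.xx.scala.hbase.SparkHbaseW ./localjar/sc-1.0-SNAPSHOT-jar-with-dependencies.jar /wordcount/output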

      saveAsTextFile writes to HDFS by default, so check the result there; the output path is a directory containing one part file per partition.

      List it with: hadoop fs -ls /wordcount/output3
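      To spot-check the saved records from Spark itself, the output directory can be read back as a plain text RDD; a minimal sketch, assuming the same SparkContext sc and the /wordcount/output3 path used above:

      // Read the saved part files back and print the first few records
      val saved = sc.textFile("/wordcount/output3")
      saved.take(10).foreach(println)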

      

      2. Spark operations on HBase - Write

    import java.util.UUID

    import org.ansj.recognition.impl.StopRecognition
    import org.ansj.splitWord.analysis.ToAnalysis
    import org.apache.hadoop.hbase.client.Put
    import org.apache.hadoop.hbase.util.Bytes
    import org.apache.spark.{SparkConf, SparkContext}

    import scala.collection.mutable
    import scala.collection.mutable.ArrayBuffer

    object SparkHbaseW {

      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("Spark-Hbase-Write").setMaster("yarn")
        val sc = new SparkContext(conf)
        val file = sc.textFile("/spark/movie/wordcount/pinglun.txt")
        // Tokenize each line with the Chinese word segmenter, then count the words
        val rdd = file.flatMap(line => {
          getWords(line, filter(new Array[String](0)))
        }).map(x => (x, 1)).reduceByKey(_ + _)
        // Save a copy of the result to HDFS (output path passed as the first program argument)
        rdd.saveAsTextFile(args(0))

        // Insert method one: write one Put per record
        // (this opens a table handle for every record; the batched version below reuses one per partition)
        rdd.foreachPartition(x => {
          x.foreach(y => {
            // Write the (word, count) pair into HBase
            val table = HbaseUtils.getTable(HbaseUtils.getHbaseConf("centos01,centos02,centos03", "2181"), "movie_wordcount")
            val family = Bytes.toBytes("word")
            val wordColum = Bytes.toBytes("word")
            val countColum = Bytes.toBytes("count")

            // Use a random UUID as the row key
            val uuid = UUID.randomUUID()
            val wordPut = new Put(Bytes.toBytes(uuid.toString))
            wordPut.addColumn(family, wordColum, Bytes.toBytes(y._1))
            wordPut.addColumn(family, countColum, Bytes.toBytes(y._2))
            table.put(wordPut)
          })
        })

        // Insert method two (batch insert): collect the Puts per partition and write them in one call
        // rdd.foreachPartition(x => {
        //   val table = HbaseUtils.getTable(HbaseUtils.getHbaseConf("centos01,centos02,centos03", "2181"), "movie_wordcount")
        //   val puts = new java.util.LinkedList[Put]()
        //   x.foreach(y => {
        //     // Build one Put per (word, count) pair
        //     val family = Bytes.toBytes("word")
        //     val wordColum = Bytes.toBytes("word")
        //     val countColum = Bytes.toBytes("count")
        //     val uuid = UUID.randomUUID()
        //     val wordPut = new Put(Bytes.toBytes(uuid.toString))
        //     wordPut.addColumn(family, wordColum, Bytes.toBytes(y._1))
        //     wordPut.addColumn(family, countColum, Bytes.toBytes(y._2))
        //     puts.add(wordPut)
        //   })
        //   table.put(puts)
        // })
      }

      /**
        * Build the stop-word filter for the segmenter
        *
        * @param stopWords extra stop words to filter out
        * @return
        */
      def filter(stopWords: Array[String]): StopRecognition = {
        // add stop words
        val filter = new StopRecognition
        filter.insertStopNatures("w") // filter punctuation
        filter.insertStopNatures("m") // filter the "m" (numeral) nature
        filter.insertStopNatures("null") // filter null
        filter.insertStopNatures("<br />") // filter <br />
        filter.insertStopRegexes("^[a-zA-Z]{1,}") // filter English letters
        filter.insertStopRegexes("^[0-9]+") // filter numbers
        filter.insertStopRegexes("[^a-zA-Z0-9\\u4e00-\\u9fa5]+") // filter anything that is not a letter, digit or Chinese character
        filter.insertStopRegexes("\t")
        for (x <- stopWords) {
          filter.insertStopWords(x)
        }
        filter
      }

      /**
        * Tokenize a line of text into words
        *
        * @param text   the input text
        * @param filter the stop-word filter
        * @return
        */
      def getWords(text: String, filter: StopRecognition): ArrayBuffer[String] = {
        val words = new mutable.ArrayBuffer[String]()
        val terms = ToAnalysis.parse(text).recognition(filter).getTerms
        for (i <- 0 until terms.size()) {
          val word = terms.get(i).getName
          // keep only non-empty words
          if (word.length > 0) {
            words += word
          }
        }
        words
      }

    }
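    The write example above calls HbaseUtils.getTable, which is not shown in the post. A minimal sketch of such a helper, assuming the standard HBase client Connection API and living in the same HbaseUtils object as the getHbaseConf method from section 1:

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.hbase.TableName
    import org.apache.hadoop.hbase.client.{ConnectionFactory, Table}

    // Hypothetical helper: open a connection and return a handle to the given table
    def getTable(conf: Configuration, tableName: String): Table = {
      val connection = ConnectionFactory.createConnection(conf)
      connection.getTable(TableName.valueOf(tableName))
    }

    Note that the Connection is left open in this sketch; in a real job it should be created once per partition, reused for all Puts, and closed together with the Table when the writes finish.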
  • Original post: https://www.cnblogs.com/chengzhihua/p/11428673.html