  • Spark operations on HBase

    1. Spark operations on HBase - Read
     import org.apache.hadoop.conf.Configuration
     import org.apache.hadoop.hbase.HBaseConfiguration

     /**
       * Build the HBase configuration
       *
       * @param quorum ZooKeeper quorum (comma-separated host list)
       * @param port   ZooKeeper client port
       * @return
       */
     def getHbaseConf(quorum: String, port: String): Configuration = {
       // Create the HBase configuration
       val conf = HBaseConfiguration.create()
       // Set the ZooKeeper quorum address
       conf.set("hbase.zookeeper.quorum", quorum)
       // Set the ZooKeeper client port
       conf.set("hbase.zookeeper.property.clientPort", port)
       conf
     }

      

     import org.apache.hadoop.hbase.client.Result
     import org.apache.hadoop.hbase.io.ImmutableBytesWritable
     import org.apache.hadoop.hbase.mapreduce.TableInputFormat
     import org.apache.hadoop.hbase.util.Bytes
     import org.apache.spark.{SparkConf, SparkContext}

     def main(args: Array[String]): Unit = {

       // Create the Spark context. setMaster("local[2]") runs the job locally without a Spark cluster;
       // remove it (or override it via spark-submit --master yarn) to run on YARN.
       val sparkConf = new SparkConf().setAppName("Spark-Hbase-Read").setMaster("local[2]")
       val sc = new SparkContext(sparkConf)
       // Build the HBase configuration
       val hbaseConf = HbaseUtils.getHbaseConf("centos01,centos02,centos03", "2181")
       // Set the table to read from
       hbaseConf.set(TableInputFormat.INPUT_TABLE, "movie_wordcount")
       // Load the HBase table as an RDD of (row key, Result) pairs
       val hbaseRdd = sc.newAPIHadoopRDD(hbaseConf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result]).cache()

       hbaseRdd.map(x => {
         val result = x._2
         val row = Bytes.toString(result.getRow)
         val word = Bytes.toString(result.getValue("word".getBytes(), "word".getBytes()))
         // The count was stored as an int, so read it back with Bytes.toInt; Bytes.toString would produce garbage
         val count = Bytes.toInt(result.getValue("word".getBytes(), "count".getBytes()))

         println(row, word, count)
         (row, word, count)
       }).saveAsTextFile("/wordcount/output3")
     }

    Package the job and submit it to the cluster with: ./bin/spark-submit --class com.xxx.xx.scala.hbase.SparkHbaseR ./localjar/sc-1.0-SNAPSHOT-jar-with-dependencies.jar
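    The write job shown in section 2 below reads the HDFS output path from its first program argument (args(0)), so its submission follows the same pattern; a hedged example, where the class prefix, jar path and output path are placeholders: ./bin/spark-submit --class com.xxx.xx.scala.hbase.SparkHbaseW ./localjar/sc-1.0-SNAPSHOT-jar-with-dependencies.jar /wordcount/output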

      saveAsTextFile writes to HDFS by default, so check the result there; the output path is a directory containing one part file per partition.

      List it with: hadoop fs -ls /wordcount/output3
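      To spot-check the saved records from Spark itself, the output directory can be read back as a plain text RDD; a minimal sketch, assuming the same SparkContext sc and the /wordcount/output3 path used above:

      // Read the saved part files back and print the first few records
      val saved = sc.textFile("/wordcount/output3")
      saved.take(10).foreach(println)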

      

      2. Spark operations on HBase - Write

    import java.util.UUID

    import org.ansj.recognition.impl.StopRecognition
    import org.ansj.splitWord.analysis.ToAnalysis
    import org.apache.hadoop.hbase.client.Put
    import org.apache.hadoop.hbase.util.Bytes
    import org.apache.spark.{SparkConf, SparkContext}

    import scala.collection.mutable
    import scala.collection.mutable.ArrayBuffer

    object SparkHbaseW {

      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("Spark-Hbase-Write").setMaster("yarn")
        val sc = new SparkContext(conf)
        val file = sc.textFile("/spark/movie/wordcount/pinglun.txt")
        // Tokenize each line with the Chinese word segmenter, then count the words
        val rdd = file.flatMap(line => {
          getWords(line, filter(new Array[String](0)))
        }).map(x => (x, 1)).reduceByKey(_ + _)
        // Save a copy of the result to HDFS (output path passed as the first program argument)
        rdd.saveAsTextFile(args(0))

        // Insert method one: write one Put per record
        // (this opens a table handle for every record; the batched version below reuses one per partition)
        rdd.foreachPartition(x => {
          x.foreach(y => {
            // Write the (word, count) pair into HBase
            val table = HbaseUtils.getTable(HbaseUtils.getHbaseConf("centos01,centos02,centos03", "2181"), "movie_wordcount")
            val family = Bytes.toBytes("word")
            val wordColum = Bytes.toBytes("word")
            val countColum = Bytes.toBytes("count")

            // Use a random UUID as the row key
            val uuid = UUID.randomUUID()
            val wordPut = new Put(Bytes.toBytes(uuid.toString))
            wordPut.addColumn(family, wordColum, Bytes.toBytes(y._1))
            wordPut.addColumn(family, countColum, Bytes.toBytes(y._2))
            table.put(wordPut)
          })
        })

        // Insert method two (batch insert): collect the Puts per partition and write them in one call
        // rdd.foreachPartition(x => {
        //   val table = HbaseUtils.getTable(HbaseUtils.getHbaseConf("centos01,centos02,centos03", "2181"), "movie_wordcount")
        //   val puts = new java.util.LinkedList[Put]()
        //   x.foreach(y => {
        //     // Build one Put per (word, count) pair
        //     val family = Bytes.toBytes("word")
        //     val wordColum = Bytes.toBytes("word")
        //     val countColum = Bytes.toBytes("count")
        //     val uuid = UUID.randomUUID()
        //     val wordPut = new Put(Bytes.toBytes(uuid.toString))
        //     wordPut.addColumn(family, wordColum, Bytes.toBytes(y._1))
        //     wordPut.addColumn(family, countColum, Bytes.toBytes(y._2))
        //     puts.add(wordPut)
        //   })
        //   table.put(puts)
        // })
      }

      /**
        * Build the stop-word filter for the segmenter
        *
        * @param stopWords extra stop words to filter out
        * @return
        */
      def filter(stopWords: Array[String]): StopRecognition = {
        // add stop words
        val filter = new StopRecognition
        filter.insertStopNatures("w") // filter punctuation
        filter.insertStopNatures("m") // filter the "m" (numeral) nature
        filter.insertStopNatures("null") // filter null
        filter.insertStopNatures("<br />") // filter <br />
        filter.insertStopRegexes("^[a-zA-Z]{1,}") // filter English letters
        filter.insertStopRegexes("^[0-9]+") // filter numbers
        filter.insertStopRegexes("[^a-zA-Z0-9\\u4e00-\\u9fa5]+") // filter anything that is not a letter, digit or Chinese character
        filter.insertStopRegexes("\t")
        for (x <- stopWords) {
          filter.insertStopWords(x)
        }
        filter
      }

      /**
        * Tokenize a line of text into words
        *
        * @param text   the input text
        * @param filter the stop-word filter
        * @return
        */
      def getWords(text: String, filter: StopRecognition): ArrayBuffer[String] = {
        val words = new mutable.ArrayBuffer[String]()
        val terms = ToAnalysis.parse(text).recognition(filter).getTerms
        for (i <- 0 until terms.size()) {
          val word = terms.get(i).getName
          // keep only non-empty words
          if (word.length > 0) {
            words += word
          }
        }
        words
      }

    }
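    The write example above calls HbaseUtils.getTable, which is not shown in the post. A minimal sketch of such a helper, assuming the standard HBase client Connection API and living in the same HbaseUtils object as the getHbaseConf method from section 1:

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.hbase.TableName
    import org.apache.hadoop.hbase.client.{ConnectionFactory, Table}

    // Hypothetical helper: open a connection and return a handle to the given table
    def getTable(conf: Configuration, tableName: String): Table = {
      val connection = ConnectionFactory.createConnection(conf)
      connection.getTable(TableName.valueOf(tableName))
    }

    Note that the Connection is left open in this sketch; in a real job it should be created once per partition, reused for all Puts, and closed together with the Table when the writes finish.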
  • Original post: https://www.cnblogs.com/chengzhihua/p/11428673.html