zoukankan      html  css  js  c++  java
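  • 算子:sample(false, 0.1) 抽样数据——第一个参数 withReplacement = false 表示不放回抽样,第二个参数 fraction = 0.1 表示每条记录约以 10% 的概率被抽中(抽样结果的条数并非精确等于总量的 10%)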

    抽样示例操作(在 spark-shell 中执行:先对 objectid 列做 10% 抽样,再用 countByKey 统计样本中每个 objectid 出现的次数):

    scala> import org.apache.spark.sql.hive.HiveContext
    import org.apache.spark.sql.hive.HiveContext
    
    scala> val hiveContext = new HiveContext(sc)
    17/11/07 17:19:36 WARN SessionState: load mapred-default.xml, HIVE_CONF_DIR env not found!
    17/11/07 17:19:37 WARN SessionState: load mapred-default.xml, HIVE_CONF_DIR env not found!
    hiveContext: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@14cc2fdd
    
    scala> hiveContext.sql("use my_hive_db")
    17/11/07 17:19:40 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
    17/11/07 17:19:40 WARN UserGroupInformation: No groups available for user acount_rc
    res20: org.apache.spark.sql.DataFrame = [result: string]
    
    scala> val sampledPairs = hiveContext.sql("select objectid from myobjectid")
        .map(s=>(s.getAs[String]("objectid"),1))
        .sample(false, 0.1)
    17/11/07 17:19:40 WARN UserGroupInformation: No groups available for user acount_rc
    17/11/07 17:19:40 WARN UserGroupInformation: No groups available for user acount_rc
    sampledPairs: org.apache.spark.rdd.RDD[(String, Int)] = PartitionwiseSampledRDD[1059] at sample at <console>:32
    
    scala> val sampledWordCounts = sampledPairs.countByKey
    sampledWordCounts: scala.collection.Map[String,Long] = Map(193700355 -> 32348, 101549569 -> 81388, 100890370 -> 66425, 184703237 -> 60943, 
    184563457 -> 77401, 100692995 -> 55021, 184756482 -> 88707, 193611009 -> 1588, 185257985 -> 16457, 190035714 -> 14209, 153225089 -> 41515, 
    100811782 -> 115963, 100782849 -> 54729, 184581890 -> 70271, 185388291 -> 76225, 185278978 -> 40917, 80085891 -> 66957, 184957442 -> 59129, 
    153127554 -> 146, 101362179 -> 18600, 193658626 -> 48758, 79805058 -> 17477, 101623810 -> 263451, 184637699 -> 23640, 185363457 -> 24341, 
    153561730 -> 19010, 184722690 -> 2516, 79906177 -> 21106, 193805313 -> 78224, 184739585 -> 34405, 101342210 -> 60860, 193511427 -> 77125, 
    101244675 -> 624, 80425606 -> 12167, 189870594 -> 6944, 101441025 -> 39970, 185549825 -> 322, 101125633...
    scala> sampledWordCounts.foreach(println(_))
    (193700355,32348)
    (101549569,81388)
    (100890370,66425)
    (184703237,60943)
    (184563457,77401)
    (100692995,55021)
    (184756482,88707)
    (193611009,1588)
    (185257985,16457)
    (190035714,14209)
    (153225089,41515)
    (100811782,115963)
    (100782849,54729)
    (184581890,70271)
  • 相关阅读:
    关于sql json数据的处理
    时间函数strtotime的强大
    /usr/bin/install: cannot create regular file `/usr/local/jpeg6/include/jconfig.h'
    linux安装php7.2.7
    关于sql时间方面的处理
    关于centos防火墙的一些问题
    linux 安装ssl 失败原因
    linux安装php7.2.7
    拾取坐标和反查询接口api
    【转】通过点击获取地址等信息、可以传值
  • 原文地址:https://www.cnblogs.com/yy3b2007com/p/7800749.html
Copyright © 2011-2022 走看看