  • Flattening nested JSON data in a DataFrame and adding an auto-increment ID column

    {"name":"Michael", "age":25,"myScore":[{"score1":19,"score2":23},{"score1":58,"score2":50}]}
    {"name":"Andy", "age":30,"myScore":[{"score1":29,"score2":33},{"score1":38,"score2":52},{"score1":88,"score2":71}]}
    {"name":"Justin", "age":19,"myScore":[{"score1":39,"score2":43},{"score1":28,"score2":53}]}
    {"name":"Michael", "age":25,"myScore":[{"score1":19,"score2":23},{"score1":58,"score2":50}]}
    import org.apache.spark.sql.SparkSession

    object explodeTest {
      def main(args: Array[String]): Unit = {

        val sparks = SparkSession.builder.master("local[4]").appName("test1").getOrCreate()
        val sc = sparks.sparkContext

        // Read the JSON file (one JSON object per line); forward slashes avoid
        // Scala string-escape problems with Windows paths
        val df = sparks.read.json("file:///C:/Users/imp/Desktop/bo-kong/data/josn")

        df.show()
        /**
          * +---+--------------------+-------+
          * |age|             myScore|   name|
          * +---+--------------------+-------+
          * | 25|  [[19,23], [58,50]]|Michael|
          * | 30|[[29,33], [38,52]...|   Andy|
          * | 19|  [[39,43], [28,53]]| Justin|
          * | 25|  [[19,23], [58,50]]|Michael|
          * | 30|[[29,33], [38,52]...|   Andy|
          * | 19|  [[39,43], [28,53]]| Justin|
          * | 25|  [[19,23], [58,50]]|Michael|
          * | 30|[[29,33], [38,52]...|   Andy|
          * | 19|  [[39,43], [28,53]]| Justin|
          * +---+--------------------+-------+
          */
    
        // Use the explode function from org.apache.spark.sql.functions to flatten
        // the myScore array: one output row per array element
        import org.apache.spark.sql.functions._
        val dfScore = df.select(df("name"), explode(df("myScore"))).toDF("name", "myScore")
        // Pull the struct fields out into top-level columns
        val dfMyScore = dfScore.select("name", "myScore.score1", "myScore.score2")
        dfScore.show()
        df.createOrReplaceTempView("df")
        /**
          * +-------+-------+
          * |   name|myScore|
          * +-------+-------+
          * |Michael|[19,23]|
          * |Michael|[58,50]|
          * |   Andy|[29,33]|
          * |   Andy|[38,52]|
          * |   Andy|[88,71]|
          * | Justin|[39,43]|
          * | Justin|[28,53]|
          * |Michael|[19,23]|
          * |Michael|[58,50]|
          * |   Andy|[29,33]|
          * |   Andy|[38,52]|
          * |   Andy|[88,71]|
          * | Justin|[39,43]|
          * | Justin|[28,53]|
          * |Michael|[19,23]|
          * |Michael|[58,50]|
          * |   Andy|[29,33]|
          * |   Andy|[38,52]|
          * |   Andy|[88,71]|
          * | Justin|[39,43]|
          * +-------+-------+
          * only showing top 20 rows
          */
    
    
    
      }
    }
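The temp view registered with createOrReplaceTempView("df") above is never actually queried. As a minimal sketch, the same flatten can also be written in Spark SQL with LATERAL VIEW explode (standard Spark SQL syntax; it produces the same columns as dfMyScore above):

    // Spark SQL equivalent of the explode-based flatten above:
    // LATERAL VIEW explode emits one row per element of the myScore array
    sparks.sql(
      """SELECT name, score.score1, score.score2
        |FROM df
        |LATERAL VIEW explode(myScore) t AS score""".stripMargin).show()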
    
    
    
    
     
    Sample data (contents of the text file read below, one value per line):
    aa
    bb
    cc
    dd
    ee
    ff

    Adding an index (primary key) column to a DataFrame

    case class Log(map: scala.collection.mutable.Map[String, String], ID: Long)
    import sparks.implicits._

    // zipWithIndex pairs each element with its 0-based index: RDD[(Map, Long)]
    val data2 = sc.parallelize(Seq(
        Map("uuid" -> "sxexx", "ip" -> "192.168"),
        Map("uuid" -> "man", "ip" -> "192.168.10.1"))).zipWithIndex()
    data2.collect().foreach(print)
    /**
      * Create an RDD[Map] and apply zipWithIndex; the second element of each
      * resulting tuple is the auto-increment ID:
      *
      * (Map(uuid -> sxexx, ip -> 192.168),0)
      * (Map(uuid -> man, ip -> 192.168.10.1),1)
      */
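To turn those (Map, index) pairs into a DataFrame, calling toDF on the tuple RDD is enough. A minimal sketch, assuming the implicits import above; the column names "attributes" and "id" are chosen here for illustration, and Map[String, String] becomes a Spark MapType column:

    // Sketch: convert the (Map, index) pairs into a two-column DataFrame
    val mapDF = data2.toDF("attributes", "id")
    mapDF.show(truncate = false)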
    
    
    
    
    // zipWithIndex pairs each line with its index as (line, index),
    // so the value column comes first and the id column second
    val data = sc.textFile("file:///C:/Users/imp/Desktop/bo-kong/data/data")
      .zipWithIndex().toDF("value", "id")
    data.show()
    
    /**
      * Result for the sample data above (line text first, then the 0-based id):
      * +-----+---+
      * |value| id|
      * +-----+---+
      * |   aa|  0|
      * |   bb|  1|
      * |   cc|  2|
      * |   dd|  3|
      * |   ee|  4|
      * |   ff|  5|
      * +-----+---+
      */
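zipWithIndex requires dropping down to the RDD API. As an alternative sketch using only the DataFrame API: monotonically_increasing_id() generates unique, increasing (but not consecutive) IDs, and row_number() over a window turns them into a consecutive 0-based index. Note that an unpartitioned window pulls all rows onto a single partition, fine for small data but a bottleneck at scale:

    import org.apache.spark.sql.expressions.Window
    import org.apache.spark.sql.functions.{monotonically_increasing_id, row_number}

    // Unique and increasing, but NOT consecutive across partitions
    val withId = data.withColumn("mono_id", monotonically_increasing_id())

    // Consecutive 0-based index via a window over the monotonic ids
    // (Window.orderBy without partitionBy moves all rows to one partition)
    val withSeq = withId.withColumn("idx", row_number().over(Window.orderBy("mono_id")) - 1)
    withSeq.show()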
  • Original article: https://www.cnblogs.com/hejunhong/p/10604568.html