zoukankan      html  css  js  c++  java
  • spark-sql分组去重总数统计uv

    // Demo: per-group distinct count (UV) with Spark SQL on a tiny in-memory dataset.
    SparkConf sparkConf = new SparkConf()
            .setAppName("Internal_Func")
            .setMaster("local");

    JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    try {
        SQLContext sqlContext = new SQLContext(javaSparkContext);

        // Sample "name,sc" records. The duplicated "2,111" line is deliberate:
        // it shows that count(distinct sc) ignores repeated values within a group.
        List<String> list = new ArrayList<String>();
        list.add("1,1");
        list.add("2,11");
        list.add("2,111");
        list.add("2,111");
        list.add("3,1111");
        list.add("3,11111");

        JavaRDD<String> rdd_str = javaSparkContext.parallelize(list, 5);

        // Parse each CSV line into a Row of (String name, long sc).
        JavaRDD<Row> rdd_row = rdd_str.map(new Function<String, Row>() {
            @Override
            public Row call(String line) throws Exception {
                String[] parts = line.split(",");
                return RowFactory.create(parts[0], Long.parseLong(parts[1]));
            }
        });

        // Schema matching the Rows built above: nullable "name" and "sc" columns.
        List<StructField> fieldList = new ArrayList<StructField>();
        fieldList.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        fieldList.add(DataTypes.createStructField("sc", DataTypes.LongType, true));
        StructType schema = DataTypes.createStructType(fieldList);

        DataFrame df = sqlContext.createDataFrame(rdd_row, schema);
        df.registerTempTable("tmp_sc");

        // Group by name and count distinct sc values per group (UV per name).
        DataFrame df_agg = sqlContext.sql("select name,count(distinct(sc)) from tmp_sc group by name");

        df_agg.show();
    } finally {
        // Always stop the context so the local Spark runtime shuts down cleanly,
        // even if any of the job setup or execution above throws.
        javaSparkContext.stop();
    }
  • 相关阅读:
    LYDSY模拟赛day3 序列
    LYDSY模拟赛day3 涂色游戏
    LYDSY模拟赛day3 平均数
    hdu1757 A Simple Math Problem
    清北国庆day1 (脑)残
    poj3070 Fibonacci
    uva10870 递推关系Recurrences
    湖南附中模拟day1 瞭望塔
    湖南附中模拟day1 收银员
    湖南附中模拟day1 金坷垃
  • 原文地址:https://www.cnblogs.com/zzq-include/p/8747107.html
Copyright © 2011-2022 走看看