zoukankan      html  css  js  c++  java
  • spark-sql分组去重总数统计uv

    // Demo: per-name distinct-value count (a UV-style statistic) with Spark SQL.
    // Builds a tiny in-memory dataset of "name,sc" pairs, registers it as a temp
    // table, and runs: select name, count(distinct sc) ... group by name.
    SparkConf sparkConf = new SparkConf()
            .setAppName("Internal_Func")
            .setMaster("local");

    JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    try {
        SQLContext sqlContext = new SQLContext(javaSparkContext);

        // Sample input lines in "name,sc" CSV form.
        // Note: name "2" carries a duplicate sc value (111), so its distinct
        // count is 2 even though it has 3 rows.
        List<String> list = new ArrayList<String>();
        list.add("1,1");
        list.add("2,11");
        list.add("2,111");
        list.add("2,111");
        list.add("3,1111");
        list.add("3,11111");

        JavaRDD<String> rdd_str = javaSparkContext.parallelize(list, 5);

        // Parse each CSV line into a Row of (String name, long sc).
        JavaRDD<Row> rdd_row = rdd_str.map(new Function<String, Row>() {
            @Override
            public Row call(String v1) throws Exception {
                String[] ary = v1.split(",");
                return RowFactory.create(ary[0], Long.parseLong(ary[1]));
            }
        });

        // Schema matching the Rows produced above: name:String, sc:Long.
        List<StructField> fieldList = new ArrayList<StructField>();
        fieldList.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        fieldList.add(DataTypes.createStructField("sc", DataTypes.LongType, true));
        StructType tmp = DataTypes.createStructType(fieldList);

        DataFrame df = sqlContext.createDataFrame(rdd_row, tmp);
        df.registerTempTable("tmp_sc");

        // Count of DISTINCT sc values per name (this is a distinct count, not a sum).
        DataFrame df_agg = sqlContext.sql("select name,count(distinct(sc)) from tmp_sc group by name");

        df_agg.show();
    } finally {
        // Always release the local Spark context, even if the job above throws.
        javaSparkContext.stop();
    }
  • 相关阅读:
    数据库30条规范
    数据库索引原理
    HashMap的实现原理
    Google 和 Baidu 常用的搜索技巧
    Arrays工具类十大常用方法
    fastjson将json格式null转化空串
    SolrCloud的介绍
    网页背景图片自适应浏览器大小
    addlinkedserver
    常用
  • 原文地址:https://www.cnblogs.com/zzq-include/p/8747107.html
Copyright © 2011-2022 走看看