zoukankan      html  css  js  c++  java
  • 项目实战从0到1之hive(41)大数据项目之电商数仓(用户行为数据)(九)

    第11章 需求一:用户活跃主题

    11.1 DWS层

    目标:统计当日(dau)、当周、当月活动的每个设备明细

    11.1.1 每日活跃设备明细

    img

    1)建表语句

    drop table if exists dws_uv_detail_day;
    create external table dws_uv_detail_day
    (
      `mid_id` string COMMENT '设备唯一标识',
      `user_id` string COMMENT '用户标识',
      `version_code` string COMMENT '程序版本号',
      `version_name` string COMMENT '程序版本名',
      `lang` string COMMENT '系统语言',
      `source` string COMMENT '渠道号',
      `os` string COMMENT '安卓系统版本',
      `area` string COMMENT '区域',
      `model` string COMMENT '手机型号',
      `brand` string COMMENT '手机品牌',
      `sdk_version` string COMMENT 'sdkVersion',
      `gmail` string COMMENT 'gmail',
      `height_width` string COMMENT '屏幕宽高',
      `app_time` string COMMENT '客户端日志产生时的时间',
      `network` string COMMENT '网络模式',
      `lng` string COMMENT '经度',
      `lat` string COMMENT '纬度'
    )
    partitioned by(dt string)
    stored as parquet
    location '/warehouse/gmall/dws/dws_uv_detail_day';

    2)数据导入

    以用户单日访问为key进行聚合,如果某个用户在一天中使用了两种操作系统、两个系统版本、多个地区,登录不同账号,只取其中之一

    set hive.exec.dynamic.partition.mode=nonstrict;

    insert overwrite table dws_uv_detail_day
    partition(dt='2020-10-14')
    select  
      mid_id,
      concat_ws('|', collect_set(user_id)) user_id,
      concat_ws('|', collect_set(version_code)) version_code,
      concat_ws('|', collect_set(version_name)) version_name,
      concat_ws('|', collect_set(lang))lang,
      concat_ws('|', collect_set(source)) source,
      concat_ws('|', collect_set(os)) os,
      concat_ws('|', collect_set(area)) area,
      concat_ws('|', collect_set(model)) model,
      concat_ws('|', collect_set(brand)) brand,
      concat_ws('|', collect_set(sdk_version)) sdk_version,
      concat_ws('|', collect_set(gmail)) gmail,
      concat_ws('|', collect_set(height_width)) height_width,
      concat_ws('|', collect_set(app_time)) app_time,
      concat_ws('|', collect_set(network)) network,
      concat_ws('|', collect_set(lng)) lng,
      concat_ws('|', collect_set(lat)) lat
    from dwd_start_log
    where dt='2020-10-14'
    group by mid_id;

    3)查询导入结果

    select * from dws_uv_detail_day limit 1;
    select count(*) from dws_uv_detail_day;

    4)思考

    不同渠道来源的每日活跃数统计怎么计算?

    11.1.2 每周活跃设备明细

    img

    根据日用户访问明细,获得周用户访问明细。

    1)建表语句

    drop table if exists dws_uv_detail_wk;
    create external table dws_uv_detail_wk(
      `mid_id` string COMMENT '设备唯一标识',
      `user_id` string COMMENT '用户标识',
      `version_code` string COMMENT '程序版本号',
      `version_name` string COMMENT '程序版本名',
      `lang` string COMMENT '系统语言',
      `source` string COMMENT '渠道号',
      `os` string COMMENT '安卓系统版本',
      `area` string COMMENT '区域',
      `model` string COMMENT '手机型号',
      `brand` string COMMENT '手机品牌',
      `sdk_version` string COMMENT 'sdkVersion',
      `gmail` string COMMENT 'gmail',
      `height_width` string COMMENT '屏幕宽高',
      `app_time` string COMMENT '客户端日志产生时的时间',
      `network` string COMMENT '网络模式',
      `lng` string COMMENT '经度',
      `lat` string COMMENT '纬度',
      `monday_date` string COMMENT '周一日期',
      `sunday_date` string COMMENT  '周日日期'
    ) COMMENT '活跃用户按周明细'
    PARTITIONED BY (`wk_dt` string)
    stored as parquet
    location '/warehouse/gmall/dws/dws_uv_detail_wk/';

    2)数据导入

    set hive.exec.dynamic.partition.mode=nonstrict;

    insert overwrite table dws_uv_detail_wk partition(wk_dt)
    select  
      mid_id,
      concat_ws('|', collect_set(user_id)) user_id,
      concat_ws('|', collect_set(version_code)) version_code,
      concat_ws('|', collect_set(version_name)) version_name,
      concat_ws('|', collect_set(lang)) lang,
      concat_ws('|', collect_set(source)) source,
      concat_ws('|', collect_set(os)) os,
      concat_ws('|', collect_set(area)) area,
      concat_ws('|', collect_set(model)) model,
      concat_ws('|', collect_set(brand)) brand,
      concat_ws('|', collect_set(sdk_version)) sdk_version,
      concat_ws('|', collect_set(gmail)) gmail,
      concat_ws('|', collect_set(height_width)) height_width,
      concat_ws('|', collect_set(app_time)) app_time,
      concat_ws('|', collect_set(network)) network,
      concat_ws('|', collect_set(lng)) lng,
      concat_ws('|', collect_set(lat)) lat,
      date_add(next_day('2020-10-14','MO'),-7),
      date_add(next_day('2020-10-14','MO'),-1),
      concat(date_add( next_day('2020-10-14','MO'),-7), '_' , date_add(next_day('2020-10-14','MO'),-1)
    )
    from dws_uv_detail_day
    where dt>=date_add(next_day('2020-10-14','MO'),-7) and dt<=date_add(next_day('2020-10-14','MO'),-1)
    group by mid_id;

    3)查询导入结果

    select * from dws_uv_detail_wk limit 1;
    select count(*) from dws_uv_detail_wk;

    11.1.3 每月活跃设备明细

    img

    1)建表语句

    drop table if exists dws_uv_detail_mn;

    create external table dws_uv_detail_mn(
      `mid_id` string COMMENT '设备唯一标识',
      `user_id` string COMMENT '用户标识',
      `version_code` string COMMENT '程序版本号',
      `version_name` string COMMENT '程序版本名',
      `lang` string COMMENT '系统语言',
      `source` string COMMENT '渠道号',
      `os` string COMMENT '安卓系统版本',
      `area` string COMMENT '区域',
      `model` string COMMENT '手机型号',
      `brand` string COMMENT '手机品牌',
      `sdk_version` string COMMENT 'sdkVersion',
      `gmail` string COMMENT 'gmail',
      `height_width` string COMMENT '屏幕宽高',
      `app_time` string COMMENT '客户端日志产生时的时间',
      `network` string COMMENT '网络模式',
      `lng` string COMMENT '经度',
      `lat` string COMMENT '纬度'
    ) COMMENT '活跃用户按月明细'
    PARTITIONED BY (`mn` string)
    stored as parquet
    location '/warehouse/gmall/dws/dws_uv_detail_mn/';

    2)数据导入

    set hive.exec.dynamic.partition.mode=nonstrict;

    insert overwrite table dws_uv_detail_mn partition(mn)
    select  
      mid_id,
      concat_ws('|', collect_set(user_id)) user_id,
      concat_ws('|', collect_set(version_code)) version_code,
      concat_ws('|', collect_set(version_name)) version_name,
      concat_ws('|', collect_set(lang)) lang,
      concat_ws('|', collect_set(source)) source,
      concat_ws('|', collect_set(os)) os,
      concat_ws('|', collect_set(area)) area,
      concat_ws('|', collect_set(model)) model,
      concat_ws('|', collect_set(brand)) brand,
      concat_ws('|', collect_set(sdk_version)) sdk_version,
      concat_ws('|', collect_set(gmail)) gmail,
      concat_ws('|', collect_set(height_width)) height_width,
      concat_ws('|', collect_set(app_time)) app_time,
      concat_ws('|', collect_set(network)) network,
      concat_ws('|', collect_set(lng)) lng,
      concat_ws('|', collect_set(lat)) lat,
      date_format('2020-10-14','yyyy-MM')
    from dws_uv_detail_day
    where date_format(dt,'yyyy-MM') = date_format('2020-10-14','yyyy-MM')
    group by mid_id;

    3)查询导入结果

    select * from dws_uv_detail_mn limit 1;
    select count(*) from dws_uv_detail_mn ;

    11.1.4 DWS层加载数据脚本

    1)创建脚本

    [kgg@hadoop102 bin]$ vim dws_uv_log.sh
      在脚本中编写如下内容
    #!/bin/bash

    # 定义变量方便修改
    APP=gmall
    hive=/opt/module/hive/bin/hive

    # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
    if [ -n "$1" ] ;then
       do_date=$1
    else
       do_date=`date -d "-1 day" +%F`  
    fi
    echo "===日志日期为 $do_date==="
    sql="
     set hive.exec.dynamic.partition.mode=nonstrict;

    insert overwrite table "$APP".dws_uv_detail_day partition(dt='$do_date')
    select  
      mid_id,
      concat_ws('|', collect_set(user_id)) user_id,
      concat_ws('|', collect_set(version_code)) version_code,
      concat_ws('|', collect_set(version_name)) version_name,
      concat_ws('|', collect_set(lang)) lang,
      concat_ws('|', collect_set(source)) source,
      concat_ws('|', collect_set(os)) os,
      concat_ws('|', collect_set(area)) area,
      concat_ws('|', collect_set(model)) model,
      concat_ws('|', collect_set(brand)) brand,
      concat_ws('|', collect_set(sdk_version)) sdk_version,
      concat_ws('|', collect_set(gmail)) gmail,
      concat_ws('|', collect_set(height_width)) height_width,
      concat_ws('|', collect_set(app_time)) app_time,
      concat_ws('|', collect_set(network)) network,
      concat_ws('|', collect_set(lng)) lng,
      concat_ws('|', collect_set(lat)) lat
    from "$APP".dwd_start_log
    where dt='$do_date'  
    group by mid_id;


    insert overwrite table "$APP".dws_uv_detail_wk partition(wk_dt)
    select  
      mid_id,
      concat_ws('|', collect_set(user_id)) user_id,
      concat_ws('|', collect_set(version_code)) version_code,
      concat_ws('|', collect_set(version_name)) version_name,
      concat_ws('|', collect_set(lang)) lang,
      concat_ws('|', collect_set(source)) source,
      concat_ws('|', collect_set(os)) os,
      concat_ws('|', collect_set(area)) area,
      concat_ws('|', collect_set(model)) model,
      concat_ws('|', collect_set(brand)) brand,
      concat_ws('|', collect_set(sdk_version)) sdk_version,
      concat_ws('|', collect_set(gmail)) gmail,
      concat_ws('|', collect_set(height_width)) height_width,
      concat_ws('|', collect_set(app_time)) app_time,
      concat_ws('|', collect_set(network)) network,
      concat_ws('|', collect_set(lng)) lng,
      concat_ws('|', collect_set(lat)) lat,
      date_add(next_day('$do_date','MO'),-7),
      date_add(next_day('$do_date','MO'),-1),
      concat(date_add( next_day('$do_date','MO'),-7), '_' , date_add(next_day('$do_date','MO'),-1)
    )
    from "$APP".dws_uv_detail_day
    where dt>=date_add(next_day('$do_date','MO'),-7) and dt<=date_add(next_day('$do_date','MO'),-1)
    group by mid_id;


    insert overwrite table "$APP".dws_uv_detail_mn partition(mn)
    select
      mid_id,
      concat_ws('|', collect_set(user_id)) user_id,
      concat_ws('|', collect_set(version_code)) version_code,
      concat_ws('|', collect_set(version_name)) version_name,
      concat_ws('|', collect_set(lang))lang,
      concat_ws('|', collect_set(source)) source,
      concat_ws('|', collect_set(os)) os,
      concat_ws('|', collect_set(area)) area,
      concat_ws('|', collect_set(model)) model,
      concat_ws('|', collect_set(brand)) brand,
      concat_ws('|', collect_set(sdk_version)) sdk_version,
      concat_ws('|', collect_set(gmail)) gmail,
      concat_ws('|', collect_set(height_width)) height_width,
      concat_ws('|', collect_set(app_time)) app_time,
      concat_ws('|', collect_set(network)) network,
      concat_ws('|', collect_set(lng)) lng,
      concat_ws('|', collect_set(lat)) lat,
      date_format('$do_date','yyyy-MM')
    from "$APP".dws_uv_detail_day
    where date_format(dt,'yyyy-MM') = date_format('$do_date','yyyy-MM')  
    group by mid_id;
    "

    $hive -e "$sql"

    2)增加脚本执行权限

    chmod 777 dws_uv_log.sh

    3)脚本使用

    dws_uv_log.sh 2019-02-11

    4)查询结果

    select count(*) from dws_uv_detail_day where dt='2019-02-11';
    select count(*) from dws_uv_detail_wk;
    select count(*) from dws_uv_detail_mn ;

    5)脚本执行时间

    企业开发中一般在每日凌晨30分~1点

     

    11.2 ADS层

    目标:当日、当周、当月活跃设备数

    11.2.1 活跃设备数

    img

    1)建表语句

    drop table if exists ads_uv_count;
    create external table ads_uv_count(
      `dt` string COMMENT '统计日期',
      `day_count` bigint COMMENT '当日用户数量',
      `wk_count`  bigint COMMENT '当周用户数量',
      `mn_count`  bigint COMMENT '当月用户数量',
      `is_weekend` string COMMENT 'Y,N是否是周末,用于得到本周最终结果',
      `is_monthend` string COMMENT 'Y,N是否是月末,用于得到本月最终结果'
    ) COMMENT '活跃设备数'
    row format delimited fields terminated by ' '
    location '/warehouse/gmall/ads/ads_uv_count/';

    2)导入数据

    insert into table ads_uv_count 
    select  
     '2020-10-14' dt,
      daycount.ct,
      wkcount.ct,
      mncount.ct,
    if(date_add(next_day('2020-10-14','MO'),-1)='2020-10-14','Y','N') ,
      if(last_day('2020-10-14')='2020-10-14','Y','N')
    from
    (
      select  
         '2020-10-14' dt,
          count(*) ct
      from dws_uv_detail_day
      where dt='2020-10-14'  
    )daycount join
    (
      select  
        '2020-10-14' dt,
        count (*) ct
      from dws_uv_detail_wk
      where wk_dt=concat(date_add(next_day('2020-10-14','MO'),-7),'_' ,date_add(next_day('2020-10-14','MO'),-1) )
    ) wkcount on daycount.dt=wkcount.dt
    join
    (
      select  
        '2020-10-14' dt,
        count (*) ct
      from dws_uv_detail_mn
      where mn=date_format('2020-10-14','yyyy-MM')  
    )mncount on daycount.dt=mncount.dt;

    3)查询导入结果

    select * from ads_uv_count ;

    11.2.2 ADS层加载数据脚本

    1)创建脚本

    [kgg@hadoop102 bin]$ vim ads_uv_log.sh
      在脚本中编写如下内容
    #!/bin/bash

    # 定义变量方便修改
    APP=gmall
    hive=/opt/module/hive/bin/hive

    # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
    if [ -n "$1" ] ;then
       do_date=$1
    else
       do_date=`date -d "-1 day" +%F`  
    fi
    echo "===日志日期为 $do_date==="
    sql="
     set hive.exec.dynamic.partition.mode=nonstrict;

    insert into table "$APP".ads_uv_count
    select  
     '$do_date' dt,
      daycount.ct,
      wkcount.ct,
      mncount.ct,
      if(date_add(next_day('$do_date','MO'),-1)='$do_date','Y','N') ,
      if(last_day('$do_date')='$do_date','Y','N')
    from
    (
      select  
         '$do_date' dt,
          count(*) ct
      from "$APP".dws_uv_detail_day
      where dt='$do_date'  
    )daycount   join
    (
      select  
        '$do_date' dt,
        count (*) ct
      from "$APP".dws_uv_detail_wk
      where wk_dt=concat(date_add(next_day('$do_date','MO'),-7),'_' ,date_add(next_day('$do_date','MO'),-1) )
    ) wkcount on daycount.dt=wkcount.dt
    join
    (
      select  
        '$do_date' dt,
        count (*) ct
      from "$APP".dws_uv_detail_mn
      where mn=date_format('$do_date','yyyy-MM')  
    )mncount on daycount.dt=mncount.dt;
    "

    $hive -e "$sql"

    2)增加脚本执行权限

    chmod 777 ads_uv_log.sh

    3)脚本使用

     ads_uv_log.sh 2019-02-11

    4)脚本执行时间

    企业开发中一般在每日凌晨30分~1点

    5)查询导入结果

     select * from ads_uv_count;

     

    作者:大码王

    -------------------------------------------

    个性签名:独学而无友,则孤陋而寡闻。做一个灵魂有趣的人!

    如果觉得这篇文章对你有小小的帮助的话,记得在右下角点个“推荐”哦,博主在此感谢!

    万水千山总是情,打赏一分行不行,所以如果你心情还比较高兴,也是可以扫码打赏博主,哈哈哈(っ•?ω•?)っ???!

  • 相关阅读:
    sklearn各种分类器简单使用
    使用Pandas加载数据
    使用Sklearn-train_test_split 划分数据集
    KNN_python
    ga算法
    粒子群算法
    bp神经网络的实现C++
    理解RNN
    感知器的实现C++
    线性神经网络的实现C++
  • 原文地址:https://www.cnblogs.com/huanghanyu/p/14303487.html
Copyright © 2011-2022 走看看