zoukankan      html  css  js  c++  java
  • SPARK 数据统计程序性能优化。

    昨天写完R脚本, 没测试就发到博客里, 结果实际运行发现很慢, 运行时间在2小时以上。查看spark控制台, 大量时间消耗在count上, 产生的stage多达70多个。

    分析原因: 1. select * 可以优化;  2. join操作能放到hive sql里的, 尽量放到hive sql里。

    这两个优化, 最终目的都是为了减少I/O操作。  hive数据到spark cache的数据量可以减少。 而且可能hive对join操作也有特别的优化。

    这两个优化带来的坏处也是显而易见的, 代码可读性下降, 调试长sql语句的难度比调试spark 集合运算的api难度要大。

    优化完, 实际全部运行时间, 只有8分钟。代码如下

    # Report parameters: date_parameter is the coupon acquisition date to analyse.
    # Change it to report on another day.
    date_parameter <- "2016-07-11"
    dayCount_parameter <- 1  # not referenced anywhere in the visible script


    hiveContext <- sparkRHive.init(sc)
    sql(hiveContext, "use honeycomb_bh_db")

    ## Stage 1: data preparation.
    # The year/month predicates used to be hard-coded to 2016/07, so changing
    # date_parameter to another month (or to a day whose 14-day window crosses a
    # month boundary) silently read the wrong partitions. They are now derived
    # from date_parameter. as.Date() also fails fast on a malformed date.
    start_date <- as.Date(date_parameter)

    # Builds the year/month predicate covering start_date .. start_date +
    # days_after for the table alias/prefix given (e.g. "a.").
    # NOTE(review): assumes each table's year/month columns follow the calendar
    # month of the timestamp being filtered, as the original hard-coded
    # predicate implied -- confirm against the table definitions.
    partition_filter <- function(prefix, days_after) {
      window_days <- seq(start_date, start_date + days_after, by = "day")
      year_months <- unique(format(window_days, "%Y-%m"))
      clauses <- vapply(
        strsplit(year_months, "-", fixed = TRUE),
        function(ym) {
          # Month stays zero-padded ("07"), matching the original literal.
          paste0("(", prefix, "year=", ym[1], " and ", prefix, "month=", ym[2], ")")
        },
        character(1)
      )
      paste0("(", paste(clauses, collapse = " or "), ")")
    }

    # Substitutes every STARTDATE placeholder in a SQL template.
    fill_start_date <- function(template) {
      gsub(pattern = "STARTDATE", replacement = date_parameter, template, fixed = TRUE)
    }

    # Distinct phone numbers that acquired a coupon on the start date.
    acquired_users_sql <- fill_start_date(paste0(
      "select distinct presentee_mobile from sc_t_acquire_record where ",
      partition_filter("sc_t_acquire_record.", 0),
      " and to_date(ct_time)='STARTDATE'"
    ))
    # All orders of product 210 from the start date through day 14.
    all_order_sql <- fill_start_date(paste0(
      "select passenger_phone,create_time from sc_t_order_all_info As a where ",
      partition_filter("a.", 14),
      " and to_date(a.create_time)>='STARTDATE' and to_date(a.create_time)<=date_add(date('STARTDATE'),14) and product_id=210"
    ))
    # Rebated orders of product 210 from the start date through day 7.
    rebate_order_sql <- fill_start_date(paste0(
      "select passenger_phone,create_time from sc_t_order_rebate_info As a where ",
      partition_filter("a.", 7),
      " and to_date(a.create_time)>='STARTDATE' and to_date(a.create_time)<=date_add(date('STARTDATE'),7) and product_id=210"
    ))


    # Users who acquired a coupon on the start date; cached for reuse.
    acquired_users <- sql(hiveContext, acquired_users_sql)
    cache(acquired_users)

    # All orders in the 15-day window (day 0 through day 14).
    all_orders <- sql(hiveContext, all_order_sql)

    # Rebated orders in the first 8 days (day 0 through day 7).
    rebated_orders <- sql(hiveContext, rebate_order_sql)

    # Coupon users joined to their orders. The join is expressed in Hive SQL
    # rather than as a SparkR join()/filter() pair, to reduce the I/O between
    # Hive and Spark.

    # Wraps an orders query in the join against the coupon-user query, keeping
    # only rows where both sides have a phone number.
    join_acquired_with <- function(orders_sql) {
      paste0(
        "select * from (", acquired_users_sql,
        ") As acquire left outer join (", orders_sql,
        ") As orders on acquire.presentee_mobile = orders.passenger_phone where orders.passenger_phone is not null and acquire.presentee_mobile is not null"
      )
    }

    # Coupon users with at least one order between day 0 and the end of day 14.
    acquired_users_with_orders_sql <- join_acquired_with(all_order_sql)
    acquired_users_with_orders <- sql(hiveContext, acquired_users_with_orders_sql)
    cache(acquired_users_with_orders)

    mobiles_acquired_users_with_order <- distinct(
      select(acquired_users_with_orders, "presentee_mobile")
    )
    # Optional dump of the joined rows, left disabled:
    #write.json(acquired_users_with_orders, "file:///home/rd/spark/bin/20160711_users_convertion.json")

    # Coupon users with a rebated order between day 0 and the end of day 7.
    orders_rebated_sql <- join_acquired_with(rebate_order_sql)
    orders_rebated_within_8days <- sql(hiveContext, orders_rebated_sql)
    cache(orders_rebated_within_8days)

    # Result table, seeded with a placeholder row; later rows are appended to it.
    results <- data.frame(
      "name" = c("frist"),
      "value" = c(0),
      stringsAsFactors = FALSE
    )

    ## Stage 2: set operations on the prepared data in Spark.

    # Substitutes the report date into a filter-expression template.
    fill_rule_date <- function(template) {
      gsub(pattern = "STARTDATE", replacement = date_parameter, template, fixed = TRUE)
    }

    # Orders placed by coupon users from day 0 through the end of day 7, i.e.
    # while the coupon is valid.
    rules <- fill_rule_date(
      "to_date(create_time)>='STARTDATE' and to_date(create_time)<=date_add(date('STARTDATE'),7)"
    )
    orders_within_8days <- filter(acquired_users_with_orders, rules)
    mobiles_with_orders_within_8days <- distinct(
      select(orders_within_8days, "presentee_mobile")
    )


    # Orders placed by coupon users from day 8 through the end of day 14, i.e.
    # after the coupon has expired. The upper bound used to be day 15, which
    # contradicted the documented window; it is now day 14.
    rules <- fill_rule_date(
      "to_date(create_time)>=date_add(date('STARTDATE'),8) and to_date(create_time)<=date_add(date('STARTDATE'),14)"
    )
    orders_after_8days <- filter(acquired_users_with_orders, rules)
    mobiles_with_orders_after_8days <- distinct(
      select(orders_after_8days, "presentee_mobile")
    )


    # Coupon users with a rebate record between day 0 and the end of day 7.
    mobiles_user_reabted <- distinct(
      select(orders_rebated_within_8days, "presentee_mobile")
    )

    # Users who ordered within days 0-7 but were never rebated.
    mobiles_my_team_losted <- except(
      mobiles_with_orders_within_8days, mobiles_user_reabted
    )

    # Users who ordered in days 8-14 and have no rebate record; these are the
    # ones the "sic" counting method picks up after the coupon expired.
    mobiles_after_7days_countedBySicheng <- except(
      mobiles_with_orders_after_8days, mobiles_user_reabted
    )

    # Un-rebated users from days 0-7 who also ordered in days 8-14.
    mobiles_my_team_losted_countedBySicheng <- intersect(
      mobiles_my_team_losted, mobiles_with_orders_after_8days
    )


    # Un-rebated users from days 0-7 who are not in the days 8-14 "sic" set.
    mobiles_both_losted <- except(
      mobiles_my_team_losted, mobiles_after_7days_countedBySicheng
    )

    # Un-rebated users from days 0-7 with no order in days 8-14.
    mobile_first_order_withno_coupon_no_futher_order_after_7days <- except(
      mobiles_my_team_losted, mobiles_with_orders_after_8days
    )

    # Users with no order in days 0-7 who did order in days 8-14.
    mobiles_with_order_invoked_coupon <- except(
      mobiles_with_orders_after_8days, mobiles_with_orders_within_8days
    )

    # All coupon-user order rows in the 15-day window. A user can acquire
    # coupons repeatedly, so this set can count the same user more than once.
    mobiles_converted <- acquired_users_with_orders

    # Stage 3: output the results.
    # The first rbind() was missing its closing parenthesis, which made the whole
    # script fail to parse. Each count is taken once and reused: nrow() on a Spark
    # DataFrame is a count action, and two of these were previously computed twice.
    count_acquired <- nrow(acquired_users)
    count_converted_15days <- nrow(mobiles_acquired_users_with_order)
    count_rebated <- nrow(mobiles_user_reabted)
    count_after_expiry <- nrow(mobiles_after_7days_countedBySicheng)
    count_team_lost <- nrow(mobiles_my_team_losted)
    count_both_lost <- nrow(mobiles_both_losted)
    count_team_lost_later_order <- nrow(mobiles_my_team_losted_countedBySicheng)
    count_invoked_by_coupon <- nrow(mobiles_with_order_invoked_coupon)

    # Rows are appended as c(label, count), so the value column is coerced to
    # character exactly as before.
    results <- rbind(results, c("领新手券的用户数量", count_acquired))
    results <- rbind(results, c("领新手券后15日转化的用户数量", count_converted_15days))
    results <- rbind(results, c("领新手券7日内打车用券转化的用户数量", count_rebated))
    results <- rbind(results, c("新手券有效期过期后7日内打车转化用户", count_after_expiry))
    results <- rbind(results, c("sic统计方法统计的转化用户数", count_rebated + count_after_expiry))
    results <- rbind(results, c("7日内首单未用新手券的人数", count_team_lost))
    results <- rbind(results, c("7日内首单未用新手券, 后7日内没打车的人数", count_both_lost))
    results <- rbind(results, c("7日内首单未用新手券, 后7日内有打车的人数", count_team_lost_later_order))

    results <- rbind(results, c("领新手券后7日内未打车, 后7日又打车的人数", count_invoked_by_coupon))
    results

  • 相关阅读:
    网站中使用了Excel组件问题 Microsoft.ACE.OLEDB.14.0' provider is not registered on the local machine
    键值对在架构设计里的应用
    轻轻松松 用U盘安装WIN7
    U盘装WIN7:微软官方工具《Windows 7 USB DVD Download Tool》U盘装wind7(更新官方整合SP1的WIN7 ISO)
    微软一站式示例代码库(中文版)20110924版本, 新添加ASP.NET, Windows Base, Silverlight, WinForm等20个Sample
    最薄笔记本苹果MacBook Air安装雪豹+Win7双系统的驱动解决方法
    使用受保护的配置加密配置信息
    自动加密web.config配置节批处理
    Asp.net MVC3学习
    周老师科研站
  • 原文地址:https://www.cnblogs.com/realzjx/p/5717825.html
Copyright © 2011-2022 走看看