今天我已经把互联网营销精准决策项目的所有数据处理和分析工作都完成了,包括按照范围分类、给表格打标签、设置权重、添加宽表、价格分类分析、销售情况分类分析等。
在这几天的开发过程中,自己学到了很多,包括 Hive 的一些使用方式、HQL 的语法、通过 JDBC 连接 Hive、Hive 的运行机制等。更重要的是,根据老师给的需求一步步自己探索,让我对数据分析有了新的认知:知道了拿到什么样的数据应该按照怎样的步骤处理,以及按照怎样的范围去划分、分类和分析数据。
下面附上我的具体操作代码,其中有基本的注释;但由于写的过程中太投入了,就没有截图……
以下代码总结主要用于以后自己写 Hive 时回顾;对于初学者,即使没有数据集,也可以参考其中的一些基本语法。很多代码还可以优化,但毕竟我刚开始学,请多多包涵。
-- HiveQL script for the phone-marketing analysis: raw tables, cleaned tables,
-- sales rankings, user tag tables, behavior weights, the wide profile table
-- and price-range statistics. Statements appear in the order they were run.
--
-- Hive accepts only the newline character for LINES TERMINATED BY, so every
-- table below declares '\n'.
-- NOTE(review): several tables declare a single space as the FIELD delimiter.
-- Confirm this matches the data files; a tab may have been intended.

-- Raw phone listing table.
CREATE TABLE iphone (
    id        STRING,
    name      ARRAY<STRING>,
    title     STRING,
    storename STRING,
    storeid   STRING,
    link      STRING,
    price     INT,
    keyword   STRING,
    comment   STRING,
    goodcom   INT,
    brand     STRING,
    model     STRING,
    color     STRING,
    uptime    STRING,
    system    STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY ' '
    LINES TERMINATED BY '\n';

-- Cleaned phone listing table; same layout as iphone.
CREATE TABLE iphone1 (
    id        STRING,
    name      ARRAY<STRING>,
    title     STRING,
    storename STRING,
    storeid   STRING,
    link      STRING,
    price     INT,
    keyword   STRING,
    comment   STRING,
    goodcom   INT,
    brand     STRING,
    model     STRING,
    color     STRING,
    uptime    STRING,
    system    STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY ' '
    LINES TERMINATED BY '\n';

-- Cleaning step: copy only the rows whose system column is not the empty
-- string (rows with a NULL system are dropped by the comparison as well).
INSERT INTO TABLE iphone1
SELECT
    id,
    name,
    title,
    storename,
    storeid,
    link,
    price,
    keyword,
    comment,
    goodcom,
    brand,
    model,
    color,
    uptime,
    system
FROM iphone
WHERE system != "";

-- User info table. The separator used inside place does not match this
-- table's definition (no collection delimiter is declared); prefer userinfo1.
CREATE TABLE userinfo (
    id       STRING,
    name     STRING,
    place    ARRAY<STRING>,
    sex      STRING,
    birthday STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n';

-- User behavior table.
CREATE TABLE action (
    userid      STRING,
    commodityid STRING,
    behavior    STRING,
    month       STRING,
    day         STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n';

-- Behavior table with month and day joined into one time column.
CREATE TABLE action1 (
    userid      STRING,
    commodityid STRING,
    behavior    STRING,
    time        STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ' '
    LINES TERMINATED BY '\n';

-- Comment table.
CREATE TABLE comment (
    commodityid STRING,
    discuss     STRING,
    time        STRING,
    userid      STRING,
    name        STRING,
    rank        STRING,
    color       STRING,
    replynum    STRING,
    score       INT,
    source      STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ' '
    LINES TERMINATED BY '\n';

-- Fill action1, building the time column as "<month>-<day>".
INSERT INTO TABLE action1
SELECT
    userid,
    commodityid,
    behavior,
    CONCAT(month, "-", day)
FROM action;

-- User info combined with comment data.
CREATE TABLE com_user (
    commodityid STRING COMMENT 'user id',
    name        STRING,
    place       ARRAY<STRING>,
    sex         STRING,
    agetitle    INT,
    agerange    STRING,
    rank        STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ' '
    LINES TERMINATED BY '\n';

-- Produces rows in the column order of com_user. Age is 2019 minus the first
-- four characters of birthday; it is mapped to a bucket number and a label.
-- NOTE(review): this is a plain SELECT and nothing else in this script fills
-- com_user; if it is meant to populate the table, prefix it with
-- INSERT INTO TABLE com_user.
SELECT
    u.id,
    u.name,
    u.place,
    u.sex,
    CASE
        WHEN u.age < 18 THEN 1
        WHEN u.age < 24 THEN 2
        WHEN u.age < 29 THEN 3
        WHEN u.age < 34 THEN 4
        WHEN u.age < 39 THEN 5
        WHEN u.age < 49 THEN 6
        ELSE 7
    END AS agetitle,
    CASE
        WHEN u.age < 18 THEN "18岁以下"
        WHEN u.age < 24 THEN "24岁以下"
        WHEN u.age < 29 THEN "29岁以下"
        WHEN u.age < 34 THEN "34岁以下"
        WHEN u.age < 39 THEN "39岁以下"
        WHEN u.age < 49 THEN "49岁以下"
        ELSE "50岁以上"
    END AS agerange,
    comment.rank
FROM (
    -- Compute the age once instead of repeating the expression per branch.
    SELECT
        id,
        name,
        place,
        sex,
        2019 - CAST(SUBSTR(birthday, 1, 4) AS INT) AS age
    FROM userinfo
) u
INNER JOIN comment
    ON u.id = comment.userid;

-- Top 10 brands by number of action rows joined to cleaned listings.
-- NOTE(review): every behavior type is counted, not only purchases.
SELECT
    brand,
    COUNT(*) AS num
FROM action
INNER JOIN iphone1
    ON action.commodityid = iphone1.id
WHERE brand != ""
GROUP BY brand
ORDER BY num DESC
LIMIT 10;

-- Top-10 sales table.
CREATE TABLE saleTop10 (
    brand  STRING,
    scount INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ' '
    LINES TERMINATED BY '\n';

-- Top 20 Huawei product names by number of listing rows.
SELECT
    name,
    COUNT(*) AS num
FROM iphone
WHERE brand = '华为(HUAWEI)'
GROUP BY name
ORDER BY num DESC
LIMIT 20;

-- Huawei top-20 table.
CREATE TABLE HUAWETopSale20 (
    name   ARRAY<STRING>,
    mcount INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ' '
    COLLECTION ITEMS TERMINATED BY ' '
    LINES TERMINATED BY '\n';

-- Fill the Huawei top-20 table.
INSERT INTO TABLE HUAWETopSale20
SELECT
    name,
    COUNT(*) AS num
FROM iphone
WHERE brand = '华为(HUAWEI)'
GROUP BY name
ORDER BY num DESC
LIMIT 20;

-- Apple top-20 table.
CREATE TABLE AppleTopSale20 (
    name   ARRAY<STRING>,
    mcount INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ' '
    COLLECTION ITEMS TERMINATED BY ' '
    LINES TERMINATED BY '\n';

-- Fill the Apple top-20 table; the brand appears under two spellings.
INSERT INTO TABLE AppleTopSale20
SELECT
    name,
    COUNT(*) AS num
FROM iphone
WHERE brand IN ('Apple', '苹果')
GROUP BY name
ORDER BY num DESC
LIMIT 20;

-- Phone sales per age bucket.
CREATE TABLE agesale (
    agetitle INT,
    model    STRING,
    count    INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ' '
    LINES TERMINATED BY '\n';

-- Fill agesale. com_user.commodityid holds the user id, hence the join on
-- action.userid.
INSERT INTO TABLE agesale
SELECT
    com_user.agetitle,
    iphone1.model,
    COUNT(*)
FROM action
INNER JOIN iphone1
    ON action.commodityid = iphone1.id
INNER JOIN com_user
    ON action.userid = com_user.commodityid
GROUP BY com_user.agetitle, iphone1.model
ORDER BY com_user.agetitle DESC
LIMIT 50;

-- User info table that also declares a space as the collection delimiter
-- for place; use this one instead of userinfo.
CREATE TABLE userinfo1 (
    id       STRING,
    name     STRING,
    place    ARRAY<STRING>,
    sex      STRING,
    birthday STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY ' '
    LINES TERMINATED BY '\n';

-- Regional sales table 1.
CREATE TABLE placesale (
    addr  STRING,
    model STRING,
    count INT
);

-- Regional sales table 2.
CREATE TABLE placesale1 (
    addr  STRING,
    model STRING,
    count INT
);

-- ---------------------------------------------------------------------------
-- User tags
-- ---------------------------------------------------------------------------

-- User gender tag table.
CREATE TABLE profile_tag_user_gender (
    userid  STRING,
    tagid   STRING,
    tagname STRING,
    tagtype STRING
);

-- User age-range tag table. The last column is spelled tagetype; later
-- statements reference it by that name.
CREATE TABLE profile_tag_user_age_region (
    userid   STRING,
    tagid    STRING,
    tagname  STRING,
    tagetype STRING
);

-- Fill the age-range tags: tag id is "A111U00100" followed by the bucket
-- number.
INSERT INTO TABLE profile_tag_user_age_region
SELECT
    commodityid,
    CONCAT("A111U00100", CAST(agetitle AS STRING)),
    agerange,
    "用户年龄段"
FROM com_user;

-- User membership-level tag table.
CREATE TABLE profile_tag_user_grade (
    userid  STRING,
    tagid   STRING,
    tagname STRING,
    tagtype STRING
);

-- Fill the membership-level tags; ranks not listed get "A111U003_000".
INSERT INTO TABLE profile_tag_user_grade
SELECT
    commodityid,
    CASE rank
        WHEN "注册会员"       THEN "A111U003_001"
        WHEN "企业会员"       THEN "A111U003_002"
        WHEN "铜牌会员"       THEN "A111U003_003"
        WHEN "银牌会员"       THEN "A111U003_004"
        WHEN "金牌会员"       THEN "A111U003_005"
        WHEN "钻石会员"       THEN "A111U003_006"
        WHEN "PLUS会员[试用]" THEN "A111U003_007"
        WHEN "PLUS会员"       THEN "A111U003_008"
        ELSE "A111U003_000"
    END,
    rank,
    "用户等级"
FROM com_user;

-- User behavior tag table.
CREATE TABLE person_user_tag_action (
    userid      STRING,
    tagid       STRING,
    tagname     STRING,
    tagtype     STRING,
    actioncount INT
);

-- Fill the behavior tags: one row per (user, behavior) with its row count.
-- Behavior codes: "0" click, "1" add to cart, "2" purchase, "3" follow.
INSERT INTO TABLE person_user_tag_action
SELECT
    userid,
    CONCAT("B21U001_00", behavior),
    CASE behavior
        WHEN "0" THEN "点击"
        WHEN "1" THEN "添加购物车"
        WHEN "2" THEN "购买"
        WHEN "3" THEN "关注"
        ELSE ""
    END,
    "用户行为",
    COUNT(*) AS c
FROM action
GROUP BY userid, behavior;

-- ---------------------------------------------------------------------------
-- Behavior weights
-- Custom weights: purchase 5, add to cart 4, follow 3, click 2.
-- Custom cooling coefficient: 1.68 (this value does not appear in any
-- statement below).
-- ---------------------------------------------------------------------------

-- Weight table.
CREATE TABLE act_weight_detail (
    userid    STRING,
    tagid     STRING,
    tagname   STRING,
    cnt       INT,
    tagtypeid INT,
    actweight FLOAT
);

-- Preview of the elapsed-time term used by the weight insert:
-- (11 - month) * 30 + (12 - day).
SELECT
    month,
    day,
    (11 - CAST(month AS INT)) * 30,
    (12 - CAST(day AS INT)),
    (11 - CAST(month AS INT)) * 30 + (12 - CAST(day AS INT)) AS c
FROM action
ORDER BY c
LIMIT 20;

-- Preview of the TF terms used by the weight insert: per-(user, tag) total
-- and per-user total of actioncount.
SELECT
    userid,
    SUM(actioncount) OVER (PARTITION BY userid, tagid) AS tf1,
    SUM(actioncount) OVER (PARTITION BY userid)
FROM person_user_tag_action
ORDER BY userid
LIMIT 20;

-- Preview of the IDF term used by the weight insert: per-tag total of
-- actioncount divided by 477008.
-- NOTE(review): 477008 is presumably the total number of actions; confirm.
SELECT
    tagid,
    (SUM(actioncount) OVER (PARTITION BY tagid)) / 477008
FROM person_user_tag_action
LIMIT 20;

-- Fill the weight table.
-- weight = base / exp(elapsed_days) * tf * 477008 / tag_total [...] * count
-- NOTE(review): branches "2" and "3" carry an extra "/ 477008" that "0" and
-- "1" do not; kept exactly as written, confirm which form is intended.
-- NOTE(review): the join matches on userid only, so each tag row is paired
-- with every action row of that user and the window sums run over the joined
-- rows; confirm this is intended.
INSERT INTO TABLE act_weight_detail
SELECT
    j.userid,
    j.tagid,
    j.tagname,
    j.actioncount,
    CAST(j.behavior AS INT),
    CASE j.behavior
        WHEN "0" THEN 2 / EXP(j.elapsed_days) * j.tf * 477008 / j.tag_total * j.actioncount
        WHEN "1" THEN 4 / EXP(j.elapsed_days) * j.tf * 477008 / j.tag_total * j.actioncount
        WHEN "2" THEN 5 / EXP(j.elapsed_days) * j.tf * 477008 / j.tag_total / 477008 * j.actioncount
        WHEN "3" THEN 3 / EXP(j.elapsed_days) * j.tf * 477008 / j.tag_total / 477008 * j.actioncount
        ELSE 0
    END
FROM (
    SELECT
        action.userid                      AS userid,
        person_user_tag_action.tagid       AS tagid,
        person_user_tag_action.tagname     AS tagname,
        person_user_tag_action.actioncount AS actioncount,
        action.behavior                    AS behavior,
        (11 - CAST(action.month AS INT)) * 30
            + (12 - CAST(action.day AS INT)) AS elapsed_days,
        SUM(person_user_tag_action.actioncount)
            OVER (PARTITION BY person_user_tag_action.userid, person_user_tag_action.tagid)
          / SUM(person_user_tag_action.actioncount)
            OVER (PARTITION BY person_user_tag_action.userid) AS tf,
        SUM(person_user_tag_action.actioncount)
            OVER (PARTITION BY person_user_tag_action.tagid)  AS tag_total
    FROM person_user_tag_action
    INNER JOIN action
        ON person_user_tag_action.userid = action.userid
) j;

-- ---------------------------------------------------------------------------
-- Wide profile table
-- ---------------------------------------------------------------------------

-- One row per user and behavior weight, carrying gender, age-range, grade
-- and behavior tags side by side.
CREATE TABLE profile_user_tb (
    userid       STRING,
    tagid1       STRING,
    tagname1     STRING,
    tagtype1     STRING,
    tagid2       STRING,
    tagname2     STRING,
    tagtype2     STRING,
    tagid3       STRING,
    tagname3     STRING,
    tagtype3     STRING,
    tagid4       STRING,
    tagname4     STRING,
    actioncount  INT,
    actionweight FLOAT,
    tagtype4     STRING
);

-- Fill the wide table by joining the four tag sources on userid.
INSERT INTO TABLE profile_user_tb
SELECT
    g.userid,
    g.tagid,
    g.tagname,
    g.tagtype,
    a.tagid,
    a.tagname,
    a.tagetype,
    r.tagid,
    r.tagname,
    r.tagtype,
    w.tagid,
    w.tagname,
    w.cnt,
    w.actweight,
    "用户行为"
FROM profile_tag_user_gender g
INNER JOIN profile_tag_user_age_region a
    ON g.userid = a.userid
INNER JOIN profile_tag_user_grade r
    ON g.userid = r.userid
INNER JOIN act_weight_detail w
    ON g.userid = w.userid;

-- ---------------------------------------------------------------------------
-- The two snippets below are Python, run in PyCharm rather than in Hive.
-- They are kept as comments so this file stays a valid HiveQL script.
-- NOTE(review): the " " separators may originally have been "\t" / "\n".
--
-- Extract the comment text (second field of every line):
--
--   with open("../comment1.txt", encoding="utf-8") as src, \
--           open("../common", "r+", encoding="utf-8") as dst:
--       for line in src:
--           dst.write(line.split(" ")[1] + " ")
--
-- Count the characters of the extracted comments (reported output: 12421790):
--
--   content = ""
--   try:
--       with open("../common", encoding="utf-8") as f:
--           for line in f:
--               content += line.strip()
--       print("字数:", len(content))
--   except ValueError:
--       print("AAAAAA")
-- ---------------------------------------------------------------------------

-- Word-segmentation count table.
CREATE TABLE comment_word_count_tb (
    word  STRING,
    count INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ' '
    LINES TERMINATED BY '\n';

-- Load the word-count file from the local file system.
LOAD DATA LOCAL INPATH "/home/hadoop/file/wordcutsum"
INTO TABLE comment_word_count_tb;

-- Sales by date: sales count for a given day.
CREATE TABLE date_range_sail_count (
    sailcount INT,
    datarange STRING
);

-- Sales by price range.
CREATE TABLE price_range_sail_count (
    pricerange STRING,
    sailcount  INT
);

-- Intermediate table holding one price-range label per cleaned listing.
CREATE TABLE mid1 (
    pricerange STRING
);

-- Fill the intermediate table. Branches are tested in order, so each label
-- covers the prices above the previous bound. The ELSE branch is reached
-- only when price is NULL; it yields the string "0" so that every branch
-- has the same type as the target column.
INSERT INTO TABLE mid1
SELECT
    CASE
        WHEN price <= 1000  THEN "1000元以下"
        WHEN price <= 2000  THEN "1000-2000元"
        WHEN price <= 3000  THEN "2000-3000元"
        WHEN price <= 5000  THEN "3000-5000元"
        WHEN price <= 10000 THEN "5000-10000元"
        WHEN price > 10000  THEN "10000以上"
        ELSE "0"
    END
FROM iphone1;

-- Fill the price-range sales table from the intermediate labels.
INSERT INTO TABLE price_range_sail_count
SELECT
    pricerange,
    COUNT(*)
FROM mid1
GROUP BY pricerange;