movies表 电影id,电影名称,电影类型 ratings表 用户id,电影id,评分,评论时间(毫秒值) users表 用户id,性别,年龄段,工作类型,电话 age表 年龄段,年龄范围 job表 工作类型,工作岗位名称 1.清洗数据 mr 2.hive计算 表结构准备 外部表 create external table age( aid string, agescore string ) row format delimited fields terminated by ' ' lines terminated by ' ' stored as textfile location '/project/age'; create external table job( jid string, jobname string ) row format delimited fields terminated by ' ' lines terminated by ' ' stored as textfile location '/project/job'; create external table movies( mid string, mname string, type string ) row format delimited fields terminated by ',' lines terminated by ' ' stored as textfile location '/project/movies'; create external table ratings( uid string, mid string, score int, mytime string ) row format delimited fields terminated by ',' lines terminated by ' ' stored as textfile location '/project/ratings'; create external table users( uid string, sex string, aid string, jid string, tel string ) row format delimited fields terminated by ',' lines terminated by ' ' stored as textfile location '/project/users'; 要求: 1.算出所有电影的平均评分。 每个电影的平均分 select m.mname,round(t.avg,2) from movies m, (select avg(score) avg,mid from ratings group by mid)t where m.mid = t.mid 中间过程 含有有效 和 脏 score is null create table result1( mid string, mname string, score double ) row format delimited fields terminated by ' ' lines terminated by ' ' stored as textfile; //3883 3882 score 4000 null insert into result1 select mov.mid,mov.mname,t1.score from movies mov left join( select mid,round(avg(score),1) score from ratings group by mid) t1 on mov.mid=t1.mid; //hive和mysql 不同 update 并且 最不值钱的就是 表 create table result2( mid string, mname string, score double ) row format delimited fields terminated by ' ' lines terminated by ' ' stored as textfile; insert into table result2 select mid,mname,score from result1 where score is not null; insert into table result2 select mid,mname,0 from result10 where score is null; 2.算出所有用户对那些类型的电影最感兴趣,对每个用户推送该类型前三名评分最高的电影。 每个用户最喜欢哪个类型的电影 row_number() ratings表中 一个电影出现几次? 一个电影有几个类型? 类型在movies 评分或者观看次数ratings movie join ratings select t5.uid,t5.lx,m.mname from (select t3.uid,t3.lx,t4.mid from (select t2.uid,t2.lx from (select row_number() over (partition by t1.uid order by t1.cnt desc) rn,t1.uid,t1.lx from (select count(*) cnt,r.uid,t.lx from (select mid,mname,lx from movies lateral view explode(split(type,"\|")) types as lx)t ,ratings r where r.mid=t.mid group by r.uid,t.lx)t1)t2 where t2.rn=1)t3 , (select t2.lx,t2.mid from (select row_number() over (partition by t1.lx order by t1.cnt desc) rn,t1.lx,t1.mid from (select count(*) cnt,t.lx,r.mid from (select mid,mname,lx from movies lateral view explode(split(type,"\|")) types as lx)t , ratings r where r.mid = t.mid group by t.lx,r.mid)t1)t2 where t2.rn<4)t4 where t3.lx =t4.lx)t5 ,movies m where t5.mid=m.mid 每个类型最受欢迎的前三个电影 类型== 类型 select t3.uid,t3.lx,t4.mid from (select t2.lx,t2.mid,t2.cnt from (select row_number() over (partition by t1.lx order by t1.cnt desc)rn,t1.lx,t1.mid,t1.cnt from (select count(*) cnt,t.lx,t.mid from ratings r, (select mid,mname,lx from movies lateral view explode(split(type,"\|")) types as lx) t where r.mid = t.mid group by t.lx,t.mid)t1)t2 where t2.rn<4)t4 join (select t2.lx,t2.uid,t2.cnt from (select row_number() over (partition by t1.uid order by t1.cnt desc)rn,t1.lx,t1.uid,t1.cnt from (select count(*) cnt,t.lx,r.uid from ratings r, (select mid,mname,lx from movies lateral view explode(split(type,"\|")) types as lx) t where r.mid = t.mid group by t.lx,r.uid)t1)t2 where t2.rn=1)t3 where t3.lx = t4.lx //hive 最不值钱的是表 一半 人 和 类型 create table result3( uid string, type string ) row format delimited fields terminated by ' ' lines terminated by ' ' stored as textfile; insert into result3 select users.uid,t4.type from( select t3.uid uid,t3.type type from( select t2.uid uid,t2.type type,t2.num num,row_number() over(partition by t2.uid order by num desc) rk from( select t1.uid uid,t1.type type,count(*) num from( select rat.uid uid,mov.type type from ratings rat,movies mov where rat.mid=mov.mid) t1 group by t1.uid,t1.type) t2) t3 where t3.rk=1) t4,users where t4.uid=users.uid; create table result4( type string, mname string ) row format delimited fields terminated by ' ' lines terminated by ' ' stored as textfile; insert into result4 select t2.type,t2.mname from( select t1.type type,t1.mname mname,row_number() over(partition by t1.type order by t1.score desc) rk from( select mov.type type,mov.mname mname,avg(rat.score) score from ratings rat,movies mov where rat.mid=mov.mid group by mov.type,mov.mname) t1 ) t2 where t2.rk<4; select res3.uid,res4.mname from result3 res3,result4 res4 where res3.type=res4.type; 3.每个年龄层次,前3种最受欢迎的电影类型。 select t3.agescore,t3.lx,t3.cnt from (select row_number() over (partition by t2.agescore order by t2.cnt desc) rn,t2.agescore,t2.lx,t2.cnt from (select t.lx,r.agescore,count(*) cnt from (select mid,mname,lx from movies lateral view explode(split(type,"\|")) types as lx) t ,ratings r where t.mid = r.mid)t1,users u,age a where u.uid=t1.uid and u.aid=a.aid group by a.agescore,t1.lx)t2)t3 where t3.rn<=3 movies ratings users age ratings users movie uid mid aid,type 基础 方式 两两关联 没有优化 job数量是最大的 select age.agescore,t4.type from( select t3.aid aid,t3.type type from( select t2.aid aid,t2.type type,t2.num num,row_number() over(partition by t2.aid order by num desc) rk from( select t1.aid aid,t1.type type,count(*) num from( select users.aid aid,mov.type type from ratings rat,users,movies mov where rat.uid=users.uid and rat.mid=mov.mid) t1 group by t1.aid,t1.type) t2 ) t3 where t3.rk<4) t4,age where t4.aid=age.aid; 最终目标 1sql 做所有 sql编程 sql语句由复杂的sql嵌套组成 能在一次sql当中 尽量的做做些事情 hive-》job 分布式-》提交 计算资源(数据在哪 需要哪些资源)->run create table result5( agescore string, type string ) row format delimited fields terminated by ' ' lines terminated by ' ' stored as textfile; insert into result5 select age.agescore,t2.type from( select t1.aid aid,t1.type type,t1.num num,row_number() over(partition by t1.aid order by t1.num desc) rk from( select users.aid aid,mov.type type,count(*) num from ratings rat,users,movies mov where rat.uid=users.uid and rat.mid=mov.mid group by users.aid,mov.type) t1 ) t2,age where t2.rk<4 and t2.aid=age.aid; 4.算出所有电影都有那些工作岗位的人评论过,相对应的人数是多少? select count(*),j.jobname,r.mid from ratings r,users u,job j where r.uid = u.uid and j.jid=u.jid group by j.jobname,r.mid limit 10; rat->user->job select t1.mname,t1.jobname,count(*) num from( select job.jobname jobname,mov.mname from ratings rat,users,job,movies mov where rat.uid=users.uid and users.jid=job.jid and rat.mid=mov.mid) t1 group by t1.jobname,t1.mname order by t1.mname,num desc; from where group by having select order by limit 5.根据电影上映时间,得出当年最受欢迎和最不受欢迎的 10个名单 select t5.year,m1.mname,t5.cnt1,m2.mname,t5.cnt2,t5.rn from (select t3.year,t3.mid mid1,t3.cnt cnt1,t4.mid mid2,t4.cnt cnt2,t3.rn from (select t2.year,t2.mid,t2.cnt,t2.rn from (select row_number() over (partition by t1.year order by t1.cnt desc) rn, t1.year,t1.mid,t1.cnt from (select getDate(m.mname) year,r.mid,count(*) cnt from movies m,ratings r where m.mid = r.mid group by getDate(m.mname),r.mid)t1)t2 where t2.rn<11)t3 , (select t2.year,t2.mid,t2.cnt,t2.rn from (select row_number() over (partition by t1.year order by t1.cnt )rn, t1.year,t1.mid,t1.cnt from (select getDate(m.mname) year,r.mid,count(*) cnt from movies m,ratings r where m.mid = r.mid group by getDate(m.mname),r.mid)t1)t2 where t2.rn<11)t4 where t4.year=t3.year and t3.rn=t4.rn) t5,movies m1,movies m2 where m1.mid = t5.mid1 and t5.mid2=m2.mid 1.hive udf add jar /home/data/myTime.jar; create temporary function myTime as 'com.beiwang.project.MyTime'; 2.每年最受欢迎和最不受欢迎 top10 最受欢迎top10 create table result6( year string, mname string, rk string, flag string ) row format delimited fields terminated by ' ' lines terminated by ' ' stored as textfile; insert into result6 select t3.year,t3.mname,t3.rk,'good' from( select t2.year year,t2.mname mname,row_number() over(partition by year order by num desc) rk from( select t1.year year,t1.mname mname,count(*) num from( select myTime(mov.mname) year,mov.mname mname from ratings rat,movies mov where rat.mid=mov.mid) t1 group by t1.year,t1.mname ) t2 ) t3 where t3.rk<=10; 最不受欢迎top10 insert into result6 select t3.year,t3.mname,t3.rk,'bad' from( select t2.year year,t2.mname mname,row_number() over(partition by year order by num) rk from( select t1.year year,t1.mname mname,count(*) num from( select myTime(mov.mname) year,mov.mname mname from ratings rat,movies mov where rat.mid=mov.mid) t1 group by t1.year,t1.mname ) t2 ) t3 where t3.rk<=10; sqoop hive --》 mysql 6.得出每个岗位 最喜欢的电影类型 前3名 7.每年 每个类型 最受欢迎的电影 前3名 ( group by year type ) where rk<3
age.txt
1 "Under 18" 18 "18-24" 25 "25-34" 35 "35-44" 45 "45-49" 50 "50-55" 56 "56+"
job.txt
0 "other" or not specified 1 "academic/educator" 2 "artist" 3 "clerical/admin" 4 "college/grad student" 5 "customer service" 6 "doctor/health care" 7 "executive/managerial" 8 "farmer" 9 "homemaker" 10 "K-12 student" 11 "lawyer" 12 "programmer" 13 "retired" 14 "sales/marketing" 15 "scientist" 16 "self-employed" 17 "technician/engineer" 18 "tradesman/craftsman" 19 "unemployed" 20 "writer"