zoukankan      html  css  js  c++  java
  • 电影表

    movies表
    电影id,电影名称,电影类型
    
    ratings表
    用户id,电影id,评分,评论时间(毫秒值)
    
    users表
    用户id,性别,年龄段,工作类型,电话
    
    age表
    年龄段,年龄范围
    
    job表
    工作类型,工作岗位名称
    
    
    1.清洗数据 mr
    
    2.hive计算
    表结构准备 外部表
    
    -- Age-bracket lookup table, external, backed by the files under /project/age.
    --   aid      : age-bracket code
    --   agescore : age range label
    -- Stored as tab-delimited text, one record per line.
    -- The delimiters are written as escape sequences because the raw tab and the
    -- quote pair split over two lines in the earlier version were not valid literals.
    create external table age(
        aid string,
        agescore string
    )
    row format delimited
        fields terminated by '\t'
        lines terminated by '\n'
    stored as textfile
    location '/project/age';
    
    -- Occupation lookup table, external, backed by the files under /project/job.
    --   jid     : occupation code
    --   jobname : occupation name
    -- Stored as tab-delimited text, one record per line.
    -- Delimiters are spelled as escape sequences instead of raw control characters.
    create external table job(
        jid string,
        jobname string
    )
    row format delimited
        fields terminated by '\t'
        lines terminated by '\n'
    stored as textfile
    location '/project/job';
    
    -- Movie catalogue, external, backed by the files under /project/movies.
    --   mid   : movie id
    --   mname : movie title
    --   type  : movie genre field, split on the pipe character by other queries in this file
    -- Stored as comma-delimited text, one record per line.
    -- The line terminator is written as an escape sequence, since the quote pair
    -- split over two lines in the earlier version was not a valid terminator.
    create external table movies(
        mid string,
        mname string,
        type string
    )
    row format delimited
        fields terminated by ','
        lines terminated by '\n'
    stored as textfile
    location '/project/movies';
    
    -- Rating events, external, backed by the files under /project/ratings.
    --   uid    : user id
    --   mid    : movie id
    --   score  : rating given by the user
    --   mytime : time of the rating, in milliseconds
    -- Stored as comma-delimited text, one record per line.
    -- The line terminator is written as an escape sequence.
    create external table ratings(
        uid string,
        mid string,
        score int,
        mytime string
    )
    row format delimited
        fields terminated by ','
        lines terminated by '\n'
    stored as textfile
    location '/project/ratings';
    
    
    -- User profiles, external, backed by the files under /project/users.
    --   uid : user id
    --   sex : gender
    --   aid : age-bracket code, joined to age.aid by other queries in this file
    --   jid : occupation code, joined to job.jid by other queries in this file
    --   tel : phone
    -- Stored as comma-delimited text, one record per line.
    -- The line terminator is written as an escape sequence.
    create external table users(
        uid string,
        sex string,
        aid string,
        jid string,
        tel string
    )
    row format delimited
        fields terminated by ','
        lines terminated by '\n'
    stored as textfile
    location '/project/users';
    
    
    要求:
    1.算出所有电影的平均评分。
    每个电影的平均分
    -- Average rating of every rated movie, rounded to two decimals.
    -- Movies without any rating do not appear because the join is an inner join.
    -- The statement is now terminated, and the derived column is no longer
    -- named after the avg aggregate it is computed with.
    select
        m.mname,
        round(s.avg_score, 2) as avg_score
    from movies m
    join (
        select
            mid,
            avg(score) as avg_score
        from ratings
        group by mid
    ) s on m.mid = s.mid;
    
    
    
    
    中间过程:result1 同时含有有效数据和脏数据(score is null 的行)
    -- Intermediate table: one row per movie with its average score.
    --   mid   : movie id
    --   mname : movie title
    --   score : average score, NULL when the movie has no ratings
    -- Stored as tab-delimited text, one record per line.
    -- Delimiters are spelled as escape sequences instead of raw control characters.
    create table result1(
        mid string,
        mname string,
        score double
    )
    row format delimited
        fields terminated by '\t'
        lines terminated by '\n'
    stored as textfile;
    
    -- Fill result1 with every movie and its average score rounded to one decimal.
    -- The left join keeps movies that have no ratings, so their score is NULL.
    -- NOTE(review): the earlier annotation listed the figures 3883, 3882 (score) and
    -- 4000 (null) - presumably row counts seen during a run, TODO confirm.
    -- That annotation used a double-slash prefix, which Hive does not treat as a comment.
    insert into table result1
    select
        mov.mid,
        mov.mname,
        avg_by_movie.score
    from movies mov
    left join (
        select
            mid,
            round(avg(score), 1) as score
        from ratings
        group by mid
    ) avg_by_movie on mov.mid = avg_by_movie.mid;
    
    -- Cleaned copy of result1 in which a missing score is replaced by 0.
    -- Hive differs from MySQL when it comes to UPDATE, and tables are the cheapest
    -- resource here, so the cleaned rows are written to a second table rather than
    -- patched in place.
    --   mid   : movie id
    --   mname : movie title
    --   score : average score
    -- Stored as tab-delimited text, one record per line.
    -- The earlier double-slash note is now a real comment and the delimiters are escape sequences.
    create table result2(
        mid string,
        mname string,
        score double
    )
    row format delimited
        fields terminated by '\t'
        lines terminated by '\n'
    stored as textfile;
    
    -- Copy the rows of result1 that already carry a score into result2 unchanged.
    insert into table result2
    select
        r1.mid,
        r1.mname,
        r1.score
    from result1 r1
    where r1.score is not null;
    
    -- Copy the rows of result1 that have no score into result2 with a score of 0.
    -- The source table is result1. The earlier text read result10, a table that is
    -- never created in this file.
    insert into table result2
    select
        mid,
        mname,
        0
    from result1
    where score is null;
    
    
    
    2.算出所有用户对哪些类型的电影最感兴趣,对每个用户推送该类型前三名评分最高的电影。
    每个用户最喜欢哪个类型的电影
    row_number()
    ratings表中 一个电影出现几次?
    一个电影有几个类型?
    类型在movies   评分或者观看次数ratings
    
    movie join ratings
    
    -- For every user, list the titles of the three most-rated movies of the genre
    -- that the user has rated most often.
    -- Output columns: uid, lx (genre), mname (movie title).
    -- The genre field is split with a double-backslash pipe pattern: split takes a
    -- regular expression, and a single backslash is consumed by the string literal,
    -- which leaves a bare alternation instead of a literal pipe.
    -- Popularity is measured by the number of ratings, not by the average score.
    select
        rec.uid,
        rec.lx,
        m.mname
    from (
        select
            fav.uid,
            fav.lx,
            top3.mid
        from (
            -- favourite genre of each user: the genre with the most ratings by that user
            select
                ug.uid,
                ug.lx
            from (
                select
                    row_number() over (partition by c.uid order by c.cnt desc) as rn,
                    c.uid,
                    c.lx
                from (
                    select
                        count(*) as cnt,
                        r.uid,
                        g.lx
                    from (
                        select mid, mname, lx
                        from movies
                        lateral view explode(split(type, '\\|')) types as lx
                    ) g
                    join ratings r on r.mid = g.mid
                    group by r.uid, g.lx
                ) c
            ) ug
            where ug.rn = 1
        ) fav
        join (
            -- three most-rated movies of each genre
            select
                gm.lx,
                gm.mid
            from (
                select
                    row_number() over (partition by c.lx order by c.cnt desc) as rn,
                    c.lx,
                    c.mid
                from (
                    select
                        count(*) as cnt,
                        g.lx,
                        r.mid
                    from (
                        select mid, mname, lx
                        from movies
                        lateral view explode(split(type, '\\|')) types as lx
                    ) g
                    join ratings r on r.mid = g.mid
                    group by g.lx, r.mid
                ) c
            ) gm
            where gm.rn < 4
        ) top3 on fav.lx = top3.lx
    ) rec
    join movies m on rec.mid = m.mid;
    
    
    每个类型最受欢迎的前三个电影
    
    类型== 类型
    
    
    
    
    
    
    
    
    
    
    -- Variant of the recommendation query that returns movie ids instead of titles.
    -- Output columns: uid, lx (favourite genre of the user), mid (one of the three
    -- most-rated movies of that genre).
    -- The genre field is split with a double-backslash pipe pattern so that the
    -- regular expression matches a literal pipe. The statement is now terminated.
    select
        fav.uid,
        fav.lx,
        top3.mid
    from (
        -- three most-rated movies of each genre
        select
            gm.lx,
            gm.mid,
            gm.cnt
        from (
            select
                row_number() over (partition by c.lx order by c.cnt desc) as rn,
                c.lx,
                c.mid,
                c.cnt
            from (
                select
                    count(*) as cnt,
                    g.lx,
                    g.mid
                from ratings r
                join (
                    select mid, mname, lx
                    from movies
                    lateral view explode(split(type, '\\|')) types as lx
                ) g on r.mid = g.mid
                group by g.lx, g.mid
            ) c
        ) gm
        where gm.rn < 4
    ) top3
    join (
        -- favourite genre of each user: the genre with the most ratings by that user
        select
            ug.lx,
            ug.uid,
            ug.cnt
        from (
            select
                row_number() over (partition by c.uid order by c.cnt desc) as rn,
                c.lx,
                c.uid,
                c.cnt
            from (
                select
                    count(*) as cnt,
                    g.lx,
                    r.uid
                from ratings r
                join (
                    select mid, mname, lx
                    from movies
                    lateral view explode(split(type, '\\|')) types as lx
                ) g on r.mid = g.mid
                group by g.lx, r.uid
            ) c
        ) ug
        where ug.rn = 1
    ) fav on fav.lx = top3.lx;
    
    -- First half of the recommendation result: each user with a favourite type.
    -- Tables are the cheapest resource in Hive, so the work is split over two
    -- intermediate tables, and this one holds the user and type half.
    --   uid  : user id
    --   type : the movie type the user rated most often
    -- Stored as tab-delimited text, one record per line.
    -- The earlier double-slash note is now a real comment and the delimiters are escape sequences.
    create table result3(
        uid string,
        type string
    )
    row format delimited
        fields terminated by '\t'
        lines terminated by '\n'
    stored as textfile;
    
    -- Store, for every user present in users, the type that user rated most often.
    -- The raw movies.type value is used as is, it is not split into single genres.
    insert into table result3
    select
        users.uid,
        fav.type
    from (
        select
            ranked.uid as uid,
            ranked.type as type
        from (
            select
                cnt.uid as uid,
                cnt.type as type,
                cnt.num as num,
                row_number() over (partition by cnt.uid order by cnt.num desc) as rk
            from (
                select
                    pair.uid as uid,
                    pair.type as type,
                    count(*) as num
                from (
                    select
                        rat.uid as uid,
                        mov.type as type
                    from ratings rat
                    join movies mov on rat.mid = mov.mid
                ) pair
                group by pair.uid, pair.type
            ) cnt
        ) ranked
        where ranked.rk = 1
    ) fav
    join users on fav.uid = users.uid;
    
    -- Second half of the recommendation result: the best movies of each type.
    --   type  : movie type
    --   mname : movie title
    -- Stored as tab-delimited text, one record per line.
    -- Delimiters are spelled as escape sequences instead of raw control characters.
    create table result4(
        type string,
        mname string
    )
    row format delimited
        fields terminated by '\t'
        lines terminated by '\n'
    stored as textfile;
    
    -- Store the three movies with the highest average score for every type.
    -- The raw movies.type value is used as is, it is not split into single genres.
    insert into table result4
    select
        best.type,
        best.mname
    from (
        select
            scored.type as type,
            scored.mname as mname,
            row_number() over (partition by scored.type order by scored.score desc) as rk
        from (
            select
                mov.type as type,
                mov.mname as mname,
                avg(rat.score) as score
            from ratings rat
            join movies mov on rat.mid = mov.mid
            group by mov.type, mov.mname
        ) scored
    ) best
    where best.rk < 4;
    
    -- Pair every user with the top movies of that user's favourite type.
    select
        fav.uid,
        best.mname
    from result3 fav
    join result4 best on fav.type = best.type;
    
    3.每个年龄层次,前3种最受欢迎的电影类型。
    -- Three most popular genres for every age bracket, by number of ratings.
    -- Output columns: agescore (age range label), lx (genre), cnt (rating count).
    -- The earlier text had unbalanced parentheses and no select list for the
    -- grouping level. It also read agescore from ratings, which has no such column,
    -- and the innermost level did not expose the uid needed to reach users.
    -- The genre field is split with a double-backslash pipe pattern so that the
    -- regular expression matches a literal pipe.
    select
        t3.agescore,
        t3.lx,
        t3.cnt
    from (
        select
            row_number() over (partition by t2.agescore order by t2.cnt desc) as rn,
            t2.agescore,
            t2.lx,
            t2.cnt
        from (
            -- rating count per age bracket and genre
            select
                a.agescore,
                t1.lx,
                count(*) as cnt
            from (
                -- one row per rating and genre of the rated movie
                select
                    r.uid,
                    t.lx
                from (
                    select mid, mname, lx
                    from movies
                    lateral view explode(split(type, '\\|')) types as lx
                ) t
                join ratings r on t.mid = r.mid
            ) t1
            join users u on u.uid = t1.uid
            join age a on u.aid = a.aid
            group by a.agescore, t1.lx
        ) t2
    ) t3
    where t3.rn <= 3;
    
    
    
    
    
    
    movies ratings users age
    
    
    ratings users movie
    uid mid    aid,type
    
    基础 方式 两两关联 没有优化 job数量是最大的
    -- Basic version: three most-rated movie types per age bracket, built level by level.
    -- The raw movies.type value is used as is, it is not split into single genres.
    select
        age.agescore,
        top3.type
    from (
        select
            ranked.aid as aid,
            ranked.type as type
        from (
            select
                cnt.aid as aid,
                cnt.type as type,
                cnt.num as num,
                row_number() over (partition by cnt.aid order by cnt.num desc) as rk
            from (
                select
                    pair.aid as aid,
                    pair.type as type,
                    count(*) as num
                from (
                    select
                        users.aid as aid,
                        mov.type as type
                    from ratings rat
                    join users on rat.uid = users.uid
                    join movies mov on rat.mid = mov.mid
                ) pair
                group by pair.aid, pair.type
            ) cnt
        ) ranked
        where ranked.rk < 4
    ) top3
    join age on top3.aid = age.aid;
    
    最终目标 1sql 做所有
    sql编程:sql语句由复杂的sql嵌套组成,尽量在一次sql当中多做些事情。hive -> job(分布式)-> 提交 -> 申请计算资源(数据在哪、需要哪些资源)-> run
    
    -- Result table: the most popular movie types of each age bracket.
    --   agescore : age range label
    --   type     : movie type
    -- Stored as tab-delimited text, one record per line.
    -- Delimiters are spelled as escape sequences instead of raw control characters.
    create table result5(
        agescore string,
        type string
    )
    row format delimited
        fields terminated by '\t'
        lines terminated by '\n'
    stored as textfile;
    
    -- Store the three most-rated movie types of every age bracket.
    -- Counting and ranking are done with fewer nesting levels than the basic version.
    -- The raw movies.type value is used as is, it is not split into single genres.
    insert into table result5
    select
        age.agescore,
        ranked.type
    from (
        select
            cnt.aid as aid,
            cnt.type as type,
            cnt.num as num,
            row_number() over (partition by cnt.aid order by cnt.num desc) as rk
        from (
            select
                users.aid as aid,
                mov.type as type,
                count(*) as num
            from ratings rat
            join users on rat.uid = users.uid
            join movies mov on rat.mid = mov.mid
            group by users.aid, mov.type
        ) cnt
    ) ranked
    join age on ranked.aid = age.aid
    where ranked.rk < 4;
    
    
    
    4.算出所有电影都有哪些工作岗位的人评论过,相对应的人数是多少?
    
    -- Number of ratings per occupation and movie id, limited to 10 rows as a quick sample.
    select
        count(*),
        j.jobname,
        r.mid
    from ratings r
    join users u on r.uid = u.uid
    join job j on j.jid = u.jid
    group by j.jobname, r.mid
    limit 10;
    
    
    
    
    
    
    
    
    
    
    
    rat->user->job
    
    -- Number of ratings per movie title and occupation.
    -- Rows are ordered by title, then by descending count within each title.
    select
        rated.mname,
        rated.jobname,
        count(*) as num
    from (
        select
            job.jobname as jobname,
            mov.mname
        from ratings rat
        join users on rat.uid = users.uid
        join job on users.jid = job.jid
        join movies mov on rat.mid = mov.mid
    ) rated
    group by rated.jobname, rated.mname
    order by rated.mname, num desc;
    
    from
    where
    group by
    having
    select
    order by
    limit
    
    5.根据电影上映时间,得出当年最受欢迎和最不受欢迎的 10个名单
    -- For every release year, pair the n-th most-rated movie with the n-th
    -- least-rated movie, for n from 1 to 10.
    -- Output columns: year, title and rating count of the popular movie, title and
    -- rating count of the unpopular movie, rank n.
    -- The year is derived from the movie title by the getDate function.
    -- NOTE(review): getDate is not registered anywhere in the visible part of this
    -- file, only myTime is - confirm which function is meant before running.
    -- The statement is now terminated.
    select
        pair.year,
        m1.mname,
        pair.cnt1,
        m2.mname,
        pair.cnt2,
        pair.rn
    from (
        select
            hot.year,
            hot.mid as mid1,
            hot.cnt as cnt1,
            cold.mid as mid2,
            cold.cnt as cnt2,
            hot.rn
        from (
            -- ten most-rated movies of each year
            select
                t2.year,
                t2.mid,
                t2.cnt,
                t2.rn
            from (
                select
                    row_number() over (partition by t1.year order by t1.cnt desc) as rn,
                    t1.year,
                    t1.mid,
                    t1.cnt
                from (
                    select
                        getDate(m.mname) as year,
                        r.mid,
                        count(*) as cnt
                    from movies m
                    join ratings r on m.mid = r.mid
                    group by getDate(m.mname), r.mid
                ) t1
            ) t2
            where t2.rn < 11
        ) hot
        join (
            -- ten least-rated movies of each year
            select
                t2.year,
                t2.mid,
                t2.cnt,
                t2.rn
            from (
                select
                    row_number() over (partition by t1.year order by t1.cnt) as rn,
                    t1.year,
                    t1.mid,
                    t1.cnt
                from (
                    select
                        getDate(m.mname) as year,
                        r.mid,
                        count(*) as cnt
                    from movies m
                    join ratings r on m.mid = r.mid
                    group by getDate(m.mname), r.mid
                ) t1
            ) t2
            where t2.rn < 11
        ) cold on cold.year = hot.year and hot.rn = cold.rn
    ) pair
    join movies m1 on m1.mid = pair.mid1
    join movies m2 on pair.mid2 = m2.mid;
    
    
    
    
    1.hive udf
    
    -- Add the UDF jar to the session, then expose the class com.beiwang.project.MyTime
    -- as the temporary function myTime.
    -- The inserts into result6 below call myTime on the movie title to obtain the year.
    add jar /home/data/myTime.jar;
    create temporary function myTime as 'com.beiwang.project.MyTime';
    
    
    2.每年最受欢迎和最不受欢迎 top10
    最受欢迎top10
    -- Result table: yearly ranking of the most and least rated movies.
    --   year  : year returned by the myTime function for the movie title
    --   mname : movie title
    --   rk    : rank of the movie within its year
    --   flag  : good for the most-rated list, bad for the least-rated list
    -- Stored as tab-delimited text, one record per line.
    -- Delimiters are spelled as escape sequences instead of raw control characters.
    create table result6(
        year string,
        mname string,
        rk string,
        flag string
    )
    row format delimited
        fields terminated by '\t'
        lines terminated by '\n'
    stored as textfile;
    
    -- Store the ten most-rated movies of every year, flagged as good.
    -- The year comes from the myTime function applied to the movie title.
    insert into table result6
    select
        ranked.year,
        ranked.mname,
        ranked.rk,
        'good'
    from (
        select
            cnt.year as year,
            cnt.mname as mname,
            row_number() over (partition by cnt.year order by cnt.num desc) as rk
        from (
            select
                rated.year as year,
                rated.mname as mname,
                count(*) as num
            from (
                select
                    myTime(mov.mname) as year,
                    mov.mname as mname
                from ratings rat
                join movies mov on rat.mid = mov.mid
            ) rated
            group by rated.year, rated.mname
        ) cnt
    ) ranked
    where ranked.rk <= 10;
    
    最不受欢迎top10
    
    -- Store the ten least-rated movies of every year, flagged as bad.
    -- Same shape as the good list, but the rank is assigned by ascending rating count.
    insert into table result6
    select
        ranked.year,
        ranked.mname,
        ranked.rk,
        'bad'
    from (
        select
            cnt.year as year,
            cnt.mname as mname,
            row_number() over (partition by cnt.year order by cnt.num) as rk
        from (
            select
                rated.year as year,
                rated.mname as mname,
                count(*) as num
            from (
                select
                    myTime(mov.mname) as year,
                    mov.mname as mname
                from ratings rat
                join movies mov on rat.mid = mov.mid
            ) rated
            group by rated.year, rated.mname
        ) cnt
    ) ranked
    where ranked.rk <= 10;
    
    sqoop hive --》 mysql
    
    
    6.得出每个岗位 最喜欢的电影类型 前3名
    
    
    7.每年 每个类型 最受欢迎的电影 前3名
    
    (
    group by year type
    )
    where rk<=3

    age.txt

    1    "Under 18"
    18    "18-24"
    25    "25-34"
    35    "35-44"
    45    "45-49"
    50    "50-55"
    56    "56+"

    job.txt

    0    "other" or not specified
    1    "academic/educator"
    2    "artist"
    3    "clerical/admin"
    4    "college/grad student"
    5    "customer service"
    6    "doctor/health care"
    7    "executive/managerial"
    8    "farmer"
    9    "homemaker"
    10    "K-12 student"
    11    "lawyer"
    12    "programmer"
    13    "retired"
    14    "sales/marketing"
    15    "scientist"
    16    "self-employed"
    17    "technician/engineer"
    18    "tradesman/craftsman"
    19    "unemployed"
    20    "writer"
  • 相关阅读:
    指向指针的指针
    判断是否遵守某个协议
    oc继承,实现,分类
    oc中没有空指针错误
    oc方法
    指针
    Array.diff
    ATM机允许4位或6位密码,而密码只能包含4位或6位数字。 如果函数传递了一个有效的PIN字符串,返回true,否则返回false。
    替换字符串中的字符为“(” 或“)”
    python 异常处理
  • 原文地址:https://www.cnblogs.com/JBLi/p/10856921.html
Copyright © 2011-2022 走看看