zoukankan      html  css  js  c++  java
  • spark热门电影

    
    
    package movies

    import org.apache.spark.rdd.RDD
    import org.apache.spark.{SparkConf, SparkContext}

    object Movice {
    def main(args: Array[String]): Unit = {
    val cof = new SparkConf ()
    .setAppName ( this.getClass.getSimpleName )
    .setMaster ( "local[1]" )
    val sc = new SparkContext ( cof )

    val users1 = sc.textFile ( "D:\学习笔记\资料汇总\day02\资料\热门电影的数据\users.dat" )
    val movies1 = sc.textFile ( "D:\学习笔记\资料汇总\day02\资料\热门电影的数据\movies.dat" )
    val ratings1 = sc.textFile ( "D:\学习笔记\资料汇总\day02\资料\热门电影的数据\ratings.dat" )

    //1:评分(平均分)最高的10部电影 (moviceId, (userId, rating))
    val ratings2: RDD[(Int, (String, Int))] = ratings1.map ( tp => {
    val splits: Array[String] = tp.split ( "::" )
    val userId = splits ( 0 )
    val moviceId = splits ( 1 ).toInt
    val rating = splits ( 2 ).toInt
    (moviceId, (userId, rating))
    } )

    //(moviceId, (userId, 1))
    val rating4:RDD[(Int,(Int,Int))]=ratings2.map(tp=>{
    val rating=tp._2._2
    val moviceId=tp._1
    (moviceId,(rating,1))
    })

    val group2: RDD[(Int, Iterable[(Int, Int)])] =rating4.groupByKey()

    //聚合(movid,rtingsum,counsum)
    val rantresult1: RDD[(Int, Int, Int)] = group2.map(tp=>{
    val rantsum=tp._2.map(tp=>tp._1).sum
    val countsum=tp._2.map(_._2).sum
    (tp._1,rantsum,countsum)
    })
    // //取平均值
    // val ranresult2=rantresult1.map(tp=>{
    // (tp._1,tp._2/tp._3)
    // }).sortBy(-_._2).take(10).foreach(println)

    //2:18 - 24 岁的男性年轻人 最喜欢看的10部电影
    val users2: RDD[(Int, (String, Int))] = users1.map ( tp => {
    val splits: Array[String] = tp.split ( "::" )
    val userId = splits ( 0 ).toInt
    val gender = splits ( 1 )
    val age = splits ( 2 ).toInt

    (userId, (gender, age))
    } )

    val ratings3: RDD[(Int, String)] = ratings1.map ( tp => {
    val splits: Array[String] = tp.split ( "::" )
    val userId = splits ( 0 ).toInt
    val moviceId = splits ( 1 )
    (userId, moviceId)
    } )


    // users2.join ( ratings3 ).filter ( tp => {
    // tp._2._1._1.equals ( "M" )
    // tp._2._1._2 >= 18 && tp._2._1._2 <= 24
    // } ).map ( tp => (
    // tp._2._2, 1)
    // ).reduceByKey ( _ + _ ).sortBy ( -_._2 ).take ( 10 ).foreach ( println )

    //3:女性观看次数最多的10部电影名称及观看次数
    users2.join(ratings3).filter(tp=>{
    tp._2._1._1.equals("F")
    }).map(tp=>(
    tp._2._2,1
    )).reduceByKey(_+_).sortBy(-_._2).take(10).foreach(println)
     

    sc.stop()
    }
    }
     
  • 相关阅读:
    用Sklearn画一颗决策树
    硬核机器学习干货,手把手教你写KNN!
    nginx源码分析源码结构
    linux流量监控iftop命令安装详解
    fping简介及使用方法
    进程与线程的区别(网络摘抄)
    linux nload命令简介及安装方法
    php中heredoc使用方法
    201920201学期 20192430 《网络空间安全专业导论》第一周学习总结1
    五种I/O模型
  • 原文地址:https://www.cnblogs.com/wangshuang123/p/11078358.html
Copyright © 2011-2022 走看看