一、文件截图
处理数据量:约 804 KB
二、流程分析
按分隔符(空格)切分每行日志并取出时间字段,解析出其中的小时,再按小时统计访问量,最后在控制台以(小时,访问量)的形式输出。
三、代码
package rdd.operator.transform

import java.text.SimpleDateFormat
import java.util.Date

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Counts the number of requests per hour of day in an Apache access log.
 *
 * Each line of `datas/apache.log` is split on single spaces; the fourth field
 * (index 3) is parsed as a `dd/MM/yyyy:HH:mm:ss` timestamp and reduced to its
 * two-digit hour (`HH`). The program prints one `(hour,count)` tuple per hour,
 * ordered by hour.
 */
object Spark06_RDD_Operator_Transform_groupby {

  /**
   * Runs the job on a local Spark master using all available cores.
   *
   * A line with fewer than four fields or an unparsable timestamp makes the
   * job fail; such lines are not skipped silently.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("Operator")
    val sc = new SparkContext(sparkConf)

    // Stop the context even when the job throws, so the local Spark runtime is
    // always released.
    try {
      val rdd: RDD[String] = sc.textFile("datas/apache.log")

      // Build the formatters once per partition instead of once per line.
      // They are created inside the closure, so they are never serialized and
      // never shared between partitions (SimpleDateFormat is not synchronized).
      val hourOnes: RDD[(String, Int)] = rdd.mapPartitions { lines =>
        val timestampFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss")
        val hourFormat = new SimpleDateFormat("HH")
        lines.map { line =>
          val fields: Array[String] = line.split(" ")
          val date: Date = timestampFormat.parse(fields(3))
          (hourFormat.format(date), 1)
        }
      }

      // reduceByKey combines counts on the map side, so only partial sums per
      // hour are shuffled rather than every single (hour, 1) record.
      val hourCounts: RDD[(String, Int)] = hourOnes.reduceByKey(_ + _)

      // There are at most 24 distinct keys, so sorting on the driver is cheap
      // and makes the printed order deterministic.
      hourCounts.collect().sortBy(_._1).foreach(println)
    } finally {
      sc.stop()
    }
  }
}
四、运行截图