zoukankan      html  css  js  c++  java
  • 实际业务代码开发

    数据清洗

    时间工具类开发: DateUtils.scala

    package com.imooc.utils
    
    import java.util.Date
    import org.apache.commons.lang3.time.FastDateFormat
    
    /**
      * Date/time helpers used to normalise access-log timestamps.
      */
    object DateUtils {

      /** Layout of incoming timestamps, e.g. `2019-03-31 06:00:00`. */
      val YYYYMMDDHHMMSS_FORMAT: FastDateFormat = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")

      /** Compact layout of outgoing timestamps, e.g. `20190331060000`. */
      val TARGET_FORMAT: FastDateFormat = FastDateFormat.getInstance("yyyyMMddHHmmss")

      /**
        * Parses `time` with [[YYYYMMDDHHMMSS_FORMAT]] and returns the `getTime`
        * value (milliseconds) of the resulting `java.util.Date`.
        *
        * Any exception raised by `FastDateFormat.parse` for input that does not
        * match the layout is propagated to the caller.
        *
        * @param time timestamp text in `yyyy-MM-dd HH:mm:ss` layout
        * @return the parsed instant in milliseconds
        */
      def getTime(time: String): Long =
        YYYYMMDDHHMMSS_FORMAT.parse(time).getTime

      /**
        * Re-formats a `yyyy-MM-dd HH:mm:ss` timestamp into [[TARGET_FORMAT]].
        *
        * Despite the name, the output pattern is `yyyyMMddHHmmss`, so the seconds
        * are kept rather than truncated to the minute.
        *
        * @param time timestamp text in `yyyy-MM-dd HH:mm:ss` layout
        * @return the same instant rendered as `yyyyMMddHHmmss`
        */
      def parseToMinute(time: String): String = {
        val millis = getTime(time)
        TARGET_FORMAT.format(new Date(millis))
      }

      /** Manual smoke check: prints the converted form of a fixed sample timestamp. */
      def main(args: Array[String]): Unit =
        println(parseToMinute("2019-03-31 06:12:00"))
    }
    

    定义数据结果

    ClickLog.scala

    package com.imooc
    /**
      * One access-log record after cleaning.
      *
      * @param time       time of the access
      * @param ip         IP address recorded for the access
      * @param courseId   id of the course that was accessed
      * @param statusCode status code produced by the access
      * @param referer    referer produced by the access
      */
    case class ClickLog(
      time: String,
      ip: String,
      courseId: Int,
      statusCode: Int,
      referer: String
    )
    //    10.25.63.72   2019-03-31 06:17:01 "GET /class/153.html HTTP/1.1"   500 https://cn.bing.com/search?q=Spark Streaming实战
    

    主代码:

    ImoocStatStreamingApp.scala

    package com.imooc
    
    import com.imooc.utils.DateUtils
    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.kafka.KafkaUtils
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    
    /**
      * Spark Streaming job that consumes access-log lines from Kafka, cleans them
      * into [[ClickLog]] records and prints every batch.
      *
      * Expected program arguments: `<zkQuorum> <groupId> <topics> <numThreads>`,
      * where `topics` is a comma-separated list and `numThreads` is applied to
      * each topic.
      */
    object ImoocStatStreamingApp {

      import scala.util.Try

      /**
        * Entry point: validates the arguments, builds a 60-second-batch streaming
        * context, wires the Kafka stream through [[parseLine]] and runs until
        * the context terminates.
        *
        * @param args `zkQuorum`, `groupId`, `topics`, `numThreads` (exactly four)
        */
      def main(args: Array[String]): Unit = {
        if (args.length != 4) {
          System.err.println("Usage: <zkQuorum> <groupId> <topics> <numThreads>")
          System.exit(1)
        }

        val Array(zkQuorum, groupId, topics, numThreads) = args

        // NOTE(review): the master is hard-coded to local[2]; confirm this is
        // intended before submitting the job to a cluster.
        val sparkConf = new SparkConf()
          .setAppName("ImoocStatStreamingApp")
          .setMaster("local[2]")
        val ssc = new StreamingContext(sparkConf, Seconds(60))

        val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
        val messages = KafkaUtils.createStream(ssc, zkQuorum, groupId, topicMap)

        // Each Kafka message is a (key, value) pair; only the value (the raw log
        // line) is used. Lines that cannot be cleaned are dropped by parseLine
        // instead of failing the whole batch.
        val cleanData = messages
          .map(_._2)
          .flatMap(line => parseLine(line).toList)

        cleanData.print()

        ssc.start()
        ssc.awaitTermination()
      }

      /**
        * Cleans one raw, tab-separated log line.
        *
        * Sample input (fields separated by tabs):
        * `10.25.63.72  2019-03-31 06:17:01  "GET /class/153.html HTTP/1.1"  500  https://cn.bing.com/search?q=...`
        *
        * Sample output:
        * `ClickLog(20190331061701, 10.25.63.72, 153, 500, https://cn.bing.com/search?q=...)`
        *
        * @param line raw log line
        * @return `Some(ClickLog)` for a well-formed line whose URL yields a non-zero
        *         course id; `None` when the line has fewer than five fields, the
        *         request has no URL token, the URL is not a `/class` page, or the
        *         timestamp / status code / course id cannot be parsed
        */
      private def parseLine(line: String): Option[ClickLog] =
        line.split("\t") match {
          // Fields beyond the fifth are ignored.
          case Array(ip, time, request, status, referer, _*) =>
            for {
              // request looks like: "GET /class/153.html HTTP/1.1" -> second token is the URL
              url      <- request.split(" ").lift(1)
              courseId <- extractCourseId(url)
              // Try guards the timestamp conversion and the status-code toInt.
              clickLog <- Try(
                ClickLog(DateUtils.parseToMinute(time), ip, courseId, status.toInt, referer)
              ).toOption
            } yield clickLog
          case _ => None
        }

      /**
        * Extracts the numeric course id from a URL such as `/class/153.html`.
        *
        * @param url request path taken from the log line
        * @return the id when `url` starts with `/class`, its third `/`-separated
        *         segment has the form `<int>.<ext>` and the id is non-zero;
        *         `None` otherwise
        */
      private def extractCourseId(url: String): Option[Int] =
        if (!url.startsWith("/class")) None
        else
          url.split("/").lift(2)
            // substring throws when there is no '.', toInt when the prefix is not numeric.
            .flatMap(page => Try(page.substring(0, page.lastIndexOf(".")).toInt).toOption)
            // Zero marks "no course" and is discarded.
            .filter(_ != 0)
    }
    
  • 相关阅读:
    树链剖分 (模板) 洛谷3384
    ST表 (模板) 洛谷3865
    IOI 2005 River (洛谷 3354)
    IOI 2005 River (洛谷 3354)
    poj1094 Sorting It All Out
    poj1094 Sorting It All Out
    spfa(模板)
    HAOI 2006 受欢迎的牛 (洛谷2341)
    HAOI 2006 受欢迎的牛 (洛谷2341)
    洛谷1850(NOIp2016) 换教室——期望dp
  • 原文地址:https://www.cnblogs.com/suixingc/p/shi-ji-ye-wu-dai-ma-kai-fa.html
Copyright © 2011-2022 走看看