  • Logs => Flume => Kafka => Spark Streaming => HBase

    Log generation

    #coding=UTF-8
    import random
    import time
    
    
    url_paths = [
            "class/112.html", 
            "class/128.html", 
            "learn/821", 
            "class/145.html",
            "class/146.html",
            "class/131.html",
            "class/130.html",
            "course/list"
            ]
    
    
    ip_slices = [132,156,124,10, 29, 167,143,187,30, 46, 55, 63, 72, 87,98,168]
    
    http_referers = [
                            "http://www.baidu.com/s?wd={query}",
                            "http://www.sogou.com/web?query={query}",
                            "https://search.yahoo.com/search?p={query}",
                            "http://www.bing.com/search?q={query}"
                    ]
    
    search_keyword = ["Spark SQL实战", "Hadoop基础", "Storm实战", "Spark Streaming实战", "大数据面试"]
    
    status_codes = ["200", "404", "500"]
    
    def sample_url():
            return random.sample(url_paths,1)[0]
    
    def sample_ip():
            ip_nums = random.sample(ip_slices, 4)
            return ".".join([str(item) for item in ip_nums])
    
    def sample_status_code():
            return random.sample(status_codes,1)[0]
    
    def sample_referer():
            if random.uniform(0, 1) > 0.2:
                    return "-"
    
            refer_str = random.sample(http_referers, 1)
            query_str = random.sample(search_keyword, 1)
            return refer_str[0].format(query=query_str[0])
    
    def generate_log(count=3):
            time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            f = open("/home/hadoop/data/project/logs/access.log", "w+")
            while count >= 1:
                    # tab-separated fields: ip, time, request, status code, referer
                    query_log = "{ip}\t{local_time}\t\"GET /{url} HTTP/1.1\"\t{status_code}\t{referer}".format(ip=sample_ip(), local_time=time_str, url=sample_url(), status_code=sample_status_code(), referer=sample_referer())
                    print query_log
                    f.write(query_log + "\n")
                    count = count - 1
            f.close()
    
    if __name__ == '__main__':
            #print sample_ip()
            #print sample_url()
            generate_log(10)

     Flume agent that tails the access log and forwards new lines to Kafka (re-run the log generator to produce fresh lines for Flume to pick up)

    exec-memory-kafka.conf
    #exec-memory-kafka
    
    exec-memory-kafka.sources = exec-source
    exec-memory-kafka.channels = memory-channel
    exec-memory-kafka.sinks = kafka-sink
    
    exec-memory-kafka.sources.exec-source.type = exec
    exec-memory-kafka.sources.exec-source.command = tail -F /home/hadoop/data/project/logs/access.log
    exec-memory-kafka.sources.exec-source.shell = /bin/sh -c
    exec-memory-kafka.sources.exec-source.channels = memory-channel
                
    exec-memory-kafka.channels.memory-channel.type = memory
                
    exec-memory-kafka.sinks.kafka-sink.type = org.apache.flume.sink.kafka.KafkaSink
    exec-memory-kafka.sinks.kafka-sink.topic = streamingtopic
    exec-memory-kafka.sinks.kafka-sink.brokerList = hadoop:9092
    exec-memory-kafka.sinks.kafka-sink.batchSize = 5
    exec-memory-kafka.sinks.kafka-sink.requiredAcks = 1
    exec-memory-kafka.sinks.kafka-sink.channel = memory-channel

    flume-ng agent \
    --name exec-memory-kafka \
    --conf $FLUME_HOME/conf \
    --conf-file /home/hadoop/data/project/exec-memory-kafka.conf \
    -Dflume.root.logger=INFO,console

    Start a Kafka console consumer to verify that the log lines arrive: kafka-console-consumer.sh --zookeeper hadoop:2181 --topic streamingtopic --from-beginning

    Start Hadoop: start-dfs.sh


    Start HBase: start-hbase.sh

    Enter the HBase shell: hbase shell -> list tables: list
    HBase table design:
    create 'lin_course_clickcount','info'
    create 'lin_course_search_clickcount','info'
    Inspect a table: scan 'lin_course_clickcount'
    Rowkey design:
    day_courseid
    day_search_courseid
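
    A small Scala illustration (not part of the original post; the values are invented) of how those two rowkeys are assembled from one parsed click record:

    object RowkeySketch {
      def main(args: Array[String]): Unit = {
        val time = "20190606053701"      // output of DateUtils.parseToMinute (yyyyMMddHHmmss)
        val courseId = 128               // parsed from a /class/128.html request
        val host = "www.baidu.com"       // search engine host parsed from the referer

        val dayCourse = time.substring(0, 8) + "_" + courseId                    // 20190606_128
        val daySearchCourse = time.substring(0, 8) + "_" + host + "_" + courseId // 20190606_www.baidu.com_128

        println(dayCourse)
        println(daySearchCourse)
      }
    }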

    pom.xml

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
    
        <groupId>com.lin.spark</groupId>
        <artifactId>SparkStreaming</artifactId>
        <version>1.0-SNAPSHOT</version>
        <properties>
            <scala.version>2.11.8</scala.version>
            <kafka.version>0.9.0.0</kafka.version>
            <spark.version>2.2.0</spark.version>
            <hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
            <hbase.version>1.2.0-cdh5.7.0</hbase.version>
        </properties>
    
    <!-- Add the Cloudera repository -->
        <repositories>
            <repository>
                <id>cloudera</id>
                <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
            </repository>
        </repositories>
    
        <dependencies>
            <dependency>
                <groupId>org.scala-lang</groupId>
                <artifactId>scala-library</artifactId>
                <version>${scala.version}</version>
            </dependency>
    
        <!-- Kafka dependency -->
            <!--
            <dependency>
                <groupId>org.apache.kafka</groupId>
                <artifactId>kafka_2.11</artifactId>
                <version>${kafka.version}</version>
            </dependency>
            -->
    
        <!-- Hadoop dependency -->
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>${hadoop.version}</version>
            </dependency>
    
        <!-- HBase dependencies -->
            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-client</artifactId>
                <version>${hbase.version}</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-server</artifactId>
                <version>${hbase.version}</version>
            </dependency>
    
        <!-- Spark Streaming dependency -->
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-streaming_2.11</artifactId>
                <version>${spark.version}</version>
            </dependency>
    
    
        <!-- Spark Streaming + Flume integration dependencies -->
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-streaming-flume_2.11</artifactId>
                <version>${spark.version}</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-streaming-flume-sink_2.11</artifactId>
                <version>${spark.version}</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
                <version>${spark.version}</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.commons</groupId>
                <artifactId>commons-lang3</artifactId>
                <version>3.5</version>
            </dependency>
    
        <!-- Spark SQL dependency -->
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-sql_2.11</artifactId>
                <version>${spark.version}</version>
            </dependency>
    
    
            <dependency>
                <groupId>com.fasterxml.jackson.module</groupId>
                <artifactId>jackson-module-scala_2.11</artifactId>
                <version>2.6.5</version>
            </dependency>
    
            <dependency>
                <groupId>net.jpountz.lz4</groupId>
                <artifactId>lz4</artifactId>
                <version>1.3.0</version>
            </dependency>
    
            <dependency>
                <groupId>mysql</groupId>
                <artifactId>mysql-connector-java</artifactId>
                <version>5.1.38</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.flume.flume-ng-clients</groupId>
                <artifactId>flume-ng-log4jappender</artifactId>
                <version>1.6.0</version>
            </dependency>
    
        </dependencies>
    
        <build>
            <!--
            <sourceDirectory>src/main/scala</sourceDirectory>
            <testSourceDirectory>src/test/scala</testSourceDirectory>
            -->
            <plugins>
                <plugin>
                    <groupId>org.scala-tools</groupId>
                    <artifactId>maven-scala-plugin</artifactId>
                    <executions>
                        <execution>
                            <goals>
                                <goal>compile</goal>
                                <goal>testCompile</goal>
                            </goals>
                        </execution>
                    </executions>
                    <configuration>
                        <scalaVersion>${scala.version}</scalaVersion>
                        <args>
                            <arg>-target:jvm-1.5</arg>
                        </args>
                    </configuration>
                </plugin>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-eclipse-plugin</artifactId>
                    <configuration>
                        <downloadSources>true</downloadSources>
                        <buildcommands>
                            <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
                        </buildcommands>
                        <additionalProjectnatures>
                            <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
                        </additionalProjectnatures>
                        <classpathContainers>
                            <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
                            <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
                        </classpathContainers>
                    </configuration>
                </plugin>
            </plugins>
        </build>
        <reporting>
            <plugins>
                <plugin>
                    <groupId>org.scala-tools</groupId>
                    <artifactId>maven-scala-plugin</artifactId>
                    <configuration>
                        <scalaVersion>${scala.version}</scalaVersion>
                    </configuration>
                </plugin>
            </plugins>
        </reporting>
    
    </project>
    StatStreamingApp.scala

    package com.lin.spark.streaming.project.spark
    
    import com.lin.spark.streaming.project.dao.{CourseClickCountDAO, CourseSearchClickCountDAO}
    import com.lin.spark.streaming.project.domain.{ClickLog, CourseClickCount, CourseSearchClickCount}
    import com.lin.spark.streaming.project.utils.DateUtils
    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.kafka.KafkaUtils
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    
    import scala.collection.mutable.ListBuffer
    
    /**
      * Created by Administrator on 2019/6/6.
      */
    object StatStreamingApp {
      def main(args: Array[String]): Unit = {
    
        if (args.length != 4) {
          System.err.println("Usage: StatStreamingApp <zkQuorum> <group> <topics> <numThreads>")
          System.exit(1)
        }
        // e.g. hadoop:2181 test streamingtopic 2
        val Array(zkQuorum, group, topics, numThreads) = args
        val conf = new SparkConf().setAppName("KafkaUtil").setMaster("local[4]")
        val ssc = new StreamingContext(conf, Seconds(60))
    
        val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    
        val clickLog = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).map(_._2)
    
        val cleanData = clickLog.map(line => {
          val infos = line.split("\t")
          //29.98.156.124   2019-06-06 05:37:01     "GET /class/131.html HTTP/1.1"  500     http://www.baidu.com/s?wd=Storm实战
          //case class ClickLog(ip:String, time:String, courseId:Int, statusCode:Int, referer:String)
          var courseId = 0
          val url = infos(2).split(" ")(1)
          if (url.startsWith("/class")) {
            val urlHTML = url.split("/")(2)
            courseId = urlHTML.substring(0, urlHTML.lastIndexOf(".")).toInt
          }
          ClickLog(infos(0), DateUtils.parseToMinute(infos(1)), courseId, infos(3).toInt, infos(4))
        }).filter(clickLog => clickLog.courseId != 0)
    
        // Persist per-course click counts to HBase
        cleanData.map(log => {
          (log.time.substring(0, 8) + "_" + log.courseId, 1)
        }).reduceByKey(_ + _).foreachRDD(rdd => {
          rdd.foreachPartition(partitionRecords => {
            val list = new ListBuffer[CourseClickCount]
            partitionRecords.foreach(pair => {
              list.append(CourseClickCount(pair._1, pair._2))
            })
            CourseClickCountDAO.save(list)
          })
        })
    
        // Persist per-search-engine (referer) click counts to HBase
        cleanData.map(log => {
    
          val referer = log.referer.replaceAll("//", "/")
          val splits = referer.split("/")
          var host = ""
          if (splits.length > 2) {
            host = splits(1)
          }
          (host, log.courseId, log.time)
        }).filter(x => {
          x._1 != ""
        }).map(searchLog=>{
          (searchLog._3.substring(0,8) + "_" + searchLog._1 + "_" + searchLog._2 , 1)
        }).reduceByKey(_ + _).foreachRDD(rdd => {
          rdd.foreachPartition(partitionRecords => {
            val list = new ListBuffer[CourseSearchClickCount]
            partitionRecords.foreach(pair => {
              list.append(CourseSearchClickCount(pair._1, pair._2))
            })
            CourseSearchClickCountDAO.save(list)
          })
        })
    
        ssc.start()
        ssc.awaitTermination()
      }
    }
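
    StatStreamingApp above uses the receiver-based KafkaUtils.createStream, which tracks offsets through ZooKeeper. The spark-streaming-kafka-0-8 dependency declared in the pom also exposes a direct stream that reads from the brokers. The following is only a minimal sketch, not part of the original project; it assumes the broker hadoop:9092 and the streamingtopic topic configured in the Flume sink above:

    import kafka.serializer.StringDecoder
    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.kafka.KafkaUtils
    import org.apache.spark.streaming.{Seconds, StreamingContext}

    // Sketch only: a direct (no-receiver) Kafka source for the same topic.
    object DirectStreamSketch {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("DirectStreamSketch").setMaster("local[2]")
        val ssc = new StreamingContext(conf, Seconds(60))

        // Broker list taken from the Flume Kafka sink config; adjust to your cluster.
        val kafkaParams = Map("metadata.broker.list" -> "hadoop:9092")
        val topics = Set("streamingtopic")

        // Each record is a (key, message) pair; the access-log line is the message.
        val lines = KafkaUtils
          .createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
          .map(_._2)

        lines.print()

        ssc.start()
        ssc.awaitTermination()
      }
    }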
    DateUtils.scala

    package com.lin.spark.streaming.project.utils
    
    import java.util.Date
    
    import org.apache.commons.lang3.time.FastDateFormat
    
    /**
      * Created by Administrator on 2019/6/6.
      */
    object DateUtils {
    
      val YYYYMMDDHHMMSS_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")
      val TARGET_FORMAT = FastDateFormat.getInstance("yyyyMMddHHmmss")
    
      def getTime(time:String) ={
        YYYYMMDDHHMMSS_FORMAT.parse(time).getTime
      }
    
      def parseToMinute(time:String)={
        TARGET_FORMAT.format(new Date(getTime(time)))
      }
    
      def main(args: Array[String]): Unit = {
        println(parseToMinute("2017-10-22 14:46:01"))
      }
    }
    ClickLog.scala

    package com.lin.spark.streaming.project.domain
    
    
    case class ClickLog(ip:String, time:String, courseId:Int, statusCode:Int, referer:String)
    CourseClickCount.scala

    package com.lin.spark.streaming.project.domain
    
    /**
      * Created by Administrator on 2019/6/7.
      */
    case class CourseClickCount(day_course:String,click_course:Long)
    CourseSearchClickCount.scala

    package com.lin.spark.streaming.project.domain
    
    /**
      * Created by Administrator on 2019/6/7.
      */
    case class CourseSearchClickCount(day_search_course:String, click_count:Long)
    CourseClickCountDAO.scala

    package com.lin.spark.streaming.project.dao
    
    import com.lin.spark.project.utils.HBaseUtils
    import com.lin.spark.streaming.project.domain.CourseClickCount
    import org.apache.hadoop.hbase.client.Get
    import org.apache.hadoop.hbase.util.Bytes
    
    import scala.collection.mutable.ListBuffer
    
    /**
      * Created by Administrator on 2019/6/7.
      */
    object CourseClickCountDAO {
    
      val tableName = "lin_course_clickcount"
      val cf = "info"
      val qualifier = "click_count"
    
      def save(list:ListBuffer[CourseClickCount]):Unit={
        val table =HBaseUtils.getInstance().getTable(tableName)
        for (ele <- list){
          table.incrementColumnValue(Bytes.toBytes(ele.day_course),
            Bytes.toBytes(cf),
            Bytes.toBytes(qualifier),
            ele.click_course)
        }
      }
    
      def count(day_course:String):Long={
        val table = HBaseUtils.getInstance().getTable(tableName)
        val get = new Get(Bytes.toBytes(day_course))
        val value = table.get(get).getValue(cf.getBytes, qualifier.getBytes)
        if(value == null){
          0L
        }else{
          Bytes.toLong(value)
        }
      }
    
      def main(args: Array[String]): Unit = {
        val list = new ListBuffer[CourseClickCount]
        list.append(CourseClickCount("20190606",99))
        list.append(CourseClickCount("20190608",89))
        list.append(CourseClickCount("20190609",100))
    //    save(list)
        println(count("20190609"))
      }
    }
    CourseSearchClickCountDAO.scala

    package com.lin.spark.streaming.project.dao
    
    import com.lin.spark.project.utils.HBaseUtils
    import com.lin.spark.streaming.project.domain.{CourseClickCount, CourseSearchClickCount}
    import org.apache.hadoop.hbase.client.Get
    import org.apache.hadoop.hbase.util.Bytes
    
    import scala.collection.mutable.ListBuffer
    
    /**
      * Created by Administrator on 2019/6/7.
      */
    object CourseSearchClickCountDAO {
    
      val tableName = "lin_course_search_clickcount"
      val cf = "info"
      val qualifier = "click_count"
    
      def save(list:ListBuffer[CourseSearchClickCount]):Unit={
        val table =HBaseUtils.getInstance().getTable(tableName)
        for (ele <- list){
          table.incrementColumnValue(Bytes.toBytes(ele.day_search_course),
            Bytes.toBytes(cf),
            Bytes.toBytes(qualifier),
            ele.click_count)
        }
      }
    
      def count(day_course:String):Long={
        val table = HBaseUtils.getInstance().getTable(tableName)
        val get = new Get(Bytes.toBytes(day_course))
        val value = table.get(get).getValue(cf.getBytes, qualifier.getBytes)
        if(value == null){
          0L
        }else{
          Bytes.toLong(value)
        }
      }
    
      def main(args: Array[String]): Unit = {
        val list = new ListBuffer[CourseSearchClickCount]
        list.append(CourseSearchClickCount("20190606_www.baidu.com_99",99))
        list.append(CourseSearchClickCount("20190608_www.bing.com_89",89))
        list.append(CourseSearchClickCount("20190609_www.csdn.net_100",100))
        save(list)
    //    println(count("20190609"))
      }
    }
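
    Both DAOs above import com.lin.spark.project.utils.HBaseUtils, a helper the post never shows. Below is only a minimal sketch of what such a helper could look like (the author's original version may differ, e.g. it may be written in Java); the ZooKeeper host hadoop and port 2181 are taken from the commands earlier in this post:

    package com.lin.spark.project.utils

    import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Table}
    import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}

    // Sketch only: a shared HBase connection plus the getInstance()/getTable()
    // accessors the DAOs expect.
    object HBaseUtils {

      private lazy val conf = {
        val c = HBaseConfiguration.create()
        c.set("hbase.zookeeper.quorum", "hadoop")            // assumed ZooKeeper host
        c.set("hbase.zookeeper.property.clientPort", "2181") // assumed ZooKeeper port
        c
      }

      // Connections are heavyweight; create one and reuse it.
      private lazy val connection: Connection = ConnectionFactory.createConnection(conf)

      // Keeps the Java-singleton-style call sites (HBaseUtils.getInstance().getTable(...)) working.
      def getInstance(): HBaseUtils.type = this

      def getTable(tableName: String): Table =
        connection.getTable(TableName.valueOf(tableName))
    }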
  • Original post: https://www.cnblogs.com/linkmust/p/10989918.html