zoukankan html css js c++ java

Spark 2 Structured Streaming

Start netcat as the socket data source (Windows): nc -L -p 9999

import java.sql.Timestamp

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

/**
  * Windowed word count over a socket text stream, using Spark Structured Streaming.
  *
  * Reads lines from a TCP socket at `host:port`, splits each line into words on single
  * spaces, and counts every word per sliding time window. The window is keyed on the
  * timestamp column that the socket source adds when `includeTimestamp` is enabled.
  * The complete result table, ordered by window descending, is printed to the console
  * on every trigger.
  *
  * Uses an explicit `main` instead of `scala.App`: the Spark documentation warns that
  * subclasses of `scala.App` may not work correctly.
  */
object Test {
  // Socket source to connect to (e.g. a local netcat server).
  val host: String = "localhost"
  val port: Int = 9999

  // Window length and slide interval, both in seconds.
  val windowSize: Int = 10
  val slideSize: Int = 5

  // Duration strings in the form expected by the `window` function.
  val windowDuration: String = s"$windowSize seconds"
  val slideDuration: String = s"$slideSize seconds"

  /**
    * Builds and runs the streaming query; blocks until the query terminates.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    // Fail fast on an invalid configuration instead of only logging it and
    // letting the query fail later when the window expression is built.
    if (slideSize > windowSize) {
      System.err.println("<slide duration> must be less than or equal to <window duration>")
      sys.exit(1)
    }

    val spark = SparkSession
      .builder
      .appName("StructuredNetworkWordCountWindowed")
      .master("local[3]")
      .config("spark.sql.shuffle.partitions", 3)
      .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    import spark.implicits._

    // Create DataFrame representing the stream of input lines from connection to host:port
    val lines = spark.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .option("includeTimestamp", true)
      .load()

    // Split the lines into words, retaining timestamps
    val words = lines.as[(String, Timestamp)].flatMap(line =>
      line._1.split(" ").map(word => (word, line._2))
    ).toDF("word", "timestamp")

    // Group the data by window and word and compute the count of each group
    val windowedCounts = words.groupBy(
      window($"timestamp", windowDuration, slideDuration), $"word"
    ).count().orderBy($"window".desc)

    // Start running the query that prints the windowed word counts to the console.
    // "complete" mode re-emits the whole aggregated table on every trigger.
    val query = windowedCounts.writeStream
      .outputMode("complete")
      .format("console")
      .option("truncate", "false")
      .start()

    query.awaitTermination()
  }
}

Result:

-------------------------------------------
Batch: 1
-------------------------------------------
+---------------------------------------------+----+-----+
|window                                       |word|count|
+---------------------------------------------+----+-----+
|[2017-10-24 16:09:30.0,2017-10-24 16:09:40.0]|b   |3    |
|[2017-10-24 16:09:30.0,2017-10-24 16:09:40.0]|a   |3    |
|[2017-10-24 16:09:30.0,2017-10-24 16:09:40.0]|c   |1    |
|[2017-10-24 16:09:30.0,2017-10-24 16:09:40.0]|d   |1    |
|[2017-10-24 16:06:40.0,2017-10-24 16:06:50.0]|a   |4    |
|[2017-10-24 16:06:35.0,2017-10-24 16:06:45.0]|a   |8    |
|[2017-10-24 16:06:30.0,2017-10-24 16:06:40.0]|a   |4    |
+---------------------------------------------+----+-----+

窗口移动5秒，窗口宽度10秒。
聚合维度： window, word

http://asyncified.io/2017/07/30/exploring-stateful-streaming-with-spark-structured-streaming/

查看全文

相关阅读:
java基础学习笔记四（异常）
关于linux下crontab mysql备份出来的数据为0字节的问题
 转:国内优秀npm镜像推荐及使用
 webpack使用总结~
php下载远程文件方法~
腾讯开放平台web第三方登录获取信息类（包含签名）
windows 平台 php_Imagick 拓展遇到的那些坑！
转:CentOS/Debian/Ubuntu一键安装LAMP（Apache/MySQL/PHP）环境
 composer 报错：Your requirements could not be resolved to an installable set of packages 解决方法
 Javascript模块化编程（三）：require.js的用法

原文地址：https://www.cnblogs.com/luweiseu/p/7724034.html