zoukankan      html  css  js  c++  java
  • Spark FPGrowth (Frequent Pattern Mining)

    给定交易数据集,FP增长的第一步是计算项目频率并识别频繁项目。与为同样目的设计的类似Apriori的算法不同,FP增长的第二步使用后缀树(FP-tree)结构来编码事务,而不会显式生成候选集,生成的代价通常很高。第二步之后,可以从FP树中提取频繁项集。

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.mllib.fpm.FPGrowth
    import org.apache.spark.rdd.RDD
    
    
    // Obtain (or create) the SparkSession — the entry point for DataFrame/SQL APIs.
    // NOTE(review): "spark.some.config.option" is a placeholder config key from the
    // Spark docs; it has no effect here.
    val spark = SparkSession
          .builder()
          .appName("Spark SQL basic example")
          .config("spark.some.config.option", "some-value")
          .getOrCreate()
    
    // For implicit conversions like converting RDDs to DataFrames
    import spark.implicits._
    
    // Sample input: each string is one transaction, a comma-separated basket of item ids.
    val data = List(
                "1,2,5",
                "1,2,3,5",
                "1,2").toDF("items")
    data: org.apache.spark.sql.DataFrame = [items: string]
    
    // Note: Row.toString wraps every row in a leading '[' and trailing ']' bracket
     data.rdd.map { s => s.toString() }.collect().take(3)
    res20: Array[String] = Array([1,2,5], [1,2,3,5], [1,2])                         
    
    // Strip the surrounding brackets added by Row.toString, then split on ','
    // to obtain the Array[String] item lists that FPGrowth.run expects.
    val transactions: RDD[Array[String]] = data.rdd.map {
                s =>
                  val str = s.toString().drop(1).dropRight(1)
                  str.trim().split(",")
              }
    		  
    // Configure FP-Growth: keep itemsets appearing in >= 50% of the transactions
    // (minSupport = 0.5), and run the parallel mining over 8 partitions.
    val fpg = new FPGrowth().setMinSupport(0.5).setNumPartitions(8)
    
    // Mine the frequent itemsets from the transactions RDD.
    val model = fpg.run(transactions)
    
    /* model.freqItemsets.collect().foreach { itemset =>
                println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
              }*/
          
    // Flatten each FreqItemset into an (items, freq) pair and expose it as a DataFrame.
    val freqItemSets = model.freqItemsets.map { itemset =>
                val items = itemset.items.mkString(",")
                val freq = itemset.freq
                (items, freq)
              }.toDF("items", "freq")
    freqItemSets: org.apache.spark.sql.DataFrame = [items: string, freq: bigint]
    
    freqItemSets.show
    +-----+----+
    |items|freq|
    +-----+----+
    |    1|   3|
    |    2|   3|
    |  2,1|   3|
    |    5|   2|
    |  5,2|   2|
    |5,2,1|   2|
    |  5,1|   2|
    +-----+----+
    
    // Only keep rules whose confidence (P(consequent | antecedent)) is >= 0.6.
    val minConfidence = 0.6
    minConfidence: Double = 0.6
    
    /*model.generateAssociationRules(minConfidence).collect().foreach { rule =>
                println(
                  rule.antecedent.mkString("[", ",", "]")
                    + " => " + rule.consequent.mkString("[", ",", "]")
                    + ", " + rule.confidence)
              }*/
          
    // Generate association rules from the mined itemsets, filtered by minConfidence.
    val Rules = model.generateAssociationRules(minConfidence)
    Rules: org.apache.spark.rdd.RDD[org.apache.spark.mllib.fpm.AssociationRules.Rule[String]] = MapPartitionsRDD[129] at filter at AssociationRules.scala:80
    
    // Flatten each rule into (antecedent, consequent, confidence) and expose as a DataFrame.
    val df = Rules.map { s =>
                val L = s.antecedent.mkString(",")
                val R = s.consequent.mkString(",")
                val confidence = s.confidence
                (L, R, confidence)
              }.toDF("left_collect", "right_collect", "confidence")
    df: org.apache.spark.sql.DataFrame = [left_collect: string, right_collect: string ... 1 more field]
    
    df.show
    +------------+-------------+------------------+
    |left_collect|right_collect|        confidence|
    +------------+-------------+------------------+
    |           2|            5|0.6666666666666666|
    |           2|            1|               1.0|
    |         5,2|            1|               1.0|
    |           5|            2|               1.0|
    |           5|            1|               1.0|
    |           1|            5|0.6666666666666666|
    |           1|            2|               1.0|
    |         2,1|            5|0.6666666666666666|
    |         5,1|            2|               1.0|
    +------------+-------------+------------------+
    
  • 相关阅读:
    一起谈.NET技术,C#特性Attribute的实际应用之:代码统计分析 狼人:
    uploadify+C#实例
    GoldenGate DDL双向复制
    [置顶] Cocos2dx 深入解析系列:以XML文件方式保存用户数据
    打包软件里面安装完打开网页
    [置顶] NYOJ38 布线问题
    IDC:PC 今年第一季度出货量继续下滑趋势,比起去年同期跌了13.9%
    设计模式之组合模式java实现
    Linux C下实现线程池
    [置顶] Unix 网络编程系列05
  • 原文地址:https://www.cnblogs.com/wwxbi/p/7339806.html
Copyright © 2011-2022 走看看