zoukankan      html  css  js  c++  java
  • spark计算两个DataFrame的差集、交集、合集

    使用 Spark 计算两个 DataFrame 的差集、交集、合集时,最好只选择某一列来进行比较。首先新建两个 DataFrame:

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.SQLContext
    
    /**
      * Demo entry point: creates a Spark SQL context and builds two small
      * DataFrames with the columns "label" and "sentence", printing each one.
      *
      * The two frames share the rows (1, "asf") and (2, "2143") and differ in
      * their third row, which makes them convenient inputs for set operations
      * on the "sentence" column.
      *
      * @param args command-line arguments (not used)
      */
    def main(args: Array[String]): Unit = {

        // Spark setup: application name "TTyb", master "local".
        val sparkConf = new SparkConf()
          .setAppName("TTyb")
          .setMaster("local")
        val sparkContext = new SparkContext(sparkConf)
        val spark = new SQLContext(sparkContext)

        // Both frames use the same column names.
        val columnNames = Seq("label", "sentence")

        // First frame: its third row is (3, "rfds").
        val firstRows = Seq((1, "asf"), (2, "2143"), (3, "rfds"))
        val sentenceDataFrame = spark.createDataFrame(firstRows).toDF(columnNames: _*)
        sentenceDataFrame.show()

        // Second frame: its third row is (4, "f8934y").
        val secondRows = Seq((1, "asf"), (2, "2143"), (4, "f8934y"))
        val sentenceDataFrame1 = spark.createDataFrame(secondRows).toDF(columnNames: _*)
        sentenceDataFrame1.show()
    }
    

    差集 except

    // Difference: "sentence" values of sentenceDataFrame1 that do not occur
    // in sentenceDataFrame.
    val newDF = {
        val candidates = sentenceDataFrame1.select("sentence")
        val excluded = sentenceDataFrame.select("sentence")
        candidates.except(excluded)
    }
    newDF.show()
    

    +--------+
    |sentence|
    +--------+
    |  f8934y|
    +--------+

    交集 intersect

    // Intersection: "sentence" values that occur in both DataFrames.
    val newDF = {
        val left = sentenceDataFrame1.select("sentence")
        val right = sentenceDataFrame.select("sentence")
        left.intersect(right)
    }
    newDF.show()
    

    +--------+
    |sentence|
    +--------+
    |     asf|
    |    2143|
    +--------+

    合集 union

    // Union: the "sentence" rows of sentenceDataFrame1 followed by those of
    // sentenceDataFrame; rows present in both appear twice in the result.
    val newDF = {
        val first = sentenceDataFrame1.select("sentence")
        val second = sentenceDataFrame.select("sentence")
        first.union(second)
    }
    newDF.show()
    

    +--------+
    |sentence|
    +--------+
    |     asf|
    |    2143|
    |  f8934y|
    |     asf|
    |    2143|
    |    rfds|
    +--------+

    union 不会去除重复行(相当于 SQL 的 UNION ALL),所以合集最好再用 distinct 去一下重

    // De-duplicated union: combine the "sentence" columns of both DataFrames,
    // then keep each distinct value once.
    val newDF = {
        val first = sentenceDataFrame1.select("sentence")
        val second = sentenceDataFrame.select("sentence")
        val combined = first.union(second)
        combined.distinct()
    }
    newDF.show()
    

    +--------+
    |sentence|
    +--------+
    |    rfds|
    |     asf|
    |    2143|
    |  f8934y|
    +--------+

  • 相关阅读:
    hdu 1520
    poj 3468
    hdu 1698
    shell:实现linux服务器资源监控并发送告警邮件
    Tools:apache部署https服务
    自动化:客户端自动化工具比对
    python:爬虫获取淘宝/天猫的商品信息
    js:浏览器插件
    自动化:图像相似度比较,并标记不一样的地方
    Linux:常用命令【转载】
  • 原文地址:https://www.cnblogs.com/TTyb/p/7991952.html
Copyright © 2011-2022 走看看