zoukankan      html  css  js  c++  java
  • Spark 机器学习 ---TF-IDF

    package Spark_MLlib
    
    import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
    import org.apache.spark.sql.SparkSession
    
    /**
      * TF-IDF feature-extraction demo.
      *
      * Builds a four-row DataFrame of (label, sentence), tokenizes each sentence,
      * hashes the tokens into a 1000-dimensional term-frequency vector with
      * [[HashingTF]], then fits an [[IDF]] model on those vectors and applies it to
      * produce the final "features" column. Every intermediate DataFrame is printed
      * to stdout.
      */
    object 特征抽取 {
      /** Local-mode session used by the demo; stopped at the end of [[main]]. */
      val spark: SparkSession =
        SparkSession.builder().master("local").appName("TF-IDF").getOrCreate()

      /**
        * Runs the tokenizer -> HashingTF -> IDF chain and prints each stage.
        *
        * @param args command-line arguments (unused)
        */
      def main(args: Array[String]): Unit = {
        try {
          val sourceData = spark.createDataFrame(Seq(
            (0, "soyo spark like spark hadoop spark and spark like spark"),
            (1, "i wish i can like java i"),
            (2, "but i dont know how to soyo"),
            (3, "spark is good spark tool")
          )).toDF("label", "sentence")

          // Tokenize: split every sentence into a "words" array column.
          val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
          val wordsData = tokenizer.transform(sourceData)
          // truncate = false: print the full cell contents instead of abbreviating them.
          wordsData.show(false)

          // Build the raw term-frequency vectors (1000 hash buckets).
          val hashTF = new HashingTF()
            .setInputCol("words")
            .setOutputCol("rawsFeatures")
            .setNumFeatures(1000)
          val featuredData = hashTF.transform(wordsData)
          featuredData.show(false)

          // IDF is an estimator: fit it on the TF vectors first, then transform them.
          val idf = new IDF().setInputCol("rawsFeatures").setOutputCol("features")
          val idfModel = idf.fit(featuredData)
          val result = idfModel.transform(featuredData)
          result.show(false)
          result.select("label", "features").show(false)
        } finally {
          // Release the local Spark context even if a stage above throws.
          spark.stop()
        }
      }
    }

    结果(最后一条语句 result.select("label","features").show(false) 的输出):

    +-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |label|features                                                                                                                                                                  |
    +-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |0    |(1000,[105,107,181,330,333],[2.5541281188299534,0.5108256237659907,0.9162907318741551,1.0216512475319814,0.9162907318741551])                                             |
    |1    |(1000,[329,330,495,833,967],[1.5324768712979722,0.5108256237659907,0.9162907318741551,0.9162907318741551,0.9162907318741551])                                             |
    |2    |(1000,[83,107,237,329,388,779,977],[0.9162907318741551,0.5108256237659907,0.9162907318741551,0.5108256237659907,0.9162907318741551,0.9162907318741551,0.9162907318741551])|
    |3    |(1000,[105,111,168,281],[1.0216512475319814,0.9162907318741551,0.9162907318741551,0.9162907318741551])                                                                    |
    +-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


  • 相关阅读:
    Traefik使用
    kubernetes nfs-client-provisioner外部存储控制器
    基于腾讯云CLB实现K8S v1.10.1集群高可用+负载均衡
    k8s-rabbitmq-(一)集群部署
    TabSet 实现拖动后并保存配置
    C# MD5加密
    VSS “vc6.0vssum.dat may be corrupt”错误
    C#编程基础笔记
    android.view.WindowLeaked的解决办法
    【转】java线程系列---Runnable和Thread的区别
  • 原文地址:https://www.cnblogs.com/soyo/p/7725404.html
Copyright © 2011-2022 走看看