zoukankan      html  css  js  c++  java
  • Spark 机器学习------逻辑回归

    package Spark_MLlib
    import javassist.bytecode.SignatureAttribute.ArrayType
    
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.ml.{Pipeline, PipelineModel}
    import org.apache.spark.ml.classification.LogisticRegression
    import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
    import org.apache.spark.ml.linalg.Vector
    import org.apache.spark.sql.Row
    
    /**
      * Logistic-regression text classification example built on the Spark ML Pipeline API.
      *
      * The pipeline chains three stages: `Tokenizer` -> `HashingTF` -> `LogisticRegression`.
      * It is fitted on a tiny in-memory labelled data set and then applied to three
      * unlabelled sentences; the results are printed to standard output.
      *
      * API reference:
      * http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.package
      */
    object 逻辑回归 {

      /** Session used by the example: local master with two threads. */
      val spark: SparkSession =
        SparkSession.builder().master("local[2]").appName("逻辑回归").getOrCreate()

      /**
        * Trains the pipeline, prints the test set, the output schema, the predictions and
        * the intermediate feature vectors, then stops the Spark session.
        *
        * @param args command-line arguments (unused)
        */
      def main(args: Array[String]): Unit = {
        // try/finally guarantees the session is stopped even when training or scoring fails.
        try {
          // Labelled training data: (id, text, label).
          val training = spark
            .createDataFrame(Seq(
              (0, "soyo spark soyo1", 1.0),
              (1, "hadoop spark", 1.0),
              (2, "zhouhang xiaohai", 0.0),
              (3, "hbase spark hive soyo", 1.0)
            ))
            .toDF("id", "text", "label")

          // Transformers: split the text into words, then hash the words into a
          // 1000-dimensional term-frequency vector.
          val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
          val hashingTF = new HashingTF()
            .setNumFeatures(1000)
            .setInputCol(tokenizer.getOutputCol)
            .setOutputCol("features")

          // Estimator: logistic regression.
          val lr = new LogisticRegression()
            .setMaxIter(10)     // maximum number of iterations
            .setRegParam(0.01)  // regularization parameter

          val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))

          // Fitted pipeline model.
          val model = pipeline.fit(training)

          // Unlabelled test data: (id, text).
          val test = spark
            .createDataFrame(Seq(
              (4, "spark i like"),
              (5, "hadoop spark book"),
              (6, "soyo9 soy 88")
            ))
            .toDF("id", "text")
          test.show()

          // Build the scored DataFrame once and reuse it for every report below.
          val predictions = model.transform(test)

          predictions.schema.foreach(println)

          predictions
            .select("id", "text", "probability", "prediction")
            .collect()
            .foreach {
              case Row(id: Int, text: String, prob: Vector, prediction: Double) =>
                println(s"($id,$text)----->prob=$prob,prediction=$prediction")
            }

          // Intermediate columns produced by the pipeline stages.
          predictions
            .select("id", "text", "features", "rawPrediction")
            .collect()
            .foreach {
              case Row(id: Int, text: String, features: Vector, rawPrediction: Vector) =>
                println(s"id=$id,text=$text,features=$features,rawPrediction=$rawPrediction")
            }
        } finally {
          spark.stop()
        }
      }
    }

    结果:

    +---+-----------------+
    | id|             text|
    +---+-----------------+
    |  4|     spark i like|
    |  5|hadoop spark book|
    |  6|     soyo9 soy 88|
    +---+-----------------+

    StructField(id,IntegerType,false)
    StructField(text,StringType,true)
    StructField(words,ArrayType(StringType,true),true)
    StructField(features,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)
    StructField(rawPrediction,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)
    StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)
    StructField(prediction,DoubleType,true)
    (4,spark i like)----->prob=[0.033501882964501836,0.9664981170354981],prediction=1.0                                预测概率(prob 依次为标签 0、标签 1 的概率)
    (5,hadoop spark book)----->prob=[0.011175823696937707,0.9888241763030623],prediction=1.0                    预测概率
    (6,soyo9 soy 88)----->prob=[0.26222944363302514,0.7377705563669748],prediction=1.0                              预测概率(误判了),但标签 1 的概率较低
    id=4,text=spark i like,features=(1000,[105,329,330],[1.0,1.0,1.0]),rawPrediction=[-3.3620777052692805,3.3620777052692805]
    id=5,text=hadoop spark book,features=(1000,[105,181,393],[1.0,1.0,1.0]),rawPrediction=[-4.482763689867715,4.482763689867715]
    id=6,text=soyo9 soy 88,features=(1000,[543,602,976],[1.0,1.0,1.0]),rawPrediction=[-1.0344130174468225,1.0344130174468225]

  • 相关阅读:
    NODE_PATH的疑难杂症(转)
    教你如何做一个优雅的Ecmascripter /转
    MDLMaterial Design Lite框架推荐
    GPU硬件加速原理 /转
    透明遮罩图层VS高斯模糊滤镜 效果分析
    QML vs WEB
    PixelMatorPro快捷键大全(osx)
    睡眠排序、面条排序、猴子排序...........................
    全栈设计模式套餐MVVM, RESTful, MVC的历史探索
    2018博客之星评选,我非常需要您宝贵的一票!♪(・ω・)ノ
  • 原文地址:https://www.cnblogs.com/soyo/p/7723007.html
Copyright © 2011-2022 走看看