zoukankan      html  css  js  c++  java
  • Spark中文文本分析建模

    实用的朴素贝叶斯模型建模
    建模过程主要是把文本转化成向量然后再作分析
    数据格式:

    0,善良 美丽
    1,丑陋 阴险 卑鄙
    0,温和
    .......
    注:每行逗号前面是给文章贴的类别标签(数字),逗号后面是文章分词后的结果(词与词之间用空格分隔)。分词的方法可以参考关于分词的文章,后面我也会写关于分词的文章。
    import org.apache.spark.SparkConf
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.SparkContext
    import org.apache.spark.ml.feature.Tokenizer
    import org.apache.spark.ml.feature.HashingTF
    import org.apache.spark.sql.Row
    import org.apache.spark.ml.linalg.Vector
    import org.apache.spark.ml.linalg.Vectors
    import org.apache.spark.ml.feature.LabeledPoint
    import org.apache.spark.ml.feature.IDF
    import org.apache.spark.ml.classification.NaiveBayes
    import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
    
    /** Empty companion class of the `CreatModel` object; it declares no members. */
    class CreatModel
    /**
     * Trains and evaluates a Naive Bayes text classifier on pre-segmented text.
     *
     * Each input line has the form `label,word word word ...`: a numeric class label,
     * a comma, then the already-tokenized document with words separated by spaces.
     * The pipeline is Tokenizer -> HashingTF -> IDF -> NaiveBayes.
     */
    object CreatModel {
      /** One input line: `category` is the text before the first comma, `text` is the rest. */
      case class RawDataRecord(category: String, text: String)

      /**
       * Input location used when no path is given on the command line.
       * Backslashes are escaped: the unescaped form contains invalid escape sequences
       * (`\d`, `\s`) and an unintended tab (`\t`).
       */
      private val DefaultInputPath = "D:\\decstop\\testFiles\\sougou"

      /**
       * Program entry point.
       *
       * @param args optional; `args(0)` overrides the input path (defaults to [[DefaultInputPath]])
       */
      def main(args: Array[String]): Unit = {
        val config = new SparkConf().setAppName("createModel").setMaster("local[4]")
        // The session owns its SparkContext; no separate `new SparkContext` is needed.
        val spark = SparkSession.builder()
          .config(config)
          .config("spark.sql.warehouse.dir", "warehouse/dir")
          .getOrCreate()
        import spark.implicits._

        val inputPath = args.headOption.getOrElse(DefaultInputPath)

        // Converts (category, features) rows into the label/features layout NaiveBayes expects.
        // The feature vector is passed through as-is instead of being copied to a dense vector.
        def toLabeledPoints(df: org.apache.spark.sql.DataFrame) =
          df.select($"category", $"features").map {
            case Row(label: String, features: Vector) =>
              LabeledPoint(label.toDouble, features)
          }

        try {
          // Parse the input and split it 70/30 into training and test sets.
          // Splitting on the first comma only keeps commas inside the text intact;
          // lines without a comma (e.g. blank lines) are skipped instead of crashing the job.
          val Array(srcDF, testDF) = spark.sparkContext.textFile(inputPath).flatMap { line =>
            line.split(",", 2) match {
              case Array(label, text) => Some(RawDataRecord(label, text))
              case _ => None
            }
          }.toDF().randomSplit(Array(0.7, 0.3))

          // Tokenize. The same stateless transformer is applied to both splits.
          val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
          val wordsData = tokenizer.transform(srcDF)
          wordsData.show(false)
          val testwordsData = tokenizer.transform(testDF)

          // Term frequencies via feature hashing, again shared by both splits.
          val hashingTF =
            new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(100)
          val featurizedData = hashingTF.transform(wordsData)
          val testfeaturizedData = hashingTF.transform(testwordsData)

          // Inverse document frequency. The IDF model is fitted on the training split ONLY and
          // then applied to the test split, so both are weighted with the same statistics and
          // nothing from the test set leaks into the features.
          val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
          val idfModel = idf.fit(featurizedData)
          val rescaledData = idfModel.transform(featurizedData)
          val testrescaledData = idfModel.transform(testfeaturizedData)
          rescaledData.show(false)

          // Convert to the Naive Bayes input format.
          val trainDataRdd = toLabeledPoints(rescaledData)
          val testtrainDataRdd = toLabeledPoints(testrescaledData)

          val model = new NaiveBayes().fit(trainDataRdd)

          val predictions = model.transform(testtrainDataRdd)
          println("predictln out:")
          predictions.show()
          model.write.overwrite().save("resoult")

          // Model evaluation on the held-out split.
          val evaluator = new MulticlassClassificationEvaluator()
            .setLabelCol("label")
            .setPredictionCol("prediction")
            .setMetricName("accuracy")
          val accuracy = evaluator.evaluate(predictions)
          println("accuracy out :")
          println(s"Accuracy:$accuracy")
        } finally {
          // Stop the session (and its context) even when a stage fails.
          spark.stop()
        }
      }
    }
    
    
  • 相关阅读:
    Study Plan The Twelfth Day
    Study Plan The Fifteenth Day
    Study Plan The Seventeenth Day
    Study Plan The Tenth Day
    Study Plan The Eighth Day
    Study Plan The Eleventh Day
    Study Plan The Sixteenth Day
    Study Plan The Thirteenth Day
    Study Plan The Fourteenth Day
    Study Plan The Ninth Day
  • 原文地址:https://www.cnblogs.com/itboys/p/6860633.html
Copyright © 2011-2022 走看看