zoukankan      html  css  js  c++  java
  • Spark中文文本分析建模

    实用的朴素贝叶斯模型建模
    建模过程主要是把文本转化成向量然后再作分析
    数据格式:

    0,善良 美丽
    1,丑陋 阴险 卑鄙
    0,温和
    .......
    注:每行逗号前面是给文章贴的标签(数字类别),逗号后面是文章的分词结果,词与词之间用空格分隔。分词方法可以查看关于分词的文章,后面我也会写关于分词的文章。
    import org.apache.spark.SparkConf
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.SparkContext
    import org.apache.spark.ml.feature.Tokenizer
    import org.apache.spark.ml.feature.HashingTF
    import org.apache.spark.sql.Row
    import org.apache.spark.ml.linalg.Vector
    import org.apache.spark.ml.linalg.Vectors
    import org.apache.spark.ml.feature.LabeledPoint
    import org.apache.spark.ml.feature.IDF
    import org.apache.spark.ml.classification.NaiveBayes
    import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
    
    /** Empty companion class of the `CreatModel` object; it declares no members. */
    class CreatModel {
    
    }
    /**
     * Builds a Naive Bayes text classifier with Spark ML and prints its accuracy.
     *
     * The input is a text file with one document per line in the form
     * `label,word1 word2 ...`: a numeric class label, a comma, then the
     * already-segmented words separated by spaces.
     */
    object CreatModel {
      /** One input line: the label exactly as read from the file and the space-separated words. */
      case class RawDataRecord(category: String, text: String)

      /** Input path used when none is given on the command line. */
      private val DefaultInputPath = "D:\\decstop\\testFiles\\sougou"

      /**
       * Parses one `label,text` line.
       *
       * Only the first comma separates label from text, so commas inside the
       * text are preserved.
       *
       * @param line raw line from the input file
       * @return the parsed record, or `None` when the line has no comma or the
       *         label is not a number (such lines are skipped by `main`)
       */
      private def parseLine(line: String): Option[RawDataRecord] =
        line.split(",", 2) match {
          case Array(label, text) if scala.util.Try(label.toDouble).isSuccess =>
            Some(RawDataRecord(label, text))
          case _ => None
        }

      /**
       * Trains the model on a 70% split, evaluates it on the remaining 30%,
       * saves the model to `resoult` and prints the accuracy.
       *
       * @param args optional; `args(0)` overrides the input path
       */
      def main(args: Array[String]): Unit = {
        val inputPath = args.headOption.getOrElse(DefaultInputPath)
        val config = new SparkConf().setAppName("createModel").setMaster("local[4]")
        // The session owns the SparkContext; no separate context is needed.
        val spark = SparkSession.builder()
          .config(config)
          .config("spark.sql.warehouse.dir", "warehouse/dir")
          .getOrCreate()
        import spark.implicits._

        try {
          // Split the data into training and test sets.
          val Array(srcDF, testDF) = spark.sparkContext
            .textFile(inputPath)
            .flatMap(line => parseLine(line).toList)
            .toDF()
            .randomSplit(Array(0.7, 0.3))

          // Tokenize: the same transformer serves both splits.
          val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
          val wordsData = tokenizer.transform(srcDF)
          wordsData.show(false)
          val testWordsData = tokenizer.transform(testDF)

          // Term frequency: one hashing transformer so both splits share a feature space.
          val hashingTF =
            new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(100)
          val featurizedData = hashingTF.transform(wordsData)
          val testFeaturizedData = hashingTF.transform(testWordsData)

          // Inverse document frequency: fitted on the training split only and then
          // applied to the test split, so test features use the same weights the
          // model was trained with and no test statistics leak into evaluation.
          val idfModel =
            new IDF().setInputCol("rawFeatures").setOutputCol("features").fit(featurizedData)
          val rescaledData = idfModel.transform(featurizedData)
          val testRescaledData = idfModel.transform(testFeaturizedData)
          rescaledData.show(false)

          // Convert to the (label, features) layout expected by NaiveBayes.
          // The feature vector is passed through unchanged rather than densified.
          def toLabeledPoints(df: org.apache.spark.sql.DataFrame) =
            df.select($"category", $"features").map {
              case Row(label: String, features: Vector) =>
                LabeledPoint(label.toDouble, features)
            }

          val trainData = toLabeledPoints(rescaledData)
          val testData = toLabeledPoints(testRescaledData)

          val model = new NaiveBayes().fit(trainData)

          val predictions = model.transform(testData)
          println("predictln out:")
          predictions.show()
          model.write.overwrite().save("resoult")

          // Model evaluation.
          val evaluator = new MulticlassClassificationEvaluator()
            .setLabelCol("label")
            .setPredictionCol("prediction")
            .setMetricName("accuracy")
          val accuracy = evaluator.evaluate(predictions)
          println("accuracy out :")
          println("Accuracy:" + accuracy)
        } finally {
          // Release the local Spark resources even when a stage fails.
          spark.stop()
        }
      }
    }
    
    
  • 相关阅读:
    Tyvj 1729 文艺平衡树
    送花
    Tyvj 1728 普通平衡树
    [NOI2004]郁闷的出纳员
    [HNOI2004]宠物收养所
    [HNOI2002]营业额统计
    [NOIP2012] 借教室
    无聊的数列
    忠诚
    XOR的艺术
  • 原文地址:https://www.cnblogs.com/itboys/p/6860633.html
Copyright © 2011-2022 走看看