zoukankan      html  css  js  c++  java
  • spark pipeline 例子

    Pipeline Example.
    # $example on$
    from pyspark.ml import Pipeline
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.feature import HashingTF, Tokenizer
    # $example off$
    from pyspark.sql import SparkSession
    if __name__ == "__main__":
        # Create (or reuse) an actual session object. createDataFrame is an
        # instance method, so binding the bare SparkSession class is not enough.
        spark = SparkSession.builder.appName("PipelineExample").getOrCreate()
        # $example on$
        # Prepare training documents from a list of (id, text, label) tuples.
        training = spark.createDataFrame([
            (0, "a b c d e spark", 1.0),
            (1, "b d", 0.0),
            (2, "spark f g h", 1.0),
            (3, "hadoop mapreduce", 0.0)
        ], ["id", "text", "label"])
        # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        # Chain the stages by feeding the tokenizer's output column into hashingTF.
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=10, regParam=0.001)
        pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
        # Fit the pipeline to training documents.
        model = pipeline.fit(training)
        # Prepare test documents, which are unlabeled (id, text) tuples.
        test = spark.createDataFrame([
            (4, "spark i j k"),
            (5, "l m n"),
            (6, "spark hadoop spark"),
            (7, "apache hadoop")
        ], ["id", "text"])
        # Make predictions on test documents and print columns of interest.
        prediction = model.transform(test)
        selected = prediction.select("id", "text", "probability", "prediction")
        for row in selected.collect():
            # Use a distinct name for the per-row value so it does not shadow
            # the `prediction` DataFrame above.
            rid, text, prob, predicted_label = row
            print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), predicted_label))
        # $example off$
        # Release the session's resources once the example has finished.
        spark.stop()
    Decision Tree Classification Example.
    from __future__ import print_function
    # $example on$
    from pyspark.ml import Pipeline
    from pyspark.ml.classification import DecisionTreeClassifier
    from pyspark.ml.feature import StringIndexer, VectorIndexer
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    # $example off$
    from pyspark.sql import SparkSession
    if __name__ == "__main__":
        # Create (or reuse) an actual session object. `read` must be accessed on
        # a session instance, so binding the bare SparkSession class is not enough.
        spark = SparkSession.builder.appName("DecisionTreeClassificationExample").getOrCreate()
        # $example on$
        # Load the data stored in LIBSVM format as a DataFrame.
        data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
        # Index labels, adding metadata to the label column.
        # Fit on whole dataset to include all labels in index.
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
        # Automatically identify categorical features, and index them.
        # We specify maxCategories so features with > 4 distinct values are treated as continuous.
        # The call is wrapped in parentheses so the statement can span two lines.
        featureIndexer = VectorIndexer(
            inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)
        # Split the data into training and test sets (30% held out for testing)
        (trainingData, testData) = data.randomSplit([0.7, 0.3])
        # Train a DecisionTree model.
        dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
        # Chain indexers and tree in a Pipeline
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
        # Train model.  This also runs the indexers.
        model = pipeline.fit(trainingData)
        # Make predictions.
        predictions = model.transform(testData)
        # Select example rows to display.
        predictions.select("prediction", "indexedLabel", "features").show(5)
        # Select (prediction, true label) and compute test error
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g " % (1.0 - accuracy))
        # The fitted tree is the third pipeline stage (index 2, after the two indexers).
        treeModel = model.stages[2]
        # summary only
        print(treeModel)
        # $example off$
        # Release the session's resources once the example has finished.
        spark.stop()



    1.数据框:机器学习接口使用来自Spark SQL的数据框形式数据作为数据集,它可以处理多种数据类型。比如,一个数据框可以有不同的列存储文本、特征向量、标签值和预测值。








  • 相关阅读:
    前端开发 Vue -3axios
    前端开发 Vue -2npm
    前端开发 Vue -1windows环境搭建Vue Node开发环境
    前端开发 Vue -0前言
    java 框架-缓冲-Redis 2Jedis操作
    java 框架-缓冲-Redis 1概述
    微软银光 silverlight简介
  • 原文地址:https://www.cnblogs.com/bonelee/p/7810266.html
Copyright © 2011-2022 走看看