  • Spark MLlib classification and regression study notes

    Binary classification: SVMs, logistic regression, decision trees, random forests, gradient-boosted trees, naive Bayes

    Multiclass classification: logistic regression, decision trees, random forests, naive Bayes

    Regression: linear least squares, decision trees, random forests, gradient-boosted trees, isotonic regression

    1. Linear models

      classification (SVMs, logistic regression)
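
    All of MLlib's linear methods minimize a regularized objective of the same shape; this summary follows the MLlib documentation rather than the original post. With training examples $(x_i, y_i)$:

    $$ f(w) = \lambda\, R(w) + \frac{1}{n} \sum_{i=1}^{n} L(w;\, x_i, y_i) $$

    SVMs use the hinge loss $L(w; x, y) = \max\{0,\; 1 - y\, w^{\top} x\}$ and logistic regression the logistic loss $L(w; x, y) = \log\bigl(1 + \exp(-y\, w^{\top} x)\bigr)$, with labels encoded as $y \in \{-1, +1\}$. The regularizer $R(w)$ is the L2 norm by default; supplying an L1Updater, as in the SVM example below, switches it to L1.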

       

    package ML.ClassificationAndRegression;

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.mllib.classification.SVMModel;
    import org.apache.spark.mllib.classification.SVMWithSGD;
    import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics;
    import org.apache.spark.mllib.optimization.L1Updater;
    import org.apache.spark.mllib.regression.LabeledPoint;
    import org.apache.spark.mllib.util.MLUtils;
    import scala.Tuple2;

    /**
     * Linear SVM binary classification with SGD.
     *
     * @ClassName: SVMClassifier
     * @author: DingH
     * @since: 2019/4/9 10:28
     */
    public class SVMClassifier {
        public static void main(String[] args) {
            SparkConf conf = new SparkConf().setAppName("SVM Classifier Example").setMaster("local");
            JavaSparkContext jsc = new JavaSparkContext(conf);
            String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
            JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();

            // Split the initial RDD into two [60% training data, 40% testing data].
            JavaRDD<LabeledPoint> train = data.sample(false, 0.6, 11L);
            train.cache();
            final JavaRDD<LabeledPoint> test = data.subtract(train);

            // Run the training algorithm to build the model, here with L1 regularization.
            SVMWithSGD svm = new SVMWithSGD();
            svm.optimizer().setNumIterations(200).setRegParam(0.01).setUpdater(new L1Updater());
            final SVMModel model1 = svm.run(train.rdd());
            // Simpler alternative with the default L2 updater:
            // final SVMModel model1 = SVMWithSGD.train(train.rdd(), 100);

            // Clear the default threshold so predict() returns raw scores, as needed for ROC analysis.
            model1.clearThreshold();

            JavaRDD<Tuple2<Object, Object>> scoreAndLabels = test.map(new Function<LabeledPoint, Tuple2<Object, Object>>() {
                public Tuple2<Object, Object> call(LabeledPoint p) throws Exception {
                    double score = model1.predict(p.features());
                    return new Tuple2<Object, Object>(score, p.label());
                }
            });

            BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(scoreAndLabels.rdd());
            double areaUnderROC = metrics.areaUnderROC();
            System.out.println("Area under ROC = " + areaUnderROC);

            // Save and load the model (save fails if the target directory already exists).
            model1.save(jsc.sc(), "D:\\IdeaProjects\\SimpleApp\\src\\main\\java\\MLModel");
            SVMModel model = SVMModel.load(jsc.sc(), "D:\\IdeaProjects\\SimpleApp\\src\\main\\java\\MLModel");
        }
    }
    package ML.ClassificationAndRegression;

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.mllib.classification.LogisticRegressionModel;
    import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS;
    import org.apache.spark.mllib.evaluation.MulticlassMetrics;
    import org.apache.spark.mllib.regression.LabeledPoint;
    import org.apache.spark.mllib.util.MLUtils;
    import scala.Tuple2;

    /**
     * Multiclass logistic regression with L-BFGS.
     *
     * @ClassName: LogisticRegression
     * @author: DingH
     * @since: 2019/4/9 11:08
     */
    public class LogisticRegression {
        public static void main(String[] args) {
            SparkConf conf = new SparkConf().setMaster("local").setAppName("LogisticRegression");
            JavaSparkContext jsc = new JavaSparkContext(conf);
            String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
            JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();

            JavaRDD<LabeledPoint>[] split = data.randomSplit(new double[]{0.6, 0.4}, 11L);
            JavaRDD<LabeledPoint> training = split[0].cache();
            final JavaRDD<LabeledPoint> test = split[1];

            // LogisticRegressionWithLBFGS supports multinomial models via setNumClasses.
            final LogisticRegressionModel model = new LogisticRegressionWithLBFGS().setNumClasses(10).run(training.rdd());
            JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(new Function<LabeledPoint, Tuple2<Object, Object>>() {
                public Tuple2<Object, Object> call(LabeledPoint labeledPoint) throws Exception {
                    double predict = model.predict(labeledPoint.features());
                    return new Tuple2<Object, Object>(predict, labeledPoint.label());
                }
            });

            MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
            double precision = metrics.precision(); // overall accuracy across all classes
            System.out.println("Precision = " + precision);

            // Save and load model
            // model.save(jsc.sc(), "myModelPath");
            // LogisticRegressionModel sameModel = LogisticRegressionModel.load(jsc.sc(), "myModelPath");
        }
    }

      linear regression (least squares, Lasso, ridge)
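
    These methods share the squared loss $L(w; x, y) = \tfrac{1}{2}\,(w^{\top} x - y)^2$ and differ only in the regularizer: none for plain least squares, $\lambda \lVert w \rVert_1$ for Lasso, and $\tfrac{\lambda}{2} \lVert w \rVert_2^2$ for ridge (per the MLlib docs; mllib also provides LassoWithSGD and RidgeRegressionWithSGD, though only plain least squares is shown below).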

     

    package ML.ClassificationAndRegression;

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaDoubleRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.mllib.linalg.Vectors;
    import org.apache.spark.mllib.regression.LabeledPoint;
    import org.apache.spark.mllib.regression.LinearRegressionModel;
    import org.apache.spark.mllib.regression.LinearRegressionWithSGD;
    import scala.Tuple2;

    /**
     * Linear least squares regression with SGD.
     *
     * @ClassName: Regression
     * @author: DingH
     * @since: 2019/4/9 11:21
     */
    public class Regression {
        public static void main(String[] args) {
            SparkConf conf = new SparkConf().setAppName("Regression").setMaster("local");
            JavaSparkContext jsc = new JavaSparkContext(conf);
            String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\ridge-data\\lpsa.data";
            JavaRDD<String> data = jsc.textFile(path);

            // Each line is "label,f1 f2 f3 ...".
            JavaRDD<LabeledPoint> parsedData = data.map(new Function<String, LabeledPoint>() {
                public LabeledPoint call(String line) throws Exception {
                    String[] split = line.split(",");
                    String[] features = split[1].split(" ");
                    double[] v = new double[features.length];
                    for (int i = 0; i < features.length; i++) { // parse every feature, not length - 1
                        v[i] = Double.parseDouble(features[i]);
                    }
                    return new LabeledPoint(Double.parseDouble(split[0]), Vectors.dense(v));
                }
            }).cache();

            final LinearRegressionModel model = LinearRegressionWithSGD.train(parsedData.rdd(), 100);

            JavaRDD<Tuple2<Double, Double>> valuesAndLabels = parsedData.map(new Function<LabeledPoint, Tuple2<Double, Double>>() {
                public Tuple2<Double, Double> call(LabeledPoint labeledPoint) throws Exception {
                    double predict = model.predict(labeledPoint.features());
                    return new Tuple2<Double, Double>(predict, labeledPoint.label());
                }
            });

            // Mean squared error over the training set.
            Double MSE = new JavaDoubleRDD(valuesAndLabels.map(
                    new Function<Tuple2<Double, Double>, Object>() {
                        public Object call(Tuple2<Double, Double> dat) throws Exception {
                            return Math.pow(dat._1 - dat._2, 2.0);
                        }
                    }
            ).rdd()).mean();
            System.out.println("training Mean Squared Error = " + MSE);

            // Save and load model
            // model.save(jsc.sc(), "myModelPath");
            // LinearRegressionModel sameModel = LinearRegressionModel.load(jsc.sc(), "myModelPath");
        }
    }

    2. Decision Trees

     

    Problem specification parameters: algo, numClasses, categoricalFeaturesInfo

    Stopping criteria: maxDepth, minInfoGain, minInstancesPerNode

    Tunable parameters: maxBins, impurity

    package ML.DT;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.api.java.function.PairFunction;
    import org.apache.spark.mllib.regression.LabeledPoint;
    import org.apache.spark.mllib.tree.DecisionTree;
    import org.apache.spark.mllib.tree.model.DecisionTreeModel;
    import org.apache.spark.mllib.util.MLUtils;
    import scala.Tuple2;
    
    import java.util.HashMap;
    
    /**
     * Decision tree binary classification.
     *
     * @ClassName: classification
     * @author: DingH
     * @since: 2019/4/9 16:11
     */
    public class classification {
        public static void main(String[] args) {
            SparkConf conf = new SparkConf().setAppName("DTclassification").setMaster("local");
            JavaSparkContext jsc = new JavaSparkContext(conf);
    
            String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
            JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
    
            JavaRDD<LabeledPoint>[] split = data.randomSplit(new double[]{0.7, 0.3}, 11L);
            JavaRDD<LabeledPoint> trainingData = split[0];
            JavaRDD<LabeledPoint> test = split[1];
    
            int numClasses = 2;
            // Empty categoricalFeaturesInfo indicates all features are continuous.
            HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
            String impurity = "gini";
            int maxDepth = 1; // a depth-1 stump; use a larger depth (e.g. 5) in practice
            int maxBins = 32;
    
            final DecisionTreeModel model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins);
            JavaPairRDD<Double, Double> predictionAndLabel = test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
                public Tuple2<Double, Double> call(LabeledPoint labeledPoint) throws Exception {
                    return new Tuple2<Double, Double>(model.predict(labeledPoint.features()), labeledPoint.label());
                }
            });
    
            // Fraction of misclassified test instances.
            double testErr = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
                public Boolean call(Tuple2<Double, Double> pl) throws Exception {
                    return !pl._1().equals(pl._2());
                }
            }).count() * 1.0 / test.count();
    
            System.out.println("Test Error: " + testErr);
            System.out.println("Learned classification tree model:\n" + model.toDebugString());
    
            // Save and load model
            // model.save(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");
            // DecisionTreeModel sameModel = DecisionTreeModel.load(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");
        }
    }
    package ML.DT;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.api.java.function.Function2;
    import org.apache.spark.api.java.function.PairFunction;
    import org.apache.spark.mllib.regression.LabeledPoint;
    import org.apache.spark.mllib.tree.DecisionTree;
    import org.apache.spark.mllib.tree.model.DecisionTreeModel;
    import org.apache.spark.mllib.util.MLUtils;
    import scala.Tuple2;
    
    import java.util.HashMap;
    import java.util.Map;
    
    /**
     * Decision tree regression.
     *
     * @ClassName: Regression
     * @author: DingH
     * @since: 2019/4/9 16:33
     */
    public class Regression {
        public static void main(String[] args) {
            SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTreeRegressionExample").setMaster("local");
            JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    
            // Load and parse the data file.
            String datapath = "D:\IdeaProjects\SimpleApp\src\main\resources\data\mllib\sample_libsvm_data.txt";
            JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
            // Split the data into training and test sets (30% held out for testing)
            JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
            JavaRDD<LabeledPoint> trainingData = splits[0];
            JavaRDD<LabeledPoint> testData = splits[1];
    
            // Set parameters.
            // Empty categoricalFeaturesInfo indicates all features are continuous.
            Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
            String impurity = "variance";
            Integer maxDepth = 5;
            Integer maxBins = 32;
    
            // Train a DecisionTree model.
            final DecisionTreeModel model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo, impurity, maxDepth, maxBins);
    
            // Evaluate model on test instances and compute test error
            JavaPairRDD<Double, Double> predictionAndLabel = testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
                public Tuple2<Double, Double> call(LabeledPoint p) {
                    return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
                }
            });
    
            Double testMSE = predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
                public Double call(Tuple2<Double, Double> pl) {
                    Double diff = pl._1() - pl._2();
                    return diff * diff;
                }
            }).reduce(new Function2<Double, Double, Double>() {
                public Double call(Double a, Double b) {
                    return a + b;
                }
        }) / testData.count(); // average over the test set, not the full data set
    
            System.out.println("Test Mean Squared Error: " + testMSE);
            System.out.println("Learned regression tree model:
    " + model.toDebugString());
    
            // Save and load model
    //        model.save(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel");
    //        DecisionTreeModel sameModel = DecisionTreeModel.load(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel");
        }
    }
    Regression

    3. Random Forests

      Two sources of randomness: each tree is trained on a bootstrap sample of the data, and a random subset of features is considered at each split.

      featureSubsetStrategy - number of features to consider for splits at each node. Supported: "auto", "all", "sqrt", "log2", "onethird". If "auto" is set, this parameter is chosen based on numTrees: if numTrees == 1, "all" is used; if numTrees > 1 (a forest), "sqrt" is used for classification and "onethird" for regression.

    package ML.RF;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.api.java.function.PairFunction;
    import org.apache.spark.mllib.regression.LabeledPoint;
    import org.apache.spark.mllib.tree.RandomForest;
    import org.apache.spark.mllib.tree.model.RandomForestModel;
    import org.apache.spark.mllib.util.MLUtils;
    import scala.Tuple2;
    
    import java.util.HashMap;
    
    /**
     * TODO
     *
     * @ClassName: classification
     * @author: DingH
     * @since: 2019/4/9 16:58
     */
    public class classification {
        public static void main(String[] args) {
            SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("JavaRandomForestClassificationExample");
            JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    
            // Load and parse the data file.
            String datapath = "D:\IdeaProjects\SimpleApp\src\main\resources\data\mllib\sample_libsvm_data.txt";
            JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
    
            // Split the data into training and test sets (30% held out for testing)
            JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
            JavaRDD<LabeledPoint> trainingData = splits[0];
            JavaRDD<LabeledPoint> testData = splits[1];
    
            // Train a RandomForest model.
            // Empty categoricalFeaturesInfo indicates all features are continuous.
            Integer numClasses = 2;
            HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
            Integer numTrees = 3; // Use more in practice.
            String featureSubsetStrategy = "auto"; // Let the algorithm choose.
            String impurity = "gini";
            Integer maxDepth = 5;
            Integer maxBins = 32;
            Integer seed = 12345;
    
            final RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses,categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed);
    
            // Evaluate model on test instances and compute test error
            JavaPairRDD<Double, Double> predictionAndLabel = testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
                public Tuple2<Double, Double> call(LabeledPoint p) {
                  return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
                }
              });
    
            Double testErr =
              1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
                public Boolean call(Tuple2<Double, Double> pl) {
                  return !pl._1().equals(pl._2());
                }
              }).count() / testData.count();
    
            System.out.println("Test Error: " + testErr);
            System.out.println("Learned classification forest model:
    " + model.toDebugString());
    
            // Save and load model
    //        model.save(jsc.sc(), "target/tmp/myRandomForestClassificationModel");
    //        RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),"target/tmp/myRandomForestClassificationModel");
        }
    
    }
    classification
    package ML.RF;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.api.java.function.Function2;
    import org.apache.spark.api.java.function.PairFunction;
    import org.apache.spark.mllib.regression.LabeledPoint;
    import org.apache.spark.mllib.tree.RandomForest;
    import org.apache.spark.mllib.tree.model.RandomForestModel;
    import org.apache.spark.mllib.util.MLUtils;
    import scala.Tuple2;
    
    import java.util.HashMap;
    import java.util.Map;
    
    /**
     * Random forest regression.
     *
     * @ClassName: regression
     * @author: DingH
     * @since: 2019/4/9 17:50
     */
    public class regression {
        public static void main(String[] args) {
            SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("JavaRandomForestRegressionExample");
            JavaSparkContext jsc = new JavaSparkContext(sparkConf);
            // Load and parse the data file.
            String datapath = "D:\IdeaProjects\SimpleApp\src\main\resources\data\mllib\sample_libsvm_data.txt";
            JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
    
            // Split the data into training and test sets (30% held out for testing)
            JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
            JavaRDD<LabeledPoint> trainingData = splits[0];
            JavaRDD<LabeledPoint> testData = splits[1];
    
            // Set parameters.
            // Empty categoricalFeaturesInfo indicates all features are continuous.
            Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
            Integer numTrees = 3; // Use more in practice.
            String featureSubsetStrategy = "auto"; // Let the algorithm choose.
            String impurity = "variance";
            Integer maxDepth = 4;
            Integer maxBins = 32;
            Integer seed = 12345;
            // Train a RandomForest model.
            final RandomForestModel model = RandomForest.trainRegressor(trainingData,categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed);
    
            // Evaluate model on test instances and compute test error
            JavaPairRDD<Double, Double> predictionAndLabel = testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
                public Tuple2<Double, Double> call(LabeledPoint p) {
                  return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
                }
              });
            Double testMSE = predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
                public Double call(Tuple2<Double, Double> pl) {
                  Double diff = pl._1() - pl._2();
                  return diff * diff;
                }
              }).reduce(new Function2<Double, Double, Double>() {
                public Double call(Double a, Double b) {
                  return a + b;
                }
              }) / testData.count();
    
            System.out.println("Test Mean Squared Error: " + testMSE);
            System.out.println("Learned regression forest model:
    " + model.toDebugString());
    
            // Save and load model
            model.save(jsc.sc(), "target/tmp/myRandomForestRegressionModel");
            RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
              "target/tmp/myRandomForestRegressionModel");
        }
    }
    regression

    4. Gradient-Boosted Trees

      

    Usage tips: loss, numIterations, learningRate, algo.

    BoostingStrategy.validationTol sets the convergence tolerance for GradientBoostedTrees.runWithValidation: boosting stops once the improvement in validation error falls below it, as in the sketch below.
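
    A minimal sketch of early stopping, not from the original post. It assumes Spark 1.x MLlib, where BoostingStrategy exposes a setValidationTol bean setter and GradientBoostedTrees has a runWithValidation overload for JavaRDDs; the class name earlyStopping is hypothetical, and the data path is assumed to match the rest of the post:

    package ML.GradientBoostedTrees;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.mllib.regression.LabeledPoint;
    import org.apache.spark.mllib.tree.GradientBoostedTrees;
    import org.apache.spark.mllib.tree.configuration.BoostingStrategy;
    import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel;
    import org.apache.spark.mllib.util.MLUtils;
    
    public class earlyStopping {
        public static void main(String[] args) {
            SparkConf conf = new SparkConf().setMaster("local").setAppName("GBTEarlyStopping");
            JavaSparkContext jsc = new JavaSparkContext(conf);
    
            String datapath = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
            JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
            JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
            JavaRDD<LabeledPoint> trainingData = splits[0];
            JavaRDD<LabeledPoint> validationData = splits[1];
    
            BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Classification");
            boostingStrategy.setNumIterations(100);   // upper bound on the number of boosting rounds
            boostingStrategy.setValidationTol(0.001); // stop once validation error improves by less than this
    
            // Each boosting round is scored on validationData; training stops early
            // when the improvement drops below validationTol.
            final GradientBoostedTreesModel model =
                    new GradientBoostedTrees(boostingStrategy).runWithValidation(trainingData, validationData);
    
            System.out.println("Boosting rounds actually used: " + model.trees().length);
        }
    }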

    package ML.GradientBoostedTrees;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.api.java.function.PairFunction;
    import org.apache.spark.mllib.regression.LabeledPoint;
    import org.apache.spark.mllib.tree.GradientBoostedTrees;
    import org.apache.spark.mllib.tree.configuration.BoostingStrategy;
    import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel;
    import org.apache.spark.mllib.util.MLUtils;
    import scala.Tuple2;
    
    import java.util.HashMap;
    import java.util.Map;
    
    /**
     * Gradient-boosted trees classification.
     *
     * @ClassName: classification
     * @author: DingH
     * @since: 2019/4/9 17:56
     */
    public class classification {
        public static void main(String[] args) {
            SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("JavaGradientBoostedTreesClassificationExample");
            JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    
            // Load and parse the data file.
            String datapath = "D:\IdeaProjects\SimpleApp\src\main\resources\data\mllib\sample_libsvm_data.txt";
            JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
    
            // Split the data into training and test sets (30% held out for testing)
            JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
            JavaRDD<LabeledPoint> trainingData = splits[0];
            JavaRDD<LabeledPoint> testData = splits[1];
    
            // Train a GradientBoostedTrees model.
            // The defaultParams for Classification use LogLoss by default.
            BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Classification");
            boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice.
            boostingStrategy.getTreeStrategy().setNumClasses(2);
            boostingStrategy.getTreeStrategy().setMaxDepth(5);
            // Empty categoricalFeaturesInfo indicates all features are continuous.
            Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
            boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo);
    
            final GradientBoostedTreesModel model =
              GradientBoostedTrees.train(trainingData, boostingStrategy);
    
            // Evaluate model on test instances and compute test error
            JavaPairRDD<Double, Double> predictionAndLabel =
              testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
                public Tuple2<Double, Double> call(LabeledPoint p) {
                  return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
                }
              });
            Double testErr =
              1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
                public Boolean call(Tuple2<Double, Double> pl) {
                  return !pl._1().equals(pl._2());
                }
              }).count() / testData.count();
            System.out.println("Test Error: " + testErr);
            System.out.println("Learned classification GBT model:
    " + model.toDebugString());
    
            // Save and load model
    //        model.save(jsc.sc(), "target/tmp/myGradientBoostingClassificationModel");
    //        GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(jsc.sc(),
    //          "target/tmp/myGradientBoostingClassificationModel");
        }
    }
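    The collapsed "regression" example was lost in the mirrored page. The following is a minimal sketch modeled on Spark's standard GBT regression example; the data path and package name are assumed to match the rest of the post:

    package ML.GradientBoostedTrees;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.api.java.function.Function2;
    import org.apache.spark.api.java.function.PairFunction;
    import org.apache.spark.mllib.regression.LabeledPoint;
    import org.apache.spark.mllib.tree.GradientBoostedTrees;
    import org.apache.spark.mllib.tree.configuration.BoostingStrategy;
    import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel;
    import org.apache.spark.mllib.util.MLUtils;
    import scala.Tuple2;
    
    import java.util.HashMap;
    import java.util.Map;
    
    public class regression {
        public static void main(String[] args) {
            SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("JavaGradientBoostedTreesRegressionExample");
            JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    
            String datapath = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
            JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
            JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
            JavaRDD<LabeledPoint> trainingData = splits[0];
            JavaRDD<LabeledPoint> testData = splits[1];
    
            // The defaultParams for Regression use SquaredError by default.
            BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Regression");
            boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice.
            boostingStrategy.getTreeStrategy().setMaxDepth(5);
            // Empty categoricalFeaturesInfo indicates all features are continuous.
            Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
            boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo);
    
            final GradientBoostedTreesModel model = GradientBoostedTrees.train(trainingData, boostingStrategy);
    
            // Evaluate the model on test instances and compute the test MSE.
            JavaPairRDD<Double, Double> predictionAndLabel = testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
                public Tuple2<Double, Double> call(LabeledPoint p) {
                    return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
                }
            });
            Double testMSE = predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
                public Double call(Tuple2<Double, Double> pl) {
                    Double diff = pl._1() - pl._2();
                    return diff * diff;
                }
            }).reduce(new Function2<Double, Double, Double>() {
                public Double call(Double a, Double b) {
                    return a + b;
                }
            }) / testData.count();
    
            System.out.println("Test Mean Squared Error: " + testMSE);
            System.out.println("Learned regression GBT model:\n" + model.toDebugString());
        }
    }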

    5. Naive Bayes

      Model type: "multinomial" (the default) or "bernoulli". The Bernoulli variant requires binary (0/1) feature values; see the note after the example below.

      

    package ML.ClassificationAndRegression.NaiveBayes;

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.mllib.classification.NaiveBayes;
    import org.apache.spark.mllib.classification.NaiveBayesModel;
    import org.apache.spark.mllib.regression.LabeledPoint;
    import org.apache.spark.mllib.util.MLUtils;
    import scala.Tuple2;

    /**
     * Multinomial naive Bayes classification.
     *
     * @ClassName: example
     * @author: DingH
     * @since: 2019/4/10 10:04
     */
    public class example {
        public static void main(String[] args) {
            SparkConf conf = new SparkConf().setMaster("local").setAppName("naiveBayesExample");
            JavaSparkContext jsc = new JavaSparkContext(conf);

            String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
            JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();

            JavaRDD<LabeledPoint>[] split = data.randomSplit(new double[]{0.6, 0.4}, 12345L);
            JavaRDD<LabeledPoint> train = split[0];
            JavaRDD<LabeledPoint> test = split[1];

            // Train with additive (Laplace) smoothing lambda = 1.0.
            final NaiveBayesModel model = NaiveBayes.train(train.rdd(), 1.0, "multinomial");

            JavaRDD<Tuple2<Double, Double>> predictionAndLabel = test.map(new Function<LabeledPoint, Tuple2<Double, Double>>() {
                public Tuple2<Double, Double> call(LabeledPoint labeledPoint) throws Exception {
                    double predict = model.predict(labeledPoint.features());
                    return new Tuple2<Double, Double>(predict, labeledPoint.label());
                }
            });

            double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
                public Boolean call(Tuple2<Double, Double> pl) throws Exception {
                    return pl._1().equals(pl._2());
                }
            }).count() / (double) test.count();

            System.out.println("accuracy is : " + accuracy);

            // Save and load model
            // model.save(jsc.sc(), "myModelPath");
            // NaiveBayesModel sameModel = NaiveBayesModel.load(jsc.sc(), "myModelPath");
        }
    }
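    For the Bernoulli variant only the model-type string changes, but every feature must be encoded as 0 or 1 (MLlib rejects other values). A hypothetical one-line variant of the training call above, assuming the same `train` RDD re-encoded with binary features:

    // Bernoulli naive Bayes; requires 0/1 feature values (hypothetical variant of the example above).
    final NaiveBayesModel bernoulliModel = NaiveBayes.train(train.rdd(), 1.0, "bernoulli");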

    6. Isotonic regression

      Isotonic regression fits a monotone piecewise-linear function to the data: non-decreasing with setIsotonic(true), non-increasing with setIsotonic(false). MLlib implements it with the pool-adjacent-violators algorithm, and predict() interpolates linearly between the fitted boundary points.

    package ML.ClassificationAndRegression.IsotonicRegression;

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaDoubleRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.mllib.regression.IsotonicRegression;
    import org.apache.spark.mllib.regression.IsotonicRegressionModel;
    import scala.Tuple2;
    import scala.Tuple3;

    /**
     * Isotonic regression.
     *
     * @ClassName: example
     * @author: DingH
     * @since: 2019/4/10 10:31
     */
    public class example {
        public static void main(String[] args) {
            SparkConf conf = new SparkConf().setMaster("local").setAppName("isotonicRegressionExample");
            JavaSparkContext jsc = new JavaSparkContext(conf);

            String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_isotonic_regression_data.txt";
            JavaRDD<String> data = jsc.textFile(path);

            // Each line is "label,feature"; the trailing 1.0 is the instance weight.
            JavaRDD<Tuple3<Double, Double, Double>> parsedData = data.map(new Function<String, Tuple3<Double, Double, Double>>() {
                public Tuple3<Double, Double, Double> call(String s) throws Exception {
                    String[] strings = s.split(",");
                    return new Tuple3<Double, Double, Double>(Double.parseDouble(strings[0]), Double.parseDouble(strings[1]), 1.0);
                }
            });

            JavaRDD<Tuple3<Double, Double, Double>>[] split = parsedData.randomSplit(new double[]{0.7, 0.3}, 1234L);
            JavaRDD<Tuple3<Double, Double, Double>> train = split[0];
            JavaRDD<Tuple3<Double, Double, Double>> test = split[1];

            // setIsotonic(true) fits a non-decreasing function.
            final IsotonicRegressionModel model = new IsotonicRegression().setIsotonic(true).run(train);

            JavaRDD<Tuple2<Double, Double>> predictionAndLabel = test.map(new Function<Tuple3<Double, Double, Double>, Tuple2<Double, Double>>() {
                public Tuple2<Double, Double> call(Tuple3<Double, Double, Double> point) throws Exception {
                    double predict = model.predict(point._2()); // predict from the feature (._2); the label is ._1
                    return new Tuple2<Double, Double>(predict, point._1());
                }
            });

            Double MSE = new JavaDoubleRDD(predictionAndLabel.map(new Function<Tuple2<Double, Double>, Object>() {
                public Object call(Tuple2<Double, Double> pl) throws Exception {
                    return Math.pow(pl._1() - pl._2(), 2.0);
                }
            }).rdd()).mean();

            System.out.println("Mean Squared Error = " + MSE);
        }
    }
  • Original post: https://www.cnblogs.com/dhName/p/10678782.html