zoukankan      html  css  js  c++  java
  • [Javascript] Classify JSON text data with machine learning in Natural

    In this lesson, we will learn how to train a Naive Bayes classifier and a Logistic Regression classifier - basic machine learning algorithms - on JSON text data, and classify it into categories.

    While this dataset is still considered a small dataset -- only a couple hundred points of data -- we'll start to get better results.

    The general rule is that Logistic Regression will work better than Naive Bayes, but only if there is enough data. Since this is still a pretty small dataset, Naive Bayes works better here. Generally, Logistic Regression takes longer to train as well.

    This uses data from Ana Cachopo: http://ana.cachopo.org/datasets-for-single-label-text-categorization.

    // train data
    
    [{text: 'xxxxxx', label: 'space'}]
    // Load the training data from the file and train the classifier
    
    // Dependencies: `natural` supplies the classifier implementation, `fs` reads the JSON data files.
    var natural = require('natural');
    var fs = require('fs');
    // Single Naive Bayes classifier instance shared by the training, testing and saving steps below.
    var classifier = new natural.BayesClassifier();
    
    // Entry point of the script: read the training set from disk, parse the JSON,
    // and hand the resulting array to train(). A read failure is only logged.
    fs.readFile('training_data.json', 'utf-8', function onTrainingDataRead(readError, contents){
        if (readError){
            console.log(readError);
            return;
        }
        train(JSON.parse(contents));
    });
    
    /**
     * Adds every labelled sample to the shared classifier, trains it, logs how
     * long the training call took, and then moves on to loading the test data.
     *
     * @param {Array<{text: *, label: *}>} trainingData - samples whose `text` and
     *     `label` properties are passed to `classifier.addDocument`.
     */
    function train(trainingData){
        console.log("Training");
        for (var index = 0; index < trainingData.length; index++){
            var sample = trainingData[index];
            classifier.addDocument(sample.text, sample.label);
        }
        // Only classifier.train() is timed; adding the documents is not included.
        var startedAt = Date.now();
        classifier.train();
        var elapsedSeconds = (Date.now() - startedAt) / 1000.0;
        console.log("Training time:", elapsedSeconds, "seconds");
        loadTestData();
    }
    
    /**
     * Reads 'test_data.json', parses it as JSON and passes the result to
     * testClassifier(). If the read fails, the error is logged and nothing
     * else happens.
     */
    function loadTestData(){
        console.log("Loading test data");
        fs.readFile('test_data.json', 'utf-8', function onTestDataRead(readError, contents){
            if (readError){
                console.log(readError);
                return;
            }
            testClassifier(JSON.parse(contents));
        });
    }
    
    /**
     * Classifies every test sample with the shared classifier, logs the share
     * of correct guesses as a percentage (0-100), and then saves the classifier.
     *
     * @param {Array<{text: *, label: *}>} testData - samples whose `text` is
     *     classified and whose `label` is compared (strictly) with the guess.
     */
    function testClassifier(testData){
        console.log("Testing classifier");
        var numCorrect = testData.filter(function(item){
            return classifier.classify(item.text) === item.label;
        }).length;
        // Scale to 0-100 so the number matches the "%" in the label; an empty
        // test set is reported as 0 instead of the NaN that 0/0 would produce.
        var percentCorrect = testData.length === 0 ? 0 : (numCorrect / testData.length) * 100;
        console.log("Correct %:", percentCorrect);
        saveClassifier(classifier);
    }
    /**
     * Writes the given classifier to 'classifier.json' through its save()
     * method and logs either the error or a confirmation message.
     *
     * @param {Object} classifier - object exposing `save(filename, callback)`.
     */
    function saveClassifier(classifier){
        // The second callback argument is accepted but not used.
        classifier.save('classifier.json', function onSaved(saveError, savedClassifier){
            console.log(saveError ? saveError : "Classifier saved!");
        });
    }

    In a new project, we can test the trained classifier by loading the saved file:

    var natural = require('natural');
    
    // 'classifier.json' was written by a natural.BayesClassifier, so it is restored
    // through the same class; loading it via a different classifier type would not
    // rebuild the model that was trained.
    // NOTE(review): the second argument is the stemmer slot; null presumably keeps
    // the library default - confirm against the natural documentation.
    natural.BayesClassifier.load('classifier.json', null, function(err, classifier){
        if (err){
            console.log(err);
        } else {
            // Classify one sample sentence with the restored model and print the label.
            var testComment = "is this about the sun and moon?";
            console.log(classifier.classify(testComment));
        }
    });
  • 相关阅读:
    R dataframe 筛选
    R dataframe 统计每行中大于某个值的列的数量
    参考基因组
    C++/CLI入门系列 第二篇:封装C++ dll库,提供接口给C#调用
    C++/CLI入门系列 第一篇:HelloWorld
    用C++/CLI搭建C++和C#之间的桥梁
    C++ CLI简介(什么是C++ CLI)
    GB28181协议RTP传输
    gsoap使用总结
    C++ (使用gsoap)调用 WCF服务
  • 原文地址:https://www.cnblogs.com/Answer1215/p/7624379.html
Copyright © 2011-2022 走看看