# daal4py Decision Forest Classification Training Example with Serialization

```python
import daal4py as d4p
import numpy as np
import pickle

from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split


def get_mnist():
    mnist = fetch_mldata('MNIST original')
    X_train, X_test, y_train, y_test = train_test_split(mnist.data, mnist.target,
                                                        train_size=60000, test_size=10000)
    data = np.ascontiguousarray(X_train, dtype=np.float32)
    labels = np.ascontiguousarray(y_train, dtype=np.float32).reshape(y_train.shape[0], 1)
    return data, labels


# A model serialized with pickle can be used only from daal4py
def pickle_serialization(result, file='df_result.pkl'):
    with open(file, 'wb') as out:
        pickle.dump(result, out)


# Universal native DAAL serialization. The resulting buffer can be used
# from all DAAL interfaces: C++/Java/pydaal/daal4py
def native_serialization(result, file='native_result.txt'):
    daal_buff = result.__getstate__()
    with open(file, 'wb') as out:
        out.write(daal_buff)


if __name__ == "__main__":
    data, labels = get_mnist()

    # 'fptype' should match the dtype of the input numpy arrays to achieve
    # the best performance (no data conversion in that case)
    train = d4p.decision_forest_classification_training(10,
                                                        fptype='float',
                                                        nTrees=100,
                                                        minObservationsInLeafNode=1,
                                                        engine=d4p.engines_mt19937(seed=777),
                                                        bootstrap=True)
    result = train.compute(data, labels)

    # serialize the training result to files
    pickle_serialization(result)
    native_serialization(result)
```
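As a quick sanity check (not part of the original example), the two files written above can be read straight back and compared. The sketch below assumes the training script has already produced the default files `df_result.pkl` and `native_result.txt`; it reuses only calls that appear in this tutorial (`pickle.load`, `__setstate__`, `decision_forest_classification_training_result`, `decision_forest_classification_prediction`).

```python
# Minimal round-trip sketch: both deserialized results should hold the same model.
import pickle
import numpy as np
import daal4py as d4p

with open('df_result.pkl', 'rb') as f:
    restored_pickle = pickle.load(f)

restored_native = d4p.decision_forest_classification_training_result()
with open('native_result.txt', 'rb') as f:
    restored_native.__setstate__(f.read())

# Any well-shaped float32 input works for this equivalence check.
probe = np.random.rand(100, 784).astype(np.float32)
predict = d4p.decision_forest_classification_prediction(10, fptype='float')
p1 = predict.compute(probe, restored_pickle.model).prediction
p2 = predict.compute(probe, restored_native.model).prediction
assert (p1 == p2).all()  # identical predictions from both deserialized models
```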
## Python prediction
```python
import daal4py as d4p
import numpy as np
import pickle

from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split


def get_mnist_test():
    mnist = fetch_mldata('MNIST original')
    X_train, X_test, y_train, y_test = train_test_split(mnist.data, mnist.target,
                                                        train_size=60000, test_size=10000)
    pdata = np.ascontiguousarray(X_test, dtype=np.float32)
    plabels = np.ascontiguousarray(y_test, dtype=np.float32).reshape(y_test.shape[0], 1)
    return pdata, plabels


def checkAccuracy(plabels, prediction):
    t = 0
    count = 0
    for i in plabels:
        if i != prediction[t]:
            count = count + 1
        t = t + 1
    return 1 - count / t


def pickle_deserialization(file='df_result.pkl'):
    with open(file, 'rb') as inp:
        return pickle.load(inp)


def native_deserialization(file='native_result.txt'):
    daal_result = d4p.decision_forest_classification_training_result()
    with open(file, 'rb') as inp:
        daal_buff = inp.read()
    daal_result.__setstate__(daal_buff)
    return daal_result


if __name__ == "__main__":
    nClasses = 10
    pdata, plabels = get_mnist_test()

    # deserialize the training result
    deserialized_result_pickle = pickle_deserialization()
    deserialized_result_native = native_deserialization()

    # predict with the deserialized model; fptype is 'float' to match the input data
    predict_algo = d4p.decision_forest_classification_prediction(nClasses, fptype='float')

    # pass the pickle-obtained model to compute
    predict_result = predict_algo.compute(pdata, deserialized_result_pickle.model)
    print("Accuracy:", checkAccuracy(plabels, predict_result.prediction))

    # same result as above, now with the natively deserialized model
    predict_result = predict_algo.compute(pdata, deserialized_result_native.model)
    print("Accuracy:", checkAccuracy(plabels, predict_result.prediction))
```
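Since `plabels` and `predict_result.prediction` are both `(n, 1)` float32 numpy arrays, the element-by-element loop in `checkAccuracy` can also be written as a single vectorized comparison. A minimal equivalent sketch (the function name is illustrative, not part of the original example):

```python
import numpy as np

def check_accuracy_np(plabels, prediction):
    # fraction of rows where the predicted label equals the ground truth
    return float(np.mean(plabels == prediction))
```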
## Using the daal4py model from C++
```cpp
/**
 * <a name="DAAL-EXAMPLE-CPP-DF_CLS_DENSE_BATCH"></a>
 * example df_cls_dense_batch.cpp
 */

#include "daal.h"
#include "service.h"
#include "stdio.h"

using namespace std;
using namespace daal;
using namespace daal::algorithms;
using namespace daal::algorithms::decision_forest::classification;

/* Input data set parameters */
const string testDatasetFileName = "../data/batch/mnist_test_data.csv";
const string labels              = "../data/batch/mnist_test_labels.csv";
const size_t nFeatures = 784; /* Number of features in training and testing data sets */
const size_t nClasses  = 10;  /* Number of classes */

void testModel();
void loadData(const std::string& dataFileName, const std::string& labelsFileName,
              NumericTablePtr& pData, NumericTablePtr& pDependentVar);
void check_accuracy(NumericTablePtr prediction, NumericTablePtr testGroundTruth);

int main(int argc, char *argv[])
{
    checkArguments(argc, argv, 2, &labels, &testDatasetFileName);

    /* Deserialization */
    size_t size = 0;
    byte * buffer = NULL;
    FILE * pFile;
    size_t result;

    pFile = fopen("../data/batch/native_result.txt", "rb");
    if (pFile == NULL) { fputs("File error", stderr); exit(1); }

    /* obtain the file size */
    fseek(pFile, 0, SEEK_END);
    size = ftell(pFile);
    std::cout << "size: " << size << " ";
    rewind(pFile);

    /* allocate memory to contain the whole file */
    buffer = (byte*) malloc(sizeof(byte) * size);
    if (buffer == NULL) { fputs("Memory error", stderr); exit(2); }

    /* copy the file into the buffer */
    result = fread(buffer, 1, size, pFile);
    if (result != size) { fputs("Reading error", stderr); exit(3); }

    /* the serialized training result is now loaded in the buffer */

    /* Create a data archive to deserialize the training result */
    OutputDataArchive out_dataArch(buffer, size);
    free(buffer);
    fclose(pFile);

    /* needed for result allocation */
    training::Batch<> train(nClasses);
    train.getResult()->deserialize(out_dataArch);

    /* Create Numeric Tables for testing data and ground truth values */
    NumericTablePtr testData;
    NumericTablePtr testGroundTruth;
    loadData(testDatasetFileName, labels, testData, testGroundTruth);

    /* Create an algorithm object to predict values of decision forest classification */
    prediction::Batch<> algorithm(nClasses);

    /* Pass the testing data set and the trained model to the algorithm */
    algorithm.input.set(classifier::prediction::data, testData);

    /* set the deserialized model */
    algorithm.input.set(classifier::prediction::model,
                        train.getResult()->get(classifier::training::model));

    /* Predict values of decision forest classification */
    algorithm.compute();

    /* Retrieve the algorithm results */
    NumericTablePtr prediction = algorithm.getResult()->get(classifier::prediction::prediction);
    printNumericTable(prediction, "Prediction results (first 10 rows):", 10);
    printNumericTable(testGroundTruth, "Ground truth (first 10 rows):", 10);

    check_accuracy(prediction, testGroundTruth);

    return 0;
}

void check_accuracy(NumericTablePtr prediction, NumericTablePtr testGroundTruth)
{
    /* check accuracy */
    BlockDescriptor<double> blockPr;
    prediction->getBlockOfRows(0, prediction->getNumberOfRows(), readOnly, blockPr);
    double* valueP = blockPr.getBlockPtr();

    BlockDescriptor<double> blockGT;
    testGroundTruth->getBlockOfRows(0, testGroundTruth->getNumberOfRows(), readOnly, blockGT);
    double* valueG = blockGT.getBlockPtr();

    size_t count = 0;
    for (size_t i = 0; i < testGroundTruth->getNumberOfRows(); i++)
    {
        if (valueG[i] != valueP[i]) count++;
    }
    testGroundTruth->releaseBlockOfRows(blockGT);
    prediction->releaseBlockOfRows(blockPr);

    cout << "accuracy: " << 1 - double(count) / double(testGroundTruth->getNumberOfRows()) << " ";
}

void loadData(const std::string& dataFileName, const std::string& labelsFileName,
              NumericTablePtr& pData, NumericTablePtr& pDependentVar)
{
    /* Initialize FileDataSource<CSVFeatureManager> to retrieve the input data from .csv files */
    FileDataSource<CSVFeatureManager> trainDataSource(dataFileName,
                                                      DataSource::notAllocateNumericTable,
                                                      DataSource::doDictionaryFromContext);
    FileDataSource<CSVFeatureManager> trainLabels(labelsFileName,
                                                  DataSource::notAllocateNumericTable,
                                                  DataSource::doDictionaryFromContext);

    /* Create Numeric Tables for the testing data and dependent variables */
    pData.reset(new HomogenNumericTable<>(nFeatures, 0, NumericTable::notAllocate));
    pDependentVar.reset(new HomogenNumericTable<>(1, 0, NumericTable::notAllocate));

    /* Retrieve the data from the input files */
    trainDataSource.loadDataBlock(pData.get());
    trainLabels.loadDataBlock(pDependentVar.get());

    NumericTableDictionaryPtr pDictionary = pData->getDictionarySharedPtr();
}
```
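The C++ example reads the test set from plain CSV files (`mnist_test_data.csv`, `mnist_test_labels.csv`). One way to produce them, sketched below under the assumption that the output paths are then moved or adjusted to match the `../data/batch/` locations used in the C++ listing, is to dump the same arrays that `get_mnist_test()` builds with `numpy.savetxt`:

```python
import numpy as np
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split

mnist = fetch_mldata('MNIST original')
X_train, X_test, y_train, y_test = train_test_split(mnist.data, mnist.target,
                                                    train_size=60000, test_size=10000)

# one sample per row, comma-separated, as FileDataSource<CSVFeatureManager> expects
np.savetxt('mnist_test_data.csv', np.asarray(X_test, dtype=np.float32),
           delimiter=',', fmt='%g')
np.savetxt('mnist_test_labels.csv', np.asarray(y_test, dtype=np.float32),
           delimiter=',', fmt='%g')
```

Note that `train_test_split` is called without a fixed `random_state` throughout this tutorial, so the exported test split will only match the Python prediction script's split if both are generated in the same run or a seed is added.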