zoukankan      html  css  js  c++  java
  • 机器学习pipeline总结

    # -*- coding: utf-8 -*-
    """scikit-learn introduction
    
    Automatically generated by Colaboratory.
    
    Original file is located at
        https://colab.research.google.com/drive/1quaJafg43SN7S6cNwKFr0_WYn2ELt4Ph
    
    scikit-learn官方网站:https://scikit-learn.org/stable/
    
    模块引入
    """
    
    from sklearn import datasets
    from sklearn.metrics import mean_squared_error, r2_score
    import matplotlib.pyplot as plt
    import numpy as np
    
    """#分类:
     - SVM(support vector machine):支持向量机
     - svm.SVC()
    
    ###iris数据集
     - iris feature: 花萼长度,花萼宽度,花瓣长度,花瓣宽度
     - iris label: 山鸢尾,杂色鸢尾,维吉尼亚鸢尾
    """
    
    # Load the iris dataset and preview the first five samples.
    iris = datasets.load_iris()
    # The '\n' escape puts each array on its own line; a raw line break inside
    # a single-quoted literal is a SyntaxError.
    print('iris feature\n', iris.data[0:5])
    print('iris label\n', iris.target[0:5])
    
    """###创建模型"""
    
    from sklearn import svm

    # Feature matrix and class labels, kept under short names for later cells.
    irisX, irisY = iris.data, iris.target
    # Support-vector classifier with default settings; fit() returns the
    # estimator itself, so construction and training can be chained.
    clf = svm.SVC().fit(irisX, irisY)
    # Predictions on the same samples the model was trained on.
    irisPred = clf.predict(irisX)
    # Notebook-style bare expression: prediction for the first sample shown earlier.
    clf.predict([[5.1, 3.5, 1.4, 0.2]])
    
    """###评估指标
     - accuracy
     - precision  
     - recall
     - F1
    """
    
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    # normalize=False yields the number of correct predictions; dividing by
    # the sample count turns it into a fraction.
    print('acc is ', accuracy_score(irisY, irisPred, normalize=False) / len(irisY))
    # The remaining metrics are macro-averaged over the classes.
    for _name, _metric in (('precision', precision_score),
                           ('recall', recall_score),
                           ('F1', f1_score)):
        print(_name + ' is ', _metric(irisY, irisPred, average='macro'))
    
    """#回归
     - 线性回归
     - 模块:linear_model.LinearRegression()
    
    ###糖尿病数据集
    """
    
    # Load the diabetes dataset and keep only the first feature column as an
    # (n_samples, 1) array, the 2-D shape passed to the regressor below.
    diabetes = datasets.load_diabetes()
    # Indexing with a list keeps the column axis and returns a fresh copy,
    # replacing the row-by-row Python comprehension.
    diabetesX = diabetes.data[:, [0]]
    diabetesY = diabetes.target
    # '\n' escapes restored: a raw line break inside the literal is a SyntaxError.
    print('feature\n', diabetesX[:5])
    print('label\n', diabetesY[:5])
    
    """###创建模型"""
    
    from sklearn import svm, linear_model

    # Linear regression on the single selected feature.
    regr = linear_model.LinearRegression().fit(diabetesX, diabetes.target)
    # In-sample predictions, reused for plotting and for the error metric.
    diabetesPred = regr.predict(diabetesX)
    # Notebook-style bare expression: prediction for the first value of the raw data.
    regr.predict([[0.03807591]])

    # Raw samples as a scatter plot, with the regression output drawn as a line.
    plt.scatter(diabetesX, diabetes.target)
    plt.plot(diabetesX, diabetesPred)
    
    """###评价指标
     - 均方误差(mse)
    """
    
    from sklearn.metrics import mean_squared_error

    # Mean squared error between the targets and the in-sample predictions.
    print(
        'mean squared error is ',
        mean_squared_error(y_true=diabetesY, y_pred=diabetesPred),
    )
    
    """#聚类
     - k-means
    
    ###创建数据集
    """
    
    # make_blobs is exposed from sklearn.datasets; the old
    # sklearn.datasets.samples_generator path was deprecated in scikit-learn
    # 0.22 and removed in 0.24.
    from sklearn.datasets import make_blobs

    # 1000 two-dimensional points drawn around four fixed centres; the first
    # cluster is wider (std 0.4) than the other three (std 0.2).
    clusterX, clusterY = make_blobs(
        n_samples=1000,
        n_features=2,
        centers=[[-1, -1], [0, 0], [1, 1], [2, 2]],
        cluster_std=[0.4, 0.2, 0.2, 0.2],
        random_state=0,
    )
    # Unlabelled view of the generated points.
    plt.scatter(clusterX[:, 0], clusterX[:, 1])
    
    """###建立模型"""
    
    from sklearn.cluster import KMeans

    # NOTE(review): the points were generated around four centres but only two
    # clusters are requested here — confirm this is intentional.
    clu = KMeans(random_state=9, n_clusters=2)
    # Fit the model and get the cluster index assigned to every point.
    clusterPredict = clu.fit_predict(clusterX)
    # clusterX has two feature columns, so its transpose unpacks into the x
    # and y coordinates; points are coloured by assigned cluster.
    plt.scatter(*clusterX.T, c=clusterPredict)
    plt.show()
    
    """#模型评估
     - cross validation 交叉验证
     - 以iris数据集为例
    """
    
    from sklearn.model_selection import train_test_split,cross_val_score
    from sklearn.metrics import accuracy_score
    from sklearn import svm
    import warnings

    # NOTE: this silences every warning from here on, not only those raised
    # by the cross-validation call.
    warnings.filterwarnings('ignore')
    clf = svm.SVC()
    # 10-fold cross-validation on the iris data; one accuracy value per fold.
    scores = cross_val_score(clf, irisX, irisY, cv=10, scoring='accuracy')
    print('十折交叉验证分别的accuracy ', scores)
    # Average over however many folds were run, instead of dividing by a
    # hard-coded 10 that would silently go wrong if `cv` changes.
    print('平均的accuracy ', scores.mean())
    
    """- 通过设置随机种子来进行十次十折交叉验证"""
    
    from sklearn.model_selection import StratifiedKFold,KFold

    # Repeated K-fold cross-validation: each repetition shuffles the data with
    # a different seed, so the folds differ from run to run.
    n_repeats = 10  # number of differently-seeded runs
    n_splits = 10   # folds per run
    accEachTime = []

    for seed in range(n_repeats):
        clf = svm.SVC()
        folds = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        scores = cross_val_score(clf, irisX, irisY, cv=folds, scoring='accuracy')
        print(scores)
        # Mean over the folds actually run rather than a hard-coded divisor.
        accEachTime.append(scores.mean())
    print('每一次的accuracy值 ', accEachTime)
    # Mean over the repetitions actually run rather than a hard-coded divisor.
    print('十次十折交叉验证的平均accuracy值 ', np.mean(accEachTime))
    

  • 相关阅读:
    etcd数据单机部署
    PostgreSQL INSERT ON CONFLICT不存在则插入,存在则更新
    ERROR 1709 (HY000): Index column size too large. The maximum column size is 767 bytes.
    Hbase 0.92.1集群数据迁移到新集群
    PostgreSQL创建只读账户
    Kafka技术内幕 读书笔记之(六) 存储层——服务端处理读写请求、分区与副本
    Kafka技术内幕 读书笔记之(六) 存储层——日志的读写
    Kafka技术内幕 读书笔记之(五) 协调者——消费组状态机
    Kafka技术内幕 读书笔记之(五) 协调者——延迟的加入组操作
    Kafka技术内幕 读书笔记之(五) 协调者——协调者处理请求
  • 原文地址:https://www.cnblogs.com/hannahzhao/p/11959326.html
Copyright © 2011-2022 走看看