# Regularization demo: regularization lowers model complexity to avoid
# overfitting.  This script trains several scikit-learn estimators on the iris
# data set, saves each fitted model to disk with joblib, loads them back and
# prints a sample prediction plus the test-set score for every model.

# Imports
from sklearn.datasets import load_iris
import joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Split the data set: 70% training, 30% test, fixed seed for reproducibility.
data = load_iris()
X = data.data
y = data.target
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=2
)

# Build the models.

# k-nearest neighbours: n_neighbors is the number of neighbours consulted.
kneighbor = KNeighborsClassifier(n_neighbors=3)

# Linear regression (ordinary least squares, no regularization).
# NOTE: lr, ridge and lasso are regressors fitted on the integer class labels;
# their predict() returns a float and their score() is R^2, not accuracy.
lr = LinearRegression()

# Ridge regression: alpha trades model simplicity against training-set fit.
# A larger alpha means stronger regularization (simpler model, lower training
# score, possibly better generalization); alpha -> 0 approaches plain
# linear regression.
ridge = Ridge(alpha=.01)

# Lasso regression: max_iter is the maximum number of solver iterations.
# A smaller alpha means weaker regularization, i.e. a more complex model.
lasso = Lasso(alpha=.01, max_iter=100)

# Logistic regression: C is the inverse regularization strength.  A larger C
# gives a more complex model that fits the training data better but may
# overfit.
log = LogisticRegression(C=1)

# Linear support vector machine: C has the same meaning as above.
linearSVC = LinearSVC(C=10)

# Naive Bayes classifiers (not used here): GaussianNB (continuous data),
# BernoulliNB (binary data), MultinomialNB (count data).  The latter two are
# mostly used for text classification.

# Decision tree: random_state fixes the estimator's internal randomness so
# that results are reproducible.  max_depth (tree depth), max_leaf_nodes and
# min_samples_leaf can each be used to limit overfitting.
tree = DecisionTreeClassifier(random_state=0, max_depth=4)

# Random forest: n_estimators is the number of trees; more trees also help
# against overfitting.  max_features controls how random each tree is; smaller
# values reduce overfitting, and the default is usually fine.
randomtree = RandomForestClassifier(n_estimators=4, random_state=2)

# Kernel support vector machine: C is the regularization parameter (larger C
# -> more complex model, may overfit).  gamma controls the width of the
# Gaussian kernel; a larger gamma gives a more complex model whose decision
# boundary changes faster.
svc = SVC(C=1, gamma=0.1)

# Neural network: hidden_layer_sizes is the number of units per hidden layer
# (100 here means a single hidden layer of 100 units); alpha is the
# regularization strength.
mlp = MLPClassifier(hidden_layer_sizes=100, alpha=0.1)

# File name -> estimator.  Insertion order defines the order in which the
# models are fitted, saved, loaded and reported.
models = {
    'kneighbor.model': kneighbor,
    'LR.model': lr,
    'Ridge.model': ridge,
    'lasso.model': lasso,
    'log.model': log,
    'linearSVC.model': linearSVC,
    'tree.model': tree,
    'randomtree.model': randomtree,
    'svc.model': svc,
    'mlp.model': mlp,
}

# Train every model on the training split.
for model in models.values():
    model.fit(train_X, train_y)

# Save the trained models to disk (relative names -> current working
# directory).
for filename, model in models.items():
    joblib.dump(value=model, filename=filename)

# Load the saved models back from disk.
# NOTE: joblib files are pickle-based; only load files you trust.
loaded_models = [joblib.load(filename=filename) for filename in models]

# Keep the original per-model names available (e.g. for interactive use).
(model0, model1, model2, model3, model4,
 model5, model6, model7, model8, model9) = loaded_models

# For each loaded model, print the prediction for the third test sample and
# the score on the test split.
for loaded in loaded_models:
    print(loaded.predict(test_X)[2])
    print(loaded.score(test_X, test_y))

# Example (disabled): change a loaded model's parameters, retrain and predict.
# NOTE: the `normalize` parameter was removed from LinearRegression in
# scikit-learn 1.2, so this snippet only works on older versions.
#
# model1.set_params(normalize=True).fit(train_X, train_y)
# print(model1.predict(test_X))
# print(model1.score(test_X, test_y))