1.通过Windows 10“所有应用”->"Anaconda(64bit)"->"Anaconda Command Prompt",启动python。
2.在Python REPL中输入脚本
import numpy as np


def loadDataSet(fileName, delim=' '):
    """Read a delimited text file of numbers into a NumPy matrix.

    Each non-blank line is stripped, split on ``delim`` and every field is
    converted with ``float``; one line becomes one matrix row.

    Args:
        fileName: Path of the text file to read.
        delim: Field separator. The default is a single space; pass a tab
            character explicitly for tab-separated files.

    Returns:
        A ``numpy.matrix`` with one row per non-blank line.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the handle even when float() raises.
    with open(fileName) as fr:
        stripped = (line.strip() for line in fr)
        # Blank lines (e.g. a trailing newline at EOF) are skipped instead of
        # crashing on float('').  A nested list comprehension is used rather
        # than map() so the rows are real lists on Python 3 as well.
        datArr = [[float(field) for field in line.split(delim)]
                  for line in stripped if line]
    return np.asmatrix(datArr)


def pca(dataMat, topNfeat=9999999):
    """Project data onto its leading principal components.

    Args:
        dataMat: 2-D data, one sample per row and one feature per column
            (the covariance is computed with ``rowvar=0``).
        topNfeat: Number of components to keep. Values larger than the
            number of features keep every component.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the mean-centred data expressed
        in the selected eigenvector basis, and that projection mapped back
        to the original feature space with the mean added again.
    """
    # Matrix semantics are required below: ``*`` must be a matrix product.
    dataMat = np.asmatrix(dataMat)

    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # centre every column on zero

    covMat = np.cov(meanRemoved, rowvar=0)
    eigVals, eigVects = np.linalg.eig(np.asmatrix(covMat))

    # argsort is ascending; the reversed slice takes the topNfeat largest
    # eigenvalues, ordered from largest to smallest.
    eigValInd = np.argsort(eigVals)
    eigValInd = eigValInd[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]

    lowDDataMat = meanRemoved * redEigVects  # data in the reduced basis
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat
运行结果如下:
Python 2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 16:44:52) [MSC v.1500 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
Anaconda is brought to you by Continuum Analytics.
Please check out: http://continuum.io/thanks and https://binstar.org
>>> from numpy import *
>>>
>>> def loadDataSet(fileName, delim=' '):
...     fr = open(fileName)
...     stringArr = [line.strip().split(delim) for line in fr.readlines()]
...     datArr = [map(float,line) for line in stringArr]
...     return mat(datArr)
...
>>> def pca(dataMat, topNfeat=9999999):
...     meanVals = mean(dataMat, axis=0)
...     meanRemoved = dataMat - meanVals #remove mean
...     covMat = cov(meanRemoved, rowvar=0)
...     eigVals,eigVects = linalg.eig(mat(covMat))
...     eigValInd = argsort(eigVals) #sort, sort goes smallest to largest
...     eigValInd = eigValInd[:-(topNfeat+1):-1] #cut off unwanted dimensions
...     redEigVects = eigVects[:,eigValInd] #reorganize eig vects largest to smallest
...     lowDDataMat = meanRemoved * redEigVects#transform data into new dimensions
...     reconMat = (lowDDataMat * redEigVects.T) + meanVals
...     return lowDDataMat, reconMat
验证:
>>> dataMat = loadDataSet(r'F:\studio\MachineLearningInAction\ch13\testSet.txt')
>>> shape(dataMat)
(1000L, 2L)
>>> lowMat, reconMat = pca(dataMat, 1)
>>> shape(lowMat)
(1000L, 1L)
>>> import matplotlib
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.scatter(dataMat[:,0].flatten().A[0], dataMat[:,1].flatten().A[0], marker='^', s=90)
<matplotlib.collections.PathCollection object at 0x0000000009F556D8>
>>> ax.scatter(reconMat[:,0].flatten().A[0], reconMat[:,1].flatten().A[0], marker='^', s=90)
<matplotlib.collections.PathCollection object at 0x0000000009F55710>
>>> plt.show()