package weka.filters.unsupervised.attribute;
PrincipalComponents
属性:
/** The data to analyse/transform; working copy used while the filter is being set up. */
protected Instances m_TrainInstances;

/** Keep a copy for the class attribute (if set). */
protected Instances m_TrainCopy;

/** The header for the transformed data format. */
protected Instances m_TransformedFormat;

/** Data has a class set. */
protected boolean m_HasClass;

/** Class index. */
protected int m_ClassIndex;

/** Number of attributes (counted after preprocessing and column removal). */
protected int m_NumAttribs;

/** Number of instances. */
protected int m_NumInstances;

/**
 * Correlation matrix for the original data. When centering is enabled
 * this holds the covariance matrix instead.
 */
protected double[][] m_Correlation;

/**
 * If true, center (rather than standardize) the data and
 * compute PCA from covariance (rather than correlation)
 * matrix.
 */
private boolean m_center = false;

/**
 * Will hold the unordered linear transformations of the (normalized)
 * original data; indexed as [attribute][eigenvector].
 */
protected double[][] m_Eigenvectors;

/** Eigenvalues for the corresponding eigenvectors (negative values are clamped to 0). */
protected double[] m_Eigenvalues = null;

/**
 * Indices into {@code m_Eigenvalues} in sorted order, as returned by
 * {@code Utils.sort}. The best components are read from the end of this
 * array, so the order is presumably ascending -- confirm against Utils.sort.
 */
protected int[] m_SortedEigens;

/** Sum of the eigenvalues. */
protected double m_SumOfEigenValues = 0.0;

/** Filter for replacing missing values. */
protected ReplaceMissingValues m_ReplaceMissingFilter;

/** Filter for turning nominal values into numeric ones. */
protected NominalToBinary m_NominalToBinaryFilter;

/**
 * Filter for removing the class attribute and attributes with 0 or 1
 * distinct value; stays unset ({@code null}) when nothing has to be removed.
 */
protected Remove m_AttributeFilter;

/** Filter for standardizing the data (used when not centering). */
protected Standardize m_standardizeFilter;

/** Filter for centering the data (used when centering is enabled). */
protected Center m_centerFilter;

/** The number of attributes in the PC transformed data. */
protected int m_OutputNumAtts = -1;

/** The amount of variance to cover in the original data when retaining the best n PCs. */
protected double m_CoverVariance = 0.95;

/** Maximum number of attributes in the transformed attribute name. */
protected int m_MaxAttrsInName = 5;

/** Maximum number of attributes in the transformed data (-1 for all). */
protected int m_MaxAttributes = -1;
计算协方差矩阵或相关系数矩阵
protected void fillCovariance() throws Exception { if (!m_center) { fillCorrelation(); return; } double[] att = new double[m_TrainInstances.numInstances()]; // now center the data by subtracting the mean m_centerFilter = new Center(); m_centerFilter.setInputFormat(m_TrainInstances); m_TrainInstances = Filter.useFilter(m_TrainInstances, m_centerFilter); // now compute the covariance matrix m_Correlation = new double[m_NumAttribs][m_NumAttribs]; for (int i = 0; i < m_NumAttribs; i++) { for (int j = 0; j < m_NumAttribs; j++) { double cov = 0; for (int k = 0; k < m_NumInstances; k++) { if (i == j) { cov += (m_TrainInstances.instance(k).value(i) * m_TrainInstances.instance(k).value(i)); } else { cov += (m_TrainInstances.instance(k).value(i) * m_TrainInstances.instance(k).value(j)); } } cov /= (double)(m_TrainInstances.numInstances() - 1); m_Correlation[i][j] = cov; m_Correlation[j][i] = cov; } } } /** * Fill the correlation matrix. */ protected void fillCorrelation() throws Exception { int i; int j; int k; double[] att1; double[] att2; double corr; m_Correlation = new double[m_NumAttribs][m_NumAttribs]; att1 = new double [m_NumInstances]; att2 = new double [m_NumInstances]; for (i = 0; i < m_NumAttribs; i++) { for (j = 0; j < m_NumAttribs; j++) { for (k = 0; k < m_NumInstances; k++) { att1[k] = m_TrainInstances.instance(k).value(i); att2[k] = m_TrainInstances.instance(k).value(j); } if (i == j) { m_Correlation[i][j] = 1.0; } else { corr = Utils.correlation(att1,att2,m_NumInstances); m_Correlation[i][j] = corr; m_Correlation[j][i] = corr; } } } // now standardize the input data m_standardizeFilter = new Standardize(); m_standardizeFilter.setInputFormat(m_TrainInstances); m_TrainInstances = Filter.useFilter(m_TrainInstances, m_standardizeFilter); }
处理数据
/** * Transform an instance in original (unormalized) format. * * @param instance an instance in the original (unormalized) format * @return a transformed instance * @throws Exception if instance can't be transformed */ protected Instance convertInstance(Instance instance) throws Exception { Instance result; double[] newVals; Instance tempInst; double cumulative; int i; int j; double tempval; int numAttsLowerBound; newVals = new double[m_OutputNumAtts]; tempInst = (Instance) instance.copy(); m_ReplaceMissingFilter.input(tempInst); m_ReplaceMissingFilter.batchFinished(); tempInst = m_ReplaceMissingFilter.output(); m_NominalToBinaryFilter.input(tempInst); m_NominalToBinaryFilter.batchFinished(); tempInst = m_NominalToBinaryFilter.output(); if (m_AttributeFilter != null) { m_AttributeFilter.input(tempInst); m_AttributeFilter.batchFinished(); tempInst = m_AttributeFilter.output(); } if (!m_center) { m_standardizeFilter.input(tempInst); m_standardizeFilter.batchFinished(); tempInst = m_standardizeFilter.output(); } else { m_centerFilter.input(tempInst); m_centerFilter.batchFinished(); tempInst = m_centerFilter.output(); } if (m_HasClass) newVals[m_OutputNumAtts - 1] = instance.value(instance.classIndex()); if (m_MaxAttributes > 0) numAttsLowerBound = m_NumAttribs - m_MaxAttributes; else numAttsLowerBound = 0; if (numAttsLowerBound < 0) numAttsLowerBound = 0; cumulative = 0; for (i = m_NumAttribs - 1; i >= numAttsLowerBound; i--) { tempval = 0.0; for (j = 0; j < m_NumAttribs; j++) tempval += m_Eigenvectors[j][m_SortedEigens[i]] * tempInst.value(j); newVals[m_NumAttribs - i - 1] = tempval; cumulative += m_Eigenvalues[m_SortedEigens[i]]; if ((cumulative / m_SumOfEigenValues) >= m_CoverVariance) break; } // create instance if (instance instanceof SparseInstance) result = new SparseInstance(instance.weight(), newVals); else result = new DenseInstance(instance.weight(), newVals); return result; } /** * Initializes the filter with the given input data. 
* * @param instances the data to process * @throws Exception in case the processing goes wrong * @see #batchFinished() */ protected void setup(Instances instances) throws Exception { int i; int j; Vector<Integer> deleteCols; int[] todelete; double[][] v; Matrix corr; EigenvalueDecomposition eig; Matrix V; m_TrainInstances = new Instances(instances); // make a copy of the training data so that we can get the class // column to append to the transformed data (if necessary) m_TrainCopy = new Instances(m_TrainInstances, 0); m_ReplaceMissingFilter = new ReplaceMissingValues(); m_ReplaceMissingFilter.setInputFormat(m_TrainInstances); m_TrainInstances = Filter.useFilter(m_TrainInstances, m_ReplaceMissingFilter); m_NominalToBinaryFilter = new NominalToBinary(); m_NominalToBinaryFilter.setInputFormat(m_TrainInstances); m_TrainInstances = Filter.useFilter(m_TrainInstances, m_NominalToBinaryFilter); // delete any attributes with only one distinct value or are all missing deleteCols = new Vector<Integer>(); for (i = 0; i < m_TrainInstances.numAttributes(); i++) { if (m_TrainInstances.numDistinctValues(i) <= 1) deleteCols.addElement(i); } if (m_TrainInstances.classIndex() >=0) { // get rid of the class column m_HasClass = true; m_ClassIndex = m_TrainInstances.classIndex(); deleteCols.addElement(new Integer(m_ClassIndex)); } // remove columns from the data if necessary if (deleteCols.size() > 0) { m_AttributeFilter = new Remove(); todelete = new int [deleteCols.size()]; for (i = 0; i < deleteCols.size(); i++) todelete[i] = ((Integer)(deleteCols.elementAt(i))).intValue(); m_AttributeFilter.setAttributeIndicesArray(todelete); m_AttributeFilter.setInvertSelection(false); m_AttributeFilter.setInputFormat(m_TrainInstances); m_TrainInstances = Filter.useFilter(m_TrainInstances, m_AttributeFilter); } // can evaluator handle the processed data ? e.g., enough attributes? 
getCapabilities().testWithFail(m_TrainInstances); m_NumInstances = m_TrainInstances.numInstances(); m_NumAttribs = m_TrainInstances.numAttributes(); //fillCorrelation(); fillCovariance(); // get eigen vectors/values corr = new Matrix(m_Correlation); eig = corr.eig(); V = eig.getV(); v = new double[m_NumAttribs][m_NumAttribs]; for (i = 0; i < v.length; i++) { for (j = 0; j < v[0].length; j++) v[i][j] = V.get(i, j); } m_Eigenvectors = (double[][]) v.clone(); m_Eigenvalues = (double[]) eig.getRealEigenvalues().clone(); // any eigenvalues less than 0 are not worth anything --- change to 0 for (i = 0; i < m_Eigenvalues.length; i++) { if (m_Eigenvalues[i] < 0) m_Eigenvalues[i] = 0.0; } m_SortedEigens = Utils.sort(m_Eigenvalues); m_SumOfEigenValues = Utils.sum(m_Eigenvalues); m_TransformedFormat = determineOutputFormat(m_TrainInstances); setOutputFormat(m_TransformedFormat); m_TrainInstances = null; }