zoukankan      html  css  js  c++  java
  • Relief 过滤式特征选择

    给定训练集 $\{(x_1,y_1),(x_2,y_2),\ldots,(x_m,y_m)\}$,对每个示例 $x_i$,Relief 先在 $x_i$ 的同类样本中寻找其最近邻 $x_{i,nh}$(称为"猜中近邻",near-hit),再从 $x_i$ 的异类样本中寻找其最近邻 $x_{i,nm}$(称为"猜错近邻",near-miss)。

         代码如下:

    #!/usr/bin/env python2
    # -*- coding: utf-8 -*-
    """
    Created on Wed Feb 28 20:16:09 2018
    
    @author: jzc
    """
    import numpy as np
    import csv
    from random import randrange
    from sklearn import preprocessing
    # Number of sampling iterations: Relief draws this many random samples.
    m=8    
    def Compute_Distance_Discrete(diff_distance):
        """Return the Euclidean (L2) length of a difference vector.

        `diff_distance` is the element-wise difference between two samples;
        the result is the square root of the sum of its squared entries.
        """
        squared = np.power(diff_distance, 2)
        return np.sqrt(np.sum(squared))
    def loadcsv(filename):
        """Load a numeric CSV file and split it into features and labels.

        The first row is treated as a header and skipped; every cell of the
        remaining rows is parsed as a float.

        Args:
            filename: path of the CSV file to read.

        Returns:
            A tuple (features, labels): `features` holds every column except
            the first and the last, `labels` holds the last column.
            NOTE(review): the dropped first column is presumably a sample
            id -- confirm against the data file.

        Raises:
            ValueError: if a data cell cannot be parsed as a float.
        """
        # `with` guarantees the handle is closed even when parsing fails;
        # the original left the file open until garbage collection.
        with open(filename,'r') as csv_file:
            rows = list(csv.reader(csv_file))
        # rows[0] is the header. Blank lines come back from csv.reader as
        # empty lists and would make the array ragged, so they are skipped.
        data = [[float(cell) for cell in row] for row in rows[1:] if row]
        result = np.array(data)
        features = result[:,1:-1]
        labels = result[:,-1]
        return features,labels
    def Relief(features,labels,n_iterations=None,n_continuous=2):
        """Estimate per-feature relevance weights with the Relief algorithm.

        For each of `n_iterations` randomly drawn samples, the nearest
        same-class neighbour (near-hit) and the nearest other-class
        neighbour (near-miss) are located by Euclidean distance, and every
        feature weight is updated by ``diff(miss)**2 - diff(hit)**2``.
        Larger weights mean the feature separates the classes better.

        Args:
            features: 2-D numeric array-like of shape (n_samples, n_features).
                The trailing `n_continuous` columns are treated as continuous
                (diff = signed difference; they should already be scaled to
                [0, 1]); all leading columns are treated as discrete
                (diff = 0 if equal, 1 otherwise).
            labels: 1-D array-like with one class label per sample.
            n_iterations: number of random draws; defaults to the
                module-level constant `m` when omitted.
            n_continuous: number of trailing continuous columns (default 2,
                the layout the original code hard-coded).

        Returns:
            1-D numpy array of length n_features: the accumulated weights
            divided by n_samples (the scaling used by the original code; it
            does not change the ranking of the features).

        Raises:
            ValueError: if `labels` contains fewer than two classes, because
                no near-miss exists in that case.
        """
        features = np.asarray(features, dtype=float)
        labels = np.asarray(labels)
        (n_samples,n_features) = features.shape
        if n_iterations is None:
            n_iterations = m
        if np.unique(labels).size < 2:
            raise ValueError("Relief needs samples from at least two classes")

        # Index where the continuous block starts; clamped so that
        # n_continuous >= n_features simply means "all continuous".
        split = max(n_features - n_continuous, 0)
        weights = np.zeros(n_features)

        for _ in range(n_iterations):
            one_sample = randrange(0,n_samples,1)
            one_feature = features[one_sample]

            # Distances from the drawn sample to every sample. Neighbours are
            # recomputed on every draw; the original kept near-hit/near-miss
            # and the sorted distance list alive across iterations, so only
            # the first draw ever used correct neighbours.
            dist = np.sqrt(np.sum(np.power(features - one_feature, 2), axis=1))

            same_class = labels == labels[one_sample]
            hit_mask = same_class.copy()
            hit_mask[one_sample] = False  # a sample is not its own neighbour
            if hit_mask.any():
                hit_idx = np.flatnonzero(hit_mask)
                near_hit = features[hit_idx[np.argmin(dist[hit_idx])]]
            else:
                # Only member of its class: fall back to the sample itself,
                # which contributes a zero hit term (as the original did).
                near_hit = one_feature
            miss_idx = np.flatnonzero(~same_class)
            near_miss = features[miss_idx[np.argmin(dist[miss_idx])]]

            def feature_diff(other):
                # 0/1 mismatch for discrete columns, raw difference for
                # continuous columns.
                diff = np.empty(n_features)
                diff[:split] = one_feature[:split] != other[:split]
                diff[split:] = one_feature[split:] - other[split:]
                return diff

            # Single expression: the original split the continuous update
            # over two lines, silently dropping the near-miss term.
            weights += np.power(feature_diff(near_miss), 2) - np.power(feature_diff(near_hit), 2)
        return weights/n_samples
    # Driver: load the dataset and run Relief several times. Each run draws
    # its samples at random, so the printed weights vary from run to run.
    filename = '/Users/jzc/DeepLearning(7.8-)/data/watermelon3_0.csv'
    features,labels = loadcsv(filename)
    for _ in range(1,10):
        result = Relief(features,labels)
        # Call form prints the same text under Python 2 and also runs under
        # Python 3, unlike the statement form `print result`.
        print(result)
        
            
            
  • 相关阅读:
    Python学习之余,摸摸鱼
    Python 实现斐波那契数
    Linux下为什么目录的大小总是4096
    Python的精髓居然是方括号、花括号和圆括号!
    为什么说Python是最伟大的语言?看图就知道了!
    前端常用知识(会更新)
    Mysql 约束
    Navicat 安装
    Java后台将CTS格式转为标准日期时间格式返回给前端
    MySQL数据库报错“Zero date value prohibited”
  • 原文地址:https://www.cnblogs.com/jzcbest1016/p/8551857.html
Copyright © 2011-2022 走看看