zoukankan      html  css  js  c++  java
  • 数据处理方法基础

    1. 归一化和标准化

     1 import numpy as np
     2 from numpy import sqrt
     3 
     4 
     def normalize(mtx, a=0, b=1):
         """
         Min-max normalization: linearly rescale all elements of ``mtx`` into [a, b].

         The minimum and maximum are taken over every element of ``mtx``
         (not per row or per column), which removes dimensional effects.

         formula: x* = a + k * (x - min),  k = (b - a) / (max - min)

         :param mtx: array-like of real numbers, any shape
         :param a: lower bound of the target range (default 0)
         :param b: upper bound of the target range (default 1)
         :return: float ndarray with the same shape as ``mtx``; an empty input
                  yields an empty array, and an input whose elements are all
                  equal is mapped entirely to ``a``
         """
         data = np.asarray(mtx, dtype=float)
         if data.size == 0:
             # np.min / np.max raise on empty input; there is nothing to rescale.
             return data.copy()

         mn = data.min()
         mx = data.max()
         if mx == mn:
             # Constant input: (max - min) is zero, so k is undefined.
             # Map every element to the lower bound instead of producing NaN.
             return np.full(data.shape, float(a))

         k = (b - a) / (mx - mn)     # scale factor from the data range to (a, b)
         # Broadcasting applies the formula element-wise and keeps the shape,
         # so no ravel / reshape round trip is needed.
         return a + k * (data - mn)
    20 
    21 
    22 def standard(mtx):
    23     """
    24     z-score standardlization: require data has approximate Gaussian distribution, otherwise will be worse
    25     result: standard data obey N(0, 1)
    26     formula: X* = (X - X_mean) / sqrt(var)
    27     """
    28     data = mtx.ravel()
    29     size = mtx.shape
    30     m = np.mean(data)    # 初始均值
    31     var = sum([(i-m)**2 for i in data]) / np.size(data)     # 初始方差
    32 
    33     standard_data = [(x - m) / sqrt(var) for x in data]
    34     return np.array(standard_data).reshape(size)
    35 
    36 # ------------------------Test Part-------------------------------
    37 # if __name__ == '__main__':
    38 #     arr = np.array(([1.12, 0.78, 2.33, 3.45, 4.11, 5],
    39 #                    [1, 3, 4, 5, 7, 6.66]))
    40 #
    41 #     norm_arr = normalize(arr)
    42 #     standard_arr = standard(arr)
    43 #     print(norm_arr)
    44 #     print(standard_arr)

    Notes:

    # 矩阵拉伸:将矩阵拉伸成行向量
    ravel(): 返回数组的视图(尽可能不复制数据)
    flatten(): 始终返回数组的副本,需要重新分配空间

    # 矩阵分割:
    1 np.hsplit(mtx, arg)     # 水平分割,arg:一般是列数
    2 np.vsplit(mtx, arg)     # 垂直分割,arg:一般是行数

    # 矩阵组合:
    np.hstack((a, b))
    np.concatenate((a, b), axis=1)          # 水平组合
    print np.vstack((a, b))
    print np.concatenate((a, b), axis=0)    # 垂直组合

    # list_to_ndarray:
    np.array(list)
    arr.tolist()

    2. SVD分解

    import numpy as np
    from numpy import linalg,sqrt
    
    
    def mtx_svd(mtx):
        """
        Full singular value decomposition: M = U @ D @ V.T

        U and V are derived from a single decomposition so their column signs
        are consistent with each other. Taking them from two independent
        eigendecompositions of M.M^T and M^T.M leaves the signs uncoupled,
        and the product U D V^T then does not reproduce M in general.

        :param mtx: 2-D array-like of shape (m, n)
        :return: tuple (U, D, V) where
                 U is (m, m), columns are the left singular vectors,
                 D is (m, n), singular values in descending order on the main
                 diagonal and zeros elsewhere,
                 V is (n, n), columns are the right singular vectors
                 (note: V, not V^T).
        """
        M = np.asarray(mtx, dtype=float)

        # linalg.svd returns the singular values already sorted in descending
        # order and returns V transposed.
        U, sin_val, V_T = linalg.svd(M, full_matrices=True)

        # Build the rectangular singular-value matrix D so that the product
        # U @ D @ V.T is well defined for non-square M.
        D = np.zeros(M.shape)
        k = len(sin_val)
        D[:k, :k] = np.diag(sin_val)

        return U, D, V_T.T
    
    # -------------------------------Test Part-------------------------------
    # if __name__ == '__main__':
    #     mtx = np.array(([1, 2, 3],
    #                     [2, 4, 7],
    #                     [3, 7, 10],
    #                     [4, 8, 5],
    #                     [6, 9, 7]))
    #     u,d,v = mtx_svd(mtx)
    #     print(u,d,v)

    Notes:

    # argsort(): 将矩阵从小到大排序,并提取对应的index list
    argsort()[::-1]: 将索引逆置
    # 按照特征值顺序对对应特征向量排序
    1. 降序排列特征值,并得到其索引
    eval_sorted_index = np.argsort(A)[::-1]
    2. 利用列表推导式排序特征值
    sorted_eval = [eval[i] for i in eval_sorted_index]
    3. 排序对应特征向量
    sorted_evecs = evecs[:, eval_sorted_index]
  • 相关阅读:
    codeforces C. No to Palindromes!
    codeforces D. Pashmak and Parmida's problem
    codeforces C. Little Pony and Expected Maximum
    codeforces D. Count Good Substrings
    codeforces C. Jzzhu and Chocolate
    codeforces C. DZY Loves Sequences
    codeforces D. Multiplication Table
    codeforces C. Painting Fence
    hdu 5067 Harry And Dig Machine
    POJ 1159 Palindrome
  • 原文地址:https://www.cnblogs.com/KrianJ/p/12178169.html
Copyright © 2011-2022 走看看