zoukankan html css js c++ java

数据处理方法基础

1. 归一化和标准化

 1 import numpy as np
 2 from numpy import sqrt
 3 
 4 
 5 def normalize(mtx, a=0, b=1):
 6     """
 7     normalization: remove dimensional effects
 8     range (a, b), default:(0, 1)
 9     formula: x* = a + k(x - min) / x* = b + k(x - max), k = (b - a)/(max - min)
10     """
11     data = mtx.ravel()          # 矩阵拉伸
12     size = mtx.shape            # mtx's shape
13     mx = np.max(data)           # max
14     mn = np.min(data)           # min
15     k = (b - a)/(mx - mn)       # step of (a, b)
16 
17     # 需要将矩阵拉伸，否则会报错：only size-1 arrays can be converted to Python scalars
18     norm_data =  [a + k*(float(i) - mn) for i in data]
19     return np.array(norm_data).reshape(size)
20 
21 
def standard(mtx):
    """
    Z-score standardization over the whole array.

    Works best when the data is approximately Gaussian; the result has
    mean 0 and (population) standard deviation 1.

    formula: X* = (X - X_mean) / sqrt(var), var = mean((X - X_mean)^2)

    :param mtx: array-like of real numbers, any shape
    :return: float ndarray with the same shape as ``mtx``
    """
    data = np.asarray(mtx, dtype=float)
    m = data.mean()
    # Population standard deviation (divides by N, ddof=0).
    std = data.std()

    # Constant input has zero variance; dividing would produce NaN, so
    # return the centered data (all zeros) instead.
    if std == 0:
        return np.zeros(data.shape)

    return (data - m) / std
35 
36 # ------------------------Test Part-------------------------------
37 # if __name__ == '__main__':
38 #     arr = np.array(([1.12, 0.78, 2.33, 3.45, 4.11, 5],
39 #                    [1, 3, 4, 5, 7, 6.66]))
40 #
41 #     norm_arr = normalize(arr)
42 #     standard_arr = standard(arr)
43 #     print(norm_arr)
44 #     print(standard_arr)

Notes:

# 矩阵拉伸：将矩阵拉伸成行向量
    ravel(): 返回数组的视图（不复制数据）
    flatten(): 返回真实数组（副本），需要重新分配空间

# 矩阵分割：

1 np.hsplit(mtx, arg)     # 水平分割，arg:一般是列数
2 np.vsplit(mtx, arg)     # 垂直分割，arg:一般是行数


# 矩阵组合：

np.hstack((a, b))
np.concatenate((a, b), axis=1)          # 水平组合
print(np.vstack((a, b)))
print(np.concatenate((a, b), axis=0))    # 垂直组合


# list_to_ndarray:

np.array(list)
arr.tolist()

2. SVD分解

import numpy as np
from numpy import linalg,sqrt


def mtx_svd(mtx):
    """
    Singular value decomposition of a real matrix via eigendecomposition.

    With r = D.shape[0] (the numerical rank), the factors satisfy
        M = U[:, :r] @ D @ V[:, :r].T

    :param mtx: real 2-D array-like of shape (m, n)
    :return: tuple (U, D, V)
        U: (m, m) left singular vectors as columns
        D: (r, r) diagonal matrix of the non-zero singular values, descending
        V: (n, n) right singular vectors as columns (not transposed)
    :raises ValueError: if ``mtx`` is not 2-dimensional
    """
    M = np.asarray(mtx, dtype=float)
    if M.ndim != 2:
        raise ValueError("mtx must be a 2-D matrix")

    # Right singular vectors: eigenvectors of M^T M. The Gram matrix is
    # symmetric, so eigh is used; it returns real eigenvalues and an
    # orthonormal basis, whereas eig may return complex values.
    evals_v, evecs_v = np.linalg.eigh(M.T @ M)
    order_v = np.argsort(evals_v)[::-1]            # descending eigenvalues
    # Round-off can make zero eigenvalues slightly negative; clip them so
    # the square root below never yields NaN.
    evals_v = np.clip(evals_v[order_v], 0.0, None)
    V = evecs_v[:, order_v]

    # Eigenvalues below a relative tolerance are treated as zero; an exact
    # `!= 0` comparison does not detect numerically-zero values.
    largest = evals_v[0] if evals_v.size else 0.0
    tol = max(M.shape) * np.finfo(float).eps * largest
    rank = int(np.count_nonzero(evals_v > tol))

    # Diagonal matrix of the non-zero singular values.
    sin_val = np.sqrt(evals_v[:rank])
    D = np.diag(sin_val)

    # Left singular vectors: start from the eigenvectors of M M^T, which
    # supply an orthonormal basis for the columns beyond the rank.
    evals_u, evecs_u = np.linalg.eigh(M @ M.T)
    U = evecs_u[:, np.argsort(evals_u)[::-1]]
    # Eigenvectors are only defined up to sign (or rotation for repeated
    # eigenvalues), so derive the leading columns from V to keep U and V
    # consistent: u_i = M v_i / sigma_i.
    U[:, :rank] = (M @ V[:, :rank]) / sin_val

    return U, D, V

# -------------------------------Test Part-------------------------------
# if __name__ == '__main__':
#     mtx = np.array(([1, 2, 3],
#                     [2, 4, 7],
#                     [3, 7, 10],
#                     [4, 8, 5],
#                     [6, 9, 7]))
#     u,d,v = mtx_svd(mtx)
#     print(u,d,v)

Notes:

# argsort(): 将矩阵从小到大排序，并提取对应的index list
    argsort()[::-1]: 将索引逆置
# 按照特征值顺序对对应特征向量排序
    1. 降序排列特征值，并得到其索引
        eval_sorted_index = np.argsort(A)[::-1]
    2. 利用列表表达式排序特征值
        sorted_eval = [eval[i] for i in eval_sorted_index]
    3. 排序对应特征向量
        sorted_evecs = evecs[:, eval_sorted_index]

查看全文

相关阅读:
codeforces C. No to Palindromes!
codeforces D. Pashmak and Parmida's problem
codeforces C. Little Pony and Expected Maximum
codeforces D. Count Good Substrings
codeforces C. Jzzhu and Chocolate
codeforces C. DZY Loves Sequences
codeforces D. Multiplication Table
codeforces C. Painting Fence
hdu 5067 Harry And Dig Machine
POJ 1159 Palindrome

原文地址：https://www.cnblogs.com/KrianJ/p/12178169.html