  • TensorFlow kNN for house-price prediction; note the Min-Max Scaling of the features

    Sample data (rows from the Boston housing dataset; the 14 columns are the ones listed in the code comments below):

    0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00
     0.02731   0.00   7.070  0  0.4690  6.4210  78.90  4.9671   2  242.0  17.80 396.90   9.14  21.60
     0.02729   0.00   7.070  0  0.4690  7.1850  61.10  4.9671   2  242.0  17.80 392.83   4.03  34.70
     0.03237   0.00   2.180  0  0.4580  6.9980  45.80  6.0622   3  222.0  18.70 394.63   2.94  33.40
     0.06905   0.00   2.180  0  0.4580  7.1470  54.20  6.0622   3  222.0  18.70 396.90   5.33  36.20
     0.02985   0.00   2.180  0  0.4580  6.4300  58.70  6.0622   3  222.0  18.70 394.12   5.21  28.70
     0.08829  12.50   7.870  0  0.5240  6.0120  66.60  5.5605   5  311.0  15.20 395.60  12.43  22.90
     0.14455  12.50   7.870  0  0.5240  6.1720  96.10  5.9505   5  311.0  15.20 396.90  19.15  27.10
     0.21124  12.50   7.870  0  0.5240  5.6310 100.00  6.0821   5  311.0  15.20 386.63  29.93  16.50
     0.17004  12.50   7.870  0  0.5240  6.0040  85.90  6.5921   5  311.0  15.20 386.71  17.10  18.90
     0.22489  12.50   7.870  0  0.5240  6.3770  94.30  6.3467   5  311.0  15.20 392.52  20.45  15.00
     0.11747  12.50   7.870  0  0.5240  6.0090  82.90  6.2267   5  311.0  15.20 396.90  13.27  18.90
     0.09378  12.50   7.870  0  0.5240  5.8890  39.00  5.4509   5  311.0  15.20 390.50  15.71  21.70
     0.62976   0.00   8.140  0  0.5380  5.9490  61.80  4.7075   4  307.0  21.00 396.90   8.26  20.40
     0.63796   0.00   8.140  0  0.5380  6.0960  84.50  4.4619   4  307.0  21.00 380.02  10.26  18.20
     0.62739   0.00   8.140  0  0.5380  5.8340  56.50  4.4986   4  307.0  21.00 395.62   8.47  19.90
     1.05393   0.00   8.140  0  0.5380  5.9350  29.30  4.4986   4  307.0  21.00 386.85   6.58  23.10
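
    For orientation, here is a minimal sketch (not part of the original script) that maps one of the rows above onto the 14 Boston housing column names used later in the code:

    # Map one whitespace-separated row onto the Boston housing column names.
    header = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
              'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
    row = '0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00'
    values = [float(x) for x in row.split()]
    print(dict(zip(header, values)))   # {'CRIM': 0.00632, 'ZN': 18.0, ..., 'MEDV': 24.0}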

    The code follows. Note that numpy's ptp() ("peak to peak") returns the difference between the maximum and the minimum, which is exactly the denominator Min-Max Scaling divides by; a short illustration comes first, then the full script.
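
    As a quick illustration (a toy array, for reference only), ptp(0) is the per-column range max - min, and dividing by it rescales every column into [0, 1]:

    import numpy as np

    a = np.array([[1.0, 10.0],
                  [2.0, 30.0],
                  [4.0, 50.0]])
    print(a.ptp(0))                     # [ 3. 40.]  == a.max(0) - a.min(0)
    scaled = (a - a.min(0)) / a.ptp(0)  # column-wise Min-Max Scaling
    print(scaled)                       # every column now lies in [0, 1]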

    # k-Nearest Neighbor
    #----------------------------------
    #
    # This function illustrates how to use
    # k-nearest neighbors in tensorflow
    #
    # We will use the 1970s Boston housing dataset
    # which is available through the UCI
    # ML data repository.
    #
    # Data:
    #----------x-values-----------
    # CRIM   : per capita crime rate by town
    # ZN     : prop. of res. land zones
    # INDUS  : prop. of non-retail business acres
    # CHAS   : Charles river dummy variable
    # NOX    : nitric oxides concentration (parts per 10 million)
    # RM     : Avg. # of rooms per building
    # AGE    : prop. of buildings built prior to 1940
    # DIS    : Weighted distances to employment centers
    # RAD    : Index of accessibility to radial highways
    # TAX    : Full tax rate value per $10k
    # PTRATIO: Pupil/Teacher ratio by town
    # B      : 1000*(Bk-0.63)^2, Bk=prop. of blacks
    # LSTAT  : % lower status of pop
    #------------y-value-----------
    # MEDV   : Median Value of homes in $1,000's
    
    import matplotlib.pyplot as plt
    import numpy as np
    import tensorflow as tf
    import requests
    from tensorflow.python.framework import ops
    ops.reset_default_graph()
    
    # Create graph
    sess = tf.Session()
    
    # Load the data
    housing_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
    housing_header = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
    cols_used = ['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT']
    num_features = len(cols_used)
    housing_file = requests.get(housing_url)
    housing_data = [[float(x) for x in y.split(' ') if len(x)>=1] for y in housing_file.text.split('\n') if len(y)>=1]
    
    y_vals = np.transpose([np.array([y[13] for y in housing_data])])
    x_vals = np.array([[x for i,x in enumerate(y) if housing_header[i] in cols_used] for y in housing_data])
    
    ## Min-Max Scaling
    x_vals = (x_vals - x_vals.min(0)) / x_vals.ptp(0)
    
    # Split the data into train and test sets
    np.random.seed(13)  #make results reproducible
    train_indices = np.random.choice(len(x_vals), round(len(x_vals)*0.8), replace=False)
    test_indices = np.array(list(set(range(len(x_vals))) - set(train_indices)))
    x_vals_train = x_vals[train_indices]
    x_vals_test = x_vals[test_indices]
    y_vals_train = y_vals[train_indices]
    y_vals_test = y_vals[test_indices]
    
    # Declare k-value and batch size
    k = 4
    batch_size=len(x_vals_test)
    
    # Placeholders
    x_data_train = tf.placeholder(shape=[None, num_features], dtype=tf.float32)
    x_data_test = tf.placeholder(shape=[None, num_features], dtype=tf.float32)
    y_target_train = tf.placeholder(shape=[None, 1], dtype=tf.float32)
    y_target_test = tf.placeholder(shape=[None, 1], dtype=tf.float32)
    
    # Declare distance metric
    # L1
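    # Broadcasting note: expand_dims gives x_data_test shape (num_test, 1, num_features);
    # subtracting x_data_train (num_train, num_features) broadcasts to (num_test, num_train, num_features),
    # and summing |differences| over axis 2 yields an L1 distance matrix of shape (num_test, num_train).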
    distance = tf.reduce_sum(tf.abs(tf.subtract(x_data_train, tf.expand_dims(x_data_test,1))), axis=2)
    
    # L2
    #distance = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(x_data_train, tf.expand_dims(x_data_test,1))), reduction_indices=1))
    
    # Predict: Get min distance index (Nearest neighbor)
    #prediction = tf.arg_min(distance, 0)
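    # tf.nn.top_k on the negated distances selects the k nearest neighbours; each neighbour is then
    # weighted by its share of the summed (negated) top-k distances, so the k weights of every test
    # point sum to 1 and the prediction below is a weighted average of the neighbours' target values.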
    top_k_xvals, top_k_indices = tf.nn.top_k(tf.negative(distance), k=k)
    x_sums = tf.expand_dims(tf.reduce_sum(top_k_xvals, 1),1)
    x_sums_repeated = tf.matmul(x_sums,tf.ones([1, k], tf.float32))
    x_val_weights = tf.expand_dims(tf.div(top_k_xvals,x_sums_repeated), 1)
    
    top_k_yvals = tf.gather(y_target_train, top_k_indices)
    prediction = tf.squeeze(tf.matmul(x_val_weights,top_k_yvals), axis=[1])
    
    # Calculate MSE
    mse = tf.div(tf.reduce_sum(tf.square(tf.subtract(prediction, y_target_test))), batch_size)
    
    # Calculate how many loops over training data
    num_loops = int(np.ceil(len(x_vals_test)/batch_size))
    
    for i in range(num_loops):
        min_index = i*batch_size
        max_index = min((i+1)*batch_size,len(x_vals_test))  # slice stays within the test set
        x_batch = x_vals_test[min_index:max_index]
        y_batch = y_vals_test[min_index:max_index]
        predictions = sess.run(prediction, feed_dict={x_data_train: x_vals_train, x_data_test: x_batch,
                                             y_target_train: y_vals_train, y_target_test: y_batch})
        batch_mse = sess.run(mse, feed_dict={x_data_train: x_vals_train, x_data_test: x_batch,
                                             y_target_train: y_vals_train, y_target_test: y_batch})
    
        print('Batch #' + str(i+1) + ' MSE: ' + str(np.round(batch_mse,3)))
    
    # Plot prediction and actual distribution
    bins = np.linspace(5, 50, 45)
    
    plt.hist(predictions, bins, alpha=0.5, label='Prediction')
    plt.hist(y_batch, bins, alpha=0.5, label='Actual')
    plt.title('Histogram of Predicted and Actual Values')
    plt.xlabel('Med Home Value in $1,000s')
    plt.ylabel('Frequency')
    plt.legend(loc='upper right')
    plt.show()
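
    The script above targets TensorFlow 1.x graph mode (tf.Session / tf.placeholder). As a version-independent reference, here is a minimal NumPy sketch of the same idea: L1 distances, the k nearest training points, and a weighted average of their target values. It uses inverse-distance weights (a common choice) instead of the distance-share weights built into the graph above, so the exact numbers will differ slightly.

    import numpy as np

    def knn_predict(x_train, y_train, x_test, k=4, eps=1e-8):
        y_train = np.asarray(y_train).reshape(-1)             # flatten (N, 1) targets to (N,)
        # L1 (Manhattan) distances, shape (num_test, num_train)
        dists = np.abs(x_test[:, None, :] - x_train[None, :, :]).sum(axis=2)
        nn_idx = np.argsort(dists, axis=1)[:, :k]             # k nearest neighbours per test row
        nn_dists = np.take_along_axis(dists, nn_idx, axis=1)
        weights = 1.0 / (nn_dists + eps)                      # inverse-distance weights
        weights /= weights.sum(axis=1, keepdims=True)         # normalise to sum to 1
        return (weights * y_train[nn_idx]).sum(axis=1)        # weighted average of targets

    # e.g.: preds = knn_predict(x_vals_train, y_vals_train, x_vals_test, k=4)
    #       mse = np.mean((preds - y_vals_test.reshape(-1)) ** 2)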
    

     

  • Original article: https://www.cnblogs.com/bonelee/p/9005495.html