zoukankan      html  css  js  c++  java
  • optim.py cs231n

    如果有错误,欢迎指出,不胜感激

    import numpy as np
    
    """
    This file implements various first-order update rules that are commonly used for
    training neural networks. Each update rule accepts current weights and the
    gradient of the loss with respect to those weights and produces the next set of
    weights. Each update rule has the same interface:
    
    def update(w, dw, config=None):
    
    Inputs:
      - w: A numpy array giving the current weights.
      - dw: A numpy array of the same shape as w giving the gradient of the
        loss with respect to w.
      - config: A dictionary containing hyperparameter values such as learning rate,
        momentum, etc. If the update rule requires caching values over many
        iterations, then config will also hold these cached values.
    
    Returns:
      - next_w: The next point after the update.
      - config: The config dictionary to be passed to the next iteration of the
        update rule.
    
    NOTE: For most update rules, the default learning rate will probably not perform
    well; however the default values of the other hyperparameters should work well
    for a variety of different problems.
    
    For efficiency, update rules may perform in-place updates, mutating w and
    setting next_w equal to w.
    """
    
    
    def sgd(w, dw, config=None):
        """
        Performs vanilla stochastic gradient descent.

        config format:
        - learning_rate: Scalar learning rate.

        The update is applied in place: w itself is modified and returned as
        next_w.
        """
        if config is None:
            config = {}
        config.setdefault('learning_rate', 1e-2)

        w -= config['learning_rate'] * dw
        return w, config


    def sgd_momentum(w, dw, config=None):
        """
        Performs stochastic gradient descent with momentum.

        config format:
        - learning_rate: Scalar learning rate.
        - momentum: Scalar between 0 and 1 giving the momentum value.
          Setting momentum = 0 reduces to sgd.
        - velocity: A numpy array of the same shape as w and dw used to store a
          moving average of the gradients.

        Returns a new array for next_w (w is not modified) and stores the updated
        velocity in config['velocity'].
        """
        if config is None:
            config = {}
        config.setdefault('learning_rate', 1e-2)
        config.setdefault('momentum', 0.9)

        # Only allocate a zero velocity on the first call, when none is cached.
        v = config.get('velocity')
        if v is None:
            v = np.zeros_like(w)

        # Decay the previous velocity, then step along the negative gradient.
        v = config['momentum'] * v - config['learning_rate'] * dw
        next_w = w + v

        config['velocity'] = v
        return next_w, config


    def rmsprop(x, dx, config=None):
        """
        Uses the RMSProp update rule, which uses a moving average of squared
        gradient values to set adaptive per-parameter learning rates.

        config format:
        - learning_rate: Scalar learning rate.
        - decay_rate: Scalar between 0 and 1 giving the decay rate for the squared
          gradient cache.
        - epsilon: Small scalar used for smoothing to avoid dividing by zero.
        - cache: Moving average of second moments of gradients.

        Returns a new array for next_x (x is not modified) and stores the updated
        cache in config['cache'].
        """
        if config is None:
            config = {}
        config.setdefault('learning_rate', 1e-2)
        config.setdefault('decay_rate', 0.99)
        config.setdefault('epsilon', 1e-8)
        config.setdefault('cache', np.zeros_like(x))

        decay = config['decay_rate']
        cache = decay * config['cache'] + (1 - decay) * dx ** 2

        # epsilon is added to the root, not placed under it, so that it acts as
        # the smoothing term itself rather than as sqrt(epsilon).
        next_x = x - config['learning_rate'] * dx / (np.sqrt(cache) + config['epsilon'])

        config['cache'] = cache
        return next_x, config


    def adam(x, dx, config=None):
        """
        Uses the Adam update rule, which incorporates moving averages of both the
        gradient and its square and a bias correction term.

        config format:
        - learning_rate: Scalar learning rate.
        - beta1: Decay rate for moving average of first moment of gradient.
        - beta2: Decay rate for moving average of second moment of gradient.
        - epsilon: Small scalar used for smoothing to avoid dividing by zero.
        - m: Moving average of gradient.
        - v: Moving average of squared gradient.
        - t: Iteration number.

        Returns a new array for next_x (x is not modified). config['t'] is
        incremented and config['m'] / config['v'] hold the updated, uncorrected
        moving averages.
        """
        if config is None:
            config = {}
        config.setdefault('learning_rate', 1e-3)
        config.setdefault('beta1', 0.9)
        config.setdefault('beta2', 0.999)
        config.setdefault('epsilon', 1e-8)
        config.setdefault('m', np.zeros_like(x))
        config.setdefault('v', np.zeros_like(x))
        config.setdefault('t', 0)

        beta1, beta2 = config['beta1'], config['beta2']

        # The step counter is advanced before use so the first call has t == 1
        # and the bias-correction denominators below are non-zero.
        config['t'] += 1
        t = config['t']

        # Moving averages of the gradient and of the squared gradient.
        m = beta1 * config['m'] + (1 - beta1) * dx
        v = beta2 * config['v'] + (1 - beta2) * dx ** 2
        config['m'] = m
        config['v'] = v

        # Bias-corrected estimates; kept in separate names so the cached
        # (uncorrected) averages are never confused with them.
        m_hat = m / (1 - beta1 ** t)
        v_hat = v / (1 - beta2 ** t)

        # epsilon is added to the root, not placed under it.
        next_x = x - config['learning_rate'] * m_hat / (np.sqrt(v_hat) + config['epsilon'])
        return next_x, config

      

    n

  • 相关阅读:
    [C语言]数据类型与计算
    [C语言]变量VS常量
    [C语言]在命令行编译执行程序
    [cocos2d-x]游戏开发基础(图)
    [cocos2d-x]移动平台游戏开发(图)
    [cocos2d-x]认识游戏开发(图)
    [jQ/PHP]再谈使用JS数组储值的运用(提交PHP处理)
    [Nginx]Nginx的基本配置与优化1(完整配置示例与虚拟主机配置)
    [JS]如何理解JS中的类和对象
    [jPlayer]一分钟部署jPlayer
  • 原文地址:https://www.cnblogs.com/sfzyk/p/6731090.html
Copyright © 2011-2022 走看看