Best Python code snippet using avocado_python
Source:bn.py
from __future__ import absolute_import, print_function, division
import numpy
import theano
from theano import Apply, Op
from theano.gof import local_optimizer
from theano.gof.opt import copy_stack_trace
from theano.tensor import as_tensor_variable, TensorType
from theano.tensor import basic as T
from theano.tensor.opt import register_specialize_device
from theano.scalar import Composite, as_common_dtype
from theano.scalar import add, sub, true_div, mul
class BNComposite(Composite):
    init_param = ('dtype',)
    @theano.configparser.change_flags(compute_test_value='off')
    def __init__(self, dtype):
        self.dtype = dtype
        x = theano.scalar.Scalar(dtype=dtype).make_variable()
        mean = theano.scalar.Scalar(dtype=dtype).make_variable()
        std = theano.scalar.Scalar(dtype=dtype).make_variable()
        gamma = theano.scalar.Scalar(dtype=dtype).make_variable()
        beta = theano.scalar.Scalar(dtype=dtype).make_variable()
        o = add(mul(true_div(sub(x, mean), std), gamma), beta)
        inputs = [x, mean, std, gamma, beta]
        outputs = [o]
        super(BNComposite, self).__init__(inputs, outputs)
    def grad(self, inps, grads):
        x, mean, std, gamma, beta = inps
        top, = grads
        top_gamma = top * gamma
        x_mean = x - mean
        dx = top_gamma / std
        dmean = -dx
        dstd = -(top_gamma * x_mean) / (std * std)
        dgamma = top * x_mean / std
        return [dx, dmean, dstd, dgamma, top]
def batch_normalization(inputs, gamma, beta, mean, std,
                        mode='low_mem'):
    """
    This function will build the symbolic graph for applying batch normalization
    to a set of activations.
    Also works on GPUs, but is not optimized using cuDNN.
    .. versionadded:: 0.7.1
    Parameters
    ----------
    inputs : symbolic tensor
        Mini-batch of activations
    gamma: symbolic tensor
        BN scale parameter, must be of same dimensionality as
        inputs and broadcastable against it
    beta: symbolic tensor
        BN shift parameter, must be of same dimensionality as
        inputs and broadcastable against it
    mean: symbolic tensor
        inputs means, must be of same dimensionality as
        inputs and broadcastable against it
    std: symbolic tensor
        inputs standard deviation, must be of same dimensionality as
        inputs and broadcastable against it
    mode: 'low_mem' or 'high_mem'
        Specify which batch_normalization implementation will be used.
        Because no intermediate representations are stored for back-propagation,
        the 'low_mem' implementation lowers memory usage; however, it is 5-10%
        slower than the 'high_mem' implementation. Note that this 5-10% time
        difference concerns the batch_normalization operation only; the difference
        between implementations is likely to matter less on the full model fprop/bprop.
    """
    if mode == 'low_mem':
        elm_bn = theano.tensor.elemwise.Elemwise(scalar_op=BNComposite(dtype=inputs.dtype))
        rval = elm_bn(inputs, mean, std, gamma, beta)
    elif mode == 'high_mem':
        rval = (inputs - mean) * (gamma / std) + beta
    else:
        raise ValueError(
            'mode must be either "low_mem", "high_mem"')
    return rval
def _prepare_batch_normalization_axes(axes, ndim):
    if axes == 'per-activation':
        axes = (0,)
    elif axes == 'spatial':
        axes = (0,) + tuple(range(2, ndim))
    elif isinstance(axes, (tuple, list, numpy.ndarray)):
        axes = tuple(int(a) for a in axes)
    else:
        raise ValueError('invalid axes: %s', str(axes))
    axes = tuple(sorted(axes))
    if len(axes) == 0:
        raise ValueError('there should be at least one normalization axis')
    if min(axes) < 0 or max(axes) >= ndim:
        raise ValueError('axes should be less than ndim (<%d), but %s given' % (ndim, str(axes)))
    non_bc_axes = tuple(i for i in range(ndim) if i not in axes)
    return axes, non_bc_axes
def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
                              epsilon=1e-4, running_average_factor=0.1,
                              running_mean=None, running_var=None):
    """
    Performs batch normalization of the given inputs, using the mean and
    variance of the inputs.
    Parameters
    ----------
    axes : 'per-activation', 'spatial' or a tuple of ints
        The axes along which the input should be normalized. ``'per-activation'``
        normalizes per activation and is equal to ``axes=(0,)``.
        ``'spatial'`` shares normalization factors across spatial dimensions
        (i.e., all dimensions past the second), which for 4D inputs would be
        equal to ``axes=(0, 2, 3)``.
    gamma : tensor
        Learnable scale factors. The shape must match the shape of `inputs`,
        except for the axes in `axes`. These axes should be set to 1 or be
        skipped altogether (such that `gamma.ndim == inputs.ndim - len(axes)`).
    beta : tensor
        Learnable biases. Must match the tensor layout of `gamma`.
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    running_average_factor : float
        Factor for updating the values of `running_mean` and `running_var`.
        If the factor is close to one, the running averages will update quickly;
        if the factor is close to zero they will update slowly.
    running_mean : tensor or None
        Previous value of the running mean. If this is given, the new value
        ``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
        will be returned as one of the outputs of this function.
        `running_mean` and `running_var` should either both be given or
        both be None. The shape should match that of `gamma` and `beta`.
    running_var : tensor or None
        Previous value of the running variance. If this is given, the new value
        ``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
        will be returned as one of the outputs of this function,
        where `m` is the product of lengths of the averaged-over dimensions.
        `running_mean` and `running_var` should either both be given or
        both be None. The shape should match that of `gamma` and `beta`.
    Returns
    -------
    out : tensor
        Batch-normalized inputs.
    mean : tensor
        Means of `inputs` across the normalization axes.
    invstd : tensor
        Inverse standard deviations of `inputs` across the normalization axes.
    new_running_mean : tensor
        New value of the running mean (only if both `running_mean` and
        `running_var` were given).
    new_running_var : tensor
        New value of the running variance (only if both `running_var` and
        `running_mean` were given).
    Notes
    -----
    If per-activation or spatial normalization is selected, this operation
    will use the cuDNN implementation. (This requires cuDNN 5 or newer.)
    The returned values are equivalent to:
    .. code-block:: python
        # for per-activation normalization
        axes = (0,)
        # for spatial normalization
        axes = (0,) + tuple(range(2, inputs.ndim))
        mean = inputs.mean(axes, keepdims=True)
        var = inputs.var(axes, keepdims=True)
        invstd = T.inv(T.sqrt(var + epsilon))
        out = (inputs - mean) * gamma * invstd + beta
        m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
        running_mean = running_mean * (1 - running_average_factor) + \\
                       mean * running_average_factor
        running_var = running_var * (1 - running_average_factor) + \\
                      (m / (m - 1)) * var * running_average_factor
    """
    ndim = inputs.ndim
    axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)
    # have the parameter tensors been broadcasted yet?
    if gamma.ndim == ndim:
        params_ndim = ndim
    else:
        params_ndim = len(non_bc_axes)
        params_dimshuffle_pattern = ['x'] * ndim
        for i, axis in enumerate(non_bc_axes):
            params_dimshuffle_pattern[axis] = i
    if gamma.ndim != params_ndim or beta.ndim != params_ndim:
        raise ValueError("gamma and beta dimensionality must match the "
                         "number of non-normalized axes, or have the "
                         "same number of dimensions as the inputs; "
                         "got %d and %d instead of %d" %
                         (gamma.ndim, beta.ndim, params_ndim))
    if (running_mean is None) != (running_var is None):
        raise ValueError("running_mean and running_var must either both be "
                         "given or both be None")
    if running_mean is not None and running_mean.ndim != params_ndim:
        raise ValueError("running_mean must be of the same dimensionality "
                         "as gamma and beta; got %d instead of %d" %
                         (running_mean.ndim, params_ndim))
    if running_var is not None and running_var.ndim != params_ndim:
        raise ValueError("running_var must be of the same dimensionality "
                         "as gamma and beta; got %d instead of %d" %
                         (running_var.ndim, params_ndim))
    # epsilon will be converted to floatX later. we need to check
    # for rounding errors now, since numpy.float32(1e-5) < 1e-5.
    epsilon = numpy.cast[theano.config.floatX](epsilon)
    if epsilon < 1e-5:
        raise ValueError("epsilon must be at least 1e-5, got %s" % str(epsilon))
    inputs = as_tensor_variable(inputs)
    gamma = as_tensor_variable(gamma)
    beta = as_tensor_variable(beta)
    if params_ndim != ndim:
        gamma = gamma.dimshuffle(params_dimshuffle_pattern)
        beta = beta.dimshuffle(params_dimshuffle_pattern)
    else:
        gamma = T.addbroadcast(gamma, *axes)
        beta = T.addbroadcast(beta, *axes)
    batchnorm_op = AbstractBatchNormTrain(axes=axes)
    if running_mean is not None and running_var is not None:
        running_mean = as_tensor_variable(running_mean)
        running_var = as_tensor_variable(running_var)
        if params_ndim != ndim:
            running_mean = running_mean.dimshuffle(params_dimshuffle_pattern)
            running_var = running_var.dimshuffle(params_dimshuffle_pattern)
        else:
            running_mean = T.addbroadcast(running_mean, *axes)
            running_var = T.addbroadcast(running_var, *axes)
        out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
            inputs, gamma, beta, epsilon=epsilon,
            running_average_factor=running_average_factor,
            running_mean=running_mean, running_var=running_var)
        if new_running_mean.broadcastable != running_mean.broadcastable:
            new_running_mean = T.patternbroadcast(new_running_mean, running_mean.broadcastable)
        if new_running_var.broadcastable != running_var.broadcastable:
            new_running_var = T.patternbroadcast(new_running_var, running_var.broadcastable)
        results = (out, mean, invstd, new_running_mean, new_running_var)
    else:
        results = batchnorm_op(inputs, gamma, beta, epsilon=epsilon)
    if params_ndim != ndim:
        # remove the broadcasted dimensions (except from the output)
        results = ([results[0]] +
                   [r.dimshuffle(non_bc_axes) for r in results[1:]])
    return tuple(results)
def batch_normalization_test(inputs, gamma, beta, mean, var,
                             axes='per-activation', epsilon=1e-4):
    """
    Performs batch normalization of the given inputs, using the given mean and
    variance.
    Parameters
    ----------
    axes : 'per-activation', 'spatial' or a tuple of ints
        The axes along which the input should be normalized. ``'per-activation'``
        normalizes per activation and is equal to ``axes=(0,)``.
        ``'spatial'`` shares normalization factors across spatial dimensions
        (i.e., all dimensions past the second), which for 4D inputs would be
        equal to ``axes=(0, 2, 3)``.
    gamma : tensor
        Scale factors. The shape must match the shape of `inputs`,
        except for the axes in `axes`. These axes should be set to 1 or be
        skipped altogether (such that `gamma.ndim == inputs.ndim - len(axes)`).
    beta : tensor
        Biases. Must match the tensor layout of `gamma`.
    mean : tensor
        Means. Usually these are running averages computed during training.
        Must match the tensor layout of `gamma`.
    var : tensor
        Variances. Usually these are running averages computed during training.
        Must match the tensor layout of `gamma`.
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    Returns
    -------
    out : tensor
        Batch-normalized inputs.
    Notes
    -----
    If per-activation or spatial normalization is selected, this operation
    will use the cuDNN implementation. (This requires cuDNN 5 or newer.)
    The returned value is equivalent to:
    .. code-block:: python
        # for per-activation normalization
        axes = (0,)
        # for spatial normalization
        axes = (0,) + tuple(range(2, inputs.ndim))
        gamma, beta, mean, var = (T.addbroadcast(t, *axes)
                                  for t in (gamma, beta, mean, var))
        out = (inputs - mean) * gamma / T.sqrt(var + epsilon) + beta
    """
    ndim = inputs.ndim
    axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)
    # have the parameter tensors been broadcasted yet?
    if gamma.ndim == ndim:
        params_ndim = ndim
    else:
        params_ndim = len(non_bc_axes)
        params_dimshuffle_pattern = ['x'] * ndim
        for i, axis in enumerate(non_bc_axes):
            params_dimshuffle_pattern[axis] = i
    if gamma.ndim != params_ndim or beta.ndim != params_ndim:
        raise ValueError("gamma and beta dimensionality must match the "
                         "number of non-normalized axes, or have the "
                         "same number of dimensions as the inputs; "
                         "got %d and %d instead of %d" %
                         (gamma.ndim, beta.ndim, params_ndim))
    if mean.ndim != params_ndim or var.ndim != params_ndim:
        raise ValueError("mean and var must be of the same dimensionality "
                         "as gamma and beta; got %d and %d instead of %d" %
                         (mean.ndim, var.ndim, params_ndim))
    # epsilon will be converted to floatX later. we need to check
    # for rounding errors now, since numpy.float32(1e-5) < 1e-5.
    epsilon = numpy.cast[theano.config.floatX](epsilon)
    if epsilon < 1e-5:
        raise ValueError("epsilon must be at least 1e-5, got %s" % str(epsilon))
    gamma = as_tensor_variable(gamma)
    beta = as_tensor_variable(beta)
    mean = as_tensor_variable(mean)
    var = as_tensor_variable(var)
    if params_ndim != ndim:
        gamma = gamma.dimshuffle(params_dimshuffle_pattern)
        beta = beta.dimshuffle(params_dimshuffle_pattern)
        mean = mean.dimshuffle(params_dimshuffle_pattern)
        var = var.dimshuffle(params_dimshuffle_pattern)
    else:
        gamma = T.addbroadcast(gamma, *axes)
        beta = T.addbroadcast(beta, *axes)
        mean = T.addbroadcast(mean, *axes)
        var = T.addbroadcast(var, *axes)
    batchnorm_op = AbstractBatchNormInference(axes=axes)
    return batchnorm_op(inputs, gamma, beta, mean, var, epsilon=epsilon)
class AbstractBatchNormTrain(Op):
    """
    Abstract Op for Batch Normalization.
    Parameters
    ----------
    axes : a tuple of ints
        The axes along which the input should be normalized.
    x : tensor
        The input to be normalized along `axes`.
    scale : tensor
        `scale` should have the same number of dimensions as `x`.
        All dimensions listed in `axes` should have length 1.
    bias : tensor
        `bias` should have the same number of dimensions as `x`.
        All dimensions listed in `axes` should have length 1.
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    running_average_factor : float
        Factor for updating the values of `running_mean` and `running_var`.
        If the factor is close to one, the running averages will update quickly;
        if the factor is close to zero they will update slowly.
    running_mean : tensor or None
        Previous value of the running mean. If this is given, the new value
        ``running_mean * (1 - running_average_factor) + batch mean * running_average_factor``
        will be returned as one of the outputs of this function.
        `running_mean` and `running_var` should either both be given or
        both be None.
    running_var : tensor or None
        Previous value of the running variance. If this is given, the new value
        ``running_var * (1 - running_average_factor) + (m / (m - 1)) * batch var * running_average_factor``
        will be returned as one of the outputs of this function,
        where `m` is the product of lengths of the averaged-over dimensions.
        `running_mean` and `running_var` should either both be given or
        both be None.
    """
    __props__ = ('axes',)
    def __init__(self, axes=(0,)):
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        axes = tuple(int(a) for a in axes)
        self.axes = axes
    def infer_shape(self, node, shape):
        return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)
    def make_node(self, x, scale, bias, epsilon=1e-4,
                  running_average_factor=0.1,
                  running_mean=None, running_var=None):
        x = as_tensor_variable(x)
        scale = as_tensor_variable(scale)
        bias = as_tensor_variable(bias)
        epsilon = as_tensor_variable(epsilon)
        running_average_factor = as_tensor_variable(running_average_factor)
        if running_mean is not None:
            running_mean = as_tensor_variable(running_mean)
        if running_var is not None:
            running_var = as_tensor_variable(running_var)
        assert x.ndim == scale.ndim == bias.ndim
        assert ((running_mean is None and running_var is None) or
                (running_mean is not None and running_var is not None))
        assert (running_mean is None or running_mean.ndim == x.ndim)
        assert (running_var is None or running_var.ndim == x.ndim)
        # Upcast to common dtype on the non-scalar
        # Keep as is dtype of scalar (epsilon and running_average_factor)
        if running_mean:
            x, scale, bias, running_mean, running_var = as_common_dtype(
                x, scale, bias, running_mean, running_var)
        else:
            x, scale, bias = as_common_dtype(x, scale, bias)
        inputs = [x, scale, bias, epsilon, running_average_factor]
        output_types = [x.type(), scale.type(), scale.type()]
        if running_mean is not None and running_var is not None:
            inputs.append(running_mean)
            inputs.append(running_var)
            output_types.append(scale.type())
            output_types.append(scale.type())
        return Apply(self, inputs, output_types)
    def L_op(self, inputs, outputs, grads):
        x, scale, bias, epsilon, running_average_factor = inputs[:5]
        dy = grads[0]
        _, x_mean, x_invstd = outputs[:3]
        disconnected_outputs = [
            theano.gradient.DisconnectedType()(),  # epsilon
            theano.gradient.DisconnectedType()()]  # running_average_factor
        # Optional running_mean and running_var.
        for i in range(5, len(inputs)):
            disconnected_outputs.append(theano.gradient.DisconnectedType()())
        return AbstractBatchNormTrainGrad(self.axes)(
            x, dy, scale, x_mean, x_invstd, epsilon) + disconnected_outputs
    def connection_pattern(self, node):
        # Specify that epsilon and running_average_factor are not connected to outputs.
        patterns = [[True, True, True],     # x
                    [True, True, True],     # scale
                    [True, True, True],     # bias
                    [False, False, False],  # epsilon
                    [False, False, False]]  # running_average_factor
        # Optional running_mean and running_var are only
        # connected to their new values.
        for i in range(5, len(node.inputs)):
            patterns[0].append(True)
            for pattern in patterns[1:]:
                pattern.append(False)
            patterns.append([False] * (3 + i - 5) + [True])
        return patterns
    def perform(self, node, inputs, output_storage):
        x, scale, bias, epsilon, running_average_factor = inputs[:5]
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))
        mean = x.mean(axes, keepdims=True)
        var = x.var(axes, keepdims=True)
        invstd = 1.0 / numpy.sqrt(var + epsilon)
        out = (x - mean) * (scale * invstd) + bias
        output_storage[0][0] = out
        output_storage[1][0] = mean
        output_storage[2][0] = invstd
        if len(inputs) > 5:
            running_mean = inputs[5]
            running_mean = running_mean * (1.0 - running_average_factor) + \
                mean * running_average_factor
            output_storage[3][0] = running_mean
        if len(inputs) > 6:
            m = float(numpy.prod(x.shape) / numpy.prod(scale.shape))
            running_var = inputs[6]
            running_var = running_var * (1.0 - running_average_factor) + \
                (m / (m - 1)) * var * running_average_factor
            output_storage[4][0] = running_var
class AbstractBatchNormInference(Op):
    """
    Abstract Op for Batch Normalization.
    Parameters
    ----------
    axes : a tuple of ints
        The axes along which the input is normalized.
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    """
    __props__ = ('axes',)
    def __init__(self, axes=(0,)):
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        axes = tuple(int(a) for a in axes)
        self.axes = axes
    def infer_shape(self, node, shape):
        return [shape[0]]
    def make_node(self, x, scale, bias, estimated_mean, estimated_variance, epsilon=1e-4):
        x = as_tensor_variable(x)
        scale = as_tensor_variable(scale)
        bias = as_tensor_variable(bias)
        estimated_mean = as_tensor_variable(estimated_mean)
        estimated_variance = as_tensor_variable(estimated_variance)
        epsilon = as_tensor_variable(epsilon)
        # Upcast to common dtype on the non-scalar
        # Keep as is dtype of scalar (epsilon)
        x, scale, bias, estimated_mean, estimated_variance = as_common_dtype(
            x, scale, bias, estimated_mean, estimated_variance)
        assert x.ndim == scale.ndim == bias.ndim == estimated_mean.ndim == estimated_variance.ndim
        return Apply(self, [x, scale, bias, estimated_mean, estimated_variance, epsilon], [x.type()])
    def grad(self, inputs, grads):
        x, scale, bias, est_mean, est_var, epsilon = inputs
        dy = grads[0]
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))
        scale, bias, est_mean, est_var = (theano.tensor.addbroadcast(t, *axes)
                                          for t in (scale, bias, est_mean, est_var))
        # define helper expressions
        est_var_eps = est_var + epsilon
        est_std = theano.tensor.sqrt(est_var_eps)
        two = theano.tensor.constant(2.)
        # define and return gradients
        dx = dy * (scale / est_std)
        dscale = (dy * (x - est_mean)).sum(axes, keepdims=True) / est_std
        dbias = dy.sum(axes, keepdims=True)
        dmean = -dy.sum(axes, keepdims=True) * (scale / est_std)
        dvar = -(dy * (x - est_mean)).sum(axes, keepdims=True) * (scale / (two * est_var_eps * est_std))
        return [dx, dscale, dbias, dmean, dvar, theano.gradient.DisconnectedType()()]
    def connection_pattern(self, node):
        # Specify that epsilon is not connected to outputs.
        return [[True], [True], [True], [True], [True], [False]]
    def perform(self, node, inputs, output_storage):
        x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs
        out = (x - estimated_mean) * (scale / numpy.sqrt(estimated_variance + epsilon)) + bias
        output_storage[0][0] = out
class AbstractBatchNormTrainGrad(Op):
    __props__ = ('axes',)
    def __init__(self, axes=(0,)):
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        axes = tuple(int(a) for a in axes)
        self.axes = axes
    def make_node(self, x, dy, scale, x_mean, x_invstd, epsilon=1e-4):
        x = as_tensor_variable(x)
        dy = as_tensor_variable(dy)
        scale = as_tensor_variable(scale)
        x_mean = as_tensor_variable(x_mean)
        x_invstd = as_tensor_variable(x_invstd)
        epsilon = as_tensor_variable(epsilon)
        # Upcast to common dtype on the non-scalar
        # Keep as is dtype of scalar (epsilon)
        x, dy, scale, x_mean, x_invstd = as_common_dtype(
            x, dy, scale, x_mean, x_invstd)
        assert x.ndim == dy.ndim == scale.ndim == x_mean.ndim == x_invstd.ndim
        return Apply(self, [x, dy, scale, x_mean, x_invstd, epsilon],
                     [x.type(), scale.type(), scale.type()])
    def infer_shape(self, node, shape):
        return [shape[0], shape[2], shape[2]]
    def perform(self, node, inputs, output_storage):
        x, dy, scale, x_mean, x_invstd, epsilon = inputs
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))
        x_diff = x - x_mean
        mean_dy_x_diff = numpy.mean(dy * x_diff, axis=axes, keepdims=True)
        c = (dy * x_invstd) - (x_diff * mean_dy_x_diff * (x_invstd ** 3))
        g_wrt_inputs = scale * (c - numpy.mean(c, axis=axes, keepdims=True))
        g_wrt_scale = numpy.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
        g_wrt_bias = numpy.sum(dy, axis=axes, keepdims=True)
        output_storage[0][0] = g_wrt_inputs
        output_storage[1][0] = g_wrt_scale
        output_storage[2][0] = g_wrt_bias
@local_optimizer([AbstractBatchNormTrain])
def local_abstract_batch_norm_train(node):
    if not isinstance(node.op, AbstractBatchNormTrain):
        return None
    x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
    axes = node.op.axes
    if min(axes) < 0 or max(axes) > x.ndim:
        return None
    if not isinstance(x.type, TensorType) or \
       not isinstance(scale.type, TensorType) or \
       not isinstance(bias.type, TensorType) or \
       not isinstance(epsilon.type, TensorType) or \
       not isinstance(running_average_factor.type, TensorType):
        return None
    # optional running_mean and running_var
    if len(node.inputs) > 5 and not isinstance(node.inputs[5].type, TensorType):
        return None
    if len(node.inputs) > 6 and not isinstance(node.inputs[6].type, TensorType):
        return None
    mean = x.mean(axes, keepdims=True)
    var = x.var(axes, keepdims=True)
    # The epsilon should not upcast the dtype.
    if var.dtype == 'float32' and epsilon.dtype == 'float64':
        epsilon = epsilon.astype('float32')
    invstd = T.inv(T.sqrt(var + epsilon))
    out = (x - mean) * (scale * invstd) + bias
    results = [out, mean, invstd]
    if len(node.inputs) > 5:
        running_mean = node.inputs[5]
        running_mean = running_mean * (1.0 - running_average_factor) + \
            mean * running_average_factor
        results.append(running_mean)
    if len(node.inputs) > 6:
        m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
        running_var = node.inputs[6]
        running_var = running_var * (1.0 - running_average_factor) + \
            (m / (m - 1)) * var * running_average_factor
        results.append(running_var)
    results = [T.patternbroadcast(r, r_orig.broadcastable)
               for (r, r_orig) in zip(results, node.outputs)]
    for var in theano.gof.graph.variables(node.inputs, results):
        if var not in node.inputs:
            copy_stack_trace(node.outputs[0], var)
    return results
@local_optimizer([AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad(node):
    if not isinstance(node.op, AbstractBatchNormTrainGrad):
        return None
    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
    axes = node.op.axes
    if min(axes) < 0 or max(axes) > x.ndim:
        return None
    if not isinstance(x.type, TensorType) or \
       not isinstance(dy.type, TensorType) or \
       not isinstance(scale.type, TensorType) or \
       not isinstance(x_mean.type, TensorType) or \
       not isinstance(x_invstd.type, TensorType) or \
       not isinstance(epsilon.type, TensorType):
        return None
    x_diff = x - x_mean
    mean_dy_x_diff = T.mean(dy * x_diff, axis=axes, keepdims=True)
    c = (dy * x_invstd) - x_diff * (mean_dy_x_diff * (x_invstd ** 3))
    g_wrt_inputs = scale * (c - T.mean(c, axis=axes, keepdims=True))
    g_wrt_scale = T.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
    g_wrt_bias = T.sum(dy, axis=axes, keepdims=True)
    results = [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
    results = [T.patternbroadcast(r, r_orig.broadcastable)
               for (r, r_orig) in zip(results, node.outputs)]
    for var in theano.gof.graph.variables(node.inputs, results):
        if var not in node.inputs:
            copy_stack_trace(node.outputs[0], var)
    return results
@local_optimizer([AbstractBatchNormInference])
def local_abstract_batch_norm_inference(node):
    if not isinstance(node.op, AbstractBatchNormInference):
        return None
    x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs
    if not isinstance(x.type, TensorType) or \
       not isinstance(scale.type, TensorType) or \
       not isinstance(bias.type, TensorType) or \
       not isinstance(estimated_mean.type, TensorType) or \
       not isinstance(estimated_variance.type, TensorType) or \
       not isinstance(epsilon.type, TensorType):
        return None
    # The epsilon should not upcast the dtype.
    if estimated_variance.dtype == 'float32' and epsilon.dtype == 'float64':
        epsilon = epsilon.astype('float32')
    result = (x - estimated_mean) * (scale / T.sqrt(estimated_variance + epsilon)) + bias
    result = T.patternbroadcast(result, node.outputs[0].broadcastable)
    for var in theano.gof.graph.variables(node.inputs, [result]):
        if var not in node.inputs:
            copy_stack_trace(node.outputs[0], var)
    return [result]
# Register CPU optimization
bn_groupopt = theano.gof.optdb.LocalGroupDB()
bn_groupopt.__name__ = 'batchnorm_opts'
register_specialize_device(bn_groupopt, 'fast_compile', 'fast_run')
bn_groupopt.register('local_abstract_batch_norm_train',
                     local_abstract_batch_norm_train, 30,
                     'fast_compile', 'fast_run')
bn_groupopt.register('local_abstract_batch_norm_train_grad',
                     local_abstract_batch_norm_train_grad, 30,
                     'fast_compile', 'fast_run')
bn_groupopt.register('local_abstract_batch_norm_inference',
                     local_abstract_batch_norm_inference, 30,
...
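To see how the two public entry points above fit together, here is a minimal usage sketch. It assumes upstream Theano, where this module is importable as theano.tensor.nnet.bn; the variable names and shapes are illustrative only. The training graph returns updated running averages, and the inference graph consumes them.

# Minimal sketch: spatial batch normalization of a 4D activation tensor
# (assumes this module is available as theano.tensor.nnet.bn, as in upstream Theano).
import numpy
import theano
import theano.tensor as T
from theano.tensor.nnet import bn

x = T.tensor4('x')
gamma = theano.shared(numpy.ones((1, 3, 1, 1), dtype='float32'), name='gamma')
beta = theano.shared(numpy.zeros((1, 3, 1, 1), dtype='float32'), name='beta')
running_mean = theano.shared(numpy.zeros((1, 3, 1, 1), dtype='float32'))
running_var = theano.shared(numpy.ones((1, 3, 1, 1), dtype='float32'))

# Training graph: normalizes with batch statistics and yields new running averages.
out, mean, invstd, new_mean, new_var = bn.batch_normalization_train(
    x, gamma, beta, axes='spatial', epsilon=1e-4,
    running_average_factor=0.1,
    running_mean=running_mean, running_var=running_var)
train_fn = theano.function(
    [x], out,
    updates=[(running_mean, new_mean), (running_var, new_var)])

# Inference graph: normalizes with the stored running statistics.
test_out = bn.batch_normalization_test(
    x, gamma, beta, running_mean, running_var, axes='spatial', epsilon=1e-4)
test_fn = theano.function([x], test_out)

During compilation, the local optimizers registered at the bottom of the file rewrite the abstract ops into the plain tensor expressions shown in the docstrings (or into cuDNN ops on GPU targets).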
Source:layers.py  
import numpy as np
def affine_forward(x, w, b):
    """
    Computes the forward pass for an affine (fully-connected) layer.
    The input x has shape (N, d_1, ..., d_k) where x[i] is the ith input.
    We multiply this against a weight matrix of shape (D, M) where
    D = \prod_i d_i
    Inputs:
    x - Input data, of shape (N, d_1, ..., d_k)
    w - Weights, of shape (D, M)
    b - Biases, of shape (M,)
    Returns a tuple of:
    - out: output, of shape (N, M)
    - cache: (x, w, b)
    """
    out = x.reshape(x.shape[0], -1).dot(w) + b
    cache = (x, w, b)
    return out, cache
def affine_backward(dout, cache):
    """
    Computes the backward pass for an affine layer.
    Inputs:
    - dout: Upstream derivative, of shape (N, M)
    - cache: Tuple of:
      - x: Input data, of shape (N, d_1, ... d_k)
      - w: Weights, of shape (D, M)
    Returns a tuple of:
    - dx: Gradient with respect to x, of shape (N, d_1, ..., d_k)
    - dw: Gradient with respect to w, of shape (D, M)
    - db: Gradient with respect to b, of shape (M,)
    """
    x, w, b = cache
    dx = dout.dot(w.T).reshape(x.shape)
    dw = x.reshape(x.shape[0], -1).T.dot(dout)
    db = np.sum(dout, axis=0)
    return dx, dw, db
def relu_forward(x):
    """
    Computes the forward pass for a layer of rectified linear units (ReLUs).
    Input:
    - x: Inputs, of any shape
    Returns a tuple of:
    - out: Output, of the same shape as x
    - cache: x
    """
    out = np.maximum(0, x)
    cache = x
    return out, cache
def relu_backward(dout, cache):
    """
    Computes the backward pass for a layer of rectified linear units (ReLUs).
    Input:
    - dout: Upstream derivatives, of any shape
    - cache: Input x, of same shape as dout
    Returns:
    - dx: Gradient with respect to x
    """
    x = cache
    dx = np.where(x > 0, dout, 0)
    return dx
def batchnorm_forward(x, gamma, beta, bn_param):
    """
    Forward pass for batch normalization.
    During training the sample mean and (uncorrected) sample variance are
    computed from minibatch statistics and used to normalize the incoming data.
    During training we also keep an exponentially decaying running mean of the mean
    and variance of each feature, and these averages are used to normalize data
    at test-time.
    At each timestep we update the running averages for mean and variance using
    an exponential decay based on the momentum parameter:
    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var = momentum * running_var + (1 - momentum) * sample_var
    Note that the batch normalization paper suggests a different test-time
    behavior: they compute sample mean and variance for each feature using a
    large number of training images rather than using a running average. For
    this implementation we have chosen to use running averages instead since
    they do not require an additional estimation step; the torch7 implementation
    of batch normalization also uses running averages.
    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - bn_param: Dictionary with the following keys:
      - mode: 'train' or 'test'; required
      - eps: Constant for numeric stability
      - momentum: Constant for running mean / variance.
      - running_mean: Array of shape (D,) giving running mean of features
      - running_var: Array of shape (D,) giving running variance of features
    Returns a tuple of:
    - out: of shape (N, D)
    - cache: A tuple of values needed in the backward pass
    """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)
    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))
    out, cache = None, None
    if mode == 'train':
        # Compute output
        mu = x.mean(axis=0)
        xc = x - mu
        var = np.mean(xc ** 2, axis=0)
        std = np.sqrt(var + eps)
        xn = xc / std
        out = gamma * xn + beta
        cache = (mode, x, gamma, xc, std, xn, out)
        # Update running average of mean
        running_mean *= momentum
        running_mean += (1 - momentum) * mu
        # Update running average of variance
        running_var *= momentum
        running_var += (1 - momentum) * var
    elif mode == 'test':
        # Using running mean and variance to normalize
        std = np.sqrt(running_var + eps)
        xn = (x - running_mean) / std
        out = gamma * xn + beta
        cache = (mode, x, xn, gamma, beta, std)
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)
    # Store the updated running means back into bn_param
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var
    return out, cache
def batchnorm_backward(dout, cache):
    """
    Backward pass for batch normalization.
    For this implementation, you should write out a computation graph for
    batch normalization on paper and propagate gradients backward through
    intermediate nodes.
    Inputs:
    - dout: Upstream derivatives, of shape (N, D)
    - cache: Variable of intermediates from batchnorm_forward.
    Returns a tuple of:
    - dx: Gradient with respect to inputs x, of shape (N, D)
    - dgamma: Gradient with respect to scale parameter gamma, of shape (D,)
    - dbeta: Gradient with respect to shift parameter beta, of shape (D,)
    """
    mode = cache[0]
    if mode == 'train':
        mode, x, gamma, xc, std, xn, out = cache
        N = x.shape[0]
        dbeta = dout.sum(axis=0)
        dgamma = np.sum(xn * dout, axis=0)
        dxn = gamma * dout
        dxc = dxn / std
        dstd = -np.sum((dxn * xc) / (std * std), axis=0)
        dvar = 0.5 * dstd / std
        dxc += (2.0 / N) * xc * dvar
        dmu = np.sum(dxc, axis=0)
        dx = dxc - dmu / N
    elif mode == 'test':
        mode, x, xn, gamma, beta, std = cache
        dbeta = dout.sum(axis=0)
        dgamma = np.sum(xn * dout, axis=0)
        dxn = gamma * dout
        dx = dxn / std
    else:
        raise ValueError(mode)
    return dx, dgamma, dbeta
def spatial_batchnorm_forward(x, gamma, beta, bn_param):
    """
    Computes the forward pass for spatial batch normalization.
    Inputs:
    - x: Input data of shape (N, C, H, W)
    - gamma: Scale parameter, of shape (C,)
    - beta: Shift parameter, of shape (C,)
    - bn_param: Dictionary with the following keys:
      - mode: 'train' or 'test'; required
      - eps: Constant for numeric stability
      - momentum: Constant for running mean / variance. momentum=0 means that
        old information is discarded completely at every time step, while
        momentum=1 means that new information is never incorporated. The
        default of momentum=0.9 should work well in most situations.
      - running_mean: Array of shape (D,) giving running mean of features
      - running_var: Array of shape (D,) giving running variance of features
    Returns a tuple of:
    - out: Output data, of shape (N, C, H, W)
    - cache: Values needed for the backward pass
    """
    N, C, H, W = x.shape
    x_flat = x.transpose(0, 2, 3, 1).reshape(-1, C)
    out_flat, cache = batchnorm_forward(x_flat, gamma, beta, bn_param)
    out = out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)
    return out, cache
def spatial_batchnorm_backward(dout, cache):
    """
    Computes the backward pass for spatial batch normalization.
    Inputs:
    - dout: Upstream derivatives, of shape (N, C, H, W)
    - cache: Values from the forward pass
    Returns a tuple of:
    - dx: Gradient with respect to inputs, of shape (N, C, H, W)
    - dgamma: Gradient with respect to scale parameter, of shape (C,)
    - dbeta: Gradient with respect to shift parameter, of shape (C,)
    """
    N, C, H, W = dout.shape
    dout_flat = dout.transpose(0, 2, 3, 1).reshape(-1, C)
    dx_flat, dgamma, dbeta = batchnorm_backward(dout_flat, cache)
    dx = dx_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)
    return dx, dgamma, dbeta
def svm_loss(x, y):
    """
    Computes the loss and gradient for multiclass SVM classification.
    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
      for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C
    Returns a tuple of:
    - loss: Scalar giving the loss
    - dx: Gradient of the loss with respect to x
    """
    N = x.shape[0]
    correct_class_scores = x[np.arange(N), y]
    margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0)
    margins[np.arange(N), y] = 0
    loss = np.sum(margins) / N
    num_pos = np.sum(margins > 0, axis=1)
    dx = np.zeros_like(x)
    dx[margins > 0] = 1
    dx[np.arange(N), y] -= num_pos
    dx /= N
    return loss, dx
def softmax_loss(x, y):
    """
    Computes the loss and gradient for softmax classification.
    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
      for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C
    Returns a tuple of:
    - loss: Scalar giving the loss
    - dx: Gradient of the loss with respect to x
    """
    probs = np.exp(x - np.max(x, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    N = x.shape[0]
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
...
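A small sanity check of the batchnorm_forward / batchnorm_backward pair above, assuming the listing is saved as layers.py so the functions can be imported. It runs a few training passes so the running averages accumulate, inspects the normalized output, and then switches to test mode.

# Quick check for the NumPy batch-norm implementation above (assumes it is saved as layers.py).
import numpy as np
from layers import batchnorm_forward, batchnorm_backward

np.random.seed(0)
N, D = 64, 10
x = 2.0 * np.random.randn(N, D) + 3.0
gamma = np.ones(D)
beta = np.zeros(D)
bn_param = {'mode': 'train', 'momentum': 0.9, 'eps': 1e-5}

# A few training passes; running_mean/running_var are stored back into bn_param each time.
for _ in range(50):
    out, cache = batchnorm_forward(x, gamma, beta, bn_param)

print('train output mean (~0):', out.mean(axis=0)[:3])
print('train output std  (~1):', out.std(axis=0)[:3])

# Backward pass returns gradients shaped like x, gamma and beta.
dout = np.random.randn(N, D)
dx, dgamma, dbeta = batchnorm_backward(dout, cache)
print(dx.shape, dgamma.shape, dbeta.shape)

# Test mode normalizes with the accumulated running statistics instead of batch statistics.
bn_param['mode'] = 'test'
test_out, _ = batchnorm_forward(x, gamma, beta, bn_param)
print('train/test outputs close:', np.allclose(out, test_out, atol=1e-1))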
Source:batch_norm.py  
# encoding: utf-8
"""
@author:  liaoxingyu
@contact: sherlockliao01@gmail.com
"""
import logging
import torch
import torch.nn.functional as F
from torch import nn
__all__ = ["IBN", "get_norm"]
class BatchNorm(nn.BatchNorm2d):
    def __init__(self, num_features, eps=1e-05, momentum=0.1, weight_freeze=False, bias_freeze=False, weight_init=1.0,
                 bias_init=0.0, **kwargs):
        super().__init__(num_features, eps=eps, momentum=momentum)
        if weight_init is not None: nn.init.constant_(self.weight, weight_init)
        if bias_init is not None: nn.init.constant_(self.bias, bias_init)
        self.weight.requires_grad_(not weight_freeze)
        self.bias.requires_grad_(not bias_freeze)
class SyncBatchNorm(nn.SyncBatchNorm):
    def __init__(self, num_features, eps=1e-05, momentum=0.1, weight_freeze=False, bias_freeze=False, weight_init=1.0,
                 bias_init=0.0):
        super().__init__(num_features, eps=eps, momentum=momentum)
        if weight_init is not None: nn.init.constant_(self.weight, weight_init)
        if bias_init is not None: nn.init.constant_(self.bias, bias_init)
        self.weight.requires_grad_(not weight_freeze)
        self.bias.requires_grad_(not bias_freeze)
class IBN(nn.Module):
    def __init__(self, planes, bn_norm, **kwargs):
        super(IBN, self).__init__()
        half1 = int(planes / 2)
        self.half = half1
        half2 = planes - half1
        self.IN = nn.InstanceNorm2d(half1, affine=True)
        self.BN = get_norm(bn_norm, half2, **kwargs)
    def forward(self, x):
        split = torch.split(x, self.half, 1)
        out1 = self.IN(split[0].contiguous())
        out2 = self.BN(split[1].contiguous())
        out = torch.cat((out1, out2), 1)
        return out
class GhostBatchNorm(BatchNorm):
    def __init__(self, num_features, num_splits=1, **kwargs):
        super().__init__(num_features, **kwargs)
        self.num_splits = num_splits
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))
    def forward(self, input):
        N, C, H, W = input.shape
        if self.training or not self.track_running_stats:
            self.running_mean = self.running_mean.repeat(self.num_splits)
            self.running_var = self.running_var.repeat(self.num_splits)
            outputs = F.batch_norm(
                input.view(-1, C * self.num_splits, H, W), self.running_mean, self.running_var,
                self.weight.repeat(self.num_splits), self.bias.repeat(self.num_splits),
                True, self.momentum, self.eps).view(N, C, H, W)
            self.running_mean = torch.mean(self.running_mean.view(self.num_splits, self.num_features), dim=0)
            self.running_var = torch.mean(self.running_var.view(self.num_splits, self.num_features), dim=0)
            return outputs
        else:
            return F.batch_norm(
                input, self.running_mean, self.running_var,
                self.weight, self.bias, False, self.momentum, self.eps)
class FrozenBatchNorm(nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.
    It contains non-trainable buffers called
    "weight" and "bias", "running_mean", "running_var",
    initialized to perform identity transformation.
    The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
    which are computed from the original four parameters of BN.
    The affine transform `x * weight + bias` will perform the equivalent
    computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
    When loading a backbone model from Caffe2, "running_mean" and "running_var"
    will be left unchanged as identity transformation.
    Other pre-trained backbone models may contain all 4 parameters.
    The forward is implemented by `F.batch_norm(..., training=False)`.
    """
    _version = 3
    def __init__(self, num_features, eps=1e-5, **kwargs):
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.register_buffer("weight", torch.ones(num_features))
        self.register_buffer("bias", torch.zeros(num_features))
        self.register_buffer("running_mean", torch.zeros(num_features))
        self.register_buffer("running_var", torch.ones(num_features) - eps)
    def forward(self, x):
        if x.requires_grad:
            # When gradients are needed, F.batch_norm will use extra memory
            # because its backward op computes gradients for weight/bias as well.
            scale = self.weight * (self.running_var + self.eps).rsqrt()
            bias = self.bias - self.running_mean * scale
            scale = scale.reshape(1, -1, 1, 1)
            bias = bias.reshape(1, -1, 1, 1)
            return x * scale + bias
        else:
            # When gradients are not needed, F.batch_norm is a single fused op
            # and provides more optimization opportunities.
            return F.batch_norm(
                x,
                self.running_mean,
                self.running_var,
                self.weight,
                self.bias,
                training=False,
                eps=self.eps,
            )
    def _load_from_state_dict(
            self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        version = local_metadata.get("version", None)
        if version is None or version < 2:
            # No running_mean/var in early versions.
            # This silences the warnings.
            if prefix + "running_mean" not in state_dict:
                state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
            if prefix + "running_var" not in state_dict:
                state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)
        if version is not None and version < 3:
            logger = logging.getLogger(__name__)
            logger.info("FrozenBatchNorm {} is upgraded to version 3.".format(prefix.rstrip(".")))
            # In version < 3, running_var was used without +eps.
            state_dict[prefix + "running_var"] -= self.eps
        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )
    def __repr__(self):
        return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps)
    @classmethod
    def convert_frozen_batchnorm(cls, module):
        """
        Convert BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.
        Args:
            module (torch.nn.Module):
        Returns:
            If module is BatchNorm/SyncBatchNorm, returns a new module.
            Otherwise, in-place convert module and return it.
        Similar to convert_sync_batchnorm in
        https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
        """
        bn_module = nn.modules.batchnorm
        bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
        res = module
        if isinstance(module, bn_module):
            res = cls(module.num_features)
            if module.affine:
                res.weight.data = module.weight.data.clone().detach()
                res.bias.data = module.bias.data.clone().detach()
            res.running_mean.data = module.running_mean.data
            res.running_var.data = module.running_var.data
            res.eps = module.eps
        else:
            for name, child in module.named_children():
                new_child = cls.convert_frozen_batchnorm(child)
                if new_child is not child:
                    res.add_module(name, new_child)
        return res
def get_norm(norm, out_channels, **kwargs):
    """
    Args:
        norm (str or callable): either one of BN, GhostBN, FrozenBN, GN or SyncBN;
            or a callable that takes a channel number and returns
            the normalization layer as a nn.Module
        out_channels: number of channels for normalization layer
    Returns:
        nn.Module or None: the normalization layer
    """
    if isinstance(norm, str):
        if len(norm) == 0:
            return None
        norm = {
            "BN": BatchNorm,
            "syncBN": SyncBatchNorm,
            "GhostBN": GhostBatchNorm,
            "FrozenBN": FrozenBatchNorm,
            "GN": lambda channels, **args: nn.GroupNorm(32, channels),
        }[norm]
...
Source:precise_bn.py  
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
## https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/precise_bn.py
import itertools
import torch
import torch.nn as nn
import logging
from typing import Iterable, Any
from torch.distributed import ReduceOp, all_reduce
logger = logging.getLogger(__name__)
BN_MODULE_TYPES = (
    torch.nn.BatchNorm1d,
    torch.nn.BatchNorm2d,
    torch.nn.BatchNorm3d,
    torch.nn.SyncBatchNorm,
)
# pyre-fixme[56]: Decorator `torch.no_grad(...)` could not be called, because its
#  type `no_grad` is not callable.
@torch.no_grad()
def update_bn_stats(
    args: Any, model: nn.Module, data_loader: Iterable[Any], num_iters: int = 200  # pyre-ignore
) -> None:
    """
    Recompute and update the batch norm stats to make them more precise. During
    training both BN stats and the weights are changing after every iteration, so
    the running average cannot precisely reflect the actual stats of the
    current model.
    In this function, the BN stats are recomputed with fixed weights, to make
    the running average more precise. Specifically, it computes the true average
    of per-batch mean/variance instead of the running average.
    Args:
        model (nn.Module): the model whose bn stats will be recomputed.
            Note that:
            1. This function will not alter the training mode of the given model.
               Users are responsible for setting the layers that need
               precise-BN to training mode, prior to calling this function.
            2. Be careful if your models contain other stateful layers in
               addition to BN, i.e. layers whose state can change in forward
               iterations.  This function will alter their state. If you wish
               them unchanged, you need to either pass in a submodule without
               those layers, or backup the states.
        data_loader (iterator): an iterator. Produces data as inputs to the model.
        num_iters (int): number of iterations to compute the stats.
    """
    bn_layers = get_bn_modules(model)
    if len(bn_layers) == 0:
        return
    # In order to make the running stats only reflect the current batch, the
    # momentum is disabled.
    # bn.running_mean = (1 - momentum) * bn.running_mean + momentum * batch_mean
    # Setting the momentum to 1.0 to compute the stats without momentum.
    momentum_actual = [bn.momentum for bn in bn_layers]
    if args.rank == 0:
        a = [round(i.running_mean.cpu().numpy().max(), 4) for i in bn_layers]
        logger.info('bn mean max, %s', max(a))
        logger.info(a)
        a = [round(i.running_var.cpu().numpy().max(), 4) for i in bn_layers]
        logger.info('bn var max, %s', max(a))
        logger.info(a)
    for bn in bn_layers:
        # pyre-fixme[16]: `Module` has no attribute `momentum`.
        # bn.running_mean = torch.ones_like(bn.running_mean)
        # bn.running_var = torch.zeros_like(bn.running_var)
        bn.momentum = 1.0
    # Note that PyTorch's running_var means "running average of
    # bessel-corrected batch variance". (PyTorch's BN normalizes by biased
    # variance, but updates EMA by unbiased (bessel-corrected) variance).
    # So we estimate population variance by "simple average of bessel-corrected
    # batch variance". This is the same as in the BatchNorm paper, Sec 3.1.
    # This estimator converges to population variance as long as batch size
    # is not too small, and total #samples for PreciseBN is large enough.
    # Its convergence may be affected by small batch size.
    # Alternatively, one can estimate population variance by the sample variance
    # of all batches combined. However, this needs a way to know the batch size
    # of each batch in this function (otherwise we only have access to the
    # bessel-corrected batch variance given by pytorch), which is an extra
    # requirement.
    running_mean = [torch.zeros_like(bn.running_mean) for bn in bn_layers]
    running_var = [torch.zeros_like(bn.running_var) for bn in bn_layers]
    ind = -1
    for ind, inputs in enumerate(itertools.islice(data_loader, num_iters)):
        with torch.no_grad():
            model(inputs)
        for i, bn in enumerate(bn_layers):
            # Accumulates the bn stats.
            running_mean[i] += (bn.running_mean - running_mean[i]) / (ind + 1)
            running_var[i] += (bn.running_var - running_var[i]) / (ind + 1)
            if torch.sum(torch.isnan(bn.running_mean)) > 0 or torch.sum(torch.isnan(bn.running_var)) > 0:
                raise RuntimeError(
                    "update_bn_stats ERROR(args.rank {}): Got NaN val".format(args.rank))
            if torch.sum(torch.isinf(bn.running_mean)) > 0 or torch.sum(torch.isinf(bn.running_var)) > 0:
                raise RuntimeError(
                    "update_bn_stats ERROR(args.rank {}): Got INf val".format(args.rank))
            if torch.sum(~torch.isfinite(bn.running_mean)) > 0 or torch.sum(~torch.isfinite(bn.running_var)) > 0:
                raise RuntimeError(
                    "update_bn_stats ERROR(args.rank {}): Got INf val".format(args.rank))
    assert ind == num_iters - 1, (
        "update_bn_stats is meant to run for {} iterations, "
        "but the dataloader stops at {} iterations.".format(num_iters, ind)
    )
    for i, bn in enumerate(bn_layers):
        if args.distributed:
            all_reduce(running_mean[i], op=ReduceOp.SUM)
            all_reduce(running_var[i], op=ReduceOp.SUM)
            running_mean[i] = running_mean[i] / args.gpu_nums
            running_var[i] = running_var[i] / args.gpu_nums
        # Sets the precise bn stats.
        # pyre-fixme[16]: `Module` has no attribute `running_mean`.
        bn.running_mean = running_mean[i]
        # pyre-fixme[16]: `Module` has no attribute `running_var`.
        bn.running_var = running_var[i]
        bn.momentum = momentum_actual[i]
    if args.rank == 0:
        a = [round(i.cpu().numpy().max(), 4) for i in running_mean]
        logger.info('bn mean max, %s (%s)', max(a), a)
        a = [round(i.cpu().numpy().max(), 4) for i in running_var]
        logger.info('bn var max, %s (%s)', max(a), a)
def get_bn_modules(model):
    """
    Find all BatchNorm (BN) modules that are in training mode. See
    fvcore.precise_bn.BN_MODULE_TYPES for a list of all modules that are
    included in this search.
    Args:
        model (nn.Module): a model possibly containing BN modules.
    Returns:
        list[nn.Module]: all BN modules in the model.
    """
    # Finds all the bn layers.
    bn_layers = [
        m
        for m in model.modules()
        if m.training and isinstance(m, BN_MODULE_TYPES)
    ]
...
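A single-process usage sketch for update_bn_stats, assuming the listing is saved as precise_bn.py and that the truncated get_bn_modules ends by returning bn_layers (as in the fvcore original). The args object only needs the attributes the function actually reads: rank, distributed, and gpu_nums.

# Recompute precise BN statistics after training (single process, no torch.distributed setup).
from types import SimpleNamespace
import torch
from torch import nn
from precise_bn import update_bn_stats

model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8), nn.ReLU())
model.train()  # update_bn_stats does not change the training mode itself

# Any iterable yielding batches the model accepts will do; here, random tensors.
data_loader = (torch.randn(16, 3, 32, 32) for _ in range(100))

# Minimal stand-in for the args object expected by the listing above.
args = SimpleNamespace(rank=0, distributed=False, gpu_nums=1)

update_bn_stats(args, model, data_loader, num_iters=100)
print(model[1].running_mean[:4], model[1].running_var[:4])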
