# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

# pylint: disable=unused-import,g-bad-import-order
"""Contains the normalization layer classes and their functional aliases.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import six
from six.moves import xrange  # pylint: disable=redefined-builtin
import numpy as np

from tensorflow.python.keras import layers as keras_layers
from tensorflow.python.layers import base
from tensorflow.python.ops import init_ops
from tensorflow.python.util.tf_export import tf_export


@tf_export('layers.BatchNormalization')
class BatchNormalization(keras_layers.BatchNormalization, base.Layer):
  """Batch Normalization layer from http://arxiv.org/abs/1502.03167.

  "Batch Normalization: Accelerating Deep Network Training by Reducing
  Internal Covariate Shift"

  Sergey Ioffe, Christian Szegedy

  Arguments:
    axis: An `int` or list of `int`, the axis or axes that should be
      normalized, typically the features axis/axes. For instance, after a
      `Conv2D` layer with `data_format="channels_first"`, set `axis=1`. If a
      list of axes is provided, each axis in `axis` will be normalized
      simultaneously. Default is `-1`, which uses the last axis. Note: when
      using multi-axis batch norm, the `beta`, `gamma`, `moving_mean`, and
      `moving_variance` variables are the same rank as the input Tensor, with
      dimension size 1 in all reduced (non-axis) dimensions.
    momentum: Momentum for the moving average.
    epsilon: Small float added to variance to avoid dividing by zero.
    center: If True, add offset of `beta` to normalized tensor. If False,
      `beta` is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is not used. When
      the next layer is linear (this also applies to e.g. `nn.relu`), this can
      be disabled since the scaling can be done by the next layer.
    beta_initializer: Initializer for the beta weight.
    gamma_initializer: Initializer for the gamma weight.
    moving_mean_initializer: Initializer for the moving mean.
    moving_variance_initializer: Initializer for the moving variance.
    beta_regularizer: Optional regularizer for the beta weight.
    gamma_regularizer: Optional regularizer for the gamma weight.
    beta_constraint: An optional projection function to be applied to the
      `beta` weight after being updated by an `Optimizer` (e.g. used to
      implement norm constraints or value constraints for layer weights). The
      function must take as input the unprojected variable and must return the
      projected variable (which must have the same shape). Constraints are not
      safe to use when doing asynchronous distributed training.
    gamma_constraint: An optional projection function to be applied to the
      `gamma` weight after being updated by an `Optimizer`.
    renorm: Whether to use Batch Renormalization
      (https://arxiv.org/abs/1702.03275). This adds extra variables during
      training. The inference is the same for either value of this parameter.
    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
      scalar `Tensors` used to clip the renorm correction. The correction
      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax,
      rmin, dmax are set to inf, 0, inf, respectively.
    renorm_momentum: Momentum used to update the moving means and standard
      deviations with renorm. Unlike `momentum`, this affects training and
      should be neither too small (which would add noise) nor too large (which
      would give stale estimates). Note that `momentum` is still applied to
      get the means and variances for inference.
    fused: if `None` or `True`, use a faster, fused implementation if
      possible. If `False`, use the system recommended implementation.
    trainable: Boolean, if `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
      which means batch normalization is performed across the whole batch.
      When `virtual_batch_size` is not `None`, instead perform "Ghost Batch
      Normalization", which creates virtual sub-batches which are each
      normalized separately (with shared gamma, beta, and moving statistics).
      Must divide the actual batch size during execution.
    adjustment: A function taking the `Tensor` containing the (dynamic) shape
      of the input tensor and returning a pair (scale, bias) to apply to the
      normalized values (before gamma and beta), only during training. For
      example, if axis==-1,
        `adjustment = lambda shape: (
          tf.random_uniform(shape[-1:], 0.93, 1.07),
          tf.random_uniform(shape[-1:], -0.1, 0.1))`
      will scale the normalized value by up to 7% up or down, then shift the
      result by up to 0.1 (with independent scaling and bias for each feature
      but shared across all examples), and finally apply gamma and/or beta.
      If `None`, no adjustment is applied. Cannot be specified if
      virtual_batch_size is specified.
    name: A string, the name of the layer.
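
  A minimal usage sketch (graph mode; the names `x` and `training` and the
  input shape are illustrative, not part of this API):

  ```python
  x = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])
  training = tf.placeholder(tf.bool, shape=[])
  bn = tf.layers.BatchNormalization(axis=-1, momentum=0.99)
  y = bn(x, training=training)  # batch statistics when training is True
  ```

  The layer adds its moving-average update ops to `tf.GraphKeys.UPDATE_OPS`;
  see `tf.layers.batch_normalization` below for how to run them together with
  the train op.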
"""
|
|
|
|
  def __init__(self,
               axis=-1,
               momentum=0.99,
               epsilon=1e-3,
               center=True,
               scale=True,
               beta_initializer=init_ops.zeros_initializer(),
               gamma_initializer=init_ops.ones_initializer(),
               moving_mean_initializer=init_ops.zeros_initializer(),
               moving_variance_initializer=init_ops.ones_initializer(),
               beta_regularizer=None,
               gamma_regularizer=None,
               beta_constraint=None,
               gamma_constraint=None,
               renorm=False,
               renorm_clipping=None,
               renorm_momentum=0.99,
               fused=None,
               trainable=True,
               virtual_batch_size=None,
               adjustment=None,
               name=None,
               **kwargs):
    super(BatchNormalization, self).__init__(
        axis=axis,
        momentum=momentum,
        epsilon=epsilon,
        center=center,
        scale=scale,
        beta_initializer=beta_initializer,
        gamma_initializer=gamma_initializer,
        moving_mean_initializer=moving_mean_initializer,
        moving_variance_initializer=moving_variance_initializer,
        beta_regularizer=beta_regularizer,
        gamma_regularizer=gamma_regularizer,
        beta_constraint=beta_constraint,
        gamma_constraint=gamma_constraint,
        renorm=renorm,
        renorm_clipping=renorm_clipping,
        renorm_momentum=renorm_momentum,
        fused=fused,
        trainable=trainable,
        virtual_batch_size=virtual_batch_size,
        adjustment=adjustment,
        name=name,
        **kwargs)

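  # Unlike the Keras parent's default, `training` defaults to False here, so
  # calling the layer without an explicit `training` argument runs it in
  # inference mode (i.e. with the moving statistics).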
  def call(self, inputs, training=False):
    return super(BatchNormalization, self).call(inputs, training=training)


@tf_export('layers.batch_normalization')
def batch_normalization(inputs,
                        axis=-1,
                        momentum=0.99,
                        epsilon=1e-3,
                        center=True,
                        scale=True,
                        beta_initializer=init_ops.zeros_initializer(),
                        gamma_initializer=init_ops.ones_initializer(),
                        moving_mean_initializer=init_ops.zeros_initializer(),
                        moving_variance_initializer=init_ops.ones_initializer(),
                        beta_regularizer=None,
                        gamma_regularizer=None,
                        beta_constraint=None,
                        gamma_constraint=None,
                        training=False,
                        trainable=True,
                        name=None,
                        reuse=None,
                        renorm=False,
                        renorm_clipping=None,
                        renorm_momentum=0.99,
                        fused=None,
                        virtual_batch_size=None,
                        adjustment=None):
"""Functional interface for the batch normalization layer.
|
|
|
|
Reference: http://arxiv.org/abs/1502.03167
|
|
|
|
"Batch Normalization: Accelerating Deep Network Training by Reducing
|
|
Internal Covariate Shift"
|
|
|
|
Sergey Ioffe, Christian Szegedy
|
|
|
|
Note: when training, the moving_mean and moving_variance need to be updated.
|
|
By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
|
|
need to be added as a dependency to the `train_op`. Also, be sure to add
|
|
any batch_normalization ops before getting the update_ops collection.
|
|
Otherwise, update_ops will be empty, and training/inference will not work
|
|
properly. For example:
|
|
|
|
```python
|
|
x_norm = tf.layers.batch_normalization(x, training=training)
|
|
|
|
# ...
|
|
|
|
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
|
|
with tf.control_dependencies(update_ops):
|
|
train_op = optimizer.minimize(loss)
|
|
```
|
|
|
|
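
  To switch between training and evaluation at run time without rebuilding
  the graph, a boolean tensor can be fed for `training`; a minimal sketch
  (the names `x` and `is_training` are illustrative only):

  ```python
  is_training = tf.placeholder_with_default(False, shape=())
  x_norm = tf.layers.batch_normalization(x, training=is_training)
  # Feed {is_training: True} on training steps; the default (False) keeps
  # evaluation on the moving statistics.
  ```
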
  Arguments:
    inputs: Tensor input.
    axis: An `int`, the axis that should be normalized (typically the features
      axis). For instance, after a `Convolution2D` layer with
      `data_format="channels_first"`, set `axis=1` in `BatchNormalization`.
    momentum: Momentum for the moving average.
    epsilon: Small float added to variance to avoid dividing by zero.
    center: If True, add offset of `beta` to normalized tensor. If False,
      `beta` is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is not used. When
      the next layer is linear (this also applies to e.g. `nn.relu`), this can
      be disabled since the scaling can be done by the next layer.
    beta_initializer: Initializer for the beta weight.
    gamma_initializer: Initializer for the gamma weight.
    moving_mean_initializer: Initializer for the moving mean.
    moving_variance_initializer: Initializer for the moving variance.
    beta_regularizer: Optional regularizer for the beta weight.
    gamma_regularizer: Optional regularizer for the gamma weight.
    beta_constraint: An optional projection function to be applied to the
      `beta` weight after being updated by an `Optimizer` (e.g. used to
      implement norm constraints or value constraints for layer weights). The
      function must take as input the unprojected variable and must return the
      projected variable (which must have the same shape). Constraints are not
      safe to use when doing asynchronous distributed training.
    gamma_constraint: An optional projection function to be applied to the
      `gamma` weight after being updated by an `Optimizer`.
    training: Either a Python boolean, or a TensorFlow boolean scalar tensor
      (e.g. a placeholder). Whether to return the output in training mode
      (normalized with statistics of the current batch) or in inference mode
      (normalized with moving statistics). **NOTE**: make sure to set this
      parameter correctly, or else your training/inference will not work
      properly.
    trainable: Boolean, if `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    name: String, the name of the layer.
    reuse: Boolean, whether to reuse the weights of a previous layer by the
      same name.
    renorm: Whether to use Batch Renormalization
      (https://arxiv.org/abs/1702.03275). This adds extra variables during
      training. The inference is the same for either value of this parameter.
    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
      scalar `Tensors` used to clip the renorm correction. The correction
      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax,
      rmin, dmax are set to inf, 0, inf, respectively.
    renorm_momentum: Momentum used to update the moving means and standard
      deviations with renorm. Unlike `momentum`, this affects training and
      should be neither too small (which would add noise) nor too large (which
      would give stale estimates). Note that `momentum` is still applied to
      get the means and variances for inference.
    fused: if `None` or `True`, use a faster, fused implementation if
      possible. If `False`, use the system recommended implementation.
    virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
      which means batch normalization is performed across the whole batch.
      When `virtual_batch_size` is not `None`, instead perform "Ghost Batch
      Normalization", which creates virtual sub-batches which are each
      normalized separately (with shared gamma, beta, and moving statistics).
      Must divide the actual batch size during execution.
    adjustment: A function taking the `Tensor` containing the (dynamic) shape
      of the input tensor and returning a pair (scale, bias) to apply to the
      normalized values (before gamma and beta), only during training. For
      example, if axis==-1,
        `adjustment = lambda shape: (
          tf.random_uniform(shape[-1:], 0.93, 1.07),
          tf.random_uniform(shape[-1:], -0.1, 0.1))`
      will scale the normalized value by up to 7% up or down, then shift the
      result by up to 0.1 (with independent scaling and bias for each feature
      but shared across all examples), and finally apply gamma and/or beta.
      If `None`, no adjustment is applied. Cannot be specified if
      virtual_batch_size is specified.

  Returns:
    Output tensor.

  Raises:
    ValueError: if eager execution is enabled.
  """
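  # This functional interface is a thin wrapper: build a BatchNormalization
  # layer (routing `reuse` and `name` into the variable-scope machinery via
  # the private `_reuse`/`_scope` arguments) and apply it to `inputs`.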
  layer = BatchNormalization(
      axis=axis,
      momentum=momentum,
      epsilon=epsilon,
      center=center,
      scale=scale,
      beta_initializer=beta_initializer,
      gamma_initializer=gamma_initializer,
      moving_mean_initializer=moving_mean_initializer,
      moving_variance_initializer=moving_variance_initializer,
      beta_regularizer=beta_regularizer,
      gamma_regularizer=gamma_regularizer,
      beta_constraint=beta_constraint,
      gamma_constraint=gamma_constraint,
      renorm=renorm,
      renorm_clipping=renorm_clipping,
      renorm_momentum=renorm_momentum,
      fused=fused,
      trainable=trainable,
      virtual_batch_size=virtual_batch_size,
      adjustment=adjustment,
      name=name,
      _reuse=reuse,
      _scope=name)
  return layer.apply(inputs, training=training)


# Aliases

BatchNorm = BatchNormalization
batch_norm = batch_normalization