diff --git a/dynet/param-init.cc b/dynet/param-init.cc
index 7f9dc8f91..7b81a117c 100644
--- a/dynet/param-init.cc
+++ b/dynet/param-init.cc
@@ -25,8 +25,17 @@ void ParameterInitIdentity::initialize_params(Tensor & values) const {
 }
 
 void ParameterInitGlorot::initialize_params(Tensor & values) const {
   int dims = 0, dim_len = values.d.nd - (lookup ? 1 : 0);
-  for (int i = 0; i < dim_len; ++i) dims += values.d[i];
-  float my_scale = gain * sqrt(3 * dim_len) / sqrt(dims);
+  float my_scale = 0.0;
+  if (dim_len == 4) {
+    // For a convolution, the parameter tensor is (H, W, In, Out).
+    int receptive_field = values.d[0] * values.d[1];
+    // As in other frameworks, m and n are obtained by multiplying the channel counts by the kernel size.
+    dims = values.d[2] * receptive_field + values.d[3] * receptive_field;
+    my_scale = gain * sqrt(6) / sqrt(dims);
+  } else {
+    for (int i = 0; i < dim_len; ++i) dims += values.d[i];
+    my_scale = gain * sqrt(3 * dim_len) / sqrt(dims);
+  }
   TensorTools::randomize_uniform(values, -my_scale, my_scale);
 }
diff --git a/dynet/param-init.h b/dynet/param-init.h
index faf75dc6b..ce50089a1 100644
--- a/dynet/param-init.h
+++ b/dynet/param-init.h
@@ -113,6 +113,7 @@ struct ParameterInitIdentity : public ParameterInit {
 * \ingroup params
 * \brief Initialize with the methods described in [Glorot, 2010](http://www.jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf?hc_location=ufi)
 * \details In order to preserve the variance of the forward and backward flow across layers, the parameters \f$\theta\f$ are initialized such that \f$\mathrm{Var}(\theta)=\frac 2 {n_1+n_2}\f$ where \f$n_1,n_2\f$ are the input and output dim.
+ * \details In the case of 4D tensors (common in convolutional networks) of shape \f$XH,XW,XC,N\f$, the weights are sampled from \f$\mathcal U([-g\sqrt{\frac{6}{d}},g\sqrt{\frac{6}{d}}])\f$ where \f$d = XC \cdot (XH \cdot XW) + N \cdot (XH \cdot XW)\f$
 * Important note : The underlying distribution is uniform (not gaussian)
 *
 * *Note:* This is also known as **Xavier initialization**
diff --git a/examples/mnist/basic-mnist-benchmarks/README.md b/examples/mnist/basic-mnist-benchmarks/README.md
index afe028a75..b74b8aae7 100644
--- a/examples/mnist/basic-mnist-benchmarks/README.md
+++ b/examples/mnist/basic-mnist-benchmarks/README.md
@@ -67,5 +67,5 @@ Batch size: 64, learning rate: 0.01.
 | OS | Device | Framework | Speed | Accuracy (After 20 Epochs)|
 | --- | --- | --- | --- | --- |
 | Ubuntu 16.04 | GeForce GTX 1080 Ti | PyTorch | ~ 4.49±0.11 s per epoch | 98.95% |
-| Ubuntu 16.04 | GeForce GTX 1080 Ti | DyNet (autobatch) | ~ 8.58±0.09 s per epoch | 99.14% |
-| Ubuntu 16.04 | GeForce GTX 1080 Ti | DyNet (minibatch) | ~ 4.13±0.13 s per epoch | 99.16% |
+| Ubuntu 16.04 | GeForce GTX 1080 Ti | DyNet (autobatch) | ~ 8.58±0.09 s per epoch | 98.98% |
+| Ubuntu 16.04 | GeForce GTX 1080 Ti | DyNet (minibatch) | ~ 4.13±0.13 s per epoch | 98.99% |
diff --git a/python/_dynet.pyx b/python/_dynet.pyx
index d4121c8c5..e8e0a74a3 100644
--- a/python/_dynet.pyx
+++ b/python/_dynet.pyx
@@ -519,6 +519,8 @@ cdef class GlorotInitializer(PyInitializer):
 
     If the dimensions of the parameter matrix are :math:`m,n`, the weights are sampled from :math:`\mathcal U([-g\sqrt{\\frac{6}{m+n}},g\sqrt{\\frac{6}{m+n}}])`
 
+    In the case of 4D tensors (common in convolutional networks) of shape :math:`XH,XW,XC,N`, the weights are sampled from :math:`\mathcal U([-g\sqrt{\\frac{6}{d}},g\sqrt{\\frac{6}{d}}])` where :math:`d = XC \\cdot (XH \\cdot XW) + N \\cdot (XH \\cdot XW)`
+
     The gain :math:`g` depends on the activation function :
 
     * :math:`\\text{tanh}` : 1.0
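
For reference, here is a minimal Python sketch of the fan computation that the 4D branch of `ParameterInitGlorot::initialize_params` introduces above. The helper name `glorot_conv_scale` is hypothetical (it is not part of DyNet); it only mirrors the C++ logic for a filter tensor of shape (H, W, In, Out):

```python
import math

def glorot_conv_scale(h, w, c_in, c_out, gain=1.0):
    """Bound of the uniform distribution U(-scale, scale) for a conv filter."""
    receptive_field = h * w                                # kernel size H * W
    d = c_in * receptive_field + c_out * receptive_field   # fan_in + fan_out, each scaled by the kernel size
    return gain * math.sqrt(6) / math.sqrt(d)              # g * sqrt(6 / d)

# Example: a 3x3 filter mapping 16 input channels to 32 output channels.
# d = 16*9 + 32*9 = 432, so the bound is sqrt(6/432) ~= 0.118
print(glorot_conv_scale(3, 3, 16, 32))
```

From the Python API this branch would be exercised by giving a 4D shape to a parameter initialized with `GlorotInitializer`, e.g. `pc.add_parameters((3, 3, 16, 32), init=dy.GlorotInitializer())`, assuming `pc` is a `dy.ParameterCollection`.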