.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python from mxnet import init, np, npx from mxnet.gluon import nn npx.set_np() net = nn.Sequential() net.add(nn.Dense(8, activation='relu')) net.add(nn.Dense(1)) net.initialize() # 使用默认初始化方法 X = np.random.uniform(size=(2, 4)) net(X) # 正向传播 .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output [07:11:54] ../src/storage/storage.cc:196: Using Pooled (Naive) StorageManager for CPU .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([[0.0054572 ], [0.00488594]]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python import torch from torch import nn net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1)) X = torch.rand(size=(2, 4)) net(X) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([[-0.0970], [-0.0827]], grad_fn=<AddmmBackward0>) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python import tensorflow as tf net = tf.keras.models.Sequential([ tf.keras.layers.Flatten(), tf.keras.layers.Dense(4, activation=tf.nn.relu), tf.keras.layers.Dense(1), ]) X = tf.random.uniform((2, 4)) net(X) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python import warnings warnings.filterwarnings(action='ignore') import paddle from paddle import nn net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1)) X = paddle.rand([2, 4]) net(X) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Tensor(shape=[2, 1], dtype=float32, place=Place(cpu), stop_gradient=False, [[0.48685262], [0.34589919]]) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(net[1].params) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output dense1_ ( Parameter dense1_weight (shape=(1, 8), dtype=float32) Parameter dense1_bias (shape=(1,), dtype=float32) ) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(net[2].state_dict()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output OrderedDict([('weight', tensor([[-0.0427, -0.2939, -0.1894, 0.0220, -0.1709, -0.1522, -0.0334, -0.2263]])), ('bias', tensor([0.0887]))]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(net.layers[2].weights) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output [, ] .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(net[2].state_dict()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output OrderedDict([('weight', Parameter containing: Tensor(shape=[8, 1], dtype=float32, place=Place(cpu), stop_gradient=False, [[ 0.35208166], [-0.81014472], [ 0.77976549], [-0.21038508], [ 0.06733459], [-0.73830664], [ 0.65213728], [-0.33782813]])), ('bias', Parameter containing: Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=False, [0.]))]) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(type(net[1].bias)) print(net[1].bias) print(net[1].bias.data()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output <class 'mxnet.gluon.parameter.Parameter'> Parameter dense1_bias (shape=(1,), dtype=float32) [0.] 参数是复合的对象，包含值、梯度和额外信息。这就是我们需要显式请求参数值的原因。除了值之外，我们还可以访问每个参数的梯度。在上面这个网络中，由于我们还没有调用反向传播，所以参数的梯度处于初始状态。 .. raw:: latex \diilbookstyleinputcell .. code:: python net[1].weight.grad() .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([[0., 0., 0., 0., 0., 0., 0., 0.]]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(type(net[2].bias)) print(net[2].bias) print(net[2].bias.data) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output <class 'torch.nn.parameter.Parameter'> Parameter containing: tensor([0.0887], requires_grad=True) tensor([0.0887]) 参数是复合的对象，包含值、梯度和额外信息。这就是我们需要显式请求参数值的原因。除了值之外，我们还可以访问每个参数的梯度。在上面这个网络中，由于我们还没有调用反向传播，所以参数的梯度处于初始状态。 .. raw:: latex \diilbookstyleinputcell .. code:: python net[2].weight.grad == None .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output True .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(type(net.layers[2].weights[1])) print(net.layers[2].weights[1]) print(tf.convert_to_tensor(net.layers[2].weights[1])) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tf.Tensor([0.], shape=(1,), dtype=float32) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(type(net[2].bias)) print(net[2].bias) print(net[2].bias.value) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Parameter containing: Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=False, [0.]) 参数是复合的对象，包含值、梯度和额外信息。这就是我们需要显式请求参数值的原因。除了值之外，我们还可以访问每个参数的梯度。在上面这个网络中，由于我们还没有调用反向传播，所以参数的梯度处于初始状态。 .. raw:: latex \diilbookstyleinputcell .. code:: python net[2].weight.grad == None .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output True .. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(net[0].collect_params()) print(net.collect_params()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output dense0_ ( Parameter dense0_weight (shape=(8, 4), dtype=float32) Parameter dense0_bias (shape=(8,), dtype=float32) ) sequential0_ ( Parameter dense0_weight (shape=(8, 4), dtype=float32) Parameter dense0_bias (shape=(8,), dtype=float32) Parameter dense1_weight (shape=(1, 8), dtype=float32) Parameter dense1_bias (shape=(1,), dtype=float32) ) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(*[(name, param.shape) for name, param in net[0].named_parameters()]) print(*[(name, param.shape) for name, param in net.named_parameters()]) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output ('weight', torch.Size([8, 4])) ('bias', torch.Size([8])) ('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1])) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(net.layers[1].weights) print(net.get_weights()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output [, ] [array([[ 0.07396382, -0.6543436 , -0.7244056 , -0.6157465 ], [ 0.40404958, 0.7228444 , 0.4572547 , 0.7116396 ], [ 0.5283937 , 0.25636894, -0.49113625, 0.6337872 ], [ 0.5183577 , 0.19943613, 0.5296057 , 0.6009421 ]], dtype=float32), array([0., 0., 0., 0.], dtype=float32), array([[-1.0469127 ], [ 0.31355536], [ 0.5405549 ], [ 0.7610214 ]], dtype=float32), array([0.], dtype=float32)] .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(*[(name, param.shape) for name, param in net[0].named_parameters()]) print(*[(name, param.shape) for name, param in net.named_parameters()]) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output ('weight', [4, 8]) ('bias', [8]) ('0.weight', [4, 8]) ('0.bias', [8]) ('2.weight', [8, 1]) ('2.bias', [1]) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net.collect_params()['dense1_bias'].data() .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([0.]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net.state_dict()['2.bias'].data .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([0.0887]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net.get_weights()[1] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([0., 0., 0., 0.], dtype=float32) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net.state_dict()['2.bias'] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Parameter containing: Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=False, [0.]) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def block1(): net = nn.Sequential() net.add(nn.Dense(32, activation='relu')) net.add(nn.Dense(16, activation='relu')) return net def block2(): net = nn.Sequential() for _ in range(4): # 在这里嵌套 net.add(block1()) return net rgnet = nn.Sequential() rgnet.add(block2()) rgnet.add(nn.Dense(10)) rgnet.initialize() rgnet(X) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([[-6.3465841e-09, -1.1096747e-09, 6.4161774e-09, 6.6354131e-09, -1.1265502e-09, 1.3285141e-10, 9.3619361e-09, 3.2229091e-09, 5.9429857e-09, 8.8181418e-09], [-8.6219423e-09, -7.5150813e-10, 8.3133243e-09, 8.9321119e-09, -1.6739999e-09, 3.2406069e-10, 1.2115976e-08, 4.4926454e-09, 8.0741742e-09, 1.2075874e-08]]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def block1(): return nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU()) def block2(): net = nn.Sequential() for i in range(4): # 在这里嵌套 net.add_module(f'block {i}', block1()) return net rgnet = nn.Sequential(block2(), nn.Linear(4, 1)) rgnet(X) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([[0.2596], [0.2596]], grad_fn=<AddmmBackward0>) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def block1(name): return tf.keras.Sequential([ tf.keras.layers.Flatten(), tf.keras.layers.Dense(4, activation=tf.nn.relu)], name=name) def block2(): net = tf.keras.Sequential() for i in range(4): # 在这里嵌套 net.add(block1(name=f'block-{i}')) return net rgnet = tf.keras.Sequential() rgnet.add(block2()) rgnet.add(tf.keras.layers.Dense(1)) rgnet(X) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def block1(): return nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU()) def block2(): net = nn.Sequential() for i in range(4): # 在这里嵌套 net.add_sublayer(f'block {i}', block1()) return net rgnet = nn.Sequential(block2(), nn.Linear(4, 1)) rgnet(X) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Tensor(shape=[2, 1], dtype=float32, place=Place(cpu), stop_gradient=False, [[0.], [0.]]) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(rgnet.collect_params) print(rgnet.collect_params()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output <bound method Block.collect_params of Sequential( (0): Sequential( (0): Sequential( (0): Dense(4 -> 32, Activation(relu)) (1): Dense(32 -> 16, Activation(relu)) ) (1): Sequential( (0): Dense(16 -> 32, Activation(relu)) (1): Dense(32 -> 16, Activation(relu)) ) (2): Sequential( (0): Dense(16 -> 32, Activation(relu)) (1): Dense(32 -> 16, Activation(relu)) ) (3): Sequential( (0): Dense(16 -> 32, Activation(relu)) (1): Dense(32 -> 16, Activation(relu)) ) ) (1): Dense(16 -> 10, linear) )> sequential1_ ( Parameter dense2_weight (shape=(32, 4), dtype=float32) Parameter dense2_bias (shape=(32,), dtype=float32) Parameter dense3_weight (shape=(16, 32), dtype=float32) Parameter dense3_bias (shape=(16,), dtype=float32) Parameter dense4_weight (shape=(32, 16), dtype=float32) Parameter dense4_bias (shape=(32,), dtype=float32) Parameter dense5_weight (shape=(16, 32), dtype=float32) Parameter dense5_bias (shape=(16,), dtype=float32) Parameter dense6_weight (shape=(32, 16), dtype=float32) Parameter dense6_bias (shape=(32,), dtype=float32) Parameter dense7_weight (shape=(16, 32), dtype=float32) Parameter dense7_bias (shape=(16,), dtype=float32) Parameter dense8_weight (shape=(32, 16), dtype=float32) Parameter dense8_bias (shape=(32,), dtype=float32) Parameter dense9_weight (shape=(16, 32), dtype=float32) Parameter dense9_bias (shape=(16,), dtype=float32) Parameter dense10_weight (shape=(10, 16), dtype=float32) Parameter dense10_bias (shape=(10,), dtype=float32) ) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(rgnet) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Sequential( (0): Sequential( (block 0): Sequential( (0): Linear(in_features=4, out_features=8, bias=True) (1): ReLU() (2): Linear(in_features=8, out_features=4, bias=True) (3): ReLU() ) (block 1): Sequential( (0): Linear(in_features=4, out_features=8, bias=True) (1): ReLU() (2): Linear(in_features=8, out_features=4, bias=True) (3): ReLU() ) (block 2): Sequential( (0): Linear(in_features=4, out_features=8, bias=True) (1): ReLU() (2): Linear(in_features=8, out_features=4, bias=True) (3): ReLU() ) (block 3): Sequential( (0): Linear(in_features=4, out_features=8, bias=True) (1): ReLU() (2): Linear(in_features=8, out_features=4, bias=True) (3): ReLU() ) ) (1): Linear(in_features=4, out_features=1, bias=True) ) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(rgnet.summary()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= sequential_2 (Sequential) (2, 4) 80 dense_6 (Dense) (2, 1) 5 ================================================================= Total params: 85 Trainable params: 85 Non-trainable params: 0 _________________________________________________________________ None .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(rgnet) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Sequential( (0): Sequential( (block 0): Sequential( (0): Linear(in_features=4, out_features=8, dtype=float32) (1): ReLU() (2): Linear(in_features=8, out_features=4, dtype=float32) (3): ReLU() ) (block 1): Sequential( (0): Linear(in_features=4, out_features=8, dtype=float32) (1): ReLU() (2): Linear(in_features=8, out_features=4, dtype=float32) (3): ReLU() ) (block 2): Sequential( (0): Linear(in_features=4, out_features=8, dtype=float32) (1): ReLU() (2): Linear(in_features=8, out_features=4, dtype=float32) (3): ReLU() ) (block 3): Sequential( (0): Linear(in_features=4, out_features=8, dtype=float32) (1): ReLU() (2): Linear(in_features=8, out_features=4, dtype=float32) (3): ReLU() ) ) (1): Linear(in_features=4, out_features=1, dtype=float32) ) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python rgnet[0][1][0].bias.data() .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python rgnet[0][1][0].bias.data .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([ 0.1999, -0.4073, -0.1200, -0.2033, -0.1573, 0.3546, -0.2141, -0.2483]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python rgnet.layers[0].layers[1].layers[1].weights[1] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(rgnet[0].state_dict()['block 0.0.bias']) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Parameter containing: Tensor(shape=[8], dtype=float32, place=Place(cpu), stop_gradient=False, [0., 0., 0., 0., 0., 0., 0., 0.]) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

默认情况下，MXNet初始化权重参数的方法是从均匀分布\ :math:`U(-0.07, 0.07)`\ 中随机采样权重，并将偏置参数设置为0。 MXNet的\ ``init``\ 模块提供了多种预置初始化方法。 .. raw:: html

.. raw:: html

默认情况下，PyTorch会根据一个范围均匀地初始化权重和偏置矩阵，这个范围是根据输入和输出维度计算出的。 PyTorch的\ ``nn.init``\ 模块提供了多种预置初始化方法。 .. raw:: html

.. raw:: html

默认情况下，Keras会根据一个范围均匀地初始化权重矩阵，这个范围是根据输入和输出维度计算出的。偏置参数设置为0。 TensorFlow在根模块和\ ``keras.initializers``\ 模块中提供了各种初始化方法。 .. raw:: html

.. raw:: html

默认情况下，PaddlePaddle会使用Xavier初始化权重矩阵，偏置参数设置为0。 PaddlePaddle的\ ``nn.initializer``\ 模块提供了多种预置初始化方法。 .. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python # 这里的force_reinit确保参数会被重新初始化，不论之前是否已经被初始化 net.initialize(init=init.Normal(sigma=0.01), force_reinit=True) net[0].weight.data()[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([-0.00324057, -0.00895028, -0.00698632, 0.01030831]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def init_normal(m): if type(m) == nn.Linear: nn.init.normal_(m.weight, mean=0, std=0.01) nn.init.zeros_(m.bias) net.apply(init_normal) net[0].weight.data[0], net[0].bias.data[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output (tensor([-0.0214, -0.0015, -0.0100, -0.0058]), tensor(0.)) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net = tf.keras.models.Sequential([ tf.keras.layers.Flatten(), tf.keras.layers.Dense( 4, activation=tf.nn.relu, kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.01), bias_initializer=tf.zeros_initializer()), tf.keras.layers.Dense(1)]) net(X) net.weights[0], net.weights[1] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output (, ) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def init_normal(m): if type(m) == nn.Linear: paddle.nn.initializer.Normal(mean=0.0, std=0.01) paddle.zeros(m.bias) net.apply(init_normal) net[0].weight[0],net[0].state_dict()['bias'] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output (Tensor(shape=[8], dtype=float32, place=Place(cpu), stop_gradient=False, [-0.45191017, 0.32920808, -0.20261598, 0.45921832, 0.20553625, -0.44192296, 0.23093420, 0.31619781]), Parameter containing: Tensor(shape=[8], dtype=float32, place=Place(cpu), stop_gradient=False, [0., 0., 0., 0., 0., 0., 0., 0.])) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net.initialize(init=init.Constant(1), force_reinit=True) net[0].weight.data()[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([1., 1., 1., 1.]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def init_constant(m): if type(m) == nn.Linear: nn.init.constant_(m.weight, 1) nn.init.zeros_(m.bias) net.apply(init_constant) net[0].weight.data[0], net[0].bias.data[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output (tensor([1., 1., 1., 1.]), tensor(0.)) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net = tf.keras.models.Sequential([ tf.keras.layers.Flatten(), tf.keras.layers.Dense( 4, activation=tf.nn.relu, kernel_initializer=tf.keras.initializers.Constant(1), bias_initializer=tf.zeros_initializer()), tf.keras.layers.Dense(1), ]) net(X) net.weights[0], net.weights[1] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output (, ) .. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   def init_constant(m):
       """Set every ``nn.Linear`` weight to 1 and every bias to 0, in place."""
       if type(m) == nn.Linear:
           # Building an initializer object, or calling paddle.zeros on the
           # bias, creates new objects and leaves the layer untouched. Write
           # the constants back into the existing parameters instead.
           m.weight.set_value(paddle.full_like(m.weight, 1.0))
           m.bias.set_value(paddle.zeros_like(m.bias))

   net.apply(init_constant)
   net[0].weight[0],net[0].state_dict()['bias']

.. raw:: latex

   \diilbookstyleoutputcell

.. parsed-literal::
   :class: output

   (Tensor(shape=[8], dtype=float32, place=Place(cpu), stop_gradient=False,
           [1., 1., 1., 1., 1., 1., 1., 1.]),
    Parameter containing:
    Tensor(shape=[8], dtype=float32, place=Place(cpu), stop_gradient=False,
           [0., 0., 0., 0., 0., 0., 0., 0.]))

.. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net[0].weight.initialize(init=init.Xavier(), force_reinit=True) net[1].initialize(init=init.Constant(42), force_reinit=True) print(net[0].weight.data()[0]) print(net[1].weight.data()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output [-0.17594433 0.02314097 -0.1992535 0.09509248] [[42. 42. 42. 42. 42. 42. 42. 42.]] .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def init_xavier(m): if type(m) == nn.Linear: nn.init.xavier_uniform_(m.weight) def init_42(m): if type(m) == nn.Linear: nn.init.constant_(m.weight, 42) net[0].apply(init_xavier) net[2].apply(init_42) print(net[0].weight.data[0]) print(net[2].weight.data) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([ 0.5236, 0.0516, -0.3236, 0.3794]) tensor([[42., 42., 42., 42., 42., 42., 42., 42.]]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net = tf.keras.models.Sequential([ tf.keras.layers.Flatten(), tf.keras.layers.Dense( 4, activation=tf.nn.relu, kernel_initializer=tf.keras.initializers.GlorotUniform()), tf.keras.layers.Dense( 1, kernel_initializer=tf.keras.initializers.Constant(1)), ]) net(X) print(net.layers[1].weights[0]) print(net.layers[2].weights[0]) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def xavier(m): if type(m) == nn.Linear: paddle.nn.initializer.XavierUniform(m.weight) def init_42(m): if type(m) == nn.Linear: paddle.nn.initializer.Constant(42) net[0].apply(xavier) net[2].apply(init_42) print(net[0].weight[0]) print(net[2].weight) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Tensor(shape=[8], dtype=float32, place=Place(cpu), stop_gradient=False, [-0.45191017, 0.32920808, -0.20261598, 0.45921832, 0.20553625, -0.44192296, 0.23093420, 0.31619781]) Parameter containing: Tensor(shape=[8, 1], dtype=float32, place=Place(cpu), stop_gradient=False, [[ 0.35208166], [-0.81014472], [ 0.77976549], [-0.21038508], [ 0.06733459], [-0.73830664], [ 0.65213728], [-0.33782813]]) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

在这里，我们定义了\ ``Initializer``\ 类的子类。通常，我们只需要实现\ ``_init_weight``\ 函数，该函数接受张量参数（\ ``data``\ ）并为其分配所需的初始化值。 .. raw:: latex \diilbookstyleinputcell .. code:: python class MyInit(init.Initializer): def _init_weight(self, name, data): print('Init', name, data.shape) data[:] = np.random.uniform(-10, 10, data.shape) data *= np.abs(data) >= 5 net.initialize(MyInit(), force_reinit=True) net[0].weight.data()[:2] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Init dense0_weight (8, 4) Init dense1_weight (1, 8) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([[ 0. , -0. , -0. , 8.522827 ], [ 0. , -8.828651 , -0. , -5.6012006]]) .. raw:: html

.. raw:: html

同样，我们实现了一个\ ``my_init``\ 函数来应用到\ ``net``\ 。 .. raw:: latex \diilbookstyleinputcell .. code:: python def my_init(m): if type(m) == nn.Linear: print("Init", *[(name, param.shape) for name, param in m.named_parameters()][0]) nn.init.uniform_(m.weight, -10, 10) m.weight.data *= m.weight.data.abs() >= 5 net.apply(my_init) net[0].weight[:2] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Init weight torch.Size([8, 4]) Init weight torch.Size([1, 8]) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([[5.4079, 9.3334, 5.0616, 8.3095], [0.0000, 7.2788, -0.0000, -0.0000]], grad_fn=<SliceBackward0>) .. raw:: html

.. raw:: html

在这里，我们定义了一个\ ``Initializer``\ 的子类，并实现了\ ``__call__``\ 函数。该函数返回给定形状和数据类型的所需张量。 .. raw:: latex \diilbookstyleinputcell .. code:: python class MyInit(tf.keras.initializers.Initializer): def __call__(self, shape, dtype=None): data=tf.random.uniform(shape, -10, 10, dtype=dtype) factor=(tf.abs(data) >= 5) factor=tf.cast(factor, tf.float32) return data * factor net = tf.keras.models.Sequential([ tf.keras.layers.Flatten(), tf.keras.layers.Dense( 4, activation=tf.nn.relu, kernel_initializer=MyInit()), tf.keras.layers.Dense(1), ]) net(X) print(net.layers[1].weights[0]) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output .. raw:: html

.. raw:: html

同样，我们实现了一个\ ``my_init``\ 函数来应用到\ ``net``\ 。 .. raw:: latex \diilbookstyleinputcell .. code:: python def my_init(m): if type(m) == nn.Linear: print("Init", *[(name, param.shape) for name, param in m.named_parameters()][0]) paddle.nn.initializer.XavierUniform(m.weight, -10, 10) h = paddle.abs(m.weight) >= 5 h = paddle.to_tensor(h) m = paddle.to_tensor(m.weight) m *= h net.apply(my_init) net[0].weight[:2] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Init weight [4, 8] Init weight [8, 1] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Tensor(shape=[2, 8], dtype=float32, place=Place(cpu), stop_gradient=False, [[-0.45191017, 0.32920808, -0.20261598, 0.45921832, 0.20553625, -0.44192296, 0.23093420, 0.31619781], [-0.34684724, -0.05426282, -0.12417591, -0.11773884, 0.51981491, 0.52964920, 0.54408771, -0.02475590]]) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net[0].weight.data()[:] += 1 net[0].weight.data()[0, 0] = 42 net[0].weight.data()[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([42. , 1. , 1. , 9.522827]) 高级用户请注意：如果要在\ ``autograd``\ 范围内调整参数，则需要使用\ ``set_data``\ ，以避免误导自动微分机制。 .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net[0].weight.data[:] += 1 net[0].weight.data[0, 0] = 42 net[0].weight.data[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([42.0000, 10.3334, 6.0616, 9.3095]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net.layers[1].weights[0][:].assign(net.layers[1].weights[0] + 1) net.layers[1].weights[0][0, 0].assign(42) net.layers[1].weights[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net[0].weight.set_value(net[0].weight.numpy() + 1) val = net[0].weight.numpy() val[0, 0] = 42 net[0].weight.set_value(val) net[0].weight[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Tensor(shape=[8], dtype=float32, place=Place(cpu), stop_gradient=False, [42. , 1.32920814, 0.79738402, 1.45921826, 1.20553625, 0.55807704, 1.23093414, 1.31619787]) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow paddle

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net = nn.Sequential() # 我们需要给共享层一个名称，以便可以引用它的参数 shared = nn.Dense(8, activation='relu') net.add(nn.Dense(8, activation='relu'), shared, nn.Dense(8, activation='relu', params=shared.params), nn.Dense(10)) net.initialize() X = np.random.uniform(size=(2, 20)) net(X) # 检查参数是否相同 print(net[1].weight.data()[0] == net[2].weight.data()[0]) net[1].weight.data()[0, 0] = 100 # 确保它们实际上是同一个对象，而不只是有相同的值 print(net[1].weight.data()[0] == net[2].weight.data()[0]) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output [ True True True True True True True True] [ True True True True True True True True] 这个例子表明第二层和第三层的参数是绑定的。它们不仅值相等，而且由相同的张量表示。因此，如果我们改变其中一个参数，另一个参数也会改变。这里有一个问题：当参数绑定时，梯度会发生什么情况？答案是由于模型参数包含梯度，因此在反向传播期间第二个隐藏层和第三个隐藏层的梯度会加在一起。 .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python # 我们需要给共享层一个名称，以便可以引用它的参数 shared = nn.Linear(8, 8) net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), shared, nn.ReLU(), shared, nn.ReLU(), nn.Linear(8, 1)) net(X) # 检查参数是否相同 print(net[2].weight.data[0] == net[4].weight.data[0]) net[2].weight.data[0, 0] = 100 # 确保它们实际上是同一个对象，而不只是有相同的值 print(net[2].weight.data[0] == net[4].weight.data[0]) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([True, True, True, True, True, True, True, True]) tensor([True, True, True, True, True, True, True, True]) 这个例子表明第三个和第五个神经网络层的参数是绑定的。它们不仅值相等，而且由相同的张量表示。因此，如果我们改变其中一个参数，另一个参数也会改变。这里有一个问题：当参数绑定时，梯度会发生什么情况？答案是由于模型参数包含梯度，因此在反向传播期间第二个隐藏层（即第三个神经网络层）和第三个隐藏层（即第五个神经网络层）的梯度会加在一起。 .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python # tf.keras的表现有点不同。它会自动删除重复层 shared = tf.keras.layers.Dense(4, activation=tf.nn.relu) net = tf.keras.models.Sequential([ tf.keras.layers.Flatten(), shared, shared, tf.keras.layers.Dense(1), ]) net(X) # 检查参数是否不同 print(len(net.layers) == 3) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output True .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python # 我们需要给共享层一个名称，以便可以引用它的参数。 shared = nn.Linear(8, 8) net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), shared, nn.ReLU(), shared, nn.ReLU(), nn.Linear(8, 1)) net(X) # 检查参数是否相同 print(net[2].weight[0] == net[4].weight[0]) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Tensor(shape=[8], dtype=bool, place=Place(cpu), stop_gradient=False, [True, True, True, True, True, True, True, True]) .. raw:: html

.. raw:: html