I used MatConvNet to build a CNN model for regression. The input size is 20×20×1×32, the output size is 4×1×32, and the convolutional filter size is 3×3×1.
After training I found that the training error decreases, but all the outputs become identical across samples! I checked the output of each layer and found that the activations are already identical after the first (conv) layer.
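This is the check I ran (a minimal sketch, assuming the trained variables from the script below are in the workspace; bsxfun keeps it compatible with pre-R2016b MATLAB):

res = filtercnnDQN(X, w, b, theta1, beta1, theta2, beta2) ;
d = bsxfun(@minus, res.x2, res.x2(:,:,:,1)) ;   % difference to sample 1
fprintf('max spread across samples after conv: %g\n', max(abs(d(:)))) ;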
My network structure is x = x1 - conv - x2 - ReLU - x2 - Pool - x3 - ReLU - x3 - sigmoid - x4 - linear - x5 = y
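For reference, written out with the stock MatConvNet layers, the forward pass I intend is roughly the following (a sketch only, using the variable names from the function below; the fully connected part is hand-rolled as in my code, and bsxfun stands in for broadcasting):

x2 = vl_nnrelu(vl_nnconv(x1, w, b, 'pad', 1)) ;           % 3x3 conv, 'same' padding, + ReLU
x3 = vl_nnrelu(vl_nnpool(x2, 3, 'pad', 1)) ;              % 3x3 pooling, + ReLU
h = reshape(x3, [], size(x3,4)) ;                         % flatten to 400 x N
x4 = 1 ./ (1 + exp(-bsxfun(@plus, theta1 * h, beta1))) ;  % sigmoid FC, 4 x N
x5 = bsxfun(@plus, theta2 * x4, beta2) ;                  % linear output, 4 x N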
I have tried a simpler structure, x = x1 - conv - x2 - sigmoid - x4 - linear - x5 = y, but it doesn't work either.
I even deleted the conv layer and kept only the sigmoid layer and the final linear layer, and got the same result: the four outputs are identical for all 32 samples.
The training error curve looks normal, which is why I trust my training process.
I would appreciate any help.
The code:
setup ;
clear; clc;
load minibatch
% Normalize each 20x20 sample to [0,1] (mapminmax is from the Neural Network Toolbox)
X = single(reshape(minibatch.a, 20, 20, 1, 32)) ;
X = reshape(X, 400, 32) ;
X = mapminmax(X', 0, 1) ;
X = single(reshape(X', 20, 20, 1, 32)) ;  % cast back to single for vl_nnconv
y = single(rand(4,1,32));
n = 32; % number of examples
numIterations = 10000 ;
rate = 0.003 ;
momentum = 0;
shrinkRate = 500 ;
plotPeriod = 10 ;
Jtrain = 0 ;                      % final training error, set after the loop
E = zeros(3, numIterations) ;     % [data term; reg term; total] per iteration
w = single(randn(3,3,1) * 1/200 - 1/400) ;      % conv filter, 3x3x1
b = single(0) ;                                 % conv bias
theta1 = single(rand(4,400) * 1/200 - 1/400) ;  % FC layer weights
beta1 = single(zeros(4,1)) ;
theta2 = single(rand(4,4) * 1/2 - 1/4) ;        % output layer weights
beta2 = single(zeros(4,1)) ;
% Momentum buffers, same size and class as the parameters
w_momentum = zeros(size(w), 'like', w) ;
b_momentum = zeros(size(b), 'like', b) ;
theta1_momentum = zeros(size(theta1), 'like', theta1) ;
beta1_momentum = zeros(size(beta1), 'like', beta1) ;
theta2_momentum = zeros(size(theta2), 'like', theta2) ;
beta2_momentum = zeros(size(beta2), 'like', beta2) ;
for t = 1:numIterations
    % Forward pass
    res = filtercnnDQN(X, w, b, theta1, beta1, theta2, beta2) ;
    dzdx5 = reshape(res.x5, 4, 1, size(X,4)) - y ;  % output error, 4x1xN
    E1 = zeros(1, size(X,4)) ;                      % per-sample squared error
    for i = 1:size(X,4)
        E1(i) = dzdx5(:,1,i)' * dzdx5(:,1,i) ;
    end
    E(1,t) = 1/(2*size(X,4)) * sum(E1) ;  % data term
    E(2,t) = 0 ;                          % regularization term (unused)
    E(3,t) = E(1,t) + E(2,t) ;            % total objective
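    % (plotPeriod is set above but never used; a quick way to watch the
    % error from inside this loop would be something like
    %   if mod(t, plotPeriod) == 0, plot(E(3,1:t)) ; drawnow ; end )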
    % Backward pass
    res = filtercnnDQN(X, w, b, theta1, beta1, theta2, beta2, dzdx5) ;
    % Update momentum (shrinkRate acts as a weight-decay coefficient on w,
    % theta1 and theta2)
    w_momentum = momentum * w_momentum + rate * ((1/n) * res.dzdw + shrinkRate * w) ;
    b_momentum = momentum * b_momentum + rate * 0.1 * (1/n) * res.dzdb ;
    theta1_momentum = momentum * theta1_momentum + rate * ((1/n) * res.dzdtheta1 + shrinkRate * theta1) ;
    beta1_momentum = momentum * beta1_momentum + rate * 0.1 * (1/n) * res.dzdbeta1 ;
    theta2_momentum = momentum * theta2_momentum + rate * ((1/n) * res.dzdtheta2 + shrinkRate * theta2) ;
    beta2_momentum = momentum * beta2_momentum + rate * 0.1 * (1/n) * res.dzdbeta2 ;
    % Gradient step
    w = w - w_momentum ;
    b = b - b_momentum ;
    theta1 = theta1 - theta1_momentum ;
    beta1 = beta1 - beta1_momentum ;
    theta2 = theta2 - theta2_momentum ;
    beta2 = beta2 - beta2_momentum ;
end
Jtrain = E(1,end) ;  % final training error
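To sanity-check the hand-written backward pass, a finite-difference comparison on one entry of w can be run after the loop. This is my own sketch; it uses the same squared-error objective as E(1,t), and res.dzdw/n is the analytic gradient because dzdx5 above is not divided by n:

epsw = 1e-3 ;
wp = w ; wp(1,1) = wp(1,1) + epsw ;   % perturb one filter entry up
wm = w ; wm(1,1) = wm(1,1) - epsw ;   % and down
rp = filtercnnDQN(X, wp, b, theta1, beta1, theta2, beta2) ;
rm = filtercnnDQN(X, wm, b, theta1, beta1, theta2, beta2) ;
ep = reshape(rp.x5, 4, 1, size(X,4)) - y ;
em = reshape(rm.x5, 4, 1, size(X,4)) - y ;
numGrad = (sum(ep(:).^2) - sum(em(:).^2)) / (2*size(X,4)) / (2*epsw) ;
fprintf('numerical %g vs analytic %g\n', numGrad, res.dzdw(1,1)/n) ;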
% The function filtercnnDQN.m
function res = filtercnnDQN(x, w, b, theta1, beta1, theta2, beta2, dzdy)
% Parameters of the layers
pad1 = ([size(w,1) size(w,1) size(w,2) size(w,2)] - 1) / 2 ;  % 'same' padding for the conv
rho2 = 3 ;               % pooling window (used only by the commented-out layers)
pad2 = (rho2 - 1) / 2 ;
% Number of training examples
N = size(x,4) ;
% Forward pass
res.x1 = x ;
res.x2 = vl_nnconv(res.x1, w, b, 'pad', pad1) ;
% res.x2 = vl_nnrelu(res.x2) ;
% res.x3 = vl_nnpool(res.x2, rho2, 'pad', pad2) ;
% res.x3 = vl_nnrelu(res.x3) ;
for i = 1:N
    res.x4(:,:,i) = single(sigmoid(theta1 * reshape(res.x2(:,:,1,i), 400, 1) + beta1)) ;  % FC + sigmoid, 4x1
    res.x5(:,:,i) = theta2 * res.x4(:,:,i) + beta2 ;                                      % linear output, 4x1
end
% Backward pass (only if an output derivative is passed in)
if nargin > 7
    res.dzdx5 = dzdy ;
    % Gradient accumulators, same size and class as the parameters
    res.dzdtheta2 = zeros(size(theta2), 'like', theta2) ;
    res.dzdbeta2 = zeros(size(beta2), 'like', beta2) ;
    res.dzdtheta1 = zeros(size(theta1), 'like', theta1) ;
    res.dzdbeta1 = zeros(size(beta1), 'like', beta1) ;
    for i = 1:N
        res.dzdx4(:,:,i) = theta2' * res.dzdx5(:,:,i) ;
        res.dzdtheta2 = res.dzdtheta2 + res.dzdx5(:,:,i) * res.x4(:,1,i)' ;
        res.dzdbeta2 = res.dzdbeta2 + res.dzdx5(:,:,i) ;
        % Sigmoid derivative: sigma'(z) = x4 .* (1 - x4), applied to the
        % upstream gradient before propagating through theta1
        delta = res.dzdx4(:,:,i) .* res.x4(:,1,i) .* (1 - res.x4(:,1,i)) ;
        res.dzdx2(:,:,1,i) = reshape(theta1' * delta, 20, 20) ;
        res.dzdtheta1 = res.dzdtheta1 + delta * reshape(res.x2(:,:,1,i), 400, 1)' ;
        res.dzdbeta1 = res.dzdbeta1 + delta ;
    end
    % res.dzdx2 = vl_nnrelu(res.x2, res.dzdx3) ;
    % res.dzdx2 = vl_nnpool(res.x2, rho2, res.dzdx2, 'pad', pad2) ;
    % res.dzdx1 = vl_nnrelu(res.x1, res.dzdx2) ;
    [res.dzdx1, res.dzdw, res.dzdb] = ...
        vl_nnconv(res.x1, w, b, res.dzdx2, 'pad', pad1) ;
end
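For completeness: sigmoid is not a base MATLAB function. The helper I call above is the standard logistic function (MatConvNet's vl_nnsigmoid computes the same thing):

% The helper sigmoid.m (elementwise logistic function)
function y = sigmoid(x)
y = 1 ./ (1 + exp(-x)) ;
end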