
I used MatConvNet to build a CNN model for regression. The input size is 20×20×1×32, the output size is 4×1×32, and the convolutional filter size is 3×3×1.
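For reference, with a 3×3 filter and 1 pixel of padding the conv layer keeps the 20×20 spatial size. A minimal shape check (assuming MatConvNet's vl_nnconv is on the path after setup):

% Quick shape check: with 'pad', 1 a 3x3 filter preserves 20x20.
xchk = randn(20, 20, 1, 32, 'single');
wchk = randn(3, 3, 1, 'single');
ychk = vl_nnconv(xchk, wchk, single(0), 'pad', 1);
size(ychk)     % 20 20 1 32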

After training, I found that the training error decreases, but all the outputs become the same! I checked the output of each layer and found that the outputs are already identical after the first layer (the conv layer).
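A quick way to confirm the collapse is to measure the spread of the outputs over the mini-batch. A minimal sketch, using the res.x5 field produced by the forward pass in the code below:

% If the outputs have collapsed, the per-unit standard deviation
% across the 32 samples is (near) zero.
out = reshape(res.x5, 4, 32);
max(std(out, 0, 2))     % ~0 means every sample gets the same output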

My network structure is x = x1 - conv - x2 - ReLU - x2 - pool - x3 - ReLU - x3 - sigmoid - x4 - linear - x5 = y.

I have tried a simpler structure: x = x1 - conv - x2 - sigmoid - x4 - linear - x5 = y.

But this doesn't work either. I also deleted the conv layer and kept only the sigmoid layer and the final linear layer, and got the same result: the four outputs are identical across all 32 samples.
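For reference, that reduced model is just a two-layer perceptron on the flattened input. A self-contained sketch (same variable names as the code below; bsxfun is used so it also runs on pre-R2016b MATLAB):

% Reduced model after removing the conv layer:
% flattened input -> sigmoid layer -> linear layer.
Xv = reshape(X, 400, 32);                                   % one column per sample
h  = 1 ./ (1 + exp(-bsxfun(@plus, theta1 * Xv, beta1)));    % 4x32 sigmoid activations
yh = bsxfun(@plus, theta2 * h, beta2);                      % 4x32 outputs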

The training error curve looks normal, which makes me think the training process itself is correct.

[Plot of the training error decreasing over the iterations]

If anyone could help, I would appreciate it.

The code:

setup ;
clear; clc;

load minibatch
X = single(reshape(minibatch.a,20,20,1,32));
X = reshape(X,400,32);
X = mapminmax(X',0,1);                 % normalize each sample to [0, 1]
X = reshape(X',20,20,1,32);
y = single(rand(4,1,32));              % random regression targets

n = 32; % number of examples

numIterations = 10000 ;
rate = 0.003 ;
momentum = 0;
shrinkRate = 500 ;
plotPeriod = 10 ;

Jtrain = zeros(size(shrinkRate,2),1);

% Initialize the conv filter and the two fully connected layers
% with small random weights.
w = single(randn(3,3,1) * 1/200 - 1/400);
b = single(0) ;

theta1 = rand(4,400) * 1/200 - 1/400;
beta1 = single(zeros(4,1));

theta2 = rand(4,4) * 1/2 - 1/4;
beta2 = single(zeros(4,1));

w_momentum = zeros(size(w), 'like', w) ;
b_momentum = zeros(size(b), 'like', b) ;
theta1_momentum = zeros(size(theta1), 'like', theta1);
beta1_momentum = zeros(size(beta1), 'like', beta1);
theta2_momentum = zeros(size(theta2), 'like', theta2);
beta2_momentum = zeros(size(beta2), 'like', beta2);

E = zeros(3, numIterations);           % loss history
E1 = zeros(1, n);

for t = 1:numIterations

  % Forward pass
  res = filtercnnDQN(X, w, b, theta1, beta1, theta2, beta2) ;
  dzdx5 = reshape(res.x5,4,1,size(X,4)) - y;       % output error
  for i = 1 : size(X,4)
      E1(i) = dzdx5(:,i)' * dzdx5(:,i);            % squared error per sample
  end
  E(1,t) = 1/(2*size(X,4)) * sum(E1);              % mean squared-error loss
  E(2,t) = 0;
  E(3,t) = E(1,t) + E(2,t) ;

  % Backward pass
  res = filtercnnDQN(X, w, b, theta1, beta1, theta2, beta2, dzdx5) ;

  % Update momentum (shrinkRate acts as an L2 weight-decay coefficient)
  w_momentum = momentum * w_momentum + rate * ((1/n) * (res.dzdw) + shrinkRate * w) ;
  b_momentum = momentum * b_momentum + rate * 0.1 * (1/n) * res.dzdb ;
  theta1_momentum = momentum * theta1_momentum + rate * ((1/n) * (res.dzdtheta1) + shrinkRate * theta1);
  beta1_momentum = momentum * beta1_momentum + rate * 0.1 * (1/n) * res.dzdbeta1;
  theta2_momentum = momentum * theta2_momentum + rate * ((1/n) * (res.dzdtheta2) + shrinkRate * theta2);
  beta2_momentum = momentum * beta2_momentum + rate * 0.1 * (1/n) * res.dzdbeta2;

  % Gradient step
  w = w - w_momentum ;
  b = b - b_momentum ;
  theta1 = theta1 - theta1_momentum;
  beta1 = beta1 - beta1_momentum;
  theta2 = theta2 - theta2_momentum;
  beta2 = beta2 - beta2_momentum;
end
Jtrain = E(1,t);
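To see how much of each update comes from the data gradient versus the shrinkRate term, the two contributions can be compared after a backward pass. A quick sketch using the variables above:

% Compare the data-gradient part of the theta1 update with the
% shrinkage (weight-decay) part.
gradPart  = norm((1/n) * res.dzdtheta1(:));
decayPart = norm(shrinkRate * theta1(:));
fprintf('gradient part: %g, decay part: %g\n', gradPart, decayPart);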

% The function filtercnnDQN.m:

function res = filtercnnDQN(x, w, b, theta1, beta1, theta2, beta2, dzdy)

% Parameters of the layers
pad1 = ([size(w,1) size(w,1) size(w,2) size(w,2)] - 1) / 2 ;  % 'same' padding for the conv
rho2 = 3 ;                                                     % pooling window size
pad2 = (rho2 - 1) / 2 ;

% Define the number of training examples
N = size(x,4);

% Forward pass
res.x1 = x ;
res.x2 = vl_nnconv(res.x1, w, b, 'pad', pad1) ;
% res.x2 = vl_nnrelu(res.x2);
% res.x3 = vl_nnpool(res.x2, rho2, 'pad', pad2) ;
% res.x3 = vl_nnrelu(res.x3);
for i = 1:N
    % Fully connected sigmoid layer (sigmoid is a user-supplied
    % logistic function), followed by the linear output layer.
    res.x4(:,:,i) = single(sigmoid(theta1 * reshape(res.x2(:,:,i),400,1) + beta1));
    res.x5(:,:,i) = theta2 * res.x4(:,:,i) + beta2;
end

% Backward pass (only if passed output derivative)
if nargin > 7

  res.dzdx5 = dzdy ;
  res.dzdtheta2 = 0;
  res.dzdbeta2 = 0;
  res.dzdtheta1 = 0;
  res.dzdbeta1 = 0;

  for i = 1:N
      % Gradients of the linear output layer
      res.dzdx4(:,:,i) = theta2' * res.dzdx5(:,:,i);
      res.dzdtheta2 = res.dzdtheta2 + res.dzdx5(:,:,i) * res.x4(:,i)';
      res.dzdbeta2 = res.dzdbeta2 + res.dzdx5(:,:,i);

      % Gradients of the sigmoid layer and of the conv output
      res.dzdx2(:,:,1,i) = reshape((theta1' * res.dzdx4(:,:,i)) .* (reshape(res.x2(:,:,:,i),400,1)) .* (ones(400,1) - (reshape(res.x2(:,:,:,i),400,1))), 20, 20);
      res.dzdtheta1 = res.dzdtheta1 + res.dzdx4(:,:,i) * (reshape(res.x2(:,:,:,i),400,1))';
      res.dzdbeta1 = res.dzdbeta1 + res.dzdx4(:,:,i);
  end

%   res.dzdx2 = vl_nnrelu(res.x2, res.dzdx3);
%   res.dzdx2 = vl_nnpool(res.x2, rho2, res.dzdx2, 'pad', pad2) ;

%   res.dzdx1 = vl_nnrelu(res.x1, res.dzdx2);
  [res.dzdx1, res.dzdw, res.dzdb] = ...
    vl_nnconv(res.x1, w, b, res.dzdx2, 'pad', pad1) ;
end
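For completeness, the backward pass can be validated with a finite-difference check. A minimal sketch for a single entry of theta2, run after loading the data and initializing the parameters as above (the 1/32 factor matches the 1/n scaling applied in the update step):

% Finite-difference gradient check for theta2(1,1).
ep = 1e-4;
loss = @(r) sum(reshape(reshape(r.x5,4,1,32) - y, [], 1).^2) / (2*32);
t2p = theta2; t2p(1,1) = t2p(1,1) + ep;
t2m = theta2; t2m(1,1) = t2m(1,1) - ep;
numgrad = (loss(filtercnnDQN(X, w, b, theta1, beta1, t2p, beta2)) - ...
           loss(filtercnnDQN(X, w, b, theta1, beta1, t2m, beta2))) / (2*ep);
r0 = filtercnnDQN(X, w, b, theta1, beta1, theta2, beta2);
rb = filtercnnDQN(X, w, b, theta1, beta1, theta2, beta2, ...
                  reshape(r0.x5,4,1,32) - y);
angrad = rb.dzdtheta2(1,1) / 32;   % the backward pass sums over samples
fprintf('numerical: %g, analytic: %g\n', numgrad, angrad);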
