I'm implementing a typical feed-forward neural network with one hidden layer. The network does well on logical XOR and other simple problems, but fails miserably on a larger training set (16 inputs, 20~30 hidden neurons, 3 outputs). The problem is that the network does not learn anything at all, even with a learning rate of 1.0 and momentum of 0.8~0.9.
I did a lot of debugging and found that at some outputs the activation value is exactly 1.0. Mathematically the logistic function 1/(1+exp(-signal)) can never reach 1, but due to rounding the computer produces 1.0 for any signal greater than about 40.
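For example, this tiny standalone check (not part of the network, just the formula in isolation) reproduces the rounding:

#include <cmath>
#include <cstdio>

int main() {
    double activation = 1.0 / (1.0 + exp(-40.0)); // exp(-40) is ~4e-18, smaller than the double-precision epsilon
    printf("%.17g %d\n", activation, activation == 1.0); // prints "1 1": the stored value is exactly 1.0
    return 0;
}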
Now, once the activation value equals 1.0, the gradient is always 0 regardless of the desired output, because the term (1 - activation) becomes 0. So in the equation

delta = (1 - activation) * activation * (activation - desired_output)

delta is zero all the time, and there is no learning at all.
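Plugging numbers into that equation shows it directly (a saturated output against a target of 0):

double activation = 1.0;       // output saturated by the rounding described above
double desired_output = 0.0;   // the target is the opposite extreme
double delta = (1.0 - activation) * activation * (activation - desired_output);
// (1.0 - activation) == 0.0, so delta == 0.0 even though the error could not be larger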
I tried initializing the weights to very small values (0.00001 or so), but during training they all eventually reach 3.0~4.0, and some even reach extreme values like 30~40. With 20~30 hidden neurons it is therefore very easy to end up with an output signal greater than 40.
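Just to show the scale involved (made-up but representative numbers, not taken from an actual run):

int nHid = 25;             // hidden layer size
double activation = 1.0;   // hidden activations pushed into saturation
double weight = 2.0;       // a fairly typical hidden-to-output weight magnitude after some training
double signal = nHid * activation * weight; // 50.0, already past the ~40 point where sigmoid() returns exactly 1.0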
Obviously I must have missed something, but with this kind of neural net (hundreds of weights) I can't step through it to see what's wrong, so my question is: do the symptoms above point to anything wrong with my code?
Here is the relevant code:
class NNWeight {
public:
    double **weights; // weights[from][to] between two layers
    double *bias;     // one bias per neuron in the "to" layer
};
#include <cmath>

double sigmoid(double value) {
    return 1.0 / (1.0 + exp(-value));
}

double Dsigmoid(double value) { // derivative of the sigmoid, written in terms of the activation (already-sigmoided) value
    return value * (1.0 - value);
}
//wInpHid = input-to-hidden layer weights (of type NNWeight)
//wHidOut = hidden-to-output layer weights (of type NNWeight)
void updateWeight() {
    for (int iInp = 0; iInp < nInp; iInp++)
        for (int iHid = 0; iHid < nHid; iHid++)
            wInpHid.weights[iInp][iHid] += dInpHid.weights[iInp][iHid];
    for (int iHid = 0; iHid < nHid; iHid++) {
        wInpHid.bias[iHid] += dInpHid.bias[iHid];
    }
    for (int iHid = 0; iHid < nHid; iHid++)
        for (int iOut = 0; iOut < nOut; iOut++)
            wHidOut.weights[iHid][iOut] += dHidOut.weights[iHid][iOut];
    for (int iOut = 0; iOut < nOut; iOut++)
        wHidOut.bias[iOut] += dHidOut.bias[iOut];
}
void feedFW() {
    // Feed the hidden layer
    for (int iHid = 0; iHid < nHid; iHid++) {
        hid[iHid] = wInpHid.bias[iHid];
        for (int iInp = 0; iInp < nInp; iInp++) {
            hid[iHid] += inp[iInp] * wInpHid.weights[iInp][iHid];
        }
        hid[iHid] = sigmoid(hid[iHid]);
    }
    // Feed the output layer
    for (int iOut = 0; iOut < nOut; iOut++) {
        out[iOut] = wHidOut.bias[iOut];
        for (int iHid = 0; iHid < nHid; iHid++)
            out[iOut] += hid[iHid] * wHidOut.weights[iHid][iOut];
        out[iOut] = sigmoid(out[iOut]);
    }
}
void backProp() {
    double SE = 0.0;
    // Calc the output errors:
    for (int iOut = 0; iOut < nOut; iOut++) {
        SE += (out[iOut] - tar[iOut]) * (out[iOut] - tar[iOut]);
        outErr[iOut] = (tar[iOut] - out[iOut]) * Dsigmoid(out[iOut]);
    }
    // Calc the hidden errors:
    for (int iHid = 0; iHid < nHid; iHid++) {
        double sum_wHidOut = 0.0;
        for (int iOut = 0; iOut < nOut; iOut++) {
            sum_wHidOut += outErr[iOut] * wHidOut.weights[iHid][iOut];
        }
        hidErr[iHid] = sum_wHidOut * Dsigmoid(hid[iHid]);
    }
    // Calc hidden - output layer delta (dHidOut)
    for (int iHid = 0; iHid < nHid; iHid++)
        for (int iOut = 0; iOut < nOut; iOut++) {
            dHidOut.weights[iHid][iOut] = lRate * outErr[iOut] * hid[iHid] + dHidOut.weights[iHid][iOut] * momentum; // lRate = learning rate
        }
    for (int iOut = 0; iOut < nOut; iOut++) {
        dHidOut.bias[iOut] = lRate * outErr[iOut] + dHidOut.bias[iOut] * momentum;
    }
    // Calc input - hidden layer delta (dInpHid)
    for (int iInp = 0; iInp < nInp; iInp++)
        for (int iHid = 0; iHid < nHid; iHid++)
            dInpHid.weights[iInp][iHid] = lRate * hidErr[iHid] * inp[iInp] + dInpHid.weights[iInp][iHid] * momentum;
    for (int iHid = 0; iHid < nHid; iHid++)
        dInpHid.bias[iHid] = lRate * hidErr[iHid] + dInpHid.bias[iHid] * momentum;
    // Update the weights
    updateWeight();
}