add all batch sizes and refactor backprop
100 committed Jan 5, 2017
1 parent 5a48c2c commit 45d7c92
Showing 4 changed files with 178 additions and 118 deletions.
9 changes: 9 additions & 0 deletions README.md
@@ -1,2 +1,11 @@
# Neurotic
A portable, header-only neural network framework written in C

## Features
* Sigmoid, ReLU, tanh, Softmax activations
* Cross-entropy loss, Mean squared error
* Batch, mini-batch, and stochastic (SGD) gradient descent
* L2 Regularization
* Learning rate annealing
* Momentum
* Serializable network
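
A minimal usage sketch of the training interface introduced in this commit. Only `batchGradientDescent`, `forwardPass`, `createMatrixZeroes`, and `destroyMatrix` appear in this diff; the umbrella header name and the `createNetwork` builder below are illustrative placeholders for whatever the library actually provides:

```c
#include "neurotic.h"  // hypothetical umbrella header; include the real headers instead

int main(void){
    // 4 training rows with 2 features each, and matching one-hot labels
    Matrix* data    = createMatrixZeroes(4, 2);
    Matrix* classes = createMatrixZeroes(4, 2);
    /* ... fill data and classes so that row i of classes labels row i of data ... */

    // placeholder: build a network with the library's own constructor
    Network* net = createNetwork(/* layer sizes and activations */);

    // mini-batches of 2 rows, cross-entropy loss, no annealing, light L2,
    // momentum 0.9, 1000 epochs, shuffle between epochs, verbose logging
    batchGradientDescent(net, data, classes, CROSS_ENTROPY_LOSS, 2,
                         0.1f, 0, 0.001f, 0.9f, 1000, 1, 1);

    forwardPass(net, data);  // predictions end up in the output layer's input matrix

    destroyMatrix(data);
    destroyMatrix(classes);
    return 0;
}
```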
19 changes: 18 additions & 1 deletion src/matrix.h
@@ -66,12 +66,15 @@ Matrix* hadamard(Matrix* A, Matrix* B);
// places values of hadamard product of $A and $B into $into
void hadamardInto(Matrix* A, Matrix* B, Matrix* into);

// returns a copy of input matrix
// returns a shallow copy of input matrix
Matrix* copy(Matrix* orig);

// returns 1 if matrices are equal, 0 otherwise
int equals(Matrix* A, Matrix* B);

// shuffle two matrices, maintaining alignment between their rows
void shuffleTogether(Matrix* A, Matrix* B);

// frees a matrix and its data
void destroyMatrix(Matrix* matrix);

@@ -333,6 +336,20 @@ int equals(Matrix* A, Matrix* B){
return 1;
}

void shuffleTogether(Matrix* A, Matrix* B){
assert(A->rows == B->rows);
int i;
for (i = 0; i < A->rows - 1; i++){
size_t j = i + rand() / (RAND_MAX / (A->rows - i) + 1);
float* tmpA = A->data[j];
A->data[j] = A->data[i];
A->data[i] = tmpA;
float* tmpB = B->data[j];
B->data[j] = B->data[i];
B->data[i] = tmpB;
}
}
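
The function above is an in-place Fisher–Yates shuffle applied to both matrices' row-pointer arrays with the same random index, so row i of A still lines up with row i of B afterwards; the expression `i + rand() / (RAND_MAX / (A->rows - i) + 1)` draws j uniformly from [i, A->rows) without the modulo bias of `rand() % (A->rows - i)`. A small sketch of the intended use (the header path, seeding, and fill-in step are assumptions):

```c
#include <stdlib.h>
#include <time.h>
#include "matrix.h"  // assumed location of the header shown above

int main(void){
    srand((unsigned) time(NULL));  // seed rand() once before shuffling

    Matrix* X = createMatrixZeroes(4, 3);  // 4 examples, 3 features each
    Matrix* Y = createMatrixZeroes(4, 2);  // matching one-hot labels
    /* ... fill X and Y so that Y->data[i] labels X->data[i] ... */

    // both matrices get the same row permutation
    shuffleTogether(X, Y);

    destroyMatrix(X);
    destroyMatrix(Y);
    return 0;
}
```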

void destroyMatrix(Matrix* matrix){
int i;
for (i = 0; i < matrix->rows; i++){
260 changes: 146 additions & 114 deletions src/optimizer.h
@@ -7,40 +7,45 @@
#ifndef OPTIMIZER_H
#define OPTIMIZER_H

// batch gradient descent
typedef enum LOSS_FUNCTION_ {
CROSS_ENTROPY_LOSS,
MEAN_SQUARED_ERROR
} LOSS_FUNCTION;

// batch gradient descent main function
// $network is the network to be trained
// $data is the training data
// $classes are the true values for each data point in $data, in order
// $batchSize is the size of each batch to use (1 for SGD, #rows for batch)
// $learningRate is the initial learning rate
// $searchTime controls the search-then-converge annealing schedule (0 disables annealing)
// $regularizationStrength is the multiplier for L2 regularization
// $momentumFactor is the momentum coefficient applied to the previous update
// $maxIters is the number of epochs to run the algorithm for
// $shuffle, if non-zero, will shuffle the data between iterations
// $verbose, if non-zero, will print loss every 100 epochs
void batchGradientDescent(Network* network, Matrix* data, Matrix* classes, float learningRate, float searchTime, float regularizationStrength, float momentumFactor, int maxIters, int verbose);
void batchGradientDescent(Network* network, Matrix* data, Matrix* classes, LOSS_FUNCTION lossFunction, int batchSize, float learningRate, float searchTime, float regularizationStrength, float momentumFactor, int maxIters, int shuffle, int verbose);
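
For reference, the batch sizes added by this commit map onto the three training modes like this (the `net`, `data`, and `classes` variables and the hyperparameter values are illustrative, not from the repository):

```c
// full batch: one update per pass over the whole data set
batchGradientDescent(net, data, classes, CROSS_ENTROPY_LOSS, data->rows, 0.1f, 0, 0.001f, 0.9f, 1000, 1, 0);

// SGD: one update per training example
batchGradientDescent(net, data, classes, CROSS_ENTROPY_LOSS, 1, 0.1f, 0, 0.001f, 0.9f, 1000, 1, 0);

// mini-batch: here, one update per 32 examples
batchGradientDescent(net, data, classes, CROSS_ENTROPY_LOSS, 32, 0.1f, 0, 0.001f, 0.9f, 1000, 1, 0);
```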


/*
Begin functions.
*/

void batchGradientDescent(Network* network, Matrix* data, Matrix* classes, float learningRate, float searchTime, float regularizationStrength, float momentumFactor, int maxIters, int verbose){
void batchGradientDescent(Network* network, Matrix* data, Matrix* classes, LOSS_FUNCTION lossFunction, int batchSize, float learningRate, float searchTime, float regularizationStrength, float momentumFactor, int maxIters, int shuffle, int verbose){
assert(network->layers[0]->size == data->cols);
assert(data->rows == classes->rows);
assert(network->layers[network->numLayers - 1]->size == classes->cols);
assert(batchSize <= data->rows);
assert(maxIters >= 1);

// since it creates batches deterministically, the data batches and
// the class batches will still align properly
Matrix** dataBatches = createBatches(data, data->rows);
Matrix** classBatches = createBatches(classes, data->rows);

int training, epoch, layer, i, j, k;
int i, j, k;

// these will be reused per training instance
Matrix* errori[network->numLayers];
Matrix* dWi[network->numConnections];
Matrix* dbi[network->numConnections];
Matrix* regi[network->numConnections];
Matrix* beforeOutputT = createMatrixZeroes(network->layers[network->numLayers - 2]->size, 1);
for (i = 0; i < network->numConnections; i++){
errori[i] = createMatrixZeroes(1, network->layers[i]->size);
dWi[i] = createMatrixZeroes(network->connections[i]->weights->rows, network->connections[i]->weights->cols);
@@ -77,134 +82,161 @@ void batchGradientDescent(Network* network, Matrix* data, Matrix* classes, float
dbi_last[i] = createMatrixZeroes(1, network->connections[i]->bias->cols);
}

for (epoch = 1; epoch <= maxIters; epoch++){
for (training = 0; training < data->rows; training++){
// current data point to train on
Matrix* example = dataBatches[training];
Matrix* target = classBatches[training];

// forward pass on the current example
forwardPass(network, example);

// calculate each iteration of backpropagation
for (layer = network->numLayers - 1; layer > 0; layer--){
Layer* to = network->layers[layer];
Connection* con = network->connections[layer - 1];
if (layer == network->numLayers - 1){
// calculate output layer's error
copyValuesInto(to->input, errori[layer]);
for (j = 0; j < errori[layer]->cols; j++){
errori[layer]->data[0][j] -= target->data[0][j];
int numBatches = (data->rows / batchSize) + (data->rows % batchSize != 0 ? 1 : 0);
int training, batch, epoch, layer;
Matrix** dataBatches,** classBatches;
epoch = 1;
while (epoch <= maxIters){
// shuffle all data and classes but maintain training/class alignment
if (shuffle != 0){
shuffleTogether(data, classes);
}

// split into overall batches
dataBatches = createBatches(data, numBatches);
classBatches = createBatches(classes, numBatches);
for (batch = 0; batch < numBatches && epoch <= maxIters; batch++, epoch++){
// find current batch
int curBatchSize = batch == numBatches - 1 ? (data->rows % batchSize != 0 ? data->rows % batchSize : batchSize) : batchSize;
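// e.g. 10 rows with batchSize 3 gives numBatches == 4: three batches of 3 rows
// and a final remainder batch of 10 % 3 == 1 row (illustrative numbers)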
Matrix* batchTraining = dataBatches[batch];
Matrix* batchClasses = classBatches[batch];
Matrix** splitTraining = createBatches(batchTraining, curBatchSize);
Matrix** splitClasses = createBatches(batchClasses, curBatchSize);
for (training = 0; training < curBatchSize; training++){
// current data point to train on
Matrix* example = splitTraining[training];
Matrix* target = splitClasses[training];

// forward pass on the current example
forwardPass(network, example);

// calculate each iteration of backpropagation
for (layer = network->numLayers - 1; layer > 0; layer--){
Layer* to = network->layers[layer];
Connection* con = network->connections[layer - 1];
if (layer == network->numLayers - 1){
// calculate output layer's error
copyValuesInto(to->input, errori[layer]);
for (j = 0; j < errori[layer]->cols; j++){
errori[layer]->data[0][j] -= target->data[0][j];
}
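// (output - target) is the output-layer delta for softmax with cross-entropy
// (and for a linear output with mean squared error), so no activation
// derivative is applied at this layer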

// calculate dWi and dbi
transposeInto(con->from->input, beforeOutputT);
multiplyInto(beforeOutputT, errori[layer], dWi[layer - 1]);
copyValuesInto(errori[layer], dbi[layer - 1]);
}
// calculate dWi and dbi
copyValuesInto(con->weights, dWi[layer - 1]);
for (i = 0; i < dWi[layer - 1]->rows; i++){
for (j = 0; j < dWi[layer - 1]->cols; j++){
dWi[layer - 1]->data[i][j] = errori[layer]->data[0][j] * con->from->input->data[0][i];
else{
// calculate error term for hidden layer
int hiddenLayer = layer - 1;
transposeInto(network->connections[layer]->weights, WTi[hiddenLayer]);
multiplyInto(errori[layer + 1], WTi[hiddenLayer], errorLastTi[hiddenLayer]);
copyValuesInto(con->to->input, fprimei[hiddenLayer]);
float (*derivative)(float) = activationDerivative(con->to->activation);
for (j = 0; j < fprimei[hiddenLayer]->cols; j++){
fprimei[hiddenLayer]->data[0][j] = derivative(fprimei[hiddenLayer]->data[0][j]);
}
hadamardInto(errorLastTi[hiddenLayer], fprimei[hiddenLayer], errori[layer]);

// calculate dWi and dbi
transposeInto(con->from->input, inputTi[hiddenLayer]);
multiplyInto(inputTi[hiddenLayer], errori[layer], dWi[layer - 1]);
copyValuesInto(errori[layer], dbi[layer - 1]);
}
copyValuesInto(errori[layer], dbi[layer - 1]);
}
else{
// calculate error term for hidden layer
int hiddenLayer = layer - 1;
transposeInto(network->connections[layer]->weights, WTi[hiddenLayer]);
multiplyInto(errori[layer + 1], WTi[hiddenLayer], errorLastTi[hiddenLayer]);
copyValuesInto(con->to->input, fprimei[hiddenLayer]);
float (*derivative)(float) = activationDerivative(con->to->activation);
for (j = 0; j < fprimei[hiddenLayer]->cols; j++){
fprimei[hiddenLayer]->data[0][j] = derivative(fprimei[hiddenLayer]->data[0][j]);
}
hadamardInto(errorLastTi[hiddenLayer], fprimei[hiddenLayer], errori[layer]);

// calculate dWi and dbi
transposeInto(con->from->input, inputTi[hiddenLayer]);
multiplyInto(inputTi[hiddenLayer], errori[layer], dWi[layer - 1]);
copyValuesInto(errori[layer], dbi[layer - 1]);
// add one example's contribution to total gradient
for (i = 0; i < network->numConnections; i++){
addTo(dWi[i], dWi_avg[i]);
addTo(dbi[i], dbi_avg[i]);
}

// zero out reusable matrices
zeroMatrix(beforeOutputT);
for (i = 0; i < network->numConnections; i++){
zeroMatrix(errori[i]);
zeroMatrix(dWi[i]);
zeroMatrix(dbi[i]);
}
zeroMatrix(errori[i]);
if (numHidden > 0){
for (i = 0; i < numHidden; i++){
zeroMatrix(WTi[i]);
zeroMatrix(errorLastTi[i]);
zeroMatrix(fprimei[i]);
zeroMatrix(inputTi[i]);
}
}
}

// add one example's contribution to total gradient
// calculate learning rate for this epoch
float currentLearningRate = searchTime == 0 ? learningRate : learningRate / (1 + (epoch / searchTime));
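// search-then-converge annealing: the rate stays near learningRate while
// epoch << searchTime and decays roughly as 1/epoch beyond that; a searchTime
// of 0 keeps the learning rate constant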

// average out gradients and add learning rate
for (i = 0; i < network->numConnections; i++){
addTo(dWi[i], dWi_avg[i]);
addTo(dbi[i], dbi_avg[i]);
scalarMultiply(dWi_avg[i], currentLearningRate / data->rows);
scalarMultiply(dbi_avg[i], currentLearningRate / data->rows);
}

// zero out reusable matrices
// add regularization
for (i = 0; i < network->numConnections; i++){
zeroMatrix(errori[i]);
zeroMatrix(dWi[i]);
zeroMatrix(dbi[i]);
}
zeroMatrix(errori[i]);
if (numHidden > 0){
for (i = 0; i < numHidden; i++){
zeroMatrix(WTi[i]);
zeroMatrix(errorLastTi[i]);
zeroMatrix(fprimei[i]);
zeroMatrix(inputTi[i]);
}
copyValuesInto(network->connections[i]->weights, regi[i]);
scalarMultiply(regi[i], regularizationStrength);
addTo(regi[i], dWi_avg[i]);
}
}

// calculate learning rate for this epoch
float currentLearningRate = searchTime == 0 ? learningRate : learningRate / (1 + (epoch / searchTime));

// average out gradients and add learning rate
for (i = 0; i < network->numConnections; i++){
scalarMultiply(dWi_avg[i], currentLearningRate / data->rows);
scalarMultiply(dbi_avg[i], currentLearningRate / data->rows);
}

// add regularization
for (i = 0; i < network->numConnections; i++){
copyValuesInto(network->connections[i]->weights, regi[i]);
scalarMultiply(regi[i], regularizationStrength);
addTo(regi[i], dWi_avg[i]);
}
// add momentum
for (i = 0; i < network->numConnections; i++){
scalarMultiply(dWi_last[i], momentumFactor);
scalarMultiply(dbi_last[i], momentumFactor);
addTo(dWi_last[i], dWi_avg[i]);
addTo(dbi_last[i], dbi_avg[i]);
}

// add momentum
for (i = 0; i < network->numConnections; i++){
scalarMultiply(dWi_last[i], momentumFactor);
scalarMultiply(dbi_last[i], momentumFactor);
addTo(dWi_last[i], dWi_avg[i]);
addTo(dbi_last[i], dbi_avg[i]);
}
// adjust weights and bias
for (i = 0; i < network->numConnections; i++){
scalarMultiply(dWi_avg[i], -1);
scalarMultiply(dbi_avg[i], -1);
addTo(dWi_avg[i], network->connections[i]->weights);
addTo(dbi_avg[i], network->connections[i]->bias);
}

// adjust weights and bias
for (i = 0; i < network->numConnections; i++){
scalarMultiply(dWi_avg[i], -1);
scalarMultiply(dbi_avg[i], -1);
addTo(dWi_avg[i], network->connections[i]->weights);
addTo(dbi_avg[i], network->connections[i]->bias);
}
// cache weight and bias updates for momentum
for (i = 0; i < network->numConnections; i++){
copyValuesInto(dWi_avg[i], dWi_last[i]);
copyValuesInto(dbi_avg[i], dbi_last[i]);
// make positive again for next epoch
scalarMultiply(dWi_last[i], -1);
scalarMultiply(dbi_last[i], -1);
}

// cache weight and bias updates for momentum
for (i = 0; i < network->numConnections; i++){
copyValuesInto(dWi_avg[i], dWi_last[i]);
copyValuesInto(dbi_avg[i], dbi_last[i]);
// make positive again for next epoch
scalarMultiply(dWi_last[i], -1);
scalarMultiply(dbi_last[i], -1);
}
// zero out reusable average matrices and regularization matrices
for (i = 0; i < network->numConnections; i++){
zeroMatrix(dWi_avg[i]);
zeroMatrix(dbi_avg[i]);
zeroMatrix(regi[i]);
}

// zero out reusable average matrices and regularization matrices
for (i = 0; i < network->numConnections; i++){
zeroMatrix(dWi_avg[i]);
zeroMatrix(dbi_avg[i]);
zeroMatrix(regi[i]);
}
// free list of training examples
for (i = 0; i < curBatchSize; i++){
free(splitTraining[i]);
free(splitClasses[i]);
}
free(splitTraining);
free(splitClasses);

// if verbose is set, print loss every 100 epochs
if (verbose != 0){
if (epoch % 100 == 0){
forwardPass(network, data);
printf("EPOCH %d: loss is %f\n", epoch, crossEntropyLoss(network, network->layers[network->numLayers - 1]->input, classes, regularizationStrength));
// if verbose is set, print loss every 100 epochs
if (verbose != 0){
if (epoch % 100 == 0){
forwardPass(network, data);
printf("EPOCH %d: loss is %f\n", epoch, crossEntropyLoss(network, network->layers[network->numLayers - 1]->input, classes, regularizationStrength));
}
}
}
}

// free all reusable matrices
destroyMatrix(beforeOutputT);
for (i = 0; i < network->numConnections; i++){
destroyMatrix(errori[i]);
destroyMatrix(dWi[i]);
@@ -233,7 +265,7 @@ void batchGradientDescent(Network* network, Matrix* data, Matrix* classes, float
free(inputTi);
}

for (i = 0; i < data->rows; i++){
for (i = 0; i < numBatches; i++){
free(dataBatches[i]);
free(classBatches[i]);
}