#ifndef OPTIMIZER_H
#define OPTIMIZER_H
#include "std_includes.h"
#include "matrix.h"
#include "function.h"
#include "layer.h"
#include "network.h"
// types of loss functions available for optimization
typedef enum LOSS_FUNCTION_ {
CROSS_ENTROPY_LOSS,
MEAN_SQUARED_ERROR
} LOSS_FUNCTION;
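// For reference, with network output y_hat and target y over k output units,
// these correspond to the conventional definitions
//   cross-entropy:       L = -sum_i y_i * log(y_hat_i)
//   mean squared error:  L = (1/k) * sum_i (y_hat_i - y_i)^2
// (see crossEntropyLoss and meanSquaredError for the exact scaling and the
// added L2 regularization term)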
// convenience struct for easier parameter filling
typedef struct ParameterSet_ {
Network* network;
DataSet* data;
DataSet* classes;
LOSS_FUNCTION lossFunction;
size_t batchSize;
float learningRate;
float searchTime;
float regularizationStrength;
float momentumFactor;
int maxIters;
int shuffle;
int verbose;
} ParameterSet;
// batch gradient descent main function
// $network is the network to be trained
// $data is the training data
// $classes holds the true values for each data point in $data, in order
// $batchSize is the size of each mini-batch (1 for stochastic gradient descent, data->rows for full-batch descent)
// $learningRate is the initial learning rate
// $searchTime controls the search-then-converge decay of the learning rate
//   (the effective rate is learningRate / (1 + epoch / searchTime); 0 keeps the rate constant)
// $regularizationStrength is the multiplier for L2 regularization
// $momentumFactor is the multiplier applied to the previous update (momentum term)
// $maxIters is the number of iterations to run; each processed mini-batch counts as one iteration ("epoch" in the code below)
// $shuffle, if non-zero, shuffles the data before each pass over the dataset
// $verbose, if non-zero, prints the loss on the first iteration and every 100th thereafter
static void batchGradientDescent(Network* network, DataSet* data, DataSet* classes, LOSS_FUNCTION lossFunction, size_t batchSize, float learningRate, float searchTime, float regularizationStrength, float momentumFactor, int maxIters, int shuffle, int verbose);
// optimizes given parameters
static void optimize(ParameterSet params){
batchGradientDescent(params.network, params.data, params.classes, params.lossFunction, params.batchSize, params.learningRate, params.searchTime, params.regularizationStrength, params.momentumFactor, params.maxIters, params.shuffle, params.verbose);
}
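/*
Illustrative usage sketch (not part of the library): assuming a Network* `network`
and DataSet* `data` / `classes` have already been constructed elsewhere, a
ParameterSet can be filled and handed to optimize(). All hyperparameter values
below are placeholders, not recommendations.

    ParameterSet params;
    params.network = network;
    params.data = data;
    params.classes = classes;
    params.lossFunction = CROSS_ENTROPY_LOSS;
    params.batchSize = 20;
    params.learningRate = 0.01f;
    params.searchTime = 5000;
    params.regularizationStrength = 0.001f;
    params.momentumFactor = 0.9f;
    params.maxIters = 10000;
    params.shuffle = 1;
    params.verbose = 1;
    optimize(params);
*/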
/*
Begin functions.
*/
void batchGradientDescent(Network* network, DataSet* data, DataSet* classes, LOSS_FUNCTION lossFunction, size_t batchSize, float learningRate, float searchTime, float regularizationStrength, float momentumFactor, int maxIters, int shuffle, int verbose){
assert(network->layers[0]->size == data->cols);
assert(data->rows == classes->rows);
assert(network->layers[network->numLayers - 1]->size == classes->cols);
assert(batchSize <= data->rows);
assert(maxIters >= 1);
int i, j, k;
// these will be reused per training instance
Matrix* errori[network->numLayers];
Matrix* dWi[network->numConnections];
Matrix* dbi[network->numConnections];
Matrix* regi[network->numConnections];
Matrix* beforeOutputT = createMatrixZeroes(network->layers[network->numLayers - 2]->size, 1);
for (i = 0; i < network->numConnections; i++){
errori[i] = createMatrixZeroes(1, network->layers[i]->size);
dWi[i] = createMatrixZeroes(network->connections[i]->weights->rows, network->connections[i]->weights->cols);
dbi[i] = createMatrixZeroes(1, network->connections[i]->bias->cols);
regi[i] = createMatrixZeroes(network->connections[i]->weights->rows, network->connections[i]->weights->cols);
}
errori[i] = createMatrixZeroes(1, network->layers[i]->size);
// these will be reused per training instance if network has hidden layers
int numHidden = network->numLayers - 2;
Matrix** WTi,** errorLastTi,** fprimei,** inputTi;
if (numHidden > 0){
WTi = (Matrix**)malloc(sizeof(Matrix*) * numHidden);
errorLastTi = (Matrix**)malloc(sizeof(Matrix*) * numHidden);
fprimei = (Matrix**)malloc(sizeof(Matrix*) * numHidden);
inputTi = (Matrix**)malloc(sizeof(Matrix*) * numHidden);
for (k = 0; k < numHidden; k++){
WTi[k] = createMatrixZeroes(network->connections[k + 1]->weights->cols, network->connections[k + 1]->weights->rows);
errorLastTi[k] = createMatrixZeroes(1, WTi[k]->cols);
fprimei[k] = createMatrixZeroes(1, network->connections[k]->to->size);
inputTi[k] = createMatrixZeroes(network->connections[k]->from->size, 1);
}
}
// these will be reused per epoch
Matrix* dWi_avg[network->numConnections];
Matrix* dbi_avg[network->numConnections];
Matrix* dWi_last[network->numConnections];
Matrix* dbi_last[network->numConnections];
for (i = 0; i < network->numConnections; i++){
dWi_avg[i] = createMatrixZeroes(network->connections[i]->weights->rows, network->connections[i]->weights->cols);
dbi_avg[i] = createMatrixZeroes(1, network->connections[i]->bias->cols);
dWi_last[i] = createMatrixZeroes(network->connections[i]->weights->rows, network->connections[i]->weights->cols);
dbi_last[i] = createMatrixZeroes(1, network->connections[i]->bias->cols);
}
int numBatches = (data->rows / batchSize) + (data->rows % batchSize != 0 ? 1 : 0);
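// e.g. 100 rows with batchSize 32 gives numBatches = 4; the curBatchSize
// logic below treats the first three batches as 32 rows each and the last
// as the remaining 4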
int training, batch, epoch, layer;
DataSet** dataBatches,** classBatches;
epoch = 1;
while (epoch <= maxIters){
// shuffle all data and classes but maintain training/class alignment
if (shuffle != 0){
shuffleTogether(data, classes);
}
// split into overall batches
dataBatches = createBatches(data, numBatches);
classBatches = createBatches(classes, numBatches);
for (batch = 0; batch < numBatches && epoch <= maxIters; batch++, epoch++){
// find current batch
int curBatchSize = batch == numBatches - 1 ? (data->rows % batchSize != 0 ? data->rows % batchSize : batchSize) : batchSize;
DataSet* batchTraining = dataBatches[batch];
DataSet* batchClasses = classBatches[batch];
Matrix** splitTraining = splitRows(batchTraining);
Matrix** splitClasses = splitRows(batchClasses);
for (training = 0; training < curBatchSize; training++){
// current data point to train on
Matrix* example = splitTraining[training];
Matrix* target = splitClasses[training];
// pass error forward
forwardPass(network, example);
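// Per training example, in the row-vector convention used below (each
// errori[l] is 1 x layerSize), the steps implemented are:
//   output layer:  delta_L = a_L - y
//   hidden layer:  delta_l = (delta_{l+1} * W_{l+1}^T) .* f'(a_l)
//   gradients:     dW_l = a_{l-1}^T * delta_l,   db_l = delta_l
// where a_l is the value stored in the layer after the forward pass
// (layer->input), f' is the derivative from activationDerivative applied
// elementwise, and .* denotes the Hadamard product.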
// calculate each iteration of backpropagation
for (layer = network->numLayers - 1; layer > 0; layer--){
Layer* to = network->layers[layer];
Connection* con = network->connections[layer - 1];
if (layer == network->numLayers - 1){
// calculate output layer's error; for both loss functions the error term
// used here is (activation - target), which is exact for softmax with
// cross-entropy and is applied unchanged for mean squared error as well
copyValuesInto(to->input, errori[layer]);
for (j = 0; j < errori[layer]->cols; j++){
errori[layer]->data[j] -= target->data[j];
}
// calculate dWi and dbi
transposeInto(con->from->input, beforeOutputT);
multiplyInto(beforeOutputT, errori[layer], dWi[layer - 1]);
copyValuesInto(errori[layer], dbi[layer - 1]);
}
else{
// calculate error term for hidden layer
int hiddenLayer = layer - 1;
transposeInto(network->connections[layer]->weights, WTi[hiddenLayer]);
multiplyInto(errori[layer + 1], WTi[hiddenLayer], errorLastTi[hiddenLayer]);
copyValuesInto(con->to->input, fprimei[hiddenLayer]);
float (*derivative)(float) = activationDerivative(con->to->activation);
for (j = 0; j < fprimei[hiddenLayer]->cols; j++){
fprimei[hiddenLayer]->data[j] = derivative(fprimei[hiddenLayer]->data[j]);
}
hadamardInto(errorLastTi[hiddenLayer], fprimei[hiddenLayer], errori[layer]);
// calculate dWi and dbi
transposeInto(con->from->input, inputTi[hiddenLayer]);
multiplyInto(inputTi[hiddenLayer], errori[layer], dWi[layer - 1]);
copyValuesInto(errori[layer], dbi[layer - 1]);
}
}
// add one example's contribution to total gradient
for (i = 0; i < network->numConnections; i++){
addTo(dWi[i], dWi_avg[i]);
addTo(dbi[i], dbi_avg[i]);
}
// zero out reusable matrices
zeroMatrix(beforeOutputT);
for (i = 0; i < network->numConnections; i++){
zeroMatrix(errori[i]);
zeroMatrix(dWi[i]);
zeroMatrix(dbi[i]);
}
zeroMatrix(errori[i]);
if (numHidden > 0){
for (i = 0; i < numHidden; i++){
zeroMatrix(WTi[i]);
zeroMatrix(errorLastTi[i]);
zeroMatrix(fprimei[i]);
zeroMatrix(inputTi[i]);
}
}
}
// calculate learning rate for this epoch
float currentLearningRate = searchTime == 0 ? learningRate : learningRate / (1 + (epoch / searchTime));
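// e.g. with learningRate = 0.1 and searchTime = 1000 the rate is roughly 0.1
// early on, 0.05 at epoch 1000, and about 0.033 at epoch 2000
// (search-then-converge decay); searchTime = 0 keeps the rate fixed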
// average the summed gradients over the dataset size and scale by the learning rate
for (i = 0; i < network->numConnections; i++){
scalarMultiply(dWi_avg[i], currentLearningRate / data->rows);
scalarMultiply(dbi_avg[i], currentLearningRate / data->rows);
}
// add regularization
for (i = 0; i < network->numConnections; i++){
copyValuesInto(network->connections[i]->weights, regi[i]);
scalarMultiply(regi[i], regularizationStrength);
addTo(regi[i], dWi_avg[i]);
}
// add momentum
for (i = 0; i < network->numConnections; i++){
scalarMultiply(dWi_last[i], momentumFactor);
scalarMultiply(dbi_last[i], momentumFactor);
addTo(dWi_last[i], dWi_avg[i]);
addTo(dbi_last[i], dbi_avg[i]);
}
// adjust weights and bias
for (i = 0; i < network->numConnections; i++){
scalarMultiply(dWi_avg[i], -1);
scalarMultiply(dbi_avg[i], -1);
addTo(dWi_avg[i], network->connections[i]->weights);
addTo(dbi_avg[i], network->connections[i]->bias);
}
// cache weight and bias updates for momentum
for (i = 0; i < network->numConnections; i++){
copyValuesInto(dWi_avg[i], dWi_last[i]);
copyValuesInto(dbi_avg[i], dbi_last[i]);
// make positive again for next epoch
scalarMultiply(dWi_last[i], -1);
scalarMultiply(dbi_last[i], -1);
}
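// Net effect of the blocks above, per iteration t:
//   step_t = lr_t * (summed batch gradient) / data->rows
//            + regularizationStrength * W + momentumFactor * step_{t-1}
//   W <- W - step_t   (biases likewise, but without the regularization term)
// with step_t cached in dWi_last / dbi_last for the next iteration's momentum.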
// zero out reusable average matrices and regularization matrices
for (i = 0; i < network->numConnections; i++){
zeroMatrix(dWi_avg[i]);
zeroMatrix(dbi_avg[i]);
zeroMatrix(regi[i]);
}
// free list of training examples
for (i = 0; i < curBatchSize; i++){
free(splitTraining[i]);
free(splitClasses[i]);
}
free(splitTraining);
free(splitClasses);
// if verbose is set, print the full-dataset loss on the first iteration and every 100 iterations thereafter
if (verbose != 0){
if (epoch % 100 == 0 || epoch == 1){
forwardPassDataSet(network, data);
if (lossFunction == CROSS_ENTROPY_LOSS){
printf("EPOCH %d: loss is %f\n", epoch, crossEntropyLoss(network, getOuput(network), classes, regularizationStrength));
}
else{
printf("EPOCH %d: loss is %f\n", epoch, meanSquaredError(network, getOuput(network), classes, regularizationStrength));
}
}
}
}
// free batch data for next batch creation
for (i = 0; i < numBatches; i++){
free(dataBatches[i]);
free(classBatches[i]);
}
free(dataBatches);
free(classBatches);
}
// free all reusable matrices
destroyMatrix(beforeOutputT);
for (i = 0; i < network->numConnections; i++){
destroyMatrix(errori[i]);
destroyMatrix(dWi[i]);
destroyMatrix(dbi[i]);
}
destroyMatrix(errori[i]);
for (i = 0; i < network->numConnections; i++){
destroyMatrix(dWi_avg[i]);
destroyMatrix(dbi_avg[i]);
destroyMatrix(dWi_last[i]);
destroyMatrix(dbi_last[i]);
destroyMatrix(regi[i]);
}
if (numHidden > 0){
for (i = 0; i < numHidden; i++){
destroyMatrix(WTi[i]);
destroyMatrix(errorLastTi[i]);
destroyMatrix(fprimei[i]);
destroyMatrix(inputTi[i]);
}
free(WTi);
free(errorLastTi);
free(fprimei);
free(inputTi);
}
}
#endif