-
Notifications
You must be signed in to change notification settings - Fork 7
/
train_ingrid.nim
1106 lines (1015 loc) · 45.5 KB
/
train_ingrid.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
863
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import flambeau / [flambeau_nn, tensors]
import std / [strformat, os, strutils, sequtils, random, algorithm, options, macros]
import ingrid / [tos_helpers, ingrid_types]
import pkg / [datamancer, unchained, nimhdf5]
# have to include the type definitions
include ./nn_types
import ./io_helpers
include ./nn_cuts
{.experimental: "views".}
var ClampOutput = 50.0
## XXX: a bit annoying that this is here...
const CdlFile = "/home/basti/CastData/data/CDL_2019/calibration-cdl-2018.h5"
let UseTeX = getEnv("USE_TEX", "false").parseBool
let Width = getEnv("WIDTH", "600").parseFloat
let Height = getEnv("Height", "420").parseFloat
proc thL(fWidth: float, width: float,
baseTheme: (proc(): Theme),
height = -1.0, ratio = -1.0,
textWidth = 458.29268, # 455.24411
): Theme =
if UseTeX:
let texOptions = toTeXOptions(UseTeX, onlyTikZ = false,
standalone = true,
texTemplate = "", caption = "", label = "", placement = "htbp")
result = themeLatex(fWidth, width, baseTheme, height, ratio, textWidth,
useTeX = UseTeX, texOptions = texOptions)
else:
result = Theme()
proc generateTrainTest(df: DataFrame, rnd: var Rand):
((RawTensor, RawTensor), (RawTensor, RawTensor)) {.noInit.} =
# - generate
# - generate random numbers from 0 to df length
# - add as column and sort by it
## XXX: this mixes signal and background events!
## XXX: Replace this `"x" in df` check by something sane!
if "x" in df: # raw data
const Num = 1000
let nTrain = Num #df.len div 2
let nTest = Num # df.len - nTrain
var data = rawtensors.zeros(nTrain * 256 * 256).reshape([nTrain.int64, 256, 256].asTorchView())
var target = rawtensors.zeros(nTrain * 2).reshape(sizes = [nTrain.int64, 2].asTorchView())
var dataTest = rawtensors.zeros(nTest * 256 * 256).reshape(sizes = [nTest.int64, 256, 256].asTorchView())
var targetTest = rawtensors.zeros(nTest * 2).reshape(sizes = [nTest.int64, 2].asTorchView())
var i = 0
for (tup, subDf) in groups(df.group_by(["eventNumber", "Type"])):
let ev = tup[0][1].toInt
# echo "Event number ", ev, " at index ", i
let xs = subDf["x", int]
let ys = subDf["y", int]
let isSignal = tup[1][1].toStr == $dtSignal
for j in 0 ..< xs.size:
let x = xs[j]
let y = ys[j]
if i < nTrain:
data[i, x, y] = 1.0
target[i, _] = if isSignal: [1, 0].toRawTensorFromScalar
else: [0, 1].toRawTensorFromScalar
else:
dataTest[i - nTrain, x, y] = 1.0
targetTest[i - nTrain, _] = if isSignal: [1, 0].toRawTensorFromScalar
else: [0, 1].toRawTensorFromScalar
inc i
if i >= nTrain nTest: break
result = (train: (data, target), test: (dataTest, targetTest))
else:
var df = df # mutable copy
df["Idx"] = rnd.shuff(toSeq(0 .. df.high))
df = df.arrange("Idx")
let dfTrain = df[0 .. df.high div 2]
let dfTest = df[df.high div 2 1 .. df.high]
## Add another output each to get the target
## thus return
## train: (input, target)
## test: (input, target)
## tuples each.
echo "SIZES\n\n"
echo dfTrain.len
echo dfTest.len
result = (train: dfTrain.toInputTensor, test: dfTest.toInputTensor)
proc plotLikelihoodDist(df: DataFrame) =
echo df
let df = df.mutate(f{float -> float: "likelihood" ~ (if classify(idx("likelihood")) == fcInf: 50.0
else: idx("likelihood"))})
ggplot(df, aes("likelihood", fill = "Type"))
geom_histogram(bins = 100, position = "identity", alpha = some(0.5), hdKind = hdOutline)
scale_x_continuous()
ggsave("/tmp/likelihood.pdf")
proc plotPredictions(predictions: seq[float], targets: seq[int],
outfile = "/tmp/test.pdf") =
#echo "INPUT ", predictions.len
#echo "TARG ", targets.len
let dfPlt = toDf(predictions, targets)
.mutate(f{"isSignal" ~ `targets` == 1})
.filter(f{`predictions` > -ClampOutput and `predictions` < ClampOutput})
#dfPlt.showBrowser()
#echo "Number of signals: ", dfPlt.filter(f{`isSignal` == true})
#echo "Number of backs: ", dfPlt.filter(f{`isSignal` == false})
#if true: quit()
try:
ggplot(dfPlt, aes("predictions", fill = "isSignal"))
geom_histogram(bins = 100, position = "identity", alpha = some(0.5), hdKind = hdOutline)
xlab("Predictions") ylab("Count")
scale_x_continuous()
margin(right = 3.5, left = 3.5)
thL(fWidth = 0.5, width = Width, baseTheme = sideBySide)
ggsave(outfile, width = 600, height = 420)
except:
discard
# now a log10 version
var dfL = newDataFrame()
var maxH = 0.0
for (tup, subDf) in groups(dfPlt.group_by("isSignal")):
let (hist, bins) = histogram(subDf["predictions", float].toSeq1D, bins = 100)
let dfH = toDf({"count" : hist.extend(0), "predictions" : bins, "isSignal" : tup[0][1].toBool})
dfL.add dfH
maxH = max(maxH, hist.max.float)
try:
ggplot(dfL, aes("predictions", "count", fill = "isSignal"))
geom_histogram(stat = "identity", position = "identity", alpha = some(0.5), hdKind = hdOutline)
scale_x_continuous()
xlab("Predictions") ylab("Count")
scale_y_log10()
ylim(0.0, log10(maxH))
margin(right = 3.5, left = 3.5)
thL(fWidth = 0.5, width = Width, baseTheme = sideBySide)
ggsave(outfile.replace(".pdf", "_log10.pdf"), width = 600, height = 420)
except:
discard
proc plotType(epochs: seq[int], data, testData: seq[float], typ: string,
outfile: string) =
## XXX: this is broken for `continueAfterEpochs`!
if data.len < 2: return
let df = toDf({"Epoch" : epochs, data, testData})
.gather(["data", "testData"], "Type", typ)
try:
ggplot(df, aes("Epoch", typ, color = "Type"))
geom_line()
scale_y_log10()
scale_x_continuous(breaks = 6)
margin(right = 5.0, left = 3.0)
thL(fWidth = 0.5, width = Width, baseTheme = sideBySide)
ggsave(outfile, width = 600, height = 420)
except:
discard
proc rocCurve(df: DataFrame,
suffix = "", plotPath = "/tmp",
yLow = 0.96) =
## plots the ROC curve of the predictions vs the targets
var plt = if "Type" in df:
ggplot(df, aes("sigEff", "backRej", color = "Type"))
else:
ggplot(df, aes("sigEff", "backRej"))
try:
plt
geom_line()
ylim(yLow, 1.0)
ggsave(&"{plotPath}/roc_curve{suffix}.pdf")
except:
discard
proc rocCurve(predictions: seq[float], targets: seq[int],
suffix = "", plotPath = "/tmp") =
let dfRoc = calcRocCurve(predictions, targets)
rocCurve(dfRoc, suffix, plotPath)
proc predictions(df: DataFrame): (seq[float], seq[int]) =
## The `Prediction` column contains either `"likelihood"` values or
## `"Neuron_0"` values. As likelihood may be saturated, we clamp to 50.
let pred = df["Prediction", float].map_inline:
if classify(x) == fcInf:
50.0
else: x
let targets = df["Type", string].map_inline:
if x == $dtBack: 0
else: 1
result = (pred.toSeq1D, targets.toSeq1D)
proc plotRocCurve(dfLnL, dfMLP: DataFrame, suffix = "_likelihood", plotPath = "/tmp") =
## plots the ROC curve of the predictions vs the targets
let (logL, targets) = predictions(dfLnL)
let (preds, targetsMlp) = predictions(dfMlp) ## Note: dfMlp must have its dataset named `likelihood`! # .rename(f{"likelihood" <- "preds"}))
let dfRoc = calcRocCurve(logL, targets, preds, targetsMlp)
rocCurve(dfRoc, suffix, plotPath)
import ./simulate_xrays
import ingridDatabase / databaseRead
proc prepareSimDataframe(mlpDesc: MLPDesc, readRaw: bool, rnd: var Rand): DataFrame =
## Generates a dataframe to train on that contains real background events and
## simulated X-rays. The calibration input files are only used to determine the
## boundaries of the gas gain and diffusion in which to generate events in!
# for now we just define hardcoded bounds
var dfSim = newDataFrame()
if mlpDesc.simFiles.len == 0:
let gains = @[2400.0, 4000.0]
let diffusion = @[550.0, 700.0] # μm/√cm, will be converted to `mm/√cm` when converted to DF
# the theta parameter describing the Pólya also needs to be varied somehow
var dfSim = newDataFrame()
for c in mlpDesc.calibFiles:
withH5(c, "r"):
let calibInfo = h5f.initCalibInfo()
dfSim.add rnd.generateFakeEventsDf(calibInfo, gains, diffusion, mlpDesc.nFake)
else:
dfSim.add readSimData(mlpDesc.simFiles)
var dfBack = newDataFrame()
for b in mlpDesc.backFiles:
## NOTE: For all training runs before 2023/10/30 we multiplied `subsetPerRun` by 6.
## As we never adjusted `subsetPerRun` by hand, to reproduce old trainings, use
## `--subsetPerRun 6000`!
dfBack.add prepareAllBackground(b, readRaw, subsetPerRun = mlpDesc.subsetPerRun,
region = mlpDesc.backgroundRegion,
chips = mlpDesc.backgroundChips)
echo "Simulated: ", dfSim
echo "Back: ", dfBack
result = newDataFrame()
result.add dfSim.drop(["runNumber", "eventNumber", "Target", "likelihood"])
result.add dfBack.drop(["runNumber", "eventNumber", "Target"])
proc prepareMixedDataframe(mlpDesc: MLPDesc, readRaw: bool): DataFrame =
let dfCdl = prepareCdl(readRaw)
var dfFe = newDataFrame()
let spr = mlpDesc.subsetPerRun
for c in mlpDesc.calibFiles:
dfFe.add readCalibData(c, "escape", 2.5, 3.5, spr div 2)
dfFe.add readCalibData(c, "photo", 5.0, 7.0, spr div 2)
var dfBack = newDataFrame()
for b in mlpDesc.backFiles:
dfBack.add prepareAllBackground(b, readRaw, subsetPerRun = spr, chips = mlpDesc.backgroundChips) # .drop(["centerX", "centerY"])
echo "CDL: ", dfCdl
echo "55Fe: ", dfFe
echo "Back: ", dfBack
result = newDataFrame()
result.add dfCdl
result.add dfFe.drop(["runNumber", "CalibType"])
result.add dfBack.drop(["runNumber"])
template printTensorInfo(arg: untyped): untyped =
echo astToStr(arg), ": ", typeof(arg), " on device ", arg.get_device(), " is cuda ", arg.is_cuda()
proc genCheckpointName(s: string, epoch: int, loss, acc: float): string =
## Generate a name for a model checkpoint given a base name (including file extension)
## and the epoch and test loss & accuracy
result = s
result.removeSuffix(".pt")
result.add &"checkpoint_epoch_{epoch}_loss_{loss:.4f}_acc_{acc:.4f}.pt"
proc test(model: AnyModel,
input, target: RawTensor,
device: Device,
desc: MLPDesc,
plotOutfile = "/tmp/test_validation.pdf",
neuron = 0,
toPlot = true): (float, float, seq[float], seq[int])
proc calcLoss(output, target: RawTensor, desc: MLPDesc): RawTensor =
## Applies the correct loss function based on the MLPDesc
case desc.lossFunction
of lfL1loss : result = l1_loss(output, target)
of lfMSEloss : result = mse_loss(output, target)
of lfSigmoidCrossEntropy : result = sigmoid_cross_entropy(output, target)
proc train(model: AnyModel, optimizer: var Optimizer,
input, target: RawTensor,
testInput, testTarget: RawTensor,
device: Device,
readRaw: bool,
desc: MLPDesc,
epochs: int,
continueAfterEpoch = 0) =
let dataset_size = input.size(0)
var toPlot = false
var mlpDesc = desc # local mutable copy to store losses, accuracy etc in
let plotPath = mlpDesc.plotPath
let start = continueAfterEpoch
let stop = start epochs
for epoch in start .. stop:
var correct = 0
if epoch mod 50 == 0:
echo "Epoch is:" & $epoch
if epoch mod desc.plotEvery == 0:
toPlot = true
var predictions = newSeqOfCap[float](dataset_size)
var targets = newSeqOfCap[int](dataset_size)
## XXX: Adjust the upper end similar to in `test` to get all data!
var sumLoss = 0.0
var count = 0
for batch_id in 0 ..< (dataset_size.float / bsz.float).ceil.int:
# Reset gradients.
optimizer.zero_grad()
# minibatch offset in the Tensor
## TODO: generalize the batching and make sure to take `all` elements! (currently skip last X)
# minibatch offset in the Tensor
let offset = batch_id * bsz
let stop = min(offset bsz, dataset_size)
var x: RawTensor
if not readRaw:
x = input[offset ..< stop, _ ].to(device)
else:
x = input[offset ..< stop, _, _ ].unsqueeze(1).to(device)
#x = input[offset ..< offset bsz, _, _ ].unsqueeze(1)
echo "INPUT TENSOR SHAPE ", x.sizes
let target = target[offset ..< stop].to(device)
#printTensorInfo(target)
# Running input through the network
let output = model.forward(desc, x)
#echo "computed forward"
#printTensorInfo(output)
let pred = output.argmax(1)
#echo "pred ", pred
#printTensorInfo(pred)
if toPlot:
# take 0th column
predictions.add output[_, 0].toNimSeq[:float]
targets.add target[_, 0].toNimSeq[:int]
correct = pred.eq(target.argmax(1)).sum().item(int)
# Computing the loss
var loss = calcLoss(output, target, desc)
# Compute the gradient (i.e. contribution of each parameter to the loss)
loss.backward()
#echo "did backward"
# Correct the weights now that we have the gradient information
optimizer.step()
#echo "stepped"
sumLoss = loss.item(float)
inc count
if toPlot:
let train_acc = correct.float / dataset_size.float64() # loss.item(float)
let averageLoss = sumLoss / count.float
echo &"\nTrain set: Average loss: {averageLoss:.4f} " &
&"| Accuracy: {correct.float64() / dataset_size.float64():.3f}"
let (testAccuracy, testLoss, tOut, tTarget) = model.test(testInput, testTarget, device, desc, toPlot = false)
# epoch
mlpDesc.epochs.add epoch
# Accuracies
mlpDesc.accuracies.add train_acc
mlpDesc.testAccuracies.add testAccuracy
# Losses
mlpDesc.losses.add(min(1.0, averageLoss))
mlpDesc.testLosses.add(min(1.0, testLoss))
# now save the model as a checkpoint
model.save(genCheckpointName(mlpDesc.path, epoch, testLoss, testAccuracy))
# now write the mlpDesc again
mlpDesc.toH5(mlpDesc.path.parentDir / MLPDescName)
## generate the plots
plotPredictions(predictions, targets, outfile = &"{plotPath}/training_output.pdf")
plotPredictions(tOut, tTarget, outfile = &"{plotPath}/validation_output.pdf")
plotType(mlpDesc.epochs, mlpDesc.accuracies, mlpDesc.testAccuracies, "Accuracy", outfile = &"{plotPath}/accuracy.pdf")
plotType(mlpDesc.epochs, mlpDesc.losses, mlpDesc.testLosses, "Loss", outfile = &"{plotPath}/loss.pdf")
let preds = predictions.mapIt(clamp(it, -ClampOutput, ClampOutput))
rocCurve(preds, targets, plotPath = plotPath)
toPlot = false
proc test(model: AnyModel,
input, target: RawTensor,
device: Device,
desc: MLPDesc,
plotOutfile = "/tmp/test_validation.pdf",
neuron = 0,
toPlot = true): (float, float, seq[float], seq[int]) =
## returns the predictions / targets
let dataset_size = input.size(0)
var correct = 0
var predictions = newSeqOfCap[float](dataset_size)
var targets = newSeqOfCap[int](dataset_size)
var sumLoss = 0.0
var count = 0
no_grad_mode:
for batch_id in 0 ..< (dataset_size.float / bsz.float).ceil.int:
# minibatch offset in the Tensor
let offset = batch_id * bsz
let stop = min(offset bsz, dataset_size)
let x = input[offset ..< stop, _ ].to(device)
let target = target[offset ..< stop].to(device)
# Running input through the network
let output = model.forward(desc, x)
# get the larger prediction along axis 1 (the example axis)
let pred = output.argmax(1)
# take 0th column
predictions.add output[_, neuron].toNimSeq[:float]
targets.add target[_, 0].toNimSeq[:int]
correct = pred.eq(target.argmax(1)).sum().item(int)
# Computing the loss
var loss = calcLoss(output, target, desc)
sumLoss = loss.item(float)
inc count
let testAcc = correct.float / dataset_size.float64()
let testLoss = sumLoss / count.float
echo &"\nTest set: Average loss: {testLoss:.4f} " &
&"| Accuracy: {testAcc:.4f}"
## create output plot
if toPlot:
plotPredictions(predictions, targets, outfile = plotOutfile)
# will fail probably...
# doAssert target == targets
let preds = predictions.mapIt(clamp(it, -ClampOutput, ClampOutput))
result = (testAcc, testLoss, preds, targets)
proc predict(model: AnyModel,
input, target: RawTensor,
device: Device,
desc: MLPDesc,
cutVal: float,
neuron: int = 0,
plotPath = "/tmp/"): seq[int] =
## returns the predictions / targets
let dataset_size = input.size(0)
var correct = 0
var predictions = newSeqOfCap[float](dataset_size)
var targets = newSeqOfCap[int](dataset_size)
var sumLoss = 0.0
var count = 0
no_grad_mode:
for batch_id in 0 ..< (dataset_size.float / bsz.float).ceil.int:
# minibatch offset in the Tensor
let offset = batch_id * bsz
let stop = min(offset bsz, dataset_size)
let x = input[offset ..< stop, _ ].to(device)
let target = target[offset ..< stop].to(device)
# Running input through the network
let output = model.forward(desc, x)
# get the larger prediction along axis 1 (the example axis)
let pred = output.argmax(1)
let predSeq = pred.toNimSeq[:int]
# take 0th column
predictions.add output[_, neuron].toNimSeq[:float]
targets.add target[_, 0].toNimSeq[:int]
correct = pred.eq(target.argmax(1)).sum().item(int)
# Computing the loss
var loss = calcLoss(output, target, desc)
sumLoss = loss.item(float)
inc count
for i in 0 ..< min(bsz, stop - offset):
if predSeq[i] == 0:
result.add (offset i).int # else add the index of this event that looks like signal
let predAcc = correct.float / dataset_size.float64()
let predLoss = sumLoss / count.float
echo &"\nPred set: Average loss: {predLoss:.4f} " &
&"| Accuracy: {predAcc:.4f}"
## create output plot
#plotPredictions(predictions, targets, outfile = plotOutfile)
# will fail probably...
# doAssert target == targets
#let preds = predictions.mapIt(clamp(it, -50.0, 50.0))
#result = preds
when false:
let dataset_size = input.size(0)
var correct = 0
var predictions = newSeq[float]()
no_grad_mode:
## XXX: Adjust the upper end similar to in `test` to get all data!
for batch_id in 0 ..< (dataset_size.float / bsz.float).ceil.int:
# minibatch offset in the Tensor
let offset = batch_id * bsz
let stop = min(offset bsz, dataset_size)
let x = input[offset ..< stop, _ ].to(device)
let target = target[offset ..< stop].to(device)
# Running input through the network
let output = model.forward(desc, x)
# get the larger prediction along axis 1 (the example axis)
let pred = output.argmax(1).toNimSeq[:float]()
# take 0th column
#echo pred # [i], " vs ", output[i, _]
predictions.add output[_, 0].toNimSeq[:float]
for i in 0 ..< min(bsz, stop - offset):
#if target[i, _].argmax() == output[i, _].argmax():
# #echo "Is correct! ", offset i # in this case it is indeed classified as background
# discard
#if pred[i].item(int) == 0: # target is [1, 0] for signal. i.e. argmax for signal should be `0` for signal
#if output[i, _].argmax(0).item(int) == 0: # target is [1, 0] for signal. i.e. argmax for signal should be `0` for signal
# result.add offset i
#else:
if output[i, 0].item(float) > cutVal:
#echo "is false! ", offset i
result.add (offset i).int # else add the index of this event that looks like signal
correct = pred.eq(target.argmax(1)).sum().item(int)
let test_loss = correct.float / dataset_size.float64()
echo &"\nPredict set: Average loss: {test_loss:.4f} " &
&"| Accuracy: {correct.float64() / dataset_size.float64():.3f}"
let df = toDf(predictions)
.filter(f{`predictions` > -ClampOutput})
ggplot(df, aes("predictions"))
geom_histogram(bins = 100)
ggsave(&"{plotPath}/predictions_all_background.pdf")
## XXX: remove the plotting, scaling etc logic and rather output the
## prediction data correctly. or rather don't do any prediction in this tool
## per se, move that to `likelihood` and use it as a regular `fkMlp` / `fkCNN` method!
proc scaleDset(data: Column, totalTime, factor: float): Column =
## scales the data in `data` according to the area of the gold region,
## total time and bin width. The data has to be pre binned of course!
let area = pow(0.95 - 0.45, 2) # area of gold region!
const bin_width = 0.2 # 0.392
const shutter_open = 1.0 # Does not play any role, because totalDuration takes
# this into account already!
let scale = factor / (totalTime * shutter_open * area * bin_width) #* 1e5
result = toColumn data.toTensor(float).map_inline(x * scale)
proc histogram(df: DataFrame): DataFrame =
## Calculates the histogam of the energy data in the `df` and returns
## a histogram of the binned data
## TODO: allow to do this by combining different `File` values
let (hist, bins) = histogram(df["energies"].toTensor(float).toSeq1D,
range = (0.0, 20.0), bins = 100)
result = toDf({ "energies" : bins, "count" : concat(hist, @[0]) })
proc plotBackground(data: seq[float], totalTime: Hour, plotPath = "/tmp/") =
let dfE = toDf({"energies" : data})
.filter(f{`energies` < 20.0})
#dfE.showBrowser()
var dfH = histogram(dfE)
let t = if totalTime > 0.0.Hour: totalTime else: 2144.67.Hour ## Correct time for Run-2
dfH["Rate"] = dfH["count"].scaleDset(t.to(Second).float, 1e5)
echo dfH
#ggplot(dfE, aes("energies"))
# geom_histogram(bins = 100)
# ggsave("/tmp/simple_background_rate.pdf")
ggplot(dfH, aes("energies", "Rate"))
geom_histogram(stat = "identity", hdKind = hdOutline)
xlab("Energy [keV]") ylab("Rate [10⁻⁵ keV⁻¹•cm⁻²•s⁻¹]")
ggsave(&"{plotPath}/simple_background_rate.pdf")
proc targetSpecificRoc(dfLnL, dfMlp: DataFrame, plotPath = "/tmp") =
## computes the CDL target specific ROC curves for logL and MLP
# 1. split the input df by target and perform regular ROC calculations
let dfRoc = bind_rows([("LogL", dfLnL), ("MLP", dfMlp)],
"Method")
var df = newDataFrame()
for tup, subDf in groups(dfRoc.group_by(["Method", "Target"])):
let meth = tup[0][1].toStr # lnL or MLP
let target = tup[1][1].toStr
let (output, targets) = subDf.predictions()
let verbose = meth == "LogL" and target == "C-EPIC-0.6kV"
var dfRoc = calcRocCurve(output, targets, verbose)
dfRoc["Target"] = target
dfRoc["Method"] = meth
df.add dfRoc
ggplot(df, aes("sigEff", "backRej", color = "Target", shape = "Method"))
geom_line()
ylim(0.5, 1.0)
xlab("Signal efficiency") ylab("Background rejection")
thL(fWidth = 0.9, width = Width, baseTheme = singlePlot)
ggsave(&"{plotPath}/roc_curve_combined_split_by_target.pdf")
proc determineCdlEfficiency(model: AnyModel, device: Device, desc: MLPDesc, ε: float, readRaw: bool,
global = true, plotPath = "/tmp/") =
##
let dfCdl = prepareCdl(readRaw)
let cutValGlobal = determineCutValue(model, device, desc, ε, readRaw)
let cutValLocal = determineLocalCutValue(model, device, desc, ε, readRaw)
# for reference determine cut value based on full data (expect to get out `ε`)
proc getEff(model: AnyModel, df: DataFrame, cutVal: float, outfile: string): float =
let (cdlInput, cdlTarget) = df.toInputTensor()
let (_, _, cdlPredict, predTargets) = model.test(cdlInput, cdlTarget, device,
desc,
plotOutfile = outfile)
result = determineEff(cdlPredict.sorted, cutVal)
let totalEff = model.getEff(dfCdl, cutValGlobal, &"{plotPath}/cdl_prediction_all_data.pdf")
echo "Total global efficiency = ", totalEff
for (tup, subDf) in groups(dfCdl.group_by("Target")):
let target = tup[0][1].toStr
#echo "Current target: ", target, " data: ", subDf
let cut = if global: cutValGlobal
else: cutValLocal[target]
let effectiveEff = model.getEff(subDf, cut, &"{plotPath}/cdl_prediction_{target}.pdf")
let suffix = if global: "global" else: "local"
echo "Target ", suffix, " ", target, " cutValue = ", cut, " eff = ", effectiveEff
proc predictBackground(model: AnyModel, device: Device, desc: MLPDesc, fname: string, ε: float, totalTime: Hour,
readRaw: bool, plotPath: string) =
# classify
let cutVal = determineCutValue(model, device, desc, ε, readRaw)
## XXX: make sure the cut value stuff & the sign of the predictions is correct everywhere! We flipped the
## sign initially to compare with logL!
let dfAll = prepareAllBackground(fname, readRaw)
let (allInput, allTarget) = dfAll.toInputTensor()
let passedInds = model.predict(allInput, allTarget, device, desc, cutVal)
echo "p inds len ", passedInds.len, " compared to all ", dfAll.len
# get all energies of these passing events
let energiesAll = dfAll["energyFromCharge", float]
var energies = newSeq[float](passedInds.len)
for i, idx in passedInds:
energies[i] = energiesAll[idx]
plotBackground(energies, totalTime, plotPath)
proc predictCalib(model: AnyModel, device: Device, desc: MLPDesc, fname: string, ε: float) =
# classify
var df = newDataFrame()
df.add readCalibData(fname, "escape", 2.5, 3.5)
df.add readCalibData(fname, "photo", 5.5, 6.5)
#let (model, device) = loadModelMakeDevice(model)
for (tup, subDf) in groups(df.group_by(["Type", "runNumber"])):
let (inp, target) = subDf.toInputTensor()
let passedInds = model.predict(inp, target, device, desc, 0.0)
echo "p inds len ", passedInds.len, " compared to all ", subDf.len, " Efficiency: ", passedInds.len.float / subDf.len.float
proc readAllData(model: AnyModel, device: Device,
desc: MLPDesc,
calcLogL: bool,
mlpOutput: bool): (DataFrame, DataFrame, DataFrame) =
var dfCdl = prepareCdl(false)
#.filter(f{`Target` == "Mn-Cr-12kV"}) #<- to only look at Photo peak equivalent. Better look at ridge line!
var dfFe = newDataFrame()
for c in desc.calibFiles:
dfFe.add readCalibData(c, "escape", 2.5, 3.5) #, subsetPerRun div 2)
dfFe.add readCalibData(c, "photo", 5.0, 7.0) #, subsetPerRun div 2)
var dfBack = newDataFrame()
for b in desc.backFiles:
dfBack.add prepareAllBackground(b, false) # , subsetPerRun = subsetPerRun * 6) # .drop(["centerX", "centerY"])
if calcLogL: # also calculate the likelihood of every event
## Set up the likelihood context to compute lnL values!
let ctx = initLikelihoodContext(CdlFile,
year = yr2018,
energyDset = igEnergyFromCharge,
region = crSilver,
timepix = Timepix1,
morphKind = mkLinear,
useLnLCut = true) # morphing to plot interpolation
template lnL(ctx, df: untyped): untyped =
df[igLikelihood.toDset()] = ctx.calcLikelihood(df).mapIt(clamp(it, -ClampOutput, ClampOutput))
ctx.lnL(dfCdl)
ctx.lnL(dfFe)
ctx.lnL(dfBack)
if mlpOutput:
proc modelOutput(model: AnyModel, device: Device, df: var DataFrame, neuron: int) =
df["Neuron_" & $neuron] = model.forward(df.toTorchTensor(), device, desc, neuron)
.mapIt(clamp(it, -ClampOutput, ClampOutput))
model.modelOutput(device, dfCdl, neuron = 0)
model.modelOutput(device, dfCdl, neuron = 1)
model.modelOutput(device, dfFe, neuron = 0)
model.modelOutput(device, dfFe, neuron = 1)
model.modelOutput(device, dfBack, neuron = 0)
model.modelOutput(device, dfBack, neuron = 1)
result = (dfCdl, dfFe, dfBack)
proc predictAll(model: AnyModel, device: Device,
desc: MLPDesc) =
let (dfCdl, dfFe, dfBack) = model.readAllData(device, desc,
calcLogL = true,
mlpOutput = true)
# for each get prediction, neuron 0 and neuron 1
template p(dfIn, neuron, typ: untyped): untyped =
block:
var df = toDf({"preds" : dfIn["Neuron_" & $neuron, float]})
df["Type"] = typ
df["Neuron"] = neuron
df
block AllPredictions:
var dfAll = newDataFrame()
dfAll.add p(dfCdl, 0, "CDL")
dfAll.add p(dfCdl, 1, "CDL")
dfAll.add p(dfFe, 0, "55Fe")
dfAll.add p(dfFe, 1, "55Fe")
dfAll.add p(dfBack, 0, "back")
dfAll.add p(dfBack, 1, "back")
ggplot(dfAll, aes("preds", fill = "Type"))
facet_wrap("Neuron")
geom_histogram(bins = 100, hdKind = hdOutline, alpha = 0.5, position = "identity")
ggsave(&"{desc.plotPath}/all_predictions.pdf", width = 1000, height = 500)
block AllByTypeRidge:
template lnL(dfIn, typ: untyped): untyped =
toDf({ "lnLs" : dfIn["likelihood", float],
"Type" : typ })
template byIt(df: DataFrame, by: string, useLnL = false): DataFrame =
var res = newDataFrame()
for (tup, subDf) in groups(df.group_by(by)):
let typ = tup[0][1].toStr
if not useLnL:
res.add p(subDf, 0, typ)
else:
res.add lnL(subDf, typ)
res
# predictions for MLP
var dfMlp = newDataFrame()
dfMlp.add byIt(dfCdl, "Target")
dfMlp.add byIt(dfFe, "CalibType")
dfMlp.add p(dfBack, 0, "back")
# likelihood values
var dfLnL = newDataFrame()
dfLnL.add byIt(dfCdl, "Target", true)
dfLnL.add byIt(dfFe, "CalibType", true)
dfLnL.add lnL(dfBack, "back")
var labelOrder = initTable[Value, int]()
labelOrder[%~ "back"] = 0
labelOrder[%~ "escape"] = 1
labelOrder[%~ "photo"] = 2
const xrayRef = getXrayRefTable()
for idx, el in xrayRef:
labelOrder[%~ el] = idx 3
proc plotRidges(df: DataFrame, x, plotPath, suffix: string) =
ggplot(df, aes(x, fill = "Type"))
ggridges("Type", overlap = 1.75, labelOrder = labelOrder)
geom_histogram(bins = 300, color = "black", lineWidth = 0.75,
hdKind = hdOutline, position = "identity",
density = true)
ggsave(&"{plotPath}/all_predictions_by_type_ridgeline{suffix}.pdf", width = 800, height = 500)
plotRidges(dfMlp, "preds", desc.plotPath, "_mlp")
# remove anything in the clamp region to not blow up our y scale with an 'overfilled' bin
plotRidges(dfLnL.filter(f{`lnLs` < 49.0}), "lnLs", desc.plotPath, "_lnL")
proc generateRocCurves(model: AnyModel, device: Device, desc: MLPDesc) =
let (dfCdl, dfFe, dfBack) = model.readAllData(device, desc,
calcLogL = true,
mlpOutput = true)
proc asDf(dfIn: DataFrame, typ, dset: string): DataFrame =
toDf({ "Prediction" : (if dset == "Neuron_0": dfIn[dset, float].map_inline(-x)
else: dfIn[dset, float]),
"Target" : dfIn["Target", string],
"Type" : typ })
template buildLong(dfCdl, dfFe, dfBack, dset: untyped): untyped =
# need the `likelihood` dataset and the data type
var df = newDataFrame()
df.add asDf(dfCdl, $dtSignal, dset)
df.add asDf(dfFe, $dtSignal, dset)
df.add asDf(dfBack, $dtBack, dset)
df
let dfLnL = buildLong(dfCdl, dfFe, dfBack, "likelihood")
let dfMlp = buildLong(dfCdl, dfFe, dfBack, "Neuron_0")
plotRocCurve(dfLnL, dfMlp, "_lnL_vs_mlp", desc.plotPath)
# target specific roc curves
targetSpecificRoc(dfLnL, dfMlp, desc.plotPath)
proc modelDirFile(model, outpath: string): (string, string) =
if model.len == 0 and outpath.len == 0:
raise newException(ValueError, "`model` and `modelOutpath` cannot be empty both.")
elif model.len > 0 and outpath.len == 0: ## user gave a model, use that
result = (model, model.parentDir)
elif model.len > 0 and outpath.len > 0:
result = (model, outpath)
else:
result = ("", outpath)
proc initDesc(calib, back, sim: seq[string], # data
model, modelOutpath, plotPath: string,
datasets: seq[string], # datasets used as input neurons
numHidden: seq[int], # number of neurons on each hidden layer
activation: ActivationFunction,
outputActivation: OutputActivation,
lossFunction: LossFunction,
optimizer: OptimizerKind,
learningRate: float,
subsetPerRun: int,
simulatedData: bool,
rngSeed: int,
backgroundRegion: ChipRegion,
nFake: int,
plotEvery: int,
backgroundChips: set[uint8]): MLPDesc =
if numHidden.len == 0:
raise newException(ValueError, "Please provide a number of neurons for the hidden layers.")
# 1. initialize the MLPDesc from the given parameters
let plotPath = if plotPath.len == 0: "/tmp/" else: plotPath
# 2. check if `modelOutpath` is a file or a directory
let (modelFile, modelDir) = modelDirFile(model, modelOutpath)
result = initMLPDesc(calib, back, sim, datasets,
modelFile, modelDir, plotPath,
numHidden,
activation, outputActivation, lossFunction, optimizer,
learningRate,
subsetPerRun,
simulatedData,
rngSeed,
backgroundRegion,
nFake,
plotEvery,
backgroundChips)
# 3. check if such a file already exists to possibly merge it or just return that
let outfile = result.modelDir / MLPDescName
# potentially create the output paths
discard existsOrCreateDir(result.plotPath)
discard existsOrCreateDir(result.modelDir)
if not fileExists(outfile):
## Existing file with same name. Possibly an older version of it?
## Try deserializing as a V1 MLPDesc
let (version, newestFile) = findNewestFile(result.modelDir)
if newestFile.len > 0:
## NOTE: for now we only have one older version to worry about. In the future we might
## need to generalize it based on the actual version.
let descV1 = deserializeH5[MLPDescV1](result.modelDir / newestFile)
if descV1.numHidden != 0: # means it really was V1. Therefore copy over
## As a V1, just copy over the training related fields
macro copyFields(fs: varargs[untyped]): untyped =
result = newStmtList()
for f in fs:
result.add quote do:
result.`f` = descV1.`f`
copyFields(epochs, accuracies, testAccuracies, losses, testLosses)
if result.datasets.len == 0:
result.datasets = descV1.datasets
result.numInputs = result.datasets.len
else:
doAssert descV1.datasets == result.datasets, "Datasets in existing file and input don't match!"
# no file exists at all
# write back the either modified new version MLPDesc object or initial file
result.toH5(outfile)
else:
## is actually V2 or higher, just return it! This branch is
# Note: most input parameters are ignored in this case!
let input = result
result = deserializeH5[MLPDesc](outfile)
result.inputModel = result.inputModel.strip(chars = Whitespace {'\0'})
# Update all fields user wishes to change (that are supported!)
var anySet = false
template setAny(field: untyped): untyped =
if input.field != result.field:
result.field = input.field
anySet = true
if input.inputModel.len > 0:
result.inputModel = input.inputModel
anySet = true
if input.learningRate != result.learningRate:
# user wishes to change learning rate, add last epoch to rates
doAssert result.epochs.len > 0, "This should not happen. We did not train before?"
result.pastLearningRates.add (lr: result.learningRate, toEpoch: result.epochs[^1])
result.learningRate = input.learningRate
anySet = true
setAny(path)
setAny(plotPath)
setAny(modelDir)
setAny(path)
setAny(version)
setAny(plotEvery)
if anySet: # If anything changed due to input, write new H5 file
result.toH5(result.modelDir / MLPDescName)
# update the global datasets!
CurrentDsets = result.datasets
## XXX: inside of a generic proc (what we would normally do) the `parameters` call
## breaks! Nim doesn't understand that the MLP / ConvNet type can be converted
## to a `Module`!
proc trainModel[T](_: typedesc[T],
device: Device,
mlpDesc: MLPDesc,
epochs = 100_000,
skipTraining = false
) =
## If we are training, construct a type appropriate to
var model = T.init(mlpDesc)
model.to(device)
if mlpDesc.inputModel.len > 0:
echo "Loading model file: `", mlpDesc.inputModel, "`"
model.load(mlpDesc.inputModel)
elif skipTraining:
raise newException(ValueError, "`skipTraining` only allowed if a trained input model is handed explicitly " &
"via the `--model` argument.")
when T is MLP:
const readRaw = false
else:
const readRaw = true
echo "Reading data"
var rnd = initRand(mlpDesc.rngSeed)
var df = if mlpDesc.simulatedData:
echo "Using simulated data."
prepareSimDataframe(mlpDesc, readRaw = readRaw, rnd = rnd)
else:
prepareMixedDataframe(mlpDesc, readRaw = readRaw)
# get training & test dataset
echo "Splitting data into train & test set"
let (trainTup, testTup) = generateTrainTest(df, rnd)
let (trainIn, trainTarg) = trainTup
let (testIn, testTarg) = testTup
## Get a mutable MLPDesc so that we can set the `nTrain/Test/...` fields
var mlpDesc = mlpDesc
proc assignNumbers(desc: var MLPDesc, df: DataFrame, nTrain, nTest: int) =
let dfS = df.filter(f{`Type` == $dtSignal})
desc.nFakeTotal = dfS.len
desc.nBack = df.len - dfS.len
desc.nTrain = nTrain
desc.nTest = nTest
mlpDesc.assignNumbers(df, trainIn.sizes[0], testIn.sizes[0])
## Continue training from the last epoch if any training already done
let continueAfterEpoch = if mlpDesc.epochs.len > 0: mlpDesc.epochs[^1] else: 0
let lr = mlpDesc.learningRate
if not skipTraining:
template callTrain(arg: typed): untyped =
model.train(arg,
trainIn.to(kFloat32).to(device),
trainTarg.to(kFloat32).to(device),
testIn.to(kFloat32).to(device),
testTarg.to(kFloat32).to(device),
device,
readRaw,
mlpDesc,
epochs,
continueAfterEpoch)
withOptim(model, mlpDesc): # injects `optimizer` variable of correct type into body
callTrain(optimizer)
model.save(mlpDesc.path)
else: # perform validation
# reproduce the accuracy and loss plots
plotType(mlpDesc.epochs, mlpDesc.accuracies, mlpDesc.testAccuracies, "Accuracy", outfile = &"{mlpDesc.plotPath}/accuracy.pdf")
plotType(mlpDesc.epochs, mlpDesc.losses, mlpDesc.testLosses, "Loss", outfile = &"{mlpDesc.plotPath}/loss.pdf")
# reproduce the training validation plots by using `test` procedure & overriding the plot names
let (acc, loss, trainPredict, trainTargets) = model.test(trainIn, trainTarg, device, mlpDesc,
plotOutfile = mlpDesc.plotPath / "train_validation.pdf")
echo "Train loss after training: ", loss, " with accuracy ", acc
let preds = trainPredict.mapIt(clamp(it, -ClampOutput, ClampOutput))
rocCurve(preds, trainTargets, plotPath = mlpDesc.plotPath)
let (acc, loss, testPredict, testTargets) = model.test(testIn, testTarg, device, mlpDesc,
plotOutfile = mlpDesc.plotPath / "test_validation.pdf")
echo "Test loss after training: ", loss, " with accuracy ", acc
template predictModel(Typ: typedesc,
device: Device,
desc: MLPDesc,
ε = 0.8, # signal efficiency for background rate prediction
totalTime = -1.0.Hour, # total background rate time in hours. Normally read from input file
rocCurve = false,
): untyped {.dirty.} =
var model = Typ.init(desc)
model.to(device)
when Typ is MLP:
const readRaw = false
else:
const readRaw = true
# load the model
echo "Loading model file: ", desc.inputModel
model.load(desc.inputModel)
model.predictBackground(device, desc, desc.backFiles[0], ε, totalTime, readRaw, desc.plotPath)
model.predictCalib(device, desc, desc.calibFiles[0], ε)
model.predictAll(device, desc)
model.determineCdlEfficiency(device, desc, ε, readRaw, plotPath = desc.plotPath)
model.determineCdlEfficiency(device, desc, ε, readRaw, global = false, plotPath = desc.plotPath)
model.generateRocCurves(device, desc)
proc main(calib, back, simFiles: seq[string] = @[],
datasets: seq[string] = @[],
ε = 0.8, # signal efficiency for background rate prediction
totalTime = -1.0.Hour, # total background rate time in hours. Normally read from input file
rocCurve = false,
predict = false,
model = "", ## Optional path to an existing model
modelKind = "MLP", # MLP or ConvNet #model = mkMLP ## parsing an enum here causes weird CT error in cligen :/
modelOutpath = "", ## Output path where to store the model. Can be empty if `model` given.
numHidden: seq[int] = @[], ## number of neurons on the hidden layers. One number per layer.
activation: ActivationFunction = afReLU,
outputActivation: OutputActivation = ofLinear,
lossFunction: LossFunction = lfSigmoidCrossEntropy,
optimizer: OptimizerKind = opSGD,
learningRate = Inf,
subsetPerRun = 1000, ## If multiple `backgroundChips` given, read `subsetPerRun` for *each chip!
backgroundChips = none(set[uint8]),
plotPath = "",
clampOutput = 50.0,
printDefaultDatasets = false,
simulatedData = false,