forked from cisocrgroup/ocrd_cis
-
Notifications
You must be signed in to change notification settings - Fork 1
/
run_training_test.bash
63 lines (57 loc) · 1.54 KB
/
run_training_test.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/bin/bash
# Test setup: initialise the workspace from the zipped fixture and verify that
# it exposes exactly three ground-truth files in OCR-D-GT-SEG-LINE.
# ocrd_cis_init_ws and $tmpws are presumably provided by test_lib.bash
# (not visible from here) -- confirm.
set -e
source "$(dirname "$0")/test_lib.bash"
ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip

# test if there are 3 gt files
pushd "$tmpws"
found_files=0
# Read one path per line so paths are neither word-split nor glob-expanded.
while IFS= read -r file; do
  # Skip blank lines so only real entries are checked and counted.
  [[ -n "$file" ]] || continue
  if [[ ! -f "$file" ]]; then
    echo "cannot find ground truth file: $file"
    exit 1
  fi
  # Assignment form (rather than a bare (( )) command) so `set -e` is never
  # tripped by an arithmetic result of 0.
  found_files=$((found_files + 1))
done < <(ocrd workspace find -G OCR-D-GT-SEG-LINE)
if (( found_files != 3 )); then
  echo "invalid number of files: $found_files"
  exit 1
fi
popd
# Run the alignment step (ocrd_cis_align is defined outside this section).
ocrd_cis_align

# fix ocr for some entries (otherwise the training will fail)
# NOTE(review): the '.' in the 'e.' pattern is an unescaped regex
# metacharacter, so it matches any single character after 'e' -- confirm
# whether a literal dot was intended.
pushd "$tmpws"
while IFS= read -r f; do
  # Skip blank lines so sed is only invoked on real paths.
  [[ -n "$f" ]] || continue
  # Both substitutions are applied in a single in-place pass per file.
  sed -i \
    -e 's#<pc:Unicode>e.</pc:Unicode>#<pc:Unicode>Säugethiere.</pc:Unicode>#' \
    -e 's#<pc:Unicode>E</pc:Unicode>#<pc:Unicode>Säugethieren</pc:Unicode>#' \
    "$f"
done < <(ocrd workspace find -G OCR-D-CIS-ALIGN)
popd
# Install a stub profiler script under $tmpdir/bin. The stub discards its
# stdin and prints a fixed JSON document with a single candidate for
# "Säugethiere".
# Abort early if tmpdir is unset or empty, so nothing is written under /bin.
: "${tmpdir:?tmpdir must be set}"
mkdir -p "$tmpdir/bin"
# Quoted delimiter: the stub is written literally, with no shell expansion.
cat > "$tmpdir/bin/profiler.bash" <<'EOF'
#!/bin/bash
cat > /dev/null
echo '{"Säugethiere":{
"Candidates": [{
"Suggestion": "Säugethiere",
"Modern": "Säugetiere",
"Dict": "dict_modern_hypothetic_errors",
"HistPatterns": [{"Left":"t","Right":"th","Pos":5}],
"OCRPatterns": [],
"Distance": 0,
"Weight": 1.0
}]}}'
EOF
# Make the stub executable for everyone so it can be invoked by path.
chmod a+x "$tmpdir/bin/profiler.bash"
# Run the `train` command of the jar reported by `ocrd-cis-data -jar` on the
# OCR-D-CIS-ALIGN file group. The parameter file is produced on the fly from
# the config reported by `ocrd-cis-data -config`, with its /path/to/...
# placeholders replaced by the stub profiler, the trigram file and the
# training output directory.
# NOTE(review): the sed expressions use '#' as delimiter, so this assumes the
# substituted paths contain neither '#' nor '&' -- confirm.
java -jar "$(ocrd-cis-data -jar)" -c train \
  --log-level DEBUG \
  -I OCR-D-CIS-ALIGN \
  -m "$tmpws/mets.xml" \
  --parameter <(
    sed \
      -e "s#/path/to/profiler#$tmpdir/bin/profiler.bash#" \
      -e "s#/path/to/trigrams.csv#$(ocrd-cis-data -3gs)#" \
      -e "s#/path/to/train.dir#$tmpdir/train#" \
      "$(ocrd-cis-data -config)"
  )

# The test passes only if a model archive exists in the training directory.
if [[ ! -f "$tmpdir/train/model.zip" ]]; then
  echo "$tmpdir/train/model.zip not found"
  exit 1
fi