preProcess.sh
2.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/bin/bash
# Supplements the morphosyntax and senses files with additional segmentation
# information necessary for parsing.

#######################################
# Regenerate one annotation file of a corpus directory via preProcess.py.
# The original DIR/NAME.xml is kept as DIR/NAME_raw.xml; the presence of that
# backup marks the file as already processed, so re-runs are no-ops.
# The new content is written to a sibling ".partial" file and only moved into
# place when preProcess.py succeeds; on failure the original is restored so a
# later run can retry instead of being blocked by a stale backup.
# Arguments: $1 - corpus directory
#            $2 - annotation base name (e.g. ann_morphosyntax)
# Outputs:   diagnostics on stderr
# Returns:   0 when the file is (or already was) processed, 1 otherwise
#######################################
supplement_annotation() {
  local dir=$1 name=$2
  local target="$dir/$name.xml"
  local raw="$dir/${name}_raw.xml"
  local partial="$dir/$name.xml.partial"

  if [[ -f "$raw" ]]; then
    return 0
  fi
  if [[ ! -f "$target" ]]; then
    # Leading newline: the progress line above is not newline-terminated.
    printf '\n%s: missing, skipped\n' "$target" >&2
    return 1
  fi

  mv -- "$target" "$raw" || return 1
  if python3 preProcess.py "$raw" "$dir/ann_segmentation.xml" > "$partial" \
    && mv -- "$partial" "$target"; then
    return 0
  fi

  # Roll back so the directory looks untouched and the next run retries.
  rm -f -- "$partial"
  mv -- "$raw" "$target"
  printf '\npreProcess.py failed for %s; original restored\n' "$target" >&2
  return 1
}

# Collect the corpus directories NUL-delimited so that unusual characters in
# path names cannot split or glob-expand an entry.
corpus_dirs=()
while IFS= read -r -d '' d; do
  corpus_dirs+=("$d")
done < <(find ../fullCorpus -maxdepth 1 -mindepth 1 -type d -print0)

ALL=${#corpus_dirs[@]}
COUNTER=1
for d in "${corpus_dirs[@]}"; do
  # \r\e[K rewinds and clears the line so the progress counter updates in place.
  printf '\r\e[K%s/%s %s' "$COUNTER" "$ALL" "$d"
  supplement_annotation "$d" ann_morphosyntax
  supplement_annotation "$d" ann_senses
  COUNTER=$((COUNTER + 1))
done
# Fixes the error where some headers contain unescaped '>'.

#######################################
# Regenerate one header.xml via fixHeader.py.
# The original is kept next to it as header_raw.xml; the presence of that
# backup marks the header as already fixed. The new content goes to a
# ".partial" file first and is moved into place only when fixHeader.py
# succeeds; on failure the original header is restored so a later run retries.
# Arguments: $1 - path to a header.xml file
# Outputs:   diagnostics on stderr
# Returns:   0 when the header is (or already was) fixed, 1 otherwise
#######################################
fix_header() {
  local header=$1
  local dir raw partial
  dir=$(dirname "$header")
  raw="$dir/header_raw.xml"
  partial="$header.partial"

  if [[ -f "$raw" ]]; then
    return 0
  fi

  mv -- "$header" "$raw" || return 1
  if python3 fixHeader.py "$raw" > "$partial" \
    && mv -- "$partial" "$header"; then
    return 0
  fi

  # Roll back so the next run can retry this header.
  rm -f -- "$partial"
  mv -- "$raw" "$header"
  # Leading newline: the progress line above is not newline-terminated.
  printf '\nfixHeader.py failed for %s; original restored\n' "$header" >&2
  return 1
}

# Collect the header paths NUL-delimited so that unusual characters in path
# names cannot split or glob-expand an entry.
header_files=()
while IFS= read -r -d '' f; do
  header_files+=("$f")
done < <(find .. -name 'header.xml' -print0)

ALL=${#header_files[@]}
COUNTER=1
for f in "${header_files[@]}"; do
  # \r\e[K rewinds and clears the line so the progress counter updates in place.
  printf '\r\e[K%s/%s %s' "$COUNTER" "$ALL" "$f"
  fix_header "$f"
  COUNTER=$((COUNTER + 1))
done
# Fixes the error where some files have missing divisions.

#######################################
# Run fixMissingDiv.py over one file and replace the file with its output.
# With a second argument the original is first moved to that backup path and
# the script reads the backup; without it the file is rewritten in place and
# no backup is kept. Output is staged in a sibling ".partial" file and moved
# into place only on success, so a failing run never leaves a truncated file.
# Arguments: $1 - file to fix
#            $2 - optional backup path for the original content
# Outputs:   diagnostics on stderr
# Returns:   0 on success, 1 otherwise (the file then has its original content)
#######################################
apply_missing_div_fix() {
  local file=$1 raw=${2-}
  local partial="$file.partial"
  local input=$file

  if [[ ! -f "$file" ]]; then
    printf '\n%s: missing, skipped\n' "$file" >&2
    return 1
  fi
  if [[ -n "$raw" ]]; then
    mv -- "$file" "$raw" || return 1
    input=$raw
  fi

  if python3 fixMissingDiv.py "$input" > "$partial" \
    && mv -- "$partial" "$file"; then
    return 0
  fi

  rm -f -- "$partial"
  if [[ -n "$raw" ]]; then
    mv -- "$raw" "$file"
  fi
  printf '\nfixMissingDiv.py failed for %s; original kept\n' "$file" >&2
  return 1
}

div_dir=../fullCorpus/310-2-000000042
if [[ ! -d "$div_dir" ]]; then
  printf '\n%s: directory not found, missing-division fix skipped\n' "$div_dir" >&2
elif [[ ! -e "$div_dir/text_raw.xml" && ! -e "$div_dir/ann_segmentation_raw.xml" ]]; then
  # The *_raw.xml backups double as the "already fixed" marker: running this
  # section again would otherwise overwrite the pristine backups with the
  # fixed files and feed already-fixed input to fixMissingDiv.py.
  apply_missing_div_fix "$div_dir/text.xml" "$div_dir/text_raw.xml"
  apply_missing_div_fix "$div_dir/ann_segmentation.xml" "$div_dir/ann_segmentation_raw.xml"
  # These two are rewritten in place: their *_raw.xml names already hold the
  # backups made by the segmentation-supplement step earlier in this script.
  apply_missing_div_fix "$div_dir/ann_morphosyntax.xml"
  apply_missing_div_fix "$div_dir/ann_senses.xml"
fi
# Fixes the error where some files use non-monotonic numbering.

#######################################
# Renumber "-ab" ids in the four XML files of one corpus directory.
# text.xml and ann_segmentation.xml are first copied to *_raw.xml. Those
# backups also mark the directory as done: the substitutions shift one id onto
# the next and then move another id into the freed slot, so applying them a
# second time would merge two distinct ids. A directory that already has a
# backup is therefore left alone.
# Arguments: $1 - corpus directory
#            $2 - sed expression applied first (frees the target id)
#            $3 - sed expression applied second (moves the stray id into it)
# Outputs:   diagnostics on stderr
# Returns:   0 when the directory is (or already was) fixed, 1 otherwise
#######################################
renumber_ab_ids() {
  local dir=$1 first=$2 second=$3
  local names=(text.xml ann_segmentation.xml ann_morphosyntax.xml ann_senses.xml)
  local name

  if [[ -e "$dir/text_raw.xml" || -e "$dir/ann_segmentation_raw.xml" ]]; then
    return 0
  fi

  # Verify everything up front so a missing file cannot leave the directory
  # half renumbered with the "done" marker already in place.
  for name in "${names[@]}"; do
    if [[ ! -f "$dir/$name" ]]; then
      printf '\n%s: missing, numbering fix skipped\n' "$dir/$name" >&2
      return 1
    fi
  done

  cp -- "$dir/text.xml" "$dir/text_raw.xml" || return 1
  if ! cp -- "$dir/ann_segmentation.xml" "$dir/ann_segmentation_raw.xml"; then
    rm -f -- "$dir/text_raw.xml"
    return 1
  fi

  for name in "${names[@]}"; do
    # Both expressions run in order on every line, which matches running two
    # separate sed passes because each substitution is line-local.
    if ! sed -i -e "$first" -e "$second" "$dir/$name"; then
      printf '\nsed failed on %s\n' "$dir/$name" >&2
      return 1
    fi
  done
}

# NOTE(review): the patterns are not anchored on the left, so e.g. '9\.1-ab'
# would also match inside '19.1-ab' -- confirm the affected files contain no
# such ids.
renumber_ab_ids ../fullCorpus/200-4-000000308 's/14\.3-ab/14\.4-ab/g' 's/15\.3-ab/14\.3-ab/g'
renumber_ab_ids ../fullCorpus/200-4-000000313 's/10\.2-ab/10\.3-ab/g' 's/12\.2-ab/10\.2-ab/g'
renumber_ab_ids ../fullCorpus/200-4-000000303 's/9\.1-ab/9\.2-ab/g' 's/11\.1-ab/9\.1-ab/g'
# Terminate the in-place progress line printed by the earlier sections.
echo