-
Notifications
You must be signed in to change notification settings - Fork 1
/
clean-artifacts.sh
executable file
·92 lines (79 loc) · 3.37 KB
/
clean-artifacts.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/bin/bash
# Print the command-line usage summary to stdout, then terminate the
# script with exit status 1.
helpFunction()
{
  # Leading blank line separates the usage text from any preceding output.
  printf '\n'
  printf 'Usage: %s -a parameterA -b parameterB\n' "$0"
  # One tab-indented line per supported option.
  printf '\t%s\n' \
    "-a Absolute route to input data directory." \
    "-b Absolute route to output data directory"
  exit 1
}
# Parse command-line options:
#   -a <dir>  input data directory   -> IN_DIR
#   -b <dir>  output data directory  -> OUT_DIR
# Only -a and -b are declared in the optstring; anything else (unknown
# option or missing option argument) makes getopts report an error and
# falls through to the default arm, which prints the usage and exits.
while getopts "a:b:" opt; do
  case "$opt" in
    a) IN_DIR="$OPTARG" ;;
    b) OUT_DIR="$OPTARG" ;;
    *) helpFunction ;;
  esac
done

# Both directories are mandatory: print the usage and exit if either is empty.
if [[ -z "$IN_DIR" || -z "$OUT_DIR" ]]; then
  echo "Some or all of the parameters are empty";
  helpFunction
fi
# Show the directories the script is going to work with.
printf '%s\n' "$IN_DIR"
printf '%s\n' "$OUT_DIR"

# Build OUT_DIR if it does not exist
echo "mkdir -p $OUT_DIR"
mkdir -p "$OUT_DIR"

# Copy every regular file found anywhere under IN_DIR directly into OUT_DIR
# (the directory hierarchy is flattened). Expansions are quoted so that
# paths containing spaces or glob characters are passed through intact.
printf "\n\nCreate a copy of directory\n"
echo "find ${IN_DIR} -type f -exec cp {} ${OUT_DIR}/. \;"
find "${IN_DIR}" -type f -exec cp {} "${OUT_DIR}/." \;
echo "------------------------------------------------"
## dos2unix
# Step 1: run dos2unix on every entry directly inside OUT_DIR.
printf "\n\nStep 1/5: Force unix newline characters\n"
# ALLFILES holds the unexpanded pattern; it is only used for the log line.
ALLFILES="${OUT_DIR}/*"
echo "dos2unix ${ALLFILES}"
# Quote the directory and leave only the trailing * to the shell, so the
# glob still expands but a directory name with spaces is not word-split.
dos2unix "${OUT_DIR}"/*
echo "------------------------------------------------"
## Remove artifacts
# Step 2: run the FixEncodingErrors.pl Perl script over OUT_DIR.
printf "\n\nStep 2/5: Remove common artifacts\n"
# The Perl script is expected under ./utils; clone the repository that
# provides it when that directory is not present yet.
if [[ ! -d "utils/FixEncodingErrors" ]]; then
  echo "git clone https://github.com/PlanTL-SANIDAD/utils.git"
  git clone https://github.com/PlanTL-SANIDAD/utils.git
fi
echo "chmod 775 utils/FixEncodingErrors/FixEncodingErrors.pl
perl utils/FixEncodingErrors/FixEncodingErrors.pl --dir ${OUT_DIR}"
chmod 775 utils/FixEncodingErrors/FixEncodingErrors.pl
# OUT_DIR is quoted so a path with spaces reaches perl as one argument.
perl utils/FixEncodingErrors/FixEncodingErrors.pl --dir "$OUT_DIR"
echo "------------------------------------------------"
## Remove HTML errors
# Step 3: apply a fixed list of in-place sed substitutions to every regular
# file under OUT_DIR whose name matches '*txt'.
printf "\n\nStep 3/5: Remove common HTML errors\n"
echo "find ${OUT_DIR} -type f -name '*txt' -exec sed -i 's/μ/µ/g' {} \;
find ${OUT_DIR} -type f -name '*txt' -exec sed -i 's/’/'\''/g' {} \;
find ${OUT_DIR} -type f -name '*txt' -exec sed -i 's/≥/≥/g' {} \;
find ${OUT_DIR} -type f -name '*txt' -exec sed -i 's/≤/≤/g' {} \;
find ${OUT_DIR} -type f -name '*txt' -exec sed -i 's/β/β/g' {} \;
find ${OUT_DIR} -type f -name '*txt' -exec sed -i 's/α/α/g' {} \;
find ${OUT_DIR} -type f -name '*txt' -exec sed -i 's/—/-/g' {} \;"

# Substitutions, applied in this order, one find pass per expression.
# NOTE(review): as written here, the ≥, ≤, β and α expressions replace a
# character with itself. Presumably the patterns were meant to be HTML
# entities; confirm against the intended script before relying on them.
html_fixes=(
  's/μ/µ/g'
  's/’/'\''/g'
  's/≥/≥/g'
  's/≤/≤/g'
  's/β/β/g'
  's/α/α/g'
  's/—/-/g'
)
for html_fix in "${html_fixes[@]}"; do
  # OUT_DIR is quoted so a path with spaces stays a single find argument.
  find "${OUT_DIR}" -type f -name '*txt' -exec sed -i "$html_fix" {} \;
done
echo "------------------------------------------------"
## Extra: quick-prepro.py
# Step 4: run quick-prepro.py (looked up in the current working directory)
# on OUT_DIR.
printf "\n\nStep 4/5: Quick substitution of common errors and Unicode normalization\n"
echo "python quick-prepro.py -d ${OUT_DIR}"
# OUT_DIR is quoted so a path with spaces is passed as one argument.
python quick-prepro.py -d "$OUT_DIR"
echo "------------------------------------------------"
## Find lines to manually check
# Step 5: run check-newlines.py (looked up in the current working directory)
# on OUT_DIR; per the message below, its findings are to be reviewed by hand.
printf "\n\nStep 5/5: Check if there are lines starting with lowercase. I need to manually go to those files and correct them if newlines are wrongly added\n"
echo "python check-newlines.py -d ${OUT_DIR}"
# OUT_DIR is quoted so a path with spaces is passed as one argument.
python check-newlines.py -d "$OUT_DIR"
echo "------------------------------------------------"
printf "\n\nFinished!\n"