-
Notifications
You must be signed in to change notification settings - Fork 4
/
load_tabular_data.sh
executable file
·63 lines (49 loc) · 1.69 KB
/
load_tabular_data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/bin/bash
set -e
# requires arguments
# -d | --dataset
# -v | --version
# -s | --source
# -D | --delimiter
# optional arguments
# --lat
# --lng
# -g | --geometry_name (get_arguments.sh specifies default)
ME=$(basename "$0")
. get_arguments.sh "$@"
# Unescape TAB character
if [ "$DELIMITER" == "\t" ]; then
DELIMITER=$(echo -e "\t")
fi
# I think Postgres temporary tables are such that concurrent jobs won't
# interfere with each other, but make the temp table name unique just
# in case.
UUID=$(python -c 'import uuid; print(uuid.uuid4(), end="")' | sed s/-//g)
TEMP_TABLE="temp_${UUID}"
# IF GEOMETRY_NAME, LAT and LNG are defined, set ADD_POINT_GEOMETRY_FIELDS_SQL
# by sourcing _add_point_geometry_fields_sql.sh
# It defines a SQL snippet we'll run later
if [[ -n "${GEOMETRY_NAME:-}" ]] && [[ -n "${LAT:-}" ]] && [[ -n "${LNG:-}" ]]
then
. _add_point_geometry_fields_sql.sh
. _fill_point_geometry_fields_sql.sh
else
ADD_POINT_GEOMETRY_FIELDS_SQL=""
FILL_POINT_GEOMETRY_FIELDS_SQL=""
fi
for uri in "${SRC[@]}"; do
# https://stackoverflow.com/questions/48019381/how-postgresql-copy-to-stdin-with-csv-do-on-conflic-do-update
aws s3 cp "${uri}" - | psql -c "BEGIN;
CREATE TEMP TABLE \"$TEMP_TABLE\"
(LIKE \"$DATASET\".\"$VERSION\" INCLUDING DEFAULTS)
ON COMMIT DROP;
ALTER TABLE \"$TEMP_TABLE\" DROP COLUMN IF EXISTS ${GEOMETRY_NAME};
ALTER TABLE \"$TEMP_TABLE\" DROP COLUMN IF EXISTS ${GEOMETRY_NAME}_wm;
COPY \"$TEMP_TABLE\" FROM STDIN WITH (FORMAT CSV, DELIMITER '$DELIMITER', HEADER);
$ADD_POINT_GEOMETRY_FIELDS_SQL
$FILL_POINT_GEOMETRY_FIELDS_SQL
INSERT INTO \"$DATASET\".\"$VERSION\"
SELECT * FROM \"$TEMP_TABLE\"
ON CONFLICT DO NOTHING;
COMMIT;"
done