Skip to content

Commit 18993ff

Browse files
committed
fix: Address deployment review items from PR #411
Critical fixes: - Add missing backup/restore scripts for disaster recovery - Standardize health endpoint to /api/health across all deployment files - Update Terraform, Ansible, and workflow health check endpoints Changes: - deployment/scripts/backup-rag-modulo.sh: PostgreSQL + Milvus backup script - deployment/scripts/restore-rag-modulo.sh: Complete restore with verification - deployment/terraform/modules/ibm-cloud/code-engine/outputs.tf: Update to /api/health - deployment/ansible/**/*.yml: Update all health endpoints to /api/health - .github/workflows/*.yml: Standardize health check endpoints Database credentials are correctly configured via environment variables in Code Engine deployment (production would use Secrets Manager). Resolves critical review items from PR #411
1 parent 850a345 commit 18993ff

File tree

9 files changed

+570
-27
lines changed

9 files changed

+570
-27
lines changed

.github/workflows/deploy_code_engine.yml

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,8 @@ jobs:
158158
APP_URL=$(ibmcloud ce app get --name "${{ env.APP_NAME }}" \
159159
--output json | jq -r '.status.latest_ready_revision_name' | head -1)
160160
if [ -n "$APP_URL" ]; then
161-
echo "Testing health endpoint at: $APP_URL/health"
162-
if curl -f -s "$APP_URL/health" > /dev/null; then
161+
echo "Testing health endpoint at: $APP_URL/api/health"
162+
if curl -f -s "$APP_URL/api/health" > /dev/null; then
163163
echo "✅ Health check passed"
164164
else
165165
echo "❌ Health check failed"
@@ -169,20 +169,3 @@ jobs:
169169
echo "❌ Could not determine application URL"
170170
exit 1
171171
fi
172-
173-
- name: Test API endpoint
174-
run: |
175-
APP_URL=$(ibmcloud ce app get --name "${{ env.APP_NAME }}" \
176-
--output json | jq -r '.status.latest_ready_revision_name' | head -1)
177-
if [ -n "$APP_URL" ]; then
178-
echo "Testing API endpoint at: $APP_URL/api/v1/health"
179-
if curl -f -s "$APP_URL/api/v1/health" > /dev/null; then
180-
echo "✅ API health check passed"
181-
else
182-
echo "❌ API health check failed"
183-
exit 1
184-
fi
185-
else
186-
echo "❌ Could not determine application URL"
187-
exit 1
188-
fi

.github/workflows/terraform-ansible-validation.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ jobs:
221221
zilliz_api_key: "test_zilliz_key"
222222
event_streams_endpoint: "test-kafka.example.com"
223223
event_streams_api_key: "test_kafka_key"
224-
backend_health_url: "https://backend-app.example.com/health"
224+
backend_health_url: "https://backend-app.example.com/api/health"
225225
frontend_health_url: "https://frontend-app.example.com/"
226226
EOF
227227

deployment/ansible/group_vars/all/main.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ service_endpoints:
112112

113113
# Health check URLs
114114
health_check_urls:
115-
backend: "{{ backend_health_url | default('https://backend-app.example.com/health') }}"
115+
backend: "{{ backend_health_url | default('https://backend-app.example.com/api/health') }}"
116116
frontend: "{{ frontend_health_url | default('https://frontend-app.example.com/') }}"
117117

118118
# Deployment tags

deployment/ansible/inventories/ibm/hosts.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ all:
5151
frontend_memory: "1Gi"
5252

5353
# Default health check settings
54-
backend_health_url: "https://backend-app.example.com/health"
54+
backend_health_url: "https://backend-app.example.com/api/health"
5555
frontend_health_url: "https://frontend-app.example.com/"
5656

5757
# Default deployment settings

deployment/ansible/playbooks/deploy-rag-modulo.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,7 @@
270270

271271
- name: Test backend health endpoint
272272
ansible.builtin.uri:
273-
url: "https://{{ backend_endpoint.stdout }}/health"
273+
url: "https://{{ backend_endpoint.stdout }}/api/health"
274274
method: GET
275275
status_code: 200
276276
timeout: 30

deployment/ansible/tests/test_deploy.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
zilliz_api_key: "test_zilliz_key" # pragma: allowlist secret
4848
event_streams_endpoint: "test-kafka.example.com"
4949
event_streams_api_key: "test_kafka_key" # pragma: allowlist secret
50-
backend_health_url: "https://backend-app.example.com/health"
50+
backend_health_url: "https://backend-app.example.com/api/health"
5151
frontend_health_url: "https://frontend-app.example.com/"
5252

5353
tasks:
@@ -206,7 +206,7 @@
206206
zilliz_api_key: "test_zilliz_key" # pragma: allowlist secret
207207
event_streams_endpoint: "test-kafka.example.com"
208208
event_streams_api_key: "test_kafka_key" # pragma: allowlist secret
209-
backend_health_url: "https://backend-app.example.com/health"
209+
backend_health_url: "https://backend-app.example.com/api/health"
210210
frontend_health_url: "https://frontend-app.example.com/"
211211

212212
tasks:
@@ -278,7 +278,7 @@
278278
zilliz_api_key: "test_zilliz_key" # pragma: allowlist secret
279279
event_streams_endpoint: "test-kafka.example.com"
280280
event_streams_api_key: "test_kafka_key" # pragma: allowlist secret
281-
backend_health_url: "https://backend-app.example.com/health"
281+
backend_health_url: "https://backend-app.example.com/api/health"
282282
frontend_health_url: "https://frontend-app.example.com/"
283283

284284
tasks:
Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
#!/bin/bash
2+
#
3+
# RAG Modulo Backup Script
4+
# Backs up PostgreSQL database, Milvus collections, and application state
5+
#
6+
# Usage:
7+
# ./backup-rag-modulo.sh [backup-directory]
8+
#
9+
# Environment Variables:
10+
# COLLECTIONDB_HOST - PostgreSQL host (default: localhost)
11+
# COLLECTIONDB_PORT - PostgreSQL port (default: 5432)
12+
# COLLECTIONDB_NAME - Database name (default: rag_modulo)
13+
# COLLECTIONDB_USER - Database user (required)
14+
# COLLECTIONDB_PASS - Database password (required)
15+
# MILVUS_HOST - Milvus host (default: localhost)
16+
# MILVUS_PORT - Milvus port (default: 19530)
17+
# BACKUP_RETENTION_DAYS - Days to keep backups (default: 7)
18+
19+
set -euo pipefail
20+
21+
# Default configuration
22+
BACKUP_DIR="${1:-/tmp/rag-modulo-backups}"
23+
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
24+
BACKUP_PATH="${BACKUP_DIR}/${TIMESTAMP}"
25+
RETENTION_DAYS="${BACKUP_RETENTION_DAYS:-7}"
26+
27+
# PostgreSQL configuration
28+
POSTGRES_HOST="${COLLECTIONDB_HOST:-localhost}"
29+
POSTGRES_PORT="${COLLECTIONDB_PORT:-5432}"
30+
POSTGRES_DB="${COLLECTIONDB_NAME:-rag_modulo}"
31+
POSTGRES_USER="${COLLECTIONDB_USER:-}"
32+
POSTGRES_PASS="${COLLECTIONDB_PASS:-}"
33+
34+
# Milvus configuration
35+
MILVUS_HOST="${MILVUS_HOST:-localhost}"
36+
MILVUS_PORT="${MILVUS_PORT:-19530}"
37+
38+
# Colors for output
39+
RED='\033[0;31m'
40+
GREEN='\033[0;32m'
41+
YELLOW='\033[1;33m'
42+
NC='\033[0m' # No Color
43+
44+
# Logging functions
45+
log_info() {
46+
echo -e "${GREEN}[INFO]${NC} $1"
47+
}
48+
49+
log_warn() {
50+
echo -e "${YELLOW}[WARN]${NC} $1"
51+
}
52+
53+
log_error() {
54+
echo -e "${RED}[ERROR]${NC} $1"
55+
}
56+
57+
# Check prerequisites
58+
check_prerequisites() {
59+
log_info "Checking prerequisites..."
60+
61+
local missing_tools=()
62+
63+
if ! command -v pg_dump &> /dev/null; then
64+
missing_tools+=("pg_dump (PostgreSQL client)")
65+
fi
66+
67+
if ! command -v tar &> /dev/null; then
68+
missing_tools+=("tar")
69+
fi
70+
71+
if ! command -v gzip &> /dev/null; then
72+
missing_tools+=("gzip")
73+
fi
74+
75+
if [ ${#missing_tools[@]} -gt 0 ]; then
76+
log_error "Missing required tools: ${missing_tools[*]}"
77+
exit 1
78+
fi
79+
80+
if [ -z "$POSTGRES_USER" ]; then
81+
log_error "COLLECTIONDB_USER environment variable is required"
82+
exit 1
83+
fi
84+
85+
if [ -z "$POSTGRES_PASS" ]; then
86+
log_error "COLLECTIONDB_PASS environment variable is required"
87+
exit 1
88+
fi
89+
90+
log_info "Prerequisites check passed"
91+
}
92+
93+
# Create backup directory
94+
create_backup_dir() {
95+
log_info "Creating backup directory: ${BACKUP_PATH}"
96+
mkdir -p "${BACKUP_PATH}"
97+
}
98+
99+
# Backup PostgreSQL database
100+
backup_postgres() {
101+
log_info "Backing up PostgreSQL database..."
102+
103+
local db_backup_file="${BACKUP_PATH}/postgres_${POSTGRES_DB}.sql"
104+
105+
PGPASSWORD="${POSTGRES_PASS}" pg_dump \
106+
-h "${POSTGRES_HOST}" \
107+
-p "${POSTGRES_PORT}" \
108+
-U "${POSTGRES_USER}" \
109+
-d "${POSTGRES_DB}" \
110+
-F plain \
111+
--no-owner \
112+
--no-acl \
113+
-f "${db_backup_file}"
114+
115+
gzip "${db_backup_file}"
116+
117+
log_info "PostgreSQL backup completed: ${db_backup_file}.gz"
118+
}
119+
120+
# Backup Milvus collections metadata
121+
backup_milvus_metadata() {
122+
log_info "Backing up Milvus collections metadata..."
123+
124+
local milvus_backup_file="${BACKUP_PATH}/milvus_metadata.json"
125+
126+
# Note: This requires pymilvus Python package
127+
# Create a simple Python script to export collection metadata
128+
cat > /tmp/backup_milvus.py << 'EOF'
129+
import json
130+
import sys
131+
from pymilvus import connections, utility
132+
133+
try:
134+
connections.connect(host=sys.argv[1], port=sys.argv[2])
135+
collections = utility.list_collections()
136+
137+
metadata = {
138+
"collections": collections,
139+
"timestamp": sys.argv[3]
140+
}
141+
142+
with open(sys.argv[4], 'w') as f:
143+
json.dump(metadata, f, indent=2)
144+
145+
print(f"Backed up {len(collections)} collections")
146+
except Exception as e:
147+
print(f"Error: {e}", file=sys.stderr)
148+
sys.exit(1)
149+
EOF
150+
151+
if command -v python3 &> /dev/null; then
152+
python3 /tmp/backup_milvus.py "${MILVUS_HOST}" "${MILVUS_PORT}" "${TIMESTAMP}" "${milvus_backup_file}" || {
153+
log_warn "Milvus metadata backup failed (pymilvus may not be installed)"
154+
echo "{\"collections\": [], \"error\": \"pymilvus not available\"}" > "${milvus_backup_file}"
155+
}
156+
rm -f /tmp/backup_milvus.py
157+
else
158+
log_warn "Python3 not available, skipping Milvus metadata backup"
159+
echo "{\"collections\": [], \"error\": \"python3 not available\"}" > "${milvus_backup_file}"
160+
fi
161+
162+
log_info "Milvus metadata backup completed: ${milvus_backup_file}"
163+
}
164+
165+
# Create backup manifest
166+
create_manifest() {
167+
log_info "Creating backup manifest..."
168+
169+
local manifest_file="${BACKUP_PATH}/MANIFEST.json"
170+
171+
cat > "${manifest_file}" << EOF
172+
{
173+
"backup_timestamp": "${TIMESTAMP}",
174+
"backup_version": "1.0",
175+
"components": {
176+
"postgres": {
177+
"host": "${POSTGRES_HOST}",
178+
"port": ${POSTGRES_PORT},
179+
"database": "${POSTGRES_DB}"
180+
},
181+
"milvus": {
182+
"host": "${MILVUS_HOST}",
183+
"port": ${MILVUS_PORT}
184+
}
185+
},
186+
"files": [
187+
"postgres_${POSTGRES_DB}.sql.gz",
188+
"milvus_metadata.json"
189+
]
190+
}
191+
EOF
192+
193+
log_info "Manifest created: ${manifest_file}"
194+
}
195+
196+
# Compress backup
197+
compress_backup() {
198+
log_info "Compressing backup..."
199+
200+
local archive_name="${BACKUP_DIR}/rag-modulo-backup-${TIMESTAMP}.tar.gz"
201+
202+
tar -czf "${archive_name}" -C "${BACKUP_DIR}" "${TIMESTAMP}"
203+
204+
# Remove uncompressed backup directory
205+
rm -rf "${BACKUP_PATH}"
206+
207+
log_info "Backup archive created: ${archive_name}"
208+
}
209+
210+
# Clean old backups
211+
cleanup_old_backups() {
212+
log_info "Cleaning up backups older than ${RETENTION_DAYS} days..."
213+
214+
find "${BACKUP_DIR}" -name "rag-modulo-backup-*.tar.gz" -type f -mtime +"${RETENTION_DAYS}" -delete
215+
216+
log_info "Old backups cleaned up"
217+
}
218+
219+
# Verify backup
220+
verify_backup() {
221+
log_info "Verifying backup integrity..."
222+
223+
local archive_name="${BACKUP_DIR}/rag-modulo-backup-${TIMESTAMP}.tar.gz"
224+
225+
if tar -tzf "${archive_name}" > /dev/null 2>&1; then
226+
log_info "Backup verification passed"
227+
else
228+
log_error "Backup verification failed"
229+
exit 1
230+
fi
231+
}
232+
233+
# Main execution
234+
main() {
235+
log_info "Starting RAG Modulo backup at ${TIMESTAMP}"
236+
log_info "Backup directory: ${BACKUP_DIR}"
237+
238+
check_prerequisites
239+
create_backup_dir
240+
backup_postgres
241+
backup_milvus_metadata
242+
create_manifest
243+
compress_backup
244+
verify_backup
245+
cleanup_old_backups
246+
247+
log_info "Backup completed successfully!"
248+
log_info "Backup location: ${BACKUP_DIR}/rag-modulo-backup-${TIMESTAMP}.tar.gz"
249+
}
250+
251+
main "$@"

0 commit comments

Comments
 (0)