From ac4435663336e0c5f8b44cb7b55cf398d51b44df Mon Sep 17 00:00:00 2001 From: Hadley Rich Date: Wed, 30 Jul 2025 15:53:51 +1200 Subject: [PATCH] Add healthchecks --- README.md | 28 +++++++++++++++++++++++++++- backup.sh | 30 ++++++++++++++++++++++-------- k8s-cronjob.yaml | 11 ++++++----- k8s-secret.yaml | 3 ++- 4 files changed, 57 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 2d7e22d..27e1238 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ docker push your-registry/postgres-backup:latest - `S3_PREFIX`: S3 key prefix for backups (default: postgres-backups) - `S3_REGION`: S3 region (default: us-east-1) - `BACKUP_RETENTION_DAYS`: Number of days to keep backups (default: 7) -- `WEBHOOK_URL`: Optional webhook URL for notifications +- `HEALTHCHECKS_URL`: Healthchecks.io ping URL for monitoring (optional) ## Running Locally @@ -53,6 +53,7 @@ docker run --rm \ -e S3_ENDPOINT=https://nyc3.digitaloceanspaces.com \ -e S3_ACCESS_KEY_ID=your-access-key \ -e S3_SECRET_ACCESS_KEY=your-secret-key \ + -e HEALTHCHECKS_URL=https://hc-ping.com/your-uuid \ your-registry/postgres-backup:latest ``` @@ -85,6 +86,31 @@ kubectl get jobs kubectl logs -l job-name=postgres-backup- ``` +## Monitoring with Healthchecks.io + +The container has built-in support for [Healthchecks.io](https://healthchecks.io) monitoring: + +### Setup: +1. Create a check on healthchecks.io +2. Copy the ping URL (e.g., `https://hc-ping.com/your-uuid-here`) +3. 
Add it to your Kubernetes secret as `healthchecks-url` + +### Ping Behavior: +- **Start**: Pings `/start` when backup begins +- **Success**: Pings the main URL when all backups complete successfully +- **Failure**: Pings `/fail` with error details when any backup fails + +### Example healthchecks.io URL: +``` +https://hc-ping.com/12345678-1234-1234-1234-123456789012 +``` + +This will automatically track: +- Job start times +- Success/failure status +- Failure reasons in the check log +- Missing backup alerts if the job doesn't run + ## Backup Structure Backups are stored in S3 with a simple flat structure: diff --git a/backup.sh b/backup.sh index 4f2b7ce..fe430d9 100644 --- a/backup.sh +++ b/backup.sh @@ -115,16 +115,26 @@ cleanup_old_backups() { done } -# Function to send notification (placeholder for webhook/email integration) +# Function to send notification to healthchecks.io send_notification() { local status="$1" local message="$2" - if [[ -n "${WEBHOOK_URL:-}" ]]; then - curl -X POST "${WEBHOOK_URL}" \ - -H "Content-Type: application/json" \ - -d "{\"status\": \"${status}\", \"message\": \"${message}\", \"timestamp\": \"$(date -Iseconds)\"}" \ - || log "Failed to send notification" + if [[ -n "${HEALTHCHECKS_URL:-}" ]]; then + case "$status" in + "start") + # Ping start endpoint + curl -fsS -m 10 --retry 5 "${HEALTHCHECKS_URL}/start" > /dev/null || log "Failed to send start notification" + ;; + "success") + # Ping success endpoint (default) + curl -fsS -m 10 --retry 5 "${HEALTHCHECKS_URL}" > /dev/null || log "Failed to send success notification" + ;; + "error"|"fail") + # Ping fail endpoint with log data + curl -fsS -m 10 --retry 5 --data-raw "$message" "${HEALTHCHECKS_URL}/fail" > /dev/null || log "Failed to send failure notification" + ;; + esac fi } @@ -132,9 +142,13 @@ send_notification() { main() { log "Starting PostgreSQL backup process" + # Send start notification + send_notification "start" "PostgreSQL backup process started" + # Validate required 
environment variables if [[ -z "${S3_BUCKET}" || -z "${S3_ACCESS_KEY_ID}" || -z "${S3_SECRET_ACCESS_KEY}" || -z "${S3_ENDPOINT}" ]]; then log "ERROR: Missing required environment variables (S3_BUCKET, S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_ENDPOINT)" + send_notification "fail" "Missing required environment variables" exit 1 fi @@ -145,7 +159,7 @@ main() { log "Testing database connection" if ! PGPASSWORD="${POSTGRES_PASSWORD}" psql -h "${POSTGRES_HOST}" -p "${POSTGRES_PORT}" -U "${POSTGRES_USER}" -d postgres -c "SELECT 1" > /dev/null 2>&1; then log "ERROR: Cannot connect to PostgreSQL database" - send_notification "error" "Cannot connect to PostgreSQL database" + send_notification "fail" "Cannot connect to PostgreSQL database" exit 1 fi @@ -169,7 +183,7 @@ main() { send_notification "success" "All PostgreSQL database backups completed successfully" else log "Some database backups failed" - send_notification "error" "Some PostgreSQL database backups failed" + send_notification "fail" "Some PostgreSQL database backups failed" exit 1 fi } diff --git a/k8s-cronjob.yaml b/k8s-cronjob.yaml index 4ed3fc4..e828625 100644 --- a/k8s-cronjob.yaml +++ b/k8s-cronjob.yaml @@ -57,11 +57,12 @@ spec: # Backup settings - name: BACKUP_RETENTION_DAYS value: "7" - - name: COMPRESSION - value: "gzip" - # Optional webhook for notifications - # - name: WEBHOOK_URL - # value: "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK" + # Healthchecks.io monitoring + - name: HEALTHCHECKS_URL + valueFrom: + secretKeyRef: + name: postgres-backup-secret + key: healthchecks-url resources: requests: memory: "256Mi" diff --git a/k8s-secret.yaml b/k8s-secret.yaml index 0946406..26006ac 100644 --- a/k8s-secret.yaml +++ b/k8s-secret.yaml @@ -9,4 +9,5 @@ stringData: postgres-user: postgres postgres-password: your-password s3-access-key-id: your-s3-access-key - s3-secret-access-key: your-s3-secret-key \ No newline at end of file + s3-secret-access-key: your-s3-secret-key + healthchecks-url: 
https://hc-ping.com/your-check-uuid \ No newline at end of file