stemedb/docs/operations/deployment/prometheus/backup-alerts.yml

---
# StemeDB Backup & DR Alert Rules
#
# These rules monitor backup health, verification status, and WAL archival.
# Integrate with Alertmanager for PagerDuty/Slack notifications.
#
# Installation:
#   1. Copy to /etc/prometheus/rules/stemedb-backup-alerts.yml
#   2. Add to prometheus.yml:
#      rule_files:
#        - /etc/prometheus/rules/stemedb-backup-alerts.yml
#   3. Reload Prometheus: systemctl reload prometheus
#

# Top-level list of Prometheus rule groups defined by this file.
groups:
  # Backup health: backup age, verification status, WAL archival, size drift.
  - name: stemedb_backup
    # Every rule in this group is evaluated once per 60 seconds.
    interval: 60s
    rules:
      # CRITICAL: Backup completely failed
      #
      # Fires when the newest successful backup is more than 21600 s (6 h) old
      # and that has remained true for 30 minutes. $value is the backup age in
      # seconds; the annotations render it with humanizeDuration.
      #
      # NOTE(review): if stemedb_backup_last_success_timestamp is not scraped at
      # all, this expression returns no series and the alert cannot fire.
      # Consider a companion absent() rule -- confirm with the exporter owners.
      - alert: StemeDBBackupFailed
        expr: |
          (time() - stemedb_backup_last_success_timestamp) > 21600
        for: 30m
        labels:
          severity: critical
          component: backup
          team: sre
        annotations:
          summary: "StemeDB backup failed (no successful backup in >6 hours)"
          description: |
            Last successful backup was {{ $value | humanizeDuration }} ago.
            Expected: backups every 6 hours.

            Impact: RPO degraded from 6h to {{ $value | humanizeDuration }}.
            If failure continues, data loss risk increases.

            Troubleshooting:
            1. Check systemd service: sudo systemctl status stemedb-backup.service
            2. View logs: sudo journalctl -u stemedb-backup.service -n 100
            3. Common causes:
               - Disk full (df -h /var/backups/stemedb)
               - S3 credentials expired
               - StemeDB process locked files

            Runbook: https://docs.stemedb.io/runbooks/backup-failed

      # CRITICAL: Backup verification failed
      #
      # Fires when stemedb_backup_verification_status has been 0 for 5 minutes.
      # Because the expression filters on `== 0`, $value is always 0 here; it is
      # a status flag, not a count of passed checks, so the description reports
      # it as a status and shows the total check count separately.
      - alert: StemeDBBackupVerificationFailed
        expr: |
          stemedb_backup_verification_status == 0
        for: 5m
        labels:
          severity: critical
          component: backup
          team: sre
        annotations:
          summary: "StemeDB backup verification failed"
          description: |
            Latest backup failed integrity checks.
            Verification status: {{ $value }} (0 = failed){{ with query "stemedb_backup_verification_checks_total" }}; total checks: {{ . | first | value }}{{ end }}.

            Impact: Latest backup may be corrupted and unusable for restore.
            Cannot rely on this backup for disaster recovery.

            Troubleshooting:
            1. View verification logs: sudo journalctl -u stemedb-verify-backup.service -n 50
            2. Check which files failed:
               - WAL magic byte mismatches indicate corruption
               - CRC32C/BLAKE3 failures indicate bit rot
            3. Trigger new backup: sudo systemctl start stemedb-backup.service
            4. Re-verify: sudo systemctl start stemedb-verify-backup.service

            Runbook: https://docs.stemedb.io/runbooks/backup-verification-failed

      # CRITICAL: WAL archival lag exceeds RPO
      #
      # Fires when stemedb_wal_archival_lag_seconds has been above 900 s
      # (the 15-minute RPO target quoted in the description) for 10 minutes.
      # $value is the current lag in seconds, rendered with humanizeDuration.
      - alert: StemeDBWALArchivalLag
        expr: |
          stemedb_wal_archival_lag_seconds > 900
        for: 10m
        labels:
          severity: critical
          component: wal-archival
          team: sre
        annotations:
          summary: "StemeDB WAL archival lag exceeds RPO ({{ $value | humanizeDuration }})"
          description: |
            WAL segments are not being archived to S3 within RPO=15min target.
            Current lag: {{ $value | humanizeDuration }}.

            Impact: If disaster occurs, data loss window is {{ $value | humanizeDuration }} instead of 15min.

            Troubleshooting:
            1. Check archival service: sudo systemctl status stemedb-archive-wal.service
            2. View logs: sudo journalctl -u stemedb-archive-wal.service -n 50
            3. Common causes:
               - S3 upload slow (network congestion)
               - AWS credentials expired
               - S3 bucket quota exceeded
            4. Check S3 connectivity: aws s3 ls s3://$BUCKET/wal-archive/

            Runbook: https://docs.stemedb.io/runbooks/wal-archival-lag

      # WARNING: WAL archival failures accumulating
      #
      # Fires when the failed-segment counter has kept growing for 15 minutes.
      # increase() is used instead of rate() so that $value is the (extrapolated)
      # number of failed segments over the 15-minute window, which is what the
      # description reports. rate() would yield a per-second figure instead.
      # The firing condition is unchanged: increase > 0 exactly when rate > 0.
      - alert: StemeDBWALArchivalFailures
        expr: |
          increase(stemedb_wal_archival_segments_failed_total[15m]) > 0
        for: 15m
        labels:
          severity: warning
          component: wal-archival
          team: sre
        annotations:
          summary: "StemeDB WAL archival failures detected"
          description: |
            WAL segments are failing to upload to S3.
            Failed segments in last 15min: ~{{ printf "%.0f" $value }}.

            Impact: If failures persist, WAL archival will fall behind and RPO will degrade.

            Troubleshooting:
            1. Check recent failures: sudo journalctl -u stemedb-archive-wal.service -n 100 | grep FAIL
            2. Test S3 access: sudo -u stemedb aws s3 cp /tmp/test.txt s3://$BUCKET/test.txt
            3. Verify IAM permissions: s3:PutObject, s3:GetObject on bucket
            4. Check network: ping s3.amazonaws.com

            Runbook: https://docs.stemedb.io/runbooks/wal-archival-failures

      # WARNING: Backup age approaching threshold
      #
      # Fires when the newest successful backup is more than 18000 s (5 h) old
      # and that has remained true for 15 minutes. $value is the backup age in
      # seconds.
      #
      # NOTE(review): if backups run on a 6-hour cadence, the age passes 5 h
      # during the final hour of every normal cycle, so this warning would fire
      # routinely -- confirm the schedule and threshold.
      - alert: StemeDBBackupStale
        expr: |
          (time() - stemedb_backup_last_success_timestamp) > 18000
        for: 15m
        labels:
          severity: warning
          component: backup
          team: sre
        annotations:
          summary: "StemeDB backup is stale ({{ $value | humanizeDuration }} old)"
          description: |
            Backup age exceeds 5 hours (approaching 6-hour SLA).
            Last successful backup: {{ $value | humanizeDuration }} ago.

            Impact: RPO degrading. If failure continues, will escalate to critical.

            Troubleshooting:
            1. Check if backup is running: systemctl is-active stemedb-backup.service
            2. Check timer schedule: systemctl list-timers stemedb-backup.timer
            3. If timer disabled, re-enable: sudo systemctl enable --now stemedb-backup.timer
            4. Trigger manual backup: sudo systemctl start stemedb-backup.service

            Runbook: https://docs.stemedb.io/runbooks/backup-stale

      # WARNING: Backup size anomaly (sudden change)
      #
      # Compares the current backup size with the size recorded 6 hours earlier
      # and fires when the relative change exceeds 50% in either direction for
      # 5 minutes. $value is the absolute relative change as a ratio
      # (0.5 = 50%), rendered with humanizePercentage.
      #
      # The denominator is filtered with `> 0` so that a previous size of zero
      # drops the series instead of dividing by zero, which would otherwise
      # yield +Inf and fire the alert with a meaningless "+Inf%" value.
      - alert: StemeDBBackupSizeAnomaly
        expr: |
          abs(
            (stemedb_backup_size_bytes - stemedb_backup_size_bytes offset 6h)
            / (stemedb_backup_size_bytes offset 6h > 0)
          ) > 0.5
        for: 5m
        labels:
          severity: warning
          component: backup
          team: sre
        annotations:
          summary: "StemeDB backup size changed >50% ({{ $value | humanizePercentage }})"
          description: |
            Backup size changed by {{ $value | humanizePercentage }} compared to 6 hours ago.

            Possible causes:
            - Large data ingestion (expected if running import)
            - Data deletion/compaction
            - Backup corruption (missing files)

            Action:
            1. Check assertion count: curl http://localhost:18180/v1/health | jq .assertion_count
            2. Compare to previous backup metadata
            3. If unexpected, investigate data changes
            4. If corruption suspected, trigger new backup

            Runbook: https://docs.stemedb.io/runbooks/backup-size-anomaly

      # INFO: Backup completed successfully (for observability)
      #
      # Fires for roughly 15 minutes after each successful backup and then
      # resolves, giving one firing/resolved pair per backup for the audit
      # trail. A bare `timestamp > 0` check would stay true forever after the
      # first successful backup and never mark individual completions.
      #
      # The left-hand side of `and` supplies the sample value, so $value is
      # still the completion timestamp used by humanizeTimestamp below.
      # The 900 s window assumes the exporter and Prometheus clocks agree.
      - alert: StemeDBBackupSuccess
        expr: |
          stemedb_backup_last_success_timestamp
          and
          ((time() - stemedb_backup_last_success_timestamp) < 900)
        for: 0s
        labels:
          severity: info
          component: backup
          team: sre
        annotations:
          summary: "StemeDB backup completed successfully"
          description: |
            Backup completed at {{ $value | humanizeTimestamp }}.
            Age: {{ with query "(time() - stemedb_backup_last_success_timestamp)" }}{{ . | first | value | humanizeDuration }}{{ end }}.

            This is an informational alert for audit trail purposes.

  # Disaster-recovery readiness: whether any usable backup (local or S3) exists.
  - name: stemedb_disaster_recovery
    # Every rule in this group is evaluated once per 300 seconds.
    interval: 300s
    rules:
      # CRITICAL: Both local and S3 backups missing
      #
      # Fires when BOTH conditions have held for 1 hour:
      #   - the newest successful backup is more than 86400 s (24 h) old, and
      #   - stemedb_backup_s3_uploaded is 0.
      # `and` keeps the left-hand sample, so $value is the backup age in seconds.
      - alert: StemeDBNoViableBackup
        expr: |
          (time() - stemedb_backup_last_success_timestamp) > 86400
          and
          stemedb_backup_s3_uploaded == 0
        for: 1h
        labels:
          severity: critical
          component: disaster-recovery
          team: sre
        annotations:
          summary: "StemeDB has no viable backup (neither local nor S3)"
          description: |
            CRITICAL: No successful backup in >24 hours AND no S3 backups available.
            Last successful backup: {{ $value | humanizeDuration }} ago.

            Impact: CANNOT recover from disaster. Data loss risk is MAXIMUM.

            Immediate action required:
            1. Trigger emergency backup NOW: sudo systemctl start stemedb-backup.service
            2. Verify backup success: sudo journalctl -u stemedb-backup.service -f
            3. Force S3 upload: sudo /usr/local/bin/backup-stemedb.sh --upload-s3
            4. Page on-call engineer if failures persist

            This is a business-critical alert requiring immediate response.

            Runbook: https://docs.stemedb.io/runbooks/no-viable-backup

      # WARNING: No recent off-site (S3) backup
      #
      # Fires when the last S3 upload is more than 43200 s (12 h) old and that
      # has remained true for 30 minutes. $value is the upload age in seconds.
      # The expression does not check local backups, so the description does
      # not claim that they exist.
      - alert: StemeDBNoOffSiteBackup
        expr: |
          (time() - stemedb_backup_s3_last_upload_timestamp) > 43200
        for: 30m
        labels:
          severity: warning
          component: disaster-recovery
          team: sre
        annotations:
          summary: "StemeDB has no off-site (S3) backup in >12 hours"
          description: |
            No S3 uploads in >12 hours (last upload: {{ $value | humanizeDuration }} ago).
            Any backups taken since then exist only on local disk.

            Impact: Cannot recover from server/disk failure. Regional disaster risk.

            Troubleshooting:
            1. Check S3 upload flag: grep upload-s3 /etc/systemd/system/stemedb-backup.service
            2. Test S3 access: aws s3 ls s3://$BUCKET/
            3. Check AWS credentials: sudo -u stemedb aws sts get-caller-identity
            4. Manually trigger upload: sudo /usr/local/bin/backup-stemedb.sh --upload-s3 --output /var/backups/stemedb/$(ls -t /var/backups/stemedb | head -n1)

            Runbook: https://docs.stemedb.io/runbooks/no-offsite-backup