---
# ConfigMap carrying a POSIX sh script that creates or updates a fixed set of
# scheduled log alerts through the OpenObserve HTTP API (/api/v2/<org>/alerts).
# The script needs `curl`, `jq` and `base64` in the container that runs it.
apiVersion: v1
kind: ConfigMap
metadata:
  name: openobserve-alerts-additional
  namespace: openobserve
data:
  additional-alerts.sh: |
    #!/usr/bin/env sh
    # Idempotently registers the alerts defined at the bottom of this file:
    # an alert that already exists (matched by name) is updated in place,
    # otherwise it is created.
    #
    # Environment:
    #   ZO_ROOT_USER_EMAIL     (required) user for HTTP basic auth
    #   ZO_ROOT_USER_PASSWORD  (required) password for HTTP basic auth
    #   ORG_ID                 organization segment of the API path (default: default)
    #   BASE_URL               OpenObserve base URL (default: in-cluster service)
    #   STREAM_NAME            stream the alerts attach to and query (default: default)
    set -eu

    ORG_ID="${ORG_ID:-default}"
    BASE_URL="${BASE_URL:-http://o2-openobserve-standalone.openobserve.svc.cluster.local:5080}"
    STREAM_NAME="${STREAM_NAME:-default}"
    # NOTE(review): TELEGRAM_CHAT_ID is not referenced anywhere in this script;
    # kept as-is, confirm whether it can be dropped.
    TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"

    # Fail early with a readable message instead of sending an empty credential.
    : "${ZO_ROOT_USER_EMAIL:?ZO_ROOT_USER_EMAIL must be set}"
    : "${ZO_ROOT_USER_PASSWORD:?ZO_ROOT_USER_PASSWORD must be set}"

    # `tr -d '\n'` removes the line wrapping some base64 implementations add.
    AUTH="$(printf '%s:%s' "$ZO_ROOT_USER_EMAIL" "$ZO_ROOT_USER_PASSWORD" | base64 | tr -d '\n')"
    auth_hdr="Authorization: Basic $AUTH"

    # api ARGS...
    # Thin curl wrapper: adds the auth and JSON content-type headers and makes
    # HTTP error statuses fail the command (-f) while still printing errors (-S).
    api() {
      curl -sfS -H "$auth_hdr" -H "Content-Type: application/json" "$@"
    }

    # ensure_alert NAME SQL PERIOD FREQUENCY SILENCE ROW_TEMPLATE
    #   NAME          alert name; also the key used to find an existing alert
    #   SQL           query stored as the alert's query condition
    #   PERIOD        trigger period, in minutes
    #   FREQUENCY     evaluation frequency, in minutes
    #   SILENCE       silence window after firing, in minutes
    #   ROW_TEMPLATE  per-row message template ({column} placeholders)
    # Prints "updated alert=NAME" or "created alert=NAME" on success. Any failed
    # API call or JSON parse aborts the whole script via `set -e`.
    ensure_alert() {
      alert_name="$1"
      sql="$2"
      period_minutes="$3"
      frequency_minutes="$4"
      silence_minutes="$5"
      row_template="$6"

      # Fetch the list in its own assignment: inside a pipeline a curl failure
      # would be masked (POSIX sh has no pipefail) and the alert would then be
      # wrongly treated as missing and re-created.
      alerts_json="$(api "$BASE_URL/api/v2/$ORG_ID/alerts")"

      # First matching alert id, or an empty string when there is no match,
      # no `.list` key, or the id is null.
      existing_id="$(
        printf '%s' "$alerts_json" \
          | jq -r --arg n "$alert_name" \
              '[(.list // [])[] | select(.name == $n) | .alert_id] | first // empty'
      )"

      # tz_offset 330 is presumably minutes east of UTC (UTC+05:30) - confirm.
      payload="$(jq -n \
        --arg name "$alert_name" \
        --arg stream "$STREAM_NAME" \
        --arg sql "$sql" \
        --argjson period "$period_minutes" \
        --argjson frequency "$frequency_minutes" \
        --argjson silence "$silence_minutes" \
        --arg row_template "$row_template" \
        '{
          name: $name,
          stream_type: "logs",
          stream_name: $stream,
          is_real_time: false,
          enabled: true,
          tz_offset: 330,
          destinations: ["nxtgauge_telegram"],
          row_template: $row_template,
          row_template_type: "String",
          query_condition: {
            type: "sql",
            sql: $sql
          },
          trigger_condition: {
            period: $period,
            operator: ">=",
            threshold: 1,
            frequency: $frequency,
            frequency_type: "minutes",
            silence: $silence
          }
        }')"

      if [ -n "$existing_id" ]; then
        api -X PUT "$BASE_URL/api/v2/$ORG_ID/alerts/$existing_id" -d "$payload" >/dev/null
        echo "updated alert=$alert_name"
      else
        api -X POST "$BASE_URL/api/v2/$ORG_ID/alerts" -d "$payload" >/dev/null
        echo "created alert=$alert_name"
      fi
    }

    # Every query reads from $STREAM_NAME so that it always matches the
    # stream_name the alert is attached to.

    # API Health
    ensure_alert \
      "api-health-failures" \
      "SELECT service, endpoint, status_code, COUNT(*) as count FROM \"$STREAM_NAME\" WHERE service ILIKE '%api%' AND (status_code >= 500 OR status_code = 0) GROUP BY service, endpoint, status_code ORDER BY count DESC LIMIT 50" \
      5 1 15 \
      "{service}/{endpoint} status={status_code} count={count}"

    # Database Health
    ensure_alert \
      "database-connection-failures" \
      "SELECT k8s_namespace_name, k8s_pod_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE body ILIKE '%connection refused%' OR body ILIKE '%database%' OR body ILIKE '%postgres%' OR body ILIKE '%sqlx%' ORDER BY _timestamp DESC LIMIT 50" \
      5 1 15 \
      "{k8s_namespace_name}/{k8s_pod_name}: {msg}"

    # Redis Health
    ensure_alert \
      "redis-connection-failures" \
      "SELECT k8s_namespace_name, k8s_pod_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE body ILIKE '%redis%' OR body ILIKE '%cache%' OR body ILIKE '%connection timeout%' ORDER BY _timestamp DESC LIMIT 50" \
      5 1 15 \
      "{k8s_namespace_name}/{k8s_pod_name}: {msg}"

    # Pod Failures
    ensure_alert \
      "pod-failures" \
      "SELECT k8s_namespace_name, k8s_pod_name, body_object_reason, body_object_message FROM \"$STREAM_NAME\" WHERE body_object_reason IN ('Failed', 'Evicted', 'NodeAffinity', 'UnexpectedAdmissionError') ORDER BY _timestamp DESC LIMIT 50" \
      5 1 15 \
      "{k8s_namespace_name}/{k8s_pod_name} {body_object_reason}: {body_object_message}"

    # CPU High
    ensure_alert \
      "cpu-high-usage" \
      "SELECT k8s_namespace_name, k8s_pod_name, k8s_container_name, AVG(cpu_usage_cores) as avg_cpu FROM \"$STREAM_NAME\" WHERE cpu_usage_cores > 0.8 GROUP BY k8s_namespace_name, k8s_pod_name, k8s_container_name ORDER BY avg_cpu DESC LIMIT 50" \
      10 2 30 \
      "{k8s_namespace_name}/{k8s_pod_name}/{k8s_container_name} CPU={avg_cpu}"

    # Memory High (threshold 1073741824 bytes = 1 GiB)
    ensure_alert \
      "memory-high-usage" \
      "SELECT k8s_namespace_name, k8s_pod_name, k8s_container_name, AVG(memory_usage_bytes) as avg_mem FROM \"$STREAM_NAME\" WHERE memory_usage_bytes > 1073741824 GROUP BY k8s_namespace_name, k8s_pod_name, k8s_container_name ORDER BY avg_mem DESC LIMIT 50" \
      10 2 30 \
      "{k8s_namespace_name}/{k8s_pod_name}/{k8s_container_name} MEM={avg_mem}"

    # Disk Full
    ensure_alert \
      "disk-full-warning" \
      "SELECT k8s_node_name, k8s_namespace_name, k8s_pod_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE body ILIKE '%disk full%' OR body ILIKE '%no space left%' OR body ILIKE '%DiskPressure%' ORDER BY _timestamp DESC LIMIT 50" \
      10 2 60 \
      "{k8s_node_name}/{k8s_namespace_name}/{k8s_pod_name}: {msg}"

    # Longhorn Health
    ensure_alert \
      "longhorn-volume-errors" \
      "SELECT k8s_namespace_name, k8s_pod_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE k8s_namespace_name = 'longhorn-system' AND (body ILIKE '%error%' OR body ILIKE '%degraded%' OR body ILIKE '%faulted%') ORDER BY _timestamp DESC LIMIT 50" \
      10 2 30 \
      "longhorn/{k8s_pod_name}: {msg}"

    # ArgoCD Health
    ensure_alert \
      "argocd-app-sync-failures" \
      "SELECT k8s_pod_name, k8s_container_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE k8s_namespace_name = 'argocd' AND (body ILIKE '%SyncFailed%' OR body ILIKE '%ComparisonError%' OR body ILIKE '%ResourceQuota%') ORDER BY _timestamp DESC LIMIT 50" \
      10 2 30 \
      "argocd/{k8s_pod_name}: {msg}"

    # Registry Health
    ensure_alert \
      "registry-push-failures" \
      "SELECT k8s_pod_name, k8s_container_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE k8s_namespace_name = 'registry' AND (body ILIKE '%413%' OR body ILIKE '%payload too large%' OR body ILIKE '%unauthorized%') ORDER BY _timestamp DESC LIMIT 50" \
      10 2 60 \
      "registry/{k8s_pod_name}: {msg}"