nxtgauge-gitops/ops/openobserve-alerts/configmap-additional.yaml
Ashwin Kumar Sivakumar 37a589fa87 fix(backend): add PORT env to all rust deployments (was crashing on boot)
16 of 20 rust services had no PORT env var set; their main.rs calls
std::env::var('PORT').expect('PORT must be a valid u16') which panicked
on startup. This commit adds env.PORT matching the existing containerPort
for each service. Service ports: gateway=9100 users=9101 companies=9102
jobs=9103 job_seekers=9104 customers=9105 employees=9106 photographers=9107
tutors=9108 makeup_artists=9109 developers=9110 video_editors=9111
graphic_designers=9112 social_media_managers=9113 fitness_trainers=9114
catering_services=9115 payments=9116 ugc_content_creators=9117 leads=9118
2026-06-11 01:17:15 +05:30

143 lines
No EOL
6 KiB
YAML

---
# ConfigMap carrying the shell script that creates or updates the additional
# OpenObserve alerts.
apiVersion: v1
kind: ConfigMap
metadata:
  name: openobserve-alerts-additional
  namespace: openobserve
data:
  # Literal block scalar: every line of the script must be indented deeper
  # than this key to be part of the value.
  additional-alerts.sh: |
#!/usr/bin/env sh
# Registers the additional OpenObserve alerts through the HTTP API.
#
# Environment:
#   ORG_ID                 organization segment of the API path
#                          (default: "default")
#   BASE_URL               OpenObserve base URL (default: the in-cluster
#                          service URL below)
#   STREAM_NAME            stream name sent with each alert (default: "default")
#   TELEGRAM_CHAT_ID       optional; defaulted to empty here and not read by
#                          the visible part of this script
#   ZO_ROOT_USER_EMAIL     required, user part of the Basic-auth credentials
#   ZO_ROOT_USER_PASSWORD  required, password part of the Basic-auth credentials
#
# -e aborts on the first failing command, -u treats unset variables as errors.
set -eu

ORG_ID="${ORG_ID:-default}"
BASE_URL="${BASE_URL:-http://o2-openobserve-standalone.openobserve.svc.cluster.local:5080}"
STREAM_NAME="${STREAM_NAME:-default}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"

# Fail fast with a readable message when a credential is unset or empty,
# instead of sending requests with a meaningless Authorization header.
: "${ZO_ROOT_USER_EMAIL:?ZO_ROOT_USER_EMAIL must be set and non-empty}"
: "${ZO_ROOT_USER_PASSWORD:?ZO_ROOT_USER_PASSWORD must be set and non-empty}"

# Strip any newlines base64 emits so the header value stays on a single line.
AUTH="$(printf '%s:%s' "$ZO_ROOT_USER_EMAIL" "$ZO_ROOT_USER_PASSWORD" | base64 | tr -d '\n')"
auth_hdr="Authorization: Basic $AUTH"
# api CURL_ARGS...
#
# curl wrapper used for every OpenObserve request. It adds the Basic-auth and
# JSON content-type headers and forwards all arguments to curl unchanged.
# --silent hides the progress meter, --fail turns HTTP error responses into a
# non-zero exit status, and --show-error still prints the failure reason.
api() {
  curl \
    --silent \
    --fail \
    --show-error \
    --header "$auth_hdr" \
    --header "Content-Type: application/json" \
    "$@"
}
# ensure_alert NAME SQL PERIOD FREQUENCY SILENCE ROW_TEMPLATE
#
# Creates the scheduled SQL alert NAME, or updates it in place when an alert
# with the same name is already listed for $ORG_ID.
#
#   NAME          alert name; also the key used to look up an existing alert
#   SQL           query stored in query_condition.sql
#   PERIOD        trigger_condition.period, in minutes (must be a JSON number)
#   FREQUENCY     trigger_condition.frequency, in minutes (JSON number)
#   SILENCE       trigger_condition.silence, in minutes (JSON number)
#   ROW_TEMPLATE  string stored in row_template
#
# Prints "updated alert=NAME" or "created alert=NAME" on success. A failed
# request, an unparsable response or a non-numeric PERIOD/FREQUENCY/SILENCE
# makes the failing command return non-zero, which aborts the script under
# `set -e`.
#
# POSIX sh has no function-local variables, so the names assigned here are
# script-global and are overwritten on every call.
ensure_alert() {
  alert_name="$1"
  sql="$2"
  period_minutes="$3"
  frequency_minutes="$4"
  silence_minutes="$5"
  row_template="$6"

  # Fetch the list in its own assignment. Inside a pipeline the exit status of
  # curl is discarded, so a failed lookup would be indistinguishable from
  # "alert does not exist" and would fall through to a create.
  alerts_json="$(api "$BASE_URL/api/v2/$ORG_ID/alerts")"

  # Take the first alert whose name matches. `[]?` tolerates a response with
  # no list, and `// empty` drops a null id so the result is either a usable
  # id or the empty string.
  existing_id="$(
    printf '%s' "$alerts_json" \
      | jq -r --arg n "$alert_name" \
          'first(.list[]? | select(.name == $n) | .alert_id // empty)'
  )"

  # --argjson keeps the three timing values as JSON numbers, not strings.
  payload="$(jq -n \
    --arg name "$alert_name" \
    --arg stream "$STREAM_NAME" \
    --arg sql "$sql" \
    --argjson period "$period_minutes" \
    --argjson frequency "$frequency_minutes" \
    --argjson silence "$silence_minutes" \
    --arg row_template "$row_template" \
    '{
      name: $name,
      stream_type: "logs",
      stream_name: $stream,
      is_real_time: false,
      enabled: true,
      tz_offset: 330,
      destinations: ["nxtgauge_telegram"],
      row_template: $row_template,
      row_template_type: "String",
      query_condition: { type: "sql", sql: $sql },
      trigger_condition: {
        period: $period,
        operator: ">=",
        threshold: 1,
        frequency: $frequency,
        frequency_type: "minutes",
        silence: $silence
      }
    }')"
  # NOTE(review): tz_offset 330 is presumably minutes east of UTC (+05:30);
  # confirm against the OpenObserve alert schema.

  if [ -n "$existing_id" ]; then
    api -X PUT "$BASE_URL/api/v2/$ORG_ID/alerts/$existing_id" -d "$payload" >/dev/null
    echo "updated alert=$alert_name"
  else
    api -X POST "$BASE_URL/api/v2/$ORG_ID/alerts" -d "$payload" >/dev/null
    echo "created alert=$alert_name"
  fi
}
# Alert registrations.
#
# Each call passes: name, SQL, then "PERIOD FREQUENCY SILENCE" (minutes), then
# the row template whose {placeholders} name columns selected by the SQL.
# The queries read from "$STREAM_NAME" so that they stay in step with the
# stream_name sent in the alert payload when STREAM_NAME is overridden; with
# the default value the SQL is unchanged.

# API health: rows from services whose name contains "api" with a 5xx status
# or status 0, grouped per service/endpoint/status.
ensure_alert \
  "api-health-failures" \
  "SELECT service, endpoint, status_code, COUNT(*) as count FROM \"$STREAM_NAME\" WHERE service ILIKE '%api%' AND (status_code >= 500 OR status_code = 0) GROUP BY service, endpoint, status_code ORDER BY count DESC LIMIT 50" \
  5 1 15 \
  "{service}/{endpoint} status={status_code} count={count}"

# Database health: matches any log body containing "connection refused",
# "database", "postgres" or "sqlx".
# NOTE(review): this also matches non-error lines that merely mention a
# database; confirm the noise level is acceptable.
ensure_alert \
  "database-connection-failures" \
  "SELECT k8s_namespace_name, k8s_pod_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE body ILIKE '%connection refused%' OR body ILIKE '%database%' OR body ILIKE '%postgres%' OR body ILIKE '%sqlx%' ORDER BY _timestamp DESC LIMIT 50" \
  5 1 15 \
  "{k8s_namespace_name}/{k8s_pod_name}: {msg}"

# Redis health: matches any log body containing "redis", "cache" or
# "connection timeout".
# NOTE(review): "cache" alone is a broad match; confirm it is intended.
ensure_alert \
  "redis-connection-failures" \
  "SELECT k8s_namespace_name, k8s_pod_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE body ILIKE '%redis%' OR body ILIKE '%cache%' OR body ILIKE '%connection timeout%' ORDER BY _timestamp DESC LIMIT 50" \
  5 1 15 \
  "{k8s_namespace_name}/{k8s_pod_name}: {msg}"

# Pod failures: rows whose body_object_reason is one of Failed, Evicted,
# NodeAffinity or UnexpectedAdmissionError.
ensure_alert \
  "pod-failures" \
  "SELECT k8s_namespace_name, k8s_pod_name, body_object_reason, body_object_message FROM \"$STREAM_NAME\" WHERE body_object_reason IN ('Failed', 'Evicted', 'NodeAffinity', 'UnexpectedAdmissionError') ORDER BY _timestamp DESC LIMIT 50" \
  5 1 15 \
  "{k8s_namespace_name}/{k8s_pod_name} {body_object_reason}: {body_object_message}"

# CPU high: containers with rows where cpu_usage_cores > 0.8, averaged per
# namespace/pod/container.
# NOTE(review): assumes cpu_usage_cores is present in this stream; confirm.
ensure_alert \
  "cpu-high-usage" \
  "SELECT k8s_namespace_name, k8s_pod_name, k8s_container_name, AVG(cpu_usage_cores) as avg_cpu FROM \"$STREAM_NAME\" WHERE cpu_usage_cores > 0.8 GROUP BY k8s_namespace_name, k8s_pod_name, k8s_container_name ORDER BY avg_cpu DESC LIMIT 50" \
  10 2 30 \
  "{k8s_namespace_name}/{k8s_pod_name}/{k8s_container_name} CPU={avg_cpu}"

# Memory high: containers with rows where memory_usage_bytes > 1073741824
# (1 GiB), averaged per namespace/pod/container.
# NOTE(review): assumes memory_usage_bytes is present in this stream; confirm.
ensure_alert \
  "memory-high-usage" \
  "SELECT k8s_namespace_name, k8s_pod_name, k8s_container_name, AVG(memory_usage_bytes) as avg_mem FROM \"$STREAM_NAME\" WHERE memory_usage_bytes > 1073741824 GROUP BY k8s_namespace_name, k8s_pod_name, k8s_container_name ORDER BY avg_mem DESC LIMIT 50" \
  10 2 30 \
  "{k8s_namespace_name}/{k8s_pod_name}/{k8s_container_name} MEM={avg_mem}"

# Disk full: log bodies containing "disk full", "no space left" or
# "DiskPressure".
ensure_alert \
  "disk-full-warning" \
  "SELECT k8s_node_name, k8s_namespace_name, k8s_pod_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE body ILIKE '%disk full%' OR body ILIKE '%no space left%' OR body ILIKE '%DiskPressure%' ORDER BY _timestamp DESC LIMIT 50" \
  10 2 60 \
  "{k8s_node_name}/{k8s_namespace_name}/{k8s_pod_name}: {msg}"

# Longhorn health: logs from the longhorn-system namespace containing "error",
# "degraded" or "faulted".
ensure_alert \
  "longhorn-volume-errors" \
  "SELECT k8s_namespace_name, k8s_pod_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE k8s_namespace_name = 'longhorn-system' AND (body ILIKE '%error%' OR body ILIKE '%degraded%' OR body ILIKE '%faulted%') ORDER BY _timestamp DESC LIMIT 50" \
  10 2 30 \
  "longhorn/{k8s_pod_name}: {msg}"

# ArgoCD health: logs from the argocd namespace containing "SyncFailed",
# "ComparisonError" or "ResourceQuota".
ensure_alert \
  "argocd-app-sync-failures" \
  "SELECT k8s_pod_name, k8s_container_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE k8s_namespace_name = 'argocd' AND (body ILIKE '%SyncFailed%' OR body ILIKE '%ComparisonError%' OR body ILIKE '%ResourceQuota%') ORDER BY _timestamp DESC LIMIT 50" \
  10 2 30 \
  "argocd/{k8s_pod_name}: {msg}"

# Registry health: logs from the registry namespace containing "413",
# "payload too large" or "unauthorized".
# NOTE(review): '%413%' matches the digits anywhere in the body, not only an
# HTTP status; confirm this is acceptable.
ensure_alert \
  "registry-push-failures" \
  "SELECT k8s_pod_name, k8s_container_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE k8s_namespace_name = 'registry' AND (body ILIKE '%413%' OR body ILIKE '%payload too large%' OR body ILIKE '%unauthorized%') ORDER BY _timestamp DESC LIMIT 50" \
  10 2 60 \
  "registry/{k8s_pod_name}: {msg}"