# Source: nxtgauge-gitops/ops/openobserve-alerts/configmap.yaml (166 lines, 6.8 KiB, YAML)

---
# ConfigMap that ships bootstrap.sh, a shell script which provisions the
# OpenObserve alert template and alerts over the OpenObserve HTTP API.
apiVersion: v1
kind: ConfigMap
metadata:
  name: openobserve-alerts-bootstrap
  namespace: openobserve
data:
  # Literal block scalar: every script line must be indented deeper than this
  # key, otherwise it is parsed as YAML instead of script text.
  bootstrap.sh: |
#!/usr/bin/env sh
# Prelude: strict mode, settings with fallbacks, required-input validation and
# the Basic-auth header that api() sends with every request.
set -eu

# ':=' assigns the fallback only when the variable is unset or empty.
: "${ORG_ID:=default}"
: "${BASE_URL:=http://o2-openobserve-standalone.openobserve.svc.cluster.local:5080}"
: "${STREAM_NAME:=default}"
: "${TELEGRAM_CHAT_ID:=}"

# Both root credentials are required; the ':-' form keeps 'set -u' from
# aborting before the readable message is printed.
[ -n "${ZO_ROOT_USER_EMAIL:-}" ] && [ -n "${ZO_ROOT_USER_PASSWORD:-}" ] || {
  echo "missing ZO_ROOT_USER_EMAIL / ZO_ROOT_USER_PASSWORD"
  exit 1
}

# The Telegram chat id is interpolated into the template body later on.
[ -n "$TELEGRAM_CHAT_ID" ] || {
  echo "missing TELEGRAM_CHAT_ID"
  exit 1
}

# base64 may wrap long output across lines; strip the newlines so the value
# fits into a single header line.
AUTH="$(printf '%s' "${ZO_ROOT_USER_EMAIL}:${ZO_ROOT_USER_PASSWORD}" | base64 | tr -d '\n')"
auth_hdr="Authorization: Basic $AUTH"
# api CURL_ARGS...
# Runs curl with the Basic-auth header and a JSON Content-Type, forwarding all
# arguments unchanged. Progress output is silenced, HTTP error statuses make
# curl exit non-zero, and curl still prints its own error message.
api() {
  curl --silent --fail --show-error \
    --header "$auth_hdr" \
    --header "Content-Type: application/json" \
    "$@"
}
# ensure_template NAME TYPE TITLE BODY IS_DEFAULT
#   NAME        template name; also the last path segment of the PUT URL
#   TYPE        http or email
#   TITLE       template title
#   BODY        template body text
#   IS_DEFAULT  true/false, handed to jq as a JSON literal via --argjson
# Tries a PUT first and falls back to a POST when the PUT fails.
# POSIX sh has no function-local variables, so the assignments below are global.
ensure_template() {
  template_name="$1"
  template_type="$2"
  title="$3"
  body="$4"
  is_default="$5"

  # jq does the JSON string escaping for every field.
  payload="$(
    jq -n \
      --arg name "$template_name" \
      --arg type "$template_type" \
      --arg title "$title" \
      --arg body "$body" \
      --argjson isDefault "$is_default" \
      '{name: $name, type: $type, title: $title, body: $body, isDefault: $isDefault}'
  )"

  # Upsert: PUT works for existing, and also works as create in newer versions.
  # Its output (errors included) is discarded because a failure only means
  # "fall back to POST".
  if ! api -X PUT "$BASE_URL/api/$ORG_ID/alerts/templates/$template_name" -d "$payload" >/dev/null 2>&1; then
    # Not in a condition, so 'set -e' aborts the script if the POST fails too.
    api -X POST "$BASE_URL/api/$ORG_ID/alerts/templates" -d "$payload" >/dev/null
    echo "created template=$template_name"
    return 0
  fi

  echo "upserted template=$template_name"
}
# ensure_alert NAME SQL PERIOD FREQUENCY SILENCE ROW_TEMPLATE
#   NAME          alert name; an existing alert is matched by this name
#   SQL           query stored as the alert's SQL query condition
#   PERIOD        trigger period (minutes), JSON number
#   FREQUENCY     evaluation frequency (minutes), JSON number
#   SILENCE       silence interval (minutes), JSON number
#   ROW_TEMPLATE  per-row template string with {column} placeholders
# Updates the alert with a PUT when one with the same name is listed, otherwise
# creates it with a POST. Exits the script with status 1 on any failure.
# POSIX sh has no function-local variables, so the assignments below are global.
ensure_alert() {
  alert_name="$1"
  sql="$2"
  period_minutes="$3"
  frequency_minutes="$4"
  silence_minutes="$5"
  row_template="$6"

  # Fetch the list on its own: inside a pipeline a failed request would be
  # hidden by the later stages and the alert would wrongly be treated as new.
  alerts_json="$(api "$BASE_URL/api/v2/$ORG_ID/alerts")" || {
    echo "failed listing alerts while resolving alert=$alert_name" >&2
    exit 1
  }

  # First matching id, or nothing. '(.list // [])' tolerates a missing list and
  # '// empty' keeps a null id from being printed as the string "null".
  # printf is used instead of echo because some sh implementations let echo
  # rewrite backslash escapes, which would corrupt the JSON.
  existing_id="$(
    printf '%s\n' "$alerts_json" \
      | jq -r --arg n "$alert_name" \
          '[(.list // [])[] | select(.name == $n) | .alert_id][0] // empty'
  )" || {
    echo "failed parsing alert list while resolving alert=$alert_name" >&2
    exit 1
  }

  # NOTE(review): tz_offset 330 is presumably minutes east of UTC (UTC+05:30) - confirm.
  # NOTE(review): destination "nxtgauge_telegram" is referenced here but is not
  # created by the visible part of this script - verify it exists.
  payload="$(jq -n \
    --arg name "$alert_name" \
    --arg stream "$STREAM_NAME" \
    --arg sql "$sql" \
    --argjson period "$period_minutes" \
    --argjson frequency "$frequency_minutes" \
    --argjson silence "$silence_minutes" \
    --arg row_template "$row_template" \
    '{
      name: $name,
      stream_type: "logs",
      stream_name: $stream,
      is_real_time: false,
      enabled: true,
      tz_offset: 330,
      destinations: ["nxtgauge_telegram"],
      row_template: $row_template,
      row_template_type: "String",
      query_condition: { type: "sql", sql: $sql },
      trigger_condition: {
        period: $period,
        operator: ">=",
        threshold: 1,
        frequency: $frequency,
        frequency_type: "minutes",
        silence: $silence
      }
    }')"

  # Pick the request once; everything after this is shared by both paths.
  if [ -n "$existing_id" ] && [ "$existing_id" != "null" ]; then
    method="PUT"
    url="$BASE_URL/api/v2/$ORG_ID/alerts/$existing_id"
    doing="updating"
    outcome="updated"
    subject="alert=$alert_name id=$existing_id"
  else
    method="POST"
    url="$BASE_URL/api/v2/$ORG_ID/alerts"
    doing="creating"
    outcome="created"
    subject="alert=$alert_name"
  fi

  resp="$(api -X "$method" "$url" -d "$payload")" || {
    echo "failed $doing $subject" >&2
    exit 1
  }

  # A response that is not JSON counts as a failure and is reported with its
  # raw text instead of aborting on the jq error alone.
  code="$(printf '%s\n' "$resp" | jq -r '.code // empty' 2>/dev/null)" || code=""
  if [ "$code" != "200" ]; then
    echo "failed $doing $subject resp=$resp" >&2
    exit 1
  fi

  echo "$outcome $subject"
}
# Telegram template includes useful debugging context + top rows.
# Uses OpenObserve built-in variables: {alert_url}, {alert_count}, {rows:5}, etc.
#
# The format string is single-quoted, so JSON double quotes are written
# literally: '\"' is not an escape POSIX defines for printf, and shells
# disagree on whether it prints '"' or '\"'.
# printf collapses each '\\' to one backslash, so every '\\\\n' below reaches
# the template body as '\\n'.
# NOTE(review): presumably the doubled backslash is needed so a '\n' escape
# survives one more unescaping step downstream - confirm against a delivered
# message.
telegram_body="$(printf '{"chat_id":"%s","text":"ALERT {alert_name}\\\\norg={org_name} stream={stream_type}/{stream_name}\\\\ncount={alert_count} window={alert_start_time}..{alert_end_time}\\\\n\\\\n{rows:5}\\\\n\\\\nOpen: {alert_url}"}' "$TELEGRAM_CHAT_ID")"
ensure_template "telegram_nxtgauge" "http" "" "$telegram_body" true
# Alert catalogue. Each call passes: name, SQL, then "period frequency silence"
# (all in minutes), then the per-row template.
# The SQL reads from "$STREAM_NAME" so the queried stream always matches the
# stream_name that ensure_alert puts in the payload; with the default setting
# this is still "default".
# ILIKE is case-insensitive, so one pattern per keyword is enough.

# Image pull problems reported in Kubernetes event messages.
ensure_alert \
  "k8s-image-pull-failures" \
  "SELECT k8s_namespace_name, k8s_pod_name, body_object_reason, body_object_message FROM \"$STREAM_NAME\" WHERE body_object_message ILIKE '%ErrImagePull%' OR body_object_message ILIKE '%ImagePullBackOff%' OR body_object_message ILIKE '%Failed to pull image%' ORDER BY _timestamp DESC LIMIT 50" \
  5 1 30 \
  "{k8s_namespace_name}/{k8s_pod_name} {body_object_reason}: {body_object_message}"

# Containers stuck in a restart back-off loop.
ensure_alert \
  "k8s-crashloopbackoff" \
  "SELECT k8s_namespace_name, k8s_pod_name, body_object_reason, body_object_message FROM \"$STREAM_NAME\" WHERE body_object_message ILIKE '%CrashLoopBackOff%' OR body_object_message ILIKE '%Back-off restarting failed container%' ORDER BY _timestamp DESC LIMIT 50" \
  5 1 30 \
  "{k8s_namespace_name}/{k8s_pod_name} {body_object_reason}: {body_object_message}"

# Volume attach/mount problems.
# NOTE(review): '%MountVolume%' may also match non-failure messages that
# mention MountVolume - verify against real events.
ensure_alert \
  "k8s-volume-mount-failures" \
  "SELECT k8s_namespace_name, k8s_pod_name, body_object_reason, body_object_message FROM \"$STREAM_NAME\" WHERE body_object_message ILIKE '%FailedMount%' OR body_object_message ILIKE '%FailedAttachVolume%' OR body_object_message ILIKE '%MountVolume%' ORDER BY _timestamp DESC LIMIT 50" \
  10 2 60 \
  "{k8s_namespace_name}/{k8s_pod_name} {body_object_reason}: {body_object_message}"

# Error lines from pods in the argocd namespace.
# NOTE(review): '%ERROR%' already covers '%level=error%' and
# '%ComparisonError%' under ILIKE; kept as written pending a decision on how
# broad this alert should be.
ensure_alert \
  "argocd-errors" \
  "SELECT k8s_pod_name, k8s_container_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE k8s_namespace_name = 'argocd' AND (body ILIKE '%level=error%' OR body ILIKE '%ERROR%' OR body ILIKE '%ComparisonError%' OR body ILIKE '%SyncFailed%') ORDER BY _timestamp DESC LIMIT 50" \
  10 2 30 \
  "argocd/{k8s_pod_name} {k8s_container_name}: {msg}"

# Error/failure lines from pods in the woodpecker namespace.
ensure_alert \
  "woodpecker-errors" \
  "SELECT k8s_pod_name, k8s_container_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE k8s_namespace_name = 'woodpecker' AND (body ILIKE '%error%' OR body ILIKE '%failed%') ORDER BY _timestamp DESC LIMIT 50" \
  10 2 30 \
  "woodpecker/{k8s_pod_name} {k8s_container_name}: {msg}"

# Errors and oversized-upload responses from pods in the registry namespace.
ensure_alert \
  "registry-errors" \
  "SELECT k8s_pod_name, k8s_container_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE k8s_namespace_name = 'registry' AND (body ILIKE '%error%' OR body ILIKE '%413%' OR body ILIKE '%payload too large%') ORDER BY _timestamp DESC LIMIT 50" \
  10 2 60 \
  "registry/{k8s_pod_name} {k8s_container_name}: {msg}"