166 lines
6.8 KiB
YAML
166 lines
6.8 KiB
YAML
---
# ConfigMap that carries bootstrap.sh, a shell script stored as a literal
# block scalar so its newlines are preserved.
apiVersion: v1
kind: ConfigMap
metadata:
  # name and namespace belong under metadata; as top-level keys they are not
  # part of the object's identity.
  name: openobserve-alerts-bootstrap
  namespace: openobserve
data:
  # The script text must be indented deeper than this key to be part of the
  # block scalar.
  bootstrap.sh: |
|
|
#!/usr/bin/env sh
# Script preamble: shell options, settings, required-input checks and the
# Authorization header used for API calls.
# -e: stop at the first failing command; -u: an unset variable is an error.
set -eu

# Settings; each keeps its environment value when that is non-empty.
: "${ORG_ID:=default}"
: "${BASE_URL:=http://o2-openobserve-standalone.openobserve.svc.cluster.local:5080}"
: "${STREAM_NAME:=default}"
: "${TELEGRAM_CHAT_ID:=}"

# Both root credentials are required; stop before doing any work without them.
[ -n "${ZO_ROOT_USER_EMAIL:-}" ] && [ -n "${ZO_ROOT_USER_PASSWORD:-}" ] || {
  echo "missing ZO_ROOT_USER_EMAIL / ZO_ROOT_USER_PASSWORD"
  exit 1
}

# The chat id is required as well.
[ -n "$TELEGRAM_CHAT_ID" ] || {
  echo "missing TELEGRAM_CHAT_ID"
  exit 1
}

# HTTP Basic credentials: base64 of "email:password". tr removes the newlines
# base64 emits so the header value stays on a single line.
AUTH="$(printf '%s' "${ZO_ROOT_USER_EMAIL}:${ZO_ROOT_USER_PASSWORD}" | base64 | tr -d '\n')"
auth_hdr="Authorization: Basic $AUTH"
|
|
|
|
# api CURL_ARGS...
# Runs curl with the Authorization header from $auth_hdr and a JSON
# Content-Type header, forwarding every argument unchanged.
#   --silent      no progress output
#   --fail        HTTP error responses give a non-zero exit status
#   --show-error  the failure reason is still printed despite --silent
api() {
  curl --silent --fail --show-error \
    --header "$auth_hdr" \
    --header "Content-Type: application/json" \
    "$@"
}
|
|
|
|
# ensure_template NAME TYPE TITLE BODY IS_DEFAULT
# Creates or updates the alert template NAME.
#   $1 template name (also the last path segment of the PUT URL)
#   $2 template type: http or email
#   $3 title
#   $4 body
#   $5 true or false; passed to jq as JSON, sent as isDefault
ensure_template() {
  tpl_name="$1"
  tpl_kind="$2"
  tpl_title="$3"
  tpl_body="$4"
  tpl_default="$5"

  # {$name, ...} is jq shorthand for {name: $name, ...}.
  tpl_json="$(jq -n \
    --arg name "$tpl_name" \
    --arg type "$tpl_kind" \
    --arg title "$tpl_title" \
    --arg body "$tpl_body" \
    --argjson isDefault "$tpl_default" \
    '{$name, $type, $title, $body, $isDefault}')"

  # Try PUT first with all output discarded; the original author's note says
  # newer versions accept PUT as a create too. Any PUT failure falls back to
  # POST on the collection URL.
  if api --request PUT "$BASE_URL/api/$ORG_ID/alerts/templates/$tpl_name" --data "$tpl_json" >/dev/null 2>&1; then
    echo "upserted template=$tpl_name"
  else
    api --request POST "$BASE_URL/api/$ORG_ID/alerts/templates" --data "$tpl_json" >/dev/null
    echo "created template=$tpl_name"
  fi
}
|
|
|
|
# _submit_alert METHOD URL PAYLOAD FAILURE_LABEL
# Sends PAYLOAD to URL with METHOD and terminates the script with status 1
# unless the request succeeds and the JSON reply has code 200.
# FAILURE_LABEL is the message printed on failure.
_submit_alert() {
  submit_label="$4"
  resp="$(api -X "$1" "$2" -d "$3")" || {
    echo "$submit_label"
    exit 1
  }
  # printf rather than echo: some sh implementations expand backslash escapes
  # in echo's arguments, which would corrupt a JSON reply containing them.
  # A reply jq cannot parse is reported the same way as a non-200 code.
  code="$(printf '%s\n' "$resp" | jq -r '.code // empty')" || code=""
  if [ "$code" != "200" ]; then
    echo "$submit_label resp=$resp"
    exit 1
  fi
}

# ensure_alert NAME SQL PERIOD_MINUTES FREQUENCY_MINUTES SILENCE_MINUTES ROW_TEMPLATE
# Creates the SQL alert NAME on the logs stream $STREAM_NAME, or updates it
# in place when an alert with that name is already listed.
#   $1 alert name; also the key used to find an existing alert
#   $2 SQL text sent as query_condition.sql
#   $3 sent as trigger_condition.period (JSON number)
#   $4 sent as trigger_condition.frequency (JSON number, frequency_type "minutes")
#   $5 sent as trigger_condition.silence (JSON number)
#   $6 row template, sent with row_template_type "String"
# Terminates the script with status 1 when listing, creating or updating fails.
ensure_alert() {
  alert_name="$1"
  sql="$2"
  period_minutes="$3"
  frequency_minutes="$4"
  silence_minutes="$5"
  row_template="$6"

  # Fetch the list as its own step. In a pipeline only the last command's
  # status is visible, so a failed request would look like "no such alert"
  # and fall through to the create path.
  alerts_json="$(api "$BASE_URL/api/v2/$ORG_ID/alerts")" || {
    echo "failed listing alerts for alert=$alert_name"
    exit 1
  }

  # alert_id of the first entry whose name matches; empty when there is none.
  # A missing or null .list counts as an empty list, a null id as no match.
  existing_id="$(printf '%s\n' "$alerts_json" \
    | jq -r --arg n "$alert_name" \
      'first((.list // [])[] | select(.name == $n) | .alert_id) // empty')" || {
    echo "failed parsing alert list for alert=$alert_name"
    exit 1
  }

  # NOTE(review): tz_offset 330 is presumably minutes east of UTC - confirm.
  # NOTE(review): the destination "nxtgauge_telegram" is referenced by name
  # only and is not created by this function - confirm it exists.
  payload="$(jq -n \
    --arg name "$alert_name" \
    --arg stream "$STREAM_NAME" \
    --arg sql "$sql" \
    --argjson period "$period_minutes" \
    --argjson frequency "$frequency_minutes" \
    --argjson silence "$silence_minutes" \
    --arg row_template "$row_template" \
    '{
      name: $name,
      stream_type: "logs",
      stream_name: $stream,
      is_real_time: false,
      enabled: true,
      tz_offset: 330,
      destinations: ["nxtgauge_telegram"],
      row_template: $row_template,
      row_template_type: "String",
      query_condition: { type: "sql", sql: $sql },
      trigger_condition: {
        period: $period,
        operator: ">=",
        threshold: 1,
        frequency: $frequency,
        frequency_type: "minutes",
        silence: $silence
      }
    }')"

  if [ -n "$existing_id" ]; then
    _submit_alert PUT "$BASE_URL/api/v2/$ORG_ID/alerts/$existing_id" "$payload" \
      "failed updating alert=$alert_name id=$existing_id"
    echo "updated alert=$alert_name id=$existing_id"
  else
    _submit_alert POST "$BASE_URL/api/v2/$ORG_ID/alerts" "$payload" \
      "failed creating alert=$alert_name"
    echo "created alert=$alert_name"
  fi
}
|
|
|
|
# Telegram template includes useful debugging context + top rows.
|
|
# Uses OpenObserve built-in variables: {alert_url}, {alert_count}, {rows:5}, etc.
|
|
# Builds the template body: a JSON-shaped string with chat_id and text fields.
# %s is replaced by $TELEGRAM_CHAT_ID as-is; no JSON escaping is applied to it.
# The format string is single-quoted, so the shell hands every backslash to
# printf untouched. printf prints each backslash pair as one backslash, so the
# four backslashes before each n come out as two.
# NOTE(review): \" is not an escape sequence POSIX defines for printf's format
# operand; implementations differ on whether it prints a bare double quote or
# keeps the backslash. Confirm the rendered body with the sh that runs this.
telegram_body="$(printf '{\"chat_id\":\"%s\",\"text\":\"ALERT {alert_name}\\\\norg={org_name} stream={stream_type}/{stream_name}\\\\ncount={alert_count} window={alert_start_time}..{alert_end_time}\\\\n\\\\n{rows:5}\\\\n\\\\nOpen: {alert_url}\"}' "$TELEGRAM_CHAT_ID")"
|
|
|
|
# Registers the template. Arguments in order: name "telegram_nxtgauge", type
# "http", an empty title, $telegram_body as the body, and isDefault=true.
# NOTE(review): the alerts in this script send to a destination named
# "nxtgauge_telegram", which is a different identifier from this template
# name, and nothing visible here creates that destination - confirm it exists.
ensure_template "telegram_nxtgauge" "http" "" "$telegram_body" true
|
|
|
|
# Alert definitions. Arguments per call: name, SQL, then period, frequency and
# silence in minutes, then the row template.
#
# The SQL reads from "$STREAM_NAME", the same stream ensure_alert attaches the
# alert to, instead of a hard-coded "default"; overriding STREAM_NAME therefore
# keeps the query and the alert on one stream. With the default setting the
# SQL text is unchanged.
k8s_event_select="SELECT k8s_namespace_name, k8s_pod_name, body_object_reason, body_object_message FROM \"$STREAM_NAME\""
k8s_event_row="{k8s_namespace_name}/{k8s_pod_name} {body_object_reason}: {body_object_message}"
pod_log_select="SELECT k8s_pod_name, k8s_container_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\""
newest_first="ORDER BY _timestamp DESC LIMIT 50"

ensure_alert \
  "k8s-image-pull-failures" \
  "$k8s_event_select WHERE body_object_message ILIKE '%ErrImagePull%' OR body_object_message ILIKE '%ImagePullBackOff%' OR body_object_message ILIKE '%Failed to pull image%' $newest_first" \
  5 1 30 \
  "$k8s_event_row"

ensure_alert \
  "k8s-crashloopbackoff" \
  "$k8s_event_select WHERE body_object_message ILIKE '%CrashLoopBackOff%' OR body_object_message ILIKE '%Back-off restarting failed container%' $newest_first" \
  5 1 30 \
  "$k8s_event_row"

ensure_alert \
  "k8s-volume-mount-failures" \
  "$k8s_event_select WHERE body_object_message ILIKE '%FailedMount%' OR body_object_message ILIKE '%FailedAttachVolume%' OR body_object_message ILIKE '%MountVolume%' $newest_first" \
  10 2 60 \
  "$k8s_event_row"

ensure_alert \
  "argocd-errors" \
  "$pod_log_select WHERE k8s_namespace_name = 'argocd' AND (body ILIKE '%level=error%' OR body ILIKE '%ERROR%' OR body ILIKE '%ComparisonError%' OR body ILIKE '%SyncFailed%') $newest_first" \
  10 2 30 \
  "argocd/{k8s_pod_name} {k8s_container_name}: {msg}"

ensure_alert \
  "woodpecker-errors" \
  "$pod_log_select WHERE k8s_namespace_name = 'woodpecker' AND (body ILIKE '%error%' OR body ILIKE '%ERROR%' OR body ILIKE '%failed%') $newest_first" \
  10 2 30 \
  "woodpecker/{k8s_pod_name} {k8s_container_name}: {msg}"

ensure_alert \
  "registry-errors" \
  "$pod_log_select WHERE k8s_namespace_name = 'registry' AND (body ILIKE '%error%' OR body ILIKE '%ERROR%' OR body ILIKE '%413%' OR body ILIKE '%payload too large%') $newest_first" \
  10 2 60 \
  "registry/{k8s_pod_name} {k8s_container_name}: {msg}"
|