fix: run db migrations as Argo PreSync hook + add openobserve collector/alerts

This commit is contained in:
Ashwin Kumar Sivakumar 2026-04-17 17:08:31 +05:30
parent 0c92767030
commit 1b4ef92083
13 changed files with 493 additions and 2 deletions

View file

@ -2,12 +2,12 @@ apiVersion: batch/v1
kind: Job
metadata:
name: nxtgauge-db-migrate
labels:
app: nxtgauge-db-migrate
annotations:
argocd.argoproj.io/hook: PreSync
argocd.argoproj.io/hook-delete-policy: BeforeHookCreation,HookSucceeded
argocd.argoproj.io/sync-wave: "-1"
labels:
app: nxtgauge-db-migrate
spec:
ttlSecondsAfterFinished: 300
backoffLimit: 3

View file

@ -0,0 +1,21 @@
---
# Argo CD Application: syncs the manifests under ops/openobserve-alerts from
# the gitops repository into the openobserve namespace.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  namespace: argocd
  name: openobserve-alerts
spec:
  project: default
  destination:
    # In-cluster API server address.
    server: https://kubernetes.default.svc
    namespace: openobserve
  source:
    repoURL: https://github.com/Traceworks2023/nxtgauge-gitops.git
    path: ops/openobserve-alerts
    targetRevision: main
  syncPolicy:
    syncOptions:
      # Create the destination namespace during sync if it does not exist.
      - CreateNamespace=true
    automated:
      # Revert changes made directly in the cluster.
      selfHeal: true
      # Delete resources that are no longer present in Git.
      prune: true

View file

@ -0,0 +1,21 @@
---
# Argo CD Application: syncs the manifests under ops/openobserve-otelcol from
# the gitops repository into the openobserve namespace.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  namespace: argocd
  name: openobserve-otelcol
spec:
  project: default
  destination:
    # In-cluster API server address.
    server: https://kubernetes.default.svc
    namespace: openobserve
  source:
    repoURL: https://github.com/Traceworks2023/nxtgauge-gitops.git
    path: ops/openobserve-otelcol
    targetRevision: main
  syncPolicy:
    syncOptions:
      # Create the destination namespace during sync if it does not exist.
      - CreateNamespace=true
    automated:
      # Revert changes made directly in the cluster.
      selfHeal: true
      # Delete resources that are no longer present in Git.
      prune: true

View file

@ -0,0 +1,22 @@
# OpenObserve alerts + Telegram
This deploys a CronJob (`openobserve-alerts-bootstrap`) that runs every 15 minutes and upserts the Telegram alert template and a set of common alerts in OpenObserve.
## Prereqs
Create a Kubernetes Secret with your Telegram chat id:
```bash
kubectl -n openobserve create secret generic openobserve-telegram \
--from-literal=TELEGRAM_CHAT_ID='<your_chat_id>'
```
The OpenObserve credentials are read from the existing Secret created by the OpenObserve install:
- `o2-openobserve-standalone` (`ZO_ROOT_USER_EMAIL`, `ZO_ROOT_USER_PASSWORD`)
## Notes
- Alerts are sent to the destination `nxtgauge_telegram`. The bootstrap job does not create this destination, so it must already exist in OpenObserve.
- Edit `ops/openobserve-alerts/configmap.yaml` to add/remove alerts.

View file

@ -0,0 +1,166 @@
---
# ConfigMap holding the script that upserts the Telegram alert template and
# the scheduled log alerts in OpenObserve.
apiVersion: v1
kind: ConfigMap
metadata:
  name: openobserve-alerts-bootstrap
  namespace: openobserve
data:
  bootstrap.sh: |
    #!/usr/bin/env sh
    # Idempotently creates or updates one alert template and a fixed set of
    # scheduled log alerts through the OpenObserve HTTP API.
    #
    # Environment:
    #   ZO_ROOT_USER_EMAIL, ZO_ROOT_USER_PASSWORD  required, basic-auth login
    #   TELEGRAM_CHAT_ID                           required, embedded in the
    #                                              template body
    #   ORG_ID       optional, defaults to "default"
    #   BASE_URL     optional, defaults to the in-cluster OpenObserve service
    #   STREAM_NAME  optional, defaults to "default"; used both as the alert's
    #                stream and as the table name in every alert query
    #
    # Needs curl, jq and base64 on PATH. Exits non-zero on the first failed
    # step.
    set -eu

    ORG_ID="${ORG_ID:-default}"
    BASE_URL="${BASE_URL:-http://o2-openobserve-standalone.openobserve.svc.cluster.local:5080}"
    STREAM_NAME="${STREAM_NAME:-default}"
    TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"

    if [ -z "${ZO_ROOT_USER_EMAIL:-}" ] || [ -z "${ZO_ROOT_USER_PASSWORD:-}" ]; then
      echo "missing ZO_ROOT_USER_EMAIL / ZO_ROOT_USER_PASSWORD"
      exit 1
    fi
    if [ -z "$TELEGRAM_CHAT_ID" ]; then
      echo "missing TELEGRAM_CHAT_ID"
      exit 1
    fi

    # base64 may wrap long output; strip newlines so the header stays on one
    # line.
    AUTH="$(printf '%s:%s' "$ZO_ROOT_USER_EMAIL" "$ZO_ROOT_USER_PASSWORD" | base64 | tr -d '\n')"
    auth_hdr="Authorization: Basic $AUTH"

    # api CURL_ARGS...
    # curl wrapper that adds the basic-auth and JSON content-type headers.
    # -f turns HTTP errors into a non-zero exit status; -sS hides the progress
    # meter but still prints error messages.
    api() {
      curl -sfS -H "$auth_hdr" -H "Content-Type: application/json" "$@"
    }

    # ensure_template NAME TYPE TITLE BODY IS_DEFAULT
    # Creates or updates an alert template.
    #   TYPE        http or email
    #   IS_DEFAULT  true or false (passed to jq as a JSON literal)
    ensure_template() {
      template_name="$1"
      template_type="$2" # http or email
      title="$3"
      body="$4"
      is_default="$5" # true/false

      payload="$(jq -n \
        --arg name "$template_name" \
        --arg type "$template_type" \
        --arg title "$title" \
        --arg body "$body" \
        --argjson isDefault "$is_default" \
        '{name: $name, type: $type, title: $title, body: $body, isDefault: $isDefault}')"

      # Upsert: PUT works for existing, and also works as create in newer versions.
      # Any PUT failure falls through to POST; a failing POST aborts the
      # script because of `set -e`.
      if api -X PUT "$BASE_URL/api/$ORG_ID/alerts/templates/$template_name" -d "$payload" >/dev/null 2>&1; then
        echo "upserted template=$template_name"
        return 0
      fi
      api -X POST "$BASE_URL/api/$ORG_ID/alerts/templates" -d "$payload" >/dev/null
      echo "created template=$template_name"
    }

    # ensure_alert NAME SQL PERIOD_MIN FREQUENCY_MIN SILENCE_MIN ROW_TEMPLATE
    # Creates the scheduled alert NAME, or updates it in place when an alert
    # with the same name already exists. The three *_MIN arguments must be
    # JSON numbers. Exits the script on any API failure.
    ensure_alert() {
      alert_name="$1"
      sql="$2"
      period_minutes="$3"
      frequency_minutes="$4"
      silence_minutes="$5"
      row_template="$6"

      # Fetch the alert list on its own so that a failed request aborts the
      # run. Inside a pipeline only the last command's exit status counts, so
      # a curl failure there would look like "alert not found" and trigger a
      # create instead of an update.
      alerts_json="$(api "$BASE_URL/api/v2/$ORG_ID/alerts")" || {
        echo "failed listing alerts while resolving alert=$alert_name"
        exit 1
      }
      # First alert whose name matches; empty when there is none or when the
      # match carries no alert_id.
      existing_id="$(
        printf '%s\n' "$alerts_json" \
          | jq -r --arg n "$alert_name" \
            '[.list[]? | select(.name == $n) | .alert_id][0] // empty'
      )" || {
        echo "failed parsing alert list while resolving alert=$alert_name"
        exit 1
      }

      payload="$(jq -n \
        --arg name "$alert_name" \
        --arg stream "$STREAM_NAME" \
        --arg sql "$sql" \
        --argjson period "$period_minutes" \
        --argjson frequency "$frequency_minutes" \
        --argjson silence "$silence_minutes" \
        --arg row_template "$row_template" \
        '{
          name: $name,
          stream_type: "logs",
          stream_name: $stream,
          is_real_time: false,
          enabled: true,
          tz_offset: 330,
          destinations: ["nxtgauge_telegram"],
          row_template: $row_template,
          row_template_type: "String",
          query_condition: { type: "sql", sql: $sql },
          trigger_condition: {
            period: $period,
            operator: ">=",
            threshold: 1,
            frequency: $frequency,
            frequency_type: "minutes",
            silence: $silence
          }
        }')"

      if [ -n "$existing_id" ]; then
        resp="$(api -X PUT "$BASE_URL/api/v2/$ORG_ID/alerts/$existing_id" -d "$payload")" || {
          echo "failed updating alert=$alert_name id=$existing_id"
          exit 1
        }
        # A successful HTTP status is not enough: the response body must also
        # report code 200.
        code="$(printf '%s\n' "$resp" | jq -r '.code // empty')"
        if [ "$code" != "200" ]; then
          echo "failed updating alert=$alert_name id=$existing_id resp=$resp"
          exit 1
        fi
        echo "updated alert=$alert_name id=$existing_id"
      else
        resp="$(api -X POST "$BASE_URL/api/v2/$ORG_ID/alerts" -d "$payload")" || {
          echo "failed creating alert=$alert_name"
          exit 1
        }
        code="$(printf '%s\n' "$resp" | jq -r '.code // empty')"
        if [ "$code" != "200" ]; then
          echo "failed creating alert=$alert_name resp=$resp"
          exit 1
        fi
        echo "created alert=$alert_name"
      fi
    }

    # Telegram template includes useful debugging context + top rows.
    # Uses OpenObserve built-in variables: {alert_url}, {alert_count}, {rows:5}, etc.
    # NOTE(review): the format string relies on printf turning \" into a plain
    # double quote, which POSIX does not specify - confirm that the template
    # stored in OpenObserve is valid JSON and that the newlines render as
    # intended.
    telegram_body="$(printf '{\"chat_id\":\"%s\",\"text\":\"ALERT {alert_name}\\\\norg={org_name} stream={stream_type}/{stream_name}\\\\ncount={alert_count} window={alert_start_time}..{alert_end_time}\\\\n\\\\n{rows:5}\\\\n\\\\nOpen: {alert_url}\"}' "$TELEGRAM_CHAT_ID")"
    ensure_template "telegram_nxtgauge" "http" "" "$telegram_body" true

    # Alerts. Arguments after the SQL are: period, frequency and silence, all
    # in minutes. Every query reads from $STREAM_NAME so that the query and
    # the alert's stream_name cannot drift apart.
    ensure_alert \
      "k8s-image-pull-failures" \
      "SELECT k8s_namespace_name, k8s_pod_name, body_object_reason, body_object_message FROM \"$STREAM_NAME\" WHERE body_object_message ILIKE '%ErrImagePull%' OR body_object_message ILIKE '%ImagePullBackOff%' OR body_object_message ILIKE '%Failed to pull image%' ORDER BY _timestamp DESC LIMIT 50" \
      5 1 30 \
      "{k8s_namespace_name}/{k8s_pod_name} {body_object_reason}: {body_object_message}"

    ensure_alert \
      "k8s-crashloopbackoff" \
      "SELECT k8s_namespace_name, k8s_pod_name, body_object_reason, body_object_message FROM \"$STREAM_NAME\" WHERE body_object_message ILIKE '%CrashLoopBackOff%' OR body_object_message ILIKE '%Back-off restarting failed container%' ORDER BY _timestamp DESC LIMIT 50" \
      5 1 30 \
      "{k8s_namespace_name}/{k8s_pod_name} {body_object_reason}: {body_object_message}"

    ensure_alert \
      "k8s-volume-mount-failures" \
      "SELECT k8s_namespace_name, k8s_pod_name, body_object_reason, body_object_message FROM \"$STREAM_NAME\" WHERE body_object_message ILIKE '%FailedMount%' OR body_object_message ILIKE '%FailedAttachVolume%' OR body_object_message ILIKE '%MountVolume%' ORDER BY _timestamp DESC LIMIT 50" \
      10 2 60 \
      "{k8s_namespace_name}/{k8s_pod_name} {body_object_reason}: {body_object_message}"

    ensure_alert \
      "argocd-errors" \
      "SELECT k8s_pod_name, k8s_container_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE k8s_namespace_name = 'argocd' AND (body ILIKE '%level=error%' OR body ILIKE '%ERROR%' OR body ILIKE '%ComparisonError%' OR body ILIKE '%SyncFailed%') ORDER BY _timestamp DESC LIMIT 50" \
      10 2 30 \
      "argocd/{k8s_pod_name} {k8s_container_name}: {msg}"

    ensure_alert \
      "woodpecker-errors" \
      "SELECT k8s_pod_name, k8s_container_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE k8s_namespace_name = 'woodpecker' AND (body ILIKE '%error%' OR body ILIKE '%ERROR%' OR body ILIKE '%failed%') ORDER BY _timestamp DESC LIMIT 50" \
      10 2 30 \
      "woodpecker/{k8s_pod_name} {k8s_container_name}: {msg}"

    ensure_alert \
      "registry-errors" \
      "SELECT k8s_pod_name, k8s_container_name, substring(body, 1, 220) AS msg FROM \"$STREAM_NAME\" WHERE k8s_namespace_name = 'registry' AND (body ILIKE '%error%' OR body ILIKE '%ERROR%' OR body ILIKE '%413%' OR body ILIKE '%payload too large%') ORDER BY _timestamp DESC LIMIT 50" \
      10 2 60 \
      "registry/{k8s_pod_name} {k8s_container_name}: {msg}"

View file

@ -0,0 +1,49 @@
---
# Runs /scripts/bootstrap.sh from the openobserve-alerts-bootstrap ConfigMap
# every 15 minutes.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: openobserve-alerts-bootstrap
  namespace: openobserve
spec:
  schedule: "*/15 * * * *"
  # Skip a scheduled run while the previous one is still active.
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      # No in-job retries; a failed run is simply retried at the next schedule.
      backoffLimit: 0
      # Terminate a run that hangs. Together with `concurrencyPolicy: Forbid`
      # a stuck run would otherwise block every later run indefinitely.
      activeDeadlineSeconds: 600
      template:
        spec:
          restartPolicy: Never
          containers:
            - name: bootstrap
              image: registry.nxtgauge.com/docker:28-cli
              command: ["sh", "-lc"]
              args:
                # curl and jq are installed at start-up, so every run needs to
                # reach an Alpine package repository.
                - apk add --no-cache curl jq >/dev/null && /scripts/bootstrap.sh
              env:
                - name: ORG_ID
                  value: default
                - name: BASE_URL
                  value: http://o2-openobserve-standalone.openobserve.svc.cluster.local:5080
                - name: TELEGRAM_CHAT_ID
                  valueFrom:
                    secretKeyRef:
                      name: openobserve-telegram
                      key: TELEGRAM_CHAT_ID
                - name: ZO_ROOT_USER_EMAIL
                  valueFrom:
                    secretKeyRef:
                      name: o2-openobserve-standalone
                      key: ZO_ROOT_USER_EMAIL
                - name: ZO_ROOT_USER_PASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: o2-openobserve-standalone
                      key: ZO_ROOT_USER_PASSWORD
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
                  readOnly: true
          volumes:
            - name: scripts
              configMap:
                name: openobserve-alerts-bootstrap
                # Octal 0555 (r-xr-xr-x) written in decimal so that YAML 1.1
                # and YAML 1.2 parsers read the same value.
                defaultMode: 365

View file

@ -0,0 +1,9 @@
---
# Kustomize entry point for the alert bootstrap: the script ConfigMap and the
# CronJob that runs it, both placed in the openobserve namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: openobserve
resources:
  - configmap.yaml
  - cronjob.yaml

View file

@ -0,0 +1,23 @@
# OpenObserve OpenTelemetry Collector (k8s logs + warning events)
This deploys an OpenTelemetry Collector `DaemonSet` that ships:
- container logs from `/var/log/containers/*.log`
- Kubernetes `Warning` events (watch)
Both are sent to OpenObserve via OTLP/HTTP.
## Image mirroring
The DaemonSet expects this image in the internal registry:
- `registry.nxtgauge.com/otelcol-contrib:0.105.0`
Mirror it once (from a machine that can pull GHCR and push to the registry):
```bash
docker pull ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.105.0
docker tag ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.105.0 registry.nxtgauge.com/otelcol-contrib:0.105.0
docker push registry.nxtgauge.com/otelcol-contrib:0.105.0
```

View file

@ -0,0 +1,65 @@
---
# OpenTelemetry Collector configuration: tails container log files and watches
# Kubernetes Warning events, and exports both to OpenObserve over OTLP/HTTP.
apiVersion: v1
kind: ConfigMap
metadata:
  name: openobserve-otelcol-config
  namespace: openobserve
data:
  collector.yaml: |
    extensions:
      # Basic-auth credentials for the exporter, read from the container's
      # environment.
      basicauth/openobserve:
        client_auth:
          username: ${env:ZO_ROOT_USER_EMAIL}
          password: ${env:ZO_ROOT_USER_PASSWORD}

    receivers:
      filelog/containers:
        include:
          - /var/log/containers/*.log
        # Skip the collector's own container log so that its exporter errors
        # are not re-ingested and exported again (feedback loop). The pattern
        # is <pod>_<namespace>_<container>-<id>.log for the
        # openobserve-otelcol DaemonSet.
        exclude:
          - /var/log/containers/openobserve-otelcol-*_openobserve_otelcol-*.log
        # Only read lines written after the collector starts.
        start_at: end
        include_file_path: true
        operators:
          # Derive pod, namespace, container name and container id from the
          # file name; entries whose path does not match are dropped.
          - id: parse_k8s_from_path
            type: regex_parser
            parse_from: attributes["log.file.path"]
            regex: '^.*/(?P<k8s_pod_name>[^_]+)_(?P<k8s_namespace_name>[^_]+)_(?P<k8s_container_name>.+)-(?P<k8s_container_id>[0-9a-f]+)\.log$'
            on_error: drop

      # NOTE(review): this config is mounted by a DaemonSet, so presumably
      # every node's collector opens its own cluster-wide watch and each
      # Warning event is exported once per node - confirm, and consider moving
      # this receiver to a single-replica Deployment.
      k8sobjects/warning_events:
        auth_type: serviceAccount
        objects:
          - name: events
            mode: watch
            field_selector: type=Warning

    processors:
      batch: {}
      # Tag each pipeline with its own service.name so the two sources can be
      # told apart downstream.
      resource/containers:
        attributes:
          - action: upsert
            key: service.name
            value: k8s_container_logs
      resource/events:
        attributes:
          - action: upsert
            key: service.name
            value: k8s_events

    exporters:
      otlphttp/openobserve:
        logs_endpoint: http://o2-openobserve-standalone.openobserve.svc.cluster.local:5080/api/default/v1/logs
        auth:
          authenticator: basicauth/openobserve

    service:
      extensions: [basicauth/openobserve]
      pipelines:
        logs/containers:
          receivers: [filelog/containers]
          processors: [resource/containers, batch]
          exporters: [otlphttp/openobserve]
        logs/events:
          receivers: [k8sobjects/warning_events]
          processors: [resource/events, batch]
          exporters: [otlphttp/openobserve]

View file

@ -0,0 +1,69 @@
---
# One OpenTelemetry Collector pod per node. It reads the node's container log
# files through read-only hostPath mounts and loads its configuration from the
# openobserve-otelcol-config ConfigMap.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  namespace: openobserve
  name: openobserve-otelcol
  labels:
    app.kubernetes.io/name: openobserve-otelcol
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: openobserve-otelcol
  template:
    metadata:
      labels:
        app.kubernetes.io/name: openobserve-otelcol
    spec:
      serviceAccountName: openobserve-otelcol
      # Runs as root - presumably so the collector can read the host's log
      # files; verify before tightening.
      securityContext:
        runAsGroup: 0
        runAsUser: 0
      # A toleration with only `operator: Exists` matches every taint.
      tolerations:
        - operator: Exists
      volumes:
        - name: conf
          configMap:
            name: openobserve-otelcol-config
            items:
              - path: collector.yaml
                key: collector.yaml
        - name: varlogcontainers
          hostPath:
            type: DirectoryOrCreate
            path: /var/log/containers
        - name: varlogpods
          hostPath:
            type: DirectoryOrCreate
            path: /var/log/pods
      containers:
        - name: otelcol
          image: registry.nxtgauge.com/otelcol-contrib:0.105.0
          args:
            - "--config=/conf/collector.yaml"
          # Credentials consumed by the collector config via ${env:...}.
          env:
            - name: ZO_ROOT_USER_EMAIL
              valueFrom:
                secretKeyRef:
                  key: ZO_ROOT_USER_EMAIL
                  name: o2-openobserve-standalone
            - name: ZO_ROOT_USER_PASSWORD
              valueFrom:
                secretKeyRef:
                  key: ZO_ROOT_USER_PASSWORD
                  name: o2-openobserve-standalone
          resources:
            limits:
              memory: 512Mi
              cpu: 500m
            requests:
              memory: 128Mi
              cpu: 50m
          volumeMounts:
            - mountPath: /conf
              name: conf
              readOnly: true
            - mountPath: /var/log/containers
              name: varlogcontainers
              readOnly: true
            - mountPath: /var/log/pods
              name: varlogpods
              readOnly: true

View file

@ -0,0 +1,11 @@
---
# Kustomize entry point for the collector: service account, RBAC, collector
# configuration and the DaemonSet, placed in the openobserve namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: openobserve
resources:
  - serviceaccount.yaml
  - rbac.yaml
  - configmap.yaml
  - daemonset.yaml

View file

@ -0,0 +1,29 @@
# Cluster-wide read-only access for the collector's service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: openobserve-otelcol
rules:
  # Core API group.
  - apiGroups:
      - ""
    resources: ["nodes", "namespaces", "pods", "events"]
    verbs:
      - get
      - list
      - watch
  # Events served by the events.k8s.io API group.
  - apiGroups:
      - events.k8s.io
    resources:
      - events
    verbs:
      - get
      - list
      - watch
---
# Grants the openobserve-otelcol ClusterRole to the service account of the
# same name in the openobserve namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: openobserve-otelcol
subjects:
  - kind: ServiceAccount
    namespace: openobserve
    name: openobserve-otelcol
roleRef:
  kind: ClusterRole
  name: openobserve-otelcol
  apiGroup: rbac.authorization.k8s.io

View file

@ -0,0 +1,6 @@
---
# Identity used by the collector pods; the ClusterRoleBinding named
# openobserve-otelcol refers to it.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: openobserve
  name: openobserve-otelcol