From 1b4ef9208386a309ba243e74cbd31a5851592b40 Mon Sep 17 00:00:00 2001
From: Ashwin Kumar Sivakumar
Date: Fri, 17 Apr 2026 17:08:31 +0530
Subject: [PATCH] fix: run db migrations as Argo PreSync hook + add openobserve collector/alerts

---
Reviewer note (ignored by git am, not part of the commit message):
the capture-group names in the filelog regex and the TELEGRAM_CHAT_ID
placeholder in the alerts README had been stripped (text between angle
brackets was lost). They are restored below; the group names were chosen
to match the k8s_* columns queried by the alert SQL. TODO: confirm both
against the original commit. The author e-mail on the From: header was
lost the same way and is left untouched.

 .../base/k8s-migration-job.yaml             |   4 +-
 argocd/openobserve-alerts-application.yaml  |  21 +++
 argocd/openobserve-otelcol-application.yaml |  21 +++
 ops/openobserve-alerts/README.md            |  22 +++
 ops/openobserve-alerts/configmap.yaml       | 166 ++++++++++++++++++
 ops/openobserve-alerts/cronjob.yaml         |  49 ++++++
 ops/openobserve-alerts/kustomization.yaml   |   9 +
 ops/openobserve-otelcol/README.md           |  23 +++
 ops/openobserve-otelcol/configmap.yaml      |  65 +++++++
 ops/openobserve-otelcol/daemonset.yaml      |  69 ++++++++
 ops/openobserve-otelcol/kustomization.yaml  |  11 ++
 ops/openobserve-otelcol/rbac.yaml           |  29 +++
 ops/openobserve-otelcol/serviceaccount.yaml |   6 +
 13 files changed, 493 insertions(+), 2 deletions(-)
 create mode 100644 argocd/openobserve-alerts-application.yaml
 create mode 100644 argocd/openobserve-otelcol-application.yaml
 create mode 100644 ops/openobserve-alerts/README.md
 create mode 100644 ops/openobserve-alerts/configmap.yaml
 create mode 100644 ops/openobserve-alerts/cronjob.yaml
 create mode 100644 ops/openobserve-alerts/kustomization.yaml
 create mode 100644 ops/openobserve-otelcol/README.md
 create mode 100644 ops/openobserve-otelcol/configmap.yaml
 create mode 100644 ops/openobserve-otelcol/daemonset.yaml
 create mode 100644 ops/openobserve-otelcol/kustomization.yaml
 create mode 100644 ops/openobserve-otelcol/rbac.yaml
 create mode 100644 ops/openobserve-otelcol/serviceaccount.yaml

diff --git a/apps/nxtgauge-backend-rust/base/k8s-migration-job.yaml b/apps/nxtgauge-backend-rust/base/k8s-migration-job.yaml
index a7c4742..70a6b6e 100644
--- a/apps/nxtgauge-backend-rust/base/k8s-migration-job.yaml
+++ b/apps/nxtgauge-backend-rust/base/k8s-migration-job.yaml
@@ -2,12 +2,12 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: nxtgauge-db-migrate
+  labels:
+    app: nxtgauge-db-migrate
   annotations:
     argocd.argoproj.io/hook: PreSync
     argocd.argoproj.io/hook-delete-policy: BeforeHookCreation,HookSucceeded
     argocd.argoproj.io/sync-wave: "-1"
-  labels:
-    app: nxtgauge-db-migrate
 spec:
   ttlSecondsAfterFinished: 300
   backoffLimit: 3
diff --git a/argocd/openobserve-alerts-application.yaml b/argocd/openobserve-alerts-application.yaml
new file mode 100644
index 0000000..675e133
--- /dev/null
+++ b/argocd/openobserve-alerts-application.yaml
@@ -0,0 +1,21 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: openobserve-alerts
+  namespace: argocd
+spec:
+  project: default
+  source:
+    repoURL: https://github.com/Traceworks2023/nxtgauge-gitops.git
+    targetRevision: main
+    path: ops/openobserve-alerts
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: openobserve
+  syncPolicy:
+    automated:
+      prune: true
+      selfHeal: true
+    syncOptions:
+      - CreateNamespace=true
+
diff --git a/argocd/openobserve-otelcol-application.yaml b/argocd/openobserve-otelcol-application.yaml
new file mode 100644
index 0000000..345a724
--- /dev/null
+++ b/argocd/openobserve-otelcol-application.yaml
@@ -0,0 +1,21 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: openobserve-otelcol
+  namespace: argocd
+spec:
+  project: default
+  source:
+    repoURL: https://github.com/Traceworks2023/nxtgauge-gitops.git
+    targetRevision: main
+    path: ops/openobserve-otelcol
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: openobserve
+  syncPolicy:
+    automated:
+      prune: true
+      selfHeal: true
+    syncOptions:
+      - CreateNamespace=true
+
diff --git a/ops/openobserve-alerts/README.md b/ops/openobserve-alerts/README.md
new file mode 100644
index 0000000..67905e3
--- /dev/null
+++ b/ops/openobserve-alerts/README.md
@@ -0,0 +1,22 @@
+# OpenObserve alerts + Telegram
+
+This deploys a CronJob (`openobserve-alerts-bootstrap`) that upserts alert templates + common alerts in OpenObserve.
+
+## Prereqs
+
+Create a Kubernetes Secret with your Telegram chat id:
+
+```bash
+kubectl -n openobserve create secret generic openobserve-telegram \
+  --from-literal=TELEGRAM_CHAT_ID='<your-telegram-chat-id>'
+```
+
+The OpenObserve credentials are read from the existing Secret created by the OpenObserve install:
+
+- `o2-openobserve-standalone` (`ZO_ROOT_USER_EMAIL`, `ZO_ROOT_USER_PASSWORD`)
+
+## Notes
+
+- Alerts are created to send to destination `nxtgauge_telegram` (must exist in OpenObserve).
+- Edit `ops/openobserve-alerts/configmap.yaml` to add/remove alerts.
+
diff --git a/ops/openobserve-alerts/configmap.yaml b/ops/openobserve-alerts/configmap.yaml
new file mode 100644
index 0000000..ac15412
--- /dev/null
+++ b/ops/openobserve-alerts/configmap.yaml
@@ -0,0 +1,166 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: openobserve-alerts-bootstrap
+  namespace: openobserve
+data:
+  bootstrap.sh: |
+    #!/usr/bin/env sh
+    set -eu
+
+    ORG_ID="${ORG_ID:-default}"
+    BASE_URL="${BASE_URL:-http://o2-openobserve-standalone.openobserve.svc.cluster.local:5080}"
+    STREAM_NAME="${STREAM_NAME:-default}"
+    TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"
+
+    if [ -z "${ZO_ROOT_USER_EMAIL:-}" ] || [ -z "${ZO_ROOT_USER_PASSWORD:-}" ]; then
+      echo "missing ZO_ROOT_USER_EMAIL / ZO_ROOT_USER_PASSWORD"
+      exit 1
+    fi
+    if [ -z "$TELEGRAM_CHAT_ID" ]; then
+      echo "missing TELEGRAM_CHAT_ID"
+      exit 1
+    fi
+
+    AUTH="$(printf '%s:%s' "$ZO_ROOT_USER_EMAIL" "$ZO_ROOT_USER_PASSWORD" | base64 | tr -d '\n')"
+    auth_hdr="Authorization: Basic $AUTH"
+
+    api() {
+      # shellcheck disable=SC2068
+      curl -sfS -H "$auth_hdr" -H "Content-Type: application/json" "$@"
+    }
+
+    ensure_template() {
+      template_name="$1"
+      template_type="$2" # http or email
+      title="$3"
+      body="$4"
+      is_default="$5" # true/false
+
+      payload="$(jq -n \
+        --arg name "$template_name" \
+        --arg type "$template_type" \
+        --arg title "$title" \
+        --arg body "$body" \
+        --argjson isDefault "$is_default" \
+        '{name: $name, type: $type, title: $title, body: $body, isDefault: $isDefault}')"
+
+      # Upsert: PUT works for existing, and also works as create in newer versions.
+      if api -X PUT "$BASE_URL/api/$ORG_ID/alerts/templates/$template_name" -d "$payload" >/dev/null 2>&1; then
+        echo "upserted template=$template_name"
+        return 0
+      fi
+
+      api -X POST "$BASE_URL/api/$ORG_ID/alerts/templates" -d "$payload" >/dev/null
+      echo "created template=$template_name"
+    }
+
+    ensure_alert() {
+      alert_name="$1"
+      sql="$2"
+      period_minutes="$3"
+      frequency_minutes="$4"
+      silence_minutes="$5"
+      row_template="$6"
+
+      existing_id="$(
+        api "$BASE_URL/api/v2/$ORG_ID/alerts" \
+          | jq -r --arg n "$alert_name" '.list[] | select(.name == $n) | .alert_id' \
+          | head -n 1
+      )"
+
+      payload="$(jq -n \
+        --arg name "$alert_name" \
+        --arg stream "$STREAM_NAME" \
+        --arg sql "$sql" \
+        --argjson period "$period_minutes" \
+        --argjson frequency "$frequency_minutes" \
+        --argjson silence "$silence_minutes" \
+        --arg row_template "$row_template" \
+        '{
+          name: $name,
+          stream_type: "logs",
+          stream_name: $stream,
+          is_real_time: false,
+          enabled: true,
+          tz_offset: 330,
+          destinations: ["nxtgauge_telegram"],
+          row_template: $row_template,
+          row_template_type: "String",
+          query_condition: { type: "sql", sql: $sql },
+          trigger_condition: {
+            period: $period,
+            operator: ">=",
+            threshold: 1,
+            frequency: $frequency,
+            frequency_type: "minutes",
+            silence: $silence
+          }
+        }')"
+
+      if [ -n "$existing_id" ] && [ "$existing_id" != "null" ]; then
+        resp="$(api -X PUT "$BASE_URL/api/v2/$ORG_ID/alerts/$existing_id" -d "$payload")" || {
+          echo "failed updating alert=$alert_name id=$existing_id"
+          exit 1
+        }
+        code="$(echo "$resp" | jq -r '.code // empty')"
+        if [ "$code" != "200" ]; then
+          echo "failed updating alert=$alert_name id=$existing_id resp=$resp"
+          exit 1
+        fi
+        echo "updated alert=$alert_name id=$existing_id"
+      else
+        resp="$(api -X POST "$BASE_URL/api/v2/$ORG_ID/alerts" -d "$payload")" || {
+          echo "failed creating alert=$alert_name"
+          exit 1
+        }
+        code="$(echo "$resp" | jq -r '.code // empty')"
+        if [ "$code" != "200" ]; then
+          echo "failed creating alert=$alert_name resp=$resp"
+          exit 1
+        fi
+        echo "created alert=$alert_name"
+      fi
+    }
+
+    # Telegram template includes useful debugging context + top rows.
+    # Uses OpenObserve built-in variables: {alert_url}, {alert_count}, {rows:5}, etc.
+    telegram_body="$(printf '{\"chat_id\":\"%s\",\"text\":\"ALERT {alert_name}\\\\norg={org_name} stream={stream_type}/{stream_name}\\\\ncount={alert_count} window={alert_start_time}..{alert_end_time}\\\\n\\\\n{rows:5}\\\\n\\\\nOpen: {alert_url}\"}' "$TELEGRAM_CHAT_ID")"
+
+    ensure_template "telegram_nxtgauge" "http" "" "$telegram_body" true
+
+    ensure_alert \
+      "k8s-image-pull-failures" \
+      "SELECT k8s_namespace_name, k8s_pod_name, body_object_reason, body_object_message FROM \"default\" WHERE body_object_message ILIKE '%ErrImagePull%' OR body_object_message ILIKE '%ImagePullBackOff%' OR body_object_message ILIKE '%Failed to pull image%' ORDER BY _timestamp DESC LIMIT 50" \
+      5 1 30 \
+      "{k8s_namespace_name}/{k8s_pod_name} {body_object_reason}: {body_object_message}"
+
+    ensure_alert \
+      "k8s-crashloopbackoff" \
+      "SELECT k8s_namespace_name, k8s_pod_name, body_object_reason, body_object_message FROM \"default\" WHERE body_object_message ILIKE '%CrashLoopBackOff%' OR body_object_message ILIKE '%Back-off restarting failed container%' ORDER BY _timestamp DESC LIMIT 50" \
+      5 1 30 \
+      "{k8s_namespace_name}/{k8s_pod_name} {body_object_reason}: {body_object_message}"
+
+    ensure_alert \
+      "k8s-volume-mount-failures" \
+      "SELECT k8s_namespace_name, k8s_pod_name, body_object_reason, body_object_message FROM \"default\" WHERE body_object_message ILIKE '%FailedMount%' OR body_object_message ILIKE '%FailedAttachVolume%' OR body_object_message ILIKE '%MountVolume%' ORDER BY _timestamp DESC LIMIT 50" \
+      10 2 60 \
+      "{k8s_namespace_name}/{k8s_pod_name} {body_object_reason}: {body_object_message}"
+
+    ensure_alert \
+      "argocd-errors" \
+      "SELECT k8s_pod_name, k8s_container_name, substring(body, 1, 220) AS msg FROM \"default\" WHERE k8s_namespace_name = 'argocd' AND (body ILIKE '%level=error%' OR body ILIKE '%ERROR%' OR body ILIKE '%ComparisonError%' OR body ILIKE '%SyncFailed%') ORDER BY _timestamp DESC LIMIT 50" \
+      10 2 30 \
+      "argocd/{k8s_pod_name} {k8s_container_name}: {msg}"
+
+    ensure_alert \
+      "woodpecker-errors" \
+      "SELECT k8s_pod_name, k8s_container_name, substring(body, 1, 220) AS msg FROM \"default\" WHERE k8s_namespace_name = 'woodpecker' AND (body ILIKE '%error%' OR body ILIKE '%ERROR%' OR body ILIKE '%failed%') ORDER BY _timestamp DESC LIMIT 50" \
+      10 2 30 \
+      "woodpecker/{k8s_pod_name} {k8s_container_name}: {msg}"
+
+    ensure_alert \
+      "registry-errors" \
+      "SELECT k8s_pod_name, k8s_container_name, substring(body, 1, 220) AS msg FROM \"default\" WHERE k8s_namespace_name = 'registry' AND (body ILIKE '%error%' OR body ILIKE '%ERROR%' OR body ILIKE '%413%' OR body ILIKE '%payload too large%') ORDER BY _timestamp DESC LIMIT 50" \
+      10 2 60 \
+      "registry/{k8s_pod_name} {k8s_container_name}: {msg}"
diff --git a/ops/openobserve-alerts/cronjob.yaml b/ops/openobserve-alerts/cronjob.yaml
new file mode 100644
index 0000000..0a2b4ec
--- /dev/null
+++ b/ops/openobserve-alerts/cronjob.yaml
@@ -0,0 +1,49 @@
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: openobserve-alerts-bootstrap
+  namespace: openobserve
+spec:
+  schedule: "*/15 * * * *"
+  concurrencyPolicy: Forbid
+  jobTemplate:
+    spec:
+      backoffLimit: 0
+      template:
+        spec:
+          restartPolicy: Never
+          containers:
+            - name: bootstrap
+              image: registry.nxtgauge.com/docker:28-cli
+              command: ["sh", "-lc"]
+              args:
+                - apk add --no-cache curl jq >/dev/null && /scripts/bootstrap.sh
+              env:
+                - name: ORG_ID
+                  value: default
+                - name: BASE_URL
+                  value: http://o2-openobserve-standalone.openobserve.svc.cluster.local:5080
+                - name: TELEGRAM_CHAT_ID
+                  valueFrom:
+                    secretKeyRef:
+                      name: openobserve-telegram
+                      key: TELEGRAM_CHAT_ID
+                - name: ZO_ROOT_USER_EMAIL
+                  valueFrom:
+                    secretKeyRef:
+                      name: o2-openobserve-standalone
+                      key: ZO_ROOT_USER_EMAIL
+                - name: ZO_ROOT_USER_PASSWORD
+                  valueFrom:
+                    secretKeyRef:
+                      name: o2-openobserve-standalone
+                      key: ZO_ROOT_USER_PASSWORD
+              volumeMounts:
+                - name: scripts
+                  mountPath: /scripts
+                  readOnly: true
+          volumes:
+            - name: scripts
+              configMap:
+                name: openobserve-alerts-bootstrap
+                defaultMode: 0555
diff --git a/ops/openobserve-alerts/kustomization.yaml b/ops/openobserve-alerts/kustomization.yaml
new file mode 100644
index 0000000..87dc0ad
--- /dev/null
+++ b/ops/openobserve-alerts/kustomization.yaml
@@ -0,0 +1,9 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: openobserve
+
+resources:
+  - configmap.yaml
+  - cronjob.yaml
+
diff --git a/ops/openobserve-otelcol/README.md b/ops/openobserve-otelcol/README.md
new file mode 100644
index 0000000..92e3bce
--- /dev/null
+++ b/ops/openobserve-otelcol/README.md
@@ -0,0 +1,23 @@
+# OpenObserve OpenTelemetry Collector (k8s logs + warning events)
+
+This deploys an OpenTelemetry Collector `DaemonSet` that ships:
+
+- container logs from `/var/log/containers/*.log`
+- Kubernetes `Warning` events (watch)
+
+to OpenObserve via OTLP/HTTP.
+
+## Image mirroring
+
+The DaemonSet expects this image in the internal registry:
+
+- `registry.nxtgauge.com/otelcol-contrib:0.105.0`
+
+Mirror it once (from a machine that can pull GHCR and push to the registry):
+
+```bash
+docker pull ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.105.0
+docker tag ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.105.0 registry.nxtgauge.com/otelcol-contrib:0.105.0
+docker push registry.nxtgauge.com/otelcol-contrib:0.105.0
+```
+
diff --git a/ops/openobserve-otelcol/configmap.yaml b/ops/openobserve-otelcol/configmap.yaml
new file mode 100644
index 0000000..2132f47
--- /dev/null
+++ b/ops/openobserve-otelcol/configmap.yaml
@@ -0,0 +1,65 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: openobserve-otelcol-config
+  namespace: openobserve
+data:
+  collector.yaml: |
+    extensions:
+      basicauth/openobserve:
+        client_auth:
+          username: ${env:ZO_ROOT_USER_EMAIL}
+          password: ${env:ZO_ROOT_USER_PASSWORD}
+
+    receivers:
+      filelog/containers:
+        include:
+          - /var/log/containers/*.log
+        start_at: end
+        include_file_path: true
+        operators:
+          - id: parse_k8s_from_path
+            type: regex_parser
+            parse_from: attributes["log.file.path"]
+            regex: '^.*/(?P<k8s_pod_name>[^_]+)_(?P<k8s_namespace_name>[^_]+)_(?P<k8s_container_name>.+)-(?P<k8s_container_id>[0-9a-f]+)\.log$'
+            on_error: drop
+
+      k8sobjects/warning_events:
+        auth_type: serviceAccount
+        objects:
+          - name: events
+            mode: watch
+            field_selector: type=Warning
+
+    processors:
+      batch: {}
+
+      resource/containers:
+        attributes:
+          - action: upsert
+            key: service.name
+            value: k8s_container_logs
+
+      resource/events:
+        attributes:
+          - action: upsert
+            key: service.name
+            value: k8s_events
+
+    exporters:
+      otlphttp/openobserve:
+        logs_endpoint: http://o2-openobserve-standalone.openobserve.svc.cluster.local:5080/api/default/v1/logs
+        auth:
+          authenticator: basicauth/openobserve
+
+    service:
+      extensions: [basicauth/openobserve]
+      pipelines:
+        logs/containers:
+          receivers: [filelog/containers]
+          processors: [resource/containers, batch]
+          exporters: [otlphttp/openobserve]
+        logs/events:
+          receivers: [k8sobjects/warning_events]
+          processors: [resource/events, batch]
+          exporters: [otlphttp/openobserve]
diff --git a/ops/openobserve-otelcol/daemonset.yaml b/ops/openobserve-otelcol/daemonset.yaml
new file mode 100644
index 0000000..c40f1bc
--- /dev/null
+++ b/ops/openobserve-otelcol/daemonset.yaml
@@ -0,0 +1,69 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: openobserve-otelcol
+  namespace: openobserve
+  labels:
+    app.kubernetes.io/name: openobserve-otelcol
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: openobserve-otelcol
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: openobserve-otelcol
+    spec:
+      serviceAccountName: openobserve-otelcol
+      tolerations:
+        - operator: Exists
+      securityContext:
+        runAsUser: 0
+        runAsGroup: 0
+      containers:
+        - name: otelcol
+          image: registry.nxtgauge.com/otelcol-contrib:0.105.0
+          args: ["--config=/conf/collector.yaml"]
+          env:
+            - name: ZO_ROOT_USER_EMAIL
+              valueFrom:
+                secretKeyRef:
+                  name: o2-openobserve-standalone
+                  key: ZO_ROOT_USER_EMAIL
+            - name: ZO_ROOT_USER_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: o2-openobserve-standalone
+                  key: ZO_ROOT_USER_PASSWORD
+          resources:
+            requests:
+              cpu: 50m
+              memory: 128Mi
+            limits:
+              cpu: 500m
+              memory: 512Mi
+          volumeMounts:
+            - name: conf
+              mountPath: /conf
+              readOnly: true
+            - name: varlogcontainers
+              mountPath: /var/log/containers
+              readOnly: true
+            - name: varlogpods
+              mountPath: /var/log/pods
+              readOnly: true
+      volumes:
+        - name: conf
+          configMap:
+            name: openobserve-otelcol-config
+            items:
+              - key: collector.yaml
+                path: collector.yaml
+        - name: varlogcontainers
+          hostPath:
+            path: /var/log/containers
+            type: DirectoryOrCreate
+        - name: varlogpods
+          hostPath:
+            path: /var/log/pods
+            type: DirectoryOrCreate
diff --git a/ops/openobserve-otelcol/kustomization.yaml b/ops/openobserve-otelcol/kustomization.yaml
new file mode 100644
index 0000000..6f02929
--- /dev/null
+++ b/ops/openobserve-otelcol/kustomization.yaml
@@ -0,0 +1,11 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: openobserve
+
+resources:
+  - serviceaccount.yaml
+  - rbac.yaml
+  - configmap.yaml
+  - daemonset.yaml
+
diff --git a/ops/openobserve-otelcol/rbac.yaml b/ops/openobserve-otelcol/rbac.yaml
new file mode 100644
index 0000000..9daef7b
--- /dev/null
+++ b/ops/openobserve-otelcol/rbac.yaml
@@ -0,0 +1,29 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: openobserve-otelcol
+rules:
+  - apiGroups: [""]
+    resources:
+      - nodes
+      - namespaces
+      - pods
+      - events
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["events.k8s.io"]
+    resources: ["events"]
+    verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: openobserve-otelcol
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: openobserve-otelcol
+subjects:
+  - kind: ServiceAccount
+    name: openobserve-otelcol
+    namespace: openobserve
+
diff --git a/ops/openobserve-otelcol/serviceaccount.yaml b/ops/openobserve-otelcol/serviceaccount.yaml
new file mode 100644
index 0000000..4b8f6b7
--- /dev/null
+++ b/ops/openobserve-otelcol/serviceaccount.yaml
@@ -0,0 +1,6 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: openobserve-otelcol
+  namespace: openobserve
+