feat(observability): monitor every k8s node and platform endpoints in openobserve
This commit is contained in:
parent
ace5873261
commit
8844a0481d
3 changed files with 202 additions and 0 deletions
|
|
@ -46,4 +46,6 @@ resources:
|
||||||
- cron-deployment.yaml
|
- cron-deployment.yaml
|
||||||
- openobserve-endpoint-monitor-secret.yaml
|
- openobserve-endpoint-monitor-secret.yaml
|
||||||
- openobserve-endpoint-monitor-cronjob.yaml
|
- openobserve-endpoint-monitor-cronjob.yaml
|
||||||
|
- openobserve-k8s-monitor-rbac.yaml
|
||||||
|
- openobserve-k8s-monitor-cronjob.yaml
|
||||||
- k8s-migration-job.yaml
|
- k8s-migration-job.yaml
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,173 @@
|
||||||
|
# CronJob that runs an inline Python script (passed via `python -c`) which
# queries the Kubernetes API for node and pod state, probes a fixed list of
# platform URLs, and posts the resulting records to OpenObserve.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: nxtgauge-openobserve-k8s-monitor
  namespace: nxtgauge
spec:
  # Fire once per minute.
  schedule: "*/1 * * * *"
  # Do not start a new run while the previous one is still active.
  concurrencyPolicy: Forbid
  # Retain one finished successful Job and up to three failed ones.
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          # Identity whose mounted token the script uses to call the API server.
          serviceAccountName: nxtgauge-openobserve-k8s-monitor
          restartPolicy: OnFailure
          containers:
            - name: k8s-monitor
              # The script relies only on the Python standard library.
              image: python:3.12-alpine
              imagePullPolicy: IfNotPresent
              envFrom:
                # The script reads OO_ENDPOINT, OO_AUTH_HEADER and (optionally)
                # OO_ORG from the environment; presumably this secret, shared
                # with the endpoint monitor, supplies them -- TODO confirm.
                - secretRef:
                    name: nxtgauge-openobserve-endpoint-monitor-secret
              command: ["python", "-c"]
              args:
                # Single argument: the whole monitoring script as a literal block.
                - |
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import ssl
|
||||||
|
import time
|
||||||
|
import urllib.error
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
# In-cluster API access: endpoint, plus the service-account credential files
# the script expects to find mounted in the pod.
kube_api = "https://kubernetes.default.svc"
ca_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token"

# Bearer token, read once at start-up and reused for every API call.
with open(token_path, "r", encoding="utf-8") as token_file:
    token = token_file.read().strip()

# TLS context that validates the API server against the CA file at ca_path.
kube_ctx = ssl.create_default_context(cafile=ca_path)
|
||||||
|
|
||||||
|
def kube_get(path: str):
    """GET ``path`` from the Kubernetes API server and return the parsed JSON.

    ``path`` is appended verbatim to ``kube_api``. The request carries the
    module-level bearer ``token`` and is sent over the ``kube_ctx`` TLS
    context with a 20 second timeout. Errors raised by ``urlopen`` or by
    JSON decoding propagate to the caller.
    """
    auth_headers = {"Authorization": f"Bearer {token}"}
    api_request = urllib.request.Request(kube_api + path, headers=auth_headers)
    with urllib.request.urlopen(api_request, context=kube_ctx, timeout=20) as reply:
        raw_body = reply.read()
    return json.loads(raw_body.decode("utf-8"))
|
||||||
|
|
||||||
|
def check_url(name: str, url: str) -> dict:
    """Probe one URL with a GET request and describe the outcome as a record.

    Args:
        name: Label stored in the record's ``"endpoint"`` field.
        url: URL to request (15 second timeout).

    Returns:
        A dict with the keys ``kind`` (always ``"endpoint"``), ``endpoint``,
        ``url``, ``status_code``, ``ok``, ``latency_ms`` and ``error``.
        ``status_code`` is 0 when no HTTP status was obtained, ``ok`` is True
        only for statuses in the range 200-399, and ``error`` is the string
        form of the exception or ``""`` when none was raised.

    The function does not raise for probe failures: every exception from the
    request is captured into ``error`` so one bad endpoint cannot abort the
    whole monitoring run.
    """
    # Monotonic clock: the measured latency must not jump (or go negative)
    # if the wall clock is adjusted while the request is in flight.
    start = time.monotonic()
    status = 0
    err = ""
    try:
        req = urllib.request.Request(url)
        with urllib.request.urlopen(req, timeout=15) as resp:
            status = int(getattr(resp, "status", 0) or 0)
    except urllib.error.HTTPError as e:
        # The server answered, but with a status urllib treats as an error;
        # keep the code so the record shows what was returned.
        status = int(getattr(e, "code", 0) or 0)
        err = str(e)
    except Exception as e:
        # Deliberate catch-all (DNS, TLS, timeout, malformed URL, ...):
        # the failure is reported in the record instead of crashing the job.
        err = str(e)
    latency_ms = int((time.monotonic() - start) * 1000)
    return {
        "kind": "endpoint",
        "endpoint": name,
        "url": url,
        "status_code": status,
        "ok": 200 <= status < 400,
        "latency_ms": latency_ms,
        "error": err,
    }
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Collect node, pod and endpoint health records, then ship them to OpenObserve
# in a single JSON batch.
# ---------------------------------------------------------------------------

# One UTC timestamp (second precision, "Z" suffix) shared by every record of
# this run. datetime.utcnow() is deprecated as of Python 3.12, so an aware
# "now" is formatted explicitly to produce the same string.
now = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
records = []

# --- Nodes: one record per node, built from its status conditions ----------
nodes = kube_get("/api/v1/nodes").get("items", [])
ready_count = 0
for n in nodes:
    # Map condition type -> status string ("True" / "False" / "Unknown").
    conds = {
        c.get("type"): c.get("status")
        for c in (n.get("status", {}).get("conditions") or [])
    }
    ready = conds.get("Ready") == "True"
    if ready:
        ready_count += 1
    records.append(
        {
            "kind": "node",
            "node": n.get("metadata", {}).get("name", "unknown"),
            "ready": ready,
            "memory_pressure": conds.get("MemoryPressure"),
            "disk_pressure": conds.get("DiskPressure"),
            "pid_pressure": conds.get("PIDPressure"),
            "network_unavailable": conds.get("NetworkUnavailable"),
            "checked_at": now,
        }
    )

# --- Pods: only problematic pods produce a record ---------------------------
# Waiting reasons that mark a container as unable to start or stay up.
bad_waiting_reasons = ("CrashLoopBackOff", "ImagePullBackOff", "ErrImagePull")
# "Succeeded" is the normal terminal phase of finished Job/CronJob pods
# (including this monitor's own retained pod); treating it as an issue would
# keep pod_issues permanently above zero.
healthy_phases = ("Running", "Succeeded")

pod_issues = 0
pods = kube_get("/api/v1/pods").get("items", [])
for p in pods:
    meta = p.get("metadata", {})
    pod_status = p.get("status", {})
    ns = meta.get("namespace", "")
    name = meta.get("name", "")
    phase = pod_status.get("phase", "")
    reason = pod_status.get("reason", "") or ""
    message = pod_status.get("message", "") or ""
    crash = False

    for cs in pod_status.get("containerStatuses", []) or []:
        waiting = (cs.get("state") or {}).get("waiting") or {}
        if waiting.get("reason") in bad_waiting_reasons:
            crash = True
            # Prefer the container-level detail over the pod-level one; keep
            # the previous message when the container supplies none.
            reason = waiting["reason"]
            message = waiting.get("message") or message

    if phase not in healthy_phases or crash:
        pod_issues += 1
        records.append(
            {
                "kind": "pod",
                "namespace": ns,
                "pod": name,
                "phase": phase,
                "reason": reason,
                # Truncated to keep individual records small.
                "message": message[:300],
                "checked_at": now,
            }
        )

# --- Endpoints: probe each platform URL -------------------------------------
endpoints = [
    ("frontend", "https://test121.nxtgauge.com/"),
    ("admin", "https://admin.nxtgauge.com/"),
    ("api-health", "https://api.nxtgauge.com/health"),
    ("woodpecker", "https://ci.nxtgauge.com/"),
    ("argocd-server", "http://argocd-server.argocd.svc.cluster.local/healthz"),
    ("openobserve", "https://logs.nxtgauge.com/"),
]
for name, url in endpoints:
    rec = check_url(name, url)
    rec["checked_at"] = now
    records.append(rec)

# --- Cluster-wide roll-up ----------------------------------------------------
records.append(
    {
        "kind": "cluster_summary",
        "cluster": "nxtgauge",
        "node_total": len(nodes),
        "node_ready": ready_count,
        "node_not_ready": len(nodes) - ready_count,
        "pod_issues": pod_issues,
        "checked_at": now,
    }
)

# --- Ship everything to OpenObserve ------------------------------------------
# OO_ENDPOINT and OO_AUTH_HEADER are required (KeyError if absent);
# OO_ORG falls back to "default".
oo_endpoint = os.environ["OO_ENDPOINT"].rstrip("/")
oo_org = os.environ.get("OO_ORG", "default")
stream = "nxtgauge_k8s_health"
auth_header = os.environ["OO_AUTH_HEADER"]
payload = json.dumps(records).encode("utf-8")

req = urllib.request.Request(
    f"{oo_endpoint}/api/{oo_org}/{stream}/_json",
    data=payload,
    headers={
        "Content-Type": "application/json",
        "Authorization": auth_header,
    },
    method="POST",
)
# urlopen raises on HTTP error statuses, which fails the Job and makes a
# broken ingestion visible; the response body itself is not needed.
with urllib.request.urlopen(req, timeout=30) as resp:
    resp.read()
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
# Identity under which the k8s monitor CronJob pod runs.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nxtgauge-openobserve-k8s-monitor
  namespace: nxtgauge
---
# Cluster-wide read access for the monitor. The monitor script only performs
# collection LIST calls (GET /api/v1/nodes and GET /api/v1/pods), so the role
# is limited to the "list" verb on those two resources (least privilege).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nxtgauge-openobserve-k8s-monitor
rules:
  - apiGroups: [""]
    resources: ["nodes", "pods"]
    verbs: ["list"]
---
# Grants the ClusterRole above to the monitor's ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nxtgauge-openobserve-k8s-monitor
subjects:
  - kind: ServiceAccount
    name: nxtgauge-openobserve-k8s-monitor
    namespace: nxtgauge
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nxtgauge-openobserve-k8s-monitor
|
||||||
Loading…
Add table
Reference in a new issue