fix: remove non-existent social-media-managers service and OpenObserve monitoring
- Removed social-media-managers deployment and service (image not in registry)
- Removed OpenObserve endpoint and k8s monitor cronjobs (no longer needed)
- Cleaned up configmap references

Fixes ImagePullBackOff and CreateContainerConfigError errors.
This commit is contained in:
parent
471f1da66c
commit
3a8807d3dd
9 changed files with 0 additions and 341 deletions
|
|
@ -20,7 +20,6 @@ data:
|
|||
DEVELOPERS_SERVICE_URL: "http://nxtgauge-rust-developers:9110"
|
||||
VIDEO_EDITORS_SERVICE_URL: "http://nxtgauge-rust-video-editors:9111"
|
||||
GRAPHIC_DESIGNERS_SERVICE_URL: "http://nxtgauge-rust-graphic-designers:9112"
|
||||
SOCIAL_MEDIA_MANAGERS_SERVICE_URL: "http://nxtgauge-rust-social-media-managers:9113"
|
||||
FITNESS_TRAINERS_SERVICE_URL: "http://nxtgauge-rust-fitness-trainers:9114"
|
||||
CATERING_SERVICES_SERVICE_URL: "http://nxtgauge-rust-catering-services:9115"
|
||||
PAYMENTS_SERVICE_URL: "http://nxtgauge-rust-payments:9116"
|
||||
|
|
|
|||
|
|
@ -36,8 +36,6 @@ resources:
|
|||
- video-editors-service.yaml
|
||||
- graphic-designers-deployment.yaml
|
||||
- graphic-designers-service.yaml
|
||||
- social-media-managers-deployment.yaml
|
||||
- social-media-managers-service.yaml
|
||||
- fitness-trainers-deployment.yaml
|
||||
- fitness-trainers-service.yaml
|
||||
- catering-services-deployment.yaml
|
||||
|
|
@ -45,7 +43,3 @@ resources:
|
|||
- ugc-content-creators-deployment.yaml
|
||||
- ugc-content-creators-service.yaml
|
||||
- cron-deployment.yaml
|
||||
- openobserve-endpoint-monitor-secret.yaml
|
||||
- openobserve-endpoint-monitor-cronjob.yaml
|
||||
- openobserve-k8s-monitor-rbac.yaml
|
||||
- openobserve-k8s-monitor-cronjob.yaml
|
||||
|
|
|
|||
|
|
@ -1,51 +0,0 @@
|
|||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: nxtgauge-openobserve-endpoint-monitor
|
||||
namespace: nxtgauge
|
||||
spec:
|
||||
schedule: "*/1 * * * *"
|
||||
concurrencyPolicy: Forbid
|
||||
successfulJobsHistoryLimit: 1
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
restartPolicy: OnFailure
|
||||
containers:
|
||||
- name: endpoint-monitor
|
||||
image: curlimages/curl:8.10.1
|
||||
imagePullPolicy: IfNotPresent
|
||||
envFrom:
|
||||
- secretRef:
|
||||
name: nxtgauge-openobserve-endpoint-monitor-secret
|
||||
command: ["/bin/sh", "-ec"]
|
||||
args:
|
||||
- |
|
||||
post_result() {
|
||||
name="$1"
|
||||
url="$2"
|
||||
checked_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
|
||||
out="$(curl -sS -o /dev/null -w '%{http_code} %{time_total}' --max-time 15 "$url" || true)"
|
||||
code="$(printf '%s' "$out" | awk '{print $1}')"
|
||||
total="$(printf '%s' "$out" | awk '{print $2}')"
|
||||
[ -n "$code" ] || code="0"
|
||||
[ -n "$total" ] || total="0"
|
||||
latency_ms="$(awk "BEGIN { printf \"%.0f\", $total * 1000 }")"
|
||||
if [ "$code" -ge 200 ] && [ "$code" -lt 400 ]; then
|
||||
ok="true"
|
||||
else
|
||||
ok="false"
|
||||
fi
|
||||
payload="$(printf '[{"endpoint":"%s","url":"%s","status_code":%s,"ok":%s,"latency_ms":%s,"checked_at":"%s"}]' "$name" "$url" "$code" "$ok" "$latency_ms" "$checked_at")"
|
||||
curl -sS -X POST \
|
||||
"${OO_ENDPOINT}/api/${OO_ORG}/${OO_STREAM}/_json" \
|
||||
-H "Authorization: ${OO_AUTH_HEADER}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$payload" >/dev/null
|
||||
}
|
||||
|
||||
post_result "frontend" "https://test121.nxtgauge.com/"
|
||||
post_result "admin" "https://admin.nxtgauge.com/"
|
||||
post_result "api-health" "https://api.nxtgauge.com/health"
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: nxtgauge-openobserve-endpoint-monitor-secret
|
||||
namespace: nxtgauge
|
||||
type: Opaque
|
||||
stringData:
|
||||
OO_ENDPOINT: "http://o2-openobserve-standalone.openobserve.svc.cluster.local:5080"
|
||||
OO_ORG: "default"
|
||||
OO_STREAM: "nxtgauge_endpoints"
|
||||
OO_AUTH_HEADER: "Basic cm9vdEBleGFtcGxlLmNvbTpDb21wbGV4cGFzcyMxMjM="
|
||||
|
|
@ -1,176 +0,0 @@
|
|||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: nxtgauge-openobserve-k8s-monitor
|
||||
namespace: nxtgauge
|
||||
spec:
|
||||
schedule: "*/1 * * * *"
|
||||
concurrencyPolicy: Forbid
|
||||
successfulJobsHistoryLimit: 1
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
serviceAccountName: nxtgauge-openobserve-k8s-monitor
|
||||
restartPolicy: OnFailure
|
||||
containers:
|
||||
- name: k8s-monitor
|
||||
image: python:3.12-alpine
|
||||
imagePullPolicy: IfNotPresent
|
||||
envFrom:
|
||||
- secretRef:
|
||||
name: nxtgauge-openobserve-endpoint-monitor-secret
|
||||
command: ["python", "-c"]
|
||||
args:
|
||||
- |
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
import ssl
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
|
||||
token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token"
|
||||
ca_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
|
||||
with open(token_path, "r", encoding="utf-8") as f:
|
||||
token = f.read().strip()
|
||||
|
||||
kube_ctx = ssl.create_default_context(cafile=ca_path)
|
||||
kube_api = "https://kubernetes.default.svc"
|
||||
|
||||
def kube_get(path: str):
|
||||
req = urllib.request.Request(
|
||||
kube_api + path,
|
||||
headers={"Authorization": f"Bearer {token}"},
|
||||
)
|
||||
with urllib.request.urlopen(req, context=kube_ctx, timeout=20) as resp:
|
||||
return json.loads(resp.read().decode("utf-8"))
|
||||
|
||||
def check_url(name: str, url: str):
|
||||
start = time.time()
|
||||
status = 0
|
||||
ok = False
|
||||
err = ""
|
||||
try:
|
||||
req = urllib.request.Request(url)
|
||||
with urllib.request.urlopen(req, timeout=15) as resp:
|
||||
status = int(getattr(resp, "status", 0) or 0)
|
||||
ok = 200 <= status < 400
|
||||
except urllib.error.HTTPError as e:
|
||||
status = int(getattr(e, "code", 0) or 0)
|
||||
ok = 200 <= status < 400
|
||||
err = str(e)
|
||||
except Exception as e:
|
||||
err = str(e)
|
||||
if name == "registry-svc" and status in (200, 401):
|
||||
ok = True
|
||||
latency_ms = int((time.time() - start) * 1000)
|
||||
return {
|
||||
"kind": "endpoint",
|
||||
"endpoint": name,
|
||||
"url": url,
|
||||
"status_code": status,
|
||||
"ok": ok,
|
||||
"latency_ms": latency_ms,
|
||||
"error": err,
|
||||
}
|
||||
|
||||
now = datetime.datetime.utcnow().replace(microsecond=0).isoformat() + "Z"
|
||||
records = []
|
||||
|
||||
nodes = kube_get("/api/v1/nodes").get("items", [])
|
||||
ready_count = 0
|
||||
for n in nodes:
|
||||
conds = {c.get("type"): c.get("status") for c in n.get("status", {}).get("conditions", [])}
|
||||
ready = conds.get("Ready") == "True"
|
||||
if ready:
|
||||
ready_count += 1
|
||||
records.append(
|
||||
{
|
||||
"kind": "node",
|
||||
"node": n.get("metadata", {}).get("name", "unknown"),
|
||||
"ready": ready,
|
||||
"memory_pressure": conds.get("MemoryPressure"),
|
||||
"disk_pressure": conds.get("DiskPressure"),
|
||||
"pid_pressure": conds.get("PIDPressure"),
|
||||
"network_unavailable": conds.get("NetworkUnavailable"),
|
||||
"checked_at": now,
|
||||
}
|
||||
)
|
||||
|
||||
pod_issues = 0
|
||||
pods = kube_get("/api/v1/pods").get("items", [])
|
||||
for p in pods:
|
||||
ns = p.get("metadata", {}).get("namespace", "")
|
||||
name = p.get("metadata", {}).get("name", "")
|
||||
phase = p.get("status", {}).get("phase", "")
|
||||
reason = p.get("status", {}).get("reason", "") or ""
|
||||
message = p.get("status", {}).get("message", "") or ""
|
||||
crash = False
|
||||
|
||||
for cs in p.get("status", {}).get("containerStatuses", []) or []:
|
||||
waiting = (cs.get("state") or {}).get("waiting") or {}
|
||||
if waiting.get("reason") in ("CrashLoopBackOff", "ImagePullBackOff", "ErrImagePull"):
|
||||
crash = True
|
||||
reason = waiting.get("reason", reason)
|
||||
message = waiting.get("message", message)
|
||||
|
||||
if phase in ("Pending", "Failed", "Unknown") or crash:
|
||||
pod_issues += 1
|
||||
records.append(
|
||||
{
|
||||
"kind": "pod",
|
||||
"namespace": ns,
|
||||
"pod": name,
|
||||
"phase": phase,
|
||||
"reason": reason,
|
||||
"message": message[:300],
|
||||
"checked_at": now,
|
||||
}
|
||||
)
|
||||
|
||||
endpoints = [
|
||||
("frontend-svc", "http://nxtgauge-frontend-solid.nxtgauge.svc.cluster.local/"),
|
||||
("admin-svc", "http://nxtgauge-admin-solid.nxtgauge.svc.cluster.local/"),
|
||||
("api-gateway-svc", "http://nxtgauge-rust-gateway.nxtgauge.svc.cluster.local:9100/health"),
|
||||
("registry-svc", "http://docker-registry.registry.svc.cluster.local:5000/v2/"),
|
||||
("woodpecker-svc", "http://woodpecker-server.woodpecker.svc.cluster.local/"),
|
||||
("argocd-metrics", "http://argocd-server-metrics.argocd.svc.cluster.local:8083/metrics"),
|
||||
("openobserve-svc", "http://o2-openobserve-standalone.openobserve.svc.cluster.local:5080/healthz"),
|
||||
]
|
||||
for name, url in endpoints:
|
||||
rec = check_url(name, url)
|
||||
rec["checked_at"] = now
|
||||
records.append(rec)
|
||||
|
||||
records.append(
|
||||
{
|
||||
"kind": "cluster_summary",
|
||||
"cluster": "nxtgauge",
|
||||
"node_total": len(nodes),
|
||||
"node_ready": ready_count,
|
||||
"node_not_ready": len(nodes) - ready_count,
|
||||
"pod_issues": pod_issues,
|
||||
"checked_at": now,
|
||||
}
|
||||
)
|
||||
|
||||
oo_endpoint = os.environ["OO_ENDPOINT"].rstrip("/")
|
||||
oo_org = os.environ.get("OO_ORG", "default")
|
||||
stream = "nxtgauge_k8s_health"
|
||||
auth_header = os.environ["OO_AUTH_HEADER"]
|
||||
payload = json.dumps(records).encode("utf-8")
|
||||
|
||||
req = urllib.request.Request(
|
||||
f"{oo_endpoint}/api/{oo_org}/{stream}/_json",
|
||||
data=payload,
|
||||
headers={
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": auth_header,
|
||||
},
|
||||
method="POST",
|
||||
)
|
||||
with urllib.request.urlopen(req, timeout=30) as resp:
|
||||
_ = resp.read()
|
||||
|
|
@ -1,27 +0,0 @@
|
|||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: nxtgauge-openobserve-k8s-monitor
|
||||
namespace: nxtgauge
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: nxtgauge-openobserve-k8s-monitor
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["nodes", "pods", "namespaces"]
|
||||
verbs: ["get", "list"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: nxtgauge-openobserve-k8s-monitor
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: nxtgauge-openobserve-k8s-monitor
|
||||
namespace: nxtgauge
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: nxtgauge-openobserve-k8s-monitor
|
||||
|
|
@ -1,51 +0,0 @@
|
|||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: nxtgauge-rust-social-media-managers
|
||||
labels:
|
||||
app: nxtgauge-rust-social-media-managers
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: nxtgauge-rust-social-media-managers
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: nxtgauge-rust-social-media-managers
|
||||
spec:
|
||||
containers:
|
||||
- name: social-media-managers
|
||||
image: registry.nxtgauge.com/nxtgauge-rust-social-media-managers
|
||||
imagePullPolicy: Always
|
||||
ports:
|
||||
- containerPort: 9113
|
||||
name: http
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: nxtgauge-backend-rust-config
|
||||
- secretRef:
|
||||
name: nxtgauge-backend-rust-secrets
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 9113
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 9113
|
||||
initialDelaySeconds: 20
|
||||
periodSeconds: 20
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 5
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 256Mi
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: nxtgauge-rust-social-media-managers
|
||||
namespace: nxtgauge
|
||||
labels:
|
||||
app: nxtgauge-rust-social-media-managers
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app: nxtgauge-rust-social-media-managers
|
||||
ports:
|
||||
- name: http
|
||||
port: 9113
|
||||
targetPort: 9113
|
||||
protocol: TCP
|
||||
|
|
@ -40,8 +40,6 @@ images:
|
|||
newTag: high-performance-latest
|
||||
- name: registry.nxtgauge.com/nxtgauge-rust-graphic-designers
|
||||
newTag: high-performance-latest
|
||||
- name: registry.nxtgauge.com/nxtgauge-rust-social-media-managers
|
||||
newTag: high-performance-latest
|
||||
- name: registry.nxtgauge.com/nxtgauge-rust-fitness-trainers
|
||||
newTag: high-performance-latest
|
||||
- name: registry.nxtgauge.com/nxtgauge-rust-catering-services
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue