nxtgauge-gitops/apps/registry/retention-script.yaml
Ashwin Kumar Sivakumar 4eed905fb6 feat: auto-trigger garbage collection after manifest cleanup
- Added automatic GC to prune script after deleting manifests
- CronJob now uses python:3.12-slim with kubectl installed
- Added serviceAccountName: registry-gc-runner for permissions
- GC scales down registry, runs garbage-collect, scales back up
- Deletes unreferenced blob layers to actually free disk space
2026-06-12 04:50:02 +05:30

184 lines
7 KiB
YAML

# ConfigMap in the "registry" namespace that carries the retention script
# as an inline literal block under the key prune.py.
apiVersion: v1
kind: ConfigMap
metadata:
name: registry-retention-script
namespace: registry
data:
prune.py: |
import base64, json, re, urllib.request, urllib.error
# Registry endpoint and the mounted docker config file that holds its credentials.
REG = 'https://registry.nxtgauge.com'
CFG = '/auth/.dockerconfigjson'

# Only tags made of exactly 40 lowercase hex characters are pruning candidates.
PATTERN = re.compile(r'^[0-9a-f]{40}$')

# Repositories that are never pruned, regardless of how they are named.
# They back the FROM lines of the project Dockerfiles (alpine for rust, node
# variants for frontend/admin, etc.); losing one breaks the build pipeline.
BASE_IMAGES = {
    'alpine', 'busybox', 'golang', 'nginx',
    'node', 'postgres', 'redis', 'rust',
}

# Only repositories whose name starts with this prefix are ever pruned.
PROJECT_PREFIX = 'nxtgauge-'

# Read the pre-encoded basic-auth token for the registry from the docker config.
with open(CFG, 'r') as cfg_file:
    dcfg = json.load(cfg_file)
auth = dcfg['auths']['registry.nxtgauge.com']['auth']

# Default headers sent with every registry request.
HEAD = {'Authorization': f'Basic {auth}'}
def req(url, headers=None, method='GET'):
    """Send an authenticated request to the registry.

    The default ``HEAD`` headers are always sent; entries in ``headers``
    are layered on top and win on conflict.

    Returns a ``(status, headers, body)`` tuple where ``headers`` is a plain
    dict and ``body`` is the raw response bytes. Errors raised by
    ``urllib.request.urlopen`` (including the 30 second timeout) propagate
    to the caller.
    """
    merged = dict(HEAD)
    merged.update(headers or {})
    request = urllib.request.Request(url, headers=merged, method=method)
    with urllib.request.urlopen(request, timeout=30) as resp:
        status = resp.status
        response_headers = dict(resp.headers)
        payload = resp.read()
    return status, response_headers, payload
# Fetch the repository catalog (a single page of at most 1000 entries).
_, _, body = req(f'{REG}/v2/_catalog?n=1000')
# `or []` also covers a catalog that reports "repositories": null.
all_repos = json.loads(body.decode()).get('repositories') or []
# EXPLICIT SAFETY: only consider repos that match the project prefix.
# Belt and suspenders: base images (alpine/node/rust/...) are additionally
# excluded by name in case the prefix is ever changed.
repos = [r for r in all_repos if r.startswith(PROJECT_PREFIX) and r not in BASE_IMAGES]
# Sanity check: warn about every base image that is absent from the catalog.
# Sorted so the warnings come out in a stable order from run to run.
present = set(all_repos)
for b in sorted(BASE_IMAGES):
    if b not in present:
        print(f'[WARN] base image {b} not in registry catalog - re-push required!')
KEEP_N = 2  # keep last 2 SHA builds (current + 1 previous)
# Sort key for tags whose build time could not be read; sorts as newest.
_UNKNOWN_CREATED = '9999-12-31T23:59:59Z'
# Build time assumed when a manifest has no config blob or no "created" field.
_DEFAULT_CREATED = '1970-01-01T00:00:00Z'


def _inspect_tag(repo, tag):
    """Return ``(created, digest)`` for ``repo:tag``.

    ``digest`` is the Docker-Content-Digest response header of the manifest
    (``None`` when it was not obtained). ``created`` is the "created" value
    of the image config blob, ``_DEFAULT_CREATED`` when the manifest has no
    config or the config has no usable value, and ``None`` when any request
    or parse step failed.
    """
    digest = None
    try:
        _, hdrs, raw = req(
            f'{REG}/v2/{repo}/manifests/{tag}',
            headers={'Accept': 'application/vnd.docker.distribution.manifest.v2+json'},
        )
        digest = hdrs.get('Docker-Content-Digest')
        cfg = (json.loads(raw.decode()).get('config') or {}).get('digest')
        if not cfg:
            return _DEFAULT_CREATED, digest
        _, _, blob = req(f'{REG}/v2/{repo}/blobs/{cfg}')
        # `or` also covers an explicit null, which would break the sort below.
        return json.loads(blob.decode()).get('created') or _DEFAULT_CREATED, digest
    except Exception as e:
        print(f'[{repo}] could not inspect {tag}: {e}')
        return None, digest


# Number of manifests removed across all repositories; read after the loop.
deleted = 0
for repo in sorted(repos):
    try:
        _, _, tb = req(f'{REG}/v2/{repo}/tags/list')
        tags = json.loads(tb.decode()).get('tags') or []
    except Exception as e:
        print(f'[{repo}] tags/list failed: {e}')
        continue
    sha = [t for t in tags if PATTERN.match(t)]
    if len(sha) <= 1:
        print(f'[{repo}] sha={len(sha)} no prune')
        continue

    # rows: (created, tag, digest) for every SHA tag of this repository.
    rows = []
    unreadable = set()
    for t in sha:
        created, digest = _inspect_tag(repo, t)
        if created is None:
            unreadable.add(t)
            created = _UNKNOWN_CREATED
        rows.append((created, t, digest))
    rows.sort(key=lambda x: x[0], reverse=True)

    # Tags that could not be inspected are always kept, but they must not use
    # up the KEEP_N slots: otherwise the real newest builds could be pruned.
    newest = [t for _, t, _ in rows if t not in unreadable][:KEEP_N]
    keep_set = set(newest) | unreadable
    # preserve buildcache for performance
    keep_set.update(t for t in tags if t == 'buildcache')

    # A DELETE is addressed by digest and takes every tag on that digest with
    # it, so a digest still referenced by a kept SHA tag must not be deleted.
    # NOTE(review): digests of non-SHA tags (e.g. buildcache) are not looked
    # up here, so they are only protected by not being deletion candidates.
    keep_digests = {d for _, t, d in rows if d and t in keep_set}
    candidates = [(t, d) for _, t, d in rows if d and t not in keep_set]
    doomed = [(t, d) for t, d in candidates if d not in keep_digests]
    for t, d in candidates:
        if d in keep_digests:
            print(f' skipped {repo}:{t} (digest shared with a kept tag)')

    print(f'[{repo}] sha_total={len(rows)} keep={sorted(keep_set)} remove={len(doomed)}')

    removed_digests = set()
    for t, d in doomed:
        if d in removed_digests:
            # Already gone together with an earlier tag on the same digest.
            print(f' deleted {repo}:{t}')
            continue
        try:
            req(f'{REG}/v2/{repo}/manifests/{d}', method='DELETE')
            removed_digests.add(d)
            deleted += 1
            print(f' deleted {repo}:{t}')
        except urllib.error.HTTPError as e:
            print(f' delete failed {repo}:{t} code={e.code}')
        except Exception as e:
            print(f' delete failed {repo}:{t} err={e}')
print(f'deleted_manifests={deleted}')
# Trigger garbage collection to delete unreferenced blob layers.
# Runs only when at least one manifest was deleted above. The registry is
# scaled to zero for the duration of the GC job and is always scaled back up
# afterwards, whatever happened in between. Failures are logged, not raised.
if deleted > 0:
    import subprocess
    import time

    GC_JOB_NAME = 'registry-gc-once'

    def _kubectl(*args, **kwargs):
        """Run ``kubectl <args> -n registry``; kwargs go to subprocess.run."""
        return subprocess.run(['kubectl', *args, '-n', 'registry'], **kwargs)

    def _gc_job_count(field):
        """Return the GC job's ``.status.<field>`` as text ('' when unset)."""
        result = _kubectl(
            'get', 'job', GC_JOB_NAME,
            '-o', 'jsonpath={.status.' + field + '}',
            capture_output=True, text=True,
        )
        return result.stdout.strip()

    def _delete_gc_job():
        """Best-effort removal of the GC job, waiting for its pods to go too."""
        _kubectl(
            'delete', 'job', GC_JOB_NAME, '--ignore-not-found=true',
            '--cascade=foreground', '--wait=true', '--timeout=120s',
            check=False,
        )

    gc_job = {
        'apiVersion': 'batch/v1',
        'kind': 'Job',
        'metadata': {'name': GC_JOB_NAME, 'namespace': 'registry'},
        'spec': {
            'backoffLimit': 0,
            'template': {
                'spec': {
                    'restartPolicy': 'Never',
                    'containers': [{
                        'name': 'gc',
                        'image': 'registry:3',
                        'command': ['registry', 'garbage-collect', '--delete-untagged', '/etc/distribution/config.yml'],
                        'volumeMounts': [
                            {'name': 'storage', 'mountPath': '/var/lib/registry'},
                            {'name': 'config', 'mountPath': '/etc/distribution'},
                        ],
                    }],
                    'volumes': [
                        {'name': 'storage', 'persistentVolumeClaim': {'claimName': 'registry-pvc'}},
                        {'name': 'config', 'configMap': {'name': 'registry-config'}},
                    ],
                },
            },
        },
    }

    print('\n=== Triggering Garbage Collection ===')
    job_created = False
    job_finished = False
    try:
        # Scale down registry to run GC
        _kubectl('scale', 'deployment', 'docker-registry', '--replicas=0', check=True)
        print('Scaled down docker-registry deployment')
        # NOTE(review): fixed grace period only; this does not verify that the
        # registry pod has actually terminated before GC starts.
        time.sleep(5)

        # Delete old GC job if exists
        _delete_gc_job()

        # Feed the manifest over stdin so no temporary file is left behind.
        _kubectl('apply', '-f', '-', input=json.dumps(gc_job), text=True, check=True)
        job_created = True
        print('GC job created, waiting for completion...')

        # Wait up to 10 minutes for GC to complete (120 polls, 5 s apart).
        for _ in range(120):
            if _gc_job_count('succeeded') == '1':
                job_finished = True
                print('Garbage collection completed successfully')
                break
            if _gc_job_count('failed') == '1':
                job_finished = True
                print('GC job failed')
                break
            time.sleep(5)
        else:
            print('GC job did not finish within 10 minutes')
    except Exception as e:
        print(f'GC trigger failed: {e}')
    finally:
        # Ensure registry is scaled back up even if GC failed
        try:
            if job_created and not job_finished:
                # A GC pod may still be working on the storage; remove it
                # before the registry starts serving from that storage again.
                _delete_gc_job()
            result = _kubectl('scale', 'deployment', 'docker-registry', '--replicas=1', check=False)
            if result.returncode == 0:
                print('Scaled up docker-registry deployment')
            else:
                print(f'[ERROR] scaling docker-registry back up failed (exit {result.returncode})')
        except Exception as e:
            print(f'[ERROR] could not scale docker-registry back up: {e}')