- Added automatic GC to the prune script; it runs after manifests are deleted
- CronJob now uses python:3.12-slim with kubectl installed
- Added serviceAccountName: registry-gc-runner for permissions
- GC scales the registry down, runs garbage-collect, then scales it back up
- Unreferenced blob layers are deleted so that disk space is actually freed
184 lines
7 KiB
YAML
184 lines
7 KiB
YAML
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: registry-retention-script
|
|
namespace: registry
|
|
data:
|
|
prune.py: |
|
|
import base64, json, re, urllib.request, urllib.error
|
|
# Registry base URL and the path of the Docker config JSON that holds the
# credentials used to talk to it.
REG = "https://registry.nxtgauge.com"
CFG = "/auth/.dockerconfigjson"

# Only tags made of exactly 40 lowercase hex characters (the SHA build
# tags) are candidates for pruning.
PATTERN = re.compile(r"^[0-9a-f]{40}$")

# Base images that MUST NEVER be deleted, even if one of their names ever
# starts with "nxtgauge-".  These are the FROM lines of our Dockerfiles
# (alpine for rust, node variants for frontend/admin, etc.); if any of
# them goes missing the entire build pipeline breaks.
BASE_IMAGES = {
    "alpine", "node", "rust", "busybox",
    "golang", "nginx", "postgres", "redis",
}

# Prefix of the project images that we DO prune.  Everything outside this
# prefix is left alone.
PROJECT_PREFIX = "nxtgauge-"
|
|
|
|
# Read the Docker config JSON at CFG and turn the stored `auth` value for
# registry.nxtgauge.com into the Basic Authorization header that is sent
# with every registry request.
with open(CFG, mode='r') as cfg_file:
    dcfg = json.load(cfg_file)

auth = dcfg['auths']['registry.nxtgauge.com']['auth']
HEAD = dict(Authorization=f'Basic {auth}')
|
|
|
|
def req(url, headers=None, method='GET'):
    """Send one authenticated HTTP request and return the full reply.

    The module-level ``HEAD`` headers are always sent; entries in
    *headers* are layered on top and win on key clashes.

    Returns a ``(status, headers, body)`` tuple where *headers* is a plain
    dict built from the response headers and *body* is the raw bytes.
    Exceptions raised by ``urllib.request.urlopen`` propagate to the
    caller.  Each request uses a 30 second timeout.
    """
    merged = dict(HEAD)
    merged.update(headers or {})
    request = urllib.request.Request(url, headers=merged, method=method)
    with urllib.request.urlopen(request, timeout=30) as response:
        status = response.status
        reply_headers = dict(response.headers)
        payload = response.read()
    return status, reply_headers, payload
|
|
|
|
# Fetch the repository catalog.  Only a single page of at most 1000
# entries is requested; further pages are not followed.
_, _, body = req(f'{REG}/v2/_catalog?n=1000')
# `or []` also covers a JSON null, matching how tag lists are handled.
all_repos = json.loads(body.decode()).get('repositories') or []

# EXPLICIT SAFETY (belt and suspenders): only repos that carry the project
# prefix are ever considered for pruning.  The BASE_IMAGES exclusion is a
# second guard in case the prefix is ever changed.
repos = [r for r in all_repos if r.startswith(PROJECT_PREFIX) and r not in BASE_IMAGES]

# Sanity check: warn about every base image that is absent from the
# catalog.  Sorted so the warnings come out in a stable order.
present = set(all_repos)
missing_base = sorted(b for b in BASE_IMAGES if b not in present)
for b in missing_base:
    print(f'[WARN] base image {b} not in registry catalog - re-push required!')
|
|
|
|
# Number of most recent SHA-tagged builds kept per repository
# (current + 1 previous).
KEEP_N = 2

# Count of manifests successfully deleted across all repositories; the
# garbage-collection step only runs when this ends up above zero.
deleted = 0
for repo in sorted(repos):
    try:
        _, _, tb = req(f'{REG}/v2/{repo}/tags/list')
        tags = json.loads(tb.decode()).get('tags') or []
    except Exception as e:
        print(f'[{repo}] tags/list failed: {e}')
        continue

    sha = [t for t in tags if PATTERN.match(t)]
    if len(sha) <= 1:
        print(f'[{repo}] sha={len(sha)} no prune')
        continue

    rows = []     # (created, tag, digest) for every tag that could be inspected
    undated = []  # (tag, digest) for tags whose manifest/config fetch failed
    for t in sha:
        # A manifest without a config blob or `created` field falls back to
        # the epoch, i.e. it sorts as the oldest build.
        created = '1970-01-01T00:00:00Z'
        digest = None
        try:
            # NOTE(review): only the Docker v2 schema media type is
            # requested; confirm that CI never pushes OCI manifests or
            # indexes, which may not resolve with this Accept header.
            _, h, mb = req(
                f'{REG}/v2/{repo}/manifests/{t}',
                headers={'Accept': 'application/vnd.docker.distribution.manifest.v2+json'},
            )
            # HTTP header names are case-insensitive; req() returns a plain
            # dict, so look the digest header up without relying on casing.
            digest = next(
                (v for k, v in h.items() if k.lower() == 'docker-content-digest'),
                None,
            )
            m = json.loads(mb.decode())
            cfg = (m.get('config') or {}).get('digest')
            if cfg:
                _, _, cb = req(f'{REG}/v2/{repo}/blobs/{cfg}')
                # `or` also guards against a null `created`, which would
                # otherwise break the string sort below.
                created = json.loads(cb.decode()).get('created') or created
        except Exception:
            # A tag we cannot inspect is never deleted, and it must not use
            # up one of the KEEP_N slots meant for real, datable builds.
            undated.append((t, digest))
            continue
        rows.append((created, t, digest))

    # Newest first; timestamps are compared as strings.
    rows.sort(key=lambda x: x[0], reverse=True)

    keep_set = {t for _, t, _ in rows[:KEEP_N]}
    keep_set.update(t for t, _ in undated)

    # Deleting a manifest by digest removes it for EVERY tag that points at
    # it, so a digest shared with a kept tag must survive as well.
    keep_digests = {d for _, _, d in rows[:KEEP_N] if d}
    keep_digests.update(d for _, d in undated if d)

    # digest -> tags that disappear with it.  Grouping by digest ensures
    # each manifest is deleted (and counted) exactly once.
    doomed = {}
    for _, t, d in rows[KEEP_N:]:
        if d and d not in keep_digests:
            doomed.setdefault(d, []).append(t)

    # preserve buildcache for performance (it is never a SHA tag, so it is
    # never a deletion candidate; it is added here for the log line)
    keep_set.update(t for t in tags if t == 'buildcache')
    keep_list = sorted(keep_set)
    remove_count = sum(len(ts) for ts in doomed.values())
    print(f'[{repo}] sha_total={len(sha)} keep={keep_list} remove={remove_count}')

    for d, doomed_tags in doomed.items():
        try:
            req(f'{REG}/v2/{repo}/manifests/{d}', method='DELETE')
        except urllib.error.HTTPError as e:
            for t in doomed_tags:
                print(f' delete failed {repo}:{t} code={e.code}')
        except Exception as e:
            for t in doomed_tags:
                print(f' delete failed {repo}:{t} err={e}')
        else:
            deleted += 1
            for t in doomed_tags:
                print(f' deleted {repo}:{t}')

print(f'deleted_manifests={deleted}')
|
|
|
|
# Trigger garbage collection to delete unreferenced blob layers.
#
# Deleting manifests alone does not free the blobs, so when at least one
# manifest was deleted the registry deployment is scaled to zero, a one-off
# Job runs `registry garbage-collect` against the registry volume, and the
# deployment is scaled back to one replica afterwards -- also when any
# step in between fails.
import subprocess
import time


def _kubectl(*args, check=True, **kwargs):
    """Run ``kubectl <args> -n registry`` and return the CompletedProcess.

    With ``check=True`` a non-zero exit raises ``CalledProcessError``.
    Extra keyword arguments are passed through to ``subprocess.run``.
    """
    return subprocess.run(['kubectl', *args, '-n', 'registry'], check=check, **kwargs)


def _gc_job_counter(field):
    """Return ``.status.<field>`` of the GC Job as a stripped string.

    The string is empty when kubectl prints nothing for that field.
    """
    result = _kubectl(
        'get', 'job', 'registry-gc-once',
        '-o', f'jsonpath={{.status.{field}}}',
        check=False, capture_output=True, text=True,
    )
    return result.stdout.strip()


if deleted > 0:
    print('\n=== Triggering Garbage Collection ===')

    # One-shot Job that runs the registry's own garbage collector with the
    # registry storage and config mounted.
    gc_job = {
        'apiVersion': 'batch/v1',
        'kind': 'Job',
        'metadata': {'name': 'registry-gc-once', 'namespace': 'registry'},
        'spec': {
            'backoffLimit': 0,
            'template': {
                'spec': {
                    'restartPolicy': 'Never',
                    'containers': [{
                        'name': 'gc',
                        'image': 'registry:3',
                        'command': ['registry', 'garbage-collect', '--delete-untagged', '/etc/distribution/config.yml'],
                        'volumeMounts': [
                            {'name': 'storage', 'mountPath': '/var/lib/registry'},
                            {'name': 'config', 'mountPath': '/etc/distribution'},
                        ],
                    }],
                    'volumes': [
                        {'name': 'storage', 'persistentVolumeClaim': {'claimName': 'registry-pvc'}},
                        {'name': 'config', 'configMap': {'name': 'registry-config'}},
                    ],
                },
            },
        },
    }

    try:
        # Scale down registry to run GC
        _kubectl('scale', 'deployment', 'docker-registry', '--replicas=0')
        print('Scaled down docker-registry deployment')

        # NOTE(review): fixed grace period; this assumes the registry pod is
        # gone within 5 seconds -- confirm, or wait on the pod explicitly.
        time.sleep(5)

        # Delete old GC job if exists
        _kubectl('delete', 'job', 'registry-gc-once', '--ignore-not-found=true', check=False)
        time.sleep(2)

        # Feed the manifest through stdin so no temporary file is left behind.
        _kubectl('apply', '-f', '-', input=json.dumps(gc_job), text=True)
        print('GC job created, waiting for completion...')

        # Poll every 5 seconds, up to 120 times (about 10 minutes).
        for _ in range(120):
            if _gc_job_counter('succeeded') == '1':
                print('Garbage collection completed successfully')
                break
            if _gc_job_counter('failed') == '1':
                print('GC job failed')
                break
            time.sleep(5)
        else:
            # Do not leave GC running while the registry comes back up.
            print('GC job did not finish in time; deleting it before restart')
            _kubectl('delete', 'job', 'registry-gc-once', '--ignore-not-found=true', check=False)
    except Exception as e:
        print(f'GC trigger failed: {e}')
    finally:
        # Ensure registry is scaled back up even if GC failed, and report
        # the outcome instead of swallowing it silently.
        try:
            restored = _kubectl('scale', 'deployment', 'docker-registry', '--replicas=1', check=False)
        except OSError as e:
            print(f'Failed to scale docker-registry back up: {e}')
        else:
            if restored.returncode == 0:
                print('Scaled up docker-registry deployment')
            else:
                print(f'Failed to scale docker-registry back up (exit {restored.returncode})')
|