chore(workspace): add hardened startup/security workflows and skill suite

This commit is contained in:
zap
2026-03-04 19:13:33 +00:00
parent 4903e9d75d
commit 808af5ee13
58 changed files with 3787 additions and 3 deletions

View File

@@ -0,0 +1,7 @@
{
"version": 1,
"registry": "https://clawhub.ai",
"slug": "kubernetes",
"installedVersion": "1.0.0",
"installedAt": 1772497715868
}

542
skills/kubernetes/SKILL.md Normal file
View File

@@ -0,0 +1,542 @@
---
name: kubernetes
description: |
Comprehensive Kubernetes and OpenShift cluster management skill covering operations, troubleshooting, manifest generation, security, and GitOps. Use this skill when:
(1) Cluster operations: upgrades, backups, node management, scaling, monitoring setup
(2) Troubleshooting: pod failures, networking issues, storage problems, performance analysis
(3) Creating manifests: Deployments, StatefulSets, Services, Ingress, NetworkPolicies, RBAC
(4) Security: audits, Pod Security Standards, RBAC, secrets management, vulnerability scanning
(5) GitOps: ArgoCD, Flux, Kustomize, Helm, CI/CD pipelines, progressive delivery
(6) OpenShift-specific: SCCs, Routes, Operators, Builds, ImageStreams
(7) Multi-cloud: AKS, EKS, GKE, ARO, ROSA operations
metadata:
author: cluster-skills
version: "1.0.0"
---
# Kubernetes & OpenShift Cluster Management
Comprehensive skill for Kubernetes and OpenShift clusters covering operations, troubleshooting, manifests, security, and GitOps.
## Current Versions (January 2026)
| Platform | Version | Documentation |
|----------|---------|---------------|
| **Kubernetes** | 1.31.x | https://kubernetes.io/docs/ |
| **OpenShift** | 4.17.x | https://docs.openshift.com/ |
| **EKS** | 1.31 | https://docs.aws.amazon.com/eks/ |
| **AKS** | 1.31 | https://learn.microsoft.com/azure/aks/ |
| **GKE** | 1.31 | https://cloud.google.com/kubernetes-engine/docs |
### Key Tools
| Tool | Version | Purpose |
|------|---------|---------|
| **ArgoCD** | v2.13.x | GitOps deployments |
| **Flux** | v2.4.x | GitOps toolkit |
| **Kustomize** | v5.5.x | Manifest customization |
| **Helm** | v3.16.x | Package management |
| **Velero** | 1.15.x | Backup/restore |
| **Trivy** | 0.58.x | Security scanning |
| **Kyverno** | 1.13.x | Policy engine |
## Command Convention
**IMPORTANT**: Use `kubectl` for standard Kubernetes. Use `oc` for OpenShift/ARO.
---
## 1. CLUSTER OPERATIONS
### Node Management
```bash
# View nodes
kubectl get nodes -o wide
# Drain node for maintenance
kubectl drain ${NODE} --ignore-daemonsets --delete-emptydir-data --grace-period=60
# Uncordon after maintenance
kubectl uncordon ${NODE}
# View node resources
kubectl top nodes
```
### Cluster Upgrades
**AKS:**
```bash
az aks get-upgrades -g ${RG} -n ${CLUSTER} -o table
az aks upgrade -g ${RG} -n ${CLUSTER} --kubernetes-version ${VERSION}
```
**EKS:**
```bash
aws eks update-cluster-version --name ${CLUSTER} --kubernetes-version ${VERSION}
```
**GKE:**
```bash
gcloud container clusters upgrade ${CLUSTER} --master --cluster-version ${VERSION}
```
**OpenShift:**
```bash
oc adm upgrade --to=${VERSION}
oc get clusterversion
```
### Backup with Velero
```bash
# Install Velero
velero install --provider ${PROVIDER} --bucket ${BUCKET} --secret-file ${CREDS}
# Create backup
velero backup create ${BACKUP_NAME} --include-namespaces ${NS}
# Restore
velero restore create --from-backup ${BACKUP_NAME}
```
---
## 2. TROUBLESHOOTING
### Health Assessment
Run the bundled script for comprehensive health check:
```bash
bash scripts/cluster-health-check.sh
```
### Pod Status Interpretation
| Status | Meaning | Action |
|--------|---------|--------|
| `Pending` | Scheduling issue | Check resources, nodeSelector, tolerations |
| `CrashLoopBackOff` | Container crashing | Check logs: `kubectl logs ${POD} --previous` |
| `ImagePullBackOff` | Image unavailable | Verify image name, registry access |
| `OOMKilled` | Out of memory | Increase memory limits |
| `Evicted` | Node pressure | Check node resources |
### Debugging Commands
```bash
# Pod logs (current and previous)
kubectl logs ${POD} -c ${CONTAINER} --previous
# Multi-pod logs with stern
stern ${LABEL_SELECTOR} -n ${NS}
# Exec into pod
kubectl exec -it ${POD} -- /bin/sh
# Pod events
kubectl describe pod ${POD} | grep -A 20 Events
# Cluster events (sorted by time)
kubectl get events -A --sort-by='.lastTimestamp' | tail -50
```
### Network Troubleshooting
```bash
# Test DNS
kubectl run -it --rm debug --image=busybox -- nslookup kubernetes.default
# Test service connectivity
kubectl run -it --rm debug --image=curlimages/curl -- curl -v http://${SVC}.${NS}:${PORT}
# Check endpoints
kubectl get endpoints ${SVC}
```
---
## 3. MANIFEST GENERATION
### Production Deployment Template
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: ${APP_NAME}
namespace: ${NAMESPACE}
labels:
app.kubernetes.io/name: ${APP_NAME}
app.kubernetes.io/version: "${VERSION}"
spec:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app.kubernetes.io/name: ${APP_NAME}
template:
metadata:
labels:
app.kubernetes.io/name: ${APP_NAME}
spec:
serviceAccountName: ${APP_NAME}
securityContext:
runAsNonRoot: true
runAsUser: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers:
- name: ${APP_NAME}
image: ${IMAGE}:${TAG}
ports:
- name: http
containerPort: 8080
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop: ["ALL"]
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
livenessProbe:
httpGet:
path: /healthz
port: http
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 5
periodSeconds: 5
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir: {}
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchLabels:
app.kubernetes.io/name: ${APP_NAME}
topologyKey: kubernetes.io/hostname
```
### Service & Ingress
```yaml
apiVersion: v1
kind: Service
metadata:
name: ${APP_NAME}
spec:
selector:
app.kubernetes.io/name: ${APP_NAME}
ports:
- name: http
port: 80
targetPort: http
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: ${APP_NAME}
annotations:
nginx.ingress.kubernetes.io/ssl-redirect: "true"
spec:
ingressClassName: nginx
tls:
- hosts:
- ${HOST}
secretName: ${APP_NAME}-tls
rules:
- host: ${HOST}
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: ${APP_NAME}
port:
name: http
```
### OpenShift Route
```yaml
apiVersion: route.openshift.io/v1
kind: Route
metadata:
name: ${APP_NAME}
spec:
to:
kind: Service
name: ${APP_NAME}
port:
targetPort: http
tls:
termination: edge
insecureEdgeTerminationPolicy: Redirect
```
Use the bundled script for manifest generation:
```bash
bash scripts/generate-manifest.sh deployment myapp production
```
---
## 4. SECURITY
### Security Audit
Run the bundled script:
```bash
bash scripts/security-audit.sh [namespace]
```
### Pod Security Standards
```yaml
apiVersion: v1
kind: Namespace
metadata:
name: ${NAMESPACE}
labels:
pod-security.kubernetes.io/enforce: restricted
pod-security.kubernetes.io/audit: baseline
pod-security.kubernetes.io/warn: restricted
```
### NetworkPolicy (Zero Trust)
```yaml
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: ${APP_NAME}-policy
spec:
podSelector:
matchLabels:
app.kubernetes.io/name: ${APP_NAME}
policyTypes:
- Ingress
- Egress
ingress:
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: frontend
ports:
- protocol: TCP
port: 8080
egress:
- to:
- podSelector:
matchLabels:
app.kubernetes.io/name: database
ports:
- protocol: TCP
port: 5432
# Allow DNS
- to:
- namespaceSelector: {}
podSelector:
matchLabels:
k8s-app: kube-dns
ports:
- protocol: UDP
port: 53
```
### RBAC Best Practices
```yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: ${APP_NAME}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: ${APP_NAME}-role
rules:
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: ${APP_NAME}-binding
subjects:
- kind: ServiceAccount
name: ${APP_NAME}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: ${APP_NAME}-role
```
### Image Scanning
```bash
# Scan image with Trivy
trivy image ${IMAGE}:${TAG}
# Scan with severity filter
trivy image --severity HIGH,CRITICAL ${IMAGE}:${TAG}
# Generate SBOM
trivy image --format spdx-json -o sbom.json ${IMAGE}:${TAG}
```
---
## 5. GITOPS
### ArgoCD Application
```yaml
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: ${APP_NAME}
namespace: argocd
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
project: default
source:
repoURL: ${GIT_REPO}
targetRevision: main
path: k8s/overlays/${ENV}
destination:
server: https://kubernetes.default.svc
namespace: ${NAMESPACE}
syncPolicy:
automated:
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=true
```
### Kustomize Structure
```
k8s/
├── base/
│ ├── kustomization.yaml
│ ├── deployment.yaml
│ └── service.yaml
└── overlays/
├── dev/
│ └── kustomization.yaml
├── staging/
│ └── kustomization.yaml
└── prod/
└── kustomization.yaml
```
**base/kustomization.yaml:**
```yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- deployment.yaml
- service.yaml
```
**overlays/prod/kustomization.yaml:**
```yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../base
namePrefix: prod-
namespace: production
replicas:
- name: myapp
count: 5
images:
- name: myregistry/myapp
newTag: v1.2.3
```
### GitHub Actions CI/CD
```yaml
name: Build and Deploy
on:
push:
branches: [main]
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Build and push image
uses: docker/build-push-action@v5
with:
push: true
tags: ${{ secrets.REGISTRY }}/${{ github.event.repository.name }}:${{ github.sha }}
- name: Update Kustomize image
run: |
cd k8s/overlays/prod
kustomize edit set image myapp=${{ secrets.REGISTRY }}/${{ github.event.repository.name }}:${{ github.sha }}
- name: Commit and push
run: |
git config user.name "github-actions"
git config user.email "github-actions@github.com"
git add .
git commit -m "Update image to ${{ github.sha }}"
git push
```
Use the bundled script for ArgoCD sync:
```bash
bash scripts/argocd-app-sync.sh ${APP_NAME} --prune
```
---
## Helper Scripts
This skill includes automation scripts in the `scripts/` directory:
| Script | Purpose |
|--------|---------|
| `cluster-health-check.sh` | Comprehensive cluster health assessment with scoring |
| `security-audit.sh` | Security posture audit (privileged, root, RBAC, NetworkPolicy) |
| `node-maintenance.sh` | Safe node drain and maintenance prep |
| `pre-upgrade-check.sh` | Pre-upgrade validation checklist |
| `generate-manifest.sh` | Generate production-ready K8s manifests |
| `argocd-app-sync.sh` | ArgoCD application sync helper |
Run any script:
```bash
bash scripts/<script-name>.sh [arguments]
```

View File

@@ -0,0 +1,6 @@
{
"ownerId": "kn7f82v7f3g1dtvm0gm74q016n7zz73v",
"slug": "kubernetes",
"version": "1.0.0",
"publishedAt": 1769436428875
}

View File

@@ -0,0 +1,96 @@
#!/bin/bash
# argocd-app-sync.sh - ArgoCD application sync helper.
#
# Syncs an ArgoCD Application using the argocd CLI when installed, falling
# back to kubectl operations against the Application resource in the
# argocd namespace otherwise.
#
# Usage: ./argocd-app-sync.sh <app-name> [--prune] [--force]
#   --prune   remove resources that are no longer defined in Git
#   --force   replace resources that cannot be patched in place
# Flags may be given in any order (the previous version compared fixed
# positions, so "--force" passed as the second argument was ignored).
#
# Human-readable progress goes to stderr; a JSON summary goes to stdout.
set -e

APP=${1:-""}
if [ -z "$APP" ]; then
  echo "Usage: $0 <app-name> [--prune] [--force]" >&2
  echo "" >&2
  echo "Available applications:" >&2
  argocd app list --output name 2>/dev/null || kubectl get applications -A -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'
  exit 1
fi
shift

# Collect sync options as an array so each flag stays a separate word.
SYNC_OPTS=()
for arg in "$@"; do
  case "$arg" in
    --prune)
      SYNC_OPTS+=("--prune")
      echo "Prune enabled: Will remove resources not defined in Git" >&2
      ;;
    --force)
      SYNC_OPTS+=("--force")
      echo "Force enabled: Will replace resources that cannot be patched" >&2
      ;;
    *)
      echo "Unknown option: $arg" >&2
      echo "Usage: $0 <app-name> [--prune] [--force]" >&2
      exit 1
      ;;
  esac
done

echo "=== ARGOCD APPLICATION SYNC: $APP ===" >&2
echo "Timestamp: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" >&2
echo "" >&2

# Prefer the argocd CLI; fall back to kubectl when it is not installed.
if command -v argocd &> /dev/null; then
  USE_CLI=true
else
  USE_CLI=false
  echo "argocd CLI not found, using kubectl" >&2
fi

# Show the current application state before syncing.
echo "### Current Status ###" >&2
if [ "$USE_CLI" = true ]; then
  argocd app get "$APP" --refresh >&2
else
  kubectl get application "$APP" -n argocd -o yaml | grep -A20 "status:" | head -25 >&2
fi

# Perform the sync.
echo -e "\n### Syncing Application ###" >&2
if [ "$USE_CLI" = true ]; then
  argocd app sync "$APP" "${SYNC_OPTS[@]}" >&2
else
  # Trigger sync via the operation field; sync options are CLI-only.
  kubectl patch application "$APP" -n argocd --type=merge -p '{"operation":{"sync":{"revision":"HEAD"}}}' >&2
fi

# Wait for sync to complete (CLI only; kubectl has no equivalent wait).
echo -e "\n### Waiting for Sync ###" >&2
if [ "$USE_CLI" = true ]; then
  argocd app wait "$APP" --health --timeout 300 >&2
else
  echo "Waiting for sync (check manually with kubectl)..." >&2
  sleep 10
fi

# Collect the final sync and health status.
echo -e "\n### Final Status ###" >&2
if [ "$USE_CLI" = true ]; then
  argocd app get "$APP" >&2
  STATUS=$(argocd app get "$APP" -o json | jq -r '.status.sync.status')
  HEALTH=$(argocd app get "$APP" -o json | jq -r '.status.health.status')
else
  kubectl get application "$APP" -n argocd -o yaml | grep -A20 "status:" | head -25 >&2
  STATUS=$(kubectl get application "$APP" -n argocd -o jsonpath='{.status.sync.status}')
  HEALTH=$(kubectl get application "$APP" -n argocd -o jsonpath='{.status.health.status}')
fi

echo "" >&2
echo "========================================" >&2
echo "SYNC COMPLETE" >&2
echo "========================================" >&2
echo "Sync Status: $STATUS" >&2
echo "Health Status: $HEALTH" >&2

# Machine-readable summary on stdout.
cat << EOF
{
  "application": "$APP",
  "timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
  "sync_status": "$STATUS",
  "health_status": "$HEALTH",
  "success": $([ "$STATUS" == "Synced" ] && echo "true" || echo "false")
}
EOF

View File

@@ -0,0 +1,131 @@
#!/bin/bash
# cluster-health-check.sh - Comprehensive cluster health assessment.
#
# Checks nodes, pods, security posture, resource limits, storage, and
# events, subtracting points from a starting score of 100 per issue class.
# Human-readable details go to stderr; a JSON summary goes to stdout.
#
# Usage: ./cluster-health-check.sh
set -e

echo "=== KUBERNETES CLUSTER HEALTH ASSESSMENT ===" >&2
echo "Timestamp: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" >&2
echo "" >&2

SCORE=100
ISSUES=()

# 1. Node Health (Critical: -50 points)
# Match on the STATUS column with awk: "NotReady" contains the substring
# "Ready", so the previous grep -v filter silently dropped NotReady lines
# and the count was always 0. Also note 'grep -c ... || echo 0' is a bug:
# grep -c prints 0 AND exits non-zero on no match, yielding "0\n0".
echo "### NODE HEALTH ###" >&2
UNHEALTHY_NODES=$(kubectl get nodes --no-headers | awk '$2 == "NotReady" || $2 == "Unknown"' | wc -l | tr -d ' ')
if [ "$UNHEALTHY_NODES" -gt 0 ]; then
  SCORE=$((SCORE - 50))
  ISSUES+=("BOOM: $UNHEALTHY_NODES unhealthy nodes detected")
  kubectl get nodes --no-headers | awk '$2 == "NotReady" || $2 == "Unknown"' >&2
else
  echo "✓ All nodes healthy" >&2
fi

# 2. Pod Issues (Warning: -20 points)
echo -e "\n### POD HEALTH ###" >&2
POD_ISSUES=$(kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')
if [ "$POD_ISSUES" -gt 0 ]; then
  SCORE=$((SCORE - 20))
  ISSUES+=("WARN: $POD_ISSUES pods not in Running/Succeeded state")
  echo "Pods with issues:" >&2
  kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded >&2
else
  echo "✓ All pods running" >&2
fi

# 3. CrashLoopBackOff (Critical: -50 points)
echo -e "\n### CRASH LOOP DETECTION ###" >&2
CRASHLOOP=$(kubectl get pods -A -o json 2>/dev/null | jq -r '.items[] | select(.status.containerStatuses[]?.state.waiting?.reason == "CrashLoopBackOff") | "\(.metadata.namespace)/\(.metadata.name)"' | wc -l | tr -d ' ')
if [ "$CRASHLOOP" -gt 0 ]; then
  SCORE=$((SCORE - 50))
  ISSUES+=("BOOM: $CRASHLOOP pods in CrashLoopBackOff")
  kubectl get pods -A -o json | jq -r '.items[] | select(.status.containerStatuses[]?.state.waiting?.reason == "CrashLoopBackOff") | "\(.metadata.namespace)/\(.metadata.name)"' >&2
else
  echo "✓ No pods in CrashLoopBackOff" >&2
fi

# 4. Security - Privileged Containers (Critical: -50 points)
echo -e "\n### SECURITY - PRIVILEGED CONTAINERS ###" >&2
PRIVILEGED=$(kubectl get pods -A -o json 2>/dev/null | jq -r '[.items[] | select(.spec.containers[].securityContext.privileged == true)] | length')
if [ "$PRIVILEGED" -gt 0 ]; then
  SCORE=$((SCORE - 50))
  ISSUES+=("BOOM: $PRIVILEGED privileged containers detected")
  kubectl get pods -A -o json | jq -r '.items[] | select(.spec.containers[].securityContext.privileged == true) | "\(.metadata.namespace)/\(.metadata.name)"' >&2
else
  echo "✓ No privileged containers" >&2
fi

# 5. Resource Limits (Warning: -20 points)
# Only penalize when many containers lack limits; a handful is tolerated.
echo -e "\n### RESOURCE CONFIGURATION ###" >&2
NO_LIMITS=$(kubectl get pods -A -o json 2>/dev/null | jq -r '[.items[] | select(.spec.containers[].resources.limits == null)] | length')
if [ "$NO_LIMITS" -gt 10 ]; then
  SCORE=$((SCORE - 20))
  ISSUES+=("WARN: $NO_LIMITS containers without resource limits")
else
  echo "✓ Most containers have resource limits" >&2
fi

# 6. PVC Status (Warning: -20 points)
echo -e "\n### STORAGE HEALTH ###" >&2
PENDING_PVC=$(kubectl get pvc -A --field-selector=status.phase!=Bound --no-headers 2>/dev/null | wc -l | tr -d ' ')
if [ "$PENDING_PVC" -gt 0 ]; then
  SCORE=$((SCORE - 20))
  ISSUES+=("WARN: $PENDING_PVC PVCs not bound")
  kubectl get pvc -A --field-selector=status.phase!=Bound >&2
else
  echo "✓ All PVCs bound" >&2
fi

# 7. Recent Warning Events (Info: -5 points above threshold)
echo -e "\n### RECENT WARNING EVENTS ###" >&2
WARNING_EVENTS=$(kubectl get events -A --field-selector=type=Warning --no-headers 2>/dev/null | wc -l | tr -d ' ')
if [ "$WARNING_EVENTS" -gt 50 ]; then
  SCORE=$((SCORE - 5))
  ISSUES+=("INFO: $WARNING_EVENTS warning events in cluster")
  echo "Recent warning events: $WARNING_EVENTS" >&2
else
  echo "✓ Warning events within normal range" >&2
fi

# OpenShift-specific checks (only when oc is installed and logged in).
if command -v oc &> /dev/null && oc whoami &> /dev/null; then
  echo -e "\n### OPENSHIFT CLUSTER OPERATORS ###" >&2
  # grep -c prints the count even on no match, so '|| true' (not
  # '|| echo 0') is correct and avoids a doubled "0" in the output.
  DEGRADED=$(oc get clusteroperators --no-headers 2>/dev/null | grep -c -E "False.*True|False.*False" || true)
  if [ "$DEGRADED" -gt 0 ]; then
    SCORE=$((SCORE - 50))
    ISSUES+=("BOOM: $DEGRADED cluster operators degraded/unavailable")
    oc get clusteroperators | grep -E "False.*True|False.*False" >&2
  else
    echo "✓ All cluster operators healthy" >&2
  fi
fi

# Clamp: multiple critical findings can push the score negative.
if [ "$SCORE" -lt 0 ]; then
  SCORE=0
fi

# Human-readable summary on stderr.
echo "" >&2
echo "========================================" >&2
echo "CLUSTER HEALTH SCORE: $SCORE/100" >&2
echo "========================================" >&2
if [ ${#ISSUES[@]} -gt 0 ]; then
  echo "" >&2
  echo "ISSUES FOUND:" >&2
  for issue in "${ISSUES[@]}"; do
    echo "  - $issue" >&2
  done
fi

# Machine-readable summary on stdout.
cat << EOF
{
  "score": $SCORE,
  "timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
  "issues_count": ${#ISSUES[@]},
  "healthy": $([ $SCORE -ge 80 ] && echo "true" || echo "false")
}
EOF

View File

@@ -0,0 +1,370 @@
#!/bin/bash
# generate-manifest.sh - Generate production-ready Kubernetes manifests.
#
# Usage: ./generate-manifest.sh <type> <name> [namespace]
#   type       one of: deployment statefulset service ingress configmap
#              secret pvc networkpolicy hpa
#   name       resource name (also used as the app.kubernetes.io/name label)
#   namespace  target namespace (default: "default")
#
# The manifest is written to stdout; usage/errors go to stderr.
set -e

TYPE=${1:-""}
NAME=${2:-""}
NAMESPACE=${3:-"default"}
VALID_TYPES="deployment statefulset service ingress configmap secret pvc networkpolicy hpa"

if [ -z "$TYPE" ] || [ -z "$NAME" ]; then
  echo "Usage: $0 <type> <name> [namespace]" >&2
  echo "" >&2
  echo "Available types: $VALID_TYPES" >&2
  exit 1
fi

# Validate the type BEFORE emitting anything on stdout, so an invalid
# invocation does not leave a partial header in a redirected output file.
case " $VALID_TYPES " in
  *" $TYPE "*) ;;
  *)
    echo "Unknown type: $TYPE" >&2
    echo "Valid types: $VALID_TYPES" >&2
    exit 1
    ;;
esac

# Header comments are valid YAML comments in the generated manifest.
echo "# Generated: $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
echo "# Type: $TYPE | Name: $NAME | Namespace: $NAMESPACE"
echo ""

case "$TYPE" in
  deployment)
    # Hardened Deployment: non-root, read-only rootfs, probes, anti-affinity.
    cat << EOF
apiVersion: apps/v1
kind: Deployment
metadata:
  name: $NAME
  namespace: $NAMESPACE
  labels:
    app.kubernetes.io/name: $NAME
    app.kubernetes.io/component: server
spec:
  replicas: 3
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  selector:
    matchLabels:
      app.kubernetes.io/name: $NAME
  template:
    metadata:
      labels:
        app.kubernetes.io/name: $NAME
    spec:
      serviceAccountName: $NAME
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: $NAME
          image: your-registry/$NAME:latest
          imagePullPolicy: Always
          ports:
            - name: http
              containerPort: 8080
              protocol: TCP
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop: ["ALL"]
          resources:
            requests:
              cpu: 100m
              memory: 128Mi
            limits:
              cpu: 500m
              memory: 512Mi
          livenessProbe:
            httpGet:
              path: /healthz
              port: http
            initialDelaySeconds: 10
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /ready
              port: http
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
          volumeMounts:
            - name: tmp
              mountPath: /tmp
      volumes:
        - name: tmp
          emptyDir: {}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app.kubernetes.io/name: $NAME
                topologyKey: kubernetes.io/hostname
EOF
    ;;
  statefulset)
    # Ordered StatefulSet with per-replica persistent storage.
    cat << EOF
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: $NAME
  namespace: $NAMESPACE
  labels:
    app.kubernetes.io/name: $NAME
spec:
  serviceName: $NAME-headless
  replicas: 3
  podManagementPolicy: OrderedReady
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      app.kubernetes.io/name: $NAME
  template:
    metadata:
      labels:
        app.kubernetes.io/name: $NAME
    spec:
      serviceAccountName: $NAME
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
      terminationGracePeriodSeconds: 30
      containers:
        - name: $NAME
          image: your-registry/$NAME:latest
          ports:
            - name: tcp
              containerPort: 5432
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          resources:
            requests:
              cpu: 250m
              memory: 512Mi
            limits:
              cpu: 1000m
              memory: 2Gi
          volumeMounts:
            - name: data
              mountPath: /data
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: ["ReadWriteOnce"]
        storageClassName: standard
        resources:
          requests:
            storage: 10Gi
EOF
    ;;
  service)
    # ClusterIP Service selecting pods by the shared name label.
    cat << EOF
apiVersion: v1
kind: Service
metadata:
  name: $NAME
  namespace: $NAMESPACE
  labels:
    app.kubernetes.io/name: $NAME
spec:
  type: ClusterIP
  ports:
    - name: http
      port: 80
      targetPort: http
      protocol: TCP
  selector:
    app.kubernetes.io/name: $NAME
EOF
    ;;
  ingress)
    # TLS-terminated nginx Ingress with forced HTTPS redirect.
    cat << EOF
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: $NAME
  namespace: $NAMESPACE
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - $NAME.example.com
      secretName: $NAME-tls
  rules:
    - host: $NAME.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: $NAME
                port:
                  name: http
EOF
    ;;
  configmap)
    # ConfigMap skeleton with an example embedded config file.
    cat << EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: $NAME
  namespace: $NAMESPACE
  labels:
    app.kubernetes.io/name: $NAME
data:
  config.yaml: |
    # Add your configuration here
    server:
      port: 8080
      host: "0.0.0.0"
EOF
    ;;
  secret)
    # Secret skeleton; stringData values are placeholders, not real secrets.
    cat << EOF
apiVersion: v1
kind: Secret
metadata:
  name: $NAME
  namespace: $NAMESPACE
  labels:
    app.kubernetes.io/name: $NAME
type: Opaque
stringData:
  # Replace with actual values before applying
  API_KEY: "your-api-key-here"
  DATABASE_URL: "postgresql://user:pass@host:5432/db"
EOF
    ;;
  pvc)
    # Single-writer PersistentVolumeClaim on the standard storage class.
    cat << EOF
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: $NAME
  namespace: $NAMESPACE
  labels:
    app.kubernetes.io/name: $NAME
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: standard
  resources:
    requests:
      storage: 10Gi
EOF
    ;;
  networkpolicy)
    # Zero-trust NetworkPolicy: frontend ingress, database egress, DNS allowed.
    cat << EOF
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: $NAME
  namespace: $NAMESPACE
spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: $NAME
  policyTypes:
    - Ingress
    - Egress
  ingress:
    - from:
        - podSelector:
            matchLabels:
              app.kubernetes.io/name: frontend
      ports:
        - protocol: TCP
          port: 8080
  egress:
    - to:
        - podSelector:
            matchLabels:
              app.kubernetes.io/name: database
      ports:
        - protocol: TCP
          port: 5432
    - to:
        - namespaceSelector: {}
          podSelector:
            matchLabels:
              k8s-app: kube-dns
      ports:
        - protocol: UDP
          port: 53
EOF
    ;;
  hpa)
    # CPU+memory HPA with conservative scale-down, aggressive scale-up.
    cat << EOF
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: $NAME
  namespace: $NAMESPACE
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: $NAME
  minReplicas: 3
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 25
          periodSeconds: 60
    scaleUp:
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 100
          periodSeconds: 15
EOF
    ;;
  *)
    # Unreachable after the upfront validation; kept as a safety net.
    echo "Unknown type: $TYPE" >&2
    echo "Valid types: $VALID_TYPES" >&2
    exit 1
    ;;
esac

View File

@@ -0,0 +1,97 @@
#!/bin/bash
# node-maintenance.sh - Safely drain and prepare a node for maintenance.
#
# Cordons the node, drains it (respecting PDBs and daemonsets), then
# verifies the result. Run `kubectl uncordon <node>` when done.
#
# Usage: ./node-maintenance.sh <node-name> [--force]
#   --force  skip the interactive confirmation and pass --force to drain
#
# Human-readable progress goes to stderr; a JSON summary goes to stdout.
set -e

NODE=${1:-""}
FORCE=${2:-""}

if [ -z "$NODE" ]; then
  echo "Usage: $0 <node-name> [--force]" >&2
  echo "" >&2
  echo "Available nodes:" >&2
  kubectl get nodes --no-headers | awk '{print "  " $1 " (" $2 ")"}'
  exit 1
fi

echo "=== NODE MAINTENANCE: $NODE ===" >&2
echo "Timestamp: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" >&2
echo "" >&2

# Verify the node exists before doing anything destructive.
if ! kubectl get node "$NODE" &>/dev/null; then
  echo "Error: Node '$NODE' not found" >&2
  exit 1
fi

echo "### Current Node Status ###" >&2
kubectl get node "$NODE" -o wide >&2

echo -e "\n### Pods on Node ###" >&2
POD_COUNT=$(kubectl get pods -A --field-selector spec.nodeName="$NODE" --no-headers | wc -l | tr -d ' ')
echo "Total pods: $POD_COUNT" >&2
kubectl get pods -A --field-selector spec.nodeName="$NODE" --no-headers | head -20 >&2
# NOTE: a bare '[ cond ] && cmd' returns 1 when cond is false, which would
# abort the whole script under 'set -e'; use an explicit if instead.
if [ "$POD_COUNT" -gt 20 ]; then
  echo "... and $((POD_COUNT - 20)) more" >&2
fi

# Surface PDBs up front: they are the usual reason a drain hangs.
echo -e "\n### Checking PodDisruptionBudgets ###" >&2
kubectl get pdb -A -o json 2>/dev/null | jq -r '.items[] | "\(.metadata.namespace)/\(.metadata.name): minAvailable=\(.spec.minAvailable // "N/A"), maxUnavailable=\(.spec.maxUnavailable // "N/A")"' >&2

# Interactive confirmation unless --force was given.
if [ "$FORCE" != "--force" ]; then
  echo "" >&2
  read -r -p "Proceed with cordoning and draining node $NODE? (yes/no): " confirm
  if [ "$confirm" != "yes" ]; then
    echo "Aborted." >&2
    exit 0
  fi
fi

# Step 1: mark the node unschedulable so no new pods land on it.
echo -e "\n### Step 1: Cordoning node ###" >&2
kubectl cordon "$NODE"
echo "✓ Node cordoned (unschedulable)" >&2

# Step 2: evict workloads. Options are kept in an array so each flag
# stays a separate word without relying on unquoted word-splitting.
echo -e "\n### Step 2: Draining node ###" >&2
DRAIN_OPTS=(--ignore-daemonsets --delete-emptydir-data --grace-period=60 --timeout=300s)
if [ "$FORCE" == "--force" ]; then
  DRAIN_OPTS+=(--force)
  echo "Force mode enabled" >&2
fi
if kubectl drain "$NODE" "${DRAIN_OPTS[@]}"; then
  echo "✓ Node drained successfully" >&2
else
  echo "Warning: Drain completed with some issues" >&2
fi

# Step 3: only daemonset-managed pods should remain on the node.
echo -e "\n### Step 3: Verification ###" >&2
REMAINING=$(kubectl get pods -A --field-selector spec.nodeName="$NODE" --no-headers 2>/dev/null | wc -l | tr -d ' ')
echo "Remaining pods on node: $REMAINING (should be daemonsets only)" >&2
kubectl get pods -A --field-selector spec.nodeName="$NODE" >&2

echo "" >&2
echo "========================================" >&2
echo "NODE MAINTENANCE READY" >&2
echo "========================================" >&2
echo "Node '$NODE' is now cordoned and drained." >&2
echo "" >&2
echo "Perform your maintenance tasks, then run:" >&2
echo "  kubectl uncordon $NODE" >&2
echo "" >&2

# Machine-readable summary on stdout.
cat << EOF
{
  "node": "$NODE",
  "action": "drain",
  "timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
  "remaining_pods": $REMAINING,
  "status": "ready_for_maintenance"
}
EOF

View File

@@ -0,0 +1,137 @@
#!/bin/bash
# pre-upgrade-check.sh - Pre-upgrade cluster validation.
#
# Runs a series of readiness checks and classifies findings as BLOCKERS
# (do not upgrade) or WARNINGS (proceed with caution).
# Human-readable details go to stderr; a JSON summary goes to stdout.
#
# Usage: ./pre-upgrade-check.sh
set -e

echo "=== PRE-UPGRADE CLUSTER VALIDATION ===" >&2
echo "Timestamp: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" >&2
echo "" >&2

WARNINGS=0
BLOCKERS=0

# 1. Cluster Version
echo "### Cluster Version ###" >&2
SERVER_VERSION=$(kubectl version -o json 2>/dev/null | jq -r '.serverVersion.gitVersion')
echo "Server Version: $SERVER_VERSION" >&2

# 2. Node Status
# Match on the STATUS column with awk: 'grep -cv "Ready"' is wrong because
# "NotReady" contains "Ready" (so NotReady nodes were never counted), and
# 'grep -c ... || echo 0' emits "0\n0" since grep -c prints the zero itself.
echo -e "\n### Node Status ###" >&2
kubectl get nodes >&2
NOT_READY=$(kubectl get nodes --no-headers | awk '$2 == "NotReady" || $2 == "Unknown"' | wc -l | tr -d ' ')
if [ "$NOT_READY" -gt 0 ]; then
  BLOCKERS=$((BLOCKERS + 1))
  echo "BLOCKER: $NOT_READY nodes not ready" >&2
fi

# 3. Control Plane Health
# Both branches go to stderr (the first previously leaked to stdout and
# polluted the JSON stream); '|| true' keeps set -e from aborting when the
# fallback grep matches nothing (e.g. on a managed control plane).
echo -e "\n### Control Plane Health ###" >&2
{ kubectl get pods -n kube-system -l tier=control-plane 2>/dev/null || \
  kubectl get pods -n kube-system | grep -E "kube-apiserver|kube-controller|kube-scheduler|etcd"; } >&2 || true

# 4. Pods Not Running
echo -e "\n### Pods Not Running ###" >&2
NOT_RUNNING=$(kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')
if [ "$NOT_RUNNING" -gt 0 ]; then
  WARNINGS=$((WARNINGS + 1))
  echo "WARNING: $NOT_RUNNING pods not in Running/Succeeded state" >&2
  kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded >&2
else
  echo "✓ All pods running" >&2
fi

# 5. PodDisruptionBudgets — listed for awareness; PDBs can stall node drains.
echo -e "\n### PodDisruptionBudgets ###" >&2
PDB_COUNT=$(kubectl get pdb -A --no-headers 2>/dev/null | wc -l | tr -d ' ')
echo "Found $PDB_COUNT PDBs" >&2
if [ "$PDB_COUNT" -gt 0 ]; then
  kubectl get pdb -A >&2
fi

# 6. Pending PVCs
echo -e "\n### Pending PVCs ###" >&2
PENDING_PVC=$(kubectl get pvc -A --field-selector=status.phase=Pending --no-headers 2>/dev/null | wc -l | tr -d ' ')
if [ "$PENDING_PVC" -gt 0 ]; then
  WARNINGS=$((WARNINGS + 1))
  echo "WARNING: $PENDING_PVC PVCs pending" >&2
  kubectl get pvc -A --field-selector=status.phase=Pending >&2
else
  echo "✓ No pending PVCs" >&2
fi

# 7. Deprecated APIs
# grep -c prints 0 even on no match, so '|| true' (not '|| echo 0') avoids
# the doubled "0" that broke the numeric comparison.
echo -e "\n### Deprecated API Usage ###" >&2
DEPRECATED=$(kubectl get --raw /metrics 2>/dev/null | grep -c "apiserver_requested_deprecated_apis" || true)
if [ "$DEPRECATED" -gt 0 ]; then
  WARNINGS=$((WARNINGS + 1))
  echo "WARNING: Deprecated APIs may be in use" >&2
  echo "Check: kubectl get --raw /metrics | grep apiserver_requested_deprecated_apis" >&2
else
  echo "✓ No deprecated API metrics found" >&2
fi

# 8. etcd Health (only visible on self-managed control planes)
echo -e "\n### etcd Health ###" >&2
ETCD_PODS=$(kubectl get pods -n kube-system -l component=etcd --no-headers 2>/dev/null | wc -l | tr -d ' ')
if [ "$ETCD_PODS" -gt 0 ]; then
  kubectl get pods -n kube-system -l component=etcd >&2
else
  echo "etcd pods not directly visible (managed platform)" >&2
fi

# 9. Resource Pressure (Memory/Disk/PID pressure node conditions)
echo -e "\n### Node Resource Pressure ###" >&2
PRESSURE=$(kubectl get nodes -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[] | select(.type | contains("Pressure")) | .status == "True") | .metadata.name')
if [ -n "$PRESSURE" ]; then
  WARNINGS=$((WARNINGS + 1))
  echo "WARNING: Nodes under pressure:" >&2
  echo "$PRESSURE" >&2
else
  echo "✓ No resource pressure detected" >&2
fi

# OpenShift-specific checks (only when oc is installed and logged in).
if command -v oc &> /dev/null && oc whoami &> /dev/null; then
  echo -e "\n### OpenShift Cluster Operators ###" >&2
  DEGRADED=$(oc get clusteroperators --no-headers 2>/dev/null | grep -c -E "False.*True|False.*False" || true)
  if [ "$DEGRADED" -gt 0 ]; then
    BLOCKERS=$((BLOCKERS + 1))
    echo "BLOCKER: $DEGRADED cluster operators degraded" >&2
    oc get clusteroperators | grep -E "False.*True|False.*False" >&2
  else
    echo "✓ All cluster operators healthy" >&2
  fi
fi

# Human-readable verdict on stderr.
echo "" >&2
echo "========================================" >&2
echo "PRE-UPGRADE CHECK SUMMARY" >&2
echo "========================================" >&2
echo "Blockers: $BLOCKERS" >&2
echo "Warnings: $WARNINGS" >&2
if [ "$BLOCKERS" -gt 0 ]; then
  echo "" >&2
  echo "❌ DO NOT PROCEED WITH UPGRADE" >&2
  echo "   Resolve blockers before upgrading" >&2
elif [ "$WARNINGS" -gt 0 ]; then
  echo "" >&2
  echo "⚠️  PROCEED WITH CAUTION" >&2
  echo "   Review warnings before upgrading" >&2
else
  echo "" >&2
  echo "✅ CLUSTER READY FOR UPGRADE" >&2
fi

# Machine-readable summary on stdout.
cat << EOF
{
  "timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
  "server_version": "$SERVER_VERSION",
  "blockers": $BLOCKERS,
  "warnings": $WARNINGS,
  "ready_for_upgrade": $([ $BLOCKERS -eq 0 ] && echo "true" || echo "false")
}
EOF

View File

@@ -0,0 +1,149 @@
#!/bin/bash
# security-audit.sh - Kubernetes security posture assessment
#
# Scans either a single namespace or the whole cluster for common security
# misconfigurations; prints a human-readable report on stderr and a JSON
# summary on stdout.
#
# Usage: ./security-audit.sh [namespace]
set -e

# Scope selection: a namespace argument narrows every kubectl query via
# "-n NS"; with no argument the whole cluster is audited ("-A"). NS_FLAG is
# deliberately expanded unquoted later so "-n NS" splits into two words.
NAMESPACE=${1:-""}
if [ -n "$NAMESPACE" ]; then
  NS_FLAG="-n $NAMESPACE"
  echo "=== SECURITY AUDIT: Namespace $NAMESPACE ===" >&2
else
  NS_FLAG="-A"
  echo "=== SECURITY AUDIT: All Namespaces ===" >&2
fi
echo "Timestamp: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" >&2
echo "" >&2

# Finding accumulators, bumped by each check below.
FINDINGS=()   # one human-readable line per recorded finding
CRITICAL=0
WARNING=0
INFO=0
# 1. Privileged Containers (Critical)
# A privileged container has unrestricted access to the host kernel; flag
# every pod that declares one. any() reports each pod once, whereas the
# original stream-based select() repeated a pod for every privileged
# container it held.
echo "### Checking for privileged containers..." >&2
PRIVILEGED=$(kubectl get pods $NS_FLAG -o json 2>/dev/null | jq -r '.items[] | select(any(.spec.containers[]; .securityContext.privileged == true)) | "\(.metadata.namespace)/\(.metadata.name)"')
if [ -n "$PRIVILEGED" ]; then
  CRITICAL=$((CRITICAL + 1))
  FINDINGS+=("CRITICAL: Privileged containers found")
  echo "CRITICAL: Privileged containers:" >&2
  echo "$PRIVILEGED" >&2
else
  echo "✓ No privileged containers" >&2
fi
# 2. Containers Running as Root (Warning)
# A pod is flagged only when runAsNonRoot is not enforced at the pod level
# AND at least one container does not enforce it either.
echo -e "\n### Checking for root containers..." >&2
ROOT_CONTAINERS=$(kubectl get pods $NS_FLAG -o json 2>/dev/null | jq -r '.items[] | select(.spec.securityContext.runAsNonRoot != true) | select(.spec.containers[].securityContext.runAsNonRoot != true) | "\(.metadata.namespace)/\(.metadata.name)"' | sort -u)
# grep -c prints "0" on no match but exits 1; '|| true' keeps set -e happy
# WITHOUT appending a second line. The old '|| echo 0' produced "0\n0",
# which made the numeric comparison below error out.
ROOT_COUNT=$(echo "$ROOT_CONTAINERS" | grep -c . || true)
if [ "$ROOT_COUNT" -gt 0 ]; then
  WARNING=$((WARNING + 1))
  FINDINGS+=("WARNING: $ROOT_COUNT pods may run as root")
  echo "WARNING: Pods without runAsNonRoot:" >&2
  echo "$ROOT_CONTAINERS" | head -10 >&2
  [ "$ROOT_COUNT" -gt 10 ] && echo "... and $((ROOT_COUNT - 10)) more" >&2
else
  echo "✓ All pods have runAsNonRoot" >&2
fi
# 3. Host Namespace Access (Critical)
# Pods sharing the node's network, PID, or IPC namespace can observe and
# interfere with every other workload on that host.
echo -e "\n### Checking for host namespace access..." >&2
host_ns_pods=$(kubectl get pods $NS_FLAG -o json 2>/dev/null | jq -r '.items[] | select(.spec.hostNetwork == true or .spec.hostPID == true or .spec.hostIPC == true) | "\(.metadata.namespace)/\(.metadata.name)"')
if [ -z "$host_ns_pods" ]; then
  echo "✓ No host namespace access" >&2
else
  CRITICAL=$((CRITICAL + 1))
  FINDINGS+=("CRITICAL: Host namespace access detected")
  echo "CRITICAL: Pods with host namespace access:" >&2
  echo "$host_ns_pods" >&2
fi
# 4. Missing Resource Limits (Warning)
# Count individual containers that declare no resources.limits. Iterating
# the containers directly makes the count unambiguous (the old pod-level
# stream predicate produced the same number only as a side effect of
# select() re-emitting the pod per matching container).
echo -e "\n### Checking for missing resource limits..." >&2
NO_LIMITS=$(kubectl get pods $NS_FLAG -o json 2>/dev/null | jq -r '[.items[].spec.containers[] | select(.resources.limits == null)] | length')
NO_LIMITS=${NO_LIMITS:-0}   # a kubectl failure leaves the pipeline output empty
# Threshold of 10 is deliberate: small clusters routinely have a few
# limit-less system pods, so only a larger count is worth a warning.
if [ "$NO_LIMITS" -gt 10 ]; then
  WARNING=$((WARNING + 1))
  FINDINGS+=("WARNING: $NO_LIMITS containers without resource limits")
  echo "WARNING: $NO_LIMITS containers missing resource limits" >&2
else
  echo "✓ Resource limits configured ($NO_LIMITS missing)" >&2
fi
# 5. Default Service Account Usage (Info)
# Pods that omit serviceAccountName (or set it to "default") share the
# namespace default SA and whatever permissions have accrued to it.
echo -e "\n### Checking for default service account usage..." >&2
DEFAULT_SA=$(kubectl get pods $NS_FLAG -o json 2>/dev/null | jq -r '.items[] | select(.spec.serviceAccountName == "default" or .spec.serviceAccountName == null) | "\(.metadata.namespace)/\(.metadata.name)"')
# '|| true' instead of '|| echo 0': grep -c already prints 0 on no match,
# and the extra echo produced a two-line value ("0\n0") that broke [ -gt ].
DEFAULT_SA_COUNT=$(echo "$DEFAULT_SA" | grep -c . || true)
if [ "$DEFAULT_SA_COUNT" -gt 0 ]; then
  INFO=$((INFO + 1))
  FINDINGS+=("INFO: $DEFAULT_SA_COUNT pods using default service account")
  echo "INFO: Pods using default SA:" >&2
  echo "$DEFAULT_SA" | head -10 >&2
else
  echo "✓ No pods using default service account" >&2
fi
# 6. Wildcard RBAC (Critical)
# Flag ClusterRoles where a SINGLE rule grants verb "*" on resource "*".
# The original predicate (.rules[]?.verbs[]? == "*" and
# .rules[]?.resources[]? == "*") cross-joined verbs and resources from
# DIFFERENT rules — a false positive — and emitted duplicated role names.
# any() scopes both tests to one rule and emits each role at most once.
# Note: built-in roles such as cluster-admin are expected to appear here.
echo -e "\n### Checking for overly permissive RBAC..." >&2
WILDCARD_ROLES=$(kubectl get clusterroles -o json 2>/dev/null | jq -r '.items[] | select(any(.rules[]?; ((.verbs // []) | index("*")) and ((.resources // []) | index("*")))) | .metadata.name')
if [ -n "$WILDCARD_ROLES" ]; then
  CRITICAL=$((CRITICAL + 1))
  FINDINGS+=("CRITICAL: Wildcard RBAC permissions found")
  echo "CRITICAL: ClusterRoles with wildcard permissions:" >&2
  echo "$WILDCARD_ROLES" >&2
else
  echo "✓ No wildcard RBAC permissions" >&2
fi
# 7. Pods without NetworkPolicy (Info)
# A namespace with no NetworkPolicy allows unrestricted pod-to-pod traffic.
# Namespace variables are now quoted in the kubectl calls (SC2086); k8s
# names cannot contain spaces, but unquoted expansions are still glob-prone.
echo -e "\n### Checking NetworkPolicy coverage..." >&2
if [ -n "$NAMESPACE" ]; then
  NP_COUNT=$(kubectl get networkpolicy -n "$NAMESPACE" --no-headers 2>/dev/null | wc -l | tr -d ' ')
  if [ "$NP_COUNT" -eq 0 ]; then
    INFO=$((INFO + 1))
    FINDINGS+=("INFO: Namespace $NAMESPACE has no NetworkPolicies")
    echo "INFO: No NetworkPolicies in $NAMESPACE" >&2
  else
    echo "$NP_COUNT NetworkPolicies found" >&2
  fi
else
  # One kubectl round-trip per namespace: fine for tens of namespaces,
  # O(n) API calls on very large clusters.
  NS_WITHOUT_NP=0
  for ns in $(kubectl get ns -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
    count=$(kubectl get networkpolicy -n "$ns" --no-headers 2>/dev/null | wc -l | tr -d ' ')
    [ "$count" -eq 0 ] && NS_WITHOUT_NP=$((NS_WITHOUT_NP + 1))
  done
  if [ "$NS_WITHOUT_NP" -gt 0 ]; then
    INFO=$((INFO + 1))
    FINDINGS+=("INFO: $NS_WITHOUT_NP namespaces without NetworkPolicies")
    echo "INFO: $NS_WITHOUT_NP namespaces lack NetworkPolicies" >&2
  fi
fi
# Summary: human-readable totals on stderr, then one line per recorded
# finding so the console report is self-contained.
{
  printf '\n'
  printf '========================================\n'
  printf 'SECURITY AUDIT SUMMARY\n'
  printf '========================================\n'
  printf 'Critical Issues: %s\n' "$CRITICAL"
  printf 'Warnings: %s\n' "$WARNING"
  printf 'Informational: %s\n' "$INFO"
  printf '\n'
} >&2
if [ ${#FINDINGS[@]} -gt 0 ]; then
  echo "FINDINGS:" >&2
  for finding in "${FINDINGS[@]}"; do
    echo "  - $finding" >&2
  done
fi
# Output JSON
# Machine-readable summary on stdout (the report above went to stderr).
# "compliant" is true only when there are no CRITICAL findings; warnings
# and informational findings do not fail compliance.
# NOTE: "$CRITICAL" is quoted in the test — unquoted, an empty value would
# expand to '[ -eq 0 ]' and make the test a syntax error (SC2086).
cat << EOF
{
  "timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
  "namespace": "${NAMESPACE:-all}",
  "critical": $CRITICAL,
  "warning": $WARNING,
  "info": $INFO,
  "compliant": $([ "$CRITICAL" -eq 0 ] && echo "true" || echo "false")
}
EOF