chore(workspace): add hardened startup/security workflows and skill suite
This commit is contained in:
7
skills/kubernetes/.clawhub/origin.json
Normal file
7
skills/kubernetes/.clawhub/origin.json
Normal file
@@ -0,0 +1,7 @@
{
  "version": 1,
  "registry": "https://clawhub.ai",
  "slug": "kubernetes",
  "installedVersion": "1.0.0",
  "installedAt": 1772497715868
}
542
skills/kubernetes/SKILL.md
Normal file
542
skills/kubernetes/SKILL.md
Normal file
@@ -0,0 +1,542 @@
|
||||
---
|
||||
name: kubernetes
|
||||
description: |
|
||||
Comprehensive Kubernetes and OpenShift cluster management skill covering operations, troubleshooting, manifest generation, security, and GitOps. Use this skill when:
|
||||
(1) Cluster operations: upgrades, backups, node management, scaling, monitoring setup
|
||||
(2) Troubleshooting: pod failures, networking issues, storage problems, performance analysis
|
||||
(3) Creating manifests: Deployments, StatefulSets, Services, Ingress, NetworkPolicies, RBAC
|
||||
(4) Security: audits, Pod Security Standards, RBAC, secrets management, vulnerability scanning
|
||||
(5) GitOps: ArgoCD, Flux, Kustomize, Helm, CI/CD pipelines, progressive delivery
|
||||
(6) OpenShift-specific: SCCs, Routes, Operators, Builds, ImageStreams
|
||||
(7) Multi-cloud: AKS, EKS, GKE, ARO, ROSA operations
|
||||
metadata:
|
||||
author: cluster-skills
|
||||
version: "1.0.0"
|
||||
---
|
||||
|
||||
# Kubernetes & OpenShift Cluster Management
|
||||
|
||||
Comprehensive skill for Kubernetes and OpenShift clusters covering operations, troubleshooting, manifests, security, and GitOps.
|
||||
|
||||
## Current Versions (January 2026)
|
||||
|
||||
| Platform | Version | Documentation |
|
||||
|----------|---------|---------------|
|
||||
| **Kubernetes** | 1.31.x | https://kubernetes.io/docs/ |
|
||||
| **OpenShift** | 4.17.x | https://docs.openshift.com/ |
|
||||
| **EKS** | 1.31 | https://docs.aws.amazon.com/eks/ |
|
||||
| **AKS** | 1.31 | https://learn.microsoft.com/azure/aks/ |
|
||||
| **GKE** | 1.31 | https://cloud.google.com/kubernetes-engine/docs |
|
||||
|
||||
### Key Tools
|
||||
|
||||
| Tool | Version | Purpose |
|
||||
|------|---------|---------|
|
||||
| **ArgoCD** | v2.13.x | GitOps deployments |
|
||||
| **Flux** | v2.4.x | GitOps toolkit |
|
||||
| **Kustomize** | v5.5.x | Manifest customization |
|
||||
| **Helm** | v3.16.x | Package management |
|
||||
| **Velero** | 1.15.x | Backup/restore |
|
||||
| **Trivy** | 0.58.x | Security scanning |
|
||||
| **Kyverno** | 1.13.x | Policy engine |
|
||||
|
||||
## Command Convention
|
||||
|
||||
**IMPORTANT**: Use `kubectl` for standard Kubernetes. Use `oc` for OpenShift/ARO.
|
||||
|
||||
---
|
||||
|
||||
## 1. CLUSTER OPERATIONS
|
||||
|
||||
### Node Management
|
||||
|
||||
```bash
|
||||
# View nodes
|
||||
kubectl get nodes -o wide
|
||||
|
||||
# Drain node for maintenance
|
||||
kubectl drain ${NODE} --ignore-daemonsets --delete-emptydir-data --grace-period=60
|
||||
|
||||
# Uncordon after maintenance
|
||||
kubectl uncordon ${NODE}
|
||||
|
||||
# View node resources
|
||||
kubectl top nodes
|
||||
```
|
||||
|
||||
### Cluster Upgrades
|
||||
|
||||
**AKS:**
|
||||
```bash
|
||||
az aks get-upgrades -g ${RG} -n ${CLUSTER} -o table
|
||||
az aks upgrade -g ${RG} -n ${CLUSTER} --kubernetes-version ${VERSION}
|
||||
```
|
||||
|
||||
**EKS:**
|
||||
```bash
|
||||
aws eks update-cluster-version --name ${CLUSTER} --kubernetes-version ${VERSION}
|
||||
```
|
||||
|
||||
**GKE:**
|
||||
```bash
|
||||
gcloud container clusters upgrade ${CLUSTER} --master --cluster-version ${VERSION}
|
||||
```
|
||||
|
||||
**OpenShift:**
|
||||
```bash
|
||||
oc adm upgrade --to=${VERSION}
|
||||
oc get clusterversion
|
||||
```
|
||||
|
||||
### Backup with Velero
|
||||
|
||||
```bash
|
||||
# Install Velero
|
||||
velero install --provider ${PROVIDER} --bucket ${BUCKET} --secret-file ${CREDS}
|
||||
|
||||
# Create backup
|
||||
velero backup create ${BACKUP_NAME} --include-namespaces ${NS}
|
||||
|
||||
# Restore
|
||||
velero restore create --from-backup ${BACKUP_NAME}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. TROUBLESHOOTING
|
||||
|
||||
### Health Assessment
|
||||
|
||||
Run the bundled script for comprehensive health check:
|
||||
```bash
|
||||
bash scripts/cluster-health-check.sh
|
||||
```
|
||||
|
||||
### Pod Status Interpretation
|
||||
|
||||
| Status | Meaning | Action |
|
||||
|--------|---------|--------|
|
||||
| `Pending` | Scheduling issue | Check resources, nodeSelector, tolerations |
|
||||
| `CrashLoopBackOff` | Container crashing | Check logs: `kubectl logs ${POD} --previous` |
|
||||
| `ImagePullBackOff` | Image unavailable | Verify image name, registry access |
|
||||
| `OOMKilled` | Out of memory | Increase memory limits |
|
||||
| `Evicted` | Node pressure | Check node resources |
|
||||
|
||||
### Debugging Commands
|
||||
|
||||
```bash
|
||||
# Pod logs (current and previous)
|
||||
kubectl logs ${POD} -c ${CONTAINER} --previous
|
||||
|
||||
# Multi-pod logs with stern
|
||||
stern ${LABEL_SELECTOR} -n ${NS}
|
||||
|
||||
# Exec into pod
|
||||
kubectl exec -it ${POD} -- /bin/sh
|
||||
|
||||
# Pod events
|
||||
kubectl describe pod ${POD} | grep -A 20 Events
|
||||
|
||||
# Cluster events (sorted by time)
|
||||
kubectl get events -A --sort-by='.lastTimestamp' | tail -50
|
||||
```
|
||||
|
||||
### Network Troubleshooting
|
||||
|
||||
```bash
|
||||
# Test DNS
|
||||
kubectl run -it --rm debug --image=busybox -- nslookup kubernetes.default
|
||||
|
||||
# Test service connectivity
|
||||
kubectl run -it --rm debug --image=curlimages/curl -- curl -v http://${SVC}.${NS}:${PORT}
|
||||
|
||||
# Check endpoints
|
||||
kubectl get endpoints ${SVC}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. MANIFEST GENERATION
|
||||
|
||||
### Production Deployment Template
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ${APP_NAME}
|
||||
namespace: ${NAMESPACE}
|
||||
labels:
|
||||
app.kubernetes.io/name: ${APP_NAME}
|
||||
app.kubernetes.io/version: "${VERSION}"
|
||||
spec:
|
||||
replicas: 3
|
||||
strategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 1
|
||||
maxUnavailable: 0
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: ${APP_NAME}
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: ${APP_NAME}
|
||||
spec:
|
||||
serviceAccountName: ${APP_NAME}
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1000
|
||||
fsGroup: 1000
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
containers:
|
||||
- name: ${APP_NAME}
|
||||
image: ${IMAGE}:${TAG}
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /ready
|
||||
port: http
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
volumeMounts:
|
||||
- name: tmp
|
||||
mountPath: /tmp
|
||||
volumes:
|
||||
- name: tmp
|
||||
emptyDir: {}
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- weight: 100
|
||||
podAffinityTerm:
|
||||
labelSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: ${APP_NAME}
|
||||
topologyKey: kubernetes.io/hostname
|
||||
```
|
||||
|
||||
### Service & Ingress
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: ${APP_NAME}
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: ${APP_NAME}
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: http
|
||||
---
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: ${APP_NAME}
|
||||
annotations:
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
spec:
|
||||
ingressClassName: nginx
|
||||
tls:
|
||||
- hosts:
|
||||
- ${HOST}
|
||||
secretName: ${APP_NAME}-tls
|
||||
rules:
|
||||
- host: ${HOST}
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: ${APP_NAME}
|
||||
port:
|
||||
name: http
|
||||
```
|
||||
|
||||
### OpenShift Route
|
||||
|
||||
```yaml
|
||||
apiVersion: route.openshift.io/v1
|
||||
kind: Route
|
||||
metadata:
|
||||
name: ${APP_NAME}
|
||||
spec:
|
||||
to:
|
||||
kind: Service
|
||||
name: ${APP_NAME}
|
||||
port:
|
||||
targetPort: http
|
||||
tls:
|
||||
termination: edge
|
||||
insecureEdgeTerminationPolicy: Redirect
|
||||
```
|
||||
|
||||
Use the bundled script for manifest generation:
|
||||
```bash
|
||||
bash scripts/generate-manifest.sh deployment myapp production
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. SECURITY
|
||||
|
||||
### Security Audit
|
||||
|
||||
Run the bundled script:
|
||||
```bash
|
||||
bash scripts/security-audit.sh [namespace]
|
||||
```
|
||||
|
||||
### Pod Security Standards
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: ${NAMESPACE}
|
||||
labels:
|
||||
pod-security.kubernetes.io/enforce: restricted
|
||||
pod-security.kubernetes.io/audit: baseline
|
||||
pod-security.kubernetes.io/warn: restricted
|
||||
```
|
||||
|
||||
### NetworkPolicy (Zero Trust)
|
||||
|
||||
```yaml
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: ${APP_NAME}-policy
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: ${APP_NAME}
|
||||
policyTypes:
|
||||
- Ingress
|
||||
- Egress
|
||||
ingress:
|
||||
- from:
|
||||
- podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: frontend
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8080
|
||||
egress:
|
||||
- to:
|
||||
- podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: database
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 5432
|
||||
# Allow DNS
|
||||
- to:
|
||||
- namespaceSelector: {}
|
||||
podSelector:
|
||||
matchLabels:
|
||||
k8s-app: kube-dns
|
||||
ports:
|
||||
- protocol: UDP
|
||||
port: 53
|
||||
```
|
||||
|
||||
### RBAC Best Practices
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: ${APP_NAME}
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: ${APP_NAME}-role
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["configmaps"]
|
||||
verbs: ["get", "list"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: ${APP_NAME}-binding
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: ${APP_NAME}
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: ${APP_NAME}-role
|
||||
```
|
||||
|
||||
### Image Scanning
|
||||
|
||||
```bash
|
||||
# Scan image with Trivy
|
||||
trivy image ${IMAGE}:${TAG}
|
||||
|
||||
# Scan with severity filter
|
||||
trivy image --severity HIGH,CRITICAL ${IMAGE}:${TAG}
|
||||
|
||||
# Generate SBOM
|
||||
trivy image --format spdx-json -o sbom.json ${IMAGE}:${TAG}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. GITOPS
|
||||
|
||||
### ArgoCD Application
|
||||
|
||||
```yaml
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: ${APP_NAME}
|
||||
namespace: argocd
|
||||
finalizers:
|
||||
- resources-finalizer.argocd.argoproj.io
|
||||
spec:
|
||||
project: default
|
||||
source:
|
||||
repoURL: ${GIT_REPO}
|
||||
targetRevision: main
|
||||
path: k8s/overlays/${ENV}
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: ${NAMESPACE}
|
||||
syncPolicy:
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
```
|
||||
|
||||
### Kustomize Structure
|
||||
|
||||
```
|
||||
k8s/
|
||||
├── base/
|
||||
│ ├── kustomization.yaml
|
||||
│ ├── deployment.yaml
|
||||
│ └── service.yaml
|
||||
└── overlays/
|
||||
├── dev/
|
||||
│ └── kustomization.yaml
|
||||
├── staging/
|
||||
│ └── kustomization.yaml
|
||||
└── prod/
|
||||
└── kustomization.yaml
|
||||
```
|
||||
|
||||
**base/kustomization.yaml:**
|
||||
```yaml
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- deployment.yaml
|
||||
- service.yaml
|
||||
```
|
||||
|
||||
**overlays/prod/kustomization.yaml:**
|
||||
```yaml
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- ../../base
|
||||
namePrefix: prod-
|
||||
namespace: production
|
||||
replicas:
|
||||
- name: myapp
|
||||
count: 5
|
||||
images:
|
||||
- name: myregistry/myapp
|
||||
newTag: v1.2.3
|
||||
```
|
||||
|
||||
### GitHub Actions CI/CD
|
||||
|
||||
```yaml
|
||||
name: Build and Deploy
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Build and push image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
push: true
|
||||
tags: ${{ secrets.REGISTRY }}/${{ github.event.repository.name }}:${{ github.sha }}
|
||||
|
||||
- name: Update Kustomize image
|
||||
run: |
|
||||
cd k8s/overlays/prod
|
||||
kustomize edit set image myapp=${{ secrets.REGISTRY }}/${{ github.event.repository.name }}:${{ github.sha }}
|
||||
|
||||
- name: Commit and push
|
||||
run: |
|
||||
git config user.name "github-actions"
|
||||
git config user.email "github-actions@github.com"
|
||||
git add .
|
||||
git commit -m "Update image to ${{ github.sha }}"
|
||||
git push
|
||||
```
|
||||
|
||||
Use the bundled script for ArgoCD sync:
|
||||
```bash
|
||||
bash scripts/argocd-app-sync.sh ${APP_NAME} --prune
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Helper Scripts
|
||||
|
||||
This skill includes automation scripts in the `scripts/` directory:
|
||||
|
||||
| Script | Purpose |
|
||||
|--------|---------|
|
||||
| `cluster-health-check.sh` | Comprehensive cluster health assessment with scoring |
|
||||
| `security-audit.sh` | Security posture audit (privileged, root, RBAC, NetworkPolicy) |
|
||||
| `node-maintenance.sh` | Safe node drain and maintenance prep |
|
||||
| `pre-upgrade-check.sh` | Pre-upgrade validation checklist |
|
||||
| `generate-manifest.sh` | Generate production-ready K8s manifests |
|
||||
| `argocd-app-sync.sh` | ArgoCD application sync helper |
|
||||
|
||||
Run any script:
|
||||
```bash
|
||||
bash scripts/<script-name>.sh [arguments]
|
||||
```
|
||||
6
skills/kubernetes/_meta.json
Normal file
6
skills/kubernetes/_meta.json
Normal file
@@ -0,0 +1,6 @@
{
  "ownerId": "kn7f82v7f3g1dtvm0gm74q016n7zz73v",
  "slug": "kubernetes",
  "version": "1.0.0",
  "publishedAt": 1769436428875
}
96
skills/kubernetes/scripts/argocd-app-sync.sh
Normal file
96
skills/kubernetes/scripts/argocd-app-sync.sh
Normal file
@@ -0,0 +1,96 @@
|
||||
#!/bin/bash
# argocd-app-sync.sh - ArgoCD application sync helper.
#
# Syncs an ArgoCD Application, preferring the argocd CLI and falling back
# to kubectl against the argocd namespace when the CLI is not installed.
# Human-readable progress goes to stderr; a JSON summary goes to stdout.
#
# Usage: ./argocd-app-sync.sh <app-name> [--prune] [--force]

set -e

APP=${1:-""}

if [ -z "$APP" ]; then
  echo "Usage: $0 <app-name> [--prune] [--force]" >&2
  echo "" >&2
  echo "Available applications:" >&2
  argocd app list --output name 2>/dev/null || kubectl get applications -A -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'
  exit 1
fi

# Accept --prune/--force in either order. They were previously strictly
# positional ($2/$3), so "<app> --force" alone silently ignored the flag.
PRUNE=false
FORCE=false
shift
for arg in "$@"; do
  case "$arg" in
    --prune) PRUNE=true ;;
    --force) FORCE=true ;;
    *)
      echo "Unknown option: $arg" >&2
      exit 1
      ;;
  esac
done

echo "=== ARGOCD APPLICATION SYNC: $APP ===" >&2
echo "Timestamp: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" >&2
echo "" >&2

# Check if argocd CLI is available; otherwise fall back to raw kubectl.
if command -v argocd &> /dev/null; then
  USE_CLI=true
else
  USE_CLI=false
  echo "argocd CLI not found, using kubectl" >&2
fi

# Get current status before syncing.
echo "### Current Status ###" >&2
if [ "$USE_CLI" = true ]; then
  argocd app get "$APP" --refresh >&2
else
  kubectl get application "$APP" -n argocd -o yaml | grep -A20 "status:" | head -25 >&2
fi

# Build sync options as an array so empty/multiple flags expand safely.
SYNC_OPTS=()
if [ "$PRUNE" = true ]; then
  SYNC_OPTS+=(--prune)
  echo "Prune enabled: Will remove resources not defined in Git" >&2
fi
if [ "$FORCE" = true ]; then
  SYNC_OPTS+=(--force)
  echo "Force enabled: Will replace resources that cannot be patched" >&2
fi

# Perform sync.
echo -e "\n### Syncing Application ###" >&2
if [ "$USE_CLI" = true ]; then
  argocd app sync "$APP" "${SYNC_OPTS[@]}" >&2
else
  # kubectl fallback cannot pass CLI flags; trigger a sync operation directly.
  kubectl patch application "$APP" -n argocd --type=merge -p '{"operation":{"sync":{"revision":"HEAD"}}}' >&2
fi

# Wait for sync to complete.
echo -e "\n### Waiting for Sync ###" >&2
if [ "$USE_CLI" = true ]; then
  argocd app wait "$APP" --health --timeout 300 >&2
else
  echo "Waiting for sync (check manually with kubectl)..." >&2
  sleep 10
fi

# Final status.
echo -e "\n### Final Status ###" >&2
if [ "$USE_CLI" = true ]; then
  argocd app get "$APP" >&2
  STATUS=$(argocd app get "$APP" -o json | jq -r '.status.sync.status')
  HEALTH=$(argocd app get "$APP" -o json | jq -r '.status.health.status')
else
  kubectl get application "$APP" -n argocd -o yaml | grep -A20 "status:" | head -25 >&2
  STATUS=$(kubectl get application "$APP" -n argocd -o jsonpath='{.status.sync.status}')
  HEALTH=$(kubectl get application "$APP" -n argocd -o jsonpath='{.status.health.status}')
fi

echo "" >&2
echo "========================================" >&2
echo "SYNC COMPLETE" >&2
echo "========================================" >&2
echo "Sync Status: $STATUS" >&2
echo "Health Status: $HEALTH" >&2

# Machine-readable summary on stdout.
cat << EOF
{
  "application": "$APP",
  "timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
  "sync_status": "$STATUS",
  "health_status": "$HEALTH",
  "success": $([ "$STATUS" == "Synced" ] && echo "true" || echo "false")
}
EOF
|
||||
131
skills/kubernetes/scripts/cluster-health-check.sh
Normal file
131
skills/kubernetes/scripts/cluster-health-check.sh
Normal file
@@ -0,0 +1,131 @@
|
||||
#!/bin/bash
# cluster-health-check.sh - Comprehensive cluster health assessment.
#
# Scores the cluster out of 100, subtracting points per detected issue.
# Human-readable report goes to stderr; a JSON summary goes to stdout.
# Usage: ./cluster-health-check.sh

set -e

echo "=== KUBERNETES CLUSTER HEALTH ASSESSMENT ===" >&2
echo "Timestamp: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" >&2
echo "" >&2

SCORE=100
ISSUES=()

# Fetch the pod inventory once; checks 3-5 below reuse it instead of
# hitting the API server with repeated "kubectl get pods -A -o json" calls.
PODS_JSON=$(kubectl get pods -A -o json 2>/dev/null || echo '{"items":[]}')

# 1. Node Health (Critical: -50 points per issue)
echo "### NODE HEALTH ###" >&2
# "|| true" (not "|| echo 0"): grep -c already prints 0 on no match but
# exits non-zero; the old "echo 0" fallback appended a SECOND line ("0\n0"),
# breaking the numeric comparison below.
UNHEALTHY_NODES=$(kubectl get nodes --no-headers | grep -vE "Ready\s+<none>|Ready\s+master|Ready\s+control-plane" | grep -c -E "NotReady|Unknown" || true)
if [ "$UNHEALTHY_NODES" -gt 0 ]; then
  SCORE=$((SCORE - 50))
  ISSUES+=("BOOM: $UNHEALTHY_NODES unhealthy nodes detected")
  kubectl get nodes | grep -E "NotReady|Unknown" >&2
else
  echo "✓ All nodes healthy" >&2
fi

# 2. Pod Issues (Warning: -20 points)
echo -e "\n### POD HEALTH ###" >&2
POD_ISSUES=$(kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')
if [ "$POD_ISSUES" -gt 0 ]; then
  SCORE=$((SCORE - 20))
  ISSUES+=("WARN: $POD_ISSUES pods not in Running/Succeeded state")
  echo "Pods with issues:" >&2
  kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded >&2
else
  echo "✓ All pods running" >&2
fi

# 3. CrashLoopBackOff (Critical: -50 points)
echo -e "\n### CRASH LOOP DETECTION ###" >&2
CRASHLOOP=$(jq -r '.items[] | select(.status.containerStatuses[]?.state.waiting?.reason == "CrashLoopBackOff") | "\(.metadata.namespace)/\(.metadata.name)"' <<<"$PODS_JSON" | wc -l | tr -d ' ')
if [ "$CRASHLOOP" -gt 0 ]; then
  SCORE=$((SCORE - 50))
  ISSUES+=("BOOM: $CRASHLOOP pods in CrashLoopBackOff")
  jq -r '.items[] | select(.status.containerStatuses[]?.state.waiting?.reason == "CrashLoopBackOff") | "\(.metadata.namespace)/\(.metadata.name)"' <<<"$PODS_JSON" >&2
else
  echo "✓ No pods in CrashLoopBackOff" >&2
fi

# 4. Security - Privileged Containers (Critical: -50 points)
echo -e "\n### SECURITY - PRIVILEGED CONTAINERS ###" >&2
PRIVILEGED=$(jq -r '[.items[] | select(.spec.containers[].securityContext.privileged == true)] | length' <<<"$PODS_JSON")
if [ "$PRIVILEGED" -gt 0 ]; then
  SCORE=$((SCORE - 50))
  ISSUES+=("BOOM: $PRIVILEGED privileged containers detected")
  jq -r '.items[] | select(.spec.containers[].securityContext.privileged == true) | "\(.metadata.namespace)/\(.metadata.name)"' <<<"$PODS_JSON" >&2
else
  echo "✓ No privileged containers" >&2
fi

# 5. Resource Limits (Warning: -20 points; tolerate up to 10 offenders)
echo -e "\n### RESOURCE CONFIGURATION ###" >&2
NO_LIMITS=$(jq -r '[.items[] | select(.spec.containers[].resources.limits == null)] | length' <<<"$PODS_JSON")
if [ "$NO_LIMITS" -gt 10 ]; then
  SCORE=$((SCORE - 20))
  ISSUES+=("WARN: $NO_LIMITS containers without resource limits")
else
  echo "✓ Most containers have resource limits" >&2
fi

# 6. PVC Status (Warning: -20 points)
echo -e "\n### STORAGE HEALTH ###" >&2
PENDING_PVC=$(kubectl get pvc -A --field-selector=status.phase!=Bound --no-headers 2>/dev/null | wc -l | tr -d ' ')
if [ "$PENDING_PVC" -gt 0 ]; then
  SCORE=$((SCORE - 20))
  ISSUES+=("WARN: $PENDING_PVC PVCs not bound")
  kubectl get pvc -A --field-selector=status.phase!=Bound >&2
else
  echo "✓ All PVCs bound" >&2
fi

# 7. Recent Warning Events (Info: -5 points above the 50-event threshold)
echo -e "\n### RECENT WARNING EVENTS ###" >&2
WARNING_EVENTS=$(kubectl get events -A --field-selector=type=Warning --no-headers 2>/dev/null | wc -l | tr -d ' ')
if [ "$WARNING_EVENTS" -gt 50 ]; then
  SCORE=$((SCORE - 5))
  ISSUES+=("INFO: $WARNING_EVENTS warning events in cluster")
  echo "Recent warning events: $WARNING_EVENTS" >&2
else
  echo "✓ Warning events within normal range" >&2
fi

# OpenShift-specific checks (only when oc exists AND we are logged in).
if command -v oc &> /dev/null && oc whoami &> /dev/null; then
  echo -e "\n### OPENSHIFT CLUSTER OPERATORS ###" >&2
  # Same grep -c fix as above: "|| true" instead of "|| echo 0".
  DEGRADED=$(oc get clusteroperators --no-headers 2>/dev/null | grep -c -E "False.*True|False.*False" || true)
  if [ "$DEGRADED" -gt 0 ]; then
    SCORE=$((SCORE - 50))
    ISSUES+=("BOOM: $DEGRADED cluster operators degraded/unavailable")
    oc get clusteroperators | grep -E "False.*True|False.*False" >&2
  else
    echo "✓ All cluster operators healthy" >&2
  fi
fi

# Clamp the score at 0 (multiple -50 deductions can push it negative).
if [ "$SCORE" -lt 0 ]; then
  SCORE=0
fi

# Output summary
echo "" >&2
echo "========================================" >&2
echo "CLUSTER HEALTH SCORE: $SCORE/100" >&2
echo "========================================" >&2

if [ ${#ISSUES[@]} -gt 0 ]; then
  echo "" >&2
  echo "ISSUES FOUND:" >&2
  for issue in "${ISSUES[@]}"; do
    echo "  - $issue" >&2
  done
fi

# Output JSON for programmatic use
cat << EOF
{
  "score": $SCORE,
  "timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
  "issues_count": ${#ISSUES[@]},
  "healthy": $([ "$SCORE" -ge 80 ] && echo "true" || echo "false")
}
EOF
|
||||
370
skills/kubernetes/scripts/generate-manifest.sh
Normal file
370
skills/kubernetes/scripts/generate-manifest.sh
Normal file
@@ -0,0 +1,370 @@
|
||||
#!/bin/bash
# generate-manifest.sh - Generate production-ready Kubernetes manifests.
#
# Emits a single manifest of the requested type to stdout, parameterized
# by name and namespace. Diagnostics go to stderr.
# Usage: ./generate-manifest.sh <type> <name> [namespace]

set -e

TYPE=${1:-""}
NAME=${2:-""}
NAMESPACE=${3:-"default"}

VALID_TYPES="deployment statefulset service ingress configmap secret pvc networkpolicy hpa"

if [ -z "$TYPE" ] || [ -z "$NAME" ]; then
  echo "Usage: $0 <type> <name> [namespace]" >&2
  echo "" >&2
  echo "Available types: $VALID_TYPES" >&2
  exit 1
fi

# Reject unknown types BEFORE printing the header; previously the header
# comments were already on stdout when the case default fired, so an
# error run still produced partial output.
case " $VALID_TYPES " in
  *" $TYPE "*) ;;
  *)
    echo "Unknown type: $TYPE" >&2
    echo "Valid types: $VALID_TYPES" >&2
    exit 1
    ;;
esac

echo "# Generated: $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
echo "# Type: $TYPE | Name: $NAME | Namespace: $NAMESPACE"
echo ""

case "$TYPE" in
  deployment)
    cat << EOF
apiVersion: apps/v1
kind: Deployment
metadata:
  name: $NAME
  namespace: $NAMESPACE
  labels:
    app.kubernetes.io/name: $NAME
    app.kubernetes.io/component: server
spec:
  replicas: 3
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  selector:
    matchLabels:
      app.kubernetes.io/name: $NAME
  template:
    metadata:
      labels:
        app.kubernetes.io/name: $NAME
    spec:
      serviceAccountName: $NAME
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
      - name: $NAME
        image: your-registry/$NAME:latest
        imagePullPolicy: Always
        ports:
        - name: http
          containerPort: 8080
          protocol: TCP
        securityContext:
          allowPrivilegeEscalation: false
          readOnlyRootFilesystem: true
          capabilities:
            drop: ["ALL"]
        resources:
          requests:
            cpu: 100m
            memory: 128Mi
          limits:
            cpu: 500m
            memory: 512Mi
        livenessProbe:
          httpGet:
            path: /healthz
            port: http
          initialDelaySeconds: 10
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 3
        readinessProbe:
          httpGet:
            path: /ready
            port: http
          initialDelaySeconds: 5
          periodSeconds: 5
          timeoutSeconds: 3
          failureThreshold: 3
        env:
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        volumeMounts:
        - name: tmp
          mountPath: /tmp
      volumes:
      - name: tmp
        emptyDir: {}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchLabels:
                  app.kubernetes.io/name: $NAME
              topologyKey: kubernetes.io/hostname
EOF
    ;;

  statefulset)
    cat << EOF
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: $NAME
  namespace: $NAMESPACE
  labels:
    app.kubernetes.io/name: $NAME
spec:
  serviceName: $NAME-headless
  replicas: 3
  podManagementPolicy: OrderedReady
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      app.kubernetes.io/name: $NAME
  template:
    metadata:
      labels:
        app.kubernetes.io/name: $NAME
    spec:
      serviceAccountName: $NAME
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
      terminationGracePeriodSeconds: 30
      containers:
      - name: $NAME
        image: your-registry/$NAME:latest
        ports:
        - name: tcp
          containerPort: 5432
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        resources:
          requests:
            cpu: 250m
            memory: 512Mi
          limits:
            cpu: 1000m
            memory: 2Gi
        volumeMounts:
        - name: data
          mountPath: /data
  volumeClaimTemplates:
  - metadata:
      name: data
    spec:
      accessModes: ["ReadWriteOnce"]
      storageClassName: standard
      resources:
        requests:
          storage: 10Gi
EOF
    ;;

  service)
    cat << EOF
apiVersion: v1
kind: Service
metadata:
  name: $NAME
  namespace: $NAMESPACE
  labels:
    app.kubernetes.io/name: $NAME
spec:
  type: ClusterIP
  ports:
  - name: http
    port: 80
    targetPort: http
    protocol: TCP
  selector:
    app.kubernetes.io/name: $NAME
EOF
    ;;

  ingress)
    cat << EOF
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: $NAME
  namespace: $NAMESPACE
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
spec:
  ingressClassName: nginx
  tls:
  - hosts:
    - $NAME.example.com
    secretName: $NAME-tls
  rules:
  - host: $NAME.example.com
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: $NAME
            port:
              name: http
EOF
    ;;

  configmap)
    cat << EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: $NAME
  namespace: $NAMESPACE
  labels:
    app.kubernetes.io/name: $NAME
data:
  config.yaml: |
    # Add your configuration here
    server:
      port: 8080
      host: "0.0.0.0"
EOF
    ;;

  secret)
    cat << EOF
apiVersion: v1
kind: Secret
metadata:
  name: $NAME
  namespace: $NAMESPACE
  labels:
    app.kubernetes.io/name: $NAME
type: Opaque
stringData:
  # Replace with actual values before applying
  API_KEY: "your-api-key-here"
  DATABASE_URL: "postgresql://user:pass@host:5432/db"
EOF
    ;;

  pvc)
    cat << EOF
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: $NAME
  namespace: $NAMESPACE
  labels:
    app.kubernetes.io/name: $NAME
spec:
  accessModes:
  - ReadWriteOnce
  storageClassName: standard
  resources:
    requests:
      storage: 10Gi
EOF
    ;;

  networkpolicy)
    cat << EOF
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: $NAME
  namespace: $NAMESPACE
spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: $NAME
  policyTypes:
  - Ingress
  - Egress
  ingress:
  - from:
    - podSelector:
        matchLabels:
          app.kubernetes.io/name: frontend
    ports:
    - protocol: TCP
      port: 8080
  egress:
  - to:
    - podSelector:
        matchLabels:
          app.kubernetes.io/name: database
    ports:
    - protocol: TCP
      port: 5432
  - to:
    - namespaceSelector: {}
      podSelector:
        matchLabels:
          k8s-app: kube-dns
    ports:
    - protocol: UDP
      port: 53
EOF
    ;;

  hpa)
    cat << EOF
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: $NAME
  namespace: $NAMESPACE
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: $NAME
  minReplicas: 3
  maxReplicas: 10
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
  - type: Resource
    resource:
      name: memory
      target:
        type: Utilization
        averageUtilization: 80
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
      - type: Percent
        value: 25
        periodSeconds: 60
    scaleUp:
      stabilizationWindowSeconds: 0
      policies:
      - type: Percent
        value: 100
        periodSeconds: 15
EOF
    ;;

  *)
    # Unreachable after the up-front validation; kept as a safety net.
    echo "Unknown type: $TYPE" >&2
    echo "Valid types: $VALID_TYPES" >&2
    exit 1
    ;;
esac
|
||||
97
skills/kubernetes/scripts/node-maintenance.sh
Normal file
97
skills/kubernetes/scripts/node-maintenance.sh
Normal file
@@ -0,0 +1,97 @@
|
||||
#!/bin/bash
# node-maintenance.sh - Safely drain and prepare a node for maintenance.
# Usage: ./node-maintenance.sh <node-name> [--force]
#
# Cordons the node, drains it (60s grace period, 5m timeout, daemonsets
# ignored), verifies the remaining pods, and prints a JSON summary on
# stdout. All human-readable progress goes to stderr so stdout stays
# machine-parseable.

set -e

NODE=${1:-""}
FORCE=${2:-""}

if [ -z "$NODE" ]; then
  echo "Usage: $0 <node-name> [--force]" >&2
  echo "" >&2
  echo "Available nodes:" >&2
  # Keep the node list on stderr with the rest of the usage text so
  # stdout remains reserved for the JSON result.
  kubectl get nodes --no-headers | awk '{print "  " $1 " (" $2 ")"}' >&2
  exit 1
fi

echo "=== NODE MAINTENANCE: $NODE ===" >&2
echo "Timestamp: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" >&2
echo "" >&2

# Verify the node exists before touching anything.
if ! kubectl get node "$NODE" &>/dev/null; then
  echo "Error: Node '$NODE' not found" >&2
  exit 1
fi

# Show current status
echo "### Current Node Status ###" >&2
kubectl get node "$NODE" -o wide >&2

echo -e "\n### Pods on Node ###" >&2
POD_COUNT=$(kubectl get pods -A --field-selector spec.nodeName="$NODE" --no-headers | wc -l | tr -d ' ')
echo "Total pods: $POD_COUNT" >&2
kubectl get pods -A --field-selector spec.nodeName="$NODE" --no-headers | head -20 >&2
# Use 'if' rather than a bare '[ ] && ...' list: under 'set -e' the
# failing list would abort the entire script whenever POD_COUNT <= 20.
if [ "$POD_COUNT" -gt 20 ]; then
  echo "... and $((POD_COUNT - 20)) more" >&2
fi

# Check for pods with PDBs that might block the drain. Best-effort:
# '|| true' keeps a jq/API hiccup from aborting the script under 'set -e'.
echo -e "\n### Checking PodDisruptionBudgets ###" >&2
kubectl get pdb -A -o json 2>/dev/null | jq -r '.items[] | "\(.metadata.namespace)/\(.metadata.name): minAvailable=\(.spec.minAvailable // "N/A"), maxUnavailable=\(.spec.maxUnavailable // "N/A")"' >&2 || true

# Interactive confirmation unless --force was given.
if [ "$FORCE" != "--force" ]; then
  echo "" >&2
  # -r: do not interpret backslashes in the operator's answer.
  read -r -p "Proceed with cordoning and draining node $NODE? (yes/no): " confirm
  if [ "$confirm" != "yes" ]; then
    echo "Aborted." >&2
    exit 0
  fi
fi

# Step 1: Cordon the node (mark unschedulable).
echo -e "\n### Step 1: Cordoning node ###" >&2
kubectl cordon "$NODE"
echo "✓ Node cordoned (unschedulable)" >&2

# Step 2: Drain the node. Options are kept in an array so each flag is
# passed as its own word, instead of relying on unquoted word-splitting
# of a flat string.
echo -e "\n### Step 2: Draining node ###" >&2
DRAIN_OPTS=(--ignore-daemonsets --delete-emptydir-data --grace-period=60 --timeout=300s)

if [ "$FORCE" == "--force" ]; then
  DRAIN_OPTS+=(--force)
  echo "Force mode enabled" >&2
fi

if kubectl drain "$NODE" "${DRAIN_OPTS[@]}"; then
  echo "✓ Node drained successfully" >&2
else
  echo "Warning: Drain completed with some issues" >&2
fi

# Step 3: Verify no pods remain (daemonset pods are expected to stay).
echo -e "\n### Step 3: Verification ###" >&2
REMAINING=$(kubectl get pods -A --field-selector spec.nodeName="$NODE" --no-headers 2>/dev/null | wc -l | tr -d ' ')
echo "Remaining pods on node: $REMAINING (should be daemonsets only)" >&2
kubectl get pods -A --field-selector spec.nodeName="$NODE" >&2

echo "" >&2
echo "========================================" >&2
echo "NODE MAINTENANCE READY" >&2
echo "========================================" >&2
echo "Node '$NODE' is now cordoned and drained." >&2
echo "" >&2
echo "Perform your maintenance tasks, then run:" >&2
echo "  kubectl uncordon $NODE" >&2
echo "" >&2

# Machine-readable result on stdout.
cat << EOF
{
  "node": "$NODE",
  "action": "drain",
  "timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
  "remaining_pods": $REMAINING,
  "status": "ready_for_maintenance"
}
EOF
|
||||
137
skills/kubernetes/scripts/pre-upgrade-check.sh
Normal file
137
skills/kubernetes/scripts/pre-upgrade-check.sh
Normal file
@@ -0,0 +1,137 @@
|
||||
#!/bin/bash
# pre-upgrade-check.sh - Pre-upgrade cluster validation.
# Usage: ./pre-upgrade-check.sh
#
# Runs a battery of health checks (nodes, control plane, pods, PDBs,
# PVCs, deprecated APIs, etcd, resource pressure, OpenShift operators),
# tallies BLOCKERS and WARNINGS, and prints a JSON verdict on stdout.
# All human-readable output goes to stderr.

set -e

echo "=== PRE-UPGRADE CLUSTER VALIDATION ===" >&2
echo "Timestamp: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" >&2
echo "" >&2

WARNINGS=0
BLOCKERS=0

# 1. Cluster Version
echo "### Cluster Version ###" >&2
SERVER_VERSION=$(kubectl version -o json 2>/dev/null | jq -r '.serverVersion.gitVersion')
echo "Server Version: $SERVER_VERSION" >&2

# 2. Node Status
echo -e "\n### Node Status ###" >&2
kubectl get nodes >&2
# Count nodes whose STATUS column is not exactly "Ready". The previous
# 'grep -cv "Ready"' missed NotReady nodes ("NotReady" contains the
# substring "Ready"), and the 'grep -c ... || echo 0' pattern emitted two
# lines ("0" from grep -c plus "0" from echo) on no match, which made the
# variable non-numeric and broke the -gt test.
NOT_READY=$(kubectl get nodes --no-headers | awk '$2 != "Ready"' | wc -l | tr -d ' ')
if [ "$NOT_READY" -gt 0 ]; then
  BLOCKERS=$((BLOCKERS + 1))
  echo "BLOCKER: $NOT_READY nodes not ready" >&2
fi

# 3. Control Plane Health. Try the label selector first, then fall back
# to a name grep for clusters that don't label control-plane pods. The
# table is redirected to stderr ('>&2' before '2>/dev/null' so errors are
# still suppressed) to keep stdout reserved for the JSON verdict, and
# '|| true' keeps an empty grep from aborting under 'set -e'.
echo -e "\n### Control Plane Health ###" >&2
kubectl get pods -n kube-system -l tier=control-plane >&2 2>/dev/null || \
  kubectl get pods -n kube-system | grep -E "kube-apiserver|kube-controller|kube-scheduler|etcd" >&2 || true

# 4. Pods Not Running
echo -e "\n### Pods Not Running ###" >&2
NOT_RUNNING=$(kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')
if [ "$NOT_RUNNING" -gt 0 ]; then
  WARNINGS=$((WARNINGS + 1))
  echo "WARNING: $NOT_RUNNING pods not in Running/Succeeded state" >&2
  kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded >&2
else
  echo "✓ All pods running" >&2
fi

# 5. PodDisruptionBudgets (informational: these can block node drains).
echo -e "\n### PodDisruptionBudgets ###" >&2
PDB_COUNT=$(kubectl get pdb -A --no-headers 2>/dev/null | wc -l | tr -d ' ')
echo "Found $PDB_COUNT PDBs" >&2
if [ "$PDB_COUNT" -gt 0 ]; then
  kubectl get pdb -A >&2
fi

# 6. Pending PVCs
echo -e "\n### Pending PVCs ###" >&2
PENDING_PVC=$(kubectl get pvc -A --field-selector=status.phase=Pending --no-headers 2>/dev/null | wc -l | tr -d ' ')
if [ "$PENDING_PVC" -gt 0 ]; then
  WARNINGS=$((WARNINGS + 1))
  echo "WARNING: $PENDING_PVC PVCs pending" >&2
  kubectl get pvc -A --field-selector=status.phase=Pending >&2
else
  echo "✓ No pending PVCs" >&2
fi

# 7. Deprecated APIs. grep -c already prints "0" on no match; '|| true'
# only swallows its non-zero exit status so 'set -e' does not abort
# (the old '|| echo 0' appended a second "0" line in that case).
echo -e "\n### Deprecated API Usage ###" >&2
DEPRECATED=$(kubectl get --raw /metrics 2>/dev/null | grep -c "apiserver_requested_deprecated_apis" || true)
DEPRECATED=${DEPRECATED:-0}
if [ "$DEPRECATED" -gt 0 ]; then
  WARNINGS=$((WARNINGS + 1))
  echo "WARNING: Deprecated APIs may be in use" >&2
  echo "Check: kubectl get --raw /metrics | grep apiserver_requested_deprecated_apis" >&2
else
  echo "✓ No deprecated API metrics found" >&2
fi

# 8. etcd Health (only directly visible on self-managed clusters).
echo -e "\n### etcd Health ###" >&2
ETCD_PODS=$(kubectl get pods -n kube-system -l component=etcd --no-headers 2>/dev/null | wc -l | tr -d ' ')
if [ "$ETCD_PODS" -gt 0 ]; then
  kubectl get pods -n kube-system -l component=etcd >&2
else
  echo "etcd pods not directly visible (managed platform)" >&2
fi

# 9. Resource Pressure. any() lists each node at most once even if it
# has several *Pressure conditions set to True (the previous stream-based
# select printed the node once per matching condition).
echo -e "\n### Node Resource Pressure ###" >&2
PRESSURE=$(kubectl get nodes -o json 2>/dev/null | jq -r '.items[] | select(any(.status.conditions[]?; (.type | contains("Pressure")) and .status == "True")) | .metadata.name')
if [ -n "$PRESSURE" ]; then
  WARNINGS=$((WARNINGS + 1))
  echo "WARNING: Nodes under pressure:" >&2
  echo "$PRESSURE" >&2
else
  echo "✓ No resource pressure detected" >&2
fi

# OpenShift-specific checks (skipped unless 'oc' exists and is logged in).
if command -v oc &> /dev/null && oc whoami &> /dev/null; then
  echo -e "\n### OpenShift Cluster Operators ###" >&2
  # Same grep -c fix as above: '|| true' instead of '|| echo 0'.
  DEGRADED=$(oc get clusteroperators --no-headers 2>/dev/null | grep -c -E "False.*True|False.*False" || true)
  DEGRADED=${DEGRADED:-0}
  if [ "$DEGRADED" -gt 0 ]; then
    BLOCKERS=$((BLOCKERS + 1))
    echo "BLOCKER: $DEGRADED cluster operators degraded" >&2
    oc get clusteroperators | grep -E "False.*True|False.*False" >&2 || true
  else
    echo "✓ All cluster operators healthy" >&2
  fi
fi

# Summary
echo "" >&2
echo "========================================" >&2
echo "PRE-UPGRADE CHECK SUMMARY" >&2
echo "========================================" >&2
echo "Blockers: $BLOCKERS" >&2
echo "Warnings: $WARNINGS" >&2

if [ "$BLOCKERS" -gt 0 ]; then
  echo "" >&2
  echo "❌ DO NOT PROCEED WITH UPGRADE" >&2
  echo "   Resolve blockers before upgrading" >&2
elif [ "$WARNINGS" -gt 0 ]; then
  echo "" >&2
  echo "⚠️  PROCEED WITH CAUTION" >&2
  echo "   Review warnings before upgrading" >&2
else
  echo "" >&2
  echo "✅ CLUSTER READY FOR UPGRADE" >&2
fi

# Machine-readable verdict on stdout.
cat << EOF
{
  "timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
  "server_version": "$SERVER_VERSION",
  "blockers": $BLOCKERS,
  "warnings": $WARNINGS,
  "ready_for_upgrade": $([ $BLOCKERS -eq 0 ] && echo "true" || echo "false")
}
EOF
|
||||
149
skills/kubernetes/scripts/security-audit.sh
Normal file
149
skills/kubernetes/scripts/security-audit.sh
Normal file
@@ -0,0 +1,149 @@
|
||||
#!/bin/bash
# security-audit.sh - Kubernetes security posture assessment.
# Usage: ./security-audit.sh [namespace]
#
# Audits privileged containers, root containers, host namespace access,
# missing resource limits, default service-account usage, wildcard RBAC,
# and NetworkPolicy coverage. Findings are printed to stderr; a JSON
# summary goes to stdout.

set -e

NAMESPACE=${1:-""}
# Scope flags live in an array so "-n <ns>" is passed as two separate
# words without relying on unquoted word-splitting of a flat string.
if [ -n "$NAMESPACE" ]; then
  NS_FLAG=(-n "$NAMESPACE")
  echo "=== SECURITY AUDIT: Namespace $NAMESPACE ===" >&2
else
  NS_FLAG=(-A)
  echo "=== SECURITY AUDIT: All Namespaces ===" >&2
fi
echo "Timestamp: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" >&2
echo "" >&2

FINDINGS=()
CRITICAL=0
WARNING=0
INFO=0

# count_lines <string> - print the number of non-empty lines in <string>.
# grep -c prints "0" on no match but exits non-zero; '|| true' only
# swallows that exit status so 'set -e' does not abort. (The previous
# 'grep -c . || echo 0' emitted TWO lines on no match — "0" from grep
# plus "0" from echo — making the counter non-numeric.)
count_lines() {
  printf '%s\n' "$1" | grep -c . || true
}

# 1. Privileged Containers (Critical)
echo "### Checking for privileged containers..." >&2
PRIVILEGED=$(kubectl get pods "${NS_FLAG[@]}" -o json 2>/dev/null | jq -r '.items[] | select(.spec.containers[].securityContext.privileged == true) | "\(.metadata.namespace)/\(.metadata.name)"')
if [ -n "$PRIVILEGED" ]; then
  CRITICAL=$((CRITICAL + 1))
  FINDINGS+=("CRITICAL: Privileged containers found")
  echo "CRITICAL: Privileged containers:" >&2
  echo "$PRIVILEGED" >&2
else
  echo "✓ No privileged containers" >&2
fi

# 2. Containers Running as Root (Warning) — pods where neither the pod
# nor the container securityContext sets runAsNonRoot.
echo -e "\n### Checking for root containers..." >&2
ROOT_CONTAINERS=$(kubectl get pods "${NS_FLAG[@]}" -o json 2>/dev/null | jq -r '.items[] | select(.spec.securityContext.runAsNonRoot != true) | select(.spec.containers[].securityContext.runAsNonRoot != true) | "\(.metadata.namespace)/\(.metadata.name)"' | sort -u)
ROOT_COUNT=$(count_lines "$ROOT_CONTAINERS")
if [ "$ROOT_COUNT" -gt 0 ]; then
  WARNING=$((WARNING + 1))
  FINDINGS+=("WARNING: $ROOT_COUNT pods may run as root")
  echo "WARNING: Pods without runAsNonRoot:" >&2
  echo "$ROOT_CONTAINERS" | head -10 >&2
  # 'if' rather than a bare '[ ] && ...' list: under 'set -e' the failing
  # list would abort the script whenever ROOT_COUNT <= 10.
  if [ "$ROOT_COUNT" -gt 10 ]; then
    echo "... and $((ROOT_COUNT - 10)) more" >&2
  fi
else
  echo "✓ All pods have runAsNonRoot" >&2
fi

# 3. Host Namespace Access (Critical)
echo -e "\n### Checking for host namespace access..." >&2
HOST_ACCESS=$(kubectl get pods "${NS_FLAG[@]}" -o json 2>/dev/null | jq -r '.items[] | select(.spec.hostNetwork == true or .spec.hostPID == true or .spec.hostIPC == true) | "\(.metadata.namespace)/\(.metadata.name)"')
if [ -n "$HOST_ACCESS" ]; then
  CRITICAL=$((CRITICAL + 1))
  FINDINGS+=("CRITICAL: Host namespace access detected")
  echo "CRITICAL: Pods with host namespace access:" >&2
  echo "$HOST_ACCESS" >&2
else
  echo "✓ No host namespace access" >&2
fi

# 4. Missing Resource Limits (Warning)
# NOTE: this counts PODS with at least one container lacking limits, not
# individual containers.
echo -e "\n### Checking for missing resource limits..." >&2
NO_LIMITS=$(kubectl get pods "${NS_FLAG[@]}" -o json 2>/dev/null | jq -r '[.items[] | select(.spec.containers[].resources.limits == null)] | length')
if [ "$NO_LIMITS" -gt 10 ]; then
  WARNING=$((WARNING + 1))
  FINDINGS+=("WARNING: $NO_LIMITS containers without resource limits")
  echo "WARNING: $NO_LIMITS containers missing resource limits" >&2
else
  echo "✓ Resource limits configured ($NO_LIMITS missing)" >&2
fi

# 5. Default Service Account Usage (Info)
echo -e "\n### Checking for default service account usage..." >&2
DEFAULT_SA=$(kubectl get pods "${NS_FLAG[@]}" -o json 2>/dev/null | jq -r '.items[] | select(.spec.serviceAccountName == "default" or .spec.serviceAccountName == null) | "\(.metadata.namespace)/\(.metadata.name)"')
DEFAULT_SA_COUNT=$(count_lines "$DEFAULT_SA")
if [ "$DEFAULT_SA_COUNT" -gt 0 ]; then
  INFO=$((INFO + 1))
  FINDINGS+=("INFO: $DEFAULT_SA_COUNT pods using default service account")
  echo "INFO: Pods using default SA:" >&2
  echo "$DEFAULT_SA" | head -10 >&2
else
  echo "✓ No pods using default service account" >&2
fi

# 6. Wildcard RBAC (Critical). A role is flagged only when a SINGLE rule
# grants both verbs=["*"] and resources=["*"]; the previous query
# cross-matched verbs and resources drawn from different rules, producing
# false positives.
echo -e "\n### Checking for overly permissive RBAC..." >&2
WILDCARD_ROLES=$(kubectl get clusterroles -o json 2>/dev/null | jq -r '.items[] | select(any(.rules[]?; ((.verbs // []) | index("*")) and ((.resources // []) | index("*")))) | .metadata.name')
if [ -n "$WILDCARD_ROLES" ]; then
  CRITICAL=$((CRITICAL + 1))
  FINDINGS+=("CRITICAL: Wildcard RBAC permissions found")
  echo "CRITICAL: ClusterRoles with wildcard permissions:" >&2
  echo "$WILDCARD_ROLES" >&2
else
  echo "✓ No wildcard RBAC permissions" >&2
fi

# 7. Pods without NetworkPolicy (Info)
echo -e "\n### Checking NetworkPolicy coverage..." >&2
if [ -n "$NAMESPACE" ]; then
  NP_COUNT=$(kubectl get networkpolicy -n "$NAMESPACE" --no-headers 2>/dev/null | wc -l | tr -d ' ')
  if [ "$NP_COUNT" -eq 0 ]; then
    INFO=$((INFO + 1))
    FINDINGS+=("INFO: Namespace $NAMESPACE has no NetworkPolicies")
    echo "INFO: No NetworkPolicies in $NAMESPACE" >&2
  else
    echo "✓ $NP_COUNT NetworkPolicies found" >&2
  fi
else
  NS_WITHOUT_NP=0
  for ns in $(kubectl get ns -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
    count=$(kubectl get networkpolicy -n "$ns" --no-headers 2>/dev/null | wc -l | tr -d ' ')
    # 'if' rather than '[ ] && ...': under 'set -e' the failing list would
    # kill the script on the first namespace that HAS policies.
    if [ "$count" -eq 0 ]; then
      NS_WITHOUT_NP=$((NS_WITHOUT_NP + 1))
    fi
  done
  if [ "$NS_WITHOUT_NP" -gt 0 ]; then
    INFO=$((INFO + 1))
    FINDINGS+=("INFO: $NS_WITHOUT_NP namespaces without NetworkPolicies")
    echo "INFO: $NS_WITHOUT_NP namespaces lack NetworkPolicies" >&2
  fi
fi

# Summary
echo "" >&2
echo "========================================" >&2
echo "SECURITY AUDIT SUMMARY" >&2
echo "========================================" >&2
echo "Critical Issues: $CRITICAL" >&2
echo "Warnings:        $WARNING" >&2
echo "Informational:   $INFO" >&2
echo "" >&2

if [ ${#FINDINGS[@]} -gt 0 ]; then
  echo "FINDINGS:" >&2
  for finding in "${FINDINGS[@]}"; do
    echo "  - $finding" >&2
  done
fi

# Machine-readable summary on stdout.
cat << EOF
{
  "timestamp": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
  "namespace": "${NAMESPACE:-all}",
  "critical": $CRITICAL,
  "warning": $WARNING,
  "info": $INFO,
  "compliant": $([ $CRITICAL -eq 0 ] && echo "true" || echo "false")
}
EOF
|
||||
Reference in New Issue
Block a user