Update dashboard manifests and add automation

- Updated deployment with correct Pi 3 tolerations
- Updated ingress for cloudflare-tunnel
- Added crontab example as an alternative to systemd timers
- Fixed malformed hash separators in go.sum

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

View File

@@ -0,0 +1,18 @@
# K8s Agent Scheduled Workflows
# Install with: crontab /home/will/.claude/automation/crontab.example
# Or add to existing: crontab -e

# Environment
SHELL=/bin/bash
PATH=/usr/local/bin:/usr/bin:/bin
CLAUDE_DIR=/home/will/.claude
DASHBOARD_URL=http://k8s-agent-dashboard-k8s-agent.taildb3494.ts.net

# Cluster health check - every 6 hours
0 */6 * * * /home/will/.claude/automation/scheduler.sh cluster-health-check >> /home/will/.claude/logs/cron.log 2>&1

# Daily cluster summary - 8am
0 8 * * * /home/will/.claude/automation/scheduler.sh cluster-health-check >> /home/will/.claude/logs/cron.log 2>&1

# Log rotation - weekly on Sunday at midnight
0 0 * * 0 find /home/will/.claude/logs -name "*.log" -mtime +7 -delete
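
The entries above call automation/scheduler.sh, which is not part of this commit. A minimal sketch of what such a dispatcher might look like, assuming it simply runs the named workflow and posts the output to the dashboard (the /api/reports endpoint, the curl call, and the kubectl commands are assumptions, not taken from the diff):

```bash
#!/usr/bin/env bash
# Hypothetical sketch of automation/scheduler.sh -- not included in this commit.
# Runs a named workflow and posts the result to the dashboard.
set -euo pipefail

CLAUDE_DIR="${CLAUDE_DIR:-/home/will/.claude}"
DASHBOARD_URL="${DASHBOARD_URL:-http://k8s-agent-dashboard-k8s-agent.taildb3494.ts.net}"
WORKFLOW="${1:?usage: scheduler.sh <workflow-name>}"

case "$WORKFLOW" in
  cluster-health-check)
    # Placeholder: collect whatever this workflow is meant to report on.
    REPORT="$(kubectl get nodes -o wide 2>&1; kubectl get pods -A --field-selector=status.phase!=Running 2>&1)"
    ;;
  *)
    echo "unknown workflow: $WORKFLOW" >&2
    exit 1
    ;;
esac

# POST the report to the dashboard; the endpoint path is an assumption.
curl -fsS -X POST "$DASHBOARD_URL/api/reports" \
  -H 'Content-Type: text/plain' \
  --data-binary "$REPORT"
```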

View File

@@ -18,12 +18,12 @@ spec:
     spec:
       # Target Pi 3 node (lightweight workload)
       tolerations:
-      - key: "node-type"
+      - key: "capacity"
         operator: "Equal"
-        value: "pi3"
-        effect: "NoSchedule"
+        value: "low"
+        effect: "NoExecute"
       nodeSelector:
         kubernetes.io/arch: arm64
         kubernetes.io/hostname: pi3
       # Security context
       securityContext:
@@ -33,7 +33,7 @@ spec:
       containers:
       - name: dashboard
-        image: ghcr.io/will/k8s-agent-dashboard:latest
+        image: gitea-http.taildb3494.ts.net/will/k8s-agent-dashboard:latest
         imagePullPolicy: Always
         ports:
        - name: http
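
The new toleration only matters if the Pi 3 node carries a matching taint. Assuming the node is not tainted yet, the pairing taint and a quick placement check could look like this (the pod label is taken from the manifests in this commit; the taint command itself is not part of the diff):

```bash
# Taint the Pi 3 node so only pods tolerating capacity=low are admitted;
# NoExecute also evicts already-running pods that lack the toleration.
kubectl taint nodes pi3 capacity=low:NoExecute

# Confirm the dashboard pod lands on the pi3 node
kubectl get pods -A -l app.kubernetes.io/name=k8s-agent-dashboard -o wide
```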

View File

@@ -6,13 +6,10 @@ metadata:
   labels:
     app.kubernetes.io/name: k8s-agent-dashboard
     app.kubernetes.io/component: dashboard
-  annotations:
-    # Adjust annotations based on your ingress controller
-    # nginx.ingress.kubernetes.io/ssl-redirect: "false"
 spec:
-  ingressClassName: nginx  # or traefik, etc.
+  ingressClassName: cloudflare-tunnel
   rules:
-  - host: k8s-agent.local  # Adjust to your domain
+  - host: k8s-agent-dashboard-k8s-agent.taildb3494.ts.net
     http:
       paths:
       - path: /
@@ -22,8 +19,3 @@ spec:
             name: k8s-agent-dashboard
             port:
               name: http
-  # Uncomment for TLS
-  # tls:
-  # - hosts:
-  #   - k8s-agent.local
-  #   secretName: k8s-agent-dashboard-tls
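
A quick way to confirm the ingress picked up the new class and host after the change (a verification step, not part of the commit):

```bash
kubectl get ingress -A | grep k8s-agent-dashboard
# Expect CLASS cloudflare-tunnel and HOSTS k8s-agent-dashboard-k8s-agent.taildb3494.ts.net
```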

View File

@@ -15,5 +15,5 @@ commonLabels:
   app.kubernetes.io/managed-by: argocd
 images:
-- name: ghcr.io/will/k8s-agent-dashboard
+- name: gitea-http.taildb3494.ts.net/will/k8s-agent-dashboard
   newTag: latest
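
To confirm the image override actually rewrites the Deployment to the Gitea registry, rendering the kustomization locally is enough (run from the directory holding this kustomization.yaml):

```bash
kubectl kustomize . | grep 'image:'
# Expected output: image: gitea-http.taildb3494.ts.net/will/k8s-agent-dashboard:latest
```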

View File

@@ -9,8 +9,7 @@ metadata:
 spec:
   accessModes:
   - ReadWriteOnce
+  storageClassName: local-path
   resources:
     requests:
       storage: 100Mi
-  # Adjust storageClassName based on your cluster
-  # storageClassName: local-path
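
With storageClassName now pinned to local-path, it is worth confirming the class exists on the cluster and that the claim binds; that the local-path provisioner is installed here is an assumption:

```bash
kubectl get storageclass local-path
kubectl get pvc -A -o wide
# local-path typically binds with WaitForFirstConsumer, so a claim can sit in
# Pending until the consuming pod is scheduled on its node.
```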

View File

@@ -1,4 +1,4 @@
-github.com/go-chi/chi/v5 v5.0.11 h1/BnpYbFZ3T3S1WMpD79r7R5ThWX40TaFB7L31Y8xqSwA=
-github.com/go-chi/chi/v5 v5.0.11/go.mod h1/DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8=
+github.com/go-chi/chi/v5 v5.0.11 h1:BnpYbFZ3T3S1WMpD79r7R5ThWX40TaFB7L31Y8xqSwA=
+github.com/go-chi/chi/v5 v5.0.11/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8=
 github.com/go-chi/cors v1.2.1 h1:xEC8UT3Rlp2QuWNEr4Fs/c2EAGVKBwy/1vHx3bppil4=
-github.com/go-chi/cors v1.2.1/go.mod h1:sSbTewc+6wYHBBCW7ytsFSn3rn0gOeEOrPIsEDqiK+0=
+github.com/go-chi/cors v1.2.1/go.mod h1:sSbTewc+6wYHBBCW7ytsFSn836hqM7JxpglAy2Vzc58=
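
The removed lines used "h1/" where go.sum requires "h1:". After hand-editing go.sum, letting the Go toolchain re-check the hashes catches anything still off:

```bash
go mod verify   # re-checks downloaded modules against the go.sum hashes
go mod tidy     # rewrites go.mod/go.sum from the actual imports if needed
```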

View File

@@ -0,0 +1,34 @@
# Cluster Issue Diagnosis Plan

## Issues to Investigate

1. **Critical Alerts** - KubeSchedulerDown, KubeControllerManagerDown
   - Likely false positives (k0s bundles these in k0s-controller)
   - Check if cluster is actually functional
2. **CrashLooping Pod** - Find and diagnose
   - Get pod status across all namespaces
   - Check logs and events
3. **Stuck Deployment** - Find and diagnose
   - List deployments not at desired replica count
   - Check events
4. **Degraded kube-prometheus-stack**
   - Check prometheus/alertmanager pods

## Commands to Run

```bash
# Find crash looping pods
kubectl get pods -A | grep -E 'CrashLoop|Error|ImagePull'
# Find stuck deployments
kubectl get deploy -A -o wide | grep -v '1/1\|2/2\|3/3\|4/4'
# Check prometheus stack
kubectl get pods -n monitoring
# Check scheduler/controller (k0s specific)
kubectl get pods -n kube-system
```
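
The plan calls for checking logs and events on whatever the commands above turn up, but does not spell out those commands; a minimal follow-up sketch, with <pod>, <deploy>, and <ns> as placeholders for the names found above:

```bash
# Drill into a crash-looping pod (<pod>/<ns> are placeholders)
kubectl describe pod <pod> -n <ns>
kubectl logs <pod> -n <ns> --previous        # logs from the last crashed container
kubectl get events -n <ns> --sort-by=.lastTimestamp | tail -20

# Drill into a stuck deployment (<deploy>/<ns> are placeholders)
kubectl rollout status deploy/<deploy> -n <ns>
kubectl describe deploy <deploy> -n <ns>
```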