feat: Implement Phase 2 dashboard for K8s agent system

Lightweight Go-based dashboard for Raspberry Pi cluster: Backend: - chi router with REST API - Embedded static file serving - JSON file-based state storage - Health checks and CORS support Frontend: - Responsive dark theme UI - Status view with nodes, alerts, ArgoCD apps - Pending actions with approve/reject - Action history and audit trail - Workflow listing and manual triggers Deployment: - Multi-stage Dockerfile (small Alpine image) - Kubernetes manifests with Pi 3 tolerations - Resource limits: 32-64Mi memory, 10-100m CPU - ArgoCD application manifest - Kustomize configuration API endpoints: - GET /api/status - Cluster status - GET/POST /api/pending - Action management - GET /api/history - Action audit trail - GET/POST /api/workflows - Workflow management 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-26 11:34:36 -08:00
parent a80f714fc2
commit 5646508adb
18 changed files with 1712 additions and 0 deletions
--- a/dashboard/internal/api/handlers.go
+++ b/dashboard/internal/api/handlers.go
@@ -0,0 +1,157 @@
+package api
+
+import (
+	"encoding/json"
+	"net/http"
+	"strconv"
+
+	"github.com/go-chi/chi/v5"
+	"github.com/will/k8s-agent-dashboard/internal/store"
+)
+
+// respondJSON writes data to w as a JSON document with the given HTTP status.
+//
+// The payload is marshalled before any header is sent, so an encoding failure
+// is reported as a 500 with a JSON error object instead of the requested
+// status followed by a truncated or empty body.
+func respondJSON(w http.ResponseWriter, status int, data interface{}) {
+	payload, err := json.Marshal(data)
+	if err != nil {
+		status = http.StatusInternalServerError
+		payload = []byte(`{"error":"failed to encode response"}`)
+	}
+
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	// json.Encoder.Encode ends its output with a newline; keep that framing.
+	// A failed write means the client has gone away, so there is nobody left
+	// to report the error to.
+	_, _ = w.Write(append(payload, '\n'))
+}
+
+// respondError replies with the given HTTP status and a JSON object whose
+// single "error" key carries message.
+func respondError(w http.ResponseWriter, status int, message string) {
+	payload := map[string]string{"error": message}
+	respondJSON(w, status, payload)
+}
+
+// HealthCheck always answers 200 with a fixed JSON body containing the
+// "status" and "service" keys; the request itself is not inspected.
+func HealthCheck(w http.ResponseWriter, r *http.Request) {
+	body := map[string]string{
+		"status":  "ok",
+		"service": "k8s-agent-dashboard",
+	}
+	respondJSON(w, http.StatusOK, body)
+}
+
+// GetClusterStatus builds a handler that replies 200 with the value of
+// s.GetClusterStatus() encoded as JSON.
+func GetClusterStatus(s *store.Store) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		respondJSON(w, http.StatusOK, s.GetClusterStatus())
+	}
+}
+
+// GetPendingActions builds a handler that replies 200 with the store's pending
+// actions under "actions" and their number under "count".
+func GetPendingActions(s *store.Store) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		pending := s.GetPendingActions()
+		payload := map[string]interface{}{
+			"count":   len(pending),
+			"actions": pending,
+		}
+		respondJSON(w, http.StatusOK, payload)
+	}
+}
+
+// ApproveAction builds a handler that approves the pending action named by the
+// "id" URL parameter.
+//
+// The request body is optional. When present it must be valid JSON; its
+// "reason" field is passed through to the store. Responses:
+//   - 200 with the approved action on success
+//   - 400 when the id is missing or the body cannot be decoded
+//   - 404 when the store returns an error (for example an unknown id)
+func ApproveAction(s *store.Store) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		id := chi.URLParam(r, "id")
+		if id == "" {
+			respondError(w, http.StatusBadRequest, "missing action id")
+			return
+		}
+
+		var body struct {
+			Reason string `json:"reason"`
+		}
+		// More reports false for an empty or whitespace-only body, which keeps
+		// the reason optional. Once there is content, a decode failure is a
+		// client error and must not be mistaken for "no reason given".
+		if dec := json.NewDecoder(r.Body); dec.More() {
+			if err := dec.Decode(&body); err != nil {
+				respondError(w, http.StatusBadRequest, "invalid JSON body: "+err.Error())
+				return
+			}
+		}
+
+		action, err := s.ApproveAction(id, body.Reason)
+		if err != nil {
+			respondError(w, http.StatusNotFound, err.Error())
+			return
+		}
+
+		respondJSON(w, http.StatusOK, map[string]interface{}{
+			"status":  "approved",
+			"action":  action,
+			"message": "Action approved and ready for execution",
+		})
+	}
+}
+
+// RejectAction builds a handler that rejects the pending action named by the
+// "id" URL parameter.
+//
+// The request body is optional. When present it must be valid JSON; its
+// "reason" field is passed to the store, defaulting to "Rejected by user" when
+// empty. Responses:
+//   - 200 with the rejected action on success
+//   - 400 when the id is missing or the body cannot be decoded
+//   - 404 when the store returns an error (for example an unknown id)
+func RejectAction(s *store.Store) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		id := chi.URLParam(r, "id")
+		if id == "" {
+			respondError(w, http.StatusBadRequest, "missing action id")
+			return
+		}
+
+		var body struct {
+			Reason string `json:"reason"`
+		}
+		// More reports false for an empty or whitespace-only body, which keeps
+		// the reason optional. Once there is content, a decode failure is a
+		// client error and must not be mistaken for "no reason given".
+		if dec := json.NewDecoder(r.Body); dec.More() {
+			if err := dec.Decode(&body); err != nil {
+				respondError(w, http.StatusBadRequest, "invalid JSON body: "+err.Error())
+				return
+			}
+		}
+
+		if body.Reason == "" {
+			body.Reason = "Rejected by user"
+		}
+
+		action, err := s.RejectAction(id, body.Reason)
+		if err != nil {
+			respondError(w, http.StatusNotFound, err.Error())
+			return
+		}
+
+		respondJSON(w, http.StatusOK, map[string]interface{}{
+			"status":  "rejected",
+			"action":  action,
+			"message": "Action rejected",
+		})
+	}
+}
+
+// GetActionHistory builds a handler that replies 200 with history entries from
+// the store under "history" and their number under "count".
+//
+// The optional "limit" query parameter is forwarded to the store; it defaults
+// to 50 when absent or not an integer.
+func GetActionHistory(s *store.Store) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		limit := 50
+		if raw := r.URL.Query().Get("limit"); raw != "" {
+			if parsed, err := strconv.Atoi(raw); err == nil {
+				limit = parsed
+			}
+		}
+
+		entries := s.GetActionHistory(limit)
+		payload := map[string]interface{}{
+			"count":   len(entries),
+			"history": entries,
+		}
+		respondJSON(w, http.StatusOK, payload)
+	}
+}
+
+// GetWorkflows builds a handler that replies 200 with the store's workflow
+// list under "workflows" and its length under "count".
+func GetWorkflows(s *store.Store) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		defined := s.GetWorkflows()
+		payload := map[string]interface{}{
+			"count":     len(defined),
+			"workflows": defined,
+		}
+		respondJSON(w, http.StatusOK, payload)
+	}
+}
+
+// RunWorkflow builds a handler that acknowledges a run request for the workflow
+// named by the "name" URL parameter.
+//
+// It replies 400 when the name is empty and 202 otherwise. The store is not
+// consulted and nothing is executed here.
+func RunWorkflow(s *store.Store) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		workflow := chi.URLParam(r, "name")
+		if len(workflow) == 0 {
+			respondError(w, http.StatusBadRequest, "missing workflow name")
+			return
+		}
+
+		// Phase 2 only acknowledges the request; actual execution via Claude
+		// Code is planned for Phase 3.
+		ack := map[string]interface{}{
+			"status":   "queued",
+			"workflow": workflow,
+			"message":  "Workflow queued for execution. Use Claude Code CLI to run workflows.",
+		}
+		respondJSON(w, http.StatusAccepted, ack)
+	}
+}
--- a/dashboard/internal/models/models.go
+++ b/dashboard/internal/models/models.go
@@ -0,0 +1,80 @@
+package models
+
+import "time"
+
+// ClusterStatus is the JSON document describing the cluster as a whole: an
+// overall health label, the time of the last update, and the node, alert and
+// application lists.
+type ClusterStatus struct {
+	Health    string       `json:"health"` // Healthy, Degraded, Critical
+	UpdatedAt time.Time    `json:"updated_at"`
+	Nodes     []NodeStatus `json:"nodes"`
+	Alerts    []Alert      `json:"alerts"`
+	Apps      []AppStatus  `json:"apps"`
+}
+
+// NodeStatus describes one cluster node. CPU and Memory are serialized as
+// "cpu_percent" and "memory_percent".
+type NodeStatus struct {
+	Name       string  `json:"name"`
+	Status     string  `json:"status"` // Ready, NotReady
+	CPU        float64 `json:"cpu_percent"`
+	Memory     float64 `json:"memory_percent"`
+	Conditions string  `json:"conditions"` // OK, MemoryPressure, DiskPressure, etc.
+}
+
+// Alert describes a single Prometheus/Alertmanager alert: its name, severity,
+// a human-readable description and the firing time.
+type Alert struct {
+	Name        string    `json:"name"`
+	Severity    string    `json:"severity"` // warning, critical
+	Description string    `json:"description"`
+	FiringAt    time.Time `json:"firing_at"`
+}
+
+// AppStatus describes one ArgoCD application: its sync state, health and the
+// revision string.
+type AppStatus struct {
+	Name       string `json:"name"`
+	SyncStatus string `json:"sync_status"` // Synced, OutOfSync
+	Health     string `json:"health"`      // Healthy, Progressing, Degraded
+	Revision   string `json:"revision"`
+}
+
+// PendingAction is an action that is waiting for a user decision. Details is
+// free-form JSON; Workflow is omitted from the output when empty.
+type PendingAction struct {
+	ID          string                 `json:"id"`
+	CreatedAt   time.Time              `json:"created_at"`
+	Agent       string                 `json:"agent"`
+	Action      string                 `json:"action"`
+	Description string                 `json:"description"`
+	Details     map[string]interface{} `json:"details"`
+	Risk        string                 `json:"risk"` // low, medium, high
+	Workflow    string                 `json:"workflow,omitempty"`
+}
+
+// ActionDecision records what was decided about a pending action and when.
+// DecidedBy and Reason are left out of the JSON output when empty.
+type ActionDecision struct {
+	ID        string    `json:"id"`
+	Decision  string    `json:"decision"` // approved, rejected
+	DecidedAt time.Time `json:"decided_at"`
+	DecidedBy string    `json:"decided_by,omitempty"`
+	Reason    string    `json:"reason,omitempty"`
+}
+
+// ActionHistory is one entry of the audit trail. Details and Workflow are left
+// out of the JSON output when empty.
+type ActionHistory struct {
+	ID           string                 `json:"id"`
+	Timestamp    time.Time              `json:"timestamp"`
+	Agent        string                 `json:"agent"`
+	Action       string                 `json:"action"`
+	Description  string                 `json:"description"`
+	Details      map[string]interface{} `json:"details,omitempty"`
+	Result       string                 `json:"result"` // success, failed; the store also writes approved, rejected
+	AutoApproved bool                   `json:"auto_approved"`
+	Workflow     string                 `json:"workflow,omitempty"`
+}
+
+// Workflow describes a workflow definition and, optionally, its run state.
+// LastRun and Status are left out of the JSON output when unset.
+type Workflow struct {
+	Name        string     `json:"name"`
+	Description string     `json:"description"`
+	Triggers    []string   `json:"triggers"`
+	LastRun     *time.Time `json:"last_run,omitempty"`
+	Status      string     `json:"status,omitempty"` // idle, running, completed, failed
+}
--- a/dashboard/internal/store/store.go
+++ b/dashboard/internal/store/store.go
@@ -0,0 +1,244 @@
+package store
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sync"
+	"time"
+
+	"github.com/will/k8s-agent-dashboard/internal/models"
+)
+
+// Store holds the dashboard state in memory and reads/writes it as JSON files
+// under dataDir.
+type Store struct {
+	dataDir string       // directory that load and save join file names onto
+	mu      sync.RWMutex // held by the exported methods while they touch the cache
+
+	// Cached state. workflows is allocated by New but not read by the methods
+	// in this file.
+	status    *models.ClusterStatus
+	pending   []models.PendingAction
+	history   []models.ActionHistory
+	workflows []models.Workflow
+}
+
+// New returns a Store rooted at dataDir. The directory is created (mode 0755)
+// if needed and any state files already present are loaded; a failure in
+// either step is returned and no Store is produced.
+func New(dataDir string) (*Store, error) {
+	err := os.MkdirAll(dataDir, 0755)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create data dir: %w", err)
+	}
+
+	// Start from empty, non-nil slices so that an empty store has a length of
+	// zero rather than being nil.
+	st := &Store{dataDir: dataDir}
+	st.pending = []models.PendingAction{}
+	st.history = []models.ActionHistory{}
+	st.workflows = []models.Workflow{}
+
+	if err = st.load(); err != nil {
+		return nil, err
+	}
+	return st, nil
+}
+
+// load fills the in-memory cache from pending.json, history.json and
+// status.json in the data directory.
+//
+// A file that does not exist is skipped, leaving the corresponding field as it
+// was. Any other read failure (for example a permission error) and any parse
+// failure is returned, so a store never starts silently empty on top of data
+// it could not read.
+func (s *Store) load() error {
+	if _, err := s.loadFile("pending.json", &s.pending); err != nil {
+		return err
+	}
+	if _, err := s.loadFile("history.json", &s.history); err != nil {
+		return err
+	}
+
+	status := &models.ClusterStatus{}
+	found, err := s.loadFile("status.json", status)
+	if err != nil {
+		return err
+	}
+	if found {
+		s.status = status
+	}
+
+	// A file holding JSON null unmarshals into a nil slice, which would later
+	// be encoded as null instead of []. Normalize back to empty slices.
+	if s.pending == nil {
+		s.pending = make([]models.PendingAction, 0)
+	}
+	if s.history == nil {
+		s.history = make([]models.ActionHistory, 0)
+	}
+	return nil
+}
+
+// loadFile decodes the JSON file called name in the data directory into dst.
+// It reports whether the file existed; a missing file is not an error.
+func (s *Store) loadFile(name string, dst interface{}) (bool, error) {
+	data, err := os.ReadFile(filepath.Join(s.dataDir, name))
+	if os.IsNotExist(err) {
+		return false, nil
+	}
+	if err != nil {
+		return false, fmt.Errorf("failed to read %s: %w", name, err)
+	}
+	if err := json.Unmarshal(data, dst); err != nil {
+		return false, fmt.Errorf("failed to parse %s: %w", name, err)
+	}
+	return true, nil
+}
+
+// save writes data as indented JSON to filename inside the data directory.
+//
+// The content goes to a sibling "<filename>.tmp" first and is then renamed
+// over the target, so an interrupted write cannot leave a truncated file that
+// load would refuse to parse on the next start. The fixed temporary name
+// relies on calls being serialized; every caller in this file holds s.mu for
+// writing.
+func (s *Store) save(filename string, data interface{}) error {
+	encoded, err := json.MarshalIndent(data, "", "  ")
+	if err != nil {
+		return fmt.Errorf("encoding %s: %w", filename, err)
+	}
+
+	path := filepath.Join(s.dataDir, filename)
+	tmpPath := path + ".tmp"
+	if err := os.WriteFile(tmpPath, encoded, 0644); err != nil {
+		// Best-effort cleanup; the write error is the one worth reporting.
+		_ = os.Remove(tmpPath)
+		return fmt.Errorf("writing %s: %w", tmpPath, err)
+	}
+	if err := os.Rename(tmpPath, path); err != nil {
+		// Best-effort cleanup; the rename error is the one worth reporting.
+		_ = os.Remove(tmpPath)
+		return fmt.Errorf("replacing %s: %w", path, err)
+	}
+	return nil
+}
+
+// GetClusterStatus returns the cached cluster status. When none has been
+// loaded or set, it returns a fresh placeholder with health "Unknown", the
+// current time and empty node, alert and app lists.
+func (s *Store) GetClusterStatus() *models.ClusterStatus {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if current := s.status; current != nil {
+		return current
+	}
+
+	// Nothing cached yet: hand out a placeholder with empty, non-nil lists.
+	placeholder := &models.ClusterStatus{
+		Health:    "Unknown",
+		UpdatedAt: time.Now(),
+		Nodes:     []models.NodeStatus{},
+		Alerts:    []models.Alert{},
+		Apps:      []models.AppStatus{},
+	}
+	return placeholder
+}
+
+// UpdateClusterStatus stamps status with the current time, makes it the cached
+// cluster status and writes it to status.json.
+//
+// A nil status is rejected with an error instead of panicking on the
+// dereference. The error from writing the file is returned; the cache has
+// already been replaced at that point.
+func (s *Store) UpdateClusterStatus(status *models.ClusterStatus) error {
+	if status == nil {
+		return fmt.Errorf("cluster status must not be nil")
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	status.UpdatedAt = time.Now()
+	s.status = status
+	return s.save("status.json", status)
+}
+
+// GetPendingActions returns a snapshot of the pending actions.
+//
+// The result is a copy: the caller reads it after the lock is released, while
+// ApproveAction and RejectAction may rewrite the internal slice, so handing
+// out the internal backing array would be a data race. The copy is always
+// non-nil so that an empty list encodes as [] rather than null.
+func (s *Store) GetPendingActions() []models.PendingAction {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	snapshot := make([]models.PendingAction, len(s.pending))
+	copy(snapshot, s.pending)
+	return snapshot
+}
+
+// AddPendingAction stamps action with the current time, appends it to the
+// pending list and writes the list to pending.json, returning the write error
+// if any.
+func (s *Store) AddPendingAction(action models.PendingAction) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	stamped := action
+	stamped.CreatedAt = time.Now()
+	s.pending = append(s.pending, stamped)
+
+	return s.save("pending.json", s.pending)
+}
+
+// ApproveAction moves the pending action with the given id to the front of the
+// history with result "approved" and returns a copy of that action.
+//
+// A non-empty reason is appended to the history description so that it is not
+// lost from the audit trail. Both files are written before the in-memory state
+// is replaced; if either write fails the error is returned and the cache is
+// left unchanged. An unknown id yields an "action not found" error.
+func (s *Store) ApproveAction(id string, reason string) (*models.PendingAction, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	for i := range s.pending {
+		if s.pending[i].ID != id {
+			continue
+		}
+		action := s.pending[i]
+
+		// Build the shortened list in a fresh slice. Removing in place with
+		// append(s.pending[:i], ...) would shift elements inside a backing
+		// array that earlier readers may still be iterating.
+		remaining := make([]models.PendingAction, 0, len(s.pending)-1)
+		remaining = append(remaining, s.pending[:i]...)
+		remaining = append(remaining, s.pending[i+1:]...)
+
+		description := action.Description
+		if reason != "" {
+			description += " (APPROVED: " + reason + ")"
+		}
+		entry := models.ActionHistory{
+			ID:           action.ID,
+			Timestamp:    time.Now(),
+			Agent:        action.Agent,
+			Action:       action.Action,
+			Description:  description,
+			Details:      action.Details,
+			Result:       "approved",
+			AutoApproved: false,
+			Workflow:     action.Workflow,
+		}
+
+		// Newest first, capped at the last 100 entries.
+		history := append([]models.ActionHistory{entry}, s.history...)
+		if len(history) > 100 {
+			history = history[:100]
+		}
+
+		// History is written first: if the second write fails, the action is
+		// still listed as pending on disk rather than missing from both files.
+		if err := s.save("history.json", history); err != nil {
+			return nil, fmt.Errorf("saving history.json: %w", err)
+		}
+		if err := s.save("pending.json", remaining); err != nil {
+			return nil, fmt.Errorf("saving pending.json: %w", err)
+		}
+
+		s.pending = remaining
+		s.history = history
+		return &action, nil
+	}
+	return nil, fmt.Errorf("action not found: %s", id)
+}
+
+// RejectAction moves the pending action with the given id to the front of the
+// history with result "rejected" and returns a copy of that action.
+//
+// The reason is appended to the history description. Both files are written
+// before the in-memory state is replaced; if either write fails the error is
+// returned and the cache is left unchanged. An unknown id yields an "action
+// not found" error.
+func (s *Store) RejectAction(id string, reason string) (*models.PendingAction, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	for i := range s.pending {
+		if s.pending[i].ID != id {
+			continue
+		}
+		action := s.pending[i]
+
+		// Build the shortened list in a fresh slice. Removing in place with
+		// append(s.pending[:i], ...) would shift elements inside a backing
+		// array that earlier readers may still be iterating.
+		remaining := make([]models.PendingAction, 0, len(s.pending)-1)
+		remaining = append(remaining, s.pending[:i]...)
+		remaining = append(remaining, s.pending[i+1:]...)
+
+		entry := models.ActionHistory{
+			ID:           action.ID,
+			Timestamp:    time.Now(),
+			Agent:        action.Agent,
+			Action:       action.Action,
+			Description:  action.Description + " (REJECTED: " + reason + ")",
+			Details:      action.Details,
+			Result:       "rejected",
+			AutoApproved: false,
+			Workflow:     action.Workflow,
+		}
+
+		// Newest first, capped at the last 100 entries.
+		history := append([]models.ActionHistory{entry}, s.history...)
+		if len(history) > 100 {
+			history = history[:100]
+		}
+
+		// History is written first: if the second write fails, the action is
+		// still listed as pending on disk rather than missing from both files.
+		if err := s.save("history.json", history); err != nil {
+			return nil, fmt.Errorf("saving history.json: %w", err)
+		}
+		if err := s.save("pending.json", remaining); err != nil {
+			return nil, fmt.Errorf("saving pending.json: %w", err)
+		}
+
+		s.pending = remaining
+		s.history = history
+		return &action, nil
+	}
+	return nil, fmt.Errorf("action not found: %s", id)
+}
+
+// GetActionHistory returns the first limit history entries. A limit that is
+// zero, negative or larger than the history returns every entry.
+func (s *Store) GetActionHistory(limit int) []models.ActionHistory {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	total := len(s.history)
+	if limit > 0 && limit <= total {
+		return s.history[:limit]
+	}
+	return s.history
+}
+
+// GetWorkflows returns a fixed list of three workflows, all with status
+// "idle". The list is built on every call and does not read the store's
+// workflows field.
+func (s *Store) GetWorkflows() []models.Workflow {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	// Predefined entries based on the workflows in ~/.claude/workflows.
+	healthCheck := models.Workflow{
+		Name:        "cluster-health-check",
+		Description: "Comprehensive cluster health assessment",
+		Triggers:    []string{"schedule: 0 */6 * * *", "manual"},
+		Status:      "idle",
+	}
+	deployApp := models.Workflow{
+		Name:        "deploy-app",
+		Description: "Deploy or update an application",
+		Triggers:    []string{"manual"},
+		Status:      "idle",
+	}
+	crashLoop := models.Workflow{
+		Name:        "pod-crashloop-remediation",
+		Description: "Diagnose and remediate pods in CrashLoopBackOff",
+		Triggers:    []string{"alert: KubePodCrashLooping", "manual"},
+		Status:      "idle",
+	}
+
+	return []models.Workflow{healthCheck, deployApp, crashLoop}
+}