package k8s

import (
	"fmt"
	"strconv"
	"strings"
	"time"

	corev1 "k8s.io/api/core/v1"
	"tower/internal/model"
)

// IssuesFromPods applies the PLAN.md pod rules.
//
// Pure rule function: it does not talk to the API server.
//
// Nil pods are skipped. For every other pod it emits, in this order:
//   - a P1 "Pending" issue when the pod phase is Pending, the creation
//     timestamp is set, and now minus that timestamp is at least pendingGrace;
//   - for each entry of Status.ContainerStatuses: a CrashLoopBackOff issue
//     (P1, or P0 once the restart count reaches crashLoopRestartThreshold),
//     an ImagePullBackOff/ErrImagePull issue (P1), an OOMKilled issue (P1),
//     and a "High restarts" issue (P2) when the restart count reaches the
//     threshold and the container is not waiting with a reason.
//
// A non-positive crashLoopRestartThreshold falls back to 5 and a
// non-positive pendingGrace falls back to 120 seconds. The returned slice is
// never nil.
func IssuesFromPods(pods []*corev1.Pod, now time.Time, pendingGrace time.Duration, crashLoopRestartThreshold int) []model.Issue {
	if crashLoopRestartThreshold <= 0 {
		crashLoopRestartThreshold = 5
	}
	if pendingGrace <= 0 {
		pendingGrace = 120 * time.Second
	}

	out := make([]model.Issue, 0, 32)
	for _, p := range pods {
		if p == nil {
			continue
		}
		out = appendPodPendingIssue(out, p, now, pendingGrace)
		// Container-derived signals. Index instead of ranging by value so the
		// (fairly large) ContainerStatus structs are not copied.
		for i := range p.Status.ContainerStatuses {
			out = appendPodContainerIssues(out, p, &p.Status.ContainerStatuses[i], crashLoopRestartThreshold)
		}
	}
	return out
}

// appendPodPendingIssue appends a "Pending for too long" issue for p when it
// qualifies and returns the (possibly extended) slice.
func appendPodPendingIssue(out []model.Issue, p *corev1.Pod, now time.Time, pendingGrace time.Duration) []model.Issue {
	if p.Status.Phase != corev1.PodPending {
		return out
	}
	// Without a creation timestamp the age is meaningless, so stay silent.
	if p.CreationTimestamp.IsZero() {
		return out
	}
	age := now.Sub(p.CreationTimestamp.Time)
	if age < pendingGrace {
		return out
	}

	ns, name := p.Namespace, p.Name
	return append(out, model.Issue{
		ID:       fmt.Sprintf("k8s:pod:%s/%s:Pending", ns, name),
		Category: model.CategoryKubernetes,
		Priority: model.PriorityP1,
		Title:    fmt.Sprintf("Pod Pending: %s/%s", ns, name),
		Details:  fmt.Sprintf("Pod has been Pending for %s.", age.Truncate(time.Second)),
		Evidence: map[string]string{
			"kind":      "Pod",
			"reason":    "Pending",
			"namespace": ns,
			"pod":       name,
			"phase":     string(p.Status.Phase),
			"node":      p.Spec.NodeName,
		},
		SuggestedFix: podDescribeCommand(ns, name),
	})
}

// appendPodContainerIssues appends every issue derived from a single
// container status of p and returns the (possibly extended) slice.
func appendPodContainerIssues(out []model.Issue, p *corev1.Pod, cs *corev1.ContainerStatus, crashLoopRestartThreshold int) []model.Issue {
	ns, name, cname := p.Namespace, p.Name, cs.Name
	restarts := int(cs.RestartCount)

	// CrashLoopBackOff and pull errors are reported via Waiting state.
	if w := cs.State.Waiting; w != nil {
		switch w.Reason {
		case "CrashLoopBackOff":
			pri := model.PriorityP1
			if restarts >= crashLoopRestartThreshold {
				pri = model.PriorityP0
			}
			out = append(out, model.Issue{
				ID:           fmt.Sprintf("k8s:pod:%s/%s:CrashLoop:%s", ns, name, cname),
				Category:     model.CategoryKubernetes,
				Priority:     pri,
				Title:        fmt.Sprintf("CrashLoopBackOff: %s/%s (%s)", ns, name, cname),
				Details:      firstNonEmpty(w.Message, "Container is in CrashLoopBackOff."),
				Evidence:     podContainerEvidence(p, cs, "CrashLoopBackOff"),
				SuggestedFix: podDescribeAndLogsCommand(ns, name, cname, true),
			})
		case "ImagePullBackOff", "ErrImagePull":
			out = append(out, model.Issue{
				ID:           fmt.Sprintf("k8s:pod:%s/%s:ImagePull:%s", ns, name, cname),
				Category:     model.CategoryKubernetes,
				Priority:     model.PriorityP1,
				Title:        fmt.Sprintf("%s: %s/%s (%s)", w.Reason, ns, name, cname),
				Details:      firstNonEmpty(w.Message, "Container image pull is failing."),
				Evidence:     podContainerEvidence(p, cs, w.Reason),
				SuggestedFix: podDescribeCommand(ns, name),
			})
		}
	}

	// OOMKilled is typically stored in LastTerminationState, but a container
	// that was killed and not restarted carries it in its current state.
	if term, previous := oomKilledTermination(cs); term != nil {
		out = append(out, model.Issue{
			ID:       fmt.Sprintf("k8s:pod:%s/%s:OOMKilled:%s", ns, name, cname),
			Category: model.CategoryKubernetes,
			Priority: model.PriorityP1,
			Title:    fmt.Sprintf("OOMKilled: %s/%s (%s)", ns, name, cname),
			Details:  firstNonEmpty(term.Message, "Container was killed due to OOM."),
			Evidence: podContainerEvidence(p, cs, "OOMKilled"),
			// --previous only makes sense when the kill belongs to an earlier
			// instance of the container.
			SuggestedFix: podDescribeAndLogsCommand(ns, name, cname, previous),
		})
	}

	// High restarts even if running.
	// Keep this lower priority than active CrashLoopBackOff: a container that
	// is waiting with a reason is left to the Waiting rules above.
	notWaitingWithReason := cs.State.Waiting == nil || cs.State.Waiting.Reason == ""
	if restarts >= crashLoopRestartThreshold && notWaitingWithReason {
		out = append(out, model.Issue{
			ID:           fmt.Sprintf("k8s:pod:%s/%s:Restarts:%s", ns, name, cname),
			Category:     model.CategoryKubernetes,
			Priority:     model.PriorityP2,
			Title:        fmt.Sprintf("High restarts: %s/%s (%s)", ns, name, cname),
			Details:      "Container has restarted multiple times.",
			Evidence:     podContainerEvidence(p, cs, "HighRestarts"),
			SuggestedFix: podDescribeCommand(ns, name),
		})
	}

	return out
}

// oomKilledTermination returns the terminated state whose reason is
// "OOMKilled", preferring the last termination state over the current one.
// previous reports whether the match came from the last termination state.
// It returns (nil, false) when neither state records an OOM kill.
func oomKilledTermination(cs *corev1.ContainerStatus) (term *corev1.ContainerStateTerminated, previous bool) {
	if t := cs.LastTerminationState.Terminated; t != nil && t.Reason == "OOMKilled" {
		return t, true
	}
	if t := cs.State.Terminated; t != nil && t.Reason == "OOMKilled" {
		return t, false
	}
	return nil, false
}

// podContainerEvidence builds the evidence map shared by all
// container-level issues.
func podContainerEvidence(p *corev1.Pod, cs *corev1.ContainerStatus, reason string) map[string]string {
	return map[string]string{
		"kind":      "Pod",
		"reason":    reason,
		"namespace": p.Namespace,
		"pod":       p.Name,
		"container": cs.Name,
		"restarts":  strconv.Itoa(int(cs.RestartCount)),
		"node":      p.Spec.NodeName,
	}
}

// podDescribeCommand returns the kubectl command that describes the pod.
func podDescribeCommand(ns, name string) string {
	return fmt.Sprintf("kubectl -n %s describe pod %s", ns, name)
}

// podDescribeAndLogsCommand returns the describe command followed, on its
// own line, by the kubectl logs command for the given container. When
// previous is true the logs command carries --previous.
func podDescribeAndLogsCommand(ns, name, container string, previous bool) string {
	logs := fmt.Sprintf("kubectl -n %s logs %s -c %s", ns, name, container)
	if previous {
		logs += " --previous"
	}
	// One command per line so the suggestion can be pasted as-is.
	return strings.TrimSpace(podDescribeCommand(ns, name) + "\n" + logs)
}

// firstNonEmpty returns v unchanged unless it is empty or consists only of
// whitespace, in which case it returns fallback.
func firstNonEmpty(v, fallback string) string {
	if strings.TrimSpace(v) != "" {
		return v
	}
	return fallback
}