I am trying to emulate this script which summarizes statuses of pods that are not running by namespace and reason.
kubectl get po -A --no-headers |
awk '
BEGIN {
SUBSEP=" "
format = "%-20s %20s %5s\n"
printf format, "NAMESPACE", "STATUS", "COUNT"
}
!/Running/ {a[$1,$4]++}
END {
for (i in a) {split(i,t); printf format, t[1],t[2],a[i]}
}
' | sort
This script produces output similar to this:
$ notrunning
NAMESPACE STATUS COUNT
namespace-01 InvalidImageName 2
namespace-02 InvalidImageName 1
namespace-02 Init:ImagePullBackOff 1
namespace-03 CrashLoopBackOff 2
namespace-03 InvalidImageName 9
namespace-04 Init:ErrImagePull 1
I can't find where kubectl is getting the status or reason. I'm trying code similar to this (leaving out some error checking for brevity). I am not getting the results I expect.
type PodSummary struct {
NotRunning int
Summary map[PodKey]int // Map of namespace+state to count
}
type PodKey struct {
Namespace string
Status string
}
func getPodSummary(kubeconfig, cluster string) PodSummary {
clientset, _ := getClientsetForContext(kubeconfig, cluster)
pods, _ := clientset.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{})
summary := PodSummary{Summary: make(map[PodKey]int)}
for _, pod := range pods.Items {
if pod.Status.Phase != "Running" && pod.Status.Phase != "Succeeded" { // need to check "Completed" also?
podNS := pod.Namespace
summary.NotRunning++
var pk PodKey
for _,containerStatus := range pod.Status.ContainerStatuses {
if containerStatus.State.Waiting != nil {
pk = PodKey {podNS, string(containerStatus.State.Waiting.Reason)}
break
} else { // cannot find it, use this instead.
pk = PodKey {podNS, string(pod.Status.Phase)}
}
}
summary.Summary[pk]++
}
}
return summary
}
I was expecting to get detailed reasons why the pods are failing. Instead I got results like "Pending" which isn't helpful or what I wanted.
Here is an example of a pod in a bad state, yet .status.phase says it is Running. How can I do a better test? And what if the pod has multiple containers? How do I choose which to report on?
$ k get po
NAME READY STATUS RESTARTS AGE
notreallyrunningpod 0/1 CrashLoopBackOff 5152 (85s ago) 10d
$ jq -r '.status.phase' < notreallyrunningpod.json
Running
$ jq -r '.status.containerStatuses | .[] | .state.waiting.reason ' < notreallyrunningpod.json
CrashLoopBackOff