Skip to content

Commit

Permalink
fix: deployment/sts health fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
moshloop committed Nov 16, 2024
1 parent 82deaeb commit 5fe44be
Show file tree
Hide file tree
Showing 19 changed files with 1,373 additions and 303 deletions.
11 changes: 4 additions & 7 deletions pkg/health/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,15 @@ const (
HealthStatusEvicted HealthStatusCode = "Evicted"
HealthStatusCompleted HealthStatusCode = "Completed"
HealthStatusCrashLoopBackoff HealthStatusCode = "CrashLoopBackOff"
HealthStatusCrashLoop HealthStatusCode = "CrashLoop"
HealthStatusCrashed HealthStatusCode = "Crashed"
HealthStatusCreating HealthStatusCode = "Creating"
HealthStatusDeleted HealthStatusCode = "Deleted"
HealthStatusDeleting HealthStatusCode = "Deleting"
HealthStatusTerminating HealthStatusCode = "Terminating"
HealthStatusError HealthStatusCode = "Error"
HealthStatusRolloutFailed HealthStatusCode = "Rollout Failed"
HealthStatusInaccesible HealthStatusCode = "Inaccesible"
HealthStatusInaccesible HealthStatusCode = "Inaccessible"
HealthStatusInfo HealthStatusCode = "Info"
HealthStatusPending HealthStatusCode = "Pending"
HealthStatusMaintenance HealthStatusCode = "Maintenance"
Expand Down Expand Up @@ -147,7 +148,7 @@ func GetResourceHealth(
terminatingFor := time.Since(obj.GetDeletionTimestamp().Time)
return &HealthStatus{
Status: "TerminatingStalled",
Health: HealthUnhealthy,
Health: HealthWarning,
Message: fmt.Sprintf("terminating for %v", duration.ShortHumanDuration(terminatingFor.Truncate(time.Hour))),
}, nil
}
Expand Down Expand Up @@ -198,10 +199,6 @@ func GetHealthCheckFunc(gvk schema.GroupVersionKind) func(obj *unstructured.Unst
return getNodeHealth
}

if strings.HasSuffix(gvk.Group, ".crossplane.io") || strings.HasSuffix(gvk.Group, ".upbound.io") {
return GetDefaultHealth
}

switch gvk.Group {
case "apps":
switch gvk.Kind {
Expand Down Expand Up @@ -264,5 +261,5 @@ func GetHealthCheckFunc(gvk schema.GroupVersionKind) func(obj *unstructured.Unst
return getHPAHealth
}
}
return nil
return GetDefaultHealth
}
18 changes: 18 additions & 0 deletions pkg/health/health_cnrm_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package health_test

import (
"testing"

"github.com/flanksource/is-healthy/pkg/health"
)

// TestCnrmContainer runs the shared assertAppHealthMsg helper against the
// failed ContainerCluster fixture, passing the status, health and message
// that the fixture is expected to produce.
func TestCnrmContainer(t *testing.T) {
	// Untyped constants keep the arguments assignable to whatever parameter
	// types the helper declares, exactly like the inline literals would be.
	const (
		fixture     = "Kubernetes::ContainerCluster/failed.yaml"
		wantStatus  = "UpdateFailed"
		wantMessage = "Update call failed: error applying desired state: summary: googleapi: Error 403: Google Compute Engine: Required 'compute.networks.get' permission for 'projects/flanksource-prod/global/networks/flanksource-workload'.\nDetails:\n[\n {\n \"@type\": \"type.googleapis.com/google.rpc.RequestInfo\",\n \"requestId\": \"0xf1e9e3ca2797eb18\"\n },\n {\n \"@type\": \"type.googleapis.com/google.rpc.ErrorInfo\",\n \"domain\": \"container.googleapis.com\",\n \"reason\": \"GCE_PERMISSION_DENIED\"\n }\n]\n, forbidden"
	)

	// NOTE(review): the boolean is presumably the expected Ready flag —
	// confirm against the assertAppHealthMsg signature.
	assertAppHealthMsg(t, fixture, wantStatus, health.HealthUnhealthy, true, wantMessage)
}
145 changes: 81 additions & 64 deletions pkg/health/health_deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ package health

import (
"fmt"
"strings"
"time"

"github.com/samber/lo"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"

"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
)

Expand All @@ -24,85 +26,100 @@ func getDeploymentHealth(obj *unstructured.Unstructured) (*HealthStatus, error)
}
}

func getAppsv1DeploymentHealth(deployment *appsv1.Deployment, obj *unstructured.Unstructured) (*HealthStatus, error) {
var containersWaitingForReadiness []string
for _, container := range deployment.Spec.Template.Spec.Containers {
if container.ReadinessProbe != nil && container.ReadinessProbe.InitialDelaySeconds > 0 {
deadline := deployment.CreationTimestamp.Add(
time.Second * time.Duration(container.ReadinessProbe.InitialDelaySeconds),
)
if time.Now().Before(deadline) {
containersWaitingForReadiness = append(containersWaitingForReadiness, container.Name)
}
}
// ReplicaStatus bundles the replica counters of a workload together with the
// object and container specs they were read from, so that a single health
// routine can evaluate them.
type ReplicaStatus struct {
	// Object is the unstructured resource the counters belong to.
	Object *unstructured.Unstructured
	// Containers are the containers of the workload's pod template.
	Containers []corev1.Container

	// Desired is the replica count requested in the spec.
	Desired int
	// Replicas is the replica count reported in the status.
	Replicas int
	// Ready is the number of replicas reported as ready.
	Ready int
	// Updated is the number of replicas reported as updated.
	Updated int
	// Unavailable is the number of replicas reported as unavailable.
	Unavailable int
}

func (rs ReplicaStatus) String() string {
s := fmt.Sprintf("%d/%d ready", rs.Ready, rs.Desired)

if rs.Replicas != rs.Updated {
s += fmt.Sprintf(", %d updating", rs.Replicas-rs.Updated)
}

if len(containersWaitingForReadiness) > 0 {
return &HealthStatus{
Health: HealthUnknown,
Status: HealthStatusStarting,
Message: fmt.Sprintf(
"Container(s) %s is waiting for readiness probe",
strings.Join(containersWaitingForReadiness, ","),
),
}, nil
if rs.Replicas > rs.Desired {
s += fmt.Sprintf(", %d terminating", rs.Replicas-rs.Desired)
}
return s
}

status, err := GetDefaultHealth(obj)
if err != nil {
return status, err
func getReplicaHealth(s ReplicaStatus) *HealthStatus {
hs := &HealthStatus{
Message: s.String(),
}
startDeadline := GetStartDeadline(s.Containers...)
age := time.Since(s.Object.GetCreationTimestamp().Time).Truncate(time.Minute).Abs()

replicas := int32(0)
gs := GetGenericStatus(s.Object)

if deployment.Spec.Replicas != nil {
replicas = *deployment.Spec.Replicas
}
progressing := gs.FindCondition("Progressing")
isStarting := age < startDeadline
isProgressDeadlineExceeded := !isStarting && (progressing.Reason == "ProgressDeadlineExceeded")
hs.Ready = progressing.Status == "True"

if replicas == 0 && deployment.Status.Replicas == 0 {
return &HealthStatus{
Ready: true,
Status: HealthStatusScaledToZero,
Health: HealthUnknown,
}, nil
}
hs.Health = lo.Ternary(s.Ready >= s.Desired, HealthHealthy, lo.Ternary(s.Ready > 0, HealthWarning, HealthUnhealthy))

if deployment.Status.ReadyReplicas == replicas {
status.PrependMessage("%d pods ready", deployment.Status.ReadyReplicas)
} else {
status.PrependMessage("%d of %d pods ready", deployment.Status.ReadyReplicas, replicas)
if s.Desired == 0 && s.Replicas == 0 {
hs.Ready = true
hs.Status = HealthStatusScaledToZero
hs.Health = HealthUnknown
return hs
}

if deployment.Spec.Paused {
status.Ready = false
status.Status = HealthStatusSuspended
return status, err
if s.Replicas == 0 {
if isProgressDeadlineExceeded {
hs.Status = "Failed Create"
hs.Health = HealthUnhealthy
} else {
hs.Status = "Pending"
hs.Health = HealthUnknown
}
} else if s.Ready == 0 && isStarting && !isProgressDeadlineExceeded {
hs.Health = HealthUnknown
hs.Status = HealthStatusStarting
} else if s.Ready == 0 && !isStarting {
hs.Health = HealthUnhealthy
hs.Status = HealthStatusCrashLoop
} else if s.Desired == 0 && s.Replicas > 0 {
hs.Status = HealthStatusScalingDown
hs.Health = lo.Ternary(isProgressDeadlineExceeded, HealthWarning, HealthHealthy)
} else if s.Ready == s.Desired && s.Desired == s.Updated && s.Replicas == s.Desired {
hs.Status = HealthStatusRunning
} else if s.Desired != s.Updated {
hs.Status = HealthStatusUpdating
} else if s.Replicas > s.Desired {
hs.Status = HealthStatusScalingDown
} else if s.Replicas < s.Desired {
hs.Status = HealthStatusScalingUp
}

if deployment.Status.ReadyReplicas > 0 {
status.Status = HealthStatusRunning
if isStarting && hs.Health == HealthUnhealthy {
hs.Health = HealthUnknown
}

if status.Health == HealthUnhealthy {
return status, nil
return hs
}

func getAppsv1DeploymentHealth(deployment *appsv1.Deployment, obj *unstructured.Unstructured) (*HealthStatus, error) {
replicas := int32(0)
if deployment.Spec.Replicas != nil {
replicas = *deployment.Spec.Replicas
}

if deployment.Status.ReadyReplicas < replicas {
status.AppendMessage("%d starting", deployment.Status.Replicas-deployment.Status.ReadyReplicas)
if deployment.Status.Replicas < replicas {
status.AppendMessage("%d creating", replicas-deployment.Status.Replicas)
}
status.Ready = false
status.Status = HealthStatusStarting
} else if deployment.Status.UpdatedReplicas < replicas {
status.AppendMessage("%d updating", replicas-deployment.Status.UpdatedReplicas)
status.Ready = false
status.Status = HealthStatusRollingOut
} else if deployment.Status.Replicas > replicas {
status.AppendMessage("%d pods terminating", deployment.Status.Replicas-replicas)
status.Ready = false
status.Status = HealthStatusScalingDown
replicaHealth := getReplicaHealth(
ReplicaStatus{
Object: obj,
Containers: deployment.Spec.Template.Spec.Containers,
Desired: int(replicas), Replicas: int(deployment.Status.Replicas),
Ready: int(deployment.Status.ReadyReplicas), Updated: int(deployment.Status.UpdatedReplicas),
Unavailable: int(deployment.Status.UnavailableReplicas),
})

if deployment.Spec.Paused {
replicaHealth.Status = HealthStatusSuspended
replicaHealth.Ready = false
}

return status, nil
return replicaHealth, nil
}
76 changes: 26 additions & 50 deletions pkg/health/health_replicaset.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,13 @@ package health

import (
"fmt"
"strings"
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
)

// duration after the creation of a replica set
// within which we never deem it to be unhealthy.
const replicaSetBufferPeriod = time.Minute * 10

func getReplicaSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error) {
gvk := obj.GroupVersionKind()
switch gvk {
Expand All @@ -29,64 +24,45 @@ func getReplicaSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error)
}
}

func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, error) {
isWithinBufferPeriod := replicaSet.CreationTimestamp.Add(replicaSetBufferPeriod).After(time.Now())

var containersWaitingForReadiness []string
for _, container := range replicaSet.Spec.Template.Spec.Containers {
if container.ReadinessProbe != nil && container.ReadinessProbe.InitialDelaySeconds > 0 {
deadline := replicaSet.CreationTimestamp.Add(
time.Second * time.Duration(container.ReadinessProbe.InitialDelaySeconds),
)
if time.Now().Before(deadline) {
containersWaitingForReadiness = append(containersWaitingForReadiness, container.Name)
}
}
func getAppsv1ReplicaSetHealth(rs *appsv1.ReplicaSet) (*HealthStatus, error) {
replicas := int32(0)
if rs.Spec.Replicas != nil {
replicas = *rs.Spec.Replicas
}

if len(containersWaitingForReadiness) > 0 {
return &HealthStatus{
Health: HealthUnknown,
Status: HealthStatusStarting,
Message: fmt.Sprintf(
"Container(s) %s is waiting for readiness probe",
strings.Join(containersWaitingForReadiness, ","),
),
}, nil
startDeadline := GetStartDeadline(rs.Spec.Template.Spec.Containers...)
age := time.Since(rs.CreationTimestamp.Time).Truncate(time.Minute).Abs()

health := HealthHealthy
if rs.Status.ReadyReplicas == 0 {
if rs.Status.Replicas > 0 && age < startDeadline {
health = HealthUnknown
} else {
health = HealthUnhealthy
}
} else if rs.Status.ReadyReplicas < replicas {
health = HealthWarning
} else if rs.Status.ReadyReplicas >= replicas {
health = HealthHealthy
}

health := HealthUnknown
if (replicaSet.Spec.Replicas == nil || *replicaSet.Spec.Replicas == 0) && replicaSet.Status.Replicas == 0 {
if replicas == 0 && rs.Status.Replicas == 0 {
return &HealthStatus{
Ready: true,
Status: HealthStatusScaledToZero,
Health: health,
}, nil
}

if replicaSet.Spec.Replicas != nil && replicaSet.Status.ReadyReplicas >= *replicaSet.Spec.Replicas {
health = HealthHealthy
} else if replicaSet.Status.ReadyReplicas > 0 {
health = HealthWarning
} else {
health = HealthUnhealthy
}

if (health == HealthUnhealthy || health == HealthWarning) && isWithinBufferPeriod {
// within the buffer period, we don't mark a ReplicaSet as unhealthy
health = HealthUnknown
}

if replicaSet.Generation == replicaSet.Status.ObservedGeneration &&
replicaSet.Status.ReadyReplicas == *replicaSet.Spec.Replicas {
if rs.Generation == rs.Status.ObservedGeneration &&
rs.Status.ReadyReplicas == *rs.Spec.Replicas {
return &HealthStatus{
Health: health,
Status: HealthStatusRunning,
Ready: true,
}, nil
}

failCondition := getAppsv1ReplicaSetCondition(replicaSet.Status, appsv1.ReplicaSetReplicaFailure)
failCondition := getAppsv1ReplicaSetCondition(rs.Status, appsv1.ReplicaSetReplicaFailure)
if failCondition != nil && failCondition.Status == corev1.ConditionTrue {
return &HealthStatus{
Health: health,
Expand All @@ -95,19 +71,19 @@ func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, er
}, nil
}

if replicaSet.Status.ReadyReplicas < *replicaSet.Spec.Replicas {
if rs.Status.ReadyReplicas < *rs.Spec.Replicas {
return &HealthStatus{
Health: health,
Status: HealthStatusScalingUp,
Message: fmt.Sprintf("%d of %d pods ready", replicaSet.Status.ReadyReplicas, *replicaSet.Spec.Replicas),
Message: fmt.Sprintf("%d of %d pods ready", rs.Status.ReadyReplicas, *rs.Spec.Replicas),
}, nil
}

if replicaSet.Status.ReadyReplicas > *replicaSet.Spec.Replicas {
if rs.Status.ReadyReplicas > *rs.Spec.Replicas {
return &HealthStatus{
Health: health,
Status: HealthStatusScalingDown,
Message: fmt.Sprintf("%d pods terminating", replicaSet.Status.ReadyReplicas-*replicaSet.Spec.Replicas),
Message: fmt.Sprintf("%d pods terminating", rs.Status.ReadyReplicas-*rs.Spec.Replicas),
}, nil
}

Expand Down
Loading

0 comments on commit 5fe44be

Please sign in to comment.