Better events and timeouts for health checks

2016-11-22 18:30:31 -08:00 · 2016-11-22 18:30:31 -08:00 · d7fb15d972
commit d7fb15d972
parent b789b78837
2 changed files with 72 additions and 33 deletions
--- a/controllers/gce/controller/utils.go
+++ b/controllers/gce/controller/utils.go
@ -242,7 +242,7 @@ func (t *GCETranslator) toURLMap(ing *extensions.Ingress) (utils.GCEURLMap, erro
 				// to all other services under the assumption that the user will
 				// modify nodeport.
 				if _, ok := err.(errorNodePortNotFound); ok {
-					glog.Infof("%v", err)
+					t.recorder.Eventf(ing, api.EventTypeWarning, "Service", err.(errorNodePortNotFound).Error())
 					continue
 				}

@ -269,6 +269,10 @@ func (t *GCETranslator) toURLMap(ing *extensions.Ingress) (utils.GCEURLMap, erro
 	}
 	defaultBackend, _ := t.toGCEBackend(ing.Spec.Backend, ing.Namespace)
 	hostPathBackend.PutDefaultBackend(defaultBackend)
+
+	if defaultBackend != nil && ing.Spec.Backend != nil {
+		t.recorder.Eventf(ing, api.EventTypeNormal, "GCE", fmt.Sprintf("default backend set to %v:%v", ing.Spec.Backend.ServiceName, defaultBackend.Port))
+	}
 	return hostPathBackend, nil
 }

@ -461,10 +465,17 @@ func (t *GCETranslator) HealthCheck(port int64) (*compute.HttpHealthCheck, error
 	if err != nil {
 		return nil, err
 	}
+	var ingresses []extensions.Ingress
+	var healthCheck *compute.HttpHealthCheck
 	// Find the label and target port of the one service with the given nodePort
 	for _, s := range sl {
 		for _, p := range s.Spec.Ports {
-			if int32(port) == p.NodePort {
+
+			// Only one Service can match this nodePort; try to look up
+			// the readiness probe of the pods behind it.
+			if int32(port) != p.NodePort {
+				continue
+			}
 			rp, err := t.getHTTPProbe(*s, p.TargetPort)
 			if err != nil {
 				return nil, err
@ -473,28 +484,47 @@ func (t *GCETranslator) HealthCheck(port int64) (*compute.HttpHealthCheck, error
 				glog.Infof("No pod in service %v with node port %v has declared a matching readiness probe for health checks.", s.Name, port)
 				break
 			}
+
 			healthPath := rp.Handler.HTTPGet.Path
 			// GCE requires a leading "/" for health check urls.
 			if string(healthPath[0]) != "/" {
 				healthPath = fmt.Sprintf("/%v", healthPath)
 			}
+
 			host := rp.Handler.HTTPGet.Host
 			glog.Infof("Found custom health check for Service %v nodeport %v: %v%v", s.Name, port, host, healthPath)
-				return &compute.HttpHealthCheck{
+			// remember the ingresses that use this Service so we can send
+			// the right events
+			ingresses, err = t.ingLister.GetServiceIngress(s)
+			if err != nil {
+				glog.Warningf("Failed to list ingresses for service %v", s.Name)
+			}
+
+			healthCheck = &compute.HttpHealthCheck{
 				Port:        port,
 				RequestPath: healthPath,
 				Host:        host,
 				Description: "kubernetes L7 health check from readiness probe.",
-					CheckIntervalSec:   int64(rp.PeriodSeconds),
+				// Set a low healthy threshold and a high unhealthy threshold.
+				// We're only trying to detect whether node networking is
+				// broken; service-level outages will be detected sooner
+				// by kube-proxy.
+				CheckIntervalSec:   int64(rp.PeriodSeconds + utils.DefaultHealthCheckInterval),
 				TimeoutSec:         int64(rp.TimeoutSeconds),
-					HealthyThreshold:   int64(rp.SuccessThreshold),
-					UnhealthyThreshold: int64(rp.FailureThreshold),
+				HealthyThreshold:   utils.DefaultHealthyThreshold,
+				UnhealthyThreshold: utils.DefaultUnhealthyThreshold,
 				// TODO: include headers after updating compute godep.
-				}, nil
+			}
+			break
 		}
 	}
+	if healthCheck == nil {
+		healthCheck = utils.DefaultHealthCheckTemplate(port)
 	}
-	return utils.DefaultHealthCheckTemplate(port), nil
+	for _, ing := range ingresses {
+		t.recorder.Eventf(&ing, api.EventTypeNormal, "GCE", fmt.Sprintf("health check using %v:%v%v", healthCheck.Host, healthCheck.Port, healthCheck.RequestPath))
+	}
+	return healthCheck, nil
 }

 // PodsByCreationTimestamp sorts a list of Pods by creation timestamp, using their names as a tie breaker.
--- a/controllers/gce/utils/utils.go
+++ b/controllers/gce/utils/utils.go
@ -79,6 +79,15 @@ const (
 	// K8sAnnotationPrefix is the prefix used in annotations used to record
 	// debug information in the Ingress annotations.
 	K8sAnnotationPrefix = "ingress.kubernetes.io"
+
+	// DefaultHealthCheckInterval defines how frequently a probe runs, in seconds.
+	DefaultHealthCheckInterval = 60
+	// DefaultHealthyThreshold defines the number of successful probes required to declare a backend "healthy".
+	DefaultHealthyThreshold = 1
+	// DefaultUnhealthyThreshold defines the number of failed probes required to declare a backend "unhealthy".
+	DefaultUnhealthyThreshold = 10
+	// DefaultTimeoutSeconds defines the timeout of each probe
+	DefaultTimeoutSeconds = 60
 )

 // Namer handles centralized naming for the cluster.
@ -305,12 +314,12 @@ func DefaultHealthCheckTemplate(port int64) *compute.HttpHealthCheck {
 		RequestPath: "",
 		Description: "Default kubernetes L7 Loadbalancing health check.",
 		// How often to health check.
-		CheckIntervalSec: 1,
+		CheckIntervalSec: DefaultHealthCheckInterval,
 		// How long to wait before claiming failure of a health check.
-		TimeoutSec: 1,
+		TimeoutSec: DefaultTimeoutSeconds,
 		// Number of healthchecks to pass for a vm to be deemed healthy.
-		HealthyThreshold: 1,
+		HealthyThreshold: DefaultHealthyThreshold,
 		// Number of healthchecks to fail before the vm is deemed unhealthy.
-		UnhealthyThreshold: 10,
+		UnhealthyThreshold: DefaultUnhealthyThreshold,
 	}
 }