Retry GCE client creation.

parent 8d59739bb1
commit 73afef4bec

3 changed files with 54 additions and 12 deletions
@@ -137,7 +137,7 @@ func TestBackendPoolSync(t *testing.T) {
 	// Repopulate the pool with a cloud list, which now includes the 82 port
 	// backend. This would happen if, say, an ingress backend is removed
 	// while the controller is restarting.
-	pool.(*Backends).snapshotter.(*storage.CloudListingPool).ReplinishPool()
+	pool.(*Backends).snapshotter.(*storage.CloudListingPool).ReplenishPool()
 
 	pool.GC(svcNodePorts)
 
@@ -18,6 +18,8 @@ package controller
 
 import (
 	"fmt"
+	"net/http"
+	"time"
 
 	"k8s.io/contrib/ingress/controllers/gce/backends"
 	"k8s.io/contrib/ingress/controllers/gce/healthchecks"
@@ -26,6 +28,8 @@ import (
 	"k8s.io/contrib/ingress/controllers/gce/utils"
 	"k8s.io/kubernetes/pkg/cloudprovider"
 	gce "k8s.io/kubernetes/pkg/cloudprovider/providers/gce"
+
+	"github.com/golang/glog"
 )
 
 const (
@@ -54,6 +58,9 @@ const (
 
 	// Names longer than this are truncated, because of GCE restrictions.
 	nameLenLimit = 62
+
+	// Sleep interval to retry cloud client creation.
+	cloudClientRetryInterval = 10 * time.Second
 )
 
 // ClusterManager manages cluster resource pools.
@@ -70,6 +77,14 @@ func (c *ClusterManager) IsHealthy() (err error) {
 	// TODO: Expand on this, for now we just want to detect when the GCE client
 	// is broken.
 	_, err = c.backendPool.List()
+
+	// If this container is scheduled on a node without compute/rw it is
+	// effectively useless, but it is healthy. Reporting it as unhealthy
+	// will lead to container crashlooping.
+	if utils.IsHTTPErrorCode(err, http.StatusForbidden) {
+		glog.Infof("Reporting cluster as healthy, but unable to list backends: %v", err)
+		return nil
+	}
 	return
 }
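A note on the Forbidden special case above: it only matters because IsHealthy is the kind of check that typically backs a liveness or health endpoint, where a returned error becomes a pod restart. Below is a minimal sketch of such wiring, assuming a plain net/http handler and a stand-in fakeManager; it is not the controller's actual health-check plumbing.

// Sketch only: how an IsHealthy-style check might back a /healthz endpoint.
// healthChecker and fakeManager are illustrative stand-ins, not controller code.
package main

import (
	"fmt"
	"log"
	"net/http"
)

type healthChecker interface {
	IsHealthy() error
}

type fakeManager struct{}

func (fakeManager) IsHealthy() error { return nil }

func main() {
	var cm healthChecker = fakeManager{}
	http.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
		if err := cm.IsHealthy(); err != nil {
			// A non-200 response here is what makes the kubelet restart the pod;
			// tolerating StatusForbidden inside IsHealthy avoids that crash loop.
			http.Error(w, fmt.Sprintf("unhealthy: %v", err), http.StatusInternalServerError)
			return
		}
		fmt.Fprint(w, "ok")
	})
	log.Fatal(http.ListenAndServe(":8081", nil))
}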
@@ -138,6 +153,32 @@ func defaultInstanceGroupName(clusterName string) string {
 	return fmt.Sprintf("%v-%v", instanceGroupPrefix, clusterName)
 }
 
+func getGCEClient() *gce.GCECloud {
+	// Creating the cloud interface involves resolving the metadata server to get
+	// an oauth token. If this fails, the token provider assumes it's not on GCE.
+	// No errors are thrown. So we need to keep retrying till it works because
+	// we know we're on GCE.
+	for {
+		cloudInterface, err := cloudprovider.GetCloudProvider("gce", nil)
+		if err == nil {
+			cloud := cloudInterface.(*gce.GCECloud)
+
+			// If this controller is scheduled on a node without compute/rw
+			// it won't be allowed to list backends. We can assume that the
+			// user has no need for Ingress in this case. If they grant
+			// permissions to the node they will have to restart the controller
+			// manually to re-create the client.
+			if _, err = cloud.ListBackendServices(); err == nil || utils.IsHTTPErrorCode(err, http.StatusForbidden) {
+				return cloud
+			}
+			glog.Warningf("Failed to list backend services, retrying: %v", err)
+		} else {
+			glog.Warningf("Failed to retrieve cloud interface, retrying: %v", err)
+		}
+		time.Sleep(cloudClientRetryInterval)
+	}
+}
+
 // NewClusterManager creates a cluster manager for shared resources.
 // - name: is the name used to tag cluster wide shared resources. This is the
 //   string passed to glbc via --gce-cluster-name.
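The loop above retries forever by design: as its comment explains, a failed metadata lookup looks the same as not running on GCE, so the only safe option is to keep trying. Below is a generic sketch of the same retry-until-ready shape with the client factory and interval injected; retryClient, client, and the fake factory are illustrative names, not controller code.

// Sketch of the retry pattern used by getGCEClient, generalized so the
// factory and sleep interval can be injected (e.g. for tests).
package main

import (
	"errors"
	"fmt"
	"time"
)

// client stands in for *gce.GCECloud; illustrative only.
type client struct{ name string }

// retryClient keeps calling newClient until it succeeds, sleeping between
// attempts, mirroring the shape of getGCEClient minus the GCE calls.
func retryClient(newClient func() (*client, error), interval time.Duration) *client {
	for {
		c, err := newClient()
		if err == nil {
			return c
		}
		fmt.Printf("client creation failed, retrying in %v: %v\n", interval, err)
		time.Sleep(interval)
	}
}

func main() {
	attempts := 0
	c := retryClient(func() (*client, error) {
		attempts++
		if attempts < 3 {
			return nil, errors.New("metadata server not reachable yet")
		}
		return &client{name: "gce"}, nil
	}, 10*time.Millisecond)
	fmt.Printf("got client %q after %d attempts\n", c.name, attempts)
}

Injecting the factory is what makes this shape exercisable in a unit test without a metadata server.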
@@ -149,11 +190,12 @@ func NewClusterManager(
 	defaultBackendNodePort int64,
 	defaultHealthCheckPath string) (*ClusterManager, error) {
 
-	cloudInterface, err := cloudprovider.GetCloudProvider("gce", nil)
-	if err != nil {
-		return nil, err
-	}
-	cloud := cloudInterface.(*gce.GCECloud)
+	// TODO: Make this more resilient. Currently we create the cloud client
+	// and pass it through to all the pools. This makes unittesting easier.
+	// However if the cloud client suddenly fails, we should try to re-create it
+	// and continue.
+	cloud := getGCEClient()
+
 	cluster := ClusterManager{ClusterNamer: utils.Namer{name}}
 	zone, err := cloud.GetZone()
 	if err != nil {
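The TODO above spells out the trade-off: one concrete client is created up front and handed to every pool, which keeps unit tests simple but means a client that later breaks is never rebuilt. Below is a minimal sketch of the pools-take-a-narrow-interface idea that makes fake-client testing possible; backendLister, backendPool, and fakeCloud are illustrative names, not the controller's real pools.

// Sketch: a pool depending on a small interface instead of the concrete GCE
// client, so tests can substitute a fake. Names here are illustrative only.
package main

import "fmt"

// backendLister is the narrow slice of the cloud client this toy pool needs.
type backendLister interface {
	ListBackendNames() ([]string, error)
}

type backendPool struct {
	cloud backendLister
}

func (b *backendPool) Count() (int, error) {
	names, err := b.cloud.ListBackendNames()
	if err != nil {
		return 0, err
	}
	return len(names), nil
}

// fakeCloud satisfies backendLister without talking to GCE, which is the
// "makes unittesting easier" part of the TODO.
type fakeCloud struct{ names []string }

func (f fakeCloud) ListBackendNames() ([]string, error) { return f.names, nil }

func main() {
	pool := &backendPool{cloud: fakeCloud{names: []string{"be-80", "be-82"}}}
	n, _ := pool.Count()
	fmt.Println("backends:", n) // backends: 2
}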
@@ -78,11 +78,11 @@ type CloudListingPool struct {
 	keyGetter keyFunc
 }
 
-// ReplinishPool lists through the cloudLister and inserts into the pool.
-func (c *CloudListingPool) ReplinishPool() {
+// ReplenishPool lists through the cloudLister and inserts into the pool.
+func (c *CloudListingPool) ReplenishPool() {
 	c.lock.Lock()
 	defer c.lock.Unlock()
-	glog.V(4).Infof("Replinishing pool")
+	glog.V(4).Infof("Replenishing pool")
 	items, err := c.lister.List()
 	if err != nil {
 		glog.Warningf("Failed to list: %v", err)
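The change above is only a spelling fix, but for context the replenish step amounts to: list from the cloud, key each item, and insert it into the in-memory snapshot, which is what lets GC act on resources that changed while the controller was restarting (the scenario the test at the top exercises). Below is a stripped-down sketch under that assumption, using stand-in types rather than the package's real cloudLister, keyFunc, and InMemoryPool.

// Sketch of the replenish pattern with illustrative stand-in types.
package main

import (
	"fmt"
	"sync"
)

// fakeLister stands in for a cloud lister that returns everything it knows about.
type fakeLister struct{ names []string }

func (f fakeLister) List() ([]interface{}, error) {
	out := make([]interface{}, 0, len(f.names))
	for _, n := range f.names {
		out = append(out, n)
	}
	return out, nil
}

type miniPool struct {
	lock   sync.Mutex
	lister fakeLister
	key    func(interface{}) (string, error)
	items  map[string]interface{}
}

// ReplenishPool mirrors the shape above: list everything from the cloud and
// insert each item under its key, so a later GC pass sees the cloud's view.
func (p *miniPool) ReplenishPool() {
	p.lock.Lock()
	defer p.lock.Unlock()
	items, err := p.lister.List()
	if err != nil {
		fmt.Printf("failed to list: %v\n", err)
		return
	}
	for _, item := range items {
		k, err := p.key(item)
		if err != nil {
			continue
		}
		p.items[k] = item
	}
}

func main() {
	p := &miniPool{
		lister: fakeLister{names: []string{"be-80", "be-82"}},
		key:    func(i interface{}) (string, error) { return i.(string), nil },
		items:  map[string]interface{}{},
	}
	p.ReplenishPool()
	fmt.Println("pool size after replenish:", len(p.items)) // 2
}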
@@ -119,7 +119,7 @@ func (c *CloudListingPool) Delete(key string) {
 	c.InMemoryPool.Delete(key)
 }
 
-// NewCloudListingPool replinishes the InMemoryPool through a background
+// NewCloudListingPool replenishes the InMemoryPool through a background
 // goroutine that lists from the given cloudLister.
 func NewCloudListingPool(k keyFunc, lister cloudLister, relistPeriod time.Duration) *CloudListingPool {
 	cl := &CloudListingPool{
@@ -127,7 +127,7 @@ func NewCloudListingPool(k keyFunc, lister cloudLister, relistPeriod time.Durati
 		lister:    lister,
 		keyGetter: k,
 	}
-	glog.V(4).Infof("Starting pool replinish goroutine")
-	go wait.Until(cl.ReplinishPool, relistPeriod, make(chan struct{}))
+	glog.V(4).Infof("Starting pool replenish goroutine")
+	go wait.Until(cl.ReplenishPool, relistPeriod, make(chan struct{}))
 	return cl
 }
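One wiring detail worth noting: the stop channel handed to wait.Until is a freshly made channel that nothing ever closes, so the replenish goroutine runs for the life of the process. The stand-alone sketch below shows the same run-immediately-then-every-period loop with a channel the caller can actually close; runReplenishLoop and stopCh are illustrative, not part of the storage package.

// Sketch only: a stoppable stand-in for the wait.Until call above.
package main

import (
	"fmt"
	"time"
)

// runReplenishLoop runs replenish once immediately, then on every tick,
// until stopCh is closed.
func runReplenishLoop(replenish func(), period time.Duration, stopCh <-chan struct{}) {
	ticker := time.NewTicker(period)
	defer ticker.Stop()
	replenish()
	for {
		select {
		case <-stopCh:
			return
		case <-ticker.C:
			replenish()
		}
	}
}

func main() {
	stopCh := make(chan struct{})
	go runReplenishLoop(func() { fmt.Println("replenish at", time.Now().Format("15:04:05.000")) },
		20*time.Millisecond, stopCh)
	time.Sleep(70 * time.Millisecond)
	// Unlike the inline make(chan struct{}) in the constructor, this channel
	// can actually be closed to stop the background goroutine.
	close(stopCh)
	time.Sleep(30 * time.Millisecond)
}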