diff --git a/controllers/gce/backends/backends_test.go b/controllers/gce/backends/backends_test.go
index 93d761902..f8f74e2d2 100644
--- a/controllers/gce/backends/backends_test.go
+++ b/controllers/gce/backends/backends_test.go
@@ -137,7 +137,7 @@ func TestBackendPoolSync(t *testing.T) {
 	// Repopulate the pool with a cloud list, which now includes the 82 port
 	// backend. This would happen if, say, an ingress backend is removed
 	// while the controller is restarting.
-	pool.(*Backends).snapshotter.(*storage.CloudListingPool).ReplinishPool()
+	pool.(*Backends).snapshotter.(*storage.CloudListingPool).ReplenishPool()
 	pool.GC(svcNodePorts)
diff --git a/controllers/gce/controller/cluster_manager.go b/controllers/gce/controller/cluster_manager.go
index 4128c60d7..642c3fd22 100644
--- a/controllers/gce/controller/cluster_manager.go
+++ b/controllers/gce/controller/cluster_manager.go
@@ -18,6 +18,8 @@ package controller
 
 import (
 	"fmt"
+	"net/http"
+	"time"
 
 	"k8s.io/contrib/ingress/controllers/gce/backends"
 	"k8s.io/contrib/ingress/controllers/gce/healthchecks"
@@ -26,6 +28,8 @@ import (
 	"k8s.io/contrib/ingress/controllers/gce/utils"
 	"k8s.io/kubernetes/pkg/cloudprovider"
 	gce "k8s.io/kubernetes/pkg/cloudprovider/providers/gce"
+
+	"github.com/golang/glog"
 )
 
 const (
@@ -54,6 +58,9 @@ const (
 
 	// Names longer than this are truncated, because of GCE restrictions.
 	nameLenLimit = 62
+
+	// Sleep interval to retry cloud client creation.
+	cloudClientRetryInterval = 10 * time.Second
 )
 
 // ClusterManager manages cluster resource pools.
@@ -70,6 +77,14 @@ func (c *ClusterManager) IsHealthy() (err error) {
 	// TODO: Expand on this, for now we just want to detect when the GCE client
 	// is broken.
 	_, err = c.backendPool.List()
+
+	// If this container is scheduled on a node without compute/rw it is
+	// effectively useless, but it is healthy. Reporting it as unhealthy
+	// will lead to container crashlooping.
+	if utils.IsHTTPErrorCode(err, http.StatusForbidden) {
+		glog.Infof("Reporting cluster as healthy, but unable to list backends: %v", err)
+		return nil
+	}
 	return
 }
 
@@ -138,6 +153,32 @@ func defaultInstanceGroupName(clusterName string) string {
 	return fmt.Sprintf("%v-%v", instanceGroupPrefix, clusterName)
 }
 
+func getGCEClient() *gce.GCECloud {
+	// Creating the cloud interface involves resolving the metadata server to get
+	// an oauth token. If this fails, the token provider assumes it's not on GCE.
+	// No errors are thrown. So we need to keep retrying till it works because
+	// we know we're on GCE.
+	for {
+		cloudInterface, err := cloudprovider.GetCloudProvider("gce", nil)
+		if err == nil {
+			cloud := cloudInterface.(*gce.GCECloud)
+
+			// If this controller is scheduled on a node without compute/rw
+			// it won't be allowed to list backends. We can assume that the
+			// user has no need for Ingress in this case. If they grant
+			// permissions to the node they will have to restart the controller
+			// manually to re-create the client.
+			if _, err = cloud.ListBackendServices(); err == nil || utils.IsHTTPErrorCode(err, http.StatusForbidden) {
+				return cloud
+			}
+			glog.Warningf("Failed to list backend services, retrying: %v", err)
+		} else {
+			glog.Warningf("Failed to retrieve cloud interface, retrying: %v", err)
+		}
+		time.Sleep(cloudClientRetryInterval)
+	}
+}
+
 // NewClusterManager creates a cluster manager for shared resources.
 // - name: is the name used to tag cluster wide shared resources. This is the
 //   string passed to glbc via --gce-cluster-name.
@@ -149,11 +190,12 @@ func NewClusterManager(
 	defaultBackendNodePort int64,
 	defaultHealthCheckPath string) (*ClusterManager, error) {
-	cloudInterface, err := cloudprovider.GetCloudProvider("gce", nil)
-	if err != nil {
-		return nil, err
-	}
-	cloud := cloudInterface.(*gce.GCECloud)
+	// TODO: Make this more resilient. Currently we create the cloud client
+	// and pass it through to all the pools. This makes unittesting easier.
+	// However if the cloud client suddenly fails, we should try to re-create it
+	// and continue.
+	cloud := getGCEClient()
+
 	cluster := ClusterManager{ClusterNamer: utils.Namer{name}}
 	zone, err := cloud.GetZone()
 	if err != nil {
diff --git a/controllers/gce/storage/pools.go b/controllers/gce/storage/pools.go
index 7cd26da94..3d1fd235e 100644
--- a/controllers/gce/storage/pools.go
+++ b/controllers/gce/storage/pools.go
@@ -78,11 +78,11 @@ type CloudListingPool struct {
 	keyGetter keyFunc
 }
 
-// ReplinishPool lists through the cloudLister and inserts into the pool.
-func (c *CloudListingPool) ReplinishPool() {
+// ReplenishPool lists through the cloudLister and inserts into the pool.
+func (c *CloudListingPool) ReplenishPool() {
 	c.lock.Lock()
 	defer c.lock.Unlock()
-	glog.V(4).Infof("Replinishing pool")
+	glog.V(4).Infof("Replenishing pool")
 	items, err := c.lister.List()
 	if err != nil {
 		glog.Warningf("Failed to list: %v", err)
@@ -119,7 +119,7 @@ func (c *CloudListingPool) Delete(key string) {
 	c.InMemoryPool.Delete(key)
 }
 
-// NewCloudListingPool replinishes the InMemoryPool through a background
+// NewCloudListingPool replenishes the InMemoryPool through a background
 // goroutine that lists from the given cloudLister.
 func NewCloudListingPool(k keyFunc, lister cloudLister, relistPeriod time.Duration) *CloudListingPool {
 	cl := &CloudListingPool{
@@ -127,7 +127,7 @@ func NewCloudListingPool(k keyFunc, lister cloudLister, relistPeriod time.Durati
 		lister:     lister,
 		keyGetter:  k,
 	}
-	glog.V(4).Infof("Starting pool replinish goroutine")
-	go wait.Until(cl.ReplinishPool, relistPeriod, make(chan struct{}))
+	glog.V(4).Infof("Starting pool replenish goroutine")
+	go wait.Until(cl.ReplenishPool, relistPeriod, make(chan struct{}))
 	return cl
 }
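
Reviewer note: the program below is a minimal, self-contained sketch of the retry pattern that getGCEClient introduces, offered only to make the intended behavior easy to follow: construct the client, probe it, and return as soon as the probe either succeeds or fails with nothing worse than a permission error. Every identifier in it (fakeClient, probe, errForbidden, getClientWithRetry) is a hypothetical stand-in rather than a type or function from the controller, and the 403 check is approximated with errors.Is instead of the diff's utils.IsHTTPErrorCode(err, http.StatusForbidden).

// Hypothetical sketch of the getGCEClient retry pattern; no controller code.
package main

import (
	"errors"
	"fmt"
	"time"
)

// errForbidden stands in for an HTTP 403 from the compute API.
var errForbidden = errors.New("forbidden")

// fakeClient stands in for *gce.GCECloud; probes counts probe attempts across
// retries so the simulated metadata server can "come up" on the third try.
type fakeClient struct{ probes *int }

// probe stands in for the ListBackendServices probe call: it fails until the
// third attempt to simulate a metadata server that is slow to become reachable.
func (c *fakeClient) probe() error {
	*c.probes++
	if *c.probes < 3 {
		return errors.New("metadata server not ready")
	}
	return nil
}

// getClientWithRetry mirrors the shape of getGCEClient: loop until the probe
// succeeds or fails with a permission error, sleeping between attempts. A 403
// is treated as "usable but limited" rather than fatal, matching the IsHealthy
// change in the diff.
func getClientWithRetry(retryInterval time.Duration) *fakeClient {
	probes := 0
	for {
		c := &fakeClient{probes: &probes} // stands in for GetCloudProvider("gce", nil)
		err := c.probe()
		if err == nil || errors.Is(err, errForbidden) {
			return c
		}
		fmt.Printf("client not ready (%v), retrying in %v\n", err, retryInterval)
		time.Sleep(retryInterval)
	}
}

func main() {
	c := getClientWithRetry(10 * time.Millisecond)
	fmt.Println("got a usable client after", *c.probes, "probes")
}

The trade-off is the one stated in the diff's comments: a 403 means the node lacks the compute/rw scope, so the controller is allowed to come up (and report healthy) but cannot manage backends until it is restarted with the right permissions; only non-permission failures keep the loop sleeping and retrying every cloudClientRetryInterval.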