Retry GCE client creation.

parent 8d59739bb1
commit 73afef4bec

3 changed files with 54 additions and 12 deletions
@@ -137,7 +137,7 @@ func TestBackendPoolSync(t *testing.T) {
 	// Repopulate the pool with a cloud list, which now includes the 82 port
 	// backend. This would happen if, say, an ingress backend is removed
 	// while the controller is restarting.
-	pool.(*Backends).snapshotter.(*storage.CloudListingPool).ReplinishPool()
+	pool.(*Backends).snapshotter.(*storage.CloudListingPool).ReplenishPool()
 
 	pool.GC(svcNodePorts)
 
@@ -18,6 +18,8 @@ package controller
 
 import (
 	"fmt"
+	"net/http"
+	"time"
 
 	"k8s.io/contrib/ingress/controllers/gce/backends"
 	"k8s.io/contrib/ingress/controllers/gce/healthchecks"
@@ -26,6 +28,8 @@ import (
 	"k8s.io/contrib/ingress/controllers/gce/utils"
 	"k8s.io/kubernetes/pkg/cloudprovider"
 	gce "k8s.io/kubernetes/pkg/cloudprovider/providers/gce"
+
+	"github.com/golang/glog"
 )
 
 const (
@@ -54,6 +58,9 @@ const (
 
 	// Names longer than this are truncated, because of GCE restrictions.
 	nameLenLimit = 62
+
+	// Sleep interval to retry cloud client creation.
+	cloudClientRetryInterval = 10 * time.Second
 )
 
 // ClusterManager manages cluster resource pools.
@@ -70,6 +77,14 @@ func (c *ClusterManager) IsHealthy() (err error) {
 	// TODO: Expand on this, for now we just want to detect when the GCE client
 	// is broken.
 	_, err = c.backendPool.List()
+
+	// If this container is scheduled on a node without compute/rw it is
+	// effectively useless, but it is healthy. Reporting it as unhealthy
+	// will lead to container crashlooping.
+	if utils.IsHTTPErrorCode(err, http.StatusForbidden) {
+		glog.Infof("Reporting cluster as healthy, but unable to list backends: %v", err)
+		return nil
+	}
 	return
 }
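A note on the Forbidden special case above: it only matters because IsHealthy is the kind of check that typically backs a liveness or health endpoint, where a returned error becomes a pod restart. Below is a minimal sketch of such wiring, assuming a plain net/http handler and a stand-in fakeManager; it is not the controller's actual health-check plumbing.

// Sketch only: how an IsHealthy-style check might back a /healthz endpoint.
// healthChecker and fakeManager are illustrative stand-ins, not controller code.
package main

import (
	"fmt"
	"log"
	"net/http"
)

type healthChecker interface {
	IsHealthy() error
}

type fakeManager struct{}

func (fakeManager) IsHealthy() error { return nil }

func main() {
	var cm healthChecker = fakeManager{}
	http.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
		if err := cm.IsHealthy(); err != nil {
			// A non-200 response here is what makes the kubelet restart the pod;
			// tolerating StatusForbidden inside IsHealthy avoids that crash loop.
			http.Error(w, fmt.Sprintf("unhealthy: %v", err), http.StatusInternalServerError)
			return
		}
		fmt.Fprint(w, "ok")
	})
	log.Fatal(http.ListenAndServe(":8081", nil))
}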
@@ -138,6 +153,32 @@ func defaultInstanceGroupName(clusterName string) string {
 	return fmt.Sprintf("%v-%v", instanceGroupPrefix, clusterName)
 }
 
+func getGCEClient() *gce.GCECloud {
+	// Creating the cloud interface involves resolving the metadata server to get
+	// an oauth token. If this fails, the token provider assumes it's not on GCE.
+	// No errors are thrown. So we need to keep retrying till it works because
+	// we know we're on GCE.
+	for {
+		cloudInterface, err := cloudprovider.GetCloudProvider("gce", nil)
+		if err == nil {
+			cloud := cloudInterface.(*gce.GCECloud)
+
+			// If this controller is scheduled on a node without compute/rw
+			// it won't be allowed to list backends. We can assume that the
+			// user has no need for Ingress in this case. If they grant
+			// permissions to the node they will have to restart the controller
+			// manually to re-create the client.
+			if _, err = cloud.ListBackendServices(); err == nil || utils.IsHTTPErrorCode(err, http.StatusForbidden) {
+				return cloud
+			}
+			glog.Warningf("Failed to list backend services, retrying: %v", err)
+		} else {
+			glog.Warningf("Failed to retrieve cloud interface, retrying: %v", err)
+		}
+		time.Sleep(cloudClientRetryInterval)
+	}
+}
+
 // NewClusterManager creates a cluster manager for shared resources.
 // - name: is the name used to tag cluster wide shared resources. This is the
 //   string passed to glbc via --gce-cluster-name.
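The loop above retries forever by design: as its comment explains, a failed metadata lookup looks the same as not running on GCE, so the only safe option is to keep trying. Below is a generic sketch of the same retry-until-ready shape with the client factory and interval injected; retryClient, client, and the fake factory are illustrative names, not controller code.

// Sketch of the retry pattern used by getGCEClient, generalized so the
// factory and sleep interval can be injected (e.g. for tests).
package main

import (
	"errors"
	"fmt"
	"time"
)

// client stands in for *gce.GCECloud; illustrative only.
type client struct{ name string }

// retryClient keeps calling newClient until it succeeds, sleeping between
// attempts, mirroring the shape of getGCEClient minus the GCE calls.
func retryClient(newClient func() (*client, error), interval time.Duration) *client {
	for {
		c, err := newClient()
		if err == nil {
			return c
		}
		fmt.Printf("client creation failed, retrying in %v: %v\n", interval, err)
		time.Sleep(interval)
	}
}

func main() {
	attempts := 0
	c := retryClient(func() (*client, error) {
		attempts++
		if attempts < 3 {
			return nil, errors.New("metadata server not reachable yet")
		}
		return &client{name: "gce"}, nil
	}, 10*time.Millisecond)
	fmt.Printf("got client %q after %d attempts\n", c.name, attempts)
}

Injecting the factory is what makes this shape exercisable in a unit test without a metadata server.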
@@ -149,11 +190,12 @@ func NewClusterManager(
 	defaultBackendNodePort int64,
 	defaultHealthCheckPath string) (*ClusterManager, error) {
 
-	cloudInterface, err := cloudprovider.GetCloudProvider("gce", nil)
-	if err != nil {
-		return nil, err
-	}
-	cloud := cloudInterface.(*gce.GCECloud)
+	// TODO: Make this more resilient. Currently we create the cloud client
+	// and pass it through to all the pools. This makes unittesting easier.
+	// However if the cloud client suddenly fails, we should try to re-create it
+	// and continue.
+	cloud := getGCEClient()
+
 	cluster := ClusterManager{ClusterNamer: utils.Namer{name}}
 	zone, err := cloud.GetZone()
 	if err != nil {
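The TODO above spells out the trade-off: one concrete client is created up front and handed to every pool, which keeps unit tests simple but means a client that later breaks is never rebuilt. Below is a minimal sketch of the pools-take-a-narrow-interface idea that makes fake-client testing possible; backendLister, backendPool, and fakeCloud are illustrative names, not the controller's real pools.

// Sketch: a pool depending on a small interface instead of the concrete GCE
// client, so tests can substitute a fake. Names here are illustrative only.
package main

import "fmt"

// backendLister is the narrow slice of the cloud client this toy pool needs.
type backendLister interface {
	ListBackendNames() ([]string, error)
}

type backendPool struct {
	cloud backendLister
}

func (b *backendPool) Count() (int, error) {
	names, err := b.cloud.ListBackendNames()
	if err != nil {
		return 0, err
	}
	return len(names), nil
}

// fakeCloud satisfies backendLister without talking to GCE, which is the
// "makes unittesting easier" part of the TODO.
type fakeCloud struct{ names []string }

func (f fakeCloud) ListBackendNames() ([]string, error) { return f.names, nil }

func main() {
	pool := &backendPool{cloud: fakeCloud{names: []string{"be-80", "be-82"}}}
	n, _ := pool.Count()
	fmt.Println("backends:", n) // backends: 2
}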
@@ -78,11 +78,11 @@ type CloudListingPool struct {
 	keyGetter keyFunc
 }
 
-// ReplinishPool lists through the cloudLister and inserts into the pool.
-func (c *CloudListingPool) ReplinishPool() {
+// ReplenishPool lists through the cloudLister and inserts into the pool.
+func (c *CloudListingPool) ReplenishPool() {
 	c.lock.Lock()
 	defer c.lock.Unlock()
-	glog.V(4).Infof("Replinishing pool")
+	glog.V(4).Infof("Replenishing pool")
 	items, err := c.lister.List()
 	if err != nil {
 		glog.Warningf("Failed to list: %v", err)
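The change above is only a spelling fix, but for context the replenish step amounts to: list from the cloud, key each item, and insert it into the in-memory snapshot, which is what lets GC act on resources that changed while the controller was restarting (the scenario the test at the top exercises). Below is a stripped-down sketch under that assumption, using stand-in types rather than the package's real cloudLister, keyFunc, and InMemoryPool.

// Sketch of the replenish pattern with illustrative stand-in types.
package main

import (
	"fmt"
	"sync"
)

// fakeLister stands in for a cloud lister that returns everything it knows about.
type fakeLister struct{ names []string }

func (f fakeLister) List() ([]interface{}, error) {
	out := make([]interface{}, 0, len(f.names))
	for _, n := range f.names {
		out = append(out, n)
	}
	return out, nil
}

type miniPool struct {
	lock   sync.Mutex
	lister fakeLister
	key    func(interface{}) (string, error)
	items  map[string]interface{}
}

// ReplenishPool mirrors the shape above: list everything from the cloud and
// insert each item under its key, so a later GC pass sees the cloud's view.
func (p *miniPool) ReplenishPool() {
	p.lock.Lock()
	defer p.lock.Unlock()
	items, err := p.lister.List()
	if err != nil {
		fmt.Printf("failed to list: %v\n", err)
		return
	}
	for _, item := range items {
		k, err := p.key(item)
		if err != nil {
			continue
		}
		p.items[k] = item
	}
}

func main() {
	p := &miniPool{
		lister: fakeLister{names: []string{"be-80", "be-82"}},
		key:    func(i interface{}) (string, error) { return i.(string), nil },
		items:  map[string]interface{}{},
	}
	p.ReplenishPool()
	fmt.Println("pool size after replenish:", len(p.items)) // 2
}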
@@ -119,7 +119,7 @@ func (c *CloudListingPool) Delete(key string) {
 	c.InMemoryPool.Delete(key)
 }
 
-// NewCloudListingPool replinishes the InMemoryPool through a background
+// NewCloudListingPool replenishes the InMemoryPool through a background
 // goroutine that lists from the given cloudLister.
 func NewCloudListingPool(k keyFunc, lister cloudLister, relistPeriod time.Duration) *CloudListingPool {
 	cl := &CloudListingPool{
@@ -127,7 +127,7 @@ func NewCloudListingPool(k keyFunc, lister cloudLister, relistPeriod time.Durati
 		lister:    lister,
 		keyGetter: k,
 	}
-	glog.V(4).Infof("Starting pool replinish goroutine")
-	go wait.Until(cl.ReplinishPool, relistPeriod, make(chan struct{}))
+	glog.V(4).Infof("Starting pool replenish goroutine")
+	go wait.Until(cl.ReplenishPool, relistPeriod, make(chan struct{}))
 	return cl
 }
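One wiring detail worth noting: the stop channel handed to wait.Until is a freshly made channel that nothing ever closes, so the replenish goroutine runs for the life of the process. The stand-alone sketch below shows the same run-immediately-then-every-period loop with a channel the caller can actually close; runReplenishLoop and stopCh are illustrative, not part of the storage package.

// Sketch only: a stoppable stand-in for the wait.Until call above.
package main

import (
	"fmt"
	"time"
)

// runReplenishLoop runs replenish once immediately, then on every tick,
// until stopCh is closed.
func runReplenishLoop(replenish func(), period time.Duration, stopCh <-chan struct{}) {
	ticker := time.NewTicker(period)
	defer ticker.Stop()
	replenish()
	for {
		select {
		case <-stopCh:
			return
		case <-ticker.C:
			replenish()
		}
	}
}

func main() {
	stopCh := make(chan struct{})
	go runReplenishLoop(func() { fmt.Println("replenish at", time.Now().Format("15:04:05.000")) },
		20*time.Millisecond, stopCh)
	time.Sleep(70 * time.Millisecond)
	// Unlike the inline make(chan struct{}) in the constructor, this channel
	// can actually be closed to stop the background goroutine.
	close(stopCh)
	time.Sleep(30 * time.Millisecond)
}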