From 1691e17b854587039fb2db1ae69759f653da3e9b Mon Sep 17 00:00:00 2001
From: Nick Sardo
Date: Wed, 5 Apr 2017 12:43:55 -0700
Subject: [PATCH] Add balance mode updater

---
 controllers/gce/cmd/mode-updater/README.md |  47 ++++
 controllers/gce/cmd/mode-updater/main.go   | 292 +++++++++++++++++++++
 2 files changed, 339 insertions(+)
 create mode 100644 controllers/gce/cmd/mode-updater/README.md
 create mode 100644 controllers/gce/cmd/mode-updater/main.go

diff --git a/controllers/gce/cmd/mode-updater/README.md b/controllers/gce/cmd/mode-updater/README.md
new file mode 100644
index 000000000..84a05ab34
--- /dev/null
+++ b/controllers/gce/cmd/mode-updater/README.md
@@ -0,0 +1,47 @@

## Backend-Service BalancingMode Updater
**For non-GKE Users**

Earlier versions of the GLBC created GCP BackendService resources without specifying a balancing mode, so the API defaulted to CPU `UTILIZATION`. The "internal load balancer" feature provided by GCP requires backend services to have the balancing mode `RATE`. To run a K8s cluster with both an internal load balancer and ingress resources, you'll need to perform some manual steps.

#### Why
Two GCP requirements complicate changing the backend service balancing mode:
1. An instance can only belong to one load balancer instance group (a group that has at least one backend service pointing to it).
1. A load balancer instance group can only have one balancing mode across all the backend services pointing to it.

#### Complicating factors
1. You cannot atomically update a set of backend services to a new balancing mode.
1. The default backend service in the `kube-system` namespace exists, so you'll have at least two backend services.

#### Your Options
- (UNTESTED) If you have only one service referenced by ingresses AND that service is the default backend specified in the Ingress spec (resulting in one used backend service and one unused backend service):
  1. Go to the GCP Console.
  1. Delete the `kube-system` default backend service.
  1. Change the balancing mode of the used backend service.

  The GLBC should recreate the default backend service at its resync interval.


- Re-create all ingress resources. The GLBC will use RATE mode when it's not blocked by backend services with UTILIZATION mode.
  - Must be running GLBC version >0.9.1
  - Must delete all ingress resources before re-creating them


- Run this updater tool (see the usage sketch at the end of this README)

#### How the updater works
1. Create temporary instance groups `k8s-ig--migrate` in each zone where a `k8s-ig--{cluster_id}` exists.
1. Update all backend services to point to both the original and temporary instance groups (the mode of the new backend doesn't matter).
1. Slowly migrate instances from the original to the temporary groups.
1. Update all backend services to remove their pointers to the original instance groups.
1. Update all backend services to point to the original groups again (with the new balancing mode!).
1. Slowly migrate instances from the temporary groups back to the original groups.
1. Update all backend services to remove their pointers to the temporary instance groups.
1. Delete the temporary instance groups.

#### Required Testing
- [ ] Uptime is not affected when switching instance groups
- [ ] An active GLBC does not negatively interfere with this updater

#### TODO
- [ ] Use the GCE CloudProvider package to utilize its `waitForOp` functionality and remove some of the sleeps (a sketch appears at the end of this patch).
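#### Usage
A minimal invocation sketch. The build step and the binary name `mode-updater` are assumptions, not part of this patch; the three positional arguments are the ones parsed in `main.go`:

```sh
# Build the tool from this directory (the binary name is arbitrary).
go build -o mode-updater .

# Args: project_id region balancing_mode (RATE or UTILIZATION).
./mode-updater my-gcp-project us-central1 RATE
```

The tool authenticates via Application Default Credentials (`google.DefaultTokenSource`), so credentials must already be available in the environment, e.g. from `gcloud auth application-default login` or a service-account key.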
diff --git a/controllers/gce/cmd/mode-updater/main.go b/controllers/gce/cmd/mode-updater/main.go
new file mode 100644
index 000000000..2bbd17b38
--- /dev/null
+++ b/controllers/gce/cmd/mode-updater/main.go
@@ -0,0 +1,292 @@
package main

import (
    "errors"
    "flag"
    "fmt"
    "log"
    "strings"
    "sync"
    "time"

    "golang.org/x/oauth2"
    "golang.org/x/oauth2/google"

    compute "google.golang.org/api/compute/v1"
)

var (
    projectID           string
    regionName          string
    targetBalancingMode string

    instanceGroupName string

    s         *compute.Service
    region    *compute.Region
    zones     []*compute.Zone
    igs       map[string]*compute.InstanceGroup
    instances []*compute.Instance
)

const (
    instanceGroupTemp = "k8s-ig--migrate"
    balancingModeRATE = "RATE"
    balancingModeUTIL = "UTILIZATION"
)

func main() {
    flag.Parse()

    args := flag.Args()
    if len(args) != 3 {
        log.Fatalf("Expected three arguments: project_id region balancing_mode")
    }
    projectID, regionName, targetBalancingMode = args[0], args[1], args[2]

    switch targetBalancingMode {
    case balancingModeRATE, balancingModeUTIL:
    default:
        panic(fmt.Errorf("expected either %s or %s, actual: %v", balancingModeRATE, balancingModeUTIL, targetBalancingMode))
    }

    igs = make(map[string]*compute.InstanceGroup)

    tokenSource, err := google.DefaultTokenSource(
        oauth2.NoContext,
        compute.CloudPlatformScope,
        compute.ComputeScope)
    if err != nil {
        panic(err)
    }

    client := oauth2.NewClient(oauth2.NoContext, tokenSource)
    s, err = compute.New(client)
    if err != nil {
        panic(err)
    }

    // Get region
    region, err = s.Regions.Get(projectID, regionName).Do()
    if err != nil {
        panic(err)
    }

    // Get zones in the region
    zoneFilter := fmt.Sprintf("(region eq %s)", region.SelfLink)
    zoneList, err := s.Zones.List(projectID).Filter(zoneFilter).Do()
    if err != nil {
        panic(err)
    }
    zones = zoneList.Items

    // Find the GLBC-managed instance groups and note their instances
    for _, z := range zones {
        igl, err := s.InstanceGroups.List(projectID, z.Name).Do()
        if err != nil {
            panic(err)
        }
        for _, ig := range igl.Items {
            if !strings.HasPrefix(ig.Name, "k8s-ig--") {
                continue
            }

            if instanceGroupName == "" {
                instanceGroupName = ig.Name
            }

            // Note instances
            r := &compute.InstanceGroupsListInstancesRequest{InstanceState: "ALL"}
            instList, err := s.InstanceGroups.ListInstances(projectID, getResourceName(ig.Zone, "zones"), ig.Name, r).Do()
            if err != nil {
                panic(err)
            }

            for _, i := range instList.Items {
                inst, err := s.Instances.Get(projectID, getResourceName(ig.Zone, "zones"), getResourceName(i.Instance, "instances")).Do()
                if err != nil {
                    panic(err)
                }

                instances = append(instances, inst)
            }

            // Note instance group in zone
            igs[z.Name] = ig
        }
    }

    if instanceGroupName == "" {
        panic(errors.New("could not determine k8s load balancer instance group"))
    }

    bs := getBackendServices()
    fmt.Println("Region:", region.Name)
    fmt.Println("Backend Services:", len(bs))
    fmt.Println("Instance Groups:", len(igs))

    // Create temporary instance groups
    for zone, ig := range igs {
        _, err = s.InstanceGroups.Get(projectID, zone, instanceGroupTemp).Do()
        if err != nil {
            newIg := &compute.InstanceGroup{
                Name:       instanceGroupTemp,
                Zone:       zone,
                NamedPorts: ig.NamedPorts,
            }
            fmt.Println("Creating", instanceGroupTemp, "zone:", zone)
            _, err = s.InstanceGroups.Insert(projectID, zone, newIg).Do()
            if err != nil {
                panic(err)
            }
        }
    }
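    // What follows is the migration dance described in the README: a load
    // balancer instance group can only carry one balancing mode across all
    // backend services pointing at it, and those backend services cannot be
    // updated atomically. So the tool first points every backend service at
    // both the original and temporary groups, drains instances into the
    // temporary groups, re-adds the original groups with the new balancing
    // mode, and finally migrates the instances back.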
services") + setBackendsTo(true, balancingModeInverse(targetBalancingMode), true, balancingModeInverse(targetBalancingMode)) + + fmt.Println("Migrate instances to temporary group") + migrateInstances(instanceGroupName, instanceGroupTemp) + + time.Sleep(20 * time.Second) + + // Remove original backends + fmt.Println("Remove original backends") + setBackendsTo(false, "", true, balancingModeInverse(targetBalancingMode)) + + sleep(1 * time.Minute) + + // Straddle both groups (new balancing mode) + fmt.Println("Create backends pointing to original instance groups") + setBackendsTo(true, targetBalancingMode, true, balancingModeInverse(targetBalancingMode)) + + sleep(20 * time.Second) + + fmt.Println("Migrate instances back to original groups") + migrateInstances(instanceGroupTemp, instanceGroupName) + + sleep(20 * time.Second) + + fmt.Println("Remove temporary backends") + setBackendsTo(true, targetBalancingMode, false, "") + + sleep(20 * time.Second) + + fmt.Println("Delete temporary instance groups") + for z := range igs { + _, err = s.InstanceGroups.Delete(projectID, z, instanceGroupTemp).Do() + if err != nil { + fmt.Println("Couldn't delete temporary instance group", instanceGroupTemp) + } + } +} + +func sleep(d time.Duration) { + fmt.Println("Sleeping for", d.String()) + time.Sleep(d) +} + +func setBackendsTo(orig bool, origMode string, temp bool, tempMode string) { + bs := getBackendServices() + for _, bsi := range bs { + var union []*compute.Backend + for zone := range igs { + if orig { + b := &compute.Backend{ + Group: createInstanceGroupLink(zone, instanceGroupName), + BalancingMode: origMode, + CapacityScaler: 0.8, + MaxRatePerInstance: 1.0, + } + union = append(union, b) + } + if temp { + b := &compute.Backend{ + Group: createInstanceGroupLink(zone, instanceGroupTemp), + BalancingMode: tempMode, + CapacityScaler: 0.8, + MaxRatePerInstance: 1.0, + } + union = append(union, b) + } + } + bsi.Backends = union + _, err := s.BackendServices.Update(projectID, bsi.Name, bsi).Do() + if err != nil { + panic(err) + } + } +} + +func balancingModeInverse(m string) string { + switch m { + case balancingModeRATE: + return balancingModeUTIL + case balancingModeUTIL: + return balancingModeRATE + default: + return "" + } +} + +func getBackendServices() (bs []*compute.BackendService) { + bsl, err := s.BackendServices.List(projectID).Do() + if err != nil { + panic(err) + } + + for _, bsli := range bsl.Items { + if bsli.Region == "" && strings.HasPrefix(bsli.Name, "k8s-be-") { + bs = append(bs, bsli) + } + } + return bs +} + +func migrateInstances(fromIG, toIG string) error { + wg := sync.WaitGroup{} + for _, i := range instances { + wg.Add(1) + go func(i *compute.Instance) { + z := getResourceName(i.Zone, "zones") + fmt.Printf(" - %s (%s)\n", i.Name, z) + rr := &compute.InstanceGroupsRemoveInstancesRequest{Instances: []*compute.InstanceReference{{Instance: i.SelfLink}}} + _, err := s.InstanceGroups.RemoveInstances(projectID, z, fromIG, rr).Do() + if err != nil { + fmt.Println("Skipping error when removing instance from group", err) + } + time.Sleep(10 * time.Second) + + ra := &compute.InstanceGroupsAddInstancesRequest{Instances: []*compute.InstanceReference{{Instance: i.SelfLink}}} + _, err = s.InstanceGroups.AddInstances(projectID, z, toIG, ra).Do() + if err != nil { + if !strings.Contains(err.Error(), "memberAlreadyExists") { // GLBC already added the instance back to the IG + fmt.Println("failed to add instance to new IG", i.Name, err) + } + } + wg.Done() + }(i) + time.Sleep(10 * time.Second) + } + 
    wg.Wait()
    return nil
}

func createInstanceGroupLink(zone, igName string) string {
    return fmt.Sprintf("https://www.googleapis.com/compute/v1/projects/%s/zones/%s/instanceGroups/%s", projectID, zone, igName)
}

// getResourceName extracts the name that follows resourceType in a fully
// qualified GCE resource link, e.g. ".../zones/us-central1-b/..." -> "us-central1-b".
func getResourceName(link string, resourceType string) string {
    s := strings.Split(link, "/")

    for i := 0; i < len(s); i++ {
        if s[i] == resourceType {
            if i+1 < len(s) {
                return s[i+1]
            }
        }
    }
    return ""
}
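Regarding the README's TODO about removing sleeps: a minimal sketch of operation polling against the raw `compute/v1` client, not the GCE CloudProvider's actual `waitForOp` (the helper name `waitForZoneOp` and the one-second poll interval are assumptions for illustration):

```go
// waitForZoneOp polls a zonal operation until it reaches DONE status,
// as a stand-in for the fixed sleeps scattered through main().
// Illustrative sketch only.
func waitForZoneOp(svc *compute.Service, project, zone string, op *compute.Operation) error {
    for {
        result, err := svc.ZoneOperations.Get(project, zone, op.Name).Do()
        if err != nil {
            return err
        }
        if result.Status == "DONE" {
            if result.Error != nil {
                return fmt.Errorf("operation %s failed: %+v", op.Name, result.Error)
            }
            return nil
        }
        time.Sleep(time.Second)
    }
}
```

Calls such as `s.InstanceGroups.Insert(...).Do()` already return a `*compute.Operation`, so their results could be passed to a helper like this instead of sleeping for a fixed interval.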