Add balance mode updater

This commit is contained in:
Nick Sardo 2017-04-05 12:43:55 -07:00
parent 403cf5e989
commit 1691e17b85
2 changed files with 339 additions and 0 deletions

@@ -0,0 +1,47 @@
## Backend-Service BalancingMode Updater
**For non-GKE Users**
Earlier versions of the GLBC created GCP BackendService resources without specifying a balancing mode, so the API defaulted them to UTILIZATION (CPU utilization). The "internal load balancer" feature provided by GCP requires backend services to use the RATE balancing mode. To run a K8s cluster with both an internal load balancer and Ingress resources, you'll need to perform some manual steps.
#### Why
There are two GCP requirements that complicate changing the backend service balancing mode:
1. An instance can only belong to one load balancer instance group (a group that has at least one backend service pointing to it).
1. A load balancer instance group can only have one balancing mode across all of the backend services pointing to it.
#### Complicating factors
1. You cannot atomically update a set of backend services to a new balancing mode.
1. The default backend service in the `kube-system` namespace exists, so you'll have at least two backend services.
#### Your Options
- (UNTESTED) If only one service is referenced by your ingresses AND that service is also the default backend specified in the Ingress spec (resulting in one used backend service and one unused backend service):
  1. Go to the GCP Console.
  1. Delete the `kube-system` default backend service.
  1. Change the balancing mode of the used backend service.

  The GLBC should recreate the default backend service at its next resync interval.
- Re-create all ingress resources. The GLBC will use RATE mode when it's not blocked by backend services with UTILIZATION mode.
  - Must be running GLBC version >0.9.1
  - Must delete all ingress resources before re-creating
- Run this updater tool
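  - The tool expects three arguments: `project_id region balancing_mode`, where the balancing mode is either `RATE` or `UTILIZATION`. It authenticates with Application Default Credentials, so `gcloud auth application-default login` (or an equivalent credential source) must be configured first.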
#### How the updater works
1. Create a temporary instance group `k8s-ig--migrate` in each zone where a `k8s-ig--{cluster_id}` group exists.
1. Update all backend services to point to both the original and the temporary instance groups (the mode of the new backends doesn't matter).
1. Slowly migrate instances from the original to the temporary groups.
1. Update all backend services to remove pointers to the original instance groups.
1. Update all backend services to point to the original groups again (with the new balancing mode!).
1. Slowly migrate instances from the temporary groups back to the original groups.
1. Update all backend services to remove pointers to the temporary instance groups.
1. Delete the temporary instance groups.
#### Required Testing
- [ ] Uptime is not affected when switching instance groups
- [ ] An active GLBC does not negatively interfere with this updater
#### TODO
- [ ] Use the GCE CloudProvider package to take advantage of its `waitForOp` functionality and remove some of the sleeps.
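
A minimal sketch of what that could look like, even without pulling in the CloudProvider package, is to poll zonal operations with the `compute/v1` client the tool already uses. `waitForZoneOp` below is a hypothetical helper, not part of this commit:

```go
// waitForZoneOp polls a zonal operation until it reaches status DONE, as a
// stand-in for the GCE CloudProvider's waitForOp helper. It reuses the tool's
// existing compute, fmt, and time imports.
func waitForZoneOp(s *compute.Service, project, zone string, op *compute.Operation) error {
	for {
		result, err := s.ZoneOperations.Get(project, zone, op.Name).Do()
		if err != nil {
			return err
		}
		if result.Status == "DONE" {
			if result.Error != nil {
				return fmt.Errorf("operation %s failed: %+v", op.Name, result.Error.Errors)
			}
			return nil
		}
		time.Sleep(2 * time.Second)
	}
}
```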

@@ -0,0 +1,292 @@
package main
import (
"errors"
"flag"
"fmt"
"log"
"strings"
"sync"
"time"
"golang.org/x/oauth2"
"golang.org/x/oauth2/google"
compute "google.golang.org/api/compute/v1"
)
var (
projectID string
regionName string
targetBalancingMode string
instanceGroupName string
s *compute.Service
region *compute.Region
zones []*compute.Zone
igs map[string]*compute.InstanceGroup
instances []*compute.Instance
)
const (
instanceGroupTemp = "k8s-ig--migrate"
balancingModeRATE = "RATE"
balancingModeUTIL = "UTILIZATION"
)
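// main discovers the cluster's "k8s-ig--" instance groups, their instances, and the
// global "k8s-be-" backend services, then runs the migration described in the README:
// straddle original and temporary groups, drain into the temporary groups, re-attach
// the originals with the target balancing mode, drain back, and delete the temporary groups.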
func main() {
//flag.Usage
flag.Parse()
args := flag.Args()
if len(args) != 3 {
log.Fatalf("Expected three arguments: project_id region balancing_mode")
}
projectID, regionName, targetBalancingMode = args[0], args[1], args[2]
switch targetBalancingMode {
case balancingModeRATE, balancingModeUTIL:
default:
panic(fmt.Errorf("expected either %s or %s, actual: %v", balancingModeRATE, balancingModeUTIL, targetBalancingMode))
}
igs = make(map[string]*compute.InstanceGroup)
tokenSource, err := google.DefaultTokenSource(
oauth2.NoContext,
compute.CloudPlatformScope,
compute.ComputeScope)
if err != nil {
panic(err)
}
client := oauth2.NewClient(oauth2.NoContext, tokenSource)
s, err = compute.New(client)
if err != nil {
panic(err)
}
// Get Region
region, err = s.Regions.Get(projectID, regionName).Do()
if err != nil {
panic(err)
}
// Get Zones
zoneFilter := fmt.Sprintf("(region eq %s)", region.SelfLink)
zoneList, err := s.Zones.List(projectID).Filter(zoneFilter).Do()
if err != nil {
panic(err)
}
zones = zoneList.Items
// Get instance groups
for _, z := range zones {
igl, err := s.InstanceGroups.List(projectID, z.Name).Do()
if err != nil {
panic(err)
}
for _, ig := range igl.Items {
if !strings.HasPrefix(ig.Name, "k8s-ig--") {
continue
}
if instanceGroupName == "" {
instanceGroupName = ig.Name
}
// Note instances
r := &compute.InstanceGroupsListInstancesRequest{InstanceState: "ALL"}
instList, err := s.InstanceGroups.ListInstances(projectID, getResourceName(ig.Zone, "zones"), ig.Name, r).Do()
if err != nil {
panic(err)
}
for _, i := range instList.Items {
inst, err := s.Instances.Get(projectID, getResourceName(ig.Zone, "zones"), getResourceName(i.Instance, "instances")).Do()
if err != nil {
panic(err)
}
instances = append(instances, inst)
}
// Note instance group in zone
igs[z.Name] = ig
}
}
if instanceGroupName == "" {
panic(errors.New("Could not determine k8s load balancer instance group"))
}
bs := getBackendServices()
fmt.Println("Region:", region.Name)
fmt.Println("Backend Services:", len(bs))
fmt.Println("Instance Groups:", len(igs))
// Create temporary instance groups
for zone, ig := range igs {
_, err = s.InstanceGroups.Get(projectID, zone, instanceGroupTemp).Do()
if err != nil {
newIg := &compute.InstanceGroup{
Name: instanceGroupTemp,
Zone: zone,
NamedPorts: ig.NamedPorts,
}
fmt.Println("Creating", instanceGroupTemp, "zone:", zone)
_, err = s.InstanceGroups.Insert(projectID, zone, newIg).Do()
if err != nil {
panic(err)
}
}
}
// Straddle both groups
fmt.Println("Straddle both groups in backend services")
setBackendsTo(true, balancingModeInverse(targetBalancingMode), true, balancingModeInverse(targetBalancingMode))
fmt.Println("Migrate instances to temporary group")
migrateInstances(instanceGroupName, instanceGroupTemp)
time.Sleep(20 * time.Second)
// Remove original backends
fmt.Println("Remove original backends")
setBackendsTo(false, "", true, balancingModeInverse(targetBalancingMode))
sleep(1 * time.Minute)
// Straddle both groups (new balancing mode)
fmt.Println("Create backends pointing to original instance groups")
setBackendsTo(true, targetBalancingMode, true, balancingModeInverse(targetBalancingMode))
sleep(20 * time.Second)
fmt.Println("Migrate instances back to original groups")
migrateInstances(instanceGroupTemp, instanceGroupName)
sleep(20 * time.Second)
fmt.Println("Remove temporary backends")
setBackendsTo(true, targetBalancingMode, false, "")
sleep(20 * time.Second)
fmt.Println("Delete temporary instance groups")
for z := range igs {
_, err = s.InstanceGroups.Delete(projectID, z, instanceGroupTemp).Do()
if err != nil {
fmt.Println("Couldn't delete temporary instance group", instanceGroupTemp)
}
}
}
func sleep(d time.Duration) {
fmt.Println("Sleeping for", d.String())
time.Sleep(d)
}
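// setBackendsTo rewrites the backends of every k8s backend service so that, per zone,
// they point at the original and/or temporary instance group with the given balancing modes.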
func setBackendsTo(orig bool, origMode string, temp bool, tempMode string) {
bs := getBackendServices()
for _, bsi := range bs {
var union []*compute.Backend
for zone := range igs {
if orig {
b := &compute.Backend{
Group: createInstanceGroupLink(zone, instanceGroupName),
BalancingMode: origMode,
CapacityScaler: 0.8,
MaxRatePerInstance: 1.0,
}
union = append(union, b)
}
if temp {
b := &compute.Backend{
Group: createInstanceGroupLink(zone, instanceGroupTemp),
BalancingMode: tempMode,
CapacityScaler: 0.8,
MaxRatePerInstance: 1.0,
}
union = append(union, b)
}
}
bsi.Backends = union
_, err := s.BackendServices.Update(projectID, bsi.Name, bsi).Do()
if err != nil {
panic(err)
}
}
}
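// balancingModeInverse returns the opposite balancing mode, or "" for an unrecognized one.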
func balancingModeInverse(m string) string {
switch m {
case balancingModeRATE:
return balancingModeUTIL
case balancingModeUTIL:
return balancingModeRATE
default:
return ""
}
}
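// getBackendServices lists the global backend services created by the GLBC
// (names prefixed with "k8s-be-").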
func getBackendServices() (bs []*compute.BackendService) {
bsl, err := s.BackendServices.List(projectID).Do()
if err != nil {
panic(err)
}
for _, bsli := range bsl.Items {
if bsli.Region == "" && strings.HasPrefix(bsli.Name, "k8s-be-") {
bs = append(bs, bsli)
}
}
return bs
}
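// migrateInstances moves every recorded instance from fromIG to toIG in its zone,
// pausing between calls so the instances drain gradually rather than all at once.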
func migrateInstances(fromIG, toIG string) error {
wg := sync.WaitGroup{}
for _, i := range instances {
wg.Add(1)
go func(i *compute.Instance) {
z := getResourceName(i.Zone, "zones")
fmt.Printf(" - %s (%s)\n", i.Name, z)
rr := &compute.InstanceGroupsRemoveInstancesRequest{Instances: []*compute.InstanceReference{{Instance: i.SelfLink}}}
_, err := s.InstanceGroups.RemoveInstances(projectID, z, fromIG, rr).Do()
if err != nil {
fmt.Println("Skipping error when removing instance from group", err)
}
time.Sleep(10 * time.Second)
ra := &compute.InstanceGroupsAddInstancesRequest{Instances: []*compute.InstanceReference{{Instance: i.SelfLink}}}
_, err = s.InstanceGroups.AddInstances(projectID, z, toIG, ra).Do()
if err != nil {
if !strings.Contains(err.Error(), "memberAlreadyExists") { // GLBC already added the instance back to the IG
fmt.Println("failed to add instance to new IG", i.Name, err)
}
}
wg.Done()
}(i)
time.Sleep(10 * time.Second)
}
wg.Wait()
return nil
}
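// createInstanceGroupLink builds the fully-qualified instance group URL that the
// backend service API expects.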
func createInstanceGroupLink(zone, igName string) string {
return fmt.Sprintf("https://www.googleapis.com/compute/v1/projects/%s/zones/%s/instanceGroups/%s", projectID, zone, igName)
}
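// getResourceName extracts the resource name that follows the given resource type in a
// fully-qualified GCE URL, e.g. the zone name from ".../zones/us-central1-a/instanceGroups/...".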
func getResourceName(link string, resourceType string) string {
s := strings.Split(link, "/")
for i := 0; i < len(s); i++ {
if s[i] == resourceType {
if i+1 < len(s) {
return s[i+1]
}
}
}
return ""
}