feat: allow configuring nginx worker reload behaviour, to prevent multiple concurrent worker reloads

Signed-off-by: Rafael da Fonseca <rafael.fonseca@wildlifestudios.com>
Rafael da Fonseca 2024-01-19 21:22:12 +00:00
parent a596a18aeb
commit bc96769c91
4 changed files with 66 additions and 3 deletions


@@ -114,6 +114,7 @@ The following table shows a configuration option's name, type, and the default v
|[worker-processes](#worker-processes)|string|`<Number of CPUs>`||
|[worker-cpu-affinity](#worker-cpu-affinity)|string|""||
|[worker-shutdown-timeout](#worker-shutdown-timeout)|string|"240s"||
|[concurrently-reload-worker-processes](#concurrently-reload-worker-processes)|bool|"true"||
|[load-balance](#load-balance)|string|"round_robin"||
|[variables-hash-bucket-size](#variables-hash-bucket-size)|int|128||
|[variables-hash-max-size](#variables-hash-max-size)|int|2048||


@@ -474,6 +474,13 @@ type Configuration struct {
// http://nginx.org/en/docs/ngx_core_module.html#worker_processes
WorkerProcesses string `json:"worker-processes,omitempty"`
// Defines whether multiple concurrent reloads of worker processes may be in flight at the same time.
// Set this to false to prevent more than n x 2 worker processes from existing at any time, avoiding potential OOM situations and high CPU load:
// every reload spawns a fresh set of workers while the old ones drain, so overlapping reloads can stack up worker processes without bound.
// With this setting set to false, configuration changes in the queue are re-queued with an exponential backoff until the number of worker processes matches the expected value.
// By default, new worker processes are spawned every time there is a change that cannot be applied dynamically, with no upper limit on the number of running workers.
// http://nginx.org/en/docs/ngx_core_module.html#worker_processes
ConcurrentlyReloadWorkers bool `json:"concurrently-reload-worker-processes,omitempty"`
// Defines a timeout for a graceful shutdown of worker processes
// http://nginx.org/en/docs/ngx_core_module.html#worker_shutdown_timeout
WorkerShutdownTimeout string `json:"worker-shutdown-timeout,omitempty"`
@@ -842,6 +849,7 @@ func NewDefault() Configuration {
UseGzip: false,
UseGeoIP2: false,
WorkerProcesses: strconv.Itoa(runtime.NumCPU()),
ConcurrentlyReloadWorkers: true,
WorkerShutdownTimeout: "240s",
VariablesHashBucketSize: 256,
VariablesHashMaxSize: 2048,
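
For orientation, a minimal sketch of how the new field behaves, assuming the package path k8s.io/ingress-nginx/internal/ingress/controller/config (an internal package, so this only builds inside the controller repo); the default keeps the previous behaviour of unrestricted concurrent reloads:

```go
package main

import (
	"fmt"

	"k8s.io/ingress-nginx/internal/ingress/controller/config" // internal package: importable only within the ingress-nginx module
)

func main() {
	// NewDefault enables concurrent worker reloads, preserving the previous behaviour.
	cfg := config.NewDefault()
	fmt.Println(cfg.ConcurrentlyReloadWorkers) // true

	// Setting the ConfigMap key concurrently-reload-worker-processes to "false"
	// flips this field, which makes OnUpdate serialize reloads (see below).
	cfg.ConcurrentlyReloadWorkers = false
	fmt.Println(cfg.ConcurrentlyReloadWorkers) // false
}
```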


@@ -35,6 +35,7 @@ import (
"syscall"
"text/template"
"time"
"unicode"
proxyproto "github.com/armon/go-proxyproto"
"github.com/eapache/channels"
@@ -90,6 +91,7 @@ func NewNGINXController(config *Configuration, mc metric.Collector) *NGINXContro
resolver: h,
cfg: config,
syncRateLimiter: flowcontrol.NewTokenBucketRateLimiter(config.SyncRateLimit, 1),
workersReloading: false,
recorder: eventBroadcaster.NewRecorder(scheme.Scheme, apiv1.EventSource{
Component: "nginx-ingress-controller",
@@ -227,6 +229,8 @@ type NGINXController struct {
syncRateLimiter flowcontrol.RateLimiter
// workersReloading tracks whether a reload of the NGINX worker processes is currently in progress
workersReloading bool
// stopLock is used to enforce that only a single call to Stop send at
// a given time. We allow stopping through an HTTP endpoint and
// allowing concurrent stoppers leads to stack traces.
@@ -668,6 +672,12 @@ Error: %v
//
//nolint:gocritic // the cfg shouldn't be changed, and shouldn't be mutated by other processes while being rendered.
func (n *NGINXController) OnUpdate(ingressCfg ingress.Configuration) error {
concurrentlyReloadWorkers := n.store.GetBackendConfiguration().ConcurrentlyReloadWorkers
if !concurrentlyReloadWorkers && n.workersReloading {
klog.InfoS("worker reload already in progress, requeuing reload")
return errors.New("worker reload already in progress, requeuing reload")
}
cfg := n.store.GetBackendConfiguration()
cfg.Resolver = n.resolver
@@ -733,9 +743,41 @@ func (n *NGINXController) OnUpdate(ingressCfg ingress.Configuration) error {
return fmt.Errorf("%v\n%v", err, string(o))
}
// Reload status checking runs in a separate goroutine to avoid blocking the sync queue
if !concurrentlyReloadWorkers {
go n.awaitWorkersReload()
}
return nil
}
// awaitWorkersReload waits until the number of running worker processes returns to the expected count
func (n *NGINXController) awaitWorkersReload() {
n.workersReloading = true
defer func() { n.workersReloading = false }()
expectedWorkers := n.store.GetBackendConfiguration().WorkerProcesses
var numWorkers string
klog.V(3).Infof("waiting for worker count to be equal to %s", expectedWorkers)
for numWorkers != expectedWorkers {
time.Sleep(time.Second)
o, err := exec.Command("/bin/sh", "-c", "pgrep worker | wc -l").Output()
if err != nil {
klog.ErrorS(err, "failed to count running worker processes")
return
}
// cleanup any non-printable chars from shell output
numWorkers = strings.Map(func(r rune) rune {
if unicode.IsPrint(r) {
return r
}
return -1
}, string(o))
klog.V(3).Infof("Currently running nginx worker processes: %s, expected %s", numWorkers, expectedWorkers)
}
}
// nginxHashBucketSize computes the correct NGINX hash_bucket_size for a hash
// with the given longest key.
func nginxHashBucketSize(longestString int) int {
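
The early return in OnUpdate above relies on the sync queue retrying the rejected change later. A minimal sketch of that requeue-with-exponential-backoff idea, using client-go's workqueue rate limiter directly; the base and max delays, and the direct use of workqueue, are illustrative assumptions rather than the controller's actual wiring:

```go
package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/workqueue"
)

func main() {
	// Each failed sync (OnUpdate returning an error) asks the limiter when the item
	// should be retried; the delay doubles per failure until it reaches the cap.
	limiter := workqueue.NewItemExponentialFailureRateLimiter(100*time.Millisecond, 10*time.Second)

	item := "configuration-change"
	for i := 0; i < 5; i++ {
		fmt.Printf("retry %d scheduled after %v\n", i+1, limiter.When(item))
	}

	// A successful sync resets the backoff for the item.
	limiter.Forget(item)
}
```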


@@ -67,6 +67,7 @@ const (
luaSharedDictsKey = "lua-shared-dicts"
plugins = "plugins"
debugConnections = "debug-connections"
concurrentlyReloadWorkers = "concurrently-reload-worker-processes"
)
var (
@@ -385,6 +386,17 @@ func ReadConfig(src map[string]string) config.Configuration {
delete(conf, workerProcesses)
}
if val, ok := conf[concurrentlyReloadWorkers]; ok {
boolVal, err := strconv.ParseBool(val)
if err != nil {
to.ConcurrentlyReloadWorkers = true
klog.Warningf("failed to parse concurrently-reload-worker-processes setting, valid values are true or false, found %s", val)
} else {
to.ConcurrentlyReloadWorkers = boolVal
}
delete(conf, concurrentlyReloadWorkers)
}
if val, ok := conf[plugins]; ok {
to.Plugins = splitAndTrimSpace(val, ",")
delete(conf, plugins)
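
As a quick illustration of the parsing above: strconv.ParseBool accepts the usual Go spellings, and anything it rejects keeps the default of true with a warning (a sketch only; the sample values are arbitrary):

```go
package main

import (
	"fmt"
	"strconv"
)

func main() {
	// Values strconv.ParseBool understands toggle the setting; anything else is
	// rejected, so the controller keeps the default (true) and logs a warning.
	for _, v := range []string{"true", "false", "1", "0", "T", "F", "yes"} {
		b, err := strconv.ParseBool(v)
		if err != nil {
			fmt.Printf("%q -> invalid, default (true) kept\n", v)
			continue
		}
		fmt.Printf("%q -> %v\n", v, b)
	}
}
```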