From 034c3ccad41374822a9180dceb3e5b999dc43f2f Mon Sep 17 00:00:00 2001 From: Jon Carl Date: Mon, 26 Aug 2024 13:09:11 -0600 Subject: [PATCH] Metrics: Add `--metrics-per-undefined-host` argument. (#11818) Signed-off-by: Jon Carl --- cmd/dataplane/main.go | 2 +- cmd/nginx/main.go | 2 +- docs/user-guide/cli-arguments.md | 1 + docs/user-guide/monitoring.md | 4 +- internal/ingress/controller/controller.go | 15 ++-- internal/ingress/metric/collectors/socket.go | 16 ++-- .../ingress/metric/collectors/socket_test.go | 75 ++++++++++++++++--- internal/ingress/metric/main.go | 4 +- pkg/flags/flags.go | 8 ++ pkg/flags/flags_test.go | 26 +++++++ test/e2e/metrics/metrics.go | 47 ++++++++++++ 11 files changed, 172 insertions(+), 28 deletions(-) diff --git a/cmd/dataplane/main.go b/cmd/dataplane/main.go index 29a175a6f..e7e4dc38f 100644 --- a/cmd/dataplane/main.go +++ b/cmd/dataplane/main.go @@ -66,7 +66,7 @@ func main() { mc := metric.NewDummyCollector() if conf.EnableMetrics { // TODO: Ingress class is not a part of dataplane anymore - mc, err = metric.NewCollector(conf.MetricsPerHost, conf.ReportStatusClasses, reg, conf.IngressClassConfiguration.Controller, *conf.MetricsBuckets, conf.MetricsBucketFactor, conf.MetricsMaxBuckets, conf.ExcludeSocketMetrics) + mc, err = metric.NewCollector(conf.MetricsPerHost, conf.MetricsPerUndefinedHost, conf.ReportStatusClasses, reg, conf.IngressClassConfiguration.Controller, *conf.MetricsBuckets, conf.MetricsBucketFactor, conf.MetricsMaxBuckets, conf.ExcludeSocketMetrics) if err != nil { klog.Fatalf("Error creating prometheus collector: %v", err) } diff --git a/cmd/nginx/main.go b/cmd/nginx/main.go index ea5acedc2..781f3a8eb 100644 --- a/cmd/nginx/main.go +++ b/cmd/nginx/main.go @@ -130,7 +130,7 @@ func main() { mc := metric.NewDummyCollector() if conf.EnableMetrics { - mc, err = metric.NewCollector(conf.MetricsPerHost, conf.ReportStatusClasses, reg, conf.IngressClassConfiguration.Controller, *conf.MetricsBuckets, conf.MetricsBucketFactor, conf.MetricsMaxBuckets, conf.ExcludeSocketMetrics) + mc, err = metric.NewCollector(conf.MetricsPerHost, conf.MetricsPerUndefinedHost, conf.ReportStatusClasses, reg, conf.IngressClassConfiguration.Controller, *conf.MetricsBuckets, conf.MetricsBucketFactor, conf.MetricsMaxBuckets, conf.ExcludeSocketMetrics) if err != nil { klog.Fatalf("Error creating prometheus collector: %v", err) } diff --git a/docs/user-guide/cli-arguments.md b/docs/user-guide/cli-arguments.md index 5315dd325..1beb821c7 100644 --- a/docs/user-guide/cli-arguments.md +++ b/docs/user-guide/cli-arguments.md @@ -48,6 +48,7 @@ They are set in the container spec of the `ingress-nginx-controller` Deployment | `--maxmind-license-key` | Maxmind license key to download GeoLite2 Databases. https://blog.maxmind.com/2019/12/significant-changes-to-accessing-and-using-geolite2-databases/ . | | `--maxmind-mirror` | Maxmind mirror url (example: http://geoip.local/databases. | | `--metrics-per-host` | Export metrics per-host. (default true) | +| `--metrics-per-undefined-host` | Export metrics per-host even if the host is not defined in an ingress. Requires --metrics-per-host to be set to true. (default false) | | `--monitor-max-batch-size` | Max batch size of NGINX metrics. (default 10000)| | `--post-shutdown-grace-period` | Additional delay in seconds before controller container exits. (default 10) | | `--profiler-port` | Port to use for expose the ingress controller Go profiler when it is enabled. (default 10245) | diff --git a/docs/user-guide/monitoring.md b/docs/user-guide/monitoring.md index b96914e1b..f08e1bc2f 100644 --- a/docs/user-guide/monitoring.md +++ b/docs/user-guide/monitoring.md @@ -166,7 +166,9 @@ According to the above example, this URL will be http://10.192.0.3:31086 #### Wildcard ingresses - - By default request metrics are labeled with the hostname. When you have a wildcard domain ingress, then there will be no metrics for that ingress (to prevent the metrics from exploding in cardinality). To get metrics in this case you need to run the ingress controller with `--metrics-per-host=false` (you will lose labeling by hostname, but still have labeling by ingress). + - By default request metrics are labeled with the hostname. When you have a wildcard domain ingress, then there will be no metrics for that ingress (to prevent the metrics from exploding in cardinality). To get metrics in this case you have two options: + - Run the ingress controller with `--metrics-per-host=false`. You will lose labeling by hostname, but still have labeling by ingress. + - Run the ingress controller with `--metrics-per-undefined-host=true --metrics-per-host=true`. You will get labeling by hostname even if the hostname is not explicitly defined on an ingress. Be warned that cardinality could explode due to many hostnames. ### Grafana dashboard using ingress resource - If you want to expose the dashboard for grafana using an ingress resource, then you can : diff --git a/internal/ingress/controller/controller.go b/internal/ingress/controller/controller.go index 45e11268c..aa8f4c4b9 100644 --- a/internal/ingress/controller/controller.go +++ b/internal/ingress/controller/controller.go @@ -105,13 +105,14 @@ type Configuration struct { EnableProfiling bool - EnableMetrics bool - MetricsPerHost bool - MetricsBuckets *collectors.HistogramBuckets - MetricsBucketFactor float64 - MetricsMaxBuckets uint32 - ReportStatusClasses bool - ExcludeSocketMetrics []string + EnableMetrics bool + MetricsPerHost bool + MetricsPerUndefinedHost bool + MetricsBuckets *collectors.HistogramBuckets + MetricsBucketFactor float64 + MetricsMaxBuckets uint32 + ReportStatusClasses bool + ExcludeSocketMetrics []string FakeCertificate *ingress.SSLCert diff --git a/internal/ingress/metric/collectors/socket.go b/internal/ingress/metric/collectors/socket.go index d0b57eb43..0bdd816ae 100644 --- a/internal/ingress/metric/collectors/socket.go +++ b/internal/ingress/metric/collectors/socket.go @@ -81,8 +81,9 @@ type SocketCollector struct { hosts sets.Set[string] - metricsPerHost bool - reportStatusClasses bool + metricsPerHost bool + metricsPerUndefinedHost bool + reportStatusClasses bool } var requestTags = []string{ @@ -99,7 +100,7 @@ var requestTags = []string{ // NewSocketCollector creates a new SocketCollector instance using // the ingress watch namespace and class used by the controller -func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStatusClasses bool, buckets HistogramBuckets, bucketFactor float64, maxBuckets uint32, excludeMetrics []string) (*SocketCollector, error) { +func NewSocketCollector(pod, namespace, class string, metricsPerHost, metricsPerUndefinedHost, reportStatusClasses bool, buckets HistogramBuckets, bucketFactor float64, maxBuckets uint32, excludeMetrics []string) (*SocketCollector, error) { socket := "/tmp/nginx/prometheus-nginx.socket" // unix sockets must be unlink()ed before being used //nolint:errcheck // Ignore unlink error @@ -139,8 +140,9 @@ func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStat sc := &SocketCollector{ listener: listener, - metricsPerHost: metricsPerHost, - reportStatusClasses: reportStatusClasses, + metricsPerHost: metricsPerHost, + metricsPerUndefinedHost: metricsPerUndefinedHost, + reportStatusClasses: reportStatusClasses, connectTime: histogramMetric( &prometheus.HistogramOpts{ @@ -306,8 +308,8 @@ func (sc *SocketCollector) handleMessage(msg []byte) { for i := range statsBatch { stats := &statsBatch[i] - if sc.metricsPerHost && !sc.hosts.Has(stats.Host) { - klog.V(3).InfoS("Skipping metric for host not being served", "host", stats.Host) + if sc.metricsPerHost && !sc.hosts.Has(stats.Host) && !sc.metricsPerUndefinedHost { + klog.V(3).InfoS("Skipping metric for host not explicitly defined in an ingress", "host", stats.Host) continue } diff --git a/internal/ingress/metric/collectors/socket_test.go b/internal/ingress/metric/collectors/socket_test.go index a8261a9ca..3a1f29f35 100644 --- a/internal/ingress/metric/collectors/socket_test.go +++ b/internal/ingress/metric/collectors/socket_test.go @@ -87,14 +87,15 @@ func TestCollector(t *testing.T) { maxBuckets := uint32(100) cases := []struct { - name string - data []string - metrics []string - useStatusClasses bool - excludeMetrics []string - wantBefore string - removeIngresses []string - wantAfter string + name string + data []string + metrics []string + metricsPerUndefinedHost bool + useStatusClasses bool + excludeMetrics []string + wantBefore string + removeIngresses []string + wantAfter string }{ { name: "invalid metric object should not increase prometheus metrics", @@ -591,13 +592,69 @@ func TestCollector(t *testing.T) { nginx_ingress_controller_response_duration_seconds_count{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx"} 1 `, }, + { + name: "metrics with a host should not be dropped when the host is not in the hosts slice but metricsPerUndefinedHost is true", + data: []string{`[{ + "host":"wildcard.testshop.com", + "status":"200", + "bytesSent":150.0, + "method":"GET", + "path":"/admin", + "requestLength":300.0, + "requestTime":60.0, + "upstreamLatency":1.0, + "upstreamHeaderTime":5.0, + "upstreamName":"test-upstream", + "upstreamIP":"1.1.1.1:8080", + "upstreamResponseTime":200, + "upstreamStatus":"220", + "namespace":"test-app-production", + "ingress":"web-yml", + "service":"test-app", + "canary":"" + }]`}, + excludeMetrics: []string{"response_duration_seconds2", "test.*", "nginx_ingress_.*", "response_duration_secon"}, + metrics: []string{"nginx_ingress_controller_requests"}, + metricsPerUndefinedHost: true, + useStatusClasses: true, + wantBefore: ` + # HELP nginx_ingress_controller_requests The total number of client requests + # TYPE nginx_ingress_controller_requests counter + nginx_ingress_controller_requests{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="wildcard.testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx"} 1 + `, + }, + { + name: "metrics with a host should be dropped when the host is not in the hosts slice", + data: []string{`[{ + "host":"wildcard.testshop.com", + "status":"200", + "bytesSent":150.0, + "method":"GET", + "path":"/admin", + "requestLength":300.0, + "requestTime":60.0, + "upstreamLatency":1.0, + "upstreamHeaderTime":5.0, + "upstreamName":"test-upstream", + "upstreamIP":"1.1.1.1:8080", + "upstreamResponseTime":200, + "upstreamStatus":"220", + "namespace":"test-app-production", + "ingress":"web-yml", + "service":"test-app", + "canary":"" + }]`}, + excludeMetrics: []string{"response_duration_seconds2", "test.*", "nginx_ingress_.*", "response_duration_secon"}, + metrics: []string{"nginx_ingress_controller_requests"}, + useStatusClasses: true, + }, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { registry := prometheus.NewPedanticRegistry() - sc, err := NewSocketCollector("pod", "default", "ingress", true, c.useStatusClasses, buckets, bucketFactor, maxBuckets, c.excludeMetrics) + sc, err := NewSocketCollector("pod", "default", "ingress", true, c.metricsPerUndefinedHost, c.useStatusClasses, buckets, bucketFactor, maxBuckets, c.excludeMetrics) if err != nil { t.Errorf("%v: unexpected error creating new SocketCollector: %v", c.name, err) } diff --git a/internal/ingress/metric/main.go b/internal/ingress/metric/main.go index 99128c3c6..9ed401d19 100644 --- a/internal/ingress/metric/main.go +++ b/internal/ingress/metric/main.go @@ -71,7 +71,7 @@ type collector struct { } // NewCollector creates a new metric collector the for ingress controller -func NewCollector(metricsPerHost, reportStatusClasses bool, registry *prometheus.Registry, ingressclass string, buckets collectors.HistogramBuckets, bucketFactor float64, maxBuckets uint32, excludedSocketMetrics []string) (Collector, error) { +func NewCollector(metricsPerHost, metricsPerUndefinedHost, reportStatusClasses bool, registry *prometheus.Registry, ingressclass string, buckets collectors.HistogramBuckets, bucketFactor float64, maxBuckets uint32, excludedSocketMetrics []string) (Collector, error) { podNamespace := os.Getenv("POD_NAMESPACE") if podNamespace == "" { podNamespace = "default" @@ -89,7 +89,7 @@ func NewCollector(metricsPerHost, reportStatusClasses bool, registry *prometheus return nil, err } - s, err := collectors.NewSocketCollector(podName, podNamespace, ingressclass, metricsPerHost, reportStatusClasses, buckets, bucketFactor, maxBuckets, excludedSocketMetrics) + s, err := collectors.NewSocketCollector(podName, podNamespace, ingressclass, metricsPerHost, metricsPerUndefinedHost, reportStatusClasses, buckets, bucketFactor, maxBuckets, excludedSocketMetrics) if err != nil { return nil, err } diff --git a/pkg/flags/flags.go b/pkg/flags/flags.go index 5a1ab0aa9..6f62f75b5 100644 --- a/pkg/flags/flags.go +++ b/pkg/flags/flags.go @@ -17,6 +17,7 @@ limitations under the License. package flags import ( + "errors" "flag" "fmt" "net" @@ -177,6 +178,8 @@ Requires the update-status parameter.`) `Enables the collection of NGINX metrics.`) metricsPerHost = flags.Bool("metrics-per-host", true, `Export metrics per-host.`) + metricsPerUndefinedHost = flags.Bool("metrics-per-undefined-host", false, + `Export metrics per-host even if the host is not defined in an ingress. Requires --metrics-per-host to be set to true.`) reportStatusClasses = flags.Bool("report-status-classes", false, `Use status classes (2xx, 3xx, 4xx and 5xx) instead of status codes in metrics.`) @@ -319,6 +322,10 @@ https://blog.maxmind.com/2019/12/significant-changes-to-accessing-and-using-geol } } + if *metricsPerUndefinedHost && !*metricsPerHost { + return false, nil, errors.New("--metrics-per-undefined-host=true must be passed with --metrics-per-host=true") + } + if *electionTTL <= 0 { *electionTTL = 30 * time.Second } @@ -340,6 +347,7 @@ https://blog.maxmind.com/2019/12/significant-changes-to-accessing-and-using-geol EnableProfiling: *profiling, EnableMetrics: *enableMetrics, MetricsPerHost: *metricsPerHost, + MetricsPerUndefinedHost: *metricsPerUndefinedHost, MetricsBuckets: histogramBuckets, MetricsBucketFactor: *bucketFactor, MetricsMaxBuckets: *maxBuckets, diff --git a/pkg/flags/flags_test.go b/pkg/flags/flags_test.go index e51d5fa6c..fdf153021 100644 --- a/pkg/flags/flags_test.go +++ b/pkg/flags/flags_test.go @@ -212,3 +212,29 @@ func TestLeaderElectionTTLParseValueInHours(t *testing.T) { t.Fatalf("Expected --election-ttl and conf.ElectionTTL as 1h, but found: %v", conf.ElectionTTL) } } + +func TestMetricsPerUndefinedHost(t *testing.T) { + ResetForTesting(func() { t.Fatal("Parsing failed") }) + + oldArgs := os.Args + defer func() { os.Args = oldArgs }() + os.Args = []string{"cmd", "--metrics-per-undefined-host=true"} + + _, _, err := ParseFlags() + if err != nil { + t.Fatalf("Expected no error but got: %s", err) + } +} + +func TestMetricsPerUndefinedHostWithMetricsPerHostFalse(t *testing.T) { + ResetForTesting(func() { t.Fatal("Parsing failed") }) + + oldArgs := os.Args + defer func() { os.Args = oldArgs }() + os.Args = []string{"cmd", "--metrics-per-host=false", "--metrics-per-undefined-host=true"} + + _, _, err := ParseFlags() + if err == nil { + t.Fatalf("Expected an error parsing flags but none returned") + } +} diff --git a/test/e2e/metrics/metrics.go b/test/e2e/metrics/metrics.go index 907b53732..bec09bb37 100644 --- a/test/e2e/metrics/metrics.go +++ b/test/e2e/metrics/metrics.go @@ -36,6 +36,7 @@ const waitForMetrics = 2 * time.Second var _ = framework.IngressNginxDescribe("[metrics] exported prometheus metrics", func() { f := framework.NewDefaultFramework("metrics") host := "foo.com" + wildcardHost := "wildcard." + host ginkgo.BeforeEach(func() { f.NewEchoDeployment() @@ -91,4 +92,50 @@ var _ = framework.IngressNginxDescribe("[metrics] exported prometheus metrics", assert.Nil(ginkgo.GinkgoT(), err) assert.NotNil(ginkgo.GinkgoT(), mf) }) + ginkgo.It("request metrics per undefined host are present when flag is set", func() { + err := f.UpdateIngressControllerDeployment(func(deployment *appsv1.Deployment) error { + args := deployment.Spec.Template.Spec.Containers[0].Args + args = append(args, "--metrics-per-undefined-host=true") + deployment.Spec.Template.Spec.Containers[0].Args = args + _, err := f.KubeClientSet.AppsV1().Deployments(f.Namespace).Update(context.TODO(), deployment, metav1.UpdateOptions{}) + return err + }) + assert.Nil(ginkgo.GinkgoT(), err, "updating deployment") + + f.HTTPTestClient(). + GET("/"). + WithHeader("Host", wildcardHost). + Expect(). + Status(http.StatusNotFound) + time.Sleep(waitForMetrics) + + ip := f.GetNginxPodIP() + reqMetrics, err := f.GetMetric("nginx_ingress_controller_requests", ip) + assert.Nil(ginkgo.GinkgoT(), err) + assert.NotNil(ginkgo.GinkgoT(), reqMetrics.Metric) + assert.Len(ginkgo.GinkgoT(), reqMetrics.Metric, 1) + + containedLabel := false + for _, label := range reqMetrics.Metric[0].Label { + if *label.Name == "host" && *label.Value == wildcardHost { + containedLabel = true + break + } + } + + assert.Truef(ginkgo.GinkgoT(), containedLabel, "expected reqMetrics to contain label with \"name\"=\"host\" \"value\"=%q, but it did not: %s", wildcardHost, reqMetrics.String()) + }) + ginkgo.It("request metrics per undefined host are not present when flag is not set", func() { + f.HTTPTestClient(). + GET("/"). + WithHeader("Host", wildcardHost). + Expect(). + Status(http.StatusNotFound) + time.Sleep(waitForMetrics) + + ip := f.GetNginxPodIP() + reqMetrics, err := f.GetMetric("nginx_ingress_controller_requests", ip) + assert.EqualError(ginkgo.GinkgoT(), err, "there is no metric with name nginx_ingress_controller_requests") + assert.Nil(ginkgo.GinkgoT(), reqMetrics) + }) })