From 4e8d0b5836096426393340273260b9000cd1e151 Mon Sep 17 00:00:00 2001 From: Marco Cadetg Date: Tue, 11 Apr 2023 10:01:18 +0200 Subject: [PATCH] Exclude socket metrics (#9770) * exclude creation and exporting of socket metrics via flag * make exclude metric naming more consistent * fix connect time metric update * add documentation * e2e test * improve creation of metric mapping --- cmd/dataplane/main.go | 2 +- cmd/nginx/main.go | 2 +- docs/user-guide/cli-arguments.md | 1 + internal/ingress/controller/controller.go | 9 +- internal/ingress/metric/collectors/socket.go | 219 +++++++++++------- .../ingress/metric/collectors/socket_test.go | 116 +++++++++- internal/ingress/metric/main.go | 4 +- pkg/flags/flags.go | 10 +- test/e2e/e2e.go | 1 + test/e2e/metrics/metrics.go | 94 ++++++++ 10 files changed, 362 insertions(+), 96 deletions(-) create mode 100644 test/e2e/metrics/metrics.go diff --git a/cmd/dataplane/main.go b/cmd/dataplane/main.go index 0ab978429..6fd559e4d 100644 --- a/cmd/dataplane/main.go +++ b/cmd/dataplane/main.go @@ -70,7 +70,7 @@ func main() { mc := metric.NewDummyCollector() if conf.EnableMetrics { // TODO: Ingress class is not a part of dataplane anymore - mc, err = metric.NewCollector(conf.MetricsPerHost, conf.ReportStatusClasses, reg, conf.IngressClassConfiguration.Controller, *conf.MetricsBuckets) + mc, err = metric.NewCollector(conf.MetricsPerHost, conf.ReportStatusClasses, reg, conf.IngressClassConfiguration.Controller, *conf.MetricsBuckets, conf.ExcludeSocketMetrics) if err != nil { klog.Fatalf("Error creating prometheus collector: %v", err) } diff --git a/cmd/nginx/main.go b/cmd/nginx/main.go index 9f0973ec9..48dd933dc 100644 --- a/cmd/nginx/main.go +++ b/cmd/nginx/main.go @@ -133,7 +133,7 @@ func main() { mc := metric.NewDummyCollector() if conf.EnableMetrics { - mc, err = metric.NewCollector(conf.MetricsPerHost, conf.ReportStatusClasses, reg, conf.IngressClassConfiguration.Controller, *conf.MetricsBuckets) + mc, err = metric.NewCollector(conf.MetricsPerHost, conf.ReportStatusClasses, reg, conf.IngressClassConfiguration.Controller, *conf.MetricsBuckets, conf.ExcludeSocketMetrics) if err != nil { klog.Fatalf("Error creating prometheus collector: %v", err) } diff --git a/docs/user-guide/cli-arguments.md b/docs/user-guide/cli-arguments.md index febc6f762..59e52957d 100644 --- a/docs/user-guide/cli-arguments.md +++ b/docs/user-guide/cli-arguments.md @@ -25,6 +25,7 @@ They are set in the container spec of the `ingress-nginx-controller` Deployment | `--enable-ssl-chain-completion` | Autocomplete SSL certificate chains with missing intermediate CA certificates. Certificates uploaded to Kubernetes must have the "Authority Information Access" X.509 v3 extension for this to succeed. (default false)| | `--enable-ssl-passthrough` | Enable SSL Passthrough. (default false) | | `--enable-topology-aware-routing` | Enable topology aware hints feature, needs service object annotation service.kubernetes.io/topology-aware-hints sets to auto. (default false) | +| `--exclude-socket-metrics` | Set of socket request metrics to exclude which won't be exported nor being calculated. The possible socket request metrics to exclude are documented in the monitoring guide e.g. 'nginx_ingress_controller_request_duration_seconds,nginx_ingress_controller_response_size'| | `--health-check-path` | URL path of the health check endpoint. Configured inside the NGINX status server. All requests received on the port defined by the healthz-port parameter are forwarded internally to this path. 
(default "/healthz") | | `--health-check-timeout` | Time limit, in seconds, for a probe to health-check-path to succeed. (default 10) | | `--healthz-port` | Port to use for the healthz endpoint. (default 10254) | diff --git a/internal/ingress/controller/controller.go b/internal/ingress/controller/controller.go index 5d94605b5..17142a478 100644 --- a/internal/ingress/controller/controller.go +++ b/internal/ingress/controller/controller.go @@ -102,10 +102,11 @@ type Configuration struct { EnableProfiling bool - EnableMetrics bool - MetricsPerHost bool - MetricsBuckets *collectors.HistogramBuckets - ReportStatusClasses bool + EnableMetrics bool + MetricsPerHost bool + MetricsBuckets *collectors.HistogramBuckets + ReportStatusClasses bool + ExcludeSocketMetrics []string FakeCertificate *ingress.SSLCert diff --git a/internal/ingress/metric/collectors/socket.go b/internal/ingress/metric/collectors/socket.go index 23048d5d6..508cc6bc8 100644 --- a/internal/ingress/metric/collectors/socket.go +++ b/internal/ingress/metric/collectors/socket.go @@ -21,6 +21,7 @@ import ( "io" "net" "os" + "strings" "syscall" jsoniter "github.com/json-iterator/go" @@ -60,6 +61,8 @@ type HistogramBuckets struct { SizeBuckets []float64 } +type metricMapping map[string]prometheus.Collector + // SocketCollector stores prometheus metrics and ingress meta-data type SocketCollector struct { prometheus.Collector @@ -78,7 +81,7 @@ type SocketCollector struct { listener net.Listener - metricMapping map[string]interface{} + metricMapping metricMapping hosts sets.Set[string] @@ -106,7 +109,7 @@ var defObjectives = map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001} // NewSocketCollector creates a new SocketCollector instance using // the ingress watch namespace and class used by the controller -func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStatusClasses bool, buckets HistogramBuckets) (*SocketCollector, error) { +func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStatusClasses bool, buckets HistogramBuckets, excludeMetrics []string) (*SocketCollector, error) { socket := "/tmp/nginx/prometheus-nginx.socket" // unix sockets must be unlink()ed before being used _ = syscall.Unlink(socket) @@ -132,13 +135,23 @@ func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStat requestTags = append(requestTags, "host") } + em := make(map[string]struct{}, len(excludeMetrics)) + for _, m := range excludeMetrics { + // remove potential nginx_ingress_controller prefix from the metric name + // TBD: how to handle fully qualified histogram metrics e.g. _buckets and _sum. Should we just remove the suffix and remove the histogram metric or ignore it? 
+ em[strings.TrimPrefix(m, "nginx_ingress_controller_")] = struct{}{} + } + + // create metric mapping with only the metrics that are not excluded + mm := make(metricMapping) + sc := &SocketCollector{ listener: listener, metricsPerHost: metricsPerHost, reportStatusClasses: reportStatusClasses, - connectTime: prometheus.NewHistogramVec( + connectTime: histogramMetric( prometheus.HistogramOpts{ Name: "connect_duration_seconds", Help: "The time spent on establishing a connection with the upstream server", @@ -147,8 +160,11 @@ func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStat Buckets: buckets.TimeBuckets, }, requestTags, + em, + mm, ), - headerTime: prometheus.NewHistogramVec( + + headerTime: histogramMetric( prometheus.HistogramOpts{ Name: "header_duration_seconds", Help: "The time spent on receiving first header from the upstream server", @@ -157,8 +173,10 @@ func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStat Buckets: buckets.TimeBuckets, }, requestTags, + em, + mm, ), - responseTime: prometheus.NewHistogramVec( + responseTime: histogramMetric( prometheus.HistogramOpts{ Name: "response_duration_seconds", Help: "The time spent on receiving the response from the upstream server", @@ -167,8 +185,11 @@ func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStat Buckets: buckets.TimeBuckets, }, requestTags, + em, + mm, ), - requestTime: prometheus.NewHistogramVec( + + requestTime: histogramMetric( prometheus.HistogramOpts{ Name: "request_duration_seconds", Help: "The request processing time in milliseconds", @@ -177,9 +198,11 @@ func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStat Buckets: buckets.TimeBuckets, }, requestTags, + em, + mm, ), - responseLength: prometheus.NewHistogramVec( + responseLength: histogramMetric( prometheus.HistogramOpts{ Name: "response_size", Help: "The response length (including request line, header, and request body)", @@ -188,19 +211,24 @@ func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStat Buckets: buckets.LengthBuckets, }, requestTags, + em, + mm, ), - requestLength: prometheus.NewHistogramVec( + + requestLength: histogramMetric( prometheus.HistogramOpts{ Name: "request_size", Help: "The request length (including request line, header, and request body)", Namespace: PrometheusNamespace, - Buckets: buckets.LengthBuckets, ConstLabels: constLabels, + Buckets: buckets.LengthBuckets, }, requestTags, + em, + mm, ), - requests: prometheus.NewCounterVec( + requests: counterMetric( prometheus.CounterOpts{ Name: "requests", Help: "The total number of client requests", @@ -208,9 +236,11 @@ func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStat ConstLabels: constLabels, }, requestTags, + em, + mm, ), - bytesSent: prometheus.NewHistogramVec( + bytesSent: histogramMetric( prometheus.HistogramOpts{ Name: "bytes_sent", Help: "DEPRECATED The number of bytes sent to a client", @@ -219,9 +249,11 @@ func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStat ConstLabels: constLabels, }, requestTags, + em, + mm, ), - upstreamLatency: prometheus.NewSummaryVec( + upstreamLatency: summaryMetric( prometheus.SummaryOpts{ Name: "ingress_upstream_latency_seconds", Help: "DEPRECATED Upstream service latency per Ingress", @@ -230,28 +262,59 @@ func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStat Objectives: defObjectives, }, []string{"ingress", "namespace", "service", "canary"}, + em, + 
mm, ), } - sc.metricMapping = map[string]interface{}{ - prometheus.BuildFQName(PrometheusNamespace, "", "requests"): sc.requests, - - prometheus.BuildFQName(PrometheusNamespace, "", "connect_duration_seconds"): sc.connectTime, - prometheus.BuildFQName(PrometheusNamespace, "", "header_duration_seconds"): sc.headerTime, - prometheus.BuildFQName(PrometheusNamespace, "", "response_duration_seconds"): sc.responseTime, - prometheus.BuildFQName(PrometheusNamespace, "", "request_duration_seconds"): sc.requestTime, - - prometheus.BuildFQName(PrometheusNamespace, "", "request_size"): sc.requestLength, - prometheus.BuildFQName(PrometheusNamespace, "", "response_size"): sc.responseLength, - - prometheus.BuildFQName(PrometheusNamespace, "", "bytes_sent"): sc.bytesSent, - - prometheus.BuildFQName(PrometheusNamespace, "", "ingress_upstream_latency_seconds"): sc.upstreamLatency, - } - + sc.metricMapping = mm return sc, nil } +func containsMetric(excludeMetrics map[string]struct{}, name string) bool { + if _, ok := excludeMetrics[name]; ok { + klog.V(3).InfoS("Skipping metric", "metric", name) + return true + } + return false +} + +func summaryMetric(opts prometheus.SummaryOpts, requestTags []string, excludeMetrics map[string]struct{}, metricMapping metricMapping) *prometheus.SummaryVec { + if containsMetric(excludeMetrics, opts.Name) { + return nil + } + m := prometheus.NewSummaryVec( + opts, + requestTags, + ) + metricMapping[prometheus.BuildFQName(PrometheusNamespace, "", opts.Name)] = m + return m +} + +func counterMetric(opts prometheus.CounterOpts, requestTags []string, excludeMetrics map[string]struct{}, metricMapping metricMapping) *prometheus.CounterVec { + if containsMetric(excludeMetrics, opts.Name) { + return nil + } + m := prometheus.NewCounterVec( + opts, + requestTags, + ) + metricMapping[prometheus.BuildFQName(PrometheusNamespace, "", opts.Name)] = m + return m +} + +func histogramMetric(opts prometheus.HistogramOpts, requestTags []string, excludeMetrics map[string]struct{}, metricMapping metricMapping) *prometheus.HistogramVec { + if containsMetric(excludeMetrics, opts.Name) { + return nil + } + m := prometheus.NewHistogramVec( + opts, + requestTags, + ) + metricMapping[prometheus.BuildFQName(PrometheusNamespace, "", opts.Name)] = m + return m +} + func (sc *SocketCollector) handleMessage(msg []byte) { klog.V(5).InfoS("Metric", "message", string(msg)) @@ -305,30 +368,36 @@ func (sc *SocketCollector) handleMessage(msg []byte) { "canary": stats.Canary, } - requestsMetric, err := sc.requests.GetMetricWith(collectorLabels) - if err != nil { - klog.ErrorS(err, "Error fetching requests metric") - } else { - requestsMetric.Inc() + if sc.requests != nil { + requestsMetric, err := sc.requests.GetMetricWith(collectorLabels) + if err != nil { + klog.ErrorS(err, "Error fetching requests metric") + } else { + requestsMetric.Inc() + } } if stats.Latency != -1 { - connectTimeMetric, err := sc.connectTime.GetMetricWith(requestLabels) - if err != nil { - klog.ErrorS(err, "Error fetching connect time metric") - } else { - connectTimeMetric.Observe(stats.Latency) + if sc.connectTime != nil { + connectTimeMetric, err := sc.connectTime.GetMetricWith(requestLabels) + if err != nil { + klog.ErrorS(err, "Error fetching connect time metric") + } else { + connectTimeMetric.Observe(stats.Latency) + } } - latencyMetric, err := sc.upstreamLatency.GetMetricWith(latencyLabels) - if err != nil { - klog.ErrorS(err, "Error fetching latency metric") - } else { - latencyMetric.Observe(stats.Latency) + if sc.upstreamLatency 
!= nil { + latencyMetric, err := sc.upstreamLatency.GetMetricWith(latencyLabels) + if err != nil { + klog.ErrorS(err, "Error fetching latency metric") + } else { + latencyMetric.Observe(stats.Latency) + } } } - if stats.HeaderTime != -1 { + if stats.HeaderTime != -1 && sc.headerTime != nil { headerTimeMetric, err := sc.headerTime.GetMetricWith(requestLabels) if err != nil { klog.ErrorS(err, "Error fetching header time metric") @@ -337,7 +406,7 @@ func (sc *SocketCollector) handleMessage(msg []byte) { } } - if stats.RequestTime != -1 { + if stats.RequestTime != -1 && sc.requestTime != nil { requestTimeMetric, err := sc.requestTime.GetMetricWith(requestLabels) if err != nil { klog.ErrorS(err, "Error fetching request duration metric") @@ -346,7 +415,7 @@ func (sc *SocketCollector) handleMessage(msg []byte) { } } - if stats.RequestLength != -1 { + if stats.RequestLength != -1 && sc.requestLength != nil { requestLengthMetric, err := sc.requestLength.GetMetricWith(requestLabels) if err != nil { klog.ErrorS(err, "Error fetching request length metric") @@ -355,7 +424,7 @@ func (sc *SocketCollector) handleMessage(msg []byte) { } } - if stats.ResponseTime != -1 { + if stats.ResponseTime != -1 && sc.responseTime != nil { responseTimeMetric, err := sc.responseTime.GetMetricWith(requestLabels) if err != nil { klog.ErrorS(err, "Error fetching upstream response time metric") @@ -365,18 +434,22 @@ func (sc *SocketCollector) handleMessage(msg []byte) { } if stats.ResponseLength != -1 { - bytesSentMetric, err := sc.bytesSent.GetMetricWith(requestLabels) - if err != nil { - klog.ErrorS(err, "Error fetching bytes sent metric") - } else { - bytesSentMetric.Observe(stats.ResponseLength) + if sc.bytesSent != nil { + bytesSentMetric, err := sc.bytesSent.GetMetricWith(requestLabels) + if err != nil { + klog.ErrorS(err, "Error fetching bytes sent metric") + } else { + bytesSentMetric.Observe(stats.ResponseLength) + } } - responseSizeMetric, err := sc.responseLength.GetMetricWith(requestLabels) - if err != nil { - klog.ErrorS(err, "Error fetching bytes sent metric") - } else { - responseSizeMetric.Observe(stats.ResponseLength) + if sc.responseLength != nil { + responseSizeMetric, err := sc.responseLength.GetMetricWith(requestLabels) + if err != nil { + klog.ErrorS(err, "Error fetching bytes sent metric") + } else { + responseSizeMetric.Observe(stats.ResponseLength) + } } } } @@ -471,36 +544,16 @@ func (sc *SocketCollector) RemoveMetrics(ingresses []string, registry prometheus // Describe implements prometheus.Collector func (sc SocketCollector) Describe(ch chan<- *prometheus.Desc) { - sc.connectTime.Describe(ch) - sc.headerTime.Describe(ch) - sc.responseTime.Describe(ch) - sc.requestTime.Describe(ch) - - sc.requestLength.Describe(ch) - sc.responseLength.Describe(ch) - - sc.requests.Describe(ch) - - sc.upstreamLatency.Describe(ch) - - sc.bytesSent.Describe(ch) + for _, metric := range sc.metricMapping { + metric.Describe(ch) + } } // Collect implements the prometheus.Collector interface. 
func (sc SocketCollector) Collect(ch chan<- prometheus.Metric) { - sc.connectTime.Collect(ch) - sc.headerTime.Collect(ch) - sc.responseTime.Collect(ch) - sc.requestTime.Collect(ch) - - sc.requestLength.Collect(ch) - sc.responseLength.Collect(ch) - - sc.requests.Collect(ch) - - sc.upstreamLatency.Collect(ch) - - sc.bytesSent.Collect(ch) + for _, metric := range sc.metricMapping { + metric.Collect(ch) + } } // SetHosts sets the hostnames that are being served by the ingress controller diff --git a/internal/ingress/metric/collectors/socket_test.go b/internal/ingress/metric/collectors/socket_test.go index 4bebc7600..fe442aba0 100644 --- a/internal/ingress/metric/collectors/socket_test.go +++ b/internal/ingress/metric/collectors/socket_test.go @@ -84,6 +84,7 @@ func TestCollector(t *testing.T) { data []string metrics []string useStatusClasses bool + excludeMetrics []string wantBefore string removeIngresses []string wantAfter string @@ -470,13 +471,126 @@ func TestCollector(t *testing.T) { wantAfter: ` `, }, + { + name: "basic exclude metrics test", + data: []string{`[{ + "host":"testshop.com", + "status":"200", + "bytesSent":150.0, + "method":"GET", + "path":"/admin", + "requestLength":300.0, + "requestTime":60.0, + "upstreamLatency":1.0, + "upstreamHeaderTime":5.0, + "upstreamName":"test-upstream", + "upstreamIP":"1.1.1.1:8080", + "upstreamResponseTime":200, + "upstreamStatus":"220", + "namespace":"test-app-production", + "ingress":"web-yml", + "service":"test-app", + "canary":"" + }]`}, + excludeMetrics: []string{"nginx_ingress_controller_connect_duration_seconds"}, + metrics: []string{"nginx_ingress_controller_connect_duration_seconds", "nginx_ingress_controller_response_duration_seconds"}, + useStatusClasses: true, + wantBefore: ` + # HELP nginx_ingress_controller_response_duration_seconds The time spent on receiving the response from the upstream server + # TYPE nginx_ingress_controller_response_duration_seconds histogram + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="0.005"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="0.01"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="0.025"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="0.05"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="0.1"} 0 + 
nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="0.25"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="0.5"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="1"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="2.5"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="5"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="10"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="+Inf"} 1 + nginx_ingress_controller_response_duration_seconds_sum{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx"} 200 + nginx_ingress_controller_response_duration_seconds_count{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx"} 1 + `, + }, + { + name: "remove metrics with the short metric name", + data: []string{`[{ + "host":"testshop.com", + "status":"200", + "bytesSent":150.0, + "method":"GET", + "path":"/admin", + "requestLength":300.0, + "requestTime":60.0, + "upstreamLatency":1.0, + "upstreamHeaderTime":5.0, + "upstreamName":"test-upstream", + "upstreamIP":"1.1.1.1:8080", + "upstreamResponseTime":200, + "upstreamStatus":"220", + "namespace":"test-app-production", + "ingress":"web-yml", + "service":"test-app", + "canary":"" + }]`}, + excludeMetrics: []string{"response_duration_seconds"}, + metrics: []string{"nginx_ingress_controller_response_duration_seconds"}, + useStatusClasses: true, + wantBefore: ` + `, + }, + { + name: "exclude metrics make sure to only remove exactly matched metrics", + data: []string{`[{ + "host":"testshop.com", + "status":"200", + "bytesSent":150.0, + "method":"GET", + "path":"/admin", + "requestLength":300.0, + "requestTime":60.0, + 
"upstreamLatency":1.0, + "upstreamHeaderTime":5.0, + "upstreamName":"test-upstream", + "upstreamIP":"1.1.1.1:8080", + "upstreamResponseTime":200, + "upstreamStatus":"220", + "namespace":"test-app-production", + "ingress":"web-yml", + "service":"test-app", + "canary":"" + }]`}, + excludeMetrics: []string{"response_duration_seconds2", "test.*", "nginx_ingress_.*", "response_duration_secon"}, + metrics: []string{"nginx_ingress_controller_response_duration_seconds"}, + useStatusClasses: true, + wantBefore: ` + # HELP nginx_ingress_controller_response_duration_seconds The time spent on receiving the response from the upstream server + # TYPE nginx_ingress_controller_response_duration_seconds histogram + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="0.005"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="0.01"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="0.025"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="0.05"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="0.1"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="0.25"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="0.5"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="1"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="2.5"} 0 + 
nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="5"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="10"} 0 + nginx_ingress_controller_response_duration_seconds_bucket{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx",le="+Inf"} 1 + nginx_ingress_controller_response_duration_seconds_sum{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx"} 200 + nginx_ingress_controller_response_duration_seconds_count{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx"} 1 + `, + }, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { registry := prometheus.NewPedanticRegistry() - sc, err := NewSocketCollector("pod", "default", "ingress", true, c.useStatusClasses, buckets) + sc, err := NewSocketCollector("pod", "default", "ingress", true, c.useStatusClasses, buckets, c.excludeMetrics) if err != nil { t.Errorf("%v: unexpected error creating new SocketCollector: %v", c.name, err) } diff --git a/internal/ingress/metric/main.go b/internal/ingress/metric/main.go index cac86e889..b2f721f62 100644 --- a/internal/ingress/metric/main.go +++ b/internal/ingress/metric/main.go @@ -71,7 +71,7 @@ type collector struct { } // NewCollector creates a new metric collector the for ingress controller -func NewCollector(metricsPerHost, reportStatusClasses bool, registry *prometheus.Registry, ingressclass string, buckets collectors.HistogramBuckets) (Collector, error) { +func NewCollector(metricsPerHost, reportStatusClasses bool, registry *prometheus.Registry, ingressclass string, buckets collectors.HistogramBuckets, excludedSocketMetrics []string) (Collector, error) { podNamespace := os.Getenv("POD_NAMESPACE") if podNamespace == "" { podNamespace = "default" @@ -89,7 +89,7 @@ func NewCollector(metricsPerHost, reportStatusClasses bool, registry *prometheus return nil, err } - s, err := collectors.NewSocketCollector(podName, podNamespace, ingressclass, metricsPerHost, reportStatusClasses, buckets) + s, err := collectors.NewSocketCollector(podName, podNamespace, ingressclass, metricsPerHost, reportStatusClasses, buckets, excludedSocketMetrics) if err != nil { return nil, err } diff --git a/pkg/flags/flags.go b/pkg/flags/flags.go index 911ab775c..370510380 100644 --- a/pkg/flags/flags.go +++ b/pkg/flags/flags.go @@ -171,10 +171,11 @@ Requires the update-status parameter.`) reportStatusClasses = flags.Bool("report-status-classes", false, `Use status classes (2xx, 3xx, 4xx and 5xx) instead of status codes in metrics.`) - timeBuckets = flags.Float64Slice("time-buckets", prometheus.DefBuckets, "Set of buckets which will be used for prometheus histogram metrics such as 
RequestTime, ResponseTime.") - lengthBuckets = flags.Float64Slice("length-buckets", prometheus.LinearBuckets(10, 10, 10), "Set of buckets which will be used for prometheus histogram metrics such as RequestLength, ResponseLength.") - sizeBuckets = flags.Float64Slice("size-buckets", prometheus.ExponentialBuckets(10, 10, 7), "Set of buckets which will be used for prometheus histogram metrics such as BytesSent.") - monitorMaxBatchSize = flags.Int("monitor-max-batch-size", 10000, "Max batch size of NGINX metrics.") + timeBuckets = flags.Float64Slice("time-buckets", prometheus.DefBuckets, "Set of buckets which will be used for prometheus histogram metrics such as RequestTime, ResponseTime.") + lengthBuckets = flags.Float64Slice("length-buckets", prometheus.LinearBuckets(10, 10, 10), "Set of buckets which will be used for prometheus histogram metrics such as RequestLength, ResponseLength.") + sizeBuckets = flags.Float64Slice("size-buckets", prometheus.ExponentialBuckets(10, 10, 7), "Set of buckets which will be used for prometheus histogram metrics such as BytesSent.") + excludeSocketMetrics = flags.StringSlice("exclude-socket-metrics", []string{}, "et of socket request metrics to exclude which won't be exported nor being calculated. E.g. 'nginx_ingress_controller_success,nginx_ingress_controller_header_duration_seconds'.") + monitorMaxBatchSize = flags.Int("monitor-max-batch-size", 10000, "Max batch size of NGINX metrics.") httpPort = flags.Int("http-port", 80, `Port to use for servicing HTTP traffic.`) httpsPort = flags.Int("https-port", 443, `Port to use for servicing HTTPS traffic.`) @@ -328,6 +329,7 @@ https://blog.maxmind.com/2019/12/18/significant-changes-to-accessing-and-using-g MetricsPerHost: *metricsPerHost, MetricsBuckets: histogramBuckets, ReportStatusClasses: *reportStatusClasses, + ExcludeSocketMetrics: *excludeSocketMetrics, MonitorMaxBatchSize: *monitorMaxBatchSize, DisableServiceExternalName: *disableServiceExternalName, EnableSSLPassthrough: *enableSSLPassthrough, diff --git a/test/e2e/e2e.go b/test/e2e/e2e.go index 57b047230..614dd166a 100644 --- a/test/e2e/e2e.go +++ b/test/e2e/e2e.go @@ -40,6 +40,7 @@ import ( _ "k8s.io/ingress-nginx/test/e2e/leaks" _ "k8s.io/ingress-nginx/test/e2e/loadbalance" _ "k8s.io/ingress-nginx/test/e2e/lua" + _ "k8s.io/ingress-nginx/test/e2e/metrics" _ "k8s.io/ingress-nginx/test/e2e/nginx" _ "k8s.io/ingress-nginx/test/e2e/security" _ "k8s.io/ingress-nginx/test/e2e/servicebackend" diff --git a/test/e2e/metrics/metrics.go b/test/e2e/metrics/metrics.go new file mode 100644 index 000000000..907b53732 --- /dev/null +++ b/test/e2e/metrics/metrics.go @@ -0,0 +1,94 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package defaultbackend + +import ( + "context" + "fmt" + "net/http" + "strings" + "time" + + "github.com/onsi/ginkgo/v2" + "github.com/stretchr/testify/assert" + appsv1 "k8s.io/api/apps/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "k8s.io/ingress-nginx/test/e2e/framework" +) + +const waitForMetrics = 2 * time.Second + +var _ = framework.IngressNginxDescribe("[metrics] exported prometheus metrics", func() { + f := framework.NewDefaultFramework("metrics") + host := "foo.com" + + ginkgo.BeforeEach(func() { + f.NewEchoDeployment() + f.EnsureIngress(framework.NewSingleIngress(host, "/", host, f.Namespace, framework.EchoService, 80, nil)) + f.WaitForNginxServer(host, + func(server string) bool { + return strings.Contains(server, fmt.Sprintf("server_name %s ;", host)) && + strings.Contains(server, "proxy_pass http://upstream_balancer;") + }) + }) + + ginkgo.It("exclude socket request metrics are absent", func() { + err := f.UpdateIngressControllerDeployment(func(deployment *appsv1.Deployment) error { + args := deployment.Spec.Template.Spec.Containers[0].Args + args = append(args, "--exclude-socket-metrics=nginx_ingress_controller_request_size,nginx_ingress_controller_header_duration_seconds") + deployment.Spec.Template.Spec.Containers[0].Args = args + _, err := f.KubeClientSet.AppsV1().Deployments(f.Namespace).Update(context.TODO(), deployment, metav1.UpdateOptions{}) + return err + }) + assert.Nil(ginkgo.GinkgoT(), err, "updating deployment") + + f.HTTPTestClient(). + GET("/"). + WithHeader("Host", host). + Expect(). + Status(http.StatusOK) + time.Sleep(waitForMetrics) + + ip := f.GetNginxPodIP() + mf, err := f.GetMetric("nginx_ingress_controller_request_size", ip) + assert.ErrorContains(ginkgo.GinkgoT(), err, "nginx_ingress_controller_request_size") + assert.Nil(ginkgo.GinkgoT(), mf) + }) + ginkgo.It("exclude socket request metrics are present", func() { + err := f.UpdateIngressControllerDeployment(func(deployment *appsv1.Deployment) error { + args := deployment.Spec.Template.Spec.Containers[0].Args + args = append(args, "--exclude-socket-metrics=non_existing_metric_does_not_affect_existing_metrics") + deployment.Spec.Template.Spec.Containers[0].Args = args + _, err := f.KubeClientSet.AppsV1().Deployments(f.Namespace).Update(context.TODO(), deployment, metav1.UpdateOptions{}) + return err + }) + assert.Nil(ginkgo.GinkgoT(), err, "updating deployment") + + f.HTTPTestClient(). + GET("/"). + WithHeader("Host", host). + Expect(). + Status(http.StatusOK) + time.Sleep(waitForMetrics) + + ip := f.GetNginxPodIP() + mf, err := f.GetMetric("nginx_ingress_controller_request_size", ip) + assert.Nil(ginkgo.GinkgoT(), err) + assert.NotNil(ginkgo.GinkgoT(), mf) + }) +})
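
For reference, the exclusion semantics introduced in NewSocketCollector above amount to exact name matching: a name passed via --exclude-socket-metrics may be either fully qualified (e.g. nginx_ingress_controller_request_size) or the short form (request_size), because the nginx_ingress_controller_ prefix is stripped before the lookup, and patterns such as nginx_ingress_.* are not interpreted as regular expressions, as the last socket_test.go case verifies. The following minimal, standalone Go sketch illustrates that behaviour; the helper names buildExclusionSet and isExcluded are illustrative only and do not appear in the patch.

package main

import (
	"fmt"
	"strings"
)

// buildExclusionSet mirrors the set construction in NewSocketCollector:
// an optional "nginx_ingress_controller_" prefix is stripped and the short
// metric name is stored for exact lookups; no pattern matching is done.
func buildExclusionSet(excludeMetrics []string) map[string]struct{} {
	em := make(map[string]struct{}, len(excludeMetrics))
	for _, m := range excludeMetrics {
		em[strings.TrimPrefix(m, "nginx_ingress_controller_")] = struct{}{}
	}
	return em
}

// isExcluded reports whether a collector with the given short name would be
// skipped, i.e. neither created nor added to the collector's metric mapping.
func isExcluded(em map[string]struct{}, shortName string) bool {
	_, ok := em[shortName]
	return ok
}

func main() {
	// Example values as they might be passed via --exclude-socket-metrics.
	em := buildExclusionSet([]string{
		"nginx_ingress_controller_connect_duration_seconds", // fully qualified form
		"response_size",    // short form works as well
		"nginx_ingress_.*", // not a regex: this entry excludes nothing
	})

	for _, name := range []string{"connect_duration_seconds", "response_size", "request_size"} {
		fmt.Printf("%-26s excluded=%v\n", name, isExcluded(em, name))
	}
	// connect_duration_seconds and response_size report excluded=true,
	// request_size reports excluded=false.
}

On a running controller this corresponds to passing, for example, --exclude-socket-metrics=nginx_ingress_controller_request_size,nginx_ingress_controller_header_duration_seconds, as the e2e test above does, after which those series are neither calculated nor returned when the metrics endpoint is scraped.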