Metrics: Add --metrics-per-undefined-host argument. (#11818)

Signed-off-by: Jon Carl <grounded042@joncarl.com>
This commit is contained in:
Jon Carl 2024-08-26 13:09:11 -06:00 committed by GitHub
parent 93f9f9fbb3
commit 034c3ccad4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 172 additions and 28 deletions

View file

@ -66,7 +66,7 @@ func main() {
mc := metric.NewDummyCollector()
if conf.EnableMetrics {
// TODO: Ingress class is not a part of dataplane anymore
mc, err = metric.NewCollector(conf.MetricsPerHost, conf.ReportStatusClasses, reg, conf.IngressClassConfiguration.Controller, *conf.MetricsBuckets, conf.MetricsBucketFactor, conf.MetricsMaxBuckets, conf.ExcludeSocketMetrics)
mc, err = metric.NewCollector(conf.MetricsPerHost, conf.MetricsPerUndefinedHost, conf.ReportStatusClasses, reg, conf.IngressClassConfiguration.Controller, *conf.MetricsBuckets, conf.MetricsBucketFactor, conf.MetricsMaxBuckets, conf.ExcludeSocketMetrics)
if err != nil {
klog.Fatalf("Error creating prometheus collector: %v", err)
}

View file

@ -130,7 +130,7 @@ func main() {
mc := metric.NewDummyCollector()
if conf.EnableMetrics {
mc, err = metric.NewCollector(conf.MetricsPerHost, conf.ReportStatusClasses, reg, conf.IngressClassConfiguration.Controller, *conf.MetricsBuckets, conf.MetricsBucketFactor, conf.MetricsMaxBuckets, conf.ExcludeSocketMetrics)
mc, err = metric.NewCollector(conf.MetricsPerHost, conf.MetricsPerUndefinedHost, conf.ReportStatusClasses, reg, conf.IngressClassConfiguration.Controller, *conf.MetricsBuckets, conf.MetricsBucketFactor, conf.MetricsMaxBuckets, conf.ExcludeSocketMetrics)
if err != nil {
klog.Fatalf("Error creating prometheus collector: %v", err)
}

View file

@ -48,6 +48,7 @@ They are set in the container spec of the `ingress-nginx-controller` Deployment
| `--maxmind-license-key` | Maxmind license key to download GeoLite2 Databases. https://blog.maxmind.com/2019/12/significant-changes-to-accessing-and-using-geolite2-databases/ . |
| `--maxmind-mirror` | Maxmind mirror url (example: http://geoip.local/databases). |
| `--metrics-per-host` | Export metrics per-host. (default true) |
| `--metrics-per-undefined-host` | Export metrics per-host even if the host is not defined in an ingress. Requires --metrics-per-host to be set to true. (default false) |
| `--monitor-max-batch-size` | Max batch size of NGINX metrics. (default 10000)|
| `--post-shutdown-grace-period` | Additional delay in seconds before controller container exits. (default 10) |
| `--profiler-port` | Port used to expose the ingress controller Go profiler when it is enabled. (default 10245) |

View file

@ -166,7 +166,9 @@ According to the above example, this URL will be http://10.192.0.3:31086
#### Wildcard ingresses
- By default request metrics are labeled with the hostname. When you have a wildcard domain ingress, then there will be no metrics for that ingress (to prevent the metrics from exploding in cardinality). To get metrics in this case you need to run the ingress controller with `--metrics-per-host=false` (you will lose labeling by hostname, but still have labeling by ingress).
- By default request metrics are labeled with the hostname. When you have a wildcard domain ingress, then there will be no metrics for that ingress (to prevent the metrics from exploding in cardinality). To get metrics in this case you have two options:
- Run the ingress controller with `--metrics-per-host=false`. You will lose labeling by hostname, but still have labeling by ingress.
- Run the ingress controller with `--metrics-per-undefined-host=true --metrics-per-host=true`. You will get labeling by hostname even if the hostname is not explicitly defined on an ingress. Be warned that cardinality could explode due to many hostnames.
### Grafana dashboard using ingress resource
- If you want to expose the dashboard for grafana using an ingress resource, then you can :

View file

@ -105,13 +105,14 @@ type Configuration struct {
EnableProfiling bool
EnableMetrics bool
MetricsPerHost bool
MetricsBuckets *collectors.HistogramBuckets
MetricsBucketFactor float64
MetricsMaxBuckets uint32
ReportStatusClasses bool
ExcludeSocketMetrics []string
EnableMetrics bool
MetricsPerHost bool
MetricsPerUndefinedHost bool
MetricsBuckets *collectors.HistogramBuckets
MetricsBucketFactor float64
MetricsMaxBuckets uint32
ReportStatusClasses bool
ExcludeSocketMetrics []string
FakeCertificate *ingress.SSLCert

View file

@ -81,8 +81,9 @@ type SocketCollector struct {
hosts sets.Set[string]
metricsPerHost bool
reportStatusClasses bool
metricsPerHost bool
metricsPerUndefinedHost bool
reportStatusClasses bool
}
var requestTags = []string{
@ -99,7 +100,7 @@ var requestTags = []string{
// NewSocketCollector creates a new SocketCollector instance using
// the ingress watch namespace and class used by the controller
func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStatusClasses bool, buckets HistogramBuckets, bucketFactor float64, maxBuckets uint32, excludeMetrics []string) (*SocketCollector, error) {
func NewSocketCollector(pod, namespace, class string, metricsPerHost, metricsPerUndefinedHost, reportStatusClasses bool, buckets HistogramBuckets, bucketFactor float64, maxBuckets uint32, excludeMetrics []string) (*SocketCollector, error) {
socket := "/tmp/nginx/prometheus-nginx.socket"
// unix sockets must be unlink()ed before being used
//nolint:errcheck // Ignore unlink error
@ -139,8 +140,9 @@ func NewSocketCollector(pod, namespace, class string, metricsPerHost, reportStat
sc := &SocketCollector{
listener: listener,
metricsPerHost: metricsPerHost,
reportStatusClasses: reportStatusClasses,
metricsPerHost: metricsPerHost,
metricsPerUndefinedHost: metricsPerUndefinedHost,
reportStatusClasses: reportStatusClasses,
connectTime: histogramMetric(
&prometheus.HistogramOpts{
@ -306,8 +308,8 @@ func (sc *SocketCollector) handleMessage(msg []byte) {
for i := range statsBatch {
stats := &statsBatch[i]
if sc.metricsPerHost && !sc.hosts.Has(stats.Host) {
klog.V(3).InfoS("Skipping metric for host not being served", "host", stats.Host)
if sc.metricsPerHost && !sc.hosts.Has(stats.Host) && !sc.metricsPerUndefinedHost {
klog.V(3).InfoS("Skipping metric for host not explicitly defined in an ingress", "host", stats.Host)
continue
}

View file

@ -87,14 +87,15 @@ func TestCollector(t *testing.T) {
maxBuckets := uint32(100)
cases := []struct {
name string
data []string
metrics []string
useStatusClasses bool
excludeMetrics []string
wantBefore string
removeIngresses []string
wantAfter string
name string
data []string
metrics []string
metricsPerUndefinedHost bool
useStatusClasses bool
excludeMetrics []string
wantBefore string
removeIngresses []string
wantAfter string
}{
{
name: "invalid metric object should not increase prometheus metrics",
@ -591,13 +592,69 @@ func TestCollector(t *testing.T) {
nginx_ingress_controller_response_duration_seconds_count{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx"} 1
`,
},
{
name: "metrics with a host should not be dropped when the host is not in the hosts slice but metricsPerUndefinedHost is true",
data: []string{`[{
"host":"wildcard.testshop.com",
"status":"200",
"bytesSent":150.0,
"method":"GET",
"path":"/admin",
"requestLength":300.0,
"requestTime":60.0,
"upstreamLatency":1.0,
"upstreamHeaderTime":5.0,
"upstreamName":"test-upstream",
"upstreamIP":"1.1.1.1:8080",
"upstreamResponseTime":200,
"upstreamStatus":"220",
"namespace":"test-app-production",
"ingress":"web-yml",
"service":"test-app",
"canary":""
}]`},
excludeMetrics: []string{"response_duration_seconds2", "test.*", "nginx_ingress_.*", "response_duration_secon"},
metrics: []string{"nginx_ingress_controller_requests"},
metricsPerUndefinedHost: true,
useStatusClasses: true,
wantBefore: `
# HELP nginx_ingress_controller_requests The total number of client requests
# TYPE nginx_ingress_controller_requests counter
nginx_ingress_controller_requests{canary="",controller_class="ingress",controller_namespace="default",controller_pod="pod",host="wildcard.testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="2xx"} 1
`,
},
{
name: "metrics with a host should be dropped when the host is not in the hosts slice",
data: []string{`[{
"host":"wildcard.testshop.com",
"status":"200",
"bytesSent":150.0,
"method":"GET",
"path":"/admin",
"requestLength":300.0,
"requestTime":60.0,
"upstreamLatency":1.0,
"upstreamHeaderTime":5.0,
"upstreamName":"test-upstream",
"upstreamIP":"1.1.1.1:8080",
"upstreamResponseTime":200,
"upstreamStatus":"220",
"namespace":"test-app-production",
"ingress":"web-yml",
"service":"test-app",
"canary":""
}]`},
excludeMetrics: []string{"response_duration_seconds2", "test.*", "nginx_ingress_.*", "response_duration_secon"},
metrics: []string{"nginx_ingress_controller_requests"},
useStatusClasses: true,
},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
registry := prometheus.NewPedanticRegistry()
sc, err := NewSocketCollector("pod", "default", "ingress", true, c.useStatusClasses, buckets, bucketFactor, maxBuckets, c.excludeMetrics)
sc, err := NewSocketCollector("pod", "default", "ingress", true, c.metricsPerUndefinedHost, c.useStatusClasses, buckets, bucketFactor, maxBuckets, c.excludeMetrics)
if err != nil {
t.Errorf("%v: unexpected error creating new SocketCollector: %v", c.name, err)
}

View file

@ -71,7 +71,7 @@ type collector struct {
}
// NewCollector creates a new metric collector for the ingress controller
func NewCollector(metricsPerHost, reportStatusClasses bool, registry *prometheus.Registry, ingressclass string, buckets collectors.HistogramBuckets, bucketFactor float64, maxBuckets uint32, excludedSocketMetrics []string) (Collector, error) {
func NewCollector(metricsPerHost, metricsPerUndefinedHost, reportStatusClasses bool, registry *prometheus.Registry, ingressclass string, buckets collectors.HistogramBuckets, bucketFactor float64, maxBuckets uint32, excludedSocketMetrics []string) (Collector, error) {
podNamespace := os.Getenv("POD_NAMESPACE")
if podNamespace == "" {
podNamespace = "default"
@ -89,7 +89,7 @@ func NewCollector(metricsPerHost, reportStatusClasses bool, registry *prometheus
return nil, err
}
s, err := collectors.NewSocketCollector(podName, podNamespace, ingressclass, metricsPerHost, reportStatusClasses, buckets, bucketFactor, maxBuckets, excludedSocketMetrics)
s, err := collectors.NewSocketCollector(podName, podNamespace, ingressclass, metricsPerHost, metricsPerUndefinedHost, reportStatusClasses, buckets, bucketFactor, maxBuckets, excludedSocketMetrics)
if err != nil {
return nil, err
}

View file

@ -17,6 +17,7 @@ limitations under the License.
package flags
import (
"errors"
"flag"
"fmt"
"net"
@ -177,6 +178,8 @@ Requires the update-status parameter.`)
`Enables the collection of NGINX metrics.`)
metricsPerHost = flags.Bool("metrics-per-host", true,
`Export metrics per-host.`)
metricsPerUndefinedHost = flags.Bool("metrics-per-undefined-host", false,
`Export metrics per-host even if the host is not defined in an ingress. Requires --metrics-per-host to be set to true.`)
reportStatusClasses = flags.Bool("report-status-classes", false,
`Use status classes (2xx, 3xx, 4xx and 5xx) instead of status codes in metrics.`)
@ -319,6 +322,10 @@ https://blog.maxmind.com/2019/12/significant-changes-to-accessing-and-using-geol
}
}
if *metricsPerUndefinedHost && !*metricsPerHost {
return false, nil, errors.New("--metrics-per-undefined-host=true must be passed with --metrics-per-host=true")
}
if *electionTTL <= 0 {
*electionTTL = 30 * time.Second
}
@ -340,6 +347,7 @@ https://blog.maxmind.com/2019/12/significant-changes-to-accessing-and-using-geol
EnableProfiling: *profiling,
EnableMetrics: *enableMetrics,
MetricsPerHost: *metricsPerHost,
MetricsPerUndefinedHost: *metricsPerUndefinedHost,
MetricsBuckets: histogramBuckets,
MetricsBucketFactor: *bucketFactor,
MetricsMaxBuckets: *maxBuckets,

View file

@ -212,3 +212,29 @@ func TestLeaderElectionTTLParseValueInHours(t *testing.T) {
t.Fatalf("Expected --election-ttl and conf.ElectionTTL as 1h, but found: %v", conf.ElectionTTL)
}
}
// TestMetricsPerUndefinedHost checks that ParseFlags returns no error when
// --metrics-per-undefined-host=true is the only flag on the command line.
func TestMetricsPerUndefinedHost(t *testing.T) {
	ResetForTesting(func() { t.Fatal("Parsing failed") })

	// Put the process arguments back once the test is done so later tests
	// see the original command line.
	defer func(saved []string) { os.Args = saved }(os.Args)
	os.Args = []string{"cmd", "--metrics-per-undefined-host=true"}

	if _, _, parseErr := ParseFlags(); parseErr != nil {
		t.Fatalf("Expected no error but got: %s", parseErr)
	}
}
// TestMetricsPerUndefinedHostWithMetricsPerHostFalse checks that ParseFlags
// returns an error when --metrics-per-undefined-host=true is combined with
// --metrics-per-host=false.
func TestMetricsPerUndefinedHostWithMetricsPerHostFalse(t *testing.T) {
	ResetForTesting(func() { t.Fatal("Parsing failed") })

	// Swap in the command line under test and restore the real one on exit.
	savedArgs := os.Args
	defer func() { os.Args = savedArgs }()
	os.Args = []string{"cmd", "--metrics-per-host=false", "--metrics-per-undefined-host=true"}

	if _, _, parseErr := ParseFlags(); parseErr == nil {
		t.Fatalf("Expected an error parsing flags but none returned")
	}
}

View file

@ -36,6 +36,7 @@ const waitForMetrics = 2 * time.Second
var _ = framework.IngressNginxDescribe("[metrics] exported prometheus metrics", func() {
f := framework.NewDefaultFramework("metrics")
host := "foo.com"
wildcardHost := "wildcard." + host
ginkgo.BeforeEach(func() {
f.NewEchoDeployment()
@ -91,4 +92,50 @@ var _ = framework.IngressNginxDescribe("[metrics] exported prometheus metrics",
assert.Nil(ginkgo.GinkgoT(), err)
assert.NotNil(ginkgo.GinkgoT(), mf)
})
// Adds --metrics-per-undefined-host=true to the controller deployment, sends a
// request for wildcardHost (expected to be answered with 404) and asserts that
// the single reported request metric carries a "host" label equal to
// wildcardHost.
ginkgo.It("request metrics per undefined host are present when flag is set", func() {
	updateErr := f.UpdateIngressControllerDeployment(func(deployment *appsv1.Deployment) error {
		// Append the flag to the first container's arguments.
		controller := &deployment.Spec.Template.Spec.Containers[0]
		controller.Args = append(controller.Args, "--metrics-per-undefined-host=true")

		_, err := f.KubeClientSet.AppsV1().Deployments(f.Namespace).Update(context.TODO(), deployment, metav1.UpdateOptions{})
		return err
	})
	assert.Nil(ginkgo.GinkgoT(), updateErr, "updating deployment")

	request := f.HTTPTestClient().GET("/").WithHeader("Host", wildcardHost)
	request.Expect().Status(http.StatusNotFound)

	// Pause for waitForMetrics before reading the metrics back.
	time.Sleep(waitForMetrics)

	reqMetrics, err := f.GetMetric("nginx_ingress_controller_requests", f.GetNginxPodIP())
	assert.Nil(ginkgo.GinkgoT(), err)
	assert.NotNil(ginkgo.GinkgoT(), reqMetrics.Metric)
	assert.Len(ginkgo.GinkgoT(), reqMetrics.Metric, 1)

	// Look for host=<wildcardHost> among the labels of the only metric.
	hostLabelFound := false
	labels := reqMetrics.Metric[0].Label
	for i := range labels {
		if *labels[i].Name == "host" && *labels[i].Value == wildcardHost {
			hostLabelFound = true
			break
		}
	}
	assert.Truef(ginkgo.GinkgoT(), hostLabelFound, "expected reqMetrics to contain label with \"name\"=\"host\" \"value\"=%q, but it did not: %s", wildcardHost, reqMetrics.String())
})
// Without changing the controller flags, sends a request for wildcardHost
// (expected to be answered with 404) and asserts that looking up
// nginx_ingress_controller_requests fails with the "no metric" error.
ginkgo.It("request metrics per undefined host are not present when flag is not set", func() {
	request := f.HTTPTestClient().GET("/").WithHeader("Host", wildcardHost)
	request.Expect().Status(http.StatusNotFound)

	// Pause for waitForMetrics before reading the metrics back.
	time.Sleep(waitForMetrics)

	reqMetrics, err := f.GetMetric("nginx_ingress_controller_requests", f.GetNginxPodIP())
	assert.EqualError(ginkgo.GinkgoT(), err, "there is no metric with name nginx_ingress_controller_requests")
	assert.Nil(ginkgo.GinkgoT(), reqMetrics)
})
})