Add support for the Prometheus Operator (#772)

Support collecting Vault server metrics by deploying Prometheus Operator
custom resources (ServiceMonitor and PrometheusRule).

Co-authored-by: Sam Weston <weston.sam@gmail.com>
Co-authored-by: Theron Voran <tvoran@users.noreply.github.com>
This commit is contained in:
Ben Ash 2022-09-01 13:07:49 -06:00 committed by GitHub
parent 8a6872e36d
commit 04074311f7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 498 additions and 4 deletions

View file

@ -21,6 +21,6 @@ jobs:
node_image: kindest/node:v${{ matrix.kind-k8s-version }}
version: v0.14.0
- run: bats ./test/acceptance -t
- run: bats --tap --timing ./test/acceptance
env:
VAULT_LICENSE_CI: ${{ secrets.VAULT_LICENSE_CI }}

View file

@ -8,7 +8,7 @@ jobs:
steps:
- uses: actions/checkout@v2
- uses: ./.github/workflows/setup-test-tools
- run: bats ./test/unit -t
- run: bats --tap --timing ./test/unit
chart-verifier:
runs-on: ubuntu-latest
@ -22,4 +22,4 @@ jobs:
with:
go-version: '1.17.4'
- run: go install github.com/redhat-certification/chart-verifier@${CHART_VERIFIER_VERSION}
- run: bats ./test/chart -t
- run: bats --tap --timing ./test/chart

View file

@ -1,4 +1,6 @@
## Unreleased
Features:
* Add PrometheusOperator support for collecting Vault server metrics. [GH-772](https://github.com/hashicorp/vault-helm/pull/772)
## 0.21.0 (August 10th, 2022)

View file

@ -71,7 +71,7 @@ acceptance:
ifneq ($(LOCAL_ACCEPTANCE_TESTS),true)
gcloud auth activate-service-account --key-file=${GOOGLE_CREDENTIALS}
endif
bats test/${ACCEPTANCE_TESTS}
bats --tap --timing test/${ACCEPTANCE_TESTS}
# this target is for provisioning the GKE cluster
# it is run in the docker container above when the test-provision target is invoked

View file

@ -0,0 +1,26 @@
{{- /*
PrometheusRule custom resource (Prometheus Operator) carrying the
alerting/recording rules configured for the Vault server.
Rendered only when rules are actually defined AND the feature is enabled,
either via the global prometheusOperator flag or the chart-level
serverTelemetry.prometheusRules.enabled toggle.
*/ -}}
{{ if and (.Values.serverTelemetry.prometheusRules.rules)
  (or (.Values.global.serverTelemetry.prometheusOperator) (.Values.serverTelemetry.prometheusRules.enabled) )
}}
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ template "vault.fullname" . }}
  labels:
    helm.sh/chart: {{ include "vault.chart" . }}
    app.kubernetes.io/name: {{ include "vault.name" . }}
    app.kubernetes.io/instance: {{ .Release.Name }}
    app.kubernetes.io/managed-by: {{ .Release.Service }}
    {{- /* update the selectors docs in values.yaml whenever the defaults below change. */ -}}
    {{- $selectors := .Values.serverTelemetry.prometheusRules.selectors }}
    {{- if $selectors }}
    {{- toYaml $selectors | nindent 4 }}
    {{- else }}
    release: prometheus
    {{- end }}
spec:
  groups:
  - name: {{ include "vault.fullname" . }}
    rules:
    {{- toYaml .Values.serverTelemetry.prometheusRules.rules | nindent 6 }}
{{- end }}

View file

@ -0,0 +1,44 @@
{{- /*
ServiceMonitor custom resource (Prometheus Operator) that scrapes Vault's
/v1/sys/metrics endpoint in Prometheus format.
"vault.mode" is expected to populate .mode (ha vs standalone) — used below
to select the matching service label. TODO(review): confirm against
_helpers.tpl, which is not visible in this diff.
Rendered when either the global prometheusOperator flag or the chart-level
serverTelemetry.serviceMonitor.enabled toggle is set.
*/ -}}
{{ template "vault.mode" . }}
{{ if or (.Values.global.serverTelemetry.prometheusOperator) (.Values.serverTelemetry.serviceMonitor.enabled) }}
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ template "vault.fullname" . }}
  labels:
    helm.sh/chart: {{ include "vault.chart" . }}
    app.kubernetes.io/name: {{ include "vault.name" . }}
    app.kubernetes.io/instance: {{ .Release.Name }}
    app.kubernetes.io/managed-by: {{ .Release.Service }}
    {{- /* update the selectors docs in values.yaml whenever the defaults below change. */ -}}
    {{- $selectors := .Values.serverTelemetry.serviceMonitor.selectors }}
    {{- if $selectors }}
    {{- toYaml $selectors | nindent 4 }}
    {{- else }}
    release: prometheus
    {{- end }}
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ template "vault.name" . }}
      app.kubernetes.io/instance: {{ .Release.Name }}
      {{- if eq .mode "ha" }}
      vault-active: "true"
      {{- else }}
      vault-internal: "true"
      {{- end }}
  endpoints:
  - port: {{ include "vault.scheme" . }}
    interval: {{ .Values.serverTelemetry.serviceMonitor.interval }}
    scrapeTimeout: {{ .Values.serverTelemetry.serviceMonitor.scrapeTimeout }}
    scheme: {{ include "vault.scheme" . | lower }}
    path: /v1/sys/metrics
    params:
      format:
      - prometheus
    tlsConfig:
      # Vault's listener cert is typically internal/self-signed, so the
      # scrape skips TLS verification.
      insecureSkipVerify: true
  namespaceSelector:
    matchNames:
    - {{ .Release.Namespace }}
{{ end }}

View file

@ -14,6 +14,7 @@ metadata:
app.kubernetes.io/name: {{ include "vault.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
vault-active: "true"
annotations:
{{ template "vault.service.annotations" .}}
spec:

View file

@ -13,6 +13,7 @@ metadata:
app.kubernetes.io/name: {{ include "vault.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
vault-internal: "true"
annotations:
{{ template "vault.service.annotations" .}}
spec:

View file

@ -0,0 +1,90 @@
#!/usr/bin/env bats
load _helpers
# End-to-end check that a Vault server installed with the telemetry test
# values is discovered and scraped by a kube-prometheus-stack Prometheus
# via the chart's ServiceMonitor: the "vault-internal" job must appear in
# Prometheus' job labels and report as "up".
@test "server/telemetry: prometheusOperator" {
  cd `chart_dir`

  # Start from a clean "acceptance" namespace; ignore errors if absent.
  helm --namespace acceptance uninstall $(name_prefix) || :
  helm --namespace acceptance uninstall prometheus || :
  kubectl delete namespace acceptance --ignore-not-found=true
  kubectl create namespace acceptance
  kubectl config set-context --current --namespace=acceptance

  # Install the Prometheus Operator stack (version pinned for stability).
  helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  helm repo update

  helm install \
    --wait \
    --version 39.6.0 \
    prometheus prometheus-community/kube-prometheus-stack

  # Install this chart with the telemetry values (ServiceMonitor enabled).
  helm install \
    --wait \
    --values ./test/acceptance/server-test/telemetry.yaml \
    "$(name_prefix)" .

  wait_for_running $(name_prefix)-0

  # Sealed, not initialized
  wait_for_sealed_vault $(name_prefix)-0

  # Vault Init — single unseal key, threshold 1.
  local token=$(kubectl exec -ti "$(name_prefix)-0" -- \
    vault operator init -format=json -n 1 -t 1 | \
    jq -r '.unseal_keys_b64[0]')
  [ "${token}" != "" ]

  # Vault Unseal — every vault pod, with the key captured above.
  local pods=($(kubectl get pods --selector='app.kubernetes.io/name=vault' -o json | jq -r '.items[].metadata.name'))
  for pod in "${pods[@]}"
  do
      kubectl exec -ti ${pod} -- vault operator unseal ${token}
  done

  wait_for_ready "$(name_prefix)-0"

  # Unsealed, initialized
  local sealed_status=$(kubectl exec "$(name_prefix)-0" -- vault status -format=json |
    jq -r '.sealed' )
  [ "${sealed_status}" == "false" ]

  local init_status=$(kubectl exec "$(name_prefix)-0" -- vault status -format=json |
    jq -r '.initialized')
  [ "${init_status}" == "true" ]

  # unfortunately it can take up to 2 minutes for the vault prometheus job to appear
  # TODO: investigate how to reduce this.
  # Poll (240 tries x 0.5s = up to ~2 minutes) until the job label shows up.
  local job_labels
  local tries=0
  until [ $tries -ge 240 ]
  do
      job_labels=$( (kubectl exec -n acceptance svc/prometheus-kube-prometheus-prometheus \
        -c prometheus \
        -- wget -q -O - http://127.0.0.1:9090/api/v1/label/job/values) | tee /dev/stderr )

      # Ensure the expected job label was picked up by Prometheus
      [ "$(echo "${job_labels}" | jq 'any(.data[]; . == "vault-internal")')" = "true" ] && break

      ((++tries))
      sleep .5
  done

  # Ensure the expected job is "up"
  local job_up=$( ( kubectl exec -n acceptance svc/prometheus-kube-prometheus-prometheus \
    -c prometheus \
    -- wget -q -O - 'http://127.0.0.1:9090/api/v1/query?query=up{job="vault-internal"}' ) | \
      tee /dev/stderr )

  # Instant query returns [timestamp, value]; value "1" means the target is up.
  [ "$(echo "${job_up}" | jq '.data.result[0].value[1]')" = \"1\" ]
}
# Tear down cluster state created by the test above.
# Skipped entirely when CLEANUP=false, so a failed run can be inspected.
teardown() {
  # Early exit when cleanup is explicitly disabled (defaults to enabled).
  if [[ ${CLEANUP:-true} != "true" ]]; then
    return
  fi
  echo "helm/pvc teardown"
  helm uninstall $(name_prefix)
  helm uninstall prometheus
  kubectl delete --all pvc
  kubectl delete namespace acceptance --ignore-not-found=true
}

View file

@ -0,0 +1,28 @@
# Helm values used by the telemetry acceptance test: a standalone server
# with unauthenticated Prometheus metrics enabled on the TCP listener and
# the chart's ServiceMonitor enabled with a 15s scrape interval.
server:
  standalone:
    config: |
      ui = true

      listener "tcp" {
        tls_disable = 1
        address = "[::]:8200"
        cluster_address = "[::]:8201"
        # Enable unauthenticated metrics access (necessary for Prometheus Operator)
        telemetry {
          unauthenticated_metrics_access = "true"
        }
      }
      storage "file" {
        path = "/vault/data"
      }

      telemetry {
        prometheus_retention_time = "30s",
        disable_hostname = true
      }

serverTelemetry:
  serviceMonitor:
    enabled: true
    interval: 15s

View file

@ -0,0 +1,68 @@
#!/usr/bin/env bats

# Unit tests for templates/prometheus-prometheusrules.yaml.
# The PrometheusRule resource must render only when rules are defined AND
# the feature is enabled, carrying default or overridden selector labels.
# "helm template ... || echo ---" yields an empty document when the
# template renders nothing, which yq reports as length 0.

load _helpers

# Rules alone are not enough: neither enable flag is set, so nothing renders.
@test "prometheus/PrometheusRules-server: assertDisabled by default" {
  cd `chart_dir`
  local actual=$( (helm template \
      --show-only templates/prometheus-prometheusrules.yaml \
      --set 'serverTelemetry.prometheusRules.rules.foo=bar' \
      . || echo "---") | tee /dev/stderr |
      yq 'length > 0' | tee /dev/stderr)
  [ "${actual}" = "false" ]
}

# Enabled but with no rules defined: still nothing renders.
@test "prometheus/PrometheusRules-server: assertDisabled with rules-defined=false" {
  cd `chart_dir`
  local actual=$( (helm template \
      --show-only templates/prometheus-prometheusrules.yaml \
      --set 'serverTelemetry.prometheusRules.enabled=true' \
      . || echo "---") | tee /dev/stderr | yq 'length > 0' | tee /dev/stderr)
  [ "${actual}" = "false" ]
}

# Enabled with two rules: a single group named after the release holds both.
@test "prometheus/PrometheusRules-server: assertEnabled with rules-defined=true" {
  cd `chart_dir`
  local output=$( (helm template \
      --show-only templates/prometheus-prometheusrules.yaml \
      --set 'serverTelemetry.prometheusRules.enabled=true' \
      --set 'serverTelemetry.prometheusRules.rules.foo=bar' \
      --set 'serverTelemetry.prometheusRules.rules.baz=qux' \
      .) | tee /dev/stderr )

  [ "$(echo "$output" | yq -r '.spec.groups | length')" = "1" ]
  [ "$(echo "$output" | yq -r '.spec.groups[0] | length')" = "2" ]
  [ "$(echo "$output" | yq -r '.spec.groups[0].name')" = "release-name-vault" ]
  [ "$(echo "$output" | yq -r '.spec.groups[0].rules | length')" = "2" ]
  [ "$(echo "$output" | yq -r '.spec.groups[0].rules.foo')" = "bar" ]
  [ "$(echo "$output" | yq -r '.spec.groups[0].rules.baz')" = "qux" ]
}

# With no selectors configured, the default "release: prometheus" label is added.
@test "prometheus/PrometheusRules-server: assertSelectors default" {
  cd `chart_dir`
  local output=$( (helm template \
      --show-only templates/prometheus-prometheusrules.yaml \
      --set 'serverTelemetry.prometheusRules.enabled=true' \
      --set 'serverTelemetry.prometheusRules.rules.foo=bar' \
      . ) | tee /dev/stderr)

  [ "$(echo "$output" | yq -r '.metadata.labels | length')" = "5" ]
  [ "$(echo "$output" | yq -r '.metadata.labels.release')" = "prometheus" ]
}

# Custom selectors replace the default "release: prometheus" label entirely.
@test "prometheus/PrometheusRules-server: assertSelectors overrides" {
  cd `chart_dir`
  local output=$( (helm template \
      --show-only templates/prometheus-prometheusrules.yaml \
      --set 'serverTelemetry.prometheusRules.enabled=true' \
      --set 'serverTelemetry.prometheusRules.rules.foo=bar' \
      --set 'serverTelemetry.prometheusRules.selectors.baz=qux' \
      --set 'serverTelemetry.prometheusRules.selectors.bar=foo' \
      . ) | tee /dev/stderr)

  [ "$(echo "$output" | yq -r '.metadata.labels | length')" = "6" ]
  [ "$(echo "$output" | yq -r '.metadata.labels | has("app")')" = "false" ]
  [ "$(echo "$output" | yq -r '.metadata.labels | has("kube-prometheus-stack")')" = "false" ]
  [ "$(echo "$output" | yq -r '.metadata.labels.baz')" = "qux" ]
  [ "$(echo "$output" | yq -r '.metadata.labels.bar')" = "foo" ]
}

View file

@ -0,0 +1,125 @@
#!/usr/bin/env bats

# Unit tests for templates/prometheus-servicemonitor.yaml.
# The ServiceMonitor must render when enabled (globally or per-chart),
# honor interval/scrapeTimeout/selectors overrides, and pick the endpoint
# port from the TLS setting. "helm template ... || echo ---" yields an
# empty document when nothing renders, which yq reports as length 0.

load _helpers

# Disabled by default: nothing renders without an enable flag.
@test "prometheus/ServiceMonitor-server: assertDisabled by default" {
  cd `chart_dir`
  local actual=$( (helm template \
      --show-only templates/prometheus-servicemonitor.yaml \
      . || echo "---") | tee /dev/stderr |
      yq 'length > 0' | tee /dev/stderr)
  [ "${actual}" = "false" ]
}

# The global prometheusOperator flag wins even when the local toggle is off.
@test "prometheus/ServiceMonitor-server: assertEnabled global" {
  cd `chart_dir`
  local actual=$( (helm template \
      --show-only templates/prometheus-servicemonitor.yaml \
      --set 'serverTelemetry.serviceMonitor.enabled=false' \
      --set 'global.serverTelemetry.prometheusOperator=true' \
      . || echo "---") | tee /dev/stderr |
      yq 'length > 0' | tee /dev/stderr)
  [ "${actual}" = "true" ]
}

# The chart-level toggle alone is sufficient.
@test "prometheus/ServiceMonitor-server: assertEnabled" {
  cd `chart_dir`
  local actual=$( (helm template \
      --show-only templates/prometheus-servicemonitor.yaml \
      --set 'serverTelemetry.serviceMonitor.enabled=true' \
      . || echo "---") | tee /dev/stderr |
      yq 'length > 0' | tee /dev/stderr)
  [ "${actual}" = "true" ]
}

# scrapeTimeout defaults to 10s.
@test "prometheus/ServiceMonitor-server: assertScrapeTimeout default" {
  cd `chart_dir`
  local actual=$( (helm template \
      --show-only templates/prometheus-servicemonitor.yaml \
      --set 'serverTelemetry.serviceMonitor.enabled=true' \
      . ) | tee /dev/stderr |
      yq -r '.spec.endpoints[0].scrapeTimeout' | tee /dev/stderr)
  [ "${actual}" = "10s" ]
}

# scrapeTimeout can be overridden.
@test "prometheus/ServiceMonitor-server: assertScrapeTimeout update" {
  cd `chart_dir`
  local actual=$( (helm template \
      --show-only templates/prometheus-servicemonitor.yaml \
      --set 'serverTelemetry.serviceMonitor.enabled=true' \
      --set 'serverTelemetry.serviceMonitor.scrapeTimeout=60s' \
      . ) | tee /dev/stderr |
      yq -r '.spec.endpoints[0].scrapeTimeout' | tee /dev/stderr)
  [ "${actual}" = "60s" ]
}

# interval defaults to 30s.
@test "prometheus/ServiceMonitor-server: assertInterval default" {
  cd `chart_dir`
  local actual=$( (helm template \
      --show-only templates/prometheus-servicemonitor.yaml \
      --set 'serverTelemetry.serviceMonitor.enabled=true' \
      . ) | tee /dev/stderr |
      yq -r '.spec.endpoints[0].interval' | tee /dev/stderr)
  [ "${actual}" = "30s" ]
}

# interval can be overridden.
@test "prometheus/ServiceMonitor-server: assertInterval update" {
  cd `chart_dir`
  local output=$( (helm template \
      --show-only templates/prometheus-servicemonitor.yaml \
      --set 'serverTelemetry.serviceMonitor.enabled=true' \
      --set 'serverTelemetry.serviceMonitor.interval=60s' \
      . ) | tee /dev/stderr)

  [ "$(echo "$output" | yq -r '.spec.endpoints[0].interval')" = "60s" ]
}

# With no selectors configured, the default "release: prometheus" label is added.
@test "prometheus/ServiceMonitor-server: assertSelectors default" {
  cd `chart_dir`
  local output=$( (helm template \
      --show-only templates/prometheus-servicemonitor.yaml \
      --set 'serverTelemetry.serviceMonitor.enabled=true' \
      . ) | tee /dev/stderr)

  [ "$(echo "$output" | yq -r '.metadata.labels | length')" = "5" ]
  [ "$(echo "$output" | yq -r '.metadata.labels.release')" = "prometheus" ]
}

# Custom selectors replace the default "release: prometheus" label entirely.
@test "prometheus/ServiceMonitor-server: assertSelectors override" {
  cd `chart_dir`
  local output=$( (helm template \
      --show-only templates/prometheus-servicemonitor.yaml \
      --set 'serverTelemetry.serviceMonitor.enabled=true' \
      --set 'serverTelemetry.serviceMonitor.selectors.baz=qux' \
      --set 'serverTelemetry.serviceMonitor.selectors.bar=foo' \
      . ) | tee /dev/stderr)

  [ "$(echo "$output" | yq -r '.metadata.labels | length')" = "6" ]
  [ "$(echo "$output" | yq -r '.metadata.labels | has("app")')" = "false" ]
  [ "$(echo "$output" | yq -r '.metadata.labels.baz')" = "qux" ]
  [ "$(echo "$output" | yq -r '.metadata.labels.bar')" = "foo" ]
}

# TLS disabled: single endpoint scraping the plain "http" port.
@test "prometheus/ServiceMonitor-server: assertEndpoints noTLS" {
  cd `chart_dir`
  local output=$( (helm template \
      --show-only templates/prometheus-servicemonitor.yaml \
      --set 'global.tlsDisable=true' \
      --set 'serverTelemetry.serviceMonitor.enabled=true' \
      . ) | tee /dev/stderr)

  [ "$(echo "$output" | yq -r '.spec.endpoints | length')" = "1" ]
  [ "$(echo "$output" | yq -r '.spec.endpoints[0].port')" = "http" ]
}

# TLS enabled: single endpoint scraping the "https" port.
@test "prometheus/ServiceMonitor-server: assertEndpoints TLS" {
  cd `chart_dir`
  local output=$( (helm template \
      --show-only templates/prometheus-servicemonitor.yaml \
      --set 'global.tlsDisable=false' \
      --set 'serverTelemetry.serviceMonitor.enabled=true' \
      . ) | tee /dev/stderr)

  [ "$(echo "$output" | yq -r '.spec.endpoints | length')" = "1" ]
  [ "$(echo "$output" | yq -r '.spec.endpoints[0].port')" = "https" ]
}

View file

@ -32,6 +32,11 @@ global:
seccomp.security.alpha.kubernetes.io/defaultProfileName: runtime/default
apparmor.security.beta.kubernetes.io/defaultProfileName: runtime/default
serverTelemetry:
# Enable integration with the Prometheus Operator
# See the top level serverTelemetry section below before enabling this feature.
prometheusOperator: false
injector:
# True if you want to enable vault agent injection.
# @default: global.enabled
@ -705,6 +710,10 @@ server:
tls_disable = 1
address = "[::]:8200"
cluster_address = "[::]:8201"
# Enable unauthenticated metrics access (necessary for Prometheus Operator)
#telemetry {
# unauthenticated_metrics_access = "true"
#}
}
storage "file" {
path = "/vault/data"
@ -720,6 +729,12 @@ server:
# crypto_key = "vault-helm-unseal-key"
#}
# Example configuration for enabling Prometheus metrics in your config.
#telemetry {
# prometheus_retention_time = "30s",
# disable_hostname = true
#}
# Run Vault in "HA" mode. There are no storage requirements unless the audit log
# persistence is required. In HA mode Vault will configure itself to use Consul
# for its storage backend. The default configuration provided will work the Consul
@ -761,6 +776,10 @@ server:
tls_disable = 1
address = "[::]:8200"
cluster_address = "[::]:8201"
# Enable unauthenticated metrics access (necessary for Prometheus Operator)
#telemetry {
# unauthenticated_metrics_access = "true"
#}
}
storage "raft" {
@ -802,6 +821,14 @@ server:
# crypto_key = "vault-helm-unseal-key"
#}
# Example configuration for enabling Prometheus metrics.
# If you are using Prometheus Operator you can enable a ServiceMonitor resource below.
# You may wish to enable unauthenticated metrics in the listener block above.
#telemetry {
# prometheus_retention_time = "30s",
# disable_hostname = true
#}
# A disruption budget limits the number of pods of a replicated application
# that are down simultaneously from voluntary disruptions
disruptionBudget:
@ -1008,3 +1035,85 @@ csi:
# See https://www.vaultproject.io/docs/platform/k8s/csi/configurations#command-line-arguments
# for the available command line flags.
extraArgs: []
# Vault is able to collect and publish various runtime metrics.
# Enabling this feature requires adding a `telemetry{}` stanza to
# the Vault configuration. There are a few examples included in the `config` sections above.
#
# For more information see:
# https://www.vaultproject.io/docs/configuration/telemetry
# https://www.vaultproject.io/docs/internals/telemetry
serverTelemetry:
# Enable support for the Prometheus Operator. Currently, this chart does not support
# authenticating to Vault's metrics endpoint, so the following `telemetry{}` must be included
# in the `listener "tcp"{}` stanza
# telemetry {
# unauthenticated_metrics_access = "true"
# }
#
# See the `standalone.config` for a more complete example of this.
#
# In addition, a top level `telemetry{}` stanza must also be included in the Vault configuration:
#
# example:
# telemetry {
# prometheus_retention_time = "30s",
# disable_hostname = true
# }
#
# Configuration for monitoring the Vault server.
serviceMonitor:
# The Prometheus operator *must* be installed before enabling this feature,
# if not the chart will fail to install due to missing CustomResourceDefinitions
# provided by the operator.
#
# Instructions on how to install the Helm chart can be found here:
# https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
# More information can be found here:
# https://github.com/prometheus-operator/prometheus-operator
# https://github.com/prometheus-operator/kube-prometheus
# Enable deployment of the Vault Server ServiceMonitor CustomResource.
enabled: false
# Selector labels to add to the ServiceMonitor.
# When empty, defaults to:
# release: prometheus
selectors: {}
# Interval at which Prometheus scrapes metrics
interval: 30s
# Timeout for Prometheus scrapes
scrapeTimeout: 10s
prometheusRules:
# The Prometheus operator *must* be installed before enabling this feature,
# if not the chart will fail to install due to missing CustomResourceDefinitions
# provided by the operator.
# Deploy the PrometheusRule custom resource for AlertManager based alerts.
# Requires that AlertManager is properly deployed.
enabled: false
# Selector labels to add to the PrometheusRules.
# When empty, defaults to:
# release: prometheus
selectors: {}
# Some example rules.
rules: {}
# - alert: vault-HighResponseTime
# annotations:
# message: The response time of Vault is over 500ms on average over the last 5 minutes.
# expr: vault_core_handle_request{quantile="0.5", namespace="mynamespace"} > 500
# for: 5m
# labels:
# severity: warning
# - alert: vault-HighResponseTime
# annotations:
# message: The response time of Vault is over 1s on average over the last 5 minutes.
# expr: vault_core_handle_request{quantile="0.5", namespace="mynamespace"} > 1000
# for: 5m
# labels:
# severity: critical