Merge pull request #2957 from ElvinEfendi/batch-metrics

Batch metrics and flush periodically
This commit is contained in:
k8s-ci-robot 2018-08-18 14:15:06 -07:00 committed by GitHub
commit a982713b3b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 298 additions and 265 deletions

View file

@ -20,6 +20,7 @@ import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net"
"os"
@ -206,13 +207,14 @@ func (sc *SocketCollector) handleMessage(msg []byte) {
glog.V(5).Infof("msg: %v", string(msg))
// Unmarshall bytes
var stats socketData
err := json.Unmarshal(msg, &stats)
var statsBatch []socketData
err := json.Unmarshal(msg, &statsBatch)
if err != nil {
glog.Errorf("Unexpected error deserializing JSON paylod: %v", err)
glog.Errorf("Unexpected error deserializing JSON paylod: %v. Payload:\n%v", err, string(msg))
return
}
for _, stats := range statsBatch {
requestLabels := prometheus.Labels{
"host": stats.Host,
"status": stats.Status,
@ -295,6 +297,7 @@ func (sc *SocketCollector) handleMessage(msg []byte) {
}
}
}
}
// Start listen for connections in the unix socket and spawns a goroutine to process the content
func (sc *SocketCollector) Start() {
@ -408,19 +411,15 @@ func (sc SocketCollector) Collect(ch chan<- prometheus.Metric) {
sc.bytesSent.Collect(ch)
}
const packetSize = 1024 * 65
// handleMessages process the content received in a network connection
func handleMessages(conn io.ReadCloser, fn func([]byte)) {
defer conn.Close()
msg := make([]byte, packetSize)
s, err := conn.Read(msg[0:])
data, err := ioutil.ReadAll(conn)
if err != nil {
return
}
fn(msg[0:s])
fn(data)
}
func deleteConstants(labels prometheus.Labels) {

View file

@ -100,7 +100,7 @@ func TestCollector(t *testing.T) {
},
{
name: "valid metric object should update prometheus metrics",
data: []string{`{
data: []string{`[{
"host":"testshop.com",
"status":"200",
"bytesSent":150.0,
@ -115,7 +115,7 @@ func TestCollector(t *testing.T) {
"namespace":"test-app-production",
"ingress":"web-yml",
"service":"test-app"
}`},
}]`},
metrics: []string{"nginx_ingress_controller_response_duration_seconds"},
wantBefore: `
# HELP nginx_ingress_controller_response_duration_seconds The time spent on receiving the response from the upstream server
@ -142,7 +142,7 @@ func TestCollector(t *testing.T) {
{
name: "multiple messages should increase prometheus metric by two",
data: []string{`{
data: []string{`[{
"host":"testshop.com",
"status":"200",
"bytesSent":150.0,
@ -157,7 +157,7 @@ func TestCollector(t *testing.T) {
"namespace":"test-app-production",
"ingress":"web-yml",
"service":"test-app"
}`, `{
}]`, `[{
"host":"testshop.com",
"status":"200",
"bytesSent":150.0,
@ -172,7 +172,7 @@ func TestCollector(t *testing.T) {
"namespace":"test-app-qa",
"ingress":"web-yml-qa",
"service":"test-app-qa"
}`, `{
}]`, `[{
"host":"testshop.com",
"status":"200",
"bytesSent":150.0,
@ -187,7 +187,7 @@ func TestCollector(t *testing.T) {
"namespace":"test-app-qa",
"ingress":"web-yml-qa",
"service":"test-app-qa"
}`},
}]`},
metrics: []string{"nginx_ingress_controller_response_duration_seconds"},
wantBefore: `
# HELP nginx_ingress_controller_response_duration_seconds The time spent on receiving the response from the upstream server
@ -222,6 +222,65 @@ func TestCollector(t *testing.T) {
nginx_ingress_controller_response_duration_seconds_count{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml-qa",method="GET",namespace="test-app-qa",path="/admin",service="test-app-qa",status="200"} 2
`,
},
{
name: "collector should be able to handle batched metrics correctly",
data: []string{`[
{
"host":"testshop.com",
"status":"200",
"bytesSent":150.0,
"method":"GET",
"path":"/admin",
"requestLength":300.0,
"requestTime":60.0,
"upstreamName":"test-upstream",
"upstreamIP":"1.1.1.1:8080",
"upstreamResponseTime":200,
"upstreamStatus":"220",
"namespace":"test-app-production",
"ingress":"web-yml",
"service":"test-app"
},
{
"host":"testshop.com",
"status":"200",
"bytesSent":150.0,
"method":"GET",
"path":"/admin",
"requestLength":300.0,
"requestTime":60.0,
"upstreamName":"test-upstream",
"upstreamIP":"1.1.1.1:8080",
"upstreamResponseTime":100,
"upstreamStatus":"220",
"namespace":"test-app-production",
"ingress":"web-yml",
"service":"test-app"
}]`},
metrics: []string{"nginx_ingress_controller_response_duration_seconds"},
wantBefore: `
# HELP nginx_ingress_controller_response_duration_seconds The time spent on receiving the response from the upstream server
# TYPE nginx_ingress_controller_response_duration_seconds histogram
nginx_ingress_controller_response_duration_seconds_bucket{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="200",le="0.005"} 0
nginx_ingress_controller_response_duration_seconds_bucket{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="200",le="0.01"} 0
nginx_ingress_controller_response_duration_seconds_bucket{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="200",le="0.025"} 0
nginx_ingress_controller_response_duration_seconds_bucket{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="200",le="0.05"} 0
nginx_ingress_controller_response_duration_seconds_bucket{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="200",le="0.1"} 0
nginx_ingress_controller_response_duration_seconds_bucket{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="200",le="0.25"} 0
nginx_ingress_controller_response_duration_seconds_bucket{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="200",le="0.5"} 0
nginx_ingress_controller_response_duration_seconds_bucket{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="200",le="1"} 0
nginx_ingress_controller_response_duration_seconds_bucket{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="200",le="2.5"} 0
nginx_ingress_controller_response_duration_seconds_bucket{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="200",le="5"} 0
nginx_ingress_controller_response_duration_seconds_bucket{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="200",le="10"} 0
nginx_ingress_controller_response_duration_seconds_bucket{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="200",le="+Inf"} 2
nginx_ingress_controller_response_duration_seconds_sum{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="200"} 300
nginx_ingress_controller_response_duration_seconds_count{controller_class="ingress",controller_namespace="default",controller_pod="pod",host="testshop.com",ingress="web-yml",method="GET",namespace="test-app-production",path="/admin",service="test-app",status="200"} 2
`,
removeIngresses: []string{"test-app-production/web-yml"},
wantAfter: `
`,
},
}
for _, c := range cases {

View file

@ -1,50 +1,83 @@
local socket = ngx.socket.tcp
local cjson = require('cjson')
local defer = require('util.defer')
local assert = assert
local metrics_batch = {}
-- if an Nginx worker processes more than (MAX_BATCH_SIZE/FLUSH_INTERVAL) RPS then it will start dropping metrics
local MAX_BATCH_SIZE = 10000
local FLUSH_INTERVAL = 1 -- second
local _M = {}
local function send_data(jsonData)
local function send(payload)
local s = assert(socket())
assert(s:connect('unix:/tmp/prometheus-nginx.socket'))
assert(s:send(jsonData))
assert(s:connect("unix:/tmp/prometheus-nginx.socket"))
assert(s:send(payload))
assert(s:close())
end
function _M.encode_nginx_stats()
return cjson.encode({
local function metrics()
return {
host = ngx.var.host or "-",
method = ngx.var.request_method or "-",
namespace = ngx.var.namespace or "-",
ingress = ngx.var.ingress_name or "-",
service = ngx.var.service_name or "-",
path = ngx.var.location_path or "-",
method = ngx.var.request_method or "-",
status = ngx.var.status or "-",
requestLength = tonumber(ngx.var.request_length) or -1,
requestTime = tonumber(ngx.var.request_time) or -1,
responseLength = tonumber(ngx.var.bytes_sent) or -1,
endpoint = ngx.var.upstream_addr or "-",
upstreamLatency = tonumber(ngx.var.upstream_connect_time) or -1,
upstreamResponseTime = tonumber(ngx.var.upstream_response_time) or -1,
upstreamResponseLength = tonumber(ngx.var.upstream_response_length) or -1,
upstreamStatus = ngx.var.upstream_status or "-",
}
end
namespace = ngx.var.namespace or "-",
ingress = ngx.var.ingress_name or "-",
service = ngx.var.service_name or "-",
})
local function flush(premature)
if premature then
return
end
if #metrics_batch == 0 then
return
end
local current_metrics_batch = metrics_batch
metrics_batch = {}
local ok, payload = pcall(cjson.encode, current_metrics_batch)
if not ok then
ngx.log(ngx.ERR, "error while encoding metrics: " .. tostring(payload))
return
end
send(payload)
end
function _M.init_worker()
local _, err = ngx.timer.every(FLUSH_INTERVAL, flush)
if err then
ngx.log(ngx.ERR, string.format("error when setting up timer.every: %s", tostring(err)))
end
end
function _M.call()
local ok, err = defer.to_timer_phase(send_data, _M.encode_nginx_stats())
if not ok then
ngx.log(ngx.ERR, "failed to defer send_data to timer phase: ", err)
if #metrics_batch >= MAX_BATCH_SIZE then
ngx.log(ngx.WARN, "omiting metrics for the request, current batch is full")
return
end
table.insert(metrics_batch, metrics())
end
if _TEST then
_M.flush = flush
_M.get_metrics_batch = function() return metrics_batch end
end
return _M

View file

@ -1,12 +0,0 @@
-- Busted spec for util.defer. _TEST must be set before the module is
-- required so that any test-only hooks are exposed.
_G._TEST = true

local defer = require("util.defer")

describe("Defer", function()
  describe("to_timer_phase", function()
    it("executes passed callback immediately if called on timer phase", function()
      defer.counter = 0

      local increment = function()
        defer.counter = defer.counter + 1
      end
      defer.to_timer_phase(increment)

      -- The callback must have run inline, exactly once.
      assert.are.equal(1, defer.counter)
    end)
  end)
end)

View file

@ -1,96 +1,106 @@
_G._TEST = true
local cjson = require("cjson")
local original_ngx = ngx
local function reset_ngx()
_G.ngx = original_ngx
end
local function mock_ngx(mock)
local _ngx = mock
setmetatable(_ngx, { __index = ngx })
_G.ngx = _ngx
end
local function mock_ngx_socket_tcp()
local tcp_mock = {}
stub(tcp_mock, "connect", true)
stub(tcp_mock, "send", true)
stub(tcp_mock, "close", true)
local socket_mock = {}
stub(socket_mock, "tcp", tcp_mock)
mock_ngx({ socket = socket_mock })
return tcp_mock
end
describe("Monitor", function()
after_each(function()
reset_ngx()
package.loaded["monitor"] = nil
end)
it("batches metrics", function()
local monitor = require("monitor")
describe("encode_nginx_stats()", function()
it("successfuly encodes the current stats of nginx to JSON", function()
local nginx_environment = {
host = "testshop.com",
status = "200",
bytes_sent = "150",
server_protocol = "HTTP",
mock_ngx({ var = {} })
for i = 1,10,1 do
monitor.call()
end
assert.equal(10, #monitor.get_metrics_batch())
end)
describe("flush", function()
it("short circuits when premmature is true (when worker is shutting down)", function()
local tcp_mock = mock_ngx_socket_tcp()
local monitor = require("monitor")
mock_ngx({ var = {} })
for i = 1,10,1 do
monitor.call()
end
monitor.flush(true)
assert.stub(tcp_mock.connect).was_not_called()
end)
it("short circuits when there's no metrics batched", function()
local tcp_mock = mock_ngx_socket_tcp()
local monitor = require("monitor")
monitor.flush()
assert.stub(tcp_mock.connect).was_not_called()
end)
it("JSON encodes and sends the batched metrics", function()
local tcp_mock = mock_ngx_socket_tcp()
local monitor = require("monitor")
local ngx_var_mock = {
host = "example.com",
namespace = "default",
ingress_name = "example",
service_name = "http-svc",
location_path = "/",
request_method = "GET",
location_path = "/admin",
request_length = "300",
request_time = "210",
proxy_upstream_name = "test-upstream",
upstream_addr = "2.2.2.2",
upstream_response_time = "200",
upstream_response_length = "150",
upstream_connect_time = "1",
upstream_status = "220",
namespace = "test-app-production",
ingress_name = "web-yml",
service_name = "test-app",
}
ngx.var = nginx_environment
local encode_nginx_stats = monitor.encode_nginx_stats
local encoded_json_stats = encode_nginx_stats()
local decoded_json_stats = cjson.decode(encoded_json_stats)
local expected_json_stats = {
host = "testshop.com",
status = "200",
responseLength = 150.0,
method = "GET",
path = "/admin",
requestLength = 300.0,
requestTime = 210.0,
endpoint = "2.2.2.2",
upstreamResponseTime = 200,
upstreamStatus = "220",
upstreamLatency = 1.0,
upstreamResponseLength = 150.0,
namespace = "test-app-production",
ingress = "web-yml",
service = "test-app",
request_length = "256",
request_time = "0.04",
bytes_sent = "512",
upstream_addr = "10.10.0.1",
upstream_connect_time = "0.01",
upstream_response_time = "0.02",
upstream_response_length = "456",
upstream_status = "200",
}
mock_ngx({ var = ngx_var_mock })
monitor.call()
assert.are.same(decoded_json_stats,expected_json_stats)
end)
local ngx_var_mock1 = ngx_var_mock
ngx_var_mock1.status = "201"
ngx_var_mock1.request_method = "POST"
mock_ngx({ var = ngx_var_mock })
monitor.call()
it("replaces empty numeric keys with -1 and missing string keys with -", function()
local nginx_environment = {
remote_addr = "10.10.10.10",
realip_remote_addr = "5.5.5.5",
remote_user = "francisco",
server_protocol = "HTTP",
request_method = "GET",
location_path = "/admin",
request_time = "202",
proxy_upstream_name = "test-upstream",
upstream_addr = "2.2.2.2",
upstream_response_time = "201",
upstream_status = "220",
ingress_name = "web-yml",
}
ngx.var = nginx_environment
monitor.flush()
local encode_nginx_stats = monitor.encode_nginx_stats
local encoded_json_stats = encode_nginx_stats()
local decoded_json_stats = cjson.decode(encoded_json_stats)
local expected_payload = '[{"upstreamStatus":"200","requestLength":256,"ingress":"example","status":"200","service":"http-svc","requestTime":0.04,"namespace":"default","host":"example.com","method":"GET","upstreamResponseTime":0.02,"upstreamResponseLength":456,"endpoint":"10.10.0.1","upstreamLatency":0.01,"path":"\\/","responseLength":512},{"upstreamStatus":"200","requestLength":256,"ingress":"example","status":"201","service":"http-svc","requestTime":0.04,"namespace":"default","host":"example.com","method":"POST","upstreamResponseTime":0.02,"upstreamResponseLength":456,"endpoint":"10.10.0.1","upstreamLatency":0.01,"path":"\\/","responseLength":512}]'
local expected_json_stats = {
host = "-",
status = "-",
responseLength = -1,
method = "GET",
path = "/admin",
requestLength = -1,
requestTime = 202.0,
endpoint = "2.2.2.2",
upstreamStatus = "220",
namespace = "-",
ingress = "web-yml",
upstreamLatency = -1,
upstreamResponseTime = 201,
upstreamResponseLength = -1,
responseLength = -1,
service = "-",
}
assert.are.same(decoded_json_stats,expected_json_stats)
assert.stub(tcp_mock.connect).was_called_with(tcp_mock, "unix:/tmp/prometheus-nginx.socket")
assert.stub(tcp_mock.send).was_called_with(tcp_mock, expected_payload)
assert.stub(tcp_mock.close).was_called_with(tcp_mock)
end)
end)
end)

View file

@ -1,57 +0,0 @@
local util = require("util")

-- Functions waiting to be run in timer context, drained by a zero-delay timer.
local pending = {}
-- True while a drain timer is scheduled, so we only ever arm one at a time.
local scheduled = false
local MAX_QUEUE_SIZE = 10000

local _M = {}

-- Timer callback: swaps out the pending list, resets the scheduling flag,
-- then invokes every queued function with its saved arguments.
local function drain(premature)
  -- TODO Investigate if we should actually still flush the queue when we're
  -- shutting down.
  if premature then
    return
  end

  local batch = pending
  pending = {}
  scheduled = false

  for _, item in ipairs(batch) do
    item.func(unpack(item.args))
  end
end

-- `to_timer_phase` will enqueue a function that will be executed in a timer
-- context, at a later point in time. The purpose is that some APIs (such as
-- sockets) are not available during some nginx request phases (such as the
-- logging phase), but are available for use in timers. There are no ordering
-- guarantees for when a function will be executed.
function _M.to_timer_phase(func, ...)
  -- Already in a timer: no need to defer, run inline.
  if ngx.get_phase() == "timer" then
    func(...)
    return true
  end

  if #pending >= MAX_QUEUE_SIZE then
    ngx.log(ngx.ERR, "deferred timer queue full")
    return nil, "deferred timer queue full"
  end

  pending[#pending + 1] = { func = func, args = {...} }

  if not scheduled then
    local ok, err = ngx.timer.at(0, drain)
    if not ok then
      local msg = "failed to create timer: " .. tostring(err)
      ngx.log(ngx.ERR, msg)
      return nil, msg
    end
    -- unfortunately this is to deal with tests - when running unit tests, we
    -- dont actually run the timer, we call the function inline
    if util.tablelength(pending) > 0 then
      scheduled = true
    end
  end

  return true
end

return _M

View file

@ -90,6 +90,7 @@ http {
{{ if $all.DynamicConfigurationEnabled }}
init_worker_by_lua_block {
balancer.init_worker()
monitor.init_worker()
}
{{ end }}
{{ end }}