Posted at

AWS ECS Agent の Prometheus Metrics を試してみる

AWS ECS Agent 1.24.0 に Prometheus Metrics をエクスポートする機能がマージされていたので試してみた。

https://github.com/aws/amazon-ecs-agent/blob/master/CHANGELOG.md#1240

* Feature - Introduce prometheus support for agent metrics #1745


機能を有効化する

ECS Agent が Prometheus のメトリクスをエクスポートするには ecs.config に1行追加するだけで良い。


  • /etc/ecs/ecs.config

+ ECS_ENABLE_PROMETHEUS_METRICS=true

メトリクスの Listen ポートは 51680 となっている。


メトリクスの一覧

とりあえず、curl で叩いてみる。

$ curl localhost:51680/metrics

# HELP AgentMetrics_DockerAPI_call_count
# TYPE AgentMetrics_DockerAPI_call_count counter
AgentMetrics_DockerAPI_call_count{Call="CREATE_CONTAINER"} 1
AgentMetrics_DockerAPI_call_count{Call="INSPECT_CONTAINER"} 3
AgentMetrics_DockerAPI_call_count{Call="INSPECT_IMAGE"} 2
AgentMetrics_DockerAPI_call_count{Call="LOAD_IMAGE"} 1
AgentMetrics_DockerAPI_call_count{Call="PULL_IMAGE"} 1
AgentMetrics_DockerAPI_call_count{Call="START_CONTAINER"} 1
# HELP AgentMetrics_DockerAPI_call_duration DockerAPI call duration in seconds individual
# TYPE AgentMetrics_DockerAPI_call_duration gauge
AgentMetrics_DockerAPI_call_duration{Call="CREATE_CONTAINER"} 0.257324464
AgentMetrics_DockerAPI_call_duration{Call="INSPECT_CONTAINER"} 0.00124233
AgentMetrics_DockerAPI_call_duration{Call="INSPECT_IMAGE"} 0.010489645
AgentMetrics_DockerAPI_call_duration{Call="LOAD_IMAGE"} 0.370162726
AgentMetrics_DockerAPI_call_duration{Call="PULL_IMAGE"} 19.683362793
AgentMetrics_DockerAPI_call_duration{Call="START_CONTAINER"} 0.43437413
# HELP AgentMetrics_DockerAPI_duration_seconds DockerAPI call duration in seconds
# TYPE AgentMetrics_DockerAPI_duration_seconds summary
AgentMetrics_DockerAPI_duration_seconds_sum{Call="CREATE_CONTAINER"} 0.257324464
AgentMetrics_DockerAPI_duration_seconds_count{Call="CREATE_CONTAINER"} 1
AgentMetrics_DockerAPI_duration_seconds_sum{Call="INSPECT_CONTAINER"} 0.004263372
AgentMetrics_DockerAPI_duration_seconds_count{Call="INSPECT_CONTAINER"} 3
AgentMetrics_DockerAPI_duration_seconds_sum{Call="INSPECT_IMAGE"} 0.011394566
AgentMetrics_DockerAPI_duration_seconds_count{Call="INSPECT_IMAGE"} 2
AgentMetrics_DockerAPI_duration_seconds_sum{Call="LOAD_IMAGE"} 0.370162726
AgentMetrics_DockerAPI_duration_seconds_count{Call="LOAD_IMAGE"} 1
AgentMetrics_DockerAPI_duration_seconds_sum{Call="PULL_IMAGE"} 19.683362793
AgentMetrics_DockerAPI_duration_seconds_count{Call="PULL_IMAGE"} 1
AgentMetrics_DockerAPI_duration_seconds_sum{Call="START_CONTAINER"} 0.43437413
AgentMetrics_DockerAPI_duration_seconds_count{Call="START_CONTAINER"} 1
# HELP AgentMetrics_ECSClient_call_count
# TYPE AgentMetrics_ECSClient_call_count counter
AgentMetrics_ECSClient_call_count{Call="SUBMIT_TASK_EVENTS"} 1
# HELP AgentMetrics_ECSClient_call_duration ECSClient call duration in seconds individual
# TYPE AgentMetrics_ECSClient_call_duration gauge
AgentMetrics_ECSClient_call_duration{Call="SUBMIT_TASK_EVENTS"} 0.071811231
# HELP AgentMetrics_ECSClient_duration_seconds ECSClient call duration in seconds
# TYPE AgentMetrics_ECSClient_duration_seconds summary
AgentMetrics_ECSClient_duration_seconds_sum{Call="SUBMIT_TASK_EVENTS"} 0.071811231
AgentMetrics_ECSClient_duration_seconds_count{Call="SUBMIT_TASK_EVENTS"} 1
# HELP AgentMetrics_StateManager_call_count
# TYPE AgentMetrics_StateManager_call_count counter
AgentMetrics_StateManager_call_count{Call="SAVE"} 15
# HELP AgentMetrics_StateManager_call_duration StateManager call duration in seconds individual
# TYPE AgentMetrics_StateManager_call_duration gauge
AgentMetrics_StateManager_call_duration{Call="SAVE"} 0.003184178
# HELP AgentMetrics_StateManager_duration_seconds StateManager call duration in seconds
# TYPE AgentMetrics_StateManager_duration_seconds summary
AgentMetrics_StateManager_duration_seconds_sum{Call="SAVE"} 0.09526237399999998
AgentMetrics_StateManager_duration_seconds_count{Call="SAVE"} 15
# HELP AgentMetrics_TaskEngine_call_count
# TYPE AgentMetrics_TaskEngine_call_count counter
AgentMetrics_TaskEngine_call_count{Call="ADD_TASK"} 1
# HELP AgentMetrics_TaskEngine_call_duration TaskEngine call duration in seconds individual
# TYPE AgentMetrics_TaskEngine_call_duration gauge
AgentMetrics_TaskEngine_call_duration{Call="ADD_TASK"} 6.2929e-05
# HELP AgentMetrics_TaskEngine_duration_seconds TaskEngine call duration in seconds
# TYPE AgentMetrics_TaskEngine_duration_seconds summary
AgentMetrics_TaskEngine_duration_seconds_sum{Call="ADD_TASK"} 6.2929e-05
AgentMetrics_TaskEngine_duration_seconds_count{Call="ADD_TASK"} 1
# HELP go_gc_duration_seconds A summary of the GC invocation durations.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 3.1468e-05
go_gc_duration_seconds{quantile="0.25"} 4.6355e-05
go_gc_duration_seconds{quantile="0.5"} 5.5721e-05
go_gc_duration_seconds{quantile="0.75"} 8.7131e-05
go_gc_duration_seconds{quantile="1"} 0.003637265
go_gc_duration_seconds_sum 0.006573597
go_gc_duration_seconds_count 16
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 84
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.9.7"} 1
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
# TYPE go_memstats_alloc_bytes gauge
go_memstats_alloc_bytes 4.147696e+06
# HELP go_memstats_alloc_bytes_total Total number of bytes allocated, even if freed.
# TYPE go_memstats_alloc_bytes_total counter
go_memstats_alloc_bytes_total 2.2890728e+07
# HELP go_memstats_buck_hash_sys_bytes Number of bytes used by the profiling bucket hash table.
# TYPE go_memstats_buck_hash_sys_bytes gauge
go_memstats_buck_hash_sys_bytes 1.447925e+06
# HELP go_memstats_frees_total Total number of frees.
# TYPE go_memstats_frees_total counter
go_memstats_frees_total 204574
# HELP go_memstats_gc_cpu_fraction The fraction of this program's available CPU time used by the GC since the program started.
# TYPE go_memstats_gc_cpu_fraction gauge
go_memstats_gc_cpu_fraction 2.0084119566021017e-05
# HELP go_memstats_gc_sys_bytes Number of bytes used for garbage collection system metadata.
# TYPE go_memstats_gc_sys_bytes gauge
go_memstats_gc_sys_bytes 520192
# HELP go_memstats_heap_alloc_bytes Number of heap bytes allocated and still in use.
# TYPE go_memstats_heap_alloc_bytes gauge
go_memstats_heap_alloc_bytes 4.147696e+06
# HELP go_memstats_heap_idle_bytes Number of heap bytes waiting to be used.
# TYPE go_memstats_heap_idle_bytes gauge
go_memstats_heap_idle_bytes 2.260992e+06
# HELP go_memstats_heap_inuse_bytes Number of heap bytes that are in use.
# TYPE go_memstats_heap_inuse_bytes gauge
go_memstats_heap_inuse_bytes 6.291456e+06
# HELP go_memstats_heap_objects Number of allocated objects.
# TYPE go_memstats_heap_objects gauge
go_memstats_heap_objects 29301
# HELP go_memstats_heap_released_bytes Number of heap bytes released to OS.
# TYPE go_memstats_heap_released_bytes gauge
go_memstats_heap_released_bytes 0
# HELP go_memstats_heap_sys_bytes Number of heap bytes obtained from system.
# TYPE go_memstats_heap_sys_bytes gauge
go_memstats_heap_sys_bytes 8.552448e+06
# HELP go_memstats_last_gc_time_seconds Number of seconds since 1970 of last garbage collection.
# TYPE go_memstats_last_gc_time_seconds gauge
go_memstats_last_gc_time_seconds 1.5484726661935096e+09
# HELP go_memstats_lookups_total Total number of pointer lookups.
# TYPE go_memstats_lookups_total counter
go_memstats_lookups_total 333
# HELP go_memstats_mallocs_total Total number of mallocs.
# TYPE go_memstats_mallocs_total counter
go_memstats_mallocs_total 233875
# HELP go_memstats_mcache_inuse_bytes Number of bytes in use by mcache structures.
# TYPE go_memstats_mcache_inuse_bytes gauge
go_memstats_mcache_inuse_bytes 3472
# HELP go_memstats_mcache_sys_bytes Number of bytes used for mcache structures obtained from system.
# TYPE go_memstats_mcache_sys_bytes gauge
go_memstats_mcache_sys_bytes 16384
# HELP go_memstats_mspan_inuse_bytes Number of bytes in use by mspan structures.
# TYPE go_memstats_mspan_inuse_bytes gauge
go_memstats_mspan_inuse_bytes 101232
# HELP go_memstats_mspan_sys_bytes Number of bytes used for mspan structures obtained from system.
# TYPE go_memstats_mspan_sys_bytes gauge
go_memstats_mspan_sys_bytes 131072
# HELP go_memstats_next_gc_bytes Number of heap bytes when next garbage collection will take place.
# TYPE go_memstats_next_gc_bytes gauge
go_memstats_next_gc_bytes 7.471776e+06
# HELP go_memstats_other_sys_bytes Number of bytes used for other system allocations.
# TYPE go_memstats_other_sys_bytes gauge
go_memstats_other_sys_bytes 680195
# HELP go_memstats_stack_inuse_bytes Number of bytes in use by the stack allocator.
# TYPE go_memstats_stack_inuse_bytes gauge
go_memstats_stack_inuse_bytes 884736
# HELP go_memstats_stack_sys_bytes Number of bytes obtained from system for stack allocator.
# TYPE go_memstats_stack_sys_bytes gauge
go_memstats_stack_sys_bytes 884736
# HELP go_memstats_sys_bytes Number of bytes obtained from system.
# TYPE go_memstats_sys_bytes gauge
go_memstats_sys_bytes 1.2232952e+07
# HELP go_threads Number of OS threads created.
# TYPE go_threads gauge
go_threads 9
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds.
# TYPE process_cpu_seconds_total counter
process_cpu_seconds_total 0.31
# HELP process_max_fds Maximum number of open file descriptors.
# TYPE process_max_fds gauge
process_max_fds 1024
# HELP process_open_fds Number of open file descriptors.
# TYPE process_open_fds gauge
process_open_fds 31
# HELP process_resident_memory_bytes Resident memory size in bytes.
# TYPE process_resident_memory_bytes gauge
process_resident_memory_bytes 1.9263488e+07
# HELP process_start_time_seconds Start time of the process since unix epoch in seconds.
# TYPE process_start_time_seconds gauge
process_start_time_seconds 1.54847142136e+09
# HELP process_virtual_memory_bytes Virtual memory size in bytes.
# TYPE process_virtual_memory_bytes gauge
process_virtual_memory_bytes 3.4983936e+07
# HELP process_virtual_memory_max_bytes Maximum amount of virtual memory available in bytes.
# TYPE process_virtual_memory_max_bytes gauge
process_virtual_memory_max_bytes -1
# HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served.
# TYPE promhttp_metric_handler_requests_in_flight gauge
promhttp_metric_handler_requests_in_flight 1
# HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code.
# TYPE promhttp_metric_handler_requests_total counter
promhttp_metric_handler_requests_total{code="200"} 6
promhttp_metric_handler_requests_total{code="500"} 0
promhttp_metric_handler_requests_total{code="503"} 0

あくまで Agent 自身のメトリクスを出しているだけのようだ。

Image Pull にかかった時間などが取れるようなので、起動が遅いときの原因切り分けの情報として使えそう。