We are using AWS CloudWatch to monitor CPU usage, p99 latency for API calls, etc. The problem is that during peak traffic the Amazon CloudWatch Agent itself consumes 25%-35% of the CPU, and thus contributes heavily to the high-CPU-usage trigger. I have observed a direct correlation between the p99 latency metrics and the CPU usage metrics.
- Is it normal for monitoring tools to be hard on system resources?
- Is there a way to tune the Amazon CloudWatch Agent so that it uses fewer system resources?
I'm pasting the Amazon CloudWatch Agent config file here:
# Agent-wide settings, grouped by purpose. Values are unchanged.
[agent]
# Collection and flush timing. Individual inputs may set their own
# `interval` (the cpu input does, at 10s).
interval = "60s"
round_interval = false
collection_jitter = "0s"
flush_interval = "1s"
flush_jitter = "0s"
precision = ""

# Metric batching and buffering.
metric_batch_size = 1000
metric_buffer_limit = 10000

# The agent's own log output.
debug = false
quiet = false
logfile = "/opt/aws/amazon-cloudwatch-agent/logs/amazon-cloudwatch-agent.log"
logtarget = "lumberjack"

# Host naming. An empty hostname is left as-is here.
# NOTE(review): presumably the agent falls back to the OS hostname - confirm.
hostname = ""
omit_hostname = false
[inputs]

# CPU input: only the `usage_active` field is kept, sampled every 10s
# (overrides the 60s agent-level interval). Per-core series are enabled
# and the aggregate series is disabled.
[[inputs.cpu]]
interval = "10s"
fieldpass = ["usage_active"]
report_active = true
percpu = true
totalcpu = false

  [inputs.cpu.tags]
  # `metricPath` is the routing tag matched by the outputs' `tagpass` tables.
  metricPath = "metrics"
  # Key must stay quoted: `:` is not allowed in a bare TOML key.
  # NOTE(review): presumably marks these 10s samples as high-resolution
  # metrics - confirm against the CloudWatch agent documentation.
  "aws:StorageResolution" = "true"
# Disk input: `total` and `used` for the / and /tmp mount points, every 60s.
[[inputs.disk]]
interval = "60s"
mount_points = ["/", "/tmp"]
fieldpass = ["total", "used"]
tagexclude = ["mode"]

  [inputs.disk.tags]
  # Routing tag matched by the outputs' `tagpass` tables.
  metricPath = "metrics"
# Log-file input: six file patterns, each mapped to a log group/stream of
# the same name. Every entry sets `from_beginning = true` and `pipe = false`.
#
# NOTE(review): all patterns are globs (`app.log.*`, `webhook.log.*`,
# `some_name.*`, and the recursive `query-logs/**`). If agent CPU usage is a
# concern, check how many files each pattern actually resolves to on the
# host - presumably wide globs add tailing work; confirm before narrowing.
[[inputs.logfile]]
destination = "cloudwatchlogs"
file_state_folder = "/opt/aws/amazon-cloudwatch-agent/logs/state"

  # Application access logs (app2).
  [[inputs.logfile.file_config]]
  file_path = "/home/ubuntu/access-logs-app2/app.log.*"
  log_group_name = "access-logs-app2"
  log_stream_name = "access-logs-app2"
  from_beginning = true
  pipe = false

  # Webhook logs (app2).
  [[inputs.logfile.file_config]]
  file_path = "/home/ubuntu/webhooks-logs-app2/webhook.log.*"
  log_group_name = "webhooks-logs-app2"
  log_stream_name = "webhooks-logs-app2"
  from_beginning = true
  pipe = false

  # Application access logs (app).
  [[inputs.logfile.file_config]]
  file_path = "/home/ubuntu/access-logs-app/app.log.*"
  log_group_name = "access-logs-app"
  log_stream_name = "access-logs-app"
  from_beginning = true
  pipe = false

  # Webhook logs (app).
  [[inputs.logfile.file_config]]
  file_path = "/home/ubuntu/webhooks-logs-app/webhook.log.*"
  log_group_name = "webhooks-logs-app"
  log_stream_name = "webhooks-logs-app"
  from_beginning = true
  pipe = false

  # Database query logs: recursive glob over the whole directory.
  [[inputs.logfile.file_config]]
  file_path = "/home/ubuntu/query-logs/**"
  log_group_name = "db-query-logs"
  log_stream_name = "db-query-logs"
  from_beginning = true
  pipe = false

  # nginx logs.
  [[inputs.logfile.file_config]]
  file_path = "/var/log/nginx/some_name.*"
  log_group_name = "some_name-nginx"
  log_stream_name = "some_name-nginx"
  from_beginning = true
  pipe = false

  [inputs.logfile.tags]
  # Routing tag matched by `[outputs.cloudwatchlogs.tagpass]`.
  metricPath = "logs"
# Memory input: `used`, `cached` and `total`, every 60s.
[[inputs.mem]]
interval = "60s"
fieldpass = ["used", "cached", "total"]

  [inputs.mem.tags]
  # Routing tag matched by the outputs' `tagpass` tables.
  metricPath = "metrics"
[outputs]

# Metrics output: publishes to the "CWAgent" namespace in ap-south-1 using
# the "www-data" profile from the shared credentials file.
[[outputs.cloudwatch]]
namespace = "CWAgent"
region = "ap-south-1"
profile = "www-data"
shared_credential_file = "/var/.aws/credentials"
force_flush_interval = "60s"
# `metricPath` is used for routing (see tagpass below) and is listed here
# for exclusion.
tagexclude = ["metricPath"]

  [outputs.cloudwatch.tagpass]
  # Matches the inputs tagged `metricPath = "metrics"` (cpu, disk, mem).
  metricPath = ["metrics"]
# Logs output: same region, profile and credentials file as the metrics
# output, with a 5s forced flush.
[[outputs.cloudwatchlogs]]
region = "ap-south-1"
profile = "www-data"
shared_credential_file = "/var/.aws/credentials"
force_flush_interval = "5s"
# NOTE(review): every file_config entry sets its own log_stream_name, so
# this value presumably acts only as a fallback - confirm.
log_stream_name = "production"
# `metricPath` is used for routing (see tagpass below) and is listed here
# for exclusion.
tagexclude = ["metricPath"]

  [outputs.cloudwatchlogs.tagpass]
  # Matches the logfile input, which is tagged `metricPath = "logs"`.
  metricPath = ["logs"]