1

I am trying to set up the Ganglia monitoring tool between two VMs. Initially after setting up the monitoring server and monitored VM, Ganglia could only detect the monitoring server on the web frontend. After rebooting the two machines it could detect nothing.

I followed a guide to set these host at http://www.unixmen.com/install-ganglia-monitoring-system-ubuntu-13-10-debian-7/ but this doesn't seemed to have worked.

Below is the /etc/ganglia/gmond.conf file for the monitoring server.

==============================================================

/* This configuration is as close to 2.5.x default behavior as possible 
   The values closely match ./gmond/metric.h definitions in 2.5.x */ 
globals {                    
  daemonize = yes              
  setuid = yes             
  user = ganglia              
  debug_level = 0               
  max_udp_msg_len = 1472        
  mute = no             
  deaf = no             
  host_dmax = 0 /*secs */ 
  cleanup_threshold = 300 /*secs */ 
  gexec = no             
  send_metadata_interval = 0     
} 

/* If a cluster attribute is specified, then all gmond hosts are wrapped inside 
 * of a <CLUSTER> tag.  If you do not specify a cluster tag, then all <HOSTS> will 
 * NOT be wrapped inside of a <CLUSTER> tag. */ 
cluster { 
  name = "my cluster" 
  owner = "unspecified" 
  latlong = "unspecified" 
  url = "unspecified" 
} 

/* The host section describes attributes of the host, like the location */ 
host { 
  location = "unspecified" 
} 

/* Feel free to specify as many udp_send_channels as you like.  Gmond 
   used to only support having a single channel */ 
udp_send_channel { 
  #mcast_join = 239.2.11.71
  host = 192.168.0.4 
  port = 8649 
  ttl = 1 
} 

/* You can specify as many udp_recv_channels as you like as well. */ 
udp_recv_channel { 
  mcast_join = 239.2.11.71 
  port = 8649 
  bind = 239.2.11.71 
} 

/* You can specify as many tcp_accept_channels as you like to share 
   an xml description of the state of the cluster */ 
tcp_accept_channel { 
  port = 8649 
} 

/* Each metrics module that is referenced by gmond must be specified and 
   loaded. If the module has been statically linked with gmond, it does not 
   require a load path. However all dynamically loadable modules must include 
   a load path. */ 
modules { 
  module { 
    name = "core_metrics" 
  } 
  module { 
    name = "cpu_module" 
    path = "/usr/lib/ganglia/modcpu.so" 
  } 
  module { 
    name = "disk_module" 
    path = "/usr/lib/ganglia/moddisk.so" 
  } 
  module { 
    name = "load_module" 
    path = "/usr/lib/ganglia/modload.so" 
  } 
  module { 
    name = "mem_module" 
    path = "/usr/lib/ganglia/modmem.so" 
  } 
  module { 
    name = "net_module" 
    path = "/usr/lib/ganglia/modnet.so" 
  } 
  module { 
    name = "proc_module" 
    path = "/usr/lib/ganglia/modproc.so" 
  } 
  module { 
    name = "sys_module" 
    path = "/usr/lib/ganglia/modsys.so" 
  } 
} 

include ('/etc/ganglia/conf.d/*.conf') 


/* The old internal 2.5.x metric array has been replaced by the following 
   collection_group directives.  What follows is the default behavior for 
   collecting and sending metrics that is as close to 2.5.x behavior as 
   possible. */

/* This collection group will cause a heartbeat (or beacon) to be sent every 
   20 seconds.  In the heartbeat is the GMOND_STARTED data which expresses 
   the age of the running gmond. */ 
collection_group { 
  collect_once = yes 
  time_threshold = 20 
  metric { 
    name = "heartbeat" 
  } 
} 

/* This collection group will send general info about this host every 1200 secs. 
   This information doesn't change between reboots and is only collected once. */ 
collection_group { 
  collect_once = yes 
  time_threshold = 1200 
  metric { 
    name = "cpu_num" 
    title = "CPU Count" 
  } 
  metric { 
    name = "cpu_speed" 
    title = "CPU Speed" 
  } 
  metric { 
    name = "mem_total" 
    title = "Memory Total" 
  } 
  /* Should this be here? Swap can be added/removed between reboots. */ 
  metric { 
    name = "swap_total" 
    title = "Swap Space Total" 
  } 
  metric { 
    name = "boottime" 
    title = "Last Boot Time" 
  } 
  metric { 
    name = "machine_type" 
    title = "Machine Type" 
  } 
  metric { 
    name = "os_name" 
    title = "Operating System" 
  } 
  metric { 
    name = "os_release" 
    title = "Operating System Release" 
  } 
  metric { 
    name = "location" 
    title = "Location" 
  } 
} 

/* This collection group will send the status of gexecd for this host every 300 secs */
/* Unlike 2.5.x the default behavior is to report gexecd OFF.  */ 
collection_group { 
  collect_once = yes 
  time_threshold = 300 
  metric { 
    name = "gexec" 
    title = "Gexec Status" 
  } 
} 

/* This collection group will collect the CPU status info every 20 secs. 
   The time threshold is set to 90 seconds.  In honesty, this time_threshold could be 
   set significantly higher to reduce unneccessary network chatter. */ 
collection_group { 
  collect_every = 20 
  time_threshold = 90 
  /* CPU status */ 
  metric { 
    name = "cpu_user"  
    value_threshold = "1.0" 
    title = "CPU User" 
  } 
  metric { 
    name = "cpu_system"   
    value_threshold = "1.0" 
    title = "CPU System" 
  } 
  metric { 
    name = "cpu_idle"  
    value_threshold = "5.0" 
    title = "CPU Idle" 
  } 
  metric { 
    name = "cpu_nice"  
    value_threshold = "1.0" 
    title = "CPU Nice" 
  } 
  metric { 
    name = "cpu_aidle" 
    value_threshold = "5.0" 
    title = "CPU aidle" 
  } 
  metric { 
    name = "cpu_wio" 
    value_threshold = "1.0" 
    title = "CPU wio" 
  } 
  /* The next two metrics are optional if you want more detail... 
     ... since they are accounted for in cpu_system.  
  metric { 
    name = "cpu_intr" 
    value_threshold = "1.0" 
    title = "CPU intr" 
  } 
  metric { 
    name = "cpu_sintr" 
    value_threshold = "1.0" 
    title = "CPU sintr" 
  } 
  */ 
} 

collection_group { 
  collect_every = 20 
  time_threshold = 90 
  /* Load Averages */ 
  metric { 
    name = "load_one" 
    value_threshold = "1.0" 
    title = "One Minute Load Average" 
  } 
  metric { 
    name = "load_five" 
    value_threshold = "1.0" 
    title = "Five Minute Load Average" 
  } 
  metric { 
    name = "load_fifteen" 
    value_threshold = "1.0" 
    title = "Fifteen Minute Load Average" 
  }
} 

/* This group collects the number of running and total processes */ 
collection_group { 
  collect_every = 80 
  time_threshold = 950 
  metric { 
    name = "proc_run" 
    value_threshold = "1.0" 
    title = "Total Running Processes" 
  } 
  metric { 
    name = "proc_total" 
    value_threshold = "1.0" 
    title = "Total Processes" 
  } 
}

/* This collection group grabs the volatile memory metrics every 40 secs and 
   sends them at least every 180 secs.  This time_threshold can be increased 
   significantly to reduce unneeded network traffic. */ 
collection_group { 
  collect_every = 40 
  time_threshold = 180 
  metric { 
    name = "mem_free" 
    value_threshold = "1024.0" 
    title = "Free Memory" 
  } 
  metric { 
    name = "mem_shared" 
    value_threshold = "1024.0" 
    title = "Shared Memory" 
  } 
  metric { 
    name = "mem_buffers" 
    value_threshold = "1024.0" 
    title = "Memory Buffers" 
  } 
  metric { 
    name = "mem_cached" 
    value_threshold = "1024.0" 
    title = "Cached Memory" 
  } 
  metric { 
    name = "swap_free" 
    value_threshold = "1024.0" 
    title = "Free Swap Space" 
  } 
} 

collection_group { 
  collect_every = 40 
  time_threshold = 300 
  metric { 
    name = "bytes_out" 
    value_threshold = 4096 
    title = "Bytes Sent" 
  } 
  metric { 
    name = "bytes_in" 
    value_threshold = 4096 
    title = "Bytes Received" 
  } 
  metric { 
    name = "pkts_in" 
    value_threshold = 256 
    title = "Packets Received" 
  } 
  metric { 
    name = "pkts_out" 
    value_threshold = 256 
    title = "Packets Sent" 
  } 
}

/* Different than 2.5.x default since the old config made no sense */ 
collection_group { 
  collect_every = 1800 
  time_threshold = 3600 
  metric { 
    name = "disk_total" 
    value_threshold = 1.0 
    title = "Total Disk Space" 
  } 
}

collection_group { 
  collect_every = 40 
  time_threshold = 180 
  metric { 
    name = "disk_free" 
    value_threshold = 1.0 
    title = "Disk Space Available" 
  } 
  metric { 
    name = "part_max_used" 
    value_threshold = 1.0 
    title = "Maximum Disk Space Used" 
  } 
}

=====================================================================

Below is the /etc/ganglia/gmetad.conf file for the monitoring server.

=====================================================================

# This is an example of a Ganglia Meta Daemon configuration file
#
#
#-------------------------------------------------------------------------------
# Setting the debug_level to 1 will keep daemon in the forground and
# show only error messages. Setting this value higher than 1 will make 
# gmetad output debugging information and stay in the foreground.
# default: 0
# debug_level 10
#
#-------------------------------------------------------------------------------
# What to monitor. The most important section of this file. 
#
# The data_source tag specifies either a cluster or a grid to
# monitor. If we detect the source is a cluster, we will maintain a complete
# set of RRD databases for it, which can be used to create historical 
# graphs of the metrics. If the source is a grid (it comes from another gmetad),
# we will only maintain summary RRDs for it.
#
# Format: 
# data_source "my cluster" [polling interval] address1:port addreses2:port ...
# 
# The keyword 'data_source' must immediately be followed by a unique
# string which identifies the source, then an optional polling interval in 
# seconds. The source will be polled at this interval on average. 
# If the polling interval is omitted, 15sec is asssumed. 
#
# If you choose to set the polling interval to something other than the default,
# note that the web frontend determines a host as down if its TN value is less
# than 4 * TMAX (20sec by default).  Therefore, if you set the polling interval
# to something around or greater than 80sec, this will cause the frontend to
# incorrectly display hosts as down even though they are not.
#
# A list of machines which service the data source follows, in the 
# format ip:port, or name:port. If a port is not specified then 8649
# (the default gmond port) is assumed.
# default: There is no default value
#
# data_source "my cluster" 10 localhost  my.machine.edu:8649  1.2.3.5:8655
# data_source "my grid" 50 1.3.4.7:8655 grid.org:8651 grid-backup.org:8651
# data_source "another source" 1.3.4.7:8655  1.3.4.8

data_source "my cluster" 50 localhost 192.168.0.4:8649

#
# Round-Robin Archives
# You can specify custom Round-Robin archives here (defaults are listed below)
#
# Old Default RRA: Keep 1 hour of metrics at 15 second resolution. 1 day at 6 minute
# RRAs "RRA:AVERAGE:0.5:1:244" "RRA:AVERAGE:0.5:24:244" "RRA:AVERAGE:0.5:168:244" "RRA:AVERAGE:0.5:672:244" \
#      "RRA:AVERAGE:0.5:5760:374"
# New Default RRA
# Keep 5856 data points at 15 second resolution assuming 15 second (default) polling. That's 1 day
# Two weeks of data points at 1 minute resolution (average)
#RRAs "RRA:AVERAGE:0.5:1:5856" "RRA:AVERAGE:0.5:4:20160" "RRA:AVERAGE:0.5:40:52704"

#
#-------------------------------------------------------------------------------
# Scalability mode. If on, we summarize over downstream grids, and respect
# authority tags. If off, we take on 2.5.0-era behavior: we do not wrap our output
# in <GRID></GRID> tags, we ignore all <GRID> tags we see, and always assume
# we are the "authority" on data source feeds. This approach does not scale to
# large groups of clusters, but is provided for backwards compatibility.
# default: on
# scalable off
#
#-------------------------------------------------------------------------------
# The name of this Grid. All the data sources above will be wrapped in a GRID
# tag with this name.
# default: unspecified
# gridname "MyGrid"
#
#-------------------------------------------------------------------------------
# The authority URL for this grid. Used by other gmetads to locate graphs
# for our data sources. Generally points to a ganglia/
# website on this machine.
# default: "http://hostname/ganglia/",
#   where hostname is the name of this machine, as defined by gethostname().
# authority "http://mycluster.org/newprefix/"
#
#-------------------------------------------------------------------------------
# List of machines this gmetad will share XML with. Localhost
# is always trusted. 
# default: There is no default value
# trusted_hosts 127.0.0.1 169.229.50.165 my.gmetad.org
#
#-------------------------------------------------------------------------------
# If you want any host which connects to the gmetad XML to receive
# data, then set this value to "on"
# default: off
# all_trusted on
#
#-------------------------------------------------------------------------------
# If you don't want gmetad to setuid then set this to off
# default: on
# setuid off
#
#-------------------------------------------------------------------------------
# User gmetad will setuid to (defaults to "nobody")
# default: "nobody"
# setuid_username "nobody"
#
#-------------------------------------------------------------------------------
# Umask to apply to created rrd files and grid directory structure
# default: 0 (files are public)
# umask 022
#
#-------------------------------------------------------------------------------
# The port gmetad will answer requests for XML
# default: 8651
# xml_port 8651
#
#-------------------------------------------------------------------------------
# The port gmetad will answer queries for XML. This facility allows
# simple subtree and summation views of the XML tree.
# default: 8652
# interactive_port 8652
#
#-------------------------------------------------------------------------------
# The number of threads answering XML requests
# default: 4
# server_threads 10
#
#-------------------------------------------------------------------------------
# Where gmetad stores its round-robin databases
# default: "/var/lib/ganglia/rrds"
# rrd_rootdir "/some/other/place"
#
#-------------------------------------------------------------------------------
# In earlier versions of gmetad, hostnames were handled in a case
# sensitive manner
# If your hostname directories have been renamed to lower case,
# set this option to 0 to disable backward compatibility.
# From version 3.2, backwards compatibility will be disabled by default.
# default: 1   (for gmetad < 3.2)
# default: 0   (for gmetad >= 3.2)
case_sensitive_hostnames 0

#-------------------------------------------------------------------------------
# It is now possible to export all the metrics collected by gmetad directly to
# graphite by setting the following attributes. 
#
# The hostname or IP address of the Graphite server
# default: unspecified
# carbon_server "my.graphite.box"
#
# The port on which Graphite is listening
# default: 2003
# carbon_port 2003
#
# A prefix to prepend to the metric names exported by gmetad. Graphite uses dot-
# separated paths to organize and refer to metrics. 
# default: unspecified
# graphite_prefix "datacenter1.gmetad"
#
# Number of milliseconds gmetad will wait for a response from the graphite server 
# default: 500
# carbon_timeout 500
#

====================================================================

Below is the /etc/ganglia/gmond.conf file for the monitored machine.

====================================================================

/* This configuration is as close to 2.5.x default behavior as possible 
   The values closely match ./gmond/metric.h definitions in 2.5.x */ 
globals {                    
  daemonize = yes              
  setuid = yes             
  user = ganglia              
  debug_level = 0               
  max_udp_msg_len = 1472        
  mute = no             
  deaf = no             
  host_dmax = 0 /*secs */ 
  cleanup_threshold = 300 /*secs */ 
  gexec = no             
  send_metadata_interval = 0     
} 

/* If a cluster attribute is specified, then all gmond hosts are wrapped inside 
 * of a <CLUSTER> tag.  If you do not specify a cluster tag, then all <HOSTS> will 
 * NOT be wrapped inside of a <CLUSTER> tag. */ 
cluster { 
  name = "my cluster" 
  owner = "unspecified" 
  latlong = "unspecified" 
  url = "unspecified" 
} 

/* The host section describes attributes of the host, like the location */ 
host { 
  location = "unspecified" 
} 

/* Feel free to specify as many udp_send_channels as you like.  Gmond 
   used to only support having a single channel */ 
udp_send_channel { 
  #mcast_join = 239.2.11.71
  host = 192.168.0.4 
  port = 8649 
  ttl = 1 
} 

/* You can specify as many udp_recv_channels as you like as well.
udp_recv_channel { 
  mcast_join = 239.2.11.71 
  port = 8649 
  bind = 239.2.11.71 
} 
*/

/* You can specify as many tcp_accept_channels as you like to share 
   an xml description of the state of the cluster */ 
tcp_accept_channel { 
  port = 8649 
} 

/* Each metrics module that is referenced by gmond must be specified and 
   loaded. If the module has been statically linked with gmond, it does not 
   require a load path. However all dynamically loadable modules must include 
   a load path. */ 
modules { 
  module { 
    name = "core_metrics" 
  } 
  module { 
    name = "cpu_module" 
    path = "/usr/lib/ganglia/modcpu.so" 
  } 
  module { 
    name = "disk_module" 
    path = "/usr/lib/ganglia/moddisk.so" 
  } 
  module { 
    name = "load_module" 
    path = "/usr/lib/ganglia/modload.so" 
  } 
  module { 
    name = "mem_module" 
    path = "/usr/lib/ganglia/modmem.so" 
  } 
  module { 
    name = "net_module" 
    path = "/usr/lib/ganglia/modnet.so" 
  } 
  module { 
    name = "proc_module" 
    path = "/usr/lib/ganglia/modproc.so" 
  } 
  module { 
    name = "sys_module" 
    path = "/usr/lib/ganglia/modsys.so" 
  } 
} 

include ('/etc/ganglia/conf.d/*.conf') 


/* The old internal 2.5.x metric array has been replaced by the following 
   collection_group directives.  What follows is the default behavior for 
   collecting and sending metrics that is as close to 2.5.x behavior as 
   possible. */

/* This collection group will cause a heartbeat (or beacon) to be sent every 
   20 seconds.  In the heartbeat is the GMOND_STARTED data which expresses 
   the age of the running gmond. */ 
collection_group { 
  collect_once = yes 
  time_threshold = 20 
  metric { 
    name = "heartbeat" 
  } 
} 

/* This collection group will send general info about this host every 1200 secs. 
   This information doesn't change between reboots and is only collected once. */ 
collection_group { 
  collect_once = yes 
  time_threshold = 1200 
  metric { 
    name = "cpu_num" 
    title = "CPU Count" 
  } 
  metric { 
    name = "cpu_speed" 
    title = "CPU Speed" 
  } 
  metric { 
    name = "mem_total" 
    title = "Memory Total" 
  } 
  /* Should this be here? Swap can be added/removed between reboots. */ 
  metric { 
    name = "swap_total" 
    title = "Swap Space Total" 
  } 
  metric { 
    name = "boottime" 
    title = "Last Boot Time" 
  } 
  metric { 
    name = "machine_type" 
    title = "Machine Type" 
  } 
  metric { 
    name = "os_name" 
    title = "Operating System" 
  } 
  metric { 
    name = "os_release" 
    title = "Operating System Release" 
  } 
  metric { 
    name = "location" 
    title = "Location" 
  } 
} 

/* This collection group will send the status of gexecd for this host every 300 secs */
/* Unlike 2.5.x the default behavior is to report gexecd OFF.  */ 
collection_group { 
  collect_once = yes 
  time_threshold = 300 
  metric { 
    name = "gexec" 
    title = "Gexec Status" 
  } 
} 

/* This collection group will collect the CPU status info every 20 secs. 
   The time threshold is set to 90 seconds.  In honesty, this time_threshold could be 
   set significantly higher to reduce unneccessary network chatter. */ 
collection_group { 
  collect_every = 20 
  time_threshold = 90 
  /* CPU status */ 
  metric { 
    name = "cpu_user"  
    value_threshold = "1.0" 
    title = "CPU User" 
  } 
  metric { 
    name = "cpu_system"   
    value_threshold = "1.0" 
    title = "CPU System" 
  } 
  metric { 
    name = "cpu_idle"  
    value_threshold = "5.0" 
    title = "CPU Idle" 
  } 
  metric { 
    name = "cpu_nice"  
    value_threshold = "1.0" 
    title = "CPU Nice" 
  } 
  metric { 
    name = "cpu_aidle" 
    value_threshold = "5.0" 
    title = "CPU aidle" 
  } 
  metric { 
    name = "cpu_wio" 
    value_threshold = "1.0" 
    title = "CPU wio" 
  } 
  /* The next two metrics are optional if you want more detail... 
     ... since they are accounted for in cpu_system.  
  metric { 
    name = "cpu_intr" 
    value_threshold = "1.0" 
    title = "CPU intr" 
  } 
  metric { 
    name = "cpu_sintr" 
    value_threshold = "1.0" 
    title = "CPU sintr" 
  } 
  */ 
} 

collection_group { 
  collect_every = 20 
  time_threshold = 90 
  /* Load Averages */ 
  metric { 
    name = "load_one" 
    value_threshold = "1.0" 
    title = "One Minute Load Average" 
  } 
  metric { 
    name = "load_five" 
    value_threshold = "1.0" 
    title = "Five Minute Load Average" 
  } 
  metric { 
    name = "load_fifteen" 
    value_threshold = "1.0" 
    title = "Fifteen Minute Load Average" 
  }
} 

/* This group collects the number of running and total processes */ 
collection_group { 
  collect_every = 80 
  time_threshold = 950 
  metric { 
    name = "proc_run" 
    value_threshold = "1.0" 
    title = "Total Running Processes" 
  } 
  metric { 
    name = "proc_total" 
    value_threshold = "1.0" 
    title = "Total Processes" 
  } 
}

/* This collection group grabs the volatile memory metrics every 40 secs and 
   sends them at least every 180 secs.  This time_threshold can be increased 
   significantly to reduce unneeded network traffic. */ 
collection_group { 
  collect_every = 40 
  time_threshold = 180 
  metric { 
    name = "mem_free" 
    value_threshold = "1024.0" 
    title = "Free Memory" 
  } 
  metric { 
    name = "mem_shared" 
    value_threshold = "1024.0" 
    title = "Shared Memory" 
  } 
  metric { 
    name = "mem_buffers" 
    value_threshold = "1024.0" 
    title = "Memory Buffers" 
  } 
  metric { 
    name = "mem_cached" 
    value_threshold = "1024.0" 
    title = "Cached Memory" 
  } 
  metric { 
    name = "swap_free" 
    value_threshold = "1024.0" 
    title = "Free Swap Space" 
  } 
} 

collection_group { 
  collect_every = 40 
  time_threshold = 300 
  metric { 
    name = "bytes_out" 
    value_threshold = 4096 
    title = "Bytes Sent" 
  } 
  metric { 
    name = "bytes_in" 
    value_threshold = 4096 
    title = "Bytes Received" 
  } 
  metric { 
    name = "pkts_in" 
    value_threshold = 256 
    title = "Packets Received" 
  } 
  metric { 
    name = "pkts_out" 
    value_threshold = 256 
    title = "Packets Sent" 
  } 
}

/* Different than 2.5.x default since the old config made no sense */ 
collection_group { 
  collect_every = 1800 
  time_threshold = 3600 
  metric { 
    name = "disk_total" 
    value_threshold = 1.0 
    title = "Total Disk Space" 
  } 
}

collection_group { 
  collect_every = 40 
  time_threshold = 180 
  metric { 
    name = "disk_free" 
    value_threshold = 1.0 
    title = "Disk Space Available" 
  } 
  metric { 
    name = "part_max_used" 
    value_threshold = 1.0 
    title = "Maximum Disk Space Used" 
  } 

}

Can anybody tell by looking at these config files where why the monitoring host is not detecting and nodes?

gimac28
  • 11
  • 1
  • 3

1 Answers1

3

In the unicast mode, you need to specify an interval to send the data:

send_metadata_interval = 15 // or something.

To solve the issue of individual node graphs not displaying if you cluster level stats try this. My gmetad was running on Ubuntu 14.04 and suffered from a backward compatibility problem. The fix for me was to:

  1. edit /usr/share/ganglia-webfrontend/conf_defaults.php and amend setting

$conf['case_sensitive_hostnames'] = false;

  1. cleared out the rrf files in /var/lib/ganglia/rrds (it may be possible to rename to lower case individual hosts names but my deployment was new so i just removed them)

  2. Restart gmetad

Gordon
  • 73
  • 7