ES Version: 5.4.1, OS: Ubuntu, Java: 1.8
Issue
We recently upgraded our systems from ES 2.3 to ES 5.4.1, and everything was going well until we deployed it to production. It started with the ES node randomly going down for some time. On investigation, we saw the following in the ES logs:
```
[2017-12-15T20:33:28,459][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][1723] overhead, spent [252ms] collecting in the last [1s]
[2017-12-15T20:42:53,599][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][2288] overhead, spent [288ms] collecting in the last [1s]
[2017-12-15T20:53:55,846][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][2950] overhead, spent [313ms] collecting in the last [1s]
[2017-12-15T21:03:18,145][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][3512] overhead, spent [325ms] collecting in the last [1s]
[2017-12-15T21:45:45,815][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][6059] overhead, spent [340ms] collecting in the last [1s]
[2017-12-15T21:46:39,834][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][6113] overhead, spent [445ms] collecting in the last [1s]
[2017-12-15T21:46:59,895][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][6133] overhead, spent [281ms] collecting in the last [1s]
[2017-12-15T21:47:27,901][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][6161] overhead, spent [306ms] collecting in the last [1s]
[2017-12-15T21:51:56,093][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][6429] overhead, spent [297ms] collecting in the last [1.1s]
[2017-12-15T21:51:57,094][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][6430] overhead, spent [353ms] collecting in the last [1s]
[2017-12-15T21:57:56,361][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][6789] overhead, spent [398ms] collecting in the last [1.1s]
[2017-12-15T21:58:23,407][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][6816] overhead, spent [284ms] collecting in the last [1s]
[2017-12-15T21:58:24,444][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][6817] overhead, spent [341ms] collecting in the last [1s]
[2017-12-15T22:01:29,536][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][7002] overhead, spent [341ms] collecting in the last [1s]
[2017-12-15T22:01:31,663][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][7004] overhead, spent [399ms] collecting in the last [1.1s]
[2017-12-15T22:01:35,664][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][7008] overhead, spent [273ms] collecting in the last [1s]
[2017-12-15T22:03:08,779][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][7101] overhead, spent [281ms] collecting in the last [1s]
[2017-12-15T22:05:54,928][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][7267] overhead, spent [310ms] collecting in the last [1.1s]
[2017-12-15T22:05:55,928][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][7268] overhead, spent [305ms] collecting in the last [1s]
[2017-12-15T22:05:58,929][INFO ][o.e.m.j.JvmGcMonitorService] [vzvV7bX] [gc][7271] overhead, spent [287ms] collecting in the last [1s]
```
Earlier, ES 2.3 used to work with an 8 GB heap on a system with 32 GB of RAM. We increased the heap from 8 GB to 12 GB, and then increased the system RAM to 64 GB and gave Elasticsearch a 32 GB heap, but the ES nodes keep going down whenever there is any heavy query. Please note that everything was working fine before the upgrade.
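For reference, this is roughly how the heap was pinned while trying the different sizes mentioned above (a minimal sketch; in 5.x the same -Xms/-Xmx flags normally live in config/jvm.options, and the exact value varied between 8, 12 and 32 GB as described):
```
# Sketch only: override the heap size for one run via ES_JAVA_OPTS
# (the permanent place for these flags in 5.x is config/jvm.options).
# Keeping -Xms and -Xmx equal avoids the heap being resized at runtime.
export ES_JAVA_OPTS="-Xms12g -Xmx12g"
./bin/elasticsearch
```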
With the 12 GB heap, the JVM stats were as follows:
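(The jvm section below was pulled from the node stats API; a call like the following, the standard 5.x endpoint filtered to the jvm metric, returns it.)
```
# Node stats restricted to the JVM metric (standard _nodes/stats endpoint in 5.x)
curl -s 'localhost:9200/_nodes/stats/jvm?pretty'
```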
"jvm" : {
"timestamp" : 1513422077508,
"uptime_in_millis" : 16750330,
"mem" : {
"heap_used_in_bytes" : 9402202880,
"heap_used_percent" : 73,
"heap_committed_in_bytes" : 12815171584,
"heap_max_in_bytes" : 12815171584,
"non_heap_used_in_bytes" : 97693480,
"non_heap_committed_in_bytes" : 103424000,
"pools" : {
"young" : {
"used_in_bytes" : 441150480,
"max_in_bytes" : 558432256,
"peak_used_in_bytes" : 558432256,
"peak_max_in_bytes" : 558432256
},
"survivor" : {
"used_in_bytes" : 42550952,
"max_in_bytes" : 69730304,
"peak_used_in_bytes" : 69730304,
"peak_max_in_bytes" : 69730304
},
"old" : {
"used_in_bytes" : 8918501448,
"max_in_bytes" : 12187009024,
"peak_used_in_bytes" : 8918501448,
"peak_max_in_bytes" : 12187009024
}
}
},
"threads" : {
"count" : 71,
"peak_count" : 95
},
"gc" : {
"collectors" : {
"young" : {
"collection_count" : 104,
"collection_time_in_millis" : 7078
},
"old" : {
"collection_count" : 1,
"collection_time_in_millis" : 170
}
}
},
"buffer_pools" : {
"direct" : {
"count" : 67,
"used_in_bytes" : 847203522,
"total_capacity_in_bytes" : 847203521
},
"mapped" : {
"count" : 1204,
"used_in_bytes" : 16787345787,
"total_capacity_in_bytes" : 16787345787
}
},
"classes" : {
"current_loaded_count" : 11025,
"total_loaded_count" : 11025,
"total_unloaded_count" : 0
}
}
}
```
But so far we have not been able to find any concrete solution to the problem.
We created a ticket on GitHub, but it was closed, so we created this topic on the forum.
We then compared the ES queries that were fired against ES 2.3 and ES 5.4.
For ES 2.3 the query was like this:
```
{
"from" : 0,
"size" : 200,
"query" : {
"bool" : {
"must" : {
"bool" : {
"must" : [
{
"bool" : {
"must_not" : {
"bool" : {
"should" : {
"terms" : {
"room_id" : [
-- terms up to 30000 --
]
}
}
}
}
}
}
]
}
},
"filter" : {
-- basic filter --
}
}
},
"fields" : [
-- columns --
],
"sort" : [
-- columns --
]
}
```
and the query for 5.4 is:
```
{
"from" : 0,
"size" : 20,
"query" : {
"bool" : {
"must" : [
{
"bool" : {
"must_not" : [
{
"bool" : {
"should" : [
{
"terms" : {
"room_ids" : [
-- terms can be up to 30000 --
],
"boost" : 1.0
}
}
],
"disable_coord" : false,
"adjust_pure_negative" : true,
"boost" : 1.0
}
}
],
"disable_coord" : false,
"adjust_pure_negative" : true,
"boost" : 1.0
}
}
],
"filter" : [
-- some basic filters --
],
"disable_coord" : false,
"adjust_pure_negative" : true,
"boost" : 1.0
}
},
"_source" : {
"includes" : [
--columns--
],
"excludes" : [ ]
},
"sort" : [
--columns--
]
}
```
The main difference is that in 2.3 we are getting the data from fields, while in 5.4 we are getting it from _source. Everything else is almost the same, yet with 2.3 everything seems to work fine.
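For context: the 2.x fields parameter was renamed to stored_fields in 5.x and only returns fields that are actually stored, whereas _source filtering extracts the values from the stored _source document. A minimal sketch of the two styles (index and field names here are illustrative only, not our real mapping):
```
# Illustrative index/field names only.
# 5.x equivalent of the old "fields" parameter: stored fields
# (requires the field to be mapped with "store": true).
curl -s -H 'Content-Type: application/json' 'localhost:9200/rooms/_search?pretty' -d '{
  "query": { "match_all": {} },
  "stored_fields": [ "room_id" ]
}'

# What our 5.4 query does instead: source filtering on _source.
curl -s -H 'Content-Type: application/json' 'localhost:9200/rooms/_search?pretty' -d '{
  "query": { "match_all": {} },
  "_source": { "includes": [ "room_id" ] }
}'
```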
Someone suggested that the heap size was too large here, so I set the Java Xms to 8 GB, but the nodes are still going down. The node stats at that point look like this:
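(For reference, the dump below was captured with the unfiltered node stats endpoint, i.e. the same API as above without the jvm filter:)
```
# Full node stats for every node on the cluster
curl -s 'localhost:9200/_nodes/stats?pretty'
```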
```
{
"_nodes" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"cluster_name" : "elasticsearch_cc",
"nodes" : {
"vzvV7bXVQd-9aVzF_CrcHA" : {
"timestamp" : 1513839849278,
"name" : "vzvV7bX",
"transport_address" : "127.0.0.1:9300",
"host" : "127.0.0.1",
"ip" : "127.0.0.1:9300",
"roles" : [
"master",
"data",
"ingest"
],
"indices" : {
"docs" : {
"count" : 10490979,
"deleted" : 759806
},
"store" : {
"size_in_bytes" : 12759778301,
"throttle_time_in_millis" : 0
},
"indexing" : {
"index_total" : 86,
"index_time_in_millis" : 738,
"index_current" : 0,
"index_failed" : 0,
"delete_total" : 0,
"delete_time_in_millis" : 0,
"delete_current" : 0,
"noop_update_total" : 40,
"is_throttled" : false,
"throttle_time_in_millis" : 0
},
"get" : {
"total" : 75,
"time_in_millis" : 27,
"exists_total" : 75,
"exists_time_in_millis" : 27,
"missing_total" : 0,
"missing_time_in_millis" : 0,
"current" : 0
},
"search" : {
"open_contexts" : 0,
"query_total" : 5292,
"query_time_in_millis" : 31440,
"query_current" : 0,
"fetch_total" : 3325,
"fetch_time_in_millis" : 117815,
"fetch_current" : 0,
"scroll_total" : 0,
"scroll_time_in_millis" : 0,
"scroll_current" : 0,
"suggest_total" : 0,
"suggest_time_in_millis" : 0,
"suggest_current" : 0
},
"merges" : {
"current" : 0,
"current_docs" : 0,
"current_size_in_bytes" : 0,
"total" : 6,
"total_time_in_millis" : 2203,
"total_docs" : 2303,
"total_size_in_bytes" : 8707990,
"total_stopped_time_in_millis" : 0,
"total_throttled_time_in_millis" : 0,
"total_auto_throttle_in_bytes" : 671088640
},
"refresh" : {
"total" : 154,
"total_time_in_millis" : 1763,
"listeners" : 0
},
"flush" : {
"total" : 57,
"total_time_in_millis" : 830
},
"warmer" : {
"current" : 0,
"total" : 218,
"total_time_in_millis" : 413
},
"query_cache" : {
"memory_size_in_bytes" : 1807608,
"total_count" : 22863,
"hit_count" : 2962,
"miss_count" : 19901,
"cache_size" : 836,
"cache_count" : 836,
"evictions" : 0
},
"fielddata" : {
"memory_size_in_bytes" : 2112304,
"evictions" : 0
},
"completion" : {
"size_in_bytes" : 0
},
"segments" : {
"count" : 490,
"memory_in_bytes" : 54231303,
"terms_memory_in_bytes" : 40778050,
"stored_fields_memory_in_bytes" : 3325704,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 637056,
"points_memory_in_bytes" : 580557,
"doc_values_memory_in_bytes" : 8909936,
"index_writer_memory_in_bytes" : 0,
"version_map_memory_in_bytes" : 0,
"fixed_bit_set_memory_in_bytes" : 1199712,
"max_unsafe_auto_id_timestamp" : -1,
"file_sizes" : { }
},
"translog" : {
"operations" : 3,
"size_in_bytes" : 18452
},
"request_cache" : {
"memory_size_in_bytes" : 66679,
"evictions" : 0,
"hit_count" : 84,
"miss_count" : 130
},
"recovery" : {
"current_as_source" : 0,
"current_as_target" : 0,
"throttle_time_in_millis" : 0
}
},
"os" : {
"timestamp" : 1513839849384,
"cpu" : {
"percent" : 3,
"load_average" : {
"1m" : 0.05,
"5m" : 0.38,
"15m" : 1.06
}
},
"mem" : {
"total_in_bytes" : 33738969088,
"free_in_bytes" : 15510724608,
"used_in_bytes" : 18228244480,
"free_percent" : 46,
"used_percent" : 54
},
"swap" : {
"total_in_bytes" : 0,
"free_in_bytes" : 0,
"used_in_bytes" : 0
}
},
"process" : {
"timestamp" : 1513839849384,
"open_file_descriptors" : 344,
"max_file_descriptors" : 65536,
"cpu" : {
"percent" : 0,
"total_in_millis" : 329250
},
"mem" : {
"total_virtual_in_bytes" : 28131942400
}
},
"jvm" : {
"timestamp" : 1513839849384,
"uptime_in_millis" : 12440740,
"mem" : {
"heap_used_in_bytes" : 2964483576,
"heap_used_percent" : 34,
"heap_committed_in_bytes" : 6240751616,
"heap_max_in_bytes" : 8520204288,
"non_heap_used_in_bytes" : 102879720,
"non_heap_committed_in_bytes" : 108969984,
"pools" : {
"young" : {
"used_in_bytes" : 451090984,
"max_in_bytes" : 558432256,
"peak_used_in_bytes" : 558432256,
"peak_max_in_bytes" : 558432256
},
"survivor" : {
"used_in_bytes" : 16900480,
"max_in_bytes" : 69730304,
"peak_used_in_bytes" : 69730304,
"peak_max_in_bytes" : 69730304
},
"old" : {
"used_in_bytes" : 2496492112,
"max_in_bytes" : 7892041728,
"peak_used_in_bytes" : 5608349440,
"peak_max_in_bytes" : 7892041728
}
}
},
"threads" : {
"count" : 88,
"peak_count" : 99
},
"gc" : {
"collectors" : {
"young" : {
"collection_count" : 86,
"collection_time_in_millis" : 14592
},
"old" : {
"collection_count" : 11,
"collection_time_in_millis" : 780
}
}
},
"buffer_pools" : {
"direct" : {
"count" : 57,
"used_in_bytes" : 795161001,
"total_capacity_in_bytes" : 795161000
},
"mapped" : {
"count" : 1080,
"used_in_bytes" : 12730171248,
"total_capacity_in_bytes" : 12730171248
}
},
"classes" : {
"current_loaded_count" : 11264,
"total_loaded_count" : 11388,
"total_unloaded_count" : 124
}
},
"thread_pool" : {
"bulk" : {
"threads" : 5,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 5,
"completed" : 84
},
"fetch_shard_started" : {
"threads" : 1,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 16,
"completed" : 35
},
"fetch_shard_store" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"flush" : {
"threads" : 1,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 4,
"completed" : 114
},
"force_merge" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"generic" : {
"threads" : 4,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 4,
"completed" : 1286
},
"get" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"index" : {
"threads" : 8,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 8,
"completed" : 75
},
"listener" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"management" : {
"threads" : 4,
"queue" : 0,
"active" : 1,
"rejected" : 0,
"largest" : 4,
"completed" : 1653
},
"refresh" : {
"threads" : 4,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 4,
"completed" : 86950
},
"search" : {
"threads" : 13,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 13,
"completed" : 13533
},
"snapshot" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"warmer" : {
"threads" : 4,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 4,
"completed" : 1362
}
},
"fs" : {
"timestamp" : 1513839849386,
"total" : {
"total_in_bytes" : 42141548544,
"free_in_bytes" : 21673201664,
"available_in_bytes" : 19508940800
},
"data" : [
{
"path" : "/data/elasticsearch_cc/nodes/0",
"mount" : "/data (/dev/xvdf)",
"type" : "ext4",
"total_in_bytes" : 42141548544,
"free_in_bytes" : 21673201664,
"available_in_bytes" : 19508940800,
"spins" : "false"
}
],
"io_stats" : {
"devices" : [
{
"device_name" : "xvdf",
"operations" : 32559,
"read_operations" : 29633,
"write_operations" : 2926,
"read_kilobytes" : 1682476,
"write_kilobytes" : 51428
}
],
"total" : {
"operations" : 32559,
"read_operations" : 29633,
"write_operations" : 2926,
"read_kilobytes" : 1682476,
"write_kilobytes" : 51428
}
}
},
"transport" : {
"server_open" : 26,
"rx_count" : 10895,
"rx_size_in_bytes" : 1251104,
"tx_count" : 10893,
"tx_size_in_bytes" : 12114262496
},
"http" : {
"current_open" : 1,
"total_opened" : 1
},
"breakers" : {
"request" : {
"limit_size_in_bytes" : 5112122572,
"limit_size" : "4.7gb",
"estimated_size_in_bytes" : 0,
"estimated_size" : "0b",
"overhead" : 1.0,
"tripped" : 0
},
"fielddata" : {
"limit_size_in_bytes" : 5112122572,
"limit_size" : "4.7gb",
"estimated_size_in_bytes" : 2112304,
"estimated_size" : "2mb",
"overhead" : 1.03,
"tripped" : 0
},
"in_flight_requests" : {
"limit_size_in_bytes" : 8520204288,
"limit_size" : "7.9gb",
"estimated_size_in_bytes" : 3206,
"estimated_size" : "3.1kb",
"overhead" : 1.0,
"tripped" : 0
},
"parent" : {
"limit_size_in_bytes" : 5964143001,
"limit_size" : "5.5gb",
"estimated_size_in_bytes" : 2115510,
"estimated_size" : "2mb",
"overhead" : 1.0,
"tripped" : 0
}
},
"script" : {
"compilations" : 0,
"cache_evictions" : 0
},
"discovery" : {
"cluster_state_queue" : {
"total" : 0,
"pending" : 0,
"committed" : 0
}
},
"ingest" : {
"total" : {
"count" : 0,
"time_in_millis" : 0,
"current" : 0,
"failed" : 0
},
"pipelines" : { }
}
}
}
}
```