Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Return 0 for negative "free" and "total" memory reported by the OS #42725

Merged
merged 8 commits into from
Jun 19, 2019
43 changes: 37 additions & 6 deletions server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,23 @@
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
* The {@link OsProbe} class retrieves information about the physical and swap size of the machine
* memory, as well as the system load average and cpu load.
*
* In some exceptional cases, it's possible the underlying native method used by
* {@link #getFreePhysicalMemorySize()} and {@link #getTotalPhysicalMemorySize()} can return a
* negative value. Because of this, we prevent those methods from returning negative values,
* returning 0 instead.
*
* The OS can report a negative number in a number of cases:
* - Non-supported OSes (HP-UX, AIX, OSX after failing to initialize host statistics
dakrone marked this conversation as resolved.
Show resolved Hide resolved
* - An OS that does not support the {@code _SC_PHYS_PAGES} or {@code _SC_PAGE_SIZE} flags for the {@code sysconf()} linux kernel call
* - An overflow of the product of {@code _SC_PHYS_PAGES} and {@code _SC_PAGE_SIZE}
* - An error case retrieving these values from a linux kernel
* - A non-standard libc implementation not implementing the required values
* For a more exhaustive explanation, see https://github.com/elastic/elasticsearch/pull/42725
*/
public class OsProbe {

private static final OperatingSystemMXBean osMxBean = ManagementFactory.getOperatingSystemMXBean();
Expand All @@ -67,12 +84,19 @@ public class OsProbe {
*/
public long getFreePhysicalMemorySize() {
if (getFreePhysicalMemorySize == null) {
return -1;
logger.warn("getFreePhysicalMemorySize is not available");
return 0;
}
try {
return (long) getFreePhysicalMemorySize.invoke(osMxBean);
final long freeMem = (long) getFreePhysicalMemorySize.invoke(osMxBean);
if (freeMem < 0) {
logger.warn("OS reported a negative free memory value [{}]", freeMem);
return 0;
}
return freeMem;
} catch (Exception e) {
return -1;
logger.warn("exception retrieving free physical memory", e);
return 0;
}
}

Expand All @@ -81,12 +105,19 @@ public long getFreePhysicalMemorySize() {
*/
public long getTotalPhysicalMemorySize() {
if (getTotalPhysicalMemorySize == null) {
return -1;
logger.warn("getTotalPhysicalMemorySize is not available");
return 0;
}
try {
return (long) getTotalPhysicalMemorySize.invoke(osMxBean);
final long totalMem = (long) getTotalPhysicalMemorySize.invoke(osMxBean);
if (totalMem < 0) {
logger.warn("OS reported a negative total memory value [{}]", totalMem);
return 0;
}
return totalMem;
} catch (Exception e) {
return -1;
logger.warn("exception retrieving total physical memory", e);
return 0;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -228,13 +228,17 @@ public static class Mem implements Writeable, ToXContentFragment {
private final long free;

public Mem(long total, long free) {
assert total >= 0 : "expected total memory to be positive, got: " + total;
assert free >= 0 : "expected free memory to be positive, got: " + total;
this.total = total;
this.free = free;
}

public Mem(StreamInput in) throws IOException {
this.total = in.readLong();
assert total >= 0 : "expected total memory to be positive, got: " + total;
this.free = in.readLong();
assert free >= 0 : "expected free memory to be positive, got: " + total;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -801,8 +801,8 @@ static long machineMemoryFromStats(OsStats stats) {
if (containerLimitStr != null) {
BigInteger containerLimit = new BigInteger(containerLimitStr);
if ((containerLimit.compareTo(BigInteger.valueOf(mem)) < 0 && containerLimit.compareTo(BigInteger.ZERO) > 0)
// mem < 0 means the value couldn't be obtained for some reason
|| (mem < 0 && containerLimit.compareTo(BigInteger.valueOf(Long.MAX_VALUE)) < 0)) {
// mem <= 0 means the value couldn't be obtained for some reason
|| (mem <= 0 && containerLimit.compareTo(BigInteger.valueOf(Long.MAX_VALUE)) < 0)) {
mem = containerLimit.longValue();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ public void testNoAttributes_givenClash() {

public void testMachineMemory_givenStatsFailure() throws IOException {
OsStats stats = mock(OsStats.class);
when(stats.getMem()).thenReturn(new OsStats.Mem(-1, -1));
assertEquals(-1L, MachineLearning.machineMemoryFromStats(stats));
when(stats.getMem()).thenReturn(new OsStats.Mem(0, 0));
assertEquals(0L, MachineLearning.machineMemoryFromStats(stats));
dakrone marked this conversation as resolved.
Show resolved Hide resolved
}

public void testMachineMemory_givenNoCgroup() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ private static NodeStats mockNodeStats() {
final OsStats.Cgroup osCgroup = new OsStats.Cgroup("_cpu_acct_ctrl_group", ++iota, "_cpu_ctrl_group", ++iota, ++iota, osCpuStat,
"_memory_ctrl_group", "2000000000", "1000000000");

final OsStats.Mem osMem = new OsStats.Mem(no, no);
final OsStats.Mem osMem = new OsStats.Mem(0, 0);
final OsStats.Swap osSwap = new OsStats.Swap(no, no);
final OsStats os = new OsStats(no, osCpu, osMem, osSwap, osCgroup);

Expand Down