5

The developments of this issue are now described unequivocally in this new question: Why does the JVM show more latency for the same block of code after a busy spin pause?

I'm including below the source codes of a simple server and client that demonstrate and isolate the problem. Basically I'm timing the latency of a ping-pong (client-server-client) message. I start by sending one message every 1 millisecond. I wait for 200k messages to be sent so the HotSpot has a chance to optimize the code. Then I change my pause time from 1 millisecond to 30 seconds. For my surprise my write and read operation become considerably slower.

I don't think it is a JIT/HotSpot problem. I was able to pinpoint the slower method to the native JNI calls to write (write0) and read. It looks like the longer you pause the slower it becomes.

I'm looking for pointers on how to debug, understand, explain or fix this problem.

Server.java:

import java.io.IOException;
import java.net.InetSocketAddress;
import java.nio.ByteBuffer;
import java.nio.channels.ServerSocketChannel;
import java.nio.channels.SocketChannel;

public class Server {

    private final ServerSocketChannel serverSocketChannel;
    private final ByteBuffer readBuffer = ByteBuffer.allocateDirect(1024);
    private final int port;
    private final int msgSize;

    public Server(int port, int msgSize) throws IOException {
        this.serverSocketChannel = ServerSocketChannel.open();
        this.port = port;
        this.msgSize = msgSize;
    }

    public void start() throws IOException {
        serverSocketChannel.socket().bind(new InetSocketAddress(port));
        final SocketChannel socketChannel = serverSocketChannel.accept(); // blocking mode...
        System.out.println("Client accepted!");
        socketChannel.configureBlocking(false);
        socketChannel.socket().setTcpNoDelay(true);
        Thread t = new Thread(new Runnable() {

            @Override
            public void run() {
                try {
                    while(true) {
                        int bytesRead = socketChannel.read(readBuffer);
                        if (bytesRead == -1) {
                            System.out.println("Client disconnected!");
                            return;
                        } else if (bytesRead > 0) {
                            if (readBuffer.position() == msgSize) {
                                // have a full message there...
                                readBuffer.flip();
                                int bytesSent = socketChannel.write(readBuffer);
                                if (bytesSent != msgSize) throw new RuntimeException("Could not send full message out: " + bytesSent);
                                readBuffer.clear();
                            }
                        }
                    }
                } catch(Exception e) {
                    throw new RuntimeException(e);
                }
            }
        });
        t.start();
        serverSocketChannel.close();
    }

    public static void main(String[] args) throws Exception {

        Server s = new Server(9999, 8);
        s.start();
    }
}

Client.java:

import java.io.IOException;
import java.net.InetSocketAddress;
import java.nio.ByteBuffer;
import java.nio.channels.SocketChannel;

public class Client implements Runnable {

    private static final int WARMUP = 200000;

    private final SocketChannel socketChannel;
    private final String host;
    private final int port;
    private final ByteBuffer outBuffer;
    private final ByteBuffer inBuffer = ByteBuffer.allocateDirect(1024);
    private final int msgSize;
    private final StringBuilder sb = new StringBuilder(1024);

    private int interval;
    private int totalMessagesSent;
    private long timeSent;
    private int mod;


    public Client(String host, int port, int msgSize) throws IOException {
        this.socketChannel = SocketChannel.open();
        this.host = host;
        this.port = port;
        this.outBuffer = ByteBuffer.allocateDirect(msgSize);
        this.msgSize = msgSize;
        for(int i = 0; i < msgSize; i++) outBuffer.put((byte) i);
        outBuffer.flip();
        this.interval = 1;
        this.mod = 20000;
    }

    public static long busySleep(long t) {
        long x = 0;
        for(int i = 0; i < t * 20000; i++) {
            x += System.currentTimeMillis() / System.nanoTime();
        }
        return x;
    }

    public void start() throws Exception {
        this.socketChannel.configureBlocking(false);
        this.socketChannel.socket().setTcpNoDelay(true);
        this.socketChannel.connect(new InetSocketAddress(host, port));

        while(!socketChannel.finishConnect()) {
            System.out.println("Waiting to connect");
            Thread.sleep(1000);
        }
        System.out.println("Please wait as output will appear every minute or so. After " + WARMUP + " messages you will see the problem.");
        Thread t = new Thread(this);
        t.start();
    }

    private final void printResults(long latency, long timeToWrite, long timeToRead, long zeroReads, long partialReads, long realRead) {
        sb.setLength(0);
        sb.append(new java.util.Date().toString());
        sb.append(" Results: totalMessagesSent=").append(totalMessagesSent);
        sb.append(" currInterval=").append(interval);
        sb.append(" latency=").append(latency);
        sb.append(" timeToWrite=").append(timeToWrite);
        sb.append(" timeToRead=").append(timeToRead);
        sb.append(" realRead=").append(realRead);
        sb.append(" zeroReads=").append(zeroReads);
        sb.append(" partialReads=").append(partialReads);
        System.out.println(sb);
    }

    @Override
    public void run() {

        try {

            while(true) {

                busySleep(interval);

                outBuffer.position(0);

                timeSent = System.nanoTime();

                int bytesSent = socketChannel.write(outBuffer);
                long timeToWrite = System.nanoTime() - timeSent;
                if (bytesSent != msgSize) throw new IOException("Can't write message: " + bytesSent);

                inBuffer.clear();
                long zeroReads = 0;
                long partialReads = 0;
                long timeToRead = System.nanoTime();
                long realRead = 0;
                while(inBuffer.position() != msgSize) {
                    realRead = System.nanoTime();
                    int bytesRead = socketChannel.read(inBuffer);
                    if (bytesRead == 0) {
                        zeroReads++;
                    } else if (bytesRead == -1) {
                        System.out.println("Other side disconnected!");
                        return;
                    } else if (bytesRead != msgSize) {
                        partialReads++;
                        realRead = -1;
                    } else {
                        realRead = System.nanoTime() - realRead;
                    }
                }

                long now = System.nanoTime();

                timeToRead = now - timeToRead;

                long latency = now - timeSent;

                if (++totalMessagesSent % mod == 0 || totalMessagesSent == 1) {
                    printResults(latency, timeToWrite, timeToRead, zeroReads, partialReads, realRead);
                }

                if (totalMessagesSent == WARMUP) {
                    this.interval = 30000;
                    this.mod = 1;
                }
            }

        } catch(Exception e) {
            throw new RuntimeException(e);
        }
    }

    public static void main(String[] args) throws Exception {

        Client client = new Client("localhost", 9999, 8);
        client.start();
    }
}

I execute java -server -cp . Server and java -server -cp . Client. The output of the client is: enter image description here


Per @dunni request, changing to 1 second delay instead of 30 second delay. Same problem: enter image description here

Community
  • 1
  • 1
LatencyFighter
  • 351
  • 2
  • 11
  • I would guess that you hit some timeout on a connection or socket level and for each write a new connection has to be created. Do you get the same result if you choose e.g. 5 or 10 seconds pause time? – dunni Apr 12 '17 at 19:06
  • 1
    Thanks @dunni. Posted a picture using 1 sec delay instead of 30 sec delay. Same problem :( Any chance that `setTcpNoDelay(true)` is simply being ignored and Nagle's Algorithm is the one to blame? Hard to believe that such a Java bug would exist. – LatencyFighter Apr 12 '17 at 19:19
  • 1
    You are right, your problem cannot be Nagle's Algorithm, since you are using `setTcpNoDelay( true )`, which I failed to notice while skimming through your code. – Mike Nakis Apr 12 '17 at 19:22
  • It is hard to tell for sure what is going on, but I think that your next suspect is probably the `socketChannel.read(inBuffer);` which is trying to read an unspecified amount of bytes. It is likely that the TCP/IP stack (or perhaps the java networking layer) may be buffering data until it has 1024 bytes to yield, or until it times out. Try prefixing each message with the message length, so that you can always read exactly 4 bytes to obtain the message length, and then exactly bytes to avoid the possibility that there is buffering going on. – Mike Nakis Apr 12 '17 at 19:30
  • Nice idea @MikeNakis. I'll try that. This makes sense for reads, but not for my writes, which are also slower :/ – LatencyFighter Apr 12 '17 at 19:32
  • The writes should almost directly map to `write` or `writev` syscalls (check with `strace`). So this unlikely is a java problem. But you should not be using `socketChannel.configureBlocking(false)` without also using selectors. If you have a thread-per-socket you want blocking IO to not waste CPU cycles. That way the kernel can immediately wake up the thread blocked on a read when it receives data. – the8472 Apr 12 '17 at 20:44
  • @the8472 I agree that it is something on the kernel level (network stack) but I'm hoping there's some kind of configuration to avoid this problem. Something like making the syscall busy-spin instead of blocking on the kernel level. Btw, changing blocking mode to true on both server and client shows the same behavior / problem. – LatencyFighter Apr 12 '17 at 21:08
  • things to look at: a) strace (including timing information) to see whether the syscall itself takes longer b) wireshark output (does tcp behavior change?) c) perf record (check where the kernel spends its time) - that said, you're measuring *nanoseconds*. 1k nanoseconds more is just one microsecond. That's in the realm of lan-local network latencies. – the8472 Apr 12 '17 at 22:59
  • Your server either should not use blocking mode or else should use a `Selector` to tell it when to read. At present it is a CPU hog. It should also write whatever it reads immediately, rather than waiting for the input buffer to be full. This is a secondary source of latency. – user207421 Apr 13 '17 at 00:16
  • 1
    I can confirm that it happens on Windows too. – Alexandr Nikitin Apr 13 '17 at 14:58

2 Answers2

2

One problem you have is that the JVM, the CPU and it's cache are falling asleep when there is no data to read. Once this happens, the machine has to do much more before it can pick up data than it did when your problem was running hot.

  • the CPU speed may have dropped to save power. e.g. half normal. It can do this on dumb busy loops.
  • the thread isn't running and has to be restarted on a new CPU. (In you case this should be rare)
  • the CPU's cache may have been powered down and has to be loaded progressively from L3 cache or main memory
  • even after your thread returns, it will run slower than normal for up to 100 micro-seconds as the caches pull in more data/code.
  • you will get a non-maskable interrupt 100+ times per second you can't turn off.

In short if you need consistent latencies you need to

  • turn off power management.
  • don't give up the CPU i.e. busy wait. (Which you are doing)
  • run on isolated CPUs binding the thread with affinity.
  • disable all maskable interrupts on that core.
  • use a user space driver for networking instead of the kernel.

NOTE: Given every operation appears to be taking about 2x as long, I would look at power management first.

Peter Lawrey
  • 525,659
  • 79
  • 751
  • 1,130
  • Thanks @PeterLawrey. However I disabled power management from my Ubuntu server box (with `acpi=off apm=off` in my grub configuration as described [here](http://askubuntu.com/a/130541)) and still have the same problem. Are you able to run my simple server and client in your box and get different results? **This is particularly important for trading**, since you want to send an order to the exchange after some time has passed **at the fastest possible speed**. What do you mean by "_use a user space driver for networking_"? Also tried affinity with no luck. Maybe a bad TCP Congestion Algo? – LatencyFighter Apr 13 '17 at 02:20
  • Unless you are using low latency network cards, a few micro-seconds won't make as much difference. http://www.solarflare.com/electronic-trading These can help reduce the latency added for lower throughput usage. e.g. FX, FI or Commodities. – Peter Lawrey Apr 13 '17 at 03:18
  • _Unless you are using low latency network cards, a few micro-seconds won't make as much difference_ I agree but I really need to get to the bottom of this problem. If you execute the provided server and client in your machine you will probably see the same mysterious behavior. – LatencyFighter Apr 13 '17 at 03:26
  • @LatencyFighter yes, though this is lower with these network cards. I suspect it's nothing to do with Java as it's in the network layer. – Peter Lawrey Apr 13 '17 at 03:35
  • I agree it might not be related to Java, but I still have to fix it :/ This is a bad behavior in terms of latency. Tried a new kernel version with no luck. I'll keep researching. Let me know if you have any other suspicious. It happens on MacOS and in Linux. Total mystery! – LatencyFighter Apr 13 '17 at 04:11
  • @LatencyFighter in my experience redhat is the best in handling this but you really need a low latency network adapter to have any chance of fixing it. – Peter Lawrey Apr 13 '17 at 04:15
  • Thanks, I'll keep trying here. If you are able to run the provided Server and Client code in your environment (RedHat + low latency NIC) without this latency problem please let me know! This is a hard problem. – LatencyFighter Apr 13 '17 at 04:34
  • @LatencyFighter I have a setup at one in the UK, but I am in Australia this week. – Peter Lawrey Apr 13 '17 at 05:32
  • Hey Peter, any luck about this mystery? – LatencyFighter Apr 24 '17 at 22:29
  • @LatencyFighter I haven't had time to investigate since I got back. – Peter Lawrey Apr 25 '17 at 15:46
1

I was looking at the code of SocketChannelImpl and noticed that there are two monitors involved in read() - a read lock and a state lock.

My opinion is that locks behave much better when they are hot and uncontested.

Following class is based on your client and only does some locking, similar to what is done in SocketChannelImpl. From non-observable, the latency becomes ~5000 on my box (win8, jdk8)

import java.util.concurrent.TimeUnit;

public class Locker implements Runnable {

private static final int WARMUP = 40000;

private final Object readLock = new Object();
private final Object writeLock = new Object();
private final Object stateLock = new Object();

private final StringBuilder sb = new StringBuilder(1024);

private long interval;
private int totalMessagesSent;
private long timeSent;
private int mod;
private long totalOps;
private long readerThread;
private long writerThread;


public Locker() {
    this.interval = 1;
    this.mod = 20000;
}

public static long busySleep(long t) throws InterruptedException {
    long until = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(t);
    while(System.nanoTime() < until);
    return until;
}

public void start() throws Exception {
    Thread t = new Thread(this);
    t.start();
}

private final void printResults(long latency, long timeToRead) {
    sb.setLength(0);
    sb.append(new java.util.Date().toString());
    sb.append(" Results: totalMessagesSent=").append(totalMessagesSent);
    sb.append(" currInterval=").append(interval);
    sb.append(" latency=").append(latency);
    sb.append(" timeToRead=").append(timeToRead);
    sb.append(" totalOps=").append(totalOps);
    sb.append(" reader=").append(readerThread);
    sb.append(" writer=").append(writerThread);
    System.out.println(sb);
}

@Override
public void run() {

    try {
        while(true) {

            busySleep(interval);

            timeSent = System.nanoTime();

            try {
                synchronized (writeLock) {
                    synchronized (stateLock) {
                        writerThread = Thread.currentThread().getId();
                    }
                    totalOps++;
                }
            }
            finally {
                synchronized (stateLock) {
                    writerThread = 0;
                }
            }

            long timeToRead = System.nanoTime();

            try {
                synchronized (readLock) {
                    synchronized (stateLock) {
                        readerThread = Thread.currentThread().getId();
                    }
                    totalOps++;
                }
            } finally {
                synchronized (stateLock) {
                    readerThread = 0;
                }
            }

            long now = System.nanoTime();

            timeToRead = now - timeToRead;

            long latency = now - timeSent;

            if (++totalMessagesSent % mod == 0 || totalMessagesSent == 1) {
                printResults(latency, timeToRead);
            }

            if (totalMessagesSent == WARMUP) {
                this.interval = 5000;
                this.mod = 1;
            }
        }

    } catch(Exception e) {
        throw new RuntimeException(e);
    }
}

public static void main(String[] args) throws Exception {
    Locker locker = new Locker();
    locker.start();
}
}

EDIT: modified code per OP's suggestion exhibiting the same latency increase:

import java.util.Arrays;
import java.util.concurrent.TimeUnit;

public class Locker {
    static final int WARMUP = 20000;
    final Object readLock = new Object();
    final Object writeLock = new Object();
    final Object stateLock = new Object();

    long interval = 1;
    int totalMessagesSent;
    long totalOps;
    long readerThread;
    long writerThread;
    final long[] measures = new long[WARMUP + 20];

    static long busySleep(long t) {
        long until = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(t);
        while(System.nanoTime() < until);
        return until;
    }
    void printResults(long latency, long timeToRead) {
        if (readerThread != 0 || writerThread != 0 || totalMessagesSent  > totalOps || timeToRead < 0) throw new Error();
        measures[totalMessagesSent] = latency;
    }

    void run() {
        while(totalMessagesSent < measures.length) {
            busySleep(interval);
            long timeSent = System.nanoTime();
            try {
                synchronized (writeLock) {
                    synchronized (stateLock) {
                        writerThread = Thread.currentThread().getId();
                    }
                    totalOps++;
                }
            }
            finally {
                synchronized (stateLock) {
                    writerThread = 0;
                }
            }
            long timeToRead = System.nanoTime();
            try {
                synchronized (readLock) {
                    synchronized (stateLock) {
                        readerThread = Thread.currentThread().getId();
                    }
                    totalOps++;
                }
            } finally {
                synchronized (stateLock) {
                    readerThread = 0;
                }
            }
            long now = System.nanoTime();
            timeToRead = now - timeToRead;
            long latency = now - timeSent;
            printResults(latency, timeToRead);
            ++totalMessagesSent;
            this.interval = (totalMessagesSent/WARMUP * 5000) + 1;
        }
        System.out.println("last measures = " + Arrays.toString(Arrays.copyOfRange(measures, WARMUP - 20, measures.length - 1)));
    }

    public static void main(String[] args) {
        new Locker().run();
    }
}
  • Thanks Nikolay, but the problem here is that `IF` to change the interval variable. It messes with the JIT inlining when it triggers. If you keep `mod = 1` forever, write to a file the results and make `this.interval = (totalMessagesSent / WARMUP * 5000) + 1;` you will see that the latency is the same when the interval changes to 5000. However, even with that change the SocketChannel still exhibits that bad latency increase, so the mystery is still going. Rewriting the SocketChannel program in C++ will give us more clues I hope. – LatencyFighter Apr 24 '17 at 22:27
  • I did what you suggested and observed the same increase in latency from 0 to ~5k – Nikolay Tsankov Apr 26 '17 at 09:15
  • I ran your code and saw what you said but then I went ahead and commented out all the synchronized to remove all locks, so no locking anymore. Guess what: the problem still happens. Very weird! – LatencyFighter Apr 28 '17 at 22:44
  • Here is a new question with this problem described unequivocally: http://stackoverflow.com/questions/43696948/why-does-the-jvm-show-more-latency-for-the-same-block-of-code-after-a-busy-spin – LatencyFighter Apr 29 '17 at 14:45