6

I wrote a simple program that executes a bunch of NOP instructions in a loop, and to my surprise it executes about 10,600,000,000 of them per second, the equivalent of about 10.6 GHz, while my CPU is only 2.2 GHz.

How is this possible? Is the CPU treating them as a single mega-NOP, or did I just discover what "instruction level parallelism" means?

What would be a better measure of instructions per second? Using add instructions instead reaches only 414,900,000/s, about a tenth of the BogoMIPS reported by my CPU: 4390.03.

C code:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define ten(a) a a a a a a a a a a
#define hundred(a) ten(a) ten(a) ten(a) ten(a) ten(a) ten(a) ten(a) \
        ten(a) ten(a) ten(a)

#define ITER 10000000
int main(void) {
  uint64_t i = 0;
  uint64_t t = time(NULL);
  while (1) {
    for (int j = 0; j < ITER; j++) {
      hundred(asm volatile ("nop");)
    }
    i += ITER * 100;
    printf("%lu/%lu\n", i, time(NULL) - t);
  }
  return 0;
}

Compiled assembly:

    .file   "gbloopinc.c"
    .section    .rodata
.LC0:
    .string "%lu/%lu\n"
    .text
    .globl  main
    .type   main, @function
main:
.LFB0:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    subq    $32, %rsp
    movq    $0, -16(%rbp)
    movl    $0, %edi
    call    time
    movq    %rax, -8(%rbp)
.L4:
    movl    $0, -20(%rbp)
    jmp .L2
.L3:
#APP
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# 15 "gbloopinc.c" 1
    nop
# 0 "" 2
# ... the same three-line block repeats for all 100 nops; remaining 97 copies omitted ...
#NO_APP
    addl    $1, -20(%rbp)
.L2:
    cmpl    $9999999, -20(%rbp)
    jle .L3
    addq    $1000000000, -16(%rbp)
    movl    $0, %edi
    call    time
    subq    -8(%rbp), %rax
    movq    %rax, %rdx
    movq    -16(%rbp), %rax
    movq    %rax, %rsi
    movl    $.LC0, %edi
    movl    $0, %eax
    call    printf
    jmp .L4
    .cfi_endproc
.LFE0:
    .size   main, .-main
    .ident  "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.2) 5.4.0 20160609"
    .section    .note.GNU-stack,"",@progbits
Pepijn
  • How many cores (physical and logical)? – Jose Manuel Abarca Rodríguez Sep 22 '16 at 16:06
  • What do you mean by "better measure for instructions per second"? Different instructions take different amounts of time; there's parallelism, speculative execution, pipeline stalls, branch (mis-)prediction, cache misses, etc. What are you actually trying to measure? – Art Sep 22 '16 at 16:17
  • 4 cores with hyper threading. I'm trying to measure the influence of code size on speed, pretty much. Larger code doesn't fit in cache, so I expect it to be slower. – Pepijn Sep 22 '16 at 16:24
  • 4
    Haswell can execute NOP on 4 execution units concurrently, add a bit of turbo-boost to jack it up to 10 GHz. Nothing to do with the number of cores, this is just one thread. – Hans Passant Sep 22 '16 at 16:25
  • What's your processor? It surely has multiple ports able to process more than 1 NOP per cycle. – Benoît Sep 22 '16 at 16:27
  • 5
    If you're going to do nothing, you might as well do it quickly! – Keith Thompson Sep 22 '16 at 16:39
  • Intel(R) Core(TM) i7-2720QM CPU @ 2.20GHz, so yeah, with turbo up to 3 GHz and 4 ports, 3×4 = 12 GHz. Mind blown. I'll gladly accept that as an answer, maybe with some additional background info. – Pepijn Sep 22 '16 at 16:49
  • The operating system may decide that it needs to run random other things other than your application. That can easily take away cycles, especially for longer run times. Also keep in mind there is overhead for your time call. – Michael Dorgan Sep 22 '16 at 16:58
  • @HansPassant: Even Core2 can run 4 NOPs per clock, because they don't even take execution units. The only limit is the front-end / issue / retirement. But yes, nothing to do with multiple cores, just instruction-level parallelism. – Peter Cordes Sep 23 '16 at 00:40
  • @Pepijn: NOPs of course have no input dependencies, so this is ILP. On your SnB CPU (with 3 ALU execution units per core), you could run 3 ADD and one load or store instruction per clock, since its pipeline width is 4 uops. See [Agner Fog's microarch pdf](http://agner.org/optimize/) and other links in http://stackoverflow.com/tags/x86/info. – Peter Cordes Sep 23 '16 at 00:41
  • And as you have only one hundred NOPs, they will be cached after the first pass, and no more memory accesses are made after that, so probably more than one NOP per cycle can be executed in a multicore CPU. – Luis Colorado Sep 24 '16 at 21:31

2 Answers

5

This has nothing to do with multiple cores. Cores are not "ports".


4 NOPs per clock is the issue/retirement pipeline width of your superscalar / out-of-order CPU. NOPs don't even need an execution unit / execution port (ALU or load or store), so you're not even limited by the number of integer execution units. Even Core2 (Intel's first 4-wide x86 CPU) could run 4 NOPs per clock.

As you guessed, this is an example of Instruction-level Parallelism. NOPs of course have no input dependencies.

On your Sandybridge CPU (with 3 ALU execution units per core), you could run 3 ADD and one load or store instruction per clock, since its pipeline width is 4 uops. See Agner Fog's microarch pdf and other links in the tag wiki. On a stream of independent ADD instructions, like

add  eax, eax
add  ebx, ebx
add  ecx, ecx
add  edx, edx
...

you'd see about 3 per clock throughput on SnB, bottlenecking on integer ALU execution ports. Haswell could run this at 4 ADDs per clock, because it has a 4th ALU execution port that can handle non-vector integer ops (and branches).

Out-of-order CPUs typically have a wider front-end and issue/retire width than their number of execution units. Having more instructions decoded and ready to run as soon as an execution unit frees up keeps those units busy; without that extra width, the out-of-order machinery could only look ahead of what's currently executing when execution stalls or slows down because of serial dependencies. (e.g. add eax,eax / add eax,eax needs the output of the first add as the input to the second add, so it can only run at one insn per clock.)
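To make that difference concrete, here's a sketch in the spirit of the question's benchmark (the loop count, timing helper, and register choices are illustrative, not from the original post). The first loop repeats an add that feeds its own output back in; the second repeats four independent adds:

#include <stdio.h>
#include <time.h>

#define ITER 1000000000L   /* 1e9 iterations, 4 adds each */

/* monotonic wall-clock helper, sub-second resolution */
static double now(void) {
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main(void) {   /* build with e.g. gcc -O2 */
  double t0 = now();
  for (long j = 0; j < ITER; j++) {
    /* serial chain: each add waits for the previous one, ~1 per clock */
    asm volatile ("add %%eax, %%eax\n\t"
                  "add %%eax, %%eax\n\t"
                  "add %%eax, %%eax\n\t"
                  "add %%eax, %%eax" ::: "eax");
  }
  printf("serial:      %.2f s\n", now() - t0);

  t0 = now();
  for (long j = 0; j < ITER; j++) {
    /* independent adds: no cross-dependencies, up to ~3 per clock on SnB */
    asm volatile ("add %%eax, %%eax\n\t"
                  "add %%ebx, %%ebx\n\t"
                  "add %%ecx, %%ecx\n\t"
                  "add %%edx, %%edx" ::: "eax", "ebx", "ecx", "edx");
  }
  printf("independent: %.2f s\n", now() - t0);
  return 0;
}

Both loops retire the same number of adds, but the serial version should take noticeably longer because every add has to wait for its predecessor's result.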

Peter Cordes
  • Interesting, so if I do a loop that increments 4 counters and then sums them at the end, the loop would run 4 times as fast as the simple loop. I wonder if an optimising compiler could make use of this. – Pepijn Sep 25 '16 at 16:42
  • @Pepijn: yes, multiple accumulators is a known optimization. It's most useful for floating point, where even FP add has more than 1 cycle of latency. Unfortunately, most compilers (gcc and clang at least) usually don't do this even when unrolling. I think clang sometimes uses a couple accumulators when unrolling its default 4 times. If you're using FMA on Intel Haswell, latency=5 and throughput = one per 0.5 cycles, so you need 10 accumulators to keep 10 FMAs in flight and saturate those execution units. – Peter Cordes Sep 25 '16 at 19:56
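A minimal sketch of the multiple-accumulator idea from those comments (the function name, accumulator count, and compiler flags are illustrative, not from the thread):

#include <stddef.h>
#include <stdint.h>

/* Sum an array with 4 independent accumulators so the adds form four
   short dependency chains instead of one long serial one.
   Try e.g. gcc -O2 -fno-tree-vectorize to isolate the effect. */
uint64_t sum4(const uint64_t *a, size_t n) {
  uint64_t s0 = 0, s1 = 0, s2 = 0, s3 = 0;
  size_t i;
  for (i = 0; i + 4 <= n; i += 4) {
    s0 += a[i];      /* each add waits only on its own accumulator */
    s1 += a[i + 1];
    s2 += a[i + 2];
    s3 += a[i + 3];
  }
  uint64_t s = s0 + s1 + s2 + s3;  /* combine once at the end */
  for (; i < n; i++)               /* handle any leftover elements */
    s += a[i];
  return s;
}

For 1-cycle-latency integer adds the win is modest (memory and issue width get in the way); as the comment says, it matters most for floating point, where each add has multi-cycle latency.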
4

I'll expand a bit on Hans Passant's comment.

Modern processors are both superscalar and multicore. It is easy to understand what a multicore processor is: it has multiple cores. Superscalar, on the other hand, requires a bit more knowledge about the hardware. There is a Stack Exchange question that explains what it means for a processor to be superscalar. Superscalar processors have many functional units in the same core and are heavily pipelined, which is why multiple instructions can be dispatched and in flight at the same time on a single core. Here are some of the functional units found in a processor: integer addition/subtraction, integer multiplication, integer division, floating-point multiplication, floating-point division.

I encourage you to Google more about superscalar processors and look up more info about your processor in particular.
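As a rough way to see this width on your own hardware, here is a sketch that counts NOPs per TSC tick (the macros mirror the question's; note that rdtsc ticks at a constant reference rate, roughly the rated base clock, so turbo boost can push the ratio above the true per-core-cycle width):

#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>  /* __rdtsc() */

#define ten(a) a a a a a a a a a a
#define hundred(a) ten(a) ten(a) ten(a) ten(a) ten(a) \
                   ten(a) ten(a) ten(a) ten(a) ten(a)

int main(void) {
  const long iters = 10000000;  /* 10M iterations x 100 NOPs = 1e9 NOPs */
  uint64_t start = __rdtsc();
  for (long j = 0; j < iters; j++) {
    hundred(asm volatile ("nop");)
  }
  uint64_t ticks = __rdtsc() - start;
  printf("%.2f NOPs per TSC tick\n", (double)(iters * 100) / ticks);
  return 0;
}

On a 4-wide core running at its rated clock this should print close to 4; with turbo active it can print higher, which is exactly the effect the question observed.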

mgarey
  • 1
    NOP doesn't even require an integer execution unit. The OP's CPU has a 4-wide issue / dispatch width, but only three ALU execution ports. Also, every ALU port can handle integer add instructions. Integer multiply is rarer, and takes a lot more transistors to implement, so Intel and AMD only put one scalar integer multiply unit on each core (on one execution port). Still, your answer is a useful simplification, and should have set the OP's thought process on the right track, but I just thought I'd point out that it's even more complex than you describe. – Peter Cordes Sep 23 '16 at 00:57
  • @PeterCordes Thanks for the added details, especially about the OP's processor. I am aware that it's a lot more complex than I described - I have a whole textbook about computer architecture / systems - but I tried to summarize and simplify. I really need to edit my post to fix my grammar, though - it's atrocious. – mgarey Sep 23 '16 at 01:06
  • @PeterCordes: Of course, if we're getting into the weeds, if the integers are being multiplied by constants, the compiler may be able to convert the multiplies to some combination of shifts, adds and/or address load (lea) instructions, so even if you're using integer multiplies, you may find your code exceeding the theoretical speeds expected from only one integer multiply port. – ShadowRanger Sep 23 '16 at 01:18
  • @ShadowRanger: Heh, I hadn't even considered that someone might look at the C source instead of asm compiler output while doing static analysis to look for bottlenecks. Often total fused-domain uop throughput is a bigger bottleneck than any specific ALU port, since modern microarchitectures replicate most simple ALU ops in multiple ports. [IACA](https://software.intel.com/en-us/articles/intel-architecture-code-analyzer) can help with static analysis for bottlenecks in small C or asm loops on Nehalem-Haswell, instead of analyzing the asm by hand, but it's not maintained and isn't perfect. – Peter Cordes Sep 23 '16 at 01:43
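To illustrate the multiply-by-constant point from the last couple of comments (a sketch; the exact instruction sequence varies by compiler and version):

/* x * 10 needs no multiply unit: gcc -O2 typically emits something like
       leaq (%rdi,%rdi,4), %rax   # x*5
       addq %rax, %rax            # x*10
   so it runs on ordinary ALU ports rather than the single
   integer-multiply port. */
unsigned long times10(unsigned long x) {
  return x * 10;
}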