I'm trying to execute a rather trivial WebAssembly benchmark with Google's V8 engine (both in-browser using the current Version of Google Chrome (Version 83.0.4103.106, 64-bit) and via embedding V8 (Version 8.5.183) in a C++ program. All benchmarks are executed on macOS 10.14.6 with an Intel i7 8850H processor. No RAM swap has been used.
I am using the following C code as a benchmark. (Note that runtime is in the order of seconds on a current Intel Core i7)
static void init(int n, int path[1000][1000]) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
path[i][j] = i*j%7+1;
if ((i+j)%13 == 0 || (i+j)%7==0 || (i+j)%11 == 0) {
path[i][j] = 999;
}
}
}
}
static void kernel(int n, int path[1000][1000]) {
for (int k = 0; k < n; k++) {
for(int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
path[i][j] = path[i][j] < path[i][k] + path[k][j] ? path[i][j] : path[i][k] + path[k][j];
}
}
}
}
int path[1000][1000];
int main(void) {
int n = 1000;
init(n, path);
kernel(n, path);
return 0;
}
This can be easily executed via https://wasdk.github.io/WasmFiddle/. The corresponding JS code measuring time in the most basic way is the following:
var wasmModule = new WebAssembly.Module(wasmCode);
var wasmInstance = new WebAssembly.Instance(wasmModule, wasmImports);
var a = new Date();
wasmInstance.exports.main();
var b = new Date();
log(b-a);
The result I'm getting in browser (e.g. in WasmFiddle or on a custom website) in Google Chrome is the following (for multiple consecutive executions) in milliseconds:
3687
1757
1837
1753
1726
1731
1774
1741
1771
1727
3549
1742
1731
1847
1734
1745
3515
1731
1772
Note the outliers performing at half the speed of the rest. How and why are there outliers with still such consistent performance? As much care as possible has been taken to ensure that no other processes are using up CPU time.
For the embedded version, the monolithic V8 library has been built from source using the following build config:
is_component_build = false
is_debug = false
target_cpu = "x64"
use_custom_libcxx = false
v8_monolithic = true
v8_use_external_startup_data = false
v8_enable_pointer_compression = false
The C++ code embedding the V8 library and executing the Wasm script (The Wasm code is the exact code produced by the WasmFiddle compiler):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "include/libplatform/libplatform.h"
#include "include/v8.h"
int main(int argc, char* argv[]) {
// Initialize V8.
v8::V8::InitializeICUDefaultLocation(argv[0]);
v8::V8::InitializeExternalStartupData(argv[0]);
std::unique_ptr<v8::Platform> platform = v8::platform::NewDefaultPlatform();
v8::V8::InitializePlatform(platform.get());
v8::V8::Initialize();
// Create a new Isolate and make it the current one.
v8::Isolate::CreateParams create_params;
create_params.array_buffer_allocator = v8::ArrayBuffer::Allocator::NewDefaultAllocator();
v8::Isolate* isolate = v8::Isolate::New(create_params);
{
v8::Isolate::Scope isolate_scope(isolate);
// Create a stack-allocated handle scope.
v8::HandleScope handle_scope(isolate);
// Create a new context.
v8::Local<v8::Context> context = v8::Context::New(isolate);
v8::Context::Scope context_scope(context);
{
const char csource[] = R"(
let bytes = new Uint8Array([
0x0, 0x61, 0x73, 0x6D, 0x01, 0x00, 0x00, 0x00, 0x01, 0x85, 0x80, 0x80, 0x80, 0x00, 0x01, 0x60,
0x00, 0x01, 0x7F, 0x03, 0x82, 0x80, 0x80, 0x80, 0x00, 0x01, 0x00, 0x04, 0x84, 0x80, 0x80, 0x80,
0x00, 0x01, 0x70, 0x00, 0x00, 0x05, 0x83, 0x80, 0x80, 0x80, 0x00, 0x01, 0x00, 0x3E, 0x06, 0x81,
0x80, 0x80, 0x80, 0x00, 0x00, 0x07, 0x91, 0x80, 0x80, 0x80, 0x00, 0x02, 0x06, 0x6D, 0x65, 0x6D,
0x6F, 0x72, 0x79, 0x02, 0x00, 0x04, 0x6D, 0x61, 0x69, 0x6E, 0x00, 0x00, 0x0A, 0x8F, 0x82, 0x80,
0x80, 0x00, 0x01, 0x89, 0x82, 0x80, 0x80, 0x00, 0x01, 0x08, 0x7F, 0x41, 0x00, 0x21, 0x02, 0x41,
0x10, 0x21, 0x05, 0x03, 0x40, 0x20, 0x05, 0x21, 0x07, 0x41, 0x00, 0x21, 0x04, 0x41, 0x00, 0x21,
0x03, 0x03, 0x40, 0x20, 0x07, 0x20, 0x04, 0x41, 0x07, 0x6F, 0x41, 0x01, 0x6A, 0x41, 0xE7, 0x07,
0x20, 0x02, 0x20, 0x03, 0x6A, 0x22, 0x00, 0x41, 0x07, 0x6F, 0x1B, 0x41, 0xE7, 0x07, 0x20, 0x00,
0x41, 0x0D, 0x6F, 0x1B, 0x41, 0xE7, 0x07, 0x20, 0x00, 0x41, 0x0B, 0x6F, 0x1B, 0x36, 0x02, 0x00,
0x20, 0x07, 0x41, 0x04, 0x6A, 0x21, 0x07, 0x20, 0x04, 0x20, 0x02, 0x6A, 0x21, 0x04, 0x20, 0x03,
0x41, 0x01, 0x6A, 0x22, 0x03, 0x41, 0xE8, 0x07, 0x47, 0x0D, 0x00, 0x0B, 0x20, 0x05, 0x41, 0xA0,
0x1F, 0x6A, 0x21, 0x05, 0x20, 0x02, 0x41, 0x01, 0x6A, 0x22, 0x02, 0x41, 0xE8, 0x07, 0x47, 0x0D,
0x00, 0x0B, 0x41, 0x00, 0x21, 0x06, 0x41, 0x10, 0x21, 0x05, 0x03, 0x40, 0x41, 0x10, 0x21, 0x00,
0x41, 0x00, 0x21, 0x01, 0x03, 0x40, 0x20, 0x01, 0x41, 0xA0, 0x1F, 0x6C, 0x20, 0x06, 0x41, 0x02,
0x74, 0x6A, 0x41, 0x10, 0x6A, 0x21, 0x02, 0x41, 0x00, 0x21, 0x07, 0x03, 0x40, 0x20, 0x00, 0x20,
0x07, 0x6A, 0x22, 0x04, 0x20, 0x04, 0x28, 0x02, 0x00, 0x22, 0x04, 0x20, 0x05, 0x20, 0x07, 0x6A,
0x28, 0x02, 0x00, 0x20, 0x02, 0x28, 0x02, 0x00, 0x6A, 0x22, 0x03, 0x20, 0x04, 0x20, 0x03, 0x48,
0x1B, 0x36, 0x02, 0x00, 0x20, 0x07, 0x41, 0x04, 0x6A, 0x22, 0x07, 0x41, 0xA0, 0x1F, 0x47, 0x0D,
0x00, 0x0B, 0x20, 0x00, 0x41, 0xA0, 0x1F, 0x6A, 0x21, 0x00, 0x20, 0x01, 0x41, 0x01, 0x6A, 0x22,
0x01, 0x41, 0xE8, 0x07, 0x47, 0x0D, 0x00, 0x0B, 0x20, 0x05, 0x41, 0xA0, 0x1F, 0x6A, 0x21, 0x05,
0x20, 0x06, 0x41, 0x01, 0x6A, 0x22, 0x06, 0x41, 0xE8, 0x07, 0x47, 0x0D, 0x00, 0x0B, 0x41, 0x00,
0x0B
]);
let module = new WebAssembly.Module(bytes);
let instance = new WebAssembly.Instance(module);
instance.exports.main();
)";
// Create a string containing the JavaScript source code.
v8::Local<v8::String> source = v8::String::NewFromUtf8Literal(isolate, csource);
// Compile the source code.
v8::Local<v8::Script> script = v8::Script::Compile(context, source).ToLocalChecked();
// Run the script to get the result.
v8::Local<v8::Value> result = script->Run(context).ToLocalChecked();
}
}
// Dispose the isolate and tear down V8.
isolate->Dispose();
v8::V8::Dispose();
v8::V8::ShutdownPlatform();
delete create_params.array_buffer_allocator;
return 0;
}
I compile it as follows:
g++ -I. -O2 -Iinclude samples/wasm.cc -o wasm -lv8_monolith -Lout.gn/x64.release.sample/obj/ -pthread -std=c++17
On execution with time ./wasm
, I get execution times between 4.9s and 5.1s - almost triple that of in-Chrome/WasmFiddle execution! Did I miss anything? Maybe some optimization switches? This result is perfectly reproducible and I have even tested various different versions of the V8 library - still the same result.