-
Notifications
You must be signed in to change notification settings - Fork 30.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
events: optimize arrayClone by copying forward #10571
Conversation
It's slightly faster (and more readable) to copy array elements in forward direction. This way it also avoids the ToBoolean and the postfix count operation.
FWIW I just tested this locally with benchmark/events/ee-emit.js and saw that with this change small array (e.g. 10) copies are actually slower, but the copying only become slightly faster once you increase the size of the array substantially (e.g. 1000). |
I see improvements even for small arrays (size 10), on my Z620, i.e.
|
On my 2015 MBP it's even more visible (using Node.js master):
|
It seems the Warmup does not work as expected. With Node.js 7.3.0 on Windows 7 x64: 'use strict';
function arrayCloneOld(a, l) {
var c = new Array(l);
while (l--) c[l] = a[l];
return c;
}
function arrayCloneNew(a, l) {
var c = new Array(l);
for (var i = 0; i < l; ++i) c[i] = a[i];
return c;
}
// Warmup
var a = [0,1,2,3,4,5,6,7,8,9];
for (var i = 0; i < 10000; ++i) {
arrayCloneOld(a, a.length);
arrayCloneOld(a, a.length);
}
console.time('old');
for (var i = 0; i < 10000; ++i) {
arrayCloneOld(a, a.length);
}
console.timeEnd('old');
console.time('old');
for (var i = 0; i < 10000; ++i) {
arrayCloneOld(a, a.length);
}
console.timeEnd('old');
'use strict';
function arrayCloneOld(a, l) {
var c = new Array(l);
while (l--) c[l] = a[l];
return c;
}
function arrayCloneNew(a, l) {
var c = new Array(l);
for (var i = 0; i < l; ++i) c[i] = a[i];
return c;
}
// Warmup
var a = [0,1,2,3,4,5,6,7,8,9];
for (var i = 0; i < 10000; ++i) {
arrayCloneNew(a, a.length);
arrayCloneNew(a, a.length);
}
console.time('new');
for (var i = 0; i < 10000; ++i) {
arrayCloneNew(a, a.length);
}
console.timeEnd('new');
console.time('new');
for (var i = 0; i < 10000; ++i) {
arrayCloneNew(a, a.length);
}
console.timeEnd('new');
|
"use strict";
function arrayCloneOld(a, l) {
var c = new Array(l);
while (l--) c[l] = a[l];
return c;
}
function arrayCloneNew(a, l) {
var c = new Array(l);
for (var i = 0; i < l; ++i) c[i] = a[i];
return c;
}
// Warmup
var a = [0,1,2,3,4,5,6,7,8,9];
for (var i = 0; i < 10000; ++i) {
arrayCloneOld(a, a.length);
arrayCloneNew(a, a.length);
}
console.time('new');
for (var i = 0; i < 10000; ++i) {
arrayCloneNew(a, a.length);
}
console.timeEnd('new');
console.time('old');
for (var i = 0; i < 10000; ++i) {
arrayCloneOld(a, a.length);
}
console.timeEnd('old');
|
Ah, good catch. There's some inlining going on here. |
I'm not sure why it does indeed seem to be faster on its own vs testing it implicitly via the |
@bmeurer From my experience, the first cycle inside another cycle always run faster. So I've made this benchmark template for myself, trying firstly to get commensurably equal results for the same function. I use the benchmark.js for balanced tests and vanilla cycles for simplified tests. As you can see, the first ones do not need warm-up, the second ones do need. Template code and its results (click me):/******************************************************************************/
'use strict';
/******************************************************************************/
const functions = [
function warmup() { String(1); },
function test01() { String(1); },
function test02() { String(1); },
];
/******************************************************************************/
console.log(`
1. Balanced benchmark
`);
/******************************************************************************/
const suite = require('benchmark').Suite();
functions.forEach((func) => { suite.add(func.name, func); });
suite.on('cycle', (evt) => { console.log(String(evt.target)); }).run({ async: false });
/******************************************************************************/
console.log(`
2. Simplified benchmark
`);
/******************************************************************************/
functions.forEach((func) => {
const MS_IN_S = 1000;
const numberOfCycles = 1e8;
const start = Date.now();
for (let i = 0; i < numberOfCycles; i++) func();
console.log(`${func.name} x ${
Math.round(numberOfCycles / ((Date.now() - start) / MS_IN_S)).toLocaleString()
} ops/sec`);
}); 1. Balanced benchmark
warmup x 31,788,736 ops/sec ±1.09% (83 runs sampled)
test01 x 31,686,704 ops/sec ±0.97% (82 runs sampled)
test02 x 31,480,559 ops/sec ±1.04% (83 runs sampled)
2. Simplified benchmark
warmup x 52,356,021 ops/sec
test01 x 39,510,075 ops/sec
test02 x 39,745,628 ops/sec And here are the results for big and small arrays with this script (warmup data output is canceled for the simple cycles). Clone array benchmarks (click me):/******************************************************************************/
'use strict';
/******************************************************************************/
var smallArray = Array(1e1).fill(0).map((elm, i) => elm + i);
var largeArray = Array(1e4).fill(0).map((elm, i) => elm + i);
const functions = [
function warmup() {},
function smallArrayCloneOld() {
var l = smallArray.length;
var c = new Array(l);
while (l--) c[l] = smallArray[l];
return c;
},
function smallArrayCloneNew() {
var l = smallArray.length;
var c = new Array(l);
for (var i = 0; i < l; ++i) c[i] = smallArray[i];
return c;
},
function largeArrayCloneOld() {
var l = largeArray.length;
var c = new Array(l);
while (l--) c[l] = largeArray[l];
return c;
},
function largeArrayCloneNew() {
var l = largeArray.length;
var c = new Array(l);
for (var i = 0; i < l; ++i) c[i] = largeArray[i];
return c;
},
];
/******************************************************************************/
console.log(`
1. Balanced benchmark
`);
/******************************************************************************/
const suite = require('benchmark').Suite();
functions.forEach((func) => { if (func.name !== 'warmup') suite.add(func.name, func); });
suite.on('cycle', (evt) => { console.log(String(evt.target)); }).run({ async: false });
/******************************************************************************/
console.log(`
2. Simplified benchmark
`);
/******************************************************************************/
functions.forEach((func) => {
const MS_IN_S = 1000;
const numberOfCycles = 1e6;
const start = Date.now();
for (let i = 0; i < numberOfCycles; i++) func();
if (func.name !== 'warmup') {
console.log(`${func.name} x ${
Math.round(numberOfCycles / ((Date.now() - start) / MS_IN_S)).toLocaleString()
} ops/sec`);
}
}); 1. Balanced benchmark
smallArrayCloneOld x 12,660,628 ops/sec ±0.47% (87 runs sampled)
smallArrayCloneNew x 13,510,089 ops/sec ±1.32% (85 runs sampled)
largeArrayCloneOld x 21,524 ops/sec ±0.84% (87 runs sampled)
largeArrayCloneNew x 23,764 ops/sec ±0.99% (87 runs sampled)
2. Simplified benchmark
smallArrayCloneOld x 12,820,513 ops/sec
smallArrayCloneNew x 16,129,032 ops/sec
largeArrayCloneOld x 22,063 ops/sec
largeArrayCloneNew x 24,241 ops/sec However, in the real life the situation with `emit()` may be exposed to more different factors, as @mscdex notices. |
I'm +1 on this change because it makes clearer code. I don't think we'll see actual considerable performance difference in the benchmarks. |
Thanks for the investigations. I was looking at it from the VMs perspective; in this particular case by investigating Ignition+TurboFan performance, where we currently suffer a 2x slowdown due to the ToNumber from the postfix count operation (the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM if CI is ✅
Nits on the commit message (but they can be fixed by whoever lands this if you don't get around to it):
|
So, what about the test failures? I can't seem to figure out how they related to my PR? |
@bmeurer they likely aren't related. I didn't go through all the failures, but I see a couple red CI runs, and some processes left behind on the CI machines. We'll do another CI run once it's all cleaned up. |
Only Raspberry Pi failed, and it seems to have been build related. Let's try again. arm-fanned CI: https://ci.nodejs.org/job/node-test-commit-arm-fanned/6279/ |
New CI run on rpi looks good |
Optimize arrayClone by copying forward. It's slightly faster (and more readable) to copy array elements in forward direction. This way it also avoids the ToBoolean and the postfix count operation. PR-URL: #10571 Reviewed-By: Luigi Pinca <luigipinca@gmail.com> Reviewed-By: Benjamin Gruenbaum <benjamingr@gmail.com> Reviewed-By: Rich Trott <rtrott@gmail.com> Reviewed-By: Colin Ihrig <cjihrig@gmail.com> Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Franziska Hinkelmann <franziska.hinkelmann@gmail.com>
Optimize arrayClone by copying forward. It's slightly faster (and more readable) to copy array elements in forward direction. This way it also avoids the ToBoolean and the postfix count operation. PR-URL: #10571 Reviewed-By: Luigi Pinca <luigipinca@gmail.com> Reviewed-By: Benjamin Gruenbaum <benjamingr@gmail.com> Reviewed-By: Rich Trott <rtrott@gmail.com> Reviewed-By: Colin Ihrig <cjihrig@gmail.com> Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Franziska Hinkelmann <franziska.hinkelmann@gmail.com>
It doesn't depend on a specific V8 version. For Crankshaft this is mostly identical or slightly faster ever since. For TurboFan the new version is better. |
I'm going to opt to hold off on this optimization for a bit. I didn't read through the entire thread, but are the perf gains significant? |
With the default configuration there are no significant performance gains. With TurboFan there are significant gains. So skipping this for now sounds perfectly reasonable. |
It's slightly faster (and more readable) to copy array elements
in forward direction. This way it also avoids the ToBoolean and
the postfix count operation.
Checklist
make -j4 test
(UNIX)Affected core subsystem(s)
events