Skip to content

Commit

Permalink
fix: use roundTiesToEven mode for rounding
Browse files Browse the repository at this point in the history
  • Loading branch information
petamoriken committed Jul 30, 2023
1 parent 4fa5480 commit 42a9636
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 83 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ environments:
```js
const array = new Float16Array([1.0, 1.1, 1.2, 1.3]);
for (const value of array) {
// 1, 1.099609375, 1.19921875, 1.2998046875
// 1, 1.099609375, 1.2001953125, 1.2998046875
console.log(value);
}

Expand Down
143 changes: 83 additions & 60 deletions src/_util/converter.mjs
Original file line number Diff line number Diff line change
@@ -1,93 +1,114 @@
// algorithm: http://fox-toolkit.org/ftp/fasthalffloatconversion.pdf

import {
MathAbs,
MathFloor,
MathLog2,
MathPow,
MathSign,
MathTrunc,
NativeArrayBuffer,
NativeFloat32Array,
NativeUint16Array,
NativeUint32Array,
NumberIsFinite,
NumberIsNaN,
ObjectIs,
} from "./primordials.mjs";

const buffer = new NativeArrayBuffer(4);
const floatView = new NativeFloat32Array(buffer);
const uint32View = new NativeUint32Array(buffer);
// base algorithm: https://github.com/feross/ieee754
// BSD-3-Clause License. Feross Aboukhadijeh <https://feross.org/opensource>

const baseTable = new NativeUint32Array(512);
const shiftTable = new NativeUint32Array(512);

for (let i = 0; i < 256; ++i) {
const e = i - 127;

// very small number (0, -0)
if (e < -27) {
baseTable[i] = 0x0000;
baseTable[i | 0x100] = 0x8000;
shiftTable[i] = 24;
shiftTable[i | 0x100] = 24;

// small number (denorm)
} else if (e < -14) {
baseTable[i] = 0x0400 >> (-e - 14);
baseTable[i | 0x100] = (0x0400 >> (-e - 14)) | 0x8000;
shiftTable[i] = -e - 1;
shiftTable[i | 0x100] = -e - 1;

// normal number
} else if (e <= 15) {
baseTable[i] = (e + 15) << 10;
baseTable[i | 0x100] = ((e + 15) << 10) | 0x8000;
shiftTable[i] = 13;
shiftTable[i | 0x100] = 13;

// large number (Infinity, -Infinity)
} else if (e < 128) {
baseTable[i] = 0x7c00;
baseTable[i | 0x100] = 0xfc00;
shiftTable[i] = 24;
shiftTable[i | 0x100] = 24;

// stay (NaN, Infinity, -Infinity)
} else {
baseTable[i] = 0x7c00;
baseTable[i | 0x100] = 0xfc00;
shiftTable[i] = 13;
shiftTable[i | 0x100] = 13;
/**
 * Round a number to the nearest integer. When the number lies exactly
 * halfway between two integers, the even one is chosen
 * (IEEE 754 "roundTiesToEven").
 * @param {number} num - value to round
 * @returns {number} the rounded integer value
 */
function roundTiesToEven(num) {
  const whole = MathTrunc(num);
  const fraction = MathAbs(num - whole);

  // step away from zero when past the midpoint, or when exactly on the
  // midpoint and the truncated value is odd
  const awayFromZero = fraction > 0.5 ||
    (fraction === 0.5 && whole % 2 !== 0);

  return awayFromZero ? whole + MathSign(num) : whole;
}

// exponent field value used for Infinity and NaN (all exponent bits set)
const f16EMax = 31;
// bias added to the unbiased exponent before it is stored in the bits
const f16EBias = 15;
// number of mantissa bits; the exponent field is shifted left by this amount
const f16MLen = 10;
// mask selecting the low 10 (mantissa) bits
const f16MMask = 0x3ff;

/**
 * round a number to a half float number bits
 * (nearest representable value, ties go to the even mantissa)
 * @param {unknown} num - double float; coerced to a number exactly once
 * @returns {number} half float number bits
 */
export function roundToFloat16Bits(num) {
  // coerce once so that the sign and the magnitude are derived from the
  // same numeric value (e.g. the string "-0" keeps its sign)
  const number = +(/** @type {any} */ (num));
  const absNum = MathAbs(number);

  // sign bit; ObjectIs is required because `-0 < 0` is false
  const s = number < 0 || ObjectIs(number, -0) ? 1 : 0;
  let m, e;

  // NaN, Infinity, -Infinity
  if (!NumberIsFinite(absNum)) {
    m = NumberIsNaN(absNum) ? 0x200 : 0;
    e = f16EMax;

  // finite
  } else {
    // find rawE and c = 2 ** -rawE such that absNum * c is in [1, 2);
    // the two corrections below absorb rounding errors of log2
    let rawE = MathFloor(MathLog2(absNum));
    let c = MathPow(2, -rawE);
    if (absNum * c < 1) {
      --rawE;
      c *= 2;
    }
    if (absNum * c >= 2) {
      ++rawE;
      c /= 2;
    }

    if (rawE + f16EBias >= f16EMax) {
      // too large for a half float: Infinity
      m = 0;
      e = f16EMax;
    } else if (rawE + f16EBias >= 1) {
      // normal number
      m = roundTiesToEven(((absNum * c) - 1) * 0x400);
      e = rawE + f16EBias;
    } else {
      // subnormal number (or zero)
      m = roundTiesToEven(absNum * 0x1000000);
      e = 0;
    }

    // rounding up may carry out of the 10 mantissa bits (m === 0x400);
    // propagate the carry into the exponent instead of masking it away,
    // so that e.g. 1.9999 becomes 2 and 65520 becomes Infinity
    if (m > f16MMask) {
      m = 0;
      ++e;
    }
  }

  return s << 15 | e << f16MLen | m;
}

// base algorithm: http://fox-toolkit.org/ftp/fasthalffloatconversion.pdf

// 4-byte scratch buffer viewed both as a float32 and as a uint32, so a bit
// pattern written through one view can be read back through the other
const buffer = new NativeArrayBuffer(4);
const floatView = new NativeFloat32Array(buffer);
const uint32View = new NativeUint32Array(buffer);

// mantissa lookup table; entry 0 stays 0
const mantissaTable = new NativeUint32Array(2048);
for (let i = 1; i < 1024; ++i) {
  let m = i << 13; // zero pad mantissa bits
  let e = 0; // zero exponent

  // normalized
  while ((m & 0x00800000) === 0) {
    m <<= 1;
    e -= 0x00800000; // decrement exponent
  }

  m &= ~0x00800000; // clear leading 1 bit
  e += 0x38800000; // adjust bias

  mantissaTable[i] = m | e;
}
for (let i = 1024; i < 2048; ++i) {
  mantissaTable[i] = 0x38000000 + ((i - 1024) << 13);
}

// exponent lookup table; entry 0 stays 0
const exponentTable = new NativeUint32Array(64);
for (let i = 1; i < 31; ++i) {
  exponentTable[i] = i << 23;
}
Expand All @@ -98,6 +119,7 @@ for (let i = 33; i < 63; ++i) {
}
exponentTable[63] = 0xc7800000;

const offsetTable = new NativeUint16Array(64);
for (let i = 1; i < 64; ++i) {
if (i !== 32) {
offsetTable[i] = 1024;
Expand All @@ -110,7 +132,8 @@ for (let i = 1; i < 64; ++i) {
* @returns {number} double float
*/
export function convertToNumber(float16bits) {
  // the bits above the 10 mantissa bits (sign and exponent) pick the
  // exponent/offset table entries
  const i = float16bits >> 10;
  // assemble the float32 bit pattern in the shared scratch buffer ...
  uint32View[0] = mantissaTable[offsetTable[i] + (float16bits & 0x3ff)] +
    exponentTable[i];
  // ... and read the very same 4 bytes back as a float32
  return floatView[0];
}
9 changes: 8 additions & 1 deletion src/_util/primordials.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,14 @@ export const NativeArrayPrototypeSymbolIterator = ArrayPrototype[SymbolIterator]
export const ArrayPrototypeSymbolIterator = uncurryThis(NativeArrayPrototypeSymbolIterator);

// Math
export const MathTrunc = Math.trunc;
export const {
abs: MathAbs,
floor: MathFloor,
log2: MathLog2,
pow: MathPow,
sign: MathSign,
trunc: MathTrunc,
} = Math;

// ArrayBuffer
export const NativeArrayBuffer = ArrayBuffer;
Expand Down
38 changes: 19 additions & 19 deletions test/Float16Array.js
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ describe("Float16Array", () => {
});

it("iterate", () => {
const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const float16 = new Float16Array([1, 1.1, 1.2, 1.3]);
for (const val of float16) {
Expand Down Expand Up @@ -263,7 +263,7 @@ describe("Float16Array", () => {

it("input Array or TypedArray", () => {
const array = [1, 1.1, 1.2, 1.3];
const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const float16_1 = new Float16Array(array);

Expand All @@ -289,7 +289,7 @@ describe("Float16Array", () => {
it("input custom Array", () => {
class FooArray extends Array {}
const array = FooArray.from([1, 1.1, 1.2, 1.3]);
const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const float16_1 = new Float16Array(array);

Expand Down Expand Up @@ -348,7 +348,7 @@ describe("Float16Array", () => {

it("input Iterable", () => {
const iterable = [1, 1.1, 1.2, 1.3][Symbol.iterator]();
const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const float16 = new Float16Array(iterable);

Expand All @@ -361,7 +361,7 @@ describe("Float16Array", () => {

it("input ArrayLike", () => {
const arrayLike = { "0": 1, "1": 1.1, "2": 1.2, "3": 1.3, length: 4 };
const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const float16 = new Float16Array(arrayLike);

Expand All @@ -374,7 +374,7 @@ describe("Float16Array", () => {

it("input Float16Array", () => {
const array = [1, 1.1, 1.2, 1.3];
const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const float16 = new Float16Array(new Float16Array(array));

Expand Down Expand Up @@ -402,7 +402,7 @@ describe("Float16Array", () => {
}

const array = [1, 1.1, 1.2, 1.3];
const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const float16 = new Float16Array(new AnotherRealmFloat16Array(array));

Expand All @@ -414,7 +414,7 @@ describe("Float16Array", () => {
});

it("input ArrayBuffer", () => {
const buffer = new Uint16Array([15360, 15462, 15564, 15667]).buffer;
const buffer = new Uint16Array([15360, 15462, 15565, 15667]).buffer;

const float16_1 = new Float16Array(buffer);

Expand All @@ -426,7 +426,7 @@ describe("Float16Array", () => {
assert.equalFloat16ArrayValues(float16_1, [
1,
1.099609375,
1.19921875,
1.2001953125,
1.2998046875,
]);

Expand All @@ -437,7 +437,7 @@ describe("Float16Array", () => {
assert(float16_2.byteOffset === 2);
assert(float16_2.byteLength === 4);
assert(float16_2.length === 2);
assert.equalFloat16ArrayValues(float16_2, [1.099609375, 1.19921875]);
assert.equalFloat16ArrayValues(float16_2, [1.099609375, 1.2001953125]);
});

it("input detached ArrayBuffer", function () {
Expand Down Expand Up @@ -473,7 +473,7 @@ describe("Float16Array", () => {

it("input Array or TypedArray", () => {
const array = [1, 1.1, 1.2, 1.3];
const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const float16_1 = Float16Array.from(array);

Expand All @@ -488,7 +488,7 @@ describe("Float16Array", () => {

it("input Iterable", () => {
const iterable = [1, 1.1, 1.2, 1.3][Symbol.iterator]();
const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const float16 = Float16Array.from(iterable);

Expand All @@ -498,7 +498,7 @@ describe("Float16Array", () => {

it("input ArrayLike", () => {
const arrayLike = { 0: 1, 1: 1.1, 2: 1.2, 3: 1.3, length: 4 };
const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const float16 = Float16Array.from(arrayLike);

Expand All @@ -508,7 +508,7 @@ describe("Float16Array", () => {

it("input Float16Array", () => {
const array = [1, 1.1, 1.2, 1.3];
const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const float16 = Float16Array.from(new Float16Array(array));

Expand All @@ -522,7 +522,7 @@ describe("Float16Array", () => {
}

const array = [1, 1.1, 1.2, 1.3];
const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const float16 = Float16Array.from(new AnotherRealmFloat16Array(array));

Expand All @@ -533,7 +533,7 @@ describe("Float16Array", () => {
it("call from subclass", () => {
class Foo extends Float16Array {}

const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const array = [1, 1.1, 1.2, 1.3];
const foo1 = Foo.from(array);
Expand Down Expand Up @@ -603,7 +603,7 @@ describe("Float16Array", () => {

it("input", () => {
const array = [1, 1.1, 1.2, 1.3];
const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const float16 = Float16Array.of(...array);

Expand All @@ -615,7 +615,7 @@ describe("Float16Array", () => {
class Foo extends Float16Array {}

const array = [1, 1.1, 1.2, 1.3];
const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];

const foo = Foo.of(...array);

Expand Down Expand Up @@ -2035,7 +2035,7 @@ describe("isFloat16Array", () => {
assert(isFloat16Array(new Float32Array()) === false);
assert(isFloat16Array(new Uint16Array()) === false);

assert(isFloat16Array(/* empty */) === false);
assert(isFloat16Array() === /* empty */ false);
assert(isFloat16Array(null) === false);
assert(isFloat16Array(undefined) === false);
assert(isFloat16Array(0) === false);
Expand Down
4 changes: 2 additions & 2 deletions test/f16round.js
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ describe("f16round()", () => {
});

it("return ±float16 min value when value is ±float16 min value / 2 ± a bit number", () => {
assert(f16round(minFloat16 / 2 + 2 ** -25) === minFloat16);
assert(f16round(-minFloat16 / 2 - 2 ** -25) === -minFloat16);
assert(f16round(2.980232238769531911744490042422139897126953655970282852649688720703125e-8) === minFloat16);
assert(f16round(-2.980232238769531911744490042422139897126953655970282852649688720703125e-8) === -minFloat16);
});

it("return 1.3369140625 when value is 1.337", () => {
Expand Down

0 comments on commit 42a9636

Please sign in to comment.