Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fma not optimized for wasm relaxed-simd #121311

Open
kzhsw opened this issue Dec 30, 2024 · 3 comments
Open

Fma not optimized for wasm relaxed-simd #121311

kzhsw opened this issue Dec 30, 2024 · 3 comments

Comments

@kzhsw
Copy link

kzhsw commented Dec 30, 2024

Env: Compiler explorer (https://godbolt.org/z/Yxc6e8sd4)
Version: WebAssembly clang (trunk)

Code:

Code

#include <wasm_simd128.h>

void fma_inst(float * a, float * b, float * c, float * dest)  {
    v128_t va, vb, vc;
    va = wasm_v128_load(a);
    vb = wasm_v128_load(b);
    vc = wasm_v128_load(c);
    va = wasm_f32x4_mul(va, vb);
    va = wasm_f32x4_add(va, vc);
    wasm_v128_store(dest, va);
}

void fma_operator(float * a, float * b, float * c, float * dest)  {
    __f32x4 va, vb, vc;
    va = wasm_v128_load(a);
    vb = wasm_v128_load(b);
    vc = wasm_v128_load(c);
    va = (va * vb) + vc;
    wasm_v128_store(dest, va);
}

void fma_buildin(float * a, float * b, float * c, float * dest)  {
    __f32x4 va, vb, vc;
    va = wasm_v128_load(a);
    vb = wasm_v128_load(b);
    vc = wasm_v128_load(c);
    va = __builtin_elementwise_fma(va, vb, vc);
    wasm_v128_store(dest, va);
}

void fma_expected(float * a, float * b, float * c, float * dest)  {
    __f32x4 va, vb, vc;
    va = wasm_v128_load(a);
    vb = wasm_v128_load(b);
    vc = wasm_v128_load(c);
    va = __builtin_wasm_relaxed_madd_f32x4(va, vb, vc);
    wasm_v128_store(dest, va);
}

Flags: -O3 -msimd128 -mrelaxed-simd -ffast-math

Expected:
All implementations are optimized to the same code as the intrinsic version, i.e. a single f32x4.relaxed_madd.

fma_expected:
        local.get       3
        local.get       0
        v128.load       0:p2align=0
        local.get       1
        v128.load       0:p2align=0
        local.get       2
        v128.load       0:p2align=0
        f32x4.relaxed_madd
        v128.store      0:p2align=0
        end_function

Actual:

fma_inst:
        local.get       3
        local.get       1
        v128.load       0:p2align=0
        local.get       0
        v128.load       0:p2align=0
        f32x4.mul
        local.get       2
        v128.load       0:p2align=0
        f32x4.add
        v128.store      0:p2align=0
        end_function

fma_operator:
        local.get       3
        local.get       1
        v128.load       0:p2align=0
        local.get       0
        v128.load       0:p2align=0
        f32x4.mul
        local.get       2
        v128.load       0:p2align=0
        f32x4.add
        v128.store      0:p2align=0
        end_function

fma_buildin:
        local.get       3
        local.get       0
        v128.load       0:p2align=0
        local.tee       4
        f32x4.extract_lane      0
        local.get       1
        v128.load       0:p2align=0
        local.tee       5
        f32x4.extract_lane      0
        local.get       2
        v128.load       0:p2align=0
        local.tee       6
        f32x4.extract_lane      0
        call    fmaf
        f32x4.splat
        local.get       4
        f32x4.extract_lane      1
        local.get       5
        f32x4.extract_lane      1
        local.get       6
        f32x4.extract_lane      1
        call    fmaf
        f32x4.replace_lane      1
        local.get       4
        f32x4.extract_lane      2
        local.get       5
        f32x4.extract_lane      2
        local.get       6
        f32x4.extract_lane      2
        call    fmaf
        f32x4.replace_lane      2
        local.get       4
        f32x4.extract_lane      3
        local.get       5
        f32x4.extract_lane      3
        local.get       6
        f32x4.extract_lane      3
        call    fmaf
        f32x4.replace_lane      3
        v128.store      0:p2align=0
        end_function

Other info:
For comparison, this fusion does happen on x86: https://godbolt.org/z/jYKMEq4rM and arm64: https://godbolt.org/z/z5z4fdd7M

@llvmbot
Copy link
Member

llvmbot commented Dec 30, 2024

@llvm/issue-subscribers-backend-webassembly

Author: None (kzhsw)

Env: Compiler explorer (<https://godbolt.org/z/Yxc6e8sd4>) Version: WebAssembly clang (trunk)

Code:
<details><summary>Code</summary>
<p>

#include <wasm_simd128.h>

void fma_inst(float * a, float * b, float * c, float * dest)  {
    v128_t va, vb, vc;
    va = wasm_v128_load(a);
    vb = wasm_v128_load(b);
    vc = wasm_v128_load(c);
    va = wasm_f32x4_mul(va, vb);
    va = wasm_f32x4_add(va, vc);
    wasm_v128_store(dest, va);
}

void fma_operator(float * a, float * b, float * c, float * dest)  {
    __f32x4 va, vb, vc;
    va = wasm_v128_load(a);
    vb = wasm_v128_load(b);
    vc = wasm_v128_load(c);
    va = (va * vb) + vc;
    wasm_v128_store(dest, va);
}

void fma_buildin(float * a, float * b, float * c, float * dest)  {
    __f32x4 va, vb, vc;
    va = wasm_v128_load(a);
    vb = wasm_v128_load(b);
    vc = wasm_v128_load(c);
    va = __builtin_elementwise_fma(va, vb, vc);
    wasm_v128_store(dest, va);
}

void fma_expected(float * a, float * b, float * c, float * dest)  {
    __f32x4 va, vb, vc;
    va = wasm_v128_load(a);
    vb = wasm_v128_load(b);
    vc = wasm_v128_load(c);
    va = __builtin_wasm_relaxed_madd_f32x4(va, vb, vc);
    wasm_v128_store(dest, va);
}

</p>
</details>

Flags: -O3 -msimd128 -mrelaxed-simd -ffast-math

Expected:
All implementations are optimized to the same code as the intrinsic version, i.e. a single f32x4.relaxed_madd.

fma_expected:
        local.get       3
        local.get       0
        v128.load       0:p2align=0
        local.get       1
        v128.load       0:p2align=0
        local.get       2
        v128.load       0:p2align=0
        f32x4.relaxed_madd
        v128.store      0:p2align=0
        end_function

Actual:

fma_inst:
        local.get       3
        local.get       1
        v128.load       0:p2align=0
        local.get       0
        v128.load       0:p2align=0
        f32x4.mul
        local.get       2
        v128.load       0:p2align=0
        f32x4.add
        v128.store      0:p2align=0
        end_function

fma_operator:
        local.get       3
        local.get       1
        v128.load       0:p2align=0
        local.get       0
        v128.load       0:p2align=0
        f32x4.mul
        local.get       2
        v128.load       0:p2align=0
        f32x4.add
        v128.store      0:p2align=0
        end_function

fma_buildin:
        local.get       3
        local.get       0
        v128.load       0:p2align=0
        local.tee       4
        f32x4.extract_lane      0
        local.get       1
        v128.load       0:p2align=0
        local.tee       5
        f32x4.extract_lane      0
        local.get       2
        v128.load       0:p2align=0
        local.tee       6
        f32x4.extract_lane      0
        call    fmaf
        f32x4.splat
        local.get       4
        f32x4.extract_lane      1
        local.get       5
        f32x4.extract_lane      1
        local.get       6
        f32x4.extract_lane      1
        call    fmaf
        f32x4.replace_lane      1
        local.get       4
        f32x4.extract_lane      2
        local.get       5
        f32x4.extract_lane      2
        local.get       6
        f32x4.extract_lane      2
        call    fmaf
        f32x4.replace_lane      2
        local.get       4
        f32x4.extract_lane      3
        local.get       5
        f32x4.extract_lane      3
        local.get       6
        f32x4.extract_lane      3
        call    fmaf
        f32x4.replace_lane      3
        v128.store      0:p2align=0
        end_function

Other info:
For comparison, this fusion does happen on x86: <https://godbolt.org/z/jYKMEq4rM>

@ppenzin
Copy link
Contributor

ppenzin commented Dec 31, 2024

If I recall correctly, that is intentional; i.e., relaxed SIMD instructions wouldn't be produced via fast-math plus vectorization.

@kzhsw
Copy link
Author

kzhsw commented Jan 2, 2025

If I recall correctly that is intentional, i.e. relaxed simd wouldn't be produced via fast math + vectorization.

It seems the linked issue means that fast-math alone would not auto-enable relaxed-simd, but here I manually enabled relaxed-simd via -mrelaxed-simd. Is this also a case where the fusion should not be performed?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

4 participants