Skip to content

Commit

Permalink
Make utf8_length_from_utf32 branchless.
Browse files Browse the repository at this point in the history
This helps GCC auto-vectorize the function's loop:
https://compiler-explorer.com/z/61xhGE784
So, instead of
```
# GCC output for the original if/else-if version: a scalar loop with one
# conditional branch per UTF-8 length class.
# In:  rdi = pointer to the char32_t buffer, rsi = number of code points
# Out: rax = total number of UTF-8 bytes
# Inside the loop: rdi = cursor, rcx = end pointer, rdx = running byte count.
utf8_length_from_utf32(char32_t const*, unsigned long):
        test    rsi, rsi
        je      .L8                     # empty input -> return 0
        lea     rcx, [rdi+rsi*4]        # rcx = one past the last code point
        xor     edx, edx                # running byte count = 0
        jmp     .L7
.L11:
        add     rdx, 1                  # code point <= 0x7F: one byte
.L4:
        add     rdi, 4                  # advance to the next code point
        cmp     rcx, rdi
        je      .L1                     # reached the end
.L7:
        mov     eax, DWORD PTR [rdi]    # eax = current code point
        cmp     eax, 127
        jbe     .L11                    # <= 0x7F (unsigned compare)
        cmp     eax, 2047
        ja      .L5                     # > 0x7FF: three or four bytes
        add     rdi, 4                  # 0x80..0x7FF: two bytes, advance inline
        add     rdx, 2
        cmp     rcx, rdi
        jne     .L7
.L1:
        mov     rax, rdx                # return the running byte count
        ret
.L5:
        mov     rsi, rdx                # rsi = count so far (len is no longer needed)
        xor     edx, edx
        cmp     eax, 65535
        seta    dl                      # dl = 1 if code point > 0xFFFF, else 0
        lea     rdx, [rdx+3+rsi]        # count += 3, plus 1 more when > 0xFFFF
        jmp     .L4
.L8:
        xor     edx, edx
        mov     rax, rdx                # return 0
        ret
```
GCC now generates the following vectorized code:
```
# GCC output for the branchless version: an SSE2 loop that handles four code
# points per iteration, followed by an unrolled scalar tail for the remaining
# 0-3 code points.
# In:  rdi = pointer to the char32_t buffer, rsi = number of code points
# Out: rax = total number of UTF-8 bytes
# Each code point contributes 1 + (cp > 0x7F) + (cp > 0x7FF) + (cp > 0xFFFF).
utf8_length_from_utf32(char32_t const*, unsigned long):
        mov     rcx, rsi                # rcx = len, kept for the tail checks
        test    rsi, rsi
        je      .L7                     # len == 0 -> return 0
        lea     rax, [rsi-1]
        cmp     rax, 2
        jbe     .L8                     # len <= 3: skip the vector loop entirely
        mov     rdx, rsi
        pxor    xmm5, xmm5              # xmm5 = 0: 64-bit sums for the upper two lanes
        mov     rax, rdi                # rax = read cursor for the vector loop
        movdqa  xmm4, XMMWORD PTR .LC1[rip]     # {1,1,1,1}: turns compare masks into 0/1
        shr     rdx, 2
        movdqa  xmm3, xmm5              # xmm3 = 0: 64-bit sums for the lower two lanes
        movdqa  xmm10, XMMWORD PTR .LC2[rip]    # 0x80000000 per lane: sign-bit bias
        movdqa  xmm6, XMMWORD PTR .LC5[rip]     # {1,1} as 64-bit: the base byte per code point
        sal     rdx, 4                  # rdx = (len / 4) * 16 bytes handled by the loop
        movdqa  xmm7, XMMWORD PTR .LC8[rip]     # biased 0xFFFF threshold
        pxor    xmm2, xmm2              # zero vector used to zero-extend in the unpacks
        movdqa  xmm9, XMMWORD PTR .LC6[rip]     # biased 0x7F threshold
        movdqa  xmm8, XMMWORD PTR .LC7[rip]     # biased 0x7FF threshold
        add     rdx, rdi                # rdx = end address of the vectorized region
.L4:
        movdqu  xmm0, XMMWORD PTR [rax] # load four code points (unaligned load)
        add     rax, 16
        psubd   xmm0, xmm10             # bias so the signed pcmpgtd acts as an unsigned compare
        movdqa  xmm1, xmm0
        movdqa  xmm11, xmm0
        pcmpgtd xmm1, xmm9              # lane mask: cp > 0x7F
        pcmpgtd xmm11, xmm8             # lane mask: cp > 0x7FF
        pcmpgtd xmm0, xmm7              # lane mask: cp > 0xFFFF
        pand    xmm1, xmm4              # masks -> 0/1 per lane
        pand    xmm11, xmm4
        movdqa  xmm13, xmm1
        movdqa  xmm12, xmm11
        punpckhdq       xmm1, xmm2      # widen upper two lanes of (cp > 0x7F) to 64-bit
        pand    xmm0, xmm4
        punpckhdq       xmm11, xmm2     # widen upper two lanes of (cp > 0x7FF)
        punpckldq       xmm13, xmm2     # widen lower two lanes of (cp > 0x7F)
        paddq   xmm11, xmm1             # upper lanes: (cp > 0x7F) + (cp > 0x7FF)
        movdqa  xmm1, xmm0
        punpckldq       xmm12, xmm2     # widen lower two lanes of (cp > 0x7FF)
        punpckldq       xmm1, xmm2      # widen lower two lanes of (cp > 0xFFFF)
        punpckhdq       xmm0, xmm2      # widen upper two lanes of (cp > 0xFFFF)
        paddq   xmm12, xmm13            # lower lanes: (cp > 0x7F) + (cp > 0x7FF)
        paddq   xmm1, xmm6              # add the base byte for each code point
        paddq   xmm0, xmm6
        paddq   xmm1, xmm12             # lower lanes: full per-code-point length
        paddq   xmm0, xmm11             # upper lanes: full per-code-point length
        paddq   xmm3, xmm1              # accumulate into the running sums
        paddq   xmm5, xmm0
        cmp     rax, rdx
        jne     .L4
        paddq   xmm3, xmm5              # merge the two accumulators
        movdqa  xmm0, xmm3
        psrldq  xmm0, 8
        paddq   xmm3, xmm0              # horizontal add of the two 64-bit halves
        movq    rax, xmm3               # rax = bytes counted by the vector loop
        test    cl, 3
        je      .L1                     # len is a multiple of 4: nothing left
        mov     rdx, rcx
        and     rdx, -4                 # rdx = index of the first unprocessed code point
.L3:
        # Scalar tail, fully unrolled for at most three code points.
        # rax = count so far, rdx = index of the first tail element, rcx = len.
        mov     r8d, DWORD PTR [rdi+rdx*4]      # first tail code point
        xor     esi, esi
        lea     r9, [0+rdx*4]           # r9 = byte offset of the first tail element
        cmp     r8d, 127
        seta    sil
        lea     rax, [rax+1+rsi]        # count += 1 + (cp > 0x7F)
        xor     esi, esi
        cmp     r8d, 2047
        seta    sil
        add     rsi, rax                # ... + (cp > 0x7FF)
        xor     eax, eax
        cmp     r8d, 65535
        seta    al
        add     rax, rsi                # ... + (cp > 0xFFFF)
        lea     rsi, [rdx+1]
        cmp     rsi, rcx
        jnb     .L1                     # no second tail element
        mov     r8d, DWORD PTR [rdi+4+r9]       # second tail code point
        xor     r10d, r10d
        cmp     r8d, 127
        seta    r10b
        xor     esi, esi
        cmp     r8d, 2047
        seta    sil
        cmp     r8d, 65535
        seta    r8b
        lea     rsi, [r10+1+rsi]        # 1 + (cp > 0x7F) + (cp > 0x7FF)
        add     rdx, 2
        movzx   r8d, r8b
        add     rsi, r8                 # ... + (cp > 0xFFFF)
        add     rax, rsi
        cmp     rdx, rcx
        jnb     .L1                     # no third tail element
        mov     ecx, DWORD PTR [rdi+8+r9]       # third tail code point (len is no longer needed)
        xor     esi, esi
        cmp     ecx, 2047
        seta    sil
        xor     edx, edx
        cmp     ecx, 127
        seta    dl
        cmp     ecx, 65535
        seta    cl
        lea     rdx, [rsi+1+rdx]        # 1 + (cp > 0x7F) + (cp > 0x7FF)
        movzx   ecx, cl
        add     rdx, rcx                # ... + (cp > 0xFFFF)
        add     rax, rdx
        ret
.L7:
        xor     eax, eax                # empty input: return 0
.L1:
        ret
.L8:
        xor     edx, edx                # short input (len 1..3): tail starts at index 0
        xor     eax, eax                # ... with a byte count of 0
        jmp     .L3
# Four 32-bit ones: ANDed with the compare masks to get 0/1 per lane.
.LC1:
        .long   1
        .long   1
        .long   1
        .long   1
# 0x80000000 per lane: subtracted from each code point before the signed compares.
.LC2:
        .long   -2147483648
        .long   -2147483648
        .long   -2147483648
        .long   -2147483648
# Two 64-bit ones: the one byte every code point contributes.
.LC5:
        .quad   1
        .quad   1
# 0x7F - 0x80000000: the biased two-byte threshold.
.LC6:
        .long   -2147483521
        .long   -2147483521
        .long   -2147483521
        .long   -2147483521
# 0x7FF - 0x80000000: the biased three-byte threshold.
.LC7:
        .long   -2147481601
        .long   -2147481601
        .long   -2147481601
        .long   -2147481601
# 0xFFFF - 0x80000000: the biased four-byte threshold.
.LC8:
        .long   -2147418113
        .long   -2147418113
        .long   -2147418113
        .long   -2147418113
```
  • Loading branch information
ttsugriy committed Aug 29, 2023
1 parent cf504eb commit 4b35b67
Showing 1 changed file with 5 additions and 17 deletions.
22 changes: 5 additions & 17 deletions src/ada_idna.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,23 +108,11 @@ size_t utf8_length_from_utf32(const char32_t* buf, size_t len) {
// We are not BOM aware.
const uint32_t* p = reinterpret_cast<const uint32_t*>(buf);
size_t counter{0};
for (size_t i = 0; i < len; i++) {
/** ASCII **/
if (p[i] <= 0x7F) {
counter++;
}
/** two-byte **/
else if (p[i] <= 0x7FF) {
counter += 2;
}
/** three-byte **/
else if (p[i] <= 0xFFFF) {
counter += 3;
}
/** four-bytes **/
else {
counter += 4;
}
for (size_t i = 0; i != len; ++i) {
++counter; /** ASCII **/
counter += (p[i] > 0x7F); /** two-byte **/
counter += (p[i] > 0x7FF); /** three-byte **/
counter += (p[i] > 0xFFFF); /** four-bytes **/
}
return counter;
}
Expand Down

0 comments on commit 4b35b67

Please sign in to comment.