Skip to content

cmd/compile: complicated bounds check elimination #16092

Open
@GordonBGood

Description

@GordonBGood

1. go version go1.7beta1 windows/amd64

2.
set GOARCH=amd64

set GOBIN=
set GOEXE=.exe
set GOHOSTARCH=amd64
set GOHOSTOS=windows
set GOOS=windows
set GOPATH=F:\Go
set GORACE=
set GOROOT=F:\Go
set GOTOOLDIR=F:\Go\pkg\tool\windows_amd64
set CC=gcc
set GOGCCFLAGS=-m64 -mthreads -fmessage-length=0 -fdebug-prefix-map=C:\Users\Super\AppData\Local\Temp\go-build211527254=/tmp/go-build -gno-record-gcc-switches
set CXX=g++
set CGO_ENABLED=1

3. Runnable program:

// PrimeSpeed project PrimeSpeed.go
package main

import (
    "fmt"
    "math"
    "time"
    //    "unsafe"
)

// mkCLUT builds a 65536-entry lookup table in which entry i holds the number
// of zero bits among the low 16 bits of i (that is, the population count of
// i XOR 0xFFFF).
func mkCLUT() [65536]byte {
    var table [65536]byte
    for idx := range table {
        zeros := byte(0)
        // Invert the 16-bit value so that clearing the lowest set bit on
        // each pass counts the zero bits of the original value.
        for bits := uint16(idx) ^ 0xFFFF; bits != 0; bits &= bits - 1 {
            zeros++
        }
        table[idx] = zeros
    }
    return table
}

// cnstCLUT holds the table returned by mkCLUT, computed once during package
// initialization; primesTest indexes it with the 16-bit halves of each sieve
// word.
var cnstCLUT [65536]byte = mkCLUT()

func primesTest(top uint) int {
    lmtndx := (top - 3) >> 1
    lstw := lmtndx >> 5
    lmt := lstw + 1
    topsqrtndx := (int(math.Sqrt(float64(top))) - 3) >> 1
    cmpsts := make([]uint32, lstw+1)
    //    start := uintptr(unsafe.Pointer(&cmpsts[0]))
    //    step := unsafe.Sizeof(cmpsts[0])
    for i := 0; i <= topsqrtndx; i++ {
        if cmpsts[i>>5]&(uint32(1)<<uint(i)) == 0 {
            p := (uint(i) << 1) + 3
            for j := (p*p - 3) >> 1; j <= topi; j += p {
                cmpsts[j>>5] |= 1 << (j & 31)
            }
            //            p := uintptr((uint(i) << 1) + 3)
            //            lmt := uintptr(lmtndx)
            //            for j := (p*p - 3) >> 1; j <= lmt; j += p {
            //                *(*uint)(unsafe.Pointer(start + step*(j>>5))) |= 1 << (j & 31)
            //            }
        }
    }
    msk := uint32(0xFFFFFFFE) << (lmtndx & 31)
    cmpsts[lstw] |= msk
    cnt := 1
    for i := uint(0); i <= lstw; i++ {
        v := cmpsts[i]
        cnt += int(cnstCLUT[v&0xFFFF] + cnstCLUT[0xFFFF&(v>>16)])
    }
    return cnt
}

// main runs primesTest 1000 times with an upper limit of 262146, sums the
// returned counts, and prints that sum together with the elapsed wall-clock
// time for all runs.
func main() {
    const reps = 1000
    limit := uint(262146)

    begin := time.Now()

    total := 0
    for r := 0; r < reps; r++ {
        total += primesTest(limit)
    }

    elapsed := time.Since(begin)

    fmt.Println("Found", total, "primes up to", limit, "in", elapsed, ".")
}

play.golang.org link: https://play.golang.org/p/_E5R5JAlGW

4. When "go tool compile -S PrimeSpeed.go > PrimeSpeed.s" is run, the inner tight composite number culling loop as quoted below:

    line 36     for j := (p*p - 3) >> 1; j <= topi; j += p {
    line 37         cmpsts[j>>5] |= 1 << (j & 31)
    line 38     }

looks like the following assembly code from PrimeSpeed.s:

    0x00f1 00241 (Main.go:37)   MOVQ    R8, CX
    0x00f4 00244 (Main.go:37)   SHRQ    $5, R8
    0x00f8 00248 (Main.go:37)   CMPQ    R8, DX
    0x00fb 00251 (Main.go:37)   JCC $0, 454
    0x0101 00257 (Main.go:37)   MOVL    (AX)(R8*4), R10
    0x0105 00261 (Main.go:37)   MOVQ    CX, R11
    0x0108 00264 (Main.go:37)   ANDQ    $31, CX
    0x010c 00268 (Main.go:37)   MOVL    R9, R12 **;; saves 1 to r12**
    0x010f 00271 (Main.go:37)   SHLL    CX, R9
    0x0112 00274 (Main.go:37)   ORL R10, R9
    0x0115 00277 (Main.go:37)   MOVL    R9, (AX)(R8*4)
    0x0119 00281 (Main.go:36)   LEAQ    3(R11)(DI*2), R8
    0x011e 00286 (Main.go:37)   MOVL    R12, R9 **;; restores 1 to r9 from r12**
    0x0121 00289 (Main.go:36)   CMPQ    R8, BX
    0x0124 00292 (Main.go:36)   JLS $0, 241

5. I expected to see:

    0x00f1 00241 (Main.go:37)   MOVQ    R8, CX
    0x00f4 00244 (Main.go:37)   SHRQ    $5, R8
    0x00f8 00248 (Main.go:37)   CMPQ    R8, DX ;; array bounds check, only if no -B option
    0x00fb 00251 (Main.go:37)   JCC $0, 454 ;; panic if out of bounds
    0x0101 00257 (Main.go:37)   MOVL    (AX)(R8*4), R10
    0x0105 00261 (Main.go:37)   MOVQ    CX, R11
    0x0108 00264 (Main.go:37)   ANDQ    $31, CX
                            (Main.go:37)    MOVL    $1,R9 **;; IMMEDIATE LOAD OF 1 to R9**
                            (Main.go:37)    SHLL    CX, R9
                            (Main.go:37)    ORL R10, R9
                            (Main.go:37)    MOVL    R9, (AX)(R8*4)
                            (Main.go:36)    LEAQ    3(R11)(DI*2), R8
                            (Main.go:36)    CMPQ    R8, BX
                            (Main.go:36)    JLS $0, 241

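Even better would be code that does not recompute p = 2 * i + 3 (and thus j = j + 2 * i + 3) inside the inner loop, but instead adds a pre-calculated 'p' to 'j'.
The listing below also reorders the instructions for the benefit of processors without out-of-order execution (OOE):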

    0x00f1 00241 (Main.go:37)   MOVQ    R8, R11
    0x0105 00261 (Main.go:37)   MOVQ    R8, CX
    0x00f4 00244 (Main.go:37)   SHRQ    $5, R11
    0x0108 00264 (Main.go:37)   ANDQ    $31, CX
    0x00f8 00248 (Main.go:37)   CMPQ    R11, DX ;; array bounds check, only if no -B option
    0x00fb 00251 (Main.go:37)   JCC $0, 454 ;; panic if out of bounds
    0x0101 00257 (Main.go:37)   MOVL    (AX)(R11*4), R10
    0x010c 00268 (Main.go:37)   MOVL    $1,R9 **;; IMMEDIATE LOAD OF 1 to R9**
    0x010f 00271 (Main.go:37)   SHLL    CX, R9
    0x0112 00274 (Main.go:37)   ORL R10, R9
    0x0119 00281 (Main.go:36)   ADDL    R12, R8 **;; ADD PRE-CALCULATED 'p' in R12 to 'j'**
    0x0115 00277 (Main.go:37)   MOVL    R9, (AX)(R11*4)
    0x0121 00289 (Main.go:36)   CMPQ    R8, BX
    0x0124 00292 (Main.go:36)   JLS $0, 241

The following is the same loop without bounds checks generated for C/C++ with the Visual Studio compiler (intel assembler format):

    $Loop:
        mov edx, esi
        mov ecx, esi
        shr edx, 5
        and ecx, 31                 ; 0000001fH
        mov eax, 1
        add esi, edi                    ; add 'p' in edi to 'j'
        shl eax, cl
        or  DWORD PTR [ebx+edx*4], eax
        cmp esi, ebp                    ; ebp contains 'topi'
        jbe SHORT $Loop

Note that the above uses a total of seven registers for this inner loop, so the same code is generated for x86 and x64 compilations. Unfortunately, it takes another register to hold the upper array bound for a range check and the x86 architecture can only have seven available; however, it is possible to slightly change the code as follows:

    line 36            bnds := lstw + 1
    line 37            k := (p*p - 3) >> 1
    line 38            for j, w := k, k >> 5; w < bnds; w = j >> 5 {
    line 39                cmpsts[w] |= 1 << (j & 31)
    line 40                j += p
    line 41            }

which for Visual Studio C/C++ generates the following same number of instructions in the loop:

    $Loop:
        mov ecx, edx
        mov eax, 1
        and ecx, 31                 ; 0000001fH
        add edx, edi                    ; add 'p' in edi to 'j'
        shl eax, cl
        or  DWORD PTR [ebx+esi*4], eax
    **$Start:**
        mov esi, edx
        shr esi, 5
        cmp esi, ebp                    **; ebp contains the array length, bnds**
        jb  SHORT $Loop             **; this is the same as doing an array bounds check**

and it can be seen that the array bounds check is now done at the same time as the loop completion check; it should be a simple matter to clue the compiler that 'bnds' contains the array length, perhaps by assigning it inside the loop as len(cmpsts) as is done for C# x86 code so that it recognizes that the bounds check is already done. The start point of the loop could be the line after the "or" line at the $Start: label or an external check could be implemented to ensure that the bounds check is done for the first loop before the array is accessed as is done for the Visual Studio C/C++ compiler.

As demonstrated above, the golang code runs slower than C/C++ code by almost a factor of two on some x64 processors, and by more than that factor on x86 processors. It also runs slightly slower than C#/Java on both architectures.

Metadata

Metadata

Assignees

No one assigned

    Labels

    NeedsFix (the path to resolution is known, but the work has not been done), Performance

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions