Description
1. go version go1.7beta1 windows/amd64
2.
set GOARCH=amd64
set GOBIN=
set GOEXE=.exe
set GOHOSTARCH=amd64
set GOHOSTOS=windows
set GOOS=windows
set GOPATH=F:\Go
set GORACE=
set GOROOT=F:\Go
set GOTOOLDIR=F:\Go\pkg\tool\windows_amd64
set CC=gcc
set GOGCCFLAGS=-m64 -mthreads -fmessage-length=0 -fdebug-prefix-map=C:\Users\Super\AppData\Local\Temp\go-build211527254=/tmp/go-build -gno-record-gcc-switches
set CXX=g++
set CGO_ENABLED=1
3. Runnable program:
// PrimeSpeed project PrimeSpeed.go
package main
import (
"fmt"
"math"
"time"
// "unsafe"
)
// mkCLUT builds a 65536-entry lookup table in which entry w holds the number
// of ZERO bits in the 16-bit value w (so entry 0 is 16 and entry 0xFFFF is 0).
func mkCLUT() [65536]byte {
	var table [65536]byte
	for word := range table {
		// Invert the low 16 bits so the zero bits become one bits, then
		// count them by clearing the lowest set bit until none remain.
		inverted := uint16(word ^ 0xFFFF)
		zeros := byte(0)
		for inverted != 0 {
			inverted &= inverted - 1
			zeros++
		}
		table[word] = zeros
	}
	return table
}
// cnstCLUT is the table returned by mkCLUT, computed once when the package's
// variables are initialised; primesTest indexes it with 16-bit halves of each
// sieve word.
var cnstCLUT = mkCLUT()
func primesTest(top uint) int {
lmtndx := (top - 3) >> 1
lstw := lmtndx >> 5
lmt := lstw + 1
topsqrtndx := (int(math.Sqrt(float64(top))) - 3) >> 1
cmpsts := make([]uint32, lstw+1)
// start := uintptr(unsafe.Pointer(&cmpsts[0]))
// step := unsafe.Sizeof(cmpsts[0])
for i := 0; i <= topsqrtndx; i++ {
if cmpsts[i>>5]&(uint32(1)<<uint(i)) == 0 {
p := (uint(i) << 1) + 3
for j := (p*p - 3) >> 1; j <= topi; j += p {
cmpsts[j>>5] |= 1 << (j & 31)
}
// p := uintptr((uint(i) << 1) + 3)
// lmt := uintptr(lmtndx)
// for j := (p*p - 3) >> 1; j <= lmt; j += p {
// *(*uint)(unsafe.Pointer(start + step*(j>>5))) |= 1 << (j & 31)
// }
}
}
msk := uint32(0xFFFFFFFE) << (lmtndx & 31)
cmpsts[lstw] |= msk
cnt := 1
for i := uint(0); i <= lstw; i++ {
v := cmpsts[i]
cnt += int(cnstCLUT[v&0xFFFF] + cnstCLUT[0xFFFF&(v>>16)])
}
return cnt
}
// main calls primesTest 1000 times with the same limit, accumulates the
// returned counts, and prints the accumulated total together with the
// wall-clock time taken by the whole loop.
func main() {
	const reps = 1000
	limit := uint(262146)
	total := 0
	began := time.Now()
	for r := 0; r < reps; r++ {
		total += primesTest(limit)
	}
	elapsed := time.Since(began)
	fmt.Println("Found", total, "primes up to", limit, "in", elapsed, ".")
}
play.golang.org link: https://play.golang.org/p/_E5R5JAlGW
4. When "go tool compile -S PrimeSpeed.go > PrimeSpeed.s" is run, the inner tight composite number culling loop as quoted below:
line 36 for j := (p*p - 3) >> 1; j <= topi; j += p {
line 37 cmpsts[j>>5] |= 1 << (j & 31)
line 38 }
looks like the following assembly code from PrimeSpeed.s:
0x00f1 00241 (Main.go:37) MOVQ R8, CX
0x00f4 00244 (Main.go:37) SHRQ $5, R8
0x00f8 00248 (Main.go:37) CMPQ R8, DX
0x00fb 00251 (Main.go:37) JCC $0, 454
0x0101 00257 (Main.go:37) MOVL (AX)(R8*4), R10
0x0105 00261 (Main.go:37) MOVQ CX, R11
0x0108 00264 (Main.go:37) ANDQ $31, CX
0x010c 00268 (Main.go:37) MOVL R9, R12 **;; saves 1 to r12**
0x010f 00271 (Main.go:37) SHLL CX, R9
0x0112 00274 (Main.go:37) ORL R10, R9
0x0115 00277 (Main.go:37) MOVL R9, (AX)(R8*4)
0x0119 00281 (Main.go:36) LEAQ 3(R11)(DI*2), R8
0x011e 00286 (Main.go:37) MOVL R12, R9 **;; restores 1 to r9 from r12**
0x0121 00289 (Main.go:36) CMPQ R8, BX
0x0124 00292 (Main.go:36) JLS $0, 241
5. I expected to see:
0x00f1 00241 (Main.go:37) MOVQ R8, CX
0x00f4 00244 (Main.go:37) SHRQ $5, R8
0x00f8 00248 (Main.go:37) CMPQ R8, DX ;; array bounds check, only if no -B option
0x00fb 00251 (Main.go:37) JCC $0, 454 ;; panic if out of bounds
0x0101 00257 (Main.go:37) MOVL (AX)(R8*4), R10
0x0105 00261 (Main.go:37) MOVQ CX, R11
0x0108 00264 (Main.go:37) ANDQ $31, CX
(Main.go:37) MOVL $1,R9 **;; IMMEDIATE LOAD OF 1 to R9**
(Main.go:37) SHLL CX, R9
(Main.go:37) ORL R10, R9
(Main.go:37) MOVL R9, (AX)(R8*4)
(Main.go:36) LEAQ 3(R11)(DI*2), R8
(Main.go:36) CMPQ R8, BX
(Main.go:36) JLS $0, 241
Even better would be code that does not recalculate p = 2 * i + 3 (and thus compute j = j + 2 * i + 3) inside the inner loop, but instead adds a pre-calculated 'p' to 'j':
This version also changes the order of the instructions for the benefit of processors without out-of-order execution (OOE):
0x00f1 00241 (Main.go:37) MOVQ R8, R11
0x0105 00261 (Main.go:37) MOVQ R8, CX
0x00f4 00244 (Main.go:37) SHRQ $5, R11
0x0108 00264 (Main.go:37) ANDQ $31, CX
0x00f8 00248 (Main.go:37) CMPQ R11, DX ;; array bounds check, only if no -B option
0x00fb 00251 (Main.go:37) JCC $0, 454 ;; panic if out of bounds
0x0101 00257 (Main.go:37) MOVL (AX)(R11*4), R10
0x010c 00268 (Main.go:37) MOVL $1,R9 **;; IMMEDIATE LOAD OF 1 to R9**
0x010f 00271 (Main.go:37) SHLL CX, R9
0x0112 00274 (Main.go:37) ORL R10, R9
0x0119 00281 (Main.go:36) ADDL R12, R8 **;; ADD PRE-CALCULATED 'p' in R12 to 'j'**
0x0115 00277 (Main.go:37) MOVL R9, (AX)(R11*4)
0x0121 00289 (Main.go:36) CMPQ R8, BX
0x0124 00292 (Main.go:36) JLS $0, 241
The following is the same loop without bounds checks generated for C/C++ with the Visual Studio compiler (intel assembler format):
$Loop:
mov edx, esi
mov ecx, esi
shr edx, 5
and ecx, 31 ; 0000001fH
mov eax, 1
add esi, edi ; add 'p' in edi to 'j'
shl eax, cl
or DWORD PTR [ebx+edx*4], eax
cmp esi, ebp ; ebp contains 'topi'
jbe SHORT $Loop
Note that the above uses a total of seven registers for this inner loop, so the same code is generated for x86 and x64 compilations. Unfortunately, it takes another register to hold the upper array bound for a range check and the x86 architecture can only have seven available; however, it is possible to slightly change the code as follows:
line 36 bnds := lstw + 1
line 37 k := (p*p - 3) >> 1
line 38 for j, w := k, k >> 5; w < bnds; w = j >> 5 {
line 39 cmpsts[w] |= 1 << (j & 31)
line 40 j += p
line 41 }
which for Visual Studio C/C++ generates the following same number of instructions in the loop:
$Loop:
mov ecx, edx
mov eax, 1
and ecx, 31 ; 0000001fH
add edx, edi ; add 'p' in edi to 'j'
shl eax, cl
or DWORD PTR [ebx+esi*4], eax
**$Start:**
mov esi, edx
shr esi, 5
cmp esi, ebp **; ebp contains the array length, bnds**
jb SHORT $Loop **; this is the same as doing an array bounds check**
and it can be seen that the array bounds check is now done at the same time as the loop completion check; it should be a simple matter to clue the compiler that 'bnds' contains the array length, perhaps by assigning it inside the loop as len(cmpsts) as is done for C# x86 code so that it recognizes that the bounds check is already done. The start point of the loop could be the line after the "or" line at the $Start: label or an external check could be implemented to ensure that the bounds check is done for the first loop before the array is accessed as is done for the Visual Studio C/C++ compiler.
As demonstrated above, the golang code runs slower than C/C++ code by almost a factor of two on some x64 processors and by more than that factor on x86 processors. It also runs slightly slower than C#/Java for both architectures.