Skip to content

Commit f596710

Browse files
suryasaimadhu
authored and
Ingo Molnar
committed
x86/hweight: Get rid of the special calling convention
People complained about ARCH_HWEIGHT_CFLAGS and how it throws a wrench into kcov, lto, etc, experimentations. Add asm versions for __sw_hweight{32,64}() and do explicit saving and restoring of clobbered registers. This gets rid of the special calling convention. We get to call those functions on !X86_FEATURE_POPCNT CPUs. We still need to hardcode POPCNT and register operands as some old gas versions which we support, do not know about POPCNT. Btw, remove redundant REX prefix from 32-bit POPCNT because alternatives can do padding now. Suggested-by: H. Peter Anvin <hpa@zytor.com> Signed-off-by: Borislav Petkov <bp@suse.de> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/1464605787-20603-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar <mingo@kernel.org>
1 parent 08dd8cd commit f596710

File tree

8 files changed

+97
-25
lines changed

8 files changed

+97
-25
lines changed

arch/x86/Kconfig

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -294,11 +294,6 @@ config X86_32_LAZY_GS
294294
def_bool y
295295
depends on X86_32 && !CC_STACKPROTECTOR
296296

297-
config ARCH_HWEIGHT_CFLAGS
298-
string
299-
default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
300-
default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
301-
302297
config ARCH_SUPPORTS_UPROBES
303298
def_bool y
304299

arch/x86/include/asm/arch_hweight.h

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
#include <asm/cpufeatures.h>
55

66
#ifdef CONFIG_64BIT
7-
/* popcnt %edi, %eax -- redundant REX prefix for alignment */
8-
#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
7+
/* popcnt %edi, %eax */
8+
#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7"
99
/* popcnt %rdi, %rax */
1010
#define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
1111
#define REG_IN "D"
@@ -17,19 +17,15 @@
1717
#define REG_OUT "a"
1818
#endif
1919

20-
/*
21-
* __sw_hweightXX are called from within the alternatives below
22-
* and callee-clobbered registers need to be taken care of. See
23-
* ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
24-
* compiler switches.
25-
*/
20+
#define __HAVE_ARCH_SW_HWEIGHT
21+
2622
static __always_inline unsigned int __arch_hweight32(unsigned int w)
2723
{
28-
unsigned int res = 0;
24+
unsigned int res;
2925

3026
asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
31-
: "="REG_OUT (res)
32-
: REG_IN (w));
27+
: "="REG_OUT (res)
28+
: REG_IN (w));
3329

3430
return res;
3531
}
@@ -53,11 +49,11 @@ static inline unsigned long __arch_hweight64(__u64 w)
5349
#else
5450
static __always_inline unsigned long __arch_hweight64(__u64 w)
5551
{
56-
unsigned long res = 0;
52+
unsigned long res;
5753

5854
asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
59-
: "="REG_OUT (res)
60-
: REG_IN (w));
55+
: "="REG_OUT (res)
56+
: REG_IN (w));
6157

6258
return res;
6359
}

arch/x86/kernel/i386_ksyms_32.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,5 @@ EXPORT_SYMBOL(empty_zero_page);
4242
EXPORT_SYMBOL(___preempt_schedule);
4343
EXPORT_SYMBOL(___preempt_schedule_notrace);
4444
#endif
45+
46+
EXPORT_SYMBOL(__sw_hweight32);

arch/x86/kernel/x8664_ksyms_64.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ EXPORT_SYMBOL(clear_page);
4444

4545
EXPORT_SYMBOL(csum_partial);
4646

47+
EXPORT_SYMBOL(__sw_hweight32);
48+
EXPORT_SYMBOL(__sw_hweight64);
49+
4750
/*
4851
* Export string functions. We normally rely on gcc builtin for most of these,
4952
* but gcc sometimes decides not to inline them.

arch/x86/lib/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ lib-y += memcpy_$(BITS).o
2525
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
2626
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
2727

28-
obj-y += msr.o msr-reg.o msr-reg-export.o
28+
obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
2929

3030
ifeq ($(CONFIG_X86_32),y)
3131
obj-y += atomic64_32.o

arch/x86/lib/hweight.S

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#include <linux/linkage.h>
2+
3+
#include <asm/asm.h>
4+
5+
/*
6+
* unsigned int __sw_hweight32(unsigned int w)
7+
* Input: %edi on 64-bit (caller's "D" constraint), %eax on 32-bit.
* Result in %eax.
*
* Software popcount fallback, reached through the ALTERNATIVE() in
* <asm/arch_hweight.h> when X86_FEATURE_POPCNT is not set.  The
* caller's inline asm declares no clobbers, so the only scratch
* register used here (%edx/%rdx) is saved and restored explicitly.
8+
*/
9+
ENTRY(__sw_hweight32)
10+
11+
#ifdef CONFIG_X86_64
12+
# 64-bit callers pass w in %edi; move it to %eax where the common
# 32-bit code below expects it.
movl %edi, %eax # w
13+
#endif
14+
# preserve caller's %edx/%rdx — the call site lists no clobbers
__ASM_SIZE(push,) %__ASM_REG(dx)
15+
# classic parallel ("SWAR") popcount: fold to 2-bit, 4-bit, then
# 8-bit partial counts, and sum the bytes with a multiply.
movl %eax, %edx # w -> t
16+
shrl %edx # t >>= 1
17+
andl $0x55555555, %edx # t &= 0x55555555
18+
subl %edx, %eax # w -= t
19+
20+
movl %eax, %edx # w -> t
21+
shrl $2, %eax # w_tmp >>= 2
22+
andl $0x33333333, %edx # t &= 0x33333333
23+
andl $0x33333333, %eax # w_tmp &= 0x33333333
24+
addl %edx, %eax # w = w_tmp + t
25+
26+
movl %eax, %edx # w -> t
27+
shrl $4, %edx # t >>= 4
28+
addl %edx, %eax # w_tmp += t
29+
andl $0x0f0f0f0f, %eax # w_tmp &= 0x0f0f0f0f
30+
# multiply by 0x01010101 accumulates all byte counts into the
# top byte, which the shift below extracts
imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101
31+
shrl $24, %eax # w = w_tmp >> 24
32+
__ASM_SIZE(pop,) %__ASM_REG(dx)
33+
ret
34+
ENDPROC(__sw_hweight32)
35+
36+
# unsigned long __sw_hweight64(__u64 w)
# Input: %rdi on 64-bit; the %edx:%eax pair on 32-bit.
# Result in %rax/%eax.  Same ALTERNATIVE() fallback contract as
# __sw_hweight32 above: the call site declares no clobbers.
ENTRY(__sw_hweight64)
37+
#ifdef CONFIG_X86_64
38+
pushq %rdx
39+
# NOTE(review): this path also modifies %rdi (subq/shrq below), but
# the caller's asm() in <asm/arch_hweight.h> passes w as an
# input-only "D" operand with an empty clobber list, so the compiler
# may assume %rdi survives the call.  %rdi needs a push/pop like
# %rdx — confirm against the inline-asm constraint rules.
40+
# same SWAR fold as the 32-bit version, with 64-bit masks loaded
# via movabsq into the scratch registers
movq %rdi, %rdx # w -> t
41+
movabsq $0x5555555555555555, %rax
42+
shrq %rdx # t >>= 1
43+
andq %rdx, %rax # t &= 0x5555555555555555
44+
movabsq $0x3333333333333333, %rdx
45+
subq %rax, %rdi # w -= t
46+
47+
movq %rdi, %rax # w -> t
48+
shrq $2, %rdi # w_tmp >>= 2
49+
andq %rdx, %rax # t &= 0x3333333333333333
50+
andq %rdi, %rdx # w_tmp &= 0x3333333333333333
51+
addq %rdx, %rax # w = w_tmp + t
52+
53+
movq %rax, %rdx # w -> t
54+
shrq $4, %rdx # t >>= 4
55+
addq %rdx, %rax # w_tmp += t
56+
movabsq $0x0f0f0f0f0f0f0f0f, %rdx
57+
andq %rdx, %rax # w_tmp &= 0x0f0f0f0f0f0f0f0f
58+
movabsq $0x0101010101010101, %rdx
59+
# byte-sum multiply; total count lands in the top byte of %rax
imulq %rdx, %rax # w_tmp *= 0x0101010101010101
60+
shrq $56, %rax # w = w_tmp >> 56
61+
62+
popq %rdx
63+
ret
64+
#else /* CONFIG_X86_32 */
65+
/* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
66+
# hweight64(w) = hweight32(low) + hweight32(high).  The first call
# leaves %edx (the high word) intact because __sw_hweight32 itself
# saves and restores %edx.
pushl %ecx
67+
68+
call __sw_hweight32
69+
movl %eax, %ecx # stash away result
70+
movl %edx, %eax # second part of input
71+
call __sw_hweight32
72+
addl %ecx, %eax # result
73+
74+
popl %ecx
75+
ret
76+
#endif
77+
ENDPROC(__sw_hweight64)

lib/Makefile

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,6 @@ KCOV_INSTRUMENT_rbtree.o := n
1515
KCOV_INSTRUMENT_list_debug.o := n
1616
KCOV_INSTRUMENT_debugobjects.o := n
1717
KCOV_INSTRUMENT_dynamic_debug.o := n
18-
# Kernel does not boot if we instrument this file as it uses custom calling
19-
# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
20-
KCOV_INSTRUMENT_hweight.o := n
2118

2219
lib-y := ctype.o string.o vsprintf.o cmdline.o \
2320
rbtree.o radix-tree.o dump_stack.o timerqueue.o\
@@ -74,8 +71,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
7471
obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
7572
obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
7673

77-
GCOV_PROFILE_hweight.o := n
78-
CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
7974
obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
8075

8176
obj-$(CONFIG_BTREE) += btree.o

lib/hweight.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* The Hamming Weight of a number is the total number of bits set in it.
1010
*/
1111

12+
#ifndef __HAVE_ARCH_SW_HWEIGHT
1213
unsigned int __sw_hweight32(unsigned int w)
1314
{
1415
#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
@@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w)
2526
#endif
2627
}
2728
EXPORT_SYMBOL(__sw_hweight32);
29+
#endif
2830

2931
unsigned int __sw_hweight16(unsigned int w)
3032
{
@@ -43,6 +45,7 @@ unsigned int __sw_hweight8(unsigned int w)
4345
}
4446
EXPORT_SYMBOL(__sw_hweight8);
4547

48+
#ifndef __HAVE_ARCH_SW_HWEIGHT
4649
unsigned long __sw_hweight64(__u64 w)
4750
{
4851
#if BITS_PER_LONG == 32
@@ -65,3 +68,4 @@ unsigned long __sw_hweight64(__u64 w)
6568
#endif
6669
}
6770
EXPORT_SYMBOL(__sw_hweight64);
71+
#endif

0 commit comments

Comments
 (0)