Commit bf9bf7a
cabac asm.
mostly because gcc refuses to use cmov.
28% faster than c on core2, 11% on k8, 6% on p4.
pengvado committed Mar 24, 2008

1 parent 3687987 commit bf9bf7a
Showing 4 changed files with 201 additions and 36 deletions.
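
For context on the commit message above: the hot path being moved to asm is the LPS/MPS update inside x264_cabac_encode_decision. Written as an if/else in C, gcc emits a conditional branch on the coded bit; the asm added below computes both outcomes and selects them with cmovne, which is where most of the speedup comes from. A minimal C sketch of the two forms, using stand-in names (range, low, range_lps, mps) rather than the actual struct fields, and paraphrased from the diff rather than taken from the committed code:

/* Sketch only -- not the committed code. */
static inline void cabac_update_branchy( int b, int mps, unsigned range_lps,
                                         unsigned *range, unsigned *low )
{
    unsigned range_mps = *range - range_lps;
    if( b == mps )
        *range = range_mps;            /* MPS: just shrink the range */
    else
    {
        *low  += range_mps;            /* LPS: move low, take the LPS range */
        *range = range_lps;
    }
}

/* Branchless equivalent: compute both candidates, then select.  This is the
 * form the asm expresses with two cmovne instructions. */
static inline void cabac_update_cmov( int b, int mps, unsigned range_lps,
                                      unsigned *range, unsigned *low )
{
    unsigned range_mps = *range - range_lps;
    unsigned low_lps   = *low + range_mps;
    *range = (b == mps) ? range_mps : range_lps;
    *low   = (b == mps) ? *low      : low_lps;
}

Either form is followed by the context-state transition and renormalization, which the asm keeps in the same function (the .renorm and .putbyte paths).
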
2 changes: 1 addition & 1 deletion Makefile
@@ -21,7 +21,7 @@ endif

# MMX/SSE optims
ifneq ($(AS),)
X86SRC0 = dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
X86SRC0 = cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
cpu-32.asm dct-32.asm
X86SRC = $(X86SRC0:%=common/x86/%)
6 changes: 4 additions & 2 deletions common/cabac.c
@@ -666,7 +666,7 @@ static const int8_t x264_cabac_context_init_PB[3][460][2] =

/* FIXME could avoid this duplication by reversing the order of states
* with MPS=0, but that would uglify the other tables */
static const uint8_t x264_cabac_range_lps[128][4] =
const uint8_t x264_cabac_range_lps[128][4] =
{
{ 2, 2, 2, 2 },
{ 6, 7, 8, 9 }, { 6, 7, 9, 10 }, { 6, 8, 9, 11 },
@@ -735,7 +735,7 @@ const uint8_t x264_cabac_transition[128][2] =
{100,121}, {100,122}, {101,123}, {101,124}, {101,125}, {102,126}, {102,126}, {127,127},
};

static const uint8_t x264_cabac_renorm_shift[64]= {
const uint8_t x264_cabac_renorm_shift[64]= {
6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -895,6 +895,7 @@ static inline void x264_cabac_encode_renorm( x264_cabac_t *cb )
x264_cabac_putbyte( cb );
}

#ifndef HAVE_MMX
void x264_cabac_encode_decision( x264_cabac_t *cb, int i_ctx, int b )
{
int i_state = cb->state[i_ctx];
@@ -908,6 +909,7 @@ void x264_cabac_encode_decision( x264_cabac_t *cb, int i_ctx, int b )
cb->state[i_ctx] = x264_cabac_transition[i_state][b];
x264_cabac_encode_renorm( cb );
}
#endif

void x264_cabac_encode_bypass( x264_cabac_t *cb, int b )
{
161 changes: 161 additions & 0 deletions common/x86/cabac-a.asm
@@ -0,0 +1,161 @@
;*****************************************************************************
;* cabac-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2008 x264 project
;*
;* Author: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************

%include "x86inc.asm"

SECTION .text

cextern x264_cabac_range_lps
cextern x264_cabac_transition
cextern x264_cabac_renorm_shift

%macro DEF_TMP 16
%rep 8
%define t%1d r%9d
%define t%1b r%9b
%define t%1 r%9
%rotate 1
%endrep
%endmacro

; t3 must be ecx, since it's used for shift.
%ifdef ARCH_X86_64
DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10
%define pointer 8
%else
DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3
%define pointer 4
%endif

%define cb.state r0+0
%define cb.low r0+464
%define cb.range r0+468
%define cb.queue r0+472
%define cb.bytes_outstanding r0+476
%define cb.p r0+480+pointer
%define cb.end r0+480+pointer*2

%macro LOAD_GLOBAL 4
%ifdef PIC64
; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
lea r11, [%2 GLOBAL]
%ifnidn %3, 0
add r11, %3
%endif
movzx %1, byte [r11+%4]
%elifdef PIC32
%ifnidn %3, 0
lea %1, [%3+%4]
movzx %1, byte [%2+%1 GLOBAL]
%else
movzx %1, byte [%2+%3+%4 GLOBAL]
%endif
%else
movzx %1, byte [%2+%3+%4]
%endif
%endmacro

cglobal x264_cabac_encode_decision, 0,7
movifnidn t0d, r0m
movifnidn t1d, r1m
picgetgot t2
mov t5d, [cb.range]
movzx t3d, byte [cb.state+t1]
mov t4d, t5d
shr t5d, 6
and t5d, 3
LOAD_GLOBAL t5d, x264_cabac_range_lps, t5, t3*4
sub t4d, t5d
mov t6d, t3d
shr t6d, 6
%ifdef PIC32
cmp t6d, r2m
%else
movifnidn t2d, r2m
cmp t6d, t2d
%endif
mov t6d, [cb.low]
lea t7, [t6+t4]
cmovne t4d, t5d
cmovne t6d, t7d
%ifdef PIC32
mov t1, r2m
LOAD_GLOBAL t3d, x264_cabac_transition, t1, t3*2
%else
LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
%endif
if32 mov t1d, r1m
mov [cb.state+t1], t3b
.renorm:
mov t3d, t4d
shr t3d, 3
LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
shl t4d, t3b
shl t6d, t3b
add t3d, [cb.queue]
mov [cb.range], t4d
mov [cb.low], t6d
mov [cb.queue], t3d
cmp t3d, 8
jge .putbyte
.ret:
REP_RET
.putbyte:
; alive: t0=cb t3=queue t6=low
add t3d, 2
mov t1d, 1
mov t2d, t6d
shl t1d, t3b
shr t2d, t3b ; out
dec t1d
sub t3d, 10
and t6d, t1d
cmp t2b, 0xff ; FIXME is a 32bit op faster?
mov [cb.queue], t3d
mov [cb.low], t6d
mov t1d, t2d
mov t4, [cb.p]
je .postpone
mov t5d, [cb.bytes_outstanding]
shr t1d, 8 ; carry
lea t6, [t4+t5+1]
cmp t6, [cb.end]
jge .ret
add [t4-1], t1b
test t5d, t5d
jz .no_outstanding
dec t1d
.loop_outstanding:
mov [t4], t1b
inc t4
dec t5d
jg .loop_outstanding
.no_outstanding:
mov [t4], t2b
inc t4
mov [cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
mov [cb.p], t4
RET
.postpone:
inc dword [cb.bytes_outstanding]
RET
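
The .putbyte path above is the byte-output and carry-propagation step: once more than 8 bits are queued, the top byte of low is peeled off. A byte equal to 0xff cannot be written immediately, because a later carry could still ripple through it, so it is only counted in bytes_outstanding; when a non-0xff byte finally arrives, the carry (if any) is added to the last written byte and the pending bytes are flushed as 0x00 or 0xff. A hedged C sketch of that flow, paraphrased from the asm rather than taken from the committed C, with illustrative parameter names:

/* Sketch of what the .putbyte/.postpone labels implement; called when
 * queue >= 8 (see the cmp/jge at the end of .renorm). */
static void cabac_putbyte_sketch( unsigned *low, int *queue,
                                  int *bytes_outstanding,
                                  unsigned char **p, unsigned char *end )
{
    int out = *low >> (*queue + 2);          /* top byte, plus a possible carry bit */
    *low   &= (1u << (*queue + 2)) - 1;      /* keep the bits not yet output */
    *queue -= 8;

    if( (out & 0xff) == 0xff )
    {
        (*bytes_outstanding)++;              /* can't emit 0xff yet: a carry may follow */
        return;
    }
    if( *p + *bytes_outstanding + 1 >= end ) /* mirrors the cb.end overflow check */
        return;

    int carry = out >> 8;
    (*p)[-1] += (unsigned char)carry;        /* propagate carry into the last written byte */
    while( *bytes_outstanding > 0 )
    {
        *(*p)++ = (unsigned char)(0xff + carry); /* pending bytes become 0xff or 0x00 */
        (*bytes_outstanding)--;
    }
    *(*p)++ = (unsigned char)(out & 0xff);
}

The "reg has smaller opcode than an immediate" comment refers to the final store of bytes_outstanding: after the flush loop the register already holds zero, so storing the register is shorter than storing an immediate 0.
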

68 changes: 35 additions & 33 deletions common/x86/x86inc.asm
@@ -50,34 +50,36 @@
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.

%macro DECLARE_REG 5
%macro DECLARE_REG 6
%define r%1q %2
%define r%1d %3
%define r%1w %4
; no r%1b, because some regs don't have a byte form, and anyway x264 doesn't need it
%define r%1m %5
%define r%1 r%1q
%define r%1b %5
%define r%1m %6
%define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 1
%macro DECLARE_REG_SIZE 2
%define r%1q r%1
%define e%1q r%1
%define r%1d e%1
%define e%1d e%1
%define r%1w %1
%define e%1w %1
%define r%1b %2
%define e%1b %2
%ifndef ARCH_X86_64
%define r%1 e%1
%endif
%endmacro

DECLARE_REG_SIZE ax
DECLARE_REG_SIZE bx
DECLARE_REG_SIZE cx
DECLARE_REG_SIZE dx
DECLARE_REG_SIZE si
DECLARE_REG_SIZE di
DECLARE_REG_SIZE bp
DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

%ifdef ARCH_X86_64
%define push_size 8
@@ -129,13 +131,13 @@ DECLARE_REG_SIZE bp

%ifdef WIN64 ;================================================================

DECLARE_REG 0, rcx, ecx, cx, ecx
DECLARE_REG 1, rdx, edx, dx, edx
DECLARE_REG 2, r8, r8d, r8w, r8d
DECLARE_REG 3, r9, r9d, r9w, r9d
DECLARE_REG 4, rdi, edi, di, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 56]
DECLARE_REG 0, rcx, ecx, cx, cl, ecx
DECLARE_REG 1, rdx, edx, dx, dl, edx
DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
@@ -163,13 +165,13 @@ DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 56]

%elifdef ARCH_X86_64 ;========================================================

DECLARE_REG 0, rdi, edi, di, edi
DECLARE_REG 1, rsi, esi, si, esi
DECLARE_REG 2, rdx, edx, dx, edx
DECLARE_REG 3, rcx, ecx, cx, ecx
DECLARE_REG 4, r8, r8d, r8w, r8d
DECLARE_REG 5, r9, r9d, r9w, r9d
DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 8]
DECLARE_REG 0, rdi, edi, di, dil, edi
DECLARE_REG 1, rsi, esi, si, sil, esi
DECLARE_REG 2, rdx, edx, dx, dl, edx
DECLARE_REG 3, rcx, ecx, cx, cl, ecx
DECLARE_REG 4, r8, r8d, r8w, r8b, r8d
DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
@@ -195,13 +197,13 @@ DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 8]

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, eax, ax, [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, [esp + stack_offset + 28]
DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define rsp esp
