Commit bf9bf7a
cabac asm.
mostly because gcc refuses to use cmov.
28% faster than c on core2, 11% on k8, 6% on p4.
pengvado committed Mar 24, 2008

1 parent 3687987 commit bf9bf7a
Showing 4 changed files with 201 additions and 36 deletions.
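
For context on the commit message above: the hot path being moved to asm is the LPS/MPS update inside x264_cabac_encode_decision. Written as an if/else in C, gcc emits a conditional branch on the coded bit; the asm added below computes both outcomes and selects them with cmovne, which is where most of the speedup comes from. A minimal C sketch of the two forms, using stand-in names (range, low, range_lps, mps) rather than the actual struct fields, and paraphrased from the diff rather than taken from the committed code:

/* Sketch only -- not the committed code. */
static inline void cabac_update_branchy( int b, int mps, unsigned range_lps,
                                         unsigned *range, unsigned *low )
{
    unsigned range_mps = *range - range_lps;
    if( b == mps )
        *range = range_mps;            /* MPS: just shrink the range */
    else
    {
        *low  += range_mps;            /* LPS: move low, take the LPS range */
        *range = range_lps;
    }
}

/* Branchless equivalent: compute both candidates, then select.  This is the
 * form the asm expresses with two cmovne instructions. */
static inline void cabac_update_cmov( int b, int mps, unsigned range_lps,
                                      unsigned *range, unsigned *low )
{
    unsigned range_mps = *range - range_lps;
    unsigned low_lps   = *low + range_mps;
    *range = (b == mps) ? range_mps : range_lps;
    *low   = (b == mps) ? *low      : low_lps;
}

Either form is followed by the context-state transition and renormalization, which the asm keeps in the same function (the .renorm and .putbyte paths).
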
2 changes: 1 addition & 1 deletion Makefile
@@ -21,7 +21,7 @@ endif

# MMX/SSE optims
ifneq ($(AS),)
X86SRC0 = dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
X86SRC0 = cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
cpu-32.asm dct-32.asm
X86SRC = $(X86SRC0:%=common/x86/%)
6 changes: 4 additions & 2 deletions common/cabac.c
@@ -666,7 +666,7 @@ static const int8_t x264_cabac_context_init_PB[3][460][2] =

/* FIXME could avoid this duplication by reversing the order of states
* with MPS=0, but that would uglify the other tables */
static const uint8_t x264_cabac_range_lps[128][4] =
const uint8_t x264_cabac_range_lps[128][4] =
{
{ 2, 2, 2, 2 },
{ 6, 7, 8, 9 }, { 6, 7, 9, 10 }, { 6, 8, 9, 11 },
@@ -735,7 +735,7 @@ const uint8_t x264_cabac_transition[128][2] =
{100,121}, {100,122}, {101,123}, {101,124}, {101,125}, {102,126}, {102,126}, {127,127},
};

static const uint8_t x264_cabac_renorm_shift[64]= {
const uint8_t x264_cabac_renorm_shift[64]= {
6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -895,6 +895,7 @@ static inline void x264_cabac_encode_renorm( x264_cabac_t *cb )
x264_cabac_putbyte( cb );
}

#ifndef HAVE_MMX
void x264_cabac_encode_decision( x264_cabac_t *cb, int i_ctx, int b )
{
int i_state = cb->state[i_ctx];
@@ -908,6 +909,7 @@ void x264_cabac_encode_decision( x264_cabac_t *cb, int i_ctx, int b )
cb->state[i_ctx] = x264_cabac_transition[i_state][b];
x264_cabac_encode_renorm( cb );
}
#endif

void x264_cabac_encode_bypass( x264_cabac_t *cb, int b )
{
161 changes: 161 additions & 0 deletions common/x86/cabac-a.asm
@@ -0,0 +1,161 @@
;*****************************************************************************
;* cabac-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2008 x264 project
;*
;* Author: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************

%include "x86inc.asm"

SECTION .text

cextern x264_cabac_range_lps
cextern x264_cabac_transition
cextern x264_cabac_renorm_shift

%macro DEF_TMP 16
%rep 8
%define t%1d r%9d
%define t%1b r%9b
%define t%1 r%9
%rotate 1
%endrep
%endmacro

; t3 must be ecx, since it's used for shift.
%ifdef ARCH_X86_64
DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10
%define pointer 8
%else
DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3
%define pointer 4
%endif

%define cb.state r0+0
%define cb.low r0+464
%define cb.range r0+468
%define cb.queue r0+472
%define cb.bytes_outstanding r0+476
%define cb.p r0+480+pointer
%define cb.end r0+480+pointer*2

%macro LOAD_GLOBAL 4
%ifdef PIC64
; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
lea r11, [%2 GLOBAL]
%ifnidn %3, 0
add r11, %3
%endif
movzx %1, byte [r11+%4]
%elifdef PIC32
%ifnidn %3, 0
lea %1, [%3+%4]
movzx %1, byte [%2+%1 GLOBAL]
%else
movzx %1, byte [%2+%3+%4 GLOBAL]
%endif
%else
movzx %1, byte [%2+%3+%4]
%endif
%endmacro

cglobal x264_cabac_encode_decision, 0,7
movifnidn t0d, r0m
movifnidn t1d, r1m
picgetgot t2
mov t5d, [cb.range]
movzx t3d, byte [cb.state+t1]
mov t4d, t5d
shr t5d, 6
and t5d, 3
LOAD_GLOBAL t5d, x264_cabac_range_lps, t5, t3*4
sub t4d, t5d
mov t6d, t3d
shr t6d, 6
%ifdef PIC32
cmp t6d, r2m
%else
movifnidn t2d, r2m
cmp t6d, t2d
%endif
mov t6d, [cb.low]
lea t7, [t6+t4]
cmovne t4d, t5d
cmovne t6d, t7d
%ifdef PIC32
mov t1, r2m
LOAD_GLOBAL t3d, x264_cabac_transition, t1, t3*2
%else
LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
%endif
if32 mov t1d, r1m
mov [cb.state+t1], t3b
.renorm:
mov t3d, t4d
shr t3d, 3
LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
shl t4d, t3b
shl t6d, t3b
add t3d, [cb.queue]
mov [cb.range], t4d
mov [cb.low], t6d
mov [cb.queue], t3d
cmp t3d, 8
jge .putbyte
.ret:
REP_RET
.putbyte:
; alive: t0=cb t3=queue t6=low
add t3d, 2
mov t1d, 1
mov t2d, t6d
shl t1d, t3b
shr t2d, t3b ; out
dec t1d
sub t3d, 10
and t6d, t1d
cmp t2b, 0xff ; FIXME is a 32bit op faster?
mov [cb.queue], t3d
mov [cb.low], t6d
mov t1d, t2d
mov t4, [cb.p]
je .postpone
mov t5d, [cb.bytes_outstanding]
shr t1d, 8 ; carry
lea t6, [t4+t5+1]
cmp t6, [cb.end]
jge .ret
add [t4-1], t1b
test t5d, t5d
jz .no_outstanding
dec t1d
.loop_outstanding:
mov [t4], t1b
inc t4
dec t5d
jg .loop_outstanding
.no_outstanding:
mov [t4], t2b
inc t4
mov [cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
mov [cb.p], t4
RET
.postpone:
inc dword [cb.bytes_outstanding]
RET
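
The .putbyte path above is the byte-output and carry-propagation step: once more than 8 bits are queued, the top byte of low is peeled off. A byte equal to 0xff cannot be written immediately, because a later carry could still ripple through it, so it is only counted in bytes_outstanding; when a non-0xff byte finally arrives, the carry (if any) is added to the last written byte and the pending bytes are flushed as 0x00 or 0xff. A hedged C sketch of that flow, paraphrased from the asm rather than taken from the committed C, with illustrative parameter names:

/* Sketch of what the .putbyte/.postpone labels implement; called when
 * queue >= 8 (see the cmp/jge at the end of .renorm). */
static void cabac_putbyte_sketch( unsigned *low, int *queue,
                                  int *bytes_outstanding,
                                  unsigned char **p, unsigned char *end )
{
    int out = *low >> (*queue + 2);          /* top byte, plus a possible carry bit */
    *low   &= (1u << (*queue + 2)) - 1;      /* keep the bits not yet output */
    *queue -= 8;

    if( (out & 0xff) == 0xff )
    {
        (*bytes_outstanding)++;              /* can't emit 0xff yet: a carry may follow */
        return;
    }
    if( *p + *bytes_outstanding + 1 >= end ) /* mirrors the cb.end overflow check */
        return;

    int carry = out >> 8;
    (*p)[-1] += (unsigned char)carry;        /* propagate carry into the last written byte */
    while( *bytes_outstanding > 0 )
    {
        *(*p)++ = (unsigned char)(0xff + carry); /* pending bytes become 0xff or 0x00 */
        (*bytes_outstanding)--;
    }
    *(*p)++ = (unsigned char)(out & 0xff);
}

The "reg has smaller opcode than an immediate" comment refers to the final store of bytes_outstanding: after the flush loop the register already holds zero, so storing the register is shorter than storing an immediate 0.
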

68 changes: 35 additions & 33 deletions common/x86/x86inc.asm
@@ -50,34 +50,36 @@
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.

%macro DECLARE_REG 5
%macro DECLARE_REG 6
%define r%1q %2
%define r%1d %3
%define r%1w %4
; no r%1b, because some regs don't have a byte form, and anyway x264 doesn't need it
%define r%1m %5
%define r%1 r%1q
%define r%1b %5
%define r%1m %6
%define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 1
%macro DECLARE_REG_SIZE 2
%define r%1q r%1
%define e%1q r%1
%define r%1d e%1
%define e%1d e%1
%define r%1w %1
%define e%1w %1
%define r%1b %2
%define e%1b %2
%ifndef ARCH_X86_64
%define r%1 e%1
%endif
%endmacro

DECLARE_REG_SIZE ax
DECLARE_REG_SIZE bx
DECLARE_REG_SIZE cx
DECLARE_REG_SIZE dx
DECLARE_REG_SIZE si
DECLARE_REG_SIZE di
DECLARE_REG_SIZE bp
DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

%ifdef ARCH_X86_64
%define push_size 8
@@ -129,13 +131,13 @@ DECLARE_REG_SIZE bp

%ifdef WIN64 ;================================================================

DECLARE_REG 0, rcx, ecx, cx, ecx
DECLARE_REG 1, rdx, edx, dx, edx
DECLARE_REG 2, r8, r8d, r8w, r8d
DECLARE_REG 3, r9, r9d, r9w, r9d
DECLARE_REG 4, rdi, edi, di, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 56]
DECLARE_REG 0, rcx, ecx, cx, cl, ecx
DECLARE_REG 1, rdx, edx, dx, dl, edx
DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
@@ -163,13 +165,13 @@ DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 56]

%elifdef ARCH_X86_64 ;========================================================

DECLARE_REG 0, rdi, edi, di, edi
DECLARE_REG 1, rsi, esi, si, esi
DECLARE_REG 2, rdx, edx, dx, edx
DECLARE_REG 3, rcx, ecx, cx, ecx
DECLARE_REG 4, r8, r8d, r8w, r8d
DECLARE_REG 5, r9, r9d, r9w, r9d
DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 8]
DECLARE_REG 0, rdi, edi, di, dil, edi
DECLARE_REG 1, rsi, esi, si, sil, esi
DECLARE_REG 2, rdx, edx, dx, dl, edx
DECLARE_REG 3, rcx, ecx, cx, cl, ecx
DECLARE_REG 4, r8, r8d, r8w, r8b, r8d
DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
@@ -195,13 +197,13 @@ DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 8]

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, eax, ax, [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, [esp + stack_offset + 28]
DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define rsp esp
