Skip to content

Commit

Permalink
#368 asm optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
XProger committed Jul 2, 2022
1 parent daa0e35 commit ea021e0
Show file tree
Hide file tree
Showing 11 changed files with 213 additions and 204 deletions.
233 changes: 116 additions & 117 deletions src/fixed/common.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#ifndef H_COMMON
#define H_COMMON
//#define STATIC_ITEMS
//#define PROFILING
#define PROFILING
#ifdef PROFILING
#define STATIC_ITEMS
#define PROFILE_FRAMETIME
Expand Down Expand Up @@ -29,6 +29,8 @@
#define SND_FMT_OGG (1 << 14)
#define SND_FMT_MP3 (1 << 15)

#define FIXED_SHIFT 14

#if defined(__WIN32__)
#define USE_DIV_TABLE
#define MODEHW
Expand Down Expand Up @@ -348,133 +350,22 @@ X_INLINE int32 abs(int32 x) {
extern uint16 fb[VRAM_WIDTH * FRAME_HEIGHT];
#endif

// Profiling support: everything from here down to the matching #else is
// compiled only when PROFILING is defined.
#ifdef PROFILING
// Counter-name groups. Each macro expands to a comma-separated run of
// enumerators that is pasted into ProfileCounterId below.
#define PROFILE_FRAME\
CNT_UPDATE,\
CNT_RENDER

#define PROFILE_STAGES\
CNT_TRANSFORM,\
CNT_ADD,\
CNT_FLUSH,\
CNT_VERT,\
CNT_POLY

#define PROFILE_SOUND\
CNT_SOUND

// Only the group listed BEFORE CNT_MAX receives values below CNT_MAX. Those
// counters get a slot in gCounters (sized CNT_MAX) and are the only ones
// ProfileCounter measures (it checks cnt < CNT_MAX). The groups listed after
// CNT_MAX remain valid identifiers, so PROFILE(...) call sites still compile,
// but they are never timed.
#if defined(PROFILE_FRAMETIME)
// active group: PROFILE_FRAME
enum ProfileCounterId {
PROFILE_FRAME,
CNT_MAX,
PROFILE_STAGES,
PROFILE_SOUND
};
#elif defined(PROFILE_SOUNDTIME)
// active group: PROFILE_SOUND
enum ProfileCounterId {
PROFILE_SOUND,
CNT_MAX,
PROFILE_FRAME,
PROFILE_STAGES
};
#else
// default, active group: PROFILE_STAGES
enum ProfileCounterId {
PROFILE_STAGES,
CNT_MAX,
PROFILE_FRAME,
PROFILE_SOUND
};
#endif

// One accumulator per active counter, indexed by ProfileCounterId.
extern uint32 gCounters[CNT_MAX];

// Platform timer back-ends for the profiler.
//   PROFILE_START()     - begin a measurement
//   PROFILE_STOP(value) - add the amount measured since PROFILE_START() to `value`
// The unit is whatever the selected platform timer reports. Every back-end
// keeps its start point in shared state (a global or a hardware timer), so a
// nested measurement overwrites the start point of the outer one.
// Both macros expand to a brace block, so they may be used without a trailing ';'.
#if defined(__3DO__) || defined(__32X__) // should be first, armcpp bug (#elif)
    extern int32 g_timer; // start time captured by PROFILE_START()

    #define PROFILE_START() {\
        g_timer = osGetSystemTimeMS();\
    }

    #define PROFILE_STOP(value) {\
        (value) += (osGetSystemTimeMS() - g_timer);\
    }
#elif defined(__WIN32__) || defined(__GBA_WIN__)
    extern LARGE_INTEGER g_timer;   // counter value captured by PROFILE_START()
    extern LARGE_INTEGER g_current; // scratch for the reading taken in PROFILE_STOP()

    #define PROFILE_START() {\
        QueryPerformanceCounter(&g_timer);\
    }

    // the 64-bit tick delta is truncated to 32 bits before it is accumulated
    #define PROFILE_STOP(value) {\
        QueryPerformanceCounter(&g_current);\
        (value) += uint32(g_current.QuadPart - g_timer.QuadPart);\
    }
#elif defined(__GBA__)
    // low bits written to the timer 2 control register; a different setting is
    // used when profiling sound
    // NOTE(review): presumably the hardware prescaler selection - confirm against GBATEK
    #ifdef PROFILE_SOUNDTIME
        #define TIMER_FREQ_DIV 1
    #else
        #define TIMER_FREQ_DIV 3
    #endif

    // zero the count register, then write bit 7 together with the divider setting
    #define PROFILE_START() {\
        REG_TM2CNT_L = 0;\
        REG_TM2CNT_H = (1 << 7) | TIMER_FREQ_DIV;\
    }

    // read the count first, then clear the control register
    #define PROFILE_STOP(value) {\
        (value) += REG_TM2CNT_L;\
        REG_TM2CNT_H = 0;\
    }
#else
    // No timer back-end for this platform: any use of the macros expands to an
    // undeclared identifier whose name states the problem in the compiler error.
    #define PROFILE_START() PROFILE_START_is_not_implemented_for_this_platform
    #define PROFILE_STOP(value) PROFILE_STOP_is_not_implemented_for_this_platform
#endif

// Scoped profiling helper: construction runs PROFILE_START() and destruction
// runs PROFILE_STOP() on gCounters[cnt]. Counter ids at or above CNT_MAX are
// not active in this build and are skipped on both ends.
struct ProfileCounter
{
    ProfileCounterId cnt; // counter this scope reports into

    ProfileCounter(ProfileCounterId cnt) : cnt(cnt)
    {
        // inactive counter: leave the timer untouched
        if (cnt >= CNT_MAX)
            return;

        PROFILE_START()
    }

    ~ProfileCounter()
    {
        // inactive counter: there is no gCounters slot to add to
        if (cnt >= CNT_MAX)
            return;

        PROFILE_STOP(gCounters[cnt]);
    }
};

// PROFILE(cnt): measure from this statement to the end of the enclosing scope
// into counter `cnt`. It declares a local named profileCounter, so it can be
// used at most once per scope.
#define PROFILE(cnt) ProfileCounter profileCounter(cnt)
// PROFILE_CLEAR(): zero all of gCounters. The expansion already ends in ';'.
#define PROFILE_CLEAR() memset(gCounters, 0, sizeof(gCounters));
#else
// PROFILING is not defined: both macros expand to nothing.
#define PROFILE(cnt)
#define PROFILE_CLEAR()
#endif

#ifdef __TNS__
void osSetPalette(uint16* palette);
#endif

#define STATIC_MESH_FLAG_NO_COLLISION 1
#define STATIC_MESH_FLAG_VISIBLE 2
#define MAX_STATIC_MESH_RADIUS (5 * 1024)

extern int32 fps;

#define FIXED_SHIFT 14

#ifndef F16_SHIFT
#define F16_SHIFT 0
#endif

#ifdef USE_MATRIX_INT16
#define MATRIX_FIXED_SHIFT FIXED_SHIFT
#else
#define MATRIX_FIXED_SHIFT 0
#define MATRIX_FIXED_SHIFT FIXED_SHIFT
#endif

#ifndef MATRIX_FIXED_SHIFT
#define MATRIX_FIXED_SHIFT 0
#endif

#define SND_MAX_DIST (8 * 1024)
Expand Down Expand Up @@ -2966,4 +2857,112 @@ void osSetPalette(const uint16* palette);
const void* osLoadScreen(LevelID id);
const void* osLoadLevel(LevelID id);

// Profiling support: everything from here down to the matching #else is
// compiled only when PROFILING is defined.
#ifdef PROFILING
// Counter-name groups. Each macro expands to a comma-separated run of
// enumerators that is pasted into ProfileCounterId below.
#define PROFILE_FRAME\
CNT_UPDATE,\
CNT_RENDER

#define PROFILE_STAGES\
CNT_TRANSFORM,\
CNT_ADD,\
CNT_FLUSH,\
CNT_VERT,\
CNT_POLY

#define PROFILE_SOUND\
CNT_SOUND

// Only the group listed BEFORE CNT_MAX receives values below CNT_MAX. Those
// counters get a slot in gCounters (sized CNT_MAX) and are the only ones
// ProfileCounter measures (it checks cnt < CNT_MAX). The groups listed after
// CNT_MAX remain valid identifiers, so PROFILE(...) call sites still compile,
// but they are never timed.
#if defined(PROFILE_FRAMETIME)
// active group: PROFILE_FRAME
enum ProfileCounterId {
PROFILE_FRAME,
CNT_MAX,
PROFILE_STAGES,
PROFILE_SOUND
};
#elif defined(PROFILE_SOUNDTIME)
// active group: PROFILE_SOUND
enum ProfileCounterId {
PROFILE_SOUND,
CNT_MAX,
PROFILE_FRAME,
PROFILE_STAGES
};
#else
// default, active group: PROFILE_STAGES
enum ProfileCounterId {
PROFILE_STAGES,
CNT_MAX,
PROFILE_FRAME,
PROFILE_SOUND
};
#endif

// One accumulator per active counter, indexed by ProfileCounterId.
extern uint32 gCounters[CNT_MAX];

// Platform timer back-ends for the profiler.
//   PROFILE_START()     - begin a measurement
//   PROFILE_STOP(value) - add the amount measured since PROFILE_START() to `value`
// The unit is whatever the selected platform timer reports. Every back-end
// keeps its start point in shared state (a global or a hardware timer), so a
// nested measurement overwrites the start point of the outer one.
// Both macros expand to a brace block, so they may be used without a trailing ';'.
#if defined(__3DO__) || defined(__32X__) // should be first, armcpp bug (#elif)
    extern int32 g_timer; // start time captured by PROFILE_START()

    #define PROFILE_START() {\
        g_timer = osGetSystemTimeMS();\
    }

    #define PROFILE_STOP(value) {\
        (value) += (osGetSystemTimeMS() - g_timer);\
    }
#elif defined(__WIN32__) || defined(__GBA_WIN__)
    extern LARGE_INTEGER g_timer;   // counter value captured by PROFILE_START()
    extern LARGE_INTEGER g_current; // scratch for the reading taken in PROFILE_STOP()

    #define PROFILE_START() {\
        QueryPerformanceCounter(&g_timer);\
    }

    // the 64-bit tick delta is truncated to 32 bits before it is accumulated
    #define PROFILE_STOP(value) {\
        QueryPerformanceCounter(&g_current);\
        (value) += uint32(g_current.QuadPart - g_timer.QuadPart);\
    }
#elif defined(__GBA__)
    // low bits written to the timer 2 control register; a different setting is
    // used when profiling sound
    // NOTE(review): presumably the hardware prescaler selection - confirm against GBATEK
    #ifdef PROFILE_SOUNDTIME
        #define TIMER_FREQ_DIV 1
    #else
        #define TIMER_FREQ_DIV 3
    #endif

    // zero the count register, then write bit 7 together with the divider setting
    #define PROFILE_START() {\
        REG_TM2CNT_L = 0;\
        REG_TM2CNT_H = (1 << 7) | TIMER_FREQ_DIV;\
    }

    // read the count first, then clear the control register
    #define PROFILE_STOP(value) {\
        (value) += REG_TM2CNT_L;\
        REG_TM2CNT_H = 0;\
    }
#else
    // No timer back-end for this platform: any use of the macros expands to an
    // undeclared identifier whose name states the problem in the compiler error.
    #define PROFILE_START() PROFILE_START_is_not_implemented_for_this_platform
    #define PROFILE_STOP(value) PROFILE_STOP_is_not_implemented_for_this_platform
#endif

// Scoped profiling helper: construction runs PROFILE_START() and destruction
// runs PROFILE_STOP() on gCounters[cnt]. Counter ids at or above CNT_MAX are
// not active in this build and are skipped on both ends.
struct ProfileCounter
{
    ProfileCounterId cnt; // counter this scope reports into

    ProfileCounter(ProfileCounterId cnt) : cnt(cnt)
    {
        // inactive counter: leave the timer untouched
        if (cnt >= CNT_MAX)
            return;

        PROFILE_START()
    }

    ~ProfileCounter()
    {
        // inactive counter: there is no gCounters slot to add to
        if (cnt >= CNT_MAX)
            return;

        PROFILE_STOP(gCounters[cnt]);
    }
};

// PROFILE(cnt): measure from this statement to the end of the enclosing scope
// into counter `cnt`. It declares a local named profileCounter, so it can be
// used at most once per scope.
#define PROFILE(cnt) ProfileCounter profileCounter(cnt)
// PROFILE_CLEAR(): zero all of gCounters. The expansion already ends in ';'.
#define PROFILE_CLEAR() memset(gCounters, 0, sizeof(gCounters));
#else
// PROFILING is not defined: both macros expand to nothing.
#define PROFILE(cnt)
#define PROFILE_CLEAR()
#endif


#endif
1 change: 1 addition & 0 deletions src/platform/gba/asm/common_asm.inc
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
.equ MIN_INT32, 0x80000000
.equ MAX_INT32, 0x7FFFFFFF

// res = divTable[x] (uint16)
.macro divLUT res, x
add \res, \x, #DIVLUT_ADDR
ldrh \res, [\res, \x]
Expand Down
43 changes: 19 additions & 24 deletions src/platform/gba/asm/faceAddMeshQuads.s
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,6 @@ vy1 .req vg3
vx2 .req vg2
vy2 .req vg2

vz0 .req vg0
vz1 .req vg1
vz2 .req vg2
vz3 .req vg3
depth .req vg0

tmp .req flags
Expand All @@ -52,10 +48,12 @@ faceAddMeshQuads_asm:
add polys, #2 // skip flags

.loop:
ldrb vp0, [polys], #1
ldrb vp1, [polys], #1
ldrb vp2, [polys], #1
ldrb vp3, [polys], #3 // + flags
ldrh vp0, [polys], #2
lsr vp1, vp0, #8
and vp0, #0xFF
ldrh vp2, [polys], #4 // + flags
lsr vp3, vp2, #8
and vp2, #0xFF

add vp0, vp, vp0, lsl #3
add vp1, vp, vp1, lsl #3
Expand All @@ -64,36 +62,33 @@ faceAddMeshQuads_asm:

CCW .skip

// fetch clip flags
ldrb vg0, [vp0, #VERTEX_CLIP]
ldrb vg1, [vp1, #VERTEX_CLIP]
ldrb vg2, [vp2, #VERTEX_CLIP]
ldrb vg3, [vp3, #VERTEX_CLIP]
// fetch [c, g, zz]
ldr vg0, [vp0, #VERTEX_Z]
ldr vg1, [vp1, #VERTEX_Z]
ldr vg2, [vp2, #VERTEX_Z]
ldr vg3, [vp3, #VERTEX_Z]

// check clipping
and tmp, vg0, vg1
and tmp, vg2
and tmp, vg3
tst tmp, #(CLIP_DISCARD >> 8)
tst tmp, #(CLIP_DISCARD << 16)
bne .skip

// mark if should be clipped by viewport
orr tmp, vg0, vg1
orr tmp, vg2
orr tmp, vg3
tst tmp, #(CLIP_FRAME >> 8)
tst tmp, #(CLIP_FRAME << 16)
ldrh flags, [polys, #-8]
orrne flags, #FACE_CLIPPED

// vz0 = AVG_Z4 (depth)
ldrh vz0, [vp0, #VERTEX_Z]
ldrh vz1, [vp1, #VERTEX_Z]
ldrh vz2, [vp2, #VERTEX_Z]
ldrh vz3, [vp3, #VERTEX_Z]
add depth, vz0, vz1
add depth, vz2
add depth, vz3
lsr depth, #(2 + OT_SHIFT)
// depth = AVG_Z4
lsl vg0, #16
add depth, vg0, vg1, lsl #16
add depth, vg2, lsl #16
add depth, vg3, lsl #16
lsr depth, #(16 + 2 + OT_SHIFT)

// faceAdd
rsb vp0, vertices, vp0, lsr #3
Expand Down
Loading

0 comments on commit ea021e0

Please sign in to comment.