-
Notifications
You must be signed in to change notification settings - Fork 5
/
soa.h
216 lines (184 loc) · 7.67 KB
/
soa.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#pragma once
#include "cthing.h"
#include "math/vec.h"
// Structure-of-arrays container: every component is stored in its own array.
typedef struct {
// One pointer per component. The accessors and kernel macros in this header
// read these buffers as arrays of float / ct_soa_vec.
void **comps;
// Number of elements held by each component array. The kernel macros in this
// header process `num >> CT_SOA_WORD_SHIFT` SIMD words per component.
size_t num;
// Number of component arrays (i.e. valid entries in `comps`).
size_t width;
// NOTE(review): not referenced anywhere in this header; its meaning is
// defined by the implementation file - confirm before relying on it.
size_t stride;
} CT_SOA;
// SIMD configuration.
//
// Selects the vector word type (ct_soa_vec) and the primitive operations used
// by the SOA kernels, based on the CT_FEATURE_SSE / CT_FEATURE_AVX build flags:
//   AVX    - ct_soa_vec = __m256 (8 floats), CT_SOA_WORD_SHIFT = 3
//   SSE    - ct_soa_vec = __m128 (4 floats), CT_SOA_WORD_SHIFT = 2
//   scalar - ct_soa_vec = float  (1 float),  CT_SOA_WORD_SHIFT = 0
// AVX is only considered when CT_FEATURE_SSE is defined as well.
//
// All CT_SOA_* operation macros are meant to be used in call form, e.g.
// CT_SOA_SQRT(x). Note that the SIMD rsqrt intrinsics return an approximation,
// whereas the scalar fallback computes 1 / sqrtf(x) directly.
#ifdef CT_FEATURE_SSE
#ifdef CT_FEATURE_AVX
#include <immintrin.h>
#define CT_SOA_WORD_SHIFT 3
#define CT_SOA_SET1 _mm256_set1_ps
#define CT_SOA_SQRT _mm256_sqrt_ps
#define CT_SOA_RSQRT _mm256_rsqrt_ps
#define CT_SOA_MIN _mm256_min_ps
#define CT_SOA_MAX _mm256_max_ps
typedef __m256 ct_soa_vec;
#else
#include <xmmintrin.h>
#define CT_SOA_WORD_SHIFT 2
#define CT_SOA_SET1 _mm_set1_ps
#define CT_SOA_SQRT _mm_sqrt_ps
#define CT_SOA_RSQRT _mm_rsqrt_ps
#define CT_SOA_MIN _mm_min_ps
#define CT_SOA_MAX _mm_max_ps
typedef __m128 ct_soa_vec;
#endif
#else  // no SSE/AVX: plain float fallback
#define CT_SOA_WORD_SHIFT 0
// Identity: a "broadcast" into a one-float word is the value itself.
#define CT_SOA_SET1(x) (x)
#define CT_SOA_SQRT sqrtf
// Function-like and fully parenthesized on purpose. As the bare token sequence
// `1.0f / sqrtf`, an expression such as `a / CT_SOA_RSQRT(x)` was parsed as
// `(a / 1.0f) / sqrtf(x)`, i.e. a / sqrt(x) instead of a * sqrt(x).
#define CT_SOA_RSQRT(x) (1.0f / sqrtf(x))
#define CT_SOA_MIN CT_MIN
#define CT_SOA_MAX CT_MAX
typedef float ct_soa_vec;
#endif

// Number of floats per ct_soa_vec word, and the mask for `num % word size`.
#define CT_SOA_WORD_SIZE (1 << CT_SOA_WORD_SHIFT)
#define CT_SOA_SIZE_MASK (CT_SOA_WORD_SIZE - 1)
// Opens the definition of `CT_SOA *ct_soa_<name>_imm(CT_SOA *a, float b)`.
//
// The expansion is intentionally unfinished: it ends on the header of the
// inner loop, so the statement written directly after the macro invocation
// becomes that loop's body. Names visible to that statement:
//   aa - component i of `a`, viewed as an array of ct_soa_vec
//   bb - `b` passed through CT_SOA_SET1 (one ct_soa_vec, computed once)
//   i  - component index, 0 .. a->width - 1
//   j  - word index, 0 .. n - 1 with n = a->num >> CT_SOA_WORD_SHIFT
// The outer loop and the function body are left open; CT_SOA_EPILOGUE_IMM
// closes exactly these two braces.
// Only whole words are visited: if a->num is not a multiple of
// CT_SOA_WORD_SIZE, the trailing elements are not touched by this loop.
#define CT_SOA_PROLOGUE2_SCALAR_IMM(name) \
CT_SOA *ct_soa_##name##_imm(CT_SOA *a, float b) { \
size_t n = a->num >> CT_SOA_WORD_SHIFT; \
ct_soa_vec bb = CT_SOA_SET1(b); \
for (size_t i = 0, w = a->width; i < w; i++) { \
ct_soa_vec *aa = (ct_soa_vec *)a->comps[i]; \
for (size_t j = 0; j < n; j++)
// Opens the definition of
// `CT_SOA *ct_soa_<name>_imm(CT_SOA *a, const float *b)`.
//
// Same shape as the scalar variant, except that the right-hand operand is
// per component: `b` is indexed with the component index, so it must provide
// at least a->width floats. The statement following the macro invocation
// becomes the body of the inner loop and can use:
//   aa - component i of `a`, viewed as an array of ct_soa_vec
//   bb - b[i] passed through CT_SOA_SET1 (recomputed for each component)
//   i  - component index, 0 .. a->width - 1
//   j  - word index, 0 .. n - 1 with n = a->num >> CT_SOA_WORD_SHIFT
// The outer loop and the function body are left open; CT_SOA_EPILOGUE_IMM
// closes exactly these two braces.
#define CT_SOA_PROLOGUE2_FPTR_IMM(name) \
CT_SOA *ct_soa_##name##_imm(CT_SOA *a, const float *b) { \
size_t n = a->num >> CT_SOA_WORD_SHIFT; \
for (size_t i = 0, w = a->width; i < w; i++) { \
ct_soa_vec *aa = (ct_soa_vec *)a->comps[i]; \
ct_soa_vec bb = CT_SOA_SET1(b[i]); \
for (size_t j = 0; j < n; j++)
// Opens the definition of
// `CT_SOA *ct_soa_<name>_imm(CT_SOA *a, const CT_SOA *b)`.
//
// CT_CHECK (defined outside this header) is given the condition that `a` and
// `b` agree in width and num.
// NOTE(review): presumably CT_CHECK jumps to the `fail:` label supplied by the
// epilogue macro when the condition is false - confirm against its definition.
// The statement following the macro invocation becomes the body of the inner
// loop and can use:
//   aa, bb - component i of `a` / `b`, viewed as arrays of ct_soa_vec
//   i      - component index, 0 .. a->width - 1
//   j      - word index, 0 .. n - 1 with n = a->num >> CT_SOA_WORD_SHIFT
// The outer loop and the function body are left open; CT_SOA_EPILOGUE_IMM
// closes exactly these two braces.
#define CT_SOA_PROLOGUE2_IMM(name) \
CT_SOA *ct_soa_##name##_imm(CT_SOA *a, const CT_SOA *b) { \
CT_CHECK(a->width == b->width && a->num == b->num, "a & b dims not same"); \
size_t n = a->num >> CT_SOA_WORD_SHIFT; \
for (size_t i = 0, w = a->width; i < w; i++) { \
ct_soa_vec *aa = (ct_soa_vec *)a->comps[i]; \
ct_soa_vec *bb = (ct_soa_vec *)b->comps[i]; \
for (size_t j = 0; j < n; j++)
// Opens the definition of
// `CT_SOA *ct_soa_<name>(const CT_SOA *a, const CT_SOA *b, CT_SOA *out)`.
//
// CT_CHECK (defined outside this header) is given the condition that `a`, `b`
// and `out` all agree in width and num.
// NOTE(review): presumably CT_CHECK jumps to the `fail:` label supplied by the
// epilogue macro when the condition is false - confirm against its definition.
// The statement following the macro invocation becomes the body of the inner
// loop and can use:
//   aa, bb, oo - component i of `a` / `b` / `out`, viewed as ct_soa_vec arrays
//   i          - component index, 0 .. a->width - 1
//   j          - word index, 0 .. n - 1 with n = a->num >> CT_SOA_WORD_SHIFT
// The outer loop and the function body are left open; CT_SOA_EPILOGUE3 closes
// exactly these two braces.
#define CT_SOA_PROLOGUE3(name) \
CT_SOA *ct_soa_##name(const CT_SOA *a, const CT_SOA *b, CT_SOA *out) { \
CT_CHECK(a->width == b->width && a->width == out->width && \
a->num == b->num && a->num == out->num, \
"dims not same"); \
size_t n = a->num >> CT_SOA_WORD_SHIFT; \
for (size_t i = 0, w = a->width; i < w; i++) { \
ct_soa_vec *aa = (ct_soa_vec *)a->comps[i]; \
ct_soa_vec *bb = (ct_soa_vec *)b->comps[i]; \
ct_soa_vec *oo = (ct_soa_vec *)out->comps[i]; \
for (size_t j = 0; j < n; j++)
// Opens the definition of
// `float *ct_soa_<name>(const CT_SOA *a, const CT_SOA *b, float *out)` for
// two-component operands.
//
// CT_CHECK (defined outside this header) is given the condition that both
// operands have width 2 and the same num.
// NOTE(review): presumably CT_CHECK jumps to the `fail:` label supplied by the
// epilogue macro when the condition is false - confirm against its definition.
// Unlike the generic prologues there is a single loop only, over SIMD words.
// The statement following the macro invocation becomes that loop's body and
// can use:
//   ax, ay - components 0 and 1 of `a`, viewed as arrays of ct_soa_vec
//   bx, by - components 0 and 1 of `b`, viewed as arrays of ct_soa_vec
//   oo     - `out` viewed as an array of ct_soa_vec
//   i      - word index, 0 .. n - 1 with n = a->num >> CT_SOA_WORD_SHIFT
// Only the function body is left open; CT_SOA_EPILOGUE3_ALT closes it.
#define CT_SOA2_PROLOGUE3(name) \
float *ct_soa_##name(const CT_SOA *a, const CT_SOA *b, float *out) { \
CT_CHECK(a->width == 2 && b->width == 2 && a->num == b->num, \
"a & b dims not same"); \
ct_soa_vec *ax = (ct_soa_vec *)a->comps[0]; \
ct_soa_vec *ay = (ct_soa_vec *)a->comps[1]; \
ct_soa_vec *bx = (ct_soa_vec *)b->comps[0]; \
ct_soa_vec *by = (ct_soa_vec *)b->comps[1]; \
ct_soa_vec *oo = (ct_soa_vec *)out; \
for (size_t i = 0, n = a->num >> CT_SOA_WORD_SHIFT; i < n; i++)
// Opens the definition of
// `float *ct_soa_<name>(const CT_SOA *a, const CT_SOA *b, float *out)` for
// three-component operands.
//
// CT_CHECK (defined outside this header) is given the condition that both
// operands have width 3 and the same num.
// NOTE(review): presumably CT_CHECK jumps to the `fail:` label supplied by the
// epilogue macro when the condition is false - confirm against its definition.
// There is a single loop only, over SIMD words. The statement following the
// macro invocation becomes that loop's body and can use:
//   ax, ay, az - components 0, 1 and 2 of `a`, viewed as ct_soa_vec arrays
//   bx, by, bz - components 0, 1 and 2 of `b`, viewed as ct_soa_vec arrays
//   oo         - `out` viewed as an array of ct_soa_vec
//   i          - word index, 0 .. n - 1 with n = a->num >> CT_SOA_WORD_SHIFT
// Only the function body is left open; CT_SOA_EPILOGUE3_ALT closes it.
//
// `az` is taken from `a`: it used to be loaded from b->comps[2], which made
// it an alias of `bz` and silently ignored the z component of `a`.
#define CT_SOA3_PROLOGUE3(name) \
  float *ct_soa_##name(const CT_SOA *a, const CT_SOA *b, float *out) { \
    CT_CHECK(a->width == 3 && b->width == 3 && a->num == b->num, \
             "a & b dims not same"); \
    ct_soa_vec *ax = (ct_soa_vec *)a->comps[0]; \
    ct_soa_vec *ay = (ct_soa_vec *)a->comps[1]; \
    ct_soa_vec *az = (ct_soa_vec *)a->comps[2]; \
    ct_soa_vec *bx = (ct_soa_vec *)b->comps[0]; \
    ct_soa_vec *by = (ct_soa_vec *)b->comps[1]; \
    ct_soa_vec *bz = (ct_soa_vec *)b->comps[2]; \
    ct_soa_vec *oo = (ct_soa_vec *)out; \
    for (size_t i = 0, n = a->num >> CT_SOA_WORD_SHIFT; i < n; i++)
// Closes a function opened by one of the CT_SOA_PROLOGUE2_*_IMM macros:
// ends the outer per-component loop, returns `a` on the normal path and
// provides the `fail:` exit, which returns NULL, before closing the function.
// NOTE(review): `fail` is presumably the jump target of CT_CHECK (defined
// outside this header) - confirm against its definition.
#define CT_SOA_EPILOGUE_IMM \
} \
return a; \
fail: \
return NULL; \
}
// Closes a function opened by CT_SOA_PROLOGUE3: ends the outer per-component
// loop, returns `out` on the normal path and provides the `fail:` exit, which
// returns NULL, before closing the function.
// NOTE(review): `fail` is presumably the jump target of CT_CHECK (defined
// outside this header) - confirm against its definition.
#define CT_SOA_EPILOGUE3 \
} \
return out; \
fail: \
return NULL; \
}
// Closes a function opened by CT_SOA2_PROLOGUE3 or CT_SOA3_PROLOGUE3. Those
// prologues open no braced loop, so - unlike CT_SOA_EPILOGUE3 - only the
// function body is closed here: returns `out` on the normal path and provides
// the `fail:` exit, which returns NULL.
// NOTE(review): `fail` is presumably the jump target of CT_CHECK (defined
// outside this header) - confirm against its definition.
#define CT_SOA_EPILOGUE3_ALT \
return out; \
fail: \
return NULL; \
}
// Lifecycle and layout helpers. Only the prototypes are visible in this
// header, so the notes below are read off names and signatures and need to be
// confirmed against the implementation file.

// Presumably creates a new CT_SOA with `width` components of `num` elements
// each - TODO confirm allocation, ownership and the failure return value.
CT_SOA *ct_soa_new(size_t width, size_t num, size_t stride);
// Presumably sets up the caller-provided `a` to use the given component
// buffers - TODO confirm who owns `comps` afterwards and what the int return
// value encodes.
int ct_soa_init(CT_SOA *a,
void **comps,
size_t width,
size_t num,
size_t stride);
// Presumably releases what ct_soa_new() acquired - TODO confirm whether the
// component buffers are released as well.
void ct_soa_free(CT_SOA *s);
// Presumably writes the components of `s` into the single buffer `out` and
// returns it - TODO confirm the output layout and required buffer size.
void *ct_soa_flatten(const CT_SOA *s, void *out);
// In-place arithmetic with one scalar operand. The signatures match the
// functions generated by CT_SOA_PROLOGUE2_SCALAR_IMM / CT_SOA_EPILOGUE_IMM,
// which return `a`.
// NOTE(review): the operation is inferred from the name (a = a <op> b for
// every element of every component); the definitions are not in this header.
CT_SOA *ct_soa_add1f_imm(CT_SOA *a, float b);
CT_SOA *ct_soa_sub1f_imm(CT_SOA *a, float b);
CT_SOA *ct_soa_mul1f_imm(CT_SOA *a, float b);
CT_SOA *ct_soa_div1f_imm(CT_SOA *a, float b);
// In-place arithmetic with one float per component. The signatures match the
// functions generated by CT_SOA_PROLOGUE2_FPTR_IMM / CT_SOA_EPILOGUE_IMM,
// which index `b` by component (so `b` needs a->width floats) and return `a`.
// NOTE(review): the operation is inferred from the name; the definitions are
// not in this header.
CT_SOA *ct_soa_add1fp_imm(CT_SOA *a, const float *b);
CT_SOA *ct_soa_sub1fp_imm(CT_SOA *a, const float *b);
CT_SOA *ct_soa_mul1fp_imm(CT_SOA *a, const float *b);
CT_SOA *ct_soa_div1fp_imm(CT_SOA *a, const float *b);
// In-place element-wise operations between two SOAs. The signatures match the
// functions generated by CT_SOA_PROLOGUE2_IMM / CT_SOA_EPILOGUE_IMM, which
// check that `a` and `b` agree in width and num, return `a` on the normal
// path and have a NULL-returning `fail:` exit.
// NOTE(review): the operation is inferred from the name (a = a <op> b); the
// definitions are not in this header.
CT_SOA *ct_soa_add_imm(CT_SOA *a, const CT_SOA *b);
CT_SOA *ct_soa_sub_imm(CT_SOA *a, const CT_SOA *b);
CT_SOA *ct_soa_mul_imm(CT_SOA *a, const CT_SOA *b);
CT_SOA *ct_soa_div_imm(CT_SOA *a, const CT_SOA *b);
CT_SOA *ct_soa_min_imm(CT_SOA *a, const CT_SOA *b);
CT_SOA *ct_soa_max_imm(CT_SOA *a, const CT_SOA *b);
// Element-wise operations writing to a separate destination. The signatures
// match the functions generated by CT_SOA_PROLOGUE3 / CT_SOA_EPILOGUE3, which
// check that `a`, `b` and `out` agree in width and num, return `out` on the
// normal path and have a NULL-returning `fail:` exit.
// NOTE(review): the operation is inferred from the name (out = a <op> b); the
// definitions are not in this header.
CT_SOA *ct_soa_add(const CT_SOA *a, const CT_SOA *b, CT_SOA *out);
CT_SOA *ct_soa_sub(const CT_SOA *a, const CT_SOA *b, CT_SOA *out);
CT_SOA *ct_soa_mul(const CT_SOA *a, const CT_SOA *b, CT_SOA *out);
CT_SOA *ct_soa_div(const CT_SOA *a, const CT_SOA *b, CT_SOA *out);
// Per-element reductions over two-component SOAs, writing one float per
// element to the plain array `out`. The signatures match the functions
// generated by CT_SOA2_PROLOGUE3 / CT_SOA_EPILOGUE3_ALT, which check for
// width 2 and equal num and return `out` (NULL via the `fail:` exit).
// NOTE(review): presumably dot product and distance of the 2D vectors at each
// index - inferred from the names, confirm in the implementation file.
float *ct_soa_dot2(const CT_SOA *a, const CT_SOA *b, float *out);
float *ct_soa_dist2(const CT_SOA *a, const CT_SOA *b, float *out);
// NOTE(review): presumably rescale every 2D / 3D vector stored in `a` to
// length `len`, in place - inferred from the names, confirm in the
// implementation file.
CT_SOA *ct_soa_normalize2f_imm(CT_SOA *a, float len);
CT_SOA *ct_soa_normalize3f_imm(CT_SOA *a, float len);
// Gathers element `idx` of the first two components of `s` into `v` (x from
// component 0, y from component 1) and returns `v`.
// Neither `idx` nor the number of components is validated here.
ct_inline CT_Vec2f *ct_soa_get2f(const CT_SOA *s, size_t idx, CT_Vec2f *v) {
  float **lanes = (float **)s->comps;
  v->x = *(lanes[0] + idx);
  v->y = *(lanes[1] + idx);
  return v;
}
// Gathers element `idx` of the first three components of `s` into `v` (x, y,
// z from components 0, 1, 2) and returns `v`.
// Neither `idx` nor the number of components is validated here.
ct_inline CT_Vec3f *ct_soa_get3f(const CT_SOA *s, size_t idx, CT_Vec3f *v) {
  float **lanes = (float **)s->comps;
  v->x = *(lanes[0] + idx);
  v->y = *(lanes[1] + idx);
  v->z = *(lanes[2] + idx);
  return v;
}
// Gathers element `idx` of the first four components of `s` into `v` (x, y,
// z, w from components 0, 1, 2, 3) and returns `v`.
// Neither `idx` nor the number of components is validated here.
ct_inline CT_Vec4f *ct_soa_get4f(const CT_SOA *s, size_t idx, CT_Vec4f *v) {
  float **lanes = (float **)s->comps;
  v->x = *(lanes[0] + idx);
  v->y = *(lanes[1] + idx);
  v->z = *(lanes[2] + idx);
  v->w = *(lanes[3] + idx);
  return v;
}
// Scatters `v` into element `idx` of the first two components of `s` (x to
// component 0, y to component 1) and returns `s`.
// Neither `idx` nor the number of components is validated here.
ct_inline CT_SOA *ct_soa_set2f(CT_SOA *s, size_t idx, const CT_Vec2f *v) {
  float **lanes = (float **)s->comps;
  *(lanes[0] + idx) = v->x;
  *(lanes[1] + idx) = v->y;
  return s;
}
// Scatters `v` into element `idx` of the first three components of `s` (x, y,
// z to components 0, 1, 2) and returns `s`.
// Neither `idx` nor the number of components is validated here.
ct_inline CT_SOA *ct_soa_set3f(CT_SOA *s, size_t idx, const CT_Vec3f *v) {
  float **lanes = (float **)s->comps;
  *(lanes[0] + idx) = v->x;
  *(lanes[1] + idx) = v->y;
  *(lanes[2] + idx) = v->z;
  return s;
}
// Scatters `v` into element `idx` of the first four components of `s` (x, y,
// z, w to components 0, 1, 2, 3) and returns `s`.
// Neither `idx` nor the number of components is validated here.
ct_inline CT_SOA *ct_soa_set4f(CT_SOA *s, size_t idx, const CT_Vec4f *v) {
  float **lanes = (float **)s->comps;
  *(lanes[0] + idx) = v->x;
  *(lanes[1] + idx) = v->y;
  *(lanes[2] + idx) = v->z;
  *(lanes[3] + idx) = v->w;
  return s;
}