// CpuDetensorizer.h
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "inc/ImageConversionTypes.h"
namespace _winml {
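// Converts a CPU tensor stored as planar channels (one full plane per channel) back into an
// interleaved pixel buffer: 4-byte BGRX/RGBX pixels with alpha forced to 255, or 1-byte GRAY8.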
class CpuDetensorizer {
public:
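// Copies tensorHeight x tensorWidth pixels from the planar tensor pCPUTensor (channel order
// formatFrom) into the interleaved buffer pData (channel order formatTo). Rows of pData are
// bufferWidth bytes apart; bufferWidth may be larger than the packed row size due to padding.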
template <typename T>
static HRESULT Detensorize(
_In_ ImageTensorChannelType formatFrom,
_In_ ImageTensorChannelType formatTo,
_In_ const T* pCPUTensor,
_In_ uint32_t bufferWidth,
_In_ uint32_t tensorHeight,
_In_ uint32_t tensorWidth,
_Inout_ BYTE* pData) {
#pragma warning(push)
#pragma warning(disable : 26014) // Warns about a possible out-of-bounds access of pData, but the input is checked for the BGRA8 format, so uiCapacity should be a multiple of 4.
// Output is BGRA8: blue is at i, green at i + 1, red at i + 2.
uint32_t bytesPerPixel = formatTo == kImageTensorChannelTypeGRAY8 ? 1 : 4;
// bufferWidth may include extra padding added for optimization, while bytesPerRow covers only the
// real tensor data, so advance by bufferWidth to jump over the padding between rows.
uint32_t bytesPerRow = tensorWidth * bytesPerPixel;
uint32_t end = bufferWidth * tensorHeight;
size_t tensorPlaneSize = tensorWidth * tensorHeight;
if (formatFrom == formatTo && (formatFrom == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) {
for (uint32_t i = 0; i < tensorHeight; i++) {
BYTE* pPixel = pData;
InterleaveRowFloatToByte(
pCPUTensor + i * tensorWidth,
pCPUTensor + tensorPlaneSize + i * tensorWidth,
pCPUTensor + tensorPlaneSize * 2 + i * tensorWidth,
tensorWidth,
pPixel,
bytesPerPixel);
pData += bufferWidth;
}
} else if ((formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeBGR8) || (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeRGB8)) {
for (uint32_t i = 0; i < tensorHeight; i++) {
BYTE* pPixel = pData;
InterleaveRowFloatToByte(
pCPUTensor + tensorPlaneSize * 2 + i * tensorWidth,
pCPUTensor + tensorPlaneSize + i * tensorWidth,
pCPUTensor + i * tensorWidth,
tensorWidth,
pPixel,
bytesPerPixel);
pData += bufferWidth;
}
} else if (formatFrom == kImageTensorChannelTypeGRAY8 && (formatTo == kImageTensorChannelTypeBGR8 || formatTo == kImageTensorChannelTypeRGB8)) {
// just replicate the gray data across each channel
for (uint32_t i = 0; i < end; i += bufferWidth) {
for (uint32_t j = i; j < i + bytesPerRow; j += 4) {
BYTE bGray = DetensorizeValue<T>(pCPUTensor);
pData[j] = bGray;
pData[j + 1] = bGray;
pData[j + 2] = bGray;
pData[j + 3] = 255;
pCPUTensor++;
}
}
} else if (formatFrom == kImageTensorChannelTypeGRAY8 && formatTo == kImageTensorChannelTypeGRAY8) {
for (uint32_t i = 0; i < end; i += bufferWidth) {
for (uint32_t j = i; j < i + bytesPerRow; j += 1) {
BYTE bGray = DetensorizeValue<T>(pCPUTensor);
pData[j] = bGray;
pCPUTensor++;
}
}
} else if (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeGRAY8) {
for (uint32_t i = 0; i < end; i += bufferWidth) {
for (uint32_t j = i; j < i + bytesPerRow; j += 1) {
BYTE red, green, blue;
blue = DetensorizeValue(pCPUTensor);
green = DetensorizeValue(pCPUTensor + tensorPlaneSize);
red = DetensorizeValue(pCPUTensor + tensorPlaneSize * 2);
pData[j] = static_cast<BYTE>(0.2126f * red + 0.7152f * green + 0.0722f * blue);
pCPUTensor++;
}
}
} else if (formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeGRAY8) {
for (uint32_t i = 0; i < end; i += bufferWidth) {
for (uint32_t j = i; j < i + bytesPerRow; j += 1) {
BYTE red, green, blue;
red = DetensorizeValue(pCPUTensor);
green = DetensorizeValue(pCPUTensor + tensorPlaneSize);
blue = DetensorizeValue(pCPUTensor + tensorPlaneSize * 2);
pData[j] = static_cast<BYTE>(0.2126f * red + 0.7152f * green + 0.0722f * blue);
pCPUTensor++;
}
}
} else {
return E_INVALIDARG;
}
#pragma warning(pop)
return S_OK;
}
private:
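// Reads a single tensor element and promotes it to float.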
template <typename T>
static float ReadTensor(const T* pCPUTensor) {
return *pCPUTensor;
}
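// Half-precision specialization: widens the FP16 element to float.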
template <>
static float ReadTensor<DirectX::PackedVector::HALF>(const DirectX::PackedVector::HALF* pCPUTensor) {
return DirectX::PackedVector::XMConvertHalfToFloat(*pCPUTensor);
}
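// Converts a tensor element to a byte by rounding (adding 0.5) and clamping to [0, 255].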
template <typename T>
static BYTE DetensorizeValue(const T* pCPUTensor) {
return static_cast<BYTE>(std::max(0.0f, std::min(255.0f, ReadTensor(pCPUTensor) + 0.5f)));
}
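// Scalar path: interleaves one row from three planar channels into 4-byte pixels
// laid out as X, Y, Z, 255 (alpha).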
template <typename T>
static void InterleaveRowFloatToByte(
const T* xChannel,
const T* yChannel,
const T* zChannel,
uint32_t tensorWidth,
BYTE* pData,
uint32_t bytesPerPixel) {
BYTE* pPixel = pData;
uint32_t tensorWidthRemaining = tensorWidth;
while (tensorWidthRemaining > 0) {
pPixel[0] = DetensorizeValue(xChannel);
pPixel[1] = DetensorizeValue(yChannel);
pPixel[2] = DetensorizeValue(zChannel);
pPixel[3] = 255;
pPixel += 4;
xChannel++;
yChannel++;
zChannel++;
tensorWidthRemaining--;
}
}
#if defined(_M_AMD64) || defined(_M_IX86)
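// SSE2 specialization for float tensors: converts and interleaves eight pixels per loop
// iteration, then handles any leftover pixels with the scalar tail loop below.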
template <>
static void InterleaveRowFloatToByte(
const float* xChannel,
const float* yChannel,
const float* zChannel,
uint32_t tensorWidth,
BYTE* pData,
uint32_t bytesPerPixel) {
BYTE* pPixel = pData;
uint32_t tensorWidthRemaining = tensorWidth;
__m128 maxv = _mm_set1_ps(255.0f);
__m128 zero = _mm_setzero_ps();
// Prepare a register of 255 alpha values (held as 16-bit lanes, packed down to 8 bits later)
__m128i alpha = _mm_setzero_si128();
alpha = _mm_cmpeq_epi32(alpha, alpha);
alpha = _mm_srli_epi16(alpha, 8);
while (tensorWidthRemaining >= 8) {
// Load eight 32-bit floats from the X channel, clamp to 255, and convert to 32-bit ints
__m128i vXIntsLo = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(xChannel), maxv));
__m128i vXIntsHi = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(xChannel + 4), maxv));
// Pack 32-bit ints into 16-bit ints
__m128i vXWords = _mm_packs_epi32(vXIntsLo, vXIntsHi);
// Load eight 32-bit floats from the Y channel, clamp to 255, and convert to 32-bit ints
__m128i vYIntsLo = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(yChannel), maxv));
__m128i vYIntsHi = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(yChannel + 4), maxv));
// Pack 32-bit ints into 16-bit ints
__m128i vYWords = _mm_packs_epi32(vYIntsLo, vYIntsHi);
// Load eight 32-bit floats from the Z channel, clamp to 255, and convert to 32-bit ints
__m128i vZIntsLo = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(zChannel), maxv));
__m128i vZIntsHi = _mm_cvtps_epi32(_mm_min_ps(_mm_loadu_ps(zChannel + 4), maxv));
// Pack 32-bit ints into 16-bit ints
__m128i vZWords = _mm_packs_epi32(vZIntsLo, vZIntsHi);
// Pack 16-bit ints into 8-bit uints
__m128i vXZBytes = _mm_packus_epi16(vXWords, vZWords);
__m128i vYABytes = _mm_packus_epi16(vYWords, alpha);
// Interleave bytes into XY order
__m128i vXYBytesInterleaved = _mm_unpacklo_epi8(vXZBytes, vYABytes);
// Interleave bytes into ZA order
__m128i vZABytesInterleaved = _mm_unpackhi_epi8(vXZBytes, vYABytes);
// Interleave 16-bit words to get XYZA XYZA ordering
__m128i vPixelBytesLo = _mm_unpacklo_epi16(vXYBytesInterleaved, vZABytesInterleaved);
__m128i vPixelBytesHi = _mm_unpackhi_epi16(vXYBytesInterleaved, vZABytesInterleaved);
// Write out bytes now in proper order
_mm_storeu_si128((__m128i*)pPixel, vPixelBytesLo);
_mm_storeu_si128((__m128i*)(pPixel + 16), vPixelBytesHi);
xChannel += 8;
yChannel += 8;
zChannel += 8;
pPixel += 8 * bytesPerPixel;
tensorWidthRemaining -= 8;
}
// Handle any remaining pixels one at a time
while (tensorWidthRemaining > 0) {
pPixel[0] = DetensorizeValue(xChannel);
pPixel[1] = DetensorizeValue(yChannel);
pPixel[2] = DetensorizeValue(zChannel);
pPixel[3] = 255;
pPixel += bytesPerPixel;
xChannel++;
yChannel++;
zChannel++;
tensorWidthRemaining--;
}
}
#endif
};
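// Illustrative usage sketch (not part of this header; the buffer names and sizes below are
// assumptions for a 224x224 RGB float tensor written out as BGRA8 with no row padding):
//
//   std::vector<float> tensor(3 * 224 * 224);  // planar RGB tensor data
//   std::vector<BYTE> pixels(224 * 224 * 4);   // interleaved BGRA8 output
//   HRESULT hr = CpuDetensorizer::Detensorize<float>(
//       kImageTensorChannelTypeRGB8,           // formatFrom: tensor channel order
//       kImageTensorChannelTypeBGR8,           // formatTo: pixel channel order
//       tensor.data(),
//       /*bufferWidth*/ 224 * 4,
//       /*tensorHeight*/ 224,
//       /*tensorWidth*/ 224,
//       pixels.data());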
} // namespace _winml