10 #ifndef BLITTER_32BPP_SSE_FUNC_HPP
11 #define BLITTER_32BPP_SSE_FUNC_HPP
18 #define INTERNAL_LINKAGE static
22 GNU_TARGET(SSE_TARGET)
23 INTERNAL_LINKAGE
inline void InsertFirstUint32(
const uint32_t value, __m128i &into)
25 #if (SSE_VERSION >= 4)
26 into = _mm_insert_epi32(into, value, 0);
28 into = _mm_insert_epi16(into, value, 0);
29 into = _mm_insert_epi16(into, value >> 16, 1);
33 GNU_TARGET(SSE_TARGET)
34 INTERNAL_LINKAGE
inline void InsertSecondUint32(
const uint32_t value, __m128i &into)
36 #if (SSE_VERSION >= 4)
37 into = _mm_insert_epi32(into, value, 1);
39 into = _mm_insert_epi16(into, value, 2);
40 into = _mm_insert_epi16(into, value >> 16, 3);
44 GNU_TARGET(SSE_TARGET)
45 INTERNAL_LINKAGE
inline void LoadUint64(
const uint64_t value, __m128i &into)
47 #ifdef POINTER_IS_64BIT
48 into = _mm_cvtsi64_si128(value);
50 #if (SSE_VERSION >= 4)
51 into = _mm_cvtsi32_si128(value);
52 InsertSecondUint32(value >> 32, into);
54 (*(um128i*) &into).m128i_u64[0] = value;
59 GNU_TARGET(SSE_TARGET)
60 INTERNAL_LINKAGE
inline __m128i PackUnsaturated(__m128i from,
const __m128i &mask)
62 #if (SSE_VERSION == 2)
63 from = _mm_and_si128(from, mask);
64 return _mm_packus_epi16(from, from);
66 return _mm_shuffle_epi8(from, mask);
70 GNU_TARGET(SSE_TARGET)
71 INTERNAL_LINKAGE
inline __m128i DistributeAlpha(
const __m128i from,
const __m128i &mask)
73 #if (SSE_VERSION == 2)
74 __m128i alphaAB = _mm_shufflelo_epi16(from, 0x3F);
75 alphaAB = _mm_shufflehi_epi16(alphaAB, 0x3F);
76 return _mm_andnot_si128(mask, alphaAB);
78 return _mm_shuffle_epi8(from, mask);
82 GNU_TARGET(SSE_TARGET)
83 INTERNAL_LINKAGE
inline __m128i AlphaBlendTwoPixels(__m128i src, __m128i dst,
const __m128i &distribution_mask,
const __m128i &pack_mask,
const __m128i &alpha_mask)
85 __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128());
86 __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
88 __m128i alphaMaskAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128());
89 __m128i alphaAB = _mm_sub_epi16(srcAB, alphaMaskAB);
90 alphaAB = DistributeAlpha(alphaAB, distribution_mask);
92 srcAB = _mm_sub_epi16(srcAB, dstAB);
93 srcAB = _mm_mullo_epi16(srcAB, alphaAB);
94 srcAB = _mm_srli_epi16(srcAB, 8);
95 srcAB = _mm_add_epi16(srcAB, dstAB);
97 alphaMaskAB = _mm_and_si128(alphaMaskAB, alpha_mask);
98 srcAB = _mm_or_si128(srcAB, alphaMaskAB);
100 return PackUnsaturated(srcAB, pack_mask);
106 GNU_TARGET(SSE_TARGET)
107 INTERNAL_LINKAGE
inline __m128i DarkenTwoPixels(__m128i src, __m128i dst,
const __m128i &distribution_mask,
const __m128i &tr_nom_base)
109 __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128());
110 __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
111 __m128i alphaAB = DistributeAlpha(srcAB, distribution_mask);
112 alphaAB = _mm_srli_epi16(alphaAB, 2);
113 __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
114 dstAB = _mm_mullo_epi16(dstAB, nom);
115 dstAB = _mm_srli_epi16(dstAB, 8);
116 return _mm_packus_epi16(dstAB, dstAB);
/*
 * Brightness adjustment of a single colour; returns the adjusted colour with
 * the alpha byte of the input carried over unchanged.
 *
 * NOTE(review): this listing is an extract. The embedded listing numbers skip
 * 122, 124, 128-129, 132, 134, 136, 140, 145-146 and 149, so braces and some
 * statements are not shown. In particular `ret` and `c128` are used below
 * without a visible declaration and `brightness` is never read in the visible
 * lines; confirm against the complete file before changing anything here.
 */
119 IGNORE_UNINITIALIZED_WARNING_START
120 GNU_TARGET(SSE_TARGET)
121 INTERNAL_LINKAGE
Colour ReallyAdjustBrightness(
Colour colour, uint8_t brightness)
/* Spread blue, green and red over three 16-bit fields of one 64-bit integer. */
123 uint64_t c16 = colour.b | (uint64_t) colour.g << 16 | (uint64_t) colour.r << 32;
/* Keep an undivided copy; the overbright test below looks at bit 15 of each field. */
125 uint64_t c16_ob = c16;
126 c16 /= Blitter_32bppBase::DEFAULT_BRIGHTNESS;
/* Keep 9 bits per 16-bit field. */
127 c16 &= 0x01FF01FF01FFULL;
/* For every field whose undivided bit 15 is set, keep the low 8 bits of the divided field. */
130 c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001ULL) * 0xFF) & c16;
/* Half the sum of the three per-channel values computed above. */
131 const uint ob = ((uint16_t) c16_ob + (uint16_t) (c16_ob >> 16) + (uint16_t) (c16_ob >> 32)) / 2;
/* Alpha is not scaled; it is ORed back into the result at the end. */
133 const uint32_t alpha32 = colour.
data & 0xFF000000;
135 LoadUint64(c16, ret);
/* Shuffle control 0xC0 copies 16-bit word 0 into words 1 and 2 and leaves word 3 in place. */
137 __m128i ob128 = _mm_cvtsi32_si128(ob);
138 ob128 = _mm_shufflelo_epi16(ob128, 0xC0);
139 __m128i white = OVERBRIGHT_VALUE_MASK;
/* ret = (white - c128) * ob / 256 + c128, with an unsigned saturating subtraction. */
141 ret = _mm_subs_epu16(white, c128);
142 ret = _mm_mullo_epi16(ret, ob128);
143 ret = _mm_srli_epi16(ret, 8);
144 ret = _mm_add_epi16(ret, c128);
/* Narrow to bytes with unsigned saturation and combine with the saved alpha. */
147 ret = _mm_packus_epi16(ret, ret);
148 return alpha32 | _mm_cvtsi128_si32(ret);
150 IGNORE_UNINITIALIZED_WARNING_STOP
155 INTERNAL_LINKAGE
inline Colour AdjustBrightneSSE(
Colour colour, uint8_t brightness)
158 if (brightness == Blitter_32bppBase::DEFAULT_BRIGHTNESS)
return colour;
160 return ReallyAdjustBrightness(colour, brightness);
/*
 * Brightness adjustment of the two pixels held in the low 64 bits of `from`.
 * `brightness` is a 32-bit word of which only the bytes selected by
 * 0xFF00FF00 are kept.
 *
 * NOTE(review): this listing is an extract. The embedded listing numbers skip
 * 165, 167-172, 175, 182-186, 192, 200 and 202-204, so the body of the
 * `SSE_VERSION < 3` branch, the matching #else/#endif and the braces are not
 * shown; the statements below presumably belong to the other branch.
 */
163 GNU_TARGET(SSE_TARGET)
164 INTERNAL_LINKAGE
inline __m128i AdjustBrightnessOfTwoPixels([[maybe_unused]] __m128i from, [[maybe_unused]] uint32_t brightness)
166 #if (SSE_VERSION < 3)
/* Keep bytes 1 and 3 of the word and add DEFAULT_BRIGHTNESS into the cleared low byte. */
173 brightness &= 0xFF00FF00;
174 brightness += Blitter_32bppBase::DEFAULT_BRIGHTNESS;
/* Widen the pixels to 16-bit lanes and multiply by the brightness bytes laid out by the shuffle mask. */
176 __m128i colAB = _mm_unpacklo_epi8(from, _mm_setzero_si128());
177 __m128i briAB = _mm_cvtsi32_si128(brightness);
178 briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK);
179 colAB = _mm_mullo_epi16(colAB, briAB);
/* Bit 15 of each product goes to colAB_ob; the product itself is divided by 128. */
180 __m128i colAB_ob = _mm_srli_epi16(colAB, 8 + 7);
181 colAB = _mm_srli_epi16(colAB, 7);
187 colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER);
/* Build the per-lane overbright amount and add the lanes up with two horizontal adds. */
188 colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK);
189 colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK);
190 colAB_ob = _mm_and_si128(colAB_ob, colAB);
191 __m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, _mm_setzero_si128()), _mm_setzero_si128());
/* Halve the sums and lay them out with the overbright shuffle mask. */
193 obAB = _mm_srli_epi16(obAB, 1);
194 obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK);
/* retAB = (OVERBRIGHT_VALUE_MASK - colAB) * obAB / 256 + colAB, with an unsigned saturating subtraction. */
195 __m128i retAB = OVERBRIGHT_VALUE_MASK;
196 retAB = _mm_subs_epu16(retAB, colAB);
197 retAB = _mm_mullo_epi16(retAB, obAB);
198 retAB = _mm_srli_epi16(retAB, 8);
199 retAB = _mm_add_epi16(retAB, colAB);
/* Narrow back to bytes with unsigned saturation. */
201 return _mm_packus_epi16(retAB, retAB);
205 #if FULL_ANIMATION == 0
/*
 * Templated sprite drawing routine. Blitter mode, read mode, type of the last
 * block and translucency are template parameters.
 *
 * NOTE(review): this listing is an extract. Many lines (the function
 * signatures, braces, the switch/case lines, labels, else branches, #else and
 * #endif directives and several loop bodies) are not shown; the embedded
 * listing numbers show the gaps. The comments below describe only the
 * statements that are visible.
 */
213 IGNORE_UNINITIALIZED_WARNING_START
214 template <BlitterMode mode, Blitter_32bppSSE2::ReadMode read_mode, Blitter_32bppSSE2::BlockType bt_last,
bool translucent>
215 GNU_TARGET(SSE_TARGET)
/* One signature per SSE version; the signature lines themselves are not visible here. */
216 #if (SSE_VERSION == 2)
218 #elif (SSE_VERSION == 3)
220 #elif (SSE_VERSION == 4)
/* Remap table taken from the blitter parameters. */
224 const uint8_t *
const remap = bp->
remap;
226 int effective_width = bp->
width;
/* Find the sprite info for this zoom level, then the first line to draw (skip_top lines in) of the map-value and RGBA data. */
229 const SpriteData *
const sd = (
const SpriteData *) bp->
sprite;
230 const SpriteInfo *
const si = &sd->infos[zoom];
231 const MapValue *src_mv_line = (
const MapValue *) &sd->data[si->mv_offset] + bp->
skip_top * si->sprite_width;
232 const Colour *src_rgba_line = (
const Colour *) ((
const uint8_t *) &sd->data[si->sprite_offset] + bp->
skip_top * si->sprite_line_size);
234 if (read_mode != RM_WITH_MARGIN) {
238 const MapValue *src_mv = src_mv_line;
/* Constants and macro aliases passed to AlphaBlendTwoPixels() and DarkenTwoPixels(); they differ between SSE2 and later versions. */
241 const __m128i alpha_and = ALPHA_AND_MASK;
242 #define ALPHA_BLEND_PARAM_3 alpha_and
243 #if (SSE_VERSION == 2)
244 const __m128i clear_hi = CLEAR_HIGH_BYTE_MASK;
245 #define ALPHA_BLEND_PARAM_1 alpha_and
246 #define ALPHA_BLEND_PARAM_2 clear_hi
247 #define DARKEN_PARAM_1 tr_nom_base
248 #define DARKEN_PARAM_2 tr_nom_base
250 const __m128i a_cm = ALPHA_CONTROL_MASK;
251 const __m128i pack_low_cm = PACK_LOW_CONTROL_MASK;
252 #define ALPHA_BLEND_PARAM_1 a_cm
253 #define ALPHA_BLEND_PARAM_2 pack_low_cm
254 #define DARKEN_PARAM_1 a_cm
255 #define DARKEN_PARAM_2 tr_nom_base
257 const __m128i tr_nom_base = TRANSPARENT_NOM_BASE;
/* One iteration per line to draw. */
259 for (
int y = bp->
height; y != 0; y--) {
/* Pixel data starts META_LENGTH entries into the RGBA line. */
261 const Colour *src = src_rgba_line + META_LENGTH;
/*
 * RM_WITH_MARGIN: the first entry of the line gives the amount by which src
 * and dst are advanced; the first and second entries are then used to reduce
 * effective_width. A line with nothing left to draw jumps to next_line.
 * NOTE(review): presumably these two entries are the left and right
 * transparent margins of the line; confirm against the sprite encoder.
 */
264 if (read_mode == RM_WITH_MARGIN) {
265 assert(bt_last == BT_NONE);
266 src += src_rgba_line[0].
data;
267 dst += src_rgba_line[0].
data;
269 const int width_diff = si->sprite_width - bp->
width;
270 effective_width = bp->
width - (int) src_rgba_line[0].data;
271 const int delta_diff = (int) src_rgba_line[1].data - width_diff;
272 const int new_width = effective_width - delta_diff;
273 effective_width = delta_diff > 0 ? new_width : effective_width;
274 if (effective_width <= 0)
goto next_line;
/* Plain copy of every source pixel whose alpha is non-zero. */
280 for (uint x = (uint) effective_width; x > 0; x--) {
281 if (src->
a) *dst = *src;
/* Alpha-blend two pixels per iteration using 64-bit loads and stores. */
288 for (uint x = (uint) effective_width / 2; x > 0; x--) {
289 __m128i srcABCD = _mm_loadl_epi64((
const __m128i*) src);
290 __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
291 _mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3));
/* Blend one more pixel when the width is odd (bt_last == BT_NONE) or the template says so (BT_ODD). */
296 if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
297 __m128i srcABCD = _mm_cvtsi32_si128(src->
data);
298 __m128i dstABCD = _mm_cvtsi32_si128(dst->
data);
299 dst->
data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3));
/* SSE3 and later: colour remap of two pixels per iteration; the two map values are read as one 32-bit word. */
304 #if (SSE_VERSION >= 3)
305 for (uint x = (uint) effective_width / 2; x > 0; x--) {
306 __m128i srcABCD = _mm_loadl_epi64((
const __m128i*) src);
307 __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
308 uint32_t mvX2 = *((uint32_t *)
const_cast<MapValue *
>(src_mv));
/* Bytes 0 and 2 of mvX2 are the values used to index remap[] below; remap only if at least one is non-zero. */
311 if (mvX2 & 0x00FF00FF) {
312 #define CMOV_REMAP(m_colour, m_colour_init, m_src, m_m) \
314 Colour m_colour = m_colour_init; \
316 const Colour srcm = (Colour) (m_src); \
317 const uint m = (uint8_t) (m_m); \
318 const uint r = remap[m]; \
319 const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \
320 m_colour = r == 0 ? m_colour : cmap; \
321 m_colour = m != 0 ? m_colour : srcm; \
323 #ifdef POINTER_IS_64BIT
/* 64-bit path: take both pixels as one 64-bit integer, remap each half and put the result back. */
324 uint64_t srcs = _mm_cvtsi128_si64(srcABCD);
325 uint64_t remapped_src = 0;
326 CMOV_REMAP(c0, 0, srcs, mvX2);
327 remapped_src = c0.data;
328 CMOV_REMAP(c1, 0, srcs >> 32, mvX2 >> 16);
329 remapped_src |= (uint64_t) c1.data << 32;
330 srcABCD = _mm_cvtsi64_si128(remapped_src);
/* Other path: remap the two pixels separately into a two-element array and reload it. */
333 CMOV_REMAP(c0, 0, _mm_cvtsi128_si32(srcABCD), mvX2);
334 remapped_src[0] = c0.
data;
335 CMOV_REMAP(c1, 0, src[1], mvX2 >> 16);
336 remapped_src[1] = c1.
data;
337 srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src);
/* Skip the brightness step when bytes 1 and 3 of mvX2 are both 0x80. NOTE(review): presumably 0x80 is the default brightness; confirm. */
340 if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2);
344 _mm_storel_epi64((__m128i *) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3));
350 if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
/* One pixel at a time: look up the remapped colour, adjust its brightness, then write it directly or blend it using the source alpha. */
352 for (uint x = (uint) effective_width; x > 0; x--) {
357 const uint r = remap[src_mv->m];
359 Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
361 *dst = remapped_colour;
363 remapped_colour.
a = src->
a;
364 srcABCD = _mm_cvtsi32_si128(remapped_colour.
data);
365 goto bmcr_alpha_blend_single;
369 srcABCD = _mm_cvtsi32_si128(src->
data);
371 bmcr_alpha_blend_single:
372 __m128i dstABCD = _mm_cvtsi32_si128(dst->
data);
373 srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3);
375 dst->
data = _mm_cvtsi128_si32(srcABCD);
377 #if (SSE_VERSION == 2)
/* Darken the destination two pixels per iteration; this loop uses bp->width, not effective_width. */
387 for (uint x = (uint) bp->
width / 2; x > 0; x--) {
388 __m128i srcABCD = _mm_loadl_epi64((
const __m128i*) src);
389 __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
390 _mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, DARKEN_PARAM_1, DARKEN_PARAM_2));
/* Darken one more pixel when the width is odd (bt_last == BT_NONE) or the template says so (BT_ODD). */
395 if ((bt_last == BT_NONE && bp->
width & 1) || bt_last == BT_ODD) {
396 __m128i srcABCD = _mm_cvtsi32_si128(src->
data);
397 __m128i dstABCD = _mm_cvtsi32_si128(dst->
data);
398 dst->
data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, DARKEN_PARAM_1, DARKEN_PARAM_2));
/* NOTE(review): the body of the next loop is not visible in this extract. */
404 for (uint x = (uint) bp->
width; x > 0; x--) {
/* Pixels with map value m == 0 are replaced by a grey value from MakeDark(); the others are remapped and composed using the source alpha when the remap entry is non-zero. */
415 for (uint x = (uint) bp->
width; x > 0; x--) {
416 if (src_mv->m == 0) {
418 uint8_t g = MakeDark(src->r, src->g, src->b);
419 *dst = ComposeColourRGBA(g, g, g, src->
a, *dst);
422 uint r = remap[src_mv->m];
423 if (r != 0) *dst = ComposeColourPANoCheck(this->AdjustBrightness(this->LookupColourInPalette(r), src_mv->v), src->
a, *dst);
/* NOTE(review): the body of the next loop is not visible in this extract. */
432 for (uint x = (uint) bp->
width; x > 0; x--) {
/* Advance to the next source line (sprite_line_size bytes) and the next destination line (pitch pixels). */
445 src_rgba_line = (
const Colour*) ((
const uint8_t*) src_rgba_line + si->sprite_line_size);
446 dst_line += bp->
pitch;
449 IGNORE_UNINITIALIZED_WARNING_STOP
/*
 * Runtime entry point that forwards to one instantiation of the templated
 * Draw<>() per blitter mode.
 *
 * NOTE(review): this listing is an extract. The function signatures, the
 * switch statements, several case labels and the conditions that choose
 * between RM_WITH_SKIP and RM_WITH_MARGIN are not shown.
 */
458 #if (SSE_VERSION == 2)
460 #elif (SSE_VERSION == 3)
462 #elif (SSE_VERSION == 4)
/* The lowest bit of the width is cast to BlockType to pick between the two RM_WITH_SKIP instantiations below. */
470 const BlockType bt_last = (BlockType) (bp->
width & 1);
472 default: Draw<BM_NORMAL, RM_WITH_SKIP, BT_EVEN, true>(bp, zoom);
return;
473 case BT_ODD: Draw<BM_NORMAL, RM_WITH_SKIP, BT_ODD, true>(bp, zoom);
return;
/* With margins: the sprite's SF_TRANSLUCENT flag selects the translucent or the non-translucent instantiation. */
476 if (((
const Blitter_32bppSSE_Base::SpriteData *) bp->
sprite)->flags & SF_TRANSLUCENT) {
477 Draw<BM_NORMAL, RM_WITH_MARGIN, BT_NONE, true>(bp, zoom);
479 Draw<BM_NORMAL, RM_WITH_MARGIN, BT_NONE, false>(bp, zoom);
/* Sprites flagged SF_NO_REMAP are drawn through the bm_normal label instead of the colour-remap instantiations. */
486 if (((
const Blitter_32bppSSE_Base::SpriteData *) bp->
sprite)->flags & SF_NO_REMAP)
goto bm_normal;
488 Draw<BM_COLOUR_REMAP, RM_WITH_SKIP, BT_NONE, true>(bp, zoom);
return;
490 Draw<BM_COLOUR_REMAP, RM_WITH_MARGIN, BT_NONE, true>(bp, zoom);
return;
/* The remaining modes each have a single instantiation without a read mode or block type. */
492 case BM_TRANSPARENT: Draw<BM_TRANSPARENT, RM_NONE, BT_NONE, true>(bp, zoom);
return;
493 case BM_TRANSPARENT_REMAP: Draw<BM_TRANSPARENT_REMAP, RM_NONE, BT_NONE, true>(bp, zoom);
return;
494 case BM_CRASH_REMAP: Draw<BM_CRASH_REMAP, RM_NONE, BT_NONE, true>(bp, zoom);
return;
495 case BM_BLACK_REMAP: Draw<BM_BLACK_REMAP, RM_NONE, BT_NONE, true>(bp, zoom);
return;
BlitterMode
The modes of blitting we can do.
@ BM_BLACK_REMAP
Perform remapping to a completely blackened sprite.
@ BM_COLOUR_REMAP
Perform a colour remapping.
@ BM_TRANSPARENT_REMAP
Perform transparency colour remapping.
@ BM_TRANSPARENT
Perform transparency darkening remapping.
@ BM_CRASH_REMAP
Perform a crash remapping.
uint8_t GetNearestColourIndex(uint8_t r, uint8_t g, uint8_t b)
Get nearest colour palette index from an RGB colour.
Parameters related to blitting.
int skip_top
How many pixels of the source to skip at the top (based on zoom of dst)
void * dst
Destination buffer.
int left
The left offset in the 'dst' in pixels to start drawing.
int pitch
The pitch of the destination buffer.
int skip_left
How many pixels of the source to skip on the left (based on zoom of dst)
int height
The height in pixels that needs to be drawn to dst.
const uint8_t * remap
XXX – Temporary storage for remap array.
int width
The width in pixels that needs to be drawn to dst.
const void * sprite
Pointer to the sprite, however the encoder stored it.
int top
The top offset in the 'dst' in pixels to start drawing.
Structure to access the alpha, red, green, and blue channels from a 32 bit number.
uint32_t data
Conversion of the channel information to a 32 bit number.
uint8_t a
colour channels in LE order
ZoomLevel
All zoom levels we know.