17#ifndef BLITTER_32BPP_SSE_FUNC_HPP
18#define BLITTER_32BPP_SSE_FUNC_HPP
21#define INTERNAL_LINKAGE static
/**
 * Write a 32-bit value into the first (lowest) 32-bit lane of an SSE register.
 * The insert intrinsics used here only replace the addressed lane(s) of \a into.
 * NOTE(review): the two forms below look like alternative implementations (one 32-bit
 * insert versus two 16-bit inserts); the preprocessor lines that presumably select
 * between them are not visible in this listing.
 * @param value The 32-bit value to insert.
 * @param into  The register to modify in place.
 */
26INTERNAL_LINKAGE inline void InsertFirstUint32(
const uint32_t value, __m128i &into)
/* Form 1: a single insert of all 32 bits at 32-bit lane 0. */
29 into = _mm_insert_epi32(into, value, 0);
/* Form 2: low 16 bits into 16-bit lane 0, high 16 bits into 16-bit lane 1. */
31 into = _mm_insert_epi16(into, value, 0);
32 into = _mm_insert_epi16(into, value >> 16, 1);
/**
 * Write a 32-bit value into the second 32-bit lane (bits 32..63) of an SSE register.
 * The insert intrinsics used here only replace the addressed lane(s) of \a into.
 * NOTE(review): the two forms below look like alternative implementations; the
 * preprocessor lines that presumably select between them are not visible in this listing.
 * @param value The 32-bit value to insert.
 * @param into  The register to modify in place.
 */
37INTERNAL_LINKAGE inline void InsertSecondUint32(
const uint32_t value, __m128i &into)
/* Form 1: a single insert of all 32 bits at 32-bit lane 1. */
40 into = _mm_insert_epi32(into, value, 1);
/* Form 2: low 16 bits into 16-bit lane 2, high 16 bits into 16-bit lane 3. */
42 into = _mm_insert_epi16(into, value, 2);
43 into = _mm_insert_epi16(into, value >> 16, 3);
/*
 * NOTE(review): fragment -- the signature line is not visible in this listing.
 * Presumably this is the body of the helper called elsewhere in this file as
 * LoadUint64(c16, ret): it places the 64-bit `value` into the low 64 bits of `into`.
 */
50#ifdef POINTER_IS_64BIT
/* 64-bit integer moves are available: one move, the upper half of the register is zeroed. */
51 into = _mm_cvtsi64_si128(value);
53 #if (SSE_VERSION >= 4)
/* Move the low 32 bits (zeroing the rest), then insert the high 32 bits as the second 32-bit lane. */
54 into = _mm_cvtsi32_si128(value);
55 InsertSecondUint32(value >> 32, into);
/*
 * Writes the value directly into the first 64-bit element through the um128i view of the register.
 * NOTE(review): presumably the fallback for the remaining configurations; the #else/#endif
 * lines that would confirm this are not visible here.
 */
57 (*(um128i*) &into).m128i_u64[0] = value;
/**
 * Narrow the 16-bit lanes of \a from down to bytes.
 * NOTE(review): the two forms below look like alternative implementations; the
 * preprocessor lines that presumably select between them are not visible in this listing.
 * @param from The register holding 16-bit lanes.
 * @param mask An AND mask in the first form, a byte-shuffle control in the second form.
 * @return The narrowed register.
 */
63INTERNAL_LINKAGE inline __m128i PackUnsaturated(__m128i from,
const __m128i &mask)
/*
 * Form 1: clear the bits outside the mask, then pack the 16-bit lanes to bytes.
 * The pack instruction saturates; masking first presumably keeps every lane within
 * byte range so that no clamping happens (depends on the mask value, defined elsewhere).
 */
66 from = _mm_and_si128(from, mask);
67 return _mm_packus_epi16(from, from);
/* Form 2: select the wanted bytes directly with a byte shuffle. */
69 return _mm_shuffle_epi8(from, mask);
/**
 * Spread the alpha lane of each pixel over that pixel's colour lanes.
 * NOTE(review): the two forms below look like alternative implementations; the
 * preprocessor lines that presumably select between them are not visible in this listing.
 * @param from Two pixels, each expanded to four 16-bit lanes.
 * @param mask Bits to clear from the result in the first form, a byte-shuffle control in the second form.
 * @return The register with the alpha distributed.
 */
74INTERNAL_LINKAGE inline __m128i DistributeAlpha(
const __m128i from,
const __m128i &mask)
/*
 * Form 1: immediate 0x3F copies 16-bit word 3 into words 0..2 and word 0 into word 3,
 * first for the low four words and then for the high four words.
 */
77 __m128i alphaAB = _mm_shufflelo_epi16(from, 0x3F);
78 alphaAB = _mm_shufflehi_epi16(alphaAB, 0x3F);
/* andnot computes (~mask) & alphaAB, i.e. every bit that is set in the mask is cleared. */
79 return _mm_andnot_si128(mask, alphaAB);
/* Form 2: one byte shuffle driven by the control mask. */
81 return _mm_shuffle_epi8(from, mask);
/**
 * Alpha-blend two source pixels over two destination pixels.
 * Only the low 64 bits of \a src and \a dst are used; they are widened to 16-bit lanes first.
 * @param src               Source pixels, 8 bits per channel.
 * @param dst               Destination pixels, 8 bits per channel.
 * @param distribution_mask Forwarded to DistributeAlpha().
 * @param pack_mask         Forwarded to PackUnsaturated().
 * @param alpha_mask        ANDed with the "source lane is non-zero" mask and ORed into the result.
 * @return The result of PackUnsaturated() on the blended 16-bit lanes.
 */
86INTERNAL_LINKAGE inline __m128i AlphaBlendTwoPixels(__m128i src, __m128i dst,
const __m128i &distribution_mask,
const __m128i &pack_mask,
const __m128i &alpha_mask)
/* Zero-extend every byte of the low 64 bits to a 16-bit lane. */
88 __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128());
89 __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
/* 0xFFFF in every lane whose source value is greater than zero, 0 elsewhere. */
91 __m128i alphaMaskAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128());
/* Subtracting 0xFFFF (i.e. -1) adds one to every non-zero lane; zero lanes stay zero. */
92 __m128i alphaAB = _mm_sub_epi16(srcAB, alphaMaskAB);
93 alphaAB = DistributeAlpha(alphaAB, distribution_mask);
/* Per 16-bit lane: result = dst + (((src - dst) * alpha) >> 8), using wrap-around arithmetic. */
95 srcAB = _mm_sub_epi16(srcAB, dstAB);
96 srcAB = _mm_mullo_epi16(srcAB, alphaAB);
97 srcAB = _mm_srli_epi16(srcAB, 8);
98 srcAB = _mm_add_epi16(srcAB, dstAB);
/*
 * OR the bits of alpha_mask into the result for lanes where the source was non-zero.
 * NOTE(review): presumably this sets the result alpha for non-transparent source pixels;
 * that depends on the mask value, which is defined elsewhere.
 */
100 alphaMaskAB = _mm_and_si128(alphaMaskAB, alpha_mask);
101 srcAB = _mm_or_si128(srcAB, alphaMaskAB);
103 return PackUnsaturated(srcAB, pack_mask);
/**
 * Darken two destination pixels by an amount derived from the source alpha.
 * Per 16-bit lane the result is (dst * (tr_nom_base - (distributed alpha >> 2))) >> 8.
 * @param src               Source pixels; they only feed the alpha distribution.
 * @param dst               Destination pixels, 8 bits per channel.
 * @param distribution_mask Forwarded to DistributeAlpha().
 * @param tr_nom_base       Base value the quartered alpha is subtracted from.
 * @return The darkened pixels, packed back to bytes with unsigned saturation.
 */
109GNU_TARGET(SSE_TARGET)
110INTERNAL_LINKAGE inline __m128i DarkenTwoPixels(__m128i src, __m128i dst,
const __m128i &distribution_mask,
const __m128i &tr_nom_base)
/* Zero-extend every byte of the low 64 bits to a 16-bit lane. */
112 __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128());
113 __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
114 __m128i alphaAB = DistributeAlpha(srcAB, distribution_mask);
/* Quarter the distributed alpha and subtract it from the base to get the multiplier. */
115 alphaAB = _mm_srli_epi16(alphaAB, 2);
116 __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
/* Scale the destination and drop the low 8 bits of the product. */
117 dstAB = _mm_mullo_epi16(dstAB, nom);
118 dstAB = _mm_srli_epi16(dstAB, 8);
119 return _mm_packus_epi16(dstAB, dstAB);
/*
 * NOTE(review): fragment -- the signature and some statements of this function are not
 * visible in this listing (for example the declarations of `ret` and `c128`, and whatever
 * sits between the first two statements below). Presumably this is the body of
 * ReallyAdjustBrightness(Colour colour, int brightness).
 */
122GNU_TARGET(SSE_TARGET)
/* Spread blue, green and red over three 16-bit fields of a single 64-bit integer. */
125 uint64_t c16 = colour.b | (uint64_t) colour.g << 16 | (uint64_t) colour.r << 32;
/* Keep a copy from before the division; it is used below to detect overbright fields. */
127 uint64_t c16_ob = c16;
/*
 * Divide the packed value by the default brightness and keep 9 bits per field.
 * NOTE(review): the mask presumably also discards bits that crossed over from the neighbouring field.
 */
128 c16 /= DEFAULT_BRIGHTNESS;
129 c16 &= 0x01FF01FF01FFULL;
/*
 * Take bit 15 of every field of the copy, expand it to 0xFF and AND it with the divided value:
 * fields without that bit contribute nothing, the others contribute their low 8 bits.
 */
132 c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001ULL) * 0xFF) & c16;
/* Sum the three selected fields and halve the total. */
133 const uint ob = ((uint16_t) c16_ob + (uint16_t) (c16_ob >> 16) + (uint16_t) (c16_ob >> 32)) / 2;
/* The alpha byte of the input is carried over unchanged. */
135 const uint32_t alpha32 = colour.data & 0xFF000000;
137 LoadUint64(c16, ret);
/* Put `ob` in 16-bit word 0; immediate 0xC0 then copies word 0 into words 0..2 and keeps word 3. */
139 __m128i ob128 = _mm_cvtsi32_si128(ob);
140 ob128 = _mm_shufflelo_epi16(ob128, 0xC0);
141 __m128i white = OVERBRIGHT_VALUE_MASK;
/* Per 16-bit lane: ret = c128 + (((white - c128, saturating at 0) * ob) >> 8). */
143 ret = _mm_subs_epu16(white, c128);
144 ret = _mm_mullo_epi16(ret, ob128);
145 ret = _mm_srli_epi16(ret, 8);
146 ret = _mm_add_epi16(ret, c128);
/* Narrow to bytes with unsigned saturation and recombine with the original alpha byte. */
149 ret = _mm_packus_epi16(ret, ret);
150 return alpha32 | _mm_cvtsi128_si32(ret);
/**
 * Adjust the brightness of a single colour.
 * @param colour     The colour to adjust.
 * @param brightness The brightness to apply; DEFAULT_BRIGHTNESS leaves the colour as is.
 * @return The adjusted colour.
 */
156INTERNAL_LINKAGE inline Colour AdjustBrightneSSE(Colour colour, uint8_t brightness)
/* Fast path: nothing to do at the default brightness. */
159 if (brightness == DEFAULT_BRIGHTNESS)
return colour;
/* NOTE(review): the handling of every other brightness value is not visible in this listing. */
/**
 * Adjust the brightness of two pixels at once.
 * NOTE(review): both parameters are marked [[maybe_unused]], so presumably some build
 * configuration compiles this function without the body below; the preprocessor lines
 * that would show this are not visible in this listing.
 * @param from       The two pixels, 8 bits per channel, in the low 64 bits.
 * @param brightness Packed value; only the high byte of each 16-bit half is kept.
 * @return The adjusted pixels, packed back to bytes with unsigned saturation.
 */
164GNU_TARGET(SSE_TARGET)
165INTERNAL_LINKAGE inline __m128i AdjustBrightnessOfTwoPixels([[maybe_unused]] __m128i from, [[maybe_unused]] uint32_t brightness)
/*
 * Keep the high byte of each 16-bit half and add DEFAULT_BRIGHTNESS to the value.
 * NOTE(review): presumably the added default ends up as the multiplier of the alpha lanes
 * after the shuffle below; that depends on BRIGHTNESS_LOW_CONTROL_MASK, defined elsewhere.
 */
174 brightness &= 0xFF00FF00;
175 brightness += DEFAULT_BRIGHTNESS;
/* Zero-extend the pixel bytes to 16-bit lanes and build the per-lane multipliers. */
177 __m128i colAB = _mm_unpacklo_epi8(from, _mm_setzero_si128());
178 __m128i briAB = _mm_cvtsi32_si128(brightness);
179 briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK);
180 colAB = _mm_mullo_epi16(colAB, briAB);
/* Bit 15 of each product is kept separately for the overbright handling; the product itself is shifted right by 7. */
181 __m128i colAB_ob = _mm_srli_epi16(colAB, 8 + 7);
182 colAB = _mm_srli_epi16(colAB, 7);
188 colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER);
/* Turn the kept bit into a per-lane mask and use it to select the lanes of the scaled colour. */
189 colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK);
190 colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK);
191 colAB_ob = _mm_and_si128(colAB_ob, colAB);
/* Two rounds of horizontal adds sum groups of four adjacent lanes. */
192 __m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, _mm_setzero_si128()), _mm_setzero_si128());
/* Halve the sums and move them into position with a byte shuffle. */
194 obAB = _mm_srli_epi16(obAB, 1);
195 obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK);
/* Per 16-bit lane: ret = col + (((OVERBRIGHT_VALUE_MASK - col, saturating at 0) * ob) >> 8). */
196 __m128i retAB = OVERBRIGHT_VALUE_MASK;
197 retAB = _mm_subs_epu16(retAB, colAB);
198 retAB = _mm_mullo_epi16(retAB, obAB);
199 retAB = _mm_srli_epi16(retAB, 8);
200 retAB = _mm_add_epi16(retAB, colAB);
202 return _mm_packus_epi16(retAB, retAB);
206#if FULL_ANIMATION == 0
/**
 * Draws a sprite to the destination buffer, specialised at compile time.
 * NOTE(review): the function header line chosen by each SSE_VERSION branch below, the
 * switch/case labels for the individual blitter modes and most braces are not visible
 * in this listing; comments below only describe the statements that are visible.
 * @tparam mode        The blitter mode to draw with.
 * @tparam read_mode   How the source rows are read; RM_WITH_MARGIN gets per-row margin handling below.
 * @tparam bt_last     Compile-time hint about the trailing pixel of a row (BT_NONE, BT_EVEN or BT_ODD appear in this file).
 * @tparam translucent Flag chosen by the caller from the sprite's Translucent flag in Normal mode.
 */
214template <BlitterMode mode, Blitter_32bppSSE2::ReadMode read_mode, Blitter_32bppSSE2::BlockType bt_last,
bool translucent>
215GNU_TARGET(SSE_TARGET)
216#if (SSE_VERSION == 2)
218#elif (SSE_VERSION == 3)
220#elif (SSE_VERSION == 4)
/* Cache the remap table and compute the first destination pixel: dst + top * pitch + left. */
224 const uint8_t *
const remap = bp->
remap;
225 Colour *dst_line = (Colour *) bp->
dst + bp->
top * bp->
pitch + bp->
left;
226 int effective_width = bp->
width;
/* Locate the sprite data for the requested zoom level. */
229 const SpriteData *
const sd = (
const SpriteData *) bp->
sprite;
230 const SpriteInfo *
const si = &sd->infos[zoom];
/* First map-value row to draw: skip_top rows of sprite_width entries each. */
231 const MapValue *src_mv_line = (
const MapValue *) &sd->data[si->mv_offset] + bp->
skip_top * si->sprite_width;
/* First colour row to draw: skip_top rows of sprite_line_size bytes each. */
232 const Colour *src_rgba_line = (
const Colour *) ((
const uint8_t *) &sd->data[si->sprite_offset] + bp->
skip_top * si->sprite_line_size);
/* NOTE(review): the body of this branch is not visible in this listing. */
234 if (read_mode != RM_WITH_MARGIN) {
238 const MapValue *src_mv = src_mv_line;
/*
 * Constants handed to AlphaBlendTwoPixels() and DarkenTwoPixels() through the
 * ALPHA_BLEND_PARAM_x / DARKEN_PARAM_x macros. The first group uses plain AND masks,
 * the second group uses shuffle control masks.
 * NOTE(review): the #else/#endif lines separating the two groups are not visible here.
 */
241 const __m128i alpha_and = ALPHA_AND_MASK;
242 #define ALPHA_BLEND_PARAM_3 alpha_and
243#if (SSE_VERSION == 2)
244 const __m128i clear_hi = CLEAR_HIGH_BYTE_MASK;
245 #define ALPHA_BLEND_PARAM_1 alpha_and
246 #define ALPHA_BLEND_PARAM_2 clear_hi
247 #define DARKEN_PARAM_1 tr_nom_base
248 #define DARKEN_PARAM_2 tr_nom_base
250 const __m128i a_cm = ALPHA_CONTROL_MASK;
251 const __m128i pack_low_cm = PACK_LOW_CONTROL_MASK;
252 #define ALPHA_BLEND_PARAM_1 a_cm
253 #define ALPHA_BLEND_PARAM_2 pack_low_cm
254 #define DARKEN_PARAM_1 a_cm
255 #define DARKEN_PARAM_2 tr_nom_base
257 const __m128i tr_nom_base = TRANSPARENT_NOM_BASE;
/* One iteration per destination row. */
259 for (
int y = bp->
height; y != 0; y--) {
260 Colour *dst = dst_line;
/* The pixel data of a row starts META_LENGTH entries after the start of the row. */
261 const Colour *src = src_rgba_line + META_LENGTH;
/*
 * Margin handling: the first two entries of the row are read as counts.
 * Entry 0 is skipped on both source and destination and subtracted from the width;
 * entry 1, reduced by the part of the sprite that is not drawn anyway
 * (sprite_width - width), shortens the row further when it is still positive.
 * NOTE(review): presumably these are the transparent left and right margins of the row.
 */
264 if (read_mode == RM_WITH_MARGIN) {
265 assert(bt_last == BT_NONE);
266 src += src_rgba_line[0].data;
267 dst += src_rgba_line[0].data;
269 const int width_diff = si->sprite_width - bp->
width;
270 effective_width = bp->
width - (int) src_rgba_line[0].data;
271 const int delta_diff = (int) src_rgba_line[1].data - width_diff;
272 const int new_width = effective_width - delta_diff;
273 effective_width = delta_diff > 0 ? new_width : effective_width;
/* Nothing left to draw on this row. */
274 if (effective_width <= 0)
goto next_line;
/* Plain copy, one pixel at a time: a source pixel with non-zero alpha overwrites the destination. */
280 for (uint x = (uint) effective_width; x > 0; x--) {
281 if (src->a) *dst = *src;
/* Alpha blending, two pixels (64 bits) per iteration. */
288 for (uint x = (uint) effective_width / 2; x > 0; x--) {
289 __m128i srcABCD = _mm_loadl_epi64((
const __m128i*) src);
290 __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
291 _mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3));
/* Trailing single pixel: an odd width when bt_last is BT_NONE, or always when bt_last is BT_ODD. */
296 if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
297 __m128i srcABCD = _mm_cvtsi32_si128(src->data);
298 __m128i dstABCD = _mm_cvtsi32_si128(dst->data);
299 dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3));
304#if (SSE_VERSION >= 3)
/* Remapping with alpha blending, two pixels per iteration. */
305 for (uint x = (uint) effective_width / 2; x > 0; x--) {
306 __m128i srcABCD = _mm_loadl_epi64((
const __m128i*) src);
307 __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
/* Read the map values of both pixels as one 32-bit word. */
308 uint32_t mvX2 = *((uint32_t *)
const_cast<MapValue *
>(src_mv));
/* Remap only when the low byte of either 16-bit half (the `m` passed to CMOV_REMAP below) is non-zero. */
311 if (mvX2 & 0x00FF00FF) {
/*
 * CMOV_REMAP declares m_colour and selects its value without branching statements:
 * the source colour when m is zero, the initial value when remap[m] is zero, and otherwise
 * the palette colour of remap[m] combined with the alpha byte of the source.
 */
313 #define CMOV_REMAP(m_colour, m_colour_init, m_src, m_m) \
314 Colour m_colour = m_colour_init; \
316 const Colour srcm = (Colour) (m_src); \
317 const uint m = (uint8_t) (m_m); \
318 const uint r = remap[m]; \
319 const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \
320 m_colour = r == 0 ? m_colour : cmap; \
321 m_colour = m != 0 ? m_colour : srcm; \
323#ifdef POINTER_IS_64BIT
/* Both pixels are moved through one 64-bit integer. */
324 uint64_t srcs = _mm_cvtsi128_si64(srcABCD);
325 uint64_t remapped_src = 0;
326 CMOV_REMAP(c0, 0, srcs, mvX2);
327 remapped_src = c0.data;
328 CMOV_REMAP(c1, 0, srcs >> 32, mvX2 >> 16);
329 remapped_src |= (uint64_t) c1.data << 32;
330 srcABCD = _mm_cvtsi64_si128(remapped_src);
/* NOTE(review): presumably the variant without 64-bit integer moves; the pixels go through a two-element array. */
332 Colour remapped_src[2];
333 CMOV_REMAP(c0, 0, _mm_cvtsi128_si32(srcABCD), mvX2);
334 remapped_src[0] = c0.data;
335 CMOV_REMAP(c1, 0, src[1], mvX2 >> 16);
336 remapped_src[1] = c1.data;
337 srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src);
/*
 * Skip the brightness adjustment when the high byte of both halves equals 0x80.
 * NOTE(review): presumably 0x80 is DEFAULT_BRIGHTNESS; verify against its definition.
 */
340 if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2);
344 _mm_storel_epi64((__m128i *) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3));
/* Single-pixel remapping: used for a trailing pixel, or for the whole row in the per-pixel loop. */
350 if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
352 for (uint x = (uint) effective_width; x > 0; x--) {
357 const uint r = remap[src_mv->m];
359 Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
/* NOTE(review): the conditions choosing between the direct store and the blended path are not visible here. */
361 *dst = remapped_colour;
/* Carry the source alpha over to the remapped colour and blend it. */
363 remapped_colour.a = src->a;
364 srcABCD = _mm_cvtsi32_si128(remapped_colour.data);
365 goto bmcr_alpha_blend_single;
/* No remapping: blend the source pixel as it is. */
369 srcABCD = _mm_cvtsi32_si128(src->data);
371bmcr_alpha_blend_single:
372 __m128i dstABCD = _mm_cvtsi32_si128(dst->data);
373 srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3);
375 dst->data = _mm_cvtsi128_si32(srcABCD);
377#if (SSE_VERSION == 2)
/* Darkening of the destination, two pixels per iteration; this loop uses bp->width rather than effective_width. */
387 for (uint x = (uint) bp->
width / 2; x > 0; x--) {
388 __m128i srcABCD = _mm_loadl_epi64((
const __m128i*) src);
389 __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
390 _mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, DARKEN_PARAM_1, DARKEN_PARAM_2));
/* Trailing single pixel of the darkening pass. */
395 if ((bt_last == BT_NONE && bp->
width & 1) || bt_last == BT_ODD) {
396 __m128i srcABCD = _mm_cvtsi32_si128(src->data);
397 __m128i dstABCD = _mm_cvtsi32_si128(dst->data);
398 dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, DARKEN_PARAM_1, DARKEN_PARAM_2));
/* NOTE(review): the body of this per-pixel loop is not visible in this listing. */
404 for (uint x = (uint) bp->
width; x > 0; x--) {
/*
 * Per-pixel loop: pixels without a map index are converted to a grey value with MakeDark()
 * and composed with the source alpha; other pixels are remapped, brightness-adjusted and
 * composed, unless the remap entry is zero.
 */
415 for (uint x = (uint) bp->
width; x > 0; x--) {
416 if (src_mv->m == 0) {
418 uint8_t g = MakeDark(src->r, src->g, src->b);
419 *dst = ComposeColourRGBA(g, g, g, src->a, *dst);
422 uint r = remap[src_mv->m];
423 if (r != 0) *dst = ComposeColourPANoCheck(AdjustBrightness(this->LookupColourInPalette(r), src_mv->v), src->a, *dst);
/* Per-pixel loop writing Colour(0, 0, 0); NOTE(review): the condition guarding the store is not visible here. */
432 for (uint x = (uint) bp->
width; x > 0; x--) {
434 *dst = Colour(0, 0, 0);
/* Advance to the next row: sprite_line_size bytes in the source, pitch pixels in the destination. */
445 src_rgba_line = (
const Colour*) ((
const uint8_t*) src_rgba_line + si->sprite_line_size);
446 dst_line += bp->
pitch;
/*
 * Selects the Draw<> specialisation to run.
 * NOTE(review): fragment -- the function header chosen by each SSE_VERSION branch, the
 * switch statements and most case labels are not visible in this listing.
 */
457#if (SSE_VERSION == 2)
459#elif (SSE_VERSION == 3)
461#elif (SSE_VERSION == 4)
/* The lowest bit of the width picks the block type for the last pixel of a row. */
469 const BlockType bt_last = (BlockType) (bp->
width & 1);
/* Normal mode reading with skip: one specialisation per trailing block type. */
471 default: Draw<BlitterMode::Normal, RM_WITH_SKIP, BT_EVEN, true>(bp, zoom);
return;
472 case BT_ODD: Draw<BlitterMode::Normal, RM_WITH_SKIP, BT_ODD, true>(bp, zoom);
return;
/* Normal mode reading with margin: the sprite's Translucent flag becomes the `translucent` template argument. */
475 if (((
const Blitter_32bppSSE_Base::SpriteData *) bp->
sprite)->flags.Test(SpriteFlag::Translucent)) {
476 Draw<BlitterMode::Normal, RM_WITH_MARGIN, BT_NONE, true>(bp, zoom);
478 Draw<BlitterMode::Normal, RM_WITH_MARGIN, BT_NONE, false>(bp, zoom);
/* A sprite flagged NoRemap is drawn through the normal-mode path instead of being remapped. */
485 if (((
const Blitter_32bppSSE_Base::SpriteData *) bp->
sprite)->flags.Test(SpriteFlag::NoRemap))
goto bm_normal;
/* Colour remapping, reading with skip or with margin. NOTE(review): the condition choosing between the two is not visible here. */
487 Draw<BlitterMode::ColourRemap, RM_WITH_SKIP, BT_NONE, true>(bp, zoom);
return;
489 Draw<BlitterMode::ColourRemap, RM_WITH_MARGIN, BT_NONE, true>(bp, zoom);
return;
#define INTERNAL_LINKAGE
Prefix all things in this file with this specifier to make them linked internally only.
BlitterMode
The modes of blitting we can do.
@ Transparent
Perform transparency darkening remapping.
@ CrashRemap
Perform a crash remapping.
@ BlackRemap
Perform remapping to a completely blackened sprite.
@ TransparentRemap
Perform transparency colour remapping.
@ ColourRemap
Perform a colour remapping.
Colour ReallyAdjustBrightness(Colour colour, int brightness)
Adjust brightness of colour.
uint8_t GetNearestColourIndex(uint8_t r, uint8_t g, uint8_t b)
Get nearest colour palette index from an RGB colour.
Parameters related to blitting.
int skip_top
How many pixels of the source to skip at the top (based on the zoom of dst).
void * dst
Destination buffer.
int left
The left offset in the 'dst' in pixels to start drawing.
int pitch
The pitch of the destination buffer.
int skip_left
How many pixels of the source to skip on the left (based on the zoom of dst).
int height
The height in pixels that needs to be drawn to dst.
const uint8_t * remap
XXX – Temporary storage for remap array.
int width
The width in pixels that needs to be drawn to dst.
const void * sprite
Pointer to the sprite, in whatever form the encoder stored it.
int top
The top offset in the 'dst' in pixels to start drawing.
ZoomLevel
All zoom levels we know.