10#ifndef BLITTER_32BPP_SSE_FUNC_HPP
11#define BLITTER_32BPP_SSE_FUNC_HPP
18#define INTERNAL_LINKAGE static
23INTERNAL_LINKAGE
inline void InsertFirstUint32(
const uint32_t value, __m128i &into)
26 into = _mm_insert_epi32(into, value, 0);
28 into = _mm_insert_epi16(into, value, 0);
29 into = _mm_insert_epi16(into, value >> 16, 1);
34INTERNAL_LINKAGE
inline void InsertSecondUint32(
const uint32_t value, __m128i &into)
37 into = _mm_insert_epi32(into, value, 1);
39 into = _mm_insert_epi16(into, value, 2);
40 into = _mm_insert_epi16(into, value >> 16, 3);
45INTERNAL_LINKAGE
inline void LoadUint64(
const uint64_t value, __m128i &into)
47#ifdef POINTER_IS_64BIT
48 into = _mm_cvtsi64_si128(value);
50 #if (SSE_VERSION >= 4)
51 into = _mm_cvtsi32_si128(value);
52 InsertSecondUint32(value >> 32, into);
54 (*(um128i*) &into).m128i_u64[0] = value;
60INTERNAL_LINKAGE
inline __m128i PackUnsaturated(__m128i from,
const __m128i &mask)
63 from = _mm_and_si128(from, mask);
64 return _mm_packus_epi16(from, from);
66 return _mm_shuffle_epi8(from, mask);
71INTERNAL_LINKAGE
inline __m128i DistributeAlpha(
const __m128i from,
const __m128i &mask)
74 __m128i alphaAB = _mm_shufflelo_epi16(from, 0x3F);
75 alphaAB = _mm_shufflehi_epi16(alphaAB, 0x3F);
76 return _mm_andnot_si128(mask, alphaAB);
78 return _mm_shuffle_epi8(from, mask);
83INTERNAL_LINKAGE
inline __m128i AlphaBlendTwoPixels(__m128i src, __m128i dst,
const __m128i &distribution_mask,
const __m128i &pack_mask,
const __m128i &alpha_mask)
85 __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128());
86 __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
88 __m128i alphaMaskAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128());
89 __m128i alphaAB = _mm_sub_epi16(srcAB, alphaMaskAB);
90 alphaAB = DistributeAlpha(alphaAB, distribution_mask);
92 srcAB = _mm_sub_epi16(srcAB, dstAB);
93 srcAB = _mm_mullo_epi16(srcAB, alphaAB);
94 srcAB = _mm_srli_epi16(srcAB, 8);
95 srcAB = _mm_add_epi16(srcAB, dstAB);
97 alphaMaskAB = _mm_and_si128(alphaMaskAB, alpha_mask);
98 srcAB = _mm_or_si128(srcAB, alphaMaskAB);
100 return PackUnsaturated(srcAB, pack_mask);
106GNU_TARGET(SSE_TARGET)
107INTERNAL_LINKAGE
inline __m128i DarkenTwoPixels(__m128i src, __m128i dst,
const __m128i &distribution_mask,
const __m128i &tr_nom_base)
109 __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128());
110 __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
111 __m128i alphaAB = DistributeAlpha(srcAB, distribution_mask);
112 alphaAB = _mm_srli_epi16(alphaAB, 2);
113 __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
114 dstAB = _mm_mullo_epi16(dstAB, nom);
115 dstAB = _mm_srli_epi16(dstAB, 8);
116 return _mm_packus_epi16(dstAB, dstAB);
/*
 * Body of a brightness adjustment routine that works on one colour.
 *
 * NOTE(review): this listing has lost lines. The function signature, its braces, the declarations
 * of `ret` and `c128`, and (judging by the gaps in the line numbers fused to each line) a few more
 * statements are not visible here; the routine is presumably ReallyAdjustBrightness() -- confirm
 * against the upstream file. The comments below describe only the visible statements.
 */
119GNU_TARGET(SSE_TARGET)
/* Spread the blue, green and red channels of `colour` over three 16-bit fields of one 64-bit value. */
122 uint64_t c16 = colour.b | (uint64_t) colour.g << 16 | (uint64_t) colour.r << 32;
/* Keep a copy from before the division; bit 15 of each of its fields is tested further down. */
124 uint64_t c16_ob = c16;
125 c16 /= DEFAULT_BRIGHTNESS;
/* Keep 9 bits in each 16-bit field. */
126 c16 &= 0x01FF01FF01FFULL;
/* For every field whose bit 15 is set in the copy, keep the low 8 bits of the same field of c16; all other fields become 0. */
129 c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001ULL) * 0xFF) & c16;
/* Half the sum of the three fields computed above. */
130 const uint ob = ((uint16_t) c16_ob + (uint16_t) (c16_ob >> 16) + (uint16_t) (c16_ob >> 32)) / 2;
/* The top byte of the colour is carried over unchanged into the result. */
132 const uint32_t alpha32 = colour.data & 0xFF000000;
134 LoadUint64(c16, ret);
/* Put `ob` into an SSE register and copy it into the three lowest 16-bit lanes (selector 0xC0). */
136 __m128i ob128 = _mm_cvtsi32_si128(ob);
137 ob128 = _mm_shufflelo_epi16(ob128, 0xC0);
138 __m128i white = OVERBRIGHT_VALUE_MASK;
/* ret = c128 + (((white - c128, saturating at 0) * ob128) >> 8), per 16-bit lane. */
140 ret = _mm_subs_epu16(white, c128);
141 ret = _mm_mullo_epi16(ret, ob128);
142 ret = _mm_srli_epi16(ret, 8);
143 ret = _mm_add_epi16(ret, c128);
/* Narrow to bytes with unsigned saturation and combine the low 32 bits with the saved top byte. */
146 ret = _mm_packus_epi16(ret, ret);
147 return alpha32 | _mm_cvtsi128_si32(ret);
/**
 * Adjust the brightness of a colour.
 *
 * When @p brightness equals DEFAULT_BRIGHTNESS the colour is returned unchanged.
 *
 * NOTE(review): the braces and the remainder of the body (the path taken for any
 * other brightness) are not visible in this listing -- confirm against the upstream file.
 *
 * @param colour The colour to adjust.
 * @param brightness The brightness to apply.
 * @return @p colour itself for DEFAULT_BRIGHTNESS; the other path is not visible here.
 */
153INTERNAL_LINKAGE
inline Colour AdjustBrightneSSE(Colour colour, uint8_t brightness)
/* Shortcut: nothing to do for the default brightness. */
156 if (brightness == DEFAULT_BRIGHTNESS)
return colour;
/**
 * Adjust the brightness of two pixels held in the low 64 bits of an SSE register.
 *
 * NOTE(review): this listing has lost lines. The braces and the start of the body (original
 * lines 163-170) are not visible; the [[maybe_unused]] attributes suggest there is a build
 * configuration in which the parameters are not used -- confirm against the upstream file.
 * The comments below describe only the visible statements.
 *
 * @param from The two pixels, in the low 64 bits.
 * @param brightness Two brightness values; only bits 8..15 and 24..31 are kept.
 * @return The adjusted pixels, narrowed to bytes with unsigned saturation.
 */
161GNU_TARGET(SSE_TARGET)
162INTERNAL_LINKAGE
inline __m128i AdjustBrightnessOfTwoPixels([[maybe_unused]] __m128i from, [[maybe_unused]] uint32_t brightness)
/* Keep the high byte of each 16-bit half and add DEFAULT_BRIGHTNESS to the value. */
171 brightness &= 0xFF00FF00;
172 brightness += DEFAULT_BRIGHTNESS;
/* Widen the pixel bytes to 16-bit lanes and arrange the brightness bytes with BRIGHTNESS_LOW_CONTROL_MASK. */
174 __m128i colAB = _mm_unpacklo_epi8(from, _mm_setzero_si128());
175 __m128i briAB = _mm_cvtsi32_si128(brightness);
176 briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK);
/* Multiply per lane; bit 15 of each product goes to colAB_ob, the product shifted right by 7 stays in colAB. */
177 colAB = _mm_mullo_epi16(colAB, briAB);
178 __m128i colAB_ob = _mm_srli_epi16(colAB, 8 + 7);
179 colAB = _mm_srli_epi16(colAB, 7);
185 colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER);
/* Turn the per-lane flag into a mask (flag * OVERBRIGHT_VALUE_MASK) and use it to select bits of colAB. */
186 colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK);
187 colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK);
188 colAB_ob = _mm_and_si128(colAB_ob, colAB);
/* Two horizontal adds: lanes 0 and 1 end up holding the sums of lanes 0..3 and lanes 4..7. */
189 __m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, _mm_setzero_si128()), _mm_setzero_si128());
/* Halve the sums and arrange them with OVERBRIGHT_CONTROL_MASK. */
191 obAB = _mm_srli_epi16(obAB, 1);
192 obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK);
/* retAB = colAB + (((OVERBRIGHT_VALUE_MASK - colAB, saturating at 0) * obAB) >> 8), per 16-bit lane. */
193 __m128i retAB = OVERBRIGHT_VALUE_MASK;
194 retAB = _mm_subs_epu16(retAB, colAB);
195 retAB = _mm_mullo_epi16(retAB, obAB);
196 retAB = _mm_srli_epi16(retAB, 8);
197 retAB = _mm_add_epi16(retAB, colAB);
199 return _mm_packus_epi16(retAB, retAB);
/*
 * Line-by-line sprite drawing routine, specialised at compile time on the blitter mode, the way
 * a source line is read (read_mode), the handling of the last pixel block (bt_last) and whether
 * the sprite is translucent.
 *
 * NOTE(review): this listing has lost many lines (the function signature, braces, switch/case
 * labels, #else/#endif lines, pointer increments, loop bodies). The numbers fused to the start of
 * lines are line numbers of the original file. The comments below describe only the statements
 * that are visible; anything about the missing parts is marked as an assumption.
 */
203#if FULL_ANIMATION == 0
211template <BlitterMode mode, Blitter_32bppSSE2::ReadMode read_mode, Blitter_32bppSSE2::BlockType bt_last,
bool translucent>
212GNU_TARGET(SSE_TARGET)
/* NOTE(review): presumably one function signature per SSE_VERSION followed each directive -- not visible here. */
213#if (SSE_VERSION == 2)
215#elif (SSE_VERSION == 3)
217#elif (SSE_VERSION == 4)
/* Remap table taken from the blitter parameters. */
221 const uint8_t *
const remap = bp->
remap;
/* First destination pixel: `top` lines of `pitch` pixels down, `left` pixels in. */
222 Colour *dst_line = (Colour *) bp->
dst + bp->
top * bp->
pitch + bp->
left;
223 int effective_width = bp->
width;
/* Sprite data and the per-zoom info block for the requested zoom level. */
226 const SpriteData *
const sd = (
const SpriteData *) bp->
sprite;
227 const SpriteInfo *
const si = &sd->infos[zoom];
/* Start of the MapValue and RGBA data, each advanced by skip_top source lines. */
228 const MapValue *src_mv_line = (
const MapValue *) &sd->data[si->mv_offset] + bp->
skip_top * si->sprite_width;
229 const Colour *src_rgba_line = (
const Colour *) ((
const uint8_t *) &sd->data[si->sprite_offset] + bp->
skip_top * si->sprite_line_size);
/* NOTE(review): the body of this branch is not visible in this listing. */
231 if (read_mode != RM_WITH_MARGIN) {
235 const MapValue *src_mv = src_mv_line;
/* Constant masks and the macro names through which they are passed to the pixel helpers; which set is used depends on SSE_VERSION. */
238 const __m128i alpha_and = ALPHA_AND_MASK;
239 #define ALPHA_BLEND_PARAM_3 alpha_and
240#if (SSE_VERSION == 2)
241 const __m128i clear_hi = CLEAR_HIGH_BYTE_MASK;
242 #define ALPHA_BLEND_PARAM_1 alpha_and
243 #define ALPHA_BLEND_PARAM_2 clear_hi
244 #define DARKEN_PARAM_1 tr_nom_base
245 #define DARKEN_PARAM_2 tr_nom_base
247 const __m128i a_cm = ALPHA_CONTROL_MASK;
248 const __m128i pack_low_cm = PACK_LOW_CONTROL_MASK;
249 #define ALPHA_BLEND_PARAM_1 a_cm
250 #define ALPHA_BLEND_PARAM_2 pack_low_cm
251 #define DARKEN_PARAM_1 a_cm
252 #define DARKEN_PARAM_2 tr_nom_base
254 const __m128i tr_nom_base = TRANSPARENT_NOM_BASE;
/* One iteration per destination line. */
256 for (
int y = bp->
height; y != 0; y--) {
257 Colour *dst = dst_line;
/* The pixel data of a source line starts META_LENGTH entries after the line start. */
258 const Colour *src = src_rgba_line + META_LENGTH;
/* With margins: entry 0 of the line's leading entries is added to both src and dst, and the width to draw is reduced accordingly. */
261 if (read_mode == RM_WITH_MARGIN) {
262 assert(bt_last == BT_NONE);
263 src += src_rgba_line[0].data;
264 dst += src_rgba_line[0].data;
266 const int width_diff = si->sprite_width - bp->
width;
267 effective_width = bp->
width - (int) src_rgba_line[0].data;
/* Entry 1 shortens the width only by the part that exceeds width_diff. */
268 const int delta_diff = (int) src_rgba_line[1].data - width_diff;
269 const int new_width = effective_width - delta_diff;
270 effective_width = delta_diff > 0 ? new_width : effective_width;
/* Nothing left to draw on this line. */
271 if (effective_width <= 0)
goto next_line;
/* Plain copy of every source pixel whose alpha is non-zero. NOTE(review): presumably the non-translucent path -- the selecting condition is not visible. */
277 for (uint x = (uint) effective_width; x > 0; x--) {
278 if (src->a) *dst = *src;
/* Alpha-blend two pixels per iteration. */
285 for (uint x = (uint) effective_width / 2; x > 0; x--) {
286 __m128i srcABCD = _mm_loadl_epi64((
const __m128i*) src);
287 __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
288 _mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3));
/* Blend the single trailing pixel of an odd-width line. */
293 if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
294 __m128i srcABCD = _mm_cvtsi32_si128(src->data);
295 __m128i dstABCD = _mm_cvtsi32_si128(dst->data);
296 dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3));
/* Remapping path that handles two pixels per iteration. NOTE(review): presumably the colour-remap mode -- the case label is not visible. */
301#if (SSE_VERSION >= 3)
302 for (uint x = (uint) effective_width / 2; x > 0; x--) {
303 __m128i srcABCD = _mm_loadl_epi64((
const __m128i*) src);
304 __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
/* Read two MapValues at once as a single 32-bit word. */
305 uint32_t mvX2 = *((uint32_t *)
const_cast<MapValue *
>(src_mv));
/* Remap only when the low byte of at least one of the two 16-bit halves is non-zero. */
308 if (mvX2 & 0x00FF00FF) {
/* CMOV_REMAP yields the source colour when m is 0, the initial colour when remap[m] is 0, and otherwise the palette colour combined with the source's top byte. */
310 #define CMOV_REMAP(m_colour, m_colour_init, m_src, m_m) \
311 Colour m_colour = m_colour_init; \
313 const Colour srcm = (Colour) (m_src); \
314 const uint m = (uint8_t) (m_m); \
315 const uint r = remap[m]; \
316 const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \
317 m_colour = r == 0 ? m_colour : cmap; \
318 m_colour = m != 0 ? m_colour : srcm; \
320#ifdef POINTER_IS_64BIT
321 uint64_t srcs = _mm_cvtsi128_si64(srcABCD);
322 uint64_t remapped_src = 0;
323 CMOV_REMAP(c0, 0, srcs, mvX2);
324 remapped_src = c0.data;
325 CMOV_REMAP(c1, 0, srcs >> 32, mvX2 >> 16);
326 remapped_src |= (uint64_t) c1.data << 32;
327 srcABCD = _mm_cvtsi64_si128(remapped_src);
329 Colour remapped_src[2];
330 CMOV_REMAP(c0, 0, _mm_cvtsi128_si32(srcABCD), mvX2);
331 remapped_src[0] = c0.data;
332 CMOV_REMAP(c1, 0, src[1], mvX2 >> 16);
333 remapped_src[1] = c1.data;
334 srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src);
/* Skip the brightness adjustment when the high byte of both 16-bit halves is 0x80. */
337 if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2);
341 _mm_storel_epi64((__m128i *) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3));
347 if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
/* Remapping path that handles one pixel per iteration. */
349 for (uint x = (uint) effective_width; x > 0; x--) {
354 const uint r = remap[src_mv->m];
356 Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
358 *dst = remapped_colour;
/* Carry over the source alpha and blend the remapped colour. */
360 remapped_colour.a = src->a;
361 srcABCD = _mm_cvtsi32_si128(remapped_colour.data);
362 goto bmcr_alpha_blend_single;
366 srcABCD = _mm_cvtsi32_si128(src->data);
368bmcr_alpha_blend_single:
369 __m128i dstABCD = _mm_cvtsi32_si128(dst->data);
370 srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2, ALPHA_BLEND_PARAM_3);
372 dst->data = _mm_cvtsi128_si32(srcABCD);
374#if (SSE_VERSION == 2)
/* Darken the destination two pixels per iteration, then the trailing pixel of an odd-width line. NOTE(review): presumably the transparency mode -- the case label is not visible. */
384 for (uint x = (uint) bp->
width / 2; x > 0; x--) {
385 __m128i srcABCD = _mm_loadl_epi64((
const __m128i*) src);
386 __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
387 _mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, DARKEN_PARAM_1, DARKEN_PARAM_2));
392 if ((bt_last == BT_NONE && bp->
width & 1) || bt_last == BT_ODD) {
393 __m128i srcABCD = _mm_cvtsi32_si128(src->data);
394 __m128i dstABCD = _mm_cvtsi32_si128(dst->data);
395 dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, DARKEN_PARAM_1, DARKEN_PARAM_2));
/* NOTE(review): the body of this loop is not visible in this listing. */
401 for (uint x = (uint) bp->
width; x > 0; x--) {
/* Per pixel: without a map index write a grey value derived by MakeDark(); otherwise compose the remapped, brightness-adjusted palette colour unless remap[m] is 0. */
412 for (uint x = (uint) bp->
width; x > 0; x--) {
413 if (src_mv->m == 0) {
415 uint8_t g = MakeDark(src->r, src->g, src->b);
416 *dst = ComposeColourRGBA(g, g, g, src->a, *dst);
419 uint r = remap[src_mv->m];
420 if (r != 0) *dst = ComposeColourPANoCheck(AdjustBrightness(this->LookupColourInPalette(r), src_mv->v), src->a, *dst);
/* Write Colour(0, 0, 0) to the destination. NOTE(review): any condition guarding the write is not visible. */
429 for (uint x = (uint) bp->
width; x > 0; x--) {
431 *dst = Colour(0, 0, 0);
/* Advance to the next source line (by sprite_line_size bytes) and the next destination line (by pitch pixels). */
442 src_rgba_line = (
const Colour*) ((
const uint8_t*) src_rgba_line + si->sprite_line_size);
443 dst_line += bp->
pitch;
/*
 * Dispatcher that selects the Draw<> template instantiation to run for a blit.
 *
 * NOTE(review): this listing has lost the function signature, the braces, the switch/case
 * structure and the conditions that choose between RM_WITH_SKIP and RM_WITH_MARGIN. The comments
 * below describe only the statements that are visible.
 */
454#if (SSE_VERSION == 2)
456#elif (SSE_VERSION == 3)
458#elif (SSE_VERSION == 4)
/* The parity of the width selects the last-block type used by the RM_WITH_SKIP instantiations. */
466 const BlockType bt_last = (BlockType) (bp->
width & 1);
468 default: Draw<BlitterMode::Normal, RM_WITH_SKIP, BT_EVEN, true>(bp, zoom);
return;
469 case BT_ODD: Draw<BlitterMode::Normal, RM_WITH_SKIP, BT_ODD, true>(bp, zoom);
return;
/* With margins the sprite's Translucent flag becomes the `translucent` template argument. */
472 if (((
const Blitter_32bppSSE_Base::SpriteData *) bp->
sprite)->flags.Test(SpriteFlag::Translucent)) {
473 Draw<BlitterMode::Normal, RM_WITH_MARGIN, BT_NONE, true>(bp, zoom);
475 Draw<BlitterMode::Normal, RM_WITH_MARGIN, BT_NONE, false>(bp, zoom);
/* Sprites flagged NoRemap are drawn through the normal path instead of the colour-remap one. */
482 if (((
const Blitter_32bppSSE_Base::SpriteData *) bp->
sprite)->flags.Test(SpriteFlag::NoRemap))
goto bm_normal;
484 Draw<BlitterMode::ColourRemap, RM_WITH_SKIP, BT_NONE, true>(bp, zoom);
return;
486 Draw<BlitterMode::ColourRemap, RM_WITH_MARGIN, BT_NONE, true>(bp, zoom);
return;
BlitterMode
The modes of blitting we can do.
@ Transparent
Perform transparency darkening remapping.
@ CrashRemap
Perform a crash remapping.
@ BlackRemap
Perform remapping to a completely blackened sprite.
@ TransparentRemap
Perform transparency colour remapping.
@ ColourRemap
Perform a colour remapping.
Colour ReallyAdjustBrightness(Colour colour, int brightness)
Adjust brightness of colour.
uint8_t GetNearestColourIndex(uint8_t r, uint8_t g, uint8_t b)
Get nearest colour palette index from an RGB colour.
Parameters related to blitting.
int skip_top
How many pixels of the source to skip on the top (based on zoom of dst)
void * dst
Destination buffer.
int left
The left offset in the 'dst' in pixels to start drawing.
int pitch
The pitch of the destination buffer.
int skip_left
How many pixels of the source to skip on the left (based on zoom of dst)
int height
The height in pixels that needs to be drawn to dst.
const uint8_t * remap
XXX – Temporary storage for remap array.
int width
The width in pixels that needs to be drawn to dst.
const void * sprite
Pointer to the sprite however the encoder stored it.
int top
The top offset in the 'dst' in pixels to start drawing.
ZoomLevel
All zoom levels we know.