2121
2222using namespace v8;
2323
// Per-compiler shims: C99 float-classification names on MSVC, and a common
// spelling (bswap32) for the 32-bit byte-swap intrinsic on every compiler.
#ifdef _MSC_VER
// Windows doesn't support the C99 names for these.
// TODO: unnecessary, should be using std::isnan / std::isinf instead.
#  define isnan(x) _isnan(x)
#  define isinf(x) (!_finite(x))
// Compiler intrinsics header for MSVC builds.
#  include <intrin.h>
#  define bswap32 _byteswap_ulong
#else
// Only x86-64 targets get the x86 intrinsics header; other architectures
// (e.g. ARM) do not ship it and use the scalar code paths.
#  ifdef __x86_64__
#    include <x86intrin.h>
#  endif
#  define bswap32 __builtin_bswap32
#endif
2937
/**
 * Rotates the 32-bit value `n` right by `c` bit positions.
 *
 * The count is reduced modulo 32, so every value of `c` is valid; a count of
 * 0 (or any multiple of 32) returns `n` unchanged.
 *
 * GCC has no portable _rotr intrinsic, so this relies on idiom recognition to
 * get a single rotate instruction. Works for all supported versions of MSVC,
 * GCC x86, GCC ARM and Clang.
 * https://stackoverflow.com/a/776523/1218408
 */
static inline uint32_t rotr(uint32_t n, unsigned int c) noexcept {
  const unsigned int mask = CHAR_BIT * sizeof(n) - 1;  // 31 for a 32-bit n
  c &= mask;
  // (~c + 1) is the unsigned negation of c. Masking it yields (32 - c) % 32,
  // which keeps the left-shift count in [0, 31] and so avoids the undefined
  // shift-by-32 when c == 0.
  return (n >> c) | (n << ((~c + 1) & mask));
}
46+
3047#ifndef isnan
3148#define isnan(x) std::isnan(x)
3249#define isinf(x) std::isinf(x)
@@ -852,32 +869,52 @@ NAN_METHOD(Context2d::PutImageData) {
852869 for (int y = 0; y < rows; ++y) {
853870 uint8_t *dstRow = dst;
854871 uint8_t *srcRow = src;
855- for (int x = 0; x < cols; ++x) {
856- // rgba
857- uint8_t r = *srcRow++;
858- uint8_t g = *srcRow++;
859- uint8_t b = *srcRow++;
860- uint8_t a = *srcRow++;
861-
862- // argb
863- // performance optimization: fully transparent/opaque pixels can be
864- // processed more efficiently.
872+ #if defined(__x86_64__) || defined(_M_X64)
873+ int x = 0;
874+ for (; x < cols - 2; x += 2) {
875+ __m128i px;
876+ memcpy(&px, srcRow, 8); // gcc doesn't define _mm_loadu_si64
877+ px = _mm_unpacklo_epi8(px, _mm_setzero_si128());
878+ // rgba -> bgra
879+ px = _mm_shufflelo_epi16(px, 0b11000110);
880+ px = _mm_shufflehi_epi16(px, 0b11000110);
881+ // broadcast alpha
882+ __m128i av = _mm_shufflelo_epi16(px, 0b11111111);
883+ av = _mm_shufflehi_epi16(av, 0b11111111);
884+ // Set alpha channel to 255 to undo upcoming division by 255
885+ av = _mm_and_si128(av, _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0, 0xFFFF, 0xFFFF, 0xFFFF, 0));
886+ av = _mm_or_si128(av, _mm_setr_epi16(0, 0, 0, 255, 0, 0, 0, 255));
887+ px = _mm_mullo_epi16(px, av);
888+ // divide by 255
889+ px = _mm_mulhi_epu16(px, _mm_set1_epi16(0x8081));
890+ px = _mm_srli_epi16(px, 7);
891+ // pack int16 to int8
892+ px = _mm_packus_epi16(px, px);
893+ memcpy(dstRow, &px, 8);
894+ dstRow += 8;
895+ srcRow += 8;
896+ }
897+ if (x < cols) {
898+ #else
899+ for (int x = 0; x < cols; x++) {
900+ #endif
901+ uint32_t c;
902+ memcpy(&c, srcRow, 4); // rgba (LE)
903+ srcRow += 4;
904+ uint32_t a = c >> 24;
865905 if (a == 0) {
866- *dstRow++ = 0;
867- *dstRow++ = 0;
868- *dstRow++ = 0;
869- *dstRow++ = 0;
870- } else if (a == 255) {
871- *dstRow++ = b;
872- *dstRow++ = g;
873- *dstRow++ = r;
874- *dstRow++ = a;
906+ uint32_t zero = 0;
907+ memcpy(dstRow, &zero, 4);
908+ } else if (a == 255) { // rgba (LE)
909+ c = bswap32(c); // abgr
910+ c = rotr(c, 8); // bgra
911+ memcpy(dstRow, &c, 4);
875912 } else {
876- float alpha = (float) a / 255;
877- *dstRow++ = b * alpha ;
878- *dstRow++ = g * alpha ;
879- *dstRow++ = r * alpha ;
880- * dstRow++ = a ;
913+ uint8_t r = (c & 0xFF) * a / 255;
914+ uint8_t g = (c >> 8 & 0xFF) * a / 255 ;
915+ uint8_t b = (c >> 16 & 0xFF) * a / 255 ;
916+ uint32_t bgra = (a << 24) | (r << 16) | (g << 8) | b ;
917+ memcpy( dstRow, &bgra, 4) ;
881918 }
882919 }
883920 dst += dstStride;
@@ -892,13 +929,13 @@ NAN_METHOD(Context2d::PutImageData) {
892929 uint8_t *dstRow = dst;
893930 uint8_t *srcRow = src;
894931 for (int x = 0; x < cols; ++x) {
895- // rgba
932+ // rgb[a]
896933 uint8_t r = *srcRow++;
897934 uint8_t g = *srcRow++;
898935 uint8_t b = *srcRow++;
899936 srcRow++;
900937
901- // argb
938+ // bgra
902939 *dstRow++ = b;
903940 *dstRow++ = g;
904941 *dstRow++ = r;
0 commit comments