@@ -334,6 +334,7 @@ Matrix Matrix::operator*(Matrix const& other) const {
334334 return res;
335335}
336336
337+ #include < stdio.h> // for testing
337338Vector Matrix::operator *(Vector const & other) const {
338339 assert (width == other.height ); // check if dimensions are compatible
339340
@@ -342,18 +343,22 @@ Vector Matrix::operator*(Vector const& other) const {
342343 auto _width = width;
343344 auto _height = height;
344345
346+ uint8_t *res_blocks = res.blocks ;
347+ uint8_t *other_blocks = other.blocks ;
348+ uint64_t *this_blocks = blocks;
349+
350+ printf (" !! adress : %p\n " , this_blocks); // for testing
351+
345352 int16_t i, k;
346353 #if defined(_OPENMP) && defined(TARGET)
347354 if (_height*_width > GPU_LIMIT) {
348- long unsigned int *res_blocks = (long unsigned int *)&res.blocks [0 ];
349- uint8_t *other_blocks = other.blocks ;
350- uint64_t *this_blocks = blocks;
355+ long unsigned int *res_blocks_long = (long unsigned int *)&res.blocks [0 ];
351356
352357 #pragma omp target teams distribute parallel for collapse(2)
353358 for (i = 0 ; i < _height/8 - 1 ; i++)
354359 for (k = 0 ; k < _width; k++) {
355- #pragma omp atomic acquire
356- res_blocks [i] ^= multiply_block_word (this_blocks[k + 8 *i*_width], this_blocks[k + (8 *i + 1 )*_width], this_blocks[k + (8 *i + 2 )*_width], this_blocks[k + (8 *i + 3 )*_width], \
360+ #pragma omp atomic
361+ res_blocks_long [i] ^= multiply_block_word (this_blocks[k + 8 *i*_width], this_blocks[k + (8 *i + 1 )*_width], this_blocks[k + (8 *i + 2 )*_width], this_blocks[k + (8 *i + 3 )*_width], \
357362 this_blocks[k + (8 *i + 4 )*_width], this_blocks[k + (8 *i + 5 )*_width], this_blocks[k + (8 *i + 6 )*_width], this_blocks[k + (8 *i + 7 )*_width], \
358363 other_blocks[k]);
359364 }
@@ -364,18 +369,14 @@ Vector Matrix::operator*(Vector const& other) const {
364369 #pragma omp parallel for collapse(2) schedule(static) if(8*_width > CPU_LIMIT)
365370 for (k = 0 ; k < _width; k++)
366371 for (i = start; i < _height; i++) {
367- #pragma omp atomic if(8*_width > CPU_LIMIT)
368- res. blocks [i] ^= multiply_block_byte (blocks [k + i*_width], other. blocks [k]);
372+ #pragma omp atomic
373+ res_blocks [i] ^= multiply_block_byte (this_blocks [k + i*_width], other_blocks [k]);
369374 }
370375 res.to (start, length);
371376
372377 } else {
373378 #endif
374379
375- uint8_t *res_blocks = res.blocks ;
376- uint8_t *other_blocks = other.blocks ;
377- uint64_t *this_blocks = blocks;
378-
379380 _OPENMP_PRAGMA (" omp parallel for collapse(2) schedule(static) if(_height*_width > CPU_LIMIT)" )
380381 for (k = 0 ; k < _width; k++)
381382 for (i = 0 ; i < _height; i++) {
0 commit comments