Skip to content

Commit 8dd7480

Browse files
committed
Still trying to debug the GPU operator* (matrix-vector multiply) and the Travis config
1 parent adad98f commit 8dd7480

File tree

2 files changed

+14
-12
lines changed

2 files changed

+14
-12
lines changed

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ openmp:
2121
$(eval U_LDLIBS+=-fopenmp)
2222

2323
gpu-nvidia: openmp
24-
$(eval U_CXX=g++-10 -std=c++0x)
24+
$(eval U_CXX=g++-10 -std=c++0x)
25+
$(eval U_CCFLAGS+=-DTARGET=1)
2526
$(eval U_LDLIBS+=-fno-stack-protector -foffload=nvptx-none)
2627

2728
gpu-amd: openmp

src/arithmetic.inl

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ Matrix Matrix::operator*(Matrix const& other) const {
334334
return res;
335335
}
336336

337+
#include <stdio.h> //for testing
337338
Vector Matrix::operator*(Vector const& other) const {
338339
assert(width == other.height); //check if dimensions are compatible
339340

@@ -342,18 +343,22 @@ Vector Matrix::operator*(Vector const& other) const {
342343
auto _width = width;
343344
auto _height = height;
344345

346+
uint8_t *res_blocks = res.blocks;
347+
uint8_t *other_blocks = other.blocks;
348+
uint64_t *this_blocks = blocks;
349+
350+
printf("!! adress : %p\n", this_blocks); //for testing
351+
345352
int16_t i, k;
346353
#if defined(_OPENMP) && defined(TARGET)
347354
if(_height*_width > GPU_LIMIT) {
348-
long unsigned int *res_blocks = (long unsigned int*)&res.blocks[0];
349-
uint8_t *other_blocks = other.blocks;
350-
uint64_t *this_blocks = blocks;
355+
long unsigned int *res_blocks_long = (long unsigned int*)&res.blocks[0];
351356

352357
#pragma omp target teams distribute parallel for collapse(2)
353358
for (i = 0; i < _height/8 - 1; i++)
354359
for (k = 0; k < _width; k++) {
355-
#pragma omp atomic acquire
356-
res_blocks[i] ^= multiply_block_word(this_blocks[k + 8*i*_width], this_blocks[k + (8*i + 1)*_width], this_blocks[k + (8*i + 2)*_width], this_blocks[k + (8*i + 3)*_width], \
360+
#pragma omp atomic
361+
res_blocks_long[i] ^= multiply_block_word(this_blocks[k + 8*i*_width], this_blocks[k + (8*i + 1)*_width], this_blocks[k + (8*i + 2)*_width], this_blocks[k + (8*i + 3)*_width], \
357362
this_blocks[k + (8*i + 4)*_width], this_blocks[k + (8*i + 5)*_width], this_blocks[k + (8*i + 6)*_width], this_blocks[k + (8*i + 7)*_width], \
358363
other_blocks[k]);
359364
}
@@ -364,18 +369,14 @@ Vector Matrix::operator*(Vector const& other) const {
364369
#pragma omp parallel for collapse(2) schedule(static) if(8*_width > CPU_LIMIT)
365370
for (k = 0; k < _width; k++)
366371
for (i = start; i < _height; i++) {
367-
#pragma omp atomic if(8*_width > CPU_LIMIT)
368-
res.blocks[i] ^= multiply_block_byte(blocks[k + i*_width], other.blocks[k]);
372+
#pragma omp atomic
373+
res_blocks[i] ^= multiply_block_byte(this_blocks[k + i*_width], other_blocks[k]);
369374
}
370375
res.to(start, length);
371376

372377
} else {
373378
#endif
374379

375-
uint8_t *res_blocks = res.blocks;
376-
uint8_t *other_blocks = other.blocks;
377-
uint64_t *this_blocks = blocks;
378-
379380
_OPENMP_PRAGMA("omp parallel for collapse(2) schedule(static) if(_height*_width > CPU_LIMIT)")
380381
for (k = 0; k < _width; k++)
381382
for (i = 0; i < _height; i++) {

0 commit comments

Comments
 (0)