Skip to content

Commit fef9220

Browse files
Fixes per the review
Signed-off-by: Oleg Goncharov <[email protected]>
1 parent 47be9b2 commit fef9220

File tree

2 files changed

+36
-36
lines changed

2 files changed

+36
-36
lines changed

transformer_engine/common/cast/nvfp4/specialized/group_quantize_transpose_nvfp4_tuned_1D.cuh

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -67,20 +67,20 @@ constexpr int ELTS_PER_THREAD = 16;
6767
constexpr int TILE_DIM_Y = 64;
6868
constexpr int TILE_DIM_X = 64;
6969

70-
static_assert(ELTS_PER_THREAD == SCALE_DIM && "Hardcoded and fixed parameter\0");
70+
// Compile-time check that ELTS_PER_THREAD equals SCALE_DIM.
// The message is an ordinary string literal; it is already null-terminated, so no explicit "\0".
static_assert(ELTS_PER_THREAD == SCALE_DIM, "Hardcoded and fixed parameter");
7171

72-
static_assert((THREADS_NUM * ELTS_PER_THREAD <= TILE_DIM_Y * TILE_DIM_X) &&
73-
"Unbalanced threads workload\0");
72+
static_assert(THREADS_NUM * ELTS_PER_THREAD <= TILE_DIM_Y * TILE_DIM_X,
73+
"Unbalanced threads workload");
7474

75-
static_assert((CHUNK_DIM_Y % TILE_DIM_Y == 0) &&
76-
"Chunk size Y must be evenly divisible by the tile size Y\0");
77-
static_assert((CHUNK_DIM_X % TILE_DIM_X == 0) &&
78-
"Chunk size X must be evenly divisible by the tile size X\0");
75+
static_assert(CHUNK_DIM_Y % TILE_DIM_Y == 0,
76+
"Chunk size Y must be evenly divisible by the tile size Y");
77+
static_assert(CHUNK_DIM_X % TILE_DIM_X == 0,
78+
"Chunk size X must be evenly divisible by the tile size X");
7979

80-
static_assert((TILE_DIM_Y % SCALE_DIM == 0) &&
81-
"Tile size Y must be evenly divisible by the scale dim\0");
82-
static_assert((TILE_DIM_X % SCALE_DIM == 0) &&
83-
"Tile size X must be evenly divisible by the scale dim\0");
80+
static_assert(TILE_DIM_Y % SCALE_DIM == 0,
81+
"Tile size Y must be evenly divisible by the scale dim");
82+
static_assert(TILE_DIM_X % SCALE_DIM == 0,
83+
"Tile size X must be evenly divisible by the scale dim");
8484

8585
constexpr int TILES_Y = CHUNK_DIM_Y / TILE_DIM_Y;
8686
constexpr int TILES_X = CHUNK_DIM_X / TILE_DIM_X;
@@ -134,19 +134,19 @@ constexpr int THREADS_Y_TR = THREADS_NUM / THREADS_X_TR;
134134

135135
constexpr int ITERATIONS_NORMAL = BUFF_DIM_Y / THREADS_Y_ROWWISE;
136136
constexpr int ITERATIONS_TR = SCALES_PER_TILE_Y / THREADS_Y_TR;
137-
static_assert(ITERATIONS_TR >= 1 && "Number of transpose iterations should be >=1\0");
138-
static_assert((SCALES_PER_TILE_Y % THREADS_Y_TR == 0) &&
139-
"Partial transpose iterations are not supported\0");
137+
static_assert(ITERATIONS_TR >= 1, "Number of transpose iterations should be >=1");
138+
static_assert(SCALES_PER_TILE_Y % THREADS_Y_TR == 0,
139+
"Partial transpose iterations are not supported");
140140

141141
constexpr int BUFF_OUT_IT_OFFSET = BUFF_OUT_TR_DIM_X / ITERATIONS_TR / STAGES;
142142

143-
static_assert(BUFF_DIM_Y >= SCALE_DIM &&
143+
static_assert(BUFF_DIM_Y >= SCALE_DIM,
144144
"Number of buffer rows must be greater or equal to the size of the columwise "
145-
"scaling block\0");
145+
"scaling block");
146146
static_assert(CHUNK_DIM_Y >= BUFF_DIM_Y);
147-
static_assert(BUFF_DIM_Y >= THREADS_Y_ROWWISE &&
147+
static_assert(BUFF_DIM_Y >= THREADS_Y_ROWWISE,
148148
"Number of buffer rows must be greater or equal to the number of rowwise "
149-
"processing threads in Y dimension\0");
149+
"processing threads in Y dimension");
150150

151151
// Number of 4-bit elements that span 32 banks (4-byte each) of shared memory
152152
constexpr int TOTAL_BANKS_WIDTH = (32 * 4 * 8) / 4; // 256

transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -47,20 +47,20 @@ constexpr int ELTS_PER_THREAD = 16;
4747
constexpr int TILE_DIM_Y = 64;
4848
constexpr int TILE_DIM_X = 64;
4949

50-
static_assert(ELTS_PER_THREAD == SCALE_DIM && "Hardcoded and fixed parameter\0");
50+
// Compile-time check that ELTS_PER_THREAD equals SCALE_DIM.
// The message is passed as the second argument: written as `cond && "msg"` the literal is only
// an always-true operand of the condition and is never reported as the diagnostic text.
static_assert(ELTS_PER_THREAD == SCALE_DIM, "Hardcoded and fixed parameter");
5151

52-
static_assert((THREADS_NUM * ELTS_PER_THREAD <= TILE_DIM_Y * TILE_DIM_X) &&
53-
"Unbalanced threads workload\0");
52+
static_assert((THREADS_NUM * ELTS_PER_THREAD <= TILE_DIM_Y * TILE_DIM_X),
53+
"Unbalanced threads workload");
5454

55-
static_assert((TunableConfig::CHUNK_DIM_Y % TILE_DIM_Y == 0) &&
56-
"Chunk size Y must be evenly divisible by the tile size Y\0");
57-
static_assert((TunableConfig::CHUNK_DIM_X % TILE_DIM_X == 0) &&
58-
"Chunk size X must be evenly divisible by the tile size X\0");
55+
static_assert(TunableConfig::CHUNK_DIM_Y % TILE_DIM_Y == 0,
56+
"Chunk size Y must be evenly divisible by the tile size Y");
57+
static_assert(TunableConfig::CHUNK_DIM_X % TILE_DIM_X == 0,
58+
"Chunk size X must be evenly divisible by the tile size X");
5959

60-
static_assert((TILE_DIM_Y % SCALE_DIM == 0) &&
61-
"Tile size Y must be evenly divisible by the scale dim\0");
62-
static_assert((TILE_DIM_X % SCALE_DIM == 0) &&
63-
"Tile size X must be evenly divisible by the scale dim\0");
60+
static_assert(TILE_DIM_Y % SCALE_DIM == 0,
61+
"Tile size Y must be evenly divisible by the scale dim");
62+
static_assert(TILE_DIM_X % SCALE_DIM == 0,
63+
"Tile size X must be evenly divisible by the scale dim");
6464

6565
constexpr int TILES_Y = TunableConfig::CHUNK_DIM_Y / TILE_DIM_Y;
6666
constexpr int TILES_X = TunableConfig::CHUNK_DIM_X / TILE_DIM_X;
@@ -114,19 +114,19 @@ constexpr int THREADS_Y_TR = THREADS_NUM / THREADS_X_TR;
114114

115115
constexpr int ITERATIONS_NORMAL = BUFF_DIM_Y / THREADS_Y_ROWWISE;
116116
constexpr int ITERATIONS_TR = SCALES_PER_TILE_Y / THREADS_Y_TR;
117-
static_assert(ITERATIONS_TR >= 1 && "Number of transpose iterations should be >=1\0");
118-
static_assert((SCALES_PER_TILE_Y % THREADS_Y_TR == 0) &&
119-
"Partial transpose iterations are not supported\0");
117+
static_assert(ITERATIONS_TR >= 1, "Number of transpose iterations should be >=1");
118+
static_assert(SCALES_PER_TILE_Y % THREADS_Y_TR == 0,
119+
"Partial transpose iterations are not supported");
120120

121121
constexpr int BUFF_OUT_IT_OFFSET = BUFF_OUT_TR_DIM_X / ITERATIONS_TR / STAGES;
122122

123-
static_assert(BUFF_DIM_Y >= SCALE_DIM &&
123+
static_assert(BUFF_DIM_Y >= SCALE_DIM,
124124
"Number of buffer rows must be greater or equal to the size of the columwise "
125-
"scaling block\0");
125+
"scaling block");
126126
static_assert(TunableConfig::CHUNK_DIM_Y >= BUFF_DIM_Y);
127-
static_assert(BUFF_DIM_Y >= THREADS_Y_ROWWISE &&
127+
static_assert(BUFF_DIM_Y >= THREADS_Y_ROWWISE,
128128
"Number of buffer rows must be greater or equal to the number of rowwise "
129-
"processing threads in Y dimension\0");
129+
"processing threads in Y dimension");
130130

131131
// Number of 4-bit elements that span 32 banks (4-byte each) of shared memory
132132
constexpr int TOTAL_BANKS_WIDTH = (32 * 4 * 8) / 4; // 256

0 commit comments

Comments
 (0)