Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
fe437c1
code drop
pggPL Feb 3, 2026
a54a743
code drop
pggPL Feb 3, 2026
b6e0767
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 3, 2026
76d362c
Merge branch 'main' into inpsect_tensor_dump_support
pggPL Mar 5, 2026
dc60fe8
docs
pggPL Mar 5, 2026
e94467f
nvfp4 internals support
pggPL Mar 5, 2026
e8c8e56
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 5, 2026
b002b89
lint fixes
pggPL Mar 5, 2026
2816f37
Update transformer_engine/debug/features/dump_tensors.py
pggPL Mar 5, 2026
83506af
fix
pggPL Mar 5, 2026
a525f82
Update transformer_engine/debug/features/dump_tensors.py
pggPL Mar 5, 2026
df66054
Update transformer_engine/debug/features/dump_tensors.py
pggPL Mar 5, 2026
ab3e90e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 5, 2026
089a4d2
Update tests/pytorch/debug/test_log.py
pggPL Mar 5, 2026
a18664f
Update transformer_engine/debug/features/dump_tensors.py
pggPL Mar 5, 2026
41d17fa
fix
pggPL Mar 5, 2026
1736cbe
fix
pggPL Mar 5, 2026
b78d36f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 5, 2026
d98c4d0
Remove dump_quantized_internals support from DumpTensors
pggPL Mar 10, 2026
23c70e5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2026
8357ebe
Address Greptile review comments
pggPL Mar 10, 2026
41c671e
Remove portability suggestion from quantized key docstring
pggPL Mar 10, 2026
0cd16e5
Compute rank lazily in _expected_root_dir
pggPL Mar 10, 2026
6f21734
detach tensors before saving; verify dump filename in test
pggPL Mar 10, 2026
7d36811
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2026
c7b7f01
Add empty dump_dict log; assert QuantizedTensor type in test
pggPL Mar 10, 2026
2fcd7eb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2026
677ad51
Update transformer_engine/debug/features/dump_tensors.py
pggPL Mar 10, 2026
dbe1688
Merge branch 'main' into inpsect_tensor_dump_support
pggPL Mar 19, 2026
c54368f
Address review: iter subdirs, remove dead rank field, add allclose te…
pggPL Mar 19, 2026
27bc899
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 19, 2026
d4459bd
Merge branch 'main' into inpsect_tensor_dump_support
pggPL Mar 20, 2026
f3d8a56
fix: use detach().clone() to avoid shared storage in DumpTensors
pggPL Mar 20, 2026
4e20b8c
test: use torch.equal instead of torch.allclose for serialisation rou…
pggPL Mar 20, 2026
7b1559f
fix: add tp_size to DumpTensors.inspect_tensor and fix KeyError in ca…
pggPL Mar 20, 2026
275767d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/debug/3_api_features.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@ Debug features
.. autoapiclass:: transformer_engine.debug.features.per_tensor_scaling.PerTensorScaling
.. autoapiclass:: transformer_engine.debug.features.fake_quant.FakeQuant
.. autoapiclass:: transformer_engine.debug.features.disable_fp8_gemm.DisableFP8GEMM
.. autoapiclass:: transformer_engine.debug.features.disable_fp8_layer.DisableFP8Layer
.. autoapiclass:: transformer_engine.debug.features.disable_fp8_layer.DisableFP8Layer
.. autoapiclass:: transformer_engine.debug.features.dump_tensors.DumpTensors
150 changes: 150 additions & 0 deletions tests/pytorch/debug/test_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,3 +644,153 @@ def test_compute_max_blockwise_dynamic_range_direct():
)

print("All direct tests for compute_max_blockwise_dynamic_range passed!")


# DumpTensors tests
DUMP_TENSORS_CONFIG = """
dump:
layers:
layer_name_regex_pattern: .*
enabled: True
transformer_engine:
DumpTensors:
enabled: True
tensors: [activation]
high_precision_tensor: True
quantized_tensor: True
dump_quantized_internals: True
freq: 1
"""


NVFP4_DUMP_TENSORS_CONFIG = """
dump:
layers:
layer_name_regex_pattern: .*
enabled: True
transformer_engine:
DumpTensors:
enabled: True
tensors: [activation]
high_precision_tensor: False
quantized_tensor: True
dump_quantized_internals: True
freq: 1
"""


def test_dump_tensors_sanity(feature_dirs):
    """Sanity test for DumpTensors feature - verify files are created with correct structure."""
    if not fp8_available:
        pytest.skip(reason_for_no_fp8)

    with debug_session(DUMP_TENSORS_CONFIG, feature_dirs) as log_dir:
        # Build a real quantizer from a DelayedScaling recipe state.
        recipe_state = RecipeState.create(
            recipe.DelayedScaling(),
            mode="forward",
            num_quantizers=3,
        )
        quantizer = recipe_state.make_quantizers()[0]

        src = torch.randn(128, 128, dtype=torch.bfloat16).cuda()
        quantized = quantizer(src)

        # Feed one tensor through the debug API, then step to flush the dump.
        debug_api.transformer_engine.inspect_tensor(
            layer_name="test_layer",
            tensor_name="activation",
            iteration=0,
            tp_group=None,
            tensor=src,
            quantizer=quantizer,
            rowwise_quantized_tensor=quantized,
            columnwise_quantized_tensor=quantized,
        )
        debug_api.step()

        # Exactly one dump file should appear under the per-rank directory.
        dump_dir = os.path.join(log_dir, "tensor_dumps", "rank_0")
        assert os.path.exists(dump_dir), f"Dump directory not created: {dump_dir}"

        dump_files = os.listdir(dump_dir)
        assert len(dump_files) == 1, f"Expected 1 dump file, got {len(dump_files)}"

        # Load the dump and verify it is a dict with all expected entries.
        payload = torch.load(os.path.join(dump_dir, dump_files[0]), weights_only=False)
        assert isinstance(payload, dict), "Dump should be a dictionary"

        for key, why in (
            ("high_precision", "Missing high_precision tensor"),
            ("quantized", "Missing quantized tensor"),
            # Internals are expected because dump_quantized_internals=True.
            ("data", "Missing data (raw FP8 data)"),
            ("scale_inv", "Missing scale_inv"),
        ):
            assert key in payload, why

        # The dumped high-precision tensor must keep the source shape.
        assert payload["high_precision"].shape == src.shape, "high_precision shape mismatch"

        print("DumpTensors sanity test passed!")


def test_dump_tensors_nvfp4_unpacked_codes(feature_dirs):
    """Verify DumpTensors includes unpacked FP4 values in NVFP4 internals."""
    if not nvfp4_available:
        pytest.skip(reason_for_no_nvfp4)

    with debug_session(NVFP4_DUMP_TENSORS_CONFIG, feature_dirs) as log_dir:
        # Build an NVFP4 block-scaling quantizer from a fresh recipe state.
        recipe_state = RecipeState.create(
            recipe.NVFP4BlockScaling(),
            mode="forward",
            num_quantizers=3,
        )

        tensor = torch.randn(128, 128, dtype=torch.bfloat16).cuda()
        quantizer = recipe_state.make_quantizers()[0]
        quantized_tensor = quantizer(tensor)

        # Route one tensor through the debug API and step to flush the dump.
        debug_api.transformer_engine.inspect_tensor(
            layer_name="test_layer",
            tensor_name="activation",
            iteration=0,
            tp_group=None,
            tensor=tensor,
            quantizer=quantizer,
            rowwise_quantized_tensor=quantized_tensor,
            columnwise_quantized_tensor=quantized_tensor,
        )
        debug_api.step()

        # Exactly one dump file is expected for the single inspected tensor.
        dump_dir = os.path.join(log_dir, "tensor_dumps", "rank_0")
        dump_files = os.listdir(dump_dir)
        assert len(dump_files) == 1, f"Expected 1 dump file, got {len(dump_files)}"

        data = torch.load(os.path.join(dump_dir, dump_files[0]), weights_only=False)
        assert "rowwise_data" in data, "Missing packed NVFP4 rowwise_data"
        assert "rowwise_data_unpacked_values" in data, "Missing unpacked NVFP4 rowwise values"

        # Packed data stores two 4-bit codes per byte, so unpacking must
        # double the last dimension; decoded values are float32 E2M1 codes.
        packed = data["rowwise_data"]
        unpacked = data["rowwise_data_unpacked_values"]
        assert unpacked.dtype == torch.float32, "Unpacked values must be float32"
        assert (
            unpacked.shape[-1] == packed.shape[-1] * 2
        ), "Unpacked values should double the last packed dimension"
        assert (
            unpacked.min().item() >= -6.0 and unpacked.max().item() <= 6.0
        ), "Decoded FP4 values should be in representable E2M1 range [-6, 6]"

        # Reconstruct dequantized values from unpacked FP4 values and block scales.
        # For NVFP4 rowwise path, one E4M3 scale corresponds to a block of 16 values.
        # NOTE(review): this assumes scales are laid out row-major with one
        # scale per 16 consecutive columns (possibly padded) — confirm
        # against the NVFP4 tensor's internal layout.
        assert "rowwise_block_scale_inv" in data, "Missing rowwise NVFP4 block scales"
        rowwise_scale_inv = data["rowwise_block_scale_inv"].to(torch.float32)
        values = unpacked.to(torch.float32)
        n_rows, n_cols = values.shape
        # Ceil-divide columns by the 16-wide scale block to count scale tiles.
        scale_tiles = (n_cols + 15) // 16
        # Broadcast each scale across its 16-column block, trimming padding.
        expanded_scales = rowwise_scale_inv[:n_rows, :scale_tiles].repeat_interleave(16, dim=1)[
            :, :n_cols
        ]
        reconstructed = values * expanded_scales

        # The manual reconstruction must match the library's own dequantize.
        expected = quantized_tensor.dequantize(dtype=torch.float32)
        assert torch.allclose(
            reconstructed, expected, atol=1e-5, rtol=1e-3
        ), "Unpacked FP4 values multiplied by block scales should match NVFP4 dequantization"
Loading
Loading