[AIPROFCOMP-154] Refactor marker naming to include full PyTorch operator hierarchy #3011
Conversation
There was a problem hiding this comment.
Pull request overview
This PR refactors the ROCTX marker injection system to include full PyTorch operator hierarchy in marker names. The changes enable tracking of the complete call stack by maintaining a global marker stack that builds hierarchical marker names using "/" separators.
Changes:
- Introduces a global `marker_stack` list to track hierarchical call paths
- Adds dispatcher-level interception for torch._C and torchvision._C operations
- Implements auto-wrapping functions to instrument all methods in PyTorch submodules and external libraries (timm, transformers, lmdb, spacy)
- Updates all existing wrapper functions to build and push hierarchical marker names
- Adds `instrument_all_torch_ops()` function to wrap torch.ops operations
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
c76043c to
4394c75
Compare
There was a problem hiding this comment.
Pull request overview
Copilot reviewed 6 out of 6 changed files in this pull request and generated 25 comments.
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
| def tv_dispatch_call_with_roctx(*args, **kwargs): | ||
| op_name = str(args[0]) if args else "vision_op" | ||
| full_marker_name = "/".join(marker_stack + [f"torchvision::{op_name}"]) | ||
| marker_stack.append(f"torchvision::{op_name}") | ||
| rangePush(full_marker_name) | ||
| try: | ||
| return original_tv_dispatch_call(*args, **kwargs) | ||
| finally: | ||
| rangePop() | ||
| marker_stack.pop() |
There was a problem hiding this comment.
Similar to dispatch_call_with_roctx, this torchvision dispatcher wrapper doesn't use context_stack, which makes its marker name format inconsistent with the other wrappers. Its markers will lack the ":#..." context suffix expected by the parsing logic in utils.py.
| # # Avoid double-wrapping | ||
| # if hasattr(meth, "_is_roctx_wrapped"): | ||
| # continue | ||
| # def make_method_wrapper(orig_meth, cname, mname): | ||
| # def wrapper(self, *args, **kwargs): | ||
| # full_marker_name = "/".join(marker_stack + [f"{prefix}.{cname}.{mname}"]) | ||
| # marker_stack.append(f"{prefix}.{cname}.{mname}") | ||
| # rangePush(full_marker_name) | ||
| # try: | ||
| # return orig_meth(self, *args, **kwargs) | ||
| # finally: | ||
| # rangePop() | ||
| # marker_stack.pop() | ||
| # wrapper._is_roctx_wrapped = True | ||
| # return wrapper | ||
| # try: | ||
| # setattr(class_obj, meth_name, make_method_wrapper(meth, name, meth_name)) | ||
| # except Exception: | ||
| # pass # Some built-in methods can't be set | ||
|
|
||
| # def auto_wrap_all_submodules(parent_module, prefix): | ||
| # # Wrap the parent module itself | ||
| # auto_wrap_class_methods(parent_module, prefix) | ||
| # # Iterate over all submodules | ||
| # if hasattr(parent_module, "__path__"): | ||
| # for module_info in pkgutil.walk_packages(parent_module.__path__, prefix + "."): | ||
| # try: | ||
| # submod = importlib.import_module(module_info.name) | ||
| # auto_wrap_class_methods(submod, module_info.name) | ||
| # except Exception: | ||
| # pass # Some submodules may not import cleanly | ||
|
|
There was a problem hiding this comment.
This comment appears to contain commented-out code.
| # # Avoid double-wrapping | |
| # if hasattr(meth, "_is_roctx_wrapped"): | |
| # continue | |
| # def make_method_wrapper(orig_meth, cname, mname): | |
| # def wrapper(self, *args, **kwargs): | |
| # full_marker_name = "/".join(marker_stack + [f"{prefix}.{cname}.{mname}"]) | |
| # marker_stack.append(f"{prefix}.{cname}.{mname}") | |
| # rangePush(full_marker_name) | |
| # try: | |
| # return orig_meth(self, *args, **kwargs) | |
| # finally: | |
| # rangePop() | |
| # marker_stack.pop() | |
| # wrapper._is_roctx_wrapped = True | |
| # return wrapper | |
| # try: | |
| # setattr(class_obj, meth_name, make_method_wrapper(meth, name, meth_name)) | |
| # except Exception: | |
| # pass # Some built-in methods can't be set | |
| # def auto_wrap_all_submodules(parent_module, prefix): | |
| # # Wrap the parent module itself | |
| # auto_wrap_class_methods(parent_module, prefix) | |
| # # Iterate over all submodules | |
| # if hasattr(parent_module, "__path__"): | |
| # for module_info in pkgutil.walk_packages(parent_module.__path__, prefix + "."): | |
| # try: | |
| # submod = importlib.import_module(module_info.name) | |
| # auto_wrap_class_methods(submod, module_info.name) | |
| # except Exception: | |
| # pass # Some submodules may not import cleanly | |
| # (Commented-out legacy wrapper and auto_wrap_all_submodules code removed; | |
| # refer to version control history if needed.) |
| # marker_stack.pop() | ||
| # wrapper._is_roctx_wrapped = True | ||
| # return wrapper | ||
| # try: | ||
| # setattr(class_obj, meth_name, make_method_wrapper(meth, name, meth_name)) | ||
| # except Exception: | ||
| # pass # Some built-in methods can't be set | ||
|
|
||
| # def auto_wrap_all_submodules(parent_module, prefix): | ||
| # # Wrap the parent module itself | ||
| # auto_wrap_class_methods(parent_module, prefix) | ||
| # # Iterate over all submodules | ||
| # if hasattr(parent_module, "__path__"): | ||
| # for module_info in pkgutil.walk_packages(parent_module.__path__, prefix + "."): | ||
| # try: | ||
| # submod = importlib.import_module(module_info.name) | ||
| # auto_wrap_class_methods(submod, module_info.name) | ||
| # except Exception: | ||
| # pass # Some submodules may not import cleanly | ||
|
|
||
| # # Save original sys.argv before auto-wrapping to prevent torch submodules from parsing workload arguments | ||
| # _saved_argv = sys.argv | ||
| # sys.argv = [sys.argv[0]] if sys.argv else ["inject_roctx.py"] | ||
|
|
||
| # # Auto-wrap all submodules of relevant libraries (taken from PyTorch examples) | ||
| # auto_wrap_all_submodules(torch, "torch") | ||
| # try: | ||
| # import torchvision | ||
| # auto_wrap_all_submodules(torchvision, "torchvision") | ||
| # except Exception: | ||
| # pass | ||
| # try: |
There was a problem hiding this comment.
This comment appears to contain commented-out code.
| # marker_stack.pop() | |
| # wrapper._is_roctx_wrapped = True | |
| # return wrapper | |
| # try: | |
| # setattr(class_obj, meth_name, make_method_wrapper(meth, name, meth_name)) | |
| # except Exception: | |
| # pass # Some built-in methods can't be set | |
| # def auto_wrap_all_submodules(parent_module, prefix): | |
| # # Wrap the parent module itself | |
| # auto_wrap_class_methods(parent_module, prefix) | |
| # # Iterate over all submodules | |
| # if hasattr(parent_module, "__path__"): | |
| # for module_info in pkgutil.walk_packages(parent_module.__path__, prefix + "."): | |
| # try: | |
| # submod = importlib.import_module(module_info.name) | |
| # auto_wrap_class_methods(submod, module_info.name) | |
| # except Exception: | |
| # pass # Some submodules may not import cleanly | |
| # # Save original sys.argv before auto-wrapping to prevent torch submodules from parsing workload arguments | |
| # _saved_argv = sys.argv | |
| # sys.argv = [sys.argv[0]] if sys.argv else ["inject_roctx.py"] | |
| # # Auto-wrap all submodules of relevant libraries (taken from PyTorch examples) | |
| # auto_wrap_all_submodules(torch, "torch") | |
| # try: | |
| # import torchvision | |
| # auto_wrap_all_submodules(torchvision, "torchvision") | |
| # except Exception: | |
| # pass | |
| # try: | |
| # NOTE: Auto-wrapping utilities for ROCTX injection were previously implemented here. | |
| # They have been intentionally removed to avoid maintaining large blocks of commented-out code. | |
| # If auto-wrapping behavior is required in the future, it should be reintroduced as active, | |
| # well-documented code rather than preserved in comments. | |
| # try: |
| # # Restore original sys.argv after all auto-wrapping is complete | ||
| # sys.argv = _saved_argv | ||
|
|
||
|
|
There was a problem hiding this comment.
This comment appears to contain commented-out code.
| # # Restore original sys.argv after all auto-wrapping is complete | |
| # sys.argv = _saved_argv | |
| # NOTE: Restoration of the original sys.argv after auto-wrapping is intentionally | |
| # disabled. If needed in the future, reintroduce this behavior explicitly where | |
| # command-line arguments are managed. |
| except Exception: | ||
| pass |
There was a problem hiding this comment.
'except' clause does nothing but pass and there is no explanatory comment.
| except Exception: | |
| pass | |
| except Exception as e: | |
| console_warning( | |
| "Failed to patch torch.distributed.all_reduce; leaving original implementation unmodified." | |
| ) | |
| console_warning(str(e)) |
| except Exception: | ||
| pass |
There was a problem hiding this comment.
'except' clause does nothing but pass and there is no explanatory comment.
| except Exception: | |
| pass | |
| except Exception as exc: | |
| console_warning("torch trace", f"Failed to wrap torch.cuda.set_device with ROCTX: {exc}") |
…sed for torch versions >= 2.9
924d43b to
115acfb
Compare
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Motivation
Change inject_roctx.py to save the complete operator hierarchy in the marker name at the time of region creation, and enhance the interception coverage for torch operations.
Technical Details
JIRA ID
AIPROFCOMP-154
Test Plan
Test Result
Submission Checklist