Skip to content

Commit d892a2d

Browse files
committed
Merge branch 'main' into peng-add-glm5
2 parents eb13d12 + 0aa9a37 commit d892a2d

File tree

98 files changed

+4039
-1331
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

98 files changed

+4039
-1331
lines changed

.github/workflows/build-test-publish-wheel.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ jobs:
6464
container-options: "--gpus all --runtime=nvidia"
6565
secrets:
6666
TWINE_USERNAME: __token__
67-
TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }}
67+
TWINE_PASSWORD: ${{ secrets.SVC_PYPI_TEST_TOKEN }}
6868
SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }}
6969
SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }}
7070
GH_TOKEN: ${{ secrets.PAT }}

.github/workflows/cicd-main.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -361,10 +361,11 @@ jobs:
361361
- script: L2_Launch_models_nemotron_vl
362362
- script: L2_Launch_models_olmoe
363363
- script: L2_Launch_models_qwen
364-
# - script: L2_Launch_models_qwen_quantization
364+
- script: L2_Launch_models_qwen_quantization
365365
- script: L2_Launch_models_qwen_vl
366366
- script: L2_Launch_recipes_gemma_vl
367367
- script: L2_Launch_recipes_gpt_oss
368+
- script: L2_Launch_models_qwen_vl_quantization
368369
- script: L2_Launch_recipes_llama_1b
369370
- script: L2_Launch_recipes_llama_3b
370371
- script: L2_Launch_recipes_llama_distill
@@ -373,7 +374,7 @@ jobs:
373374
- script: L2_Launch_data
374375
- script: L2_Launch_post_training_quantization
375376
- script: L2_Launch_quantization_aware_training
376-
- script: L2_Launch_quantization_export
377+
# - script: L2_Launch_quantization_export
377378
- script: L2_Launch_recipes_llama_cuda_graphs
378379
- script: L2_Launch_utils
379380
needs: [pre-flight, cicd-unit-tests]

.github/workflows/release-docs.yml

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,57 @@ on:
3030
required: false
3131
type: string
3232
default: ""
33+
update-version-picker:
34+
description: Update version picker.
35+
required: false
36+
type: boolean
37+
default: true
38+
notify-emails:
39+
description: Email addresses to send the notification to. Format as "me@me.com,you@you.com".
40+
required: false
41+
type: string
42+
github-ref:
43+
description: Github ref to checkout
44+
required: false
45+
type: string
46+
default: ""
47+
workflow_call:
48+
inputs:
49+
dry-run:
50+
description: Whether to run the workflow in dry-run mode
51+
required: false
52+
type: boolean
53+
default: true
54+
publish-as-latest:
55+
description: Publish as Latest stable version.
56+
required: false
57+
type: boolean
58+
default: true
59+
docs-version-override:
60+
description: Docs version if commit is not tagged
61+
required: false
62+
type: string
63+
default: ""
64+
update-version-picker:
65+
description: Update version picker.
66+
required: false
67+
type: boolean
68+
default: true
3369
notify-emails:
3470
description: Email addresses to send the notification to. Format as "me@me.com,you@you.com".
3571
required: false
3672
type: string
73+
github-ref:
74+
description: Github ref to checkout
75+
required: false
76+
type: string
77+
default: ""
3778

3879
jobs:
3980
build-docs:
4081
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.67.0
82+
with:
83+
ref: ${{ inputs.github-ref }}
4184

4285
publish-docs:
4386
runs-on: ubuntu-latest
@@ -46,7 +89,7 @@ jobs:
4689
- uses: actions/checkout@v6
4790
with:
4891
repository: NVIDIA-NeMo/FW-CI-templates
49-
ref: v0.72.0
92+
ref: v0.74.0
5093
path: FW-CI-templates
5194

5295
- uses: ./FW-CI-templates/.github/actions/publish-docs
@@ -62,6 +105,7 @@ jobs:
62105
emails-csv: ${{ inputs.notify-emails && format('{0},{1}', vars.docs_release_emails, inputs.notify-emails) || vars.docs_release_emails }}
63106
overwrite-latest-on-tag: ${{ inputs.publish-as-latest }}
64107
docs-version-override: ${{ inputs.docs-version-override }}
108+
update-version-picker: ${{ inputs.update-version-picker }}
65109
run-on-version-tag-only: ${{ github.ref_name != 'main' }}
66110
request-name: megatron-bridge-publish-docs-${{ github.run_id }}
67111
aws-region: ${{ vars.DOCS_AWS_REGION }}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: Release Nightly Docs
16+
17+
on:
18+
schedule:
19+
- cron: "0 10 * * *"
20+
21+
jobs:
22+
call-release-docs:
23+
uses: ./.github/workflows/release-docs.yml
24+
with:
25+
dry-run: false
26+
publish-as-latest: false
27+
docs-version-override: "nightly"
28+
update-version-picker: false
29+
secrets: inherit

.github/workflows/release.yaml

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,14 +44,30 @@ on:
4444
description: Branch for version bump
4545
required: true
4646
type: string
47+
gh-release-from-tag:
48+
description: Tag of previous release for changelog builder
49+
required: false
50+
type: string
51+
default: ""
4752

4853
permissions:
4954
contents: write # To read repository content
5055
pull-requests: write # To create PRs
5156

5257
jobs:
58+
pre-flight:
59+
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
60+
with:
61+
default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }}
62+
non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }}
63+
default_test_data_path: ${{ vars.DEFAULT_TEST_DATA_PATH }}
64+
non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_TEST_DATA_PATH }}
65+
secrets:
66+
NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
67+
5368
release:
54-
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_release_library.yml@v0.73.0
69+
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_release_library.yml@v0.77.0
70+
needs: [pre-flight]
5571
with:
5672
release-ref: ${{ inputs.release-ref || github.sha }}
5773
python-package: megatron.bridge
@@ -63,18 +79,29 @@ jobs:
6379
has-src-dir: true
6480
skip-test-wheel: true
6581
custom-container: nvcr.io/nvidia/pytorch:25.05-py3
66-
runner: linux-amd64-cpu16
82+
runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2-container
6783
no-build-isolation: true
6884
app-id: ${{ vars.BOT_ID }}
6985
gh-release-use-changelog-builder: ${{ inputs.generate-changelog }}
7086
publish-docs: ${{ inputs.publish-docs }}
7187
docs-target-path: nemo/megatron-bridge
88+
submodules: recursive
89+
container-options: "--gpus all --runtime=nvidia"
90+
gh-release-from-tag: ${{ inputs.gh-release-from-tag }}
7291
secrets:
7392
TWINE_USERNAME: __token__
74-
TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }}
93+
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
7594
SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }}
76-
SLACK_WEBHOOK: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SLACK_MAIN_CHANNEL_WEBHOOK || secrets.SLACK_CI_CHANNEL_WEBHOOK }}
95+
SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
7796
PAT: ${{ secrets.PAT }}
7897
SSH_KEY: ${{ secrets.SSH_KEY }}
7998
SSH_PWD: ${{ secrets.SSH_PWD }}
8099
BOT_KEY: ${{ secrets.BOT_KEY }}
100+
AWS_ASSUME_ROLE_ARN: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
101+
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
102+
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
103+
AKAMAI_HOST: ${{ secrets.AKAMAI_HOST }}
104+
AKAMAI_CLIENT_TOKEN: ${{ secrets.AKAMAI_CLIENT_TOKEN }}
105+
AKAMAI_CLIENT_SECRET: ${{ secrets.AKAMAI_CLIENT_SECRET }}
106+
AKAMAI_ACCESS_TOKEN: ${{ secrets.AKAMAI_ACCESS_TOKEN }}
107+
S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }}

docs/conf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@
2525
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
2626

2727
project = "Megatron Bridge"
28-
copyright = "2025, NVIDIA Corporation"
28+
copyright = "2026, NVIDIA Corporation"
2929
author = "NVIDIA Corporation"
30-
release = "latest"
30+
release = "nightly"
3131

3232
# -- General configuration ---------------------------------------------------
3333
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

docs/performance-summary.md

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,75 @@ Below are performance benchmarks for various large language models organized by
3737
The performance data includes:
3838

3939
- **Pre-training Performance**: Throughput metrics for various model sizes and architectures
40-
- **System Configurations**: Results across different GPU systems (DGX-GB200, DGX-B200, DGX-H100)
40+
- **System Configurations**: Results across different GPU systems (DGX-GB300, DGX-GB200, DGX-B300, DGX-B200, DGX-H100)
4141
- **Precision Options**: Performance comparisons between different precision modes (BF16, FP8, MXFP8)
4242

4343
---
4444

45+
## 26.02 NeMo Container
46+
47+
### Pre-Training Performance
48+
49+
#### System: DGX-GB300
50+
51+
| Model | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
52+
|-------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
53+
| LLAMA3_70B | 64 | FP8 | 256 | 2 | 8192 | 64| 1 | 1 | 1 | n/a | n/a | 5003 | 2248 |
54+
| LLAMA3_70B | 64 | MXFP8 | 256 | 1 | 8192 | 0 | 1 | 4 | 1 | 5 | n/a | 4596 | 2064 |
55+
| LLAMA3_70B | 64 | NVFP4 | 256 | 1 | 8192 | 0 | 1 | 4 | 1 | 5 | n/a | 6798 | 3056 |
56+
| LLAMA3.1_405B | 256 | FP8 | 1536 | 1 | 8192 | 0 | 4 | 8 | 1 | 4 | n/a | 988 | 2495 |
57+
| LLAMA3.1_405B | 256 | MXFP8 | 1536 | 1 | 8192 | 0 | 2 | 8 | 2 | 4 | n/a | 931 | 2349 |
58+
| LLAMA3.1_405B | 256 | NVFP4 | 1536 | 1 | 8192 | 0 | 4 | 8 | 1 | 4 | n/a | 1333 | 3365 |
59+
| DeepSeekV3 | 256 | MXFP8 | 4096 | 2 | 4096 | 0 | 1 | 2 | 1 | 8 | 32 | 4612 | 1199 |
60+
| GPT OSS 120B | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 64 | 19412 | 527 |
61+
| Qwen3_30B_a3B | 8 | MXFP8 | 512 | 8 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 30376 | 699 |
62+
| Qwen3_235B_a22B | 256 | MXFP8 | 8192 | 2 | 4096 | 0 | 1 | 4 | 1 | n/a | 32 | 6583 | 974 |
63+
64+
#### System: DGX-GB200
65+
66+
| Model | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
67+
|-------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
68+
| LLAMA3_70B | 64 | FP8 | 256 | 2 | 8192 | 64| 1 | 1 | 1 | n/a | n/a | 4040 | 1815 |
69+
| LLAMA3_70B | 64 | MXFP8 | 256 | 1 | 8192 | 0 | 2 | 4 | 1 | 5 | n/a | 3613 | 1623 |
70+
| LLAMA3_70B | 64 | NVFP4 | 256 | 1 | 8192 | 0 | 2 | 4 | 1 | 5 | n/a | 4458 | 2004 |
71+
| LLAMA3.1_405B | 256 | FP8 | 1536 | 1 | 8192 | 0 | 4 | 16 | 1 | 4 | n/a | 793 | 2004 |
72+
| LLAMA3.1_405B | 256 | MXFP8 | 1536 | 1 | 8192 | 0 | 4 | 16 | 1 | 8 | n/a | 786 | 1983 |
73+
| LLAMA3.1_405B | 256 | NVFP4 | 1536 | 1 | 8192 | 0 | 4 | 16 | 1 | 8 | n/a | 1076 | 2716 |
74+
| DeepSeekV3 | 256 | MXFP8 | 4096 | 1 | 4096 | 0 | 1 | 4 | 1 | 4 | 64 | 3955 | 1028 |
75+
| GPT OSS 120B | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 64 | 15784 | 428 |
76+
| Qwen3_30B_a3B | 8 | MXFP8 | 512 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 26084 | 600 |
77+
| Qwen3_235B_a22B | 256 | MXFP8 | 8192 | 1 | 4096 | 0 | 1 | 8 | 1 | n/a | 32 | 5448 | 806 |
78+
79+
#### System: DGX-B300
80+
81+
| Model | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
82+
|-------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
83+
| DeepSeekV3 | 256 | MXFP8 | 4096 | 1 | 4096 | 0 | 1 | 16 | 1 | n/a | 8 | 2983 | 776 |
84+
| GPT OSS 120B | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 64 | 8359 | 228 |
85+
| Qwen3_30B_a3B | 8 | MXFP8 | 512 | 8 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 29521 | 679 |
86+
| Qwen3_235B_a22B | 256 | MXFP8 | 8192 | 1 | 4096 | 0 | 1 | 8 | 1 | 4 | 8 | 2691 | 399 |
87+
88+
#### System: DGX-B200
89+
90+
| Model | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
91+
|-------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
92+
| DeepSeekV3 | 256 | MXFP8 | 4096 | 1 | 4096 | 0 | 1 | 16 | 1 | n/a | 8 | 2689 | 699 |
93+
| GPT OSS 120B | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 64 | 8047 | 219 |
94+
| Qwen3_30B_a3B | 8 | MXFP8 | 512 | 1 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 9691 | 223 |
95+
| Qwen3_235B_a22B | 256 | MXFP8 | 8192 | 1 | 4096 | 0 | 1 | 8 | 1 | n/a | 8 | 3805 | 563 |
96+
97+
#### System: DGX-H100
98+
99+
| Model | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
100+
|-------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
101+
| LLAMA3_70B | 64 | FP8 | 256 | 1 | 8192 | 0 | 4 | 8 | 1 | 5 | n/a | 1621 | 728 |
102+
| LLAMA3.1_405B | 1024 | FP8 | 1536 | 1 | 8192 | 0 | 8 | 8 | 2 | 8 | n/a | 311 | 784 |
103+
| GPT OSS 120B | 64 | BF16 | 1280 | 1 | 4096 | 0 | 1 | 4 | 1 | n/a | 8 | 5993 | 163 |
104+
| Qwen3_30B_a3B | 16 | FP8 | 1024 | 1 | 4096 | 0 | 1 | 2 | 1 | 12 | 8 | 5113 | 118 |
105+
| Qwen3_235B_a22B | 256 | FP8 | 8192 | 1 | 4096 | 0 | 2 | 8 | 1 | 4 | 32 | 1633 | 242 |
106+
107+
- In MoE training benchmarks, we force-balance the token distribution among experts and all benchmarks are token-dropless.
108+
45109
## 25.11 NeMo Container
46110

47111
### Pre-Training Performance

docs/project.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
{
22
"name": "megatron-bridge",
3-
"version": "latest"
4-
}
3+
"version": "nightly"
4+
}

docs/releases/known-issues.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,17 @@
22

33
This page lists known issues and limitations in the current release.
44

5+
## 26.02
6+
7+
- AWS EKS only: Due to AWS-OFI-NCCL v1.17.0, long-running jobs suffer a memory leak that causes performance regression over time. This can be mitigated by upgrading to [v1.17.3](https://github.com/aws/aws-ofi-nccl/releases/tag/v1.17.3).
8+
- Context parallelism with sequence packing is not yet supported for Qwen 3 VL in the r0.3.0 release. For this functionality with Qwen 3 VL, please utilize the main branch.
9+
- DeepEP is not supported in the current NeMo framework 26.02 container (nvcr.io/nvidia/nemo:26.02), which results in reduced DSv3 performance compared to the NeMo framework 25.09 container (nvcr.io/nvidia/nemo:25.09) on H100 machines. For optimal H100 performance, we recommend using the NeMo framework 25.09 container.
10+
511
## 25.11
612

713
- Deepseek V3 on H100 has an issue when using DeepEP and fails with `RuntimeError: DeepEP error: timeout (dispatch CPU)`.
814
- MODEL_TFLOP/s/GPU is printed as 0 to stdout for all Hybrid models, such as Nemotron-H 56B.
915

10-
1116
## 25.09
1217

1318
- **Pretraining DeepSeek in subchannel FP8 precision is not working.** Pretraining DeepSeek with current scaling FP8 is a workaround, but MTP loss does not converge.
14-

docs/releases/software-versions.md

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,29 @@
11
# Software Component Versions
22

3+
## NeMo Framework 26.02
4+
5+
| Software Component | Version |
6+
|-------------------|---------|
7+
| PyTorch | 2.10.0a0 |
8+
| Megatron Core | main:0.16.0 |
9+
| Transformer Engine | 2.12 |
10+
| Megatron-Bridge | 0.3.0 |
11+
| Megatron-FSDP | 0.3.0 |
12+
| Export-Deploy | 0.4.0 |
13+
| Evaluator | 0.1.74 |
14+
| NeMo | 2.7.0 |
15+
| NeMo Run | 0.8.0 |
16+
| Nvidia-ModelOpt | 0.41.0 |
17+
| NVRX | 0.5.0 |
18+
| CUDA | 13.0.2 |
19+
| cuDNN | 9.18.0.50 |
20+
| TRT-LLM | 1.1.0 |
21+
| vLLM | 0.14.1 |
22+
23+
```{note}
24+
NVIDIA NeMo™ Framework Training container is built on top of NVIDIA Optimized Frameworks PyTorch 25.06 container: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
25+
```
26+
327
## NeMo Framework 25.11
428

529
| Software Component | Version |
@@ -23,7 +47,6 @@
2347
NVIDIA NeMo™ Framework Training container is built on top of NVIDIA Optimized Frameworks PyTorch 25.06 container: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
2448
```
2549

26-
2750
## NeMo Framework 25.09
2851

2952
| Software Component | Version |
@@ -48,4 +71,3 @@ NVIDIA NeMo™ Framework Training container is built on top of NVIDIA Optimized
4871
```{note}
4972
NVIDIA NeMo™ Framework Training container is built on top of NVIDIA Optimized Frameworks PyTorch 25.06 container: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
5073
```
51-

0 commit comments

Comments
 (0)