Skip to content

Commit d892a2d

Browse files
committed
Merge branch 'main' into peng-add-glm5
2 parents eb13d12 + 0aa9a37 commit d892a2d

File tree

98 files changed

+4039
-1331
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

98 files changed

+4039
-1331
lines changed

.github/workflows/build-test-publish-wheel.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ jobs:
6464
container-options: "--gpus all --runtime=nvidia"
6565
secrets:
6666
TWINE_USERNAME: __token__
67-
TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }}
67+
TWINE_PASSWORD: ${{ secrets.SVC_PYPI_TEST_TOKEN }}
6868
SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }}
6969
SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }}
7070
GH_TOKEN: ${{ secrets.PAT }}

.github/workflows/cicd-main.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -361,10 +361,11 @@ jobs:
361361
- script: L2_Launch_models_nemotron_vl
362362
- script: L2_Launch_models_olmoe
363363
- script: L2_Launch_models_qwen
364-
# - script: L2_Launch_models_qwen_quantization
364+
- script: L2_Launch_models_qwen_quantization
365365
- script: L2_Launch_models_qwen_vl
366366
- script: L2_Launch_recipes_gemma_vl
367367
- script: L2_Launch_recipes_gpt_oss
368+
- script: L2_Launch_models_qwen_vl_quantization
368369
- script: L2_Launch_recipes_llama_1b
369370
- script: L2_Launch_recipes_llama_3b
370371
- script: L2_Launch_recipes_llama_distill
@@ -373,7 +374,7 @@ jobs:
373374
- script: L2_Launch_data
374375
- script: L2_Launch_post_training_quantization
375376
- script: L2_Launch_quantization_aware_training
376-
- script: L2_Launch_quantization_export
377+
# - script: L2_Launch_quantization_export
377378
- script: L2_Launch_recipes_llama_cuda_graphs
378379
- script: L2_Launch_utils
379380
needs: [pre-flight, cicd-unit-tests]

.github/workflows/release-docs.yml

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,57 @@ on:
3030
required: false
3131
type: string
3232
default: ""
33+
update-version-picker:
34+
description: Update version picker.
35+
required: false
36+
type: boolean
37+
default: true
38+
notify-emails:
39+
description: Email addresses to send the notification to. Format as "me@me.com,you@you.com".
40+
required: false
41+
type: string
42+
github-ref:
43+
description: Github ref to checkout
44+
required: false
45+
type: string
46+
default: ""
47+
workflow_call:
48+
inputs:
49+
dry-run:
50+
description: Whether to run the workflow in dry-run mode
51+
required: false
52+
type: boolean
53+
default: true
54+
publish-as-latest:
55+
description: Publish as Latest stable version.
56+
required: false
57+
type: boolean
58+
default: true
59+
docs-version-override:
60+
description: Docs version if commit is not tagged
61+
required: false
62+
type: string
63+
default: ""
64+
update-version-picker:
65+
description: Update version picker.
66+
required: false
67+
type: boolean
68+
default: true
3369
notify-emails:
3470
description: Email addresses to send the notification to. Format as "me@me.com,you@you.com".
3571
required: false
3672
type: string
73+
github-ref:
74+
description: Github ref to checkout
75+
required: false
76+
type: string
77+
default: ""
3778

3879
jobs:
3980
build-docs:
4081
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.67.0
82+
with:
83+
ref: ${{ inputs.github-ref }}
4184

4285
publish-docs:
4386
runs-on: ubuntu-latest
@@ -46,7 +89,7 @@ jobs:
4689
- uses: actions/checkout@v6
4790
with:
4891
repository: NVIDIA-NeMo/FW-CI-templates
49-
ref: v0.72.0
92+
ref: v0.74.0
5093
path: FW-CI-templates
5194

5295
- uses: ./FW-CI-templates/.github/actions/publish-docs
@@ -62,6 +105,7 @@ jobs:
62105
emails-csv: ${{ inputs.notify-emails && format('{0},{1}', vars.docs_release_emails, inputs.notify-emails) || vars.docs_release_emails }}
63106
overwrite-latest-on-tag: ${{ inputs.publish-as-latest }}
64107
docs-version-override: ${{ inputs.docs-version-override }}
108+
update-version-picker: ${{ inputs.update-version-picker }}
65109
run-on-version-tag-only: ${{ github.ref_name != 'main' }}
66110
request-name: megatron-bridge-publish-docs-${{ github.run_id }}
67111
aws-region: ${{ vars.DOCS_AWS_REGION }}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: Release Nightly Docs
16+
17+
on:
18+
schedule:
19+
- cron: "0 10 * * *"
20+
21+
jobs:
22+
call-release-docs:
23+
uses: ./.github/workflows/release-docs.yml
24+
with:
25+
dry-run: false
26+
publish-as-latest: false
27+
docs-version-override: "nightly"
28+
update-version-picker: false
29+
secrets: inherit

.github/workflows/release.yaml

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,14 +44,30 @@ on:
4444
description: Branch for version bump
4545
required: true
4646
type: string
47+
gh-release-from-tag:
48+
description: Tag of previous release for changelog builder
49+
required: false
50+
type: string
51+
default: ""
4752

4853
permissions:
4954
contents: write # To read repository content
5055
pull-requests: write # To create PRs
5156

5257
jobs:
58+
pre-flight:
59+
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
60+
with:
61+
default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }}
62+
non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }}
63+
default_test_data_path: ${{ vars.DEFAULT_TEST_DATA_PATH }}
64+
non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_TEST_DATA_PATH }}
65+
secrets:
66+
NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
67+
5368
release:
54-
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_release_library.yml@v0.73.0
69+
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_release_library.yml@v0.77.0
70+
needs: [pre-flight]
5571
with:
5672
release-ref: ${{ inputs.release-ref || github.sha }}
5773
python-package: megatron.bridge
@@ -63,18 +79,29 @@ jobs:
6379
has-src-dir: true
6480
skip-test-wheel: true
6581
custom-container: nvcr.io/nvidia/pytorch:25.05-py3
66-
runner: linux-amd64-cpu16
82+
runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2-container
6783
no-build-isolation: true
6884
app-id: ${{ vars.BOT_ID }}
6985
gh-release-use-changelog-builder: ${{ inputs.generate-changelog }}
7086
publish-docs: ${{ inputs.publish-docs }}
7187
docs-target-path: nemo/megatron-bridge
88+
submodules: recursive
89+
container-options: "--gpus all --runtime=nvidia"
90+
gh-release-from-tag: ${{ inputs.gh-release-from-tag }}
7291
secrets:
7392
TWINE_USERNAME: __token__
74-
TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }}
93+
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
7594
SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }}
76-
SLACK_WEBHOOK: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SLACK_MAIN_CHANNEL_WEBHOOK || secrets.SLACK_CI_CHANNEL_WEBHOOK }}
95+
SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
7796
PAT: ${{ secrets.PAT }}
7897
SSH_KEY: ${{ secrets.SSH_KEY }}
7998
SSH_PWD: ${{ secrets.SSH_PWD }}
8099
BOT_KEY: ${{ secrets.BOT_KEY }}
100+
AWS_ASSUME_ROLE_ARN: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
101+
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
102+
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
103+
AKAMAI_HOST: ${{ secrets.AKAMAI_HOST }}
104+
AKAMAI_CLIENT_TOKEN: ${{ secrets.AKAMAI_CLIENT_TOKEN }}
105+
AKAMAI_CLIENT_SECRET: ${{ secrets.AKAMAI_CLIENT_SECRET }}
106+
AKAMAI_ACCESS_TOKEN: ${{ secrets.AKAMAI_ACCESS_TOKEN }}
107+
S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }}

docs/conf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@
2525
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
2626

2727
project = "Megatron Bridge"
28-
copyright = "2025, NVIDIA Corporation"
28+
copyright = "2026, NVIDIA Corporation"
2929
author = "NVIDIA Corporation"
30-
release = "latest"
30+
release = "nightly"
3131

3232
# -- General configuration ---------------------------------------------------
3333
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

docs/performance-summary.md

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,75 @@ Below are performance benchmarks for various large language models organized by
3737
The performance data includes:
3838

3939
- **Pre-training Performance**: Throughput metrics for various model sizes and architectures
40-
- **System Configurations**: Results across different GPU systems (DGX-GB200, DGX-B200, DGX-H100)
40+
- **System Configurations**: Results across different GPU systems (DGX-GB300, DGX-GB200, DGX-B300, DGX-B200, DGX-H100)
4141
- **Precision Options**: Performance comparisons between different precision modes (BF16, FP8, MXFP8)
4242

4343
---
4444

45+
## 26.02 NeMo Container
46+
47+
### Pre-Training Performance
48+
49+
#### System: DGX-GB300
50+
51+
| Model | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
52+
|-------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
53+
| LLAMA3_70B | 64 | FP8 | 256 | 2 | 8192 | 64| 1 | 1 | 1 | n/a | n/a | 5003 | 2248 |
54+
| LLAMA3_70B | 64 | MXFP8 | 256 | 1 | 8192 | 0 | 1 | 4 | 1 | 5 | n/a | 4596 | 2064 |
55+
| LLAMA3_70B | 64 | NVFP4 | 256 | 1 | 8192 | 0 | 1 | 4 | 1 | 5 | n/a | 6798 | 3056 |
56+
| LLAMA3.1_405B | 256 | FP8 | 1536 | 1 | 8192 | 0 | 4 | 8 | 1 | 4 | n/a | 988 | 2495 |
57+
| LLAMA3.1_405B | 256 | MXFP8 | 1536 | 1 | 8192 | 0 | 2 | 8 | 2 | 4 | n/a | 931 | 2349 |
58+
| LLAMA3.1_405B | 256 | NVFP4 | 1536 | 1 | 8192 | 0 | 4 | 8 | 1 | 4 | n/a | 1333 | 3365 |
59+
| DeepSeekV3 | 256 | MXFP8 | 4096 | 2 | 4096 | 0 | 1 | 2 | 1 | 8 | 32 | 4612 | 1199 |
60+
| GPT OSS 120B | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 64 | 19412 | 527 |
61+
| Qwen3_30B_a3B | 8 | MXFP8 | 512 | 8 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 30376 | 699 |
62+
| Qwen3_235B_a22B | 256 | MXFP8 | 8192 | 2 | 4096 | 0 | 1 | 4 | 1 | n/a | 32 | 6583 | 974 |
63+
64+
#### System: DGX-GB200
65+
66+
| Model | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
67+
|-------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
68+
| LLAMA3_70B | 64 | FP8 | 256 | 2 | 8192 | 64| 1 | 1 | 1 | n/a | n/a | 4040 | 1815 |
69+
| LLAMA3_70B | 64 | MXFP8 | 256 | 1 | 8192 | 0 | 2 | 4 | 1 | 5 | n/a | 3613 | 1623 |
70+
| LLAMA3_70B | 64 | NVFP4 | 256 | 1 | 8192 | 0 | 2 | 4 | 1 | 5 | n/a | 4458 | 2004 |
71+
| LLAMA3.1_405B | 256 | FP8 | 1536 | 1 | 8192 | 0 | 4 | 16 | 1 | 4 | n/a | 793 | 2004 |
72+
| LLAMA3.1_405B | 256 | MXFP8 | 1536 | 1 | 8192 | 0 | 4 | 16 | 1 | 8 | n/a | 786 | 1983 |
73+
| LLAMA3.1_405B | 256 | NVFP4 | 1536 | 1 | 8192 | 0 | 4 | 16 | 1 | 8 | n/a | 1076 | 2716 |
74+
| DeepSeekV3 | 256 | MXFP8 | 4096 | 1 | 4096 | 0 | 1 | 4 | 1 | 4 | 64 | 3955 | 1028 |
75+
| GPT OSS 120B | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 64 | 15784 | 428 |
76+
| Qwen3_30B_a3B | 8 | MXFP8 | 512 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 26084 | 600 |
77+
| Qwen3_235B_a22B | 256 | MXFP8 | 8192 | 1 | 4096 | 0 | 1 | 8 | 1 | n/a | 32 | 5448 | 806 |
78+
79+
#### System: DGX-B300
80+
81+
| Model | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
82+
|-------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
83+
| DeepSeekV3 | 256 | MXFP8 | 4096 | 1 | 4096 | 0 | 1 | 16 | 1 | n/a | 8 | 2983 | 776 |
84+
| GPT OSS 120B | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 64 | 8359 | 228 |
85+
| Qwen3_30B_a3B | 8 | MXFP8 | 512 | 8 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 29521 | 679 |
86+
| Qwen3_235B_a22B | 256 | MXFP8 | 8192 | 1 | 4096 | 0 | 1 | 8 | 1 | 4 | 8 | 2691 | 399 |
87+
88+
#### System: DGX-B200
89+
90+
| Model | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
91+
|-------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
92+
| DeepSeekV3 | 256 | MXFP8 | 4096 | 1 | 4096 | 0 | 1 | 16 | 1 | n/a | 8 | 2689 | 699 |
93+
| GPT OSS 120B | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 64 | 8047 | 219 |
94+
| Qwen3_30B_a3B | 8 | MXFP8 | 512 | 1 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 9691 | 223 |
95+
| Qwen3_235B_a22B | 256 | MXFP8 | 8192 | 1 | 4096 | 0 | 1 | 8 | 1 | n/a | 8 | 3805 | 563 |
96+
97+
#### System: DGX-H100
98+
99+
| Model | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
100+
|-------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
101+
| LLAMA3_70B | 64 | FP8 | 256 | 1 | 8192 | 0 | 4 | 8 | 1 | 5 | n/a | 1621 | 728 |
102+
| LLAMA3.1_405B | 1024 | FP8 | 1536 | 1 | 8192 | 0 | 8 | 8 | 2 | 8 | n/a | 311 | 784 |
103+
| GPT OSS 120B | 64 | BF16 | 1280 | 1 | 4096 | 0 | 1 | 4 | 1 | n/a | 8 | 5993 | 163 |
104+
| Qwen3_30B_a3B | 16 | FP8 | 1024 | 1 | 4096 | 0 | 1 | 2 | 1 | 12 | 8 | 5113 | 118 |
105+
| Qwen3_235B_a22B | 256 | FP8 | 8192 | 1 | 4096 | 0 | 2 | 8 | 1 | 4 | 32 | 1633 | 242 |
106+
107+
- In MoE training benchmarks, we force-balance the token distribution among experts and all benchmarks are token-dropless.
108+
45109
## 25.11 NeMo Container
46110

47111
### Pre-Training Performance

docs/project.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
{
22
"name": "megatron-bridge",
3-
"version": "latest"
4-
}
3+
"version": "nightly"
4+
}

docs/releases/known-issues.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,17 @@
22

33
This page lists known issues and limitations in the current release.
44

5+
## 26.02
6+
7+
- AWS EKS only: Due to AWS-OFI-NCCL v1.17.0, long-running jobs suffer a memory leak that causes performance regression over time. This can be mitigated by upgrading to [v1.17.3](https://github.com/aws/aws-ofi-nccl/releases/tag/v1.17.3).
8+
- Context parallelism with sequence packing is not yet supported for Qwen 3 VL in the r0.3.0 release. For this functionality with Qwen 3 VL, please utilize the main branch.
9+
- DeepEP is not supported in the current NeMo framework 26.02 container (nvcr.io/nvidia/nemo:26.02), which results in reduced DSv3 performance compared to the NeMo framework 25.09 container (nvcr.io/nvidia/nemo:25.09) on H100 machines. For optimal H100 performance, we recommend using the NeMo framework 25.09 container.
10+
511
## 25.11
612

713
- Deepseek V3 on H100 has an issue when using DeepEP and fails with `RuntimeError: DeepEP error: timeout (dispatch CPU)`.
814
- MODEL_TFLOP/s/GPU is printed as 0 to stdout for all Hybrid models, such as Nemotron-H 56B.
915

10-
1116
## 25.09
1217

1318
- **Pretraining DeepSeek in subchannel FP8 precision is not working.** Pretraining DeepSeek with current scaling FP8 is a workaround, but MTP loss does not converge.
14-

docs/releases/software-versions.md

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,29 @@
11
# Software Component Versions
22

3+
## NeMo Framework 26.02
4+
5+
| Software Component | Version |
6+
|-------------------|---------|
7+
| PyTorch | 2.10.0a0 |
8+
| Megatron Core | main:0.16.0 |
9+
| Transformer Engine | 2.12 |
10+
| Megatron-Bridge | 0.3.0 |
11+
| Megatron-FSDP | 0.3.0 |
12+
| Export-Deploy | 0.4.0 |
13+
| Evaluator | 0.1.74 |
14+
| NeMo | 2.7.0 |
15+
| NeMo Run | 0.8.0 |
16+
| Nvidia-ModelOpt | 0.41.0 |
17+
| NVRX | 0.5.0 |
18+
| CUDA | 13.0.2 |
19+
| cuDNN | 9.18.0.50 |
20+
| TRT-LLM | 1.1.0 |
21+
| vLLM | 0.14.1 |
22+
23+
```{note}
24+
NVIDIA NeMo™ Framework Training container is built on top of NVIDIA Optimized Frameworks PyTorch 25.06 container: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
25+
```
26+
327
## NeMo Framework 25.11
428

529
| Software Component | Version |
@@ -23,7 +47,6 @@
2347
NVIDIA NeMo™ Framework Training container is built on top of NVIDIA Optimized Frameworks PyTorch 25.06 container: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
2448
```
2549

26-
2750
## NeMo Framework 25.09
2851

2952
| Software Component | Version |
@@ -48,4 +71,3 @@ NVIDIA NeMo™ Framework Training container is built on top of NVIDIA Optimized
4871
```{note}
4972
NVIDIA NeMo™ Framework Training container is built on top of NVIDIA Optimized Frameworks PyTorch 25.06 container: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
5073
```
51-

0 commit comments

Comments
 (0)