
[BUG] Training OOMs midway through an otherwise stable run, even with CP enabled #1136

@glennccc

Description

(SGLangEngine pid=22439) [2025-12-17 14:07:56] INFO:     33.184.122.180:62506 - "POST /generate HTTP/1.1" 200 OK
(SGLangEngine pid=22439) [2025-12-17 14:07:56] INFO:     33.184.122.180:60048 - "POST /abort_request HTTP/1.1" 200 OK
(SGLangEngine pid=22439) [2025-12-17 14:07:56 TP3] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:56 TP7] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:56 TP2] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:56 TP5] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:56 TP4] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:56 TP6] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:56 TP1] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:56 TP0] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:56] INFO:     33.184.122.180:60056 - "GET /flush_cache HTTP/1.1" 200 OK
(SGLangEngine pid=22439) [2025-12-17 14:07:57 TP6] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:57 TP2] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:57 TP1] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:57 TP0] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:57 TP3] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:57 TP4] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:57 TP5] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:57 TP7] Cache flushed successfully!
(SGLangEngine pid=22439) [2025-12-17 14:07:58] INFO:     33.184.122.180:60058 - "POST /release_memory_occupation HTTP/1.1" 200 OK
(MegatronTrainRayActor pid=32167) [2025-12-17 14:07:58] timer.py:24 - Timer wake_up start
(MegatronTrainRayActor pid=32167) [2025-12-17 14:07:58] memory_utils.py:41 - [Rank 0] Memory-Usage before wake_up model: {'gpu': '0', 'total_GB': 95.0, 'free_GB': 87.12, 'used_GB': 7.88, 'allocated_GB': 24.93, 'reserved_GB': 25.73}
(MegatronTrainRayActor pid=32442) [2025-12-17 14:08:04] reloadable_process_group.py:152 - Reloading 20 process groups in pid 32442
(MegatronTrainRayActor pid=32437) [2025-12-17 14:07:58] memory_utils.py:41 - [Rank 6] Memory-Usage before wake_up model: {'gpu': '6', 'total_GB': 95.0, 'free_GB': 88.74, 'used_GB': 6.26, 'allocated_GB': 25.51, 'reserved_GB': 26.23} [repeated 7x across cluster]
(MegatronTrainRayActor pid=32442) [2025-12-17 14:08:04] memory_utils.py:41 - [Rank 4] Memory-Usage after wake_up model: {'gpu': '4', 'total_GB': 95.0, 'free_GB': 63.42, 'used_GB': 31.59, 'allocated_GB': 25.51, 'reserved_GB': 25.95}
(MegatronTrainRayActor pid=32167) [2025-12-17 14:08:06] timer.py:32 - Timer wake_up end (elapsed: 8.2s)
(MegatronTrainRayActor pid=32167) [2025-12-17 14:08:06] timer.py:24 - Timer data_preprocess start
(MegatronTrainRayActor pid=32167) [2025-12-17 14:08:06] timer.py:32 - Timer data_preprocess end (elapsed: 0.2s)
(MegatronTrainRayActor pid=32443) AMEM pid:32443 sharedNetBuffersInit 561 dptr:0xd1e8000000 sz:268435456 defensive remove
(MegatronTrainRayActor pid=32167) [2025-12-17 14:08:09] timer.py:32 - Timer train_wait end (elapsed: 248.7s)
(MegatronTrainRayActor pid=32167) [2025-12-17 14:08:09] timer.py:24 - Timer train start
(MegatronTrainRayActor pid=32436) [2025-12-17 14:08:08] reloadable_process_group.py:152 - Reloading 20 process groups in pid 32436 [repeated 7x across cluster]
(MegatronTrainRayActor pid=32436) [2025-12-17 14:08:08] memory_utils.py:41 - [Rank 1] Memory-Usage after wake_up model: {'gpu': '1', 'total_GB': 95.0, 'free_GB': 62.78, 'used_GB': 32.22, 'allocated_GB': 25.51, 'reserved_GB': 26.58} [repeated 7x across cluster]
(MegatronTrainRayActor pid=32167) [2025-12-17 14:08:10] timer.py:24 - Timer ref_log_probs start
(MegatronTrainRayActor pid=32436) AMEM pid:32436 sharedNetBuffersInit 561 dptr:0xdb00000000 sz:268435456 defensive remove [repeated 15x across cluster]
(MegatronTrainRayActor pid=32167) AMEM pid:32167 sharedNetBuffersInit 561 dptr:0xd3a4000000 sz:268435456 defensive remove [repeated 9x across cluster]
(MegatronTrainRayActor pid=32437) AMEM pid:32437 sharedNetBuffersInit 561 dptr:0xd2fc000000 sz:268435456 defensive remove [repeated 7x across cluster]
(SGLangEngine pid=22439) [2025-12-17 14:08:37] INFO:     33.184.122.180:40280 - "GET /health HTTP/1.1" 200 OK
(MegatronTrainRayActor pid=32167) [2025-12-17 14:08:50] timer.py:32 - Timer ref_log_probs end (elapsed: 40.1s)
(MegatronTrainRayActor pid=32167) [2025-12-17 14:08:50] timer.py:24 - Timer log_probs start
(MegatronTrainRayActor pid=32167) [2025-12-17 14:09:06] timer.py:32 - Timer log_probs end (elapsed: 16.3s)
(MegatronTrainRayActor pid=32167) [2025-12-17 14:09:06] data.py:125 - rollout 11: {'rollout/raw_reward': 0.4296875, 'rollout/total_lengths': 7206.1953125, 'rollout/response_lengths': 7044.3828125, 'rollout/rewards': 1.3969838619232178e-09, 'rollout/truncated': 0.5625, 'rollout/rollout_log_probs': -0.18468008935451508, 'rollout/ref_log_probs': -0.18681827187538147, 'rollout/log_probs': -0.18691419810056686, 'rollout/advantages': 3.259629011154175e-09, 'rollout/returns': 3.259629011154175e-09}
(MegatronTrainRayActor pid=32167) [2025-12-17 14:09:06] timer.py:24 - Timer actor_train start
(SGLangEngine pid=22439) [2025-12-17 14:09:37] INFO:     33.184.122.180:50270 - "GET /health HTTP/1.1" 200 OK
(MegatronTrainRayActor pid=32440) AMEM pid:32440 sharedNetBuffersInit 561 dptr:0x7fac2c000000 sz:268435456 defensive remove
(MegatronTrainRayActor pid=32437) AMEM pid:32437 sharedNetBuffersInit 561 dptr:0x7ed63c000000 sz:268435456 defensive remove [repeated 15x across cluster]
(MegatronTrainRayActor pid=32167) [2025-12-17 14:10:28] model.py:649 - step 11: {'train/loss': -6.05359673500061e-09, 'train/pg_loss': -6.05359673500061e-09, 'train/entropy_loss': 0.18443581461906433, 'train/pg_clipfrac': 0.0, 'train/ppo_kl': 0.0, 'train/train_rollout_logprob_abs_diff': 0.018386833369731903, 'train/kl_loss': 0.002021021908149123, 'train/grad_norm': 0.07986056034213801, 'train/lr-pg_0': 1e-06, 'train/lr-pg_1': 1e-06, 'train/lr-pg_2': 1e-06, 'train/step': 11}
(MegatronTrainRayActor pid=32167) [2025-12-17 14:10:28] timer.py:32 - Timer actor_train end (elapsed: 81.3s)
(SGLangEngine pid=22439) [2025-12-17 14:10:37] INFO:     33.184.122.180:62932 - "GET /health HTTP/1.1" 200 OK
(SGLangEngine pid=22439) [2025-12-17 14:11:37] INFO:     33.184.122.180:57752 - "GET /health HTTP/1.1" 200 OK
(SGLangEngine pid=22439) [2025-12-17 14:12:37] INFO:     33.184.122.180:39746 - "GET /health HTTP/1.1" 200 OK
(SGLangEngine pid=22439) [2025-12-17 14:13:37] INFO:     33.184.122.180:59258 - "GET /health HTTP/1.1" 200 OK
(SGLangEngine pid=22439) [2025-12-17 14:14:40] INFO:     33.184.122.180:33498 - "GET /health HTTP/1.1" 200 OK
(raylet) A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. Lease ID: 1300000040bed4a41ff75e0a07c3c0e4586ddf6685f97935bdd59f3ebd23e650 Worker ID: 7f5c29ab00645c2eb88521c019ae0187cb5524975f231a6a3067ab2e Node ID: d473eba564b6985bc4351da9ea9a3f0783ee36d47e160d808c0e9bf1 Worker IP address: 33.184.122.180 Worker port: 11650 Worker PID: 32441 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. Some common causes include: (1) the process was killed by the OOM killer due to high memory usage, (2) ray stop --force was called, or (3) the worker crashed unexpectedly due to SIGSEGV or another unexpected error.
Traceback (most recent call last):
  File "/rl/slime/train.py", line 112, in <module>
    train(args)
  File "/rl/slime/train.py", line 79, in train
    ray.get(actor_model.async_train(rollout_id, rollout_data_ref))
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 2972, in get
    values, debugger_breakpoint = worker.get_objects(
                                  ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 1033, in get_objects
    raise value
ray.exceptions.ActorDiedError: The actor died unexpectedly before finishing this task.
	class_name: MegatronTrainRayActor
	actor_id: 07956d1a780b62eabe5abaf002000000
	pid: 32441
	namespace: 399eaba6-e3a3-4d1a-9778-6368a0563365
	ip: 33.184.122.180
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. Some common causes include: (1) the process was killed by the OOM killer due to high memory usage, (2) ray stop --force was called, or (3) the worker crashed unexpectedly due to SIGSEGV or another unexpected error.
AMEM [INFO] amem_nccl.cpp:x_fini:693 groupID:0 pid:15619 exit
main process return code: 1

Training configuration:

ROLLOUT_ARGS = [
    "--prompt-data", TRAIN_DATA,
    "--input-key", "prompt",
    "--label-key", "label",
    "--apply-chat-template",
    "--rollout-shuffle",
    "--rm-type", "deepscaler",
    "--num-rollout", "100",
    "--rollout-batch-size", "16",
    "--n-samples-per-prompt", "8",
    "--rollout-max-response-len", "8192",
    "--rollout-temperature", "0.8",
    "--global-batch-size", "128",
    "--balance-data"
]

EVAL_ARGS = [
    "--eval-interval", "20",
    "--eval-prompt-data", EVAL_DATA,
    "--n-samples-per-eval-prompt", "4",
    "--eval-max-response-len", "16384",
    "--eval-top-p", "0.7"
]

PERF_ARGS = [
    "--tensor-model-parallel-size", "4",
    "--sequence-parallel",
    "--pipeline-model-parallel-size", "1",
    "--context-parallel-size", "1",
    "--expert-model-parallel-size", "8",
    "--expert-tensor-parallel-size", "1",
    "--recompute-granularity", "full",
    "--recompute-method", "uniform",
    "--recompute-num-layers", "1",
    "--use-dynamic-batch-size",
    "--max-tokens-per-gpu", "20480"
]

GRPO_ARGS = [
    "--advantage-estimator", "grpo",
    "--use-kl-loss",
    "--kl-loss-coef", "0.00",
    "--kl-loss-type", "low_var_kl",
    "--entropy-coef", "0.00",
    "--eps-clip", "0.2",
    "--eps-clip-high", "0.28"
]

OPTIMIZER_ARGS = [
    "--optimizer", "adam",
    "--lr", "1e-6",
    "--lr-decay-style", "constant",
    "--weight-decay", "0.1",
    "--adam-beta1", "0.9",
    "--adam-beta2", "0.98",
    "--optimizer-cpu-offload",
    "--overlap-cpu-optimizer-d2h-h2d",
    "--use-precision-aware-optimizer"
]

TENSORBOARD_ARGS = [
    "--use-pytorch-profiler"
]

WANDB_ARGS = [
    # "--use-wandb",
    # "--wandb-project", "slime-dev",
    # "--wandb-group", "qwen3-30B-A3B-test",
    # "--wandb-key", "${WANDB_KEY}"
]

SGLANG_ARGS = [
    "--rollout-num-gpus-per-engine", "8",
    "--sglang-mem-fraction-static", "0.7",
    "--sglang-cuda-graph-bs", "1", "2", "4", "8"
] + [str(i) for i in range(16, 257, 8)]  # $(seq 16 8 256)
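
For reference, the trailing list comprehension mirrors the `$(seq 16 8 256)` from the original shell script, so the flag is presumably consumed as a multi-value argument of the form `--sglang-cuda-graph-bs 1 2 4 8 16 24 ... 256`. A purely illustrative sanity check, assuming SGLANG_ARGS is built exactly as above:

# Illustrative only: the values that end up after --sglang-cuda-graph-bs.
cuda_graph_bs = SGLANG_ARGS[SGLANG_ARGS.index("--sglang-cuda-graph-bs") + 1:]
assert cuda_graph_bs == ["1", "2", "4", "8"] + [str(i) for i in range(16, 257, 8)]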

MISC_ARGS = [
    "--attention-dropout", "0.0",
    "--hidden-dropout", "0.0",
    "--accumulate-allreduce-grads-in-fp32",
    "--attention-softmax-in-fp32",
    "--attention-backend", "flash"
]

# Build the full command
train_cmd_parts = ["python3", "train.py"] + \
    ["--actor-num-nodes", "1"] + \
    ["--actor-num-gpus-per-node", "8"] + \
    ["--colocate"] + \
    MODEL_ARGS + \
    CKPT_ARGS + \
    ROLLOUT_ARGS + \
    OPTIMIZER_ARGS + \
    GRPO_ARGS + \
    PERF_ARGS + \
    EVAL_ARGS + \
    SGLANG_ARGS + \
    TENSORBOARD_ARGS + \
    MISC_ARGS


env_vars = {
    "RAY_memory_monitor_refresh_ms": "0",
    "RAY_memory_usage_threshold": "0.99",
    "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",

    "PYTHONPATH": f"/root/Megatron-LM:{PATH_TO_SLIME}:$PYTHONPATH",
    "CUDA_DEVICE_MAX_CONNECTIONS": "1",
    "NCCL_NVLS_ENABLE": "1",
}
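
The actual launcher is not shown in this issue; as a rough sketch (assuming `train_cmd_parts` and `env_vars` above are simply handed to a subprocess), the job is started roughly like this:

# Rough sketch only, not the original launcher.
import os
import subprocess

def launch(train_cmd_parts, env_vars):
    # Overlay the custom variables on top of the current environment.
    env = {**os.environ, **env_vars}
    proc = subprocess.run(train_cmd_parts, env=env)
    print("main process return code:", proc.returncode)
    return proc.returncode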

Could there be an issue where GPU memory keeps accumulating over the course of training and is never released? (I see quite a few related OOM issues in the tracker, but none of them seem to have been resolved.) If that is not the cause, could you advise how to fix this? Training runs stably for a while and then OOMs. --context-parallel-size is enabled, and I am training qwen3-30b on 8 H20 GPUs.
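
In case it helps the diagnosis, here is a minimal sketch (not part of slime; the function name and the hook point, e.g. at the end of each actor_train step, are hypothetical) that logs CUDA memory once per step so that any growth across rollouts becomes visible:

# Hypothetical helper: call once per training step to spot memory growth.
import torch

def log_cuda_memory(step: int, tag: str = "actor_train_end") -> None:
    """Print allocated/reserved CUDA memory so accumulation across steps is visible."""
    allocated_gb = torch.cuda.memory_allocated() / 1024**3
    reserved_gb = torch.cuda.memory_reserved() / 1024**3
    free_b, total_b = torch.cuda.mem_get_info()
    print(
        f"[step {step}] {tag}: allocated={allocated_gb:.2f} GB, "
        f"reserved={reserved_gb:.2f} GB, "
        f"free={free_b / 1024**3:.2f} GB / total={total_b / 1024**3:.2f} GB"
    )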

@zhuzilin could you please take a look and fix this?
