Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
bf22d4f
squash #2700
yuki-97 Jun 11, 2026
22be51a
squash rollout pump (e48aa9f -> 8b5d01f)
yuki-97 Jun 4, 2026
6b05f56
squash staleness sampler + tq replay buffer (eadf626 -> ac5571a)
yuki-97 Jun 8, 2026
e961303
[rollout pump] fix batch
yuki-97 Jun 14, 2026
4942d29
squash entrypoint + fix (f335c92 -> 57f8eb2)
yuki-97 Jun 14, 2026
41f88f0
refactor(single-controller): rename setup entrypoint; drop dryrun test
yuki-97 Jun 15, 2026
04b3788
feat(single-controller): add over_sampling=false batch-quota mode via…
yuki-97 Jun 15, 2026
87d805d
fix config
yuki-97 Jun 21, 2026
4443ecc
limit max_train_steps by max_num_epochs
yuki-97 Jun 21, 2026
462ce21
elegant shutdown
yuki-97 Jun 21, 2026
8b50c27
lint
yuki-97 Jun 21, 2026
5f757dd
support setup weight_synchronizer and uncomment
yuki-97 Jun 21, 2026
37cfcbf
feat(single-controller): log per-step train metrics
yuki-97 Jun 21, 2026
c56cf47
add prepare_for_lp/training
yuki-97 Jun 21, 2026
cfaaeeb
add nightly
yuki-97 Jun 21, 2026
5ad58e3
copyright
yuki-97 Jun 21, 2026
e05861b
add timing
yuki-97 Jun 21, 2026
3c51421
add nightly sync
yuki-97 Jun 21, 2026
888cb8e
fix(sc): unblock train_pump asyncio, add set_seed, fix LR log ordering
yuki-97 Jun 22, 2026
d515ddb
feat(sc): add force_in_order target-step matching to async_rl sampler
yuki-97 Jun 22, 2026
bfb2467
feat(sc): assert num_prompts * num_gen == train_global_batch_size
yuki-97 Jun 22, 2026
2bc4b63
[tmp] upload yaml/script for debug
yuki-97 Jun 22, 2026
50de76f
refactor(sc): drop async_rl.target_prompt_groups_per_step, use grpo.n…
yuki-97 Jun 28, 2026
c48c050
refactor(sc): move batch_selection_strategy to top of async_rl, fix f…
yuki-97 Jun 28, 2026
1e0107e
fix rebase
yuki-97 Jun 28, 2026
2c50b8e
feat(sc): wire NeMo-Gym rollouts into SingleController
yuki-97 Jun 28, 2026
c4347ce
ci: add L1_Functional_Tests_SingleController; move grpo_dp_single_con…
yuki-97 Jun 28, 2026
42f66e3
feat(sc): add max_prompt_groups cap to StalenessSampler.select
yuki-97 Jun 30, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -717,6 +717,8 @@ jobs:
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}
- script: L1_Functional_Tests_PPO
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}
- script: L1_Functional_Tests_SingleController
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}
- script: L1_Functional_Tests_Eval
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}
- script: L1_Functional_Tests_Other_1
Expand Down Expand Up @@ -786,6 +788,8 @@ jobs:
runner: ${{ vars.GB200_RUNNER }}
- script: L1_Functional_Tests_PPO
runner: ${{ vars.GB200_RUNNER }}
- script: L1_Functional_Tests_SingleController
runner: ${{ vars.GB200_RUNNER }}
- script: L1_Functional_Tests_Eval
runner: ${{ vars.GB200_RUNNER }}
- script: L1_Functional_Tests_Other_1
Expand Down Expand Up @@ -856,6 +860,8 @@ jobs:
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}
- script: L1_Functional_Tests_PPO
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}
- script: L1_Functional_Tests_SingleController
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}
- script: L1_Functional_Tests_Eval
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}
- script: L1_Functional_Tests_Other_1
Expand Down
358 changes: 358 additions & 0 deletions examples/configs/grpo_math_1B_single_controller.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,358 @@
# GRPO via SingleController (async-RL) — mirrors grpo_math_1B.yaml with
# data_plane.enabled=true and a top-level async_rl: section holding the
# SC-specific runtime knobs.
# Core GRPO loop settings.
grpo:
# 32 prompts x 16 generations = 512 samples per step; this matches
# train_global_batch_size (512) in the policy section of this file.
num_prompts_per_step: 32
num_generations_per_prompt: 16
max_rollout_turns: 1
max_num_epochs: 1
max_num_steps: 1000000
normalize_rewards: true
use_leave_one_out_baseline: true
# Same value as save_period (10) in the checkpointing section.
val_period: 10
val_at_start: false
val_at_end: false
overlong_filtering: false
advantage_clip_low: null
advantage_clip_high: null
max_val_samples: 256
val_batch_size: 256
# Reused by the data section through the ${grpo.seed} interpolation.
seed: 42
use_dynamic_sampling: false
dynamic_sampling_max_gen_batches: 10
batch_multiplier: 1
# Reward shaping is disabled in this config.
reward_shaping:
enabled: false
overlong_buffer_length: 128
overlong_buffer_penalty: 1
# Interpolated from policy.max_total_sequence_length (512 in this file).
max_response_length: ${policy.max_total_sequence_length}
stop_properly_penalty_coef: null

# Advantage estimator selection.
adv_estimator:
name: "grpo"
# Both flags below are interpolated from the grpo.* keys of the same name
# (both are true in this file).
normalize_rewards: ${grpo.normalize_rewards}
use_leave_one_out_baseline: ${grpo.use_leave_one_out_baseline}
minus_baseline: true
# Reward scaling is disabled; source and target ranges are both [0.0, 1.0].
reward_scaling:
enabled: false
source_min: 0.0
source_max: 1.0
target_min: 0.0
target_max: 1.0
# The three keys below are left unset (null) in this config.
seq_logprob_error_threshold: null
invalid_tool_call_advantage: null
malformed_thinking_advantage: null

# Async GRPO is enabled for this config.
async_grpo:
enabled: true
# Same numeric value as async_rl.max_weight_staleness_versions (1).
# NOTE(review): presumably the two are meant to stay in sync — confirm.
max_trajectory_age_steps: 1
in_flight_weight_updates: false
recompute_kv_cache_after_weight_updates: false

# Loss-function settings.
loss_fn:
reference_policy_kl_penalty: 0.01
reference_policy_kl_type: "k3"
kl_input_clamp_value: 20.0
kl_output_clamp_value: 10.0
# The lower and upper ratio-clip values are the same (0.2).
ratio_clip_min: 0.2
ratio_clip_max: 0.2
ratio_clip_c: null
use_on_policy_kl_approximation: false
# Importance-sampling correction is off and all truncation knobs are unset.
use_importance_sampling_correction: false
truncated_importance_sampling_type: null
truncated_importance_sampling_ratio: null
truncated_importance_sampling_ratio_min: null
sequence_level_importance_ratios: false
token_level_loss: true
force_on_policy_ratio: false
use_kl_in_reward: false
disable_ppo_ratio: false
positive_example_nll_weight: 0.0

# Checkpointing settings.
checkpointing:
enabled: true
checkpoint_dir: "results/grpo-single-controller"
# Checkpoints are ranked on "val:accuracy" with higher values preferred;
# the 3 best are kept.
metric_name: "val:accuracy"
higher_is_better: true
keep_top_k: 3
# Same value as grpo.val_period (10).
save_period: 10
checkpoint_must_save_by: null
model_save_format: "safetensors"
save_consolidated: false
save_optimizer: true

# Policy model, tokenizer and batch-size settings.
policy:
model_name: "Qwen/Qwen2.5-1.5B"
tokenizer:
# The tokenizer name is interpolated from policy.model_name.
name: ${policy.model_name}
chat_template_kwargs: null
hf_config_overrides: {}
# 512 = grpo.num_prompts_per_step (32) * grpo.num_generations_per_prompt (16).
train_global_batch_size: 512
train_micro_batch_size: 4
generation_batch_size: 32
# Interpolated from policy.train_micro_batch_size (4 in this file).
logprob_batch_size: ${policy.train_micro_batch_size}
max_total_sequence_length: 512
precision: "bfloat16"
logprob_chunk_size: null
offload_optimizer_for_logprob: false

# DTensor backend settings. This backend is disabled in this config
# (megatron_cfg.enabled is true instead).
# Booleans use canonical lowercase true/false throughout.
dtensor_cfg:
_v2: true
enabled: false
cpu_offload: false
sequence_parallel: false
activation_checkpointing: false
tensor_parallel_size: 1
context_parallel_size: 1
custom_parallel_plan: null
automodel_kwargs: {}
# LoRA settings for the DTensor backend; disabled here.
lora_cfg:
enabled: false
target_modules: []
exclude_modules: []
match_all_linear: true
dim: 8
alpha: 32
dropout: 0.0
dropout_position: "post"
lora_A_init: "xavier"
use_triton: true

# Megatron backend settings. This is the enabled training backend in this
# config. Booleans use canonical lowercase true/false throughout.
megatron_cfg:
enabled: true
force_reconvert_from_hf: false
empty_unused_memory_level: 1
activation_checkpointing: false
recompute_granularity: "full"
recompute_modules: null
# All parallelism sizes below are 1 in this config.
tensor_model_parallel_size: 1
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 1
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
context_parallel_size: 1
# Interpolated from policy.precision ("bfloat16" in this file).
pipeline_dtype: ${policy.precision}
sequence_parallel: false
freeze_moe_router: true
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none"
moe_router_bias_update_rate: 0.0
moe_permute_fusion: true
apply_rope_fusion: true
bias_activation_fusion: true
defer_fp32_logits: false
moe_per_layer_logging: false
moe_enable_deepep: false
moe_token_dispatcher_type: "alltoall"
moe_shared_expert_overlap: false
gradient_accumulation_fusion: false
use_fused_weighted_squared_relu: false
# PEFT/LoRA settings for the Megatron backend; disabled in this config.
peft:
enabled: false
target_modules: []
exclude_modules: []
dim: 8
alpha: 32
dropout: 0.0
dropout_position: "post"
lora_A_init_method: "xavier"
lora_B_init_method: "zero"
a2a_experimental: false
# YAML null (a bare `None` would be loaded as the string "None"), matching
# how every other unset key in this file is written.
# NOTE(review): confirm the consumer does not compare against the literal
# string "None".
lora_dtype: null
# Optimizer settings for the Megatron backend.
optimizer:
optimizer: "adam"
lr: 5.0e-6
min_lr: 5.0e-7
weight_decay: 0.01
bf16: true
fp16: false
params_dtype: "float32"
adam_beta1: 0.9
adam_beta2: 0.999
# Written with an explicit decimal point (like lr/min_lr above) so that
# YAML 1.1 loaders such as PyYAML resolve it as a float, not a string.
adam_eps: 1.0e-8
sgd_momentum: 0.9
use_distributed_optimizer: true
use_precision_aware_optimizer: true
# Interpolated from policy.max_grad_norm (1.0 in this file).
clip_grad: ${policy.max_grad_norm}
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0
# Scheduler settings for the Megatron backend.
scheduler:
# Start and end weight decay both interpolate the Megatron optimizer's
# weight_decay (0.01 in this file), and the increment style is "constant".
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_decay_iters: 1000
lr_warmup_iters: 13
# Same value as the Megatron optimizer's min_lr (5.0e-7).
lr_warmup_init: 5.0e-7
# Distributed data-parallel settings for the Megatron backend.
distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: true
overlap_param_gather: true
use_custom_fsdp: false
data_parallel_sharding_strategy: "optim_grads_params"
# FP8 settings; disabled in this config.
fp8_cfg:
enabled: false
fp8: "e4m3"
fp8_recipe: "blockwise"
fp8_param: false
# No extra environment variables are configured.
env_vars: null

# Draft-model settings; disabled in this config, with no model selected.
draft:
enabled: false
model_name: null
loss_weight: 0.1
num_layers: null
aux_layer_indices: null

# Dynamic batching; disabled in this config (sequence_packing is enabled
# instead). Boolean written in canonical lowercase.
dynamic_batching:
enabled: false
# Both token budgets use the `mul` resolver on policy.max_total_sequence_length
# (512) and the respective batch size (4).
# NOTE(review): presumably resolves to 512 * 4 = 2048 — confirm resolver semantics.
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
sequence_length_round: 64

# Sequence packing; enabled in this config. Boolean written in canonical
# lowercase.
sequence_packing:
enabled: true
# Both token budgets use the `mul` resolver on policy.max_total_sequence_length
# (512) and the respective batch size (4).
# NOTE(review): presumably resolves to 512 * 4 = 2048 — confirm resolver semantics.
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
algorithm: "modified_first_fit_decreasing"
sequence_length_round: 64

# Follow the tensor-parallel size of the backend that is actually enabled
# (megatron_cfg.enabled is true, dtensor_cfg.enabled is false). Both TP sizes
# are 1 in this file, so the resolved value is unchanged.
make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
# Also consumed by the Megatron optimizer's clip_grad via ${policy.max_grad_norm}.
max_grad_norm: 1.0

# Torch optimizer definition: class path plus constructor kwargs.
optimizer:
name: "torch.optim.AdamW"
kwargs:
lr: 5.0e-6
weight_decay: 0.01
betas: [0.9, 0.999]
# Written with an explicit decimal point (like lr above) so that YAML 1.1
# loaders such as PyYAML resolve it as a float, not a string.
eps: 1.0e-8

# Torch LR schedule: a LinearLR entry (factor 0.1 -> 1.0 over 50 iterations)
# and a ConstantLR entry (factor 1.0), plus a milestones entry.
scheduler:
- name: "torch.optim.lr_scheduler.LinearLR"
kwargs:
start_factor: 0.1
end_factor: 1.0
total_iters: 50
- name: "torch.optim.lr_scheduler.ConstantLR"
kwargs:
factor: 1.0
total_iters: 10000000000
# Same value as the LinearLR total_iters above.
# NOTE(review): presumably the step at which the schedule switches from the
# first entry to the second — confirm against the scheduler builder.
- milestones: [50]

# Generation (rollout) settings; the backend is vLLM.
generation:
port_range_low: 11001
port_range_high: 15000
backend: "vllm"
# Interpolated from policy.max_total_sequence_length (512 in this file).
max_new_tokens: ${policy.max_total_sequence_length}
temperature: 1.0
top_p: 1.0
top_k: null
stop_token_ids: null
stop_strings: null
# Settings for the mcore generation path.
# NOTE(review): generation.backend is "vllm" in this file, so these values are
# presumably unused here — confirm.
mcore_generation_config:
buffer_size_gb: 10
num_cuda_graphs: 4
block_size_tokens: 256
use_cuda_graphs_for_non_decode_steps: true
enable_chunked_prefill: true
unified_memory_level: 0
max_tokens: 16384
# vLLM engine settings. Booleans use canonical lowercase true/false.
vllm_cfg:
async_engine: true
# Interpolated from policy.precision ("bfloat16" in this file).
precision: ${policy.precision}
kv_cache_dtype: "auto"
tensor_parallel_size: 1
pipeline_parallel_size: 1
expert_parallel_size: 1
gpu_memory_utilization: 0.6
# Interpolated from policy.max_total_sequence_length (512 in this file).
max_model_len: ${policy.max_total_sequence_length}
enforce_eager: false
use_tqdm: true
use_deep_gemm: false
num_last_layers_in_bf16: 0
num_first_layers_in_bf16: 0
enable_vllm_metrics_logger: true
vllm_metrics_logger_interval: 0.5
vllm_kwargs: {}
# Colocation is disabled; the resources entry lists 1 GPU on 1 node.
# NOTE(review): presumably these are the dedicated generation resources taken
# out of the cluster section's 2 GPUs — confirm.
colocated:
enabled: false
resources:
gpus_per_node: 1
num_nodes: 1

# Dataset and data-loading settings.
data:
# Interpolated from policy.max_total_sequence_length (512 in this file).
max_input_seq_length: ${policy.max_total_sequence_length}
shuffle: true
num_workers: 1
use_multiple_dataloader: false
train:
dataset_name: OpenMathInstruct-2
split_validation_size: 0.05
# Interpolated from grpo.seed (42 in this file).
seed: ${grpo.seed}
# No separate validation dataset is configured.
validation: null
default:
prompt_file: "examples/prompts/cot.txt"
system_prompt_file: null
processor: "math_hf_data_processor"
# Refers to the "math" entry defined in the env section.
env_name: "math"

# Environment definitions; "math" is the name referenced by data env_name.
env:
math:
num_workers: 8
math_verify_impl: "hf_math_verify"

# Logging settings. All four experiment trackers (wandb, tensorboard, mlflow,
# swanlab) are disabled here; GPU monitoring is on.
logger:
log_dir: "logs"
num_val_samples_to_print: 0
wandb_enabled: false
tensorboard_enabled: false
mlflow_enabled: false
swanlab_enabled: false
monitor_gpus: true
wandb:
project: "grpo-dev"
name: "grpo-single-controller-dev"
swanlab:
project: "grpo-dev"
name: "grpo-single-controller-dev"
tensorboard: {}
mlflow:
experiment_name: "grpo-dev"
run_name: "grpo-single-controller-dev"
tracking_uri: "http://localhost:5000"
gpu_monitoring:
# NOTE(review): unit of both intervals is not stated here — presumably seconds.
collection_interval: 10
flush_interval: 10

# TransferQueue data plane — required by the SingleController path.
data_plane:
enabled: true
impl: transfer_queue
backend: "simple"
storage_capacity: 1000000
num_storage_units: 2
claim_meta_poll_interval_s: 0.5
# 549755813888 = 2^39 (512 * 2^30) and 68719476736 = 2^36 (64 * 2^30).
# NOTE(review): presumably byte counts, i.e. 512 GiB and 64 GiB — confirm unit.
global_segment_size: 549755813888
local_buffer_size: 68719476736

# SC-specific async-RL runtime knobs.
# One training step consumes grpo.num_prompts_per_step prompt groups
# (32 in this file).
async_rl:
batch_selection_strategy: "strict_on_policy" # or "staleness_window"
max_weight_staleness_versions: 1
min_prompt_groups_per_batch: 2
max_inflight_prompts: 8
# When over_sampling=false this must equal
# grpo.num_prompts_per_step * (max_weight_staleness_versions + 1),
# i.e. 32 * (1 + 1) = 64 with the values in this file.
# NOTE(review): the current value (8) does not satisfy that formula; it only
# applies once over_sampling is set to false, so update this to 64 when
# flipping that switch.
max_buffered_rollouts: 8
# true : over-generates and wastes rollouts that age past the staleness window;
# false: enforces per-weight-version dispatch quota.
over_sampling: true

# Cluster shape: a single node with 2 GPUs, plus the master port range.
cluster:
gpus_per_node: 2
num_nodes: 1
master_port_range_low: 25000
master_port_range_high: 28000
Loading
Loading